"""PCA workflow definition and the functions used by its stages."""

import gtk
import logger
from workflow import *
from scipy import array,zeros
from data import read_affy_annot,read_mootha,data_dict_to_matrix
import plots
import dataset


class PCAWorkflow(Workflow):
    """Workflow made of four stages: load, preprocess, annot and model."""

    def __init__(self, app):
        """Build the stages and register them on the workflow.

        app -- application object, passed through to Workflow.__init__.
        """
        Workflow.__init__(self, app)
        self.name = 'PCAs Workflow'

        load = Stage('load', 'Load Data')
        load.add_function(LoadMoothaData())
        self.add_stage(load)

        preproc = Stage('preprocess', 'Preprocessing')
        preproc.add_function(Function('log2', 'Logarithm'))
        self.add_stage(preproc)

        annot = Stage('annot', 'Affy annotations')
        annot.add_function(LoadAnnotationsFunction())
        self.add_stage(annot)

        # The model stage registers a plain Function, not the PCAFunction
        # class defined further down in this module.
        model = Stage('model', 'Model')
        model.add_function(Function('pca', 'PCA'))
        self.add_stage(model)

        logger.log('debug', "\tPCA's workflow is now active")


class LoadAnnotationsFunction(Function):
    """Ask the user for an Affy annotation file and parse it."""

    def __init__(self):
        Function.__init__(self, 'load', 'Load Annotations')
        self.annotations = None
        # Open handle of the file picked in the dialog; None until the user
        # confirms a selection.
        self.file = None

    def load_affy_file(self, filename):
        """Open filename and keep the handle in self.file."""
        f = open(filename)
        logger.log('notice', 'Loading annotation file: %s' % filename)
        self.file = f

    def on_response(self, dialog, response):
        """Dialog 'response' handler: open the chosen file on OK."""
        if response == gtk.RESPONSE_OK:
            filename = dialog.get_filename()
            logger.log('notice', 'Reading file: %s' % filename)
            self.load_affy_file(filename)

    def run(self, data):
        """Prompt for an annotation file and check it against data.

        data -- object whose names('genes') returns the gene identifiers.

        Returns [self.annotations], or None when no file was selected.
        """
        # Forget any handle from a previous run so that a cancelled dialog
        # cannot silently reuse a stale (or missing) file.
        self.file = None

        btns = ('Open', gtk.RESPONSE_OK,
                'Cancel', gtk.RESPONSE_CANCEL)
        dialog = gtk.FileChooserDialog('Open Affy Annotation File',
                                       buttons=btns)
        dialog.connect('response', self.on_response)
        try:
            dialog.run()
        finally:
            dialog.destroy()

        if self.file is None:
            logger.log('notice', 'No annotation file selected')
            return None

        try:
            ### Reading and parsing here
            annot = read_affy_annot(self.file)
            i_want = 'Pathway'
            ids_in_data = set(data.names('genes'))  # assuming we have genes
            sanity_check = set(annot.keys())
            if not ids_in_data.issubset(sanity_check):
                logger.log('debug','Some identifers in data does not exist in affy file!')

            # annot is used as a mapping (it has keys()), so the id and its
            # description have to come from items(); iterating the mapping
            # itself would only yield the ids.
            for affy_id, description in annot.items():
                if affy_id not in ids_in_data:
                    continue
                pathways = description[i_want]
                # NOTE(review): the original also defined an unused
                # placeholder '---' while comparing against '--' here;
                # confirm which marker the annotation file really uses.
                if not pathways[0][0] == '--':
                    # TODO: collect the pathway annotations; nothing is
                    # stored yet, so self.annotations stays as it was.
                    pass
        finally:
            # The handle is only needed while parsing; do not leak it.
            self.file.close()
            self.file = None

        return [self.annotations]


class PCAFunction(Function):
    """Run a two-component PCA on a Dataset and wrap the results."""

    def __init__(self):
        Function.__init__(self, 'X', 'a_opt')
        self.output = None

    def run(self, data):
        """Decompose data with pca() and return the parts as Datasets.

        data -- a dataset.Dataset; anything else makes run() return None.

        Returns [T, P, E, tsq], each wrapped in a dataset.Dataset.
        """
        logger.log('debug', 'datatype: %s' % type(data))
        if not isinstance(data, dataset.Dataset):
            return None
        logger.log('debug', 'dimensions: %s' % data.dims)

        ## calculations
        # NOTE(review): pca is not imported by name in this module; it is
        # presumably supplied by `from workflow import *` -- confirm.
        T, P, E, tsq = pca(data._data, a_opt=2)

        comp_def = ['comp', ['1', '2']]
        single_def = ['1', ['s']]
        dim0_def = [data._dim_names[0], data.names(0)]
        dim1_def = [data._dim_names[1], data.names(1)]

        T = dataset.Dataset(T, [dim0_def, comp_def])
        # Wrap the P array itself (the original passed the wrapped T here).
        P = dataset.Dataset(P, [dim1_def, comp_def])
        E = dataset.Dataset(E, [dim0_def, dim1_def])
        # NOTE(review): tsq is labelled along the second data dimension, as
        # in the original; verify this matches what pca() returns.
        tsq = dataset.Dataset(tsq, [dim1_def, single_def])

        ## plots
        # NOTE(review): the plot is created but never used or returned.
        loading_plot = plots.ScatterPlot()

        return [T, P, E, tsq]


class LoadMoothaData(Function):
    """Ask the user for an expression file and load the Mootha data."""

    def __init__(self):
        Function.__init__(self, 'load', 'Load diabetes data')
        self.annotations = None

    def load_expression_file(self, filename):
        """Open filename; keep the handle and the name on the instance."""
        f = open(filename)
        logger.log('notice', 'Loading expression file: %s' % filename)
        self.file = f
        self.filename = filename

    def on_response(self, dialog, response):
        """Dialog 'response' handler: open the chosen file on OK."""
        if response == gtk.RESPONSE_OK:
            filename = dialog.get_filename()
            logger.log('notice', 'Reading file: %s' % filename)
            self.load_expression_file(filename)

    def run(self, data):
        """Show the file dialog, then build a samples x genes Dataset.

        data -- unused.

        Returns [X] where X is a dataset.Dataset with dimensions
        'samples' and 'genes'.
        """
        btns = ('Open', gtk.RESPONSE_OK,
                'Cancel', gtk.RESPONSE_CANCEL)
        dialog = gtk.FileChooserDialog('Open diabetes expression File',
                                       buttons=btns)
        dialog.connect('response', self.on_response)
        try:
            dialog.run()
        finally:
            dialog.destroy()

        ### Reading and parsing here
        # NOTE(review): read_mootha() is called without arguments, so the
        # file picked in the dialog (self.file / self.filename) is not
        # passed to it and its handle stays open -- confirm the intent.
        d, sample_names = read_mootha()
        n_samps = len(sample_names)
        n_genes = len(d)
        typecode = 'f'
        x = zeros((n_samps, n_genes), typecode)
        gene_ids = []
        # One column per gene; the first element of each value holds the
        # expression vector that fills the column.
        for i, (gene_id, desc) in enumerate(d.items()):
            gene_ids.append(gene_id)
            x[:, i] = desc[0].astype(typecode)
        gene_def = ['genes', gene_ids]
        sample_def = ['samples', sample_names]
        X = dataset.Dataset(x, [sample_def, gene_def])  # samples x genes
        return [X]


PCAWorkflow.name = 'PCA Workflow'