import gtk
import os.path
from system import dataset, logger, plots, workflow, dialogs
from scipy import randn, array, transpose, zeros
import cPickle


class AffyWorkflow(workflow.Workflow):
    name = 'Affy Workflow'
    ident = 'affy'
    description = 'Affymetrix Workflow. Analysis of Affy data.'

    def __init__(self, app):
        workflow.Workflow.__init__(self, app)

        load = workflow.Stage('load', 'Load Data')
        load.add_function(CelFileImportFunction())
        load.add_function(PhenotypeImportFunction())
        load.add_function(TestDataFunction())
        load.add_function(DatasetLoadFunction())
        self.add_stage(load)

        significance = workflow.Stage('significance', 'Significance analysis')
        significance.add_function(LimmaFunction())
        self.add_stage(significance)

        explore = workflow.Stage('explore', 'Explorative analysis')
        explore.add_function(PCAFunction(self))
        explore.add_function(PrintFunction())
        self.add_stage(explore)

        save = workflow.Stage('save', 'Save Data')
        save.add_function(DatasetSaveFunction())
        self.add_stage(save)


class PrintFunction(workflow.Function):
    def __init__(self):
        workflow.Function.__init__(self, 'printer', 'Print Stuff')

    def run(self, data):
        dim1, dim2 = data.get_dim_names()
        print dim1, dim2
        print "\t", "\t".join(data.get_identifiers(dim2))
        for row in zip(data.get_identifiers(dim1), data.asarray().tolist()):
            print "\t".join(map(str, row))


class TestDataFunction(workflow.Function):
    def __init__(self):
        workflow.Function.__init__(self, 'test_data', 'Generate Test Data')

    def run(self):
        logger.log('notice', 'Injecting foo test data')
        x = randn(20, 30)
        X = dataset.Dataset(x)
        return [X, plots.LinePlot(X)]


class DatasetLoadFunction(workflow.Function):
    """Loader for previously pickled Datasets."""

    def __init__(self):
        workflow.Function.__init__(self, 'load_data', 'Load Pickled Dataset')

    def run(self):
        chooser = gtk.FileChooserDialog(title="Select pickled dataset...",
                                        parent=None,
                                        action=gtk.FILE_CHOOSER_ACTION_OPEN,
                                        buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
                                                 gtk.STOCK_OPEN, gtk.RESPONSE_OK))
        pkl_filter = gtk.FileFilter()
        pkl_filter.set_name("Python pickled data files (*.pkl)")
        pkl_filter.add_pattern("*.[pP][kK][lL]")
        all_filter = gtk.FileFilter()
        all_filter.set_name("All Files (*.*)")
        all_filter.add_pattern("*")
        chooser.add_filter(pkl_filter)
        chooser.add_filter(all_filter)

        try:
            if chooser.run() == gtk.RESPONSE_OK:
                # protocol 2 pickles are binary, so read in "rb" mode
                return [cPickle.load(open(chooser.get_filename(), "rb"))]
        finally:
            chooser.destroy()


class DatasetSaveFunction(workflow.Function):
    """Quick-and-dirty way to save data to file for later import into this program."""

    def __init__(self):
        workflow.Function.__init__(self, 'save_data', 'Save Pickled Dataset')

    def run(self, data):
        chooser = gtk.FileChooserDialog(title="Save pickled data...",
                                        parent=None,
                                        action=gtk.FILE_CHOOSER_ACTION_SAVE,
                                        buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
                                                 gtk.STOCK_SAVE, gtk.RESPONSE_OK))
        pkl_filter = gtk.FileFilter()
        pkl_filter.set_name("Python pickled data files (*.pkl)")
        pkl_filter.add_pattern("*.[pP][kK][lL]")
        all_filter = gtk.FileFilter()
        all_filter.set_name("All Files (*.*)")
        all_filter.add_pattern("*")
        chooser.add_filter(pkl_filter)
        chooser.add_filter(all_filter)
        chooser.set_current_name(data.get_name() + ".pkl")

        try:
            if chooser.run() == gtk.RESPONSE_OK:
                # protocol 2 pickles are binary, so write in "wb" mode
                cPickle.dump(data, open(chooser.get_filename(), "wb"), protocol=2)
                logger.log("notice", "Saved data to %r." % chooser.get_filename())
        finally:
            chooser.destroy()
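
# A minimal sketch of the pickle round-trip implemented by the two classes
# above (the file name is hypothetical); protocol 2 pickles require binary
# file modes:
#
#     data = cPickle.load(open("mydata.pkl", "rb"))
#     cPickle.dump(data, open("mydata.pkl", "wb"), protocol=2)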


class LimmaFunction(workflow.Function):
    def __init__(self):
        workflow.Function.__init__(self, 'limma', 'Limma')

    def run(self, affy, data):
        response = dialogs.get_text('Enter contrasts...', """\
Enter comma-separated list of contrasts.
Available categories: %s

Example: Y-N, M-F""" % ", ".join(data.get_categories()))
        logger.log("notice", "contrasts selected: %s" % response)

        categories = []
        for contrast in response.split(","):
            categories.extend(contrast.split("-"))
        categories = [s.strip() for s in categories]

        factors = data.get_factors(categories)
        if not factors:
            logger.log("warning", "nothing to do, no factors")
            return

        table = data.get_phenotype_table([os.path.splitext(f)[0]
                                          for f in affy.get_identifiers('filename')])
        cn = table[0]
        entries = zip(*table[1:])
        rn = entries[0]

        import rpy
        silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
        rpy.r.library("limma")
        # build the phenotype table in R, cell by cell ('kalle' is just a
        # placeholder value that gets overwritten below)
        silent_eval("a <- matrix('kalle', nrow=%d, ncol=%d)" % (len(rn), len(cn)))
        for i, row in enumerate(entries):
            for j, entry in enumerate(row):
                silent_eval("a[%d, %d] <- '%s'" % (j+1, i+1, entry))
        rpy.r.assign("rn", rn)
        rpy.r.assign("cn", cn)
        silent_eval("rownames(a) <- rn")
        silent_eval("colnames(a) <- cn")

        unique_categories = list(set(categories))

        # compose fancy list of factors for design matrix
        silent_eval("design <- matrix(0, nrow=%d, ncol=%d)"
                    % (len(rn), len(unique_categories)))
        for i, category in enumerate(unique_categories):
            for j, value in enumerate(data.get_category_variable(category)):
                silent_eval("design[%d, %d] <- %d" % (j+1, i+1, value))
        rpy.r.assign("colnames.design", unique_categories)
        silent_eval("colnames(design) <- colnames.design")
        rpy.r.assign("expr", affy.asarray())
        silent_eval("fit <- lmFit(expr, design)")
        # FIXME: interpolating the user's response directly into R code
        # might be a case for code injection...
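        # For the example response "Y-N, M-F", the string built below
        # expands to:
        #   contrast.matrix <- makeContrasts(Y-N, M-F, levels=design)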
        string = "contrast.matrix <- makeContrasts(%s, levels=design)" % response
        silent_eval(string)
        silent_eval("fit2 <- contrasts.fit(fit, contrast.matrix)")
        silent_eval("fit2 <- eBayes(fit2)")

        coeff = rpy.r("fit2$coefficients")
        amean = rpy.r("fit2$Amean")
        padj = rpy.r("p.adjust(fit2$p.value, method='fdr')")

        dim_1, dim_2 = affy.get_dim_names()
        coeff_data = dataset.Dataset(coeff,
                                     [(dim_1, affy.get_identifiers(dim_1)),
                                      ("contrast", [response])],
                                     name="Coefficients")
        amean_data = dataset.Dataset(array(amean),
                                     [("average", ["average"]),
                                      (dim_1, affy.get_identifiers(dim_1))],
                                     name="Average Intensity")
        padj_data = dataset.Dataset(padj,
                                    [(dim_1, affy.get_identifiers(dim_1)),
                                     ("contrast", [response])],
                                    name="Adjusted P-value")
        volcano_plot = plots.ScatterPlot(coeff_data, padj_data,
                                         dim_1, 'contrast',
                                         response, response,
                                         name="Volcano plot")

        # We should be nice and clean up after ourselves
        rpy.r("rm(list=ls())")

        return [coeff_data, amean_data, padj_data, volcano_plot]


class CelFileImportFunction(workflow.Function):
    """Loads Affymetrix .CEL files into a matrix."""

    def __init__(self):
        workflow.Function.__init__(self, 'cel_import', 'Import Affy')

    def run(self):
        import rpy
        chooser = gtk.FileChooserDialog(title="Select cel files...",
                                        parent=None,
                                        action=gtk.FILE_CHOOSER_ACTION_OPEN,
                                        buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
                                                 gtk.STOCK_OPEN, gtk.RESPONSE_OK))
        chooser.set_select_multiple(True)
        cel_filter = gtk.FileFilter()
        cel_filter.set_name("Cel Files (*.cel)")
        cel_filter.add_pattern("*.[cC][eE][lL]")
        all_filter = gtk.FileFilter()
        all_filter.set_name("All Files (*.*)")
        all_filter.add_pattern("*")
        chooser.add_filter(cel_filter)
        chooser.add_filter(all_filter)

        try:
            if chooser.run() == gtk.RESPONSE_OK:
                rpy.r.library("affy")
                silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
                silent_eval('E <- ReadAffy(filenames=c("%s"))'
                            % '", "'.join(chooser.get_filenames()))
                # normalize with RMA, then pull the expression matrix back
                silent_eval('E <- rma(E)')
                m = rpy.r('m <- E@exprs')
                vector_eval = rpy.with_mode(rpy.VECTOR_CONVERSION, rpy.r)
                rownames = vector_eval('rownames(m)')
                colnames = vector_eval('colnames(m)')

                # We should be nice and clean up after ourselves
                rpy.r.rm(["E", "m"])

                if m is not None:
                    data = dataset.Dataset(m,
                                           (('ids', rownames),
                                            ('filename', colnames)),
                                           name="Affymetrix Data")
                    plot = plots.LinePlot(data, "Gene profiles")
                    return [data, plot]
                else:
                    logger.log("notice", "No data loaded from importer.")
        finally:
            chooser.destroy()


class PhenotypeImportFunction(workflow.Function):
    def __init__(self):
        workflow.Function.__init__(self, 'import_phenotype', 'Import Phenotypes')

    def run(self):
        chooser = gtk.FileChooserDialog(title="Select phenotype file...",
                                        parent=None,
                                        action=gtk.FILE_CHOOSER_ACTION_OPEN,
                                        buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
                                                 gtk.STOCK_OPEN, gtk.RESPONSE_OK))
        all_filter = gtk.FileFilter()
        all_filter.set_name("Tab separated file (*.*)")
        all_filter.add_pattern("*")
        chooser.add_filter(all_filter)

        try:
            if chooser.run() == gtk.RESPONSE_OK:
                text = open(chooser.get_filename()).read()
                data = PhenotypeDataset(text)
                return [data]
        finally:
            chooser.destroy()


class PCAFunction(workflow.Function):
    """Generic PCA function."""

    def __init__(self, wf):
        workflow.Function.__init__(self, 'pca', 'PCA')
        self._workflow = wf

    def run(self, data):
        import rpy
        dim_2, dim_1 = data.get_dim_names()

        silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
        rpy.with_mode(rpy.NO_CONVERSION, rpy.r.assign)("m", data.asarray())
        silent_eval("t = prcomp(t(m))")
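        # prcomp on the transposed matrix puts the scores in t$x and the
        # loadings in t$rotation; both are converted back to Datasets below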
        # we make a unique name for the component dimension
        c = 0
        component_dim = prefix = "component"
        while component_dim in data.get_all_dims():
            component_dim = prefix + "_" + str(c)
            c += 1

        T_ids = map(str, range(1, rpy.r("dim(t$x)")[1] + 1))
        T = dataset.Dataset(rpy.r("t$x"),
                            [(dim_1, data.get_identifiers(dim_1)),
                             (component_dim, T_ids)],
                            all_dims=data.get_all_dims(),
                            name="T")
        P = dataset.Dataset(rpy.r("t$rotation"),
                            [(dim_2, data.get_identifiers(dim_2)),
                             (component_dim, T_ids)],
                            all_dims=data.get_all_dims(),
                            name="P")

        # cleanup
        rpy.r.rm(["t", "m"])

        loading_plot = plots.ScatterPlot(P, P, dim_2, component_dim,
                                         '1', '2', "Loadings")
        score_plot = plots.ScatterPlot(T, T, dim_1, component_dim,
                                       '1', '2', "Scores")

        return [T, P, loading_plot, score_plot]


class PhenotypeDataset(dataset.Dataset):
    def __init__(self, string):
        self._table = rows = [line.split("\t") for line in string.splitlines()]
        columns = zip(*rows[1:])
        cel_names = columns[0]
        col_names = rows[0][1:]
        phenotypes = []
        categories = {}
        self._categories = {}

        for col_name, column in zip(col_names, columns[1:]):
            try:
                # numeric column: use it directly as a phenotype
                categories[col_name] = map(float, column)
                phenotypes.append(col_name)
            except ValueError:
                # category data: expand each distinct value into a 0/1 column
                keys = []
                entries = {}
                for i, entry in enumerate(column):
                    if entry not in entries:
                        keys.append(entry)
                        entries[entry] = []
                    entries[entry].append(i)
                for key in keys:
                    self._categories[key] = col_name
                    z = zeros(len(column))
                    for i in entries[key]:
                        z[i] = 1
                    key = "%s-%s" % (col_name, key)
                    phenotypes.append(key)
                    categories[key] = z

        matrix_data = []
        for key in phenotypes:
            matrix_data.append(categories[key])

        if matrix_data:
            a = transpose(array(matrix_data))
        else:
            a = None

        dataset.Dataset.__init__(self, a,
                                 identifiers=[('CEL', cel_names),
                                              ('phenotypes', phenotypes)],
                                 shape=(len(cel_names), len(phenotypes)),
                                 name="Phenotype Data")

    def sort_cels(self, cel_names):
        self._dims = []
        cels = {}
        for row in self._table[1:]:
            cels[row[0]] = row[1:]

        new_table = [self._table[0]]
        for name in cel_names:
            new_table.append([name] + cels[name])

        self._table = new_table
        self._set_identifiers([('CEL', cel_names),
                               ('phenotypes', self.get_identifiers('phenotypes'))],
                              self._all_dims)

    def get_phenotype_table(self, cel_order=None):
        """Get string-based table of phenotypes as read from file."""
        if not cel_order:
            return self._table
        else:
            cels = {}
            for row in self._table[1:]:
                cels[row[0]] = row[1:]
            new_table = [self._table[0]]
            for name in cel_order:
                new_table.append([name] + cels[name])
            return new_table

    def get_categories(self):
        """Get categories of factors.

        If factor 'sick' had possibilities Y/N, and 'sex' M/F, the
        categories would be Y, N, M and F.
        """
        return self._categories.keys()

    def get_factors(self, categories):
        factors = set()
        for c in categories:
            factors.add(self._categories[c])
        return factors

    def get_category_variable(self, category):
        # a bit brute force, but does the job until optimization is
        # necessary
        factor = self._categories[category]
        variable = []
        for column in zip(*self.get_phenotype_table()):
            if column[0] == factor:
                for entry in column[1:]:
                    if entry == category:
                        variable.append(1)
                    else:
                        variable.append(0)
        return variable
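
# Example (hypothetical phenotype file): a tab-separated table with a factor
# column "sick" holding Y/N values yields phenotypes "sick-Y" and "sick-N";
# get_categories() then returns the raw values and get_category_variable()
# gives the 0/1 indicator used to fill the limma design matrix above:
#
#     data = PhenotypeDataset(open("phenotypes.txt").read())
#     data.get_categories()            # e.g. ['Y', 'N']
#     data.get_category_variable("Y")  # e.g. [1, 0, 1, ...]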