From 5b1af849dca04a6de15154114b09375148c2c08f Mon Sep 17 00:00:00 2001 From: tangstad Date: Tue, 9 May 2006 13:17:17 +0000 Subject: [PATCH] Implemented Limma function for Affy workflow. Extended ScatterPlot to take two datasets and updated code using it. --- system/dialogs.py | 2 +- system/plots.py | 17 ++-- test/workflows/affy_workflowtest.py | 23 ++++++ workflows/affy_workflow.py | 116 +++++++++++++++++++++++++--- workflows/go_workflow.py | 4 +- workflows/pca_workflow.py | 6 +- 6 files changed, 144 insertions(+), 24 deletions(-) diff --git a/system/dialogs.py b/system/dialogs.py index 32277a3..e952dae 100644 --- a/system/dialogs.py +++ b/system/dialogs.py @@ -1,5 +1,5 @@ import pygtk -pygtk.require('2.0') +# pygtk.require('2.0') import gtk import sys import os diff --git a/system/plots.py b/system/plots.py index 212ac15..801b9cd 100644 --- a/system/plots.py +++ b/system/plots.py @@ -429,18 +429,19 @@ class LinePlot(Plot): class ScatterPlot(Plot): - def __init__(self, dataset, id_dim, sel_dim, id_1, id_2, name="Scatter plot"): + def __init__(self, dataset_1, dataset_2, id_dim, sel_dim, id_1, id_2, name="Scatter plot"): Plot.__init__(self, name) fig = Figure(figsize=(5,4), dpi=72) self.ax = ax = fig.add_subplot(111) self.current_dim = id_dim # testing testing - self.dataset = dataset - x_index = dataset[sel_dim][id_1] - y_index = dataset[sel_dim][id_2] + self.dataset_1 = dataset_1 - self.xaxis_data = dataset._array[:,x_index] - self.yaxis_data = dataset._array[:,y_index] + x_index = dataset_1[sel_dim][id_1] + y_index = dataset_2[sel_dim][id_2] + + self.xaxis_data = dataset_1._array[:,x_index] + self.yaxis_data = dataset_2._array[:,y_index] ax.plot(self.xaxis_data,self.yaxis_data,'og') ax.set_title(self.get_title()) ax.set_xlabel("%s - %s" % (sel_dim, id_1)) @@ -478,13 +479,13 @@ class ScatterPlot(Plot): #logger.log('debug','Selection y_start bigger than y_end') index =scipy.nonzero((xdata>x1) & (xdatay2)) - ids = self.dataset.get_identifiers(self.current_dim, index) + ids = self.dataset_1.get_identifiers(self.current_dim, index) self.selection_listener(self.current_dim, ids) def selection_changed(self, selection): ids = selection[self.current_dim] # current identifiers - index = self.dataset.get_indices(self.current_dim, ids) + index = self.dataset_1.get_indices(self.current_dim, ids) xdata_new = scipy.take(self.xaxis_data,index) #take data ydata_new = scipy.take(self.yaxis_data,index) self.ax.clear() diff --git a/test/workflows/affy_workflowtest.py b/test/workflows/affy_workflowtest.py index 2b071e2..8c6cdb2 100644 --- a/test/workflows/affy_workflowtest.py +++ b/test/workflows/affy_workflowtest.py @@ -86,6 +86,29 @@ CEL\tsex\tage\tinfected self.assertEquals(set(['F', 'M', 'I', 'N']), set(dataset.get_categories())) + def testGetFactors(self): + cel_data = """\ +CEL\tsex\tage\tinfected +02-05-33\tF\t8\tI +02-05-34\tF\t9\tN +02-05-35\tM\t8\tI +""" + dataset = PhenotypeDataset(cel_data) + self.assertEquals(set(["sex", "infected"]), dataset.get_factors(["F", "I"])) + + def testGetCategoryVariable(self): + """Can get set/unset list for given category.""" + cel_data = """\ +CEL\tsex\tage\tinfected +02-05-33\tF\t8\tI +02-05-34\tF\t9\tN +02-05-35\tM\t8\tI +""" + dataset = PhenotypeDataset(cel_data) + self.assertEquals([1, 1, 0], dataset.get_category_variable("F")) + self.assertEquals([0, 0, 1], dataset.get_category_variable("M")) + self.assertEquals([1, 0, 1], dataset.get_category_variable("I")) + self.assertEquals([0, 1, 0], dataset.get_category_variable("N")) if __name__=='__main__': diff --git a/workflows/affy_workflow.py b/workflows/affy_workflow.py index 5a4e756..349f209 100644 --- a/workflows/affy_workflow.py +++ b/workflows/affy_workflow.py @@ -1,4 +1,5 @@ import gtk +import os.path from system import dataset, logger, plots, workflow, dialogs from scipy import randn, array, transpose, zeros import cPickle @@ -17,9 +18,12 @@ class AffyWorkflow (workflow.Workflow): load.add_function(PhenotypeImportFunction()) load.add_function(TestDataFunction()) load.add_function(DatasetLoadFunction()) - load.add_function(ContrastMatrixGenerateFunction()) self.add_stage(load) + significance = workflow.Stage('significance', 'Significance analysis') + significance.add_function(LimmaFunction()) + self.add_stage(significance) + explore = workflow.Stage('explore', 'Explorative analysis') explore.add_function(PCAFunction(self)) explore.add_function(PrintFunction()) @@ -106,18 +110,86 @@ class DatasetSaveFunction(workflow.Function): chooser.destroy() -class ContrastMatrixGenerateFunction(workflow.Function): +class LimmaFunction(workflow.Function): def __init__(self): - workflow.Function.__init__(self, 'contrast_create', 'Create contrast matrix') + workflow.Function.__init__(self, 'limma', 'Limma') - def run(self, data): + def run(self, affy, data): response = dialogs.get_text('Enter contrasts...', """\ Enter comma-separated list of contrasts. Available categories: %s Example: Y-N, M-F""" % ", ".join(data.get_categories())) + logger.log("notice", "contrasts selected: %s" % response) + categories = [] + [categories.extend(s.split("-")) for s in response.split(",")] + categories = [s.strip() for s in categories] + + factors = data.get_factors(categories) + if not factors: + logger.log("warning", "nothing to do, no factors") + + table = data.get_phenotype_table() + cn = table[0] + entries = zip(*table[1:]) + rn = entries[0] + + import rpy + rpy.r.library("limma") + rpy.r("a <- matrix('kalle', nrow=%d, ncol=%d)" % (len(rn), len(cn))) + for i, row in enumerate(entries): + for j, entry in enumerate(row): + rpy.r("a[%d, %d] <- '%s'" % (j+1, i+1, entry)) + rpy.r.assign("rn", rn) + rpy.r.assign("cn", cn) + rpy.r("rownames(a) <- rn") + rpy.r("colnames(a) <- cn") + + unique_categories = list(set(categories)) + + # compose fancy list of factors for design matrix + rpy.r("design <- matrix(0, nrow=%d, ncol=%d)" % (len(rn), len(unique_categories))) + for i, category in enumerate(unique_categories): + for j, value in enumerate(data.get_category_variable(category)): + rpy.r("design[%d, %d] <- %d" % (j+1, i+1, value)) + + rpy.r.assign("colnames.design", unique_categories) + rpy.r("colnames(design) <- colnames.design") + + rpy.r.assign("expr", affy.asarray()) + rpy.r("fit <- lmFit(expr, design)") + + # FIXME: might be a case for code injection... + string = "contrast.matrix <- makeContrasts(%s, levels=design)" % response + rpy.r(string) + rpy.r("fit2 <- contrasts.fit(fit, contrast.matrix)") + rpy.r("fit2 <- eBayes(fit2)") + coeff = rpy.r("fit2$coefficients") + amean = rpy.r("fit2$Amean") + padj = rpy.r("p.adjust(fit2$p.value, method='fdr')") + + dim_1, dim_2 = affy.get_dim_names() + + + coeff_data = dataset.Dataset(coeff, [(dim_1, affy.get_identifiers(dim_1)), + ("contrast", [response])], + name="Coefficients") + amean_data = dataset.Dataset(array(amean), [("average", ["average"]), + (dim_1, affy.get_identifiers(dim_1))], + name="Average Intensity") + padj_data = dataset.Dataset(padj, [(dim_1, affy.get_identifiers(dim_1)), + ("contrast", [response])], + name="Adjusted P-value") + + vulcano_plot = plots.ScatterPlot(coeff_data, padj_data, dim_1, + 'contrast', response, response, + name="Vulcano plot") + + return [coeff_data, amean_data, padj_data, vulcano_plot] + + class CelFileImportFunction(workflow.Function): """Loads Affymetrics .CEL-files into matrix.""" @@ -147,7 +219,6 @@ class CelFileImportFunction(workflow.Function): silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r) silent_eval('E <- ReadAffy(filenames=c("%s"))' % '", "'.join(chooser.get_filenames())) silent_eval('E <- rma(E)') - m = rpy.r('m <- E@exprs') vector_eval = rpy.with_mode(rpy.VECTOR_CONVERSION, rpy.r) @@ -223,9 +294,9 @@ class PCAFunction(workflow.Function): # cleanup rpy.r.rm(["t", "m"]) - loading_plot = plots.ScatterPlot(P, dim_2, component_dim, '1', '2', + loading_plot = plots.ScatterPlot(P, P, dim_2, component_dim, '1', '2', "Loadings") - score_plot = plots.ScatterPlot(T, dim_1,component_dim, '1', '2', + score_plot = plots.ScatterPlot(T, T, dim_1,component_dim, '1', '2', "Scores") return [T, P, loading_plot, score_plot] @@ -239,7 +310,7 @@ class PhenotypeDataset(dataset.Dataset): col_names = rows[0][1:] phenotypes = [] categories = {} - self._categories = set() + self._categories = {} for col_name, column in zip(col_names, columns[1:]): try: @@ -257,7 +328,7 @@ class PhenotypeDataset(dataset.Dataset): entries[entry].append(i) for key in keys: - self._categories.add(key) + self._categories[key] = col_name z = zeros(len(column)) for i in entries[key]: z[i] = 1 @@ -288,4 +359,29 @@ class PhenotypeDataset(dataset.Dataset): If factor 'sick' had possibilites Y/N, and 'sex' M/F, the categories would be Y, N, M and F. """ - return self._categories + return self._categories.keys() + + def get_factors(self, categories): + factors = set() + for c in categories: + factors.add(self._categories[c]) + + return factors + + def get_category_variable(self, category): + # abit brute-force, but does the job until optimization is + # necessary + factor = self._categories[category] + variable = [] + for column in zip(*self.get_phenotype_table()): + if column[0] == factor: + for entry in column[1:]: + if entry == category: + variable.append(1) + else: + variable.append(0) + + return variable + + + diff --git a/workflows/go_workflow.py b/workflows/go_workflow.py index 3dd9f73..e570b7b 100644 --- a/workflows/go_workflow.py +++ b/workflows/go_workflow.py @@ -248,7 +248,7 @@ class PCAFunction(workflow.Function): # cleanup rpy.r.rm(["t", "m"]) - loading_plot = plots.ScatterPlot(P,'ids','component','1','2', "Loadings") - score_plot = plots.ScatterPlot(T,'filename','component','1','2', "Scores") + loading_plot = plots.ScatterPlot(P, P, ,'ids','component','1','2', "Loadings") + score_plot = plots.ScatterPlot(T, T,'filename','component','1','2', "Scores") return [T, P, loading_plot, score_plot] diff --git a/workflows/pca_workflow.py b/workflows/pca_workflow.py index 4f83a74..05018a6 100644 --- a/workflows/pca_workflow.py +++ b/workflows/pca_workflow.py @@ -102,9 +102,9 @@ class PCAFunction(Function): #tsq = dataset.Dataset(tsq,[singel_def,data_ids[1]) ## plots - loading_plot1 = plots.ScatterPlot(P,'genes','comp','1','2') - loading_plot2 = plots.ScatterPlot(P,'genes','comp','3','4') - score_plot = plots.ScatterPlot(T,'samples','comp','1','2') + loading_plot1 = plots.ScatterPlot(P,P,'genes','comp','1','2') + loading_plot2 = plots.ScatterPlot(P,P,'genes','comp','3','4') + score_plot = plots.ScatterPlot(T,T,'samples','comp','1','2') return [T,P,E,loading_plot1,loading_plot2,score_plot]