Implemented Limma function for Affy workflow.

Extended ScatterPlot to take two datasets and updated code using it.
This commit is contained in:
Truls Alexander Tangstad 2006-05-09 13:17:17 +00:00
parent 033d4d5333
commit 5b1af849dc
6 changed files with 144 additions and 24 deletions

View File

@ -1,5 +1,5 @@
import pygtk
pygtk.require('2.0')
# pygtk.require('2.0')
import gtk
import sys
import os

View File

@ -429,18 +429,19 @@ class LinePlot(Plot):
class ScatterPlot(Plot):
def __init__(self, dataset, id_dim, sel_dim, id_1, id_2, name="Scatter plot"):
def __init__(self, dataset_1, dataset_2, id_dim, sel_dim, id_1, id_2, name="Scatter plot"):
Plot.__init__(self, name)
fig = Figure(figsize=(5,4), dpi=72)
self.ax = ax = fig.add_subplot(111)
self.current_dim = id_dim
# testing testing
self.dataset = dataset
x_index = dataset[sel_dim][id_1]
y_index = dataset[sel_dim][id_2]
self.dataset_1 = dataset_1
self.xaxis_data = dataset._array[:,x_index]
self.yaxis_data = dataset._array[:,y_index]
x_index = dataset_1[sel_dim][id_1]
y_index = dataset_2[sel_dim][id_2]
self.xaxis_data = dataset_1._array[:,x_index]
self.yaxis_data = dataset_2._array[:,y_index]
ax.plot(self.xaxis_data,self.yaxis_data,'og')
ax.set_title(self.get_title())
ax.set_xlabel("%s - %s" % (sel_dim, id_1))
@ -478,13 +479,13 @@ class ScatterPlot(Plot):
#logger.log('debug','Selection y_start bigger than y_end')
index =scipy.nonzero((xdata>x1) & (xdata<x2) & (ydata<y1) & (ydata>y2))
ids = self.dataset.get_identifiers(self.current_dim, index)
ids = self.dataset_1.get_identifiers(self.current_dim, index)
self.selection_listener(self.current_dim, ids)
def selection_changed(self, selection):
ids = selection[self.current_dim] # current identifiers
index = self.dataset.get_indices(self.current_dim, ids)
index = self.dataset_1.get_indices(self.current_dim, ids)
xdata_new = scipy.take(self.xaxis_data,index) #take data
ydata_new = scipy.take(self.yaxis_data,index)
self.ax.clear()

View File

@ -86,6 +86,29 @@ CEL\tsex\tage\tinfected
self.assertEquals(set(['F', 'M', 'I', 'N']), set(dataset.get_categories()))
def testGetFactors(self):
cel_data = """\
CEL\tsex\tage\tinfected
02-05-33\tF\t8\tI
02-05-34\tF\t9\tN
02-05-35\tM\t8\tI
"""
dataset = PhenotypeDataset(cel_data)
self.assertEquals(set(["sex", "infected"]), dataset.get_factors(["F", "I"]))
def testGetCategoryVariable(self):
"""Can get set/unset list for given category."""
cel_data = """\
CEL\tsex\tage\tinfected
02-05-33\tF\t8\tI
02-05-34\tF\t9\tN
02-05-35\tM\t8\tI
"""
dataset = PhenotypeDataset(cel_data)
self.assertEquals([1, 1, 0], dataset.get_category_variable("F"))
self.assertEquals([0, 0, 1], dataset.get_category_variable("M"))
self.assertEquals([1, 0, 1], dataset.get_category_variable("I"))
self.assertEquals([0, 1, 0], dataset.get_category_variable("N"))
if __name__=='__main__':

View File

@ -1,4 +1,5 @@
import gtk
import os.path
from system import dataset, logger, plots, workflow, dialogs
from scipy import randn, array, transpose, zeros
import cPickle
@ -17,9 +18,12 @@ class AffyWorkflow (workflow.Workflow):
load.add_function(PhenotypeImportFunction())
load.add_function(TestDataFunction())
load.add_function(DatasetLoadFunction())
load.add_function(ContrastMatrixGenerateFunction())
self.add_stage(load)
significance = workflow.Stage('significance', 'Significance analysis')
significance.add_function(LimmaFunction())
self.add_stage(significance)
explore = workflow.Stage('explore', 'Explorative analysis')
explore.add_function(PCAFunction(self))
explore.add_function(PrintFunction())
@ -106,18 +110,86 @@ class DatasetSaveFunction(workflow.Function):
chooser.destroy()
class ContrastMatrixGenerateFunction(workflow.Function):
class LimmaFunction(workflow.Function):
def __init__(self):
workflow.Function.__init__(self, 'contrast_create', 'Create contrast matrix')
workflow.Function.__init__(self, 'limma', 'Limma')
def run(self, data):
def run(self, affy, data):
response = dialogs.get_text('Enter contrasts...', """\
Enter comma-separated list of contrasts.
Available categories: %s
Example: Y-N, M-F""" % ", ".join(data.get_categories()))
logger.log("notice", "contrasts selected: %s" % response)
categories = []
[categories.extend(s.split("-")) for s in response.split(",")]
categories = [s.strip() for s in categories]
factors = data.get_factors(categories)
if not factors:
logger.log("warning", "nothing to do, no factors")
table = data.get_phenotype_table()
cn = table[0]
entries = zip(*table[1:])
rn = entries[0]
import rpy
rpy.r.library("limma")
rpy.r("a <- matrix('kalle', nrow=%d, ncol=%d)" % (len(rn), len(cn)))
for i, row in enumerate(entries):
for j, entry in enumerate(row):
rpy.r("a[%d, %d] <- '%s'" % (j+1, i+1, entry))
rpy.r.assign("rn", rn)
rpy.r.assign("cn", cn)
rpy.r("rownames(a) <- rn")
rpy.r("colnames(a) <- cn")
unique_categories = list(set(categories))
# compose fancy list of factors for design matrix
rpy.r("design <- matrix(0, nrow=%d, ncol=%d)" % (len(rn), len(unique_categories)))
for i, category in enumerate(unique_categories):
for j, value in enumerate(data.get_category_variable(category)):
rpy.r("design[%d, %d] <- %d" % (j+1, i+1, value))
rpy.r.assign("colnames.design", unique_categories)
rpy.r("colnames(design) <- colnames.design")
rpy.r.assign("expr", affy.asarray())
rpy.r("fit <- lmFit(expr, design)")
# FIXME: might be a case for code injection...
string = "contrast.matrix <- makeContrasts(%s, levels=design)" % response
rpy.r(string)
rpy.r("fit2 <- contrasts.fit(fit, contrast.matrix)")
rpy.r("fit2 <- eBayes(fit2)")
coeff = rpy.r("fit2$coefficients")
amean = rpy.r("fit2$Amean")
padj = rpy.r("p.adjust(fit2$p.value, method='fdr')")
dim_1, dim_2 = affy.get_dim_names()
coeff_data = dataset.Dataset(coeff, [(dim_1, affy.get_identifiers(dim_1)),
("contrast", [response])],
name="Coefficients")
amean_data = dataset.Dataset(array(amean), [("average", ["average"]),
(dim_1, affy.get_identifiers(dim_1))],
name="Average Intensity")
padj_data = dataset.Dataset(padj, [(dim_1, affy.get_identifiers(dim_1)),
("contrast", [response])],
name="Adjusted P-value")
vulcano_plot = plots.ScatterPlot(coeff_data, padj_data, dim_1,
'contrast', response, response,
name="Vulcano plot")
return [coeff_data, amean_data, padj_data, vulcano_plot]
class CelFileImportFunction(workflow.Function):
"""Loads Affymetrics .CEL-files into matrix."""
@ -147,7 +219,6 @@ class CelFileImportFunction(workflow.Function):
silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
silent_eval('E <- ReadAffy(filenames=c("%s"))' % '", "'.join(chooser.get_filenames()))
silent_eval('E <- rma(E)')
m = rpy.r('m <- E@exprs')
vector_eval = rpy.with_mode(rpy.VECTOR_CONVERSION, rpy.r)
@ -223,9 +294,9 @@ class PCAFunction(workflow.Function):
# cleanup
rpy.r.rm(["t", "m"])
loading_plot = plots.ScatterPlot(P, dim_2, component_dim, '1', '2',
loading_plot = plots.ScatterPlot(P, P, dim_2, component_dim, '1', '2',
"Loadings")
score_plot = plots.ScatterPlot(T, dim_1,component_dim, '1', '2',
score_plot = plots.ScatterPlot(T, T, dim_1,component_dim, '1', '2',
"Scores")
return [T, P, loading_plot, score_plot]
@ -239,7 +310,7 @@ class PhenotypeDataset(dataset.Dataset):
col_names = rows[0][1:]
phenotypes = []
categories = {}
self._categories = set()
self._categories = {}
for col_name, column in zip(col_names, columns[1:]):
try:
@ -257,7 +328,7 @@ class PhenotypeDataset(dataset.Dataset):
entries[entry].append(i)
for key in keys:
self._categories.add(key)
self._categories[key] = col_name
z = zeros(len(column))
for i in entries[key]:
z[i] = 1
@ -288,4 +359,29 @@ class PhenotypeDataset(dataset.Dataset):
If factor 'sick' had possibilites Y/N, and 'sex' M/F, the
categories would be Y, N, M and F.
"""
return self._categories
return self._categories.keys()
def get_factors(self, categories):
factors = set()
for c in categories:
factors.add(self._categories[c])
return factors
def get_category_variable(self, category):
# abit brute-force, but does the job until optimization is
# necessary
factor = self._categories[category]
variable = []
for column in zip(*self.get_phenotype_table()):
if column[0] == factor:
for entry in column[1:]:
if entry == category:
variable.append(1)
else:
variable.append(0)
return variable

View File

@ -248,7 +248,7 @@ class PCAFunction(workflow.Function):
# cleanup
rpy.r.rm(["t", "m"])
loading_plot = plots.ScatterPlot(P,'ids','component','1','2', "Loadings")
score_plot = plots.ScatterPlot(T,'filename','component','1','2', "Scores")
loading_plot = plots.ScatterPlot(P, P, ,'ids','component','1','2', "Loadings")
score_plot = plots.ScatterPlot(T, T,'filename','component','1','2', "Scores")
return [T, P, loading_plot, score_plot]

View File

@ -102,9 +102,9 @@ class PCAFunction(Function):
#tsq = dataset.Dataset(tsq,[singel_def,data_ids[1])
## plots
loading_plot1 = plots.ScatterPlot(P,'genes','comp','1','2')
loading_plot2 = plots.ScatterPlot(P,'genes','comp','3','4')
score_plot = plots.ScatterPlot(T,'samples','comp','1','2')
loading_plot1 = plots.ScatterPlot(P,P,'genes','comp','1','2')
loading_plot2 = plots.ScatterPlot(P,P,'genes','comp','3','4')
score_plot = plots.ScatterPlot(T,T,'samples','comp','1','2')
return [T,P,E,loading_plot1,loading_plot2,score_plot]