Projects/laydi

Implemented Limma function for Affy workflow.

Extended ScatterPlot to take two datasets and updated code using it.
Truls Alexander Tangstad 2006-05-09 13:17:17 +00:00
parent 033d4d5333
commit 5b1af849dc
6 changed files with 144 additions and 24 deletions
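
The ScatterPlot change is the one the rest of the diff adapts to: x-values now come from the first dataset, y-values from the second, and identifier lookups go through the first. A minimal sketch of a call with the new signature, assuming the project's system.dataset and system.plots modules are on the path; the variable names (gene_ids, coeff_data, padj_data) are only illustrative:

    from scipy import randn
    from system import dataset, plots

    # illustrative data: three genes, one contrast column each
    gene_ids = ['g1', 'g2', 'g3']
    coeff_data = dataset.Dataset(randn(3, 1), [('gene', gene_ids), ('contrast', ['Y-N'])],
                                 name="Coefficients")
    padj_data = dataset.Dataset(randn(3, 1), [('gene', gene_ids), ('contrast', ['Y-N'])],
                                name="Adjusted P-value")

    # old: ScatterPlot(dataset, id_dim, sel_dim, id_1, id_2, name=...)
    # new: ScatterPlot(dataset_1, dataset_2, id_dim, sel_dim, id_1, id_2, name=...)
    vulcano_plot = plots.ScatterPlot(coeff_data, padj_data, 'gene',
                                     'contrast', 'Y-N', 'Y-N',
                                     name="Vulcano plot")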

View File

@@ -1,5 +1,5 @@
 import pygtk
-pygtk.require('2.0')
+# pygtk.require('2.0')
 import gtk
 import sys
 import os

View File

@@ -429,18 +429,19 @@ class LinePlot(Plot):
 class ScatterPlot(Plot):
-    def __init__(self, dataset, id_dim, sel_dim, id_1, id_2, name="Scatter plot"):
+    def __init__(self, dataset_1, dataset_2, id_dim, sel_dim, id_1, id_2, name="Scatter plot"):
         Plot.__init__(self, name)
         fig = Figure(figsize=(5,4), dpi=72)
         self.ax = ax = fig.add_subplot(111)
         self.current_dim = id_dim
         # testing testing
-        self.dataset = dataset
-        x_index = dataset[sel_dim][id_1]
-        y_index = dataset[sel_dim][id_2]
-        self.xaxis_data = dataset._array[:,x_index]
-        self.yaxis_data = dataset._array[:,y_index]
+        self.dataset_1 = dataset_1
+
+        x_index = dataset_1[sel_dim][id_1]
+        y_index = dataset_2[sel_dim][id_2]
+        self.xaxis_data = dataset_1._array[:,x_index]
+        self.yaxis_data = dataset_2._array[:,y_index]
         ax.plot(self.xaxis_data,self.yaxis_data,'og')
         ax.set_title(self.get_title())
         ax.set_xlabel("%s - %s" % (sel_dim, id_1))
@@ -478,13 +479,13 @@ class ScatterPlot(Plot):
             #logger.log('debug','Selection y_start bigger than y_end')
             index =scipy.nonzero((xdata>x1) & (xdata<x2) & (ydata<y1) & (ydata>y2))
-        ids = self.dataset.get_identifiers(self.current_dim, index)
+        ids = self.dataset_1.get_identifiers(self.current_dim, index)
         self.selection_listener(self.current_dim, ids)

     def selection_changed(self, selection):
         ids = selection[self.current_dim] # current identifiers
-        index = self.dataset.get_indices(self.current_dim, ids)
+        index = self.dataset_1.get_indices(self.current_dim, ids)
         xdata_new = scipy.take(self.xaxis_data,index) #take data
         ydata_new = scipy.take(self.yaxis_data,index)
         self.ax.clear()

View File

@@ -86,6 +86,29 @@ CEL\tsex\tage\tinfected
         self.assertEquals(set(['F', 'M', 'I', 'N']), set(dataset.get_categories()))

+    def testGetFactors(self):
+        cel_data = """\
+CEL\tsex\tage\tinfected
+02-05-33\tF\t8\tI
+02-05-34\tF\t9\tN
+02-05-35\tM\t8\tI
+"""
+        dataset = PhenotypeDataset(cel_data)
+        self.assertEquals(set(["sex", "infected"]), dataset.get_factors(["F", "I"]))
+
+    def testGetCategoryVariable(self):
+        """Can get set/unset list for given category."""
+        cel_data = """\
+CEL\tsex\tage\tinfected
+02-05-33\tF\t8\tI
+02-05-34\tF\t9\tN
+02-05-35\tM\t8\tI
+"""
+        dataset = PhenotypeDataset(cel_data)
+        self.assertEquals([1, 1, 0], dataset.get_category_variable("F"))
+        self.assertEquals([0, 0, 1], dataset.get_category_variable("M"))
+        self.assertEquals([1, 0, 1], dataset.get_category_variable("I"))
+        self.assertEquals([0, 1, 0], dataset.get_category_variable("N"))
+
 if __name__=='__main__':
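
The new tests pin down get_category_variable as a 0/1 indicator vector over the samples for one category. A rough, self-contained equivalent in plain Python over the same table as the tests; the category_variable helper is only an illustration, not part of the PhenotypeDataset API:

    table = [("CEL", "sex", "age", "infected"),
             ("02-05-33", "F", "8", "I"),
             ("02-05-34", "F", "9", "N"),
             ("02-05-35", "M", "8", "I")]

    def category_variable(table, factor, category):
        # 1 where the sample has the given category in that factor's column, else 0
        col = table[0].index(factor)
        variable = []
        for row in table[1:]:
            if row[col] == category:
                variable.append(1)
            else:
                variable.append(0)
        return variable

    print category_variable(table, "sex", "F")       # [1, 1, 0]
    print category_variable(table, "infected", "I")  # [1, 0, 1]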

View File

@@ -1,4 +1,5 @@
 import gtk
+import os.path
 from system import dataset, logger, plots, workflow, dialogs
 from scipy import randn, array, transpose, zeros
 import cPickle
@@ -17,9 +18,12 @@ class AffyWorkflow (workflow.Workflow):
         load.add_function(PhenotypeImportFunction())
         load.add_function(TestDataFunction())
         load.add_function(DatasetLoadFunction())
-        load.add_function(ContrastMatrixGenerateFunction())
         self.add_stage(load)

+        significance = workflow.Stage('significance', 'Significance analysis')
+        significance.add_function(LimmaFunction())
+        self.add_stage(significance)
+
         explore = workflow.Stage('explore', 'Explorative analysis')
         explore.add_function(PCAFunction(self))
         explore.add_function(PrintFunction())
@@ -106,18 +110,86 @@ class DatasetSaveFunction(workflow.Function):
         chooser.destroy()


-class ContrastMatrixGenerateFunction(workflow.Function):
+class LimmaFunction(workflow.Function):
     def __init__(self):
-        workflow.Function.__init__(self, 'contrast_create', 'Create contrast matrix')
+        workflow.Function.__init__(self, 'limma', 'Limma')

-    def run(self, data):
+    def run(self, affy, data):
         response = dialogs.get_text('Enter contrasts...', """\
 Enter comma-separated list of contrasts.
 Available categories: %s
 Example: Y-N, M-F""" % ", ".join(data.get_categories()))
         logger.log("notice", "contrasts selected: %s" % response)
+
+        categories = []
+        [categories.extend(s.split("-")) for s in response.split(",")]
+        categories = [s.strip() for s in categories]
+
+        factors = data.get_factors(categories)
+        if not factors:
+            logger.log("warning", "nothing to do, no factors")
+
+        table = data.get_phenotype_table()
+        cn = table[0]
+        entries = zip(*table[1:])
+        rn = entries[0]
+
+        import rpy
+        rpy.r.library("limma")
+        rpy.r("a <- matrix('kalle', nrow=%d, ncol=%d)" % (len(rn), len(cn)))
+        for i, row in enumerate(entries):
+            for j, entry in enumerate(row):
+                rpy.r("a[%d, %d] <- '%s'" % (j+1, i+1, entry))
+        rpy.r.assign("rn", rn)
+        rpy.r.assign("cn", cn)
+        rpy.r("rownames(a) <- rn")
+        rpy.r("colnames(a) <- cn")
+
+        unique_categories = list(set(categories))
+
+        # compose fancy list of factors for design matrix
+        rpy.r("design <- matrix(0, nrow=%d, ncol=%d)" % (len(rn), len(unique_categories)))
+        for i, category in enumerate(unique_categories):
+            for j, value in enumerate(data.get_category_variable(category)):
+                rpy.r("design[%d, %d] <- %d" % (j+1, i+1, value))
+
+        rpy.r.assign("colnames.design", unique_categories)
+        rpy.r("colnames(design) <- colnames.design")
+        rpy.r.assign("expr", affy.asarray())
+        rpy.r("fit <- lmFit(expr, design)")
+        # FIXME: might be a case for code injection...
+        string = "contrast.matrix <- makeContrasts(%s, levels=design)" % response
+        rpy.r(string)
+        rpy.r("fit2 <- contrasts.fit(fit, contrast.matrix)")
+        rpy.r("fit2 <- eBayes(fit2)")
+
+        coeff = rpy.r("fit2$coefficients")
+        amean = rpy.r("fit2$Amean")
+        padj = rpy.r("p.adjust(fit2$p.value, method='fdr')")
+
+        dim_1, dim_2 = affy.get_dim_names()
+        coeff_data = dataset.Dataset(coeff, [(dim_1, affy.get_identifiers(dim_1)),
+                                             ("contrast", [response])],
+                                     name="Coefficients")
+        amean_data = dataset.Dataset(array(amean), [("average", ["average"]),
+                                                    (dim_1, affy.get_identifiers(dim_1))],
+                                     name="Average Intensity")
+        padj_data = dataset.Dataset(padj, [(dim_1, affy.get_identifiers(dim_1)),
+                                           ("contrast", [response])],
+                                    name="Adjusted P-value")
+
+        vulcano_plot = plots.ScatterPlot(coeff_data, padj_data, dim_1,
+                                         'contrast', response, response,
+                                         name="Vulcano plot")
+
+        return [coeff_data, amean_data, padj_data, vulcano_plot]
+
+
 class CelFileImportFunction(workflow.Function):
     """Loads Affymetrics .CEL-files into matrix."""
@@ -147,7 +219,6 @@ class CelFileImportFunction(workflow.Function):
             silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
             silent_eval('E <- ReadAffy(filenames=c("%s"))' % '", "'.join(chooser.get_filenames()))
             silent_eval('E <- rma(E)')
-
             m = rpy.r('m <- E@exprs')

             vector_eval = rpy.with_mode(rpy.VECTOR_CONVERSION, rpy.r)
@@ -223,9 +294,9 @@ class PCAFunction(workflow.Function):
         # cleanup
         rpy.r.rm(["t", "m"])

-        loading_plot = plots.ScatterPlot(P, dim_2, component_dim, '1', '2',
+        loading_plot = plots.ScatterPlot(P, P, dim_2, component_dim, '1', '2',
                                          "Loadings")
-        score_plot = plots.ScatterPlot(T, dim_1,component_dim, '1', '2',
+        score_plot = plots.ScatterPlot(T, T, dim_1,component_dim, '1', '2',
                                        "Scores")

         return [T, P, loading_plot, score_plot]
@@ -239,7 +310,7 @@ class PhenotypeDataset(dataset.Dataset):
         col_names = rows[0][1:]
         phenotypes = []
         categories = {}
-        self._categories = set()
+        self._categories = {}

         for col_name, column in zip(col_names, columns[1:]):
             try:
@@ -257,7 +328,7 @@ class PhenotypeDataset(dataset.Dataset):
                     entries[entry].append(i)

                 for key in keys:
-                    self._categories.add(key)
+                    self._categories[key] = col_name
                     z = zeros(len(column))
                     for i in entries[key]:
                         z[i] = 1
@@ -288,4 +359,29 @@ class PhenotypeDataset(dataset.Dataset):
         If factor 'sick' had possibilites Y/N, and 'sex' M/F, the
         categories would be Y, N, M and F.
         """
-        return self._categories
+        return self._categories.keys()
+
+    def get_factors(self, categories):
+        factors = set()
+        for c in categories:
+            factors.add(self._categories[c])
+        return factors
+
+    def get_category_variable(self, category):
+        # abit brute-force, but does the job until optimization is
+        # necessary
+        factor = self._categories[category]
+        variable = []
+
+        for column in zip(*self.get_phenotype_table()):
+            if column[0] == factor:
+                for entry in column[1:]:
+                    if entry == category:
+                        variable.append(1)
+                    else:
+                        variable.append(0)
+
+        return variable
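
LimmaFunction above turns these per-category indicator vectors into the columns of the limma design matrix before handing it to lmFit through rpy. A rough sketch of that assembly step without R, reusing the scipy imports the workflow module already has; the indicator values are the ones asserted in the tests:

    from scipy import array, transpose

    # what get_category_variable would return for the three-sample test table
    variables = {"F": [1, 1, 0],
                 "M": [0, 0, 1],
                 "I": [1, 0, 1],
                 "N": [0, 1, 0]}

    unique_categories = ["F", "M", "I", "N"]
    # one row per sample, one column per category, mirroring the
    # rpy.r("design[%d, %d] <- %d") loop in LimmaFunction.run
    design = transpose(array([variables[c] for c in unique_categories]))

    print design
    # [[1 0 1 0]
    #  [1 0 0 1]
    #  [0 1 1 0]]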

View File

@@ -248,7 +248,7 @@ class PCAFunction(workflow.Function):
         # cleanup
         rpy.r.rm(["t", "m"])

-        loading_plot = plots.ScatterPlot(P,'ids','component','1','2', "Loadings")
-        score_plot = plots.ScatterPlot(T,'filename','component','1','2', "Scores")
+        loading_plot = plots.ScatterPlot(P, P, ,'ids','component','1','2', "Loadings")
+        score_plot = plots.ScatterPlot(T, T,'filename','component','1','2', "Scores")

         return [T, P, loading_plot, score_plot]

View File

@@ -102,9 +102,9 @@ class PCAFunction(Function):
         #tsq = dataset.Dataset(tsq,[singel_def,data_ids[1])

         ## plots
-        loading_plot1 = plots.ScatterPlot(P,'genes','comp','1','2')
-        loading_plot2 = plots.ScatterPlot(P,'genes','comp','3','4')
-        score_plot = plots.ScatterPlot(T,'samples','comp','1','2')
+        loading_plot1 = plots.ScatterPlot(P,P,'genes','comp','1','2')
+        loading_plot2 = plots.ScatterPlot(P,P,'genes','comp','3','4')
+        score_plot = plots.ScatterPlot(T,T,'samples','comp','1','2')

         return [T,P,E,loading_plot1,loading_plot2,score_plot]