Implemented Limma function for Affy workflow.
Extended ScatterPlot to take two datasets and updated code using it.
This commit is contained in:
parent
033d4d5333
commit
5b1af849dc
|
@ -1,5 +1,5 @@
|
|||
import pygtk
|
||||
pygtk.require('2.0')
|
||||
# pygtk.require('2.0')
|
||||
import gtk
|
||||
import sys
|
||||
import os
|
||||
|
|
|
@ -429,18 +429,19 @@ class LinePlot(Plot):
|
|||
|
||||
|
||||
class ScatterPlot(Plot):
|
||||
def __init__(self, dataset, id_dim, sel_dim, id_1, id_2, name="Scatter plot"):
|
||||
def __init__(self, dataset_1, dataset_2, id_dim, sel_dim, id_1, id_2, name="Scatter plot"):
|
||||
Plot.__init__(self, name)
|
||||
fig = Figure(figsize=(5,4), dpi=72)
|
||||
self.ax = ax = fig.add_subplot(111)
|
||||
self.current_dim = id_dim
|
||||
# testing testing
|
||||
self.dataset = dataset
|
||||
x_index = dataset[sel_dim][id_1]
|
||||
y_index = dataset[sel_dim][id_2]
|
||||
self.dataset_1 = dataset_1
|
||||
|
||||
self.xaxis_data = dataset._array[:,x_index]
|
||||
self.yaxis_data = dataset._array[:,y_index]
|
||||
x_index = dataset_1[sel_dim][id_1]
|
||||
y_index = dataset_2[sel_dim][id_2]
|
||||
|
||||
self.xaxis_data = dataset_1._array[:,x_index]
|
||||
self.yaxis_data = dataset_2._array[:,y_index]
|
||||
ax.plot(self.xaxis_data,self.yaxis_data,'og')
|
||||
ax.set_title(self.get_title())
|
||||
ax.set_xlabel("%s - %s" % (sel_dim, id_1))
|
||||
|
@ -478,13 +479,13 @@ class ScatterPlot(Plot):
|
|||
#logger.log('debug','Selection y_start bigger than y_end')
|
||||
index =scipy.nonzero((xdata>x1) & (xdata<x2) & (ydata<y1) & (ydata>y2))
|
||||
|
||||
ids = self.dataset.get_identifiers(self.current_dim, index)
|
||||
ids = self.dataset_1.get_identifiers(self.current_dim, index)
|
||||
self.selection_listener(self.current_dim, ids)
|
||||
|
||||
def selection_changed(self, selection):
|
||||
ids = selection[self.current_dim] # current identifiers
|
||||
|
||||
index = self.dataset.get_indices(self.current_dim, ids)
|
||||
index = self.dataset_1.get_indices(self.current_dim, ids)
|
||||
xdata_new = scipy.take(self.xaxis_data,index) #take data
|
||||
ydata_new = scipy.take(self.yaxis_data,index)
|
||||
self.ax.clear()
|
||||
|
|
|
@ -86,6 +86,29 @@ CEL\tsex\tage\tinfected
|
|||
|
||||
self.assertEquals(set(['F', 'M', 'I', 'N']), set(dataset.get_categories()))
|
||||
|
||||
def testGetFactors(self):
|
||||
cel_data = """\
|
||||
CEL\tsex\tage\tinfected
|
||||
02-05-33\tF\t8\tI
|
||||
02-05-34\tF\t9\tN
|
||||
02-05-35\tM\t8\tI
|
||||
"""
|
||||
dataset = PhenotypeDataset(cel_data)
|
||||
self.assertEquals(set(["sex", "infected"]), dataset.get_factors(["F", "I"]))
|
||||
|
||||
def testGetCategoryVariable(self):
|
||||
"""Can get set/unset list for given category."""
|
||||
cel_data = """\
|
||||
CEL\tsex\tage\tinfected
|
||||
02-05-33\tF\t8\tI
|
||||
02-05-34\tF\t9\tN
|
||||
02-05-35\tM\t8\tI
|
||||
"""
|
||||
dataset = PhenotypeDataset(cel_data)
|
||||
self.assertEquals([1, 1, 0], dataset.get_category_variable("F"))
|
||||
self.assertEquals([0, 0, 1], dataset.get_category_variable("M"))
|
||||
self.assertEquals([1, 0, 1], dataset.get_category_variable("I"))
|
||||
self.assertEquals([0, 1, 0], dataset.get_category_variable("N"))
|
||||
|
||||
|
||||
if __name__=='__main__':
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import gtk
|
||||
import os.path
|
||||
from system import dataset, logger, plots, workflow, dialogs
|
||||
from scipy import randn, array, transpose, zeros
|
||||
import cPickle
|
||||
|
@ -17,9 +18,12 @@ class AffyWorkflow (workflow.Workflow):
|
|||
load.add_function(PhenotypeImportFunction())
|
||||
load.add_function(TestDataFunction())
|
||||
load.add_function(DatasetLoadFunction())
|
||||
load.add_function(ContrastMatrixGenerateFunction())
|
||||
self.add_stage(load)
|
||||
|
||||
significance = workflow.Stage('significance', 'Significance analysis')
|
||||
significance.add_function(LimmaFunction())
|
||||
self.add_stage(significance)
|
||||
|
||||
explore = workflow.Stage('explore', 'Explorative analysis')
|
||||
explore.add_function(PCAFunction(self))
|
||||
explore.add_function(PrintFunction())
|
||||
|
@ -106,18 +110,86 @@ class DatasetSaveFunction(workflow.Function):
|
|||
chooser.destroy()
|
||||
|
||||
|
||||
class ContrastMatrixGenerateFunction(workflow.Function):
|
||||
class LimmaFunction(workflow.Function):
|
||||
def __init__(self):
|
||||
workflow.Function.__init__(self, 'contrast_create', 'Create contrast matrix')
|
||||
workflow.Function.__init__(self, 'limma', 'Limma')
|
||||
|
||||
def run(self, data):
|
||||
def run(self, affy, data):
|
||||
response = dialogs.get_text('Enter contrasts...', """\
|
||||
Enter comma-separated list of contrasts.
|
||||
Available categories: %s
|
||||
|
||||
Example: Y-N, M-F""" % ", ".join(data.get_categories()))
|
||||
|
||||
logger.log("notice", "contrasts selected: %s" % response)
|
||||
|
||||
categories = []
|
||||
[categories.extend(s.split("-")) for s in response.split(",")]
|
||||
categories = [s.strip() for s in categories]
|
||||
|
||||
factors = data.get_factors(categories)
|
||||
if not factors:
|
||||
logger.log("warning", "nothing to do, no factors")
|
||||
|
||||
table = data.get_phenotype_table()
|
||||
cn = table[0]
|
||||
entries = zip(*table[1:])
|
||||
rn = entries[0]
|
||||
|
||||
import rpy
|
||||
rpy.r.library("limma")
|
||||
rpy.r("a <- matrix('kalle', nrow=%d, ncol=%d)" % (len(rn), len(cn)))
|
||||
for i, row in enumerate(entries):
|
||||
for j, entry in enumerate(row):
|
||||
rpy.r("a[%d, %d] <- '%s'" % (j+1, i+1, entry))
|
||||
rpy.r.assign("rn", rn)
|
||||
rpy.r.assign("cn", cn)
|
||||
rpy.r("rownames(a) <- rn")
|
||||
rpy.r("colnames(a) <- cn")
|
||||
|
||||
unique_categories = list(set(categories))
|
||||
|
||||
# compose fancy list of factors for design matrix
|
||||
rpy.r("design <- matrix(0, nrow=%d, ncol=%d)" % (len(rn), len(unique_categories)))
|
||||
for i, category in enumerate(unique_categories):
|
||||
for j, value in enumerate(data.get_category_variable(category)):
|
||||
rpy.r("design[%d, %d] <- %d" % (j+1, i+1, value))
|
||||
|
||||
rpy.r.assign("colnames.design", unique_categories)
|
||||
rpy.r("colnames(design) <- colnames.design")
|
||||
|
||||
rpy.r.assign("expr", affy.asarray())
|
||||
rpy.r("fit <- lmFit(expr, design)")
|
||||
|
||||
# FIXME: might be a case for code injection...
|
||||
string = "contrast.matrix <- makeContrasts(%s, levels=design)" % response
|
||||
rpy.r(string)
|
||||
rpy.r("fit2 <- contrasts.fit(fit, contrast.matrix)")
|
||||
rpy.r("fit2 <- eBayes(fit2)")
|
||||
coeff = rpy.r("fit2$coefficients")
|
||||
amean = rpy.r("fit2$Amean")
|
||||
padj = rpy.r("p.adjust(fit2$p.value, method='fdr')")
|
||||
|
||||
dim_1, dim_2 = affy.get_dim_names()
|
||||
|
||||
|
||||
coeff_data = dataset.Dataset(coeff, [(dim_1, affy.get_identifiers(dim_1)),
|
||||
("contrast", [response])],
|
||||
name="Coefficients")
|
||||
amean_data = dataset.Dataset(array(amean), [("average", ["average"]),
|
||||
(dim_1, affy.get_identifiers(dim_1))],
|
||||
name="Average Intensity")
|
||||
padj_data = dataset.Dataset(padj, [(dim_1, affy.get_identifiers(dim_1)),
|
||||
("contrast", [response])],
|
||||
name="Adjusted P-value")
|
||||
|
||||
vulcano_plot = plots.ScatterPlot(coeff_data, padj_data, dim_1,
|
||||
'contrast', response, response,
|
||||
name="Vulcano plot")
|
||||
|
||||
return [coeff_data, amean_data, padj_data, vulcano_plot]
|
||||
|
||||
|
||||
|
||||
class CelFileImportFunction(workflow.Function):
|
||||
"""Loads Affymetrics .CEL-files into matrix."""
|
||||
|
@ -147,7 +219,6 @@ class CelFileImportFunction(workflow.Function):
|
|||
silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
|
||||
silent_eval('E <- ReadAffy(filenames=c("%s"))' % '", "'.join(chooser.get_filenames()))
|
||||
silent_eval('E <- rma(E)')
|
||||
|
||||
m = rpy.r('m <- E@exprs')
|
||||
|
||||
vector_eval = rpy.with_mode(rpy.VECTOR_CONVERSION, rpy.r)
|
||||
|
@ -223,9 +294,9 @@ class PCAFunction(workflow.Function):
|
|||
# cleanup
|
||||
rpy.r.rm(["t", "m"])
|
||||
|
||||
loading_plot = plots.ScatterPlot(P, dim_2, component_dim, '1', '2',
|
||||
loading_plot = plots.ScatterPlot(P, P, dim_2, component_dim, '1', '2',
|
||||
"Loadings")
|
||||
score_plot = plots.ScatterPlot(T, dim_1,component_dim, '1', '2',
|
||||
score_plot = plots.ScatterPlot(T, T, dim_1,component_dim, '1', '2',
|
||||
"Scores")
|
||||
|
||||
return [T, P, loading_plot, score_plot]
|
||||
|
@ -239,7 +310,7 @@ class PhenotypeDataset(dataset.Dataset):
|
|||
col_names = rows[0][1:]
|
||||
phenotypes = []
|
||||
categories = {}
|
||||
self._categories = set()
|
||||
self._categories = {}
|
||||
|
||||
for col_name, column in zip(col_names, columns[1:]):
|
||||
try:
|
||||
|
@ -257,7 +328,7 @@ class PhenotypeDataset(dataset.Dataset):
|
|||
entries[entry].append(i)
|
||||
|
||||
for key in keys:
|
||||
self._categories.add(key)
|
||||
self._categories[key] = col_name
|
||||
z = zeros(len(column))
|
||||
for i in entries[key]:
|
||||
z[i] = 1
|
||||
|
@ -288,4 +359,29 @@ class PhenotypeDataset(dataset.Dataset):
|
|||
If factor 'sick' had possibilites Y/N, and 'sex' M/F, the
|
||||
categories would be Y, N, M and F.
|
||||
"""
|
||||
return self._categories
|
||||
return self._categories.keys()
|
||||
|
||||
def get_factors(self, categories):
|
||||
factors = set()
|
||||
for c in categories:
|
||||
factors.add(self._categories[c])
|
||||
|
||||
return factors
|
||||
|
||||
def get_category_variable(self, category):
|
||||
# abit brute-force, but does the job until optimization is
|
||||
# necessary
|
||||
factor = self._categories[category]
|
||||
variable = []
|
||||
for column in zip(*self.get_phenotype_table()):
|
||||
if column[0] == factor:
|
||||
for entry in column[1:]:
|
||||
if entry == category:
|
||||
variable.append(1)
|
||||
else:
|
||||
variable.append(0)
|
||||
|
||||
return variable
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -248,7 +248,7 @@ class PCAFunction(workflow.Function):
|
|||
# cleanup
|
||||
rpy.r.rm(["t", "m"])
|
||||
|
||||
loading_plot = plots.ScatterPlot(P,'ids','component','1','2', "Loadings")
|
||||
score_plot = plots.ScatterPlot(T,'filename','component','1','2', "Scores")
|
||||
loading_plot = plots.ScatterPlot(P, P, ,'ids','component','1','2', "Loadings")
|
||||
score_plot = plots.ScatterPlot(T, T,'filename','component','1','2', "Scores")
|
||||
|
||||
return [T, P, loading_plot, score_plot]
|
||||
|
|
|
@ -102,9 +102,9 @@ class PCAFunction(Function):
|
|||
#tsq = dataset.Dataset(tsq,[singel_def,data_ids[1])
|
||||
|
||||
## plots
|
||||
loading_plot1 = plots.ScatterPlot(P,'genes','comp','1','2')
|
||||
loading_plot2 = plots.ScatterPlot(P,'genes','comp','3','4')
|
||||
score_plot = plots.ScatterPlot(T,'samples','comp','1','2')
|
||||
loading_plot1 = plots.ScatterPlot(P,P,'genes','comp','1','2')
|
||||
loading_plot2 = plots.ScatterPlot(P,P,'genes','comp','3','4')
|
||||
score_plot = plots.ScatterPlot(T,T,'samples','comp','1','2')
|
||||
|
||||
return [T,P,E,loading_plot1,loading_plot2,score_plot]
|
||||
|
||||
|
|
Reference in New Issue