This repository has been archived on 2024-07-04. You can view files and clone it, but cannot push or open issues or pull requests.
laydi/workflows/affy_workflow.py
2006-05-09 14:16:22 +00:00

413 lines
15 KiB
Python

import gtk
import os.path
from system import dataset, logger, plots, workflow, dialogs
from scipy import randn, array, transpose, zeros
import cPickle
class AffyWorkflow (workflow.Workflow):
name = 'Affy Workflow'
ident = 'affy'
description = 'Affymetrics Workflow. Analysis of Affy-data.'
def __init__(self, app):
workflow.Workflow.__init__(self, app)
load = workflow.Stage('load', 'Load Data')
load.add_function(CelFileImportFunction())
load.add_function(PhenotypeImportFunction())
load.add_function(TestDataFunction())
load.add_function(DatasetLoadFunction())
self.add_stage(load)
significance = workflow.Stage('significance', 'Significance analysis')
significance.add_function(LimmaFunction())
self.add_stage(significance)
explore = workflow.Stage('explore', 'Explorative analysis')
explore.add_function(PCAFunction(self))
explore.add_function(PrintFunction())
self.add_stage(explore)
save = workflow.Stage('save', 'Save Data')
save.add_function(DatasetSaveFunction())
self.add_stage(save)
class PrintFunction(workflow.Function):
def __init__(self):
workflow.Function.__init__(self, 'printer', 'Print Stuff')
def run(self, data):
dim1, dim2 = data.get_dim_names()
print dim1, dim2
print "\t", "\t".join(data.get_identifiers(dim2))
for row in zip(data.get_identifiers(dim1), data.asarray().tolist()):
print "\t".join(map(str, row))
class TestDataFunction(workflow.Function):
def __init__(self):
workflow.Function.__init__(self, 'test_data', 'Generate Test Data')
def run(self):
logger.log('notice', 'Injecting foo test data')
x = randn(20,30)
X = dataset.Dataset(x)
return [X, plots.LinePlot(X)]
class DatasetLoadFunction(workflow.Function):
"""Loader for previously pickled Datasets."""
def __init__(self):
workflow.Function.__init__(self, 'load_data', 'Load Pickled Dataset')
def run(self):
chooser = gtk.FileChooserDialog(title="Select cel files...", parent=None,
action=gtk.FILE_CHOOSER_ACTION_OPEN,
buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
gtk.STOCK_OPEN, gtk.RESPONSE_OK))
pkl_filter = gtk.FileFilter()
pkl_filter.set_name("Python pickled data files (*.pkl)")
pkl_filter.add_pattern("*.[pP][kK][lL]")
all_filter = gtk.FileFilter()
all_filter.set_name("All Files (*.*)")
all_filter.add_pattern("*")
chooser.add_filter(pkl_filter)
chooser.add_filter(all_filter)
try:
if chooser.run() == gtk.RESPONSE_OK:
return [cPickle.load(open(chooser.get_filename()))]
finally:
chooser.destroy()
class DatasetSaveFunction(workflow.Function):
"""QND way to save data to file for later import to this program."""
def __init__(self):
workflow.Function.__init__(self, 'save_data', 'Save Pickled Dataset')
def run(self, data):
chooser = gtk.FileChooserDialog(title="Save pickled data...", parent=None,
action=gtk.FILE_CHOOSER_ACTION_SAVE,
buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
gtk.STOCK_SAVE, gtk.RESPONSE_OK))
pkl_filter = gtk.FileFilter()
pkl_filter.set_name("Python pickled data files (*.pkl)")
pkl_filter.add_pattern("*.[pP][kK][lL]")
all_filter = gtk.FileFilter()
all_filter.set_name("All Files (*.*)")
all_filter.add_pattern("*")
chooser.add_filter(pkl_filter)
chooser.add_filter(all_filter)
chooser.set_current_name(data.get_name() + ".pkl")
try:
if chooser.run() == gtk.RESPONSE_OK:
cPickle.dump(data, open(chooser.get_filename(), "w"), protocol=2)
logger.log("notice", "Saved data to %r." % chooser.get_filename())
finally:
chooser.destroy()
class LimmaFunction(workflow.Function):
def __init__(self):
workflow.Function.__init__(self, 'limma', 'Limma')
def run(self, affy, data):
response = dialogs.get_text('Enter contrasts...', """\
Enter comma-separated list of contrasts.
Available categories: %s
Example: Y-N, M-F""" % ", ".join(data.get_categories()))
logger.log("notice", "contrasts selected: %s" % response)
categories = []
[categories.extend(s.split("-")) for s in response.split(",")]
categories = [s.strip() for s in categories]
factors = data.get_factors(categories)
if not factors:
logger.log("warning", "nothing to do, no factors")
table = data.get_phenotype_table([os.path.splitext(f)[0] for f in affy.get_identifiers('filename')])
cn = table[0]
entries = zip(*table[1:])
rn = entries[0]
import rpy
silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
rpy.r.library("limma")
silent_eval("a <- matrix('kalle', nrow=%d, ncol=%d)" % (len(rn), len(cn)))
for i, row in enumerate(entries):
for j, entry in enumerate(row):
silent_eval("a[%d, %d] <- '%s'" % (j+1, i+1, entry))
rpy.r.assign("rn", rn)
rpy.r.assign("cn", cn)
silent_eval("rownames(a) <- rn")
silent_eval("colnames(a) <- cn")
unique_categories = list(set(categories))
# compose fancy list of factors for design matrix
silent_eval("design <- matrix(0, nrow=%d, ncol=%d)" % (len(rn), len(unique_categories)))
for i, category in enumerate(unique_categories):
for j, value in enumerate(data.get_category_variable(category)):
silent_eval("design[%d, %d] <- %d" % (j+1, i+1, value))
rpy.r.assign("colnames.design", unique_categories)
silent_eval("colnames(design) <- colnames.design")
rpy.r.assign("expr", affy.asarray())
silent_eval("fit <- lmFit(expr, design)")
silent_eval("contrast.matrix <- makeContrasts(%s, levels=design)" % response)
silent_eval("fit2 <- contrasts.fit(fit, contrast.matrix)")
silent_eval("fit2 <- eBayes(fit2)")
coeff = rpy.r("fit2$coefficients")
amean = rpy.r("fit2$Amean")
padj = rpy.r("p.adjust(fit2$p.value, method='fdr')")
dim_1, dim_2 = affy.get_dim_names()
coeff_data = dataset.Dataset(coeff, [(dim_1, affy.get_identifiers(dim_1)),
("contrast", [response])],
name="Coefficients")
amean_data = dataset.Dataset(array(amean), [("average", ["average"]),
(dim_1, affy.get_identifiers(dim_1))],
name="Average Intensity")
padj_data = dataset.Dataset(padj, [(dim_1, affy.get_identifiers(dim_1)),
("contrast", [response])],
name="Adjusted P-value")
vulcano_plot = plots.ScatterPlot(coeff_data, padj_data, dim_1,
'contrast', response, response,
name="Vulcano plot")
# We should be nice and clean up after ourselves
rpy.r("rm(list=ls())")
return [coeff_data, amean_data, padj_data, vulcano_plot]
class CelFileImportFunction(workflow.Function):
"""Loads Affymetrics .CEL-files into matrix."""
def __init__(self):
workflow.Function.__init__(self, 'cel_import', 'Import Affy')
def run(self):
import rpy
chooser = gtk.FileChooserDialog(title="Select cel files...", parent=None,
action=gtk.FILE_CHOOSER_ACTION_OPEN,
buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
gtk.STOCK_OPEN, gtk.RESPONSE_OK))
chooser.set_select_multiple(True)
cel_filter = gtk.FileFilter()
cel_filter.set_name("Cel Files (*.cel)")
cel_filter.add_pattern("*.[cC][eE][lL]")
all_filter = gtk.FileFilter()
all_filter.set_name("All Files (*.*)")
all_filter.add_pattern("*")
chooser.add_filter(cel_filter)
chooser.add_filter(all_filter)
try:
if chooser.run() == gtk.RESPONSE_OK:
rpy.r.library("affy")
silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
silent_eval('E <- ReadAffy(filenames=c("%s"))' % '", "'.join(chooser.get_filenames()))
silent_eval('E <- rma(E)')
m = rpy.r('m <- E@exprs')
vector_eval = rpy.with_mode(rpy.VECTOR_CONVERSION, rpy.r)
rownames = vector_eval('rownames(m)')
colnames = vector_eval('colnames(m)')
# We should be nice and clean up after ourselves
rpy.r.rm(["E", "m"])
if m:
data = dataset.Dataset(m, (('ids', rownames), ('filename', colnames)), name="Affymetrics Data")
plot = plots.LinePlot(data, "Gene profiles")
return [data, plot]
else:
logger.log("notice", "No data loaded from importer.")
finally:
chooser.destroy()
class PhenotypeImportFunction(workflow.Function):
def __init__(self):
workflow.Function.__init__(self, 'import_phenotype', 'Import Phenotypes')
def run(self):
chooser = gtk.FileChooserDialog(title="Select cel files...", parent=None,
action=gtk.FILE_CHOOSER_ACTION_OPEN,
buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
gtk.STOCK_OPEN, gtk.RESPONSE_OK))
all_filter = gtk.FileFilter()
all_filter.set_name("Tab separated file (*.*)")
all_filter.add_pattern("*")
chooser.add_filter(all_filter)
try:
if chooser.run() == gtk.RESPONSE_OK:
text = open(chooser.get_filename()).read()
data = PhenotypeDataset(text)
return [data]
finally:
chooser.destroy()
class PCAFunction(workflow.Function):
"""Generic PCA function."""
def __init__(self, wf):
workflow.Function.__init__(self, 'pca', 'PCA')
self._workflow = wf
def run(self,data):
import rpy
dim_2, dim_1 = data.get_dim_names()
silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
rpy.with_mode(rpy.NO_CONVERSION, rpy.r.assign)("m", data.asarray())
silent_eval("t = prcomp(t(m))")
# we make a unique name for component dimension
c = 0
component_dim = prefix = "component"
while component_dim in data.get_all_dims():
component_dim = prefix + "_" + str(c)
c += 1
T_ids = map(str, range(1, rpy.r("dim(t$x)")[1]+1))
T = dataset.Dataset(rpy.r("t$x"), [(dim_1, data.get_identifiers(dim_1)),
(component_dim, T_ids)],
all_dims = data.get_all_dims(), name="T")
P = dataset.Dataset(rpy.r("t$rotation"), [(dim_2, data.get_identifiers(dim_2)),
(component_dim, T_ids)],
all_dims = data.get_all_dims(), name="P")
# cleanup
rpy.r.rm(["t", "m"])
loading_plot = plots.ScatterPlot(P, P, dim_2, component_dim, '1', '2',
"Loadings")
score_plot = plots.ScatterPlot(T, T, dim_1,component_dim, '1', '2',
"Scores")
return [T, P, loading_plot, score_plot]
class PhenotypeDataset(dataset.Dataset):
def __init__(self, string):
self._table = rows = [line.split("\t") for line in string.splitlines()]
columns = zip(*rows[1:])
cel_names = columns[0]
col_names = rows[0][1:]
phenotypes = []
categories = {}
self._categories = {}
for col_name, column in zip(col_names, columns[1:]):
try:
categories[col_name] = map(float, column)
phenotypes.append(col_name)
except ValueError:
# category-data
keys = []
entries = {}
for i, entry in enumerate(column):
if entry not in entries:
keys.append(entry)
entries[entry] = []
entries[entry].append(i)
for key in keys:
self._categories[key] = col_name
z = zeros(len(column))
for i in entries[key]:
z[i] = 1
key = "%s-%s" % (col_name, key)
phenotypes.append(key)
categories[key] = z
matrix_data = []
for key in phenotypes:
matrix_data.append(categories[key])
if matrix_data:
a = transpose(array(matrix_data))
else:
a = None
dataset.Dataset.__init__(self, a, identifiers=[('CEL', cel_names),
('phenotypes', phenotypes)],
shape=(len(cel_names),len(phenotypes)), name="Phenotype Data")
def sort_cels(self, cel_names):
self._dims = []
cels = {}
for row in self._table[1:]:
cels[row[0]] = row[1:]
new_table = [self._table[0]]
for name in cel_names:
new_table.append([name] + cels[name])
self._table = new_table
self._set_identifiers([('CEL', cel_names), ('phenotypes', self.get_identifiers('phenotypes'))], self._all_dims)
def get_phenotype_table(self, cel_order=None):
"""Get string based table of phenotypes as read from file."""
if not cel_order:
return self._table
else:
cels = {}
for row in self._table[1:]:
cels[row[0]] = row[1:]
new_table = [self._table[0]]
for name in cel_order:
new_table.append([name] + cels[name])
return new_table
def get_categories(self):
"""Get categories of factors.
If factor 'sick' had possibilites Y/N, and 'sex' M/F, the
categories would be Y, N, M and F.
"""
return self._categories.keys()
def get_factors(self, categories):
factors = set()
for c in categories:
factors.add(self._categories[c])
return factors
def get_category_variable(self, category):
# abit brute-force, but does the job until optimization is
# necessary
factor = self._categories[category]
variable = []
for column in zip(*self.get_phenotype_table()):
if column[0] == factor:
for entry in column[1:]:
if entry == category:
variable.append(1)
else:
variable.append(0)
return variable