2006-04-27 14:15:30 +02:00
|
|
|
import gtk
|
|
|
|
from system import dataset, logger, plots, workflow
|
2006-05-03 16:04:28 +02:00
|
|
|
from scipy import randn, array, transpose, zeros
|
2006-04-27 14:15:30 +02:00
|
|
|
import cPickle
|
|
|
|
|
2006-04-28 13:44:55 +02:00
|
|
|
|
2006-04-27 14:15:30 +02:00
|
|
|
class AffyWorkflow (workflow.Workflow):
    """Workflow for analysis of Affymetrics microarray data.

    Wires up three stages: data import, explorative analysis and
    persistence to disk.
    """

    name = 'Affy Workflow'
    ident = 'affy'
    description = 'Affymetrics Workflow. Analysis of Affy-data.'

    def __init__(self, app):
        workflow.Workflow.__init__(self, app)

        # Stage 1: the various ways to get data into the application.
        load = workflow.Stage('load', 'Load Data')
        for function in (CelFileImportFunction(),
                         PhenotypeImportFunction(),
                         TestDataFunction(),
                         DatasetLoadFunction()):
            load.add_function(function)
        self.add_stage(load)

        # Stage 2: explorative analysis of the loaded data.
        explore = workflow.Stage('explore', 'Explorative analysis')
        for function in (PCAFunction(self), PrintFunction()):
            explore.add_function(function)
        self.add_stage(explore)

        # Stage 3: saving results for a later session.
        save = workflow.Stage('save', 'Save Data')
        save.add_function(DatasetSaveFunction())
        self.add_stage(save)
|
2006-05-03 16:27:38 +02:00
|
|
|
|
|
|
|
class PrintFunction(workflow.Function):
|
|
|
|
def __init__(self):
|
|
|
|
workflow.Function.__init__(self, 'printer', 'Print Stuff')
|
|
|
|
|
|
|
|
def run(self, data):
|
|
|
|
dim1, dim2 = data.get_dim_names()
|
|
|
|
print dim1, dim2
|
|
|
|
print "\t", "\t".join(data.get_identifiers(dim2))
|
|
|
|
for row in zip(data.get_identifiers(dim1), data.asarray().tolist()):
|
|
|
|
print "\t".join(map(str, row))
|
2006-04-27 14:15:30 +02:00
|
|
|
|
|
|
|
|
|
|
|
class TestDataFunction(workflow.Function):
    """Injects a small random dataset, useful for testing the workflow."""

    def __init__(self):
        workflow.Function.__init__(self, 'test_data', 'Generate Test Data')

    def run(self):
        """Return a 20x30 random Dataset together with a line plot of it."""
        logger.log('notice', 'Injecting foo test data')
        test_data = dataset.Dataset(randn(20, 30))
        return [test_data, plots.LinePlot(test_data)]
|
2006-04-27 14:15:30 +02:00
|
|
|
|
|
|
|
|
|
|
|
class DatasetLoadFunction(workflow.Function):
    """Loader for previously pickled Datasets."""

    def __init__(self):
        workflow.Function.__init__(self, 'load_data', 'Load Pickled Dataset')

    def run(self):
        """Ask the user for a pickled dataset file and unpickle it.

        Returns a one-element list containing the loaded object, or
        None (implicitly) if the user cancels the dialog.
        """
        # Bug fix: the title used to say "Select cel files..." — a
        # copy-paste from the CEL importer.
        chooser = gtk.FileChooserDialog(title="Select pickled dataset...",
                                        parent=None,
                                        action=gtk.FILE_CHOOSER_ACTION_OPEN,
                                        buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
                                                 gtk.STOCK_OPEN, gtk.RESPONSE_OK))
        pkl_filter = gtk.FileFilter()
        pkl_filter.set_name("Python pickled data files (*.pkl)")
        pkl_filter.add_pattern("*.[pP][kK][lL]")
        all_filter = gtk.FileFilter()
        all_filter.set_name("All Files (*.*)")
        all_filter.add_pattern("*")
        chooser.add_filter(pkl_filter)
        chooser.add_filter(all_filter)

        try:
            if chooser.run() == gtk.RESPONSE_OK:
                # Open in binary mode: protocol >= 1 pickles are binary
                # data, and text mode would corrupt them on Windows.
                # The handle is closed explicitly instead of leaked.
                # NOTE(review): cPickle.load can execute arbitrary code;
                # only load files from trusted sources.
                infile = open(chooser.get_filename(), "rb")
                try:
                    return [cPickle.load(infile)]
                finally:
                    infile.close()
        finally:
            # Always tear down the dialog, even on exceptions/cancel.
            chooser.destroy()
|
|
|
|
|
|
|
|
|
|
|
|
class DatasetSaveFunction(workflow.Function):
    """QND way to save data to file for later import to this program."""

    def __init__(self):
        workflow.Function.__init__(self, 'save_data', 'Save Pickled Dataset')

    def run(self, data):
        """Ask the user for a file name and pickle `data` to it.

        The suggested file name is the dataset's own name plus ".pkl".
        Does nothing if the user cancels the dialog.
        """
        chooser = gtk.FileChooserDialog(title="Save pickled data...", parent=None,
                                        action=gtk.FILE_CHOOSER_ACTION_SAVE,
                                        buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
                                                 gtk.STOCK_SAVE, gtk.RESPONSE_OK))
        pkl_filter = gtk.FileFilter()
        pkl_filter.set_name("Python pickled data files (*.pkl)")
        pkl_filter.add_pattern("*.[pP][kK][lL]")
        all_filter = gtk.FileFilter()
        all_filter.set_name("All Files (*.*)")
        all_filter.add_pattern("*")
        chooser.add_filter(pkl_filter)
        chooser.add_filter(all_filter)
        chooser.set_current_name(data.get_name() + ".pkl")

        try:
            if chooser.run() == gtk.RESPONSE_OK:
                # Bug fix: protocol=2 is a binary pickle format, so the
                # file must be opened in binary mode ("wb"); text mode
                # ("w") corrupts the stream on Windows. Close the handle
                # explicitly so the data is flushed to disk.
                outfile = open(chooser.get_filename(), "wb")
                try:
                    cPickle.dump(data, outfile, protocol=2)
                finally:
                    outfile.close()
                logger.log("notice", "Saved data to %r." % chooser.get_filename())
        finally:
            # Always tear down the dialog, even on exceptions/cancel.
            chooser.destroy()
|
|
|
|
|
|
|
|
|
|
|
|
class CelFileImportFunction(workflow.Function):
    """Loads Affymetrics .CEL-files into matrix."""

    def __init__(self):
        workflow.Function.__init__(self, 'cel_import', 'Import Affy')

    def run(self):
        """Let the user pick .CEL files, normalize them with RMA in R,
        and return the expression matrix as a Dataset plus a line plot.

        Requires R with the Bioconductor "affy" package available
        through rpy. Returns None (implicitly) if the user cancels or
        no data comes back from R.
        """
        # Imported lazily so the workflow loads even without rpy installed.
        import rpy

        chooser = gtk.FileChooserDialog(title="Select cel files...", parent=None,
                                        action=gtk.FILE_CHOOSER_ACTION_OPEN,
                                        buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
                                                 gtk.STOCK_OPEN, gtk.RESPONSE_OK))
        # Multiple CEL files make up one experiment.
        chooser.set_select_multiple(True)
        cel_filter = gtk.FileFilter()
        cel_filter.set_name("Cel Files (*.cel)")
        cel_filter.add_pattern("*.[cC][eE][lL]")
        all_filter = gtk.FileFilter()
        all_filter.set_name("All Files (*.*)")
        all_filter.add_pattern("*")
        chooser.add_filter(cel_filter)
        chooser.add_filter(all_filter)

        try:
            if chooser.run() == gtk.RESPONSE_OK:
                rpy.r.library("affy")

                # NO_CONVERSION keeps the big intermediate objects on the
                # R side instead of copying them into Python.
                silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
                # Build the R call ReadAffy(filenames=c("f1", "f2", ...)).
                # NOTE(review): filenames are spliced into R code verbatim;
                # a path containing a double quote would break the call.
                silent_eval('E <- ReadAffy(filenames=c("%s"))' % '", "'.join(chooser.get_filenames()))
                # RMA background-correction/normalization/summarization.
                silent_eval('E <- rma(E)')

                # Default conversion mode here, so `m` comes back as a
                # Python-side matrix of expression values.
                m = rpy.r('m <- E@exprs')

                vector_eval = rpy.with_mode(rpy.VECTOR_CONVERSION, rpy.r)
                rownames = vector_eval('rownames(m)')   # probe-set ids
                colnames = vector_eval('colnames(m)')   # source file names

                # We should be nice and clean up after ourselves
                rpy.r.rm(["E", "m"])

                # NOTE(review): truth-testing `m` assumes the converted
                # value supports bool(); if rpy returns a numpy-style
                # array this is ambiguous — confirm the conversion mode.
                if m:
                    data = dataset.Dataset(m, (('ids', rownames), ('filename', colnames)), name="Affymetrics Data")
                    plot = plots.LinePlot(data, "Gene profiles")
                    return [data, plot]
                else:
                    logger.log("notice", "No data loaded from importer.")
        finally:
            # Always tear down the dialog, even on exceptions/cancel.
            chooser.destroy()
|
|
|
|
|
|
|
|
|
2006-05-03 16:27:38 +02:00
|
|
|
class PhenotypeImportFunction(workflow.Function):
    """Imports a tab-separated phenotype table as a PhenotypeDataset."""

    def __init__(self):
        workflow.Function.__init__(self, 'import_phenotype', 'Import Phenotypes')

    def run(self):
        """Ask the user for a tab-separated phenotype file and parse it.

        Returns a one-element list with the resulting PhenotypeDataset,
        or None (implicitly) if the user cancels the dialog.
        """
        # Bug fix: the title used to say "Select cel files..." — a
        # copy-paste from the CEL importer.
        chooser = gtk.FileChooserDialog(title="Select phenotype file...",
                                        parent=None,
                                        action=gtk.FILE_CHOOSER_ACTION_OPEN,
                                        buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
                                                 gtk.STOCK_OPEN, gtk.RESPONSE_OK))
        all_filter = gtk.FileFilter()
        all_filter.set_name("Tab separated file (*.*)")
        all_filter.add_pattern("*")
        chooser.add_filter(all_filter)

        try:
            if chooser.run() == gtk.RESPONSE_OK:
                # Bug fix: close the file handle instead of leaking it
                # (was open(...).read() with no close).
                infile = open(chooser.get_filename())
                try:
                    text = infile.read()
                finally:
                    infile.close()
                return [PhenotypeDataset(text)]
        finally:
            # Always tear down the dialog, even on exceptions/cancel.
            chooser.destroy()
|
|
|
|
|
|
|
|
|
2006-04-27 14:15:30 +02:00
|
|
|
class PCAFunction(workflow.Function):
    """Generic PCA function."""

    def __init__(self, wf):
        workflow.Function.__init__(self, 'pca', 'PCA')
        # Back-reference to the owning workflow.
        self._workflow = wf

    def run(self, data):
        """Run a principal component analysis of `data` via R's prcomp.

        Returns the scores (T) and loadings (P) as Datasets along a
        freshly named "component" dimension, plus scatter plots of the
        first two components of each.
        """
        # Imported lazily so the workflow loads even without rpy installed.
        import rpy

        dim_2, dim_1 = data.get_dim_names()

        # NO_CONVERSION keeps the matrix and PCA result on the R side.
        silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
        rpy.with_mode(rpy.NO_CONVERSION, rpy.r.assign)("m", data.get_matrix())
        # PCA on the transposed matrix; result stored in R variable `t`
        # (this shadows R's transpose function t() afterwards, but t()
        # is not called again before the cleanup below).
        silent_eval("t = prcomp(t(m))")

        # we make a unique name for component dimension
        c = 0
        component_dim = prefix = "component"
        while component_dim in data.get_all_dims():
            component_dim = prefix + "_" + str(c)
            c += 1

        # Component identifiers "1".."k", where k = ncol(t$x).
        T_ids = map(str, range(1, rpy.r("dim(t$x)")[1]+1))
        # Scores: observations x components.
        T = dataset.Dataset(rpy.r("t$x"), [(dim_1, data.get_identifiers(dim_1)),
                                           (component_dim, T_ids)],
                            all_dims = data.get_all_dims(), name="T")
        # Loadings: variables x components.
        P = dataset.Dataset(rpy.r("t$rotation"), [(dim_2, data.get_identifiers(dim_2)),
                                                  (component_dim, T_ids)],
                            all_dims = data.get_all_dims(), name="P")
        # cleanup
        rpy.r.rm(["t", "m"])

        # Plot components 1 vs 2 for both loadings and scores.
        loading_plot = plots.ScatterPlot(P, dim_2, component_dim, '1', '2',
                                         "Loadings")
        score_plot = plots.ScatterPlot(T, dim_1,component_dim, '1', '2',
                                       "Scores")

        return [T, P, loading_plot, score_plot]
|
2006-04-28 13:44:55 +02:00
|
|
|
|
2006-05-03 16:04:28 +02:00
|
|
|
|
|
|
|
class PhenotypeDataset(dataset.Dataset):
    """Dataset built from a tab-separated phenotype table.

    The first row of the table holds the factor names, the first column
    holds CEL file names. Numeric factor columns are kept as-is; each
    non-numeric (categorical) column is expanded into one 0/1 indicator
    column per distinct level, labelled "<factor>-<level>".
    """

    def __init__(self, string):
        table = [line.split("\t") for line in string.splitlines()]
        self._table = table
        columns = zip(*table[1:])
        cel_names = columns[0]
        factor_names = table[0][1:]

        phenotypes = []          # column labels, in output order
        values_by_label = {}     # label -> column of values
        self._categories = set() # every categorical level seen

        for factor, column in zip(factor_names, columns[1:]):
            try:
                numeric = map(float, column)
            except ValueError:
                # Categorical column: collect the row positions of each
                # level, preserving first-seen order of the levels.
                levels = []
                positions = {}
                for row_index, value in enumerate(column):
                    if value not in positions:
                        levels.append(value)
                        positions[value] = []
                    positions[value].append(row_index)

                # One indicator column per level.
                for level in levels:
                    self._categories.add(level)
                    indicator = zeros(len(column))
                    for row_index in positions[level]:
                        indicator[row_index] = 1
                    label = "%s-%s" % (factor, level)
                    phenotypes.append(label)
                    values_by_label[label] = indicator
            else:
                values_by_label[factor] = numeric
                phenotypes.append(factor)

        matrix_data = [values_by_label[label] for label in phenotypes]
        if matrix_data:
            # Columns were collected row-wise; transpose to CEL x phenotype.
            a = transpose(array(matrix_data))
        else:
            a = None

        dataset.Dataset.__init__(self, a, identifiers=[('CEL', cel_names),
                                                       ('phenotypes', phenotypes)],
                                 shape=(len(cel_names),len(phenotypes)), name="Phenotype Data")

    def get_phenotype_table(self):
        """Get string based table of phenotypes as read from file."""
        return self._table

    def get_categories(self):
        """Get categories of factors.

        If factor 'sick' had possibilites Y/N, and 'sex' M/F, the
        categories would be Y, N, M and F.
        """
        return self._categories
|