Updated import statements, and removed the workflows pca_workflow and
affy_workflow.
This commit is contained in:
parent
610812f265
commit
375d45e0cc
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
from getopt import getopt
|
from getopt import getopt
|
||||||
import sys
|
import sys
|
||||||
from system import fluents, project, workflow
|
from fluents import fluents, project, workflow
|
||||||
import workflows
|
import workflows
|
||||||
|
|
||||||
PROGRAM_NAME = 'fluents'
|
PROGRAM_NAME = 'fluents'
|
||||||
|
|
|
@ -13,12 +13,12 @@ import gnome
|
||||||
import gnome.ui
|
import gnome.ui
|
||||||
import scipy
|
import scipy
|
||||||
import pango
|
import pango
|
||||||
from system import project, workflow, dataset, logger, plots, navigator, dialogs, selections
|
import project, workflow, dataset, logger, plots, navigator, dialogs, selections
|
||||||
|
|
||||||
|
|
||||||
PROGRAM_NAME = 'fluents'
|
PROGRAM_NAME = 'fluents'
|
||||||
VERSION = '0.1.0'
|
VERSION = '0.1.0'
|
||||||
DATADIR = os.path.dirname(sys.modules['system'].__file__)
|
DATADIR = os.path.dirname(sys.modules['fluents'].__file__)
|
||||||
ICONDIR = os.path.join(DATADIR,"..","icons")
|
ICONDIR = os.path.join(DATADIR,"..","icons")
|
||||||
GLADEFILENAME = os.path.join(DATADIR, 'fluents.glade')
|
GLADEFILENAME = os.path.join(DATADIR, 'fluents.glade')
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@ import gtk
|
||||||
import gobject
|
import gobject
|
||||||
import plots
|
import plots
|
||||||
import time
|
import time
|
||||||
from system import dataset, logger, plots, project, workflow
|
import dataset, logger, plots, project, workflow
|
||||||
|
|
||||||
class NavigatorView (gtk.TreeView):
|
class NavigatorView (gtk.TreeView):
|
||||||
def __init__(self, project, app):
|
def __init__(self, project, app):
|
||||||
|
|
|
@ -4,7 +4,7 @@ import pygtk
|
||||||
import gobject
|
import gobject
|
||||||
import gtk
|
import gtk
|
||||||
import fluents
|
import fluents
|
||||||
from system import logger
|
import logger
|
||||||
import matplotlib
|
import matplotlib
|
||||||
from matplotlib.backends.backend_gtkagg import FigureCanvasGTKAgg as FigureCanvas
|
from matplotlib.backends.backend_gtkagg import FigureCanvasGTKAgg as FigureCanvas
|
||||||
from matplotlib.backend_bases import NavigationToolbar2,cursors
|
from matplotlib.backend_bases import NavigationToolbar2,cursors
|
||||||
|
|
|
@ -4,7 +4,7 @@ import gobject
|
||||||
import gtk
|
import gtk
|
||||||
import fluents
|
import fluents
|
||||||
import logger
|
import logger
|
||||||
from system import dataset, plots
|
import dataset, plots
|
||||||
|
|
||||||
class Project:
|
class Project:
|
||||||
def __init__(self,name="Testing"):
|
def __init__(self,name="Testing"):
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
from system import logger, dataset
|
import logger, dataset
|
||||||
|
|
||||||
import pygtk
|
import pygtk
|
||||||
import gtk
|
import gtk
|
||||||
|
|
|
@ -2,7 +2,7 @@ import gtk
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
import inspect
|
import inspect
|
||||||
from system import logger
|
import logger
|
||||||
|
|
||||||
def _workflow_classes(modname):
|
def _workflow_classes(modname):
|
||||||
"""Returns a list of all subclasses of Workflow in a given module"""
|
"""Returns a list of all subclasses of Workflow in a given module"""
|
||||||
|
|
|
@ -1,412 +0,0 @@
|
||||||
import gtk
|
|
||||||
import os.path
|
|
||||||
from system import dataset, logger, plots, workflow, dialogs
|
|
||||||
from scipy import randn, array, transpose, zeros
|
|
||||||
import cPickle
|
|
||||||
|
|
||||||
|
|
||||||
class AffyWorkflow (workflow.Workflow):
|
|
||||||
|
|
||||||
name = 'Affy Workflow'
|
|
||||||
ident = 'affy'
|
|
||||||
description = 'Affymetrics Workflow. Analysis of Affy-data.'
|
|
||||||
def __init__(self, app):
|
|
||||||
workflow.Workflow.__init__(self, app)
|
|
||||||
|
|
||||||
load = workflow.Stage('load', 'Load Data')
|
|
||||||
load.add_function(CelFileImportFunction())
|
|
||||||
load.add_function(PhenotypeImportFunction())
|
|
||||||
load.add_function(TestDataFunction())
|
|
||||||
load.add_function(DatasetLoadFunction())
|
|
||||||
self.add_stage(load)
|
|
||||||
|
|
||||||
significance = workflow.Stage('significance', 'Significance analysis')
|
|
||||||
significance.add_function(LimmaFunction())
|
|
||||||
self.add_stage(significance)
|
|
||||||
|
|
||||||
explore = workflow.Stage('explore', 'Explorative analysis')
|
|
||||||
explore.add_function(PCAFunction(self))
|
|
||||||
explore.add_function(PrintFunction())
|
|
||||||
self.add_stage(explore)
|
|
||||||
|
|
||||||
save = workflow.Stage('save', 'Save Data')
|
|
||||||
save.add_function(DatasetSaveFunction())
|
|
||||||
self.add_stage(save)
|
|
||||||
|
|
||||||
class PrintFunction(workflow.Function):
|
|
||||||
def __init__(self):
|
|
||||||
workflow.Function.__init__(self, 'printer', 'Print Stuff')
|
|
||||||
|
|
||||||
def run(self, data):
|
|
||||||
dim1, dim2 = data.get_dim_names()
|
|
||||||
print dim1, dim2
|
|
||||||
print "\t", "\t".join(data.get_identifiers(dim2))
|
|
||||||
for row in zip(data.get_identifiers(dim1), data.asarray().tolist()):
|
|
||||||
print "\t".join(map(str, row))
|
|
||||||
|
|
||||||
|
|
||||||
class TestDataFunction(workflow.Function):
|
|
||||||
def __init__(self):
|
|
||||||
workflow.Function.__init__(self, 'test_data', 'Generate Test Data')
|
|
||||||
|
|
||||||
def run(self):
|
|
||||||
logger.log('notice', 'Injecting foo test data')
|
|
||||||
x = randn(20,30)
|
|
||||||
X = dataset.Dataset(x)
|
|
||||||
return [X, plots.LinePlot(X)]
|
|
||||||
|
|
||||||
|
|
||||||
class DatasetLoadFunction(workflow.Function):
|
|
||||||
"""Loader for previously pickled Datasets."""
|
|
||||||
def __init__(self):
|
|
||||||
workflow.Function.__init__(self, 'load_data', 'Load Pickled Dataset')
|
|
||||||
|
|
||||||
def run(self):
|
|
||||||
chooser = gtk.FileChooserDialog(title="Select cel files...", parent=None,
|
|
||||||
action=gtk.FILE_CHOOSER_ACTION_OPEN,
|
|
||||||
buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
|
|
||||||
gtk.STOCK_OPEN, gtk.RESPONSE_OK))
|
|
||||||
pkl_filter = gtk.FileFilter()
|
|
||||||
pkl_filter.set_name("Python pickled data files (*.pkl)")
|
|
||||||
pkl_filter.add_pattern("*.[pP][kK][lL]")
|
|
||||||
all_filter = gtk.FileFilter()
|
|
||||||
all_filter.set_name("All Files (*.*)")
|
|
||||||
all_filter.add_pattern("*")
|
|
||||||
chooser.add_filter(pkl_filter)
|
|
||||||
chooser.add_filter(all_filter)
|
|
||||||
|
|
||||||
try:
|
|
||||||
if chooser.run() == gtk.RESPONSE_OK:
|
|
||||||
return [cPickle.load(open(chooser.get_filename()))]
|
|
||||||
finally:
|
|
||||||
chooser.destroy()
|
|
||||||
|
|
||||||
|
|
||||||
class DatasetSaveFunction(workflow.Function):
|
|
||||||
"""QND way to save data to file for later import to this program."""
|
|
||||||
def __init__(self):
|
|
||||||
workflow.Function.__init__(self, 'save_data', 'Save Pickled Dataset')
|
|
||||||
|
|
||||||
def run(self, data):
|
|
||||||
chooser = gtk.FileChooserDialog(title="Save pickled data...", parent=None,
|
|
||||||
action=gtk.FILE_CHOOSER_ACTION_SAVE,
|
|
||||||
buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
|
|
||||||
gtk.STOCK_SAVE, gtk.RESPONSE_OK))
|
|
||||||
pkl_filter = gtk.FileFilter()
|
|
||||||
pkl_filter.set_name("Python pickled data files (*.pkl)")
|
|
||||||
pkl_filter.add_pattern("*.[pP][kK][lL]")
|
|
||||||
all_filter = gtk.FileFilter()
|
|
||||||
all_filter.set_name("All Files (*.*)")
|
|
||||||
all_filter.add_pattern("*")
|
|
||||||
chooser.add_filter(pkl_filter)
|
|
||||||
chooser.add_filter(all_filter)
|
|
||||||
chooser.set_current_name(data.get_name() + ".pkl")
|
|
||||||
|
|
||||||
try:
|
|
||||||
if chooser.run() == gtk.RESPONSE_OK:
|
|
||||||
cPickle.dump(data, open(chooser.get_filename(), "w"), protocol=2)
|
|
||||||
logger.log("notice", "Saved data to %r." % chooser.get_filename())
|
|
||||||
finally:
|
|
||||||
chooser.destroy()
|
|
||||||
|
|
||||||
|
|
||||||
class LimmaFunction(workflow.Function):
|
|
||||||
def __init__(self):
|
|
||||||
workflow.Function.__init__(self, 'limma', 'Limma')
|
|
||||||
|
|
||||||
def run(self, affy, data):
|
|
||||||
response = dialogs.get_text('Enter contrasts...', """\
|
|
||||||
Enter comma-separated list of contrasts.
|
|
||||||
Available categories: %s
|
|
||||||
|
|
||||||
Example: Y-N, M-F""" % ", ".join(data.get_categories()))
|
|
||||||
|
|
||||||
logger.log("notice", "contrasts selected: %s" % response)
|
|
||||||
|
|
||||||
categories = []
|
|
||||||
[categories.extend(s.split("-")) for s in response.split(",")]
|
|
||||||
categories = [s.strip() for s in categories]
|
|
||||||
|
|
||||||
factors = data.get_factors(categories)
|
|
||||||
if not factors:
|
|
||||||
logger.log("warning", "nothing to do, no factors")
|
|
||||||
|
|
||||||
table = data.get_phenotype_table([os.path.splitext(f)[0] for f in affy.get_identifiers('filename')])
|
|
||||||
cn = table[0]
|
|
||||||
entries = zip(*table[1:])
|
|
||||||
rn = entries[0]
|
|
||||||
|
|
||||||
import rpy
|
|
||||||
silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
|
|
||||||
rpy.r.library("limma")
|
|
||||||
silent_eval("a <- matrix('kalle', nrow=%d, ncol=%d)" % (len(rn), len(cn)))
|
|
||||||
for i, row in enumerate(entries):
|
|
||||||
for j, entry in enumerate(row):
|
|
||||||
silent_eval("a[%d, %d] <- '%s'" % (j+1, i+1, entry))
|
|
||||||
rpy.r.assign("rn", rn)
|
|
||||||
rpy.r.assign("cn", cn)
|
|
||||||
silent_eval("rownames(a) <- rn")
|
|
||||||
silent_eval("colnames(a) <- cn")
|
|
||||||
|
|
||||||
unique_categories = list(set(categories))
|
|
||||||
|
|
||||||
# compose fancy list of factors for design matrix
|
|
||||||
silent_eval("design <- matrix(0, nrow=%d, ncol=%d)" % (len(rn), len(unique_categories)))
|
|
||||||
for i, category in enumerate(unique_categories):
|
|
||||||
for j, value in enumerate(data.get_category_variable(category)):
|
|
||||||
silent_eval("design[%d, %d] <- %d" % (j+1, i+1, value))
|
|
||||||
|
|
||||||
rpy.r.assign("colnames.design", unique_categories)
|
|
||||||
silent_eval("colnames(design) <- colnames.design")
|
|
||||||
|
|
||||||
rpy.r.assign("expr", affy.asarray())
|
|
||||||
silent_eval("fit <- lmFit(expr, design)")
|
|
||||||
silent_eval("contrast.matrix <- makeContrasts(%s, levels=design)" % response)
|
|
||||||
silent_eval("fit2 <- contrasts.fit(fit, contrast.matrix)")
|
|
||||||
silent_eval("fit2 <- eBayes(fit2)")
|
|
||||||
coeff = rpy.r("fit2$coefficients")
|
|
||||||
amean = rpy.r("fit2$Amean")
|
|
||||||
padj = rpy.r("p.adjust(fit2$p.value, method='fdr')")
|
|
||||||
|
|
||||||
dim_1, dim_2 = affy.get_dim_names()
|
|
||||||
|
|
||||||
|
|
||||||
coeff_data = dataset.Dataset(coeff, [(dim_1, affy.get_identifiers(dim_1)),
|
|
||||||
("contrast", [response])],
|
|
||||||
name="Coefficients")
|
|
||||||
amean_data = dataset.Dataset(array(amean), [("average", ["average"]),
|
|
||||||
(dim_1, affy.get_identifiers(dim_1))],
|
|
||||||
name="Average Intensity")
|
|
||||||
padj_data = dataset.Dataset(padj, [(dim_1, affy.get_identifiers(dim_1)),
|
|
||||||
("contrast", [response])],
|
|
||||||
name="Adjusted P-value")
|
|
||||||
|
|
||||||
vulcano_plot = plots.ScatterPlot(coeff_data, padj_data, dim_1,
|
|
||||||
'contrast', response, response,
|
|
||||||
name="Vulcano plot")
|
|
||||||
|
|
||||||
# We should be nice and clean up after ourselves
|
|
||||||
rpy.r("rm(list=ls())")
|
|
||||||
|
|
||||||
return [coeff_data, amean_data, padj_data, vulcano_plot]
|
|
||||||
|
|
||||||
|
|
||||||
class CelFileImportFunction(workflow.Function):
|
|
||||||
"""Loads Affymetrics .CEL-files into matrix."""
|
|
||||||
def __init__(self):
|
|
||||||
workflow.Function.__init__(self, 'cel_import', 'Import Affy')
|
|
||||||
|
|
||||||
def run(self):
|
|
||||||
import rpy
|
|
||||||
chooser = gtk.FileChooserDialog(title="Select cel files...", parent=None,
|
|
||||||
action=gtk.FILE_CHOOSER_ACTION_OPEN,
|
|
||||||
buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
|
|
||||||
gtk.STOCK_OPEN, gtk.RESPONSE_OK))
|
|
||||||
chooser.set_select_multiple(True)
|
|
||||||
cel_filter = gtk.FileFilter()
|
|
||||||
cel_filter.set_name("Cel Files (*.cel)")
|
|
||||||
cel_filter.add_pattern("*.[cC][eE][lL]")
|
|
||||||
all_filter = gtk.FileFilter()
|
|
||||||
all_filter.set_name("All Files (*.*)")
|
|
||||||
all_filter.add_pattern("*")
|
|
||||||
chooser.add_filter(cel_filter)
|
|
||||||
chooser.add_filter(all_filter)
|
|
||||||
|
|
||||||
try:
|
|
||||||
if chooser.run() == gtk.RESPONSE_OK:
|
|
||||||
rpy.r.library("affy")
|
|
||||||
|
|
||||||
silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
|
|
||||||
silent_eval('E <- ReadAffy(filenames=c("%s"))' % '", "'.join(chooser.get_filenames()))
|
|
||||||
silent_eval('E <- rma(E)')
|
|
||||||
m = rpy.r('m <- E@exprs')
|
|
||||||
|
|
||||||
vector_eval = rpy.with_mode(rpy.VECTOR_CONVERSION, rpy.r)
|
|
||||||
rownames = vector_eval('rownames(m)')
|
|
||||||
colnames = vector_eval('colnames(m)')
|
|
||||||
|
|
||||||
# We should be nice and clean up after ourselves
|
|
||||||
rpy.r.rm(["E", "m"])
|
|
||||||
|
|
||||||
if m:
|
|
||||||
data = dataset.Dataset(m, (('ids', rownames), ('filename', colnames)), name="Affymetrics Data")
|
|
||||||
plot = plots.LinePlot(data, "Gene profiles")
|
|
||||||
return [data, plot]
|
|
||||||
else:
|
|
||||||
logger.log("notice", "No data loaded from importer.")
|
|
||||||
finally:
|
|
||||||
chooser.destroy()
|
|
||||||
|
|
||||||
|
|
||||||
class PhenotypeImportFunction(workflow.Function):
|
|
||||||
def __init__(self):
|
|
||||||
workflow.Function.__init__(self, 'import_phenotype', 'Import Phenotypes')
|
|
||||||
|
|
||||||
def run(self):
|
|
||||||
chooser = gtk.FileChooserDialog(title="Select cel files...", parent=None,
|
|
||||||
action=gtk.FILE_CHOOSER_ACTION_OPEN,
|
|
||||||
buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
|
|
||||||
gtk.STOCK_OPEN, gtk.RESPONSE_OK))
|
|
||||||
all_filter = gtk.FileFilter()
|
|
||||||
all_filter.set_name("Tab separated file (*.*)")
|
|
||||||
all_filter.add_pattern("*")
|
|
||||||
chooser.add_filter(all_filter)
|
|
||||||
|
|
||||||
try:
|
|
||||||
if chooser.run() == gtk.RESPONSE_OK:
|
|
||||||
text = open(chooser.get_filename()).read()
|
|
||||||
data = PhenotypeDataset(text)
|
|
||||||
return [data]
|
|
||||||
|
|
||||||
finally:
|
|
||||||
chooser.destroy()
|
|
||||||
|
|
||||||
|
|
||||||
class PCAFunction(workflow.Function):
|
|
||||||
"""Generic PCA function."""
|
|
||||||
def __init__(self, wf):
|
|
||||||
workflow.Function.__init__(self, 'pca', 'PCA')
|
|
||||||
self._workflow = wf
|
|
||||||
|
|
||||||
def run(self,data):
|
|
||||||
import rpy
|
|
||||||
|
|
||||||
dim_2, dim_1 = data.get_dim_names()
|
|
||||||
|
|
||||||
silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
|
|
||||||
rpy.with_mode(rpy.NO_CONVERSION, rpy.r.assign)("m", data.asarray())
|
|
||||||
silent_eval("t = prcomp(t(m))")
|
|
||||||
|
|
||||||
# we make a unique name for component dimension
|
|
||||||
c = 0
|
|
||||||
component_dim = prefix = "component"
|
|
||||||
while component_dim in data.get_all_dims():
|
|
||||||
component_dim = prefix + "_" + str(c)
|
|
||||||
c += 1
|
|
||||||
|
|
||||||
T_ids = map(str, range(1, rpy.r("dim(t$x)")[1]+1))
|
|
||||||
T = dataset.Dataset(rpy.r("t$x"), [(dim_1, data.get_identifiers(dim_1)),
|
|
||||||
(component_dim, T_ids)],
|
|
||||||
all_dims = data.get_all_dims(), name="T")
|
|
||||||
P = dataset.Dataset(rpy.r("t$rotation"), [(dim_2, data.get_identifiers(dim_2)),
|
|
||||||
(component_dim, T_ids)],
|
|
||||||
all_dims = data.get_all_dims(), name="P")
|
|
||||||
# cleanup
|
|
||||||
rpy.r.rm(["t", "m"])
|
|
||||||
|
|
||||||
loading_plot = plots.ScatterPlot(P, P, dim_2, component_dim, '1', '2',
|
|
||||||
"Loadings")
|
|
||||||
score_plot = plots.ScatterPlot(T, T, dim_1,component_dim, '1', '2',
|
|
||||||
"Scores")
|
|
||||||
|
|
||||||
return [T, P, loading_plot, score_plot]
|
|
||||||
|
|
||||||
|
|
||||||
class PhenotypeDataset(dataset.Dataset):
|
|
||||||
def __init__(self, string):
|
|
||||||
self._table = rows = [line.split("\t") for line in string.splitlines()]
|
|
||||||
columns = zip(*rows[1:])
|
|
||||||
cel_names = columns[0]
|
|
||||||
col_names = rows[0][1:]
|
|
||||||
phenotypes = []
|
|
||||||
categories = {}
|
|
||||||
self._categories = {}
|
|
||||||
|
|
||||||
for col_name, column in zip(col_names, columns[1:]):
|
|
||||||
try:
|
|
||||||
categories[col_name] = map(float, column)
|
|
||||||
phenotypes.append(col_name)
|
|
||||||
except ValueError:
|
|
||||||
# category-data
|
|
||||||
keys = []
|
|
||||||
entries = {}
|
|
||||||
for i, entry in enumerate(column):
|
|
||||||
if entry not in entries:
|
|
||||||
keys.append(entry)
|
|
||||||
entries[entry] = []
|
|
||||||
|
|
||||||
entries[entry].append(i)
|
|
||||||
|
|
||||||
for key in keys:
|
|
||||||
self._categories[key] = col_name
|
|
||||||
z = zeros(len(column))
|
|
||||||
for i in entries[key]:
|
|
||||||
z[i] = 1
|
|
||||||
key = "%s-%s" % (col_name, key)
|
|
||||||
phenotypes.append(key)
|
|
||||||
categories[key] = z
|
|
||||||
|
|
||||||
matrix_data = []
|
|
||||||
for key in phenotypes:
|
|
||||||
matrix_data.append(categories[key])
|
|
||||||
|
|
||||||
if matrix_data:
|
|
||||||
a = transpose(array(matrix_data))
|
|
||||||
else:
|
|
||||||
a = None
|
|
||||||
|
|
||||||
dataset.Dataset.__init__(self, a, identifiers=[('CEL', cel_names),
|
|
||||||
('phenotypes', phenotypes)],
|
|
||||||
shape=(len(cel_names),len(phenotypes)), name="Phenotype Data")
|
|
||||||
|
|
||||||
def sort_cels(self, cel_names):
|
|
||||||
self._dims = []
|
|
||||||
|
|
||||||
cels = {}
|
|
||||||
for row in self._table[1:]:
|
|
||||||
cels[row[0]] = row[1:]
|
|
||||||
|
|
||||||
new_table = [self._table[0]]
|
|
||||||
for name in cel_names:
|
|
||||||
new_table.append([name] + cels[name])
|
|
||||||
|
|
||||||
self._table = new_table
|
|
||||||
self._set_identifiers([('CEL', cel_names), ('phenotypes', self.get_identifiers('phenotypes'))], self._all_dims)
|
|
||||||
|
|
||||||
def get_phenotype_table(self, cel_order=None):
|
|
||||||
"""Get string based table of phenotypes as read from file."""
|
|
||||||
if not cel_order:
|
|
||||||
return self._table
|
|
||||||
else:
|
|
||||||
cels = {}
|
|
||||||
for row in self._table[1:]:
|
|
||||||
cels[row[0]] = row[1:]
|
|
||||||
|
|
||||||
new_table = [self._table[0]]
|
|
||||||
for name in cel_order:
|
|
||||||
new_table.append([name] + cels[name])
|
|
||||||
|
|
||||||
return new_table
|
|
||||||
|
|
||||||
def get_categories(self):
|
|
||||||
"""Get categories of factors.
|
|
||||||
|
|
||||||
If factor 'sick' had possibilites Y/N, and 'sex' M/F, the
|
|
||||||
categories would be Y, N, M and F.
|
|
||||||
"""
|
|
||||||
return self._categories.keys()
|
|
||||||
|
|
||||||
def get_factors(self, categories):
|
|
||||||
factors = set()
|
|
||||||
for c in categories:
|
|
||||||
factors.add(self._categories[c])
|
|
||||||
|
|
||||||
return factors
|
|
||||||
|
|
||||||
def get_category_variable(self, category):
|
|
||||||
# abit brute-force, but does the job until optimization is
|
|
||||||
# necessary
|
|
||||||
factor = self._categories[category]
|
|
||||||
variable = []
|
|
||||||
for column in zip(*self.get_phenotype_table()):
|
|
||||||
if column[0] == factor:
|
|
||||||
for entry in column[1:]:
|
|
||||||
if entry == category:
|
|
||||||
variable.append(1)
|
|
||||||
else:
|
|
||||||
variable.append(0)
|
|
||||||
|
|
||||||
return variable
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,262 +0,0 @@
|
||||||
import gtk
|
|
||||||
import system.workflow as wf
|
|
||||||
from system.workflow import Stage, Function
|
|
||||||
import pickle
|
|
||||||
from scipy import log2,transpose,dot,divide,shape,mean,resize,zeros
|
|
||||||
from scipy.linalg import svd,inv,norm,get_blas_funcs,eig
|
|
||||||
from system import dataset, logger, plots
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class PCAWorkflow(wf.Workflow):
|
|
||||||
name = 'PCA Workflow'
|
|
||||||
ident = 'pca'
|
|
||||||
description = 'PCA workflow. Uses real microarray data from a study of diabetes (Mootha et al.).'
|
|
||||||
|
|
||||||
def __init__(self, app):
|
|
||||||
wf.Workflow.__init__(self, app)
|
|
||||||
#self.add_project(app.project)
|
|
||||||
#logger.log('notice','Current project added to: %s' %self.name)
|
|
||||||
|
|
||||||
load = Stage('load', 'Load Data')
|
|
||||||
load.add_function(LoadMoothaData())
|
|
||||||
self.add_stage(load)
|
|
||||||
|
|
||||||
preproc = Stage('preprocess', 'Preprocessing')
|
|
||||||
preproc.add_function(Log2Function())
|
|
||||||
self.add_stage(preproc)
|
|
||||||
|
|
||||||
annot = Stage('annot', 'Affy annotations')
|
|
||||||
annot.add_function(LoadAnnotationsFunction())
|
|
||||||
self.add_stage(annot)
|
|
||||||
|
|
||||||
model = Stage('model', 'Model')
|
|
||||||
model.add_function(PCAFunction(self))
|
|
||||||
self.add_stage(model)
|
|
||||||
|
|
||||||
logger.log('debug', '\tPCA\'s workflow is now active')
|
|
||||||
|
|
||||||
class LoadAnnotationsFunction(Function):
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
Function.__init__(self, 'load', 'Load Annotations')
|
|
||||||
|
|
||||||
def load_affy_file(self, filename):
|
|
||||||
f = open(filename)
|
|
||||||
logger.log('notice', 'Loading annotation file: %s' % filename)
|
|
||||||
self.file = f
|
|
||||||
|
|
||||||
def on_response(self, dialog, response):
|
|
||||||
if response == gtk.RESPONSE_OK:
|
|
||||||
logger.log('notice', 'Reading file: %s' % dialog.get_filename())
|
|
||||||
self.load_affy_file(dialog.get_filename())
|
|
||||||
|
|
||||||
def run(self,data):
|
|
||||||
btns = ('Open', gtk.RESPONSE_OK, \
|
|
||||||
'Cancel', gtk.RESPONSE_CANCEL)
|
|
||||||
dialog = gtk.FileChooserDialog('Open Affy Annotation File',
|
|
||||||
buttons=btns)
|
|
||||||
dialog.connect('response', self.on_response)
|
|
||||||
dialog.run()
|
|
||||||
dialog.destroy()
|
|
||||||
|
|
||||||
### Reading and parsing here
|
|
||||||
annot = read_affy_annot(self.file)
|
|
||||||
i_want = 'Pathway'
|
|
||||||
nothing = '---'
|
|
||||||
ids_in_data = set(data.names('genes')) #assuming we have genes
|
|
||||||
sanity_check = set(annot.keys())
|
|
||||||
if not ids_in_data.intersection(sanity_check) == ids_in_data:
|
|
||||||
logger.log('debug','Some identifers in data does not exist in affy file!')
|
|
||||||
for affy_id,description in annot:
|
|
||||||
if affy_id in ids_in_data:
|
|
||||||
pathways = description[i_want]
|
|
||||||
if not pathways[0][0]=='--':
|
|
||||||
pass
|
|
||||||
D = []
|
|
||||||
return [D]
|
|
||||||
|
|
||||||
class PCAFunction(Function):
|
|
||||||
|
|
||||||
def __init__(self,workflow):
|
|
||||||
Function.__init__(self, 'pca', 'PCA')
|
|
||||||
self.output = None
|
|
||||||
self.workflow = workflow
|
|
||||||
|
|
||||||
def run(self,data):
|
|
||||||
logger.log('debug', 'datatype: %s' % type(data))
|
|
||||||
if not isinstance(data,dataset.Dataset):
|
|
||||||
return None
|
|
||||||
#logger.log('debug', 'dimensions: %s' % data.dims)
|
|
||||||
|
|
||||||
## calculations
|
|
||||||
T,P,E,tsq = self.pca(data._array,5,tsq_loads=False)
|
|
||||||
comp_def = ('comp',('1','2','3','4','5'))
|
|
||||||
singel_def = ('1',('s'))
|
|
||||||
|
|
||||||
# pull out input identifiers:
|
|
||||||
row_ids = data.get_identifiers('genes')
|
|
||||||
col_ids = data.get_identifiers('samples')
|
|
||||||
|
|
||||||
T = dataset.Dataset(T,[('samples',col_ids) ,comp_def],name='T2')
|
|
||||||
P = dataset.Dataset(P,[('genes',row_ids),comp_def],name='P')
|
|
||||||
E = dataset.Dataset(E,[('samples',col_ids),('genes',row_ids)],name='E')
|
|
||||||
#tsq = dataset.Dataset(tsq,[singel_def,data_ids[1])
|
|
||||||
|
|
||||||
## plots
|
|
||||||
loading_plot1 = plots.ScatterPlot(P,P,'genes','comp','1','2')
|
|
||||||
loading_plot2 = plots.ScatterPlot(P,P,'genes','comp','3','4')
|
|
||||||
score_plot = plots.ScatterPlot(T,T,'samples','comp','1','2')
|
|
||||||
|
|
||||||
return [T,P,E,loading_plot1,loading_plot2,score_plot]
|
|
||||||
|
|
||||||
def pca(self,X,a_opt,cent=True,scale='loads',tsq_loads=False):
|
|
||||||
"""Principal component analysis
|
|
||||||
|
|
||||||
input:
|
|
||||||
Xc -- matrix, data
|
|
||||||
a_opt -- scalar, max number of comp. to calculate
|
|
||||||
cent -- bool, centering [True]
|
|
||||||
crit -- string, pc criteria ['exp_var',['ief','rpv','average']]
|
|
||||||
scale -- string, scaling ['loads',['scores']]
|
|
||||||
tsq_loads -- bool, calculate t-squared? [True]
|
|
||||||
reg -- float, covariance regularizer for tsq calculations [0.2]
|
|
||||||
output:
|
|
||||||
T,P,E,r
|
|
||||||
|
|
||||||
"""
|
|
||||||
nSamples,nVarX = shape(X)
|
|
||||||
if cent:
|
|
||||||
Xc = self.mat_center(X)
|
|
||||||
else:
|
|
||||||
Xc = X
|
|
||||||
u,s,vh = self.esvd(Xc)
|
|
||||||
if scale=='scores':
|
|
||||||
T = u*s
|
|
||||||
T = T[:,:a_opt]
|
|
||||||
P = transpose(vh)
|
|
||||||
P = P[:,:a_opt]
|
|
||||||
elif scale=='loads':
|
|
||||||
T = u
|
|
||||||
T = T[:,:a_opt]
|
|
||||||
P = transpose(vh)*s
|
|
||||||
P = P[:,:a_opt]
|
|
||||||
|
|
||||||
E = Xc - dot(T,transpose(P))
|
|
||||||
varEach = s**2
|
|
||||||
totVar = sum(varEach)
|
|
||||||
r = divide(varEach,totVar)*100
|
|
||||||
return T,P,E,r
|
|
||||||
|
|
||||||
def mat_center(self,X,axis=0,ret_mn=False):
|
|
||||||
"""Mean center matrix along axis.
|
|
||||||
|
|
||||||
input:
|
|
||||||
X -- matrix, data
|
|
||||||
axis -- dim,
|
|
||||||
ret_mn -- bool, return mean
|
|
||||||
output:
|
|
||||||
Xc, [mnX]
|
|
||||||
|
|
||||||
NB: axis = 1 is column-centering, axis=0=row-centering
|
|
||||||
default is row centering (axis=0)
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
rows,cols = shape(X)
|
|
||||||
except ValueError:
|
|
||||||
print "The X data needs to be two-dimensional"
|
|
||||||
|
|
||||||
if axis==0:
|
|
||||||
mnX = mean(X,axis)
|
|
||||||
Xs = X - resize(mnX,(rows,cols))
|
|
||||||
|
|
||||||
elif axis==1:
|
|
||||||
mnX = mean(X,axis)
|
|
||||||
Xs = transpose(transpose(X) - resize(mnX,(cols,rows)))
|
|
||||||
if ret_mn:
|
|
||||||
return Xs,mnX
|
|
||||||
else:
|
|
||||||
return Xs
|
|
||||||
|
|
||||||
def esvd(self,data,economy=1):
|
|
||||||
"""SVD with the option of economy sized calculation
|
|
||||||
Calculate subspaces of X'X or XX' depending on the shape
|
|
||||||
of the matrix.
|
|
||||||
Good for extreme fat or thin matrices.
|
|
||||||
|
|
||||||
"""
|
|
||||||
mm = self.mm
|
|
||||||
m,n = shape(data)
|
|
||||||
if m>=n:
|
|
||||||
u,s,v = svd(mm(data,data,trans_a=1))
|
|
||||||
u = mm(data,v,trans_b=1)
|
|
||||||
for i in xrange(n):
|
|
||||||
s[i] = norm(u[:,i])
|
|
||||||
u[:,i] = u[:,i]/s[i]
|
|
||||||
else:
|
|
||||||
u,s,v = svd(mm(data,data,trans_b=1))
|
|
||||||
v = mm(u,data,trans_a=1)
|
|
||||||
for i in xrange(m):
|
|
||||||
s[i] = norm(v[i,:])
|
|
||||||
v[i,:] = v[i,:]/s[i]
|
|
||||||
return u,s,v
|
|
||||||
|
|
||||||
def mm(self,a,b, alpha=1.0, beta=0.0, c=None, trans_a=0,
|
|
||||||
trans_b=0):
|
|
||||||
"""Fast matrix multiplication
|
|
||||||
|
|
||||||
Return alpha*(a*b) + beta*c.
|
|
||||||
|
|
||||||
a,b,c : matrices
|
|
||||||
alpha, beta: scalars
|
|
||||||
trans_a : 0 (a not transposed),
|
|
||||||
1 (a transposed),
|
|
||||||
2 (a conjugate transposed)
|
|
||||||
trans_b : 0 (b not transposed),
|
|
||||||
1 (b transposed),
|
|
||||||
2 (b conjugate transposed)
|
|
||||||
"""
|
|
||||||
if c:
|
|
||||||
gemm,= get_blas_funcs(('gemm',),(a,b,c))
|
|
||||||
else:
|
|
||||||
gemm,= get_blas_funcs(('gemm',),(a,b))
|
|
||||||
|
|
||||||
return gemm(alpha, a, b, beta, c, trans_a, trans_b)
|
|
||||||
|
|
||||||
|
|
||||||
class LoadMoothaData(Function):
|
|
||||||
def __init__(self):
|
|
||||||
Function.__init__(self, 'load', 'Load diabetes data')
|
|
||||||
|
|
||||||
def run(self,data):
|
|
||||||
data_file = open('full_data.pickle','r')
|
|
||||||
data = pickle.load(data_file)
|
|
||||||
data_file.close()
|
|
||||||
sample_file = open('sample_labels.pickle','r')
|
|
||||||
sample_names = pickle.load(sample_file)
|
|
||||||
sample_file.close()
|
|
||||||
typecode='f'
|
|
||||||
nSamps = len(sample_names)
|
|
||||||
nVars = len(data.keys())
|
|
||||||
gene_ids = []
|
|
||||||
x = zeros((nSamps,nVars),typecode)
|
|
||||||
for i,(id,desc) in enumerate(data.items()):
|
|
||||||
gene_ids.append(id)
|
|
||||||
x[:,i] = desc[0].astype(typecode)
|
|
||||||
gene_def = ('genes',gene_ids)
|
|
||||||
sample_def = ('samples', sample_names)
|
|
||||||
X = dataset.Dataset(x,identifiers=[sample_def,gene_def]) # samples x genes
|
|
||||||
return [X]
|
|
||||||
|
|
||||||
class Log2Function(Function):
|
|
||||||
def __init__(self):
|
|
||||||
Function.__init__(self, 'log', 'Log2')
|
|
||||||
|
|
||||||
def run(self,data):
|
|
||||||
x = log2(data._array)
|
|
||||||
#pull out identifiers
|
|
||||||
ids = []
|
|
||||||
for dim in data.get_dim_names():
|
|
||||||
ids.append((dim,data.get_identifiers(dim)))
|
|
||||||
return [dataset.Dataset(x,identifiers=ids,name='Log2_X')]
|
|
||||||
PCAWorkflow.name = 'PCA Workflow'
|
|
|
@ -1,5 +1,5 @@
|
||||||
import gtk
|
import gtk
|
||||||
from system import dataset, logger, plots, workflow
|
from fluents import dataset, logger, plots, workflow
|
||||||
#import geneontology
|
#import geneontology
|
||||||
#import gostat
|
#import gostat
|
||||||
from scipy import array, randn, log, ones
|
from scipy import array, randn, log, ones
|
||||||
|
|
Reference in New Issue