Projects/laydi

Updated import statements, and removed the workflows pca_workflow and affy_workflow.
Einar Ryeng 2006-10-17 14:42:27 +00:00
parent 610812f265
commit 375d45e0cc
10 changed files with 9 additions and 683 deletions
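The import updates all follow one mechanical pattern: the 'system' package has been renamed to 'fluents', so "from system import X" becomes "from fluents import X" in external callers, and a plain "import X" between sibling modules inside the package. A throwaway sketch of that rewrite (not part of this commit; rewrite_import is a made-up helper for illustration):

import re

def rewrite_import(line, inside_package):
    # Hypothetical illustration of the pattern applied across these files.
    m = re.match(r"from system import (.+)", line)
    if not m:
        return line
    if inside_package:
        # sibling modules are imported directly (old-style implicit relative import)
        return "import %s" % m.group(1)
    return "from fluents import %s" % m.group(1)

print(rewrite_import("from system import dataset, plots", inside_package=True))
# -> import dataset, plots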

View File

@@ -2,7 +2,7 @@
 from getopt import getopt
 import sys
-from system import fluents, project, workflow
+from fluents import fluents, project, workflow
 import workflows
 PROGRAM_NAME = 'fluents'

View File

@@ -13,12 +13,12 @@ import gnome
 import gnome.ui
 import scipy
 import pango
-from system import project, workflow, dataset, logger, plots, navigator, dialogs, selections
+import project, workflow, dataset, logger, plots, navigator, dialogs, selections
 PROGRAM_NAME = 'fluents'
 VERSION = '0.1.0'
-DATADIR = os.path.dirname(sys.modules['system'].__file__)
+DATADIR = os.path.dirname(sys.modules['fluents'].__file__)
 ICONDIR = os.path.join(DATADIR,"..","icons")
 GLADEFILENAME = os.path.join(DATADIR, 'fluents.glade')
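The DATADIR change above is forced by the same rename: the glade file and icons are located relative to the installed package's __file__, so the key into sys.modules must match the new package name. A self-contained sketch of the technique, using a stdlib package as a stand-in for 'fluents' so it runs anywhere (the .glade filename is hypothetical):

import os
import sys
import json  # stand-in for the 'fluents' package

DATADIR = os.path.dirname(sys.modules['json'].__file__)
GLADEFILENAME = os.path.join(DATADIR, 'fluents.glade')
print(GLADEFILENAME)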

View File

@@ -3,7 +3,7 @@ import gtk
 import gobject
 import plots
 import time
-from system import dataset, logger, plots, project, workflow
+import dataset, logger, plots, project, workflow
 class NavigatorView (gtk.TreeView):
     def __init__(self, project, app):

View File

@@ -4,7 +4,7 @@ import pygtk
 import gobject
 import gtk
 import fluents
-from system import logger
+import logger
 import matplotlib
 from matplotlib.backends.backend_gtkagg import FigureCanvasGTKAgg as FigureCanvas
 from matplotlib.backend_bases import NavigationToolbar2,cursors

View File

@@ -4,7 +4,7 @@ import gobject
 import gtk
 import fluents
 import logger
-from system import dataset, plots
+import dataset, plots
 class Project:
     def __init__(self,name="Testing"):

View File

@@ -1,5 +1,5 @@
-from system import logger, dataset
+import logger, dataset
 import pygtk
 import gtk

View File

@@ -2,7 +2,7 @@ import gtk
 import sys
 import os
 import inspect
-from system import logger
+import logger
 def _workflow_classes(modname):
     """Returns a list of all subclasses of Workflow in a given module"""

View File

@@ -1,412 +0,0 @@
import gtk
import os.path
from system import dataset, logger, plots, workflow, dialogs
from scipy import randn, array, transpose, zeros
import cPickle


class AffyWorkflow (workflow.Workflow):
    name = 'Affy Workflow'
    ident = 'affy'
    description = 'Affymetrics Workflow. Analysis of Affy-data.'

    def __init__(self, app):
        workflow.Workflow.__init__(self, app)

        load = workflow.Stage('load', 'Load Data')
        load.add_function(CelFileImportFunction())
        load.add_function(PhenotypeImportFunction())
        load.add_function(TestDataFunction())
        load.add_function(DatasetLoadFunction())
        self.add_stage(load)

        significance = workflow.Stage('significance', 'Significance analysis')
        significance.add_function(LimmaFunction())
        self.add_stage(significance)

        explore = workflow.Stage('explore', 'Explorative analysis')
        explore.add_function(PCAFunction(self))
        explore.add_function(PrintFunction())
        self.add_stage(explore)

        save = workflow.Stage('save', 'Save Data')
        save.add_function(DatasetSaveFunction())
        self.add_stage(save)


class PrintFunction(workflow.Function):
    def __init__(self):
        workflow.Function.__init__(self, 'printer', 'Print Stuff')

    def run(self, data):
        dim1, dim2 = data.get_dim_names()
        print dim1, dim2
        print "\t", "\t".join(data.get_identifiers(dim2))
        for row in zip(data.get_identifiers(dim1), data.asarray().tolist()):
            print "\t".join(map(str, row))


class TestDataFunction(workflow.Function):
    def __init__(self):
        workflow.Function.__init__(self, 'test_data', 'Generate Test Data')

    def run(self):
        logger.log('notice', 'Injecting foo test data')
        x = randn(20,30)
        X = dataset.Dataset(x)
        return [X, plots.LinePlot(X)]


class DatasetLoadFunction(workflow.Function):
    """Loader for previously pickled Datasets."""
    def __init__(self):
        workflow.Function.__init__(self, 'load_data', 'Load Pickled Dataset')

    def run(self):
        chooser = gtk.FileChooserDialog(title="Select cel files...", parent=None,
                                        action=gtk.FILE_CHOOSER_ACTION_OPEN,
                                        buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
                                                 gtk.STOCK_OPEN, gtk.RESPONSE_OK))
        pkl_filter = gtk.FileFilter()
        pkl_filter.set_name("Python pickled data files (*.pkl)")
        pkl_filter.add_pattern("*.[pP][kK][lL]")
        all_filter = gtk.FileFilter()
        all_filter.set_name("All Files (*.*)")
        all_filter.add_pattern("*")
        chooser.add_filter(pkl_filter)
        chooser.add_filter(all_filter)
        try:
            if chooser.run() == gtk.RESPONSE_OK:
                return [cPickle.load(open(chooser.get_filename()))]
        finally:
            chooser.destroy()


class DatasetSaveFunction(workflow.Function):
    """QND way to save data to file for later import to this program."""
    def __init__(self):
        workflow.Function.__init__(self, 'save_data', 'Save Pickled Dataset')

    def run(self, data):
        chooser = gtk.FileChooserDialog(title="Save pickled data...", parent=None,
                                        action=gtk.FILE_CHOOSER_ACTION_SAVE,
                                        buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
                                                 gtk.STOCK_SAVE, gtk.RESPONSE_OK))
        pkl_filter = gtk.FileFilter()
        pkl_filter.set_name("Python pickled data files (*.pkl)")
        pkl_filter.add_pattern("*.[pP][kK][lL]")
        all_filter = gtk.FileFilter()
        all_filter.set_name("All Files (*.*)")
        all_filter.add_pattern("*")
        chooser.add_filter(pkl_filter)
        chooser.add_filter(all_filter)
        chooser.set_current_name(data.get_name() + ".pkl")
        try:
            if chooser.run() == gtk.RESPONSE_OK:
                cPickle.dump(data, open(chooser.get_filename(), "w"), protocol=2)
                logger.log("notice", "Saved data to %r." % chooser.get_filename())
        finally:
            chooser.destroy()


class LimmaFunction(workflow.Function):
    def __init__(self):
        workflow.Function.__init__(self, 'limma', 'Limma')

    def run(self, affy, data):
        response = dialogs.get_text('Enter contrasts...', """\
Enter comma-separated list of contrasts.
Available categories: %s

Example: Y-N, M-F""" % ", ".join(data.get_categories()))
        logger.log("notice", "contrasts selected: %s" % response)

        categories = []
        [categories.extend(s.split("-")) for s in response.split(",")]
        categories = [s.strip() for s in categories]

        factors = data.get_factors(categories)
        if not factors:
            logger.log("warning", "nothing to do, no factors")

        table = data.get_phenotype_table([os.path.splitext(f)[0] for f in affy.get_identifiers('filename')])
        cn = table[0]
        entries = zip(*table[1:])
        rn = entries[0]

        import rpy
        silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
        rpy.r.library("limma")
        silent_eval("a <- matrix('kalle', nrow=%d, ncol=%d)" % (len(rn), len(cn)))
        for i, row in enumerate(entries):
            for j, entry in enumerate(row):
                silent_eval("a[%d, %d] <- '%s'" % (j+1, i+1, entry))
        rpy.r.assign("rn", rn)
        rpy.r.assign("cn", cn)
        silent_eval("rownames(a) <- rn")
        silent_eval("colnames(a) <- cn")

        unique_categories = list(set(categories))

        # compose fancy list of factors for design matrix
        silent_eval("design <- matrix(0, nrow=%d, ncol=%d)" % (len(rn), len(unique_categories)))
        for i, category in enumerate(unique_categories):
            for j, value in enumerate(data.get_category_variable(category)):
                silent_eval("design[%d, %d] <- %d" % (j+1, i+1, value))
        rpy.r.assign("colnames.design", unique_categories)
        silent_eval("colnames(design) <- colnames.design")
        rpy.r.assign("expr", affy.asarray())
        silent_eval("fit <- lmFit(expr, design)")
        silent_eval("contrast.matrix <- makeContrasts(%s, levels=design)" % response)
        silent_eval("fit2 <- contrasts.fit(fit, contrast.matrix)")
        silent_eval("fit2 <- eBayes(fit2)")

        coeff = rpy.r("fit2$coefficients")
        amean = rpy.r("fit2$Amean")
        padj = rpy.r("p.adjust(fit2$p.value, method='fdr')")

        dim_1, dim_2 = affy.get_dim_names()
        coeff_data = dataset.Dataset(coeff, [(dim_1, affy.get_identifiers(dim_1)),
                                             ("contrast", [response])],
                                     name="Coefficients")
        amean_data = dataset.Dataset(array(amean), [("average", ["average"]),
                                                    (dim_1, affy.get_identifiers(dim_1))],
                                     name="Average Intensity")
        padj_data = dataset.Dataset(padj, [(dim_1, affy.get_identifiers(dim_1)),
                                           ("contrast", [response])],
                                    name="Adjusted P-value")

        vulcano_plot = plots.ScatterPlot(coeff_data, padj_data, dim_1,
                                         'contrast', response, response,
                                         name="Vulcano plot")

        # We should be nice and clean up after ourselves
        rpy.r("rm(list=ls())")

        return [coeff_data, amean_data, padj_data, vulcano_plot]


class CelFileImportFunction(workflow.Function):
    """Loads Affymetrics .CEL-files into matrix."""
    def __init__(self):
        workflow.Function.__init__(self, 'cel_import', 'Import Affy')

    def run(self):
        import rpy
        chooser = gtk.FileChooserDialog(title="Select cel files...", parent=None,
                                        action=gtk.FILE_CHOOSER_ACTION_OPEN,
                                        buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
                                                 gtk.STOCK_OPEN, gtk.RESPONSE_OK))
        chooser.set_select_multiple(True)
        cel_filter = gtk.FileFilter()
        cel_filter.set_name("Cel Files (*.cel)")
        cel_filter.add_pattern("*.[cC][eE][lL]")
        all_filter = gtk.FileFilter()
        all_filter.set_name("All Files (*.*)")
        all_filter.add_pattern("*")
        chooser.add_filter(cel_filter)
        chooser.add_filter(all_filter)
        try:
            if chooser.run() == gtk.RESPONSE_OK:
                rpy.r.library("affy")
                silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
                silent_eval('E <- ReadAffy(filenames=c("%s"))' % '", "'.join(chooser.get_filenames()))
                silent_eval('E <- rma(E)')
                m = rpy.r('m <- E@exprs')
                vector_eval = rpy.with_mode(rpy.VECTOR_CONVERSION, rpy.r)
                rownames = vector_eval('rownames(m)')
                colnames = vector_eval('colnames(m)')

                # We should be nice and clean up after ourselves
                rpy.r.rm(["E", "m"])

                if m:
                    data = dataset.Dataset(m, (('ids', rownames), ('filename', colnames)), name="Affymetrics Data")
                    plot = plots.LinePlot(data, "Gene profiles")
                    return [data, plot]
                else:
                    logger.log("notice", "No data loaded from importer.")
        finally:
            chooser.destroy()


class PhenotypeImportFunction(workflow.Function):
    def __init__(self):
        workflow.Function.__init__(self, 'import_phenotype', 'Import Phenotypes')

    def run(self):
        chooser = gtk.FileChooserDialog(title="Select cel files...", parent=None,
                                        action=gtk.FILE_CHOOSER_ACTION_OPEN,
                                        buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
                                                 gtk.STOCK_OPEN, gtk.RESPONSE_OK))
        all_filter = gtk.FileFilter()
        all_filter.set_name("Tab separated file (*.*)")
        all_filter.add_pattern("*")
        chooser.add_filter(all_filter)
        try:
            if chooser.run() == gtk.RESPONSE_OK:
                text = open(chooser.get_filename()).read()
                data = PhenotypeDataset(text)
                return [data]
        finally:
            chooser.destroy()


class PCAFunction(workflow.Function):
    """Generic PCA function."""
    def __init__(self, wf):
        workflow.Function.__init__(self, 'pca', 'PCA')
        self._workflow = wf

    def run(self, data):
        import rpy
        dim_2, dim_1 = data.get_dim_names()

        silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
        rpy.with_mode(rpy.NO_CONVERSION, rpy.r.assign)("m", data.asarray())
        silent_eval("t = prcomp(t(m))")

        # we make a unique name for component dimension
        c = 0
        component_dim = prefix = "component"
        while component_dim in data.get_all_dims():
            component_dim = prefix + "_" + str(c)
            c += 1

        T_ids = map(str, range(1, rpy.r("dim(t$x)")[1]+1))
        T = dataset.Dataset(rpy.r("t$x"), [(dim_1, data.get_identifiers(dim_1)),
                                           (component_dim, T_ids)],
                            all_dims=data.get_all_dims(), name="T")
        P = dataset.Dataset(rpy.r("t$rotation"), [(dim_2, data.get_identifiers(dim_2)),
                                                  (component_dim, T_ids)],
                            all_dims=data.get_all_dims(), name="P")

        # cleanup
        rpy.r.rm(["t", "m"])

        loading_plot = plots.ScatterPlot(P, P, dim_2, component_dim, '1', '2',
                                         "Loadings")
        score_plot = plots.ScatterPlot(T, T, dim_1, component_dim, '1', '2',
                                       "Scores")

        return [T, P, loading_plot, score_plot]


class PhenotypeDataset(dataset.Dataset):
    def __init__(self, string):
        self._table = rows = [line.split("\t") for line in string.splitlines()]
        columns = zip(*rows[1:])
        cel_names = columns[0]
        col_names = rows[0][1:]
        phenotypes = []
        categories = {}
        self._categories = {}

        for col_name, column in zip(col_names, columns[1:]):
            try:
                categories[col_name] = map(float, column)
                phenotypes.append(col_name)
            except ValueError:
                # category-data
                keys = []
                entries = {}
                for i, entry in enumerate(column):
                    if entry not in entries:
                        keys.append(entry)
                        entries[entry] = []
                    entries[entry].append(i)
                for key in keys:
                    self._categories[key] = col_name
                    z = zeros(len(column))
                    for i in entries[key]:
                        z[i] = 1
                    key = "%s-%s" % (col_name, key)
                    phenotypes.append(key)
                    categories[key] = z

        matrix_data = []
        for key in phenotypes:
            matrix_data.append(categories[key])

        if matrix_data:
            a = transpose(array(matrix_data))
        else:
            a = None

        dataset.Dataset.__init__(self, a, identifiers=[('CEL', cel_names),
                                                       ('phenotypes', phenotypes)],
                                 shape=(len(cel_names),len(phenotypes)), name="Phenotype Data")

    def sort_cels(self, cel_names):
        self._dims = []
        cels = {}
        for row in self._table[1:]:
            cels[row[0]] = row[1:]

        new_table = [self._table[0]]
        for name in cel_names:
            new_table.append([name] + cels[name])

        self._table = new_table
        self._set_identifiers([('CEL', cel_names), ('phenotypes', self.get_identifiers('phenotypes'))], self._all_dims)

    def get_phenotype_table(self, cel_order=None):
        """Get string based table of phenotypes as read from file."""
        if not cel_order:
            return self._table
        else:
            cels = {}
            for row in self._table[1:]:
                cels[row[0]] = row[1:]
            new_table = [self._table[0]]
            for name in cel_order:
                new_table.append([name] + cels[name])
            return new_table

    def get_categories(self):
        """Get categories of factors.

        If factor 'sick' had possibilites Y/N, and 'sex' M/F, the
        categories would be Y, N, M and F.
        """
        return self._categories.keys()

    def get_factors(self, categories):
        factors = set()
        for c in categories:
            factors.add(self._categories[c])
        return factors

    def get_category_variable(self, category):
        # abit brute-force, but does the job until optimization is
        # necessary
        factor = self._categories[category]
        variable = []
        for column in zip(*self.get_phenotype_table()):
            if column[0] == factor:
                for entry in column[1:]:
                    if entry == category:
                        variable.append(1)
                    else:
                        variable.append(0)
        return variable
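Worth noting before the code disappears: PhenotypeDataset.__init__ above turns each non-numeric phenotype column into one 0/1 indicator column per level, so a 'sick' factor with values Y/N yields columns 'sick-Y' and 'sick-N'. A standalone sketch of that dummy coding (dummy_code is a hypothetical helper, not part of the deleted file):

from numpy import zeros

def dummy_code(column, col_name):
    # One indicator vector per distinct level, in order of first appearance.
    levels, indicators = [], {}
    for i, entry in enumerate(column):
        if entry not in indicators:
            levels.append(entry)
            indicators[entry] = zeros(len(column))
        indicators[entry][i] = 1
    return [("%s-%s" % (col_name, lvl), indicators[lvl]) for lvl in levels]

print(dummy_code(("Y", "N", "Y"), "sick"))
# -> [('sick-Y', array([1., 0., 1.])), ('sick-N', array([0., 1., 0.]))]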

View File

@@ -1,262 +0,0 @@
import gtk
import system.workflow as wf
from system.workflow import Stage, Function
import pickle
from scipy import log2,transpose,dot,divide,shape,mean,resize,zeros
from scipy.linalg import svd,inv,norm,get_blas_funcs,eig
from system import dataset, logger, plots


class PCAWorkflow(wf.Workflow):
    name = 'PCA Workflow'
    ident = 'pca'
    description = 'PCA workflow. Uses real microarray data from a study of diabetes (Mootha et al.).'

    def __init__(self, app):
        wf.Workflow.__init__(self, app)
        #self.add_project(app.project)
        #logger.log('notice','Current project added to: %s' %self.name)

        load = Stage('load', 'Load Data')
        load.add_function(LoadMoothaData())
        self.add_stage(load)

        preproc = Stage('preprocess', 'Preprocessing')
        preproc.add_function(Log2Function())
        self.add_stage(preproc)

        annot = Stage('annot', 'Affy annotations')
        annot.add_function(LoadAnnotationsFunction())
        self.add_stage(annot)

        model = Stage('model', 'Model')
        model.add_function(PCAFunction(self))
        self.add_stage(model)

        logger.log('debug', '\tPCA\'s workflow is now active')


class LoadAnnotationsFunction(Function):
    def __init__(self):
        Function.__init__(self, 'load', 'Load Annotations')

    def load_affy_file(self, filename):
        f = open(filename)
        logger.log('notice', 'Loading annotation file: %s' % filename)
        self.file = f

    def on_response(self, dialog, response):
        if response == gtk.RESPONSE_OK:
            logger.log('notice', 'Reading file: %s' % dialog.get_filename())
            self.load_affy_file(dialog.get_filename())

    def run(self, data):
        btns = ('Open', gtk.RESPONSE_OK, \
                'Cancel', gtk.RESPONSE_CANCEL)
        dialog = gtk.FileChooserDialog('Open Affy Annotation File',
                                       buttons=btns)
        dialog.connect('response', self.on_response)
        dialog.run()
        dialog.destroy()

        ### Reading and parsing here
        annot = read_affy_annot(self.file)
        i_want = 'Pathway'
        nothing = '---'
        ids_in_data = set(data.names('genes'))  # assuming we have genes
        sanity_check = set(annot.keys())
        if not ids_in_data.intersection(sanity_check) == ids_in_data:
            logger.log('debug','Some identifers in data does not exist in affy file!')
        for affy_id,description in annot:
            if affy_id in ids_in_data:
                pathways = description[i_want]
                if not pathways[0][0]=='--':
                    pass
        D = []
        return [D]


class PCAFunction(Function):
    def __init__(self, workflow):
        Function.__init__(self, 'pca', 'PCA')
        self.output = None
        self.workflow = workflow

    def run(self, data):
        logger.log('debug', 'datatype: %s' % type(data))
        if not isinstance(data, dataset.Dataset):
            return None
        #logger.log('debug', 'dimensions: %s' % data.dims)

        ## calculations
        T,P,E,tsq = self.pca(data._array,5,tsq_loads=False)
        comp_def = ('comp',('1','2','3','4','5'))
        singel_def = ('1',('s'))

        # pull out input identifiers:
        row_ids = data.get_identifiers('genes')
        col_ids = data.get_identifiers('samples')

        T = dataset.Dataset(T,[('samples',col_ids),comp_def],name='T2')
        P = dataset.Dataset(P,[('genes',row_ids),comp_def],name='P')
        E = dataset.Dataset(E,[('samples',col_ids),('genes',row_ids)],name='E')
        #tsq = dataset.Dataset(tsq,[singel_def,data_ids[1])

        ## plots
        loading_plot1 = plots.ScatterPlot(P,P,'genes','comp','1','2')
        loading_plot2 = plots.ScatterPlot(P,P,'genes','comp','3','4')
        score_plot = plots.ScatterPlot(T,T,'samples','comp','1','2')

        return [T,P,E,loading_plot1,loading_plot2,score_plot]

    def pca(self, X, a_opt, cent=True, scale='loads', tsq_loads=False):
        """Principal component analysis

        input:
            Xc -- matrix, data
            a_opt -- scalar, max number of comp. to calculate
            cent -- bool, centering [True]
            crit -- string, pc criteria ['exp_var',['ief','rpv','average']]
            scale -- string, scaling ['loads',['scores']]
            tsq_loads -- bool, calculate t-squared? [True]
            reg -- float, covariance regularizer for tsq calculations [0.2]
        output:
            T,P,E,r
        """
        nSamples, nVarX = shape(X)
        if cent:
            Xc = self.mat_center(X)
        else:
            Xc = X

        u,s,vh = self.esvd(Xc)
        if scale=='scores':
            T = u*s
            T = T[:,:a_opt]
            P = transpose(vh)
            P = P[:,:a_opt]
        elif scale=='loads':
            T = u
            T = T[:,:a_opt]
            P = transpose(vh)*s
            P = P[:,:a_opt]

        E = Xc - dot(T,transpose(P))
        varEach = s**2
        totVar = sum(varEach)
        r = divide(varEach,totVar)*100
        return T,P,E,r

    def mat_center(self, X, axis=0, ret_mn=False):
        """Mean center matrix along axis.

        input:
            X -- matrix, data
            axis -- dim,
            ret_mn -- bool, return mean
        output:
            Xc, [mnX]

        NB: axis = 1 is column-centering, axis=0=row-centering
        default is row centering (axis=0)
        """
        try:
            rows,cols = shape(X)
        except ValueError:
            print "The X data needs to be two-dimensional"

        if axis==0:
            mnX = mean(X,axis)
            Xs = X - resize(mnX,(rows,cols))
        elif axis==1:
            mnX = mean(X,axis)
            Xs = transpose(transpose(X) - resize(mnX,(cols,rows)))
        if ret_mn:
            return Xs,mnX
        else:
            return Xs

    def esvd(self, data, economy=1):
        """SVD with the option of economy sized calculation

        Calculate subspaces of X'X or XX' depending on the shape
        of the matrix.

        Good for extreme fat or thin matrices.
        """
        mm = self.mm
        m,n = shape(data)
        if m>=n:
            u,s,v = svd(mm(data,data,trans_a=1))
            u = mm(data,v,trans_b=1)
            for i in xrange(n):
                s[i] = norm(u[:,i])
                u[:,i] = u[:,i]/s[i]
        else:
            u,s,v = svd(mm(data,data,trans_b=1))
            v = mm(u,data,trans_a=1)
            for i in xrange(m):
                s[i] = norm(v[i,:])
                v[i,:] = v[i,:]/s[i]
        return u,s,v

    def mm(self, a, b, alpha=1.0, beta=0.0, c=None, trans_a=0, trans_b=0):
        """Fast matrix multiplication

        Return alpha*(a*b) + beta*c.

        a,b,c : matrices
        alpha, beta: scalars

        trans_a : 0 (a not transposed),
                  1 (a transposed),
                  2 (a conjugate transposed)
        trans_b : 0 (b not transposed),
                  1 (b transposed),
                  2 (b conjugate transposed)
        """
        if c:
            gemm,= get_blas_funcs(('gemm',),(a,b,c))
        else:
            gemm,= get_blas_funcs(('gemm',),(a,b))
        return gemm(alpha, a, b, beta, c, trans_a, trans_b)


class LoadMoothaData(Function):
    def __init__(self):
        Function.__init__(self, 'load', 'Load diabetes data')

    def run(self, data):
        data_file = open('full_data.pickle','r')
        data = pickle.load(data_file)
        data_file.close()
        sample_file = open('sample_labels.pickle','r')
        sample_names = pickle.load(sample_file)
        sample_file.close()

        typecode = 'f'
        nSamps = len(sample_names)
        nVars = len(data.keys())
        gene_ids = []
        x = zeros((nSamps,nVars),typecode)
        for i,(id,desc) in enumerate(data.items()):
            gene_ids.append(id)
            x[:,i] = desc[0].astype(typecode)

        gene_def = ('genes', gene_ids)
        sample_def = ('samples', sample_names)
        X = dataset.Dataset(x, identifiers=[sample_def, gene_def])  # samples x genes
        return [X]


class Log2Function(Function):
    def __init__(self):
        Function.__init__(self, 'log', 'Log2')

    def run(self, data):
        x = log2(data._array)
        #pull out identifiers
        ids = []
        for dim in data.get_dim_names():
            ids.append((dim, data.get_identifiers(dim)))
        return [dataset.Dataset(x, identifiers=ids, name='Log2_X')]


PCAWorkflow.name = 'PCA Workflow'
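The deleted esvd method is the one piece of real numerics here: instead of decomposing an m-by-n matrix directly, it takes the SVD of the small Gram matrix (X'X when the matrix is thin, XX' when it is fat) and recovers the other side's singular vectors by projecting and normalizing. A self-contained NumPy sketch of the same trick (economy_svd is an illustrative name; signs of the vectors may differ from a direct SVD):

import numpy as np

def economy_svd(X):
    m, n = X.shape
    if m >= n:  # thin: decompose the n x n matrix X'X
        _, _, vt = np.linalg.svd(X.T.dot(X))
        U = X.dot(vt.T)                  # X V = U * diag(s), unnormalized
        s = np.linalg.norm(U, axis=0)
        return U / s, s, vt
    else:       # fat: decompose the m x m matrix XX'
        u, _, _ = np.linalg.svd(X.dot(X.T))
        Vt = u.T.dot(X)                  # U'X = diag(s) * V', unnormalized
        s = np.linalg.norm(Vt, axis=1)
        return u, s, Vt / s[:, None]

X = np.random.randn(200, 5)
U, s, Vt = economy_svd(X)
print(np.allclose(U.dot(np.diag(s)).dot(Vt), X))  # -> True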

View File

@@ -1,5 +1,5 @@
 import gtk
-from system import dataset, logger, plots, workflow
+from fluents import dataset, logger, plots, workflow
 #import geneontology
 #import gostat
 from scipy import array, randn, log, ones