From 375d45e0cc68aba15e13503350f901a2dc5cc4a3 Mon Sep 17 00:00:00 2001
From: einarr
Date: Tue, 17 Oct 2006 14:42:27 +0000
Subject: [PATCH] Updated import statements, and removed the workflows
 pca_workflow and affy_workflow.

---
 bin/fluents                |   2 +-
 fluents/fluents.py         |   4 +-
 fluents/navigator.py       |   2 +-
 fluents/plots.py           |   2 +-
 fluents/project.py         |   2 +-
 fluents/selections.py      |   2 +-
 fluents/workflow.py        |   2 +-
 workflows/affy_workflow.py | 412 -------------------------------------
 workflows/pca_workflow.py  | 262 ----------------------
 workflows/test_workflow.py |   2 +-
 10 files changed, 9 insertions(+), 683 deletions(-)
 delete mode 100644 workflows/affy_workflow.py
 delete mode 100644 workflows/pca_workflow.py

diff --git a/bin/fluents b/bin/fluents
index cb25aa2..0ecd71f 100755
--- a/bin/fluents
+++ b/bin/fluents
@@ -2,7 +2,7 @@
 from getopt import getopt
 import sys
 
-from system import fluents, project, workflow
+from fluents import fluents, project, workflow
 import workflows
 
 PROGRAM_NAME = 'fluents'
diff --git a/fluents/fluents.py b/fluents/fluents.py
index c20ef75..003995b 100755
--- a/fluents/fluents.py
+++ b/fluents/fluents.py
@@ -13,12 +13,12 @@
 import gnome
 import gnome.ui
 import scipy
 import pango
-from system import project, workflow, dataset, logger, plots, navigator, dialogs, selections
+import project, workflow, dataset, logger, plots, navigator, dialogs, selections
 
 PROGRAM_NAME = 'fluents'
 VERSION = '0.1.0'
 
-DATADIR = os.path.dirname(sys.modules['system'].__file__)
+DATADIR = os.path.dirname(sys.modules['fluents'].__file__)
 ICONDIR = os.path.join(DATADIR,"..","icons")
 GLADEFILENAME = os.path.join(DATADIR, 'fluents.glade')
diff --git a/fluents/navigator.py b/fluents/navigator.py
index 68e8936..e362a55 100644
--- a/fluents/navigator.py
+++ b/fluents/navigator.py
@@ -3,7 +3,7 @@ import gtk
 import gobject
 import plots
 import time
-from system import dataset, logger, plots, project, workflow
+import dataset, logger, plots, project, workflow
 
 class NavigatorView (gtk.TreeView):
     def __init__(self, project, app):
diff --git a/fluents/plots.py b/fluents/plots.py
index 2fbec64..2e9c123 100644
--- a/fluents/plots.py
+++ b/fluents/plots.py
@@ -4,7 +4,7 @@ import pygtk
 import gobject
 import gtk
 import fluents
-from system import logger
+import logger
 import matplotlib
 from matplotlib.backends.backend_gtkagg import FigureCanvasGTKAgg as FigureCanvas
 from matplotlib.backend_bases import NavigationToolbar2,cursors
diff --git a/fluents/project.py b/fluents/project.py
index a5c0961..801f3c6 100644
--- a/fluents/project.py
+++ b/fluents/project.py
@@ -4,7 +4,7 @@ import gobject
 import gtk
 import fluents
 import logger
-from system import dataset, plots
+import dataset, plots
 
 class Project:
     def __init__(self,name="Testing"):
diff --git a/fluents/selections.py b/fluents/selections.py
index 425e6a2..5d4844d 100644
--- a/fluents/selections.py
+++ b/fluents/selections.py
@@ -1,5 +1,5 @@
-from system import logger, dataset
+import logger, dataset
 
 import pygtk
 import gtk
 
diff --git a/fluents/workflow.py b/fluents/workflow.py
index 18568a0..e882573 100644
--- a/fluents/workflow.py
+++ b/fluents/workflow.py
@@ -2,7 +2,7 @@ import gtk
 import sys
 import os
 import inspect
-from system import logger
+import logger
 
 def _workflow_classes(modname):
     """Returns a list of all subclasses of Workflow in a given module"""
diff --git a/workflows/affy_workflow.py b/workflows/affy_workflow.py
deleted file mode 100644
index a89071d..0000000
--- a/workflows/affy_workflow.py
+++ /dev/null
@@ -1,412 +0,0 @@
-import gtk
-import os.path
-from system import dataset, logger, plots, workflow, dialogs
-from scipy import randn, array, transpose, zeros
-import cPickle
-
-
-class AffyWorkflow (workflow.Workflow):
-
-    name = 'Affy Workflow'
-    ident = 'affy'
-    description = 'Affymetrics Workflow. Analysis of Affy-data.'
-
-    def __init__(self, app):
-        workflow.Workflow.__init__(self, app)
-
-        load = workflow.Stage('load', 'Load Data')
-        load.add_function(CelFileImportFunction())
-        load.add_function(PhenotypeImportFunction())
-        load.add_function(TestDataFunction())
-        load.add_function(DatasetLoadFunction())
-        self.add_stage(load)
-
-        significance = workflow.Stage('significance', 'Significance analysis')
-        significance.add_function(LimmaFunction())
-        self.add_stage(significance)
-
-        explore = workflow.Stage('explore', 'Explorative analysis')
-        explore.add_function(PCAFunction(self))
-        explore.add_function(PrintFunction())
-        self.add_stage(explore)
-
-        save = workflow.Stage('save', 'Save Data')
-        save.add_function(DatasetSaveFunction())
-        self.add_stage(save)
-
-class PrintFunction(workflow.Function):
-    def __init__(self):
-        workflow.Function.__init__(self, 'printer', 'Print Stuff')
-
-    def run(self, data):
-        dim1, dim2 = data.get_dim_names()
-        print dim1, dim2
-        print "\t", "\t".join(data.get_identifiers(dim2))
-        for row in zip(data.get_identifiers(dim1), data.asarray().tolist()):
-            print "\t".join(map(str, row))
-
-
-class TestDataFunction(workflow.Function):
-    def __init__(self):
-        workflow.Function.__init__(self, 'test_data', 'Generate Test Data')
-
-    def run(self):
-        logger.log('notice', 'Injecting foo test data')
-        x = randn(20,30)
-        X = dataset.Dataset(x)
-        return [X, plots.LinePlot(X)]
-
-
-class DatasetLoadFunction(workflow.Function):
-    """Loader for previously pickled Datasets."""
-    def __init__(self):
-        workflow.Function.__init__(self, 'load_data', 'Load Pickled Dataset')
-
-    def run(self):
-        chooser = gtk.FileChooserDialog(title="Select cel files...", parent=None,
-                                        action=gtk.FILE_CHOOSER_ACTION_OPEN,
-                                        buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
-                                                 gtk.STOCK_OPEN, gtk.RESPONSE_OK))
-        pkl_filter = gtk.FileFilter()
-        pkl_filter.set_name("Python pickled data files (*.pkl)")
-        pkl_filter.add_pattern("*.[pP][kK][lL]")
-        all_filter = gtk.FileFilter()
-        all_filter.set_name("All Files (*.*)")
-        all_filter.add_pattern("*")
-        chooser.add_filter(pkl_filter)
-        chooser.add_filter(all_filter)
-
-        try:
-            if chooser.run() == gtk.RESPONSE_OK:
-                return [cPickle.load(open(chooser.get_filename()))]
-        finally:
-            chooser.destroy()
-
-
-class DatasetSaveFunction(workflow.Function):
-    """QND way to save data to file for later import to this program."""
-    def __init__(self):
-        workflow.Function.__init__(self, 'save_data', 'Save Pickled Dataset')
-
-    def run(self, data):
-        chooser = gtk.FileChooserDialog(title="Save pickled data...", parent=None,
-                                        action=gtk.FILE_CHOOSER_ACTION_SAVE,
-                                        buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
-                                                 gtk.STOCK_SAVE, gtk.RESPONSE_OK))
-        pkl_filter = gtk.FileFilter()
-        pkl_filter.set_name("Python pickled data files (*.pkl)")
-        pkl_filter.add_pattern("*.[pP][kK][lL]")
-        all_filter = gtk.FileFilter()
-        all_filter.set_name("All Files (*.*)")
-        all_filter.add_pattern("*")
-        chooser.add_filter(pkl_filter)
-        chooser.add_filter(all_filter)
-        chooser.set_current_name(data.get_name() + ".pkl")
-
-        try:
-            if chooser.run() == gtk.RESPONSE_OK:
-                cPickle.dump(data, open(chooser.get_filename(), "w"), protocol=2)
-                logger.log("notice", "Saved data to %r."
-                           % chooser.get_filename())
-        finally:
-            chooser.destroy()
-
-
-class LimmaFunction(workflow.Function):
-    def __init__(self):
-        workflow.Function.__init__(self, 'limma', 'Limma')
-
-    def run(self, affy, data):
-        response = dialogs.get_text('Enter contrasts...', """\
-Enter comma-separated list of contrasts.
-Available categories: %s
-
-Example: Y-N, M-F""" % ", ".join(data.get_categories()))
-
-        logger.log("notice", "contrasts selected: %s" % response)
-
-        categories = []
-        [categories.extend(s.split("-")) for s in response.split(",")]
-        categories = [s.strip() for s in categories]
-
-        factors = data.get_factors(categories)
-        if not factors:
-            logger.log("warning", "nothing to do, no factors")
-
-        table = data.get_phenotype_table([os.path.splitext(f)[0] for f in affy.get_identifiers('filename')])
-        cn = table[0]
-        entries = zip(*table[1:])
-        rn = entries[0]
-
-        import rpy
-        silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
-        rpy.r.library("limma")
-        silent_eval("a <- matrix('kalle', nrow=%d, ncol=%d)" % (len(rn), len(cn)))
-        for i, row in enumerate(entries):
-            for j, entry in enumerate(row):
-                silent_eval("a[%d, %d] <- '%s'" % (j+1, i+1, entry))
-        rpy.r.assign("rn", rn)
-        rpy.r.assign("cn", cn)
-        silent_eval("rownames(a) <- rn")
-        silent_eval("colnames(a) <- cn")
-
-        unique_categories = list(set(categories))
-
-        # compose fancy list of factors for design matrix
-        silent_eval("design <- matrix(0, nrow=%d, ncol=%d)" % (len(rn), len(unique_categories)))
-        for i, category in enumerate(unique_categories):
-            for j, value in enumerate(data.get_category_variable(category)):
-                silent_eval("design[%d, %d] <- %d" % (j+1, i+1, value))
-
-        rpy.r.assign("colnames.design", unique_categories)
-        silent_eval("colnames(design) <- colnames.design")
-
-        rpy.r.assign("expr", affy.asarray())
-        silent_eval("fit <- lmFit(expr, design)")
-        silent_eval("contrast.matrix <- makeContrasts(%s, levels=design)" % response)
-        silent_eval("fit2 <- contrasts.fit(fit, contrast.matrix)")
-        silent_eval("fit2 <- eBayes(fit2)")
-        coeff = rpy.r("fit2$coefficients")
-        amean = rpy.r("fit2$Amean")
-        padj = rpy.r("p.adjust(fit2$p.value, method='fdr')")
-
-        dim_1, dim_2 = affy.get_dim_names()
-
-
-        coeff_data = dataset.Dataset(coeff, [(dim_1, affy.get_identifiers(dim_1)),
-                                             ("contrast", [response])],
-                                     name="Coefficients")
-        amean_data = dataset.Dataset(array(amean), [("average", ["average"]),
-                                                    (dim_1, affy.get_identifiers(dim_1))],
-                                     name="Average Intensity")
-        padj_data = dataset.Dataset(padj, [(dim_1, affy.get_identifiers(dim_1)),
-                                           ("contrast", [response])],
-                                    name="Adjusted P-value")
-
-        vulcano_plot = plots.ScatterPlot(coeff_data, padj_data, dim_1,
-                                         'contrast', response, response,
-                                         name="Vulcano plot")
-
-        # We should be nice and clean up after ourselves
-        rpy.r("rm(list=ls())")
-
-        return [coeff_data, amean_data, padj_data, vulcano_plot]
-
-
-class CelFileImportFunction(workflow.Function):
-    """Loads Affymetrics .CEL-files into matrix."""
-    def __init__(self):
-        workflow.Function.__init__(self, 'cel_import', 'Import Affy')
-
-    def run(self):
-        import rpy
-        chooser = gtk.FileChooserDialog(title="Select cel files...", parent=None,
-                                        action=gtk.FILE_CHOOSER_ACTION_OPEN,
-                                        buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
-                                                 gtk.STOCK_OPEN, gtk.RESPONSE_OK))
-        chooser.set_select_multiple(True)
-        cel_filter = gtk.FileFilter()
-        cel_filter.set_name("Cel Files (*.cel)")
-        cel_filter.add_pattern("*.[cC][eE][lL]")
-        all_filter = gtk.FileFilter()
-        all_filter.set_name("All Files (*.*)")
-        all_filter.add_pattern("*")
-        chooser.add_filter(cel_filter)
-        chooser.add_filter(all_filter)
-
-        try:
-            if chooser.run() == gtk.RESPONSE_OK:
-                rpy.r.library("affy")
-
-                silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
-                silent_eval('E <- ReadAffy(filenames=c("%s"))' % '", "'.join(chooser.get_filenames()))
-                silent_eval('E <- rma(E)')
-                m = rpy.r('m <- E@exprs')
-
-                vector_eval = rpy.with_mode(rpy.VECTOR_CONVERSION, rpy.r)
-                rownames = vector_eval('rownames(m)')
-                colnames = vector_eval('colnames(m)')
-
-                # We should be nice and clean up after ourselves
-                rpy.r.rm(["E", "m"])
-
-                if m:
-                    data = dataset.Dataset(m, (('ids', rownames), ('filename', colnames)), name="Affymetrics Data")
-                    plot = plots.LinePlot(data, "Gene profiles")
-                    return [data, plot]
-                else:
-                    logger.log("notice", "No data loaded from importer.")
-        finally:
-            chooser.destroy()
-
-
-class PhenotypeImportFunction(workflow.Function):
-    def __init__(self):
-        workflow.Function.__init__(self, 'import_phenotype', 'Import Phenotypes')
-
-    def run(self):
-        chooser = gtk.FileChooserDialog(title="Select cel files...", parent=None,
-                                        action=gtk.FILE_CHOOSER_ACTION_OPEN,
-                                        buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
-                                                 gtk.STOCK_OPEN, gtk.RESPONSE_OK))
-        all_filter = gtk.FileFilter()
-        all_filter.set_name("Tab separated file (*.*)")
-        all_filter.add_pattern("*")
-        chooser.add_filter(all_filter)
-
-        try:
-            if chooser.run() == gtk.RESPONSE_OK:
-                text = open(chooser.get_filename()).read()
-                data = PhenotypeDataset(text)
-                return [data]
-
-        finally:
-            chooser.destroy()
-
-
-class PCAFunction(workflow.Function):
-    """Generic PCA function."""
-    def __init__(self, wf):
-        workflow.Function.__init__(self, 'pca', 'PCA')
-        self._workflow = wf
-
-    def run(self,data):
-        import rpy
-
-        dim_2, dim_1 = data.get_dim_names()
-
-        silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
-        rpy.with_mode(rpy.NO_CONVERSION, rpy.r.assign)("m", data.asarray())
-        silent_eval("t = prcomp(t(m))")
-
-        # we make a unique name for component dimension
-        c = 0
-        component_dim = prefix = "component"
-        while component_dim in data.get_all_dims():
-            component_dim = prefix + "_" + str(c)
-            c += 1
-
-        T_ids = map(str, range(1, rpy.r("dim(t$x)")[1]+1))
-        T = dataset.Dataset(rpy.r("t$x"), [(dim_1, data.get_identifiers(dim_1)),
-                                           (component_dim, T_ids)],
-                            all_dims = data.get_all_dims(), name="T")
-        P = dataset.Dataset(rpy.r("t$rotation"), [(dim_2, data.get_identifiers(dim_2)),
-                                                  (component_dim, T_ids)],
-                            all_dims = data.get_all_dims(), name="P")
-        # cleanup
-        rpy.r.rm(["t", "m"])
-
-        loading_plot = plots.ScatterPlot(P, P, dim_2, component_dim, '1', '2',
-                                         "Loadings")
-        score_plot = plots.ScatterPlot(T, T, dim_1,component_dim, '1', '2',
-                                       "Scores")
-
-        return [T, P, loading_plot, score_plot]
-
-
-class PhenotypeDataset(dataset.Dataset):
-    def __init__(self, string):
-        self._table = rows = [line.split("\t") for line in string.splitlines()]
-        columns = zip(*rows[1:])
-        cel_names = columns[0]
-        col_names = rows[0][1:]
-        phenotypes = []
-        categories = {}
-        self._categories = {}
-
-        for col_name, column in zip(col_names, columns[1:]):
-            try:
-                categories[col_name] = map(float, column)
-                phenotypes.append(col_name)
-            except ValueError:
-                # category-data
-                keys = []
-                entries = {}
-                for i, entry in enumerate(column):
-                    if entry not in entries:
-                        keys.append(entry)
-                        entries[entry] = []
-
-                    entries[entry].append(i)
-
-                for key in keys:
-                    self._categories[key] = col_name
-                    z = zeros(len(column))
-                    for i in entries[key]:
-                        z[i] = 1
-                    key = "%s-%s" % (col_name, key)
-                    phenotypes.append(key)
-                    categories[key] = z
-
-        matrix_data = []
-        for key in phenotypes:
-            matrix_data.append(categories[key])
-
-        if matrix_data:
-            a = transpose(array(matrix_data))
-        else:
-            a = None
-
-        dataset.Dataset.__init__(self, a, identifiers=[('CEL', cel_names),
-                                                       ('phenotypes', phenotypes)],
-                                 shape=(len(cel_names),len(phenotypes)), name="Phenotype Data")
-
-    def sort_cels(self, cel_names):
-        self._dims = []
-
-        cels = {}
-        for row in self._table[1:]:
-            cels[row[0]] = row[1:]
-
-        new_table = [self._table[0]]
-        for name in cel_names:
-            new_table.append([name] + cels[name])
-
-        self._table = new_table
-        self._set_identifiers([('CEL', cel_names), ('phenotypes', self.get_identifiers('phenotypes'))], self._all_dims)
-
-    def get_phenotype_table(self, cel_order=None):
-        """Get string based table of phenotypes as read from file."""
-        if not cel_order:
-            return self._table
-        else:
-            cels = {}
-            for row in self._table[1:]:
-                cels[row[0]] = row[1:]
-
-            new_table = [self._table[0]]
-            for name in cel_order:
-                new_table.append([name] + cels[name])
-
-            return new_table
-
-    def get_categories(self):
-        """Get categories of factors.
-
-        If factor 'sick' had possibilites Y/N, and 'sex' M/F, the
-        categories would be Y, N, M and F.
-        """
-        return self._categories.keys()
-
-    def get_factors(self, categories):
-        factors = set()
-        for c in categories:
-            factors.add(self._categories[c])
-
-        return factors
-
-    def get_category_variable(self, category):
-        # abit brute-force, but does the job until optimization is
-        # necessary
-        factor = self._categories[category]
-        variable = []
-        for column in zip(*self.get_phenotype_table()):
-            if column[0] == factor:
-                for entry in column[1:]:
-                    if entry == category:
-                        variable.append(1)
-                    else:
-                        variable.append(0)
-
-        return variable
-
-
diff --git a/workflows/pca_workflow.py b/workflows/pca_workflow.py
deleted file mode 100644
index 50f9458..0000000
--- a/workflows/pca_workflow.py
+++ /dev/null
@@ -1,262 +0,0 @@
-import gtk
-import system.workflow as wf
-from system.workflow import Stage, Function
-import pickle
-from scipy import log2,transpose,dot,divide,shape,mean,resize,zeros
-from scipy.linalg import svd,inv,norm,get_blas_funcs,eig
-from system import dataset, logger, plots
-
-
-class PCAWorkflow(wf.Workflow):
-    name = 'PCA Workflow'
-    ident = 'pca'
-    description = 'PCA workflow. Uses real microarray data from a study of diabetes (Mootha et al.).'
-
-    def __init__(self, app):
-        wf.Workflow.__init__(self, app)
-        #self.add_project(app.project)
-        #logger.log('notice','Current project added to: %s' %self.name)
-
-        load = Stage('load', 'Load Data')
-        load.add_function(LoadMoothaData())
-        self.add_stage(load)
-
-        preproc = Stage('preprocess', 'Preprocessing')
-        preproc.add_function(Log2Function())
-        self.add_stage(preproc)
-
-        annot = Stage('annot', 'Affy annotations')
-        annot.add_function(LoadAnnotationsFunction())
-        self.add_stage(annot)
-
-        model = Stage('model', 'Model')
-        model.add_function(PCAFunction(self))
-        self.add_stage(model)
-
-        logger.log('debug', '\tPCA\'s workflow is now active')
-
-class LoadAnnotationsFunction(Function):
-
-    def __init__(self):
-        Function.__init__(self, 'load', 'Load Annotations')
-
-    def load_affy_file(self, filename):
-        f = open(filename)
-        logger.log('notice', 'Loading annotation file: %s' % filename)
-        self.file = f
-
-    def on_response(self, dialog, response):
-        if response == gtk.RESPONSE_OK:
-            logger.log('notice', 'Reading file: %s' % dialog.get_filename())
-            self.load_affy_file(dialog.get_filename())
-
-    def run(self,data):
-        btns = ('Open', gtk.RESPONSE_OK, \
-                'Cancel', gtk.RESPONSE_CANCEL)
-        dialog = gtk.FileChooserDialog('Open Affy Annotation File',
-                                       buttons=btns)
-        dialog.connect('response', self.on_response)
-        dialog.run()
-        dialog.destroy()
-
-        ### Reading and parsing here
-        annot = read_affy_annot(self.file)
-        i_want = 'Pathway'
-        nothing = '---'
-        ids_in_data = set(data.names('genes')) #assuming we have genes
-        sanity_check = set(annot.keys())
-        if not ids_in_data.intersection(sanity_check) == ids_in_data:
-            logger.log('debug','Some identifers in data does not exist in affy file!')
-        for affy_id,description in annot:
-            if affy_id in ids_in_data:
-                pathways = description[i_want]
-                if not pathways[0][0]=='--':
-                    pass
-        D = []
-        return [D]
-
-class PCAFunction(Function):
-
-    def __init__(self,workflow):
-        Function.__init__(self, 'pca', 'PCA')
-        self.output = None
-        self.workflow = workflow
-
-    def run(self,data):
-        logger.log('debug', 'datatype: %s' % type(data))
-        if not isinstance(data,dataset.Dataset):
-            return None
-        #logger.log('debug', 'dimensions: %s' % data.dims)
-
-        ## calculations
-        T,P,E,tsq = self.pca(data._array,5,tsq_loads=False)
-        comp_def = ('comp',('1','2','3','4','5'))
-        singel_def = ('1',('s'))
-
-        # pull out input identifiers:
-        row_ids = data.get_identifiers('genes')
-        col_ids = data.get_identifiers('samples')
-
-        T = dataset.Dataset(T,[('samples',col_ids) ,comp_def],name='T2')
-        P = dataset.Dataset(P,[('genes',row_ids),comp_def],name='P')
-        E = dataset.Dataset(E,[('samples',col_ids),('genes',row_ids)],name='E')
-        #tsq = dataset.Dataset(tsq,[singel_def,data_ids[1])
-
-        ## plots
-        loading_plot1 = plots.ScatterPlot(P,P,'genes','comp','1','2')
-        loading_plot2 = plots.ScatterPlot(P,P,'genes','comp','3','4')
-        score_plot = plots.ScatterPlot(T,T,'samples','comp','1','2')
-
-        return [T,P,E,loading_plot1,loading_plot2,score_plot]
-
-    def pca(self,X,a_opt,cent=True,scale='loads',tsq_loads=False):
-        """Principal component analysis
-
-        input:
-            Xc -- matrix, data
-            a_opt -- scalar, max number of comp. to calculate
-            cent -- bool, centering [True]
-            crit -- string, pc criteria ['exp_var',['ief','rpv','average']]
-            scale -- string, scaling ['loads',['scores']]
-            tsq_loads -- bool, calculate t-squared? [True]
-            reg -- float, covariance regularizer for tsq calculations [0.2]
-        output:
-            T,P,E,r
-
-        """
-        nSamples,nVarX = shape(X)
-        if cent:
-            Xc = self.mat_center(X)
-        else:
-            Xc = X
-        u,s,vh = self.esvd(Xc)
-        if scale=='scores':
-            T = u*s
-            T = T[:,:a_opt]
-            P = transpose(vh)
-            P = P[:,:a_opt]
-        elif scale=='loads':
-            T = u
-            T = T[:,:a_opt]
-            P = transpose(vh)*s
-            P = P[:,:a_opt]
-
-        E = Xc - dot(T,transpose(P))
-        varEach = s**2
-        totVar = sum(varEach)
-        r = divide(varEach,totVar)*100
-        return T,P,E,r
-
-    def mat_center(self,X,axis=0,ret_mn=False):
-        """Mean center matrix along axis.
-
-        input:
-            X -- matrix, data
-            axis -- dim,
-            ret_mn -- bool, return mean
-        output:
-            Xc, [mnX]
-
-        NB: axis = 1 is column-centering, axis=0=row-centering
-        default is row centering (axis=0)
-        """
-        try:
-            rows,cols = shape(X)
-        except ValueError:
-            print "The X data needs to be two-dimensional"
-
-        if axis==0:
-            mnX = mean(X,axis)
-            Xs = X - resize(mnX,(rows,cols))
-
-        elif axis==1:
-            mnX = mean(X,axis)
-            Xs = transpose(transpose(X) - resize(mnX,(cols,rows)))
-        if ret_mn:
-            return Xs,mnX
-        else:
-            return Xs
-
-    def esvd(self,data,economy=1):
-        """SVD with the option of economy sized calculation
-        Calculate subspaces of X'X or XX' depending on the shape
-        of the matrix.
-        Good for extreme fat or thin matrices.
-
-        """
-        mm = self.mm
-        m,n = shape(data)
-        if m>=n:
-            u,s,v = svd(mm(data,data,trans_a=1))
-            u = mm(data,v,trans_b=1)
-            for i in xrange(n):
-                s[i] = norm(u[:,i])
-                u[:,i] = u[:,i]/s[i]
-        else:
-            u,s,v = svd(mm(data,data,trans_b=1))
-            v = mm(u,data,trans_a=1)
-            for i in xrange(m):
-                s[i] = norm(v[i,:])
-                v[i,:] = v[i,:]/s[i]
-        return u,s,v
-
-    def mm(self,a,b, alpha=1.0, beta=0.0, c=None, trans_a=0,
-           trans_b=0):
-        """Fast matrix multiplication
-
-        Return alpha*(a*b) + beta*c.
-
-        a,b,c : matrices
-        alpha, beta: scalars
-        trans_a : 0 (a not transposed),
-                  1 (a transposed),
-                  2 (a conjugate transposed)
-        trans_b : 0 (b not transposed),
-                  1 (b transposed),
-                  2 (b conjugate transposed)
-        """
-        if c:
-            gemm,= get_blas_funcs(('gemm',),(a,b,c))
-        else:
-            gemm,= get_blas_funcs(('gemm',),(a,b))
-
-        return gemm(alpha, a, b, beta, c, trans_a, trans_b)
-
-
-class LoadMoothaData(Function):
-    def __init__(self):
-        Function.__init__(self, 'load', 'Load diabetes data')
-
-    def run(self,data):
-        data_file = open('full_data.pickle','r')
-        data = pickle.load(data_file)
-        data_file.close()
-        sample_file = open('sample_labels.pickle','r')
-        sample_names = pickle.load(sample_file)
-        sample_file.close()
-        typecode='f'
-        nSamps = len(sample_names)
-        nVars = len(data.keys())
-        gene_ids = []
-        x = zeros((nSamps,nVars),typecode)
-        for i,(id,desc) in enumerate(data.items()):
-            gene_ids.append(id)
-            x[:,i] = desc[0].astype(typecode)
-        gene_def = ('genes',gene_ids)
-        sample_def = ('samples', sample_names)
-        X = dataset.Dataset(x,identifiers=[sample_def,gene_def]) # samples x genes
-        return [X]
-
-class Log2Function(Function):
-    def __init__(self):
-        Function.__init__(self, 'log', 'Log2')
-
-    def run(self,data):
-        x = log2(data._array)
-        #pull out identifiers
-        ids = []
-        for dim in data.get_dim_names():
-            ids.append((dim,data.get_identifiers(dim)))
-        return [dataset.Dataset(x,identifiers=ids,name='Log2_X')]
-PCAWorkflow.name = 'PCA Workflow'
diff --git a/workflows/test_workflow.py b/workflows/test_workflow.py
index 8026b49..882a91a 100644
--- a/workflows/test_workflow.py
+++ b/workflows/test_workflow.py
@@ -1,5 +1,5 @@
 import gtk
-from system import dataset, logger, plots, workflow
+from fluents import dataset, logger, plots, workflow
 #import geneontology
 #import gostat
 from scipy import array, randn, log, ones
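
Note: with pca_workflow.py gone, the tree no longer carries its pure-Python PCA
helpers (pca/mat_center/esvd). For readers who want the technique the deleted
code implemented -- an economy-size SVD that decomposes the smaller of X'X and
XX' -- a minimal standalone sketch follows. It is an illustration, not this
project's API: it assumes modern NumPy in place of the 2006-era scipy calls,
keeps the names esvd/pca only as labels for the deleted routines, and assumes
the centered matrix has full rank so no singular value is zero.

    import numpy as np

    def esvd(X):
        # Economy-size SVD: decompose the smaller Gram matrix, then recover
        # the other side's singular vectors, as the deleted esvd() did.
        m, n = X.shape
        if m >= n:
            # thin matrix: rows of vt are eigenvectors of X'X (n x n)
            _, _, vt = np.linalg.svd(X.T @ X)
            u = X @ vt.T                       # unnormalized left vectors
            s = np.linalg.norm(u, axis=0)      # their norms are the singular values
            u = u / s
        else:
            # fat matrix: columns of u are eigenvectors of XX' (m x m)
            u, _, _ = np.linalg.svd(X @ X.T)
            vt = u.T @ X                       # unnormalized right vectors (rows)
            s = np.linalg.norm(vt, axis=1)
            vt = vt / s[:, None]
        return u, s, vt

    def pca(X, a_opt, scale='loads'):
        # Column-center, factor, and keep a_opt components; 'loads' puts the
        # singular values on P, 'scores' puts them on T, matching the old code.
        Xc = X - X.mean(axis=0)
        u, s, vt = esvd(Xc)
        if scale == 'scores':
            T, P = (u * s)[:, :a_opt], vt.T[:, :a_opt]
        else:
            T, P = u[:, :a_opt], (vt.T * s)[:, :a_opt]
        E = Xc - T @ P.T                       # residual after a_opt components
        r = 100.0 * s**2 / np.sum(s**2)        # percent variance per component
        return T, P, E, r

For example, T, P, E, r = pca(np.random.randn(20, 300), 5) exercises the
fat-matrix branch: the 20x20 Gram matrix is decomposed instead of the 300x300
one, which is the point of the shape test for wide expression matrices.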