From 800e7dc42eff376897dbf421efb485608d55c4b9 Mon Sep 17 00:00:00 2001 From: flatberg Date: Thu, 20 Apr 2006 15:30:29 +0000 Subject: [PATCH] New pca workflow and datset updates --- fluent | 4 +- system/dataset.py | 20 +++++++-- system/workflow.py | 3 ++ workflows/pca_workflow.py | 89 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 111 insertions(+), 5 deletions(-) create mode 100644 workflows/pca_workflow.py diff --git a/fluent b/fluent index c62c5fd..1753c2f 100755 --- a/fluent +++ b/fluent @@ -19,6 +19,7 @@ import logger import plots import navigator import go_workflow +import pca_workflow import scipy PROGRAM_NAME = 'fluent' @@ -45,7 +46,8 @@ class FluentApp: self.current_data = None gtk.glade.set_custom_handler(self.custom_object_factory) self.widget_tree = gtk.glade.XML(GLADEFILENAME, 'appwindow') - self.workflow = go_workflow.EinarsWorkflow(self) + self.workflow = pca_workflow.PCAWorkflow(self) + self.workflow.add_project(self.project) def custom_object_factory(self, glade, function_name, widget_name,\ str1, str2, int1, int2): diff --git a/system/dataset.py b/system/dataset.py index e194935..d7a7f9f 100644 --- a/system/dataset.py +++ b/system/dataset.py @@ -12,12 +12,16 @@ class Dataset: """ def __init__(self,input_array,def_list): self._data = asarray(input_array) - self.dims = shape(self._data) + dims = shape(self._data) self.def_list = def_list self._ids_set = set() self.ids={} self._dim_num = {} self._dim_names = [] + if len(dims)==1: # a vector is defined to be column vector! 
+ self.dims = (dims[0],1) + else: + self.dims = dims if len(def_list)!=len(self.dims): raise ValueError,"array dims and identifyer mismatch" for axis,(dim_name,ids) in enumerate(def_list): @@ -25,7 +29,7 @@ class Dataset: #if dim_name not in project.c_p.dim_names: # dim_name = project.c_p.suggest_dim_name(dim_name) if not ids: - logger.log('debug','Creating identifiers along: '+dim_name) + logger.log('debug','Creating identifiers along: '+ str(dim_name)) ids = self._create_identifiers(axis) for num,name in enumerate(ids): enum_ids[name] = num @@ -40,13 +44,21 @@ class Dataset: raise ValueError,"dim size and identifyer mismatch" def names(self,axis=0): - """Returns identifier names of a dimension. NB: not in any order! """ + """Returns identifier names of a dimension. + NB: sorted by values! + OK? necessary?""" if type(axis)==int: dim_name = self._dim_names[axis] elif type(axis)==str: dim_name = axis - return self.ids[dim_name].keys() + if dim_name not in self._dim_names: + raise ValueError, dim_name + " not a dimension in dataset" + items = self.ids[dim_name].items() + backitems=[ [v[1],v[0]] for v in items] + backitems.sort() + sorted_ids=[ backitems[i][1] for i in range(0,len(backitems))] + return sorted_ids def extract_data(self,ids,dim_name): """Extracts data along a dimension by identifiers""" diff --git a/system/workflow.py b/system/workflow.py index f74d097..8d15b21 100644 --- a/system/workflow.py +++ b/system/workflow.py @@ -27,6 +27,9 @@ class Workflow: for fun in stage.functions: print ' %s' % fun.name + def add_project(self,project): + self.project = project + class Stage: """A stage is a part of the data analysis process. 
diff --git a/workflows/pca_workflow.py b/workflows/pca_workflow.py
new file mode 100644
index 0000000..d074e9c
--- /dev/null
+++ b/workflows/pca_workflow.py
@@ -0,0 +1,126 @@
+import gtk
+import logger
+from workflow import *
+from scipy import array
+from data import read_affy_annot
+import plots
+
+class PCAWorkflow(Workflow):
+    """Workflow with load, preprocess, annotation and model stages."""
+
+    def __init__(self, app):
+        """Register the workflow's stages and their functions."""
+        Workflow.__init__(self, app)
+        self.name = 'PCAs Workflow'
+
+        load = Stage('load', 'Load Data')
+        load.add_function(Function('load_mootha', 'Load Microarrays'))
+        self.add_stage(load)
+
+        preproc = Stage('preprocess', 'Preprocessing')
+        preproc.add_function(Function('log2', 'Logarithm'))
+        self.add_stage(preproc)
+
+        annot = Stage('annot', 'Affy annotations')
+        annot.add_function(LoadAnnotationsFunction())
+        self.add_stage(annot)
+
+        # NOTE(review): the model stage registers a generic Function, not
+        # the PCAFunction defined below -- confirm this is intended.
+        model = Stage('model', 'Model')
+        model.add_function(Function('pca', 'PCA'))
+        self.add_stage(model)
+
+        logger.log('debug', '\tPCA\'s workflow is now active')
+
+class LoadAnnotationsFunction(Function):
+    """Function that asks for an annotation file and reads it."""
+
+    def __init__(self):
+        """Set up the function with no annotations and no file yet."""
+        Function.__init__(self, 'load', 'Load Annotations')
+        self.annotations = None
+        # File object opened by load_affy_file; None until one is chosen.
+        self.file = None
+
+    def load_affy_file(self, filename):
+        """Open filename and keep the file object in self.file."""
+        f = open(filename)
+        logger.log('notice', 'Loading annotation file: %s' % filename)
+        self.file = f
+
+    def on_response(self, dialog, response):
+        """Handle the dialog response: open the chosen file on OK."""
+        if response == gtk.RESPONSE_OK:
+            logger.log('notice', 'Reading file: %s' % dialog.get_filename())
+            self.load_affy_file(dialog.get_filename())
+
+    def run(self, data):
+        """Show a file chooser and read the selected annotation file.
+
+        Returns a one-element list holding the result of
+        read_affy_annot, or None when no file was chosen.
+        """
+        # Drop any file left over from an earlier run so a cancelled
+        # dialog cannot silently reuse it.
+        self.file = None
+
+        btns = ('Open', gtk.RESPONSE_OK, \
+                'Cancel', gtk.RESPONSE_CANCEL)
+        dialog = gtk.FileChooserDialog('Open Affy Annotation File',
+                                       buttons=btns)
+        dialog.connect('response', self.on_response)
+        dialog.run()
+        dialog.destroy()
+
+        if self.file is None:
+            # The dialog was cancelled or closed: nothing to read.
+            return None
+
+        ### Reading and parsing here
+        # NOTE(review): self.file is left open; close it here if
+        # read_affy_annot consumes the file eagerly -- confirm.
+        self.annotations = read_affy_annot(self.file)
+        return [self.annotations]
+
+class PCAFunction(Function):
+    """Function that wraps the results of pca() in Dataset objects."""
+
+    def __init__(self):
+        """Set up the function with no output yet."""
+        Function.__init__(self, 'X', 'a_opt')
+        self.output = None
+
+    def run(self, data):
+        """Run pca with two components on a Dataset.
+
+        Returns [T, P, E, tsq], each wrapped in a Dataset, or None when
+        data is not a Dataset.
+        """
+        # NOTE(review): neither `dataset` nor `pca` is imported by name
+        # here; presumably both arrive via `from workflow import *` --
+        # confirm.
+        logger.log('debug', 'datatype: %s' % type(data))
+        if not isinstance(data,dataset.Dataset):
+            return None
+        # Wrap dims in a 1-tuple: Dataset stores it as a tuple, which
+        # %-formatting would otherwise unpack as several arguments.
+        logger.log('debug', 'dimensions: %s' % (data.dims,))
+
+        ## calculations
+        T,P,E,tsq = pca(data._data,a_opt=2)
+        comp_def = ['comp',['1','2']]
+        singel_def = ['1',['s']]
+        col_def = [data._dim_names[0],data.names(0)]
+        row_def = [data._dim_names[1],data.names(1)]
+        T = dataset.Dataset(T,[col_def,comp_def])
+        P = dataset.Dataset(P,[row_def,comp_def])
+        E = dataset.Dataset(E,[col_def,row_def])
+        tsq = dataset.Dataset(tsq,[row_def,singel_def])
+
+        ## plots
+        loading_plot = plots.ScatterPlot()
+
+
+        return [T,P,E,tsq]
+