diff --git a/fluent b/fluent
index 4e8f237..509a02c 100755
--- a/fluent
+++ b/fluent
@@ -30,7 +30,7 @@ class FluentApp:
 
         # Application variables
         self.project = None
         self.current_data = None
-
+        gtk.glade.set_custom_handler(self.custom_object_factory)
         self.widget_tree = gtk.glade.XML(GLADEFILENAME, 'appwindow')
         self.workflow = workflow.EmptyWorkflow(self)
diff --git a/system/dataset.py b/system/dataset.py
index 71aa479..d60e198 100644
--- a/system/dataset.py
+++ b/system/dataset.py
@@ -1,136 +1,225 @@
 import logger
-from scipy import array,take,asarray,shape,nonzero
-import project
-from itertools import izip
+from scipy import atleast_2d,asarray,ArrayType
 
 class Dataset:
-    """Dataset base class.
-
+    """The Dataset base class.
+
     A Dataset is an n-way array with defined string identifiers across
     all dimensions.
+
+    Example of use:
+
+    ---
+    dim_name_rows = 'rows'
+    names_rows = ('row_a','row_b')
+    ids_1 = [dim_name_rows, names_rows]
+
+    dim_name_cols = 'cols'
+    names_cols = ('col_a','col_b','col_c','col_d')
+    ids_2 = [dim_name_cols, names_cols]
+
+    Array_X = rand(2,4)
+    data = Dataset(Array_X,(ids_1,ids_2),name="Testing")
+
+    dim_names = [dim for dim in data]
+
+    column_identifiers = [id for id in data['cols'].keys()]
+    column_index = [index for index in data['cols'].values()]
+
+    'cols' in data -> True
+
+    ---
+
+    data = Dataset(rand(10,20))  # generates dims and ids (no links)
     """
-    def __init__(self, input_array, def_list, name="Unnamed data"):
-        self._name = name
-        self._data = asarray(input_array)
-        dims = shape(self._data)
-        self.def_list = def_list
-        self._ids_set = set()
-        self.ids={}
-        self._dim_num = {}
-        self._dim_names = []
-        if len(dims)==1: # a vector is defined to be column vector!
-            self.dims = (dims[0],1)
-        else:
-            self.dims = dims
-        if len(def_list)!=len(self.dims):
-            raise ValueError,"array dims and identifyer mismatch"
-        for axis,(dim_name,ids) in enumerate(def_list):
-            enum_ids = {}
-            #if dim_name not in project.c_p.dim_names:
-            #    dim_name = project.c_p.suggest_dim_name(dim_name)
-            if not ids:
-                logger.log('debug','Creating identifiers along: '+ str(dim_name))
-                ids = self._create_identifiers(axis)
-            for num,name in enumerate(ids):
-                enum_ids[name] = num
-            self.ids[dim_name] = enum_ids
-            self._ids_set = self._ids_set.union(set(ids))
-            self._dim_num[dim_name] = axis
-            self._dim_names.append(dim_name)
-
-        for (dimname, ids), d in izip(def_list,self.dims): #check that data and labels match
-            if len(self.ids[dimname]) != d:
-                raise ValueError,"dim size and identifyer mismatch"
+    def __init__(self,array=None,identifiers=None,shape=None,all_dims=[],**kwds):
+        self._name = kwds.get("name","Unnamed data")
+        self._dims = [] #existing dimensions in this dataset
+        self._map = {} # internal mapping for dataset: identifier <--> index
+        self.has_array = False
+        self.shape = None
+
+        if array==None:
+            if shape == None:
+                raise ValueError, "Must define shape if array is None"
+            else:
+                self.shape = shape
+                if identifiers!=None:
+                    self._set_identifiers(identifiers,all_dims)
+                else:
+                    ids = self._create_identifiers(shape,all_dims)
+                    self._set_identifiers(ids,all_dims)
+        elif isinstance(array,ArrayType):
+            array = atleast_2d(asarray(array))
+            self.shape = array.shape
+            if shape != None:
+                if self.shape!=shape:
+                    #logger.log("debug","Dataset and input shape mismatch")
+                    raise ValueError
+            if identifiers!=None:
+                self._set_identifiers(identifiers,all_dims)
+            else:
+                ids = self._create_identifiers(self.shape,all_dims)
+                self._set_identifiers(ids,all_dims)
 
-    def get_name(self):
+            self._array = array
+            self.has_array = True
+
+        else:
+            raise ValueError, "array input must be of ArrayType or None"
+
+        self._all_dims = all_dims
+
+    def __str__(self):
         return self._name
 
-    def get_dim_names(self):
-        return self._dim_names
-
-    def names(self,axis=0):
-        """Returns identifier names of a dimension.
-        NB: sorted by values!
-        OK? necessary?"""
+    def __iter__(self):
+        """Returns an iterator over dimensions of dataset."""
+        return self._dims.__iter__()
+
+    def __contains__(self,dim):
+        """Returns True if dim is a dimension name in dataset."""
+        # return self._dims.__contains__(dim)
+        return self._map.__contains__(dim)
+
+    def __len__(self):
+        """Returns the number of dimensions in the dataset."""
+        return len(self._map)
+
+    def __getitem__(self,dim):
+        """Returns the identifiers along the dimension dim."""
+        return self._map[dim]
+
+    def _create_identifiers(self,shape,all_dims):
+        """Creates dimension names and identifier names, and returns
+        identifiers."""
-        if type(axis)==int:
-            dim_name = self._dim_names[axis]
-        elif type(axis)==str:
-            dim_name = axis
-            if dim_name not in self._dim_names:
-                raise ValueError, dim_name + " not a dimension in dataset"
-        items = self.ids[dim_name].items()
-        backitems=[ [v[1],v[0]] for v in items]
-        backitems.sort()
-        sorted_ids=[ backitems[i][1] for i in range(0,len(backitems))]
-        return sorted_ids
-
-    def extract_data(self,ids,dim_name):
-        """Extracts data along a dimension by identifiers"""
-        new_def_list = self.def_list[:]
-        ids_index = [self.ids[dim_name][id_name] for id_name in ids]
-        dim_number = self._dim_num[dim_name]
-        try:
-            out_data = take(self._data,ids_index,axis=dim_number)
-        except:
-            raise ValueError
-        new_def_list[dim_number][1] = ids
-        extracted_data = Dataset(out_data,def_list=new_def_list,parents=self.parents)
-        return extracted_data
-
-    def _create_identifiers(self,axis):
-        """Creates identifiers along an axis"""
-        n_dim = self.dims[axis]
-        return [str(axis) + '_' + str(i) for i in range(n_dim)]
-
-    def extract_id_from_index(self,dim_name,index):
-        """Returns a set of ids from array/list of indexes."""
-        dim_ids = self.ids[dim_name]
-        if type(index)==int:
-            index = [index]
-
-        return set([id for id,ind in dim_ids.items() if ind in index])
-
-    def extract_index_from_id(self,dim_name,id):
-        """Returns an array of indexes from a set/list of identifiers
-        (or a single id)"""
-        dim_ids = self.ids[dim_name]
-        return array([ind for name,ind in dim_ids.items() if name in id])
-
-
-class CategoryDataset(Dataset):
-    def __init__(self,array,def_list):
-        Dataset.__init__(self,array,def_list)
-
-    def get_elements_by_category(self,dim,category):
-        """Returns all elements along input dim belonging to category.
-        Assumes a two-dim category data only!
-        """
-        if type(category)!=list:
-            raise ValueError, "category must be list"
-        gene_ids = []
-        axis_dim = self._dim_num[dim]
-        cat_index = self.extract_index_from_id(category)
-        for ind in cat_index:
-            if axis_dim==0:
-                gene_indx = nonzero(self._data[:,ind])
-            elif axis_dim==1:
-                gene_indx = nonzero(self._data[ind,:])
+        dim_names = ['rows','cols']
+        ids = []
+        for axis,n in enumerate(shape):
+            if axis<2:
+                dim_suggestion = dim_names[axis]
             else:
-                ValueError, "Only support for 2-dim data"
-            gene_ids.append(self.extract_id_from_index(dim,gene_index))
-        return gene_ids
-
-
-
-
+                dim_suggestion = 'dim'
+            while dim_suggestion in all_dims:
+                dim_suggestion = self._suggest_dim_name(dim_suggestion,all_dims)
+            identifier_creation = [str(axis) + "_" + i for i in map(str,range(n))]
+            ids.append((dim_suggestion,identifier_creation))
+            all_dims.append(dim_suggestion)
+        return ids
+
+    def _set_identifiers(self,identifiers,all_dims):
+        """Creates internal mapping of identifiers structure."""
+        for dim,ids in identifiers:
+            pos_map={}
+            if dim not in self._dims:
+                self._dims.append(dim)
+                all_dims.append(dim)
+            else:
+                raise ValueError, "Dimension names must be unique"
-
-
+            for pos,id in enumerate(ids):
+                pos_map[id] = pos
+            self._map[dim] = pos_map
+
+        shape_chk = [len(i) for j,i in identifiers]
+        if shape_chk != list(self.shape):
+            raise ValueError, "Shape input: %s and array: %s mismatch" % (shape_chk,self.shape)
+
+    def _suggest_dim_name(self,dim_name,all_dims):
+        """Suggests a unique name for dim and returns it"""
+        c = 0
+        while dim_name in all_dims:
+            dim_name = dim_name + "_" + str(c)
+            c+=1
+        return dim_name
+
+    def asarray(self):
+        """Returns the numeric array (data) of the dataset"""
+        if not self.has_array:
+            raise ValueError, "Dataset is empty"
+        else:
+            return self._array
+
+    def add_array(self,array):
+        """Adds array as an ArrayType object.
+        A one-dim array is transformed to a two-dim array (row-vector).
+        """
+
+        if self.has_array:
+            raise ValueError, "Dataset has array"
+        else:
+            if (len(self._map)!=len(array.shape)):
+                raise ValueError, "Number of array dimensions does not match number of dataset dimensions"
+            if self.shape!=array.shape:
+                raise ValueError, "Input array shape must match dataset shape"
+            self._array = atleast_2d(asarray(array))
+            self.has_array = True
+
+    def get_name(self):
+        return self._name
+
+    def get_all_dims(self):
+        return self._all_dims
+
+    def get_identifiers(self):
+        #return [n for n in self._map.iteritems()]
+
+        # ensure correct order along each dimension (dict order is
+        # arbitrary, so sort identifiers by their mapped index)
+        ids = []
+        for dim in self._dims:
+            items = self._map[dim].items()
+            items.sort(lambda a,b: cmp(a[1],b[1]))
+            ids.append((dim,[id for id,index in items]))
+        return ids
+
+class CategoryDataset(Dataset):
+    """The category dataset class.
+
+    A dataset for representing class information as binary
+    (0/1) matrices.
+
+    There is support for a less memory-demanding representation
+    with fast intersection look-ups, where the binary matrix is
+    stored as a dictionary in each dimension.
+    """
+    def __init__(self):
+        Dataset.__init__(self)
+        self.has_collection = False
+
+    def as_array(self):
+        """Returns data as binary matrix"""
+        if not self.has_array and self.has_collection:
+            #build numeric array
+            pass
+
+    def as_collection(self,dim):
+        """Returns data as collection along dim"""
+        pass
+
+    def add_collection(self,input_dict):
+        """Adds category data as a collection.
+
+        A collection is a data structure that contains a dictionary for
+        each pair of dimensions in the dataset, keyed by identifiers,
+        where each value is a set of identifiers in the other dimension.
+        """
+        #build category data as double dicts
+        pass
+
+
+class GraphDataset(Dataset):
+    """The graph dataset class.
+
+    A dataset class for representing graphs using an adjacency matrix
+    (i.e. restricted to square, symmetric, signed integer matrices).
+
+    If the NetworkX library is installed, there is support for
+    representing the graph as a NetworkX.Graph or NetworkX.XGraph structure.
+    """
+    def __init__(self):
+        Dataset.__init__(self)
+        self.has_graph = False
+
 class Selection:
     """Handles selected identifiers along each dimension of a dataset"""
     def __init__(self):
diff --git a/system/dialogs.py b/system/dialogs.py
index 03c6fd9..e9e07c6 100644
--- a/system/dialogs.py
+++ b/system/dialogs.py
@@ -58,7 +58,7 @@ class CreateProjectDruid(gtk.Window):
 
         for dir in wf_path:
             for fn in os.listdir(dir):
-                if fn.endswith('.py'):
+                if fn.endswith('.py') and ('#' not in fn):
                     wf_files.append(fn[:-3])
 
         # Try to load each file and look for Workflow derived classes
@@ -68,6 +68,7 @@ class CreateProjectDruid(gtk.Window):
             for wf in wf_info:
                 store.insert_after(None, (getattr(wf, 'name'), wf))
         except Exception, e:
+            wf_info = self.workflow_classes(fn)
             logger.log('warning', 'Cannot load workflow: %s' % fn)
 
         return store
diff --git a/system/plots.py b/system/plots.py
index ee79a71..d3cae3b 100644
--- a/system/plots.py
+++ b/system/plots.py
@@ -224,25 +224,27 @@ class SinePlot(Plot):
 
 
 class ScatterPlot(Plot):
-    def __init__(self,project):
+    def __init__(self,project,dataset,id_dim,sel_dim,id_1,id_2):
         Plot.__init__(self,project)
+        self.project = project
         fig = Figure(figsize=(5,4), dpi=72)
         self.ax = ax = fig.add_subplot(111)
-
+        self.current_dim = id_dim
         # testing testing
-        self.x_dataset = project.datasets[0]
-        x = self.x_dataset._data
-        self.xaxis_data = xaxis_data = x[:,0] + scipy.randn(scipy.shape(x)[0])
-        self.yaxis_data = yaxis_data = x[:,1]
-        self.current_dim = self.x_dataset._dim_names[0]
-        ax.plot(xaxis_data,yaxis_data,'og')
+        self.dataset = dataset
+        x_index = dataset[sel_dim][id_1]
+        y_index = dataset[sel_dim][id_2]
+
+        self.xaxis_data = dataset._array[:,x_index]
+        self.yaxis_data = dataset._array[:,y_index]
+        ax.plot(self.xaxis_data,self.yaxis_data,'og')
         ###
         self.canvas = FigureCanvas(fig)
         self.add(self.canvas)
         rectprops = dict(facecolor='gray', edgecolor = 'black',
                          alpha=0.2, fill=True) #cool
-        sel = RectangleSelector(ax, self.rectangle_select_callback,
+        self.sel = RectangleSelector(ax, self.rectangle_select_callback,
                                 drawtype='box',useblit=True,rectprops=rectprops)
 
         self.canvas.show()
@@ -251,45 +253,36 @@ class ScatterPlot(Plot):
         'event1 and event2 are the press and release events'
         x1, y1 = event1.xdata, event1.ydata
         x2, y2 = event2.xdata, event2.ydata
-        logger.log('debug', "(%3.2f, %3.2f) --> (%3.2f, %3.2f)"%(x1,y1,x2,y2))
-        logger.log('debug',"The button you used were:%s, %s "%(event1.button, event2.button))
-
         # get all points within x1, y1, x2, y2
         ydata = self.yaxis_data
         xdata = self.xaxis_data
         if x1>x2:
-            logger.log('debug','Selection x_start bigger than x_end')
             if y1<y2:
                 index = scipy.nonzero((xdata<x1) & (xdata>x2) & (ydata>y1) & (ydata<y2))
             else:
                 index = scipy.nonzero((xdata<x1) & (xdata>x2) & (ydata<y1) & (ydata>y2))
         else:
-            logger.log('debug','Selection x_start less than x_end')
+            #logger.log('debug','Selection x_start less than x_end')
             if y1<y2:
                 index = scipy.nonzero((xdata>x1) & (xdata<x2) & (ydata>y1) & (ydata<y2))
            else:
                 index = scipy.nonzero((xdata>x1) & (xdata<x2) & (ydata<y1) & (ydata>y2))
-
         if len(index)==0:
             logger.log('debug','No points selected!')
         else:
-            logger.log('debug','Selected:\n%s'%index)
-            ids = self.x_dataset.extract_id_from_index('samples',index)
-            #update selection object
+            ids = [id for id,ind in self.dataset[self.current_dim].items() if ind in index]
             self.project.set_selection(self.current_dim,ids)
-            logger.log('debug','Selected identifiers:\n%s'%ids)
 
     def update(self,project,key):
         curr_sel = project.get_selection() # get selection object
         ids = curr_sel[self.current_dim] # current identifiers
-        index = self.x_dataset.extract_index_from_id(self.current_dim,ids) #conversion to index
+
+        index = [ind for id,ind in self.dataset[self.current_dim].items() if id in ids] #conversion to index
         xdata_new = scipy.take(self.xaxis_data,index) #take data
         ydata_new = scipy.take(self.yaxis_data,index)
-        self.ax.plot(self.xaxis_data,self.yaxis_data,'ob')
+        self.ax.plot(self.xaxis_data,self.yaxis_data,'og')
         self.ax.plot(xdata_new,ydata_new,'or')
         self.canvas.draw()
diff --git a/system/project.py b/system/project.py
index db859fe..99d8e10 100644
--- a/system/project.py
+++ b/system/project.py
@@ -13,8 +13,8 @@ class Project:
         self.name = name
         self.dim_names = []
         self._observers = {}
-        self.current_data=None
-        self.datasets=[]
+        self.current_data = None
+        self.datasets = []
         self.sel_obj = dataset.Selection()
 
     def attach(self,observer,key):
@@ -69,15 +69,11 @@ class Project:
     def add_dataset(self,dataset):
         """Appends a new Dataset to the project."""
         self.datasets.append(dataset)
-        for dim_name in dataset.ids.keys():
+        for dim_name in dataset.get_all_dims():
             if dim_name not in self.dim_names:
                 self.dim_names.append(dim_name)
+                self.sel_obj.current_selection[dim_name] = set()
 
-    def suggest_dim_name(self,dim_name):
-        """Creates an arbitrary unique name for a new dimension."""
-        while dim_name in self.dim_names:
-            dim_name = dim_name + "_t"
-        return dim_name
 
     def object_at(self, path):
         "Returns the object at a given path in the tree."
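
[Editor's note -- not part of the patch. A minimal usage sketch of the
reworked Dataset API above, under the same assumptions the docstring
example makes (module importable as `dataset`, scipy's `rand` available);
the identifier tuples here are illustrative only:

    from scipy import rand
    import dataset

    ids = [('samples', ('s_a', 's_b')), ('genes', ('g_a', 'g_b', 'g_c'))]
    data = dataset.Dataset(rand(2, 3), identifiers=ids, name='demo')

    'samples' in data        # -> True
    [dim for dim in data]    # -> ['samples', 'genes']
    data['genes']['g_c']     # -> 2 (identifier -> positional index)
    data.asarray().shape     # -> (2, 3)

Each dimension name maps to an {identifier: index} dict, which is what
ScatterPlot and PCAFunction below rely on when looking up rows and
columns by identifier.]
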
diff --git a/system/workflow.py b/system/workflow.py
index 822f55c..4c9e0b8 100644
--- a/system/workflow.py
+++ b/system/workflow.py
@@ -30,6 +30,9 @@ class Workflow:
             print '  %s' % fun.name
 
     def add_project(self,project):
+        if project == None:
+            logger.log('notice','Project is empty')
+        logger.log('notice','Project added in: %s' % self.name)
         self.project = project
 
 class EmptyWorkflow(Workflow):
diff --git a/test/system/datasettest.py b/test/system/datasettest.py
index ff1503c..4d2032a 100644
--- a/test/system/datasettest.py
+++ b/test/system/datasettest.py
@@ -5,28 +5,30 @@ from dataset import *
 from scipy import rand,shape
 
 class DatasetTest(unittest.TestCase):
+
     def setUp(self):
-        self.dim_0_ids = ['sample_a','sample_b']
-        self.dim_1_ids = ['gene_a','gene_b','gene_c']
-        self.dim_labels = ['samples','genes']
-        self.def_list = [[self.dim_labels[0],self.dim_0_ids],[self.dim_labels[1],self.dim_1_ids]]
+        dim_0_ids = ('sample_a','sample_b')
+        dim_1_ids = ('gene_a','gene_b','gene_c')
+        dim_labels = ('samples','genes')
+        identifiers = [(dim_labels[0],dim_0_ids),(dim_labels[1],dim_1_ids)]
         self.array = rand(2,3)
-        self.testdata = Dataset(self.array,self.def_list)
+        self.testdata = Dataset(self.array,identifiers)
 
     def testCreation(self):
-        assert self.testdata._data == self.array
-        assert 'sample_a' in self.testdata.ids['samples'].keys()
-        assert 'gene_b' in self.testdata.ids['genes'].keys()
-
+        data = self.testdata
+        assert (data._array == self.array).all()
+        assert 'sample_a' in data['samples'].keys()
+        assert data['samples']['sample_b']==1
+        assert 'gene_c' in data['genes'].keys()
+        assert data['genes']['gene_c']==2
 
-
-    def testExtraction(self):
-        ids = ['gene_a','gene_b']
-        dim_name = 'genes'
-        subset = self.testdata.extract_data(ids,dim_name)
-        assert shape(subset._data) == (2,2)
-        assert subset.ids[dim_name].keys() == ids
-        assert subset.ids[dim_name].values() == [0,1]
+    #def testExtraction(self):
+    #    ids = ['gene_a','gene_b']
+    #    dim_name = 'genes'
+    #    subset = self.testdata.extract_data(ids,dim_name)
+    #    assert shape(subset._data) == (2,2)
+    #    assert subset.ids[dim_name].keys() == ids
+    #    assert subset.ids[dim_name].values() == [0,1]
 
 if __name__ == '__main__':
diff --git a/workflows/go_workflow.py b/workflows/go_workflow.py
index d41c8f5..6d6b1d4 100644
--- a/workflows/go_workflow.py
+++ b/workflows/go_workflow.py
@@ -102,9 +102,7 @@ class TestDataFunction(Function):
     def run(self, data):
         logger.log('notice', 'Injecting foo test data')
         x = randn(20,30)
-        axis_0 = ['rows',[]]
-        axis_1 = ['cols',[]]
-        X = dataset.Dataset(x,[axis_0,axis_1])
+        X = dataset.Dataset(x)
         return [X, plots.SinePlot(None)]
diff --git a/workflows/pca_workflow.py b/workflows/pca_workflow.py
index 6de0c53..b29ebbb 100644
--- a/workflows/pca_workflow.py
+++ b/workflows/pca_workflow.py
@@ -1,23 +1,27 @@
 import gtk
 import logger
 from workflow import *
-from scipy import array,zeros
-from data import read_affy_annot,read_mootha,data_dict_to_matrix
+import pickle
+from scipy import log2,transpose,dot,divide,shape,mean,resize,zeros
+from scipy.linalg import svd,inv,norm,get_blas_funcs,eig
 import plots
 import dataset
 
 class PCAWorkflow(Workflow):
+    name = 'PCA Workflow'
+    description = 'PCA workflow. Uses real microarray data from a study of diabetes (Mootha et al.).'
+
     def __init__(self, app):
         Workflow.__init__(self, app)
-        self.name = 'PCAs Workflow'
-
+        #self.add_project(app.project)
+        #logger.log('notice','Current project added to: %s' % self.name)
+
         load = Stage('load', 'Load Data')
         load.add_function(LoadMoothaData())
         self.add_stage(load)
 
         preproc = Stage('preprocess', 'Preprocessing')
-        preproc.add_function(Function('log2', 'Logarithm'))
+        preproc.add_function(Log2Function())
         self.add_stage(preproc)
 
         annot = Stage('annot', 'Affy annotations')
@@ -25,17 +29,16 @@ class PCAWorkflow(Workflow):
         self.add_stage(annot)
 
         model = Stage('model', 'Model')
-        model.add_function(Function('pca', 'PCA'))
+        model.add_function(PCAFunction(self))
         self.add_stage(model)
-
+
         logger.log('debug', '\tPCA\'s workflow is now active')
 
 class LoadAnnotationsFunction(Function):
 
     def __init__(self):
         Function.__init__(self, 'load', 'Load Annotations')
-        self.annotations = None
-
+
     def load_affy_file(self, filename):
         f = open(filename)
         logger.log('notice', 'Loading annotation file: %s' % filename)
@@ -68,81 +71,188 @@ class LoadAnnotationsFunction(Function):
                 pathways = description[i_want]
                 if not pathways[0][0]=='--':
                     pass
-
-
-
-
-
-        return [self.annotations]
+        D = []
+        return [D]
 
 class PCAFunction(Function):
-    def __init__(self):
-        Function.__init__(self, 'X', 'a_opt')
+
+    def __init__(self,workflow):
+        Function.__init__(self, 'pca', 'PCA')
         self.output = None
-
+        self.workflow = workflow
+
     def run(self, data):
         logger.log('debug', 'datatype: %s' % type(data))
         if not isinstance(data,dataset.Dataset):
            return None
-        logger.log('debug', 'dimensions: %s' % data.dims)
+        #logger.log('debug', 'dimensions: %s' % data.dims)
 
         ## calculations
-        T,P,E,tsq = pca(data._data,a_opt=2)
-        comp_def = ['comp',['1','2']]
-        singel_def = ['1',['s']]
-        col_def = [data._dim_names[0],data.names(0)]
-        row_def = [data._dim_names[1],data.names(1)]
-        T = dataset.Dataset(T,[col_def,comp_def])
-        P = dataset.Dataset(T,[row_def,comp_def])
-        E = dataset.Dataset(E,[col_def,row_def])
-        tsq = dataset.Dataset(tsq,[row_def,sigel_def])
+        T,P,E,tsq = self.pca(data._array,5,tsq_loads=False)
+        comp_def = ('comp',('1','2','3','4','5'))
+        single_def = ('1',('s',))
+
+        # pull out input identifiers:
+        data_ids = []
+        for dim in data:
+            data_ids.append((dim,data[dim].keys()))
+
+        T = dataset.Dataset(T,(data_ids[0],comp_def))
+        P = dataset.Dataset(P,[data_ids[1],comp_def])
+        E = dataset.Dataset(E,data_ids)
+        #tsq = dataset.Dataset(tsq,[single_def,data_ids[1]])
 
         ## plots
-        loading_plot = plots.ScatterPlot()
+        loading_plot1 = plots.ScatterPlot(self.workflow.project,P,'genes','comp','1','2')
+        loading_plot2 = plots.ScatterPlot(self.workflow.project,P,'genes','comp','3','4')
+        score_plot = plots.ScatterPlot(self.workflow.project,T,'samples','comp','1','2')
+
+        return [T,P,E,loading_plot1,loading_plot2,score_plot]
+
+    def pca(self,X,a_opt,cent=True,scale='loads',tsq_loads=False):
+        """Principal component analysis
+
+        input:
+            X -- matrix, data
+            a_opt -- scalar, max number of comp. to calculate
+            cent -- bool, centering [True]
+            crit -- string, pc criteria ['exp_var',['ief','rpv','average']]
+            scale -- string, scaling ['loads',['scores']]
+            tsq_loads -- bool, calculate t-squared? [True]
+            reg -- float, covariance regularizer for tsq calculations [0.2]
+        output:
+            T,P,E,r
+        """
+        nSamples,nVarX = shape(X)
+        if cent:
+            Xc = self.mat_center(X)
+        else:
+            Xc = X
+        u,s,vh = self.esvd(Xc)
+        if scale=='scores':
+            T = u*s
+            T = T[:,:a_opt]
+            P = transpose(vh)
+            P = P[:,:a_opt]
+        elif scale=='loads':
+            T = u
+            T = T[:,:a_opt]
+            P = transpose(vh)*s
+            P = P[:,:a_opt]
+
+        E = Xc - dot(T,transpose(P))
+        varEach = s**2
+        totVar = sum(varEach)
+        r = divide(varEach,totVar)*100
+        return T,P,E,r
+
+    def mat_center(self,X,axis=0,ret_mn=False):
+        """Mean center matrix along axis.
+
+        input:
+            X -- matrix, data
+            axis -- dim to center along
+            ret_mn -- bool, return mean
+        output:
+            Xc, [mnX]
+
+        NB: axis=0 centers each column, axis=1 centers each row;
+        default is axis=0.
+        """
+        try:
+            rows,cols = shape(X)
+        except ValueError:
+            print "The X data needs to be two-dimensional"
+
+        if axis==0:
+            mnX = mean(X,axis)
+            Xs = X - resize(mnX,(rows,cols))
+
+        elif axis==1:
+            mnX = mean(X,axis)
+            Xs = transpose(transpose(X) - resize(mnX,(cols,rows)))
+        if ret_mn:
+            return Xs,mnX
+        else:
+            return Xs
+
+    def esvd(self,data,economy=1):
+        """SVD with the option of economy-sized calculation.
+
+        Calculates subspaces of X'X or XX' depending on the shape
+        of the matrix. Good for extremely fat or thin matrices.
+        """
+        mm = self.mm
+        m,n = shape(data)
+        if m>=n:
+            u,s,v = svd(mm(data,data,trans_a=1))
+            u = mm(data,v,trans_b=1)
+            for i in xrange(n):
+                s[i] = norm(u[:,i])
+                u[:,i] = u[:,i]/s[i]
+        else:
+            u,s,v = svd(mm(data,data,trans_b=1))
+            v = mm(u,data,trans_a=1)
+            for i in xrange(m):
+                s[i] = norm(v[i,:])
+                v[i,:] = v[i,:]/s[i]
+        return u,s,v
+
+    def mm(self,a,b, alpha=1.0, beta=0.0, c=None, trans_a=0,
+           trans_b=0):
+        """Fast matrix multiplication
+
+        Return alpha*(a*b) + beta*c.
+
+        a,b,c : matrices
+        alpha, beta: scalars
+        trans_a : 0 (a not transposed),
+                  1 (a transposed),
+                  2 (a conjugate transposed)
+        trans_b : 0 (b not transposed),
+                  1 (b transposed),
+                  2 (b conjugate transposed)
+        """
+        if c:
+            gemm,= get_blas_funcs(('gemm',),(a,b,c))
+        else:
+            gemm,= get_blas_funcs(('gemm',),(a,b))
+
+        return gemm(alpha, a, b, beta, c, trans_a, trans_b)
 
-        return [T,P,E,r]
 
 class LoadMoothaData(Function):
     def __init__(self):
         Function.__init__(self, 'load', 'Load diabetes data')
-        self.annotations = None
-    def load_expression_file(self, filename):
-        f = open(filename)
-        logger.log('notice', 'Loading expression file: %s' % filename)
-        self.file = f
-        self.filename = filename
-
-    def on_response(self, dialog, response):
-        if response == gtk.RESPONSE_OK:
-            logger.log('notice', 'Reading file: %s' % dialog.get_filename())
-            self.load_expression_file(dialog.get_filename())
-
-    def run(self, data):
-        btns = ('Open', gtk.RESPONSE_OK, \
-                'Cancel', gtk.RESPONSE_CANCEL)
-        dialog = gtk.FileChooserDialog('Open diabetes expression File',
-                                       buttons=btns)
-        dialog.connect('response', self.on_response)
-        dialog.run()
-        dialog.destroy()
-
-        ### Reading and parsing here
-        d,sample_names = read_mootha()
-        n_samps = len(sample_names)
-        n_genes = len(d.keys())
-        typecode = 'f'
-        x = zeros((n_samps,n_genes),typecode)
+
+    def run(self,data):
+        data_file = open('full_data.pickle','r')
+        data = pickle.load(data_file)
+        data_file.close()
+        sample_file = open('sample_labels.pickle','r')
+        sample_names = pickle.load(sample_file)
+        sample_file.close()
+        typecode='f'
+        nSamps = len(sample_names)
+        nVars = len(data.keys())
         gene_ids = []
-        for i,(id,desc) in enumerate(d.items()):
+        x = zeros((nSamps,nVars),typecode)
+        for i,(id,desc) in enumerate(data.items()):
             gene_ids.append(id)
             x[:,i] = desc[0].astype(typecode)
-        gene_def = ['genes',gene_ids]
-        sample_def = ['samples', sample_names]
-        X = dataset.Dataset(x,[sample_def,gene_def]) # samples x genes
+        gene_def = ('genes',gene_ids)
+        sample_def = ('samples', sample_names)
+        X = dataset.Dataset(x,identifiers=[sample_def,gene_def]) # samples x genes
         return [X]
 
+class Log2Function(Function):
+    def __init__(self):
+        Function.__init__(self, 'log', 'Log2')
+
+    def run(self,data):
+        x = log2(data._array)
+        ids = data.get_identifiers()
+        return [dataset.Dataset(x,identifiers=ids,name='Log2_X')]
+
 PCAWorkflow.name = 'PCA Workflow'
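
[Editor's note -- not part of the patch. A small sketch of the PCA helper
added above, run standalone under the assumptions the module itself makes
(old scipy with `randn` available); the variable names are illustrative:

    from scipy import randn

    X = randn(10, 4)                # 10 samples, 4 variables
    f = PCAFunction(workflow=None)  # workflow is only needed for the plots
    T, P, E, r = f.pca(X, a_opt=2)  # scores, loadings, residuals,
                                    # percent variance per component

    # X is mean-centered, then decomposed via the economy-sized SVD in
    # esvd(); with the default scale='loads' the scores T are orthonormal
    # and the singular values are carried by the loadings P.

Note that `r` has one entry per singular value, not per retained
component, so `r[:2]` gives the variance explained by the two components
kept here.]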