Projects/laydi

Complete rewrite of dataset class, with (all) the necessary updates

Arnar Flatberg 2006-04-24 09:53:07 +00:00
parent 53d0228074
commit a2e4392a72
9 changed files with 426 additions and 234 deletions

fluent

@@ -30,7 +30,7 @@ class FluentApp:
         # Application variables
         self.project = None
         self.current_data = None
         gtk.glade.set_custom_handler(self.custom_object_factory)
         self.widget_tree = gtk.glade.XML(GLADEFILENAME, 'appwindow')
         self.workflow = workflow.EmptyWorkflow(self)

(file name not shown)

@@ -1,136 +1,225 @@
 import logger
-from scipy import array,take,asarray,shape,nonzero
-import project
-from itertools import izip
+from scipy import atleast_2d,asarray,ArrayType
 
 class Dataset:
-    """Dataset base class.
+    """The Dataset base class.
 
     A Dataset is an n-way array with defined string identifiers across
     all dimensions.
+
+    example of use:
+    ---
+    dim_name_rows = 'rows'
+    names_rows = ('row_a','row_b')
+    ids_1 = [dim_name_rows, names_rows]
+
+    dim_name_cols = 'cols'
+    names_cols = ('col_a','col_b','col_c','col_d')
+    ids_2 = [dim_name_cols, names_cols]
+
+    Array_X = rand(2,4)
+    data = Dataset(Array_X,(ids_1,ids_2),name="Testing")
+    dim_names = [dim for dim in data]
+    column_identifiers = [id for id in data['cols'].keys()]
+    column_index = [index for index in data['cols'].values()]
+    'cols' in data -> True
+    ---
+    data = Dataset(rand(10,20)) (generates dims and ids (no links))
     """
-    def __init__(self, input_array, def_list, name="Unnamed data"):
-        self._name = name
-        self._data = asarray(input_array)
-        dims = shape(self._data)
-        self.def_list = def_list
-        self._ids_set = set()
-        self.ids={}
-        self._dim_num = {}
-        self._dim_names = []
-        if len(dims)==1: # a vector is defined to be column vector!
-            self.dims = (dims[0],1)
-        else:
-            self.dims = dims
-        if len(def_list)!=len(self.dims):
-            raise ValueError,"array dims and identifyer mismatch"
-        for axis,(dim_name,ids) in enumerate(def_list):
-            enum_ids = {}
-            #if dim_name not in project.c_p.dim_names:
-            #    dim_name = project.c_p.suggest_dim_name(dim_name)
-            if not ids:
-                logger.log('debug','Creating identifiers along: '+ str(dim_name))
-                ids = self._create_identifiers(axis)
-            for num,name in enumerate(ids):
-                enum_ids[name] = num
-            self.ids[dim_name] = enum_ids
-            self._ids_set = self._ids_set.union(set(ids))
-            self._dim_num[dim_name] = axis
-            self._dim_names.append(dim_name)
-        for (dimname, ids), d in izip(def_list,self.dims): #check that data and labels match
-            if len(self.ids[dimname]) != d:
-                raise ValueError,"dim size and identifyer mismatch"
+    def __init__(self,array=None,identifiers=None,shape=None,all_dims=[],**kwds):
+        self._name = kwds.get("name","Unnamed data")
+        self._dims = [] # existing dimensions in this dataset
+        self._map = {} # internal mapping for dataset: identifier <--> index
+        self.has_array = False
+        self.shape = None
+
+        if array==None:
+            if shape == None:
+                raise ValueError, "Must define shape if array is None"
+            else:
+                self.shape = shape
+                if identifiers!=None:
+                    self._set_identifiers(identifiers,all_dims)
+                else:
+                    ids = self._create_identifiers(shape,all_dims)
+                    self._set_identifiers(ids,all_dims)
+        elif isinstance(array,ArrayType):
+            array = atleast_2d(asarray(array))
+            self.shape = array.shape
+            if shape != None:
+                if self.shape!=shape:
+                    #logger.log("debug","Dataset and input shape mismatch")
+                    raise ValueError
+            if identifiers!=None:
+                self._set_identifiers(identifiers,all_dims)
+            else:
+                ids = self._create_identifiers(self.shape,all_dims)
+                self._set_identifiers(ids,all_dims)
+            # check that data and labels match
+            self._array = array
+            self.has_array = True
+        else:
+            raise ValueError, "array input must be of ArrayType or None"
+        self._all_dims = all_dims
 
-    def get_name(self):
+    def __str__(self):
         return self._name
 
-    def get_dim_names(self):
-        return self._dim_names
+    def __iter__(self):
+        """Returns an iterator over dimensions of dataset."""
+        return self._dims.__iter__()
 
-    def names(self,axis=0):
-        """Returns identifier names of a dimension.
-        NB: sorted by values!
-        OK? necessary?"""
-        if type(axis)==int:
-            dim_name = self._dim_names[axis]
-        elif type(axis)==str:
-            dim_name = axis
-            if dim_name not in self._dim_names:
-                raise ValueError, dim_name + " not a dimension in dataset"
-        items = self.ids[dim_name].items()
-        backitems = [ [v[1],v[0]] for v in items]
-        backitems.sort()
-        sorted_ids = [ backitems[i][1] for i in range(0,len(backitems))]
-        return sorted_ids
+    def __contains__(self,dim):
+        """Returns True if dim is a dimension name in dataset."""
+        # return self._dims.__contains__(dim)
+        return self._map.__contains__(dim)
 
-    def extract_data(self,ids,dim_name):
-        """Extracts data along a dimension by identifiers"""
-        new_def_list = self.def_list[:]
-        ids_index = [self.ids[dim_name][id_name] for id_name in ids]
-        dim_number = self._dim_num[dim_name]
-        try:
-            out_data = take(self._data,ids_index,axis=dim_number)
-        except:
-            raise ValueError
-        new_def_list[dim_number][1] = ids
-        extracted_data = Dataset(out_data,def_list=new_def_list,parents=self.parents)
-        return extracted_data
+    def __len__(self):
+        """Returns the number of dimensions in the dataset"""
+        return len(self._map)
 
-    def _create_identifiers(self,axis):
-        """Creates identifiers along an axis"""
-        n_dim = self.dims[axis]
-        return [str(axis) + '_' + str(i) for i in range(n_dim)]
+    def __getitem__(self,dim):
+        """Returns the identifiers along the dimension dim."""
+        return self._map[dim]
 
-    def extract_id_from_index(self,dim_name,index):
-        """Returns a set of ids from array/list of indexes."""
-        dim_ids = self.ids[dim_name]
-        if type(index)==int:
-            index = [index]
-        return set([id for id,ind in dim_ids.items() if ind in index])
-
-    def extract_index_from_id(self,dim_name,id):
-        """Returns an array of indexes from a set/list of identifiers
-        (or a single id)"""
-        dim_ids = self.ids[dim_name]
-        return array([ind for name,ind in dim_ids.items() if name in id])
+    def _create_identifiers(self,shape,all_dims):
+        """Creates dimension names and identifier names, and returns
+        the identifiers."""
+        dim_names = ['rows','cols']
+        ids = []
+        for axis,n in enumerate(shape):
+            if axis<2:
+                dim_suggestion = dim_names[axis]
+            else:
+                dim_suggestion = 'dim'
+            while dim_suggestion in all_dims:
+                dim_suggestion = self._suggest_dim_name(dim_suggestion,all_dims)
+            identifier_creation = [str(axis) + "_" + i for i in map(str,range(n))]
+            ids.append((dim_suggestion,identifier_creation))
+            all_dims.append(dim_suggestion)
+        return ids
+
+    def _set_identifiers(self,identifiers,all_dims):
+        """Creates the internal mapping of the identifiers structure."""
+        for dim,ids in identifiers:
+            pos_map = {}
+            if dim not in self._dims:
+                self._dims.append(dim)
+                all_dims.append(dim)
+            else:
+                raise ValueError, "Dimension names must be unique"
+            for pos,id in enumerate(ids):
+                pos_map[id] = pos
+            self._map[dim] = pos_map
+        shape_chk = [len(i) for j,i in identifiers]
+        if shape_chk != list(self.shape):
+            raise ValueError, "Shape input: %s and array: %s mismatch" % (shape_chk,self.shape)
+
+    def _suggest_dim_name(self,dim_name,all_dims):
+        """Suggests a unique name for dim and returns it"""
+        c = 0
+        while dim_name in all_dims:
+            dim_name = dim_name + "_" + str(c)
+            c += 1
+        return dim_name
+
+    def asarray(self):
+        """Returns the numeric array (data) of dataset"""
+        if not self.has_array:
+            raise ValueError, "Dataset is empty"
+        else:
+            return self._array
+
+    def add_array(self,array):
+        """Adds array as an ArrayType object.
+        A one-dim array is transformed to a two-dim array (row-vector).
+        """
+        if self.has_array:
+            raise ValueError, "Dataset has array"
+        else:
+            if len(self._map)!=len(array.shape):
+                raise ValueError, "range(array_dims) and range(dim_names) mismatch"
+            if self.shape!=array.shape:
+                raise ValueError, "Input array must be of similar dimensions as dataset"
+            self._array = atleast_2d(asarray(array))
+            self.has_array = True
+
+    def get_name(self):
+        return self._name
+
+    def get_all_dims(self):
+        return self._all_dims
+
+    def get_identifiers(self):
+        # ensure correct order of dimensions
+        ids = []
+        for dim in self._dims:
+            ids.append((dim,self._map[dim].keys()))
+        return ids
 
 class CategoryDataset(Dataset):
-    def __init__(self,array,def_list):
-        Dataset.__init__(self,array,def_list)
-
-    def get_elements_by_category(self,dim,category):
-        """Returns all elements along input dim belonging to category.
-        Assumes a two-dim category data only!
-        """
-        if type(category)!=list:
-            raise ValueError, "category must be list"
-        gene_ids = []
-        axis_dim = self._dim_num[dim]
-        cat_index = self.extract_index_from_id(category)
-        for ind in cat_index:
-            if axis_dim==0:
-                gene_indx = nonzero(self._data[:,ind])
-            elif axis_dim==1:
-                gene_indx = nonzero(self._data[ind,:])
-            else:
-                raise ValueError, "Only support for 2-dim data"
-            gene_ids.append(self.extract_id_from_index(dim,gene_indx))
-        return gene_ids
+    """The category dataset class.
+
+    A dataset for representing class information as binary
+    (0/1) matrices. There is support for a less memory-demanding
+    representation with fast intersection look-ups, storing the binary
+    matrix as a dictionary in each dimension.
+    """
+    def __init__(self):
+        Dataset.__init__(self)
+        self.has_collection = False
+
+    def as_array(self):
+        """Returns data as binary matrix"""
+        if not self.has_array and self.has_collection:
+            # build numeric array
+            pass
+
+    def as_collection(self,dim):
+        """Returns data as collection along dim"""
+        pass
+
+    def add_collection(self,input_dict):
+        """Adds category data as a collection.
+        A collection is a datastructure that contains a dictionary for
+        each pair of dimensions in the dataset, keyed by identifiers,
+        where each value is the set of identifiers in the other dimension.
+        """
+        # build category data as double dicts
+        pass
+
+class GraphDataset(Dataset):
+    """The graph dataset class.
+
+    A dataset class for representing graphs using an adjacency matrix
+    (i.e. restricted to square, symmetric, signed integer matrices).
+    If the library NetworkX is installed, there is support for
+    representing the graph as a NetworkX.Graph or NetworkX.XGraph structure.
+    """
+    def __init__(self):
+        Dataset.__init__(self)
+        self.has_graph = False
 
 class Selection:
     """Handles selected identifiers along each dimension of a dataset"""
     def __init__(self):
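The docstring example above, spelled out as a runnable sketch against the new mapping API (assuming the module imports as dataset and using scipy's rand, as in the updated unit tests):

    from scipy import rand
    import dataset

    ids_rows = ('rows', ('row_a','row_b'))
    ids_cols = ('cols', ('col_a','col_b','col_c','col_d'))
    data = dataset.Dataset(rand(2,4), identifiers=[ids_rows,ids_cols], name="Testing")

    assert 'cols' in data                              # __contains__
    assert [dim for dim in data] == ['rows','cols']    # __iter__ over dimension names
    assert data['cols']['col_c'] == 2                  # identifier -> index via __getitem__

    # With no identifiers given, dims and ids are generated ('rows','cols','0_0',...).
    # Pass a fresh all_dims list: the mutable [] default is shared between calls,
    # so repeated construction without it falls through to _suggest_dim_name.
    auto = dataset.Dataset(rand(10,20), all_dims=[])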

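CategoryDataset.add_collection is still a stub in this commit; one plausible shape for the double-dict "collection" its docstring describes (purely illustrative, not committed code):

    # One dict per dimension; each identifier maps to the set of identifiers
    # it intersects with in the other dimension of the 0/1 matrix.
    collection = {
        'samples': {'sample_a': set(['gene_a','gene_c']),
                    'sample_b': set(['gene_b'])},
        'genes':   {'gene_a': set(['sample_a']),
                    'gene_b': set(['sample_b']),
                    'gene_c': set(['sample_a'])},
    }
    assert 'gene_a' in collection['samples']['sample_a']   # fast intersection look-up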
(file name not shown)

@@ -58,7 +58,7 @@ class CreateProjectDruid(gtk.Window):
         for dir in wf_path:
             for fn in os.listdir(dir):
-                if fn.endswith('.py'):
+                if fn.endswith('.py') and ('#' not in fn):
                     wf_files.append(fn[:-3])
 
         # Try to load each file and look for Workflow derived classes
@@ -68,6 +68,7 @@ class CreateProjectDruid(gtk.Window):
             for wf in wf_info:
                 store.insert_after(None, (getattr(wf, 'name'), wf))
         except Exception, e:
+            wf_info = self.workflow_classes(fn)
             logger.log('warning', 'Cannot load workflow: %s' % fn)
 
         return store

(file name not shown)

@@ -224,25 +224,27 @@ class SinePlot(Plot):
 class ScatterPlot(Plot):
-    def __init__(self,project):
+    def __init__(self,project,dataset,id_dim,sel_dim,id_1,id_2):
         Plot.__init__(self,project)
+        self.project = project
         fig = Figure(figsize=(5,4), dpi=72)
         self.ax = ax = fig.add_subplot(111)
+        self.current_dim = id_dim
         # testing testing
-        self.x_dataset = project.datasets[0]
-        x = self.x_dataset._data
-        self.xaxis_data = xaxis_data = x[:,0] + scipy.randn(scipy.shape(x)[0])
-        self.yaxis_data = yaxis_data = x[:,1]
-        self.current_dim = self.x_dataset._dim_names[0]
-        ax.plot(xaxis_data,yaxis_data,'og')
+        self.dataset = dataset
+        x_index = dataset[sel_dim][id_1]
+        y_index = dataset[sel_dim][id_2]
+        self.xaxis_data = dataset._array[:,x_index]
+        self.yaxis_data = dataset._array[:,y_index]
+        ax.plot(self.xaxis_data,self.yaxis_data,'og')
         ###
         self.canvas = FigureCanvas(fig)
         self.add(self.canvas)
         rectprops = dict(facecolor='gray', edgecolor = 'black',
                          alpha=0.2, fill=True) #cool
-        sel = RectangleSelector(ax, self.rectangle_select_callback,
-                                drawtype='box',useblit=True,rectprops=rectprops)
+        self.sel = RectangleSelector(ax, self.rectangle_select_callback,
+                                     drawtype='box',useblit=True,rectprops=rectprops)
         self.canvas.show()
@@ -251,45 +253,36 @@ class ScatterPlot(Plot):
         'event1 and event2 are the press and release events'
         x1, y1 = event1.xdata, event1.ydata
         x2, y2 = event2.xdata, event2.ydata
-        logger.log('debug', "(%3.2f, %3.2f) --> (%3.2f, %3.2f)"%(x1,y1,x2,y2))
-        logger.log('debug',"The button you used were:%s, %s "%(event1.button, event2.button))
-        # get all points within x1, y1, x2, y2
         ydata = self.yaxis_data
         xdata = self.xaxis_data
         if x1>x2:
-            logger.log('debug','Selection x_start bigger than x_end')
             if y1<y2:
-                logger.log('debug','Selection y_start less than y_end')
                 index = scipy.nonzero((xdata<x1) & (xdata>x2) & (ydata>y1) & (ydata<y2))
             else:
-                logger.log('debug','Selection y_start larger than y_end')
                 index = scipy.nonzero((xdata<x1) & (xdata>x2) & (ydata<y1) & (ydata>y2))
         else:
-            logger.log('debug','Selection x_start less than x_end')
+            #logger.log('debug','Selection x_start less than x_end')
             if y1<y2:
-                logger.log('debug','Selection y_start less than y_end')
+                #logger.log('debug','Selection y_start less than y_end')
                 index = scipy.nonzero((xdata>x1) & (xdata<x2) & (ydata>y1) & (ydata<y2))
             else:
-                logger.log('debug','Selection y_start bigger than y_end')
+                #logger.log('debug','Selection y_start bigger than y_end')
                 index = scipy.nonzero((xdata>x1) & (xdata<x2) & (ydata<y1) & (ydata>y2))
         if len(index)==0:
             logger.log('debug','No points selected!')
         else:
-            logger.log('debug','Selected:\n%s'%index)
-            ids = self.x_dataset.extract_id_from_index('samples',index)
-            #update selection object
+            ids = [id for id,ind in self.dataset[self.current_dim].items() if ind in index]
             self.project.set_selection(self.current_dim,ids)
+            logger.log('debug','Selected identifiers:\n%s'%ids)
 
     def update(self,project,key):
         curr_sel = project.get_selection() # get selection object
         ids = curr_sel[self.current_dim] # current identifiers
-        index = self.x_dataset.extract_index_from_id(self.current_dim,ids) #conversion to index
+        index = [ind for id,ind in self.dataset[self.current_dim].items() if id in ids] #conversion to index
         xdata_new = scipy.take(self.xaxis_data,index) #take data
         ydata_new = scipy.take(self.yaxis_data,index)
-        self.ax.plot(self.xaxis_data,self.yaxis_data,'ob')
+        self.ax.plot(self.xaxis_data,self.yaxis_data,'og')
         self.ax.plot(xdata_new,ydata_new,'or')
         self.canvas.draw()
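The four nonzero branches in rectangle_select_callback differ only in the drag direction of the selection box; an equivalent, direction-agnostic form (a sketch, not the committed code; names follow the method above):

    import scipy

    def select_ids(dataset, dim, xdata, ydata, x1, y1, x2, y2):
        # Normalize the corners so one test covers all four drag directions.
        xmin, xmax = min(x1, x2), max(x1, x2)
        ymin, ymax = min(y1, y2), max(y1, y2)
        index = scipy.nonzero((xdata > xmin) & (xdata < xmax) &
                              (ydata > ymin) & (ydata < ymax))
        # Map array indices back to identifiers via the dataset's dim mapping.
        return [id for id, ind in dataset[dim].items() if ind in index]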

(file name not shown)

@@ -13,8 +13,8 @@ class Project:
         self.name = name
         self.dim_names = []
         self._observers = {}
-        self.current_data=None
-        self.datasets=[]
+        self.current_data = None
+        self.datasets = []
         self.sel_obj = dataset.Selection()
 
     def attach(self,observer,key):
@@ -69,15 +69,11 @@ class Project:
     def add_dataset(self,dataset):
         """Appends a new Dataset to the project."""
         self.datasets.append(dataset)
-        for dim_name in dataset.ids.keys():
+        for dim_name in dataset.get_all_dims():
             if dim_name not in self.dim_names:
                 self.dim_names.append(dim_name)
+                self.sel_obj.current_selection[dim_name] = set()
 
-    def suggest_dim_name(self,dim_name):
-        """Creates an arbitrary unique name for a new dimension."""
-        while dim_name in self.dim_names:
-            dim_name = dim_name + "_t"
-        return dim_name
 
     def object_at(self, path):
         "Returns the object at a given path in the tree."

(file name not shown)

@@ -30,6 +30,9 @@ class Workflow:
             print ' %s' % fun.name
 
     def add_project(self,project):
+        if project == None:
+            logger.log('notice','Project is empty')
+        logger.log('notice','Project added in: %s' % self.name)
         self.project = project
 
 class EmptyWorkflow(Workflow):

(file name not shown)

@@ -5,28 +5,30 @@ from dataset import *
 from scipy import rand,shape
 
 class DatasetTest(unittest.TestCase):
     def setUp(self):
-        self.dim_0_ids = ['sample_a','sample_b']
-        self.dim_1_ids = ['gene_a','gene_b','gene_c']
-        self.dim_labels = ['samples','genes']
-        self.def_list = [[self.dim_labels[0],self.dim_0_ids],[self.dim_labels[1],self.dim_1_ids]]
+        dim_0_ids = ('sample_a','sample_b')
+        dim_1_ids = ('gene_a','gene_b','gene_c')
+        dim_labels = ('samples','genes')
+        identifiers = [(dim_labels[0],dim_0_ids),(dim_labels[1],dim_1_ids)]
         self.array = rand(2,3)
-        self.testdata = Dataset(self.array,self.def_list)
+        self.testdata = Dataset(self.array,identifiers)
 
     def testCreation(self):
-        assert self.testdata._data == self.array
-        assert 'sample_a' in self.testdata.ids['samples'].keys()
-        assert 'gene_b' in self.testdata.ids['genes'].keys()
+        data = self.testdata
+        assert data._array == self.array
+        assert 'sample_a' in data['samples'].keys()
+        assert data['samples']['sample_b']==1
+        assert 'gene_c' in data['genes'].keys()
+        assert data['genes']['gene_c']==2
 
-    def testExtraction(self):
-        ids = ['gene_a','gene_b']
-        dim_name = 'genes'
-        subset = self.testdata.extract_data(ids,dim_name)
-        assert shape(subset._data) == (2,2)
-        assert subset.ids[dim_name].keys() == ids
-        assert subset.ids[dim_name].values() == [0,1]
+    #def testExtraction(self):
+    #    ids = ['gene_a','gene_b']
+    #    dim_name = 'genes'
+    #    subset = self.testdata.extract_data(ids,dim_name)
+    #    assert shape(subset._data) == (2,2)
+    #    assert subset.ids[dim_name].keys() == ids
+    #    assert subset.ids[dim_name].values() == [0,1]
 
 if __name__ == '__main__':

(file name not shown)

@@ -102,9 +102,7 @@ class TestDataFunction(Function):
     def run(self, data):
         logger.log('notice', 'Injecting foo test data')
         x = randn(20,30)
-        axis_0 = ['rows',[]]
-        axis_1 = ['cols',[]]
-        X = dataset.Dataset(x,[axis_0,axis_1])
+        X = dataset.Dataset(x)
         return [X, plots.SinePlot(None)]

(file name not shown)

@@ -1,23 +1,27 @@
 import gtk
 import logger
 from workflow import *
-from scipy import array,zeros
-from data import read_affy_annot,read_mootha,data_dict_to_matrix
+import pickle
+from scipy import log2,transpose,dot,divide,shape,mean,resize,zeros
+from scipy.linalg import svd,inv,norm,get_blas_funcs,eig
 import plots
 import dataset
 
 class PCAWorkflow(Workflow):
+    name = 'PCA Workflow'
+    description = 'PCA workflow. Uses real microarray data from a study of diabetes (Mootha et al.).'
+
     def __init__(self, app):
         Workflow.__init__(self, app)
-        self.name = 'PCAs Workflow'
+        #self.add_project(app.project)
+        #logger.log('notice','Current project added to: %s' %self.name)
 
         load = Stage('load', 'Load Data')
         load.add_function(LoadMoothaData())
         self.add_stage(load)
 
         preproc = Stage('preprocess', 'Preprocessing')
-        preproc.add_function(Function('log2', 'Logarithm'))
+        preproc.add_function(Log2Function())
         self.add_stage(preproc)
 
         annot = Stage('annot', 'Affy annotations')
@@ -25,17 +29,16 @@ class PCAWorkflow(Workflow):
         self.add_stage(annot)
 
         model = Stage('model', 'Model')
-        model.add_function(Function('pca', 'PCA'))
+        model.add_function(PCAFunction(self))
         self.add_stage(model)
 
         logger.log('debug', '\tPCA\'s workflow is now active')
 
 class LoadAnnotationsFunction(Function):
     def __init__(self):
         Function.__init__(self, 'load', 'Load Annotations')
-        self.annotations = None
 
     def load_affy_file(self, filename):
         f = open(filename)
         logger.log('notice', 'Loading annotation file: %s' % filename)
@@ -68,81 +71,188 @@ class LoadAnnotationsFunction(Function):
             pathways = description[i_want]
             if not pathways[0][0]=='--':
                 pass
-        return [self.annotations]
+        D = []
+        return [D]
 
 class PCAFunction(Function):
-    def __init__(self):
-        Function.__init__(self, 'X', 'a_opt')
+    def __init__(self,workflow):
+        Function.__init__(self, 'pca', 'PCA')
         self.output = None
+        self.workflow = workflow
 
     def run(self, data):
         logger.log('debug', 'datatype: %s' % type(data))
         if not isinstance(data,dataset.Dataset):
             return None
-        logger.log('debug', 'dimensions: %s' % data.dims)
+        #logger.log('debug', 'dimensions: %s' % data.dims)
 
         ## calculations
-        T,P,E,tsq = pca(data._data,a_opt=2)
-        comp_def = ['comp',['1','2']]
-        singel_def = ['1',['s']]
-        col_def = [data._dim_names[0],data.names(0)]
-        row_def = [data._dim_names[1],data.names(1)]
-        T = dataset.Dataset(T,[col_def,comp_def])
-        P = dataset.Dataset(T,[row_def,comp_def])
-        E = dataset.Dataset(E,[col_def,row_def])
-        tsq = dataset.Dataset(tsq,[row_def,sigel_def])
-        ## plots
-        loading_plot = plots.ScatterPlot()
-        return [T,P,E,r]
+        T,P,E,tsq = self.pca(data._array,5,tsq_loads=False)
+        comp_def = ('comp',('1','2','3','4','5'))
+        singel_def = ('1',('s'))
+        # pull out input identifiers:
+        data_ids = []
+        for dim in data:
+            data_ids.append((dim,data[dim].keys()))
+        T = dataset.Dataset(T,(data_ids[0],comp_def))
+        P = dataset.Dataset(P,[data_ids[1],comp_def])
+        E = dataset.Dataset(E,data_ids)
+        #tsq = dataset.Dataset(tsq,[singel_def,data_ids[1])
+        ## plots
+        loading_plot1 = plots.ScatterPlot(self.workflow.project,P,'genes','comp','1','2')
+        loading_plot2 = plots.ScatterPlot(self.workflow.project,P,'genes','comp','3','4')
+        score_plot = plots.ScatterPlot(self.workflow.project,T,'samples','comp','1','2')
+        return [T,P,E,loading_plot1,loading_plot2,score_plot]
+
+    def pca(self,X,a_opt,cent=True,scale='loads',tsq_loads=False):
+        """Principal component analysis
+
+        input:
+            X -- matrix, data
+            a_opt -- scalar, max number of components to calculate
+            cent -- bool, centering [True]
+            crit -- string, pc criteria ['exp_var',['ief','rpv','average']]
+            scale -- string, scaling ['loads',['scores']]
+            tsq_loads -- bool, calculate t-squared? [True]
+            reg -- float, covariance regularizer for tsq calculations [0.2]
+        output:
+            T,P,E,r
+        """
+        nSamples,nVarX = shape(X)
+        if cent:
+            Xc = self.mat_center(X)
+        else:
+            Xc = X
+        u,s,vh = self.esvd(Xc)
+        if scale=='scores':
+            T = u*s
+            T = T[:,:a_opt]
+            P = transpose(vh)
+            P = P[:,:a_opt]
+        elif scale=='loads':
+            T = u
+            T = T[:,:a_opt]
+            P = transpose(vh)*s
+            P = P[:,:a_opt]
+        E = Xc - dot(T,transpose(P))
+        varEach = s**2
+        totVar = sum(varEach)
+        r = divide(varEach,totVar)*100
+        return T,P,E,r
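Either scaling factors the centered data the same way: scale='scores' gives T = U*S and P = V, while scale='loads' gives T = U and P = V*S; in both cases dot(T, P') reconstructs Xc when a_opt is full rank. A hypothetical smoke test (standalone construction of PCAFunction is an assumption, not part of the commit):

    from scipy import rand, dot, transpose

    f = PCAFunction(None)             # the workflow only matters for the plots
    X = rand(6,4)
    T,P,E,r = f.pca(X, a_opt=4)       # full rank, so the residual vanishes
    Xc = f.mat_center(X)
    assert abs(Xc - dot(T,transpose(P))).max() < 1e-8
    assert abs(E).max() < 1e-8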
+    def mat_center(self,X,axis=0,ret_mn=False):
+        """Mean center matrix along axis.
+
+        input:
+            X -- matrix, data
+            axis -- dim, axis to center along [0]
+            ret_mn -- bool, return mean
+        output:
+            Xc, [mnX]
+
+        NB: axis = 1 is column-centering, axis = 0 is row-centering;
+        default is row centering (axis=0)
+        """
+        try:
+            rows,cols = shape(X)
+        except ValueError:
+            print "The X data needs to be two-dimensional"
+
+        if axis==0:
+            mnX = mean(X,axis)
+            Xs = X - resize(mnX,(rows,cols))
+        elif axis==1:
+            mnX = mean(X,axis)
+            Xs = transpose(transpose(X) - resize(mnX,(cols,rows)))
+
+        if ret_mn:
+            return Xs,mnX
+        else:
+            return Xs
+
+    def esvd(self,data,economy=1):
+        """SVD with the option of economy-sized calculation.
+
+        Calculates the subspaces of X'X or XX' depending on the shape
+        of the matrix. Good for extremely fat or thin matrices.
+        """
+        mm = self.mm
+        m,n = shape(data)
+        if m>=n:
+            u,s,v = svd(mm(data,data,trans_a=1))
+            u = mm(data,v,trans_b=1)
+            for i in xrange(n):
+                s[i] = norm(u[:,i])
+                u[:,i] = u[:,i]/s[i]
+        else:
+            u,s,v = svd(mm(data,data,trans_b=1))
+            v = mm(u,data,trans_a=1)
+            for i in xrange(m):
+                s[i] = norm(v[i,:])
+                v[i,:] = v[i,:]/s[i]
+        return u,s,v
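esvd takes the SVD of the smaller Gram matrix and recovers the remaining factor by projection; re-derived in plain NumPy for reference (the committed code uses scipy's svd plus the gemm wrapper below — this sketch is illustrative only):

    import numpy as np

    def economy_svd_sketch(X):
        m, n = X.shape
        if m >= n:                                  # thin: work with the n x n X'X
            _, _, vt = np.linalg.svd(X.T.dot(X))    # right singular vectors of X
            U = X.dot(vt.T)                         # X V = U S, still unnormalized
            s = np.sqrt((U ** 2).sum(axis=0))       # column norms are the singular values
            return U / s, s, vt
        u, _, _ = np.linalg.svd(X.dot(X.T))         # fat: work with the m x m XX'
        Vt = u.T.dot(X)                             # U'X = S V'
        s = np.sqrt((Vt ** 2).sum(axis=1))
        return u, s, Vt / s[:, None]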
+    def mm(self,a,b, alpha=1.0, beta=0.0, c=None, trans_a=0,
+           trans_b=0):
+        """Fast matrix multiplication
+
+        Return alpha*(a*b) + beta*c.
+
+        a,b,c : matrices
+        alpha, beta: scalars
+        trans_a : 0 (a not transposed),
+                  1 (a transposed),
+                  2 (a conjugate transposed)
+        trans_b : 0 (b not transposed),
+                  1 (b transposed),
+                  2 (b conjugate transposed)
+        """
+        if c is not None:
+            gemm,= get_blas_funcs(('gemm',),(a,b,c))
+        else:
+            gemm,= get_blas_funcs(('gemm',),(a,b))
+        return gemm(alpha, a, b, beta, c, trans_a, trans_b)
 
 class LoadMoothaData(Function):
     def __init__(self):
         Function.__init__(self, 'load', 'Load diabetes data')
-        self.annotations = None
 
-    def load_expression_file(self, filename):
-        f = open(filename)
-        logger.log('notice', 'Loading expression file: %s' % filename)
-        self.file = f
-        self.filename = filename
-
-    def on_response(self, dialog, response):
-        if response == gtk.RESPONSE_OK:
-            logger.log('notice', 'Reading file: %s' % dialog.get_filename())
-            self.load_expression_file(dialog.get_filename())
-
-    def run(self, data):
-        btns = ('Open', gtk.RESPONSE_OK, \
-                'Cancel', gtk.RESPONSE_CANCEL)
-        dialog = gtk.FileChooserDialog('Open diabetes expression File',
-                                       buttons=btns)
-        dialog.connect('response', self.on_response)
-        dialog.run()
-        dialog.destroy()
-        ### Reading and parsing here
-        d,sample_names = read_mootha()
-        n_samps = len(sample_names)
-        n_genes = len(d.keys())
-        typecode = 'f'
-        x = zeros((n_samps,n_genes),typecode)
+    def run(self,data):
+        data_file = open('full_data.pickle','r')
+        data = pickle.load(data_file)
+        data_file.close()
+        sample_file = open('sample_labels.pickle','r')
+        sample_names = pickle.load(sample_file)
+        sample_file.close()
+        typecode = 'f'
+        nSamps = len(sample_names)
+        nVars = len(data.keys())
         gene_ids = []
-        for i,(id,desc) in enumerate(d.items()):
+        x = zeros((nSamps,nVars),typecode)
+        for i,(id,desc) in enumerate(data.items()):
             gene_ids.append(id)
             x[:,i] = desc[0].astype(typecode)
-        gene_def = ['genes',gene_ids]
-        sample_def = ['samples', sample_names]
-        X = dataset.Dataset(x,[sample_def,gene_def]) # samples x genes
+        gene_def = ('genes',gene_ids)
+        sample_def = ('samples', sample_names)
+        X = dataset.Dataset(x,identifiers=[sample_def,gene_def]) # samples x genes
         return [X]
+
+class Log2Function(Function):
+    def __init__(self):
+        Function.__init__(self, 'log', 'Log2')
+
+    def run(self,data):
+        x = log2(data._array)
+        ids = data.get_identifiers()
+        return [dataset.Dataset(x,identifiers=ids,name='Log2_X')]
 
 PCAWorkflow.name = 'PCA Workflow'
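For reference, Log2Function replaces the array while carrying the identifiers over via get_identifiers; a quick sketch of the load/preprocess chain it slots into (hypothetical driver code, assuming full_data.pickle and sample_labels.pickle exist in the working directory):

    X = LoadMoothaData().run(None)[0]       # samples x genes Dataset
    L = Log2Function().run(X)[0]            # same dims and identifiers, log2 data
    assert [dim for dim in L] == [dim for dim in X]
    assert str(L) == 'Log2_X'               # __str__ returns the dataset name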