Complete rewrite of dataset class, with (all) the necessary updates
This commit is contained in:
parent 53d0228074
commit a2e4392a72
@@ -30,7 +30,7 @@ class FluentApp:
         # Application variables
         self.project = None
         self.current_data = None

         gtk.glade.set_custom_handler(self.custom_object_factory)
         self.widget_tree = gtk.glade.XML(GLADEFILENAME, 'appwindow')
         self.workflow = workflow.EmptyWorkflow(self)

@@ -1,136 +1,225 @@
 import logger
-from scipy import array,take,asarray,shape,nonzero
-import project
-from itertools import izip
+from scipy import atleast_2d,asarray,ArrayType


-class Dataset:
-    """Dataset base class.
-
-    A Dataset is an n-way array with defined string identifiers across
-    all dimensions.
-    """
-    def __init__(self, input_array, def_list, name="Unnamed data"):
-        self._name = name
-        self._data = asarray(input_array)
-        dims = shape(self._data)
-        self.def_list = def_list
-        self._ids_set = set()
-        self.ids = {}
-        self._dim_num = {}
-        self._dim_names = []
-        if len(dims)==1: # a vector is defined to be a column vector!
-            self.dims = (dims[0],1)
-        else:
-            self.dims = dims
-        if len(def_list)!=len(self.dims):
-            raise ValueError, "array dims and identifier mismatch"
-        for axis,(dim_name,ids) in enumerate(def_list):
-            enum_ids = {}
-            #if dim_name not in project.c_p.dim_names:
-            #    dim_name = project.c_p.suggest_dim_name(dim_name)
-            if not ids:
-                logger.log('debug','Creating identifiers along: ' + str(dim_name))
-                ids = self._create_identifiers(axis)
-            for num,name in enumerate(ids):
-                enum_ids[name] = num
-            self.ids[dim_name] = enum_ids
-            self._ids_set = self._ids_set.union(set(ids))
-            self._dim_num[dim_name] = axis
-            self._dim_names.append(dim_name)
-
-        for (dimname, ids), d in izip(def_list,self.dims): # check that data and labels match
-            if len(self.ids[dimname]) != d:
-                raise ValueError, "dim size and identifier mismatch"
-
-    def get_name(self):
-        return self._name
-
-    def get_dim_names(self):
-        return self._dim_names
-
-    def names(self,axis=0):
-        """Returns identifier names of a dimension.
-        NB: sorted by values!
-        OK? necessary?"""
-        if type(axis)==int:
-            dim_name = self._dim_names[axis]
-        elif type(axis)==str:
-            dim_name = axis
-            if dim_name not in self._dim_names:
-                raise ValueError, dim_name + " not a dimension in dataset"
-        items = self.ids[dim_name].items()
-        backitems = [[v[1],v[0]] for v in items]
-        backitems.sort()
-        sorted_ids = [backitems[i][1] for i in range(0,len(backitems))]
-        return sorted_ids
-
-    def extract_data(self,ids,dim_name):
-        """Extracts data along a dimension by identifiers."""
-        new_def_list = self.def_list[:]
-        ids_index = [self.ids[dim_name][id_name] for id_name in ids]
-        dim_number = self._dim_num[dim_name]
-        try:
-            out_data = take(self._data,ids_index,axis=dim_number)
-        except:
-            raise ValueError
-        new_def_list[dim_number][1] = ids
-        extracted_data = Dataset(out_data,def_list=new_def_list,parents=self.parents)
-        return extracted_data
-
-    def _create_identifiers(self,axis):
-        """Creates identifiers along an axis."""
-        n_dim = self.dims[axis]
-        return [str(axis) + '_' + str(i) for i in range(n_dim)]
-
-    def extract_id_from_index(self,dim_name,index):
-        """Returns a set of ids from an array/list of indexes."""
-        dim_ids = self.ids[dim_name]
-        if type(index)==int:
-            index = [index]
-        return set([id for id,ind in dim_ids.items() if ind in index])
-
-    def extract_index_from_id(self,dim_name,id):
-        """Returns an array of indexes from a set/list of identifiers
-        (or a single id)."""
-        dim_ids = self.ids[dim_name]
-        return array([ind for name,ind in dim_ids.items() if name in id])
-
-
-class CategoryDataset(Dataset):
-    def __init__(self,array,def_list):
-        Dataset.__init__(self,array,def_list)
-
-    def get_elements_by_category(self,dim,category):
-        """Returns all elements along input dim belonging to category.
-        Assumes two-dim category data only!
-        """
-        if type(category)!=list:
-            raise ValueError, "category must be list"
-        gene_ids = []
-        axis_dim = self._dim_num[dim]
-        cat_index = self.extract_index_from_id(category)
-        for ind in cat_index:
-            if axis_dim==0:
-                gene_index = nonzero(self._data[:,ind])
-            elif axis_dim==1:
-                gene_index = nonzero(self._data[ind,:])
-            else:
-                raise ValueError, "Only support for 2-dim data"
-            gene_ids.append(self.extract_id_from_index(dim,gene_index))
-        return gene_ids
+class Dataset:
+    """The Dataset base class.
+
+    A Dataset is an n-way array with defined string identifiers across
+    all dimensions.
+
+    Example of use:
+
+    ---
+    dim_name_rows = 'rows'
+    names_rows = ('row_a','row_b')
+    ids_1 = [dim_name_rows, names_rows]
+
+    dim_name_cols = 'cols'
+    names_cols = ('col_a','col_b','col_c','col_d')
+    ids_2 = [dim_name_cols, names_cols]
+
+    Array_X = rand(2,4)
+    data = Dataset(Array_X,(ids_1,ids_2),name="Testing")
+
+    dim_names = [dim for dim in data]
+
+    column_identifiers = [id for id in data['cols'].keys()]
+    column_indices = [index for index in data['cols'].values()]
+
+    'cols' in data -> True
+    ---
+
+    data = Dataset(rand(10,20)) # generates dims and ids (no links)
+    """
+    def __init__(self,array=None,identifiers=None,shape=None,all_dims=[],**kwds):
+        self._name = kwds.get("name","Unnamed data")
+        self._dims = [] # existing dimensions in this dataset
+        self._map = {} # internal mapping for dataset: identifier <--> index
+        self.has_array = False
+        self.shape = None
+
+        if array==None:
+            if shape==None:
+                raise ValueError, "Must define shape if array is None"
+            self.shape = shape
+            if identifiers!=None:
+                self._set_identifiers(identifiers,all_dims)
+            else:
+                ids = self._create_identifiers(shape,all_dims)
+                self._set_identifiers(ids,all_dims)
+        elif isinstance(array,ArrayType):
+            array = atleast_2d(asarray(array))
+            self.shape = array.shape
+            if shape!=None:
+                if self.shape!=shape:
+                    #logger.log("debug","Dataset and input shape mismatch")
+                    raise ValueError
+            if identifiers!=None:
+                self._set_identifiers(identifiers,all_dims)
+            else:
+                ids = self._create_identifiers(self.shape,all_dims)
+                self._set_identifiers(ids,all_dims)
+            self._array = array
+            self.has_array = True
+        else:
+            raise ValueError, "array input must be of ArrayType or None"
+
+        self._all_dims = all_dims
+
+    def __str__(self):
+        return self._name
+
+    def __iter__(self):
+        """Returns an iterator over dimensions of dataset."""
+        return self._dims.__iter__()
+
+    def __contains__(self,dim):
+        """Returns True if dim is a dimension name in dataset."""
+        return self._map.__contains__(dim)
+
+    def __len__(self):
+        """Returns the number of dimensions in the dataset."""
+        return len(self._map)
+
+    def __getitem__(self,dim):
+        """Returns the identifiers along the dimension dim."""
+        return self._map[dim]
+
+    def _create_identifiers(self,shape,all_dims):
+        """Creates dimension names and identifier names, and returns
+        identifiers."""
+        dim_names = ['rows','cols']
+        ids = []
+        for axis,n in enumerate(shape):
+            if axis<2:
+                dim_suggestion = dim_names[axis]
+            else:
+                dim_suggestion = 'dim'
+            while dim_suggestion in all_dims:
+                dim_suggestion = self._suggest_dim_name(dim_suggestion,all_dims)
+            identifier_creation = [str(axis) + "_" + i for i in map(str,range(n))]
+            ids.append((dim_suggestion,identifier_creation))
+            all_dims.append(dim_suggestion)
+        return ids
+
+    def _set_identifiers(self,identifiers,all_dims):
+        """Creates the internal mapping of the identifiers structure."""
+        for dim,ids in identifiers:
+            pos_map = {}
+            if dim not in self._dims:
+                self._dims.append(dim)
+                all_dims.append(dim)
+            else:
+                raise ValueError, "Dimension names must be unique"
+            for pos,id in enumerate(ids):
+                pos_map[id] = pos
+            self._map[dim] = pos_map
+        shape_chk = [len(i) for j,i in identifiers]
+        if shape_chk != list(self.shape):
+            raise ValueError, "Shape input: %s and array: %s mismatch" % (shape_chk,self.shape)
+
+    def _suggest_dim_name(self,dim_name,all_dims):
+        """Suggests a unique name for dim and returns it."""
+        c = 0
+        while dim_name in all_dims:
+            dim_name = dim_name + "_" + str(c)
+            c += 1
+        return dim_name
+
+    def asarray(self):
+        """Returns the numeric array (data) of dataset."""
+        if not self.has_array:
+            raise ValueError, "Dataset is empty"
+        else:
+            return self._array
+
+    def add_array(self,array):
+        """Adds array as an ArrayType object.
+        A one-dim array is transformed to a two-dim array (row-vector).
+        """
+        if self.has_array:
+            raise ValueError, "Dataset already has an array"
+        else:
+            if len(self._map)!=len(array.shape):
+                raise ValueError, "range(array_dims) and range(dim_names) mismatch"
+            if self.shape!=array.shape:
+                raise ValueError, "Input array must be of similar dimensions as dataset"
+            self._array = atleast_2d(asarray(array))
+            self.has_array = True
+
+    def get_name(self):
+        return self._name
+
+    def get_all_dims(self):
+        return self._all_dims
+
+    def get_identifiers(self):
+        #return [n for n in self._map.iteritems()]
+        # ensure correct order
+        ids = []
+        for dim in self._dims:
+            ids.append((dim,self._map[dim].keys()))
+        return ids
+
+
+class CategoryDataset(Dataset):
+    """The category dataset class.
+
+    A dataset for representing class information as binary
+    matrices (0/1-matrices).
+
+    There is support for less memory-demanding and faster
+    intersection look-ups by representing the binary matrix as a
+    dictionary in each dimension.
+    """
+    def __init__(self):
+        Dataset.__init__(self)
+        self.has_collection = False
+
+    def as_array(self):
+        """Returns data as binary matrix."""
+        if not self.has_array and self.has_collection:
+            # build numeric array
+            pass
+
+    def as_collection(self,dim):
+        """Returns data as collection along dim."""
+        pass
+
+    def add_collection(self,input_dict):
+        """Adds category data as a collection.
+
+        A collection is a data structure that contains a dictionary for
+        each pair of dimensions in the dataset, keyed by identifiers, with
+        a set of identifiers in the other dimension as values.
+        """
+        # build category data as double dicts
+        pass
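
A sketch of the collection layout the add_collection docstring describes; the exact shape is an assumption, since the method is still a stub in this commit:

    # one dict per dimension pair: identifier -> set of identifiers
    # in the other dimension
    collection = {('genes','go_terms'):
                      {'gene_a': set(['GO:0001']),
                       'gene_b': set(['GO:0001','GO:0002'])},
                  ('go_terms','genes'):
                      {'GO:0001': set(['gene_a','gene_b']),
                       'GO:0002': set(['gene_b'])}}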
+
+
+class GraphDataset(Dataset):
+    """The graph dataset class.
+
+    A dataset class for representing graphs using an adjacency matrix
+    (i.e. restricted to square, symmetric, signed integer matrices).
+
+    If the NetworkX library is installed, there is support for
+    representing the graph as a NetworkX.Graph or NetworkX.XGraph structure.
+    """
+    def __init__(self):
+        Dataset.__init__(self)
+        self.has_graph = False
+
+
+class Selection:
+    """Handles selected identifiers along each dimension of a dataset."""
+    def __init__(self):
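
A usage sketch of the rewritten Dataset API, based on the docstring and methods above (rand comes from scipy; the names here are illustrative only):

    from scipy import rand
    from dataset import Dataset

    # explicit identifiers: a sequence of (dim_name, ids) pairs
    X = Dataset(rand(2,3), identifiers=[('samples',('a','b')),
                                        ('genes',('g1','g2','g3'))])
    'samples' in X        # True, via __contains__
    [dim for dim in X]    # ['samples', 'genes'], via __iter__
    X['genes']['g2']      # 1, identifier -> index via __getitem__

    # without identifiers, dims default to 'rows'/'cols' and ids to '0_0', ...
    Y = Dataset(rand(10,20))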
@@ -58,7 +58,7 @@ class CreateProjectDruid(gtk.Window):

         for dir in wf_path:
             for fn in os.listdir(dir):
-                if fn.endswith('.py'):
+                if fn.endswith('.py') and ('#' not in fn):
                     wf_files.append(fn[:-3])

         # Try to load each file and look for Workflow derived classes
@@ -68,6 +68,7 @@ class CreateProjectDruid(gtk.Window):
+                wf_info = self.workflow_classes(fn)
                 for wf in wf_info:
                     store.insert_after(None, (getattr(wf, 'name'), wf))
             except Exception, e:
                 logger.log('warning', 'Cannot load workflow: %s' % fn)

         return store

@@ -224,25 +224,27 @@ class SinePlot(Plot):


 class ScatterPlot(Plot):
-    def __init__(self,project):
+    def __init__(self,project,dataset,id_dim,sel_dim,id_1,id_2):
         Plot.__init__(self,project)
         self.project = project
         fig = Figure(figsize=(5,4), dpi=72)
         self.ax = ax = fig.add_subplot(111)

-        # testing testing
-        self.x_dataset = project.datasets[0]
-        x = self.x_dataset._data
-        self.xaxis_data = xaxis_data = x[:,0] + scipy.randn(scipy.shape(x)[0])
-        self.yaxis_data = yaxis_data = x[:,1]
-        self.current_dim = self.x_dataset._dim_names[0]
-        ax.plot(xaxis_data,yaxis_data,'og')
+        self.current_dim = id_dim
+        self.dataset = dataset
+        x_index = dataset[sel_dim][id_1]
+        y_index = dataset[sel_dim][id_2]
+
+        self.xaxis_data = dataset._array[:,x_index]
+        self.yaxis_data = dataset._array[:,y_index]
+        ax.plot(self.xaxis_data,self.yaxis_data,'og')
         ###

         self.canvas = FigureCanvas(fig)
         self.add(self.canvas)
         rectprops = dict(facecolor='gray', edgecolor='black',
                          alpha=0.2, fill=True) #cool
-        sel = RectangleSelector(ax, self.rectangle_select_callback,
+        self.sel = RectangleSelector(ax, self.rectangle_select_callback,
                                 drawtype='box',useblit=True,rectprops=rectprops)

         self.canvas.show()

@@ -251,45 +253,36 @@ class ScatterPlot(Plot):
         'event1 and event2 are the press and release events'
         x1, y1 = event1.xdata, event1.ydata
         x2, y2 = event2.xdata, event2.ydata
         logger.log('debug', "(%3.2f, %3.2f) --> (%3.2f, %3.2f)" % (x1,y1,x2,y2))
         logger.log('debug', "The buttons you used were: %s, %s" % (event1.button, event2.button))
         # get all points within x1, y1, x2, y2
         ydata = self.yaxis_data
         xdata = self.xaxis_data
         if x1>x2:
             logger.log('debug','Selection x_start bigger than x_end')
             if y1<y2:
                 logger.log('debug','Selection y_start less than y_end')
                 index = scipy.nonzero((xdata<x1) & (xdata>x2) & (ydata>y1) & (ydata<y2))
             else:
                 logger.log('debug','Selection y_start larger than y_end')
                 index = scipy.nonzero((xdata<x1) & (xdata>x2) & (ydata<y1) & (ydata>y2))
         else:
-            logger.log('debug','Selection x_start less than x_end')
+            #logger.log('debug','Selection x_start less than x_end')
             if y1<y2:
-                logger.log('debug','Selection y_start less than y_end')
+                #logger.log('debug','Selection y_start less than y_end')
                 index = scipy.nonzero((xdata>x1) & (xdata<x2) & (ydata>y1) & (ydata<y2))
             else:
-                logger.log('debug','Selection y_start bigger than y_end')
+                #logger.log('debug','Selection y_start bigger than y_end')
                 index = scipy.nonzero((xdata>x1) & (xdata<x2) & (ydata<y1) & (ydata>y2))

         if len(index)==0:
             logger.log('debug','No points selected!')
         else:
-            logger.log('debug','Selected:\n%s' % index)
-            ids = self.x_dataset.extract_id_from_index('samples',index)
+            # update selection object
+            ids = [id for id,ind in self.dataset[self.current_dim].items() if ind in index]
             self.project.set_selection(self.current_dim,ids)
+            logger.log('debug','Selected identifiers:\n%s' % ids)

     def update(self,project,key):
         curr_sel = project.get_selection() # get selection object
         ids = curr_sel[self.current_dim] # current identifiers
-        index = self.x_dataset.extract_index_from_id(self.current_dim,ids) # conversion to index
+        index = [ind for id,ind in self.dataset[self.current_dim].items() if id in ids] # conversion to index
         xdata_new = scipy.take(self.xaxis_data,index) # take data
         ydata_new = scipy.take(self.yaxis_data,index)
-        self.ax.plot(self.xaxis_data,self.yaxis_data,'ob')
+        self.ax.plot(self.xaxis_data,self.yaxis_data,'og')
         self.ax.plot(xdata_new,ydata_new,'or')
         self.canvas.draw()
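
The nested x/y comparisons above only normalize the drag direction before the nonzero test; an equivalent, shorter formulation (a sketch, not part of this commit) would be:

    x_lo, x_hi = min(x1,x2), max(x1,x2)
    y_lo, y_hi = min(y1,y2), max(y1,y2)
    index = scipy.nonzero((xdata>x_lo) & (xdata<x_hi) &
                          (ydata>y_lo) & (ydata<y_hi))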
@@ -13,8 +13,8 @@ class Project:
         self.name = name
         self.dim_names = []
         self._observers = {}
-        self.current_data=None
-        self.datasets=[]
+        self.current_data = None
+        self.datasets = []
         self.sel_obj = dataset.Selection()

     def attach(self,observer,key):
@@ -69,15 +69,11 @@ class Project:
     def add_dataset(self,dataset):
         """Appends a new Dataset to the project."""
         self.datasets.append(dataset)
-        for dim_name in dataset.ids.keys():
+        for dim_name in dataset.get_all_dims():
             if dim_name not in self.dim_names:
                 self.dim_names.append(dim_name)
                 self.sel_obj.current_selection[dim_name] = set()

-    def suggest_dim_name(self,dim_name):
-        """Creates an arbitrary unique name for a new dimension."""
-        while dim_name in self.dim_names:
-            dim_name = dim_name + "_t"
-        return dim_name
-
     def object_at(self, path):
         "Returns the object at a given path in the tree."

@@ -30,6 +30,9 @@ class Workflow:
             print '  %s' % fun.name

+    def add_project(self,project):
+        if project == None:
+            logger.log('notice','Project is empty')
+        logger.log('notice','Project added in: %s' % self.name)
+        self.project = project

 class EmptyWorkflow(Workflow):

@@ -5,28 +5,30 @@ from dataset import *
 from scipy import rand,shape

 class DatasetTest(unittest.TestCase):

     def setUp(self):
-        self.dim_0_ids = ['sample_a','sample_b']
-        self.dim_1_ids = ['gene_a','gene_b','gene_c']
-        self.dim_labels = ['samples','genes']
-        self.def_list = [[self.dim_labels[0],self.dim_0_ids],[self.dim_labels[1],self.dim_1_ids]]
+        dim_0_ids = ('sample_a','sample_b')
+        dim_1_ids = ('gene_a','gene_b','gene_c')
+        dim_labels = ('samples','genes')
+        identifiers = [(dim_labels[0],dim_0_ids),(dim_labels[1],dim_1_ids)]
         self.array = rand(2,3)
-        self.testdata = Dataset(self.array,self.def_list)
+        self.testdata = Dataset(self.array,identifiers)

     def testCreation(self):
-        assert self.testdata._data == self.array
-        assert 'sample_a' in self.testdata.ids['samples'].keys()
-        assert 'gene_b' in self.testdata.ids['genes'].keys()
+        data = self.testdata
+        assert data._array == self.array
+        assert 'sample_a' in data['samples'].keys()
+        assert data['samples']['sample_b']==1
+        assert 'gene_c' in data['genes'].keys()
+        assert data['genes']['gene_c']==2

-    def testExtraction(self):
-        ids = ['gene_a','gene_b']
-        dim_name = 'genes'
-        subset = self.testdata.extract_data(ids,dim_name)
-        assert shape(subset._data) == (2,2)
-        assert subset.ids[dim_name].keys() == ids
-        assert subset.ids[dim_name].values() == [0,1]
+    #def testExtraction(self):
+    #    ids = ['gene_a','gene_b']
+    #    dim_name = 'genes'
+    #    subset = self.testdata.extract_data(ids,dim_name)
+    #    assert shape(subset._data) == (2,2)
+    #    assert subset.ids[dim_name].keys() == ids
+    #    assert subset.ids[dim_name].values() == [0,1]

 if __name__ == '__main__':

@@ -102,9 +102,7 @@ class TestDataFunction(Function):
     def run(self, data):
         logger.log('notice', 'Injecting foo test data')
         x = randn(20,30)
-        axis_0 = ['rows',[]]
-        axis_1 = ['cols',[]]
-        X = dataset.Dataset(x,[axis_0,axis_1])
+        X = dataset.Dataset(x)
         return [X, plots.SinePlot(None)]
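
With no identifiers argument, the rewritten Dataset generates its own: the first two dimensions are named 'rows' and 'cols' and ids follow the '<axis>_<n>' pattern from _create_identifiers, so for the call above:

    X = dataset.Dataset(randn(20,30))
    [dim for dim in X]    # ['rows', 'cols']
    X['rows']['0_0']      # 0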
@@ -1,23 +1,27 @@
 import gtk
 import logger
 from workflow import *
-from scipy import array,zeros
-from data import read_affy_annot,read_mootha,data_dict_to_matrix
+import pickle
+from scipy import log2,transpose,dot,divide,shape,mean,resize,zeros
+from scipy.linalg import svd,inv,norm,get_blas_funcs,eig
 import plots
 import dataset

 class PCAWorkflow(Workflow):
     name = 'PCA Workflow'
     description = 'PCA workflow. Uses real microarray data from a study of diabetes (Mootha et al.).'

     def __init__(self, app):
         Workflow.__init__(self, app)
         self.name = 'PCAs Workflow'

         #self.add_project(app.project)
         #logger.log('notice','Current project added to: %s' % self.name)

         load = Stage('load', 'Load Data')
         load.add_function(LoadMoothaData())
         self.add_stage(load)

         preproc = Stage('preprocess', 'Preprocessing')
-        preproc.add_function(Function('log2', 'Logarithm'))
+        preproc.add_function(Log2Function())
         self.add_stage(preproc)

         annot = Stage('annot', 'Affy annotations')

@@ -25,17 +29,16 @@ class PCAWorkflow(Workflow):
         self.add_stage(annot)

         model = Stage('model', 'Model')
-        model.add_function(Function('pca', 'PCA'))
+        model.add_function(PCAFunction(self))
         self.add_stage(model)

         logger.log('debug', '\tPCA\'s workflow is now active')

 class LoadAnnotationsFunction(Function):

     def __init__(self):
         Function.__init__(self, 'load', 'Load Annotations')
         self.annotations = None

     def load_affy_file(self, filename):
         f = open(filename)
         logger.log('notice', 'Loading annotation file: %s' % filename)

@@ -68,81 +71,188 @@ class LoadAnnotationsFunction(Function):
         pathways = description[i_want]
         if not pathways[0][0]=='--':
             pass
-        return [self.annotations]
+        D = []
+        return [D]

 class PCAFunction(Function):

-    def __init__(self):
-        Function.__init__(self, 'X', 'a_opt')
+    def __init__(self,workflow):
+        Function.__init__(self, 'pca', 'PCA')
         self.output = None
+        self.workflow = workflow

     def run(self, data):
         logger.log('debug', 'datatype: %s' % type(data))
         if not isinstance(data,dataset.Dataset):
             return None
-        logger.log('debug', 'dimensions: %s' % data.dims)
+        #logger.log('debug', 'dimensions: %s' % data.dims)

         ## calculations
-        T,P,E,tsq = pca(data._data,a_opt=2)
-        comp_def = ['comp',['1','2']]
-        singel_def = ['1',['s']]
-        col_def = [data._dim_names[0],data.names(0)]
-        row_def = [data._dim_names[1],data.names(1)]
-        T = dataset.Dataset(T,[col_def,comp_def])
-        P = dataset.Dataset(P,[row_def,comp_def])
-        E = dataset.Dataset(E,[col_def,row_def])
-        tsq = dataset.Dataset(tsq,[row_def,singel_def])
-        return [T,P,E,r]
+        T,P,E,tsq = self.pca(data._array,5,tsq_loads=False)
+        comp_def = ('comp',('1','2','3','4','5'))
+        singel_def = ('1',('s'))
+
+        # pull out input identifiers:
+        data_ids = []
+        for dim in data:
+            data_ids.append((dim,data[dim].keys()))
+
+        T = dataset.Dataset(T,(data_ids[0],comp_def))
+        P = dataset.Dataset(P,[data_ids[1],comp_def])
+        E = dataset.Dataset(E,data_ids)
+        #tsq = dataset.Dataset(tsq,[singel_def,data_ids[1]])

         ## plots
-        loading_plot = plots.ScatterPlot()
+        loading_plot1 = plots.ScatterPlot(self.workflow.project,P,'genes','comp','1','2')
+        loading_plot2 = plots.ScatterPlot(self.workflow.project,P,'genes','comp','3','4')
+        score_plot = plots.ScatterPlot(self.workflow.project,T,'samples','comp','1','2')
+
+        return [T,P,E,loading_plot1,loading_plot2,score_plot]
+    def pca(self,X,a_opt,cent=True,scale='loads',tsq_loads=False):
+        """Principal component analysis.
+
+        input:
+            X -- matrix, data
+            a_opt -- scalar, max number of components to calculate
+            cent -- bool, centering [True]
+            crit -- string, pc criteria ['exp_var',['ief','rpv','average']]
+            scale -- string, scaling ['loads',['scores']]
+            tsq_loads -- bool, calculate t-squared? [True]
+            reg -- float, covariance regularizer for tsq calculations [0.2]
+        output:
+            T,P,E,r
+        """
+        nSamples,nVarX = shape(X)
+        if cent:
+            Xc = self.mat_center(X)
+        else:
+            Xc = X
+        u,s,vh = self.esvd(Xc)
+        if scale=='scores':
+            T = u*s
+            T = T[:,:a_opt]
+            P = transpose(vh)
+            P = P[:,:a_opt]
+        elif scale=='loads':
+            T = u
+            T = T[:,:a_opt]
+            P = transpose(vh)*s
+            P = P[:,:a_opt]
+
+        E = Xc - dot(T,transpose(P))
+        varEach = s**2
+        totVar = sum(varEach)
+        r = divide(varEach,totVar)*100
+        return T,P,E,r
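
A usage sketch for the pca method above; pf stands for a hypothetical PCAFunction instance. Note that run() binds the fourth return value, the explained variance r, to the name tsq:

    X = randn(20,30)                 # samples x variables
    T,P,E,r = pf.pca(X, a_opt=5)
    # T: scores (20 x 5), P: loads (30 x 5),
    # E: residual Xc - dot(T,transpose(P)),
    # r: explained variance per component, in percent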
+    def mat_center(self,X,axis=0,ret_mn=False):
+        """Mean center matrix along axis.
+
+        input:
+            X -- matrix, data
+            axis -- dim, axis to center along
+            ret_mn -- bool, return mean
+        output:
+            Xc, [mnX]
+
+        NB: axis=1 is column-centering, axis=0 is row-centering;
+        default is row-centering (axis=0).
+        """
+        try:
+            rows,cols = shape(X)
+        except ValueError:
+            print "The X data needs to be two-dimensional"
+
+        if axis==0:
+            mnX = mean(X,axis)
+            Xs = X - resize(mnX,(rows,cols))
+        elif axis==1:
+            mnX = mean(X,axis)
+            Xs = transpose(transpose(X) - resize(mnX,(cols,rows)))
+        if ret_mn:
+            return Xs,mnX
+        else:
+            return Xs
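
A small worked check of the default centering (values hypothetical): mean(X,0) averages over axis 0, i.e. down each column, so axis=0 removes the per-column means:

    X = array([[1.,3.],
               [3.,5.]])
    # mean(X,0) == [2.,4.]; mat_center(X) -> [[-1.,-1.],[1.,1.]]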
+    def esvd(self,data,economy=1):
+        """SVD with the option of economy-sized calculation.
+
+        Calculates subspaces of X'X or XX' depending on the shape
+        of the matrix. Good for extremely fat or thin matrices.
+        """
+        mm = self.mm
+        m,n = shape(data)
+        if m>=n:
+            u,s,v = svd(mm(data,data,trans_a=1))
+            u = mm(data,v,trans_b=1)
+            for i in xrange(n):
+                s[i] = norm(u[:,i])
+                u[:,i] = u[:,i]/s[i]
+        else:
+            u,s,v = svd(mm(data,data,trans_b=1))
+            v = mm(u,data,trans_a=1)
+            for i in xrange(m):
+                s[i] = norm(v[i,:])
+                v[i,:] = v[i,:]/s[i]
+        return u,s,v
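
The economy trick above: for a thin matrix (m >= n) the SVD runs on the small n x n product data'*data rather than the m x n data, and the left side is recovered afterwards. A sketch of the m >= n branch in plain scipy calls:

    # u,s,v = svd(dot(transpose(data),data))   # n x n instead of m x n
    # u = dot(data,transpose(v))               # back-project to m x n
    # normalizing each column of u recovers the singular values in s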
+    def mm(self,a,b, alpha=1.0, beta=0.0, c=None, trans_a=0, trans_b=0):
+        """Fast matrix multiplication.
+
+        Returns alpha*(a*b) + beta*c.
+
+        a,b,c : matrices
+        alpha, beta : scalars
+        trans_a : 0 (a not transposed),
+                  1 (a transposed),
+                  2 (a conjugate transposed)
+        trans_b : 0 (b not transposed),
+                  1 (b transposed),
+                  2 (b conjugate transposed)
+        """
+        if c:
+            gemm,= get_blas_funcs(('gemm',),(a,b,c))
+        else:
+            gemm,= get_blas_funcs(('gemm',),(a,b))
+
+        return gemm(alpha, a, b, beta, c, trans_a, trans_b)
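
For reference, the trans flags map directly onto BLAS gemm; assuming two-dimensional float arrays:

    # mm(a, b, trans_a=1)      ~ dot(transpose(a), b)
    # mm(a, b, beta=1.0, c=c)  ~ dot(a, b) + c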

 class LoadMoothaData(Function):
     def __init__(self):
         Function.__init__(self, 'load', 'Load diabetes data')
         self.annotations = None

-    def load_expression_file(self, filename):
-        f = open(filename)
-        logger.log('notice', 'Loading expression file: %s' % filename)
-        self.file = f
-        self.filename = filename
-
-    def on_response(self, dialog, response):
-        if response == gtk.RESPONSE_OK:
-            logger.log('notice', 'Reading file: %s' % dialog.get_filename())
-            self.load_expression_file(dialog.get_filename())
-
-    def run(self, data):
-        btns = ('Open', gtk.RESPONSE_OK, \
-                'Cancel', gtk.RESPONSE_CANCEL)
-        dialog = gtk.FileChooserDialog('Open diabetes expression File',
-                                       buttons=btns)
-        dialog.connect('response', self.on_response)
-        dialog.run()
-        dialog.destroy()
-
-        ### Reading and parsing here
-        d,sample_names = read_mootha()
-        n_samps = len(sample_names)
-        n_genes = len(d.keys())
-        typecode = 'f'
-        x = zeros((n_samps,n_genes),typecode)
-        gene_ids = []
-        for i,(id,desc) in enumerate(d.items()):
-            gene_ids.append(id)
-            x[:,i] = desc[0].astype(typecode)
-        gene_def = ['genes',gene_ids]
-        sample_def = ['samples', sample_names]
-        X = dataset.Dataset(x,[sample_def,gene_def]) # samples x genes
-        return [X]
+    def run(self,data):
+        data_file = open('full_data.pickle','r')
+        data = pickle.load(data_file)
+        data_file.close()
+        sample_file = open('sample_labels.pickle','r')
+        sample_names = pickle.load(sample_file)
+        sample_file.close()
+        typecode = 'f'
+        nSamps = len(sample_names)
+        nVars = len(data.keys())
+        gene_ids = []
+        x = zeros((nSamps,nVars),typecode)
+        for i,(id,desc) in enumerate(data.items()):
+            gene_ids.append(id)
+            x[:,i] = desc[0].astype(typecode)
+        gene_def = ('genes',gene_ids)
+        sample_def = ('samples', sample_names)
+        X = dataset.Dataset(x,identifiers=[sample_def,gene_def]) # samples x genes
+        return [X]
+
+class Log2Function(Function):
+    def __init__(self):
+        Function.__init__(self, 'log', 'Log2')
+
+    def run(self,data):
+        x = log2(data._array)
+        ids = data.get_identifiers()
+        return [dataset.Dataset(x,identifiers=ids,name='Log2_X')]

 PCAWorkflow.name = 'PCA Workflow'