laydi/workflows/pca_workflow.py

import gtk
import system.workflow as wf
from system.workflow import Stage, Function
import pickle
from scipy import log2,transpose,dot,divide,shape,mean,resize,zeros
from scipy.linalg import svd,inv,norm,get_blas_funcs,eig
from system import dataset, logger, plots


class PCAWorkflow(wf.Workflow):
    name = 'PCA Workflow'
    ident = 'pca'
    description = 'PCA workflow. Uses real microarray data from a study of diabetes (Mootha et al.).'

    def __init__(self, app):
        wf.Workflow.__init__(self, app)
        #self.add_project(app.project)
        #logger.log('notice','Current project added to: %s' %self.name)
        
        load = Stage('load', 'Load Data')
        load.add_function(LoadMoothaData())
        self.add_stage(load)

        preproc = Stage('preprocess', 'Preprocessing')
        preproc.add_function(Log2Function())
        self.add_stage(preproc)

        annot = Stage('annot', 'Affy annotations')
        annot.add_function(LoadAnnotationsFunction())
        self.add_stage(annot)

        model = Stage('model', 'Model')
        model.add_function(PCAFunction(self))
        self.add_stage(model)

        logger.log('debug', '\tPCA\'s workflow is now active')

class LoadAnnotationsFunction(Function):

    def __init__(self):
        Function.__init__(self, 'load', 'Load Annotations')
        
    def load_affy_file(self, filename):
        f = open(filename)
        logger.log('notice', 'Loading annotation file: %s' % filename)
        self.file = f
        
    def on_response(self, dialog, response):
        if response == gtk.RESPONSE_OK:
            logger.log('notice', 'Reading file: %s' % dialog.get_filename())
            self.load_affy_file(dialog.get_filename())

    def run(self,data):
        btns = ('Open', gtk.RESPONSE_OK, \
                'Cancel', gtk.RESPONSE_CANCEL)
        dialog = gtk.FileChooserDialog('Open Affy Annotation File',
                                       buttons=btns)
        dialog.connect('response', self.on_response)
        dialog.run()
        dialog.destroy()

        ### Reading and parsing here
        annot = read_affy_annot(self.file)
        i_want = 'Pathway'
        nothing = '---'
        ids_in_data = set(data.names('genes')) #assuming we have genes
        sanity_check = set(annot.keys())
        if not ids_in_data.intersection(sanity_check) == ids_in_data:
            logger.log('debug','Some identifers in data does not exist in affy file!')
        for affy_id,description in annot:
            if affy_id in ids_in_data:
                pathways = description[i_want]
                if not pathways[0][0]=='--':
                    pass
        D = []    
        return [D]

class PCAFunction(Function):

    def __init__(self,workflow):
        Function.__init__(self, 'pca', 'PCA')
        self.output = None
        self.workflow = workflow
        
    def run(self,data):
        logger.log('debug', 'datatype: %s' % type(data))
        if not isinstance(data,dataset.Dataset):
            return None
        #logger.log('debug', 'dimensions: %s' % data.dims)

        ## calculations
        T,P,E,tsq = self.pca(data._array,5,tsq_loads=False)
        comp_def = ('comp',('1','2','3','4','5'))
        singel_def = ('1',('s'))

        # pull out input identifiers:
        row_ids = data.get_identifiers('genes')
        col_ids = data.get_identifiers('samples')
        
        T = dataset.Dataset(T,[('samples',col_ids) ,comp_def],name='T2')
        P = dataset.Dataset(P,[('genes',row_ids),comp_def],name='P')
        E = dataset.Dataset(E,[('samples',col_ids),('genes',row_ids)],name='E')
        #tsq = dataset.Dataset(tsq,[singel_def,data_ids[1])
        
        ## plots
        loading_plot1 = plots.ScatterPlot(P,P,'genes','comp','1','2')
        loading_plot2 = plots.ScatterPlot(P,P,'genes','comp','3','4')
        score_plot = plots.ScatterPlot(T,T,'samples','comp','1','2')
        
        return [T,P,E,loading_plot1,loading_plot2,score_plot]

    def pca(self,X,a_opt,cent=True,scale='loads',tsq_loads=False):
        """Principal component analysis
        
        input:
        Xc -- matrix, data
        a_opt -- scalar, max number of comp. to calculate
        cent -- bool, centering [True]
        crit -- string, pc criteria ['exp_var',['ief','rpv','average']]
        scale -- string, scaling ['loads',['scores']]
        tsq_loads -- bool, calculate t-squared? [True]
        reg -- float, covariance regularizer for tsq calculations [0.2]
        output:
        T,P,E,r

        """
        nSamples,nVarX = shape(X)
        if cent:
            Xc = self.mat_center(X)
        else:
            Xc = X
        u,s,vh = self.esvd(Xc)
        if scale=='scores':
            T = u*s
            T = T[:,:a_opt]
            P = transpose(vh)
            P = P[:,:a_opt]
        elif scale=='loads':
            T = u
            T = T[:,:a_opt]
            P = transpose(vh)*s
            P = P[:,:a_opt]
            
        E = Xc - dot(T,transpose(P))
        varEach = s**2
        totVar = sum(varEach)
        r = divide(varEach,totVar)*100
        return T,P,E,r

    def mat_center(self,X,axis=0,ret_mn=False):
        """Mean center matrix along axis.
        
        input:
        X -- matrix, data
        axis -- dim,
        ret_mn -- bool, return mean
        output:
        Xc, [mnX]
        
        NB: axis = 1 is column-centering, axis=0=row-centering
        default is row centering (axis=0)
        """
        try:
            rows,cols = shape(X)
        except ValueError:
            print "The X data needs to be two-dimensional"

        if axis==0:
            mnX = mean(X,axis)
            Xs = X - resize(mnX,(rows,cols))

        elif axis==1:
            mnX = mean(X,axis)
            Xs = transpose(transpose(X) - resize(mnX,(cols,rows)))
        if ret_mn:
            return Xs,mnX
        else:
            return Xs

    def esvd(self,data,economy=1):
        """SVD with the option of economy sized calculation
        Calculate subspaces of X'X or XX' depending on the shape
        of the matrix.
        Good for extreme fat or thin matrices.
        
        """
        mm = self.mm
        m,n = shape(data)
        if m>=n:
            u,s,v = svd(mm(data,data,trans_a=1))
            u = mm(data,v,trans_b=1)
            for i in xrange(n):
                s[i] = norm(u[:,i])
                u[:,i] = u[:,i]/s[i]
        else:
            u,s,v = svd(mm(data,data,trans_b=1))
            v = mm(u,data,trans_a=1)
            for i in xrange(m):
                s[i] = norm(v[i,:])
                v[i,:] = v[i,:]/s[i]
        return u,s,v

    def mm(self,a,b, alpha=1.0, beta=0.0, c=None, trans_a=0,
           trans_b=0):
        """Fast matrix multiplication

        Return alpha*(a*b) + beta*c.
        
        a,b,c : matrices
        alpha, beta: scalars
        trans_a : 0 (a not transposed),
                  1 (a transposed),
                  2 (a conjugate transposed)
        trans_b : 0 (b not transposed),
                  1 (b transposed),
                  2 (b conjugate transposed)
                  """
        if c:
            gemm,= get_blas_funcs(('gemm',),(a,b,c))
        else:
            gemm,= get_blas_funcs(('gemm',),(a,b))

        return gemm(alpha, a, b, beta, c, trans_a, trans_b)


class LoadMoothaData(Function):
    def __init__(self):
        Function.__init__(self, 'load', 'Load diabetes data')

    def run(self,data):
        data_file = open('full_data.pickle','r')
        data = pickle.load(data_file)
        data_file.close()
        sample_file = open('sample_labels.pickle','r')                          
        sample_names = pickle.load(sample_file)
        sample_file.close()
        typecode='f'
        nSamps = len(sample_names)
        nVars = len(data.keys())
        gene_ids = []
        x = zeros((nSamps,nVars),typecode)
        for i,(id,desc) in enumerate(data.items()):
            gene_ids.append(id)
            x[:,i] = desc[0].astype(typecode)
        gene_def = ('genes',gene_ids)
        sample_def = ('samples', sample_names)
        X = dataset.Dataset(x,identifiers=[sample_def,gene_def]) # samples x genes
        return [X]

class Log2Function(Function):
    def __init__(self):
        Function.__init__(self, 'log', 'Log2')

    def run(self,data):
        x = log2(data._array)
        #pull out identifiers
        ids = []
        for dim in data.get_dim_names():
            ids.append((dim,data.get_identifiers(dim)))
        return [dataset.Dataset(x,identifiers=ids,name='Log2_X')]
PCAWorkflow.name = 'PCA Workflow'
New pca workflow and datset updates 2006-04-20 17:30:29 +02:00			`import gtk`
Made system a package. 2006-04-25 11:53:35 +02:00			`import system.workflow as wf`
			`from system.workflow import Stage, Function`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`import pickle`
			`from scipy import log2,transpose,dot,divide,shape,mean,resize,zeros`
			`from scipy.linalg import svd,inv,norm,get_blas_funcs,eig`
Made system a package. 2006-04-25 11:53:35 +02:00			`from system import dataset, logger, plots`
New pca workflow and datset updates 2006-04-20 17:30:29 +02:00
fixes 2006-08-28 14:06:05 +02:00

import * removed 2006-04-24 18:05:54 +02:00			`class PCAWorkflow(wf.Workflow):`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`name = 'PCA Workflow'`
Added command line options to create a new project based on a given workflow. 2006-04-25 14:19:25 +02:00			`ident = 'pca'`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`description = 'PCA workflow. Uses real microarray data from a study of diabetes (Mootha et al.).'`
New pca workflow and datset updates 2006-04-20 17:30:29 +02:00
			`def __init__(self, app):`
import * removed 2006-04-24 18:05:54 +02:00			`wf.Workflow.__init__(self, app)`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`#self.add_project(app.project)`
			`#logger.log('notice','Current project added to: %s' %self.name)`

New pca workflow and datset updates 2006-04-20 17:30:29 +02:00			`load = Stage('load', 'Load Data')`
current datset selection fix 2006-04-21 11:23:05 +02:00			`load.add_function(LoadMoothaData())`
New pca workflow and datset updates 2006-04-20 17:30:29 +02:00			`self.add_stage(load)`

			`preproc = Stage('preprocess', 'Preprocessing')`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`preproc.add_function(Log2Function())`
New pca workflow and datset updates 2006-04-20 17:30:29 +02:00			`self.add_stage(preproc)`

			`annot = Stage('annot', 'Affy annotations')`
			`annot.add_function(LoadAnnotationsFunction())`
			`self.add_stage(annot)`

			`model = Stage('model', 'Model')`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`model.add_function(PCAFunction(self))`
New pca workflow and datset updates 2006-04-20 17:30:29 +02:00			`self.add_stage(model)`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00
New pca workflow and datset updates 2006-04-20 17:30:29 +02:00			`logger.log('debug', '\tPCA\'s workflow is now active')`

			`class LoadAnnotationsFunction(Function):`

			`def __init__(self):`
			`Function.__init__(self, 'load', 'Load Annotations')`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00
New pca workflow and datset updates 2006-04-20 17:30:29 +02:00			`def load_affy_file(self, filename):`
			`f = open(filename)`
			`logger.log('notice', 'Loading annotation file: %s' % filename)`
			`self.file = f`

			`def on_response(self, dialog, response):`
			`if response == gtk.RESPONSE_OK:`
			`logger.log('notice', 'Reading file: %s' % dialog.get_filename())`
			`self.load_affy_file(dialog.get_filename())`

Validation on number of inputs added 2006-05-03 13:52:54 +02:00			`def run(self,data):`
New pca workflow and datset updates 2006-04-20 17:30:29 +02:00			`btns = ('Open', gtk.RESPONSE_OK, \`
			`'Cancel', gtk.RESPONSE_CANCEL)`
			`dialog = gtk.FileChooserDialog('Open Affy Annotation File',`
			`buttons=btns)`
			`dialog.connect('response', self.on_response)`
			`dialog.run()`
			`dialog.destroy()`

s 2006-04-21 10:29:43 +02:00			`### Reading and parsing here`
New pca workflow and datset updates 2006-04-20 17:30:29 +02:00			`annot = read_affy_annot(self.file)`
s 2006-04-21 10:29:43 +02:00			`i_want = 'Pathway'`
			`nothing = '---'`
			`ids_in_data = set(data.names('genes')) #assuming we have genes`
			`sanity_check = set(annot.keys())`
			`if not ids_in_data.intersection(sanity_check) == ids_in_data:`
			`logger.log('debug','Some identifers in data does not exist in affy file!')`
			`for affy_id,description in annot:`
			`if affy_id in ids_in_data:`
workflow update 2006-04-21 10:34:08 +02:00			`pathways = description[i_want]`
			`if not pathways[0][0]=='--':`
			`pass`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`D = []`
			`return [D]`
New pca workflow and datset updates 2006-04-20 17:30:29 +02:00
			`class PCAFunction(Function):`

Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`def __init__(self,workflow):`
			`Function.__init__(self, 'pca', 'PCA')`
New pca workflow and datset updates 2006-04-20 17:30:29 +02:00			`self.output = None`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`self.workflow = workflow`

Validation on number of inputs added 2006-05-03 13:52:54 +02:00			`def run(self,data):`
New pca workflow and datset updates 2006-04-20 17:30:29 +02:00			`logger.log('debug', 'datatype: %s' % type(data))`
			`if not isinstance(data,dataset.Dataset):`
			`return None`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`#logger.log('debug', 'dimensions: %s' % data.dims)`
New pca workflow and datset updates 2006-04-20 17:30:29 +02:00
			`## calculations`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`T,P,E,tsq = self.pca(data._array,5,tsq_loads=False)`
			`comp_def = ('comp',('1','2','3','4','5'))`
			`singel_def = ('1',('s'))`

			`# pull out input identifiers:`
get_dim_names and get_identifiers added in dataset 2006-04-24 13:23:30 +02:00			`row_ids = data.get_identifiers('genes')`
			`col_ids = data.get_identifiers('samples')`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00
fixes 2006-08-28 14:06:05 +02:00			`T = dataset.Dataset(T,[('samples',col_ids) ,comp_def],name='T2')`
			`P = dataset.Dataset(P,[('genes',row_ids),comp_def],name='P')`
			`E = dataset.Dataset(E,[('samples',col_ids),('genes',row_ids)],name='E')`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`#tsq = dataset.Dataset(tsq,[singel_def,data_ids[1])`
New pca workflow and datset updates 2006-04-20 17:30:29 +02:00
			`## plots`
Implemented Limma function for Affy workflow. Extended ScatterPlot to take two datasets and updated code using it. 2006-05-09 15:17:17 +02:00			`loading_plot1 = plots.ScatterPlot(P,P,'genes','comp','1','2')`
			`loading_plot2 = plots.ScatterPlot(P,P,'genes','comp','3','4')`
			`score_plot = plots.ScatterPlot(T,T,'samples','comp','1','2')`
New pca workflow and datset updates 2006-04-20 17:30:29 +02:00
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`return [T,P,E,loading_plot1,loading_plot2,score_plot]`
New pca workflow and datset updates 2006-04-20 17:30:29 +02:00
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`def pca(self,X,a_opt,cent=True,scale='loads',tsq_loads=False):`
			`"""Principal component analysis`

			`input:`
			`Xc -- matrix, data`
			`a_opt -- scalar, max number of comp. to calculate`
			`cent -- bool, centering [True]`
			`crit -- string, pc criteria ['exp_var',['ief','rpv','average']]`
			`scale -- string, scaling ['loads',['scores']]`
			`tsq_loads -- bool, calculate t-squared? [True]`
			`reg -- float, covariance regularizer for tsq calculations [0.2]`
			`output:`
			`T,P,E,r`
New pca workflow and datset updates 2006-04-20 17:30:29 +02:00
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`"""`
			`nSamples,nVarX = shape(X)`
			`if cent:`
			`Xc = self.mat_center(X)`
			`else:`
			`Xc = X`
			`u,s,vh = self.esvd(Xc)`
			`if scale=='scores':`
			`T = u*s`
			`T = T[:,:a_opt]`
			`P = transpose(vh)`
			`P = P[:,:a_opt]`
			`elif scale=='loads':`
			`T = u`
			`T = T[:,:a_opt]`
			`P = transpose(vh)*s`
			`P = P[:,:a_opt]`

			`E = Xc - dot(T,transpose(P))`
			`varEach = s**2`
			`totVar = sum(varEach)`
			`r = divide(varEach,totVar)*100`
			`return T,P,E,r`
workflow update 2006-04-21 10:30:37 +02:00
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`def mat_center(self,X,axis=0,ret_mn=False):`
			`"""Mean center matrix along axis.`
workflow update 2006-04-21 10:30:37 +02:00
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`input:`
			`X -- matrix, data`
			`axis -- dim,`
			`ret_mn -- bool, return mean`
			`output:`
			`Xc, [mnX]`

			`NB: axis = 1 is column-centering, axis=0=row-centering`
			`default is row centering (axis=0)`
			`"""`
			`try:`
			`rows,cols = shape(X)`
			`except ValueError:`
			`print "The X data needs to be two-dimensional"`
workflow update 2006-04-21 10:30:37 +02:00
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`if axis==0:`
			`mnX = mean(X,axis)`
			`Xs = X - resize(mnX,(rows,cols))`
workflow update 2006-04-21 10:30:37 +02:00
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`elif axis==1:`
			`mnX = mean(X,axis)`
			`Xs = transpose(transpose(X) - resize(mnX,(cols,rows)))`
			`if ret_mn:`
			`return Xs,mnX`
			`else:`
			`return Xs`

			`def esvd(self,data,economy=1):`
			`"""SVD with the option of economy sized calculation`
			`Calculate subspaces of X'X or XX' depending on the shape`
			`of the matrix.`
			`Good for extreme fat or thin matrices.`

			`"""`
			`mm = self.mm`
			`m,n = shape(data)`
			`if m>=n:`
			`u,s,v = svd(mm(data,data,trans_a=1))`
			`u = mm(data,v,trans_b=1)`
			`for i in xrange(n):`
			`s[i] = norm(u[:,i])`
			`u[:,i] = u[:,i]/s[i]`
			`else:`
			`u,s,v = svd(mm(data,data,trans_b=1))`
			`v = mm(u,data,trans_a=1)`
			`for i in xrange(m):`
			`s[i] = norm(v[i,:])`
			`v[i,:] = v[i,:]/s[i]`
			`return u,s,v`

			`def mm(self,a,b, alpha=1.0, beta=0.0, c=None, trans_a=0,`
			`trans_b=0):`
			`"""Fast matrix multiplication`

			`Return alpha(ab) + beta*c.`

			`a,b,c : matrices`
			`alpha, beta: scalars`
			`trans_a : 0 (a not transposed),`
			`1 (a transposed),`
			`2 (a conjugate transposed)`
			`trans_b : 0 (b not transposed),`
			`1 (b transposed),`
			`2 (b conjugate transposed)`
			`"""`
			`if c:`
			`gemm,= get_blas_funcs(('gemm',),(a,b,c))`
			`else:`
			`gemm,= get_blas_funcs(('gemm',),(a,b))`

			`return gemm(alpha, a, b, beta, c, trans_a, trans_b)`


			`class LoadMoothaData(Function):`
			`def __init__(self):`
			`Function.__init__(self, 'load', 'Load diabetes data')`

Validation on number of inputs added 2006-05-03 13:52:54 +02:00			`def run(self,data):`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`data_file = open('full_data.pickle','r')`
			`data = pickle.load(data_file)`
			`data_file.close()`
			`sample_file = open('sample_labels.pickle','r')`
			`sample_names = pickle.load(sample_file)`
			`sample_file.close()`
			`typecode='f'`
			`nSamps = len(sample_names)`
			`nVars = len(data.keys())`
current datset selection fix 2006-04-21 11:23:05 +02:00			`gene_ids = []`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`x = zeros((nSamps,nVars),typecode)`
			`for i,(id,desc) in enumerate(data.items()):`
current datset selection fix 2006-04-21 11:23:05 +02:00			`gene_ids.append(id)`
			`x[:,i] = desc[0].astype(typecode)`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`gene_def = ('genes',gene_ids)`
			`sample_def = ('samples', sample_names)`
			`X = dataset.Dataset(x,identifiers=[sample_def,gene_def]) # samples x genes`
current datset selection fix 2006-04-21 11:23:05 +02:00			`return [X]`
* The New project toolbar button now works. A druid will display the available workflows and ask you to select one of them. There are still a couple of unresolved issues, but it works, so i upload it to ease the use of the program. 2006-04-22 23:46:44 +02:00
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`class Log2Function(Function):`
			`def __init__(self):`
			`Function.__init__(self, 'log', 'Log2')`

Validation on number of inputs added 2006-05-03 13:52:54 +02:00			`def run(self,data):`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`x = log2(data._array)`
bugix on creation of output data (E) from pca 2006-04-24 14:51:43 +02:00			`#pull out identifiers`
			`ids = []`
			`for dim in data.get_dim_names():`
			`ids.append((dim,data.get_identifiers(dim)))`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`return [dataset.Dataset(x,identifiers=ids,name='Log2_X')]`
* The New project toolbar button now works. A druid will display the available workflows and ask you to select one of them. There are still a couple of unresolved issues, but it works, so i upload it to ease the use of the program. 2006-04-22 23:46:44 +02:00			`PCAWorkflow.name = 'PCA Workflow'`