Added SAM and conditioned enrichment analysis

2007-08-14 16:16:31 +00:00
parent 8d4848d5fa
commit d510e092e3
1 changed files with 109 additions and 19 deletions
--- a/workflows/smokers.py
+++ b/workflows/smokers.py
@@ -25,11 +25,6 @@ class SmallTestWorkflow(workflow.Workflow):
        #load.add_function(DatasetLoadFunctionCYCLE())
        self.add_stage(load)
        
-        # PREPROCESSING
-        # prep = workflow.Stage('prep', 'Preprocessing')
-        # prep.add_function(LogFunction())
-        # self.add_stage(prep)
-
        # NETWORK PREPROCESSING
        #net = workflow.Stage('net', 'Network integration')
        #net.add_function(DiffKernelFunction())
@@ -42,6 +37,7 @@ class SmallTestWorkflow(workflow.Workflow):
        model.add_function(blmfuncs.PCA())
        model.add_function(blmfuncs.PLS())
        model.add_function(blmfuncs.LPLS())
+        model.add_function(SAM())
        
        #model.add_function(bioconFuncs.SAM(app))
        self.add_stage(model)
@@ -60,6 +56,7 @@ class SmallTestWorkflow(workflow.Workflow):
        # go.add_function(gobrowser.TTestFunction())
        go.add_function(gobrowser.PlotDagFunction())
        go.add_function(GoEnrichment())
+        go.add_function(GoEnrichmentCond())
        self.add_stage(go)

        # EXTRA PLOTS
@@ -144,8 +141,60 @@ class DatasetLoadFunctionCYCLE(workflow.Function):


 ##### WORKFLOW SPECIFIC FUNCTIONS ######
-
+class SAM(workflow.Function):
+    def __init__(self, id='sam', name='SAM'):
+        workflow.Function.__init__(self, id, name)
        
+    def run(self, x, y):
+        
+        n_iter = 50 #B
+        alpha = 0.01 #cut off on qvals
+        
+        ###############
+
+        # Main function call
+
+        # setup prelimenaries
+        import rpy
+        rpy.r.library("siggenes")
+        rpy.r.library("multtest")
+        
+        cl = scipy.dot(y.asarray(), scipy.diag([1,2,3]) ).sum(1)
+        data = x.asarray().T
+        sam = rpy.r.sam(data, cl=cl, B=n_iter, var_equal=False,med=False,s0=scipy.nan,rand=scipy.nan)
+        qvals = scipy.asarray(rpy.r.slot(sam, "p.value"))
+        pvals = scipy.asarray(rpy.r.slot(sam, "q.value"))
+        
+        sam_index = (qvals<alpha).nonzero()[0]
+
+        # Update selection object
+        dim_name = x.get_dim_name(1)
+        sam_selection = x.get_identifiers(dim_name, indices=sam_index)
+        main.project.set_selection(dim_name, sam_selection)
+        
+        sel = dataset.Selection('SAM selection')
+        sel.select(dim_name, sam_selection)
+        logger.log('notice','Number of significant varibles (SAM): %s' %len(sam_selection))
+
+        # ## OUTPUT ###
+        xcolname = x.get_dim_name(1) # genes
+        x_col_ids = [xcolname, x.get_identifiers(xcolname, sorted=True)]
+        sing_id = ['_john', ['0']] #singleton
+        D_qvals = dataset.Dataset(qvals, (x_col_ids, sing_id), name='q_vals')
+        D_pvals = dataset.Dataset(pvals, (x_col_ids, sing_id), name='p_vals')
+        
+        # plots
+        s_indx = qvals.flatten().argsort()
+        s_ids = [x_col_ids[0],[x_col_ids[1][i] for i in s_indx]]
+        xindex = scipy.arange(len(qvals))
+        qvals_s = qvals.take(s_indx)
+        D_qs = dataset.Dataset(qvals_s, (s_ids, sing_id), name="sorted qvals")
+        Dind = dataset.Dataset(xindex, (s_ids, sing_id), name="dum")
+        st = plots.ScatterPlot(D_qs, Dind, 'gene_ids', '_john', '0', '0', s=10, name='SAM qvals')
+        
+        return [D_qvals, D_pvals, D_qs, st, sel]
+        
+
 class DiffKernelFunction(workflow.Function):
    def __init__(self):
        workflow.Function.__init__(self, 'diffkernel', 'Diffusion')
@@ -294,18 +343,6 @@ class KEGGQuery(workflow.Function):
        webbrowser.open(web_str)


-class LogFunction(workflow.Function):
-    def __init__(self):
-        workflow.Function.__init__(self, 'log', 'Log')
-
-    def run(self, data):
-        logger.log('notice', 'Taking the log of dataset %s' % data.get_name())
-        d = data.copy()
-        d._array = scipy.log(d._array)
-        d._name = 'log(%s)' % data.get_name()
-        return [d]
-
-
 class GoEnrichment(workflow.Function):
    def __init__(self):
        workflow.Function.__init__(self, 'goenrich', 'Go Enrichment')
@@ -320,6 +357,7 @@ class GoEnrichment(workflow.Function):
            logger.log('notice', 'No dimension called [gene_ids] in dataset: %s', data.get_name())
            return
        universe = list(data.get_identifiers('gene_ids'))
+        logger.log('notice', 'Universe consists of %s gene ids from %s' %(len(universe), data.get_name()))
        # Get current selection and validate
        curr_sel = main.project.get_selection()
        selected_genes = list(curr_sel['gene_ids'])
@@ -328,7 +366,7 @@ class GoEnrichment(workflow.Function):
            return
        
        # Hypergeometric parameter object
-        pval_cutoff = 0.99
+        pval_cutoff = 0.9999
        cond = False
        test_direction = 'over'
        params = rpy.r.new("GOHyperGParams",
@@ -353,3 +391,55 @@ class GoEnrichment(workflow.Function):
                               (row_ids, col_ids),
                               name='P values (enrichment)')
        return [xout]
+
+
+class GoEnrichmentCond(workflow.Function):
+    """ Enrichment conditioned on dag structure."""
+    def __init__(self):
+        workflow.Function.__init__(self, 'goenrich', 'Go Cond. Enrich.')
+
+    def run(self, data):
+        import rpy
+        rpy.r.library("GOstats")
+        
+        # Get universe
+        # Here, we are using a defined dataset to represent the universe
+        if not 'gene_ids' in data:
+            logger.log('notice', 'No dimension called [gene_ids] in dataset: %s', data.get_name())
+            return
+        universe = list(data.get_identifiers('gene_ids'))
+        logger.log('notice', 'Universe consists of %s gene ids from %s' %(len(universe), data.get_name()))
+        # Get current selection and validate
+        curr_sel = main.project.get_selection()
+        selected_genes = list(curr_sel['gene_ids'])
+        if len(selected_genes)==0:
+            logger.log('notice', 'This function needs a current selection!')
+            return
+        
+        # Hypergeometric parameter object
+        pval_cutoff = 0.9999
+        cond = True
+        test_direction = 'over'
+        params = rpy.r.new("GOHyperGParams",
+                           geneIds=selected_genes,
+                           annotation="hgu133a",
+                           ontology="BP",
+                           pvalueCutoff=pval_cutoff,
+                           conditional=cond,
+                           testDirection=test_direction
+                           )
+        # run test
+        # result.keys(): ['Count', 'Term', 'OddsRatio', 'Pvalue', 'ExpCount', 'GOBPID', 'Size']
+        result = rpy.r.summary(rpy.r.hyperGTest(params))
+        
+        # dataset
+        terms = result['GOBPID']
+        pvals = scipy.log(scipy.asarray(result['Pvalue']))
+        row_ids = ('go-terms', terms)
+        col_ids = ('_john', ['_doe'])
+        
+        xout = dataset.Dataset(pvals,
+                               (row_ids, col_ids),
+                               name='P values (enrichment)')
+        return [xout]
+