From 3f5d45d7af0826918ad7399ac6bd57c585af921c Mon Sep 17 00:00:00 2001
From: einarr <einarr@pvv.ntnu.no>
Date: Thu, 5 Jul 2007 18:24:45 +0000
Subject: [PATCH] Renamed smalltest workflow to smokers.

---
 workflows/smalltest.py | 275 -----------------------------------------
 workflows/smokers.py   |   3 +-
 2 files changed, 2 insertions(+), 276 deletions(-)
 delete mode 100644 workflows/smalltest.py

diff --git a/workflows/smalltest.py b/workflows/smalltest.py
deleted file mode 100644
index d43e263..0000000
--- a/workflows/smalltest.py
+++ /dev/null
@@ -1,275 +0,0 @@
-import sys,os
-import webbrowser
-
-from fluents import logger, plots,workflow,dataset
-from fluents.lib import blmfuncs,nx_utils,validation,engines,cx_stats,cx_utils
-import gobrowser, geneontology
-import scipy
-import networkx as nx
-
-
-class SmallTestWorkflow(workflow.Workflow):
-    name = 'SmallTest'
-    ident = 'smalltest'
-    description = 'A small test workflow for gene expression analysis'
-
-    def __init__(self, app):
-        workflow.Workflow.__init__(self, app)        
-
-        # DATA IMPORT
-        load = workflow.Stage('load', 'Data')
-        load.add_function(DatasetLoadFunctionSmokerSmall())
-        load.add_function(DatasetLoadFunctionSmokerMedium())
-        load.add_function(DatasetLoadFunctionSmokerFull())
-        #load.add_function(DatasetLoadFunctionCYCLE())
-        self.add_stage(load)
-        
-        # PREPROCESSING
-        prep = workflow.Stage('prep', 'Preprocessing')
-        prep.add_function(LogFunction())
-        self.add_stage(prep)
-
-        # NETWORK PREPROCESSING
-        net = workflow.Stage('net', 'Network integration')
-        net.add_function(DiffKernelFunction())
-        net.add_function(ModKernelFunction())
-        #net.add_function(RandDiffKernelFunction())
-        self.add_stage(net)
-        
-        # BLM's
-        model = workflow.Stage('models', 'Models')
-        model.add_function(blmfuncs.PCA())
-        model.add_function(blmfuncs.PLS())
-        
-        #model.add_function(bioconFuncs.SAM(app))
-        self.add_stage(model)
-        
-        query = workflow.Stage('query', 'Gene Query')
-        query.add_function(NCBIQuery())
-        query.add_function(KEGGQuery())
-        self.add_stage(query)
-        
-        go = workflow.Stage('go', 'Gene Ontology')
-        go.add_function(gobrowser.LoadGOFunction())
-        go.add_function(gobrowser.GOWeightFunction())
-        go.add_function(gobrowser.TTestFunction())
-        self.add_stage(go)
-
-        # EXTRA PLOTS
-        #plt = workflow.Stage('net', 'Network')
-        #plt.add_function(nx_analyser.KeggNetworkAnalyser())
-        #self.add_stage(plt)
-        
-        
-        logger.log('debug', 'Small test workflow is now active')
-
-
-class DatasetLoadFunctionSmokerSmall(workflow.Function):
-    """Loader for all ftsv files of smokers small datasets."""
-    def __init__(self):
-        workflow.Function.__init__(self, 'load_small', 'Smoker (Small)')
-
-    def run(self):
-        path = 'data/smokers-small/'
-        files = os.listdir(path)
-        out = []
-        for fname in files:
-            if fname.endswith('.ftsv'):
-                input_file = open(os.path.join(path, fname))
-                out.append(dataset.read_ftsv(input_file))
-        return out
-
-
-class DatasetLoadFunctionSmokerMedium(workflow.Function):
-    """Loader for all ftsv files of smokers small datasets."""
-    def __init__(self):
-        workflow.Function.__init__(self, 'load_medium', 'Smoker (Medium)')
-
-    def run(self):
-        path = 'data/smokers-medium/'
-        files = os.listdir(path)
-        out = []
-        for fname in files:
-            if fname.endswith('.ftsv'):
-                input_file = open(os.path.join(path, fname))
-                out.append(dataset.read_ftsv(input_file))
-        return out
-
-
-class DatasetLoadFunctionSmokerFull(workflow.Function):
-    """Loader for all ftsv files of smokers small datasets."""
-    def __init__(self):
-        workflow.Function.__init__(self, 'load_full', 'Smoker (Full)')
-
-    def run(self):
-        path = 'data/smokers-full/'
-        files = os.listdir(path)
-        out = []
-        for fname in files:
-            if fname.endswith('.ftsv'):
-                input_file = open(os.path.join(path, fname))
-                out.append(dataset.read_ftsv(input_file))
-        return out
-
-
-class DatasetLoadFunctionCYCLE(workflow.Function):
-    """Loader for pickled CYCLE datasets."""
-    def __init__(self):
-        workflow.Function.__init__(self, 'load_data', 'Cycle')
-
-    def run(self):
-        filename='fluents/data/CYCLE'
-        if filename:
-            return dataset.from_file(filename)            
-
-
-##### WORKFLOW SPECIFIC FUNCTIONS ######
-
-        
-class DiffKernelFunction(workflow.Function):
-    def __init__(self):
-        workflow.Function.__init__(self, 'diffkernel', 'Diffusion')
-
-    def run(self, x, a):
-        """x is gene expression data, a is the network.
-        """
-        #sanity check:
-        g = a.asnetworkx()
-        genes = x.get_identifiers(x.get_dim_name(1), sorted=True)
-        W = nx.adj_matrix(g, nodelist=genes)
-        X = x.asarray()
-        Xc, mn_x = cx_utils.mat_center(X, ret_mn=True)
-        out = []
-        alpha=1.0
-        beta = 1.0
-        K = nx_utils.K_diffusion(W, alpha=alpha, beta=beta,normalised=True)
-        Xp = scipy.dot(Xc, K) + mn_x
-        # dataset
-        row_ids = (x.get_dim_name(0),
-                   x.get_identifiers(x.get_dim_name(0),
-                                     sorted=True))
-        col_ids = (x.get_dim_name(1),
-                   x.get_identifiers(x.get_dim_name(1),
-                                     sorted=True))
-        
-        xout = dataset.Dataset(Xp,
-                               (row_ids, col_ids),
-                               name=x.get_name()+'_diff'+str(alpha))
-        out.append(xout)
-        
-        return out
-
-
-class RandDiffKernelFunction(workflow.Function):
-    def __init__(self):
-        workflow.Function.__init__(self, 'diffkernel', 'Rand. Diff.')
-
-    def run(self, x, a):
-        """x is gene expression data, a is the network.
-        """
-        #sanity check:
-        g = a.asnetworkx()
-        genes = x.get_identifiers(x.get_dim_name(1))
-        # randomise nodelist
-        genes = [genes[i] for i in cx_utils.randperm(x.shape[1])]
-        W = nx.adj_matrix(g, nodelist=genes)
-        X = x.asarray()
-        Xc, mn_x = cx_utils.mat_center(X, ret_mn=True)
-        out = []
-        alpha=1.
-        beta = 1.0
-        K = nx_utils.K_diffusion(W, alpha=alpha, beta=beta,normalised=True)
-
-        Xp = scipy.dot(Xc, K) + mn_x
-        # dataset
-        row_ids = (x.get_dim_name(0),
-                   x.get_identifiers(x.get_dim_name(0),
-                                     sorted=True))
-        col_ids = (x.get_dim_name(1),
-                   x.get_identifiers(x.get_dim_name(1),
-                                     sorted=True))
-        
-        xout = dataset.Dataset(Xp,
-                               (row_ids, col_ids),
-                               name=x.get_name()+'_diff'+str(alpha))
-        out.append(xout)
-        
-        return out
-    
-
-class ModKernelFunction(workflow.Function):
-    def __init__(self):
-        workflow.Function.__init__(self, 'mokernel', 'Modularity')
-
-    def run(self,x,a):
-        X = x.asarray()
-        g = a.asnetworkx()
-        genes = x.get_identifiers(x.get_dim_name(1), sorted=True)
-        W = nx.adj_matrix(g, nodelist=genes)
-        out=[]
-        alpha=.2
-        Xc,mn_x = cx_utils.mat_center(X, ret_mn=True)
-        K = nx_utils.K_modularity(W, alpha=alpha)
-        Xp = scipy.dot(Xc, K)
-        Xp = Xp + mn_x
-        
-        # dataset
-        row_ids = (x.get_dim_name(0),
-                   x.get_identifiers(x.get_dim_name(0),
-                                     sorted=True))
-        col_ids = (x.get_dim_name(1),
-                   x.get_identifiers(x.get_dim_name(1),
-                                     sorted=True))
-        xout = dataset.Dataset(Xp,
-                               (row_ids,col_ids),
-                               name=x.get_name()+'_mod'+str(alpha))
-        out.append(xout)
-        return out
-
-
-class NCBIQuery(workflow.Function):
-    def __init__(self):
-        workflow.Function.__init__(self, 'query', 'NCBI')
-
-    def run(self, selection):
-        base = 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?'
-        options = {r'&db=' : 'gene',
-                   r'&cmd=' : 'retrieve',
-                   r'&dopt=' : 'full_report'}
-        gene_str = ''.join([gene + "+" for gene in selection['gene_id']])
-        options[r'&list_uids='] = gene_str[:-1]
-        opt_str = ''.join([key+value for key,value in options.items()])
-        web_str = base + opt_str
-        webbrowser.open(web_str)
-
-
-class KEGGQuery(workflow.Function):
-    def __init__(self):
-        workflow.Function.__init__(self, 'query', 'KEGG')
-
-    def run(self, selection):
-        if not selection.has_key('genes') \
-               or not selection.has_key('orfs'):
-            return None
-        
-        base = r'http://www.genome.jp/dbget-bin/www_bget?'
-        options = {'org' : 'gene'}
-        org = 'hsa'
-        gene_str = ''.join([gene + "+" for gene in selection['gene_id']])
-        gene_str = gene_str[:-1]
-        gene_str = org + "+" + gene_str
-        web_str = base + gene_str
-        webbrowser.open(web_str)
-
-
-class LogFunction(workflow.Function):
-    def __init__(self):
-        workflow.Function.__init__(self, 'log', 'Log')
-
-    def run(self, data):
-        logger.log('notice', 'Taking the log of dataset %s' % data.get_name())
-        d = data.copy()
-        d._array = scipy.log(d._array)
-        d._name = 'log(%s)' % data.get_name()
-        return [d]
-
diff --git a/workflows/smokers.py b/workflows/smokers.py
index 09f40fd..3d222cd 100644
--- a/workflows/smokers.py
+++ b/workflows/smokers.py
@@ -49,9 +49,11 @@ class SmallTestWorkflow(workflow.Workflow):
         query.add_function(KEGGQuery())
         self.add_stage(query)
         
+        # Gene Ontology
         go = workflow.Stage('go', 'Gene Ontology')
         go.add_function(gobrowser.LoadGOFunction())
         go.add_function(gobrowser.GOWeightFunction())
+        go.add_function(gobrowser.DistanceToSelectionFunction())
         go.add_function(gobrowser.TTestFunction())
         self.add_stage(go)
 
@@ -60,7 +62,6 @@ class SmallTestWorkflow(workflow.Workflow):
         #plt.add_function(nx_analyser.KeggNetworkAnalyser())
         #self.add_stage(plt)
         
-        
         logger.log('debug', 'Small test workflow is now active')