# laydi/workflows/demo.py

import sys,os
import os.path
import webbrowser
import cPickle

import scipy
import networkx as nx

from fluents import logger,plots,workflow,dataset,main
from fluents.lib import blmfuncs,nx_utils,cx_utils

import gobrowser


class SmallTestWorkflow(workflow.Workflow):
    """Demo workflow for gene expression analysis.

    Registers four stages, in this order: data loading, models,
    gene queries and Gene Ontology tools.
    """
    name = 'Demo'
    ident = 'demo'
    description = 'A small test workflow for gene expression analysis.'
    chip = 'hgu'

    def __init__(self):
        workflow.Workflow.__init__(self)

        # Data import: one loader per data sub-directory ('' selects the
        # workflow's own data directory).
        data_stage = workflow.Stage('load', 'Data')
        loader_specs = (
            ('load-small', 'Small', ''),
            ('load-geneid', 'GeneID', 'geneid'),
            ('load-full', 'FullChip', 'full'),
        )
        for loader_ident, loader_label, subdir in loader_specs:
            data_stage.add_function(
                LoadDataFunction(loader_ident, loader_label, self, subdir))
        self.add_stage(data_stage)

        # The network preprocessing stage (DiffKernelFunction and
        # ModKernelFunction) is currently not registered.

        # Models
        model_stage = workflow.Stage('models', 'Models')
        for make_function in (blmfuncs.PCA, blmfuncs.PLS, SAM):
            model_stage.add_function(make_function())
        self.add_stage(model_stage)

        # Gene queries
        query_stage = workflow.Stage('query', 'Gene Query')
        for make_function in (NCBIQuery, KEGGQuery, SubgraphQuery):
            query_stage.add_function(make_function())
        self.add_stage(query_stage)

        # Background knowledge
        go_stage = workflow.Stage('go', 'Gene Ontology')
        go_functions = (gobrowser.PlotDagFunction, GoEnrichment,
                        GoEnrichmentCond, MapGO2Gene, MapGene2GO)
        for make_function in go_functions:
            go_stage.add_function(make_function())
        self.add_stage(go_stage)

        # The extra network plot stage (KeggNetworkAnalyser) is currently
        # not registered.

        logger.log('debug', 'Small test workflow is now active')


class LoadDataFunction(workflow.Function):
    """Workflow function that reads every ``.ftsv`` file in one directory."""

    def __init__(self, ident, label, wf, dir=''):
        workflow.Function.__init__(self, ident, label)
        self._wf = wf
        self._dir = dir

    def run(self):
        """Return a list with one dataset per ``.ftsv`` file found.

        The directory read is
        ``<main.options.datadir>/<workflow ident>/<dir>``; files are
        visited in the order ``os.listdir`` reports them.
        """
        folder = os.path.join(main.options.datadir, self._wf.ident, self._dir)
        return [dataset.read_ftsv(os.path.join(folder, entry))
                for entry in os.listdir(folder)
                if entry.endswith('.ftsv')]


##### WORKFLOW SPECIFIC FUNCTIONS ######


class SAM(workflow.Function):
    """Run ``sam`` from the R package ``siggenes`` on an expression dataset.

    The variables along dimension 1 of ``x`` whose q-value falls below the
    cut-off become the current project selection.
    """
    def __init__(self, id='sam', name='SAM'):
        workflow.Function.__init__(self, id, name)

    def run(self, x, y):
        """Compute p- and q-values per variable and select the significant.

        x -- expression dataset; dimension 1 holds the variables (genes).
        y -- class dataset; assumed to be a 0/1 indicator matrix with one
             column per class -- TODO confirm against callers.

        Returns ``[D_qvals, D_pvals, D_qs, st, sel]``: the q-value and
        p-value datasets, the sorted q-values, a scatter plot of the sorted
        q-values and the selection of significant variables.
        """
        n_iter = 50  # passed to sam() as B
        alpha = 0.01  # cut off on qvals

        # setup preliminaries
        import rpy
        rpy.r.library("siggenes")
        rpy.r.library("multtest")

        # Column j of y contributes the value j, so each row collapses to a
        # single class label.
        cl = scipy.dot(y.asarray(), scipy.diag(scipy.arange(y.shape[1]))).sum(1)
        data = x.asarray().T
        sam = rpy.r.sam(data, cl=cl, B=n_iter, var_equal=False,med=False,s0=scipy.nan,rand=scipy.nan)
        # Read each slot into the variable of the same name; these two were
        # previously swapped, so the cut-off was applied to p-values.
        pvals = scipy.asarray(rpy.r.slot(sam, "p.value"))
        qvals = scipy.asarray(rpy.r.slot(sam, "q.value"))

        sam_index = (qvals<alpha).nonzero()[0]

        # Update selection object
        dim_name = x.get_dim_name(1)  # genes
        sam_selection = x.get_identifiers(dim_name, indices=sam_index)
        main.project.set_selection(dim_name, sam_selection)

        sel = dataset.Selection('SAM selection')
        sel.select(dim_name, sam_selection)
        logger.log('notice','Number of significant varibles (SAM): %s' %len(sam_selection))

        # Output datasets
        x_col_ids = [dim_name, x.get_identifiers(dim_name, sorted=True)]
        sing_id = ['_john', ['0']]  # singleton dimension
        D_qvals = dataset.Dataset(qvals, (x_col_ids, sing_id), name='q_vals')
        D_pvals = dataset.Dataset(pvals, (x_col_ids, sing_id), name='p_vals')

        # Plot: q-values in ascending order against their rank.
        s_indx = qvals.flatten().argsort()
        s_ids = [x_col_ids[0],[x_col_ids[1][i] for i in s_indx]]
        xindex = scipy.arange(len(qvals))
        qvals_s = qvals.take(s_indx)
        D_qs = dataset.Dataset(qvals_s, (s_ids, sing_id), name="sorted qvals")
        Dind = dataset.Dataset(xindex, (s_ids, sing_id), name="dum")
        # Use the dataset's own dimension name rather than a hard-coded
        # 'gene_ids', so the plot matches the datasets built above.
        st = plots.ScatterPlot(D_qs, Dind, dim_name, '_john', '0', '0', s=10, name='SAM qvals')

        return [D_qvals, D_pvals, D_qs, st, sel]
        

class DiffKernelFunction(workflow.Function):
    """Multiply centred expression data by a network diffusion kernel."""

    def __init__(self):
        workflow.Function.__init__(self, 'diffkernel', 'Diffusion')

    def run(self, x, a):
        """x is gene expression data, a is the network.

        Returns a one-element list holding the transformed dataset, named
        after ``x`` with the suffix ``_diff<alpha>``.
        """
        alpha = 1.0
        beta = 1.0
        row_dim = x.get_dim_name(0)
        gene_dim = x.get_dim_name(1)

        # Adjacency matrix with nodes in the sorted gene order of x.
        graph = a.asnetworkx()
        gene_order = x.get_identifiers(gene_dim, sorted=True)
        adjacency = nx.adj_matrix(graph, nodelist=gene_order)

        # Centre, apply the kernel, then add the mean back.
        centred, mean_x = cx_utils.mat_center(x.asarray(), ret_mn=True)
        kernel = nx_utils.K_diffusion(adjacency, alpha=alpha, beta=beta,normalised=True)
        transformed = scipy.dot(centred, kernel) + mean_x

        dims = ((row_dim, x.get_identifiers(row_dim, sorted=True)),
                (gene_dim, x.get_identifiers(gene_dim, sorted=True)))
        result = dataset.Dataset(transformed, dims,
                                 name=x.get_name()+'_diff'+str(alpha))
        return [result]


class ModKernelFunction(workflow.Function):
    """Multiply centred expression data by a network modularity kernel."""

    def __init__(self):
        workflow.Function.__init__(self, 'mokernel', 'Modularity')

    def run(self,x,a):
        """x is gene expression data, a is the network.

        Returns a one-element list holding the transformed dataset, named
        after ``x`` with the suffix ``_mod<alpha>``.
        """
        alpha=.2
        row_dim = x.get_dim_name(0)
        gene_dim = x.get_dim_name(1)

        values = x.asarray()
        # Adjacency matrix with nodes in the sorted gene order of x.
        graph = a.asnetworkx()
        gene_order = x.get_identifiers(gene_dim, sorted=True)
        adjacency = nx.adj_matrix(graph, nodelist=gene_order)

        # Centre, apply the kernel, then add the mean back.
        centred, mean_x = cx_utils.mat_center(values, ret_mn=True)
        kernel = nx_utils.K_modularity(adjacency, alpha=alpha)
        transformed = scipy.dot(centred, kernel) + mean_x

        dims = ((row_dim, x.get_identifiers(row_dim, sorted=True)),
                (gene_dim, x.get_identifiers(gene_dim, sorted=True)))
        result = dataset.Dataset(transformed, dims,
                                 name=x.get_name()+'_mod'+str(alpha))
        return [result]


class NCBIQuery(workflow.Function):
    """Open the NCBI Entrez gene report for the selected genes in a browser."""

    def __init__(self, gene_id_name='gene_ids'):
        self._gene_id_name = gene_id_name
        workflow.Function.__init__(self, 'query', 'NCBI')

    def run(self):
        """Build the Entrez URL from the current selection and open it.

        Logs a notice and returns None when the selection has no entry for
        the gene id dimension or that entry is empty.
        """
        dim = self._gene_id_name
        selection = main.project.get_selection()
        if not selection.has_key(dim):
            logger.log("notice", "Expected gene ids: %s, but got: %s" %(dim, selection.keys()))
            return None
        genes = selection[dim]
        if len(genes)==0:
            logger.log("notice", "No selected genes to query")
            return None

        # Each key already carries its '&...=' delimiters, so the query
        # string is a plain concatenation of key/value pairs.
        options = {r'&db=' : 'gene',
                   r'&cmd=' : 'retrieve',
                   r'&dopt=' : 'full_report'}
        options[r'&list_uids='] = "+".join(genes)
        query = ''.join([key+value for key,value in options.items()])
        webbrowser.open('http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?' + query)


class KEGGQuery(workflow.Function):
    """Open the KEGG DBGET entries for the selected genes in a browser."""

    def __init__(self, org='hsa', gene_id_name='gene_ids'):
        self._org=org
        self._gene_id_name = gene_id_name
        workflow.Function.__init__(self, 'query', 'KEGG')

    def run(self, selection):
        """Build the DBGET URL from ``selection`` and open it.

        The query is the organism code followed by the gene ids, all joined
        with '+'.  Logs a notice and returns None when ``selection`` has no
        entry for the gene id dimension or that entry is empty.
        """
        dim = self._gene_id_name
        if not selection.has_key(dim):
            logger.log("notice", "Expected gene ids: %s, but got. %s" %(dim, selection.keys()))
            return None
        genes = selection[dim]
        if len(genes)==0:
            logger.log("notice", "No selected genes to query")
            return None

        query = self._org + "+" + "+".join(genes)
        webbrowser.open(r'http://www.genome.jp/dbget-bin/www_bget?' + query)


class GoEnrichment(workflow.Function):
    """GO (biological process) over-representation test via R ``GOstats``."""

    def __init__(self):
        workflow.Function.__init__(self, 'goenrich', 'Go Enrichment')

    def run(self, data):
        """Test the currently selected genes for GO term enrichment.

        data -- dataset that must contain a 'gene_ids' dimension.

        Returns a one-element list with a dataset of log p-values indexed
        by GO term, or None (after logging a notice) when ``data`` has no
        'gene_ids' dimension or no genes are selected.
        """
        import rpy
        rpy.r.library("GOstats")

        # Get universe
        # Here, we are using a defined dataset to represent the universe
        if 'gene_ids' not in data:
            logger.log('notice', 'No dimension called [gene_ids] in dataset: %s' %data.get_name())
            return
        universe = list(data.get_identifiers('gene_ids'))
        logger.log('notice', 'Universe consists of %s gene ids from %s' %(len(universe), data.get_name()))
        # NOTE(review): the universe is only reported in the log; it is not
        # handed to GOHyperGParams below -- confirm whether that is intended.

        # Get current selection and validate.  A selection without a
        # 'gene_ids' entry is treated like an empty one instead of raising
        # KeyError.
        curr_sel = main.project.get_selection()
        if curr_sel.has_key('gene_ids'):
            selected_genes = list(curr_sel['gene_ids'])
        else:
            selected_genes = []
        if len(selected_genes)==0:
            logger.log('notice', 'This function needs a current selection!')
            return

        # Hypergeometric parameter object
        pval_cutoff = 0.9999
        cond = False
        test_direction = 'over'
        params = rpy.r.new("GOHyperGParams",
                           geneIds=selected_genes,
                           annotation="hgu133a",
                           ontology="BP",
                           pvalueCutoff=pval_cutoff,
                           conditional=cond,
                           testDirection=test_direction
                           )
        # run test
        # result.keys(): ['Count', 'Term', 'OddsRatio', 'Pvalue', 'ExpCount', 'GOBPID', 'Size']
        result = rpy.r.summary(rpy.r.hyperGTest(params))

        # dataset: natural log of the p-values, one row per GO term
        terms = result['GOBPID']
        pvals = scipy.log(scipy.asarray(result['Pvalue']))
        row_ids = ('go-terms', terms)
        col_ids = ('_john', ['_doe'])

        xout = dataset.Dataset(pvals,
                               (row_ids, col_ids),
                               name='P values (enrichment)')
        return [xout]


class GoEnrichmentCond(workflow.Function):
    """ Enrichment conditioned on dag structure."""
    # NOTE(review): shares the ident 'goenrich' with GoEnrichment -- confirm
    # that duplicate idents are acceptable to the workflow framework.
    def __init__(self):
        workflow.Function.__init__(self, 'goenrich', 'Go Cond. Enrich.')

    def run(self, data):
        """Run the conditional GO over-representation test via ``GOstats``.

        data -- dataset that must contain a 'gene_ids' dimension.

        Returns a one-element list with a dataset of log p-values indexed
        by GO term, or None (after logging a notice) when ``data`` has no
        'gene_ids' dimension or no genes are selected.
        """
        import rpy
        rpy.r.library("GOstats")

        # Get universe
        # Here, we are using a defined dataset to represent the universe
        if 'gene_ids' not in data:
            # The name is formatted into the message; it used to be passed
            # as an extra positional argument to logger.log.
            logger.log('notice', 'No dimension called [gene_ids] in dataset: %s' %data.get_name())
            return
        universe = list(data.get_identifiers('gene_ids'))
        logger.log('notice', 'Universe consists of %s gene ids from %s' %(len(universe), data.get_name()))
        # NOTE(review): the universe is only reported in the log; it is not
        # handed to GOHyperGParams below -- confirm whether that is intended.

        # Get current selection and validate.  A selection without a
        # 'gene_ids' entry is treated like an empty one instead of raising
        # KeyError.
        curr_sel = main.project.get_selection()
        if curr_sel.has_key('gene_ids'):
            selected_genes = list(curr_sel['gene_ids'])
        else:
            selected_genes = []
        if len(selected_genes)==0:
            logger.log('notice', 'This function needs a current selection!')
            return

        # Hypergeometric parameter object
        pval_cutoff = 1
        cond = True
        test_direction = 'over'
        params = rpy.r.new("GOHyperGParams",
                           geneIds=selected_genes,
                           annotation="hgu133a",
                           ontology="BP",
                           pvalueCutoff=pval_cutoff,
                           conditional=cond,
                           testDirection=test_direction
                           )
        # run test
        # result.keys(): ['Count', 'Term', 'OddsRatio', 'Pvalue', 'ExpCount', 'GOBPID', 'Size']
        result = rpy.r.summary(rpy.r.hyperGTest(params))

        # dataset: natural log of the p-values, one row per GO term
        terms = result['GOBPID']
        pvals = scipy.log(scipy.asarray(result['Pvalue']))
        row_ids = ('go-terms', terms)
        col_ids = ('_john', ['_doe'])

        xout = dataset.Dataset(pvals,
                               (row_ids, col_ids),
                               name='P values (enrichment)')
        return [xout]


class MapGene2GO(workflow.Function):
    """Replace the 'go-terms' selection with the GO terms of selected genes."""

    def __init__(self, ont='bp', gene_id_name='gene_ids'):
        self._ont = ont
        self._gene_id_name = gene_id_name
        workflow.Function.__init__(self, 'gene2go', 'gene->GO')
        # Start from an empty mapping so run() still works (and simply finds
        # no GO terms) when the pickle below cannot be loaded.
        self._gene2go = {}
        # load data at init
        # NOTE(review): hard-coded absolute path to a developer's home
        # directory -- consider deriving it from main.options.datadir.
        fname = "/home/flatberg/fluents/data/gene2go.pcl"
        try:
            mapping_file = open(fname)
            try:
                self._gene2go = cPickle.load(mapping_file)
            finally:
                mapping_file.close()
        except Exception:
            logger.log("notice", "could not load mapping")

    def run(self):
        """Look up the GO terms of every selected gene and select them.

        Logs a notice and returns None when the selection has no entry for
        the gene id dimension or that entry is empty.  Genes without any
        GO term are reported in the log and otherwise ignored.
        """
        selection = main.project.get_selection()
        if not selection.has_key(self._gene_id_name):
            logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._gene_id_name, selection.keys()))
            return None
        if len(selection[self._gene_id_name])==0:
            logger.log("notice", "No selected genes to query")
            return None

        gene_ids = selection[self._gene_id_name]
        go_ids = set()
        for gene in gene_ids:
            go_ids_new = self._gene2go.get(gene, [])
            if not go_ids_new:
                logger.log("notice", "Could not find any goterms for %s" %gene)
            go_ids.update(go_ids_new)
        main.project.set_selection('go-terms', go_ids)
        logger.log("notice", "GO terms updated")


class MapGO2Gene(workflow.Function):
    """Replace the 'gene_ids' selection with the genes of selected GO terms."""

    def __init__(self, ont='bp', gene_id_name='go-terms'):
        self._ont = ont
        # Despite its name, this holds the GO term dimension by default.
        self._gene_id_name = gene_id_name
        workflow.Function.__init__(self, 'go2gene', 'GO->gene')
        # Start from an empty mapping so run() still works (and simply finds
        # no genes) when the pickle below cannot be loaded.
        self._go2gene = {}
        # load data at init
        # NOTE(review): hard-coded absolute path to a developer's home
        # directory -- consider deriving it from main.options.datadir.
        fname = "/home/flatberg/fluents/data/go2gene.pcl"
        try:
            mapping_file = open(fname)
            try:
                self._go2gene = cPickle.load(mapping_file)
            finally:
                mapping_file.close()
        except Exception:
            logger.log("notice", "could not load mapping")

    def run(self):
        """Look up the genes of every selected GO term and select them.

        Logs a notice and returns None when the selection has no entry for
        the GO term dimension or that entry is empty.  GO terms without any
        gene are reported in the log and otherwise ignored.
        """
        selection = main.project.get_selection()
        if not selection.has_key(self._gene_id_name):
            logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._gene_id_name, selection.keys()))
            return None
        if len(selection[self._gene_id_name])==0:
            logger.log("notice", "No selected genes to query")
            return None

        go_ids = selection[self._gene_id_name]
        gene_ids = set()
        for go in go_ids:
            genes_for_term = self._go2gene.get(go,[])
            if not genes_for_term:
                logger.log("notice", "Could not find any gene ids for %s" %go)
            gene_ids.update(genes_for_term)
        main.project.set_selection('gene_ids', gene_ids)
        # This function updates the gene selection, so say so in the log.
        logger.log("notice", "Gene ids updated")


class SubgraphQuery(workflow.Function):
    """Find network subgraphs whose nodes lie close together in a dataset."""

    def __init__(self, graph='kegg', dim='gene_ids'):
        self._gtype = graph
        self._dim = dim

        workflow.Function.__init__(self, 'keggraph', 'KeggGraph')

    def run(self, Dw, DA):
        """Keep network edges between close identifiers and plot the result.

        Dw -- dataset compared pairwise along ``self._dim``.
        DA -- dataset convertible to a networkx graph via ``asnetworkx()``.

        Sets the project selection on ``self._dim`` to the first members of
        all close pairs and builds a GraphQueryScatterPlot.  Returns None.
        """
        max_edge_ratio = .20
        neigh_type = 'cosine'  # alternatives: 'cov', 'heat'
        # 1.) Operate on a subset selection
        selection = main.project.get_selection()
        if not selection.has_key(self._dim):
            logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._dim, selection.keys()))
            return None
        if len(selection[self._dim]) == 0:
            # Only the first 100 identifiers are used in this case.
            logger.log("notice", "No selected genes to query, using all")
            Dw = Dw.subdata(self._dim, Dw.get_identifiers(self._dim)[:100])
        else:
            Dw = Dw.subdata(self._dim, selection[self._dim])

        # 2.) Pairwise goodness in loading space
        indices = self._pairsim(Dw, ptype=neigh_type, cut_rat=max_edge_ratio)
        idents1 = Dw.get_identifiers(self._dim, indices[:,0])
        idents2 = Dw.get_identifiers(self._dim, indices[:,1])
        # Closeness is symmetric, so store both orientations of each pair;
        # a set also makes the membership test below constant time.
        close_pairs = set()
        for first, second in zip(idents1, idents2):
            close_pairs.add((first, second))
            close_pairs.add((second, first))

        # 3.) Identify close subgraphs
        G = DA.asnetworkx()
        for edge in G.edges():
            # Compare end points only; edges() may yield (u, v, data)
            # triples for some graph classes.
            if tuple(edge[:2]) not in close_pairs:
                G.delete_edge(edge)
        S = nx.connected_component_subgraphs(G)
        print([len(subgraph) for subgraph in S])
        # 4.) Rank subgraphs

        # The identifiers come from self._dim, so select on that dimension.
        main.project.set_selection(self._dim, idents1)
        logger.log("notice", "Gene ids updated")
        plt = GraphQueryScatterPlot(S, Dw)
        #return [plt]

    def _pairsim(self, Dw, ptype='cosine',cut_rat=.2):
        """Returns close pairs across given dim.
        ptype : ['cov', 'correlation', 'cosine', 'heat', 'euclidean']

        The result is an integer array with one ``[i, j]`` row per close
        pair of rows of ``Dw.asarray()``.  ``cut_rat`` scales the largest
        similarity (or distance) into the acceptance threshold.  Raises
        ValueError for an unknown ``ptype``.
        """
        W = Dw.asarray()
        if ptype == 'cov':
            # Centre into a new array so the dataset's data is not modified.
            W = W - W.mean(1)[:,scipy.newaxis]
            wcov = scipy.dot(W, W.T)/(W.shape[1]-1)
            wcov_min = wcov.max()*cut_rat
            indices = scipy.asarray(scipy.where(wcov >= wcov_min)).T
        elif ptype == 'heat':
            from hcluster import pdist, squareform
            D = squareform(pdist(W))
            H = scipy.exp(-D)
            h_min = H.max()*cut_rat
            indices = scipy.asarray(scipy.where(H >= h_min)).T
        elif ptype in ['euclidean', 'cosine', 'correlation']:
            from hcluster import pdist, squareform
            # The metric belongs to pdist; handing it to squareform left
            # the distance at pdist's default metric.
            D = squareform(pdist(W, ptype))
            d_min = D.max()*cut_rat
            indices = []
            for i in range(D.shape[0]):
                for j in range(i, D.shape[0]):
                    if D[i,j] <= d_min:
                        indices.append([i,j])
            print("W")
            print(W.shape)
            # reshape keeps the two-column layout even without any pair.
            indices = scipy.asarray(indices, dtype=int).reshape(-1, 2)

        else:
            raise ValueError("ptype: %s  not valid" %ptype)
        return indices

    def _subgraphsim(self, Dw, idents, stype='dijkstra'):
        # NOTE(review): unfinished and not called anywhere in this file --
        # `G` is not defined in this scope and add_edge() gets no arguments,
        # so this fails for any non-empty `idents`.
        # subgraph
        Gw = nx.XGraph()
        for edge in idents:
            e = G.get_edge(edge)
            Gw.add_edge()
        if stype == 'dijkstra':
            pass

        
class GraphQueryScatterPlot(plots.ScatterPlot):
    """Scatter plot of a dataset with network subgraphs drawn on top."""

    def __init__(self, subgraphs, Dw, *args, **kw):
        # NOTE(review): *args and **kw are accepted but not used.
        self._subgraphs = subgraphs
        self._nx_nodes = []
        self._nx_edges = []
        self._init_scatter(Dw)
        self.overlay_subgraphs()

    def _init_scatter(self, Dw):
        """Initialise the scatter plot from the first two columns of Dw."""
        self._Dw = Dw
        id_dim, sel_dim = Dw.get_dim_name()
        self._dim = id_dim
        id_1, = Dw.get_identifiers(sel_dim, [0])
        id_2, = Dw.get_identifiers(sel_dim, [1])
        plots.ScatterPlot.__init__(self, Dw, Dw, id_dim, sel_dim, id_1, id_2, c='g', s=50,name="Hypo", alpha=.5)

    def overlay_subgraphs(self):
        """Draw every subgraph at the current scatter positions."""
        all_nodes = self._Dw.get_identifiers(self._dim, sorted=True)
        # Node positions are the same for every subgraph, so build them
        # once.  (This dictionary was previously never created.)
        pos = {}
        for i, node in enumerate(all_nodes):
            pos[node] = (self.xaxis_data[i], self.yaxis_data[i])
        for subgraph in self._subgraphs:
            nn = nx.draw_networkx_nodes(subgraph, pos, node_size=200, ax=self.axes, zorder=10)
            ee = nx.draw_networkx_edges(subgraph, pos, ax=self.axes, zorder=9)
            # Some networkx versions return None when nothing was drawn;
            # only remember real artists so they can be removed later.
            if nn is not None:
                self._nx_nodes.append(nn)
            if ee is not None:
                self._nx_edges.append(ee)

    def _delete_networks(self):
        """Remove all drawn subgraph artists from the axes."""
        for artists in (self._nx_nodes, self._nx_edges):
            # Iterate over a copy: removing from the list being iterated
            # skipped every second artist.
            for artist in list(artists):
                artists.remove(artist)
                self.axes.collections.remove(artist)

    # NOTE(review): in both setters the overlay is redrawn before the base
    # class is told about the new axis -- confirm that xaxis_data and
    # yaxis_data are already up to date at that point.
    def set_ordinate(self, sb):
        self._delete_networks()
        self.overlay_subgraphs()
        plots.ScatterPlot.set_ordinate(self, sb)

    def set_absicca(self, sb):
        self._delete_networks()
        self.overlay_subgraphs()
        plots.ScatterPlot.set_absicca(self, sb)


class CAsinglesel(workflow.Function):
    """ Modified non-symmetric correspondence analysis.

    Setup multiple selections:

    Input : - a subset(s) along a dimension (selection) of `interesting` identifiers.
            - Predefined subsets (categories) along the same dimension.

    1.) The co-occurrence matrix of interesting identifiers and categories is made.
    2.) The variables are scaled to represent the relative frequencies.

    """

    def run(self, X, Ckegg):
        # Not implemented yet.
        pass

    # NOTE(review): this class is nested inside CAsinglesel, which looks
    # like an indentation slip; the nesting is kept so that existing
    # references to CAsinglesel.CASingleSelDouble continue to work.
    class CASingleSelDouble(workflow.Function):
        """Placeholder; not implemented yet."""

        # Previously defined one level out, where it silently replaced
        # CAsinglesel.run.
        def run(self, X, Ckegg):
            pass
Demo workflow 2008-02-05 12:34:14 +01:00			`import sys,os`
			`import os.path`
			`import webbrowser`
			`import cPickle`

			`import scipy`
			`import networkx as nx`

			`from fluents import logger,plots,workflow,dataset,main`
			`from fluents.lib import blmfuncs,nx_utils,cx_utils`

			`import gobrowser`


			`class SmallTestWorkflow(workflow.Workflow):`
			`name = 'Demo'`
			`ident = 'demo'`
			`description = 'A small test workflow for gene expression analysis.'`
			`chip = 'hgu'`
			`def __init__(self):`
			`workflow.Workflow.__init__(self)`

			`# DATA IMPORT`
			`load = workflow.Stage('load', 'Data')`

			`load_small = LoadDataFunction('load-small', 'Small', self)`
			`load.add_function(load_small)`

			`load_medium = LoadDataFunction('load-geneid', 'GeneID', self, 'geneid')`
			`load.add_function(load_medium)`

			`load_medium = LoadDataFunction('load-full', 'FullChip', self, 'full')`
			`load.add_function(load_medium)`

			`self.add_stage(load)`

			`# NETWORK PREPROCESSING`
			`#net = workflow.Stage('net', 'Network integration')`
			`#net.add_function(DiffKernelFunction())`
			`#net.add_function(ModKernelFunction())`
			`#self.add_stage(net)`

			`# Models`
			`model = workflow.Stage('models', 'Models')`
			`model.add_function(blmfuncs.PCA())`
			`model.add_function(blmfuncs.PLS())`
			`model.add_function(SAM())`
			`self.add_stage(model)`

			`query = workflow.Stage('query', 'Gene Query')`
			`query.add_function(NCBIQuery())`
			`query.add_function(KEGGQuery())`
			`query.add_function(SubgraphQuery())`
			`self.add_stage(query)`

			`# Background knowledge`
			`go = workflow.Stage('go', 'Gene Ontology')`
			`go.add_function(gobrowser.PlotDagFunction())`
			`go.add_function(GoEnrichment())`
			`go.add_function(GoEnrichmentCond())`
			`go.add_function(MapGO2Gene())`
			`go.add_function(MapGene2GO())`
			`self.add_stage(go)`

			`# EXTRA PLOTS`
			`#plt = workflow.Stage('net', 'Network')`
			`#plt.add_function(nx_analyser.KeggNetworkAnalyser())`
			`#self.add_stage(plt)`

			`logger.log('debug', 'Small test workflow is now active')`


			`class LoadDataFunction(workflow.Function):`
			`"""Loads all datasets in a given directory."""`
			`def __init__(self, ident, label, wf, dir=''):`
			`workflow.Function.__init__(self, ident, label)`
			`self._dir = dir`
			`self._wf = wf`

			`def run(self):`
			`path = os.path.join(main.options.datadir, self._wf.ident, self._dir)`
			`files = os.listdir(path)`
			`out = []`
			`for fn in files:`
			`if fn.endswith('.ftsv'):`
			`out.append(dataset.read_ftsv(os.path.join(path, fn)))`
			`return out`



			`##### WORKFLOW SPECIFIC FUNCTIONS ######`


			`class SAM(workflow.Function):`
			`def __init__(self, id='sam', name='SAM'):`
			`workflow.Function.__init__(self, id, name)`

			`def run(self, x, y):`
			`n_iter = 50 #B`
			`alpha = 0.01 #cut off on qvals`

			`###############`

			`# Main function call`

			`# setup prelimenaries`
			`import rpy`
			`rpy.r.library("siggenes")`
			`rpy.r.library("multtest")`

			`cl = scipy.dot(y.asarray(), scipy.diag(scipy.arange(y.shape[1]))).sum(1)`
			`data = x.asarray().T`
			`sam = rpy.r.sam(data, cl=cl, B=n_iter, var_equal=False,med=False,s0=scipy.nan,rand=scipy.nan)`
			`qvals = scipy.asarray(rpy.r.slot(sam, "p.value"))`
			`pvals = scipy.asarray(rpy.r.slot(sam, "q.value"))`

			`sam_index = (qvals<alpha).nonzero()[0]`

			`# Update selection object`
			`dim_name = x.get_dim_name(1)`
			`sam_selection = x.get_identifiers(dim_name, indices=sam_index)`
			`main.project.set_selection(dim_name, sam_selection)`

			`sel = dataset.Selection('SAM selection')`
			`sel.select(dim_name, sam_selection)`
			`logger.log('notice','Number of significant varibles (SAM): %s' %len(sam_selection))`

			`# ## OUTPUT ###`
			`xcolname = x.get_dim_name(1) # genes`
			`x_col_ids = [xcolname, x.get_identifiers(xcolname, sorted=True)]`
			`sing_id = ['_john', ['0']] #singleton`
			`D_qvals = dataset.Dataset(qvals, (x_col_ids, sing_id), name='q_vals')`
			`D_pvals = dataset.Dataset(pvals, (x_col_ids, sing_id), name='p_vals')`

			`# plots`
			`s_indx = qvals.flatten().argsort()`
			`s_ids = [x_col_ids[0],[x_col_ids[1][i] for i in s_indx]]`
			`xindex = scipy.arange(len(qvals))`
			`qvals_s = qvals.take(s_indx)`
			`D_qs = dataset.Dataset(qvals_s, (s_ids, sing_id), name="sorted qvals")`
			`Dind = dataset.Dataset(xindex, (s_ids, sing_id), name="dum")`
			`st = plots.ScatterPlot(D_qs, Dind, 'gene_ids', '_john', '0', '0', s=10, name='SAM qvals')`

			`return [D_qvals, D_pvals, D_qs, st, sel]`


			`class DiffKernelFunction(workflow.Function):`
			`def __init__(self):`
			`workflow.Function.__init__(self, 'diffkernel', 'Diffusion')`

			`def run(self, x, a):`
			`"""x is gene expression data, a is the network.`
			`"""`
			`#sanity check:`
			`g = a.asnetworkx()`
			`genes = x.get_identifiers(x.get_dim_name(1), sorted=True)`
			`W = nx.adj_matrix(g, nodelist=genes)`
			`X = x.asarray()`
			`Xc, mn_x = cx_utils.mat_center(X, ret_mn=True)`
			`out = []`
			`alpha = 1.0`
			`beta = 1.0`
			`K = nx_utils.K_diffusion(W, alpha=alpha, beta=beta,normalised=True)`
			`Xp = scipy.dot(Xc, K) + mn_x`
			`# dataset`
			`row_ids = (x.get_dim_name(0),`
			`x.get_identifiers(x.get_dim_name(0),`
			`sorted=True))`
			`col_ids = (x.get_dim_name(1),`
			`x.get_identifiers(x.get_dim_name(1),`
			`sorted=True))`

			`xout = dataset.Dataset(Xp,`
			`(row_ids, col_ids),`
			`name=x.get_name()+'_diff'+str(alpha))`
			`out.append(xout)`

			`return out`


			`class ModKernelFunction(workflow.Function):`
			`def __init__(self):`
			`workflow.Function.__init__(self, 'mokernel', 'Modularity')`

			`def run(self,x,a):`
			`X = x.asarray()`
			`g = a.asnetworkx()`
			`genes = x.get_identifiers(x.get_dim_name(1), sorted=True)`
			`W = nx.adj_matrix(g, nodelist=genes)`
			`out=[]`
			`alpha=.2`
			`Xc,mn_x = cx_utils.mat_center(X, ret_mn=True)`
			`K = nx_utils.K_modularity(W, alpha=alpha)`
			`Xp = scipy.dot(Xc, K)`
			`Xp = Xp + mn_x`

			`# dataset`
			`row_ids = (x.get_dim_name(0),`
			`x.get_identifiers(x.get_dim_name(0),`
			`sorted=True))`
			`col_ids = (x.get_dim_name(1),`
			`x.get_identifiers(x.get_dim_name(1),`
			`sorted=True))`
			`xout = dataset.Dataset(Xp,`
			`(row_ids,col_ids),`
			`name=x.get_name()+'_mod'+str(alpha))`
			`out.append(xout)`
			`return out`


			`class NCBIQuery(workflow.Function):`
			`def __init__(self, gene_id_name='gene_ids'):`
			`self._gene_id_name = gene_id_name`
			`workflow.Function.__init__(self, 'query', 'NCBI')`

			`def run(self):`
			`selection = main.project.get_selection()`
			`if not selection.has_key(self._gene_id_name):`
			`logger.log("notice", "Expected gene ids: %s, but got: %s" %(self._gene_id_name, selection.keys()))`
			`return None`
			`if len(selection[self._gene_id_name])==0:`
			`logger.log("notice", "No selected genes to query")`
			`return None`

			`base = 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?'`
			`options = {r'&db=' : 'gene',`
			`r'&cmd=' : 'retrieve',`
			`r'&dopt=' : 'full_report'}`
			`gene_str = ''.join([gene + "+" for gene in selection[self._gene_id_name]])`
			`options[r'&list_uids='] = gene_str[:-1]`
			`opt_str = ''.join([key+value for key,value in options.items()])`
			`web_str = base + opt_str`
			`webbrowser.open(web_str)`


			`class KEGGQuery(workflow.Function):`
			`def __init__(self, org='hsa', gene_id_name='gene_ids'):`
			`self._org=org`
			`self._gene_id_name = gene_id_name`
			`workflow.Function.__init__(self, 'query', 'KEGG')`

			`def run(self, selection):`
			`if not selection.has_key(self._gene_id_name):`
			`logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._gene_id_name, selection.keys()))`
			`return None`
			`if len(selection[self._gene_id_name])==0:`
			`logger.log("notice", "No selected genes to query")`
			`return None`

			`base = r'http://www.genome.jp/dbget-bin/www_bget?'`
			`gene_str = ''.join([gene + "+" for gene in selection[self._gene_id_name]])`
			`gene_str = gene_str[:-1]`
			`gene_str = self._org + "+" + gene_str`
			`web_str = base + gene_str`
			`webbrowser.open(web_str)`


			`class GoEnrichment(workflow.Function):`
			`def __init__(self):`
			`workflow.Function.__init__(self, 'goenrich', 'Go Enrichment')`

			`def run(self, data):`
			`import rpy`
			`rpy.r.library("GOstats")`

			`# Get universe`
			`# Here, we are using a defined dataset to represent the universe`
			`if not 'gene_ids' in data:`
			`logger.log('notice', 'No dimension called [gene_ids] in dataset: %s' %data.get_name())`
			`return`
			`universe = list(data.get_identifiers('gene_ids'))`
			`logger.log('notice', 'Universe consists of %s gene ids from %s' %(len(universe), data.get_name()))`
			`# Get current selection and validate`
			`curr_sel = main.project.get_selection()`
			`selected_genes = list(curr_sel['gene_ids'])`
			`if len(selected_genes)==0:`
			`logger.log('notice', 'This function needs a current selection!')`
			`return`

			`# Hypergeometric parameter object`
			`pval_cutoff = 0.9999`
			`cond = False`
			`test_direction = 'over'`
			`params = rpy.r.new("GOHyperGParams",`
			`geneIds=selected_genes,`
			`annotation="hgu133a",`
			`ontology="BP",`
			`pvalueCutoff=pval_cutoff,`
			`conditional=cond,`
			`testDirection=test_direction`
			`)`
			`# run test`
			`# result.keys(): ['Count', 'Term', 'OddsRatio', 'Pvalue', 'ExpCount', 'GOBPID', 'Size']`
			`result = rpy.r.summary(rpy.r.hyperGTest(params))`

			`# dataset`
			`terms = result['GOBPID']`
			`pvals = scipy.log(scipy.asarray(result['Pvalue']))`
			`row_ids = ('go-terms', terms)`
			`col_ids = ('_john', ['_doe'])`

			`xout = dataset.Dataset(pvals,`
			`(row_ids, col_ids),`
			`name='P values (enrichment)')`
			`return [xout]`


			class GoEnrichmentCond(workflow.Function):
			    """Conditional GO enrichment of the current gene selection.

			    Runs the hypergeometric test of the R package GOstats (through rpy)
			    with ``conditional=True``, i.e. conditioned on the dag structure.
			    """

			    def __init__(self):
			        workflow.Function.__init__(self, 'goenrich', 'Go Cond. Enrich.')

			    def run(self, data):
			        """Test the selected genes for over-represented GO (BP) terms.

			        data -- dataset that must contain a 'gene_ids' dimension.

			        Returns a one-element list holding a dataset of log p-values
			        indexed by GO term along the 'go-terms' dimension. Returns None,
			        after logging a notice, when the dataset has no 'gene_ids'
			        dimension or when no genes are currently selected.
			        """
			        # rpy is imported here rather than at module level.
			        import rpy
			        rpy.r.library("GOstats")

			        # Get universe
			        # Here, we are using a defined dataset to represent the universe
			        if 'gene_ids' not in data:
			            # The message has to be formatted before it is handed to the
			            # logger; every other call passes exactly (level, message).
			            logger.log('notice',
			                       'No dimension called [gene_ids] in dataset: %s'
			                       % data.get_name())
			            return
			        universe = list(data.get_identifiers('gene_ids'))
			        logger.log('notice',
			                   'Universe consists of %s gene ids from %s'
			                   % (len(universe), data.get_name()))
			        # NOTE(review): the universe is only reported in the log; it is
			        # not passed on to GOHyperGParams below -- confirm this is wanted.

			        # Get current selection and validate
			        curr_sel = main.project.get_selection()
			        selected_genes = list(curr_sel['gene_ids'])
			        if len(selected_genes) == 0:
			            logger.log('notice', 'This function needs a current selection!')
			            return

			        # Hypergeometric parameter object
			        pval_cutoff = 1
			        cond = True
			        test_direction = 'over'
			        params = rpy.r.new("GOHyperGParams",
			                           geneIds=selected_genes,
			                           annotation="hgu133a",
			                           ontology="BP",
			                           pvalueCutoff=pval_cutoff,
			                           conditional=cond,
			                           testDirection=test_direction)
			        # run test
			        # result.keys(): ['Count', 'Term', 'OddsRatio', 'Pvalue',
			        #                 'ExpCount', 'GOBPID', 'Size']
			        result = rpy.r.summary(rpy.r.hyperGTest(params))

			        # dataset
			        terms = result['GOBPID']
			        pvals = scipy.log(scipy.asarray(result['Pvalue']))
			        row_ids = ('go-terms', terms)
			        col_ids = ('_john', ['_doe'])

			        xout = dataset.Dataset(pvals,
			                               (row_ids, col_ids),
			                               name='P values (enrichment)')
			        return [xout]


			class MapGene2GO(workflow.Function):
			    """Replace the GO term selection by the terms of the selected genes."""

			    def __init__(self, ont='bp', gene_id_name='gene_ids'):
			        """Set up the function and load the pickled gene -> GO mapping.

			        ont -- ontology name; stored, not used by this class.
			        gene_id_name -- name of the selection dimension holding gene ids.
			        """
			        self._ont = ont
			        self._gene_id_name = gene_id_name
			        workflow.Function.__init__(self, 'gene2go', 'gene->GO')
			        # load data at init
			        # Start from an empty mapping so that run() still works (and
			        # reports missing terms) when the file cannot be loaded.
			        self._gene2go = {}
			        fname = "/home/flatberg/fluents/data/gene2go.pcl"
			        try:
			            # NOTE: unpickling executes code; only load trusted files.
			            fd = open(fname, 'rb')
			            try:
			                self._gene2go = cPickle.load(fd)
			            finally:
			                fd.close()
			        except Exception:
			            # Best effort: a missing or unreadable mapping is not fatal.
			            logger.log("notice", "could not load mapping")

			    def run(self):
			        """Select every GO term mapped to a currently selected gene.

			        Returns None. Logs a notice and leaves the selection untouched
			        when the gene dimension is absent or has no selected ids.
			        """
			        selection = main.project.get_selection()
			        if self._gene_id_name not in selection:
			            logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._gene_id_name, selection.keys()))
			            return None
			        gene_ids = selection[self._gene_id_name]
			        if len(gene_ids) == 0:
			            logger.log("notice", "No selected genes to query")
			            return None

			        go_ids = set()
			        for gene in gene_ids:
			            go_ids_new = self._gene2go.get(gene, [])
			            if not go_ids_new:
			                logger.log("notice", "Could not find any goterms for %s" %gene)
			            go_ids.update(go_ids_new)
			        main.project.set_selection('go-terms', go_ids)
			        logger.log("notice", "GO terms updated")


			class MapGO2Gene(workflow.Function):
			    """Replace the gene selection by the genes of the selected GO terms."""

			    def __init__(self, ont='bp', gene_id_name='go-terms'):
			        """Set up the function and load the pickled GO -> gene mapping.

			        ont -- ontology name; stored, not used by this class.
			        gene_id_name -- name of the selection dimension that is read;
			            despite the parameter name it holds GO terms by default.
			        """
			        self._ont = ont
			        self._gene_id_name = gene_id_name
			        workflow.Function.__init__(self, 'go2gene', 'GO->gene')
			        # load data at init
			        # Start from an empty mapping so that run() still works (and
			        # reports missing genes) when the file cannot be loaded.
			        self._go2gene = {}
			        fname = "/home/flatberg/fluents/data/go2gene.pcl"
			        try:
			            # NOTE: unpickling executes code; only load trusted files.
			            fd = open(fname, 'rb')
			            try:
			                self._go2gene = cPickle.load(fd)
			            finally:
			                fd.close()
			        except Exception:
			            # Best effort: a missing or unreadable mapping is not fatal.
			            logger.log("notice", "could not load mapping")

			    def run(self):
			        """Select every gene mapped to a currently selected GO term.

			        Returns None. Logs a notice and leaves the selection untouched
			        when the GO term dimension is absent or has no selected ids.
			        """
			        selection = main.project.get_selection()
			        if self._gene_id_name not in selection:
			            logger.log("notice", "Expected GO terms: %s, but got. %s" %(self._gene_id_name, selection.keys()))
			            return None
			        go_ids = selection[self._gene_id_name]
			        if len(go_ids) == 0:
			            logger.log("notice", "No selected GO terms to query")
			            return None

			        gene_ids = set()
			        for go in go_ids:
			            gene_ids_new = self._go2gene.get(go, [])
			            if not gene_ids_new:
			                logger.log("notice", "Could not find any gene ids for %s" %go)
			            gene_ids.update(gene_ids_new)
			        main.project.set_selection('gene_ids', gene_ids)
			        # This function updates the gene selection, not the GO terms.
			        logger.log("notice", "Gene ids updated")


			class SubgraphQuery(workflow.Function):
			    """Find network subgraphs whose nodes lie close together in a dataset."""

			    def __init__(self, graph='kegg', dim='gene_ids'):
			        """graph -- graph type label (stored only); dim -- dimension queried."""
			        self._gtype = graph
			        self._dim = dim

			        workflow.Function.__init__(self, 'keggraph', 'KeggGraph')

			    def run(self, Dw, DA):
			        """Keep the edges of DA that join close pairs of Dw and select them.

			        Dw -- dataset with self._dim as a dimension. Presumably the row
			            dimension, since pair indices are row indices -- TODO confirm.
			        DA -- dataset providing asnetworkx().

			        Returns None. The first members of all close pairs become the
			        current selection along self._dim.
			        """
			        neigh_type = 'cosine'
			        # 1.) Operate on a subset selection
			        selection = main.project.get_selection()
			        if self._dim not in selection:
			            logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._dim, selection.keys()))
			            return None
			        if len(selection[self._dim]) == 0:
			            logger.log("notice", "No selected genes to query, using all")
			            # NOTE(review): only the first 100 identifiers are used here,
			            # although the message says "all".
			            Dw = Dw.subdata(self._dim, Dw.get_identifiers(self._dim)[:100])
			        else:
			            Dw = Dw.subdata(self._dim, selection[self._dim])

			        # 2.) Pairwise goodness in loading space
			        indices = self._pairsim(Dw, neigh_type)
			        idents1 = Dw.get_identifiers(self._dim, indices[:,0])
			        idents2 = Dw.get_identifiers(self._dim, indices[:,1])
			        # A set gives constant-time membership tests in the edge loop.
			        close_pairs = set(zip(idents1, idents2))

			        # 3.) Identify close subgraphs
			        G = DA.asnetworkx()
			        for edge in G.edges():
			            # Compare end points only, so edges that carry a data item
			            # as a third element are matched as well.
			            # NOTE(review): (u, v) is not matched against (v, u) --
			            # confirm whether the graph is directed.
			            if tuple(edge[:2]) not in close_pairs:
			                G.delete_edge(edge)
			        S = list(nx.connected_component_subgraphs(G))
			        logger.log("notice", "Subgraph sizes: %s" % [len(s) for s in S])
			        # 4.) Rank subgraphs

			        # The identifiers come from self._dim, so select along it.
			        main.project.set_selection(self._dim, idents1)
			        logger.log("notice", "Gene ids updated")
			        # NOTE(review): the plot is built but not returned.
			        plt = GraphQueryScatterPlot(S, Dw)

			    def _pairsim(self, Dw, ptype='cosine',cut_rat=.2):
			        """Returns close pairs across given dim.

			        ptype : ['cov', 'correlation', 'cosine', 'heat', 'euclidean']
			        cut_rat : fraction of the largest value used as threshold.

			        Returns an integer array with one (i, j) row index pair per row.
			        'cov' and 'heat' keep pairs at or above the threshold, in both
			        orientations. The distance types keep pairs at or below the
			        threshold, with i <= j only.
			        Raises ValueError for an unknown ptype.
			        """
			        W = Dw.asarray()
			        if ptype == 'cov':
			            # Centre a copy; the array owned by the dataset stays intact.
			            W = W - W.mean(1)[:,scipy.newaxis]
			            wcov = scipy.dot(W, W.T)/(W.shape[1]-1)
			            wcov_min = wcov.max()*cut_rat
			            indices = scipy.asarray(scipy.where(wcov >= wcov_min)).T
			        elif ptype == 'heat':
			            from hcluster import pdist, squareform
			            D = squareform(pdist(W))
			            H = scipy.exp(-D)
			            h_min = H.max()*cut_rat
			            indices = scipy.asarray(scipy.where(H >= h_min)).T
			        elif ptype in ['euclidean', 'cosine', 'correlation']:
			            from hcluster import pdist, squareform
			            # The metric is an argument of pdist, not of squareform.
			            D = squareform(pdist(W, ptype))
			            d_min = D.max()*cut_rat
			            # Upper triangle including the diagonal: pairs with i <= j.
			            close = scipy.triu(D <= d_min)
			            indices = scipy.asarray(scipy.where(close)).T
			        else:
			            raise ValueError("ptype: %s not valid" %ptype)
			        return indices

			    def _subgraphsim(self, Dw, idents, stype='dijkstra'):
			        # NOTE(review): unfinished stub, kept unchanged. `G` is not
			        # defined in this scope and add_edge() is called without
			        # arguments, so this fails for any non-empty idents.
			        # subgraph
			        Gw = nx.XGraph()
			        for edge in idents:
			            e = G.get_edge(edge)
			            Gw.add_edge()
			        if stype == 'dijkstra':
			            pass
Demo workflow 2008-02-05 12:34:14 +01:00
			class GraphQueryScatterPlot(plots.ScatterPlot):
			    """Scatter plot of a dataset with network subgraphs drawn on top."""

			    def __init__(self, subgraphs, Dw, *args, **kw):
			        """Create the scatter plot of Dw and overlay the subgraphs.

			        subgraphs -- iterable of networkx graphs to draw.
			        Dw -- two-dimensional dataset that is plotted.
			        Extra positional and keyword arguments are accepted and ignored.
			        """
			        self._subgraphs = subgraphs
			        self._nx_nodes = []
			        self._nx_edges = []
			        self._init_scatter(Dw)
			        self.overlay_subgraphs()

			    def _init_scatter(self, Dw):
			        """Initialise the base scatter plot from the first two ids of Dw."""
			        self._Dw = Dw
			        id_dim, sel_dim = Dw.get_dim_name()
			        self._dim = id_dim
			        id_1, = Dw.get_identifiers(sel_dim, [0])
			        id_2, = Dw.get_identifiers(sel_dim, [1])
			        plots.ScatterPlot.__init__(self, Dw, Dw, id_dim, sel_dim, id_1, id_2, c='g', s=50,name="Hypo", alpha=.5)

			    def overlay_subgraphs(self):
			        """Draw every subgraph at the current scatter coordinates."""
			        all_nodes = self._Dw.get_identifiers(self._dim, sorted=True)
			        # get xy positions from the axis data; the mapping is the same
			        # for all subgraphs, so it is built once.
			        pos = {}
			        for i, node in enumerate(all_nodes):
			            pos[node] = (self.xaxis_data[i], self.yaxis_data[i])
			        for subgraph in self._subgraphs:
			            nn = nx.draw_networkx_nodes(subgraph, pos, node_size=200, ax=self.axes, zorder=10)
			            ee = nx.draw_networkx_edges(subgraph, pos, ax=self.axes, zorder=9)
			            self._nx_nodes.append(nn)
			            self._nx_edges.append(ee)

			    def _delete_networks(self):
			        """Remove all overlay artists from the axes and forget them."""
			        collections = self.axes.collections
			        # Walk over a combined copy; removing from a list while
			        # iterating over that same list skips elements.
			        for artist in self._nx_nodes + self._nx_edges:
			            # A draw call may not have produced an artist at all.
			            if artist is not None and artist in collections:
			                collections.remove(artist)
			        del self._nx_nodes[:]
			        del self._nx_edges[:]

			    def set_ordinate(self, sb):
			        """Redraw the overlay, then update the ordinate of the plot."""
			        # NOTE(review): the overlay is rebuilt before the base class
			        # updates the axis data -- confirm this order is intended.
			        self._delete_networks()
			        self.overlay_subgraphs()
			        plots.ScatterPlot.set_ordinate(self, sb)

			    def set_absicca(self, sb):
			        """Redraw the overlay, then update the abscissa of the plot."""
			        # NOTE(review): the overlay is rebuilt before the base class
			        # updates the axis data -- confirm this order is intended.
			        self._delete_networks()
			        self.overlay_subgraphs()
			        plots.ScatterPlot.set_absicca(self, sb)


			class CAsinglesel(workflow.Function):
			    """ Modified non-symmetric correspondence analysis.

			    Setup multiple selections:

			    Input : - a subset(s) along a dimension (selection) of `interesting` identifiers.
			            - Predefined subsets (categories) along the same dimension.

			    1.) The co-occurrence matrix of interesting identifiers and categories is made.
			    2.) The variables are scaled to represent the relative frequencies.

			    """

			    def run(self, X, Ckegg):
			        # Placeholder: the analysis is not implemented yet.
			        # `self` is required so that instance.run(X, Ckegg) can be called.
			        pass
Workflow updates 2008-02-29 17:23:57 +01:00

			class CASingleSelDouble(workflow.Function):
			    """Placeholder workflow function; run() is not implemented."""

			    def run(self, X, Ckegg):
			        # `self` is required so that instance.run(X, Ckegg) can be called.
			        pass