laydi/workflows/demo.py

import sys,os
import os.path
import webbrowser
import cPickle

import scipy
import networkx as nx

from fluents import logger,plots,workflow,dataset,main
from fluents.lib import blmfuncs,nx_utils,cx_utils

import gobrowser


class SmallTestWorkflow(workflow.Workflow):
    name = 'Demo'
    ident = 'demo'
    description = 'A small test workflow for gene expression analysis.'
    chip = 'hgu'
    def __init__(self):
        workflow.Workflow.__init__(self)        

        # DATA IMPORT
        load = workflow.Stage('load', 'Data')

        load_small = LoadDataFunction('load-small', 'Small', self)
        load.add_function(load_small)

        load_medium = LoadDataFunction('load-geneid', 'GeneID', self, 'geneid')
        load.add_function(load_medium)
        
        load_medium = LoadDataFunction('load-full', 'FullChip', self, 'full')
        load.add_function(load_medium)
        
        self.add_stage(load)
        
        # NETWORK PREPROCESSING
        #net = workflow.Stage('net', 'Network integration')
        #net.add_function(DiffKernelFunction())
        #net.add_function(ModKernelFunction())
        #self.add_stage(net)
        
        # Models
        model = workflow.Stage('models', 'Models')
        model.add_function(blmfuncs.PCA())
        model.add_function(blmfuncs.PLS())
        model.add_function(SAM())
        self.add_stage(model)
        
        query = workflow.Stage('query', 'Gene Query')
        query.add_function(NCBIQuery())
        query.add_function(KEGGQuery())
        query.add_function(SubgraphQuery())
        self.add_stage(query)
        
        # Background knowledge
        go = workflow.Stage('go', 'Gene Ontology')
        go.add_function(gobrowser.PlotDagFunction())
        go.add_function(GoEnrichment())
        go.add_function(GoEnrichmentCond())
        go.add_function(MapGO2Gene())
        go.add_function(MapGene2GO())
        self.add_stage(go)

        # EXTRA PLOTS
        #plt = workflow.Stage('net', 'Network')
        #plt.add_function(nx_analyser.KeggNetworkAnalyser())
        #self.add_stage(plt)
        
        logger.log('debug', 'Small test workflow is now active')


class LoadDataFunction(workflow.Function):
    """Loads all datasets in a given directory."""
    def __init__(self, ident, label, wf, dir=''):
        workflow.Function.__init__(self, ident, label)
        self._dir = dir
        self._wf = wf

    def run(self):
        path = os.path.join(main.options.datadir, self._wf.ident, self._dir)
        files = os.listdir(path)
        out = []
        for fn in files:
            if fn.endswith('.ftsv'):
                out.append(dataset.read_ftsv(os.path.join(path, fn)))
        return out


##### WORKFLOW SPECIFIC FUNCTIONS ######


class SAM(workflow.Function):
    def __init__(self, id='sam', name='SAM'):
        workflow.Function.__init__(self, id, name)
        
    def run(self, x, y):        
        n_iter = 50 #B
        alpha = 0.01 #cut off on qvals
        
        ###############

        # Main function call

        # setup prelimenaries
        import rpy
        rpy.r.library("siggenes")
        rpy.r.library("multtest")
        
        cl = scipy.dot(y.asarray(), scipy.diag(scipy.arange(y.shape[1]))).sum(1)
        data = x.asarray().T
        sam = rpy.r.sam(data, cl=cl, B=n_iter, var_equal=False,med=False,s0=scipy.nan,rand=scipy.nan)
        qvals = scipy.asarray(rpy.r.slot(sam, "p.value"))
        pvals = scipy.asarray(rpy.r.slot(sam, "q.value"))
        
        sam_index = (qvals<alpha).nonzero()[0]

        # Update selection object
        dim_name = x.get_dim_name(1)
        sam_selection = x.get_identifiers(dim_name, indices=sam_index)
        main.project.set_selection(dim_name, sam_selection)
        
        sel = dataset.Selection('SAM selection')
        sel.select(dim_name, sam_selection)
        logger.log('notice','Number of significant varibles (SAM): %s' %len(sam_selection))

        # ## OUTPUT ###
        xcolname = x.get_dim_name(1) # genes
        x_col_ids = [xcolname, x.get_identifiers(xcolname, sorted=True)]
        sing_id = ['_john', ['0']] #singleton
        D_qvals = dataset.Dataset(qvals, (x_col_ids, sing_id), name='q_vals')
        D_pvals = dataset.Dataset(pvals, (x_col_ids, sing_id), name='p_vals')
        
        # plots
        s_indx = qvals.flatten().argsort()
        s_ids = [x_col_ids[0],[x_col_ids[1][i] for i in s_indx]]
        xindex = scipy.arange(len(qvals))
        qvals_s = qvals.take(s_indx)
        D_qs = dataset.Dataset(qvals_s, (s_ids, sing_id), name="sorted qvals")
        Dind = dataset.Dataset(xindex, (s_ids, sing_id), name="dum")
        st = plots.ScatterPlot(D_qs, Dind, 'gene_ids', '_john', '0', '0', s=10, name='SAM qvals')
        
        return [D_qvals, D_pvals, D_qs, st, sel]
        

class DiffKernelFunction(workflow.Function):
    def __init__(self):
        workflow.Function.__init__(self, 'diffkernel', 'Diffusion')

    def run(self, x, a):
        """x is gene expression data, a is the network.
        """
        #sanity check:
        g = a.asnetworkx()
        genes = x.get_identifiers(x.get_dim_name(1), sorted=True)
        W = nx.adj_matrix(g, nodelist=genes)
        X = x.asarray()
        Xc, mn_x = cx_utils.mat_center(X, ret_mn=True)
        out = []
        alpha = 1.0
        beta = 1.0
        K = nx_utils.K_diffusion(W, alpha=alpha, beta=beta,normalised=True)
        Xp = scipy.dot(Xc, K) + mn_x
        # dataset
        row_ids = (x.get_dim_name(0),
                   x.get_identifiers(x.get_dim_name(0),
                                     sorted=True))
        col_ids = (x.get_dim_name(1),
                   x.get_identifiers(x.get_dim_name(1),
                                     sorted=True))
        
        xout = dataset.Dataset(Xp,
                               (row_ids, col_ids),
                               name=x.get_name()+'_diff'+str(alpha))
        out.append(xout)
        
        return out


class ModKernelFunction(workflow.Function):
    def __init__(self):
        workflow.Function.__init__(self, 'mokernel', 'Modularity')

    def run(self,x,a):
        X = x.asarray()
        g = a.asnetworkx()
        genes = x.get_identifiers(x.get_dim_name(1), sorted=True)
        W = nx.adj_matrix(g, nodelist=genes)
        out=[]
        alpha=.2
        Xc,mn_x = cx_utils.mat_center(X, ret_mn=True)
        K = nx_utils.K_modularity(W, alpha=alpha)
        Xp = scipy.dot(Xc, K)
        Xp = Xp + mn_x
        
        # dataset
        row_ids = (x.get_dim_name(0),
                   x.get_identifiers(x.get_dim_name(0),
                                     sorted=True))
        col_ids = (x.get_dim_name(1),
                   x.get_identifiers(x.get_dim_name(1),
                                     sorted=True))
        xout = dataset.Dataset(Xp,
                               (row_ids,col_ids),
                               name=x.get_name()+'_mod'+str(alpha))
        out.append(xout)
        return out


class NCBIQuery(workflow.Function):
    def __init__(self, gene_id_name='gene_ids'):
        self._gene_id_name = gene_id_name
        workflow.Function.__init__(self, 'query', 'NCBI')

    def run(self):
        selection = main.project.get_selection()
        if not selection.has_key(self._gene_id_name):
            logger.log("notice", "Expected gene ids: %s, but got: %s" %(self._gene_id_name, selection.keys()))
            return None
        if len(selection[self._gene_id_name])==0:
            logger.log("notice", "No selected genes to query")
            return None
        
        base = 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?'
        options = {r'&db=' : 'gene',
                   r'&cmd=' : 'retrieve',
                   r'&dopt=' : 'full_report'}
        gene_str = ''.join([gene + "+" for gene in selection[self._gene_id_name]])
        options[r'&list_uids='] = gene_str[:-1]
        opt_str = ''.join([key+value for key,value in options.items()])
        web_str = base + opt_str
        webbrowser.open(web_str)


class KEGGQuery(workflow.Function):
    def __init__(self, org='hsa', gene_id_name='gene_ids'):
        self._org=org
        self._gene_id_name = gene_id_name
        workflow.Function.__init__(self, 'query', 'KEGG')

    def run(self, selection):
        if not selection.has_key(self._gene_id_name):
            logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._gene_id_name, selection.keys()))
            return None
        if len(selection[self._gene_id_name])==0:
            logger.log("notice", "No selected genes to query")
            return None
        
        base = r'http://www.genome.jp/dbget-bin/www_bget?'
        gene_str = ''.join([gene + "+" for gene in selection[self._gene_id_name]])
        gene_str = gene_str[:-1]
        gene_str = self._org + "+" + gene_str
        web_str = base + gene_str
        webbrowser.open(web_str)


class GoEnrichment(workflow.Function):
    def __init__(self):
        workflow.Function.__init__(self, 'goenrich', 'Go Enrichment')

    def run(self, data):
        import rpy
        rpy.r.library("GOstats")
        
        # Get universe
        # Here, we are using a defined dataset to represent the universe
        if not 'gene_ids' in data:
            logger.log('notice', 'No dimension called [gene_ids] in dataset: %s' %data.get_name())
            return
        universe = list(data.get_identifiers('gene_ids'))
        logger.log('notice', 'Universe consists of %s gene ids from %s' %(len(universe), data.get_name()))
        # Get current selection and validate
        curr_sel = main.project.get_selection()
        selected_genes = list(curr_sel['gene_ids'])
        if len(selected_genes)==0:
            logger.log('notice', 'This function needs a current selection!')
            return
        
        # Hypergeometric parameter object
        pval_cutoff = 0.9999
        cond = False
        test_direction = 'over'
        params = rpy.r.new("GOHyperGParams",
                           geneIds=selected_genes,
                           annotation="hgu133a",
                           ontology="BP",
                           pvalueCutoff=pval_cutoff,
                           conditional=cond,
                           testDirection=test_direction
                           )
        # run test
        # result.keys(): ['Count', 'Term', 'OddsRatio', 'Pvalue', 'ExpCount', 'GOBPID', 'Size']
        result = rpy.r.summary(rpy.r.hyperGTest(params))
        
        # dataset
        terms = result['GOBPID']
        pvals = scipy.log(scipy.asarray(result['Pvalue']))
        row_ids = ('go-terms', terms)
        col_ids = ('_john', ['_doe'])
        
        xout = dataset.Dataset(pvals,
                               (row_ids, col_ids),
                               name='P values (enrichment)')
        return [xout]


class GoEnrichmentCond(workflow.Function):
    """ Enrichment conditioned on dag structure."""
    def __init__(self):
        workflow.Function.__init__(self, 'goenrich', 'Go Cond. Enrich.')

    def run(self, data):
        import rpy
        rpy.r.library("GOstats")
        
        # Get universe
        # Here, we are using a defined dataset to represent the universe
        if not 'gene_ids' in data:
            logger.log('notice', 'No dimension called [gene_ids] in dataset: %s', data.get_name())
            return
        universe = list(data.get_identifiers('gene_ids'))
        logger.log('notice', 'Universe consists of %s gene ids from %s' %(len(universe), data.get_name()))
        # Get current selection and validate
        curr_sel = main.project.get_selection()
        selected_genes = list(curr_sel['gene_ids'])
        if len(selected_genes)==0:
            logger.log('notice', 'This function needs a current selection!')
            return
        
        # Hypergeometric parameter object
        pval_cutoff = 1
        cond = True
        test_direction = 'over'
        params = rpy.r.new("GOHyperGParams",
                           geneIds=selected_genes,
                           annotation="hgu133a",
                           ontology="BP",
                           pvalueCutoff=pval_cutoff,
                           conditional=cond,
                           testDirection=test_direction
                           )
        # run test
        # result.keys(): ['Count', 'Term', 'OddsRatio', 'Pvalue', 'ExpCount', 'GOBPID', 'Size']
        result = rpy.r.summary(rpy.r.hyperGTest(params))
        
        # dataset
        terms = result['GOBPID']
        pvals = scipy.log(scipy.asarray(result['Pvalue']))
        row_ids = ('go-terms', terms)
        col_ids = ('_john', ['_doe'])
        
        xout = dataset.Dataset(pvals,
                               (row_ids, col_ids),
                               name='P values (enrichment)')
        return [xout]


class MapGene2GO(workflow.Function):
    def __init__(self, ont='bp', gene_id_name='gene_ids'):
        self._ont = ont
        self._gene_id_name = gene_id_name
        workflow.Function.__init__(self, 'gene2go', 'gene->GO')
        # load data at init
        try:
            fname = "/home/flatberg/fluents/data/gene2go.pcl"
            self._gene2go = cPickle.load(open(fname))
        except:
            logger.log("notice", "could not load mapping")
        
    def run(self):
        selection = main.project.get_selection()
        if not selection.has_key(self._gene_id_name):
            logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._gene_id_name, selection.keys()))
            return None
        if len(selection[self._gene_id_name])==0:
            logger.log("notice", "No selected genes to query")
            return None

        gene_ids = selection[self._gene_id_name]
        go_ids = set()
        for gene in gene_ids:
            go_ids_new = self._gene2go.get(gene, [])
            if not go_ids_new:
                logger.log("notice", "Could not find any goterms for %s" %gene)
            go_ids.update(self._gene2go.get(gene, []))
        main.project.set_selection('go-terms', go_ids)
        logger.log("notice", "GO terms updated")


class MapGO2Gene(workflow.Function):
    def __init__(self, ont='bp', gene_id_name='go-terms'):
        self._ont = ont
        self._gene_id_name = gene_id_name
        workflow.Function.__init__(self, 'go2gene', 'GO->gene')
        # load data at init
        try:
            fname = "/home/flatberg/fluents/data/go2gene.pcl"
            self._go2gene = cPickle.load(open(fname))
        except:
            logger.log("notice", "could not load mapping")
        
    def run(self):
        selection = main.project.get_selection()
        if not selection.has_key(self._gene_id_name):
            logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._gene_id_name, selection.keys()))
            return None
        if len(selection[self._gene_id_name])==0:
            logger.log("notice", "No selected genes to query")
            return None

        go_ids = selection[self._gene_id_name]
        gene_ids = set()
        for go in go_ids:
            if not self._go2gene.get(go,[]):
                logger.log("notice", "Could not find any gene ids for %s" %go)
            gene_ids.update(self._go2gene.get(go,[]))
        main.project.set_selection('gene_ids', gene_ids)
        logger.log("notice", "GO terms updated")


class SubgraphQuery(workflow.Function):
    def __init__(self, graph='kegg', dim='gene_ids'):
        self._gtype = graph
        self._dim = dim
        
        workflow.Function.__init__(self, 'keggraph', 'KeggGraph')
        
    def run(self, Dw, DA):
        max_edge_ratio = .20
        max_cov_ratio = .25
        neigh_type = 'cov'
        neigh_type = 'cosine'
        #neigh_type = 'heat'
        # 1.) Operate on a subset selection
        selection = main.project.get_selection()
        if not selection.has_key(self._dim):
            logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._dim, selection.keys()))
            return None
        if len(selection[self._dim]) == 0:
            logger.log("notice", "No selected genes to query, using all")
            Dw = Dw.subdata(self._dim, Dw.get_identifiers(self._dim)[:100])
        else:
            Dw = Dw.subdata(self._dim, selection[self._dim])

        # 2.) Pairwise goodness in loading space
        indices = self._pairsim(Dw)
        print indices
        print indices.shape
        
        idents1 = Dw.get_identifiers(self._dim, indices[:,0])
        idents2 = Dw.get_identifiers(self._dim, indices[:,1])
        idents = zip(idents1, idents2)
        
        # 3.) Identify close subgraphs
        
        # 4.) Rank subgraphs
        
        main.project.set_selection('gene_ids', idents1)
        #main.project.set_sele
        logger.log("notice", "Gene ids updated")
        #plt = GraphQueryScatterPlot(SS, Dw)
        #return [plt]

    def _pairsim(self, Dw, ptype='cosine',cut_rat=.2):
        """Returns close pairs across given dim.
        ptype : ['cov', 'correlation', 'cosine', 'heat', 'euclidean']
        """
        W = Dw.asarray()
        if ptype == 'cov':
            W -= W.mean(1)[:,scipy.newaxis]
            wcov = scipy.dot(W, W.T)/(W.shape[1]-1)
            wcov_min = wcov.max()*cut_rat
            indices = scipy.asarray(scipy.where(wcov >= wcov_min)).T
        elif ptype == 'heat':
            from hcluster import pdist, squareform
            D = squareform(pdist(W))
            H = exp(-D)
            h_min = H.max()*cut_rat
            indices = scipy.asarray(scipy.where(H >= h_min)).T
        elif ptype in ['euclidean', 'cosine', 'correlation']:
            from hcluster import pdist, squareform
            D = squareform(pdist(W), ptype)
            d_min = D.max()*cut_rat
            indices = []
            for i in range(D.shape[0]):
                for j in range(i, D.shape[0]):
                    if D[i,j] <= d_min:
                        indices.append([i,j])
            print "W"
            print W.shape
            indices = scipy.asarray(indices)
            
        else:
            raise ValueError("ptype: %s  not valid" %ptype)
        return indices

    def _subgraphsim(self, Dw, idents, stype='dijkstra'):
        # subgraph
        Gw = nx.XGraph()
        for edge in idents:
            e = G.get_edge(edge)
            Gw.add_edge()
        if stype == 'dijkstra':
            pass
        
class GraphQueryScatterPlot(plots.ScatterPlot):
    def __init__(self, S, Dw, *args, **kw):
        self.S = S
        self.Dw = Dw
        id_dim, sel_dim = Dw.get_dim_name()
        self._dim = id_dim
        id_1, = Dw.get_identifiers(sel_dim, [0])
        id_2, = Dw.get_identifiers(sel_dim, [1])
        print id_1
        print id_2

        plots.ScatterPlot.__init__(self, Dw, Dw, id_dim, sel_dim, id_1, id_2, c='g', s=50,name="Kegg test", alpha=.5)
        self.overlay_subgraphs()
    
    def overlay_subgraphs(self):
        for subgraph in self.S:
            nodes = subgraph.nodes()
            print self._dim
            
            sub_dw = self.Dw.subdata(self._dim, nodes)
            nodes = sub_dw.get_identifiers(self._dim, sorted=True)
            xy = sub_dw.asarray()[:,:2]
            pos = {}
            for i, node in enumerate(nodes):
                pos[node] = xy[i]
            nx.draw_networkx_nodes(subgraph, pos, node_size=200, ax=self.axes, zorder=3)
            nx.draw_networkx_edges(subgraph, pos, ax=self.axes, zorder=3)
            
            
    def on_update(self, selection, graph):
        g = nx.subgraph(graph, selection)
        S = nx.connected_components_subgraphs(g)
    
    def _subgraph_score(self, subgraph):
        pass
Demo workflow 2008-02-05 12:34:14 +01:00			`import sys,os`
			`import os.path`
			`import webbrowser`
			`import cPickle`

			`import scipy`
			`import networkx as nx`

			`from fluents import logger,plots,workflow,dataset,main`
			`from fluents.lib import blmfuncs,nx_utils,cx_utils`

			`import gobrowser`


			`class SmallTestWorkflow(workflow.Workflow):`
			`name = 'Demo'`
			`ident = 'demo'`
			`description = 'A small test workflow for gene expression analysis.'`
			`chip = 'hgu'`
			`def __init__(self):`
			`workflow.Workflow.__init__(self)`

			`# DATA IMPORT`
			`load = workflow.Stage('load', 'Data')`

			`load_small = LoadDataFunction('load-small', 'Small', self)`
			`load.add_function(load_small)`

			`load_medium = LoadDataFunction('load-geneid', 'GeneID', self, 'geneid')`
			`load.add_function(load_medium)`

			`load_medium = LoadDataFunction('load-full', 'FullChip', self, 'full')`
			`load.add_function(load_medium)`

			`self.add_stage(load)`

			`# NETWORK PREPROCESSING`
			`#net = workflow.Stage('net', 'Network integration')`
			`#net.add_function(DiffKernelFunction())`
			`#net.add_function(ModKernelFunction())`
			`#self.add_stage(net)`

			`# Models`
			`model = workflow.Stage('models', 'Models')`
			`model.add_function(blmfuncs.PCA())`
			`model.add_function(blmfuncs.PLS())`
			`model.add_function(SAM())`
			`self.add_stage(model)`

			`query = workflow.Stage('query', 'Gene Query')`
			`query.add_function(NCBIQuery())`
			`query.add_function(KEGGQuery())`
			`query.add_function(SubgraphQuery())`
			`self.add_stage(query)`

			`# Background knowledge`
			`go = workflow.Stage('go', 'Gene Ontology')`
			`go.add_function(gobrowser.PlotDagFunction())`
			`go.add_function(GoEnrichment())`
			`go.add_function(GoEnrichmentCond())`
			`go.add_function(MapGO2Gene())`
			`go.add_function(MapGene2GO())`
			`self.add_stage(go)`

			`# EXTRA PLOTS`
			`#plt = workflow.Stage('net', 'Network')`
			`#plt.add_function(nx_analyser.KeggNetworkAnalyser())`
			`#self.add_stage(plt)`

			`logger.log('debug', 'Small test workflow is now active')`


			`class LoadDataFunction(workflow.Function):`
			`"""Loads all datasets in a given directory."""`
			`def __init__(self, ident, label, wf, dir=''):`
			`workflow.Function.__init__(self, ident, label)`
			`self._dir = dir`
			`self._wf = wf`

			`def run(self):`
			`path = os.path.join(main.options.datadir, self._wf.ident, self._dir)`
			`files = os.listdir(path)`
			`out = []`
			`for fn in files:`
			`if fn.endswith('.ftsv'):`
			`out.append(dataset.read_ftsv(os.path.join(path, fn)))`
			`return out`



			`##### WORKFLOW SPECIFIC FUNCTIONS ######`


			`class SAM(workflow.Function):`
			`def __init__(self, id='sam', name='SAM'):`
			`workflow.Function.__init__(self, id, name)`

			`def run(self, x, y):`
			`n_iter = 50 #B`
			`alpha = 0.01 #cut off on qvals`

			`###############`

			`# Main function call`

			`# setup prelimenaries`
			`import rpy`
			`rpy.r.library("siggenes")`
			`rpy.r.library("multtest")`

			`cl = scipy.dot(y.asarray(), scipy.diag(scipy.arange(y.shape[1]))).sum(1)`
			`data = x.asarray().T`
			`sam = rpy.r.sam(data, cl=cl, B=n_iter, var_equal=False,med=False,s0=scipy.nan,rand=scipy.nan)`
			`qvals = scipy.asarray(rpy.r.slot(sam, "p.value"))`
			`pvals = scipy.asarray(rpy.r.slot(sam, "q.value"))`

			`sam_index = (qvals<alpha).nonzero()[0]`

			`# Update selection object`
			`dim_name = x.get_dim_name(1)`
			`sam_selection = x.get_identifiers(dim_name, indices=sam_index)`
			`main.project.set_selection(dim_name, sam_selection)`

			`sel = dataset.Selection('SAM selection')`
			`sel.select(dim_name, sam_selection)`
			`logger.log('notice','Number of significant varibles (SAM): %s' %len(sam_selection))`

			`# ## OUTPUT ###`
			`xcolname = x.get_dim_name(1) # genes`
			`x_col_ids = [xcolname, x.get_identifiers(xcolname, sorted=True)]`
			`sing_id = ['_john', ['0']] #singleton`
			`D_qvals = dataset.Dataset(qvals, (x_col_ids, sing_id), name='q_vals')`
			`D_pvals = dataset.Dataset(pvals, (x_col_ids, sing_id), name='p_vals')`

			`# plots`
			`s_indx = qvals.flatten().argsort()`
			`s_ids = [x_col_ids[0],[x_col_ids[1][i] for i in s_indx]]`
			`xindex = scipy.arange(len(qvals))`
			`qvals_s = qvals.take(s_indx)`
			`D_qs = dataset.Dataset(qvals_s, (s_ids, sing_id), name="sorted qvals")`
			`Dind = dataset.Dataset(xindex, (s_ids, sing_id), name="dum")`
			`st = plots.ScatterPlot(D_qs, Dind, 'gene_ids', '_john', '0', '0', s=10, name='SAM qvals')`

			`return [D_qvals, D_pvals, D_qs, st, sel]`


			`class DiffKernelFunction(workflow.Function):`
			`def __init__(self):`
			`workflow.Function.__init__(self, 'diffkernel', 'Diffusion')`

			`def run(self, x, a):`
			`"""x is gene expression data, a is the network.`
			`"""`
			`#sanity check:`
			`g = a.asnetworkx()`
			`genes = x.get_identifiers(x.get_dim_name(1), sorted=True)`
			`W = nx.adj_matrix(g, nodelist=genes)`
			`X = x.asarray()`
			`Xc, mn_x = cx_utils.mat_center(X, ret_mn=True)`
			`out = []`
			`alpha = 1.0`
			`beta = 1.0`
			`K = nx_utils.K_diffusion(W, alpha=alpha, beta=beta,normalised=True)`
			`Xp = scipy.dot(Xc, K) + mn_x`
			`# dataset`
			`row_ids = (x.get_dim_name(0),`
			`x.get_identifiers(x.get_dim_name(0),`
			`sorted=True))`
			`col_ids = (x.get_dim_name(1),`
			`x.get_identifiers(x.get_dim_name(1),`
			`sorted=True))`

			`xout = dataset.Dataset(Xp,`
			`(row_ids, col_ids),`
			`name=x.get_name()+'_diff'+str(alpha))`
			`out.append(xout)`

			`return out`


			`class ModKernelFunction(workflow.Function):`
			`def __init__(self):`
			`workflow.Function.__init__(self, 'mokernel', 'Modularity')`

			`def run(self,x,a):`
			`X = x.asarray()`
			`g = a.asnetworkx()`
			`genes = x.get_identifiers(x.get_dim_name(1), sorted=True)`
			`W = nx.adj_matrix(g, nodelist=genes)`
			`out=[]`
			`alpha=.2`
			`Xc,mn_x = cx_utils.mat_center(X, ret_mn=True)`
			`K = nx_utils.K_modularity(W, alpha=alpha)`
			`Xp = scipy.dot(Xc, K)`
			`Xp = Xp + mn_x`

			`# dataset`
			`row_ids = (x.get_dim_name(0),`
			`x.get_identifiers(x.get_dim_name(0),`
			`sorted=True))`
			`col_ids = (x.get_dim_name(1),`
			`x.get_identifiers(x.get_dim_name(1),`
			`sorted=True))`
			`xout = dataset.Dataset(Xp,`
			`(row_ids,col_ids),`
			`name=x.get_name()+'_mod'+str(alpha))`
			`out.append(xout)`
			`return out`


			`class NCBIQuery(workflow.Function):`
			`def __init__(self, gene_id_name='gene_ids'):`
			`self._gene_id_name = gene_id_name`
			`workflow.Function.__init__(self, 'query', 'NCBI')`

			`def run(self):`
			`selection = main.project.get_selection()`
			`if not selection.has_key(self._gene_id_name):`
			`logger.log("notice", "Expected gene ids: %s, but got: %s" %(self._gene_id_name, selection.keys()))`
			`return None`
			`if len(selection[self._gene_id_name])==0:`
			`logger.log("notice", "No selected genes to query")`
			`return None`

			`base = 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?'`
			`options = {r'&db=' : 'gene',`
			`r'&cmd=' : 'retrieve',`
			`r'&dopt=' : 'full_report'}`
			`gene_str = ''.join([gene + "+" for gene in selection[self._gene_id_name]])`
			`options[r'&list_uids='] = gene_str[:-1]`
			`opt_str = ''.join([key+value for key,value in options.items()])`
			`web_str = base + opt_str`
			`webbrowser.open(web_str)`


			`class KEGGQuery(workflow.Function):`
			`def __init__(self, org='hsa', gene_id_name='gene_ids'):`
			`self._org=org`
			`self._gene_id_name = gene_id_name`
			`workflow.Function.__init__(self, 'query', 'KEGG')`

			`def run(self, selection):`
			`if not selection.has_key(self._gene_id_name):`
			`logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._gene_id_name, selection.keys()))`
			`return None`
			`if len(selection[self._gene_id_name])==0:`
			`logger.log("notice", "No selected genes to query")`
			`return None`

			`base = r'http://www.genome.jp/dbget-bin/www_bget?'`
			`gene_str = ''.join([gene + "+" for gene in selection[self._gene_id_name]])`
			`gene_str = gene_str[:-1]`
			`gene_str = self._org + "+" + gene_str`
			`web_str = base + gene_str`
			`webbrowser.open(web_str)`


			`class GoEnrichment(workflow.Function):`
			`def __init__(self):`
			`workflow.Function.__init__(self, 'goenrich', 'Go Enrichment')`

			`def run(self, data):`
			`import rpy`
			`rpy.r.library("GOstats")`

			`# Get universe`
			`# Here, we are using a defined dataset to represent the universe`
			`if not 'gene_ids' in data:`
			`logger.log('notice', 'No dimension called [gene_ids] in dataset: %s' %data.get_name())`
			`return`
			`universe = list(data.get_identifiers('gene_ids'))`
			`logger.log('notice', 'Universe consists of %s gene ids from %s' %(len(universe), data.get_name()))`
			`# Get current selection and validate`
			`curr_sel = main.project.get_selection()`
			`selected_genes = list(curr_sel['gene_ids'])`
			`if len(selected_genes)==0:`
			`logger.log('notice', 'This function needs a current selection!')`
			`return`

			`# Hypergeometric parameter object`
			`pval_cutoff = 0.9999`
			`cond = False`
			`test_direction = 'over'`
			`params = rpy.r.new("GOHyperGParams",`
			`geneIds=selected_genes,`
			`annotation="hgu133a",`
			`ontology="BP",`
			`pvalueCutoff=pval_cutoff,`
			`conditional=cond,`
			`testDirection=test_direction`
			`)`
			`# run test`
			`# result.keys(): ['Count', 'Term', 'OddsRatio', 'Pvalue', 'ExpCount', 'GOBPID', 'Size']`
			`result = rpy.r.summary(rpy.r.hyperGTest(params))`

			`# dataset`
			`terms = result['GOBPID']`
			`pvals = scipy.log(scipy.asarray(result['Pvalue']))`
			`row_ids = ('go-terms', terms)`
			`col_ids = ('_john', ['_doe'])`

			`xout = dataset.Dataset(pvals,`
			`(row_ids, col_ids),`
			`name='P values (enrichment)')`
			`return [xout]`


			`class GoEnrichmentCond(workflow.Function):`
			`""" Enrichment conditioned on dag structure."""`
			`def __init__(self):`
			`workflow.Function.__init__(self, 'goenrich', 'Go Cond. Enrich.')`

			`def run(self, data):`
			`import rpy`
			`rpy.r.library("GOstats")`

			`# Get universe`
			`# Here, we are using a defined dataset to represent the universe`
			`if not 'gene_ids' in data:`
			`logger.log('notice', 'No dimension called [gene_ids] in dataset: %s', data.get_name())`
			`return`
			`universe = list(data.get_identifiers('gene_ids'))`
			`logger.log('notice', 'Universe consists of %s gene ids from %s' %(len(universe), data.get_name()))`
			`# Get current selection and validate`
			`curr_sel = main.project.get_selection()`
			`selected_genes = list(curr_sel['gene_ids'])`
			`if len(selected_genes)==0:`
			`logger.log('notice', 'This function needs a current selection!')`
			`return`

			`# Hypergeometric parameter object`
			`pval_cutoff = 1`
			`cond = True`
			`test_direction = 'over'`
			`params = rpy.r.new("GOHyperGParams",`
			`geneIds=selected_genes,`
			`annotation="hgu133a",`
			`ontology="BP",`
			`pvalueCutoff=pval_cutoff,`
			`conditional=cond,`
			`testDirection=test_direction`
			`)`
			`# run test`
			`# result.keys(): ['Count', 'Term', 'OddsRatio', 'Pvalue', 'ExpCount', 'GOBPID', 'Size']`
			`result = rpy.r.summary(rpy.r.hyperGTest(params))`

			`# dataset`
			`terms = result['GOBPID']`
			`pvals = scipy.log(scipy.asarray(result['Pvalue']))`
			`row_ids = ('go-terms', terms)`
			`col_ids = ('_john', ['_doe'])`

			`xout = dataset.Dataset(pvals,`
			`(row_ids, col_ids),`
			`name='P values (enrichment)')`
			`return [xout]`


			`class MapGene2GO(workflow.Function):`
			`def __init__(self, ont='bp', gene_id_name='gene_ids'):`
			`self._ont = ont`
			`self._gene_id_name = gene_id_name`
			`workflow.Function.__init__(self, 'gene2go', 'gene->GO')`
			`# load data at init`
			`try:`
			`fname = "/home/flatberg/fluents/data/gene2go.pcl"`
			`self._gene2go = cPickle.load(open(fname))`
			`except:`
			`logger.log("notice", "could not load mapping")`

			`def run(self):`
			`selection = main.project.get_selection()`
			`if not selection.has_key(self._gene_id_name):`
			`logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._gene_id_name, selection.keys()))`
			`return None`
			`if len(selection[self._gene_id_name])==0:`
			`logger.log("notice", "No selected genes to query")`
			`return None`

			`gene_ids = selection[self._gene_id_name]`
			`go_ids = set()`
			`for gene in gene_ids:`
			`go_ids_new = self._gene2go.get(gene, [])`
			`if not go_ids_new:`
			`logger.log("notice", "Could not find any goterms for %s" %gene)`
			`go_ids.update(self._gene2go.get(gene, []))`
			`main.project.set_selection('go-terms', go_ids)`
			`logger.log("notice", "GO terms updated")`


			`class MapGO2Gene(workflow.Function):`
			`def __init__(self, ont='bp', gene_id_name='go-terms'):`
			`self._ont = ont`
			`self._gene_id_name = gene_id_name`
			`workflow.Function.__init__(self, 'go2gene', 'GO->gene')`
			`# load data at init`
			`try:`
			`fname = "/home/flatberg/fluents/data/go2gene.pcl"`
			`self._go2gene = cPickle.load(open(fname))`
			`except:`
			`logger.log("notice", "could not load mapping")`

			`def run(self):`
			`selection = main.project.get_selection()`
			`if not selection.has_key(self._gene_id_name):`
			`logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._gene_id_name, selection.keys()))`
			`return None`
			`if len(selection[self._gene_id_name])==0:`
			`logger.log("notice", "No selected genes to query")`
			`return None`

			`go_ids = selection[self._gene_id_name]`
			`gene_ids = set()`
			`for go in go_ids:`
			`if not self._go2gene.get(go,[]):`
			`logger.log("notice", "Could not find any gene ids for %s" %go)`
			`gene_ids.update(self._go2gene.get(go,[]))`
			`main.project.set_selection('gene_ids', gene_ids)`
			`logger.log("notice", "GO terms updated")`


			`class SubgraphQuery(workflow.Function):`
			`def __init__(self, graph='kegg', dim='gene_ids'):`
			`self._gtype = graph`
			`self._dim = dim`

			`workflow.Function.__init__(self, 'keggraph', 'KeggGraph')`

			`def run(self, Dw, DA):`
			`max_edge_ratio = .20`
			`max_cov_ratio = .25`
			`neigh_type = 'cov'`
			`neigh_type = 'cosine'`
			`#neigh_type = 'heat'`
			`# 1.) Operate on a subset selection`
			`selection = main.project.get_selection()`
			`if not selection.has_key(self._dim):`
			`logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._dim, selection.keys()))`
			`return None`
			`if len(selection[self._dim]) == 0:`
cc 2008-02-06 10:42:46 +01:00			`logger.log("notice", "No selected genes to query, using all")`
			`Dw = Dw.subdata(self._dim, Dw.get_identifiers(self._dim)[:100])`
			`else:`
			`Dw = Dw.subdata(self._dim, selection[self._dim])`
Demo workflow 2008-02-05 12:34:14 +01:00
			`# 2.) Pairwise goodness in loading space`
cc 2008-02-06 10:42:46 +01:00			`indices = self._pairsim(Dw)`
			`print indices`
			`print indices.shape`

			`idents1 = Dw.get_identifiers(self._dim, indices[:,0])`
			`idents2 = Dw.get_identifiers(self._dim, indices[:,1])`
			`idents = zip(idents1, idents2)`

			`# 3.) Identify close subgraphs`

			`# 4.) Rank subgraphs`

			`main.project.set_selection('gene_ids', idents1)`
			`#main.project.set_sele`
			`logger.log("notice", "Gene ids updated")`
			`#plt = GraphQueryScatterPlot(SS, Dw)`
			`#return [plt]`

			`def _pairsim(self, Dw, ptype='cosine',cut_rat=.2):`
			`"""Returns close pairs across given dim.`
			`ptype : ['cov', 'correlation', 'cosine', 'heat', 'euclidean']`
			`"""`
Demo workflow 2008-02-05 12:34:14 +01:00			`W = Dw.asarray()`
cc 2008-02-06 10:42:46 +01:00			`if ptype == 'cov':`
Demo workflow 2008-02-05 12:34:14 +01:00			`W -= W.mean(1)[:,scipy.newaxis]`
			`wcov = scipy.dot(W, W.T)/(W.shape[1]-1)`
cc 2008-02-06 10:42:46 +01:00			`wcov_min = wcov.max()*cut_rat`
			`indices = scipy.asarray(scipy.where(wcov >= wcov_min)).T`
			`elif ptype == 'heat':`
			`from hcluster import pdist, squareform`
			`D = squareform(pdist(W))`
			`H = exp(-D)`
			`h_min = H.max()*cut_rat`
			`indices = scipy.asarray(scipy.where(H >= h_min)).T`
			`elif ptype in ['euclidean', 'cosine', 'correlation']:`
			`from hcluster import pdist, squareform`
			`D = squareform(pdist(W), ptype)`
			`d_min = D.max()*cut_rat`
			`indices = []`
			`for i in range(D.shape[0]):`
			`for j in range(i, D.shape[0]):`
			`if D[i,j] <= d_min:`
			`indices.append([i,j])`
			`print "W"`
			`print W.shape`
			`indices = scipy.asarray(indices)`

			`else:`
			`raise ValueError("ptype: %s not valid" %ptype)`
			`return indices`

			`def _subgraphsim(self, Dw, idents, stype='dijkstra'):`
			`# subgraph`
			`Gw = nx.XGraph()`
			`for edge in idents:`
			`e = G.get_edge(edge)`
			`Gw.add_edge()`
			`if stype == 'dijkstra':`
			`pass`
Demo workflow 2008-02-05 12:34:14 +01:00
			`class GraphQueryScatterPlot(plots.ScatterPlot):`
			`def __init__(self, S, Dw, args, *kw):`
			`self.S = S`
			`self.Dw = Dw`
			`id_dim, sel_dim = Dw.get_dim_name()`
			`self._dim = id_dim`
			`id_1, = Dw.get_identifiers(sel_dim, [0])`
			`id_2, = Dw.get_identifiers(sel_dim, [1])`
			`print id_1`
			`print id_2`

			`plots.ScatterPlot.__init__(self, Dw, Dw, id_dim, sel_dim, id_1, id_2, c='g', s=50,name="Kegg test", alpha=.5)`
			`self.overlay_subgraphs()`

			`def overlay_subgraphs(self):`
			`for subgraph in self.S:`
			`nodes = subgraph.nodes()`
			`print self._dim`

			`sub_dw = self.Dw.subdata(self._dim, nodes)`
			`nodes = sub_dw.get_identifiers(self._dim, sorted=True)`
			`xy = sub_dw.asarray()[:,:2]`
			`pos = {}`
			`for i, node in enumerate(nodes):`
			`pos[node] = xy[i]`
			`nx.draw_networkx_nodes(subgraph, pos, node_size=200, ax=self.axes, zorder=3)`
			`nx.draw_networkx_edges(subgraph, pos, ax=self.axes, zorder=3)`



			`def on_update(self, selection, graph):`
			`g = nx.subgraph(graph, selection)`
			`S = nx.connected_components_subgraphs(g)`

			`def _subgraph_score(self, subgraph):`
			`pass`