import sys,os import os.path import webbrowser import cPickle import scipy import networkx as nx from fluents import logger,plots,workflow,dataset,main from fluents.lib import blmfuncs,nx_utils,cx_utils import gobrowser class SmallTestWorkflow(workflow.Workflow): name = 'Demo' ident = 'demo' description = 'A small test workflow for gene expression analysis.' chip = 'hgu' def __init__(self): workflow.Workflow.__init__(self) # DATA IMPORT load = workflow.Stage('load', 'Data') load_small = LoadDataFunction('load-small', 'Small', self) load.add_function(load_small) load_medium = LoadDataFunction('load-geneid', 'GeneID', self, 'geneid') load.add_function(load_medium) load_medium = LoadDataFunction('load-full', 'FullChip', self, 'full') load.add_function(load_medium) self.add_stage(load) # NETWORK PREPROCESSING #net = workflow.Stage('net', 'Network integration') #net.add_function(DiffKernelFunction()) #net.add_function(ModKernelFunction()) #self.add_stage(net) # Models model = workflow.Stage('models', 'Models') model.add_function(blmfuncs.PCA()) model.add_function(blmfuncs.PLS()) model.add_function(SAM()) self.add_stage(model) query = workflow.Stage('query', 'Gene Query') query.add_function(NCBIQuery()) query.add_function(KEGGQuery()) query.add_function(SubgraphQuery()) self.add_stage(query) # Background knowledge go = workflow.Stage('go', 'Gene Ontology') go.add_function(gobrowser.PlotDagFunction()) go.add_function(GoEnrichment()) go.add_function(GoEnrichmentCond()) go.add_function(MapGO2Gene()) go.add_function(MapGene2GO()) self.add_stage(go) # EXTRA PLOTS #plt = workflow.Stage('net', 'Network') #plt.add_function(nx_analyser.KeggNetworkAnalyser()) #self.add_stage(plt) logger.log('debug', 'Small test workflow is now active') class LoadDataFunction(workflow.Function): """Loads all datasets in a given directory.""" def __init__(self, ident, label, wf, dir=''): workflow.Function.__init__(self, ident, label) self._dir = dir self._wf = wf def run(self): path = os.path.join(main.options.datadir, self._wf.ident, self._dir) files = os.listdir(path) out = [] for fn in files: if fn.endswith('.ftsv'): out.append(dataset.read_ftsv(os.path.join(path, fn))) return out ##### WORKFLOW SPECIFIC FUNCTIONS ###### class SAM(workflow.Function): def __init__(self, id='sam', name='SAM'): workflow.Function.__init__(self, id, name) def run(self, x, y): n_iter = 50 #B alpha = 0.01 #cut off on qvals ############### # Main function call # setup prelimenaries import rpy rpy.r.library("siggenes") rpy.r.library("multtest") cl = scipy.dot(y.asarray(), scipy.diag(scipy.arange(y.shape[1]))).sum(1) data = x.asarray().T sam = rpy.r.sam(data, cl=cl, B=n_iter, var_equal=False,med=False,s0=scipy.nan,rand=scipy.nan) qvals = scipy.asarray(rpy.r.slot(sam, "p.value")) pvals = scipy.asarray(rpy.r.slot(sam, "q.value")) sam_index = (qvals<alpha).nonzero()[0] # Update selection object dim_name = x.get_dim_name(1) sam_selection = x.get_identifiers(dim_name, indices=sam_index) main.project.set_selection(dim_name, sam_selection) sel = dataset.Selection('SAM selection') sel.select(dim_name, sam_selection) logger.log('notice','Number of significant varibles (SAM): %s' %len(sam_selection)) # ## OUTPUT ### xcolname = x.get_dim_name(1) # genes x_col_ids = [xcolname, x.get_identifiers(xcolname, sorted=True)] sing_id = ['_john', ['0']] #singleton D_qvals = dataset.Dataset(qvals, (x_col_ids, sing_id), name='q_vals') D_pvals = dataset.Dataset(pvals, (x_col_ids, sing_id), name='p_vals') # plots s_indx = qvals.flatten().argsort() s_ids = [x_col_ids[0],[x_col_ids[1][i] for i in s_indx]] xindex = scipy.arange(len(qvals)) qvals_s = qvals.take(s_indx) D_qs = dataset.Dataset(qvals_s, (s_ids, sing_id), name="sorted qvals") Dind = dataset.Dataset(xindex, (s_ids, sing_id), name="dum") st = plots.ScatterPlot(D_qs, Dind, 'gene_ids', '_john', '0', '0', s=10, name='SAM qvals') return [D_qvals, D_pvals, D_qs, st, sel] class DiffKernelFunction(workflow.Function): def __init__(self): workflow.Function.__init__(self, 'diffkernel', 'Diffusion') def run(self, x, a): """x is gene expression data, a is the network. """ #sanity check: g = a.asnetworkx() genes = x.get_identifiers(x.get_dim_name(1), sorted=True) W = nx.adj_matrix(g, nodelist=genes) X = x.asarray() Xc, mn_x = cx_utils.mat_center(X, ret_mn=True) out = [] alpha = 1.0 beta = 1.0 K = nx_utils.K_diffusion(W, alpha=alpha, beta=beta,normalised=True) Xp = scipy.dot(Xc, K) + mn_x # dataset row_ids = (x.get_dim_name(0), x.get_identifiers(x.get_dim_name(0), sorted=True)) col_ids = (x.get_dim_name(1), x.get_identifiers(x.get_dim_name(1), sorted=True)) xout = dataset.Dataset(Xp, (row_ids, col_ids), name=x.get_name()+'_diff'+str(alpha)) out.append(xout) return out class ModKernelFunction(workflow.Function): def __init__(self): workflow.Function.__init__(self, 'mokernel', 'Modularity') def run(self,x,a): X = x.asarray() g = a.asnetworkx() genes = x.get_identifiers(x.get_dim_name(1), sorted=True) W = nx.adj_matrix(g, nodelist=genes) out=[] alpha=.2 Xc,mn_x = cx_utils.mat_center(X, ret_mn=True) K = nx_utils.K_modularity(W, alpha=alpha) Xp = scipy.dot(Xc, K) Xp = Xp + mn_x # dataset row_ids = (x.get_dim_name(0), x.get_identifiers(x.get_dim_name(0), sorted=True)) col_ids = (x.get_dim_name(1), x.get_identifiers(x.get_dim_name(1), sorted=True)) xout = dataset.Dataset(Xp, (row_ids,col_ids), name=x.get_name()+'_mod'+str(alpha)) out.append(xout) return out class NCBIQuery(workflow.Function): def __init__(self, gene_id_name='gene_ids'): self._gene_id_name = gene_id_name workflow.Function.__init__(self, 'query', 'NCBI') def run(self): selection = main.project.get_selection() if not selection.has_key(self._gene_id_name): logger.log("notice", "Expected gene ids: %s, but got: %s" %(self._gene_id_name, selection.keys())) return None if len(selection[self._gene_id_name])==0: logger.log("notice", "No selected genes to query") return None base = 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?' options = {r'&db=' : 'gene', r'&cmd=' : 'retrieve', r'&dopt=' : 'full_report'} gene_str = ''.join([gene + "+" for gene in selection[self._gene_id_name]]) options[r'&list_uids='] = gene_str[:-1] opt_str = ''.join([key+value for key,value in options.items()]) web_str = base + opt_str webbrowser.open(web_str) class KEGGQuery(workflow.Function): def __init__(self, org='hsa', gene_id_name='gene_ids'): self._org=org self._gene_id_name = gene_id_name workflow.Function.__init__(self, 'query', 'KEGG') def run(self, selection): if not selection.has_key(self._gene_id_name): logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._gene_id_name, selection.keys())) return None if len(selection[self._gene_id_name])==0: logger.log("notice", "No selected genes to query") return None base = r'http://www.genome.jp/dbget-bin/www_bget?' gene_str = ''.join([gene + "+" for gene in selection[self._gene_id_name]]) gene_str = gene_str[:-1] gene_str = self._org + "+" + gene_str web_str = base + gene_str webbrowser.open(web_str) class GoEnrichment(workflow.Function): def __init__(self): workflow.Function.__init__(self, 'goenrich', 'Go Enrichment') def run(self, data): import rpy rpy.r.library("GOstats") # Get universe # Here, we are using a defined dataset to represent the universe if not 'gene_ids' in data: logger.log('notice', 'No dimension called [gene_ids] in dataset: %s' %data.get_name()) return universe = list(data.get_identifiers('gene_ids')) logger.log('notice', 'Universe consists of %s gene ids from %s' %(len(universe), data.get_name())) # Get current selection and validate curr_sel = main.project.get_selection() selected_genes = list(curr_sel['gene_ids']) if len(selected_genes)==0: logger.log('notice', 'This function needs a current selection!') return # Hypergeometric parameter object pval_cutoff = 0.9999 cond = False test_direction = 'over' params = rpy.r.new("GOHyperGParams", geneIds=selected_genes, annotation="hgu133a", ontology="BP", pvalueCutoff=pval_cutoff, conditional=cond, testDirection=test_direction ) # run test # result.keys(): ['Count', 'Term', 'OddsRatio', 'Pvalue', 'ExpCount', 'GOBPID', 'Size'] result = rpy.r.summary(rpy.r.hyperGTest(params)) # dataset terms = result['GOBPID'] pvals = scipy.log(scipy.asarray(result['Pvalue'])) row_ids = ('go-terms', terms) col_ids = ('_john', ['_doe']) xout = dataset.Dataset(pvals, (row_ids, col_ids), name='P values (enrichment)') return [xout] class GoEnrichmentCond(workflow.Function): """ Enrichment conditioned on dag structure.""" def __init__(self): workflow.Function.__init__(self, 'goenrich', 'Go Cond. Enrich.') def run(self, data): import rpy rpy.r.library("GOstats") # Get universe # Here, we are using a defined dataset to represent the universe if not 'gene_ids' in data: logger.log('notice', 'No dimension called [gene_ids] in dataset: %s', data.get_name()) return universe = list(data.get_identifiers('gene_ids')) logger.log('notice', 'Universe consists of %s gene ids from %s' %(len(universe), data.get_name())) # Get current selection and validate curr_sel = main.project.get_selection() selected_genes = list(curr_sel['gene_ids']) if len(selected_genes)==0: logger.log('notice', 'This function needs a current selection!') return # Hypergeometric parameter object pval_cutoff = 1 cond = True test_direction = 'over' params = rpy.r.new("GOHyperGParams", geneIds=selected_genes, annotation="hgu133a", ontology="BP", pvalueCutoff=pval_cutoff, conditional=cond, testDirection=test_direction ) # run test # result.keys(): ['Count', 'Term', 'OddsRatio', 'Pvalue', 'ExpCount', 'GOBPID', 'Size'] result = rpy.r.summary(rpy.r.hyperGTest(params)) # dataset terms = result['GOBPID'] pvals = scipy.log(scipy.asarray(result['Pvalue'])) row_ids = ('go-terms', terms) col_ids = ('_john', ['_doe']) xout = dataset.Dataset(pvals, (row_ids, col_ids), name='P values (enrichment)') return [xout] class MapGene2GO(workflow.Function): def __init__(self, ont='bp', gene_id_name='gene_ids'): self._ont = ont self._gene_id_name = gene_id_name workflow.Function.__init__(self, 'gene2go', 'gene->GO') # load data at init try: fname = "/home/flatberg/fluents/data/gene2go.pcl" self._gene2go = cPickle.load(open(fname)) except: logger.log("notice", "could not load mapping") def run(self): selection = main.project.get_selection() if not selection.has_key(self._gene_id_name): logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._gene_id_name, selection.keys())) return None if len(selection[self._gene_id_name])==0: logger.log("notice", "No selected genes to query") return None gene_ids = selection[self._gene_id_name] go_ids = set() for gene in gene_ids: go_ids_new = self._gene2go.get(gene, []) if not go_ids_new: logger.log("notice", "Could not find any goterms for %s" %gene) go_ids.update(self._gene2go.get(gene, [])) main.project.set_selection('go-terms', go_ids) logger.log("notice", "GO terms updated") class MapGO2Gene(workflow.Function): def __init__(self, ont='bp', gene_id_name='go-terms'): self._ont = ont self._gene_id_name = gene_id_name workflow.Function.__init__(self, 'go2gene', 'GO->gene') # load data at init try: fname = "/home/flatberg/fluents/data/go2gene.pcl" self._go2gene = cPickle.load(open(fname)) except: logger.log("notice", "could not load mapping") def run(self): selection = main.project.get_selection() if not selection.has_key(self._gene_id_name): logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._gene_id_name, selection.keys())) return None if len(selection[self._gene_id_name])==0: logger.log("notice", "No selected genes to query") return None go_ids = selection[self._gene_id_name] gene_ids = set() for go in go_ids: if not self._go2gene.get(go,[]): logger.log("notice", "Could not find any gene ids for %s" %go) gene_ids.update(self._go2gene.get(go,[])) main.project.set_selection('gene_ids', gene_ids) logger.log("notice", "GO terms updated") class SubgraphQuery(workflow.Function): def __init__(self, graph='kegg', dim='gene_ids'): self._gtype = graph self._dim = dim workflow.Function.__init__(self, 'keggraph', 'KeggGraph') def run(self, Dw, DA): max_edge_ratio = .20 max_cov_ratio = .25 neigh_type = 'cov' neigh_type = 'cosine' #neigh_type = 'heat' # 1.) Operate on a subset selection selection = main.project.get_selection() if not selection.has_key(self._dim): logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._dim, selection.keys())) return None if len(selection[self._dim]) == 0: logger.log("notice", "No selected genes to query, using all") Dw = Dw.subdata(self._dim, Dw.get_identifiers(self._dim)[:100]) else: Dw = Dw.subdata(self._dim, selection[self._dim]) # 2.) Pairwise goodness in loading space indices = self._pairsim(Dw) print indices print indices.shape idents1 = Dw.get_identifiers(self._dim, indices[:,0]) idents2 = Dw.get_identifiers(self._dim, indices[:,1]) idents = zip(idents1, idents2) # 3.) Identify close subgraphs # 4.) Rank subgraphs main.project.set_selection('gene_ids', idents1) #main.project.set_sele logger.log("notice", "Gene ids updated") #plt = GraphQueryScatterPlot(SS, Dw) #return [plt] def _pairsim(self, Dw, ptype='cosine',cut_rat=.2): """Returns close pairs across given dim. ptype : ['cov', 'correlation', 'cosine', 'heat', 'euclidean'] """ W = Dw.asarray() if ptype == 'cov': W -= W.mean(1)[:,scipy.newaxis] wcov = scipy.dot(W, W.T)/(W.shape[1]-1) wcov_min = wcov.max()*cut_rat indices = scipy.asarray(scipy.where(wcov >= wcov_min)).T elif ptype == 'heat': from hcluster import pdist, squareform D = squareform(pdist(W)) H = exp(-D) h_min = H.max()*cut_rat indices = scipy.asarray(scipy.where(H >= h_min)).T elif ptype in ['euclidean', 'cosine', 'correlation']: from hcluster import pdist, squareform D = squareform(pdist(W), ptype) d_min = D.max()*cut_rat indices = [] for i in range(D.shape[0]): for j in range(i, D.shape[0]): if D[i,j] <= d_min: indices.append([i,j]) print "W" print W.shape indices = scipy.asarray(indices) else: raise ValueError("ptype: %s not valid" %ptype) return indices def _subgraphsim(self, Dw, idents, stype='dijkstra'): # subgraph Gw = nx.XGraph() for edge in idents: e = G.get_edge(edge) Gw.add_edge() if stype == 'dijkstra': pass class GraphQueryScatterPlot(plots.ScatterPlot): def __init__(self, S, Dw, *args, **kw): self.S = S self.Dw = Dw id_dim, sel_dim = Dw.get_dim_name() self._dim = id_dim id_1, = Dw.get_identifiers(sel_dim, [0]) id_2, = Dw.get_identifiers(sel_dim, [1]) print id_1 print id_2 plots.ScatterPlot.__init__(self, Dw, Dw, id_dim, sel_dim, id_1, id_2, c='g', s=50,name="Kegg test", alpha=.5) self.overlay_subgraphs() def overlay_subgraphs(self): for subgraph in self.S: nodes = subgraph.nodes() print self._dim sub_dw = self.Dw.subdata(self._dim, nodes) nodes = sub_dw.get_identifiers(self._dim, sorted=True) xy = sub_dw.asarray()[:,:2] pos = {} for i, node in enumerate(nodes): pos[node] = xy[i] nx.draw_networkx_nodes(subgraph, pos, node_size=200, ax=self.axes, zorder=3) nx.draw_networkx_edges(subgraph, pos, ax=self.axes, zorder=3) def on_update(self, selection, graph): g = nx.subgraph(graph, selection) S = nx.connected_components_subgraphs(g) def _subgraph_score(self, subgraph): pass