import sys, os
import os.path
import webbrowser
import cPickle
import scipy
import networkx as nx
from fluents import logger, plots, workflow, dataset, main
from fluents.lib import blmfuncs, nx_utils, cx_utils
import gobrowser


class SmallTestWorkflow(workflow.Workflow):
    name = 'Demo'
    ident = 'demo'
    description = 'A small test workflow for gene expression analysis.'
    chip = 'hgu'

    def __init__(self):
        workflow.Workflow.__init__(self)

        # DATA IMPORT
        load = workflow.Stage('load', 'Data')
        load_small = LoadDataFunction('load-small', 'Small', self)
        load.add_function(load_small)
        load_geneid = LoadDataFunction('load-geneid', 'GeneID', self, 'geneid')
        load.add_function(load_geneid)
        load_full = LoadDataFunction('load-full', 'FullChip', self, 'full')
        load.add_function(load_full)
        self.add_stage(load)

        # NETWORK PREPROCESSING
        #net = workflow.Stage('net', 'Network integration')
        #net.add_function(DiffKernelFunction())
        #net.add_function(ModKernelFunction())
        #self.add_stage(net)

        # Models
        model = workflow.Stage('models', 'Models')
        model.add_function(blmfuncs.PCA())
        model.add_function(blmfuncs.PLS())
        model.add_function(SAM())
        self.add_stage(model)

        query = workflow.Stage('query', 'Gene Query')
        query.add_function(NCBIQuery())
        query.add_function(KEGGQuery())
        query.add_function(SubgraphQuery())
        self.add_stage(query)

        # Background knowledge
        go = workflow.Stage('go', 'Gene Ontology')
        go.add_function(gobrowser.PlotDagFunction())
        go.add_function(GoEnrichment())
        go.add_function(GoEnrichmentCond())
        go.add_function(MapGO2Gene())
        go.add_function(MapGene2GO())
        self.add_stage(go)

        # EXTRA PLOTS
        #plt = workflow.Stage('net', 'Network')
        #plt.add_function(nx_analyser.KeggNetworkAnalyser())
        #self.add_stage(plt)

        logger.log('debug', 'Small test workflow is now active')


class LoadDataFunction(workflow.Function):
    """Loads all datasets (.ftsv files) in a given directory."""
    def __init__(self, ident, label, wf, dir=''):
        workflow.Function.__init__(self, ident, label)
        self._dir = dir
        self._wf = wf

    def run(self):
        path = os.path.join(main.options.datadir, self._wf.ident, self._dir)
        files = os.listdir(path)
        out = []
        for fn in files:
            if fn.endswith('.ftsv'):
                out.append(dataset.read_ftsv(os.path.join(path, fn)))
        return out


##### WORKFLOW SPECIFIC FUNCTIONS ######

class SAM(workflow.Function):
    def __init__(self, id='sam', name='SAM'):
        workflow.Function.__init__(self, id, name)

    def run(self, x, y):
        n_iter = 50   # B, number of permutations
        alpha = 0.01  # cut-off on q-values

        ###############
        # Main function call

        # setup preliminaries
        import rpy
        rpy.r.library("siggenes")
        rpy.r.library("multtest")

        # collapse the dummy-coded response matrix into integer class labels
        cl = scipy.dot(y.asarray(), scipy.diag(scipy.arange(y.shape[1]))).sum(1)
        data = x.asarray().T

        sam = rpy.r.sam(data, cl=cl, B=n_iter, var_equal=False, med=False,
                        s0=scipy.nan, rand=scipy.nan)
        qvals = scipy.asarray(rpy.r.slot(sam, "q.value"))
        pvals = scipy.asarray(rpy.r.slot(sam, "p.value"))
        sam_index = (qvals < alpha).nonzero()[0]

# NOTE: the remainder of SAM.run, and the definitions of NCBIQuery, KEGGQuery,
# GoEnrichment and GoEnrichmentCond referenced by the workflow above, are
# missing here.
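
# Illustrative helper, not part of the original workflow: SAM.run above
# collapses the dummy-coded response matrix y (one 0/1 column per class) into
# an integer label vector for siggenes.  The dot/diag/sum construction is
# equivalent to a row-wise argmax on a proper indicator matrix, e.g.
# [[1,0,0], [0,1,0], [0,0,1], [0,1,0]] -> [0, 1, 2, 1].
def _dummy_to_class_labels(y):
    """Collapse an (n x k) 0/1 indicator array into class labels 0..k-1 per row."""
    return scipy.dot(y, scipy.diag(scipy.arange(y.shape[1]))).sum(1)
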
class MapGene2GO(workflow.Function):
    def __init__(self, ont='bp', gene_id_name='gene_ids'):
        self._ont = ont
        self._gene_id_name = gene_id_name
        workflow.Function.__init__(self, 'gene2go', 'Gene->GO')
        # load data at init
        try:
            fname = "/home/flatberg/fluents/data/gene2go.pcl"
            self._gene2go = cPickle.load(open(fname))
        except:
            logger.log("notice", "could not load mapping")

    def run(self):
        selection = main.project.get_selection()
        if not selection.has_key(self._gene_id_name):
            logger.log("notice", "Expected gene ids: %s, but got: %s"
                       % (self._gene_id_name, selection.keys()))
            return None
        if len(selection[self._gene_id_name]) == 0:
            logger.log("notice", "No selected genes to query")
            return None

        gene_ids = selection[self._gene_id_name]
        go_ids = set()
        for gene in gene_ids:
            go_ids_new = self._gene2go.get(gene, [])
            if not go_ids_new:
                logger.log("notice", "Could not find any GO terms for %s" % gene)
            go_ids.update(go_ids_new)

        main.project.set_selection('go-terms', go_ids)
        logger.log("notice", "GO terms updated")


class MapGO2Gene(workflow.Function):
    def __init__(self, ont='bp', gene_id_name='go-terms'):
        self._ont = ont
        self._gene_id_name = gene_id_name
        workflow.Function.__init__(self, 'go2gene', 'GO->gene')
        # load data at init
        try:
            fname = "/home/flatberg/fluents/data/go2gene.pcl"
            self._go2gene = cPickle.load(open(fname))
        except:
            logger.log("notice", "could not load mapping")

    def run(self):
        selection = main.project.get_selection()
        if not selection.has_key(self._gene_id_name):
            logger.log("notice", "Expected GO terms: %s, but got: %s"
                       % (self._gene_id_name, selection.keys()))
            return None
        if len(selection[self._gene_id_name]) == 0:
            logger.log("notice", "No selected GO terms to query")
            return None

        go_ids = selection[self._gene_id_name]
        gene_ids = set()
        for go in go_ids:
            gene_ids_new = self._go2gene.get(go, [])
            if not gene_ids_new:
                logger.log("notice", "Could not find any gene ids for %s" % go)
            gene_ids.update(gene_ids_new)

        main.project.set_selection('gene_ids', gene_ids)
        logger.log("notice", "Gene ids updated")
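
# Illustrative sketch, not part of the original module: MapGene2GO and
# MapGO2Gene above both expect a cPickle'd dict of the form
# {identifier: [mapped identifiers]} on disk (gene2go.pcl maps gene ids to GO
# terms, go2gene.pcl the reverse).  A file in that format could be written
# roughly like this; the source of the (key, value) pairs is an assumption.
def _dump_mapping(pairs, fname):
    """Build a {key: [values]} dict from (key, value) pairs and cPickle it to fname."""
    mapping = {}
    for key, value in pairs:
        mapping.setdefault(key, []).append(value)
    cPickle.dump(mapping, open(fname, 'wb'), 2)
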
class SubgraphQuery(workflow.Function):
    def __init__(self, graph='kegg', dim='gene_ids'):
        self._gtype = graph
        self._dim = dim
        workflow.Function.__init__(self, 'keggraph', 'KeggGraph')

    def run(self, Dw, DA):
        max_edge_ratio = .20
        max_cov_ratio = .25
        neigh_type = 'cov'
        neigh_type = 'cosine'
        #neigh_type = 'heat'

        # 1.) Operate on a subset selection
        selection = main.project.get_selection()
        if not selection.has_key(self._dim):
            logger.log("notice", "Expected gene ids: %s, but got: %s"
                       % (self._dim, selection.keys()))
            return None
        if len(selection[self._dim]) == 0:
            logger.log("notice", "No selected genes to query, using all")
            Dw = Dw.subdata(self._dim, Dw.get_identifiers(self._dim)[:100])
        else:
            Dw = Dw.subdata(self._dim, selection[self._dim])

        # 2.) Pairwise goodness in loading space
        indices = self._pairsim(Dw, ptype=neigh_type)
        idents1 = Dw.get_identifiers(self._dim, indices[:, 0])
        idents2 = Dw.get_identifiers(self._dim, indices[:, 1])
        idents = zip(idents1, idents2)

        # 3.) Identify close subgraphs
        G = DA.asnetworkx()
        for edge in G.edges():
            if edge not in idents:
                G.delete_edge(edge)
        S = nx.connected_component_subgraphs(G)
        print map(len, S)

        # 4.) Rank subgraphs
        main.project.set_selection('gene_ids', idents1)
        #main.project.set_sele
        logger.log("notice", "Gene ids updated")

        plt = GraphQueryScatterPlot(S, Dw)
        #return [plt]

    def _pairsim(self, Dw, ptype='cosine', cut_rat=.2):
        """Returns close pairs across the given dim.

        ptype : ['cov', 'correlation', 'cosine', 'heat', 'euclidean']
        """
        W = Dw.asarray()
        if ptype == 'cov':
            W -= W.mean(1)[:, scipy.newaxis]
            wcov = scipy.dot(W, W.T)/(W.shape[1] - 1)
            wcov_min = wcov.max()*cut_rat
            indices = scipy.asarray(scipy.where(wcov >= wcov_min)).T
        elif ptype == 'heat':
            from hcluster import pdist, squareform
            D = squareform(pdist(W))
            H = scipy.exp(-D)
            h_min = H.max()*cut_rat
            indices = scipy.asarray(scipy.where(H >= h_min)).T
        elif ptype in ['euclidean', 'cosine', 'correlation']:
            from hcluster import pdist, squareform
            D = squareform(pdist(W, ptype))
            d_min = D.max()*cut_rat
            indices = []
            for i in range(D.shape[0]):
                for j in range(i, D.shape[0]):
                    if D[i, j] <= d_min:
                        indices.append([i, j])
            print "W"
            print W.shape
            indices = scipy.asarray(indices)
        else:
            raise ValueError("ptype: %s not valid" % ptype)
        return indices

    def _subgraphsim(self, Dw, idents, stype='dijkstra'):
        # build a graph from the close identifier pairs (unfinished)
        Gw = nx.XGraph()
        for edge in idents:
            Gw.add_edge(edge[0], edge[1])
        if stype == 'dijkstra':
            pass


class GraphQueryScatterPlot(plots.ScatterPlot):
    def __init__(self, subgraphs, Dw, *args, **kw):
        self._subgraphs = subgraphs
        self._nx_nodes = []
        self._nx_edges = []
        self._init_scatter(Dw)
        self.overlay_subgraphs()

    def _init_scatter(self, Dw):
        self._Dw = Dw
        id_dim, sel_dim = Dw.get_dim_name()
        self._dim = id_dim
        id_1, = Dw.get_identifiers(sel_dim, [0])
        id_2, = Dw.get_identifiers(sel_dim, [1])
        plots.ScatterPlot.__init__(self, Dw, Dw, id_dim, sel_dim, id_1, id_2,
                                   c='g', s=50, name="Hypo", alpha=.5)

    def overlay_subgraphs(self):
        # get x/y positions of every node from the scatter coordinates
        all_nodes = self._Dw.get_identifiers(self._dim, sorted=True)
        pos = {}
        for i, node in enumerate(all_nodes):
            pos[node] = (self.xaxis_data[i], self.yaxis_data[i])
        for subgraph in self._subgraphs:
            nn = nx.draw_networkx_nodes(subgraph, pos, node_size=200,
                                        ax=self.axes, zorder=10)
            ee = nx.draw_networkx_edges(subgraph, pos, ax=self.axes, zorder=9)
            self._nx_nodes.append(nn)
            self._nx_edges.append(ee)

    def _delete_networks(self):
        # remove previously drawn node/edge collections from the axes
        for n in self._nx_nodes:
            self.axes.collections.remove(n)
        self._nx_nodes = []
        for e in self._nx_edges:
            self.axes.collections.remove(e)
        self._nx_edges = []

    def set_ordinate(self, sb):
        self._delete_networks()
        self.overlay_subgraphs()
        plots.ScatterPlot.set_ordinate(self, sb)

    def set_absicca(self, sb):
        self._delete_networks()
        self.overlay_subgraphs()
        plots.ScatterPlot.set_absicca(self, sb)


class CAsinglesel(workflow.Function):
    """Modified non-symmetric correspondence analysis.

    Set up with multiple selections.

    Input:
      - a subset (selection) of `interesting` identifiers along a dimension.
      - predefined subsets (categories) along the same dimension.

    1.) The cooccurrence matrix of interesting identifiers and categories is made.
    2.) The variables are scaled to represent the relative frequencies.

    (See the illustrative sketch at the end of this module.)
    """
    def run(self, X, Ckegg):
        pass


class CASingleSelDouble(workflow.Function):
    """ """
    def run(self, X, Ckegg):
        pass
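
# Illustrative sketch, not part of the original module: the two steps described
# in the CAsinglesel docstring, assuming the selection is given as a set of
# identifiers and the categories as a dict mapping category name -> set of
# identifiers along the same dimension.
def _cooccurrence_freqs(selected_ids, categories):
    """Return (N, F): N is the 0/1 cooccurrence matrix (selected identifiers x
    categories); F is N scaled by its grand total, i.e. relative frequencies."""
    names = sorted(categories.keys())
    N = scipy.zeros((len(selected_ids), len(names)))
    for i, ident in enumerate(selected_ids):
        for j, name in enumerate(names):
            if ident in categories[name]:
                N[i, j] = 1.0
    total = N.sum()
    F = N/total if total else N
    return N, F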