import sys,os import os.path import webbrowser import cPickle from fluents import logger, plots,workflow,dataset,main from fluents.lib import blmfuncs,nx_utils,validation,engines,cx_stats,cx_utils import gobrowser, geneontology import scipy import networkx as nx class SmallTestWorkflow(workflow.Workflow): name = 'Smokers' ident = 'smokers' description = 'A small test workflow for gene expression analysis.' def __init__(self): workflow.Workflow.__init__(self) # DATA IMPORT load = workflow.Stage('load', 'Data') load.add_function(DatasetLoadFunctionSmokerSmall()) load.add_function(DatasetLoadFunctionSmokerMedium()) load.add_function(DatasetLoadFunctionSmokerFull()) load.add_function(DatasetLoadFunctionSmokerGO()) #load.add_function(DatasetLoadFunctionCYCLE()) self.add_stage(load) # NETWORK PREPROCESSING #net = workflow.Stage('net', 'Network integration') #net.add_function(DiffKernelFunction()) #net.add_function(ModKernelFunction()) #net.add_function(RandDiffKernelFunction()) #self.add_stage(net) # BLM's model = workflow.Stage('models', 'Models') model.add_function(blmfuncs.PCA()) model.add_function(blmfuncs.PLS()) model.add_function(blmfuncs.LPLS()) model.add_function(SAM()) #model.add_function(bioconFuncs.SAM(app)) self.add_stage(model) query = workflow.Stage('query', 'Gene Query') query.add_function(NCBIQuery()) query.add_function(KEGGQuery()) self.add_stage(query) # Gene Ontology go = workflow.Stage('go', 'Gene Ontology') go.add_function(gobrowser.LoadGOFunction()) go.add_function(gobrowser.SetICFunction()) # go.add_function(gobrowser.GOWeightFunction()) # go.add_function(gobrowser.DistanceToSelectionFunction()) # go.add_function(gobrowser.TTestFunction()) go.add_function(gobrowser.PlotDagFunction()) go.add_function(GoEnrichment()) go.add_function(GoEnrichmentCond()) go.add_function(MapGO2Gene()) go.add_function(MapGene2GO()) self.add_stage(go) # EXTRA PLOTS #plt = workflow.Stage('net', 'Network') #plt.add_function(nx_analyser.KeggNetworkAnalyser()) #self.add_stage(plt) logger.log('debug', 'Small test workflow is now active') class DatasetLoadFunctionSmokerSmall(workflow.Function): """Loader for all ftsv files of smokers small datasets.""" def __init__(self): workflow.Function.__init__(self, 'load_small', 'Smoker (Small)') def run(self): path = os.path.join(main.options.datadir, 'smokers-small/') files = os.listdir(path) out = [] for fname in files: if fname.endswith('.ftsv'): input_file = open(os.path.join(path, fname)) out.append(dataset.read_ftsv(input_file)) return out class DatasetLoadFunctionSmokerMedium(workflow.Function): """Loader for all ftsv files of smokers small datasets.""" def __init__(self): workflow.Function.__init__(self, 'load_medium', 'Smoker (Medium)') def run(self): path = os.path.join(main.options.datadir, 'smokers-medium/') files = os.listdir(path) out = [] for fname in files: if fname.endswith('.ftsv'): input_file = open(os.path.join(path, fname)) out.append(dataset.read_ftsv(input_file)) return out class DatasetLoadFunctionSmokerFull(workflow.Function): """Loader for all ftsv files of smokers small datasets.""" def __init__(self): workflow.Function.__init__(self, 'load_full', 'Smoker (Full)') def run(self): path = os.path.join(main.options.datadir, 'smokers-full/') files = os.listdir(path) out = [] for fname in files: if fname.endswith('.ftsv'): input_file = open(os.path.join(path, fname)) out.append(dataset.read_ftsv(input_file)) return out class DatasetLoadFunctionSmokerGO(workflow.Function): """Loader for all ftsv files of smokers small datasets.""" def __init__(self): workflow.Function.__init__(self, 'load_go', 'Smoker (GO)') def run(self): path = os.path.join(main.options.datadir, 'smokers-go/') files = os.listdir(path) out = [] for fname in files: if fname.endswith('.ftsv'): input_file = open(os.path.join(path, fname)) out.append(dataset.read_ftsv(input_file)) return out class DatasetLoadFunctionCYCLE(workflow.Function): """Loader for pickled CYCLE datasets.""" def __init__(self): workflow.Function.__init__(self, 'load_data', 'Cycle') def run(self): filename='fluents/data/CYCLE' if filename: return dataset.from_file(filename) ##### WORKFLOW SPECIFIC FUNCTIONS ###### class SAM(workflow.Function): def __init__(self, id='sam', name='SAM'): workflow.Function.__init__(self, id, name) def run(self, x, y): n_iter = 50 #B alpha = 0.01 #cut off on qvals ############### # Main function call # setup prelimenaries import rpy rpy.r.library("siggenes") rpy.r.library("multtest") cl = scipy.dot(y.asarray(), scipy.diag(scipy.arange(y.shape[1]))).sum(1) data = x.asarray().T sam = rpy.r.sam(data, cl=cl, B=n_iter, var_equal=False,med=False,s0=scipy.nan,rand=scipy.nan) qvals = scipy.asarray(rpy.r.slot(sam, "p.value")) pvals = scipy.asarray(rpy.r.slot(sam, "q.value")) sam_index = (qvalsGO') # load data at init try: fname = "/home/flatberg/fluents/data/gene2go.pcl" self._gene2go = cPickle.load(open(fname)) except: logger.log("notice", "could not load mapping") def run(self): selection = main.project.get_selection() if not selection.has_key(self._gene_id_name): logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._gene_id_name, selection.keys())) return None if len(selection[self._gene_id_name])==0: logger.log("notice", "No selected genes to query") return None gene_ids = selection[self._gene_id_name] go_ids = set() for gene in gene_ids: go_ids_new = self._gene2go.get(gene, []) if not go_ids_new: logger.log("notice", "Could not find any goterms for %s" %gene) go_ids.update(self._gene2go.get(gene, [])) main.project.set_selection('go-terms', go_ids) logger.log("notice", "GO terms updated") class MapGO2Gene(workflow.Function): def __init__(self, ont='bp', gene_id_name='go-terms'): self._ont = ont self._gene_id_name = gene_id_name workflow.Function.__init__(self, 'go2gene', 'GO->gene') # load data at init try: fname = "/home/flatberg/fluents/data/go2gene.pcl" self._go2gene = cPickle.load(open(fname)) except: logger.log("notice", "could not load mapping") def run(self): selection = main.project.get_selection() if not selection.has_key(self._gene_id_name): logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._gene_id_name, selection.keys())) return None if len(selection[self._gene_id_name])==0: logger.log("notice", "No selected genes to query") return None go_ids = selection[self._gene_id_name] gene_ids = set() for go in go_ids: if not self._go2gene.get(go,[]): logger.log("notice", "Could not find any gene ids for %s" %go) gene_ids.update(self._go2gene.get(go,[])) main.project.set_selection('gene_ids', gene_ids) logger.log("notice", "GO terms updated")