2007-07-05 15:20:52 +02:00
|
|
|
import sys,os
|
2007-12-14 16:15:01 +01:00
|
|
|
import os.path
|
2007-07-05 15:20:52 +02:00
|
|
|
import webbrowser
|
2007-11-07 13:34:13 +01:00
|
|
|
import cPickle
|
2007-07-05 15:20:52 +02:00
|
|
|
|
2008-12-05 23:07:56 +01:00
|
|
|
from laydi import logger, plots,workflow,dataset,main
|
|
|
|
from laydi.lib import blmfuncs,nx_utils,validation,engines,cx_stats,cx_utils
|
2007-07-05 15:20:52 +02:00
|
|
|
import gobrowser, geneontology
|
|
|
|
import scipy
|
|
|
|
import networkx as nx
|
|
|
|
|
|
|
|
|
|
|
|
class SmallTestWorkflow(workflow.Workflow):
|
|
|
|
name = 'Smokers'
|
|
|
|
ident = 'smokers'
|
|
|
|
description = 'A small test workflow for gene expression analysis.'
|
|
|
|
|
2007-07-26 14:35:59 +02:00
|
|
|
def __init__(self):
|
|
|
|
workflow.Workflow.__init__(self)
|
2007-07-05 15:20:52 +02:00
|
|
|
|
|
|
|
# DATA IMPORT
|
|
|
|
load = workflow.Stage('load', 'Data')
|
2007-12-14 17:45:05 +01:00
|
|
|
|
2008-01-07 11:47:17 +01:00
|
|
|
load_small = LoadDataFunction('load-small', 'Small', self, 'small')
|
2007-12-14 17:45:05 +01:00
|
|
|
load.add_function(load_small)
|
|
|
|
|
2008-01-07 11:47:17 +01:00
|
|
|
load_medium = LoadDataFunction('load-medium', 'Medium', self, 'medium')
|
2007-12-14 17:45:05 +01:00
|
|
|
load.add_function(load_medium)
|
|
|
|
|
2008-01-07 11:47:17 +01:00
|
|
|
load_full = LoadDataFunction('load-full', 'Full', self, 'full')
|
2007-12-14 17:45:05 +01:00
|
|
|
load.add_function(load_full)
|
|
|
|
|
2008-01-07 11:47:17 +01:00
|
|
|
load_go = LoadDataFunction('load-go', 'GO', self, 'go')
|
2007-12-14 17:45:05 +01:00
|
|
|
load.add_function(load_go)
|
|
|
|
|
2007-07-05 15:20:52 +02:00
|
|
|
#load.add_function(DatasetLoadFunctionCYCLE())
|
|
|
|
self.add_stage(load)
|
|
|
|
|
|
|
|
# NETWORK PREPROCESSING
|
2007-08-02 13:18:48 +02:00
|
|
|
#net = workflow.Stage('net', 'Network integration')
|
|
|
|
#net.add_function(DiffKernelFunction())
|
|
|
|
#net.add_function(ModKernelFunction())
|
2007-07-05 15:20:52 +02:00
|
|
|
#net.add_function(RandDiffKernelFunction())
|
2007-08-02 13:18:48 +02:00
|
|
|
#self.add_stage(net)
|
2007-07-05 15:20:52 +02:00
|
|
|
|
|
|
|
# BLM's
|
|
|
|
model = workflow.Stage('models', 'Models')
|
|
|
|
model.add_function(blmfuncs.PCA())
|
|
|
|
model.add_function(blmfuncs.PLS())
|
2007-07-23 19:35:28 +02:00
|
|
|
model.add_function(blmfuncs.LPLS())
|
2007-08-14 18:16:31 +02:00
|
|
|
model.add_function(SAM())
|
2007-07-05 15:20:52 +02:00
|
|
|
|
|
|
|
#model.add_function(bioconFuncs.SAM(app))
|
|
|
|
self.add_stage(model)
|
|
|
|
|
|
|
|
query = workflow.Stage('query', 'Gene Query')
|
|
|
|
query.add_function(NCBIQuery())
|
|
|
|
query.add_function(KEGGQuery())
|
|
|
|
self.add_stage(query)
|
|
|
|
|
2007-07-05 20:24:45 +02:00
|
|
|
# Gene Ontology
|
2007-07-05 15:20:52 +02:00
|
|
|
go = workflow.Stage('go', 'Gene Ontology')
|
|
|
|
go.add_function(gobrowser.LoadGOFunction())
|
2007-08-02 12:20:33 +02:00
|
|
|
go.add_function(gobrowser.SetICFunction())
|
2007-07-30 12:51:58 +02:00
|
|
|
# go.add_function(gobrowser.GOWeightFunction())
|
|
|
|
# go.add_function(gobrowser.DistanceToSelectionFunction())
|
|
|
|
# go.add_function(gobrowser.TTestFunction())
|
2007-07-23 19:02:28 +02:00
|
|
|
go.add_function(gobrowser.PlotDagFunction())
|
2009-02-06 23:21:51 +01:00
|
|
|
go.add_function(gobrowser.PlotDagFunction("cc"))
|
|
|
|
go.add_function(gobrowser.PlotDagFunction("mf"))
|
2007-08-02 13:18:48 +02:00
|
|
|
go.add_function(GoEnrichment())
|
2007-08-14 18:16:31 +02:00
|
|
|
go.add_function(GoEnrichmentCond())
|
2007-11-07 13:34:13 +01:00
|
|
|
go.add_function(MapGO2Gene())
|
|
|
|
go.add_function(MapGene2GO())
|
2007-07-05 15:20:52 +02:00
|
|
|
self.add_stage(go)
|
|
|
|
|
|
|
|
# EXTRA PLOTS
|
|
|
|
#plt = workflow.Stage('net', 'Network')
|
|
|
|
#plt.add_function(nx_analyser.KeggNetworkAnalyser())
|
|
|
|
#self.add_stage(plt)
|
|
|
|
|
|
|
|
logger.log('debug', 'Small test workflow is now active')
|
|
|
|
|
|
|
|
|
2007-12-14 17:45:05 +01:00
|
|
|
class LoadDataFunction(workflow.Function):
|
|
|
|
"""Loads all datasets in a given directory."""
|
|
|
|
def __init__(self, ident, label, wf, dir=''):
|
|
|
|
workflow.Function.__init__(self, ident, label)
|
|
|
|
self._dir = dir
|
|
|
|
self._wf = wf
|
2007-07-05 15:20:52 +02:00
|
|
|
|
|
|
|
def run(self):
|
2007-12-14 17:45:05 +01:00
|
|
|
path = os.path.join(main.options.datadir, self._wf.ident, self._dir)
|
2007-07-05 15:20:52 +02:00
|
|
|
files = os.listdir(path)
|
|
|
|
out = []
|
2007-12-14 17:45:05 +01:00
|
|
|
for fn in files:
|
|
|
|
if fn.endswith('.ftsv'):
|
|
|
|
out.append(dataset.read_ftsv(os.path.join(path, fn)))
|
2007-07-05 15:20:52 +02:00
|
|
|
return out
|
|
|
|
|
|
|
|
|
|
|
|
class DatasetLoadFunctionCYCLE(workflow.Function):
|
|
|
|
"""Loader for pickled CYCLE datasets."""
|
|
|
|
def __init__(self):
|
|
|
|
workflow.Function.__init__(self, 'load_data', 'Cycle')
|
|
|
|
|
|
|
|
def run(self):
|
2008-12-05 23:07:56 +01:00
|
|
|
filename='laydi/data/CYCLE'
|
2007-07-05 15:20:52 +02:00
|
|
|
if filename:
|
|
|
|
return dataset.from_file(filename)
|
|
|
|
|
|
|
|
|
|
|
|
##### WORKFLOW SPECIFIC FUNCTIONS ######
|
2007-08-14 18:16:31 +02:00
|
|
|
class SAM(workflow.Function):
|
|
|
|
def __init__(self, id='sam', name='SAM'):
|
|
|
|
workflow.Function.__init__(self, id, name)
|
|
|
|
|
|
|
|
def run(self, x, y):
|
|
|
|
|
|
|
|
n_iter = 50 #B
|
|
|
|
alpha = 0.01 #cut off on qvals
|
|
|
|
|
|
|
|
###############
|
2007-07-05 15:20:52 +02:00
|
|
|
|
2007-08-14 18:16:31 +02:00
|
|
|
# Main function call
|
|
|
|
|
|
|
|
# setup prelimenaries
|
|
|
|
import rpy
|
|
|
|
rpy.r.library("siggenes")
|
|
|
|
rpy.r.library("multtest")
|
2007-07-05 15:20:52 +02:00
|
|
|
|
2007-08-22 15:40:33 +02:00
|
|
|
cl = scipy.dot(y.asarray(), scipy.diag(scipy.arange(y.shape[1]))).sum(1)
|
2007-08-14 18:16:31 +02:00
|
|
|
data = x.asarray().T
|
|
|
|
sam = rpy.r.sam(data, cl=cl, B=n_iter, var_equal=False,med=False,s0=scipy.nan,rand=scipy.nan)
|
|
|
|
qvals = scipy.asarray(rpy.r.slot(sam, "p.value"))
|
|
|
|
pvals = scipy.asarray(rpy.r.slot(sam, "q.value"))
|
|
|
|
|
|
|
|
sam_index = (qvals<alpha).nonzero()[0]
|
|
|
|
|
|
|
|
# Update selection object
|
|
|
|
dim_name = x.get_dim_name(1)
|
|
|
|
sam_selection = x.get_identifiers(dim_name, indices=sam_index)
|
|
|
|
main.project.set_selection(dim_name, sam_selection)
|
|
|
|
|
|
|
|
sel = dataset.Selection('SAM selection')
|
|
|
|
sel.select(dim_name, sam_selection)
|
|
|
|
logger.log('notice','Number of significant varibles (SAM): %s' %len(sam_selection))
|
|
|
|
|
|
|
|
# ## OUTPUT ###
|
|
|
|
xcolname = x.get_dim_name(1) # genes
|
|
|
|
x_col_ids = [xcolname, x.get_identifiers(xcolname, sorted=True)]
|
|
|
|
sing_id = ['_john', ['0']] #singleton
|
|
|
|
D_qvals = dataset.Dataset(qvals, (x_col_ids, sing_id), name='q_vals')
|
|
|
|
D_pvals = dataset.Dataset(pvals, (x_col_ids, sing_id), name='p_vals')
|
|
|
|
|
|
|
|
# plots
|
|
|
|
s_indx = qvals.flatten().argsort()
|
|
|
|
s_ids = [x_col_ids[0],[x_col_ids[1][i] for i in s_indx]]
|
|
|
|
xindex = scipy.arange(len(qvals))
|
|
|
|
qvals_s = qvals.take(s_indx)
|
|
|
|
D_qs = dataset.Dataset(qvals_s, (s_ids, sing_id), name="sorted qvals")
|
|
|
|
Dind = dataset.Dataset(xindex, (s_ids, sing_id), name="dum")
|
|
|
|
st = plots.ScatterPlot(D_qs, Dind, 'gene_ids', '_john', '0', '0', s=10, name='SAM qvals')
|
|
|
|
|
|
|
|
return [D_qvals, D_pvals, D_qs, st, sel]
|
|
|
|
|
|
|
|
|
2007-07-05 15:20:52 +02:00
|
|
|
class DiffKernelFunction(workflow.Function):
|
|
|
|
def __init__(self):
|
|
|
|
workflow.Function.__init__(self, 'diffkernel', 'Diffusion')
|
|
|
|
|
|
|
|
def run(self, x, a):
|
|
|
|
"""x is gene expression data, a is the network.
|
|
|
|
"""
|
|
|
|
#sanity check:
|
|
|
|
g = a.asnetworkx()
|
|
|
|
genes = x.get_identifiers(x.get_dim_name(1), sorted=True)
|
|
|
|
W = nx.adj_matrix(g, nodelist=genes)
|
|
|
|
X = x.asarray()
|
|
|
|
Xc, mn_x = cx_utils.mat_center(X, ret_mn=True)
|
|
|
|
out = []
|
|
|
|
alpha=1.0
|
|
|
|
beta = 1.0
|
|
|
|
K = nx_utils.K_diffusion(W, alpha=alpha, beta=beta,normalised=True)
|
|
|
|
Xp = scipy.dot(Xc, K) + mn_x
|
|
|
|
# dataset
|
|
|
|
row_ids = (x.get_dim_name(0),
|
|
|
|
x.get_identifiers(x.get_dim_name(0),
|
|
|
|
sorted=True))
|
|
|
|
col_ids = (x.get_dim_name(1),
|
|
|
|
x.get_identifiers(x.get_dim_name(1),
|
|
|
|
sorted=True))
|
|
|
|
|
|
|
|
xout = dataset.Dataset(Xp,
|
|
|
|
(row_ids, col_ids),
|
|
|
|
name=x.get_name()+'_diff'+str(alpha))
|
|
|
|
out.append(xout)
|
|
|
|
|
|
|
|
return out
|
|
|
|
|
|
|
|
|
|
|
|
class RandDiffKernelFunction(workflow.Function):
|
|
|
|
def __init__(self):
|
|
|
|
workflow.Function.__init__(self, 'diffkernel', 'Rand. Diff.')
|
|
|
|
|
|
|
|
def run(self, x, a):
|
|
|
|
"""x is gene expression data, a is the network.
|
|
|
|
"""
|
|
|
|
#sanity check:
|
|
|
|
g = a.asnetworkx()
|
|
|
|
genes = x.get_identifiers(x.get_dim_name(1))
|
|
|
|
# randomise nodelist
|
|
|
|
genes = [genes[i] for i in cx_utils.randperm(x.shape[1])]
|
|
|
|
W = nx.adj_matrix(g, nodelist=genes)
|
|
|
|
X = x.asarray()
|
|
|
|
Xc, mn_x = cx_utils.mat_center(X, ret_mn=True)
|
|
|
|
out = []
|
|
|
|
alpha=1.
|
|
|
|
beta = 1.0
|
|
|
|
K = nx_utils.K_diffusion(W, alpha=alpha, beta=beta,normalised=True)
|
|
|
|
|
|
|
|
Xp = scipy.dot(Xc, K) + mn_x
|
|
|
|
# dataset
|
|
|
|
row_ids = (x.get_dim_name(0),
|
|
|
|
x.get_identifiers(x.get_dim_name(0),
|
|
|
|
sorted=True))
|
|
|
|
col_ids = (x.get_dim_name(1),
|
|
|
|
x.get_identifiers(x.get_dim_name(1),
|
|
|
|
sorted=True))
|
|
|
|
|
|
|
|
xout = dataset.Dataset(Xp,
|
|
|
|
(row_ids, col_ids),
|
|
|
|
name=x.get_name()+'_diff'+str(alpha))
|
|
|
|
out.append(xout)
|
|
|
|
|
|
|
|
return out
|
|
|
|
|
|
|
|
|
|
|
|
class ModKernelFunction(workflow.Function):
|
|
|
|
def __init__(self):
|
|
|
|
workflow.Function.__init__(self, 'mokernel', 'Modularity')
|
|
|
|
|
|
|
|
def run(self,x,a):
|
|
|
|
X = x.asarray()
|
|
|
|
g = a.asnetworkx()
|
|
|
|
genes = x.get_identifiers(x.get_dim_name(1), sorted=True)
|
|
|
|
W = nx.adj_matrix(g, nodelist=genes)
|
|
|
|
out=[]
|
|
|
|
alpha=.2
|
|
|
|
Xc,mn_x = cx_utils.mat_center(X, ret_mn=True)
|
|
|
|
K = nx_utils.K_modularity(W, alpha=alpha)
|
|
|
|
Xp = scipy.dot(Xc, K)
|
|
|
|
Xp = Xp + mn_x
|
|
|
|
|
|
|
|
# dataset
|
|
|
|
row_ids = (x.get_dim_name(0),
|
|
|
|
x.get_identifiers(x.get_dim_name(0),
|
|
|
|
sorted=True))
|
|
|
|
col_ids = (x.get_dim_name(1),
|
|
|
|
x.get_identifiers(x.get_dim_name(1),
|
|
|
|
sorted=True))
|
|
|
|
xout = dataset.Dataset(Xp,
|
|
|
|
(row_ids,col_ids),
|
|
|
|
name=x.get_name()+'_mod'+str(alpha))
|
|
|
|
out.append(xout)
|
|
|
|
return out
|
|
|
|
|
|
|
|
|
|
|
|
class NCBIQuery(workflow.Function):
|
2007-08-22 15:40:33 +02:00
|
|
|
def __init__(self, gene_id_name='gene_ids'):
|
2007-07-05 20:49:24 +02:00
|
|
|
self._gene_id_name = gene_id_name
|
2007-07-05 15:20:52 +02:00
|
|
|
workflow.Function.__init__(self, 'query', 'NCBI')
|
|
|
|
|
2007-08-22 15:40:33 +02:00
|
|
|
def run(self):
|
|
|
|
selection = main.project.get_selection()
|
2007-07-05 20:49:24 +02:00
|
|
|
if not selection.has_key(self._gene_id_name):
|
|
|
|
logger.log("notice", "Expected gene ids: %s, but got: %s" %(self._gene_id_name, selection.keys()))
|
|
|
|
return None
|
|
|
|
if len(selection[self._gene_id_name])==0:
|
|
|
|
logger.log("notice", "No selected genes to query")
|
|
|
|
return None
|
|
|
|
|
2007-07-05 15:20:52 +02:00
|
|
|
base = 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?'
|
|
|
|
options = {r'&db=' : 'gene',
|
|
|
|
r'&cmd=' : 'retrieve',
|
|
|
|
r'&dopt=' : 'full_report'}
|
2007-07-05 20:49:24 +02:00
|
|
|
gene_str = ''.join([gene + "+" for gene in selection[self._gene_id_name]])
|
2007-07-05 15:20:52 +02:00
|
|
|
options[r'&list_uids='] = gene_str[:-1]
|
|
|
|
opt_str = ''.join([key+value for key,value in options.items()])
|
|
|
|
web_str = base + opt_str
|
|
|
|
webbrowser.open(web_str)
|
|
|
|
|
|
|
|
|
|
|
|
class KEGGQuery(workflow.Function):
|
2007-08-22 15:40:33 +02:00
|
|
|
def __init__(self, org='hsa', gene_id_name='gene_ids'):
|
2007-07-05 20:49:24 +02:00
|
|
|
self._org=org
|
|
|
|
self._gene_id_name = gene_id_name
|
2007-07-05 15:20:52 +02:00
|
|
|
workflow.Function.__init__(self, 'query', 'KEGG')
|
|
|
|
|
|
|
|
def run(self, selection):
|
2007-07-05 20:49:24 +02:00
|
|
|
if not selection.has_key(self._gene_id_name):
|
|
|
|
logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._gene_id_name, selection.keys()))
|
|
|
|
return None
|
|
|
|
if len(selection[self._gene_id_name])==0:
|
|
|
|
logger.log("notice", "No selected genes to query")
|
2007-07-05 15:20:52 +02:00
|
|
|
return None
|
|
|
|
|
|
|
|
base = r'http://www.genome.jp/dbget-bin/www_bget?'
|
2007-07-05 20:49:24 +02:00
|
|
|
gene_str = ''.join([gene + "+" for gene in selection[self._gene_id_name]])
|
2007-07-05 15:20:52 +02:00
|
|
|
gene_str = gene_str[:-1]
|
2007-07-05 20:49:24 +02:00
|
|
|
gene_str = self._org + "+" + gene_str
|
2007-07-05 15:20:52 +02:00
|
|
|
web_str = base + gene_str
|
|
|
|
webbrowser.open(web_str)
|
|
|
|
|
|
|
|
|
2007-08-14 18:16:31 +02:00
|
|
|
class GoEnrichment(workflow.Function):
|
2007-07-05 15:20:52 +02:00
|
|
|
def __init__(self):
|
2007-08-14 18:16:31 +02:00
|
|
|
workflow.Function.__init__(self, 'goenrich', 'Go Enrichment')
|
2007-07-05 15:20:52 +02:00
|
|
|
|
|
|
|
def run(self, data):
|
2007-08-14 18:16:31 +02:00
|
|
|
import rpy
|
|
|
|
rpy.r.library("GOstats")
|
|
|
|
|
|
|
|
# Get universe
|
|
|
|
# Here, we are using a defined dataset to represent the universe
|
|
|
|
if not 'gene_ids' in data:
|
2007-11-07 13:34:13 +01:00
|
|
|
logger.log('notice', 'No dimension called [gene_ids] in dataset: %s' %data.get_name())
|
2007-08-14 18:16:31 +02:00
|
|
|
return
|
|
|
|
universe = list(data.get_identifiers('gene_ids'))
|
|
|
|
logger.log('notice', 'Universe consists of %s gene ids from %s' %(len(universe), data.get_name()))
|
|
|
|
# Get current selection and validate
|
|
|
|
curr_sel = main.project.get_selection()
|
|
|
|
selected_genes = list(curr_sel['gene_ids'])
|
|
|
|
if len(selected_genes)==0:
|
|
|
|
logger.log('notice', 'This function needs a current selection!')
|
|
|
|
return
|
|
|
|
|
|
|
|
# Hypergeometric parameter object
|
|
|
|
pval_cutoff = 0.9999
|
|
|
|
cond = False
|
|
|
|
test_direction = 'over'
|
|
|
|
params = rpy.r.new("GOHyperGParams",
|
|
|
|
geneIds=selected_genes,
|
|
|
|
annotation="hgu133a",
|
|
|
|
ontology="BP",
|
|
|
|
pvalueCutoff=pval_cutoff,
|
|
|
|
conditional=cond,
|
|
|
|
testDirection=test_direction
|
|
|
|
)
|
|
|
|
# run test
|
|
|
|
# result.keys(): ['Count', 'Term', 'OddsRatio', 'Pvalue', 'ExpCount', 'GOBPID', 'Size']
|
|
|
|
result = rpy.r.summary(rpy.r.hyperGTest(params))
|
|
|
|
|
|
|
|
# dataset
|
|
|
|
terms = result['GOBPID']
|
|
|
|
pvals = scipy.log(scipy.asarray(result['Pvalue']))
|
|
|
|
row_ids = ('go-terms', terms)
|
|
|
|
col_ids = ('_john', ['_doe'])
|
|
|
|
|
|
|
|
xout = dataset.Dataset(pvals,
|
|
|
|
(row_ids, col_ids),
|
|
|
|
name='P values (enrichment)')
|
|
|
|
return [xout]
|
2007-07-05 15:20:52 +02:00
|
|
|
|
2007-08-02 13:18:48 +02:00
|
|
|
|
2007-08-14 18:16:31 +02:00
|
|
|
class GoEnrichmentCond(workflow.Function):
|
|
|
|
""" Enrichment conditioned on dag structure."""
|
2007-08-02 13:18:48 +02:00
|
|
|
def __init__(self):
|
2007-08-14 18:16:31 +02:00
|
|
|
workflow.Function.__init__(self, 'goenrich', 'Go Cond. Enrich.')
|
2007-08-02 13:18:48 +02:00
|
|
|
|
|
|
|
def run(self, data):
|
|
|
|
import rpy
|
|
|
|
rpy.r.library("GOstats")
|
|
|
|
|
|
|
|
# Get universe
|
|
|
|
# Here, we are using a defined dataset to represent the universe
|
|
|
|
if not 'gene_ids' in data:
|
|
|
|
logger.log('notice', 'No dimension called [gene_ids] in dataset: %s', data.get_name())
|
|
|
|
return
|
|
|
|
universe = list(data.get_identifiers('gene_ids'))
|
2007-08-14 18:16:31 +02:00
|
|
|
logger.log('notice', 'Universe consists of %s gene ids from %s' %(len(universe), data.get_name()))
|
2007-08-02 13:18:48 +02:00
|
|
|
# Get current selection and validate
|
|
|
|
curr_sel = main.project.get_selection()
|
|
|
|
selected_genes = list(curr_sel['gene_ids'])
|
|
|
|
if len(selected_genes)==0:
|
|
|
|
logger.log('notice', 'This function needs a current selection!')
|
|
|
|
return
|
|
|
|
|
|
|
|
# Hypergeometric parameter object
|
2007-11-07 13:34:13 +01:00
|
|
|
pval_cutoff = 1
|
2007-08-14 18:16:31 +02:00
|
|
|
cond = True
|
2007-08-02 13:18:48 +02:00
|
|
|
test_direction = 'over'
|
|
|
|
params = rpy.r.new("GOHyperGParams",
|
|
|
|
geneIds=selected_genes,
|
|
|
|
annotation="hgu133a",
|
|
|
|
ontology="BP",
|
|
|
|
pvalueCutoff=pval_cutoff,
|
|
|
|
conditional=cond,
|
|
|
|
testDirection=test_direction
|
|
|
|
)
|
|
|
|
# run test
|
|
|
|
# result.keys(): ['Count', 'Term', 'OddsRatio', 'Pvalue', 'ExpCount', 'GOBPID', 'Size']
|
|
|
|
result = rpy.r.summary(rpy.r.hyperGTest(params))
|
|
|
|
|
|
|
|
# dataset
|
|
|
|
terms = result['GOBPID']
|
|
|
|
pvals = scipy.log(scipy.asarray(result['Pvalue']))
|
|
|
|
row_ids = ('go-terms', terms)
|
|
|
|
col_ids = ('_john', ['_doe'])
|
|
|
|
|
|
|
|
xout = dataset.Dataset(pvals,
|
|
|
|
(row_ids, col_ids),
|
|
|
|
name='P values (enrichment)')
|
|
|
|
return [xout]
|
2007-08-14 18:16:31 +02:00
|
|
|
|
2007-11-07 13:34:13 +01:00
|
|
|
|
|
|
|
class MapGene2GO(workflow.Function):
|
|
|
|
def __init__(self, ont='bp', gene_id_name='gene_ids'):
|
|
|
|
self._ont = ont
|
|
|
|
self._gene_id_name = gene_id_name
|
|
|
|
workflow.Function.__init__(self, 'gene2go', 'gene->GO')
|
|
|
|
# load data at init
|
|
|
|
try:
|
2008-12-05 23:07:56 +01:00
|
|
|
fname = "/home/flatberg/laydi/data/gene2go.pcl"
|
2007-11-07 13:34:13 +01:00
|
|
|
self._gene2go = cPickle.load(open(fname))
|
|
|
|
except:
|
|
|
|
logger.log("notice", "could not load mapping")
|
|
|
|
|
|
|
|
def run(self):
|
|
|
|
selection = main.project.get_selection()
|
|
|
|
if not selection.has_key(self._gene_id_name):
|
|
|
|
logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._gene_id_name, selection.keys()))
|
|
|
|
return None
|
|
|
|
if len(selection[self._gene_id_name])==0:
|
|
|
|
logger.log("notice", "No selected genes to query")
|
|
|
|
return None
|
|
|
|
|
|
|
|
gene_ids = selection[self._gene_id_name]
|
|
|
|
go_ids = set()
|
|
|
|
for gene in gene_ids:
|
|
|
|
go_ids_new = self._gene2go.get(gene, [])
|
|
|
|
if not go_ids_new:
|
|
|
|
logger.log("notice", "Could not find any goterms for %s" %gene)
|
|
|
|
go_ids.update(self._gene2go.get(gene, []))
|
|
|
|
main.project.set_selection('go-terms', go_ids)
|
|
|
|
logger.log("notice", "GO terms updated")
|
|
|
|
|
|
|
|
|
|
|
|
class MapGO2Gene(workflow.Function):
|
|
|
|
def __init__(self, ont='bp', gene_id_name='go-terms'):
|
|
|
|
self._ont = ont
|
|
|
|
self._gene_id_name = gene_id_name
|
|
|
|
workflow.Function.__init__(self, 'go2gene', 'GO->gene')
|
|
|
|
# load data at init
|
|
|
|
try:
|
2008-12-05 23:07:56 +01:00
|
|
|
fname = "/home/flatberg/laydi/data/go2gene.pcl"
|
2007-11-07 13:34:13 +01:00
|
|
|
self._go2gene = cPickle.load(open(fname))
|
|
|
|
except:
|
|
|
|
logger.log("notice", "could not load mapping")
|
|
|
|
|
|
|
|
def run(self):
|
|
|
|
selection = main.project.get_selection()
|
|
|
|
if not selection.has_key(self._gene_id_name):
|
|
|
|
logger.log("notice", "Expected gene ids: %s, but got. %s" %(self._gene_id_name, selection.keys()))
|
|
|
|
return None
|
|
|
|
if len(selection[self._gene_id_name])==0:
|
|
|
|
logger.log("notice", "No selected genes to query")
|
|
|
|
return None
|
|
|
|
|
|
|
|
go_ids = selection[self._gene_id_name]
|
|
|
|
gene_ids = set()
|
|
|
|
for go in go_ids:
|
|
|
|
if not self._go2gene.get(go,[]):
|
|
|
|
logger.log("notice", "Could not find any gene ids for %s" %go)
|
|
|
|
gene_ids.update(self._go2gene.get(go,[]))
|
|
|
|
main.project.set_selection('gene_ids', gene_ids)
|
|
|
|
logger.log("notice", "GO terms updated")
|