94 lines
3.2 KiB
Python
94 lines
3.2 KiB
Python
|
import scipy
|
||
|
|
||
|
try:
|
||
|
import rpy
|
||
|
has_rpy = True
|
||
|
silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
|
||
|
except:
|
||
|
has_rpy = False
|
||
|
|
||
|
def gene_hypergeo_test(selection, category_dataset):
|
||
|
"""Returns the pvals from a hypergeometric test of significance.
|
||
|
|
||
|
input:
|
||
|
-- selection: list of selected identifiers along 0 dim of cat.set
|
||
|
-- category dataset, categories along dim 1 (cols)
|
||
|
"""
|
||
|
gene_dim_name = category_dataset.get_dim_name(0)
|
||
|
category_dim_name = category_dataset.get_dim_name(1)
|
||
|
|
||
|
#categories
|
||
|
all_cats = category_dataset.get_identifiers(category_dim_name, sorted=True)
|
||
|
|
||
|
# gene_ids universe
|
||
|
all_genes = category_dataset.get_identifiers(gene_dim_name)
|
||
|
|
||
|
# signifcant genes
|
||
|
good_genes_all = list(selection)
|
||
|
gg_index = category_dataset.get_indices(gene_dim_name, good_genes_all)
|
||
|
|
||
|
# significant genes pr. category
|
||
|
good_genes_cat = []
|
||
|
for col in category_dataset.asarray().T:
|
||
|
index = scipy.where(col==1)[0]
|
||
|
index = scipy.intersect1d(index, gg_index)
|
||
|
if index.size==0:
|
||
|
good_genes_cat.append([])
|
||
|
else:
|
||
|
good_genes_cat.append(category_dataset.get_identifiers(gene_dim_name, index))
|
||
|
count = map(len, good_genes_cat)
|
||
|
count = scipy.asarray([max(i, 0) for i in count])
|
||
|
cat_count = category_dataset.asarray().sum(0)
|
||
|
if has_rpy:
|
||
|
rpy.r.assign("x", count - 1) #number of sign. genes in category i
|
||
|
rpy.r.assign("m", len(good_genes_all)) # number of sign. genes tot
|
||
|
rpy.r.assign("n", len(all_genes)-len(good_genes_all) ) # num. genes not sign.
|
||
|
rpy.r.assign("k", cat_count) #num. genes in cat i
|
||
|
silent_eval('pvals <- phyper(x, m, n, k, lower.tail=FALSE)')
|
||
|
pvals = rpy.r("pvals")
|
||
|
|
||
|
else:
|
||
|
pvals = p_hyper_geom(count, len(good_genes_all),
|
||
|
len(all_genes)-len(good_genes_all),
|
||
|
cat_count)
|
||
|
|
||
|
pvals = scipy.where(cat_count==0, 2, pvals)
|
||
|
out = {}
|
||
|
for i in range(pvals.size):
|
||
|
out[str(all_cats[i])] = (count[i], cat_count[i], pvals[i])
|
||
|
return out
|
||
|
|
||
|
|
||
|
def p_hyper_geom(x, m, n, k):
|
||
|
"""Distribution function for the hypergeometric distribution.
|
||
|
|
||
|
Inputs:
|
||
|
-- x: vector of quantiles representing the number of white balls
|
||
|
drawn without replacement from an urn which contains both
|
||
|
black and white balls.
|
||
|
-- m: the number of white balls in the urn.
|
||
|
-- n: the number of black balls in the urn.
|
||
|
-- k: [vector] the number of balls drawn from the urn
|
||
|
|
||
|
Comments:
|
||
|
Similar to R's phyper with lower.tail=FALSE
|
||
|
|
||
|
"""
|
||
|
|
||
|
M = m + n
|
||
|
multiple_draws = False
|
||
|
if isinstance(k, scipy.ndarray) and k.size>1:
|
||
|
multiple_draws = True
|
||
|
n_draws = k.size
|
||
|
if n_draws<x.size:
|
||
|
print "n_draws: %d and n_found: %d Length mismatch, zero padded" %(k.size, x.size)
|
||
|
N = k
|
||
|
n = m
|
||
|
if not multiple_draws:
|
||
|
out = scipy.stats.hypergeom.pmf(x, M, n, N).cumsum()
|
||
|
else:
|
||
|
out = scipy.zeros((max(n_draws, x.size),))
|
||
|
for i in xrange(N.size):
|
||
|
out[i] = scipy.stats.hypergeom.pmf(x, M, n, N[i]).cumsum()[i]
|
||
|
return out
|