hypergeometric test (scipy version has errors)
This commit is contained in:
parent
48165d1aed
commit
8c03338b75
|
@ -0,0 +1,93 @@
|
|||
import scipy
|
||||
|
||||
try:
|
||||
import rpy
|
||||
has_rpy = True
|
||||
silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
|
||||
except:
|
||||
has_rpy = False
|
||||
|
||||
def gene_hypergeo_test(selection, category_dataset):
|
||||
"""Returns the pvals from a hypergeometric test of significance.
|
||||
|
||||
input:
|
||||
-- selection: list of selected identifiers along 0 dim of cat.set
|
||||
-- category dataset, categories along dim 1 (cols)
|
||||
"""
|
||||
gene_dim_name = category_dataset.get_dim_name(0)
|
||||
category_dim_name = category_dataset.get_dim_name(1)
|
||||
|
||||
#categories
|
||||
all_cats = category_dataset.get_identifiers(category_dim_name, sorted=True)
|
||||
|
||||
# gene_ids universe
|
||||
all_genes = category_dataset.get_identifiers(gene_dim_name)
|
||||
|
||||
# signifcant genes
|
||||
good_genes_all = list(selection)
|
||||
gg_index = category_dataset.get_indices(gene_dim_name, good_genes_all)
|
||||
|
||||
# significant genes pr. category
|
||||
good_genes_cat = []
|
||||
for col in category_dataset.asarray().T:
|
||||
index = scipy.where(col==1)[0]
|
||||
index = scipy.intersect1d(index, gg_index)
|
||||
if index.size==0:
|
||||
good_genes_cat.append([])
|
||||
else:
|
||||
good_genes_cat.append(category_dataset.get_identifiers(gene_dim_name, index))
|
||||
count = map(len, good_genes_cat)
|
||||
count = scipy.asarray([max(i, 0) for i in count])
|
||||
cat_count = category_dataset.asarray().sum(0)
|
||||
if has_rpy:
|
||||
rpy.r.assign("x", count - 1) #number of sign. genes in category i
|
||||
rpy.r.assign("m", len(good_genes_all)) # number of sign. genes tot
|
||||
rpy.r.assign("n", len(all_genes)-len(good_genes_all) ) # num. genes not sign.
|
||||
rpy.r.assign("k", cat_count) #num. genes in cat i
|
||||
silent_eval('pvals <- phyper(x, m, n, k, lower.tail=FALSE)')
|
||||
pvals = rpy.r("pvals")
|
||||
|
||||
else:
|
||||
pvals = p_hyper_geom(count, len(good_genes_all),
|
||||
len(all_genes)-len(good_genes_all),
|
||||
cat_count)
|
||||
|
||||
pvals = scipy.where(cat_count==0, 2, pvals)
|
||||
out = {}
|
||||
for i in range(pvals.size):
|
||||
out[str(all_cats[i])] = (count[i], cat_count[i], pvals[i])
|
||||
return out
|
||||
|
||||
|
||||
def p_hyper_geom(x, m, n, k):
|
||||
"""Distribution function for the hypergeometric distribution.
|
||||
|
||||
Inputs:
|
||||
-- x: vector of quantiles representing the number of white balls
|
||||
drawn without replacement from an urn which contains both
|
||||
black and white balls.
|
||||
-- m: the number of white balls in the urn.
|
||||
-- n: the number of black balls in the urn.
|
||||
-- k: [vector] the number of balls drawn from the urn
|
||||
|
||||
Comments:
|
||||
Similar to R's phyper with lower.tail=FALSE
|
||||
|
||||
"""
|
||||
|
||||
M = m + n
|
||||
multiple_draws = False
|
||||
if isinstance(k, scipy.ndarray) and k.size>1:
|
||||
multiple_draws = True
|
||||
n_draws = k.size
|
||||
if n_draws<x.size:
|
||||
print "n_draws: %d and n_found: %d Length mismatch, zero padded" %(k.size, x.size)
|
||||
N = k
|
||||
n = m
|
||||
if not multiple_draws:
|
||||
out = scipy.stats.hypergeom.pmf(x, M, n, N).cumsum()
|
||||
else:
|
||||
out = scipy.zeros((max(n_draws, x.size),))
|
||||
for i in xrange(N.size):
|
||||
out[i] = scipy.stats.hypergeom.pmf(x, M, n, N[i]).cumsum()[i]
|
||||
return out
|
Reference in New Issue