hypergeometric test (scipy version has errors)

This commit is contained in:
Arnar Flatberg 2007-02-21 13:03:51 +00:00
parent 48165d1aed
commit 8c03338b75

93
fluents/lib/hypergeom.py Normal file
View File

@ -0,0 +1,93 @@
import scipy
try:
import rpy
has_rpy = True
silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
except:
has_rpy = False
def gene_hypergeo_test(selection, category_dataset):
"""Returns the pvals from a hypergeometric test of significance.
input:
-- selection: list of selected identifiers along 0 dim of cat.set
-- category dataset, categories along dim 1 (cols)
"""
gene_dim_name = category_dataset.get_dim_name(0)
category_dim_name = category_dataset.get_dim_name(1)
#categories
all_cats = category_dataset.get_identifiers(category_dim_name, sorted=True)
# gene_ids universe
all_genes = category_dataset.get_identifiers(gene_dim_name)
# signifcant genes
good_genes_all = list(selection)
gg_index = category_dataset.get_indices(gene_dim_name, good_genes_all)
# significant genes pr. category
good_genes_cat = []
for col in category_dataset.asarray().T:
index = scipy.where(col==1)[0]
index = scipy.intersect1d(index, gg_index)
if index.size==0:
good_genes_cat.append([])
else:
good_genes_cat.append(category_dataset.get_identifiers(gene_dim_name, index))
count = map(len, good_genes_cat)
count = scipy.asarray([max(i, 0) for i in count])
cat_count = category_dataset.asarray().sum(0)
if has_rpy:
rpy.r.assign("x", count - 1) #number of sign. genes in category i
rpy.r.assign("m", len(good_genes_all)) # number of sign. genes tot
rpy.r.assign("n", len(all_genes)-len(good_genes_all) ) # num. genes not sign.
rpy.r.assign("k", cat_count) #num. genes in cat i
silent_eval('pvals <- phyper(x, m, n, k, lower.tail=FALSE)')
pvals = rpy.r("pvals")
else:
pvals = p_hyper_geom(count, len(good_genes_all),
len(all_genes)-len(good_genes_all),
cat_count)
pvals = scipy.where(cat_count==0, 2, pvals)
out = {}
for i in range(pvals.size):
out[str(all_cats[i])] = (count[i], cat_count[i], pvals[i])
return out
def p_hyper_geom(x, m, n, k):
"""Distribution function for the hypergeometric distribution.
Inputs:
-- x: vector of quantiles representing the number of white balls
drawn without replacement from an urn which contains both
black and white balls.
-- m: the number of white balls in the urn.
-- n: the number of black balls in the urn.
-- k: [vector] the number of balls drawn from the urn
Comments:
Similar to R's phyper with lower.tail=FALSE
"""
M = m + n
multiple_draws = False
if isinstance(k, scipy.ndarray) and k.size>1:
multiple_draws = True
n_draws = k.size
if n_draws<x.size:
print "n_draws: %d and n_found: %d Length mismatch, zero padded" %(k.size, x.size)
N = k
n = m
if not multiple_draws:
out = scipy.stats.hypergeom.pmf(x, M, n, N).cumsum()
else:
out = scipy.zeros((max(n_draws, x.size),))
for i in xrange(N.size):
out[i] = scipy.stats.hypergeom.pmf(x, M, n, N[i]).cumsum()[i]
return out