import scipy try: import rpy has_rpy = True silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r) except: has_rpy = False def gene_hypergeo_test(selection, category_dataset): """Returns the pvals from a hypergeometric test of significance. input: -- selection: list of selected identifiers along 0 dim of cat.set -- category dataset, categories along dim 1 (cols) """ gene_dim_name = category_dataset.get_dim_name(0) category_dim_name = category_dataset.get_dim_name(1) #categories all_cats = category_dataset.get_identifiers(category_dim_name, sorted=True) # gene_ids universe all_genes = category_dataset.get_identifiers(gene_dim_name) # signifcant genes good_genes_all = list(selection) gg_index = category_dataset.get_indices(gene_dim_name, good_genes_all) # significant genes pr. category good_genes_cat = [] for col in category_dataset.asarray().T: index = scipy.where(col==1)[0] index = scipy.intersect1d(index, gg_index) if index.size==0: good_genes_cat.append([]) else: good_genes_cat.append(category_dataset.get_identifiers(gene_dim_name, index)) count = map(len, good_genes_cat) count = scipy.asarray([max(i, 0) for i in count]) cat_count = category_dataset.asarray().sum(0) if has_rpy: rpy.r.assign("x", count - 1) #number of sign. genes in category i rpy.r.assign("m", len(good_genes_all)) # number of sign. genes tot rpy.r.assign("n", len(all_genes)-len(good_genes_all) ) # num. genes not sign. rpy.r.assign("k", cat_count) #num. genes in cat i silent_eval('pvals <- phyper(x, m, n, k, lower.tail=FALSE)') pvals = rpy.r("pvals") else: pvals = p_hyper_geom(count, len(good_genes_all), len(all_genes)-len(good_genes_all), cat_count) pvals = scipy.where(cat_count==0, 2, pvals) out = {} for i in range(pvals.size): out[str(all_cats[i])] = (count[i], cat_count[i], pvals[i]) return out def p_hyper_geom(x, m, n, k): """Distribution function for the hypergeometric distribution. Inputs: -- x: vector of quantiles representing the number of white balls drawn without replacement from an urn which contains both black and white balls. -- m: the number of white balls in the urn. -- n: the number of black balls in the urn. -- k: [vector] the number of balls drawn from the urn Comments: Similar to R's phyper with lower.tail=FALSE """ M = m + n multiple_draws = False if isinstance(k, scipy.ndarray) and k.size>1: multiple_draws = True n_draws = k.size if n_draws