laydi/fluents/lib/hypergeom.py

import scipy

try:
    # FIXME: remove rpy in a more proper way
    import rpy_does_not_exist
    has_rpy = True
    silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
except:
    has_rpy = False

def gene_hypergeo_test(selection, category_dataset):
    """Returns the pvals from a hypergeometric test of significance.

    input:
           -- selection: list of selected identifiers along 0 dim of cat.set
           -- category dataset, categories along dim 1 (cols)
    """
    gene_dim_name = category_dataset.get_dim_name(0)
    category_dim_name = category_dataset.get_dim_name(1)

    #categories
    all_cats = category_dataset.get_identifiers(category_dim_name, sorted=True)

    # gene_ids universe
    all_genes = category_dataset.get_identifiers(gene_dim_name)

    # signifcant genes
    good_genes_all = list(selection)
    gg_index = category_dataset.get_indices(gene_dim_name, good_genes_all)

    # significant genes pr. category
    good_genes_cat = []
    for col in category_dataset.asarray().T:
        index = scipy.where(col==1)[0]
        index = scipy.intersect1d(index, gg_index)
        if index.size==0:
            good_genes_cat.append([])
        else:
            good_genes_cat.append(category_dataset.get_identifiers(gene_dim_name, index))
    count = map(len, good_genes_cat)
    count = scipy.asarray([max(i, 0) for i in count])
    cat_count = category_dataset.asarray().sum(0)
    if has_rpy:
        rpy.r.assign("x", count - 1) #number of sign. genes in category i
        rpy.r.assign("m", len(good_genes_all)) # number of sign. genes tot
        rpy.r.assign("n", len(all_genes)-len(good_genes_all) ) # num. genes not sign.
        rpy.r.assign("k", cat_count) #num. genes in cat i
        silent_eval('pvals <- phyper(x, m, n, k, lower.tail=FALSE)')
        pvals = rpy.r("pvals")

    else:
        pvals = p_hyper_geom(count, len(good_genes_all),
                             len(all_genes)-len(good_genes_all),
                             cat_count)

    pvals = scipy.where(cat_count==0, 2, pvals)
    pvals = scipy.where(scipy.isnan(pvals), 2, pvals)
    out = {}
    for i in range(pvals.size):
        out[str(all_cats[i])] = (count[i], cat_count[i], pvals[i])
    return out


def p_hyper_geom(x, m, n, k):
    """Distribution function for the hypergeometric distribution.

    Inputs:
           -- x: vector of quantiles representing the number of white balls
           drawn without replacement from an urn which contains both
           black and white balls.
           -- m: the number of white balls in the urn.
           -- n: the number of black balls in the urn.
           -- k: [vector] the number of balls drawn from the urn

    Comments:
    Similar to R's phyper with lower.tail=FALSE

    """

    M = m + n
    multiple_draws = False
    if isinstance(k, scipy.ndarray) and k.size>1:
        multiple_draws = True
        n_draws = k.size
        if n_draws<x.size:
            print "n_draws: %d and n_found: %d  Length mismatch, zero padded" %(k.size, x.size)
    N = k
    n = m
    if not multiple_draws:
        out = scipy.stats.hypergeom.pmf(x, M, n, N).cumsum()
    else:
        out = scipy.zeros((max(n_draws, x.size),))
        for i in xrange(N.size):
            out[i] = scipy.stats.hypergeom.pmf(x, M, n, N[i]).cumsum()[i]
    return out