laydi/fluents/lib/R_utils.py

"""A collection of functions that use R.

Most functions rely on packages from Bioconductor.

Depends on (this list is not kept up to date):
-- Bioconductor minimal install
-- hgu133a
-- hgu133plus2

"""

import scipy
import Numeric as N
import rpy
# R evaluator used for statements whose result is not needed on the Python
# side: it wraps rpy.r so that it runs in rpy.NO_CONVERSION mode.
# NOTE(review): presumably this avoids converting large R objects to Python
# on every call -- confirm against the rpy documentation.
silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)

def get_locusid(probelist=None,org="hgu133a"):
    """Returns a dictionary of locus link id for each affy probeset
    and the reverse mapping.

    input:
    [probelist] -- probelist of affy probesets. When empty or None the
                   mapping for every probeset with a locus id is returned.
    [org] -- chip type (organism); name of the R annotation library

    out:
    aff2loc, loc2aff
        aff2loc -- dict: affy probeset id -> locus id (as string)
        loc2aff -- dict: locus id -> list of affy probeset ids

    The mapping is one-to-one for affy->locus_id
    However, there are several affy probesets for one locus_id

    Probesets from probelist that have no locus id are reported on stdout
    and left out of the result.

    From bioc-mail-archive: BioC takes the GeneBank ids associated
    with the probes (provided by the manufacture) and then maps them
    to Entrez Gene ids using data from UniGene, Entrez Gene, and other
    available data sources we trust. The Entrez Gene id a probe is
    assigned to is determined by votes from all the sources used. If
    there is no agreement among the sources, we take the smallest
    Entrez Gene id.
    """
    silent_eval("library("+org+")")
    silent_eval('locus_ids = as.list('+org+'LOCUSID)')
    # Drop probesets without a locus id (NA) on the R side.
    silent_eval('pp<-as.list(locus_ids[!is.na(locus_ids)])')
    loc_ids = rpy.r("pp")
    # Store every locus id as a string, whatever type rpy handed back.
    for probe_id in list(loc_ids):
        loc_ids[probe_id] = str(loc_ids[probe_id])

    if probelist:
        aff2loc = {}
        for pid in probelist:
            try:
                aff2loc[pid] = loc_ids[pid]
            except KeyError:
                print("Affy probeset: %s has no locus id" % pid)
        # Same text (including its indentation) as printed historically.
        summary = ("\nCONVERSION SUMMARY:\n "
                   "        Number of probesets input %s \n "
                   "        Number of translated locus ids: %s \n "
                   "        Number of missings: %s")
        print(summary % (len(probelist), len(aff2loc),
                         len(probelist) - len(aff2loc)))
    else:
        aff2loc = loc_ids

    # reverse mapping: one locus id may be shared by several probesets
    loc2aff = {}
    for probe_id, locus_id in aff2loc.items():
        loc2aff.setdefault(locus_id, []).append(probe_id)

    return aff2loc,loc2aff

def _as_path_list(value):
    """Return [value] when value is string-like, otherwise value unchanged."""
    try:
        # Only strings support concatenation with ''.
        value + ''
    except TypeError:
        return value
    return [value]


def get_kegg_paths(org="hgu133plus2",id_type='aff',probelist=None):
    """Returns a dictionary of KEGG maps.

    input:
             org  --  chip_type (see bioconductor.org)
             id_type -- id ['aff','loc']
             probelist -- optional list of ids (of the kind selected by
                          id_type); when empty or None all ids are returned

    key: affy_id (or locus id when id_type='loc'),
    value = list of kegg map id
    example: '65884_at': ['00510', '00513']

    With id_type='loc' the maps of all probesets that share a locus id are
    merged (without repeats); probesets that have no locus id are skipped.
    Ids from probelist that have no KEGG map are reported on stdout and
    left out of the result.
    """
    silent_eval("library("+org+")")
    silent_eval('xx<-as.list('+org+'PATH)')
    # Keep only the probesets that actually have a pathway annotation.
    silent_eval('xp <- xx[!is.na(xx)]')
    aff2path = rpy.r("xp")

    if id_type=='loc':
        aff2loc,loc2aff = get_locusid(org=org)
        loc2path = {}
        for probe_id, path in aff2path.items():
            locus_id = aff2loc.get(probe_id)
            if locus_id is None:
                # No locus id known for this probeset: nothing to key on.
                continue
            # Several probesets can map to one locus id, so accumulate
            # under the locus id instead of overwriting.
            merged = loc2path.setdefault(locus_id, [])
            for map_id in _as_path_list(path):
                if map_id not in merged:
                    merged.append(map_id)
        aff2path = loc2path

    if probelist:
        out = {}
        for pid in probelist:
            try:
                out[pid] = aff2path[pid]
            except KeyError:
                print("Could not find id: %s" % pid)
    else:
        out = aff2path

    # A single map id comes back as a bare string: wrap it so that every
    # value is a list.
    for key in list(out):
        out[key] = _as_path_list(out[key])

    return out

def get_probe_list(org="hgu133plus2"):
    """Returns the names listed by R's ls() for the <org>ACCNUM object.

    input:
             org  --  chip_type; name of the R library to load

    NOTE(review): presumably these names are the probeset ids of the
    chip -- confirm against the annotation package.
    """
    rpy.r.library(org)
    # Build the listing on the R side first, then fetch it with conversion.
    command = 'probe_list<-ls(' + org + 'ACCNUM )'
    silent_eval(command)
    return rpy.r("probe_list")

def _merge_go_terms(existing, new):
    """Combine two GO annotation values into one, without repeats."""
    if isinstance(existing, dict) and isinstance(new, dict):
        merged = dict(existing)
        merged.update(new)
        return merged
    if isinstance(existing, list):
        merged = list(existing)
    else:
        merged = [existing]
    if not isinstance(new, list):
        new = [new]
    for term in new:
        if term not in merged:
            merged.append(term)
    return merged


def get_GO_from_aff(org="hgu133plus2",id_type='aff',probelist=None):
    """Returns a dictionary of GO terms.

    input:
             org  --  chip_type (see bioconductor.org)
             id_type -- id ['aff','loc']
             probelist -- optional list of ids (of the kind selected by
                          id_type); when empty or None all ids are returned

    key: affy_id (or locus id when id_type='loc')
    value: the GO annotation as converted by rpy
    NOTE(review): the exact shape of the converted GO value (dict keyed by
    GO id vs. list) is not visible here -- confirm against rpy output.

    With id_type='loc' the annotations of all probesets that share a locus
    id are merged; probesets that have no locus id are skipped.
    Ids from probelist without GO annotation are reported on stdout and
    left out of the result.
    """
    silent_eval("library("+org+")")
    silent_eval('xx<-as.list('+org+'GO)')
    # Keep only the probesets that actually have a GO annotation.
    silent_eval('xp <- xx[!is.na(xx)]')
    aff2go = rpy.r("xp")

    if id_type=='loc':
        # get_locusid returns a pair; only the affy -> locus half is needed.
        aff2loc, loc2aff = get_locusid(org=org)
        loc2go = {}
        for probe_id, terms in aff2go.items():
            locus_id = aff2loc.get(probe_id)
            if locus_id is None:
                # No locus id known for this probeset: nothing to key on.
                continue
            if locus_id in loc2go:
                loc2go[locus_id] = _merge_go_terms(loc2go[locus_id], terms)
            else:
                loc2go[locus_id] = terms
        aff2go = loc2go

    if not probelist:
        return aff2go

    out = {}
    for pid in probelist:
        try:
            out[pid] = aff2go[pid]
        except KeyError:
            print("Could not find id: %s" % pid)
    return out

def get_kegg_as_category(org="hgu133plus2",id_type='aff',probelist=None):
    """Returns kegg pathway memberships as a dummy (1/0) matrix.

    The arguments are passed unchanged to get_kegg_paths.

    out:
    C, maps, genes
        C     -- array (genes x maps), 1 where the gene is in the map
        maps  -- list of map ids, in column order of C
        genes -- the gene ids (keys of the kegg dictionary), in row order
    """
    kegg = get_kegg_paths(org=org, id_type=id_type, probelist=probelist)

    # Collect every map id that occurs for any gene.
    maps = set()
    for members in kegg.values():
        for map_id in members:
            maps.add(map_id)

    # Row/column positions follow the iteration order of kegg and maps.
    gene2index = {}
    for row, gene in enumerate(kegg.keys()):
        gene2index[gene] = row
    map2index = {}
    for col, map_id in enumerate(maps):
        map2index[map_id] = col

    C = scipy.zeros((len(kegg), len(maps)))
    for gene, members in kegg.items():
        row = gene2index[gene]
        for map_id in members:
            C[row, map2index[map_id]]=1

    return C, list(maps), kegg.keys()

def impute(X, k=10, rowmax=0.5, colmax=0.8, maxp=1500, seed=362436069):
    """
    A function to impute missing expression data, using nearest
    neighbor averaging. (from bioconductors impute)

    input:

       X: A two-dimensional expression matrix. When X has at most as many
          rows as columns it is transposed before it is handed to R and
          the result is transposed back; otherwise it is passed as is.
          Either way the result has the orientation of the input.

       k: Number of neighbors to be used in the imputation (default=10)

    rowmax: The maximum percent missing data allowed in any row (default
          50%). For any rows with more than 'rowmax'% missing are
          imputed using the overall mean per sample.

    colmax: The maximum percent missing data allowed in any column
          (default 80%). If any column has more than 'colmax'% missing
          data, the program halts and reports an error.

    maxp: The largest block of genes imputed using the knn algorithm
          inside 'impute.knn' (default 1500); larger blocks are divided
          by two-means clustering (recursively) prior to imputation. If
          'maxp=p', only knn imputation is done

    seed: The seed used for the random number generator (default
          362436069) for reproducibility. Passed to impute.knn as
          'rng.seed'.

    out:
    The 'data' element of the impute.knn result as an array.

    call:
    impute(data ,k = 10, rowmax = 0.5, colmax = 0.8, maxp = 1500, rng.seed=362436069)
    """

    rpy.r.library("impute")
    X = N.asarray(X) # cast as numeric array
    m, n = scipy.shape(X)
    if m>n:
        # NOTE(review): the message says "running transpose" although this
        # branch passes X through untransposed -- confirm the intended
        # wording/orientation before changing it.
        print("Warning (impute): more samples than variables. running transpose")
        transposed = False
    else:
        X = N.transpose(X)
        transposed = True

    rpy.r.assign("X", X)
    rpy.r.assign("k", k)
    rpy.r.assign("rmax", rowmax)
    rpy.r.assign("cmax", colmax)
    rpy.r.assign("maxp", maxp)
    # The seed used to be accepted but never reached R; forward it.
    rpy.r.assign("seed", seed)

    call = "out<-impute.knn(X,k=k,rowmax=rmax,colmax=cmax,maxp=maxp,rng.seed=seed)"
    silent_eval(call)
    out = rpy.r("out")
    E = scipy.asarray(out['data'])
    if transposed:
        # Undo the transpose applied before the call.
        E = E.T
    return E


def get_chip_annotation(org="hgu133a",annot='pmid', id_type='loc',probelist=None):
    """Returns a dictionary of annotations.

    input:
             org  --  chip_type (see bioconductor.org)
             annot -- annotation, one of ['genename', 'pmid', 'symbol',
                      'enzyme', 'chr', 'chrloc'] (case-insensitive)
             id_type -- id ['aff','loc']
             probelist -- optional list of ids (of the kind selected by
                          id_type); when empty or None all ids are returned

    key: id, value = list of annotations (with id_type='aff' the value is
    whatever rpy returned for that probeset)
    example: '65884_at': ['15672394', '138402']

    With id_type='loc' the annotations of all probesets that share a locus
    id are merged (without repeats); probesets that have no locus id are
    skipped.
    Ids from probelist that have no annotation get the value 'none'.

    raises:
    ValueError -- when annot is not one of the valid annotations
    """
    _valid_annot = ['genename', 'pmid', 'symbol', 'enzyme', 'chr', 'chrloc']
    if annot.lower() not in _valid_annot:
        raise ValueError("Annotation must be one of %s" %_valid_annot)
    silent_eval("library("+org+")")
    # The R objects are named <org><ANNOT>, e.g. hgu133aPMID.
    silent_eval("dummy<-as.list("+org+annot.upper()+")")
    silent_eval('annotations <- dummy[!is.na(dummy)]')
    aff2annot = rpy.r("annotations")

    if id_type=='loc':
        aff2loc, loc2aff = get_locusid(org=org)
        loc2annot = {}
        for probe_id, annotation in aff2annot.items():
            locus_id = aff2loc.get(probe_id)
            if locus_id is None:
                # No locus id known for this probeset: nothing to key on.
                continue
            # Several probesets can map to one locus id, so accumulate
            # under the locus id instead of overwriting.
            merged = loc2annot.setdefault(locus_id, [])
            for item in ensure_list(annotation):
                if item not in merged:
                    merged.append(item)
        aff2annot = loc2annot

    if not probelist:
        return aff2annot

    out = {}
    for pid in probelist:
        try:
            out[pid] = aff2annot.get(pid, 'none')
        except TypeError:
            # Unhashable id: cannot be looked up at all.
            print("Could not find id: %s" % pid)
    return out

def ensure_list(value):
    """Return value itself when it is a list, otherwise [value]."""
    if not isinstance(value, list):
        # Anything else (string, number, tuple, None, ...) is wrapped whole.
        return [value]
    return value