laydi/fluents/lib/R_utils.py

"""A collection of functions that use R.

Most functions use libraries from bioconductor

depends on:
(not updated)
-- bioconductor min. install
-- hgu133a
-- hgu133plus2

"""

import scipy
import Numeric as N
import rpy
silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)

def get_locusid(probelist=None,org="hgu133a"):
    """Returns a dictionary of locus link id for each affy probeset
    and reverse mapping

    innput:
    [probelist] -- probelist of affy probesets
    [org] -- chip type (organism)

    out:
    aff2loc, loc2aff

    The mapping is one-to-one for affy->locus_id
    However, there are several affy probesets for one locus_id

    From bioc-mail-archive: BioC takes the GeneBank ids associated
    with the probes (provided by the manufacture) and then maps them
    to Entrez Gene ids using data from UniGene, Entrez Gene, and other
    available data sources we trust. The Entrez Gene id a probe is
    assigned to is determined by votes from all the sources used. If
    there is no agreement among the sources, we take the smallest
    Entrez Gene id.
    """
    silent_eval("library("+org+")")
    silent_eval('locus_ids = as.list('+org+'LOCUSID)')
    silent_eval('pp<-as.list(locus_ids[!is.na(locus_ids)])')
    loc_ids = rpy.r("pp")
    for id in loc_ids:
        loc_ids[id] = str(loc_ids[id])
        
    aff2loc = {}
    if probelist:
        for pid in probelist:
            try:
                aff2loc[pid]=loc_ids[pid]
            except:
                print "Affy probeset: %s has no locus id" %pid
        print "\nCONVERSION SUMMARY:\n \
        Number of probesets input %s \n \
        Number of translated locus ids: %s \n \
        Number of missings: %s" %(len(probelist),len(aff2loc),len(probelist)-len(aff2loc))
    else:
        aff2loc = loc_ids
    # reverse mapping
    loc2aff = {}
    for k,v in aff2loc.items():
        if loc2aff.has_key(v):
            loc2aff[v].append(k)
        else:
            loc2aff[v]=[k]
    
    return aff2loc,loc2aff

def get_kegg_paths(org="hgu133plus2",id_type='aff',probelist=None):
    """Returns a dictionary of KEGG maps.

    input:
             org  --  chip_type (see bioconductor.org)
             id_type -- id ['aff','loc']
    
    key: affy_id, value = list of kegg map id
    example: '65884_at': ['00510', '00513']
    """
    silent_eval("library("+org+")")
    silent_eval('xx<-as.list('+org+'PATH)')
    silent_eval('xp <- xx[!is.na(xx)]')
    aff2path = rpy.r("xp")
    dummy = rpy.r("xx")
    
    if id_type=='loc':
        aff2loc,loc2aff = get_locusid(org=org)
        loc2path = {}
        for id,path in aff2path.items():
            if loc2path.has_key(id):
                pp = [path.append(i) for i in loc2path[id]]
                print "Found duplicate in path: %s" %path
            loc2path[aff2loc[id]]=path
        aff2path = loc2path
    out = {}

    if probelist:
        for pid in probelist:
            try:
                out[pid]=aff2path[pid]
            except:
                print "Could not find id: %s" %pid
    else:
        out = aff2path
    for k,v in out.items():
        # if string convert tol list
        try:
            v + ''
            out[k] = [v]
        except:
            out[k] = v
            
    return out

def get_probe_list(org="hgu133plus2"):
    rpy.r.library(org)
    silent_eval('probe_list<-ls('+org+'ACCNUM )')
    pl = rpy.r("probe_list")
    return pl

def get_GO_from_aff(org="hgu133plus2",id_type='aff',probelist=None):
    """Returns a dictionary of GO terms.

    input:
             org  --  chip_type (see bioconductor.org)
             id_type -- id ['aff','loc']
    
    key: 
    example: '65884_at': 
    """
    silent_eval("library("+org+")")
    silent_eval('xx<-as.list('+org+'GO)')
    silent_eval('xp <- xx[!is.na(xx)]')
    aff2path = rpy.r("xp")
    dummy = rpy.r("xx")
    if id_type=='loc':
        LOC = get_locusid(org=org)
        loc2path = {}
        for id,path in aff2path.items():
            if loc2path.has_key(id):
                pp = [path.append(i) for i in loc2path[id]]
                print "Found duplicate in path: %s" %path
            loc2path[LOC[id]]=path
        aff2path = loc2path
    out = {}
    if probelist:
        for pid in probelist:
            try:
                out[pid]=aff2path[pid]
            except:
                print "Could not find id: %s" %pid
    return aff2path

def get_kegg_as_category(org="hgu133plus2",id_type='aff',probelist=None):
    """Returns kegg pathway memberships in dummy (1/0) matrix (genes x maps)
    
    """
    kegg = get_kegg_paths(org=org, id_type=id_type, probelist=probelist)
    maps = set()
    for kpth in kegg.values():
        maps.update(kpth)
    
    n_maps = len(maps)
    n_genes = len(kegg)
    gene2index = dict(zip(kegg.keys(), range(n_genes)))
    map2index = dict(zip(maps, range(n_maps)))
    C = scipy.zeros((n_genes, n_maps))
    for k,v in kegg.items():
        for m in v:
            C[gene2index[k], map2index[m]]=1
        
    return C, list(maps), kegg.keys()

def impute(X, k=10, rowmax=0.5, colmax=0.8, maxp=1500, seed=362436069):
    """
    A function to impute missing expression data, using nearest
    neighbor averaging. (from bioconductors impute)

    input:

    data: An expression matrix with genes in the rows, samples in the
          columns

       k: Number of neighbors to be used in the imputation (default=10)

    rowmax: The maximum percent missing data allowed in any row (default
          50%). For any rows with more than 'rowmax'% missing are
          imputed using the overall mean per sample.

    colmax: The maximum percent missing data allowed in any column
          (default 80%). If any column has more than 'colmax'% missing
          data, the program halts and reports an error.

    maxp: The largest block of genes imputed using the knn algorithm
          inside 'impute.knn' (default 1500); larger blocks are divided
          by two-means clustering (recursively) prior to imputation. If
          'maxp=p', only knn imputation is done

    seed: The seed used for the random number generator (default
          362436069) for reproducibility.

    
    call:
    impute(data ,k = 10, rowmax = 0.5, colmax = 0.8, maxp = 1500, rng.seed=362436069)
    """
    
    rpy.r.library("impute")
    X = N.asarray(X) # cast as numeric array
    m, n = scipy.shape(X)
    if m>n:
        print "Warning (impute): more samples than variables. running transpose"
        t_flag = True
    else:
        X = N.transpose(X)
        t_flag = False
    
    rpy.r.assign("X", X)
    rpy.r.assign("k", k)
    rpy.r.assign("rmax", rowmax)
    rpy.r.assign("cmax", colmax)
    rpy.r.assign("maxp", maxp)

    call = "out<-impute.knn(X,k=k,rowmax=rmax,colmax=cmax,maxp=maxp)"
    silent_eval(call)
    out = rpy.r("out")
    if not t_flag:
        E = out['data']
        E = scipy.asarray(E)
        E = E.T
    else:
        E =  out['data']
        E = scipy.asarray(E)
    return E


def get_chip_annotation(org="hgu133a",annot='pmid', id_type='loc',probelist=None):
    """Returns a dictionary of annoations.

    input:
             org  --  chip_type (see bioconductor.org)
             annot -- annotation ['genename', 'pmid', ' symbol']
             id_type -- id ['aff','loc']
             
    
    key: id, value = list of annoations
    example: '65884_at': ['15672394', '138402']
    """
    _valid_annot = ['genename', 'pmid', 'symbol', 'enzyme', 'chr', 'chrloc']
    if annot.lower() not in _valid_annot:
        raise ValueError("Annotation must be one of %s" %_valid_annot)
    silent_eval("library("+org+")")
    silent_eval("dummy<-as.list("+org+annot.upper()+")")
    silent_eval('annotations <- dummy[!is.na(dummy)]')
    aff2annot = rpy.r("annotations")
    if id_type=='loc':
        aff2loc, loc2aff = get_locusid(org=org)
        loc2annot = {}
        for geneid, annotation in aff2annot.items():
            annotation = ensure_list(annotation)
            print annotation
            if loc2annot.has_key(geneid):
                for extra in loc2annot[geneid]:
                    annotation.append(extra) 
                print "Found duplicate in gene: %s" %geneid
            loc2annot[aff2loc[geneid]] = annotation
        aff2annot = loc2annot

    out = {}
    if probelist:
        for pid in probelist:
            try:
                out[pid] = aff2annot.get(pid, 'none')
            except:
                print "Could not find id: %s" %pid
    else:
        out = aff2annot
            
    return out

def ensure_list(value):
    if isinstance(value, list):
        return value
    else:
        return [value]
some biocondoctur utilities 2007-05-06 15:51:06 +02:00			`"""A collection of functions that use R.`

			`Most functions use libraries from bioconductor`

			`depends on:`
			`(not updated)`
			`-- bioconductor min. install`
			`-- hgu133a`
			`-- hgu133plus2`

			`"""`

			`import scipy`
			`import Numeric as N`
			`import rpy`
			`silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)`

			`def get_locusid(probelist=None,org="hgu133a"):`
			`"""Returns a dictionary of locus link id for each affy probeset`
			`and reverse mapping`

			`innput:`
			`[probelist] -- probelist of affy probesets`
			`[org] -- chip type (organism)`

			`out:`
			`aff2loc, loc2aff`

			`The mapping is one-to-one for affy->locus_id`
			`However, there are several affy probesets for one locus_id`

			`From bioc-mail-archive: BioC takes the GeneBank ids associated`
			`with the probes (provided by the manufacture) and then maps them`
			`to Entrez Gene ids using data from UniGene, Entrez Gene, and other`
			`available data sources we trust. The Entrez Gene id a probe is`
			`assigned to is determined by votes from all the sources used. If`
			`there is no agreement among the sources, we take the smallest`
			`Entrez Gene id.`
			`"""`
			`silent_eval("library("+org+")")`
			`silent_eval('locus_ids = as.list('+org+'LOCUSID)')`
			`silent_eval('pp<-as.list(locus_ids[!is.na(locus_ids)])')`
			`loc_ids = rpy.r("pp")`
			`for id in loc_ids:`
			`loc_ids[id] = str(loc_ids[id])`

			`aff2loc = {}`
			`if probelist:`
			`for pid in probelist:`
			`try:`
			`aff2loc[pid]=loc_ids[pid]`
			`except:`
			`print "Affy probeset: %s has no locus id" %pid`
			`print "\nCONVERSION SUMMARY:\n \`
			`Number of probesets input %s \n \`
			`Number of translated locus ids: %s \n \`
			`Number of missings: %s" %(len(probelist),len(aff2loc),len(probelist)-len(aff2loc))`
			`else:`
			`aff2loc = loc_ids`
			`# reverse mapping`
			`loc2aff = {}`
			`for k,v in aff2loc.items():`
			`if loc2aff.has_key(v):`
			`loc2aff[v].append(k)`
			`else:`
			`loc2aff[v]=[k]`

			`return aff2loc,loc2aff`

			`def get_kegg_paths(org="hgu133plus2",id_type='aff',probelist=None):`
			`"""Returns a dictionary of KEGG maps.`

			`input:`
			`org -- chip_type (see bioconductor.org)`
			`id_type -- id ['aff','loc']`

			`key: affy_id, value = list of kegg map id`
			`example: '65884_at': ['00510', '00513']`
			`"""`
			`silent_eval("library("+org+")")`
			`silent_eval('xx<-as.list('+org+'PATH)')`
			`silent_eval('xp <- xx[!is.na(xx)]')`
			`aff2path = rpy.r("xp")`
			`dummy = rpy.r("xx")`

			`if id_type=='loc':`
			`aff2loc,loc2aff = get_locusid(org=org)`
			`loc2path = {}`
			`for id,path in aff2path.items():`
			`if loc2path.has_key(id):`
			`pp = [path.append(i) for i in loc2path[id]]`
			`print "Found duplicate in path: %s" %path`
			`loc2path[aff2loc[id]]=path`
			`aff2path = loc2path`
			`out = {}`

			`if probelist:`
			`for pid in probelist:`
			`try:`
			`out[pid]=aff2path[pid]`
			`except:`
			`print "Could not find id: %s" %pid`
			`else:`
			`out = aff2path`
			`for k,v in out.items():`
			`# if string convert tol list`
			`try:`
			`v + ''`
			`out[k] = [v]`
			`except:`
			`out[k] = v`

			`return out`

			`def get_probe_list(org="hgu133plus2"):`
			`rpy.r.library(org)`
			`silent_eval('probe_list<-ls('+org+'ACCNUM )')`
			`pl = rpy.r("probe_list")`
			`return pl`

			`def get_GO_from_aff(org="hgu133plus2",id_type='aff',probelist=None):`
			`"""Returns a dictionary of GO terms.`

			`input:`
			`org -- chip_type (see bioconductor.org)`
			`id_type -- id ['aff','loc']`

			`key:`
			`example: '65884_at':`
			`"""`
			`silent_eval("library("+org+")")`
			`silent_eval('xx<-as.list('+org+'GO)')`
			`silent_eval('xp <- xx[!is.na(xx)]')`
			`aff2path = rpy.r("xp")`
			`dummy = rpy.r("xx")`
			`if id_type=='loc':`
			`LOC = get_locusid(org=org)`
			`loc2path = {}`
			`for id,path in aff2path.items():`
			`if loc2path.has_key(id):`
			`pp = [path.append(i) for i in loc2path[id]]`
			`print "Found duplicate in path: %s" %path`
			`loc2path[LOC[id]]=path`
			`aff2path = loc2path`
			`out = {}`
			`if probelist:`
			`for pid in probelist:`
			`try:`
			`out[pid]=aff2path[pid]`
			`except:`
			`print "Could not find id: %s" %pid`
			`return aff2path`

			`def get_kegg_as_category(org="hgu133plus2",id_type='aff',probelist=None):`
			`"""Returns kegg pathway memberships in dummy (1/0) matrix (genes x maps)`

			`"""`
			`kegg = get_kegg_paths(org=org, id_type=id_type, probelist=probelist)`
			`maps = set()`
			`for kpth in kegg.values():`
			`maps.update(kpth)`

			`n_maps = len(maps)`
			`n_genes = len(kegg)`
			`gene2index = dict(zip(kegg.keys(), range(n_genes)))`
			`map2index = dict(zip(maps, range(n_maps)))`
			`C = scipy.zeros((n_genes, n_maps))`
			`for k,v in kegg.items():`
			`for m in v:`
			`C[gene2index[k], map2index[m]]=1`

			`return C, list(maps), kegg.keys()`

			`def impute(X, k=10, rowmax=0.5, colmax=0.8, maxp=1500, seed=362436069):`
			`"""`
			`A function to impute missing expression data, using nearest`
			`neighbor averaging. (from bioconductors impute)`

			`input:`

			`data: An expression matrix with genes in the rows, samples in the`
			`columns`

			`k: Number of neighbors to be used in the imputation (default=10)`

			`rowmax: The maximum percent missing data allowed in any row (default`
			`50%). For any rows with more than 'rowmax'% missing are`
			`imputed using the overall mean per sample.`

			`colmax: The maximum percent missing data allowed in any column`
			`(default 80%). If any column has more than 'colmax'% missing`
			`data, the program halts and reports an error.`

			`maxp: The largest block of genes imputed using the knn algorithm`
			`inside 'impute.knn' (default 1500); larger blocks are divided`
			`by two-means clustering (recursively) prior to imputation. If`
			`'maxp=p', only knn imputation is done`

			`seed: The seed used for the random number generator (default`
			`362436069) for reproducibility.`


			`call:`
			`impute(data ,k = 10, rowmax = 0.5, colmax = 0.8, maxp = 1500, rng.seed=362436069)`
			`"""`

			`rpy.r.library("impute")`
			`X = N.asarray(X) # cast as numeric array`
			`m, n = scipy.shape(X)`
			`if m>n:`
			`print "Warning (impute): more samples than variables. running transpose"`
			`t_flag = True`
			`else:`
			`X = N.transpose(X)`
			`t_flag = False`

			`rpy.r.assign("X", X)`
			`rpy.r.assign("k", k)`
			`rpy.r.assign("rmax", rowmax)`
			`rpy.r.assign("cmax", colmax)`
			`rpy.r.assign("maxp", maxp)`

			`call = "out<-impute.knn(X,k=k,rowmax=rmax,colmax=cmax,maxp=maxp)"`
			`silent_eval(call)`
			`out = rpy.r("out")`
			`if not t_flag:`
			`E = out['data']`
			`E = scipy.asarray(E)`
			`E = E.T`
			`else:`
			`E = out['data']`
			`E = scipy.asarray(E)`
			`return E`


			`def get_chip_annotation(org="hgu133a",annot='pmid', id_type='loc',probelist=None):`
			`"""Returns a dictionary of annoations.`

			`input:`
			`org -- chip_type (see bioconductor.org)`
			`annot -- annotation ['genename', 'pmid', ' symbol']`
			`id_type -- id ['aff','loc']`


			`key: id, value = list of annoations`
			`example: '65884_at': ['15672394', '138402']`
			`"""`
			`_valid_annot = ['genename', 'pmid', 'symbol', 'enzyme', 'chr', 'chrloc']`
			`if annot.lower() not in _valid_annot:`
			`raise ValueError("Annotation must be one of %s" %_valid_annot)`
			`silent_eval("library("+org+")")`
			`silent_eval("dummy<-as.list("+org+annot.upper()+")")`
			`silent_eval('annotations <- dummy[!is.na(dummy)]')`
			`aff2annot = rpy.r("annotations")`
			`if id_type=='loc':`
			`aff2loc, loc2aff = get_locusid(org=org)`
			`loc2annot = {}`
			`for geneid, annotation in aff2annot.items():`
			`annotation = ensure_list(annotation)`
			`print annotation`
			`if loc2annot.has_key(geneid):`
			`for extra in loc2annot[geneid]:`
			`annotation.append(extra)`
			`print "Found duplicate in gene: %s" %geneid`
			`loc2annot[aff2loc[geneid]] = annotation`
			`aff2annot = loc2annot`

			`out = {}`
			`if probelist:`
			`for pid in probelist:`
			`try:`
			`out[pid] = aff2annot.get(pid, 'none')`
			`except:`
			`print "Could not find id: %s" %pid`
			`else:`
			`out = aff2annot`

			`return out`

			`def ensure_list(value):`
			`if isinstance(value, list):`
			`return value`
			`else:`
			`return [value]`