some biocondoctur utilities

2007-05-06 13:51:06 +00:00
parent f1301be67a
commit ebe7621817
1 changed files with 284 additions and 0 deletions
--- a/fluents/lib/R_utils.py
+++ b/fluents/lib/R_utils.py
@@ -0,0 +1,284 @@
+"""A collection of functions that use R.
+
+Most functions use libraries from bioconductor
+
+depends on:
+(not updated)
+-- bioconductor min. install
+-- hgu133a
+-- hgu133plus2
+
+"""
+
+import scipy
+import Numeric as N
+import rpy
+silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
+
+def get_locusid(probelist=None,org="hgu133a"):
+    """Returns a dictionary of locus link id for each affy probeset
+    and reverse mapping
+
+    innput:
+    [probelist] -- probelist of affy probesets
+    [org] -- chip type (organism)
+
+    out:
+    aff2loc, loc2aff
+
+    The mapping is one-to-one for affy->locus_id
+    However, there are several affy probesets for one locus_id
+
+    From bioc-mail-archive: BioC takes the GeneBank ids associated
+    with the probes (provided by the manufacture) and then maps them
+    to Entrez Gene ids using data from UniGene, Entrez Gene, and other
+    available data sources we trust. The Entrez Gene id a probe is
+    assigned to is determined by votes from all the sources used. If
+    there is no agreement among the sources, we take the smallest
+    Entrez Gene id.
+    """
+    silent_eval("library("+org+")")
+    silent_eval('locus_ids = as.list('+org+'LOCUSID)')
+    silent_eval('pp<-as.list(locus_ids[!is.na(locus_ids)])')
+    loc_ids = rpy.r("pp")
+    for id in loc_ids:
+        loc_ids[id] = str(loc_ids[id])
+        
+    aff2loc = {}
+    if probelist:
+        for pid in probelist:
+            try:
+                aff2loc[pid]=loc_ids[pid]
+            except:
+                print "Affy probeset: %s has no locus id" %pid
+        print "\nCONVERSION SUMMARY:\n \
+        Number of probesets input %s \n \
+        Number of translated locus ids: %s \n \
+        Number of missings: %s" %(len(probelist),len(aff2loc),len(probelist)-len(aff2loc))
+    else:
+        aff2loc = loc_ids
+    # reverse mapping
+    loc2aff = {}
+    for k,v in aff2loc.items():
+        if loc2aff.has_key(v):
+            loc2aff[v].append(k)
+        else:
+            loc2aff[v]=[k]
+    
+    return aff2loc,loc2aff
+
+def get_kegg_paths(org="hgu133plus2",id_type='aff',probelist=None):
+    """Returns a dictionary of KEGG maps.
+
+    input:
+             org  --  chip_type (see bioconductor.org)
+             id_type -- id ['aff','loc']
+    
+    key: affy_id, value = list of kegg map id
+    example: '65884_at': ['00510', '00513']
+    """
+    silent_eval("library("+org+")")
+    silent_eval('xx<-as.list('+org+'PATH)')
+    silent_eval('xp <- xx[!is.na(xx)]')
+    aff2path = rpy.r("xp")
+    dummy = rpy.r("xx")
+    
+    if id_type=='loc':
+        aff2loc,loc2aff = get_locusid(org=org)
+        loc2path = {}
+        for id,path in aff2path.items():
+            if loc2path.has_key(id):
+                pp = [path.append(i) for i in loc2path[id]]
+                print "Found duplicate in path: %s" %path
+            loc2path[aff2loc[id]]=path
+        aff2path = loc2path
+    out = {}
+
+    if probelist:
+        for pid in probelist:
+            try:
+                out[pid]=aff2path[pid]
+            except:
+                print "Could not find id: %s" %pid
+    else:
+        out = aff2path
+    for k,v in out.items():
+        # if string convert tol list
+        try:
+            v + ''
+            out[k] = [v]
+        except:
+            out[k] = v
+            
+    return out
+
+def get_probe_list(org="hgu133plus2"):
+    rpy.r.library(org)
+    silent_eval('probe_list<-ls('+org+'ACCNUM )')
+    pl = rpy.r("probe_list")
+    return pl
+
+def get_GO_from_aff(org="hgu133plus2",id_type='aff',probelist=None):
+    """Returns a dictionary of GO terms.
+
+    input:
+             org  --  chip_type (see bioconductor.org)
+             id_type -- id ['aff','loc']
+    
+    key: 
+    example: '65884_at': 
+    """
+    silent_eval("library("+org+")")
+    silent_eval('xx<-as.list('+org+'GO)')
+    silent_eval('xp <- xx[!is.na(xx)]')
+    aff2path = rpy.r("xp")
+    dummy = rpy.r("xx")
+    if id_type=='loc':
+        LOC = get_locusid(org=org)
+        loc2path = {}
+        for id,path in aff2path.items():
+            if loc2path.has_key(id):
+                pp = [path.append(i) for i in loc2path[id]]
+                print "Found duplicate in path: %s" %path
+            loc2path[LOC[id]]=path
+        aff2path = loc2path
+    out = {}
+    if probelist:
+        for pid in probelist:
+            try:
+                out[pid]=aff2path[pid]
+            except:
+                print "Could not find id: %s" %pid
+    return aff2path
+
+def get_kegg_as_category(org="hgu133plus2",id_type='aff',probelist=None):
+    """Returns kegg pathway memberships in dummy (1/0) matrix (genes x maps)
+    
+    """
+    kegg = get_kegg_paths(org=org, id_type=id_type, probelist=probelist)
+    maps = set()
+    for kpth in kegg.values():
+        maps.update(kpth)
+    
+    n_maps = len(maps)
+    n_genes = len(kegg)
+    gene2index = dict(zip(kegg.keys(), range(n_genes)))
+    map2index = dict(zip(maps, range(n_maps)))
+    C = scipy.zeros((n_genes, n_maps))
+    for k,v in kegg.items():
+        for m in v:
+            C[gene2index[k], map2index[m]]=1
+        
+    return C, list(maps), kegg.keys()
+
+def impute(X, k=10, rowmax=0.5, colmax=0.8, maxp=1500, seed=362436069):
+    """
+    A function to impute missing expression data, using nearest
+    neighbor averaging. (from bioconductors impute)
+
+    input:
+
+    data: An expression matrix with genes in the rows, samples in the
+          columns
+
+       k: Number of neighbors to be used in the imputation (default=10)
+
+    rowmax: The maximum percent missing data allowed in any row (default
+          50%). For any rows with more than 'rowmax'% missing are
+          imputed using the overall mean per sample.
+
+    colmax: The maximum percent missing data allowed in any column
+          (default 80%). If any column has more than 'colmax'% missing
+          data, the program halts and reports an error.
+
+    maxp: The largest block of genes imputed using the knn algorithm
+          inside 'impute.knn' (default 1500); larger blocks are divided
+          by two-means clustering (recursively) prior to imputation. If
+          'maxp=p', only knn imputation is done
+
+    seed: The seed used for the random number generator (default
+          362436069) for reproducibility.
+
+    
+    call:
+    impute(data ,k = 10, rowmax = 0.5, colmax = 0.8, maxp = 1500, rng.seed=362436069)
+    """
+    
+    rpy.r.library("impute")
+    X = N.asarray(X) # cast as numeric array
+    m, n = scipy.shape(X)
+    if m>n:
+        print "Warning (impute): more samples than variables. running transpose"
+        t_flag = True
+    else:
+        X = N.transpose(X)
+        t_flag = False
+    
+    rpy.r.assign("X", X)
+    rpy.r.assign("k", k)
+    rpy.r.assign("rmax", rowmax)
+    rpy.r.assign("cmax", colmax)
+    rpy.r.assign("maxp", maxp)
+
+    call = "out<-impute.knn(X,k=k,rowmax=rmax,colmax=cmax,maxp=maxp)"
+    silent_eval(call)
+    out = rpy.r("out")
+    if not t_flag:
+        E = out['data']
+        E = scipy.asarray(E)
+        E = E.T
+    else:
+        E =  out['data']
+        E = scipy.asarray(E)
+    return E
+
+
+def get_chip_annotation(org="hgu133a",annot='pmid', id_type='loc',probelist=None):
+    """Returns a dictionary of annoations.
+
+    input:
+             org  --  chip_type (see bioconductor.org)
+             annot -- annotation ['genename', 'pmid', ' symbol']
+             id_type -- id ['aff','loc']
+             
+    
+    key: id, value = list of annoations
+    example: '65884_at': ['15672394', '138402']
+    """
+    _valid_annot = ['genename', 'pmid', 'symbol', 'enzyme', 'chr', 'chrloc']
+    if annot.lower() not in _valid_annot:
+        raise ValueError("Annotation must be one of %s" %_valid_annot)
+    silent_eval("library("+org+")")
+    silent_eval("dummy<-as.list("+org+annot.upper()+")")
+    silent_eval('annotations <- dummy[!is.na(dummy)]')
+    aff2annot = rpy.r("annotations")
+    if id_type=='loc':
+        aff2loc, loc2aff = get_locusid(org=org)
+        loc2annot = {}
+        for geneid, annotation in aff2annot.items():
+            annotation = ensure_list(annotation)
+            print annotation
+            if loc2annot.has_key(geneid):
+                for extra in loc2annot[geneid]:
+                    annotation.append(extra) 
+                print "Found duplicate in gene: %s" %geneid
+            loc2annot[aff2loc[geneid]] = annotation
+        aff2annot = loc2annot
+
+    out = {}
+    if probelist:
+        for pid in probelist:
+            try:
+                out[pid] = aff2annot.get(pid, 'none')
+            except:
+                print "Could not find id: %s" %pid
+    else:
+        out = aff2annot
+            
+    return out
+
+def ensure_list(value):
+    if isinstance(value, list):
+        return value
+    else:
+        return [value]