some biocondoctur utilities
This commit is contained in:
		
							
								
								
									
										284
									
								
								fluents/lib/R_utils.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										284
									
								
								fluents/lib/R_utils.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,284 @@
 | 
			
		||||
"""A collection of functions that use R.
 | 
			
		||||
 | 
			
		||||
Most functions use libraries from bioconductor
 | 
			
		||||
 | 
			
		||||
depends on:
 | 
			
		||||
(not updated)
 | 
			
		||||
-- bioconductor min. install
 | 
			
		||||
-- hgu133a
 | 
			
		||||
-- hgu133plus2
 | 
			
		||||
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
import scipy
 | 
			
		||||
import Numeric as N
 | 
			
		||||
import rpy
 | 
			
		||||
silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
 | 
			
		||||
 | 
			
		||||
def get_locusid(probelist=None,org="hgu133a"):
 | 
			
		||||
    """Returns a dictionary of locus link id for each affy probeset
 | 
			
		||||
    and reverse mapping
 | 
			
		||||
 | 
			
		||||
    innput:
 | 
			
		||||
    [probelist] -- probelist of affy probesets
 | 
			
		||||
    [org] -- chip type (organism)
 | 
			
		||||
 | 
			
		||||
    out:
 | 
			
		||||
    aff2loc, loc2aff
 | 
			
		||||
 | 
			
		||||
    The mapping is one-to-one for affy->locus_id
 | 
			
		||||
    However, there are several affy probesets for one locus_id
 | 
			
		||||
 | 
			
		||||
    From bioc-mail-archive: BioC takes the GeneBank ids associated
 | 
			
		||||
    with the probes (provided by the manufacture) and then maps them
 | 
			
		||||
    to Entrez Gene ids using data from UniGene, Entrez Gene, and other
 | 
			
		||||
    available data sources we trust. The Entrez Gene id a probe is
 | 
			
		||||
    assigned to is determined by votes from all the sources used. If
 | 
			
		||||
    there is no agreement among the sources, we take the smallest
 | 
			
		||||
    Entrez Gene id.
 | 
			
		||||
    """
 | 
			
		||||
    silent_eval("library("+org+")")
 | 
			
		||||
    silent_eval('locus_ids = as.list('+org+'LOCUSID)')
 | 
			
		||||
    silent_eval('pp<-as.list(locus_ids[!is.na(locus_ids)])')
 | 
			
		||||
    loc_ids = rpy.r("pp")
 | 
			
		||||
    for id in loc_ids:
 | 
			
		||||
        loc_ids[id] = str(loc_ids[id])
 | 
			
		||||
        
 | 
			
		||||
    aff2loc = {}
 | 
			
		||||
    if probelist:
 | 
			
		||||
        for pid in probelist:
 | 
			
		||||
            try:
 | 
			
		||||
                aff2loc[pid]=loc_ids[pid]
 | 
			
		||||
            except:
 | 
			
		||||
                print "Affy probeset: %s has no locus id" %pid
 | 
			
		||||
        print "\nCONVERSION SUMMARY:\n \
 | 
			
		||||
        Number of probesets input %s \n \
 | 
			
		||||
        Number of translated locus ids: %s \n \
 | 
			
		||||
        Number of missings: %s" %(len(probelist),len(aff2loc),len(probelist)-len(aff2loc))
 | 
			
		||||
    else:
 | 
			
		||||
        aff2loc = loc_ids
 | 
			
		||||
    # reverse mapping
 | 
			
		||||
    loc2aff = {}
 | 
			
		||||
    for k,v in aff2loc.items():
 | 
			
		||||
        if loc2aff.has_key(v):
 | 
			
		||||
            loc2aff[v].append(k)
 | 
			
		||||
        else:
 | 
			
		||||
            loc2aff[v]=[k]
 | 
			
		||||
    
 | 
			
		||||
    return aff2loc,loc2aff
 | 
			
		||||
 | 
			
		||||
def get_kegg_paths(org="hgu133plus2",id_type='aff',probelist=None):
 | 
			
		||||
    """Returns a dictionary of KEGG maps.
 | 
			
		||||
 | 
			
		||||
    input:
 | 
			
		||||
             org  --  chip_type (see bioconductor.org)
 | 
			
		||||
             id_type -- id ['aff','loc']
 | 
			
		||||
    
 | 
			
		||||
    key: affy_id, value = list of kegg map id
 | 
			
		||||
    example: '65884_at': ['00510', '00513']
 | 
			
		||||
    """
 | 
			
		||||
    silent_eval("library("+org+")")
 | 
			
		||||
    silent_eval('xx<-as.list('+org+'PATH)')
 | 
			
		||||
    silent_eval('xp <- xx[!is.na(xx)]')
 | 
			
		||||
    aff2path = rpy.r("xp")
 | 
			
		||||
    dummy = rpy.r("xx")
 | 
			
		||||
    
 | 
			
		||||
    if id_type=='loc':
 | 
			
		||||
        aff2loc,loc2aff = get_locusid(org=org)
 | 
			
		||||
        loc2path = {}
 | 
			
		||||
        for id,path in aff2path.items():
 | 
			
		||||
            if loc2path.has_key(id):
 | 
			
		||||
                pp = [path.append(i) for i in loc2path[id]]
 | 
			
		||||
                print "Found duplicate in path: %s" %path
 | 
			
		||||
            loc2path[aff2loc[id]]=path
 | 
			
		||||
        aff2path = loc2path
 | 
			
		||||
    out = {}
 | 
			
		||||
 | 
			
		||||
    if probelist:
 | 
			
		||||
        for pid in probelist:
 | 
			
		||||
            try:
 | 
			
		||||
                out[pid]=aff2path[pid]
 | 
			
		||||
            except:
 | 
			
		||||
                print "Could not find id: %s" %pid
 | 
			
		||||
    else:
 | 
			
		||||
        out = aff2path
 | 
			
		||||
    for k,v in out.items():
 | 
			
		||||
        # if string convert tol list
 | 
			
		||||
        try:
 | 
			
		||||
            v + ''
 | 
			
		||||
            out[k] = [v]
 | 
			
		||||
        except:
 | 
			
		||||
            out[k] = v
 | 
			
		||||
            
 | 
			
		||||
    return out
 | 
			
		||||
 | 
			
		||||
def get_probe_list(org="hgu133plus2"):
 | 
			
		||||
    rpy.r.library(org)
 | 
			
		||||
    silent_eval('probe_list<-ls('+org+'ACCNUM )')
 | 
			
		||||
    pl = rpy.r("probe_list")
 | 
			
		||||
    return pl
 | 
			
		||||
 | 
			
		||||
def get_GO_from_aff(org="hgu133plus2",id_type='aff',probelist=None):
 | 
			
		||||
    """Returns a dictionary of GO terms.
 | 
			
		||||
 | 
			
		||||
    input:
 | 
			
		||||
             org  --  chip_type (see bioconductor.org)
 | 
			
		||||
             id_type -- id ['aff','loc']
 | 
			
		||||
    
 | 
			
		||||
    key: 
 | 
			
		||||
    example: '65884_at': 
 | 
			
		||||
    """
 | 
			
		||||
    silent_eval("library("+org+")")
 | 
			
		||||
    silent_eval('xx<-as.list('+org+'GO)')
 | 
			
		||||
    silent_eval('xp <- xx[!is.na(xx)]')
 | 
			
		||||
    aff2path = rpy.r("xp")
 | 
			
		||||
    dummy = rpy.r("xx")
 | 
			
		||||
    if id_type=='loc':
 | 
			
		||||
        LOC = get_locusid(org=org)
 | 
			
		||||
        loc2path = {}
 | 
			
		||||
        for id,path in aff2path.items():
 | 
			
		||||
            if loc2path.has_key(id):
 | 
			
		||||
                pp = [path.append(i) for i in loc2path[id]]
 | 
			
		||||
                print "Found duplicate in path: %s" %path
 | 
			
		||||
            loc2path[LOC[id]]=path
 | 
			
		||||
        aff2path = loc2path
 | 
			
		||||
    out = {}
 | 
			
		||||
    if probelist:
 | 
			
		||||
        for pid in probelist:
 | 
			
		||||
            try:
 | 
			
		||||
                out[pid]=aff2path[pid]
 | 
			
		||||
            except:
 | 
			
		||||
                print "Could not find id: %s" %pid
 | 
			
		||||
    return aff2path
 | 
			
		||||
 | 
			
		||||
def get_kegg_as_category(org="hgu133plus2",id_type='aff',probelist=None):
 | 
			
		||||
    """Returns kegg pathway memberships in dummy (1/0) matrix (genes x maps)
 | 
			
		||||
    
 | 
			
		||||
    """
 | 
			
		||||
    kegg = get_kegg_paths(org=org, id_type=id_type, probelist=probelist)
 | 
			
		||||
    maps = set()
 | 
			
		||||
    for kpth in kegg.values():
 | 
			
		||||
        maps.update(kpth)
 | 
			
		||||
    
 | 
			
		||||
    n_maps = len(maps)
 | 
			
		||||
    n_genes = len(kegg)
 | 
			
		||||
    gene2index = dict(zip(kegg.keys(), range(n_genes)))
 | 
			
		||||
    map2index = dict(zip(maps, range(n_maps)))
 | 
			
		||||
    C = scipy.zeros((n_genes, n_maps))
 | 
			
		||||
    for k,v in kegg.items():
 | 
			
		||||
        for m in v:
 | 
			
		||||
            C[gene2index[k], map2index[m]]=1
 | 
			
		||||
        
 | 
			
		||||
    return C, list(maps), kegg.keys()
 | 
			
		||||
 | 
			
		||||
def impute(X, k=10, rowmax=0.5, colmax=0.8, maxp=1500, seed=362436069):
 | 
			
		||||
    """
 | 
			
		||||
    A function to impute missing expression data, using nearest
 | 
			
		||||
    neighbor averaging. (from bioconductors impute)
 | 
			
		||||
 | 
			
		||||
    input:
 | 
			
		||||
 | 
			
		||||
    data: An expression matrix with genes in the rows, samples in the
 | 
			
		||||
          columns
 | 
			
		||||
 | 
			
		||||
       k: Number of neighbors to be used in the imputation (default=10)
 | 
			
		||||
 | 
			
		||||
    rowmax: The maximum percent missing data allowed in any row (default
 | 
			
		||||
          50%). For any rows with more than 'rowmax'% missing are
 | 
			
		||||
          imputed using the overall mean per sample.
 | 
			
		||||
 | 
			
		||||
    colmax: The maximum percent missing data allowed in any column
 | 
			
		||||
          (default 80%). If any column has more than 'colmax'% missing
 | 
			
		||||
          data, the program halts and reports an error.
 | 
			
		||||
 | 
			
		||||
    maxp: The largest block of genes imputed using the knn algorithm
 | 
			
		||||
          inside 'impute.knn' (default 1500); larger blocks are divided
 | 
			
		||||
          by two-means clustering (recursively) prior to imputation. If
 | 
			
		||||
          'maxp=p', only knn imputation is done
 | 
			
		||||
 | 
			
		||||
    seed: The seed used for the random number generator (default
 | 
			
		||||
          362436069) for reproducibility.
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    call:
 | 
			
		||||
    impute(data ,k = 10, rowmax = 0.5, colmax = 0.8, maxp = 1500, rng.seed=362436069)
 | 
			
		||||
    """
 | 
			
		||||
    
 | 
			
		||||
    rpy.r.library("impute")
 | 
			
		||||
    X = N.asarray(X) # cast as numeric array
 | 
			
		||||
    m, n = scipy.shape(X)
 | 
			
		||||
    if m>n:
 | 
			
		||||
        print "Warning (impute): more samples than variables. running transpose"
 | 
			
		||||
        t_flag = True
 | 
			
		||||
    else:
 | 
			
		||||
        X = N.transpose(X)
 | 
			
		||||
        t_flag = False
 | 
			
		||||
    
 | 
			
		||||
    rpy.r.assign("X", X)
 | 
			
		||||
    rpy.r.assign("k", k)
 | 
			
		||||
    rpy.r.assign("rmax", rowmax)
 | 
			
		||||
    rpy.r.assign("cmax", colmax)
 | 
			
		||||
    rpy.r.assign("maxp", maxp)
 | 
			
		||||
 | 
			
		||||
    call = "out<-impute.knn(X,k=k,rowmax=rmax,colmax=cmax,maxp=maxp)"
 | 
			
		||||
    silent_eval(call)
 | 
			
		||||
    out = rpy.r("out")
 | 
			
		||||
    if not t_flag:
 | 
			
		||||
        E = out['data']
 | 
			
		||||
        E = scipy.asarray(E)
 | 
			
		||||
        E = E.T
 | 
			
		||||
    else:
 | 
			
		||||
        E =  out['data']
 | 
			
		||||
        E = scipy.asarray(E)
 | 
			
		||||
    return E
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_chip_annotation(org="hgu133a",annot='pmid', id_type='loc',probelist=None):
 | 
			
		||||
    """Returns a dictionary of annoations.
 | 
			
		||||
 | 
			
		||||
    input:
 | 
			
		||||
             org  --  chip_type (see bioconductor.org)
 | 
			
		||||
             annot -- annotation ['genename', 'pmid', ' symbol']
 | 
			
		||||
             id_type -- id ['aff','loc']
 | 
			
		||||
             
 | 
			
		||||
    
 | 
			
		||||
    key: id, value = list of annoations
 | 
			
		||||
    example: '65884_at': ['15672394', '138402']
 | 
			
		||||
    """
 | 
			
		||||
    _valid_annot = ['genename', 'pmid', 'symbol', 'enzyme', 'chr', 'chrloc']
 | 
			
		||||
    if annot.lower() not in _valid_annot:
 | 
			
		||||
        raise ValueError("Annotation must be one of %s" %_valid_annot)
 | 
			
		||||
    silent_eval("library("+org+")")
 | 
			
		||||
    silent_eval("dummy<-as.list("+org+annot.upper()+")")
 | 
			
		||||
    silent_eval('annotations <- dummy[!is.na(dummy)]')
 | 
			
		||||
    aff2annot = rpy.r("annotations")
 | 
			
		||||
    if id_type=='loc':
 | 
			
		||||
        aff2loc, loc2aff = get_locusid(org=org)
 | 
			
		||||
        loc2annot = {}
 | 
			
		||||
        for geneid, annotation in aff2annot.items():
 | 
			
		||||
            annotation = ensure_list(annotation)
 | 
			
		||||
            print annotation
 | 
			
		||||
            if loc2annot.has_key(geneid):
 | 
			
		||||
                for extra in loc2annot[geneid]:
 | 
			
		||||
                    annotation.append(extra) 
 | 
			
		||||
                print "Found duplicate in gene: %s" %geneid
 | 
			
		||||
            loc2annot[aff2loc[geneid]] = annotation
 | 
			
		||||
        aff2annot = loc2annot
 | 
			
		||||
 | 
			
		||||
    out = {}
 | 
			
		||||
    if probelist:
 | 
			
		||||
        for pid in probelist:
 | 
			
		||||
            try:
 | 
			
		||||
                out[pid] = aff2annot.get(pid, 'none')
 | 
			
		||||
            except:
 | 
			
		||||
                print "Could not find id: %s" %pid
 | 
			
		||||
    else:
 | 
			
		||||
        out = aff2annot
 | 
			
		||||
            
 | 
			
		||||
    return out
 | 
			
		||||
 | 
			
		||||
def ensure_list(value):
 | 
			
		||||
    if isinstance(value, list):
 | 
			
		||||
        return value
 | 
			
		||||
    else:
 | 
			
		||||
        return [value]
 | 
			
		||||
		Reference in New Issue
	
	Block a user