some biocondoctur utilities
This commit is contained in:
parent
f1301be67a
commit
ebe7621817
284
fluents/lib/R_utils.py
Normal file
284
fluents/lib/R_utils.py
Normal file
@ -0,0 +1,284 @@
|
||||
"""A collection of functions that use R.
|
||||
|
||||
Most functions use libraries from bioconductor
|
||||
|
||||
depends on:
|
||||
(not updated)
|
||||
-- bioconductor min. install
|
||||
-- hgu133a
|
||||
-- hgu133plus2
|
||||
|
||||
"""
|
||||
|
||||
import scipy
|
||||
import Numeric as N
|
||||
import rpy
|
||||
silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
|
||||
|
||||
def get_locusid(probelist=None,org="hgu133a"):
|
||||
"""Returns a dictionary of locus link id for each affy probeset
|
||||
and reverse mapping
|
||||
|
||||
innput:
|
||||
[probelist] -- probelist of affy probesets
|
||||
[org] -- chip type (organism)
|
||||
|
||||
out:
|
||||
aff2loc, loc2aff
|
||||
|
||||
The mapping is one-to-one for affy->locus_id
|
||||
However, there are several affy probesets for one locus_id
|
||||
|
||||
From bioc-mail-archive: BioC takes the GeneBank ids associated
|
||||
with the probes (provided by the manufacture) and then maps them
|
||||
to Entrez Gene ids using data from UniGene, Entrez Gene, and other
|
||||
available data sources we trust. The Entrez Gene id a probe is
|
||||
assigned to is determined by votes from all the sources used. If
|
||||
there is no agreement among the sources, we take the smallest
|
||||
Entrez Gene id.
|
||||
"""
|
||||
silent_eval("library("+org+")")
|
||||
silent_eval('locus_ids = as.list('+org+'LOCUSID)')
|
||||
silent_eval('pp<-as.list(locus_ids[!is.na(locus_ids)])')
|
||||
loc_ids = rpy.r("pp")
|
||||
for id in loc_ids:
|
||||
loc_ids[id] = str(loc_ids[id])
|
||||
|
||||
aff2loc = {}
|
||||
if probelist:
|
||||
for pid in probelist:
|
||||
try:
|
||||
aff2loc[pid]=loc_ids[pid]
|
||||
except:
|
||||
print "Affy probeset: %s has no locus id" %pid
|
||||
print "\nCONVERSION SUMMARY:\n \
|
||||
Number of probesets input %s \n \
|
||||
Number of translated locus ids: %s \n \
|
||||
Number of missings: %s" %(len(probelist),len(aff2loc),len(probelist)-len(aff2loc))
|
||||
else:
|
||||
aff2loc = loc_ids
|
||||
# reverse mapping
|
||||
loc2aff = {}
|
||||
for k,v in aff2loc.items():
|
||||
if loc2aff.has_key(v):
|
||||
loc2aff[v].append(k)
|
||||
else:
|
||||
loc2aff[v]=[k]
|
||||
|
||||
return aff2loc,loc2aff
|
||||
|
||||
def get_kegg_paths(org="hgu133plus2",id_type='aff',probelist=None):
|
||||
"""Returns a dictionary of KEGG maps.
|
||||
|
||||
input:
|
||||
org -- chip_type (see bioconductor.org)
|
||||
id_type -- id ['aff','loc']
|
||||
|
||||
key: affy_id, value = list of kegg map id
|
||||
example: '65884_at': ['00510', '00513']
|
||||
"""
|
||||
silent_eval("library("+org+")")
|
||||
silent_eval('xx<-as.list('+org+'PATH)')
|
||||
silent_eval('xp <- xx[!is.na(xx)]')
|
||||
aff2path = rpy.r("xp")
|
||||
dummy = rpy.r("xx")
|
||||
|
||||
if id_type=='loc':
|
||||
aff2loc,loc2aff = get_locusid(org=org)
|
||||
loc2path = {}
|
||||
for id,path in aff2path.items():
|
||||
if loc2path.has_key(id):
|
||||
pp = [path.append(i) for i in loc2path[id]]
|
||||
print "Found duplicate in path: %s" %path
|
||||
loc2path[aff2loc[id]]=path
|
||||
aff2path = loc2path
|
||||
out = {}
|
||||
|
||||
if probelist:
|
||||
for pid in probelist:
|
||||
try:
|
||||
out[pid]=aff2path[pid]
|
||||
except:
|
||||
print "Could not find id: %s" %pid
|
||||
else:
|
||||
out = aff2path
|
||||
for k,v in out.items():
|
||||
# if string convert tol list
|
||||
try:
|
||||
v + ''
|
||||
out[k] = [v]
|
||||
except:
|
||||
out[k] = v
|
||||
|
||||
return out
|
||||
|
||||
def get_probe_list(org="hgu133plus2"):
|
||||
rpy.r.library(org)
|
||||
silent_eval('probe_list<-ls('+org+'ACCNUM )')
|
||||
pl = rpy.r("probe_list")
|
||||
return pl
|
||||
|
||||
def get_GO_from_aff(org="hgu133plus2",id_type='aff',probelist=None):
|
||||
"""Returns a dictionary of GO terms.
|
||||
|
||||
input:
|
||||
org -- chip_type (see bioconductor.org)
|
||||
id_type -- id ['aff','loc']
|
||||
|
||||
key:
|
||||
example: '65884_at':
|
||||
"""
|
||||
silent_eval("library("+org+")")
|
||||
silent_eval('xx<-as.list('+org+'GO)')
|
||||
silent_eval('xp <- xx[!is.na(xx)]')
|
||||
aff2path = rpy.r("xp")
|
||||
dummy = rpy.r("xx")
|
||||
if id_type=='loc':
|
||||
LOC = get_locusid(org=org)
|
||||
loc2path = {}
|
||||
for id,path in aff2path.items():
|
||||
if loc2path.has_key(id):
|
||||
pp = [path.append(i) for i in loc2path[id]]
|
||||
print "Found duplicate in path: %s" %path
|
||||
loc2path[LOC[id]]=path
|
||||
aff2path = loc2path
|
||||
out = {}
|
||||
if probelist:
|
||||
for pid in probelist:
|
||||
try:
|
||||
out[pid]=aff2path[pid]
|
||||
except:
|
||||
print "Could not find id: %s" %pid
|
||||
return aff2path
|
||||
|
||||
def get_kegg_as_category(org="hgu133plus2",id_type='aff',probelist=None):
|
||||
"""Returns kegg pathway memberships in dummy (1/0) matrix (genes x maps)
|
||||
|
||||
"""
|
||||
kegg = get_kegg_paths(org=org, id_type=id_type, probelist=probelist)
|
||||
maps = set()
|
||||
for kpth in kegg.values():
|
||||
maps.update(kpth)
|
||||
|
||||
n_maps = len(maps)
|
||||
n_genes = len(kegg)
|
||||
gene2index = dict(zip(kegg.keys(), range(n_genes)))
|
||||
map2index = dict(zip(maps, range(n_maps)))
|
||||
C = scipy.zeros((n_genes, n_maps))
|
||||
for k,v in kegg.items():
|
||||
for m in v:
|
||||
C[gene2index[k], map2index[m]]=1
|
||||
|
||||
return C, list(maps), kegg.keys()
|
||||
|
||||
def impute(X, k=10, rowmax=0.5, colmax=0.8, maxp=1500, seed=362436069):
|
||||
"""
|
||||
A function to impute missing expression data, using nearest
|
||||
neighbor averaging. (from bioconductors impute)
|
||||
|
||||
input:
|
||||
|
||||
data: An expression matrix with genes in the rows, samples in the
|
||||
columns
|
||||
|
||||
k: Number of neighbors to be used in the imputation (default=10)
|
||||
|
||||
rowmax: The maximum percent missing data allowed in any row (default
|
||||
50%). For any rows with more than 'rowmax'% missing are
|
||||
imputed using the overall mean per sample.
|
||||
|
||||
colmax: The maximum percent missing data allowed in any column
|
||||
(default 80%). If any column has more than 'colmax'% missing
|
||||
data, the program halts and reports an error.
|
||||
|
||||
maxp: The largest block of genes imputed using the knn algorithm
|
||||
inside 'impute.knn' (default 1500); larger blocks are divided
|
||||
by two-means clustering (recursively) prior to imputation. If
|
||||
'maxp=p', only knn imputation is done
|
||||
|
||||
seed: The seed used for the random number generator (default
|
||||
362436069) for reproducibility.
|
||||
|
||||
|
||||
call:
|
||||
impute(data ,k = 10, rowmax = 0.5, colmax = 0.8, maxp = 1500, rng.seed=362436069)
|
||||
"""
|
||||
|
||||
rpy.r.library("impute")
|
||||
X = N.asarray(X) # cast as numeric array
|
||||
m, n = scipy.shape(X)
|
||||
if m>n:
|
||||
print "Warning (impute): more samples than variables. running transpose"
|
||||
t_flag = True
|
||||
else:
|
||||
X = N.transpose(X)
|
||||
t_flag = False
|
||||
|
||||
rpy.r.assign("X", X)
|
||||
rpy.r.assign("k", k)
|
||||
rpy.r.assign("rmax", rowmax)
|
||||
rpy.r.assign("cmax", colmax)
|
||||
rpy.r.assign("maxp", maxp)
|
||||
|
||||
call = "out<-impute.knn(X,k=k,rowmax=rmax,colmax=cmax,maxp=maxp)"
|
||||
silent_eval(call)
|
||||
out = rpy.r("out")
|
||||
if not t_flag:
|
||||
E = out['data']
|
||||
E = scipy.asarray(E)
|
||||
E = E.T
|
||||
else:
|
||||
E = out['data']
|
||||
E = scipy.asarray(E)
|
||||
return E
|
||||
|
||||
|
||||
def get_chip_annotation(org="hgu133a",annot='pmid', id_type='loc',probelist=None):
|
||||
"""Returns a dictionary of annoations.
|
||||
|
||||
input:
|
||||
org -- chip_type (see bioconductor.org)
|
||||
annot -- annotation ['genename', 'pmid', ' symbol']
|
||||
id_type -- id ['aff','loc']
|
||||
|
||||
|
||||
key: id, value = list of annoations
|
||||
example: '65884_at': ['15672394', '138402']
|
||||
"""
|
||||
_valid_annot = ['genename', 'pmid', 'symbol', 'enzyme', 'chr', 'chrloc']
|
||||
if annot.lower() not in _valid_annot:
|
||||
raise ValueError("Annotation must be one of %s" %_valid_annot)
|
||||
silent_eval("library("+org+")")
|
||||
silent_eval("dummy<-as.list("+org+annot.upper()+")")
|
||||
silent_eval('annotations <- dummy[!is.na(dummy)]')
|
||||
aff2annot = rpy.r("annotations")
|
||||
if id_type=='loc':
|
||||
aff2loc, loc2aff = get_locusid(org=org)
|
||||
loc2annot = {}
|
||||
for geneid, annotation in aff2annot.items():
|
||||
annotation = ensure_list(annotation)
|
||||
print annotation
|
||||
if loc2annot.has_key(geneid):
|
||||
for extra in loc2annot[geneid]:
|
||||
annotation.append(extra)
|
||||
print "Found duplicate in gene: %s" %geneid
|
||||
loc2annot[aff2loc[geneid]] = annotation
|
||||
aff2annot = loc2annot
|
||||
|
||||
out = {}
|
||||
if probelist:
|
||||
for pid in probelist:
|
||||
try:
|
||||
out[pid] = aff2annot.get(pid, 'none')
|
||||
except:
|
||||
print "Could not find id: %s" %pid
|
||||
else:
|
||||
out = aff2annot
|
||||
|
||||
return out
|
||||
|
||||
def ensure_list(value):
|
||||
if isinstance(value, list):
|
||||
return value
|
||||
else:
|
||||
return [value]
|
Reference in New Issue
Block a user