some biocondoctur utilities

This commit is contained in:
Arnar Flatberg 2007-05-06 13:51:06 +00:00
parent f1301be67a
commit ebe7621817

284
fluents/lib/R_utils.py Normal file
View File

@ -0,0 +1,284 @@
"""A collection of functions that use R.
Most functions use libraries from bioconductor
depends on:
(not updated)
-- bioconductor min. install
-- hgu133a
-- hgu133plus2
"""
import scipy
import Numeric as N
import rpy
silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
def get_locusid(probelist=None,org="hgu133a"):
"""Returns a dictionary of locus link id for each affy probeset
and reverse mapping
innput:
[probelist] -- probelist of affy probesets
[org] -- chip type (organism)
out:
aff2loc, loc2aff
The mapping is one-to-one for affy->locus_id
However, there are several affy probesets for one locus_id
From bioc-mail-archive: BioC takes the GeneBank ids associated
with the probes (provided by the manufacture) and then maps them
to Entrez Gene ids using data from UniGene, Entrez Gene, and other
available data sources we trust. The Entrez Gene id a probe is
assigned to is determined by votes from all the sources used. If
there is no agreement among the sources, we take the smallest
Entrez Gene id.
"""
silent_eval("library("+org+")")
silent_eval('locus_ids = as.list('+org+'LOCUSID)')
silent_eval('pp<-as.list(locus_ids[!is.na(locus_ids)])')
loc_ids = rpy.r("pp")
for id in loc_ids:
loc_ids[id] = str(loc_ids[id])
aff2loc = {}
if probelist:
for pid in probelist:
try:
aff2loc[pid]=loc_ids[pid]
except:
print "Affy probeset: %s has no locus id" %pid
print "\nCONVERSION SUMMARY:\n \
Number of probesets input %s \n \
Number of translated locus ids: %s \n \
Number of missings: %s" %(len(probelist),len(aff2loc),len(probelist)-len(aff2loc))
else:
aff2loc = loc_ids
# reverse mapping
loc2aff = {}
for k,v in aff2loc.items():
if loc2aff.has_key(v):
loc2aff[v].append(k)
else:
loc2aff[v]=[k]
return aff2loc,loc2aff
def get_kegg_paths(org="hgu133plus2",id_type='aff',probelist=None):
"""Returns a dictionary of KEGG maps.
input:
org -- chip_type (see bioconductor.org)
id_type -- id ['aff','loc']
key: affy_id, value = list of kegg map id
example: '65884_at': ['00510', '00513']
"""
silent_eval("library("+org+")")
silent_eval('xx<-as.list('+org+'PATH)')
silent_eval('xp <- xx[!is.na(xx)]')
aff2path = rpy.r("xp")
dummy = rpy.r("xx")
if id_type=='loc':
aff2loc,loc2aff = get_locusid(org=org)
loc2path = {}
for id,path in aff2path.items():
if loc2path.has_key(id):
pp = [path.append(i) for i in loc2path[id]]
print "Found duplicate in path: %s" %path
loc2path[aff2loc[id]]=path
aff2path = loc2path
out = {}
if probelist:
for pid in probelist:
try:
out[pid]=aff2path[pid]
except:
print "Could not find id: %s" %pid
else:
out = aff2path
for k,v in out.items():
# if string convert tol list
try:
v + ''
out[k] = [v]
except:
out[k] = v
return out
def get_probe_list(org="hgu133plus2"):
rpy.r.library(org)
silent_eval('probe_list<-ls('+org+'ACCNUM )')
pl = rpy.r("probe_list")
return pl
def get_GO_from_aff(org="hgu133plus2",id_type='aff',probelist=None):
"""Returns a dictionary of GO terms.
input:
org -- chip_type (see bioconductor.org)
id_type -- id ['aff','loc']
key:
example: '65884_at':
"""
silent_eval("library("+org+")")
silent_eval('xx<-as.list('+org+'GO)')
silent_eval('xp <- xx[!is.na(xx)]')
aff2path = rpy.r("xp")
dummy = rpy.r("xx")
if id_type=='loc':
LOC = get_locusid(org=org)
loc2path = {}
for id,path in aff2path.items():
if loc2path.has_key(id):
pp = [path.append(i) for i in loc2path[id]]
print "Found duplicate in path: %s" %path
loc2path[LOC[id]]=path
aff2path = loc2path
out = {}
if probelist:
for pid in probelist:
try:
out[pid]=aff2path[pid]
except:
print "Could not find id: %s" %pid
return aff2path
def get_kegg_as_category(org="hgu133plus2",id_type='aff',probelist=None):
"""Returns kegg pathway memberships in dummy (1/0) matrix (genes x maps)
"""
kegg = get_kegg_paths(org=org, id_type=id_type, probelist=probelist)
maps = set()
for kpth in kegg.values():
maps.update(kpth)
n_maps = len(maps)
n_genes = len(kegg)
gene2index = dict(zip(kegg.keys(), range(n_genes)))
map2index = dict(zip(maps, range(n_maps)))
C = scipy.zeros((n_genes, n_maps))
for k,v in kegg.items():
for m in v:
C[gene2index[k], map2index[m]]=1
return C, list(maps), kegg.keys()
def impute(X, k=10, rowmax=0.5, colmax=0.8, maxp=1500, seed=362436069):
"""
A function to impute missing expression data, using nearest
neighbor averaging. (from bioconductors impute)
input:
data: An expression matrix with genes in the rows, samples in the
columns
k: Number of neighbors to be used in the imputation (default=10)
rowmax: The maximum percent missing data allowed in any row (default
50%). For any rows with more than 'rowmax'% missing are
imputed using the overall mean per sample.
colmax: The maximum percent missing data allowed in any column
(default 80%). If any column has more than 'colmax'% missing
data, the program halts and reports an error.
maxp: The largest block of genes imputed using the knn algorithm
inside 'impute.knn' (default 1500); larger blocks are divided
by two-means clustering (recursively) prior to imputation. If
'maxp=p', only knn imputation is done
seed: The seed used for the random number generator (default
362436069) for reproducibility.
call:
impute(data ,k = 10, rowmax = 0.5, colmax = 0.8, maxp = 1500, rng.seed=362436069)
"""
rpy.r.library("impute")
X = N.asarray(X) # cast as numeric array
m, n = scipy.shape(X)
if m>n:
print "Warning (impute): more samples than variables. running transpose"
t_flag = True
else:
X = N.transpose(X)
t_flag = False
rpy.r.assign("X", X)
rpy.r.assign("k", k)
rpy.r.assign("rmax", rowmax)
rpy.r.assign("cmax", colmax)
rpy.r.assign("maxp", maxp)
call = "out<-impute.knn(X,k=k,rowmax=rmax,colmax=cmax,maxp=maxp)"
silent_eval(call)
out = rpy.r("out")
if not t_flag:
E = out['data']
E = scipy.asarray(E)
E = E.T
else:
E = out['data']
E = scipy.asarray(E)
return E
def get_chip_annotation(org="hgu133a",annot='pmid', id_type='loc',probelist=None):
"""Returns a dictionary of annoations.
input:
org -- chip_type (see bioconductor.org)
annot -- annotation ['genename', 'pmid', ' symbol']
id_type -- id ['aff','loc']
key: id, value = list of annoations
example: '65884_at': ['15672394', '138402']
"""
_valid_annot = ['genename', 'pmid', 'symbol', 'enzyme', 'chr', 'chrloc']
if annot.lower() not in _valid_annot:
raise ValueError("Annotation must be one of %s" %_valid_annot)
silent_eval("library("+org+")")
silent_eval("dummy<-as.list("+org+annot.upper()+")")
silent_eval('annotations <- dummy[!is.na(dummy)]')
aff2annot = rpy.r("annotations")
if id_type=='loc':
aff2loc, loc2aff = get_locusid(org=org)
loc2annot = {}
for geneid, annotation in aff2annot.items():
annotation = ensure_list(annotation)
print annotation
if loc2annot.has_key(geneid):
for extra in loc2annot[geneid]:
annotation.append(extra)
print "Found duplicate in gene: %s" %geneid
loc2annot[aff2loc[geneid]] = annotation
aff2annot = loc2annot
out = {}
if probelist:
for pid in probelist:
try:
out[pid] = aff2annot.get(pid, 'none')
except:
print "Could not find id: %s" %pid
else:
out = aff2annot
return out
def ensure_list(value):
if isinstance(value, list):
return value
else:
return [value]