285 lines
8.5 KiB
Python
285 lines
8.5 KiB
Python
|
"""A collection of functions that use R.
|
||
|
|
||
|
Most functions use libraries from bioconductor
|
||
|
|
||
|
depends on:
|
||
|
(not updated)
|
||
|
-- bioconductor min. install
|
||
|
-- hgu133a
|
||
|
-- hgu133plus2
|
||
|
|
||
|
"""
|
||
|
|
||
|
import scipy
|
||
|
import Numeric as N
|
||
|
import rpy
|
||
|
silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
|
||
|
|
||
|
def get_locusid(probelist=None,org="hgu133a"):
|
||
|
"""Returns a dictionary of locus link id for each affy probeset
|
||
|
and reverse mapping
|
||
|
|
||
|
innput:
|
||
|
[probelist] -- probelist of affy probesets
|
||
|
[org] -- chip type (organism)
|
||
|
|
||
|
out:
|
||
|
aff2loc, loc2aff
|
||
|
|
||
|
The mapping is one-to-one for affy->locus_id
|
||
|
However, there are several affy probesets for one locus_id
|
||
|
|
||
|
From bioc-mail-archive: BioC takes the GeneBank ids associated
|
||
|
with the probes (provided by the manufacture) and then maps them
|
||
|
to Entrez Gene ids using data from UniGene, Entrez Gene, and other
|
||
|
available data sources we trust. The Entrez Gene id a probe is
|
||
|
assigned to is determined by votes from all the sources used. If
|
||
|
there is no agreement among the sources, we take the smallest
|
||
|
Entrez Gene id.
|
||
|
"""
|
||
|
silent_eval("library("+org+")")
|
||
|
silent_eval('locus_ids = as.list('+org+'LOCUSID)')
|
||
|
silent_eval('pp<-as.list(locus_ids[!is.na(locus_ids)])')
|
||
|
loc_ids = rpy.r("pp")
|
||
|
for id in loc_ids:
|
||
|
loc_ids[id] = str(loc_ids[id])
|
||
|
|
||
|
aff2loc = {}
|
||
|
if probelist:
|
||
|
for pid in probelist:
|
||
|
try:
|
||
|
aff2loc[pid]=loc_ids[pid]
|
||
|
except:
|
||
|
print "Affy probeset: %s has no locus id" %pid
|
||
|
print "\nCONVERSION SUMMARY:\n \
|
||
|
Number of probesets input %s \n \
|
||
|
Number of translated locus ids: %s \n \
|
||
|
Number of missings: %s" %(len(probelist),len(aff2loc),len(probelist)-len(aff2loc))
|
||
|
else:
|
||
|
aff2loc = loc_ids
|
||
|
# reverse mapping
|
||
|
loc2aff = {}
|
||
|
for k,v in aff2loc.items():
|
||
|
if loc2aff.has_key(v):
|
||
|
loc2aff[v].append(k)
|
||
|
else:
|
||
|
loc2aff[v]=[k]
|
||
|
|
||
|
return aff2loc,loc2aff
|
||
|
|
||
|
def get_kegg_paths(org="hgu133plus2",id_type='aff',probelist=None):
|
||
|
"""Returns a dictionary of KEGG maps.
|
||
|
|
||
|
input:
|
||
|
org -- chip_type (see bioconductor.org)
|
||
|
id_type -- id ['aff','loc']
|
||
|
|
||
|
key: affy_id, value = list of kegg map id
|
||
|
example: '65884_at': ['00510', '00513']
|
||
|
"""
|
||
|
silent_eval("library("+org+")")
|
||
|
silent_eval('xx<-as.list('+org+'PATH)')
|
||
|
silent_eval('xp <- xx[!is.na(xx)]')
|
||
|
aff2path = rpy.r("xp")
|
||
|
dummy = rpy.r("xx")
|
||
|
|
||
|
if id_type=='loc':
|
||
|
aff2loc,loc2aff = get_locusid(org=org)
|
||
|
loc2path = {}
|
||
|
for id,path in aff2path.items():
|
||
|
if loc2path.has_key(id):
|
||
|
pp = [path.append(i) for i in loc2path[id]]
|
||
|
print "Found duplicate in path: %s" %path
|
||
|
loc2path[aff2loc[id]]=path
|
||
|
aff2path = loc2path
|
||
|
out = {}
|
||
|
|
||
|
if probelist:
|
||
|
for pid in probelist:
|
||
|
try:
|
||
|
out[pid]=aff2path[pid]
|
||
|
except:
|
||
|
print "Could not find id: %s" %pid
|
||
|
else:
|
||
|
out = aff2path
|
||
|
for k,v in out.items():
|
||
|
# if string convert tol list
|
||
|
try:
|
||
|
v + ''
|
||
|
out[k] = [v]
|
||
|
except:
|
||
|
out[k] = v
|
||
|
|
||
|
return out
|
||
|
|
||
|
def get_probe_list(org="hgu133plus2"):
|
||
|
rpy.r.library(org)
|
||
|
silent_eval('probe_list<-ls('+org+'ACCNUM )')
|
||
|
pl = rpy.r("probe_list")
|
||
|
return pl
|
||
|
|
||
|
def get_GO_from_aff(org="hgu133plus2",id_type='aff',probelist=None):
|
||
|
"""Returns a dictionary of GO terms.
|
||
|
|
||
|
input:
|
||
|
org -- chip_type (see bioconductor.org)
|
||
|
id_type -- id ['aff','loc']
|
||
|
|
||
|
key:
|
||
|
example: '65884_at':
|
||
|
"""
|
||
|
silent_eval("library("+org+")")
|
||
|
silent_eval('xx<-as.list('+org+'GO)')
|
||
|
silent_eval('xp <- xx[!is.na(xx)]')
|
||
|
aff2path = rpy.r("xp")
|
||
|
dummy = rpy.r("xx")
|
||
|
if id_type=='loc':
|
||
|
LOC = get_locusid(org=org)
|
||
|
loc2path = {}
|
||
|
for id,path in aff2path.items():
|
||
|
if loc2path.has_key(id):
|
||
|
pp = [path.append(i) for i in loc2path[id]]
|
||
|
print "Found duplicate in path: %s" %path
|
||
|
loc2path[LOC[id]]=path
|
||
|
aff2path = loc2path
|
||
|
out = {}
|
||
|
if probelist:
|
||
|
for pid in probelist:
|
||
|
try:
|
||
|
out[pid]=aff2path[pid]
|
||
|
except:
|
||
|
print "Could not find id: %s" %pid
|
||
|
return aff2path
|
||
|
|
||
|
def get_kegg_as_category(org="hgu133plus2",id_type='aff',probelist=None):
|
||
|
"""Returns kegg pathway memberships in dummy (1/0) matrix (genes x maps)
|
||
|
|
||
|
"""
|
||
|
kegg = get_kegg_paths(org=org, id_type=id_type, probelist=probelist)
|
||
|
maps = set()
|
||
|
for kpth in kegg.values():
|
||
|
maps.update(kpth)
|
||
|
|
||
|
n_maps = len(maps)
|
||
|
n_genes = len(kegg)
|
||
|
gene2index = dict(zip(kegg.keys(), range(n_genes)))
|
||
|
map2index = dict(zip(maps, range(n_maps)))
|
||
|
C = scipy.zeros((n_genes, n_maps))
|
||
|
for k,v in kegg.items():
|
||
|
for m in v:
|
||
|
C[gene2index[k], map2index[m]]=1
|
||
|
|
||
|
return C, list(maps), kegg.keys()
|
||
|
|
||
|
def impute(X, k=10, rowmax=0.5, colmax=0.8, maxp=1500, seed=362436069):
|
||
|
"""
|
||
|
A function to impute missing expression data, using nearest
|
||
|
neighbor averaging. (from bioconductors impute)
|
||
|
|
||
|
input:
|
||
|
|
||
|
data: An expression matrix with genes in the rows, samples in the
|
||
|
columns
|
||
|
|
||
|
k: Number of neighbors to be used in the imputation (default=10)
|
||
|
|
||
|
rowmax: The maximum percent missing data allowed in any row (default
|
||
|
50%). For any rows with more than 'rowmax'% missing are
|
||
|
imputed using the overall mean per sample.
|
||
|
|
||
|
colmax: The maximum percent missing data allowed in any column
|
||
|
(default 80%). If any column has more than 'colmax'% missing
|
||
|
data, the program halts and reports an error.
|
||
|
|
||
|
maxp: The largest block of genes imputed using the knn algorithm
|
||
|
inside 'impute.knn' (default 1500); larger blocks are divided
|
||
|
by two-means clustering (recursively) prior to imputation. If
|
||
|
'maxp=p', only knn imputation is done
|
||
|
|
||
|
seed: The seed used for the random number generator (default
|
||
|
362436069) for reproducibility.
|
||
|
|
||
|
|
||
|
call:
|
||
|
impute(data ,k = 10, rowmax = 0.5, colmax = 0.8, maxp = 1500, rng.seed=362436069)
|
||
|
"""
|
||
|
|
||
|
rpy.r.library("impute")
|
||
|
X = N.asarray(X) # cast as numeric array
|
||
|
m, n = scipy.shape(X)
|
||
|
if m>n:
|
||
|
print "Warning (impute): more samples than variables. running transpose"
|
||
|
t_flag = True
|
||
|
else:
|
||
|
X = N.transpose(X)
|
||
|
t_flag = False
|
||
|
|
||
|
rpy.r.assign("X", X)
|
||
|
rpy.r.assign("k", k)
|
||
|
rpy.r.assign("rmax", rowmax)
|
||
|
rpy.r.assign("cmax", colmax)
|
||
|
rpy.r.assign("maxp", maxp)
|
||
|
|
||
|
call = "out<-impute.knn(X,k=k,rowmax=rmax,colmax=cmax,maxp=maxp)"
|
||
|
silent_eval(call)
|
||
|
out = rpy.r("out")
|
||
|
if not t_flag:
|
||
|
E = out['data']
|
||
|
E = scipy.asarray(E)
|
||
|
E = E.T
|
||
|
else:
|
||
|
E = out['data']
|
||
|
E = scipy.asarray(E)
|
||
|
return E
|
||
|
|
||
|
|
||
|
def get_chip_annotation(org="hgu133a",annot='pmid', id_type='loc',probelist=None):
|
||
|
"""Returns a dictionary of annoations.
|
||
|
|
||
|
input:
|
||
|
org -- chip_type (see bioconductor.org)
|
||
|
annot -- annotation ['genename', 'pmid', ' symbol']
|
||
|
id_type -- id ['aff','loc']
|
||
|
|
||
|
|
||
|
key: id, value = list of annoations
|
||
|
example: '65884_at': ['15672394', '138402']
|
||
|
"""
|
||
|
_valid_annot = ['genename', 'pmid', 'symbol', 'enzyme', 'chr', 'chrloc']
|
||
|
if annot.lower() not in _valid_annot:
|
||
|
raise ValueError("Annotation must be one of %s" %_valid_annot)
|
||
|
silent_eval("library("+org+")")
|
||
|
silent_eval("dummy<-as.list("+org+annot.upper()+")")
|
||
|
silent_eval('annotations <- dummy[!is.na(dummy)]')
|
||
|
aff2annot = rpy.r("annotations")
|
||
|
if id_type=='loc':
|
||
|
aff2loc, loc2aff = get_locusid(org=org)
|
||
|
loc2annot = {}
|
||
|
for geneid, annotation in aff2annot.items():
|
||
|
annotation = ensure_list(annotation)
|
||
|
print annotation
|
||
|
if loc2annot.has_key(geneid):
|
||
|
for extra in loc2annot[geneid]:
|
||
|
annotation.append(extra)
|
||
|
print "Found duplicate in gene: %s" %geneid
|
||
|
loc2annot[aff2loc[geneid]] = annotation
|
||
|
aff2annot = loc2annot
|
||
|
|
||
|
out = {}
|
||
|
if probelist:
|
||
|
for pid in probelist:
|
||
|
try:
|
||
|
out[pid] = aff2annot.get(pid, 'none')
|
||
|
except:
|
||
|
print "Could not find id: %s" %pid
|
||
|
else:
|
||
|
out = aff2annot
|
||
|
|
||
|
return out
|
||
|
|
||
|
def ensure_list(value):
|
||
|
if isinstance(value, list):
|
||
|
return value
|
||
|
else:
|
||
|
return [value]
|