"""A collection of functions that use R. Most functions use libraries from bioconductor depends on: (not updated) -- bioconductor min. install -- hgu133a -- hgu133plus2 """ import scipy import Numeric as N import rpy silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r) def get_locusid(probelist=None,org="hgu133a"): """Returns a dictionary of locus link id for each affy probeset and reverse mapping innput: [probelist] -- probelist of affy probesets [org] -- chip type (organism) out: aff2loc, loc2aff The mapping is one-to-one for affy->locus_id However, there are several affy probesets for one locus_id From bioc-mail-archive: BioC takes the GeneBank ids associated with the probes (provided by the manufacture) and then maps them to Entrez Gene ids using data from UniGene, Entrez Gene, and other available data sources we trust. The Entrez Gene id a probe is assigned to is determined by votes from all the sources used. If there is no agreement among the sources, we take the smallest Entrez Gene id. """ silent_eval("library("+org+")") silent_eval('locus_ids = as.list('+org+'LOCUSID)') silent_eval('pp<-as.list(locus_ids[!is.na(locus_ids)])') loc_ids = rpy.r("pp") for id in loc_ids: loc_ids[id] = str(loc_ids[id]) aff2loc = {} if probelist: for pid in probelist: try: aff2loc[pid]=loc_ids[pid] except: print "Affy probeset: %s has no locus id" %pid print "\nCONVERSION SUMMARY:\n \ Number of probesets input %s \n \ Number of translated locus ids: %s \n \ Number of missings: %s" %(len(probelist),len(aff2loc),len(probelist)-len(aff2loc)) else: aff2loc = loc_ids # reverse mapping loc2aff = {} for k,v in aff2loc.items(): if loc2aff.has_key(v): loc2aff[v].append(k) else: loc2aff[v]=[k] return aff2loc,loc2aff def get_kegg_paths(org="hgu133plus2",id_type='aff',probelist=None): """Returns a dictionary of KEGG maps. input: org -- chip_type (see bioconductor.org) id_type -- id ['aff','loc'] key: affy_id, value = list of kegg map id example: '65884_at': ['00510', '00513'] """ silent_eval("library("+org+")") silent_eval('xx<-as.list('+org+'PATH)') silent_eval('xp <- xx[!is.na(xx)]') aff2path = rpy.r("xp") dummy = rpy.r("xx") if id_type=='loc': aff2loc,loc2aff = get_locusid(org=org) loc2path = {} for id,path in aff2path.items(): if loc2path.has_key(id): pp = [path.append(i) for i in loc2path[id]] print "Found duplicate in path: %s" %path loc2path[aff2loc[id]]=path aff2path = loc2path out = {} if probelist: for pid in probelist: try: out[pid]=aff2path[pid] except: print "Could not find id: %s" %pid else: out = aff2path for k,v in out.items(): # if string convert tol list try: v + '' out[k] = [v] except: out[k] = v return out def get_probe_list(org="hgu133plus2"): rpy.r.library(org) silent_eval('probe_list<-ls('+org+'ACCNUM )') pl = rpy.r("probe_list") return pl def get_GO_from_aff(org="hgu133plus2",id_type='aff',probelist=None): """Returns a dictionary of GO terms. input: org -- chip_type (see bioconductor.org) id_type -- id ['aff','loc'] key: example: '65884_at': """ silent_eval("library("+org+")") silent_eval('xx<-as.list('+org+'GO)') silent_eval('xp <- xx[!is.na(xx)]') aff2path = rpy.r("xp") dummy = rpy.r("xx") if id_type=='loc': LOC = get_locusid(org=org) loc2path = {} for id,path in aff2path.items(): if loc2path.has_key(id): pp = [path.append(i) for i in loc2path[id]] print "Found duplicate in path: %s" %path loc2path[LOC[id]]=path aff2path = loc2path out = {} if probelist: for pid in probelist: try: out[pid]=aff2path[pid] except: print "Could not find id: %s" %pid return aff2path def get_kegg_as_category(org="hgu133plus2",id_type='aff',probelist=None): """Returns kegg pathway memberships in dummy (1/0) matrix (genes x maps) """ kegg = get_kegg_paths(org=org, id_type=id_type, probelist=probelist) maps = set() for kpth in kegg.values(): maps.update(kpth) n_maps = len(maps) n_genes = len(kegg) gene2index = dict(zip(kegg.keys(), range(n_genes))) map2index = dict(zip(maps, range(n_maps))) C = scipy.zeros((n_genes, n_maps)) for k,v in kegg.items(): for m in v: C[gene2index[k], map2index[m]]=1 return C, list(maps), kegg.keys() def impute(X, k=10, rowmax=0.5, colmax=0.8, maxp=1500, seed=362436069): """ A function to impute missing expression data, using nearest neighbor averaging. (from bioconductors impute) input: data: An expression matrix with genes in the rows, samples in the columns k: Number of neighbors to be used in the imputation (default=10) rowmax: The maximum percent missing data allowed in any row (default 50%). For any rows with more than 'rowmax'% missing are imputed using the overall mean per sample. colmax: The maximum percent missing data allowed in any column (default 80%). If any column has more than 'colmax'% missing data, the program halts and reports an error. maxp: The largest block of genes imputed using the knn algorithm inside 'impute.knn' (default 1500); larger blocks are divided by two-means clustering (recursively) prior to imputation. If 'maxp=p', only knn imputation is done seed: The seed used for the random number generator (default 362436069) for reproducibility. call: impute(data ,k = 10, rowmax = 0.5, colmax = 0.8, maxp = 1500, rng.seed=362436069) """ rpy.r.library("impute") X = N.asarray(X) # cast as numeric array m, n = scipy.shape(X) if m>n: print "Warning (impute): more samples than variables. running transpose" t_flag = True else: X = N.transpose(X) t_flag = False rpy.r.assign("X", X) rpy.r.assign("k", k) rpy.r.assign("rmax", rowmax) rpy.r.assign("cmax", colmax) rpy.r.assign("maxp", maxp) call = "out<-impute.knn(X,k=k,rowmax=rmax,colmax=cmax,maxp=maxp)" silent_eval(call) out = rpy.r("out") if not t_flag: E = out['data'] E = scipy.asarray(E) E = E.T else: E = out['data'] E = scipy.asarray(E) return E def get_chip_annotation(org="hgu133a",annot='pmid', id_type='loc',probelist=None): """Returns a dictionary of annoations. input: org -- chip_type (see bioconductor.org) annot -- annotation ['genename', 'pmid', ' symbol'] id_type -- id ['aff','loc'] key: id, value = list of annoations example: '65884_at': ['15672394', '138402'] """ _valid_annot = ['genename', 'pmid', 'symbol', 'enzyme', 'chr', 'chrloc'] if annot.lower() not in _valid_annot: raise ValueError("Annotation must be one of %s" %_valid_annot) silent_eval("library("+org+")") silent_eval("dummy<-as.list("+org+annot.upper()+")") silent_eval('annotations <- dummy[!is.na(dummy)]') aff2annot = rpy.r("annotations") if id_type=='loc': aff2loc, loc2aff = get_locusid(org=org) loc2annot = {} for geneid, annotation in aff2annot.items(): annotation = ensure_list(annotation) print annotation if loc2annot.has_key(geneid): for extra in loc2annot[geneid]: annotation.append(extra) print "Found duplicate in gene: %s" %geneid loc2annot[aff2loc[geneid]] = annotation aff2annot = loc2annot out = {} if probelist: for pid in probelist: try: out[pid] = aff2annot.get(pid, 'none') except: print "Could not find id: %s" %pid else: out = aff2annot return out def ensure_list(value): if isinstance(value, list): return value else: return [value]