diff --git a/fluents/lib/R_utils.py b/fluents/lib/R_utils.py new file mode 100644 index 0000000..9c7d736 --- /dev/null +++ b/fluents/lib/R_utils.py @@ -0,0 +1,284 @@ +"""A collection of functions that use R. + +Most functions use libraries from bioconductor + +depends on: +(not updated) +-- bioconductor min. install +-- hgu133a +-- hgu133plus2 + +""" + +import scipy +import Numeric as N +import rpy +silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r) + +def get_locusid(probelist=None,org="hgu133a"): + """Returns a dictionary of locus link id for each affy probeset + and reverse mapping + + innput: + [probelist] -- probelist of affy probesets + [org] -- chip type (organism) + + out: + aff2loc, loc2aff + + The mapping is one-to-one for affy->locus_id + However, there are several affy probesets for one locus_id + + From bioc-mail-archive: BioC takes the GeneBank ids associated + with the probes (provided by the manufacture) and then maps them + to Entrez Gene ids using data from UniGene, Entrez Gene, and other + available data sources we trust. The Entrez Gene id a probe is + assigned to is determined by votes from all the sources used. If + there is no agreement among the sources, we take the smallest + Entrez Gene id. + """ + silent_eval("library("+org+")") + silent_eval('locus_ids = as.list('+org+'LOCUSID)') + silent_eval('pp<-as.list(locus_ids[!is.na(locus_ids)])') + loc_ids = rpy.r("pp") + for id in loc_ids: + loc_ids[id] = str(loc_ids[id]) + + aff2loc = {} + if probelist: + for pid in probelist: + try: + aff2loc[pid]=loc_ids[pid] + except: + print "Affy probeset: %s has no locus id" %pid + print "\nCONVERSION SUMMARY:\n \ + Number of probesets input %s \n \ + Number of translated locus ids: %s \n \ + Number of missings: %s" %(len(probelist),len(aff2loc),len(probelist)-len(aff2loc)) + else: + aff2loc = loc_ids + # reverse mapping + loc2aff = {} + for k,v in aff2loc.items(): + if loc2aff.has_key(v): + loc2aff[v].append(k) + else: + loc2aff[v]=[k] + + return aff2loc,loc2aff + +def get_kegg_paths(org="hgu133plus2",id_type='aff',probelist=None): + """Returns a dictionary of KEGG maps. + + input: + org -- chip_type (see bioconductor.org) + id_type -- id ['aff','loc'] + + key: affy_id, value = list of kegg map id + example: '65884_at': ['00510', '00513'] + """ + silent_eval("library("+org+")") + silent_eval('xx<-as.list('+org+'PATH)') + silent_eval('xp <- xx[!is.na(xx)]') + aff2path = rpy.r("xp") + dummy = rpy.r("xx") + + if id_type=='loc': + aff2loc,loc2aff = get_locusid(org=org) + loc2path = {} + for id,path in aff2path.items(): + if loc2path.has_key(id): + pp = [path.append(i) for i in loc2path[id]] + print "Found duplicate in path: %s" %path + loc2path[aff2loc[id]]=path + aff2path = loc2path + out = {} + + if probelist: + for pid in probelist: + try: + out[pid]=aff2path[pid] + except: + print "Could not find id: %s" %pid + else: + out = aff2path + for k,v in out.items(): + # if string convert tol list + try: + v + '' + out[k] = [v] + except: + out[k] = v + + return out + +def get_probe_list(org="hgu133plus2"): + rpy.r.library(org) + silent_eval('probe_list<-ls('+org+'ACCNUM )') + pl = rpy.r("probe_list") + return pl + +def get_GO_from_aff(org="hgu133plus2",id_type='aff',probelist=None): + """Returns a dictionary of GO terms. + + input: + org -- chip_type (see bioconductor.org) + id_type -- id ['aff','loc'] + + key: + example: '65884_at': + """ + silent_eval("library("+org+")") + silent_eval('xx<-as.list('+org+'GO)') + silent_eval('xp <- xx[!is.na(xx)]') + aff2path = rpy.r("xp") + dummy = rpy.r("xx") + if id_type=='loc': + LOC = get_locusid(org=org) + loc2path = {} + for id,path in aff2path.items(): + if loc2path.has_key(id): + pp = [path.append(i) for i in loc2path[id]] + print "Found duplicate in path: %s" %path + loc2path[LOC[id]]=path + aff2path = loc2path + out = {} + if probelist: + for pid in probelist: + try: + out[pid]=aff2path[pid] + except: + print "Could not find id: %s" %pid + return aff2path + +def get_kegg_as_category(org="hgu133plus2",id_type='aff',probelist=None): + """Returns kegg pathway memberships in dummy (1/0) matrix (genes x maps) + + """ + kegg = get_kegg_paths(org=org, id_type=id_type, probelist=probelist) + maps = set() + for kpth in kegg.values(): + maps.update(kpth) + + n_maps = len(maps) + n_genes = len(kegg) + gene2index = dict(zip(kegg.keys(), range(n_genes))) + map2index = dict(zip(maps, range(n_maps))) + C = scipy.zeros((n_genes, n_maps)) + for k,v in kegg.items(): + for m in v: + C[gene2index[k], map2index[m]]=1 + + return C, list(maps), kegg.keys() + +def impute(X, k=10, rowmax=0.5, colmax=0.8, maxp=1500, seed=362436069): + """ + A function to impute missing expression data, using nearest + neighbor averaging. (from bioconductors impute) + + input: + + data: An expression matrix with genes in the rows, samples in the + columns + + k: Number of neighbors to be used in the imputation (default=10) + + rowmax: The maximum percent missing data allowed in any row (default + 50%). For any rows with more than 'rowmax'% missing are + imputed using the overall mean per sample. + + colmax: The maximum percent missing data allowed in any column + (default 80%). If any column has more than 'colmax'% missing + data, the program halts and reports an error. + + maxp: The largest block of genes imputed using the knn algorithm + inside 'impute.knn' (default 1500); larger blocks are divided + by two-means clustering (recursively) prior to imputation. If + 'maxp=p', only knn imputation is done + + seed: The seed used for the random number generator (default + 362436069) for reproducibility. + + + call: + impute(data ,k = 10, rowmax = 0.5, colmax = 0.8, maxp = 1500, rng.seed=362436069) + """ + + rpy.r.library("impute") + X = N.asarray(X) # cast as numeric array + m, n = scipy.shape(X) + if m>n: + print "Warning (impute): more samples than variables. running transpose" + t_flag = True + else: + X = N.transpose(X) + t_flag = False + + rpy.r.assign("X", X) + rpy.r.assign("k", k) + rpy.r.assign("rmax", rowmax) + rpy.r.assign("cmax", colmax) + rpy.r.assign("maxp", maxp) + + call = "out<-impute.knn(X,k=k,rowmax=rmax,colmax=cmax,maxp=maxp)" + silent_eval(call) + out = rpy.r("out") + if not t_flag: + E = out['data'] + E = scipy.asarray(E) + E = E.T + else: + E = out['data'] + E = scipy.asarray(E) + return E + + +def get_chip_annotation(org="hgu133a",annot='pmid', id_type='loc',probelist=None): + """Returns a dictionary of annoations. + + input: + org -- chip_type (see bioconductor.org) + annot -- annotation ['genename', 'pmid', ' symbol'] + id_type -- id ['aff','loc'] + + + key: id, value = list of annoations + example: '65884_at': ['15672394', '138402'] + """ + _valid_annot = ['genename', 'pmid', 'symbol', 'enzyme', 'chr', 'chrloc'] + if annot.lower() not in _valid_annot: + raise ValueError("Annotation must be one of %s" %_valid_annot) + silent_eval("library("+org+")") + silent_eval("dummy<-as.list("+org+annot.upper()+")") + silent_eval('annotations <- dummy[!is.na(dummy)]') + aff2annot = rpy.r("annotations") + if id_type=='loc': + aff2loc, loc2aff = get_locusid(org=org) + loc2annot = {} + for geneid, annotation in aff2annot.items(): + annotation = ensure_list(annotation) + print annotation + if loc2annot.has_key(geneid): + for extra in loc2annot[geneid]: + annotation.append(extra) + print "Found duplicate in gene: %s" %geneid + loc2annot[aff2loc[geneid]] = annotation + aff2annot = loc2annot + + out = {} + if probelist: + for pid in probelist: + try: + out[pid] = aff2annot.get(pid, 'none') + except: + print "Could not find id: %s" %pid + else: + out = aff2annot + + return out + +def ensure_list(value): + if isinstance(value, list): + return value + else: + return [value]