diff --git a/scripts/lpls/plots_lpls.py b/scripts/lpls/plots_lpls.py
index bb1bc1b..9e4ca57 100644
--- a/scripts/lpls/plots_lpls.py
+++ b/scripts/lpls/plots_lpls.py
@@ -5,7 +5,7 @@ import networkx as nx
 
 def plot_corrloads(R, pc1=0,pc2=1,s=20, c='b', zorder=5,expvar=None,ax=None,drawback=True, labels=None):
     """ Correlation loading plot."""
-    # backgorund
+    # background
     if ax==None or drawback==True:
        radius = 1
        center = (0,0)
diff --git a/scripts/lpls/rpy_go.py b/scripts/lpls/rpy_go.py
index 9dda37c..97d3822 100644
--- a/scripts/lpls/rpy_go.py
+++ b/scripts/lpls/rpy_go.py
@@ -2,7 +2,7 @@ import scipy
 import rpy
 silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
-
+import collections
 
 
 def goterms_from_gene(genelist, ontology='BP', garbage=None):
     """ Returns the go-terms from a specified genelist (Entrez id).
@@ -18,7 +18,7 @@ def goterms_from_gene(genelist, ontology='BP', garbage=None):
        "IDA" : "inferred from direct assay",
        "IEP" : "inferred from expression pattern",
        "IEA" : "inferred from electronic annotation",
-       "TAS" : "traceable author statement", 
+       "TAS" : "traceable author statement",
        "NAS" : "non-traceable author statement",
        "ND" : "no biological data available",
        "IC" : "inferred by curator"
@@ -167,3 +167,47 @@ def gene_GO_hypergeo_test(genelist,universe="entrezUniverse",ontology="BP",chip
     result = rpy.r.summary(rpy.r.hyperGTest(params))
     return rpy.r.summary(result), params
 
+
+def data_aff2loc_hgu133a(X, aff_ids, verbose=False):
+    aff_ids = scipy.asarray(aff_ids)
+    if verbose:
+        print "\nNumber of probesets in affy list: %s" %len(aff_ids)
+    import rpy
+    rpy.r.library("hgu133a")
+    trans_table = rpy.r.as_list(rpy.r.hgu133aENTREZID)
+    if verbose:
+        print "Number of entrez ids: %d" %(scipy.asarray(trans_table.values())>0).sum()
+    enz2aff = collections.defaultdict(list)
+    #aff2enz = collections.defaultdict(list)
+    for aff, enz in trans_table.items():
+        if int(enz)>0 and (aff in aff_ids):
+            enz2aff[enz].append(aff)
+            #aff2enz[aff].append(enz)
+    if verbose:
+        print "\nNumber of translated entrez ids: %d" %len(enz2aff)
+    aff2ind = dict(zip(aff_ids, scipy.arange(len(aff_ids))))
+    var_x = X.var(0)
+    new_data = []
+    new_ids = []
+    m = 0
+    s = 0
+    for enz, aff_id_list in enz2aff.items():
+        index = [aff2ind[aff_id] for aff_id in aff_id_list]
+        if len(index)>1:
+            m+=1
+            if verbose:
+                pass
+                #print "\nEntrez id: %s has %d probesets" %(enz, len(index))
+                #print index
+            xsub = X[:,index]
+            choose_this = scipy.argmax(xsub.var(0))
+            new_data.append(xsub[:,choose_this].ravel())
+        else:
+            s+=1
+            new_data.append(X[:,index].ravel())
+        new_ids.append(enz)
+    if verbose:
+        print "Ids with multiple probesets: %d" %m
+        print "Ids with unique probeset: %d" %s
+    X = scipy.asarray(new_data).T
+    return X, new_ids
diff --git a/scripts/lpls/run_smoker.py b/scripts/lpls/run_smoker.py
index a817eae..184637c 100644
--- a/scripts/lpls/run_smoker.py
+++ b/scripts/lpls/run_smoker.py
@@ -11,41 +11,30 @@ import cx_stats
 from plots_lpls import plot_corrloads
 
 ######## DATA ##########
-# full smoker data
-DX = dataset.read_ftsv(open("../../data/smokers-full/Smokers.ftsv"))
-DY = dataset.read_ftsv(open("../../data/smokers-full/Yg.ftsv"))
-Y = DY.asarray().astype('d')
-# select subset genes by SAM
-rpy.r.library("siggenes")
-rpy.r.library("qvalue")
-data = DX.asarray().T
-# data = data[:100,:]
-rpy.r.assign("data", data)
-cl = dot(DY.asarray(), diag([1,2,3])).sum(1)
-rpy.r.assign("cl", cl)
-rpy.r.assign("B", 20)
-# Perform a SAM analysis.
-print "Starting SAM" -sam = rpy.r('sam.out<-sam(data=data,cl=cl,B=B,rand=123)') -print "SAM done" -# Compute the q-values of the genes. -qq = rpy.r('qobj<-qvalue(sam.out@p.value)') -qvals = asarray(qq['qvalues']) -# cut off -cutoff = 0.05 -index = where(qvals0: - raise valueError("NANs in tmat") - -# Z-matrix -#Z, newind = rpy_go.genego_matrix(terms, tmat, gene_ids, terms,func=mean) -#Z = Z.T -Z = rpy_go.genego_sim(gene2goterms,gene_ids,terms,rpytmat1,go_term_sim="OA",term_sim=meth) - - -#### do another -#meth = methods[4] -#rpytmat = rpy.with_mode(rpy.NO_CONVERSION, rpy.r.getTermSim)(terms, method=meth,verbose=False) -#tmat = rpy.r.assign("haha", rpytmat) - -# check if all terms where found -#nanindex = where(isnan(tmat[:,0]))[0] -#if len(nanindex)>0: -# raise valueError("NANs in tmat") - -# Z-matrix -#Z, newind = rpy_go.genego_matrix(terms, tmat, gene_ids, terms,func=mean) -#Z = Z.T -#Z = rpy_go.genego_sim(gene2goterms,gene_ids,terms,rpytmat,go_term_sim="OA",term_sim=meth) - - +rpytmat = rpy.with_mode(rpy.NO_CONVERSION, rpy.r.getTermSim)(terms, method=meth,verbose=False) +tmat = rpy.r.assign("haha", rpytmat) +print "\n Calculating Z matrix" +Z = rpy_go.genego_sim(gene2goterms,gene_ids,terms,rpytmat,go_term_sim="OA",term_sim=meth) # update data (X) matrix -#newind = [gene2ind[gene] for gene in gene_ids] newind = DX.get_indices('gene_ids', gene_ids) -Xr = X[:,newind] -#new_gene_ids = asarray(gene_ids)[newind] +Xr = DX.asarray()[:,newind] ######## LPLSR ######## print "LPLSR ..." a_max = 5 aopt = 3 -alpha=.4 -mean_ctr = [2, 0, 1] +xz_alpha = .5 +w_alpha = .1 +mean_ctr = [2, 0, 2] + +# standardize Z? +sdtz = False +if sdtz: + Z = Z/Z.std(0) + T, W, P, Q, U, L, K, B, b0, evx, evy, evz = nipals_lpls(Xr,Y,Z, a_max, - alpha=alpha, + alpha=xz_alpha, mean_ctr=mean_ctr) # Correlation loadings @@ -117,24 +117,22 @@ dx,Rx,rssx = correlation_loadings(Xr, T, P) dx,Ry,rssy = correlation_loadings(Y, T, Q) cadz,Rz,rssz = correlation_loadings(Z.T, W, L) # Prediction error -rmsep , yhat, class_error = cv_lpls(Xr, Y, Z, a_max, alpha=alpha,mean_ctr=mean_ctr) -alpha_check=True +rmsep , yhat, class_error = cv_lpls(Xr, Y, Z, a_max, alpha=xz_alpha,mean_ctr=mean_ctr) +alpha_check=False if alpha_check: Alpha = arange(0.01, 1, .1) Rmsep,Yhat, CE = [],[],[] for a in Alpha: - rmsep , yhat, ce = cv_lpls(Xr, Y, Z, a_max, alpha=alpha) + rmsep , yhat, ce = cv_lpls(Xr, Y, Z, a_max, alpha=xz_alpha,mean_ctr=mean_ctr) Rmsep.append(rmsep) Yhat.append(yhat) CE.append(ce) Rmsep = asarray(Rmsep) Yhat = asarray(Yhat) CE = asarray(CE) - - # Significance Hotellings T -Wx, Wz, Wy, = jk_lpls(Xr, Y, Z, aopt) +Wx, Wz, Wy, = jk_lpls(Xr, Y, Z, aopt, mean_ctr=mean_ctr,alpha=w_alpha) Ws = W*apply_along_axis(norm, 0, T) tsqx = cx_stats.hotelling(Wx, Ws[:,:aopt]) tsqz = cx_stats.hotelling(Wz, L[:,:aopt]) @@ -157,7 +155,8 @@ ylim([class_error.min()-.05, class_error.max()+.05]) title('Classification accuracy') figure(3) # Hypoid correlations -plot_corrloads(Rz, pc1=0, pc2=1, s=tsqz/10.0, c='b', zorder=5, expvar=evz, ax=None) +tsqz_s = 250*tsqz/tsqz.max() +plot_corrloads(Rz, pc1=0, pc2=1, s=tsqz_s, c='b', zorder=5, expvar=evz, ax=None) ax = gca() ylabels = DY.get_identifiers('_status', sorted=True) plot_corrloads(Ry, pc1=0, pc2=1, s=150, c='g', zorder=5, expvar=evy, ax=ax,labels=ylabels)