diff --git a/scripts/lpls/lpls.py b/scripts/lpls/lpls.py
new file mode 100644
index 0000000..b32a91c
--- /dev/null
+++ b/scripts/lpls/lpls.py
@@ -0,0 +1,409 @@
+import sys
+from pylab import *
+import matplotlib
+from scipy import *
+from scipy.linalg import inv,norm
+
+sys.path.append("/home/flatberg/fluents/fluents/lib")
+import select_generators
+sys.path.remove("/home/flatberg/fluents/fluents/lib")
+
+
+def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], verbose=True):
+    """L-shaped Partial Least Squares Regression by the nipals algorithm.
+
+    (X!Z)->Y
+    :input:
+        X : data matrix (m, n)
+        Y : data matrix (m, l)
+        Z : data matrix (u, n) -- rows are the third-way entities
+            (e.g. GO terms), columns match the columns of X
+
+    :output:
+        T : X-scores
+        W : X-weights/Z-weights
+        P : X-loadings
+        Q : Y-loadings
+        U : X-Y relation
+        L : Z-scores
+        K : Z-loads
+        B : Regression coefficients X->Y
+        b0: Regression coefficient intercept
+        evx : X-explained variance
+        evy : Y-explained variance
+        evz : Z-explained variance
+
+    :Notes:
+
+    """
+    if mean_ctr:
+        xctr, yctr, zctr = mean_ctr
+        X, mnX = center(X, xctr)
+        Y, mnY = center(Y, yctr)
+        Z, mnZ = center(Z, zctr)
+
+    varX = pow(X, 2).sum()
+    varY = pow(Y, 2).sum()
+    varZ = pow(Z, 2).sum()
+
+    m, n = X.shape
+    k, l = Y.shape
+    u, o = Z.shape
+
+    # initialize
+    U = empty((k, a_max))
+    Q = empty((l, a_max))
+    T = empty((m, a_max))
+    W = empty((n, a_max))
+    P = empty((n, a_max))
+    K = empty((o, a_max))
+    L = empty((u, a_max))
+    B = empty((a_max, n, l))
+    b0 = empty((a_max, m, l))
+    var_x = empty((a_max,))
+    var_y = empty((a_max,))
+    var_z = empty((a_max,))
+
+    for a in range(a_max):
+        if verbose:
+            print "\n Working on comp. %s" %a
+        u = Y[:,:1]
+        diff = 1
+        MAX_ITER = 100
+        lim = 1e-7
+        niter = 0
+        # NB: the loop body was lost in the original patch; what follows is
+        # a reconstruction of the standard L-PLS inner iteration, where
+        # alpha blends the X-derived and Z-derived weight vectors.
+        while (diff>lim and niter<MAX_ITER):
+            niter += 1
+            u1 = u.copy()
+            w = dot(X.T, u)
+            w = w/norm(w)
+            l = dot(Z, w)
+            k = dot(Z.T, l)
+            k = k/norm(k)
+            w = alpha*k + (1-alpha)*w
+            w = w/norm(w)
+            t = dot(X, w)
+            c = dot(Y.T, t)
+            c = c/norm(c)
+            u = dot(Y, c)
+            diff = norm(u - u1)
+        tt = dot(t.T, t)
+        p = dot(X.T, t)/tt
+        q = dot(Y.T, t)/tt
+        l = dot(Z, w)
+        U[:,a] = u.ravel()
+        W[:,a] = w.ravel()
+        P[:,a] = p.ravel()
+        T[:,a] = t.ravel()
+        Q[:,a] = q.ravel()
+        L[:,a] = l.ravel()
+        K[:,a] = k.ravel()
+        # deflate
+        X = X - dot(t, p.T)
+        Y = Y - dot(t, q.T)
+        Z = (Z.T - dot(w, l.T)).T
+        var_x[a] = pow(X, 2).sum()
+        var_y[a] = pow(Y, 2).sum()
+        var_z[a] = pow(Z, 2).sum()
+        B[a] = dot(dot(W[:,:a+1], inv(dot(P[:,:a+1].T, W[:,:a+1]))), Q[:,:a+1].T)
+        b0[a] = mnY - dot(mnX, B[a])
+    # variance explained
+    evx = 100.0*(1 - var_x/varX)
+    evy = 100.0*(1 - var_y/varY)
+    evz = 100.0*(1 - var_z/varZ)
+    return T, W, P, Q, U, L, K, B, b0, evx, evy, evz
+
+
+def svd_lpls(X, Y, Z, a_max, mean_ctr=[2, 0, 1], verbose=True):
+    """L-shaped PLS with one svd per component.
+
+    NB: the original function header was lost in this patch; the name and
+    signature here are reconstructed stand-ins.
+
+    (X!Z)->Y
+    :input:
+        X : data matrix (m, n)
+        Y : data matrix (m, l)
+        Z : data matrix (u, n)
+
+    :output:
+        T : X-scores
+        W : X-weights/Z-weights
+        P : X-loadings
+        Q : Y-loadings
+        U : X-Y relation
+        L : Z-scores
+        K : Z-loads
+        B : Regression coefficients X->Y
+        b0: Regression coefficient intercept
+        evx : X-explained variance
+        evy : Y-explained variance
+        evz : Z-explained variance
+
+    :Notes:
+        Not quite there yet; unverified.
+
+    """
+    if mean_ctr:
+        xctr, yctr, zctr = mean_ctr
+        X, mnX = center(X, xctr)
+        Y, mnY = center(Y, yctr)
+        Z, mnZ = center(Z, zctr)
+
+    varX = pow(X, 2).sum()
+    varY = pow(Y, 2).sum()
+    varZ = pow(Z, 2).sum()
+
+    m, n = X.shape
+    k, l = Y.shape
+    u, o = Z.shape
+
+    # initialize
+    U = empty((k, a_max))
+    Q = empty((l, a_max))
+    T = empty((m, a_max))
+    W = empty((n, a_max))
+    P = empty((n, a_max))
+    K = empty((o, a_max))
+    L = empty((u, a_max))
+    var_x = empty((a_max,))
+    var_y = empty((a_max,))
+    var_z = empty((a_max,))
+
+    for a in range(a_max):
+        if verbose:
+            print "\n Working on comp. %s" %a
+        xyz = dot(dot(Z,X.T),Y)
+        u,s,vt = linalg.svd(xyz, 0)
+        w = u[:,:1]                   # leading left singular vector
+        t = dot(X, w)
+        tt = dot(t.T, t)
+        p = dot(X.T, t)/tt
+        q = dot(Y.T, t)/tt
+        l = dot(Z.T, w)
+        k = dot(Z, l)/dot(l.T, l)     # NB: k was undefined in the original;
+                                      # this mirrors nipals_lpls (unverified)
+        W[:,a] = w.ravel()
+        P[:,a] = p.ravel()
+        T[:,a] = t.ravel()
+        Q[:,a] = q.ravel()
+        L[:,a] = l.ravel()
+        K[:,a] = k.ravel()
+        X = X - dot(t, p.T)
+        Y = Y - dot(t, q.T)
+        Z = (Z.T - dot(w, l.T)).T
+        var_x[a] = pow(X, 2).sum()
+        var_y[a] = pow(Y, 2).sum()
+        var_z[a] = pow(Z, 2).sum()
+    B = dot(dot(W, inv(dot(P.T, W))), Q.T)
+    b0 = mnY - dot(mnX, B)
+    # variance explained
+    evx = 100.0*(1 - var_x/varX)
+    evy = 100.0*(1 - var_y/varY)
+    evz = 100.0*(1 - var_z/varZ)
+    return T, W, P, Q, U, L, K, B, b0, evx, evy, evz
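+
+# Minimal usage sketch (illustrative, not part of the original module):
+# random matrices with matching shapes -- X:(m, n), Y:(m, l) and Z with
+# the same number of columns as X.
+#
+#   X = rand(10, 8); Y = rand(10, 3); Z = rand(5, 8)
+#   T, W, P, Q, U, L, K, B, b0, evx, evy, evz = nipals_lpls(X, Y, Z,
+#                                      a_max=2, alpha=.5, verbose=False)
+#   print evx, evy, evz   # percent explained variance per component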
+ """ + if mean_ctr!=None: + xctr, yctr, zctr = mean_ctr + X, mnX = center(X, xctr) + Y, mnY = center(Y, yctr) + Z, mnZ = center(Z, zctr) + + varX = pow(X, 2).sum() + varY = pow(Y, 2).sum() + varZ = pow(Z, 2).sum() + m, n = X.shape + k, l = Y.shape + u, o = Z.shape + + # initialize + Wy = empty((l, a_max)) + Py = empty((l, a_max)) + Ty = empty((m, a_max)) + Tz = empty((o, a_max)) + Wz = empty((u, a_max)) + Pz = empty((u, a_max)) + var_x = empty((a_max,)) + var_y = empty((a_max,)) + var_z = empty((a_max,)) + + # residuals + Ey = Y.copy() + Ez = Z.copy() + Ex = X.copy() + for i in range(a_max): + YtXZ = dot(Ey.T, dot(X, Ez.T)) + U, S, V = linalg.svd(YtXZ) + wy = U[:,0] + print wy + wz = V[0,:] + ty = dot(Ey, wy) + tz = dot(Ez.T, wz) + py = dot(Ey.T, ty)/dot(ty.T,ty) + pz = dot(Ez, tz)/dot(tz.T,tz) + Wy[:,i] = wy + Wz[:,i] = wz + Ty[:,i] = ty + Tz[:,i] = tz + Py[:,i] = py + Pz[:,i] = pz + Ey = Ey - outer(ty, py.T) + Ez = (Ez.T - outer(tz, pz.T)).T + var_y[i] = pow(Ey, 2).sum() + var_z[i] = pow(Ez, 2).sum() + + tyd = apply_along_axis(norm, 0, Ty) + tzd = apply_along_axis(norm, 0, Tz) + Tyu = Ty/tyd + Tzu = Tz/tzd + C = dot(dot(Tyu.T, X), Tzu) + for i in range(a_max): + Ex = Ex - dot(dot(Ty[:,:i+1],C[:i+1,:i+1]), Tz[:,:i+1].T) + var_x[i] = pow(Ex,2).sum() + # variance explained + print "var_x:" + print var_x + print "varX total:" + print varX + + evx = 100.0*(1 - var_x/varX) + evy = 100.0*(1 - var_y/varY) + evz = 100.0*(1 - var_z/varZ) + + return Ty, Tz, Wy, Wz, Py, Pz, C, Ey, Ez, Ex, evx, evy, evz + +def bifpls(X, Y, Z, a_max, alpha): + """Swedssihsh LPLS by nipals. + """ + u = X[:,0] + Ey = Y.copy() + Ez = Z.copy() + for i in range(100): + w = dot(X.T,u) + w = w/vnorm(w) + t = dot(X, w) + q = dot(Ey, t.T)/dot(t.T,t) + qnorm = vnorm(q) + q = q/qnorm + v = dot(Ez, q) + s = dot(Ez.T, v)/dot(v.T,v) + v = v*vnorm(s) + s = s/vnorm(s) + c = qnorm*(alpha*q + (1-alpha)*s) + u = dot(Ey, c)/dot(s.T,s) + p = dot(X.T, t)/dot(t.T,t) + v2 = dot(Ez, s)/dot(s.T,s) + Ey = Ey - dot(t, p.T) + Ez = Ez - dot(v2, c.T) + # variance explained + evx = 100.0*(1 - var_x/varX) + evy = 100.0*(1 - var_y/varY) + evz = 100.0*(1 - var_z/varZ) + +def center(a, axis): + # 0 = col center, 1 = row center, 2 = double center + # -1 = nothing + if axis==-1: + return a + elif axis==0: + mn = a.mean(0) + return a - mn, mn + elif axis==1: + mn = a.mean(1)[:,newaxis] + return a - mn , mn + elif axis==2: + mn = a.mean(0) + a.mean(1)[:,newaxis] - a.mean() + return a - mn, mn + else: + raise IOError("input error: axis must be in [-1,0,1,2]") + +def correlation_loadings(D, T, P, test=True): + """ Returns correlation loadings. + + :input: + - D: [nsamps, nvars], data (non-centered data) + - T: [nsamps, a_max], Scores + - P: [nvars, a_max], Loadings + :ouput: + - Rloads: [nvars, a_max], Correlation loadings + - rmseVars: [nvars], scaling coeff. for each var in D + + :notes: + - FIXME: Calculation is not valid .... 
+
+def correlation_loadings(D, T, P, test=True):
+    """Returns correlation loadings.
+
+    :input:
+        - D: [nsamps, nvars], data (non-centered data)
+        - T: [nsamps, a_max], Scores
+        - P: [nvars, a_max], Loadings
+    :output:
+        - Rloads: [nvars, a_max], Correlation loadings
+        - rmseVars: [nvars], scaling coeff. for each var in D
+
+    :notes:
+        - FIXME: the scaled-loadings calculation is not validated;
+          corrcoef is used instead (returned as R).
+    """
+    nsamps, nvars = D.shape
+    nsampsT, a_max = T.shape
+    nvarsP, a_maxP = P.shape
+    if nsamps!=nsampsT: raise ValueError("D/T mismatch")
+    if a_max!=a_maxP: raise ValueError("a_max mismatch")
+    if nvars!=nvarsP: raise ValueError("D/P mismatch")
+
+    # init
+    Rloads = empty((nvars, a_max), 'd')
+    stdvar = stats.std(D, 0)
+    rmseVars = sqrt(nsamps-1)*stdvar
+
+    # center
+    D = D - D.mean(0)
+    TT = diag(dot(T.T, T))
+    sTT = sqrt(TT)
+    for a in range(a_max):
+        Rloads[:,a] = sTT[a]*P[:,a]/rmseVars
+    R = empty_like(Rloads)
+    for a in range(a_max):
+        for k in range(nvars):
+            r = corrcoef(D[:,k], T[:,a])
+            R[k,a] = r[0,1]
+    #Rloads = R
+    return Rloads, R, rmseVars
+
+
+def cv_lpls(X, Y, Z, a_max=2, nsets=None, alpha=.5):
+    """Performs crossvalidation to get generalisation error in lpls."""
+    cv_iter = select_generators.pls_gen(X, Y, n_blocks=nsets, center=True, index_out=True)
+    k, l = Y.shape
+    Yhat = empty((a_max, k, l), 'd')
+    for i, (xcal, xi, ycal, yi, ind) in enumerate(cv_iter):
+        T, W, P, Q, U, L, K, B, b0, evx, evy, evz = nipals_lpls(xcal, ycal, Z,
+                                                         a_max=a_max,
+                                                         alpha=alpha,
+                                                         mean_ctr=[0,0,1],
+                                                         verbose=False)
+        for a in range(a_max):
+            Yhat[a,ind,:] = b0[a][0] + dot(xi, B[a])
+    Yhat_class = zeros_like(Yhat)
+    for a in range(a_max):
+        for i in range(k):
+            Yhat_class[a,i,argmax(Yhat[a,i,:])] = 1.0
+    class_err = 100*((Yhat_class+Y)==2).sum(1)/Y.sum(0).astype('d')
+    sep = (Y - Yhat)**2
+    rmsep = sqrt(sep.mean(1))
+    return rmsep, Yhat, class_err
+
+def jk_lpls(X, Y, Z, a_max, nsets=None, alpha=.5):
+    """Jackknifed lpls weights for significance testing."""
+    cv_iter = select_generators.pls_gen(X, Y, n_blocks=nsets, center=True, index_out=False)
+    m, n = X.shape
+    k, l = Y.shape
+    o, p = Z.shape
+    if nsets==None:
+        nsets = m
+    WWx = empty((nsets, n, a_max), 'd')
+    WWz = empty((nsets, o, a_max), 'd')
+    WWy = empty((nsets, l, a_max), 'd')
+    for i, (xcal, xi, ycal, yi) in enumerate(cv_iter):
+        T, W, P, Q, U, L, K, B, b0, evx, evy, evz = nipals_lpls(xcal, ycal, Z,
+                                                         a_max=a_max,
+                                                         alpha=alpha,
+                                                         mean_ctr=[0,0,1],
+                                                         verbose=False)
+        WWx[i,:,:] = W
+        WWz[i,:,:] = L
+        WWy[i,:,:] = Q
+
+    return WWx, WWz, WWy
diff --git a/scripts/lpls/plots_lpls.py b/scripts/lpls/plots_lpls.py
new file mode 100644
index 0000000..1438ded
--- /dev/null
+++ b/scripts/lpls/plots_lpls.py
@@ -0,0 +1,38 @@
+import pylab
+import matplotlib
+
+def plot_corrloads(R, pc1=0, pc2=1, s=20, c='b', zorder=5, expvar=None, ax=None, drawback=True, labels=None):
+    """Correlation loading plot."""
+    if ax is None:
+        ax = pylab.gca()
+    # background: unit (100%) and half-radius (50%) correlation circles
+    if drawback:
+        radius = 1
+        center = (0,0)
+        c100 = matplotlib.patches.Circle(center, radius=radius,
+                                         facecolor='gray',
+                                         alpha=.1,
+                                         zorder=1)
+        c50 = matplotlib.patches.Circle(center, radius=radius/2.0,
+                                        facecolor='gray',
+                                        alpha=.1,
+                                        zorder=2)
+        ax.add_patch(c100)
+        ax.add_patch(c50)
+        ax.axhline(lw=1.5, color='k')
+        ax.axvline(lw=1.5, color='k')
+
+    # corrloads
+    ax.scatter(R[:,pc1], R[:,pc2], s=s, c=c, zorder=zorder)
+    ax.set_xlim([-1,1])
+    ax.set_ylim([-1,1])
+    if expvar is not None:
+        xstring = "Comp: %d expl.var: %.1f " %(pc1+1, expvar[pc1])
+        pylab.xlabel(xstring)
+        ystring = "Comp: %d expl.var.: %.1f " %(pc2+1, expvar[pc2])
+        pylab.ylabel(ystring)
+    if labels:
+        assert(len(labels)==R.shape[0])
+        for name, r in zip(labels, R):
+            ax.text(r[pc1], r[pc2], " " + name)
+    #pylab.show()
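+
+if __name__ == '__main__':
+    # Smoke test with random data (illustrative only; real correlation
+    # loadings come from e.g. lpls.correlation_loadings).
+    import numpy
+    R = numpy.random.uniform(-1, 1, (20, 2))
+    plot_corrloads(R, pc1=0, pc2=1, expvar=[45.0, 23.0],
+                   labels=["v%d" % i for i in range(20)])
+    pylab.show()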
diff --git a/scripts/lpls/rpy_go.py b/scripts/lpls/rpy_go.py
new file mode 100644
index 0000000..38f54d5
--- /dev/null
+++ b/scripts/lpls/rpy_go.py
@@ -0,0 +1,110 @@
+"""Module for Gene Ontology related functions called in R."""
+import scipy
+
+import rpy
+silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
+
+def get_term_sim(termlist, method="JiangConrath", verbose=False):
+    """Returns the similarity matrix between go-terms.
+
+    Arguments:
+        termlist: character vector of GO terms
+        method: one of
+            ("JiangConrath","Resnik","Lin","CoutoEnriched","CoutoJiangConrath","CoutoResnik","CoutoLin")
+        verbose: print out various information or not
+    """
+    _methods = ("JiangConrath","Resnik","Lin","CoutoEnriched","CoutoJiangConrath","CoutoResnik","CoutoLin")
+    assert(method in _methods)
+    assert(termlist[0][:2]=='GO')
+    rpy.r.library("GOSim")
+    return rpy.r.getTermSim(termlist, method=method, verbose=verbose)
+
+def get_gene_sim(genelist, similarity='OA', distance="Resnik"):
+    """Returns the similarity matrix between genes based on their GO annotation."""
+    rpy.r.library("GOSim")
+    rpy.r.assign("ids", genelist)
+    silent_eval('a<-getGeneSim(ids)')
+    return rpy.r('a')
+
+def goterms_from_gene(genelist, ontology=['BP'], garbage=['IEA', 'ISS', 'ND']):
+    """Returns the go-terms from a specified genelist (Entrez id),
+    filtered on ontology and evidence codes."""
+    rpy.r.library("GO")
+    _CODES = {"IMP" : "inferred from mutant phenotype",
+              "IGI" : "inferred from genetic interaction",
+              "IPI" : "inferred from physical interaction",
+              "ISS" : "inferred from sequence similarity",
+              "IDA" : "inferred from direct assay",
+              "IEP" : "inferred from expression pattern",
+              "IEA" : "inferred from electronic annotation",
+              "TAS" : "traceable author statement",
+              "NAS" : "non-traceable author statement",
+              "ND" : "no biological data available",
+              "IC" : "inferred by curator"
+              }
+    _ONTOLOGIES = ['BP', 'CC', 'MF']
+    assert(scipy.all([(code in _CODES) for code in garbage]))
+    assert(scipy.all([(ont in _ONTOLOGIES) for ont in ontology]))
+
+    goterms = {}
+    for gene in genelist:
+        goterms[gene] = []
+        info = rpy.r('GOENTREZID2GO[["' + str(gene) + '"]]')
+        if info:
+            for term, desc in info.items():
+                if desc['Ontology'] in ontology and desc['Evidence'] not in garbage:
+                    goterms[gene].append(term)
+    return goterms
+
+def genego_matrix(goterms, tmat, gene_ids, term_ids, func=min):
+    """Computes the gene x go-term matrix; func (min, max, median, mean, ...)
+    collapses the term similarities when a gene maps to multiple go-terms."""
+    ngenes = len(gene_ids)
+    nterms = len(term_ids)
+    gene2indx = {}
+    for i, id in enumerate(gene_ids):
+        gene2indx[id] = i
+    term2indx = {}
+    for i, id in enumerate(term_ids):
+        term2indx[id] = i
+    G = []
+    newindex = []
+    for gene, terms in goterms.items():
+        g_ind = gene2indx[gene]
+        if len(terms)>0:
+            t_ind = []
+            newindex.append(g_ind)
+            for term in terms:
+                if term2indx.has_key(term): t_ind.append(term2indx[term])
+            subsim = tmat[t_ind, :]
+            gene_vec = scipy.apply_along_axis(func, 0, subsim)
+            G.append(gene_vec)
+
+    return scipy.asarray(G), newindex
+
+def goterm2desc(gotermlist):
+    """Returns the go-terms description keyed by go-term."""
+    rpy.r.library("GO")
+    term2desc = {}
+    for term in gotermlist:
+        try:
+            desc = rpy.r('Term(GOTERM[["' + str(term) + '"]])')
+            term2desc[str(term)] = desc
+        except Exception:
+            raise Warning("Description not found for %s\n Mapping incomplete" %term)
+    return term2desc
+
+def parents_dag(go_terms, ontology=['BP']):
+    """Returns a list of lists representation of a GO DAG parents of goterms.
+
+    NB: left unfinished in the original patch.
+    """
+    try:
+        rpy.r.library("GOstats")
+    except Exception:
+        raise ImportError("GOstats")
+    assert(go_terms[0][:3]=='GO:')
+
+    # go valid namespace
+    go_env = {'BP': rpy.r.BPPARENTS, 'MF': rpy.r.MFPARENTS, 'CC': rpy.r.CCPARENTS}
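+
+if __name__ == '__main__':
+    # Smoke test (requires a working rpy/R with GOSim installed; the GO
+    # identifiers below are arbitrary examples).
+    example_terms = ['GO:0006470', 'GO:0007165']
+    print get_term_sim(example_terms, method="Resnik")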
diff --git a/scripts/lpls/run_smoker.py b/scripts/lpls/run_smoker.py
new file mode 100644
index 0000000..1c6036f
--- /dev/null
+++ b/scripts/lpls/run_smoker.py
@@ -0,0 +1,141 @@
+import sys
+import rpy
+from pylab import gca, figure, subplot, plot, legend, ylabel, xlabel, show
+from scipy import *
+from lpls import *
+import rpy_go
+sys.path.append("../../fluents") # home of dataset
+sys.path.append("../../fluents/lib") # home of cx_stats
+import dataset
+import cx_stats
+from plots_lpls import plot_corrloads
+
+######## DATA ##########
+# full smoker data
+DX = dataset.read_ftsv(open("../../data/smokers-full/Xfull.ftsv"))
+DY = dataset.read_ftsv(open("../../data/smokers-full/Yg.ftsv"))
+Y = DY.asarray()
+# select subset genes by SAM
+rpy.r.library("siggenes")
+rpy.r.library("qvalue")
+data = DX.asarray().T
+# data = data[:100,:]
+rpy.r.assign("data", data)
+cl = dot(DY.asarray(), diag([1,2,3])).sum(1)
+rpy.r.assign("cl", cl)
+rpy.r.assign("B", 100)
+# Perform a SAM analysis.
+print "Starting SAM"
+sam = rpy.r('sam.out<-sam(data=data,cl=cl,B=B,rand=123)')
+print "SAM done"
+# Compute the q-values of the genes.
+qq = rpy.r('qobj<-qvalue(sam.out@p.value)')
+qvals = asarray(qq['qvalues'])
+# cut off
+co = 0.01
+index = where(qvals<co)[0]
+
+# Subset data
+X = DX.asarray()
+Xr = X[:,index]
+gene_ids = DX.get_identifiers('gene_ids', index)
+
+### Build GO data ####
+
+print "Go terms ..."
+goterms = rpy_go.goterms_from_gene(gene_ids)
+terms = set()
+for t in goterms.values():
+    terms.update(t)
+terms = list(terms)
+rpy.r.library("GOSim")
+# Go-term similarity matrix
+methods = ("JiangConrath","Resnik","Lin","CoutoEnriched","CoutoJiangConrath","CoutoResnik","CoutoLin")
+meth = methods[0]
+print "Term-term similarity matrix (method = %s)" %meth
+if meth=="CoutoEnriched":
+    rpy.r('setEnrichmentFactors(alpha=0.1,beta=0.5)')
+tmat = rpy.r.getTermSim(terms, verbose=False, method=meth)
+# check if all terms were found
+nanindex = where(isnan(tmat[:,0]))[0]
+keep = []
+has_miss = False
+if len(nanindex)>0:
+    has_miss = True
+    print "Some terms missing in similarity matrix"
+    keep = where(isnan(tmat[:,0])!=True)[0]
+    print "Number of nans: %d" %len(nanindex)
+    tmat_new = tmat[:,keep][keep,:]
+    new_terms = [i for ind, i in enumerate(terms) if ind in keep]
+    bad_terms = [i for ind, i in enumerate(terms) if ind not in keep]
+    # update go-term dict
+    for gene, trm in goterms.items():
+        trm = [t for t in trm if t not in bad_terms]
+        if len(trm)==0:
+            print "Removing gene: %s" %gene
+        goterms[gene] = trm
+    terms = new_terms
+    tmat = tmat_new
+# Z-matrix
+# func (min, max, median, mean, etc) decides on the representation of
+# gene -> goterm when multiple goterms exist for one gene
+Z, newind = rpy_go.genego_matrix(goterms, tmat, gene_ids, terms, func=mean)
+Z = Z.T
+# update X matrix (no go-terms available)
+Xr = Xr[:,newind]
+gene_ids = asarray(gene_ids)[newind]
+
+
+######## LPLSR ########
+print "LPLSR ..."
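+
+# a_max: number of latent components to extract; aopt: the number believed
+# optimal (judged from the cross-validated RMSEP below); alpha weights the
+# X-side vs. Z-side information in the nipals_lpls weight update.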
+a_max = 5
+aopt = 2
+alpha = .5
+T, W, P, Q, U, L, K, B, b0, evx, evy, evz = nipals_lpls(Xr, Y, Z, a_max, alpha)
+
+# Correlation loadings
+dx, Rx, ssx = correlation_loadings(Xr, T, P)
+dx, Ry, ssx = correlation_loadings(Y, T, Q)
+cadx, Rz, ssx = correlation_loadings(Z.T, K, L)
+
+# Prediction error
+rmsep, yhat, class_error = cv_lpls(Xr, Y, Z, a_max, alpha=alpha)
+
+# Significance Hotellings T
+Wx, Wz, Wy = jk_lpls(Xr, Y, Z, aopt)
+tsqx = cx_stats.hotelling(Wx, W[:,:aopt])
+tsqz = cx_stats.hotelling(Wz, L[:,:aopt])
+
+
+## plots ##
+figure(1) # rmsep
+#bar()
+figure(2) # Hypoid correlations
+plot_corrloads(Rz, pc1=0, pc2=1, s=tsqz/10.0, c='b', zorder=5, expvar=evz, ax=None)
+ax = gca()
+plot_corrloads(Ry, pc1=0, pc2=1, s=150, c='g', zorder=5, expvar=evy, ax=ax)
+
+figure(3)
+subplot(221)
+ax = gca()
+plot_corrloads(Rx, pc1=0, pc2=1, s=tsqx/2.0, c='b', zorder=5, expvar=evx, ax=ax)
+#title('X correlation')
+subplot(222)
+ax = gca()
+plot_corrloads(Ry, pc1=0, pc2=1, s=150, c='g', zorder=5, expvar=evy, ax=ax)
+#title('Y correlation')
+subplot(223)
+ax = gca()
+plot_corrloads(Rz, pc1=0, pc2=1, s=tsqz/10.0, c='r', zorder=5, expvar=evz, ax=ax)
+#title('Z correlation')
+subplot(224)
+plot(arange(len(evx)), evx, 'b', label='X', linewidth=2)
+plot(evy, 'g', label='Y', linewidth=2)
+plot(evz, 'r', label='Z', linewidth=2)
+legend(loc=2)
+ylabel('Explained variance')
+xlabel('Component')
+show()
diff --git a/scripts/lpls/yeast_annot.py b/scripts/lpls/yeast_annot.py
new file mode 100644
index 0000000..c7abc3b
--- /dev/null
+++ b/scripts/lpls/yeast_annot.py
@@ -0,0 +1,44 @@
+
+def smdb_annot(orflist=None, input_fname='registry.genenames.tab', output_fname='yeast.annot'):
+    """Reads registry.genenames.tab from the Stanford yeast
+    microarray database and writes an annotation file.
+
+    Available from:
+    ftp://genome-ftp.stanford.edu/pub/yeast/data_download/gene_registry/registry.genenames.tab
+
+    input: orflist -- list of orfs (open reading frames), or None for all
+           input_fname -- (optional) file to fetch info from
+
+    registry.genenames contains:
+
+    0 = Locus name
+    1 = Other name
+    2 = Description
+    3 = Gene product
+    4 = Phenotype
+    5 = ORF name
+    6 = SGDID
+
+    NB! Other name, Gene product and Phenotype may have more
+    than one mapping. These are separated by |
+
+    Output: writes an annotation file
+    """
+    outfile = open(output_fname, 'w')
+    header = "Orf\tLocus_id\tOther_name\tDescription\tGene_product\tPhenotype\tSGD_ID\n"
+    outfile.write(header)
+    text = open(input_fname, 'r').read().splitlines()
+    for line in text:
+        els = line.split('\t')
+        orf_name = els.pop(5)
+        if orf_name!='': # we dont care about non-named orfs
+            if orflist and orf_name not in orflist:
+                continue
+            outfile.write(orf_name + "\t")
+            for e in els:
+                if e!='':
+                    outfile.write(str(e) + "\t")
+                else:
+                    outfile.write("NA\t")
+            outfile.write("\n")
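+
+if __name__ == '__main__':
+    # Example run (hypothetical ORF names): restrict the annotation file
+    # to two ORFs. registry.genenames.tab must be downloaded first.
+    smdb_annot(orflist=['YAL001C', 'YBR115C'])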