From 155dfada5cb9775bbd456084871d01ee683a249d Mon Sep 17 00:00:00 2001
From: flatberg <flatberg@pvv.ntnu.no>
Date: Mon, 23 Jul 2007 13:25:34 +0000
Subject: [PATCH] iups

---
 scripts/lpls/lpls.py       | 20 +++++++++++++-------
 scripts/lpls/plots_lpls.py |  1 +
 scripts/lpls/rpy_go.py     |  6 +++++-
 scripts/lpls/run_smoker.py | 21 ++++++++++++---------
 4 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/scripts/lpls/lpls.py b/scripts/lpls/lpls.py
index 13a0294..071ce5d 100644
--- a/scripts/lpls/lpls.py
+++ b/scripts/lpls/lpls.py
@@ -9,7 +9,7 @@ import select_generators
 sys.path.remove("/home/flatberg/fluents/fluents/lib")
 
 
-def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], verbose=True):
+def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=True):
     """ L-shaped Partial Least Sqaures Regression by the nipals algorithm.
 
     (X!Z)->Y
@@ -113,7 +113,11 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], verbose=True):
     evx = 100.0*(1 - var_x/varX)
     evy = 100.0*(1 - var_y/varY)
     evz = 100.0*(1 - var_z/varZ)
-    
+    if scale=='loads':
+        tnorm = apply_along_axis(norm, 0, T)
+        T = T/tnorm
+        Q = Q*tnorm
+        W = W*tnorm
     return T, W, P, Q, U, L, K, B, b0, evx, evy, evz
 
 def svd_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], verbose=True):
@@ -307,7 +311,8 @@ def center(a, axis):
      # 0 = col center, 1 = row center, 2 = double center
      # -1 = nothing
     if axis==-1:
-        return a
+        mn = zeros((a.shape[1],))
+        return a - mn, mn
     elif axis==0:
         mn = a.mean(0)
         return a - mn, mn
@@ -364,14 +369,14 @@ def correlation_loadings(D, T, P, test=True):
 
 def cv_lpls(X, Y, Z, a_max=2, nsets=None,alpha=.5):
     """Performs crossvalidation to get generalisation error in lpls"""
-    cv_iter = select_generators.pls_gen(X, Y, n_blocks=nsets,center=True,index_out=True)
+    cv_iter = select_generators.pls_gen(X, Y, n_blocks=nsets,center=False,index_out=True)
     k, l = Y.shape
     Yhat = empty((a_max,k,l), 'd')
     for i, (xcal,xi,ycal,yi,ind) in enumerate(cv_iter):
         T, W, P, Q, U, L, K, B, b0, evx, evy, evz = nipals_lpls(xcal,ycal,Z,
                                                                 a_max=a_max,
                                                                 alpha=alpha,
-                                                                mean_ctr=[0,0,1],
+                                                                mean_ctr=[2,0,1],
                                                                 verbose=False)
         for a in range(a_max):
             Yhat[a,ind,:] = b0[a][0][0] + dot(xi, B[a])
@@ -385,7 +390,7 @@ def cv_lpls(X, Y, Z, a_max=2, nsets=None,alpha=.5):
     return rmsep, Yhat, class_err
 
 def jk_lpls(X, Y, Z, a_max, nsets=None, alpha=.5):
-    cv_iter = select_generators.pls_gen(X, Y, n_blocks=nsets,center=True,index_out=False)
+    cv_iter = select_generators.pls_gen(X, Y, n_blocks=nsets,center=False,index_out=False)
     m, n = X.shape
     k, l = Y.shape
     o, p = Z.shape
@@ -398,7 +403,8 @@ def jk_lpls(X, Y, Z, a_max, nsets=None, alpha=.5):
         T, W, P, Q, U, L, K, B, b0, evx, evy, evz = nipals_lpls(xcal,ycal,Z,
                                                                 a_max=a_max,
                                                                 alpha=alpha,
-                                                                mean_ctr=[0,0,1],
+                                                                mean_ctr=[2,0,1],
+                                                                scale='loads',
                                                                 verbose=False)
         WWx[i,:,:] = W
         WWz[i,:,:] = L
diff --git a/scripts/lpls/plots_lpls.py b/scripts/lpls/plots_lpls.py
index a350b86..bb1bc1b 100644
--- a/scripts/lpls/plots_lpls.py
+++ b/scripts/lpls/plots_lpls.py
@@ -39,6 +39,7 @@ def plot_corrloads(R, pc1=0,pc2=1,s=20, c='b', zorder=5,expvar=None,ax=None,draw
     #pylab.show()
 
 def plot_dag(edge_dict, node_color='b', node_size=30,labels=None,nodelist=None,pos=None):
+    # networkx does not play well with colon in node names
     clean_edges = {}
     for head, neigb in edge_dict.items():
         head = head.replace(":", "_")
diff --git a/scripts/lpls/rpy_go.py b/scripts/lpls/rpy_go.py
index 14ada26..787700a 100644
--- a/scripts/lpls/rpy_go.py
+++ b/scripts/lpls/rpy_go.py
@@ -1,6 +1,5 @@
 """ Module for Gene ontology related functions called in R"""
 import scipy
-
 import rpy
 silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
 
@@ -126,3 +125,8 @@ def parents_dag(go_terms, ontology=['BP']):
             else:
                 edge_dict[nn] = [head]
     return edge_dict
+
+def gene_GO_hypergeo_test(genelist, universe, ontology = ['BP']):
+    
+    pvals = geneGoHyperGeoTest(entrezGeneIds, lib=None, ontology=ontology[0], universe=universe)
+    return pvals
diff --git a/scripts/lpls/run_smoker.py b/scripts/lpls/run_smoker.py
index 3f152b9..36deb09 100644
--- a/scripts/lpls/run_smoker.py
+++ b/scripts/lpls/run_smoker.py
@@ -32,14 +32,14 @@ print "SAM done"
 qq = rpy.r('qobj<-qvalue(sam.out@p.value)')
 qvals = asarray(qq['qvalues'])
 # cut off
-co = 0.1
+co = 0.001
 index = where(qvals<0.01)[0]
 
 # Subset data
 X = DX.asarray()
 Xr = X[:,index]
 gene_ids = DX.get_identifiers('gene_ids', index)
-
+print "\nWorkiing on subset with %s genes " %len(gene_ids)
 ### Build GO data ####
 
 print "Go terms ..."
@@ -48,13 +48,15 @@ terms = set()
 for t in goterms.values():
     terms.update(t)
 terms = list(terms)
+print "Number of go-terms: %s" %len(terms)
 rpy.r.library("GOSim")
 # Go-term similarity matrix
 methods = ("JiangConrath","Resnik","Lin","CoutoEnriched","CoutoJiangConrath","CoutoResnik","CoutoLin")
-meth = methods[2]
+meth = methods[0]
 print "Term-term similarity matrix (method = %s)" %meth
 if meth=="CoutoEnriched":
     rpy.r('setEnrichmentFactors(alpha=0.1,beta=0.5)')
+print "Calculating term-term similarity matrix"
 tmat = rpy.r.getTermSim(terms, verbose=False, method=meth)
 # check if all terms where found
 nanindex = where(isnan(tmat[:,0]))[0]
@@ -93,20 +95,21 @@ gene_ids = asarray(gene_ids)[newind]
 print "LPLSR ..."
 a_max = 5
 aopt = 2
-alpha=.5
+alpha=.6
 T, W, P, Q, U, L, K, B, b0, evx, evy, evz = nipals_lpls(Xr,Y,Z, a_max, alpha)
 
 # Correlation loadings
-dx,Rx,ssx= correlation_loadings(Xr, T, P)
-dx,Ry,ssx= correlation_loadings(Y, T, Q)
-cadx,Rz,ssx= correlation_loadings(Z.T, K, L)
+dx,Rx,rssx = correlation_loadings(Xr, T, P)
+dx,Ry,rssy = correlation_loadings(Y, T, Q)
+cadz,Rz,rssz = correlation_loadings(Z.T, W, L)
 # Prediction error
 rmsep , yhat, class_error = cv_lpls(Xr, Y, Z, a_max, alpha=alpha)
 
 # Significance Hotellings T
 Wx, Wz, Wy, = jk_lpls(Xr, Y, Z, aopt)
-tsqx = cx_stats.hotelling(Wx,W[:,:aopt])
-tsqz = cx_stats.hotelling(Wz,L[:,:aopt])
+Ws = W*apply_along_axis(norm, 0, T)
+tsqx = cx_stats.hotelling(Wx, Ws[:,:aopt])
+tsqz = cx_stats.hotelling(Wz, L[:,:aopt])
 
 
 ## plots ##