From 2951ca40882b8098c8841246ef60db742f74d9cc Mon Sep 17 00:00:00 2001
From: flatberg <flatberg@pvv.ntnu.no>
Date: Mon, 26 Nov 2007 15:30:52 +0000
Subject: [PATCH] A few updates

---
 pyblm/crossvalidation.py | 30 ++++++++++++----
 pyblm/engines.py         | 35 ++++++++++++------
 pyblm/models.py          | 78 ++++++++++++++++++++++------------------
 pyblm/statistics.py      | 24 +++++++++----
 4 files changed, 108 insertions(+), 59 deletions(-)

diff --git a/pyblm/crossvalidation.py b/pyblm/crossvalidation.py
index ab7aded..762f59b 100644
--- a/pyblm/crossvalidation.py
+++ b/pyblm/crossvalidation.py
@@ -12,7 +12,7 @@ from numpy.random import shuffle
 from engines import nipals_lpls as lpls
 
 
-def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2],verbose=True):
+def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2], zorth=False, verbose=True):
     """Performs crossvalidation for generalisation error in lpls.
 
     The L-PLS crossvalidation is estimated just like an ordinary pls
@@ -42,6 +42,8 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2],verbose=Tru
             0 : row center
             1 : column center
             2 : double center
+        zorth : {boolean}, optional
+            If True, require orthogonal latent components in Z.
         verbose : {boolean}, optional
             Verbosity of console output. For use in debugging.
     
@@ -70,7 +72,11 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2],verbose=Tru
     
     Yhat = empty((a_max, k, l), 'd')
     for cal, val in cv(nsets, k):
-        dat = lpls(X[cal],Y[cal],Z,a_max=a_max,alpha=alpha,mean_ctr=mean_ctr,verbose=verbose)
+        # fit the model on the calibration (training) samples
+        dat = lpls(X[cal], Y[cal], Z, a_max=a_max, alpha=alpha,
+                   mean_ctr=mean_ctr, zorth=zorth, verbose=verbose)
+
+        # center test data
         if mean_ctr[0] != 1:
             xi = X[val,:] - dat['mnx']
         else:
@@ -79,14 +85,24 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2],verbose=Tru
             ym = dat['mny']
         else:
             ym = Y[val].mean(1)[:,newaxis] #???: check this
+        # predictions
         for a in range(a_max):
             Yhat[a,val,:] = atleast_2d(ym + dot(xi, dat['B'][a]))
+        #if permute:
+        #    xcal = X[cal]
+        #    for a in range(1,a_max,1):
+        #        for n in range(10):
+        #            shuffle(cal)
+        #            dat = lpls(xcal, Y[cal], Z, a_max=a_max, alpha=alpha,
+        #                       mean_ctr=mean_ctr, verbose=verbose)
+                    
             
     # todo: need a better support for classification error
     y_is_class = Y.dtype.char.lower() in ['i','p', 'b', 'h','?']
     if y_is_class:
-        Yhat, err = class_error(Yhat,Y)
-        return Yhat, err
+        pass
+        #Yhat, err = class_error(Yhat, Y)
+        #return Yhat, err
 
     sep = (Y - Yhat)**2
     rmsep = sqrt(sep.mean(1)).T
@@ -317,8 +333,8 @@ def cv(N, K, randomise=True, sequential=False):
         otherwise interleaved ordering is used.
     
     """
-    if K>N:
-        raise ValueError, "You cannot divide a list of %d samples into more than %d segments. Yout tried: %s" %(N,N,K)
+    if N>K:
+        raise ValueError, "You cannot divide a list of %d samples into more than %d segments. Yout tried: %s" %(K, K, N)
     index = xrange(N)
     if randomise:
         from random import shuffle
@@ -371,7 +387,7 @@ def class_error(Yhat, Y, method='vanilla'):
     Yhat_c = zeros((k, l), dtype='d')
     for a in range(a_opt):
         for i in range(k):
-            Yhat_c[a,val,argmax(Yhat[a,val,:])] = 1.0
+            Yhat_c[a, val, argmax(Yhat[a,val,:])] = 1.0
     err = 100*((Yhat_c + Y) == 2).sum(1)/Y.sum(0).astype('d')
 
     return Yhat_c, err
diff --git a/pyblm/engines.py b/pyblm/engines.py
index 5c224ab..878b4ca 100644
--- a/pyblm/engines.py
+++ b/pyblm/engines.py
@@ -411,7 +411,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=-1):
             'evx': expvarx, 'evy': expvary, 'ssqx': ssqx, 'ssqy': ssqy,
             'leverage': leverage, 'mnx': mnx, 'mny': mny}
 
-def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=False):
+def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 2], scale='scores', zorth = False, verbose=False):
     """ L-shaped Partial Least Sqaures Regression by the nipals algorithm.
 
     An L-shaped low rank model aproximates three matrices in a hyploid
@@ -475,10 +475,14 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', ve
 
         scale : {'scores', 'loads'}, optional
             Option to decide on where the scale goes.
+        zorth : {boolean}, optional
+            If True, force orthogonality between the latent components
+            in Z.
         verbose : {boolean}, optional
             Verbosity of console output. For use in debugging.
 
     *References*
+
         Saeboe et al., LPLS-regression: a method for improved prediction and
         classification through inclusion of background information on
         predictor variables, J. of chemometrics and intell. laboratory syst.
@@ -522,18 +526,22 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', ve
     var_y = empty((a_max,))
     var_z = empty((a_max,))
 
-    MAX_ITER = 450
+    MAX_ITER = 4500
     LIM = finfo(X.dtype).resolution
     is_rd = False
     for a in range(a_max):
         if verbose:
             print "\nWorking on comp. %s" %a
         u = F[:,:1]
+        w = E[:1,:].T
+        l = G[:,:1]
         diff = 1
         niter = 0
         while (diff>LIM and niter<MAX_ITER):
             niter += 1
             u1 = u.copy()
+            w1 = w.copy()
+            l1 = l.copy()
             w = dot(E.T, u)
             wn = msqrt(dot(w.T, w))
             if wn < LIM:
@@ -552,20 +560,25 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', ve
             c = dot(F.T, t)
             c = c/msqrt(dot(c.T, c))
             u = dot(F, c)
-            diff = dot((u-u1).T, (u-u1))
-
+            diff = dot((u - u1).T, (u - u1))
         if verbose:
-            print "Converged after %s iterations" %niter
+            if niter==MAX_ITER:
+                print "Maximum nunber of iterations reached!"
+            print "Iterations: %d " %niter
             print "Error: %.2E" %diff
 
         if is_rd:
             print "Hei og haa ... rank deficient, this should really not happen"
             break
+
         tt = dot(t.T, t)
-        p = dot(X.T, t)/tt
-        q = dot(Y.T, t)/tt
-        l = dot(Z, w)
-        #k = dot(Z.T, l)/dot(l.T, l)
+        p = dot(E.T, t)/tt
+        q = dot(F.T, t)/tt
+        if zorth:
+            k = dot(G.T, l)/dot(l.T, l)
+        else:
+            k = w
+            l = dot(G, w)
         
         U[:,a] = u.ravel()
         W[:,a] = w.ravel()
@@ -575,10 +588,10 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', ve
         L[:,a] = l.ravel()
         K[:,a] = k.ravel()
 
-        
+        # rank-one deflations
         E = E - dot(t, p.T)
         F = F - dot(t, q.T)
-        G = (G.T - dot(k, l.T)).T
+        G = G - dot(l, k.T)
 
         var_x[a] = pow(E, 2).sum()
         var_y[a] = pow(F, 2).sum()
diff --git a/pyblm/models.py b/pyblm/models.py
index 17c6730..b21926b 100644
--- a/pyblm/models.py
+++ b/pyblm/models.py
@@ -1,6 +1,6 @@
 """Bilinear models"""
 
-from numpy import expand_dims
+from numpy import expand_dims,ones
 
 from engines import pca
 
@@ -14,8 +14,11 @@ def scale(x, axis=0):
     #scale = 1./x.std(axis)
     return expand_dims(scale, axis)
 
+
 class Model(object):
-    def __init__(name="johndoe"):
+    """All underscored attributes are properties.
+    """
+    def __init__(self, name="johndoe"):
         self.name = name
         self.options = {}
         
@@ -27,8 +30,8 @@ class Model(object):
 
     def clear(self):
         for param in self.__dict__.keys():
-            if param.startswith("_") and param[1]!="_":
-                exec "del self." + param
+            if param.startswith("_") and param[1:5]!="core":
+                exec "del self." + param[1:]
 
     def clear_core(self):
         for param in self.__dict__.keys():
@@ -43,7 +46,7 @@ class PCA(Model):
         self._x = x
         self.amax = amax
         self.aopt = amax
-
+        
     # properties
     def amax():
         doc = "maximum number of components"
@@ -78,29 +81,29 @@ class PCA(Model):
     def scores():
         doc = "pca scores"
         def fget(self):
-            if not hasattr(self, "_scores"):
-                u, s, v, tot_var = pcaengine(self.xw, self.amax)
-                self._scores = u
-                self.singvals = s
-                self.loadings = v
-                self.tot_var = tot_var
-            return self._scores[:,:self.amax]
+            if not hasattr(self, "_core_scores"):
+                result= pca(self.xw, self.amax)
+                self._core_scores = result['T']
+                self.singvals = result['eigvals']
+                self.loadings = result['P']
+                self.tot_var = 120.
+            return self._core_scores[:,:self.amax]
         def fset(self, t):
-            self._scores = t
+            self._core_scores = t
         def fdel(self):
-            del self._scores
-        return locals()  # credit: David Niergarth
+            del self._core_scores
+        return locals()
     scores = property(**scores())
     
     def loadings():
         doc = "pca loadings"
         def fget(self):
             if not hasattr(self, "_loadings"):
-                u, s, v, tot_var = pcaengine(self.xw, self.amax)
-                self._loadings = v
-                self.scores = u
-                self.singvals = s
-                self.tot_var = tot_var
+                result = pca(self.xw, self.amax)
+                self.loadings = result['P']
+                self.scores = result['T']
+                self.singvals = result['eigvals']
+                self.tot_var = 120
             return self._loadings[:,:self.amax]
         def fdel(self):
             del self._loadings
@@ -113,11 +116,11 @@ class PCA(Model):
         doc = "Singular values"
         def fget(self):
             if not hasattr(self, "_singvals"):
-                u, s, v, tot_var = pcaengine(self.xw, self.amax)
-                self._singvals = s
-                self.scores = u
-                self.loadings = v
-                self.tot_var = tot_var
+                result = pca(self.xw, self.amax)
+                self._singvals = result['eigvals']
+                self.scores = result['T']
+                self.loadings = result['P']
+                self.tot_var = 120
             return self._singvals[:self.amax]
         def fset(self, w):
             self._singvals = w
@@ -139,7 +142,7 @@ class PCA(Model):
         doc = "column means"
         def fget(self):
             if not hasattr(self, "_xadd"):
-                self._xadd = center(self.x, axis=0)
+                self._xadd = mean_center(self.x, axis=0)
             return self._xadd
         def fset(self, mnx):
             if hasattr(self, "_xc"):
@@ -153,7 +156,7 @@ class PCA(Model):
     xadd = property(**xadd())
     
     def xc():
-        doc = "centered input data"
+        doc = "mean_centered input data"
         def fget(self):
             if not hasattr(self, "_xc"):
                  self._xc = self.x + self.xadd
@@ -161,7 +164,10 @@ class PCA(Model):
         def fset(self, xc):
             self._xc = xc
         def fdel(self):
-            del self._xc
+            print "a"
+            if hasattr(self, "_xc"):
+                print "del"
+                del self._xc
         return locals()
     xc = property(**xc())
 
@@ -237,7 +243,7 @@ class PCA(Model):
         def fdel(self):
             del self._row_metric
             if hasattr(self, "_xd"):
-                del self.xd
+                del self._xd
         return locals()
     row_metric = property(**row_metric())
 
@@ -254,7 +260,7 @@ class PCA(Model):
         def fdel(self):
             del self._column_metric
             if hasattr(self, "_xd"):
-                del self.xd
+                del self._xd
         return locals()
     column_metric = property(**column_metric())
     
@@ -273,10 +279,12 @@ class PCA(Model):
     def delete_rows(self, index):
         pass
 
-    def reweight(self, )
-    
+    def reweight(self, w):
+        pass
     
+        
 if __name__ == "__main__":
-     X = random.rand(4,10)
-     pcaobj = PCA(X)
-     print "explained variance" + str(pcaobj.explained_variance)
+    from numpy.random import rand
+    X = rand(4,10)
+    pcaobj = PCA(X)
+    print "explained variance" + str(pcaobj.explained_variance)
diff --git a/pyblm/statistics.py b/pyblm/statistics.py
index 9253180..6e7cbfe 100644
--- a/pyblm/statistics.py
+++ b/pyblm/statistics.py
@@ -115,8 +115,8 @@ def procrustes(a, b, strict=True, center=False, verbose=False):
 
     *Reference*:
 
-        Schonemann, A generalized solution of the orthogonal Procrustes problem,
-        Psychometrika, 1966 
+        Schonemann, A generalized solution of the orthogonal Procrustes
+        problem, Psychometrika, 1966 
     """
     
     if center:
@@ -131,9 +131,9 @@ def procrustes(a, b, strict=True, center=False, verbose=False):
        Cm = _ensure_strict(Cm)
     b_rot = dot(b, Cm)
     if verbose:
-        print Cm.round()
-        fit = sum(ravel(b - b_rot)**2)
-        print "Error: %.3E" %fit
+        fit = ((b - b_rot)**2).sum()
+        fit2 = (dot(a, a.T) + dot(b, b.T) - 2*diag(s)).trace()
+        print "Error: %.2E ,  %.2E" %(fit, fit2)
     if center:
         return mn_b + b_rot
     else:
@@ -159,7 +159,9 @@ def _ensure_strict(C, only_flips=True):
     
     *Notes*:
     
-        This function is not ready for use. Use (only_flips=True)
+        This function is not ready for general use; call it with
+        only_flips=True. With more than two components the rotation
+        matrix tends to become unstable (det(Cm) > 1) when rounding is used.
     
     """
     if only_flips:
@@ -279,6 +281,16 @@ def _fdr(tsq, tsqp, loc_method=median):
         fdr : {array}
             False discovery rate
 
+    *Notes*:
+    
+    This is an internal function for use in FDR estimation of
+    jackknife-perturbed blm parameters.
+
+
+    *Reference*:
+        Gidskehaug et al., A framework for significance analysis of
+        gene expression data using dimension reduction methods, BMC
+        bioinformatics, 2007
     """
     n, = tsq.shape
     k, m = tsqp.shape