A few updates

commit 2951ca4088 (parent 902806c1d8)
@@ -12,7 +12,7 @@ from numpy.random import shuffle
 from engines import nipals_lpls as lpls


-def lpls_val(X, Y, Z, a_max=2, nsets=None, alpha=.5, mean_ctr=[2,0,2], verbose=True):
+def lpls_val(X, Y, Z, a_max=2, nsets=None, alpha=.5, mean_ctr=[2,0,2], zorth=False, verbose=True):
     """Performs crossvalidation for generalisation error in lpls.

     The L-PLS crossvalidation is estimated just like an ordinary pls
@@ -42,6 +42,8 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None, alpha=.5, mean_ctr=[2,0,2], verbose=True):
         0 : row center
         1 : column center
         2 : double center
+    zorth : {boolean}
+        If True, require orthogonal latent components in Z.
     verbose : {boolean}, optional
         Verbosity of console output. For use in debugging.

@@ -70,7 +72,11 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None, alpha=.5, mean_ctr=[2,0,2], verbose=True):

     Yhat = empty((a_max, k, l), 'd')
     for cal, val in cv(nsets, k):
-        dat = lpls(X[cal],Y[cal],Z,a_max=a_max,alpha=alpha,mean_ctr=mean_ctr,verbose=verbose)
+        # fit the training model
+        dat = lpls(X[cal], Y[cal], Z, a_max=a_max, alpha=alpha,
+                   mean_ctr=mean_ctr, zorth=zorth, verbose=verbose)
+
+        # center the test data
         if mean_ctr[0] != 1:
             xi = X[val,:] - dat['mnx']
         else:
@@ -79,14 +85,24 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None, alpha=.5, mean_ctr=[2,0,2], verbose=True):
             ym = dat['mny']
         else:
             ym = Y[val].mean(1)[:,newaxis] #???: check this
+        # predictions
         for a in range(a_max):
             Yhat[a,val,:] = atleast_2d(ym + dot(xi, dat['B'][a]))
+    #if permute:
+    #    xcal = X[cal]
+    #    for a in range(1, a_max, 1):
+    #        for n in range(10):
+    #            shuffle(cal)
+    #            dat = lpls(xcal, Y[cal], Z, a_max=a_max, alpha=alpha,
+    #                       mean_ctr=mean_ctr, verbose=verbose)


     # todo: need better support for classification error
     y_is_class = Y.dtype.char.lower() in ['i', 'p', 'b', 'h', '?']
     if y_is_class:
-        pass
-        #Yhat, err = class_error(Yhat, Y)
-        #return Yhat, err
+        Yhat, err = class_error(Yhat, Y)
+        return Yhat, err

     sep = (Y - Yhat)**2
     rmsep = sqrt(sep.mean(1)).T
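Note: the RMSEP step at the end of this hunk reduces the (a_max, k, l) prediction cube to one error curve per response. A standalone sketch with toy shapes (hypothetical names, not the library call):

    import numpy as np

    a_max, k, l = 2, 6, 3                # components, samples, responses
    Y = np.random.rand(k, l)             # observed responses
    Yhat = np.random.rand(a_max, k, l)   # stand-in for the CV predictions

    sep = (Y - Yhat)**2                  # squared errors; Y broadcasts over a_max
    rmsep = np.sqrt(sep.mean(1)).T       # average over samples -> (l, a_max)
    print(rmsep.shape)                   # one RMSEP curve per response column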
@@ -317,8 +333,8 @@ def cv(N, K, randomise=True, sequential=False):
     otherwise interleaved ordering is used.

     """
-    if K>N:
-        raise ValueError, "You cannot divide a list of %d samples into more than %d segments. You tried: %s" %(N, N, K)
+    if N>K:
+        raise ValueError, "You cannot divide a list of %d samples into more than %d segments. You tried: %s" %(K, K, N)
     index = xrange(N)
     if randomise:
         from random import shuffle
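Note: cv() is called above as cv(nsets, k), i.e. (segments, samples), which is why the corrected guard reads N>K. A minimal interleaved K-fold splitter in the same spirit (standalone sketch, not the library function):

    import numpy as np

    def kfold_indices(n_samples, n_segments):
        index = np.arange(n_samples)
        for seg in range(n_segments):
            val = index[seg::n_segments]       # interleaved validation segment
            cal = np.setdiff1d(index, val)     # remaining calibration samples
            yield cal, val

    for cal, val in kfold_indices(7, 3):
        print(cal, val)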
@@ -371,7 +387,7 @@ def class_error(Yhat, Y, method='vanilla'):
     Yhat_c = zeros((k, l), dtype='d')
     for a in range(a_opt):
         for i in range(k):
-            Yhat_c[a,val,argmax(Yhat[a,val,:])] = 1.0
+            Yhat_c[a, val, argmax(Yhat[a,val,:])] = 1.0
     err = 100*((Yhat_c + Y) == 2).sum(1)/Y.sum(0).astype('d')

     return Yhat_c, err
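Note: class_error() dichotomizes continuous predictions winner-takes-all before scoring. A self-contained sketch of that step for a single component (hypothetical data):

    import numpy as np

    Y = np.array([[1., 0.], [0., 1.], [1., 0.]])      # true class indicators
    Yhat = np.array([[.8, .2], [.4, .6], [.3, .7]])   # continuous predictions

    Yhat_c = np.zeros_like(Y)
    Yhat_c[np.arange(len(Y)), Yhat.argmax(1)] = 1.0   # 1.0 at the argmax class

    # percent correct per class: positions where truth and prediction are both 1
    err = 100 * ((Yhat_c + Y) == 2).sum(0) / Y.sum(0)
    print(Yhat_c, err)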
@@ -411,7 +411,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=-1):
            'evx': expvarx, 'evy': expvary, 'ssqx': ssqx, 'ssqy': ssqy,
            'leverage': leverage, 'mnx': mnx, 'mny': mny}

-def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=False):
+def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 2], scale='scores', zorth=False, verbose=False):
     """ L-shaped Partial Least Squares Regression by the nipals algorithm.

     An L-shaped low rank model approximates three matrices in a hyploid
@@ -475,10 +475,14 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=False):

     scale : {'scores', 'loads'}, optional
         Option to decide on where the scale goes.
+    zorth : {False, boolean}, optional
+        Option to force orthogonality between latent components
+        in Z.
     verbose : {boolean}, optional
         Verbosity of console output. For use in debugging.

     *References*

     Saeboe et al., LPLS-regression: a method for improved prediction and
     classification through inclusion of background information on
     predictor variables, J. of chemometrics and intell. laboratory syst.
@@ -522,18 +526,22 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=False):
     var_y = empty((a_max,))
     var_z = empty((a_max,))

-    MAX_ITER = 450
+    MAX_ITER = 4500
     LIM = finfo(X.dtype).resolution
     is_rd = False
     for a in range(a_max):
         if verbose:
             print "\nWorking on comp. %s" %a
         u = F[:,:1]
         w = E[:1,:].T
+        l = G[:,:1]
         diff = 1
         niter = 0
         while (diff>LIM and niter<MAX_ITER):
             niter += 1
             u1 = u.copy()
             w1 = w.copy()
+            l1 = l.copy()
             w = dot(E.T, u)
             wn = msqrt(dot(w.T, w))
             if wn < LIM:
@@ -552,20 +560,25 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=False):
             c = dot(F.T, t)
             c = c/msqrt(dot(c.T, c))
             u = dot(F, c)
-            diff = dot((u-u1).T, (u-u1))
+            diff = dot((u - u1).T, (u - u1))
         if verbose:
             print "Converged after %s iterations" %niter
+            if niter == MAX_ITER:
+                print "Maximum number of iterations reached!"
+                print "Iterations: %d " %niter
+                print "Error: %.2E" %diff

         if is_rd:
             print "Hei og haa ... rank deficient, this should really not happen"
             break

         tt = dot(t.T, t)
-        p = dot(X.T, t)/tt
-        q = dot(Y.T, t)/tt
-        l = dot(Z, w)
-        #k = dot(Z.T, l)/dot(l.T, l)
+        p = dot(E.T, t)/tt
+        q = dot(F.T, t)/tt
+        if zorth:
+            k = dot(G.T, l)/dot(l.T, l)
+        else:
+            k = w
+            l = dot(G, w)

         U[:,a] = u.ravel()
         W[:,a] = w.ravel()
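Note: the zorth branch makes the Z-components orthogonal by construction: k = G'l/(l'l) is the least-squares regression of G on l, so the deflation below leaves G orthogonal to l, and every later l drawn from the deflated G is orthogonal to it. A standalone check (hypothetical data):

    import numpy as np

    rng = np.random.default_rng(0)
    G = rng.standard_normal((8, 5))
    w1 = rng.standard_normal((5, 1))

    l1 = G @ w1                         # first latent component in Z-space
    k1 = G.T @ l1 / (l1.T @ l1)         # zorth=True loading
    G1 = G - l1 @ k1.T                  # rank-one deflation

    l2 = G1 @ rng.standard_normal((5, 1))
    print((l1.T @ l2).item())           # ~0: orthogonal by construction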
@@ -575,10 +588,10 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=False):
         L[:,a] = l.ravel()
         K[:,a] = k.ravel()

-
+        # rank-one deflations
         E = E - dot(t, p.T)
         F = F - dot(t, q.T)
-        G = (G.T - dot(k, l.T)).T
+        G = G - dot(l, k.T)

         var_x[a] = pow(E, 2).sum()
         var_y[a] = pow(F, 2).sum()
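Note: the G-deflation rewrite is behaviour-preserving, since (G' - k l')' = G - l k'. A quick check (hypothetical shapes):

    import numpy as np

    rng = np.random.default_rng(1)
    G = rng.standard_normal((6, 4))
    l = rng.standard_normal((6, 1))     # score-like vector (rows of G)
    k = rng.standard_normal((4, 1))     # loading-like vector (columns of G)

    old = (G.T - k @ l.T).T             # form removed in this hunk
    new = G - l @ k.T                   # form added in this hunk
    print(np.allclose(old, new))        # True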
@@ -1,6 +1,6 @@
 """Bilinear models"""

-from numpy import expand_dims
+from numpy import expand_dims, ones

 from engines import pca

@@ -14,8 +14,11 @@ def scale(x, axis=0):
     #scale = 1./x.std(axis)
     return expand_dims(scale, axis)


 class Model(object):
-    def __init__(name="johndoe"):
+    """All underscored attributes are properties.
+    """
+    def __init__(self, name="johndoe"):
         self.name = name
         self.options = {}

@@ -27,8 +30,8 @@ class Model(object):

     def clear(self):
         for param in self.__dict__.keys():
-            if param.startswith("_") and param[1]!="_":
-                exec "del self." + param
+            if param.startswith("_") and param[1:5]!="core":
+                exec "del self." + param[1:]

     def clear_core(self):
         for param in self.__dict__.keys():
@@ -78,29 +81,29 @@ class PCA(Model):
     def scores():
         doc = "pca scores"
         def fget(self):
-            if not hasattr(self, "_scores"):
-                u, s, v, tot_var = pcaengine(self.xw, self.amax)
-                self._scores = u
-                self.singvals = s
-                self.loadings = v
-                self.tot_var = tot_var
-            return self._scores[:,:self.amax]
+            if not hasattr(self, "_core_scores"):
+                result = pca(self.xw, self.amax)
+                self._core_scores = result['T']
+                self.singvals = result['eigvals']
+                self.loadings = result['P']
+                self.tot_var = 120.
+            return self._core_scores[:,:self.amax]
         def fset(self, t):
-            self._scores = t
+            self._core_scores = t
         def fdel(self):
-            del self._scores
-        return locals() # credit: David Niergarth
+            del self._core_scores
+        return locals()
     scores = property(**scores())
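Note: scores() here is not a method; it runs once at class-body time and returns its locals() (doc, fget, fset, fdel), which property(**...) unpacks. A minimal standalone illustration of the idiom:

    class Example(object):
        def value():
            doc = "lazily computed value"
            def fget(self):
                if not hasattr(self, "_value"):
                    self._value = 42     # stand-in for an expensive computation
                return self._value
            def fset(self, v):
                self._value = v
            def fdel(self):
                del self._value
            return locals()              # {'doc': ..., 'fget': ..., 'fset': ..., 'fdel': ...}
        value = property(**value())

    e = Example()
    print(e.value)   # 42, computed then cached
    e.value = 7
    print(e.value)   # 7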

     def loadings():
         doc = "pca loadings"
         def fget(self):
             if not hasattr(self, "_loadings"):
-                u, s, v, tot_var = pcaengine(self.xw, self.amax)
-                self._loadings = v
-                self.scores = u
-                self.singvals = s
-                self.tot_var = tot_var
+                result = pca(self.xw, self.amax)
+                self.loadings = result['P']
+                self.scores = result['T']
+                self.singvals = result['eigvals']
+                self.tot_var = 120
             return self._loadings[:,:self.amax]
         def fdel(self):
             del self._loadings
@@ -113,11 +116,11 @@ class PCA(Model):
         doc = "Singular values"
         def fget(self):
             if not hasattr(self, "_singvals"):
-                u, s, v, tot_var = pcaengine(self.xw, self.amax)
-                self._singvals = s
-                self.scores = u
-                self.loadings = v
-                self.tot_var = tot_var
+                result = pca(self.xw, self.amax)
+                self._singvals = result['eigvals']
+                self.scores = result['T']
+                self.loadings = result['P']
+                self.tot_var = 120
             return self._singvals[:self.amax]
         def fset(self, w):
             self._singvals = w
@@ -139,7 +142,7 @@ class PCA(Model):
         doc = "column means"
         def fget(self):
             if not hasattr(self, "_xadd"):
-                self._xadd = center(self.x, axis=0)
+                self._xadd = mean_center(self.x, axis=0)
             return self._xadd
         def fset(self, mnx):
             if hasattr(self, "_xc"):
@@ -153,7 +156,7 @@ class PCA(Model):
     xadd = property(**xadd())

     def xc():
-        doc = "centered input data"
+        doc = "mean_centered input data"
         def fget(self):
             if not hasattr(self, "_xc"):
                 self._xc = self.x + self.xadd
@@ -161,6 +164,9 @@ class PCA(Model):
         def fset(self, xc):
             self._xc = xc
         def fdel(self):
-            del self._xc
+            print "a"
+            if hasattr(self, "_xc"):
+                print "del"
+                del self._xc
         return locals()
     xc = property(**xc())
@@ -237,7 +243,7 @@ class PCA(Model):
         def fdel(self):
             del self._row_metric
             if hasattr(self, "_xd"):
-                del self.xd
+                del self._xd
         return locals()
     row_metric = property(**row_metric())

@@ -254,7 +260,7 @@ class PCA(Model):
         def fdel(self):
             del self._column_metric
             if hasattr(self, "_xd"):
-                del self.xd
+                del self._xd
         return locals()
     column_metric = property(**column_metric())

@@ -273,10 +279,12 @@ class PCA(Model):
     def delete_rows(self, index):
         pass

-    def reweight(self, )
+    def reweight(self, w):
         pass


 if __name__ == "__main__":
-    X = random.rand(4,10)
+    from numpy.random import rand
+    X = rand(4,10)
     pcaobj = PCA(X)
     print "explained variance" + str(pcaobj.explained_variance)
@@ -115,8 +115,8 @@ def procrustes(a, b, strict=True, center=False, verbose=False):

     *Reference*:

-    Schonemann, A generalized solution of the orthogonal Procrustes problem,
-    Psychometrika, 1966
+    Schonemann, A generalized solution of the orthogonal Procrustes
+    problem, Psychometrika, 1966
     """

     if center:
@@ -131,9 +131,9 @@ def procrustes(a, b, strict=True, center=False, verbose=False):
         Cm = _ensure_strict(Cm)
     b_rot = dot(b, Cm)
     if verbose:
         print Cm.round()
-        fit = sum(ravel(b - b_rot)**2)
-        print "Error: %.3E" %fit
+        fit = ((b - b_rot)**2).sum()
+        fit2 = (dot(a, a.T) + dot(b, b.T) - 2*diag(s)).trace()
+        print "Error: %.2E , %.2E" %(fit, fit2)
     if center:
         return mn_b + b_rot
     else:
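Note: the new fit2 line is the closed-form Procrustes residual: with b'a = u s v', the optimal rotation is Cm = u v' and ||a - b Cm||_F^2 = tr(aa') + tr(bb') - 2*sum(s). A standalone check (hypothetical data; the direct residual is taken against a, which is what the identity describes):

    import numpy as np

    rng = np.random.default_rng(2)
    a = rng.standard_normal((10, 3))
    R, _ = np.linalg.qr(rng.standard_normal((3, 3)))    # random orthogonal matrix
    b = a @ R.T + 0.01 * rng.standard_normal((10, 3))   # rotated, noisy copy of a

    u, s, vt = np.linalg.svd(b.T @ a)
    Cm = u @ vt                                         # optimal rotation
    b_rot = b @ Cm

    fit = ((a - b_rot)**2).sum()                        # direct residual
    fit2 = np.trace(a @ a.T) + np.trace(b @ b.T) - 2 * s.sum()
    print(np.allclose(fit, fit2))                       # True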
@@ -159,7 +159,9 @@ def _ensure_strict(C, only_flips=True):

     *Notes*:

-    This function is not ready for use. Use (only_flips=True)
+    This function is not ready for use. Use (only_flips=True).
+    That is, for more than two components, the rotation matrix
+    has a tendency to be unstable (det(Cm)>1), when rounding is used.

     """
     if only_flips:
@@ -279,6 +281,16 @@ def _fdr(tsq, tsqp, loc_method=median):
     fdr : {array}
         False discovery rate

+    *Notes*:
+
+    This is an internal function for use in fdr estimation of jack-knifed
+    perturbed blm parameters.
+
+    *Reference*:
+
+    Gidskehaug et al., A framework for significance analysis of
+    gene expression data using dimension reduction methods, BMC
+    bioinformatics, 2007
+
     """
     n, = tsq.shape
     k, m = tsqp.shape
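Note: on the assumed semantics of these shapes, tsq holding n observed statistics and tsqp holding the same statistics under k perturbations, a permutation FDR takes the location (here the median, matching loc_method) of null exceedances over observed exceedances. A minimal sketch (hypothetical data, not the library routine):

    import numpy as np

    rng = np.random.default_rng(3)
    n, k = 50, 20
    tsq = rng.chisquare(2, size=n)
    tsq[:5] += 10.0                        # five strong effects
    tsqp = rng.chisquare(2, size=(k, n))   # null statistics per perturbation

    fdr = np.empty(n)
    for i, t in enumerate(np.sort(tsq)[::-1]):
        false_pos = np.median((tsqp >= t).sum(1))    # typical null exceedances
        fdr[i] = false_pos / max((tsq >= t).sum(), 1)
    print(fdr[:5].round(3))                # small FDR for the strong effects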