A few updates

2007-11-26 15:30:52 +00:00
parent 902806c1d8
commit 2951ca4088
4 changed files with 108 additions and 59 deletions
--- a/pyblm/crossvalidation.py
+++ b/pyblm/crossvalidation.py
@@ -12,7 +12,7 @@ from numpy.random import shuffle
 from engines import nipals_lpls as lpls
-def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2],verbose=True):
+def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2], zorth=False, verbose=True):
    """Performs crossvalidation for generalisation error in lpls.
    The L-PLS crossvalidation is estimated just like an ordinary pls
@@ -42,6 +42,8 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2],verbose=Tru
            0 : row center
            1 : column center
            2 : double center
        zorth : {boolean}
            If True, require orthogonal latent components in Z.
        verbose : {boolean}, optional
            Verbosity of console output. For use in debugging.
@@ -70,7 +72,11 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2],verbose=Tru
    Yhat = empty((a_max, k, l), 'd')
    for cal, val in cv(nsets, k):
-        dat = lpls(X[cal],Y[cal],Z,a_max=a_max,alpha=alpha,mean_ctr=mean_ctr,verbose=verbose)
+        # do the training model
        dat = lpls(X[cal], Y[cal], Z, a_max=a_max, alpha=alpha,
                   mean_ctr=mean_ctr, zorth=zorth, verbose=verbose)
        # center test data
        if mean_ctr[0] != 1:
            xi = X[val,:] - dat['mnx']
        else:
@@ -79,14 +85,24 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2],verbose=Tru
            ym = dat['mny']
        else:
            ym = Y[val].mean(1)[:,newaxis] #???: check this
        # predictions
        for a in range(a_max):
            Yhat[a,val,:] = atleast_2d(ym + dot(xi, dat['B'][a]))
        #if permute:
        #    xcal = X[cal]
        #    for a in range(1,a_max,1):
        #        for n in range(10):
        #            shuffle(cal)
        #            dat = lpls(xcal, Y[cal], Z, a_max=a_max, alpha=alpha,
        #                       mean_ctr=mean_ctr, verbose=verbose)
    # todo: need a better support for classification error
    y_is_class = Y.dtype.char.lower() in ['i','p', 'b', 'h','?']
    if y_is_class:
-        Yhat, err = class_error(Yhat,Y)
+        pass
-        return Yhat, err
+        #Yhat, err = class_error(Yhat, Y)
        #return Yhat, err
    sep = (Y - Yhat)**2
    rmsep = sqrt(sep.mean(1)).T
@@ -317,8 +333,8 @@ def cv(N, K, randomise=True, sequential=False):
        otherwise interleaved ordering is used.
    """
-    if K>N:
+    if N>K:
-        raise ValueError, "You cannot divide a list of %d samples into more than %d segments. Yout tried: %s" %(N,N,K)
+        raise ValueError, "You cannot divide a list of %d samples into more than %d segments. Yout tried: %s" %(K, K, N)
    index = xrange(N)
    if randomise:
        from random import shuffle
--- a/pyblm/engines.py
+++ b/pyblm/engines.py
@@ -411,7 +411,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=-1):
            'evx': expvarx, 'evy': expvary, 'ssqx': ssqx, 'ssqy': ssqy,
            'leverage': leverage, 'mnx': mnx, 'mny': mny}
-def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=False):
+def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 2], scale='scores', zorth = False, verbose=False):
    """ L-shaped Partial Least Sqaures Regression by the nipals algorithm.
    An L-shaped low rank model approximates three matrices in a hyploid
@@ -475,10 +475,14 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', ve
        scale : {'scores', 'loads'}, optional
            Option to decide on where the scale goes.
        zorth : {False, boolean}, optional
            Option to force orthogonality between latent components
            in Z
        verbose : {boolean}, optional
            Verbosity of console output. For use in debugging.
    *References*
        Saeboe et al., LPLS-regression: a method for improved prediction and
        classification through inclusion of background information on
        predictor variables, J. of chemometrics and intell. laboratory syst.
@@ -522,18 +526,22 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', ve
    var_y = empty((a_max,))
    var_z = empty((a_max,))
-    MAX_ITER = 450
+    MAX_ITER = 4500
    LIM = finfo(X.dtype).resolution
    is_rd = False
    for a in range(a_max):
        if verbose:
            print "\nWorking on comp. %s" %a
        u = F[:,:1]
        w = E[:1,:].T
        l = G[:,:1]
        diff = 1
        niter = 0
        while (diff>LIM and niter<MAX_ITER):
            niter += 1
            u1 = u.copy()
            w1 = w.copy()
            l1 = l.copy()
            w = dot(E.T, u)
            wn = msqrt(dot(w.T, w))
            if wn < LIM:
@@ -553,19 +561,24 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', ve
            c = c/msqrt(dot(c.T, c))
            u = dot(F, c)
            diff = dot((u - u1).T, (u - u1))
        if verbose:
-            print "Converged after %s iterations" %niter
+            if niter==MAX_ITER:
                print "Maximum nunber of iterations reached!"
            print "Iterations: %d " %niter
            print "Error: %.2E" %diff
        if is_rd:
            print "Hei og haa ... rank deficient, this should really not happen"
            break
        tt = dot(t.T, t)
-        p = dot(X.T, t)/tt
+        p = dot(E.T, t)/tt
-        q = dot(Y.T, t)/tt
+        q = dot(F.T, t)/tt
-        l = dot(Z, w)
+        if zorth:
-        #k = dot(Z.T, l)/dot(l.T, l)
+            k = dot(G.T, l)/dot(l.T, l)
        else:
            k = w
            l = dot(G, w)
        U[:,a] = u.ravel()
        W[:,a] = w.ravel()
@@ -575,10 +588,10 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', ve
        L[:,a] = l.ravel()
        K[:,a] = k.ravel()
-        
+        # rank-one deflations
        E = E - dot(t, p.T)
        F = F - dot(t, q.T)
-        G = (G.T - dot(k, l.T)).T
+        G = G - dot(l, k.T)
        var_x[a] = pow(E, 2).sum()
        var_y[a] = pow(F, 2).sum()
--- a/pyblm/models.py
+++ b/pyblm/models.py
@@ -1,6 +1,6 @@
 """Bilinear models"""
-from numpy import expand_dims
+from numpy import expand_dims,ones
 from engines import pca
@@ -14,8 +14,11 @@ def scale(x, axis=0):
    #scale = 1./x.std(axis)
    return expand_dims(scale, axis)
 class Model(object):
-    def __init__(name="johndoe"):
+    """All underscored attributes are properties.
    """
    def __init__(self, name="johndoe"):
        self.name = name
        self.options = {}
@@ -27,8 +30,8 @@ class Model(object):
    def clear(self):
        for param in self.__dict__.keys():
-            if param.startswith("_") and param[1]!="_":
+            if param.startswith("_") and param[1:5]!="core":
-                exec "del self." + param
+                exec "del self." + param[1:]
    def clear_core(self):
        for param in self.__dict__.keys():
@@ -78,29 +81,29 @@ class PCA(Model):
    def scores():
        doc = "pca scores"
        def fget(self):
-            if not hasattr(self, "_scores"):
+            if not hasattr(self, "_core_scores"):
-                u, s, v, tot_var = pcaengine(self.xw, self.amax)
+                result= pca(self.xw, self.amax)
-                self._scores = u
+                self._core_scores = result['T']
-                self.singvals = s
+                self.singvals = result['eigvals']
-                self.loadings = v
+                self.loadings = result['P']
-                self.tot_var = tot_var
+                self.tot_var = 120.
-            return self._scores[:,:self.amax]
+            return self._core_scores[:,:self.amax]
        def fset(self, t):
-            self._scores = t
+            self._core_scores = t
        def fdel(self):
-            del self._scores
+            del self._core_scores
-        return locals()  # credit: David Niergarth
+        return locals()
    scores = property(**scores())
    def loadings():
        doc = "pca loadings"
        def fget(self):
            if not hasattr(self, "_loadings"):
-                u, s, v, tot_var = pcaengine(self.xw, self.amax)
+                result = pca(self.xw, self.amax)
-                self._loadings = v
+                self.loadings = result['P']
-                self.scores = u
+                self.scores = result['T']
-                self.singvals = s
+                self.singvals = result['eigvals']
-                self.tot_var = tot_var
+                self.tot_var = 120
            return self._loadings[:,:self.amax]
        def fdel(self):
            del self._loadings
@@ -113,11 +116,11 @@ class PCA(Model):
        doc = "Singular values"
        def fget(self):
            if not hasattr(self, "_singvals"):
-                u, s, v, tot_var = pcaengine(self.xw, self.amax)
+                result = pca(self.xw, self.amax)
-                self._singvals = s
+                self._singvals = result['eigvals']
-                self.scores = u
+                self.scores = result['T']
-                self.loadings = v
+                self.loadings = result['P']
-                self.tot_var = tot_var
+                self.tot_var = 120
            return self._singvals[:self.amax]
        def fset(self, w):
            self._singvals = w
@@ -139,7 +142,7 @@ class PCA(Model):
        doc = "column means"
        def fget(self):
            if not hasattr(self, "_xadd"):
-                self._xadd = center(self.x, axis=0)
+                self._xadd = mean_center(self.x, axis=0)
            return self._xadd
        def fset(self, mnx):
            if hasattr(self, "_xc"):
@@ -153,7 +156,7 @@ class PCA(Model):
    xadd = property(**xadd())
    def xc():
-        doc = "centered input data"
+        doc = "mean_centered input data"
        def fget(self):
            if not hasattr(self, "_xc"):
                 self._xc = self.x + self.xadd
@@ -161,6 +164,9 @@ class PCA(Model):
        def fset(self, xc):
            self._xc = xc
        def fdel(self):
            print "a"
            if hasattr(self, "_xc"):
                print "del"
                del self._xc
        return locals()
    xc = property(**xc())
@@ -237,7 +243,7 @@ class PCA(Model):
        def fdel(self):
            del self._row_metric
            if hasattr(self, "_xd"):
-                del self.xd
+                del self._xd
        return locals()
    row_metric = property(**row_metric())
@@ -254,7 +260,7 @@ class PCA(Model):
        def fdel(self):
            del self._column_metric
            if hasattr(self, "_xd"):
-                del self.xd
+                del self._xd
        return locals()
    column_metric = property(**column_metric())
@@ -273,10 +279,12 @@ class PCA(Model):
    def delete_rows(self, index):
        pass
-    def reweight(self, )
+    def reweight(self, w):
        pass
 if __name__ == "__main__":
-     X = random.rand(4,10)
+    from numpy.random import rand
    X = rand(4,10)
    pcaobj = PCA(X)
    print "explained variance" + str(pcaobj.explained_variance)
--- a/pyblm/statistics.py
+++ b/pyblm/statistics.py
@@ -115,8 +115,8 @@ def procrustes(a, b, strict=True, center=False, verbose=False):
    *Reference*:
-        Schonemann, A generalized solution of the orthogonal Procrustes problem,
+        Schonemann, A generalized solution of the orthogonal Procrustes
-        Psychometrika, 1966 
+        problem, Psychometrika, 1966 
    """
    if center:
@@ -131,9 +131,9 @@ def procrustes(a, b, strict=True, center=False, verbose=False):
       Cm = _ensure_strict(Cm)
    b_rot = dot(b, Cm)
    if verbose:
-        print Cm.round()
+        fit = ((b - b_rot)**2).sum()
-        fit = sum(ravel(b - b_rot)**2)
+        fit2 = (dot(a, a.T) + dot(b, b.T) - 2*diag(s)).trace()
-        print "Error: %.3E" %fit
+        print "Error: %.2E ,  %.2E" %(fit, fit2)
    if center:
        return mn_b + b_rot
    else:
@@ -159,7 +159,9 @@ def _ensure_strict(C, only_flips=True):
    *Notes*:
-        This function is not ready for use. Use (only_flips=True)
+        This function is not ready for use. Use (only_flips=True).
        That is, for more than two components, the rotation matrix
        has a tendency to be unstable (det(Cm)>1), when rounding is used.
    """
    if only_flips:
@@ -279,6 +281,16 @@ def _fdr(tsq, tsqp, loc_method=median):
        fdr : {array}
            False discovery rate
    *Notes*:
    This is an internal function for use in fdr estimation of jack-knifed
    perturbed blm parameters.
    *Reference*:
        Gidskehaug et al., A framework for significance analysis of
        gene expression data using dimension reduction methods, BMC
        bioinformatics, 2007
    """
    n, = tsq.shape
    k, m = tsqp.shape