Fixed conflicts
commit 253305b602 (parent 1103245d85)
@@ -13,4 +13,3 @@ def test(level=1, verbosity=1):
print 'Python version %s' % (sys.version.replace('\n', '',),)
from numpy.testing import NumpyTest
return NumpyTest().test(level, verbosity)
@@ -155,7 +155,7 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=F
validation scheme.
*Parameters*:
X : {array}
Main data matrix (m, n)
Y : {array}
@@ -180,9 +180,9 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=F
If True, require orthogonal latent components in Z.
verbose : {boolean}, optional
Verbosity of console output. For use in debugging.
*Returns*:
rmsep : {array}
Root mean squared error of prediction
yhat : {array}
@@ -191,19 +191,19 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=F
Estimated value of optimal number of components
"""
m, n = X.shape
k, l = Y.shape
o, p = Z.shape
assert m == k, "X (%d,%d) - Y (%d,%d) dim mismatch" %(m, n, k, l)
assert n == p, "X (%d,%d) - Z (%d,%d) dim mismatch" %(m, n, o, p)
if nsets == None:
nsets = m
nsets = m
if nsets > X.shape[0]:
print "nsets (%d) is larger than number of variables (%d).\nnsets: %d -> %d" %(nsets, m, nsets, m)
nsets = m
assert (alpha >= 0 and alpha<=1), "Alpha needs to be within [0,1], got: %.2f" %alpha
Yhat = empty((a_max, k, l), 'd')
for cal, val in cv(k, nsets):
# do the training model
@@ -217,10 +217,16 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=F
# predictions
for a in range(a_max):
Yhat[a,val,:] = atleast_2d(ym + dot(xi, dat['B'][a]))
# todo: need a better support for classification error
y_is_class = Y.dtype.char.lower() in ['i','p', 'b', 'h','?']
if y_is_class:
pass
#Yhat, err = class_error(Yhat, Y)
#return Yhat, err
sep = (Y - Yhat)**2
rmsep = sqrt(sep.mean(1)).T
#aopt = find_aopt_from_sep(rmsep)
# todo: need a better support for classification error
error = prediction_error(Yhat, Y, method='1/2')
@@ -228,7 +234,7 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=F
def pca_jk(a, aopt, nsets=None, center_axis=[0], method='cv'):
"""Returns jack-knife segments from PCA.
*Parameters*:
a : {array}
@@ -252,9 +258,9 @@ def pca_jk(a, aopt, nsets=None, center_axis=[0], method='cv'):
Loadings collected in a three way matrix (n_segments, m, aopt)
*Notes*:
- Crossvalidation method is currently set to random blocks of samples.
"""
m, n = a.shape
if nsets == None:
@@ -278,14 +284,14 @@ def pca_jk(a, aopt, nsets=None, center_axis=[0], method='cv'):
Pcv[i,:,:] = pca(a[cal,:], aopt, mode='fast', scale='loads', center_axis = center_axis)['P']
else:
raise NotImplementedError(method)
return Pcv
def pls_jk(X, Y, a_opt, nsets=None, center_axis=[0,0], verbose=False):
""" Returns jack-knife segments of W.
*Parameters*:
X : {array}
Main data matrix (m, n)
Y : {array}
@@ -294,7 +300,7 @@ def pls_jk(X, Y, a_opt, nsets=None, center_axis=[0,0], verbose=False):
The number of components to calculate (0, min(m,n))
nsets : (integer), optional
Number of jack-knife segments
center_axis : {boolean}, optional
- -1 : nothing
- 0 : row center
@@ -302,12 +308,12 @@ def pls_jk(X, Y, a_opt, nsets=None, center_axis=[0,0], verbose=False):
- 2 : double center
verbose : {boolean}, optional
Verbosity of console output. For use in debugging.
*Returns*:
Wcv : {array}
Loading-weights jack-knife segments
"""
m, n = X.shape
k, l = Y.shape
@@ -320,7 +326,7 @@ def pls_jk(X, Y, a_opt, nsets=None, center_axis=[0,0], verbose=False):
print "Segment number: %d" %i
dat = pls(X[cal,:], Y[cal,:], a_opt, scale='loads', mode='fast', center_axis=center_axis)
Wcv[i,:,:] = dat['W']
return Wcv
def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, center_axis=[2,0,2], zorth=False, verbose=False):
@@ -332,10 +338,10 @@ def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, center_axis=[2,0,2], zorth=
infer the parameter confidence in the model.
The segments returned are the X-block weights and Z-block weights.
*Parameters*:
X : {array}
Main data matrix (m, n)
Y : {array}
@@ -358,15 +364,15 @@ def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, center_axis=[2,0,2], zorth=
2 : double center
verbose : {boolean}, optional
Verbosity of console output. For use in debugging.
*Returns*:
Wx : {array}
X-block jack-knife segments
Wz : {array}
Z-block jack-knife segments
"""
m, n = X.shape
k, l = Y.shape
o, p = Z.shape
@@ -388,7 +394,7 @@ def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, center_axis=[2,0,2], zorth=
def find_aopt_from_sep(err, method='vanilla'):
"""Returns an estimate of optimal number of components.
The estimate is based on the error of prediction from
crossvalidation. This is pretty much wild guessing and it is
recommended to inspect model parameters and prediction errors
@@ -406,7 +412,7 @@ def find_aopt_from_sep(err, method='vanilla'):
aopt : {integer}
A guess on the optimal number of components
"""
if method == 'vanilla':
# min rmsep
rmsecv = sqrt(err.mean(0))
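A minimal sketch of the 'vanilla' rule shown above (assumptions: err is an (n_segments, a_max) array of squared prediction errors, and the returned count is 1-indexed; this helper is not part of the commit):

import numpy as np

def aopt_vanilla(err):
    # hypothetical helper: smallest cross-validated RMSE wins
    rmsecv = np.sqrt(err.mean(0))       # RMSE per number of components
    return int(rmsecv.argmin()) + 1     # assumed 1-indexed component count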
@@ -434,7 +440,7 @@ def cv(N, K, randomise=True, sequential=False):
of length ~N/K, *without* replacement.
*Parameters*:
N : {integer}
Total number of samples
K : {integer}
@@ -443,7 +449,7 @@ def cv(N, K, randomise=True, sequential=False):
Use random sampling
sequential : {boolean}
Use sequential sampling
*Returns*:
training : {array-like}
@@ -456,12 +462,12 @@ def cv(N, K, randomise=True, sequential=False):
If randomise is true, a copy of index is shuffled before partitioning,
otherwise its order is preserved in training and validation.
Randomise overrides the sequential argument. If randomise is true,
sequential is False.
If sequential is true the index is partitioned in continuous blocks,
otherwise interleaved ordering is used.
"""
if K > N:
raise ValueError, "You cannot divide a list of %d samples into more than %d segments. You tried: %s" %(N, N, K)
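A minimal usage sketch of cv as described above (the import path pyblm.crossvalidation is an assumption; the generator yields (training, validation) index pairs, as in the lpls_val loop earlier in this diff):

from pyblm.crossvalidation import cv   # assumed module path for this file

N, K = 10, 5
for cal, val in cv(N, K, randomise=True):
    # every sample lands in exactly one validation block of length ~N/K
    assert len(cal) + len(val) == N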
@@ -510,6 +516,8 @@ def diag_cv(shape, nsets=9, randomise=True):
except:
raise ValueError("shape needs to be a two-tuple")
if nsets>m or nsets>n:
msg = "You may not use more subsets than max(n_rows, n_cols)"
raise ValueError, msg
msg = "You may not use more subsets than max(n_rows, n_cols)"
nsets = min(m, n)
nm = n*m
@@ -524,7 +532,20 @@ def diag_cv(shape, nsets=9, randomise=True):
validation.update(ind)
#training = [j for j in index if j not in validation]
yield list(validation)
def class_error(y_hat, y, method='vanilla'):
""" Not used.
"""
a_opt, k, l = y_hat.shape
y_hat_c = zeros((k, l), dtype='d')
if method == vanilla:
pass
for a in range(a_opt):
for i in range(k):
y_hat_c[a, val, argmax(y_hat[a,val,:])] = 1.0
err = 100*((y_hat_c + y) == 2).sum(1)/y.sum(0).astype('d')
return y_hat_c, err
def prediction_error(y_hat, y, method='squared'):
"""Loss function on multiclass Y.
@@ -651,7 +672,7 @@ def _wkernel_pls_val(X, Y, a_max, n_blocks=None):
for Din, Doi, Yin, Yout in V:
ym = -sum(Yout, 0)[newaxis]/(1.0*Yin.shape[0])
PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)
dat = w_simpls(Din, Yin, a_max)
Q, U, H = dat['Q'], dat['U'], dat['H']
That = dot(Doi, dot(U, inv(triu(dot(H.T, U))) ))
pyblm/engines.py
@@ -20,7 +20,7 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
extracts orthogonal components of maximum variance.
*Parameters*:
X : {array}
Data measurement matrix, (samples x variables)
aopt : {integer}
@@ -55,21 +55,21 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
mode : {string}, optional
Amount of info retained, [['normal'], 'fast', 'detailed']
*SeeAlso*:
`center` : Data centering
*Notes*
Uses kernel speed-up if m>>n or m<<n.
If residuals turn rank deficient, a lower number of components than given
in input will be used.
*Examples*:
>>> import scipy,engines
>>> a=scipy.asarray([[1,2,3],[2,4,5]])
>>> dat=engines.pca(a, 2)
@@ -77,7 +77,7 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
array([0.,99.8561562, 100.])
"""
m, n = X.shape
max_aopt = min(m, n)
if center_axis != None:
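A hedged continuation of the pca example above, reading the returned dictionary (key names are taken from the return statements later in this file; it is assumed the default mode='normal' includes them, and that the package imports as pyblm):

import numpy
from pyblm import engines   # engines.py, as shown in the file header above

a = numpy.asarray([[1., 2., 3.], [2., 4., 5.]])
dat = engines.pca(a, 2)
T, P = dat['T'], dat['P']   # scores (samples x aopt) and loadings (variables x aopt)
evx = dat['evx']            # cumulative explained variance, the array printed above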
@@ -94,7 +94,7 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
u = u[:,:aopt]
s = s[:aopt]
v = v[:,:aopt]
# ranktest
tol = 1e-10
eff_rank = sum(s > s[0]*tol)
@@ -104,14 +104,14 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
T = T[:,:aopt]
P = v[:,:aopt]
e = s**2
if scale=='loads':
T = T/s
P = P*s
if mode in ['fast', 'f', 'F']:
return {'T':T, 'P':P, 'aopt':aopt, 'mnx': mnx}
if mode in ['detailed', 'd', 'D']:
E = empty((aopt, m, n))
ssq = []
@@ -135,7 +135,7 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
lev = [(1./m)+((T/s)**2).sum(1), (1./n)+(P**2).sum(1)]
# variances
expvarx = r_[0, 100*e.cumsum()/(X*X).sum()]
return {'T': T, 'P': P, 'E': E, 'evx': expvarx, 'leverage': lev, 'ssqx': ssq,
'aopt': aopt, 'eigvals': e, 'mnx': mnx}
@@ -159,20 +159,20 @@ def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=[0, 0]):
keys -- values, T -- scores, P -- loadings, E -- residuals,
levx -- leverages, ssqx -- sum of squares, expvarx -- cumulative
explained variance, aopt -- number of components used
*OtherParameters*:
mode : {string}
Amount of info retained, ('fast', 'normal', 'detailed')
center_axis : {integer}
Center along given axis. If neg.: no centering (-inf,..., matrix modes)
SeeAlso:
- pca : other blm
- pls : other blm
- lpls : other blm
*Notes*
-----
@@ -180,12 +180,12 @@ def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=[0, 0]):
Uses kernel speed-up if m>>n or m<<n.
If residuals turn rank deficient, a lower number of components than given
in input will be used. The number of components used is given in results-dict.
in input will be used. The number of components used is given in results-dict.
Examples
--------
>>> import scipy,engines
>>> a=scipy.asarray([[1,2,3],[2,4,5]])
>>> b=scipy.asarray([[1,1],[2,3]])
@@ -222,9 +222,9 @@ def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=[0, 0]):
F = b - dot(T, Q.T)
sepy = F**2
ssqy = [sepy.sum(0), sepy.sum(1)]
expvary = r_[0, 100*((T**2).sum(0)*(Q**2).sum(0)/(b**2).sum()).cumsum()[:aopt]]
dat.update({'Q': Q, 'F': F, 'evy': expvary, 'ssqy': ssqy, 'mny': mny})
return dat
@@ -245,9 +245,9 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
Which component should get the scale
center_axis : {-1, integer}
Perform centering across given axis, (-1 is no centering)
*Returns*:
T : {array}
X-scores
W : {array}
@@ -280,25 +280,25 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
Sum of squared residuals in Y along each dimension
leverage : {array}
Sample leverages
*OtherParameters*:
mode : ['normal', 'fast', 'detailed'], optional
How much detail to compute
*SeeAlso*:
`center` - data centering
*Notes*
- The output with mode='fast' will only return T and W
- If residuals turn rank deficient, a lower number of components than given in input will be used. The number of components used is given in results.
*Examples*
>>> import numpy, engines
>>> a = numpy.asarray([[1,2,3],[2,4,5]])
>>> b = numpy.asarray([[1,1],[2,3]])
@@ -307,7 +307,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
array([0.,99.8561562, 100.])
"""
m, n = X.shape
try:
k, l = Y.shape
@@ -322,7 +322,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
Y, mny = center(Y, center_axis[1])
min_aopt = min_aopt - 1
assert(aopt > 0 and aopt < min_aopt)
W = empty((n, aopt))
P = empty((n, aopt))
R = empty((n, aopt))
@@ -330,7 +330,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
T = empty((m, aopt))
B = empty((aopt, n, l))
tt = empty((aopt,))
XY = dot(X.T, Y)
for i in range(aopt):
if XY.shape[1] == 1: #pls 1
@@ -345,7 +345,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
# with many samples, many x-vars and many non-orth y-vars (where arpack speed
# shines)
#############
#s, w = arpack.eigen_symmetric(dot(XY, XY.T),k=1, tol=1e-10, maxiter=1000)
#if s[0] == 0:
# print "Arpack did not converge... using svd"
@@ -357,15 +357,15 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
# print "Arpack did not converge... using svd"
q, s, vh = svd(dot(XY.T, XY))
q = q[:,:1]
w = dot(XY, q)
w = w/vnorm(w)
r = w.copy()
if i > 0:
for j in range(0, i, 1):
r = r - dot(P[:,j].T, w)*R[:,j][:,newaxis]
t = dot(X, r)
tt[i] = tti = dot(t.T, t).ravel()
p = dot(X.T, t)/tti
@@ -385,7 +385,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
R[:,i] = r.ravel()
Q[:,i] = q.ravel()
B[i] = dot(R[:,:i+1], Q[:,:i+1].T)
qnorm = apply_along_axis(vnorm, 0, Q)
tnorm = sqrt(tt)
pp = (P**2).sum(0)
@@ -412,13 +412,13 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
sepy = F**2
ssqy = [sepy.sum(0), sepy.sum(1)]
leverage = 1./m + ((T/tnorm)**2).sum(1)
# variances
tp= tt*pp
tq = tt*qnorm*qnorm
expvarx = r_[0, 100*tp/(X*X).sum()]
expvary = r_[0, 100*tq/(Y*Y).sum()]
if scale == 'loads':
T = T/tnorm
W = W*tnorm
@@ -438,7 +438,7 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores',
of these three matrices tries to discover common directions/subspaces.
*Parameters*:
X : {array}
Main data matrix (m, n)
Y : {array}
@@ -457,9 +457,9 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores',
0 : row center
1 : column center
2 : double center
*Returns*:
T : {array}
X-scores
W : {array}
@@ -504,18 +504,18 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores',
Saeboe et al., LPLS-regression: a method for improved prediction and
classification through inclusion of background information on
predictor variables, J. of chemometrics and intell. laboratory syst.
Martens et.al, Regression of a data matrix on descriptors of
both its rows and of its columns via latent variables: L-PLSR,
Computational statistics & data analysis, 2005
"""
m, n = X.shape
k, l = Y.shape
u, o = Z.shape
max_rank = min(m, n)
if center_axis != None:
xctr, yctr, zctr = center_axis
X, mnX = center(X, xctr)
@@ -523,14 +523,14 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores',
Z, mnZ = center(Z, zctr)
max_rank = max_rank -1
assert (a_max > 0 and a_max < max_rank), "Number of comp error:\
tried: %d, max_rank: %d" %(a_max, max_rank)
tried: %d, max_rank: %d" %(a_max, max_rank)
# initial variance
varX = (X**2).sum()
varY = (Y**2).sum()
varZ = (Z**2).sum()
# initialize
# initialize
U = empty((k, a_max))
Q = empty((l, a_max))
T = zeros((m, a_max))
@@ -600,7 +600,7 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores',
else:
k = w
l = dot(G, w)
U[:,a] = u.ravel()
W[:,a] = w.ravel()
P[:,a] = p.ravel()
@@ -617,11 +617,11 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores',
var_x[a] = pow(E, 2).sum()
var_y[a] = pow(F, 2).sum()
var_z[a] = pow(G, 2).sum()
B[a] = dot(dot(W[:,:a+1], inv(dot(P[:,:a+1].T, W[:,:a+1]))), Q[:,:a+1].T)
#b0[a] = mnY - dot(mnX, B[a])
# variance explained
evx = 100.*(1 - var_x/varX)
evy = 100.*(1 - var_y/varY)
@@ -635,8 +635,8 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores',
knorm = apply_along_axis(vnorm, 0, K)
L = L*knorm
K = K/knorm
return {'T':T, 'W':W, 'P':P, 'Q':Q, 'U':U, 'L':L, 'K':K, 'B':B, 'E': E, 'F': F, 'G': G, 'evx':evx, 'evy':evy, 'evz':evz,'mnx': mnX, 'mny': mnY, 'mnz': mnZ}
return {'T':T, 'W':W, 'P':P, 'Q':Q, 'U':U, 'L':L, 'K':K, 'B':B, 'E': E, 'F': F, 'G': G, 'evx':evx, 'evy':evy, 'evz':evz,'mnx': mnX, 'mny': mnY, 'mnz': mnZ}
def lpls_predict(model_dict, x, aopt):
"""Predict lpls responses from existing model on new data.
@@ -646,25 +646,25 @@ def lpls_predict(model_dict, x, aopt):
except:
x = atleast_2d(x.shape)
m, n = x.shape
if 'B0' in model_dict.keys():
y = model_dict['B0'] + dot()
def vnorm(a):
"""Returns the norm of a vector.
*Parameters*:
a : {array}
Input data, 1-dim, or column vector (m, 1)
*Returns*:
a_norm : {array}
Norm of input vector
"""
return msqrt(dot(a.T,a))
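vnorm above computes the Euclidean norm through a dot product; a quick numpy check of the identity ||a|| = sqrt(a.T a):

import numpy as np

a = np.array([3.0, 4.0])
assert np.allclose(np.sqrt(np.dot(a, a)), np.linalg.norm(a))   # both give 5.0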
@@ -676,7 +676,7 @@ def center(a, axis):
a : {array}
Input data
axis : {integer}
Which centering to perform.
Which centering to perform.
0 = col center, 1 = row center, 2 = double center
-1 = nothing
@@ -707,16 +707,16 @@ def center(a, axis):
else:
mn = a.mean()*ones(a.shape)
return a - mn, mn
if axis == -1:
mn = zeros((1,a.shape[1],))
mn = tile(mn, (a.shape[0], 1))
#mn = tile(mn, (a.shape[0], 1))
elif axis == 0:
mn = a.mean(0)[newaxis]
mn = tile(mn, (a.shape[0], 1))
#mn = tile(mn, (a.shape[0], 1))
elif axis == 1:
mn = a.mean(1)[:,newaxis]
mn = tile(mn, (1, a.shape[1]))
#mn = tile(mn, (1, a.shape[1]))
elif axis == 2:
#fixme: double centering returns column mean as loc-vector, ok?
mn = a.mean(0)[newaxis] + a.mean(1)[:,newaxis] - a.mean()
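For the double-centering branch above (axis == 2), the location estimate is the sum of column and row means minus the grand mean; a short numpy check that subtracting it leaves (approximately) zero row and column means:

import numpy as np

a = np.random.rand(5, 3)
mn = a.mean(0)[np.newaxis] + a.mean(1)[:, np.newaxis] - a.mean()
ac = a - mn
assert np.allclose(ac.mean(0), 0.0) and np.allclose(ac.mean(1), 0.0)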
@@ -774,7 +774,7 @@ def _scale(a, axis):
a : {array}
Input data
axis : {integer}
Which scaling to perform.
Which scaling to perform.
0 = column, 1 = row, -1 = nothing
*Returns*:
@@ -784,7 +784,7 @@ def _scale(a, axis):
mn : {array}
Scaling vector/matrix
"""
if axis == -1:
sc = zeros((a.shape[1],))
elif axis == 0:
@@ -817,21 +817,20 @@ def esvd(data, a_max=None):
Singular values
v : {array}
Left hand eigenvectors
*Notes*:
Uses Arnoldi iterations for the symmetric eigendecomp (ARPACK)
"""
m, n = data.shape
if m >= n:
if m > n:
kernel = dot(data.T, data)
if a_max == None:
a_max = n - 1
s, v = arpack.eigen_symmetric(kernel, k=a_max, which='LM',
maxiter=200, tol=1e-5)
maxiter=500, tol=1e-7)
s = s[::-1]
v = v[:,::-1]
#u, s, vt = svd(kernel)
@@ -841,9 +840,9 @@ def esvd(data, a_max=None):
else:
kernel = dot(data, data.T)
if a_max == None:
a_max = m -1
a_max = m - 1
s, u = arpack.eigen_symmetric(kernel, k=a_max, which='LM',
maxiter=200, tol=1e-5)
maxiter=500, tol=1e-7)
s = s[::-1]
u = u[:,::-1]
#u, s, vt = svd(kernel)
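The branches above are the kernel speed-up: esvd eigendecomposes the smaller Gram matrix and maps back to singular vectors. A sketch of that recovery step using a plain symmetric eigendecomposition in place of ARPACK (illustration only, not the original implementation):

import numpy as np

data = np.random.rand(50, 4)               # m >> n, so work on the small (n, n) kernel
kernel = np.dot(data.T, data)
eigvals, v = np.linalg.eigh(kernel)        # ascending eigenvalues
order = np.argsort(eigvals)[::-1]
eigvals, v = eigvals[order], v[:, order]
s = np.sqrt(np.clip(eigvals, 0.0, None))   # singular values of data
u = np.dot(data, v) / s                    # left singular vectors recovered from v
assert np.allclose(np.dot(u * s, v.T), data)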
@@ -21,7 +21,7 @@ class Model(object):
def __init__(self, name="johndoe"):
self.name = name
self.options = {}
def save(self, filename='pca.ml'):
pass
@@ -46,7 +46,7 @@ class PCA(Model):
self._x = x
self.amax = amax
self.aopt = amax
# properties
def amax():
doc = "maximum number of components"
@@ -77,7 +77,7 @@ class PCA(Model):
del self._tot_var
return locals()
tot_var = property(**tot_var())
def scores():
doc = "pca scores"
def fget(self):
@@ -94,7 +94,7 @@ class PCA(Model):
del self._core_scores
return locals()
scores = property(**scores())
def loadings():
doc = "pca loadings"
def fget(self):
@@ -111,7 +111,7 @@ class PCA(Model):
self._loadings = p
return locals()
loadings = property(**loadings())
def singvals():
doc = "Singular values"
def fget(self):
@@ -128,7 +128,7 @@ class PCA(Model):
del self._singvals
return locals()
singvals = property(**singvals())
def x():
doc = "x is readonly, may not be deleted"
def fget(self):
@@ -154,12 +154,12 @@ class PCA(Model):
del self._xc
return locals()
xadd = property(**xadd())
def xc():
doc = "mean_centered input data"
def fget(self):
if not hasattr(self, "_xc"):
self._xc = self.x + self.xadd
self._xc = self.x + self.xadd
return self._xc
def fset(self, xc):
self._xc = xc
@@ -186,7 +186,7 @@ class PCA(Model):
del self._xw
return locals()
xw = property(**xw())
def explained_variance():
doc = "explained variance"
def fget(self):
@@ -215,7 +215,7 @@ class PCA(Model):
del self._residuals
return locals()
residuals = property(**residuals())
def leverage():
doc = "objects leverage"
def fget(self):
@@ -254,7 +254,7 @@ class PCA(Model):
self._column_metric = scale(self.xc, axis=1)
return self._column_metric
def fset(self, w):
self._column_metric = w
# update model
def fdel(self):
@@ -263,7 +263,7 @@ class PCA(Model):
del self._xd
return locals()
column_metric = property(**column_metric())
def blm_update(self, a, b):
pass
@@ -281,8 +281,8 @@ class PCA(Model):
def reweight(self, w):
pass
if __name__ == "__main__":
from numpy.random import rand
X = rand(4,10)
@@ -22,9 +22,9 @@ def hotelling(Pcv, P, p_center='median', cov_center='median',
used in multivariate hypothesis testing. In order to keep small-variance
samples from becoming significant, this version allows borrowing variance
from the pooled covariance.
*Parameters*:
Pcv : {array}
Crossvalidation segments of parameter
P : {array}
@@ -39,9 +39,9 @@ def hotelling(Pcv, P, p_center='median', cov_center='median',
Rotate sub-segments toward calibration model.
strict : {boolean}, optional
Only rotate 90 degrees
*Returns*:
tsq : {array}
Hotelling's T^2 estimate
@@ -50,19 +50,19 @@ def hotelling(Pcv, P, p_center='median', cov_center='median',
Gidskehaug et al., A framework for significance analysis of
gene expression data using dimension reduction methods, BMC
bioinformatics, 2007
*Notes*
The rotational freedom in the solution of bilinear
models may require a rotation onto the calibration
model. One way of doing that is procrustes rotation.
"""
m, n = P.shape
n_sets, n, amax = Pcv.shape
T_sq = empty((n,), dtype='d')
Cov_i = zeros((n, amax, amax), dtype='d')
# rotate sub_models to full model
if crot:
for i, Pi in enumerate(Pcv):
@@ -77,20 +77,15 @@ def hotelling(Pcv, P, p_center='median', cov_center='median',
P_ctr = P
for i in xrange(n):
Pi = Pcv[:,i,:] # (n_sets x amax)
Pi = Pcv[:,i,:] # (n_sets x amax)
Pi_ctr = P_ctr[i,:] # (1 x amax)
#Pim = (Pi - Pi_ctr)*msqrt(n_sets-1)
#Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim)
Pim = (Pi - Pi_ctr)
Cov_i[i] = dot(Pim.T, Pim)
Pim = (Pi - Pi_ctr)*msqrt(n_sets-1)
Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim)
if cov_center == 'median':
Cov_p = median(Cov_i)
elif cov_center == 'mean':
else cov_center == 'mean':
Cov_p = Cov.mean(0)
else:
print "Pooled covariance est. invalid, using median"
print cov_center
Cov_p = median(Cov_i)
reg_cov = (1. - alpha)*Cov_i + alpha*Cov_p
for i in xrange(n):
Pc = P_ctr[i,:]
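The reg_cov line above shrinks each variable's segment covariance toward the pooled estimate before the T^2 statistic is formed; a hedged sketch of that last step for a single variable (shapes and scaling are assumptions, not taken from the truncated part of this hunk):

import numpy as np

alpha = 0.3
amax = 2
p_i = np.random.rand(1, amax)                # centered loading segment for one variable
cov_i = np.cov(np.random.rand(10, amax).T)   # that variable's segment covariance (amax, amax)
cov_p = np.eye(amax)                         # stand-in for the pooled covariance
reg_cov = (1. - alpha) * cov_i + alpha * cov_p
t_sq = np.dot(np.dot(p_i, np.linalg.inv(reg_cov)), p_i.T)[0, 0]   # Hotelling-type statistic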
@@ -105,7 +100,7 @@ def procrustes(a, b, strict=True, center=False, force_norm=False, verbose=False)
onto another by minimising the squared error.
*Parameters*:
a : {array}
Input array, stationary
b : {array}
@@ -127,9 +122,9 @@ def procrustes(a, b, strict=True, center=False, force_norm=False, verbose=False)
*Reference*:
Schonemann, A generalized solution of the orthogonal Procrustes
problem, Psychometrika, 1966
problem, Psychometrika, 1966
"""
if center:
mn_a = a.mean(0)
a = a - mn_a
@@ -145,7 +140,7 @@ def procrustes(a, b, strict=True, center=False, force_norm=False, verbose=False)
u, s, vt = svd(dot(b.T, a))
Cm = dot(u, vt) # Cm: orthogonal rotation matrix
if strict:
Cm = _ensure_strict(Cm)
Cm = _ensure_strict(Cm)
b_rot = dot(b, Cm)
if verbose:
fit = ((b - b_rot)**2).sum()
@@ -158,7 +153,7 @@ def procrustes(a, b, strict=True, center=False, force_norm=False, verbose=False)
def _ensure_strict(C, only_flips=True):
"""Ensure that a rotation matrix does only 90 degree rotations.
In multiplication with pcs this allows flips and reordering.
If only_flips is True, only flips will be allowed.
@@ -173,13 +168,13 @@ def _ensure_strict(C, only_flips=True):
C_rot : {array}
Restricted rotation matrix
*Notes*:
This function is not ready for use. Use (only_flips=True).
That is, for more than two components, the rotation matrix
has a tendency to be unstable (det(Cm)>1), when rounding is used.
"""
if only_flips:
C = eye(C.shape[0])*sign(C)
@@ -199,9 +194,9 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
The response (Y) is randomly permuted, and the number of false positives
is registered by comparing Hotelling's T2 statistics of the calibration model.
*Parameters*:
X : {array}
Main data matrix (m, n)
Y : {array}
@@ -237,14 +232,14 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
nsets : {integer}
Number of crossvalidation segments
*Reference*:
Gidskehaug et al., A framework for significance analysis of
gene expression data using dimension reduction methods, BMC
bioinformatics, 2007
"""
m, n = X.shape
k, nz = Z.shape
assert(n==nz)
@@ -255,7 +250,7 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
Y = atleast_2d(Y).T
my, l = Y.shape
assert(m==my)
pert_tsq_x = zeros((n, n_iter), dtype='d') # (nxvars x n_subsets)
pert_tsq_z = zeros((k, n_iter), dtype='d') # (nzvars x n_subsets)
@@ -264,7 +259,7 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
Wc, Lc = lpls_jk(X, Y, Z ,aopt, zorth=zorth)
cal_tsq_x = hotelling(Wc, dat['W'], alpha=alpha)
cal_tsq_z = hotelling(Lc, dat['L'], alpha=alpha)
print "morn"
# Perturbations
index = arange(m)
for i in range(n_iter):
@@ -275,7 +270,7 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha)
# no reason to borrow variance in dag (alpha ->some small value)
pert_tsq_z[:,i] = hotelling(Li, dat['L'], alpha=0.01)
return cal_tsq_z, pert_tsq_z, cal_tsq_x, pert_tsq_x
@@ -286,9 +281,9 @@ def pls_qvals(X, Y, aopt, alpha=.3, n_iter=20,p_center='med', cov_center=median,
The response (Y) is randomly permuted, and the number of false positives
is registered by comparing Hotelling's T2 statistics of the calibration model.
*Parameters*:
X : {array}
Main data matrix (m, n)
Y : {array}
@@ -318,14 +313,14 @@ def pls_qvals(X, Y, aopt, alpha=.3, n_iter=20,p_center='med', cov_center=median,
Only rotate 90 degrees
nsets : {integer}
Number of crossvalidation segments
*Reference*:
Gidskehaug et al., A framework for significance analysis of
gene expression data using dimension reduction methods, BMC
bioinformatics, 2007
"""
m, n = X.shape
try:
my, l = Y.shape
@@ -341,7 +336,7 @@ def pls_qvals(X, Y, aopt, alpha=.3, n_iter=20,p_center='med', cov_center=median,
Wc = pls_jk(X, Y , aopt)
cal_tsq_x = hotelling(Wc, dat['W'], alpha=alpha)
# Perturbations
pert_tsq_x = zeros((n, n_iter), dtype='d') # (nxvars x n_subsets)
index = arange(m)
@@ -351,7 +346,7 @@ def pls_qvals(X, Y, aopt, alpha=.3, n_iter=20,p_center='med', cov_center=median,
dat = pls(X, Y[indi,:], aopt, scale='loads', center_axis=center_axis)
Wi = pls_jk(X, Y[indi,:], aopt, nsets=nsets, center_axis=center_axis)
pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha)
return cal_tsq_x, pert_tsq_x
@@ -362,7 +357,7 @@ def _fdr(tsq, tsqp, loc_method=median):
Fdr is a method used in multiple hypothesis testing to correct for multiple
comparisons. It controls the expected proportion of incorrectly rejected null
hypotheses (type I errors) in a list of rejected hypotheses.
*Parameters*:
tsq : {array}
@@ -372,14 +367,14 @@ def _fdr(tsq, tsqp, loc_method=median):
loc_method : {py_func}
Location method
*Returns*:
fdr : {array}
False discovery rate
*Notes*:
This is an internal function for use in fdr estimation of jack-knifed
perturbed blm parameters.
@@ -403,4 +398,3 @@ def _fdr(tsq, tsqp, loc_method=median):
fd_rate = fp/n_signif
fd_rate[fd_rate>1] = 1
return fd_rate