Projects/pyblm

Second initial import

Arnar Flatberg 2007-10-26 13:36:05 +00:00
parent 67d6d01f7f
commit 5b91e2d809
4 changed files with 596 additions and 23 deletions

View File

@@ -201,16 +201,16 @@ def eigen(A,k=6,M=None,ncv=None,which='LM',
     dr=sb.zeros(k+1,typ)
     di=sb.zeros(k+1,typ)
     zr=sb.zeros((n,k+1),typ)
-    dr,di,z,info=\
+    dr,di,zr,info=\
         eigextract(rvec,howmny,sselect,sigmar,sigmai,workev,
                    bmat,which,k,tol,resid,v,iparam,ipntr,
                    workd,workl,info)
     # make eigenvalues complex
-    d=dr+1.0j*di
+    d=(dr+1.0j*di)[:k]
     # futz with the eigenvectors:
     # complex are stored as real,imaginary in consecutive columns
-    z=zr.astype(typ.upper())
+    z=zr.astype(typ.upper())[:,:k]
     for i in range(k): # fix c.c. pairs
         if di[i] > 0 :
             z[:,i]=zr[:,i]+1.0j*zr[:,i+1]
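Note: ARPACK's extraction routine hands back k+1 Ritz values and an extra vector column (the spare slot holds the second half of a complex-conjugate pair), so the wrapper must slice both back to the k requested. A minimal sketch of the expected behaviour, assuming this eigen is the sandbox arpack wrapper named in the hunk header and that it returns (values, vectors):

    import numpy
    from scipy.sandbox import arpack

    A = numpy.random.rand(40, 40)
    d, z = arpack.eigen(A, k=6, which='LM')   # assumed return: (values, vectors)
    print d.shape, z.shape                    # (6,) and (40, 6) after the fix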

View File

@@ -1,5 +1,6 @@
 from crossvalidation import lpls_val
 from statistics import lpls_qvals
+from engines import pca, pcr, pls
 from engines import nipals_lpls as lpls
 #from tests import *
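With the added line, the new engines are re-exported at the package top level. A minimal sketch, assuming the package imports under the name pyblm as the repository path suggests:

    import numpy
    import pyblm            # assumption: package name taken from the repository path

    X = numpy.random.rand(10, 5)
    dat = pyblm.pca(X, 2)   # pca is now reachable at the top level
    print dat['evx']        # cumulative explained variance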

View File

@@ -2,11 +2,12 @@
 The primary use is crossvalidation.
 """
-__all__ = ['lpls_val', 'lpls_jk']
+__all__ = ['lpls_val', 'pls_jk', 'lpls_jk']
 __docformat__ = "restructuredtext en"
 from numpy import dot,empty,zeros,sqrt,atleast_2d,argmax,asarray,median,\
-     array_split
+     array_split,arange,newaxis
+from numpy.random import shuffle
+from engines import pca, pls
 from engines import nipals_lpls as lpls
@@ -93,8 +94,92 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None, alpha=.5, mean_ctr=[2,0,2], verbose=True):
     return rmsep, Yhat, aopt
+def pca_jk(a, aopt, nsets=None):
+    """Returns jack-knife segments from PCA.
+
+    Parameters:
+        a : {array}
+            data matrix (n x m)
+        aopt : {integer}
+            number of components in model.
+        nsets : {integer}
+            number of segments
+
+    Returns:
+        Pcv : {array}
+            Loadings collected in a three way matrix (n_segments, m, aopt)
+
+    Notes:
+        - The loadings are scaled with the (1/samples)*eigenvalues.
+        - Crossvalidation method is currently set to random diagonals of samples.
+        - todo: add support for T
+        - fixme: more efficient to add this in validation loop?
+    """
+    if nsets == None:
+        nsets = a.shape[0]
+    Pcv = empty((nsets, a.shape[1], aopt), dtype=a.dtype)
+    mn_a = .5*(a.mean(0) + a.mean(1)[:,newaxis])
+    for i, (cal, ind) in enumerate(diag_cv(a.shape, nsets)):
+        # perturb the held-out entries towards the matrix mean, then restore
+        old_values = a.take(ind)
+        new_values = mn_a.take(ind)
+        a.put(ind, new_values)
+        dat = pca(a, aopt, mode='fast', scale='loads', center_axis=0)
+        Pcv[i,:,:] = dat['P']
+        a.put(ind, old_values)
+    return Pcv
+
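A usage sketch for pca_jk (hypothetical data; the matrix is perturbed and restored in place via take/put):

    import numpy
    from crossvalidation import pca_jk

    a = numpy.random.rand(20, 8)
    Pcv = pca_jk(a, 2, nsets=5)
    print Pcv.shape      # (5, 8, 2): one scaled loading matrix per segment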
+def pls_jk(X, Y, a_opt, nsets=None, center=True, verbose=False):
+    """Returns jack-knife segments of W.
+
+    *Parameters*:
+
+        X : {array}
+            Main data matrix (m, n)
+        Y : {array}
+            External row data (m, l)
+        a_opt : {integer}
+            The number of components to calculate (0, min(m,n))
+        nsets : {integer}, optional
+            Number of jack-knife segments
+        center : {boolean}, optional
+            Whether to column-center X and Y in the pls engine
+        verbose : {boolean}, optional
+            Verbosity of console output. For use in debugging.
+
+    *Returns*:
+
+        Wcv : {array}
+            Loading-weights jack-knife segments
+    """
+    m, n = X.shape
+    k, l = Y.shape
+    assert(m==k)
+    if nsets == None:
+        nsets = X.shape[0]
+    if center:
+        center_axis = 0
+    else:
+        center_axis = -1
+    Wcv = empty((nsets, X.shape[1], a_opt), dtype=X.dtype)
+    for i, (cal, val) in enumerate(cv(k, nsets)):
+        if verbose:
+            print "Segment number: %d" % i
+        dat = pls(X[cal,:], Y[cal,:], a_opt, scale='loads', mode='fast',
+                  center_axis=center_axis)
+        Wcv[i,:,:] = dat['W']
+    return Wcv
+
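A usage sketch for pls_jk (hypothetical data):

    import numpy
    from crossvalidation import pls_jk

    X = numpy.random.rand(20, 8)
    Y = numpy.random.rand(20, 2)
    Wcv = pls_jk(X, Y, 2, nsets=10)
    print Wcv.shape      # (10, 8, 2): loading-weights per segment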
-def lpls_jk(X, Y, Z, a_max, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2], verbose=False):
+def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2], verbose=False):
     """Returns jack-knifed segments of lpls model.
 
     Jack-knifing is a method to perturb the model parameters, hopefully
@@ -113,8 +198,8 @@ def lpls_jk(X, Y, Z, a_max, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2], verbose=False):
         External row data (m, l)
         Z : {array}
             External column data (n, o)
-        a_max : {integer}, optional
-            Maximum number of components to calculate (0, min(m,n))
+        a_opt : {integer}, optional
+            The number of components to calculate (0, min(m,n))
         nsets : {integer}, optional
             Number of jack-knife segments
         xz_alpha : {float}, optional
@@ -145,11 +230,11 @@ def lpls_jk(X, Y, Z, a_max, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2], verbose=False):
     assert(n==p)
     if nsets==None:
         nsets = m
-    WWx = empty((nsets, n, a_max), 'd')
-    WWz = empty((nsets, o, a_max), 'd')
-    #WWy = empty((nsets, l, a_max), 'd')
+    WWx = empty((nsets, n, a_opt), 'd')
+    WWz = empty((nsets, o, a_opt), 'd')
+    #WWy = empty((nsets, l, a_opt), 'd')
     for i, (cal, val) in enumerate(cv(k, nsets)):
-        dat = lpls(X[cal],Y[cal],Z,a_max=a_max,alpha=xz_alpha,mean_ctr=mean_ctr,
+        dat = lpls(X[cal], Y[cal], Z, a_max=a_opt, alpha=xz_alpha, mean_ctr=mean_ctr,
                    scale='loads', verbose=verbose)
         WWx[i,:,:] = dat['W']
         WWz[i,:,:] = dat['L']
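A usage sketch (hypothetical data; the function's return statement is not visible in this hunk, so the unpacking below is an assumption based on the WWx/WWz arrays it fills):

    import numpy
    from crossvalidation import lpls_jk

    X = numpy.random.rand(20, 8)
    Y = numpy.random.rand(20, 3)
    Z = numpy.random.rand(8, 5)    # external data on the columns of X, (n, o)
    Wx, Wz = lpls_jk(X, Y, Z, 2)   # assumed to return the X- and Z-weight segments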
@@ -219,6 +304,7 @@ def cv(N, K, randomise=True, sequential=False):
     *Notes*:
 
     If randomise is true, a copy of index is shuffled before partitioning,
     otherwise its order is preserved in training and validation.
+
     Randomise overrides the sequential argument. If randomise is true,
@@ -245,13 +331,40 @@ def cv(N, K, randomise=True, sequential=False):
         validation = [i for i in index if i % K == k]
         yield training, validation
+def diag_cv(shape, nsets=9):
+    """Generates (training, validation) index pairs along matrix diagonals.
+
+    Parameters:
+        shape : {tuple}
+            Shape (rows, columns) of the matrix to be cross-validated
+        nsets : {integer}
+            Number of cross-validation segments
+
+    The returned indices are into the flattened (row-major) matrix.
+    """
+    try:
+        m, n = shape
+    except:
+        raise ValueError("shape needs to be a two-tuple")
+    if nsets>m or nsets>n:
+        msg = "You may not use more subsets than min(n_rows, n_cols)"
+        raise ValueError, msg
+    nm = n*m
+    index = arange(nm)
+    n_ind = arange(n)
+    shuffle(n_ind) # random start diag
+    start_inds = array_split(n_ind, nsets)
+    for v in range(nsets):
+        validation = []
+        for start in start_inds[v]:
+            ind = arange(start+v, nm, n+1)
+            validation.extend(ind)
+        training = [j for j in index if j not in validation]
+        yield training, validation
+
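Unlike cv above, which interleaves whole rows by index modulo K, each validation segment here is a set of wrapped diagonals of the flattened matrix. A small sketch that visualises the held-out pattern:

    import numpy
    from crossvalidation import diag_cv

    a = numpy.arange(16).reshape(4, 4)
    for cal, val in diag_cv(a.shape, nsets=2):
        mask = numpy.zeros(16, dtype=bool)
        mask[val] = True
        print mask.reshape(4, 4)    # True marks one wrapped-diagonal segment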
 def class_error(Yhat, Y, method='vanilla'):
     """ Not used.
     """
-    a_max, k, l = Yhat.shape
+    a_opt, k, l = Yhat.shape
     Yhat_c = zeros((k, l), dtype='d')
-    for a in range(a_max):
+    for a in range(a_opt):
         for i in range(k):
             Yhat_c[a,val,argmax(Yhat[a,val,:])] = 1.0
     err = 100*((Yhat_c + Y) == 2).sum(1)/Y.sum(0).astype('d')

View File

@@ -2,14 +2,414 @@
 """
-__all__ = ['nipals_lpls']
+__all__ = ['pca', 'pcr', 'pls', 'nipals_lpls']
 __docformat__ = "restructuredtext en"
 
 from math import sqrt as msqrt
-from numpy import dot,empty,zeros,apply_along_axis,newaxis,finfo
-from numpy.linalg import inv
+from numpy import dot,empty,zeros,apply_along_axis,newaxis,finfo,sqrt,r_,expand_dims,\
+     minimum,atleast_2d
+from numpy.linalg import inv, svd
+from scipy.sandbox import arpack
+def pca(X, aopt, scale='scores', mode='normal', center_axis=0):
+    """ Principal Component Analysis.
+
+    PCA is a low rank bilinear approximation to a data matrix that sequentially
+    extracts orthogonal components of maximum variance.
+
+    Parameters:
+        X : {array}
+            Data measurement matrix, (samples x variables)
+        aopt : {integer}
+            Number of components to use, aopt<=min(samples, variables)
+        center_axis : {integer}
+            Center along given axis. If neg.: no centering (-inf,..., matrix modes)
+
+    Returns:
+        T : {array}
+            Scores, (samples, components)
+        P : {array}
+            Loadings, (variables, components)
+        E : {array}
+            Residuals, (samples, variables)
+        evx : {array}
+            X-explained variance, (components,)
+        mnx : {array}
+            X location, (variables,)
+        aopt : {integer}
+            The number of components used
+        ssqx : {list}
+            Sum of squared residuals in X along each dimension
+            [(samples,), (variables,)]
+        leverage : {array}
+            Leverages, (samples,)
+
+    OtherParameters:
+        scale : {string}, optional
+            Where to put the weights [['scores'], 'loadings']
+        mode : {string}, optional
+            Amount of info retained, [['normal'], 'fast', 'detailed']
+
+    :SeeAlso:
+        `center` : Data centering
+
+    *Notes*
+
+    Uses kernel speed-up if m>>n or m<<n.
+
+    If residuals turn rank deficient, a lower number of components than given
+    in input will be used.
+
+    *Examples*:
+
+    >>> import scipy,engines
+    >>> a=scipy.asarray([[1,2,3],[2,4,5]])
+    >>> dat=engines.pca(a, 2)
+    >>> dat['evx']
+    array([0.,99.8561562, 100.])
+
+    """
+    m, n = X.shape
+    assert(aopt<=min(m,n))
+    if center_axis>=0:
+        X = X - expand_dims(X.mean(center_axis), center_axis)
+    if m>(n+100) or n>(m+100):
+        u, s, v = esvd(X, aopt)
+    else:
+        u, s, vt = svd(X, 0)
+        v = vt.T
+    u = u[:,:aopt]
+    s = s[:aopt]
+    v = v[:,:aopt]
+
+    # rank test: drop components below numerical rank
+    tol = 1e-10
+    eff_rank = sum(s>s[0]*tol)
+    aopt = minimum(aopt, eff_rank)
+    T = u*s
+    s = s[:aopt]
+    T = T[:,:aopt]
+    P = v[:,:aopt]
+    e = s**2
+    if scale=='loads':
+        T = T/s
+        P = P*s
+
+    if mode == 'fast':
+        return {'T':T, 'P':P, 'aopt':aopt}
+
+    if mode=='detailed':
+        E = empty((aopt, m, n))
+        ssq = []
+        lev = []
+        for ai in range(aopt):
+            E[ai,:,:] = X - dot(T[:,:ai+1], P[:,:ai+1].T)
+            ssq.append([(E[ai,:,:]**2).mean(0), (E[ai,:,:]**2).mean(1)])
+            if scale=='loads':
+                lev.append([((s*T)**2).sum(1), (P**2).sum(1)])
+            else:
+                lev.append([(T**2).sum(1), ((s*P)**2).sum(1)])
+    else:
+        # residuals
+        E = X - dot(T, P.T)
+        sep = E**2
+        ssq = [sep.sum(0), sep.sum(1)]
+        # leverages
+        if scale=='loads':
+            lev = [(1./m)+(T**2).sum(1), (1./n)+((P/s)**2).sum(1)]
+        else:
+            lev = [(1./m)+((T/s)**2).sum(1), (1./n)+(P**2).sum(1)]
+
+    # variances
+    expvarx = r_[0, 100*e.cumsum()/(X*X).sum()]
+
+    return {'T': T, 'P': P, 'E': E, 'evx': expvarx, 'leverage': lev, 'ssqx': ssq,
+            'aopt': aopt, 'eigvals': e}
+
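A quick sanity sketch against a dense SVD (hypothetical data; with centering and the default scale='scores', the scores should equal u*s up to column signs):

    import numpy
    import engines   # this module

    X = numpy.random.rand(10, 4)
    dat = engines.pca(X, 2)
    u, s, vt = numpy.linalg.svd(X - X.mean(0), 0)
    print numpy.allclose(abs(dat['T']), abs(u[:,:2]*s[:2]))   # -> True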
+def pcr(a, b, aopt, scale='scores', mode='normal', center_axis=0):
+    """ Principal Component Regression.
+
+    Performs PCR on given matrix and returns results in a dictionary.
+
+    Parameters:
+        a : array
+            Data measurement matrix, (samples x variables)
+        b : array
+            Data response matrix, (samples x responses)
+        aopt : int
+            Number of components to use, aopt<=min(samples, variables)
+
+    Returns:
+        results : dict
+            keys -- values, T -- scores, P -- loadings, E -- residuals,
+            levx -- leverages, ssqx -- sum of squares, expvarx -- cumulative
+            explained variance, aopt -- number of components used
+
+    OtherParameters:
+        mode : str
+            Amount of info retained, ('fast', 'normal', 'detailed')
+        center_axis : int
+            Center along given axis. If neg.: no centering (-inf,..., matrix modes)
+
+    SeeAlso:
+        - pca : other blm
+        - pls : other blm
+        - lpls : other blm
+
+    *Notes*
+
+    Uses kernel speed-up if m>>n or m<<n.
+
+    If residuals turn rank deficient, a lower number of components than given
+    in input will be used. The number of components used is given in results-dict.
+
+    Examples
+    --------
+    >>> import scipy,engines
+    >>> a=scipy.asarray([[1,2,3],[2,4,5]])
+    >>> b=scipy.asarray([[1,1],[2,3]])
+    >>> dat=engines.pcr(a, b, 2)
+    >>> dat['evx']
+    array([0.,99.8561562, 100.])
+
+    """
+    try:
+        k, l = b.shape
+    except:
+        b = atleast_2d(b).T
+        k, l = b.shape
+    if center_axis>=0:
+        b = b - expand_dims(b.mean(center_axis), center_axis)
+    dat = pca(a, aopt=aopt, scale=scale, mode=mode, center_axis=center_axis)
+    T = dat['T']
+    weights = apply_along_axis(vnorm, 0, T)**2
+    if scale=='loads':
+        Q = dot(b.T, T*weights)
+    else:
+        Q = dot(b.T, T/weights)
+    if mode=='fast':
+        dat.update({'Q':Q})
+        return dat
+    if mode=='detailed':
+        F = empty((aopt, k, l))
+        ssqy = []
+        for i in range(aopt):
+            F[i,:,:] = b - dot(T[:,:i+1], Q[:,:i+1].T)
+            ssqy.append([(F[i,:,:]**2).mean(0), (F[i,:,:]**2).mean(1)])
+    else:
+        F = b - dot(T, Q.T)
+        sepy = F**2
+        ssqy = [sepy.sum(0), sepy.sum(1)]
+    expvary = r_[0, 100*((T**2).sum(0)*(Q**2).sum(0)/(b**2).sum()).cumsum()[:aopt]]
+    dat.update({'Q': Q, 'F': F, 'evy': expvary, 'ssqy': ssqy})
+    return dat
+
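A usage sketch (hypothetical data): with the default scale='scores', the fitted responses can be rebuilt from the scores and Y-loadings:

    import numpy
    import engines

    X = numpy.random.rand(12, 6)
    b = numpy.random.rand(12, 2)
    dat = engines.pcr(X, b, 3)
    print dat['evy']                                     # cumulative explained variance in b
    bhat = numpy.dot(dat['T'], dat['Q'].T) + b.mean(0)   # fitted responses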
+def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=-1):
+    """Partial Least Squares Regression.
+
+    Performs PLS on given matrix and returns results in a dictionary.
+
+    *Parameters*:
+        X : {array}
+            Data measurement matrix, (samples x variables)
+        Y : {array}
+            Data response matrix, (samples x responses)
+        aopt : {integer}, optional
+            Number of components to use, aopt<=min(samples, variables)
+        scale : {'scores', 'loads'}, optional
+            Which component should get the scale
+        center_axis : {-1, integer}
+            Perform centering across given axis, (-1 is no centering)
+
+    *Returns*:
+        T : {array}
+            X-scores
+        W : {array}
+            X-loading-weights
+        P : {array}
+            X-loadings
+        R : {array}
+            X-loadings-basis
+        Q : {array}
+            Y-loadings
+        B : {array}
+            Regression coefficients
+        E : {array}
+            X-block residuals
+        F : {array}
+            Y-block residuals
+        evx : {array}
+            X-explained variance
+        evy : {array}
+            Y-explained variance
+        mnx : {array}
+            X location
+        mny : {array}
+            Y location
+        aopt : {array}
+            The number of components used
+        ssqx : {list}, optional
+            Sum of squared residuals in X along each dimension
+        ssqy : {list}
+            Sum of squared residuals in Y along each dimension
+        leverage : {array}
+            Sample leverages
+
+    *OtherParameters*:
+        mode : ['normal', 'fast', 'detailed'], optional
+            How much detail to compute
+
+    *SeeAlso*:
+        `center` : data centering
+
+    *Notes*
+
+    - The output with mode='fast' will only return T and W
+    - If residuals turn rank deficient, a lower number of components than
+      given in input will be used. The number of components used is given
+      in results.
+
+    *Examples*
+
+    >>> import numpy, engines
+    >>> a = numpy.asarray([[1,2,3],[2,4,5]])
+    >>> b = numpy.asarray([[1,1],[2,3]])
+    >>> dat = engines.pls(a, b, 2)
+    >>> dat['evx']
+    array([0.,99.8561562, 100.])
+
+    """
+    m, n = X.shape
+    try:
+        k, l = Y.shape
+    except:
+        Y = atleast_2d(Y).T
+        k, l = Y.shape
+    assert(m==k)
+    assert(aopt<min(m, n))
+    mnx, mny = 0, 0
+    if center_axis>=0:
+        mnx = expand_dims(X.mean(center_axis), center_axis)
+        X = X - mnx
+        mny = expand_dims(Y.mean(center_axis), center_axis)
+        Y = Y - mny
+
+    W = empty((n, aopt))
+    P = empty((n, aopt))
+    R = empty((n, aopt))
+    Q = empty((l, aopt))
+    T = empty((m, aopt))
+    B = empty((aopt, n, l))
+    tt = empty((aopt,))
+
+    XY = dot(X.T, Y)
+    for i in range(aopt):
+        if XY.shape[1]==1: # pls 1
+            w = XY.reshape(n, l)
+            w = w/vnorm(w)
+        elif n<l: # more yvars than xvars
+            s, w = arpack.eigen_symmetric(dot(XY, XY.T), k=1, tol=1e-10, maxiter=100)
+            #w, s, vh = svd(dot(XY, XY.T))
+            #w = w[:,:1]
+        else: # more xvars than yvars
+            s, q = arpack.eigen_symmetric(dot(XY.T, XY), k=1, tol=1e-10, maxiter=100)
+            #q, s, vh = svd(dot(XY.T, XY))
+            #q = q[:,:1]
+            w = dot(XY, q)
+            w = w/vnorm(w)
+        r = w.copy()
+        if i>0:
+            # deflate against previous components
+            for j in range(i):
+                r = r - dot(P[:,j].T, w)*R[:,j][:,newaxis]
+        t = dot(X, r)
+        tt[i] = tti = dot(t.T, t).ravel()
+        p = dot(X.T, t)/tti
+        q = dot(r.T, XY).T/tti
+        XY = XY - dot(p, q.T)*tti
+        T[:,i] = t.ravel()
+        W[:,i] = w.ravel()
+        if mode=='fast' and i==aopt-1:
+            if scale=='loads':
+                tnorm = sqrt(tt)
+                T = T/tnorm
+                W = W*tnorm
+            return {'T':T, 'W':W}
+        P[:,i] = p.ravel()
+        R[:,i] = r.ravel()
+        Q[:,i] = q.ravel()
+        B[i] = dot(R[:,:i+1], Q[:,:i+1].T)
+
+    qnorm = apply_along_axis(vnorm, 0, Q)
+    tnorm = sqrt(tt)
+    pp = (P**2).sum(0)
+    if mode=='detailed':
+        E = empty((aopt, m, n))
+        F = empty((aopt, k, l))
+        ssqx, ssqy = [], []
+        leverage = empty((aopt, m))
+        #h2x = [] # hotellings T^2
+        #h2y = []
+        for ai in range(aopt):
+            E[ai,:,:] = X - dot(T[:,:ai+1], P[:,:ai+1].T)
+            F[ai,:,:] = Y - dot(T[:,:ai+1], Q[:,:ai+1].T)
+            ssqx.append([(E[ai,:,:]**2).mean(0), (E[ai,:,:]**2).mean(1)])
+            ssqy.append([(F[ai,:,:]**2).mean(0), (F[ai,:,:]**2).mean(1)])
+            leverage[ai,:] = 1./m + ((T[:,:ai+1]/tnorm[:ai+1])**2).sum(1)
+            #h2y.append(1./k + ((Q[:,:ai+1]/qnorm[:ai+1])**2).sum(1))
+    else:
+        # residuals
+        E = X - dot(T, P.T)
+        F = Y - dot(T, Q.T)
+        sepx = E**2
+        ssqx = [sepx.sum(0), sepx.sum(1)]
+        sepy = F**2
+        ssqy = [sepy.sum(0), sepy.sum(1)]
+        leverage = 1./m + ((T/tnorm)**2).sum(1)
+
+    # variances
+    tp = tt*pp
+    tq = tt*qnorm*qnorm
+    expvarx = r_[0, 100*tp/(X*X).sum()]
+    expvary = r_[0, 100*tq/(Y*Y).sum()]
+
+    if scale=='loads':
+        T = T/tnorm
+        W = W*tnorm
+        Q = Q*tnorm
+        P = P*tnorm
+
+    return {'Q': Q, 'P': P, 'T': T, 'W': W, 'R': R, 'E': E, 'F': F, 'B': B,
+            'evx': expvarx, 'evy': expvary, 'ssqx': ssqx, 'ssqy': ssqy,
+            'leverage': leverage, 'mnx': mnx, 'mny': mny}
+
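The stacked coefficient array B supports prediction on new rows once the training centering is re-applied; a sketch with hypothetical data:

    import numpy
    import engines

    X = numpy.random.rand(15, 7)
    Y = numpy.random.rand(15, 2)
    dat = engines.pls(X, Y, aopt=3, center_axis=0)
    Xnew = numpy.random.rand(5, 7)
    # B[-1] holds the coefficients using all 3 components
    Yhat = numpy.dot(Xnew - dat['mnx'], dat['B'][-1]) + dat['mny']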
 def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=False):
     """ L-shaped Partial Least Squares Regression by the nipals algorithm.
 
@@ -38,11 +438,7 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=False):
             -1 : nothing
             0 : row center
             1 : column center
             2 : double center
-        scale : {'scores', 'loads'}, optional
-            Option to decide on where the scale goes.
-        verbose : {boolean}, optional
-            Verbosity of console output. For use in debugging.
 
     *Returns*:
@@ -75,6 +471,13 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=False):
         mnz : {array}
             Z location
 
+    *OtherParameters*:
+
+        scale : {'scores', 'loads'}, optional
+            Option to decide on where the scale goes.
+        verbose : {boolean}, optional
+            Verbosity of console output. For use in debugging.
+
     *References*
 
     Saeboe et al., LPLS-regression: a method for improved prediction and
     classification through inclusion of background information on
@@ -296,3 +699,59 @@ def _scale(a, axis):
         raise IOError("input error: axis must be in [-1,0,1]")
 
     return a - sc, sc
+def esvd(data, a_max=None):
+    """ SVD with kernel calculation.
+
+    Calculate subspaces of X'X or XX' depending on the shape
+    of the matrix.
+
+    Parameters:
+        data : {array}
+            Data matrix
+        a_max : {integer}
+            Number of components to extract
+
+    Returns:
+        u : {array}
+            Left hand singular vectors
+        s : {array}
+            Singular values
+        v : {array}
+            Right hand singular vectors
+
+    Notes:
+        Uses Arnoldi iterations (ARPACK)
+    """
+    m, n = data.shape
+    if m>=n:
+        kernel = dot(data.T, data)
+        if a_max==None:
+            a_max = n - 1
+        s, v = arpack.eigen_symmetric(kernel, k=a_max, which='LM',
+                                      maxiter=200, tol=1e-5)
+        # ARPACK returns eigenvalues in ascending order; reverse to descending
+        s = s[::-1]
+        v = v[:,::-1]
+        #u, s, vt = svd(kernel)
+        #v = vt.T
+        s = sqrt(s)
+        u = dot(data, v)/s
+    else:
+        kernel = dot(data, data.T)
+        if a_max==None:
+            a_max = m - 1
+        s, u = arpack.eigen_symmetric(kernel, k=a_max, which='LM',
+                                      maxiter=200, tol=1e-5)
+        s = s[::-1]
+        u = u[:,::-1]
+        #u, s, vt = svd(kernel)
+        s = sqrt(s)
+        v = dot(data.T, u)/s
+    return u, s, v
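A sanity sketch comparing the kernel shortcut against a dense SVD (hypothetical data; ARPACK runs at tol=1e-5 here, so compare with slack):

    import numpy
    import engines

    X = numpy.random.rand(500, 10)   # tall: kernel X'X is only 10 x 10
    u, s, v = engines.esvd(X, 4)
    u2, s2, vt2 = numpy.linalg.svd(X, 0)
    print numpy.allclose(s, s2[:4], atol=1e-3)   # -> True, within ARPACK tolerance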