diff --git a/arpack/arpack.py b/arpack/arpack.py
index b6913dd..39139c8 100644
--- a/arpack/arpack.py
+++ b/arpack/arpack.py
@@ -201,16 +201,16 @@ def eigen(A,k=6,M=None,ncv=None,which='LM',
     dr=sb.zeros(k+1,typ)
     di=sb.zeros(k+1,typ)
     zr=sb.zeros((n,k+1),typ)
-    dr,di,z,info=\
+    dr,di,zr,info=\
         eigextract(rvec,howmny,sselect,sigmar,sigmai,workev,
             bmat,which,k,tol,resid,v,iparam,ipntr,
             workd,workl,info)
-
+
     # make eigenvalues complex
-    d=dr+1.0j*di
+    d=(dr+1.0j*di)[:k]
     # futz with the eigenvectors:
     # complex are stored as real,imaginary in consecutive columns
-    z=zr.astype(typ.upper())
+    z=zr.astype(typ.upper())[:,:k]
     for i in range(k):      # fix c.c. pairs
         if di[i] > 0 :
             z[:,i]=zr[:,i]+1.0j*zr[:,i+1]
diff --git a/pyblm/__init__.py b/pyblm/__init__.py
index 8c651f7..185f119 100644
--- a/pyblm/__init__.py
+++ b/pyblm/__init__.py
@@ -1,5 +1,6 @@
 from crossvalidation import lpls_val
 from statistics import lpls_qvals
+from engines import pca, pcr, pls
 from engines import nipals_lpls as lpls

 #from tests import *
diff --git a/pyblm/crossvalidation.py b/pyblm/crossvalidation.py
index 000fbef..a824068 100644
--- a/pyblm/crossvalidation.py
+++ b/pyblm/crossvalidation.py
@@ -2,11 +2,14 @@

 The primary use is crossvalidation.
 """
-__all__ = ['lpls_val', 'lpls_jk']
+__all__ = ['lpls_val', 'pls_jk', 'lpls_jk']
 __docformat__ = "restructuredtext en"

 from numpy import dot,empty,zeros,sqrt,atleast_2d,argmax,asarray,median,\
-     array_split
+     array_split,arange,newaxis
+from numpy.random import shuffle

+from engines import pca, pls
 from engines import nipals_lpls as lpls


@@ -93,8 +95,89 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2],verbose=Tru

     return rmsep, Yhat, aopt

+def pca_jk(a, aopt, nsets=None, center_axis=0):
+    """Returns jack-knife segments from PCA.
+
+    Parameters:
-def lpls_jk(X, Y, Z, a_max, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2], verbose=False):
+        a : {array}
+            data matrix (n x m)
+        aopt : {integer}
+            number of components in model.
+        nsets : {integer}
+            number of segments
+
+    Returns:
+
+        Pcv : {array}
+            Loadings collected in a three way matrix (n_segments, m, aopt)
+
+    Notes:
+
+    - The loadings are scaled with the (1/samples)*eigenvalues.
+
+    - Crossvalidation method is currently set to random blocks of samples.
+
+    - todo: add support for T
+
+    - fixme: more efficient to add this in validation loop?
+
+    """
+    if nsets == None:
+        nsets = a.shape[0]
+
+    Pcv = empty((nsets, a.shape[1], aopt), dtype=a.dtype)
+    mn_a = .5*(a.mean(0) + a.mean(1)[:,newaxis])
+    for i, (cal, val) in enumerate(diag_cv(a.shape, nsets)):
+        # perturb the validation cells, refit, then restore them
+        old_values = a.take(val)
+        new_values = mn_a.take(val)
+        a.put(val, new_values)
+        dat = pca(a, aopt, mode='fast', scale='loads', center_axis=center_axis)
+        Pcv[i,:,:] = dat['P']
+        a.put(val, old_values)
+
+    return Pcv
+
+def pls_jk(X, Y, a_opt, nsets=None, center=True, verbose=False):
+    """ Returns jack-knife segments of W.
+
+    *Parameters*:
+
+        X : {array}
+            Main data matrix (m, n)
+        Y : {array}
+            External row data (m, l)
+        a_opt : {integer}
+            The number of components to calculate (0, min(m,n))
+        nsets : {integer}, optional
+            Number of jack-knife segments
+        center : {boolean}, optional
+            Center the calibration data before fitting
+        verbose : {boolean}, optional
+            Verbosity of console output. For use in debugging.
+
+    *Returns*:
+
+        Wcv : {array}
+            Loading-weights jack-knife segments
+
+    """
+    m, n = X.shape
+    k, l = Y.shape
+    assert(m==k)
+    if nsets == None:
+        nsets = X.shape[0]
+    if center:
+        center_axis = 0
+    else:
+        center_axis = -1
+    Wcv = empty((nsets, X.shape[1], a_opt), dtype=X.dtype)
+    for i, (cal, val) in enumerate(cv(k, nsets)):
+        if verbose:
+            print "Segment number: %d" %i
+        dat = pls(X[cal,:], Y[cal,:], a_opt, scale='loads', mode='fast',
+                  center_axis=center_axis)
+        Wcv[i,:,:] = dat['W']
+
+    return Wcv
+
+def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2], verbose=False):
     """Returns jack-knifed segments of lpls model.

     Jack-knifing is a method to perturb the model paramters, hopefully
@@ -113,8 +198,8 @@ def lpls_jk(X, Y, Z, a_max, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2], verbose=F
         External row data (m, l)
     Z : {array}
         External column data (n, o)
-    a_max : {integer}, optional
-        Maximum number of components to calculate (0, min(m,n))
+    a_opt : {integer}
+        The number of components to calculate (0, min(m,n))
     nsets : (integer), optional
         Number of jack-knife segments
     xz_alpha : {float}, optional
@@ -145,11 +230,11 @@ def lpls_jk(X, Y, Z, a_max, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2], verbose=F
     assert(n==p)
     if nsets==None:
         nsets = m
-    WWx = empty((nsets, n, a_max), 'd')
-    WWz = empty((nsets, o, a_max), 'd')
-    #WWy = empty((nsets, l, a_max), 'd')
+    WWx = empty((nsets, n, a_opt), 'd')
+    WWz = empty((nsets, o, a_opt), 'd')
+    #WWy = empty((nsets, l, a_opt), 'd')
     for i, (cal, val) in enumerate(cv(k, nsets)):
-        dat = lpls(X[cal],Y[cal],Z,a_max=a_max,alpha=xz_alpha,mean_ctr=mean_ctr,
+        dat = lpls(X[cal], Y[cal], Z, a_max=a_opt, alpha=xz_alpha, mean_ctr=mean_ctr,
                    scale='loads', verbose=verbose)
         WWx[i,:,:] = dat['W']
         WWz[i,:,:] = dat['L']
@@ -219,6 +304,7 @@ def cv(N, K, randomise=True, sequential=False):
     *Notes*:

     If randomise is true, a copy of index is shuffled before partitioning,
+    otherwise its order is preserved in training and validation.

     Randomise overrides the sequential argument. If randomise is true,
@@ -245,13 +331,40 @@ def cv(N, K, randomise=True, sequential=False):
         validation = [i for i in index if i % K == k]
         yield training, validation

+def diag_cv(shape, nsets=9):
+    """Generates nsets (training, validation) index pairs.
+
+    The indices are flat (row-major) indices into a matrix of the given
+    shape, picked along shifted diagonals.
+
+    Parameters:
+
+        shape : {tuple}
+            Matrix shape (m, n)
+        nsets : {integer}
+            Number of cross-validation segments
+    """
+    try:
+        m, n = shape
+    except:
+        raise ValueError("shape needs to be a two-tuple")
+    if nsets>m or nsets>n:
+        msg = "You may not use more subsets than max(n_rows, n_cols)"
+        raise ValueError, msg
+    nm = n*m
+    index = arange(nm)
+    n_ind = arange(n)
+    shuffle(n_ind) # random start diag
+    start_inds = array_split(n_ind, nsets)
+    for v in range(nsets):
+        validation = []
+        for start in start_inds[v]:
+            ind = arange(start+v, nm, n+1)
+            validation.extend(ind)
+        training = [j for j in index if j not in validation]
+        yield training, validation
+
 def class_error(Yhat, Y, method='vanilla'):
     """ Not used.
""" - a_max, k, l = Yhat.shape + a_opt, k, l = Yhat.shape Yhat_c = zeros((k, l), dtype='d') - for a in range(a_max): + for a in range(a_opt): for i in range(k): Yhat_c[a,val,argmax(Yhat[a,val,:])] = 1.0 err = 100*((Yhat_c + Y) == 2).sum(1)/Y.sum(0).astype('d') diff --git a/pyblm/engines.py b/pyblm/engines.py index fa44244..5c224ab 100644 --- a/pyblm/engines.py +++ b/pyblm/engines.py @@ -2,14 +2,414 @@ """ -__all__ = ['nipals_lpls'] +__all__ = ['pca', 'pcr', 'pls', 'nipals_lpls'] __docformat__ = "restructuredtext en" from math import sqrt as msqrt -from numpy import dot,empty,zeros,apply_along_axis,newaxis,finfo -from numpy.linalg import inv +from numpy import dot,empty,zeros,apply_along_axis,newaxis,finfo,sqrt,r_,expand_dims,\ +minimum +from numpy.linalg import inv, svd +from scipy.sandbox import arpack +def pca(X, aopt, scale='scores', mode='normal', center_axis=0): + """ Principal Component Analysis. + + PCA is a low rank bilinear aprroximation to a data matrix that sequentially + extracts orthogonal components of maximum variance. + + Parameters: + + X : {array} + Data measurement matrix, (samples x variables) + aopt : {integer} + Number of components to use, aopt<=min(samples, variables) + center_axis : {integer} + Center along given axis. If neg.: no centering (-inf,..., matrix modes) + + Returns: + + T : {array} + Scores, (samples, components) + P : {array} + Loadings, (variables, components) + E : {array} + Residuals, (samples, variables) + evx : {array} + X-explained variance, (components,) + mnx : {array} + X location, (variables,) + aopt : {integer} + The number of components used + ssqx : {list} + Sum of squared residuals in X along each dimesion + [(samples, ), (variables,)] + leverage : {array} + Leverages, (samples,) + + OtherParameters: + + scale : {string}, optional + Where to put the weights [['scores'], 'loadings'] + mode : {string}, optional + Amount of info retained, [['normal'], 'fast', 'detailed'] + + + :SeeAlso: + + `center` : Data centering + + + *Notes* + + Uses kernel speed-up if m>>n or m<>> import scipy,engines + >>> a=scipy.asarray([[1,2,3],[2,4,5]]) + >>> dat=engines.pca(a, 2) + >>> dat['evx'] + array([0.,99.8561562, 100.]) + + """ + + m, n = X.shape + assert(aopt<=min(m,n)) + if center_axis>=0: + X = X - expand_dims(X.mean(center_axis), center_axis) + if m>(n+100) or n>(m+100): + u, s, v = esvd(X, aopt) + else: + u, s, vt = svd(X, 0) + v = vt.T + u = u[:,:aopt] + s = s[:aopt] + v = v[:,:aopt] + + # ranktest + tol = 1e-10 + eff_rank = sum(s>s[0]*tol) + aopt = minimum(aopt, eff_rank) + T = u*s + s = s[:aopt] + T = T[:,:aopt] + P = v[:,:aopt] + e = s**2 + + if scale=='loads': + T = T/s + P = P*s + + if mode == 'fast': + return {'T':T, 'P':P, 'aopt':aopt} + + if mode=='detailed': + E = empty((aopt, m, n)) + ssq = [] + lev = [] + for ai in range(aopt): + E[ai,:,:] = X - dot(T[:,:ai+1], P[:,:ai+1].T) + ssq.append([(E[ai,:,:]**2).mean(0), (E[ai,:,:]**2).mean(1)]) + if scale=='loads': + lev.append([((s*T)**2).sum(1), (P**2).sum(1)]) + else: + lev.append([(T**2).sum(1), ((s*P)**2).sum(1)]) + else: + # residuals + E = X - dot(T, P.T) + sep = E**2 + ssq = [sep.sum(0), sep.sum(1)] + # leverages + if scale=='loads': + lev = [(1./m)+(T**2).sum(1), (1./n)+((P/s)**2).sum(1)] + else: + lev = [(1./m)+((T/s)**2).sum(1), (1./n)+(P**2).sum(1)] + # variances + expvarx = r_[0, 100*e.cumsum()/(X*X).sum()] + + return {'T': T, 'P': P, 'E': E, 'evx': expvarx, 'leverage': lev, 'ssqx': ssq, + 'aopt': aopt, 'eigvals': e} + +def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=0): 
+    """ Principal Component Regression.
+
+    Performs PCR on given matrix and returns results in a dictionary.
+
+    Parameters:
+
+        a : array
+            Data measurement matrix, (samples x variables)
+        b : array
+            Data response matrix, (samples x responses)
+        aopt : int
+            Number of components to use, aopt<=min(samples, variables)
+
+    Returns:
+
+        results : dict
+            keys -- values, T -- scores, P -- loadings, E -- residuals,
+            levx -- leverages, ssqx -- sum of squares, expvarx -- cumulative
+            explained variance, aopt -- number of components used
+
+    OtherParameters:
+
+        mode : str
+            Amount of info retained, ('fast', 'normal', 'detailed')
+        center_axis : int
+            Center along given axis. If negative: no centering
+
+    SeeAlso:
+
+        - pca : other blm
+        - pls : other blm
+        - lpls : other blm
+
+    *Notes*
+
+    Uses kernel speed-up if m >> n or n >> m.
+
+    *Examples*:
+
+    >>> import scipy, engines
+    >>> a = scipy.asarray([[1,2,3],[2,4,5]])
+    >>> b = scipy.asarray([[1,1],[2,3]])
+    >>> dat = engines.pcr(a, b, 2)
+    >>> dat['evx']
+    array([0., 99.8561562, 100.])
+
+    """
+    try:
+        k, l = b.shape
+    except:
+        b = atleast_2d(b).T
+        k, l = b.shape
+    if center_axis>=0:
+        b = b - expand_dims(b.mean(center_axis), center_axis)
+    dat = pca(a, aopt=aopt, scale=scale, mode=mode, center_axis=center_axis)
+    T = dat['T']
+    weights = apply_along_axis(vnorm, 0, T)**2
+    if scale=='loads':
+        Q = dot(b.T, T*weights)
+    else:
+        Q = dot(b.T, T/weights)
+
+    if mode=='fast':
+        dat.update({'Q':Q})
+        return dat
+
+    if mode=='detailed':
+        F = empty((aopt, k, l))
+        ssqy = []
+        for i in range(aopt):
+            F[i,:,:] = b - dot(T[:,:i+1], Q[:,:i+1].T)
+            ssqy.append([(F[i,:,:]**2).mean(0), (F[i,:,:]**2).mean(1)])
+    else:
+        F = b - dot(T, Q.T)
+        sepy = F**2
+        ssqy = [sepy.sum(0), sepy.sum(1)]
+
+    expvary = r_[0, 100*((T**2).sum(0)*(Q**2).sum(0)/(b**2).sum()).cumsum()[:aopt]]
+
+    dat.update({'Q': Q, 'F': F, 'evy': expvary, 'ssqy': ssqy})
+    return dat
+
+def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=-1):
+    """Partial Least Squares Regression.
+
+    Performs PLS on given matrix and returns results in a dictionary.
+
+    *Parameters*:
+
+        X : {array}
+            Data measurement matrix, (samples x variables)
+        Y : {array}
+            Data response matrix, (samples x responses)
+        aopt : {integer}, optional
+            Number of components to use, aopt<=min(samples, variables)
+        scale : ['scores', 'loads'], optional
+            Which component should get the scale
+        center_axis : {-1, integer}
+            Perform centering across given axis, (-1 is no centering)
+
+    *Returns*:
+
+        T : {array}
+            X-scores
+        W : {array}
+            X-loading-weights
+        P : {array}
+            X-loadings
+        R : {array}
+            X-loadings-basis
+        Q : {array}
+            Y-loadings
+        B : {array}
+            Regression coefficients
+        E : {array}
+            X-block residuals
+        F : {array}
+            Y-block residuals
+        evx : {array}
+            X-explained variance
+        evy : {array}
+            Y-explained variance
+        mnx : {array}
+            X location
+        mny : {array}
+            Y location
+        aopt : {integer}
+            The number of components used
+        ssqx : {list}, optional
+            Sum of squared residuals in X along each dimension
+        ssqy : {list}
+            Sum of squared residuals in Y along each dimension
+        leverage : {array}
+            Sample leverages
+
+    *OtherParameters*:
+
+        mode : ['normal', 'fast', 'detailed'], optional
+            How much detail to compute
+
+    *SeeAlso*:
+
+        `center` : data centering
+
+    *Notes*
+
+    - The output with mode='fast' will only return T and W
+
+    - If the residuals turn rank deficient, a lower number of components than
+      requested will be used. The number of components used is given in results.
+
+    *Examples*
+
+    >>> import numpy, engines
+    >>> a = numpy.asarray([[1,2,3],[2,4,5]])
+    >>> b = numpy.asarray([[1,1],[2,3]])
+    >>> dat = engines.pls(a, b, 2)
+    >>> dat['evx']
+    array([0., 99.8561562, 100.])
+
+    """
+
+    m, n = X.shape
+    try:
+        k, l = Y.shape
+    except:
+        Y = atleast_2d(Y).T
+        k, l = Y.shape
+    assert(m==k)
+    assert(aopt<=min(m, n))
+    mnx, mny = 0., 0.
+    if center_axis>=0:
+        mnx = expand_dims(X.mean(center_axis), center_axis)
+        X = X - mnx
+        mny = expand_dims(Y.mean(center_axis), center_axis)
+        Y = Y - mny
+
+    W = empty((n, aopt))
+    P = empty((n, aopt))
+    R = empty((n, aopt))
+    Q = empty((l, aopt))
+    T = empty((m, aopt))
+    B = empty((aopt, n, l))
+    tt = empty((aopt,))
+
+    XY = dot(X.T, Y)
+    for i in range(aopt):
+        if XY.shape[1]==1: # pls 1
+            w = XY.reshape(n, l)
+            w = w/vnorm(w)
+        elif n<l: # more y-variables than x-variables
+            w, s, vh = svd(dot(XY, XY.T))
+            w = w[:,:1]
+        else: # more x-variables than y-variables
+            q, s, vh = svd(dot(XY.T, XY))
+            q = q[:,:1]
+            w = dot(XY, q)
+            w = w/vnorm(w)
+        r = w.copy()
+        if i>0:
+            for j in range(0, i, 1):
+                r = r - dot(P[:,j].T, w)*R[:,j][:,newaxis]
+
+        t = dot(X, r)
+        tt[i] = tti = dot(t.T, t).ravel()
+        p = dot(X.T, t)/tti
+        q = dot(r.T, XY).T/tti
+        XY = XY - dot(p, q.T)*tti
+        T[:,i] = t.ravel()
+        W[:,i] = w.ravel()
+
+        if mode=='fast' and i==aopt-1:
+            if scale=='loads':
+                tnorm = sqrt(tt)
+                T = T/tnorm
+                W = W*tnorm
+            return {'T':T, 'W':W}
+
+        P[:,i] = p.ravel()
+        R[:,i] = r.ravel()
+        Q[:,i] = q.ravel()
+        B[i] = dot(R[:,:i+1], Q[:,:i+1].T)
+
+    qnorm = apply_along_axis(vnorm, 0, Q)
+    tnorm = sqrt(tt)
+    pp = (P**2).sum(0)
+    if mode=='detailed':
+        E = empty((aopt, m, n))
+        F = empty((aopt, k, l))
+        ssqx, ssqy = [], []
+        leverage = empty((aopt, m))
+        #h2x = [] #hotellings T^2
+        #h2y = []
+        for ai in range(aopt):
+            E[ai,:,:] = X - dot(T[:,:ai+1], P[:,:ai+1].T)
+            F[ai,:,:] = Y - dot(T[:,:ai+1], Q[:,:ai+1].T)
+            ssqx.append([(E[ai,:,:]**2).mean(0), (E[ai,:,:]**2).mean(1)])
+            ssqy.append([(F[ai,:,:]**2).mean(0), (F[ai,:,:]**2).mean(1)])
+            leverage[ai,:] = 1./m + ((T[:,:ai+1]/tnorm[:ai+1])**2).sum(1)
+            #h2y.append(1./k + ((Q[:,:ai+1]/qnorm[:ai+1])**2).sum(1))
+    else:
+        # residuals
+        E = X - dot(T, P.T)
+        F = Y - dot(T, Q.T)
+        sepx = E**2
+        ssqx = [sepx.sum(0), sepx.sum(1)]
+        sepy = F**2
+        ssqy = [sepy.sum(0), sepy.sum(1)]
+        leverage = 1./m + ((T/tnorm)**2).sum(1)
+
+    # variances
+    tp = tt*pp
+    tq = tt*qnorm*qnorm
+    expvarx = r_[0, 100*tp/(X*X).sum()]
+    expvary = r_[0, 100*tq/(Y*Y).sum()]
+
+    if scale=='loads':
+        T = T/tnorm
+        W = W*tnorm
+        Q = Q*tnorm
+        P = P*tnorm
+
+    return {'Q': Q, 'P': P, 'T': T, 'W': W, 'R': R, 'E': E, 'F': F, 'B': B,
+            'evx': expvarx, 'evy': expvary, 'ssqx': ssqx, 'ssqy': ssqy,
+            'leverage': leverage, 'mnx': mnx, 'mny': mny}

 def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=False):
     """ L-shaped Partial Least Sqaures Regression by the nipals algorithm.
@@ -38,11 +438,7 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', ve
         -1 : nothing
         0 : row center
         1 : column center
-        2 : double center
-    scale : {'scores', 'loads'}, optional
-        Option to decide on where the scale goes.
-    verbose : {boolean}, optional
-        Verbosity of console output. For use in debugging.
+        2 : double center

     *Returns*:

@@ -75,6 +471,13 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', ve
     mnz : {array}
         Z location

+    *OtherParameters*:
+
+    scale : {'scores', 'loads'}, optional
+        Option to decide on where the scale goes.
+    verbose : {boolean}, optional
+        Verbosity of console output. For use in debugging.
+
     *References*
     Saeboe et al., LPLS-regression: a method for improved prediction and
       classification through inclusion of background information on
@@ -296,3 +699,59 @@ def _scale(a, axis):
         raise IOError("input error: axis must be in [-1,0,1]")

     return a - sc, sc
+
+def esvd(data, a_max=None):
+    """ SVD with kernel calculation.
+
+    Calculates subspaces of X'X or XX' depending on the shape
+    of the matrix.
+
+    Parameters:
+
+        data : {array}
+            Data matrix
+        a_max : {integer}
+            Number of components to extract
+
+    Returns:
+
+        u : {array}
+            Left singular vectors
+        s : {array}
+            Singular values
+        v : {array}
+            Right singular vectors
+
+    Notes:
+
+    Uses Arnoldi iterations (ARPACK)
+
+    """
+
+    m, n = data.shape
+    if m>=n:
+        kernel = dot(data.T, data)
+        if a_max==None:
+            a_max = n - 1
+        s, v = arpack.eigen_symmetric(kernel, k=a_max, which='LM',
+                                      maxiter=200, tol=1e-5)
+        # eigenvalues are returned in ascending order
+        s = s[::-1]
+        v = v[:,::-1]
+        #u, s, vt = svd(kernel)
+        #v = vt.T
+        s = sqrt(s)
+        u = dot(data, v)/s
+    else:
+        kernel = dot(data, data.T)
+        if a_max==None:
+            a_max = m - 1
+        s, u = arpack.eigen_symmetric(kernel, k=a_max, which='LM',
+                                      maxiter=200, tol=1e-5)
+        # eigenvalues are returned in ascending order
+        s = s[::-1]
+        u = u[:,::-1]
+        #u, s, vt = svd(kernel)
+        s = sqrt(s)
+        v = dot(data.T, u)/s
+
+    return u, s, v
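
Usage note: a minimal sketch of how the public functions introduced by this patch (pca, pls, pls_jk, diag_cv) might be exercised together; the random data, shapes, and component counts are illustrative assumptions, not part of the change.

# Sketch only: data and parameter choices below are assumptions for illustration.
import numpy
from pyblm import pca, pls
from pyblm.crossvalidation import pls_jk, diag_cv

X = numpy.random.rand(20, 8)    # 20 samples, 8 variables
Y = numpy.random.rand(20, 2)    # 20 samples, 2 responses

dat = pca(X, aopt=3)            # 3-component PCA of X
print dat['evx']                # cumulative explained variance in X

fit = pls(X, Y, aopt=2, center_axis=0)   # 2-component PLS with column centering
print fit['evy']                # explained variance in Y

Wcv = pls_jk(X, Y, a_opt=2, nsets=10)    # jack-knifed loading weights
print Wcv.shape                 # (10, 8, 2)

for cal, val in diag_cv(X.shape, nsets=4):
    # cal/val are flat (row-major) indices into X, picked along shifted diagonals
    pass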