From 4c809674bb6cd3a17b51d0b83e390549d0301a6b Mon Sep 17 00:00:00 2001
From: flatberg
Date: Tue, 27 Nov 2007 15:05:19 +0000
Subject: [PATCH] Mostly clean ups

---
 pyblm/crossvalidation.py |  37 +++++-----
 pyblm/engines.py         | 145 ++++++++++++++++++++++-----------------
 2 files changed, 98 insertions(+), 84 deletions(-)

diff --git a/pyblm/crossvalidation.py b/pyblm/crossvalidation.py
index 762f59b..3d4eb2a 100644
--- a/pyblm/crossvalidation.py
+++ b/pyblm/crossvalidation.py
@@ -12,7 +12,7 @@
 from numpy.random import shuffle

 from engines import nipals_lpls as lpls

-def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2], zorth=False, verbose=True):
+def lpls_val(X, Y, Z, a_max=2, nsets=None, alpha=.5, mean_ctr=[2,0,2], zorth=False, verbose=False):
     """Performs crossvalidation for generalisation error in lpls.

     The L-PLS crossvalidation is estimated just like an ordinary pls
@@ -61,8 +61,8 @@
     m, n = X.shape
     k, l = Y.shape
     o, p = Z.shape
-    assert m==k, "X (%d,%d) - Y (%d,%d) dim mismatch" %(m,n,k,l)
-    assert n==p, "X (%d,%d) - Z (%d,%d) dim mismatch" %(m,n,o,p)
+    assert m == k, "X (%d,%d) - Y (%d,%d) dim mismatch" %(m, n, k, l)
+    assert n == p, "X (%d,%d) - Z (%d,%d) dim mismatch" %(m, n, o, p)
     if nsets == None:
         nsets = m
     if nsets > X.shape[0]:
@@ -80,11 +80,11 @@
         if mean_ctr[0] != 1:
             xi = X[val,:] - dat['mnx']
         else:
-            xi = X[val] - X[val].mean(1)[:,newaxis]
+            xi = X[val] - X[cal].mean(1)[:,newaxis]
         if mean_ctr[2] != 1:
             ym = dat['mny']
         else:
-            ym = Y[val].mean(1)[:,newaxis] #???: check this
+            ym = Y[cal].mean(1)[:,newaxis]
         # predictions
         for a in range(a_max):
             Yhat[a,val,:] = atleast_2d(ym + dot(xi, dat['B'][a]))
@@ -113,7 +113,7 @@ def pca_jk(a, aopt, n_blocks=None):
     """Returns jack-knife segments from PCA.

-    Parameters:
+    *Parameters*:

     a : {array}
        data matrix (n x m)
@@ -122,20 +122,14 @@
     nsets : {integer}
        number of segments

-    Returns:
+    *Returns*:

     Pcv : {array}
        Loadings collected in a three way matrix (n_segments, m, aopt)

-    Notes:
-
-    - The loadings are scaled with the (1/samples)*eigenvalues.
+    *Notes*:

     - Crossvalidation method is currently set to random blocks of samples.
-
-    - todo: add support for T
-
-    - fixme: more efficient to add this in validation loop?

     """
     if nsets == None:
@@ -183,7 +177,7 @@
     """
     m, n = X.shape
     k, l = Y.shape
-    assert(m==k)
+    assert(m == k)
     if nsets == None:
         nsets = X.shape[0]
     Wcv = empty((nsets, X.shape[1], a_opt), dtype=X.dtype)
@@ -242,9 +236,9 @@
     m, n = X.shape
     k, l = Y.shape
     o, p = Z.shape
-    assert(m==k)
-    assert(n==p)
-    if nsets==None:
+    assert(m == k)
+    assert(n == p)
+    if nsets == None:
         nsets = m
     WWx = empty((nsets, n, a_opt), 'd')
     WWz = empty((nsets, o, a_opt), 'd')
@@ -279,12 +273,12 @@
        A guess on the optimal number of components
     """

-    if method=='vanilla':
+    if method == 'vanilla':
         # min rmsep
         rmsecv = sqrt(sep.mean(0))
         return rmsecv.argmin() + 1

-    elif method=='75perc':
+    elif method == '75perc':
         prct = .75 #percentile
         ind = 1.*sep.shape[0]*prct
         med = median(sep)
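The `X[cal]`/`Y[cal]` change in the lpls_val hunk above is the one substantive fix in this file: when a validation segment is predicted, any centering statistics must come from the calibration rows only, otherwise information leaks from the held-out rows into the error estimate. A minimal sketch of that leakage argument, using made-up arrays and column means for simplicity (the names X, cal and val mirror lpls_val; none of this is pyblm code):

    import numpy as np

    X = np.random.rand(10, 5)
    cal = np.arange(8)               # calibration (training) rows
    val = np.arange(8, 10)           # validation rows

    # biased: validation rows centered with their own statistics
    xi_leaky = X[val] - X[val].mean(0)
    # unbiased, as in the fixed code: reuse the calibration means
    xi = X[val] - X[cal].mean(0)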
@@ -305,6 +299,7 @@ def cv(N, K, randomise=True, sequential=False):
     of length ~N/K, *without* replacement.

     *Parameters*:
+
     N : {integer}
        Total number of samples
     K : {integer}
        Number of segments to return

@@ -333,7 +328,7 @@ def cv(N, K, randomise=True, sequential=False):
     otherwise interleaved ordering is used.

     """
-    if N>K:
-        raise ValueError, "You cannot divide a list of %d samples into more than %d segments. Yout tried: %s" %(K, K, N)
+    if K > N:
+        raise ValueError, "You cannot divide a list of %d samples into more than %d segments. You tried: %d" %(N, N, K)
     index = xrange(N)
     if randomise:

diff --git a/pyblm/engines.py b/pyblm/engines.py
index 878b4ca..01412df 100644
--- a/pyblm/engines.py
+++ b/pyblm/engines.py
@@ -9,16 +9,17 @@
 from math import sqrt as msqrt

 from numpy import dot,empty,zeros,apply_along_axis,newaxis,finfo,sqrt,r_,expand_dims,\
      minimum
-from numpy.linalg import inv, svd
+from numpy.linalg import inv,svd
 from scipy.sandbox import arpack

+
 def pca(X, aopt, scale='scores', mode='normal', center_axis=0):
     """ Principal Component Analysis.

     PCA is a low rank bilinear approximation to a data matrix that sequentially
     extracts orthogonal components of maximum variance.

-    Parameters:
+    *Parameters*:

     X : {array}
        Data measurement matrix, (samples x variables)
@@ -27,7 +28,7 @@
     center_axis : {integer}
        Center along given axis. If neg.: no centering (-inf,..., matrix modes)

-    Returns:
+    *Returns*:

     T : {array}
        Scores, (samples, components)
@@ -47,7 +48,7 @@
     leverage : {array}
        Leverages, (samples,)

-    OtherParameters:
+    *OtherParameters*:

     scale : {string}, optional
        Where to put the weights [['scores'], 'loadings']
@@ -55,7 +56,7 @@
     mode : {string}, optional
        Amount of info retained, [['normal'], 'fast', 'detailed']

-    :SeeAlso:
+    *SeeAlso*:

     `center` : Data centering
@@ -78,10 +79,12 @@
     """
     m, n = X.shape
-    assert(aopt<=min(m,n))
-    if center_axis>=0:
+    min_aopt = min(m, n)
+    if center_axis >= 0:
         X = X - expand_dims(X.mean(center_axis), center_axis)
-    if m>(n+100) or n>(m+100):
+        min_aopt = min_aopt - 1
+    assert(aopt <= min_aopt)
+    if m > (n+100) or n > (m+100):
         u, s, v = esvd(X, aopt)
     else:
         u, s, vt = svd(X, 0)
@@ -92,7 +95,7 @@

     # ranktest
     tol = 1e-10
-    eff_rank = sum(s>s[0]*tol)
+    eff_rank = sum(s > s[0]*tol)
     aopt = minimum(aopt, eff_rank)
     T = u*s
     s = s[:aopt]
@@ -124,7 +127,7 @@
         sep = E**2
         ssq = [sep.sum(0), sep.sum(1)]
         # leverages
-        if scale=='loads':
+        if scale == 'loads':
             lev = [(1./m)+(T**2).sum(1), (1./n)+((P/s)**2).sum(1)]
         else:
             lev = [(1./m)+((T/s)**2).sum(1), (1./n)+(P**2).sum(1)]
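The min_aopt bookkeeping introduced above replaces the old blanket assert: centering consumes one degree of freedom along the centered axis, so a column-centered m x n matrix has rank at most min(m - 1, n), and for square-ish matrices one component is lost. A quick numerical check of this, reusing the same rank test that pca itself applies (toy data, plain numpy, not pyblm code):

    import numpy as np

    X = np.random.rand(5, 5)
    Xc = X - X.mean(0)                 # column centering, center_axis=0

    tol = 1e-10
    s = np.linalg.svd(Xc, compute_uv=False)
    eff_rank = (s > s[0]*tol).sum()
    print(eff_rank)                    # 4, i.e. min(m, n) - 1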
@@ -139,7 +142,7 @@

     Performs PCR on given matrix and returns results in a dictionary.

-    Parameters:
+    *Parameters*:

     a : array
        Data measurement matrix, (samples x variables)
@@ -148,19 +151,19 @@
     aopt : int
        Number of components to use, aopt<=min(samples, variables)

-    Returns:
+    *Returns*:

     results : dict
        keys -- values, T -- scores, P -- loadings, E -- residuals,
        levx -- leverages, ssqx -- sum of squares, expvarx -- cumulative
        explained variance, aopt -- number of components used

-    OtherParameters:
+    *OtherParameters*:

-    mode : str
-        Amount of info retained, ('fast', 'normal', 'detailed')
-    center_axis : int
-        Center along given axis.
-        If neg.: no centering (-inf,..., matrix modes)
+    mode : {string}
+       Amount of info retained, ('fast', 'normal', 'detailed')
+    center_axis : {integer}
+       Center along given axis. If neg.: no centering (-inf,..., matrix modes)

     SeeAlso:

@@ -194,21 +197,21 @@
     except:
         b = atleast_2d(b).T
         k, l = b.shape
-    if center_axis>=0:
+    if center_axis >= 0:
         b = b - expand_dims(b.mean(center_axis), center_axis)
     dat = pca(a, aopt=aopt, scale=scale, mode=mode, center_axis=center_axis)
     T = dat['T']
     weights = apply_along_axis(vnorm, 0, T)**2
-    if scale=='loads':
+    if scale == 'loads':
         Q = dot(b.T, T*weights)
     else:
         Q = dot(b.T, T/weights)

-    if mode=='fast':
+    if mode == 'fast':
         dat.update({'Q':Q})
         return dat

-    if mode=='detailed':
+    if mode == 'detailed':
         F = empty((aopt, k, l))
         ssqy = []
         for i in range(aopt):
@@ -284,7 +287,7 @@

     *SeeAlso*:

-    `center` : data centering
+    `center` - data centering

     *Notes*

@@ -310,14 +313,16 @@
     except:
         Y = atleast_2d(Y).T
     k, l = Y.shape
-    assert(m==k)
-    assert(aopt<=min(m,n))
-    if center_axis>=0:
+    assert(m == k)
+    mnx, mny = 0, 0
+    min_aopt = min(m, n)
+    if center_axis >= 0:
         mnx = expand_dims(X.mean(center_axis), center_axis)
         X = X - mnx
+        min_aopt = min_aopt - 1
         mny = expand_dims(Y.mean(center_axis), center_axis)
         Y = Y - mny
+    assert(aopt > 0 and aopt < min_aopt)

     W = empty((n, aopt))
     P = empty((n, aopt))
@@ -329,10 +334,10 @@
     XY = dot(X.T, Y)
     for i in range(aopt):
-        if XY.shape[1]==1: #pls 1
+        if XY.shape[1] == 1: #pls 1
             w = XY.reshape(n, l)
             w = w/vnorm(w)
-        elif n<l: # more yvars than xvars
+        elif n < l: # more yvars than xvars
             w, s, vh = svd(dot(XY, XY.T))
             w = w[:,:1]
         else: # more xvars than yvars
             q, s, vh = svd(dot(XY.T, XY))
             q = q[:,:1]
             w = dot(XY, q)
             w = w/vnorm(w)
         r = w.copy()
-        if i>0:
+        if i > 0:
             for j in range(0, i, 1):
                 r = r - dot(P[:,j].T, w)*R[:,j][:,newaxis]
@@ -356,8 +361,8 @@
         T[:,i] = t.ravel()
         W[:,i] = w.ravel()

-        if mode=='fast' and i==aopt-1:
-            if scale=='loads':
+        if mode == 'fast' and i == (aopt - 1):
+            if scale == 'loads':
                 tnorm = sqrt(tt)
                 T = T/tnorm
                 W = W*tnorm
@@ -371,7 +376,7 @@
     qnorm = apply_along_axis(vnorm, 0, Q)
     tnorm = sqrt(tt)
     pp = (P**2).sum(0)
-    if mode=='detailed':
+    if mode == 'detailed':
         E = empty((aopt, m, n))
         F = empty((aopt, k, l))
         ssqx, ssqy = [], []
@@ -401,7 +406,7 @@
     expvarx = r_[0, 100*tp/(X*X).sum()]
     expvary = r_[0, 100*tq/(Y*Y).sum()]

-    if scale=='loads':
+    if scale == 'loads':
         T = T/tnorm
         W = W*tnorm
         Q = Q*tnorm
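For reference, the kernel step the pls loop above performs for each component: the weight vector is the dominant left singular vector of the cross-product block XY = dot(X.T, Y), the score is its image through X, and X is deflated by the rank-one contribution before the next component. A condensed sketch with made-up data (the names mirror the engine; this is an illustration of the algorithm, not the pyblm implementation):

    import numpy as np

    X = np.random.rand(8, 5)
    Y = np.random.rand(8, 2)

    XY = np.dot(X.T, Y)                  # cross-product block, (n x l)
    w, s, vh = np.linalg.svd(np.dot(XY, XY.T))
    w = w[:, :1]                         # dominant weight direction
    t = np.dot(X, w)                     # scores
    tt = float(np.dot(t.T, t))
    p = np.dot(X.T, t)/tt                # x-loadings
    q = np.dot(Y.T, t)/tt                # y-loadings
    X = X - np.dot(t, p.T)               # deflate, then repeat for the next component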
print "Iterations: %d " %niter print "Error: %.2E" %diff @@ -606,7 +611,7 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 2], scale='scores', zo evy = 100.*(1 - var_y/varY) evz = 100.*(1 - var_z/varZ) - if scale=='loads': + if scale == 'loads': tnorm = apply_along_axis(vnorm, 0, T) T = T/tnorm W = W*tnorm @@ -617,6 +622,20 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 2], scale='scores', zo return {'T':T, 'W':W, 'P':P, 'Q':Q, 'U':U, 'L':L, 'K':K, 'B':B, 'E': E, 'F': F, 'G': G, 'evx':evx, 'evy':evy, 'evz':evz,'mnx': mnX, 'mny': mnY, 'mnz': mnZ} +def lpls_predict(model_dict, x, aopt): + """Predict lpls reponses from existing model on new data. + """ + try: + m, n = x.shape + except: + x = atleast_2d(x.shape) + m, n = x.shape + + if 'B0' in model_dict.keys(): + y = model_dict['B0'] + dot() + + + def vnorm(a): """Returns the norm of a vector. @@ -654,28 +673,28 @@ def center(a, axis): """ # check if we have a vector - is_vec = len(a.shape)==1 + is_vec = len(a.shape) == 1 if not is_vec: - is_vec = a.shape[0]==1 or a.shape[1]==1 + is_vec = a.shape[0] == 1 or a.shape[1] == 1 if is_vec: - if axis==2: + if axis == 2: warnings.warn("Double centering of vecor ignored, using ordinary centering") - if axis==-1: + if axis == -1: mn = 0 else: mn = a.mean() return a - mn, mn # !!!fixme: use broadcasting - if axis==-1: + if axis == -1: mn = zeros((1,a.shape[1],)) #mn = tile(mn, (a.shape[0], 1)) - elif axis==0: + elif axis == 0: mn = a.mean(0)[newaxis] #mn = tile(mn, (a.shape[0], 1)) - elif axis==1: + elif axis == 1: mn = a.mean(1)[:,newaxis] #mn = tile(mn, (1, a.shape[1])) - elif axis==2: + elif axis == 2: mn = a.mean(0)[newaxis] + a.mean(1)[:,newaxis] - a.mean() return a - mn , a.mean(0)[newaxis] else: @@ -702,11 +721,11 @@ def _scale(a, axis): Scaling vector/matrix """ - if axis==-1: + if axis == -1: sc = zeros((a.shape[1],)) - elif axis==0: + elif axis == 0: sc = a.std(0) - elif axis==1: + elif axis == 1: sc = a.std(1)[:,newaxis] else: raise IOError("input error: axis must be in [-1,0,1]") @@ -714,19 +733,19 @@ def _scale(a, axis): return a - sc, sc def esvd(data, a_max=None): - """ SVD with kernel calculation + """SVD with kernel calculation. Calculate subspaces of X'X or XX' depending on the shape of the matrix. - Parameters: + *Parameters*: data : {array} Data matrix a_max : {integer} Number of components to extract - Returns: + *Returns*: u : {array} Right hand eigenvectors @@ -735,20 +754,20 @@ def esvd(data, a_max=None): v : {array} Left hand eigenvectors - notes: + *Notes*: - Uses Anoldi iterations (ARPACK) + Uses Anoldi iterations for the symmetric eigendecomp (ARPACK) """ m, n = data.shape - if m>=n: + if m >= n: kernel = dot(data.T, data) - if a_max==None: + if a_max == None: a_max = n - 1 - s, v = arpack.eigen_symmetric(kernel,k=a_max, which='LM', - maxiter=200,tol=1e-5) + s, v = arpack.eigen_symmetric(kernel, k=a_max, which='LM', + maxiter=200, tol=1e-5) s = s[::-1] v = v[:,::-1] #u, s, vt = svd(kernel) @@ -757,10 +776,10 @@ def esvd(data, a_max=None): u = dot(data, v)/s else: kernel = dot(data, data.T) - if a_max==None: + if a_max == None: a_max = m -1 - s, u = arpack.eigen_symmetric(kernel,k=a_max, which='LM', - maxiter=200,tol=1e-5) + s, u = arpack.eigen_symmetric(kernel, k=a_max, which='LM', + maxiter=200, tol=1e-5) s = s[::-1] u = u[:,::-1] #u, s, vt = svd(kernel)