"""This module implements some validation schemes l-pls. The primary use is crossvalidation. """ __all__ = ['lpls_val', 'lpls_jk'] __docformat__ = "restructuredtext en" from numpy import dot,empty,zeros,sqrt,atleast_2d,argmax,asarray,median,\ array_split from engines import nipals_lpls as lpls def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2],verbose=True): """Performs crossvalidation for generalisation error in lpls. The L-PLS crossvalidation is estimated just like an ordinary pls crossvalidation. That is, the generalisation error is estimated by predicting samples (rows of X and Y) left out according to a cross validation scheme. *Parameters*: X : {array} Main data matrix (m, n) Y : {array} External row data (m, l) Z : {array} External column data (n, o) a_max : {integer}, optional Maximum number of components to calculate (0, min(m,n)) nsets : (integer), optional Number of crossvalidation sets alpha : {float}, optional Parameter to control the amount of influence from Z-matrix. 0 is none, which returns a pls-solution, 1 is max mean_center : {array-like}, optional A three element array-like structure with elements in [-1,0,1,2], that decides the type of centering used. -1 : nothing 0 : row center 1 : column center 2 : double center verbose : {boolean}, optional Verbosity of console output. For use in debugging. *Returns*: rmsep : {array} Root mean squred error of prediction yhat : {array} Estimated responses aopt : {integer} Estimated value of optimal number of components """ m, n = X.shape k, l = Y.shape o, p = Z.shape assert m==k, "X (%d,%d) - Y (%d,%d) dim mismatch" %(m,n,k,l) assert n==p, "X (%d,%d) - Z (%d,%d) dim mismatch" %(m,n,o,p) if nsets == None: nsets = m if nsets > X.shape[0]: print "nsets (%d) is larger than number of variables (%d).\nnsets: %d -> %d" %(nsets, m, nsets, m) nsets = m assert (alpha >= 0 and alpha<=1), "Alpha needs to be within [0,1], got: %.2f" %alpha Yhat = empty((a_max, k, l), 'd') for cal, val in cv(nsets, k): dat = lpls(X[cal],Y[cal],Z,a_max=a_max,alpha=alpha,mean_ctr=mean_ctr,verbose=verbose) if mean_ctr[0] != 1: xi = X[val,:] - dat['mnx'] else: xi = X[val] - X[val].mean(1)[:,newaxis] if mean_ctr[2] != 1: ym = dat['mny'] else: ym = Y[val].mean(1)[:,newaxis] #???: check this for a in range(a_max): Yhat[a,val,:] = atleast_2d(ym + dot(xi, dat['B'][a])) # todo: need a better support for classification error y_is_class = Y.dtype.char.lower() in ['i','p', 'b', 'h','?'] if y_is_class: Yhat, err = class_error(Yhat,Y) return Yhat, err sep = (Y - Yhat)**2 rmsep = sqrt(sep.mean(1)).T aopt = find_aopt_from_sep(rmsep) return rmsep, Yhat, aopt def lpls_jk(X, Y, Z, a_max, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2], verbose=False): """Returns jack-knifed segments of lpls model. Jack-knifing is a method to perturb the model paramters, hopefully to be representable as a typical perturbation of a *future* sample. The mean and variance of the jack knife segements may be used to infer the paramter confidence in th model. The segements returned are the X-block weights and Z-block weights. *Parameters*: X : {array} Main data matrix (m, n) Y : {array} External row data (m, l) Z : {array} External column data (n, o) a_max : {integer}, optional Maximum number of components to calculate (0, min(m,n)) nsets : (integer), optional Number of jack-knife segments xz_alpha : {float}, optional Parameter to control the amount of influence from Z-matrix. 
def lpls_jk(X, Y, Z, a_max, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2], verbose=False):
    """Returns jack-knifed segments of the lpls model.

    Jack-knifing is a method to perturb the model parameters, hopefully
    to be representative of a typical perturbation of a *future* sample.
    The mean and variance of the jack-knife segments may be used to
    infer the parameter confidence in the model.

    The segments returned are the X-block weights and Z-block weights.

    *Parameters*:

        X : {array}
            Main data matrix (m, n)
        Y : {array}
            External row data (m, l)
        Z : {array}
            External column data (n, o)
        a_max : {integer}
            Maximum number of components to calculate (0, min(m, n))
        nsets : {integer}, optional
            Number of jack-knife segments
        xz_alpha : {float}, optional
            Parameter to control the amount of influence from the Z-matrix.
            0 is none, which returns a pls-solution, 1 is max
        mean_ctr : {array-like}, optional
            A three element array-like structure with elements in [-1,0,1,2]
            that decides the type of centering used.
            -1 : nothing
            0 : row center
            1 : column center
            2 : double center
        verbose : {boolean}, optional
            Verbosity of console output. For use in debugging.

    *Returns*:

        Wx : {array}
            X-block jack-knife segments
        Wz : {array}
            Z-block jack-knife segments
    """

    m, n = X.shape
    k, l = Y.shape
    o, p = Z.shape
    assert(m == k)
    assert(n == p)
    if nsets == None:
        nsets = m
    WWx = empty((nsets, n, a_max), 'd')
    WWz = empty((nsets, o, a_max), 'd')
    #WWy = empty((nsets, l, a_max), 'd')
    for i, (cal, val) in enumerate(cv(k, nsets)):
        dat = lpls(X[cal], Y[cal], Z, a_max=a_max, alpha=xz_alpha,
                   mean_ctr=mean_ctr, scale='loads', verbose=verbose)
        WWx[i,:,:] = dat['W']
        WWz[i,:,:] = dat['L']
        #WWy[i,:,:] = dat['Q']

    return WWx, WWz


def find_aopt_from_sep(sep, method='vanilla'):
    """Returns an estimate of the optimal number of components.

    The estimate is based on the squared error of prediction from
    crossvalidation. This is pretty much wild guessing and it is
    recommended to inspect model parameters and prediction errors
    closely before deciding on the optimal number of components.

    *Parameters*:

        sep : {array}
            Squared error of prediction
        method : ['vanilla', '75perc']
            Method used to estimate the optimal number of components

    *Returns*:

        aopt : {integer}
            A guess at the optimal number of components
    """

    if method == 'vanilla':
        # min rmsep
        rmsecv = sqrt(sep.mean(0))
        return rmsecv.argmin() + 1

    elif method == '75perc':
        prct = .75 #percentile
        ind = 1.*sep.shape[0]*prct
        med = median(sep, 0)
        prc_75 = []
        for col in sep.T:
            col = sorted(col)
            prc_75.append(col[int(ind)])
        prc_75 = asarray(prc_75)
        # pick the first component where the median error is below the
        # 75th percentile of the next component
        for i in range(1, sep.shape[1], 1):
            if med[i-1] < prc_75[i]:
                return i
        return len(med)


def cv(N, K, randomise=True, sequential=False):
    """Generates K (training, validation) index pairs.

    Each pair is a partition of the indices 0..N-1, where validation
    is an iterable of length ~N/K.

    *Parameters*:

        N : {integer}
            Total number of samples
        K : {integer}
            Number of crossvalidation segments
        randomise : {boolean}, optional
            Use random sampling
        sequential : {boolean}, optional
            Use sequential sampling

    *Returns*:

        training : {array-like}
            Training indices
        validation : {array-like}
            Validation indices

    *Notes*:

        If randomise is true, a copy of the index is shuffled before
        partitioning, and randomise overrides the sequential argument.
        If sequential is true the index is partitioned in contiguous
        blocks, otherwise an interleaved ordering is used.
    """

    if K > N:
        raise ValueError("You cannot divide a list of %d samples into more than %d segments. You tried: %s" %(N, N, K))
    index = xrange(N)
    if randomise:
        from random import shuffle
        index = list(index)
        shuffle(index)
        sequential = False
    if sequential:
        # contiguous blocks of indices
        for validation in array_split(index, K):
            training = [i for i in index if i not in validation]
            yield training, validation
    else:
        # interleaved ordering: every K-th index goes to segment k
        for k in xrange(K):
            training = [i for i in index if i % K != k]
            validation = [i for i in index if i % K == k]
            yield training, validation


def class_error(Yhat, Y, method='vanilla'):
    """ Not used.
    """
    a_max, k, l = Yhat.shape
    Yhat_c = zeros((a_max, k, l), dtype='d')
    for a in range(a_max):
        for i in range(k):
            # winner-takes-all assignment of class membership
            Yhat_c[a,i,argmax(Yhat[a,i,:])] = 1.0
    err = 100*((Yhat_c + Y) == 2).sum(1)/Y.sum(0).astype('d')

    return Yhat_c, err


def class_errorII(T, Y, method='lda'):
    """ Not used ...
    """
    pass
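
# --- Demonstration block (added for illustration, not in the original) ---
# Shows the two deterministic sampling modes of cv() on six samples and
# three segments. Run the module directly to inspect the index pairs;
# this assumes the `engines` import at the top of the module succeeds.
if __name__ == '__main__':
    print "interleaved segments:"
    for cal, val in cv(6, 3, randomise=False, sequential=False):
        print "  cal:", cal, "val:", val
    print "sequential (contiguous) segments:"
    for cal, val in cv(6, 3, randomise=False, sequential=True):
        print "  cal:", cal, "val:", val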