diff --git a/arpack/arpack.py b/arpack/arpack.py
index 39139c8..d7f069d 100644
--- a/arpack/arpack.py
+++ b/arpack/arpack.py
@@ -30,7 +30,7 @@ class get_matvec:
         if isinstance(obj, sb.ndarray):
             self.callfunc = self.type1
             return
-        meth = getattr(obj,self.methname,None)
+        meth = getattr(obj, self.methname, None)
         if not callable(meth):
             raise ValueError, "Object must be an array "\
                   "or have a callable %s attribute." % (self.methname,)
@@ -48,12 +48,12 @@ class get_matvec:
         return sb.dot(self.obj.A, x)
 
     def type2(self, x):
-        return self.obj(x,*self.args)
+        return self.obj(x, *self.args)
 
-def eigen(A,k=6,M=None,ncv=None,which='LM',
-          maxiter=None,tol=0, return_eigenvectors=True):
-    """ Return k eigenvalues and eigenvectors of the matrix A.
+def eigen(A, k=6, M=None, ncv=None, which='LM',
+          maxiter=None, tol=0, return_eigenvectors=True, v0=None):
+    """Return k eigenvalues and eigenvectors of the matrix A.
 
     Solves A * x[i] = w[i] * x[i], the standard eigenvalue problem for
     w[i] eigenvalues with corresponding eigenvectors x[i].
@@ -69,6 +69,7 @@ def eigen(A,k=6,M=None,ncv=None,which='LM',
     M -- (Not implemented)
          A symmetric positive-definite matrix for the generalized
          eigenvalue problem A * x = w * M * x
+    v0 -- Initial starting vector (n x 1)
 
     Outputs:
 
@@ -99,8 +100,8 @@ def eigen(A,k=6,M=None,ncv=None,which='LM',
     """
     try:
-        n,ny=A.shape
-        n==ny
+        n, ny = A.shape
+        assert n == ny
     except:
         raise AttributeError("matrix is not square")
     if M is not None:
@@ -108,12 +109,12 @@ def eigen(A,k=6,M=None,ncv=None,which='LM',
 
     # some defaults
     if ncv is None:
-        ncv=2*k+1
-        ncv=min(ncv,n)
-    if maxiter==None:
-        maxiter=n*10
+        ncv = 2*k + 1
+        ncv = min(ncv, n)
+    if maxiter is None:
+        maxiter = n*10
 
-    # guess type 
+    # guess type
     resid = sb.zeros(n,'f')
     try:
         typ = A.dtype.char
@@ -129,7 +130,7 @@ def eigen(A,k=6,M=None,ncv=None,which='LM',
         raise ValueError("k must be less than rank(A), k=%d"%k)
     if maxiter <= 0:
         raise ValueError("maxiter must be positive, maxiter=%d"%maxiter)
-    whiches=['LM','SM','LR','SR','LI','SI']
+    whiches = ['LM','SM','LR','SR','LI','SI']
     if which not in whiches:
         raise ValueError("which must be one of %s"%' '.join(whiches))
     if ncv > n or ncv < k:
@@ -141,17 +142,26 @@ def eigen(A,k=6,M=None,ncv=None,which='LM',
     eigextract = _arpack.__dict__[ltr+'neupd']
 
     matvec = get_matvec(A)
-    v = sb.zeros((n,ncv),typ) # holds Ritz vectors
-    resid = sb.zeros(n,typ) # residual
-    workd = sb.zeros(3*n,typ) # workspace
-    workl = sb.zeros(3*ncv*ncv+6*ncv,typ) # workspace
-    iparam = sb.zeros(11,'int') # problem parameters
-    ipntr = sb.zeros(14,'int') # pointers into workspaces
-    info = 0
+    v = sb.zeros((n, ncv), typ)  # holds Ritz vectors
+    if v0 is None:
+        resid = sb.zeros(n, typ)  # residual
+        info = 0
+    else:  # starting vector is given
+        nn, kk = v0.shape
+        if nn != n:
+            raise ValueError("starting vector must be (%d, 1), got: (%d, %d)" % (n, nn, kk))
+        resid = v0[:,0].astype(typ)
+        info = 1
+
+    workd = sb.zeros(3*n, typ)  # workspace
+    workl = sb.zeros(3*ncv*ncv+6*ncv, typ)  # workspace
+    iparam = sb.zeros(11, 'int')  # problem parameters
+    ipntr = sb.zeros(14, 'int')  # pointers into workspaces
+    ido = 0
 
     if typ in 'FD':
-        rwork = sb.zeros(ncv,typ.lower())
+        rwork = sb.zeros(ncv, typ.lower())
 
     # only supported mode is 1: Ax=lx
     ishfts = 1
@@ -160,7 +170,7 @@ def eigen(A,k=6,M=None,ncv=None,which='LM',
     iparam[0] = ishfts
     iparam[2] = maxiter
     iparam[6] = mode1
-    
+
     while True:
         if typ in 'fd':
             ido,resid,v,iparam,ipntr,info =\
@@ -173,9 +183,9 @@ def eigen(A,k=6,M=None,ncv=None,which='LM',
 
         if (ido == -1 or ido == 1):
             # compute y = A * x
-            xslice = slice(ipntr[0]-1, ipntr[0]-1+n)
-            yslice = slice(ipntr[1]-1, ipntr[1]-1+n)
-            workd[yslice]=matvec(workd[xslice])
+            xslice = slice(ipntr[0] - 1, ipntr[0] - 1 + n)
+            yslice = slice(ipntr[1] - 1, ipntr[1] - 1 + n)
+            workd[yslice] = matvec(workd[xslice])
         else: # done
             break
 
@@ -233,7 +243,7 @@ def eigen(A,k=6,M=None,ncv=None,which='LM',
 
 def eigen_symmetric(A,k=6,M=None,ncv=None,which='LM',
-                    maxiter=None,tol=0, return_eigenvectors=True):
+                    maxiter=None,tol=0, return_eigenvectors=True, v0=None):
     """ Return k eigenvalues and eigenvectors of the real symmetric matrix A.
 
     Solves A * x[i] = w[i] * x[i], the standard eigenvalue problem for
@@ -253,6 +263,8 @@ def eigen_symmetric(A,k=6,M=None,ncv=None,which='LM',
          A symmetric positive-definite matrix for the generalized
          eigenvalue problem A * x = w * M * x
 
+    v0 -- Starting vector (n, 1)
+
     Outputs:
 
     w -- An real array of k eigenvalues
@@ -325,12 +337,22 @@ def eigen_symmetric(A,k=6,M=None,ncv=None,which='LM',
     matvec = get_matvec(A)
     v = sb.zeros((n,ncv),typ)
-    resid = sb.zeros(n,typ)
+    if v0 is None:
+        resid = sb.zeros(n, typ)  # residual
+        info = 0
+    else:  # starting vector is given
+        nn, kk = v0.shape
+        if nn != n:
+            raise ValueError("starting vector must be (%d, 1), got: (%d, %d)" % (n, nn, kk))
+        resid = v0[:,0].astype(typ)
+        info = 1
+
     workd = sb.zeros(3*n,typ)
     workl = sb.zeros(ncv*(ncv+8),typ)
     iparam = sb.zeros(11,'int')
     ipntr = sb.zeros(11,'int')
-    info = 0
     ido = 0
 
     # only supported mode is 1: Ax=lx
@@ -340,8 +362,7 @@ def eigen_symmetric(A,k=6,M=None,ncv=None,which='LM',
     iparam[0] = ishfts
     iparam[2] = maxiter
     iparam[6] = mode1
-
-
+
     while True:
         ido,resid,v,iparam,ipntr,info =\
             eigsolver(ido,bmat,which,k,tol,resid,v,iparam,ipntr,
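Review note: a usage sketch for the new `v0` warm-start argument (illustrative only, not part of the patch; assumes this module imports as `arpack` and that `eigen` returns (w, v) as documented):

    import numpy as np
    from arpack import eigen

    A = np.random.rand(50, 50)
    A = A + A.T                            # symmetric test matrix
    w, v = eigen(A, k=4)                   # cold start
    w2, v2 = eigen(A, k=4, v0=v[:, :1])    # warm start from the first Ritz vector

Note that `v0` must be a 2-d (n, 1) array; a flat (n,) vector will fail the `nn, kk = v0.shape` unpacking.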
diff --git a/pyblm/crossvalidation.py b/pyblm/crossvalidation.py
index fd96904..364bf4e 100644
--- a/pyblm/crossvalidation.py
+++ b/pyblm/crossvalidation.py
@@ -2,16 +2,149 @@
 The primary use is crossvalidation.
 """
 
-__all__ = ['lpls_val', 'pls_jk', 'lpls_jk']
+__all__ = ['pca_val', 'pls_val', 'lpls_val', 'pls_jk', 'lpls_jk']
 __docformat__ = "restructuredtext en"
 
 from numpy import dot,empty,zeros,sqrt,atleast_2d,argmax,asarray,median,\
-     array_split,arange, isnan, any
+     array_split,arange, isnan, any, newaxis
 from numpy.random import shuffle
 
-from engines import pls
+from engines import pls, pca
 from engines import nipals_lpls as lpls
 
+def pca_val(a, a_max, nsets=None, center_axis=[0]):
+    """Returns error estimates of crossvalidated PCA.
+
+    *Parameters*:
+
+        a : {array}
+            data matrix (n x m)
+        a_max : {integer}
+            Maximum number of components in model.
+        nsets : {integer}
+            number of segments
+        center_axis : {array-like}
+            Centering
+
+    *Returns*:
+
+        rmsep : {array}
+            Root mean square error of prediction for each component
+            and x-variable (a_max, m)
+        xhat : {array}
+            Crossvalidated predicted a (a_max, n, m)
+        aopt : {integer}
+            Estimate of optimal number of components
+        err : {array}
+            Squared prediction error per element (a_max, n, m)
+        rmsep2 : {array}
+            Root mean square error of prediction against the
+            calibration-model mean (a_max, m)
+
+    *Notes*:
+
+        - Crossvalidation method is currently set to random blocks of diagonals.
+
+    """
+    n, m = a.shape
+    if nsets is None:
+        nsets = n
+
+    err = zeros((a_max, n, m), dtype=a.dtype)
+    err_mn = zeros((a_max, n, m), dtype=a.dtype)
+    xhat = zeros((a_max, n, m), dtype=a.dtype)
+    mn_a = .5*(a.mean(0) + a.mean(1)[:,newaxis])
+
+    for i, val in enumerate(diag_cv(a.shape, nsets)):
+        old_values = a.take(val)
+        new_values = mn_a.take(val)
+        # impute mean values for the held-out diagonals
+        b = a.copy()
+        a.put(val, new_values)
+        dat = pca(a, a_max, mode='normal', center_axis=center_axis)
+        Ti, Pi = dat['T'], dat['P']
+        bc = b - dat['mnx']
+        bc2 = b - b.mean(0)
+        for j in xrange(a_max):
+            # predict the imputed values
+            a_pred = dot(Ti[:,:j+1], Pi[:,:j+1].T).take(val)
+            a_true = bc2.take(val)
+            err[j,:,:].put(val, (a_true - a_pred)**2)
+            err_mn[j,:,:].put(val, (bc.take(val) - a_pred)**2)
+            xhat[j,:,:].put(val, a_pred)
+
+        # put original values back
+        a.put(val, old_values)
+
+    rmsep = sqrt(err).mean(1) # take mean over samples
+    rmsep2 = sqrt(err_mn).mean(1)
+    aopt = rmsep.mean(-1).argmin()
+    return rmsep, xhat, aopt, err, rmsep2
+
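Review note: usage sketch for the new pca_val (illustrative; assumes the module imports as `pyblm.crossvalidation`). Note that pca_val temporarily imputes values into `a` in place and restores them afterwards, so `a` must be a writable array:

    import numpy as np
    from pyblm.crossvalidation import pca_val

    a = np.random.rand(20, 8)
    rmsep, xhat, aopt, err, rmsep2 = pca_val(a, a_max=5, nsets=10)
    print "suggested number of components:", aopt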
+ + """ + n, m = a.shape + if nsets == None: + nsets = n + + err = zeros((a_max, n, m), dtype=a.dtype) + err_mn = zeros((a_max, n, m), dtype=a.dtype) + xhat = zeros((a_max, n, m), dtype=a.dtype) + mn_a = .5*(a.mean(0) + a.mean(1)[:,newaxis]) + + for i, val in enumerate(diag_cv(a.shape, nsets)): + old_values = a.take(val) + new_values = mn_a.take(val) + # impute mean values + b = a.copy() + a.put(val, new_values) + dat = pca(a, a_max, mode='normal', center_axis=center_axis) + Ti, Pi = dat['T'], dat['P'] + bc = b - dat['mnx'] + bc2 = b - b.mean(0) + for j in xrange(a_max): + # predict the imputed values + a_pred = dot(Ti[:,:j+1], Pi[:,:j+1].T).take(val) + a_true = bc2.take(val) + err[j,:,:].put(val, (a_true - a_pred)**2) + err_mn[j,:,:].put(val, (bc.take(val) - a_pred)**2) + xhat[j,:,:].put(val, a_pred) + + # put original values back + a.put(val, old_values) + + rmsep = sqrt(err).mean(1) # take mean over samples + rmsep2 = sqrt(err_mn).mean(1) + aopt = rmsep.mean(-1).argmin() + return rmsep, xhat, aopt, err, rmsep2 + +def pls_val(X, Y, a_max=2, nsets=None, center_axis=[0,0], verbose=False): + """Performs crossvalidation for generalisation error in pls. + + + + *Parameters*: + + X : {array} + Main data matrix (m, n) + Y : {array} + External row data (m, l) + a_max : {integer}, optional + Maximum number of components to calculate (0, min(m,n)) + nsets : (integer), optional + Number of crossvalidation sets + center_axis : {array-like}, optional + A three element array-like structure with elements in [-1,0,1,2], + that decides the type of centering used. + -1 : nothing + 0 : row center + 1 : column center + 2 : double center + verbose : {boolean}, optional + Verbosity of console output. For use in debugging. + + *Returns*: + + rmsep : {array} + Root mean squred error of prediction (for each y-var) + yhat : {array} + Estimated responses + aopt : {integer} + Estimated value of optimal number of components + + """ + dt = X.dtype + m, n = X.shape + k, l = Y.shape + assert m == k, "X (%d,%d) - Y (%d,%d) dim mismatch" %(m, n, k, l) + assert n == p, "X (%d,%d) - Z (%d,%d) dim mismatch" %(m, n, o, p) + if nsets == None: + nsets = m + if nsets > X.shape[0]: + print "nsets (%d) is larger than number of variables (%d).\nnsets: %d -> %d" %(nsets, m, nsets, m) + nsets = m + if m > n: + # kernel boosting + Yhat = _w_pls_predict(X, Y, a_max) + + Yhat = empty((a_max, k, l), dtype=dt) + for cal, val in cv(k, nsets): + # do the training model + dat = pls(X[cal], Y[cal], a_max=a_max,center_axis=center_axis) + + # center test data + xi = X[val,:] - dat['mnx'] + ym = dat['mny'] + + # predictions + for a in range(a_max): + Yhat[a,val,:] = ym + dot(xi, dat['B'][a]) + + sep = (Y - Yhat)**2 + rmsep = sqrt(sep.mean(1)).T + #aopt = find_aopt_from_sep(rmsep) + + # todo: need a better support for classification error + error = prediction_error(Yhat, Y, method='1/2') + + return rmsep, Yhat, error def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=False, verbose=False): """Performs crossvalidation for generalisation error in lpls. 
@@ -72,47 +205,29 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=F
     assert (alpha >= 0 and alpha <= 1), "Alpha needs to be within [0,1], got: %.2f" % alpha
 
     Yhat = empty((a_max, k, l), 'd')
-    for cal, val in cv(nsets, k):
+    for cal, val in cv(k, nsets):
         # do the training model
         dat = lpls(X[cal], Y[cal], Z, a_max=a_max, alpha=alpha,
                    center_axis=center_axis, zorth=zorth, verbose=verbose)
 
         # center test data
-        if center_axis[0] != 1:
-            xi = X[val,:] - dat['mnx']
-        else:
-            xi = X[val] - X[cal].mean(1)[:,newaxis]
-        if center_axis[2] != 1:
-            ym = dat['mny']
-        else:
-            ym = Y[cal].mean(1)[:,newaxis]
+        xi = X[val,:] - dat['mnx']
+
+        ym = dat['mny'][val,:]
 
         # predictions
         for a in range(a_max):
             Yhat[a,val,:] = atleast_2d(ym + dot(xi, dat['B'][a]))
 
-        #if permute:
-        #    xcal = X[cal]
-        #    for a in range(1,a_max,1):
-        #        for n in range(10):
-        #            shuffle(cal)
-        #            dat = lpls(xcal, Y[cal], Z, a_max=a_max, alpha=alpha,
-        #                       center_axis=center_axis, verbose=verbose)
-
-    # todo: need a better support for classification error
-    y_is_class = Y.dtype.char.lower() in ['i','p', 'b', 'h','?']
-    if y_is_class:
-        pass
-        #Yhat, err = class_error(Yhat, Y)
-        #return Yhat, err
-
     sep = (Y - Yhat)**2
     rmsep = sqrt(sep.mean(1)).T
-    aopt = find_aopt_from_sep(rmsep)
+    #aopt = find_aopt_from_sep(rmsep)
 
-    return rmsep, Yhat, aopt
+    # todo: need a better support for classification error
+    error = prediction_error(Yhat, Y, method='1/2')
+
+    return rmsep, Yhat, error
 
-def pca_jk(a, aopt, center_axis=[0], nsets=None):
-    """Returns jack-knife segements from PCA.
+def pca_jk(a, aopt, nsets=None, center_axis=[0], method='cv'):
+    """Returns jack-knife segments from PCA.
 
     *Parameters*:
 
@@ -124,6 +239,12 @@
             Centering
         nsets : {integer}
             number of segments
+        method : {'cv', 'diag', 'bs', 'bs_diag'}
+            Perturbation method is one of:
+            cv = leave samples out
+            diag = impute diagonals
+            bs = leave samples out with replacement (bootstrap)
+            bs_diag = impute diagonals, with replacement (bootstrap)
 
     *Returns*:
 
@@ -135,22 +256,32 @@
         - Crossvalidation method is currently set to random blocks of samples.
 
     """
+    m, n = a.shape
     if nsets == None:
-        nsets = a.shape[0]
+        nsets = m
     Pcv = empty((nsets, a.shape[1], aopt), dtype=a.dtype)
-    mn_a = .5*(a.mean(0) + a.mean(1)[:,newaxis])
-    for i, (cal, val) in enumerate(cv_diag(a.shape, nsets)):
-        old_values = a.take(ind)
-        new_values = mn_a.take(ind)
-        a.put(ind, new_values)
-        dat = pca(a, aopt, mode='fast', scale='loads', center_axis=center_axis)
-        PP[i,:,:] = dat['P']
-        a.put(ind, old_values)
+    if method == 'diag':
+        mn_a = .5*(a.mean(0) + a.mean(1)[:,newaxis])
+        for i, val in enumerate(diag_cv(a.shape, nsets)):
+            old_values = a.take(val)
+            new_values = mn_a.take(val)
+            # impute mean values
+            a.put(val, new_values)
+            dat = pca(a, aopt, mode='fast', scale='loads', center_axis=center_axis)
+            Pcv[i,:,:] = dat['P']
+            # put original values back
+            a.put(val, old_values)
+    elif method == 'cv':
+        for i, (cal, val) in enumerate(cv(m, nsets)):
+            Pcv[i,:,:] = pca(a[cal,:], aopt, mode='fast', scale='loads', center_axis=center_axis)['P']
+    else:
+        raise NotImplementedError(method)
 
-    return PP
+    return Pcv
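Review note: the two pca_jk perturbation modes side by side (illustrative; reuses the matrix `a` from the pca_val example above):

    from pyblm.crossvalidation import pca_jk

    Pcv = pca_jk(a, aopt=3, nsets=10, method='cv')        # leave-samples-out loadings
    Pcv_diag = pca_jk(a, aopt=3, nsets=10, method='diag') # diagonal-imputation loadings

Both return perturbed loading matrices of shape (nsets, n_vars, aopt), suitable as the `Pcv` input to statistics.hotelling.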
 
-def pls_jk(X, Y, a_opt, nsets=None, center_axis=True, verbose=False):
+def pls_jk(X, Y, a_opt, nsets=None, center_axis=[0,0], verbose=False):
     """ Returns jack-knife segments of W.
 
     *Parameters*:
 
@@ -187,9 +318,7 @@ def pls_jk(X, Y, a_opt, nsets=None, center_axis=[0,0], verbose=False):
     for i, (cal, val) in enumerate(cv(k, nsets)):
         if verbose:
             print "Segment number: %d" % i
-        dat = pls(X[cal,:], Y[cal,:], a_opt, scale='loads', mode='fast', center_axis=[0, 0])
-        if any(isnan(dat['W'])):
-            1/0
+        dat = pls(X[cal,:], Y[cal,:], a_opt, scale='loads', mode='fast', center_axis=center_axis)
         Wcv[i,:,:] = dat['W']
 
     return Wcv
 
@@ -257,18 +386,18 @@ def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, center_axis=[2,0,2], zorth=
     return WWx, WWz
 
-def find_aopt_from_sep(sep, method='vanilla'):
+def find_aopt_from_sep(err, method='vanilla'):
     """Returns an estimate of optimal number of components.
 
-    The estimate is based on the squared error of prediction from
+    The estimate is based on the error of prediction from
     crossvalidation. This is pretty much wild guessing and it is
     recomended to inspect model parameters and prediction errors
     closely before deciding on the optimal number of components.
 
     *Parameters*:
 
-    sep : {array}
-        Squared error of prediction
+    err : {array}
+        Error of prediction
     method : ['vanilla', '75perc']
         Mehtod used to estimate optimal number of components
 
@@ -280,22 +409,23 @@ def find_aopt_from_sep(err, method='vanilla'):
 
     if method == 'vanilla':
         # min rmsep
-        rmsecv = sqrt(sep.mean(0))
+        rmsecv = sqrt(err.mean(0))
         return rmsecv.argmin() + 1
 
     elif method == '75perc':
-        prct = .75 #percentile
-        ind = 1.*sep.shape[0]*prct
-        med = median(sep)
-        prc_75 = []
-        for col in sep.T:
-            col = sorted(col)
-            prc_75.append(col[int(ind)])
-        prc_75 = asarray(prc_75)
-        for i in range(1, sep.shape[1], 1):
-            if med[i-1]<prc_75[i]:
-                return i
-        return sep.shape[1]
+        prct = .75 #percentile
+        ind = 1.*err.shape[0]*prct
+        med = median(err)
+        prc_75 = []
+        for col in err.T:
+            col = sorted(col)
+            prc_75.append(col[int(ind)])
+        prc_75 = asarray(prc_75)
+        for i in range(1, err.shape[1], 1):
+            if med[i-1]<prc_75[i]:
+                return i
+        return err.shape[1]
 
-def cv_diag(shape, nsets=9):
+def diag_cv(shape, nsets=9):
     """Yields diagonal blocks of flat (row-major) matrix indices for crossvalidation.
     """
     m, n = shape
     if nsets>m or nsets>n:
         msg = "You may not use more subsets than max(n_rows, n_cols)"
-        raise ValueError, msg
+        nsets = min(m, n)
     nm = n*m
     index = arange(nm)
-    n_ind = arange(n)
-    shuffle(n_ind) # random start diag
+    n_ind = arange(n+1)
+    #shuffle(n_ind) # random start diag
     start_inds = array_split(n_ind, nsets)
     for v in range(nsets):
-        validation = []
-        for start in start_inds[v]:
-            ind = arange(start+v, nm, n+1)
-            [validation.append(i) for i in ind]
-        training = [j for j in index if j not in validation]
-        yield training, validation
+        validation = set()
+        for start in start_inds[v]:
+            ind = arange(start, nm, n+1)
+            validation.update(ind)
+        yield list(validation)
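Review note: the diagonal segments can be inspected directly; each yielded element is a list of flat, row-major indices into the matrix (illustrative, given the patched diag_cv above):

    from pyblm.crossvalidation import diag_cv

    for val in diag_cv((4, 4), nsets=2):
        print val    # flat indices of a block of wrapped diagonals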
 
-def class_error(y_hat, y, method='vanilla'):
-    """ Not used.
-    """
-    a_opt, k, l = y_hat.shape
-    y_hat_c = zeros((k, l), dtype='d')
-    if method == vanilla:
-        pass
-    for a in range(a_opt):
-        for i in range(k):
-            y_hat_c[a, val, argmax(y_hat[a,val,:])] = 1.0
-    err = 100*((y_hat_c + y) == 2).sum(1)/y.sum(0).astype('d')
-
-    return y_hat_c, err
-
 def prediction_error(y_hat, y, method='squared'):
     """Loss function on multiclass Y.
 
     Assumes y is a binary dummy class matrix (samples, classes)
     """
     k, l = y.shape
-
-    if method == 'hinge':
-        pass
-    elif method == 'smooth_hinge':
-        z = 90
-    elif method == 'abs':
-        err = abs(y - y_hat)
-    elif method == 'squared':
-        err = (y - y_hat)**2
-    elif method == '0/1':
-        pred = zeros_like(y_hat)
-        for i, row in enumerate(y_hat):
-            largest = row.argsort()[-1]
-            pred[i, largest] = 1.
-        err = abs(y - pred)
-    elif method == '1/2':
-        y_hat[y_hat>.5] = 1
-        y_hat[y_hat<.5] = 0
-        err = abs(y - y_hat)
+    a_max, kk, ll = y_hat.shape
+    error = empty((a_max, l))
+    for a in range(a_max):
+        yha = y_hat[a, :, :]
+        if method == 'hinge':
+            err = zeros((k, l))
+            for j in range(l):
+                for i in range(k):
+                    if y[i,j] == 1:
+                        if yha[i,j] >= 1:
+                            err[i,j] = 0
+                        else:
+                            err[i,j] = abs(y[i,j] - yha[i,j])
+                    elif y[i,j] == 0:
+                        if yha[i,j] <= 0:
+                            err[i,j] = 0
+                        else:
+                            err[i,j] = abs(y[i,j] - yha[i,j])
+
+        elif method == 'smooth_hinge':
+            err = zeros((k, l))
+            for j in range(l):
+                for i in range(k):
+                    if y[i,j] == 1:
+                        if yha[i,j] >= 1:
+                            err[i,j] = 0
+                        elif yha[i,j] < 1 and yha[i,j] > 0:
+                            err[i,j] = abs(y[i,j] - yha[i,j])
+                        else:
+                            err[i,j] = 1
+                    elif y[i,j] == 0:
+                        if yha[i,j] <= 0:
+                            err[i,j] = 0
+                        elif yha[i,j] < 1 and yha[i,j] > 0:
+                            err[i,j] = abs(y[i,j] - yha[i,j])
+                        else:
+                            err[i,j] = 1
+
+        elif method == 'abs':
+            err = abs(y - yha)
+        elif method == 'squared':
+            err = (y - yha)**2
+        elif method == '0/1':
+            pred = zeros((k, l))
+            for i, row in enumerate(yha):
+                largest = row.argsort()[-1]
+                pred[i, largest] = 1.
+            err = abs(y - pred)
+        elif method == '1/2':
+            yh = yha.copy()
+            yh[yha>.5] = 1
+            yh[yha<.5] = 0
+            err = abs(y - yh)
+        else:
+            raise ValueError("Option: %s (method) not valid" % method)
 
-    return err
+        error[a,:] = err.mean(0)
+
+    return error
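Review note: the per-element hinge loops above can be expressed with array operations. A minimal vectorized sketch of the same hinge rule (illustrative; `hinge_err` is a hypothetical helper, not part of the patch):

    import numpy as np

    def hinge_err(y, yha):
        # 0 where the prediction is on the correct side of the margin,
        # abs(y - yha) otherwise -- same rule as the loops above
        correct = ((y == 1) & (yha >= 1)) | ((y == 0) & (yha <= 0))
        return np.where(correct, 0.0, np.abs(y - yha))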
+def _wkernel_pls_val(X, Y, a_max, n_blocks=None):
+    """Returns rmsep and aopt for pls tailored for wide X.
+
+    The error of cross validation is calculated
+    based on random block cross-validation. With number of blocks equal to
+    number of samples [default] this gives leave-one-out cv.
+    The pls model is based on the simpls algorithm for wide X, and is quite
+    fast for very high dimensional X data.
+
+    *Parameters*:
+
+        X : ndarray
+            column centered data matrix of size (samples x variables)
+        Y : ndarray
+            column centered response matrix of size (samples x responses)
+        a_max : scalar
+            Maximum number of components
+        n_blocks : scalar
+            Number of blocks in cross validation
+
+    *Returns*:
+
+        rmsep : ndarray
+            Root Mean Square Error of cross-validated Predictions
+        aopt : scalar
+            Guestimate of the optimal number of components
+
+    *SeeAlso*:
+
+        - pls_cv_val : Same output, not optimised for wide X
+        - w_simpls : Simpls algorithm for wide X
+
+    *Notes*:
+
+        - Based (cowardly translated) on m-files from the Chemoact toolbox
+        - X, Y inputs need to be centered (fixme: check)
+
+    """
+    dt = X.dtype
+    k, l = m_shape(Y)
+    PRESS = zeros((l, a_max+1), dtype=dt)
+    if n_blocks is None:
+        n_blocks = Y.shape[0]
+    XXt = dot(X, X.T)
+    V = w_pls_gen(XXt, Y, n_blocks=n_blocks, center=True)
+    for Din, Doi, Yin, Yout in V:
+        ym = -sum(Yout, 0)[newaxis]/(1.0*Yin.shape[0])
+        PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)
+
+        dat = w_simpls(Din, Yin, a_max)
+        Q, U, H = dat['Q'], dat['U'], dat['H']
+        That = dot(Doi, dot(U, inv(triu(dot(H.T, U))) ))
+
+        Yhat = zeros((a_max, k, l), dtype=dt)
+        for j in range(l):
+            TQ = dot(That, triu(dot(Q[j,:][:,newaxis], ones((1,a_max)))) )
+            Yhat[:,:,j] = TQ
+            E = Yout[:,j][:,newaxis] - TQ
+            E = E + sum(E, 0)/Din.shape[0]
+            PRESS[j,1:] = PRESS[j,1:] + sum(E**2, 0)
+    #Yhat = Yin - dot(That,Q.T)
+    msep = PRESS/(Y.shape[0])
+    #aopt = find_aopt_from_sep(msep)
+    return Yhat, sqrt(msep)
+
+def _w_pls(aat, b, aopt):
+    """ Pls for wide matrices.
+    Fast pls for crossval, used in calc rmsep for wide X.
+    There is no P or W. T is normalised.
+
+    aat = centered kernel matrix
+    b = centered y
+    """
+    bb = b.copy()
+    k, l = m_shape(b)
+    m, m = m_shape(aat)
+    U = empty((m, aopt)) # W
+    T = empty((m, aopt))
+    R = empty((m, aopt)) # R
+    PROJ = empty((m, aopt)) # P?
+
+    for i in range(aopt):
+        if has_sym:
+            s, q = symeig(dot(dot(b.T, aat), b), range=(l,l), overwrite=True)
+        else:
+            q, s, vh = svd(dot(dot(b.T, aat), b), full_matrices=0)
+            q = q[:,:1]
+        u = dot(b, q) #y-factor scores
+        U[:,i] = u.ravel()
+        t = dot(aat, u)
+        t = t/vnorm(t)
+        T[:,i] = t.ravel()
+        r = dot(aat, t) #score-weights
+        #r = r/vnorm(r)
+        R[:,i] = r.ravel()
+        PROJ[:,:i+1] = dot(T[:,:i+1], inv(dot(T[:,:i+1].T, R[:,:i+1])))
+        if i<aopt-1:
+            b = b - dot(PROJ[:,:i+1], dot(R[:,:i+1].T, b))
+    return T, U
diff --git a/pyblm/engines.py b/pyblm/engines.py
--- a/pyblm/engines.py
+++ b/pyblm/engines.py
@@ -100,6 +100,8 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
+    max_aopt = min(m, n)
+    if aopt > max_aopt:
+        print "Using aopt: %d" % max_aopt
+        aopt = max_aopt
     if m > (n+100) or n > (m+100):
         u, s, v = esvd(X, aopt)
     else:
@@ -108,7 +110,7 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
         P = P*s
 
     if mode in ['fast', 'f', 'F']:
-        return {'T':T, 'P':P, 'aopt':aopt}
+        return {'T':T, 'P':P, 'aopt':aopt, 'mnx': mnx}
 
     if mode in ['detailed', 'd', 'D']:
         E = empty((aopt, m, n))
@@ -135,7 +137,7 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
     expvarx = r_[0, 100*e.cumsum()/(X*X).sum()]
 
     return {'T': T, 'P': P, 'E': E, 'evx': expvarx, 'leverage': lev, 'ssqx': ssq,
-            'aopt': aopt, 'eigvals': e}
+            'aopt': aopt, 'eigvals': e, 'mnx': mnx}
 
 def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=[0, 0]):
     """ Principal Component Regression.
@@ -690,39 +692,80 @@ def center(a, axis):
     ### TODO ###
     # Perhaps not(!) use broadcasting and return full centering
     # matrix instead ?
 
-    # check if we have a vector
     is_vec = len(a.shape) == 1
     if not is_vec:
         is_vec = a.shape[0] == 1 or a.shape[1] == 1
     if is_vec:
         if axis == 2:
-            warnings.warn("Double centering of vector ignored, using ordinary centering")
+            raise ValueError("Double centering of a vector is not a valid option")
         if axis == -1:
-            mn = 0
+            mn = zeros(a.shape)
        else:
-            mn = a.mean()
+            mn = a.mean()*ones(a.shape)
         return a - mn, mn
 
     if axis == -1:
         mn = zeros((1,a.shape[1],))
-        #mn = tile(mn, (a.shape[0], 1))
+        mn = tile(mn, (a.shape[0], 1))
     elif axis == 0:
         mn = a.mean(0)[newaxis]
-        #mn = tile(mn, (a.shape[0], 1))
+        mn = tile(mn, (a.shape[0], 1))
     elif axis == 1:
         mn = a.mean(1)[:,newaxis]
-        #mn = tile(mn, (1, a.shape[1]))
+        mn = tile(mn, (1, a.shape[1]))
     elif axis == 2:
         #fixme: double centering returns column mean as loc-vector, ok?
         mn = a.mean(0)[newaxis] + a.mean(1)[:,newaxis] - a.mean()
-        return a - mn , a.mean(0)[newaxis]
+        return a - mn, mn
     else:
         raise IOError("input error: axis must be in [-1,0,1,2]")
 
     return a - mn, mn
 
+def inv_center(a, mn_a, axis):
+    """Inverse centering.
+
+    Adds back row, column or double centering to a matrix.
+    This method uses broadcasting, so the size of a needs only
+    to match the axis argument.
+
+    *Parameters*:
+
+        a : {array}
+            Input data, to be uncentered
+        mn_a : {array}
+            The mean to add back, as returned by `center`
+        axis : {integer}
+            Which centering to undo.
+            0 = center over rows (column means)
+            1 = center over columns (row means)
+            2 = double center
+            -1 = nothing
+
+    *Returns*:
+
+        a_uncentered : {array}
+            Uncentered data matrix
+
+    *Notes*:
+
+        May just as well be used to subtract a mean (just use negative mn_a)
+
+    """
+    if axis == -1:
+        return a
+    m, n = a.shape
+    k, o = mn_a.shape
+    if axis == 0: # undo centering over rows (column means)
+        assert(n == o and k in (1, m))
+    elif axis == 1: # undo centering over columns (row means)
+        assert(m == k and o in (1, n))
+    elif axis == 2: # undo double centering
+        assert(n == o and m == k)
+
+    return a + mn_a
 
 def _scale(a, axis):
     """ Matrix scaling to unit variance.
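Review note: round-trip sketch for center/inv_center (illustrative; assumes the conventions of the patched code, where `center` now returns a full-shape tiled mean and the asserts above accept either a broadcastable or a tiled mean):

    import numpy as np
    from pyblm.engines import center, inv_center

    a = np.random.rand(5, 3)
    ac, mn = center(a, axis=0)
    assert np.allclose(inv_center(ac, mn, axis=0), a)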
diff --git a/pyblm/statistics.py b/pyblm/statistics.py
index dec2cc0..6d9db02 100644
--- a/pyblm/statistics.py
+++ b/pyblm/statistics.py
@@ -14,7 +14,7 @@ from engines import pls
 from engines import nipals_lpls as lpls
 
 
-def hotelling(Pcv, P, p_center='median', cov_center=median,
+def hotelling(Pcv, P, p_center='median', cov_center='median',
               alpha=0.3, crot=True, strict=False):
     """Returns regularized hotelling T^2.
 
@@ -31,8 +31,8 @@ def hotelling(Pcv, P, p_center='median', cov_center='median',
         Calibration model paramter
     p_center : {'median', 'mean', 'cal_model'}, optional
         Location method for sub-segments
-    cov_center : {py_func}, optional
-        Location function
+    cov_center : {'median', 'mean', 'cal_model'}, optional
+        Pooled covariance estimate method
     alpha : {float}, optional
         Regularisation towards pooled covariance estimate.
     crot : {boolean}, optional
@@ -79,33 +79,43 @@ def hotelling(Pcv, P, p_center='median', cov_center='median',
     for i in xrange(n):
         Pi = Pcv[:,i,:] # (n_sets x amax)
         Pi_ctr = P_ctr[i,:] # (1 x amax)
-        Pim = (Pi - Pi_ctr)*msqrt(n_sets-1)
-        Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim)
-
-    Cov = cov_center(Cov_i)
-    reg_cov = (1. - alpha)*Cov_i + alpha*Cov
+        Pim = Pi - Pi_ctr
+        Cov_i[i] = dot(Pim.T, Pim)
+
+    if cov_center == 'median':
+        Cov_p = median(Cov_i)
+    elif cov_center == 'mean':
+        Cov_p = Cov_i.mean(0)
+    else:
+        print "Pooled covariance estimate method '%s' is invalid, using median" % cov_center
+        Cov_p = median(Cov_i)
+    reg_cov = (1. - alpha)*Cov_i + alpha*Cov_p
     for i in xrange(n):
         Pc = P_ctr[i,:]
         sigma = reg_cov[i]
         T_sq[i] = dot(dot(Pc, inv(sigma)), Pc)
     return T_sq
 
-def procrustes(a, b, strict=True, center=False, verbose=False):
+def procrustes(a, b, strict=True, center=False, force_norm=False, verbose=False):
     """Orthogonal rotation of b to a.
 
-    Procrustes rotation is an orthogonal rotoation of one subspace
+    Procrustes rotation is an orthogonal rotation of one subspace
     onto another by minimising the squared error.
 
     *Parameters*:
 
     a : {array}
-        Input array
+        Input array, stationary
     b : {array}
-        Input array
+        Input array, rotated to maximise fit to a
     strict : {boolean}
         Only do flipping and shuffling
     center : {boolean}
         Center before rotation, translate back after
+    force_norm : {boolean}
+        Normalise columns of a and b to unit length before rotation
     verbose : {boolean}
         Show sum of squares
 
@@ -125,6 +135,12 @@ def procrustes(a, b, strict=True, center=False, force_norm=False, verbose=False):
         a = a - mn_a
         mn_b = b.mean(0)
         b = b - mn_b
+
+    if force_norm:
+        a_norm = apply_along_axis(norm, 0, a)
+        a = a/a_norm
+        b_norm = apply_along_axis(norm, 0, b)
+        b = b/b_norm
 
     u, s, vt = svd(dot(b.T, a))
     Cm = dot(u, vt) # Cm: orthogonal rotation matrix
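Review note: procrustes usage sketch (illustrative; assumes procrustes returns the rotated b, i.e. dot(b, Cm)):

    import numpy as np
    from pyblm.statistics import procrustes

    a = np.linalg.qr(np.random.rand(10, 3))[0]   # orthonormal target
    b = np.dot(a, np.diag([1., -1., 1.]))        # sign-flipped copy
    b_rot = procrustes(a, b, strict=True)        # should recover a up to flips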
@@ -166,13 +182,13 @@ def _ensure_strict(C, only_flips=True):
     """
 
     if only_flips:
-        C = eye(Cm.shape[0])*sign(Cm)
+        C = eye(C.shape[0])*sign(C)
         return C
     Cm = zeros(C.shape, dtype='d')
     Cm[abs(C)>.6] = 1.
     if det(Cm)>1:
         raise NotImplementedError
-    return Cm*S
+    return Cm*sign(C)
 
 def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
                sim_method='shuffle', p_center='med', cov_center=median,
@@ -248,7 +264,7 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
     Wc, Lc = lpls_jk(X, Y, Z, aopt, zorth=zorth)
     cal_tsq_x = hotelling(Wc, dat['W'], alpha=alpha)
     cal_tsq_z = hotelling(Lc, dat['L'], alpha=alpha)
 
     # Perturbations
     index = arange(m)
     for i in range(n_iter):
@@ -257,9 +273,10 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
         dat = lpls(X, Y[indi,:], Z, aopt, scale='loads', center_axis=center_axis, zorth=zorth)
         Wi, Li = lpls_jk(X, Y[indi,:], Z, aopt, nsets=nsets)
         pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha)
-        pert_tsq_z[:,i] = hotelling(Li, dat['L'], alpha=alpha)
+        # no reason to borrow variance in the dag (alpha -> some small value)
+        pert_tsq_z[:,i] = hotelling(Li, dat['L'], alpha=0.01)
 
-    return _fdr(cal_tsq_z, pert_tsq_z, median), _fdr(cal_tsq_x, pert_tsq_x, median)
+    return cal_tsq_z, pert_tsq_z, cal_tsq_x, pert_tsq_x
 
 def pls_qvals(X, Y, aopt, alpha=.3, n_iter=20,p_center='med', cov_center=median,
 
@@ -335,7 +352,7 @@ def pls_qvals(X, Y, aopt, alpha=.3, n_iter=20,p_center='med', cov_center=median,
         Wi = pls_jk(X, Y[indi,:], aopt, nsets=nsets, center_axis=center_axis)
         pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha)
 
-    return _fdr(cal_tsq_x, pert_tsq_x, median)
+    return cal_tsq_x, pert_tsq_x
 
@@ -372,7 +389,7 @@ def _fdr(tsq, tsqp, loc_method=median):
        gene expression data using dimension reduction methods, BMC
        bioinformatics, 2007
     """
-    n, = tsq.shape
+    n = tsq.shape[0]
     k, m = tsqp.shape
     assert(n == k)
     n_false = empty((n, m), 'd')
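Review note: with pls_qvals/lpls_qvals now returning the raw calibration and perturbation T^2 arrays, a caller can still get FDR q-values exactly as the old return statements did (illustrative; uses the module-level _fdr and numpy's median, as in this file):

    from numpy import median
    from pyblm.statistics import pls_qvals, _fdr

    cal_tsq, pert_tsq = pls_qvals(X, Y, aopt=2)
    qvals = _fdr(cal_tsq, pert_tsq, median)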
diff --git a/sandbox/powerit/Makefile b/sandbox/powerit/Makefile
new file mode 100644
index 0000000..6869f86
--- /dev/null
+++ b/sandbox/powerit/Makefile
@@ -0,0 +1,27 @@
+CC=gcc
+PYTHON_INCLUDE=/usr/include/python2.5
+PYTHON_LIBRARY=/usr/lib/python2.5
+NUMPY_INCLUDE=$(PYTHON_LIBRARY)/site-packages/numpy/core/include/numpy/
+GOTO_LIBRARY=/home/flatberg/GotoBLAS
+BLAS_LIBRARY=/usr/lib
+BLAS=blas
+GOTO=goto
+
+
+all: numpy_module.so
+
+numpy_module.so: numpy_module.o sympowerit.o
+	$(CC) -Wall -shared numpy_module.o sympowerit.o -o _numpy_module.so -L$(PYTHON_LIBRARY) -lpython2.5 -L$(BLAS_LIBRARY) -l$(BLAS) -llapack -lg2c -lm
+
+numpy_module.o: numpy_module.c numpy_module.h
+	$(CC) -fPIC -Wall -O2 -g -c -I$(PYTHON_INCLUDE) -I$(NUMPY_INCLUDE) numpy_module.c
+
+sympowerit.o: sympowerit.c
+	$(CC) -fPIC -Wall -O2 -g -c sympowerit.c
+
+clean:
+	-rm numpy_module.o sympowerit.o
+	-rm -rf _numpy_module.so
+
diff --git a/sandbox/powerit/numpy_module.c b/sandbox/powerit/numpy_module.c
new file mode 100755
index 0000000..89e8a80
--- /dev/null
+++ b/sandbox/powerit/numpy_module.c
@@ -0,0 +1,70 @@
+/* A file to test importing C modules for handling arrays to Python */
+
+/* Python.h includes <stdio.h>, <string.h>, <errno.h>, <limits.h>, and <assert.h> (if available) */
+#include "Python.h"
+#include "arrayobject.h"
+#include "numpy_module.h"
+#include "sympowerit.h"
+#include <math.h>
+
+
+/* ==== Set up the methods table ====================== */
+static PyMethodDef _numpy_moduleMethods[] = {
+    {"sym_powerit", sym_powerit, METH_VARARGS},
+    {NULL, NULL} /* Sentinel - marks the end of this structure */
+};
+
+
+/* ==== Initialize the numpy_module functions =============== */
+// Module name must be _numpy_module when compiled and linked
+void init_numpy_module() {
+    (void) Py_InitModule("_numpy_module", _numpy_moduleMethods);
+    import_array(); // Must be present for NumPy. Called first after above line.
+}
+
+
+/* =========== Power iteration on symmetric matrix ========== */
+static PyObject *sym_powerit(PyObject *self, PyObject *args)
+{
+    PyArrayObject *X=NULL, *T=NULL, *E=NULL, *U=NULL;
+    double *tin, *ein, tol;
+    int amax, n, info, maxiter, verbose;
+    int dims[2];
+
+    /* Parse tuple of input arguments */
+    if (!PyArg_ParseTuple(args, "O!O!iidii",
+                          &PyArray_Type, &X,
+                          &PyArray_Type, &T,
+                          &n,
+                          &amax,
+                          &tol,
+                          &maxiter,
+                          &verbose)
+        )
+        return NULL;
+    if (NULL == X) return NULL;
+
+    /* Get the dimensions of the input */
+    n = X->dimensions[0];
+
+    /* Create output/work arrays, no inplace calculations */
+    dims[0] = n;
+    dims[1] = amax;
+    U = (PyArrayObject *) PyArray_NewCopy(T, NPY_CORDER);
+    dims[1] = n;
+    E = (PyArrayObject *) PyArray_NewCopy(X, NPY_CORDER);
+
+    /* Get pointers to contiguous data (row-major) */
+    ein = (double *)E->data;
+    tin = (double *)U->data;
+
+    /* Call sympower method */
+    info = sympowerit(ein, n, tin, amax, tol, maxiter, verbose);
+
+    Py_DECREF(E);
+
+    return Py_BuildValue("Ni", U, info);
+}
+
diff --git a/sandbox/powerit/numpy_module.h b/sandbox/powerit/numpy_module.h
new file mode 100755
index 0000000..5d2e2e3
--- /dev/null
+++ b/sandbox/powerit/numpy_module.h
@@ -0,0 +1 @@
+/* Header to test of C modules for arrays for Python: sympowerit.c */ /* Python callable functions */ static PyObject *sym_powerit(PyObject *self, PyObject *args);
\ No newline at end of file
diff --git a/sandbox/powerit/sympowerit.c b/sandbox/powerit/sympowerit.c
new file mode 100644
index 0000000..1a7cbd0
--- /dev/null
+++ b/sandbox/powerit/sympowerit.c
@@ -0,0 +1,116 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+/* =============== Main functions body ====================*/
+
+int sympowerit(double *E, int n, double *T, int amax, double tol,
+               int maxiter, int verbose)
+{
+/*
+PURPOSE: Estimate eigenvectors of a symmetric matrix using the power method.
+
+CALLING SEQUENCE:
+  info = sympowerit(*E, n, *T, amax, tol, maxiter);
+
+INPUTS:
+  E        symmetric matrix from XX^T/X^TX of centered data matrix
+           type: *double
+  n        elements in X first dimension
+           type: int
+  T        workspace for scores (n, amax)
+           type: *double
+  amax     maximum number of principal components
+           type: int
+  tol      convergence limit
+           type: double
+  maxiter  maximum number of iterations
+           type: int
+  verbose  the amount of debug output
+           type: int
+*/
+
+    int iter, a, j, jj;
+    int info = 0;
+    double sumsq, l2norm, *y, *x, diff;
+    double lambda = 0.0;
+
+    if (verbose > 1) verbose = 1;
+
+    /* Allocate space for working vector and eigenvector */
+    y = (double *) malloc(n*sizeof(double));
+    x = (double *) malloc(n*sizeof(double));
+
+    /* Loop over all components to be estimated */
+    for (a = 0; a <= amax-1; a++) {
+        /* Reset work-/eigen-vector to the start vector given in T */
+        for (j = 0; j < n; j++) {
+            y[j] = T[j*amax + a];
+            x[j] = 0.0;
+        }
+
+        iter = 0;
+        while (1) {
+            iter++;
+            /* x = E*y */
+            for (j = 0; j < n; j++) {
+                x[j] = 0.0;
+                for (jj = 0; jj < n; jj++)
+                    x[j] += E[j*n + jj]*y[jj];
+            }
+            /* normalise x and measure the change from the last iterate */
+            l2norm = 0.0;
+            for (j = 0; j < n; j++)
+                l2norm += x[j]*x[j];
+            l2norm = sqrt(l2norm);
+            sumsq = 0.0;
+            for (j = 0; j < n; j++) {
+                diff = x[j]/l2norm - y[j];
+                sumsq += diff*diff;
+                y[j] = x[j]/l2norm;
+            }
+            lambda = l2norm;
+            if (sumsq < tol)
+                break;
+            if (iter >= maxiter) {
+                if (verbose == 1) {
+                    printf("\nComp: %d\n", a);
+                    printf("Max iter reached.\n");
+                    printf("Error: %.2E\n", sumsq);
+                }
+                info = 1;
+                break;
+            }
+        }
+        /* Calculate T */
+        for (j = 0; j < n; j++)
+            T[j*amax + a] = y[j];
+        /* Deflate: E = E - lambda * y * y^T */
+        for (j = 0; j < n; j++)
+            for (jj = 0; jj < n; jj++)
+                E[j*n + jj] -= lambda*y[j]*y[jj];
+    }
+
+    free(y);
+    free(x);
+
+    return info;
+}
diff --git a/sandbox/powerit/numpy_module.py b/sandbox/powerit/numpy_module.py
new file mode 100644
--- /dev/null
+++ b/sandbox/powerit/numpy_module.py
@@ -0,0 +1,58 @@
+"""Python wrapper around the _numpy_module C extension."""
+
+from numpy import zeros, ones, mean, newaxis
+
+from _numpy_module import sym_powerit
+
+_ERRORCODES = {1: "Maximum number of iterations reached"}
+
+class SymPowerException(Exception):
+    pass
+
+def sym_powerit_py(xx, T0=None, a_max=None, tol=1e-9, maxiter=100,
+                   mn_center=False, verbose=0):
+    """Estimate the leading eigenvectors of a symmetric matrix.
+    """
+    n, m = xx.shape
+    if n != m:
+        msg = "Array needs to be square"
+        raise SymPowerException(msg)
+    if abs(xx - xx.T).sum() > 1e-15:
+        msg = "Array needs to be symmetric"
+        raise SymPowerException(msg)
+
+    if not a_max:
+        a_max = 10
+
+    if T0 is not None:
+        tn, tm = T0.shape
+        if not (tn == n):
+            msg = "Start eigenvectors (%d, %d) need to match input array (%d, %d)" % (tn, tm, n, n)
+            raise SymPowerException(msg)
+        if not (tm == a_max):
+            msg = "Start eigenvectors (%d, %d) need to match input a_max (%d)" % (tn, tm, a_max)
+            raise SymPowerException(msg)
+    else:
+        T0 = zeros((n, a_max), 'd')
+        T0[0,:] = ones((a_max,), 'd')
+
+    if mn_center:
+        xx = _center(xx)
+
+    # call c-function
+    T, info = sym_powerit(xx, T0, n, a_max, tol, maxiter, verbose)
+
+    if info != 0:
+        if verbose:
+            print _ERRORCODES.get(info, "Don't know this error")
+    return T
+
+
+def _center(xx, ret_mn=False):
+    """Returns mean centered symmetric kernel matrix.
+    """
+    n = xx.shape[0]
+    h = xx.sum(0)[:,newaxis]
+    h = (h - mean(h)/2)/n
+    mn_a = h + h.T
+    xxc = xx - mn_a
+    if ret_mn:
+        return xxc, mn_a
+    return xxc
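Review note: a pure-NumPy reference for sanity-checking the C extension. This sketch implements the same deflated power iteration (illustrative; `sym_powerit_ref` is a hypothetical helper; for a PSD kernel-type matrix such as XX^T with well-separated eigenvalues, its columns should agree with numpy.linalg.eigh up to sign):

    import numpy as np

    def sym_powerit_ref(xx, a_max, tol=1e-9, maxiter=100):
        E = xx.copy()
        n = E.shape[0]
        T = np.zeros((n, a_max))
        for a in range(a_max):
            y = np.ones(n)/np.sqrt(n)
            for it in range(maxiter):
                x = np.dot(E, y)
                x /= np.linalg.norm(x)
                done = ((x - y)**2).sum() < tol
                y = x
                if done:
                    break
            lam = np.dot(y, np.dot(E, y))   # Rayleigh quotient
            T[:, a] = y
            E -= lam*np.outer(y, y)         # deflate
        return T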