From 41f93c598999b4f090b27180f15d121bb66d5f5e Mon Sep 17 00:00:00 2001 From: flatberg Date: Thu, 20 Sep 2007 16:11:37 +0000 Subject: [PATCH] .... --- fluents/lib/blmfuncs.py | 8 +- fluents/lib/cx_stats.py | 144 ++++++++++++++++++++----- fluents/lib/engines.py | 218 +++++++++++++++++++++++++++++--------- fluents/lib/validation.py | 69 ++++++++---- 4 files changed, 337 insertions(+), 102 deletions(-) diff --git a/fluents/lib/blmfuncs.py b/fluents/lib/blmfuncs.py index 6cbeca0..b02aab5 100644 --- a/fluents/lib/blmfuncs.py +++ b/fluents/lib/blmfuncs.py @@ -693,13 +693,13 @@ class LplsOptions(Options): opt['mode'] = 'normal' # how much info to calculate opt['amax'] = 10 opt['aopt'] = 3 - opt['xz_alpha'] = 0.4 + opt['xz_alpha'] = 0.6 opt['auto_aopt'] = False opt['center'] = True - opt['center_mth'] = [2, 2, 1] + opt['center_mth'] = [2, 0, 2] opt['scale'] = 'scores' - opt['calc_conf'] = True - opt['n_sets'] = 75 + opt['calc_conf'] = False + opt['n_sets'] = 7 opt['strict'] = False opt['p_center'] = 'med' opt['alpha'] = .3 diff --git a/fluents/lib/cx_stats.py b/fluents/lib/cx_stats.py index 43eea92..38286ad 100644 --- a/fluents/lib/cx_stats.py +++ b/fluents/lib/cx_stats.py @@ -1,6 +1,6 @@ from scipy import zeros,zeros_like,sqrt,dot,trace,sign,round_,argmax,\ sort,ravel,newaxis,asarray,diag,sum,outer,argsort,arange,ones_like,\ - all,apply_along_axis,eye + all,apply_along_axis,eye,atleast_2d from scipy.linalg import svd,inv,norm,det,sqrtm from scipy.stats import mean,median from cx_utils import mat_center @@ -26,8 +26,8 @@ def hotelling(Pcv, P, p_center='med', cov_center='med', m, n = P.shape n_sets, n, amax = Pcv.shape # allocate - T_sq = empty((n, ),dtype='f') - Cov_i = zeros((n, amax, amax),dtype='f') + T_sq = empty((n, ),dtype='d') + Cov_i = zeros((n, amax, amax),dtype='d') # rotate sub_models to full model if crot: @@ -56,10 +56,12 @@ def hotelling(Pcv, P, p_center='med', cov_center='med', reg_cov = (1. - alpha)*Cov_i + alpha*Cov for i in xrange(n): - Pc = P_ctr[i,:][:,newaxis] + #Pc = P_ctr[i,:][:,newaxis] + Pc = P_ctr[i,:] sigma = reg_cov[i] - #T_sq[i] = sqrt(dot(dot(Pc.T, inv(sigma)), Pc).ravel()) - T_sq[i] = dot(dot(Pc.T, inv(sigma)), Pc).ravel() + # T_sq[i] = (dot(Pc, inv(sigma) )*Pc).sum() #slow + T_sq[i] = dot(dot(Pc, inv(sigma)), Pc) # dont need to care about transposes + #T_sq[i] = dot(dot(Pc.T, inv(sigma)), Pc).ravel() return T_sq def procrustes(A, B, strict=True, center=False, verbose=False): @@ -147,10 +149,10 @@ def pls_qvals(a, b, aopt=None, alpha=.3, tsq_full = hotelling(Wcv, dat['W'], p_center=p_center, alpha=alpha, crot=crot, strict=strict, cov_center=cov_center) - t0 = time.time() + #t0 = time.time() Vs = shuffle_1d(bc, n_iter, axis=0) for i, b_shuff in enumerate(Vs): - t1 = time.time() + #t1 = time.time() if algo=='bridge': dat = bridge(ac, b_shuff, aopt, 'loads','fast') else: @@ -159,23 +161,10 @@ def pls_qvals(a, b, aopt=None, alpha=.3, TSQ[:,i] = hotelling(Wcv, dat['W'], p_center=p_center, alpha=alpha, crot=crot, strict=strict, cov_center=cov_center) - print time.time() - t1 - sort_index = argsort(tsq_full)[::-1] - back_sort_index = sort_index.argsort() - print time.time() - t0 - - # count false positives - tsq_full_sorted = tsq_full.take(sort_index) - for i in xrange(n_iter): - for j in xrange(n): - n_false[j,i] = sum(TSQ[:,i]>=tsq_full[j]) # number of false pos. genes (0-n) - false_pos = median(n_false, 1) - ll = arange(1, len(false_pos)+1, 1) - sort_qval = false_pos.take(sort_index)/ll - qval = false_pos/ll.take(back_sort_index) - print time.time() - t0 - #return qval, false_pos, TSQ, tsq_full - return qval + #print time.time() - t1 + + return fdr(tsq_full, TSQ, median) + def ensure_strict(C, only_flips=True): """Ensure that a rotation matrix does only 90 degree rotations. @@ -267,6 +256,7 @@ def pls_qvals_II(a, b, aopt=None, center=True, alpha=.3, qval = false_pos/ll.take(back_sort_index) print time.time() - t0 #return qval, false_pos, TSQ, tsq_full + return qval def leverage(aopt=1,*args): @@ -333,3 +323,107 @@ def vnorm(x): This is considerably faster than linalg.norm """ return sqrt(dot(x,x.conj())) + +def mahalanobis(a, loc=None, acov=None, invcov=None): + """Returns the distance of each observation in a + from the location estimate (loc) of the data, + relative to the shape of the data. + + + a : data matrix (n observations in rows, p variables in columns) + loc : location estimate of the data (p-dimensional vector) + covmat or invcov : scatter estimate of the data or the inverse of the scatter estimate (pxp matrix) + + :Returns: + A vector containing the distances of all the observations to locvct. + + """ + n, p = a.shape + if loc==None: + loc = a.mean(0) + loc = atleast_2d(loc) + if loc.shape[1]==1: + loc = loc.T; #ensure rowvector + assert(loc.shape[1]==p) + xc = a - loc + if acov==None and invcov==None: + acov = dot(xc.T, xc) + + if invcov != None: + covmat = atleast_2d(invcov) + if min(covmat.shape)==1: + covmat = diag(invcov.ravel()) + else: + covmat = atleast_2d(acov) + if min(covmat.shape)==1: + covmat = diag(covmat.ravel()) + covmat = inv(covmat) + # mdist = diag(dot(dot(xc, covmat),xc.T)) + mdist = (dot(xc, covmat)*xc).sum(1) + return mdist + +def lpls_qvals(a, b, c, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,center=True, + sim_method='shuffle',p_center='med', cov_center='med',crot=True, strict=False): + + """Returns qvals for l-pls model. + + input: + a -- data matrix + b -- data matrix + c -- data matrix + aopt -- scalar, opt. number of components + alpha -- [0,1] regularisation parameter for T2-test + xz_alpha -- [0,1] how much z info to include + n_iter -- number of permutations + sim_method -- permutation method ['shuffle'] + p_center -- location estimator for sub models ['med'] + cov_center -- location estimator for covariance of submodels ['med'] + crot -- bool, use rotations of sub models? + strict -- bool, use stict (rot/flips only) rotations? + """ + + m, n = a.shape + TSQ = zeros((n, n_iter), dtype='d') # (nvars x n_subsets) + n_false = zeros((n, n_iter), dtype='d') + + + # Full model + dat = lpls(a, b, c, aopt, scale='loads') + Wcv = lpls_jk(a, b, c ,aopt, n_blocks=None, algo=algo,center=center) + tsq_x = hotelling(Wcv, dat['W'], p_center=p_center,alpha=alpha, crot=crot, strict=strict, + cov_center=cov_center) + Lcv = lpls_jk(a, b, c ,aopt, n_blocks=None, algo=algo,center=center) + tsq_z = hotelling(Lcv, dat['L'], p_center=p_center,alpha=alpha, crot=crot, strict=strict, + cov_center=cov_center) + + # Perturbations + t0 = time.time() + Vs = shuffle_1d(b, n_iter, axis=0) + for i, b_shuff in enumerate(Vs): + t1 = time.time() + dat = pls(ac, b_shuff, aopt, 'loads', 'fast') + Wcv = pls_jkW(a, b_shuff, aopt, n_blocks=None, algo=algo) + TSQ[:,i] = hotelling(Wcv, dat['W'], p_center=p_center, + alpha=alpha, crot=crot, strict=strict, + cov_center=cov_center) + print time.time() - t1 + + return fdr(tsq_full, TSQ, median) + + + +def fdr(tsq, tsqp, loc_method=median): + n, = tsq.shape + k, m = tsqp.shape + assert(n==k) + n_false = empty((n, m), 'd') + sort_index = argsort(tsq)[::-1] + r_index = argsort(sort_index) + for i in xrange(m): + for j in xrange(n): + n_false[j,i] = (tsqp[:,i]>tsq[j]).sum() + + fp = loc_method(n_false,1) + n_signif = (arange(n) + 1.0)[r_index] + fd_rate = fp/n_signif + return fd_rate diff --git a/fluents/lib/engines.py b/fluents/lib/engines.py index 809a34b..25bb6ef 100644 --- a/fluents/lib/engines.py +++ b/fluents/lib/engines.py @@ -4,15 +4,21 @@ There is almost no typechecking of any kind here, just focus on speed """ import math +import warnings from scipy.linalg import svd,inv from scipy import dot,empty,eye,newaxis,zeros,sqrt,diag,\ apply_along_axis,mean,ones,randn,empty_like,outer,r_,c_,\ - rand,sum,cumsum,matrix, expand_dims,minimum,where,arange -has_sym=True + rand,sum,cumsum,matrix, expand_dims,minimum,where,arange,inner,tile +has_sym = True +has_arpack = True try: from symeig import symeig except: has_sym = False +try: + from scipy.sandbox import arpack +except: + has_arpack = False def pca(a, aopt,scale='scores',mode='normal',center_axis=0): @@ -45,7 +51,6 @@ def pca(a, aopt,scale='scores',mode='normal',center_axis=0): Notes ----- - Uses kernel speed-up if m>>n or m<Y @@ -464,6 +489,9 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], mode='normal', sca evx : X-explained variance evy : Y-explained variance evz : Z-explained variance + mnx : X location + mny : Y location + mnz : Z location :Notes: @@ -471,12 +499,12 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], mode='normal', sca if mean_ctr!=None: xctr, yctr, zctr = mean_ctr X, mnX = center(X, xctr) - Y, mnY = center(Y, xctr) + Y, mnY = center(Y, yctr) Z, mnZ = center(Z, zctr) - varX = pow(X, 2).sum() - varY = pow(Y, 2).sum() - varZ = pow(Z, 2).sum() + varX = (X**2).sum() + varY = (Y**2).sum() + varZ = (Z**2).sum() m, n = X.shape k, l = Y.shape @@ -491,36 +519,40 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], mode='normal', sca K = empty((o, a_max)) L = empty((u, a_max)) B = empty((a_max, n, l)) - b0 = empty((a_max, m, l)) + #b0 = empty((a_max, 1, l)) var_x = empty((a_max,)) var_y = empty((a_max,)) var_z = empty((a_max,)) - + + MAX_ITER = 250 + LIM = 1e-1 for a in range(a_max): if verbose: - print "\n Working on comp. %s" %a + print "\nWorking on comp. %s" %a u = Y[:,:1] diff = 1 - MAX_ITER = 200 - lim = 1e-16 niter = 0 - while (diff>lim and niterLIM and niter=n: kernel = dot(data.T, data) + if has_arpack: + if amax==None: + amax = n + s, v = arpack.eigen_symmetric(kernel,k=amax, which='LM', + maxiter=200,tol=1e-5) if has_sym: if amax==None: amax = n @@ -728,16 +767,34 @@ def vnorm(x): return math.sqrt(dot(x.T, x)) def center(a, axis): - # 0 = col center, 1 = row center, 2 = double center - # -1 = nothing + # 0 = col center, 1 = row center, 2 = double center + # -1 = nothing + + # check if we have a vector + is_vec = len(a.shape)==1 + if not is_vec: + is_vec = a.shape[0]==1 or a.shape[1]==1 + if is_vec: + if axis==2: + warnings.warn("Double centering of vecor ignored, using ordinary centering") + if axis==-1: + mn = 0 + else: + mn = a.mean() + return a - mn, mn + # !!!fixme: use broadcasting if axis==-1: - mn = zeros((a.shape[1],)) + mn = zeros((1,a.shape[1],)) + #mn = tile(mn, (a.shape[0], 1)) elif axis==0: - mn = a.mean(0) + mn = a.mean(0)[newaxis] + #mn = tile(mn, (a.shape[0], 1)) elif axis==1: mn = a.mean(1)[:,newaxis] + #mn = tile(mn, (1, a.shape[1])) elif axis==2: - mn = a.mean(0) + a.mean(1)[:,newaxis] - a.mean() + mn = a.mean(0)[newaxis] + a.mean(1)[:,newaxis] - a.mean() + return a - mn , a.mean(0)[newaxis] else: raise IOError("input error: axis must be in [-1,0,1,2]") @@ -755,3 +812,64 @@ def scale(a, axis): return a - sc, sc + + +## #PCA CALCS + +## % Calculate Q limit using unused eigenvalues +## temp = diag(s); +## if n < m +## emod = temp(lv+1:n,:); +## else +## emod = temp(lv+1:m,:); +## end +## th1 = sum(emod); +## th2 = sum(emod.^2); +## th3 = sum(emod.^3); +## h0 = 1 - ((2*th1*th3)/(3*th2^2)); +## if h0 <= 0.0 +## h0 = .0001; +## disp(' ') +## disp('Warning: Distribution of unused eigenvalues indicates that') +## disp(' you should probably retain more PCs in the model.') +## end +## q = th1*(((1.65*sqrt(2*th2*h0^2)/th1) + 1 + th2*h0*(h0-1)/th1^2)^(1/h0)); +## disp(' ') +## disp('The 95% Q limit is') +## disp(q) +## if plots >= 1 +## lim = [q q]; +## plot(scl,res,scllim,lim,'--b') +## str = sprintf('Process Residual Q with 95 Percent Limit Based on %g PC Model',lv); +## title(str) +## xlabel('Sample Number') +## ylabel('Residual') +## pause +## end +## % Calculate T^2 limit using ftest routine +## if lv > 1 +## if m > 300 +## tsq = (lv*(m-1)/(m-lv))*ftest(.95,300,lv,2); +## else +## tsq = (lv*(m-1)/(m-lv))*ftest(.95,m-lv,lv,2); +## end +## disp(' ') +## disp('The 95% T^2 limit is') +## disp(tsq) +## % Calculate the value of T^2 by normalizing the scores to +## % unit variance and summing them up +## if plots >= 1.0 +## temp2 = scores*inv(diag(ssq(1:lv,2).^.5)); +## tsqvals = sum((temp2.^2)'); +## tlim = [tsq tsq]; +## plot(scl,tsqvals,scllim,tlim,'--b') +## str = sprintf('Value of T^2 with 95 Percent Limit Based on %g PC Model',lv); +## title(str) +## xlabel('Sample Number') +## ylabel('Value of T^2') +## end +## else +## disp('T^2 not calculated when number of latent variables = 1') +## tsq = 1.96^2; +## end + diff --git a/fluents/lib/validation.py b/fluents/lib/validation.py index dc9c8f3..e51a2b2 100644 --- a/fluents/lib/validation.py +++ b/fluents/lib/validation.py @@ -1,7 +1,7 @@ """This module implements some common validation schemes from pca and pls. """ from scipy import ones,mean,sqrt,dot,newaxis,zeros,sum,empty,\ - apply_along_axis,eye,kron,array,sort,zeros_like,argmax + apply_along_axis,eye,kron,array,sort,zeros_like,argmax,atleast_2d from scipy.stats import median from scipy.linalg import triu,inv,svd,norm @@ -9,7 +9,7 @@ from select_generators import w_pls_gen,w_pls_gen_jk,pls_gen,pca_gen,diag_pert from engines import w_simpls,pls,bridge,pca,nipals_lpls from cx_utils import m_shape -def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'): +def w_pls_cv_val(X, Y, amax, n_blocks=None): """Returns rmsep and aopt for pls tailored for wide X. The root mean square error of cross validation is calculated @@ -62,12 +62,10 @@ def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'): for Din, Doi, Yin, Yout in V: ym = -sum(Yout, 0)[newaxis]/(1.0*Yin.shape[0]) PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0) - if algo=='simpls': - dat = w_simpls(Din, Yin, amax) - Q, U, H = dat['Q'], dat['U'], dat['H'] - That = dot(Doi, dot(U, inv(triu(dot(H.T, U))) )) - else: - raise NotImplementedError + + dat = w_simpls(Din, Yin, amax) + Q, U, H = dat['Q'], dat['U'], dat['H'] + That = dot(Doi, dot(U, inv(triu(dot(H.T, U))) )) Yhat = [] for j in range(l): @@ -78,7 +76,7 @@ def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'): #Yhat = Yin - dot(That,Q.T) msep = PRESS/(Y.shape[0]) aopt = find_aopt_from_sep(msep) - return sqrt(msep) + return sqrt(msep), aopt def pls_val(X, Y, amax=2, n_blocks=10, algo='pls'): k, l = m_shape(Y) @@ -108,29 +106,54 @@ def pls_val(X, Y, amax=2, n_blocks=10, algo='pls'): aopt = find_aopt_from_sep(msep) return msep, Yhat, aopt -def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5): +def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2]): """Performs crossvalidation to get generalisation error in lpls""" + assert(nsets<=X.shape[0]) + cv_iter = pls_gen(X, Y, n_blocks=nsets,center=False,index_out=True) k, l = Y.shape - Yhat = empty((a_max,k,l), 'd') + Yc = empty((k, l), 'd') + Yhat = empty((a_max, k, l), 'd') + Yhatc = empty((a_max, k, l), 'd') + sep2 = empty((a_max, k, l), 'd') for i, (xcal,xi,ycal,yi,ind) in enumerate(cv_iter): dat = nipals_lpls(xcal,ycal,Z, a_max=a_max, alpha=alpha, - mean_ctr=[2,0,1], + mean_ctr=mean_ctr, verbose=False) + B = dat['B'] - b0 = dat['b0'] + #b0 = dat['b0'] for a in range(a_max): - Yhat[a,ind,:] = b0[a][0][0] + dot(xi-xcal.mean(0), B[a]) - Yhat_class = zeros_like(Yhat) - for a in range(a_max): - for i in range(k): - Yhat_class[a,i,argmax(Yhat[a,i,:])]=1.0 - class_err = 100*((Yhat_class+Y)==2).sum(1)/Y.sum(0).astype('d') + if mean_ctr[0] in [0, 2]: + xi = xi - dat['mnx'] + else: + xi = xi - xi.mean(1)[:,newaxis] #???: cheating? + if mean_ctr[1] in [0, 2]: + ym = dat['mny'] + else: + ym = yi.mean(1)[:,newaxis] #???: check this + + Yhat[a,ind,:] = atleast_2d(ym + dot(xi, B[a])) + #Yhat[a,ind,:] = atleast_2d(b0[a] + dot(xi, B[a])) + + # todo: need a better support for class validation + y_is_class = Y.dtype.char.lower() in ['i','p', 'b', 'h','?'] + print Y.dtype.char + if y_is_class: + Yhat_class = zeros_like(Yhat) + for a in range(a_max): + for i in range(k): + Yhat_class[a,i,argmax(Yhat[a,i,:])] = 1.0 + class_err = 100*((Yhat_class+Y)==2).sum(1)/Y.sum(0).astype('d') + sep = (Y - Yhat)**2 - rmsep = sqrt(sep.mean(1)) + rmsep = sqrt(sep.mean(1)).T + #rmsep2 = sqrt(sep2.mean(1)) + aopt = find_aopt_from_sep(rmsep) + return rmsep, Yhat, aopt def pca_alter_val(a, amax, n_sets=10, method='diag'): @@ -247,7 +270,7 @@ def pca_jkP(a, aopt, n_blocks=None): return PP -def lpls_jk(X, Y, Z, a_max, nsets=None, alpha=.5): +def lpls_jk(X, Y, Z, a_max, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2]): cv_iter = pls_gen(X, Y, n_blocks=nsets,center=False,index_out=False) m, n = X.shape k, l = Y.shape @@ -258,8 +281,8 @@ def lpls_jk(X, Y, Z, a_max, nsets=None, alpha=.5): WWz = empty((nsets, o, a_max), 'd') #WWy = empty((nsets, l, a_max), 'd') for i, (xcal,xi,ycal,yi) in enumerate(cv_iter): - dat = nipals_lpls(xcal,ycal,Z,a_max=a_max,alpha=alpha, - mean_ctr=[2,0,1],scale='loads',verbose=False) + dat = nipals_lpls(xcal,ycal,Z,a_max=a_max,alpha=xz_alpha, + mean_ctr=mean_ctr,scale='loads',verbose=False) WWx[i,:,:] = dat['W'] WWz[i,:,:] = dat['L'] #WWy[i,:,:] = dat['Q']