From 253305b602e7e560b5e43fdbe938bb8c36ae450c Mon Sep 17 00:00:00 2001 From: flatberg Date: Fri, 14 Dec 2007 00:16:31 +0000 Subject: [PATCH] Fixed conflicts --- pyblm/__init__.py | 1 - pyblm/crossvalidation.py | 85 +++++++++++++--------- pyblm/engines.py | 149 +++++++++++++++++++-------------------- pyblm/models.py | 28 ++++---- pyblm/statistics.py | 78 ++++++++++---------- 5 files changed, 177 insertions(+), 164 deletions(-) diff --git a/pyblm/__init__.py b/pyblm/__init__.py index c4198ea..73566c4 100644 --- a/pyblm/__init__.py +++ b/pyblm/__init__.py @@ -13,4 +13,3 @@ def test(level=1, verbosity=1): print 'Python version %s' % (sys.version.replace('\n', '',),) from numpy.testing import NumpyTest return NumpyTest().test(level, verbosity) - diff --git a/pyblm/crossvalidation.py b/pyblm/crossvalidation.py index 364bf4e..765f42b 100644 --- a/pyblm/crossvalidation.py +++ b/pyblm/crossvalidation.py @@ -155,7 +155,7 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=F validation scheme. *Parameters*: - + X : {array} Main data matrix (m, n) Y : {array} @@ -180,9 +180,9 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=F If true, Require orthogonal latent components in Z. verbose : {boolean}, optional Verbosity of console output. For use in debugging. - + *Returns*: - + rmsep : {array} Root mean squred error of prediction yhat : {array} @@ -191,19 +191,19 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=F Estimated value of optimal number of components """ - + m, n = X.shape k, l = Y.shape o, p = Z.shape assert m == k, "X (%d,%d) - Y (%d,%d) dim mismatch" %(m, n, k, l) assert n == p, "X (%d,%d) - Z (%d,%d) dim mismatch" %(m, n, o, p) if nsets == None: - nsets = m + nsets = m if nsets > X.shape[0]: print "nsets (%d) is larger than number of variables (%d).\nnsets: %d -> %d" %(nsets, m, nsets, m) nsets = m assert (alpha >= 0 and alpha<=1), "Alpha needs to be within [0,1], got: %.2f" %alpha - + Yhat = empty((a_max, k, l), 'd') for cal, val in cv(k, nsets): # do the training model @@ -217,10 +217,16 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=F # predictions for a in range(a_max): Yhat[a,val,:] = atleast_2d(ym + dot(xi, dat['B'][a])) + # todo: need a better support for classification error + y_is_class = Y.dtype.char.lower() in ['i','p', 'b', 'h','?'] + if y_is_class: + pass + #Yhat, err = class_error(Yhat, Y) + #return Yhat, err sep = (Y - Yhat)**2 rmsep = sqrt(sep.mean(1)).T #aopt = find_aopt_from_sep(rmsep) - + # todo: need a better support for classification error error = prediction_error(Yhat, Y, method='1/2') @@ -228,7 +234,7 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=F def pca_jk(a, aopt, nsets=None, center_axis=[0], method='cv'): """Returns jack-knife segments from PCA. - + *Parameters*: a : {array} @@ -252,9 +258,9 @@ def pca_jk(a, aopt, nsets=None, center_axis=[0], method='cv'): Loadings collected in a three way matrix (n_segments, m, aopt) *Notes*: - + - Crossvalidation method is currently set to random blocks of samples. - + """ m, n = a.shape if nsets == None: @@ -278,14 +284,14 @@ def pca_jk(a, aopt, nsets=None, center_axis=[0], method='cv'): Pcv[i,:,:] = pca(a[cal,:], aopt, mode='fast', scale='loads', center_axis = center_axis)['P'] else: raise NotImplementedError(method) - + return Pcv def pls_jk(X, Y, a_opt, nsets=None, center_axis=[0,0], verbose=False): """ Returns jack-knife segements of W. 
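As a reference for the cross-validation loop in lpls_val above, the short sketch below restates how the collected predictions are turned into a per-component RMSEP; shapes follow the docstring, and the helper name rmsep_sketch is illustrative only, not part of the patch.

import numpy

def rmsep_sketch(Y, Yhat):
    # Y    : (k, l) observed responses
    # Yhat : (a_max, k, l) predictions collected over the cv segments
    sep = (Y - Yhat)**2                 # squared prediction errors, broadcast over components
    return numpy.sqrt(sep.mean(1)).T    # (l, a_max) root mean squared error of prediction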
- + *Parameters*: - + X : {array} Main data matrix (m, n) Y : {array} @@ -294,7 +300,7 @@ def pls_jk(X, Y, a_opt, nsets=None, center_axis=[0,0], verbose=False): The number of components to calculate (0, min(m,n)) nsets : (integer), optional Number of jack-knife segments - + center_axis : {boolean}, optional - -1 : nothing - 0 : row center @@ -302,12 +308,12 @@ def pls_jk(X, Y, a_opt, nsets=None, center_axis=[0,0], verbose=False): - 2 : double center verbose : {boolean}, optional Verbosity of console output. For use in debugging. - + *Returns*: - + Wcv : {array} Loading-weights jack-knife segements - + """ m, n = X.shape k, l = Y.shape @@ -320,7 +326,7 @@ def pls_jk(X, Y, a_opt, nsets=None, center_axis=[0,0], verbose=False): print "Segment number: %d" %i dat = pls(X[cal,:], Y[cal,:], a_opt, scale='loads', mode='fast', center_axis=center_axis) Wcv[i,:,:] = dat['W'] - + return Wcv def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, center_axis=[2,0,2], zorth=False, verbose=False): @@ -332,10 +338,10 @@ def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, center_axis=[2,0,2], zorth= infer the paramter confidence in th model. The segements returned are the X-block weights and Z-block weights. - - + + *Parameters*: - + X : {array} Main data matrix (m, n) Y : {array} @@ -358,15 +364,15 @@ def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, center_axis=[2,0,2], zorth= 2 : double center verbose : {boolean}, optional Verbosity of console output. For use in debugging. - + *Returns*: - + Wx : {array} X-block jack-knife segements Wz : {array} Z-block jack-knife segements """ - + m, n = X.shape k, l = Y.shape o, p = Z.shape @@ -388,7 +394,7 @@ def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, center_axis=[2,0,2], zorth= def find_aopt_from_sep(err, method='vanilla'): """Returns an estimate of optimal number of components. - + The estimate is based on the error of prediction from crossvalidation. This is pretty much wild guessing and it is recomended to inspect model parameters and prediction errors @@ -406,7 +412,7 @@ def find_aopt_from_sep(err, method='vanilla'): aopt : {integer} A guess on the optimal number of components """ - + if method == 'vanilla': # min rmsep rmsecv = sqrt(err.mean(0)) @@ -434,7 +440,7 @@ def cv(N, K, randomise=True, sequential=False): of length ~N/K, *without* replacement. *Parameters*: - + N : {integer} Total number of samples K : {integer} @@ -443,7 +449,7 @@ def cv(N, K, randomise=True, sequential=False): Use random sampling sequential : {boolean} Use sequential sampling - + *Returns*: training : {array-like} @@ -456,12 +462,12 @@ def cv(N, K, randomise=True, sequential=False): If randomise is true, a copy of index is shuffled before partitioning, otherwise its order is preserved in training and validation. - + Randomise overrides the sequential argument. If randomise is true, sequential is False If sequential is true the index is partioned in continous blocks, otherwise interleaved ordering is used. - + """ if K > N: raise ValueError, "You cannot divide a list of %d samples into more than %d segments. 
Yout tried: %s" %(N, N, K) @@ -510,6 +516,8 @@ def diag_cv(shape, nsets=9, randomise=True): except: raise ValueError("shape needs to be a two-tuple") if nsets>m or nsets>n: + msg = "You may not use more subsets than max(n_rows, n_cols)" + raise ValueError, msg msg = "You may not use more subsets than max(n_rows, n_cols)" nsets = min(m, n) nm = n*m @@ -524,7 +532,20 @@ def diag_cv(shape, nsets=9, randomise=True): validation.update(ind) #training = [j for j in index if j not in validation] yield list(validation) - + +def class_error(y_hat, y, method='vanilla'): + """ Not used. + """ + a_opt, k, l = y_hat.shape + y_hat_c = zeros((k, l), dtype='d') + if method == vanilla: + pass + for a in range(a_opt): + for i in range(k): + y_hat_c[a, val, argmax(y_hat[a,val,:])] = 1.0 + err = 100*((y_hat_c + y) == 2).sum(1)/y.sum(0).astype('d') + + return y_hat_c, err def prediction_error(y_hat, y, method='squared'): """Loss function on multiclass Y. @@ -651,7 +672,7 @@ def _wkernel_pls_val(X, Y, a_max, n_blocks=None): for Din, Doi, Yin, Yout in V: ym = -sum(Yout, 0)[newaxis]/(1.0*Yin.shape[0]) PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0) - + dat = w_simpls(Din, Yin, a_max) Q, U, H = dat['Q'], dat['U'], dat['H'] That = dot(Doi, dot(U, inv(triu(dot(H.T, U))) )) diff --git a/pyblm/engines.py b/pyblm/engines.py index 504f1f9..4d825f7 100644 --- a/pyblm/engines.py +++ b/pyblm/engines.py @@ -20,7 +20,7 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]): extracts orthogonal components of maximum variance. *Parameters*: - + X : {array} Data measurement matrix, (samples x variables) aopt : {integer} @@ -55,21 +55,21 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]): mode : {string}, optional Amount of info retained, [['normal'], 'fast', 'detailed'] - + *SeeAlso*: `center` : Data centering - + *Notes* - + Uses kernel speed-up if m>>n or m<>> import scipy,engines >>> a=scipy.asarray([[1,2,3],[2,4,5]]) >>> dat=engines.pca(a, 2) @@ -77,7 +77,7 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]): array([0.,99.8561562, 100.]) """ - + m, n = X.shape max_aopt = min(m, n) if center_axis != None: @@ -94,7 +94,7 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]): u = u[:,:aopt] s = s[:aopt] v = v[:,:aopt] - + # ranktest tol = 1e-10 eff_rank = sum(s > s[0]*tol) @@ -104,14 +104,14 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]): T = T[:,:aopt] P = v[:,:aopt] e = s**2 - + if scale=='loads': T = T/s P = P*s if mode in ['fast', 'f', 'F']: return {'T':T, 'P':P, 'aopt':aopt, 'mnx': mnx} - + if mode in ['detailed', 'd', 'D']: E = empty((aopt, m, n)) ssq = [] @@ -135,7 +135,7 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]): lev = [(1./m)+((T/s)**2).sum(1), (1./n)+(P**2).sum(1)] # variances expvarx = r_[0, 100*e.cumsum()/(X*X).sum()] - + return {'T': T, 'P': P, 'E': E, 'evx': expvarx, 'leverage': lev, 'ssqx': ssq, 'aopt': aopt, 'eigvals': e, 'mnx': mnx} @@ -159,20 +159,20 @@ def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=[0, 0]): keys -- values, T -- scores, P -- loadings, E -- residuals, levx -- leverages, ssqx -- sum of squares, expvarx -- cumulative explained variance, aopt -- number of components used - + *OtherParameters*: mode : {string} Amount of info retained, ('fast', 'normal', 'detailed') center_axis : {integer} Center along given axis. 
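To make the scores/loadings bookkeeping in pcr easier to follow, here is a minimal, self-contained sketch of principal component regression on column-centred data. It is an illustration under the stated assumptions, not the module's exact code path (which goes through esvd and the scale/mode options).

import numpy

def pcr_sketch(a, b, aopt):
    a = a - a.mean(0)                       # column-centre predictors
    b = b - b.mean(0)                       # column-centre responses
    u, s, vt = numpy.linalg.svd(a, full_matrices=False)
    T = u[:, :aopt] * s[:aopt]              # X-scores
    P = vt[:aopt].T                         # X-loadings
    Q = numpy.dot(b.T, T) / s[:aopt]**2     # y-loadings from regressing b on the scores
    B = numpy.dot(P, Q.T)                   # regression coefficients in original variable space
    return {'T': T, 'P': P, 'Q': Q, 'B': B}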
If neg.: no centering (-inf,..., matrix modes) - + SeeAlso: - + - pca : other blm - pls : other blm - lpls : other blm - + *Notes* ----- @@ -180,12 +180,12 @@ def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=[0, 0]): Uses kernel speed-up if m>>n or m<>> import scipy,engines >>> a=scipy.asarray([[1,2,3],[2,4,5]]) >>> b=scipy.asarray([[1,1],[2,3]]) @@ -222,9 +222,9 @@ def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=[0, 0]): F = b - dot(T, Q.T) sepy = F**2 ssqy = [sepy.sum(0), sepy.sum(1)] - + expvary = r_[0, 100*((T**2).sum(0)*(Q**2).sum(0)/(b**2).sum()).cumsum()[:aopt]] - + dat.update({'Q': Q, 'F': F, 'evy': expvary, 'ssqy': ssqy, 'mny': mny}) return dat @@ -245,9 +245,9 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]): Which component should get the scale center_axis : {-1, integer} Perform centering across given axis, (-1 is no centering) - + *Returns*: - + T : {array} X-scores W : {array} @@ -280,25 +280,25 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]): Sum of squared residuals in Y along each dimesion leverage : {array} Sample leverages - + *OtherParameters*: - + mode : ['normal', 'fast', 'detailed'], optional How much details to compute - + *SeeAlso*: - + `center` - data centering - + *Notes* - The output with mode='fast' will only return T and W - + - If residuals turn rank deficient, a lower number of component than given in input will be used. The number of components used is given in results. *Examples* - + >>> import numpy, engines >>> a = numpy.asarray([[1,2,3],[2,4,5]]) >>> b = numpy.asarray([[1,1],[2,3]]) @@ -307,7 +307,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]): array([0.,99.8561562, 100.]) """ - + m, n = X.shape try: k, l = Y.shape @@ -322,7 +322,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]): Y, mny = center(Y, center_axis[1]) min_aopt = min_aopt - 1 assert(aopt > 0 and aopt < min_aopt) - + W = empty((n, aopt)) P = empty((n, aopt)) R = empty((n, aopt)) @@ -330,7 +330,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]): T = empty((m, aopt)) B = empty((aopt, n, l)) tt = empty((aopt,)) - + XY = dot(X.T, Y) for i in range(aopt): if XY.shape[1] == 1: #pls 1 @@ -345,7 +345,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]): # with many samples, many x-vars and many non-orth y-vars (where arpack speed # shines) ############# - + #s, w = arpack.eigen_symmetric(dot(XY, XY.T),k=1, tol=1e-10, maxiter=1000) #if s[0] == 0: # print "Arpack did not converge... using svd" @@ -357,15 +357,15 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]): # print "Arpack did not converge... 
using svd" q, s, vh = svd(dot(XY.T, XY)) q = q[:,:1] - + w = dot(XY, q) w = w/vnorm(w) - + r = w.copy() if i > 0: for j in range(0, i, 1): r = r - dot(P[:,j].T, w)*R[:,j][:,newaxis] - + t = dot(X, r) tt[i] = tti = dot(t.T, t).ravel() p = dot(X.T, t)/tti @@ -385,7 +385,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]): R[:,i] = r.ravel() Q[:,i] = q.ravel() B[i] = dot(R[:,:i+1], Q[:,:i+1].T) - + qnorm = apply_along_axis(vnorm, 0, Q) tnorm = sqrt(tt) pp = (P**2).sum(0) @@ -412,13 +412,13 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]): sepy = F**2 ssqy = [sepy.sum(0), sepy.sum(1)] leverage = 1./m + ((T/tnorm)**2).sum(1) - + # variances tp= tt*pp tq = tt*qnorm*qnorm expvarx = r_[0, 100*tp/(X*X).sum()] expvary = r_[0, 100*tq/(Y*Y).sum()] - + if scale == 'loads': T = T/tnorm W = W*tnorm @@ -438,7 +438,7 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores', of these three matrices tries to discover common directions/subspaces. *Parameters*: - + X : {array} Main data matrix (m, n) Y : {array} @@ -457,9 +457,9 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores', 0 : row center 1 : column center 2 : double center - + *Returns*: - + T : {array} X-scores W : {array} @@ -504,18 +504,18 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores', Saeboe et al., LPLS-regression: a method for improved prediction and classification through inclusion of background information on predictor variables, J. of chemometrics and intell. laboratory syst. - + Martens et.al, Regression of a data matrix on descriptors of both its rows and of its columns via latent variables: L-PLSR, Computational statistics & data analysis, 2005 - + """ m, n = X.shape k, l = Y.shape u, o = Z.shape max_rank = min(m, n) - - + + if center_axis != None: xctr, yctr, zctr = center_axis X, mnX = center(X, xctr) @@ -523,14 +523,14 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores', Z, mnZ = center(Z, zctr) max_rank = max_rank -1 assert (a_max > 0 and a_max < max_rank), "Number of comp error:\ - tried: %d, max_rank: %d" %(a_max, max_rank) + tried: %d, max_rank: %d" %(a_max, max_rank) # initial variance varX = (X**2).sum() varY = (Y**2).sum() varZ = (Z**2).sum() - # initialize + # initialize U = empty((k, a_max)) Q = empty((l, a_max)) T = zeros((m, a_max)) @@ -600,7 +600,7 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores', else: k = w l = dot(G, w) - + U[:,a] = u.ravel() W[:,a] = w.ravel() P[:,a] = p.ravel() @@ -617,11 +617,11 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores', var_x[a] = pow(E, 2).sum() var_y[a] = pow(F, 2).sum() var_z[a] = pow(G, 2).sum() - + B[a] = dot(dot(W[:,:a+1], inv(dot(P[:,:a+1].T, W[:,:a+1]))), Q[:,:a+1].T) #b0[a] = mnY - dot(mnX, B[a]) - - + + # variance explained evx = 100.*(1 - var_x/varX) evy = 100.*(1 - var_y/varY) @@ -635,8 +635,8 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores', knorm = apply_along_axis(vnorm, 0, K) L = L*knorm K = K/knorm - - return {'T':T, 'W':W, 'P':P, 'Q':Q, 'U':U, 'L':L, 'K':K, 'B':B, 'E': E, 'F': F, 'G': G, 'evx':evx, 'evy':evy, 'evz':evz,'mnx': mnX, 'mny': mnY, 'mnz': mnZ} + + return {'T':T, 'W':W, 'P':P, 'Q':Q, 'U':U, 'L':L, 'K':K, 'B':B, 'E': E, 'F': F, 'G': G, 'evx':evx, 'evy':evy, 'evz':evz,'mnx': mnX, 'mny': mnY, 'mnz': mnZ} def lpls_predict(model_dict, x, aopt): """Predict lpls reponses from existing model on new 
data. @@ -646,25 +646,25 @@ def lpls_predict(model_dict, x, aopt): except: x = atleast_2d(x.shape) m, n = x.shape - + if 'B0' in model_dict.keys(): y = model_dict['B0'] + dot() - - + + def vnorm(a): """Returns the norm of a vector. *Parameters*: - + a : {array} Input data, 1-dim, or column vector (m, 1) *Returns*: - + a_norm : {array} Norm of input vector - + """ return msqrt(dot(a.T,a)) @@ -676,7 +676,7 @@ def center(a, axis): a : {array} Input data axis : {integer} - Which centering to perform. + Which centering to perform. 0 = col center, 1 = row center, 2 = double center -1 = nothing @@ -707,16 +707,16 @@ def center(a, axis): else: mn = a.mean()*ones(a.shape) return a - mn, mn - + if axis == -1: mn = zeros((1,a.shape[1],)) - mn = tile(mn, (a.shape[0], 1)) + #mn = tile(mn, (a.shape[0], 1)) elif axis == 0: mn = a.mean(0)[newaxis] - mn = tile(mn, (a.shape[0], 1)) + #mn = tile(mn, (a.shape[0], 1)) elif axis == 1: mn = a.mean(1)[:,newaxis] - mn = tile(mn, (1, a.shape[1])) + #mn = tile(mn, (1, a.shape[1])) elif axis == 2: #fixme: double centering returns column mean as loc-vector, ok? mn = a.mean(0)[newaxis] + a.mean(1)[:,newaxis] - a.mean() @@ -774,7 +774,7 @@ def _scale(a, axis): a : {array} Input data axis : {integer} - Which scaling to perform. + Which scaling to perform. 0 = column, 1 = row, -1 = nothing *Returns*: @@ -784,7 +784,7 @@ def _scale(a, axis): mn : {array} Scaling vector/matrix """ - + if axis == -1: sc = zeros((a.shape[1],)) elif axis == 0: @@ -817,21 +817,20 @@ def esvd(data, a_max=None): Singular values v : {array} Left hand eigenvectors - + *Notes*: Uses Anoldi iterations for the symmetric eigendecomp (ARPACK) - + """ - + m, n = data.shape - if m >= n: + if m > n: kernel = dot(data.T, data) - if a_max == None: a_max = n - 1 s, v = arpack.eigen_symmetric(kernel, k=a_max, which='LM', - maxiter=200, tol=1e-5) + maxiter=500, tol=1e-7) s = s[::-1] v = v[:,::-1] #u, s, vt = svd(kernel) @@ -841,9 +840,9 @@ def esvd(data, a_max=None): else: kernel = dot(data, data.T) if a_max == None: - a_max = m -1 + a_max = m - 1 s, u = arpack.eigen_symmetric(kernel, k=a_max, which='LM', - maxiter=200, tol=1e-5) + maxiter=500, tol=1e-7) s = s[::-1] u = u[:,::-1] #u, s, vt = svd(kernel) diff --git a/pyblm/models.py b/pyblm/models.py index b21926b..42e69cd 100644 --- a/pyblm/models.py +++ b/pyblm/models.py @@ -21,7 +21,7 @@ class Model(object): def __init__(self, name="johndoe"): self.name = name self.options = {} - + def save(self, filename='pca.ml'): pass @@ -46,7 +46,7 @@ class PCA(Model): self._x = x self.amax = amax self.aopt = amax - + # properties def amax(): doc = "maximum number of components" @@ -77,7 +77,7 @@ class PCA(Model): del self._tot_var return locals() tot_var = property(**tot_var()) - + def scores(): doc = "pca scores" def fget(self): @@ -94,7 +94,7 @@ class PCA(Model): del self._core_scores return locals() scores = property(**scores()) - + def loadings(): doc = "pca loadings" def fget(self): @@ -111,7 +111,7 @@ class PCA(Model): self._loadings = p return locals() loadings = property(**loadings()) - + def singvals(): doc = "Singular values" def fget(self): @@ -128,7 +128,7 @@ class PCA(Model): del self._singvals return locals() singvals = property(**singvals()) - + def x(): doc = "x is readonly, may not be deleted" def fget(self): @@ -154,12 +154,12 @@ class PCA(Model): del self._xc return locals() xadd = property(**xadd()) - + def xc(): doc = "mean_centered input data" def fget(self): if not hasattr(self, "_xc"): - self._xc = self.x + self.xadd + self._xc = self.x + 
self.xadd return self._xc def fset(self, xc): self._xc = xc @@ -186,7 +186,7 @@ class PCA(Model): del self._xw return locals() xw = property(**xw()) - + def explained_variance(): doc = "explained variance" def fget(self): @@ -215,7 +215,7 @@ class PCA(Model): del self._residuals return locals() residuals = property(**residuals()) - + def leverage(): doc = "objects leverage" def fget(self): @@ -254,7 +254,7 @@ class PCA(Model): self._column_metric = scale(self.xc, axis=1) return self._column_metric def fset(self, w): - + self._column_metric = w # update model def fdel(self): @@ -263,7 +263,7 @@ class PCA(Model): del self._xd return locals() column_metric = property(**column_metric()) - + def blm_update(self, a, b): pass @@ -281,8 +281,8 @@ class PCA(Model): def reweight(self, w): pass - - + + if __name__ == "__main__": from numpy.random import rand X = rand(4,10) diff --git a/pyblm/statistics.py b/pyblm/statistics.py index 6d9db02..b38e881 100644 --- a/pyblm/statistics.py +++ b/pyblm/statistics.py @@ -22,9 +22,9 @@ def hotelling(Pcv, P, p_center='median', cov_center='median', used in multivariate hypothesis testing. In order to avoid small variance samples to become significant this version allows borrowing variance from the pooled covariance. - + *Parameters*: - + Pcv : {array} Crossvalidation segements of paramter P : {array} @@ -39,9 +39,9 @@ def hotelling(Pcv, P, p_center='median', cov_center='median', Rotate sub-segments toward calibration model. strict : {boolean}, optional Only rotate 90 degree - + *Returns*: - + tsq : {array} Hotellings T^2 estimate @@ -50,19 +50,19 @@ def hotelling(Pcv, P, p_center='median', cov_center='median', Gidskehaug et al., A framework for significance analysis of gene expression datausing dimension reduction methods, BMC bioinformatics, 2007 - + *Notes* The rotational freedom in the solution of bilinear models may require that a rotation onto the calibration model. One way of doing that is procrustes rotation. - + """ m, n = P.shape n_sets, n, amax = Pcv.shape T_sq = empty((n,), dtype='d') Cov_i = zeros((n, amax, amax), dtype='d') - + # rotate sub_models to full model if crot: for i, Pi in enumerate(Pcv): @@ -77,20 +77,15 @@ def hotelling(Pcv, P, p_center='median', cov_center='median', P_ctr = P for i in xrange(n): - Pi = Pcv[:,i,:] # (n_sets x amax) + Pi = Pcv[:,i,:] # (n_sets x amax) Pi_ctr = P_ctr[i,:] # (1 x amax) - #Pim = (Pi - Pi_ctr)*msqrt(n_sets-1) - #Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim) - Pim = (Pi - Pi_ctr) - Cov_i[i] = dot(Pim.T, Pim) + Pim = (Pi - Pi_ctr)*msqrt(n_sets-1) + Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim) + if cov_center == 'median': Cov_p = median(Cov_i) - elif cov_center == 'mean': + else cov_center == 'mean': Cov_p = Cov.mean(0) - else: - print "Pooled covariance est. invalid, using median" - print cov_center - Cov_p = median(Cov_i) reg_cov = (1. - alpha)*Cov_i + alpha*Cov_p for i in xrange(n): Pc = P_ctr[i,:] @@ -105,7 +100,7 @@ def procrustes(a, b, strict=True, center=False, force_norm=False, verbose=False) onto another by minimising the squared error. 
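The orthogonal Procrustes solution referred to here (Schonemann, 1966) reduces to a single SVD; the sketch below shows only that core rotation, while the module's procrustes additionally handles centring, norming and the strict/flip restriction.

import numpy

def procrustes_sketch(a, b):
    # rotation that maps b onto a in the least-squares sense
    u, s, vt = numpy.linalg.svd(numpy.dot(b.T, a))
    Cm = numpy.dot(u, vt)          # orthogonal rotation (and possible reflection)
    return numpy.dot(b, Cm)        # b rotated onto a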
*Parameters*: - + a : {array} Input array, stationary b : {array} @@ -127,9 +122,9 @@ def procrustes(a, b, strict=True, center=False, force_norm=False, verbose=False) *Reference*: Schonemann, A generalized solution of the orthogonal Procrustes - problem, Psychometrika, 1966 + problem, Psychometrika, 1966 """ - + if center: mn_a = a.mean(0) a = a - mn_a @@ -145,7 +140,7 @@ def procrustes(a, b, strict=True, center=False, force_norm=False, verbose=False) u, s, vt = svd(dot(b.T, a)) Cm = dot(u, vt) # Cm: orthogonal rotation matrix if strict: - Cm = _ensure_strict(Cm) + Cm = _ensure_strict(Cm) b_rot = dot(b, Cm) if verbose: fit = ((b - b_rot)**2).sum() @@ -158,7 +153,7 @@ def procrustes(a, b, strict=True, center=False, force_norm=False, verbose=False) def _ensure_strict(C, only_flips=True): """Ensure that a rotation matrix does only 90 degree rotations. - + In multiplication with pcs this allows flips and reordering. if only_flips is True there will onlt be flips allowed @@ -173,13 +168,13 @@ def _ensure_strict(C, only_flips=True): C_rot : {array} Restricted rotation matrix - + *Notes*: - + This function is not ready for use. Use (only_flips=True). That is, for more than two components, the rotation matrix has a tendency to be unstable (det(Cm)>1), when rounding is used. - + """ if only_flips: C = eye(C.shape[0])*sign(C) @@ -199,9 +194,9 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20, The response (Y) is randomly permuted, and the number of false positives is registered by comparing hotellings T2 statistics of the calibration model. - + *Parameters*: - + X : {array} Main data matrix (m, n) Y : {array} @@ -237,14 +232,14 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20, nsets : {integer} Number of crossvalidation segements - + *Reference*: Gidskehaug et al., A framework for significance analysis of gene expression data using dimension reduction methods, BMC bioinformatics, 2007 """ - + m, n = X.shape k, nz = Z.shape assert(n==nz) @@ -255,7 +250,7 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20, Y = atleast_2d(Y).T my, l = Y.shape assert(m==my) - + pert_tsq_x = zeros((n, n_iter), dtype='d') # (nxvars x n_subsets) pert_tsq_z = zeros((k, n_iter), dtype='d') # (nzvars x n_subsets) @@ -264,7 +259,7 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20, Wc, Lc = lpls_jk(X, Y, Z ,aopt, zorth=zorth) cal_tsq_x = hotelling(Wc, dat['W'], alpha=alpha) cal_tsq_z = hotelling(Lc, dat['L'], alpha=alpha) - print "morn" + # Perturbations index = arange(m) for i in range(n_iter): @@ -275,7 +270,7 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20, pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha) # no reason to borrow variance in dag (alpha ->some small value) pert_tsq_z[:,i] = hotelling(Li, dat['L'], alpha=0.01) - + return cal_tsq_z, pert_tsq_z, cal_tsq_x, pert_tsq_x @@ -286,9 +281,9 @@ def pls_qvals(X, Y, aopt, alpha=.3, n_iter=20,p_center='med', cov_center=median, The response (Y) is randomly permuted, and the number of false positives is registered by comparing hotellings T2 statistics of the calibration model. 
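A hedged sketch of the permutation scheme described above: the rows of Y are shuffled, the calibration model and its jack-knife segments are refitted, and the perturbed Hotelling T^2 values are collected. The callables fit, jackknife and hotelling stand in for the module's pls/lpls, pls_jk/lpls_jk and hotelling, and are assumptions of this illustration only.

import numpy

def permutation_tsq_sketch(X, Y, aopt, n_iter, fit, jackknife, hotelling):
    m, n = X.shape
    pert_tsq = numpy.zeros((n, n_iter), dtype='d')   # (n x-variables, n_iter permutations)
    index = numpy.arange(m)
    for i in range(n_iter):
        indi = index.copy()
        numpy.random.shuffle(indi)                   # permute the response rows
        dat = fit(X, Y[indi, :], aopt)               # calibration model on permuted Y
        W_jk = jackknife(X, Y[indi, :], aopt)        # jack-knife segments of the weights
        pert_tsq[:, i] = hotelling(W_jk, dat['W'])   # perturbed T^2 per x-variable
    return pert_tsq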
- + *Parameters*: - + X : {array} Main data matrix (m, n) Y : {array} @@ -318,14 +313,14 @@ def pls_qvals(X, Y, aopt, alpha=.3, n_iter=20,p_center='med', cov_center=median, Only rotate 90 degree nsets : {integer} Number of crossvalidation segements - + *Reference*: Gidskehaug et al., A framework for significance analysis of gene expression data using dimension reduction methods, BMC bioinformatics, 2007 """ - + m, n = X.shape try: my, l = Y.shape @@ -341,7 +336,7 @@ def pls_qvals(X, Y, aopt, alpha=.3, n_iter=20,p_center='med', cov_center=median, Wc = pls_jk(X, Y , aopt) cal_tsq_x = hotelling(Wc, dat['W'], alpha=alpha) - + # Perturbations pert_tsq_x = zeros((n, n_iter), dtype='d') # (nxvars x n_subsets) index = arange(m) @@ -351,7 +346,7 @@ def pls_qvals(X, Y, aopt, alpha=.3, n_iter=20,p_center='med', cov_center=median, dat = pls(X, Y[indi,:], aopt, scale='loads', center_axis=center_axis) Wi = pls_jk(X, Y[indi,:], aopt, nsets=nsets, center_axis=center_axis) pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha) - + return cal_tsq_x, pert_tsq_x @@ -362,7 +357,7 @@ def _fdr(tsq, tsqp, loc_method=median): Fdr is a method used in multiple hypothesis testing to correct for multiple comparisons. It controls the expected proportion of incorrectly rejected null hypotheses (type I errors) in a list of rejected hypotheses. - + *Parameters*: tsq : {array} @@ -372,14 +367,14 @@ def _fdr(tsq, tsqp, loc_method=median): loc_method : {py_func} Location method - + *Returns*: fdr : {array} False discovery rate *Notes*: - + This is an internal function for use in fdr estimation of jack-knifed perturbated blm parameters. @@ -403,4 +398,3 @@ def _fdr(tsq, tsqp, loc_method=median): fd_rate = fp/n_signif fd_rate[fd_rate>1] = 1 return fd_rate -
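For completeness, a sketch of the false discovery rate estimate that _fdr describes: for each calibration T^2 value, the typical number of permuted values exceeding it is divided by the number of calibration values called significant at that threshold, and the ratio is capped at one. It assumes tsq is (n,) and tsqp is (n, n_iter), as in the callers above; it is not the module's exact implementation.

import numpy

def fdr_sketch(tsq, tsqp, loc_method=numpy.median):
    fd_rate = numpy.empty(tsq.shape[0], dtype='d')
    for i, t in enumerate(tsq):
        fp = loc_method((tsqp >= t).sum(0))      # typical count of permuted T^2 above t
        n_signif = (tsq >= t).sum()              # calibration values declared significant at t
        fd_rate[i] = min(1.0, float(fp) / n_signif)
    return fd_rate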