From 4866984e0093bd51064628ad4ba5bc203432fcd9 Mon Sep 17 00:00:00 2001
From: flatberg
Date: Wed, 10 Oct 2007 12:19:21 +0000
Subject: [PATCH] Initial import

---
 COPYING                  |   0
 README                   |   0
 __init__.py              |  15 ++
 crossvalidation.py       | 264 ++++++++++++++++++++++++++++++++++
 engines.py               | 298 +++++++++++++++++++++++++++++++++++++++
 setup.py                 |  50 +++++++
 statistics.py            | 296 ++++++++++++++++++++++++++++++++++++++
 tests/test_lplsengine.py | 143 +++++++++++++++++++
 8 files changed, 1066 insertions(+)
 create mode 100644 COPYING
 create mode 100644 README
 create mode 100644 __init__.py
 create mode 100644 crossvalidation.py
 create mode 100644 engines.py
 create mode 100644 setup.py
 create mode 100644 statistics.py
 create mode 100644 tests/test_lplsengine.py

diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..e69de29
diff --git a/README b/README
new file mode 100644
index 0000000..e69de29
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..42ebe6c
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,15 @@
+from crossvalidation import lpls_val
+from statistics import lpls_qvals
+from engines import nipals_lpls as lpls
+#from tests import *
+
+__version__ = '0.0.1'
+
+def test(level=1, verbosity=1):
+    import os, sys
+    print 'lplslib is installed in %s' % (os.path.split(__file__)[0],)
+    print 'lplslib version %s' % (__version__,)
+    print 'Python version %s' % (sys.version.replace('\n', '',),)
+    from numpy.testing import NumpyTest
+    return NumpyTest().test(level, verbosity)
+
diff --git a/crossvalidation.py b/crossvalidation.py
new file mode 100644
index 0000000..000fbef
--- /dev/null
+++ b/crossvalidation.py
@@ -0,0 +1,264 @@
+"""This module implements some validation schemes for l-pls.
+
+The primary use is crossvalidation.
+"""
+__all__ = ['lpls_val', 'lpls_jk']
+__docformat__ = "restructuredtext en"
+
+from numpy import dot,empty,zeros,sqrt,atleast_2d,argmax,asarray,median,\
+     newaxis,array_split
+
+from engines import nipals_lpls as lpls
+
+
+def lpls_val(X, Y, Z, a_max=2, nsets=None, alpha=.5, mean_ctr=[2,0,2], verbose=True):
+    """Performs crossvalidation to estimate the generalisation error in lpls.
+
+    The L-PLS crossvalidation is estimated just like an ordinary pls
+    crossvalidation. That is, the generalisation error is estimated by
+    predicting samples (rows of X and Y) left out according to a cross
+    validation scheme.
+
+    *Parameters*:
+
+    X : {array}
+        Main data matrix (m, n)
+    Y : {array}
+        External row data (m, l)
+    Z : {array}
+        External column data (o, n)
+    a_max : {integer}, optional
+        Maximum number of components to calculate (0, min(m,n))
+    nsets : {integer}, optional
+        Number of crossvalidation sets
+    alpha : {float}, optional
+        Parameter to control the amount of influence from the Z-matrix.
+        0 is none (which returns a pls solution), 1 is max
+    mean_ctr : {array-like}, optional
+        A three element array-like structure with elements in [-1,0,1,2],
+        that decides the type of centering used.
+        -1 : nothing
+        0 : row center
+        1 : column center
+        2 : double center
+    verbose : {boolean}, optional
+        Verbosity of console output. For use in debugging.
+
+    *Returns*:
+
+    rmsep : {array}
+        Root mean squared error of prediction
+    yhat : {array}
+        Estimated responses
+    aopt : {integer}
+        Estimated optimal number of components
+
+    """
+
+    m, n = X.shape
+    k, l = Y.shape
+    o, p = Z.shape
+    assert m==k, "X (%d,%d) - Y (%d,%d) dim mismatch" %(m,n,k,l)
+    assert n==p, "X (%d,%d) - Z (%d,%d) dim mismatch" %(m,n,o,p)
+    if nsets is None:
+        nsets = m
+    if nsets > X.shape[0]:
+        print "nsets (%d) is larger than the number of samples (%d).\nnsets: %d -> %d" %(nsets, m, nsets, m)
+        nsets = m
+    assert (alpha >= 0 and alpha <= 1), "Alpha needs to be within [0,1], got: %.2f" %alpha
+
+    Yhat = empty((a_max, k, l), 'd')
+    for cal, val in cv(k, nsets):
+        dat = lpls(X[cal], Y[cal], Z, a_max=a_max, alpha=alpha, mean_ctr=mean_ctr, verbose=verbose)
+        if mean_ctr[0] != 1:
+            xi = X[val,:] - dat['mnx']
+        else:
+            xi = X[val] - X[val].mean(1)[:,newaxis]
+        if mean_ctr[2] != 1:
+            ym = dat['mny']
+        else:
+            ym = Y[val].mean(1)[:,newaxis] #???: check this
+        for a in range(a_max):
+            Yhat[a,val,:] = atleast_2d(ym + dot(xi, dat['B'][a]))
+
+    # todo: need better support for classification error
+    y_is_class = Y.dtype.char.lower() in ['i', 'p', 'b', 'h', '?']
+    if y_is_class:
+        Yhat, err = class_error(Yhat, Y)
+        return Yhat, err
+
+    sep = (Y - Yhat)**2
+    rmsep = sqrt(sep.mean(1)).T
+    aopt = find_aopt_from_sep(rmsep)
+
+    return rmsep, Yhat, aopt
+
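+# Shape sketch for ``lpls_val`` (illustration only; the toy dimensions below
+# are assumptions, not part of the library):
+# with X (20, 50), Y (20, 3), Z (8, 50) and a_max=4,
+#     rmsep, Yhat, aopt = lpls_val(X, Y, Z, a_max=4, nsets=10)
+# returns rmsep with shape (3, 4) -- one RMSEP curve per Y-column -- and
+# Yhat with shape (4, 20, 3), i.e. one set of predictions per component.
+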
+def lpls_jk(X, Y, Z, a_max, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2], verbose=False):
+    """Returns jack-knifed segments of an lpls model.
+
+    Jack-knifing is a method to perturb the model parameters, hopefully
+    so that each perturbation is representative of a typical perturbation
+    by a *future* sample. The mean and variance of the jack-knife segments
+    may be used to infer the parameter confidence in the model.
+
+    The segments returned are the X-block weights and Z-block weights.
+
+
+    *Parameters*:
+
+    X : {array}
+        Main data matrix (m, n)
+    Y : {array}
+        External row data (m, l)
+    Z : {array}
+        External column data (o, n)
+    a_max : {integer}
+        Maximum number of components to calculate (0, min(m,n))
+    nsets : {integer}, optional
+        Number of jack-knife segments
+    xz_alpha : {float}, optional
+        Parameter to control the amount of influence from the Z-matrix.
+        0 is none (which returns a pls solution), 1 is max
+    mean_ctr : {array-like}, optional
+        A three element array-like structure with elements in [-1,0,1,2],
+        that decides the type of centering used.
+        -1 : nothing
+        0 : row center
+        1 : column center
+        2 : double center
+    verbose : {boolean}, optional
+        Verbosity of console output. For use in debugging.
+
+    *Returns*:
+
+    Wx : {array}
+        X-block jack-knife segments
+    Wz : {array}
+        Z-block jack-knife segments
+    """
+
+    m, n = X.shape
+    k, l = Y.shape
+    o, p = Z.shape
+    assert(m==k)
+    assert(n==p)
+    if nsets is None:
+        nsets = m
+    WWx = empty((nsets, n, a_max), 'd')
+    WWz = empty((nsets, o, a_max), 'd')
+    #WWy = empty((nsets, l, a_max), 'd')
+    for i, (cal, val) in enumerate(cv(k, nsets)):
+        dat = lpls(X[cal], Y[cal], Z, a_max=a_max, alpha=xz_alpha, mean_ctr=mean_ctr,
+                   scale='loads', verbose=verbose)
+        WWx[i,:,:] = dat['W']
+        WWz[i,:,:] = dat['L']
+        #WWy[i,:,:] = dat['Q']
+
+    return WWx, WWz
+
+def find_aopt_from_sep(sep, method='vanilla'):
+    """Returns an estimate of the optimal number of components.
+
+    The estimate is based on the squared error of prediction from
+    crossvalidation. This is pretty much wild guessing and it is
+    recommended to inspect model parameters and prediction errors
+    closely before deciding on the optimal number of components.
+
+    *Parameters*:
+
+    sep : {array}
+        Squared error of prediction
+    method : ['vanilla', '75perc']
+        Method used to estimate the optimal number of components
+
+    *Returns*:
+
+    aopt : {integer}
+        A guess on the optimal number of components
+    """
+
+    if method=='vanilla':
+        # min rmsep
+        rmsecv = sqrt(sep.mean(0))
+        return rmsecv.argmin() + 1
+
+    elif method=='75perc':
+        # first component where the median sep falls below the 75th
+        # percentile of the previous component
+        prct = .75 #percentile
+        ind = 1.*sep.shape[0]*prct
+        med = median(sep)
+        prc_75 = []
+        for col in sep.T:
+            col = sorted(col)
+            prc_75.append(col[int(ind)])
+        prc_75 = asarray(prc_75)
+        for i in range(1, sep.shape[1], 1):
+            if med[i-1]<prc_75[i]:
+                return i
+        return sep.shape[1]
+
+
+def cv(N, K, randomise=False, sequential=False):
+    """Generates K (training, validation) index pairs over N samples.
+
+    Each pair is a partition of the indices 0..N-1, where validation
+    contains roughly N/K of the indices.
+    """
+    if K>N:
+        raise ValueError, "You cannot divide a list of %d samples into more than %d segments. You tried: %d" %(N,N,K)
+    index = xrange(N)
+    if randomise:
+        from random import shuffle
+        index = list(index)
+        shuffle(index)
+        sequential = False
+    if sequential:
+        for validation in array_split(index, K):
+            training = [i for i in index if i not in validation]
+            yield training, validation
+    else:
+        for k in xrange(K):
+            training = [i for i in index if i % K != k]
+            validation = [i for i in index if i % K == k]
+            yield training, validation
+
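+# Example of the default (modulo) segmentation produced by ``cv``
+# (illustration; the output below follows from the loop above):
+#     >>> list(cv(4, 2))
+#     [([1, 3], [0, 2]), ([0, 2], [1, 3])]
+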
+def class_error(Yhat, Y, method='vanilla'):
+    """ Not used.
+    """
+    a_max, k, l = Yhat.shape
+    Yhat_c = zeros((a_max, k, l), dtype='d')
+    for a in range(a_max):
+        for i in range(k):
+            Yhat_c[a, i, argmax(Yhat[a,i,:])] = 1.0
+    err = 100*((Yhat_c + Y) == 2).sum(1)/Y.sum(0).astype('d')
+
+    return Yhat_c, err
+
+def class_errorII(T, Y, method='lda'):
+    """ Not used ...
+    """
+    pass
diff --git a/engines.py b/engines.py
new file mode 100644
index 0000000..fa44244
--- /dev/null
+++ b/engines.py
@@ -0,0 +1,298 @@
+"""This module contains algorithms for the low-rank L-shaped model.
+
+"""
+
+__all__ = ['nipals_lpls']
+__docformat__ = "restructuredtext en"
+
+from math import sqrt as msqrt
+
+from numpy import dot,empty,zeros,apply_along_axis,newaxis,finfo
+from numpy.linalg import inv
+
+
+def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=False):
+    """ L-shaped Partial Least Squares Regression by the nipals algorithm.
+
+    An L-shaped low-rank model approximates three matrices in an L-shaped
+    structure. That is, the main matrix (X) has one matrix associated
+    with its row space and one associated with its column space. A
+    simultaneous low-rank estimate of these three matrices tries to
+    discover common directions/subspaces.
+
+    *Parameters*:
+
+    X : {array}
+        Main data matrix (m, n)
+    Y : {array}
+        External row data (m, l)
+    Z : {array}
+        External column data (o, n)
+    a_max : {integer}
+        Maximum number of components to calculate (0, min(m,n))
+    alpha : {float}, optional
+        Parameter to control the amount of influence from the Z-matrix.
+        0 is none (which returns a pls solution), 1 is max
+    mean_ctr : {array-like}, optional
+        A three element array-like structure with elements in [-1,0,1,2],
+        that decides the type of centering used.
+        -1 : nothing
+        0 : row center
+        1 : column center
+        2 : double center
+    scale : {'scores', 'loads'}, optional
+        Option to decide on where the scale goes.
+    verbose : {boolean}, optional
+        Verbosity of console output. For use in debugging.
+
+    *Returns*:
+
+    T : {array}
+        X-scores
+    W : {array}
+        X-weights/Z-weights
+    P : {array}
+        X-loadings
+    Q : {array}
+        Y-loadings
+    U : {array}
+        X-Y relation
+    L : {array}
+        Z-scores
+    K : {array}
+        Z-loads
+    B : {array}
+        Regression coefficients X->Y
+    evx : {array}
+        X-explained variance
+    evy : {array}
+        Y-explained variance
+    evz : {array}
+        Z-explained variance
+    mnx : {array}
+        X location
+    mny : {array}
+        Y location
+    mnz : {array}
+        Z location
+
+    *References*
+
+    Saeboe et al., LPLS-regression: a method for improved prediction and
+    classification through inclusion of background information on
+    predictor variables, Chemometrics and Intelligent Laboratory Systems.
+
+    Martens et al., Regression of a data matrix on descriptors of
+    both its rows and of its columns via latent variables: L-PLSR,
+    Computational Statistics & Data Analysis, 2005.
+
+    """
+    m, n = X.shape
+    k, l = Y.shape
+    u, o = Z.shape
+    max_rank = min(m, n)
+    assert (a_max>0 and a_max<max_rank)
diff --git a/statistics.py b/statistics.py
new file mode 100644
--- /dev/null
+++ b/statistics.py
@@ -0,0 +1,296 @@
+    Cm[abs(Cm)>.6] = 1.
+    if det(Cm)>1:
+        raise NotImplementedError
+    return Cm*S
+
+def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
+               sim_method='shuffle', p_center='med', cov_center=median,
+               crot=True, strict=False, mean_ctr=[2,0,2], nsets=None):
+
+    """Returns q-values for an l-pls model by permutation analysis.
+
+    The response (Y) is randomly permuted, and the number of false positives
+    is registered by comparison with the Hotelling T2 statistics of the
+    calibration model.
+
+    *Parameters*:
+
+    X : {array}
+        Main data matrix (m, n)
+    Y : {array}
+        External row data (m, l)
+    Z : {array}
+        External column data (k, n)
+    aopt : {integer}
+        Optimal number of components
+    zx_alpha : {float}, optional
+        Parameter to control the amount of influence from the Z-matrix.
+        0 is none (which returns a pls solution), 1 is max
+    mean_ctr : {array-like}, optional
+        A three element array-like structure with elements in [-1,0,1,2],
+        that decides the type of centering used.
+        -1 : nothing
+        0 : row center
+        1 : column center
+        2 : double center
+    n_iter : {integer}, optional
+        Number of permutations
+    sim_method : ['shuffle'], optional
+        Permutation method
+    p_center : {'med', 'mean', 'cal_model'}, optional
+        Location method for sub-segments
+    cov_center : {py_func}, optional
+        Location function
+    alpha : {float}, optional
+        Regularisation towards the pooled covariance estimate.
+    crot : {boolean}, optional
+        Rotate sub-segments towards the calibration model.
+    strict : {boolean}, optional
+        Only rotate by 90 degrees
+    nsets : {integer}
+        Number of crossvalidation segments
+
+    *Reference*:
+
+    Gidskehaug et al., A framework for significance analysis of
+    gene expression data using dimension reduction methods, BMC
+    Bioinformatics, 2007
+    """
+
+    m, n = X.shape
+    k, nz = Z.shape
+    assert(n==nz)
+    try:
+        my, l = Y.shape
+    except:
+        # make Y a column vector
+        Y = atleast_2d(Y).T
+        my, l = Y.shape
+    assert(m==my)
+
+    pert_tsq_x = zeros((n, n_iter), dtype='d') # (nxvars x n_subsets)
+    pert_tsq_z = zeros((k, n_iter), dtype='d') # (nzvars x n_subsets)
+
+    # Full model
+    dat = lpls(X, Y, Z, aopt, alpha=zx_alpha, scale='loads', mean_ctr=mean_ctr)
+    Wc, Lc = lpls_jk(X, Y, Z, aopt, nsets=nsets)
+    cal_tsq_x = hotelling(Wc, dat['W'], alpha=alpha)
+    cal_tsq_z = hotelling(Lc, dat['L'], alpha=alpha)
+
+    # Perturbations
+    index = arange(m)
+    for i in range(n_iter):
+        indi = index.copy()
+        shuffle(indi)
+        dat = lpls(X, Y[indi,:], Z, aopt, alpha=zx_alpha, scale='loads', mean_ctr=mean_ctr)
+        Wi, Li = lpls_jk(X, Y[indi,:], Z, aopt, nsets=nsets)
+        pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha)
+        pert_tsq_z[:,i] = hotelling(Li, dat['L'], alpha=alpha)
+
+    return _fdr(cal_tsq_z, pert_tsq_z, median), _fdr(cal_tsq_x, pert_tsq_x, median)
+
+def _fdr(tsq, tsqp, loc_method=median):
+    """Returns the false discovery rate.
+
+    FDR is a method used in multiple hypothesis testing to correct for
+    multiple comparisons. It controls the expected proportion of incorrectly
+    rejected null hypotheses (type I errors) in a list of rejected hypotheses.
+
+    *Parameters*:
+
+    tsq : {array}
+        Hotelling's T2, calibration model
+    tsqp : {array}
+        Hotelling's T2, submodels
+    loc_method : {py_func}
+        Location method
+
+    *Returns*:
+
+    fdr : {array}
+        False discovery rate
+
+    """
+    n, = tsq.shape
+    k, m = tsqp.shape
+    assert(n==k)
+    n_false = empty((n, m), 'd')
+    sort_index = argsort(tsq)[::-1]
+    r_index = argsort(sort_index)
+    for i in xrange(m):
+        for j in xrange(n):
+            n_false[j,i] = (tsqp[:,i]>tsq[j]).sum()
+    fp = loc_method(n_false.T)
+    n_signif = (arange(n) + 1.0)[r_index]
+    fd_rate = fp/n_signif
+    fd_rate[fd_rate>1] = 1
+    return fd_rate
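+
+# Worked micro-example of ``_fdr`` (illustrative numbers, not from the
+# library): tsq = [5, 1, 3] with a single permutation tsqp[:,0] = [2, 0, 4]
+# gives n_false = [0, 2, 1] (perturbed stats exceeding each tsq[j]),
+# descending ranks n_signif = [1, 3, 2], and hence
+# fdr = [0/1, 2/3, 1/2] = [0.0, 0.667, 0.5].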
+""" +from math import sqrt as msqrt + +from numpy.testing import * +set_package_path() +from lplslib import lpls +from numpy import dot, eye, random,asarray,empty +from numpy.random import rand, randn +from numpy.linalg import svd,norm +restore_path() + +def blm_array(shape=(5,10), comp=3, noise=0,seed=1,dtype='d'): + assert(min(*shape)>=comp) + random.seed(seed) + t = rand(shape[0], comp) + p = rand(shape[1], comp) + x = dot(t, p.T) + if noise>0: + noise = noise*randn(*shape) + return x + noise + + +class LplsTestCase(NumpyTestCase): + def setUp(self): + self.a = blm_array(shape=(5,10),noise=.1) + self.b = blm_array(shape=(5,3), noise=.1) + self.c = blm_array(shape=(10,10), noise=.1) + self.nc = 2 + + def check_single(self): + self.a = asarray(self.a, dtype='f') + self.b = asarray(self.b, dtype='f') + self.c = asarray(self.c, dtype='f') + self.do() + + def check_double(self): + a = asarray(self.a, dtype='d') + b = asarray(self.b, dtype='d') + c = asarray(self.c, dtype='d') + self.do() + + def do(self,*args): + pass + #raise NotImplementedError + +class testAlphaZero(LplsTestCase): + def do(self): + #dat = lpls(self.a, self.b, self.c, self.nc, alpha=0.0) + + #assert_almost_equal(t1, t2) + pass + +class testAlphaOne(LplsTestCase): + pass + +class testZidentity(LplsTestCase): + def do(self): + I = eye(self.a.shape[1]) + dat = lpls(self.a, self.b, I, 2, alpha=1.0) + dat2 = lpls(self.a, self.b, self.c, self.nc, alpha=0.0) + assert_almost_equal(dat['T'], dat2['T']) + +class testYidentity(LplsTestCase): + def do(self): + I = eye(self.b.shape[0], dtype=self.a.dtype) + T0 = lpls(self.a, I, self.c, self.nc, alpha=0.0, mean_ctr=[-1,-1,-1])['T'] + u, s, vt = svd(self.a, 0) + T = u*s + assert_almost_equal(abs(T0), abs(T[:,:self.nc]),5) + +class testWideX(LplsTestCase): + pass + +class testTallX(LplsTestCase): + pass + +class testWideY(LplsTestCase): + pass + +class testTallY(LplsTestCase): + pass + +class testWideZ(LplsTestCase): + pass + +class testTallZ(LplsTestCase): + pass + +class testRankDeficientX(LplsTestCase): + pass + +class testRankDeficientY(LplsTestCase): + pass + +class testRankDeficientZ(LplsTestCase): + pass + +class testCenterX(LplsTestCase): + def do(self): + T = lpls(self.a, self.b, self.c, self.nc, mean_ctr=[0,-1,-1])['T'] + assert_almost_equal(T.mean(0), 0) + W = lpls(self.a, self.b, self.c, self.nc, alpha=0,mean_ctr=[1,-1,-1])['W'] + assert_almost_equal(W.mean(0), 0) + + +class testResiduals(NumpyTestCase): + def setUp(self): + self.a = blm_array(shape=(5,5),noise=0, comp=3) + self.b = self.a.copy() + self.c = self.a.copy().T + self.nc = 3 + + def check_single(self): + self.a = asarray(self.a, dtype='f') + self.b = asarray(self.b, dtype='f') + self.c = asarray(self.c, dtype='f') + self.do() + + def check_double(self): + a = asarray(self.a, dtype='d') + b = asarray(self.b, dtype='d') + c = asarray(self.c, dtype='d') + self.do() + + def do(self): + dat = lpls(self.a, self.b, self.c, self.nc, mean_ctr=[-1,-1,-1]) + + +class testOrthogonality(LplsTestCase): + def do(self): + dat = lpls(self.a, self.b, self.c, self.nc, mean_ctr=[0,0,0],scale='loads') + T, W, L, E, F = dat['T'],dat['W'],dat['L'],dat['E'],dat['F'] + assert_almost_equal(dot(T.T,T), eye(T.shape[1])) + for i,w in enumerate(W.T): + W[:,i] = w/norm(w) + assert_almost_equal(dot(W.T, W), eye(W.shape[1]), 3) + assert_almost_equal(dot(T.T,E), 0, 3) + assert_almost_equal(dot(T.T,F), 0, 3) + + +if __name__ == '__main__': + NumpyTest().run()