Projects/pyblm

Added test on lpls, updated centering in blm, added pls_qvals

commit 8e9c4c1c58
parent 4c809674bb
Author: Arnar Flatberg
Date: 2007-12-03 14:48:10 +00:00
5 changed files with 199 additions and 68 deletions

__init__.py

@@ -1,5 +1,5 @@
 from crossvalidation import lpls_val
-from statistics import lpls_qvals
+from statistics import lpls_qvals, pls_qvals
 from engines import pca, pcr, pls
 from engines import nipals_lpls as lpls
 #from tests import *
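
With pls_qvals exported beside lpls_qvals, both permutation-based q-value routines are reachable from the package top level. A minimal usage sketch (the package name pyblm and the random data are assumptions for illustration, not part of the commit):

    import numpy as np
    from pyblm import pls, pls_qvals   # assumes the package installs as 'pyblm'

    X = np.random.rand(20, 50)   # 20 samples, 50 variables
    Y = np.random.rand(20, 2)    # two response columns
    dat = pls(X, Y, aopt=3)                               # fitted model dictionary
    qvals = pls_qvals(X, Y, aopt=3, n_iter=20, nsets=10)  # per-variable q-values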

crossvalidation.py

@@ -9,10 +9,11 @@ from numpy import dot,empty,zeros,sqrt,atleast_2d,argmax,asarray,median,\
     array_split,arange
 from numpy.random import shuffle

+from engines import pls
 from engines import nipals_lpls as lpls

-def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2], zorth=False, verbose=False):
+def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=False, verbose=False):
     """Performs crossvalidation for generalisation error in lpls.

     The L-PLS crossvalidation is estimated just like an ordinary pls
@@ -35,7 +36,7 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2], zorth=False, verbose=False):
     alpha : {float}, optional
         Parameter to control the amount of influence from Z-matrix.
         0 is none, which returns a pls-solution, 1 is max
-    mean_center : {array-like}, optional
+    center_axis : {array-like}, optional
         A three element array-like structure with elements in [-1,0,1,2],
         that decides the type of centering used.
         -1 : nothing
@@ -74,14 +75,14 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2], zorth=False, verbose=False):
     for cal, val in cv(nsets, k):
         # do the training model
         dat = lpls(X[cal], Y[cal], Z, a_max=a_max, alpha=alpha,
-                   mean_ctr=mean_ctr, zorth=zorth, verbose=verbose)
+                   center_axis=center_axis, zorth=zorth, verbose=verbose)

         # center test data
-        if mean_ctr[0] != 1:
+        if center_axis[0] != 1:
             xi = X[val,:] - dat['mnx']
         else:
             xi = X[val] - X[cal].mean(1)[:,newaxis]
-        if mean_ctr[2] != 1:
+        if center_axis[2] != 1:
             ym = dat['mny']
         else:
             ym = Y[cal].mean(1)[:,newaxis]
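
The held-out rows above are centered with the locations the training model stored under 'mnx' and 'mny', unless column centering was requested. For reference, a standalone numpy sketch of the four center_axis modes described in the docstring (an illustration mirroring what center() in engines computes, not library code):

    import numpy as np

    a = np.arange(12, dtype='d').reshape(4, 3)
    mn_none = np.zeros((1, a.shape[1]))      # -1 : nothing
    mn_row  = a.mean(0)[np.newaxis]          #  0 : row center (subtract column means)
    mn_col  = a.mean(1)[:, np.newaxis]       #  1 : column center (subtract row means)
    mn_dbl  = mn_row + mn_col - a.mean()     #  2 : double center
    assert np.allclose((a - mn_dbl).mean(0), 0.0)   # double-centered matrix has
    assert np.allclose((a - mn_dbl).mean(1), 0.0)   # zero-mean rows and columns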
@@ -94,7 +95,7 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2], zorth=False, verbose=False):
     # for n in range(10):
     #     shuffle(cal)
     #     dat = lpls(xcal, Y[cal], Z, a_max=a_max, alpha=alpha,
-    #                mean_ctr=mean_ctr, verbose=verbose)
+    #                center_axis=center_axis, verbose=verbose)

     # todo: need a better support for classification error
@@ -110,7 +111,7 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2], zorth=False, verbose=False):
     return rmsep, Yhat, aopt

-def pca_jk(a, aopt, n_blocks=None):
+def pca_jk(a, aopt, center_axis=[0], nsets=None):
     """Returns jack-knife segements from PCA.

     *Parameters*:
@@ -119,6 +120,8 @@ def pca_jk(a, aopt, n_blocks=None):
         data matrix (n x m)
     aopt : {integer}
         number of components in model.
+    center_axis:
+        Centering
     nsets : {integer}
         number of segments
@@ -141,13 +144,13 @@ def pca_jk(a, aopt, n_blocks=None):
         old_values = a.take(ind)
         new_values = mn_a.take(ind)
         a.put(ind, new_values)
-        dat = pca(a, aopt, mode='fast', scale='loads', center=center)
-        PP[nn,:,:] = dat['P']
+        dat = pca(a, aopt, mode='fast', scale='loads', center_axis=center_axis)
+        PP[i,:,:] = dat['P']
         a.put(ind, old_values)
     return PP

-def pls_jk(X, Y, a_opt, nsets=None, center=True, verbose=False):
+def pls_jk(X, Y, a_opt, nsets=None, center_axis=True, verbose=False):
     """ Returns jack-knife segements of W.

     *Parameters*:
@@ -161,7 +164,7 @@ def pls_jk(X, Y, a_opt, nsets=None, center=True, verbose=False):
     nsets : (integer), optional
         Number of jack-knife segments
-    center : {boolean}, optional
+    center_axis : {boolean}, optional
        - -1 : nothing
        - 0 : row center
        - 1 : column center
@@ -183,13 +186,13 @@ def pls_jk(X, Y, a_opt, nsets=None, center=True, verbose=False):
     Wcv = empty((nsets, X.shape[1], a_opt), dtype=X.dtype)
     for i, (cal, val) in enumerate(cv(k, nsets)):
         if verbose:
-            print "Segement number: %d" %i
-        dat = pls(X[cal,:], Y[cal,:], a_opt, scale='loads', mode='fast')
-        Wcv[nn,:,:] = dat['W']
+            print "Segment number: %d" %i
+        dat = pls(X[cal,:], Y[cal,:], a_opt, scale='loads', mode='fast', center_axis=[0, 0])
+        Wcv[i,:,:] = dat['W']
     return Wcv

-def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2], verbose=False):
+def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, center_axis=[2,0,2], zorth=False, verbose=False):
     """Returns jack-knifed segments of lpls model.

     Jack-knifing is a method to perturb the model paramters, hopefully
@@ -215,7 +218,7 @@ def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2], verbose=False):
     xz_alpha : {float}, optional
         Parameter to control the amount of influence from Z-matrix.
         0 is none, which returns a pls-solution, 1 is max
-    mean_center : {array-like}, optional
+    center_axis : {array-like}, optional
         A three element array-like structure with elements in [-1,0,1,2],
         that decides the type of centering used.
         -1 : nothing
@@ -244,8 +247,8 @@ def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2], verbose=False):
     WWz = empty((nsets, o, a_opt), 'd')
     #WWy = empty((nsets, l, a_opt), 'd')
     for i, (cal, val) in enumerate(cv(k, nsets)):
-        dat = lpls(X[cal], Y[cal], Z, a_max=a_opt,alpha=xz_alpha, mean_ctr=mean_ctr,
-                   scale='loads', verbose=verbose)
+        dat = lpls(X[cal], Y[cal], Z, a_max=a_opt,alpha=xz_alpha, center_axis=center_axis,
+                   scale='loads', zorth=zorth, verbose=verbose)
         WWx[i,:,:] = dat['W']
         WWz[i,:,:] = dat['L']
         #WWy[i,:,:] = dat['Q']
@@ -328,8 +331,8 @@ def cv(N, K, randomise=True, sequential=False):
     otherwise interleaved ordering is used.
     """
-    if N > K:
-        raise ValueError, "You cannot divide a list of %d samples into more than %d segments. Yout tried: %s" %(K, K, N)
+    if K > N:
+        raise ValueError, "You cannot divide a list of %d samples into more than %d segments. Yout tried: %s" %(N, N, K)
     index = xrange(N)
     if randomise:
         from random import shuffle
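
The guard in cv() was inverted: it now raises when more segments than samples are requested, and the message interpolates (N, N, K) in the right order. A hypothetical re-implementation of the interleaved split, only to illustrate the corrected check (not the library's actual code):

    def kfold_indices(N, K):
        # refuse to split N samples into K > N segments, as cv() now does
        if K > N:
            raise ValueError("cannot divide %d samples into %d segments" % (N, K))
        idx = list(range(N))
        return [idx[i::K] for i in range(K)]   # interleaved ordering

    kfold_indices(10, 3)   # [[0, 3, 6, 9], [1, 4, 7], [2, 5, 8]]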
@@ -375,19 +378,44 @@ def diag_cv(shape, nsets=9):
         yield training, validation

-def class_error(Yhat, Y, method='vanilla'):
+def class_error(y_hat, y, method='vanilla'):
     """ Not used.
     """
-    a_opt, k, l = Yhat.shape
-    Yhat_c = zeros((k, l), dtype='d')
+    a_opt, k, l = y_hat.shape
+    y_hat_c = zeros((k, l), dtype='d')
+    if method == vanilla:
+        pass
     for a in range(a_opt):
         for i in range(k):
-            Yhat_c[a, val, argmax(Yhat[a,val,:])] = 1.0
-    err = 100*((Yhat_c + Y) == 2).sum(1)/Y.sum(0).astype('d')
-    return Yhat_c, err
+            y_hat_c[a, val, argmax(y_hat[a,val,:])] = 1.0
+    err = 100*((y_hat_c + y) == 2).sum(1)/y.sum(0).astype('d')
+    return y_hat_c, err

-def _class_errorII(T, Y, method='lda'):
-    """ Not used ...
+def prediction_error(y_hat, y, method='squared'):
+    """Loss function on multiclass Y.
+
+    Assumes y is a binary dummy class matrix (samples, classes)
     """
-    pass
+    k, l = y.shape
+    if method == 'hinge':
+        pass
+    elif method == 'smooth_hinge':
+        z = 90
+    elif method == 'abs':
+        err = abs(y - y_hat)
+    elif method == 'squared':
+        err = (y - y_hat)**2
+    elif method == '0/1':
+        pred = zeros_like(y_hat)
+        for i, row in enumerate(y_hat):
+            largest = row.argsort()[-1]
+            pred[i, largest] = 1.
+        err = abs(y - pred)
+    elif method == '1/2':
+        y_hat[y_hat>.5] = 1
+        y_hat[y_hat<.5] = 0
+        err = abs(y - y_hat)
+
+    return err
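
The new prediction_error collects several losses over a dummy-coded class matrix; the 'hinge' and 'smooth_hinge' branches are still stubs, the '1/2' branch thresholds y_hat in place (so pass a copy), and the '0/1' branch uses zeros_like, which does not appear in the numpy import line shown above. A small usage sketch with made-up data:

    from numpy import array
    from crossvalidation import prediction_error

    y     = array([[1., 0.], [0., 1.], [1., 0.]])   # true classes, dummy coded
    y_hat = array([[.8, .2], [.4, .6], [.3, .7]])   # model predictions
    sq = prediction_error(y_hat.copy(), y, method='squared')  # elementwise (y - y_hat)**2
    ab = prediction_error(y_hat.copy(), y, method='abs')      # elementwise |y - y_hat|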

engines.py

@@ -13,7 +13,7 @@ from numpy.linalg import inv,svd
 from scipy.sandbox import arpack

-def pca(X, aopt, scale='scores', mode='normal', center_axis=0):
+def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
     """ Principal Component Analysis.

     PCA is a low rank bilinear aprroximation to a data matrix that sequentially
@@ -80,8 +80,8 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=0):
     m, n = X.shape
     min_aopt = min(m, n)
-    if center_axis >= 0:
-        X = X - expand_dims(X.mean(center_axis), center_axis)
+    if center_axis != None:
+        X, mnx = center(X, center_axis[0])
         min_aopt = min_aopt - 1
     assert(aopt <= min_aopt)
     if m > (n+100) or n > (m+100):
@@ -107,10 +107,10 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=0):
         T = T/s
         P = P*s

-    if mode == 'fast':
+    if mode in ['fast', 'f', 'F']:
         return {'T':T, 'P':P, 'aopt':aopt}

-    if mode=='detailed':
+    if mode in ['detailed', 'd', 'D']:
         E = empty((aopt, m, n))
         ssq = []
         lev = []
@@ -137,7 +137,7 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=0):
     return {'T': T, 'P': P, 'E': E, 'evx': expvarx, 'leverage': lev, 'ssqx': ssq,
             'aopt': aopt, 'eigvals': e}
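
center_axis on pca() changed from a scalar to a one-element list, centering is now routed through center(), and passing None skips centering entirely (keeping the full rank budget). A sketch of both call styles on assumed data:

    from numpy.random import rand
    from engines import pca

    X = rand(30, 8)
    dat = pca(X, aopt=3, mode='fast', center_axis=[0])    # mean-centered, rank budget 7
    raw = pca(X, aopt=3, mode='fast', center_axis=None)   # uncentered, rank budget 8
    T, P = dat['T'], dat['P']                             # scores and loadings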
-def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=0):
+def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=[0, 0]):
     """ Principal Component Regression.

     Performs PCR on given matrix and returns results in a dictionary.
@@ -197,8 +197,7 @@ def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=0):
     except:
         b = atleast_2d(b).T
     k, l = b.shape
-    if center_axis >= 0:
-        b = b - expand_dims(b.mean(center_axis), center_axis)
+    b, mny = center(b, center_axis[1])
     dat = pca(a, aopt=aopt, scale=scale, mode=mode, center_axis=center_axis)
     T = dat['T']
     weights = apply_along_axis(vnorm, 0, T)**2
@@ -207,11 +206,11 @@ def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=0):
     else:
         Q = dot(b.T, T/weights)

-    if mode == 'fast':
+    if mode in ['fast', 'f', 'F']:
         dat.update({'Q':Q})
         return dat

-    if mode == 'detailed':
+    if mode in ['detailed', 'd', 'D']:
         F = empty((aopt, k, l))
         ssqy = []
         for i in range(aopt):
@@ -224,10 +223,10 @@ def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=0):
     expvary = r_[0, 100*((T**2).sum(0)*(Q**2).sum(0)/(b**2).sum()).cumsum()[:aopt]]
-    dat.update({'Q': Q, 'F': F, 'evy': expvary, 'ssqy': ssqy})
+    dat.update({'Q': Q, 'F': F, 'evy': expvary, 'ssqy': ssqy, 'mny': mny})
     return dat

-def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=-1):
+def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
     """Partial Least Squares Regression.

     Performs PLS on given matrix and returns results in a dictionary.
@@ -316,12 +315,10 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=-1):
     assert(m == k)
     mnx, mny = 0, 0
     min_aopt = min(m, n)
-    if center_axis >= 0:
-        mnx = expand_dims(X.mean(center_axis), center_axis)
-        X = X - mnx
+    if center_axis != None:
+        X, mnx = center(X, center_axis[0])
+        Y, mny = center(Y, center_axis[1])
         min_aopt = min_aopt - 1
-        mny = expand_dims(Y.mean(center_axis), center_axis)
-        Y = Y - mny
     assert(aopt > 0 and aopt < min_aopt)

     W = empty((n, aopt))
@@ -361,7 +358,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=-1):
             T[:,i] = t.ravel()
             W[:,i] = w.ravel()

-        if mode == 'fast' and i == (aopt - 1):
+        if mode in ['fast', 'f', 'F'] and i == (aopt - 1):
             if scale == 'loads':
                 tnorm = sqrt(tt)
                 T = T/tnorm
@@ -376,7 +373,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=-1):
     qnorm = apply_along_axis(vnorm, 0, Q)
     tnorm = sqrt(tt)
     pp = (P**2).sum(0)
-    if mode == 'detailed':
+    if mode in ['detailed', 'd', 'D']:
         E = empty((aopt, m, n))
         F = empty((aopt, k, l))
         ssqx, ssqy = [], []
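
pls() now defaults to centering X and Y independently through center_axis=[0, 0], with the locations kept in the model dictionary (see the 'mnx'/'mny' entries in the return statement below); that is what lets lpls_val center held-out rows with dat['mnx'] and dat['mny'] earlier. A sketch on assumed data:

    from numpy.random import rand
    from engines import pls

    X, Y = rand(15, 40), rand(15, 2)
    dat = pls(X, Y, aopt=3, center_axis=[0, 0])
    X_new_c = rand(5, 40) - dat['mnx']   # center new samples with training locations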
@@ -416,7 +413,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=-1):
            'evx': expvarx, 'evy': expvary, 'ssqx': ssqx, 'ssqy': ssqy,
            'leverage': leverage, 'mnx': mnx, 'mny': mny}

-def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 2], scale='scores', zorth = False, verbose=False):
+def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores', zorth = False, verbose=False):
     """ L-shaped Partial Least Sqaures Regression by the nipals algorithm.

     An L-shaped low rank model aproximates three matrices in a hyploid
@@ -437,7 +434,7 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 2], scale='scores', zorth = False, verbose=False):
     alpha : {float}, optional
         Parameter to control the amount of influence from Z-matrix.
         0 is none, which returns a pls-solution, 1 is max
-    mean_center : {array-like}, optional
+    center_axis : {array-like}, optional
         A three element array-like structure with elements in [-1,0,1,2],
         that decides the type of centering used.
         -1 : nothing
@@ -500,16 +497,19 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 2], scale='scores', zorth = False, verbose=False):
     m, n = X.shape
     k, l = Y.shape
     u, o = Z.shape
-    max_rank = min(m, n) + 1
-    assert (a_max > 0 and a_max < max_rank), "Number of comp error:\
-    tried: %d, max_rank: %d" %(a_max, max_rank)
-    if mean_ctr != None:
-        xctr, yctr, zctr = mean_ctr
+    max_rank = min(m, n)
+
+    if center_axis != None:
+        xctr, yctr, zctr = center_axis
         X, mnX = center(X, xctr)
         Y, mnY = center(Y, yctr)
         Z, mnZ = center(Z, zctr)
+        max_rank = max_rank -1
+
+    assert (a_max > 0 and a_max < max_rank), "Number of comp error:\
+    tried: %d, max_rank: %d" %(a_max, max_rank)

+    # initial variance
     varX = (X**2).sum()
     varY = (Y**2).sum()
     varZ = (Z**2).sum()
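
The rank bookkeeping also moved: max_rank starts at min(m, n), drops by one only when centering is actually applied, and the assert now runs after that adjustment. Worked through for a small shape:

    # assuming X is 5 x 10 and center_axis is not None
    m, n = 5, 10
    max_rank = min(m, n)        # 5
    max_rank = max_rank - 1     # centering consumes one degree of freedom -> 4
    # the assert then admits a_max in 1..3 (a_max > 0 and a_max < max_rank)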
@@ -670,7 +670,13 @@ def center(a, axis):
         Centered data matrix
     mn : {array}
         Location vector/matrix
     """
+    ### TODO ###
+    # Perhaps not(!) use broadcasting and return full centering
+    # matrix instead ?
+
     # check if we have a vector
     is_vec = len(a.shape) == 1
@@ -678,13 +684,13 @@ def center(a, axis):
         is_vec = a.shape[0] == 1 or a.shape[1] == 1
     if is_vec:
         if axis == 2:
-            warnings.warn("Double centering of vecor ignored, using ordinary centering")
+            warnings.warn("Double centering of vector ignored, using ordinary centering")
         if axis == -1:
             mn = 0
         else:
             mn = a.mean()
         return a - mn, mn
-    # !!!fixme: use broadcasting
     if axis == -1:
         mn = zeros((1,a.shape[1],))
         #mn = tile(mn, (a.shape[0], 1))
@@ -695,6 +701,7 @@ def center(a, axis):
         mn = a.mean(1)[:,newaxis]
         #mn = tile(mn, (1, a.shape[1]))
     elif axis == 2:
+        #fixme: double centering returns column mean as loc-vector, ok?
         mn = a.mean(0)[newaxis] + a.mean(1)[:,newaxis] - a.mean()
         return a - mn , a.mean(0)[newaxis]
     else:

statistics.py

@@ -1,6 +1,6 @@
 """ A collection of some statistical utitlites used.
 """
-__all__ = ['hotelling', 'lpls_qvals']
+__all__ = ['hotelling', 'lpls_qvals', 'pls_qvals']
 __docformat__ = 'restructuredtext en'

 from math import sqrt as msqrt
@@ -9,7 +9,8 @@ from numpy import dot,empty,zeros,eye,median,sign,arange,argsort
 from numpy.linalg import svd,inv,det
 from numpy.random import shuffle

-from crossvalidation import lpls_jk
+from crossvalidation import lpls_jk, pls_jk
+from engines import pls
 from engines import nipals_lpls as lpls
@@ -174,8 +175,9 @@ def _ensure_strict(C, only_flips=True):
     return Cm*S

 def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
-               sim_method='shuffle',p_center='med', cov_center=median,
-               crot=True,strict=False,mean_ctr=[2,0,2], nsets=None):
+               sim_method='shuffle', p_center='med', cov_center=median,
+               crot=True, strict=False, center_axis=[2,0,2], nsets=None,
+               zorth=False):

     """Returns qvals for l-pls model by permutation analysis.
@@ -195,7 +197,7 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
     alpha : {float}, optional
         Parameter to control the amount of influence from Z-matrix.
         0 is none, which returns a pls-solution, 1 is max
-    mean_center : {array-like}, optional
+    center_axis : {array-like}, optional
         A three element array-like structure with elements in [-1,0,1,2],
         that decides the type of centering used.
         -1 : nothing
@@ -242,8 +244,8 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
     pert_tsq_z = zeros((k, n_iter), dtype='d') # (nzvars x n_subsets)

     # Full model
-    dat = lpls(X, Y, Z, aopt, scale='loads', mean_ctr=mean_ctr)
-    Wc, Lc = lpls_jk(X, Y, Z ,aopt)
+    dat = lpls(X, Y, Z, aopt, scale='loads', center_axis=center_axis, zorth=zorth)
+    Wc, Lc = lpls_jk(X, Y, Z ,aopt, zorth=zorth)
     cal_tsq_x = hotelling(Wc, dat['W'], alpha=alpha)
     cal_tsq_z = hotelling(Lc, dat['L'], alpha=alpha)
@@ -252,13 +254,92 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
     for i in range(n_iter):
         indi = index.copy()
         shuffle(indi)
-        dat = lpls(X, Y[indi,:], Z, aopt, scale='loads', mean_ctr=mean_ctr)
+        dat = lpls(X, Y[indi,:], Z, aopt, scale='loads', center_axis=center_axis, zorth=zorth)
         Wi, Li = lpls_jk(X, Y[indi,:], Z, aopt, nsets=nsets)
         pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha)
         pert_tsq_z[:,i] = hotelling(Li, dat['L'], alpha=alpha)

     return _fdr(cal_tsq_z, pert_tsq_z, median), _fdr(cal_tsq_x, pert_tsq_x, median)
+
+def pls_qvals(X, Y, aopt, alpha=.3, zx_alpha=.5, n_iter=20,
+              sim_method='shuffle',p_center='med', cov_center=median,
+              crot=True,strict=False, center_axis=[0,0], nsets=None, zorth=False):
+    """Returns qvals for pls model by permutation analysis.
+
+    The response (Y) is randomly permuted, and the number of false positives
+    is registered by comparing hotellings T2 statistics of the calibration model.
+
+    *Parameters*:
+
+    X : {array}
+        Main data matrix (m, n)
+    Y : {array}
+        External row data (m, l)
+    aopt : {integer}
+        Optimal number of components
+    center_axis : {array-like}, optional
+        A two element array-like structure with elements in [-1,0,1,2],
+        that decides the type of centering used.
+        -1 : nothing
+        0 : row center
+        1 : column center
+        2 : double center
+    n_iter : {integer}, optional
+        Number of permutations
+    sim_method : ['shuffle'], optional
+        Permutation method
+    p_center : {'median', 'mean', 'cal_model'}, optional
+        Location method for sub-segments
+    cov_center : {py_func}, optional
+        Location function
+    alpha : {float}, optional
+        Regularisation towards pooled covariance estimate.
+    crot : {boolean}, optional
+        Rotate sub-segmentss toward calibration model.
+    strict : {boolean}, optional
+        Only rotate 90 degree
+    nsets : {integer}
+        Number of crossvalidation segements
+
+    *Reference*:
+
+        Gidskehaug et al., A framework for significance analysis of
+        gene expression data using dimension reduction methods, BMC
+        bioinformatics, 2007
+    """
+    m, n = X.shape
+    try:
+        my, l = Y.shape
+    except:
+        # make Y a column vector
+        Y = atleast_2d(Y).T
+        my, l = Y.shape
+    assert(m==my)
+
+    # Full model
+    dat = pls(X, Y, aopt, scale='loads')
+    # Submodels
+    Wc = pls_jk(X, Y , aopt)
+    cal_tsq_x = hotelling(Wc, dat['W'], alpha=alpha)
+
+    # Perturbations
+    pert_tsq_x = zeros((n, n_iter), dtype='d') # (nxvars x n_subsets)
+    index = arange(m)
+    for i in range(n_iter):
+        indi = index.copy()
+        shuffle(indi)
+        dat = pls(X, Y[indi,:], aopt, scale='loads', center_axis=center_axis)
+        Wi = pls_jk(X, Y[indi,:], aopt, nsets=nsets, center_axis=center_axis)
+        pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha)
+
+    return _fdr(cal_tsq_x, pert_tsq_x, median)

 def _fdr(tsq, tsqp, loc_method=median):
     """Returns false discovery rate.

@@ -306,3 +387,4 @@ def _fdr(tsq, tsqp, loc_method=median):
     fd_rate = fp/n_signif
     fd_rate[fd_rate>1] = 1
     return fd_rate
+
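
Both q-value routines end in _fdr, which compares each calibration T2 statistic against the permuted null. A schematic of that estimate, assuming loc_method summarises the per-permutation false-positive counts (a sketch of the idea, not the module's exact code):

    import numpy as np

    def fdr_sketch(tsq_cal, tsq_perm, loc_method=np.median):
        # tsq_cal : (n_vars,) calibration statistics
        # tsq_perm: (n_vars, n_iter) statistics under permuted Y
        fd_rate = np.empty(tsq_cal.shape[0])
        for i, t in enumerate(tsq_cal):
            n_signif = (tsq_cal >= t).sum()           # variables called at threshold t
            fp = loc_method((tsq_perm >= t).sum(0))   # typical null calls per permutation
            fd_rate[i] = fp / float(n_signif)
        fd_rate[fd_rate > 1] = 1                      # clip, as _fdr does
        return fd_rate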

test module (filename not shown)

@@ -10,8 +10,8 @@ from numpy.random import rand, randn
 from numpy.linalg import svd,norm
 restore_path()

-def blm_array(shape=(5,10), comp=3, noise=0,seed=1,dtype='d'):
-    assert(min(*shape)>=comp)
+def blm_array(shape=(5, 10), comp=3, noise=0, seed=1, dtype='d'):
+    assert(min(*shape) >= comp)
     random.seed(seed)
     t = rand(shape[0], comp)
     p = rand(shape[1], comp)
@@ -44,6 +44,7 @@ class LplsTestCase(NumpyTestCase):
         pass
         #raise NotImplementedError

+
 class testAlphaZero(LplsTestCase):
     def do(self):
         #dat = lpls(self.a, self.b, self.c, self.nc, alpha=0.0)
@@ -51,9 +52,11 @@ class testAlphaZero(LplsTestCase):
         #assert_almost_equal(t1, t2)
         pass

+
 class testAlphaOne(LplsTestCase):
     pass

+
 class testZidentity(LplsTestCase):
     def do(self):
         I = eye(self.a.shape[1])
@@ -61,6 +64,7 @@ class testZidentity(LplsTestCase):
         dat2 = lpls(self.a, self.b, self.c, self.nc, alpha=0.0)
         assert_almost_equal(dat['T'], dat2['T'])

+
 class testYidentity(LplsTestCase):
     def do(self):
         I = eye(self.b.shape[0], dtype=self.a.dtype)
@@ -69,33 +73,43 @@ class testYidentity(LplsTestCase):
         T = u*s
         assert_almost_equal(abs(T0), abs(T[:,:self.nc]),5)

+
 class testWideX(LplsTestCase):
     pass

+
 class testTallX(LplsTestCase):
     pass

+
 class testWideY(LplsTestCase):
     pass

+
 class testTallY(LplsTestCase):
     pass

+
 class testWideZ(LplsTestCase):
     pass

+
 class testTallZ(LplsTestCase):
     pass

+
 class testRankDeficientX(LplsTestCase):
     pass

+
 class testRankDeficientY(LplsTestCase):
     pass

+
 class testRankDeficientZ(LplsTestCase):
     pass

+
 class testCenterX(LplsTestCase):
     def do(self):
         T = lpls(self.a, self.b, self.c, self.nc, mean_ctr=[0,-1,-1])['T']