Projects/laydi
Projects
/
laydi
Archived
7
0
Fork 0
This commit is contained in:
Arnar Flatberg 2007-09-20 16:11:37 +00:00
parent 7e9a0882f1
commit 41f93c5989
4 changed files with 337 additions and 102 deletions

View File

@ -693,13 +693,13 @@ class LplsOptions(Options):
opt['mode'] = 'normal' # how much info to calculate opt['mode'] = 'normal' # how much info to calculate
opt['amax'] = 10 opt['amax'] = 10
opt['aopt'] = 3 opt['aopt'] = 3
opt['xz_alpha'] = 0.4 opt['xz_alpha'] = 0.6
opt['auto_aopt'] = False opt['auto_aopt'] = False
opt['center'] = True opt['center'] = True
opt['center_mth'] = [2, 2, 1] opt['center_mth'] = [2, 0, 2]
opt['scale'] = 'scores' opt['scale'] = 'scores'
opt['calc_conf'] = True opt['calc_conf'] = False
opt['n_sets'] = 75 opt['n_sets'] = 7
opt['strict'] = False opt['strict'] = False
opt['p_center'] = 'med' opt['p_center'] = 'med'
opt['alpha'] = .3 opt['alpha'] = .3

View File

@ -1,6 +1,6 @@
from scipy import zeros,zeros_like,sqrt,dot,trace,sign,round_,argmax,\ from scipy import zeros,zeros_like,sqrt,dot,trace,sign,round_,argmax,\
sort,ravel,newaxis,asarray,diag,sum,outer,argsort,arange,ones_like,\ sort,ravel,newaxis,asarray,diag,sum,outer,argsort,arange,ones_like,\
all,apply_along_axis,eye all,apply_along_axis,eye,atleast_2d
from scipy.linalg import svd,inv,norm,det,sqrtm from scipy.linalg import svd,inv,norm,det,sqrtm
from scipy.stats import mean,median from scipy.stats import mean,median
from cx_utils import mat_center from cx_utils import mat_center
@ -26,8 +26,8 @@ def hotelling(Pcv, P, p_center='med', cov_center='med',
m, n = P.shape m, n = P.shape
n_sets, n, amax = Pcv.shape n_sets, n, amax = Pcv.shape
# allocate # allocate
T_sq = empty((n, ),dtype='f') T_sq = empty((n, ),dtype='d')
Cov_i = zeros((n, amax, amax),dtype='f') Cov_i = zeros((n, amax, amax),dtype='d')
# rotate sub_models to full model # rotate sub_models to full model
if crot: if crot:
@ -56,10 +56,12 @@ def hotelling(Pcv, P, p_center='med', cov_center='med',
reg_cov = (1. - alpha)*Cov_i + alpha*Cov reg_cov = (1. - alpha)*Cov_i + alpha*Cov
for i in xrange(n): for i in xrange(n):
Pc = P_ctr[i,:][:,newaxis] #Pc = P_ctr[i,:][:,newaxis]
Pc = P_ctr[i,:]
sigma = reg_cov[i] sigma = reg_cov[i]
#T_sq[i] = sqrt(dot(dot(Pc.T, inv(sigma)), Pc).ravel()) # T_sq[i] = (dot(Pc, inv(sigma) )*Pc).sum() #slow
T_sq[i] = dot(dot(Pc.T, inv(sigma)), Pc).ravel() T_sq[i] = dot(dot(Pc, inv(sigma)), Pc) # dont need to care about transposes
#T_sq[i] = dot(dot(Pc.T, inv(sigma)), Pc).ravel()
return T_sq return T_sq
def procrustes(A, B, strict=True, center=False, verbose=False): def procrustes(A, B, strict=True, center=False, verbose=False):
@ -147,10 +149,10 @@ def pls_qvals(a, b, aopt=None, alpha=.3,
tsq_full = hotelling(Wcv, dat['W'], p_center=p_center, tsq_full = hotelling(Wcv, dat['W'], p_center=p_center,
alpha=alpha, crot=crot, strict=strict, alpha=alpha, crot=crot, strict=strict,
cov_center=cov_center) cov_center=cov_center)
t0 = time.time() #t0 = time.time()
Vs = shuffle_1d(bc, n_iter, axis=0) Vs = shuffle_1d(bc, n_iter, axis=0)
for i, b_shuff in enumerate(Vs): for i, b_shuff in enumerate(Vs):
t1 = time.time() #t1 = time.time()
if algo=='bridge': if algo=='bridge':
dat = bridge(ac, b_shuff, aopt, 'loads','fast') dat = bridge(ac, b_shuff, aopt, 'loads','fast')
else: else:
@ -159,23 +161,10 @@ def pls_qvals(a, b, aopt=None, alpha=.3,
TSQ[:,i] = hotelling(Wcv, dat['W'], p_center=p_center, TSQ[:,i] = hotelling(Wcv, dat['W'], p_center=p_center,
alpha=alpha, crot=crot, strict=strict, alpha=alpha, crot=crot, strict=strict,
cov_center=cov_center) cov_center=cov_center)
print time.time() - t1 #print time.time() - t1
sort_index = argsort(tsq_full)[::-1]
back_sort_index = sort_index.argsort() return fdr(tsq_full, TSQ, median)
print time.time() - t0
# count false positives
tsq_full_sorted = tsq_full.take(sort_index)
for i in xrange(n_iter):
for j in xrange(n):
n_false[j,i] = sum(TSQ[:,i]>=tsq_full[j]) # number of false pos. genes (0-n)
false_pos = median(n_false, 1)
ll = arange(1, len(false_pos)+1, 1)
sort_qval = false_pos.take(sort_index)/ll
qval = false_pos/ll.take(back_sort_index)
print time.time() - t0
#return qval, false_pos, TSQ, tsq_full
return qval
def ensure_strict(C, only_flips=True): def ensure_strict(C, only_flips=True):
"""Ensure that a rotation matrix does only 90 degree rotations. """Ensure that a rotation matrix does only 90 degree rotations.
@ -267,6 +256,7 @@ def pls_qvals_II(a, b, aopt=None, center=True, alpha=.3,
qval = false_pos/ll.take(back_sort_index) qval = false_pos/ll.take(back_sort_index)
print time.time() - t0 print time.time() - t0
#return qval, false_pos, TSQ, tsq_full #return qval, false_pos, TSQ, tsq_full
return qval return qval
def leverage(aopt=1,*args): def leverage(aopt=1,*args):
@ -333,3 +323,107 @@ def vnorm(x):
This is considerably faster than linalg.norm This is considerably faster than linalg.norm
""" """
return sqrt(dot(x,x.conj())) return sqrt(dot(x,x.conj()))
def mahalanobis(a, loc=None, acov=None, invcov=None):
"""Returns the distance of each observation in a
from the location estimate (loc) of the data,
relative to the shape of the data.
a : data matrix (n observations in rows, p variables in columns)
loc : location estimate of the data (p-dimensional vector)
covmat or invcov : scatter estimate of the data or the inverse of the scatter estimate (pxp matrix)
:Returns:
A vector containing the distances of all the observations to locvct.
"""
n, p = a.shape
if loc==None:
loc = a.mean(0)
loc = atleast_2d(loc)
if loc.shape[1]==1:
loc = loc.T; #ensure rowvector
assert(loc.shape[1]==p)
xc = a - loc
if acov==None and invcov==None:
acov = dot(xc.T, xc)
if invcov != None:
covmat = atleast_2d(invcov)
if min(covmat.shape)==1:
covmat = diag(invcov.ravel())
else:
covmat = atleast_2d(acov)
if min(covmat.shape)==1:
covmat = diag(covmat.ravel())
covmat = inv(covmat)
# mdist = diag(dot(dot(xc, covmat),xc.T))
mdist = (dot(xc, covmat)*xc).sum(1)
return mdist
def lpls_qvals(a, b, c, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,center=True,
sim_method='shuffle',p_center='med', cov_center='med',crot=True, strict=False):
"""Returns qvals for l-pls model.
input:
a -- data matrix
b -- data matrix
c -- data matrix
aopt -- scalar, opt. number of components
alpha -- [0,1] regularisation parameter for T2-test
xz_alpha -- [0,1] how much z info to include
n_iter -- number of permutations
sim_method -- permutation method ['shuffle']
p_center -- location estimator for sub models ['med']
cov_center -- location estimator for covariance of submodels ['med']
crot -- bool, use rotations of sub models?
strict -- bool, use stict (rot/flips only) rotations?
"""
m, n = a.shape
TSQ = zeros((n, n_iter), dtype='d') # (nvars x n_subsets)
n_false = zeros((n, n_iter), dtype='d')
# Full model
dat = lpls(a, b, c, aopt, scale='loads')
Wcv = lpls_jk(a, b, c ,aopt, n_blocks=None, algo=algo,center=center)
tsq_x = hotelling(Wcv, dat['W'], p_center=p_center,alpha=alpha, crot=crot, strict=strict,
cov_center=cov_center)
Lcv = lpls_jk(a, b, c ,aopt, n_blocks=None, algo=algo,center=center)
tsq_z = hotelling(Lcv, dat['L'], p_center=p_center,alpha=alpha, crot=crot, strict=strict,
cov_center=cov_center)
# Perturbations
t0 = time.time()
Vs = shuffle_1d(b, n_iter, axis=0)
for i, b_shuff in enumerate(Vs):
t1 = time.time()
dat = pls(ac, b_shuff, aopt, 'loads', 'fast')
Wcv = pls_jkW(a, b_shuff, aopt, n_blocks=None, algo=algo)
TSQ[:,i] = hotelling(Wcv, dat['W'], p_center=p_center,
alpha=alpha, crot=crot, strict=strict,
cov_center=cov_center)
print time.time() - t1
return fdr(tsq_full, TSQ, median)
def fdr(tsq, tsqp, loc_method=median):
n, = tsq.shape
k, m = tsqp.shape
assert(n==k)
n_false = empty((n, m), 'd')
sort_index = argsort(tsq)[::-1]
r_index = argsort(sort_index)
for i in xrange(m):
for j in xrange(n):
n_false[j,i] = (tsqp[:,i]>tsq[j]).sum()
fp = loc_method(n_false,1)
n_signif = (arange(n) + 1.0)[r_index]
fd_rate = fp/n_signif
return fd_rate

View File

@ -4,15 +4,21 @@ There is almost no typechecking of any kind here, just focus on speed
""" """
import math import math
import warnings
from scipy.linalg import svd,inv from scipy.linalg import svd,inv
from scipy import dot,empty,eye,newaxis,zeros,sqrt,diag,\ from scipy import dot,empty,eye,newaxis,zeros,sqrt,diag,\
apply_along_axis,mean,ones,randn,empty_like,outer,r_,c_,\ apply_along_axis,mean,ones,randn,empty_like,outer,r_,c_,\
rand,sum,cumsum,matrix, expand_dims,minimum,where,arange rand,sum,cumsum,matrix, expand_dims,minimum,where,arange,inner,tile
has_sym=True has_sym = True
has_arpack = True
try: try:
from symeig import symeig from symeig import symeig
except: except:
has_sym = False has_sym = False
try:
from scipy.sandbox import arpack
except:
has_arpack = False
def pca(a, aopt,scale='scores',mode='normal',center_axis=0): def pca(a, aopt,scale='scores',mode='normal',center_axis=0):
@ -45,7 +51,6 @@ def pca(a, aopt,scale='scores',mode='normal',center_axis=0):
Notes Notes
----- -----
Uses kernel speed-up if m>>n or m<<n. Uses kernel speed-up if m>>n or m<<n.
If residuals turn rank deficient, a lower number of component than given If residuals turn rank deficient, a lower number of component than given
@ -101,6 +106,7 @@ def pca(a, aopt,scale='scores',mode='normal',center_axis=0):
else: else:
# residuals # residuals
E = a - dot(T, P.T) E = a - dot(T, P.T)
#E = a
SEP = E**2 SEP = E**2
ssq = [SEP.sum(0), SEP.sum(1)] ssq = [SEP.sum(0), SEP.sum(1)]
# leverages # leverages
@ -256,6 +262,7 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', center_axis=-1, ab=None):
Q = empty((l, aopt)) Q = empty((l, aopt))
T = empty((m, aopt)) T = empty((m, aopt))
B = empty((aopt, n, l)) B = empty((aopt, n, l))
tt = empty((aopt,))
if ab==None: if ab==None:
ab = dot(a.T, b) ab = dot(a.T, b)
@ -265,10 +272,10 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', center_axis=-1, ab=None):
w = w/vnorm(w) w = w/vnorm(w)
elif n<l: # more yvars than xvars elif n<l: # more yvars than xvars
if has_sym: if has_sym:
s, u = symeig(dot(ab, ab.T),range=[l,l],overwrite=True) s, w = symeig(dot(ab, ab.T),range=[n,n],overwrite=True)
else: else:
u, s, vh = svd(dot(ab, ab.T)) w, s, vh = svd(dot(ab, ab.T))
w = u[:,0] w = w[:,:1]
else: # standard wide xdata else: # standard wide xdata
if has_sym: if has_sym:
s, q = symeig(dot(ab.T, ab),range=[l,l],overwrite=True) s, q = symeig(dot(ab.T, ab),range=[l,l],overwrite=True)
@ -283,16 +290,16 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', center_axis=-1, ab=None):
r = r - dot(P[:,j].T, w)*R[:,j][:,newaxis] r = r - dot(P[:,j].T, w)*R[:,j][:,newaxis]
t = dot(a, r) t = dot(a, r)
tt = vnorm(t)**2 tt[i] = tti = dot(t.T, t).ravel()
p = dot(a.T, t)/tt p = dot(a.T, t)/tti
q = dot(r.T, ab).T/tt q = dot(r.T, ab).T/tti
ab = ab - dot(p, q.T)*tt ab = ab - dot(p, q.T)*tti
T[:,i] = t.ravel() T[:,i] = t.ravel()
W[:,i] = w.ravel() W[:,i] = w.ravel()
if mode=='fast' and i==aopt-1: if mode=='fast' and i==aopt-1:
if scale=='loads': if scale=='loads':
tnorm = apply_along_axis(vnorm, 0, T) tnorm = sqrt(tt)
T = T/tnorm T = T/tnorm
W = W*tnorm W = W*tnorm
return {'T':T, 'W':W} return {'T':T, 'W':W}
@ -300,26 +307,54 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', center_axis=-1, ab=None):
P[:,i] = p.ravel() P[:,i] = p.ravel()
R[:,i] = r.ravel() R[:,i] = r.ravel()
Q[:,i] = q.ravel() Q[:,i] = q.ravel()
B[i] = dot(R[:,:i+1], Q[:,:i+1].T) #B[i] = dot(R[:,:i+1], Q[:,:i+1].T)
qnorm = apply_along_axis(vnorm, 0, Q)
tnorm = sqrt(tt)
pp = (P**2).sum(0)
if mode=='detailed': if mode=='detailed':
E = empty((aopt, m, n)) E = empty((aopt, m, n))
F = empty((aopt, k, l)) F = empty((aopt, k, l))
for i in range(1, aopt+1, 1): ssqx, ssqy = [], []
E[i-1] = a - dot(T[:,:i], P[:,:i].T) leverage = empty((aopt, m))
h2x = [] #hotellings T^2
h2y = []
for ai in range(aopt):
E[ai,:,:] = a - dot(T[:,:ai+1], P[:,:ai+1].T)
F[i-1] = b - dot(T[:,:i], Q[:,:i].T) F[i-1] = b - dot(T[:,:i], Q[:,:i].T)
ssqx.append([(E[ai,:,:]**2).mean(0), (E[ai,:,:]**2).mean(1)])
ssqy.append([(F[ai,:,:]**2).mean(0), (F[ai,:,:]**2).mean(1)])
leverage[ai,:] = 1./m + ((T[:,:ai+1]/tnorm[:ai+1])**2).sum(1)
h2y.append(1./k + ((Q[:,:ai+1]/qnorm[:ai+1])**2).sum(1))
else: else:
E = a - dot(T[:,:aopt], P[:,:aopt].T) # residuals
F = b - dot(T[:,:aopt], Q[:,:aopt].T) E = a - dot(T, P.T)
F = b - dot(T, Q.T)
sepx = E**2
ssqx = [sepx.sum(0), sepx.sum(1)]
sepy = F**2
ssqy = [sepy.sum(0), sepy.sum(1)]
# leverage
leverage = 1./m + ((T/tnorm)**2).sum(1)
h2x = []
h2y = []
# variances
tp= tt*pp
tq = tt*qnorm*qnorm
expvarx = r_[0, 100*tp/(a*a).sum()]
expvary = r_[0, 100*tq/(b*b).sum()]
if scale=='loads': if scale=='loads':
tnorm = apply_along_axis(vnorm, 0, T)
T = T/tnorm T = T/tnorm
W = W*tnorm W = W*tnorm
Q = Q*tnorm Q = Q*tnorm
P = P*tnorm P = P*tnorm
return {'B':B, 'Q':Q, 'P':P, 'T':T, 'W':W, 'R':R, 'E':E, 'F':F} return {'Q':Q, 'P':P, 'T':T, 'W':W, 'R':R, 'E':E, 'F':F,
'expvarx':expvarx, 'expvary':expvary, 'ssqx':ssqx, 'ssqy':ssqy,
'leverage':leverage, 'h2':h2x}
def w_simpls(aat, b, aopt): def w_simpls(aat, b, aopt):
""" Simpls for wide matrices. """ Simpls for wide matrices.
@ -423,16 +458,6 @@ def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
else: #normal else: #normal
F = b - dot(a, B[-1]) F = b - dot(a, B[-1])
E = a - dot(T, W.T) E = a - dot(T, W.T)
# leverages
# fixme: probably need an orthogonal basis for row-space leverage
# T (scores) are not orthogonal
# Using a qr decomp to get an orthonormal basis for row-space
#Tq = qr(T)[0]
#s_lev,v_lev = leverage(aopt,Tq,W)
# explained variance
#var_x, exp_var_x = variances(a,T,W)
#qnorm = apply_along_axis(norm, 0, Q)
#var_y, exp_var_y = variances(b,U,Q/qnorm)
if scale=='loads': if scale=='loads':
T = T/tnorm T = T/tnorm
@ -442,7 +467,7 @@ def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
return {'B':B, 'W':W, 'T':T, 'Q':Q, 'E':E, 'F':F, 'U':U, 'P':W} return {'B':B, 'W':W, 'T':T, 'Q':Q, 'E':E, 'F':F, 'U':U, 'P':W}
def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], mode='normal', scale='scores', verbose=False): def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=False):
""" L-shaped Partial Least Sqaures Regression by the nipals algorithm. """ L-shaped Partial Least Sqaures Regression by the nipals algorithm.
(X!Z)->Y (X!Z)->Y
@ -464,6 +489,9 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], mode='normal', sca
evx : X-explained variance evx : X-explained variance
evy : Y-explained variance evy : Y-explained variance
evz : Z-explained variance evz : Z-explained variance
mnx : X location
mny : Y location
mnz : Z location
:Notes: :Notes:
@ -471,12 +499,12 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], mode='normal', sca
if mean_ctr!=None: if mean_ctr!=None:
xctr, yctr, zctr = mean_ctr xctr, yctr, zctr = mean_ctr
X, mnX = center(X, xctr) X, mnX = center(X, xctr)
Y, mnY = center(Y, xctr) Y, mnY = center(Y, yctr)
Z, mnZ = center(Z, zctr) Z, mnZ = center(Z, zctr)
varX = pow(X, 2).sum() varX = (X**2).sum()
varY = pow(Y, 2).sum() varY = (Y**2).sum()
varZ = pow(Z, 2).sum() varZ = (Z**2).sum()
m, n = X.shape m, n = X.shape
k, l = Y.shape k, l = Y.shape
@ -491,36 +519,40 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], mode='normal', sca
K = empty((o, a_max)) K = empty((o, a_max))
L = empty((u, a_max)) L = empty((u, a_max))
B = empty((a_max, n, l)) B = empty((a_max, n, l))
b0 = empty((a_max, m, l)) #b0 = empty((a_max, 1, l))
var_x = empty((a_max,)) var_x = empty((a_max,))
var_y = empty((a_max,)) var_y = empty((a_max,))
var_z = empty((a_max,)) var_z = empty((a_max,))
MAX_ITER = 250
LIM = 1e-1
for a in range(a_max): for a in range(a_max):
if verbose: if verbose:
print "\n Working on comp. %s" %a print "\nWorking on comp. %s" %a
u = Y[:,:1] u = Y[:,:1]
diff = 1 diff = 1
MAX_ITER = 200
lim = 1e-16
niter = 0 niter = 0
while (diff>lim and niter<MAX_ITER): while (diff>LIM and niter<MAX_ITER):
niter += 1 niter += 1
u1 = u.copy() u1 = u.copy()
w = dot(X.T, u) w = dot(X.T, u)
w = w/sqrt(dot(w.T, w)) w = w/sqrt(dot(w.T, w))
#w = w/dot(w.T, w)
l = dot(Z, w) l = dot(Z, w)
k = dot(Z.T, l) k = dot(Z.T, l)
k = k/sqrt(dot(k.T, k)) k = k/sqrt(dot(k.T, k))
#k = k/dot(k.T, k)
w = alpha*k + (1-alpha)*w w = alpha*k + (1-alpha)*w
#print sqrt(dot(w.T, w))
w = w/sqrt(dot(w.T, w)) w = w/sqrt(dot(w.T, w))
t = dot(X, w) t = dot(X, w)
c = dot(Y.T, t) c = dot(Y.T, t)
c = c/sqrt(dot(c.T, c)) c = c/sqrt(dot(c.T, c))
u = dot(Y, c) u = dot(Y, c)
diff = abs(u1 - u).max() diff = dot((u-u1).T, (u-u1))
if verbose: if verbose:
print "Converged after %s iterations" %niter print "Converged after %s iterations" %niter
print "Error: %.2E" %diff
tt = dot(t.T, t) tt = dot(t.T, t)
p = dot(X.T, t)/tt p = dot(X.T, t)/tt
q = dot(Y.T, t)/tt q = dot(Y.T, t)/tt
@ -543,7 +575,8 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], mode='normal', sca
var_z[a] = pow(Z, 2).sum() var_z[a] = pow(Z, 2).sum()
B[a] = dot(dot(W[:,:a+1], inv(dot(P[:,:a+1].T, W[:,:a+1]))), Q[:,:a+1].T) B[a] = dot(dot(W[:,:a+1], inv(dot(P[:,:a+1].T, W[:,:a+1]))), Q[:,:a+1].T)
b0[a] = mnY - dot(mnX, B[a]) #b0[a] = mnY - dot(mnX, B[a])
# variance explained # variance explained
evx = 100.0*(1 - var_x/varX) evx = 100.0*(1 - var_x/varX)
@ -558,7 +591,7 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], mode='normal', sca
L = L*knorm L = L*knorm
K = K/knorm K = K/knorm
return {'T':T, 'W':W, 'P':P, 'Q':Q, 'U':U, 'L':L, 'K':K, 'B':B, 'b0':b0, 'evx':evx, 'evy':evy, 'evz':evz} return {'T':T, 'W':W, 'P':P, 'Q':Q, 'U':U, 'L':L, 'K':K, 'B':B, 'evx':evx, 'evy':evy, 'evz':evz,'mnx': mnX, 'mny': mnY, 'mnz': mnZ}
@ -670,7 +703,8 @@ def nipals_pls(X, Y, a_max, alpha=.7, ax_center=0, mode='normal', scale='scores'
W = W*tnorm W = W*tnorm
Q = Q*tnorm Q = Q*tnorm
return {'T':T, 'W':W, 'P':P, 'Q':Q, 'U':U, 'B':B, 'b0':b0, 'evx':evx, 'evy':evy} return {'T':T, 'W':W, 'P':P, 'Q':Q, 'U':U, 'B':B, 'b0':b0, 'evx':evx, 'evy':evy,
'mnx': mnX, 'mny': mnY, 'xc': X, 'yc': Y}
########### Helper routines ######### ########### Helper routines #########
@ -691,6 +725,11 @@ def esvd(data, amax=None):
m, n = data.shape m, n = data.shape
if m>=n: if m>=n:
kernel = dot(data.T, data) kernel = dot(data.T, data)
if has_arpack:
if amax==None:
amax = n
s, v = arpack.eigen_symmetric(kernel,k=amax, which='LM',
maxiter=200,tol=1e-5)
if has_sym: if has_sym:
if amax==None: if amax==None:
amax = n amax = n
@ -728,16 +767,34 @@ def vnorm(x):
return math.sqrt(dot(x.T, x)) return math.sqrt(dot(x.T, x))
def center(a, axis): def center(a, axis):
# 0 = col center, 1 = row center, 2 = double center # 0 = col center, 1 = row center, 2 = double center
# -1 = nothing # -1 = nothing
# check if we have a vector
is_vec = len(a.shape)==1
if not is_vec:
is_vec = a.shape[0]==1 or a.shape[1]==1
if is_vec:
if axis==2:
warnings.warn("Double centering of vecor ignored, using ordinary centering")
if axis==-1:
mn = 0
else:
mn = a.mean()
return a - mn, mn
# !!!fixme: use broadcasting
if axis==-1: if axis==-1:
mn = zeros((a.shape[1],)) mn = zeros((1,a.shape[1],))
#mn = tile(mn, (a.shape[0], 1))
elif axis==0: elif axis==0:
mn = a.mean(0) mn = a.mean(0)[newaxis]
#mn = tile(mn, (a.shape[0], 1))
elif axis==1: elif axis==1:
mn = a.mean(1)[:,newaxis] mn = a.mean(1)[:,newaxis]
#mn = tile(mn, (1, a.shape[1]))
elif axis==2: elif axis==2:
mn = a.mean(0) + a.mean(1)[:,newaxis] - a.mean() mn = a.mean(0)[newaxis] + a.mean(1)[:,newaxis] - a.mean()
return a - mn , a.mean(0)[newaxis]
else: else:
raise IOError("input error: axis must be in [-1,0,1,2]") raise IOError("input error: axis must be in [-1,0,1,2]")
@ -755,3 +812,64 @@ def scale(a, axis):
return a - sc, sc return a - sc, sc
## #PCA CALCS
## % Calculate Q limit using unused eigenvalues
## temp = diag(s);
## if n < m
## emod = temp(lv+1:n,:);
## else
## emod = temp(lv+1:m,:);
## end
## th1 = sum(emod);
## th2 = sum(emod.^2);
## th3 = sum(emod.^3);
## h0 = 1 - ((2*th1*th3)/(3*th2^2));
## if h0 <= 0.0
## h0 = .0001;
## disp(' ')
## disp('Warning: Distribution of unused eigenvalues indicates that')
## disp(' you should probably retain more PCs in the model.')
## end
## q = th1*(((1.65*sqrt(2*th2*h0^2)/th1) + 1 + th2*h0*(h0-1)/th1^2)^(1/h0));
## disp(' ')
## disp('The 95% Q limit is')
## disp(q)
## if plots >= 1
## lim = [q q];
## plot(scl,res,scllim,lim,'--b')
## str = sprintf('Process Residual Q with 95 Percent Limit Based on %g PC Model',lv);
## title(str)
## xlabel('Sample Number')
## ylabel('Residual')
## pause
## end
## % Calculate T^2 limit using ftest routine
## if lv > 1
## if m > 300
## tsq = (lv*(m-1)/(m-lv))*ftest(.95,300,lv,2);
## else
## tsq = (lv*(m-1)/(m-lv))*ftest(.95,m-lv,lv,2);
## end
## disp(' ')
## disp('The 95% T^2 limit is')
## disp(tsq)
## % Calculate the value of T^2 by normalizing the scores to
## % unit variance and summing them up
## if plots >= 1.0
## temp2 = scores*inv(diag(ssq(1:lv,2).^.5));
## tsqvals = sum((temp2.^2)');
## tlim = [tsq tsq];
## plot(scl,tsqvals,scllim,tlim,'--b')
## str = sprintf('Value of T^2 with 95 Percent Limit Based on %g PC Model',lv);
## title(str)
## xlabel('Sample Number')
## ylabel('Value of T^2')
## end
## else
## disp('T^2 not calculated when number of latent variables = 1')
## tsq = 1.96^2;
## end

View File

@ -1,7 +1,7 @@
"""This module implements some common validation schemes from pca and pls. """This module implements some common validation schemes from pca and pls.
""" """
from scipy import ones,mean,sqrt,dot,newaxis,zeros,sum,empty,\ from scipy import ones,mean,sqrt,dot,newaxis,zeros,sum,empty,\
apply_along_axis,eye,kron,array,sort,zeros_like,argmax apply_along_axis,eye,kron,array,sort,zeros_like,argmax,atleast_2d
from scipy.stats import median from scipy.stats import median
from scipy.linalg import triu,inv,svd,norm from scipy.linalg import triu,inv,svd,norm
@ -9,7 +9,7 @@ from select_generators import w_pls_gen,w_pls_gen_jk,pls_gen,pca_gen,diag_pert
from engines import w_simpls,pls,bridge,pca,nipals_lpls from engines import w_simpls,pls,bridge,pca,nipals_lpls
from cx_utils import m_shape from cx_utils import m_shape
def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'): def w_pls_cv_val(X, Y, amax, n_blocks=None):
"""Returns rmsep and aopt for pls tailored for wide X. """Returns rmsep and aopt for pls tailored for wide X.
The root mean square error of cross validation is calculated The root mean square error of cross validation is calculated
@ -62,12 +62,10 @@ def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'):
for Din, Doi, Yin, Yout in V: for Din, Doi, Yin, Yout in V:
ym = -sum(Yout, 0)[newaxis]/(1.0*Yin.shape[0]) ym = -sum(Yout, 0)[newaxis]/(1.0*Yin.shape[0])
PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0) PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)
if algo=='simpls':
dat = w_simpls(Din, Yin, amax) dat = w_simpls(Din, Yin, amax)
Q, U, H = dat['Q'], dat['U'], dat['H'] Q, U, H = dat['Q'], dat['U'], dat['H']
That = dot(Doi, dot(U, inv(triu(dot(H.T, U))) )) That = dot(Doi, dot(U, inv(triu(dot(H.T, U))) ))
else:
raise NotImplementedError
Yhat = [] Yhat = []
for j in range(l): for j in range(l):
@ -78,7 +76,7 @@ def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'):
#Yhat = Yin - dot(That,Q.T) #Yhat = Yin - dot(That,Q.T)
msep = PRESS/(Y.shape[0]) msep = PRESS/(Y.shape[0])
aopt = find_aopt_from_sep(msep) aopt = find_aopt_from_sep(msep)
return sqrt(msep) return sqrt(msep), aopt
def pls_val(X, Y, amax=2, n_blocks=10, algo='pls'): def pls_val(X, Y, amax=2, n_blocks=10, algo='pls'):
k, l = m_shape(Y) k, l = m_shape(Y)
@ -108,29 +106,54 @@ def pls_val(X, Y, amax=2, n_blocks=10, algo='pls'):
aopt = find_aopt_from_sep(msep) aopt = find_aopt_from_sep(msep)
return msep, Yhat, aopt return msep, Yhat, aopt
def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5): def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2]):
"""Performs crossvalidation to get generalisation error in lpls""" """Performs crossvalidation to get generalisation error in lpls"""
assert(nsets<=X.shape[0])
cv_iter = pls_gen(X, Y, n_blocks=nsets,center=False,index_out=True) cv_iter = pls_gen(X, Y, n_blocks=nsets,center=False,index_out=True)
k, l = Y.shape k, l = Y.shape
Yhat = empty((a_max,k,l), 'd') Yc = empty((k, l), 'd')
Yhat = empty((a_max, k, l), 'd')
Yhatc = empty((a_max, k, l), 'd')
sep2 = empty((a_max, k, l), 'd')
for i, (xcal,xi,ycal,yi,ind) in enumerate(cv_iter): for i, (xcal,xi,ycal,yi,ind) in enumerate(cv_iter):
dat = nipals_lpls(xcal,ycal,Z, dat = nipals_lpls(xcal,ycal,Z,
a_max=a_max, a_max=a_max,
alpha=alpha, alpha=alpha,
mean_ctr=[2,0,1], mean_ctr=mean_ctr,
verbose=False) verbose=False)
B = dat['B'] B = dat['B']
b0 = dat['b0'] #b0 = dat['b0']
for a in range(a_max): for a in range(a_max):
Yhat[a,ind,:] = b0[a][0][0] + dot(xi-xcal.mean(0), B[a]) if mean_ctr[0] in [0, 2]:
Yhat_class = zeros_like(Yhat) xi = xi - dat['mnx']
for a in range(a_max): else:
for i in range(k): xi = xi - xi.mean(1)[:,newaxis] #???: cheating?
Yhat_class[a,i,argmax(Yhat[a,i,:])]=1.0 if mean_ctr[1] in [0, 2]:
class_err = 100*((Yhat_class+Y)==2).sum(1)/Y.sum(0).astype('d') ym = dat['mny']
else:
ym = yi.mean(1)[:,newaxis] #???: check this
Yhat[a,ind,:] = atleast_2d(ym + dot(xi, B[a]))
#Yhat[a,ind,:] = atleast_2d(b0[a] + dot(xi, B[a]))
# todo: need a better support for class validation
y_is_class = Y.dtype.char.lower() in ['i','p', 'b', 'h','?']
print Y.dtype.char
if y_is_class:
Yhat_class = zeros_like(Yhat)
for a in range(a_max):
for i in range(k):
Yhat_class[a,i,argmax(Yhat[a,i,:])] = 1.0
class_err = 100*((Yhat_class+Y)==2).sum(1)/Y.sum(0).astype('d')
sep = (Y - Yhat)**2 sep = (Y - Yhat)**2
rmsep = sqrt(sep.mean(1)) rmsep = sqrt(sep.mean(1)).T
#rmsep2 = sqrt(sep2.mean(1))
aopt = find_aopt_from_sep(rmsep) aopt = find_aopt_from_sep(rmsep)
return rmsep, Yhat, aopt return rmsep, Yhat, aopt
def pca_alter_val(a, amax, n_sets=10, method='diag'): def pca_alter_val(a, amax, n_sets=10, method='diag'):
@ -247,7 +270,7 @@ def pca_jkP(a, aopt, n_blocks=None):
return PP return PP
def lpls_jk(X, Y, Z, a_max, nsets=None, alpha=.5): def lpls_jk(X, Y, Z, a_max, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2]):
cv_iter = pls_gen(X, Y, n_blocks=nsets,center=False,index_out=False) cv_iter = pls_gen(X, Y, n_blocks=nsets,center=False,index_out=False)
m, n = X.shape m, n = X.shape
k, l = Y.shape k, l = Y.shape
@ -258,8 +281,8 @@ def lpls_jk(X, Y, Z, a_max, nsets=None, alpha=.5):
WWz = empty((nsets, o, a_max), 'd') WWz = empty((nsets, o, a_max), 'd')
#WWy = empty((nsets, l, a_max), 'd') #WWy = empty((nsets, l, a_max), 'd')
for i, (xcal,xi,ycal,yi) in enumerate(cv_iter): for i, (xcal,xi,ycal,yi) in enumerate(cv_iter):
dat = nipals_lpls(xcal,ycal,Z,a_max=a_max,alpha=alpha, dat = nipals_lpls(xcal,ycal,Z,a_max=a_max,alpha=xz_alpha,
mean_ctr=[2,0,1],scale='loads',verbose=False) mean_ctr=mean_ctr,scale='loads',verbose=False)
WWx[i,:,:] = dat['W'] WWx[i,:,:] = dat['W']
WWz[i,:,:] = dat['L'] WWz[i,:,:] = dat['L']
#WWy[i,:,:] = dat['Q'] #WWy[i,:,:] = dat['Q']