Second initial import

Arnar Flatberg 2007-10-26 13:36:05 +00:00
parent 67d6d01f7f
commit 5b91e2d809
4 changed files with 596 additions and 23 deletions

View File

@@ -201,16 +201,16 @@ def eigen(A,k=6,M=None,ncv=None,which='LM',
    dr=sb.zeros(k+1,typ)
    di=sb.zeros(k+1,typ)
    zr=sb.zeros((n,k+1),typ)
-   dr,di,z,info=\
+   dr,di,zr,info=\
       eigextract(rvec,howmny,sselect,sigmar,sigmai,workev,
                  bmat,which,k,tol,resid,v,iparam,ipntr,
                  workd,workl,info)
    # make eigenvalues complex
-   d=dr+1.0j*di
+   d=(dr+1.0j*di)[:k]
    # futz with the eigenvectors:
    # complex are stored as real,imaginary in consecutive columns
-   z=zr.astype(typ.upper())
+   z=zr.astype(typ.upper())[:,:k]
    for i in range(k): # fix c.c. pairs
        if di[i] > 0 :
            z[:,i]=zr[:,i]+1.0j*zr[:,i+1]
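The truncations to k entries matter because ARPACK/LAPACK hand back k+1 workspace columns, and complex eigenvectors come back packed as (real, imaginary) pairs in consecutive real columns. A minimal sketch of that packed-storage convention, not part of the commit, with toy values:

import numpy as np

# A conjugate pair x +/- i*y occupies two consecutive real columns (x, y).
# di holds the imaginary parts; di[i] > 0 marks the first member of a pair.
zr = np.array([[1.0, 0.5], [2.0, -0.5]])   # columns: Re, Im of one pair
di = np.array([0.3, -0.3])
z = zr.astype('D')                          # 'D' == typ.upper() for typ='d'
for i in range(len(di)):
    if di[i] > 0:
        z[:, i] = zr[:, i] + 1.0j * zr[:, i + 1]
    elif di[i] < 0:                         # second member is the conjugate
        z[:, i] = zr[:, i - 1] - 1.0j * zr[:, i]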

View File

@@ -1,5 +1,6 @@
 from crossvalidation import lpls_val
 from statistics import lpls_qvals
+from engines import pca, pcr, pls
 from engines import nipals_lpls as lpls
 #from tests import *

View File

@@ -2,11 +2,12 @@
 The primary use is crossvalidation.
 """
-__all__ = ['lpls_val', 'lpls_jk']
+__all__ = ['lpls_val', 'pls_jk', 'lpls_jk']
 __docformat__ = "restructuredtext en"
 from numpy import dot,empty,zeros,sqrt,atleast_2d,argmax,asarray,median,\
-     array_split
+     array_split,arange,newaxis
+from numpy.random import shuffle
-from engines import nipals_lpls as lpls
+from engines import pca, pls, nipals_lpls as lpls
@@ -93,8 +94,92 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None, alpha=.5, mean_ctr=[2,0,2], verbose=True):
     return rmsep, Yhat, aopt

def pca_jk(a, aopt, nsets=None, center_axis=0):
    """Returns jack-knife segments from PCA.

    Parameters:
        a : {array}
            data matrix (n x m)
        aopt : {integer}
            number of components in model.
        nsets : {integer}
            number of segments

    Returns:
        Pcv : {array}
            Loadings collected in a three way matrix (n_segments, m, aopt)

    Notes:
        - The loadings are scaled with the (1/samples)*eigenvalues.
        - Crossvalidation method is currently set to random diagonals of samples.
        - todo: add support for T
        - fixme: more efficient to add this in validation loop?
    """
    if nsets is None:
        nsets = a.shape[0]
    Pcv = empty((nsets, a.shape[1], aopt), dtype=a.dtype)
    mn_a = .5*(a.mean(0) + a.mean(1)[:,newaxis])
    for i, (cal, val) in enumerate(diag_cv(a.shape, nsets)):
        old_values = a.take(val)
        new_values = mn_a.take(val)
        a.put(val, new_values)
        dat = pca(a, aopt, mode='fast', scale='loads', center_axis=center_axis)
        Pcv[i,:,:] = dat['P']
        a.put(val, old_values)
    return Pcv

def pls_jk(X, Y, a_opt, nsets=None, center=True, verbose=False):
    """ Returns jack-knife segments of W.

    *Parameters*:
        X : {array}
            Main data matrix (m, n)
        Y : {array}
            External row data (m, l)
        a_opt : {integer}
            The number of components to calculate (0, min(m,n))
        nsets : {integer}, optional
            Number of jack-knife segments
        center : {boolean}, optional
            Center data before decomposition
        verbose : {boolean}, optional
            Verbosity of console output. For use in debugging.

    *Returns*:
        Wcv : {array}
            Loading-weights jack-knife segments
    """
    m, n = X.shape
    k, l = Y.shape
    assert(m==k)
    if nsets is None:
        nsets = X.shape[0]
    Wcv = empty((nsets, X.shape[1], a_opt), dtype=X.dtype)
    for i, (cal, val) in enumerate(cv(k, nsets)):
        if verbose:
            print "Segment number: %d" %i
        dat = pls(X[cal,:], Y[cal,:], a_opt, scale='loads', mode='fast')
        Wcv[i,:,:] = dat['W']
    return Wcv
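pls_jk only stores the loading-weights from each calibration subset; the spread of W across segments later feeds the significance estimates in the statistics module. A self-contained sketch of the same leave-one-out pattern, with a plain SVD of X'Y standing in for the pls engine (loo_segments and the stand-in are illustrative, not module API):

import numpy as np

def loo_segments(m):
    """Yield (calibration, validation) leave-one-out index pairs."""
    for i in range(m):
        idx = np.arange(m)
        yield idx[idx != i], idx[i:i + 1]

X = np.random.rand(10, 5)
Y = np.random.rand(10, 2)
a_opt = 2
Wcv = np.empty((10, X.shape[1], a_opt))
for i, (cal, val) in enumerate(loo_segments(X.shape[0])):
    # stand-in for pls(...)['W']: leading left singular vectors of X'Y
    u, s, vt = np.linalg.svd(np.dot(X[cal].T, Y[cal]), full_matrices=False)
    Wcv[i] = u[:, :a_opt]
print Wcv.std(0)    # per-variable spread over segments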
-def lpls_jk(X, Y, Z, a_max, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2], verbose=False):
+def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2], verbose=False):
     """Returns jack-knifed segments of lpls model.

     Jack-knifing is a method to perturb the model parameters, hopefully
@@ -113,8 +198,8 @@ def lpls_jk(X, Y, Z, a_max, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2], verbose=False):
         External row data (m, l)
     Z : {array}
         External column data (n, o)
-    a_max : {integer}, optional
-        Maximum number of components to calculate (0, min(m,n))
+    a_opt : {integer}, optional
+        The number of components to calculate (0, min(m,n))
     nsets : {integer}, optional
         Number of jack-knife segments
     xz_alpha : {float}, optional
@ -145,11 +230,11 @@ def lpls_jk(X, Y, Z, a_max, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2], verbose=F
assert(n==p) assert(n==p)
if nsets==None: if nsets==None:
nsets = m nsets = m
WWx = empty((nsets, n, a_max), 'd') WWx = empty((nsets, n, a_opt), 'd')
WWz = empty((nsets, o, a_max), 'd') WWz = empty((nsets, o, a_opt), 'd')
#WWy = empty((nsets, l, a_max), 'd') #WWy = empty((nsets, l, a_opt), 'd')
for i, (cal, val) in enumerate(cv(k, nsets)): for i, (cal, val) in enumerate(cv(k, nsets)):
dat = lpls(X[cal],Y[cal],Z,a_max=a_max,alpha=xz_alpha,mean_ctr=mean_ctr, dat = lpls(X[cal], Y[cal], Z, a_max=a_opt,alpha=xz_alpha, mean_ctr=mean_ctr,
scale='loads', verbose=verbose) scale='loads', verbose=verbose)
WWx[i,:,:] = dat['W'] WWx[i,:,:] = dat['W']
WWz[i,:,:] = dat['L'] WWz[i,:,:] = dat['L']
@@ -219,6 +304,7 @@ def cv(N, K, randomise=True, sequential=False):
     *Notes*:
         If randomise is true, a copy of index is shuffled before partitioning,
         otherwise its order is preserved in training and validation.

         Randomise overrides the sequential argument. If randomise is true,
@@ -245,13 +331,40 @@ def cv(N, K, randomise=True, sequential=False):
         validation = [i for i in index if i % K == k]
         yield training, validation

def diag_cv(shape, nsets=9):
    """Generates (training, validation) index pairs over matrix diagonals.

    Parameters:
        shape : {tuple}
            Shape (rows, columns) of the data matrix
        nsets : {integer}
            Number of segments
    """
    try:
        m, n = shape
    except:
        raise ValueError("shape needs to be a two-tuple")
    if nsets>m or nsets>n:
        msg = "You may not use more subsets than min(n_rows, n_cols)"
        raise ValueError, msg
    nm = n*m
    index = arange(nm)
    n_ind = arange(n)
    shuffle(n_ind) # random start diag
    start_inds = array_split(n_ind, nsets)
    for v in range(nsets):
        validation = []
        for start in start_inds[v]:
            ind = arange(start+v, nm, n+1)
            validation.extend(ind)
        training = [j for j in index if j not in validation]
        yield training, validation
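Unlike cv, which leaves out whole rows, diag_cv leaves out wrapped diagonals of the flattened matrix, so every segment touches all rows and all columns. The stepping is easiest to see on a small example (illustrative, flat indices into a.ravel()):

import numpy as np

m, n = 3, 4
flat = np.arange(m * n).reshape(m, n)
start = 1
ind = np.arange(start, m * n, n + 1)   # step n+1 walks one wrapped diagonal
print ind                              # [ 1  6 11]
print flat                             # those land at (0,1), (1,2), (2,3)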
def class_error(Yhat, Y, method='vanilla'):
    """ Not used.
    """
-    a_max, k, l = Yhat.shape
+    a_opt, k, l = Yhat.shape
     Yhat_c = zeros((k, l), dtype='d')
-    for a in range(a_max):
+    for a in range(a_opt):
         for i in range(k):
             Yhat_c[a,val,argmax(Yhat[a,val,:])] = 1.0
     err = 100*((Yhat_c + Y) == 2).sum(1)/Y.sum(0).astype('d')

View File

@@ -2,14 +2,414 @@
 """
-__all__ = ['nipals_lpls']
+__all__ = ['pca', 'pcr', 'pls', 'nipals_lpls']
 __docformat__ = "restructuredtext en"
 from math import sqrt as msqrt
-from numpy import dot,empty,zeros,apply_along_axis,newaxis,finfo
-from numpy.linalg import inv
+from numpy import dot,empty,zeros,apply_along_axis,newaxis,finfo,sqrt,r_,\
+     expand_dims,minimum,atleast_2d
+from numpy.linalg import inv, svd
+from scipy.sandbox import arpack
def pca(X, aopt, scale='scores', mode='normal', center_axis=0):
    """ Principal Component Analysis.

    PCA is a low rank bilinear approximation to a data matrix that sequentially
    extracts orthogonal components of maximum variance.

    Parameters:
        X : {array}
            Data measurement matrix, (samples x variables)
        aopt : {integer}
            Number of components to use, aopt<=min(samples, variables)
        center_axis : {integer}
            Center along given axis. If neg.: no centering (-inf,..., matrix modes)

    Returns:
        T : {array}
            Scores, (samples, components)
        P : {array}
            Loadings, (variables, components)
        E : {array}
            Residuals, (samples, variables)
        evx : {array}
            X-explained variance, (components,)
        mnx : {array}
            X location, (variables,)
        aopt : {integer}
            The number of components used
        ssqx : {list}
            Sum of squared residuals in X along each dimension
            [(samples,), (variables,)]
        leverage : {array}
            Leverages, (samples,)

    OtherParameters:
        scale : {string}, optional
            Where to put the weights [['scores'], 'loadings']
        mode : {string}, optional
            Amount of info retained, [['normal'], 'fast', 'detailed']

    :SeeAlso:
        `center` : Data centering

    *Notes*
        Uses kernel speed-up if m>>n or m<<n.
        If the residuals turn rank deficient, a lower number of components
        than requested will be used; the number used is returned.

    *Examples*:
        >>> import scipy,engines
        >>> a=scipy.asarray([[1,2,3],[2,4,5]])
        >>> dat=engines.pca(a, 2)
        >>> dat['evx']
        array([0.,99.8561562, 100.])
    """
    m, n = X.shape
    assert(aopt<=min(m,n))
    if center_axis>=0:
        X = X - expand_dims(X.mean(center_axis), center_axis)
    if m>(n+100) or n>(m+100):
        u, s, v = esvd(X, aopt)
    else:
        u, s, vt = svd(X, 0)
        v = vt.T
    u = u[:,:aopt]
    s = s[:aopt]
    v = v[:,:aopt]

    # rank test
    tol = 1e-10
    eff_rank = sum(s>s[0]*tol)
    aopt = minimum(aopt, eff_rank)
    T = u*s
    s = s[:aopt]
    T = T[:,:aopt]
    P = v[:,:aopt]
    e = s**2
    if scale=='loads':
        T = T/s
        P = P*s

    if mode=='fast':
        return {'T':T, 'P':P, 'aopt':aopt}

    if mode=='detailed':
        E = empty((aopt, m, n))
        ssq = []
        lev = []
        for ai in range(aopt):
            E[ai,:,:] = X - dot(T[:,:ai+1], P[:,:ai+1].T)
            ssq.append([(E[ai,:,:]**2).mean(0), (E[ai,:,:]**2).mean(1)])
            if scale=='loads':
                lev.append([((s*T)**2).sum(1), (P**2).sum(1)])
            else:
                lev.append([(T**2).sum(1), ((s*P)**2).sum(1)])
    else:
        # residuals
        E = X - dot(T, P.T)
        sep = E**2
        ssq = [sep.sum(0), sep.sum(1)]
        # leverages
        if scale=='loads':
            lev = [(1./m)+(T**2).sum(1), (1./n)+((P/s)**2).sum(1)]
        else:
            lev = [(1./m)+((T/s)**2).sum(1), (1./n)+(P**2).sum(1)]
    # variances
    expvarx = r_[0, 100*e.cumsum()/(X*X).sum()]

    return {'T': T, 'P': P, 'E': E, 'evx': expvarx, 'leverage': lev, 'ssqx': ssq,
            'aopt': aopt, 'eigvals': e}
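Everything pca returns derives from the thin SVD of the centered matrix, so a quick external consistency check is possible with numpy alone. A sketch, assuming scale='scores', center_axis=0, and full-rank input:

import numpy as np

X = np.random.rand(8, 3)
Xc = X - X.mean(0)
u, s, vt = np.linalg.svd(Xc, full_matrices=False)
T, P = u * s, vt.T                       # scores and loadings
E = Xc - np.dot(T[:, :2], P[:, :2].T)    # residual after 2 components
evx = np.r_[0, 100 * (s ** 2).cumsum() / (Xc ** 2).sum()]
print evx                                # should match pca(X, 3)['evx']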
def pcr(a, b, aopt, scale='scores', mode='normal', center_axis=0):
    """ Principal Component Regression.

    Performs PCR on given matrix and returns results in a dictionary.

    Parameters:
        a : array
            Data measurement matrix, (samples x variables)
        b : array
            Data response matrix, (samples x responses)
        aopt : int
            Number of components to use, aopt<=min(samples, variables)

    Returns:
        results : dict
            keys -- values, T -- scores, P -- loadings, E -- residuals,
            levx -- leverages, ssqx -- sum of squares, expvarx -- cumulative
            explained variance, aopt -- number of components used

    OtherParameters:
        mode : str
            Amount of info retained, ('fast', 'normal', 'detailed')
        center_axis : int
            Center along given axis. If neg.: no centering (-inf,..., matrix modes)

    SeeAlso:
        - pca : other blm
        - pls : other blm
        - lpls : other blm

    *Notes*
        Uses kernel speed-up if m>>n or m<<n.
        If the residuals turn rank deficient, a lower number of components
        than requested will be used; the number used is given in the results dict.

    Examples
    --------
    >>> import scipy,engines
    >>> a=scipy.asarray([[1,2,3],[2,4,5]])
    >>> b=scipy.asarray([[1,1],[2,3]])
    >>> dat=engines.pcr(a, b, 2)
    >>> dat['evx']
    array([0.,99.8561562, 100.])
    """
    try:
        k, l = b.shape
    except:
        b = atleast_2d(b).T
        k, l = b.shape
    if center_axis>=0:
        b = b - expand_dims(b.mean(center_axis), center_axis)
    dat = pca(a, aopt=aopt, scale=scale, mode=mode, center_axis=center_axis)
    T = dat['T']
    weights = apply_along_axis(vnorm, 0, T)**2
    if scale=='loads':
        Q = dot(b.T, T*weights)
    else:
        Q = dot(b.T, T/weights)

    if mode=='fast':
        dat.update({'Q':Q})
        return dat

    if mode=='detailed':
        F = empty((aopt, k, l))
        ssqy = []
        for i in range(aopt):
            F[i,:,:] = b - dot(T[:,:i+1], Q[:,:i+1].T)
            ssqy.append([(F[i,:,:]**2).mean(0), (F[i,:,:]**2).mean(1)])
    else:
        F = b - dot(T, Q.T)
        sepy = F**2
        ssqy = [sepy.sum(0), sepy.sum(1)]
    expvary = r_[0, 100*((T**2).sum(0)*(Q**2).sum(0)/(b**2).sum()).cumsum()[:aopt]]

    dat.update({'Q': Q, 'F': F, 'evy': expvary, 'ssqy': ssqy})
    return dat
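Because the PCA scores are orthogonal, the regression of b on T decouples into one scalar projection per component, which is what the weights/Q computation above exploits. The same identity in plain numpy (a sketch under the scale='scores' convention):

import numpy as np

X = np.random.rand(8, 4)
b = np.random.rand(8, 2)
Xc, bc = X - X.mean(0), b - b.mean(0)
u, s, vt = np.linalg.svd(Xc, full_matrices=False)
T = u * s                                # orthogonal scores, ||T_j||^2 = s_j^2
Q = np.dot(bc.T, T) / s ** 2             # per-component least-squares y-loadings
F = bc - np.dot(T[:, :2], Q[:, :2].T)    # y-residual after 2 components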
def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=-1):
    """Partial Least Squares Regression.

    Performs PLS on given matrix and returns results in a dictionary.

    *Parameters*:
        X : {array}
            Data measurement matrix, (samples x variables)
        Y : {array}
            Data response matrix, (samples x responses)
        aopt : {integer}, optional
            Number of components to use, aopt<=min(samples, variables)
        scale : ['scores', 'loads'], optional
            Which component should get the scale
        center_axis : {-1, integer}
            Perform centering across given axis, (-1 is no centering)

    *Returns*:
        T : {array}
            X-scores
        W : {array}
            X-loading-weights
        P : {array}
            X-loadings
        R : {array}
            X-loadings-basis
        Q : {array}
            Y-loadings
        B : {array}
            Regression coefficients
        E : {array}
            X-block residuals
        F : {array}
            Y-block residuals
        evx : {array}
            X-explained variance
        evy : {array}
            Y-explained variance
        mnx : {array}
            X location
        mny : {array}
            Y location
        aopt : {array}
            The number of components used
        ssqx : {list}, optional
            Sum of squared residuals in X along each dimension
        ssqy : {list}
            Sum of squared residuals in Y along each dimension
        leverage : {array}
            Sample leverages

    *OtherParameters*:
        mode : ['normal', 'fast', 'detailed'], optional
            How much detail to compute

    *SeeAlso*:
        `center` : data centering

    *Notes*
        - The output with mode='fast' will only return T and W
        - If the residuals turn rank deficient, a lower number of components
          than requested will be used; the number used is given in the results.

    *Examples*
        >>> import numpy, engines
        >>> a = numpy.asarray([[1,2,3],[2,4,5]])
        >>> b = numpy.asarray([[1,1],[2,3]])
        >>> dat = engines.pls(a, b, 2)
        >>> dat['evx']
        array([0.,99.8561562, 100.])
    """
    m, n = X.shape
    try:
        k, l = Y.shape
    except:
        Y = atleast_2d(Y).T
        k, l = Y.shape
    assert(m==k)
    assert(aopt<min(m, n))
    mnx, mny = 0, 0
    if center_axis>=0:
        mnx = expand_dims(X.mean(center_axis), center_axis)
        X = X - mnx
        mny = expand_dims(Y.mean(center_axis), center_axis)
        Y = Y - mny

    W = empty((n, aopt))
    P = empty((n, aopt))
    R = empty((n, aopt))
    Q = empty((l, aopt))
    T = empty((m, aopt))
    B = empty((aopt, n, l))
    tt = empty((aopt,))

    XY = dot(X.T, Y)
    for i in range(aopt):
        if XY.shape[1]==1: # pls 1
            w = XY.reshape(n, l)
            w = w/vnorm(w)
        elif n<l: # more yvars than xvars
            s, w = arpack.eigen_symmetric(dot(XY, XY.T), k=1, tol=1e-10, maxiter=100)
            #w, s, vh = svd(dot(XY, XY.T))
            #w = w[:,:1]
        else: # more xvars than yvars
            s, q = arpack.eigen_symmetric(dot(XY.T, XY), k=1, tol=1e-10, maxiter=100)
            #q, s, vh = svd(dot(XY.T, XY))
            #q = q[:,:1]
            w = dot(XY, q)
            w = w/vnorm(w)
        r = w.copy()
        if i>0:
            for j in range(i):
                r = r - dot(P[:,j].T, w)*R[:,j][:,newaxis]
        t = dot(X, r)
        tt[i] = tti = dot(t.T, t).ravel()
        p = dot(X.T, t)/tti
        q = dot(r.T, XY).T/tti
        XY = XY - dot(p, q.T)*tti
        T[:,i] = t.ravel()
        W[:,i] = w.ravel()
        if mode=='fast' and i==aopt-1:
            if scale=='loads':
                tnorm = sqrt(tt)
                T = T/tnorm
                W = W*tnorm
            return {'T':T, 'W':W}
        P[:,i] = p.ravel()
        R[:,i] = r.ravel()
        Q[:,i] = q.ravel()
        B[i] = dot(R[:,:i+1], Q[:,:i+1].T)

    qnorm = apply_along_axis(vnorm, 0, Q)
    tnorm = sqrt(tt)
    pp = (P**2).sum(0)
    if mode=='detailed':
        E = empty((aopt, m, n))
        F = empty((aopt, k, l))
        ssqx, ssqy = [], []
        leverage = empty((aopt, m))
        #h2x = [] # hotellings T^2
        #h2y = []
        for ai in range(aopt):
            E[ai,:,:] = X - dot(T[:,:ai+1], P[:,:ai+1].T)
            F[ai,:,:] = Y - dot(T[:,:ai+1], Q[:,:ai+1].T)
            ssqx.append([(E[ai,:,:]**2).mean(0), (E[ai,:,:]**2).mean(1)])
            ssqy.append([(F[ai,:,:]**2).mean(0), (F[ai,:,:]**2).mean(1)])
            leverage[ai,:] = 1./m + ((T[:,:ai+1]/tnorm[:ai+1])**2).sum(1)
            #h2y.append(1./k + ((Q[:,:ai+1]/qnorm[:ai+1])**2).sum(1))
    else:
        # residuals
        E = X - dot(T, P.T)
        F = Y - dot(T, Q.T)
        sepx = E**2
        ssqx = [sepx.sum(0), sepx.sum(1)]
        sepy = F**2
        ssqy = [sepy.sum(0), sepy.sum(1)]
        leverage = 1./m + ((T/tnorm)**2).sum(1)

    # variances
    tp = tt*pp
    tq = tt*qnorm*qnorm
    expvarx = r_[0, 100*(tp/(X*X).sum()).cumsum()]
    expvary = r_[0, 100*(tq/(Y*Y).sum()).cumsum()]

    if scale=='loads':
        T = T/tnorm
        W = W*tnorm
        Q = Q*tnorm
        P = P*tnorm

    return {'Q': Q, 'P': P, 'T': T, 'W': W, 'R': R, 'E': E, 'F': F, 'B': B,
            'evx': expvarx, 'evy': expvary, 'ssqx': ssqx, 'ssqy': ssqy,
            'leverage': leverage, 'mnx': mnx, 'mny': mny}
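The stacked coefficient array B is built from the loading basis R, and since T = X·R the fitted Y after a+1 components equals X·B[a]. This is plain matrix associativity, but it is the property callers of dat['B'] rely on; a toy check with illustrative stand-in factors, not an actual PLS run:

import numpy as np

Xc = np.random.rand(8, 4) - 0.5
R = np.linalg.qr(np.random.rand(4, 2))[0]   # stand-in loading basis
Q = np.random.rand(3, 2)                    # stand-in y-loadings
T = np.dot(Xc, R)                           # scores from the basis
B = np.dot(R, Q.T)                          # regression coefficients
print np.allclose(np.dot(Xc, B), np.dot(T, Q.T))   # True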
def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=False):
    """ L-shaped Partial Least Squares Regression by the nipals algorithm.
@@ -39,10 +439,6 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=False):
         0 : row center
         1 : column center
         2 : double center
-    scale : {'scores', 'loads'}, optional
-        Option to decide on where the scale goes.
-    verbose : {boolean}, optional
-        Verbosity of console output. For use in debugging.

     *Returns*:
@ -75,6 +471,13 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', ve
mnz : {array} mnz : {array}
Z location Z location
*OtherParameters*:
scale : {'scores', 'loads'}, optional
Option to decide on where the scale goes.
verbose : {boolean}, optional
Verbosity of console output. For use in debugging.
*References* *References*
Saeboe et al., LPLS-regression: a method for improved prediction and Saeboe et al., LPLS-regression: a method for improved prediction and
classification through inclusion of background information on classification through inclusion of background information on
@@ -296,3 +699,59 @@ def _scale(a, axis):
         raise IOError("input error: axis must be in [-1,0,1]")
     return a - sc, sc
def esvd(data, a_max=None):
    """ SVD with kernel calculation.

    Calculate subspaces of X'X or XX' depending on the shape
    of the matrix.

    Parameters:
        data : {array}
            Data matrix
        a_max : {integer}
            Number of components to extract

    Returns:
        u : {array}
            Left hand singular vectors
        s : {array}
            Singular values
        v : {array}
            Right hand singular vectors

    Notes:
        Uses Arnoldi iterations (ARPACK)
    """
    m, n = data.shape
    if m>=n:
        kernel = dot(data.T, data)
        if a_max is None:
            a_max = n - 1
        s, v = arpack.eigen_symmetric(kernel, k=a_max, which='LM',
                                      maxiter=200, tol=1e-5)
        s = s[::-1]
        v = v[:,::-1]
        #u, s, vt = svd(kernel)
        #v = vt.T
        s = sqrt(s)
        u = dot(data, v)/s
    else:
        kernel = dot(data, data.T)
        if a_max is None:
            a_max = m - 1
        s, u = arpack.eigen_symmetric(kernel, k=a_max, which='LM',
                                      maxiter=200, tol=1e-5)
        s = s[::-1]
        u = u[:,::-1]
        #u, s, vt = svd(kernel)
        s = sqrt(s)
        v = dot(data.T, u)/s
    return u, s, v
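A quick numpy-only check of the m>=n branch, with numpy.linalg.eigh standing in for arpack.eigen_symmetric (eigh also returns eigenvalues in ascending order, hence the same reversal):

import numpy as np

data = np.random.rand(20, 5)            # m >> n: work on the 5x5 kernel X'X
s2, v = np.linalg.eigh(np.dot(data.T, data))
s2, v = s2[::-1], v[:, ::-1]            # descending eigenvalues
s = np.sqrt(s2)
u = np.dot(data, v) / s                 # back-map to left singular vectors
u_ref, s_ref, vt_ref = np.linalg.svd(data, 0)
print np.allclose(s, s_ref)             # singular values agree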