"""This module implements some validation schemes l-pls.
|
|
|
|
The primary use is crossvalidation.
|
|
"""
|
|
__all__ = ['lpls_val', 'lpls_jk']
|
|
__docformat__ = "restructuredtext en"

from numpy import dot, empty, zeros, sqrt, atleast_2d, argmax, asarray, \
     median, array_split, newaxis

from engines import nipals_lpls as lpls


def lpls_val(X, Y, Z, a_max=2, nsets=None, alpha=.5, mean_ctr=[2,0,2], verbose=True):
    """Performs crossvalidation to estimate the generalisation error in lpls.

    The L-PLS crossvalidation is estimated just like an ordinary pls
    crossvalidation. That is, the generalisation error is estimated by
    predicting samples (rows of X and Y) left out according to a
    crossvalidation scheme.

    *Parameters*:

        X : {array}
            Main data matrix (m, n)
        Y : {array}
            External row data (m, l)
        Z : {array}
            External column data (o, n)
        a_max : {integer}, optional
            Maximum number of components to calculate (0, min(m,n))
        nsets : {integer}, optional
            Number of crossvalidation sets
        alpha : {float}, optional
            Parameter to control the amount of influence from the Z-matrix.
            0 is none, which returns a pls solution, 1 is max
        mean_ctr : {array-like}, optional
            A three element array-like structure with elements in [-1,0,1,2]
            that decides the type of centering used.
            -1 : nothing
            0 : row center
            1 : column center
            2 : double center
        verbose : {boolean}, optional
            Verbosity of console output. For use in debugging.

    *Returns*:

        rmsep : {array}
            Root mean squared error of prediction
        yhat : {array}
            Estimated responses
        aopt : {integer}
            Estimated optimal number of components

    """
    m, n = X.shape
    k, l = Y.shape
    o, p = Z.shape
    assert m == k, "X (%d,%d) - Y (%d,%d) dim mismatch" %(m, n, k, l)
    assert n == p, "X (%d,%d) - Z (%d,%d) dim mismatch" %(m, n, o, p)
    if nsets is None:
        nsets = m
    if nsets > m:
        print "nsets (%d) is larger than the number of samples (%d).\nnsets: %d -> %d" %(nsets, m, nsets, m)
        nsets = m
    assert (alpha >= 0 and alpha <= 1), "Alpha needs to be within [0,1], got: %.2f" %alpha

    Yhat = empty((a_max, k, l), 'd')
    # cv expects (number of samples, number of segments)
    for cal, val in cv(k, nsets):
        dat = lpls(X[cal], Y[cal], Z, a_max=a_max, alpha=alpha,
                   mean_ctr=mean_ctr, verbose=verbose)
        if mean_ctr[0] != 1:
            xi = X[val,:] - dat['mnx']
        else:
            xi = X[val] - X[val].mean(1)[:,newaxis]
        if mean_ctr[2] != 1:
            ym = dat['mny']
        else:
            ym = Y[val].mean(1)[:,newaxis] #???: check this
        for a in range(a_max):
            Yhat[a,val,:] = atleast_2d(ym + dot(xi, dat['B'][a]))

    # todo: need better support for classification error
    y_is_class = Y.dtype.char.lower() in ['i', 'p', 'b', 'h', '?']
    if y_is_class:
        Yhat, err = class_error(Yhat, Y)
        return Yhat, err

    sep = (Y - Yhat)**2
    rmsep = sqrt(sep.mean(1)).T
    aopt = find_aopt_from_sep(rmsep)

    return rmsep, Yhat, aopt
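
# A minimal usage sketch (illustrative only; assumes X (m,n), Y (m,l) and
# Z (o,n) are numpy arrays as described in the docstring above):
#
#     rmsep, Yhat, aopt = lpls_val(X, Y, Z, a_max=5, nsets=10, alpha=.5)
#     print "suggested number of components:", aopt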


def lpls_jk(X, Y, Z, a_max, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2], verbose=False):
    """Returns jack-knifed segments of an lpls model.

    Jack-knifing is a method to perturb the model parameters, hopefully
    to be representative of a typical perturbation of a *future* sample.
    The mean and variance of the jack-knife segments may be used to
    infer the parameter confidence in the model.

    The segments returned are the X-block weights and Z-block weights.

    *Parameters*:

        X : {array}
            Main data matrix (m, n)
        Y : {array}
            External row data (m, l)
        Z : {array}
            External column data (o, n)
        a_max : {integer}
            Maximum number of components to calculate (0, min(m,n))
        nsets : {integer}, optional
            Number of jack-knife segments
        xz_alpha : {float}, optional
            Parameter to control the amount of influence from the Z-matrix.
            0 is none, which returns a pls solution, 1 is max
        mean_ctr : {array-like}, optional
            A three element array-like structure with elements in [-1,0,1,2]
            that decides the type of centering used.
            -1 : nothing
            0 : row center
            1 : column center
            2 : double center
        verbose : {boolean}, optional
            Verbosity of console output. For use in debugging.

    *Returns*:

        Wx : {array}
            X-block jack-knife segments
        Wz : {array}
            Z-block jack-knife segments

    """
    m, n = X.shape
    k, l = Y.shape
    o, p = Z.shape
    assert m == k
    assert n == p
    if nsets is None:
        nsets = m
    WWx = empty((nsets, n, a_max), 'd')
    WWz = empty((nsets, o, a_max), 'd')
    #WWy = empty((nsets, l, a_max), 'd')
    for i, (cal, val) in enumerate(cv(k, nsets)):
        dat = lpls(X[cal], Y[cal], Z, a_max=a_max, alpha=xz_alpha,
                   mean_ctr=mean_ctr, scale='loads', verbose=verbose)
        WWx[i,:,:] = dat['W']
        WWz[i,:,:] = dat['L']
        #WWy[i,:,:] = dat['Q']

    return WWx, WWz
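
# Sketch of typical use (illustrative only): the spread over the jack-knife
# segments gives a rough stability estimate for the weights, cf. the mean
# and variance mentioned in the docstring above.
#
#     Wx, Wz = lpls_jk(X, Y, Z, a_max=3)
#     wx_mean, wx_std = Wx.mean(0), Wx.std(0)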


def find_aopt_from_sep(sep, method='vanilla'):
    """Returns an estimate of the optimal number of components.

    The estimate is based on the squared error of prediction from
    crossvalidation. This is pretty much wild guessing and it is
    recommended to inspect model parameters and prediction errors
    closely before deciding on the optimal number of components.

    *Parameters*:

        sep : {array}
            Squared error of prediction
        method : ['vanilla', '75perc']
            Method used to estimate the optimal number of components

    *Returns*:

        aopt : {integer}
            A guess on the optimal number of components
    """
    if method == 'vanilla':
        # min rmsep
        rmsecv = sqrt(sep.mean(0))
        return rmsecv.argmin() + 1

    elif method == '75perc':
        prct = .75 # percentile
        # index of the 75th percentile in a sorted column
        ind = 1.*sep.shape[0]*prct
        med = median(sep, 0)
        prc_75 = []
        for col in sep.T:
            col = sorted(col)
            prc_75.append(col[int(ind)])
        prc_75 = asarray(prc_75)
        for i in range(1, sep.shape[1]):
            if med[i-1] < prc_75[i]:
                return i
        return len(med)
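
# Illustration of the 'vanilla' rule (hypothetical numbers): with per-set
# squared errors of shape (nsets, a_max), the component minimising the mean
# error over sets is picked.
#
#     sep = asarray([[4., 2., 3.],
#                    [5., 1., 2.]])
#     find_aopt_from_sep(sep)    # -> 2 (second component has lowest RMSECV)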


def cv(N, K, randomise=True, sequential=False):
    """Generates K (training, validation) index pairs.

    Each pair is a partition of arange(N), where validation is an iterable
    of length ~N/K, *without* replacement.

    *Parameters*:

        N : {integer}
            Total number of samples
        K : {integer}
            Number of segments
        randomise : {boolean}
            Use random sampling
        sequential : {boolean}
            Use sequential sampling

    *Returns*:

        training : {array-like}
            training indices
        validation : {array-like}
            validation indices

    *Notes*:

        If randomise is true, a copy of the index is shuffled before
        partitioning, otherwise its order is preserved in training and
        validation.

        Randomise overrides the sequential argument: if randomise is true,
        sequential is set to False.

        If sequential is true the index is partitioned in contiguous blocks,
        otherwise an interleaved ordering is used.
    """
    if K > N:
        raise ValueError("You cannot divide a list of %d samples into more than %d segments. You tried: %d" %(N, N, K))
    index = range(N)
    if randomise:
        from random import shuffle
        shuffle(index)
        sequential = False
    if sequential:
        for validation in array_split(index, K):
            training = [i for i in index if i not in validation]
            yield training, validation
    else:
        # partition by *position*, so that a shuffled index yields random segments
        for k in xrange(K):
            training = [index[i] for i in xrange(N) if i % K != k]
            validation = [index[i] for i in xrange(N) if i % K == k]
            yield training, validation
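
# Example (deterministic ordering, illustrative): six samples in three
# interleaved segments give validation sets [0, 3], [1, 4], [2, 5].
#
#     [val for cal, val in cv(6, 3, randomise=False)]
#     # -> [[0, 3], [1, 4], [2, 5]]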


def class_error(Yhat, Y, method='vanilla'):
    """ Not used.
    """
    a_max, k, l = Yhat.shape
    Yhat_c = zeros((a_max, k, l), dtype='d')
    # winner-takes-all coding of the predicted class
    for a in range(a_max):
        for i in range(k):
            Yhat_c[a, i, argmax(Yhat[a,i,:])] = 1.0
    err = 100*((Yhat_c + Y) == 2).sum(1)/Y.sum(0).astype('d')

    return Yhat_c, err


def class_errorII(T, Y, method='lda'):
    """ Not used ...
    """
    pass