pyblm/crossvalidation.py

"""This module implements some validation schemes l-pls.
The primary use is crossvalidation.
"""
__all__ = ['lpls_val', 'lpls_jk']
__docformat__ = "restructuredtext en"
from numpy import dot, empty, zeros, sqrt, atleast_2d, argmax, asarray, \
     median, array_split, newaxis

from engines import nipals_lpls as lpls
def lpls_val(X, Y, Z, a_max=2, nsets=None, alpha=.5, mean_ctr=[2,0,2], verbose=True):
"""Performs crossvalidation for generalisation error in lpls.
The L-PLS crossvalidation is estimated just like an ordinary pls
crossvalidation. That is, the generalisation error is estimated by
predicting samples (rows of X and Y) left out according to a cross
validation scheme.
*Parameters*:
X : {array}
Main data matrix (m, n)
Y : {array}
External row data (m, l)
Z : {array}
        External column data (o, n)
a_max : {integer}, optional
Maximum number of components to calculate (0, min(m,n))
    nsets : {integer}, optional
Number of crossvalidation sets
alpha : {float}, optional
Parameter to control the amount of influence from Z-matrix.
0 is none, which returns a pls-solution, 1 is max
    mean_ctr : {array-like}, optional
A three element array-like structure with elements in [-1,0,1,2],
that decides the type of centering used.
-1 : nothing
0 : row center
1 : column center
2 : double center
verbose : {boolean}, optional
Verbosity of console output. For use in debugging.
*Returns*:
rmsep : {array}
        Root mean squared error of prediction
yhat : {array}
Estimated responses
aopt : {integer}
Estimated value of optimal number of components
"""
m, n = X.shape
k, l = Y.shape
o, p = Z.shape
assert m==k, "X (%d,%d) - Y (%d,%d) dim mismatch" %(m,n,k,l)
assert n==p, "X (%d,%d) - Z (%d,%d) dim mismatch" %(m,n,o,p)
    if nsets is None:
        nsets = m
    if nsets > m:
        print "nsets (%d) is larger than the number of samples (%d).\nnsets: %d -> %d" % (nsets, m, nsets, m)
        nsets = m
assert (alpha >= 0 and alpha<=1), "Alpha needs to be within [0,1], got: %.2f" %alpha
Yhat = empty((a_max, k, l), 'd')
    for cal, val in cv(k, nsets):
        dat = lpls(X[cal], Y[cal], Z, a_max=a_max, alpha=alpha,
                   mean_ctr=mean_ctr, verbose=verbose)
        # Center the validation samples with means from the calibration model.
        if mean_ctr[0] != 1:
            xi = X[val,:] - dat['mnx']
        else:
            xi = X[val] - X[val].mean(1)[:,newaxis]
        if mean_ctr[2] != 1:
            ym = dat['mny']
        else:
            ym = Y[val].mean(1)[:,newaxis]  #???: check this
for a in range(a_max):
Yhat[a,val,:] = atleast_2d(ym + dot(xi, dat['B'][a]))
# todo: need a better support for classification error
y_is_class = Y.dtype.char.lower() in ['i','p', 'b', 'h','?']
if y_is_class:
Yhat, err = class_error(Yhat,Y)
return Yhat, err
sep = (Y - Yhat)**2
rmsep = sqrt(sep.mean(1)).T
aopt = find_aopt_from_sep(rmsep)
return rmsep, Yhat, aopt
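
# A minimal usage sketch for lpls_val (hypothetical data and shapes, only to
# illustrate the call signature; `engines.nipals_lpls` must be importable):
#
#     from numpy.random import rand
#     X = rand(20, 100)            # (m, n) main data
#     Y = rand(20, 3)              # (m, l) external row data
#     Z = rand(30, 100)            # (o, n) external column data
#     rmsep, Yhat, aopt = lpls_val(X, Y, Z, a_max=4, nsets=10)
#     # rmsep: one row per response, one column per component.
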
def lpls_jk(X, Y, Z, a_max, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2], verbose=False):
"""Returns jack-knifed segments of lpls model.
Jack-knifing is a method to perturb the model paramters, hopefully
to be representable as a typical perturbation of a *future* sample.
The mean and variance of the jack knife segements may be used to
infer the paramter confidence in th model.
The segements returned are the X-block weights and Z-block weights.
*Parameters*:
X : {array}
Main data matrix (m, n)
Y : {array}
External row data (m, l)
Z : {array}
        External column data (o, n)
    a_max : {integer}
Maximum number of components to calculate (0, min(m,n))
    nsets : {integer}, optional
Number of jack-knife segments
xz_alpha : {float}, optional
Parameter to control the amount of influence from Z-matrix.
0 is none, which returns a pls-solution, 1 is max
    mean_ctr : {array-like}, optional
A three element array-like structure with elements in [-1,0,1,2],
that decides the type of centering used.
-1 : nothing
0 : row center
1 : column center
2 : double center
verbose : {boolean}, optional
Verbosity of console output. For use in debugging.
*Returns*:
    Wx : {array}
        X-block jack-knife segments
    Wz : {array}
        Z-block jack-knife segments
"""
m, n = X.shape
k, l = Y.shape
o, p = Z.shape
    assert m == k, "X (%d,%d) - Y (%d,%d) dim mismatch" % (m, n, k, l)
    assert n == p, "X (%d,%d) - Z (%d,%d) dim mismatch" % (m, n, o, p)
    if nsets is None:
        nsets = m
WWx = empty((nsets, n, a_max), 'd')
WWz = empty((nsets, o, a_max), 'd')
#WWy = empty((nsets, l, a_max), 'd')
for i, (cal, val) in enumerate(cv(k, nsets)):
dat = lpls(X[cal],Y[cal],Z,a_max=a_max,alpha=xz_alpha,mean_ctr=mean_ctr,
scale='loads', verbose=verbose)
WWx[i,:,:] = dat['W']
WWz[i,:,:] = dat['L']
#WWy[i,:,:] = dat['Q']
return WWx, WWz
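
# A sketch of how the jack-knife segments might be summarised into rough
# stability estimates (hypothetical follow-up, not part of the original API):
#
#     Wx, Wz = lpls_jk(X, Y, Z, a_max=4)
#     wx_mean = Wx.mean(0)         # mean X-weights across segments
#     wx_std = Wx.std(0)           # segment-to-segment spread
#     # Weights with |wx_mean| well above wx_std are comparatively stable.
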
def find_aopt_from_sep(sep, method='vanilla'):
"""Returns an estimate of optimal number of components.
The estimate is based on the squared error of prediction from
    crossvalidation. This is pretty much wild guessing and it is
    recommended to inspect model parameters and prediction errors
    closely before deciding on the optimal number of components.
*Parameters*:
sep : {array}
Squared error of prediction
method : ['vanilla', '75perc']
        Method used to estimate the optimal number of components
*Returns*:
aopt : {integer}
A guess on the optimal number of components
"""
    if method == 'vanilla':
        # Smallest average prediction error across segments.
        rmsecv = sqrt(sep.mean(0))
        return rmsecv.argmin() + 1
    elif method == '75perc':
        prct = .75  # percentile
        ind = 1.*sep.shape[0]*prct
        med = median(sep, 0)
        prc_75 = []
        for col in sep.T:
            col = sorted(col)
            prc_75.append(col[int(ind)])
        prc_75 = asarray(prc_75)
        # Pick the first component whose predecessor's median error is
        # already below the 75th percentile of the current component.
        for i in range(1, sep.shape[1]):
            if med[i-1] < prc_75[i]:
                return i
        return len(med)
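
# A small worked trace of the default 'vanilla' rule (hypothetical numbers):
#
#     from numpy import array
#     sep = array([[4., 1., 2.],
#                  [6., 3., 2.]])
#     find_aopt_from_sep(sep)      # -> 2
#
# The column means are [5., 2., 2.], so argmin() picks index 1 and the
# function returns 1 + 1 = 2 components.
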
def cv(N, K, randomise=True, sequential=False):
"""Generates K (training, validation) index pairs.
Each pair is a partition of arange(N), where validation is an iterable
of length ~N/K, *without* replacement.
*Parameters*:
N : {integer}
Total number of samples
K : {integer}
        Number of segments
randomise : {boolean}
Use random sampling
sequential : {boolean}
Use sequential sampling
*Returns*:
training : {array-like}
training-indices
validation : {array-like}
validation-indices
    *Notes*:
        If randomise is true, a copy of the index is shuffled before
        partitioning, otherwise its order is preserved in both training
        and validation. Randomise overrides the sequential argument: if
        randomise is true, sequential is set to False.
        If sequential is true the index is partitioned into contiguous
        blocks, otherwise an interleaved ordering is used.
    """
    if K > N:
        raise ValueError, "You cannot divide a list of %d samples into more than %d segments. You tried: %d" % (N, N, K)
index = xrange(N)
if randomise:
from random import shuffle
index = list(index)
shuffle(index)
sequential = False
if sequential:
for validation in array_split(index, K):
training = [i for i in index if i not in validation]
yield training, validation
else:
for k in xrange(K):
training = [i for i in index if i % K != k]
validation = [i for i in index if i % K == k]
yield training, validation
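
# With randomise=False and sequential=False the partitioning is interleaved,
# every K-th index ending up in the same validation segment:
#
#     list(cv(6, 3, randomise=False))
#     # -> [([1, 2, 4, 5], [0, 3]),
#     #     ([0, 2, 3, 5], [1, 4]),
#     #     ([0, 1, 3, 4], [2, 5])]
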
def class_error(Yhat, Y, method='vanilla'):
    """Converts continuous predictions to hard class assignments and
    returns the percentage of correctly classified samples per class.
    """
    a_max, k, l = Yhat.shape
    Yhat_c = zeros((a_max, k, l), dtype='d')
    for a in range(a_max):
        for i in range(k):
            # Assign each sample to the class with the highest predicted value.
            Yhat_c[a, i, argmax(Yhat[a,i,:])] = 1.0
    err = 100*((Yhat_c + Y) == 2).sum(1)/Y.sum(0).astype('d')
    return Yhat_c, err
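
# A tiny check of the class-error computation (hypothetical one-hot Y):
#
#     from numpy import array
#     Y = array([[1., 0.], [0., 1.]])
#     Yhat = array([[[.8, .2], [.4, .6]]])   # (a_max=1, k=2, l=2)
#     Yhat_c, err = class_error(Yhat, Y)
#     # err -> [[ 100., 100.]] : both samples land in the correct class
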
def class_errorII(T, Y, method='lda'):
    """ Not implemented ...
    """
    pass