This commit is contained in:
parent
253305b602
commit
ac4456474b
|
@ -12,7 +12,7 @@ from numpy.random import shuffle
|
||||||
from engines import pls, pca
|
from engines import pls, pca
|
||||||
from engines import nipals_lpls as lpls
|
from engines import nipals_lpls as lpls
|
||||||
|
|
||||||
def pca_val(a, a_max, nsets=None, center_axis=[0]):
|
def pca_val(a, a_max, nsets=None, center_axis=[0], method='cv'):
|
||||||
"""Returns error estimate of crossvalidated PCA.
|
"""Returns error estimate of crossvalidated PCA.
|
||||||
|
|
||||||
*Parameters*:
|
*Parameters*:
|
||||||
|
@ -25,6 +25,7 @@ def pca_val(a, a_max, nsets=None, center_axis=[0]):
|
||||||
Centering
|
Centering
|
||||||
nsets : {integer}
|
nsets : {integer}
|
||||||
number of segments
|
number of segments
|
||||||
|
method : {['cv', 'diag']}
|
||||||
|
|
||||||
*Returns*:
|
*Returns*:
|
||||||
|
|
||||||
|
@ -37,8 +38,14 @@ def pca_val(a, a_max, nsets=None, center_axis=[0]):
|
||||||
|
|
||||||
*Notes*:
|
*Notes*:
|
||||||
|
|
||||||
|
- Crossvalidation of PCA is somewhat artificial as we do not have
|
||||||
|
any external information to predict. There are essentially two approaches
|
||||||
|
to this, one is to use a projection error, the other is to leave out
|
||||||
|
elements in the data matrix then record a missing-value estimator error.
|
||||||
|
|
||||||
- Crossvalidation method is currently set to random blocks of diagonals.
|
- Crossvalidation method is currently set to random blocks of diagonals.
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
n, m = a.shape
|
n, m = a.shape
|
||||||
if nsets == None:
|
if nsets == None:
|
||||||
|
@ -47,28 +54,43 @@ def pca_val(a, a_max, nsets=None, center_axis=[0]):
|
||||||
err = zeros((a_max, n, m), dtype=a.dtype)
|
err = zeros((a_max, n, m), dtype=a.dtype)
|
||||||
err_mn = zeros((a_max, n, m), dtype=a.dtype)
|
err_mn = zeros((a_max, n, m), dtype=a.dtype)
|
||||||
xhat = zeros((a_max, n, m), dtype=a.dtype)
|
xhat = zeros((a_max, n, m), dtype=a.dtype)
|
||||||
mn_a = .5*(a.mean(0) + a.mean(1)[:,newaxis])
|
|
||||||
|
|
||||||
for i, val in enumerate(diag_cv(a.shape, nsets)):
|
if method == 'diag':
|
||||||
old_values = a.take(val)
|
mn_a = .5*(a.mean(0) + a.mean(1)[:,newaxis])
|
||||||
new_values = mn_a.take(val)
|
for i, val in enumerate(diag_cv(a.shape, nsets)):
|
||||||
# impute mean values
|
old_values = a.take(val)
|
||||||
b = a.copy()
|
new_values = mn_a.take(val)
|
||||||
a.put(val, new_values)
|
# impute with mean values
|
||||||
dat = pca(a, a_max, mode='normal', center_axis=center_axis)
|
b = a.copy()
|
||||||
Ti, Pi = dat['T'], dat['P']
|
a.put(val, new_values)
|
||||||
bc = b - dat['mnx']
|
dat = pca(a, a_max, mode='normal', center_axis=center_axis)
|
||||||
bc2 = b - b.mean(0)
|
Ti, Pi = dat['T'], dat['P']
|
||||||
for j in xrange(a_max):
|
bc = b - dat['mnx']
|
||||||
# predict the imputed values
|
bc2 = b - b.mean(0)
|
||||||
a_pred = dot(Ti[:,:j+1], Pi[:,:j+1].T).take(val)
|
for j in xrange(a_max):
|
||||||
a_true = bc2.take(val)
|
# estimate the imputed values
|
||||||
err[j,:,:].put(val, (a_true - a_pred)**2)
|
a_pred = dot(Ti[:,:j+1], Pi[:,:j+1].T).take(val)
|
||||||
err_mn[j,:,:].put(val, (bc.take(val) - a_pred)**2)
|
a_true = bc2.take(val)
|
||||||
xhat[j,:,:].put(val, a_pred)
|
err[j,:,:].put(val, (a_true - a_pred)**2)
|
||||||
|
err_mn[j,:,:].put(val, (bc.take(val) - a_pred)**2)
|
||||||
|
xhat[j,:,:].put(val, a_pred)
|
||||||
|
# put original values back
|
||||||
|
a.put(val, old_values)
|
||||||
|
|
||||||
# put original values back
|
elif method == 'cv':
|
||||||
a.put(val, old_values)
|
for i, (cal, val) in enumerate(cv(n, nsets)):
|
||||||
|
xval = atleast_2d(x[val,:])
|
||||||
|
xcal = x[cal, :]
|
||||||
|
P = pca(xcal, aopt, mode='fast', scale='scores')['P']
|
||||||
|
e = eye(m)
|
||||||
|
rmat = zeros((m, m))
|
||||||
|
for j, p in enumerate(P.T):
|
||||||
|
d2 = diag(e) - (p**2).ravel()
|
||||||
|
e = e - dot(p, p.T)
|
||||||
|
d = diag(e)
|
||||||
|
es = e/atleast_2d(d)
|
||||||
|
xhat[j,:,:] = dot(xval, es)
|
||||||
|
err[i, a] = (dot(xval, es)**2).sum()
|
||||||
|
|
||||||
rmsep = sqrt(err).mean(1) # take mean over samples
|
rmsep = sqrt(err).mean(1) # take mean over samples
|
||||||
rmsep2 = sqrt(err_mn).mean(1)
|
rmsep2 = sqrt(err_mn).mean(1)
|
||||||
|
@ -120,8 +142,8 @@ def pls_val(X, Y, a_max=2, nsets=None, center_axis=[0,0], verbose=False):
|
||||||
if nsets > X.shape[0]:
|
if nsets > X.shape[0]:
|
||||||
print "nsets (%d) is larger than number of variables (%d).\nnsets: %d -> %d" %(nsets, m, nsets, m)
|
print "nsets (%d) is larger than number of variables (%d).\nnsets: %d -> %d" %(nsets, m, nsets, m)
|
||||||
nsets = m
|
nsets = m
|
||||||
if m > n:
|
if n > 5*m:
|
||||||
# kernel boosting
|
# boosting (wide x)
|
||||||
Yhat = _w_pls_predict(X, Y, a_max)
|
Yhat = _w_pls_predict(X, Y, a_max)
|
||||||
|
|
||||||
Yhat = empty((a_max, k, l), dtype=dt)
|
Yhat = empty((a_max, k, l), dtype=dt)
|
||||||
|
@ -259,7 +281,7 @@ def pca_jk(a, aopt, nsets=None, center_axis=[0], method='cv'):
|
||||||
|
|
||||||
*Notes*:
|
*Notes*:
|
||||||
|
|
||||||
- Crossvalidation method is currently set to random blocks of samples.
|
- .
|
||||||
|
|
||||||
"""
|
"""
|
||||||
m, n = a.shape
|
m, n = a.shape
|
||||||
|
@ -518,8 +540,8 @@ def diag_cv(shape, nsets=9, randomise=True):
|
||||||
if nsets>m or nsets>n:
|
if nsets>m or nsets>n:
|
||||||
msg = "You may not use more subsets than max(n_rows, n_cols)"
|
msg = "You may not use more subsets than max(n_rows, n_cols)"
|
||||||
raise ValueError, msg
|
raise ValueError, msg
|
||||||
msg = "You may not use more subsets than max(n_rows, n_cols)"
|
|
||||||
nsets = min(m, n)
|
nsets = min(m, n)
|
||||||
nm = n*m
|
nm = n*m
|
||||||
index = arange(nm)
|
index = arange(nm)
|
||||||
n_ind = arange(n+1)
|
n_ind = arange(n+1)
|
||||||
|
@ -533,20 +555,6 @@ def diag_cv(shape, nsets=9, randomise=True):
|
||||||
#training = [j for j in index if j not in validation]
|
#training = [j for j in index if j not in validation]
|
||||||
yield list(validation)
|
yield list(validation)
|
||||||
|
|
||||||
def class_error(y_hat, y, method='vanilla'):
|
|
||||||
""" Not used.
|
|
||||||
"""
|
|
||||||
a_opt, k, l = y_hat.shape
|
|
||||||
y_hat_c = zeros((k, l), dtype='d')
|
|
||||||
if method == vanilla:
|
|
||||||
pass
|
|
||||||
for a in range(a_opt):
|
|
||||||
for i in range(k):
|
|
||||||
y_hat_c[a, val, argmax(y_hat[a,val,:])] = 1.0
|
|
||||||
err = 100*((y_hat_c + y) == 2).sum(1)/y.sum(0).astype('d')
|
|
||||||
|
|
||||||
return y_hat_c, err
|
|
||||||
|
|
||||||
def prediction_error(y_hat, y, method='squared'):
|
def prediction_error(y_hat, y, method='squared'):
|
||||||
"""Loss function on multiclass Y.
|
"""Loss function on multiclass Y.
|
||||||
|
|
||||||
|
|
|
@ -84,7 +84,8 @@ def hotelling(Pcv, P, p_center='median', cov_center='median',
|
||||||
|
|
||||||
if cov_center == 'median':
|
if cov_center == 'median':
|
||||||
Cov_p = median(Cov_i)
|
Cov_p = median(Cov_i)
|
||||||
else cov_center == 'mean':
|
else:
|
||||||
|
# cov_center == 'mean'
|
||||||
Cov_p = Cov.mean(0)
|
Cov_p = Cov.mean(0)
|
||||||
reg_cov = (1. - alpha)*Cov_i + alpha*Cov_p
|
reg_cov = (1. - alpha)*Cov_i + alpha*Cov_p
|
||||||
for i in xrange(n):
|
for i in xrange(n):
|
||||||
|
|
Reference in New Issue