From ac4456474b66434d95edfe57cdbc523f76facc44 Mon Sep 17 00:00:00 2001 From: flatberg Date: Sun, 16 Dec 2007 21:36:40 +0000 Subject: [PATCH] ii --- pyblm/crossvalidation.py | 98 ++++++++++++++++++++++------------------ pyblm/statistics.py | 3 +- 2 files changed, 55 insertions(+), 46 deletions(-) diff --git a/pyblm/crossvalidation.py b/pyblm/crossvalidation.py index 765f42b..41afde5 100644 --- a/pyblm/crossvalidation.py +++ b/pyblm/crossvalidation.py @@ -12,7 +12,7 @@ from numpy.random import shuffle from engines import pls, pca from engines import nipals_lpls as lpls -def pca_val(a, a_max, nsets=None, center_axis=[0]): +def pca_val(a, a_max, nsets=None, center_axis=[0], method='cv'): """Returns error estimate of crossvalidated PCA. *Parameters*: @@ -25,7 +25,8 @@ def pca_val(a, a_max, nsets=None, center_axis=[0]): Centering nsets : {integer} number of segments - + method : {['cv', 'diag']} + *Returns*: rmsep : {array} @@ -36,8 +37,14 @@ def pca_val(a, a_max, nsets=None, center_axis=[0]): Estimate of optimal number of components *Notes*: - + + - Crossvalidation of PCA is somewhat artificial as we do not have + any external information to predict. There are essentially two approaches + to this, one is to use a projection error, the other is to leave out + elements in the data matrix then record a missing-value estimator error. + - Crossvalidation method is currently set to random blocks of diagonals. + """ n, m = a.shape @@ -47,29 +54,44 @@ def pca_val(a, a_max, nsets=None, center_axis=[0]): err = zeros((a_max, n, m), dtype=a.dtype) err_mn = zeros((a_max, n, m), dtype=a.dtype) xhat = zeros((a_max, n, m), dtype=a.dtype) - mn_a = .5*(a.mean(0) + a.mean(1)[:,newaxis]) - - for i, val in enumerate(diag_cv(a.shape, nsets)): - old_values = a.take(val) - new_values = mn_a.take(val) - # impute mean values - b = a.copy() - a.put(val, new_values) - dat = pca(a, a_max, mode='normal', center_axis=center_axis) - Ti, Pi = dat['T'], dat['P'] - bc = b - dat['mnx'] - bc2 = b - b.mean(0) - for j in xrange(a_max): - # predict the imputed values - a_pred = dot(Ti[:,:j+1], Pi[:,:j+1].T).take(val) - a_true = bc2.take(val) - err[j,:,:].put(val, (a_true - a_pred)**2) - err_mn[j,:,:].put(val, (bc.take(val) - a_pred)**2) - xhat[j,:,:].put(val, a_pred) - - # put original values back - a.put(val, old_values) - + + if method == 'diag': + mn_a = .5*(a.mean(0) + a.mean(1)[:,newaxis]) + for i, val in enumerate(diag_cv(a.shape, nsets)): + old_values = a.take(val) + new_values = mn_a.take(val) + # impute with mean values + b = a.copy() + a.put(val, new_values) + dat = pca(a, a_max, mode='normal', center_axis=center_axis) + Ti, Pi = dat['T'], dat['P'] + bc = b - dat['mnx'] + bc2 = b - b.mean(0) + for j in xrange(a_max): + # estimate the imputed values + a_pred = dot(Ti[:,:j+1], Pi[:,:j+1].T).take(val) + a_true = bc2.take(val) + err[j,:,:].put(val, (a_true - a_pred)**2) + err_mn[j,:,:].put(val, (bc.take(val) - a_pred)**2) + xhat[j,:,:].put(val, a_pred) + # put original values back + a.put(val, old_values) + + elif method == 'cv': + for i, (cal, val) in enumerate(cv(n, nsets)): + xval = atleast_2d(x[val,:]) + xcal = x[cal, :] + P = pca(xcal, aopt, mode='fast', scale='scores')['P'] + e = eye(m) + rmat = zeros((m, m)) + for j, p in enumerate(P.T): + d2 = diag(e) - (p**2).ravel() + e = e - dot(p, p.T) + d = diag(e) + es = e/atleast_2d(d) + xhat[j,:,:] = dot(xval, es) + err[i, a] = (dot(xval, es)**2).sum() + rmsep = sqrt(err).mean(1) # take mean over samples rmsep2 = sqrt(err_mn).mean(1) aopt = rmsep.mean(-1).argmin() @@ -120,8 +142,8 @@ def pls_val(X, Y, a_max=2, nsets=None, center_axis=[0,0], verbose=False): if nsets > X.shape[0]: print "nsets (%d) is larger than number of variables (%d).\nnsets: %d -> %d" %(nsets, m, nsets, m) nsets = m - if m > n: - # kernel boosting + if n > 5*m: + # boosting (wide x) Yhat = _w_pls_predict(X, Y, a_max) Yhat = empty((a_max, k, l), dtype=dt) @@ -259,7 +281,7 @@ def pca_jk(a, aopt, nsets=None, center_axis=[0], method='cv'): *Notes*: - - Crossvalidation method is currently set to random blocks of samples. + - . """ m, n = a.shape @@ -518,8 +540,8 @@ def diag_cv(shape, nsets=9, randomise=True): if nsets>m or nsets>n: msg = "You may not use more subsets than max(n_rows, n_cols)" raise ValueError, msg - msg = "You may not use more subsets than max(n_rows, n_cols)" - nsets = min(m, n) + + nsets = min(m, n) nm = n*m index = arange(nm) n_ind = arange(n+1) @@ -533,20 +555,6 @@ def diag_cv(shape, nsets=9, randomise=True): #training = [j for j in index if j not in validation] yield list(validation) -def class_error(y_hat, y, method='vanilla'): - """ Not used. - """ - a_opt, k, l = y_hat.shape - y_hat_c = zeros((k, l), dtype='d') - if method == vanilla: - pass - for a in range(a_opt): - for i in range(k): - y_hat_c[a, val, argmax(y_hat[a,val,:])] = 1.0 - err = 100*((y_hat_c + y) == 2).sum(1)/y.sum(0).astype('d') - - return y_hat_c, err - def prediction_error(y_hat, y, method='squared'): """Loss function on multiclass Y. diff --git a/pyblm/statistics.py b/pyblm/statistics.py index b38e881..5b0f96c 100644 --- a/pyblm/statistics.py +++ b/pyblm/statistics.py @@ -84,7 +84,8 @@ def hotelling(Pcv, P, p_center='median', cov_center='median', if cov_center == 'median': Cov_p = median(Cov_i) - else cov_center == 'mean': + else: + # cov_center == 'mean' Cov_p = Cov.mean(0) reg_cov = (1. - alpha)*Cov_i + alpha*Cov_p for i in xrange(n):