ii

2007-12-16 21:36:40 +00:00
parent 253305b602
commit ac4456474b
2 changed files with 55 additions and 46 deletions
--- a/pyblm/crossvalidation.py
+++ b/pyblm/crossvalidation.py
@ -12,7 +12,7 @@ from numpy.random import shuffle
 from engines import pls, pca
 from engines import nipals_lpls as lpls

-def pca_val(a, a_max, nsets=None, center_axis=[0]):
+def pca_val(a, a_max, nsets=None, center_axis=[0], method='cv'):
    """Returns error estimate of crossvalidated PCA.
    
    *Parameters*:
@ -25,7 +25,8 @@ def pca_val(a, a_max, nsets=None, center_axis=[0]):
            Centering
        nsets : {integer}
            number of segments
-
+        method : {['cv', 'diag']}
+    
    *Returns*:

        rmsep : {array}
@ -36,8 +37,14 @@ def pca_val(a, a_max, nsets=None, center_axis=[0]):
            Estimate of optimal number of components
    
    *Notes*:
-    
+
+        - Crossvalidation of PCA is somewhat artificial as we do not have
+        any external information to predict. There are essentially two approaches
+        to this, one is to use a projection error, the other is to leave out
+        elements in the data matrix then record a missing-value estimator error.
+
        - Crossvalidation method is currently set to random blocks of diagonals.
+
        
    """
    n, m = a.shape
@ -47,29 +54,44 @@ def pca_val(a, a_max, nsets=None, center_axis=[0]):
    err = zeros((a_max, n, m), dtype=a.dtype)
    err_mn = zeros((a_max, n, m), dtype=a.dtype)
    xhat = zeros((a_max, n, m), dtype=a.dtype)
-    mn_a = .5*(a.mean(0) + a.mean(1)[:,newaxis])
-    
-    for i, val in enumerate(diag_cv(a.shape, nsets)):
-        old_values = a.take(val)
-        new_values = mn_a.take(val)
-        # impute mean values
-        b = a.copy()
-        a.put(val, new_values)
-        dat = pca(a, a_max, mode='normal', center_axis=center_axis)
-        Ti, Pi = dat['T'], dat['P']
-        bc = b - dat['mnx']
-        bc2 = b - b.mean(0)
-        for j in xrange(a_max):
-            # predict the imputed values    
-            a_pred = dot(Ti[:,:j+1], Pi[:,:j+1].T).take(val)
-            a_true = bc2.take(val)
-            err[j,:,:].put(val, (a_true - a_pred)**2)
-            err_mn[j,:,:].put(val, (bc.take(val) - a_pred)**2)
-            xhat[j,:,:].put(val, a_pred)
-        
-        # put original values back
-        a.put(val, old_values)
-    
+
+    if method == 'diag':
+        mn_a = .5*(a.mean(0) + a.mean(1)[:,newaxis])
+        for i, val in enumerate(diag_cv(a.shape, nsets)):
+            old_values = a.take(val)
+            new_values = mn_a.take(val)
+            # impute with mean values
+            b = a.copy()
+            a.put(val, new_values)
+            dat = pca(a, a_max, mode='normal', center_axis=center_axis)
+            Ti, Pi = dat['T'], dat['P']
+            bc = b - dat['mnx']
+            bc2 = b - b.mean(0)
+            for j in xrange(a_max):
+                # estimate the imputed values    
+                a_pred = dot(Ti[:,:j+1], Pi[:,:j+1].T).take(val)
+                a_true = bc2.take(val)
+                err[j,:,:].put(val, (a_true - a_pred)**2)
+                err_mn[j,:,:].put(val, (bc.take(val) - a_pred)**2)
+                xhat[j,:,:].put(val, a_pred)
+            # put original values back
+            a.put(val, old_values)
+
+    elif method == 'cv':
+        for i, (cal, val) in enumerate(cv(n, nsets)):
+            xval = atleast_2d(x[val,:])
+            xcal = x[cal, :]
+            P = pca(xcal, aopt, mode='fast', scale='scores')['P']
+            e = eye(m)
+            rmat = zeros((m, m))
+            for j, p in enumerate(P.T):
+                d2 = diag(e) - (p**2).ravel()
+                e = e - dot(p, p.T)
+                d = diag(e)
+                es = e/atleast_2d(d)
+                xhat[j,:,:] = dot(xval, es)
+                err[i, a] = (dot(xval, es)**2).sum()
+                
    rmsep = sqrt(err).mean(1) # take mean over samples
    rmsep2 = sqrt(err_mn).mean(1)
    aopt = rmsep.mean(-1).argmin()
@ -120,8 +142,8 @@ def pls_val(X, Y, a_max=2, nsets=None, center_axis=[0,0], verbose=False):
    if nsets > X.shape[0]:
        print "nsets (%d) is larger than number of variables (%d).\nnsets:  %d -> %d" %(nsets, m, nsets, m)
        nsets = m
-    if m > n:
-        # kernel boosting
+    if n > 5*m:
+        # boosting (wide x)
        Yhat = _w_pls_predict(X, Y, a_max)
        
    Yhat = empty((a_max, k, l), dtype=dt)
@ -259,7 +281,7 @@ def pca_jk(a, aopt, nsets=None, center_axis=[0], method='cv'):

    *Notes*:

-        - Crossvalidation method is currently set to random blocks of samples.
+        - .

    """
    m, n = a.shape
@ -518,8 +540,8 @@ def diag_cv(shape, nsets=9, randomise=True):
    if nsets>m or nsets>n:
        msg = "You may not use more subsets than max(n_rows, n_cols)"
        raise ValueError, msg
-         msg = "You may not use more subsets than max(n_rows, n_cols)"
-         nsets = min(m, n)
+         
+    nsets = min(m, n)
    nm = n*m
    index = arange(nm)
    n_ind = arange(n+1)
@ -533,20 +555,6 @@ def diag_cv(shape, nsets=9, randomise=True):
        #training = [j for j in index if j not in validation]
        yield list(validation)

-def class_error(y_hat, y, method='vanilla'):
-    """ Not used.
-    """
-    a_opt, k, l = y_hat.shape
-    y_hat_c = zeros((k, l), dtype='d')
-    if method == vanilla:
-        pass
-    for a in range(a_opt):
-        for i in range(k):
-            y_hat_c[a, val, argmax(y_hat[a,val,:])] = 1.0
-    err = 100*((y_hat_c + y) == 2).sum(1)/y.sum(0).astype('d')
-
-    return y_hat_c, err
-
 def prediction_error(y_hat, y, method='squared'):
    """Loss function on multiclass Y.

--- a/pyblm/statistics.py
+++ b/pyblm/statistics.py
@ -84,7 +84,8 @@ def hotelling(Pcv, P, p_center='median', cov_center='median',

    if cov_center == 'median':
        Cov_p = median(Cov_i)
-    else cov_center == 'mean':
+    else:
+        # cov_center == 'mean'
        Cov_p = Cov.mean(0)
    reg_cov = (1. - alpha)*Cov_i + alpha*Cov_p
    for i in xrange(n):