laydi/fluents/lib/validation.py

"""This module implements some common validation schemes from pca and pls.
"""
from scipy import ones,mean,sqrt,dot,newaxis,zeros,sum,empty,\
     apply_along_axis,eye,kron,array,sort,zeros_like,argmax,atleast_2d
from scipy.stats import median
from scipy.linalg import triu,inv,svd,norm

from select_generators import w_pls_gen,w_pls_gen_jk,pls_gen,pca_gen,diag_pert
from engines import w_simpls,pls,bridge,pca,nipals_lpls
from cx_utils import m_shape


def w_pls_cv_val(X, Y, amax, n_blocks=None):
    """Returns rmsep and aopt for pls tailored for wide X.

    The root mean square error of cross validation is calculated
    based on random block cross-validation. With number of blocks equal to
    number of samples [default] gives leave-one-out cv.
    The pls model is based on the simpls algorithm for wide X.

    :Parameters:
    X : ndarray
        column centered data matrix of size (samples x variables)
    Y : ndarray
        column centered response matrix of size (samples x responses)
    amax : scalar
        Maximum number of components
    n_blocks : scalar
        Number of blocks in cross validation. None [default] uses
        Y.shape[0] blocks.

    :Returns:
    rmsep : ndarray
        Root Mean Square Error of cross-validated Predictions, one row
        per response and amax+1 columns. Column 0 is the baseline
        accumulated before any component is used; column a holds the
        error with a components.
    aopt : scalar
        Guestimate of the optimal number of components

    :SeeAlso:
    - pls_cv_val : Same output, not optimised for wide X
    - w_simpls : Simpls algorithm for wide X

    Notes
    -----
    Based (cowardly translated) on m-files from the Chemoact toolbox
    X, Y inputs need to be centered (fixme: check)
    """

    k, l = m_shape(Y)
    PRESS = zeros((l, amax+1), dtype='f')
    if n_blocks is None:
        n_blocks = Y.shape[0]
    # Work on the (samples x samples) kernel instead of X itself.
    XXt = dot(X, X.T)
    V = w_pls_gen(XXt, Y, n_blocks=n_blocks, center=True)
    for Din, Doi, Yin, Yout in V:
        # Baseline (zero component) error for the left-out block.
        ym = -sum(Yout, 0)[newaxis]/(1.0*Yin.shape[0])
        PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)

        dat = w_simpls(Din, Yin, amax)
        Q, U, H = dat['Q'], dat['U'], dat['H']
        # Scores of the left-out samples.
        That = dot(Doi, dot(U, inv(triu(dot(H.T, U))) ))

        for j in range(l):
            # Column a of the upper-triangular matrix keeps only the first
            # a+1 entries of Q[j,:], so column a of TQ is the prediction of
            # response j using a+1 components.
            TQ = dot(That, triu(dot(Q[j,:][:,newaxis], ones((1,amax)))) )
            E = Yout[:,j][:,newaxis] - TQ
            E = E + sum(E, 0)/Din.shape[0]
            PRESS[j,1:] = PRESS[j,1:] + sum(E**2, 0)
    msep = PRESS/(Y.shape[0])
    aopt = find_aopt_from_sep(msep)
    return sqrt(msep), aopt

def pls_val(X, Y, amax=2, n_blocks=10, algo='pls'):
    """Returns PRESS, cross-validated predictions and aopt for pls.

    input:
          -- X, data matrix (samples x variables)
          -- Y, response matrix (samples x responses)
          -- amax, maximum number of components
          -- n_blocks, number of cross-validation segments
          -- algo, regression engine: 'pls' or 'bridge'
    output:
          -- msep, (responses x amax+1), accumulated squared prediction
             error (PRESS); it is NOT divided by the number of samples.
             Column 0 is the baseline without components.
          -- Yhat, (amax x samples x responses), predictions of the
             left-out samples for 1..amax components
          -- aopt, guestimated optimal number of components

    Raises ValueError if algo is not recognised.
    """
    # Resolve the engine up front. The 'bridge' branch used to call an
    # undefined name (simpls); bridge is the engine imported for it.
    # NOTE(review): assumes bridge accepts mode='normal' like pls -- confirm.
    if algo == 'pls':
        engine = pls
    elif algo == 'bridge':
        engine = bridge
    else:
        raise ValueError("algo must be 'pls' or 'bridge', got %r" % (algo,))

    k, l = m_shape(Y)
    PRESS = zeros((l, amax+1), dtype='<f8')
    Yhat = zeros((amax, k, l), dtype='<f8')
    V = pls_gen(X, Y, n_blocks=n_blocks, center=True, index_out=True)
    for Xin, Xout, Yin, Yout, out in V:
        # 1.0* forces true division (same as w_pls_cv_val).
        ym = -sum(Yout,0)[newaxis]/(1.0*Yin.shape[0])
        Yin = (Yin - ym)
        PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)

        dat = engine(Xin, Yin, amax, mode='normal')

        for a in range(amax):
            # Regression coefficients for a+1 components.
            Ba = dat['B'][a,:,:]
            Yhat_out = dot(Xout, Ba)
            Yhat[a,out,:] = Yhat_out
            E = Yout - Yhat_out
            PRESS[:,a+1] = PRESS[:,a+1] + sum(E**2,0)

    #rmsep = sqrt(PRESS/(k-1.))
    msep = PRESS
    aopt = find_aopt_from_sep(msep)
    return msep, Yhat, aopt

def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2]):
    """Performs crossvalidation to get generalisation error in lpls.

    input:
          -- X, data matrix (samples x variables)
          -- Y, response matrix (samples x responses)
          -- Z, third matrix handed unchanged to nipals_lpls
          -- a_max, maximum number of components
          -- nsets, number of cross-validation segments (passed to pls_gen
             as n_blocks); must not exceed the number of samples
          -- alpha, passed on to nipals_lpls
          -- mean_ctr, centering codes passed on to nipals_lpls; entries 0
             and 1 also decide how left-out X and Y are centered here
    output:
          -- rmsep, (responses x a_max), root mean square error of the
             cross-validated predictions
          -- Yhat, (a_max x samples x responses), predictions
          -- aopt, guestimated optimal number of components
    """
    # Guard None explicitly: None <= int only works on Python 2.
    assert nsets is None or nsets <= X.shape[0], \
        "nsets must not exceed the number of samples"

    cv_iter = pls_gen(X, Y, n_blocks=nsets,center=False,index_out=True)
    k, l = Y.shape
    Yhat = empty((a_max, k, l), 'd')
    for xcal, xi, ycal, yi, ind in cv_iter:
        print(ind)
        dat = nipals_lpls(xcal,ycal,Z,
                          a_max=a_max,
                          alpha=alpha,
                          mean_ctr=mean_ctr,
                          verbose=False)

        B = dat['B']
        # Center the left-out block ONCE. Doing this inside the component
        # loop subtracted dat['mnx'] again for every component, so every
        # model beyond the first was predicted from wrongly shifted data.
        if mean_ctr[0] in [0, 2]:
            xi = xi - dat['mnx']
        else:
            xi = xi - xi.mean(1)[:,newaxis] #???: cheating?
        if mean_ctr[1] in [0, 2]:
            ym = dat['mny']
        else:
            ym = yi.mean(1)[:,newaxis] #???: check this

        for a in range(a_max):
            Yhat[a,ind,:] = atleast_2d(ym + dot(xi, B[a]))

    # todo: need a better support for class validation
    # The class error below is work in progress: it is computed but not
    # returned.
    y_is_class = Y.dtype.char.lower() in ['i','p', 'b', 'h','?']
    if y_is_class:
        Yhat_class = zeros_like(Yhat)
        for a in range(a_max):
            for i in range(k):
                Yhat_class[a,i,argmax(Yhat[a,i,:])] = 1.0
        class_err = 100*((Yhat_class+Y)==2).sum(1)/Y.sum(0).astype('d')

    sep = (Y - Yhat)**2
    # Mean over samples, then transpose to (responses x a_max).
    rmsep = sqrt(sep.mean(1)).T

    aopt = find_aopt_from_sep(rmsep)

    return rmsep, Yhat, aopt

def pca_alter_val(a, amax, n_sets=10, method='diag'):
    """Pca validation by altering elements in X.

    input:
          -- a, data matrix
          -- amax, maximum number of components
          -- n_sets, number of perturbed data sets requested from diag_pert
          -- method, currently unused
    output:
          -- sep, (n_sets x amax), square root of the relative squared
             error on the altered elements, per set and component count
          -- aopt, guestimated optimal number of components

    comments:
             -- may do all jk estimates in this loop
    """

    V = diag_pert(a, n_sets, center=True, index_out=True)
    sep = empty((n_sets, amax), dtype='f')
    for i, (xi, ind) in enumerate(V):
        dat_i = pca(xi, amax, mode='detailed')
        Ti, Pi = dat_i['T'],dat_i['P']
        # The original values at the altered positions do not depend on the
        # number of components, so fetch them once per set.
        a_sub = a.ravel().take(ind)
        tot = (a_sub**2).sum()
        for j in range(amax):
            # Reconstruction from the first j+1 components.
            Xhat = dot(Ti[:,:j+1], Pi[:,:j+1].T)
            EE = a_sub - Xhat.ravel().take(ind)
            sep[i,j] = (EE**2).sum()/tot
    sep = sqrt(sep)
    aopt = find_aopt_from_sep(sep)
    return sep, aopt

def pca_cv_val(a, amax, n_sets):
    """ Returns PRESS from cross-validated pca using random segments.

    input:
          -- a, data matrix (m x n)
          -- amax, maximum nuber of components used
          -- n_sets, number of segments to calculate
    output:
          -- sep, (amax,), squared error of prediction (press) for
             1..amax components, relative to the total sum of squares
             of a
          -- aopt, guestimated optimal number of components

    """

    m, n = a.shape
    # zeros (not empty) so rows never visited by a segment add nothing.
    E = zeros((amax, m, n), dtype='f')
    xtot = (a**2).sum() # this needs centering
    # Honour the caller's n_sets (it used to be hard-coded to 7).
    V = pca_gen(a, n_sets=n_sets, center=True, index_out=True)
    for xi, xout, ind in V:
        dat_i = pca(xi, amax, mode='fast')
        Pi = dat_i['P']
        # The loop index must not be called `a`: that shadowed the data
        # matrix.
        for j in range(amax):
            Pia = Pi[:,:j+1]
            # The original read an undefined name X here; the input matrix
            # is used instead.
            # NOTE(review): xout comes from pca_gen(center=True) while `a`
            # is used as passed in -- confirm both are on the same
            # centering.
            E[j][ind,:] = (a[ind,:] - dot(xout, dot(Pia,Pia.T) ))**2

    sep = array([E[j].sum()/xtot for j in range(amax)])
    # find_aopt_from_sep expects components along the columns, so hand it
    # the 1-D curve as a single row.
    aopt = find_aopt_from_sep(sep[newaxis,:])

    return sep, aopt

def pls_jkW(a, b, amax, n_blocks=None, algo='pls', use_pack=True, center=True):
    """ Returns CV-segments of paramter W for wide X.

    input:
          -- a, data matrix (samples x variables)
          -- b, response matrix
          -- amax, number of components
          -- n_blocks, number of segments; None [default] uses b.shape[0]
          -- algo, regression engine: 'pls' or 'bridge'
          -- use_pack, if True the model is fitted on the svd scores u*s
             of a and W is mapped back to the original variables
          -- center, passed on to pls_gen
    output:
          -- Wcv, (n_blocks x variables x amax), W from each segment

    Raises ValueError if algo is not recognised.

    todo: add support for T,Q and B
    """
    # Fail early on an unknown engine instead of hitting an unbound `dat`
    # inside the loop.
    if algo == 'pls':
        engine = pls
    elif algo == 'bridge':
        engine = bridge
    else:
        raise ValueError("algo must be 'pls' or 'bridge', got %r" % (algo,))

    if n_blocks is None:
        n_blocks = b.shape[0]

    # Sized from the original a, before it is replaced by its scores.
    Wcv = empty((n_blocks, a.shape[1], amax), dtype='d')
    if use_pack:
        u, s, inflater = svd(a, full_matrices=0)
        a = u*s

    V = pls_gen(a, b, n_blocks=n_blocks, center=center)
    for nn, (a_in, a_out, b_in, b_out) in enumerate(V):
        dat = engine(a_in, b_in, amax, 'loads', 'fast')

        W = dat['W']
        if use_pack:
            # Back to the original variable space.
            W = dot(inflater.T, W)

        Wcv[nn,:,:] = W

    return Wcv

def pca_jkP(a, aopt, n_blocks=None):
    """Returns loading from PCA on CV-segments.

    input:
           -- a, data matrix (n x m)
           -- aopt, number of components in model.
           -- n_blocks, number of segments; None [default] uses a.shape[0]
    output:
           -- PP, loadings collected in a three way matrix
           (n_segments, m, aopt)

    comments:
    * The loadings are scaled with the (1/samples)*eigenvalues.
    * Crossvalidation method is currently set to random blocks of samples.

    todo: add support for T
    fixme: more efficient to add this in validation loop
    """
    if n_blocks is None:
        n_blocks = a.shape[0]

    PP = empty((n_blocks, a.shape[1], aopt), dtype='f')
    segments = pca_gen(a, n_sets=n_blocks, center=True)
    # Only the calibration part of each segment is used.
    for nn, (a_in, a_out) in enumerate(segments):
        dat = pca(a_in, aopt, mode='fast', scale='loads')
        PP[nn,:,:] = dat['P']

    return PP


def lpls_jk(X, Y, Z, a_max, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2]):
    """Returns lpls parameters from each cross-validation segment.

    input:
          -- X, data matrix (m x n)
          -- Y, response matrix
          -- Z, third matrix handed unchanged to nipals_lpls
          -- a_max, number of components
          -- nsets, number of segments; None [default] uses X.shape[0]
          -- xz_alpha, passed to nipals_lpls as alpha
          -- mean_ctr, centering codes passed on to nipals_lpls
    output:
          -- WWx, (nsets x n x a_max), dat['W'] from each segment
          -- WWz, (nsets x Z.shape[0] x a_max), dat['L'] from each segment
    """
    m, n = X.shape
    o = Z.shape[0]
    # Resolve the default BEFORE building the segment generator, so that
    # pls_gen gets the same segment count the output arrays are sized for
    # (it used to be handed None).
    if nsets is None:
        nsets = m
    cv_iter = pls_gen(X, Y, n_blocks=nsets,center=False,index_out=False)
    WWx = empty((nsets, n, a_max), 'd')
    WWz = empty((nsets, o, a_max), 'd')
    #WWy = empty((nsets, l, a_max), 'd')
    for i, (xcal, xi, ycal, yi) in enumerate(cv_iter):
        dat = nipals_lpls(xcal,ycal,Z,a_max=a_max,alpha=xz_alpha,
                          mean_ctr=mean_ctr,scale='loads',verbose=False)
        WWx[i,:,:] = dat['W']
        WWz[i,:,:] = dat['L']
        #WWy[i,:,:] = dat['Q']

    return WWx, WWz

def find_aopt_from_sep(sep, method='75perc'):
    """Returns an estimate of optimal number of components from rmsecv.

    input:
          -- sep, error estimates with one column per number of
             components and one row per replicate (response, segment,
             ...). A 1-D array is treated as a single row.
          -- method, 'vanilla': the number of components minimising the
             root of the column means.
             '75perc' [default]: the first i for which the median of
             column i-1 is below the 75th percentile of column i, i.e.
             the point where one more component stops helping. If that
             never happens the number of columns is returned.
    output:
          -- aopt, guestimated optimal number of components

    The input array is not modified.
    Raises ValueError if method is not recognised.
    """
    # Work on a private 2-D copy: the in-place sort below must not leak
    # out to the caller.
    sep = atleast_2d(sep).copy()
    if method == 'vanilla':
        # min rmsep
        rmsecv = sqrt(sep.mean(0))
        return rmsecv.argmin() + 1

    elif method == '75perc':
        prct = .75 #percentile
        ind = int(sep.shape[0]*prct)
        # Column-wise median; the axis is spelled out so the result does
        # not depend on the default of whichever median is imported.
        med = median(sep, axis=0)
        # Sort every column; row `ind` then holds each column's percentile.
        sep.sort(axis=0)
        prc_75 = sep[ind]
        for i in range(1, sep.shape[1]):
            if med[i-1] < prc_75[i]:
                return i
        return len(med)

    raise ValueError("unknown method %r, use 'vanilla' or '75perc'" % (method,))
Multiple lib changes 2007-01-25 12:58:10 +01:00			`"""This module implements some common validation schemes from pca and pls.`
			`"""`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`from scipy import ones,mean,sqrt,dot,newaxis,zeros,sum,empty,\`
.... 2007-09-20 18:11:37 +02:00			`apply_along_axis,eye,kron,array,sort,zeros_like,argmax,atleast_2d`
Multiple lib changes 2007-01-25 12:58:10 +01:00			`from scipy.stats import median`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`from scipy.linalg import triu,inv,svd,norm`

			`from select_generators import w_pls_gen,w_pls_gen_jk,pls_gen,pca_gen,diag_pert`
confidence 2007-07-23 20:07:10 +02:00			`from engines import w_simpls,pls,bridge,pca,nipals_lpls`
Multiple lib changes 2007-01-25 12:58:10 +01:00			`from cx_utils import m_shape`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00
Tralala ... 2007-11-07 13:34:13 +01:00
.... 2007-09-20 18:11:37 +02:00			`def w_pls_cv_val(X, Y, amax, n_blocks=None):`
whitespace 2007-03-14 17:33:54 +01:00			`"""Returns rmsep and aopt for pls tailored for wide X.`

Lib updates 2007-07-23 19:33:21 +02:00			`The root mean square error of cross validation is calculated`
			`based on random block cross-validation. With number of blocks equal to`
			`number of samples [default] gives leave-one-out cv.`
			`The pls model is based on the simpls algorithm for wide X.`

			`:Parameters:`
			`X : ndarray`
			`column centered data matrix of size (samples x variables)`
			`Y : ndarray`
			`column centered response matrix of size (samples x responses)`
			`amax : scalar`
			`Maximum number of components`
			`n_blocks : scalar`
			`Number of blocks in cross validation`

			`:Returns:`
			`rmsep : ndarray`
			`Root Mean Square Error of cross-validated Predictions`
			`aopt : scalar`
			`Guestimate of the optimal number of components`

			`:SeeAlso:`
			`- pls_cv_val : Same output, not optimised for wide X`
			`- w_simpls : Simpls algorithm for wide X`

			`Notes`
			`-----`
			`Based (cowardly translated) on m-files from the Chemoact toolbox`
			`X, Y inputs need to be centered (fixme: check)`

whitespace 2007-03-14 17:33:54 +01:00
Lib updates 2007-07-23 19:33:21 +02:00			`Examples`
			`--------`
whitespace 2007-03-14 17:33:54 +01:00
Lib updates 2007-07-23 19:33:21 +02:00			`>>> import numpy as n`
			`>>> X = n.array([[1., 2., 3.],[]])`
			`>>> Y = n.array([[1., 2., 3.],[]])`
			`>>> w_pls(X, Y, 1)`
			`[4,5,6], 1`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`"""`
Lib updates 2007-07-23 19:33:21 +02:00
Multiple lib changes 2007-01-25 12:58:10 +01:00			`k, l = m_shape(Y)`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`PRESS = zeros((l, amax+1), dtype='f')`
			`if n_blocks==None:`
			`n_blocks = Y.shape[0]`
Multiple lib changes 2007-01-25 12:58:10 +01:00			`XXt = dot(X, X.T)`
			`V = w_pls_gen(XXt, Y, n_blocks=n_blocks, center=True)`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`for Din, Doi, Yin, Yout in V:`
			`ym = -sum(Yout, 0)[newaxis]/(1.0*Yin.shape[0])`
			`PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)`
.... 2007-09-20 18:11:37 +02:00
			`dat = w_simpls(Din, Yin, amax)`
			`Q, U, H = dat['Q'], dat['U'], dat['H']`
			`That = dot(Doi, dot(U, inv(triu(dot(H.T, U))) ))`
whitespace 2007-03-14 17:33:54 +01:00
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`Yhat = []`
			`for j in range(l):`
			`TQ = dot(That, triu(dot(Q[j,:][:,newaxis], ones((1,amax)))) )`
			`E = Yout[:,j][:,newaxis] - TQ`
Trying to fix cv_pls 2007-07-30 11:46:43 +02:00			`E = E + sum(E, 0)/Din.shape[0]`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`PRESS[j,1:] = PRESS[j,1:] + sum(E**2, 0)`
Trying to fix cv_pls 2007-07-30 11:46:43 +02:00			`#Yhat = Yin - dot(That,Q.T)`
			`msep = PRESS/(Y.shape[0])`
			`aopt = find_aopt_from_sep(msep)`
.... 2007-09-20 18:11:37 +02:00			`return sqrt(msep), aopt`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00
... just lots of stuff 2007-08-24 11:14:24 +02:00			`def pls_val(X, Y, amax=2, n_blocks=10, algo='pls'):`
Multiple lib changes 2007-01-25 12:58:10 +01:00			`k, l = m_shape(Y)`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`PRESS = zeros((l, amax+1), dtype='<f8')`
			`EE = zeros((amax, k, l), dtype='<f8')`
			`Yhat = zeros((amax, k, l), dtype='<f8')`
... just lots of stuff 2007-08-24 11:14:24 +02:00			`V = pls_gen(X, Y, n_blocks=n_blocks, center=True, index_out=True)`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`for Xin, Xout, Yin, Yout, out in V:`
			`ym = -sum(Yout,0)[newaxis]/Yin.shape[0]`
			`Yin = (Yin - ym)`
			`PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)`
Multiple lib changes 2007-01-25 12:58:10 +01:00
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`if algo=='pls':`
			`dat = pls(Xin, Yin, amax, mode='normal')`
			`elif algo=='bridge':`
			`dat = simpls(Xin, Yin, amax, mode='normal')`

			`for a in range(amax):`
			`Ba = dat['B'][a,:,:]`
			`Yhat[a,out[:],:] = dot(Xout, Ba)`
			`E = Yout - dot(Xout, Ba)`
			`EE[a,out,:] = E`
			`PRESS[:,a+1] = PRESS[:,a+1] + sum(E**2,0)`

Trying to fix cv_pls 2007-07-30 11:46:43 +02:00			`#rmsep = sqrt(PRESS/(k-1.))`
			`msep = PRESS`
			`aopt = find_aopt_from_sep(msep)`
			`return msep, Yhat, aopt`
Lib updates 2007-07-23 19:33:21 +02:00
.... 2007-09-20 18:11:37 +02:00			`def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2]):`
Lib updates 2007-07-23 19:33:21 +02:00			`"""Performs crossvalidation to get generalisation error in lpls"""`
.... 2007-09-20 18:11:37 +02:00			`assert(nsets<=X.shape[0])`

confidence 2007-07-23 20:07:10 +02:00			`cv_iter = pls_gen(X, Y, n_blocks=nsets,center=False,index_out=True)`
Lib updates 2007-07-23 19:33:21 +02:00			`k, l = Y.shape`
.... 2007-09-20 18:11:37 +02:00			`Yc = empty((k, l), 'd')`
			`Yhat = empty((a_max, k, l), 'd')`
			`Yhatc = empty((a_max, k, l), 'd')`
			`sep2 = empty((a_max, k, l), 'd')`
Lib updates 2007-07-23 19:33:21 +02:00			`for i, (xcal,xi,ycal,yi,ind) in enumerate(cv_iter):`
Tralala ... 2007-11-07 13:34:13 +01:00			`print ind`
confidence 2007-07-23 20:07:10 +02:00			`dat = nipals_lpls(xcal,ycal,Z,`
			`a_max=a_max,`
			`alpha=alpha,`
.... 2007-09-20 18:11:37 +02:00			`mean_ctr=mean_ctr,`
confidence 2007-07-23 20:07:10 +02:00			`verbose=False)`
.... 2007-09-20 18:11:37 +02:00
confidence 2007-07-23 20:07:10 +02:00			`B = dat['B']`
.... 2007-09-20 18:11:37 +02:00			`#b0 = dat['b0']`
			`for a in range(a_max):`
			`if mean_ctr[0] in [0, 2]:`
			`xi = xi - dat['mnx']`
			`else:`
			`xi = xi - xi.mean(1)[:,newaxis] #???: cheating?`
			`if mean_ctr[1] in [0, 2]:`
			`ym = dat['mny']`
			`else:`
			`ym = yi.mean(1)[:,newaxis] #???: check this`

			`Yhat[a,ind,:] = atleast_2d(ym + dot(xi, B[a]))`
			`#Yhat[a,ind,:] = atleast_2d(b0[a] + dot(xi, B[a]))`

			`# todo: need a better support for class validation`
			`y_is_class = Y.dtype.char.lower() in ['i','p', 'b', 'h','?']`
Tralala ... 2007-11-07 13:34:13 +01:00			`#print Y.dtype.char`
.... 2007-09-20 18:11:37 +02:00			`if y_is_class:`
			`Yhat_class = zeros_like(Yhat)`
Lib updates 2007-07-23 19:33:21 +02:00			`for a in range(a_max):`
.... 2007-09-20 18:11:37 +02:00			`for i in range(k):`
			`Yhat_class[a,i,argmax(Yhat[a,i,:])] = 1.0`
			`class_err = 100*((Yhat_class+Y)==2).sum(1)/Y.sum(0).astype('d')`

Lib updates 2007-07-23 19:33:21 +02:00			`sep = (Y - Yhat)**2`
.... 2007-09-20 18:11:37 +02:00			`rmsep = sqrt(sep.mean(1)).T`
			`#rmsep2 = sqrt(sep2.mean(1))`

Lib updates 2007-07-23 19:33:21 +02:00			`aopt = find_aopt_from_sep(rmsep)`
.... 2007-09-20 18:11:37 +02:00
Lib updates 2007-07-23 19:33:21 +02:00			`return rmsep, Yhat, aopt`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00
Multiple lib changes 2007-01-25 12:58:10 +01:00			`def pca_alter_val(a, amax, n_sets=10, method='diag'):`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`"""Pca validation by altering elements in X.`
whitespace 2007-03-14 17:33:54 +01:00
			`comments:`
			`-- may do all jk estimates in this loop`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`"""`
whitespace 2007-03-14 17:33:54 +01:00
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`V = diag_pert(a, n_sets, center=True, index_out=True)`
			`sep = empty((n_sets, amax), dtype='f')`
			`for i, (xi, ind) in enumerate(V):`
			`dat_i = pca(xi, amax, mode='detailed')`
whitespace 2007-03-14 17:33:54 +01:00			`Ti, Pi = dat_i['T'],dat_i['P']`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`for j in xrange(amax):`
			`Xhat = dot(Ti[:,:j+1], Pi[:,:j+1].T)`
			`a_sub = a.ravel().take(ind)`
			`EE = a_sub - Xhat.ravel().take(ind)`
			`tot = (a_sub**2).sum()`
			`sep[i,j] = (EE**2).sum()/tot`
Multiple lib changes 2007-01-25 12:58:10 +01:00			`sep = sqrt(sep)`
			`aopt = find_aopt_from_sep(sep)`
			`return sep, aopt`

			`def pca_cv_val(a, amax, n_sets):`
			`""" Returns PRESS from cross-validated pca using random segments.`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00
Multiple lib changes 2007-01-25 12:58:10 +01:00			`input:`
			`-- a, data matrix (m x n)`
			`-- amax, maximum nuber of components used`
			`-- n_sets, number of segments to calculate`
			`output:`
			`-- sep, (amax x m x n), squared error of prediction (press)`
			`-- aopt, guestimated optimal number of components`
whitespace 2007-03-14 17:33:54 +01:00
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`"""`
whitespace 2007-03-14 17:33:54 +01:00
Multiple lib changes 2007-01-25 12:58:10 +01:00			`m, n = a.shape`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`E = empty((amax, m, n), dtype='f')`
Multiple lib changes 2007-01-25 12:58:10 +01:00			`xtot = (a**2).sum() # this needs centering`
			`V = pca_gen(a, n_sets=7, center=True, index_out=True)`
			`for xi, xout, ind in V:`
			`dat_i = pca(xi, amax, mode='fast')`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`Pi = dat_i['P']`
			`for a in xrange(amax):`
			`Pia = Pi[:,:a+1]`
			`E[a][ind,:] = (X[ind,:] - dot(xout, dot(Pia,Pia.T) ))**2`

			`sep = []`
			`for a in xrange(amax):`
			`sep.append(E[a].sum()/xtot)`
Multiple lib changes 2007-01-25 12:58:10 +01:00			`sep = array(sep)`
			`aopt = find_aopt_from_sep(sep)`
whitespace 2007-03-14 17:33:54 +01:00
Multiple lib changes 2007-01-25 12:58:10 +01:00			`return sep, aopt`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00
... just lots of stuff 2007-08-24 11:14:24 +02:00			`def pls_jkW(a, b, amax, n_blocks=None, algo='pls', use_pack=True, center=True):`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`""" Returns CV-segments of paramter W for wide X.`
whitespace 2007-03-14 17:33:54 +01:00
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`todo: add support for T,Q and B`
			`"""`
			`if n_blocks == None:`
			`n_blocks = b.shape[0]`

Lib updates 2007-07-23 19:33:21 +02:00			`Wcv = empty((n_blocks, a.shape[1], amax), dtype='d')`
... just lots of stuff 2007-08-24 11:14:24 +02:00			`if use_pack:`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`u, s, inflater = svd(a, full_matrices=0)`
			`a = u*s`
whitespace 2007-03-14 17:33:54 +01:00
... just lots of stuff 2007-08-24 11:14:24 +02:00			`V = pls_gen(a, b, n_blocks=n_blocks, center=center)`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`for nn,(a_in, a_out, b_in, b_out) in enumerate(V):`
			`if algo=='pls':`
			`dat = pls(a_in, b_in, amax, 'loads', 'fast')`
whitespace 2007-03-14 17:33:54 +01:00
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`elif algo=='bridge':`
			`dat = bridge(a_in, b_in, amax, 'loads', 'fast')`
whitespace 2007-03-14 17:33:54 +01:00
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`W = dat['W']`
... just lots of stuff 2007-08-24 11:14:24 +02:00			`if use_pack:`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`W = dot(inflater.T, W)`
Added center option to validation, + naming updates 2007-01-31 12:59:23 +01:00
Lib updates 2007-07-23 19:33:21 +02:00			`Wcv[nn,:,:] = W[:,:,]`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00
Added center option to validation, + naming updates 2007-01-31 12:59:23 +01:00			`return Wcv`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00
... just lots of stuff 2007-08-24 11:14:24 +02:00			`def pca_jkP(a, aopt, n_blocks=None):`
Multiple lib changes 2007-01-25 12:58:10 +01:00			`"""Returns loading from PCA on CV-segments.`

			`input:`
			`-- a, data matrix (n x m)`
			`-- aopt, number of components in model.`
			`-- nblocks, number of segments`
			`output:`
			`-- PP, loadings collected in a three way matrix`
			`(n_segments, m, aopt)`

			`comments:`
			`* The loadings are scaled with the (1/samples)*eigenvalues.`
			`* Crossvalidation method is currently set to random blocks of samples.`

First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`todo: add support for T`
			`fixme: more efficient to add this in validation loop`
			`"""`
			`if n_blocks == None:`
			`n_blocks = a.shape[0]`

			`PP = empty((n_blocks, a.shape[1], aopt), dtype='f')`
			`V = pca_gen(a, n_sets=n_blocks, center=True)`
			`for nn,(a_in, a_out) in enumerate(V):`
Multiple lib changes 2007-01-25 12:58:10 +01:00			`dat = pca(a_in, aopt, mode='fast', scale='loads')`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`P = dat['P']`
			`PP[nn,:,:] = P`

			`return PP`
Multiple lib changes 2007-01-25 12:58:10 +01:00
whitespace 2007-03-14 17:33:54 +01:00
.... 2007-09-20 18:11:37 +02:00			`def lpls_jk(X, Y, Z, a_max, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2]):`
confidence 2007-07-23 20:07:10 +02:00			`cv_iter = pls_gen(X, Y, n_blocks=nsets,center=False,index_out=False)`
Lib updates 2007-07-23 19:33:21 +02:00			`m, n = X.shape`
			`k, l = Y.shape`
			`o, p = Z.shape`
			`if nsets==None:`
			`nsets = m`
			`WWx = empty((nsets, n, a_max), 'd')`
			`WWz = empty((nsets, o, a_max), 'd')`
			`#WWy = empty((nsets, l, a_max), 'd')`
Tralala ... 2007-11-07 13:34:13 +01:00			`for i, (xcal, xi, ycal, yi) in enumerate(cv_iter):`
.... 2007-09-20 18:11:37 +02:00			`dat = nipals_lpls(xcal,ycal,Z,a_max=a_max,alpha=xz_alpha,`
			`mean_ctr=mean_ctr,scale='loads',verbose=False)`
confidence 2007-07-23 20:07:10 +02:00			`WWx[i,:,:] = dat['W']`
			`WWz[i,:,:] = dat['L']`
			`#WWy[i,:,:] = dat['Q']`
Lib updates 2007-07-23 19:33:21 +02:00
			`return WWx, WWz`

Multiple lib changes 2007-01-25 12:58:10 +01:00			`def find_aopt_from_sep(sep, method='75perc'):`
			`"""Returns an estimate of optimal number of components from rmsecv.`
			`"""`
Trying to fix cv_pls 2007-07-30 11:46:43 +02:00			`sep = sep.copy()`
Multiple lib changes 2007-01-25 12:58:10 +01:00			`if method=='vanilla':`
			`# min rmsep`
			`rmsecv = sqrt(sep.mean(0))`
			`return rmsecv.argmin() + 1`

			`elif method=='75perc':`
			`prct = .75 #percentile`
			`ind = 1.*sep.shape[0]*prct`
			`med = median(sep)`
			`prc_75 = []`
			`for col in sep.T:`
Trying to fix cv_pls 2007-07-30 11:46:43 +02:00			`col.sort() #this is inplace -> ruins sep, so we are doing a copy`
Multiple lib changes 2007-01-25 12:58:10 +01:00			`prc_75.append(col[int(ind)])`
			`prc_75 = array(prc_75)`
			`for i in range(1, sep.shape[1], 1):`
			`if med[i-1]<prc_75[i]:`
			`return i`
			`return len(med)`