"""This module implements some validation schemes l-pls.
|
|
|
|
The primary use is crossvalidation.
|
|
"""
|
|
__all__ = ['lpls_val', 'lpls_jk']
|
|
__docformat__ = "restructuredtext en"

from numpy import dot, empty, zeros, sqrt, atleast_2d, argmax, asarray, \
     median, array_split, newaxis

from engines import nipals_lpls as lpls


def lpls_val(X, Y, Z, a_max=2, nsets=None, alpha=.5, mean_ctr=[2,0,2], verbose=True):
    """Performs crossvalidation to estimate the generalisation error in lpls.

    The L-PLS crossvalidation is estimated just like an ordinary pls
    crossvalidation. That is, the generalisation error is estimated by
    predicting samples (rows of X and Y) left out according to a
    crossvalidation scheme.

    *Parameters*:

        X : {array}
            Main data matrix (m, n)
        Y : {array}
            External row data (m, l)
        Z : {array}
            External column data (o, n)
        a_max : {integer}, optional
            Maximum number of components to calculate (0, min(m,n))
        nsets : {integer}, optional
            Number of crossvalidation sets
        alpha : {float}, optional
            Parameter to control the amount of influence from the Z-matrix.
            0 is none, which returns a pls solution, 1 is max
        mean_ctr : {array-like}, optional
            A three element array-like structure with elements in [-1,0,1,2]
            that decides the type of centering used.
            -1 : nothing
            0 : row center
            1 : column center
            2 : double center
        verbose : {boolean}, optional
            Verbosity of console output. For use in debugging.

    *Returns*:

        rmsep : {array}
            Root mean squared error of prediction
        yhat : {array}
            Estimated responses
        aopt : {integer}
            Estimated optimal number of components

    """
    m, n = X.shape
    k, l = Y.shape
    o, p = Z.shape
    assert m == k, "X (%d,%d) - Y (%d,%d) dim mismatch" %(m, n, k, l)
    assert n == p, "X (%d,%d) - Z (%d,%d) dim mismatch" %(m, n, o, p)
    if nsets is None:
        nsets = m
    if nsets > m:
        print "nsets (%d) is larger than the number of samples (%d).\nnsets: %d -> %d" %(nsets, m, nsets, m)
        nsets = m
    assert (alpha >= 0 and alpha <= 1), "Alpha needs to be within [0,1], got: %.2f" %alpha

    Yhat = empty((a_max, k, l), 'd')
    # cv expects (number of samples, number of segments)
    for cal, val in cv(k, nsets):
        dat = lpls(X[cal], Y[cal], Z, a_max=a_max, alpha=alpha,
                   mean_ctr=mean_ctr, verbose=verbose)
        if mean_ctr[0] != 1:
            xi = X[val,:] - dat['mnx']
        else:
            xi = X[val] - X[val].mean(1)[:,newaxis]
        if mean_ctr[2] != 1:
            ym = dat['mny']
        else:
            ym = Y[val].mean(1)[:,newaxis] #???: check this
        for a in range(a_max):
            Yhat[a,val,:] = atleast_2d(ym + dot(xi, dat['B'][a]))

    # todo: need better support for classification error
    y_is_class = Y.dtype.char.lower() in ['i', 'p', 'b', 'h', '?']
    if y_is_class:
        Yhat, err = class_error(Yhat, Y)
        return Yhat, err

    sep = (Y - Yhat)**2
    rmsep = sqrt(sep.mean(1)).T
    aopt = find_aopt_from_sep(rmsep)

    return rmsep, Yhat, aopt
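
# A minimal usage sketch (illustrative only; assumes X (m,n), Y (m,l) and
# Z (o,n) are numpy arrays as described in the docstring above):
#
#     rmsep, Yhat, aopt = lpls_val(X, Y, Z, a_max=5, nsets=10, alpha=.5)
#     print "suggested number of components:", aopt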


def lpls_jk(X, Y, Z, a_max, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2], verbose=False):
    """Returns jack-knifed segments of an lpls model.

    Jack-knifing is a method to perturb the model parameters, hopefully
    to be representative of a typical perturbation of a *future* sample.
    The mean and variance of the jack-knife segments may be used to
    infer the parameter confidence in the model.

    The segments returned are the X-block weights and Z-block weights.

    *Parameters*:

        X : {array}
            Main data matrix (m, n)
        Y : {array}
            External row data (m, l)
        Z : {array}
            External column data (o, n)
        a_max : {integer}
            Maximum number of components to calculate (0, min(m,n))
        nsets : {integer}, optional
            Number of jack-knife segments
        xz_alpha : {float}, optional
            Parameter to control the amount of influence from the Z-matrix.
            0 is none, which returns a pls solution, 1 is max
        mean_ctr : {array-like}, optional
            A three element array-like structure with elements in [-1,0,1,2]
            that decides the type of centering used.
            -1 : nothing
            0 : row center
            1 : column center
            2 : double center
        verbose : {boolean}, optional
            Verbosity of console output. For use in debugging.

    *Returns*:

        Wx : {array}
            X-block jack-knife segments
        Wz : {array}
            Z-block jack-knife segments

    """
    m, n = X.shape
    k, l = Y.shape
    o, p = Z.shape
    assert m == k
    assert n == p
    if nsets is None:
        nsets = m
    WWx = empty((nsets, n, a_max), 'd')
    WWz = empty((nsets, o, a_max), 'd')
    #WWy = empty((nsets, l, a_max), 'd')
    for i, (cal, val) in enumerate(cv(k, nsets)):
        dat = lpls(X[cal], Y[cal], Z, a_max=a_max, alpha=xz_alpha,
                   mean_ctr=mean_ctr, scale='loads', verbose=verbose)
        WWx[i,:,:] = dat['W']
        WWz[i,:,:] = dat['L']
        #WWy[i,:,:] = dat['Q']

    return WWx, WWz
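
# Sketch of typical use (illustrative only): the spread over the jack-knife
# segments gives a rough stability estimate for the weights, cf. the mean
# and variance mentioned in the docstring above.
#
#     Wx, Wz = lpls_jk(X, Y, Z, a_max=3)
#     wx_mean, wx_std = Wx.mean(0), Wx.std(0)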


def find_aopt_from_sep(sep, method='vanilla'):
    """Returns an estimate of the optimal number of components.

    The estimate is based on the squared error of prediction from
    crossvalidation. This is pretty much wild guessing and it is
    recommended to inspect model parameters and prediction errors
    closely before deciding on the optimal number of components.

    *Parameters*:

        sep : {array}
            Squared error of prediction
        method : ['vanilla', '75perc']
            Method used to estimate the optimal number of components

    *Returns*:

        aopt : {integer}
            A guess on the optimal number of components
    """
    if method == 'vanilla':
        # min rmsep
        rmsecv = sqrt(sep.mean(0))
        return rmsecv.argmin() + 1

    elif method == '75perc':
        prct = .75 # percentile
        # index of the 75th percentile in a sorted column
        ind = 1.*sep.shape[0]*prct
        med = median(sep, 0)
        prc_75 = []
        for col in sep.T:
            col = sorted(col)
            prc_75.append(col[int(ind)])
        prc_75 = asarray(prc_75)
        for i in range(1, sep.shape[1]):
            if med[i-1] < prc_75[i]:
                return i
        return len(med)
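
# Illustration of the 'vanilla' rule (hypothetical numbers): with per-set
# squared errors of shape (nsets, a_max), the component minimising the mean
# error over sets is picked.
#
#     sep = asarray([[4., 2., 3.],
#                    [5., 1., 2.]])
#     find_aopt_from_sep(sep)    # -> 2 (second component has lowest RMSECV)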


def cv(N, K, randomise=True, sequential=False):
    """Generates K (training, validation) index pairs.

    Each pair is a partition of arange(N), where validation is an iterable
    of length ~N/K, *without* replacement.

    *Parameters*:

        N : {integer}
            Total number of samples
        K : {integer}
            Number of segments
        randomise : {boolean}
            Use random sampling
        sequential : {boolean}
            Use sequential sampling

    *Returns*:

        training : {array-like}
            training indices
        validation : {array-like}
            validation indices

    *Notes*:

        If randomise is true, a copy of the index is shuffled before
        partitioning, otherwise its order is preserved in training and
        validation.

        Randomise overrides the sequential argument: if randomise is true,
        sequential is set to False.

        If sequential is true the index is partitioned in contiguous blocks,
        otherwise an interleaved ordering is used.
    """
    if K > N:
        raise ValueError("You cannot divide a list of %d samples into more than %d segments. You tried: %d" %(N, N, K))
    index = range(N)
    if randomise:
        from random import shuffle
        shuffle(index)
        sequential = False
    if sequential:
        for validation in array_split(index, K):
            training = [i for i in index if i not in validation]
            yield training, validation
    else:
        # partition by *position*, so that a shuffled index yields random segments
        for k in xrange(K):
            training = [index[i] for i in xrange(N) if i % K != k]
            validation = [index[i] for i in xrange(N) if i % K == k]
            yield training, validation
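
# Example (deterministic ordering, illustrative): six samples in three
# interleaved segments give validation sets [0, 3], [1, 4], [2, 5].
#
#     [val for cal, val in cv(6, 3, randomise=False)]
#     # -> [[0, 3], [1, 4], [2, 5]]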


def class_error(Yhat, Y, method='vanilla'):
    """ Not used.
    """
    a_max, k, l = Yhat.shape
    Yhat_c = zeros((a_max, k, l), dtype='d')
    # winner-takes-all coding of the predicted class
    for a in range(a_max):
        for i in range(k):
            Yhat_c[a, i, argmax(Yhat[a,i,:])] = 1.0
    err = 100*((Yhat_c + Y) == 2).sum(1)/Y.sum(0).astype('d')

    return Yhat_c, err


def class_errorII(T, Y, method='lda'):
    """ Not used ...
    """
    pass