pyblm/pyblm/statistics.py
""" A collection of some statistical utitlites used.
"""
__all__ = ['hotelling', 'lpls_qvals', 'pls_qvals']
__docformat__ = 'restructuredtext en'
from math import sqrt as msqrt
from numpy import dot,empty,zeros,eye,median,sign,arange,argsort
from numpy.linalg import svd,inv,det
from numpy.random import shuffle
from crossvalidation import lpls_jk, pls_jk
from engines import pls
from engines import nipals_lpls as lpls
def hotelling(Pcv, P, p_center='median', cov_center='median',
              alpha=0.3, crot=True, strict=False):
    """Returns the regularised Hotelling T^2 statistic.

    Hotelling's T^2 is a generalisation of Student's t statistic used in
    multivariate hypothesis testing. To avoid low-variance variables
    becoming spuriously significant, this version allows borrowing
    variance from the pooled covariance estimate.

    *Parameters*:

        Pcv : {array}
            Crossvalidation segments of the parameter (n_sets, n, amax)
        P : {array}
            Calibration model parameter (n, amax)
        p_center : {'median', 'mean', 'cal_model'}, optional
            Location method for sub-segments
        cov_center : {'median', 'mean', 'cal_model'}, optional
            Pooled covariance estimate
        alpha : {float}, optional
            Regularisation towards the pooled covariance estimate.
        crot : {boolean}, optional
            Rotate sub-segments towards the calibration model.
        strict : {boolean}, optional
            Only rotate by 90 degrees

    *Returns*:

        tsq : {array}
            Hotelling's T^2 estimate

    *Reference*:

        Gidskehaug et al., A framework for significance analysis of
        gene expression data using dimension reduction methods, BMC
        Bioinformatics, 2007

    *Notes*:

        The rotational freedom in the solution of bilinear models may
        require a rotation onto the calibration model before the
        sub-segments are compared. One way of doing that is procrustes
        rotation.
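
    *Example*:

        A rough usage sketch on random data; the shapes are illustrative
        only, and in real use Pcv would hold jack-knifed loadings from a
        cross-validated model:

        >>> from numpy.random import randn
        >>> P = randn(10, 2)                 # calibration loadings (n x amax)
        >>> Pcv = P + 0.1*randn(5, 10, 2)    # 5 cross-validation segments
        >>> tsq = hotelling(Pcv, P, alpha=0.3)   # one T^2 value per variable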
"""
m, n = P.shape
n_sets, n, amax = Pcv.shape
T_sq = empty((n,), dtype='d')
Cov_i = zeros((n, amax, amax), dtype='d')
# rotate sub_models to full model
if crot:
for i, Pi in enumerate(Pcv):
Pcv[i] = procrustes(P, Pi, strict=strict)
# center of pnull
if p_center=='median':
P_ctr = median(Pcv)
elif p_center=='mean':
P_ctr = Pcv.mean(0)
else: # calibration model
P_ctr = P
for i in xrange(n):
Pi = Pcv[:,i,:] # (n_sets x amax)
Pi_ctr = P_ctr[i,:] # (1 x amax)
#Pim = (Pi - Pi_ctr)*msqrt(n_sets-1)
#Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim)
Pim = (Pi - Pi_ctr)
Cov_i[i] = dot(Pim.T, Pim)
if cov_center == 'median':
Cov_p = median(Cov_i)
elif cov_center == 'mean':
Cov_p = Cov.mean(0)
else:
print "Pooled covariance est. invalid, using median"
print cov_center
Cov_p = median(Cov_i)
reg_cov = (1. - alpha)*Cov_i + alpha*Cov_p
for i in xrange(n):
Pc = P_ctr[i,:]
sigma = reg_cov[i]
T_sq[i] = dot(dot(Pc, inv(sigma)), Pc)
return T_sq
def procrustes(a, b, strict=True, center=False, force_norm=False, verbose=False):
    """Orthogonal rotation of b onto a.

    Procrustes rotation is an orthogonal rotation of one subspace onto
    another, obtained by minimising the squared error.

    *Parameters*:

        a : {array}
            Input array, stationary
        b : {array}
            Input array, rotated to best fit a
        strict : {boolean}
            Only allow flipping and shuffling of columns
        center : {boolean}
            Center before rotation, translate back afterwards
        force_norm : {boolean}
            Normalise the columns of a and b before rotation
        verbose : {boolean}
            Print the residual sum of squares

    *Returns*:

        b_rot : {array}
            The rotated b matrix

    *Reference*:

        Schonemann, A generalized solution of the orthogonal Procrustes
        problem, Psychometrika, 1966
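
    *Example*:

        A small sketch on synthetic data: b is an exact rotation of a, and
        the unconstrained (strict=False) procrustes rotation recovers the
        original orientation:

        >>> from numpy import allclose
        >>> from numpy.random import randn
        >>> a = randn(20, 2)
        >>> b = dot(a, [[0., -1.], [1., 0.]])   # rotate a by 90 degrees
        >>> b_rot = procrustes(a, b, strict=False)
        >>> allclose(a, b_rot)
        True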
"""
if center:
mn_a = a.mean(0)
a = a - mn_a
mn_b = b.mean(0)
b = b - mn_b
if force_norm:
a_norm = apply_along_axis(norm, 0, a)
a = a/a_norm
b_norm = apply_along_axis(norm, 0, b)
b = b/b_norm
u, s, vt = svd(dot(b.T, a))
Cm = dot(u, vt) # Cm: orthogonal rotation matrix
if strict:
Cm = _ensure_strict(Cm)
b_rot = dot(b, Cm)
if verbose:
fit = ((b - b_rot)**2).sum()
fit2 = (dot(a, a.T) + dot(b, b.T) - 2*diag(s)).trace()
print "Error: %.2E , %.2E" %(fit, fit2)
if center:
return mn_b + b_rot
else:
return b_rot
def _ensure_strict(C, only_flips=True):
    """Ensure that a rotation matrix does only 90 degree rotations.

    In multiplication with pcs this allows flips and reordering. If
    only_flips is True, only sign flips of the columns are allowed.

    *Parameters*:

        C : {array}
            Rotation matrix
        only_flips : {boolean}
            Only accept column flips (sign switches)

    *Returns*:

        C_rot : {array}
            Restricted rotation matrix

    *Notes*:

        This function is not ready for general use. Use only_flips=True.
        That is, for more than two components the rotation matrix has a
        tendency to become unstable (det(Cm)>1) when rounding is used.
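
    *Example*:

        A small sketch of the default only_flips behaviour: a rotation
        matrix that is close to a pure sign flip is snapped to the nearest
        flip matrix (off-diagonals zeroed, diagonal signs kept):

        >>> from numpy import array
        >>> C = array([[-0.98, 0.2], [-0.2, -0.98]])
        >>> C_flip = _ensure_strict(C)   # approx. diag(-1., -1.)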
"""
if only_flips:
C = eye(C.shape[0])*sign(C)
return C
Cm = zeros(C.shape, dtype='d')
Cm[abs(C)>.6] = 1.
if det(Cm)>1:
raise NotImplementedError
return Cm*sign(C)
def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
               sim_method='shuffle', p_center='med', cov_center=median,
               crot=True, strict=False, center_axis=[2,0,2], nsets=None,
               zorth=False):
    """Returns q-values for an l-pls model by permutation analysis.

    The response (Y) is randomly permuted, and the number of false
    positives is registered by comparison with the Hotelling T^2
    statistics of the calibration model.

    *Parameters*:

        X : {array}
            Main data matrix (m, n)
        Y : {array}
            External row data (m, l)
        Z : {array}
            External column data (k, n)
        aopt : {integer}
            Optimal number of components
        alpha : {float}, optional
            Regularisation towards the pooled covariance estimate.
        zx_alpha : {float}, optional
            Parameter to control the amount of influence from the Z-matrix.
            0 is none, which returns a pls solution, 1 is max
        n_iter : {integer}, optional
            Number of permutations
        sim_method : ['shuffle'], optional
            Permutation method
        p_center : {'median', 'mean', 'cal_model'}, optional
            Location method for sub-segments
        cov_center : {py_func}, optional
            Location function
        crot : {boolean}, optional
            Rotate sub-segments towards the calibration model.
        strict : {boolean}, optional
            Only rotate by 90 degrees
        center_axis : {array-like}, optional
            A three element array-like structure with elements in
            [-1, 0, 1, 2] that decides the type of centering used.

            -1 : nothing
             0 : row center
             1 : column center
             2 : double center
        nsets : {integer}
            Number of crossvalidation segments

    *Reference*:

        Gidskehaug et al., A framework for significance analysis of
        gene expression data using dimension reduction methods, BMC
        Bioinformatics, 2007
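
    *Example*:

        A rough usage sketch on random data; the shapes are illustrative
        only, and it assumes the lpls engine and lpls_jk accept data of
        this form:

        >>> from numpy.random import randn
        >>> X, Y, Z = randn(15, 50), randn(15, 1), randn(8, 50)
        >>> out = lpls_qvals(X, Y, Z, aopt=2, n_iter=10)
        >>> cal_tsq_z, pert_tsq_z, cal_tsq_x, pert_tsq_x = out
        >>> fdr_x = _fdr(cal_tsq_x, pert_tsq_x)   # per-variable false discovery rate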
"""
m, n = X.shape
k, nz = Z.shape
assert(n==nz)
try:
my, l = Y.shape
except:
# make Y a column vector
Y = atleast_2d(Y).T
my, l = Y.shape
assert(m==my)
pert_tsq_x = zeros((n, n_iter), dtype='d') # (nxvars x n_subsets)
pert_tsq_z = zeros((k, n_iter), dtype='d') # (nzvars x n_subsets)
# Full model
dat = lpls(X, Y, Z, aopt, scale='loads', center_axis=center_axis, zorth=zorth)
Wc, Lc = lpls_jk(X, Y, Z ,aopt, zorth=zorth)
cal_tsq_x = hotelling(Wc, dat['W'], alpha=alpha)
cal_tsq_z = hotelling(Lc, dat['L'], alpha=alpha)
print "morn"
# Perturbations
index = arange(m)
for i in range(n_iter):
indi = index.copy()
shuffle(indi)
dat = lpls(X, Y[indi,:], Z, aopt, scale='loads', center_axis=center_axis, zorth=zorth)
Wi, Li = lpls_jk(X, Y[indi,:], Z, aopt, nsets=nsets)
pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha)
# no reason to borrow variance in dag (alpha ->some small value)
pert_tsq_z[:,i] = hotelling(Li, dat['L'], alpha=0.01)
return cal_tsq_z, pert_tsq_z, cal_tsq_x, pert_tsq_x
def pls_qvals(X, Y, aopt, alpha=.3, n_iter=20, p_center='med', cov_center=median,
              crot=True, strict=False, center_axis=[0,0], nsets=None, zorth=False):
    """Returns q-values for a pls model by permutation analysis.

    The response (Y) is randomly permuted, and the number of false
    positives is registered by comparison with the Hotelling T^2
    statistics of the calibration model.

    *Parameters*:

        X : {array}
            Main data matrix (m, n)
        Y : {array}
            External row data (m, l)
        aopt : {integer}
            Optimal number of components
        alpha : {float}, optional
            Regularisation towards the pooled covariance estimate.
        n_iter : {integer}, optional
            Number of permutations
        p_center : {'median', 'mean', 'cal_model'}, optional
            Location method for sub-segments
        cov_center : {py_func}, optional
            Location function
        crot : {boolean}, optional
            Rotate sub-segments towards the calibration model.
        strict : {boolean}, optional
            Only rotate by 90 degrees
        center_axis : {array-like}, optional
            A two element array-like structure with elements in
            [-1, 0, 1, 2] that decides the type of centering used.

            -1 : nothing
             0 : row center
             1 : column center
             2 : double center
        nsets : {integer}
            Number of crossvalidation segments

    *Reference*:

        Gidskehaug et al., A framework for significance analysis of
        gene expression data using dimension reduction methods, BMC
        Bioinformatics, 2007
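
    *Example*:

        A rough usage sketch on random data; the shapes are illustrative
        only, and it assumes the pls engine and pls_jk accept data of
        this form:

        >>> from numpy.random import randn
        >>> X, Y = randn(20, 100), randn(20, 1)
        >>> cal_tsq, pert_tsq = pls_qvals(X, Y, aopt=3, n_iter=10)
        >>> fdr = _fdr(cal_tsq, pert_tsq)   # per-variable false discovery rate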
"""
m, n = X.shape
try:
my, l = Y.shape
except:
# make Y a column vector
Y = atleast_2d(Y).T
my, l = Y.shape
assert(m==my)
# Full model
dat = pls(X, Y, aopt, scale='loads')
# Submodels
Wc = pls_jk(X, Y , aopt)
cal_tsq_x = hotelling(Wc, dat['W'], alpha=alpha)
# Perturbations
pert_tsq_x = zeros((n, n_iter), dtype='d') # (nxvars x n_subsets)
index = arange(m)
for i in range(n_iter):
indi = index.copy()
shuffle(indi)
dat = pls(X, Y[indi,:], aopt, scale='loads', center_axis=center_axis)
Wi = pls_jk(X, Y[indi,:], aopt, nsets=nsets, center_axis=center_axis)
pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha)
return cal_tsq_x, pert_tsq_x
def _fdr(tsq, tsqp, loc_method=median):
    """Returns the false discovery rate.

    FDR is a method used in multiple hypothesis testing to correct for
    multiple comparisons. It controls the expected proportion of
    incorrectly rejected null hypotheses (type I errors) in a list of
    rejected hypotheses.

    *Parameters*:

        tsq : {array}
            Hotelling's T^2, calibration model
        tsqp : {array}
            Hotelling's T^2, submodels
        loc_method : {py_func}
            Location method

    *Returns*:

        fdr : {array}
            False discovery rate

    *Notes*:

        This is an internal function for use in fdr estimation of
        jack-knifed, perturbed blm parameters.

    *Reference*:

        Gidskehaug et al., A framework for significance analysis of
        gene expression data using dimension reduction methods, BMC
        Bioinformatics, 2007
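
    *Example*:

        A small worked sketch with three variables and two permutations,
        using the default median as location method:

        >>> from numpy import array
        >>> tsq = array([9., 1., 4.])                     # calibration T^2
        >>> tsqp = array([[2., 5.], [2., 0.], [3., 1.]])  # permuted T^2 (3 vars x 2 perms)
        >>> fdr = _fdr(tsq, tsqp)   # approx. [0., 0.67, 0.25]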
"""
n = tsq.shape[0]
k, m = tsqp.shape
assert(n==k)
n_false = empty((n, m), 'd')
sort_index = argsort(tsq)[::-1]
r_index = argsort(sort_index)
for i in xrange(m):
for j in xrange(n):
n_false[j,i] = (tsqp[:,i]>tsq[j]).sum()
fp = loc_method(n_false.T)
n_signif = (arange(n) + 1.0)[r_index]
fd_rate = fp/n_signif
fd_rate[fd_rate>1] = 1
return fd_rate