laydi/fluents/lib/cx_stats.py

from scipy import zeros,zeros_like,sqrt,dot,trace,sign,round_,argmax,\
     sort,ravel,newaxis,asarray,diag,sum,outer,argsort,arange,ones_like,\
     all,apply_along_axis,eye,atleast_2d
from scipy.linalg import svd,inv,norm,det,sqrtm
from scipy.stats import mean,median
from cx_utils import mat_center
from validation import pls_jkW, lpls_jk
from select_generators import shuffle_1d
from engines import pca, pls, bridge
from engines import nipals_lpls as lpls
import time


def hotelling(Pcv, P, p_center='med', cov_center='med',
              alpha=0.3, crot=True, strict=False):
    """Returns regularized hotelling T^2.
    
    alpha -- regularisation towards pooled cov estimates
    beta -- regularisation for unstable eigenvalues
    p_center -- location method for submodels
    cov_center -- location method for sub coviariances
    alpha -- regularisation
    crot -- rotate submodels toward full?
    strict -- only rotate 90 degree ?
    
    """
    m, n = P.shape
    n_sets, n, amax = Pcv.shape
    # allocate
    T_sq = empty((n, ),dtype='d')
    Cov_i = zeros((n, amax, amax),dtype='d')
    
    # rotate sub_models to full model
    if crot:
        for i, Pi in enumerate(Pcv):
            Pcv[i] = procrustes(P, Pi, strict=strict)

    # center of pnull
    if p_center=='med':
        P_ctr = median(Pcv, 0)
    elif p_center=='mean':
        # fixme: mean is unstable
        P_ctr = mean(Pcv, 0)
    else: #use full
        P_ctr = P

    for i in xrange(n):
        Pi = Pcv[:,i,:] # (n_sets x amax) 
        Pi_ctr = P_ctr[i,:] # (1 x amax)
        Pim = (Pi - Pi_ctr[newaxis])*sqrt(n_sets-1)
        Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim)
        
    if cov_center == 'med':
        Cov = median(Cov_i, 0)
    else:
        Cov = mean(Cov_i, 0)
    
    reg_cov = (1. - alpha)*Cov_i + alpha*Cov
    for i in xrange(n):
        #Pc = P_ctr[i,:][:,newaxis]
        Pc = P_ctr[i,:]
        sigma = reg_cov[i]
        # T_sq[i] = (dot(Pc, inv(sigma) )*Pc).sum() #slow
        T_sq[i] = dot(dot(Pc, inv(sigma)), Pc) # dont need to care about transposes
        #T_sq[i] = dot(dot(Pc.T, inv(sigma)), Pc).ravel()
    return T_sq

def procrustes(A, B, strict=True, center=False, verbose=False):
    """Rotation of B to A.

    strict -- Only do flipping and shuffling
    center -- Center before rotation, translate back after
    verbose -- Print ssq
    
    No scaling calculated.
    Output B_rot = Rotated B
    """
    if center:
        A,mn_A = mat_center(A, ret_mn=True)
        B,mn_B = mat_center(B, ret_mn=True)
    u,s,vh = svd(dot(B.T, A))
    v = vh.T
    Cm = dot(u, v.T) #orthogonal rotation matrix
    if strict: # just inverting and flipping
       Cm = ensure_strict(Cm)
    b_rot = dot(B, Cm)

    if verbose:
        print Cm.round()
        fit = sum(ravel(B - b_rot)**2)
        print "Sum of squares: %s" %fit
    if center:
        return mn_B + b_rot
    else:
        return b_rot

def expl_var_x(Xc, T):
    """Returns explained variance of X.
    T should carry variance in length, Xc has zero col-mean.
    """
    exp_var_x = diag(dot(T.T, T))*100/(sum(Xc**2))
    return exp_var_x

def expl_var_y(Y, T, Q):
    """Returns explained variance of Y.
    """
    # centered Y
    exp_var_y = zeros((Q.shape[1], ))
    for a in range(Q.shape[1]):
        Ya = outer(T[:,a], Q[:,a])
        exp_var_y[a] = 100*sum(Ya**2)/sum(Y**2)
    return exp_var_y
        
def pls_qvals(a, b, aopt=None, alpha=.3,
              n_iter=20, algo='pls',
              center=True,
              sim_method='shuffle',
              p_center='med', cov_center='med',
              crot=True, strict=False):

    """Returns qvals for pls model.

    input:
    a -- data matrix
    b -- data matrix
    aopt -- scalar, opt. number of components
    alpha -- [0,1] regularisation parameter for T2-test
    n_iter -- number of permutations
    sim_method -- permutation method ['shuffle']
    p_center -- location estimator for sub models ['med']
    cov_center -- location estimator for covariance of submodels ['med']
    crot -- bool, use rotations of sub models?
    strict -- bool, use stict (rot/flips only) rotations?
    """
    
    m, n = a.shape
    TSQ = zeros((n, n_iter), dtype='d') # (nvars x n_subsets)
    n_false = zeros((n, n_iter), dtype='d')

    #full model
    if center:
        ac = a - a.mean(0)
        bc = b - b.mean(0)
    
    if algo=='bridge':
        dat = bridge(ac, bc, aopt, 'loads', 'fast')
    else:
        dat = pls(ac, bc, aopt, 'loads', 'fast')
    Wcv = pls_jkW(a, b, aopt, n_blocks=None, algo=algo,center=True)
    tsq_full = hotelling(Wcv, dat['W'], p_center=p_center,
                         alpha=alpha, crot=crot, strict=strict,
                         cov_center=cov_center)
    #t0 = time.time()
    Vs = shuffle_1d(bc, n_iter, axis=0)
    for i, b_shuff in enumerate(Vs):
        #t1 = time.time()
        if algo=='bridge':
            dat = bridge(ac, b_shuff, aopt, 'loads','fast')
        else:
            dat = pls(ac, b_shuff, aopt, 'loads', 'fast')
        Wcv = pls_jkW(a, b_shuff, aopt, n_blocks=None, algo=algo)
        TSQ[:,i] = hotelling(Wcv, dat['W'], p_center=p_center,
                             alpha=alpha, crot=crot, strict=strict,
                             cov_center=cov_center)
        #print time.time() - t1
    
    return fdr(tsq_full, TSQ, median)
    

def ensure_strict(C, only_flips=True):
    """Ensure that a rotation matrix does only 90 degree rotations.
    In multiplication with pcs this allows flips and reordering.

    if only_flips is True there will onlt be flips allowed
    """
    Cm = C
    S = sign(C) # signs
    if only_flips==True:
        C = eye(Cm.shape[0])*S
        return C
    Cm = zeros_like(C)
    Cm.putmask(1.,abs(C)>.6)
    if det(Cm)>1:
        raise ValueError,"Implement this!"
    return Cm*S

def pls_qvals_II(a, b, aopt=None, center=True, alpha=.3,
                 n_iter=20, algo='pls',
                 sim_method='shuffle',
                 p_center='med', cov_center='med',
                 crot=True, strict=False):

    """Returns qvals for pls model.
    Shuffling of variables in X.
    Null model is 'If I put genes randomly on network' ... if they are sign:
    then this is due to network structure and not covariance with response.

    input:
    a -- data matrix
    b -- data matrix
    aopt -- scalar, opt. number of components
    alpha -- [0,1] regularisation parameter for T2-test
    n_iter -- number of permutations
    sim_method -- permutation method ['shuffle']
    p_center -- location estimator for sub models ['med']
    cov_center -- location estimator for covariance of submodels ['med']
    crot -- bool, use rotations of sub models?
    strict -- bool, use stict (rot/flips only) rotations?
    """
    
    m, n = a.shape
    TSQ = zeros((n, n_iter), dtype='<f8') # (nvars x n_subsets)
    n_false = zeros((n, n_iter), dtype='<f8')

    #full model

    # center?
    if center==True:
        ac = a - a.mean(0)
        bc = b - b.mean(0)
    
    if algo=='bridge':
        dat = bridge(ac, bc, aopt, 'loads', 'fast')
    else:
        dat = pls(ac, bc, aopt, 'loads', 'fast')
    Wcv = pls_jkW(a, b, aopt, n_blocks=None, algo=algo)
    tsq_full = hotelling(Wcv, dat['W'], p_center=p_center,
                         alpha=alpha, crot=crot, strict=strict,
                         cov_center=cov_center)
    t0 = time.time()
    Vs = shuffle_1d(a, n_iter, 1)
    for i, a_shuff in enumerate(Vs):
        t1 = time.time()
        a = a_shuff - a_shuff.mean(0)
        
        if algo=='bridge':
            dat = bridge(a, b, aopt, 'loads','fast')
        else:
            dat = pls(a, b, aopt, 'loads', 'fast')
        Wcv = pls_jkW(a, b, aopt, n_blocks=None, algo=algo)
        TSQ[:,i] = hotelling(Wcv, dat['W'], p_center=p_center,
                             alpha=alpha, crot=crot, strict=strict,
                             cov_center=cov_center)
        print time.time() - t1
    sort_index = argsort(tsq_full)[::-1]
    back_sort_index = sort_index.argsort()
    print time.time() - t0

    # count false positives
    tsq_full_sorted = tsq_full.take(sort_index)
    for i in xrange(n_iter):
        for j in xrange(n):
            n_false[j,i] = sum(TSQ[:,i]>=tsq_full[j])
    false_pos = median(n_false, 1)
    ll = arange(1, len(false_pos)+1, 1)
    sort_qval = false_pos.take(sort_index)/ll
    qval = false_pos/ll.take(back_sort_index)
    print time.time() - t0
    #return qval, false_pos, TSQ, tsq_full

    return qval

def leverage(aopt=1,*args):
    """Returns leverages
    input : aopt, number of components to base leverage calculations on
            *args, matrices of normed blm-paramters
    output: leverages
    
    For PCA typical inputs are normalised T or normalised P
    For PLSR typical inputs are normalised T or normalised W
    """
    if aopt<1:
        raise ValueError,"Leverages only make sense for aopt>0"
    lev  = []
    for u in args:
        lev_u = 1./u.shape[0] + dot(u[:,:aopt], u[:,:aopt].T).diagonal()
        lev.append(lev_u)
    return lev

def variances(a, t, p):
    """Returns explained variance and ind. var from blm-params.
    input:
          a -- full centered matrix
          t,p -- parameters from a bilinear approx of the above matrix.
    output:
          var -- variance of each component
          var_exp -- cumulative explained variance in percentage

    Typical inputs are:  X(centered),T,P for PCA or
                         X(centered),T,P / Y(centered),T,Q for PLSR.
    """
    
    tot_var = sum(a**2)
    var = 100*(sum(p**2, 0)*sum(t**2, 0))/tot_var
    var_exp = cumsum(var)
    return var, var_exp

def residual_diagnostics(Y, Yhat, aopt=1):
    """Root mean errors and press values. 
    R2 vals
    """
    pass
    

def ssq(E, axis=0, weights=None):
    """Sum of squares, supports weights."""
    n = E.shape[axis]
    if weights==None:
        weights = eye(n)
    else:
        weigths = diag(weigths)
    if axis==0:
        Ew = dot(weights, E)
    elif axis==1:
        Ew = dot(E, weights)
    else:
        raise NotImplementedError, "Higher order modes not supported"
    return pow(Ew,2).sum(axis)


def vnorm(x):
    """Returns the euclidian norm of a vector.

    This is considerably faster than linalg.norm
    """
    return sqrt(dot(x,x.conj()))

def mahalanobis(a, loc=None, acov=None, invcov=None):
    """Returns the distance of each observation in a
    from the location estimate (loc) of the data,
    relative to the shape of the data.
    
    
    a : data matrix (n observations in rows, p variables in columns)
    loc : location estimate of the data (p-dimensional vector)
    covmat or invcov : scatter estimate of the data or the inverse of the scatter estimate (pxp matrix)

    :Returns:
    A vector containing the distances of all the observations to locvct.
    
    """
    n, p = a.shape
    if loc==None:
        loc = a.mean(0)
    loc = atleast_2d(loc)
    if loc.shape[1]==1:
        loc = loc.T; #ensure rowvector
    assert(loc.shape[1]==p)
    xc = a - loc
    if acov==None and invcov==None:
        acov = dot(xc.T, xc)
        
    if invcov != None:
        covmat = atleast_2d(invcov)
        if min(covmat.shape)==1:
            covmat = diag(invcov.ravel())
    else:
        covmat = atleast_2d(acov)
        if min(covmat.shape)==1:
            covmat = diag(covmat.ravel())
        covmat = inv(covmat)
    # mdist = diag(dot(dot(xc, covmat),xc.T))
    mdist = (dot(xc, covmat)*xc).sum(1)
    return mdist

def lpls_qvals(a, b, c, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
               sim_method='shuffle',p_center='med', cov_center='med',crot=True,
               strict=False, mean_ctr=[2,0,2]):

    """Returns qvals for l-pls model.

    input:
    a -- data matrix
    b -- data matrix
    c -- data matrix
    aopt -- scalar, opt. number of components
    alpha -- [0,1] regularisation parameter for T2-test
    xz_alpha -- [0,1] how much z info to include
    n_iter -- number of permutations
    sim_method -- permutation method ['shuffle']
    p_center -- location estimator for sub models ['med']
    cov_center -- location estimator for covariance of submodels ['med']
    crot -- bool, use rotations of sub models?
    strict -- bool, use stict (rot/flips only) rotations?
    """
    
    m, n = a.shape
    p, k = c.shape
    pert_tsq_x = zeros((n, n_iter), dtype='d') # (nxvars x n_subsets)
    pert_tsq_z = zeros((p, n_iter), dtype='d') # (nzvars x n_subsets)

    # Full model
    #print "Full model start"
    dat = lpls(a, b, c, aopt, scale='loads', mean_ctr=mean_ctr)
    Wc,Lc = lpls_jk(a, b, c ,aopt)
    #print "Full hot"
    cal_tsq_x = hotelling(Wc, dat['W'], alpha=alpha)
    cal_tsq_z = hotelling(Lc, dat['L'], alpha=alpha)
    
    # Perturbations
    Vs = shuffle_1d(b, n_iter, axis=0)
    for i, b_shuff in enumerate(Vs):
        #print i
        time.sleep(.01)
        dat = lpls(a, b_shuff,c, aopt, scale='loads', mean_ctr=mean_ctr)
        Wi, Li = lpls_jk(a, b_shuff, c, aopt)
        pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha)
        pert_tsq_z[:,i] = hotelling(Li, dat['L'], alpha=alpha)
   
    return fdr(cal_tsq_z, pert_tsq_z, median), fdr(cal_tsq_x, pert_tsq_x, median)
    

def fdr(tsq, tsqp, loc_method=median):
    n, = tsq.shape
    k, m = tsqp.shape
    assert(n==k)
    n_false = empty((n, m), 'd')
    sort_index = argsort(tsq)[::-1]
    r_index = argsort(sort_index)
    for i in xrange(m):
        for j in xrange(n):
            n_false[j,i] = (tsqp[:,i]>tsq[j]).sum()
    
    fp = loc_method(n_false,1)
    n_signif = (arange(n) + 1.0)[r_index]
    fd_rate = fp/n_signif
    return fd_rate
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`from scipy import zeros,zeros_like,sqrt,dot,trace,sign,round_,argmax,\`
			`sort,ravel,newaxis,asarray,diag,sum,outer,argsort,arange,ones_like,\`
.... 2007-09-20 18:11:37 +02:00			`all,apply_along_axis,eye,atleast_2d`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`from scipy.linalg import svd,inv,norm,det,sqrtm`
			`from scipy.stats import mean,median`
			`from cx_utils import mat_center`
Finnished the l-pls fdr 2007-09-21 15:16:40 +02:00			`from validation import pls_jkW, lpls_jk`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`from select_generators import shuffle_1d`
Finnished the l-pls fdr 2007-09-21 15:16:40 +02:00			`from engines import pca, pls, bridge`
			`from engines import nipals_lpls as lpls`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`import time`

bugfixed scaling issues 2007-01-31 12:57:04 +01:00
			`def hotelling(Pcv, P, p_center='med', cov_center='med',`
... just lots of stuff 2007-08-24 11:14:24 +02:00			`alpha=0.3, crot=True, strict=False):`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`"""Returns regularized hotelling T^2.`

			`alpha -- regularisation towards pooled cov estimates`
			`beta -- regularisation for unstable eigenvalues`
			`p_center -- location method for submodels`
			`cov_center -- location method for sub coviariances`
			`alpha -- regularisation`
			`crot -- rotate submodels toward full?`
			`strict -- only rotate 90 degree ?`

			`"""`
bugfixed scaling issues 2007-01-31 12:57:04 +01:00			`m, n = P.shape`
			`n_sets, n, amax = Pcv.shape`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`# allocate`
.... 2007-09-20 18:11:37 +02:00			`T_sq = empty((n, ),dtype='d')`
			`Cov_i = zeros((n, amax, amax),dtype='d')`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00
			`# rotate sub_models to full model`
			`if crot:`
bugfixed scaling issues 2007-01-31 12:57:04 +01:00			`for i, Pi in enumerate(Pcv):`
			`Pcv[i] = procrustes(P, Pi, strict=strict)`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00
			`# center of pnull`
			`if p_center=='med':`
bugfixed scaling issues 2007-01-31 12:57:04 +01:00			`P_ctr = median(Pcv, 0)`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`elif p_center=='mean':`
			`# fixme: mean is unstable`
bugfixed scaling issues 2007-01-31 12:57:04 +01:00			`P_ctr = mean(Pcv, 0)`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`else: #use full`
bugfixed scaling issues 2007-01-31 12:57:04 +01:00			`P_ctr = P`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00
			`for i in xrange(n):`
bugfixed scaling issues 2007-01-31 12:57:04 +01:00			`Pi = Pcv[:,i,:] # (n_sets x amax)`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`Pi_ctr = P_ctr[i,:] # (1 x amax)`
			`Pim = (Pi - Pi_ctr[newaxis])*sqrt(n_sets-1)`
			`Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim)`

			`if cov_center == 'med':`
			`Cov = median(Cov_i, 0)`
			`else:`
			`Cov = mean(Cov_i, 0)`

			`reg_cov = (1. - alpha)Cov_i + alphaCov`
			`for i in xrange(n):`
.... 2007-09-20 18:11:37 +02:00			`#Pc = P_ctr[i,:][:,newaxis]`
			`Pc = P_ctr[i,:]`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`sigma = reg_cov[i]`
.... 2007-09-20 18:11:37 +02:00			`# T_sq[i] = (dot(Pc, inv(sigma) )*Pc).sum() #slow`
			`T_sq[i] = dot(dot(Pc, inv(sigma)), Pc) # dont need to care about transposes`
			`#T_sq[i] = dot(dot(Pc.T, inv(sigma)), Pc).ravel()`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`return T_sq`

			`def procrustes(A, B, strict=True, center=False, verbose=False):`
			`"""Rotation of B to A.`

			`strict -- Only do flipping and shuffling`
			`center -- Center before rotation, translate back after`
			`verbose -- Print ssq`

			`No scaling calculated.`
			`Output B_rot = Rotated B`
			`"""`
			`if center:`
			`A,mn_A = mat_center(A, ret_mn=True)`
			`B,mn_B = mat_center(B, ret_mn=True)`
			`u,s,vh = svd(dot(B.T, A))`
			`v = vh.T`
			`Cm = dot(u, v.T) #orthogonal rotation matrix`
			`if strict: # just inverting and flipping`
			`Cm = ensure_strict(Cm)`
			`b_rot = dot(B, Cm)`

			`if verbose:`
			`print Cm.round()`
			`fit = sum(ravel(B - b_rot)**2)`
			`print "Sum of squares: %s" %fit`
			`if center:`
			`return mn_B + b_rot`
			`else:`
			`return b_rot`

Updates on metric, whitespace 2007-03-14 17:31:25 +01:00			`def expl_var_x(Xc, T):`
			`"""Returns explained variance of X.`
			`T should carry variance in length, Xc has zero col-mean.`
			`"""`
			`exp_var_x = diag(dot(T.T, T))100/(sum(Xc*2))`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`return exp_var_x`

			`def expl_var_y(Y, T, Q):`
			`"""Returns explained variance of Y.`
			`"""`
			`# centered Y`
			`exp_var_y = zeros((Q.shape[1], ))`
			`for a in range(Q.shape[1]):`
			`Ya = outer(T[:,a], Q[:,a])`
			`exp_var_y[a] = 100sum(Ya2)/sum(Y*2)`
			`return exp_var_y`

			`def pls_qvals(a, b, aopt=None, alpha=.3,`
			`n_iter=20, algo='pls',`
Lib updates 2007-07-23 19:33:21 +02:00			`center=True,`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`sim_method='shuffle',`
			`p_center='med', cov_center='med',`
... just lots of stuff 2007-08-24 11:14:24 +02:00			`crot=True, strict=False):`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00
			`"""Returns qvals for pls model.`

			`input:`
Lib updates 2007-07-23 19:33:21 +02:00			`a -- data matrix`
			`b -- data matrix`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`aopt -- scalar, opt. number of components`
			`alpha -- [0,1] regularisation parameter for T2-test`
			`n_iter -- number of permutations`
			`sim_method -- permutation method ['shuffle']`
			`p_center -- location estimator for sub models ['med']`
			`cov_center -- location estimator for covariance of submodels ['med']`
			`crot -- bool, use rotations of sub models?`
			`strict -- bool, use stict (rot/flips only) rotations?`
			`"""`

			`m, n = a.shape`
Lib updates 2007-07-23 19:33:21 +02:00			`TSQ = zeros((n, n_iter), dtype='d') # (nvars x n_subsets)`
			`n_false = zeros((n, n_iter), dtype='d')`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00
			`#full model`
Lib updates 2007-07-23 19:33:21 +02:00			`if center:`
			`ac = a - a.mean(0)`
			`bc = b - b.mean(0)`
... just lots of stuff 2007-08-24 11:14:24 +02:00
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`if algo=='bridge':`
Lib updates 2007-07-23 19:33:21 +02:00			`dat = bridge(ac, bc, aopt, 'loads', 'fast')`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`else:`
Lib updates 2007-07-23 19:33:21 +02:00			`dat = pls(ac, bc, aopt, 'loads', 'fast')`
... just lots of stuff 2007-08-24 11:14:24 +02:00			`Wcv = pls_jkW(a, b, aopt, n_blocks=None, algo=algo,center=True)`
bugfixed scaling issues 2007-01-31 12:57:04 +01:00			`tsq_full = hotelling(Wcv, dat['W'], p_center=p_center,`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`alpha=alpha, crot=crot, strict=strict,`
Updates on metric, whitespace 2007-03-14 17:31:25 +01:00			`cov_center=cov_center)`
.... 2007-09-20 18:11:37 +02:00			`#t0 = time.time()`
Lib updates 2007-07-23 19:33:21 +02:00			`Vs = shuffle_1d(bc, n_iter, axis=0)`
Updates on metric, whitespace 2007-03-14 17:31:25 +01:00			`for i, b_shuff in enumerate(Vs):`
.... 2007-09-20 18:11:37 +02:00			`#t1 = time.time()`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`if algo=='bridge':`
Lib updates 2007-07-23 19:33:21 +02:00			`dat = bridge(ac, b_shuff, aopt, 'loads','fast')`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`else:`
Lib updates 2007-07-23 19:33:21 +02:00			`dat = pls(ac, b_shuff, aopt, 'loads', 'fast')`
... just lots of stuff 2007-08-24 11:14:24 +02:00			`Wcv = pls_jkW(a, b_shuff, aopt, n_blocks=None, algo=algo)`
bugfixed scaling issues 2007-01-31 12:57:04 +01:00			`TSQ[:,i] = hotelling(Wcv, dat['W'], p_center=p_center,`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`alpha=alpha, crot=crot, strict=strict,`
Updates on metric, whitespace 2007-03-14 17:31:25 +01:00			`cov_center=cov_center)`
.... 2007-09-20 18:11:37 +02:00			`#print time.time() - t1`

			`return fdr(tsq_full, TSQ, median)`

First import of chemometrics utils 2006-12-18 12:59:12 +01:00
			`def ensure_strict(C, only_flips=True):`
			`"""Ensure that a rotation matrix does only 90 degree rotations.`
			`In multiplication with pcs this allows flips and reordering.`

			`if only_flips is True there will onlt be flips allowed`
			`"""`
			`Cm = C`
			`S = sign(C) # signs`
			`if only_flips==True:`
			`C = eye(Cm.shape[0])*S`
			`return C`
			`Cm = zeros_like(C)`
			`Cm.putmask(1.,abs(C)>.6)`
			`if det(Cm)>1:`
			`raise ValueError,"Implement this!"`
			`return Cm*S`

Lib updates 2007-07-23 19:33:21 +02:00			`def pls_qvals_II(a, b, aopt=None, center=True, alpha=.3,`
			`n_iter=20, algo='pls',`
			`sim_method='shuffle',`
			`p_center='med', cov_center='med',`
... just lots of stuff 2007-08-24 11:14:24 +02:00			`crot=True, strict=False):`
Lib updates 2007-07-23 19:33:21 +02:00
			`"""Returns qvals for pls model.`
... just lots of stuff 2007-08-24 11:14:24 +02:00			`Shuffling of variables in X.`
Lib updates 2007-07-23 19:33:21 +02:00			`Null model is 'If I put genes randomly on network' ... if they are sign:`
			`then this is due to network structure and not covariance with response.`

			`input:`
			`a -- data matrix`
			`b -- data matrix`
			`aopt -- scalar, opt. number of components`
			`alpha -- [0,1] regularisation parameter for T2-test`
			`n_iter -- number of permutations`
			`sim_method -- permutation method ['shuffle']`
			`p_center -- location estimator for sub models ['med']`
			`cov_center -- location estimator for covariance of submodels ['med']`
			`crot -- bool, use rotations of sub models?`
			`strict -- bool, use stict (rot/flips only) rotations?`
			`"""`

			`m, n = a.shape`
			`TSQ = zeros((n, n_iter), dtype='<f8') # (nvars x n_subsets)`
			`n_false = zeros((n, n_iter), dtype='<f8')`

			`#full model`

			`# center?`
			`if center==True:`
			`ac = a - a.mean(0)`
			`bc = b - b.mean(0)`
... just lots of stuff 2007-08-24 11:14:24 +02:00
Lib updates 2007-07-23 19:33:21 +02:00			`if algo=='bridge':`
			`dat = bridge(ac, bc, aopt, 'loads', 'fast')`
			`else:`
			`dat = pls(ac, bc, aopt, 'loads', 'fast')`
... just lots of stuff 2007-08-24 11:14:24 +02:00			`Wcv = pls_jkW(a, b, aopt, n_blocks=None, algo=algo)`
Lib updates 2007-07-23 19:33:21 +02:00			`tsq_full = hotelling(Wcv, dat['W'], p_center=p_center,`
			`alpha=alpha, crot=crot, strict=strict,`
			`cov_center=cov_center)`
			`t0 = time.time()`
			`Vs = shuffle_1d(a, n_iter, 1)`
			`for i, a_shuff in enumerate(Vs):`
			`t1 = time.time()`
			`a = a_shuff - a_shuff.mean(0)`

			`if algo=='bridge':`
			`dat = bridge(a, b, aopt, 'loads','fast')`
			`else:`
			`dat = pls(a, b, aopt, 'loads', 'fast')`
... just lots of stuff 2007-08-24 11:14:24 +02:00			`Wcv = pls_jkW(a, b, aopt, n_blocks=None, algo=algo)`
Lib updates 2007-07-23 19:33:21 +02:00			`TSQ[:,i] = hotelling(Wcv, dat['W'], p_center=p_center,`
			`alpha=alpha, crot=crot, strict=strict,`
			`cov_center=cov_center)`
			`print time.time() - t1`
			`sort_index = argsort(tsq_full)[::-1]`
			`back_sort_index = sort_index.argsort()`
			`print time.time() - t0`

			`# count false positives`
			`tsq_full_sorted = tsq_full.take(sort_index)`
			`for i in xrange(n_iter):`
			`for j in xrange(n):`
			`n_false[j,i] = sum(TSQ[:,i]>=tsq_full[j])`
			`false_pos = median(n_false, 1)`
			`ll = arange(1, len(false_pos)+1, 1)`
			`sort_qval = false_pos.take(sort_index)/ll`
			`qval = false_pos/ll.take(back_sort_index)`
			`print time.time() - t0`
			`#return qval, false_pos, TSQ, tsq_full`
.... 2007-09-20 18:11:37 +02:00
Lib updates 2007-07-23 19:33:21 +02:00			`return qval`

First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`def leverage(aopt=1,*args):`
			`"""Returns leverages`
			`input : aopt, number of components to base leverage calculations on`
			`*args, matrices of normed blm-paramters`
			`output: leverages`

			`For PCA typical inputs are normalised T or normalised P`
			`For PLSR typical inputs are normalised T or normalised W`
			`"""`
			`if aopt<1:`
			`raise ValueError,"Leverages only make sense for aopt>0"`
			`lev = []`
			`for u in args:`
			`lev_u = 1./u.shape[0] + dot(u[:,:aopt], u[:,:aopt].T).diagonal()`
			`lev.append(lev_u)`
			`return lev`

Updates on metric, whitespace 2007-03-14 17:31:25 +01:00			`def variances(a, t, p):`
First import of chemometrics utils 2006-12-18 12:59:12 +01:00			`"""Returns explained variance and ind. var from blm-params.`
			`input:`
			`a -- full centered matrix`
			`t,p -- parameters from a bilinear approx of the above matrix.`
			`output:`
			`var -- variance of each component`
			`var_exp -- cumulative explained variance in percentage`

			`Typical inputs are: X(centered),T,P for PCA or`
			`X(centered),T,P / Y(centered),T,Q for PLSR.`
			`"""`

			`tot_var = sum(a**2)`
			`var = 100(sum(p2, 0)sum(t**2, 0))/tot_var`
			`var_exp = cumsum(var)`
			`return var, var_exp`

			`def residual_diagnostics(Y, Yhat, aopt=1):`
			`"""Root mean errors and press values.`
			`R2 vals`
			`"""`
			`pass`


			`def ssq(E, axis=0, weights=None):`
			`"""Sum of squares, supports weights."""`
			`n = E.shape[axis]`
			`if weights==None:`
			`weights = eye(n)`
			`else:`
			`weigths = diag(weigths)`
			`if axis==0:`
			`Ew = dot(weights, E)`
			`elif axis==1:`
			`Ew = dot(E, weights)`
			`else:`
			`raise NotImplementedError, "Higher order modes not supported"`
			`return pow(Ew,2).sum(axis)`

Lib updates 2007-07-23 19:33:21 +02:00
			`def vnorm(x):`
			`"""Returns the euclidian norm of a vector.`

			`This is considerably faster than linalg.norm`
			`"""`
			`return sqrt(dot(x,x.conj()))`
.... 2007-09-20 18:11:37 +02:00
			`def mahalanobis(a, loc=None, acov=None, invcov=None):`
			`"""Returns the distance of each observation in a`
			`from the location estimate (loc) of the data,`
			`relative to the shape of the data.`


			`a : data matrix (n observations in rows, p variables in columns)`
			`loc : location estimate of the data (p-dimensional vector)`
			`covmat or invcov : scatter estimate of the data or the inverse of the scatter estimate (pxp matrix)`

			`:Returns:`
			`A vector containing the distances of all the observations to locvct.`

			`"""`
			`n, p = a.shape`
			`if loc==None:`
			`loc = a.mean(0)`
			`loc = atleast_2d(loc)`
			`if loc.shape[1]==1:`
			`loc = loc.T; #ensure rowvector`
			`assert(loc.shape[1]==p)`
			`xc = a - loc`
			`if acov==None and invcov==None:`
			`acov = dot(xc.T, xc)`

			`if invcov != None:`
			`covmat = atleast_2d(invcov)`
			`if min(covmat.shape)==1:`
			`covmat = diag(invcov.ravel())`
			`else:`
			`covmat = atleast_2d(acov)`
			`if min(covmat.shape)==1:`
			`covmat = diag(covmat.ravel())`
			`covmat = inv(covmat)`
			`# mdist = diag(dot(dot(xc, covmat),xc.T))`
			`mdist = (dot(xc, covmat)*xc).sum(1)`
			`return mdist`

Finnished the l-pls fdr 2007-09-21 15:16:40 +02:00			`def lpls_qvals(a, b, c, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,`
			`sim_method='shuffle',p_center='med', cov_center='med',crot=True,`
			`strict=False, mean_ctr=[2,0,2]):`
.... 2007-09-20 18:11:37 +02:00
			`"""Returns qvals for l-pls model.`

			`input:`
			`a -- data matrix`
			`b -- data matrix`
			`c -- data matrix`
			`aopt -- scalar, opt. number of components`
			`alpha -- [0,1] regularisation parameter for T2-test`
			`xz_alpha -- [0,1] how much z info to include`
			`n_iter -- number of permutations`
			`sim_method -- permutation method ['shuffle']`
			`p_center -- location estimator for sub models ['med']`
			`cov_center -- location estimator for covariance of submodels ['med']`
			`crot -- bool, use rotations of sub models?`
			`strict -- bool, use stict (rot/flips only) rotations?`
			`"""`

			`m, n = a.shape`
Finnished the l-pls fdr 2007-09-21 15:16:40 +02:00			`p, k = c.shape`
			`pert_tsq_x = zeros((n, n_iter), dtype='d') # (nxvars x n_subsets)`
			`pert_tsq_z = zeros((p, n_iter), dtype='d') # (nzvars x n_subsets)`
.... 2007-09-20 18:11:37 +02:00
			`# Full model`
Finnished the l-pls fdr 2007-09-21 15:16:40 +02:00			`#print "Full model start"`
			`dat = lpls(a, b, c, aopt, scale='loads', mean_ctr=mean_ctr)`
			`Wc,Lc = lpls_jk(a, b, c ,aopt)`
			`#print "Full hot"`
			`cal_tsq_x = hotelling(Wc, dat['W'], alpha=alpha)`
			`cal_tsq_z = hotelling(Lc, dat['L'], alpha=alpha)`

.... 2007-09-20 18:11:37 +02:00			`# Perturbations`
			`Vs = shuffle_1d(b, n_iter, axis=0)`
			`for i, b_shuff in enumerate(Vs):`
Finnished the l-pls fdr 2007-09-21 15:16:40 +02:00			`#print i`
			`time.sleep(.01)`
			`dat = lpls(a, b_shuff,c, aopt, scale='loads', mean_ctr=mean_ctr)`
			`Wi, Li = lpls_jk(a, b_shuff, c, aopt)`
			`pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha)`
			`pert_tsq_z[:,i] = hotelling(Li, dat['L'], alpha=alpha)`
.... 2007-09-20 18:11:37 +02:00
Finnished the l-pls fdr 2007-09-21 15:16:40 +02:00			`return fdr(cal_tsq_z, pert_tsq_z, median), fdr(cal_tsq_x, pert_tsq_x, median)`
.... 2007-09-20 18:11:37 +02:00


			`def fdr(tsq, tsqp, loc_method=median):`
			`n, = tsq.shape`
			`k, m = tsqp.shape`
			`assert(n==k)`
			`n_false = empty((n, m), 'd')`
			`sort_index = argsort(tsq)[::-1]`
			`r_index = argsort(sort_index)`
			`for i in xrange(m):`
			`for j in xrange(n):`
			`n_false[j,i] = (tsqp[:,i]>tsq[j]).sum()`

			`fp = loc_method(n_false,1)`
			`n_signif = (arange(n) + 1.0)[r_index]`
			`fd_rate = fp/n_signif`
			`return fd_rate`