""" A collection of some statistical utitlites used. """ __all__ = ['hotelling', 'lpls_qvals'] __docformat__ = 'restructuredtext en' from math import sqrt as msqrt from numpy import dot,empty,zeros,eye,median,sign,arange,argsort from numpy.linalg import svd,inv,det from numpy.random import shuffle from crossvalidation import lpls_jk from engines import nipals_lpls as lpls def hotelling(Pcv, P, p_center='median', cov_center=median, alpha=0.3, crot=True, strict=False): """Returns regularized hotelling T^2. Hotelling, is a generalization of Student's t statistic that is used in multivariate hypothesis testing. In order to avoid small variance samples to become significant this version allows borrowing variance from the pooled covariance. *Parameters*: Pcv : {array} Crossvalidation segements of paramter P : {array} Calibration model paramter p_center : {'median', 'mean', 'cal_model'}, optional Location method for sub-segments cov_center : {py_func}, optional Location function alpha : {float}, optional Regularisation towards pooled covariance estimate. crot : {boolean}, optional Rotate sub-segments toward calibration model. strict : {boolean}, optional Only rotate 90 degree *Returns*: tsq : {array} Hotellings T^2 estimate *Reference*: Gidskehaug et al., A framework for significance analysis of gene expression datausing dimension reduction methods, BMC bioinformatics, 2007 *Notes* The rotational freedom in the solution of bilinear models may require that a rotation onto the calibration model. One way of doing that is procrustes rotation. """ m, n = P.shape n_sets, n, amax = Pcv.shape T_sq = empty((n,), dtype='d') Cov_i = zeros((n, amax, amax), dtype='d') # rotate sub_models to full model if crot: for i, Pi in enumerate(Pcv): Pcv[i] = procrustes(P, Pi, strict=strict) # center of pnull if p_center=='median': P_ctr = median(Pcv) elif p_center=='mean': P_ctr = Pcv.mean(0) else: # calibration model P_ctr = P for i in xrange(n): Pi = Pcv[:,i,:] # (n_sets x amax) Pi_ctr = P_ctr[i,:] # (1 x amax) Pim = (Pi - Pi_ctr)*msqrt(n_sets-1) Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim) Cov = cov_center(Cov_i) reg_cov = (1. - alpha)*Cov_i + alpha*Cov for i in xrange(n): Pc = P_ctr[i,:] sigma = reg_cov[i] T_sq[i] = dot(dot(Pc, inv(sigma)), Pc) return T_sq def procrustes(a, b, strict=True, center=False, verbose=False): """Orthogonal rotation of b to a. Procrustes rotation is an orthogonal rotoation of one subspace onto another by minimising the squared error. *Parameters*: a : {array} Input array b : {array} Input array strict : {boolean} Only do flipping and shuffling center : {boolean} Center before rotation, translate back after verbose : {boolean} Show sum of squares *Returns*: b_rot : {array} B-matrix rotated *Reference*: Schonemann, A generalized solution of the orthogonal Procrustes problem, Psychometrika, 1966 """ if center: mn_a = a.mean(0) a = a - mn_a mn_b = b.mean(0) b = b - mn_b u, s, vt = svd(dot(b.T, a)) Cm = dot(u, vt) # Cm: orthogonal rotation matrix if strict: Cm = _ensure_strict(Cm) b_rot = dot(b, Cm) if verbose: print Cm.round() fit = sum(ravel(b - b_rot)**2) print "Error: %.3E" %fit if center: return mn_b + b_rot else: return b_rot def _ensure_strict(C, only_flips=True): """Ensure that a rotation matrix does only 90 degree rotations. In multiplication with pcs this allows flips and reordering. if only_flips is True there will onlt be flips allowed *Parameters*: C : {array} Rotation matrix only_flips : {boolean} Only accept columns to flip (switch signs) *Returns*: C_rot : {array} Restricted rotation matrix *Notes*: This function is not ready for use. Use (only_flips=True) """ if only_flips: C = eye(Cm.shape[0])*sign(Cm) return C Cm = zeros(C.shape, dtype='d') Cm[abs(C)>.6] = 1. if det(Cm)>1: raise NotImplementedError return Cm*S def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20, sim_method='shuffle',p_center='med', cov_center=median, crot=True,strict=False,mean_ctr=[2,0,2], nsets=None): """Returns qvals for l-pls model by permutation analysis. The response (Y) is randomly permuted, and the number of false positives is registered by comparing hotellings T2 statistics of the calibration model. *Parameters*: X : {array} Main data matrix (m, n) Y : {array} External row data (m, l) Z : {array} External column data (k, n) aopt : {integer} Optimal number of components alpha : {float}, optional Parameter to control the amount of influence from Z-matrix. 0 is none, which returns a pls-solution, 1 is max mean_center : {array-like}, optional A three element array-like structure with elements in [-1,0,1,2], that decides the type of centering used. -1 : nothing 0 : row center 1 : column center 2 : double center n_iter : {integer}, optional Number of permutations sim_method : ['shuffle'], optional Permutation method p_center : {'median', 'mean', 'cal_model'}, optional Location method for sub-segments cov_center : {py_func}, optional Location function alpha : {float}, optional Regularisation towards pooled covariance estimate. crot : {boolean}, optional Rotate sub-segmentss toward calibration model. strict : {boolean}, optional Only rotate 90 degree nsets : {integer} Number of crossvalidation segements *Reference*: Gidskehaug et al., A framework for significance analysis of gene expression data using dimension reduction methods, BMC bioinformatics, 2007 """ m, n = X.shape k, nz = Z.shape assert(n==nz) try: my, l = Y.shape except: # make Y a column vector Y = atleast_2d(Y).T my, l = Y.shape assert(m==my) pert_tsq_x = zeros((n, n_iter), dtype='d') # (nxvars x n_subsets) pert_tsq_z = zeros((k, n_iter), dtype='d') # (nzvars x n_subsets) # Full model dat = lpls(X, Y, Z, aopt, scale='loads', mean_ctr=mean_ctr) Wc, Lc = lpls_jk(X, Y, Z ,aopt) cal_tsq_x = hotelling(Wc, dat['W'], alpha=alpha) cal_tsq_z = hotelling(Lc, dat['L'], alpha=alpha) # Perturbations index = arange(m) for i in range(n_iter): indi = index.copy() shuffle(indi) dat = lpls(X, Y[indi,:], Z, aopt, scale='loads', mean_ctr=mean_ctr) Wi, Li = lpls_jk(X, Y[indi,:], Z, aopt, nsets=nsets) pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha) pert_tsq_z[:,i] = hotelling(Li, dat['L'], alpha=alpha) return _fdr(cal_tsq_z, pert_tsq_z, median), _fdr(cal_tsq_x, pert_tsq_x, median) def _fdr(tsq, tsqp, loc_method=median): """Returns false discovery rate. Fdr is a method used in multiple hypothesis testing to correct for multiple comparisons. It controls the expected proportion of incorrectly rejected null hypotheses (type I errors) in a list of rejected hypotheses. *Parameters*: tsq : {array} Hotellings T2, calibration model tsqp : {array} Hotellings T2, submodels loc_method : {py_func} Location method *Returns*: fdr : {array} False discovery rate """ n, = tsq.shape k, m = tsqp.shape assert(n==k) n_false = empty((n, m), 'd') sort_index = argsort(tsq)[::-1] r_index = argsort(sort_index) for i in xrange(m): for j in xrange(n): n_false[j,i] = (tsqp[:,i]>tsq[j]).sum() fp = loc_method(n_false.T) n_signif = (arange(n) + 1.0)[r_index] fd_rate = fp/n_signif fd_rate[fd_rate>1] = 1 return fd_rate