297 lines
8.6 KiB
Python
297 lines
8.6 KiB
Python
""" A collection of some statistical utitlites used.
|
|
"""
|
|
__all__ = ['hotelling', 'lpls_qvals']
|
|
__docformat__ = 'restructuredtext en'
|
|
|
|
from math import sqrt as msqrt
|
|
|
|
from numpy import dot,empty,zeros,eye,median,sign,arange,argsort
|
|
from numpy.linalg import svd,inv,det
|
|
from numpy.random import shuffle
|
|
|
|
from crossvalidation import lpls_jk
|
|
from engines import nipals_lpls as lpls
|
|
|
|
|
|
def hotelling(Pcv, P, p_center='median', cov_center=median,
|
|
alpha=0.3, crot=True, strict=False):
|
|
"""Returns regularized hotelling T^2.
|
|
|
|
Hotelling, is a generalization of Student's t statistic that is
|
|
used in multivariate hypothesis testing. In order to avoid small variance
|
|
samples to become significant this version allows borrowing variance
|
|
from the pooled covariance.
|
|
|
|
*Parameters*:
|
|
|
|
Pcv : {array}
|
|
Crossvalidation segements of paramter
|
|
P : {array}
|
|
Calibration model paramter
|
|
p_center : {'median', 'mean', 'cal_model'}, optional
|
|
Location method for sub-segments
|
|
cov_center : {py_func}, optional
|
|
Location function
|
|
alpha : {float}, optional
|
|
Regularisation towards pooled covariance estimate.
|
|
crot : {boolean}, optional
|
|
Rotate sub-segments toward calibration model.
|
|
strict : {boolean}, optional
|
|
Only rotate 90 degree
|
|
|
|
*Returns*:
|
|
|
|
tsq : {array}
|
|
Hotellings T^2 estimate
|
|
|
|
*Reference*:
|
|
|
|
Gidskehaug et al., A framework for significance analysis of
|
|
gene expression datausing dimension reduction methods, BMC
|
|
bioinformatics, 2007
|
|
|
|
*Notes*
|
|
|
|
The rotational freedom in the solution of bilinear
|
|
models may require that a rotation onto the calibration
|
|
model. One way of doing that is procrustes rotation.
|
|
|
|
"""
|
|
m, n = P.shape
|
|
n_sets, n, amax = Pcv.shape
|
|
T_sq = empty((n,), dtype='d')
|
|
Cov_i = zeros((n, amax, amax), dtype='d')
|
|
|
|
# rotate sub_models to full model
|
|
if crot:
|
|
for i, Pi in enumerate(Pcv):
|
|
Pcv[i] = procrustes(P, Pi, strict=strict)
|
|
|
|
# center of pnull
|
|
if p_center=='median':
|
|
P_ctr = median(Pcv)
|
|
elif p_center=='mean':
|
|
P_ctr = Pcv.mean(0)
|
|
else: # calibration model
|
|
P_ctr = P
|
|
|
|
for i in xrange(n):
|
|
Pi = Pcv[:,i,:] # (n_sets x amax)
|
|
Pi_ctr = P_ctr[i,:] # (1 x amax)
|
|
Pim = (Pi - Pi_ctr)*msqrt(n_sets-1)
|
|
Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim)
|
|
|
|
Cov = cov_center(Cov_i)
|
|
reg_cov = (1. - alpha)*Cov_i + alpha*Cov
|
|
for i in xrange(n):
|
|
Pc = P_ctr[i,:]
|
|
sigma = reg_cov[i]
|
|
T_sq[i] = dot(dot(Pc, inv(sigma)), Pc)
|
|
return T_sq
|
|
|
|
def procrustes(a, b, strict=True, center=False, verbose=False):
|
|
"""Orthogonal rotation of b to a.
|
|
|
|
Procrustes rotation is an orthogonal rotoation of one subspace
|
|
onto another by minimising the squared error.
|
|
|
|
*Parameters*:
|
|
|
|
a : {array}
|
|
Input array
|
|
b : {array}
|
|
Input array
|
|
strict : {boolean}
|
|
Only do flipping and shuffling
|
|
center : {boolean}
|
|
Center before rotation, translate back after
|
|
verbose : {boolean}
|
|
Show sum of squares
|
|
|
|
*Returns*:
|
|
|
|
b_rot : {array}
|
|
B-matrix rotated
|
|
|
|
*Reference*:
|
|
|
|
Schonemann, A generalized solution of the orthogonal Procrustes problem,
|
|
Psychometrika, 1966
|
|
"""
|
|
|
|
if center:
|
|
mn_a = a.mean(0)
|
|
a = a - mn_a
|
|
mn_b = b.mean(0)
|
|
b = b - mn_b
|
|
|
|
u, s, vt = svd(dot(b.T, a))
|
|
Cm = dot(u, vt) # Cm: orthogonal rotation matrix
|
|
if strict:
|
|
Cm = _ensure_strict(Cm)
|
|
b_rot = dot(b, Cm)
|
|
if verbose:
|
|
print Cm.round()
|
|
fit = sum(ravel(b - b_rot)**2)
|
|
print "Error: %.3E" %fit
|
|
if center:
|
|
return mn_b + b_rot
|
|
else:
|
|
return b_rot
|
|
|
|
def _ensure_strict(C, only_flips=True):
|
|
"""Ensure that a rotation matrix does only 90 degree rotations.
|
|
|
|
In multiplication with pcs this allows flips and reordering.
|
|
if only_flips is True there will onlt be flips allowed
|
|
|
|
*Parameters*:
|
|
|
|
C : {array}
|
|
Rotation matrix
|
|
only_flips : {boolean}
|
|
Only accept columns to flip (switch signs)
|
|
|
|
*Returns*:
|
|
|
|
C_rot : {array}
|
|
Restricted rotation matrix
|
|
|
|
*Notes*:
|
|
|
|
This function is not ready for use. Use (only_flips=True)
|
|
|
|
"""
|
|
if only_flips:
|
|
C = eye(Cm.shape[0])*sign(Cm)
|
|
return C
|
|
Cm = zeros(C.shape, dtype='d')
|
|
Cm[abs(C)>.6] = 1.
|
|
if det(Cm)>1:
|
|
raise NotImplementedError
|
|
return Cm*S
|
|
|
|
def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
|
|
sim_method='shuffle',p_center='med', cov_center=median,
|
|
crot=True,strict=False,mean_ctr=[2,0,2], nsets=None):
|
|
|
|
"""Returns qvals for l-pls model by permutation analysis.
|
|
|
|
The response (Y) is randomly permuted, and the number of false positives
|
|
is registered by comparing hotellings T2 statistics of the calibration model.
|
|
|
|
*Parameters*:
|
|
|
|
X : {array}
|
|
Main data matrix (m, n)
|
|
Y : {array}
|
|
External row data (m, l)
|
|
Z : {array}
|
|
External column data (k, n)
|
|
aopt : {integer}
|
|
Optimal number of components
|
|
alpha : {float}, optional
|
|
Parameter to control the amount of influence from Z-matrix.
|
|
0 is none, which returns a pls-solution, 1 is max
|
|
mean_center : {array-like}, optional
|
|
A three element array-like structure with elements in [-1,0,1,2],
|
|
that decides the type of centering used.
|
|
-1 : nothing
|
|
0 : row center
|
|
1 : column center
|
|
2 : double center
|
|
n_iter : {integer}, optional
|
|
Number of permutations
|
|
sim_method : ['shuffle'], optional
|
|
Permutation method
|
|
p_center : {'median', 'mean', 'cal_model'}, optional
|
|
Location method for sub-segments
|
|
cov_center : {py_func}, optional
|
|
Location function
|
|
alpha : {float}, optional
|
|
Regularisation towards pooled covariance estimate.
|
|
crot : {boolean}, optional
|
|
Rotate sub-segmentss toward calibration model.
|
|
strict : {boolean}, optional
|
|
Only rotate 90 degree
|
|
|
|
nsets : {integer}
|
|
Number of crossvalidation segements
|
|
|
|
*Reference*:
|
|
|
|
Gidskehaug et al., A framework for significance analysis of
|
|
gene expression data using dimension reduction methods, BMC
|
|
bioinformatics, 2007
|
|
"""
|
|
|
|
m, n = X.shape
|
|
k, nz = Z.shape
|
|
assert(n==nz)
|
|
try:
|
|
my, l = Y.shape
|
|
except:
|
|
# make Y a column vector
|
|
Y = atleast_2d(Y).T
|
|
my, l = Y.shape
|
|
assert(m==my)
|
|
|
|
pert_tsq_x = zeros((n, n_iter), dtype='d') # (nxvars x n_subsets)
|
|
pert_tsq_z = zeros((k, n_iter), dtype='d') # (nzvars x n_subsets)
|
|
|
|
# Full model
|
|
dat = lpls(X, Y, Z, aopt, scale='loads', mean_ctr=mean_ctr)
|
|
Wc, Lc = lpls_jk(X, Y, Z ,aopt)
|
|
cal_tsq_x = hotelling(Wc, dat['W'], alpha=alpha)
|
|
cal_tsq_z = hotelling(Lc, dat['L'], alpha=alpha)
|
|
|
|
# Perturbations
|
|
index = arange(m)
|
|
for i in range(n_iter):
|
|
indi = index.copy()
|
|
shuffle(indi)
|
|
dat = lpls(X, Y[indi,:], Z, aopt, scale='loads', mean_ctr=mean_ctr)
|
|
Wi, Li = lpls_jk(X, Y[indi,:], Z, aopt, nsets=nsets)
|
|
pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha)
|
|
pert_tsq_z[:,i] = hotelling(Li, dat['L'], alpha=alpha)
|
|
|
|
return _fdr(cal_tsq_z, pert_tsq_z, median), _fdr(cal_tsq_x, pert_tsq_x, median)
|
|
|
|
def _fdr(tsq, tsqp, loc_method=median):
|
|
"""Returns false discovery rate.
|
|
|
|
Fdr is a method used in multiple hypothesis testing to correct for multiple
|
|
comparisons. It controls the expected proportion of incorrectly rejected null
|
|
hypotheses (type I errors) in a list of rejected hypotheses.
|
|
|
|
*Parameters*:
|
|
|
|
tsq : {array}
|
|
Hotellings T2, calibration model
|
|
tsqp : {array}
|
|
Hotellings T2, submodels
|
|
|
|
loc_method : {py_func}
|
|
Location method
|
|
|
|
*Returns*:
|
|
|
|
fdr : {array}
|
|
False discovery rate
|
|
|
|
"""
|
|
n, = tsq.shape
|
|
k, m = tsqp.shape
|
|
assert(n==k)
|
|
n_false = empty((n, m), 'd')
|
|
sort_index = argsort(tsq)[::-1]
|
|
r_index = argsort(sort_index)
|
|
for i in xrange(m):
|
|
for j in xrange(n):
|
|
n_false[j,i] = (tsqp[:,i]>tsq[j]).sum()
|
|
fp = loc_method(n_false.T)
|
|
n_signif = (arange(n) + 1.0)[r_index]
|
|
fd_rate = fp/n_signif
|
|
fd_rate[fd_rate>1] = 1
|
|
return fd_rate
|