pyblm/pyblm/statistics.py
""" A collection of some statistical utitlites used.
"""
__all__ = ['hotelling', 'lpls_qvals', 'pls_qvals']
__docformat__ = 'restructuredtext en'
from math import sqrt as msqrt
from numpy import dot,empty,zeros,eye,median,sign,arange,argsort
from numpy.linalg import svd,inv,det
from numpy.random import shuffle
from crossvalidation import lpls_jk, pls_jk
from engines import pls
from engines import nipals_lpls as lpls
def hotelling(Pcv, P, p_center='median', cov_center='median',
              alpha=0.3, crot=True, strict=False):
    """Returns the regularised Hotelling T^2 statistic.

    Hotelling's T^2 is a generalisation of Student's t statistic used in
    multivariate hypothesis testing. To avoid low-variance variables
    becoming spuriously significant, this version allows borrowing
    variance from the pooled covariance estimate.

    *Parameters*:

        Pcv : {array}
            Crossvalidation segments of the parameter (n_sets, n, amax)
        P : {array}
            Calibration model parameter (n, amax)
        p_center : {'median', 'mean', 'cal_model'}, optional
            Location method for sub-segments
        cov_center : {'median', 'mean', 'cal_model'}, optional
            Pooled covariance estimate
        alpha : {float}, optional
            Regularisation towards the pooled covariance estimate.
        crot : {boolean}, optional
            Rotate sub-segments towards the calibration model.
        strict : {boolean}, optional
            Only rotate by 90 degrees

    *Returns*:

        tsq : {array}
            Hotelling's T^2 estimate

    *Reference*:

        Gidskehaug et al., A framework for significance analysis of
        gene expression data using dimension reduction methods, BMC
        Bioinformatics, 2007

    *Notes*:

        The rotational freedom in the solution of bilinear models may
        require a rotation onto the calibration model before the
        sub-segments are compared. One way of doing that is procrustes
        rotation.
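
    *Example*:

        A rough usage sketch on random data; the shapes are illustrative
        only, and in real use Pcv would hold jack-knifed loadings from a
        cross-validated model:

        >>> from numpy.random import randn
        >>> P = randn(10, 2)                 # calibration loadings (n x amax)
        >>> Pcv = P + 0.1*randn(5, 10, 2)    # 5 cross-validation segments
        >>> tsq = hotelling(Pcv, P, alpha=0.3)   # one T^2 value per variable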
"""
m, n = P.shape
n_sets, n, amax = Pcv.shape
T_sq = empty((n,), dtype='d')
Cov_i = zeros((n, amax, amax), dtype='d')
# rotate sub_models to full model
if crot:
for i, Pi in enumerate(Pcv):
Pcv[i] = procrustes(P, Pi, strict=strict)
# center of pnull
if p_center=='median':
P_ctr = median(Pcv)
elif p_center=='mean':
P_ctr = Pcv.mean(0)
else: # calibration model
P_ctr = P
for i in xrange(n):
Pi = Pcv[:,i,:] # (n_sets x amax)
Pi_ctr = P_ctr[i,:] # (1 x amax)
#Pim = (Pi - Pi_ctr)*msqrt(n_sets-1)
#Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim)
Pim = (Pi - Pi_ctr)
Cov_i[i] = dot(Pim.T, Pim)
if cov_center == 'median':
Cov_p = median(Cov_i)
elif cov_center == 'mean':
Cov_p = Cov.mean(0)
else:
print "Pooled covariance est. invalid, using median"
print cov_center
Cov_p = median(Cov_i)
reg_cov = (1. - alpha)*Cov_i + alpha*Cov_p
for i in xrange(n):
Pc = P_ctr[i,:]
sigma = reg_cov[i]
T_sq[i] = dot(dot(Pc, inv(sigma)), Pc)
return T_sq
def procrustes(a, b, strict=True, center=False, force_norm=False, verbose=False):
    """Orthogonal rotation of b onto a.

    Procrustes rotation is an orthogonal rotation of one subspace onto
    another, obtained by minimising the squared error.

    *Parameters*:

        a : {array}
            Input array, stationary
        b : {array}
            Input array, rotated to best fit a
        strict : {boolean}
            Only allow flipping and shuffling of columns
        center : {boolean}
            Center before rotation, translate back afterwards
        force_norm : {boolean}
            Normalise the columns of a and b before rotation
        verbose : {boolean}
            Print the residual sum of squares

    *Returns*:

        b_rot : {array}
            The rotated b matrix

    *Reference*:

        Schonemann, A generalized solution of the orthogonal Procrustes
        problem, Psychometrika, 1966
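
    *Example*:

        A small sketch on synthetic data: b is an exact rotation of a, and
        the unconstrained (strict=False) procrustes rotation recovers the
        original orientation:

        >>> from numpy import allclose
        >>> from numpy.random import randn
        >>> a = randn(20, 2)
        >>> b = dot(a, [[0., -1.], [1., 0.]])   # rotate a by 90 degrees
        >>> b_rot = procrustes(a, b, strict=False)
        >>> allclose(a, b_rot)
        True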
"""
if center:
mn_a = a.mean(0)
a = a - mn_a
mn_b = b.mean(0)
b = b - mn_b
if force_norm:
a_norm = apply_along_axis(norm, 0, a)
a = a/a_norm
b_norm = apply_along_axis(norm, 0, b)
b = b/b_norm
u, s, vt = svd(dot(b.T, a))
Cm = dot(u, vt) # Cm: orthogonal rotation matrix
if strict:
Cm = _ensure_strict(Cm)
b_rot = dot(b, Cm)
if verbose:
fit = ((b - b_rot)**2).sum()
fit2 = (dot(a, a.T) + dot(b, b.T) - 2*diag(s)).trace()
print "Error: %.2E , %.2E" %(fit, fit2)
if center:
return mn_b + b_rot
else:
return b_rot
def _ensure_strict(C, only_flips=True):
    """Ensure that a rotation matrix does only 90 degree rotations.

    In multiplication with pcs this allows flips and reordering. If
    only_flips is True, only sign flips of the columns are allowed.

    *Parameters*:

        C : {array}
            Rotation matrix
        only_flips : {boolean}
            Only accept column flips (sign switches)

    *Returns*:

        C_rot : {array}
            Restricted rotation matrix

    *Notes*:

        This function is not ready for general use. Use only_flips=True.
        That is, for more than two components the rotation matrix has a
        tendency to become unstable (det(Cm)>1) when rounding is used.
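
    *Example*:

        A small sketch of the default only_flips behaviour: a rotation
        matrix that is close to a pure sign flip is snapped to the nearest
        flip matrix (off-diagonals zeroed, diagonal signs kept):

        >>> from numpy import array
        >>> C = array([[-0.98, 0.2], [-0.2, -0.98]])
        >>> C_flip = _ensure_strict(C)   # approx. diag(-1., -1.)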
"""
if only_flips:
C = eye(C.shape[0])*sign(C)
return C
Cm = zeros(C.shape, dtype='d')
Cm[abs(C)>.6] = 1.
if det(Cm)>1:
raise NotImplementedError
return Cm*sign(C)
def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
               sim_method='shuffle', p_center='med', cov_center=median,
               crot=True, strict=False, center_axis=[2,0,2], nsets=None,
               zorth=False):
    """Returns q-values for an l-pls model by permutation analysis.

    The response (Y) is randomly permuted, and the number of false
    positives is registered by comparison with the Hotelling T^2
    statistics of the calibration model.

    *Parameters*:

        X : {array}
            Main data matrix (m, n)
        Y : {array}
            External row data (m, l)
        Z : {array}
            External column data (k, n)
        aopt : {integer}
            Optimal number of components
        alpha : {float}, optional
            Regularisation towards the pooled covariance estimate.
        zx_alpha : {float}, optional
            Parameter to control the amount of influence from the Z-matrix.
            0 is none, which returns a pls solution, 1 is max
        n_iter : {integer}, optional
            Number of permutations
        sim_method : ['shuffle'], optional
            Permutation method
        p_center : {'median', 'mean', 'cal_model'}, optional
            Location method for sub-segments
        cov_center : {py_func}, optional
            Location function
        crot : {boolean}, optional
            Rotate sub-segments towards the calibration model.
        strict : {boolean}, optional
            Only rotate by 90 degrees
        center_axis : {array-like}, optional
            A three element array-like structure with elements in
            [-1, 0, 1, 2] that decides the type of centering used.

            -1 : nothing
             0 : row center
             1 : column center
             2 : double center
        nsets : {integer}
            Number of crossvalidation segments

    *Reference*:

        Gidskehaug et al., A framework for significance analysis of
        gene expression data using dimension reduction methods, BMC
        Bioinformatics, 2007
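
    *Example*:

        A rough usage sketch on random data; the shapes are illustrative
        only, and it assumes the lpls engine and lpls_jk accept data of
        this form:

        >>> from numpy.random import randn
        >>> X, Y, Z = randn(15, 50), randn(15, 1), randn(8, 50)
        >>> out = lpls_qvals(X, Y, Z, aopt=2, n_iter=10)
        >>> cal_tsq_z, pert_tsq_z, cal_tsq_x, pert_tsq_x = out
        >>> fdr_x = _fdr(cal_tsq_x, pert_tsq_x)   # per-variable false discovery rate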
"""
m, n = X.shape
k, nz = Z.shape
assert(n==nz)
try:
my, l = Y.shape
except:
# make Y a column vector
Y = atleast_2d(Y).T
my, l = Y.shape
assert(m==my)
pert_tsq_x = zeros((n, n_iter), dtype='d') # (nxvars x n_subsets)
pert_tsq_z = zeros((k, n_iter), dtype='d') # (nzvars x n_subsets)
# Full model
dat = lpls(X, Y, Z, aopt, scale='loads', center_axis=center_axis, zorth=zorth)
Wc, Lc = lpls_jk(X, Y, Z ,aopt, zorth=zorth)
cal_tsq_x = hotelling(Wc, dat['W'], alpha=alpha)
cal_tsq_z = hotelling(Lc, dat['L'], alpha=alpha)
print "morn"
# Perturbations
index = arange(m)
for i in range(n_iter):
indi = index.copy()
shuffle(indi)
dat = lpls(X, Y[indi,:], Z, aopt, scale='loads', center_axis=center_axis, zorth=zorth)
Wi, Li = lpls_jk(X, Y[indi,:], Z, aopt, nsets=nsets)
pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha)
# no reason to borrow variance in dag (alpha ->some small value)
pert_tsq_z[:,i] = hotelling(Li, dat['L'], alpha=0.01)
return cal_tsq_z, pert_tsq_z, cal_tsq_x, pert_tsq_x
def pls_qvals(X, Y, aopt, alpha=.3, n_iter=20, p_center='med', cov_center=median,
              crot=True, strict=False, center_axis=[0,0], nsets=None, zorth=False):
    """Returns q-values for a pls model by permutation analysis.

    The response (Y) is randomly permuted, and the number of false
    positives is registered by comparison with the Hotelling T^2
    statistics of the calibration model.

    *Parameters*:

        X : {array}
            Main data matrix (m, n)
        Y : {array}
            External row data (m, l)
        aopt : {integer}
            Optimal number of components
        alpha : {float}, optional
            Regularisation towards the pooled covariance estimate.
        n_iter : {integer}, optional
            Number of permutations
        p_center : {'median', 'mean', 'cal_model'}, optional
            Location method for sub-segments
        cov_center : {py_func}, optional
            Location function
        crot : {boolean}, optional
            Rotate sub-segments towards the calibration model.
        strict : {boolean}, optional
            Only rotate by 90 degrees
        center_axis : {array-like}, optional
            A two element array-like structure with elements in
            [-1, 0, 1, 2] that decides the type of centering used.

            -1 : nothing
             0 : row center
             1 : column center
             2 : double center
        nsets : {integer}
            Number of crossvalidation segments

    *Reference*:

        Gidskehaug et al., A framework for significance analysis of
        gene expression data using dimension reduction methods, BMC
        Bioinformatics, 2007
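
    *Example*:

        A rough usage sketch on random data; the shapes are illustrative
        only, and it assumes the pls engine and pls_jk accept data of
        this form:

        >>> from numpy.random import randn
        >>> X, Y = randn(20, 100), randn(20, 1)
        >>> cal_tsq, pert_tsq = pls_qvals(X, Y, aopt=3, n_iter=10)
        >>> fdr = _fdr(cal_tsq, pert_tsq)   # per-variable false discovery rate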
"""
m, n = X.shape
try:
my, l = Y.shape
except:
# make Y a column vector
Y = atleast_2d(Y).T
my, l = Y.shape
assert(m==my)
# Full model
dat = pls(X, Y, aopt, scale='loads')
# Submodels
Wc = pls_jk(X, Y , aopt)
cal_tsq_x = hotelling(Wc, dat['W'], alpha=alpha)
# Perturbations
pert_tsq_x = zeros((n, n_iter), dtype='d') # (nxvars x n_subsets)
index = arange(m)
for i in range(n_iter):
indi = index.copy()
shuffle(indi)
dat = pls(X, Y[indi,:], aopt, scale='loads', center_axis=center_axis)
Wi = pls_jk(X, Y[indi,:], aopt, nsets=nsets, center_axis=center_axis)
pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha)
return cal_tsq_x, pert_tsq_x
def _fdr(tsq, tsqp, loc_method=median):
    """Returns the false discovery rate.

    FDR is a method used in multiple hypothesis testing to correct for
    multiple comparisons. It controls the expected proportion of
    incorrectly rejected null hypotheses (type I errors) in a list of
    rejected hypotheses.

    *Parameters*:

        tsq : {array}
            Hotelling's T^2, calibration model
        tsqp : {array}
            Hotelling's T^2, submodels
        loc_method : {py_func}
            Location method

    *Returns*:

        fdr : {array}
            False discovery rate

    *Notes*:

        This is an internal function for use in fdr estimation of
        jack-knifed, perturbed blm parameters.

    *Reference*:

        Gidskehaug et al., A framework for significance analysis of
        gene expression data using dimension reduction methods, BMC
        Bioinformatics, 2007
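
    *Example*:

        A small worked sketch with three variables and two permutations,
        using the default median as location method:

        >>> from numpy import array
        >>> tsq = array([9., 1., 4.])                     # calibration T^2
        >>> tsqp = array([[2., 5.], [2., 0.], [3., 1.]])  # permuted T^2 (3 vars x 2 perms)
        >>> fdr = _fdr(tsq, tsqp)   # approx. [0., 0.67, 0.25]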
"""
n = tsq.shape[0]
k, m = tsqp.shape
assert(n==k)
n_false = empty((n, m), 'd')
sort_index = argsort(tsq)[::-1]
r_index = argsort(sort_index)
for i in xrange(m):
for j in xrange(n):
n_false[j,i] = (tsqp[:,i]>tsq[j]).sum()
fp = loc_method(n_false.T)
n_signif = (arange(n) + 1.0)[r_index]
fd_rate = fp/n_signif
fd_rate[fd_rate>1] = 1
return fd_rate