439 lines
14 KiB
Python
439 lines
14 KiB
Python
import time
|
|
import cPickle
|
|
|
|
from scipy import zeros,zeros_like,sqrt,dot,trace,sign,round_,argmax,\
|
|
sort,ravel,newaxis,asarray,diag,sum,outer,argsort,arange,ones_like,\
|
|
all,apply_along_axis,eye,atleast_2d,empty
|
|
from scipy.linalg import svd,inv,norm,det,sqrtm
|
|
from scipy.stats import mean,median
|
|
|
|
#import plots_lpls
|
|
|
|
from cx_utils import mat_center
|
|
from validation import pls_jkW, lpls_jk
|
|
from select_generators import shuffle_1d
|
|
from engines import pca, pls, bridge
|
|
from engines import nipals_lpls as lpls
|
|
|
|
|
|
|
|
def hotelling(Pcv, P, p_center='med', cov_center='med',
|
|
alpha=0.3, crot=True, strict=False):
|
|
"""Returns regularized hotelling T^2.
|
|
|
|
alpha -- regularisation towards pooled cov estimates
|
|
beta -- regularisation for unstable eigenvalues
|
|
p_center -- location method for submodels
|
|
cov_center -- location method for sub coviariances
|
|
alpha -- regularisation
|
|
crot -- rotate submodels toward full?
|
|
strict -- only rotate 90 degree ?
|
|
|
|
"""
|
|
m, n = P.shape
|
|
n_sets, n, amax = Pcv.shape
|
|
# allocate
|
|
T_sq = empty((n, ),dtype='d')
|
|
Cov_i = zeros((n, amax, amax),dtype='d')
|
|
|
|
# rotate sub_models to full model
|
|
if crot:
|
|
for i, Pi in enumerate(Pcv):
|
|
Pcv[i] = procrustes(P, Pi, strict=strict)
|
|
|
|
# center of pnull
|
|
if p_center=='med':
|
|
P_ctr = median(Pcv, 0)
|
|
elif p_center=='mean':
|
|
# fixme: mean is unstable
|
|
P_ctr = mean(Pcv, 0)
|
|
else: #use full
|
|
P_ctr = P
|
|
|
|
for i in xrange(n):
|
|
Pi = Pcv[:,i,:] # (n_sets x amax)
|
|
Pi_ctr = P_ctr[i,:] # (1 x amax)
|
|
Pim = (Pi - Pi_ctr[newaxis])*sqrt(n_sets-1)
|
|
Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim)
|
|
|
|
if cov_center == 'med':
|
|
Cov = median(Cov_i, 0)
|
|
else:
|
|
Cov = mean(Cov_i, 0)
|
|
|
|
reg_cov = (1. - alpha)*Cov_i + alpha*Cov
|
|
for i in xrange(n):
|
|
#Pc = P_ctr[i,:][:,newaxis]
|
|
Pc = P_ctr[i,:]
|
|
sigma = reg_cov[i]
|
|
# T_sq[i] = (dot(Pc, inv(sigma) )*Pc).sum() #slow
|
|
T_sq[i] = dot(dot(Pc, inv(sigma)), Pc) # dont need to care about transposes
|
|
#T_sq[i] = dot(dot(Pc.T, inv(sigma)), Pc).ravel()
|
|
return T_sq
|
|
|
|
def procrustes(A, B, strict=True, center=False, verbose=False):
|
|
"""Rotation of B to A.
|
|
|
|
strict -- Only do flipping and shuffling
|
|
center -- Center before rotation, translate back after
|
|
verbose -- Print ssq
|
|
|
|
No scaling calculated.
|
|
Output B_rot = Rotated B
|
|
"""
|
|
if center:
|
|
A,mn_A = mat_center(A, ret_mn=True)
|
|
B,mn_B = mat_center(B, ret_mn=True)
|
|
u,s,vh = svd(dot(B.T, A))
|
|
v = vh.T
|
|
Cm = dot(u, v.T) #orthogonal rotation matrix
|
|
if strict: # just inverting and flipping
|
|
Cm = ensure_strict(Cm)
|
|
b_rot = dot(B, Cm)
|
|
|
|
if verbose:
|
|
print Cm.round()
|
|
fit = sum(ravel(B - b_rot)**2)
|
|
print "Sum of squares: %s" %fit
|
|
if center:
|
|
return mn_B + b_rot
|
|
else:
|
|
return b_rot
|
|
|
|
def expl_var_x(Xc, T):
|
|
"""Returns explained variance of X.
|
|
T should carry variance in length, Xc has zero col-mean.
|
|
"""
|
|
exp_var_x = diag(dot(T.T, T))*100/(sum(Xc**2))
|
|
return exp_var_x
|
|
|
|
def expl_var_y(Y, T, Q):
|
|
"""Returns explained variance of Y.
|
|
"""
|
|
# centered Y
|
|
exp_var_y = zeros((Q.shape[1], ))
|
|
for a in range(Q.shape[1]):
|
|
Ya = outer(T[:,a], Q[:,a])
|
|
exp_var_y[a] = 100*sum(Ya**2)/sum(Y**2)
|
|
return exp_var_y
|
|
|
|
def pls_qvals(a, b, aopt=None, alpha=.3,
|
|
n_iter=20, algo='pls',
|
|
center=True,
|
|
sim_method='shuffle',
|
|
p_center='med', cov_center='med',
|
|
crot=True, strict=False):
|
|
|
|
"""Returns qvals for pls model.
|
|
|
|
input:
|
|
a -- data matrix
|
|
b -- data matrix
|
|
aopt -- scalar, opt. number of components
|
|
alpha -- [0,1] regularisation parameter for T2-test
|
|
n_iter -- number of permutations
|
|
sim_method -- permutation method ['shuffle']
|
|
p_center -- location estimator for sub models ['med']
|
|
cov_center -- location estimator for covariance of submodels ['med']
|
|
crot -- bool, use rotations of sub models?
|
|
strict -- bool, use stict (rot/flips only) rotations?
|
|
"""
|
|
|
|
m, n = a.shape
|
|
TSQ = zeros((n, n_iter), dtype='d') # (nvars x n_subsets)
|
|
n_false = zeros((n, n_iter), dtype='d')
|
|
|
|
#full model
|
|
if center:
|
|
ac = a - a.mean(0)
|
|
bc = b - b.mean(0)
|
|
|
|
if algo=='bridge':
|
|
dat = bridge(ac, bc, aopt, 'loads', 'fast')
|
|
else:
|
|
dat = pls(ac, bc, aopt, 'loads', 'fast')
|
|
Wcv = pls_jkW(a, b, aopt, n_blocks=None, algo=algo,center=True)
|
|
tsq_full = hotelling(Wcv, dat['W'], p_center=p_center,
|
|
alpha=alpha, crot=crot, strict=strict,
|
|
cov_center=cov_center)
|
|
#t0 = time.time()
|
|
Vs = shuffle_1d(bc, n_iter, axis=0)
|
|
for i, b_shuff in enumerate(Vs):
|
|
#t1 = time.time()
|
|
if algo=='bridge':
|
|
dat = bridge(ac, b_shuff, aopt, 'loads','fast')
|
|
else:
|
|
dat = pls(ac, b_shuff, aopt, 'loads', 'fast')
|
|
Wcv = pls_jkW(a, b_shuff, aopt, n_blocks=None, algo=algo)
|
|
TSQ[:,i] = hotelling(Wcv, dat['W'], p_center=p_center,
|
|
alpha=alpha, crot=crot, strict=strict,
|
|
cov_center=cov_center)
|
|
#print time.time() - t1
|
|
|
|
return fdr(tsq_full, TSQ, median)
|
|
|
|
|
|
def ensure_strict(C, only_flips=True):
|
|
"""Ensure that a rotation matrix does only 90 degree rotations.
|
|
In multiplication with pcs this allows flips and reordering.
|
|
|
|
if only_flips is True there will onlt be flips allowed
|
|
"""
|
|
Cm = C
|
|
S = sign(C) # signs
|
|
if only_flips==True:
|
|
C = eye(Cm.shape[0])*S
|
|
return C
|
|
Cm = zeros_like(C)
|
|
Cm.putmask(1.,abs(C)>.6)
|
|
if det(Cm)>1:
|
|
raise ValueError,"Implement this!"
|
|
return Cm*S
|
|
|
|
def pls_qvals_II(a, b, aopt=None, center=True, alpha=.3,
|
|
n_iter=20, algo='pls',
|
|
sim_method='shuffle',
|
|
p_center='med', cov_center='med',
|
|
crot=True, strict=False):
|
|
|
|
"""Returns qvals for pls model.
|
|
Shuffling of variables in X.
|
|
Null model is 'If I put genes randomly on network' ... if they are sign:
|
|
then this is due to network structure and not covariance with response.
|
|
|
|
input:
|
|
a -- data matrix
|
|
b -- data matrix
|
|
aopt -- scalar, opt. number of components
|
|
alpha -- [0,1] regularisation parameter for T2-test
|
|
n_iter -- number of permutations
|
|
sim_method -- permutation method ['shuffle']
|
|
p_center -- location estimator for sub models ['med']
|
|
cov_center -- location estimator for covariance of submodels ['med']
|
|
crot -- bool, use rotations of sub models?
|
|
strict -- bool, use stict (rot/flips only) rotations?
|
|
"""
|
|
|
|
m, n = a.shape
|
|
TSQ = zeros((n, n_iter), dtype='<f8') # (nvars x n_subsets)
|
|
n_false = zeros((n, n_iter), dtype='<f8')
|
|
|
|
#full model
|
|
|
|
# center?
|
|
if center==True:
|
|
ac = a - a.mean(0)
|
|
bc = b - b.mean(0)
|
|
|
|
if algo=='bridge':
|
|
dat = bridge(ac, bc, aopt, 'loads', 'fast')
|
|
else:
|
|
dat = pls(ac, bc, aopt, 'loads', 'fast')
|
|
Wcv = pls_jkW(a, b, aopt, n_blocks=None, algo=algo)
|
|
tsq_full = hotelling(Wcv, dat['W'], p_center=p_center,
|
|
alpha=alpha, crot=crot, strict=strict,
|
|
cov_center=cov_center)
|
|
t0 = time.time()
|
|
Vs = shuffle_1d(a, n_iter, 1)
|
|
for i, a_shuff in enumerate(Vs):
|
|
t1 = time.time()
|
|
a = a_shuff - a_shuff.mean(0)
|
|
|
|
if algo=='bridge':
|
|
dat = bridge(a, b, aopt, 'loads','fast')
|
|
else:
|
|
dat = pls(a, b, aopt, 'loads', 'fast')
|
|
Wcv = pls_jkW(a, b, aopt, n_blocks=None, algo=algo)
|
|
TSQ[:,i] = hotelling(Wcv, dat['W'], p_center=p_center,
|
|
alpha=alpha, crot=crot, strict=strict,
|
|
cov_center=cov_center)
|
|
print time.time() - t1
|
|
sort_index = argsort(tsq_full)[::-1]
|
|
back_sort_index = sort_index.argsort()
|
|
print time.time() - t0
|
|
|
|
# count false positives
|
|
tsq_full_sorted = tsq_full.take(sort_index)
|
|
for i in xrange(n_iter):
|
|
for j in xrange(n):
|
|
n_false[j,i] = sum(TSQ[:,i]>=tsq_full[j])
|
|
false_pos = median(n_false, 1)
|
|
ll = arange(1, len(false_pos)+1, 1)
|
|
sort_qval = false_pos.take(sort_index)/ll
|
|
qval = false_pos/ll.take(back_sort_index)
|
|
print time.time() - t0
|
|
#return qval, false_pos, TSQ, tsq_full
|
|
|
|
return qval
|
|
|
|
def leverage(aopt=1,*args):
|
|
"""Returns leverages
|
|
input : aopt, number of components to base leverage calculations on
|
|
*args, matrices of normed blm-paramters
|
|
output: leverages
|
|
|
|
For PCA typical inputs are normalised T or normalised P
|
|
For PLSR typical inputs are normalised T or normalised W
|
|
"""
|
|
if aopt<1:
|
|
raise ValueError,"Leverages only make sense for aopt>0"
|
|
lev = []
|
|
for u in args:
|
|
lev_u = 1./u.shape[0] + dot(u[:,:aopt], u[:,:aopt].T).diagonal()
|
|
lev.append(lev_u)
|
|
return lev
|
|
|
|
def variances(a, t, p):
|
|
"""Returns explained variance and ind. var from blm-params.
|
|
input:
|
|
a -- full centered matrix
|
|
t,p -- parameters from a bilinear approx of the above matrix.
|
|
output:
|
|
var -- variance of each component
|
|
var_exp -- cumulative explained variance in percentage
|
|
|
|
Typical inputs are: X(centered),T,P for PCA or
|
|
X(centered),T,P / Y(centered),T,Q for PLSR.
|
|
"""
|
|
|
|
tot_var = sum(a**2)
|
|
var = 100*(sum(p**2, 0)*sum(t**2, 0))/tot_var
|
|
var_exp = var.cumsum()
|
|
return var, var_exp
|
|
|
|
def residual_diagnostics(Y, Yhat, aopt=1):
|
|
"""Root mean errors and press values.
|
|
R2 vals
|
|
"""
|
|
pass
|
|
|
|
|
|
def ssq(E, axis=0, weights=None):
|
|
"""Sum of squares, supports weights."""
|
|
n = E.shape[axis]
|
|
if weights==None:
|
|
weights = eye(n)
|
|
else:
|
|
weigths = diag(weigths)
|
|
if axis==0:
|
|
Ew = dot(weights, E)
|
|
elif axis==1:
|
|
Ew = dot(E, weights)
|
|
else:
|
|
raise NotImplementedError, "Higher order modes not supported"
|
|
return pow(Ew,2).sum(axis)
|
|
|
|
|
|
def vnorm(x):
|
|
"""Returns the euclidian norm of a vector.
|
|
|
|
This is considerably faster than linalg.norm
|
|
"""
|
|
return sqrt(dot(x,x.conj()))
|
|
|
|
def mahalanobis(a, loc=None, acov=None, invcov=None):
|
|
"""Returns the distance of each observation in a
|
|
from the location estimate (loc) of the data,
|
|
relative to the shape of the data.
|
|
|
|
|
|
a : data matrix (n observations in rows, p variables in columns)
|
|
loc : location estimate of the data (p-dimensional vector)
|
|
covmat or invcov : scatter estimate of the data or the inverse of the scatter estimate (pxp matrix)
|
|
|
|
:Returns:
|
|
A vector containing the distances of all the observations to locvct.
|
|
|
|
"""
|
|
n, p = a.shape
|
|
if loc==None:
|
|
loc = a.mean(0)
|
|
loc = atleast_2d(loc)
|
|
if loc.shape[1]==1:
|
|
loc = loc.T; #ensure rowvector
|
|
assert(loc.shape[1]==p)
|
|
xc = a - loc
|
|
if acov==None and invcov==None:
|
|
acov = dot(xc.T, xc)
|
|
|
|
if invcov != None:
|
|
covmat = atleast_2d(invcov)
|
|
if min(covmat.shape)==1:
|
|
covmat = diag(invcov.ravel())
|
|
else:
|
|
covmat = atleast_2d(acov)
|
|
if min(covmat.shape)==1:
|
|
covmat = diag(covmat.ravel())
|
|
covmat = inv(covmat)
|
|
# mdist = diag(dot(dot(xc, covmat),xc.T))
|
|
mdist = (dot(xc, covmat)*xc).sum(1)
|
|
return mdist
|
|
|
|
def lpls_qvals(a, b, c, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
|
|
sim_method='shuffle',p_center='med', cov_center='med',crot=True,
|
|
strict=False, mean_ctr=[2,0,2], nsets=None):
|
|
|
|
"""Returns qvals for l-pls model.
|
|
|
|
input:
|
|
a -- data matrix
|
|
b -- data matrix
|
|
c -- data matrix
|
|
aopt -- scalar, opt. number of components
|
|
alpha -- [0,1] regularisation parameter for T2-test
|
|
xz_alpha -- [0,1] how much z info to include
|
|
n_iter -- number of permutations
|
|
sim_method -- permutation method ['shuffle']
|
|
p_center -- location estimator for sub models ['med']
|
|
cov_center -- location estimator for covariance of submodels ['med']
|
|
crot -- bool, use rotations of sub models?
|
|
strict -- bool, use stict (rot/flips only) rotations?
|
|
"""
|
|
|
|
m, n = a.shape
|
|
p, k = c.shape
|
|
pert_tsq_x = zeros((n, n_iter), dtype='d') # (nxvars x n_subsets)
|
|
pert_tsq_z = zeros((p, n_iter), dtype='d') # (nzvars x n_subsets)
|
|
|
|
# Full model
|
|
#print "Full model start"
|
|
dat = lpls(a, b, c, aopt, scale='loads', mean_ctr=mean_ctr)
|
|
Wc, Lc = lpls_jk(a, b, c , aopt, nsets=nsets)
|
|
#print "Full hot"
|
|
cal_tsq_x = hotelling(Wc, dat['W'], alpha = alpha)
|
|
cal_tsq_z = hotelling(Lc, dat['L'], alpha = 0)
|
|
|
|
# Perturbations
|
|
Vs = shuffle_1d(b, n_iter, axis=0)
|
|
for i, b_shuff in enumerate(Vs):
|
|
print i
|
|
dat = lpls(a, b_shuff,c, aopt, scale='loads', mean_ctr=mean_ctr)
|
|
Wi, Li = lpls_jk(a, b_shuff, c, aopt, nsets=nsets)
|
|
pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha)
|
|
pert_tsq_z[:,i] = hotelling(Li, dat['L'], alpha=alpha)
|
|
|
|
return cal_tsq_z, pert_tsq_z, cal_tsq_x, pert_tsq_x
|
|
|
|
|
|
|
|
def fdr(tsq, tsqp, loc_method='mean'):
|
|
n, = tsq.shape
|
|
k, m = tsqp.shape
|
|
assert(n==k)
|
|
n_false = empty((n, m), 'd')
|
|
sort_index = argsort(tsq)[::-1]
|
|
r_index = argsort(sort_index)
|
|
for i in xrange(m):
|
|
for j in xrange(n):
|
|
n_false[j,i] = (tsqp[:,i]>tsq[j]).sum()
|
|
#cPickle.dump(n_false, open("/tmp/nfalse.dat_"+str(n), "w"))
|
|
if loc_method=='mean':
|
|
fp = mean(n_false,1)
|
|
elif loc_method == 'median':
|
|
fp = median(n_false.T)
|
|
else:
|
|
raise ValueError
|
|
n_signif = (arange(n) + 1.0)[r_index]
|
|
fd_rate = fp/n_signif
|
|
return fd_rate
|