Fixed conflicts

Arnar Flatberg 2007-12-14 00:16:31 +00:00
parent 1103245d85
commit 253305b602
5 changed files with 177 additions and 164 deletions


@@ -13,4 +13,3 @@ def test(level=1, verbosity=1):
print 'Python version %s' % (sys.version.replace('\n', '',),)
from numpy.testing import NumpyTest
return NumpyTest().test(level, verbosity)


@@ -155,7 +155,7 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=F
validation scheme.
*Parameters*:
X : {array}
Main data matrix (m, n)
Y : {array}
@@ -180,9 +180,9 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=F
If true, require orthogonal latent components in Z.
verbose : {boolean}, optional
Verbosity of console output. For use in debugging.
*Returns*:
rmsep : {array}
Root mean squared error of prediction
yhat : {array}
@@ -191,19 +191,19 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=F
Estimated value of optimal number of components
"""
m, n = X.shape
k, l = Y.shape
o, p = Z.shape
assert m == k, "X (%d,%d) - Y (%d,%d) dim mismatch" %(m, n, k, l)
assert n == p, "X (%d,%d) - Z (%d,%d) dim mismatch" %(m, n, o, p)
if nsets == None:
nsets = m
if nsets > X.shape[0]:
print "nsets (%d) is larger than the number of samples (%d).\nnsets: %d -> %d" %(nsets, m, nsets, m)
nsets = m
assert (alpha >= 0 and alpha<=1), "Alpha needs to be within [0,1], got: %.2f" %alpha
Yhat = empty((a_max, k, l), 'd')
for cal, val in cv(k, nsets):
# do the training model
@@ -217,10 +217,16 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=F
# predictions
for a in range(a_max):
Yhat[a,val,:] = atleast_2d(ym + dot(xi, dat['B'][a]))
# todo: need a better support for classification error
y_is_class = Y.dtype.char.lower() in ['i','p', 'b', 'h','?']
if y_is_class:
pass
#Yhat, err = class_error(Yhat, Y)
#return Yhat, err
sep = (Y - Yhat)**2
rmsep = sqrt(sep.mean(1)).T
#aopt = find_aopt_from_sep(rmsep)
# todo: need a better support for classification error
error = prediction_error(Yhat, Y, method='1/2')
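
The cross-validated error above reduces to a root mean squared error of prediction per component. A minimal numpy illustration with made-up shapes (only Yhat, Y and the (a_max, k, l) layout follow the code above; everything else is an assumption, not part of the module):

import numpy as np

a_max, k, l = 3, 20, 2
Yhat = np.random.rand(a_max, k, l)   # cross-validated predictions per component
Y = np.random.rand(k, l)             # observed responses
sep = (Y - Yhat)**2                  # Y broadcasts over the component axis
rmsep = np.sqrt(sep.mean(1)).T       # (l, a_max): one error curve per response
print(rmsep.shape)                   # -> (2, 3)
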
@@ -228,7 +234,7 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=F
def pca_jk(a, aopt, nsets=None, center_axis=[0], method='cv'):
"""Returns jack-knife segments from PCA.
*Parameters*:
a : {array}
@@ -252,9 +258,9 @@ def pca_jk(a, aopt, nsets=None, center_axis=[0], method='cv'):
Loadings collected in a three way matrix (n_segments, m, aopt)
*Notes*:
- Crossvalidation method is currently set to random blocks of samples.
"""
m, n = a.shape
if nsets == None:
@@ -278,14 +284,14 @@ def pca_jk(a, aopt, nsets=None, center_axis=[0], method='cv'):
Pcv[i,:,:] = pca(a[cal,:], aopt, mode='fast', scale='loads', center_axis = center_axis)['P']
else:
raise NotImplementedError(method)
return Pcv
def pls_jk(X, Y, a_opt, nsets=None, center_axis=[0,0], verbose=False):
""" Returns jack-knife segements of W. """ Returns jack-knife segements of W.
*Parameters*: *Parameters*:
X : {array} X : {array}
Main data matrix (m, n) Main data matrix (m, n)
Y : {array} Y : {array}
@ -294,7 +300,7 @@ def pls_jk(X, Y, a_opt, nsets=None, center_axis=[0,0], verbose=False):
The number of components to calculate (0, min(m,n)) The number of components to calculate (0, min(m,n))
nsets : (integer), optional nsets : (integer), optional
Number of jack-knife segments Number of jack-knife segments
center_axis : {boolean}, optional center_axis : {boolean}, optional
- -1 : nothing - -1 : nothing
- 0 : row center - 0 : row center
@ -302,12 +308,12 @@ def pls_jk(X, Y, a_opt, nsets=None, center_axis=[0,0], verbose=False):
- 2 : double center - 2 : double center
verbose : {boolean}, optional verbose : {boolean}, optional
Verbosity of console output. For use in debugging. Verbosity of console output. For use in debugging.
*Returns*: *Returns*:
Wcv : {array} Wcv : {array}
Loading-weights jack-knife segements Loading-weights jack-knife segements
""" """
m, n = X.shape m, n = X.shape
k, l = Y.shape k, l = Y.shape
@ -320,7 +326,7 @@ def pls_jk(X, Y, a_opt, nsets=None, center_axis=[0,0], verbose=False):
print "Segment number: %d" %i print "Segment number: %d" %i
dat = pls(X[cal,:], Y[cal,:], a_opt, scale='loads', mode='fast', center_axis=center_axis) dat = pls(X[cal,:], Y[cal,:], a_opt, scale='loads', mode='fast', center_axis=center_axis)
Wcv[i,:,:] = dat['W'] Wcv[i,:,:] = dat['W']
return Wcv return Wcv
def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, center_axis=[2,0,2], zorth=False, verbose=False):
@@ -332,10 +338,10 @@ def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, center_axis=[2,0,2], zorth=
infer the parameter confidence in the model.
The segments returned are the X-block weights and Z-block weights.
*Parameters*:
X : {array}
Main data matrix (m, n)
Y : {array}
@@ -358,15 +364,15 @@ def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, center_axis=[2,0,2], zorth=
2 : double center
verbose : {boolean}, optional
Verbosity of console output. For use in debugging.
*Returns*:
Wx : {array}
X-block jack-knife segments
Wz : {array}
Z-block jack-knife segments
"""
m, n = X.shape
k, l = Y.shape
o, p = Z.shape
@@ -388,7 +394,7 @@ def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, center_axis=[2,0,2], zorth=
def find_aopt_from_sep(err, method='vanilla'):
"""Returns an estimate of optimal number of components.
The estimate is based on the error of prediction from
crossvalidation. This is pretty much wild guessing and it is
recommended to inspect model parameters and prediction errors
@@ -406,7 +412,7 @@ def find_aopt_from_sep(err, method='vanilla'):
aopt : {integer}
A guess on the optimal number of components
"""
if method == 'vanilla':
# min rmsep
rmsecv = sqrt(err.mean(0))
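
The 'vanilla' rule above simply takes the component count that minimises RMSECV. A hedged sketch (the (n_segments, a_max) layout of squared errors and the helper name are assumptions):

import numpy as np

def aopt_vanilla(err):
    # err: (n_segments, a_max) squared prediction errors from crossvalidation
    rmsecv = np.sqrt(err.mean(0))
    return int(rmsecv.argmin()) + 1   # components counted from 1

print(aopt_vanilla(np.array([[9., 4., 2.5, 2.6],
                             [8., 3.5, 2.4, 2.8]])))   # -> 3
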
@@ -434,7 +440,7 @@ def cv(N, K, randomise=True, sequential=False):
of length ~N/K, *without* replacement.
*Parameters*:
N : {integer}
Total number of samples
K : {integer}
@@ -443,7 +449,7 @@ def cv(N, K, randomise=True, sequential=False):
Use random sampling
sequential : {boolean}
Use sequential sampling
*Returns*:
training : {array-like}
@@ -456,12 +462,12 @@ def cv(N, K, randomise=True, sequential=False):
If randomise is true, a copy of index is shuffled before partitioning,
otherwise its order is preserved in training and validation.
Randomise overrides the sequential argument. If randomise is true,
sequential is False.
If sequential is true the index is partitioned in continuous blocks,
otherwise interleaved ordering is used.
"""
if K > N:
raise ValueError, "You cannot divide a list of %d samples into more than %d segments. You tried: %s" %(N, N, K)
@@ -510,6 +516,8 @@ def diag_cv(shape, nsets=9, randomise=True):
except:
raise ValueError("shape needs to be a two-tuple")
if nsets>m or nsets>n:
msg = "You may not use more subsets than max(n_rows, n_cols)"
raise ValueError, msg
msg = "You may not use more subsets than max(n_rows, n_cols)"
nsets = min(m, n)
nm = n*m
@@ -524,7 +532,20 @@ def diag_cv(shape, nsets=9, randomise=True):
validation.update(ind)
#training = [j for j in index if j not in validation]
yield list(validation)
def class_error(y_hat, y, method='vanilla'):
""" Not used.
"""
a_opt, k, l = y_hat.shape
y_hat_c = zeros((a_opt, k, l), dtype='d')
if method == 'vanilla':
pass
for a in range(a_opt):
for i in range(k):
y_hat_c[a, i, argmax(y_hat[a,i,:])] = 1.0
err = 100*((y_hat_c + y) == 2).sum(1)/y.sum(0).astype('d')
return y_hat_c, err
def prediction_error(y_hat, y, method='squared'):
"""Loss function on multiclass Y.
@@ -651,7 +672,7 @@ def _wkernel_pls_val(X, Y, a_max, n_blocks=None):
for Din, Doi, Yin, Yout in V:
ym = -sum(Yout, 0)[newaxis]/(1.0*Yin.shape[0])
PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)
dat = w_simpls(Din, Yin, a_max)
Q, U, H = dat['Q'], dat['U'], dat['H']
That = dot(Doi, dot(U, inv(triu(dot(H.T, U))) ))


@@ -20,7 +20,7 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
extracts orthogonal components of maximum variance.
*Parameters*:
X : {array}
Data measurement matrix, (samples x variables)
aopt : {integer}
@@ -55,21 +55,21 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
mode : {string}, optional
Amount of info retained, [['normal'], 'fast', 'detailed']
*SeeAlso*:
`center` : Data centering
*Notes*
Uses kernel speed-up if m>>n or m<<n.
If residuals turn rank deficient, a lower number of components than given
in input will be used.
*Examples*:
>>> import scipy,engines
>>> a=scipy.asarray([[1,2,3],[2,4,5]])
>>> dat=engines.pca(a, 2)
@@ -77,7 +77,7 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
array([0.,99.8561562, 100.])
"""
m, n = X.shape
max_aopt = min(m, n)
if center_axis != None:
@@ -94,7 +94,7 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
u = u[:,:aopt]
s = s[:aopt]
v = v[:,:aopt]
# ranktest
tol = 1e-10
eff_rank = sum(s > s[0]*tol)
@@ -104,14 +104,14 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
T = T[:,:aopt]
P = v[:,:aopt]
e = s**2
if scale=='loads':
T = T/s
P = P*s
if mode in ['fast', 'f', 'F']:
return {'T':T, 'P':P, 'aopt':aopt, 'mnx': mnx}
if mode in ['detailed', 'd', 'D']:
E = empty((aopt, m, n))
ssq = []
@@ -135,7 +135,7 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
lev = [(1./m)+((T/s)**2).sum(1), (1./n)+(P**2).sum(1)]
# variances
expvarx = r_[0, 100*e.cumsum()/(X*X).sum()]
return {'T': T, 'P': P, 'E': E, 'evx': expvarx, 'leverage': lev, 'ssqx': ssq,
'aopt': aopt, 'eigvals': e, 'mnx': mnx}
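
As a rough guide to what the pca engine in this hunk computes, a compact numpy sketch of SVD scores, loadings and cumulative explained variance; the example data and everything except the names T, P, e and expvarx are assumptions:

import numpy as np

X = np.random.rand(8, 5)
Xc = X - X.mean(0)                    # column centering (center_axis=[0])
u, s, vt = np.linalg.svd(Xc, full_matrices=False)
aopt = 2
T = u[:, :aopt] * s[:aopt]            # scores
P = vt[:aopt].T                       # loadings
e = s[:aopt]**2
expvarx = np.r_[0, 100 * e.cumsum() / (Xc * Xc).sum()]
print(expvarx)                        # cumulative explained variance in %
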
@@ -159,20 +159,20 @@ def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=[0, 0]):
keys -- values, T -- scores, P -- loadings, E -- residuals,
levx -- leverages, ssqx -- sum of squares, expvarx -- cumulative
explained variance, aopt -- number of components used
*OtherParameters*:
mode : {string}
Amount of info retained, ('fast', 'normal', 'detailed')
center_axis : {integer}
Center along given axis. If neg.: no centering (-inf,..., matrix modes)
SeeAlso:
- pca : other blm
- pls : other blm
- lpls : other blm
*Notes*
-----
@@ -180,12 +180,12 @@ def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=[0, 0]):
Uses kernel speed-up if m>>n or m<<n.
If residuals turn rank deficient, a lower number of components than given
in input will be used. The number of components used is given in results-dict.
Examples
--------
>>> import scipy,engines
>>> a=scipy.asarray([[1,2,3],[2,4,5]])
>>> b=scipy.asarray([[1,1],[2,3]])
@@ -222,9 +222,9 @@ def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=[0, 0]):
F = b - dot(T, Q.T)
sepy = F**2
ssqy = [sepy.sum(0), sepy.sum(1)]
expvary = r_[0, 100*((T**2).sum(0)*(Q**2).sum(0)/(b**2).sum()).cumsum()[:aopt]]
dat.update({'Q': Q, 'F': F, 'evy': expvary, 'ssqy': ssqy, 'mny': mny})
return dat
@@ -245,9 +245,9 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
Which component should get the scale
center_axis : {-1, integer}
Perform centering across given axis, (-1 is no centering)
*Returns*:
T : {array}
X-scores
W : {array}
@@ -280,25 +280,25 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
Sum of squared residuals in Y along each dimension
leverage : {array}
Sample leverages
*OtherParameters*:
mode : ['normal', 'fast', 'detailed'], optional
How much detail to compute
*SeeAlso*:
`center` - data centering
*Notes*
- The output with mode='fast' will only return T and W
- If residuals turn rank deficient, a lower number of components than given in input will be used. The number of components used is given in results.
*Examples*
>>> import numpy, engines
>>> a = numpy.asarray([[1,2,3],[2,4,5]])
>>> b = numpy.asarray([[1,1],[2,3]])
@@ -307,7 +307,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
array([0.,99.8561562, 100.])
"""
m, n = X.shape
try:
k, l = Y.shape
@@ -322,7 +322,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
Y, mny = center(Y, center_axis[1])
min_aopt = min_aopt - 1
assert(aopt > 0 and aopt < min_aopt)
W = empty((n, aopt))
P = empty((n, aopt))
R = empty((n, aopt))
@@ -330,7 +330,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
T = empty((m, aopt))
B = empty((aopt, n, l))
tt = empty((aopt,))
XY = dot(X.T, Y)
for i in range(aopt):
if XY.shape[1] == 1: #pls 1
@@ -345,7 +345,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
# with many samples, many x-vars and many non-orth y-vars (where arpack speed
# shines)
#############
#s, w = arpack.eigen_symmetric(dot(XY, XY.T),k=1, tol=1e-10, maxiter=1000)
#if s[0] == 0:
# print "Arpack did not converge... using svd"
@@ -357,15 +357,15 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
# print "Arpack did not converge... using svd"
q, s, vh = svd(dot(XY.T, XY))
q = q[:,:1]
w = dot(XY, q)
w = w/vnorm(w)
r = w.copy()
if i > 0:
for j in range(0, i, 1):
r = r - dot(P[:,j].T, w)*R[:,j][:,newaxis]
t = dot(X, r)
tt[i] = tti = dot(t.T, t).ravel()
p = dot(X.T, t)/tti
@@ -385,7 +385,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
R[:,i] = r.ravel()
Q[:,i] = q.ravel()
B[i] = dot(R[:,:i+1], Q[:,:i+1].T)
qnorm = apply_along_axis(vnorm, 0, Q)
tnorm = sqrt(tt)
pp = (P**2).sum(0)
@@ -412,13 +412,13 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
sepy = F**2
ssqy = [sepy.sum(0), sepy.sum(1)]
leverage = 1./m + ((T/tnorm)**2).sum(1)
# variances
tp= tt*pp
tq = tt*qnorm*qnorm
expvarx = r_[0, 100*tp/(X*X).sum()]
expvary = r_[0, 100*tq/(Y*Y).sum()]
if scale == 'loads':
T = T/tnorm
W = W*tnorm
@@ -438,7 +438,7 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores',
of these three matrices tries to discover common directions/subspaces.
*Parameters*:
X : {array}
Main data matrix (m, n)
Y : {array}
@@ -457,9 +457,9 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores',
0 : row center
1 : column center
2 : double center
*Returns*:
T : {array}
X-scores
W : {array}
@@ -504,18 +504,18 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores',
Saeboe et al., LPLS-regression: a method for improved prediction and
classification through inclusion of background information on
predictor variables, J. of chemometrics and intell. laboratory syst.
Martens et.al, Regression of a data matrix on descriptors of
both its rows and of its columns via latent variables: L-PLSR,
Computational statistics & data analysis, 2005
"""
m, n = X.shape
k, l = Y.shape
u, o = Z.shape
max_rank = min(m, n)
if center_axis != None:
xctr, yctr, zctr = center_axis
X, mnX = center(X, xctr)
@@ -523,14 +523,14 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores',
Z, mnZ = center(Z, zctr)
max_rank = max_rank -1
assert (a_max > 0 and a_max < max_rank), "Number of comp error:\
tried: %d, max_rank: %d" %(a_max, max_rank)
# initial variance
varX = (X**2).sum()
varY = (Y**2).sum()
varZ = (Z**2).sum()
# initialize
U = empty((k, a_max))
Q = empty((l, a_max))
T = zeros((m, a_max))
@@ -600,7 +600,7 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores',
else:
k = w
l = dot(G, w)
U[:,a] = u.ravel()
W[:,a] = w.ravel()
P[:,a] = p.ravel()
@@ -617,11 +617,11 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores',
var_x[a] = pow(E, 2).sum()
var_y[a] = pow(F, 2).sum()
var_z[a] = pow(G, 2).sum()
B[a] = dot(dot(W[:,:a+1], inv(dot(P[:,:a+1].T, W[:,:a+1]))), Q[:,:a+1].T)
#b0[a] = mnY - dot(mnX, B[a])
# variance explained
evx = 100.*(1 - var_x/varX)
evy = 100.*(1 - var_y/varY)
@@ -635,8 +635,8 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores',
knorm = apply_along_axis(vnorm, 0, K)
L = L*knorm
K = K/knorm
return {'T':T, 'W':W, 'P':P, 'Q':Q, 'U':U, 'L':L, 'K':K, 'B':B, 'E': E, 'F': F, 'G': G, 'evx':evx, 'evy':evy, 'evz':evz,'mnx': mnX, 'mny': mnY, 'mnz': mnZ}
def lpls_predict(model_dict, x, aopt):
"""Predict lpls responses from existing model on new data.
@@ -646,25 +646,25 @@ def lpls_predict(model_dict, x, aopt):
except:
x = atleast_2d(x.shape)
m, n = x.shape
if 'B0' in model_dict.keys():
y = model_dict['B0'] + dot()
def vnorm(a):
"""Returns the norm of a vector.
*Parameters*:
a : {array}
Input data, 1-dim, or column vector (m, 1)
*Returns*:
a_norm : {array}
Norm of input vector
"""
return msqrt(dot(a.T,a))
@@ -676,7 +676,7 @@ def center(a, axis):
a : {array}
Input data
axis : {integer}
Which centering to perform.
0 = col center, 1 = row center, 2 = double center
-1 = nothing
@@ -707,16 +707,16 @@ def center(a, axis):
else:
mn = a.mean()*ones(a.shape)
return a - mn, mn
if axis == -1:
mn = zeros((1,a.shape[1],))
- mn = tile(mn, (a.shape[0], 1))
+ #mn = tile(mn, (a.shape[0], 1))
elif axis == 0:
mn = a.mean(0)[newaxis]
- mn = tile(mn, (a.shape[0], 1))
+ #mn = tile(mn, (a.shape[0], 1))
elif axis == 1:
mn = a.mean(1)[:,newaxis]
- mn = tile(mn, (1, a.shape[1]))
+ #mn = tile(mn, (1, a.shape[1]))
elif axis == 2:
#fixme: double centering returns column mean as loc-vector, ok?
mn = a.mean(0)[newaxis] + a.mean(1)[:,newaxis] - a.mean()
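
The centering modes visible in this hunk can be summarised in a few lines of numpy; this is a sketch of the same arithmetic, not the file's center() itself:

import numpy as np

def center_sketch(a, axis):
    # -1: no centering, 0: column center, 1: row center, 2: double center
    if axis == -1:
        mn = np.zeros((1, a.shape[1]))
    elif axis == 0:
        mn = a.mean(0)[np.newaxis]
    elif axis == 1:
        mn = a.mean(1)[:, np.newaxis]
    elif axis == 2:
        mn = a.mean(0)[np.newaxis] + a.mean(1)[:, np.newaxis] - a.mean()
    return a - mn, mn

a = np.arange(6.).reshape(2, 3)
print(center_sketch(a, 2)[0])   # double-centered: rows and columns sum to ~0
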
@@ -774,7 +774,7 @@ def _scale(a, axis):
a : {array}
Input data
axis : {integer}
Which scaling to perform.
0 = column, 1 = row, -1 = nothing
*Returns*:
@@ -784,7 +784,7 @@ def _scale(a, axis):
mn : {array}
Scaling vector/matrix
"""
if axis == -1:
sc = zeros((a.shape[1],))
elif axis == 0:
@@ -817,21 +817,20 @@ def esvd(data, a_max=None):
Singular values
v : {array}
Left hand eigenvectors
*Notes*:
Uses Arnoldi iterations for the symmetric eigendecomp (ARPACK)
"""
m, n = data.shape
- if m >= n:
+ if m > n:
kernel = dot(data.T, data)
if a_max == None:
a_max = n - 1
s, v = arpack.eigen_symmetric(kernel, k=a_max, which='LM',
- maxiter=200, tol=1e-5)
+ maxiter=500, tol=1e-7)
s = s[::-1]
v = v[:,::-1]
#u, s, vt = svd(kernel)
@@ -841,9 +840,9 @@ def esvd(data, a_max=None):
else:
kernel = dot(data, data.T)
if a_max == None:
- a_max = m -1
+ a_max = m - 1
s, u = arpack.eigen_symmetric(kernel, k=a_max, which='LM',
- maxiter=200, tol=1e-5)
+ maxiter=500, tol=1e-7)
s = s[::-1]
u = u[:,::-1]
#u, s, vt = svd(kernel)
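
The esvd routine above uses the kernel trick: the SVD is obtained from the smaller of X'X and XX'. A hedged numpy sketch of that idea (eigh in place of ARPACK; names and the check are assumptions):

import numpy as np

def esvd_sketch(data, a_max=None):
    # economy SVD via the smaller kernel matrix
    m, n = data.shape
    if a_max is None:
        a_max = min(m, n) - 1
    if m > n:
        s, v = np.linalg.eigh(data.T @ data)           # ascending eigenvalues
        s, v = s[::-1][:a_max], v[:, ::-1][:, :a_max]
        s = np.sqrt(np.clip(s, 0, None))               # singular values
        u = data @ v / s
    else:
        s, u = np.linalg.eigh(data @ data.T)
        s, u = s[::-1][:a_max], u[:, ::-1][:, :a_max]
        s = np.sqrt(np.clip(s, 0, None))
        v = data.T @ u / s
    return u, s, v

X = np.random.rand(50, 4)
u, s, v = esvd_sketch(X)
print(np.allclose(u.T @ u, np.eye(len(s))))            # orthonormal score basis
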


@@ -21,7 +21,7 @@ class Model(object):
def __init__(self, name="johndoe"):
self.name = name
self.options = {}
def save(self, filename='pca.ml'):
pass
@@ -46,7 +46,7 @@ class PCA(Model):
self._x = x
self.amax = amax
self.aopt = amax
# properties
def amax():
doc = "maximum number of components"
@@ -77,7 +77,7 @@ class PCA(Model):
del self._tot_var
return locals()
tot_var = property(**tot_var())
def scores():
doc = "pca scores"
def fget(self):
@@ -94,7 +94,7 @@ class PCA(Model):
del self._core_scores
return locals()
scores = property(**scores())
def loadings():
doc = "pca loadings"
def fget(self):
@@ -111,7 +111,7 @@ class PCA(Model):
self._loadings = p
return locals()
loadings = property(**loadings())
def singvals():
doc = "Singular values"
def fget(self):
@@ -128,7 +128,7 @@ class PCA(Model):
del self._singvals
return locals()
singvals = property(**singvals())
def x():
doc = "x is readonly, may not be deleted"
def fget(self):
@@ -154,12 +154,12 @@ class PCA(Model):
del self._xc
return locals()
xadd = property(**xadd())
def xc():
doc = "mean_centered input data"
def fget(self):
if not hasattr(self, "_xc"):
self._xc = self.x + self.xadd
return self._xc
def fset(self, xc):
self._xc = xc
@@ -186,7 +186,7 @@ class PCA(Model):
del self._xw
return locals()
xw = property(**xw())
def explained_variance():
doc = "explained variance"
def fget(self):
@@ -215,7 +215,7 @@ class PCA(Model):
del self._residuals
return locals()
residuals = property(**residuals())
def leverage():
doc = "objects leverage"
def fget(self):
@@ -254,7 +254,7 @@ class PCA(Model):
self._column_metric = scale(self.xc, axis=1)
return self._column_metric
def fset(self, w):
self._column_metric = w
# update model
def fdel(self):
@@ -263,7 +263,7 @@ class PCA(Model):
del self._xd
return locals()
column_metric = property(**column_metric())
def blm_update(self, a, b):
pass
@@ -281,8 +281,8 @@ class PCA(Model):
def reweight(self, w):
pass
if __name__ == "__main__":
from numpy.random import rand
X = rand(4,10)


@@ -22,9 +22,9 @@ def hotelling(Pcv, P, p_center='median', cov_center='median',
used in multivariate hypothesis testing. In order to avoid small-variance
samples becoming significant, this version allows borrowing variance
from the pooled covariance.
*Parameters*:
Pcv : {array}
Crossvalidation segments of parameter
P : {array}
@@ -39,9 +39,9 @@ def hotelling(Pcv, P, p_center='median', cov_center='median',
Rotate sub-segments toward calibration model.
strict : {boolean}, optional
Only rotate 90 degrees
*Returns*:
tsq : {array}
Hotelling's T^2 estimate
@@ -50,19 +50,19 @@ def hotelling(Pcv, P, p_center='median', cov_center='median',
Gidskehaug et al., A framework for significance analysis of
gene expression data using dimension reduction methods, BMC
bioinformatics, 2007
*Notes*
The rotational freedom in the solution of bilinear
models may require a rotation onto the calibration
model. One way of doing that is procrustes rotation.
"""
m, n = P.shape
n_sets, n, amax = Pcv.shape
T_sq = empty((n,), dtype='d')
Cov_i = zeros((n, amax, amax), dtype='d')
# rotate sub_models to full model
if crot:
for i, Pi in enumerate(Pcv):
@@ -77,20 +77,15 @@ def hotelling(Pcv, P, p_center='median', cov_center='median',
P_ctr = P
for i in xrange(n):
Pi = Pcv[:,i,:] # (n_sets x amax)
Pi_ctr = P_ctr[i,:] # (1 x amax)
- #Pim = (Pi - Pi_ctr)*msqrt(n_sets-1)
+ Pim = (Pi - Pi_ctr)*msqrt(n_sets-1)
- #Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim)
+ Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim)
- Pim = (Pi - Pi_ctr)
- Cov_i[i] = dot(Pim.T, Pim)
if cov_center == 'median':
Cov_p = median(Cov_i)
elif cov_center == 'mean':
Cov_p = Cov_i.mean(0)
- else:
- print "Pooled covariance est. invalid, using median"
- print cov_center
- Cov_p = median(Cov_i)
reg_cov = (1. - alpha)*Cov_i + alpha*Cov_p
for i in xrange(n):
Pc = P_ctr[i,:]
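
A rough numpy sketch of the regularised Hotelling T^2 the loop above builds: per-variable covariances over the jack-knife segments are shrunk toward a pooled (median) covariance. Names, shapes and the use of the calibration loadings as the centre are assumptions, not the file's hotelling():

import numpy as np

def t2_sketch(Pcv, P, alpha=0.3):
    n_sets, n, amax = Pcv.shape
    Pim = (Pcv - P[np.newaxis]) * np.sqrt(n_sets - 1.0)
    Cov_i = np.einsum('sia,sib->iab', Pim, Pim) / n_sets   # per-variable covariance
    Cov_p = np.median(Cov_i, 0)                            # pooled covariance
    reg_cov = (1.0 - alpha) * Cov_i + alpha * Cov_p
    tsq = np.empty(n)
    for i in range(n):
        Pc = P[i, :]
        tsq[i] = Pc @ np.linalg.solve(reg_cov[i], Pc)
    return tsq

Pcv = np.random.rand(7, 12, 2)   # (n_sets, n_vars, amax) jack-knife loadings
P = Pcv.mean(0)                  # stand-in for the calibration loadings
print(t2_sketch(Pcv, P).shape)   # -> (12,)
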
@@ -105,7 +100,7 @@ def procrustes(a, b, strict=True, center=False, force_norm=False, verbose=False)
onto another by minimising the squared error.
*Parameters*:
a : {array}
Input array, stationary
b : {array}
@@ -127,9 +122,9 @@ def procrustes(a, b, strict=True, center=False, force_norm=False, verbose=False)
*Reference*:
Schonemann, A generalized solution of the orthogonal Procrustes
problem, Psychometrika, 1966
"""
if center:
mn_a = a.mean(0)
a = a - mn_a
@@ -145,7 +140,7 @@ def procrustes(a, b, strict=True, center=False, force_norm=False, verbose=False)
u, s, vt = svd(dot(b.T, a))
Cm = dot(u, vt) # Cm: orthogonal rotation matrix
if strict:
Cm = _ensure_strict(Cm)
b_rot = dot(b, Cm)
if verbose:
fit = ((b - b_rot)**2).sum()
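
The rotation solved for in this hunk is the classical orthogonal Procrustes problem; a small self-contained numpy example (function name and test data are assumptions):

import numpy as np

def procrustes_rotation(a, b):
    # orthogonal Cm minimising ||a - b @ Cm||_F (Schonemann, 1966)
    u, s, vt = np.linalg.svd(b.T @ a)
    return u @ vt

a = np.random.rand(10, 3)
C_true, _ = np.linalg.qr(np.random.rand(3, 3))   # a random orthogonal matrix
b = a @ C_true.T                                 # rotated copy of a
Cm = procrustes_rotation(a, b)
print(np.allclose(b @ Cm, a))                    # the rotation is recovered
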
@@ -158,7 +153,7 @@ def procrustes(a, b, strict=True, center=False, force_norm=False, verbose=False)
def _ensure_strict(C, only_flips=True):
"""Ensure that a rotation matrix does only 90 degree rotations.
In multiplication with pcs this allows flips and reordering.
if only_flips is True there will only be flips allowed
@@ -173,13 +168,13 @@ def _ensure_strict(C, only_flips=True):
C_rot : {array}
Restricted rotation matrix
*Notes*:
This function is not ready for use. Use (only_flips=True).
That is, for more than two components, the rotation matrix
has a tendency to be unstable (det(Cm)>1), when rounding is used.
"""
if only_flips:
C = eye(C.shape[0])*sign(C)
@@ -199,9 +194,9 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
The response (Y) is randomly permuted, and the number of false positives
is registered by comparing Hotelling's T2 statistics of the calibration model.
*Parameters*:
X : {array}
Main data matrix (m, n)
Y : {array}
@@ -237,14 +232,14 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
nsets : {integer}
Number of crossvalidation segments
*Reference*:
Gidskehaug et al., A framework for significance analysis of
gene expression data using dimension reduction methods, BMC
bioinformatics, 2007
"""
m, n = X.shape
k, nz = Z.shape
assert(n==nz)
@@ -255,7 +250,7 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
Y = atleast_2d(Y).T
my, l = Y.shape
assert(m==my)
pert_tsq_x = zeros((n, n_iter), dtype='d') # (nxvars x n_subsets)
pert_tsq_z = zeros((k, n_iter), dtype='d') # (nzvars x n_subsets)
@@ -264,7 +259,7 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
Wc, Lc = lpls_jk(X, Y, Z ,aopt, zorth=zorth)
cal_tsq_x = hotelling(Wc, dat['W'], alpha=alpha)
cal_tsq_z = hotelling(Lc, dat['L'], alpha=alpha)
print "morn"
# Perturbations
index = arange(m)
for i in range(n_iter):
@@ -275,7 +270,7 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha)
# no reason to borrow variance in dag (alpha ->some small value)
pert_tsq_z[:,i] = hotelling(Li, dat['L'], alpha=0.01)
return cal_tsq_z, pert_tsq_z, cal_tsq_x, pert_tsq_x
@@ -286,9 +281,9 @@ def pls_qvals(X, Y, aopt, alpha=.3, n_iter=20,p_center='med', cov_center=median,
The response (Y) is randomly permuted, and the number of false positives
is registered by comparing Hotelling's T2 statistics of the calibration model.
*Parameters*:
X : {array}
Main data matrix (m, n)
Y : {array}
@@ -318,14 +313,14 @@ def pls_qvals(X, Y, aopt, alpha=.3, n_iter=20,p_center='med', cov_center=median,
Only rotate 90 degrees
nsets : {integer}
Number of crossvalidation segments
*Reference*:
Gidskehaug et al., A framework for significance analysis of
gene expression data using dimension reduction methods, BMC
bioinformatics, 2007
"""
m, n = X.shape
try:
my, l = Y.shape
@@ -341,7 +336,7 @@ def pls_qvals(X, Y, aopt, alpha=.3, n_iter=20,p_center='med', cov_center=median,
Wc = pls_jk(X, Y , aopt)
cal_tsq_x = hotelling(Wc, dat['W'], alpha=alpha)
# Perturbations
pert_tsq_x = zeros((n, n_iter), dtype='d') # (nxvars x n_subsets)
index = arange(m)
@@ -351,7 +346,7 @@ def pls_qvals(X, Y, aopt, alpha=.3, n_iter=20,p_center='med', cov_center=median,
dat = pls(X, Y[indi,:], aopt, scale='loads', center_axis=center_axis)
Wi = pls_jk(X, Y[indi,:], aopt, nsets=nsets, center_axis=center_axis)
pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha)
return cal_tsq_x, pert_tsq_x
@@ -362,7 +357,7 @@ def _fdr(tsq, tsqp, loc_method=median):
Fdr is a method used in multiple hypothesis testing to correct for multiple
comparisons. It controls the expected proportion of incorrectly rejected null
hypotheses (type I errors) in a list of rejected hypotheses.
*Parameters*:
tsq : {array}
@@ -372,14 +367,14 @@ def _fdr(tsq, tsqp, loc_method=median):
loc_method : {py_func}
Location method
*Returns*:
fdr : {array}
False discovery rate
*Notes*:
This is an internal function for use in fdr estimation of jack-knifed
perturbed blm parameters.
@@ -403,4 +398,3 @@ def _fdr(tsq, tsqp, loc_method=median):
fd_rate = fp/n_signif
fd_rate[fd_rate>1] = 1
return fd_rate
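
The tail of _fdr above computes a false discovery rate: each observed T^2 value is used as a threshold, the permuted values give the expected number of false positives above it, and the ratio is capped at 1. A rough numpy sketch under assumed shapes and names, not the function's actual signature:

import numpy as np

def fdr_sketch(tsq, tsq_perm, loc=np.median):
    # tsq: (n,) observed T^2; tsq_perm: (n, n_iter) T^2 under permuted responses
    order = np.argsort(tsq)[::-1]
    thresholds = tsq[order]
    n_signif = np.arange(1, tsq.size + 1)                     # observed >= threshold
    fp = loc((tsq_perm[:, :, None] >= thresholds).sum(0), axis=0)
    q = np.minimum(fp / n_signif, 1.0)
    out = np.empty_like(q)
    out[order] = q                                            # back to original order
    return out

tsq = np.array([8.0, 0.5, 3.0, 0.2])
tsq_perm = np.abs(np.random.randn(4, 200))
print(fdr_sketch(tsq, tsq_perm))
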