pyblm

Fixed conflicts

This commit is contained in:
Arnar Flatberg 2007-12-14 00:16:31 +00:00
parent 1103245d85
commit 253305b602
5 changed files with 177 additions and 164 deletions

View File

@ -13,4 +13,3 @@ def test(level=1, verbosity=1):
print 'Python version %s' % (sys.version.replace('\n', '',),)
from numpy.testing import NumpyTest
return NumpyTest().test(level, verbosity)

View File

@ -155,7 +155,7 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=F
validation scheme.
*Parameters*:
X : {array}
Main data matrix (m, n)
Y : {array}
@ -180,9 +180,9 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=F
If true, Require orthogonal latent components in Z.
verbose : {boolean}, optional
Verbosity of console output. For use in debugging.
*Returns*:
rmsep : {array}
Root mean squared error of prediction
yhat : {array}
@ -191,19 +191,19 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=F
Estimated value of optimal number of components
"""
m, n = X.shape
k, l = Y.shape
o, p = Z.shape
assert m == k, "X (%d,%d) - Y (%d,%d) dim mismatch" %(m, n, k, l)
assert n == p, "X (%d,%d) - Z (%d,%d) dim mismatch" %(m, n, o, p)
if nsets == None:
nsets = m
if nsets > X.shape[0]:
print "nsets (%d) is larger than number of variables (%d).\nnsets: %d -> %d" %(nsets, m, nsets, m)
nsets = m
assert (alpha >= 0 and alpha<=1), "Alpha needs to be within [0,1], got: %.2f" %alpha
Yhat = empty((a_max, k, l), 'd')
for cal, val in cv(k, nsets):
# do the training model
@ -217,10 +217,16 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=F
# predictions
for a in range(a_max):
Yhat[a,val,:] = atleast_2d(ym + dot(xi, dat['B'][a]))
# todo: need a better support for classification error
y_is_class = Y.dtype.char.lower() in ['i','p', 'b', 'h','?']
if y_is_class:
pass
#Yhat, err = class_error(Yhat, Y)
#return Yhat, err
sep = (Y - Yhat)**2
rmsep = sqrt(sep.mean(1)).T
#aopt = find_aopt_from_sep(rmsep)
# todo: need a better support for classification error
error = prediction_error(Yhat, Y, method='1/2')
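A small self-contained sketch of the RMSEP computation above, with made-up shapes (a_max components, k samples, l responses); illustrative only, not part of this commit:

import numpy as np
a_max, k, l = 3, 20, 2
Y = np.random.randn(k, l)
Yhat = np.random.randn(a_max, k, l)   # cross-validated predictions
sep = (Y - Yhat)**2                   # squared errors, shape (a_max, k, l)
rmsep = np.sqrt(sep.mean(1)).T        # root mean over samples, shape (l, a_max)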
@ -228,7 +234,7 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=F
def pca_jk(a, aopt, nsets=None, center_axis=[0], method='cv'):
"""Returns jack-knife segments from PCA.
*Parameters*:
a : {array}
@ -252,9 +258,9 @@ def pca_jk(a, aopt, nsets=None, center_axis=[0], method='cv'):
Loadings collected in a three way matrix (n_segments, m, aopt)
*Notes*:
- Crossvalidation method is currently set to random blocks of samples.
"""
m, n = a.shape
if nsets == None:
@ -278,14 +284,14 @@ def pca_jk(a, aopt, nsets=None, center_axis=[0], method='cv'):
Pcv[i,:,:] = pca(a[cal,:], aopt, mode='fast', scale='loads', center_axis = center_axis)['P']
else:
raise NotImplementedError(method)
return Pcv
def pls_jk(X, Y, a_opt, nsets=None, center_axis=[0,0], verbose=False):
""" Returns jack-knife segements of W.
*Parameters*:
X : {array}
Main data matrix (m, n)
Y : {array}
@ -294,7 +300,7 @@ def pls_jk(X, Y, a_opt, nsets=None, center_axis=[0,0], verbose=False):
The number of components to calculate (0, min(m,n))
nsets : (integer), optional
Number of jack-knife segments
center_axis : {boolean}, optional
- -1 : nothing
- 0 : row center
@ -302,12 +308,12 @@ def pls_jk(X, Y, a_opt, nsets=None, center_axis=[0,0], verbose=False):
- 2 : double center
verbose : {boolean}, optional
Verbosity of console output. For use in debugging.
*Returns*:
Wcv : {array}
Loading-weights jack-knife segments
"""
m, n = X.shape
k, l = Y.shape
@ -320,7 +326,7 @@ def pls_jk(X, Y, a_opt, nsets=None, center_axis=[0,0], verbose=False):
print "Segment number: %d" %i
dat = pls(X[cal,:], Y[cal,:], a_opt, scale='loads', mode='fast', center_axis=center_axis)
Wcv[i,:,:] = dat['W']
return Wcv
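A hedged sketch of the jack-knife collection pattern that pca_jk/pls_jk follow; the fit argument is a stand-in for the package's engines, not their actual signature:

import numpy as np

def jackknife_segments(X, Y, a_opt, nsets, fit):
    # fit(Xcal, Ycal, a_opt) -> (n, a_opt) loading-weight matrix
    m, n = X.shape
    Wcv = np.empty((nsets, n, a_opt))
    index = np.arange(m)
    for i in range(nsets):
        val = index[i::nsets]              # hold out one segment
        cal = np.setdiff1d(index, val)
        Wcv[i, :, :] = fit(X[cal, :], Y[cal, :], a_opt)
    return Wcv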
def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, center_axis=[2,0,2], zorth=False, verbose=False):
@ -332,10 +338,10 @@ def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, center_axis=[2,0,2], zorth=
infer the parameter confidence in the model.
The segments returned are the X-block weights and Z-block weights.
*Parameters*:
X : {array}
Main data matrix (m, n)
Y : {array}
@ -358,15 +364,15 @@ def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, center_axis=[2,0,2], zorth=
2 : double center
verbose : {boolean}, optional
Verbosity of console output. For use in debugging.
*Returns*:
Wx : {array}
X-block jack-knife segments
Wz : {array}
Z-block jack-knife segments
"""
m, n = X.shape
k, l = Y.shape
o, p = Z.shape
@ -388,7 +394,7 @@ def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, center_axis=[2,0,2], zorth=
def find_aopt_from_sep(err, method='vanilla'):
"""Returns an estimate of optimal number of components.
The estimate is based on the error of prediction from
crossvalidation. This is pretty much wild guessing and it is
recommended to inspect model parameters and prediction errors
@ -406,7 +412,7 @@ def find_aopt_from_sep(err, method='vanilla'):
aopt : {integer}
A guess on the optimal number of components
"""
if method == 'vanilla':
# min rmsep
rmsecv = sqrt(err.mean(0))
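A hedged sketch of the 'vanilla' rule: average the squared prediction errors over segments and pick the component count at the minimum root mean error (argument shape and name are illustrative):

import numpy as np

def aopt_vanilla(err):
    # err: squared prediction errors, first axis = crossvalidation segments
    rmsecv = np.sqrt(err.mean(0))
    while rmsecv.ndim > 1:           # collapse any remaining response axes
        rmsecv = rmsecv.mean(-1)
    return int(rmsecv.argmin()) + 1  # 1-based number of components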
@ -434,7 +440,7 @@ def cv(N, K, randomise=True, sequential=False):
of length ~N/K, *without* replacement.
*Parameters*:
N : {integer}
Total number of samples
K : {integer}
@ -443,7 +449,7 @@ def cv(N, K, randomise=True, sequential=False):
Use random sampling
sequential : {boolean}
Use sequential sampling
*Returns*:
training : {array-like}
@ -456,12 +462,12 @@ def cv(N, K, randomise=True, sequential=False):
If randomise is true, a copy of index is shuffled before partitioning,
otherwise its order is preserved in training and validation.
Randomise overrides the sequential argument. If randomise is true,
sequential is set to False.
If sequential is true the index is partitioned into continuous blocks,
otherwise interleaved ordering is used.
"""
if K > N:
raise ValueError, "You cannot divide a list of %d samples into more than %d segments. Yout tried: %s" %(N, N, K)
@ -510,6 +516,8 @@ def diag_cv(shape, nsets=9, randomise=True):
except:
raise ValueError("shape needs to be a two-tuple")
if nsets>m or nsets>n:
msg = "You may not use more subsets than max(n_rows, n_cols)"
raise ValueError, msg
msg = "You may not use more subsets than max(n_rows, n_cols)"
nsets = min(m, n)
nm = n*m
@ -524,7 +532,20 @@ def diag_cv(shape, nsets=9, randomise=True):
validation.update(ind)
#training = [j for j in index if j not in validation]
yield list(validation)
def class_error(y_hat, y, method='vanilla'):
""" Not used.
"""
a_opt, k, l = y_hat.shape
y_hat_c = zeros((k, l), dtype='d')
if method == vanilla:
pass
for a in range(a_opt):
for i in range(k):
y_hat_c[a, val, argmax(y_hat[a,val,:])] = 1.0
err = 100*((y_hat_c + y) == 2).sum(1)/y.sum(0).astype('d')
return y_hat_c, err
def prediction_error(y_hat, y, method='squared'):
"""Loss function on multiclass Y.
@ -651,7 +672,7 @@ def _wkernel_pls_val(X, Y, a_max, n_blocks=None):
for Din, Doi, Yin, Yout in V:
ym = -sum(Yout, 0)[newaxis]/(1.0*Yin.shape[0])
PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)
dat = w_simpls(Din, Yin, a_max)
Q, U, H = dat['Q'], dat['U'], dat['H']
That = dot(Doi, dot(U, inv(triu(dot(H.T, U))) ))

View File

@ -20,7 +20,7 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
extracts orthogonal components of maximum variance.
*Parameters*:
X : {array}
Data measurement matrix, (samples x variables)
aopt : {integer}
@ -55,21 +55,21 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
mode : {string}, optional
Amount of info retained, [['normal'], 'fast', 'detailed']
*SeeAlso*:
`center` : Data centering
*Notes*
Uses kernel speed-up if m>>n or m<<n.
If residuals turn rank deficient, a lower number of components than given
in input will be used.
*Examples*:
>>> import scipy,engines
>>> a=scipy.asarray([[1,2,3],[2,4,5]])
>>> dat=engines.pca(a, 2)
@ -77,7 +77,7 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
array([0.,99.8561562, 100.])
"""
m, n = X.shape
max_aopt = min(m, n)
if center_axis != None:
@ -94,7 +94,7 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
u = u[:,:aopt]
s = s[:aopt]
v = v[:,:aopt]
# ranktest
tol = 1e-10
eff_rank = sum(s > s[0]*tol)
@ -104,14 +104,14 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
T = T[:,:aopt]
P = v[:,:aopt]
e = s**2
if scale=='loads':
T = T/s
P = P*s
if mode in ['fast', 'f', 'F']:
return {'T':T, 'P':P, 'aopt':aopt, 'mnx': mnx}
if mode in ['detailed', 'd', 'D']:
E = empty((aopt, m, n))
ssq = []
@ -135,7 +135,7 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
lev = [(1./m)+((T/s)**2).sum(1), (1./n)+(P**2).sum(1)]
# variances
expvarx = r_[0, 100*e.cumsum()/(X*X).sum()]
return {'T': T, 'P': P, 'E': E, 'evx': expvarx, 'leverage': lev, 'ssqx': ssq,
'aopt': aopt, 'eigvals': e, 'mnx': mnx}
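A minimal numpy sketch of the core of the pca engine above: SVD of the centered data, scores T = U*s, loadings P = V, and cumulative explained variance (ignores the rank test and the mode/scale options):

import numpy as np

def pca_sketch(X, aopt):
    Xc = X - X.mean(0)                         # column centering (center_axis=[0])
    u, s, vt = np.linalg.svd(Xc, full_matrices=False)
    T = u[:, :aopt] * s[:aopt]                 # scores
    P = vt[:aopt].T                            # loadings
    e = s[:aopt]**2                            # eigenvalues of Xc'Xc
    expvarx = np.r_[0, 100 * e.cumsum() / (Xc * Xc).sum()]
    return {'T': T, 'P': P, 'evx': expvarx}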
@ -159,20 +159,20 @@ def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=[0, 0]):
keys -- values, T -- scores, P -- loadings, E -- residuals,
levx -- leverages, ssqx -- sum of squares, expvarx -- cumulative
explained variance, aopt -- number of components used
*OtherParameters*:
mode : {string}
Amount of info retained, ('fast', 'normal', 'detailed')
center_axis : {integer}
Center along given axis. If neg.: no centering (-inf,..., matrix modes)
SeeAlso:
- pca : other blm
- pls : other blm
- lpls : other blm
*Notes*
-----
@ -180,12 +180,12 @@ def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=[0, 0]):
Uses kernel speed-up if m>>n or m<<n.
If residuals turn rank deficient, a lower number of components than given
in input will be used. The number of components used is given in results-dict.
Examples
--------
>>> import scipy,engines
>>> a=scipy.asarray([[1,2,3],[2,4,5]])
>>> b=scipy.asarray([[1,1],[2,3]])
@ -222,9 +222,9 @@ def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=[0, 0]):
F = b - dot(T, Q.T)
sepy = F**2
ssqy = [sepy.sum(0), sepy.sum(1)]
expvary = r_[0, 100*((T**2).sum(0)*(Q**2).sum(0)/(b**2).sum()).cumsum()[:aopt]]
dat.update({'Q': Q, 'F': F, 'evy': expvary, 'ssqy': ssqy, 'mny': mny})
return dat
@ -245,9 +245,9 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
Which component should get the scale
center_axis : {-1, integer}
Perform centering across given axis, (-1 is no centering)
*Returns*:
T : {array}
X-scores
W : {array}
@ -280,25 +280,25 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
Sum of squared residuals in Y along each dimension
leverage : {array}
Sample leverages
*OtherParameters*:
mode : ['normal', 'fast', 'detailed'], optional
How much details to compute
*SeeAlso*:
`center` - data centering
*Notes*
- The output with mode='fast' will only return T and W
- If residuals turn rank deficient, a lower number of components than given in input will be used. The number of components used is given in results.
*Examples*
>>> import numpy, engines
>>> a = numpy.asarray([[1,2,3],[2,4,5]])
>>> b = numpy.asarray([[1,1],[2,3]])
@ -307,7 +307,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
array([0.,99.8561562, 100.])
"""
m, n = X.shape
try:
k, l = Y.shape
@ -322,7 +322,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
Y, mny = center(Y, center_axis[1])
min_aopt = min_aopt - 1
assert(aopt > 0 and aopt < min_aopt)
W = empty((n, aopt))
P = empty((n, aopt))
R = empty((n, aopt))
@ -330,7 +330,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
T = empty((m, aopt))
B = empty((aopt, n, l))
tt = empty((aopt,))
XY = dot(X.T, Y)
for i in range(aopt):
if XY.shape[1] == 1: #pls 1
@ -345,7 +345,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
# with many samples, many x-vars and many non-orth y-vars (where arpack speed
# shines)
#############
#s, w = arpack.eigen_symmetric(dot(XY, XY.T),k=1, tol=1e-10, maxiter=1000)
#if s[0] == 0:
# print "Arpack did not converge... using svd"
@ -357,15 +357,15 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
# print "Arpack did not converge... using svd"
q, s, vh = svd(dot(XY.T, XY))
q = q[:,:1]
w = dot(XY, q)
w = w/vnorm(w)
r = w.copy()
if i > 0:
for j in range(0, i, 1):
r = r - dot(P[:,j].T, w)*R[:,j][:,newaxis]
t = dot(X, r)
tt[i] = tti = dot(t.T, t).ravel()
p = dot(X.T, t)/tti
@ -385,7 +385,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
R[:,i] = r.ravel()
Q[:,i] = q.ravel()
B[i] = dot(R[:,:i+1], Q[:,:i+1].T)
qnorm = apply_along_axis(vnorm, 0, Q)
tnorm = sqrt(tt)
pp = (P**2).sum(0)
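A hedged sketch of how one PLS component is extracted in the loop above: the dominant right singular vector of X'Y gives the Y-weight q, the X-weight w follows, and w is orthogonalised against earlier loadings to give r before computing scores and loadings (column-vector shapes, illustrative only):

import numpy as np

def pls_component(X, Y, P, R, i):
    XY = np.dot(X.T, Y)
    q, s, vh = np.linalg.svd(np.dot(XY.T, XY))
    q = q[:, :1]                                # dominant Y-side weight
    w = np.dot(XY, q)
    w = w / np.sqrt(np.dot(w.T, w))             # normalised X loading weight
    r = w.copy()
    for j in range(i):                          # orthogonalise against previous loadings
        r = r - np.dot(P[:, j].T, w) * R[:, j][:, np.newaxis]
    t = np.dot(X, r)                            # scores
    tt = np.dot(t.T, t).ravel()
    p = np.dot(X.T, t) / tt                     # X loadings
    q = np.dot(Y.T, t) / tt                     # Y loadings
    return w, r, t, p, q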
@ -412,13 +412,13 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=[0, 0]):
sepy = F**2
ssqy = [sepy.sum(0), sepy.sum(1)]
leverage = 1./m + ((T/tnorm)**2).sum(1)
# variances
tp= tt*pp
tq = tt*qnorm*qnorm
expvarx = r_[0, 100*tp/(X*X).sum()]
expvary = r_[0, 100*tq/(Y*Y).sum()]
if scale == 'loads':
T = T/tnorm
W = W*tnorm
@ -438,7 +438,7 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores',
of these three matrices tries to discover common directions/subspaces.
*Parameters*:
X : {array}
Main data matrix (m, n)
Y : {array}
@ -457,9 +457,9 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores',
0 : row center
1 : column center
2 : double center
*Returns*:
T : {array}
X-scores
W : {array}
@ -504,18 +504,18 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores',
Saeboe et al., LPLS-regression: a method for improved prediction and
classification through inclusion of background information on
predictor variables, J. of chemometrics and intell. laboratory syst.
Martens et al., Regression of a data matrix on descriptors of
both its rows and of its columns via latent variables: L-PLSR,
Computational statistics & data analysis, 2005
"""
m, n = X.shape
k, l = Y.shape
u, o = Z.shape
max_rank = min(m, n)
if center_axis != None:
xctr, yctr, zctr = center_axis
X, mnX = center(X, xctr)
@ -523,14 +523,14 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores',
Z, mnZ = center(Z, zctr)
max_rank = max_rank -1
assert (a_max > 0 and a_max < max_rank), "Number of comp error:\
tried: %d, max_rank: %d" %(a_max, max_rank)
# initial variance
varX = (X**2).sum()
varY = (Y**2).sum()
varZ = (Z**2).sum()
# initialize
U = empty((k, a_max))
Q = empty((l, a_max))
T = zeros((m, a_max))
@ -600,7 +600,7 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores',
else:
k = w
l = dot(G, w)
U[:,a] = u.ravel()
W[:,a] = w.ravel()
P[:,a] = p.ravel()
@ -617,11 +617,11 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores',
var_x[a] = pow(E, 2).sum()
var_y[a] = pow(F, 2).sum()
var_z[a] = pow(G, 2).sum()
B[a] = dot(dot(W[:,:a+1], inv(dot(P[:,:a+1].T, W[:,:a+1]))), Q[:,:a+1].T)
#b0[a] = mnY - dot(mnX, B[a])
# variance explained
evx = 100.*(1 - var_x/varX)
evy = 100.*(1 - var_y/varY)
@ -635,8 +635,8 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, center_axis=[2, 0, 2], scale='scores',
knorm = apply_along_axis(vnorm, 0, K)
L = L*knorm
K = K/knorm
return {'T':T, 'W':W, 'P':P, 'Q':Q, 'U':U, 'L':L, 'K':K, 'B':B, 'E': E, 'F': F, 'G': G, 'evx':evx, 'evy':evy, 'evz':evz,'mnx': mnX, 'mny': mnY, 'mnz': mnZ}
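A small sketch of how the regression matrix stored in B[a] above is assembled from weights and loadings, and how it is used for prediction (shapes and data are made up):

import numpy as np

n, l, a = 5, 2, 3
W = np.random.randn(n, a)      # X loading weights
P = np.random.randn(n, a)      # X loadings
Q = np.random.randn(l, a)      # Y loadings

# B for a components: W (P'W)^-1 Q'
B = np.dot(np.dot(W, np.linalg.inv(np.dot(P.T, W))), Q.T)

# predict new, centered samples xi and add back the calibration Y-mean ym
xi = np.random.randn(4, n)
ym = np.zeros((1, l))
yhat = ym + np.dot(xi, B)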
def lpls_predict(model_dict, x, aopt):
"""Predict lpls reponses from existing model on new data.
@ -646,25 +646,25 @@ def lpls_predict(model_dict, x, aopt):
except:
x = atleast_2d(x.shape)
m, n = x.shape
if 'B0' in model_dict.keys():
y = model_dict['B0'] + dot()
def vnorm(a):
"""Returns the norm of a vector.
*Parameters*:
a : {array}
Input data, 1-dim, or column vector (m, 1)
*Returns*:
a_norm : {array}
Norm of input vector
"""
return msqrt(dot(a.T,a))
@ -676,7 +676,7 @@ def center(a, axis):
a : {array}
Input data
axis : {integer}
Which centering to perform.
0 = col center, 1 = row center, 2 = double center
-1 = nothing
@ -707,16 +707,16 @@ def center(a, axis):
else:
mn = a.mean()*ones(a.shape)
return a - mn, mn
if axis == -1:
mn = zeros((1,a.shape[1],))
mn = tile(mn, (a.shape[0], 1))
#mn = tile(mn, (a.shape[0], 1))
elif axis == 0:
mn = a.mean(0)[newaxis]
mn = tile(mn, (a.shape[0], 1))
#mn = tile(mn, (a.shape[0], 1))
elif axis == 1:
mn = a.mean(1)[:,newaxis]
mn = tile(mn, (1, a.shape[1]))
#mn = tile(mn, (1, a.shape[1]))
elif axis == 2:
#fixme: double centering returns column mean as loc-vector, ok?
mn = a.mean(0)[newaxis] + a.mean(1)[:,newaxis] - a.mean()
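A compact sketch of the centering variants described above (0: column, 1: row, 2: double, -1: none), returning the centered array together with the location estimate:

import numpy as np

def center_sketch(a, axis):
    if axis == -1:
        mn = np.zeros(a.shape)
    elif axis == 0:
        mn = np.tile(a.mean(0)[np.newaxis], (a.shape[0], 1))
    elif axis == 1:
        mn = np.tile(a.mean(1)[:, np.newaxis], (1, a.shape[1]))
    elif axis == 2:                            # double centering
        mn = a.mean(0)[np.newaxis] + a.mean(1)[:, np.newaxis] - a.mean()
    else:
        raise ValueError("axis must be -1, 0, 1 or 2")
    return a - mn, mn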
@ -774,7 +774,7 @@ def _scale(a, axis):
a : {array}
Input data
axis : {integer}
Which scaling to perform.
0 = column, 1 = row, -1 = nothing
*Returns*:
@ -784,7 +784,7 @@ def _scale(a, axis):
mn : {array}
Scaling vector/matrix
"""
if axis == -1:
sc = zeros((a.shape[1],))
elif axis == 0:
@ -817,21 +817,20 @@ def esvd(data, a_max=None):
Singular values
v : {array}
Left hand eigenvectors
*Notes*:
Uses Arnoldi iterations for the symmetric eigendecomp (ARPACK)
"""
m, n = data.shape
if m >= n:
if m > n:
kernel = dot(data.T, data)
if a_max == None:
a_max = n - 1
s, v = arpack.eigen_symmetric(kernel, k=a_max, which='LM',
maxiter=200, tol=1e-5)
maxiter=500, tol=1e-7)
s = s[::-1]
v = v[:,::-1]
#u, s, vt = svd(kernel)
@ -841,9 +840,9 @@ def esvd(data, a_max=None):
else:
kernel = dot(data, data.T)
if a_max == None:
a_max = m -1
a_max = m - 1
s, u = arpack.eigen_symmetric(kernel, k=a_max, which='LM',
maxiter=200, tol=1e-5)
maxiter=500, tol=1e-7)
s = s[::-1]
u = u[:,::-1]
#u, s, vt = svd(kernel)
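A hedged sketch of the kernel speed-up esvd relies on: eigendecompose the smaller of X'X or XX' and recover the other side's singular vectors; numpy's dense eigh stands in for the ARPACK call:

import numpy as np

def esvd_sketch(data, a_max):
    m, n = data.shape
    if m > n:                                   # work on the (n, n) kernel
        s2, v = np.linalg.eigh(np.dot(data.T, data))
        s2, v = s2[::-1][:a_max], v[:, ::-1][:, :a_max]
        s = np.sqrt(np.clip(s2, 0, None))
        u = np.dot(data, v) / s                 # left vectors from right ones
    else:                                       # work on the (m, m) kernel
        s2, u = np.linalg.eigh(np.dot(data, data.T))
        s2, u = s2[::-1][:a_max], u[:, ::-1][:, :a_max]
        s = np.sqrt(np.clip(s2, 0, None))
        v = np.dot(data.T, u) / s               # right vectors from left ones
    return u, s, v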

View File

@ -21,7 +21,7 @@ class Model(object):
def __init__(self, name="johndoe"):
self.name = name
self.options = {}
def save(self, filename='pca.ml'):
pass
@ -46,7 +46,7 @@ class PCA(Model):
self._x = x
self.amax = amax
self.aopt = amax
# properties
def amax():
doc = "maximum number of components"
@ -77,7 +77,7 @@ class PCA(Model):
del self._tot_var
return locals()
tot_var = property(**tot_var())
def scores():
doc = "pca scores"
def fget(self):
@ -94,7 +94,7 @@ class PCA(Model):
del self._core_scores
return locals()
scores = property(**scores())
def loadings():
doc = "pca loadings"
def fget(self):
@ -111,7 +111,7 @@ class PCA(Model):
self._loadings = p
return locals()
loadings = property(**loadings())
def singvals():
doc = "Singular values"
def fget(self):
@ -128,7 +128,7 @@ class PCA(Model):
del self._singvals
return locals()
singvals = property(**singvals())
def x():
doc = "x is readonly, may not be deleted"
def fget(self):
@ -154,12 +154,12 @@ class PCA(Model):
del self._xc
return locals()
xadd = property(**xadd())
def xc():
doc = "mean_centered input data"
def fget(self):
if not hasattr(self, "_xc"):
self._xc = self.x + self.xadd
return self._xc
def fset(self, xc):
self._xc = xc
@ -186,7 +186,7 @@ class PCA(Model):
del self._xw
return locals()
xw = property(**xw())
def explained_variance():
doc = "explained variance"
def fget(self):
@ -215,7 +215,7 @@ class PCA(Model):
del self._residuals
return locals()
residuals = property(**residuals())
def leverage():
doc = "objects leverage"
def fget(self):
@ -254,7 +254,7 @@ class PCA(Model):
self._column_metric = scale(self.xc, axis=1)
return self._column_metric
def fset(self, w):
self._column_metric = w
# update model
def fdel(self):
@ -263,7 +263,7 @@ class PCA(Model):
del self._xd
return locals()
column_metric = property(**column_metric())
def blm_update(self, a, b):
pass
@ -281,8 +281,8 @@ class PCA(Model):
def reweight(self, w):
pass
if __name__ == "__main__":
from numpy.random import rand
X = rand(4,10)

View File

@ -22,9 +22,9 @@ def hotelling(Pcv, P, p_center='median', cov_center='median',
used in multivariate hypothesis testing. In order to avoid small-variance
samples becoming significant, this version allows borrowing variance
from the pooled covariance.
*Parameters*:
Pcv : {array}
Crossvalidation segments of parameter
P : {array}
@ -39,9 +39,9 @@ def hotelling(Pcv, P, p_center='median', cov_center='median',
Rotate sub-segments toward calibration model.
strict : {boolean}, optional
Only rotate 90 degree
*Returns*:
tsq : {array}
Hotellings T^2 estimate
@ -50,19 +50,19 @@ def hotelling(Pcv, P, p_center='median', cov_center='median',
Gidskehaug et al., A framework for significance analysis of
gene expression data using dimension reduction methods, BMC
bioinformatics, 2007
*Notes*
The rotational freedom in the solution of bilinear
models may require a rotation onto the calibration
model. One way of doing that is procrustes rotation.
"""
m, n = P.shape
n_sets, n, amax = Pcv.shape
T_sq = empty((n,), dtype='d')
Cov_i = zeros((n, amax, amax), dtype='d')
# rotate sub_models to full model
if crot:
for i, Pi in enumerate(Pcv):
@ -77,20 +77,15 @@ def hotelling(Pcv, P, p_center='median', cov_center='median',
P_ctr = P
for i in xrange(n):
Pi = Pcv[:,i,:] # (n_sets x amax)
Pi_ctr = P_ctr[i,:] # (1 x amax)
#Pim = (Pi - Pi_ctr)*msqrt(n_sets-1)
#Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim)
Pim = (Pi - Pi_ctr)
Cov_i[i] = dot(Pim.T, Pim)
Pim = (Pi - Pi_ctr)*msqrt(n_sets-1)
Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim)
if cov_center == 'median':
Cov_p = median(Cov_i)
elif cov_center == 'mean':
else cov_center == 'mean':
Cov_p = Cov.mean(0)
else:
print "Pooled covariance est. invalid, using median"
print cov_center
Cov_p = median(Cov_i)
reg_cov = (1. - alpha)*Cov_i + alpha*Cov_p
for i in xrange(n):
Pc = P_ctr[i,:]
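A hedged sketch of the per-variable regularised Hotelling T^2 computed here: each variable's cross-validated loadings are compared against a center, with the variable's covariance shrunk toward the pooled covariance by alpha (rotation and the p_center/cov_center options are left out):

import numpy as np

def hotelling_sketch(Pcv, P, alpha=0.3):
    # Pcv : (n_sets, n_vars, amax) jack-knife loading segments
    # P   : (n_vars, amax) calibration loadings used as center
    n_sets, n_vars, amax = Pcv.shape
    Cov_i = np.empty((n_vars, amax, amax))
    for i in range(n_vars):
        Pim = (Pcv[:, i, :] - P[i, :]) * np.sqrt(n_sets - 1)
        Cov_i[i] = np.dot(Pim.T, Pim) / n_sets
    Cov_p = np.median(Cov_i, 0)                 # pooled covariance estimate
    reg_cov = (1. - alpha) * Cov_i + alpha * Cov_p
    tsq = np.empty(n_vars)
    for i in range(n_vars):
        Pc = P[i, :]
        tsq[i] = np.dot(np.dot(Pc, np.linalg.inv(reg_cov[i])), Pc)
    return tsq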
@ -105,7 +100,7 @@ def procrustes(a, b, strict=True, center=False, force_norm=False, verbose=False)
onto another by minimising the squared error.
*Parameters*:
a : {array}
Input array, stationary
b : {array}
@ -127,9 +122,9 @@ def procrustes(a, b, strict=True, center=False, force_norm=False, verbose=False)
*Reference*:
Schonemann, A generalized solution of the orthogonal Procrustes
problem, Psychometrika, 1966
"""
if center:
mn_a = a.mean(0)
a = a - mn_a
@ -145,7 +140,7 @@ def procrustes(a, b, strict=True, center=False, force_norm=False, verbose=False)
u, s, vt = svd(dot(b.T, a))
Cm = dot(u, vt) # Cm: orthogonal rotation matrix
if strict:
Cm = _ensure_strict(Cm)
b_rot = dot(b, Cm)
if verbose:
fit = ((b - b_rot)**2).sum()
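A self-contained sketch of the orthogonal Procrustes step used above: find the rotation Cm minimising ||a - b Cm||^2 from the SVD of b'a (Schonemann, 1966); the strict/center/force_norm options are omitted:

import numpy as np

def procrustes_sketch(a, b):
    u, s, vt = np.linalg.svd(np.dot(b.T, a))
    Cm = np.dot(u, vt)                  # orthogonal rotation matrix
    return np.dot(b, Cm), Cm

# b is a rotated copy of a, so b_rot should recover a
a = np.random.randn(10, 3)
rot = np.linalg.qr(np.random.randn(3, 3))[0]
b = np.dot(a, rot)
b_rot, Cm = procrustes_sketch(a, b)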
@ -158,7 +153,7 @@ def procrustes(a, b, strict=True, center=False, force_norm=False, verbose=False)
def _ensure_strict(C, only_flips=True):
"""Ensure that a rotation matrix does only 90 degree rotations.
In multiplication with pcs this allows flips and reordering.
if only_flips is True there will only be flips allowed
@ -173,13 +168,13 @@ def _ensure_strict(C, only_flips=True):
C_rot : {array}
Restricted rotation matrix
*Notes*:
This function is not ready for use. Use (only_flips=True).
That is, for more than two components, the rotation matrix
has a tendency to be unstable (det(Cm)>1), when rounding is used.
"""
if only_flips:
C = eye(C.shape[0])*sign(C)
@ -199,9 +194,9 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
The response (Y) is randomly permuted, and the number of false positives
is registered by comparing hotellings T2 statistics of the calibration model.
*Parameters*:
X : {array}
Main data matrix (m, n)
Y : {array}
@ -237,14 +232,14 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
nsets : {integer}
Number of crossvalidation segments
*Reference*:
Gidskehaug et al., A framework for significance analysis of
gene expression data using dimension reduction methods, BMC
bioinformatics, 2007
"""
m, n = X.shape
k, nz = Z.shape
assert(n==nz)
@ -255,7 +250,7 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
Y = atleast_2d(Y).T
my, l = Y.shape
assert(m==my)
pert_tsq_x = zeros((n, n_iter), dtype='d') # (nxvars x n_subsets)
pert_tsq_z = zeros((k, n_iter), dtype='d') # (nzvars x n_subsets)
@ -264,7 +259,7 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
Wc, Lc = lpls_jk(X, Y, Z ,aopt, zorth=zorth)
cal_tsq_x = hotelling(Wc, dat['W'], alpha=alpha)
cal_tsq_z = hotelling(Lc, dat['L'], alpha=alpha)
print "morn"
# Perturbations
index = arange(m)
for i in range(n_iter):
@ -275,7 +270,7 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha)
# no reason to borrow variance in dag (alpha ->some small value)
pert_tsq_z[:,i] = hotelling(Li, dat['L'], alpha=0.01)
return cal_tsq_z, pert_tsq_z, cal_tsq_x, pert_tsq_x
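A hedged outline of the permutation scheme lpls_qvals and pls_qvals share: refit on row-permuted responses and collect the resulting T^2 statistics for later FDR estimation; stat is a stand-in for the fit-then-hotelling step, not the package's API:

import numpy as np

def permutation_tsq(X, Y, stat, n_iter=20, seed=None):
    # stat(X, Y) -> (n_vars,) array of T^2 values for one fit
    rng = np.random.RandomState(seed)
    cal_tsq = stat(X, Y)                         # calibration statistics
    pert_tsq = np.zeros((cal_tsq.shape[0], n_iter))
    index = np.arange(X.shape[0])
    for i in range(n_iter):
        indi = rng.permutation(index)            # permute sample order in Y
        pert_tsq[:, i] = stat(X, Y[indi, :])
    return cal_tsq, pert_tsq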
@ -286,9 +281,9 @@ def pls_qvals(X, Y, aopt, alpha=.3, n_iter=20,p_center='med', cov_center=median,
The response (Y) is randomly permuted, and the number of false positives
is registered by comparing hotellings T2 statistics of the calibration model.
*Parameters*:
X : {array}
Main data matrix (m, n)
Y : {array}
@ -318,14 +313,14 @@ def pls_qvals(X, Y, aopt, alpha=.3, n_iter=20,p_center='med', cov_center=median,
Only rotate 90 degree
nsets : {integer}
Number of crossvalidation segments
*Reference*:
Gidskehaug et al., A framework for significance analysis of
gene expression data using dimension reduction methods, BMC
bioinformatics, 2007
"""
m, n = X.shape
try:
my, l = Y.shape
@ -341,7 +336,7 @@ def pls_qvals(X, Y, aopt, alpha=.3, n_iter=20,p_center='med', cov_center=median,
Wc = pls_jk(X, Y , aopt)
cal_tsq_x = hotelling(Wc, dat['W'], alpha=alpha)
# Perturbations
pert_tsq_x = zeros((n, n_iter), dtype='d') # (nxvars x n_subsets)
index = arange(m)
@ -351,7 +346,7 @@ def pls_qvals(X, Y, aopt, alpha=.3, n_iter=20,p_center='med', cov_center=median,
dat = pls(X, Y[indi,:], aopt, scale='loads', center_axis=center_axis)
Wi = pls_jk(X, Y[indi,:], aopt, nsets=nsets, center_axis=center_axis)
pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha)
return cal_tsq_x, pert_tsq_x
@ -362,7 +357,7 @@ def _fdr(tsq, tsqp, loc_method=median):
Fdr is a method used in multiple hypothesis testing to correct for multiple
comparisons. It controls the expected proportion of incorrectly rejected null
hypotheses (type I errors) in a list of rejected hypotheses.
*Parameters*:
tsq : {array}
@ -372,14 +367,14 @@ def _fdr(tsq, tsqp, loc_method=median):
loc_method : {py_func}
Location method
*Returns*:
fdr : {array}
False discovery rate
*Notes*:
This is an internal function for use in fdr estimation of jack-knifed
perturbed blm parameters.
@ -403,4 +398,3 @@ def _fdr(tsq, tsqp, loc_method=median):
fd_rate = fp/n_signif
fd_rate[fd_rate>1] = 1
return fd_rate
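A hedged sketch of the permutation-based false discovery rate estimate _fdr describes: for each candidate threshold taken from the calibration statistics, compare the typical number of permuted statistics exceeding it to the number of real calls (details of the package's implementation may differ):

import numpy as np

def fdr_sketch(tsq, tsq_perm, loc=np.median):
    # tsq      : (n_vars,) calibration T^2 statistics
    # tsq_perm : (n_vars, n_perm) statistics from permuted responses
    thresholds = np.sort(tsq)[::-1]
    fd_rate = np.empty(thresholds.shape)
    for j, t in enumerate(thresholds):
        n_signif = (tsq >= t).sum()              # calls on the real data
        fp = loc((tsq_perm >= t).sum(0))         # typical calls per permutation
        fd_rate[j] = fp / float(max(n_signif, 1))
    fd_rate[fd_rate > 1] = 1
    return fd_rate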