Added symmetric poweriterations to sandbox

This commit is contained in:
Arnar Flatberg 2007-12-12 22:08:55 +00:00
parent 0956581c42
commit 1103245d85
10 changed files with 851 additions and 172 deletions

View File

@ -30,7 +30,7 @@ class get_matvec:
if isinstance(obj, sb.ndarray):
self.callfunc = self.type1
return
meth = getattr(obj,self.methname,None)
meth = getattr(obj, self.methname, None)
if not callable(meth):
raise ValueError, "Object must be an array "\
"or have a callable %s attribute." % (self.methname,)
@ -48,12 +48,12 @@ class get_matvec:
return sb.dot(self.obj.A, x)
def type2(self, x):
return self.obj(x,*self.args)
return self.obj(x, *self.args)
def eigen(A,k=6,M=None,ncv=None,which='LM',
maxiter=None,tol=0, return_eigenvectors=True):
""" Return k eigenvalues and eigenvectors of the matrix A.
def eigen(A, k=6, M=None, ncv=None, which='LM',
maxiter=None, tol=0, return_eigenvectors=True, v0=None):
"""Return k eigenvalues and eigenvectors of the matrix A.
Solves A * x[i] = w[i] * x[i], the standard eigenvalue problem for
w[i] eigenvalues with corresponding eigenvectors x[i].
@ -69,6 +69,7 @@ def eigen(A,k=6,M=None,ncv=None,which='LM',
M -- (Not implemented)
A symmetric positive-definite matrix for the generalized
eigenvalue problem A * x = w * M * x
v0 -- Initial starting solution (n x 1)
Outputs:
@ -99,8 +100,8 @@ def eigen(A,k=6,M=None,ncv=None,which='LM',
"""
try:
n,ny=A.shape
n==ny
n, ny = A.shape
n == ny
except:
raise AttributeError("matrix is not square")
if M is not None:
@ -108,10 +109,10 @@ def eigen(A,k=6,M=None,ncv=None,which='LM',
# some defaults
if ncv is None:
ncv=2*k+1
ncv=min(ncv,n)
if maxiter==None:
maxiter=n*10
ncv = 2*k + 1
ncv = min(ncv, n)
if maxiter == None:
maxiter = n*10
# guess type
resid = sb.zeros(n,'f')
@ -129,7 +130,7 @@ def eigen(A,k=6,M=None,ncv=None,which='LM',
raise ValueError("k must be less than rank(A), k=%d"%k)
if maxiter <= 0:
raise ValueError("maxiter must be positive, maxiter=%d"%maxiter)
whiches=['LM','SM','LR','SR','LI','SI']
whiches = ['LM','SM','LR','SR','LI','SI']
if which not in whiches:
raise ValueError("which must be one of %s"%' '.join(whiches))
if ncv > n or ncv < k:
@ -141,17 +142,26 @@ def eigen(A,k=6,M=None,ncv=None,which='LM',
eigextract = _arpack.__dict__[ltr+'neupd']
matvec = get_matvec(A)
v = sb.zeros((n,ncv),typ) # holds Ritz vectors
resid = sb.zeros(n,typ) # residual
workd = sb.zeros(3*n,typ) # workspace
workl = sb.zeros(3*ncv*ncv+6*ncv,typ) # workspace
iparam = sb.zeros(11,'int') # problem parameters
ipntr = sb.zeros(14,'int') # pointers into workspaces
info = 0
v = sb.zeros((n, ncv), typ) # holds Ritz vectors
if v0 == None:
resid = sb.zeros(n, typ) # residual
info = 0
else: # starting vector is given
nn, kk = v0.shape
if nn != n:
raise ValueError("starting vector must be: (%d, 1), got: (%d, %d)" %(n, nn, kk))
resid = v0[:,0].astype(typ)
info = 1
workd = sb.zeros(3*n, typ) # workspace
workl = sb.zeros(3*ncv*ncv+6*ncv, typ) # workspace
iparam = sb.zeros(11, 'int') # problem parameters
ipntr = sb.zeros(14, 'int') # pointers into workspaces
ido = 0
if typ in 'FD':
rwork = sb.zeros(ncv,typ.lower())
rwork = sb.zeros(ncv, typ.lower())
# only supported mode is 1: Ax=lx
ishfts = 1
@ -173,9 +183,9 @@ def eigen(A,k=6,M=None,ncv=None,which='LM',
if (ido == -1 or ido == 1):
# compute y = A * x
xslice = slice(ipntr[0]-1, ipntr[0]-1+n)
yslice = slice(ipntr[1]-1, ipntr[1]-1+n)
workd[yslice]=matvec(workd[xslice])
xslice = slice(ipntr[0] - 1, ipntr[0] - 1 + n)
yslice = slice(ipntr[1] - 1, ipntr[1] - 1 + n)
workd[yslice] = matvec(workd[xslice])
else: # done
break
@ -233,7 +243,7 @@ def eigen(A,k=6,M=None,ncv=None,which='LM',
def eigen_symmetric(A,k=6,M=None,ncv=None,which='LM',
maxiter=None,tol=0, return_eigenvectors=True):
maxiter=None,tol=0, return_eigenvectors=True, v0=None):
""" Return k eigenvalues and eigenvectors of the real symmetric matrix A.
Solves A * x[i] = w[i] * x[i], the standard eigenvalue problem for
@ -253,6 +263,8 @@ def eigen_symmetric(A,k=6,M=None,ncv=None,which='LM',
A symmetric positive-definite matrix for the generalized
eigenvalue problem A * x = w * M * x
v0 -- Starting vector (n, 1)
Outputs:
w -- An real array of k eigenvalues
@ -325,12 +337,22 @@ def eigen_symmetric(A,k=6,M=None,ncv=None,which='LM',
matvec = get_matvec(A)
v = sb.zeros((n,ncv),typ)
resid = sb.zeros(n,typ)
if v0 == None:
resid = sb.zeros(n, typ) # residual
info = 0
else: # starting solution is given
nn, kk = v0.shape
if nn != n:
raise ValueError("starting vectors must be: (%d, %d), got: (%d, %d)" %(n, k, nn, kk))
resid = v0[:,0].astype(typ)
info = 1
#resid = sb.zeros(n,typ)
workd = sb.zeros(3*n,typ)
workl = sb.zeros(ncv*(ncv+8),typ)
iparam = sb.zeros(11,'int')
ipntr = sb.zeros(11,'int')
info = 0
#info = 0
ido = 0
# only supported mode is 1: Ax=lx
@ -341,7 +363,6 @@ def eigen_symmetric(A,k=6,M=None,ncv=None,which='LM',
iparam[2] = maxiter
iparam[6] = mode1
while True:
ido,resid,v,iparam,ipntr,info =\
eigsolver(ido,bmat,which,k,tol,resid,v,iparam,ipntr,

View File

@ -2,16 +2,149 @@
The primary use is crossvalidation.
"""
__all__ = ['lpls_val', 'pls_jk', 'lpls_jk']
__all__ = ['pca_val', 'pls_val', 'lpls_val', 'pls_jk', 'lpls_jk']
__docformat__ = "restructuredtext en"
from numpy import dot,empty,zeros,sqrt,atleast_2d,argmax,asarray,median,\
array_split,arange, isnan, any
array_split,arange, isnan, any,newaxis
from numpy.random import shuffle
from engines import pls
from engines import pls, pca
from engines import nipals_lpls as lpls
def pca_val(a, a_max, nsets=None, center_axis=[0]):
    """Returns error estimate of crossvalidated PCA.

    Entries of ``a`` are left out along diagonals (see ``diag_cv``),
    replaced by the average of their row and column mean, and then
    predicted from a PCA model fitted to the imputed matrix.

    *Parameters*:

        a : {array}
            data matrix (n x m). It is modified temporarily while a
            segment is fitted, but always restored before returning
            (also when the PCA fit raises).
        a_max : {integer}
            Maximum number of components in model.
        nsets : {integer}
            number of segments, defaults to the number of rows (n)
        center_axis:
            Centering, passed on to ``pca``

    *Returns*:

        rmsep : {array}
            Root of the squared prediction error, averaged over
            samples, for each component and x-variable (a_max, m)
        xhat : {array}
            Crossvalidated predictions of the (centered) a (a_max, n, m)
        aopt : {integer}
            Zero-based index along the component axis of rmsep with the
            lowest mean error
        err : {array}
            Squared prediction errors against the column centered data
            (a_max, n, m)
        rmsep2 : {array}
            As rmsep, but with the errors measured against the data
            centered with the model mean ``dat['mnx']`` (a_max, m)

    *Notes*:

        - Crossvalidation method is currently set to random blocks of diagonals.
    """
    n, m = a.shape
    if nsets is None:
        nsets = n

    err = zeros((a_max, n, m), dtype=a.dtype)
    err_mn = zeros((a_max, n, m), dtype=a.dtype)
    xhat = zeros((a_max, n, m), dtype=a.dtype)
    mn_a = .5*(a.mean(0) + a.mean(1)[:,newaxis])

    for val in diag_cv(a.shape, nsets):
        old_values = a.take(val)
        new_values = mn_a.take(val)
        # untouched copy of the data, used as the reference ("truth")
        b = a.copy()
        # impute mean values
        a.put(val, new_values)
        try:
            dat = pca(a, a_max, mode='normal', center_axis=center_axis)
        finally:
            # put original values back, even if the fit failed, so the
            # caller's array is never left with imputed entries
            a.put(val, old_values)
        Ti, Pi = dat['T'], dat['P']
        bc = b - dat['mnx']
        bc2 = b - b.mean(0)
        for j in range(a_max):
            # predict the imputed values using the first j+1 components
            a_pred = dot(Ti[:,:j+1], Pi[:,:j+1].T).take(val)
            a_true = bc2.take(val)
            err[j,:,:].put(val, (a_true - a_pred)**2)
            err_mn[j,:,:].put(val, (bc.take(val) - a_pred)**2)
            xhat[j,:,:].put(val, a_pred)

    rmsep = sqrt(err).mean(1) # take mean over samples
    rmsep2 = sqrt(err_mn).mean(1)
    aopt = rmsep.mean(-1).argmin()
    return rmsep, xhat, aopt, err, rmsep2
def pls_val(X, Y, a_max=2, nsets=None, center_axis=[0,0], verbose=False):
    """Performs crossvalidation for generalisation error in pls.

    *Parameters*:

        X : {array}
            Main data matrix (m, n)
        Y : {array}
            External row data (m, l)
        a_max : {integer}, optional
            Maximum number of components to calculate (0, min(m,n))
        nsets : (integer), optional
            Number of crossvalidation sets, defaults to (and is capped
            at) the number of samples m
        center_axis : {array-like}, optional
            A two element array-like structure with elements in [-1,0,1,2],
            that decides the type of centering used.
            -1 : nothing
            0 : row center
            1 : column center
            2 : double center
        verbose : {boolean}, optional
            Verbosity of console output. For use in debugging.
            (Currently not used by this function.)

    *Returns*:

        rmsep : {array}
            Root mean squared error of prediction (for each y-var)
        yhat : {array}
            Estimated responses (a_max, m, l)
        error : {array}
            Prediction error from ``prediction_error`` with the '1/2'
            thresholding method
    """
    dt = X.dtype
    m, n = X.shape
    k, l = Y.shape
    assert m == k, "X (%d,%d) - Y (%d,%d) dim mismatch" %(m, n, k, l)
    # NOTE: the former X/Z dimension assert was removed; it referred to
    # names (o, p) that do not exist here and always raised NameError.
    if nsets is None:
        nsets = m
    if nsets > m:
        print("nsets (%d) is larger than number of variables (%d).\nnsets: %d -> %d" %(nsets, m, nsets, m))
        nsets = m
    # NOTE: the former "kernel boosting" call for m > n was dropped; its
    # result was overwritten by the allocation below before any use.
    Yhat = empty((a_max, k, l), dtype=dt)
    for cal, val in cv(k, nsets):
        # do the training model
        dat = pls(X[cal], Y[cal], a_max=a_max, center_axis=center_axis)
        # center test data
        xi = X[val,:] - dat['mnx']
        ym = dat['mny']
        # predictions, one regression matrix per number of components
        for a in range(a_max):
            Yhat[a,val,:] = ym + dot(xi, dat['B'][a])

    sep = (Y - Yhat)**2
    rmsep = sqrt(sep.mean(1)).T
    # todo: need a better support for classification error
    error = prediction_error(Yhat, Y, method='1/2')
    return rmsep, Yhat, error
def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=False, verbose=False):
"""Performs crossvalidation for generalisation error in lpls.
@ -72,47 +205,29 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, center_axis=[2,0,2], zorth=F
assert (alpha >= 0 and alpha<=1), "Alpha needs to be within [0,1], got: %.2f" %alpha
Yhat = empty((a_max, k, l), 'd')
for cal, val in cv(nsets, k):
for cal, val in cv(k, nsets):
# do the training model
dat = lpls(X[cal], Y[cal], Z, a_max=a_max, alpha=alpha,
center_axis=center_axis, zorth=zorth, verbose=verbose)
# center test data
if center_axis[0] != 1:
xi = X[val,:] - dat['mnx']
else:
xi = X[val] - X[cal].mean(1)[:,newaxis]
if center_axis[2] != 1:
ym = dat['mny']
else:
ym = Y[cal].mean(1)[:,newaxis]
xi = X[val,:] - dat['mnx']
ym = dat['mny'][val,:]
# predictions
for a in range(a_max):
Yhat[a,val,:] = atleast_2d(ym + dot(xi, dat['B'][a]))
#if permute:
# xcal = X[cal]
# for a in range(1,a_max,1):
# for n in range(10):
# shuffle(cal)
# dat = lpls(xcal, Y[cal], Z, a_max=a_max, alpha=alpha,
# center_axis=center_axis, verbose=verbose)
# todo: need a better support for classification error
y_is_class = Y.dtype.char.lower() in ['i','p', 'b', 'h','?']
if y_is_class:
pass
#Yhat, err = class_error(Yhat, Y)
#return Yhat, err
sep = (Y - Yhat)**2
rmsep = sqrt(sep.mean(1)).T
aopt = find_aopt_from_sep(rmsep)
#aopt = find_aopt_from_sep(rmsep)
return rmsep, Yhat, aopt
# todo: need a better support for classification error
error = prediction_error(Yhat, Y, method='1/2')
def pca_jk(a, aopt, center_axis=[0], nsets=None):
"""Returns jack-knife segements from PCA.
return rmsep, Yhat, error
def pca_jk(a, aopt, nsets=None, center_axis=[0], method='cv'):
"""Returns jack-knife segments from PCA.
*Parameters*:
@ -124,6 +239,12 @@ def pca_jk(a, aopt, center_axis=[0], nsets=None):
Centering
nsets : {integer}
number of segments
method : {'cv', 'diag', 'bs', 'bs_diag'}
Perturbation method is one of:
cv = leave samples out
diag = impute diagonals
bs = leave samples out with replacement (bootstrap)
bs_diag = impute diagonals
*Returns*:
@ -135,22 +256,32 @@ def pca_jk(a, aopt, center_axis=[0], nsets=None):
- Crossvalidation method is currently set to random blocks of samples.
"""
m, n = a.shape
if nsets == None:
nsets = a.shape[0]
nsets = m
Pcv = empty((nsets, a.shape[1], aopt), dtype=a.dtype)
mn_a = .5*(a.mean(0) + a.mean(1)[:,newaxis])
for i, (cal, val) in enumerate(cv_diag(a.shape, nsets)):
old_values = a.take(ind)
new_values = mn_a.take(ind)
a.put(ind, new_values)
dat = pca(a, aopt, mode='fast', scale='loads', center_axis=center_axis)
PP[i,:,:] = dat['P']
a.put(ind, old_values)
if method == 'diag':
mn_a = .5*(a.mean(0) + a.mean(1)[:,newaxis])
for i, val in enumerate(diag_cv(a.shape, nsets)):
old_values = a.take(val)
new_values = mn_a.take(val)
# impute mean values
a.put(val, new_values)
dat = pca(a, aopt, mode='fast', scale='loads', center_axis=center_axis)
Pcv[i,:,:] = dat['P']
# put original values back
a.put(val, old_values)
elif method == 'cv':
print "using ....cv "
for i, (cal, val) in enumerate(cv(m, nsets)):
Pcv[i,:,:] = pca(a[cal,:], aopt, mode='fast', scale='loads', center_axis = center_axis)['P']
else:
raise NotImplementedError(method)
return PP
return Pcv
def pls_jk(X, Y, a_opt, nsets=None, center_axis=True, verbose=False):
def pls_jk(X, Y, a_opt, nsets=None, center_axis=[0,0], verbose=False):
""" Returns jack-knife segements of W.
*Parameters*:
@ -187,9 +318,7 @@ def pls_jk(X, Y, a_opt, nsets=None, center_axis=True, verbose=False):
for i, (cal, val) in enumerate(cv(k, nsets)):
if verbose:
print "Segment number: %d" %i
dat = pls(X[cal,:], Y[cal,:], a_opt, scale='loads', mode='fast', center_axis=[0, 0])
if any(isnan(dat['W'])):
1/0
dat = pls(X[cal,:], Y[cal,:], a_opt, scale='loads', mode='fast', center_axis=center_axis)
Wcv[i,:,:] = dat['W']
return Wcv
@ -257,18 +386,18 @@ def lpls_jk(X, Y, Z, a_opt, nsets=None, xz_alpha=.5, center_axis=[2,0,2], zorth=
return WWx, WWz
def find_aopt_from_sep(sep, method='vanilla'):
def find_aopt_from_sep(err, method='vanilla'):
"""Returns an estimate of optimal number of components.
The estimate is based on the squared error of prediction from
The estimate is based on the error of prediction from
crossvalidation. This is pretty much wild guessing and it is
recomended to inspect model parameters and prediction errors
closely before deciding on the optimal number of components.
*Parameters*:
sep : {array}
Squared error of prediction
err : {array}
Error of prediction
method : ['vanilla', '75perc']
Mehtod used to estimate optimal number of components
@ -280,22 +409,23 @@ def find_aopt_from_sep(sep, method='vanilla'):
if method == 'vanilla':
# min rmsep
rmsecv = sqrt(sep.mean(0))
rmsecv = sqrt(err.mean(0))
return rmsecv.argmin() + 1
elif method == '75perc':
prct = .75 #percentile
ind = 1.*sep.shape[0]*prct
med = median(sep)
prc_75 = []
for col in sep.T:
col = sorted(col)
prc_75.append(col[int(ind)])
prc_75 = asarray(prc_75)
for i in range(1, sep.shape[1], 1):
if med[i-1]<prc_75[i]:
return i
return len(med)
raise NotImplementedError
#prct = .75 #percentile
#ind = 1.*err.shape[0]*prct
#med = median(err)
#prc_75 = []
#for col in err.T:
# col = sorted(col)
# prc_75.append(col[int(ind)])
#prc_75 = asarray(prc_75)
#for i in range(1, err.shape[1], 1):
# if med[i-1]<prc_75[i]:
# return i
#return len(med)
def cv(N, K, randomise=True, sequential=False):
"""Generates K (training, validation) index pairs.
@ -351,13 +481,29 @@ def cv(N, K, randomise=True, sequential=False):
validation = [i for i in index if i % K == k]
yield training, validation
def diag_cv(shape, nsets=9):
def diag_cv(shape, nsets=9, randomise=True):
"""Generates K (training, validation) index pairs.
*Parameters*:
N : {integer}
alpha -- scalar, approx. portion of data perturbed
shape : {tuple}
Array shape
nsets : {integer}
Number of cv sets
randomise : {boolean}
Randomise diagonal index
*Returns*:
training : {array-like}
training-indices
validation : {array-like}
validation-indices
*Notes*:
This index is based on the full index (raveled row-major ordering).
It extracts along diagonals to ensure balanced removal along both axis
"""
try:
m, n = shape
@ -365,59 +511,195 @@ def diag_cv(shape, nsets=9):
raise ValueError("shape needs to be a two-tuple")
if nsets>m or nsets>n:
msg = "You may not use more subsets than max(n_rows, n_cols)"
raise ValueError, msg
nsets = min(m, n)
nm = n*m
index = arange(nm)
n_ind = arange(n)
shuffle(n_ind) # random start diag
n_ind = arange(n+1)
#shuffle(n_ind) # random start diag
start_inds = array_split(n_ind, nsets)
for v in range(nsets):
validation = []
validation = set()
for start in start_inds[v]:
ind = arange(start+v, nm, n+1)
[validation.append(i) for i in ind]
training = [j for j in index if j not in validation]
yield training, validation
ind = arange(start, nm, n+1)
validation.update(ind)
#training = [j for j in index if j not in validation]
yield list(validation)
def class_error(y_hat, y, method='vanilla'):
""" Not used.
"""
a_opt, k, l = y_hat.shape
y_hat_c = zeros((k, l), dtype='d')
if method == vanilla:
pass
for a in range(a_opt):
for i in range(k):
y_hat_c[a, val, argmax(y_hat[a,val,:])] = 1.0
err = 100*((y_hat_c + y) == 2).sum(1)/y.sum(0).astype('d')
return y_hat_c, err
def prediction_error(y_hat, y, method='squared'):
"""Loss function on multiclass Y.
Assumes y is a binary dummy class matrix (samples, classes)
"""
k, l = y.shape
a_max, kk, ll = y_hat.shape
error = empty((a_max, l))
for a in range(a_max):
yha = y_hat[a, :, :]
if method == 'hinge':
err = zeros((k, l))
for j in range(l):
for i in range(k):
if y[i,j] == 1:
if yha[i, j] >= 1:
err[i,j] = 0
else:
err[i,j] = abs(y[i,j] - yha[i,j])
elif y[i,j] == 0:
if yha[i, j] <= 0:
err[i,j] = 0
else:
err[i,j] = abs(y[i,j] - yha[i,j])
if method == 'hinge':
pass
elif method == 'smooth_hinge':
z = 90
elif method == 'abs':
err = abs(y - y_hat)
elif method == 'squared':
err = (y - y_hat)**2
elif method == '0/1':
pred = zeros_like(y_hat)
for i, row in enumerate(y_hat):
largest = row.argsort()[-1]
pred[i, largest] = 1.
err = abs(y - pred)
elif method == '1/2':
y_hat[y_hat>.5] = 1
y_hat[y_hat<.5] = 0
err = abs(y - y_hat)
elif method == 'smooth_hinge':
err = zeros((k, l))
for j in range(l):
for i in range(k):
if y[i,j] == 1:
if yha[i, j] >= 1:
err[i,j] = 0
elif yha[i,j] < 1 and yha[i,j] > 0:
err[i,j] = abs(y[i,j] - yha[i,j])
else:
err[i,j] = 1
return err
elif y[i,j] == 0:
if yha[i, j] <= 0:
err[i,j] = 0
elif yha[i,j] < 1 and yha[i,j] > 0:
err[i,j] = abs(y[i,j] - yha[i,j])
else:
err[i,j] = 1
elif method == 'abs':
err = abs(y - yha)
elif method == 'squared':
err = (y - yha)**2
elif method == '0/1':
pred = zeros((k, l))
for i, row in enumerate(yha):
largest = row.argsort()[-1]
pred[i, largest] = 1.
err = abs(y - pred)
elif method == '1/2':
yh = yha.copy()
yh[yha>.5] = 1
yh[yha<.5] = 0
err = abs(y - yh)
else:
raise ValueError("Option: %s (method) not valid" %method)
error[a,:] = err.mean(0)
return error
def _wkernel_pls_val(X, Y, a_max, n_blocks=None):
"""Returns rmsep and aopt for pls tailored for wide X.
The error of cross validation is calculated
based on random block cross-validation. With number of blocks equal to
number of samples [default] gives leave-one-out cv.
The pls model is based on the simpls algorithm for wide X, an is quite
fast in very high dimensional X data.
*Parameters*:
X : ndarray
column centered data matrix of size (samples x variables)
Y : ndarray
column centered response matrix of size (samples x responses)
a_max : scalar
Maximum number of components
n_blocks : scalar
Number of blocks in cross validation
*Returns*:
rmsep : ndarray
Root Mean Square Error of cross-validated Predictions
aopt : scalar
Guestimate of the optimal number of components
SeeAlso:
- pls_cv_val : Same output, not optimised for wide X
- w_simpls : Simpls algorithm for wide X
Notes
-----
Based (cowardly translated) on m-files from the Chemoact toolbox
X, Y inputs need to be centered (fixme: check)
Examples
--------
>>> import numpy as n
>>> X = n.array([[1., 2., 3.],[]])
>>> Y = n.array([[1., 2., 3.],[]])
>>> w_pls(X, Y, 1)
[4,5,6], 1
"""
dt = X.dtype
k, l = m_shape(Y)
PRESS = zeros((l, a_max+1), dtype=dt)
if n_blocks==None:
n_blocks = Y.shape[0]
XXt = dot(X, X.T)
V = w_pls_gen(XXt, Y, n_blocks=n_blocks, center=True)
for Din, Doi, Yin, Yout in V:
ym = -sum(Yout, 0)[newaxis]/(1.0*Yin.shape[0])
PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)
dat = w_simpls(Din, Yin, a_max)
Q, U, H = dat['Q'], dat['U'], dat['H']
That = dot(Doi, dot(U, inv(triu(dot(H.T, U))) ))
Yhat = zeros((a_max, k, l), dtype=dt)
for j in range(l):
TQ = dot(That, triu(dot(Q[j,:][:,newaxis], ones((1,a_max)))) )
Yhat[:,:,l] = TQ
E = Yout[:,j][:,newaxis] - TQ
E = E + sum(E, 0)/Din.shape[0]
PRESS[j,1:] = PRESS[j,1:] + sum(E**2, 0)
#Yhat = Yin - dot(That,Q.T)
msep = PRESS/(Y.shape[0])
#aopt = find_aopt_from_sep(msep)
return Yhat, sqrt(msep)
def _w_pls(aat, b, aopt):
""" Pls for wide matrices.
Fast pls for crossval, used in calc rmsep for wide X
There is no P or W. T is normalised
aat = centered kernel matrix
b = centered y
"""
bb = b.copy()
k, l = m_shape(b)
m, m = m_shape(aat)
U = empty((m, aopt)) # W
T = empty((m, aopt))
R = empty((m, aopt)) # R
PROJ = empty((m, aopt)) # P?
for i in range(aopt):
if has_sym:
s, q = symeig(dot(dot(b.T, aat), b), range=(l,l),overwrite=True)
else:
q, s, vh = svd(dot(dot(b.T, aat), b), full_matrices=0)
q = q[:,:1]
u = dot(b , q) #y-factor scores
U[:,i] = u.ravel()
t = dot(aat, u)
t = t/vnorm(t)
T[:,i] = t.ravel()
r = dot(aat, t)#score-weights
#r = r/vnorm(r)
R[:,i] = r.ravel()
PROJ[:,: i+1] = dot(T[:,:i+1], inv(dot(T[:,:i+1].T, R[:,:i+1])) )
if i<aopt:
b = b - dot(PROJ[:,:i+1], dot(R[:,:i+1].T, b) )
C = dot(bb.T, T)

View File

@ -8,7 +8,7 @@ __docformat__ = "restructuredtext en"
from math import sqrt as msqrt
from numpy import dot,empty,zeros,apply_along_axis,newaxis,finfo,sqrt,r_,expand_dims,\
minimum, any, isnan
minimum,any,isnan,ones,tile
from numpy.linalg import inv,svd
from scipy.sandbox import arpack
@ -79,11 +79,13 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
"""
m, n = X.shape
min_aopt = min(m, n)
max_aopt = min(m, n)
if center_axis != None:
X, mnx = center(X, center_axis[0])
min_aopt = min_aopt - 1
assert(aopt <= min_aopt)
max_aopt = max_aopt - 1
if aopt > max_aopt:
print "Using aopt: %d" %max_aopt
aopt = max_aopt
if m > (n+100) or n > (m+100):
u, s, v = esvd(X, aopt)
else:
@ -108,7 +110,7 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
P = P*s
if mode in ['fast', 'f', 'F']:
return {'T':T, 'P':P, 'aopt':aopt}
return {'T':T, 'P':P, 'aopt':aopt, 'mnx': mnx}
if mode in ['detailed', 'd', 'D']:
E = empty((aopt, m, n))
@ -135,7 +137,7 @@ def pca(X, aopt, scale='scores', mode='normal', center_axis=[0]):
expvarx = r_[0, 100*e.cumsum()/(X*X).sum()]
return {'T': T, 'P': P, 'E': E, 'evx': expvarx, 'leverage': lev, 'ssqx': ssq,
'aopt': aopt, 'eigvals': e}
'aopt': aopt, 'eigvals': e, 'mnx': mnx}
def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=[0, 0]):
""" Principal Component Regression.
@ -690,7 +692,7 @@ def center(a, axis):
### TODO ###
# Perhaps not(!) use broadcasting and return full centering
# matrix instead ?
#
# check if we have a vector
is_vec = len(a.shape) == 1
@ -698,31 +700,72 @@ def center(a, axis):
is_vec = a.shape[0] == 1 or a.shape[1] == 1
if is_vec:
if axis == 2:
warnings.warn("Double centering of vector ignored, using ordinary centering")
#warnings.warn("Double centering of vector ignored, using ordinary centering")
raise ValueError("Double centering of vector is not a valid option")
if axis == -1:
mn = 0
mn = zeros(a.shape)
else:
mn = a.mean()
mn = a.mean()*ones(a.shape)
return a - mn, mn
if axis == -1:
mn = zeros((1,a.shape[1],))
#mn = tile(mn, (a.shape[0], 1))
mn = tile(mn, (a.shape[0], 1))
elif axis == 0:
mn = a.mean(0)[newaxis]
#mn = tile(mn, (a.shape[0], 1))
mn = tile(mn, (a.shape[0], 1))
elif axis == 1:
mn = a.mean(1)[:,newaxis]
#mn = tile(mn, (1, a.shape[1]))
mn = tile(mn, (1, a.shape[1]))
elif axis == 2:
#fixme: double centering returns column mean as loc-vector, ok?
mn = a.mean(0)[newaxis] + a.mean(1)[:,newaxis] - a.mean()
return a - mn , a.mean(0)[newaxis]
return a - mn , mn
else:
raise IOError("input error: axis must be in [-1,0,1,2]")
return a - mn, mn
def inv_center(a, mn_a, axis):
    """Inverse centering.

    Adds a previously subtracted location (mean) back to a matrix.
    The addition uses broadcasting, so mn_a may either be a vector
    matching the centered axis or a full matrix of the same shape as a.

    *Parameters*:

        a : {array}
            Centered input data (m, n)
        mn_a : {array}
            Location to add, two-dimensional:
            axis 0 : (1, n) or (m, n)
            axis 1 : (m, 1) or (m, n)
            axis 2 : (m, n)
        axis : {integer}
            Which centering to invert, one of [-1, 0, 1, 2].
            -1 = nothing, a is returned unchanged

    *Returns*:

        a_uncentered : {array}
            a + mn_a

    *Raises*:

        AssertionError if the shape of mn_a does not fit a for the
        given axis.

    *Notes*

        May just as well be used to subtract a mean (just use negative mn_a)
    """
    if axis == -1:
        return a
    m, n = a.shape
    k, o = mn_a.shape
    # a mean already expanded to the full matrix is valid for every axis
    full = (k == m and o == n)
    if axis == 0:
        # one value per column: row vector (1, n)
        assert (k == 1 and o == n) or full
    elif axis == 1:
        # one value per row: column vector (m, 1)
        assert (k == m and o == 1) or full
    elif axis == 2:
        # double centering has a distinct value for every element
        assert full
    return a + mn_a
def _scale(a, axis):
""" Matrix scaling to unit variance.

View File

@ -14,7 +14,7 @@ from engines import pls
from engines import nipals_lpls as lpls
def hotelling(Pcv, P, p_center='median', cov_center=median,
def hotelling(Pcv, P, p_center='median', cov_center='median',
alpha=0.3, crot=True, strict=False):
"""Returns regularized hotelling T^2.
@ -31,8 +31,8 @@ def hotelling(Pcv, P, p_center='median', cov_center=median,
Calibration model paramter
p_center : {'median', 'mean', 'cal_model'}, optional
Location method for sub-segments
cov_center : {py_func}, optional
Location function
cov_center : {'median', 'mean', 'cal_model'}, optional
Pooled covariance estimate
alpha : {float}, optional
Regularisation towards pooled covariance estimate.
crot : {boolean}, optional
@ -79,33 +79,43 @@ def hotelling(Pcv, P, p_center='median', cov_center=median,
for i in xrange(n):
Pi = Pcv[:,i,:] # (n_sets x amax)
Pi_ctr = P_ctr[i,:] # (1 x amax)
Pim = (Pi - Pi_ctr)*msqrt(n_sets-1)
Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim)
Cov = cov_center(Cov_i)
reg_cov = (1. - alpha)*Cov_i + alpha*Cov
#Pim = (Pi - Pi_ctr)*msqrt(n_sets-1)
#Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim)
Pim = (Pi - Pi_ctr)
Cov_i[i] = dot(Pim.T, Pim)
if cov_center == 'median':
Cov_p = median(Cov_i)
elif cov_center == 'mean':
Cov_p = Cov.mean(0)
else:
print "Pooled covariance est. invalid, using median"
print cov_center
Cov_p = median(Cov_i)
reg_cov = (1. - alpha)*Cov_i + alpha*Cov_p
for i in xrange(n):
Pc = P_ctr[i,:]
sigma = reg_cov[i]
T_sq[i] = dot(dot(Pc, inv(sigma)), Pc)
return T_sq
def procrustes(a, b, strict=True, center=False, verbose=False):
def procrustes(a, b, strict=True, center=False, force_norm=False, verbose=False):
"""Orthogonal rotation of b to a.
Procrustes rotation is an orthogonal rotoation of one subspace
Procrustes rotation is an orthogonal rotation of one subspace
onto another by minimising the squared error.
*Parameters*:
a : {array}
Input array
Input array, stationary
b : {array}
Input array
Input array, rotate this to max. fit to a
strict : {boolean}
Only do flipping and shuffling
center : {boolean}
Center before rotation, translate back after
force_norm : {boolean}
Ensure that columns of a and b are orthonormal
verbose : {boolean}
Show sum of squares
@ -126,6 +136,12 @@ def procrustes(a, b, strict=True, center=False, verbose=False):
mn_b = b.mean(0)
b = b - mn_b
if force_norm:
a_norm = apply_along_axis(norm, 0, a)
a = a/a_norm
b_norm = apply_along_axis(norm, 0, b)
b = b/b_norm
u, s, vt = svd(dot(b.T, a))
Cm = dot(u, vt) # Cm: orthogonal rotation matrix
if strict:
@ -166,13 +182,13 @@ def _ensure_strict(C, only_flips=True):
"""
if only_flips:
C = eye(Cm.shape[0])*sign(Cm)
C = eye(C.shape[0])*sign(C)
return C
Cm = zeros(C.shape, dtype='d')
Cm[abs(C)>.6] = 1.
if det(Cm)>1:
raise NotImplementedError
return Cm*S
return Cm*sign(C)
def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
sim_method='shuffle', p_center='med', cov_center=median,
@ -248,7 +264,7 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
Wc, Lc = lpls_jk(X, Y, Z ,aopt, zorth=zorth)
cal_tsq_x = hotelling(Wc, dat['W'], alpha=alpha)
cal_tsq_z = hotelling(Lc, dat['L'], alpha=alpha)
print "morn"
# Perturbations
index = arange(m)
for i in range(n_iter):
@ -257,9 +273,10 @@ def lpls_qvals(X, Y, Z, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
dat = lpls(X, Y[indi,:], Z, aopt, scale='loads', center_axis=center_axis, zorth=zorth)
Wi, Li = lpls_jk(X, Y[indi,:], Z, aopt, nsets=nsets)
pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha)
pert_tsq_z[:,i] = hotelling(Li, dat['L'], alpha=alpha)
# no reason to borrow variance in dag (alpha ->some small value)
pert_tsq_z[:,i] = hotelling(Li, dat['L'], alpha=0.01)
return _fdr(cal_tsq_z, pert_tsq_z, median), _fdr(cal_tsq_x, pert_tsq_x, median)
return cal_tsq_z, pert_tsq_z, cal_tsq_x, pert_tsq_x
def pls_qvals(X, Y, aopt, alpha=.3, n_iter=20,p_center='med', cov_center=median,
@ -335,7 +352,7 @@ def pls_qvals(X, Y, aopt, alpha=.3, n_iter=20,p_center='med', cov_center=median,
Wi = pls_jk(X, Y[indi,:], aopt, nsets=nsets, center_axis=center_axis)
pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha)
return _fdr(cal_tsq_x, pert_tsq_x, median)
return cal_tsq_x, pert_tsq_x
@ -372,7 +389,7 @@ def _fdr(tsq, tsqp, loc_method=median):
gene expression data using dimension reduction methods, BMC
bioinformatics, 2007
"""
n, = tsq.shape
n = tsq.shape[0]
k, m = tsqp.shape
assert(n==k)
n_false = empty((n, m), 'd')

27
sandbox/powerit/Makefile Normal file
View File

@ -0,0 +1,27 @@
# Builds the Python extension module _numpy_module.so, which wraps the
# symmetric power iteration routine in sympowerit.c.
CC=gcc
PYTHON_INCLUDE=/usr/include/python2.5
PYTHON_LIBRARY=/usr/lib/python2.5
NUMPY_INCLUDE=$(PYTHON_LIBRARY)/site-packages/numpy/core/include/numpy/
GOTO_LIBRARY=/home/flatberg/GotoBLAS
BLAS_LIBRARY=/usr/lib
BLAS=blas
GOTO=goto

.PHONY: all clean numpy_module.so

all: _numpy_module.so

# Old target name, kept as an alias for the file that is actually produced.
numpy_module.so: _numpy_module.so

_numpy_module.so: numpy_module.o sympowerit.o
	$(CC) -Wall -shared numpy_module.o sympowerit.o -o _numpy_module.so -L$(PYTHON_LIBRARY) -lpython2.5 -L$(BLAS_LIBRARY) -l$(BLAS) -llapack -lg2c -lm

numpy_module.o: numpy_module.c numpy_module.h sympowerit.h
	$(CC) -fPIC -Wall -O2 -g -c -I$(PYTHON_INCLUDE) -I$(NUMPY_INCLUDE) numpy_module.c

# Position independent code is required since the object ends up in a
# shared library.
sympowerit.o: sympowerit.c
	$(CC) -fPIC -Wall -O2 -g -c sympowerit.c

clean:
	-rm numpy_module.o sympowerit.o
	-rm -rf _numpy_module.so

70
sandbox/powerit/numpy_module.c Executable file
View File

@ -0,0 +1,70 @@
/* A file to test imorting C modules for handling arrays to Python */
/* Python.h includes <stdio.h>, <string.h>, <errno.h>, <limits.h>, and <stdlib.h> (if available)*/
#include "Python.h"
#include "arrayobject.h"
#include "numpy_module.h"
#include "sympowerit.h"
#include <cblas.h>
/* ==== Set up the methods table ====================== */
/* Maps the Python-visible name "sym_powerit" to the C wrapper of the same
   name; METH_VARARGS means it receives its arguments as a positional tuple. */
static PyMethodDef _numpy_moduleMethods[] = {
{"sym_powerit", sym_powerit, METH_VARARGS},
{NULL, NULL} /* Sentinel - marks the end of this structure */
};
/* ==== Initialize the numpy_module functions =============== */
// Module name must be _numpy_module in compile and linked
/* Module entry point: registers the methods table under the module name
   "_numpy_module" and then initialises the NumPy C-API via import_array(). */
void init_numpy_module() {
(void) Py_InitModule("_numpy_module", _numpy_moduleMethods);
import_array(); // Must be present for NumPy. Called first after above line.
}
/* =========== Power iteration on symmetric matrix ========== */
/*
 * Python signature: sym_powerit(X, T, n, amax, tol, maxiter, verbose)
 *
 *   X        (n, n) array of doubles, the symmetric matrix
 *   T        (n, amax) array of doubles, start vectors (one per column)
 *   n        ignored, the size is always taken from X
 *   amax     number of components to estimate
 *   tol      convergence limit
 *   maxiter  maximum number of iterations per component
 *   verbose  debug output switch
 *
 * Returns the tuple (U, info) where U is a new (n, amax) array holding the
 * result written by sympowerit() and info is its integer status code.
 * Neither X nor T is modified; the routine works on private copies.
 *
 * Raises TypeError if an array is not of type double, and ValueError if the
 * shapes do not fit, since the data buffers are handed to sympowerit() as
 * plain double pointers of the given extent.
 */
static PyObject *sym_powerit(PyObject *self, PyObject *args)
{
    PyArrayObject *X = NULL, *T = NULL, *E = NULL, *U = NULL;
    double *tin, *ein, tol;
    int amax, n, info, maxiter, verbose;

    /* Parse tuple of input arguments */
    if (!PyArg_ParseTuple(args, "O!O!iidii",
                          &PyArray_Type, &X,
                          &PyArray_Type, &T,
                          &n,
                          &amax,
                          &tol,
                          &maxiter,
                          &verbose))
        return NULL;

    /* The buffers are read as double below, anything else is garbage */
    if (PyArray_TYPE(X) != NPY_DOUBLE || PyArray_TYPE(T) != NPY_DOUBLE) {
        PyErr_SetString(PyExc_TypeError,
                        "sym_powerit: input arrays must be of type double");
        return NULL;
    }
    if (PyArray_NDIM(X) != 2 || PyArray_DIM(X, 0) != PyArray_DIM(X, 1)) {
        PyErr_SetString(PyExc_ValueError,
                        "sym_powerit: X must be a square 2-d array");
        return NULL;
    }

    /* Get the dimensions of the input */
    n = (int) PyArray_DIM(X, 0);

    /* T is indexed as (n, amax) by sympowerit, so it must be that big */
    if (amax < 0 || PyArray_NDIM(T) != 2 ||
        PyArray_DIM(T, 0) != n || PyArray_DIM(T, 1) != amax) {
        PyErr_SetString(PyExc_ValueError,
                        "sym_powerit: T must have shape (n, amax)");
        return NULL;
    }

    /* Create output/ work arrays, no inplace calculations.
       The copies are C-ordered (row-major) and contiguous. */
    U = (PyArrayObject *) PyArray_NewCopy(T, NPY_CORDER);
    if (U == NULL)
        return NULL;
    E = (PyArrayObject *) PyArray_NewCopy(X, NPY_CORDER);
    if (E == NULL) {
        Py_DECREF(U);
        return NULL;
    }

    /* Get pointers to contiguous data (row-major) */
    ein = (double *) PyArray_DATA(E);
    tin = (double *) PyArray_DATA(U);

    /* Call sympower method */
    info = sympowerit(ein, n, tin, amax, tol, maxiter, verbose);

    /* E was only workspace; "N" hands our reference to U over to the tuple */
    Py_DECREF(E);

    return Py_BuildValue("Ni", U, info);
}

1
sandbox/powerit/numpy_module.h Executable file
View File

@ -0,0 +1 @@
/* Header for the C array module called from Python (wraps sympowerit.c) */ /* Python-callable functions */ static PyObject *sym_powerit(PyObject *self, PyObject *args);

View File

@ -0,0 +1,116 @@
#include <cblas.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
/* =============== Main functions body ====================*/

int sympowerit(double *E, int n, double *T, int amax, double tol,
               int maxiter, int verbose)
{
    /*
      PURPOSE: Estimate eigenvectors of a symmetric matrix using the
               power method with rank one deflation.

      CALLING SEQUENCE:
        info = sympowerit(E, n, T, amax, tol, maxiter, verbose);

      INPUTS:
        E        symmetric matrix (n x n), row-major. It is OVERWRITTEN:
                 after each component lambda*x*x' is subtracted from it.
                 type: *double
        n        number of rows/columns in E
                 type: int
        T        (n x amax) row-major. On entry column a holds the start
                 vector for component a, on exit it holds the
                 eigenvector scaled by sgn*sqrt(|lambda|).
                 type: *double
        amax     maximum number of principal components
                 type: int
        tol      convergence limit on the norm of the change of the
                 eigenvector between two iterations
                 type: double
        maxiter  maximum number of iterations
                 type: int
        verbose  the amount of debug output (0: silent)
                 type: int

      RETURNS:
         0   all components converged
         1   at least one component did not converge within maxiter
             iterations, or collapsed to the zero vector
        -1   the work vectors could not be allocated
    */

    int iter, a, j;
    int info = 0;
    double sumsq, l2norm, *y, *x, sgn, diff, alpha;
    double lambda = 0.0;

    if (verbose > 1) verbose = 1;

    /* Nothing to estimate */
    if (n <= 0 || amax <= 0) return (info);

    /* Allocate space for working vector and eigenvector */
    y = (double *) malloc((size_t) n * sizeof(double));
    x = (double *) malloc((size_t) n * sizeof(double));
    if (y == NULL || x == NULL) {
        free(y);
        free(x);
        return (-1);
    }

    /* Loop over all components to be estimated */
    for (a = 0; a <= amax - 1; a++) {
        /* Reset work-/eigen-vector */
        for (j = 0; j < n; j++) {
            y[j] = 0.0;
            x[j] = T[(j * amax) + a];
        }
        /* Defined values also if the iteration loop never runs */
        sgn = 1.0;
        lambda = 0.0;

        /* Main power-iteration loop */
        for (iter = 0; iter <= maxiter; iter++) {
            /* Matrix-vector product */
            /*cblas_dsymv (CblasRowMajor, CblasUpper, n, 1.0, E, n,
              x, 1, 0.0, y, 1);*/
            cblas_dgemv(CblasRowMajor, CblasNoTrans, n, n, 1.0, E, n,
                        x, 1, 0.0, y, 1);

            /* Squared norm of y and the estimate lambda = x'Ex */
            sumsq = 0.0;
            lambda = 0.0;
            for (j = 0; j < n; j++) {
                sumsq += y[j] * y[j];
                lambda += x[j] * y[j];
            }
            l2norm = sqrt(sumsq);

            /* E*x vanished (zero start vector or nothing left in E):
               normalising would divide by zero and fill T with NaN */
            if (l2norm == 0.0) {
                if (verbose == 1) {
                    printf("\nComp: %d\n", a);
                    printf("Zero vector encountered.\n");
                }
                lambda = 0.0;
                info = 1;
                break;
            }

            /* Normalise y */
            for (j = 0; j < n; j++) y[j] /= l2norm;

            /* Check for convergence. For a negative eigenvalue the
               iterate flips sign every step, sgn compensates for that.
               Each difference is taken BEFORE x[j] is overwritten. */
            sgn = (lambda < 0 ? -1.0 : 1.0);
            sumsq = 0.0;
            for (j = 0; j < n; j++) {
                diff = x[j] - sgn * y[j];
                sumsq += diff * diff;
                x[j] = y[j];
            }
            if (sqrt(sumsq) < tol) {
                if (verbose == 1) {
                    printf("\nComp: %d\n", a);
                    printf("Converged in %d iterations\n", iter);
                }
                break;
            }
            if (iter >= maxiter) {
                if (verbose == 1) {
                    printf("\nComp: %d\n", a);
                    printf("Max iter reached.\n");
                    printf("Error: %.2E\n", sumsq);
                }
                info = 1;
                break;
            }
        }

        /* Calculate T */
        for (j = 0; j < n; j++) {
            y[j] = sgn * sqrt(sgn * lambda) * x[j];
            T[(j * amax) + a] = y[j];
        }

        /* rank one deflation of residual matrix */
        /*cblas_dsyr (CblasRowMajor, CblasUpper, n, -1.0, x, 1, E, n);*/
        alpha = -1.0 * lambda;
        cblas_dger(CblasRowMajor, n, n, alpha, x, 1, x, 1, E, n);
    }

    /* Free used space */
    free(x);
    free(y);

    return (info);
}

View File

@ -0,0 +1,2 @@
int sympowerit (double *X, int n, double *T, int amax, double tol, int maxiter, int verbose);
/* Simple symmetric power iterations.
   X: symmetric (n x n) row-major matrix, overwritten by the deflation;
   T: (n x amax) row-major, start vectors on entry and results on exit;
   returns 0 when every component converged and nonzero otherwise. */

View File

@ -0,0 +1,100 @@
from numpy import *
# The compiled extension is mandatory: importing this module fails with an
# ImportError when it is missing, so the flag below is True whenever the
# import of this module succeeded.
HAS_SYMPOWER = True
try:
    from _numpy_module import sym_powerit
except ImportError:
    # Only a failed import is translated; the former bare except also
    # swallowed unrelated errors (e.g. KeyboardInterrupt). The assignment
    # HAS_SYMPOWER = False that followed the raise could never execute
    # and was removed.
    raise ImportError("Sym_powerit module not present")
class SymPowerException(Exception):
    """Raised by sympowerit when its input arrays fail validation."""
# Human readable messages for the integer status code (info) returned by
# the compiled sym_powerit routine; 0 means no error.
_ERRORCODES = {1: "Some eigenvectors did not converge, try to increase \nthe number of iterations or lower the tolerance level",
               0: ""}
def sympowerit(xx, T0=None, mn_center=False, a_max=10, tol=1e-7, maxiter=100,
               verbose=0):
    """Estimate eigenvectors of a symmetric matrix using the power method.

    *Parameters*:

        xx : {array}
            Symmetric square array (m, m) of real floating point type
        T0 : {array}
            Initial solution (m, a_max), optional. Defaults to start
            vectors with a one in the first element and zeros elsewhere.
        mn_center : {boolean}, optional
            Mean centering of xx before the iterations
        a_max : {integer}, optional
            Number of components to extract
        tol : {float}, optional
            Tolerance level of eigenvector solver
        maxiter : {integer}
            Maximum number of poweriterations to use
        verbose : {integer}
            Debug output (==1)

    *Returns*:

        v : {array}
            Eigenvectors of xx, (m , a_max), as scaled by the compiled
            routine

    *Raises*:

        SymPowerException if xx is not a real floating point array, is
        not square, fails a random spot check of symmetry, or if T0 does
        not have the shape (m, a_max).
    """
    # The compiled routine reads its buffers as C doubles. Single precision
    # input is converted below; complex input cannot be represented.
    valid_types = ['d', 'f']
    dtype = xx.dtype.char
    n, m = xx.shape
    if not(dtype in valid_types):
        msg = "Array type: (%s) needs to be a float or double" %dtype
        raise SymPowerException(msg)
    if not (m==n):
        msg = "Input array needs to be square, input: (%d,%d)" %(m,n)
        raise SymPowerException(msg)

    # small test of symmetry: compare N randomly drawn element pairs
    N = 5
    rows = random.randint(0, n, N)
    cols = random.randint(0, n, N)
    for i, j in zip(rows, cols):
        if abs(xx[i,j] - xx[j,i])>1e-15:
            msg = "Array needs to be symmetric"
            raise SymPowerException(msg)

    if not a_max:
        a_max = 10

    # identity test: comparing an array with None using != is elementwise
    if T0 is not None:
        tn, tm = T0.shape
        if not (tn==n):
            msg = "Start eigenvectors need to match input array ()"
            raise SymPowerException(msg)
        if not (tm==a_max):
            msg = "Start eigenvectors need to match input a_max ()"
            raise SymPowerException(msg)
    else:
        T0 = zeros((n, a_max), 'd')
        T0[0,:] = ones((a_max,),'d')

    if mn_center:
        xx = _center(xx)

    # hand over double precision, row-major data to the c-function
    xx = ascontiguousarray(xx, dtype='d')
    T0 = ascontiguousarray(T0, dtype='d')

    # call c-function
    T, info = sym_powerit(xx, T0, n, a_max, tol, maxiter, verbose)

    if info != 0:
        if verbose:
            print(_ERRORCODES.get(info, "Dont know this error"))
    return T
def _center(xx, ret_mn=False):
"""Returns mean centered symmetric kernel matrix.
"""
n = xx.shape[0]
h = xx.sum(0)[:,newaxis]
h = (h - mean(h)/2)/n
mn_a = h + h.T
xxc = xx - mn_a
if ret_mn:
return xxc, mn_a
return xxc