A few updates

Arnar Flatberg 2007-11-26 15:30:52 +00:00
parent 902806c1d8
commit 2951ca4088
4 changed files with 108 additions and 59 deletions

View File

@@ -12,7 +12,7 @@ from numpy.random import shuffle
from engines import nipals_lpls as lpls
def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2],verbose=True):
def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2], zorth=False, verbose=True):
"""Performs crossvalidation for generalisation error in lpls.
The L-PLS crossvalidation is estimated just like an ordinary pls
@@ -42,6 +42,8 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2],verbose=Tru
0 : row center
1 : column center
2 : double center
zorth : {boolean}
If True, require orthogonal latent components in Z.
verbose : {boolean}, optional
Verbosity of console output. For use in debugging.
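Editor's note: for orientation, a minimal usage sketch of the updated signature follows. The array shapes, the nsets value and the generic unpacking of the return value are illustrative assumptions only (the full return statement lies outside this hunk), and lpls_val is assumed to be imported from this module:

    import numpy as np
    X = np.random.rand(20, 50)   # 20 samples x 50 predictor variables
    Y = np.random.rand(20, 3)    # 20 samples x 3 responses
    Z = np.random.rand(10, 50)   # background information on the predictor variables
    # hypothetical call exercising the new zorth flag
    results = lpls_val(X, Y, Z, a_max=2, nsets=5, alpha=.5, zorth=True)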
@@ -70,7 +72,11 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2],verbose=Tru
Yhat = empty((a_max, k, l), 'd')
for cal, val in cv(nsets, k):
dat = lpls(X[cal],Y[cal],Z,a_max=a_max,alpha=alpha,mean_ctr=mean_ctr,verbose=verbose)
# do the training model
dat = lpls(X[cal], Y[cal], Z, a_max=a_max, alpha=alpha,
mean_ctr=mean_ctr, zorth=zorth, verbose=verbose)
# center test data
if mean_ctr[0] != 1:
xi = X[val,:] - dat['mnx']
else:
@@ -79,14 +85,24 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2],verbose=Tru
ym = dat['mny']
else:
ym = Y[val].mean(1)[:,newaxis] #???: check this
# predictions
for a in range(a_max):
Yhat[a,val,:] = atleast_2d(ym + dot(xi, dat['B'][a]))
#if permute:
# xcal = X[cal]
# for a in range(1,a_max,1):
# for n in range(10):
# shuffle(cal)
# dat = lpls(xcal, Y[cal], Z, a_max=a_max, alpha=alpha,
# mean_ctr=mean_ctr, verbose=verbose)
# todo: need better support for classification error
y_is_class = Y.dtype.char.lower() in ['i','p', 'b', 'h','?']
if y_is_class:
Yhat, err = class_error(Yhat,Y)
return Yhat, err
pass
#Yhat, err = class_error(Yhat, Y)
#return Yhat, err
sep = (Y - Yhat)**2
rmsep = sqrt(sep.mean(1)).T
@@ -317,8 +333,8 @@ def cv(N, K, randomise=True, sequential=False):
otherwise interleaved ordering is used.
"""
if K>N:
raise ValueError, "You cannot divide a list of %d samples into more than %d segments. Yout tried: %s" %(N,N,K)
if N>K:
raise ValueError, "You cannot divide a list of %d samples into more than %d segments. Yout tried: %s" %(K, K, N)
index = xrange(N)
if randomise:
from random import shuffle
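Editor's note: the core of such a K-fold index split can be sketched standalone as below. This is only an illustrative sketch, not the repository's cv(), which additionally offers randomised and sequential ordering of the segments:

    def kfold_indices(n_samples, n_segments):
        # split range(n_samples) into n_segments roughly equal, interleaved folds
        index = list(range(n_samples))
        for i in range(n_segments):
            val = index[i::n_segments]                 # validation fold (interleaved)
            cal = [j for j in index if j not in val]   # remaining calibration samples
            yield cal, val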
@@ -371,7 +387,7 @@ def class_error(Yhat, Y, method='vanilla'):
Yhat_c = zeros((k, l), dtype='d')
for a in range(a_opt):
for i in range(k):
Yhat_c[a,val,argmax(Yhat[a,val,:])] = 1.0
Yhat_c[a, val, argmax(Yhat[a,val,:])] = 1.0
err = 100*((Yhat_c + Y) == 2).sum(1)/Y.sum(0).astype('d')
return Yhat_c, err
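Editor's note: to make the winner-takes-all step above concrete, a small hedged illustration with made-up numbers:

    from numpy import array, argmax, zeros_like
    yhat_row = array([0.1, 0.7, 0.2])    # predicted responses for one sample
    indicator = zeros_like(yhat_row)
    indicator[argmax(yhat_row)] = 1.0    # assign the sample to the largest response
    # indicator is now [0., 1., 0.]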

View File

@@ -411,7 +411,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=-1):
'evx': expvarx, 'evy': expvary, 'ssqx': ssqx, 'ssqy': ssqy,
'leverage': leverage, 'mnx': mnx, 'mny': mny}
def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=False):
def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 2], scale='scores', zorth = False, verbose=False):
""" L-shaped Partial Least Sqaures Regression by the nipals algorithm.
An L-shaped low rank model aproximates three matrices in a hyploid
@@ -475,10 +475,14 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', ve
scale : {'scores', 'loads'}, optional
Option to decide on where the scale goes.
zorth : {False, boolean}, optional
Option to force orthogonality between latent components
in Z
verbose : {boolean}, optional
Verbosity of console output. For use in debugging.
*References*
Saeboe et al., LPLS-regression: a method for improved prediction and
classification through inclusion of background information on
predictor variables, J. of chemometrics and intell. laboratory syst.
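Editor's note: a hedged sketch of calling the engine directly with the new flag. Shapes are illustrative, and the only return-value detail assumed here is that the result is a dict whose 'B' entry holds the per-component regression coefficients, as used by lpls_val above:

    import numpy as np
    X = np.random.rand(20, 50)   # samples x predictor variables
    Y = np.random.rand(20, 3)    # samples x responses
    Z = np.random.rand(10, 50)   # background data on the predictor variables
    dat = nipals_lpls(X, Y, Z, a_max=3, alpha=.7, zorth=True)
    B = dat['B']                 # regression coefficients, indexed by component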
@@ -522,18 +526,22 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', ve
var_y = empty((a_max,))
var_z = empty((a_max,))
MAX_ITER = 450
MAX_ITER = 4500
LIM = finfo(X.dtype).resolution
is_rd = False
for a in range(a_max):
if verbose:
print "\nWorking on comp. %s" %a
u = F[:,:1]
w = E[:1,:].T
l = G[:,:1]
diff = 1
niter = 0
while (diff>LIM and niter<MAX_ITER):
niter += 1
u1 = u.copy()
w1 = w.copy()
l1 = l.copy()
w = dot(E.T, u)
wn = msqrt(dot(w.T, w))
if wn < LIM:
@@ -552,20 +560,25 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', ve
c = dot(F.T, t)
c = c/msqrt(dot(c.T, c))
u = dot(F, c)
diff = dot((u-u1).T, (u-u1))
diff = dot((u - u1).T, (u - u1))
if verbose:
print "Converged after %s iterations" %niter
if niter==MAX_ITER:
print "Maximum nunber of iterations reached!"
print "Iterations: %d " %niter
print "Error: %.2E" %diff
if is_rd:
print "Hei og haa ... rank deficient, this should really not happen"
break
tt = dot(t.T, t)
p = dot(X.T, t)/tt
q = dot(Y.T, t)/tt
l = dot(Z, w)
#k = dot(Z.T, l)/dot(l.T, l)
p = dot(E.T, t)/tt
q = dot(F.T, t)/tt
if zorth:
k = dot(G.T, l)/dot(l.T, l)
else:
k = w
l = dot(G, w)
U[:,a] = u.ravel()
W[:,a] = w.ravel()
@@ -575,10 +588,10 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', ve
L[:,a] = l.ravel()
K[:,a] = k.ravel()
# rank-one deflations
E = E - dot(t, p.T)
F = F - dot(t, q.T)
G = (G.T - dot(k, l.T)).T
G = G - dot(l, k.T)
var_x[a] = pow(E, 2).sum()
var_y[a] = pow(F, 2).sum()
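Editor's note: the deflation lines above remove one rank-one component from each block per extracted component. A small hedged numeric sketch of that step, with illustrative data only:

    import numpy as np
    E = np.random.rand(8, 5)                 # current X residual
    t = E[:, :1]                             # a score vector (8 x 1)
    p = np.dot(E.T, t) / np.dot(t.T, t)      # corresponding loading (5 x 1)
    E_next = E - np.dot(t, p.T)              # rank-one deflation, as in E = E - dot(t, p.T)
    # np.dot(E_next.T, t) is now numerically the zero vector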

View File

@@ -1,6 +1,6 @@
"""Bilinear models"""
from numpy import expand_dims
from numpy import expand_dims,ones
from engines import pca
@@ -14,8 +14,11 @@ def scale(x, axis=0):
#scale = 1./x.std(axis)
return expand_dims(scale, axis)
class Model(object):
def __init__(name="johndoe"):
"""All underscored attributes are properties.
"""
def __init__(self, name="johndoe"):
self.name = name
self.options = {}
@@ -27,8 +30,8 @@ class Model(object):
def clear(self):
for param in self.__dict__.keys():
if param.startswith("_") and param[1]!="_":
exec "del self." + param
if param.startswith("_") and param[1:5]!="core":
exec "del self." + param[1:]
def clear_core(self):
for param in self.__dict__.keys():
@@ -43,7 +46,7 @@ class PCA(Model):
self._x = x
self.amax = amax
self.aopt = amax
# properties
def amax():
doc = "maximum number of components"
@@ -78,29 +81,29 @@ class PCA(Model):
def scores():
doc = "pca scores"
def fget(self):
if not hasattr(self, "_scores"):
u, s, v, tot_var = pcaengine(self.xw, self.amax)
self._scores = u
self.singvals = s
self.loadings = v
self.tot_var = tot_var
return self._scores[:,:self.amax]
if not hasattr(self, "_core_scores"):
result= pca(self.xw, self.amax)
self._core_scores = result['T']
self.singvals = result['eigvals']
self.loadings = result['P']
self.tot_var = 120.
return self._core_scores[:,:self.amax]
def fset(self, t):
self._scores = t
self._core_scores = t
def fdel(self):
del self._scores
return locals() # credit: David Niergarth
del self._core_scores
return locals()
scores = property(**scores())
def loadings():
doc = "pca loadings"
def fget(self):
if not hasattr(self, "_loadings"):
u, s, v, tot_var = pcaengine(self.xw, self.amax)
self._loadings = v
self.scores = u
self.singvals = s
self.tot_var = tot_var
result = pca(self.xw, self.amax)
self.loadings = result['P']
self.scores = result['T']
self.singvals = result['eigvals']
self.tot_var = 120
return self._loadings[:,:self.amax]
def fdel(self):
del self._loadings
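Editor's note: these properties all follow the same lazy, cached pattern, computing from the pca engine on first access, caching the result in an underscored attribute, and exposing it through property(**locals()). A generic hedged sketch of the idiom (names here are not the repository's):

    class Example(object):
        def value():
            doc = "lazily computed value"
            def fget(self):
                if not hasattr(self, "_value"):
                    self._value = 42    # stand-in for an expensive computation
                return self._value
            def fset(self, v):
                self._value = v
            def fdel(self):
                del self._value
            return locals()
        value = property(**value())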
@@ -113,11 +116,11 @@ class PCA(Model):
doc = "Singular values"
def fget(self):
if not hasattr(self, "_singvals"):
u, s, v, tot_var = pcaengine(self.xw, self.amax)
self._singvals = s
self.scores = u
self.loadings = v
self.tot_var = tot_var
result = pca(self.xw, self.amax)
self._singvals = result['eigvals']
self.scores = result['T']
self.loadings = result['P']
self.tot_var = 120
return self._singvals[:self.amax]
def fset(self, w):
self._singvals = w
@@ -139,7 +142,7 @@ class PCA(Model):
doc = "column means"
def fget(self):
if not hasattr(self, "_xadd"):
self._xadd = center(self.x, axis=0)
self._xadd = mean_center(self.x, axis=0)
return self._xadd
def fset(self, mnx):
if hasattr(self, "_xc"):
@@ -153,7 +156,7 @@ class PCA(Model):
xadd = property(**xadd())
def xc():
doc = "centered input data"
doc = "mean_centered input data"
def fget(self):
if not hasattr(self, "_xc"):
self._xc = self.x + self.xadd
@@ -161,7 +164,10 @@ class PCA(Model):
def fset(self, xc):
self._xc = xc
def fdel(self):
del self._xc
print "a"
if hasattr(self, "_xc"):
print "del"
del self._xc
return locals()
xc = property(**xc())
@@ -237,7 +243,7 @@ class PCA(Model):
def fdel(self):
del self._row_metric
if hasattr(self, "_xd"):
del self.xd
del self._xd
return locals()
row_metric = property(**row_metric())
@@ -254,7 +260,7 @@ class PCA(Model):
def fdel(self):
del self._column_metric
if hasattr(self, "_xd"):
del self.xd
del self._xd
return locals()
column_metric = property(**column_metric())
@@ -273,10 +279,12 @@ class PCA(Model):
def delete_rows(self, index):
pass
def reweight(self, )
def reweight(self, w):
pass
if __name__ == "__main__":
X = random.rand(4,10)
pcaobj = PCA(X)
print "explained variance" + str(pcaobj.explained_variance)
from numpy.random import rand
X = rand(4,10)
pcaobj = PCA(X)
print "explained variance" + str(pcaobj.explained_variance)

View File

@@ -115,8 +115,8 @@ def procrustes(a, b, strict=True, center=False, verbose=False):
*Reference*:
Schonemann, A generalized solution of the orthogonal Procrustes problem,
Psychometrika, 1966
Schonemann, A generalized solution of the orthogonal Procrustes
problem, Psychometrika, 1966
"""
if center:
@@ -131,9 +131,9 @@ def procrustes(a, b, strict=True, center=False, verbose=False):
Cm = _ensure_strict(Cm)
b_rot = dot(b, Cm)
if verbose:
print Cm.round()
fit = sum(ravel(b - b_rot)**2)
print "Error: %.3E" %fit
fit = ((b - b_rot)**2).sum()
fit2 = (dot(a, a.T) + dot(b, b.T) - 2*diag(s)).trace()
print "Error: %.2E , %.2E" %(fit, fit2)
if center:
return mn_b + b_rot
else:
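Editor's note: the orthogonal rotation computed here can be sketched in a few lines. This is a generic hedged illustration of the Schonemann solution referenced above, not a drop-in replacement for the repository's procrustes(), which also handles centering and the strict/only_flips option:

    import numpy as np
    a = np.random.rand(10, 3)
    b = np.random.rand(10, 3)
    u, s, vt = np.linalg.svd(np.dot(b.T, a))   # b.T a = U S V^T
    Cm = np.dot(u, vt)                         # orthogonal rotation minimising ||a - b Cm||
    b_rot = np.dot(b, Cm)                      # b rotated towards a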
@@ -159,7 +159,9 @@ def _ensure_strict(C, only_flips=True):
*Notes*:
This function is not ready for use. Use (only_flips=True)
This function is not ready for use. Use (only_flips=True).
That is, for more than two components, the rotation matrix
has a tendency to be unstable (det(Cm)>1) when rounding is used.
"""
if only_flips:
@@ -279,6 +281,16 @@ def _fdr(tsq, tsqp, loc_method=median):
fdr : {array}
False discovery rate
*Notes*:
This is an internal function for use in fdr estimation of jack-knifed
perturbed blm parameters.
*Reference*:
Gidskehaug et al., A framework for significance analysis of
gene expression data using dimension reduction methods, BMC
bioinformatics, 2007
"""
n, = tsq.shape
k, m = tsqp.shape