A few updates

Arnar Flatberg 2007-11-26 15:30:52 +00:00
parent 902806c1d8
commit 2951ca4088
4 changed files with 108 additions and 59 deletions


@@ -12,7 +12,7 @@ from numpy.random import shuffle
 from engines import nipals_lpls as lpls
 
-def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2],verbose=True):
+def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2], zorth=False, verbose=True):
     """Performs crossvalidation for generalisation error in lpls.
 
     The L-PLS crossvalidation is estimated just like an ordinary pls
@@ -42,6 +42,8 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2],verbose=True):
         0 : row center
         1 : column center
         2 : double center
+    zorth : {boolean}
+        If true, Require orthogonal latent components in Z.
     verbose : {boolean}, optional
         Verbosity of console output. For use in debugging.
@@ -70,7 +72,11 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2],verbose=True):
     Yhat = empty((a_max, k, l), 'd')
     for cal, val in cv(nsets, k):
-        dat = lpls(X[cal],Y[cal],Z,a_max=a_max,alpha=alpha,mean_ctr=mean_ctr,verbose=verbose)
+        # do the training model
+        dat = lpls(X[cal], Y[cal], Z, a_max=a_max, alpha=alpha,
+                   mean_ctr=mean_ctr, zorth=zorth, verbose=verbose)
+        # center test data
         if mean_ctr[0] != 1:
             xi = X[val,:] - dat['mnx']
         else:
@@ -79,14 +85,24 @@ def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2],verbose=True):
             ym = dat['mny']
         else:
             ym = Y[val].mean(1)[:,newaxis] #???: check this
+        # predictions
         for a in range(a_max):
             Yhat[a,val,:] = atleast_2d(ym + dot(xi, dat['B'][a]))
 
+    #if permute:
+    #    xcal = X[cal]
+    #    for a in range(1,a_max,1):
+    #        for n in range(10):
+    #            shuffle(cal)
+    #            dat = lpls(xcal, Y[cal], Z, a_max=a_max, alpha=alpha,
+    #                       mean_ctr=mean_ctr, verbose=verbose)
+
     # todo: need a better support for classification error
     y_is_class = Y.dtype.char.lower() in ['i','p', 'b', 'h','?']
     if y_is_class:
-        Yhat, err = class_error(Yhat,Y)
-        return Yhat, err
+        pass
+        #Yhat, err = class_error(Yhat, Y)
+        #return Yhat, err
 
     sep = (Y - Yhat)**2
     rmsep = sqrt(sep.mean(1)).T
@@ -317,8 +333,8 @@ def cv(N, K, randomise=True, sequential=False):
     otherwise interleaved ordering is used.
     """
-    if K>N:
-        raise ValueError, "You cannot divide a list of %d samples into more than %d segments. Yout tried: %s" %(N,N,K)
+    if N>K:
+        raise ValueError, "You cannot divide a list of %d samples into more than %d segments. Yout tried: %s" %(K, K, N)
     index = xrange(N)
     if randomise:
         from random import shuffle
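
Only the argument guard of cv is visible in this hunk. For orientation, here is a minimal standalone sketch of an interleaved K-fold index generator in the same spirit; the name kfold_indices and the exact semantics of cv's arguments are assumptions, not the module's verified API.

    import numpy as np

    def kfold_indices(n_samples, n_segments, randomise=True):
        # yield (calibration, validation) index pairs, interleaved ordering
        index = np.arange(n_samples)
        if randomise:
            np.random.shuffle(index)
        for fold in range(n_segments):
            val = index[fold::n_segments]    # every n_segments-th sample
            cal = np.setdiff1d(index, val)   # the rest form the training set
            yield cal, val

    # every sample lands in exactly one validation segment
    all_val = np.concatenate([v for _, v in kfold_indices(10, 3)])
    assert sorted(all_val.tolist()) == list(range(10))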
@@ -371,7 +387,7 @@ def class_error(Yhat, Y, method='vanilla'):
     Yhat_c = zeros((k, l), dtype='d')
     for a in range(a_opt):
         for i in range(k):
-            Yhat_c[a,val,argmax(Yhat[a,val,:])] = 1.0
+            Yhat_c[a, val, argmax(Yhat[a,val,:])] = 1.0
     err = 100*((Yhat_c + Y) == 2).sum(1)/Y.sum(0).astype('d')
     return Yhat_c, err
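
With this change the new zorth flag is simply threaded through from lpls_val to the engine. A hedged usage sketch (array shapes are purely illustrative; Z shares its column dimension with X in the L-shape, and the exact return value of lpls_val lies outside these hunks):

    from numpy import random
    X = random.rand(20, 8)    # samples x variables
    Y = random.rand(20, 2)    # samples x responses
    Z = random.rand(5, 8)     # background info on X's variables
    # request orthogonal latent components in Z during crossvalidation
    res = lpls_val(X, Y, Z, a_max=2, nsets=5, zorth=True)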


@@ -411,7 +411,7 @@ def pls(X, Y, aopt=2, scale='scores', mode='normal', center_axis=-1):
             'evx': expvarx, 'evy': expvary, 'ssqx': ssqx, 'ssqy': ssqy,
             'leverage': leverage, 'mnx': mnx, 'mny': mny}
 
-def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=False):
+def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 2], scale='scores', zorth = False, verbose=False):
     """ L-shaped Partial Least Sqaures Regression by the nipals algorithm.
 
     An L-shaped low rank model aproximates three matrices in a hyploid
@@ -475,10 +475,14 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=False):
     scale : {'scores', 'loads'}, optional
         Option to decide on where the scale goes.
+    zorth : {False, boolean}, optional
+        Option to force orthogonality between latent components
+        in Z
     verbose : {boolean}, optional
         Verbosity of console output. For use in debugging.
 
     *References*
     Saeboe et al., LPLS-regression: a method for improved prediction and
     classification through inclusion of background information on
     predictor variables, J. of chemometrics and intell. laboratory syst.
@@ -522,18 +526,22 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=False):
     var_y = empty((a_max,))
     var_z = empty((a_max,))
 
-    MAX_ITER = 450
+    MAX_ITER = 4500
     LIM = finfo(X.dtype).resolution
     is_rd = False
     for a in range(a_max):
         if verbose:
             print "\nWorking on comp. %s" %a
         u = F[:,:1]
+        w = E[:1,:].T
+        l = G[:,:1]
         diff = 1
         niter = 0
         while (diff>LIM and niter<MAX_ITER):
             niter += 1
             u1 = u.copy()
+            w1 = w.copy()
+            l1 = l.copy()
             w = dot(E.T, u)
             wn = msqrt(dot(w.T, w))
             if wn < LIM:
@@ -552,20 +560,25 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=False):
             c = dot(F.T, t)
             c = c/msqrt(dot(c.T, c))
             u = dot(F, c)
-            diff = dot((u-u1).T, (u-u1))
+            diff = dot((u - u1).T, (u - u1))
         if verbose:
-            print "Converged after %s iterations" %niter
+            if niter==MAX_ITER:
+                print "Maximum nunber of iterations reached!"
+            print "Iterations: %d " %niter
             print "Error: %.2E" %diff
         if is_rd:
             print "Hei og haa ... rank deficient, this should really not happen"
             break
         tt = dot(t.T, t)
-        p = dot(X.T, t)/tt
-        q = dot(Y.T, t)/tt
-        l = dot(Z, w)
-        #k = dot(Z.T, l)/dot(l.T, l)
+        p = dot(E.T, t)/tt
+        q = dot(F.T, t)/tt
+        if zorth:
+            k = dot(G.T, l)/dot(l.T, l)
+        else:
+            k = w
+        l = dot(G, w)
 
         U[:,a] = u.ravel()
         W[:,a] = w.ravel()
@@ -575,10 +588,10 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=False):
         L[:,a] = l.ravel()
         K[:,a] = k.ravel()
 
+        # rank-one deflations
         E = E - dot(t, p.T)
         F = F - dot(t, q.T)
-        G = (G.T - dot(k, l.T)).T
+        G = G - dot(l, k.T)
 
         var_x[a] = pow(E, 2).sum()
         var_y[a] = pow(F, 2).sum()
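
The corrected Z-deflation is the point of the zorth branch: k is the regression of G onto the score l, so the rank-one update G = G - dot(l, k.T) removes the l-direction from G and successive Z-scores come out orthogonal. A small standalone check of that property (an illustrative sketch, not the module's code):

    import numpy as np

    np.random.seed(0)
    G = np.random.rand(6, 4)
    w = np.random.rand(4, 1)
    w = w / np.sqrt(np.dot(w.T, w))
    l = np.dot(G, w)                       # Z-block score, as in the loop
    k = np.dot(G.T, l) / np.dot(l.T, l)    # zorth branch: G regressed on l
    G1 = G - np.dot(l, k.T)                # the new deflation
    # the deflated matrix has no component left along l
    assert np.allclose(np.dot(G1.T, l), 0)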


@@ -1,6 +1,6 @@
 """Bilinear models"""
 
-from numpy import expand_dims
+from numpy import expand_dims,ones
 from engines import pca
@@ -14,8 +14,11 @@ def scale(x, axis=0):
     #scale = 1./x.std(axis)
     return expand_dims(scale, axis)
 
 class Model(object):
-    def __init__(name="johndoe"):
+    """All underscored attributes are properties.
+    """
+    def __init__(self, name="johndoe"):
         self.name = name
         self.options = {}
@@ -27,8 +30,8 @@ class Model(object):
     def clear(self):
         for param in self.__dict__.keys():
-            if param.startswith("_") and param[1]!="_":
-                exec "del self." + param
+            if param.startswith("_") and param[1:5]!="core":
+                exec "del self." + param[1:]
 
     def clear_core(self):
         for param in self.__dict__.keys():
@@ -43,7 +46,7 @@ class PCA(Model):
         self._x = x
         self.amax = amax
         self.aopt = amax
 
     # properties
     def amax():
         doc = "maximum number of components"
@@ -78,29 +81,29 @@ class PCA(Model):
     def scores():
         doc = "pca scores"
         def fget(self):
-            if not hasattr(self, "_scores"):
-                u, s, v, tot_var = pcaengine(self.xw, self.amax)
-                self._scores = u
-                self.singvals = s
-                self.loadings = v
-                self.tot_var = tot_var
-            return self._scores[:,:self.amax]
+            if not hasattr(self, "_core_scores"):
+                result= pca(self.xw, self.amax)
+                self._core_scores = result['T']
+                self.singvals = result['eigvals']
+                self.loadings = result['P']
+                self.tot_var = 120.
+            return self._core_scores[:,:self.amax]
         def fset(self, t):
-            self._scores = t
+            self._core_scores = t
         def fdel(self):
-            del self._scores
-        return locals() # credit: David Niergarth
+            del self._core_scores
+        return locals()
     scores = property(**scores())
 
     def loadings():
         doc = "pca loadings"
         def fget(self):
             if not hasattr(self, "_loadings"):
-                u, s, v, tot_var = pcaengine(self.xw, self.amax)
-                self._loadings = v
-                self.scores = u
-                self.singvals = s
-                self.tot_var = tot_var
+                result = pca(self.xw, self.amax)
+                self.loadings = result['P']
+                self.scores = result['T']
+                self.singvals = result['eigvals']
+                self.tot_var = 120
             return self._loadings[:,:self.amax]
         def fdel(self):
             del self._loadings
@@ -113,11 +116,11 @@ class PCA(Model):
         doc = "Singular values"
         def fget(self):
             if not hasattr(self, "_singvals"):
-                u, s, v, tot_var = pcaengine(self.xw, self.amax)
-                self._singvals = s
-                self.scores = u
-                self.loadings = v
-                self.tot_var = tot_var
+                result = pca(self.xw, self.amax)
+                self._singvals = result['eigvals']
+                self.scores = result['T']
+                self.loadings = result['P']
+                self.tot_var = 120
             return self._singvals[:self.amax]
         def fset(self, w):
             self._singvals = w
@@ -139,7 +142,7 @@ class PCA(Model):
         doc = "column means"
         def fget(self):
             if not hasattr(self, "_xadd"):
-                self._xadd = center(self.x, axis=0)
+                self._xadd = mean_center(self.x, axis=0)
             return self._xadd
         def fset(self, mnx):
             if hasattr(self, "_xc"):
@@ -153,7 +156,7 @@ class PCA(Model):
     xadd = property(**xadd())
 
     def xc():
-        doc = "centered input data"
+        doc = "mean_centered input data"
         def fget(self):
             if not hasattr(self, "_xc"):
                 self._xc = self.x + self.xadd
@@ -161,7 +164,10 @@ class PCA(Model):
         def fset(self, xc):
             self._xc = xc
         def fdel(self):
-            del self._xc
+            print "a"
+            if hasattr(self, "_xc"):
+                print "del"
+                del self._xc
         return locals()
     xc = property(**xc())
@@ -237,7 +243,7 @@ class PCA(Model):
         def fdel(self):
             del self._row_metric
             if hasattr(self, "_xd"):
-                del self.xd
+                del self._xd
         return locals()
     row_metric = property(**row_metric())
@@ -254,7 +260,7 @@ class PCA(Model):
         def fdel(self):
             del self._column_metric
             if hasattr(self, "_xd"):
-                del self.xd
+                del self._xd
         return locals()
     column_metric = property(**column_metric())
@@ -273,10 +279,12 @@ class PCA(Model):
     def delete_rows(self, index):
         pass
 
-    def reweight(self, )
+    def reweight(self, w):
+        pass
 
 if __name__ == "__main__":
-    X = random.rand(4,10)
+    from numpy.random import rand
+    X = rand(4,10)
     pcaobj = PCA(X)
     print "explained variance" + str(pcaobj.explained_variance)
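
Throughout this file the properties follow one idiom: a nested function defines doc, fget, fset and fdel in its local namespace and hands locals() to property(), with the computed value cached in an underscored attribute so clear()/clear_core() can drop it. A minimal standalone rendering of the pattern (class and attribute names here are illustrative only):

    class Lazy(object):
        def scores():
            doc = "lazily computed scores"
            def fget(self):
                if not hasattr(self, "_core_scores"):
                    self._core_scores = self._compute()  # cache on first use
                return self._core_scores
            def fset(self, t):
                self._core_scores = t
            def fdel(self):
                del self._core_scores
            return locals()
        scores = property(**scores())

        def _compute(self):
            return [1.0, 2.0, 3.0]

    obj = Lazy()
    assert obj.scores == [1.0, 2.0, 3.0]  # computed once, then cached
    del obj.scores                        # fdel drops the cached value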


@@ -115,8 +115,8 @@ def procrustes(a, b, strict=True, center=False, verbose=False):
     *Reference*:
-    Schonemann, A generalized solution of the orthogonal Procrustes problem,
-    Psychometrika, 1966
+    Schonemann, A generalized solution of the orthogonal Procrustes
+    problem, Psychometrika, 1966
     """
     if center:
@@ -131,9 +131,9 @@ def procrustes(a, b, strict=True, center=False, verbose=False):
         Cm = _ensure_strict(Cm)
     b_rot = dot(b, Cm)
     if verbose:
-        print Cm.round()
-        fit = sum(ravel(b - b_rot)**2)
-        print "Error: %.3E" %fit
+        fit = ((b - b_rot)**2).sum()
+        fit2 = (dot(a, a.T) + dot(b, b.T) - 2*diag(s)).trace()
+        print "Error: %.2E , %.2E" %(fit, fit2)
     if center:
         return mn_b + b_rot
     else:
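
The second measure printed here is the closed-form residual: when Cm comes from the SVD of dot(b.T, a), the best-fit error ((a - b_rot)**2).sum() equals trace(dot(a, a.T)) + trace(dot(b, b.T)) - 2*s.sum(). Note the hunk's first measure compares b with b_rot rather than a with b_rot, so the two numbers need not coincide. A standalone sketch of the identity (the module's own construction of Cm and s lies outside this hunk):

    import numpy as np

    np.random.seed(1)
    a = np.random.rand(8, 3)
    b = np.random.rand(8, 3)
    u, s, vt = np.linalg.svd(np.dot(b.T, a))
    Cm = np.dot(u, vt)            # orthogonal rotation aligning b with a
    b_rot = np.dot(b, Cm)
    fit = ((a - b_rot)**2).sum()
    fit2 = (a*a).sum() + (b*b).sum() - 2*s.sum()
    assert np.allclose(fit, fit2)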
@@ -159,7 +159,9 @@ def _ensure_strict(C, only_flips=True):
     *Notes*:
-    This function is not ready for use. Use (only_flips=True)
+    This function is not ready for use. Use (only_flips=True).
+    That is, for more than two components, the rotation matrix
+    has a tendency to be unstable (det(Cm)>1), when rounding is used.
     """
     if only_flips:
@@ -279,6 +281,16 @@ def _fdr(tsq, tsqp, loc_method=median):
     fdr : {array}
         False discovery rate
 
+    *Notes*:
+    This is an internal function for use in fdr estimation of jack-knifed
+    perturbated blm parameters.
+
+    *Reference*:
+    Gidskehaug et al., A framework for significance analysis of
+    gene expression data using dimension reduction methods, BMC
+    bioinformatics, 2007
+
     """
     n, = tsq.shape
     k, m = tsqp.shape
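
For readers landing here from the new reference: a generic permutation-based FDR estimate in the spirit of Gidskehaug et al. counts, at each observed threshold, how many perturbed statistics exceed it (a location estimate across perturbations) against how many observed ones do. The following is an illustrative reconstruction, not the module's _fdr; the orientation of tsqp as perturbations x variables is an assumption.

    import numpy as np

    def fdr_sketch(tsq, tsqp, loc_method=np.median):
        # tsq  : observed t-square per variable, shape (n,)
        # tsqp : t-squares from repeated perturbed fits, shape (k, n)
        fdr = np.empty(tsq.shape[0])
        for i, t in enumerate(np.sort(tsq)[::-1]):
            r_obs = (tsq >= t).sum()                      # variables called
            r_null = loc_method((tsqp >= t).sum(axis=1))  # typical false calls
            fdr[i] = min(1.0, r_null / float(r_obs))
        return fdr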