2007-07-28 16:05:11 +00:00
parent 9a2e259209
commit 349cab3c51
4 changed files with 297 additions and 131 deletions
+3 -3
@@ -197,7 +197,7 @@ class PLS(Model):
Model.__init__(self, id, name)
self._options = PlsOptions()
def validation(self, amax, n_sets, cv_val_method):
def validation(self):
"""Returns rmsep for pls model.
"""
m, n = self.model['E0'].shape
@@ -207,7 +207,7 @@ class PLS(Model):
val_engine = pls_val
if self._options['calc_cv']==True:
rmsep, aopt = val_engine(self.model['E0'], self.model['F0'],
amax, n_sets)
self._options['amax'], self._options['n_sets'])
self.model['rmsep'] = rmsep[:,:-1]
self.model['aopt'] = aopt
else:
@@ -319,7 +319,7 @@ class PLS(Model):
self.model['E0'] = self._data['X']
self.model['F0'] = self._data['Y']
self.validation(**options.validation_options())
self.validation()
self.make_model(self.model['E0'], self.model['F0'],
**options.make_model_options())
# variance captured
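The calls above now pull amax and n_sets from self._options before handing them to the cross-validation engine and store the returned rmsep and aopt on the model. pls_val itself is not part of this diff, so the following is only a hedged sketch of a segmented cross-validation of that shape; the fit/predict callables, the segmentation scheme and the choice of aopt are illustrative assumptions, not the module's code.
import numpy as np
def rmsep_sketch(X, Y, amax, n_sets, fit, predict):
    # fit(Xcal, Ycal, amax) -> model;  predict(model, Xtest, a) -> predicted Y
    # (both callables are hypothetical stand-ins for the validation engine).
    m = X.shape[0]
    press = np.zeros((Y.shape[1], amax))          # prediction error sum of squares
    for test in np.array_split(np.arange(m), n_sets):
        cal = np.setdiff1d(np.arange(m), test)    # calibration segment
        model = fit(X[cal], Y[cal], amax)
        for a in range(1, amax + 1):
            resid = Y[test] - predict(model, X[test], a)
            press[:, a - 1] += (resid ** 2).sum(axis=0)
    rmsep = np.sqrt(press / m)
    aopt = int(rmsep.mean(axis=0).argmin()) + 1   # component count with lowest mean error
    return rmsep, aopt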
+165 -45
@@ -6,81 +6,189 @@ There is almost no typechecking of any kind here, just focus on speed
import math
from scipy.linalg import svd,inv
from scipy import dot,empty,eye,newaxis,zeros,sqrt,diag,\
apply_along_axis,mean,ones,randn,empty_like,outer,c_,\
rand,sum,cumsum,matrix
apply_along_axis,mean,ones,randn,empty_like,outer,r_,c_,\
rand,sum,cumsum,matrix, expand_dims,minimum,where
has_sym=True
try:
import symmeig
from symeig import symeig
except:
has_sym = False
has_sym=False
def pca(a, aopt, scale='scores', mode='normal'):
""" Principal Component Analysis model
mode:
-- fast : returns smallest dim scaled (T for n<=m, P for n>m )
-- normal : returns all model params and residuals after aopt comp
-- detailed : returns all model params and all residuals
"""
def pca(a, aopt,scale='scores',mode='normal',center_axis=-1):
""" Principal Component Analysis.
Performs PCA on given matrix and returns results in a dictionary.
:Parameters:
a : array
Data measurement matrix, (samples x variables)
aopt : int
Number of components to use, aopt<=min(samples, variables)
:Returns:
results : dict
keys -- values, T -- scores, P -- loadings, E -- residuals,
lev -- leverages, ssq -- sum of squares, expvar -- cumulative
explained variance, aopt -- number of components used
:OtherParameters:
mode : str
Amount of info retained, ('fast', 'normal', 'detailed')
center_axis : int
Center along given axis. If neg.: no centering (-inf,..., matrix modes)
:SeeAlso:
- pcr : other blm
- pls : other blm
- lpls : other blm
Notes
-----
Uses kernel speed-up if m>>n or m<<n.
If the residuals turn rank deficient, a lower number of components than
requested will be used. The number of components actually used is given in the results dict.
Examples
--------
>>> import scipy,engines
>>> a=scipy.asarray([[1,2,3],[2,4,5]])
>>> dat=engines.pca(a, 2)
>>> dat['expvar']
array([0.,99.8561562, 100.])
"""
if center_axis>=0:
a = a - expand_dims(a.mean(center_axis), center_axis)
m, n = a.shape
#print "rows: %s cols: %s" %(m,n)
if m>(n+100) or n>(m+100):
u, s, v = esvd(a)
u, e, v = esvd(a)
s = sqrt(e)
else:
u, s, vt = svd(a, 0)
v = vt.T
eigvals = (1./m)*s
e = s**2
tol = 1e-10
eff_rank = sum(s>s[0]*tol)
aopt = minimum(aopt, eff_rank)
T = u*s
s = s[:aopt]
e = e[:aopt]
T = T[:,:aopt]
P = v[:,:aopt]
if scale=='loads':
tnorm = apply_along_axis(vnorm, 0, T)
T = T/tnorm
P = P*tnorm
T = T/s
P = P*s
if mode == 'fast':
return {'T':T, 'P':P}
return {'T':T, 'P':P, 'aopt':aopt}
if mode=='detailed':
"""Detailed mode returns residual matrix for all comp.
That is E, is a three-mode matrix: (amax, m, n) """
E = empty((aopt, m, n))
E = empty((aopt, m, n))
ssq = []
lev = []
expvarx = empty((aopt, aopt+1))
for ai in range(aopt):
e = a - dot(T[:,:ai+1], P[:,:ai+1].T)
E[ai,:,:] = e.copy()
E[ai,:,:] = a - dot(T[:,:ai+1], P[:,:ai+1].T)
ssq.append([(E[ai,:,:]**2).sum(0), (E[ai,:,:]**2).sum(1)])
if scale=='loads':
lev.append([((s*T)**2).sum(1), (P**2).sum(1)])
else:
lev.append([(T**2).sum(1), ((s*P)**2).sum(1)])
expvarx[ai,:] = r_[0, 100*e.cumsum()/e.sum()]
else:
E = a - dot(T,P.T)
return {'T':T, 'P':P, 'E':E}
# residuals
E = a - dot(T, P.T)
SEP = E**2
ssq = [SEP.sum(0), SEP.sum(1)]
# leverages
if scale=='loads':
lev = [(1./m)+(T**2).sum(1), (1./n)+((P/s)**2).sum(1)]
else:
lev = [(1./m)+((T/s)**2).sum(1), (1./n)+(P**2).sum(1)]
# variances
expvarx = r_[0, 100*e.cumsum()/e.sum()]
return {'T':T, 'P':P, 'E':E, 'expvarx':expvarx, 'levx':lev, 'ssqx':ssq, 'aopt':aopt}
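For reference, a minimal self-contained sketch of what the 'normal' path of pca() above computes: centering, a truncated SVD, scores and loadings, residuals after aopt components and the cumulative explained variance. It is written against plain numpy instead of the scipy helpers imported above, and pca_sketch and its argument names are illustrative only.
import numpy as np
def pca_sketch(a, aopt, center_axis=0):
    # center along the requested axis, as the center_axis handling above does
    if center_axis >= 0:
        a = a - np.expand_dims(a.mean(axis=center_axis), center_axis)
    u, s, vt = np.linalg.svd(a, full_matrices=False)
    e = (s ** 2)[:aopt]                              # retained eigenvalues of a.T*a
    T = (u * s)[:, :aopt]                            # scores
    P = vt.T[:, :aopt]                               # loadings
    E = a - np.dot(T, P.T)                           # residuals after aopt components
    expvarx = np.r_[0, 100 * e.cumsum() / e.sum()]   # cumulative explained variance
    return {'T': T, 'P': P, 'E': E, 'expvarx': expvarx, 'aopt': aopt}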
def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=0):
""" Principal Component Regression.
def pcr(a, b, aopt=2, scale='scores', mode='normal'):
"""Principal Component Regression.
Performs PCR on given matrix and returns results in a dictionary.
Returns
:Parameters:
a : array
Data measurement matrix, (samples x variables)
b : array
Data response matrix, (samples x responses)
aopt : int
Number of components to use, aopt<=min(samples, variables)
:Returns:
results : dict
keys -- values, T -- scores, P -- loadings, E -- residuals,
levx -- leverages, ssqx -- sum of squares, expvarx -- cumulative
explained variance, aopt -- number of components used
:OtherParameters:
mode : str
Amount of info retained, ('fast', 'normal', 'detailed')
center_axis : int
Center along given axis. If neg.: no centering (-inf,..., matrix modes)
:SeeAlso:
- pca : other blm
- pls : other blm
- lpls : other blm
Notes
-----
Uses kernel speed-up if m>>n or m<<n.
If the residuals turn rank deficient, a lower number of components than
requested will be used. The number of components actually used is given in the results dict.
Examples
--------
>>> import scipy,engines
>>> a=scipy.asarray([[1,2,3],[2,4,5]])
>>> dat=engines.pca(a, 2)
>>> dat['expvar']
array([0.,99.8561562, 100.])
"""
m, n = m_shape(a)
B = empty((aopt, n, l))
dat = pca(a, aopt=aopt, scale=scale, mode='normal', center_axis=0)
k, l = m_shape(b)
if center_axis>=0:
b = b - expand_dims(b.mean(center_axis), center_axis)
dat = pca(a, aopt=aopt, scale=scale, mode=mode, center_axis=center_axis)
T = dat['T']
weigths = apply_along_axis(vnorm, 0, T)
weights = apply_along_axis(vnorm, 0, T)
if scale=='loads':
# fixme: check weights
Q = dot(b.T, T*weights)
Q = dot(b.T, T*weights**2)
else:
Q = dot(b.T, T/weights**2)
if mode=='fast':
return {'T', T:, 'P':P, 'Q':Q}
dat.update({'Q':Q})
return dat
if mode=='detailed':
for i in range(1, aopt+1, 1):
F[i,:,:] = b - dot(T[:,i],Q[:,:i].T)
F = empty((aopt, k, l))
for i in range(aopt):
F[i,:,:] = b - dot(T[:,:i+1], Q[:,:i+1].T)
else:
F = b - dot(T, Q.T)
#fixme: explained variance in Y + Y-var leverages
dat.update({'Q',Q, 'F':F})
dat.update({'Q':Q, 'F':F})
return dat
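The regression step above relies on the PCA scores having orthogonal columns: for the scale='scores' branch, least-squares regression of b on T collapses to the scaled projection Q = dot(b.T, T/weights**2). A hedged, numpy-only sketch under that reading (pcr_sketch and its names are illustrative, column-centering is assumed):
import numpy as np
def pcr_sketch(a, b, aopt):
    a = a - a.mean(axis=0)
    b = b - b.mean(axis=0)
    u, s, vt = np.linalg.svd(a, full_matrices=False)
    T = (u * s)[:, :aopt]                        # PCA scores, orthogonal columns
    P = vt.T[:, :aopt]                           # PCA loadings
    w2 = (T ** 2).sum(axis=0)                    # diag(T.T T): squared column norms
    Q = np.dot(b.T, T / w2)                      # least squares of b on T
    F = b - np.dot(T, Q.T)                       # y-residuals after aopt components
    return {'T': T, 'P': P, 'Q': Q, 'F': F, 'aopt': aopt}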
def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
@@ -271,7 +379,6 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], mode='normal', sca
X, mnX = center(X, xctr)
Y, mnY = center(Y, xctr)
Z, mnZ = center(Z, zctr)
print Z.mean(1)
varX = pow(X, 2).sum()
varY = pow(Y, 2).sum()
@@ -365,7 +472,7 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], mode='normal', sca
def m_shape(array):
return matrix(array).shape
def esvd(data):
def esvd(data, amax=None):
"""SVD with the option of economy sized calculation
Calculate subspaces of X'X or XX' depending on the shape
of the matrix.
@@ -378,17 +485,30 @@ def esvd(data):
m, n = data.shape
if m>=n:
kernel = dot(data.T, data)
u, s, vt = svd(kernel)
u = dot(data, vt.T)
v = vt.T
if has_sym:
if not amax:
amax = n
pcrange = [n-amax, n]
s, v = symeig(kernel, range=pcrange, overwrite=True)
s = s[::-1]
v = v[:,arange(n, -1, -1)]
else:
u, s, vt = svd(kernel)
v = vt.T
u = dot(data, v)
for i in xrange(n):
s[i] = vnorm(u[:,i])
u[:,i] = u[:,i]/s[i]
else:
kernel = dot(data, data.T)
#data = (data + data.T)/2.0
u, s, vt = svd(kernel)
v = dot(u.T, data)
if has_sym:
if not amax:
amax = m
pcrange = [m-amax, m]
s, u = symeig(kernel, range=pcrange, overwrite=True)
else:
u, s, vt = svd(kernel)
v = dot(u.T, data)
for i in xrange(m):
s[i] = vnorm(v[i,:])
v[i,:] = v[i,:]/s[i]
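For completeness, the kernel ('economy') trick behind esvd(): for a tall matrix the right singular vectors can be taken from the small n x n Gram matrix a.T*a (and symmetrically for a wide matrix), with the other factor recovered by projecting the data back. The sketch below uses numpy's eigh instead of symeig/svd, assumes full rank, and only illustrates the idea; it does not reproduce the module's exact return convention (the updated call in pca() above treats esvd's second return value as eigenvalues and takes its square root).
import numpy as np
def esvd_sketch(a):
    m, n = a.shape
    if m >= n:
        kernel = np.dot(a.T, a)                  # small n x n eigenproblem
        e, v = np.linalg.eigh(kernel)            # ascending eigenvalues
        e, v = e[::-1], v[:, ::-1]               # largest first
        s = np.sqrt(np.clip(e, 0, None))
        u = np.dot(a, v) / s                     # recover left singular vectors
    else:
        kernel = np.dot(a, a.T)                  # small m x m eigenproblem
        e, u = np.linalg.eigh(kernel)
        e, u = e[::-1], u[:, ::-1]
        s = np.sqrt(np.clip(e, 0, None))
        v = np.dot(a.T, u) / s                   # recover right singular vectors
    return u, s, v                               # columns of u and v hold the singular vectors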