2007-07-28 16:05:11 +00:00
parent 9a2e259209
commit 349cab3c51
4 changed files with 297 additions and 131 deletions
+3 -3
@@ -197,7 +197,7 @@ class PLS(Model):
Model.__init__(self, id, name)
self._options = PlsOptions()
def validation(self, amax, n_sets, cv_val_method):
def validation(self):
"""Returns rmsep for pls model.
"""
m, n = self.model['E0'].shape
@@ -207,7 +207,7 @@ class PLS(Model):
val_engine = pls_val
if self._options['calc_cv']==True:
rmsep, aopt = val_engine(self.model['E0'], self.model['F0'],
amax, n_sets)
self._options['amax'], self._options['n_sets'])
self.model['rmsep'] = rmsep[:,:-1]
self.model['aopt'] = aopt
else:
@@ -319,7 +319,7 @@ class PLS(Model):
self.model['E0'] = self._data['X']
self.model['F0'] = self._data['Y']
self.validation(**options.validation_options())
self.validation()
self.make_model(self.model['E0'], self.model['F0'],
**options.make_model_options())
# variance captured
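The calls above now pull amax and n_sets from self._options before handing them to the cross-validation engine and store the returned rmsep and aopt on the model. pls_val itself is not part of this diff, so the following is only a hedged sketch of a segmented cross-validation of that shape; the fit/predict callables, the segmentation scheme and the choice of aopt are illustrative assumptions, not the module's code.
import numpy as np
def rmsep_sketch(X, Y, amax, n_sets, fit, predict):
    # fit(Xcal, Ycal, amax) -> model;  predict(model, Xtest, a) -> predicted Y
    # (both callables are hypothetical stand-ins for the validation engine).
    m = X.shape[0]
    press = np.zeros((Y.shape[1], amax))          # prediction error sum of squares
    for test in np.array_split(np.arange(m), n_sets):
        cal = np.setdiff1d(np.arange(m), test)    # calibration segment
        model = fit(X[cal], Y[cal], amax)
        for a in range(1, amax + 1):
            resid = Y[test] - predict(model, X[test], a)
            press[:, a - 1] += (resid ** 2).sum(axis=0)
    rmsep = np.sqrt(press / m)
    aopt = int(rmsep.mean(axis=0).argmin()) + 1   # component count with lowest mean error
    return rmsep, aopt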
+165 -45
@@ -6,81 +6,189 @@ There is almost no typechecking of any kind here, just focus on speed
import math
from scipy.linalg import svd,inv
from scipy import dot,empty,eye,newaxis,zeros,sqrt,diag,\
apply_along_axis,mean,ones,randn,empty_like,outer,c_,\
rand,sum,cumsum,matrix
apply_along_axis,mean,ones,randn,empty_like,outer,r_,c_,\
rand,sum,cumsum,matrix, expand_dims,minimum,where
has_sym=True
try:
import symmeig
from symeig import symeig
except:
has_sym = False
has_sym=False
def pca(a, aopt, scale='scores', mode='normal'):
""" Principal Component Analysis model
mode:
-- fast : returns smallest dim scaled (T for n<=m, P for n>m )
-- normal : returns all model params and residuals after aopt comp
-- detailed : returns all model params and all residuals
"""
def pca(a, aopt,scale='scores',mode='normal',center_axis=-1):
""" Principal Component Analysis.
Performs PCA on given matrix and returns results in a dictionary.
:Parameters:
a : array
Data measurement matrix, (samples x variables)
aopt : int
Number of components to use, aopt<=min(samples, variables)
:Returns:
results : dict
keys -- values, T -- scores, P -- loadings, E -- residuals,
lev -- leverages, ssq -- sum of squares, expvar -- cumulative
explained variance, aopt -- number of components used
:OtherParameters:
mode : str
Amount of info retained, ('fast', 'normal', 'detailed')
center_axis : int
Center along given axis. If neg.: no centering (-inf,..., matrix modes)
:SeeAlso:
- pcr : other blm
- pls : other blm
- lpls : other blm
Notes
-----
Uses kernel speed-up if m>>n or m<<n.
If the residuals turn rank deficient, a lower number of components than
requested will be used. The number of components actually used is given in the results dict.
Examples
--------
>>> import scipy,engines
>>> a=scipy.asarray([[1,2,3],[2,4,5]])
>>> dat=engines.pca(a, 2)
>>> dat['expvar']
array([0.,99.8561562, 100.])
"""
if center_axis>=0:
a = a - expand_dims(a.mean(center_axis), center_axis)
m, n = a.shape
#print "rows: %s cols: %s" %(m,n)
if m>(n+100) or n>(m+100):
u, s, v = esvd(a)
u, e, v = esvd(a)
s = sqrt(e)
else:
u, s, vt = svd(a, 0)
v = vt.T
eigvals = (1./m)*s
e = s**2
tol = 1e-10
eff_rank = sum(s>s[0]*tol)
aopt = minimum(aopt, eff_rank)
T = u*s
s = s[:aopt]
e = e[:aopt]
T = T[:,:aopt]
P = v[:,:aopt]
if scale=='loads':
tnorm = apply_along_axis(vnorm, 0, T)
T = T/tnorm
P = P*tnorm
T = T/s
P = P*s
if mode == 'fast':
return {'T':T, 'P':P}
return {'T':T, 'P':P, 'aopt':aopt}
if mode=='detailed':
"""Detailed mode returns residual matrix for all comp.
That is E, is a three-mode matrix: (amax, m, n) """
E = empty((aopt, m, n))
E = empty((aopt, m, n))
ssq = []
lev = []
expvarx = empty((aopt, aopt+1))
for ai in range(aopt):
e = a - dot(T[:,:ai+1], P[:,:ai+1].T)
E[ai,:,:] = e.copy()
E[ai,:,:] = a - dot(T[:,:ai+1], P[:,:ai+1].T)
ssq.append([(E[ai,:,:]**2).sum(0), (E[ai,:,:]**2).sum(1)])
if scale=='loads':
lev.append([((s*T)**2).sum(1), (P**2).sum(1)])
else:
lev.append([(T**2).sum(1), ((s*P)**2).sum(1)])
expvarx[ai,:] = r_[0, 100*e.cumsum()/e.sum()]
else:
E = a - dot(T,P.T)
return {'T':T, 'P':P, 'E':E}
# residuals
E = a - dot(T, P.T)
SEP = E**2
ssq = [SEP.sum(0), SEP.sum(1)]
# leverages
if scale=='loads':
lev = [(1./m)+(T**2).sum(1), (1./n)+((P/s)**2).sum(1)]
else:
lev = [(1./m)+((T/s)**2).sum(1), (1./n)+(P**2).sum(1)]
# variances
expvarx = r_[0, 100*e.cumsum()/e.sum()]
return {'T':T, 'P':P, 'E':E, 'expvarx':expvarx, 'levx':lev, 'ssqx':ssq, 'aopt':aopt}
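For reference, a minimal self-contained sketch of what the 'normal' path of pca() above computes: centering, a truncated SVD, scores and loadings, residuals after aopt components and the cumulative explained variance. It is written against plain numpy instead of the scipy helpers imported above, and pca_sketch and its argument names are illustrative only.
import numpy as np
def pca_sketch(a, aopt, center_axis=0):
    # center along the requested axis, as the center_axis handling above does
    if center_axis >= 0:
        a = a - np.expand_dims(a.mean(axis=center_axis), center_axis)
    u, s, vt = np.linalg.svd(a, full_matrices=False)
    e = (s ** 2)[:aopt]                              # retained eigenvalues of a.T*a
    T = (u * s)[:, :aopt]                            # scores
    P = vt.T[:, :aopt]                               # loadings
    E = a - np.dot(T, P.T)                           # residuals after aopt components
    expvarx = np.r_[0, 100 * e.cumsum() / e.sum()]   # cumulative explained variance
    return {'T': T, 'P': P, 'E': E, 'expvarx': expvarx, 'aopt': aopt}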
def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=0):
""" Principal Component Regression.
def pcr(a, b, aopt=2, scale='scores', mode='normal'):
"""Principal Component Regression.
Performs PCR on given matrix and returns results in a dictionary.
Returns
:Parameters:
a : array
Data measurement matrix, (samples x variables)
b : array
Data response matrix, (samples x responses)
aopt : int
Number of components to use, aopt<=min(samples, variables)
:Returns:
results : dict
keys -- values, T -- scores, P -- loadings, E -- residuals,
levx -- leverages, ssqx -- sum of squares, expvarx -- cumulative
explained variance, aopt -- number of components used
:OtherParameters:
mode : str
Amount of info retained, ('fast', 'normal', 'detailed')
center_axis : int
Center along given axis. If neg.: no centering (-inf,..., matrix modes)
:SeeAlso:
- pca : other blm
- pls : other blm
- lpls : other blm
Notes
-----
Uses kernel speed-up if m>>n or m<<n.
If the residuals turn rank deficient, a lower number of components than
requested will be used. The number of components actually used is given in the results dict.
Examples
--------
>>> import scipy,engines
>>> a=scipy.asarray([[1,2,3],[2,4,5]])
>>> dat=engines.pca(a, 2)
>>> dat['expvar']
array([0.,99.8561562, 100.])
"""
m, n = m_shape(a)
B = empty((aopt, n, l))
dat = pca(a, aopt=aopt, scale=scale, mode='normal', center_axis=0)
k, l = m_shape(b)
if center_axis>=0:
b = b - expand_dims(b.mean(center_axis), center_axis)
dat = pca(a, aopt=aopt, scale=scale, mode=mode, center_axis=center_axis)
T = dat['T']
weigths = apply_along_axis(vnorm, 0, T)
weights = apply_along_axis(vnorm, 0, T)
if scale=='loads':
# fixme: check weights
Q = dot(b.T, T*weights)
Q = dot(b.T, T*weights**2)
else:
Q = dot(b.T, T/weights**2)
if mode=='fast':
return {'T', T:, 'P':P, 'Q':Q}
dat.update({'Q':Q})
return dat
if mode=='detailed':
for i in range(1, aopt+1, 1):
F[i,:,:] = b - dot(T[:,i],Q[:,:i].T)
F = empty((aopt, k, l))
for i in range(aopt):
F[i,:,:] = b - dot(T[:,:i+1], Q[:,:i+1].T)
else:
F = b - dot(T, Q.T)
#fixme: explained variance in Y + Y-var leverages
dat.update({'Q',Q, 'F':F})
dat.update({'Q':Q, 'F':F})
return dat
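The regression step above relies on the PCA scores having orthogonal columns: for the scale='scores' branch, least-squares regression of b on T collapses to the scaled projection Q = dot(b.T, T/weights**2). A hedged, numpy-only sketch under that reading (pcr_sketch and its names are illustrative, column-centering is assumed):
import numpy as np
def pcr_sketch(a, b, aopt):
    a = a - a.mean(axis=0)
    b = b - b.mean(axis=0)
    u, s, vt = np.linalg.svd(a, full_matrices=False)
    T = (u * s)[:, :aopt]                        # PCA scores, orthogonal columns
    P = vt.T[:, :aopt]                           # PCA loadings
    w2 = (T ** 2).sum(axis=0)                    # diag(T.T T): squared column norms
    Q = np.dot(b.T, T / w2)                      # least squares of b on T
    F = b - np.dot(T, Q.T)                       # y-residuals after aopt components
    return {'T': T, 'P': P, 'Q': Q, 'F': F, 'aopt': aopt}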
def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
@@ -271,7 +379,6 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], mode='normal', sca
X, mnX = center(X, xctr)
Y, mnY = center(Y, xctr)
Z, mnZ = center(Z, zctr)
print Z.mean(1)
varX = pow(X, 2).sum()
varY = pow(Y, 2).sum()
@@ -365,7 +472,7 @@ def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], mode='normal', sca
def m_shape(array):
return matrix(array).shape
def esvd(data):
def esvd(data, amax=None):
"""SVD with the option of economy sized calculation
Calculate subspaces of X'X or XX' depending on the shape
of the matrix.
@@ -378,17 +485,30 @@ def esvd(data):
m, n = data.shape
if m>=n:
kernel = dot(data.T, data)
u, s, vt = svd(kernel)
u = dot(data, vt.T)
v = vt.T
if has_sym:
if not amax:
amax = n
pcrange = [n-amax, n]
s, v = symeig(kernel, range=pcrange, overwrite=True)
s = s[::-1]
v = v[:,arange(n, -1, -1)]
else:
u, s, vt = svd(kernel)
v = vt.T
u = dot(data, v)
for i in xrange(n):
s[i] = vnorm(u[:,i])
u[:,i] = u[:,i]/s[i]
else:
kernel = dot(data, data.T)
#data = (data + data.T)/2.0
u, s, vt = svd(kernel)
v = dot(u.T, data)
if has_sym:
if not amax:
amax = m
pcrange = [m-amax, m]
s, u = symeig(kernel, range=pcrange, overwrite=True)
else:
u, s, vt = svd(kernel)
v = dot(u.T, data)
for i in xrange(m):
s[i] = vnorm(v[i,:])
v[i,:] = v[i,:]/s[i]
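For completeness, the kernel ('economy') trick behind esvd(): for a tall matrix the right singular vectors can be taken from the small n x n Gram matrix a.T*a (and symmetrically for a wide matrix), with the other factor recovered by projecting the data back. The sketch below uses numpy's eigh instead of symeig/svd, assumes full rank, and only illustrates the idea; it does not reproduce the module's exact return convention (the updated call in pca() above treats esvd's second return value as eigenvalues and takes its square root).
import numpy as np
def esvd_sketch(a):
    m, n = a.shape
    if m >= n:
        kernel = np.dot(a.T, a)                  # small n x n eigenproblem
        e, v = np.linalg.eigh(kernel)            # ascending eigenvalues
        e, v = e[::-1], v[:, ::-1]               # largest first
        s = np.sqrt(np.clip(e, 0, None))
        u = np.dot(a, v) / s                     # recover left singular vectors
    else:
        kernel = np.dot(a, a.T)                  # small m x m eigenproblem
        e, u = np.linalg.eigh(kernel)
        e, u = e[::-1], u[:, ::-1]
        s = np.sqrt(np.clip(e, 0, None))
        v = np.dot(a.T, u) / s                   # recover right singular vectors
    return u, s, v                               # columns of u and v hold the singular vectors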