diff --git a/fluents/lib/engines.py b/fluents/lib/engines.py index 8f529e5..219f1ce 100644 --- a/fluents/lib/engines.py +++ b/fluents/lib/engines.py @@ -13,9 +13,8 @@ try: from symeig import symeig except: has_sym = False -has_sym=False -def pca(a, aopt,scale='scores',mode='normal',center_axis=-1): +def pca(a, aopt,scale='scores',mode='normal',center_axis=0): """ Principal Component Analysis. Performs PCA on given matrix and returns results in a dictionary. @@ -26,7 +25,7 @@ def pca(a, aopt,scale='scores',mode='normal',center_axis=-1): aopt : int Number of components to use, aopt<=min(samples, variables) - :Returns: + :Returns: results : dict keys -- values, T -- scores, P -- loadings, E -- residuals, lev --leverages, ssq -- sum of squares, expvar -- cumulative @@ -42,14 +41,15 @@ def pca(a, aopt,scale='scores',mode='normal',center_axis=-1): - pcr : other blm - pls : other blm - lpls : other blm - + Notes ----- Uses kernel speed-up if m>>n or m<>> import scipy,engines >>> a=scipy.asarray([[1,2,3],[2,4,5]]) >>> dat=engines.pca(a, 2) - >>> dat['expvar'] + >>> dat['expvarx'] array([0.,99.8561562, 100.]) """ @@ -76,7 +76,6 @@ def pca(a, aopt,scale='scores',mode='normal',center_axis=-1): aopt = minimum(aopt, eff_rank) T = u*s s = s[:aopt] - e = e[:aopt] T = T[:,:aopt] P = v[:,:aopt] @@ -91,7 +90,6 @@ def pca(a, aopt,scale='scores',mode='normal',center_axis=-1): E = empty((aopt, m, n)) ssq = [] lev = [] - expvarx = empty((aopt, aopt+1)) for ai in range(aopt): E[ai,:,:] = a - dot(T[:,:ai+1], P[:,:ai+1].T) ssq.append([(E[ai,:,:]**2).sum(0), (E[ai,:,:]**2).sum(1)]) @@ -99,9 +97,6 @@ def pca(a, aopt,scale='scores',mode='normal',center_axis=-1): lev.append([((s*T)**2).sum(1), (P**2).sum(1)]) else: lev.append([(T**2).sum(1), ((s*P)**2).sum(1)]) - - expvarx[ai,:] = r_[0, 100*e.cumsum()/e.sum()] - else: # residuals E = a - dot(T, P.T) @@ -112,9 +107,9 @@ def pca(a, aopt,scale='scores',mode='normal',center_axis=-1): lev = [(1./m)+(T**2).sum(1), (1./n)+((P/s)**2).sum(1)] else: lev = [(1./m)+((T/s)**2).sum(1), (1./n)+(P**2).sum(1)] - # variances - expvarx = r_[0, 100*e.cumsum()/e.sum()] - + # variances + expvarx = r_[0, 100*e.cumsum()/e.sum()][:aopt+1] + return {'T':T, 'P':P, 'E':E, 'expvarx':expvarx, 'levx':lev, 'ssqx':ssq, 'aopt':aopt} def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=0): @@ -130,7 +125,7 @@ def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=0): aopt : int Number of components to use, aopt<=min(samples, variables) - :Returns: + :Returns: results : dict keys -- values, T -- scores, P -- loadings, E -- residuals, levx -- leverages, ssqx -- sum of squares, expvarx -- cumulative @@ -143,7 +138,7 @@ def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=0): Center along given axis. If neg.: no centering (-inf,..., matrix modes) :SeeAlso: - - pcr : other blm + - pca : other blm - pls : other blm - lpls : other blm @@ -161,8 +156,9 @@ def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=0): >>> import scipy,engines >>> a=scipy.asarray([[1,2,3],[2,4,5]]) - >>> dat=engines.pca(a, 2) - >>> dat['expvar'] + >>> b=scipy.asarray([[1,1],[2,3]]) + >>> dat=engines.pcr(a, 2) + >>> dat['expvarx'] array([0.,99.8561562, 100.]) """ @@ -171,12 +167,11 @@ def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=0): b = b - expand_dims(b.mean(center_axis), center_axis) dat = pca(a, aopt=aopt, scale=scale, mode=mode, center_axis=center_axis) T = dat['T'] - weights = apply_along_axis(vnorm, 0, T) + weights = apply_along_axis(vnorm, 0, T)**2 if scale=='loads': - # fixme: check weights - Q = dot(b.T, T*weights**2) + Q = dot(b.T, T*weights) else: - Q = dot(b.T, T/weights**2) + Q = dot(b.T, T/weights) if mode=='fast': dat.update({'Q':Q}) @@ -187,43 +182,96 @@ def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=0): F[i,:,:] = b - dot(T[:,:i+1], Q[:,:i+1].T) else: F = b - dot(T, Q.T) - #fixme: explained variance in Y + Y-var leverages - dat.update({'Q':Q, 'F':F}) + expvary = r_[0, 100*((T**2).sum(0)*(Q**2).sum(0)/(b**2).sum()).cumsum()[:aopt]] + #fixme: Y-var leverages + dat.update({'Q':Q, 'F':F, 'expvary':expvary}) return dat -def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None): +def pls(a, b, aopt=2, scale='scores', mode='normal', ax_center=0, ab=None): """Partial Least Squares Regression. - Applies plsr to given matrices and returns results in a dictionary. + Performs PLS on given matrix and returns results in a dictionary. + + :Parameters: + a : array + Data measurement matrix, (samples x variables) + b : array + Data response matrix, (samples x responses) + aopt : int + Number of components to use, aopt<=min(samples, variables) + + :Returns: + results : dict + keys -- values, T -- scores, P -- loadings, E -- residuals, + levx -- leverages, ssqx -- sum of squares, expvarx -- cumulative + explained variance of descriptors, expvary -- cumulative explained + variance of responses, aopt -- number of components used + + :OtherParameters: + mode : str + Amount of info retained, ('fast', 'normal', 'detailed') + center_axis : int + Center along given axis. If neg.: no centering (-inf,..., matrix modes) + + :SeeAlso: + - pca : other blm + - pcr : other blm + - lpls : other blm + + Notes + ----- + + Uses kernel speed-up if m>>n or m<>> import scipy,engines + >>> a=scipy.asarray([[1,2,3],[2,4,5]]) + >>> b=scipy.asarray([[1,1],[2,3]]) + >>> dat=engines.pls(a, b, 2) + >>> dat['expvarx'] + array([0.,99.8561562, 100.]) - Fast pls for calibration. Only inefficient for many Y-vars. """ - m, n = a.shape + + m, n = m_shape(a) if ab!=None: - mm, ll = m_shape(ab) + mm, l = m_shape(ab) + assert(m==mm) else: k, l = m_shape(b) - assert(m==mm) - assert(l==ll) + W = empty((n, aopt)) P = empty((n, aopt)) R = empty((n, aopt)) Q = empty((l, aopt)) T = empty((m, aopt)) B = empty((aopt, n, l)) - + if ab==None: ab = dot(a.T, b) for i in range(aopt): if ab.shape[1]==1: w = ab.reshape(n, l) + w = w/vnorm(w) + elif n0: # recursive estimate to + if i>0: for j in range(0, i, 1): r = r - dot(P[:,j].T, w)*R[:,j][:,newaxis] t = dot(a, r) @@ -233,8 +281,6 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None): ab = ab - dot(p, q.T)*tt T[:,i] = t.ravel() W[:,i] = w.ravel() - P[:,i] = p.ravel() - R[:,i] = r.ravel() if mode=='fast' and i==aopt-1: if scale=='loads': @@ -243,6 +289,8 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None): W = W*tnorm return {'T':T, 'W':W} + P[:,i] = p.ravel() + R[:,i] = r.ravel() Q[:,i] = q.ravel() B[i] = dot(R[:,:i+1], Q[:,:i+1].T) @@ -272,14 +320,14 @@ def w_simpls(aat, b, aopt): """ bb = b.copy() m, m = aat.shape - U = empty((m, aopt)) + U = empty((m, aopt)) # W T = empty((m, aopt)) - H = empty((m, aopt)) #just like W in simpls - PROJ = empty((m, aopt)) #just like R in simpls + H = empty((m, aopt)) # R + PROJ = empty((m, aopt)) # P? for i in range(aopt): - u, s, vh = svd(dot(dot(b.T, aat), b), full_matrices=0) - u = dot(b, u[:,:1]) #y-factor scores + q, s, vh = svd(dot(dot(b.T, aat), b), full_matrices=0) + u = dot(b, q[:,:1]) #y-factor scores U[:,i] = u.ravel() t = dot(aat, u) t = t/vnorm(t) @@ -293,6 +341,38 @@ def w_simpls(aat, b, aopt): return {'T':T, 'U':U, 'Q':C, 'H':H} +def w_pls(aat, b, aopt): + """ Pls for wide matrices. + Fast pls for crossval, used in calc rmsep for wide X + There is no P or W. T is normalised + """ + bb = b.copy() + m, m = aat.shape + U = empty((m, aopt)) # W + T = empty((m, aopt)) + R = empty((m, aopt)) # R + PROJ = empty((m, aopt)) # P? + + for i in range(aopt): + if has_sym: + pass + else: + q, s, vh = svd(dot(dot(b.T, aat), b), full_matrices=0) + q = q[:,:1] + u = dot(b , q) #y-factor scores + U[:,i] = u.ravel() + t = dot(aat, u) + t = t/vnorm(t) + T[:,i] = t.ravel() + r = dot(aat, t) #score-weights + R[:,i] = r.ravel() + PROJ[:,: i+1] = dot(T[:,:i+1], inv(dot(T[:,:i+1].T, R[:,:i+1])) ) + if i ruins sep, so we are doing a copy prc_75.append(col[int(ind)]) prc_75 = array(prc_75) for i in range(1, sep.shape[1], 1):