"""Module contain algorithms for (burdensome) calculations. There is no typechecking of any kind here, just focus on speed """ from scipy.linalg import svd,norm,inv,pinv,qr from scipy import dot,empty,eye,newaxis,zeros,sqrt,diag,\ apply_along_axis,mean,ones,randn,empty_like,outer,c_,\ rand,sum,cumsum def pca(a, aopt, scale='scores', mode='normal'): """ Principal Component Analysis model mode: -- fast : returns smallest dim scaled (T for n<=m, P for n>m ) -- normal : returns all model params and residuals after aopt comp -- detailed : returns all model params and all residuals """ m,n = a.shape u,s,vt = svd(a, full_matrices=0) T = u*s T = T[:,:aopt] P = vt[:aopt,:].T if scale=='loads': tnorm = apply_along_axis(norm, 0, T) T = T/tnorm P = P*tnorm if mode == 'fast': return {'T':T, 'P':P} if mode=='detailed': """Detailed mode returns residual matrix for all comp. That is E, is a three-mode matrix: (amax, m, n) """ E = empty((aopt, m, n)) for ai in range(aopt): e = a - dot(T[:,:ai+1], P[:,:ai+1].T) E[ai,:,:] = e.copy() else: E = a - dot(T,P.T) return {'T':T, 'P':P, 'E':E} def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None): """Kernel pls for tall/wide matrices. Fast pls for calibration. Only inefficient for many Y-vars. """ m,n = a.shape if ab!=None: mm,l = ab.shape else: k,l = b.shape W = empty((n, aopt)) P = empty((n, aopt)) R = empty((n, aopt)) Q = empty((l, aopt)) T = empty((m, aopt)) B = empty((aopt, n, l)) if ab==None: ab = dot(a.T, b) for i in range(aopt): if ab.shape[1]==1: w = ab else: u,s,vh = svd(dot(ab.T, ab)) w = dot(ab,u[:,:1]) w = w/norm(w) r = w.copy() if i>0: for j in range(0,i,1): r = r - dot(P[:,j].T, w)*R[:,j][:,newaxis] t = dot(a, r) tt = norm(t)**2 p = dot(a.T, t)/tt q = dot(r.T, ab).T/tt ab = ab - dot(p, q.T)*tt T[:,i] = t.ravel() W[:,i] = w.ravel() P[:,i] = p.ravel() R[:,i] = r.ravel() if mode=='fast' and i==aopt-1: if scale=='loads': tnorm = apply_along_axis(norm, 0, T) T = T/tnorm W = W*tnorm return {'T':T, 'W':W} Q[:,i] = q.ravel() B[i] = dot(R[:,:i+1], Q[:,:i+1].T) if mode=='detailed': E = empty((aopt, m, n)) F = empty((aopt, k, l)) for i in range(1,aopt+1,1): E[i-1] = a - dot(T[:,:i],P[:,:i].T) F[i-1] = b - dot(T[:,:i],Q[:,:i].T) else: E = a - dot(T[:,:aopt], P[:,:aopt].T) F = b - dot(T[:,:aopt], Q[:,:aopt].T) if scale=='loads': tnorm = apply_along_axis(norm, 0, T) T = T/tnorm W = W*tnorm Q = Q*tnorm P = P*tnorm return {'B':B, 'Q':Q, 'P':P, 'T':T, 'W':W, 'R':R, 'E':E, 'F':F} def w_simpls(aat, b, aopt): """ Simpls for wide matrices. Fast pls for crossval, used in calc rmsep for wide X There is no P,W. T is normalised """ bb = b.copy() m,m = aat.shape U = empty((m, aopt)) T = empty((m, aopt)) H = empty((m, aopt)) #just like W in simpls PROJ = empty((m, aopt)) #just like R in simpls for i in range(aopt): u,s,vh = svd(dot(dot(b.T, aat), b), full_matrices=0) u = dot(b, u[:,:1]) #y-factor scores U[:,i] = u.ravel() t =dot(aat, u) t = t/norm(t) T[:,i] = t.ravel() h = dot(aat, t) #score-weights H[:,i] = h.ravel() PROJ[:,:i+1] = dot(T[:,:i+1], inv(dot(T[:,:i+1].T, H[:,:i+1])) ) if i<aopt: b = b - dot(PROJ[:,:i+1], dot(H[:,:i+1].T,b) ) C = dot(bb.T, T) return {'T':T,'U':U,'Q':C,'H':H} def bridge(a, b, aopt, scale='scores', mode='normal', r=0): """Undeflated Ridged svd(X'Y) """ m, n = a.shape k, l = b.shape u,s,vt = svd(b, full_matrices=0) g0 = dot(u*s, u.T) g = (1 - r)*g0 + r*eye(m) ag = dot(a.T, g) u,s,vt = svd(ag, full_matrices=0) W = u[:,:aopt] K = vt[:aopt,:].T T = dot(a, W) tnorm = apply_along_axis(norm, 0, T) # norm of T-columns if mode == 'fast': if scale=='loads': T = T/tnorm W = W*tnorm return {'T':T, 'W':W} U = dot(g0, K) #fixme check this Q = dot(b.T, dot(T, inv(dot(T.T,T)) )) B = zeros((aopt, n, l)) for i in range(aopt): B[i] = dot(W[:,:i+1], Q[:,:i+1].T) # leverages # fixme: probably need an orthogonal basis for row-space leverage # T (scores) are not orthogonal # Using a qr decomp to get an orthonormal basis for row-space #Tq = qr(T)[0] #s_lev,v_lev = leverage(aopt,Tq,W) # explained variance #var_x, exp_var_x = variances(a,T,W) #qnorm = apply_along_axis(norm, 0, Q) #var_y, exp_var_y = variances(b,U,Q/qnorm) if mode == 'detailed': E = empty((aopt, m, n)) F = empty((aopt, k, l)) for i in range(aopt): E[i] = a - dot(T[:,:i+1], W[:,:i+1].T) F[i] = b - dot(a, B[i]) else: #normal F = b - dot(a, B[-1]) E = a - dot(T, W.T) if scale=='loads': T = T/tnorm W = W*tnorm Q = Q*tnorm return {'B':B, 'W':W, 'T':T, 'Q':Q, 'E':E, 'F':F, 'U':U, 'P':W}