from scipy import zeros,zeros_like,sqrt,dot,trace,sign,round_,argmax,\ sort,ravel,newaxis,asarray,diag,sum,outer,argsort,arange,ones_like,\ all,apply_along_axis,eye,atleast_2d from scipy.linalg import svd,inv,norm,det,sqrtm from scipy.stats import mean,median from cx_utils import mat_center from validation import pls_jkW from select_generators import shuffle_1d from engines import * import time def hotelling(Pcv, P, p_center='med', cov_center='med', alpha=0.3, crot=True, strict=False): """Returns regularized hotelling T^2. alpha -- regularisation towards pooled cov estimates beta -- regularisation for unstable eigenvalues p_center -- location method for submodels cov_center -- location method for sub coviariances alpha -- regularisation crot -- rotate submodels toward full? strict -- only rotate 90 degree ? """ m, n = P.shape n_sets, n, amax = Pcv.shape # allocate T_sq = empty((n, ),dtype='d') Cov_i = zeros((n, amax, amax),dtype='d') # rotate sub_models to full model if crot: for i, Pi in enumerate(Pcv): Pcv[i] = procrustes(P, Pi, strict=strict) # center of pnull if p_center=='med': P_ctr = median(Pcv, 0) elif p_center=='mean': # fixme: mean is unstable P_ctr = mean(Pcv, 0) else: #use full P_ctr = P for i in xrange(n): Pi = Pcv[:,i,:] # (n_sets x amax) Pi_ctr = P_ctr[i,:] # (1 x amax) Pim = (Pi - Pi_ctr[newaxis])*sqrt(n_sets-1) Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim) if cov_center == 'med': Cov = median(Cov_i, 0) else: Cov = mean(Cov_i, 0) reg_cov = (1. - alpha)*Cov_i + alpha*Cov for i in xrange(n): #Pc = P_ctr[i,:][:,newaxis] Pc = P_ctr[i,:] sigma = reg_cov[i] # T_sq[i] = (dot(Pc, inv(sigma) )*Pc).sum() #slow T_sq[i] = dot(dot(Pc, inv(sigma)), Pc) # dont need to care about transposes #T_sq[i] = dot(dot(Pc.T, inv(sigma)), Pc).ravel() return T_sq def procrustes(A, B, strict=True, center=False, verbose=False): """Rotation of B to A. strict -- Only do flipping and shuffling center -- Center before rotation, translate back after verbose -- Print ssq No scaling calculated. Output B_rot = Rotated B """ if center: A,mn_A = mat_center(A, ret_mn=True) B,mn_B = mat_center(B, ret_mn=True) u,s,vh = svd(dot(B.T, A)) v = vh.T Cm = dot(u, v.T) #orthogonal rotation matrix if strict: # just inverting and flipping Cm = ensure_strict(Cm) b_rot = dot(B, Cm) if verbose: print Cm.round() fit = sum(ravel(B - b_rot)**2) print "Sum of squares: %s" %fit if center: return mn_B + b_rot else: return b_rot def expl_var_x(Xc, T): """Returns explained variance of X. T should carry variance in length, Xc has zero col-mean. """ exp_var_x = diag(dot(T.T, T))*100/(sum(Xc**2)) return exp_var_x def expl_var_y(Y, T, Q): """Returns explained variance of Y. """ # centered Y exp_var_y = zeros((Q.shape[1], )) for a in range(Q.shape[1]): Ya = outer(T[:,a], Q[:,a]) exp_var_y[a] = 100*sum(Ya**2)/sum(Y**2) return exp_var_y def pls_qvals(a, b, aopt=None, alpha=.3, n_iter=20, algo='pls', center=True, sim_method='shuffle', p_center='med', cov_center='med', crot=True, strict=False): """Returns qvals for pls model. input: a -- data matrix b -- data matrix aopt -- scalar, opt. number of components alpha -- [0,1] regularisation parameter for T2-test n_iter -- number of permutations sim_method -- permutation method ['shuffle'] p_center -- location estimator for sub models ['med'] cov_center -- location estimator for covariance of submodels ['med'] crot -- bool, use rotations of sub models? strict -- bool, use stict (rot/flips only) rotations? """ m, n = a.shape TSQ = zeros((n, n_iter), dtype='d') # (nvars x n_subsets) n_false = zeros((n, n_iter), dtype='d') #full model if center: ac = a - a.mean(0) bc = b - b.mean(0) if algo=='bridge': dat = bridge(ac, bc, aopt, 'loads', 'fast') else: dat = pls(ac, bc, aopt, 'loads', 'fast') Wcv = pls_jkW(a, b, aopt, n_blocks=None, algo=algo,center=True) tsq_full = hotelling(Wcv, dat['W'], p_center=p_center, alpha=alpha, crot=crot, strict=strict, cov_center=cov_center) #t0 = time.time() Vs = shuffle_1d(bc, n_iter, axis=0) for i, b_shuff in enumerate(Vs): #t1 = time.time() if algo=='bridge': dat = bridge(ac, b_shuff, aopt, 'loads','fast') else: dat = pls(ac, b_shuff, aopt, 'loads', 'fast') Wcv = pls_jkW(a, b_shuff, aopt, n_blocks=None, algo=algo) TSQ[:,i] = hotelling(Wcv, dat['W'], p_center=p_center, alpha=alpha, crot=crot, strict=strict, cov_center=cov_center) #print time.time() - t1 return fdr(tsq_full, TSQ, median) def ensure_strict(C, only_flips=True): """Ensure that a rotation matrix does only 90 degree rotations. In multiplication with pcs this allows flips and reordering. if only_flips is True there will onlt be flips allowed """ Cm = C S = sign(C) # signs if only_flips==True: C = eye(Cm.shape[0])*S return C Cm = zeros_like(C) Cm.putmask(1.,abs(C)>.6) if det(Cm)>1: raise ValueError,"Implement this!" return Cm*S def pls_qvals_II(a, b, aopt=None, center=True, alpha=.3, n_iter=20, algo='pls', sim_method='shuffle', p_center='med', cov_center='med', crot=True, strict=False): """Returns qvals for pls model. Shuffling of variables in X. Null model is 'If I put genes randomly on network' ... if they are sign: then this is due to network structure and not covariance with response. input: a -- data matrix b -- data matrix aopt -- scalar, opt. number of components alpha -- [0,1] regularisation parameter for T2-test n_iter -- number of permutations sim_method -- permutation method ['shuffle'] p_center -- location estimator for sub models ['med'] cov_center -- location estimator for covariance of submodels ['med'] crot -- bool, use rotations of sub models? strict -- bool, use stict (rot/flips only) rotations? """ m, n = a.shape TSQ = zeros((n, n_iter), dtype='=tsq_full[j]) false_pos = median(n_false, 1) ll = arange(1, len(false_pos)+1, 1) sort_qval = false_pos.take(sort_index)/ll qval = false_pos/ll.take(back_sort_index) print time.time() - t0 #return qval, false_pos, TSQ, tsq_full return qval def leverage(aopt=1,*args): """Returns leverages input : aopt, number of components to base leverage calculations on *args, matrices of normed blm-paramters output: leverages For PCA typical inputs are normalised T or normalised P For PLSR typical inputs are normalised T or normalised W """ if aopt<1: raise ValueError,"Leverages only make sense for aopt>0" lev = [] for u in args: lev_u = 1./u.shape[0] + dot(u[:,:aopt], u[:,:aopt].T).diagonal() lev.append(lev_u) return lev def variances(a, t, p): """Returns explained variance and ind. var from blm-params. input: a -- full centered matrix t,p -- parameters from a bilinear approx of the above matrix. output: var -- variance of each component var_exp -- cumulative explained variance in percentage Typical inputs are: X(centered),T,P for PCA or X(centered),T,P / Y(centered),T,Q for PLSR. """ tot_var = sum(a**2) var = 100*(sum(p**2, 0)*sum(t**2, 0))/tot_var var_exp = cumsum(var) return var, var_exp def residual_diagnostics(Y, Yhat, aopt=1): """Root mean errors and press values. R2 vals """ pass def ssq(E, axis=0, weights=None): """Sum of squares, supports weights.""" n = E.shape[axis] if weights==None: weights = eye(n) else: weigths = diag(weigths) if axis==0: Ew = dot(weights, E) elif axis==1: Ew = dot(E, weights) else: raise NotImplementedError, "Higher order modes not supported" return pow(Ew,2).sum(axis) def vnorm(x): """Returns the euclidian norm of a vector. This is considerably faster than linalg.norm """ return sqrt(dot(x,x.conj())) def mahalanobis(a, loc=None, acov=None, invcov=None): """Returns the distance of each observation in a from the location estimate (loc) of the data, relative to the shape of the data. a : data matrix (n observations in rows, p variables in columns) loc : location estimate of the data (p-dimensional vector) covmat or invcov : scatter estimate of the data or the inverse of the scatter estimate (pxp matrix) :Returns: A vector containing the distances of all the observations to locvct. """ n, p = a.shape if loc==None: loc = a.mean(0) loc = atleast_2d(loc) if loc.shape[1]==1: loc = loc.T; #ensure rowvector assert(loc.shape[1]==p) xc = a - loc if acov==None and invcov==None: acov = dot(xc.T, xc) if invcov != None: covmat = atleast_2d(invcov) if min(covmat.shape)==1: covmat = diag(invcov.ravel()) else: covmat = atleast_2d(acov) if min(covmat.shape)==1: covmat = diag(covmat.ravel()) covmat = inv(covmat) # mdist = diag(dot(dot(xc, covmat),xc.T)) mdist = (dot(xc, covmat)*xc).sum(1) return mdist def lpls_qvals(a, b, c, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,center=True, sim_method='shuffle',p_center='med', cov_center='med',crot=True, strict=False): """Returns qvals for l-pls model. input: a -- data matrix b -- data matrix c -- data matrix aopt -- scalar, opt. number of components alpha -- [0,1] regularisation parameter for T2-test xz_alpha -- [0,1] how much z info to include n_iter -- number of permutations sim_method -- permutation method ['shuffle'] p_center -- location estimator for sub models ['med'] cov_center -- location estimator for covariance of submodels ['med'] crot -- bool, use rotations of sub models? strict -- bool, use stict (rot/flips only) rotations? """ m, n = a.shape TSQ = zeros((n, n_iter), dtype='d') # (nvars x n_subsets) n_false = zeros((n, n_iter), dtype='d') # Full model dat = lpls(a, b, c, aopt, scale='loads') Wcv = lpls_jk(a, b, c ,aopt, n_blocks=None, algo=algo,center=center) tsq_x = hotelling(Wcv, dat['W'], p_center=p_center,alpha=alpha, crot=crot, strict=strict, cov_center=cov_center) Lcv = lpls_jk(a, b, c ,aopt, n_blocks=None, algo=algo,center=center) tsq_z = hotelling(Lcv, dat['L'], p_center=p_center,alpha=alpha, crot=crot, strict=strict, cov_center=cov_center) # Perturbations t0 = time.time() Vs = shuffle_1d(b, n_iter, axis=0) for i, b_shuff in enumerate(Vs): t1 = time.time() dat = pls(ac, b_shuff, aopt, 'loads', 'fast') Wcv = pls_jkW(a, b_shuff, aopt, n_blocks=None, algo=algo) TSQ[:,i] = hotelling(Wcv, dat['W'], p_center=p_center, alpha=alpha, crot=crot, strict=strict, cov_center=cov_center) print time.time() - t1 return fdr(tsq_full, TSQ, median) def fdr(tsq, tsqp, loc_method=median): n, = tsq.shape k, m = tsqp.shape assert(n==k) n_false = empty((n, m), 'd') sort_index = argsort(tsq)[::-1] r_index = argsort(sort_index) for i in xrange(m): for j in xrange(n): n_false[j,i] = (tsqp[:,i]>tsq[j]).sum() fp = loc_method(n_false,1) n_signif = (arange(n) + 1.0)[r_index] fd_rate = fp/n_signif return fd_rate