diff --git a/fluents/lib/blmfuncs.py b/fluents/lib/blmfuncs.py index 588ed35..ff4d77c 100644 --- a/fluents/lib/blmfuncs.py +++ b/fluents/lib/blmfuncs.py @@ -10,7 +10,7 @@ from fluents.workflow import Function, OptionsDialog, Options from fluents.dataset import Dataset from fluents import plots, dataset, workflow, logger import scipy -from engines import pca, pls +from engines import pca, pls, nipals_lpls from cx_stats import leverage, variances, hotelling from cx_utils import mat_center from validation import * @@ -238,14 +238,14 @@ class PLS(Model): """Estimates cut off on significant vars by controlling fdr.""" if self._options['calc_qvals']==True: - qvals_sorted, qvals = pls_qvals(a, b, - aopt=None, - alpha=reg, - n_iter=n_iter, - algo='pls', - sim_method=sim_method) + qvals = pls_qvals(a, b, + aopt=None, + alpha=reg, + n_iter=n_iter, + algo='pls', + sim_method=sim_method) self.model['qval'] = qvals - self.model['qval_sorted'] = qvals_sorted + #self.model['qval_sorted'] = qvals_sorted else: self.model['qval'] = None self.model['qval_sorted'] = None @@ -276,18 +276,19 @@ class PLS(Model): pc_ids_opt = ['_comp', map(str, range(self.model['aopt']))] zero_dim = ['_doe',['0']] # null dim, vector (hidden) - match_ids = {'E':[ids_0, ids_1], - 'P':[ids_1, pc_ids], - 'T':[ids_0, pc_ids], - 'W': [ids_1, pc_ids], - 'R': [ids_1, pc_ids], - 'Q':[ids_3, pc_ids], - 'F':[ids_0, ids_3], - 'B':[ids_1, ids_3], - 'qval':[ids_1, zero_dim], + match_ids = {'E' : [ids_0, ids_1], + 'P' : [ids_1, pc_ids], + 'T' : [ids_0, pc_ids], + 'W' : [ids_1, pc_ids], + 'R' : [ids_1, pc_ids], + 'Q' : [ids_3, pc_ids], + 'F' : [ids_0, ids_3], + 'B' : [ids_1, ids_3], + 'qval' : [ids_1, zero_dim], 'qval_sorted':[ids_1, zero_dim], - 'w_tsq':[ids_1, zero_dim], - 'rmsep':[ids_3, pc_ids], + 'w_tsq' : [ids_1, zero_dim], + 'rmsep' : [ids_3, pc_ids], + 'CP': [ids_1, pc_ids] } array = self.model[name] @@ -302,7 +303,7 @@ class PLS(Model): #except: # logger.log('debug', 'Plot: %s failed' %plt) return out - + def run_o(self, a, b): """Run PLS with present options.""" options = self._options @@ -330,6 +331,17 @@ class PLS(Model): self.model['var_y'] = var_y self.model['exp_var_y'] = exp_var_y + if options['calc_corrloads']: + corr_load = scipy.empty_like(self.model['P'].copy()) + T = self.model['T'] + X = self._data['X'] + # For each variable/attribute in original matrix (not meancentered) + for i,score in enumerate(T.T): + for j, profile in enumerate(X.T): + corrs = scipy.corrcoef(score, profile) + corr_load[j,i] = corrs[0,1] + self.model['CP'] = corr_load + if options['calc_conf']: self.confidence(**options.confidence_options()) @@ -353,6 +365,141 @@ class PLS(Model): #run with current data and options return self.run_o(a, b) +class LPLS(Model): + def __init__(self, id='lpls', name='LPLS'): + Model.__init__(self, id, name) + self._options = LplsOptions() + + def validation(self, opt): + """Returns rmsep for lpls model. 
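The correlation-loading loop added to run_o above makes one scipy.corrcoef call per (variable, component) pair. A minimal vectorized sketch of the same quantity (helper name is hypothetical; correlation loadings are the Pearson correlations between each X column and each score column):

    import scipy

    def correlation_loadings(X, T):
        """Correlation of every X column with every T column -> (n_vars, n_comp)."""
        Xc = X - X.mean(0)
        Tc = T - T.mean(0)
        Xn = Xc/scipy.sqrt((Xc**2).sum(0))    # unit-norm centered columns
        Tn = Tc/scipy.sqrt((Tc**2).sum(0))
        return scipy.dot(Xn.T, Tn)
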
+ """ + + if opt['calc_cv']==True: + val_engine = opt['val_engine'] + rmsep, aopt = val_engine(self.model['X'], self.model['Y'], + self.model['Z'], opt['amax'], opt['n_sets'], opt['xz_alpha']) + self.model['rmsep'] = rmsep + self.model['aopt'] = aopt + else: + self.model['rmsep'] = None + self.model['aopt'] = opt['aopt'] + + def confidence(self, opt): + """Returns a confidence measure for model parameters + Supported parameters: W + """ + aopt = self.model['aopt'] + if opt['calc_conf']: + Wx, Wz = lpls_jk(self.model['X'], self.model['Y'], self.model['Z'], aopt, n_sets) + Wcal = self.model['W'][:,:aopt] + Lcal = self.model['L'][:,:aopt] + # ensure that Wcal is scaled + tnorm = scipy.apply_along_axis(norm, 0, self.model['T'][:,:aopt]) + Wcal = Wcal*tnorm + a,b,c,d,e = opt['p_center'], opt['crot'], opt['alpha'], opt['strict'], opt['cov_center'] + tsqx = hotelling(Wx, Wcal, a,b,c,d,e) + tsqz = hotelling(Wz, Lcal, a,b,c,d,e) + self.model['tsqx'] = tsqx + self.model['tsqz'] = tsqz + else: + self.model['tsqx'] = None + self.model['tsqz'] = None + + def permutation_confidence(self, opt): + """Estimates cut off on significant vars by controlling fdr. + + """ + self.model['qval'] = None + self.model['qval_sorted'] = None + + def make_model(self, opt): + """Make model on amax components. + """ + engine = opt['engine'] + dat = engine(self._data['X'], self._data['Y'], self._data['Z'], + opt['amax'], opt['xz_alpha'], opt['center_mth'], + opt['mode'], opt['scale'], False) + self.model.update(dat) + + def as_dataset(self, name, dtype='Dataset'): + """Return any model parameter as Dataset + No ids matching + """ + if name not in self.model.keys(): + return + DX, DY, DZ = self._dataset['X'], self._dataset['Y'], self._dataset['Z'] + dim_name_0, dim_name_1 = DX.get_dim_name() + dim_name_2, dim_name_3 = DY.get_dim_name() + dim_name_4, dim_name_5 = DZ.get_dim_name() + #samples + ids_0 = [dim_name_0, DX.get_identifiers(dim_name_0, sorted=True)] + # x vars (genes) + ids_1 = [dim_name_1, DX.get_identifiers(dim_name_1, sorted=True)] + # y vars (sample descriptors) + ids_3 = [dim_name_3, DY.get_identifiers(dim_name_3, sorted=True)] + #z-vars (variable descriptors) + ids_4 = [dim_name_4, DZ.get_identifiers(dim_name_4, sorted=True)] + # components (hidden) + pc_ids = ['_comp', map(str, range(self._options['amax']))] + pc_ids_opt = ['_comp', map(str, range(self.model['aopt']))] + zero_dim = ['_doe',['0']] # null dim, vector (hidden) + + match_ids = {'E' : [ids_0, ids_1], + 'P' : [ids_1, pc_ids], + 'T' : [ids_0, pc_ids], + 'W' : [ids_1, pc_ids], + 'L' : [ids_4, pc_ids], + 'Q' : [ids_3, pc_ids], + 'F' : [ids_0, ids_3], + 'B' : [ids_1, ids_3], + 'tsqx' : [ids_1, zero_dim], + 'tsqz' : [ids_4, zero_dim], + 'K' : [ids_1, pc_ids], + 'rmsep' : [ids_3, pc_ids] + } + + array = self.model[name] + M = Dataset(array, identifiers=match_ids[name], name=name) + return M + + def get_out_plots(self, options): + out=[] + for plt in options['out_plots']: + out.append(plt(self)) + return out + + def run(self, a, b, c): + """Run L-PLS with present options.""" + options = self._options + self._dataset['X'] = a + self._dataset['Y'] = b + self._dataset['Z'] = c + self._data['X'] = a.asarray() + self._data['Y'] = b.asarray() + self._data['Z'] = c.asarray() + self.validation(options) + self.make_model(options) + if options['calc_conf']: + self.confidence(options) + + out = [self.as_dataset(p) for p in options['out_data']] + for plt in self.get_out_plots(options): + out.append(plt) + return out + + def run_gui(self, a, b, c): + """Run LPLS with 
option gui. + """ + dialog = LPlsOptionsDialog([a, b, c], self._options) + dialog.show_all() + response = dialog.run() + dialog.hide() + + if response == gtk.RESPONSE_OK: + # set output data and plots + dialog.set_output() + #run with current data and options + return self.run(a, b, c) class PcaOptions(Options): """Options for Principal Component Analysis. @@ -403,7 +550,9 @@ class PcaOptions(Options): ] opt['out_data'] = ['T','P', 'p_tsq'] - opt['out_plots'] = [blmplots.PcaScorePlot,blmplots.PcaLoadingPlot,blmplots.LineViewXc] + opt['out_plots'] = [blmplots.PcaScorePlot, + blmplots.PcaLoadingPlot, + blmplots.LineViewXc] self.update(opt) @@ -444,6 +593,7 @@ class PlsOptions(Options): opt['center_mth'] = mat_center opt['scale'] = 'scores' + opt['calc_corrloads'] = True opt['calc_conf'] = False opt['n_sets'] = 5 opt['strict'] = True @@ -468,7 +618,8 @@ class PlsOptions(Options): (blmplots.PlsLoadingPlot, 'Loadings', True), (blmplots.LineViewXc, 'Line view', True), (blmplots.PredictionErrorPlot, 'Residual Error', False), - (blmplots.RMSEPPlot, 'RMSEP', False) + (blmplots.RMSEPPlot, 'RMSEP', False), + (blmplots.PlsCorrelationLoadingPlot, 'Corr. loadings', True) ] opt['out_data'] = ['T','P', 'p_tsq'] @@ -494,14 +645,87 @@ class PlsOptions(Options): 'strict', 'crot', 'cov_center'] return self._copy_from_list(opt_list) + + def permutation_confidence(self): + opt_list = ['q_pert_method', 'q_iter'] + return self._copy_from_list(opt_list) + + +class LplsOptions(Options): + """Options for L-shaped Partial Least Squares Regression. + """ + def __init__(self): + Options.__init__(self) + self._set_default() + + def _set_default(self): + opt = {} + opt['engine'] = nipals_lpls + opt['mode'] = 'normal' # how much info to calculate + opt['amax'] = 10 + opt['aopt'] = 9 + opt['xz_alpha'] = .5 + opt['auto_aopt'] = False + opt['center'] = True + opt['center_mth'] = [2, 0, 1] + opt['scale'] = 'scores' + opt['calc_conf'] = False + opt['n_sets'] = 7 + opt['strict'] = False + opt['p_center'] = 'med' + opt['alpha'] = .3 + opt['cov_center'] = 'med' + opt['crot'] = True + + opt['calc_cv'] = False + opt['cv_val_method'] = 'random' + opt['cv_val_sets'] = opt['n_sets'] + + opt['all_data'] = [('T', 'scores', True), + ('Wx', 'X-weights', True), + ('Wz', 'Z-weights', True), + ('E','residuals', False), + ('tsq_x', 't2X', False), + ('rmsep', 'RMSEP', False) + ] + + # (class, name, sensitive, ticked) + opt['all_plots'] = [(blmplots.PlsScorePlot, 'Scores', True), + (blmplots.PlsLoadingPlot, 'Loadings', True), + (blmplots.LineViewXc, 'Line view', True), + (blmplots.PredictionErrorPlot, 'Residual Error', False), + (blmplots.RMSEPPlot, 'RMSEP', False), + (blmplots.LplsHypoidCorrelationPlot, 'Hypoid corr.', False) + ] + + opt['out_data'] = ['T','P'] + opt['out_plots'] = [blmplots.PlsScorePlot,blmplots.PlsLoadingPlot,blmplots.LineViewXc] + + #opt['out_data'] = None + + opt['pack'] = False + opt['calc_qvals'] = False + opt['q_pert_method'] = 'shuffle_rows' + opt['q_iter'] = 20 + + self.update(opt) + + def make_model_options(self): + """Options for make_model method.""" + opt_list = ['scale','mode', 'amax', 'engine'] + return self._copy_from_list(opt_list) + + def confidence_options(self): + """Options for confidence method.""" + opt_list = ['n_sets', 'aopt', 'alpha', 'p_center', + 'strict', 'crot', 'cov_center'] + return self._copy_from_list(opt_list) + def validation_options(self): """Options for pre_validation method.""" opt_list = ['amax', 'n_sets', 'cv_val_method'] return self._copy_from_list(opt_list) - def 
permutation_confidence(self): - opt_list = ['q_pert_method', 'q_iter'] - return self._copy_from_list(opt_list) class PcaOptionsDialog(OptionsDialog): """Options dialog for Principal Component Analysis. @@ -716,6 +940,210 @@ class PcaOptionsDialog(OptionsDialog): self._options['strict'] = True +class LplsOptionsDialog(OptionsDialog): + """Options dialog for L-shaped Partial Least squares regression. + """ + def __init__(self, data, options, input_names=['X', 'Y', 'Z']): + OptionsDialog.__init__(self, data, options, input_names) + glade_file = os.path.join(fluents.DATADIR, 'lpls_options.glade') + + notebook_name = "vbox1" + page_name = "Options" + self.add_page_from_glade(glade_file, notebook_name, page_name) + # connect signals to handlers + dic = {"on_amax_value_changed" : self.on_amax_changed, + "on_aopt_value_changed" : self.on_aopt_changed, + "auto_aopt_toggled" : self.auto_aopt_toggled, + "center_toggled" : self.center_toggled, + #"on_scale_changed" : self.on_scale_changed, + "on_val_none" : self.val_toggled, + "on_val_cv" : self.cv_toggled, + "on_cv_method_changed" : self.on_cv_method_changed, + "on_cv_sets_changed" : self.on_cv_sets_changed, + "on_conf_toggled" : self.conf_toggled, + "on_subset_loc_changed" : self.on_subset_loc_changed, + "on_cov_loc_changed" : self.on_cov_loc_changed, + "on_alpha_changed" : self.on_alpha_changed, + "on_rot_changed" : self.on_rot_changed, + "on__toggled" : self.conf_toggled, + "on_qval_changed" : self.on_qval_changed, + "on_iter_changed" : self.on_iter_changed + } + + self.wTree.signal_autoconnect(dic) + + # set/ensure valid default values/ranges + # + amax_sb = self.wTree.get_widget("amax_spinbutton") + max_comp = min(data[0].shape) # max num of components + if self._options['amax']>max_comp: + logger.log('debug', 'amax default too large ... adjusting') + self._options['amax'] = max_comp + amax_sb.get_adjustment().set_all(self._options['amax'], 1, max_comp, 1, 0, 0) + # aopt spin button + aopt_sb = self.wTree.get_widget("aopt_spinbutton") + if self._options['aopt']>self._options['amax']: + self._options['aopt'] = self._options['amax'] + 1 - 1 + aopt_sb.get_adjustment().set_all(self._options['aopt'], 1, self._options['amax'], 1, 0, 0) + + # scale + # scale_cb = self.wTree.get_widget("scale_combobox") + # scale_cb.set_active(0) + + # validation frames + if self._options['calc_cv']==False: + cv_frame = self.wTree.get_widget("cv_frame") + cv_frame.set_sensitive(False) + + cv = self.wTree.get_widget("cv_method").set_active(0) + + # confidence + if self._options['calc_conf']==True: + self.wTree.get_widget("subset_expander").set_sensitive(True) + else: + self.wTree.get_widget("subset_expander").set_sensitive(False) + + cb = self.wTree.get_widget("subset_loc") + _m = {'med': 0, 'mean': 1, 'full_model': 2} + cb.set_active(_m.get(self._options['p_center'])) + + cb = self.wTree.get_widget("cov_loc") + _m = {'med': 0, 'mean': 1} + cb.set_active(_m.get(self._options['cov_center'])) + + hs = self.wTree.get_widget("alpha_scale") + hs.set_value(self._options['alpha']) + + tb = self.wTree.get_widget("qvals") + tb.set_sensitive(True) + + + def on_amax_changed(self, sb): + logger.log("debug", "amax changed: new value: %s" %sb.get_value_as_int()) + amax = sb.get_value_as_int() + # update aopt if needed + if amax=tsq_full[j]) # number of false pos. 
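Condensed, the permutation bookkeeping in pls_qvals above estimates, for each gene's cutoff, the median number of permuted T2 statistics exceeding the observed T2, divided by the number of genes called at that cutoff. A self-contained sketch of that computation (helper name is hypothetical):

    from scipy import zeros, median, arange, argsort

    def fdr_qvals(tsq_full, TSQ):
        """tsq_full: (n,) observed T2 per gene; TSQ: (n, n_iter) permuted T2."""
        n, n_iter = TSQ.shape
        n_false = zeros((n, n_iter))
        for i in range(n_iter):
            for j in range(n):
                n_false[j,i] = (TSQ[:,i] >= tsq_full[j]).sum()
        false_pos = median(n_false, 1)             # expected false positives
        rank = arange(1, n+1)                      # genes called at each cutoff
        order = argsort(tsq_full)[::-1]            # descending T2
        return false_pos/rank[order.argsort()]     # q-values in original gene order
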
genes (0-n) + false_pos = median(n_false, 1) + ll = arange(1, len(false_pos)+1, 1) + sort_qval = false_pos.take(sort_index)/ll + qval = false_pos/ll.take(back_sort_index) + print time.time() - t0 + #return qval, false_pos, TSQ, tsq_full + return qval + +def ensure_strict(C, only_flips=True): + """Ensure that a rotation matrix does only 90 degree rotations. + In multiplication with pcs this allows flips and reordering. + + if only_flips is True there will onlt be flips allowed + """ + Cm = C + S = sign(C) # signs + if only_flips==True: + C = eye(Cm.shape[0])*S + return C + Cm = zeros_like(C) + Cm.putmask(1.,abs(C)>.6) + if det(Cm)>1: + raise ValueError,"Implement this!" + return Cm*S + +def pls_qvals_II(a, b, aopt=None, center=True, alpha=.3, + n_iter=20, algo='pls', + sim_method='shuffle', + p_center='med', cov_center='med', + crot=True, strict=False, metric=None): + + """Returns qvals for pls model. + Shuffling of variables in X is preprocessed in metric. + Null model is 'If I put genes randomly on network' ... if they are sign: + then this is due to network structure and not covariance with response. + + input: + a -- data matrix + b -- data matrix aopt -- scalar, opt. number of components alpha -- [0,1] regularisation parameter for T2-test n_iter -- number of permutations @@ -140,25 +231,33 @@ def pls_qvals(a, b, aopt=None, alpha=.3, n_false = zeros((n, n_iter), dtype='.6) - if det(Cm)>1: - raise ValueError,"Implement this!" - return Cm*S + #return qval, false_pos, TSQ, tsq_full + return qval def leverage(aopt=1,*args): """Returns leverages @@ -253,3 +336,10 @@ def ssq(E, axis=0, weights=None): raise NotImplementedError, "Higher order modes not supported" return pow(Ew,2).sum(axis) + +def vnorm(x): + """Returns the euclidian norm of a vector. + + This is considerably faster than linalg.norm + """ + return sqrt(dot(x,x.conj())) diff --git a/fluents/lib/cx_utils.py b/fluents/lib/cx_utils.py index 33c49e6..281390a 100644 --- a/fluents/lib/cx_utils.py +++ b/fluents/lib/cx_utils.py @@ -1,23 +1,25 @@ from scipy import apply_along_axis,newaxis,zeros,\ median,round_,nonzero,dot,argmax,any,sqrt,ndarray,\ trace,zeros_like,sign,sort,real,argsort,rand,array,\ - matrix + matrix,nan from scipy.linalg import norm,svd,inv,eig from scipy.stats import median,mean -def normalise(a,axis=0,return_scales=False): - s = apply_along_axis(norm,axis,a) - if axis==0: - s = s[newaxis] - else: - s = s[:,newaxis] - - a_s = a/s - if return_scales: - return a_s,s - return a_s +def normalise(a, axis=0, return_scales=False): + s = apply_along_axis(norm, axis, a) + if axis==0: + s = s[newaxis] + else: + s = s[:,newaxis] + + a_s = a/s -def sub2ind(shape,i,j): + if return_scales: + return a_s, s + + return a_s + +def sub2ind(shape, i, j): """Indices from subscripts. 
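A hedged usage example for sub2ind, assuming the usual row-major convention (the function body is elided by this hunk):

    >>> sub2ind((3, 4), [0, 1], [2, 3])
    [2, 7]
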
Only support for 2d""" row,col = shape ind = [] @@ -41,13 +43,13 @@ def sorted_eig(a, b=None,sort_by='sm'): (This is reversed output compared to matlab) """ - s,v = eig(a,b) + s,v = eig(a, b) s = real(s) # dont expect any imaginary part v = real(v) ind = argsort(s) if sort_by=='lm': ind = ind[::-1] - v = v.take(ind,1) + v = v.take(ind, 1) s = s.take(ind) return s,v @@ -67,15 +69,15 @@ def str2num(string_number): return num def randperm(n): - r=rand(n) + r = rand(n) dict={} for i in range(n): - dict[r[i]]=i - r=sort(r) - out=zeros(n) + dict[r[i]] = i + r = sort(r) + out = zeros(n) for i in range(n): - out[i]=dict[r[i]] - return array(out,dtype='i') + out[i] = dict[r[i]] + return array(out).astype('i') def mat_center(X,axis=0,ret_mn=False): """Mean center matrix along axis. diff --git a/fluents/lib/engines.py b/fluents/lib/engines.py index 53cf711..1e2f7e8 100644 --- a/fluents/lib/engines.py +++ b/fluents/lib/engines.py @@ -3,11 +3,12 @@ There is no typechecking of any kind here, just focus on speed """ -from scipy.linalg import svd,norm,inv,pinv,qr +import math +from scipy.linalg import svd,inv from scipy import dot,empty,eye,newaxis,zeros,sqrt,diag,\ apply_along_axis,mean,ones,randn,empty_like,outer,c_,\ rand,sum,cumsum,matrix - + def pca(a, aopt, scale='scores', mode='normal'): """ Principal Component Analysis model mode: @@ -18,17 +19,18 @@ def pca(a, aopt, scale='scores', mode='normal'): m, n = a.shape - if m*10.>n: - u, s, vt = esvd(a) + if m*3>n: + u, s, v = esvd(a) else: u, s, vt = svd(a, full_matrices=0) + v = vt.T eigvals = (1./m)*s T = u*s T = T[:,:aopt] - P = vt[:aopt,:].T + P = v[:,:aopt] if scale=='loads': - tnorm = apply_along_axis(norm, 0, T) + tnorm = apply_along_axis(vnorm, 0, T) T = T/tnorm P = P*tnorm @@ -47,6 +49,7 @@ def pca(a, aopt, scale='scores', mode='normal'): return {'T':T, 'P':P, 'E':E} + def pcr(a, b, aopt=2, scale='scores', mode='normal'): """Returns Principal component regression model.""" m, n = a.shape @@ -98,13 +101,13 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None): u, s, vh = svd(dot(ab.T, ab)) w = dot(ab, u[:,:1]) - w = w/norm(w) + w = w/vnorm(w) r = w.copy() if i>0: for j in range(0,i,1): r = r - dot(P[:,j].T, w)*R[:,j][:,newaxis] t = dot(a, r) - tt = norm(t)**2 + tt = vnorm(t)**2 p = dot(a.T, t)/tt q = dot(r.T, ab).T/tt ab = ab - dot(p, q.T)*tt @@ -115,7 +118,7 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None): if mode=='fast' and i==aopt-1: if scale=='loads': - tnorm = apply_along_axis(norm, 0, T) + tnorm = apply_along_axis(vnorm, 0, T) T = T/tnorm W = W*tnorm return {'T':T, 'W':W} @@ -134,7 +137,7 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None): F = b - dot(T[:,:aopt], Q[:,:aopt].T) if scale=='loads': - tnorm = apply_along_axis(norm, 0, T) + tnorm = apply_along_axis(vnorm, 0, T) T = T/tnorm W = W*tnorm Q = Q*tnorm @@ -159,7 +162,7 @@ def w_simpls(aat, b, aopt): u = dot(b, u[:,:1]) #y-factor scores U[:,i] = u.ravel() t = dot(aat, u) - t = t/norm(t) + t = t/vnorm(t) T[:,i] = t.ravel() h = dot(aat, t) #score-weights H[:,i] = h.ravel() @@ -183,7 +186,7 @@ def bridge(a, b, aopt, scale='scores', mode='normal', r=0): W = u[:,:aopt] K = vt[:aopt,:].T T = dot(a, W) - tnorm = apply_along_axis(norm, 0, T) # norm of T-columns + tnorm = apply_along_axis(vnorm, 0, T) # norm of T-columns if mode == 'fast': if scale=='loads': @@ -196,16 +199,6 @@ def bridge(a, b, aopt, scale='scores', mode='normal', r=0): B = zeros((aopt, n, l), dtype='f') for i in range(aopt): B[i] = dot(W[:,:i+1], Q[:,:i+1].T) - # leverages - # 
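The norm-to-vnorm swaps in the engines below lean on the vnorm helper added at the bottom of engines.py; a minimal equivalence check for 1-d input:

    import math
    from scipy import dot, rand
    from scipy.linalg import norm

    x = rand(1000)
    # vnorm(x) == math.sqrt(dot(x, x)): the same Euclidean norm, less overhead
    assert abs(math.sqrt(dot(x, x)) - norm(x)) < 1e-10
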
fixme: probably need an orthogonal basis for row-space leverage - # T (scores) are not orthogonal - # Using a qr decomp to get an orthonormal basis for row-space - #Tq = qr(T)[0] - #s_lev,v_lev = leverage(aopt,Tq,W) - # explained variance - #var_x, exp_var_x = variances(a,T,W) - #qnorm = apply_along_axis(norm, 0, Q) - #var_y, exp_var_y = variances(b,U,Q/qnorm) if mode == 'detailed': E = empty((aopt, m, n)) @@ -225,10 +218,132 @@ def bridge(a, b, aopt, scale='scores', mode='normal', r=0): return {'B':B, 'W':W, 'T':T, 'Q':Q, 'E':E, 'F':F, 'U':U, 'P':W} + +def nipals_lpls(X, Y, Z, amax, alpha=.7, mean_ctr=[2, 0, 1], mode='normal', scale='scores', verbose=False): + """ L-shaped Partial Least Sqaures Regression by the nipals algorithm. + + (X!Z)->Y + :input: + X : data matrix (m, n) + Y : data matrix (m, l) + Z : data matrix (n, o) + + :output: + T : X-scores + W : X-weights/Z-weights + P : X-loadings + Q : Y-loadings + U : X-Y relation + L : Z-scores + K : Z-loads + B : Regression coefficients X->Y + b0: Regression coefficient intercept + evx : X-explained variance + evy : Y-explained variance + evz : Z-explained variance + + :Notes: + + """ + if mean_ctr!=None: + xctr, yctr, zctr = mean_ctr + X, mnX = center(X, xctr) + Y, mnY = center(Y, xctr) + Z, mnZ = center(Z, zctr) + + varX = pow(X, 2).sum() + varY = pow(Y, 2).sum() + varZ = pow(Z, 2).sum() + + m, n = X.shape + k, l = Y.shape + u, o = Z.shape + + # initialize + U = empty((k, amax)) + Q = empty((l, amax)) + T = empty((m, amax)) + W = empty((n, amax)) + P = empty((n, amax)) + K = empty((o, amax)) + L = empty((u, amax)) + var_x = empty((amax,)) + var_y = empty((amax,)) + var_z = empty((amax,)) + + for a in range(amax): + if verbose: + print "\n Working on comp. %s" %a + u = Y[:,:1] + diff = 1 + MAX_ITER = 100 + lim = 1e-5 + niter = 0 + while (diff>lim and niter=n: - u, s, vt = svd(dot(data.T, data)) + data = dot(data.T, data) + u, s, vt = svd(data) u = dot(data, vt.T) v = vt.T for i in xrange(n): - s[i] = norm(u[:,i]) + s[i] = vnorm(u[:,i]) u[:,i] = u[:,i]/s[i] else: - u, s, vt = svd(dot(data, data.T)) + data = dot(data, data.T) + data = (data + data.T)/2.0 + u, s, vt = svd(data) v = dot(u.T, data) for i in xrange(m): - s[i] = norm(v[i,:]) + s[i] = vnorm(v[i,:]) v[i,:] = v[i,:]/s[i] - return u, s, v + return u, s, v.T + +def vnorm(x): + # assume column arrays (or vectors) + return math.sqrt(dot(x.T, x)) + +def center(a, axis): + # 0 = col center, 1 = row center, 2 = double center + # -1 = nothing + if axis==-1: + mn = zeros((a.shape[1],)) + elif axis==0: + mn = a.mean(0) + elif axis==1: + mn = a.mean(1)[:,newaxis] + elif axis==2: + mn = a.mean(0) + a.mean(1)[:,newaxis] - a.mean() + else: + raise IOError("input error: axis must be in [-1,0,1,2]") + + return a - mn, mn diff --git a/fluents/lib/hypergeom.py b/fluents/lib/hypergeom.py index 56fa70f..f04744f 100644 --- a/fluents/lib/hypergeom.py +++ b/fluents/lib/hypergeom.py @@ -53,6 +53,7 @@ def gene_hypergeo_test(selection, category_dataset): cat_count) pvals = scipy.where(cat_count==0, 2, pvals) + pvals = scipy.where(scipy.isnan(pvals), 2, pvals) out = {} for i in range(pvals.size): out[str(all_cats[i])] = (count[i], cat_count[i], pvals[i]) diff --git a/fluents/lib/nx_utils.py b/fluents/lib/nx_utils.py index d61259c..532aa07 100644 --- a/fluents/lib/nx_utils.py +++ b/fluents/lib/nx_utils.py @@ -2,7 +2,7 @@ import os,sys from itertools import izip import networkx as NX from scipy import shape,diag,dot,asarray,sqrt,real,zeros,eye,exp,maximum,\ - outer,maximum,sum,diag,real + 
outer,maximum,sum,diag,real,atleast_2d from scipy.linalg import eig,svd,inv,expm,norm from cx_utils import sorted_eig @@ -378,6 +378,7 @@ Ke = expm(A) .... expm(-A)? # 14.05.2006: diffusion returns negative values, using expm(-LL) instead (FIX) # 13.09.2206: update for use in numpy +# 27.04.2007: diffusion now uses pade approximations to matrix exponential. Also the last def K_expAdj(W, normalised=True, alpha=1.0): """Matrix exponential of adjacency matrix, mentioned in Kandola as a general diffusion kernel. @@ -433,8 +434,8 @@ def K_vonNeumann(W, normalised=True, alpha=1.0): return dot(dot(vr,psigma),vri).astype(t) def K_laplacian(W, normalised=True, alpha=1.0): - """ This is the matrix square root of the pseudo inverse of L. - Also known as th eaverage commute time matrix. + """ This is the matrix pseudo inverse of L. + Also known as the average commute time matrix. """ W = asarray(W) t = W.dtype.char @@ -464,8 +465,7 @@ def K_laplacian(W, normalised=True, alpha=1.0): return K - -def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5): +def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5, use_cut=False): """Returns diffusion kernel. input: -- W, adj. matrix @@ -477,27 +477,45 @@ def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5): t = W.dtype.char if len(W.shape)!=2: raise ValueError, "Non-matrix input to matrix function." - m,n = W.shape + m, n = W.shape if t in ['F','D']: raise TypeError, "Complex input!" - D = diag(sum(W,0)) - L = D-W + D = diag(W.sum(0)) + L = D - W if normalised==True: - T = diag(sqrt(1./(sum(W,0)))) - L = dot(dot(T,L),T) - e,vr = eig(L) + T = diag(sqrt(1./W.sum(0))) + L = dot(dot(T, L), T) + e, vr = eig(L) vri = inv(vr) #inv cond = 1.0*{0: feps*1e3, 1: eps*1e6}[_array_precision[t]] cutoff = 1.*abs(cond*maximum.reduce(e)) - psigma = eye(m) # if sing vals are 0 exp(0)=1 (unnecessary) + psigma = eye(m) # if eigvals are 0 exp(0)=1 (unnecessary) #psigma = zeros((m,n), dtype=' cutoff: psigma[i,i] = exp(-beta*e[i]) + #else: + # psigma[i,i] = 0.0 K = real(dot(dot(vr, psigma), vri)) I = eye(n, dtype='), (diffusion degree) + """ + + D = diag(W.sum(0)) + L = D - W + if normalised==True: + T = diag(sqrt(1./W.sum(0))) + L = dot(dot(T, L), T) + return expm(-beta*L) + def K_modularity(W,alpha=1.0): """ Returns the matrix square root of Newmans modularity.""" @@ -530,3 +548,20 @@ def kernel_score(K, W): score = diag(dot(W, dot(K, W)) ) tot = sum(score) return score, tot + + +def modularity_matrix(G, nodelist=None): + if not nodelist: + nodelist = G.nodes() + else: + G = NX.subgraph(G, nodelist) + + A = NX.adj_matrix(G, nodelist=nodelist) + d = atleast_2d(G.degree(nbunch=nodelist)) + m = 1.*G.number_of_edges() + B = A - A/m + return B + + + + diff --git a/fluents/lib/select_generators.py b/fluents/lib/select_generators.py index 9cfcb33..d791402 100644 --- a/fluents/lib/select_generators.py +++ b/fluents/lib/select_generators.py @@ -41,30 +41,31 @@ def pls_gen(a, b, n_blocks=None, center=False, index_out=False,axis=0, metric=No """Random block crossvalidation Leave-one-out is a subset, with n_blocks equals a.shape[-1] """ - index = randperm(a.shape[axis]) + #index = randperm(a.shape[axis]) + index = arange(a.shape[axis]) if n_blocks==None: n_blocks = a.shape[axis] n_in_set = ceil(float(a.shape[axis])/n_blocks) out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_blocks)] for out in out_ind_sets: - inn = [i for i in index if i not in out] - acal = a.take(inn, 0) - atrue = a.take(out, 0) - bcal = b.take(inn, 0) - btrue = b.take(out, 0) - if center: - mn_a = 
acal.mean(0)[newaxis] - acal = acal - mn_a - atrue = atrue - mn_a - mn_b = bcal.mean(0)[newaxis] - bcal = bcal - mn_b - btrue = btrue - mn_b - if metric!=None: - acal = dot(acal, metric) - if index_out: - yield acal, atrue, bcal, btrue, out - else: - yield acal, atrue, bcal, btrue + inn = [i for i in index if i not in out] + acal = a.take(inn, 0) + atrue = a.take(out, 0) + bcal = b.take(inn, 0) + btrue = b.take(out, 0) + if center: + mn_a = acal.mean(0)[newaxis] + acal = acal - mn_a + atrue = atrue - mn_a + mn_b = bcal.mean(0)[newaxis] + bcal = bcal - mn_b + btrue = btrue - mn_b + if metric!=None: + acal = dot(acal, metric) + if index_out: + yield acal, atrue, bcal, btrue, out + else: + yield acal, atrue, bcal, btrue def pca_gen(a, n_sets=None, center=False, index_out=False, axis=0, metric=None): @@ -151,6 +152,7 @@ def shuffle_1d_block(a, n_sets=None, blocks=None, index_out=False, axis=0): index = arange(m) dummy = map(random.shuffle, array_split(index, blocks)) a_out = a.take(index, axis) + if index_out: yield a_out, index else: @@ -164,7 +166,8 @@ def shuffle_1d(a, n_sets, axis=0): m = a.shape[axis] for ii in xrange(n_sets): index = randperm(m) - yield a.take(index, axis) + a = a.take(index, axis) + yield a def diag_pert(a, n_sets=10, center=True, index_out=False): """Alter generator returning sets perturbed with means at diagonals. @@ -205,18 +208,17 @@ def diag_pert(a, n_sets=10, center=True, index_out=False): else: yield a_out - + def outerprod_centering(aat, ret_mn=True): - """Returns mean centered symmetric outerproduct matrix. + """Returns double centered symmetric outerproduct matrix. """ - n = aat.shape[0] - h = aat.sum(0)[:,newaxis] - h = (h - mean(h)/2)/n - mn_a = h + h.T + h = aat.mean(0)[newaxis] + h = h - 0.5*h.mean() + mn_a = h + h.T # beauty of broadcasting aatc = aat - mn_a if ret_mn: - return aatc, aat.mean(0) - return aat - mn_a + return aatc, mn_a + return aatc diff --git a/fluents/lib/validation.py b/fluents/lib/validation.py index 4d6cfe1..7dba90d 100644 --- a/fluents/lib/validation.py +++ b/fluents/lib/validation.py @@ -12,11 +12,47 @@ from cx_utils import m_shape def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'): """Returns rmsep and aopt for pls tailored for wide X. + The root mean square error of cross validation is calculated + based on random block cross-validation. With number of blocks equal to + number of samples [default] gives leave-one-out cv. + The pls model is based on the simpls algorithm for wide X. 
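A hedged sketch of how the patched pls_gen generator is consumed (engine call as defined in engines.pls; data shapes hypothetical):

    from scipy import rand

    X, Y = rand(12, 8), rand(12, 2)
    for acal, atrue, bcal, btrue in pls_gen(X, Y, n_blocks=4, center=True):
        dat = pls(acal, bcal, aopt=2)    # fit on the calibration block
        # predict atrue/btrue here and accumulate PRESS per component
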
-    comments:
-    -- X, Y inputs need to be centered (fixme: check)
+    :Parameters:
+        X : ndarray
+            Column-centered data matrix of size (samples x variables)
+        Y : ndarray
+            Column-centered response matrix of size (samples x responses)
+        amax : scalar
+            Maximum number of components
+        n_blocks : scalar
+            Number of blocks in cross-validation
+
+    :Returns:
+        rmsep : ndarray
+            Root mean square error of cross-validated predictions
+        aopt : scalar
+            Estimate of the optimal number of components
+    :SeeAlso:
+        - pls_cv_val : Same output, not optimised for wide X
+        - w_simpls : Simpls algorithm for wide X
+
+    Notes
+    -----
+    Based (cowardly translated) on m-files from the Chemoact toolbox.
+    X, Y inputs need to be centered (fixme: check)
+
+    Examples
+    --------
+
+    >>> import numpy as n
+    >>> X = n.random.rand(4, 3)
+    >>> Y = n.random.rand(4, 1)
+    >>> rmsep, yhat, aopt = w_pls_cv_val(X, Y, amax=2, n_blocks=4)
     """
+
     k, l = m_shape(Y)
     PRESS = zeros((l, amax+1), dtype='f')
     if n_blocks==None:
         n_blocks = Y.shape[0]
@@ -30,7 +66,7 @@
         if algo=='simpls':
             dat = w_simpls(Din, Yin, amax)
             Q, U, H = dat['Q'], dat['U'], dat['H']
-            That = dot(Doi, dot(U, inv(triu(dot(H.T,U))) ))
+            That = dot(Doi, dot(U, inv(triu(dot(H.T, U))) ))
         else:
             raise NotImplementedError

@@ -40,21 +76,13 @@
             E = Yout[:,j][:,newaxis] - TQ
             E = E + sum(E, 0)/Din.shape[0]
             PRESS[j,1:] = PRESS[j,1:] + sum(E**2, 0)
-    #Yhat = Y - dot(That,Q.T)
+    Yhat = Y - dot(That,Q.T)
     rmsep = sqrt(PRESS/Y.shape[0])
     aopt = find_aopt_from_sep(rmsep)
-    return rmsep, aopt
+    return rmsep, Yhat, aopt

 def pls_val(X, Y, amax=2, n_blocks=10, algo='pls', metric=None):
-    """ Validation results of pls model.
-
-
-
-    comments:
-    -- X, Y inputs need to be centered (fixme: check)
-
-
-    """
+
     k, l = m_shape(Y)
     PRESS = zeros((l, amax+1), dtype='
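As the w_pls_cv_val docstring notes, inputs must be column centered before validation; a short sketch using the mat_center helper from cx_utils (random data for illustration only):

    from scipy import rand
    from cx_utils import mat_center

    X = mat_center(rand(10, 50))    # mean center columns (axis=0 default)
    Y = mat_center(rand(10, 2))
    # X, Y are now suitable inputs for w_pls_cv_val / pls_val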