Lib updates

2007-07-23 17:33:21 +00:00
parent 7ea87e646a
commit a05d0faa0d
9 changed files with 937 additions and 166 deletions
--- a/fluents/lib/blmfuncs.py
+++ b/fluents/lib/blmfuncs.py
@@ -10,7 +10,7 @@ from fluents.workflow import Function, OptionsDialog, Options
 from fluents.dataset import Dataset
 from fluents import plots, dataset, workflow, logger
 import scipy
-from engines import pca, pls
+from engines import pca, pls, nipals_lpls
 from cx_stats import leverage, variances, hotelling
 from cx_utils import mat_center
 from validation import *
@@ -238,14 +238,14 @@ class PLS(Model):
        """Estimates cut off on significant vars by controlling fdr."""
        if self._options['calc_qvals']==True:
-            qvals_sorted, qvals = pls_qvals(a, b,
+            qvals = pls_qvals(a, b,
-                                            aopt=None,
+                              aopt=None,
-                                            alpha=reg,
+                              alpha=reg,
-                                            n_iter=n_iter,
+                              n_iter=n_iter,
-                                            algo='pls',
+                              algo='pls',
-                                            sim_method=sim_method)
+                              sim_method=sim_method)
            self.model['qval'] = qvals
-            self.model['qval_sorted'] = qvals_sorted
+            #self.model['qval_sorted'] = qvals_sorted
        else:
           self.model['qval'] = None
           self.model['qval_sorted'] = None 
@@ -276,18 +276,19 @@ class PLS(Model):
        pc_ids_opt = ['_comp', map(str, range(self.model['aopt']))]
        zero_dim = ['_doe',['0']] # null dim, vector (hidden)
-        match_ids = {'E':[ids_0, ids_1],
+        match_ids = {'E' : [ids_0, ids_1],
-                     'P':[ids_1, pc_ids],
+                     'P' : [ids_1, pc_ids],
-                     'T':[ids_0, pc_ids],
+                     'T' : [ids_0, pc_ids],
-                     'W': [ids_1, pc_ids],
+                     'W' : [ids_1, pc_ids],
-                     'R': [ids_1, pc_ids],
+                     'R' : [ids_1, pc_ids],
-                     'Q':[ids_3, pc_ids],
+                     'Q' : [ids_3, pc_ids],
-                     'F':[ids_0, ids_3],
+                     'F' : [ids_0, ids_3],
-                     'B':[ids_1, ids_3],
+                     'B' : [ids_1, ids_3],
-                     'qval':[ids_1, zero_dim],
+                     'qval' : [ids_1, zero_dim],
                     'qval_sorted':[ids_1, zero_dim],
-                     'w_tsq':[ids_1, zero_dim],
+                     'w_tsq' : [ids_1, zero_dim],
-                     'rmsep':[ids_3, pc_ids],
+                     'rmsep' : [ids_3, pc_ids],
                     'CP': [ids_1, pc_ids]
                     }
        array = self.model[name]     
@@ -302,7 +303,7 @@ class PLS(Model):
            #except:
            #    logger.log('debug', 'Plot: %s failed' %plt)
        return out
-            
+    
    def run_o(self, a, b):
        """Run PLS with present options."""
        options = self._options
@@ -330,6 +331,17 @@ class PLS(Model):
        self.model['var_y'] = var_y
        self.model['exp_var_y'] = exp_var_y
        if options['calc_corrloads']:
            corr_load = scipy.empty_like(self.model['P'].copy())
            T = self.model['T']
            X = self._data['X']
            # For each variable/attribute in original matrix (not meancentered)
            for i,score in enumerate(T.T):
                for j, profile in enumerate(X.T):
                    corrs = scipy.corrcoef(score, profile)
                    corr_load[j,i] = corrs[0,1]
            self.model['CP'] = corr_load
        if options['calc_conf']:
            self.confidence(**options.confidence_options())
@@ -353,6 +365,141 @@ class PLS(Model):
            #run with current data and options
            return self.run_o(a, b)
 class LPLS(Model):
     """L-shaped PLS model over three data blocks X, Y and Z.

     ``run`` stores the three input datasets (and their arrays), optionally
     cross-validates, fits the model with the configured engine, optionally
     computes confidence measures and returns the requested datasets/plots.
     """
     def __init__(self, id='lpls', name='LPLS'):
         Model.__init__(self, id, name)
         self._options = LplsOptions()

     def _input(self, key):
         """Return the input matrix stored under key ('X', 'Y' or 'Z').

         A matrix held in self.model takes precedence (the original lookup);
         otherwise the raw array that run() placed in self._data is used, so
         the lookup also works before make_model() has filled self.model.
         """
         if key in self.model:
             return self.model[key]
         return self._data[key]

     def validation(self, opt):
         """Stores rmsep and aopt for the lpls model in self.model.

         With opt['calc_cv'] set, opt['val_engine'] is called to estimate
         both; otherwise rmsep is None and aopt is taken from the options.
         """
         if opt['calc_cv']==True:
             val_engine = opt['val_engine']
             rmsep, aopt = val_engine(self._input('X'), self._input('Y'),
                                      self._input('Z'), opt['amax'],
                                      opt['n_sets'], opt['xz_alpha'])
             self.model['rmsep'] = rmsep
             self.model['aopt'] = aopt
         else:
             self.model['rmsep'] = None
             self.model['aopt'] = opt['aopt']

     def confidence(self, opt):
         """Stores a confidence measure for model parameters.

         Supported parameters: W (stored as 'tsqx') and L (stored as 'tsqz').
         Both are set to None when opt['calc_conf'] is false.
         """
         aopt = self.model['aopt']
         if opt['calc_conf']:
             # the number of jackknife segments comes from the options; the
             # bare name n_sets was never defined in this scope
             Wx, Wz = lpls_jk(self._input('X'), self._input('Y'),
                              self._input('Z'), aopt, opt['n_sets'])
             Wcal = self.model['W'][:,:aopt]
             Lcal = self.model['L'][:,:aopt]
             # ensure that Wcal is scaled
             tnorm = scipy.apply_along_axis(norm, 0, self.model['T'][:,:aopt])
             Wcal = Wcal*tnorm
             a,b,c,d,e = opt['p_center'], opt['crot'], opt['alpha'], opt['strict'], opt['cov_center']
             self.model['tsqx'] = hotelling(Wx, Wcal, a,b,c,d,e)
             self.model['tsqz'] = hotelling(Wz, Lcal, a,b,c,d,e)
         else:
             self.model['tsqx'] = None
             self.model['tsqz'] = None

     def permutation_confidence(self, opt):
         """Estimates cut off on significant vars by controlling fdr.

         Not implemented for this model: both q-value entries are set to None.
         """
         self.model['qval'] = None
         self.model['qval_sorted'] = None

     def make_model(self, opt):
         """Make model on amax components and merge the result into self.model.
         """
         engine = opt['engine']
         dat = engine(self._data['X'], self._data['Y'], self._data['Z'],
                      opt['amax'], opt['xz_alpha'], opt['center_mth'],
                      opt['mode'], opt['scale'], False)
         self.model.update(dat)

     def as_dataset(self, name, dtype='Dataset'):
         """Return any model parameter as Dataset

         No ids matching. Returns None when name is not a model parameter.
         """
         if name not in self.model:
             return
         DX, DY, DZ = self._dataset['X'], self._dataset['Y'], self._dataset['Z']
         dim_name_0, dim_name_1 = DX.get_dim_name()
         _, dim_name_3 = DY.get_dim_name()
         dim_name_4, _ = DZ.get_dim_name()
         # samples
         ids_0 = [dim_name_0, DX.get_identifiers(dim_name_0, sorted=True)]
         # x vars (genes)
         ids_1 = [dim_name_1, DX.get_identifiers(dim_name_1, sorted=True)]
         # y vars (sample descriptors)
         ids_3 = [dim_name_3, DY.get_identifiers(dim_name_3, sorted=True)]
         # z-vars (variable descriptors)
         ids_4 = [dim_name_4, DZ.get_identifiers(dim_name_4, sorted=True)]
         # components (hidden)
         pc_ids = ['_comp', [str(i) for i in range(self._options['amax'])]]
         zero_dim = ['_doe',['0']] # null dim, vector (hidden)
         match_ids = {'E' : [ids_0, ids_1],
                      'P' : [ids_1, pc_ids],
                      'T' : [ids_0, pc_ids],
                      'W' : [ids_1, pc_ids],
                      'L' : [ids_4, pc_ids],
                      'Q' : [ids_3, pc_ids],
                      'F' : [ids_0, ids_3],
                      'B' : [ids_1, ids_3],
                      'tsqx' : [ids_1, zero_dim],
                      'tsqz' : [ids_4, zero_dim],
                      'K' : [ids_1, pc_ids],
                      'rmsep' : [ids_3, pc_ids]
                      }
         array = self.model[name]
         return Dataset(array, identifiers=match_ids[name], name=name)

     def get_out_plots(self, options):
         """Instantiate every plot class listed in options['out_plots']."""
         return [plt(self) for plt in options['out_plots']]

     def run(self, a, b, c):
         """Run L-PLS with present options.

         a, b, c are stored as the X, Y and Z datasets. Returns a list with
         the datasets named in options['out_data'] followed by the plots.
         """
         options = self._options
         self._dataset['X'] = a
         self._dataset['Y'] = b
         self._dataset['Z'] = c
         self._data['X'] = a.asarray()
         self._data['Y'] = b.asarray()
         self._data['Z'] = c.asarray()
         self.validation(options)
         self.make_model(options)
         if options['calc_conf']:
             self.confidence(options)
         out = [self.as_dataset(p) for p in options['out_data']]
         out.extend(self.get_out_plots(options))
         return out

     def run_gui(self, a, b, c):
         """Run LPLS with option gui.

         Returns the result of run() when the dialog is confirmed,
         otherwise None.
         """
         # the dialog class in this module is spelled LplsOptionsDialog
         dialog = LplsOptionsDialog([a, b, c], self._options)
         dialog.show_all()
         response = dialog.run()
         dialog.hide()
         if response == gtk.RESPONSE_OK:
             # set output data and plots
             dialog.set_output()
             #run with current data and options
             return self.run(a, b, c)
 class PcaOptions(Options):
    """Options for Principal Component Analysis.
@@ -403,7 +550,9 @@ class PcaOptions(Options):
                            ]
        opt['out_data'] = ['T','P', 'p_tsq']
-        opt['out_plots'] = [blmplots.PcaScorePlot,blmplots.PcaLoadingPlot,blmplots.LineViewXc]
+        opt['out_plots'] = [blmplots.PcaScorePlot,
                            blmplots.PcaLoadingPlot,
                            blmplots.LineViewXc]
        self.update(opt)
@@ -444,6 +593,7 @@ class PlsOptions(Options):
        opt['center_mth'] = mat_center
        opt['scale'] = 'scores'
        opt['calc_corrloads'] = True
        opt['calc_conf'] = False
        opt['n_sets'] = 5
        opt['strict'] = True
@@ -468,7 +618,8 @@ class PlsOptions(Options):
                            (blmplots.PlsLoadingPlot, 'Loadings', True),
                            (blmplots.LineViewXc, 'Line view', True),
                            (blmplots.PredictionErrorPlot, 'Residual Error', False),
-                            (blmplots.RMSEPPlot, 'RMSEP', False)
+                            (blmplots.RMSEPPlot, 'RMSEP', False),
                            (blmplots.PlsCorrelationLoadingPlot, 'Corr. loadings', True)
                            ]
        opt['out_data'] = ['T','P', 'p_tsq']
@@ -494,14 +645,87 @@ class PlsOptions(Options):
                    'strict', 'crot', 'cov_center'] 
        return self._copy_from_list(opt_list)  
    def permutation_confidence(self):
        """Options for the permutation (q-value) test, copied by name."""
        return self._copy_from_list(['q_pert_method', 'q_iter'])
 class LplsOptions(Options):
     """Options for L-shaped Partial Least Squares Regression.

     Holds the defaults for the LPLS model and exposes named subsets of
     them for the individual processing steps.
     """
     def __init__(self):
         Options.__init__(self)
         self._set_default()

     def _set_default(self):
         """Populate the option set with its default values."""
         opt = {}
         opt['engine'] = nipals_lpls
         opt['mode'] = 'normal' # how much info to calculate
         opt['amax'] = 10
         opt['aopt'] = 9
         opt['xz_alpha'] = .5
         opt['auto_aopt'] = False
         opt['center'] = True
         opt['center_mth'] = [2, 0, 1]
         opt['scale'] = 'scores'
         opt['calc_conf'] = False
         opt['n_sets'] = 7
         opt['strict'] = False
         opt['p_center'] = 'med'
         opt['alpha'] = .3
         opt['cov_center'] = 'med'
         opt['crot'] = True

         opt['calc_cv'] = False
         opt['cv_val_method'] = 'random'
         opt['cv_val_sets'] = opt['n_sets']

         # (model key, label, flag)
         # The keys must be ones LPLS.as_dataset can resolve: X-weights are
         # stored as 'W', Z-weights as 'L' and the X confidence as 'tsqx'.
         opt['all_data'] = [('T', 'scores', True),
                            ('W', 'X-weights', True),
                            ('L', 'Z-weights', True),
                            ('E','residuals', False),
                            ('tsqx', 't2X', False),
                            ('rmsep', 'RMSEP', False)
                            ]

         # (plot class, label, flag)
         opt['all_plots'] = [(blmplots.PlsScorePlot, 'Scores', True),
                             (blmplots.PlsLoadingPlot, 'Loadings', True),
                             (blmplots.LineViewXc, 'Line view', True),
                             (blmplots.PredictionErrorPlot, 'Residual Error', False),
                             (blmplots.RMSEPPlot, 'RMSEP', False),
                             (blmplots.LplsHypoidCorrelationPlot, 'Hypoid corr.', False)
                             ]

         opt['out_data'] = ['T','P']
         opt['out_plots'] = [blmplots.PlsScorePlot,
                             blmplots.PlsLoadingPlot,
                             blmplots.LineViewXc]

         opt['pack'] = False
         opt['calc_qvals'] = False
         opt['q_pert_method'] = 'shuffle_rows'
         opt['q_iter'] = 20

         self.update(opt)

     def make_model_options(self):
         """Options for make_model method."""
         opt_list = ['scale','mode', 'amax', 'engine']
         return self._copy_from_list(opt_list)

     def confidence_options(self):
         """Options for confidence method."""
         opt_list = ['n_sets', 'aopt', 'alpha', 'p_center',
                     'strict', 'crot', 'cov_center']
         return self._copy_from_list(opt_list)

     def validation_options(self):
         """Options for validation method."""
         opt_list = ['amax', 'n_sets', 'cv_val_method']
         return self._copy_from_list(opt_list)

     def permutation_confidence(self):
         """Options for permutation_confidence method."""
         opt_list = ['q_pert_method', 'q_iter']
         return self._copy_from_list(opt_list)
 class PcaOptionsDialog(OptionsDialog):
    """Options dialog for Principal Component Analysis.
@@ -716,6 +940,210 @@ class PcaOptionsDialog(OptionsDialog):
            self._options['strict'] = True 
 class LplsOptionsDialog(OptionsDialog):
     """Options dialog for L-shaped Partial Least squares regression.

     Loads the options page from 'lpls_options.glade', wires its signals
     to the handlers below and keeps self._options in sync with the widgets.
     """
     def __init__(self, data, options, input_names=['X', 'Y', 'Z']):
         OptionsDialog.__init__(self, data, options, input_names)
         glade_file = os.path.join(fluents.DATADIR, 'lpls_options.glade')

         notebook_name = "vbox1"
         page_name = "Options"
         self.add_page_from_glade(glade_file, notebook_name, page_name)

         # connect signals to handlers
         dic = {"on_amax_value_changed" : self.on_amax_changed,
                "on_aopt_value_changed" : self.on_aopt_changed,
                "auto_aopt_toggled" : self.auto_aopt_toggled,
                "center_toggled" : self.center_toggled,
                #"on_scale_changed" : self.on_scale_changed,
                "on_val_none" : self.val_toggled,
                "on_val_cv" : self.cv_toggled,
                "on_cv_method_changed" : self.on_cv_method_changed,
                "on_cv_sets_changed" : self.on_cv_sets_changed,
                "on_conf_toggled" : self.conf_toggled,
                "on_subset_loc_changed" : self.on_subset_loc_changed,
                "on_cov_loc_changed" : self.on_cov_loc_changed,
                "on_alpha_changed" : self.on_alpha_changed,
                "on_rot_changed" : self.on_rot_changed,
                "on__toggled" : self.conf_toggled,
                "on_qval_changed" : self.on_qval_changed,
                "on_iter_changed" : self.on_iter_changed
                }
         self.wTree.signal_autoconnect(dic)

         # set/ensure valid default values/ranges
         #
         amax_sb = self.wTree.get_widget("amax_spinbutton")
         max_comp = min(data[0].shape) # max num of components
         if self._options['amax']>max_comp:
             logger.log('debug', 'amax default too large ... adjusting')
             self._options['amax'] = max_comp
         amax_sb.get_adjustment().set_all(self._options['amax'], 1, max_comp, 1, 0, 0)

         # aopt spin button: aopt may never exceed amax
         aopt_sb = self.wTree.get_widget("aopt_spinbutton")
         if self._options['aopt']>self._options['amax']:
             self._options['aopt'] = self._options['amax']
         aopt_sb.get_adjustment().set_all(self._options['aopt'], 1, self._options['amax'], 1, 0, 0)

         # scale
         # scale_cb = self.wTree.get_widget("scale_combobox")
         # scale_cb.set_active(0)

         # validation frames
         if not self._options['calc_cv']:
             self.wTree.get_widget("cv_frame").set_sensitive(False)
         self.wTree.get_widget("cv_method").set_active(0)

         # confidence
         self.wTree.get_widget("subset_expander").set_sensitive(
             self._options['calc_conf']==True)

         cb = self.wTree.get_widget("subset_loc")
         subset_index = {'med': 0, 'mean': 1, 'full_model': 2}
         cb.set_active(subset_index.get(self._options['p_center']))

         cb = self.wTree.get_widget("cov_loc")
         cov_index = {'med': 0, 'mean': 1}
         cb.set_active(cov_index.get(self._options['cov_center']))

         hs = self.wTree.get_widget("alpha_scale")
         hs.set_value(self._options['alpha'])

         tb = self.wTree.get_widget("qvals")
         tb.set_sensitive(True)

     def on_amax_changed(self, sb):
         """Store the new amax and clamp aopt (and its spin range) to it."""
         amax = sb.get_value_as_int()
         logger.log("debug", "amax changed: new value: %s" %amax)
         # update aopt if needed
         if amax<self._options['aopt']:
             self._options['aopt'] = amax
         aopt_sb = self.wTree.get_widget("aopt_spinbutton")
         aopt_sb.get_adjustment().set_all(self._options['aopt'], 1, amax, 1, 0, 0)
         self._options['amax'] = amax

     def on_aopt_changed(self, sb):
         """Store the chosen number of components."""
         self._options['aopt'] = sb.get_value_as_int()

     def auto_aopt_toggled(self, tb):
         """Toggle automatic aopt; the manual spin button is disabled then."""
         aopt_sb = self.wTree.get_widget("aopt_spinbutton")
         if tb.get_active():
             self._options['auto_aopt'] = True
             aopt_sb.set_sensitive(False)
         else:
             self._options['auto_aopt'] = False
             aopt_sb.set_sensitive(True)

     def center_toggled(self, tb):
         """Store whether centering is requested."""
         if tb.get_active():
             self._options['center'] = True
         else:
             logger.log("debug", "centering set to False")
             self._options['center'] = False

     #def on_scale_changed(self, cb):
     #    scale = cb.get_active_text()
     #    if scale=='Scores':
     #        self._options['scale'] = 'scores'
     #    elif scale=='Loadings':
     #        self._options['scale'] = 'loads'
     #    else:
     #        raise IOError

     def val_toggled(self, tb):
         """Callback for validation: None. """
         cv_frame = self.wTree.get_widget("cv_frame")
         cv_tb = self.wTree.get_widget("cv_toggle")
         if tb.get_active():
             self._options['calc_cv'] = False
             cv_frame.set_sensitive(False)
             cv_tb.set_sensitive(False)
         else:
             cv_tb.set_sensitive(True)
             if cv_tb.get_active():
                 cv_frame.set_sensitive(True)
                 self._options['calc_cv'] = True

     def cv_toggled(self, tb):
         """Callback for validation: cross-validation."""
         cv_frame = self.wTree.get_widget("cv_frame")
         active = tb.get_active()
         cv_frame.set_sensitive(bool(active))
         self._options['calc_cv'] = bool(active)

     def on_cv_method_changed(self, cb):
         """Store the cross-validation method; only 'Random' is handled."""
         method = cb.get_active_text()
         if method == 'Random':
             self._options['cv_val_method'] = 'random'

     def on_cv_sets_changed(self, sb):
         """Store the number of cross-validation sets."""
         self._options['cv_val_sets'] = sb.get_value_as_int()

     def conf_toggled(self, tb):
         """Enable/disable the confidence calculation.

         NOTE(review): an *active* button switches confidence off, so this
         is presumably wired to a 'none' toggle -- confirm against the
         glade file.
         """
         if tb.get_active():
             self._options['calc_conf'] = False
             self.wTree.get_widget("subset_expander").set_sensitive(False)
         else:
             self._options['calc_conf'] = True
             self.wTree.get_widget("subset_expander").set_sensitive(True)

     def on_subset_loc_changed(self, cb):
         """Store the location estimator for the sub models."""
         method = cb.get_active_text()
         if method=='Full model':
             self._options['p_center'] = 'full_model'
         elif method=='Median':
             self._options['p_center'] = 'med'
         elif method=='Mean':
             self._options['p_center'] = 'mean'

     def on_cov_loc_changed(self, cb):
         """Store the location estimator for the covariance."""
         method = cb.get_active_text()
         if method=='Median':
             self._options['cov_center'] = 'med'
         elif method=='Mean':
             self._options['cov_center'] = 'mean'

     def on_alpha_changed(self, hs):
         """Store the regularisation value from the alpha scale."""
         self._options['alpha'] = hs.get_value()

     def on_rot_changed(self, rg):
         """Store the rotation mode chosen in the (proc, strict) group."""
         proc, strict = rg
         self._options['crot'] = True
         # 'strict' must be cleared again when the first choice is selected,
         # otherwise it stays True forever once the other one was picked
         self._options['strict'] = not proc.get_active()

     def qval_toggled(self, tb):
         """Enable/disable the q-value calculation and its widgets."""
         # the option key used by the models and the defaults is 'calc_qvals'
         enabled = not tb.get_active()
         self._options['calc_qvals'] = enabled
         self.wTree.get_widget("qval_method").set_sensitive(enabled)
         self.wTree.get_widget("q_iter").set_sensitive(enabled)

     def on_iter_changed(self, sb):
         """Store the number of permutations as an integer count."""
         self._options['q_iter'] = sb.get_value_as_int()

     def on_qval_changed(self, cb):
         """Store the permutation method; only 'Shuffle rows' is handled."""
         q_method = cb.get_active_text()
         if q_method=='Shuffle rows':
             self._options['q_pert_method'] = 'shuffle'
 class PlsOptionsDialog(OptionsDialog):
    """Options dialog for Partial Least squares regression.
    """
@@ -918,5 +1346,3 @@ class PlsOptionsDialog(OptionsDialog):
        q_method = cb.get_active_text()
        if method=='Shuffle rows':
            self._options['q_pert_method'] = 'shuffle'
--- a/fluents/lib/blmplots.py
+++ b/fluents/lib/blmplots.py
@@ -190,7 +190,12 @@ class PlsCorrelationLoadingPlot(BlmScatterPlot):
        title = "Pls correlation loadings (%s)" %model._dataset['X'].get_name()
        BlmScatterPlot.__init__(self, title, model, absi, ordi, part_name='CP')
-        
+
 class LplsHypoidCorrelationPlot(BlmScatterPlot):
     """Scatter plot of the model part 'W', titled with the X dataset name."""
     def __init__(self, model, absi=0, ordi=1):
         x_name = model._dataset['X'].get_name()
         BlmScatterPlot.__init__(self, "Hypoid correlations(%s)" %x_name,
                                 model, absi, ordi, part_name='W')
 class LineViewXc(plots.LineViewPlot):
    """A line view of centered raw data
    """
@@ -214,8 +219,8 @@ class PlsQvalScatter(plots.ScatterPlot):
    def __init__(self, model, pc=0):
        if not model.model.has_key('w_tsq'):
            return None
-        self._W = model.model['P']
+        self._W = model.model['W']
-        dataset_1 = model.as_dataset('P')
+        dataset_1 = model.as_dataset('W')
        dataset_2 = model.as_dataset('w_tsq')
        id_dim = dataset_1.get_dim_name(0) #genes
        sel_dim = dataset_1.get_dim_name(1) #_comp
--- a/fluents/lib/cx_stats.py
+++ b/fluents/lib/cx_stats.py
@@ -115,6 +115,7 @@ def expl_var_y(Y, T, Q):
 def pls_qvals(a, b, aopt=None, alpha=.3,
              n_iter=20, algo='pls',
              center=True,
              sim_method='shuffle',
              p_center='med', cov_center='med',
              crot=True, strict=False, metric=None):
@@ -122,8 +123,98 @@ def pls_qvals(a, b, aopt=None, alpha=.3,
    """Returns qvals for pls model.
    input:
-    a -- centered data matrix
+    a -- data matrix
-    b -- centered data matrix
+    b -- data matrix
    aopt -- scalar, opt. number of components
    alpha -- [0,1] regularisation parameter for T2-test
    n_iter -- number of permutations
    sim_method -- permutation method ['shuffle']
    p_center -- location estimator for sub models ['med']
    cov_center -- location estimator for covariance of submodels ['med']
    crot -- bool, use rotations of sub models?
    strict -- bool, use stict (rot/flips only) rotations?
    metric -- bool, use row metric?
    """
    m, n = a.shape
    TSQ = zeros((n, n_iter), dtype='d') # (nvars x n_subsets)
    n_false = zeros((n, n_iter), dtype='d')
    #full model
    if center:
        ac = a - a.mean(0)
        bc = b - b.mean(0)
    if metric!=None:
        ac = dot(ac, metric)
    if algo=='bridge':
        dat = bridge(ac, bc, aopt, 'loads', 'fast')
    else:
        dat = pls(ac, bc, aopt, 'loads', 'fast')
    Wcv = pls_jkW(a, b, aopt, n_blocks=None, algo=algo, metric=metric, center=True)
    tsq_full = hotelling(Wcv, dat['W'], p_center=p_center,
                         alpha=alpha, crot=crot, strict=strict,
                         cov_center=cov_center)
    t0 = time.time()
    Vs = shuffle_1d(bc, n_iter, axis=0)
    for i, b_shuff in enumerate(Vs):
        t1 = time.time()
        if algo=='bridge':
            dat = bridge(ac, b_shuff, aopt, 'loads','fast')
        else:
            dat = pls(ac, b_shuff, aopt, 'loads', 'fast')
        Wcv = pls_jkW(a, b_shuff, aopt, n_blocks=None, algo=algo, metric=metric)
        TSQ[:,i] = hotelling(Wcv, dat['W'], p_center=p_center,
                             alpha=alpha, crot=crot, strict=strict,
                             cov_center=cov_center)
        print time.time() - t1
    sort_index = argsort(tsq_full)[::-1]
    back_sort_index = sort_index.argsort()
    print time.time() - t0
    # count false positives
    tsq_full_sorted = tsq_full.take(sort_index)
    for i in xrange(n_iter):
        for j in xrange(n):
            n_false[j,i] = sum(TSQ[:,i]>=tsq_full[j]) # number of false pos. genes (0-n)
    false_pos = median(n_false, 1)
    ll = arange(1, len(false_pos)+1, 1)
    sort_qval = false_pos.take(sort_index)/ll
    qval = false_pos/ll.take(back_sort_index)
    print time.time() - t0
    #return qval, false_pos, TSQ, tsq_full
    return qval
 def ensure_strict(C, only_flips=True):
     """Ensure that a rotation matrix does only 90 degree rotations.

     In multiplication with pcs this allows flips and reordering.
     If only_flips is True only flips are allowed: the result is the
     identity carrying the signs of C on its diagonal.

     Otherwise entries of C with absolute value above .6 are set to one
     (all others to zero) and the signs of C are re-applied. A ValueError
     is raised when the determinant of that 0/1 matrix exceeds 1.
     """
     S = sign(C) # signs
     if only_flips:
         return eye(C.shape[0])*S
     Cm = zeros_like(C)
     # boolean-mask assignment replaces the long-removed ndarray.putmask
     Cm[abs(C)>.6] = 1.
     if det(Cm)>1:
         raise ValueError("Implement this!")
     return Cm*S
 def pls_qvals_II(a, b, aopt=None, center=True, alpha=.3,
                 n_iter=20, algo='pls',
                 sim_method='shuffle',
                 p_center='med', cov_center='med',
                 crot=True, strict=False, metric=None):
    """Returns qvals for pls model.
    Shuffling of variables in X is preprocessed in metric.
    Null model is 'If I put genes randomly on network' ... if they are sign:
    then this is due to network structure and not covariance with response.
    input:
    a -- data matrix
    b -- data matrix
    aopt -- scalar, opt. number of components
    alpha -- [0,1] regularisation parameter for T2-test
    n_iter -- number of permutations
@@ -140,25 +231,33 @@ def pls_qvals(a, b, aopt=None, alpha=.3,
    n_false = zeros((n, n_iter), dtype='<f8')
    #full model
-    if metric!=None:
+
-        a = dot(a, metric)
+    # center?
    if center==True:
        ac = a - a.mean(0)
        bc = b - b.mean(0)
    if metric==None:
        metric = eye(n,n)
    if algo=='bridge':
-        dat = bridge(a, b, aopt, 'loads', 'fast')
+        dat = bridge(ac, bc, aopt, 'loads', 'fast')
    else:
-        dat = pls(a, b, aopt, 'loads', 'fast')
+        dat = pls(ac, bc, aopt, 'loads', 'fast')
    Wcv = pls_jkW(a, b, aopt, n_blocks=None, algo=algo, metric=metric)
    tsq_full = hotelling(Wcv, dat['W'], p_center=p_center,
                         alpha=alpha, crot=crot, strict=strict,
                         cov_center=cov_center)
    t0 = time.time()
-    Vs = shuffle_1d(b, n_iter)
+    Vs = shuffle_1d(a, n_iter, 1)
-    for i, b_shuff in enumerate(Vs):
+    for i, a_shuff in enumerate(Vs):
        t1 = time.time()
        a = a_shuff - a_shuff.mean(0)
        a = dot(a, metric)
        if algo=='bridge':
-            dat = bridge(a, b_shuff, aopt, 'loads','fast')
+            dat = bridge(a, b, aopt, 'loads','fast')
        else:
            dat = pls(a, b, aopt, 'loads', 'fast')
-        Wcv = pls_jkW(a, b_shuff, aopt, n_blocks=None, algo=algo, metric=metric)
+        Wcv = pls_jkW(a, b, aopt, n_blocks=None, algo=algo, metric=metric)
        TSQ[:,i] = hotelling(Wcv, dat['W'], p_center=p_center,
                             alpha=alpha, crot=crot, strict=strict,
                             cov_center=cov_center)
@@ -177,24 +276,8 @@ def pls_qvals(a, b, aopt=None, alpha=.3,
    sort_qval = false_pos.take(sort_index)/ll
    qval = false_pos/ll.take(back_sort_index)
    print time.time() - t0
-    return qval, false_pos, TSQ, tsq_full
+    #return qval, false_pos, TSQ, tsq_full
-
+    return qval
 def ensure_strict(C, only_flips=True):
     """Ensure that a rotation matrix does only 90 degree rotations.

     In multiplication with pcs this allows flips and reordering.
     If only_flips is True only flips are allowed: the result is the
     identity carrying the signs of C on its diagonal.

     Otherwise entries of C with absolute value above .6 are set to one
     (all others to zero) and the signs of C are re-applied. A ValueError
     is raised when the determinant of that 0/1 matrix exceeds 1.
     """
     S = sign(C) # signs
     if only_flips:
         return eye(C.shape[0])*S
     Cm = zeros_like(C)
     # boolean-mask assignment replaces the long-removed ndarray.putmask
     Cm[abs(C)>.6] = 1.
     if det(Cm)>1:
         raise ValueError("Implement this!")
     return Cm*S
 def leverage(aopt=1,*args):
    """Returns leverages
@@ -253,3 +336,10 @@ def ssq(E, axis=0, weights=None):
        raise NotImplementedError, "Higher order modes not supported"
    return pow(Ew,2).sum(axis)
 def vnorm(x):
     """Euclidean norm of a vector.

     Computed as the square root of the dot product of x with its
     conjugate; the original author notes this is considerably faster
     than linalg.norm.
     """
     squared_length = dot(x, x.conj())
     return sqrt(squared_length)
--- a/fluents/lib/cx_utils.py
+++ b/fluents/lib/cx_utils.py
@@ -1,23 +1,25 @@
 from scipy import apply_along_axis,newaxis,zeros,\
     median,round_,nonzero,dot,argmax,any,sqrt,ndarray,\
     trace,zeros_like,sign,sort,real,argsort,rand,array,\
-     matrix
+     matrix,nan
 from scipy.linalg import norm,svd,inv,eig
 from scipy.stats import median,mean
-def normalise(a,axis=0,return_scales=False):
+def normalise(a, axis=0, return_scales=False):
-	s = apply_along_axis(norm,axis,a)
+    s = apply_along_axis(norm, axis, a)
-	if axis==0:
+    if axis==0:
-		s = s[newaxis]
+        s = s[newaxis]
-	else:
+    else:
-		s = s[:,newaxis]
+        s = s[:,newaxis]
-		
+	    
-	a_s = a/s
+    a_s = a/s
 	if return_scales:
 		return a_s,s
 	return a_s
-def sub2ind(shape,i,j):
+    if return_scales:
       return a_s, s
    return a_s
 def sub2ind(shape, i, j):
 	"""Indices from subscripts. Only support for 2d"""
 	row,col = shape
 	ind = []
@@ -41,13 +43,13 @@ def sorted_eig(a, b=None,sort_by='sm'):
    (This is reversed output compared to matlab)
    """
-    s,v = eig(a,b)
+    s,v = eig(a, b)
    s = real(s) # dont expect any imaginary part
    v = real(v)
    ind = argsort(s)
    if sort_by=='lm':
        ind = ind[::-1]
-    v = v.take(ind,1)
+    v = v.take(ind, 1)
    s = s.take(ind)
    return s,v
@@ -67,15 +69,15 @@ def str2num(string_number):
    return num
 def randperm(n):
-  r=rand(n)
+  r = rand(n)
  dict={}
  for i in range(n):
-     dict[r[i]]=i
+     dict[r[i]] = i
-  r=sort(r)
+  r = sort(r)
-  out=zeros(n)
+  out = zeros(n)
  for i in range(n):
-     out[i]=dict[r[i]]
+     out[i] = dict[r[i]]
-  return array(out,dtype='i')
+  return array(out).astype('i')
 def mat_center(X,axis=0,ret_mn=False):
    """Mean center matrix along axis.
--- a/fluents/lib/engines.py
+++ b/fluents/lib/engines.py
@@ -3,11 +3,12 @@
 There is no typechecking of any kind here, just focus on speed
 """
-from scipy.linalg import svd,norm,inv,pinv,qr
+import math
 from scipy.linalg import svd,inv
 from scipy import dot,empty,eye,newaxis,zeros,sqrt,diag,\
     apply_along_axis,mean,ones,randn,empty_like,outer,c_,\
     rand,sum,cumsum,matrix
-
+    
 def pca(a, aopt, scale='scores', mode='normal'):
    """ Principal Component Analysis model
    mode:
@@ -18,17 +19,18 @@ def pca(a, aopt, scale='scores', mode='normal'):
    m, n = a.shape
-    if m*10.>n:
+    if m*3>n:
-        u, s, vt = esvd(a)
+        u, s, v = esvd(a)
    else:
        u, s, vt = svd(a, full_matrices=0)
        v = vt.T
    eigvals = (1./m)*s
    T = u*s
    T = T[:,:aopt]
-    P = vt[:aopt,:].T
+    P = v[:,:aopt]
    if scale=='loads':
-        tnorm = apply_along_axis(norm, 0, T)
+        tnorm = apply_along_axis(vnorm, 0, T)
        T = T/tnorm
        P = P*tnorm
@@ -47,6 +49,7 @@ def pca(a, aopt, scale='scores', mode='normal'):
    return {'T':T, 'P':P, 'E':E}
 def pcr(a, b, aopt=2, scale='scores', mode='normal'):
    """Returns Principal component regression model."""
    m, n = a.shape
@@ -98,13 +101,13 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
            u, s, vh = svd(dot(ab.T, ab))
            w = dot(ab, u[:,:1])
-        w = w/norm(w)
+        w = w/vnorm(w)
        r = w.copy()
        if i>0:
            for j in range(0,i,1):
                r = r - dot(P[:,j].T, w)*R[:,j][:,newaxis]
        t = dot(a, r)
-        tt = norm(t)**2
+        tt = vnorm(t)**2
        p  = dot(a.T, t)/tt
        q = dot(r.T, ab).T/tt
        ab = ab - dot(p, q.T)*tt
@@ -115,7 +118,7 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
        if mode=='fast' and i==aopt-1:
            if scale=='loads':
-                tnorm = apply_along_axis(norm, 0, T)
+                tnorm = apply_along_axis(vnorm, 0, T)
                T = T/tnorm
                W = W*tnorm
            return {'T':T, 'W':W}
@@ -134,7 +137,7 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
        F = b - dot(T[:,:aopt], Q[:,:aopt].T)
    if scale=='loads':
-        tnorm = apply_along_axis(norm, 0, T)
+        tnorm = apply_along_axis(vnorm, 0, T)
        T = T/tnorm
        W = W*tnorm
        Q = Q*tnorm
@@ -159,7 +162,7 @@ def w_simpls(aat, b, aopt):
        u = dot(b, u[:,:1]) #y-factor scores
        U[:,i] = u.ravel()
        t = dot(aat, u)
-        t = t/norm(t)
+        t = t/vnorm(t)
        T[:,i] = t.ravel()
        h = dot(aat, t) #score-weights
        H[:,i] = h.ravel()
@@ -183,7 +186,7 @@ def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
    W = u[:,:aopt]
    K = vt[:aopt,:].T
    T = dot(a, W)
-    tnorm = apply_along_axis(norm, 0, T) # norm of T-columns
+    tnorm = apply_along_axis(vnorm, 0, T) # norm of T-columns
    if mode == 'fast':
        if scale=='loads':
@@ -196,16 +199,6 @@ def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
    B = zeros((aopt, n, l), dtype='f')
    for i in range(aopt):
        B[i] = dot(W[:,:i+1], Q[:,:i+1].T)
    # leverages
    # fixme: probably need an orthogonal basis for row-space leverage
    #        T (scores) are not orthogonal
    #        Using a qr decomp to get an orthonormal basis for row-space
    #Tq = qr(T)[0]
    #s_lev,v_lev = leverage(aopt,Tq,W)
    # explained variance
    #var_x, exp_var_x = variances(a,T,W)
    #qnorm = apply_along_axis(norm, 0, Q)
    #var_y, exp_var_y = variances(b,U,Q/qnorm)
    if mode == 'detailed':
        E = empty((aopt, m, n))
@@ -225,10 +218,132 @@ def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
    return {'B':B, 'W':W, 'T':T, 'Q':Q, 'E':E, 'F':F, 'U':U, 'P':W}
 def nipals_lpls(X, Y, Z, amax, alpha=.7, mean_ctr=[2, 0, 1], mode='normal', scale='scores', verbose=False):
    """ L-shaped Partial Least Squares Regression by the nipals algorithm.

    (X!Z)->Y

    :input:
        X : data matrix (m, n)
        Y : data matrix (m, l)
        Z : data matrix (o, n), one column per column of X
        amax : number of components to extract
        alpha : blending factor; the X-weights are updated as
                alpha*k + (1 - alpha)*w, with k the direction obtained
                through Z
        mean_ctr : centering codes [X, Y, Z] passed on to center(),
                   or None to leave all three matrices uncentered
        mode : accepted but not used by this routine
        scale : 'loads' moves the score norms into the weights/loadings
                before returning; any other value returns them as computed
        verbose : print progress for every component

    :output: (dict)
      T : X-scores
      W : X-weights/Z-weights
      P : X-loadings
      Q : Y-loadings
      U : X-Y relation
      L : Z-scores
      K : Z-loads
      B : Regression coefficients X->Y
      b0: Regression coefficient intercept
      evx : X-explained variance
      evy : Y-explained variance
      evz : Z-explained variance

    :Notes:
        The inner iteration stops when the largest change in u drops
        below 1e-5 or after 100 iterations.
    """
    if mean_ctr is not None:
        xctr, yctr, zctr = mean_ctr
        X, mnX = center(X, xctr)
        # Y has its own centering code; using the X code here would
        # double center Y under the default settings.
        Y, mnY = center(Y, yctr)
        Z, mnZ = center(Z, zctr)
    else:
        # No centering requested: zero means keep the intercept defined.
        mnX = zeros((X.shape[1],))
        mnY = zeros((Y.shape[1],))

    # total sums of squares, reference for the explained variances
    varX = pow(X, 2).sum()
    varY = pow(Y, 2).sum()
    varZ = pow(Z, 2).sum()

    m, n = X.shape
    y_rows, y_cols = Y.shape
    z_rows, z_cols = Z.shape

    # initialize
    U = empty((y_rows, amax))
    Q = empty((y_cols, amax))
    T = empty((m, amax))
    W = empty((n, amax))
    P = empty((n, amax))
    K = empty((z_cols, amax))
    L = empty((z_rows, amax))
    var_x = empty((amax,))
    var_y = empty((amax,))
    var_z = empty((amax,))

    MAX_ITER = 100
    lim = 1e-5
    for a in range(amax):
        if verbose:
            print("\n Working on comp. %s" % a)
        u = Y[:,:1]
        diff = 1
        niter = 0
        while (diff>lim and niter<MAX_ITER):
            niter += 1
            u1 = u.copy()
            w = dot(X.T, u)
            w = w/sqrt(dot(w.T, w))
            l = dot(Z, w)
            k = dot(Z.T, l)
            k = k/sqrt(dot(k.T, k))
            # pull the X-weights towards the direction seen through Z
            w = alpha*k + (1-alpha)*w
            w = w/sqrt(dot(w.T, w))
            t = dot(X, w)
            c = dot(Y.T, t)
            c = c/sqrt(dot(c.T, c))
            u = dot(Y, c)
            diff = abs(u1 - u).max()
        if verbose:
            print("Converged after %s iterations" % niter)
        tt = dot(t.T, t)
        p = dot(X.T, t)/tt
        q = dot(Y.T, t)/tt
        # Z-scores recomputed with the final (blended) weights
        l = dot(Z, w)

        U[:,a] = u.ravel()
        W[:,a] = w.ravel()
        P[:,a] = p.ravel()
        T[:,a] = t.ravel()
        Q[:,a] = q.ravel()
        L[:,a] = l.ravel()
        K[:,a] = k.ravel()

        # deflate all three blocks before the next component
        X = X - dot(t, p.T)
        Y = Y - dot(t, q.T)
        Z = (Z.T - dot(w, l.T)).T

        # residual sums of squares after this component
        var_x[a] = pow(X, 2).sum()
        var_y[a] = pow(Y, 2).sum()
        var_z[a] = pow(Z, 2).sum()

    B = dot(dot(W, inv(dot(P.T, W))), Q.T)
    b0 = mnY - dot(mnX, B)

    # variance explained
    evx = 100.0*(1 - var_x/varX)
    evy = 100.0*(1 - var_y/varY)
    evz = 100.0*(1 - var_z/varZ)

    if scale=='loads':
        tnorm = apply_along_axis(vnorm, 0, T)
        T = T/tnorm
        W = W*tnorm
        Q = Q*tnorm
        knorm = apply_along_axis(vnorm, 0, K)
        L = L*knorm
        K = K/knorm

    return {'T':T, 'W':W, 'P':P, 'Q':Q, 'U':U, 'L':L, 'K':K, 'B':B, 'b0':b0, 'evx':evx, 'evy':evy, 'evz':evz}
 ########### Helper routines #########
 def m_shape(array):
    """Return the shape `array` has after conversion to a matrix."""
    as_matrix = matrix(array)
    return as_matrix.shape
-def esvd(data,economy=1):
+def esvd(data, economy=1):
    """SVD with the option of economy sized calculation
    Calculate subspaces of X'X or XX' depending on the shape
    of the matrix.
@@ -239,17 +354,40 @@ def esvd(data,economy=1):
    """
    m, n = data.shape
    if m>=n:
-        u, s, vt = svd(dot(data.T, data))
+        data = dot(data.T, data)
        u, s, vt = svd(data)
        u = dot(data, vt.T)
        v = vt.T
        for i in xrange(n):
-            s[i] = norm(u[:,i])
+            s[i] = vnorm(u[:,i])
            u[:,i] = u[:,i]/s[i]
    else:
-        u, s, vt = svd(dot(data, data.T))
+        data = dot(data, data.T)
        data = (data + data.T)/2.0
        u, s, vt = svd(data)
        v = dot(u.T, data)
        for i in xrange(m):
-            s[i] = norm(v[i,:])
+            s[i] = vnorm(v[i,:])
            v[i,:] = v[i,:]/s[i]
-    return u, s, v
+    return u, s, v.T
 def vnorm(x):
    """Return the Euclidean norm of x, computed as sqrt(x'x).

    x is assumed to be a column array (or a plain vector).
    """
    sum_of_squares = dot(x.T, x)
    return math.sqrt(sum_of_squares)
 def center(a, axis):
    """Subtract a mean from the matrix a.

    axis selects the kind of centering:
         0 : column centering (column means are subtracted)
         1 : row centering (row means are subtracted)
         2 : double centering (column means + row means - grand mean)
        -1 : nothing (a vector of zeros is subtracted)

    Returns the pair (a - mn, mn), mn being what was subtracted.
    Raises IOError for any other value of axis.
    """
    if axis not in (-1, 0, 1, 2):
        raise IOError("input error: axis must be in [-1,0,1,2]")
    if axis==2:
        col_mn = a.mean(0)
        row_mn = a.mean(1)[:,newaxis]
        mn = col_mn + row_mn - a.mean()
    elif axis==1:
        mn = a.mean(1)[:,newaxis]
    elif axis==0:
        mn = a.mean(0)
    else:
        # axis == -1: one zero per column, so a comes back unchanged
        mn = zeros((a.shape[1],))
    centered = a - mn
    return centered, mn
--- a/fluents/lib/hypergeom.py
+++ b/fluents/lib/hypergeom.py
@@ -53,6 +53,7 @@ def gene_hypergeo_test(selection, category_dataset):
                             cat_count)
    pvals = scipy.where(cat_count==0, 2, pvals)
    pvals = scipy.where(scipy.isnan(pvals), 2, pvals)
    out = {}
    for i in range(pvals.size):
        out[str(all_cats[i])] = (count[i], cat_count[i], pvals[i])
--- a/fluents/lib/nx_utils.py
+++ b/fluents/lib/nx_utils.py
@@ -2,7 +2,7 @@ import os,sys
 from itertools import izip
 import networkx as NX
 from scipy import shape,diag,dot,asarray,sqrt,real,zeros,eye,exp,maximum,\
-     outer,maximum,sum,diag,real
+     outer,maximum,sum,diag,real,atleast_2d
 from scipy.linalg import eig,svd,inv,expm,norm
 from cx_utils import sorted_eig
@@ -378,6 +378,7 @@ Ke = expm(A) .... expm(-A)?
 # 14.05.2006: diffusion returns negative values, using expm(-LL) instead (FIX)
 # 13.09.2006: update for use in numpy
 # 27.04.2007: diffusion now uses pade approximations to matrix exponential. Also the last 
 def K_expAdj(W, normalised=True, alpha=1.0):
    """Matrix exponential of adjacency matrix, mentioned in Kandola as a general diffusion kernel. 
@@ -433,8 +434,8 @@ def K_vonNeumann(W, normalised=True, alpha=1.0):
    return dot(dot(vr,psigma),vri).astype(t)
 def K_laplacian(W, normalised=True, alpha=1.0):
-    """ This is the matrix square root of the pseudo inverse of L.
+    """ This is the matrix pseudo inverse of L.
-    Also known as th eaverage commute time matrix.
+    Also known as the average commute time matrix.
    """
    W = asarray(W)
    t = W.dtype.char
@@ -464,8 +465,7 @@ def K_laplacian(W, normalised=True, alpha=1.0):
    return K
-
+def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5, use_cut=False):
 def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5):
    """Returns diffusion kernel.
    input:
            -- W, adj. matrix
@@ -477,27 +477,45 @@ def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5):
    t = W.dtype.char
    if len(W.shape)!=2:
        raise ValueError, "Non-matrix input to matrix function."
-    m,n = W.shape
+    m, n = W.shape
    if t in ['F','D']:
        raise TypeError, "Complex input!"
-    D = diag(sum(W,0))
+    D = diag(W.sum(0))
-    L = D-W
+    L = D - W
    if normalised==True:
-        T = diag(sqrt(1./(sum(W,0))))
+        T = diag(sqrt(1./W.sum(0)))
-        L = dot(dot(T,L),T)
+        L = dot(dot(T, L), T)
-    e,vr = eig(L)
+    e, vr = eig(L)
    vri = inv(vr) #inv
    cond = 1.0*{0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
    cutoff = 1.*abs(cond*maximum.reduce(e))
-    psigma = eye(m) # if sing vals are 0 exp(0)=1 (unnecessary)
+    psigma = eye(m) # if eigvals are 0 exp(0)=1 (unnecessary)
    #psigma = zeros((m,n), dtype='<f8')
    for i in range(len(e)):
        if abs(e[i]) > cutoff:
            psigma[i,i] = exp(-beta*e[i])
        #else:
        #    psigma[i,i] = 0.0
    K = real(dot(dot(vr, psigma), vri))
    I = eye(n, dtype='<f8')
    K = (1. - alpha)*I + alpha*K
    return K
 def K_diffusion2(W, normalised=True, alpha=1.0, beta=0.5, ncomp=None):
    """Returns the diffusion kernel expm(-beta*L) of a graph.

    L is the graph Laplacian D - W, with D the diagonal matrix of the
    column sums of W. The kernel is evaluated with expm, the matrix
    exponential.

    input:
            -- W, adj. matrix
            -- normalised [True/False], scale L on both sides by
               diag(1/sqrt(column sums)) before exponentiating
            -- beta, [0->), (diffusion degree)
            -- alpha, ncomp: accepted but not used here
    """
    degrees = W.sum(0)
    laplacian = diag(degrees) - W
    if normalised==True:
        scaling = diag(sqrt(1./degrees))
        half_scaled = dot(scaling, laplacian)
        laplacian = dot(half_scaled, scaling)
    return expm(-beta*laplacian)
 def K_modularity(W,alpha=1.0):
    """ Returns the matrix square root of Newmans modularity."""
@@ -530,3 +548,20 @@ def kernel_score(K, W):
    score = diag(dot(W, dot(K, W)) )
    tot = sum(score)
    return score, tot
 def modularity_matrix(G, nodelist=None):
    """Returns Newmans modularity matrix B = A - d'd/(2m) of the graph G.

    A is the adjacency matrix, d the row vector of node degrees and m
    the number of edges, so d'd/(2m) holds the products of the degrees
    of every node pair divided by twice the edge count.

    input:
            -- G, graph
            -- nodelist, nodes (and their order) to use; when given, the
               matrix is built for the subgraph on these nodes, otherwise
               all nodes of G are used
    """
    if not nodelist:
        nodelist = G.nodes()
    else:
        G = NX.subgraph(G, nodelist)

    A = NX.adj_matrix(G, nodelist=nodelist)
    d = atleast_2d(G.degree(nbunch=nodelist))
    m = 1.*G.number_of_edges()
    # outer product of the degree vector gives the d_i*d_j term
    B = A - dot(d.T, d)/(2.*m)
    return B
--- a/fluents/lib/select_generators.py
+++ b/fluents/lib/select_generators.py
@@ -41,30 +41,31 @@ def pls_gen(a, b, n_blocks=None, center=False, index_out=False,axis=0, metric=No
     """Random block crossvalidation
    Leave-one-out is a subset, with n_blocks equals a.shape[-1]
    """
-     index = randperm(a.shape[axis])
+     #index = randperm(a.shape[axis])
     index = arange(a.shape[axis])
     if n_blocks==None:
          n_blocks = a.shape[axis]
     n_in_set = ceil(float(a.shape[axis])/n_blocks)
     out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_blocks)]
     for out in out_ind_sets:
-         inn = [i for i in index if i not in out]
+          inn = [i for i in index if i not in out]
-         acal = a.take(inn, 0)
+          acal = a.take(inn, 0)
-         atrue = a.take(out, 0)
+          atrue = a.take(out, 0)
-         bcal = b.take(inn, 0)
+          bcal = b.take(inn, 0)
-         btrue = b.take(out, 0)
+          btrue = b.take(out, 0)
-         if center:
+          if center:
-              mn_a = acal.mean(0)[newaxis]
+               mn_a = acal.mean(0)[newaxis]
-              acal = acal - mn_a
+               acal = acal - mn_a
-              atrue = atrue - mn_a
+               atrue = atrue - mn_a
-              mn_b = bcal.mean(0)[newaxis]
+               mn_b = bcal.mean(0)[newaxis]
-              bcal = bcal - mn_b
+               bcal = bcal - mn_b
-              btrue = btrue - mn_b
+               btrue = btrue - mn_b
-         if metric!=None:
+          if metric!=None:
-              acal = dot(acal, metric)
+               acal = dot(acal, metric)
-         if index_out:
+          if index_out:
-              yield acal, atrue, bcal, btrue, out
+               yield acal, atrue, bcal, btrue, out
-         else:     
+          else:     
-              yield acal, atrue, bcal, btrue
+               yield acal, atrue, bcal, btrue
 def pca_gen(a, n_sets=None, center=False, index_out=False, axis=0, metric=None):
@@ -151,6 +152,7 @@ def shuffle_1d_block(a, n_sets=None, blocks=None, index_out=False, axis=0):
             index = arange(m)
             dummy = map(random.shuffle, array_split(index, blocks))
             a_out = a.take(index, axis)
         if index_out:
              yield a_out, index
         else:
@@ -164,7 +166,8 @@ def shuffle_1d(a, n_sets, axis=0):
     m = a.shape[axis]
     for ii in xrange(n_sets):
         index = randperm(m)
-         yield a.take(index, axis)
+         a = a.take(index, axis)
         yield a
 def diag_pert(a, n_sets=10, center=True, index_out=False):
    """Alter generator returning sets perturbed with means at diagonals.
@@ -205,18 +208,17 @@ def diag_pert(a, n_sets=10, center=True, index_out=False):
        else:
             yield a_out
-
+ 
 def outerprod_centering(aat, ret_mn=True):
-    """Returns mean centered symmetric outerproduct matrix.
+    """Returns double centered symmetric outerproduct matrix.
    """
-    n = aat.shape[0]
+    h = aat.mean(0)[newaxis]
-    h = aat.sum(0)[:,newaxis]
+    h = h - 0.5*h.mean()
-    h = (h - mean(h)/2)/n
+    mn_a = h + h.T # beauty of broadcasting
    mn_a = h + h.T
    aatc = aat - mn_a
    if ret_mn:
-        return aatc, aat.mean(0)
+        return aatc, mn_a
-    return aat - mn_a
+    return aatc
--- a/fluents/lib/validation.py
+++ b/fluents/lib/validation.py
@@ -12,11 +12,47 @@ from cx_utils import m_shape
 def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'):
    """Returns rmsep and aopt for pls tailored for wide X.
    The root mean square error of cross validation is calculated
    based on random block cross-validation. With number of blocks equal to
    number of samples [default] gives leave-one-out cv.
    The pls model is based on the simpls algorithm for wide X.
-    comments:
+    :Parameters:
-             -- X, Y inputs need to be centered (fixme: check)
+    X : ndarray 
        column centered data matrix of size (samples x variables)
    Y : ndarray
        column centered response matrix of size (samples x responses)
    amax : scalar 
        Maximum number of components
    n_blocks : scalar
        Number of blocks in cross validation
    :Returns: 
    rmsep : ndarray
        Root Mean Square Error of cross-validated Predictions 
    aopt : scalar
        Guestimate of the optimal number of components
    :SeeAlso:
    - pls_cv_val : Same output, not optimised for wide X
    - w_simpls : Simpls algorithm for wide X
    Notes
    -----
    Based (cowardly translated) on m-files from the Chemoact toolbox
    X, Y inputs need to be centered (fixme: check)
    Examples
    --------
    >>> import numpy as n
    >>> X = n.array([[1., 2., 3.],[]])
    >>> Y = n.array([[1., 2., 3.],[]])
    >>> w_pls(X, Y, 1)
    [4,5,6], 1
    """
    k, l = m_shape(Y)
    PRESS = zeros((l, amax+1), dtype='f')
    if n_blocks==None:
@@ -30,7 +66,7 @@ def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'):
        if algo=='simpls':
            dat = w_simpls(Din, Yin, amax)
            Q, U, H = dat['Q'], dat['U'], dat['H']
-            That = dot(Doi, dot(U, inv(triu(dot(H.T,U))) ))
+            That = dot(Doi, dot(U, inv(triu(dot(H.T, U))) ))
        else:
            raise NotImplementedError
@@ -40,21 +76,13 @@ def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'):
            E = Yout[:,j][:,newaxis] - TQ
            E = E + sum(E, 0)/Din.shape[0]
            PRESS[j,1:] = PRESS[j,1:] + sum(E**2, 0)
-    #Yhat = Y - dot(That,Q.T)
+    Yhat = Y - dot(That,Q.T)
    rmsep = sqrt(PRESS/Y.shape[0])
    aopt = find_aopt_from_sep(rmsep)
-    return rmsep, aopt
+    return rmsep, Yhat, aopt
 def pls_val(X, Y, amax=2, n_blocks=10, algo='pls', metric=None):
-    """ Validation results of pls model. 
+    
    comments:
             -- X, Y inputs need to be centered (fixme: check)
    """    
    k, l = m_shape(Y)
    PRESS = zeros((l, amax+1), dtype='<f8')
    EE = zeros((amax, k, l), dtype='<f8')
@@ -79,7 +107,30 @@ def pls_val(X, Y, amax=2, n_blocks=10, algo='pls', metric=None):
    rmsep = sqrt(PRESS/(k-1.))
    aopt = find_aopt_from_sep(rmsep)
-    return rmsep, aopt
+    return rmsep, Yhat, aopt
 def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5):
    """Performs crossvalidation to get generalisation error in lpls

    For every segment delivered by select_generators.pls_gen a model is
    fitted with nipals_lpls on the calibration rows and the left out rows
    of Y are predicted for 1..a_max components.

    Returns rmsep (square root of the squared prediction error averaged
    over samples), Yhat (predictions, shape (a_max, k, l) with
    k, l = Y.shape) and aopt (from find_aopt_from_sep).
    """
    # index_out=True: every item also carries the indices of the left out rows
    cv_iter = select_generators.pls_gen(X, Y, n_blocks=nsets,center=False,index_out=True)
    k, l = Y.shape
    Yhat = empty((a_max,k,l), 'd')
    for i, (xcal,xi,ycal,yi,ind) in enumerate(cv_iter):
        # NOTE(review): nipals_lpls in engines.py names this parameter
        # `amax` and returns a dict; the `a_max=` keyword and the 12-name
        # unpacking below do not match that -- confirm which definition
        # is meant to be called here.
        T, W, P, Q, U, L, K, B, b0, evx, evy, evz = nipals_lpls(xcal,ycal,Z,
                                                                a_max=a_max,
                                                                alpha=alpha,
                                                                mean_ctr=[2,0,1],
                                                                verbose=False)
        for a in range(a_max):
            # NOTE(review): indexing B and b0 by component presumes one set
            # of coefficients per number of components -- verify against
            # what nipals_lpls returns.
            Yhat[a,ind,:] = b0[a][0][0] + dot(xi, B[a])
    # one-hot version of the predictions: 1.0 at the largest predicted
    # response of every sample, 0 elsewhere
    Yhat_class = zeros_like(Yhat)
    for a in range(a_max):
        for i in range(k):
            Yhat_class[a,i,argmax(Yhat[a,i,:])]=1.0
    # counts, per component and column of Y, the samples where both the
    # one-hot prediction and Y equal 1 (presumably Y is a 0/1 class
    # indicator -- TODO confirm); computed but not returned
    class_err = 100*((Yhat_class+Y)==2).sum(1)/Y.sum(0).astype('d')
    sep = (Y - Yhat)**2
    # mean over the sample axis
    rmsep = sqrt(sep.mean(1))
    aopt = find_aopt_from_sep(rmsep)
    return rmsep, Yhat, aopt
 def pca_alter_val(a, amax, n_sets=10, method='diag'):
    """Pca validation by altering elements in X.
@@ -146,8 +197,7 @@ def pls_jkW(a, b, amax, n_blocks=None, algo='pls', use_pack=True, center=True, m
    if n_blocks == None:
        n_blocks = b.shape[0]
-    Wcv = empty((n_blocks, a.shape[1], amax), dtype='f')
+    Wcv = empty((n_blocks, a.shape[1], amax), dtype='d')
    if use_pack and metric==None:
        u, s, inflater = svd(a, full_matrices=0)
        a = u*s
@@ -161,11 +211,10 @@ def pls_jkW(a, b, amax, n_blocks=None, algo='pls', use_pack=True, center=True, m
            dat = bridge(a_in, b_in, amax, 'loads', 'fast')
        W = dat['W']
        if use_pack and metric==None:
            W = dot(inflater.T, W)
-        Wcv[nn,:,:] = W
+        Wcv[nn,:,:] = W[:,:,]
    return Wcv
@@ -200,6 +249,29 @@ def pca_jkP(a, aopt, n_blocks=None, metric=None):
    return PP
 def lpls_jk(X, Y, Z, a_max, nsets=None, alpha=.5):
    """Returns the lpls X-weights and Z-scores of every cross-validation segment.

    For each calibration set delivered by select_generators.pls_gen a
    model is fitted with nipals_lpls (scale='loads', mean_ctr=[2,0,1])
    and its 'W' and 'L' entries are stored.

    input:
            -- X, Y, Z, data matrices handed on to nipals_lpls
            -- a_max, number of components
            -- nsets, number of segments; None gives one segment per row of X
            -- alpha, passed on to nipals_lpls

    output:
            -- WWx, 'W' of every segment, shape (nsets, X.shape[1], a_max)
            -- WWz, 'L' of every segment, shape (nsets, Z.shape[0], a_max)
    """
    cv_iter = select_generators.pls_gen(X, Y, n_blocks=nsets,center=False,index_out=False)
    m, n = X.shape
    o, p = Z.shape
    if nsets is None:
        nsets = m
    WWx = empty((nsets, n, a_max), 'd')
    WWz = empty((nsets, o, a_max), 'd')
    for i, (xcal,xi,ycal,yi) in enumerate(cv_iter):
        # nipals_lpls names the component count `amax` and hands its
        # results back in a dict, so the entries are picked out by key.
        model = nipals_lpls(xcal, ycal, Z,
                            amax=a_max,
                            alpha=alpha,
                            mean_ctr=[2,0,1],
                            scale='loads',
                            verbose=False)
        WWx[i,:,:] = model['W']
        WWz[i,:,:] = model['L']

    return WWx, WWz
 def find_aopt_from_sep(sep, method='75perc'):
    """Returns an estimate of optimal number of components from rmsecv.
    """