Multiple lib changes

2007-01-25 11:58:10 +00:00
parent a65d79697f
commit 1c2c2c8895
7 changed files with 519 additions and 152 deletions
--- a/fluents/lib/blmfuncs.py
+++ b/fluents/lib/blmfuncs.py
@@ -1,7 +1,9 @@
 """This module contains bilinear models(Functions)
 """

+import pygtk
 import gtk
+import gtk.glade
 from fluents.workflow import Function, OptionsDialog, Options
 from fluents.dataset import Dataset
 from fluents import plots, dataset, workflow, logger
@@ -12,7 +14,7 @@ from cx_utils import mat_center
 from validation import *
 import blmplots
 import engines
-
+import copy

 class Model(Function):
    """Base class of bilinear models.
@@ -39,23 +41,42 @@ class PCA(Model):
        Model.__init__(self,id,name)
        self._options = PcaOptions()
        
-    def pre_validation(self, amax, n_sets, val_engine):
-        """Model calculations for maximum number of components. 
+    def validation(self, amax, cv_val_sets, pert_val_sets, cv_val_method, pert_val_method):
+        """Model validation and estimate of optimal numer of components. 
        """
-        rmsep = val_engine(self.model['E0'], amax, n_sets)
-        self.model['rmsep'] = rmsep
-        self.model['aopt'] = rmsep.argmin()
+        if self._options['calc_cv']:
+            if cv_val_method == 'random':
+                sep, aopt = pca_cv_val(self.model['E0'], amax, cv_val_sets)
+                self.model['sep'] = sep
+
+        if self._options['calc_pert']:
+            if pert_val_method == 'random_diag':
+                sep, aopt = pca_alter_val(self.model['E0'], amax, pert_val_sets)
+                self.model['sep'] = sep
+            
+        if self._options['calc_cv']==False and self._options['calc_pert']==False:
+            self.model['sep'] = None
+            aopt = self._options['amax']
+        
+        if self._options['auto_aopt']:
+            logger.log("notice", "Auto aopt: " + str(aopt))
+            self._options['aopt'] = aopt
+        if aopt==1:
+            logger.log('notice', 'Aopt at first component!')
        
    def confidence(self, aopt, n_sets, alpha, p_center,
                   crot, strict, cov_center ):
        """Returns a confidence measure for model parameters.
        Based on aopt.
        """
-        aopt = self.model['aopt']
+        if aopt<2:
+            aopt = 2
+            logger.log('notice','Hotellings T2 needs more than 1 comp.\n switching to 2!!')
+        
        jk_segments = pca_jkP(self.model['E0'], aopt, n_sets)
        Pcal = self.model['P'][:,:aopt]
        tsq = hotelling(jk_segments, Pcal, p_center,
-                              cov_center, alpha, crot, strict)
+                        cov_center, alpha, crot, strict)
        self.model['p_tsq'] = tsq

    def make_model(self, amax, mode, scale):
@@ -96,8 +117,8 @@ class PCA(Model):
        # vars
        ids_1 = [dim_name_1, DX.get_identifiers(dim_name_1, sorted=True)]
        # components (hidden)
-        pc_ids = ['_comp_a', map(str,range(self.model['aopt'])) ]
-        pc_ids_opt = ['_comp_o', map(str, range(self.model['aopt'])) ] 
+        pc_ids = ['_amax', map(str,range(self._options['amax'])) ]
+        pc_ids_opt = ['_aopt', map(str, range(self._options['aopt'])) ] 
        zero_dim = ['_doe', ['0']] # null dim, vector (hidden)
        match_ids = {'E':[ids_0, ids_1],
                     'E0':[ids_0, ids_1],
@@ -121,8 +142,7 @@ class PCA(Model):
            #try:
            out.append(plt(self))
            #except:
-            #    print plt
-                #logger.log('debug', 'Plot: %s failed') %plt
+            #    logger.log('debug', 'Plot: %s failed') %str(plt)
        return out
    
    def run_o(self, data):
@@ -130,6 +150,8 @@ class PCA(Model):
        """
        self.clear()
        options = self._options
+        for item in options.items():
+            print item
        self._dataset['X'] = data
        self._data['X'] = data.asarray().astype('<f8')
        if options['center']:
@@ -138,7 +160,8 @@ class PCA(Model):
        else:
            self.model['E0'] = data.asarray()

-        self.pre_validation(**options.pre_validation_options())
+        self.validation(**options.validation_options())
+        self.model['aopt'] = self._options['aopt']
        self.make_model(**options.make_model_options())
        if options['calc_conf']:
            self.confidence(**options.confidence_options())
@@ -159,7 +182,6 @@ class PCA(Model):
        if response == gtk.RESPONSE_OK:
            # set output data and plots
            dialog.set_output()
-
            #run with current data and options 
            return self.run_o(data)
    
@@ -172,10 +194,10 @@ class PLS(Model):
    def pre_validation(self, amax, n_sets, val_engine):
        """Returns rmsec,rmsep for model.
        """
-        rmsep = val_engine(self.model['E0'], self.model['F0'],
+        rmsep, aopt = val_engine(self.model['E0'], self.model['F0'],
                           amax, n_sets)
        self.model['rmsep'] = rmsep.mean(0)
-        self.model['aopt'] = rmsep.mean(0).argmin()
+        self.model['aopt'] = aopt
        
    def confidence(self, aopt, n_sets, alpha, p_center,
                   crot, strict, cov_center ):
@@ -341,34 +363,39 @@ class PcaOptions(Options):
        opt['algo'] = 'pca'
        opt['engine'] = engines.pca
        opt['mode'] = 'normal' # how much info to calculate
-        opt['lod'] = 'compact' # how much info to store
-        opt['amax'] = 5
-        opt['aopt'] = 5
+        opt['amax'] = 10
+        opt['aopt'] = 100
+        opt['auto_aopt'] = False
        opt['center'] = True
        opt['center_mth'] = mat_center
        opt['scale'] = 'scores'
-        opt['calc_conf'] = True
-        opt['n_sets'] = 5

+        opt['calc_conf'] = False
+        opt['n_sets'] = 5
        opt['strict'] = True
        opt['p_center'] = 'med'
        opt['alpha'] = .8
        opt['cov_center'] = 'med'
        opt['crot'] = True

-        opt['val_engine'] = pca_alter_val
-        opt['val_n_sets'] = 10
+        opt['calc_cv'] = False
+        opt['calc_pert'] = True
+        opt['pert_val_method'] = 'random_diag'
+        opt['cv_val_method'] = 'random'
+        opt['cv_val_sets'] = 10
+        opt['pert_val_sets'] = 10

        opt['all_data'] = [('T', 'scores', True),
                           ('P', 'loadings', True),
                           ('E','residuals', False),
                           ('p_tsq', 't2', False),
-                           ('rmsep', 'root mean square error of prediction', False)
+                           ('rmsep', 'RMSEP', False)
                           ]
        
        opt['all_plots'] = [(blmplots.PcaScorePlot, 'Scores', True),
                            (blmplots.PcaLoadingPlot, 'Loadings', True),
-                            (blmplots.LineViewXc, 'Line view', True)
+                            (blmplots.LineViewXc, 'Line view', True),
+                            (blmplots.PredictionErrorPlot, 'Residual Error', True)
                            ]
        
        opt['out_data'] = ['T','P', 'p_tsq']
@@ -387,9 +414,10 @@ class PcaOptions(Options):
                    'strict', 'crot', 'cov_center']
        return self._copy_from_list(opt_list)  

-    def pre_validation_options(self):
+    def validation_options(self):
        """Options for pre_validation method."""
-        opt_list = ['amax', 'n_sets', 'val_engine']
+        opt_list = ['amax', 'cv_val_sets', 'pert_val_sets',
+                    'cv_val_method', 'pert_val_method']
        return self._copy_from_list(opt_list)


@@ -411,7 +439,7 @@ class PlsOptions(Options):
        opt['center'] = True
        opt['center_mth'] = mat_center
        opt['scale'] = 'scores'
-        opt['calc_conf'] = True
+        opt['calc_conf'] = False
        opt['n_sets'] = 10

        opt['strict'] = True
@@ -420,13 +448,15 @@ class PlsOptions(Options):
        opt['cov_center'] = 'med'
        opt['crot'] = True

+        opt['calc_cv'] = True
+        opt['calc_pert'] = False
        opt['val_engine'] = w_pls_cv_val

        opt['all_data'] = [('T', 'scores', True),
                           ('P', 'loadings', True),
                           ('E','residuals', False),
                           ('p_tsq', 't2', False),
-                           ('rmsep', 'root mean square error of prediction', False)
+                           ('rmsep', 'RMSEP', False)
                           ]
        
        opt['all_plots'] = [(blmplots.PlsScorePlot, 'Scores', True),
@@ -434,7 +464,7 @@ class PlsOptions(Options):
                            (blmplots.LineViewXc, 'Line view', True)]
                            
        opt['out_plots'] = [blmplots.PlsScorePlot,
-                            blmplots.PlsLoadingPlot,
+                             blmplots.PlsLoadingPlot,
                            blmplots.LineViewXc]
        opt['out_data'] = None
                            
@@ -468,9 +498,175 @@ class PcaOptionsDialog(OptionsDialog):
    def __init__(self, data, options, input_names=['X']):
        OptionsDialog.__init__(self, data, options, input_names)

+        glade_file = "/home/flatberg/Projects/project4/project4.glade"
+        notebook_name = "vbox1"
+        page_name = "Options"
+        self.add_page_from_glade(glade_file, notebook_name, page_name)
+        # connect signals to handlers
+        dic = {"on_amax_value_changed" : self.on_amax_changed,
+               "on_aopt_value_changed" : self.on_aopt_changed,
+               "auto_aopt_toggled" : self.auto_aopt_toggled,
+               "center_toggled" : self.center_toggled,
+               "on_scale_changed" : self.on_scale_changed,
+               "on_val_none" : self.val_toggled,
+               "on_val_cv" : self.cv_toggled,
+               "on_val_pert" : self.pert_toggled,
+               "on_cv_method_changed" : self.on_cv_method_changed,
+               "on_cv_sets_changed" : self.on_cv_sets_changed,
+               "on_pert_sets_changed" : self.on_pert_sets_changed,
+               "on_conf_toggled" : self.on_conf_toggled
+               }
+        
+        self.wTree.signal_autoconnect(dic)
+
+        # set/ensure valid default values/ranges
+        amax_sb = self.wTree.get_widget("amax_spinbutton")
+        max_comp =  min(data[0].shape) # max num of components
+        if self._options['amax']>max_comp:
+            logger.log('debug', 'amax default too large ... adjusting')
+            self._options['amax'] = max_comp
+        amax_sb.get_adjustment().set_all(self._options['amax'], 1, max_comp, 1, 0, 0)
+        # aopt spin button
+        aopt_sb = self.wTree.get_widget("aopt_spinbutton")
+        if self._options['aopt']>self._options['amax']:
+            self._options['aopt'] = self._options['amax'] + 1 - 1
+        aopt_sb.get_adjustment().set_all(self._options['aopt'], 1, self._options['amax'], 1, 0, 0)
+
+        # scale
+        scale_cb = self.wTree.get_widget("scale_combobox")
+        scale_cb.set_active(0)
+
+        # validation frames
+        if self._options['calc_cv']==False:
+            cv_frame = self.wTree.get_widget("cv_frame")
+            cv_frame.set_sensitive(False)
+        if self._options['calc_pert']==False:
+            pert_frame = self.wTree.get_widget("pert_frame")
+            pert_frame.set_sensitive(False)
+
+        cv = self.wTree.get_widget("cv_method").set_active(0)
+        pm = self.wTree.get_widget("pert_method").set_active(0)
+
+        # confidence
+        if self._options['calc_conf']==True:
+            self.wTree.get_widget("subset_frame").set_sensitive(True)
+        else:
+            self.wTree.get_widget("subset_frame").set_sensitive(False)
+            
+            
+    def on_amax_changed(self, sb):
+        logger.log("debug", "amax changed: new value: %s" %sb.get_value_as_int())
+        amax = sb.get_value_as_int()
+        # update aopt if needed
+        if amax<self._options['aopt']:
+            self._options['aopt'] = amax
+        aopt_sb = self.wTree.get_widget("aopt_spinbutton")
+        aopt_sb.get_adjustment().set_all(self._options['aopt'], 1, amax, 1, 0, 0)
+        self._options['amax'] = sb.get_value_as_int()
+        
+    def on_aopt_changed(self, sb):
+        aopt = sb.get_value_as_int()
+        self._options['aopt'] = aopt
+        
+    def auto_aopt_toggled(self, tb):
+        aopt_sb = self.wTree.get_widget("aopt_spinbutton")
+        if tb.get_active():
+            self._options['auto_aopt'] = True
+            aopt_sb.set_sensitive(False)
+        else:
+            self._options['auto_aopt'] = False
+            aopt_sb.set_sensitive(True)
+
+    def center_toggled(self, tb):
+        if tb.get_active():
+            self._options['center'] = True
+        else:
+            logger.log("debug", "centering set to False")
+            self._options['center'] = False
+
+    def on_scale_changed(self, cb):
+        scale = cb.get_active_text()
+        if scale=='Scores':
+            self._options['scale'] = 'scores'
+        elif scale=='Loadings':
+            self._options['scale'] = 'loads'
+        else:
+            raise IOError
+
+    def val_toggled(self, tb):
+        """Callback for validation: None. """
+        cv_frame = self.wTree.get_widget("cv_frame")
+        pert_frame = self.wTree.get_widget("pert_frame")
+        cv_tb = self.wTree.get_widget("cv_toggle")
+        p_tb = self.wTree.get_widget("pert_toggle")
+        if tb.get_active():
+            self._options['calc_cv'] = False
+            self._options['calc_pert'] = False
+            cv_frame.set_sensitive(False)
+            pert_frame.set_sensitive(False)
+            cv_tb.set_sensitive(False)
+            p_tb.set_sensitive(False)
+        else:
+            p_tb.set_sensitive(True)
+            cv_tb.set_sensitive(True)
+            if p_tb.get_active(): 
+                pert_frame.set_sensitive(True)
+                self._options['calc_pert'] = True
+            if cv_tb.get_active():
+                cv_frame.set_sensitive(True)
+                self._options['calc_cv'] = True
+
+    def cv_toggled(self, tb):
+        cv_frame = self.wTree.get_widget("cv_frame")
+        if tb.get_active():
+            cv_frame.set_sensitive(True)
+            self._options['calc_cv'] = True
+        else:
+            cv_frame.set_sensitive(False)
+            self._options['calc_cv'] = False
+
+    def pert_toggled(self, tb):
+        pert_frame = self.wTree.get_widget("pert_frame")
+        if tb.get_active():
+            pert_frame.set_sensitive(True)
+            self._options['calc_pert'] = True
+        else:
+            pert_frame.set_sensitive(False)
+            self._options['calc_pert'] = False
+
+
+    def on_cv_method_changed(self, cb):
+        method = cb.get_active_text()
+        if method == 'Random':
+            self._options['cv_val_method'] = 'random'
+            
+    def on_pert_method_changed(self, cb):
+        method = cb.get_active_text()
+        if method == 'Random diags':
+            self._options['pert_val_method'] = 'random_diag'
+
+    def on_cv_sets_changed(self, sb):
+        val = sb.get_value_as_int()
+        self._options['cv_val_sets'] = val
+        
+    def on_pert_sets_changed(self, sb):
+        val = sb.get_value_as_int()
+        self._options['pert_val_sets'] = val
+
+    def on_conf_toggled(self, tb):
+        if tb.get_active():
+            self._options['calc_conf'] = False
+            self.wTree.get_widget("subset_frame").set_sensitive(False)
+        else:
+            self._options['calc_conf'] = True
+            self.wTree.get_widget("subset_frame").set_sensitive(True)
+        
+        
+

 class PlsOptionsDialog(OptionsDialog):
    """Options dialog for Partial Least Squares Regression.
    """
+
    def __init__(self, data, options, input_names=['X', 'Y']):
        OptionsDialog.__init__(self, data, options, input_names)
--- a/fluents/lib/blmplots.py
+++ b/fluents/lib/blmplots.py
@@ -12,8 +12,9 @@ fixme2:
        colorbar, but when adding colors the colorbar shoud be created.
 """
 from fluents import plots
-from scipy import dot,sum,diag,arange,log,mean,newaxis
+from scipy import dot,sum,diag,arange,log,mean,newaxis,sqrt
 from matplotlib import cm
+import pylab as PB

 class PcaScorePlot(plots.ScatterPlot):
    """PCA Score plot"""
@@ -27,10 +28,10 @@ class PcaScorePlot(plots.ScatterPlot):
        id_2, = dataset_1.get_identifiers(sel_dim, [ordi])
        plots.ScatterPlot.__init__(self, dataset_1, dataset_2, id_dim, sel_dim, id_1, id_2 ,c='b' ,s=40 , name='pca-scores')

-    def set_absicca(self,n):
+    def set_absicca(self, n):
        self.xaxis_data = self._T[:,n]

-    def set_ordinate(self,n):
+    def set_ordinate(self, n):
        self.yaxis_data = self._T[:,n]

 class PcaLoadingPlot(plots.ScatterPlot):
@@ -50,15 +51,15 @@ class PcaLoadingPlot(plots.ScatterPlot):
             col = 'g'
         plots.ScatterPlot.__init__(self, dataset_1, dataset_2, id_dim, sel_dim, id_1, id_2,c=col,s=20, name='pls-loadings')

-     def set_absicca(self,n):
+     def set_absicca(self, n):
         self.xaxis_data = self._P[:,n]

-     def set_ordinate(self,n):
+     def set_ordinate(self, n):
         self.yaxis_data = self._P[:,n]
    
 class PlsScorePlot(plots.ScatterPlot):
    """PLS Score plot"""
-    def __init__(self,model, absi=0, ordi=1):
+    def __init__(self, model, absi=0, ordi=1):
        self._T = model.model['T']
        dataset_1 = model.as_dataset('T')
        dataset_2 = dataset_1
@@ -71,16 +72,16 @@ class PlsScorePlot(plots.ScatterPlot):
                                   id_dim, sel_dim, id_1, id_2 ,
                                   c='b' ,s=40 , name='pls-scores')
        
-    def set_absicca(self,n):
+    def set_absicca(self, n):
        self.xaxis_data = self._T[:,n]

-    def set_ordinate(self,n):
+    def set_ordinate(self, n):
        self.yaxis_data = self._T[:,n]


 class PlsLoadingPlot(plots.ScatterPlot):    
    """PLS Loading plot"""
-    def __init__(self,model,absi=0,ordi=1):
+    def __init__(self, model, absi=0, ordi=1):
        self._P = model.model['P']
        dataset_1 = model.as_dataset('P')
        dataset_2 = dataset_1
@@ -97,44 +98,46 @@ class PlsLoadingPlot(plots.ScatterPlot):
                                   id_dim, sel_dim, id_1, id_2,
                                   c=col, s=20,                                   name='loadings')
        
-    def set_absicca(self,n):
+    def set_absicca(self, n):
        self.xaxis_data = self._P[:,n]
        
-    def set_ordinate(self,n):
+    def set_ordinate(self, n):
        self.yaxis_data = self._T[:,n]

+
 class LineViewXc(plots.LineViewPlot):
    """A line view of centered raw data
    """
-    def __init__(self, func_class, name='Profiles'):
+    def __init__(self, model, name='Profiles'):
        # copy, center, plot
-        x = func_class._dataset['X'].copy()
+        x = model._dataset['X'].copy()
        x._array = x._array - mean(x._array,0)[newaxis]
        plots.LineViewPlot.__init__(self, x, 1, None, name)

+
 class ParalellCoordinates(plots.Plot):
    """Parallell coordinates for score loads with many comp.
    """
-    def __init__(self,model, p = 'loads'):
+    def __init__(self, model, p='loads'):
        pass

+
 class PlsQvalScatter(plots.ScatterPlot):
    """A vulcano like plot of loads vs qvals
    """
-    def __init__(self, func_class, pc=0):
-        model = func_class.model
-        if not model.has_key('w_tsq'):
+    def __init__(self, model, pc=0):
+        if not model.model.has_key('w_tsq'):
            return
-        self._W = model['P']
-        dataset_1 = func_class.as_dataset('P')
-        dataset_2 = func_class.as_dataset('w_tsq')
+        self._W = model.model['P']
+        dataset_1 = model.as_dataset('P')
+        dataset_2 = model.as_dataset('w_tsq')
        id_dim = dataset_1.get_dim_name(0) #genes
        sel_dim = dataset_1.get_dim_name(1) #_comp
        sel_dim_2 = dataset_2.get_dim_name(1) #_zero_dim
        id_1, = dataset_1.get_identifiers(sel_dim, [0])
        id_2, = dataset_2.get_identifiers(sel_dim_2, [0])
-        if model.has_key('w_tsq'):
-            col = model['w_tsq'].ravel()
+        if model.model.has_key('w_tsq'):
+            col = model.model['w_tsq'].ravel()
            col = normalise(col)
        else:
            col = 'g'
@@ -143,6 +146,33 @@ class PlsQvalScatter(plots.ScatterPlot):
                                   c=col, s=20, sel_dim_2=sel_dim_2,
                                   name='Load Volcano')

+class PredictionErrorPlot(plots.Plot):
+    """A boxplot of prediction error vs. comp. number.
+    """
+    def __init__(self, model, name="Pred. Err."):
+        if not model.model.has_key('sep'):
+            logger.log('notice', 'Model has no calculations of sep')
+            return
+        plots.Plot.__init__(self, name)
+        self._frozen = True
+        self.current_dim = 'johndoe'
+        self.ax = self.fig.add_subplot(111)
+        
+        # draw
+        sep = model.model['sep']
+        aopt = model.model['aopt']
+        bx_plot_lines = self.ax.boxplot(sqrt(sep))
+        aopt_marker = self.ax.axvline(aopt, linewidth=10,
+                                      color='r',zorder=0,
+                                      alpha=.5)
+        
+        # add canvas
+        self.add(self.canvas)
+        self.canvas.show()
+
+    def set_current_selection(self, selection):
+        pass
+    
    
 class InfluencePlot(plots.ScatterPlot):
    """
--- a/fluents/lib/cx_utils.py
+++ b/fluents/lib/cx_utils.py
@@ -1,6 +1,7 @@
 from scipy import apply_along_axis,newaxis,zeros,\
     median,round_,nonzero,dot,argmax,any,sqrt,ndarray,\
-     trace,zeros_like,sign,sort,real,argsort,rand,array
+     trace,zeros_like,sign,sort,real,argsort,rand,array,\
+     matrix
 from scipy.linalg import norm,svd,inv,eig
 from scipy.stats import median,mean

@@ -106,3 +107,7 @@ def mat_center(X,axis=0,ret_mn=False):
        return Xs,mnX
    else:
        return Xs
+
+def m_shape(array):
+	"""Returns the array shape on the form of a numpy.matrix."""
+	return matrix(array).shape
--- a/fluents/lib/engines.py
+++ b/fluents/lib/engines.py
@@ -1,4 +1,3 @@
-
 """Module contain algorithms for  (burdensome) calculations.

 There is no typechecking of any kind here, just focus on speed
@@ -7,7 +6,7 @@ There is no typechecking of any kind here, just focus on speed
 from scipy.linalg import svd,norm,inv,pinv,qr
 from scipy import dot,empty,eye,newaxis,zeros,sqrt,diag,\
     apply_along_axis,mean,ones,randn,empty_like,outer,c_,\
-     rand,sum,cumsum
+     rand,sum,cumsum,matrix

 def pca(a, aopt, scale='scores', mode='normal'):
    """ Principal Component Analysis model
@@ -17,8 +16,9 @@ def pca(a, aopt, scale='scores', mode='normal'):
         -- detailed    : returns all model params and all residuals
    """
    
-    m,n = a.shape
-    u,s,vt = svd(a, full_matrices=0)
+    m, n = a.shape
+    u, s, vt = svd(a, full_matrices=0)
+    eigvals = (1./m)*s
    T = u*s
    T = T[:,:aopt]
    P = vt[:aopt,:].T
@@ -43,17 +43,40 @@ def pca(a, aopt, scale='scores', mode='normal'):
            
    return {'T':T, 'P':P, 'E':E}

+def pcr(a, b, aopt=2, scale='scores', mode='normal'):
+    """Returns Principal component regression model."""
+    m, n = a.shape
+    try:
+        k, l = b.shape
+    except:
+        k = b.shape[0]
+        l = 1
+    B = empty((aopt, n, l))
+    U, s, Vt = svd(a, full_matrices=True)
+    T = U*s
+    T = T[:,:aopt]
+    P = Vt[:aopt,:].T
+    Q = dot(dot(inv(dot(T.T, T)), T.T), b).T
+    for i in range(aopt):
+        ti = T[:,:i+1]
+        r = dot(dot(inv(dot(ti.T,ti)), ti.T), b)
+        B[i] = dot(P[:,:i+1], r)
+    E = a - dot(T, P.T)
+    F = b - dot(T, Q.T)
+
+    return {'T':T, 'P':P,'Q': Q, 'B':B, 'E':E, 'F':F}
+    
 def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
    """Kernel pls for tall/wide matrices.

    Fast pls for calibration. Only inefficient for many Y-vars.
    
    """
-    m,n = a.shape
+    m, n = a.shape
    if ab!=None:
-        mm,l = ab.shape
+        mm, l = m_shape(ab)
    else:
-        k,l = b.shape
+        k, l = m_shape(b)

    W = empty((n, aopt))
    P = empty((n, aopt))
@@ -66,10 +89,10 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
        ab = dot(a.T, b)
    for i in range(aopt):
        if ab.shape[1]==1:
-            w = ab
+            w = ab.reshape(mm, l)
        else:
-            u,s,vh = svd(dot(ab.T, ab))
-            w = dot(ab,u[:,:1])
+            u, s, vh = svd(dot(ab.T, ab))
+            w = dot(ab, u[:,:1])
    
        w = w/norm(w)
        r = w.copy()
@@ -99,9 +122,9 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
    if mode=='detailed':
        E = empty((aopt, m, n))
        F = empty((aopt, k, l))
-        for i in range(1,aopt+1,1):
-            E[i-1] = a - dot(T[:,:i],P[:,:i].T)
-            F[i-1] = b - dot(T[:,:i],Q[:,:i].T)
+        for i in range(1, aopt+1, 1):
+            E[i-1] = a - dot(T[:,:i], P[:,:i].T)
+            F[i-1] = b - dot(T[:,:i], Q[:,:i].T)
    else:
        E = a - dot(T[:,:aopt], P[:,:aopt].T)
        F = b - dot(T[:,:aopt], Q[:,:aopt].T)
@@ -121,17 +144,17 @@ def w_simpls(aat, b, aopt):
    There is no P,W.  T is normalised
    """
    bb = b.copy()
-    m,m = aat.shape
+    m, m = aat.shape
    U = empty((m, aopt))
    T = empty((m, aopt))
    H = empty((m, aopt)) #just like W in simpls
    PROJ = empty((m, aopt)) #just like R in simpls

    for i in range(aopt):
-        u,s,vh = svd(dot(dot(b.T, aat), b), full_matrices=0)
+        u, s, vh = svd(dot(dot(b.T, aat), b), full_matrices=0)
        u = dot(b, u[:,:1]) #y-factor scores
        U[:,i] = u.ravel()
-        t =dot(aat, u)
+        t = dot(aat, u)
        t = t/norm(t)
        T[:,i] = t.ravel()
        h = dot(aat, t) #score-weights
@@ -141,19 +164,18 @@ def w_simpls(aat, b, aopt):
            b = b - dot(PROJ[:,:i+1], dot(H[:,:i+1].T,b) )
    C = dot(bb.T, T)

-    return {'T':T,'U':U,'Q':C,'H':H}
+    return {'T':T, 'U':U, 'Q':C, 'H':H}

 def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
    """Undeflated Ridged svd(X'Y)
    """
    m, n = a.shape
-    k, l = b.shape
-    u,s,vt = svd(b, full_matrices=0)
+    k, l = m_shape(b)
+    u, s, vt = svd(b, full_matrices=0)
    g0 = dot(u*s, u.T)
    g = (1 - r)*g0 + r*eye(m)
    ag = dot(a.T, g)
-    
-    u,s,vt = svd(ag, full_matrices=0)
+    u, s, vt = svd(ag, full_matrices=0)
    W = u[:,:aopt]
    K = vt[:aopt,:].T
    T = dot(a, W)
@@ -166,8 +188,8 @@ def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
        return {'T':T, 'W':W}

    U = dot(g0, K) #fixme check this 
-    Q = dot(b.T, dot(T, inv(dot(T.T,T)) ))
-    B = zeros((aopt, n, l))
+    Q = dot(b.T, dot(T, inv(dot(T.T, T)) ))
+    B = zeros((aopt, n, l), dtype='f')
    for i in range(aopt):
        B[i] = dot(W[:,:i+1], Q[:,:i+1].T)
    # leverages
@@ -198,3 +220,6 @@ def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
        
    return {'B':B, 'W':W, 'T':T, 'Q':Q, 'E':E, 'F':F, 'U':U, 'P':W}
    
+
+def m_shape(array):
+    return matrix(array).shape
--- a/fluents/lib/nx_utils.py
+++ b/fluents/lib/nx_utils.py
@@ -201,16 +201,16 @@ def node_weighted_adj_matrix(G, weights=None, ave_type='harmonic', with_labels=F
 def weighted_adj_matrix(G, with_labels=False):
    """Adjacency matrix of an XGraph whos weights are given in edges.
    """
-    A,labels = NX.adj_matrix(G,with_labels=True)
+    A, labels = NX.adj_matrix(G, with_labels=True)
    W = A.astype('<f8')
-    for orf,i in labels.items():
-        for orf2,j in labels.items():
-            if G.has_edge(orf,orf2):
-                edge_weight = G.get_edge(orf,orf2)
-                W[i,j]=edge_weight
-                W[j,i]=edge_weight
+    for orf, i in labels.items():
+        for orf2, j in labels.items():
+            if G.has_edge(orf, orf2):
+                edge_weight = G.get_edge(orf, orf2)
+                W[i,j] = edge_weight
+                W[j,i] = edge_weight
    if with_labels==True:
-        return W,labels
+        return W, labels
    else:
        return W

@@ -418,6 +418,31 @@ def weighted_laplacian(G,with_labels=False):
    else:	
        return L            

+def subnetworks(G, T2):
+    """Return the highest scoring (T2-test) subgraph og G.
+
+    Use simulated annealing to identify highly scoring subgraphs.
+
+    ref: -- Ideker et.al (Bioinformatics 18, 2002)
+         -- Patil and Nielsen (PNAS 2006)
+    
+    """
+    N = 1000
+    states = [(node, False) for node in G.nodes()]
+    t2_last = 0.0
+    for i in xrange(N):
+        if i==0: #assign random states
+            states = [(state[0], True) for state in states if rand(1)>.5]
+        sub_nodes = [state[0] for state in states if state[1]]
+        Gsub = NX.subgraph(G, sub_nodes)
+        Gsub = NX.connected_components_subgraphs(Gsub)[0]
+        t2 = [T2[node] for node in Gsub]
+        if t2>t2_last:
+            pass
+        else:
+            p = numpy.exp()
+            
+    

 """Below are methods for calculating graph metrics

@@ -473,7 +498,7 @@ Ke = expm(A) .... expm(-A)?
 # 13.09.2206: update for use in numpy


-def K_expAdj(W, normalised=False, alpha=1.0):
+def K_expAdj(W, normalised=True, alpha=1.0):
    """Matrix exponential of adjacency matrix, mentioned in Kandola as a general diffusion kernel. 
    """
    W = asarray(W)
@@ -499,7 +524,7 @@ def K_expAdj(W, normalised=False, alpha=1.0):
    
    return dot(dot(vr,psigma),vri)

-def K_vonNeumann(W,normalised=False,alpha=1.0):
+def K_vonNeumann(W, normalised=True, alpha=1.0):
    """ The geometric series of path lengths.
    Returns matrix square root of pseudo inverse of the adjacency matrix.
    """
@@ -540,8 +565,8 @@ def K_laplacian(W, normalised=True, alpha=1.0):
    D = diag(sum(W,0))
    L = D - W
    if normalised==True:
-        T = diag(sqrt(1./sum(W,0)))
-        L = dot(dot(T,L),T)
+        T = diag(sqrt(1./sum(W, 0)))
+        L = dot(dot(T, L), T)
    e,vr = eig(L)
    e = real(e)
    vri = inv(vr)
@@ -551,7 +576,7 @@ def K_laplacian(W, normalised=True, alpha=1.0):
    for i in range(len(e)):
        if e[i] > cutoff:
            psigma[i] = 1.0/e[i]
-    K = dot(dot(vr,diag(psigma)),vri).astype(t)
+    K = dot(dot(vr, diag(psigma)), vri).astype(t)
    K = real(K)
    I = eye(n)
    K = (1-alpha)*I + alpha*K
--- a/fluents/lib/select_generators.py
+++ b/fluents/lib/select_generators.py
@@ -30,22 +30,14 @@ def w_pls_gen(aat,b,n_blocks=None,center=True,index_out=False):
          b_in = b[inn,:]
          b_out = b[out,:]
          if center:
-               # centering projector: I - (1/n)11'
-               # nin = len(inn)
-               # Pc = eye(nin) - outer(ones((nin,)),ones((nin,)))/nin
-               # xxt - x( outer(ones((nin,)),ones((nin,)))/nin ) x.T
-
-               # de jong:
-               h = sum(aat_in,0)[ :,newaxis]
-               h = (h - mean(h)/2)/len(inn)
-               mn_a = h + h.T
-               aat_in = aat_in - mn_a
+               aat_in, mn = outerprod_centering(aat_in)
+               aat_out = aat_out - mn
          if index_out:
               yield aat_in,aat_out,b_in,b_out,out
          else:
               yield aat_in,aat_out,b_in,b_out

-def pls_gen(a,b, n_blocks=None, center=False, index_out=False,axis=0):
+def pls_gen(a, b, n_blocks=None, center=False, index_out=False,axis=0, metric=None):
     """Random block crossvalidation
    Leave-one-out is a subset, with n_blocks equals a.shape[-1]
    """
@@ -56,17 +48,38 @@ def pls_gen(a,b, n_blocks=None, center=False, index_out=False,axis=0):
     out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_blocks)]
     for out in out_ind_sets:
         inn = [i for i in index if i not in out]
+         acal = a.take(inn, 0)
+         atrue = a.take(out, 0)
+         bcal = b.take(inn, 0)
+         btrue = b.take(out, 0)
         if center:
-              a = a - mean(a,0)[newaxis]
-              b = b - mean(b,0)[newaxis]
+              mn_a = acal.mean(0)[newaxis]
+              acal = acal - mn_a
+              atrue = atrue - mn_a
+              mn_b = bcal.mean(0)[newaxis]
+              bcal = bcal - mn_b
+              btrue = btrue - mn_b
+         if metric!=None:
+              acal = dot(acal, metric)
         if index_out:
-              yield a.take(inn,0),a.take(out,0), b.take(inn,0),b.take(out,0),out
+              yield acal, atrue, bcal, btrue, out
         else:     
-              yield a.take(inn,0),a.take(out,0), b.take(inn,0),b.take(out,0)
+              yield acal, atrue, bcal, btrue

         
-def pca_gen(a,n_sets=None, center=False, index_out=False,axis=0):
-     """PCA random block crossval generator.
+def pca_gen(a, n_sets=None, center=False, index_out=False, axis=0):
+     """Returns a generator of crossvalidation sample segments.
+
+     input:
+           -- a, data matrix (m x n)
+           -- n_sets, number of segments/subsets to generate.
+           -- center, bool, choice of centering each subset
+           -- index_out, bool, return subset index
+           -- axis, int, which axis to get subset from
+
+     ouput:
+           -- V, generator with (n_sets) memebers (subsets)
+     
     """
     m = a.shape[axis]
     index = randperm(m)
@@ -76,21 +89,26 @@ def pca_gen(a,n_sets=None, center=False, index_out=False,axis=0):
     out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_sets)]
     for out in out_ind_sets:
         inn = [i for i in index if i not in out]
+         acal = a.take(inn, 0)
+         atrue = a.take(out, 0)
         if center:
-              a = a - mean(a,0)[newaxis]
+              mn_a = acal.mean(0)[newaxis]
+              acal = acal - mn_a
+              atrue = atrue - mn_a
         if index_out:
-              yield a.take(inn,0),a.take(out,0),out
+              yield acal, atrue, out
         else:
-              yield a.take(inn,0),a.take(out,0)
+              yield acal, atrue

-def w_pls_gen_jk(a,b,n_sets=None,center=True,index_out=False,axis=0):
+def w_pls_gen_jk(a, b, n_sets=None, center=True,
+                 index_out=False, axis=0):
     """Random block crossvalidation for wide X (m>>n)
     Leave-one-out is a subset, with n_sets equals a.shape[-1]

     Returns : X_m and X_m'Y_m
     """
     m = a.shape[axis]
-     ab = dot(a.T,b)
+     ab = dot(a.T, b)
     index = randperm(m)
     if n_sets==None:
          n_sets = m
@@ -103,19 +121,18 @@ def w_pls_gen_jk(a,b,n_sets=None,center=True,index_out=False,axis=0):
         a_in = a[inn,:]
         mn_a = 0
         mAB = 0
-         
         if center:
-              mn_a = mean(a,0)[newaxis]
-              mAin = dot(-ones((1,nout)),a[out,:])/nin
-              mBin = dot(-ones((1,nout)),b[out,:])/nin
-              mAB = dot(mAin.T,(mBin*nin))
-         ab_in = ab - dot(a[out,].T,b[out,:]) - mAB
+              mn_a = a_in.mean(0)[newaxis]
+              mAin = dot(-ones((1,nout)), a[out,:])/nin
+              mBin = dot(-ones((1,nout)), b[out,:])/nin
+              mAB = dot(mAin.T, (mBin*nin))
+         ab_in = ab - dot(a[out,].T, b[out,:]) - mAB
         a_in = a_in - mn_a

         if index_out:
-              yield ain,ab, out
+              yield a_in, ab_in, out
         else:     
-              yield a_in, ab
+              yield a_in, ab_in

 def shuffle_1d_block(a, n_sets=None, blocks=None, index_out=False, axis=0):
     """Random block shuffling along 1d axis
@@ -185,3 +202,19 @@ def diag_pert(a, n_sets=10, center=True, index_out=False):
             yield a_out, asarray(out)
        else:
             yield a_out
+
+
+def outerprod_centering(aat, ret_mn=True):
+    """Returns mean centered symmetric outerproduct matrix.
+    """
+    n = aat.shape[0]
+    h = aat.sum(0)[:,newaxis]
+    h = (h - mean(h)/2)/n
+    mn_a = h + h.T
+    aatc = aat - mn_a
+    if ret_mn:
+        return aatc, aat.mean(0)
+    return aat - mn_a
+     
+     
+     
--- a/fluents/lib/validation.py
+++ b/fluents/lib/validation.py
@@ -1,30 +1,33 @@
+"""This module implements some common validation schemes from pca and pls.
+"""
 from scipy import ones,mean,sqrt,dot,newaxis,zeros,sum,empty,\
-     apply_along_axis,eye, kron
+     apply_along_axis,eye,kron,array,sort
+from scipy.stats import median
 from scipy.linalg import triu,inv,svd,norm

 from select_generators import w_pls_gen,w_pls_gen_jk,pls_gen,pca_gen,diag_pert
-from engines import w_simpls,pls, bridge,pca
-from pylab import *
+from engines import w_simpls,pls,bridge,pca
+from cx_utils import m_shape

 def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'):
-    """RMSEP calc for pls with wide X.
+    """Returns and  RMSEP for pls tailored for wide X.
    """
-    k, l = Y.shape
+    k, l = m_shape(Y)
    PRESS = zeros((l, amax+1), dtype='f')
-    # X,Y are centered
+    # X,Y are centered0
    if n_blocks==None:
        n_blocks = Y.shape[0]
-    V = w_pls_gen(dot(X, X.T), Y, n_blocks=n_blocks, center=True)
+    XXt = dot(X, X.T)
+    V = w_pls_gen(XXt, Y, n_blocks=n_blocks, center=True)
    for Din, Doi, Yin, Yout in V:
        ym = -sum(Yout, 0)[newaxis]/(1.0*Yin.shape[0])
        Yin = Yin - ym
        PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)
        if algo=='simpls':
            dat = w_simpls(Din, Yin, amax)
-            Q,U,H = dat['Q'], dat['U'], dat['H']
+            Q, U, H = dat['Q'], dat['U'], dat['H']
            That = dot(Doi, dot(U, inv(triu(dot(H.T,U))) ))
        else:
-            "Other algo-support comming soon"
            raise NotImplementedError
        #Yhat = empty((amax, k, l),dtype='<f8')
        Yhat = []
@@ -34,13 +37,14 @@ def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'):
            E = E + sum(E, 0)/Din.shape[0]
            PRESS[j,1:] = PRESS[j,1:] + sum(E**2, 0)
    #Yhat = Y - dot(That,Q.T)
-    return sqrt(PRESS/Y.shape[0])
+    rmsep = sqrt(PRESS/Y.shape[0])
+    aopt = find_aopt_from_sep(rmsep)
+    return rmsep, aopt

 def pls_val(X, Y, amax=2, n_blocks=10,algo='pls'):
    """ Validation results of pls model. 
    """    
-    
-    k, l = Y.shape
+    k, l = m_shape(Y)
    PRESS = zeros((l, amax+1), dtype='<f8')
    EE = zeros((amax, k, l), dtype='<f8')
    Yhat = zeros((amax, k, l), dtype='<f8')
@@ -50,6 +54,7 @@ def pls_val(X, Y, amax=2, n_blocks=10,algo='pls'):
        ym = -sum(Yout,0)[newaxis]/Yin.shape[0]
        Yin = (Yin - ym)
        PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)
+
        if algo=='pls':
            dat = pls(Xin, Yin, amax, mode='normal')
        elif algo=='bridge':
@@ -62,9 +67,11 @@ def pls_val(X, Y, amax=2, n_blocks=10,algo='pls'):
            EE[a,out,:] = E
            PRESS[:,a+1] = PRESS[:,a+1] + sum(E**2,0)

-    return sqrt(PRESS/(k-1.)), EE, Yhat
+    rmsep = sqrt(PRESS/(k-1.))
+    aopt = find_aopt_from_sep(rmsep)
+    return rmsep, aopt

-def pca_alter_val(a, amax, n_sets=10,method='diag'):
+def pca_alter_val(a, amax, n_sets=10, method='diag'):
    """Pca validation by altering elements in X.
    """
    # todo: it is just as easy to do jk-estimates her as well
@@ -79,18 +86,27 @@ def pca_alter_val(a, amax, n_sets=10,method='diag'):
            EE = a_sub - Xhat.ravel().take(ind)
            tot = (a_sub**2).sum()
            sep[i,j] = (EE**2).sum()/tot
-    return sqrt(sep.mean(0))
-    #return sep
+    sep = sqrt(sep)
+    aopt = find_aopt_from_sep(sep)
+    return sep, aopt

-def pca_cv_val(X, amax, n_sets):
-    """ Cross validation of pca using random sets crossval.
+def pca_cv_val(a, amax, n_sets):
+    """ Returns PRESS from cross-validated pca using random segments.
+
+    input:
+          -- a, data matrix (m x n)
+          -- amax, maximum nuber of components used
+          -- n_sets, number of segments to calculate
+    output:
+          -- sep, (amax x m x n), squared error of prediction (press)
+          -- aopt, guestimated optimal number of components
    """
-    m, n = X.shape
-    xtot = (X**2).sum()
-    V = pca_gen(X, n_sets=7, center=True, index_out=True)
+    m, n = a.shape
    E = empty((amax, m, n), dtype='f')
-    for xi,xout,ind in V:
-        dat_i = pca(xi, amax, mode='detailed')
+    xtot = (a**2).sum() # this needs centering
+    V = pca_gen(a, n_sets=7, center=True, index_out=True)
+    for xi, xout, ind in V:
+        dat_i = pca(xi, amax, mode='fast')
        Pi = dat_i['P']
        for a in xrange(amax):
            Pia = Pi[:,:a+1]
@@ -99,7 +115,9 @@ def pca_cv_val(X, amax, n_sets):
    sep = []
    for a in xrange(amax):
        sep.append(E[a].sum()/xtot)
-    return sqrt(sep.mean(0))
+    sep = array(sep)
+    aopt = find_aopt_from_sep(sep)
+    return sep, aopt

 def pls_jkW(a, b, amax, n_blocks=None, algo='pls', use_pack=True):
    """ Returns CV-segments of paramter W for wide X.
@@ -128,7 +146,20 @@ def pls_jkW(a, b, amax, n_blocks=None, algo='pls', use_pack=True):
    return WW

 def pca_jkP(a, aopt, n_blocks=None):
-    """ Returns CV-segments of paramter P.
+    """Returns loading from PCA on CV-segments.
+    
+    input:
+           -- a, data matrix (n x m)
+           -- aopt, number of components in model.
+           -- nblocks, number of segments
+    output:
+           -- PP, loadings collected in a three way matrix
+           (n_segments, m, aopt)
+
+    comments:
+    * The loadings are scaled with the (1/samples)*eigenvalues.
+    * Crossvalidation method is currently set to random blocks of samples.
+
    todo: add support for T
    fixme: more efficient to add this in validation loop
    """
@@ -138,8 +169,30 @@ def pca_jkP(a, aopt, n_blocks=None):
    PP = empty((n_blocks, a.shape[1], aopt), dtype='f')
    V = pca_gen(a, n_sets=n_blocks, center=True)
    for nn,(a_in, a_out) in enumerate(V):  
-        dat = pca(a_in, aopt, mode='fast')
+        dat = pca(a_in, aopt, mode='fast', scale='loads')
        P = dat['P']
        PP[nn,:,:] = P
        
    return PP
+
+def find_aopt_from_sep(sep, method='75perc'):
+    """Returns an estimate of optimal number of components from rmsecv.
+    """
+    if method=='vanilla':
+        # min rmsep
+        rmsecv = sqrt(sep.mean(0))
+        return rmsecv.argmin() + 1
+
+    elif method=='75perc':
+        prct = .75 #percentile
+        ind = 1.*sep.shape[0]*prct
+        med = median(sep)
+        prc_75 = []
+        for col in sep.T:
+            col.sort()
+            prc_75.append(col[int(ind)])
+        prc_75 = array(prc_75)
+        for i in range(1, sep.shape[1], 1):
+            if med[i-1]<prc_75[i]:
+                return i
+        return len(med)