First import of chemometrics utils

Arnar Flatberg 2006-12-18 11:59:12 +00:00
parent fac9346aad
commit 3ef5522dd0
8 changed files with 2112 additions and 0 deletions

432
fluents/lib/blmfuncs.py Normal file

@ -0,0 +1,432 @@
"""This module contains bilinear models(Functions)
"""
import sys
# add library
sys.path.append('/home/flatberg/fluents/fluents/lib')
import time
from fluents.workflow import Function
from fluents.dataset import Dataset
from fluents import plots, dataset, workflow, logger
import scipy
from engines import *
from cx_stats import leverage, variances, hotelling, pls_qvals
from cx_utils import mat_center
from validation import *
import blmplots
import engines
class Model(Function):
"""Base class of bilinear models.
"""
def __init__(self,id='johndoe',name='JohnDoe'):
Function.__init__(self,id,name)
self.name = name
self._options = None
self._data = {}
self._dataset = {}
self._packers = {}
self.model = {}
def clear(self):
""" Clears model paramters
"""
self.model = {}
self._data = {}
self._packers = {}
class PCA(Model):
def __init__(self,id='pca',name='PCA'):
Model.__init__(self,id,name)
self._options = PcaOptions()
def pre_validation(self, amax, n_sets, val_engine):
"""Model calculations for maximum number of components.
"""
rmsep = val_engine(self.model['E0'], amax, n_sets)
self.model['rmsep'] = rmsep
self.model['aopt'] = rmsep.argmin()
def confidence(self, aopt, n_sets, alpha, p_center,
crot, strict, cov_center ):
"""Returns a confidence measure for model parameters.
Based on aopt.
"""
aopt = self.model['aopt']
jk_segments = pca_jkP(self.model['E0'], aopt, n_sets)
Pcal = self.model['P'][:,:aopt]
tsq = hotelling(jk_segments, Pcal, p_center,
cov_center, alpha, crot, strict)
self.model['p_tsq'] = tsq
def make_model(self, amax, mode, scale):
"""Model on optimal number of components.
"""
dat = pca(self.model['E0'], amax, scale, mode)
# explained variance
var_x, exp_var_x = variances(self.model['E0'], dat['T'], dat['P'])
dat['var_x'] = var_x
dat['exp_var_x'] = exp_var_x
#fixme###
do_lev_s = False
do_lev_v = False
#####
if do_lev_s:
# sample leverages
tnorm = scipy.apply_along_axis(norm, 0, dat['T']) # norm of T-columns
s_lev = leverage(amax, tnorm)
dat['s_lev'] = s_lev
if do_lev_v:
# variable leverages
v_lev = leverage(amax, dat['P'])
dat['v_lev'] = v_lev
self.model.update(dat)
def as_dataset(self, param, dtype='dataset'):
"""Return model parameter as Dataset.
"""
if not param in self.model.keys():
return
DX = self._dataset['X'] #input dataset
dim_name_0, dim_name_1 = DX.get_dim_name()
# samples
ids_0 = [dim_name_0, DX.get_identifiers(dim_name_0, sorted=True)]
# vars
ids_1 = [dim_name_1, DX.get_identifiers(dim_name_1, sorted=True)]
# components (hidden)
pc_ids = ['_comp_a', map(str,range(self.model['aopt'])) ]
pc_ids_opt = ['_comp_o', map(str, range(self.model['aopt'])) ]
zero_dim = ['_doe', ['0']] # null dim, vector (hidden)
match_ids = {'E':[ids_0, ids_1],
'E0':[ids_0, ids_1],
'P':[ids_1, pc_ids],
'T':[ids_0, pc_ids],
'W':[ids_1, pc_ids],
'p_tsq':[ids_1, zero_dim],
'rmsep':[pc_ids, zero_dim],
'var_leverages':[ids_1, zero_dim],
'sample_leverages':[pc_ids, zero_dim],
'exp_var_x': [pc_ids, zero_dim],
'var_x': [pc_ids, zero_dim],
}
out = Dataset(self.model[param], match_ids[param], name=param)
return out
def get_out_plots(self, options):
out=[]
for plt in options['out_plots']:
#try:
out.append(plt(self))
#except:
# print plt
            #logger.log('debug', 'Plot: %s failed' %plt)
return out
def run(self, data):
"""Run pca with present options.
"""
self.clear()
options = self._options
self._dataset['X'] = data
self._data['X'] = data.asarray().astype('<f8')
if options['center']:
center = options['center_mth']
self.model['E0'] = center(self._data['X'])
else:
self.model['E0'] = data.asarray()
self.pre_validation(**options.pre_validation_options())
self.make_model(**options.make_model_options())
if options['calc_conf']:
self.confidence(**options.confidence_options())
out = [self.as_dataset(p) for p in options['out_data']]
for plt in self.get_out_plots(options):
out.append(plt)
return out
class PLS(Model):
def __init__(self, id='pls', name='PLS'):
Model.__init__(self, id, name)
self._options = PlsOptions()
def pre_validation(self, amax, n_sets, val_engine):
"""Returns rmsec,rmsep for model.
"""
rmsep = val_engine(self.model['E0'], self.model['F0'],
amax, n_sets)
self.model['rmsep'] = rmsep.mean(0)
self.model['aopt'] = rmsep.mean(0).argmin()
def confidence(self, aopt, n_sets, alpha, p_center,
crot, strict, cov_center ):
"""Returns a confidence measure for model parameters
Supported parameters: W
"""
aopt = self.model['aopt']
jk_segments = pls_jkW(self.model['E0'], self.model['F0'],
aopt, n_sets)
Wcal = self.model['W'][:,:aopt]
        tsq = hotelling(jk_segments, Wcal, p_center,
                        cov_center, alpha, crot, strict)
self.model['w_tsq'] = tsq
    def permutation_confidence(self, a, b, aopt, reg, n_iter, algo,
                               sim_method):
        """Estimates significant variables by controlling the fdr."""
        qval, false_pos, TSQ, tsq_full = pls_qvals(a, b, aopt=aopt,
                                                   alpha=reg, n_iter=n_iter,
                                                   algo=algo,
                                                   sim_method=sim_method)
        self.model['qval'] = qval
def make_model(self, a, b, amax, scale, mode, engine):
"""Make model on amax components.
"""
dat = engine(a, b, amax, scale, mode)
self.model.update(dat)
def as_dataset(self, name, dtype='Dataset'):
"""Return any model parameter as Dataset
No ids matching
"""
if name not in self.model.keys():
return
DX, DY = self._dataset['X'], self._dataset['Y']
dim_name_0, dim_name_1 = DX.get_dim_name()
dim_name_2, dim_name_3 = DY.get_dim_name()
#samples
ids_0 = [dim_name_0, DX.get_identifiers(dim_name_0, sorted=True)]
# x vars
ids_1 = [dim_name_1, DX.get_identifiers(dim_name_1, sorted=True)]
# y vars
ids_3 = [dim_name_3, DY.get_identifiers(dim_name_3, sorted=True)]
# components (hidden)
pc_ids = ['_comp', map(str, range(self.model['aopt']))]
zero_dim = ['_doe',['0']] # null dim, vector (hidden)
match_ids = {'E':[ids_0, ids_1],
'P':[ids_1, pc_ids],
'T':[ids_0, pc_ids],
'W': [ids_1, pc_ids],
'R': [ids_1, pc_ids],
'Q':[ids_3, pc_ids],
'F':[ids_0, ids_3],
'B':[ids_1, ids_3],
'qval':[ids_1, zero_dim],
'qval_sorted':[ids_1, zero_dim],
'w_tsq':[ids_1, zero_dim],
'rmsep':[pc_ids, zero_dim],
}
array = self.model[name]
M = Dataset(array,identifiers=match_ids[name],name=name)
return M
def get_out_plots(self, options):
out=[]
for plt in options['out_plots']:
#try:
out.append(plt(self))
#except:
# logger.log('debug', 'Plot: %s failed' %plt)
return out
def run(self,a,b):
options = self._options
self._dataset['X'] = a
self._dataset['Y'] = b
self._data['X'] = a.asarray()
self._data['Y'] = b.asarray()
if options['center']:
self.model['E0'] = options['center_mth'](self._data['X'])
self.model['F0'] = options['center_mth'](self._data['Y'])
else:
self.model['E0'] = self._data['X']
self.model['F0'] = self._data['Y']
self.pre_validation(**options.pre_validation_options())
self.make_model(self.model['E0'], self.model['F0'],
**options.make_model_options())
# variance captured
var_x, exp_var_x = variances(self.model['E0'], self.model['T'], self.model['P'])
self.model['var_x'] = var_x
self.model['exp_var_x'] = exp_var_x
var_y, exp_var_y = variances(self.model['F0'], self.model['T'], self.model['Q'])
self.model['var_y'] = var_y
self.model['exp_var_y'] = exp_var_y
if options['calc_conf']:
self.confidence(**options.confidence_options())
out = [self.as_dataset(p) for p in options['out_data']]
for plt in self.get_out_plots(options):
out.append(plt)
return out
class Packer:
"""A compression object used to speed up model calculations.
    Often used in conjunction with crossvalidation and perturbation
    analysis.
"""
def __init__(self,array):
self._shape = array.shape
self._array = array
        self._packed_data = None
        self._inflater = None

    def expand(self, a):
        if self._inflater is not None:
            return dot(self._inflater, a)
def collapse(self,axis=None,mode='svd'):
        if axis is None:
            axis = scipy.argmin(self._array.shape) # default is the smallest dim
if axis == 1:
self._array = self._array.T
u,s,vt = svd(self._array,full_matrices=0)
self._inflater = vt.T
self._packed_data = u*s
return self._packed_data
def get_packed_data(self):
return self._packed_data
class Options(dict):
"""Options base class.
"""
def __init__(self, *args,**kw):
dict.__init__(self, *args, **kw)
def _copy_from_list(self, key_list):
d = {}
for key in key_list:
d[key] = self.get(key,None)
return d
class PcaOptions(Options):
"""Options for Principal Component Analysis.
"""
def __init__(self):
Options.__init__(self)
self._set_default()
def _set_default(self):
opt = {}
opt['algo'] = 'pca'
opt['engine'] = engines.pca
opt['mode'] = 'normal' # how much info to calculate
opt['lod'] = 'compact' # how much info to store
opt['amax'] = 5
opt['aopt'] = 5
opt['center'] = True
opt['center_mth'] = mat_center
opt['scale'] = 'scores'
opt['calc_conf'] = True
opt['n_sets'] = 5
opt['strict'] = True
opt['p_center'] = 'med'
opt['alpha'] = .8
opt['cov_center'] = 'med'
opt['crot'] = True
opt['val_engine'] = pca_alter_val
opt['val_n_sets'] = 10
opt['all_data'] = ['T','P','E','p_tsq','rmsep']
opt['all_plots'] = ['PcaScorePlot', 'PcaLoadingPlot',
'PcaRmsepPlot']
opt['out_data'] = ['T','P', 'p_tsq']
opt['out_plots'] = [blmplots.PcaScorePlot,blmplots.PcaLoadingPlot,blmplots.LineViewXc]
self.update(opt)
def make_model_options(self):
"""Options for make_model method."""
opt_list = ['scale','mode', 'amax']
return self._copy_from_list(opt_list)
def confidence_options(self):
"""Options for confidence method."""
opt_list = ['n_sets', 'aopt', 'alpha', 'p_center',
'strict', 'crot', 'cov_center']
return self._copy_from_list(opt_list)
def pre_validation_options(self):
"""Options for pre_validation method."""
opt_list = ['amax', 'n_sets', 'val_engine']
return self._copy_from_list(opt_list)
class PlsOptions(Options):
"""Options for Partial Least Squares Regression.
"""
def __init__(self):
Options.__init__(self)
self._set_default()
def _set_default(self):
opt = {}
opt['algo'] = 'pls'
opt['engine'] = engines.pls
opt['mode'] = 'normal' # how much info to calculate
opt['lod'] = 'compact' # how much info to store
opt['amax'] = 3
opt['aopt'] = 3
opt['center'] = True
opt['center_mth'] = mat_center
opt['scale'] = 'scores'
opt['calc_conf'] = True
opt['n_sets'] = 10
opt['strict'] = True
opt['p_center'] = 'med'
opt['alpha'] = .2
opt['cov_center'] = 'med'
opt['crot'] = True
opt['val_engine'] = w_pls_cv_val
opt['all_data'] = ['T','P','E','p_tsq','rmsep']
opt['all_plots'] = ['PcaScorePlot', 'PcaLoadingPlot',
'PcaRmsepPlot']
opt['out_data'] = []
opt['out_plots'] = [blmplots.PlsScorePlot,
blmplots.PlsLoadingPlot,
blmplots.LineViewXc]
#blmplots.PlsQvalScatter]
opt['pack'] = False
opt['calc_qvals'] = False
opt['q_pert_mth'] = 'shuffle_vars'
opt['q_iter'] = 20
self.update(opt)
def make_model_options(self):
"""Options for make_model method."""
opt_list = ['scale','mode', 'amax', 'engine']
return self._copy_from_list(opt_list)
def confidence_options(self):
"""Options for confidence method."""
opt_list = ['n_sets', 'aopt', 'alpha', 'p_center',
'strict', 'crot', 'cov_center']
return self._copy_from_list(opt_list)
def pre_validation_options(self):
"""Options for pre_validation method."""
opt_list = ['amax', 'n_sets', 'val_engine']
return self._copy_from_list(opt_list)
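
The Options classes above are plain dicts whose *_options() views feed the Model methods through keyword unpacking. A minimal, self-contained sketch of that pattern (toy names only, not part of the committed module):

class ToyOptions(dict):
    def _copy_from_list(self, key_list):
        d = {}
        for key in key_list:
            d[key] = self.get(key, None)
        return d
    def make_model_options(self):
        return self._copy_from_list(['scale', 'mode', 'amax'])

class ToyModel:
    def make_model(self, amax, mode, scale):
        # each method receives exactly the options it asked for
        print "fitting %s components (mode=%s, scale=%s)" % (amax, mode, scale)

opts = ToyOptions(amax=5, mode='normal', scale='scores', center=True)
ToyModel().make_model(**opts.make_model_options())
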

158
fluents/lib/blmplots.py Normal file

@ -0,0 +1,158 @@
"""Specialised plots for functions defined in blmfuncs.py.
fixme:
-- I'm normalising all color mapping input vectors to [0,1]. This will
   destroy informative numerical values in the colorbar (but we
   are not showing these anyway). A better fix would be to let the
   colorbar listen to the scalarmappable instance and correct itself, but
   I did not get that to work ...
fixme2:
-- If a scatterplot is not inited with a colorvector there will be no
   colorbar, but when adding colors the colorbar should be created.
"""
from fluents import plots
from scipy import dot,sum,diag,arange,log,mean,newaxis
from matplotlib import cm
class PcaScorePlot(plots.ScatterPlot):
"""PCA Score plot"""
def __init__(self, model, absi=0, ordi=1):
self._T = model.model['T']
dataset_1 = model.as_dataset('T')
dataset_2 = dataset_1
id_dim = dataset_1.get_dim_name(0)
sel_dim = dataset_1.get_dim_name(1)
id_1, = dataset_1.get_identifiers(sel_dim, [absi])
id_2, = dataset_1.get_identifiers(sel_dim, [ordi])
plots.ScatterPlot.__init__(self, dataset_1, dataset_2, id_dim, sel_dim, id_1, id_2 ,c='b' ,s=40 , name='pca-scores')
def set_absicca(self,n):
self.xaxis_data = self._T[:,n]
def set_ordinate(self,n):
self.yaxis_data = self._T[:,n]
class PcaLoadingPlot(plots.ScatterPlot):
"""PCA Loading plot"""
def __init__(self, model, absi=0, ordi=1):
self._P = model.model['P']
dataset_1 = model.as_dataset('P')
dataset_2 = dataset_1
id_dim = dataset_1.get_dim_name(0)
sel_dim = dataset_1.get_dim_name(1)
id_1, = dataset_1.get_identifiers(sel_dim, [absi])
id_2, = dataset_1.get_identifiers(sel_dim, [ordi])
if model.model.has_key('p_tsq'):
col = model.model['p_tsq'].ravel()
col = normalise(col)
else:
col = 'g'
        plots.ScatterPlot.__init__(self, dataset_1, dataset_2, id_dim, sel_dim, id_1, id_2, c=col, s=20, name='pca-loadings')
def set_absicca(self,n):
self.xaxis_data = self._P[:,n]
def set_ordinate(self,n):
self.yaxis_data = self._P[:,n]
class PlsScorePlot(plots.ScatterPlot):
"""PLS Score plot"""
def __init__(self,model, absi=0, ordi=1):
self._T = model.model['T']
dataset_1 = model.as_dataset('T')
dataset_2 = dataset_1
id_dim = dataset_1.get_dim_name(0)
sel_dim = dataset_1.get_dim_name(1)
id_1, = dataset_1.get_identifiers(sel_dim, [absi])
id_2, = dataset_1.get_identifiers(sel_dim, [ordi])
plots.ScatterPlot.__init__(self, dataset_1, dataset_2,
id_dim, sel_dim, id_1, id_2 ,
c='b' ,s=40 , name='pls-scores')
def set_absicca(self,n):
self.xaxis_data = self._T[:,n]
def set_ordinate(self,n):
self.yaxis_data = self._T[:,n]
class PlsLoadingPlot(plots.ScatterPlot):
"""PLS Loading plot"""
def __init__(self,model,absi=0,ordi=1):
self._P = model.model['P']
dataset_1 = model.as_dataset('P')
dataset_2 = dataset_1
id_dim = dataset_1.get_dim_name(0)
sel_dim = dataset_1.get_dim_name(1)
id_1, = dataset_1.get_identifiers(sel_dim, [absi])
id_2, = dataset_1.get_identifiers(sel_dim, [ordi])
if model.model.has_key('w_tsq'):
col = model.model['w_tsq'].ravel()
col = normalise(col)
else:
col = 'g'
plots.ScatterPlot.__init__(self, dataset_1, dataset_2,
id_dim, sel_dim, id_1, id_2,
c=col, s=20, name='loadings')
def set_absicca(self,n):
self.xaxis_data = self._P[:,n]
def set_ordinate(self,n):
        self.yaxis_data = self._P[:,n]
class LineViewXc(plots.LineViewPlot):
"""A line view of centered raw data
"""
def __init__(self, func_class, name='Profiles'):
# copy, center, plot
x = func_class._dataset['X'].copy()
x._array = x._array - mean(x._array,0)[newaxis]
plots.LineViewPlot.__init__(self, x, 1, None, name)
class ParalellCoordinates(plots.Plot):
"""Parallell coordinates for score loads with many comp.
"""
def __init__(self,model, p = 'loads'):
pass
class PlsQvalScatter(plots.ScatterPlot):
"""A vulcano like plot of loads vs qvals
"""
def __init__(self, func_class, pc=0):
model = func_class.model
if not model.has_key('w_tsq'):
return
self._W = model['P']
dataset_1 = func_class.as_dataset('P')
dataset_2 = func_class.as_dataset('w_tsq')
id_dim = dataset_1.get_dim_name(0) #genes
sel_dim = dataset_1.get_dim_name(1) #_comp
sel_dim_2 = dataset_2.get_dim_name(1) #_zero_dim
id_1, = dataset_1.get_identifiers(sel_dim, [0])
id_2, = dataset_2.get_identifiers(sel_dim_2, [0])
if model.has_key('w_tsq'):
col = model['w_tsq'].ravel()
col = normalise(col)
else:
col = 'g'
plots.ScatterPlot.__init__(self, dataset_1, dataset_2,
id_dim, sel_dim, id_1, id_2,
c=col, s=20, sel_dim_2=sel_dim_2,
name='Load Volcano')
class InfluencePlot(plots.ScatterPlot):
"""
"""
pass
def normalise(x):
"""Scale vector x to [0,1]
"""
x = x - x.min()
x = x/x.max()
return x
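
For reference, a small sketch of the colour mapping the loading plots rely on: the per-variable statistic is scaled to [0,1] (as normalise() does above) and handed to a matplotlib colormap. The data here are made up, and a 2006-era scipy/matplotlib is assumed:

from scipy import rand
from matplotlib import cm

tsq = 10*rand(50)        # stand-in for a model['w_tsq'] vector
col = tsq - tsq.min()
col = col/col.max()      # same scaling as normalise() above
rgba = cm.jet(col)       # one RGBA colour per variable, as in PlsLoadingPlot
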

256
fluents/lib/cx_stats.py Normal file

@ -0,0 +1,256 @@
from scipy import zeros,zeros_like,sqrt,dot,trace,sign,round_,argmax,\
sort,ravel,newaxis,asarray,diag,sum,outer,argsort,arange,ones_like,\
all,apply_along_axis,eye
from scipy.linalg import svd,inv,norm,det,sqrtm
from scipy.stats import mean,median
from cx_utils import mat_center
from validation import pls_jkW
from select_generators import shuffle_1d
from engines import *
import time
def hotelling(P, Pfull, p_center='med', cov_center='med',
alpha=0.3, crot=True, strict=False, metric=None):
"""Returns regularized hotelling T^2.
    alpha -- regularisation towards pooled covariance estimate
    p_center -- location method for submodels
    cov_center -- location method for sub-covariances
    crot -- rotate submodels towards the full model?
    strict -- only allow 90-degree rotations (flips/permutations)?
    metric -- inverse metric matrix (if P and Pfull from metric pca/pls)
"""
m, n = Pfull.shape
if metric==None:
metric = eye(m, dtype='<f8')
Pfull = dot(metric.T, asarray(Pfull))
n_sets,n,amax = P.shape
# allocate
T_sq = empty((n, ),dtype='f')
Cov_i = zeros((n, amax, amax),dtype='f')
# rotate sub_models to full model
if crot:
for i,Pi in enumerate(P):
Pi = dot(metric.T, Pi)
P[i] = procrustes(Pfull, Pi, strict=strict)
# center of pnull
if p_center=='med':
P_ctr = median(P, 0)
elif p_center=='mean':
# fixme: mean is unstable
P_ctr = mean(P, 0)
else: #use full
P_ctr = Pfull
for i in xrange(n):
Pi = P[:,i,:] # (n_sets x amax)
Pi_ctr = P_ctr[i,:] # (1 x amax)
Pim = (Pi - Pi_ctr[newaxis])*sqrt(n_sets-1)
Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim)
if cov_center == 'med':
Cov = median(Cov_i, 0)
else:
Cov = mean(Cov_i, 0)
reg_cov = (1. - alpha)*Cov_i + alpha*Cov
for i in xrange(n):
Pc = P_ctr[i,:][:,newaxis]
sigma = reg_cov[i]
#T_sq[i] = sqrt(dot(dot(Pc.T, inv(sigma)), Pc).ravel())
T_sq[i] = dot(dot(Pc.T, inv(sigma)), Pc).ravel()
return T_sq
def procrustes(A, B, strict=True, center=False, verbose=False):
"""Rotation of B to A.
strict -- Only do flipping and shuffling
center -- Center before rotation, translate back after
verbose -- Print ssq
No scaling calculated.
Output B_rot = Rotated B
"""
if center:
A,mn_A = mat_center(A, ret_mn=True)
B,mn_B = mat_center(B, ret_mn=True)
u,s,vh = svd(dot(B.T, A))
v = vh.T
Cm = dot(u, v.T) #orthogonal rotation matrix
if strict: # just inverting and flipping
Cm = ensure_strict(Cm)
b_rot = dot(B, Cm)
if verbose:
print Cm.round()
fit = sum(ravel(B - b_rot)**2)
print "Sum of squares: %s" %fit
if center:
return mn_B + b_rot
else:
return b_rot
def expl_var_x(X, T):
"""Returns explained variance of X."""
# centered X,Y
exp_var_x = diag(dot(T.T, T))*100/(sum(X**2))
return exp_var_x
def expl_var_y(Y, T, Q):
"""Returns explained variance of Y.
"""
# centered Y
exp_var_y = zeros((Q.shape[1], ))
for a in range(Q.shape[1]):
Ya = outer(T[:,a], Q[:,a])
exp_var_y[a] = 100*sum(Ya**2)/sum(Y**2)
return exp_var_y
def pls_qvals(a, b, aopt=None, alpha=.3,
n_iter=20, algo='pls',
sim_method='shuffle',
p_center='med', cov_center='med',
crot=True, strict=False, metric=None):
"""Returns qvals for pls model.
input:
a -- centered data matrix
b -- centered data matrix
aopt -- scalar, opt. number of components
alpha -- [0,1] regularisation parameter for T2-test
n_iter -- number of permutations
sim_method -- permutation method ['shuffle']
p_center -- location estimator for sub models ['med']
cov_center -- location estimator for covariance of submodels ['med']
crot -- bool, use rotations of sub models?
strict -- bool, use stict (rot/flips only) rotations?
metric -- bool, use row metric?
"""
m, n = a.shape
TSQ = zeros((n, n_iter), dtype='<f8') # (nvars x n_subsets)
n_false = zeros((n, n_iter), dtype='<f8')
#full model
if algo=='bridge':
dat = bridge(a, b, aopt, 'loads', 'fast')
else:
dat = pls(a, b, aopt, 'loads', 'fast')
W = pls_jkW(a, b, aopt, n_blocks=None, algo=algo)
tsq_full = hotelling(W, dat['W'], p_center=p_center,
alpha=alpha, crot=crot, strict=strict,
cov_center=cov_center, metric=metric)
t0 = time.time()
Vs = shuffle_1d(b, n_iter)
for i,b_shuff in enumerate(Vs):
t1 = time.time()
if algo=='bridge':
dat = bridge(a, b_shuff, aopt, 'loads','fast')
else:
            dat = pls(a, b_shuff, aopt, 'loads', 'fast')
W = pls_jkW(a, b_shuff, aopt, n_blocks=None, algo=algo)
TSQ[:,i] = hotelling(W, dat['W'],p_center=p_center,
alpha=alpha, crot=crot, strict=strict,
cov_center=cov_center, metric=metric)
print time.time() - t1
sort_index = argsort(tsq_full)[::-1]
back_sort_index = sort_index.argsort()
print time.time() - t0
# count false positives
tsq_full_sorted = tsq_full.take(sort_index)
for i in xrange(n_iter):
for j in xrange(n):
n_false[j,i] = sum(TSQ[:,i]>=tsq_full[j])
false_pos = median(n_false, 1)
ll = arange(1, len(false_pos)+1, 1)
sort_qval = false_pos.take(sort_index)/ll
qval = false_pos/ll.take(back_sort_index)
print time.time() - t0
return qval, false_pos, TSQ, tsq_full
def ensure_strict(C, only_flips=True):
"""Ensure that a rotation matrix does only 90 degree rotations.
In multiplication with pcs this allows flips and reordering.
    if only_flips is True there will only be flips allowed
"""
Cm = C
S = sign(C) # signs
if only_flips==True:
C = eye(Cm.shape[0])*S
return C
Cm = zeros_like(C)
Cm.putmask(1.,abs(C)>.6)
if det(Cm)>1:
raise ValueError,"Implement this!"
return Cm*S
def leverage(aopt=1,*args):
"""Returns leverages
input : aopt, number of components to base leverage calculations on
*args, matrices of normed blm-paramters
output: leverages
For PCA typical inputs are normalised T or normalised P
For PLSR typical inputs are normalised T or normalised W
"""
if aopt<1:
raise ValueError,"Leverages only make sense for aopt>0"
lev = []
for u in args:
lev_u = 1./u.shape[0] + dot(u[:,:aopt], u[:,:aopt].T).diagonal()
lev.append(lev_u)
return lev
def variances(a,t,p):
"""Returns explained variance and ind. var from blm-params.
input:
a -- full centered matrix
t,p -- parameters from a bilinear approx of the above matrix.
output:
var -- variance of each component
var_exp -- cumulative explained variance in percentage
Typical inputs are: X(centered),T,P for PCA or
X(centered),T,P / Y(centered),T,Q for PLSR.
"""
tot_var = sum(a**2)
var = 100*(sum(p**2, 0)*sum(t**2, 0))/tot_var
var_exp = cumsum(var)
return var, var_exp
def residual_diagnostics(Y, Yhat, aopt=1):
"""Root mean errors and press values.
R2 vals
"""
pass
def ssq(E, axis=0, weights=None):
"""Sum of squares, supports weights."""
n = E.shape[axis]
    if weights is None:
        weights = eye(n)
    else:
        weights = diag(weights)
if axis==0:
Ew = dot(weights, E)
elif axis==1:
Ew = dot(E, weights)
else:
raise NotImplementedError, "Higher order modes not supported"
return pow(Ew,2).sum(axis)
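
As a quick illustration of the procrustes/ensure_strict pair used by hotelling(): a jack-knife loading segment that comes back with a flipped component is rotated onto the full-model loadings before the covariances are formed. A minimal sketch, assuming fluents/lib is on sys.path and a 2006-era scipy:

from scipy import rand, dot, eye
from cx_stats import procrustes

P_full = rand(20, 3)              # full-model loadings: 20 variables, 3 components
flip = eye(3); flip[1,1] = -1.0   # second component sign-flipped
P_sub = dot(P_full, flip)         # a cross-validation segment of the loadings
P_rot = procrustes(P_full, P_sub, strict=True)
print abs(P_rot - P_full).max()   # ~0: the flip has been undone
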

108
fluents/lib/cx_utils.py Normal file

@ -0,0 +1,108 @@
from scipy import apply_along_axis,newaxis,zeros,\
     median,round_,nonzero,dot,argmax,any,sqrt,ndarray,\
     trace,zeros_like,sign,sort,real,argsort,rand,array,nan
from scipy.linalg import norm,svd,inv,eig
from scipy.stats import median,mean
def normalise(a,axis=0,return_scales=False):
s = apply_along_axis(norm,axis,a)
if axis==0:
s = s[newaxis]
else:
s = s[:,newaxis]
a_s = a/s
if return_scales:
return a_s,s
return a_s
def sub2ind(shape,i,j):
"""Indices from subscripts. Only support for 2d"""
row,col = shape
ind = []
for k in xrange(len(i)):
for m in xrange(len(j)):
ind.append(i[k]*col + j[m])
return ind
def sorted_eig(a, b=None,sort_by='sm'):
"""
Just eig with real part of output sorted:
This is for convenience only, not general!
sort_by='sm': return the eigenvectors by eigenvalues
of smallest magnitude first. (default)
'lm': returns largest eigenvalues first
output: just as eig with 2 outputs
-- s,v (eigvals,eigenvectors)
(This is reversed output compared to matlab)
"""
s,v = eig(a,b)
s = real(s) # dont expect any imaginary part
v = real(v)
ind = argsort(s)
if sort_by=='lm':
ind = ind[::-1]
v = v.take(ind,1)
s = s.take(ind)
return s,v
def str2num(string_number):
"""Convert input (string number) into number, if float(string_number) fails, a nan is inserted.
"""
missings = ['','nan','NaN','NA']
try:
num = float(string_number)
except:
if string_number in missings:
num = nan
else:
print "Found strange entry: %s" %string_number
raise
return num
def randperm(n):
r=rand(n)
dict={}
for i in range(n):
dict[r[i]]=i
r=sort(r)
out=zeros(n)
for i in range(n):
out[i]=dict[r[i]]
return array(out,dtype='i')
def mat_center(X,axis=0,ret_mn=False):
"""Mean center matrix along axis.
X -- matrix, data
axis -- dim,
ret_mn -- bool, return mean
output:
Xc, [mnX]
    NB: axis=0 subtracts the column means (the default),
    axis=1 subtracts the row means
"""
    try:
        rows,cols = X.shape
    except ValueError:
        print "The X data needs to be two-dimensional"
        raise
if axis==0:
mnX = mean(X,axis)[newaxis]
Xs = X - mnX
elif axis==1:
mnX = mean(X,axis)[newaxis]
Xs = (X.T - mnX).T
if ret_mn:
return Xs,mnX
else:
return Xs
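
A short usage sketch for mat_center(), the centering helper the bilinear models call before fitting (assumes fluents/lib is on sys.path and a 2006-era scipy):

from scipy import rand
from cx_utils import mat_center

X = rand(10, 4)
Xc, mn = mat_center(X, axis=0, ret_mn=True)
print Xc.mean(0)   # column means are now ~0
print mn.shape     # (1, 4): the means that were removed
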

200
fluents/lib/engines.py Normal file

@ -0,0 +1,200 @@
"""Module contain algorithms for (burdensome) calculations.
There is no typechecking of any kind here, just focus on speed
"""
from scipy.linalg import svd,norm,inv,pinv,qr
from scipy import dot,empty,eye,newaxis,zeros,sqrt,diag,\
apply_along_axis,mean,ones,randn,empty_like,outer,c_,\
rand,sum,cumsum
def pca(a, aopt, scale='scores', mode='normal'):
""" Principal Component Analysis model
mode:
-- fast : returns smallest dim scaled (T for n<=m, P for n>m )
-- normal : returns all model params and residuals after aopt comp
-- detailed : returns all model params and all residuals
"""
m,n = a.shape
u,s,vt = svd(a, full_matrices=0)
T = u*s
T = T[:,:aopt]
P = vt[:aopt,:].T
if scale=='loads':
tnorm = apply_along_axis(norm, 0, T)
T = T/tnorm
P = P*tnorm
if mode == 'fast':
return {'T':T, 'P':P}
if mode=='detailed':
"""Detailed mode returns residual matrix for all comp.
That is E, is a three-mode matrix: (amax, m, n) """
E = empty((aopt, m, n))
for ai in range(aopt):
e = a - dot(T[:,:ai+1], P[:,:ai+1].T)
E[ai,:,:] = e.copy()
else:
E = a - dot(T,P.T)
return {'T':T, 'P':P, 'E':E}
def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
"""Kernel pls for tall/wide matrices.
Fast pls for calibration. Only inefficient for many Y-vars.
"""
m,n = a.shape
if ab!=None:
mm,l = ab.shape
else:
k,l = b.shape
W = empty((n, aopt))
P = empty((n, aopt))
R = empty((n, aopt))
Q = empty((l, aopt))
T = empty((m, aopt))
B = empty((aopt, n, l))
if ab==None:
ab = dot(a.T, b)
for i in range(aopt):
if ab.shape[1]==1:
w = ab
else:
u,s,vh = svd(dot(ab.T, ab))
w = dot(ab,u[:,:1])
w = w/norm(w)
r = w.copy()
if i>0:
for j in range(0,i,1):
r = r - dot(P[:,j].T, w)*R[:,j][:,newaxis]
t = dot(a, r)
tt = norm(t)**2
p = dot(a.T, t)/tt
q = dot(r.T, ab).T/tt
ab = ab - dot(p, q.T)*tt
T[:,i] = t.ravel()
W[:,i] = w.ravel()
P[:,i] = p.ravel()
R[:,i] = r.ravel()
if mode=='fast' and i==aopt-1:
if scale=='loads':
tnorm = apply_along_axis(norm, 0, T)
T = T/tnorm
W = W*tnorm
return {'T':T, 'W':W}
Q[:,i] = q.ravel()
B[i] = dot(R[:,:i+1], Q[:,:i+1].T)
if mode=='detailed':
E = empty((aopt, m, n))
F = empty((aopt, k, l))
for i in range(1,aopt+1,1):
E[i-1] = a - dot(T[:,:i],P[:,:i].T)
F[i-1] = b - dot(T[:,:i],Q[:,:i].T)
else:
E = a - dot(T[:,:aopt], P[:,:aopt].T)
F = b - dot(T[:,:aopt], Q[:,:aopt].T)
if scale=='loads':
tnorm = apply_along_axis(norm, 0, T)
T = T/tnorm
W = W*tnorm
Q = Q*tnorm
P = P*tnorm
return {'B':B, 'Q':Q, 'P':P, 'T':T, 'W':W, 'R':R, 'E':E, 'F':F}
def w_simpls(aat, b, aopt):
""" Simpls for wide matrices.
Fast pls for crossval, used in calc rmsep for wide X
There is no P,W. T is normalised
"""
bb = b.copy()
m,m = aat.shape
U = empty((m, aopt))
T = empty((m, aopt))
H = empty((m, aopt)) #just like W in simpls
PROJ = empty((m, aopt)) #just like R in simpls
for i in range(aopt):
u,s,vh = svd(dot(dot(b.T, aat), b), full_matrices=0)
u = dot(b, u[:,:1]) #y-factor scores
U[:,i] = u.ravel()
t =dot(aat, u)
t = t/norm(t)
T[:,i] = t.ravel()
h = dot(aat, t) #score-weights
H[:,i] = h.ravel()
PROJ[:,:i+1] = dot(T[:,:i+1], inv(dot(T[:,:i+1].T, H[:,:i+1])) )
if i<aopt:
b = b - dot(PROJ[:,:i+1], dot(H[:,:i+1].T,b) )
C = dot(bb.T, T)
return {'T':T,'U':U,'Q':C,'H':H}
def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
"""Undeflated Ridged svd(X'Y)
"""
m, n = a.shape
k, l = b.shape
u,s,vt = svd(b, full_matrices=0)
g0 = dot(u*s, u.T)
g = (1 - r)*g0 + r*eye(m)
ag = dot(a.T, g)
u,s,vt = svd(ag, full_matrices=0)
W = u[:,:aopt]
K = vt[:aopt,:].T
T = dot(a, W)
tnorm = apply_along_axis(norm, 0, T) # norm of T-columns
if mode == 'fast':
if scale=='loads':
T = T/tnorm
W = W*tnorm
return {'T':T, 'W':W}
U = dot(g0, K) #fixme check this
Q = dot(b.T, dot(T, inv(dot(T.T,T)) ))
B = zeros((aopt, n, l))
for i in range(aopt):
B[i] = dot(W[:,:i+1], Q[:,:i+1].T)
# leverages
# fixme: probably need an orthogonal basis for row-space leverage
# T (scores) are not orthogonal
# Using a qr decomp to get an orthonormal basis for row-space
#Tq = qr(T)[0]
#s_lev,v_lev = leverage(aopt,Tq,W)
# explained variance
#var_x, exp_var_x = variances(a,T,W)
#qnorm = apply_along_axis(norm, 0, Q)
#var_y, exp_var_y = variances(b,U,Q/qnorm)
if mode == 'detailed':
E = empty((aopt, m, n))
F = empty((aopt, k, l))
for i in range(aopt):
E[i] = a - dot(T[:,:i+1], W[:,:i+1].T)
F[i] = b - dot(a, B[i])
else: #normal
F = b - dot(a, B[-1])
E = a - dot(T, W.T)
if scale=='loads':
T = T/tnorm
W = W*tnorm
Q = Q*tnorm
return {'B':B, 'W':W, 'T':T, 'Q':Q, 'E':E, 'F':F, 'U':U, 'P':W}
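
A minimal sketch of the pca engine on random data, checking the basic bilinear identity X = T P' + E (assumes fluents/lib is on sys.path and a 2006-era scipy):

from scipy import rand, dot
from engines import pca

X = rand(15, 8)
X = X - X.mean(0)                       # the engines expect centered data
dat = pca(X, 3, scale='scores', mode='normal')
T, P, E = dat['T'], dat['P'], dat['E']
print T.shape, P.shape                  # (15, 3) (8, 3)
print abs(X - dot(T, P.T) - E).max()    # ~0 by construction
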

626
fluents/lib/nx_utils.py Normal file

@ -0,0 +1,626 @@
import os,sys
from itertools import izip
import networkx as NX
from scipy import shape,diag,dot,asarray,sqrt,real,zeros,eye,exp,maximum,\
     outer,sum,mean,array,ravel
from scipy.linalg import eig,svd,inv,expm,norm
from cx_utils import sorted_eig
import numpy
eps = numpy.finfo(float).eps.item()
feps = numpy.finfo(numpy.single).eps.item()
_array_precision = {'f': 0, 'd': 1, 'F': 0, 'D': 1,'i': 1}
def xgraph_to_graph(G):
"""Convert an Xgraph to an ordinary graph.
Edge attributes, mult.edges and self-loops are lost in the process.
"""
GG = NX.convert.from_dict_of_lists(NX.convert.to_dict_of_lists(G))
return GG
def get_affinity_matrix(G, data, ids, dist='e', mask=None, weight=None, t=0, out='dist'):
"""
Function for calculating a general affinity matrix, based upon distances.
    Affinity = 1 - distance ((10-1), 1 is far apart)
INPUT
data:
gene expression data, type dict data[gene] = expression-vector
G:
The network (networkx.base.Graph object)
mask:
The array mask shows which data are missing. If mask[i][j]==0, then
data[i][j] is missing.
weights:
The array weight contains the weights to be used when calculating distances.
transpose:
If transpose==0, then genes are clustered. If transpose==1, microarrays are
clustered.
dist:
The character dist defines the distance function to be used:
dist=='e': Euclidean distance
dist=='b': City Block distance
dist=='h': Harmonically summed Euclidean distance
dist=='c': Pearson correlation
dist=='a': absolute value of the correlation
dist=='u': uncentered correlation
dist=='x': absolute uncentered correlation
dist=='s': Spearman's rank correlation
dist=='k': Kendall's tau
For other values of dist, the default (Euclidean distance) is used.
OUTPUT
D :
    Similarity matrix (nGenes x nGenes), symmetric, d_ij in [0,1]
Normalized so max weight = 1.0
"""
try:
from Bio import Cluster as CLS
except:
raise ValueError, "Need installed biopython"
nVar = len(data)
nSamp = len(data[data.keys()[0]])
    X = zeros((nVar, nSamp), dtype='<f8')
    for i,gene in enumerate(ids): # this should be right!!
X[i,:] = data[gene]
#X = transpose(X) # distancematrix needs matrix as (nGenes,nSamples)
D_list = CLS.distancematrix(X, dist=dist)
D = zeros((nVar,nVar),dtype='<f8')
for i,row in enumerate(D_list):
if i>0:
D[i,:len(row)]=row
D = D + D.T
MAX = 30.0
D_max = max(ravel(D))/MAX
D_n = D/D_max #normalised (max = 10.0)
D_n = (MAX+1.) - D_n #using correlation (inverse distance for dists)
A = NX.adj_matrix(G, nodelist=ids)
if out=='dist':
return D_n*A
elif out=='heat_kernel':
t=1.0
K = exp(-t*D*A)
return K
elif out=='complete':
return D_n
else:
return []
def remove_one_degree_nodes(G, iter=True):
"""Removes all nodes with only one neighbour. These nodes does
not contribute to community structure.
input:
G -- graph
iter -- True/False iteratively remove?
"""
G_copy = G.copy()
if iter==True:
while 1:
bad_nodes=[]
for node in G_copy.nodes():
if len(G_copy.neighbors(node))==1:
bad_nodes.append(node)
if len(bad_nodes)>0:
G_copy.delete_nodes_from(bad_nodes)
else:
break
else:
bad_nodes=[]
        for node in G_copy.nodes():
            if len(G_copy.neighbors(node))==1:
bad_nodes.append(node)
if len(bad_nodes)>0:
G_copy.delete_nodes_from(bad_nodes)
print "Deleted %s nodes from network" %(len(G)-len(G_copy))
return G_copy
def key_players(G, n=1, with_labels=False):
"""
    Resilience measure
Identification of key nodes by fraction of nodes in
disconnected subgraph when the node is removed.
output:
fraction of nodes disconnected when node i is removed
"""
i=0
frac=[]
labels = {}
for node in G.nodes():
i+=1
print i
T = G.copy()
T.delete_node(node)
n_nodes = T.number_of_nodes()
sub_graphs = NX.connected_component_subgraphs(T)
n = len(sub_graphs)
if n>1:
strong_comp = sub_graphs[0]
fraction = 1.0 - 1.0*strong_comp.number_of_nodes()/n_nodes
frac.append(fraction)
labels[node]=fraction
else:
frac.append(0.0)
labels[node]=0.0
out = 1.0 - array(frac)
if with_labels==True:
return out,labels
else:
return out
def node_weighted_adj_matrix(G, weights=None, ave_type='harmonic', with_labels=False):
"""Return a weighted adjacency matrix of graph. The weights are
node weights.
input: G -- graph
weights -- dict, keys: nodes, values: weights
with_labels -- True/False, return labels?
    output: A -- weighted adjacency matrix
[index] -- node labels
"""
n=G.order()
    # make a dictionary that maps vertex name to position
index={}
count=0
for node in G.nodes():
index[node]=count
count = count+1
a = zeros((n,n))
    if G.__class__.__name__ == 'XGraph':
        raise TypeError, "Expected an ordinary Graph, not an XGraph"
for head,tail in G.edges():
if ave_type == 'geometric':
a[index[head],index[tail]]= sqrt(weights[head]*weights[tail])
a[index[tail],index[head]]= a[index[head],index[tail]]
        elif ave_type == 'harmonic':
            # harmonic mean of the two node weights
            a[index[head],index[tail]] = 2.*weights[head]*weights[tail]/(weights[head] + weights[tail])
            a[index[tail],index[head]] = a[index[head],index[tail]]
if with_labels:
return a,index
else:
return a
def weighted_adj_matrix(G, with_labels=False):
"""Adjacency matrix of an XGraph whos weights are given in edges.
"""
A,labels = NX.adj_matrix(G,with_labels=True)
W = A.astype('<f8')
for orf,i in labels.items():
for orf2,j in labels.items():
if G.has_edge(orf,orf2):
edge_weight = G.get_edge(orf,orf2)
W[i,j]=edge_weight
W[j,i]=edge_weight
if with_labels==True:
return W,labels
else:
return W
def assortative_index(G):
"""Ouputs two vectors: the degree and the neighbor average degree.
Used to measure the assortative mixing. If the average degree is
pos. correlated with the degree we know that hubs tend to connect
to other hubs.
input: G, graph connected!!
    output: d, mn_d: degree, and average degree of neighbours
(degree sorting from degree(with_labels=True))
"""
d = G.degree(with_labels=True)
out=[]
for node in G.nodes():
nn = G.neighbors(node)
if len(nn)>0:
nn_d = mean([float(d[i]) for i in nn])
out.append((d[node], nn_d))
return array(out).T
def struct_equivalence(G,n1,n2):
"""Returns the structural equivalence of a node pair. Two nodes
are structural equal if they share the same neighbors.
x_s = [ne(n1) union ne(n2) - ne(n1) intersection ne(n2)]/[ne(n1)
union ne(n2) + ne(n1) intersection ne(n2)]
ref: Brun et.al 2003
"""
#[ne(n1) union ne(n2) - ne(n1) intersection ne(n2
s1 = set(G.neighbors(n1))
s2 = set(G.neighbors(n2))
num_union = len(s1.union(s2))
num_intersection = len(s1.intersection(s2))
    if num_union == 0:
        # both neighbour sets empty: avoid division by zero
        xs = 0.0
    else:
        xs = float(num_union - num_intersection)/(num_union + num_intersection)
return xs
def struct_equivalence_all(G):
"""Not finnished.
"""
A,labels = NX.adj_matrix(G,with_labels=True)
pass
def hamming_distance(n1,n2):
"""Not finnsihed.
"""
pass
def graph_corrcoeff(G):
"""Not finnished.
"""
A,index = NX.adj_matrix(G,with_labels=True)
#C = zeros(*A.shape(),'d')
n = 1.*G.number_of_nodes()
for node in G.nodes():
a_j = A[index[node],:] #neighbors
mean_a = sum(a_j)/n# degree(G)/number_of_nodes()
var_a = sqrt(sum((a_j - mean_a)**2)/n)
pass
def graph_and_data_intersection(data, graph, pathways=None,
keep_connected=True):
"""Returns the intersection of keys in two dictionaries.
    NB: keep track of identifier sorting after these dict transforms.
input:
data -- dict, keys: gene id, value: measurement profile
graph -- networkx,base.graph, full graph
pathways -- dict, keys: pathway name, values: nodes in pathway
call:
new_data, new_graph,pathways = graph_and_data_intersection(data,graph,pathways,keep_connected=True)
"""
new_graph = graph.copy()
new_data = {}
new_pathways = {}
graph_set = set(graph.nodes())
data_set = set(data.keys())
intersection = data_set & graph_set
new_graph.delete_nodes_from(graph_set - data_set) #remove difference
for k in intersection:
new_data[k] = data[k]
if keep_connected:
max_iter = 0
        sub_graphs = NX.connected_component_subgraphs(new_graph)
        new_graph = sub_graphs[0] # keep the largest connected component
old_data = new_data
while new_graph.number_of_nodes() != len(new_data) and max_iter<100:
max_iter+=1
            graph_set = set(new_graph.nodes())
            data_set = set(new_data.keys())
intersection = data_set & graph_set
new_graph.delete_nodes_from(graph_set - data_set)
new_data={}
for k in intersection:
new_data[k] = old_data[k]
old_data = new_data.copy()
new_graph = NX.connected_component_subgraphs(new_graph)[0]
if pathways!=None:
for pth,nodes in pathways.items():
new_pathways[pth] = [node for node in nodes if node in new_graph]
print "\nSUMMARY (graph_and_data_intersection): "
print "Number of input variables: %s\n\
Number nodes in input graph: %s" %(len(data),len(graph))
print "\nUsing intersection of connected graph and nodes with data values"
print "Number of variables is now: %s" %len(new_data)
print "Number of nodes in graph: %s" %new_graph.number_of_nodes()
if pathways!=None:
return new_data,new_graph,new_pathways
else:
return new_data,new_graph
def rx_graph_and_data_intersection(graph,node_data,pathways,data,keep_connected=False):
"""Returns a (connected) reaction graph with present gene expression data.
keep_connected==True:
When a node (gene) is not present in our expression data, the node
is deleted and all neighbors are connected with edge weight=0.5
    if they are not already neighbors.
input:
data -- dict, keys: gene id, value: measurement profile
    graph -- networkx.xbase.xgraph, full weighted graph
node_data -- dict, keys: rx id, value: set of gene_ids
    pathways -- dict, keys: pathway name, values: list of nodes in pathway
"""
# We do not connect the full graph ... may be performed by using the reference graph?
graph = NX.connected_component_subgraphs(graph)[0] #largest connected component
new_graph = graph.copy()
new_data = {}
new_node_data = node_data.copy()
new_pathways = {}
genes_in_graph=set()
genes_in_data = set(data.keys())
rx_in_graph = set(new_graph.nodes())
# genes in graph nodes (rx_nodes)
for rx in rx_in_graph:
genes_in_graph.update(set(new_node_data.get(rx)))
keep_genes = genes_in_data.intersection(genes_in_graph) #both in graph and data
#update node data
for rx,genes in node_data.items(): # delete node data of nodes not present in graph
genes = set(genes)
genes.intersection_update(keep_genes) #remove genes if they are not in inters.
if len(genes)==0 or rx not in rx_in_graph: #no gene data or not in graph
print "removing: " + str(rx)
del new_node_data[rx]
rx_in_data= set(new_node_data.keys())
rx_intersection = rx_in_data.intersection(rx_in_graph)
for gene in keep_genes:
new_data[gene] = data.get(gene)
# update pathways nodes
for pth,genes in pathways.items():
if genes:
genes = set(genes)
genes.intersection_update(keep_genes) # gene needs to have data
else:
pass
new_pathways[pth] = genes
bad_nodes = rx_in_graph.difference(rx_in_data) #in graph but no data
if keep_connected==True:
dummy = new_graph.copy()
for rx in bad_nodes:
dummy.delete_node(rx)
if len(NX.connected_component_subgraphs(dummy))>1:
nghbrs = new_graph.neighbors(rx)
for i in nghbrs:
for j in nghbrs:
if i!=j:
if not new_graph.has_edge(i,j):
new_graph.add_edge(i,j,0.5)
#update graph
new_graph.delete_nodes_from(list(bad_nodes))
return new_graph,new_node_data,new_pathways,new_data
def weighted_laplacian(G,with_labels=False):
"""Return standard Laplacian of graph from a weighted adjacency matrix."""
n= G.order()
    I = eye(n)
    A = weighted_adj_matrix(G)
    D = I*sum(A, 0)
L = D-A
if with_labels:
A,index = weighted_adj_matrix(G, with_labels=True)
return L, index
else:
return L
"""Below are methods for calculating graph metrics
Four main decompositions :
0.) Adjacency diffusion kernel expm(A),
1.) von neumann kernels (diagonalisation of adjacency matrix)
2.) laplacian kernels (geometric series of adj.)
3.) diffusion kernels (exponential series of adj.)
---- Kv
von_neumann : Kv = (I-alpha*A)^-1 (mod: A(I-alpha*A)^-1)? ,
geom. series
---- Kl
laplacian: Kl = (I-alpha*L)^-1 , geom. series
---- Kd
laplacian_diffusion: Kd = expm(-alpha*L)
exp. series
---- Ke
Exponential diffusion.
Ke = expm(A) .... expm(-A)?
"""
# TODO:
# check for numerical unstable eigenvalues and set to zero
# otherwise some inverses will explode -> ok, using pinv for inverses
#
# This gives results that look numerical unstable
#
# -- divided adj by sum(A[:]), check this one (paper by Lebart scales with number of edges)
#
#
#
# the neumann kernel is defined in Kandola to be K = A*(I-A)^-1
# lowest eigenvectors are same as the highest of K = A*A ?
# this needs clarification
# diffusion is still wrong! ... ok
# diff needs normalisation?! check the meaning of exp(-s) = exp(1/s) -L = 1/degree ... etc
# Is it the negative of exp. of adj. metrix in Kandola?
#
# Normalised=False returns only nans (no idea why!!) ... fixed ok
# 31.1: diff is ok exp(0)=1 not zero!
# 07.03.2005: normalisation is ok: -> normalisation will emphasize high degree nodes
# 10.03.2005: symeig is unstable and returns nans for some eigenvectors? switching back to eig
# 14.05.2006: diffusion returns negative values, using expm(-LL) instead (FIX)
# 13.09.2006: update for use in numpy
def K_expAdj(W, normalised=False, alpha=1.0):
"""Matrix exponential of adjacency matrix, mentioned in Kandola as a general diffusion kernel.
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m,n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
if normalised==True:
T = diag( sqrt( 1./(sum(W,0))) )
W = dot(dot(T, W), T)
e,vr = eig(W)
s = real(e)**2 # from eigenvalues to singularvalues
vri = inv(vr)
s = maximum.reduce(s) + s
cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = abs(cond*maximum.reduce(s))
psigma = eye(m)
for i in range(len(s)):
if abs(s[i]) > cutoff:
psigma[i,i] = .5*alpha*exp(s[i])
return dot(dot(vr,psigma),vri)
def K_vonNeumann(W,normalised=False,alpha=1.0):
""" The geometric series of path lengths.
Returns matrix square root of pseudo inverse of the adjacency matrix.
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m,n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
if normalised==True:
T = diag(sqrt(1./(sum(W,0))))
W = dot(dot(T,W),T)
e,vr = eig(W)
vri = inv(vr)
e = real(e) # we only work with real pos. eigvals
e = maximum.reduce(e) + e
cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = cond*maximum.reduce(e)
psigma = zeros((m,n),t)
for i in range(len(e)):
if e[i] > cutoff:
psigma[i,i] = 1.0/e[i] #these are eig.vals (=sqrt(sing.vals))
return dot(dot(vr,psigma),vri).astype(t)
def K_laplacian(W, normalised=True, alpha=1.0):
""" This is the matrix square root of the pseudo inverse of L.
    Also known as the average commute time matrix.
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m,n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
D = diag(sum(W,0))
L = D - W
if normalised==True:
T = diag(sqrt(1./sum(W,0)))
L = dot(dot(T,L),T)
e,vr = eig(L)
e = real(e)
vri = inv(vr)
cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = cond*maximum.reduce(e)
psigma = zeros((m,),t) # if s close to zero -> set 1/s = 0
for i in range(len(e)):
if e[i] > cutoff:
psigma[i] = 1.0/e[i]
K = dot(dot(vr,diag(psigma)),vri).astype(t)
K = real(K)
I = eye(n)
K = (1-alpha)*I + alpha*K
return K
def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5):
"""Returns diffusion kernel.
input:
-- W, adj. matrix
-- normalised [True/False]
-- alpha, [0,1] (degree of network influence)
-- beta, [0->), (diffusion degree)
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m,n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
D = diag(sum(W,0))
L = D-W
if normalised==True:
T = diag(sqrt(1./(sum(W,0))))
L = dot(dot(T,L),T)
e,vr = eig(L)
vri = inv(vr) #inv
cond = 1.0*{0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = 1.*abs(cond*maximum.reduce(e))
psigma = eye(m) # if sing vals are 0 exp(0)=1 (unnecessary)
#psigma = zeros((m,n), dtype='<f8')
for i in range(len(e)):
if abs(e[i]) > cutoff:
psigma[i,i] = exp(-beta*e[i])
K = real(dot(dot(vr, psigma), vri))
I = eye(n, dtype='<f8')
K = (1. - alpha)*I + alpha*K
return K
def K_modularity(W,alpha=1.0):
""" Returns the matrix square root of Newmans modularity."""
W = asarray(W)
t = W.dtype.char
m, n = W.shape
d = sum(W, 0)
m = 1.*sum(d)
B = W - (outer(d, d)/m)
s,v = sorted_eig(B, sort_by='lm')
psigma = zeros( (n, n), dtype='<f8' )
for i in range(len(s)):
if s[i]>1e-7:
psigma[i,i] = sqrt(s[i])
#psigma[i,i] = s[i]
K = dot(dot(v, psigma), v.T)
I = eye(n)
K = (1 - alpha)*I + alpha*K
return K
def kernel_score(K, W):
"""Returns the modularity score.
K -- (modularity) kernel
W -- adjacency matrix (possibly weighted)
"""
# normalize W (: W'W=I)
m, n = shape(W)
for i in range(n):
W[:,i] = W[:,i]/norm(W[:,i])
score = diag(dot(W, dot(K, W)) )
tot = sum(score)
return score, tot
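
The graph kernels above all accept a plain (weighted) adjacency matrix, so they can be tried without building an NX graph first. A small sketch on a 4-node path graph (assumes fluents/lib is on sys.path, networkx 0.x installed, and a 2006-era scipy):

from scipy import array
from nx_utils import K_laplacian, K_modularity

# adjacency matrix of the path graph 1-2-3-4
W = array([[0., 1., 0., 0.],
           [1., 0., 1., 0.],
           [0., 1., 0., 1.],
           [0., 0., 1., 0.]])
Kl = K_laplacian(W, normalised=True, alpha=1.0)
Km = K_modularity(W, alpha=1.0)
print Kl.shape, Km.shape   # both (4, 4)
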

187
fluents/lib/select_generators.py Normal file

@ -0,0 +1,187 @@
"""Matrix cross validation selection generators
"""
from scipy import take,arange,ceil,repeat,newaxis,mean,asarray,dot,ones,\
random,array_split,floor,vstack,asarray,minimum
from cx_utils import randperm
def w_pls_gen(aat,b,n_blocks=None,center=True,index_out=False):
"""Random block crossvalidation for wide (XX.T) trick in PLS.
Leave-one-out is a subset, with n_blocks equals nSamples
aat -- outerproduct of X
b -- Y
    n_blocks -- number of crossvalidation blocks
    center -- use centering of calibration sets; (aat_in, b_in) are centered
Returns:
-- aat_in,aat_out,b_in,b_out,[out]
"""
    m,n = aat.shape
    if n_blocks==None:
        n_blocks = m
    index = randperm(m)
    nValuesInBlock = m/n_blocks
if n_blocks==m:
index = arange(m)
out_ind = [index[i*nValuesInBlock:(i+1)*nValuesInBlock] for i in range(n_blocks)]
for out in out_ind:
inn = [i for i in index if i not in out]
aat_in = aat[inn,:][:,inn]
aat_out = aat[out,:][:,inn]
b_in = b[inn,:]
b_out = b[out,:]
if center:
# centering projector: I - (1/n)11'
# nin = len(inn)
# Pc = eye(nin) - outer(ones((nin,)),ones((nin,)))/nin
# xxt - x( outer(ones((nin,)),ones((nin,)))/nin ) x.T
# de jong:
h = sum(aat_in,0)[ :,newaxis]
h = (h - mean(h)/2)/len(inn)
mn_a = h + h.T
aat_in = aat_in - mn_a
if index_out:
yield aat_in,aat_out,b_in,b_out,out
else:
yield aat_in,aat_out,b_in,b_out
def pls_gen(a,b, n_blocks=None, center=False, index_out=False,axis=0):
"""Random block crossvalidation
Leave-one-out is a subset, with n_blocks equals a.shape[-1]
"""
index = randperm(a.shape[axis])
if n_blocks==None:
n_blocks = a.shape[axis]
n_in_set = ceil(float(a.shape[axis])/n_blocks)
out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_blocks)]
for out in out_ind_sets:
inn = [i for i in index if i not in out]
if center:
a = a - mean(a,0)[newaxis]
b = b - mean(b,0)[newaxis]
if index_out:
yield a.take(inn,0),a.take(out,0), b.take(inn,0),b.take(out,0),out
else:
yield a.take(inn,0),a.take(out,0), b.take(inn,0),b.take(out,0)
def pca_gen(a,n_sets=None, center=False, index_out=False,axis=0):
"""PCA random block crossval generator.
"""
m = a.shape[axis]
index = randperm(m)
if n_sets==None:
n_sets = m
n_in_set = ceil(float(m)/n_sets)
out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_sets)]
for out in out_ind_sets:
inn = [i for i in index if i not in out]
if center:
a = a - mean(a,0)[newaxis]
if index_out:
yield a.take(inn,0),a.take(out,0),out
else:
yield a.take(inn,0),a.take(out,0)
def w_pls_gen_jk(a,b,n_sets=None,center=True,index_out=False,axis=0):
"""Random block crossvalidation for wide X (m>>n)
Leave-one-out is a subset, with n_sets equals a.shape[-1]
Returns : X_m and X_m'Y_m
"""
m = a.shape[axis]
ab = dot(a.T,b)
index = randperm(m)
if n_sets==None:
n_sets = m
n_in_set = ceil(float(m)/n_sets)
out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_sets)]
for out in out_ind_sets:
inn = [i for i in index if i not in out]
nin = len(inn)
nout = len(out)
a_in = a[inn,:]
mn_a = 0
mAB = 0
if center:
mn_a = mean(a,0)[newaxis]
mAin = dot(-ones((1,nout)),a[out,:])/nin
mBin = dot(-ones((1,nout)),b[out,:])/nin
mAB = dot(mAin.T,(mBin*nin))
ab_in = ab - dot(a[out,].T,b[out,:]) - mAB
a_in = a_in - mn_a
        if index_out:
            yield a_in, ab_in, out
        else:
            yield a_in, ab_in
def shuffle_1d_block(a, n_sets=None, blocks=None, index_out=False, axis=0):
"""Random block shuffling along 1d axis
Returns : Shuffled a by axis
"""
m = a.shape[axis]
if blocks==None:
blocks = m
for ii in xrange(n_sets):
index = randperm(m)
if blocks==m:
a_out = a.take(index, axis)
else:
index = arange(m)
dummy = map(random.shuffle, array_split(index, blocks))
a_out = a.take(index, axis)
if index_out:
yield a_out, index
else:
yield a_out
def shuffle_1d(a, n_sets, axis=0):
"""Random shuffling along 1d axis.
Returns : Shuffled a by axis
"""
m = a.shape[axis]
for ii in xrange(n_sets):
index = randperm(m)
yield a.take(index, axis)
def diag_pert(a, n_sets=10, center=True, index_out=False):
"""Alter generator returning sets perturbed with means at diagonals.
input:
    a -- matrix, data
    n_sets -- scalar, number of perturbed sets returned
"""
m, n = a.shape
tr=False
if m>n:
a = a.T
m, n = a.shape
tr = True
if n_sets>m or n_sets>n:
msg = "You may not use more subsets than max(n_rows, n_cols)"
raise ValueError, msg
nm=n*m
start_inds = array_split(randperm(m),n_sets) # we use random start diags
if center:
a = a - mean(a, 0)[newaxis]
for v in range(n_sets):
a_out = a.copy()
out = []
for start in start_inds[v]:
ind = arange(start+v, nm, n+1)
[out.append(i) for i in ind]
if center:
a_out.put(a.mean(),ind)
else:
a_out.put(0, ind)
if tr:
a_out = a_out.T
if index_out:
yield a_out, asarray(out)
else:
yield a_out
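
A short sketch of the cross-validation generators in use: pls_gen yields calibration/test splits, and shuffle_1d yields row permutations of Y as used by pls_qvals (assumes fluents/lib is on sys.path and a 2006-era scipy):

from scipy import rand
from select_generators import pls_gen, shuffle_1d

X = rand(12, 6)
Y = rand(12, 1)
# three-block random crossvalidation: each pass leaves out ~4 samples
for X_in, X_out, Y_in, Y_out in pls_gen(X, Y, n_blocks=3, center=True):
    print X_in.shape, X_out.shape
# three independent row permutations of Y
for Y_shuff in shuffle_1d(Y, 3):
    print Y_shuff.ravel()
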

145
fluents/lib/validation.py Normal file

@ -0,0 +1,145 @@
from scipy import ones,mean,sqrt,dot,newaxis,zeros,sum,empty,\
apply_along_axis,eye, kron
from scipy.linalg import triu,inv,svd,norm
from select_generators import w_pls_gen,w_pls_gen_jk,pls_gen,pca_gen,diag_pert
from engines import w_simpls,pls, bridge,pca
from pylab import *
def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'):
"""RMSEP calc for pls with wide X.
"""
k, l = Y.shape
PRESS = zeros((l, amax+1), dtype='f')
# X,Y are centered
if n_blocks==None:
n_blocks = Y.shape[0]
V = w_pls_gen(dot(X, X.T), Y, n_blocks=n_blocks, center=True)
for Din, Doi, Yin, Yout in V:
ym = -sum(Yout, 0)[newaxis]/(1.0*Yin.shape[0])
Yin = Yin - ym
PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)
if algo=='simpls':
dat = w_simpls(Din, Yin, amax)
Q,U,H = dat['Q'], dat['U'], dat['H']
That = dot(Doi, dot(U, inv(triu(dot(H.T,U))) ))
else:
"Other algo-support comming soon"
raise NotImplementedError
#Yhat = empty((amax, k, l),dtype='<f8')
Yhat = []
for j in range(l):
TQ = dot(That, triu(dot(Q[j,:][:,newaxis], ones((1,amax)))) )
E = Yout[:,j][:,newaxis] - TQ
E = E + sum(E, 0)/Din.shape[0]
PRESS[j,1:] = PRESS[j,1:] + sum(E**2, 0)
#Yhat = Y - dot(That,Q.T)
return sqrt(PRESS/Y.shape[0])
def pls_val(X, Y, amax=2, n_blocks=10,algo='pls'):
""" Validation results of pls model.
"""
k, l = Y.shape
PRESS = zeros((l, amax+1), dtype='<f8')
EE = zeros((amax, k, l), dtype='<f8')
Yhat = zeros((amax, k, l), dtype='<f8')
# X,Y are centered
V = pls_gen(X, Y, n_blocks=n_blocks, center=True, index_out=True)
for Xin, Xout, Yin, Yout, out in V:
ym = -sum(Yout,0)[newaxis]/Yin.shape[0]
Yin = (Yin - ym)
PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)
if algo=='pls':
dat = pls(Xin, Yin, amax, mode='normal')
elif algo=='bridge':
            dat = bridge(Xin, Yin, amax, mode='normal')
for a in range(amax):
Ba = dat['B'][a,:,:]
Yhat[a,out[:],:] = dot(Xout, Ba)
E = Yout - dot(Xout, Ba)
EE[a,out,:] = E
PRESS[:,a+1] = PRESS[:,a+1] + sum(E**2,0)
return sqrt(PRESS/(k-1.)), EE, Yhat
def pca_alter_val(a, amax, n_sets=10,method='diag'):
"""Pca validation by altering elements in X.
"""
    # todo: it is just as easy to do jk-estimates here as well
V = diag_pert(a, n_sets, center=True, index_out=True)
sep = empty((n_sets, amax), dtype='f')
for i, (xi, ind) in enumerate(V):
dat_i = pca(xi, amax, mode='detailed')
Ti,Pi = dat_i['T'],dat_i['P']
for j in xrange(amax):
Xhat = dot(Ti[:,:j+1], Pi[:,:j+1].T)
a_sub = a.ravel().take(ind)
EE = a_sub - Xhat.ravel().take(ind)
tot = (a_sub**2).sum()
sep[i,j] = (EE**2).sum()/tot
return sqrt(sep.mean(0))
#return sep
def pca_cv_val(X, amax, n_sets):
""" Cross validation of pca using random sets crossval.
"""
m, n = X.shape
xtot = (X**2).sum()
    V = pca_gen(X, n_sets=n_sets, center=True, index_out=True)
E = empty((amax, m, n), dtype='f')
for xi,xout,ind in V:
dat_i = pca(xi, amax, mode='detailed')
Pi = dat_i['P']
for a in xrange(amax):
Pia = Pi[:,:a+1]
E[a][ind,:] = (X[ind,:] - dot(xout, dot(Pia,Pia.T) ))**2
sep = []
for a in xrange(amax):
sep.append(E[a].sum()/xtot)
    return sqrt(sep)
def pls_jkW(a, b, amax, n_blocks=None, algo='pls', use_pack=True):
""" Returns CV-segments of paramter W for wide X.
todo: add support for T,Q and B
"""
if n_blocks == None:
n_blocks = b.shape[0]
WW = empty((n_blocks, a.shape[1], amax), dtype='f')
if use_pack:
u, s, inflater = svd(a, full_matrices=0)
a = u*s
V = pls_gen(a, b, n_blocks=n_blocks)
for nn,(a_in, a_out, b_in, b_out) in enumerate(V):
if algo=='pls':
dat = pls(a_in, b_in, amax, 'loads', 'fast')
elif algo=='bridge':
dat = bridge(a_in, b_in, amax, 'loads', 'fast')
W = dat['W']
if use_pack:
W = dot(inflater.T, W)
WW[nn,:,:] = W
return WW
def pca_jkP(a, aopt, n_blocks=None):
""" Returns CV-segments of paramter P.
todo: add support for T
fixme: more efficient to add this in validation loop
"""
if n_blocks == None:
n_blocks = a.shape[0]
PP = empty((n_blocks, a.shape[1], aopt), dtype='f')
V = pca_gen(a, n_sets=n_blocks, center=True)
for nn,(a_in, a_out) in enumerate(V):
dat = pca(a_in, aopt, mode='fast')
P = dat['P']
PP[nn,:,:] = P
return PP
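
Finally, a hedged sketch of the wide-X RMSEP validation: compute the leave-one-out RMSEP curve and read off the optimal number of components, as PLS.pre_validation does in blmfuncs.py (toy data; assumes fluents/lib is on sys.path and a 2006-era scipy/matplotlib, since this module pulls in pylab):

from scipy import rand
from validation import w_pls_cv_val

# centered toy data: 10 samples, 50 variables (wide X), one response
X = rand(10, 50); X = X - X.mean(0)
Y = rand(10, 1);  Y = Y - Y.mean(0)
rmsep = w_pls_cv_val(X, Y, amax=3, n_blocks=10)   # leave-one-out
print rmsep            # shape (1, amax+1); column 0 is the null (mean) model
print rmsep.argmin()   # index of the optimal number of components
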