First import of chemometrics utils

parent fac9346aad
commit 3ef5522dd0

@@ -0,0 +1,432 @@
"""This module contains bilinear models (Functions)."""

import sys

# add library
sys.path.append('/home/flatberg/fluents/fluents/lib')

import time

from fluents.workflow import Function
from fluents.dataset import Dataset
from fluents import plots, dataset, workflow, logger
import scipy
from engines import *
from cx_stats import leverage, variances, hotelling, pls_qvals
from cx_utils import mat_center
from validation import *
import blmplots
import engines

class Model(Function):
    """Base class of bilinear models."""
    def __init__(self, id='johndoe', name='JohnDoe'):
        Function.__init__(self, id, name)
        self.name = name
        self._options = None
        self._data = {}
        self._dataset = {}
        self._packers = {}
        self.model = {}

    def clear(self):
        """Clears model parameters."""
        self.model = {}
        self._data = {}
        self._packers = {}

class PCA(Model):
    def __init__(self, id='pca', name='PCA'):
        Model.__init__(self, id, name)
        self._options = PcaOptions()

    def pre_validation(self, amax, n_sets, val_engine):
        """Model calculations for maximum number of components."""
        rmsep = val_engine(self.model['E0'], amax, n_sets)
        self.model['rmsep'] = rmsep
        self.model['aopt'] = rmsep.argmin()

    def confidence(self, aopt, n_sets, alpha, p_center,
                   crot, strict, cov_center):
        """Returns a confidence measure for model parameters.

        Based on aopt.
        """
        aopt = self.model['aopt']
        jk_segments = pca_jkP(self.model['E0'], aopt, n_sets)
        Pcal = self.model['P'][:,:aopt]
        tsq = hotelling(jk_segments, Pcal, p_center=p_center,
                        cov_center=cov_center, alpha=alpha,
                        crot=crot, strict=strict)
        self.model['p_tsq'] = tsq

    def make_model(self, amax, mode, scale):
        """Model on optimal number of components."""
        dat = pca(self.model['E0'], amax, scale, mode)

        # explained variance
        var_x, exp_var_x = variances(self.model['E0'], dat['T'], dat['P'])
        dat['var_x'] = var_x
        dat['exp_var_x'] = exp_var_x

        # fixme: leverage calculations disabled for now
        do_lev_s = False
        do_lev_v = False
        if do_lev_s:
            # sample leverages
            tnorm = scipy.apply_along_axis(norm, 0, dat['T']) # norm of T-columns
            s_lev = leverage(amax, tnorm)
            dat['s_lev'] = s_lev
        if do_lev_v:
            # variable leverages
            v_lev = leverage(amax, dat['P'])
            dat['v_lev'] = v_lev

        self.model.update(dat)

    def as_dataset(self, param, dtype='dataset'):
        """Return model parameter as Dataset."""
        if param not in self.model.keys():
            return
        DX = self._dataset['X'] # input dataset
        dim_name_0, dim_name_1 = DX.get_dim_name()
        # samples
        ids_0 = [dim_name_0, DX.get_identifiers(dim_name_0, sorted=True)]
        # vars
        ids_1 = [dim_name_1, DX.get_identifiers(dim_name_1, sorted=True)]
        # components (hidden)
        pc_ids = ['_comp_a', map(str, range(self.model['aopt']))]
        pc_ids_opt = ['_comp_o', map(str, range(self.model['aopt']))]
        zero_dim = ['_doe', ['0']] # null dim, vector (hidden)
        match_ids = {'E':[ids_0, ids_1],
                     'E0':[ids_0, ids_1],
                     'P':[ids_1, pc_ids],
                     'T':[ids_0, pc_ids],
                     'W':[ids_1, pc_ids],
                     'p_tsq':[ids_1, zero_dim],
                     'rmsep':[pc_ids, zero_dim],
                     'var_leverages':[ids_1, zero_dim],
                     'sample_leverages':[pc_ids, zero_dim],
                     'exp_var_x':[pc_ids, zero_dim],
                     'var_x':[pc_ids, zero_dim],
                     }

        out = Dataset(self.model[param], match_ids[param], name=param)
        return out

    def get_out_plots(self, options):
        out = []
        for plt in options['out_plots']:
            #try:
            out.append(plt(self))
            #except:
            #    logger.log('debug', 'Plot: %s failed' % plt)
        return out

    def run(self, data):
        """Run pca with present options."""
        self.clear()
        options = self._options
        self._dataset['X'] = data
        self._data['X'] = data.asarray().astype('<f8')
        if options['center']:
            center = options['center_mth']
            self.model['E0'] = center(self._data['X'])
        else:
            self.model['E0'] = data.asarray()

        self.pre_validation(**options.pre_validation_options())
        self.make_model(**options.make_model_options())
        if options['calc_conf']:
            self.confidence(**options.confidence_options())

        out = [self.as_dataset(p) for p in options['out_data']]
        for plt in self.get_out_plots(options):
            out.append(plt)
        return out

class PLS(Model):
    def __init__(self, id='pls', name='PLS'):
        Model.__init__(self, id, name)
        self._options = PlsOptions()

    def pre_validation(self, amax, n_sets, val_engine):
        """Returns rmsec, rmsep for model."""
        rmsep = val_engine(self.model['E0'], self.model['F0'],
                           amax, n_sets)
        self.model['rmsep'] = rmsep.mean(0)
        self.model['aopt'] = rmsep.mean(0).argmin()

    def confidence(self, aopt, n_sets, alpha, p_center,
                   crot, strict, cov_center):
        """Returns a confidence measure for model parameters.

        Supported parameters: W
        """
        aopt = self.model['aopt']
        jk_segments = pls_jkW(self.model['E0'], self.model['F0'],
                              aopt, n_sets)
        Wcal = self.model['W'][:,:aopt]
        tsq = hotelling(jk_segments, Wcal, p_center=p_center,
                        cov_center=cov_center, alpha=alpha,
                        crot=crot, strict=strict)
        self.model['w_tsq'] = tsq

    def permutation_confidence(self, a, b, aopt, reg, n_iter, algo,
                               sim_method):
        """Estimates significant variables by controlling the fdr."""
        qvals, false_pos, TSQ, tsq_full = pls_qvals(a, b, aopt=aopt,
                                                    alpha=reg, n_iter=n_iter,
                                                    algo=algo,
                                                    sim_method=sim_method)
        self.model['qval'] = qvals

    def make_model(self, a, b, amax, scale, mode, engine):
        """Make model on amax components."""
        dat = engine(a, b, amax, scale, mode)
        self.model.update(dat)

    def as_dataset(self, name, dtype='Dataset'):
        """Return any model parameter as Dataset.

        No ids matching.
        """
        if name not in self.model.keys():
            return
        DX, DY = self._dataset['X'], self._dataset['Y']
        dim_name_0, dim_name_1 = DX.get_dim_name()
        dim_name_2, dim_name_3 = DY.get_dim_name()
        # samples
        ids_0 = [dim_name_0, DX.get_identifiers(dim_name_0, sorted=True)]
        # x vars
        ids_1 = [dim_name_1, DX.get_identifiers(dim_name_1, sorted=True)]
        # y vars
        ids_3 = [dim_name_3, DY.get_identifiers(dim_name_3, sorted=True)]
        # components (hidden)
        pc_ids = ['_comp', map(str, range(self.model['aopt']))]
        zero_dim = ['_doe', ['0']] # null dim, vector (hidden)

        match_ids = {'E':[ids_0, ids_1],
                     'P':[ids_1, pc_ids],
                     'T':[ids_0, pc_ids],
                     'W':[ids_1, pc_ids],
                     'R':[ids_1, pc_ids],
                     'Q':[ids_3, pc_ids],
                     'F':[ids_0, ids_3],
                     'B':[ids_1, ids_3],
                     'qval':[ids_1, zero_dim],
                     'qval_sorted':[ids_1, zero_dim],
                     'w_tsq':[ids_1, zero_dim],
                     'rmsep':[pc_ids, zero_dim],
                     }

        array = self.model[name]
        M = Dataset(array, identifiers=match_ids[name], name=name)
        return M

    def get_out_plots(self, options):
        out = []
        for plt in options['out_plots']:
            #try:
            out.append(plt(self))
            #except:
            #    logger.log('debug', 'Plot: %s failed' % plt)
        return out

    def run(self, a, b):
        options = self._options
        self._dataset['X'] = a
        self._dataset['Y'] = b
        self._data['X'] = a.asarray()
        self._data['Y'] = b.asarray()
        if options['center']:
            self.model['E0'] = options['center_mth'](self._data['X'])
            self.model['F0'] = options['center_mth'](self._data['Y'])
        else:
            self.model['E0'] = self._data['X']
            self.model['F0'] = self._data['Y']

        self.pre_validation(**options.pre_validation_options())
        self.make_model(self.model['E0'], self.model['F0'],
                        **options.make_model_options())
        # variance captured
        var_x, exp_var_x = variances(self.model['E0'], self.model['T'], self.model['P'])
        self.model['var_x'] = var_x
        self.model['exp_var_x'] = exp_var_x

        var_y, exp_var_y = variances(self.model['F0'], self.model['T'], self.model['Q'])
        self.model['var_y'] = var_y
        self.model['exp_var_y'] = exp_var_y

        if options['calc_conf']:
            self.confidence(**options.confidence_options())

        out = [self.as_dataset(p) for p in options['out_data']]
        for plt in self.get_out_plots(options):
            out.append(plt)
        return out


class Packer:
    """A compression object used to speed up model calculations.

    Often used in conjunction with crossvalidation and perturbation
    analysis.
    """
    def __init__(self, array):
        self._shape = array.shape
        self._array = array
        self._packed_data = None
        self._inflater = None

    def expand(self, a):
        if self._inflater is not None:
            return dot(self._inflater, a)

    def collapse(self, axis=None, mode='svd'):
        if axis is None:
            axis = scipy.argmin(self._array.shape) # default is the smallest dim

        if axis == 1:
            self._array = self._array.T
        u, s, vt = svd(self._array, full_matrices=0)
        self._inflater = vt.T
        self._packed_data = u*s
        return self._packed_data

    def get_packed_data(self):
        return self._packed_data
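
# A minimal usage sketch of Packer (hypothetical data, not part of the
# original API): collapse a wide matrix onto its row-space scores before
# heavy resampling, then expand back to the original variable space.
def _example_packer():
    from scipy import rand
    X = rand(10, 100)                # 10 samples, 100 variables
    packer = Packer(X.copy())
    Xc = packer.collapse()           # (10 x 10) packed scores, spans the row space
    Xhat = packer.expand(Xc.T).T     # reconstructs X (up to fp error)
    print abs(X - Xhat).max()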

class Options(dict):
    """Options base class."""
    def __init__(self, *args, **kw):
        dict.__init__(self, *args, **kw)

    def _copy_from_list(self, key_list):
        d = {}
        for key in key_list:
            d[key] = self.get(key, None)
        return d


class PcaOptions(Options):
    """Options for Principal Component Analysis."""
    def __init__(self):
        Options.__init__(self)
        self._set_default()

    def _set_default(self):
        opt = {}
        opt['algo'] = 'pca'
        opt['engine'] = engines.pca
        opt['mode'] = 'normal' # how much info to calculate
        opt['lod'] = 'compact' # how much info to store
        opt['amax'] = 5
        opt['aopt'] = 5
        opt['center'] = True
        opt['center_mth'] = mat_center
        opt['scale'] = 'scores'
        opt['calc_conf'] = True
        opt['n_sets'] = 5

        opt['strict'] = True
        opt['p_center'] = 'med'
        opt['alpha'] = .8
        opt['cov_center'] = 'med'
        opt['crot'] = True

        opt['val_engine'] = pca_alter_val
        opt['val_n_sets'] = 10

        opt['all_data'] = ['T', 'P', 'E', 'p_tsq', 'rmsep']
        opt['all_plots'] = ['PcaScorePlot', 'PcaLoadingPlot',
                            'PcaRmsepPlot']

        opt['out_data'] = ['T', 'P', 'p_tsq']
        opt['out_plots'] = [blmplots.PcaScorePlot,
                            blmplots.PcaLoadingPlot,
                            blmplots.LineViewXc]

        self.update(opt)

    def make_model_options(self):
        """Options for make_model method."""
        opt_list = ['scale', 'mode', 'amax']
        return self._copy_from_list(opt_list)

    def confidence_options(self):
        """Options for confidence method."""
        opt_list = ['n_sets', 'aopt', 'alpha', 'p_center',
                    'strict', 'crot', 'cov_center']
        return self._copy_from_list(opt_list)

    def pre_validation_options(self):
        """Options for pre_validation method."""
        opt_list = ['amax', 'n_sets', 'val_engine']
        return self._copy_from_list(opt_list)


class PlsOptions(Options):
    """Options for Partial Least Squares Regression."""
    def __init__(self):
        Options.__init__(self)
        self._set_default()

    def _set_default(self):
        opt = {}
        opt['algo'] = 'pls'
        opt['engine'] = engines.pls
        opt['mode'] = 'normal' # how much info to calculate
        opt['lod'] = 'compact' # how much info to store
        opt['amax'] = 3
        opt['aopt'] = 3
        opt['center'] = True
        opt['center_mth'] = mat_center
        opt['scale'] = 'scores'
        opt['calc_conf'] = True
        opt['n_sets'] = 10

        opt['strict'] = True
        opt['p_center'] = 'med'
        opt['alpha'] = .2
        opt['cov_center'] = 'med'
        opt['crot'] = True

        opt['val_engine'] = w_pls_cv_val

        opt['all_data'] = ['T', 'P', 'E', 'p_tsq', 'rmsep']
        opt['all_plots'] = ['PcaScorePlot', 'PcaLoadingPlot',
                            'PcaRmsepPlot']

        opt['out_data'] = []
        opt['out_plots'] = [blmplots.PlsScorePlot,
                            blmplots.PlsLoadingPlot,
                            blmplots.LineViewXc]
                            #blmplots.PlsQvalScatter]

        opt['pack'] = False
        opt['calc_qvals'] = False
        opt['q_pert_mth'] = 'shuffle_vars'
        opt['q_iter'] = 20
        self.update(opt)

    def make_model_options(self):
        """Options for make_model method."""
        opt_list = ['scale', 'mode', 'amax', 'engine']
        return self._copy_from_list(opt_list)

    def confidence_options(self):
        """Options for confidence method."""
        opt_list = ['n_sets', 'aopt', 'alpha', 'p_center',
                    'strict', 'crot', 'cov_center']
        return self._copy_from_list(opt_list)

    def pre_validation_options(self):
        """Options for pre_validation method."""
        opt_list = ['amax', 'n_sets', 'val_engine']
        return self._copy_from_list(opt_list)

@@ -0,0 +1,158 @@

"""Specialised plots for functions defined in blmfuncs.py.

fixme:
-- I'm normalising all color mapping input vectors to [0,1]. This will
destroy informative numerical values in the colorbar (but we are not
showing these anyway). A better fix would be to let the colorbar listen
to the scalarmappable instance and correct itself, but I did not get
that to work ...

fixme2:
-- If a scatterplot is not inited with a colorvector there will be no
colorbar, but when adding colors the colorbar should be created.
"""
from fluents import plots
from scipy import dot, sum, diag, arange, log, mean, newaxis
from matplotlib import cm

class PcaScorePlot(plots.ScatterPlot):
    """PCA Score plot"""
    def __init__(self, model, absi=0, ordi=1):
        self._T = model.model['T']
        dataset_1 = model.as_dataset('T')
        dataset_2 = dataset_1
        id_dim = dataset_1.get_dim_name(0)
        sel_dim = dataset_1.get_dim_name(1)
        id_1, = dataset_1.get_identifiers(sel_dim, [absi])
        id_2, = dataset_1.get_identifiers(sel_dim, [ordi])
        plots.ScatterPlot.__init__(self, dataset_1, dataset_2,
                                   id_dim, sel_dim, id_1, id_2,
                                   c='b', s=40, name='pca-scores')

    def set_absicca(self, n):
        self.xaxis_data = self._T[:,n]

    def set_ordinate(self, n):
        self.yaxis_data = self._T[:,n]


class PcaLoadingPlot(plots.ScatterPlot):
    """PCA Loading plot"""
    def __init__(self, model, absi=0, ordi=1):
        self._P = model.model['P']
        dataset_1 = model.as_dataset('P')
        dataset_2 = dataset_1
        id_dim = dataset_1.get_dim_name(0)
        sel_dim = dataset_1.get_dim_name(1)
        id_1, = dataset_1.get_identifiers(sel_dim, [absi])
        id_2, = dataset_1.get_identifiers(sel_dim, [ordi])
        if model.model.has_key('p_tsq'):
            col = model.model['p_tsq'].ravel()
            col = normalise(col)
        else:
            col = 'g'
        plots.ScatterPlot.__init__(self, dataset_1, dataset_2,
                                   id_dim, sel_dim, id_1, id_2,
                                   c=col, s=20, name='pca-loadings')

    def set_absicca(self, n):
        self.xaxis_data = self._P[:,n]

    def set_ordinate(self, n):
        self.yaxis_data = self._P[:,n]


class PlsScorePlot(plots.ScatterPlot):
    """PLS Score plot"""
    def __init__(self, model, absi=0, ordi=1):
        self._T = model.model['T']
        dataset_1 = model.as_dataset('T')
        dataset_2 = dataset_1
        id_dim = dataset_1.get_dim_name(0)
        sel_dim = dataset_1.get_dim_name(1)
        id_1, = dataset_1.get_identifiers(sel_dim, [absi])
        id_2, = dataset_1.get_identifiers(sel_dim, [ordi])

        plots.ScatterPlot.__init__(self, dataset_1, dataset_2,
                                   id_dim, sel_dim, id_1, id_2,
                                   c='b', s=40, name='pls-scores')

    def set_absicca(self, n):
        self.xaxis_data = self._T[:,n]

    def set_ordinate(self, n):
        self.yaxis_data = self._T[:,n]


class PlsLoadingPlot(plots.ScatterPlot):
    """PLS Loading plot"""
    def __init__(self, model, absi=0, ordi=1):
        self._P = model.model['P']
        dataset_1 = model.as_dataset('P')
        dataset_2 = dataset_1
        id_dim = dataset_1.get_dim_name(0)
        sel_dim = dataset_1.get_dim_name(1)
        id_1, = dataset_1.get_identifiers(sel_dim, [absi])
        id_2, = dataset_1.get_identifiers(sel_dim, [ordi])
        if model.model.has_key('w_tsq'):
            col = model.model['w_tsq'].ravel()
            col = normalise(col)
        else:
            col = 'g'
        plots.ScatterPlot.__init__(self, dataset_1, dataset_2,
                                   id_dim, sel_dim, id_1, id_2,
                                   c=col, s=20, name='loadings')

    def set_absicca(self, n):
        self.xaxis_data = self._P[:,n]

    def set_ordinate(self, n):
        self.yaxis_data = self._P[:,n]


class LineViewXc(plots.LineViewPlot):
    """A line view of centered raw data."""
    def __init__(self, func_class, name='Profiles'):
        # copy, center, plot
        x = func_class._dataset['X'].copy()
        x._array = x._array - mean(x._array, 0)[newaxis]
        plots.LineViewPlot.__init__(self, x, 1, None, name)


class ParalellCoordinates(plots.Plot):
    """Parallel coordinates for scores/loads with many components."""
    def __init__(self, model, p='loads'):
        pass


class PlsQvalScatter(plots.ScatterPlot):
    """A volcano-like plot of loads vs qvals."""
    def __init__(self, func_class, pc=0):
        model = func_class.model
        if not model.has_key('w_tsq'):
            return
        self._W = model['P']
        dataset_1 = func_class.as_dataset('P')
        dataset_2 = func_class.as_dataset('w_tsq')
        id_dim = dataset_1.get_dim_name(0)    # genes
        sel_dim = dataset_1.get_dim_name(1)   # _comp
        sel_dim_2 = dataset_2.get_dim_name(1) # _zero_dim
        id_1, = dataset_1.get_identifiers(sel_dim, [0])
        id_2, = dataset_2.get_identifiers(sel_dim_2, [0])
        if model.has_key('w_tsq'):
            col = model['w_tsq'].ravel()
            col = normalise(col)
        else:
            col = 'g'
        plots.ScatterPlot.__init__(self, dataset_1, dataset_2,
                                   id_dim, sel_dim, id_1, id_2,
                                   c=col, s=20, sel_dim_2=sel_dim_2,
                                   name='Load Volcano')


class InfluencePlot(plots.ScatterPlot):
    """(Not implemented.)"""
    pass


def normalise(x):
    """Scale vector x to [0,1]."""
    x = x - x.min()
    x = x/x.max()
    return x
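
# A minimal usage sketch (made-up numbers, not part of the original API):
# normalise maps any vector onto [0,1], which is what the colormapped
# scatter plots above expect.
def _example_normalise():
    from scipy import array
    v = array([3., 7., 5.])
    print normalise(v) # -> [ 0.   1.   0.5]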
@@ -0,0 +1,256 @@

from scipy import zeros,zeros_like,sqrt,dot,trace,sign,round_,argmax,\
    sort,ravel,newaxis,asarray,diag,sum,outer,argsort,arange,ones_like,\
    all,apply_along_axis,eye,empty,cumsum
from scipy.linalg import svd,inv,norm,det,sqrtm
from scipy.stats import mean,median
from cx_utils import mat_center
from validation import pls_jkW
from select_generators import shuffle_1d
from engines import *
import time

def hotelling(P, Pfull, p_center='med', cov_center='med',
              alpha=0.3, crot=True, strict=False, metric=None):
    """Returns regularised hotelling T^2.

    p_center -- location method for submodels
    cov_center -- location method for sub covariances
    alpha -- regularisation towards pooled covariance estimate
    crot -- rotate submodels towards the full model?
    strict -- only rotate 90 degrees?
    metric -- inverse metric matrix (if P and Pfull from metric pca/pls)
    """
    m, n = Pfull.shape
    if metric is None:
        metric = eye(m, dtype='<f8')
    Pfull = dot(metric.T, asarray(Pfull))
    n_sets, n, amax = P.shape
    # allocate
    T_sq = empty((n,), dtype='f')
    Cov_i = zeros((n, amax, amax), dtype='f')

    # rotate sub_models to full model
    if crot:
        for i, Pi in enumerate(P):
            Pi = dot(metric.T, Pi)
            P[i] = procrustes(Pfull, Pi, strict=strict)

    # center of the submodels
    if p_center == 'med':
        P_ctr = median(P, 0)
    elif p_center == 'mean':
        # fixme: mean is unstable
        P_ctr = mean(P, 0)
    else: # use full model
        P_ctr = Pfull

    for i in xrange(n):
        Pi = P[:,i,:]        # (n_sets x amax)
        Pi_ctr = P_ctr[i,:]  # (1 x amax)
        Pim = (Pi - Pi_ctr[newaxis])*sqrt(n_sets - 1)
        Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim)

    if cov_center == 'med':
        Cov = median(Cov_i, 0)
    else:
        Cov = mean(Cov_i, 0)

    reg_cov = (1. - alpha)*Cov_i + alpha*Cov
    for i in xrange(n):
        Pc = P_ctr[i,:][:,newaxis]
        sigma = reg_cov[i]
        #T_sq[i] = sqrt(dot(dot(Pc.T, inv(sigma)), Pc).ravel())
        T_sq[i] = dot(dot(Pc.T, inv(sigma)), Pc).ravel()
    return T_sq
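
# A small numeric sketch (made-up values) of the covariance shrinkage
# step used in hotelling above: each per-variable segment covariance is
# shrunk towards the common centre, reg_cov = (1 - alpha)*Cov_i + alpha*Cov.
def _example_shrinkage():
    Cov_i = asarray([[[4., 0.], [0., 1.]]]) # one variable, two components
    Cov = asarray([[1., 0.], [0., 1.]])     # pooled centre estimate
    alpha = 0.5
    print (1. - alpha)*Cov_i + alpha*Cov    # diagonal entries -> 2.5 and 1.0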

def procrustes(A, B, strict=True, center=False, verbose=False):
    """Rotation of B to A.

    strict -- only do flipping and shuffling
    center -- center before rotation, translate back after
    verbose -- print sum of squares

    No scaling calculated.
    Output: B_rot, the rotated B.
    """
    if center:
        A, mn_A = mat_center(A, ret_mn=True)
        B, mn_B = mat_center(B, ret_mn=True)
    u, s, vh = svd(dot(B.T, A))
    v = vh.T
    Cm = dot(u, v.T) # orthogonal rotation matrix
    if strict: # just flipping and shuffling
        Cm = ensure_strict(Cm)
    b_rot = dot(B, Cm)

    if verbose:
        print Cm.round()
        fit = sum(ravel(B - b_rot)**2)
        print "Sum of squares: %s" % fit
    if center:
        return mn_B + b_rot
    else:
        return b_rot
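
# Quick sketch (toy numbers): in strict mode procrustes recovers a pure
# sign flip of the loadings, which is the typical jackknife ambiguity.
def _example_procrustes():
    A = asarray([[1., 0.], [0., 1.], [1., 1.]])
    B = -A                 # same subspace, flipped signs
    print procrustes(A, B) # -> A again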

def expl_var_x(X, T):
    """Returns explained variance of X (centered)."""
    exp_var_x = diag(dot(T.T, T))*100/(sum(X**2))
    return exp_var_x


def expl_var_y(Y, T, Q):
    """Returns explained variance of Y (centered)."""
    exp_var_y = zeros((Q.shape[1],))
    for a in range(Q.shape[1]):
        Ya = outer(T[:,a], Q[:,a])
        exp_var_y[a] = 100*sum(Ya**2)/sum(Y**2)
    return exp_var_y

def pls_qvals(a, b, aopt=None, alpha=.3,
              n_iter=20, algo='pls',
              sim_method='shuffle',
              p_center='med', cov_center='med',
              crot=True, strict=False, metric=None):
    """Returns qvals for pls model.

    input:
    a -- centered data matrix
    b -- centered data matrix
    aopt -- scalar, optimal number of components
    alpha -- [0,1] regularisation parameter for T2-test
    n_iter -- number of permutations
    sim_method -- permutation method ['shuffle']
    p_center -- location estimator for sub models ['med']
    cov_center -- location estimator for covariance of submodels ['med']
    crot -- bool, use rotations of sub models?
    strict -- bool, use strict (rot/flips only) rotations?
    metric -- bool, use row metric?
    """
    m, n = a.shape
    TSQ = zeros((n, n_iter), dtype='<f8') # (nvars x n_subsets)
    n_false = zeros((n, n_iter), dtype='<f8')

    # full model
    if algo == 'bridge':
        dat = bridge(a, b, aopt, 'loads', 'fast')
    else:
        dat = pls(a, b, aopt, 'loads', 'fast')
    W = pls_jkW(a, b, aopt, n_blocks=None, algo=algo)
    tsq_full = hotelling(W, dat['W'], p_center=p_center,
                         alpha=alpha, crot=crot, strict=strict,
                         cov_center=cov_center, metric=metric)
    t0 = time.time()
    Vs = shuffle_1d(b, n_iter)
    for i, b_shuff in enumerate(Vs):
        t1 = time.time()
        if algo == 'bridge':
            dat = bridge(a, b_shuff, aopt, 'loads', 'fast')
        else:
            dat = pls(a, b_shuff, aopt, 'loads', 'fast')
        W = pls_jkW(a, b_shuff, aopt, n_blocks=None, algo=algo)
        TSQ[:,i] = hotelling(W, dat['W'], p_center=p_center,
                             alpha=alpha, crot=crot, strict=strict,
                             cov_center=cov_center, metric=metric)
        print time.time() - t1
    sort_index = argsort(tsq_full)[::-1]
    back_sort_index = sort_index.argsort()
    print time.time() - t0

    # count false positives
    tsq_full_sorted = tsq_full.take(sort_index)
    for i in xrange(n_iter):
        for j in xrange(n):
            n_false[j,i] = sum(TSQ[:,i] >= tsq_full[j])
    false_pos = median(n_false, 1)
    ll = arange(1, len(false_pos)+1, 1)
    sort_qval = false_pos.take(sort_index)/ll
    qval = false_pos/ll.take(back_sort_index)
    print time.time() - t0
    return qval, false_pos, TSQ, tsq_full

def ensure_strict(C, only_flips=True):
    """Ensure that a rotation matrix does only 90 degree rotations.

    In multiplication with pcs this allows flips and reordering.
    If only_flips is True, only flips are allowed.
    """
    Cm = C
    S = sign(C) # signs
    if only_flips:
        C = eye(Cm.shape[0])*S
        return C
    Cm = zeros_like(C)
    Cm.putmask(1., abs(C) > .6)
    if det(Cm) > 1:
        raise ValueError, "Implement this!"
    return Cm*S


def leverage(aopt=1, *args):
    """Returns leverages.

    input: aopt, number of components to base leverage calculations on
           *args, matrices of normed blm-parameters
    output: leverages

    For PCA typical inputs are normalised T or normalised P.
    For PLSR typical inputs are normalised T or normalised W.
    """
    if aopt < 1:
        raise ValueError, "Leverages only make sense for aopt > 0"
    lev = []
    for u in args:
        lev_u = 1./u.shape[0] + dot(u[:,:aopt], u[:,:aopt].T).diagonal()
        lev.append(lev_u)
    return lev
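
# Sketch (toy numbers): leverage of three samples from one normalised
# score column is 1/n plus the squared entries.
def _example_leverage():
    T = asarray([[.8], [.6], [0.]])
    print leverage(1, T) # [array([ 0.9733...,  0.6933...,  0.3333...])]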

def variances(a, t, p):
    """Returns explained variance and individual variance from blm-params.

    input:
    a -- full centered matrix
    t, p -- parameters from a bilinear approximation of the above matrix
    output:
    var -- variance of each component
    var_exp -- cumulative explained variance in percent

    Typical inputs are: X(centered),T,P for PCA, or
    X(centered),T,P / Y(centered),T,Q for PLSR.
    """
    tot_var = sum(a**2)
    var = 100*(sum(p**2, 0)*sum(t**2, 0))/tot_var
    var_exp = cumsum(var)
    return var, var_exp
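
# Sketch (toy numbers): for an exact rank-1 bilinear fit the single
# component explains 100% of the variance.
def _example_variances():
    t = asarray([[1.], [2.]])
    p = asarray([[3.], [4.]])
    a = dot(t, p.T)          # rank-1 "centered" data
    print variances(a, t, p) # (array([ 100.]), array([ 100.]))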

def residual_diagnostics(Y, Yhat, aopt=1):
    """Root mean errors, press values and R2 values.
    (Not implemented.)
    """
    pass


def ssq(E, axis=0, weights=None):
    """Sum of squares, supports weights."""
    n = E.shape[axis]
    if weights is None:
        weights = eye(n)
    else:
        weights = diag(weights)
    if axis == 0:
        Ew = dot(weights, E)
    elif axis == 1:
        Ew = dot(E, weights)
    else:
        raise NotImplementedError, "Higher order modes not supported"
    return pow(Ew, 2).sum(axis)

@@ -0,0 +1,108 @@

from scipy import apply_along_axis,newaxis,zeros,nan,\
    round_,nonzero,dot,argmax,any,sqrt,ndarray,\
    trace,zeros_like,sign,sort,real,argsort,rand,array
from scipy.linalg import norm,svd,inv,eig
from scipy.stats import median,mean

def normalise(a, axis=0, return_scales=False):
    s = apply_along_axis(norm, axis, a)
    if axis == 0:
        s = s[newaxis]
    else:
        s = s[:,newaxis]

    a_s = a/s
    if return_scales:
        return a_s, s
    return a_s

def sub2ind(shape, i, j):
    """Indices from subscripts. Only supports 2d."""
    row, col = shape
    ind = []
    for k in xrange(len(i)):
        for m in xrange(len(j)):
            ind.append(i[k]*col + j[m])
    return ind


def sorted_eig(a, b=None, sort_by='sm'):
    """Just eig with the real part of the output sorted.

    This is for convenience only, not general!

    sort_by='sm': return the eigenvectors by eigenvalues
                  of smallest magnitude first (default)
            'lm': return largest eigenvalues first

    output: just as eig with 2 outputs -- s,v (eigvals, eigenvectors)
            (This is reversed output compared to matlab.)
    """
    s, v = eig(a, b)
    s = real(s) # don't expect any imaginary part
    v = real(v)
    ind = argsort(s)
    if sort_by == 'lm':
        ind = ind[::-1]
    v = v.take(ind, 1)
    s = s.take(ind)

    return s, v
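
# Sketch: the eigenvalues of diag(2, 1) come back smallest-magnitude
# first by default, largest first with sort_by='lm'.
def _example_sorted_eig():
    from scipy import diag
    s, v = sorted_eig(diag([2., 1.]))
    print s # [ 1.  2.]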

def str2num(string_number):
    """Convert input (string number) into a number; if float(string_number)
    fails, a nan is inserted.
    """
    missings = ['', 'nan', 'NaN', 'NA']
    try:
        num = float(string_number)
    except ValueError:
        if string_number in missings:
            num = nan
        else:
            print "Found strange entry: %s" % string_number
            raise
    return num
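
# Sketch: numbers parse normally, the listed missing-value strings map
# to nan, anything else re-raises.
def _example_str2num():
    print str2num('1.5'), str2num('NA') # 1.5 nan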

def randperm(n):
    """Returns a random permutation of range(n)."""
    r = rand(n)
    mapping = {}
    for i in range(n):
        mapping[r[i]] = i
    r = sort(r)
    out = zeros(n)
    for i in range(n):
        out[i] = mapping[r[i]]
    return array(out, dtype='i')


def mat_center(X, axis=0, ret_mn=False):
    """Mean center matrix along axis.

    X -- matrix, data
    axis -- dim; axis=0 (default) subtracts the column means,
            axis=1 subtracts the row means
    ret_mn -- bool, return mean

    output:
    Xc, [mnX]
    """
    try:
        rows, cols = X.shape
    except ValueError:
        print "The X data needs to be two-dimensional"
        raise

    if axis == 0:
        mnX = mean(X, axis)[newaxis]
        Xs = X - mnX
    elif axis == 1:
        mnX = mean(X, axis)[newaxis]
        Xs = (X.T - mnX).T
    if ret_mn:
        return Xs, mnX
    else:
        return Xs
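
# Sketch (toy numbers): with the default axis=0 every column of the
# centered matrix sums to zero.
def _example_mat_center():
    X = array([[1., 2.], [3., 4.]])
    print mat_center(X) # [[-1. -1.] [ 1.  1.]]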
@@ -0,0 +1,200 @@

"""Module containing algorithms for (burdensome) calculations.

There is no typechecking of any kind here, just focus on speed.
"""

from scipy.linalg import svd,norm,inv,pinv,qr
from scipy import dot,empty,eye,newaxis,zeros,sqrt,diag,\
    apply_along_axis,mean,ones,randn,empty_like,outer,c_,\
    rand,sum,cumsum

def pca(a, aopt, scale='scores', mode='normal'):
    """Principal Component Analysis model.

    mode:
    -- fast : returns smallest dim scaled (T for n<=m, P for n>m)
    -- normal : returns all model params and residuals after aopt comp.
    -- detailed : returns all model params and all residuals
    """
    m, n = a.shape
    u, s, vt = svd(a, full_matrices=0)
    T = u*s
    T = T[:,:aopt]
    P = vt[:aopt,:].T

    if scale == 'loads':
        tnorm = apply_along_axis(norm, 0, T)
        T = T/tnorm
        P = P*tnorm

    if mode == 'fast':
        return {'T':T, 'P':P}

    if mode == 'detailed':
        # detailed mode returns a residual matrix for every number of
        # components, i.e. E is a three-mode array: (aopt, m, n)
        E = empty((aopt, m, n))
        for ai in range(aopt):
            e = a - dot(T[:,:ai+1], P[:,:ai+1].T)
            E[ai,:,:] = e.copy()
    else:
        E = a - dot(T, P.T)

    return {'T':T, 'P':P, 'E':E}
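
# Usage sketch (random data): the normal-mode output satisfies
# X == T*P' + E for the centered input X.
def _example_pca():
    X = rand(10, 4)
    X = X - mean(X, 0)[newaxis] # centre the columns first
    dat = pca(X, 2)
    print abs(X - dot(dat['T'], dat['P'].T) - dat['E']).max() # ~0.0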

def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
    """Kernel pls for tall/wide matrices.

    Fast pls for calibration. Only inefficient for many Y-vars.
    """
    m, n = a.shape
    k, l = b.shape

    W = empty((n, aopt))
    P = empty((n, aopt))
    R = empty((n, aopt))
    Q = empty((l, aopt))
    T = empty((m, aopt))
    B = empty((aopt, n, l))

    if ab is None:
        ab = dot(a.T, b)
    for i in range(aopt):
        if ab.shape[1] == 1:
            w = ab
        else:
            u, s, vh = svd(dot(ab.T, ab))
            w = dot(ab, u[:,:1])

        w = w/norm(w)
        r = w.copy()
        if i > 0:
            for j in range(0, i, 1):
                r = r - dot(P[:,j].T, w)*R[:,j][:,newaxis]
        t = dot(a, r)
        tt = norm(t)**2
        p = dot(a.T, t)/tt
        q = dot(r.T, ab).T/tt
        ab = ab - dot(p, q.T)*tt
        T[:,i] = t.ravel()
        W[:,i] = w.ravel()
        P[:,i] = p.ravel()
        R[:,i] = r.ravel()

        if mode == 'fast' and i == aopt - 1:
            if scale == 'loads':
                tnorm = apply_along_axis(norm, 0, T)
                T = T/tnorm
                W = W*tnorm
            return {'T':T, 'W':W}

        Q[:,i] = q.ravel()
        B[i] = dot(R[:,:i+1], Q[:,:i+1].T)

    if mode == 'detailed':
        E = empty((aopt, m, n))
        F = empty((aopt, k, l))
        for i in range(1, aopt+1, 1):
            E[i-1] = a - dot(T[:,:i], P[:,:i].T)
            F[i-1] = b - dot(T[:,:i], Q[:,:i].T)
    else:
        E = a - dot(T[:,:aopt], P[:,:aopt].T)
        F = b - dot(T[:,:aopt], Q[:,:aopt].T)

    if scale == 'loads':
        tnorm = apply_along_axis(norm, 0, T)
        T = T/tnorm
        W = W*tnorm
        Q = Q*tnorm
        P = P*tnorm

    return {'B':B, 'Q':Q, 'P':P, 'T':T, 'W':W, 'R':R, 'E':E, 'F':F}
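
# Usage sketch (random data): with aopt components the regression matrix
# B[-1] reproduces Y up to the residual F, i.e. Y == X*B + F.
def _example_pls():
    X = rand(10, 4); X = X - mean(X, 0)[newaxis]
    Y = rand(10, 1); Y = Y - mean(Y, 0)[newaxis]
    dat = pls(X, Y, aopt=2)
    print abs(Y - dot(X, dat['B'][-1]) - dat['F']).max() # ~0.0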

def w_simpls(aat, b, aopt):
    """Simpls for wide matrices.

    Fast pls for crossval, used in calculating rmsep for wide X.
    There is no P or W. T is normalised.
    """
    bb = b.copy()
    m, m = aat.shape
    U = empty((m, aopt))
    T = empty((m, aopt))
    H = empty((m, aopt))    # just like W in simpls
    PROJ = empty((m, aopt)) # just like R in simpls

    for i in range(aopt):
        u, s, vh = svd(dot(dot(b.T, aat), b), full_matrices=0)
        u = dot(b, u[:,:1]) # y-factor scores
        U[:,i] = u.ravel()
        t = dot(aat, u)
        t = t/norm(t)
        T[:,i] = t.ravel()
        h = dot(aat, t) # score-weights
        H[:,i] = h.ravel()
        PROJ[:,:i+1] = dot(T[:,:i+1], inv(dot(T[:,:i+1].T, H[:,:i+1])))
        if i < aopt:
            b = b - dot(PROJ[:,:i+1], dot(H[:,:i+1].T, b))
    C = dot(bb.T, T)

    return {'T':T, 'U':U, 'Q':C, 'H':H}


def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
    """Undeflated ridged svd(X'Y)."""
    m, n = a.shape
    k, l = b.shape
    u, s, vt = svd(b, full_matrices=0)
    g0 = dot(u*s, u.T)
    g = (1 - r)*g0 + r*eye(m)
    ag = dot(a.T, g)

    u, s, vt = svd(ag, full_matrices=0)
    W = u[:,:aopt]
    K = vt[:aopt,:].T
    T = dot(a, W)
    tnorm = apply_along_axis(norm, 0, T) # norm of T-columns

    if mode == 'fast':
        if scale == 'loads':
            T = T/tnorm
            W = W*tnorm
        return {'T':T, 'W':W}

    U = dot(g0, K) # fixme: check this
    Q = dot(b.T, dot(T, inv(dot(T.T, T))))
    B = zeros((aopt, n, l))
    for i in range(aopt):
        B[i] = dot(W[:,:i+1], Q[:,:i+1].T)
    # leverages
    # fixme: probably need an orthogonal basis for row-space leverage;
    # T (scores) are not orthogonal, so use a qr decomposition to get an
    # orthonormal basis for the row-space:
    #Tq = qr(T)[0]
    #s_lev,v_lev = leverage(aopt,Tq,W)
    # explained variance
    #var_x, exp_var_x = variances(a,T,W)
    #qnorm = apply_along_axis(norm, 0, Q)
    #var_y, exp_var_y = variances(b,U,Q/qnorm)

    if mode == 'detailed':
        E = empty((aopt, m, n))
        F = empty((aopt, k, l))
        for i in range(aopt):
            E[i] = a - dot(T[:,:i+1], W[:,:i+1].T)
            F[i] = b - dot(a, B[i])
    else: # normal
        F = b - dot(a, B[-1])
        E = a - dot(T, W.T)

    if scale == 'loads':
        T = T/tnorm
        W = W*tnorm
        Q = Q*tnorm

    return {'B':B, 'W':W, 'T':T, 'Q':Q, 'E':E, 'F':F, 'U':U, 'P':W}

@@ -0,0 +1,626 @@

import os, sys
from itertools import izip
import networkx as NX
from scipy import shape,diag,dot,asarray,sqrt,real,zeros,eye,exp,maximum,\
    outer,sum,ravel,array,mean
from scipy.linalg import eig,svd,inv,expm,norm
from cx_utils import sorted_eig

import numpy

eps = numpy.finfo(float).eps.item()
feps = numpy.finfo(numpy.single).eps.item()
_array_precision = {'f': 0, 'd': 1, 'F': 0, 'D': 1, 'i': 1}

def xgraph_to_graph(G):
    """Convert an XGraph to an ordinary graph.

    Edge attributes, multi-edges and self-loops are lost in the process.
    """
    GG = NX.convert.from_dict_of_lists(NX.convert.to_dict_of_lists(G))
    return GG


def get_affinity_matrix(G, data, ids, dist='e', mask=None, weight=None, t=0, out='dist'):
    """Calculates a general affinity matrix, based upon distances.

    Affinity = 1 - distance, rescaled so the largest distance maps to
    the farthest-apart affinity.

    INPUT

    data:
        gene expression data, type dict data[gene] = expression-vector

    G:
        the network (networkx.base.Graph object)

    mask:
        the array mask shows which data are missing. If mask[i][j]==0,
        then data[i][j] is missing.

    weights:
        the array weight contains the weights to be used when
        calculating distances.

    transpose:
        if transpose==0, genes are clustered; if transpose==1,
        microarrays are clustered.

    dist:
        the character dist defines the distance function to be used:
        dist=='e': Euclidean distance
        dist=='b': City Block distance
        dist=='h': Harmonically summed Euclidean distance
        dist=='c': Pearson correlation
        dist=='a': absolute value of the correlation
        dist=='u': uncentered correlation
        dist=='x': absolute uncentered correlation
        dist=='s': Spearman's rank correlation
        dist=='k': Kendall's tau
        For other values of dist, the default (Euclidean distance) is used.

    OUTPUT
    D:
        similarity matrix (nGenes x nGenes), symmetric, d_ij in [0,1],
        normalised so max weight = 1.0
    """
    try:
        from Bio import Cluster as CLS
    except:
        raise ValueError, "Needs installed biopython"
    nVar = len(data)
    nSamp = len(data[data.keys()[0]])
    X = zeros((nVar, nSamp), dtype='<f8')
    for i, gene in enumerate(ids): # this should be right!!
        X[i,:] = data[gene]

    #X = transpose(X) # distancematrix needs matrix as (nGenes, nSamples)

    D_list = CLS.distancematrix(X, dist=dist)
    D = zeros((nVar, nVar), dtype='<f8')
    for i, row in enumerate(D_list):
        if i > 0:
            D[i,:len(row)] = row

    D = D + D.T
    MAX = 30.0
    D_max = max(ravel(D))/MAX
    D_n = D/D_max            # normalised (max = 30.0)
    D_n = (MAX + 1.) - D_n   # using correlation (inverse distance for dists)

    A = NX.adj_matrix(G, nodelist=ids)
    if out == 'dist':
        return D_n*A
    elif out == 'heat_kernel':
        t = 1.0
        K = exp(-t*D*A)
        return K
    elif out == 'complete':
        return D_n
    else:
        return []

def remove_one_degree_nodes(G, iter=True):
    """Removes all nodes with only one neighbour. These nodes do
    not contribute to community structure.

    input:
    G -- graph
    iter -- True/False, iteratively remove?
    """
    G_copy = G.copy()
    if iter == True:
        while 1:
            bad_nodes = []
            for node in G_copy.nodes():
                if len(G_copy.neighbors(node)) == 1:
                    bad_nodes.append(node)
            if len(bad_nodes) > 0:
                G_copy.delete_nodes_from(bad_nodes)
            else:
                break
    else:
        bad_nodes = []
        for node in G_copy.nodes():
            if len(G_copy.neighbors(node)) == 1:
                bad_nodes.append(node)
        if len(bad_nodes) > 0:
            G_copy.delete_nodes_from(bad_nodes)

    print "Deleted %s nodes from network" % (len(G) - len(G_copy))
    return G_copy

def key_players(G, n=1, with_labels=False):
    """Resilience measure.

    Identification of key nodes by the fraction of nodes in the
    disconnected subgraph when a node is removed.

    output:
    fraction of nodes disconnected when node i is removed
    """
    i = 0
    frac = []
    labels = {}
    for node in G.nodes():
        i += 1
        print i
        T = G.copy()
        T.delete_node(node)
        n_nodes = T.number_of_nodes()
        sub_graphs = NX.connected_component_subgraphs(T)
        n_comp = len(sub_graphs)
        if n_comp > 1:
            strong_comp = sub_graphs[0]
            fraction = 1.0 - 1.0*strong_comp.number_of_nodes()/n_nodes
            frac.append(fraction)
            labels[node] = fraction
        else:
            frac.append(0.0)
            labels[node] = 0.0

    out = 1.0 - array(frac)
    if with_labels == True:
        return out, labels
    else:
        return out

def node_weighted_adj_matrix(G, weights=None, ave_type='harmonic', with_labels=False):
    """Return a weighted adjacency matrix of the graph. The weights are
    node weights.

    input:  G -- graph
            weights -- dict, keys: nodes, values: weights
            with_labels -- True/False, return labels?

    output: A -- weighted adjacency matrix
            [index] -- node labels
    """
    n = G.order()
    # make a dictionary that maps vertex name to position
    index = {}
    count = 0
    for node in G.nodes():
        index[node] = count
        count = count + 1

    a = zeros((n, n))
    if G.__class__.__name__ == 'XGraph':
        raise TypeError, "Use weighted_adj_matrix for XGraphs"
    for head, tail in G.edges():
        if ave_type == 'geometric':
            a[index[head],index[tail]] = sqrt(weights[head]*weights[tail])
            a[index[tail],index[head]] = a[index[head],index[tail]]
        elif ave_type == 'harmonic':
            # fixme: this is the arithmetic, not the harmonic, mean
            a[index[head],index[tail]] = mean([weights[head], weights[tail]])
            a[index[tail],index[head]] = a[index[head],index[tail]]
    if with_labels:
        return a, index
    else:
        return a

def weighted_adj_matrix(G, with_labels=False):
    """Adjacency matrix of an XGraph whose weights are given in edges."""
    A, labels = NX.adj_matrix(G, with_labels=True)
    W = A.astype('<f8')
    for orf, i in labels.items():
        for orf2, j in labels.items():
            if G.has_edge(orf, orf2):
                edge_weight = G.get_edge(orf, orf2)
                W[i,j] = edge_weight
                W[j,i] = edge_weight
    if with_labels == True:
        return W, labels
    else:
        return W

def assortative_index(G):
    """Outputs two vectors: the degree and the neighbour average degree.

    Used to measure assortative mixing. If the average degree is
    positively correlated with the degree, we know that hubs tend to
    connect to other hubs.

    input: G, graph (connected!)
    output: d, mn_d: degree, and average degree of neighbours
            (degree sorting from degree(with_labels=True))
    """
    d = G.degree(with_labels=True)
    out = []
    for node in G.nodes():
        nn = G.neighbors(node)
        if len(nn) > 0:
            nn_d = mean([float(d[i]) for i in nn])
            out.append((d[node], nn_d))
    return array(out).T

def struct_equivalence(G, n1, n2):
    """Returns the structural equivalence of a node pair. Two nodes
    are structurally equal if they share the same neighbours.

    x_s = [ne(n1) union ne(n2) - ne(n1) intersection ne(n2)] /
          [ne(n1) union ne(n2) + ne(n1) intersection ne(n2)]
    ref: Brun et al. 2003
    """
    s1 = set(G.neighbors(n1))
    s2 = set(G.neighbors(n2))
    num_union = len(s1.union(s2))
    num_intersection = len(s1.intersection(s2))
    if num_union + num_intersection == 0:
        xs = 0.0
    else:
        xs = float(num_union - num_intersection)/(num_union + num_intersection)
    return xs
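
# Sketch (old networkx API assumed): two leaves hanging off the same
# node share their single neighbour, so their structural distance is 0.
def _example_struct_equivalence():
    G = NX.Graph()
    G.add_edge('a', 'c')
    G.add_edge('b', 'c')
    print struct_equivalence(G, 'a', 'b') # 0.0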

def struct_equivalence_all(G):
    """Not finished."""
    A, labels = NX.adj_matrix(G, with_labels=True)
    pass


def hamming_distance(n1, n2):
    """Not finished."""
    pass


def graph_corrcoeff(G):
    """Not finished."""
    A, index = NX.adj_matrix(G, with_labels=True)
    #C = zeros(*A.shape(), 'd')
    n = 1.*G.number_of_nodes()
    for node in G.nodes():
        a_j = A[index[node],:]  # neighbours
        mean_a = sum(a_j)/n     # degree(G)/number_of_nodes()
        var_a = sqrt(sum((a_j - mean_a)**2)/n)
    pass

def graph_and_data_intersection(data, graph, pathways=None,
                                keep_connected=True):
    """Returns the intersection of keys in two dictionaries.

    NB: keep track of identifier sorting after these dict transforms.

    input:
    data -- dict, keys: gene id, value: measurement profile
    graph -- networkx.base.Graph, full graph
    pathways -- dict, keys: pathway name, values: nodes in pathway

    call:
    new_data, new_graph, pathways = graph_and_data_intersection(data,
                                        graph, pathways, keep_connected=True)
    """
    new_graph = graph.copy()
    new_data = {}
    new_pathways = {}
    graph_set = set(graph.nodes())
    data_set = set(data.keys())
    intersection = data_set & graph_set
    new_graph.delete_nodes_from(graph_set - data_set) # remove difference
    for k in intersection:
        new_data[k] = data[k]

    if keep_connected:
        max_iter = 0
        sub_graphs = NX.connected_component_subgraphs(new_graph)
        new_graph = sub_graphs[0] # largest connected component
        old_data = new_data
        while new_graph.number_of_nodes() != len(new_data) and max_iter < 100:
            max_iter += 1
            graph_set = set(new_graph.nodes())
            data_set = set(new_data.keys())
            intersection = data_set & graph_set
            new_graph.delete_nodes_from(graph_set - data_set)
            new_data = {}
            for k in intersection:
                new_data[k] = old_data[k]
            old_data = new_data.copy()
            new_graph = NX.connected_component_subgraphs(new_graph)[0]
    if pathways != None:
        for pth, nodes in pathways.items():
            new_pathways[pth] = [node for node in nodes if node in new_graph]
    print "\nSUMMARY (graph_and_data_intersection): "
    print "Number of input variables: %s\n\
    Number of nodes in input graph: %s" % (len(data), len(graph))
    print "\nUsing intersection of connected graph and nodes with data values"
    print "Number of variables is now: %s" % len(new_data)
    print "Number of nodes in graph: %s" % new_graph.number_of_nodes()
    if pathways != None:
        return new_data, new_graph, new_pathways
    else:
        return new_data, new_graph

def rx_graph_and_data_intersection(graph, node_data, pathways, data, keep_connected=False):
    """Returns a (connected) reaction graph with present gene expression data.

    keep_connected==True:
    When a node (gene) is not present in our expression data, the node
    is deleted and all neighbours are connected with edge weight=0.5
    if they are not already neighbours.

    input:
    data -- dict, keys: gene id, value: measurement profile
    graph -- networkx.xbase.XGraph, full weighted graph
    node_data -- dict, keys: rx id, value: set of gene_ids
    pathways -- dict, keys: pathway name, values: list of nodes in pathway
    """
    # We do not connect the full graph ... may be performed by using the
    # reference graph?
    graph = NX.connected_component_subgraphs(graph)[0] # largest connected component

    new_graph = graph.copy()
    new_data = {}
    new_node_data = node_data.copy()
    new_pathways = {}

    genes_in_graph = set()
    genes_in_data = set(data.keys())
    rx_in_graph = set(new_graph.nodes())

    # genes in graph nodes (rx_nodes)
    for rx in rx_in_graph:
        genes_in_graph.update(set(new_node_data.get(rx)))
    keep_genes = genes_in_data.intersection(genes_in_graph) # both in graph and data

    # update node data
    for rx, genes in node_data.items(): # delete node data of nodes not present in graph
        genes = set(genes)
        genes.intersection_update(keep_genes) # remove genes if they are not in the intersection
        if len(genes) == 0 or rx not in rx_in_graph: # no gene data or not in graph
            print "removing: " + str(rx)
            del new_node_data[rx]
    rx_in_data = set(new_node_data.keys())
    rx_intersection = rx_in_data.intersection(rx_in_graph)

    for gene in keep_genes:
        new_data[gene] = data.get(gene)

    # update pathway nodes
    for pth, genes in pathways.items():
        if genes:
            genes = set(genes)
            genes.intersection_update(keep_genes) # gene needs to have data
        else:
            pass
        new_pathways[pth] = genes
    bad_nodes = rx_in_graph.difference(rx_in_data) # in graph but no data

    if keep_connected == True:
        dummy = new_graph.copy()
        for rx in bad_nodes:
            dummy.delete_node(rx)
            if len(NX.connected_component_subgraphs(dummy)) > 1:
                nghbrs = new_graph.neighbors(rx)
                for i in nghbrs:
                    for j in nghbrs:
                        if i != j:
                            if not new_graph.has_edge(i, j):
                                new_graph.add_edge(i, j, 0.5)

    # update graph
    new_graph.delete_nodes_from(list(bad_nodes))

    return new_graph, new_node_data, new_pathways, new_data

def weighted_laplacian(G, with_labels=False):
    """Return standard Laplacian of graph from a weighted adjacency matrix."""
    n = G.order()
    I = scipy.eye(n)
    A = weighted_adj_matrix(G)
    D = I*scipy.sum(A, 0)
    L = D - A
    if with_labels:
        A, index = weighted_adj_matrix(G, with_labels=True)
        return L, index
    else:
        return L

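# --- Added example (not part of the original module) -------------------------
# A minimal sketch of the L = D - A construction above, on a hand-made
# weighted adjacency matrix so it does not depend on the weighted_adj_matrix()
# helper defined elsewhere in this module.
def _demo_weighted_laplacian():
    A = scipy.array([[0., 2., 0.],
                     [2., 0., 1.],
                     [0., 1., 0.]]) # weighted path graph: 0 - 1 - 2
    D = scipy.eye(3)*scipy.sum(A, 0) # diagonal strength (weighted degree)
    L = D - A
    assert scipy.allclose(scipy.sum(L, 1), 0) # Laplacian rows sum to zero
    return L
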
"""Below are methods for calculating graph metrics.

Four main decompositions:
 0.) Adjacency diffusion kernel expm(A)
 1.) von Neumann kernels (diagonalisation of adjacency matrix)
 2.) Laplacian kernels (geometric series of adj.)
 3.) Diffusion kernels (exponential series of adj.)

---- Kv
von_neumann: Kv = (I - alpha*A)^-1 (mod: A(I - alpha*A)^-1 ?), geom. series

---- Kl
laplacian: Kl = (I - alpha*L)^-1, geom. series

---- Kd
laplacian_diffusion: Kd = expm(-alpha*L), exp. series

---- Ke
exponential diffusion: Ke = expm(A) .... expm(-A)?
"""

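# --- Added numerical check (not part of the original module) -----------------
# The geometric-series kernels above rely on the identity
# (I - alpha*A)^-1 = I + alpha*A + alpha^2*A^2 + ..., which converges when
# alpha is below 1/lambda_max(A). A quick self-contained sketch:
def _check_geometric_series(alpha=0.1, n_terms=50):
    from scipy import array, eye, dot, allclose
    from scipy.linalg import inv
    A = array([[0., 1., 1.],
               [1., 0., 1.],
               [1., 1., 0.]]) # triangle graph, lambda_max = 2
    K = inv(eye(3) - alpha*A) # closed form
    S = eye(3)
    term = eye(3)
    for _ in range(n_terms):
        term = alpha*dot(term, A) # alpha^k A^k
        S = S + term
    assert allclose(K, S)
    return K
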
# TODO:
# check for numerically unstable eigenvalues and set them to zero,
# otherwise some inverses will explode -> ok ... using pinv for inverses
#
# This gives results that look numerically unstable
#
# -- divided adj by sum(A[:]), check this one (paper by Lebart scales with number of edges)
#
# the Neumann kernel is defined in Kandola to be K = A*(I-A)^-1
# lowest eigenvectors are same as the highest of K = A*A ?
# this needs clarification
#
# diffusion is still wrong! ... ok
# diff needs normalisation?! check the meaning of exp(-s) = exp(1/s) -L = 1/degree ... etc
# Is it the negative of exp. of adj. matrix in Kandola?
#
# normalised=False returns only nans (no idea why!!) ... fixed ok
#
# 31.1: diff is ok, exp(0)=1, not zero!
# 07.03.2005: normalisation is ok -> normalisation will emphasize high degree nodes
# 10.03.2005: symeig is unstable and returns nans for some eigenvectors? switching back to eig
# 14.05.2006: diffusion returns negative values, using expm(-LL) instead (FIX)
# 13.09.2006: update for use in numpy

def K_expAdj(W, normalised=False, alpha=1.0):
    """Matrix exponential of adjacency matrix, mentioned in Kandola as a general diffusion kernel.
    """
    W = asarray(W)
    t = W.dtype.char
    if len(W.shape)!=2:
        raise ValueError, "Non-matrix input to matrix function."
    m, n = W.shape
    if t in ['F','D']:
        raise TypeError, "Complex input!"
    if normalised==True:
        T = diag(sqrt(1./(sum(W,0))))
        W = dot(dot(T, W), T)
    e, vr = eig(W)
    s = real(e)**2 # from eigenvalues to singular values
    vri = inv(vr)
    s = maximum.reduce(s) + s
    cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
    cutoff = abs(cond*maximum.reduce(s))
    psigma = eye(m)
    for i in range(len(s)):
        if abs(s[i]) > cutoff:
            psigma[i,i] = .5*alpha*exp(s[i])

    return dot(dot(vr, psigma), vri)

def K_vonNeumann(W, normalised=False, alpha=1.0):
    """The geometric series of path lengths.

    Returns matrix square root of pseudo inverse of the adjacency matrix.
    """
    W = asarray(W)
    t = W.dtype.char
    if len(W.shape)!=2:
        raise ValueError, "Non-matrix input to matrix function."
    m, n = W.shape
    if t in ['F','D']:
        raise TypeError, "Complex input!"

    if normalised==True:
        T = diag(sqrt(1./(sum(W,0))))
        W = dot(dot(T, W), T)
    e, vr = eig(W)
    vri = inv(vr)
    e = real(e) # we only work with real pos. eigvals
    e = maximum.reduce(e) + e
    cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
    cutoff = cond*maximum.reduce(e)
    psigma = zeros((m,n), t)
    for i in range(len(e)):
        if e[i] > cutoff:
            psigma[i,i] = 1.0/e[i] # these are eig.vals (=sqrt(sing.vals))
    return dot(dot(vr, psigma), vri).astype(t)

def K_laplacian(W, normalised=True, alpha=1.0):
    """This is the matrix square root of the pseudo inverse of L.

    Also known as the average commute time matrix.
    """
    W = asarray(W)
    t = W.dtype.char
    if len(W.shape)!=2:
        raise ValueError, "Non-matrix input to matrix function."
    m, n = W.shape
    if t in ['F','D']:
        raise TypeError, "Complex input!"
    D = diag(sum(W,0))
    L = D - W
    if normalised==True:
        T = diag(sqrt(1./sum(W,0)))
        L = dot(dot(T, L), T)
    e, vr = eig(L)
    e = real(e)
    vri = inv(vr)
    cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
    cutoff = cond*maximum.reduce(e)
    psigma = zeros((m,), t) # if s close to zero -> set 1/s = 0
    for i in range(len(e)):
        if e[i] > cutoff:
            psigma[i] = 1.0/e[i]
    K = dot(dot(vr, diag(psigma)), vri).astype(t)
    K = real(K)
    I = eye(n)
    K = (1 - alpha)*I + alpha*K
    return K

def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5):
    """Returns diffusion kernel.

    input:
        -- W, adj. matrix
        -- normalised [True/False]
        -- alpha, [0,1] (degree of network influence)
        -- beta, [0->) (diffusion degree)
    """
    W = asarray(W)
    t = W.dtype.char
    if len(W.shape)!=2:
        raise ValueError, "Non-matrix input to matrix function."
    m, n = W.shape
    if t in ['F','D']:
        raise TypeError, "Complex input!"
    D = diag(sum(W,0))
    L = D - W
    if normalised==True:
        T = diag(sqrt(1./(sum(W,0))))
        L = dot(dot(T, L), T)
    e, vr = eig(L)
    vri = inv(vr) # inv
    cond = 1.0*{0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
    cutoff = 1.*abs(cond*maximum.reduce(e))
    psigma = eye(m) # if sing. vals are 0: exp(0)=1 (unnecessary)
    #psigma = zeros((m,n), dtype='<f8')
    for i in range(len(e)):
        if abs(e[i]) > cutoff:
            psigma[i,i] = exp(-beta*e[i])
    K = real(dot(dot(vr, psigma), vri))
    I = eye(n, dtype='<f8')
    K = (1. - alpha)*I + alpha*K
    return K

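# --- Added sanity check (not part of the original module) --------------------
# With normalised=False and alpha=1.0 the kernel above reduces to an
# eigendecomposition of expm(-beta*L), so it can be compared against
# scipy.linalg.expm directly (small eigenvalue-cutoff effects aside):
def _check_diffusion(beta=0.5):
    from scipy import array, diag, sum, allclose
    from scipy.linalg import expm
    W = array([[0., 1., 0.],
               [1., 0., 1.],
               [0., 1., 0.]])
    L = diag(sum(W, 0)) - W
    K1 = K_diffusion(W, normalised=False, alpha=1.0, beta=beta)
    K2 = expm(-beta*L)
    return allclose(K1, K2, atol=1e-6)
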
def K_modularity(W, alpha=1.0):
    """Returns the matrix square root of Newman's modularity."""
    W = asarray(W)
    t = W.dtype.char
    m, n = W.shape
    d = sum(W, 0)
    m = 1.*sum(d) # total weight (2x number of edges); shadows the row count
    B = W - (outer(d, d)/m)
    s, v = sorted_eig(B, sort_by='lm')
    psigma = zeros((n, n), dtype='<f8')
    for i in range(len(s)):
        if s[i]>1e-7:
            psigma[i,i] = sqrt(s[i])
            #psigma[i,i] = s[i]
    K = dot(dot(v, psigma), v.T)
    I = eye(n)
    K = (1 - alpha)*I + alpha*K
    return K

def kernel_score(K, W):
    """Returns the modularity score.

    K -- (modularity) kernel
    W -- adjacency matrix (possibly weighted)
    """
    # normalize W (: W'W=I)
    m, n = shape(W)
    for i in range(n):
        W[:,i] = W[:,i]/norm(W[:,i])
    score = diag(dot(W.T, dot(K, W))) # diag(W'KW), using the normalisation above
    tot = sum(score)
    return score, tot

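# --- Added usage sketch (not part of the original module) --------------------
# How K_modularity and kernel_score above fit together on a toy graph with
# two loose communities. W is passed as a copy because kernel_score
# normalises its argument in place.
def _demo_modularity_score():
    from scipy import array
    W = array([[0., 1., 1., 0.],
               [1., 0., 1., 0.],
               [1., 1., 0., 1.],
               [0., 0., 1., 0.]])
    K = K_modularity(W, alpha=1.0)
    score, tot = kernel_score(K, W.copy())
    return score, tot
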
@ -0,0 +1,187 @@
"""Matrix cross validation selection generators
"""
from scipy import take, arange, ceil, repeat, newaxis, mean, asarray, dot,\
     ones, random, array_split, floor, vstack, minimum
from cx_utils import randperm

def w_pls_gen(aat, b, n_blocks=None, center=True, index_out=False):
    """Random block crossvalidation for wide (XX.T) trick in PLS.
    Leave-one-out is a subset, with n_blocks equal to nSamples.

    aat -- outer product of X (X*X.T)
    b -- Y
    n_blocks -- number of crossvalidation blocks
    center -- use centering of calibration sets, (aat_in, b_in) are centered

    Returns:
    -- aat_in, aat_out, b_in, b_out, [out]
    """
    m, n = aat.shape
    index = randperm(m)
    nValuesInBlock = m/n_blocks
    if n_blocks==m:
        index = arange(m)
    out_ind = [index[i*nValuesInBlock:(i+1)*nValuesInBlock] for i in range(n_blocks)]

    for out in out_ind:
        inn = [i for i in index if i not in out]
        aat_in = aat[inn,:][:,inn]
        aat_out = aat[out,:][:,inn]
        b_in = b[inn,:]
        b_out = b[out,:]
        if center:
            # centering projector: I - (1/n)11'
            # nin = len(inn)
            # Pc = eye(nin) - outer(ones((nin,)),ones((nin,)))/nin
            # xxt - x( outer(ones((nin,)),ones((nin,)))/nin ) x.T

            # de Jong centering trick:
            h = sum(aat_in, 0)[:,newaxis]
            h = (h - mean(h)/2)/len(inn)
            mn_a = h + h.T
            aat_in = aat_in - mn_a
        if index_out:
            yield aat_in, aat_out, b_in, b_out, out
        else:
            yield aat_in, aat_out, b_in, b_out

def pls_gen(a, b, n_blocks=None, center=False, index_out=False, axis=0):
    """Random block crossvalidation.
    Leave-one-out is a subset, with n_blocks equal to a.shape[-1].
    """
    index = randperm(a.shape[axis])
    if n_blocks==None:
        n_blocks = a.shape[axis]
    n_in_set = int(ceil(float(a.shape[axis])/n_blocks)) # cast for use as slice bound
    out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_blocks)]
    for out in out_ind_sets:
        inn = [i for i in index if i not in out]
        if center:
            a = a - mean(a, 0)[newaxis]
            b = b - mean(b, 0)[newaxis]
        if index_out:
            yield a.take(inn, 0), a.take(out, 0), b.take(inn, 0), b.take(out, 0), out
        else:
            yield a.take(inn, 0), a.take(out, 0), b.take(inn, 0), b.take(out, 0)

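# --- Added usage sketch (not part of the original module) --------------------
# One pass over the generator above: 3-block random crossvalidation of a
# small random X/Y pair, printing the calibration/validation shapes.
def _demo_pls_gen():
    X = random.rand(9, 4)
    Y = random.rand(9, 2)
    for Xin, Xout, Yin, Yout in pls_gen(X, Y, n_blocks=3):
        print Xin.shape, Xout.shape, Yin.shape, Yout.shape
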
def pca_gen(a, n_sets=None, center=False, index_out=False, axis=0):
    """PCA random block crossvalidation generator.
    """
    m = a.shape[axis]
    index = randperm(m)
    if n_sets==None:
        n_sets = m
    n_in_set = int(ceil(float(m)/n_sets)) # cast for use as slice bound
    out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_sets)]
    for out in out_ind_sets:
        inn = [i for i in index if i not in out]
        if center:
            a = a - mean(a, 0)[newaxis]
        if index_out:
            yield a.take(inn, 0), a.take(out, 0), out
        else:
            yield a.take(inn, 0), a.take(out, 0)

def w_pls_gen_jk(a, b, n_sets=None, center=True, index_out=False, axis=0):
    """Random block crossvalidation for wide X (m>>n).
    Leave-one-out is a subset, with n_sets equal to a.shape[-1].

    Returns : X_m and X_m'Y_m
    """
    m = a.shape[axis]
    ab = dot(a.T, b)
    index = randperm(m)
    if n_sets==None:
        n_sets = m
    n_in_set = int(ceil(float(m)/n_sets))
    out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_sets)]
    for out in out_ind_sets:
        inn = [i for i in index if i not in out]
        nin = len(inn)
        nout = len(out)
        a_in = a[inn,:]
        mn_a = 0
        mAB = 0

        if center:
            mn_a = mean(a, 0)[newaxis]
            mAin = dot(-ones((1,nout)), a[out,:])/nin
            mBin = dot(-ones((1,nout)), b[out,:])/nin
            mAB = dot(mAin.T, (mBin*nin))
        ab_in = ab - dot(a[out,:].T, b[out,:]) - mAB
        a_in = a_in - mn_a

        if index_out:
            yield a_in, ab_in, out
        else:
            yield a_in, ab_in

def shuffle_1d_block(a, n_sets=None, blocks=None, index_out=False, axis=0):
    """Random block shuffling along 1d axis.

    Returns : shuffled a along axis
    """
    m = a.shape[axis]
    if blocks==None:
        blocks = m
    for ii in xrange(n_sets):
        index = randperm(m)
        if blocks==m:
            a_out = a.take(index, axis)
        else:
            index = arange(m)
            # shuffle in place within each block
            dummy = map(random.shuffle, array_split(index, blocks))
            a_out = a.take(index, axis)
        if index_out:
            yield a_out, index
        else:
            yield a_out

def shuffle_1d(a, n_sets, axis=0):
    """Random shuffling along 1d axis.

    Returns : shuffled a along axis
    """
    m = a.shape[axis]
    for ii in xrange(n_sets):
        index = randperm(m)
        yield a.take(index, axis)

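# --- Added usage sketch (not part of the original module) --------------------
# shuffle_1d is the building block of a simple permutation test: shuffle the
# rows of one matrix while keeping the other fixed, collecting a null
# distribution of some statistic (here an arbitrary cross-product sum).
def _demo_permutation_null(a, b, n_sets=100):
    null = []
    for a_perm in shuffle_1d(a, n_sets):
        null.append((a_perm*b).sum()) # statistic under the null
    return null
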
def diag_pert(a, n_sets=10, center=True, index_out=False):
    """Alter generator returning sets perturbed with means at diagonals.

    input:
        a -- matrix, data
        n_sets -- scalar, number of perturbed sets (controls the portion of data perturbed)
    """
    m, n = a.shape
    tr = False
    if m>n:
        a = a.T
        m, n = a.shape
        tr = True
    if n_sets>m or n_sets>n:
        msg = "You may not use more subsets than max(n_rows, n_cols)"
        raise ValueError, msg
    nm = n*m
    start_inds = array_split(randperm(m), n_sets) # we use random start diags
    if center:
        a = a - mean(a, 0)[newaxis]
    for v in range(n_sets):
        a_out = a.copy()
        out = []
        for start in start_inds[v]:
            ind = arange(start+v, nm, n+1)
            [out.append(i) for i in ind]
            if center:
                a_out.put(a.mean(), ind)
            else:
                a_out.put(0, ind)
        if tr:
            a_out = a_out.T

        if index_out:
            yield a_out, asarray(out)
        else:
            yield a_out

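# --- Added usage sketch (not part of the original module) --------------------
# Which elements diag_pert touches: each set replaces a few wrap-around
# diagonals with the grand mean (center=True), and index_out returns their
# flat indices. Note that a_out.put above follows the old numpy
# put(values, indices) argument order this module was written against.
def _demo_diag_pert():
    a = arange(12.).reshape(3, 4)
    for a_out, ind in diag_pert(a, n_sets=2, center=True, index_out=True):
        print ind # flat indices of the perturbed elements
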
@ -0,0 +1,145 @@
from scipy import ones, mean, sqrt, dot, newaxis, zeros, sum, empty,\
     apply_along_axis, eye, kron
from scipy.linalg import triu, inv, svd, norm

from select_generators import w_pls_gen, w_pls_gen_jk, pls_gen, pca_gen, diag_pert
from engines import w_simpls, pls, bridge, pca
from pylab import *

def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'):
    """RMSEP calculation for pls with wide X.
    """
    k, l = Y.shape
    PRESS = zeros((l, amax+1), dtype='f')
    # X,Y are centered
    if n_blocks==None:
        n_blocks = Y.shape[0]
    V = w_pls_gen(dot(X, X.T), Y, n_blocks=n_blocks, center=True)
    for Din, Doi, Yin, Yout in V:
        ym = -sum(Yout, 0)[newaxis]/(1.0*Yin.shape[0])
        Yin = Yin - ym
        PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)
        if algo=='simpls':
            dat = w_simpls(Din, Yin, amax)
            Q, U, H = dat['Q'], dat['U'], dat['H']
            That = dot(Doi, dot(U, inv(triu(dot(H.T, U)))))
        else:
            # other algorithm support coming soon
            raise NotImplementedError
        #Yhat = empty((amax, k, l), dtype='<f8')
        Yhat = []
        for j in range(l):
            TQ = dot(That, triu(dot(Q[j,:][:,newaxis], ones((1,amax)))))
            E = Yout[:,j][:,newaxis] - TQ
            E = E + sum(E, 0)/Din.shape[0]
            PRESS[j,1:] = PRESS[j,1:] + sum(E**2, 0)
    #Yhat = Y - dot(That, Q.T)
    return sqrt(PRESS/Y.shape[0])

def pls_val(X, Y, amax=2, n_blocks=10, algo='pls'):
    """Validation results of pls model.
    """
    k, l = Y.shape
    PRESS = zeros((l, amax+1), dtype='<f8')
    EE = zeros((amax, k, l), dtype='<f8')
    Yhat = zeros((amax, k, l), dtype='<f8')
    # X,Y are centered
    V = pls_gen(X, Y, n_blocks=n_blocks, center=True, index_out=True)
    for Xin, Xout, Yin, Yout, out in V:
        ym = -sum(Yout, 0)[newaxis]/Yin.shape[0]
        Yin = (Yin - ym)
        PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)
        if algo=='pls':
            dat = pls(Xin, Yin, amax, mode='normal')
        elif algo=='bridge':
            dat = bridge(Xin, Yin, amax, mode='normal') # bridge is the imported engine

        for a in range(amax):
            Ba = dat['B'][a,:,:]
            Yhat[a,out,:] = dot(Xout, Ba)
            E = Yout - dot(Xout, Ba)
            EE[a,out,:] = E
            PRESS[:,a+1] = PRESS[:,a+1] + sum(E**2, 0)

    return sqrt(PRESS/(k-1.)), EE, Yhat

def pca_alter_val(a, amax, n_sets=10, method='diag'):
    """Pca validation by altering elements in X.
    """
    # todo: it is just as easy to do jk-estimates here as well
    V = diag_pert(a, n_sets, center=True, index_out=True)
    sep = empty((n_sets, amax), dtype='f')
    for i, (xi, ind) in enumerate(V):
        dat_i = pca(xi, amax, mode='detailed')
        Ti, Pi = dat_i['T'], dat_i['P']
        for j in xrange(amax):
            Xhat = dot(Ti[:,:j+1], Pi[:,:j+1].T)
            a_sub = a.ravel().take(ind)
            EE = a_sub - Xhat.ravel().take(ind)
            tot = (a_sub**2).sum()
            sep[i,j] = (EE**2).sum()/tot
    return sqrt(sep.mean(0))
    #return sep

def pca_cv_val(X, amax, n_sets):
    """Cross validation of pca using random sets crossvalidation.
    """
    m, n = X.shape
    xtot = (X**2).sum()
    V = pca_gen(X, n_sets=n_sets, center=True, index_out=True) # use the n_sets argument
    E = empty((amax, m, n), dtype='f')
    for xi, xout, ind in V:
        dat_i = pca(xi, amax, mode='detailed')
        Pi = dat_i['P']
        for a in xrange(amax):
            Pia = Pi[:,:a+1]
            E[a][ind,:] = (X[ind,:] - dot(xout, dot(Pia, Pia.T)))**2

    sep = []
    for a in xrange(amax):
        sep.append(E[a].sum()/xtot)
    return sqrt(asarray(sep)) # one value per number of components

def pls_jkW(a, b, amax, n_blocks=None, algo='pls', use_pack=True):
    """Returns CV-segments of parameter W for wide X.

    todo: add support for T, Q and B
    """
    if n_blocks == None:
        n_blocks = b.shape[0]

    WW = empty((n_blocks, a.shape[1], amax), dtype='f')

    if use_pack:
        u, s, inflater = svd(a, full_matrices=0)
        a = u*s
    V = pls_gen(a, b, n_blocks=n_blocks)
    for nn, (a_in, a_out, b_in, b_out) in enumerate(V):
        if algo=='pls':
            dat = pls(a_in, b_in, amax, 'loads', 'fast')
        elif algo=='bridge':
            dat = bridge(a_in, b_in, amax, 'loads', 'fast')
        W = dat['W']
        if use_pack:
            W = dot(inflater.T, W)
        WW[nn,:,:] = W

    return WW

def pca_jkP(a, aopt, n_blocks=None):
    """Returns CV-segments of parameter P.

    todo: add support for T
    fixme: more efficient to add this in validation loop
    """
    if n_blocks == None:
        n_blocks = a.shape[0]

    PP = empty((n_blocks, a.shape[1], aopt), dtype='f')
    V = pca_gen(a, n_sets=n_blocks, center=True)
    for nn, (a_in, a_out) in enumerate(V):
        dat = pca(a_in, aopt, mode='fast')
        P = dat['P']
        PP[nn,:,:] = P

    return PP

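# --- Added usage sketch (not part of the original module) --------------------
# The PP array above stacks one loading matrix per crossvalidation segment;
# its spread across segments is a simple jackknife uncertainty for each
# loading (the hotelling() helper in cx_stats builds confidence ellipses
# from the same kind of segments).
def _demo_loading_uncertainty(a, aopt=2):
    PP = pca_jkP(a, aopt, n_blocks=10)
    return PP.std(0) # (n_vars, aopt) spread of loadings over segments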