First import of chemometrics utils

parent fac9346aad
commit 3ef5522dd0

@@ -0,0 +1,432 @@
"""This module contains bilinear models (Functions)."""

import sys

# add library
sys.path.append('/home/flatberg/fluents/fluents/lib')

import time

from fluents.workflow import Function
from fluents.dataset import Dataset
from fluents import plots, dataset, workflow, logger
import scipy
from engines import *
from cx_stats import leverage, variances, hotelling, pls_qvals
from cx_utils import mat_center
from validation import *
import blmplots
import engines

class Model(Function):
    """Base class of bilinear models."""
    def __init__(self, id='johndoe', name='JohnDoe'):
        Function.__init__(self, id, name)
        self.name = name
        self._options = None
        self._data = {}
        self._dataset = {}
        self._packers = {}
        self.model = {}

    def clear(self):
        """Clears model parameters."""
        self.model = {}
        self._data = {}
        self._packers = {}

class PCA(Model):
    def __init__(self, id='pca', name='PCA'):
        Model.__init__(self, id, name)
        self._options = PcaOptions()

    def pre_validation(self, amax, n_sets, val_engine):
        """Model calculations for maximum number of components."""
        rmsep = val_engine(self.model['E0'], amax, n_sets)
        self.model['rmsep'] = rmsep
        self.model['aopt'] = rmsep.argmin()

    def confidence(self, aopt, n_sets, alpha, p_center,
                   crot, strict, cov_center):
        """Returns a confidence measure for model parameters.

        Based on aopt.
        """
        aopt = self.model['aopt']
        jk_segments = pca_jkP(self.model['E0'], aopt, n_sets)
        Pcal = self.model['P'][:,:aopt]
        tsq = hotelling(jk_segments, Pcal, p_center=p_center,
                        cov_center=cov_center, alpha=alpha,
                        crot=crot, strict=strict)
        self.model['p_tsq'] = tsq

    def make_model(self, amax, mode, scale):
        """Model on optimal number of components."""
        dat = pca(self.model['E0'], amax, scale, mode)

        # explained variance
        var_x, exp_var_x = variances(self.model['E0'], dat['T'], dat['P'])
        dat['var_x'] = var_x
        dat['exp_var_x'] = exp_var_x

        # fixme: leverage calculations disabled for now
        do_lev_s = False
        do_lev_v = False
        if do_lev_s:
            # sample leverages
            tnorm = scipy.apply_along_axis(norm, 0, dat['T']) # norm of T-columns
            s_lev = leverage(amax, tnorm)
            dat['s_lev'] = s_lev
        if do_lev_v:
            # variable leverages
            v_lev = leverage(amax, dat['P'])
            dat['v_lev'] = v_lev

        self.model.update(dat)

    def as_dataset(self, param, dtype='dataset'):
        """Return model parameter as Dataset."""
        if param not in self.model.keys():
            return
        DX = self._dataset['X'] # input dataset
        dim_name_0, dim_name_1 = DX.get_dim_name()
        # samples
        ids_0 = [dim_name_0, DX.get_identifiers(dim_name_0, sorted=True)]
        # vars
        ids_1 = [dim_name_1, DX.get_identifiers(dim_name_1, sorted=True)]
        # components (hidden)
        pc_ids = ['_comp_a', map(str, range(self.model['aopt']))]
        pc_ids_opt = ['_comp_o', map(str, range(self.model['aopt']))]
        zero_dim = ['_doe', ['0']] # null dim, vector (hidden)
        match_ids = {'E':[ids_0, ids_1],
                     'E0':[ids_0, ids_1],
                     'P':[ids_1, pc_ids],
                     'T':[ids_0, pc_ids],
                     'W':[ids_1, pc_ids],
                     'p_tsq':[ids_1, zero_dim],
                     'rmsep':[pc_ids, zero_dim],
                     'var_leverages':[ids_1, zero_dim],
                     'sample_leverages':[pc_ids, zero_dim],
                     'exp_var_x':[pc_ids, zero_dim],
                     'var_x':[pc_ids, zero_dim],
                     }

        out = Dataset(self.model[param], match_ids[param], name=param)
        return out

    def get_out_plots(self, options):
        out = []
        for plt in options['out_plots']:
            #try:
            out.append(plt(self))
            #except:
            #    logger.log('debug', 'Plot: %s failed' % plt)
        return out

    def run(self, data):
        """Run pca with present options."""
        self.clear()
        options = self._options
        self._dataset['X'] = data
        self._data['X'] = data.asarray().astype('<f8')
        if options['center']:
            center = options['center_mth']
            self.model['E0'] = center(self._data['X'])
        else:
            self.model['E0'] = data.asarray()

        self.pre_validation(**options.pre_validation_options())
        self.make_model(**options.make_model_options())
        if options['calc_conf']:
            self.confidence(**options.confidence_options())

        out = [self.as_dataset(p) for p in options['out_data']]
        for plt in self.get_out_plots(options):
            out.append(plt)
        return out

class PLS(Model):
    def __init__(self, id='pls', name='PLS'):
        Model.__init__(self, id, name)
        self._options = PlsOptions()

    def pre_validation(self, amax, n_sets, val_engine):
        """Returns rmsec, rmsep for model."""
        rmsep = val_engine(self.model['E0'], self.model['F0'],
                           amax, n_sets)
        self.model['rmsep'] = rmsep.mean(0)
        self.model['aopt'] = rmsep.mean(0).argmin()

    def confidence(self, aopt, n_sets, alpha, p_center,
                   crot, strict, cov_center):
        """Returns a confidence measure for model parameters.

        Supported parameters: W
        """
        aopt = self.model['aopt']
        jk_segments = pls_jkW(self.model['E0'], self.model['F0'],
                              aopt, n_sets)
        Wcal = self.model['W'][:,:aopt]
        tsq = hotelling(jk_segments, Wcal, p_center=p_center,
                        cov_center=cov_center, alpha=alpha,
                        crot=crot, strict=strict)
        self.model['w_tsq'] = tsq

    def permutation_confidence(self, a, b, aopt, reg, n_iter, algo,
                               sim_method):
        """Estimates significant variables by controlling the fdr."""
        qvals, false_pos, TSQ, tsq_full = pls_qvals(a, b, aopt=aopt,
                                                    alpha=reg, n_iter=n_iter,
                                                    algo=algo,
                                                    sim_method=sim_method)
        self.model['qval'] = qvals

    def make_model(self, a, b, amax, scale, mode, engine):
        """Make model on amax components."""
        dat = engine(a, b, amax, scale, mode)
        self.model.update(dat)

    def as_dataset(self, name, dtype='Dataset'):
        """Return any model parameter as Dataset.

        No ids matching.
        """
        if name not in self.model.keys():
            return
        DX, DY = self._dataset['X'], self._dataset['Y']
        dim_name_0, dim_name_1 = DX.get_dim_name()
        dim_name_2, dim_name_3 = DY.get_dim_name()
        # samples
        ids_0 = [dim_name_0, DX.get_identifiers(dim_name_0, sorted=True)]
        # x vars
        ids_1 = [dim_name_1, DX.get_identifiers(dim_name_1, sorted=True)]
        # y vars
        ids_3 = [dim_name_3, DY.get_identifiers(dim_name_3, sorted=True)]
        # components (hidden)
        pc_ids = ['_comp', map(str, range(self.model['aopt']))]
        zero_dim = ['_doe', ['0']] # null dim, vector (hidden)

        match_ids = {'E':[ids_0, ids_1],
                     'P':[ids_1, pc_ids],
                     'T':[ids_0, pc_ids],
                     'W':[ids_1, pc_ids],
                     'R':[ids_1, pc_ids],
                     'Q':[ids_3, pc_ids],
                     'F':[ids_0, ids_3],
                     'B':[ids_1, ids_3],
                     'qval':[ids_1, zero_dim],
                     'qval_sorted':[ids_1, zero_dim],
                     'w_tsq':[ids_1, zero_dim],
                     'rmsep':[pc_ids, zero_dim],
                     }

        array = self.model[name]
        M = Dataset(array, identifiers=match_ids[name], name=name)
        return M

    def get_out_plots(self, options):
        out = []
        for plt in options['out_plots']:
            #try:
            out.append(plt(self))
            #except:
            #    logger.log('debug', 'Plot: %s failed' % plt)
        return out

    def run(self, a, b):
        options = self._options
        self._dataset['X'] = a
        self._dataset['Y'] = b
        self._data['X'] = a.asarray()
        self._data['Y'] = b.asarray()
        if options['center']:
            self.model['E0'] = options['center_mth'](self._data['X'])
            self.model['F0'] = options['center_mth'](self._data['Y'])
        else:
            self.model['E0'] = self._data['X']
            self.model['F0'] = self._data['Y']

        self.pre_validation(**options.pre_validation_options())
        self.make_model(self.model['E0'], self.model['F0'],
                        **options.make_model_options())
        # variance captured
        var_x, exp_var_x = variances(self.model['E0'], self.model['T'], self.model['P'])
        self.model['var_x'] = var_x
        self.model['exp_var_x'] = exp_var_x

        var_y, exp_var_y = variances(self.model['F0'], self.model['T'], self.model['Q'])
        self.model['var_y'] = var_y
        self.model['exp_var_y'] = exp_var_y

        if options['calc_conf']:
            self.confidence(**options.confidence_options())

        out = [self.as_dataset(p) for p in options['out_data']]
        for plt in self.get_out_plots(options):
            out.append(plt)
        return out


class Packer:
    """A compression object used to speed up model calculations.

    Often used in conjunction with crossvalidation and perturbation
    analysis.
    """
    def __init__(self, array):
        self._shape = array.shape
        self._array = array
        self._packed_data = None
        self._inflater = None

    def expand(self, a):
        if self._inflater is not None:
            return dot(self._inflater, a)

    def collapse(self, axis=None, mode='svd'):
        if axis is None:
            axis = scipy.argmin(self._array.shape) # default is the smallest dim

        if axis == 1:
            self._array = self._array.T
        u, s, vt = svd(self._array, full_matrices=0)
        self._inflater = vt.T
        self._packed_data = u*s
        return self._packed_data

    def get_packed_data(self):
        return self._packed_data
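
# A minimal usage sketch of Packer (hypothetical data, not part of the
# original API): collapse a wide matrix onto its row-space scores before
# heavy resampling, then expand back to the original variable space.
def _example_packer():
    from scipy import rand
    X = rand(10, 100)                # 10 samples, 100 variables
    packer = Packer(X.copy())
    Xc = packer.collapse()           # (10 x 10) packed scores, spans the row space
    Xhat = packer.expand(Xc.T).T     # reconstructs X (up to fp error)
    print abs(X - Xhat).max()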

class Options(dict):
    """Options base class."""
    def __init__(self, *args, **kw):
        dict.__init__(self, *args, **kw)

    def _copy_from_list(self, key_list):
        d = {}
        for key in key_list:
            d[key] = self.get(key, None)
        return d


class PcaOptions(Options):
    """Options for Principal Component Analysis."""
    def __init__(self):
        Options.__init__(self)
        self._set_default()

    def _set_default(self):
        opt = {}
        opt['algo'] = 'pca'
        opt['engine'] = engines.pca
        opt['mode'] = 'normal' # how much info to calculate
        opt['lod'] = 'compact' # how much info to store
        opt['amax'] = 5
        opt['aopt'] = 5
        opt['center'] = True
        opt['center_mth'] = mat_center
        opt['scale'] = 'scores'
        opt['calc_conf'] = True
        opt['n_sets'] = 5

        opt['strict'] = True
        opt['p_center'] = 'med'
        opt['alpha'] = .8
        opt['cov_center'] = 'med'
        opt['crot'] = True

        opt['val_engine'] = pca_alter_val
        opt['val_n_sets'] = 10

        opt['all_data'] = ['T', 'P', 'E', 'p_tsq', 'rmsep']
        opt['all_plots'] = ['PcaScorePlot', 'PcaLoadingPlot',
                            'PcaRmsepPlot']

        opt['out_data'] = ['T', 'P', 'p_tsq']
        opt['out_plots'] = [blmplots.PcaScorePlot,
                            blmplots.PcaLoadingPlot,
                            blmplots.LineViewXc]

        self.update(opt)

    def make_model_options(self):
        """Options for make_model method."""
        opt_list = ['scale', 'mode', 'amax']
        return self._copy_from_list(opt_list)

    def confidence_options(self):
        """Options for confidence method."""
        opt_list = ['n_sets', 'aopt', 'alpha', 'p_center',
                    'strict', 'crot', 'cov_center']
        return self._copy_from_list(opt_list)

    def pre_validation_options(self):
        """Options for pre_validation method."""
        opt_list = ['amax', 'n_sets', 'val_engine']
        return self._copy_from_list(opt_list)


class PlsOptions(Options):
    """Options for Partial Least Squares Regression."""
    def __init__(self):
        Options.__init__(self)
        self._set_default()

    def _set_default(self):
        opt = {}
        opt['algo'] = 'pls'
        opt['engine'] = engines.pls
        opt['mode'] = 'normal' # how much info to calculate
        opt['lod'] = 'compact' # how much info to store
        opt['amax'] = 3
        opt['aopt'] = 3
        opt['center'] = True
        opt['center_mth'] = mat_center
        opt['scale'] = 'scores'
        opt['calc_conf'] = True
        opt['n_sets'] = 10

        opt['strict'] = True
        opt['p_center'] = 'med'
        opt['alpha'] = .2
        opt['cov_center'] = 'med'
        opt['crot'] = True

        opt['val_engine'] = w_pls_cv_val

        opt['all_data'] = ['T', 'P', 'E', 'p_tsq', 'rmsep']
        opt['all_plots'] = ['PcaScorePlot', 'PcaLoadingPlot',
                            'PcaRmsepPlot']

        opt['out_data'] = []
        opt['out_plots'] = [blmplots.PlsScorePlot,
                            blmplots.PlsLoadingPlot,
                            blmplots.LineViewXc]
                            #blmplots.PlsQvalScatter]

        opt['pack'] = False
        opt['calc_qvals'] = False
        opt['q_pert_mth'] = 'shuffle_vars'
        opt['q_iter'] = 20
        self.update(opt)

    def make_model_options(self):
        """Options for make_model method."""
        opt_list = ['scale', 'mode', 'amax', 'engine']
        return self._copy_from_list(opt_list)

    def confidence_options(self):
        """Options for confidence method."""
        opt_list = ['n_sets', 'aopt', 'alpha', 'p_center',
                    'strict', 'crot', 'cov_center']
        return self._copy_from_list(opt_list)

    def pre_validation_options(self):
        """Options for pre_validation method."""
        opt_list = ['amax', 'n_sets', 'val_engine']
        return self._copy_from_list(opt_list)

@@ -0,0 +1,158 @@

"""Specialised plots for functions defined in blmfuncs.py.

fixme:
-- I'm normalising all color mapping input vectors to [0,1]. This will
destroy informative numerical values in the colorbar (but we are not
showing these anyway). A better fix would be to let the colorbar listen
to the scalarmappable instance and correct itself, but I did not get
that to work ...

fixme2:
-- If a scatterplot is not inited with a colorvector there will be no
colorbar, but when adding colors the colorbar should be created.
"""
from fluents import plots
from scipy import dot, sum, diag, arange, log, mean, newaxis
from matplotlib import cm

class PcaScorePlot(plots.ScatterPlot):
    """PCA Score plot"""
    def __init__(self, model, absi=0, ordi=1):
        self._T = model.model['T']
        dataset_1 = model.as_dataset('T')
        dataset_2 = dataset_1
        id_dim = dataset_1.get_dim_name(0)
        sel_dim = dataset_1.get_dim_name(1)
        id_1, = dataset_1.get_identifiers(sel_dim, [absi])
        id_2, = dataset_1.get_identifiers(sel_dim, [ordi])
        plots.ScatterPlot.__init__(self, dataset_1, dataset_2,
                                   id_dim, sel_dim, id_1, id_2,
                                   c='b', s=40, name='pca-scores')

    def set_absicca(self, n):
        self.xaxis_data = self._T[:,n]

    def set_ordinate(self, n):
        self.yaxis_data = self._T[:,n]


class PcaLoadingPlot(plots.ScatterPlot):
    """PCA Loading plot"""
    def __init__(self, model, absi=0, ordi=1):
        self._P = model.model['P']
        dataset_1 = model.as_dataset('P')
        dataset_2 = dataset_1
        id_dim = dataset_1.get_dim_name(0)
        sel_dim = dataset_1.get_dim_name(1)
        id_1, = dataset_1.get_identifiers(sel_dim, [absi])
        id_2, = dataset_1.get_identifiers(sel_dim, [ordi])
        if model.model.has_key('p_tsq'):
            col = model.model['p_tsq'].ravel()
            col = normalise(col)
        else:
            col = 'g'
        plots.ScatterPlot.__init__(self, dataset_1, dataset_2,
                                   id_dim, sel_dim, id_1, id_2,
                                   c=col, s=20, name='pca-loadings')

    def set_absicca(self, n):
        self.xaxis_data = self._P[:,n]

    def set_ordinate(self, n):
        self.yaxis_data = self._P[:,n]


class PlsScorePlot(plots.ScatterPlot):
    """PLS Score plot"""
    def __init__(self, model, absi=0, ordi=1):
        self._T = model.model['T']
        dataset_1 = model.as_dataset('T')
        dataset_2 = dataset_1
        id_dim = dataset_1.get_dim_name(0)
        sel_dim = dataset_1.get_dim_name(1)
        id_1, = dataset_1.get_identifiers(sel_dim, [absi])
        id_2, = dataset_1.get_identifiers(sel_dim, [ordi])

        plots.ScatterPlot.__init__(self, dataset_1, dataset_2,
                                   id_dim, sel_dim, id_1, id_2,
                                   c='b', s=40, name='pls-scores')

    def set_absicca(self, n):
        self.xaxis_data = self._T[:,n]

    def set_ordinate(self, n):
        self.yaxis_data = self._T[:,n]


class PlsLoadingPlot(plots.ScatterPlot):
    """PLS Loading plot"""
    def __init__(self, model, absi=0, ordi=1):
        self._P = model.model['P']
        dataset_1 = model.as_dataset('P')
        dataset_2 = dataset_1
        id_dim = dataset_1.get_dim_name(0)
        sel_dim = dataset_1.get_dim_name(1)
        id_1, = dataset_1.get_identifiers(sel_dim, [absi])
        id_2, = dataset_1.get_identifiers(sel_dim, [ordi])
        if model.model.has_key('w_tsq'):
            col = model.model['w_tsq'].ravel()
            col = normalise(col)
        else:
            col = 'g'
        plots.ScatterPlot.__init__(self, dataset_1, dataset_2,
                                   id_dim, sel_dim, id_1, id_2,
                                   c=col, s=20, name='loadings')

    def set_absicca(self, n):
        self.xaxis_data = self._P[:,n]

    def set_ordinate(self, n):
        self.yaxis_data = self._P[:,n]


class LineViewXc(plots.LineViewPlot):
    """A line view of centered raw data."""
    def __init__(self, func_class, name='Profiles'):
        # copy, center, plot
        x = func_class._dataset['X'].copy()
        x._array = x._array - mean(x._array, 0)[newaxis]
        plots.LineViewPlot.__init__(self, x, 1, None, name)


class ParalellCoordinates(plots.Plot):
    """Parallel coordinates for scores/loads with many components."""
    def __init__(self, model, p='loads'):
        pass


class PlsQvalScatter(plots.ScatterPlot):
    """A volcano-like plot of loads vs qvals."""
    def __init__(self, func_class, pc=0):
        model = func_class.model
        if not model.has_key('w_tsq'):
            return
        self._W = model['P']
        dataset_1 = func_class.as_dataset('P')
        dataset_2 = func_class.as_dataset('w_tsq')
        id_dim = dataset_1.get_dim_name(0)    # genes
        sel_dim = dataset_1.get_dim_name(1)   # _comp
        sel_dim_2 = dataset_2.get_dim_name(1) # _zero_dim
        id_1, = dataset_1.get_identifiers(sel_dim, [0])
        id_2, = dataset_2.get_identifiers(sel_dim_2, [0])
        if model.has_key('w_tsq'):
            col = model['w_tsq'].ravel()
            col = normalise(col)
        else:
            col = 'g'
        plots.ScatterPlot.__init__(self, dataset_1, dataset_2,
                                   id_dim, sel_dim, id_1, id_2,
                                   c=col, s=20, sel_dim_2=sel_dim_2,
                                   name='Load Volcano')


class InfluencePlot(plots.ScatterPlot):
    """(Not implemented.)"""
    pass


def normalise(x):
    """Scale vector x to [0,1]."""
    x = x - x.min()
    x = x/x.max()
    return x
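
# A minimal usage sketch (made-up numbers, not part of the original API):
# normalise maps any vector onto [0,1], which is what the colormapped
# scatter plots above expect.
def _example_normalise():
    from scipy import array
    v = array([3., 7., 5.])
    print normalise(v) # -> [ 0.   1.   0.5]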
@@ -0,0 +1,256 @@

from scipy import zeros,zeros_like,sqrt,dot,trace,sign,round_,argmax,\
    sort,ravel,newaxis,asarray,diag,sum,outer,argsort,arange,ones_like,\
    all,apply_along_axis,eye,empty,cumsum
from scipy.linalg import svd,inv,norm,det,sqrtm
from scipy.stats import mean,median
from cx_utils import mat_center
from validation import pls_jkW
from select_generators import shuffle_1d
from engines import *
import time

def hotelling(P, Pfull, p_center='med', cov_center='med',
              alpha=0.3, crot=True, strict=False, metric=None):
    """Returns regularised hotelling T^2.

    p_center -- location method for submodels
    cov_center -- location method for sub covariances
    alpha -- regularisation towards pooled covariance estimate
    crot -- rotate submodels towards the full model?
    strict -- only rotate 90 degrees?
    metric -- inverse metric matrix (if P and Pfull from metric pca/pls)
    """
    m, n = Pfull.shape
    if metric is None:
        metric = eye(m, dtype='<f8')
    Pfull = dot(metric.T, asarray(Pfull))
    n_sets, n, amax = P.shape
    # allocate
    T_sq = empty((n,), dtype='f')
    Cov_i = zeros((n, amax, amax), dtype='f')

    # rotate sub_models to full model
    if crot:
        for i, Pi in enumerate(P):
            Pi = dot(metric.T, Pi)
            P[i] = procrustes(Pfull, Pi, strict=strict)

    # center of the submodels
    if p_center == 'med':
        P_ctr = median(P, 0)
    elif p_center == 'mean':
        # fixme: mean is unstable
        P_ctr = mean(P, 0)
    else: # use full model
        P_ctr = Pfull

    for i in xrange(n):
        Pi = P[:,i,:]        # (n_sets x amax)
        Pi_ctr = P_ctr[i,:]  # (1 x amax)
        Pim = (Pi - Pi_ctr[newaxis])*sqrt(n_sets - 1)
        Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim)

    if cov_center == 'med':
        Cov = median(Cov_i, 0)
    else:
        Cov = mean(Cov_i, 0)

    reg_cov = (1. - alpha)*Cov_i + alpha*Cov
    for i in xrange(n):
        Pc = P_ctr[i,:][:,newaxis]
        sigma = reg_cov[i]
        #T_sq[i] = sqrt(dot(dot(Pc.T, inv(sigma)), Pc).ravel())
        T_sq[i] = dot(dot(Pc.T, inv(sigma)), Pc).ravel()
    return T_sq
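
# A small numeric sketch (made-up values) of the covariance shrinkage
# step used in hotelling above: each per-variable segment covariance is
# shrunk towards the common centre, reg_cov = (1 - alpha)*Cov_i + alpha*Cov.
def _example_shrinkage():
    Cov_i = asarray([[[4., 0.], [0., 1.]]]) # one variable, two components
    Cov = asarray([[1., 0.], [0., 1.]])     # pooled centre estimate
    alpha = 0.5
    print (1. - alpha)*Cov_i + alpha*Cov    # diagonal entries -> 2.5 and 1.0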

def procrustes(A, B, strict=True, center=False, verbose=False):
    """Rotation of B to A.

    strict -- only do flipping and shuffling
    center -- center before rotation, translate back after
    verbose -- print sum of squares

    No scaling calculated.
    Output: B_rot, the rotated B.
    """
    if center:
        A, mn_A = mat_center(A, ret_mn=True)
        B, mn_B = mat_center(B, ret_mn=True)
    u, s, vh = svd(dot(B.T, A))
    v = vh.T
    Cm = dot(u, v.T) # orthogonal rotation matrix
    if strict: # just flipping and shuffling
        Cm = ensure_strict(Cm)
    b_rot = dot(B, Cm)

    if verbose:
        print Cm.round()
        fit = sum(ravel(B - b_rot)**2)
        print "Sum of squares: %s" % fit
    if center:
        return mn_B + b_rot
    else:
        return b_rot
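
# Quick sketch (toy numbers): in strict mode procrustes recovers a pure
# sign flip of the loadings, which is the typical jackknife ambiguity.
def _example_procrustes():
    A = asarray([[1., 0.], [0., 1.], [1., 1.]])
    B = -A                 # same subspace, flipped signs
    print procrustes(A, B) # -> A again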

def expl_var_x(X, T):
    """Returns explained variance of X (centered)."""
    exp_var_x = diag(dot(T.T, T))*100/(sum(X**2))
    return exp_var_x


def expl_var_y(Y, T, Q):
    """Returns explained variance of Y (centered)."""
    exp_var_y = zeros((Q.shape[1],))
    for a in range(Q.shape[1]):
        Ya = outer(T[:,a], Q[:,a])
        exp_var_y[a] = 100*sum(Ya**2)/sum(Y**2)
    return exp_var_y

def pls_qvals(a, b, aopt=None, alpha=.3,
              n_iter=20, algo='pls',
              sim_method='shuffle',
              p_center='med', cov_center='med',
              crot=True, strict=False, metric=None):
    """Returns qvals for pls model.

    input:
    a -- centered data matrix
    b -- centered data matrix
    aopt -- scalar, optimal number of components
    alpha -- [0,1] regularisation parameter for T2-test
    n_iter -- number of permutations
    sim_method -- permutation method ['shuffle']
    p_center -- location estimator for sub models ['med']
    cov_center -- location estimator for covariance of submodels ['med']
    crot -- bool, use rotations of sub models?
    strict -- bool, use strict (rot/flips only) rotations?
    metric -- bool, use row metric?
    """
    m, n = a.shape
    TSQ = zeros((n, n_iter), dtype='<f8') # (nvars x n_subsets)
    n_false = zeros((n, n_iter), dtype='<f8')

    # full model
    if algo == 'bridge':
        dat = bridge(a, b, aopt, 'loads', 'fast')
    else:
        dat = pls(a, b, aopt, 'loads', 'fast')
    W = pls_jkW(a, b, aopt, n_blocks=None, algo=algo)
    tsq_full = hotelling(W, dat['W'], p_center=p_center,
                         alpha=alpha, crot=crot, strict=strict,
                         cov_center=cov_center, metric=metric)
    t0 = time.time()
    Vs = shuffle_1d(b, n_iter)
    for i, b_shuff in enumerate(Vs):
        t1 = time.time()
        if algo == 'bridge':
            dat = bridge(a, b_shuff, aopt, 'loads', 'fast')
        else:
            dat = pls(a, b_shuff, aopt, 'loads', 'fast')
        W = pls_jkW(a, b_shuff, aopt, n_blocks=None, algo=algo)
        TSQ[:,i] = hotelling(W, dat['W'], p_center=p_center,
                             alpha=alpha, crot=crot, strict=strict,
                             cov_center=cov_center, metric=metric)
        print time.time() - t1
    sort_index = argsort(tsq_full)[::-1]
    back_sort_index = sort_index.argsort()
    print time.time() - t0

    # count false positives
    tsq_full_sorted = tsq_full.take(sort_index)
    for i in xrange(n_iter):
        for j in xrange(n):
            n_false[j,i] = sum(TSQ[:,i] >= tsq_full[j])
    false_pos = median(n_false, 1)
    ll = arange(1, len(false_pos)+1, 1)
    sort_qval = false_pos.take(sort_index)/ll
    qval = false_pos/ll.take(back_sort_index)
    print time.time() - t0
    return qval, false_pos, TSQ, tsq_full

def ensure_strict(C, only_flips=True):
    """Ensure that a rotation matrix does only 90 degree rotations.

    In multiplication with pcs this allows flips and reordering.
    If only_flips is True, only flips are allowed.
    """
    Cm = C
    S = sign(C) # signs
    if only_flips:
        C = eye(Cm.shape[0])*S
        return C
    Cm = zeros_like(C)
    Cm.putmask(1., abs(C) > .6)
    if det(Cm) > 1:
        raise ValueError, "Implement this!"
    return Cm*S


def leverage(aopt=1, *args):
    """Returns leverages.

    input: aopt, number of components to base leverage calculations on
           *args, matrices of normed blm-parameters
    output: leverages

    For PCA typical inputs are normalised T or normalised P.
    For PLSR typical inputs are normalised T or normalised W.
    """
    if aopt < 1:
        raise ValueError, "Leverages only make sense for aopt > 0"
    lev = []
    for u in args:
        lev_u = 1./u.shape[0] + dot(u[:,:aopt], u[:,:aopt].T).diagonal()
        lev.append(lev_u)
    return lev
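
# Sketch (toy numbers): leverage of three samples from one normalised
# score column is 1/n plus the squared entries.
def _example_leverage():
    T = asarray([[.8], [.6], [0.]])
    print leverage(1, T) # [array([ 0.9733...,  0.6933...,  0.3333...])]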

def variances(a, t, p):
    """Returns explained variance and individual variance from blm-params.

    input:
    a -- full centered matrix
    t, p -- parameters from a bilinear approximation of the above matrix
    output:
    var -- variance of each component
    var_exp -- cumulative explained variance in percent

    Typical inputs are: X(centered),T,P for PCA, or
    X(centered),T,P / Y(centered),T,Q for PLSR.
    """
    tot_var = sum(a**2)
    var = 100*(sum(p**2, 0)*sum(t**2, 0))/tot_var
    var_exp = cumsum(var)
    return var, var_exp
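
# Sketch (toy numbers): for an exact rank-1 bilinear fit the single
# component explains 100% of the variance.
def _example_variances():
    t = asarray([[1.], [2.]])
    p = asarray([[3.], [4.]])
    a = dot(t, p.T)          # rank-1 "centered" data
    print variances(a, t, p) # (array([ 100.]), array([ 100.]))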

def residual_diagnostics(Y, Yhat, aopt=1):
    """Root mean errors, press values and R2 values.
    (Not implemented.)
    """
    pass


def ssq(E, axis=0, weights=None):
    """Sum of squares, supports weights."""
    n = E.shape[axis]
    if weights is None:
        weights = eye(n)
    else:
        weights = diag(weights)
    if axis == 0:
        Ew = dot(weights, E)
    elif axis == 1:
        Ew = dot(E, weights)
    else:
        raise NotImplementedError, "Higher order modes not supported"
    return pow(Ew, 2).sum(axis)

@@ -0,0 +1,108 @@

from scipy import apply_along_axis,newaxis,zeros,nan,\
    round_,nonzero,dot,argmax,any,sqrt,ndarray,\
    trace,zeros_like,sign,sort,real,argsort,rand,array
from scipy.linalg import norm,svd,inv,eig
from scipy.stats import median,mean

def normalise(a, axis=0, return_scales=False):
    s = apply_along_axis(norm, axis, a)
    if axis == 0:
        s = s[newaxis]
    else:
        s = s[:,newaxis]

    a_s = a/s
    if return_scales:
        return a_s, s
    return a_s

def sub2ind(shape, i, j):
    """Indices from subscripts. Only supports 2d."""
    row, col = shape
    ind = []
    for k in xrange(len(i)):
        for m in xrange(len(j)):
            ind.append(i[k]*col + j[m])
    return ind


def sorted_eig(a, b=None, sort_by='sm'):
    """Just eig with the real part of the output sorted.

    This is for convenience only, not general!

    sort_by='sm': return the eigenvectors by eigenvalues
                  of smallest magnitude first (default)
            'lm': return largest eigenvalues first

    output: just as eig with 2 outputs -- s,v (eigvals, eigenvectors)
            (This is reversed output compared to matlab.)
    """
    s, v = eig(a, b)
    s = real(s) # don't expect any imaginary part
    v = real(v)
    ind = argsort(s)
    if sort_by == 'lm':
        ind = ind[::-1]
    v = v.take(ind, 1)
    s = s.take(ind)

    return s, v
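
# Sketch: the eigenvalues of diag(2, 1) come back smallest-magnitude
# first by default, largest first with sort_by='lm'.
def _example_sorted_eig():
    from scipy import diag
    s, v = sorted_eig(diag([2., 1.]))
    print s # [ 1.  2.]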

def str2num(string_number):
    """Convert input (string number) into a number; if float(string_number)
    fails, a nan is inserted.
    """
    missings = ['', 'nan', 'NaN', 'NA']
    try:
        num = float(string_number)
    except ValueError:
        if string_number in missings:
            num = nan
        else:
            print "Found strange entry: %s" % string_number
            raise
    return num
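
# Sketch: numbers parse normally, the listed missing-value strings map
# to nan, anything else re-raises.
def _example_str2num():
    print str2num('1.5'), str2num('NA') # 1.5 nan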

def randperm(n):
    """Returns a random permutation of range(n)."""
    r = rand(n)
    mapping = {}
    for i in range(n):
        mapping[r[i]] = i
    r = sort(r)
    out = zeros(n)
    for i in range(n):
        out[i] = mapping[r[i]]
    return array(out, dtype='i')


def mat_center(X, axis=0, ret_mn=False):
    """Mean center matrix along axis.

    X -- matrix, data
    axis -- dim; axis=0 (default) subtracts the column means,
            axis=1 subtracts the row means
    ret_mn -- bool, return mean

    output:
    Xc, [mnX]
    """
    try:
        rows, cols = X.shape
    except ValueError:
        print "The X data needs to be two-dimensional"
        raise

    if axis == 0:
        mnX = mean(X, axis)[newaxis]
        Xs = X - mnX
    elif axis == 1:
        mnX = mean(X, axis)[newaxis]
        Xs = (X.T - mnX).T
    if ret_mn:
        return Xs, mnX
    else:
        return Xs
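
# Sketch (toy numbers): with the default axis=0 every column of the
# centered matrix sums to zero.
def _example_mat_center():
    X = array([[1., 2.], [3., 4.]])
    print mat_center(X) # [[-1. -1.] [ 1.  1.]]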
@@ -0,0 +1,200 @@

"""Module containing algorithms for (burdensome) calculations.

There is no typechecking of any kind here, just focus on speed.
"""

from scipy.linalg import svd,norm,inv,pinv,qr
from scipy import dot,empty,eye,newaxis,zeros,sqrt,diag,\
    apply_along_axis,mean,ones,randn,empty_like,outer,c_,\
    rand,sum,cumsum

def pca(a, aopt, scale='scores', mode='normal'):
    """Principal Component Analysis model.

    mode:
    -- fast : returns smallest dim scaled (T for n<=m, P for n>m)
    -- normal : returns all model params and residuals after aopt comp.
    -- detailed : returns all model params and all residuals
    """
    m, n = a.shape
    u, s, vt = svd(a, full_matrices=0)
    T = u*s
    T = T[:,:aopt]
    P = vt[:aopt,:].T

    if scale == 'loads':
        tnorm = apply_along_axis(norm, 0, T)
        T = T/tnorm
        P = P*tnorm

    if mode == 'fast':
        return {'T':T, 'P':P}

    if mode == 'detailed':
        # detailed mode returns a residual matrix for every number of
        # components, i.e. E is a three-mode array: (aopt, m, n)
        E = empty((aopt, m, n))
        for ai in range(aopt):
            e = a - dot(T[:,:ai+1], P[:,:ai+1].T)
            E[ai,:,:] = e.copy()
    else:
        E = a - dot(T, P.T)

    return {'T':T, 'P':P, 'E':E}
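
# Usage sketch (random data): the normal-mode output satisfies
# X == T*P' + E for the centered input X.
def _example_pca():
    X = rand(10, 4)
    X = X - mean(X, 0)[newaxis] # centre the columns first
    dat = pca(X, 2)
    print abs(X - dot(dat['T'], dat['P'].T) - dat['E']).max() # ~0.0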

def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
    """Kernel pls for tall/wide matrices.

    Fast pls for calibration. Only inefficient for many Y-vars.
    """
    m, n = a.shape
    k, l = b.shape

    W = empty((n, aopt))
    P = empty((n, aopt))
    R = empty((n, aopt))
    Q = empty((l, aopt))
    T = empty((m, aopt))
    B = empty((aopt, n, l))

    if ab is None:
        ab = dot(a.T, b)
    for i in range(aopt):
        if ab.shape[1] == 1:
            w = ab
        else:
            u, s, vh = svd(dot(ab.T, ab))
            w = dot(ab, u[:,:1])

        w = w/norm(w)
        r = w.copy()
        if i > 0:
            for j in range(0, i, 1):
                r = r - dot(P[:,j].T, w)*R[:,j][:,newaxis]
        t = dot(a, r)
        tt = norm(t)**2
        p = dot(a.T, t)/tt
        q = dot(r.T, ab).T/tt
        ab = ab - dot(p, q.T)*tt
        T[:,i] = t.ravel()
        W[:,i] = w.ravel()
        P[:,i] = p.ravel()
        R[:,i] = r.ravel()

        if mode == 'fast' and i == aopt - 1:
            if scale == 'loads':
                tnorm = apply_along_axis(norm, 0, T)
                T = T/tnorm
                W = W*tnorm
            return {'T':T, 'W':W}

        Q[:,i] = q.ravel()
        B[i] = dot(R[:,:i+1], Q[:,:i+1].T)

    if mode == 'detailed':
        E = empty((aopt, m, n))
        F = empty((aopt, k, l))
        for i in range(1, aopt+1, 1):
            E[i-1] = a - dot(T[:,:i], P[:,:i].T)
            F[i-1] = b - dot(T[:,:i], Q[:,:i].T)
    else:
        E = a - dot(T[:,:aopt], P[:,:aopt].T)
        F = b - dot(T[:,:aopt], Q[:,:aopt].T)

    if scale == 'loads':
        tnorm = apply_along_axis(norm, 0, T)
        T = T/tnorm
        W = W*tnorm
        Q = Q*tnorm
        P = P*tnorm

    return {'B':B, 'Q':Q, 'P':P, 'T':T, 'W':W, 'R':R, 'E':E, 'F':F}
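
# Usage sketch (random data): with aopt components the regression matrix
# B[-1] reproduces Y up to the residual F, i.e. Y == X*B + F.
def _example_pls():
    X = rand(10, 4); X = X - mean(X, 0)[newaxis]
    Y = rand(10, 1); Y = Y - mean(Y, 0)[newaxis]
    dat = pls(X, Y, aopt=2)
    print abs(Y - dot(X, dat['B'][-1]) - dat['F']).max() # ~0.0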

def w_simpls(aat, b, aopt):
    """Simpls for wide matrices.

    Fast pls for crossval, used in calculating rmsep for wide X.
    There is no P or W. T is normalised.
    """
    bb = b.copy()
    m, m = aat.shape
    U = empty((m, aopt))
    T = empty((m, aopt))
    H = empty((m, aopt))    # just like W in simpls
    PROJ = empty((m, aopt)) # just like R in simpls

    for i in range(aopt):
        u, s, vh = svd(dot(dot(b.T, aat), b), full_matrices=0)
        u = dot(b, u[:,:1]) # y-factor scores
        U[:,i] = u.ravel()
        t = dot(aat, u)
        t = t/norm(t)
        T[:,i] = t.ravel()
        h = dot(aat, t) # score-weights
        H[:,i] = h.ravel()
        PROJ[:,:i+1] = dot(T[:,:i+1], inv(dot(T[:,:i+1].T, H[:,:i+1])))
        if i < aopt:
            b = b - dot(PROJ[:,:i+1], dot(H[:,:i+1].T, b))
    C = dot(bb.T, T)

    return {'T':T, 'U':U, 'Q':C, 'H':H}


def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
    """Undeflated ridged svd(X'Y)."""
    m, n = a.shape
    k, l = b.shape
    u, s, vt = svd(b, full_matrices=0)
    g0 = dot(u*s, u.T)
    g = (1 - r)*g0 + r*eye(m)
    ag = dot(a.T, g)

    u, s, vt = svd(ag, full_matrices=0)
    W = u[:,:aopt]
    K = vt[:aopt,:].T
    T = dot(a, W)
    tnorm = apply_along_axis(norm, 0, T) # norm of T-columns

    if mode == 'fast':
        if scale == 'loads':
            T = T/tnorm
            W = W*tnorm
        return {'T':T, 'W':W}

    U = dot(g0, K) # fixme: check this
    Q = dot(b.T, dot(T, inv(dot(T.T, T))))
    B = zeros((aopt, n, l))
    for i in range(aopt):
        B[i] = dot(W[:,:i+1], Q[:,:i+1].T)
    # leverages
    # fixme: probably need an orthogonal basis for row-space leverage;
    # T (scores) are not orthogonal, so use a qr decomposition to get an
    # orthonormal basis for the row-space:
    #Tq = qr(T)[0]
    #s_lev,v_lev = leverage(aopt,Tq,W)
    # explained variance
    #var_x, exp_var_x = variances(a,T,W)
    #qnorm = apply_along_axis(norm, 0, Q)
    #var_y, exp_var_y = variances(b,U,Q/qnorm)

    if mode == 'detailed':
        E = empty((aopt, m, n))
        F = empty((aopt, k, l))
        for i in range(aopt):
            E[i] = a - dot(T[:,:i+1], W[:,:i+1].T)
            F[i] = b - dot(a, B[i])
    else: # normal
        F = b - dot(a, B[-1])
        E = a - dot(T, W.T)

    if scale == 'loads':
        T = T/tnorm
        W = W*tnorm
        Q = Q*tnorm

    return {'B':B, 'W':W, 'T':T, 'Q':Q, 'E':E, 'F':F, 'U':U, 'P':W}

@@ -0,0 +1,626 @@

import os, sys
from itertools import izip
import networkx as NX
from scipy import shape,diag,dot,asarray,sqrt,real,zeros,eye,exp,maximum,\
    outer,sum,ravel,array,mean
from scipy.linalg import eig,svd,inv,expm,norm
from cx_utils import sorted_eig

import numpy

eps = numpy.finfo(float).eps.item()
feps = numpy.finfo(numpy.single).eps.item()
_array_precision = {'f': 0, 'd': 1, 'F': 0, 'D': 1, 'i': 1}

def xgraph_to_graph(G):
    """Convert an XGraph to an ordinary graph.

    Edge attributes, multi-edges and self-loops are lost in the process.
    """
    GG = NX.convert.from_dict_of_lists(NX.convert.to_dict_of_lists(G))
    return GG


def get_affinity_matrix(G, data, ids, dist='e', mask=None, weight=None, t=0, out='dist'):
    """Calculates a general affinity matrix, based upon distances.

    Affinity = 1 - distance, rescaled so the largest distance maps to
    the farthest-apart affinity.

    INPUT

    data:
        gene expression data, type dict data[gene] = expression-vector

    G:
        the network (networkx.base.Graph object)

    mask:
        the array mask shows which data are missing. If mask[i][j]==0,
        then data[i][j] is missing.

    weights:
        the array weight contains the weights to be used when
        calculating distances.

    transpose:
        if transpose==0, genes are clustered; if transpose==1,
        microarrays are clustered.

    dist:
        the character dist defines the distance function to be used:
        dist=='e': Euclidean distance
        dist=='b': City Block distance
        dist=='h': Harmonically summed Euclidean distance
        dist=='c': Pearson correlation
        dist=='a': absolute value of the correlation
        dist=='u': uncentered correlation
        dist=='x': absolute uncentered correlation
        dist=='s': Spearman's rank correlation
        dist=='k': Kendall's tau
        For other values of dist, the default (Euclidean distance) is used.

    OUTPUT
    D:
        similarity matrix (nGenes x nGenes), symmetric, d_ij in [0,1],
        normalised so max weight = 1.0
    """
    try:
        from Bio import Cluster as CLS
    except:
        raise ValueError, "Needs installed biopython"
    nVar = len(data)
    nSamp = len(data[data.keys()[0]])
    X = zeros((nVar, nSamp), dtype='<f8')
    for i, gene in enumerate(ids): # this should be right!!
        X[i,:] = data[gene]

    #X = transpose(X) # distancematrix needs matrix as (nGenes, nSamples)

    D_list = CLS.distancematrix(X, dist=dist)
    D = zeros((nVar, nVar), dtype='<f8')
    for i, row in enumerate(D_list):
        if i > 0:
            D[i,:len(row)] = row

    D = D + D.T
    MAX = 30.0
    D_max = max(ravel(D))/MAX
    D_n = D/D_max            # normalised (max = 30.0)
    D_n = (MAX + 1.) - D_n   # using correlation (inverse distance for dists)

    A = NX.adj_matrix(G, nodelist=ids)
    if out == 'dist':
        return D_n*A
    elif out == 'heat_kernel':
        t = 1.0
        K = exp(-t*D*A)
        return K
    elif out == 'complete':
        return D_n
    else:
        return []

def remove_one_degree_nodes(G, iter=True):
    """Removes all nodes with only one neighbour. These nodes do
    not contribute to community structure.

    input:
    G -- graph
    iter -- True/False, iteratively remove?
    """
    G_copy = G.copy()
    if iter == True:
        while 1:
            bad_nodes = []
            for node in G_copy.nodes():
                if len(G_copy.neighbors(node)) == 1:
                    bad_nodes.append(node)
            if len(bad_nodes) > 0:
                G_copy.delete_nodes_from(bad_nodes)
            else:
                break
    else:
        bad_nodes = []
        for node in G_copy.nodes():
            if len(G_copy.neighbors(node)) == 1:
                bad_nodes.append(node)
        if len(bad_nodes) > 0:
            G_copy.delete_nodes_from(bad_nodes)

    print "Deleted %s nodes from network" % (len(G) - len(G_copy))
    return G_copy

def key_players(G, n=1, with_labels=False):
    """Resilience measure.

    Identification of key nodes by the fraction of nodes in the
    disconnected subgraph when a node is removed.

    output:
    fraction of nodes disconnected when node i is removed
    """
    i = 0
    frac = []
    labels = {}
    for node in G.nodes():
        i += 1
        print i
        T = G.copy()
        T.delete_node(node)
        n_nodes = T.number_of_nodes()
        sub_graphs = NX.connected_component_subgraphs(T)
        n_comp = len(sub_graphs)
        if n_comp > 1:
            strong_comp = sub_graphs[0]
            fraction = 1.0 - 1.0*strong_comp.number_of_nodes()/n_nodes
            frac.append(fraction)
            labels[node] = fraction
        else:
            frac.append(0.0)
            labels[node] = 0.0

    out = 1.0 - array(frac)
    if with_labels == True:
        return out, labels
    else:
        return out

def node_weighted_adj_matrix(G, weights=None, ave_type='harmonic', with_labels=False):
    """Return a weighted adjacency matrix of the graph. The weights are
    node weights.

    input:  G -- graph
            weights -- dict, keys: nodes, values: weights
            with_labels -- True/False, return labels?

    output: A -- weighted adjacency matrix
            [index] -- node labels
    """
    n = G.order()
    # make a dictionary that maps vertex name to position
    index = {}
    count = 0
    for node in G.nodes():
        index[node] = count
        count = count + 1

    a = zeros((n, n))
    if G.__class__.__name__ == 'XGraph':
        raise TypeError, "Use weighted_adj_matrix for XGraphs"
    for head, tail in G.edges():
        if ave_type == 'geometric':
            a[index[head],index[tail]] = sqrt(weights[head]*weights[tail])
            a[index[tail],index[head]] = a[index[head],index[tail]]
        elif ave_type == 'harmonic':
            # fixme: this is the arithmetic, not the harmonic, mean
            a[index[head],index[tail]] = mean([weights[head], weights[tail]])
            a[index[tail],index[head]] = a[index[head],index[tail]]
    if with_labels:
        return a, index
    else:
        return a

def weighted_adj_matrix(G, with_labels=False):
    """Adjacency matrix of an XGraph whose weights are given in edges."""
    A, labels = NX.adj_matrix(G, with_labels=True)
    W = A.astype('<f8')
    for orf, i in labels.items():
        for orf2, j in labels.items():
            if G.has_edge(orf, orf2):
                edge_weight = G.get_edge(orf, orf2)
                W[i,j] = edge_weight
                W[j,i] = edge_weight
    if with_labels == True:
        return W, labels
    else:
        return W

def assortative_index(G):
    """Outputs two vectors: the degree and the neighbour average degree.

    Used to measure assortative mixing. If the average degree is
    positively correlated with the degree, we know that hubs tend to
    connect to other hubs.

    input: G, graph (connected!)
    output: d, mn_d: degree, and average degree of neighbours
            (degree sorting from degree(with_labels=True))
    """
    d = G.degree(with_labels=True)
    out = []
    for node in G.nodes():
        nn = G.neighbors(node)
        if len(nn) > 0:
            nn_d = mean([float(d[i]) for i in nn])
            out.append((d[node], nn_d))
    return array(out).T

def struct_equivalence(G, n1, n2):
    """Returns the structural equivalence of a node pair. Two nodes
    are structurally equal if they share the same neighbours.

    x_s = [ne(n1) union ne(n2) - ne(n1) intersection ne(n2)] /
          [ne(n1) union ne(n2) + ne(n1) intersection ne(n2)]
    ref: Brun et al. 2003
    """
    s1 = set(G.neighbors(n1))
    s2 = set(G.neighbors(n2))
    num_union = len(s1.union(s2))
    num_intersection = len(s1.intersection(s2))
    if num_union + num_intersection == 0:
        xs = 0.0
    else:
        xs = float(num_union - num_intersection)/(num_union + num_intersection)
    return xs
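
# Sketch (old networkx API assumed): two leaves hanging off the same
# node share their single neighbour, so their structural distance is 0.
def _example_struct_equivalence():
    G = NX.Graph()
    G.add_edge('a', 'c')
    G.add_edge('b', 'c')
    print struct_equivalence(G, 'a', 'b') # 0.0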

def struct_equivalence_all(G):
    """Not finished."""
    A, labels = NX.adj_matrix(G, with_labels=True)
    pass


def hamming_distance(n1, n2):
    """Not finished."""
    pass


def graph_corrcoeff(G):
    """Not finished."""
    A, index = NX.adj_matrix(G, with_labels=True)
    #C = zeros(*A.shape(), 'd')
    n = 1.*G.number_of_nodes()
    for node in G.nodes():
        a_j = A[index[node],:]  # neighbours
        mean_a = sum(a_j)/n     # degree(G)/number_of_nodes()
        var_a = sqrt(sum((a_j - mean_a)**2)/n)
    pass

def graph_and_data_intersection(data, graph, pathways=None,
                                keep_connected=True):
    """Returns the intersection of keys in two dictionaries.

    NB: keep track of identifier sorting after these dict transforms.

    input:
    data -- dict, keys: gene id, value: measurement profile
    graph -- networkx.base.Graph, full graph
    pathways -- dict, keys: pathway name, values: nodes in pathway

    call:
    new_data, new_graph, pathways = graph_and_data_intersection(data,
                                        graph, pathways, keep_connected=True)
    """
    new_graph = graph.copy()
    new_data = {}
    new_pathways = {}
    graph_set = set(graph.nodes())
    data_set = set(data.keys())
    intersection = data_set & graph_set
    new_graph.delete_nodes_from(graph_set - data_set) # remove difference
    for k in intersection:
        new_data[k] = data[k]

    if keep_connected:
        max_iter = 0
        sub_graphs = NX.connected_component_subgraphs(new_graph)
        new_graph = sub_graphs[0] # largest connected component
        old_data = new_data
        while new_graph.number_of_nodes() != len(new_data) and max_iter < 100:
            max_iter += 1
            graph_set = set(new_graph.nodes())
            data_set = set(new_data.keys())
            intersection = data_set & graph_set
            new_graph.delete_nodes_from(graph_set - data_set)
            new_data = {}
            for k in intersection:
                new_data[k] = old_data[k]
            old_data = new_data.copy()
            new_graph = NX.connected_component_subgraphs(new_graph)[0]
    if pathways != None:
        for pth, nodes in pathways.items():
            new_pathways[pth] = [node for node in nodes if node in new_graph]
    print "\nSUMMARY (graph_and_data_intersection): "
    print "Number of input variables: %s\n\
    Number of nodes in input graph: %s" % (len(data), len(graph))
    print "\nUsing intersection of connected graph and nodes with data values"
    print "Number of variables is now: %s" % len(new_data)
    print "Number of nodes in graph: %s" % new_graph.number_of_nodes()
    if pathways != None:
        return new_data, new_graph, new_pathways
    else:
        return new_data, new_graph

def rx_graph_and_data_intersection(graph, node_data, pathways, data, keep_connected=False):
    """Returns a (connected) reaction graph with present gene expression data.

    keep_connected==True:
    When a node (gene) is not present in our expression data, the node
    is deleted and all neighbours are connected with edge weight=0.5
    if they are not already neighbours.

    input:
    data -- dict, keys: gene id, value: measurement profile
    graph -- networkx.xbase.XGraph, full weighted graph
    node_data -- dict, keys: rx id, value: set of gene_ids
    pathways -- dict, keys: pathway name, values: list of nodes in pathway
    """
    # We do not connect the full graph ... may be performed by using the
    # reference graph?
    graph = NX.connected_component_subgraphs(graph)[0] # largest connected component

    new_graph = graph.copy()
    new_data = {}
    new_node_data = node_data.copy()
    new_pathways = {}

    genes_in_graph = set()
    genes_in_data = set(data.keys())
    rx_in_graph = set(new_graph.nodes())

    # genes in graph nodes (rx_nodes)
    for rx in rx_in_graph:
        genes_in_graph.update(set(new_node_data.get(rx)))
    keep_genes = genes_in_data.intersection(genes_in_graph) # both in graph and data

    # update node data
    for rx, genes in node_data.items(): # delete node data of nodes not present in graph
        genes = set(genes)
        genes.intersection_update(keep_genes) # remove genes if they are not in the intersection
        if len(genes) == 0 or rx not in rx_in_graph: # no gene data or not in graph
            print "removing: " + str(rx)
            del new_node_data[rx]
    rx_in_data = set(new_node_data.keys())
    rx_intersection = rx_in_data.intersection(rx_in_graph)

    for gene in keep_genes:
        new_data[gene] = data.get(gene)

    # update pathway nodes
    for pth, genes in pathways.items():
        if genes:
            genes = set(genes)
            genes.intersection_update(keep_genes) # gene needs to have data
        else:
            pass
        new_pathways[pth] = genes
    bad_nodes = rx_in_graph.difference(rx_in_data) # in graph but no data

    if keep_connected == True:
        dummy = new_graph.copy()
        for rx in bad_nodes:
            dummy.delete_node(rx)
            if len(NX.connected_component_subgraphs(dummy)) > 1:
                nghbrs = new_graph.neighbors(rx)
                for i in nghbrs:
                    for j in nghbrs:
                        if i != j:
                            if not new_graph.has_edge(i, j):
                                new_graph.add_edge(i, j, 0.5)

    # update graph
    new_graph.delete_nodes_from(list(bad_nodes))

    return new_graph, new_node_data, new_pathways, new_data

def weighted_laplacian(G, with_labels=False):
    """Return standard Laplacian of graph from a weighted adjacency matrix."""
    n = G.order()
    I = scipy.eye(n)
    A = weighted_adj_matrix(G)
    D = I*scipy.sum(A, 0)
    L = D - A
    if with_labels:
        A, index = weighted_adj_matrix(G, with_labels=True)
        return L, index
    else:
        return L

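# --- Added example (not part of the original module) -------------------------
# A minimal sketch of the L = D - A construction above, on a hand-made
# weighted adjacency matrix so it does not depend on the weighted_adj_matrix()
# helper defined elsewhere in this module.
def _demo_weighted_laplacian():
    A = scipy.array([[0., 2., 0.],
                     [2., 0., 1.],
                     [0., 1., 0.]]) # weighted path graph: 0 - 1 - 2
    D = scipy.eye(3)*scipy.sum(A, 0) # diagonal strength (weighted degree)
    L = D - A
    assert scipy.allclose(scipy.sum(L, 1), 0) # Laplacian rows sum to zero
    return L
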
"""Below are methods for calculating graph metrics.

Four main decompositions:
 0.) Adjacency diffusion kernel expm(A)
 1.) von Neumann kernels (diagonalisation of adjacency matrix)
 2.) Laplacian kernels (geometric series of adj.)
 3.) Diffusion kernels (exponential series of adj.)

---- Kv
von_neumann: Kv = (I - alpha*A)^-1 (mod: A(I - alpha*A)^-1 ?), geom. series

---- Kl
laplacian: Kl = (I - alpha*L)^-1, geom. series

---- Kd
laplacian_diffusion: Kd = expm(-alpha*L), exp. series

---- Ke
exponential diffusion: Ke = expm(A) .... expm(-A)?
"""

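# --- Added numerical check (not part of the original module) -----------------
# The geometric-series kernels above rely on the identity
# (I - alpha*A)^-1 = I + alpha*A + alpha^2*A^2 + ..., which converges when
# alpha is below 1/lambda_max(A). A quick self-contained sketch:
def _check_geometric_series(alpha=0.1, n_terms=50):
    from scipy import array, eye, dot, allclose
    from scipy.linalg import inv
    A = array([[0., 1., 1.],
               [1., 0., 1.],
               [1., 1., 0.]]) # triangle graph, lambda_max = 2
    K = inv(eye(3) - alpha*A) # closed form
    S = eye(3)
    term = eye(3)
    for _ in range(n_terms):
        term = alpha*dot(term, A) # alpha^k A^k
        S = S + term
    assert allclose(K, S)
    return K
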
# TODO:
# check for numerically unstable eigenvalues and set them to zero,
# otherwise some inverses will explode -> ok ... using pinv for inverses
#
# This gives results that look numerically unstable
#
# -- divided adj by sum(A[:]), check this one (paper by Lebart scales with number of edges)
#
# the Neumann kernel is defined in Kandola to be K = A*(I-A)^-1
# lowest eigenvectors are same as the highest of K = A*A ?
# this needs clarification
#
# diffusion is still wrong! ... ok
# diff needs normalisation?! check the meaning of exp(-s) = exp(1/s) -L = 1/degree ... etc
# Is it the negative of exp. of adj. matrix in Kandola?
#
# normalised=False returns only nans (no idea why!!) ... fixed ok
#
# 31.1: diff is ok, exp(0)=1, not zero!
# 07.03.2005: normalisation is ok -> normalisation will emphasize high degree nodes
# 10.03.2005: symeig is unstable and returns nans for some eigenvectors? switching back to eig
# 14.05.2006: diffusion returns negative values, using expm(-LL) instead (FIX)
# 13.09.2006: update for use in numpy

def K_expAdj(W, normalised=False, alpha=1.0):
    """Matrix exponential of adjacency matrix, mentioned in Kandola as a general diffusion kernel.
    """
    W = asarray(W)
    t = W.dtype.char
    if len(W.shape)!=2:
        raise ValueError, "Non-matrix input to matrix function."
    m, n = W.shape
    if t in ['F','D']:
        raise TypeError, "Complex input!"
    if normalised==True:
        T = diag(sqrt(1./(sum(W,0))))
        W = dot(dot(T, W), T)
    e, vr = eig(W)
    s = real(e)**2 # from eigenvalues to singular values
    vri = inv(vr)
    s = maximum.reduce(s) + s
    cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
    cutoff = abs(cond*maximum.reduce(s))
    psigma = eye(m)
    for i in range(len(s)):
        if abs(s[i]) > cutoff:
            psigma[i,i] = .5*alpha*exp(s[i])

    return dot(dot(vr, psigma), vri)

def K_vonNeumann(W, normalised=False, alpha=1.0):
    """The geometric series of path lengths.

    Returns matrix square root of pseudo inverse of the adjacency matrix.
    """
    W = asarray(W)
    t = W.dtype.char
    if len(W.shape)!=2:
        raise ValueError, "Non-matrix input to matrix function."
    m, n = W.shape
    if t in ['F','D']:
        raise TypeError, "Complex input!"

    if normalised==True:
        T = diag(sqrt(1./(sum(W,0))))
        W = dot(dot(T, W), T)
    e, vr = eig(W)
    vri = inv(vr)
    e = real(e) # we only work with real pos. eigvals
    e = maximum.reduce(e) + e
    cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
    cutoff = cond*maximum.reduce(e)
    psigma = zeros((m,n), t)
    for i in range(len(e)):
        if e[i] > cutoff:
            psigma[i,i] = 1.0/e[i] # these are eig.vals (=sqrt(sing.vals))
    return dot(dot(vr, psigma), vri).astype(t)

def K_laplacian(W, normalised=True, alpha=1.0):
    """This is the matrix square root of the pseudo inverse of L.

    Also known as the average commute time matrix.
    """
    W = asarray(W)
    t = W.dtype.char
    if len(W.shape)!=2:
        raise ValueError, "Non-matrix input to matrix function."
    m, n = W.shape
    if t in ['F','D']:
        raise TypeError, "Complex input!"
    D = diag(sum(W,0))
    L = D - W
    if normalised==True:
        T = diag(sqrt(1./sum(W,0)))
        L = dot(dot(T, L), T)
    e, vr = eig(L)
    e = real(e)
    vri = inv(vr)
    cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
    cutoff = cond*maximum.reduce(e)
    psigma = zeros((m,), t) # if s close to zero -> set 1/s = 0
    for i in range(len(e)):
        if e[i] > cutoff:
            psigma[i] = 1.0/e[i]
    K = dot(dot(vr, diag(psigma)), vri).astype(t)
    K = real(K)
    I = eye(n)
    K = (1 - alpha)*I + alpha*K
    return K

def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5):
    """Returns diffusion kernel.

    input:
        -- W, adj. matrix
        -- normalised [True/False]
        -- alpha, [0,1] (degree of network influence)
        -- beta, [0->) (diffusion degree)
    """
    W = asarray(W)
    t = W.dtype.char
    if len(W.shape)!=2:
        raise ValueError, "Non-matrix input to matrix function."
    m, n = W.shape
    if t in ['F','D']:
        raise TypeError, "Complex input!"
    D = diag(sum(W,0))
    L = D - W
    if normalised==True:
        T = diag(sqrt(1./(sum(W,0))))
        L = dot(dot(T, L), T)
    e, vr = eig(L)
    vri = inv(vr) # inv
    cond = 1.0*{0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
    cutoff = 1.*abs(cond*maximum.reduce(e))
    psigma = eye(m) # if sing. vals are 0: exp(0)=1 (unnecessary)
    #psigma = zeros((m,n), dtype='<f8')
    for i in range(len(e)):
        if abs(e[i]) > cutoff:
            psigma[i,i] = exp(-beta*e[i])
    K = real(dot(dot(vr, psigma), vri))
    I = eye(n, dtype='<f8')
    K = (1. - alpha)*I + alpha*K
    return K

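# --- Added sanity check (not part of the original module) --------------------
# With normalised=False and alpha=1.0 the kernel above reduces to an
# eigendecomposition of expm(-beta*L), so it can be compared against
# scipy.linalg.expm directly (small eigenvalue-cutoff effects aside):
def _check_diffusion(beta=0.5):
    from scipy import array, diag, sum, allclose
    from scipy.linalg import expm
    W = array([[0., 1., 0.],
               [1., 0., 1.],
               [0., 1., 0.]])
    L = diag(sum(W, 0)) - W
    K1 = K_diffusion(W, normalised=False, alpha=1.0, beta=beta)
    K2 = expm(-beta*L)
    return allclose(K1, K2, atol=1e-6)
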
def K_modularity(W, alpha=1.0):
    """Returns the matrix square root of Newman's modularity."""
    W = asarray(W)
    t = W.dtype.char
    m, n = W.shape
    d = sum(W, 0)
    m = 1.*sum(d) # total weight (2x number of edges); shadows the row count
    B = W - (outer(d, d)/m)
    s, v = sorted_eig(B, sort_by='lm')
    psigma = zeros((n, n), dtype='<f8')
    for i in range(len(s)):
        if s[i]>1e-7:
            psigma[i,i] = sqrt(s[i])
            #psigma[i,i] = s[i]
    K = dot(dot(v, psigma), v.T)
    I = eye(n)
    K = (1 - alpha)*I + alpha*K
    return K

def kernel_score(K, W):
    """Returns the modularity score.

    K -- (modularity) kernel
    W -- adjacency matrix (possibly weighted)
    """
    # normalize W (: W'W=I)
    m, n = shape(W)
    for i in range(n):
        W[:,i] = W[:,i]/norm(W[:,i])
    score = diag(dot(W.T, dot(K, W))) # diag(W'KW), using the normalisation above
    tot = sum(score)
    return score, tot

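# --- Added usage sketch (not part of the original module) --------------------
# How K_modularity and kernel_score above fit together on a toy graph with
# two loose communities. W is passed as a copy because kernel_score
# normalises its argument in place.
def _demo_modularity_score():
    from scipy import array
    W = array([[0., 1., 1., 0.],
               [1., 0., 1., 0.],
               [1., 1., 0., 1.],
               [0., 0., 1., 0.]])
    K = K_modularity(W, alpha=1.0)
    score, tot = kernel_score(K, W.copy())
    return score, tot
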
@ -0,0 +1,187 @@
"""Matrix cross validation selection generators
"""
from scipy import take, arange, ceil, repeat, newaxis, mean, asarray, dot,\
     ones, random, array_split, floor, vstack, minimum
from cx_utils import randperm

def w_pls_gen(aat, b, n_blocks=None, center=True, index_out=False):
    """Random block crossvalidation for wide (XX.T) trick in PLS.
    Leave-one-out is a subset, with n_blocks equal to nSamples.

    aat -- outer product of X (X*X.T)
    b -- Y
    n_blocks -- number of crossvalidation blocks
    center -- use centering of calibration sets, (aat_in, b_in) are centered

    Returns:
    -- aat_in, aat_out, b_in, b_out, [out]
    """
    m, n = aat.shape
    index = randperm(m)
    nValuesInBlock = m/n_blocks
    if n_blocks==m:
        index = arange(m)
    out_ind = [index[i*nValuesInBlock:(i+1)*nValuesInBlock] for i in range(n_blocks)]

    for out in out_ind:
        inn = [i for i in index if i not in out]
        aat_in = aat[inn,:][:,inn]
        aat_out = aat[out,:][:,inn]
        b_in = b[inn,:]
        b_out = b[out,:]
        if center:
            # centering projector: I - (1/n)11'
            # nin = len(inn)
            # Pc = eye(nin) - outer(ones((nin,)),ones((nin,)))/nin
            # xxt - x( outer(ones((nin,)),ones((nin,)))/nin ) x.T

            # de Jong centering trick:
            h = sum(aat_in, 0)[:,newaxis]
            h = (h - mean(h)/2)/len(inn)
            mn_a = h + h.T
            aat_in = aat_in - mn_a
        if index_out:
            yield aat_in, aat_out, b_in, b_out, out
        else:
            yield aat_in, aat_out, b_in, b_out

def pls_gen(a, b, n_blocks=None, center=False, index_out=False, axis=0):
    """Random block crossvalidation.
    Leave-one-out is a subset, with n_blocks equal to a.shape[-1].
    """
    index = randperm(a.shape[axis])
    if n_blocks==None:
        n_blocks = a.shape[axis]
    n_in_set = int(ceil(float(a.shape[axis])/n_blocks)) # cast for use as slice bound
    out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_blocks)]
    for out in out_ind_sets:
        inn = [i for i in index if i not in out]
        if center:
            a = a - mean(a, 0)[newaxis]
            b = b - mean(b, 0)[newaxis]
        if index_out:
            yield a.take(inn, 0), a.take(out, 0), b.take(inn, 0), b.take(out, 0), out
        else:
            yield a.take(inn, 0), a.take(out, 0), b.take(inn, 0), b.take(out, 0)

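# --- Added usage sketch (not part of the original module) --------------------
# One pass over the generator above: 3-block random crossvalidation of a
# small random X/Y pair, printing the calibration/validation shapes.
def _demo_pls_gen():
    X = random.rand(9, 4)
    Y = random.rand(9, 2)
    for Xin, Xout, Yin, Yout in pls_gen(X, Y, n_blocks=3):
        print Xin.shape, Xout.shape, Yin.shape, Yout.shape
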
def pca_gen(a, n_sets=None, center=False, index_out=False, axis=0):
    """PCA random block crossvalidation generator.
    """
    m = a.shape[axis]
    index = randperm(m)
    if n_sets==None:
        n_sets = m
    n_in_set = int(ceil(float(m)/n_sets)) # cast for use as slice bound
    out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_sets)]
    for out in out_ind_sets:
        inn = [i for i in index if i not in out]
        if center:
            a = a - mean(a, 0)[newaxis]
        if index_out:
            yield a.take(inn, 0), a.take(out, 0), out
        else:
            yield a.take(inn, 0), a.take(out, 0)

def w_pls_gen_jk(a, b, n_sets=None, center=True, index_out=False, axis=0):
    """Random block crossvalidation for wide X (m>>n).
    Leave-one-out is a subset, with n_sets equal to a.shape[-1].

    Returns : X_m and X_m'Y_m
    """
    m = a.shape[axis]
    ab = dot(a.T, b)
    index = randperm(m)
    if n_sets==None:
        n_sets = m
    n_in_set = int(ceil(float(m)/n_sets))
    out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_sets)]
    for out in out_ind_sets:
        inn = [i for i in index if i not in out]
        nin = len(inn)
        nout = len(out)
        a_in = a[inn,:]
        mn_a = 0
        mAB = 0

        if center:
            mn_a = mean(a, 0)[newaxis]
            mAin = dot(-ones((1,nout)), a[out,:])/nin
            mBin = dot(-ones((1,nout)), b[out,:])/nin
            mAB = dot(mAin.T, (mBin*nin))
        ab_in = ab - dot(a[out,:].T, b[out,:]) - mAB
        a_in = a_in - mn_a

        if index_out:
            yield a_in, ab_in, out
        else:
            yield a_in, ab_in

def shuffle_1d_block(a, n_sets=None, blocks=None, index_out=False, axis=0):
    """Random block shuffling along 1d axis.

    Returns : shuffled a along axis
    """
    m = a.shape[axis]
    if blocks==None:
        blocks = m
    for ii in xrange(n_sets):
        index = randperm(m)
        if blocks==m:
            a_out = a.take(index, axis)
        else:
            index = arange(m)
            # shuffle in place within each block
            dummy = map(random.shuffle, array_split(index, blocks))
            a_out = a.take(index, axis)
        if index_out:
            yield a_out, index
        else:
            yield a_out

def shuffle_1d(a, n_sets, axis=0):
    """Random shuffling along 1d axis.

    Returns : shuffled a along axis
    """
    m = a.shape[axis]
    for ii in xrange(n_sets):
        index = randperm(m)
        yield a.take(index, axis)

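# --- Added usage sketch (not part of the original module) --------------------
# shuffle_1d is the building block of a simple permutation test: shuffle the
# rows of one matrix while keeping the other fixed, collecting a null
# distribution of some statistic (here an arbitrary cross-product sum).
def _demo_permutation_null(a, b, n_sets=100):
    null = []
    for a_perm in shuffle_1d(a, n_sets):
        null.append((a_perm*b).sum()) # statistic under the null
    return null
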
def diag_pert(a, n_sets=10, center=True, index_out=False):
    """Alter generator returning sets perturbed with means at diagonals.

    input:
        a -- matrix, data
        n_sets -- scalar, number of perturbed sets (controls the portion of data perturbed)
    """
    m, n = a.shape
    tr = False
    if m>n:
        a = a.T
        m, n = a.shape
        tr = True
    if n_sets>m or n_sets>n:
        msg = "You may not use more subsets than max(n_rows, n_cols)"
        raise ValueError, msg
    nm = n*m
    start_inds = array_split(randperm(m), n_sets) # we use random start diags
    if center:
        a = a - mean(a, 0)[newaxis]
    for v in range(n_sets):
        a_out = a.copy()
        out = []
        for start in start_inds[v]:
            ind = arange(start+v, nm, n+1)
            [out.append(i) for i in ind]
            if center:
                a_out.put(a.mean(), ind)
            else:
                a_out.put(0, ind)
        if tr:
            a_out = a_out.T

        if index_out:
            yield a_out, asarray(out)
        else:
            yield a_out

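# --- Added usage sketch (not part of the original module) --------------------
# Which elements diag_pert touches: each set replaces a few wrap-around
# diagonals with the grand mean (center=True), and index_out returns their
# flat indices. Note that a_out.put above follows the old numpy
# put(values, indices) argument order this module was written against.
def _demo_diag_pert():
    a = arange(12.).reshape(3, 4)
    for a_out, ind in diag_pert(a, n_sets=2, center=True, index_out=True):
        print ind # flat indices of the perturbed elements
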
@ -0,0 +1,145 @@
from scipy import ones, mean, sqrt, dot, newaxis, zeros, sum, empty,\
     apply_along_axis, eye, kron
from scipy.linalg import triu, inv, svd, norm

from select_generators import w_pls_gen, w_pls_gen_jk, pls_gen, pca_gen, diag_pert
from engines import w_simpls, pls, bridge, pca
from pylab import *

def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'):
    """RMSEP calculation for pls with wide X.
    """
    k, l = Y.shape
    PRESS = zeros((l, amax+1), dtype='f')
    # X,Y are centered
    if n_blocks==None:
        n_blocks = Y.shape[0]
    V = w_pls_gen(dot(X, X.T), Y, n_blocks=n_blocks, center=True)
    for Din, Doi, Yin, Yout in V:
        ym = -sum(Yout, 0)[newaxis]/(1.0*Yin.shape[0])
        Yin = Yin - ym
        PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)
        if algo=='simpls':
            dat = w_simpls(Din, Yin, amax)
            Q, U, H = dat['Q'], dat['U'], dat['H']
            That = dot(Doi, dot(U, inv(triu(dot(H.T, U)))))
        else:
            # other algorithm support coming soon
            raise NotImplementedError
        #Yhat = empty((amax, k, l), dtype='<f8')
        Yhat = []
        for j in range(l):
            TQ = dot(That, triu(dot(Q[j,:][:,newaxis], ones((1,amax)))))
            E = Yout[:,j][:,newaxis] - TQ
            E = E + sum(E, 0)/Din.shape[0]
            PRESS[j,1:] = PRESS[j,1:] + sum(E**2, 0)
    #Yhat = Y - dot(That, Q.T)
    return sqrt(PRESS/Y.shape[0])

def pls_val(X, Y, amax=2, n_blocks=10, algo='pls'):
    """Validation results of pls model.
    """
    k, l = Y.shape
    PRESS = zeros((l, amax+1), dtype='<f8')
    EE = zeros((amax, k, l), dtype='<f8')
    Yhat = zeros((amax, k, l), dtype='<f8')
    # X,Y are centered
    V = pls_gen(X, Y, n_blocks=n_blocks, center=True, index_out=True)
    for Xin, Xout, Yin, Yout, out in V:
        ym = -sum(Yout, 0)[newaxis]/Yin.shape[0]
        Yin = (Yin - ym)
        PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)
        if algo=='pls':
            dat = pls(Xin, Yin, amax, mode='normal')
        elif algo=='bridge':
            dat = bridge(Xin, Yin, amax, mode='normal') # bridge is the imported engine

        for a in range(amax):
            Ba = dat['B'][a,:,:]
            Yhat[a,out,:] = dot(Xout, Ba)
            E = Yout - dot(Xout, Ba)
            EE[a,out,:] = E
            PRESS[:,a+1] = PRESS[:,a+1] + sum(E**2, 0)

    return sqrt(PRESS/(k-1.)), EE, Yhat

def pca_alter_val(a, amax, n_sets=10, method='diag'):
    """Pca validation by altering elements in X.
    """
    # todo: it is just as easy to do jk-estimates here as well
    V = diag_pert(a, n_sets, center=True, index_out=True)
    sep = empty((n_sets, amax), dtype='f')
    for i, (xi, ind) in enumerate(V):
        dat_i = pca(xi, amax, mode='detailed')
        Ti, Pi = dat_i['T'], dat_i['P']
        for j in xrange(amax):
            Xhat = dot(Ti[:,:j+1], Pi[:,:j+1].T)
            a_sub = a.ravel().take(ind)
            EE = a_sub - Xhat.ravel().take(ind)
            tot = (a_sub**2).sum()
            sep[i,j] = (EE**2).sum()/tot
    return sqrt(sep.mean(0))
    #return sep

def pca_cv_val(X, amax, n_sets):
    """Cross validation of pca using random sets crossvalidation.
    """
    m, n = X.shape
    xtot = (X**2).sum()
    V = pca_gen(X, n_sets=n_sets, center=True, index_out=True) # use the n_sets argument
    E = empty((amax, m, n), dtype='f')
    for xi, xout, ind in V:
        dat_i = pca(xi, amax, mode='detailed')
        Pi = dat_i['P']
        for a in xrange(amax):
            Pia = Pi[:,:a+1]
            E[a][ind,:] = (X[ind,:] - dot(xout, dot(Pia, Pia.T)))**2

    sep = []
    for a in xrange(amax):
        sep.append(E[a].sum()/xtot)
    return sqrt(asarray(sep)) # one value per number of components

def pls_jkW(a, b, amax, n_blocks=None, algo='pls', use_pack=True):
    """Returns CV-segments of parameter W for wide X.

    todo: add support for T, Q and B
    """
    if n_blocks == None:
        n_blocks = b.shape[0]

    WW = empty((n_blocks, a.shape[1], amax), dtype='f')

    if use_pack:
        u, s, inflater = svd(a, full_matrices=0)
        a = u*s
    V = pls_gen(a, b, n_blocks=n_blocks)
    for nn, (a_in, a_out, b_in, b_out) in enumerate(V):
        if algo=='pls':
            dat = pls(a_in, b_in, amax, 'loads', 'fast')
        elif algo=='bridge':
            dat = bridge(a_in, b_in, amax, 'loads', 'fast')
        W = dat['W']
        if use_pack:
            W = dot(inflater.T, W)
        WW[nn,:,:] = W

    return WW

def pca_jkP(a, aopt, n_blocks=None):
    """Returns CV-segments of parameter P.

    todo: add support for T
    fixme: more efficient to add this in validation loop
    """
    if n_blocks == None:
        n_blocks = a.shape[0]

    PP = empty((n_blocks, a.shape[1], aopt), dtype='f')
    V = pca_gen(a, n_sets=n_blocks, center=True)
    for nn, (a_in, a_out) in enumerate(V):
        dat = pca(a_in, aopt, mode='fast')
        P = dat['P']
        PP[nn,:,:] = P

    return PP

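# --- Added usage sketch (not part of the original module) --------------------
# The PP array above stacks one loading matrix per crossvalidation segment;
# its spread across segments is a simple jackknife uncertainty for each
# loading (the hotelling() helper in cx_stats builds confidence ellipses
# from the same kind of segments).
def _demo_loading_uncertainty(a, aopt=2):
    PP = pca_jkP(a, aopt, n_blocks=10)
    return PP.std(0) # (n_vars, aopt) spread of loadings over segments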