First import of chemometrics utils

Arnar Flatberg 2006-12-18 11:59:12 +00:00
parent fac9346aad
commit 3ef5522dd0
8 changed files with 2112 additions and 0 deletions

432
fluents/lib/blmfuncs.py Normal file

@ -0,0 +1,432 @@
"""This module contains bilinear models(Functions)
"""
import sys
# add library
sys.path.append('/home/flatberg/fluents/fluents/lib')
import time
from fluents.workflow import Function
from fluents.dataset import Dataset
from fluents import plots, dataset, workflow, logger
import scipy
from engines import *
from cx_stats import leverage, variances, hotelling, pls_qvals
from cx_utils import mat_center
from validation import *
import blmplots
import engines
class Model(Function):
"""Base class of bilinear models.
"""
def __init__(self,id='johndoe',name='JohnDoe'):
Function.__init__(self,id,name)
self.name = name
self._options = None
self._data = {}
self._dataset = {}
self._packers = {}
self.model = {}
def clear(self):
""" Clears model paramters
"""
self.model = {}
self._data = {}
self._packers = {}
class PCA(Model):
def __init__(self,id='pca',name='PCA'):
Model.__init__(self,id,name)
self._options = PcaOptions()
def pre_validation(self, amax, n_sets, val_engine):
"""Model calculations for maximum number of components.
"""
rmsep = val_engine(self.model['E0'], amax, n_sets)
self.model['rmsep'] = rmsep
self.model['aopt'] = rmsep.argmin()
def confidence(self, aopt, n_sets, alpha, p_center,
crot, strict, cov_center ):
"""Returns a confidence measure for model parameters.
Based on aopt.
"""
aopt = self.model['aopt']
jk_segments = pca_jkP(self.model['E0'], aopt, n_sets)
Pcal = self.model['P'][:,:aopt]
tsq = hotelling(jk_segments, Pcal, p_center,
cov_center, alpha, crot, strict)
self.model['p_tsq'] = tsq
def make_model(self, amax, mode, scale):
"""Model on optimal number of components.
"""
dat = pca(self.model['E0'], amax, scale, mode)
# explained variance
var_x, exp_var_x = variances(self.model['E0'], dat['T'], dat['P'])
dat['var_x'] = var_x
dat['exp_var_x'] = exp_var_x
#fixme###
do_lev_s = False
do_lev_v = False
#####
if do_lev_s:
# sample leverages
tnorm = scipy.apply_along_axis(norm, 0, dat['T']) # norm of T-columns
s_lev = leverage(amax, tnorm)
dat['s_lev'] = s_lev
if do_lev_v:
# variable leverages
v_lev = leverage(amax, dat['P'])
dat['v_lev'] = v_lev
self.model.update(dat)
def as_dataset(self, param, dtype='dataset'):
"""Return model parameter as Dataset.
"""
if not param in self.model.keys():
return
DX = self._dataset['X'] #input dataset
dim_name_0, dim_name_1 = DX.get_dim_name()
# samples
ids_0 = [dim_name_0, DX.get_identifiers(dim_name_0, sorted=True)]
# vars
ids_1 = [dim_name_1, DX.get_identifiers(dim_name_1, sorted=True)]
# components (hidden)
pc_ids = ['_comp_a', map(str,range(self.model['aopt'])) ]
pc_ids_opt = ['_comp_o', map(str, range(self.model['aopt'])) ]
zero_dim = ['_doe', ['0']] # null dim, vector (hidden)
match_ids = {'E':[ids_0, ids_1],
'E0':[ids_0, ids_1],
'P':[ids_1, pc_ids],
'T':[ids_0, pc_ids],
'W':[ids_1, pc_ids],
'p_tsq':[ids_1, zero_dim],
'rmsep':[pc_ids, zero_dim],
'var_leverages':[ids_1, zero_dim],
'sample_leverages':[pc_ids, zero_dim],
'exp_var_x': [pc_ids, zero_dim],
'var_x': [pc_ids, zero_dim],
}
out = Dataset(self.model[param], match_ids[param], name=param)
return out
def get_out_plots(self, options):
out=[]
for plt in options['out_plots']:
#try:
out.append(plt(self))
#except:
# print plt
            #logger.log('debug', 'Plot: %s failed' %plt)
return out
def run(self, data):
"""Run pca with present options.
"""
self.clear()
options = self._options
self._dataset['X'] = data
self._data['X'] = data.asarray().astype('<f8')
if options['center']:
center = options['center_mth']
self.model['E0'] = center(self._data['X'])
else:
self.model['E0'] = data.asarray()
self.pre_validation(**options.pre_validation_options())
self.make_model(**options.make_model_options())
if options['calc_conf']:
self.confidence(**options.confidence_options())
out = [self.as_dataset(p) for p in options['out_data']]
for plt in self.get_out_plots(options):
out.append(plt)
return out
class PLS(Model):
def __init__(self, id='pls', name='PLS'):
Model.__init__(self, id, name)
self._options = PlsOptions()
def pre_validation(self, amax, n_sets, val_engine):
"""Returns rmsec,rmsep for model.
"""
rmsep = val_engine(self.model['E0'], self.model['F0'],
amax, n_sets)
self.model['rmsep'] = rmsep.mean(0)
self.model['aopt'] = rmsep.mean(0).argmin()
def confidence(self, aopt, n_sets, alpha, p_center,
crot, strict, cov_center ):
"""Returns a confidence measure for model parameters
Supported parameters: W
"""
aopt = self.model['aopt']
jk_segments = pls_jkW(self.model['E0'], self.model['F0'],
aopt, n_sets)
Wcal = self.model['W'][:,:aopt]
        tsq = hotelling(jk_segments, Wcal, p_center,
                        cov_center, alpha, crot, strict)
self.model['w_tsq'] = tsq
    def permutation_confidence(self, a, b, aopt, reg, n_iter, algo,
                               sim_method):
        """Estimates significant variables by controlling the fdr."""
        qval, false_pos, TSQ, tsq_full = pls_qvals(a, b, aopt=aopt,
                                                   alpha=reg, n_iter=n_iter,
                                                   algo=algo,
                                                   sim_method=sim_method)
        self.model['qval'] = qval
def make_model(self, a, b, amax, scale, mode, engine):
"""Make model on amax components.
"""
dat = engine(a, b, amax, scale, mode)
self.model.update(dat)
def as_dataset(self, name, dtype='Dataset'):
"""Return any model parameter as Dataset
No ids matching
"""
if name not in self.model.keys():
return
DX, DY = self._dataset['X'], self._dataset['Y']
dim_name_0, dim_name_1 = DX.get_dim_name()
dim_name_2, dim_name_3 = DY.get_dim_name()
#samples
ids_0 = [dim_name_0, DX.get_identifiers(dim_name_0, sorted=True)]
# x vars
ids_1 = [dim_name_1, DX.get_identifiers(dim_name_1, sorted=True)]
# y vars
ids_3 = [dim_name_3, DY.get_identifiers(dim_name_3, sorted=True)]
# components (hidden)
pc_ids = ['_comp', map(str, range(self.model['aopt']))]
zero_dim = ['_doe',['0']] # null dim, vector (hidden)
match_ids = {'E':[ids_0, ids_1],
'P':[ids_1, pc_ids],
'T':[ids_0, pc_ids],
'W': [ids_1, pc_ids],
'R': [ids_1, pc_ids],
'Q':[ids_3, pc_ids],
'F':[ids_0, ids_3],
'B':[ids_1, ids_3],
'qval':[ids_1, zero_dim],
'qval_sorted':[ids_1, zero_dim],
'w_tsq':[ids_1, zero_dim],
'rmsep':[pc_ids, zero_dim],
}
array = self.model[name]
M = Dataset(array,identifiers=match_ids[name],name=name)
return M
def get_out_plots(self, options):
out=[]
for plt in options['out_plots']:
#try:
out.append(plt(self))
#except:
# logger.log('debug', 'Plot: %s failed' %plt)
return out
def run(self,a,b):
options = self._options
self._dataset['X'] = a
self._dataset['Y'] = b
self._data['X'] = a.asarray()
self._data['Y'] = b.asarray()
if options['center']:
self.model['E0'] = options['center_mth'](self._data['X'])
self.model['F0'] = options['center_mth'](self._data['Y'])
else:
self.model['E0'] = self._data['X']
self.model['F0'] = self._data['Y']
self.pre_validation(**options.pre_validation_options())
self.make_model(self.model['E0'], self.model['F0'],
**options.make_model_options())
# variance captured
var_x, exp_var_x = variances(self.model['E0'], self.model['T'], self.model['P'])
self.model['var_x'] = var_x
self.model['exp_var_x'] = exp_var_x
var_y, exp_var_y = variances(self.model['F0'], self.model['T'], self.model['Q'])
self.model['var_y'] = var_y
self.model['exp_var_y'] = exp_var_y
if options['calc_conf']:
self.confidence(**options.confidence_options())
out = [self.as_dataset(p) for p in options['out_data']]
for plt in self.get_out_plots(options):
out.append(plt)
return out
class Packer:
"""A compression object used to speed up model calculations.
    Often used in conjunction with crossvalidation and perturbation
    analysis.
"""
def __init__(self,array):
self._shape = array.shape
self._array = array
        self._packed_data = None
        self._inflater = None

    def expand(self, a):
        if self._inflater is not None:
            return dot(self._inflater, a)
def collapse(self,axis=None,mode='svd'):
        if axis is None:
            axis = scipy.argmin(self._array.shape) # default is the smallest dim
if axis == 1:
self._array = self._array.T
u,s,vt = svd(self._array,full_matrices=0)
self._inflater = vt.T
self._packed_data = u*s
return self._packed_data
def get_packed_data(self):
return self._packed_data
class Options(dict):
"""Options base class.
"""
def __init__(self, *args,**kw):
dict.__init__(self, *args, **kw)
def _copy_from_list(self, key_list):
d = {}
for key in key_list:
d[key] = self.get(key,None)
return d
class PcaOptions(Options):
"""Options for Principal Component Analysis.
"""
def __init__(self):
Options.__init__(self)
self._set_default()
def _set_default(self):
opt = {}
opt['algo'] = 'pca'
opt['engine'] = engines.pca
opt['mode'] = 'normal' # how much info to calculate
opt['lod'] = 'compact' # how much info to store
opt['amax'] = 5
opt['aopt'] = 5
opt['center'] = True
opt['center_mth'] = mat_center
opt['scale'] = 'scores'
opt['calc_conf'] = True
opt['n_sets'] = 5
opt['strict'] = True
opt['p_center'] = 'med'
opt['alpha'] = .8
opt['cov_center'] = 'med'
opt['crot'] = True
opt['val_engine'] = pca_alter_val
opt['val_n_sets'] = 10
opt['all_data'] = ['T','P','E','p_tsq','rmsep']
opt['all_plots'] = ['PcaScorePlot', 'PcaLoadingPlot',
'PcaRmsepPlot']
opt['out_data'] = ['T','P', 'p_tsq']
opt['out_plots'] = [blmplots.PcaScorePlot,blmplots.PcaLoadingPlot,blmplots.LineViewXc]
self.update(opt)
def make_model_options(self):
"""Options for make_model method."""
opt_list = ['scale','mode', 'amax']
return self._copy_from_list(opt_list)
def confidence_options(self):
"""Options for confidence method."""
opt_list = ['n_sets', 'aopt', 'alpha', 'p_center',
'strict', 'crot', 'cov_center']
return self._copy_from_list(opt_list)
def pre_validation_options(self):
"""Options for pre_validation method."""
opt_list = ['amax', 'n_sets', 'val_engine']
return self._copy_from_list(opt_list)
class PlsOptions(Options):
"""Options for Partial Least Squares Regression.
"""
def __init__(self):
Options.__init__(self)
self._set_default()
def _set_default(self):
opt = {}
opt['algo'] = 'pls'
opt['engine'] = engines.pls
opt['mode'] = 'normal' # how much info to calculate
opt['lod'] = 'compact' # how much info to store
opt['amax'] = 3
opt['aopt'] = 3
opt['center'] = True
opt['center_mth'] = mat_center
opt['scale'] = 'scores'
opt['calc_conf'] = True
opt['n_sets'] = 10
opt['strict'] = True
opt['p_center'] = 'med'
opt['alpha'] = .2
opt['cov_center'] = 'med'
opt['crot'] = True
opt['val_engine'] = w_pls_cv_val
opt['all_data'] = ['T','P','E','p_tsq','rmsep']
opt['all_plots'] = ['PcaScorePlot', 'PcaLoadingPlot',
'PcaRmsepPlot']
opt['out_data'] = []
opt['out_plots'] = [blmplots.PlsScorePlot,
blmplots.PlsLoadingPlot,
blmplots.LineViewXc]
#blmplots.PlsQvalScatter]
opt['pack'] = False
opt['calc_qvals'] = False
opt['q_pert_mth'] = 'shuffle_vars'
opt['q_iter'] = 20
self.update(opt)
def make_model_options(self):
"""Options for make_model method."""
opt_list = ['scale','mode', 'amax', 'engine']
return self._copy_from_list(opt_list)
def confidence_options(self):
"""Options for confidence method."""
opt_list = ['n_sets', 'aopt', 'alpha', 'p_center',
'strict', 'crot', 'cov_center']
return self._copy_from_list(opt_list)
def pre_validation_options(self):
"""Options for pre_validation method."""
opt_list = ['amax', 'n_sets', 'val_engine']
return self._copy_from_list(opt_list)
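
The Options classes above are plain dicts whose *_options() views feed the Model methods through keyword unpacking. A minimal, self-contained sketch of that pattern (toy names only, not part of the committed module):

class ToyOptions(dict):
    def _copy_from_list(self, key_list):
        d = {}
        for key in key_list:
            d[key] = self.get(key, None)
        return d
    def make_model_options(self):
        return self._copy_from_list(['scale', 'mode', 'amax'])

class ToyModel:
    def make_model(self, amax, mode, scale):
        # each method receives exactly the options it asked for
        print "fitting %s components (mode=%s, scale=%s)" % (amax, mode, scale)

opts = ToyOptions(amax=5, mode='normal', scale='scores', center=True)
ToyModel().make_model(**opts.make_model_options())
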

158
fluents/lib/blmplots.py Normal file

@ -0,0 +1,158 @@
"""Specialised plots for functions defined in blmfuncs.py.
fixme:
-- I'm normalising all color mapping input vectors to [0,1]. This will
   destroy informative numerical values in the colorbar (but we
   are not showing these anyway). A better fix would be to let the
   colorbar listen to the scalarmappable instance and correct itself, but
   I did not get that to work ...
fixme2:
-- If a scatterplot is not inited with a colorvector there will be no
   colorbar, but when adding colors the colorbar should be created.
"""
from fluents import plots
from scipy import dot,sum,diag,arange,log,mean,newaxis
from matplotlib import cm
class PcaScorePlot(plots.ScatterPlot):
"""PCA Score plot"""
def __init__(self, model, absi=0, ordi=1):
self._T = model.model['T']
dataset_1 = model.as_dataset('T')
dataset_2 = dataset_1
id_dim = dataset_1.get_dim_name(0)
sel_dim = dataset_1.get_dim_name(1)
id_1, = dataset_1.get_identifiers(sel_dim, [absi])
id_2, = dataset_1.get_identifiers(sel_dim, [ordi])
plots.ScatterPlot.__init__(self, dataset_1, dataset_2, id_dim, sel_dim, id_1, id_2 ,c='b' ,s=40 , name='pca-scores')
def set_absicca(self,n):
self.xaxis_data = self._T[:,n]
def set_ordinate(self,n):
self.yaxis_data = self._T[:,n]
class PcaLoadingPlot(plots.ScatterPlot):
"""PCA Loading plot"""
def __init__(self, model, absi=0, ordi=1):
self._P = model.model['P']
dataset_1 = model.as_dataset('P')
dataset_2 = dataset_1
id_dim = dataset_1.get_dim_name(0)
sel_dim = dataset_1.get_dim_name(1)
id_1, = dataset_1.get_identifiers(sel_dim, [absi])
id_2, = dataset_1.get_identifiers(sel_dim, [ordi])
if model.model.has_key('p_tsq'):
col = model.model['p_tsq'].ravel()
col = normalise(col)
else:
col = 'g'
        plots.ScatterPlot.__init__(self, dataset_1, dataset_2, id_dim, sel_dim, id_1, id_2, c=col, s=20, name='pca-loadings')
def set_absicca(self,n):
self.xaxis_data = self._P[:,n]
def set_ordinate(self,n):
self.yaxis_data = self._P[:,n]
class PlsScorePlot(plots.ScatterPlot):
"""PLS Score plot"""
def __init__(self,model, absi=0, ordi=1):
self._T = model.model['T']
dataset_1 = model.as_dataset('T')
dataset_2 = dataset_1
id_dim = dataset_1.get_dim_name(0)
sel_dim = dataset_1.get_dim_name(1)
id_1, = dataset_1.get_identifiers(sel_dim, [absi])
id_2, = dataset_1.get_identifiers(sel_dim, [ordi])
plots.ScatterPlot.__init__(self, dataset_1, dataset_2,
id_dim, sel_dim, id_1, id_2 ,
c='b' ,s=40 , name='pls-scores')
def set_absicca(self,n):
self.xaxis_data = self._T[:,n]
def set_ordinate(self,n):
self.yaxis_data = self._T[:,n]
class PlsLoadingPlot(plots.ScatterPlot):
"""PLS Loading plot"""
def __init__(self,model,absi=0,ordi=1):
self._P = model.model['P']
dataset_1 = model.as_dataset('P')
dataset_2 = dataset_1
id_dim = dataset_1.get_dim_name(0)
sel_dim = dataset_1.get_dim_name(1)
id_1, = dataset_1.get_identifiers(sel_dim, [absi])
id_2, = dataset_1.get_identifiers(sel_dim, [ordi])
if model.model.has_key('w_tsq'):
col = model.model['w_tsq'].ravel()
col = normalise(col)
else:
col = 'g'
plots.ScatterPlot.__init__(self, dataset_1, dataset_2,
id_dim, sel_dim, id_1, id_2,
c=col, s=20, name='loadings')
def set_absicca(self,n):
self.xaxis_data = self._P[:,n]
def set_ordinate(self,n):
        self.yaxis_data = self._P[:,n]
class LineViewXc(plots.LineViewPlot):
"""A line view of centered raw data
"""
def __init__(self, func_class, name='Profiles'):
# copy, center, plot
x = func_class._dataset['X'].copy()
x._array = x._array - mean(x._array,0)[newaxis]
plots.LineViewPlot.__init__(self, x, 1, None, name)
class ParalellCoordinates(plots.Plot):
"""Parallell coordinates for score loads with many comp.
"""
def __init__(self,model, p = 'loads'):
pass
class PlsQvalScatter(plots.ScatterPlot):
"""A vulcano like plot of loads vs qvals
"""
def __init__(self, func_class, pc=0):
model = func_class.model
if not model.has_key('w_tsq'):
return
self._W = model['P']
dataset_1 = func_class.as_dataset('P')
dataset_2 = func_class.as_dataset('w_tsq')
id_dim = dataset_1.get_dim_name(0) #genes
sel_dim = dataset_1.get_dim_name(1) #_comp
sel_dim_2 = dataset_2.get_dim_name(1) #_zero_dim
id_1, = dataset_1.get_identifiers(sel_dim, [0])
id_2, = dataset_2.get_identifiers(sel_dim_2, [0])
if model.has_key('w_tsq'):
col = model['w_tsq'].ravel()
col = normalise(col)
else:
col = 'g'
plots.ScatterPlot.__init__(self, dataset_1, dataset_2,
id_dim, sel_dim, id_1, id_2,
c=col, s=20, sel_dim_2=sel_dim_2,
name='Load Volcano')
class InfluencePlot(plots.ScatterPlot):
"""
"""
pass
def normalise(x):
"""Scale vector x to [0,1]
"""
x = x - x.min()
x = x/x.max()
return x
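
For reference, a small sketch of the colour mapping the loading plots rely on: the per-variable statistic is scaled to [0,1] (as normalise() does above) and handed to a matplotlib colormap. The data here are made up, and a 2006-era scipy/matplotlib is assumed:

from scipy import rand
from matplotlib import cm

tsq = 10*rand(50)        # stand-in for a model['w_tsq'] vector
col = tsq - tsq.min()
col = col/col.max()      # same scaling as normalise() above
rgba = cm.jet(col)       # one RGBA colour per variable, as in PlsLoadingPlot
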

256
fluents/lib/cx_stats.py Normal file

@ -0,0 +1,256 @@
from scipy import zeros,zeros_like,sqrt,dot,trace,sign,round_,argmax,\
sort,ravel,newaxis,asarray,diag,sum,outer,argsort,arange,ones_like,\
all,apply_along_axis,eye
from scipy.linalg import svd,inv,norm,det,sqrtm
from scipy.stats import mean,median
from cx_utils import mat_center
from validation import pls_jkW
from select_generators import shuffle_1d
from engines import *
import time
def hotelling(P, Pfull, p_center='med', cov_center='med',
alpha=0.3, crot=True, strict=False, metric=None):
"""Returns regularized hotelling T^2.
    alpha -- regularisation towards pooled covariance estimate
    p_center -- location method for submodels
    cov_center -- location method for sub-covariances
    crot -- rotate submodels towards the full model?
    strict -- only allow 90-degree rotations (flips/permutations)?
    metric -- inverse metric matrix (if P and Pfull from metric pca/pls)
"""
m, n = Pfull.shape
if metric==None:
metric = eye(m, dtype='<f8')
Pfull = dot(metric.T, asarray(Pfull))
n_sets,n,amax = P.shape
# allocate
T_sq = empty((n, ),dtype='f')
Cov_i = zeros((n, amax, amax),dtype='f')
# rotate sub_models to full model
if crot:
for i,Pi in enumerate(P):
Pi = dot(metric.T, Pi)
P[i] = procrustes(Pfull, Pi, strict=strict)
# center of pnull
if p_center=='med':
P_ctr = median(P, 0)
elif p_center=='mean':
# fixme: mean is unstable
P_ctr = mean(P, 0)
else: #use full
P_ctr = Pfull
for i in xrange(n):
Pi = P[:,i,:] # (n_sets x amax)
Pi_ctr = P_ctr[i,:] # (1 x amax)
Pim = (Pi - Pi_ctr[newaxis])*sqrt(n_sets-1)
Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim)
if cov_center == 'med':
Cov = median(Cov_i, 0)
else:
Cov = mean(Cov_i, 0)
reg_cov = (1. - alpha)*Cov_i + alpha*Cov
for i in xrange(n):
Pc = P_ctr[i,:][:,newaxis]
sigma = reg_cov[i]
#T_sq[i] = sqrt(dot(dot(Pc.T, inv(sigma)), Pc).ravel())
T_sq[i] = dot(dot(Pc.T, inv(sigma)), Pc).ravel()
return T_sq
def procrustes(A, B, strict=True, center=False, verbose=False):
"""Rotation of B to A.
strict -- Only do flipping and shuffling
center -- Center before rotation, translate back after
verbose -- Print ssq
No scaling calculated.
Output B_rot = Rotated B
"""
if center:
A,mn_A = mat_center(A, ret_mn=True)
B,mn_B = mat_center(B, ret_mn=True)
u,s,vh = svd(dot(B.T, A))
v = vh.T
Cm = dot(u, v.T) #orthogonal rotation matrix
if strict: # just inverting and flipping
Cm = ensure_strict(Cm)
b_rot = dot(B, Cm)
if verbose:
print Cm.round()
fit = sum(ravel(B - b_rot)**2)
print "Sum of squares: %s" %fit
if center:
return mn_B + b_rot
else:
return b_rot
def expl_var_x(X, T):
"""Returns explained variance of X."""
# centered X,Y
exp_var_x = diag(dot(T.T, T))*100/(sum(X**2))
return exp_var_x
def expl_var_y(Y, T, Q):
"""Returns explained variance of Y.
"""
# centered Y
exp_var_y = zeros((Q.shape[1], ))
for a in range(Q.shape[1]):
Ya = outer(T[:,a], Q[:,a])
exp_var_y[a] = 100*sum(Ya**2)/sum(Y**2)
return exp_var_y
def pls_qvals(a, b, aopt=None, alpha=.3,
n_iter=20, algo='pls',
sim_method='shuffle',
p_center='med', cov_center='med',
crot=True, strict=False, metric=None):
"""Returns qvals for pls model.
input:
a -- centered data matrix
b -- centered data matrix
aopt -- scalar, opt. number of components
alpha -- [0,1] regularisation parameter for T2-test
n_iter -- number of permutations
sim_method -- permutation method ['shuffle']
p_center -- location estimator for sub models ['med']
cov_center -- location estimator for covariance of submodels ['med']
crot -- bool, use rotations of sub models?
strict -- bool, use stict (rot/flips only) rotations?
metric -- bool, use row metric?
"""
m, n = a.shape
TSQ = zeros((n, n_iter), dtype='<f8') # (nvars x n_subsets)
n_false = zeros((n, n_iter), dtype='<f8')
#full model
if algo=='bridge':
dat = bridge(a, b, aopt, 'loads', 'fast')
else:
dat = pls(a, b, aopt, 'loads', 'fast')
W = pls_jkW(a, b, aopt, n_blocks=None, algo=algo)
tsq_full = hotelling(W, dat['W'], p_center=p_center,
alpha=alpha, crot=crot, strict=strict,
cov_center=cov_center, metric=metric)
t0 = time.time()
Vs = shuffle_1d(b, n_iter)
for i,b_shuff in enumerate(Vs):
t1 = time.time()
if algo=='bridge':
dat = bridge(a, b_shuff, aopt, 'loads','fast')
else:
            dat = pls(a, b_shuff, aopt, 'loads', 'fast')
W = pls_jkW(a, b_shuff, aopt, n_blocks=None, algo=algo)
TSQ[:,i] = hotelling(W, dat['W'],p_center=p_center,
alpha=alpha, crot=crot, strict=strict,
cov_center=cov_center, metric=metric)
print time.time() - t1
sort_index = argsort(tsq_full)[::-1]
back_sort_index = sort_index.argsort()
print time.time() - t0
# count false positives
tsq_full_sorted = tsq_full.take(sort_index)
for i in xrange(n_iter):
for j in xrange(n):
n_false[j,i] = sum(TSQ[:,i]>=tsq_full[j])
false_pos = median(n_false, 1)
ll = arange(1, len(false_pos)+1, 1)
sort_qval = false_pos.take(sort_index)/ll
qval = false_pos/ll.take(back_sort_index)
print time.time() - t0
return qval, false_pos, TSQ, tsq_full
def ensure_strict(C, only_flips=True):
"""Ensure that a rotation matrix does only 90 degree rotations.
In multiplication with pcs this allows flips and reordering.
    if only_flips is True there will only be flips allowed
"""
Cm = C
S = sign(C) # signs
if only_flips==True:
C = eye(Cm.shape[0])*S
return C
Cm = zeros_like(C)
Cm.putmask(1.,abs(C)>.6)
if det(Cm)>1:
raise ValueError,"Implement this!"
return Cm*S
def leverage(aopt=1,*args):
"""Returns leverages
input : aopt, number of components to base leverage calculations on
*args, matrices of normed blm-paramters
output: leverages
For PCA typical inputs are normalised T or normalised P
For PLSR typical inputs are normalised T or normalised W
"""
if aopt<1:
raise ValueError,"Leverages only make sense for aopt>0"
lev = []
for u in args:
lev_u = 1./u.shape[0] + dot(u[:,:aopt], u[:,:aopt].T).diagonal()
lev.append(lev_u)
return lev
def variances(a,t,p):
"""Returns explained variance and ind. var from blm-params.
input:
a -- full centered matrix
t,p -- parameters from a bilinear approx of the above matrix.
output:
var -- variance of each component
var_exp -- cumulative explained variance in percentage
Typical inputs are: X(centered),T,P for PCA or
X(centered),T,P / Y(centered),T,Q for PLSR.
"""
tot_var = sum(a**2)
var = 100*(sum(p**2, 0)*sum(t**2, 0))/tot_var
var_exp = cumsum(var)
return var, var_exp
def residual_diagnostics(Y, Yhat, aopt=1):
"""Root mean errors and press values.
R2 vals
"""
pass
def ssq(E, axis=0, weights=None):
"""Sum of squares, supports weights."""
n = E.shape[axis]
    if weights is None:
        weights = eye(n)
    else:
        weights = diag(weights)
if axis==0:
Ew = dot(weights, E)
elif axis==1:
Ew = dot(E, weights)
else:
raise NotImplementedError, "Higher order modes not supported"
return pow(Ew,2).sum(axis)
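
As a quick illustration of the procrustes/ensure_strict pair used by hotelling(): a jack-knife loading segment that comes back with a flipped component is rotated onto the full-model loadings before the covariances are formed. A minimal sketch, assuming fluents/lib is on sys.path and a 2006-era scipy:

from scipy import rand, dot, eye
from cx_stats import procrustes

P_full = rand(20, 3)              # full-model loadings: 20 variables, 3 components
flip = eye(3); flip[1,1] = -1.0   # second component sign-flipped
P_sub = dot(P_full, flip)         # a cross-validation segment of the loadings
P_rot = procrustes(P_full, P_sub, strict=True)
print abs(P_rot - P_full).max()   # ~0: the flip has been undone
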

108
fluents/lib/cx_utils.py Normal file

@ -0,0 +1,108 @@
from scipy import apply_along_axis,newaxis,zeros,\
     median,round_,nonzero,dot,argmax,any,sqrt,ndarray,\
     trace,zeros_like,sign,sort,real,argsort,rand,array,nan
from scipy.linalg import norm,svd,inv,eig
from scipy.stats import median,mean
def normalise(a,axis=0,return_scales=False):
s = apply_along_axis(norm,axis,a)
if axis==0:
s = s[newaxis]
else:
s = s[:,newaxis]
a_s = a/s
if return_scales:
return a_s,s
return a_s
def sub2ind(shape,i,j):
"""Indices from subscripts. Only support for 2d"""
row,col = shape
ind = []
for k in xrange(len(i)):
for m in xrange(len(j)):
ind.append(i[k]*col + j[m])
return ind
def sorted_eig(a, b=None,sort_by='sm'):
"""
Just eig with real part of output sorted:
This is for convenience only, not general!
sort_by='sm': return the eigenvectors by eigenvalues
of smallest magnitude first. (default)
'lm': returns largest eigenvalues first
output: just as eig with 2 outputs
-- s,v (eigvals,eigenvectors)
(This is reversed output compared to matlab)
"""
s,v = eig(a,b)
s = real(s) # dont expect any imaginary part
v = real(v)
ind = argsort(s)
if sort_by=='lm':
ind = ind[::-1]
v = v.take(ind,1)
s = s.take(ind)
return s,v
def str2num(string_number):
"""Convert input (string number) into number, if float(string_number) fails, a nan is inserted.
"""
missings = ['','nan','NaN','NA']
try:
num = float(string_number)
except:
if string_number in missings:
num = nan
else:
print "Found strange entry: %s" %string_number
raise
return num
def randperm(n):
r=rand(n)
dict={}
for i in range(n):
dict[r[i]]=i
r=sort(r)
out=zeros(n)
for i in range(n):
out[i]=dict[r[i]]
return array(out,dtype='i')
def mat_center(X,axis=0,ret_mn=False):
"""Mean center matrix along axis.
X -- matrix, data
axis -- dim,
ret_mn -- bool, return mean
output:
Xc, [mnX]
    NB: axis=0 subtracts the column means (the default),
    axis=1 subtracts the row means
"""
    try:
        rows,cols = X.shape
    except ValueError:
        print "The X data needs to be two-dimensional"
        raise
if axis==0:
mnX = mean(X,axis)[newaxis]
Xs = X - mnX
elif axis==1:
mnX = mean(X,axis)[newaxis]
Xs = (X.T - mnX).T
if ret_mn:
return Xs,mnX
else:
return Xs
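
A short usage sketch for mat_center(), the centering helper the bilinear models call before fitting (assumes fluents/lib is on sys.path and a 2006-era scipy):

from scipy import rand
from cx_utils import mat_center

X = rand(10, 4)
Xc, mn = mat_center(X, axis=0, ret_mn=True)
print Xc.mean(0)   # column means are now ~0
print mn.shape     # (1, 4): the means that were removed
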

200
fluents/lib/engines.py Normal file

@ -0,0 +1,200 @@
"""Module contain algorithms for (burdensome) calculations.
There is no typechecking of any kind here, just focus on speed
"""
from scipy.linalg import svd,norm,inv,pinv,qr
from scipy import dot,empty,eye,newaxis,zeros,sqrt,diag,\
apply_along_axis,mean,ones,randn,empty_like,outer,c_,\
rand,sum,cumsum
def pca(a, aopt, scale='scores', mode='normal'):
""" Principal Component Analysis model
mode:
-- fast : returns smallest dim scaled (T for n<=m, P for n>m )
-- normal : returns all model params and residuals after aopt comp
-- detailed : returns all model params and all residuals
"""
m,n = a.shape
u,s,vt = svd(a, full_matrices=0)
T = u*s
T = T[:,:aopt]
P = vt[:aopt,:].T
if scale=='loads':
tnorm = apply_along_axis(norm, 0, T)
T = T/tnorm
P = P*tnorm
if mode == 'fast':
return {'T':T, 'P':P}
if mode=='detailed':
"""Detailed mode returns residual matrix for all comp.
That is E, is a three-mode matrix: (amax, m, n) """
E = empty((aopt, m, n))
for ai in range(aopt):
e = a - dot(T[:,:ai+1], P[:,:ai+1].T)
E[ai,:,:] = e.copy()
else:
E = a - dot(T,P.T)
return {'T':T, 'P':P, 'E':E}
def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
"""Kernel pls for tall/wide matrices.
Fast pls for calibration. Only inefficient for many Y-vars.
"""
m,n = a.shape
if ab!=None:
mm,l = ab.shape
else:
k,l = b.shape
W = empty((n, aopt))
P = empty((n, aopt))
R = empty((n, aopt))
Q = empty((l, aopt))
T = empty((m, aopt))
B = empty((aopt, n, l))
if ab==None:
ab = dot(a.T, b)
for i in range(aopt):
if ab.shape[1]==1:
w = ab
else:
u,s,vh = svd(dot(ab.T, ab))
w = dot(ab,u[:,:1])
w = w/norm(w)
r = w.copy()
if i>0:
for j in range(0,i,1):
r = r - dot(P[:,j].T, w)*R[:,j][:,newaxis]
t = dot(a, r)
tt = norm(t)**2
p = dot(a.T, t)/tt
q = dot(r.T, ab).T/tt
ab = ab - dot(p, q.T)*tt
T[:,i] = t.ravel()
W[:,i] = w.ravel()
P[:,i] = p.ravel()
R[:,i] = r.ravel()
if mode=='fast' and i==aopt-1:
if scale=='loads':
tnorm = apply_along_axis(norm, 0, T)
T = T/tnorm
W = W*tnorm
return {'T':T, 'W':W}
Q[:,i] = q.ravel()
B[i] = dot(R[:,:i+1], Q[:,:i+1].T)
if mode=='detailed':
E = empty((aopt, m, n))
F = empty((aopt, k, l))
for i in range(1,aopt+1,1):
E[i-1] = a - dot(T[:,:i],P[:,:i].T)
F[i-1] = b - dot(T[:,:i],Q[:,:i].T)
else:
E = a - dot(T[:,:aopt], P[:,:aopt].T)
F = b - dot(T[:,:aopt], Q[:,:aopt].T)
if scale=='loads':
tnorm = apply_along_axis(norm, 0, T)
T = T/tnorm
W = W*tnorm
Q = Q*tnorm
P = P*tnorm
return {'B':B, 'Q':Q, 'P':P, 'T':T, 'W':W, 'R':R, 'E':E, 'F':F}
def w_simpls(aat, b, aopt):
""" Simpls for wide matrices.
Fast pls for crossval, used in calc rmsep for wide X
There is no P,W. T is normalised
"""
bb = b.copy()
m,m = aat.shape
U = empty((m, aopt))
T = empty((m, aopt))
H = empty((m, aopt)) #just like W in simpls
PROJ = empty((m, aopt)) #just like R in simpls
for i in range(aopt):
u,s,vh = svd(dot(dot(b.T, aat), b), full_matrices=0)
u = dot(b, u[:,:1]) #y-factor scores
U[:,i] = u.ravel()
t =dot(aat, u)
t = t/norm(t)
T[:,i] = t.ravel()
h = dot(aat, t) #score-weights
H[:,i] = h.ravel()
PROJ[:,:i+1] = dot(T[:,:i+1], inv(dot(T[:,:i+1].T, H[:,:i+1])) )
if i<aopt:
b = b - dot(PROJ[:,:i+1], dot(H[:,:i+1].T,b) )
C = dot(bb.T, T)
return {'T':T,'U':U,'Q':C,'H':H}
def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
"""Undeflated Ridged svd(X'Y)
"""
m, n = a.shape
k, l = b.shape
u,s,vt = svd(b, full_matrices=0)
g0 = dot(u*s, u.T)
g = (1 - r)*g0 + r*eye(m)
ag = dot(a.T, g)
u,s,vt = svd(ag, full_matrices=0)
W = u[:,:aopt]
K = vt[:aopt,:].T
T = dot(a, W)
tnorm = apply_along_axis(norm, 0, T) # norm of T-columns
if mode == 'fast':
if scale=='loads':
T = T/tnorm
W = W*tnorm
return {'T':T, 'W':W}
U = dot(g0, K) #fixme check this
Q = dot(b.T, dot(T, inv(dot(T.T,T)) ))
B = zeros((aopt, n, l))
for i in range(aopt):
B[i] = dot(W[:,:i+1], Q[:,:i+1].T)
# leverages
# fixme: probably need an orthogonal basis for row-space leverage
# T (scores) are not orthogonal
# Using a qr decomp to get an orthonormal basis for row-space
#Tq = qr(T)[0]
#s_lev,v_lev = leverage(aopt,Tq,W)
# explained variance
#var_x, exp_var_x = variances(a,T,W)
#qnorm = apply_along_axis(norm, 0, Q)
#var_y, exp_var_y = variances(b,U,Q/qnorm)
if mode == 'detailed':
E = empty((aopt, m, n))
F = empty((aopt, k, l))
for i in range(aopt):
E[i] = a - dot(T[:,:i+1], W[:,:i+1].T)
F[i] = b - dot(a, B[i])
else: #normal
F = b - dot(a, B[-1])
E = a - dot(T, W.T)
if scale=='loads':
T = T/tnorm
W = W*tnorm
Q = Q*tnorm
return {'B':B, 'W':W, 'T':T, 'Q':Q, 'E':E, 'F':F, 'U':U, 'P':W}
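
A minimal sketch of the pca engine on random data, checking the basic bilinear identity X = T P' + E (assumes fluents/lib is on sys.path and a 2006-era scipy):

from scipy import rand, dot
from engines import pca

X = rand(15, 8)
X = X - X.mean(0)                       # the engines expect centered data
dat = pca(X, 3, scale='scores', mode='normal')
T, P, E = dat['T'], dat['P'], dat['E']
print T.shape, P.shape                  # (15, 3) (8, 3)
print abs(X - dot(T, P.T) - E).max()    # ~0 by construction
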

626
fluents/lib/nx_utils.py Normal file

@ -0,0 +1,626 @@
import os,sys
from itertools import izip
import networkx as NX
from scipy import shape,diag,dot,asarray,sqrt,real,zeros,eye,exp,maximum,\
     outer,sum,mean,array,ravel
from scipy.linalg import eig,svd,inv,expm,norm
from cx_utils import sorted_eig
import numpy
eps = numpy.finfo(float).eps.item()
feps = numpy.finfo(numpy.single).eps.item()
_array_precision = {'f': 0, 'd': 1, 'F': 0, 'D': 1,'i': 1}
def xgraph_to_graph(G):
"""Convert an Xgraph to an ordinary graph.
Edge attributes, mult.edges and self-loops are lost in the process.
"""
GG = NX.convert.from_dict_of_lists(NX.convert.to_dict_of_lists(G))
return GG
def get_affinity_matrix(G, data, ids, dist='e', mask=None, weight=None, t=0, out='dist'):
"""
Function for calculating a general affinity matrix, based upon distances.
    Affinity = 1 - distance ((10-1), 1 is far apart)
INPUT
data:
gene expression data, type dict data[gene] = expression-vector
G:
The network (networkx.base.Graph object)
mask:
The array mask shows which data are missing. If mask[i][j]==0, then
data[i][j] is missing.
weights:
The array weight contains the weights to be used when calculating distances.
transpose:
If transpose==0, then genes are clustered. If transpose==1, microarrays are
clustered.
dist:
The character dist defines the distance function to be used:
dist=='e': Euclidean distance
dist=='b': City Block distance
dist=='h': Harmonically summed Euclidean distance
dist=='c': Pearson correlation
dist=='a': absolute value of the correlation
dist=='u': uncentered correlation
dist=='x': absolute uncentered correlation
dist=='s': Spearman's rank correlation
dist=='k': Kendall's tau
For other values of dist, the default (Euclidean distance) is used.
OUTPUT
D :
    Similarity matrix (nGenes x nGenes), symmetric, d_ij in [0,1]
Normalized so max weight = 1.0
"""
try:
from Bio import Cluster as CLS
except:
raise ValueError, "Need installed biopython"
nVar = len(data)
nSamp = len(data[data.keys()[0]])
    X = zeros((nVar, nSamp), dtype='<f8')
    for i,gene in enumerate(ids): # this should be right!!
X[i,:] = data[gene]
#X = transpose(X) # distancematrix needs matrix as (nGenes,nSamples)
D_list = CLS.distancematrix(X, dist=dist)
D = zeros((nVar,nVar),dtype='<f8')
for i,row in enumerate(D_list):
if i>0:
D[i,:len(row)]=row
D = D + D.T
MAX = 30.0
D_max = max(ravel(D))/MAX
D_n = D/D_max #normalised (max = 10.0)
D_n = (MAX+1.) - D_n #using correlation (inverse distance for dists)
A = NX.adj_matrix(G, nodelist=ids)
if out=='dist':
return D_n*A
elif out=='heat_kernel':
t=1.0
K = exp(-t*D*A)
return K
elif out=='complete':
return D_n
else:
return []
def remove_one_degree_nodes(G, iter=True):
"""Removes all nodes with only one neighbour. These nodes does
not contribute to community structure.
input:
G -- graph
iter -- True/False iteratively remove?
"""
G_copy = G.copy()
if iter==True:
while 1:
bad_nodes=[]
for node in G_copy.nodes():
if len(G_copy.neighbors(node))==1:
bad_nodes.append(node)
if len(bad_nodes)>0:
G_copy.delete_nodes_from(bad_nodes)
else:
break
else:
bad_nodes=[]
        for node in G_copy.nodes():
            if len(G_copy.neighbors(node))==1:
bad_nodes.append(node)
if len(bad_nodes)>0:
G_copy.delete_nodes_from(bad_nodes)
print "Deleted %s nodes from network" %(len(G)-len(G_copy))
return G_copy
def key_players(G, n=1, with_labels=False):
"""
    Resilience measure
Identification of key nodes by fraction of nodes in
disconnected subgraph when the node is removed.
output:
fraction of nodes disconnected when node i is removed
"""
i=0
frac=[]
labels = {}
for node in G.nodes():
i+=1
print i
T = G.copy()
T.delete_node(node)
n_nodes = T.number_of_nodes()
sub_graphs = NX.connected_component_subgraphs(T)
n = len(sub_graphs)
if n>1:
strong_comp = sub_graphs[0]
fraction = 1.0 - 1.0*strong_comp.number_of_nodes()/n_nodes
frac.append(fraction)
labels[node]=fraction
else:
frac.append(0.0)
labels[node]=0.0
out = 1.0 - array(frac)
if with_labels==True:
return out,labels
else:
return out
def node_weighted_adj_matrix(G, weights=None, ave_type='harmonic', with_labels=False):
"""Return a weighted adjacency matrix of graph. The weights are
node weights.
input: G -- graph
weights -- dict, keys: nodes, values: weights
with_labels -- True/False, return labels?
    output: A -- weighted adjacency matrix
[index] -- node labels
"""
n=G.order()
    # make a dictionary that maps vertex name to position
index={}
count=0
for node in G.nodes():
index[node]=count
count = count+1
a = zeros((n,n))
    if G.__class__.__name__ == 'XGraph':
        raise TypeError, "Expected an ordinary Graph, not an XGraph"
for head,tail in G.edges():
if ave_type == 'geometric':
a[index[head],index[tail]]= sqrt(weights[head]*weights[tail])
a[index[tail],index[head]]= a[index[head],index[tail]]
        elif ave_type == 'harmonic':
            # harmonic mean of the two node weights
            a[index[head],index[tail]] = 2.*weights[head]*weights[tail]/(weights[head] + weights[tail])
            a[index[tail],index[head]] = a[index[head],index[tail]]
if with_labels:
return a,index
else:
return a
def weighted_adj_matrix(G, with_labels=False):
"""Adjacency matrix of an XGraph whos weights are given in edges.
"""
A,labels = NX.adj_matrix(G,with_labels=True)
W = A.astype('<f8')
for orf,i in labels.items():
for orf2,j in labels.items():
if G.has_edge(orf,orf2):
edge_weight = G.get_edge(orf,orf2)
W[i,j]=edge_weight
W[j,i]=edge_weight
if with_labels==True:
return W,labels
else:
return W
def assortative_index(G):
"""Ouputs two vectors: the degree and the neighbor average degree.
Used to measure the assortative mixing. If the average degree is
pos. correlated with the degree we know that hubs tend to connect
to other hubs.
input: G, graph connected!!
    output: d, mn_d: degree, and average degree of neighbours
(degree sorting from degree(with_labels=True))
"""
d = G.degree(with_labels=True)
out=[]
for node in G.nodes():
nn = G.neighbors(node)
if len(nn)>0:
nn_d = mean([float(d[i]) for i in nn])
out.append((d[node], nn_d))
return array(out).T
def struct_equivalence(G,n1,n2):
"""Returns the structural equivalence of a node pair. Two nodes
are structural equal if they share the same neighbors.
x_s = [ne(n1) union ne(n2) - ne(n1) intersection ne(n2)]/[ne(n1)
union ne(n2) + ne(n1) intersection ne(n2)]
ref: Brun et.al 2003
"""
#[ne(n1) union ne(n2) - ne(n1) intersection ne(n2
s1 = set(G.neighbors(n1))
s2 = set(G.neighbors(n2))
num_union = len(s1.union(s2))
num_intersection = len(s1.intersection(s2))
    if num_union == 0:
        # both neighbour sets empty: avoid division by zero
        xs = 0.0
    else:
        xs = float(num_union - num_intersection)/(num_union + num_intersection)
return xs
def struct_equivalence_all(G):
"""Not finnished.
"""
A,labels = NX.adj_matrix(G,with_labels=True)
pass
def hamming_distance(n1,n2):
"""Not finnsihed.
"""
pass
def graph_corrcoeff(G):
"""Not finnished.
"""
A,index = NX.adj_matrix(G,with_labels=True)
#C = zeros(*A.shape(),'d')
n = 1.*G.number_of_nodes()
for node in G.nodes():
a_j = A[index[node],:] #neighbors
mean_a = sum(a_j)/n# degree(G)/number_of_nodes()
var_a = sqrt(sum((a_j - mean_a)**2)/n)
pass
def graph_and_data_intersection(data, graph, pathways=None,
keep_connected=True):
"""Returns the intersection of keys in two dictionaries.
    NB: keep track of identifier sorting after these dict transforms.
input:
data -- dict, keys: gene id, value: measurement profile
graph -- networkx,base.graph, full graph
pathways -- dict, keys: pathway name, values: nodes in pathway
call:
new_data, new_graph,pathways = graph_and_data_intersection(data,graph,pathways,keep_connected=True)
"""
new_graph = graph.copy()
new_data = {}
new_pathways = {}
graph_set = set(graph.nodes())
data_set = set(data.keys())
intersection = data_set & graph_set
new_graph.delete_nodes_from(graph_set - data_set) #remove difference
for k in intersection:
new_data[k] = data[k]
if keep_connected:
max_iter = 0
        sub_graphs = NX.connected_component_subgraphs(new_graph)
        new_graph = sub_graphs[0] # keep the largest connected component
old_data = new_data
while new_graph.number_of_nodes() != len(new_data) and max_iter<100:
max_iter+=1
            graph_set = set(new_graph.nodes())
            data_set = set(new_data.keys())
intersection = data_set & graph_set
new_graph.delete_nodes_from(graph_set - data_set)
new_data={}
for k in intersection:
new_data[k] = old_data[k]
old_data = new_data.copy()
new_graph = NX.connected_component_subgraphs(new_graph)[0]
if pathways!=None:
for pth,nodes in pathways.items():
new_pathways[pth] = [node for node in nodes if node in new_graph]
print "\nSUMMARY (graph_and_data_intersection): "
print "Number of input variables: %s\n\
Number nodes in input graph: %s" %(len(data),len(graph))
print "\nUsing intersection of connected graph and nodes with data values"
print "Number of variables is now: %s" %len(new_data)
print "Number of nodes in graph: %s" %new_graph.number_of_nodes()
if pathways!=None:
return new_data,new_graph,new_pathways
else:
return new_data,new_graph
def rx_graph_and_data_intersection(graph,node_data,pathways,data,keep_connected=False):
"""Returns a (connected) reaction graph with present gene expression data.
keep_connected==True:
When a node (gene) is not present in our expression data, the node
is deleted and all neighbors are connected with edge weight=0.5
    if they are not already neighbors.
input:
data -- dict, keys: gene id, value: measurement profile
    graph -- networkx.xbase.xgraph, full weighted graph
node_data -- dict, keys: rx id, value: set of gene_ids
    pathways -- dict, keys: pathway name, values: list of nodes in pathway
"""
# We do not connect the full graph ... may be performed by using the reference graph?
graph = NX.connected_component_subgraphs(graph)[0] #largest connected component
new_graph = graph.copy()
new_data = {}
new_node_data = node_data.copy()
new_pathways = {}
genes_in_graph=set()
genes_in_data = set(data.keys())
rx_in_graph = set(new_graph.nodes())
# genes in graph nodes (rx_nodes)
for rx in rx_in_graph:
genes_in_graph.update(set(new_node_data.get(rx)))
keep_genes = genes_in_data.intersection(genes_in_graph) #both in graph and data
#update node data
for rx,genes in node_data.items(): # delete node data of nodes not present in graph
genes = set(genes)
genes.intersection_update(keep_genes) #remove genes if they are not in inters.
if len(genes)==0 or rx not in rx_in_graph: #no gene data or not in graph
print "removing: " + str(rx)
del new_node_data[rx]
rx_in_data= set(new_node_data.keys())
rx_intersection = rx_in_data.intersection(rx_in_graph)
for gene in keep_genes:
new_data[gene] = data.get(gene)
# update pathways nodes
for pth,genes in pathways.items():
if genes:
genes = set(genes)
genes.intersection_update(keep_genes) # gene needs to have data
else:
pass
new_pathways[pth] = genes
bad_nodes = rx_in_graph.difference(rx_in_data) #in graph but no data
if keep_connected==True:
dummy = new_graph.copy()
for rx in bad_nodes:
dummy.delete_node(rx)
if len(NX.connected_component_subgraphs(dummy))>1:
nghbrs = new_graph.neighbors(rx)
for i in nghbrs:
for j in nghbrs:
if i!=j:
if not new_graph.has_edge(i,j):
new_graph.add_edge(i,j,0.5)
#update graph
new_graph.delete_nodes_from(list(bad_nodes))
return new_graph,new_node_data,new_pathways,new_data
def weighted_laplacian(G,with_labels=False):
"""Return standard Laplacian of graph from a weighted adjacency matrix."""
n= G.order()
    I = eye(n)
    A = weighted_adj_matrix(G)
    D = I*sum(A, 0)
L = D-A
if with_labels:
A,index = weighted_adj_matrix(G, with_labels=True)
return L, index
else:
return L
"""Below are methods for calculating graph metrics
Four main decompositions :
0.) Adjacency diffusion kernel expm(A),
1.) von neumann kernels (diagonalisation of adjacency matrix)
2.) laplacian kernels (geometric series of adj.)
3.) diffusion kernels (exponential series of adj.)
---- Kv
von_neumann : Kv = (I-alpha*A)^-1 (mod: A(I-alpha*A)^-1)? ,
geom. series
---- Kl
laplacian: Kl = (I-alpha*L)^-1 , geom. series
---- Kd
laplacian_diffusion: Kd = expm(-alpha*L)
exp. series
---- Ke
Exponential diffusion.
Ke = expm(A) .... expm(-A)?
"""
# TODO:
# check for numerical unstable eigenvalues and set to zero
# otherwise some inverses will explode -> ok, using pinv for inverses
#
# This gives results that look numerical unstable
#
# -- divided adj by sum(A[:]), check this one (paper by Lebart scales with number of edges)
#
#
#
# the neumann kernel is defined in Kandola to be K = A*(I-A)^-1
# lowest eigenvectors are same as the highest of K = A*A ?
# this needs clarification
# diffusion is still wrong! ... ok
# diff needs normalisation?! check the meaning of exp(-s) = exp(1/s) -L = 1/degree ... etc
# Is it the negative of exp. of adj. metrix in Kandola?
#
# Normalised=False returns only nans (no idea why!!) ... fixed ok
# 31.1: diff is ok exp(0)=1 not zero!
# 07.03.2005: normalisation is ok: -> normalisation will emphasize high degree nodes
# 10.03.2005: symeig is unstable and returns nans for some eigenvectors? switching back to eig
# 14.05.2006: diffusion returns negative values, using expm(-LL) instead (FIX)
# 13.09.2006: update for use in numpy
def K_expAdj(W, normalised=False, alpha=1.0):
"""Matrix exponential of adjacency matrix, mentioned in Kandola as a general diffusion kernel.
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m,n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
if normalised==True:
T = diag( sqrt( 1./(sum(W,0))) )
W = dot(dot(T, W), T)
e,vr = eig(W)
s = real(e)**2 # from eigenvalues to singularvalues
vri = inv(vr)
s = maximum.reduce(s) + s
cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = abs(cond*maximum.reduce(s))
psigma = eye(m)
for i in range(len(s)):
if abs(s[i]) > cutoff:
psigma[i,i] = .5*alpha*exp(s[i])
return dot(dot(vr,psigma),vri)
def K_vonNeumann(W,normalised=False,alpha=1.0):
""" The geometric series of path lengths.
Returns matrix square root of pseudo inverse of the adjacency matrix.
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m,n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
if normalised==True:
T = diag(sqrt(1./(sum(W,0))))
W = dot(dot(T,W),T)
e,vr = eig(W)
vri = inv(vr)
e = real(e) # we only work with real pos. eigvals
e = maximum.reduce(e) + e
cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = cond*maximum.reduce(e)
psigma = zeros((m,n),t)
for i in range(len(e)):
if e[i] > cutoff:
psigma[i,i] = 1.0/e[i] #these are eig.vals (=sqrt(sing.vals))
return dot(dot(vr,psigma),vri).astype(t)
def K_laplacian(W, normalised=True, alpha=1.0):
""" This is the matrix square root of the pseudo inverse of L.
    Also known as the average commute time matrix.
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m,n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
D = diag(sum(W,0))
L = D - W
if normalised==True:
T = diag(sqrt(1./sum(W,0)))
L = dot(dot(T,L),T)
e,vr = eig(L)
e = real(e)
vri = inv(vr)
cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = cond*maximum.reduce(e)
psigma = zeros((m,),t) # if s close to zero -> set 1/s = 0
for i in range(len(e)):
if e[i] > cutoff:
psigma[i] = 1.0/e[i]
K = dot(dot(vr,diag(psigma)),vri).astype(t)
K = real(K)
I = eye(n)
K = (1-alpha)*I + alpha*K
return K
def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5):
"""Returns diffusion kernel.
input:
-- W, adj. matrix
-- normalised [True/False]
-- alpha, [0,1] (degree of network influence)
-- beta, [0->), (diffusion degree)
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m,n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
D = diag(sum(W,0))
L = D-W
if normalised==True:
T = diag(sqrt(1./(sum(W,0))))
L = dot(dot(T,L),T)
e,vr = eig(L)
vri = inv(vr) #inv
cond = 1.0*{0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = 1.*abs(cond*maximum.reduce(e))
psigma = eye(m) # if sing vals are 0 exp(0)=1 (unnecessary)
#psigma = zeros((m,n), dtype='<f8')
for i in range(len(e)):
if abs(e[i]) > cutoff:
psigma[i,i] = exp(-beta*e[i])
K = real(dot(dot(vr, psigma), vri))
I = eye(n, dtype='<f8')
K = (1. - alpha)*I + alpha*K
return K
def K_modularity(W,alpha=1.0):
""" Returns the matrix square root of Newmans modularity."""
W = asarray(W)
t = W.dtype.char
m, n = W.shape
d = sum(W, 0)
m = 1.*sum(d)
B = W - (outer(d, d)/m)
s,v = sorted_eig(B, sort_by='lm')
psigma = zeros( (n, n), dtype='<f8' )
for i in range(len(s)):
if s[i]>1e-7:
psigma[i,i] = sqrt(s[i])
#psigma[i,i] = s[i]
K = dot(dot(v, psigma), v.T)
I = eye(n)
K = (1 - alpha)*I + alpha*K
return K
def kernel_score(K, W):
"""Returns the modularity score.
K -- (modularity) kernel
W -- adjacency matrix (possibly weighted)
"""
# normalize W (: W'W=I)
m, n = shape(W)
for i in range(n):
W[:,i] = W[:,i]/norm(W[:,i])
score = diag(dot(W, dot(K, W)) )
tot = sum(score)
return score, tot
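
The graph kernels above all accept a plain (weighted) adjacency matrix, so they can be tried without building an NX graph first. A small sketch on a 4-node path graph (assumes fluents/lib is on sys.path, networkx 0.x installed, and a 2006-era scipy):

from scipy import array
from nx_utils import K_laplacian, K_modularity

# adjacency matrix of the path graph 1-2-3-4
W = array([[0., 1., 0., 0.],
           [1., 0., 1., 0.],
           [0., 1., 0., 1.],
           [0., 0., 1., 0.]])
Kl = K_laplacian(W, normalised=True, alpha=1.0)
Km = K_modularity(W, alpha=1.0)
print Kl.shape, Km.shape   # both (4, 4)
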

187
fluents/lib/select_generators.py Normal file

@ -0,0 +1,187 @@
"""Matrix cross validation selection generators
"""
from scipy import take,arange,ceil,repeat,newaxis,mean,asarray,dot,ones,\
random,array_split,floor,vstack,asarray,minimum
from cx_utils import randperm
def w_pls_gen(aat,b,n_blocks=None,center=True,index_out=False):
"""Random block crossvalidation for wide (XX.T) trick in PLS.
Leave-one-out is a subset, with n_blocks equals nSamples
aat -- outerproduct of X
b -- Y
    n_blocks -- number of crossvalidation blocks
    center -- use centering of calibration sets; (aat_in, b_in) are centered
Returns:
-- aat_in,aat_out,b_in,b_out,[out]
"""
    m,n = aat.shape
    if n_blocks==None:
        n_blocks = m
    index = randperm(m)
    nValuesInBlock = m/n_blocks
if n_blocks==m:
index = arange(m)
out_ind = [index[i*nValuesInBlock:(i+1)*nValuesInBlock] for i in range(n_blocks)]
for out in out_ind:
inn = [i for i in index if i not in out]
aat_in = aat[inn,:][:,inn]
aat_out = aat[out,:][:,inn]
b_in = b[inn,:]
b_out = b[out,:]
if center:
# centering projector: I - (1/n)11'
# nin = len(inn)
# Pc = eye(nin) - outer(ones((nin,)),ones((nin,)))/nin
# xxt - x( outer(ones((nin,)),ones((nin,)))/nin ) x.T
# de jong:
h = sum(aat_in,0)[ :,newaxis]
h = (h - mean(h)/2)/len(inn)
mn_a = h + h.T
aat_in = aat_in - mn_a
if index_out:
yield aat_in,aat_out,b_in,b_out,out
else:
yield aat_in,aat_out,b_in,b_out
def pls_gen(a,b, n_blocks=None, center=False, index_out=False,axis=0):
"""Random block crossvalidation
Leave-one-out is a subset, with n_blocks equals a.shape[-1]
"""
index = randperm(a.shape[axis])
if n_blocks==None:
n_blocks = a.shape[axis]
n_in_set = ceil(float(a.shape[axis])/n_blocks)
out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_blocks)]
for out in out_ind_sets:
inn = [i for i in index if i not in out]
if center:
a = a - mean(a,0)[newaxis]
b = b - mean(b,0)[newaxis]
if index_out:
yield a.take(inn,0),a.take(out,0), b.take(inn,0),b.take(out,0),out
else:
yield a.take(inn,0),a.take(out,0), b.take(inn,0),b.take(out,0)
def pca_gen(a,n_sets=None, center=False, index_out=False,axis=0):
"""PCA random block crossval generator.
"""
m = a.shape[axis]
index = randperm(m)
if n_sets==None:
n_sets = m
n_in_set = ceil(float(m)/n_sets)
out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_sets)]
for out in out_ind_sets:
inn = [i for i in index if i not in out]
if center:
a = a - mean(a,0)[newaxis]
if index_out:
yield a.take(inn,0),a.take(out,0),out
else:
yield a.take(inn,0),a.take(out,0)
def w_pls_gen_jk(a,b,n_sets=None,center=True,index_out=False,axis=0):
"""Random block crossvalidation for wide X (m>>n)
Leave-one-out is a subset, with n_sets equals a.shape[-1]
Returns : X_m and X_m'Y_m
"""
m = a.shape[axis]
ab = dot(a.T,b)
index = randperm(m)
if n_sets==None:
n_sets = m
n_in_set = ceil(float(m)/n_sets)
out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_sets)]
for out in out_ind_sets:
inn = [i for i in index if i not in out]
nin = len(inn)
nout = len(out)
a_in = a[inn,:]
mn_a = 0
mAB = 0
if center:
mn_a = mean(a,0)[newaxis]
mAin = dot(-ones((1,nout)),a[out,:])/nin
mBin = dot(-ones((1,nout)),b[out,:])/nin
mAB = dot(mAin.T,(mBin*nin))
ab_in = ab - dot(a[out,].T,b[out,:]) - mAB
a_in = a_in - mn_a
        if index_out:
            yield a_in, ab_in, out
        else:
            yield a_in, ab_in
def shuffle_1d_block(a, n_sets=None, blocks=None, index_out=False, axis=0):
"""Random block shuffling along 1d axis
Returns : Shuffled a by axis
"""
m = a.shape[axis]
if blocks==None:
blocks = m
for ii in xrange(n_sets):
index = randperm(m)
if blocks==m:
a_out = a.take(index, axis)
else:
index = arange(m)
dummy = map(random.shuffle, array_split(index, blocks))
a_out = a.take(index, axis)
if index_out:
yield a_out, index
else:
yield a_out
def shuffle_1d(a, n_sets, axis=0):
"""Random shuffling along 1d axis.
Returns : Shuffled a by axis
"""
m = a.shape[axis]
for ii in xrange(n_sets):
index = randperm(m)
yield a.take(index, axis)
def diag_pert(a, n_sets=10, center=True, index_out=False):
"""Alter generator returning sets perturbed with means at diagonals.
input:
    a -- matrix, data
    n_sets -- scalar, number of perturbed sets returned
"""
m, n = a.shape
tr=False
if m>n:
a = a.T
m, n = a.shape
tr = True
if n_sets>m or n_sets>n:
msg = "You may not use more subsets than max(n_rows, n_cols)"
raise ValueError, msg
nm=n*m
start_inds = array_split(randperm(m),n_sets) # we use random start diags
if center:
a = a - mean(a, 0)[newaxis]
for v in range(n_sets):
a_out = a.copy()
out = []
for start in start_inds[v]:
ind = arange(start+v, nm, n+1)
[out.append(i) for i in ind]
if center:
a_out.put(a.mean(),ind)
else:
a_out.put(0, ind)
if tr:
a_out = a_out.T
if index_out:
yield a_out, asarray(out)
else:
yield a_out
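
A short sketch of the cross-validation generators in use: pls_gen yields calibration/test splits, and shuffle_1d yields row permutations of Y as used by pls_qvals (assumes fluents/lib is on sys.path and a 2006-era scipy):

from scipy import rand
from select_generators import pls_gen, shuffle_1d

X = rand(12, 6)
Y = rand(12, 1)
# three-block random crossvalidation: each pass leaves out ~4 samples
for X_in, X_out, Y_in, Y_out in pls_gen(X, Y, n_blocks=3, center=True):
    print X_in.shape, X_out.shape
# three independent row permutations of Y
for Y_shuff in shuffle_1d(Y, 3):
    print Y_shuff.ravel()
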

145
fluents/lib/validation.py Normal file

@ -0,0 +1,145 @@
from scipy import ones,mean,sqrt,dot,newaxis,zeros,sum,empty,\
apply_along_axis,eye, kron
from scipy.linalg import triu,inv,svd,norm
from select_generators import w_pls_gen,w_pls_gen_jk,pls_gen,pca_gen,diag_pert
from engines import w_simpls,pls, bridge,pca
from pylab import *
def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'):
"""RMSEP calc for pls with wide X.
"""
k, l = Y.shape
PRESS = zeros((l, amax+1), dtype='f')
# X,Y are centered
if n_blocks==None:
n_blocks = Y.shape[0]
V = w_pls_gen(dot(X, X.T), Y, n_blocks=n_blocks, center=True)
for Din, Doi, Yin, Yout in V:
ym = -sum(Yout, 0)[newaxis]/(1.0*Yin.shape[0])
Yin = Yin - ym
PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)
if algo=='simpls':
dat = w_simpls(Din, Yin, amax)
Q,U,H = dat['Q'], dat['U'], dat['H']
That = dot(Doi, dot(U, inv(triu(dot(H.T,U))) ))
else:
"Other algo-support comming soon"
raise NotImplementedError
#Yhat = empty((amax, k, l),dtype='<f8')
Yhat = []
for j in range(l):
TQ = dot(That, triu(dot(Q[j,:][:,newaxis], ones((1,amax)))) )
E = Yout[:,j][:,newaxis] - TQ
E = E + sum(E, 0)/Din.shape[0]
PRESS[j,1:] = PRESS[j,1:] + sum(E**2, 0)
#Yhat = Y - dot(That,Q.T)
return sqrt(PRESS/Y.shape[0])
def pls_val(X, Y, amax=2, n_blocks=10,algo='pls'):
""" Validation results of pls model.
"""
k, l = Y.shape
PRESS = zeros((l, amax+1), dtype='<f8')
EE = zeros((amax, k, l), dtype='<f8')
Yhat = zeros((amax, k, l), dtype='<f8')
# X,Y are centered
V = pls_gen(X, Y, n_blocks=n_blocks, center=True, index_out=True)
for Xin, Xout, Yin, Yout, out in V:
ym = -sum(Yout,0)[newaxis]/Yin.shape[0]
Yin = (Yin - ym)
PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)
if algo=='pls':
dat = pls(Xin, Yin, amax, mode='normal')
elif algo=='bridge':
            dat = bridge(Xin, Yin, amax, mode='normal')
for a in range(amax):
Ba = dat['B'][a,:,:]
Yhat[a,out[:],:] = dot(Xout, Ba)
E = Yout - dot(Xout, Ba)
EE[a,out,:] = E
PRESS[:,a+1] = PRESS[:,a+1] + sum(E**2,0)
return sqrt(PRESS/(k-1.)), EE, Yhat
def pca_alter_val(a, amax, n_sets=10,method='diag'):
"""Pca validation by altering elements in X.
"""
    # todo: it is just as easy to do jk-estimates here as well
V = diag_pert(a, n_sets, center=True, index_out=True)
sep = empty((n_sets, amax), dtype='f')
for i, (xi, ind) in enumerate(V):
dat_i = pca(xi, amax, mode='detailed')
Ti,Pi = dat_i['T'],dat_i['P']
for j in xrange(amax):
Xhat = dot(Ti[:,:j+1], Pi[:,:j+1].T)
a_sub = a.ravel().take(ind)
EE = a_sub - Xhat.ravel().take(ind)
tot = (a_sub**2).sum()
sep[i,j] = (EE**2).sum()/tot
return sqrt(sep.mean(0))
#return sep
def pca_cv_val(X, amax, n_sets):
""" Cross validation of pca using random sets crossval.
"""
m, n = X.shape
xtot = (X**2).sum()
    V = pca_gen(X, n_sets=n_sets, center=True, index_out=True)
E = empty((amax, m, n), dtype='f')
for xi,xout,ind in V:
dat_i = pca(xi, amax, mode='detailed')
Pi = dat_i['P']
for a in xrange(amax):
Pia = Pi[:,:a+1]
E[a][ind,:] = (X[ind,:] - dot(xout, dot(Pia,Pia.T) ))**2
sep = []
for a in xrange(amax):
sep.append(E[a].sum()/xtot)
    return sqrt(sep)
def pls_jkW(a, b, amax, n_blocks=None, algo='pls', use_pack=True):
""" Returns CV-segments of paramter W for wide X.
todo: add support for T,Q and B
"""
if n_blocks == None:
n_blocks = b.shape[0]
WW = empty((n_blocks, a.shape[1], amax), dtype='f')
if use_pack:
u, s, inflater = svd(a, full_matrices=0)
a = u*s
V = pls_gen(a, b, n_blocks=n_blocks)
for nn,(a_in, a_out, b_in, b_out) in enumerate(V):
if algo=='pls':
dat = pls(a_in, b_in, amax, 'loads', 'fast')
elif algo=='bridge':
dat = bridge(a_in, b_in, amax, 'loads', 'fast')
W = dat['W']
if use_pack:
W = dot(inflater.T, W)
WW[nn,:,:] = W
return WW
def pca_jkP(a, aopt, n_blocks=None):
""" Returns CV-segments of paramter P.
todo: add support for T
fixme: more efficient to add this in validation loop
"""
if n_blocks == None:
n_blocks = a.shape[0]
PP = empty((n_blocks, a.shape[1], aopt), dtype='f')
V = pca_gen(a, n_sets=n_blocks, center=True)
for nn,(a_in, a_out) in enumerate(V):
dat = pca(a_in, aopt, mode='fast')
P = dat['P']
PP[nn,:,:] = P
return PP
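
Finally, a hedged sketch of the wide-X RMSEP validation: compute the leave-one-out RMSEP curve and read off the optimal number of components, as PLS.pre_validation does in blmfuncs.py (toy data; assumes fluents/lib is on sys.path and a 2006-era scipy/matplotlib, since this module pulls in pylab):

from scipy import rand
from validation import w_pls_cv_val

# centered toy data: 10 samples, 50 variables (wide X), one response
X = rand(10, 50); X = X - X.mean(0)
Y = rand(10, 1);  Y = Y - Y.mean(0)
rmsep = w_pls_cv_val(X, Y, amax=3, n_blocks=10)   # leave-one-out
print rmsep            # shape (1, amax+1); column 0 is the null (mean) model
print rmsep.argmin()   # index of the optimal number of components
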