Projects/laydi

Lib updates

Commit a05d0faa0d by Arnar Flatberg, 2007-07-23 17:33:21 +00:00 (parent 7ea87e646a)
9 changed files with 937 additions and 166 deletions

===== next file =====

@@ -10,7 +10,7 @@ from fluents.workflow import Function, OptionsDialog, Options
 from fluents.dataset import Dataset
 from fluents import plots, dataset, workflow, logger
 import scipy
-from engines import pca, pls
+from engines import pca, pls, nipals_lpls
 from cx_stats import leverage, variances, hotelling
 from cx_utils import mat_center
 from validation import *
@@ -238,14 +238,14 @@ class PLS(Model):
         """Estimates cut off on significant vars by controlling fdr."""
         if self._options['calc_qvals']==True:
-            qvals_sorted, qvals = pls_qvals(a, b,
+            qvals = pls_qvals(a, b,
                                             aopt=None,
                                             alpha=reg,
                                             n_iter=n_iter,
                                             algo='pls',
                                             sim_method=sim_method)
             self.model['qval'] = qvals
-            self.model['qval_sorted'] = qvals_sorted
+            #self.model['qval_sorted'] = qvals_sorted
         else:
             self.model['qval'] = None
             self.model['qval_sorted'] = None
@@ -276,18 +276,19 @@ class PLS(Model):
         pc_ids_opt = ['_comp', map(str, range(self.model['aopt']))]
         zero_dim = ['_doe',['0']] # null dim, vector (hidden)
-        match_ids = {'E':[ids_0, ids_1],
-                     'P':[ids_1, pc_ids],
-                     'T':[ids_0, pc_ids],
-                     'W': [ids_1, pc_ids],
-                     'R': [ids_1, pc_ids],
-                     'Q':[ids_3, pc_ids],
-                     'F':[ids_0, ids_3],
-                     'B':[ids_1, ids_3],
-                     'qval':[ids_1, zero_dim],
-                     'qval_sorted':[ids_1, zero_dim],
-                     'w_tsq':[ids_1, zero_dim],
-                     'rmsep':[ids_3, pc_ids],
+        match_ids = {'E' : [ids_0, ids_1],
+                     'P' : [ids_1, pc_ids],
+                     'T' : [ids_0, pc_ids],
+                     'W' : [ids_1, pc_ids],
+                     'R' : [ids_1, pc_ids],
+                     'Q' : [ids_3, pc_ids],
+                     'F' : [ids_0, ids_3],
+                     'B' : [ids_1, ids_3],
+                     'qval' : [ids_1, zero_dim],
+                     'qval_sorted' : [ids_1, zero_dim],
+                     'w_tsq' : [ids_1, zero_dim],
+                     'rmsep' : [ids_3, pc_ids],
+                     'CP': [ids_1, pc_ids]
                      }
         array = self.model[name]
@@ -302,7 +303,7 @@ class PLS(Model):
         #except:
         #    logger.log('debug', 'Plot: %s failed' %plt)
         return out

     def run_o(self, a, b):
         """Run PLS with present options."""
         options = self._options
@@ -330,6 +331,17 @@ class PLS(Model):
         self.model['var_y'] = var_y
         self.model['exp_var_y'] = exp_var_y

+        if options['calc_corrloads']:
+            corr_load = scipy.empty_like(self.model['P'].copy())
+            T = self.model['T']
+            X = self._data['X']
+            # For each variable/attribute in original matrix (not meancentered)
+            for i, score in enumerate(T.T):
+                for j, profile in enumerate(X.T):
+                    corrs = scipy.corrcoef(score, profile)
+                    corr_load[j,i] = corrs[0,1]
+            self.model['CP'] = corr_load
+
         if options['calc_conf']:
             self.confidence(**options.confidence_options())
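Editor's aside: the nested loop above computes one Pearson correlation per (variable, component) pair via scipy.corrcoef. A vectorized sketch of the same correlation-loadings computation (illustrative helper, not part of the commit; assumes scores T of shape (m, a) and raw data X of shape (m, n)):

    import numpy as np

    def correlation_loadings(X, T):
        # correlation of each X-column with each score vector;
        # equivalent to looping corrcoef over all (variable, comp) pairs
        Xc = X - X.mean(0)
        Tc = T - T.mean(0)
        Xn = Xc/np.sqrt((Xc**2).sum(0))   # unit-norm columns
        Tn = Tc/np.sqrt((Tc**2).sum(0))
        return np.dot(Xn.T, Tn)           # (n, a) correlation loadings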
@@ -353,6 +365,141 @@ class PLS(Model):
         #run with current data and options
         return self.run_o(a, b)

+
+class LPLS(Model):
+    def __init__(self, id='lpls', name='LPLS'):
+        Model.__init__(self, id, name)
+        self._options = LplsOptions()
+
+    def validation(self, opt):
+        """Returns rmsep for lpls model."""
+        if opt['calc_cv']==True:
+            val_engine = opt['val_engine']
+            rmsep, aopt = val_engine(self.model['X'], self.model['Y'],
+                                     self.model['Z'], opt['amax'], opt['n_sets'], opt['xz_alpha'])
+            self.model['rmsep'] = rmsep
+            self.model['aopt'] = aopt
+        else:
+            self.model['rmsep'] = None
+            self.model['aopt'] = opt['aopt']
+
+    def confidence(self, opt):
+        """Returns a confidence measure for model parameters.
+
+        Supported parameters: W
+        """
+        aopt = self.model['aopt']
+        if opt['calc_conf']:
+            Wx, Wz = lpls_jk(self.model['X'], self.model['Y'],
+                             self.model['Z'], aopt, opt['n_sets'])
+            Wcal = self.model['W'][:,:aopt]
+            Lcal = self.model['L'][:,:aopt]
+            # ensure that Wcal is scaled
+            tnorm = scipy.apply_along_axis(norm, 0, self.model['T'][:,:aopt])
+            Wcal = Wcal*tnorm
+            a, b, c, d, e = opt['p_center'], opt['crot'], opt['alpha'], opt['strict'], opt['cov_center']
+            tsqx = hotelling(Wx, Wcal, a, b, c, d, e)
+            tsqz = hotelling(Wz, Lcal, a, b, c, d, e)
+            self.model['tsqx'] = tsqx
+            self.model['tsqz'] = tsqz
+        else:
+            self.model['tsqx'] = None
+            self.model['tsqz'] = None
+
+    def permutation_confidence(self, opt):
+        """Estimates cut off on significant vars by controlling fdr."""
+        self.model['qval'] = None
+        self.model['qval_sorted'] = None
+
+    def make_model(self, opt):
+        """Make model on amax components."""
+        engine = opt['engine']
+        dat = engine(self._data['X'], self._data['Y'], self._data['Z'],
+                     opt['amax'], opt['xz_alpha'], opt['center_mth'],
+                     opt['mode'], opt['scale'], False)
+        self.model.update(dat)
+
+    def as_dataset(self, name, dtype='Dataset'):
+        """Return any model parameter as Dataset.
+
+        No ids matching.
+        """
+        if name not in self.model.keys():
+            return
+        DX, DY, DZ = self._dataset['X'], self._dataset['Y'], self._dataset['Z']
+        dim_name_0, dim_name_1 = DX.get_dim_name()
+        dim_name_2, dim_name_3 = DY.get_dim_name()
+        dim_name_4, dim_name_5 = DZ.get_dim_name()
+        # samples
+        ids_0 = [dim_name_0, DX.get_identifiers(dim_name_0, sorted=True)]
+        # x vars (genes)
+        ids_1 = [dim_name_1, DX.get_identifiers(dim_name_1, sorted=True)]
+        # y vars (sample descriptors)
+        ids_3 = [dim_name_3, DY.get_identifiers(dim_name_3, sorted=True)]
+        # z vars (variable descriptors)
+        ids_4 = [dim_name_4, DZ.get_identifiers(dim_name_4, sorted=True)]
+        # components (hidden)
+        pc_ids = ['_comp', map(str, range(self._options['amax']))]
+        pc_ids_opt = ['_comp', map(str, range(self.model['aopt']))]
+        zero_dim = ['_doe',['0']] # null dim, vector (hidden)
+        match_ids = {'E' : [ids_0, ids_1],
+                     'P' : [ids_1, pc_ids],
+                     'T' : [ids_0, pc_ids],
+                     'W' : [ids_1, pc_ids],
+                     'L' : [ids_4, pc_ids],
+                     'Q' : [ids_3, pc_ids],
+                     'F' : [ids_0, ids_3],
+                     'B' : [ids_1, ids_3],
+                     'tsqx' : [ids_1, zero_dim],
+                     'tsqz' : [ids_4, zero_dim],
+                     'K' : [ids_1, pc_ids],
+                     'rmsep' : [ids_3, pc_ids]
+                     }
+        array = self.model[name]
+        M = Dataset(array, identifiers=match_ids[name], name=name)
+        return M
+
+    def get_out_plots(self, options):
+        out = []
+        for plt in options['out_plots']:
+            out.append(plt(self))
+        return out
+
+    def run(self, a, b, c):
+        """Run L-PLS with present options."""
+        options = self._options
+        self._dataset['X'] = a
+        self._dataset['Y'] = b
+        self._dataset['Z'] = c
+        self._data['X'] = a.asarray()
+        self._data['Y'] = b.asarray()
+        self._data['Z'] = c.asarray()
+        self.validation(options)
+        self.make_model(options)
+        if options['calc_conf']:
+            self.confidence(options)
+        out = [self.as_dataset(p) for p in options['out_data']]
+        for plt in self.get_out_plots(options):
+            out.append(plt)
+        return out
+
+    def run_gui(self, a, b, c):
+        """Run LPLS with option gui."""
+        dialog = LplsOptionsDialog([a, b, c], self._options)
+        dialog.show_all()
+        response = dialog.run()
+        dialog.hide()
+        if response == gtk.RESPONSE_OK:
+            # set output data and plots
+            dialog.set_output()
+            # run with current data and options
+            return self.run(a, b, c)
+

 class PcaOptions(Options):
     """Options for Principal Component Analysis.
@@ -403,7 +550,9 @@ class PcaOptions(Options):
             ]

         opt['out_data'] = ['T','P', 'p_tsq']
-        opt['out_plots'] = [blmplots.PcaScorePlot,blmplots.PcaLoadingPlot,blmplots.LineViewXc]
+        opt['out_plots'] = [blmplots.PcaScorePlot,
+                            blmplots.PcaLoadingPlot,
+                            blmplots.LineViewXc]

         self.update(opt)
@@ -444,6 +593,7 @@ class PlsOptions(Options):
         opt['center_mth'] = mat_center
         opt['scale'] = 'scores'
+        opt['calc_corrloads'] = True
         opt['calc_conf'] = False
         opt['n_sets'] = 5
         opt['strict'] = True
@@ -468,7 +618,8 @@ class PlsOptions(Options):
                             (blmplots.PlsLoadingPlot, 'Loadings', True),
                             (blmplots.LineViewXc, 'Line view', True),
                             (blmplots.PredictionErrorPlot, 'Residual Error', False),
-                            (blmplots.RMSEPPlot, 'RMSEP', False)
+                            (blmplots.RMSEPPlot, 'RMSEP', False),
+                            (blmplots.PlsCorrelationLoadingPlot, 'Corr. loadings', True)
                             ]

         opt['out_data'] = ['T','P', 'p_tsq']
@@ -494,14 +645,87 @@ class PlsOptions(Options):
                     'strict', 'crot', 'cov_center']
         return self._copy_from_list(opt_list)

+    def permutation_confidence(self):
+        opt_list = ['q_pert_method', 'q_iter']
+        return self._copy_from_list(opt_list)
+
+
+class LplsOptions(Options):
+    """Options for L-shaped Partial Least Squares Regression.
+    """
+    def __init__(self):
+        Options.__init__(self)
+        self._set_default()
+
+    def _set_default(self):
+        opt = {}
+        opt['engine'] = nipals_lpls
+        opt['mode'] = 'normal' # how much info to calculate
+        opt['amax'] = 10
+        opt['aopt'] = 9
+        opt['xz_alpha'] = .5
+        opt['auto_aopt'] = False
+        opt['center'] = True
+        opt['center_mth'] = [2, 0, 1]
+        opt['scale'] = 'scores'
+        opt['calc_conf'] = False
+        opt['n_sets'] = 7
+        opt['strict'] = False
+        opt['p_center'] = 'med'
+        opt['alpha'] = .3
+        opt['cov_center'] = 'med'
+        opt['crot'] = True
+
+        opt['calc_cv'] = False
+        opt['cv_val_method'] = 'random'
+        opt['cv_val_sets'] = opt['n_sets']
+
+        opt['all_data'] = [('T', 'scores', True),
+                           ('Wx', 'X-weights', True),
+                           ('Wz', 'Z-weights', True),
+                           ('E', 'residuals', False),
+                           ('tsq_x', 't2X', False),
+                           ('rmsep', 'RMSEP', False)
+                           ]
+
+        # (class, name, sensitive, ticked)
+        opt['all_plots'] = [(blmplots.PlsScorePlot, 'Scores', True),
+                            (blmplots.PlsLoadingPlot, 'Loadings', True),
+                            (blmplots.LineViewXc, 'Line view', True),
+                            (blmplots.PredictionErrorPlot, 'Residual Error', False),
+                            (blmplots.RMSEPPlot, 'RMSEP', False),
+                            (blmplots.LplsHypoidCorrelationPlot, 'Hypoid corr.', False)
+                            ]
+
+        opt['out_data'] = ['T','P']
+        opt['out_plots'] = [blmplots.PlsScorePlot,blmplots.PlsLoadingPlot,blmplots.LineViewXc]
+        #opt['out_data'] = None
+
+        opt['pack'] = False
+        opt['calc_qvals'] = False
+        opt['q_pert_method'] = 'shuffle_rows'
+        opt['q_iter'] = 20
+
+        self.update(opt)
+
+    def make_model_options(self):
+        """Options for make_model method."""
+        opt_list = ['scale', 'mode', 'amax', 'engine']
+        return self._copy_from_list(opt_list)
+
+    def confidence_options(self):
+        """Options for confidence method."""
+        opt_list = ['n_sets', 'aopt', 'alpha', 'p_center',
+                    'strict', 'crot', 'cov_center']
+        return self._copy_from_list(opt_list)
+
     def validation_options(self):
         """Options for pre_validation method."""
         opt_list = ['amax', 'n_sets', 'cv_val_method']
         return self._copy_from_list(opt_list)

+    def permutation_confidence(self):
+        opt_list = ['q_pert_method', 'q_iter']
+        return self._copy_from_list(opt_list)
 class PcaOptionsDialog(OptionsDialog):
     """Options dialog for Principal Component Analysis.

@@ -716,6 +940,210 @@ class PcaOptionsDialog(OptionsDialog):
         self._options['strict'] = True
+class LplsOptionsDialog(OptionsDialog):
+    """Options dialog for L-shaped Partial Least Squares regression.
+    """
+    def __init__(self, data, options, input_names=['X', 'Y', 'Z']):
+        OptionsDialog.__init__(self, data, options, input_names)
+        glade_file = os.path.join(fluents.DATADIR, 'lpls_options.glade')
+        notebook_name = "vbox1"
+        page_name = "Options"
+        self.add_page_from_glade(glade_file, notebook_name, page_name)
+        # connect signals to handlers
+        dic = {"on_amax_value_changed" : self.on_amax_changed,
+               "on_aopt_value_changed" : self.on_aopt_changed,
+               "auto_aopt_toggled" : self.auto_aopt_toggled,
+               "center_toggled" : self.center_toggled,
+               #"on_scale_changed" : self.on_scale_changed,
+               "on_val_none" : self.val_toggled,
+               "on_val_cv" : self.cv_toggled,
+               "on_cv_method_changed" : self.on_cv_method_changed,
+               "on_cv_sets_changed" : self.on_cv_sets_changed,
+               "on_conf_toggled" : self.conf_toggled,
+               "on_subset_loc_changed" : self.on_subset_loc_changed,
+               "on_cov_loc_changed" : self.on_cov_loc_changed,
+               "on_alpha_changed" : self.on_alpha_changed,
+               "on_rot_changed" : self.on_rot_changed,
+               "on__toggled" : self.conf_toggled,
+               "on_qval_changed" : self.on_qval_changed,
+               "on_iter_changed" : self.on_iter_changed
+               }
+        self.wTree.signal_autoconnect(dic)
+
+        # set/ensure valid default values/ranges
+        amax_sb = self.wTree.get_widget("amax_spinbutton")
+        max_comp = min(data[0].shape) # max num of components
+        if self._options['amax']>max_comp:
+            logger.log('debug', 'amax default too large ... adjusting')
+            self._options['amax'] = max_comp
+        amax_sb.get_adjustment().set_all(self._options['amax'], 1, max_comp, 1, 0, 0)
+
+        # aopt spin button
+        aopt_sb = self.wTree.get_widget("aopt_spinbutton")
+        if self._options['aopt']>self._options['amax']:
+            self._options['aopt'] = self._options['amax']
+        aopt_sb.get_adjustment().set_all(self._options['aopt'], 1, self._options['amax'], 1, 0, 0)
+
+        # scale
+        #scale_cb = self.wTree.get_widget("scale_combobox")
+        #scale_cb.set_active(0)
+
+        # validation frames
+        if self._options['calc_cv']==False:
+            cv_frame = self.wTree.get_widget("cv_frame")
+            cv_frame.set_sensitive(False)
+        cv = self.wTree.get_widget("cv_method").set_active(0)
+
+        # confidence
+        if self._options['calc_conf']==True:
+            self.wTree.get_widget("subset_expander").set_sensitive(True)
+        else:
+            self.wTree.get_widget("subset_expander").set_sensitive(False)
+
+        cb = self.wTree.get_widget("subset_loc")
+        _m = {'med': 0, 'mean': 1, 'full_model': 2}
+        cb.set_active(_m.get(self._options['p_center']))
+
+        cb = self.wTree.get_widget("cov_loc")
+        _m = {'med': 0, 'mean': 1}
+        cb.set_active(_m.get(self._options['cov_center']))
+
+        hs = self.wTree.get_widget("alpha_scale")
+        hs.set_value(self._options['alpha'])
+
+        tb = self.wTree.get_widget("qvals")
+        tb.set_sensitive(True)
+
+    def on_amax_changed(self, sb):
+        logger.log("debug", "amax changed: new value: %s" %sb.get_value_as_int())
+        amax = sb.get_value_as_int()
+        # update aopt if needed
+        if amax<self._options['aopt']:
+            self._options['aopt'] = amax
+            aopt_sb = self.wTree.get_widget("aopt_spinbutton")
+            aopt_sb.get_adjustment().set_all(self._options['aopt'], 1, amax, 1, 0, 0)
+        self._options['amax'] = sb.get_value_as_int()
+
+    def on_aopt_changed(self, sb):
+        aopt = sb.get_value_as_int()
+        self._options['aopt'] = aopt
+
+    def auto_aopt_toggled(self, tb):
+        aopt_sb = self.wTree.get_widget("aopt_spinbutton")
+        if tb.get_active():
+            self._options['auto_aopt'] = True
+            aopt_sb.set_sensitive(False)
+        else:
+            self._options['auto_aopt'] = False
+            aopt_sb.set_sensitive(True)
+
+    def center_toggled(self, tb):
+        if tb.get_active():
+            self._options['center'] = True
+        else:
+            logger.log("debug", "centering set to False")
+            self._options['center'] = False
+
+    #def on_scale_changed(self, cb):
+    #    scale = cb.get_active_text()
+    #    if scale=='Scores':
+    #        self._options['scale'] = 'scores'
+    #    elif scale=='Loadings':
+    #        self._options['scale'] = 'loads'
+    #    else:
+    #        raise IOError
+
+    def val_toggled(self, tb):
+        """Callback for validation: None."""
+        cv_frame = self.wTree.get_widget("cv_frame")
+        cv_tb = self.wTree.get_widget("cv_toggle")
+        if tb.get_active():
+            self._options['calc_cv'] = False
+            cv_frame.set_sensitive(False)
+            cv_tb.set_sensitive(False)
+        else:
+            cv_tb.set_sensitive(True)
+            if cv_tb.get_active():
+                cv_frame.set_sensitive(True)
+                self._options['calc_cv'] = True
+
+    def cv_toggled(self, tb):
+        cv_frame = self.wTree.get_widget("cv_frame")
+        val_tb = self.wTree.get_widget("val_none_toggle")
+        if tb.get_active():
+            cv_frame.set_sensitive(True)
+            self._options['calc_cv'] = True
+        else:
+            cv_frame.set_sensitive(False)
+            self._options['calc_cv'] = False
+
+    def on_cv_method_changed(self, cb):
+        method = cb.get_active_text()
+        if method == 'Random':
+            self._options['cv_val_method'] = 'random'
+
+    def on_cv_sets_changed(self, sb):
+        val = sb.get_value_as_int()
+        self._options['cv_val_sets'] = val
+
+    def conf_toggled(self, tb):
+        if tb.get_active():
+            self._options['calc_conf'] = False
+            self.wTree.get_widget("subset_expander").set_sensitive(False)
+        else:
+            self._options['calc_conf'] = True
+            self.wTree.get_widget("subset_expander").set_sensitive(True)
+
+    def on_subset_loc_changed(self, cb):
+        method = cb.get_active_text()
+        if method=='Full model':
+            self._options['p_center'] = 'full_model'
+        elif method=='Median':
+            self._options['p_center'] = 'med'
+        elif method=='Mean':
+            self._options['p_center'] = 'mean'
+
+    def on_cov_loc_changed(self, cb):
+        method = cb.get_active_text()
+        if method=='Median':
+            self._options['cov_center'] = 'med'
+        elif method=='Mean':
+            self._options['cov_center'] = 'mean'
+
+    def on_alpha_changed(self, hs):
+        self._options['alpha'] = hs.get_value()
+
+    def on_rot_changed(self, rg):
+        proc, strict = rg
+        if proc.get_active():
+            self._options['crot'] = True
+        else:
+            self._options['crot'] = True
+            self._options['strict'] = True
+
+    def qval_toggled(self, tb):
+        if tb.get_active():
+            self._options['calc_qvals'] = False
+            self.wTree.get_widget("qval_method").set_sensitive(False)
+            self.wTree.get_widget("q_iter").set_sensitive(False)
+        else:
+            self._options['calc_qvals'] = True
+            self.wTree.get_widget("qval_method").set_sensitive(True)
+            self.wTree.get_widget("q_iter").set_sensitive(True)
+
+    def on_iter_changed(self, sb):
+        self._options['q_iter'] = sb.get_value()
+
+    def on_qval_changed(self, cb):
+        q_method = cb.get_active_text()
+        if q_method=='Shuffle rows':
+            self._options['q_pert_method'] = 'shuffle'
+
 class PlsOptionsDialog(OptionsDialog):
     """Options dialog for Partial Least squares regression.
     """

@@ -918,5 +1346,3 @@ class PlsOptionsDialog(OptionsDialog):
         q_method = cb.get_active_text()
         if q_method=='Shuffle rows':
             self._options['q_pert_method'] = 'shuffle'

===== next file =====

@@ -190,7 +190,12 @@ class PlsCorrelationLoadingPlot(BlmScatterPlot):
         title = "Pls correlation loadings (%s)" %model._dataset['X'].get_name()
         BlmScatterPlot.__init__(self, title, model, absi, ordi, part_name='CP')

+class LplsHypoidCorrelationPlot(BlmScatterPlot):
+    def __init__(self, model, absi=0, ordi=1):
+        title = "Hypoid correlations (%s)" %model._dataset['X'].get_name()
+        BlmScatterPlot.__init__(self, title, model, absi, ordi, part_name='W')
+
 class LineViewXc(plots.LineViewPlot):
     """A line view of centered raw data
     """
@@ -214,8 +219,8 @@ class PlsQvalScatter(plots.ScatterPlot):
     def __init__(self, model, pc=0):
         if not model.model.has_key('w_tsq'):
             return None
-        self._W = model.model['P']
-        dataset_1 = model.as_dataset('P')
+        self._W = model.model['W']
+        dataset_1 = model.as_dataset('W')
         dataset_2 = model.as_dataset('w_tsq')
         id_dim = dataset_1.get_dim_name(0) #genes
         sel_dim = dataset_1.get_dim_name(1) #_comp

===== next file =====

@@ -115,6 +115,7 @@ def expl_var_y(Y, T, Q):

 def pls_qvals(a, b, aopt=None, alpha=.3,
               n_iter=20, algo='pls',
+              center=True,
               sim_method='shuffle',
               p_center='med', cov_center='med',
               crot=True, strict=False, metric=None):
@@ -122,8 +123,98 @@ def pls_qvals(a, b, aopt=None, alpha=.3,
     """Returns qvals for pls model.

     input:
-        a -- centered data matrix
-        b -- centered data matrix
+        a -- data matrix
+        b -- data matrix
+        aopt -- scalar, opt. number of components
+        alpha -- [0,1] regularisation parameter for T2-test
+        n_iter -- number of permutations
+        sim_method -- permutation method ['shuffle']
+        p_center -- location estimator for sub models ['med']
+        cov_center -- location estimator for covariance of submodels ['med']
+        crot -- bool, use rotations of sub models?
+        strict -- bool, use strict (rot/flips only) rotations?
+        metric -- bool, use row metric?
+    """
+    m, n = a.shape
+    TSQ = zeros((n, n_iter), dtype='d') # (nvars x n_subsets)
+    n_false = zeros((n, n_iter), dtype='d')
+
+    # full model
+    if center:
+        ac = a - a.mean(0)
+        bc = b - b.mean(0)
+    else:
+        ac, bc = a, b
+    if metric!=None:
+        ac = dot(ac, metric)
+    if algo=='bridge':
+        dat = bridge(ac, bc, aopt, 'loads', 'fast')
+    else:
+        dat = pls(ac, bc, aopt, 'loads', 'fast')
+    Wcv = pls_jkW(a, b, aopt, n_blocks=None, algo=algo, metric=metric, center=True)
+    tsq_full = hotelling(Wcv, dat['W'], p_center=p_center,
+                         alpha=alpha, crot=crot, strict=strict,
+                         cov_center=cov_center)
+    t0 = time.time()
+    Vs = shuffle_1d(bc, n_iter, axis=0)
+    for i, b_shuff in enumerate(Vs):
+        t1 = time.time()
+        if algo=='bridge':
+            dat = bridge(ac, b_shuff, aopt, 'loads', 'fast')
+        else:
+            dat = pls(ac, b_shuff, aopt, 'loads', 'fast')
+        Wcv = pls_jkW(a, b_shuff, aopt, n_blocks=None, algo=algo, metric=metric)
+        TSQ[:,i] = hotelling(Wcv, dat['W'], p_center=p_center,
+                             alpha=alpha, crot=crot, strict=strict,
+                             cov_center=cov_center)
+        print time.time() - t1
+
+    sort_index = argsort(tsq_full)[::-1]
+    back_sort_index = sort_index.argsort()
+    print time.time() - t0
+
+    # count false positives
+    tsq_full_sorted = tsq_full.take(sort_index)
+    for i in xrange(n_iter):
+        for j in xrange(n):
+            n_false[j,i] = sum(TSQ[:,i]>=tsq_full[j]) # number of false pos. genes (0-n)
+    false_pos = median(n_false, 1)
+    ll = arange(1, len(false_pos)+1, 1)
+    sort_qval = false_pos.take(sort_index)/ll
+    qval = false_pos/ll.take(back_sort_index)
+    print time.time() - t0
+    #return qval, false_pos, TSQ, tsq_full
+    return qval
+
+def ensure_strict(C, only_flips=True):
+    """Ensure that a rotation matrix does only 90 degree rotations.
+
+    In multiplication with pcs this allows flips and reordering.
+    If only_flips is True, only flips are allowed.
+    """
+    Cm = C
+    S = sign(C) # signs
+    if only_flips==True:
+        C = eye(Cm.shape[0])*S
+        return C
+    Cm = zeros_like(C)
+    Cm.putmask(1., abs(C)>.6)
+    if det(Cm)>1:
+        raise ValueError, "Implement this!"
+    return Cm*S
+
+def pls_qvals_II(a, b, aopt=None, center=True, alpha=.3,
+                 n_iter=20, algo='pls',
+                 sim_method='shuffle',
+                 p_center='med', cov_center='med',
+                 crot=True, strict=False, metric=None):
+    """Returns qvals for pls model.
+
+    Shuffling of variables in X is preprocessed in metric.
+    Null model is 'If I put genes randomly on network' ... if they are
+    significant, then this is due to network structure and not covariance
+    with response.
+
+    input:
+        a -- data matrix
+        b -- data matrix
     aopt -- scalar, opt. number of components
     alpha -- [0,1] regularisation parameter for T2-test
     n_iter -- number of permutations
@@ -140,25 +231,33 @@ def pls_qvals(a, b, aopt=None, alpha=.3,
     n_false = zeros((n, n_iter), dtype='<f8')

     #full model
+    if metric!=None:
+        a = dot(a, metric) # center?
+    if center==True:
+        ac = a - a.mean(0)
+        bc = b - b.mean(0)
+    else:
+        ac, bc = a, b
+    if metric==None:
+        metric = eye(n,n)
     if algo=='bridge':
-        dat = bridge(a, b, aopt, 'loads', 'fast')
+        dat = bridge(ac, bc, aopt, 'loads', 'fast')
     else:
-        dat = pls(a, b, aopt, 'loads', 'fast')
+        dat = pls(ac, bc, aopt, 'loads', 'fast')
     Wcv = pls_jkW(a, b, aopt, n_blocks=None, algo=algo, metric=metric)
     tsq_full = hotelling(Wcv, dat['W'], p_center=p_center,
                          alpha=alpha, crot=crot, strict=strict,
                          cov_center=cov_center)
     t0 = time.time()
-    Vs = shuffle_1d(b, n_iter)
-    for i, b_shuff in enumerate(Vs):
+    Vs = shuffle_1d(a, n_iter, 1)
+    for i, a_shuff in enumerate(Vs):
         t1 = time.time()
+        a = a_shuff - a_shuff.mean(0)
+        a = dot(a, metric)
         if algo=='bridge':
-            dat = bridge(a, b_shuff, aopt, 'loads','fast')
+            dat = bridge(a, b, aopt, 'loads','fast')
         else:
-            dat = pls(a, b_shuff, aopt, 'loads', 'fast')
-        Wcv = pls_jkW(a, b_shuff, aopt, n_blocks=None, algo=algo, metric=metric)
+            dat = pls(a, b, aopt, 'loads', 'fast')
+        Wcv = pls_jkW(a, b, aopt, n_blocks=None, algo=algo, metric=metric)
         TSQ[:,i] = hotelling(Wcv, dat['W'], p_center=p_center,
                              alpha=alpha, crot=crot, strict=strict,
                              cov_center=cov_center)
@@ -177,24 +276,8 @@ def pls_qvals(a, b, aopt=None, alpha=.3,
     sort_qval = false_pos.take(sort_index)/ll
     qval = false_pos/ll.take(back_sort_index)
     print time.time() - t0
-    return qval, false_pos, TSQ, tsq_full
+    #return qval, false_pos, TSQ, tsq_full
+    return qval

-def ensure_strict(C, only_flips=True):
-    """Ensure that a rotation matrix does only 90 degree rotations.
-    In multiplication with pcs this allows flips and reordering.
-    if only_flips is True there will only be flips allowed
-    """
-    Cm = C
-    S = sign(C) # signs
-    if only_flips==True:
-        C = eye(Cm.shape[0])*S
-        return C
-    Cm = zeros_like(C)
-    Cm.putmask(1.,abs(C)>.6)
-    if det(Cm)>1:
-        raise ValueError,"Implement this!"
-    return Cm*S

 def leverage(aopt=1,*args):
     """Returns leverages
@@ -253,3 +336,10 @@ def ssq(E, axis=0, weights=None):
         raise NotImplementedError, "Higher order modes not supported"
     return pow(Ew,2).sum(axis)

+def vnorm(x):
+    """Returns the Euclidean norm of a vector.
+
+    This is considerably faster than linalg.norm.
+    """
+    return sqrt(dot(x, x.conj()))
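Editor's aside: the q-value estimate in pls_qvals counts, for each variable, how many permutation T2-statistics exceed its observed T2, takes the median over permutations, and divides by the variable's rank by decreasing T2. A self-contained sketch of that counting step (illustrative names, not the library's API):

    import numpy as np

    def qvals_from_tsq(tsq_full, TSQ):
        # tsq_full: (n,) observed T2; TSQ: (n, n_iter) permutation T2
        n, n_iter = TSQ.shape
        n_false = np.empty((n, n_iter))
        for i in range(n_iter):
            for j in range(n):
                n_false[j, i] = (TSQ[:, i] >= tsq_full[j]).sum()
        false_pos = np.median(n_false, 1)     # median false-positive count
        order = np.argsort(tsq_full)[::-1]    # variables by decreasing T2
        rank = np.empty(n)
        rank[order] = np.arange(1, n + 1)     # 1-based rank per variable
        return false_pos/rank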

===== next file =====

@@ -1,23 +1,25 @@
 from scipy import apply_along_axis,newaxis,zeros,\
      median,round_,nonzero,dot,argmax,any,sqrt,ndarray,\
      trace,zeros_like,sign,sort,real,argsort,rand,array,\
-     matrix
+     matrix,nan
 from scipy.linalg import norm,svd,inv,eig
 from scipy.stats import median,mean

-def normalise(a,axis=0,return_scales=False):
-    s = apply_along_axis(norm,axis,a)
+def normalise(a, axis=0, return_scales=False):
+    s = apply_along_axis(norm, axis, a)
     if axis==0:
         s = s[newaxis]
     else:
         s = s[:,newaxis]

     a_s = a/s

     if return_scales:
-        return a_s,s
+        return a_s, s
     return a_s

-def sub2ind(shape,i,j):
+def sub2ind(shape, i, j):
     """Indices from subscripts. Only support for 2d"""
     row,col = shape
     ind = []
@@ -41,13 +43,13 @@ def sorted_eig(a, b=None, sort_by='sm'):
     (This is reversed output compared to matlab)
     """
-    s,v = eig(a,b)
+    s,v = eig(a, b)
     s = real(s) # dont expect any imaginary part
     v = real(v)
     ind = argsort(s)
     if sort_by=='lm':
         ind = ind[::-1]
-    v = v.take(ind,1)
+    v = v.take(ind, 1)
     s = s.take(ind)

     return s,v
@@ -67,15 +69,15 @@ def str2num(string_number):
     return num

 def randperm(n):
-    r=rand(n)
+    r = rand(n)
     dict={}
     for i in range(n):
-        dict[r[i]]=i
-    r=sort(r)
-    out=zeros(n)
+        dict[r[i]] = i
+    r = sort(r)
+    out = zeros(n)
     for i in range(n):
-        out[i]=dict[r[i]]
-    return array(out,dtype='i')
+        out[i] = dict[r[i]]
+    return array(out).astype('i')

 def mat_center(X,axis=0,ret_mn=False):
     """Mean center matrix along axis.

===== next file =====

@@ -3,11 +3,12 @@
 There is no typechecking of any kind here, just focus on speed
 """
-from scipy.linalg import svd,norm,inv,pinv,qr
+import math
+from scipy.linalg import svd,inv
 from scipy import dot,empty,eye,newaxis,zeros,sqrt,diag,\
      apply_along_axis,mean,ones,randn,empty_like,outer,c_,\
      rand,sum,cumsum,matrix

 def pca(a, aopt, scale='scores', mode='normal'):
     """ Principal Component Analysis model
     mode:
@@ -18,17 +19,18 @@ def pca(a, aopt, scale='scores', mode='normal'):
     m, n = a.shape
-    if m*10.>n:
-        u, s, vt = esvd(a)
+    if m*3>n:
+        u, s, v = esvd(a)
     else:
         u, s, vt = svd(a, full_matrices=0)
+        v = vt.T
     eigvals = (1./m)*s
     T = u*s
     T = T[:,:aopt]
-    P = vt[:aopt,:].T
+    P = v[:,:aopt]

     if scale=='loads':
-        tnorm = apply_along_axis(norm, 0, T)
+        tnorm = apply_along_axis(vnorm, 0, T)
         T = T/tnorm
         P = P*tnorm
@@ -47,6 +49,7 @@ def pca(a, aopt, scale='scores', mode='normal'):
     return {'T':T, 'P':P, 'E':E}

+
 def pcr(a, b, aopt=2, scale='scores', mode='normal'):
     """Returns Principal component regression model."""
     m, n = a.shape
@@ -98,13 +101,13 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
         u, s, vh = svd(dot(ab.T, ab))
         w = dot(ab, u[:,:1])
-        w = w/norm(w)
+        w = w/vnorm(w)
         r = w.copy()
         if i>0:
             for j in range(0,i,1):
                 r = r - dot(P[:,j].T, w)*R[:,j][:,newaxis]
         t = dot(a, r)
-        tt = norm(t)**2
+        tt = vnorm(t)**2
         p = dot(a.T, t)/tt
         q = dot(r.T, ab).T/tt
         ab = ab - dot(p, q.T)*tt
@@ -115,7 +118,7 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
         if mode=='fast' and i==aopt-1:
             if scale=='loads':
-                tnorm = apply_along_axis(norm, 0, T)
+                tnorm = apply_along_axis(vnorm, 0, T)
                 T = T/tnorm
                 W = W*tnorm
             return {'T':T, 'W':W}
@@ -134,7 +137,7 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
     F = b - dot(T[:,:aopt], Q[:,:aopt].T)

     if scale=='loads':
-        tnorm = apply_along_axis(norm, 0, T)
+        tnorm = apply_along_axis(vnorm, 0, T)
         T = T/tnorm
         W = W*tnorm
         Q = Q*tnorm
@@ -159,7 +162,7 @@ def w_simpls(aat, b, aopt):
         u = dot(b, u[:,:1]) #y-factor scores
         U[:,i] = u.ravel()
         t = dot(aat, u)
-        t = t/norm(t)
+        t = t/vnorm(t)
         T[:,i] = t.ravel()
         h = dot(aat, t) #score-weights
         H[:,i] = h.ravel()
@@ -183,7 +186,7 @@ def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
     W = u[:,:aopt]
     K = vt[:aopt,:].T
     T = dot(a, W)
-    tnorm = apply_along_axis(norm, 0, T) # norm of T-columns
+    tnorm = apply_along_axis(vnorm, 0, T) # norm of T-columns

     if mode == 'fast':
         if scale=='loads':
@@ -196,16 +199,6 @@ def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
     B = zeros((aopt, n, l), dtype='f')
     for i in range(aopt):
         B[i] = dot(W[:,:i+1], Q[:,:i+1].T)
-    # leverages
-    # fixme: probably need an orthogonal basis for row-space leverage
-    #        T (scores) are not orthogonal
-    #        Using a qr decomp to get an orthonormal basis for row-space
-    #Tq = qr(T)[0]
-    #s_lev,v_lev = leverage(aopt,Tq,W)
-    # explained variance
-    #var_x, exp_var_x = variances(a,T,W)
-    #qnorm = apply_along_axis(norm, 0, Q)
-    #var_y, exp_var_y = variances(b,U,Q/qnorm)

     if mode == 'detailed':
         E = empty((aopt, m, n))
@@ -225,10 +218,132 @@ def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
     return {'B':B, 'W':W, 'T':T, 'Q':Q, 'E':E, 'F':F, 'U':U, 'P':W}

+
+def nipals_lpls(X, Y, Z, amax, alpha=.7, mean_ctr=[2, 0, 1], mode='normal', scale='scores', verbose=False):
+    """L-shaped Partial Least Squares Regression by the nipals algorithm.
+
+    (X!Z)->Y
+
+    :input:
+        X : data matrix (m, n)
+        Y : data matrix (m, l)
+        Z : data matrix (o, n)
+
+    :output:
+        T : X-scores
+        W : X-weights/Z-weights
+        P : X-loadings
+        Q : Y-loadings
+        U : X-Y relation
+        L : Z-scores
+        K : Z-loads
+        B : Regression coefficients X->Y
+        b0 : Regression coefficient intercept
+        evx : X-explained variance
+        evy : Y-explained variance
+        evz : Z-explained variance
+
+    :Notes:
+    """
+    if mean_ctr!=None:
+        xctr, yctr, zctr = mean_ctr
+        X, mnX = center(X, xctr)
+        Y, mnY = center(Y, yctr)
+        Z, mnZ = center(Z, zctr)
+
+    varX = pow(X, 2).sum()
+    varY = pow(Y, 2).sum()
+    varZ = pow(Z, 2).sum()
+
+    m, n = X.shape
+    k, l = Y.shape
+    u, o = Z.shape
+
+    # initialize
+    U = empty((k, amax))
+    Q = empty((l, amax))
+    T = empty((m, amax))
+    W = empty((n, amax))
+    P = empty((n, amax))
+    K = empty((o, amax))
+    L = empty((u, amax))
+    var_x = empty((amax,))
+    var_y = empty((amax,))
+    var_z = empty((amax,))
+
+    for a in range(amax):
+        if verbose:
+            print "\n Working on comp. %s" %a
+        u = Y[:,:1]
+        diff = 1
+        MAX_ITER = 100
+        lim = 1e-5
+        niter = 0
+        while (diff>lim and niter<MAX_ITER):
+            niter += 1
+            u1 = u.copy()
+            w = dot(X.T, u)
+            w = w/sqrt(dot(w.T, w))
+            l = dot(Z, w)
+            k = dot(Z.T, l)
+            k = k/sqrt(dot(k.T, k))
+            w = alpha*k + (1-alpha)*w
+            w = w/sqrt(dot(w.T, w))
+            t = dot(X, w)
+            c = dot(Y.T, t)
+            c = c/sqrt(dot(c.T, c))
+            u = dot(Y, c)
+            diff = abs(u1 - u).max()
+        if verbose:
+            print "Converged after %s iterations" %niter
+        tt = dot(t.T, t)
+        p = dot(X.T, t)/tt
+        q = dot(Y.T, t)/tt
+        l = dot(Z, w)
+
+        U[:,a] = u.ravel()
+        W[:,a] = w.ravel()
+        P[:,a] = p.ravel()
+        T[:,a] = t.ravel()
+        Q[:,a] = q.ravel()
+        L[:,a] = l.ravel()
+        K[:,a] = k.ravel()
+
+        X = X - dot(t, p.T)
+        Y = Y - dot(t, q.T)
+        Z = (Z.T - dot(w, l.T)).T
+
+        var_x[a] = pow(X, 2).sum()
+        var_y[a] = pow(Y, 2).sum()
+        var_z[a] = pow(Z, 2).sum()
+
+    B = dot(dot(W, inv(dot(P.T, W))), Q.T)
+    b0 = mnY - dot(mnX, B)
+
+    # variance explained
+    evx = 100.0*(1 - var_x/varX)
+    evy = 100.0*(1 - var_y/varY)
+    evz = 100.0*(1 - var_z/varZ)
+
+    if scale=='loads':
+        tnorm = apply_along_axis(vnorm, 0, T)
+        T = T/tnorm
+        W = W*tnorm
+        Q = Q*tnorm
+        knorm = apply_along_axis(vnorm, 0, K)
+        L = L*knorm
+        K = K/knorm
+
+    return {'T':T, 'W':W, 'P':P, 'Q':Q, 'U':U, 'L':L, 'K':K, 'B':B, 'b0':b0, 'evx':evx, 'evy':evy, 'evz':evz}
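Editor's aside: a minimal usage sketch for the engine above, with made-up shapes and random data (the import assumes this engines module is on the path; per the docstring X is (m, n), Y is (m, l), and Z rows are the variable descriptors, so Z is taken as (o, n)):

    import numpy as np
    from engines import nipals_lpls   # assumed importable

    m, n, l, o = 20, 50, 3, 8
    X = np.random.randn(m, n)         # samples x variables
    Y = np.random.randn(m, l)         # samples x responses
    Z = np.random.randn(o, n)         # variable descriptors x variables

    dat = nipals_lpls(X, Y, Z, 4, alpha=.5, mean_ctr=[2, 0, 1])
    print(dat['T'].shape)             # (20, 4) X-scores
    print(dat['B'].shape)             # (50, 3) regression coefficients
    print(dat['evy'])                 # % Y-variance explained after each comp.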
+
+########### Helper routines #########
+
 def m_shape(array):
     return matrix(array).shape

-def esvd(data,economy=1):
+def esvd(data, economy=1):
     """SVD with the option of economy sized calculation.

     Calculate subspaces of X'X or XX' depending on the shape
     of the matrix.
@@ -239,17 +354,40 @@ def esvd(data, economy=1):
     """
     m, n = data.shape
     if m>=n:
-        u, s, vt = svd(dot(data.T, data))
+        ata = dot(data.T, data)       # n x n crossproduct (keep `data` intact)
+        u, s, vt = svd(ata)
         u = dot(data, vt.T)
         v = vt.T
         for i in xrange(n):
             s[i] = vnorm(u[:,i])
             u[:,i] = u[:,i]/s[i]
     else:
-        u, s, vt = svd(dot(data, data.T))
+        aat = dot(data, data.T)       # m x m crossproduct
+        aat = (aat + aat.T)/2.0       # symmetrise against roundoff
+        u, s, vt = svd(aat)
         v = dot(u.T, data)
         for i in xrange(m):
             s[i] = vnorm(v[i,:])
             v[i,:] = v[i,:]/s[i]
-    return u, s, v
+    return u, s, v.T

+def vnorm(x):
+    # assume column arrays (or vectors)
+    return math.sqrt(dot(x.T, x))
+
+def center(a, axis):
+    # 0 = col center, 1 = row center, 2 = double center
+    # -1 = nothing
+    if axis==-1:
+        mn = zeros((a.shape[1],))
+    elif axis==0:
+        mn = a.mean(0)
+    elif axis==1:
+        mn = a.mean(1)[:,newaxis]
+    elif axis==2:
+        mn = a.mean(0) + a.mean(1)[:,newaxis] - a.mean()
+    else:
+        raise IOError("input error: axis must be in [-1,0,1,2]")

+    return a - mn, mn
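Editor's aside: the axis=2 branch of center performs double centering (subtract row and column means, add back the grand mean); afterwards both row and column means vanish. A quick numerical check (NumPy, illustrative):

    import numpy as np

    a = np.arange(12.).reshape(3, 4)
    mn = a.mean(0) + a.mean(1)[:, np.newaxis] - a.mean()
    ac = a - mn
    print(np.allclose(ac.mean(0), 0))   # True: column means are zero
    print(np.allclose(ac.mean(1), 0))   # True: row means are zero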

===== next file =====

@@ -53,6 +53,7 @@ def gene_hypergeo_test(selection, category_dataset):
                         cat_count)
     pvals = scipy.where(cat_count==0, 2, pvals)
+    pvals = scipy.where(scipy.isnan(pvals), 2, pvals)
     out = {}
     for i in range(pvals.size):
         out[str(all_cats[i])] = (count[i], cat_count[i], pvals[i])

===== next file =====

@@ -2,7 +2,7 @@ import os,sys
 from itertools import izip
 import networkx as NX
 from scipy import shape,diag,dot,asarray,sqrt,real,zeros,eye,exp,maximum,\
-     outer,maximum,sum,diag,real
+     outer,maximum,sum,diag,real,atleast_2d
 from scipy.linalg import eig,svd,inv,expm,norm
 from cx_utils import sorted_eig
@@ -378,6 +378,7 @@ Ke = expm(A) .... expm(-A)?
 # 14.05.2006: diffusion returns negative values, using expm(-LL) instead (FIX)
 # 13.09.2006: update for use in numpy
+# 27.04.2007: diffusion now uses pade approximations to matrix exponential. Also the last

 def K_expAdj(W, normalised=True, alpha=1.0):
     """Matrix exponential of adjacency matrix, mentioned in Kandola as a general diffusion kernel.
@@ -433,8 +434,8 @@ def K_vonNeumann(W, normalised=True, alpha=1.0):
     return dot(dot(vr,psigma),vri).astype(t)

 def K_laplacian(W, normalised=True, alpha=1.0):
-    """ This is the matrix square root of the pseudo inverse of L.
-    Also known as th eaverage commute time matrix.
+    """ This is the matrix pseudo inverse of L.
+    Also known as the average commute time matrix.
     """
     W = asarray(W)
     t = W.dtype.char
@@ -464,8 +465,7 @@ def K_laplacian(W, normalised=True, alpha=1.0):
     return K

-
-def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5):
+def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5, use_cut=False):
     """Returns diffusion kernel.
     input:
         -- W, adj. matrix
@@ -477,27 +477,45 @@ def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5):
     t = W.dtype.char
     if len(W.shape)!=2:
         raise ValueError, "Non-matrix input to matrix function."
-    m,n = W.shape
+    m, n = W.shape
     if t in ['F','D']:
         raise TypeError, "Complex input!"
-    D = diag(sum(W,0))
-    L = D-W
+    D = diag(W.sum(0))
+    L = D - W
     if normalised==True:
-        T = diag(sqrt(1./(sum(W,0))))
-        L = dot(dot(T,L),T)
-    e,vr = eig(L)
+        T = diag(sqrt(1./W.sum(0)))
+        L = dot(dot(T, L), T)
+    e, vr = eig(L)
     vri = inv(vr) #inv
     cond = 1.0*{0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
     cutoff = 1.*abs(cond*maximum.reduce(e))
-    psigma = eye(m) # if sing vals are 0 exp(0)=1 (unnecessary)
+    psigma = eye(m) # if eigvals are 0 exp(0)=1 (unnecessary)
     #psigma = zeros((m,n), dtype='<f8')
     for i in range(len(e)):
         if abs(e[i]) > cutoff:
             psigma[i,i] = exp(-beta*e[i])
+        #else:
+        #    psigma[i,i] = 0.0
     K = real(dot(dot(vr, psigma), vri))
     I = eye(n, dtype='<f8')
     K = (1. - alpha)*I + alpha*K
     return K

+def K_diffusion2(W, normalised=True, alpha=1.0, beta=0.5, ncomp=None):
+    """Returns diffusion kernel, using fast pade approximation.
+    input:
+        -- W, adj. matrix
+        -- normalised [True/False]
+        -- beta, [0->), (diffusion degree)
+    """
+    D = diag(W.sum(0))
+    L = D - W
+    if normalised==True:
+        T = diag(sqrt(1./W.sum(0)))
+        L = dot(dot(T, L), T)
+    return expm(-beta*L)
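Editor's aside: K_diffusion exponentiates the Laplacian eigenvalue by eigenvalue, while K_diffusion2 delegates to the Pade-based expm; on a symmetric graph both routes should agree. A small consistency sketch (made-up 3-node path graph, unnormalised Laplacian):

    import numpy as np
    from scipy.linalg import expm

    W = np.array([[0., 1., 0.],
                  [1., 0., 1.],
                  [0., 1., 0.]])        # toy adjacency matrix
    L = np.diag(W.sum(0)) - W           # graph Laplacian
    beta = 0.5

    e, V = np.linalg.eigh(L)            # symmetric, so eigh suffices
    K_eig = np.dot(V*np.exp(-beta*e), V.T)
    K_pade = expm(-beta*L)              # what K_diffusion2 computes
    print(np.allclose(K_eig, K_pade))   # True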
 def K_modularity(W, alpha=1.0):
     """ Returns the matrix square root of Newmans modularity."""
@@ -530,3 +548,20 @@ def kernel_score(K, W):
     score = diag(dot(W, dot(K, W)))
     tot = sum(score)
     return score, tot

+def modularity_matrix(G, nodelist=None):
+    if not nodelist:
+        nodelist = G.nodes()
+    else:
+        G = NX.subgraph(G, nodelist)
+    A = NX.adj_matrix(G, nodelist=nodelist)
+    d = atleast_2d(G.degree(nbunch=nodelist))
+    m = 1.*G.number_of_edges()
+    B = A - A/m
+    return B
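Editor's aside: Newman's modularity matrix is usually defined as B = A - k k^T/(2m), with k the degree vector and m the number of edges; the committed `B = A - A/m` reads like a placeholder. A sketch of the textbook definition (dense NumPy; illustrative name, not the commit's function):

    import numpy as np

    def modularity_matrix_dense(A):
        # B = A - k k^T / (2m), Newman (2006)
        A = np.asarray(A, dtype=float)
        k = A.sum(1)                         # degree vector
        return A - np.outer(k, k)/k.sum()    # k.sum() == 2m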

===== next file =====

@@ -41,30 +41,31 @@ def pls_gen(a, b, n_blocks=None, center=False, index_out=False, axis=0, metric=None):
     """Random block crossvalidation.
     Leave-one-out is a subset, with n_blocks equal to a.shape[-1]
     """
-    index = randperm(a.shape[axis])
+    #index = randperm(a.shape[axis])
+    index = arange(a.shape[axis])
     if n_blocks==None:
         n_blocks = a.shape[axis]
     n_in_set = ceil(float(a.shape[axis])/n_blocks)
     out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_blocks)]
     for out in out_ind_sets:
         inn = [i for i in index if i not in out]
         acal = a.take(inn, 0)
         atrue = a.take(out, 0)
         bcal = b.take(inn, 0)
         btrue = b.take(out, 0)
         if center:
             mn_a = acal.mean(0)[newaxis]
             acal = acal - mn_a
             atrue = atrue - mn_a
             mn_b = bcal.mean(0)[newaxis]
             bcal = bcal - mn_b
             btrue = btrue - mn_b
         if metric!=None:
             acal = dot(acal, metric)
         if index_out:
             yield acal, atrue, bcal, btrue, out
         else:
             yield acal, atrue, bcal, btrue
 def pca_gen(a, n_sets=None, center=False, index_out=False, axis=0, metric=None):

@@ -151,6 +152,7 @@ def shuffle_1d_block(a, n_sets=None, blocks=None, index_out=False, axis=0):
         index = arange(m)
         dummy = map(random.shuffle, array_split(index, blocks))
         a_out = a.take(index, axis)
+
         if index_out:
             yield a_out, index
         else:
@@ -164,7 +166,8 @@ def shuffle_1d(a, n_sets, axis=0):
     m = a.shape[axis]
     for ii in xrange(n_sets):
         index = randperm(m)
-        yield a.take(index, axis)
+        a = a.take(index, axis)
+        yield a
 def diag_pert(a, n_sets=10, center=True, index_out=False):
     """Alter generator returning sets perturbed with means at diagonals.

@@ -205,18 +208,17 @@ def diag_pert(a, n_sets=10, center=True, index_out=False):
     else:
         yield a_out

 def outerprod_centering(aat, ret_mn=True):
-    """Returns mean centered symmetric outerproduct matrix.
+    """Returns double centered symmetric outerproduct matrix.
     """
-    n = aat.shape[0]
-    h = aat.sum(0)[:,newaxis]
-    h = (h - mean(h)/2)/n
-    mn_a = h + h.T
+    h = aat.mean(0)[newaxis]
+    h = h - 0.5*h.mean()
+    mn_a = h + h.T # beauty of broadcasting
     aatc = aat - mn_a
     if ret_mn:
-        return aatc, aat.mean(0)
-    return aat - mn_a
+        return aatc, mn_a
+    return aatc
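Editor's aside: double-centering a symmetric outer-product matrix is equivalent to building it from column-centered data, which is what makes this trick useful for Gram/kernel matrices. A numerical check (NumPy, illustrative):

    import numpy as np

    A = np.random.randn(6, 4)
    K = np.dot(A, A.T)                  # Gram (outer-product) matrix

    h = K.mean(0)[np.newaxis]           # as in outerprod_centering
    h = h - 0.5*h.mean()
    Kc = K - (h + h.T)

    Ac = A - A.mean(0)                  # center A's columns instead
    print(np.allclose(Kc, np.dot(Ac, Ac.T)))   # True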

===== next file =====

@@ -12,11 +12,47 @@ from cx_utils import m_shape

 def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'):
     """Returns rmsep and aopt for pls tailored for wide X.

-    comments:
-        -- X, Y inputs need to be centered (fixme: check)
+    The root mean square error of cross-validation is calculated
+    based on random block cross-validation. With the number of blocks
+    equal to the number of samples [default] this gives leave-one-out cv.
+
+    The pls model is based on the simpls algorithm for wide X.
+
+    :Parameters:
+        X : ndarray
+            column centered data matrix of size (samples x variables)
+        Y : ndarray
+            column centered response matrix of size (samples x responses)
+        amax : scalar
+            Maximum number of components
+        n_blocks : scalar
+            Number of blocks in cross validation
+
+    :Returns:
+        rmsep : ndarray
+            Root Mean Square Error of cross-validated Predictions
+        Yhat : ndarray
+            Model predictions of Y
+        aopt : scalar
+            Guestimate of the optimal number of components
+
+    :SeeAlso:
+        - pls_cv_val : Same output, not optimised for wide X
+        - w_simpls : Simpls algorithm for wide X
+
+    Notes
+    -----
+    Based (cowardly translated) on m-files from the Chemoact toolbox.
+    X, Y inputs need to be centered (fixme: check)
+
+    Examples
+    --------
+    >>> import numpy as n
+    >>> X = n.array([[1., 2., 3.],[]])
+    >>> Y = n.array([[1., 2., 3.],[]])
+    >>> w_pls(X, Y, 1)
+    [4,5,6], 1
+
     """
     k, l = m_shape(Y)
     PRESS = zeros((l, amax+1), dtype='f')
     if n_blocks==None:
@@ -30,7 +66,7 @@ def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'):
         if algo=='simpls':
             dat = w_simpls(Din, Yin, amax)
             Q, U, H = dat['Q'], dat['U'], dat['H']
-            That = dot(Doi, dot(U, inv(triu(dot(H.T,U))) ))
+            That = dot(Doi, dot(U, inv(triu(dot(H.T, U))) ))
         else:
             raise NotImplementedError
@@ -40,21 +76,13 @@ def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'):
             E = Yout[:,j][:,newaxis] - TQ
             E = E + sum(E, 0)/Din.shape[0]
             PRESS[j,1:] = PRESS[j,1:] + sum(E**2, 0)
-    #Yhat = Y - dot(That,Q.T)
+    Yhat = Y - dot(That, Q.T)
     rmsep = sqrt(PRESS/Y.shape[0])
     aopt = find_aopt_from_sep(rmsep)
-    return rmsep, aopt
+    return rmsep, Yhat, aopt

 def pls_val(X, Y, amax=2, n_blocks=10, algo='pls', metric=None):
-    """ Validation results of pls model.
-
-    comments:
-        -- X, Y inputs need to be centered (fixme: check)
-    """
     k, l = m_shape(Y)
     PRESS = zeros((l, amax+1), dtype='<f8')
     EE = zeros((amax, k, l), dtype='<f8')
@@ -79,7 +107,30 @@ def pls_val(X, Y, amax=2, n_blocks=10, algo='pls', metric=None):
     rmsep = sqrt(PRESS/(k-1.))
     aopt = find_aopt_from_sep(rmsep)
-    return rmsep, aopt
+    return rmsep, Yhat, aopt

+def lpls_val(X, Y, Z, a_max=2, nsets=None, alpha=.5):
+    """Performs crossvalidation to get generalisation error in lpls."""
+    cv_iter = select_generators.pls_gen(X, Y, n_blocks=nsets, center=False, index_out=True)
+    k, l = Y.shape
+    Yhat = empty((a_max, k, l), 'd')
+    for i, (xcal, xi, ycal, yi, ind) in enumerate(cv_iter):
+        T, W, P, Q, U, L, K, B, b0, evx, evy, evz = nipals_lpls(xcal, ycal, Z,
+                                                                a_max=a_max,
+                                                                alpha=alpha,
+                                                                mean_ctr=[2, 0, 1],
+                                                                verbose=False)
+        for a in range(a_max):
+            Yhat[a,ind,:] = b0[a][0][0] + dot(xi, B[a])
+    Yhat_class = zeros_like(Yhat)
+    for a in range(a_max):
+        for i in range(k):
+            Yhat_class[a,i,argmax(Yhat[a,i,:])] = 1.0
+    class_err = 100*((Yhat_class+Y)==2).sum(1)/Y.sum(0).astype('d')
+
+    sep = (Y - Yhat)**2
+    rmsep = sqrt(sep.mean(1))
+    aopt = find_aopt_from_sep(rmsep)
+    return rmsep, Yhat, aopt
 def pca_alter_val(a, amax, n_sets=10, method='diag'):
     """Pca validation by altering elements in X.

@@ -146,8 +197,7 @@ def pls_jkW(a, b, amax, n_blocks=None, algo='pls', use_pack=True, center=True, metric=None):
     if n_blocks == None:
         n_blocks = b.shape[0]
-    Wcv = empty((n_blocks, a.shape[1], amax), dtype='f')
+    Wcv = empty((n_blocks, a.shape[1], amax), dtype='d')
     if use_pack and metric==None:
         u, s, inflater = svd(a, full_matrices=0)
         a = u*s
@@ -161,11 +211,10 @@ def pls_jkW(a, b, amax, n_blocks=None, algo='pls', use_pack=True, center=True, metric=None):
             dat = bridge(a_in, b_in, amax, 'loads', 'fast')
         W = dat['W']
         if use_pack and metric==None:
             W = dot(inflater.T, W)
-        Wcv[nn,:,:] = W
+        Wcv[nn,:,:] = W[:,:,]
     return Wcv
@@ -200,6 +249,29 @@ def pca_jkP(a, aopt, n_blocks=None, metric=None):
     return PP

+def lpls_jk(X, Y, Z, a_max, nsets=None, alpha=.5):
+    cv_iter = select_generators.pls_gen(X, Y, n_blocks=nsets, center=False, index_out=False)
+    m, n = X.shape
+    k, l = Y.shape
+    o, p = Z.shape
+    if nsets==None:
+        nsets = m
+    WWx = empty((nsets, n, a_max), 'd')
+    WWz = empty((nsets, o, a_max), 'd')
+    #WWy = empty((nsets, l, a_max), 'd')
+    for i, (xcal, xi, ycal, yi) in enumerate(cv_iter):
+        T, W, P, Q, U, L, K, B, b0, evx, evy, evz = nipals_lpls(xcal, ycal, Z,
+                                                                a_max=a_max,
+                                                                alpha=alpha,
+                                                                mean_ctr=[2, 0, 1],
+                                                                scale='loads',
+                                                                verbose=False)
+        WWx[i,:,:] = W
+        WWz[i,:,:] = L
+        #WWy[i,:,:] = Q
+    return WWx, WWz

 def find_aopt_from_sep(sep, method='75perc'):
     """Returns an estimate of optimal number of components from rmsecv.
     """