Projects/laydi
Projects
/
laydi
Archived
7
0
Fork 0

Multiple lib changes

This commit is contained in:
Arnar Flatberg 2007-01-25 11:58:10 +00:00
parent a65d79697f
commit 1c2c2c8895
7 changed files with 519 additions and 152 deletions

View File

@ -1,7 +1,9 @@
"""This module contains bilinear models(Functions) """This module contains bilinear models(Functions)
""" """
import pygtk
import gtk import gtk
import gtk.glade
from fluents.workflow import Function, OptionsDialog, Options from fluents.workflow import Function, OptionsDialog, Options
from fluents.dataset import Dataset from fluents.dataset import Dataset
from fluents import plots, dataset, workflow, logger from fluents import plots, dataset, workflow, logger
@ -12,7 +14,7 @@ from cx_utils import mat_center
from validation import * from validation import *
import blmplots import blmplots
import engines import engines
import copy
class Model(Function): class Model(Function):
"""Base class of bilinear models. """Base class of bilinear models.
@ -39,19 +41,38 @@ class PCA(Model):
Model.__init__(self,id,name) Model.__init__(self,id,name)
self._options = PcaOptions() self._options = PcaOptions()
def pre_validation(self, amax, n_sets, val_engine): def validation(self, amax, cv_val_sets, pert_val_sets, cv_val_method, pert_val_method):
"""Model calculations for maximum number of components. """Model validation and estimate of optimal numer of components.
""" """
rmsep = val_engine(self.model['E0'], amax, n_sets) if self._options['calc_cv']:
self.model['rmsep'] = rmsep if cv_val_method == 'random':
self.model['aopt'] = rmsep.argmin() sep, aopt = pca_cv_val(self.model['E0'], amax, cv_val_sets)
self.model['sep'] = sep
if self._options['calc_pert']:
if pert_val_method == 'random_diag':
sep, aopt = pca_alter_val(self.model['E0'], amax, pert_val_sets)
self.model['sep'] = sep
if self._options['calc_cv']==False and self._options['calc_pert']==False:
self.model['sep'] = None
aopt = self._options['amax']
if self._options['auto_aopt']:
logger.log("notice", "Auto aopt: " + str(aopt))
self._options['aopt'] = aopt
if aopt==1:
logger.log('notice', 'Aopt at first component!')
def confidence(self, aopt, n_sets, alpha, p_center, def confidence(self, aopt, n_sets, alpha, p_center,
crot, strict, cov_center ): crot, strict, cov_center ):
"""Returns a confidence measure for model parameters. """Returns a confidence measure for model parameters.
Based on aopt. Based on aopt.
""" """
aopt = self.model['aopt'] if aopt<2:
aopt = 2
logger.log('notice','Hotellings T2 needs more than 1 comp.\n switching to 2!!')
jk_segments = pca_jkP(self.model['E0'], aopt, n_sets) jk_segments = pca_jkP(self.model['E0'], aopt, n_sets)
Pcal = self.model['P'][:,:aopt] Pcal = self.model['P'][:,:aopt]
tsq = hotelling(jk_segments, Pcal, p_center, tsq = hotelling(jk_segments, Pcal, p_center,
@ -96,8 +117,8 @@ class PCA(Model):
# vars # vars
ids_1 = [dim_name_1, DX.get_identifiers(dim_name_1, sorted=True)] ids_1 = [dim_name_1, DX.get_identifiers(dim_name_1, sorted=True)]
# components (hidden) # components (hidden)
pc_ids = ['_comp_a', map(str,range(self.model['aopt'])) ] pc_ids = ['_amax', map(str,range(self._options['amax'])) ]
pc_ids_opt = ['_comp_o', map(str, range(self.model['aopt'])) ] pc_ids_opt = ['_aopt', map(str, range(self._options['aopt'])) ]
zero_dim = ['_doe', ['0']] # null dim, vector (hidden) zero_dim = ['_doe', ['0']] # null dim, vector (hidden)
match_ids = {'E':[ids_0, ids_1], match_ids = {'E':[ids_0, ids_1],
'E0':[ids_0, ids_1], 'E0':[ids_0, ids_1],
@ -121,8 +142,7 @@ class PCA(Model):
#try: #try:
out.append(plt(self)) out.append(plt(self))
#except: #except:
# print plt # logger.log('debug', 'Plot: %s failed') %str(plt)
#logger.log('debug', 'Plot: %s failed') %plt
return out return out
def run_o(self, data): def run_o(self, data):
@ -130,6 +150,8 @@ class PCA(Model):
""" """
self.clear() self.clear()
options = self._options options = self._options
for item in options.items():
print item
self._dataset['X'] = data self._dataset['X'] = data
self._data['X'] = data.asarray().astype('<f8') self._data['X'] = data.asarray().astype('<f8')
if options['center']: if options['center']:
@ -138,7 +160,8 @@ class PCA(Model):
else: else:
self.model['E0'] = data.asarray() self.model['E0'] = data.asarray()
self.pre_validation(**options.pre_validation_options()) self.validation(**options.validation_options())
self.model['aopt'] = self._options['aopt']
self.make_model(**options.make_model_options()) self.make_model(**options.make_model_options())
if options['calc_conf']: if options['calc_conf']:
self.confidence(**options.confidence_options()) self.confidence(**options.confidence_options())
@ -159,7 +182,6 @@ class PCA(Model):
if response == gtk.RESPONSE_OK: if response == gtk.RESPONSE_OK:
# set output data and plots # set output data and plots
dialog.set_output() dialog.set_output()
#run with current data and options #run with current data and options
return self.run_o(data) return self.run_o(data)
@ -172,10 +194,10 @@ class PLS(Model):
def pre_validation(self, amax, n_sets, val_engine): def pre_validation(self, amax, n_sets, val_engine):
"""Returns rmsec,rmsep for model. """Returns rmsec,rmsep for model.
""" """
rmsep = val_engine(self.model['E0'], self.model['F0'], rmsep, aopt = val_engine(self.model['E0'], self.model['F0'],
amax, n_sets) amax, n_sets)
self.model['rmsep'] = rmsep.mean(0) self.model['rmsep'] = rmsep.mean(0)
self.model['aopt'] = rmsep.mean(0).argmin() self.model['aopt'] = aopt
def confidence(self, aopt, n_sets, alpha, p_center, def confidence(self, aopt, n_sets, alpha, p_center,
crot, strict, cov_center ): crot, strict, cov_center ):
@ -341,34 +363,39 @@ class PcaOptions(Options):
opt['algo'] = 'pca' opt['algo'] = 'pca'
opt['engine'] = engines.pca opt['engine'] = engines.pca
opt['mode'] = 'normal' # how much info to calculate opt['mode'] = 'normal' # how much info to calculate
opt['lod'] = 'compact' # how much info to store opt['amax'] = 10
opt['amax'] = 5 opt['aopt'] = 100
opt['aopt'] = 5 opt['auto_aopt'] = False
opt['center'] = True opt['center'] = True
opt['center_mth'] = mat_center opt['center_mth'] = mat_center
opt['scale'] = 'scores' opt['scale'] = 'scores'
opt['calc_conf'] = True
opt['n_sets'] = 5
opt['calc_conf'] = False
opt['n_sets'] = 5
opt['strict'] = True opt['strict'] = True
opt['p_center'] = 'med' opt['p_center'] = 'med'
opt['alpha'] = .8 opt['alpha'] = .8
opt['cov_center'] = 'med' opt['cov_center'] = 'med'
opt['crot'] = True opt['crot'] = True
opt['val_engine'] = pca_alter_val opt['calc_cv'] = False
opt['val_n_sets'] = 10 opt['calc_pert'] = True
opt['pert_val_method'] = 'random_diag'
opt['cv_val_method'] = 'random'
opt['cv_val_sets'] = 10
opt['pert_val_sets'] = 10
opt['all_data'] = [('T', 'scores', True), opt['all_data'] = [('T', 'scores', True),
('P', 'loadings', True), ('P', 'loadings', True),
('E','residuals', False), ('E','residuals', False),
('p_tsq', 't2', False), ('p_tsq', 't2', False),
('rmsep', 'root mean square error of prediction', False) ('rmsep', 'RMSEP', False)
] ]
opt['all_plots'] = [(blmplots.PcaScorePlot, 'Scores', True), opt['all_plots'] = [(blmplots.PcaScorePlot, 'Scores', True),
(blmplots.PcaLoadingPlot, 'Loadings', True), (blmplots.PcaLoadingPlot, 'Loadings', True),
(blmplots.LineViewXc, 'Line view', True) (blmplots.LineViewXc, 'Line view', True),
(blmplots.PredictionErrorPlot, 'Residual Error', True)
] ]
opt['out_data'] = ['T','P', 'p_tsq'] opt['out_data'] = ['T','P', 'p_tsq']
@ -387,9 +414,10 @@ class PcaOptions(Options):
'strict', 'crot', 'cov_center'] 'strict', 'crot', 'cov_center']
return self._copy_from_list(opt_list) return self._copy_from_list(opt_list)
def pre_validation_options(self): def validation_options(self):
"""Options for pre_validation method.""" """Options for pre_validation method."""
opt_list = ['amax', 'n_sets', 'val_engine'] opt_list = ['amax', 'cv_val_sets', 'pert_val_sets',
'cv_val_method', 'pert_val_method']
return self._copy_from_list(opt_list) return self._copy_from_list(opt_list)
@ -411,7 +439,7 @@ class PlsOptions(Options):
opt['center'] = True opt['center'] = True
opt['center_mth'] = mat_center opt['center_mth'] = mat_center
opt['scale'] = 'scores' opt['scale'] = 'scores'
opt['calc_conf'] = True opt['calc_conf'] = False
opt['n_sets'] = 10 opt['n_sets'] = 10
opt['strict'] = True opt['strict'] = True
@ -420,13 +448,15 @@ class PlsOptions(Options):
opt['cov_center'] = 'med' opt['cov_center'] = 'med'
opt['crot'] = True opt['crot'] = True
opt['calc_cv'] = True
opt['calc_pert'] = False
opt['val_engine'] = w_pls_cv_val opt['val_engine'] = w_pls_cv_val
opt['all_data'] = [('T', 'scores', True), opt['all_data'] = [('T', 'scores', True),
('P', 'loadings', True), ('P', 'loadings', True),
('E','residuals', False), ('E','residuals', False),
('p_tsq', 't2', False), ('p_tsq', 't2', False),
('rmsep', 'root mean square error of prediction', False) ('rmsep', 'RMSEP', False)
] ]
opt['all_plots'] = [(blmplots.PlsScorePlot, 'Scores', True), opt['all_plots'] = [(blmplots.PlsScorePlot, 'Scores', True),
@ -468,9 +498,175 @@ class PcaOptionsDialog(OptionsDialog):
def __init__(self, data, options, input_names=['X']): def __init__(self, data, options, input_names=['X']):
OptionsDialog.__init__(self, data, options, input_names) OptionsDialog.__init__(self, data, options, input_names)
glade_file = "/home/flatberg/Projects/project4/project4.glade"
notebook_name = "vbox1"
page_name = "Options"
self.add_page_from_glade(glade_file, notebook_name, page_name)
# connect signals to handlers
dic = {"on_amax_value_changed" : self.on_amax_changed,
"on_aopt_value_changed" : self.on_aopt_changed,
"auto_aopt_toggled" : self.auto_aopt_toggled,
"center_toggled" : self.center_toggled,
"on_scale_changed" : self.on_scale_changed,
"on_val_none" : self.val_toggled,
"on_val_cv" : self.cv_toggled,
"on_val_pert" : self.pert_toggled,
"on_cv_method_changed" : self.on_cv_method_changed,
"on_cv_sets_changed" : self.on_cv_sets_changed,
"on_pert_sets_changed" : self.on_pert_sets_changed,
"on_conf_toggled" : self.on_conf_toggled
}
self.wTree.signal_autoconnect(dic)
# set/ensure valid default values/ranges
amax_sb = self.wTree.get_widget("amax_spinbutton")
max_comp = min(data[0].shape) # max num of components
if self._options['amax']>max_comp:
logger.log('debug', 'amax default too large ... adjusting')
self._options['amax'] = max_comp
amax_sb.get_adjustment().set_all(self._options['amax'], 1, max_comp, 1, 0, 0)
# aopt spin button
aopt_sb = self.wTree.get_widget("aopt_spinbutton")
if self._options['aopt']>self._options['amax']:
self._options['aopt'] = self._options['amax'] + 1 - 1
aopt_sb.get_adjustment().set_all(self._options['aopt'], 1, self._options['amax'], 1, 0, 0)
# scale
scale_cb = self.wTree.get_widget("scale_combobox")
scale_cb.set_active(0)
# validation frames
if self._options['calc_cv']==False:
cv_frame = self.wTree.get_widget("cv_frame")
cv_frame.set_sensitive(False)
if self._options['calc_pert']==False:
pert_frame = self.wTree.get_widget("pert_frame")
pert_frame.set_sensitive(False)
cv = self.wTree.get_widget("cv_method").set_active(0)
pm = self.wTree.get_widget("pert_method").set_active(0)
# confidence
if self._options['calc_conf']==True:
self.wTree.get_widget("subset_frame").set_sensitive(True)
else:
self.wTree.get_widget("subset_frame").set_sensitive(False)
def on_amax_changed(self, sb):
    """Callback for the amax spin button.

    Stores the new maximum number of components in the options and keeps
    the aopt spin button consistent with it.

    sb -- the spin button that emitted the signal; its integer value is
          the new amax.
    """
    amax = sb.get_value_as_int()
    logger.log("debug", "amax changed: new value: %s" %amax)
    # aopt may never exceed amax: lower the stored value if needed
    aopt = min(self._options['aopt'], amax)
    self._options['aopt'] = aopt
    # Refresh the aopt range on every change, not only when aopt had to be
    # lowered.  Otherwise the upper bound keeps the previous amax, which
    # either allows aopt > amax or hides newly available components.
    aopt_sb = self.wTree.get_widget("aopt_spinbutton")
    aopt_sb.get_adjustment().set_all(aopt, 1, amax, 1, 0, 0)
    self._options['amax'] = amax
def on_aopt_changed(self, sb):
    """Callback for the aopt spin button: store its integer value."""
    self._options['aopt'] = sb.get_value_as_int()
def auto_aopt_toggled(self, tb):
    """Callback for the automatic-aopt toggle.

    Records the toggle state in the 'auto_aopt' option and disables the
    aopt spin button while the toggle is active.
    """
    spin_button = self.wTree.get_widget("aopt_spinbutton")
    automatic = bool(tb.get_active())
    self._options['auto_aopt'] = automatic
    # manual aopt entry is only available when auto selection is off
    spin_button.set_sensitive(not automatic)
def center_toggled(self, tb):
    """Callback for the centering toggle: mirror its state in 'center'."""
    if not tb.get_active():
        logger.log("debug", "centering set to False")
        self._options['center'] = False
        return
    self._options['center'] = True
def on_scale_changed(self, cb):
    """Callback for the scale combo box.

    Translates the active combo text into the 'scale' option value.
    Raises IOError when the text is neither 'Scores' nor 'Loadings'.
    """
    # combo box label -> option value
    option_for_label = {'Scores': 'scores', 'Loadings': 'loads'}
    label = cb.get_active_text()
    if label not in option_for_label:
        raise IOError
    self._options['scale'] = option_for_label[label]
def val_toggled(self, tb):
    """Callback for validation: None.

    When the toggle is active both validation options are switched off
    and the cross-validation / perturbation toggles and frames are
    disabled.  When it is inactive the two toggles are re-enabled and,
    for each toggle that is itself active, its frame is enabled and the
    matching option is set to True.
    """
    lookup = self.wTree.get_widget
    cv_frame, pert_frame = lookup("cv_frame"), lookup("pert_frame")
    cv_tb, p_tb = lookup("cv_toggle"), lookup("pert_toggle")

    if tb.get_active():
        self._options['calc_cv'] = False
        self._options['calc_pert'] = False
        for widget in (cv_frame, pert_frame, cv_tb, p_tb):
            widget.set_sensitive(False)
        return

    for toggle in (p_tb, cv_tb):
        toggle.set_sensitive(True)
    # restore whatever the individual toggles currently request
    restore = ((p_tb, pert_frame, 'calc_pert'),
               (cv_tb, cv_frame, 'calc_cv'))
    for toggle, frame, option in restore:
        if toggle.get_active():
            frame.set_sensitive(True)
            self._options[option] = True
def cv_toggled(self, tb):
    """Callback for the cross-validation toggle.

    Enables or disables the cv frame and sets 'calc_cv' to match the
    toggle state.
    """
    frame = self.wTree.get_widget("cv_frame")
    enabled = bool(tb.get_active())
    frame.set_sensitive(enabled)
    self._options['calc_cv'] = enabled
def pert_toggled(self, tb):
    """Callback for the perturbation-validation toggle.

    Enables or disables the pert frame and sets 'calc_pert' to match the
    toggle state.
    """
    frame = self.wTree.get_widget("pert_frame")
    enabled = bool(tb.get_active())
    frame.set_sensitive(enabled)
    self._options['calc_pert'] = enabled
def on_cv_method_changed(self, cb):
    """Callback for the cv method combo box.

    Only the 'Random' entry is handled; any other text leaves the
    'cv_val_method' option untouched.
    """
    if cb.get_active_text() != 'Random':
        return
    self._options['cv_val_method'] = 'random'
def on_pert_method_changed(self, cb):
    """Callback for the perturbation method combo box.

    Only the 'Random diags' entry is handled; any other text leaves the
    'pert_val_method' option untouched.
    """
    if cb.get_active_text() != 'Random diags':
        return
    self._options['pert_val_method'] = 'random_diag'
def on_cv_sets_changed(self, sb):
    """Callback for the cv sets spin button: store its integer value."""
    self._options['cv_val_sets'] = sb.get_value_as_int()
def on_pert_sets_changed(self, sb):
    """Callback for the pert sets spin button: store its integer value."""
    self._options['pert_val_sets'] = sb.get_value_as_int()
def on_conf_toggled(self, tb):
    """Callback for the confidence toggle.

    An active toggle switches 'calc_conf' off and disables the subset
    frame; an inactive toggle switches both on.
    """
    # NOTE(review): polarity is the opposite of cv_toggled/pert_toggled;
    # presumably this is wired to a "none" style button -- confirm
    # against the glade file.
    calc_conf = not tb.get_active()
    self._options['calc_conf'] = calc_conf
    self.wTree.get_widget("subset_frame").set_sensitive(calc_conf)
class PlsOptionsDialog(OptionsDialog): class PlsOptionsDialog(OptionsDialog):
"""Options dialog for Partial Least Squares Regression. """Options dialog for Partial Least Squares Regression.
""" """
def __init__(self, data, options, input_names=['X', 'Y']): def __init__(self, data, options, input_names=['X', 'Y']):
OptionsDialog.__init__(self, data, options, input_names) OptionsDialog.__init__(self, data, options, input_names)

View File

@ -12,8 +12,9 @@ fixme2:
colorbar, but when adding colors the colorbar shoud be created. colorbar, but when adding colors the colorbar shoud be created.
""" """
from fluents import plots from fluents import plots
from scipy import dot,sum,diag,arange,log,mean,newaxis from scipy import dot,sum,diag,arange,log,mean,newaxis,sqrt
from matplotlib import cm from matplotlib import cm
import pylab as PB
class PcaScorePlot(plots.ScatterPlot): class PcaScorePlot(plots.ScatterPlot):
"""PCA Score plot""" """PCA Score plot"""
@ -27,10 +28,10 @@ class PcaScorePlot(plots.ScatterPlot):
id_2, = dataset_1.get_identifiers(sel_dim, [ordi]) id_2, = dataset_1.get_identifiers(sel_dim, [ordi])
plots.ScatterPlot.__init__(self, dataset_1, dataset_2, id_dim, sel_dim, id_1, id_2 ,c='b' ,s=40 , name='pca-scores') plots.ScatterPlot.__init__(self, dataset_1, dataset_2, id_dim, sel_dim, id_1, id_2 ,c='b' ,s=40 , name='pca-scores')
def set_absicca(self,n): def set_absicca(self, n):
self.xaxis_data = self._T[:,n] self.xaxis_data = self._T[:,n]
def set_ordinate(self,n): def set_ordinate(self, n):
self.yaxis_data = self._T[:,n] self.yaxis_data = self._T[:,n]
class PcaLoadingPlot(plots.ScatterPlot): class PcaLoadingPlot(plots.ScatterPlot):
@ -50,15 +51,15 @@ class PcaLoadingPlot(plots.ScatterPlot):
col = 'g' col = 'g'
plots.ScatterPlot.__init__(self, dataset_1, dataset_2, id_dim, sel_dim, id_1, id_2,c=col,s=20, name='pls-loadings') plots.ScatterPlot.__init__(self, dataset_1, dataset_2, id_dim, sel_dim, id_1, id_2,c=col,s=20, name='pls-loadings')
def set_absicca(self,n): def set_absicca(self, n):
self.xaxis_data = self._P[:,n] self.xaxis_data = self._P[:,n]
def set_ordinate(self,n): def set_ordinate(self, n):
self.yaxis_data = self._P[:,n] self.yaxis_data = self._P[:,n]
class PlsScorePlot(plots.ScatterPlot): class PlsScorePlot(plots.ScatterPlot):
"""PLS Score plot""" """PLS Score plot"""
def __init__(self,model, absi=0, ordi=1): def __init__(self, model, absi=0, ordi=1):
self._T = model.model['T'] self._T = model.model['T']
dataset_1 = model.as_dataset('T') dataset_1 = model.as_dataset('T')
dataset_2 = dataset_1 dataset_2 = dataset_1
@ -71,16 +72,16 @@ class PlsScorePlot(plots.ScatterPlot):
id_dim, sel_dim, id_1, id_2 , id_dim, sel_dim, id_1, id_2 ,
c='b' ,s=40 , name='pls-scores') c='b' ,s=40 , name='pls-scores')
def set_absicca(self,n): def set_absicca(self, n):
self.xaxis_data = self._T[:,n] self.xaxis_data = self._T[:,n]
def set_ordinate(self,n): def set_ordinate(self, n):
self.yaxis_data = self._T[:,n] self.yaxis_data = self._T[:,n]
class PlsLoadingPlot(plots.ScatterPlot): class PlsLoadingPlot(plots.ScatterPlot):
"""PLS Loading plot""" """PLS Loading plot"""
def __init__(self,model,absi=0,ordi=1): def __init__(self, model, absi=0, ordi=1):
self._P = model.model['P'] self._P = model.model['P']
dataset_1 = model.as_dataset('P') dataset_1 = model.as_dataset('P')
dataset_2 = dataset_1 dataset_2 = dataset_1
@ -97,44 +98,46 @@ class PlsLoadingPlot(plots.ScatterPlot):
id_dim, sel_dim, id_1, id_2, id_dim, sel_dim, id_1, id_2,
c=col, s=20, name='loadings') c=col, s=20, name='loadings')
def set_absicca(self,n): def set_absicca(self, n):
self.xaxis_data = self._P[:,n] self.xaxis_data = self._P[:,n]
def set_ordinate(self,n): def set_ordinate(self, n):
self.yaxis_data = self._T[:,n] self.yaxis_data = self._T[:,n]
class LineViewXc(plots.LineViewPlot): class LineViewXc(plots.LineViewPlot):
"""A line view of centered raw data """A line view of centered raw data
""" """
def __init__(self, func_class, name='Profiles'): def __init__(self, model, name='Profiles'):
# copy, center, plot # copy, center, plot
x = func_class._dataset['X'].copy() x = model._dataset['X'].copy()
x._array = x._array - mean(x._array,0)[newaxis] x._array = x._array - mean(x._array,0)[newaxis]
plots.LineViewPlot.__init__(self, x, 1, None, name) plots.LineViewPlot.__init__(self, x, 1, None, name)
class ParalellCoordinates(plots.Plot): class ParalellCoordinates(plots.Plot):
"""Parallell coordinates for score loads with many comp. """Parallell coordinates for score loads with many comp.
""" """
def __init__(self,model, p = 'loads'): def __init__(self, model, p='loads'):
pass pass
class PlsQvalScatter(plots.ScatterPlot): class PlsQvalScatter(plots.ScatterPlot):
"""A vulcano like plot of loads vs qvals """A vulcano like plot of loads vs qvals
""" """
def __init__(self, func_class, pc=0): def __init__(self, model, pc=0):
model = func_class.model if not model.model.has_key('w_tsq'):
if not model.has_key('w_tsq'):
return return
self._W = model['P'] self._W = model.model['P']
dataset_1 = func_class.as_dataset('P') dataset_1 = model.as_dataset('P')
dataset_2 = func_class.as_dataset('w_tsq') dataset_2 = model.as_dataset('w_tsq')
id_dim = dataset_1.get_dim_name(0) #genes id_dim = dataset_1.get_dim_name(0) #genes
sel_dim = dataset_1.get_dim_name(1) #_comp sel_dim = dataset_1.get_dim_name(1) #_comp
sel_dim_2 = dataset_2.get_dim_name(1) #_zero_dim sel_dim_2 = dataset_2.get_dim_name(1) #_zero_dim
id_1, = dataset_1.get_identifiers(sel_dim, [0]) id_1, = dataset_1.get_identifiers(sel_dim, [0])
id_2, = dataset_2.get_identifiers(sel_dim_2, [0]) id_2, = dataset_2.get_identifiers(sel_dim_2, [0])
if model.has_key('w_tsq'): if model.model.has_key('w_tsq'):
col = model['w_tsq'].ravel() col = model.model['w_tsq'].ravel()
col = normalise(col) col = normalise(col)
else: else:
col = 'g' col = 'g'
@ -143,6 +146,33 @@ class PlsQvalScatter(plots.ScatterPlot):
c=col, s=20, sel_dim_2=sel_dim_2, c=col, s=20, sel_dim_2=sel_dim_2,
name='Load Volcano') name='Load Volcano')
class PredictionErrorPlot(plots.Plot):
    """A boxplot of prediction error vs. comp. number.

    Draws a boxplot of sqrt(model.model['sep']) and marks
    model.model['aopt'] with a wide, half transparent vertical line.
    """
    def __init__(self, model, name="Pred. Err."):
        """Build the plot from a fitted model.

        model -- object whose ``model`` dict holds 'sep' and 'aopt'
        name  -- plot title passed on to plots.Plot

        If no 'sep' values are available a notice is logged and the
        constructor returns without initialising the base class.
        """
        # The 'sep' key may be present but hold None when no validation
        # was run, so testing for the key alone is not enough.
        sep = model.model.get('sep')
        if sep is None:
            # imported here: this module has no top-level logger import
            from fluents import logger
            logger.log('notice', 'Model has no calculations of sep')
            # NOTE(review): the instance is left uninitialised here;
            # callers presumably have to cope with that -- confirm.
            return
        plots.Plot.__init__(self, name)
        self._frozen = True
        self.current_dim = 'johndoe'
        self.ax = self.fig.add_subplot(111)

        # draw
        aopt = model.model['aopt']
        self.ax.boxplot(sqrt(sep))
        self.ax.axvline(aopt, linewidth=10,
                        color='r', zorder=0,
                        alpha=.5)

        # add canvas
        self.add(self.canvas)
        self.canvas.show()

    def set_current_selection(self, selection):
        """Selections are ignored by this plot."""
        pass
class InfluencePlot(plots.ScatterPlot): class InfluencePlot(plots.ScatterPlot):
""" """

View File

@ -1,6 +1,7 @@
from scipy import apply_along_axis,newaxis,zeros,\ from scipy import apply_along_axis,newaxis,zeros,\
median,round_,nonzero,dot,argmax,any,sqrt,ndarray,\ median,round_,nonzero,dot,argmax,any,sqrt,ndarray,\
trace,zeros_like,sign,sort,real,argsort,rand,array trace,zeros_like,sign,sort,real,argsort,rand,array,\
matrix
from scipy.linalg import norm,svd,inv,eig from scipy.linalg import norm,svd,inv,eig
from scipy.stats import median,mean from scipy.stats import median,mean
@ -106,3 +107,7 @@ def mat_center(X,axis=0,ret_mn=False):
return Xs,mnX return Xs,mnX
else: else:
return Xs return Xs
def m_shape(array):
    """Returns the shape *array* has once converted to a numpy.matrix."""
    as_matrix = matrix(array)
    return as_matrix.shape

View File

@ -1,4 +1,3 @@
"""Module contain algorithms for (burdensome) calculations. """Module contain algorithms for (burdensome) calculations.
There is no typechecking of any kind here, just focus on speed There is no typechecking of any kind here, just focus on speed
@ -7,7 +6,7 @@ There is no typechecking of any kind here, just focus on speed
from scipy.linalg import svd,norm,inv,pinv,qr from scipy.linalg import svd,norm,inv,pinv,qr
from scipy import dot,empty,eye,newaxis,zeros,sqrt,diag,\ from scipy import dot,empty,eye,newaxis,zeros,sqrt,diag,\
apply_along_axis,mean,ones,randn,empty_like,outer,c_,\ apply_along_axis,mean,ones,randn,empty_like,outer,c_,\
rand,sum,cumsum rand,sum,cumsum,matrix
def pca(a, aopt, scale='scores', mode='normal'): def pca(a, aopt, scale='scores', mode='normal'):
""" Principal Component Analysis model """ Principal Component Analysis model
@ -17,8 +16,9 @@ def pca(a, aopt, scale='scores', mode='normal'):
-- detailed : returns all model params and all residuals -- detailed : returns all model params and all residuals
""" """
m,n = a.shape m, n = a.shape
u,s,vt = svd(a, full_matrices=0) u, s, vt = svd(a, full_matrices=0)
eigvals = (1./m)*s
T = u*s T = u*s
T = T[:,:aopt] T = T[:,:aopt]
P = vt[:aopt,:].T P = vt[:aopt,:].T
@ -43,17 +43,40 @@ def pca(a, aopt, scale='scores', mode='normal'):
return {'T':T, 'P':P, 'E':E} return {'T':T, 'P':P, 'E':E}
def pcr(a, b, aopt=2, scale='scores', mode='normal'):
    """Returns Principal component regression model.

    input:
      -- a, data matrix (m x n)
      -- b, response matrix (m x l), or a vector of length m which is
         treated as a single column
      -- aopt, number of components
      -- scale, mode, accepted but not used by this function

    output: dict with
      -- T, scores (m x aopt)
      -- P, loadings (n x aopt)
      -- Q, regression of b on the scores (l x aopt)
      -- B, regression coefficients using 1..aopt components
         (aopt x n x l)
      -- E, F, residuals of a and b with aopt components
    """
    n = a.shape[1]
    # A 1-D response is promoted to one column so that every product
    # below is 2-D and fits the (n x l) slices of B.
    if len(b.shape) == 1:
        b = b.reshape(-1, 1)
    l = b.shape[1]
    B = empty((aopt, n, l))
    # Economy-size svd: the full-size U is (m x m) and cannot be scaled
    # by the min(m, n) singular values when a has more rows than columns.
    U, s, Vt = svd(a, full_matrices=False)
    T = U*s
    T = T[:,:aopt]
    P = Vt[:aopt,:].T
    Q = dot(dot(inv(dot(T.T, T)), T.T), b).T
    for i in range(aopt):
        ti = T[:,:i+1]
        r = dot(dot(inv(dot(ti.T, ti)), ti.T), b)
        B[i] = dot(P[:,:i+1], r)
    E = a - dot(T, P.T)
    F = b - dot(T, Q.T)

    return {'T':T, 'P':P, 'Q':Q, 'B':B, 'E':E, 'F':F}
def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None): def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
"""Kernel pls for tall/wide matrices. """Kernel pls for tall/wide matrices.
Fast pls for calibration. Only inefficient for many Y-vars. Fast pls for calibration. Only inefficient for many Y-vars.
""" """
m,n = a.shape m, n = a.shape
if ab!=None: if ab!=None:
mm,l = ab.shape mm, l = m_shape(ab)
else: else:
k,l = b.shape k, l = m_shape(b)
W = empty((n, aopt)) W = empty((n, aopt))
P = empty((n, aopt)) P = empty((n, aopt))
@ -66,10 +89,10 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
ab = dot(a.T, b) ab = dot(a.T, b)
for i in range(aopt): for i in range(aopt):
if ab.shape[1]==1: if ab.shape[1]==1:
w = ab w = ab.reshape(mm, l)
else: else:
u,s,vh = svd(dot(ab.T, ab)) u, s, vh = svd(dot(ab.T, ab))
w = dot(ab,u[:,:1]) w = dot(ab, u[:,:1])
w = w/norm(w) w = w/norm(w)
r = w.copy() r = w.copy()
@ -99,9 +122,9 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
if mode=='detailed': if mode=='detailed':
E = empty((aopt, m, n)) E = empty((aopt, m, n))
F = empty((aopt, k, l)) F = empty((aopt, k, l))
for i in range(1,aopt+1,1): for i in range(1, aopt+1, 1):
E[i-1] = a - dot(T[:,:i],P[:,:i].T) E[i-1] = a - dot(T[:,:i], P[:,:i].T)
F[i-1] = b - dot(T[:,:i],Q[:,:i].T) F[i-1] = b - dot(T[:,:i], Q[:,:i].T)
else: else:
E = a - dot(T[:,:aopt], P[:,:aopt].T) E = a - dot(T[:,:aopt], P[:,:aopt].T)
F = b - dot(T[:,:aopt], Q[:,:aopt].T) F = b - dot(T[:,:aopt], Q[:,:aopt].T)
@ -121,17 +144,17 @@ def w_simpls(aat, b, aopt):
There is no P,W. T is normalised There is no P,W. T is normalised
""" """
bb = b.copy() bb = b.copy()
m,m = aat.shape m, m = aat.shape
U = empty((m, aopt)) U = empty((m, aopt))
T = empty((m, aopt)) T = empty((m, aopt))
H = empty((m, aopt)) #just like W in simpls H = empty((m, aopt)) #just like W in simpls
PROJ = empty((m, aopt)) #just like R in simpls PROJ = empty((m, aopt)) #just like R in simpls
for i in range(aopt): for i in range(aopt):
u,s,vh = svd(dot(dot(b.T, aat), b), full_matrices=0) u, s, vh = svd(dot(dot(b.T, aat), b), full_matrices=0)
u = dot(b, u[:,:1]) #y-factor scores u = dot(b, u[:,:1]) #y-factor scores
U[:,i] = u.ravel() U[:,i] = u.ravel()
t =dot(aat, u) t = dot(aat, u)
t = t/norm(t) t = t/norm(t)
T[:,i] = t.ravel() T[:,i] = t.ravel()
h = dot(aat, t) #score-weights h = dot(aat, t) #score-weights
@ -141,19 +164,18 @@ def w_simpls(aat, b, aopt):
b = b - dot(PROJ[:,:i+1], dot(H[:,:i+1].T,b) ) b = b - dot(PROJ[:,:i+1], dot(H[:,:i+1].T,b) )
C = dot(bb.T, T) C = dot(bb.T, T)
return {'T':T,'U':U,'Q':C,'H':H} return {'T':T, 'U':U, 'Q':C, 'H':H}
def bridge(a, b, aopt, scale='scores', mode='normal', r=0): def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
"""Undeflated Ridged svd(X'Y) """Undeflated Ridged svd(X'Y)
""" """
m, n = a.shape m, n = a.shape
k, l = b.shape k, l = m_shape(b)
u,s,vt = svd(b, full_matrices=0) u, s, vt = svd(b, full_matrices=0)
g0 = dot(u*s, u.T) g0 = dot(u*s, u.T)
g = (1 - r)*g0 + r*eye(m) g = (1 - r)*g0 + r*eye(m)
ag = dot(a.T, g) ag = dot(a.T, g)
u, s, vt = svd(ag, full_matrices=0)
u,s,vt = svd(ag, full_matrices=0)
W = u[:,:aopt] W = u[:,:aopt]
K = vt[:aopt,:].T K = vt[:aopt,:].T
T = dot(a, W) T = dot(a, W)
@ -166,8 +188,8 @@ def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
return {'T':T, 'W':W} return {'T':T, 'W':W}
U = dot(g0, K) #fixme check this U = dot(g0, K) #fixme check this
Q = dot(b.T, dot(T, inv(dot(T.T,T)) )) Q = dot(b.T, dot(T, inv(dot(T.T, T)) ))
B = zeros((aopt, n, l)) B = zeros((aopt, n, l), dtype='f')
for i in range(aopt): for i in range(aopt):
B[i] = dot(W[:,:i+1], Q[:,:i+1].T) B[i] = dot(W[:,:i+1], Q[:,:i+1].T)
# leverages # leverages
@ -198,3 +220,6 @@ def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
return {'B':B, 'W':W, 'T':T, 'Q':Q, 'E':E, 'F':F, 'U':U, 'P':W} return {'B':B, 'W':W, 'T':T, 'Q':Q, 'E':E, 'F':F, 'U':U, 'P':W}
def m_shape(array):
    """Returns the shape *array* has once converted to a matrix."""
    as_matrix = matrix(array)
    return as_matrix.shape

View File

@ -201,16 +201,16 @@ def node_weighted_adj_matrix(G, weights=None, ave_type='harmonic', with_labels=F
def weighted_adj_matrix(G, with_labels=False): def weighted_adj_matrix(G, with_labels=False):
"""Adjacency matrix of an XGraph whos weights are given in edges. """Adjacency matrix of an XGraph whos weights are given in edges.
""" """
A,labels = NX.adj_matrix(G,with_labels=True) A, labels = NX.adj_matrix(G, with_labels=True)
W = A.astype('<f8') W = A.astype('<f8')
for orf,i in labels.items(): for orf, i in labels.items():
for orf2,j in labels.items(): for orf2, j in labels.items():
if G.has_edge(orf,orf2): if G.has_edge(orf, orf2):
edge_weight = G.get_edge(orf,orf2) edge_weight = G.get_edge(orf, orf2)
W[i,j]=edge_weight W[i,j] = edge_weight
W[j,i]=edge_weight W[j,i] = edge_weight
if with_labels==True: if with_labels==True:
return W,labels return W, labels
else: else:
return W return W
@ -418,6 +418,31 @@ def weighted_laplacian(G,with_labels=False):
else: else:
return L return L
def subnetworks(G, T2):
    """Return the highest scoring (T2-test) subgraph of G.

    Use simulated annealing to identify highly scoring subgraphs.

    ref: -- Ideker et.al (Bioinformatics 18, 2002)
         -- Patil and Nielsen (PNAS 2006)

    NOTE(review): this looks unfinished -- the function has no return
    statement and the acceptance step below is only a stub.
    """
    # number of iterations
    N = 1000
    # (node, selected) pairs; every node starts unselected
    states = [(node, False) for node in G.nodes()]
    t2_last = 0.0
    for i in xrange(N):
        if i==0: #assign random states
            # NOTE(review): this filters the list, keeping a random part
            # of the nodes all marked True; the others are dropped, not
            # marked False -- confirm this is intended.
            states = [(state[0], True) for state in states if rand(1)>.5]
        sub_nodes = [state[0] for state in states if state[1]]
        Gsub = NX.subgraph(G, sub_nodes)
        # only the first connected component is kept
        Gsub = NX.connected_components_subgraphs(Gsub)[0]
        # t2 is a list of per-node values looked up in T2
        t2 = [T2[node] for node in Gsub]
        # NOTE(review): compares a list with the float t2_last, and
        # t2_last is never updated -- presumably an aggregate score was
        # meant; confirm.
        if t2>t2_last:
            pass
        else:
            # NOTE(review): exp() is called without an argument and
            # 'numpy' must be importable by name in this module -- verify.
            p = numpy.exp()
"""Below are methods for calculating graph metrics """Below are methods for calculating graph metrics
@ -473,7 +498,7 @@ Ke = expm(A) .... expm(-A)?
# 13.09.2206: update for use in numpy # 13.09.2206: update for use in numpy
def K_expAdj(W, normalised=False, alpha=1.0): def K_expAdj(W, normalised=True, alpha=1.0):
"""Matrix exponential of adjacency matrix, mentioned in Kandola as a general diffusion kernel. """Matrix exponential of adjacency matrix, mentioned in Kandola as a general diffusion kernel.
""" """
W = asarray(W) W = asarray(W)
@ -499,7 +524,7 @@ def K_expAdj(W, normalised=False, alpha=1.0):
return dot(dot(vr,psigma),vri) return dot(dot(vr,psigma),vri)
def K_vonNeumann(W,normalised=False,alpha=1.0): def K_vonNeumann(W, normalised=True, alpha=1.0):
""" The geometric series of path lengths. """ The geometric series of path lengths.
Returns matrix square root of pseudo inverse of the adjacency matrix. Returns matrix square root of pseudo inverse of the adjacency matrix.
""" """
@ -540,8 +565,8 @@ def K_laplacian(W, normalised=True, alpha=1.0):
D = diag(sum(W,0)) D = diag(sum(W,0))
L = D - W L = D - W
if normalised==True: if normalised==True:
T = diag(sqrt(1./sum(W,0))) T = diag(sqrt(1./sum(W, 0)))
L = dot(dot(T,L),T) L = dot(dot(T, L), T)
e,vr = eig(L) e,vr = eig(L)
e = real(e) e = real(e)
vri = inv(vr) vri = inv(vr)
@ -551,7 +576,7 @@ def K_laplacian(W, normalised=True, alpha=1.0):
for i in range(len(e)): for i in range(len(e)):
if e[i] > cutoff: if e[i] > cutoff:
psigma[i] = 1.0/e[i] psigma[i] = 1.0/e[i]
K = dot(dot(vr,diag(psigma)),vri).astype(t) K = dot(dot(vr, diag(psigma)), vri).astype(t)
K = real(K) K = real(K)
I = eye(n) I = eye(n)
K = (1-alpha)*I + alpha*K K = (1-alpha)*I + alpha*K

View File

@ -30,22 +30,14 @@ def w_pls_gen(aat,b,n_blocks=None,center=True,index_out=False):
b_in = b[inn,:] b_in = b[inn,:]
b_out = b[out,:] b_out = b[out,:]
if center: if center:
# centering projector: I - (1/n)11' aat_in, mn = outerprod_centering(aat_in)
# nin = len(inn) aat_out = aat_out - mn
# Pc = eye(nin) - outer(ones((nin,)),ones((nin,)))/nin
# xxt - x( outer(ones((nin,)),ones((nin,)))/nin ) x.T
# de jong:
h = sum(aat_in,0)[ :,newaxis]
h = (h - mean(h)/2)/len(inn)
mn_a = h + h.T
aat_in = aat_in - mn_a
if index_out: if index_out:
yield aat_in,aat_out,b_in,b_out,out yield aat_in,aat_out,b_in,b_out,out
else: else:
yield aat_in,aat_out,b_in,b_out yield aat_in,aat_out,b_in,b_out
def pls_gen(a,b, n_blocks=None, center=False, index_out=False,axis=0): def pls_gen(a, b, n_blocks=None, center=False, index_out=False,axis=0, metric=None):
"""Random block crossvalidation """Random block crossvalidation
Leave-one-out is a subset, with n_blocks equals a.shape[-1] Leave-one-out is a subset, with n_blocks equals a.shape[-1]
""" """
@ -56,17 +48,38 @@ def pls_gen(a,b, n_blocks=None, center=False, index_out=False,axis=0):
out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_blocks)] out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_blocks)]
for out in out_ind_sets: for out in out_ind_sets:
inn = [i for i in index if i not in out] inn = [i for i in index if i not in out]
acal = a.take(inn, 0)
atrue = a.take(out, 0)
bcal = b.take(inn, 0)
btrue = b.take(out, 0)
if center: if center:
a = a - mean(a,0)[newaxis] mn_a = acal.mean(0)[newaxis]
b = b - mean(b,0)[newaxis] acal = acal - mn_a
atrue = atrue - mn_a
mn_b = bcal.mean(0)[newaxis]
bcal = bcal - mn_b
btrue = btrue - mn_b
if metric!=None:
acal = dot(acal, metric)
if index_out: if index_out:
yield a.take(inn,0),a.take(out,0), b.take(inn,0),b.take(out,0),out yield acal, atrue, bcal, btrue, out
else: else:
yield a.take(inn,0),a.take(out,0), b.take(inn,0),b.take(out,0) yield acal, atrue, bcal, btrue
def pca_gen(a,n_sets=None, center=False, index_out=False,axis=0): def pca_gen(a, n_sets=None, center=False, index_out=False, axis=0):
"""PCA random block crossval generator. """Returns a generator of crossvalidation sample segments.
input:
-- a, data matrix (m x n)
-- n_sets, number of segments/subsets to generate.
-- center, bool, choice of centering each subset
-- index_out, bool, return subset index
-- axis, int, which axis to get subset from
output:
-- V, generator with (n_sets) members (subsets)
""" """
m = a.shape[axis] m = a.shape[axis]
index = randperm(m) index = randperm(m)
@ -76,21 +89,26 @@ def pca_gen(a,n_sets=None, center=False, index_out=False,axis=0):
out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_sets)] out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_sets)]
for out in out_ind_sets: for out in out_ind_sets:
inn = [i for i in index if i not in out] inn = [i for i in index if i not in out]
acal = a.take(inn, 0)
atrue = a.take(out, 0)
if center: if center:
a = a - mean(a,0)[newaxis] mn_a = acal.mean(0)[newaxis]
acal = acal - mn_a
atrue = atrue - mn_a
if index_out: if index_out:
yield a.take(inn,0),a.take(out,0),out yield acal, atrue, out
else: else:
yield a.take(inn,0),a.take(out,0) yield acal, atrue
def w_pls_gen_jk(a,b,n_sets=None,center=True,index_out=False,axis=0): def w_pls_gen_jk(a, b, n_sets=None, center=True,
index_out=False, axis=0):
"""Random block crossvalidation for wide X (m>>n) """Random block crossvalidation for wide X (m>>n)
Leave-one-out is a subset, with n_sets equals a.shape[-1] Leave-one-out is a subset, with n_sets equals a.shape[-1]
Returns : X_m and X_m'Y_m Returns : X_m and X_m'Y_m
""" """
m = a.shape[axis] m = a.shape[axis]
ab = dot(a.T,b) ab = dot(a.T, b)
index = randperm(m) index = randperm(m)
if n_sets==None: if n_sets==None:
n_sets = m n_sets = m
@ -103,19 +121,18 @@ def w_pls_gen_jk(a,b,n_sets=None,center=True,index_out=False,axis=0):
a_in = a[inn,:] a_in = a[inn,:]
mn_a = 0 mn_a = 0
mAB = 0 mAB = 0
if center: if center:
mn_a = mean(a,0)[newaxis] mn_a = a_in.mean(0)[newaxis]
mAin = dot(-ones((1,nout)),a[out,:])/nin mAin = dot(-ones((1,nout)), a[out,:])/nin
mBin = dot(-ones((1,nout)),b[out,:])/nin mBin = dot(-ones((1,nout)), b[out,:])/nin
mAB = dot(mAin.T,(mBin*nin)) mAB = dot(mAin.T, (mBin*nin))
ab_in = ab - dot(a[out,].T,b[out,:]) - mAB ab_in = ab - dot(a[out,].T, b[out,:]) - mAB
a_in = a_in - mn_a a_in = a_in - mn_a
if index_out: if index_out:
yield ain,ab, out yield a_in, ab_in, out
else: else:
yield a_in, ab yield a_in, ab_in
def shuffle_1d_block(a, n_sets=None, blocks=None, index_out=False, axis=0): def shuffle_1d_block(a, n_sets=None, blocks=None, index_out=False, axis=0):
"""Random block shuffling along 1d axis """Random block shuffling along 1d axis
@ -185,3 +202,19 @@ def diag_pert(a, n_sets=10, center=True, index_out=False):
yield a_out, asarray(out) yield a_out, asarray(out)
else: else:
yield a_out yield a_out
def outerprod_centering(aat, ret_mn=True):
    """Returns the double centered version of an outer product matrix.

    input:
        -- aat, square outer product matrix, e.g. dot(a, a.T)
        -- ret_mn, bool, also return the column means of the uncentered aat
    output:
        -- aatc, aat - (h + h.T), where h is the column vector
           (column sums of aat - half their mean)/n. For a symmetric
           aat = dot(a, a.T) this equals dot(ac, ac.T) with
           ac = a - a.mean(0), without ever forming ac.
        -- mn, aat.mean(0) (only returned when ret_mn is True)
    """
    n_rows = aat.shape[0]
    col_sums = aat.sum(0)[:, newaxis]
    # Half of the grand-mean term goes into each of h and h.T, so that their
    # sum carries it exactly once.
    half_offset = (col_sums - mean(col_sums)/2)/n_rows
    centered = aat - (half_offset + half_offset.T)
    if not ret_mn:
        return centered
    return centered, aat.mean(0)

View File

@ -1,30 +1,33 @@
"""This module implements some common validation schemes from pca and pls.
"""
from scipy import ones,mean,sqrt,dot,newaxis,zeros,sum,empty,\ from scipy import ones,mean,sqrt,dot,newaxis,zeros,sum,empty,\
apply_along_axis,eye, kron apply_along_axis,eye,kron,array,sort
from scipy.stats import median
from scipy.linalg import triu,inv,svd,norm from scipy.linalg import triu,inv,svd,norm
from select_generators import w_pls_gen,w_pls_gen_jk,pls_gen,pca_gen,diag_pert from select_generators import w_pls_gen,w_pls_gen_jk,pls_gen,pca_gen,diag_pert
from engines import w_simpls,pls, bridge,pca from engines import w_simpls,pls,bridge,pca
from pylab import * from cx_utils import m_shape
def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'): def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'):
"""RMSEP calc for pls with wide X. """Returns and RMSEP for pls tailored for wide X.
""" """
k, l = Y.shape k, l = m_shape(Y)
PRESS = zeros((l, amax+1), dtype='f') PRESS = zeros((l, amax+1), dtype='f')
# X,Y are centered # X,Y are centered0
if n_blocks==None: if n_blocks==None:
n_blocks = Y.shape[0] n_blocks = Y.shape[0]
V = w_pls_gen(dot(X, X.T), Y, n_blocks=n_blocks, center=True) XXt = dot(X, X.T)
V = w_pls_gen(XXt, Y, n_blocks=n_blocks, center=True)
for Din, Doi, Yin, Yout in V: for Din, Doi, Yin, Yout in V:
ym = -sum(Yout, 0)[newaxis]/(1.0*Yin.shape[0]) ym = -sum(Yout, 0)[newaxis]/(1.0*Yin.shape[0])
Yin = Yin - ym Yin = Yin - ym
PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0) PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)
if algo=='simpls': if algo=='simpls':
dat = w_simpls(Din, Yin, amax) dat = w_simpls(Din, Yin, amax)
Q,U,H = dat['Q'], dat['U'], dat['H'] Q, U, H = dat['Q'], dat['U'], dat['H']
That = dot(Doi, dot(U, inv(triu(dot(H.T,U))) )) That = dot(Doi, dot(U, inv(triu(dot(H.T,U))) ))
else: else:
"Other algo-support comming soon"
raise NotImplementedError raise NotImplementedError
#Yhat = empty((amax, k, l),dtype='<f8') #Yhat = empty((amax, k, l),dtype='<f8')
Yhat = [] Yhat = []
@ -34,13 +37,14 @@ def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'):
E = E + sum(E, 0)/Din.shape[0] E = E + sum(E, 0)/Din.shape[0]
PRESS[j,1:] = PRESS[j,1:] + sum(E**2, 0) PRESS[j,1:] = PRESS[j,1:] + sum(E**2, 0)
#Yhat = Y - dot(That,Q.T) #Yhat = Y - dot(That,Q.T)
return sqrt(PRESS/Y.shape[0]) rmsep = sqrt(PRESS/Y.shape[0])
aopt = find_aopt_from_sep(rmsep)
return rmsep, aopt
def pls_val(X, Y, amax=2, n_blocks=10,algo='pls'): def pls_val(X, Y, amax=2, n_blocks=10,algo='pls'):
""" Validation results of pls model. """ Validation results of pls model.
""" """
k, l = m_shape(Y)
k, l = Y.shape
PRESS = zeros((l, amax+1), dtype='<f8') PRESS = zeros((l, amax+1), dtype='<f8')
EE = zeros((amax, k, l), dtype='<f8') EE = zeros((amax, k, l), dtype='<f8')
Yhat = zeros((amax, k, l), dtype='<f8') Yhat = zeros((amax, k, l), dtype='<f8')
@ -50,6 +54,7 @@ def pls_val(X, Y, amax=2, n_blocks=10,algo='pls'):
ym = -sum(Yout,0)[newaxis]/Yin.shape[0] ym = -sum(Yout,0)[newaxis]/Yin.shape[0]
Yin = (Yin - ym) Yin = (Yin - ym)
PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0) PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)
if algo=='pls': if algo=='pls':
dat = pls(Xin, Yin, amax, mode='normal') dat = pls(Xin, Yin, amax, mode='normal')
elif algo=='bridge': elif algo=='bridge':
@ -62,9 +67,11 @@ def pls_val(X, Y, amax=2, n_blocks=10,algo='pls'):
EE[a,out,:] = E EE[a,out,:] = E
PRESS[:,a+1] = PRESS[:,a+1] + sum(E**2,0) PRESS[:,a+1] = PRESS[:,a+1] + sum(E**2,0)
return sqrt(PRESS/(k-1.)), EE, Yhat rmsep = sqrt(PRESS/(k-1.))
aopt = find_aopt_from_sep(rmsep)
return rmsep, aopt
def pca_alter_val(a, amax, n_sets=10,method='diag'): def pca_alter_val(a, amax, n_sets=10, method='diag'):
"""Pca validation by altering elements in X. """Pca validation by altering elements in X.
""" """
# todo: it is just as easy to do jk-estimates her as well # todo: it is just as easy to do jk-estimates her as well
@ -79,18 +86,27 @@ def pca_alter_val(a, amax, n_sets=10,method='diag'):
EE = a_sub - Xhat.ravel().take(ind) EE = a_sub - Xhat.ravel().take(ind)
tot = (a_sub**2).sum() tot = (a_sub**2).sum()
sep[i,j] = (EE**2).sum()/tot sep[i,j] = (EE**2).sum()/tot
return sqrt(sep.mean(0)) sep = sqrt(sep)
#return sep aopt = find_aopt_from_sep(sep)
return sep, aopt
def pca_cv_val(X, amax, n_sets): def pca_cv_val(a, amax, n_sets):
""" Cross validation of pca using random sets crossval. """ Returns PRESS from cross-validated pca using random segments.
input:
-- a, data matrix (m x n)
-- amax, maximum number of components used
-- n_sets, number of segments to calculate
output:
-- sep, (amax x m x n), squared error of prediction (press)
-- aopt, guestimated optimal number of components
""" """
m, n = X.shape m, n = a.shape
xtot = (X**2).sum()
V = pca_gen(X, n_sets=7, center=True, index_out=True)
E = empty((amax, m, n), dtype='f') E = empty((amax, m, n), dtype='f')
for xi,xout,ind in V: xtot = (a**2).sum() # this needs centering
dat_i = pca(xi, amax, mode='detailed') V = pca_gen(a, n_sets=7, center=True, index_out=True)
for xi, xout, ind in V:
dat_i = pca(xi, amax, mode='fast')
Pi = dat_i['P'] Pi = dat_i['P']
for a in xrange(amax): for a in xrange(amax):
Pia = Pi[:,:a+1] Pia = Pi[:,:a+1]
@ -99,7 +115,9 @@ def pca_cv_val(X, amax, n_sets):
sep = [] sep = []
for a in xrange(amax): for a in xrange(amax):
sep.append(E[a].sum()/xtot) sep.append(E[a].sum()/xtot)
return sqrt(sep.mean(0)) sep = array(sep)
aopt = find_aopt_from_sep(sep)
return sep, aopt
def pls_jkW(a, b, amax, n_blocks=None, algo='pls', use_pack=True): def pls_jkW(a, b, amax, n_blocks=None, algo='pls', use_pack=True):
""" Returns CV-segments of paramter W for wide X. """ Returns CV-segments of paramter W for wide X.
@ -128,7 +146,20 @@ def pls_jkW(a, b, amax, n_blocks=None, algo='pls', use_pack=True):
return WW return WW
def pca_jkP(a, aopt, n_blocks=None): def pca_jkP(a, aopt, n_blocks=None):
""" Returns CV-segments of paramter P. """Returns loading from PCA on CV-segments.
input:
-- a, data matrix (n x m)
-- aopt, number of components in model.
-- nblocks, number of segments
output:
-- PP, loadings collected in a three way matrix
(n_segments, m, aopt)
comments:
* The loadings are scaled with the (1/samples)*eigenvalues.
* Crossvalidation method is currently set to random blocks of samples.
todo: add support for T todo: add support for T
fixme: more efficient to add this in validation loop fixme: more efficient to add this in validation loop
""" """
@ -138,8 +169,30 @@ def pca_jkP(a, aopt, n_blocks=None):
PP = empty((n_blocks, a.shape[1], aopt), dtype='f') PP = empty((n_blocks, a.shape[1], aopt), dtype='f')
V = pca_gen(a, n_sets=n_blocks, center=True) V = pca_gen(a, n_sets=n_blocks, center=True)
for nn,(a_in, a_out) in enumerate(V): for nn,(a_in, a_out) in enumerate(V):
dat = pca(a_in, aopt, mode='fast') dat = pca(a_in, aopt, mode='fast', scale='loads')
P = dat['P'] P = dat['P']
PP[nn,:,:] = P PP[nn,:,:] = P
return PP return PP
def find_aopt_from_sep(sep, method='75perc'):
    """Returns an estimate of optimal number of components from rmsecv.

    input:
        -- sep, error of prediction; one row per validation segment and
           one column per number of components. A 1-d vector is treated
           as a single segment.
        -- method, 'vanilla' or '75perc'
    output:
        -- aopt, 1-based column number:
           * 'vanilla': column with the smallest sqrt(column mean).
           * '75perc': first i where the median of column i-1 is smaller
             than the 75th-percentile value of column i, i.e. where the
             next component no longer clearly lowers the error; the
             number of columns when no such i exists.
    raises:
        -- ValueError, when method is not one of the names above.
    comments:
        * sep itself is never modified.
    """
    # Work on a private copy: the caller usually returns ``sep`` alongside
    # aopt, so it must not be reordered here.
    sep = array(sep)
    if sep.ndim == 1:
        sep = sep[newaxis]

    if method == 'vanilla':
        # min rmsep
        rmsecv = sqrt(sep.mean(0))
        return rmsecv.argmin() + 1

    if method == '75perc':
        prct = .75  # percentile
        ind = int(1.*sep.shape[0]*prct)
        # Explicit axis: one median per column (per number of components).
        med = median(sep, axis=0)
        # sort() returns a sorted copy; row ``ind`` holds each column's
        # 75th-percentile value.
        prc_75 = sort(sep, axis=0)[ind]
        for i in range(1, sep.shape[1]):
            if med[i-1] < prc_75[i]:
                return i
        return len(med)

    raise ValueError("unknown method: %r" % (method,))