Projects/laydi
Projects
/
laydi
Archived
7
0
Fork 0

Lib updates

This commit is contained in:
Arnar Flatberg 2007-07-23 17:33:21 +00:00
parent 7ea87e646a
commit a05d0faa0d
9 changed files with 937 additions and 166 deletions

View File

@ -10,7 +10,7 @@ from fluents.workflow import Function, OptionsDialog, Options
from fluents.dataset import Dataset
from fluents import plots, dataset, workflow, logger
import scipy
from engines import pca, pls
from engines import pca, pls, nipals_lpls
from cx_stats import leverage, variances, hotelling
from cx_utils import mat_center
from validation import *
@ -238,14 +238,14 @@ class PLS(Model):
"""Estimates cut off on significant vars by controlling fdr."""
if self._options['calc_qvals']==True:
qvals_sorted, qvals = pls_qvals(a, b,
qvals = pls_qvals(a, b,
aopt=None,
alpha=reg,
n_iter=n_iter,
algo='pls',
sim_method=sim_method)
self.model['qval'] = qvals
self.model['qval_sorted'] = qvals_sorted
#self.model['qval_sorted'] = qvals_sorted
else:
self.model['qval'] = None
self.model['qval_sorted'] = None
@ -288,6 +288,7 @@ class PLS(Model):
'qval_sorted':[ids_1, zero_dim],
'w_tsq' : [ids_1, zero_dim],
'rmsep' : [ids_3, pc_ids],
'CP': [ids_1, pc_ids]
}
array = self.model[name]
@ -330,6 +331,17 @@ class PLS(Model):
self.model['var_y'] = var_y
self.model['exp_var_y'] = exp_var_y
if options['calc_corrloads']:
corr_load = scipy.empty_like(self.model['P'].copy())
T = self.model['T']
X = self._data['X']
# For each variable/attribute in original matrix (not meancentered)
for i,score in enumerate(T.T):
for j, profile in enumerate(X.T):
corrs = scipy.corrcoef(score, profile)
corr_load[j,i] = corrs[0,1]
self.model['CP'] = corr_load
if options['calc_conf']:
self.confidence(**options.confidence_options())
@ -353,6 +365,141 @@ class PLS(Model):
#run with current data and options
return self.run_o(a, b)
class LPLS(Model):
def __init__(self, id='lpls', name='LPLS'):
Model.__init__(self, id, name)
self._options = LplsOptions()
def validation(self, opt):
"""Returns rmsep for lpls model.
"""
if opt['calc_cv']==True:
val_engine = opt['val_engine']
rmsep, aopt = val_engine(self.model['X'], self.model['Y'],
self.model['Z'], opt['amax'], opt['n_sets'], opt['xz_alpha'])
self.model['rmsep'] = rmsep
self.model['aopt'] = aopt
else:
self.model['rmsep'] = None
self.model['aopt'] = opt['aopt']
def confidence(self, opt):
"""Returns a confidence measure for model parameters
Supported parameters: W
"""
aopt = self.model['aopt']
if opt['calc_conf']:
Wx, Wz = lpls_jk(self.model['X'], self.model['Y'], self.model['Z'], aopt, n_sets)
Wcal = self.model['W'][:,:aopt]
Lcal = self.model['L'][:,:aopt]
# ensure that Wcal is scaled
tnorm = scipy.apply_along_axis(norm, 0, self.model['T'][:,:aopt])
Wcal = Wcal*tnorm
a,b,c,d,e = opt['p_center'], opt['crot'], opt['alpha'], opt['strict'], opt['cov_center']
tsqx = hotelling(Wx, Wcal, a,b,c,d,e)
tsqz = hotelling(Wz, Lcal, a,b,c,d,e)
self.model['tsqx'] = tsqx
self.model['tsqz'] = tsqz
else:
self.model['tsqx'] = None
self.model['tsqz'] = None
def permutation_confidence(self, opt):
"""Estimates cut off on significant vars by controlling fdr.
"""
self.model['qval'] = None
self.model['qval_sorted'] = None
def make_model(self, opt):
"""Make model on amax components.
"""
engine = opt['engine']
dat = engine(self._data['X'], self._data['Y'], self._data['Z'],
opt['amax'], opt['xz_alpha'], opt['center_mth'],
opt['mode'], opt['scale'], False)
self.model.update(dat)
def as_dataset(self, name, dtype='Dataset'):
"""Return any model parameter as Dataset
No ids matching
"""
if name not in self.model.keys():
return
DX, DY, DZ = self._dataset['X'], self._dataset['Y'], self._dataset['Z']
dim_name_0, dim_name_1 = DX.get_dim_name()
dim_name_2, dim_name_3 = DY.get_dim_name()
dim_name_4, dim_name_5 = DZ.get_dim_name()
#samples
ids_0 = [dim_name_0, DX.get_identifiers(dim_name_0, sorted=True)]
# x vars (genes)
ids_1 = [dim_name_1, DX.get_identifiers(dim_name_1, sorted=True)]
# y vars (sample descriptors)
ids_3 = [dim_name_3, DY.get_identifiers(dim_name_3, sorted=True)]
#z-vars (variable descriptors)
ids_4 = [dim_name_4, DZ.get_identifiers(dim_name_4, sorted=True)]
# components (hidden)
pc_ids = ['_comp', map(str, range(self._options['amax']))]
pc_ids_opt = ['_comp', map(str, range(self.model['aopt']))]
zero_dim = ['_doe',['0']] # null dim, vector (hidden)
match_ids = {'E' : [ids_0, ids_1],
'P' : [ids_1, pc_ids],
'T' : [ids_0, pc_ids],
'W' : [ids_1, pc_ids],
'L' : [ids_4, pc_ids],
'Q' : [ids_3, pc_ids],
'F' : [ids_0, ids_3],
'B' : [ids_1, ids_3],
'tsqx' : [ids_1, zero_dim],
'tsqz' : [ids_4, zero_dim],
'K' : [ids_1, pc_ids],
'rmsep' : [ids_3, pc_ids]
}
array = self.model[name]
M = Dataset(array, identifiers=match_ids[name], name=name)
return M
def get_out_plots(self, options):
out=[]
for plt in options['out_plots']:
out.append(plt(self))
return out
def run(self, a, b, c):
"""Run L-PLS with present options."""
options = self._options
self._dataset['X'] = a
self._dataset['Y'] = b
self._dataset['Z'] = c
self._data['X'] = a.asarray()
self._data['Y'] = b.asarray()
self._data['Z'] = c.asarray()
self.validation(options)
self.make_model(options)
if options['calc_conf']:
self.confidence(options)
out = [self.as_dataset(p) for p in options['out_data']]
for plt in self.get_out_plots(options):
out.append(plt)
return out
def run_gui(self, a, b, c):
"""Run LPLS with option gui.
"""
dialog = LPlsOptionsDialog([a, b, c], self._options)
dialog.show_all()
response = dialog.run()
dialog.hide()
if response == gtk.RESPONSE_OK:
# set output data and plots
dialog.set_output()
#run with current data and options
return self.run(a, b, c)
class PcaOptions(Options):
"""Options for Principal Component Analysis.
@ -403,7 +550,9 @@ class PcaOptions(Options):
]
opt['out_data'] = ['T','P', 'p_tsq']
opt['out_plots'] = [blmplots.PcaScorePlot,blmplots.PcaLoadingPlot,blmplots.LineViewXc]
opt['out_plots'] = [blmplots.PcaScorePlot,
blmplots.PcaLoadingPlot,
blmplots.LineViewXc]
self.update(opt)
@ -444,6 +593,7 @@ class PlsOptions(Options):
opt['center_mth'] = mat_center
opt['scale'] = 'scores'
opt['calc_corrloads'] = True
opt['calc_conf'] = False
opt['n_sets'] = 5
opt['strict'] = True
@ -468,7 +618,8 @@ class PlsOptions(Options):
(blmplots.PlsLoadingPlot, 'Loadings', True),
(blmplots.LineViewXc, 'Line view', True),
(blmplots.PredictionErrorPlot, 'Residual Error', False),
(blmplots.RMSEPPlot, 'RMSEP', False)
(blmplots.RMSEPPlot, 'RMSEP', False),
(blmplots.PlsCorrelationLoadingPlot, 'Corr. loadings', True)
]
opt['out_data'] = ['T','P', 'p_tsq']
@ -494,14 +645,87 @@ class PlsOptions(Options):
'strict', 'crot', 'cov_center']
return self._copy_from_list(opt_list)
def permutation_confidence(self):
opt_list = ['q_pert_method', 'q_iter']
return self._copy_from_list(opt_list)
class LplsOptions(Options):
"""Options for L-shaped Partial Least Squares Regression.
"""
def __init__(self):
Options.__init__(self)
self._set_default()
def _set_default(self):
opt = {}
opt['engine'] = nipals_lpls
opt['mode'] = 'normal' # how much info to calculate
opt['amax'] = 10
opt['aopt'] = 9
opt['xz_alpha'] = .5
opt['auto_aopt'] = False
opt['center'] = True
opt['center_mth'] = [2, 0, 1]
opt['scale'] = 'scores'
opt['calc_conf'] = False
opt['n_sets'] = 7
opt['strict'] = False
opt['p_center'] = 'med'
opt['alpha'] = .3
opt['cov_center'] = 'med'
opt['crot'] = True
opt['calc_cv'] = False
opt['cv_val_method'] = 'random'
opt['cv_val_sets'] = opt['n_sets']
opt['all_data'] = [('T', 'scores', True),
('Wx', 'X-weights', True),
('Wz', 'Z-weights', True),
('E','residuals', False),
('tsq_x', 't2X', False),
('rmsep', 'RMSEP', False)
]
# (class, name, sensitive, ticked)
opt['all_plots'] = [(blmplots.PlsScorePlot, 'Scores', True),
(blmplots.PlsLoadingPlot, 'Loadings', True),
(blmplots.LineViewXc, 'Line view', True),
(blmplots.PredictionErrorPlot, 'Residual Error', False),
(blmplots.RMSEPPlot, 'RMSEP', False),
(blmplots.LplsHypoidCorrelationPlot, 'Hypoid corr.', False)
]
opt['out_data'] = ['T','P']
opt['out_plots'] = [blmplots.PlsScorePlot,blmplots.PlsLoadingPlot,blmplots.LineViewXc]
#opt['out_data'] = None
opt['pack'] = False
opt['calc_qvals'] = False
opt['q_pert_method'] = 'shuffle_rows'
opt['q_iter'] = 20
self.update(opt)
def make_model_options(self):
"""Options for make_model method."""
opt_list = ['scale','mode', 'amax', 'engine']
return self._copy_from_list(opt_list)
def confidence_options(self):
"""Options for confidence method."""
opt_list = ['n_sets', 'aopt', 'alpha', 'p_center',
'strict', 'crot', 'cov_center']
return self._copy_from_list(opt_list)
def validation_options(self):
"""Options for pre_validation method."""
opt_list = ['amax', 'n_sets', 'cv_val_method']
return self._copy_from_list(opt_list)
def permutation_confidence(self):
opt_list = ['q_pert_method', 'q_iter']
return self._copy_from_list(opt_list)
class PcaOptionsDialog(OptionsDialog):
"""Options dialog for Principal Component Analysis.
@ -716,6 +940,210 @@ class PcaOptionsDialog(OptionsDialog):
self._options['strict'] = True
class LplsOptionsDialog(OptionsDialog):
"""Options dialog for L-shaped Partial Least squares regression.
"""
def __init__(self, data, options, input_names=['X', 'Y', 'Z']):
OptionsDialog.__init__(self, data, options, input_names)
glade_file = os.path.join(fluents.DATADIR, 'lpls_options.glade')
notebook_name = "vbox1"
page_name = "Options"
self.add_page_from_glade(glade_file, notebook_name, page_name)
# connect signals to handlers
dic = {"on_amax_value_changed" : self.on_amax_changed,
"on_aopt_value_changed" : self.on_aopt_changed,
"auto_aopt_toggled" : self.auto_aopt_toggled,
"center_toggled" : self.center_toggled,
#"on_scale_changed" : self.on_scale_changed,
"on_val_none" : self.val_toggled,
"on_val_cv" : self.cv_toggled,
"on_cv_method_changed" : self.on_cv_method_changed,
"on_cv_sets_changed" : self.on_cv_sets_changed,
"on_conf_toggled" : self.conf_toggled,
"on_subset_loc_changed" : self.on_subset_loc_changed,
"on_cov_loc_changed" : self.on_cov_loc_changed,
"on_alpha_changed" : self.on_alpha_changed,
"on_rot_changed" : self.on_rot_changed,
"on__toggled" : self.conf_toggled,
"on_qval_changed" : self.on_qval_changed,
"on_iter_changed" : self.on_iter_changed
}
self.wTree.signal_autoconnect(dic)
# set/ensure valid default values/ranges
#
amax_sb = self.wTree.get_widget("amax_spinbutton")
max_comp = min(data[0].shape) # max num of components
if self._options['amax']>max_comp:
logger.log('debug', 'amax default too large ... adjusting')
self._options['amax'] = max_comp
amax_sb.get_adjustment().set_all(self._options['amax'], 1, max_comp, 1, 0, 0)
# aopt spin button
aopt_sb = self.wTree.get_widget("aopt_spinbutton")
if self._options['aopt']>self._options['amax']:
self._options['aopt'] = self._options['amax'] + 1 - 1
aopt_sb.get_adjustment().set_all(self._options['aopt'], 1, self._options['amax'], 1, 0, 0)
# scale
# scale_cb = self.wTree.get_widget("scale_combobox")
# scale_cb.set_active(0)
# validation frames
if self._options['calc_cv']==False:
cv_frame = self.wTree.get_widget("cv_frame")
cv_frame.set_sensitive(False)
cv = self.wTree.get_widget("cv_method").set_active(0)
# confidence
if self._options['calc_conf']==True:
self.wTree.get_widget("subset_expander").set_sensitive(True)
else:
self.wTree.get_widget("subset_expander").set_sensitive(False)
cb = self.wTree.get_widget("subset_loc")
_m = {'med': 0, 'mean': 1, 'full_model': 2}
cb.set_active(_m.get(self._options['p_center']))
cb = self.wTree.get_widget("cov_loc")
_m = {'med': 0, 'mean': 1}
cb.set_active(_m.get(self._options['cov_center']))
hs = self.wTree.get_widget("alpha_scale")
hs.set_value(self._options['alpha'])
tb = self.wTree.get_widget("qvals")
tb.set_sensitive(True)
def on_amax_changed(self, sb):
logger.log("debug", "amax changed: new value: %s" %sb.get_value_as_int())
amax = sb.get_value_as_int()
# update aopt if needed
if amax<self._options['aopt']:
self._options['aopt'] = amax
aopt_sb = self.wTree.get_widget("aopt_spinbutton")
aopt_sb.get_adjustment().set_all(self._options['aopt'], 1, amax, 1, 0, 0)
self._options['amax'] = sb.get_value_as_int()
def on_aopt_changed(self, sb):
aopt = sb.get_value_as_int()
self._options['aopt'] = aopt
def auto_aopt_toggled(self, tb):
aopt_sb = self.wTree.get_widget("aopt_spinbutton")
if tb.get_active():
self._options['auto_aopt'] = True
aopt_sb.set_sensitive(False)
else:
self._options['auto_aopt'] = False
aopt_sb.set_sensitive(True)
def center_toggled(self, tb):
if tb.get_active():
self._options['center'] = True
else:
logger.log("debug", "centering set to False")
self._options['center'] = False
#def on_scale_changed(self, cb):
# scale = cb.get_active_text()
# if scale=='Scores':
# self._options['scale'] = 'scores'
# elif scale=='Loadings':
# self._options['scale'] = 'loads'
# else:
# raise IOError
def val_toggled(self, tb):
"""Callback for validation: None. """
cv_frame = self.wTree.get_widget("cv_frame")
cv_tb = self.wTree.get_widget("cv_toggle")
if tb.get_active():
self._options['calc_cv'] = False
cv_frame.set_sensitive(False)
cv_tb.set_sensitive(False)
else:
cv_tb.set_sensitive(True)
if cv_tb.get_active():
cv_frame.set_sensitive(True)
self._options['calc_cv'] = True
def cv_toggled(self, tb):
cv_frame = self.wTree.get_widget("cv_frame")
val_tb = self.wTree.get_widget("val_none_toggle")
if tb.get_active():
cv_frame.set_sensitive(True)
self._options['calc_cv'] = True
else:
cv_frame.set_sensitive(False)
self._options['calc_cv'] = False
def on_cv_method_changed(self, cb):
method = cb.get_active_text()
if method == 'Random':
self._options['cv_val_method'] = 'random'
def on_cv_sets_changed(self, sb):
val = sb.get_value_as_int()
self._options['cv_val_sets'] = val
def conf_toggled(self, tb):
if tb.get_active():
self._options['calc_conf'] = False
self.wTree.get_widget("subset_expander").set_sensitive(False)
else:
self._options['calc_conf'] = True
self.wTree.get_widget("subset_expander").set_sensitive(True)
def on_subset_loc_changed(self, cb):
method = cb.get_active_text()
if method=='Full model':
self._options['p_center'] = 'full_model'
elif method=='Median':
self._options['p_center'] = 'med'
elif method=='Mean':
self._options['p_center'] = 'mean'
def on_cov_loc_changed(self, cb):
method = cb.get_active_text()
if method=='Median':
self._options['cov_center'] = 'med'
elif method=='Mean':
self._options['cov_center'] = 'mean'
def on_alpha_changed(self, hs):
self._options['alpha'] = hs.get_value()
def on_rot_changed(self, rg):
proc, strict = rg
if proc.get_active():
self._options['crot'] = True
else:
self._options['crot'] = True
self._options['strict'] = True
def qval_toggled(self, tb):
if tb.get_active():
self._options['calc_qval'] = False
self.wTree.get_widget("qval_method").set_sensitive(False)
self.wTree.get_widget("q_iter").set_sensitive(False)
else:
self._options['calc_qval'] = True
self.wTree.get_widget("qval_method").set_sensitive(True)
self.wTree.get_widget("q_iter").set_sensitive(True)
def on_iter_changed(self, sb):
self._options['q_iter'] = sb.get_value()
def on_qval_changed(self, cb):
q_method = cb.get_active_text()
if method=='Shuffle rows':
self._options['q_pert_method'] = 'shuffle'
class PlsOptionsDialog(OptionsDialog):
"""Options dialog for Partial Least squares regression.
"""
@ -918,5 +1346,3 @@ class PlsOptionsDialog(OptionsDialog):
q_method = cb.get_active_text()
if method=='Shuffle rows':
self._options['q_pert_method'] = 'shuffle'

View File

@ -191,6 +191,11 @@ class PlsCorrelationLoadingPlot(BlmScatterPlot):
BlmScatterPlot.__init__(self, title, model, absi, ordi, part_name='CP')
class LplsHypoidCorrelationPlot(BlmScatterPlot):
def __init__(self, model, absi=0, ordi=1):
title = "Hypoid correlations(%s)" %model._dataset['X'].get_name()
BlmScatterPlot.__init__(self, title, model, absi, ordi, part_name='W')
class LineViewXc(plots.LineViewPlot):
"""A line view of centered raw data
"""
@ -214,8 +219,8 @@ class PlsQvalScatter(plots.ScatterPlot):
def __init__(self, model, pc=0):
if not model.model.has_key('w_tsq'):
return None
self._W = model.model['P']
dataset_1 = model.as_dataset('P')
self._W = model.model['W']
dataset_1 = model.as_dataset('W')
dataset_2 = model.as_dataset('w_tsq')
id_dim = dataset_1.get_dim_name(0) #genes
sel_dim = dataset_1.get_dim_name(1) #_comp

View File

@ -115,6 +115,7 @@ def expl_var_y(Y, T, Q):
def pls_qvals(a, b, aopt=None, alpha=.3,
n_iter=20, algo='pls',
center=True,
sim_method='shuffle',
p_center='med', cov_center='med',
crot=True, strict=False, metric=None):
@ -122,8 +123,98 @@ def pls_qvals(a, b, aopt=None, alpha=.3,
"""Returns qvals for pls model.
input:
a -- centered data matrix
b -- centered data matrix
a -- data matrix
b -- data matrix
aopt -- scalar, opt. number of components
alpha -- [0,1] regularisation parameter for T2-test
n_iter -- number of permutations
sim_method -- permutation method ['shuffle']
p_center -- location estimator for sub models ['med']
cov_center -- location estimator for covariance of submodels ['med']
crot -- bool, use rotations of sub models?
strict -- bool, use stict (rot/flips only) rotations?
metric -- bool, use row metric?
"""
m, n = a.shape
TSQ = zeros((n, n_iter), dtype='d') # (nvars x n_subsets)
n_false = zeros((n, n_iter), dtype='d')
#full model
if center:
ac = a - a.mean(0)
bc = b - b.mean(0)
if metric!=None:
ac = dot(ac, metric)
if algo=='bridge':
dat = bridge(ac, bc, aopt, 'loads', 'fast')
else:
dat = pls(ac, bc, aopt, 'loads', 'fast')
Wcv = pls_jkW(a, b, aopt, n_blocks=None, algo=algo, metric=metric, center=True)
tsq_full = hotelling(Wcv, dat['W'], p_center=p_center,
alpha=alpha, crot=crot, strict=strict,
cov_center=cov_center)
t0 = time.time()
Vs = shuffle_1d(bc, n_iter, axis=0)
for i, b_shuff in enumerate(Vs):
t1 = time.time()
if algo=='bridge':
dat = bridge(ac, b_shuff, aopt, 'loads','fast')
else:
dat = pls(ac, b_shuff, aopt, 'loads', 'fast')
Wcv = pls_jkW(a, b_shuff, aopt, n_blocks=None, algo=algo, metric=metric)
TSQ[:,i] = hotelling(Wcv, dat['W'], p_center=p_center,
alpha=alpha, crot=crot, strict=strict,
cov_center=cov_center)
print time.time() - t1
sort_index = argsort(tsq_full)[::-1]
back_sort_index = sort_index.argsort()
print time.time() - t0
# count false positives
tsq_full_sorted = tsq_full.take(sort_index)
for i in xrange(n_iter):
for j in xrange(n):
n_false[j,i] = sum(TSQ[:,i]>=tsq_full[j]) # number of false pos. genes (0-n)
false_pos = median(n_false, 1)
ll = arange(1, len(false_pos)+1, 1)
sort_qval = false_pos.take(sort_index)/ll
qval = false_pos/ll.take(back_sort_index)
print time.time() - t0
#return qval, false_pos, TSQ, tsq_full
return qval
def ensure_strict(C, only_flips=True):
"""Ensure that a rotation matrix does only 90 degree rotations.
In multiplication with pcs this allows flips and reordering.
if only_flips is True there will onlt be flips allowed
"""
Cm = C
S = sign(C) # signs
if only_flips==True:
C = eye(Cm.shape[0])*S
return C
Cm = zeros_like(C)
Cm.putmask(1.,abs(C)>.6)
if det(Cm)>1:
raise ValueError,"Implement this!"
return Cm*S
def pls_qvals_II(a, b, aopt=None, center=True, alpha=.3,
n_iter=20, algo='pls',
sim_method='shuffle',
p_center='med', cov_center='med',
crot=True, strict=False, metric=None):
"""Returns qvals for pls model.
Shuffling of variables in X is preprocessed in metric.
Null model is 'If I put genes randomly on network' ... if they are sign:
then this is due to network structure and not covariance with response.
input:
a -- data matrix
b -- data matrix
aopt -- scalar, opt. number of components
alpha -- [0,1] regularisation parameter for T2-test
n_iter -- number of permutations
@ -140,25 +231,33 @@ def pls_qvals(a, b, aopt=None, alpha=.3,
n_false = zeros((n, n_iter), dtype='<f8')
#full model
if metric!=None:
a = dot(a, metric)
# center?
if center==True:
ac = a - a.mean(0)
bc = b - b.mean(0)
if metric==None:
metric = eye(n,n)
if algo=='bridge':
dat = bridge(a, b, aopt, 'loads', 'fast')
dat = bridge(ac, bc, aopt, 'loads', 'fast')
else:
dat = pls(a, b, aopt, 'loads', 'fast')
dat = pls(ac, bc, aopt, 'loads', 'fast')
Wcv = pls_jkW(a, b, aopt, n_blocks=None, algo=algo, metric=metric)
tsq_full = hotelling(Wcv, dat['W'], p_center=p_center,
alpha=alpha, crot=crot, strict=strict,
cov_center=cov_center)
t0 = time.time()
Vs = shuffle_1d(b, n_iter)
for i, b_shuff in enumerate(Vs):
Vs = shuffle_1d(a, n_iter, 1)
for i, a_shuff in enumerate(Vs):
t1 = time.time()
a = a_shuff - a_shuff.mean(0)
a = dot(a, metric)
if algo=='bridge':
dat = bridge(a, b_shuff, aopt, 'loads','fast')
dat = bridge(a, b, aopt, 'loads','fast')
else:
dat = pls(a, b, aopt, 'loads', 'fast')
Wcv = pls_jkW(a, b_shuff, aopt, n_blocks=None, algo=algo, metric=metric)
Wcv = pls_jkW(a, b, aopt, n_blocks=None, algo=algo, metric=metric)
TSQ[:,i] = hotelling(Wcv, dat['W'], p_center=p_center,
alpha=alpha, crot=crot, strict=strict,
cov_center=cov_center)
@ -177,24 +276,8 @@ def pls_qvals(a, b, aopt=None, alpha=.3,
sort_qval = false_pos.take(sort_index)/ll
qval = false_pos/ll.take(back_sort_index)
print time.time() - t0
return qval, false_pos, TSQ, tsq_full
def ensure_strict(C, only_flips=True):
"""Ensure that a rotation matrix does only 90 degree rotations.
In multiplication with pcs this allows flips and reordering.
if only_flips is True there will onlt be flips allowed
"""
Cm = C
S = sign(C) # signs
if only_flips==True:
C = eye(Cm.shape[0])*S
return C
Cm = zeros_like(C)
Cm.putmask(1.,abs(C)>.6)
if det(Cm)>1:
raise ValueError,"Implement this!"
return Cm*S
#return qval, false_pos, TSQ, tsq_full
return qval
def leverage(aopt=1,*args):
"""Returns leverages
@ -253,3 +336,10 @@ def ssq(E, axis=0, weights=None):
raise NotImplementedError, "Higher order modes not supported"
return pow(Ew,2).sum(axis)
def vnorm(x):
"""Returns the euclidian norm of a vector.
This is considerably faster than linalg.norm
"""
return sqrt(dot(x,x.conj()))

View File

@ -1,7 +1,7 @@
from scipy import apply_along_axis,newaxis,zeros,\
median,round_,nonzero,dot,argmax,any,sqrt,ndarray,\
trace,zeros_like,sign,sort,real,argsort,rand,array,\
matrix
matrix,nan
from scipy.linalg import norm,svd,inv,eig
from scipy.stats import median,mean
@ -13,8 +13,10 @@ def normalise(a,axis=0,return_scales=False):
s = s[:,newaxis]
a_s = a/s
if return_scales:
return a_s, s
return a_s
def sub2ind(shape, i, j):
@ -75,7 +77,7 @@ def randperm(n):
out = zeros(n)
for i in range(n):
out[i] = dict[r[i]]
return array(out,dtype='i')
return array(out).astype('i')
def mat_center(X,axis=0,ret_mn=False):
"""Mean center matrix along axis.

View File

@ -3,7 +3,8 @@
There is no typechecking of any kind here, just focus on speed
"""
from scipy.linalg import svd,norm,inv,pinv,qr
import math
from scipy.linalg import svd,inv
from scipy import dot,empty,eye,newaxis,zeros,sqrt,diag,\
apply_along_axis,mean,ones,randn,empty_like,outer,c_,\
rand,sum,cumsum,matrix
@ -18,17 +19,18 @@ def pca(a, aopt, scale='scores', mode='normal'):
m, n = a.shape
if m*10.>n:
u, s, vt = esvd(a)
if m*3>n:
u, s, v = esvd(a)
else:
u, s, vt = svd(a, full_matrices=0)
v = vt.T
eigvals = (1./m)*s
T = u*s
T = T[:,:aopt]
P = vt[:aopt,:].T
P = v[:,:aopt]
if scale=='loads':
tnorm = apply_along_axis(norm, 0, T)
tnorm = apply_along_axis(vnorm, 0, T)
T = T/tnorm
P = P*tnorm
@ -47,6 +49,7 @@ def pca(a, aopt, scale='scores', mode='normal'):
return {'T':T, 'P':P, 'E':E}
def pcr(a, b, aopt=2, scale='scores', mode='normal'):
"""Returns Principal component regression model."""
m, n = a.shape
@ -98,13 +101,13 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
u, s, vh = svd(dot(ab.T, ab))
w = dot(ab, u[:,:1])
w = w/norm(w)
w = w/vnorm(w)
r = w.copy()
if i>0:
for j in range(0,i,1):
r = r - dot(P[:,j].T, w)*R[:,j][:,newaxis]
t = dot(a, r)
tt = norm(t)**2
tt = vnorm(t)**2
p = dot(a.T, t)/tt
q = dot(r.T, ab).T/tt
ab = ab - dot(p, q.T)*tt
@ -115,7 +118,7 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
if mode=='fast' and i==aopt-1:
if scale=='loads':
tnorm = apply_along_axis(norm, 0, T)
tnorm = apply_along_axis(vnorm, 0, T)
T = T/tnorm
W = W*tnorm
return {'T':T, 'W':W}
@ -134,7 +137,7 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
F = b - dot(T[:,:aopt], Q[:,:aopt].T)
if scale=='loads':
tnorm = apply_along_axis(norm, 0, T)
tnorm = apply_along_axis(vnorm, 0, T)
T = T/tnorm
W = W*tnorm
Q = Q*tnorm
@ -159,7 +162,7 @@ def w_simpls(aat, b, aopt):
u = dot(b, u[:,:1]) #y-factor scores
U[:,i] = u.ravel()
t = dot(aat, u)
t = t/norm(t)
t = t/vnorm(t)
T[:,i] = t.ravel()
h = dot(aat, t) #score-weights
H[:,i] = h.ravel()
@ -183,7 +186,7 @@ def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
W = u[:,:aopt]
K = vt[:aopt,:].T
T = dot(a, W)
tnorm = apply_along_axis(norm, 0, T) # norm of T-columns
tnorm = apply_along_axis(vnorm, 0, T) # norm of T-columns
if mode == 'fast':
if scale=='loads':
@ -196,16 +199,6 @@ def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
B = zeros((aopt, n, l), dtype='f')
for i in range(aopt):
B[i] = dot(W[:,:i+1], Q[:,:i+1].T)
# leverages
# fixme: probably need an orthogonal basis for row-space leverage
# T (scores) are not orthogonal
# Using a qr decomp to get an orthonormal basis for row-space
#Tq = qr(T)[0]
#s_lev,v_lev = leverage(aopt,Tq,W)
# explained variance
#var_x, exp_var_x = variances(a,T,W)
#qnorm = apply_along_axis(norm, 0, Q)
#var_y, exp_var_y = variances(b,U,Q/qnorm)
if mode == 'detailed':
E = empty((aopt, m, n))
@ -225,6 +218,128 @@ def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
return {'B':B, 'W':W, 'T':T, 'Q':Q, 'E':E, 'F':F, 'U':U, 'P':W}
def nipals_lpls(X, Y, Z, amax, alpha=.7, mean_ctr=[2, 0, 1], mode='normal', scale='scores', verbose=False):
""" L-shaped Partial Least Sqaures Regression by the nipals algorithm.
(X!Z)->Y
:input:
X : data matrix (m, n)
Y : data matrix (m, l)
Z : data matrix (n, o)
:output:
T : X-scores
W : X-weights/Z-weights
P : X-loadings
Q : Y-loadings
U : X-Y relation
L : Z-scores
K : Z-loads
B : Regression coefficients X->Y
b0: Regression coefficient intercept
evx : X-explained variance
evy : Y-explained variance
evz : Z-explained variance
:Notes:
"""
if mean_ctr!=None:
xctr, yctr, zctr = mean_ctr
X, mnX = center(X, xctr)
Y, mnY = center(Y, xctr)
Z, mnZ = center(Z, zctr)
varX = pow(X, 2).sum()
varY = pow(Y, 2).sum()
varZ = pow(Z, 2).sum()
m, n = X.shape
k, l = Y.shape
u, o = Z.shape
# initialize
U = empty((k, amax))
Q = empty((l, amax))
T = empty((m, amax))
W = empty((n, amax))
P = empty((n, amax))
K = empty((o, amax))
L = empty((u, amax))
var_x = empty((amax,))
var_y = empty((amax,))
var_z = empty((amax,))
for a in range(amax):
if verbose:
print "\n Working on comp. %s" %a
u = Y[:,:1]
diff = 1
MAX_ITER = 100
lim = 1e-5
niter = 0
while (diff>lim and niter<MAX_ITER):
niter += 1
u1 = u.copy()
w = dot(X.T, u)
w = w/sqrt(dot(w.T, w))
l = dot(Z, w)
k = dot(Z.T, l)
k = k/sqrt(dot(k.T, k))
w = alpha*k + (1-alpha)*w
w = w/sqrt(dot(w.T, w))
t = dot(X, w)
c = dot(Y.T, t)
c = c/sqrt(dot(c.T, c))
u = dot(Y, c)
diff = abs(u1 - u).max()
if verbose:
print "Converged after %s iterations" %niter
tt = dot(t.T, t)
p = dot(X.T, t)/tt
q = dot(Y.T, t)/tt
l = dot(Z, w)
U[:,a] = u.ravel()
W[:,a] = w.ravel()
P[:,a] = p.ravel()
T[:,a] = t.ravel()
Q[:,a] = q.ravel()
L[:,a] = l.ravel()
K[:,a] = k.ravel()
X = X - dot(t, p.T)
Y = Y - dot(t, q.T)
Z = (Z.T - dot(w, l.T)).T
var_x[a] = pow(X, 2).sum()
var_y[a] = pow(Y, 2).sum()
var_z[a] = pow(Z, 2).sum()
B = dot(dot(W, inv(dot(P.T, W))), Q.T)
b0 = mnY - dot(mnX, B)
# variance explained
evx = 100.0*(1 - var_x/varX)
evy = 100.0*(1 - var_y/varY)
evz = 100.0*(1 - var_z/varZ)
if scale=='loads':
tnorm = apply_along_axis(vnorm, 0, T)
T = T/tnorm
W = W*tnorm
Q = Q*tnorm
knorm = apply_along_axis(vnorm, 0, K)
L = L*knorm
K = K/knorm
return {'T':T, 'W':W, 'P':P, 'Q':Q, 'U':U, 'L':L, 'K':K, 'B':B, 'b0':b0, 'evx':evx, 'evy':evy, 'evz':evz}
########### Helper routines #########
def m_shape(array):
return matrix(array).shape
@ -239,17 +354,40 @@ def esvd(data,economy=1):
"""
m, n = data.shape
if m>=n:
u, s, vt = svd(dot(data.T, data))
data = dot(data.T, data)
u, s, vt = svd(data)
u = dot(data, vt.T)
v = vt.T
for i in xrange(n):
s[i] = norm(u[:,i])
s[i] = vnorm(u[:,i])
u[:,i] = u[:,i]/s[i]
else:
u, s, vt = svd(dot(data, data.T))
data = dot(data, data.T)
data = (data + data.T)/2.0
u, s, vt = svd(data)
v = dot(u.T, data)
for i in xrange(m):
s[i] = norm(v[i,:])
s[i] = vnorm(v[i,:])
v[i,:] = v[i,:]/s[i]
return u, s, v
return u, s, v.T
def vnorm(x):
# assume column arrays (or vectors)
return math.sqrt(dot(x.T, x))
def center(a, axis):
# 0 = col center, 1 = row center, 2 = double center
# -1 = nothing
if axis==-1:
mn = zeros((a.shape[1],))
elif axis==0:
mn = a.mean(0)
elif axis==1:
mn = a.mean(1)[:,newaxis]
elif axis==2:
mn = a.mean(0) + a.mean(1)[:,newaxis] - a.mean()
else:
raise IOError("input error: axis must be in [-1,0,1,2]")
return a - mn, mn

View File

@ -53,6 +53,7 @@ def gene_hypergeo_test(selection, category_dataset):
cat_count)
pvals = scipy.where(cat_count==0, 2, pvals)
pvals = scipy.where(scipy.isnan(pvals), 2, pvals)
out = {}
for i in range(pvals.size):
out[str(all_cats[i])] = (count[i], cat_count[i], pvals[i])

View File

@ -2,7 +2,7 @@ import os,sys
from itertools import izip
import networkx as NX
from scipy import shape,diag,dot,asarray,sqrt,real,zeros,eye,exp,maximum,\
outer,maximum,sum,diag,real
outer,maximum,sum,diag,real,atleast_2d
from scipy.linalg import eig,svd,inv,expm,norm
from cx_utils import sorted_eig
@ -378,6 +378,7 @@ Ke = expm(A) .... expm(-A)?
# 14.05.2006: diffusion returns negative values, using expm(-LL) instead (FIX)
# 13.09.2206: update for use in numpy
# 27.04.2007: diffusion now uses pade approximations to matrix exponential. Also the last
def K_expAdj(W, normalised=True, alpha=1.0):
"""Matrix exponential of adjacency matrix, mentioned in Kandola as a general diffusion kernel.
@ -433,7 +434,7 @@ def K_vonNeumann(W, normalised=True, alpha=1.0):
return dot(dot(vr,psigma),vri).astype(t)
def K_laplacian(W, normalised=True, alpha=1.0):
""" This is the matrix square root of the pseudo inverse of L.
""" This is the matrix pseudo inverse of L.
Also known as the average commute time matrix.
"""
W = asarray(W)
@ -464,8 +465,7 @@ def K_laplacian(W, normalised=True, alpha=1.0):
return K
def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5):
def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5, use_cut=False):
"""Returns diffusion kernel.
input:
-- W, adj. matrix
@ -480,25 +480,43 @@ def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5):
m, n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
D = diag(sum(W,0))
D = diag(W.sum(0))
L = D - W
if normalised==True:
T = diag(sqrt(1./(sum(W,0))))
T = diag(sqrt(1./W.sum(0)))
L = dot(dot(T, L), T)
e, vr = eig(L)
vri = inv(vr) #inv
cond = 1.0*{0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = 1.*abs(cond*maximum.reduce(e))
psigma = eye(m) # if sing vals are 0 exp(0)=1 (unnecessary)
psigma = eye(m) # if eigvals are 0 exp(0)=1 (unnecessary)
#psigma = zeros((m,n), dtype='<f8')
for i in range(len(e)):
if abs(e[i]) > cutoff:
psigma[i,i] = exp(-beta*e[i])
#else:
# psigma[i,i] = 0.0
K = real(dot(dot(vr, psigma), vri))
I = eye(n, dtype='<f8')
K = (1. - alpha)*I + alpha*K
return K
def K_diffusion2(W, normalised=True, alpha=1.0, beta=0.5, ncomp=None):
"""Returns diffusion kernel, using fast pade approximation.
input:
-- W, adj. matrix
-- normalised [True/False]
-- beta, [0->), (diffusion degree)
"""
D = diag(W.sum(0))
L = D - W
if normalised==True:
T = diag(sqrt(1./W.sum(0)))
L = dot(dot(T, L), T)
return expm(-beta*L)
def K_modularity(W,alpha=1.0):
""" Returns the matrix square root of Newmans modularity."""
W = asarray(W)
@ -530,3 +548,20 @@ def kernel_score(K, W):
score = diag(dot(W, dot(K, W)) )
tot = sum(score)
return score, tot
def modularity_matrix(G, nodelist=None):
if not nodelist:
nodelist = G.nodes()
else:
G = NX.subgraph(G, nodelist)
A = NX.adj_matrix(G, nodelist=nodelist)
d = atleast_2d(G.degree(nbunch=nodelist))
m = 1.*G.number_of_edges()
B = A - A/m
return B

View File

@ -41,7 +41,8 @@ def pls_gen(a, b, n_blocks=None, center=False, index_out=False,axis=0, metric=No
"""Random block crossvalidation
Leave-one-out is a subset, with n_blocks equals a.shape[-1]
"""
index = randperm(a.shape[axis])
#index = randperm(a.shape[axis])
index = arange(a.shape[axis])
if n_blocks==None:
n_blocks = a.shape[axis]
n_in_set = ceil(float(a.shape[axis])/n_blocks)
@ -151,6 +152,7 @@ def shuffle_1d_block(a, n_sets=None, blocks=None, index_out=False, axis=0):
index = arange(m)
dummy = map(random.shuffle, array_split(index, blocks))
a_out = a.take(index, axis)
if index_out:
yield a_out, index
else:
@ -164,7 +166,8 @@ def shuffle_1d(a, n_sets, axis=0):
m = a.shape[axis]
for ii in xrange(n_sets):
index = randperm(m)
yield a.take(index, axis)
a = a.take(index, axis)
yield a
def diag_pert(a, n_sets=10, center=True, index_out=False):
"""Alter generator returning sets perturbed with means at diagonals.
@ -207,16 +210,15 @@ def diag_pert(a, n_sets=10, center=True, index_out=False):
def outerprod_centering(aat, ret_mn=True):
"""Returns mean centered symmetric outerproduct matrix.
"""Returns double centered symmetric outerproduct matrix.
"""
n = aat.shape[0]
h = aat.sum(0)[:,newaxis]
h = (h - mean(h)/2)/n
mn_a = h + h.T
h = aat.mean(0)[newaxis]
h = h - 0.5*h.mean()
mn_a = h + h.T # beauty of broadcasting
aatc = aat - mn_a
if ret_mn:
return aatc, aat.mean(0)
return aat - mn_a
return aatc, mn_a
return aatc

View File

@ -12,11 +12,47 @@ from cx_utils import m_shape
def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'):
"""Returns rmsep and aopt for pls tailored for wide X.
The root mean square error of cross validation is calculated
based on random block cross-validation. With number of blocks equal to
number of samples [default] gives leave-one-out cv.
The pls model is based on the simpls algorithm for wide X.
comments:
-- X, Y inputs need to be centered (fixme: check)
:Parameters:
X : ndarray
column centered data matrix of size (samples x variables)
Y : ndarray
column centered response matrix of size (samples x responses)
amax : scalar
Maximum number of components
n_blocks : scalar
Number of blocks in cross validation
:Returns:
rmsep : ndarray
Root Mean Square Error of cross-validated Predictions
aopt : scalar
Guestimate of the optimal number of components
:SeeAlso:
- pls_cv_val : Same output, not optimised for wide X
- w_simpls : Simpls algorithm for wide X
Notes
-----
Based (cowardly translated) on m-files from the Chemoact toolbox
X, Y inputs need to be centered (fixme: check)
Examples
--------
>>> import numpy as n
>>> X = n.array([[1., 2., 3.],[]])
>>> Y = n.array([[1., 2., 3.],[]])
>>> w_pls(X, Y, 1)
[4,5,6], 1
"""
k, l = m_shape(Y)
PRESS = zeros((l, amax+1), dtype='f')
if n_blocks==None:
@ -40,21 +76,13 @@ def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'):
E = Yout[:,j][:,newaxis] - TQ
E = E + sum(E, 0)/Din.shape[0]
PRESS[j,1:] = PRESS[j,1:] + sum(E**2, 0)
#Yhat = Y - dot(That,Q.T)
Yhat = Y - dot(That,Q.T)
rmsep = sqrt(PRESS/Y.shape[0])
aopt = find_aopt_from_sep(rmsep)
return rmsep, aopt
return rmsep, Yhat, aopt
def pls_val(X, Y, amax=2, n_blocks=10, algo='pls', metric=None):
""" Validation results of pls model.
comments:
-- X, Y inputs need to be centered (fixme: check)
"""
k, l = m_shape(Y)
PRESS = zeros((l, amax+1), dtype='<f8')
EE = zeros((amax, k, l), dtype='<f8')
@ -79,7 +107,30 @@ def pls_val(X, Y, amax=2, n_blocks=10, algo='pls', metric=None):
rmsep = sqrt(PRESS/(k-1.))
aopt = find_aopt_from_sep(rmsep)
return rmsep, aopt
return rmsep, Yhat, aopt
def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5):
"""Performs crossvalidation to get generalisation error in lpls"""
cv_iter = select_generators.pls_gen(X, Y, n_blocks=nsets,center=False,index_out=True)
k, l = Y.shape
Yhat = empty((a_max,k,l), 'd')
for i, (xcal,xi,ycal,yi,ind) in enumerate(cv_iter):
T, W, P, Q, U, L, K, B, b0, evx, evy, evz = nipals_lpls(xcal,ycal,Z,
a_max=a_max,
alpha=alpha,
mean_ctr=[2,0,1],
verbose=False)
for a in range(a_max):
Yhat[a,ind,:] = b0[a][0][0] + dot(xi, B[a])
Yhat_class = zeros_like(Yhat)
for a in range(a_max):
for i in range(k):
Yhat_class[a,i,argmax(Yhat[a,i,:])]=1.0
class_err = 100*((Yhat_class+Y)==2).sum(1)/Y.sum(0).astype('d')
sep = (Y - Yhat)**2
rmsep = sqrt(sep.mean(1))
aopt = find_aopt_from_sep(rmsep)
return rmsep, Yhat, aopt
def pca_alter_val(a, amax, n_sets=10, method='diag'):
"""Pca validation by altering elements in X.
@ -146,8 +197,7 @@ def pls_jkW(a, b, amax, n_blocks=None, algo='pls', use_pack=True, center=True, m
if n_blocks == None:
n_blocks = b.shape[0]
Wcv = empty((n_blocks, a.shape[1], amax), dtype='f')
Wcv = empty((n_blocks, a.shape[1], amax), dtype='d')
if use_pack and metric==None:
u, s, inflater = svd(a, full_matrices=0)
a = u*s
@ -161,11 +211,10 @@ def pls_jkW(a, b, amax, n_blocks=None, algo='pls', use_pack=True, center=True, m
dat = bridge(a_in, b_in, amax, 'loads', 'fast')
W = dat['W']
if use_pack and metric==None:
W = dot(inflater.T, W)
Wcv[nn,:,:] = W
Wcv[nn,:,:] = W[:,:,]
return Wcv
@ -200,6 +249,29 @@ def pca_jkP(a, aopt, n_blocks=None, metric=None):
return PP
def lpls_jk(X, Y, Z, a_max, nsets=None, alpha=.5):
cv_iter = select_generators.pls_gen(X, Y, n_blocks=nsets,center=False,index_out=False)
m, n = X.shape
k, l = Y.shape
o, p = Z.shape
if nsets==None:
nsets = m
WWx = empty((nsets, n, a_max), 'd')
WWz = empty((nsets, o, a_max), 'd')
#WWy = empty((nsets, l, a_max), 'd')
for i, (xcal,xi,ycal,yi) in enumerate(cv_iter):
T, W, P, Q, U, L, K, B, b0, evx, evy, evz = nipals_lpls(xcal,ycal,Z,
a_max=a_max,
alpha=alpha,
mean_ctr=[2,0,1],
scale='loads',
verbose=False)
WWx[i,:,:] = W
WWz[i,:,:] = L
#WWy[i,:,:] = Q
return WWx, WWz
def find_aopt_from_sep(sep, method='75perc'):
"""Returns an estimate of optimal number of components from rmsecv.
"""