Projects/laydi
Projects
/
laydi
Archived
7
0
Fork 0

Multiple lib changes

This commit is contained in:
Arnar Flatberg 2007-01-25 11:58:10 +00:00
parent a65d79697f
commit 1c2c2c8895
7 changed files with 519 additions and 152 deletions

View File

@ -1,7 +1,9 @@
"""This module contains bilinear models(Functions)
"""
import pygtk
import gtk
import gtk.glade
from fluents.workflow import Function, OptionsDialog, Options
from fluents.dataset import Dataset
from fluents import plots, dataset, workflow, logger
@ -12,7 +14,7 @@ from cx_utils import mat_center
from validation import *
import blmplots
import engines
import copy
class Model(Function):
"""Base class of bilinear models.
@ -39,19 +41,38 @@ class PCA(Model):
Model.__init__(self,id,name)
self._options = PcaOptions()
def pre_validation(self, amax, n_sets, val_engine):
"""Model calculations for maximum number of components.
def validation(self, amax, cv_val_sets, pert_val_sets, cv_val_method, pert_val_method):
"""Model validation and estimate of optimal numer of components.
"""
rmsep = val_engine(self.model['E0'], amax, n_sets)
self.model['rmsep'] = rmsep
self.model['aopt'] = rmsep.argmin()
if self._options['calc_cv']:
if cv_val_method == 'random':
sep, aopt = pca_cv_val(self.model['E0'], amax, cv_val_sets)
self.model['sep'] = sep
if self._options['calc_pert']:
if pert_val_method == 'random_diag':
sep, aopt = pca_alter_val(self.model['E0'], amax, pert_val_sets)
self.model['sep'] = sep
if self._options['calc_cv']==False and self._options['calc_pert']==False:
self.model['sep'] = None
aopt = self._options['amax']
if self._options['auto_aopt']:
logger.log("notice", "Auto aopt: " + str(aopt))
self._options['aopt'] = aopt
if aopt==1:
logger.log('notice', 'Aopt at first component!')
def confidence(self, aopt, n_sets, alpha, p_center,
crot, strict, cov_center ):
"""Returns a confidence measure for model parameters.
Based on aopt.
"""
aopt = self.model['aopt']
if aopt<2:
aopt = 2
logger.log('notice','Hotellings T2 needs more than 1 comp.\n switching to 2!!')
jk_segments = pca_jkP(self.model['E0'], aopt, n_sets)
Pcal = self.model['P'][:,:aopt]
tsq = hotelling(jk_segments, Pcal, p_center,
@ -96,8 +117,8 @@ class PCA(Model):
# vars
ids_1 = [dim_name_1, DX.get_identifiers(dim_name_1, sorted=True)]
# components (hidden)
pc_ids = ['_comp_a', map(str,range(self.model['aopt'])) ]
pc_ids_opt = ['_comp_o', map(str, range(self.model['aopt'])) ]
pc_ids = ['_amax', map(str,range(self._options['amax'])) ]
pc_ids_opt = ['_aopt', map(str, range(self._options['aopt'])) ]
zero_dim = ['_doe', ['0']] # null dim, vector (hidden)
match_ids = {'E':[ids_0, ids_1],
'E0':[ids_0, ids_1],
@ -121,8 +142,7 @@ class PCA(Model):
#try:
out.append(plt(self))
#except:
# print plt
#logger.log('debug', 'Plot: %s failed') %plt
# logger.log('debug', 'Plot: %s failed') %str(plt)
return out
def run_o(self, data):
@ -130,6 +150,8 @@ class PCA(Model):
"""
self.clear()
options = self._options
for item in options.items():
print item
self._dataset['X'] = data
self._data['X'] = data.asarray().astype('<f8')
if options['center']:
@ -138,7 +160,8 @@ class PCA(Model):
else:
self.model['E0'] = data.asarray()
self.pre_validation(**options.pre_validation_options())
self.validation(**options.validation_options())
self.model['aopt'] = self._options['aopt']
self.make_model(**options.make_model_options())
if options['calc_conf']:
self.confidence(**options.confidence_options())
@ -159,7 +182,6 @@ class PCA(Model):
if response == gtk.RESPONSE_OK:
# set output data and plots
dialog.set_output()
#run with current data and options
return self.run_o(data)
@ -172,10 +194,10 @@ class PLS(Model):
def pre_validation(self, amax, n_sets, val_engine):
"""Returns rmsec,rmsep for model.
"""
rmsep = val_engine(self.model['E0'], self.model['F0'],
rmsep, aopt = val_engine(self.model['E0'], self.model['F0'],
amax, n_sets)
self.model['rmsep'] = rmsep.mean(0)
self.model['aopt'] = rmsep.mean(0).argmin()
self.model['aopt'] = aopt
def confidence(self, aopt, n_sets, alpha, p_center,
crot, strict, cov_center ):
@ -341,34 +363,39 @@ class PcaOptions(Options):
opt['algo'] = 'pca'
opt['engine'] = engines.pca
opt['mode'] = 'normal' # how much info to calculate
opt['lod'] = 'compact' # how much info to store
opt['amax'] = 5
opt['aopt'] = 5
opt['amax'] = 10
opt['aopt'] = 100
opt['auto_aopt'] = False
opt['center'] = True
opt['center_mth'] = mat_center
opt['scale'] = 'scores'
opt['calc_conf'] = True
opt['n_sets'] = 5
opt['calc_conf'] = False
opt['n_sets'] = 5
opt['strict'] = True
opt['p_center'] = 'med'
opt['alpha'] = .8
opt['cov_center'] = 'med'
opt['crot'] = True
opt['val_engine'] = pca_alter_val
opt['val_n_sets'] = 10
opt['calc_cv'] = False
opt['calc_pert'] = True
opt['pert_val_method'] = 'random_diag'
opt['cv_val_method'] = 'random'
opt['cv_val_sets'] = 10
opt['pert_val_sets'] = 10
opt['all_data'] = [('T', 'scores', True),
('P', 'loadings', True),
('E','residuals', False),
('p_tsq', 't2', False),
('rmsep', 'root mean square error of prediction', False)
('rmsep', 'RMSEP', False)
]
opt['all_plots'] = [(blmplots.PcaScorePlot, 'Scores', True),
(blmplots.PcaLoadingPlot, 'Loadings', True),
(blmplots.LineViewXc, 'Line view', True)
(blmplots.LineViewXc, 'Line view', True),
(blmplots.PredictionErrorPlot, 'Residual Error', True)
]
opt['out_data'] = ['T','P', 'p_tsq']
@ -387,9 +414,10 @@ class PcaOptions(Options):
'strict', 'crot', 'cov_center']
return self._copy_from_list(opt_list)
def pre_validation_options(self):
def validation_options(self):
"""Options for pre_validation method."""
opt_list = ['amax', 'n_sets', 'val_engine']
opt_list = ['amax', 'cv_val_sets', 'pert_val_sets',
'cv_val_method', 'pert_val_method']
return self._copy_from_list(opt_list)
@ -411,7 +439,7 @@ class PlsOptions(Options):
opt['center'] = True
opt['center_mth'] = mat_center
opt['scale'] = 'scores'
opt['calc_conf'] = True
opt['calc_conf'] = False
opt['n_sets'] = 10
opt['strict'] = True
@ -420,13 +448,15 @@ class PlsOptions(Options):
opt['cov_center'] = 'med'
opt['crot'] = True
opt['calc_cv'] = True
opt['calc_pert'] = False
opt['val_engine'] = w_pls_cv_val
opt['all_data'] = [('T', 'scores', True),
('P', 'loadings', True),
('E','residuals', False),
('p_tsq', 't2', False),
('rmsep', 'root mean square error of prediction', False)
('rmsep', 'RMSEP', False)
]
opt['all_plots'] = [(blmplots.PlsScorePlot, 'Scores', True),
@ -468,9 +498,175 @@ class PcaOptionsDialog(OptionsDialog):
def __init__(self, data, options, input_names=['X']):
OptionsDialog.__init__(self, data, options, input_names)
glade_file = "/home/flatberg/Projects/project4/project4.glade"
notebook_name = "vbox1"
page_name = "Options"
self.add_page_from_glade(glade_file, notebook_name, page_name)
# connect signals to handlers
dic = {"on_amax_value_changed" : self.on_amax_changed,
"on_aopt_value_changed" : self.on_aopt_changed,
"auto_aopt_toggled" : self.auto_aopt_toggled,
"center_toggled" : self.center_toggled,
"on_scale_changed" : self.on_scale_changed,
"on_val_none" : self.val_toggled,
"on_val_cv" : self.cv_toggled,
"on_val_pert" : self.pert_toggled,
"on_cv_method_changed" : self.on_cv_method_changed,
"on_cv_sets_changed" : self.on_cv_sets_changed,
"on_pert_sets_changed" : self.on_pert_sets_changed,
"on_conf_toggled" : self.on_conf_toggled
}
self.wTree.signal_autoconnect(dic)
# set/ensure valid default values/ranges
amax_sb = self.wTree.get_widget("amax_spinbutton")
max_comp = min(data[0].shape) # max num of components
if self._options['amax']>max_comp:
logger.log('debug', 'amax default too large ... adjusting')
self._options['amax'] = max_comp
amax_sb.get_adjustment().set_all(self._options['amax'], 1, max_comp, 1, 0, 0)
# aopt spin button
aopt_sb = self.wTree.get_widget("aopt_spinbutton")
if self._options['aopt']>self._options['amax']:
self._options['aopt'] = self._options['amax'] + 1 - 1
aopt_sb.get_adjustment().set_all(self._options['aopt'], 1, self._options['amax'], 1, 0, 0)
# scale
scale_cb = self.wTree.get_widget("scale_combobox")
scale_cb.set_active(0)
# validation frames
if self._options['calc_cv']==False:
cv_frame = self.wTree.get_widget("cv_frame")
cv_frame.set_sensitive(False)
if self._options['calc_pert']==False:
pert_frame = self.wTree.get_widget("pert_frame")
pert_frame.set_sensitive(False)
cv = self.wTree.get_widget("cv_method").set_active(0)
pm = self.wTree.get_widget("pert_method").set_active(0)
# confidence
if self._options['calc_conf']==True:
self.wTree.get_widget("subset_frame").set_sensitive(True)
else:
self.wTree.get_widget("subset_frame").set_sensitive(False)
def on_amax_changed(self, sb):
logger.log("debug", "amax changed: new value: %s" %sb.get_value_as_int())
amax = sb.get_value_as_int()
# update aopt if needed
if amax<self._options['aopt']:
self._options['aopt'] = amax
aopt_sb = self.wTree.get_widget("aopt_spinbutton")
aopt_sb.get_adjustment().set_all(self._options['aopt'], 1, amax, 1, 0, 0)
self._options['amax'] = sb.get_value_as_int()
def on_aopt_changed(self, sb):
aopt = sb.get_value_as_int()
self._options['aopt'] = aopt
def auto_aopt_toggled(self, tb):
aopt_sb = self.wTree.get_widget("aopt_spinbutton")
if tb.get_active():
self._options['auto_aopt'] = True
aopt_sb.set_sensitive(False)
else:
self._options['auto_aopt'] = False
aopt_sb.set_sensitive(True)
def center_toggled(self, tb):
if tb.get_active():
self._options['center'] = True
else:
logger.log("debug", "centering set to False")
self._options['center'] = False
def on_scale_changed(self, cb):
scale = cb.get_active_text()
if scale=='Scores':
self._options['scale'] = 'scores'
elif scale=='Loadings':
self._options['scale'] = 'loads'
else:
raise IOError
def val_toggled(self, tb):
"""Callback for validation: None. """
cv_frame = self.wTree.get_widget("cv_frame")
pert_frame = self.wTree.get_widget("pert_frame")
cv_tb = self.wTree.get_widget("cv_toggle")
p_tb = self.wTree.get_widget("pert_toggle")
if tb.get_active():
self._options['calc_cv'] = False
self._options['calc_pert'] = False
cv_frame.set_sensitive(False)
pert_frame.set_sensitive(False)
cv_tb.set_sensitive(False)
p_tb.set_sensitive(False)
else:
p_tb.set_sensitive(True)
cv_tb.set_sensitive(True)
if p_tb.get_active():
pert_frame.set_sensitive(True)
self._options['calc_pert'] = True
if cv_tb.get_active():
cv_frame.set_sensitive(True)
self._options['calc_cv'] = True
def cv_toggled(self, tb):
cv_frame = self.wTree.get_widget("cv_frame")
if tb.get_active():
cv_frame.set_sensitive(True)
self._options['calc_cv'] = True
else:
cv_frame.set_sensitive(False)
self._options['calc_cv'] = False
def pert_toggled(self, tb):
pert_frame = self.wTree.get_widget("pert_frame")
if tb.get_active():
pert_frame.set_sensitive(True)
self._options['calc_pert'] = True
else:
pert_frame.set_sensitive(False)
self._options['calc_pert'] = False
def on_cv_method_changed(self, cb):
method = cb.get_active_text()
if method == 'Random':
self._options['cv_val_method'] = 'random'
def on_pert_method_changed(self, cb):
method = cb.get_active_text()
if method == 'Random diags':
self._options['pert_val_method'] = 'random_diag'
def on_cv_sets_changed(self, sb):
val = sb.get_value_as_int()
self._options['cv_val_sets'] = val
def on_pert_sets_changed(self, sb):
val = sb.get_value_as_int()
self._options['pert_val_sets'] = val
def on_conf_toggled(self, tb):
if tb.get_active():
self._options['calc_conf'] = False
self.wTree.get_widget("subset_frame").set_sensitive(False)
else:
self._options['calc_conf'] = True
self.wTree.get_widget("subset_frame").set_sensitive(True)
class PlsOptionsDialog(OptionsDialog):
"""Options dialog for Partial Least Squares Regression.
"""
def __init__(self, data, options, input_names=['X', 'Y']):
OptionsDialog.__init__(self, data, options, input_names)

View File

@ -12,8 +12,9 @@ fixme2:
colorbar, but when adding colors the colorbar shoud be created.
"""
from fluents import plots
from scipy import dot,sum,diag,arange,log,mean,newaxis
from scipy import dot,sum,diag,arange,log,mean,newaxis,sqrt
from matplotlib import cm
import pylab as PB
class PcaScorePlot(plots.ScatterPlot):
"""PCA Score plot"""
@ -103,38 +104,40 @@ class PlsLoadingPlot(plots.ScatterPlot):
def set_ordinate(self, n):
self.yaxis_data = self._T[:,n]
class LineViewXc(plots.LineViewPlot):
"""A line view of centered raw data
"""
def __init__(self, func_class, name='Profiles'):
def __init__(self, model, name='Profiles'):
# copy, center, plot
x = func_class._dataset['X'].copy()
x = model._dataset['X'].copy()
x._array = x._array - mean(x._array,0)[newaxis]
plots.LineViewPlot.__init__(self, x, 1, None, name)
class ParalellCoordinates(plots.Plot):
"""Parallell coordinates for score loads with many comp.
"""
def __init__(self, model, p='loads'):
pass
class PlsQvalScatter(plots.ScatterPlot):
"""A vulcano like plot of loads vs qvals
"""
def __init__(self, func_class, pc=0):
model = func_class.model
if not model.has_key('w_tsq'):
def __init__(self, model, pc=0):
if not model.model.has_key('w_tsq'):
return
self._W = model['P']
dataset_1 = func_class.as_dataset('P')
dataset_2 = func_class.as_dataset('w_tsq')
self._W = model.model['P']
dataset_1 = model.as_dataset('P')
dataset_2 = model.as_dataset('w_tsq')
id_dim = dataset_1.get_dim_name(0) #genes
sel_dim = dataset_1.get_dim_name(1) #_comp
sel_dim_2 = dataset_2.get_dim_name(1) #_zero_dim
id_1, = dataset_1.get_identifiers(sel_dim, [0])
id_2, = dataset_2.get_identifiers(sel_dim_2, [0])
if model.has_key('w_tsq'):
col = model['w_tsq'].ravel()
if model.model.has_key('w_tsq'):
col = model.model['w_tsq'].ravel()
col = normalise(col)
else:
col = 'g'
@ -143,6 +146,33 @@ class PlsQvalScatter(plots.ScatterPlot):
c=col, s=20, sel_dim_2=sel_dim_2,
name='Load Volcano')
class PredictionErrorPlot(plots.Plot):
"""A boxplot of prediction error vs. comp. number.
"""
def __init__(self, model, name="Pred. Err."):
if not model.model.has_key('sep'):
logger.log('notice', 'Model has no calculations of sep')
return
plots.Plot.__init__(self, name)
self._frozen = True
self.current_dim = 'johndoe'
self.ax = self.fig.add_subplot(111)
# draw
sep = model.model['sep']
aopt = model.model['aopt']
bx_plot_lines = self.ax.boxplot(sqrt(sep))
aopt_marker = self.ax.axvline(aopt, linewidth=10,
color='r',zorder=0,
alpha=.5)
# add canvas
self.add(self.canvas)
self.canvas.show()
def set_current_selection(self, selection):
pass
class InfluencePlot(plots.ScatterPlot):
"""

View File

@ -1,6 +1,7 @@
from scipy import apply_along_axis,newaxis,zeros,\
median,round_,nonzero,dot,argmax,any,sqrt,ndarray,\
trace,zeros_like,sign,sort,real,argsort,rand,array
trace,zeros_like,sign,sort,real,argsort,rand,array,\
matrix
from scipy.linalg import norm,svd,inv,eig
from scipy.stats import median,mean
@ -106,3 +107,7 @@ def mat_center(X,axis=0,ret_mn=False):
return Xs,mnX
else:
return Xs
def m_shape(array):
"""Returns the array shape on the form of a numpy.matrix."""
return matrix(array).shape

View File

@ -1,4 +1,3 @@
"""Module contain algorithms for (burdensome) calculations.
There is no typechecking of any kind here, just focus on speed
@ -7,7 +6,7 @@ There is no typechecking of any kind here, just focus on speed
from scipy.linalg import svd,norm,inv,pinv,qr
from scipy import dot,empty,eye,newaxis,zeros,sqrt,diag,\
apply_along_axis,mean,ones,randn,empty_like,outer,c_,\
rand,sum,cumsum
rand,sum,cumsum,matrix
def pca(a, aopt, scale='scores', mode='normal'):
""" Principal Component Analysis model
@ -19,6 +18,7 @@ def pca(a, aopt, scale='scores', mode='normal'):
m, n = a.shape
u, s, vt = svd(a, full_matrices=0)
eigvals = (1./m)*s
T = u*s
T = T[:,:aopt]
P = vt[:aopt,:].T
@ -43,6 +43,29 @@ def pca(a, aopt, scale='scores', mode='normal'):
return {'T':T, 'P':P, 'E':E}
def pcr(a, b, aopt=2, scale='scores', mode='normal'):
"""Returns Principal component regression model."""
m, n = a.shape
try:
k, l = b.shape
except:
k = b.shape[0]
l = 1
B = empty((aopt, n, l))
U, s, Vt = svd(a, full_matrices=True)
T = U*s
T = T[:,:aopt]
P = Vt[:aopt,:].T
Q = dot(dot(inv(dot(T.T, T)), T.T), b).T
for i in range(aopt):
ti = T[:,:i+1]
r = dot(dot(inv(dot(ti.T,ti)), ti.T), b)
B[i] = dot(P[:,:i+1], r)
E = a - dot(T, P.T)
F = b - dot(T, Q.T)
return {'T':T, 'P':P,'Q': Q, 'B':B, 'E':E, 'F':F}
def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
"""Kernel pls for tall/wide matrices.
@ -51,9 +74,9 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
"""
m, n = a.shape
if ab!=None:
mm,l = ab.shape
mm, l = m_shape(ab)
else:
k,l = b.shape
k, l = m_shape(b)
W = empty((n, aopt))
P = empty((n, aopt))
@ -66,7 +89,7 @@ def pls(a, b, aopt=2, scale='scores', mode='normal', ab=None):
ab = dot(a.T, b)
for i in range(aopt):
if ab.shape[1]==1:
w = ab
w = ab.reshape(mm, l)
else:
u, s, vh = svd(dot(ab.T, ab))
w = dot(ab, u[:,:1])
@ -147,12 +170,11 @@ def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
"""Undeflated Ridged svd(X'Y)
"""
m, n = a.shape
k, l = b.shape
k, l = m_shape(b)
u, s, vt = svd(b, full_matrices=0)
g0 = dot(u*s, u.T)
g = (1 - r)*g0 + r*eye(m)
ag = dot(a.T, g)
u, s, vt = svd(ag, full_matrices=0)
W = u[:,:aopt]
K = vt[:aopt,:].T
@ -167,7 +189,7 @@ def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
U = dot(g0, K) #fixme check this
Q = dot(b.T, dot(T, inv(dot(T.T, T)) ))
B = zeros((aopt, n, l))
B = zeros((aopt, n, l), dtype='f')
for i in range(aopt):
B[i] = dot(W[:,:i+1], Q[:,:i+1].T)
# leverages
@ -198,3 +220,6 @@ def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
return {'B':B, 'W':W, 'T':T, 'Q':Q, 'E':E, 'F':F, 'U':U, 'P':W}
def m_shape(array):
return matrix(array).shape

View File

@ -418,6 +418,31 @@ def weighted_laplacian(G,with_labels=False):
else:
return L
def subnetworks(G, T2):
"""Return the highest scoring (T2-test) subgraph og G.
Use simulated annealing to identify highly scoring subgraphs.
ref: -- Ideker et.al (Bioinformatics 18, 2002)
-- Patil and Nielsen (PNAS 2006)
"""
N = 1000
states = [(node, False) for node in G.nodes()]
t2_last = 0.0
for i in xrange(N):
if i==0: #assign random states
states = [(state[0], True) for state in states if rand(1)>.5]
sub_nodes = [state[0] for state in states if state[1]]
Gsub = NX.subgraph(G, sub_nodes)
Gsub = NX.connected_components_subgraphs(Gsub)[0]
t2 = [T2[node] for node in Gsub]
if t2>t2_last:
pass
else:
p = numpy.exp()
"""Below are methods for calculating graph metrics
@ -473,7 +498,7 @@ Ke = expm(A) .... expm(-A)?
# 13.09.2206: update for use in numpy
def K_expAdj(W, normalised=False, alpha=1.0):
def K_expAdj(W, normalised=True, alpha=1.0):
"""Matrix exponential of adjacency matrix, mentioned in Kandola as a general diffusion kernel.
"""
W = asarray(W)
@ -499,7 +524,7 @@ def K_expAdj(W, normalised=False, alpha=1.0):
return dot(dot(vr,psigma),vri)
def K_vonNeumann(W,normalised=False,alpha=1.0):
def K_vonNeumann(W, normalised=True, alpha=1.0):
""" The geometric series of path lengths.
Returns matrix square root of pseudo inverse of the adjacency matrix.
"""

View File

@ -30,22 +30,14 @@ def w_pls_gen(aat,b,n_blocks=None,center=True,index_out=False):
b_in = b[inn,:]
b_out = b[out,:]
if center:
# centering projector: I - (1/n)11'
# nin = len(inn)
# Pc = eye(nin) - outer(ones((nin,)),ones((nin,)))/nin
# xxt - x( outer(ones((nin,)),ones((nin,)))/nin ) x.T
# de jong:
h = sum(aat_in,0)[ :,newaxis]
h = (h - mean(h)/2)/len(inn)
mn_a = h + h.T
aat_in = aat_in - mn_a
aat_in, mn = outerprod_centering(aat_in)
aat_out = aat_out - mn
if index_out:
yield aat_in,aat_out,b_in,b_out,out
else:
yield aat_in,aat_out,b_in,b_out
def pls_gen(a,b, n_blocks=None, center=False, index_out=False,axis=0):
def pls_gen(a, b, n_blocks=None, center=False, index_out=False,axis=0, metric=None):
"""Random block crossvalidation
Leave-one-out is a subset, with n_blocks equals a.shape[-1]
"""
@ -56,17 +48,38 @@ def pls_gen(a,b, n_blocks=None, center=False, index_out=False,axis=0):
out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_blocks)]
for out in out_ind_sets:
inn = [i for i in index if i not in out]
acal = a.take(inn, 0)
atrue = a.take(out, 0)
bcal = b.take(inn, 0)
btrue = b.take(out, 0)
if center:
a = a - mean(a,0)[newaxis]
b = b - mean(b,0)[newaxis]
mn_a = acal.mean(0)[newaxis]
acal = acal - mn_a
atrue = atrue - mn_a
mn_b = bcal.mean(0)[newaxis]
bcal = bcal - mn_b
btrue = btrue - mn_b
if metric!=None:
acal = dot(acal, metric)
if index_out:
yield a.take(inn,0),a.take(out,0), b.take(inn,0),b.take(out,0),out
yield acal, atrue, bcal, btrue, out
else:
yield a.take(inn,0),a.take(out,0), b.take(inn,0),b.take(out,0)
yield acal, atrue, bcal, btrue
def pca_gen(a, n_sets=None, center=False, index_out=False, axis=0):
"""PCA random block crossval generator.
"""Returns a generator of crossvalidation sample segments.
input:
-- a, data matrix (m x n)
-- n_sets, number of segments/subsets to generate.
-- center, bool, choice of centering each subset
-- index_out, bool, return subset index
-- axis, int, which axis to get subset from
ouput:
-- V, generator with (n_sets) memebers (subsets)
"""
m = a.shape[axis]
index = randperm(m)
@ -76,14 +89,19 @@ def pca_gen(a,n_sets=None, center=False, index_out=False,axis=0):
out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_sets)]
for out in out_ind_sets:
inn = [i for i in index if i not in out]
acal = a.take(inn, 0)
atrue = a.take(out, 0)
if center:
a = a - mean(a,0)[newaxis]
mn_a = acal.mean(0)[newaxis]
acal = acal - mn_a
atrue = atrue - mn_a
if index_out:
yield a.take(inn,0),a.take(out,0),out
yield acal, atrue, out
else:
yield a.take(inn,0),a.take(out,0)
yield acal, atrue
def w_pls_gen_jk(a,b,n_sets=None,center=True,index_out=False,axis=0):
def w_pls_gen_jk(a, b, n_sets=None, center=True,
index_out=False, axis=0):
"""Random block crossvalidation for wide X (m>>n)
Leave-one-out is a subset, with n_sets equals a.shape[-1]
@ -103,9 +121,8 @@ def w_pls_gen_jk(a,b,n_sets=None,center=True,index_out=False,axis=0):
a_in = a[inn,:]
mn_a = 0
mAB = 0
if center:
mn_a = mean(a,0)[newaxis]
mn_a = a_in.mean(0)[newaxis]
mAin = dot(-ones((1,nout)), a[out,:])/nin
mBin = dot(-ones((1,nout)), b[out,:])/nin
mAB = dot(mAin.T, (mBin*nin))
@ -113,9 +130,9 @@ def w_pls_gen_jk(a,b,n_sets=None,center=True,index_out=False,axis=0):
a_in = a_in - mn_a
if index_out:
yield ain,ab, out
yield a_in, ab_in, out
else:
yield a_in, ab
yield a_in, ab_in
def shuffle_1d_block(a, n_sets=None, blocks=None, index_out=False, axis=0):
"""Random block shuffling along 1d axis
@ -185,3 +202,19 @@ def diag_pert(a, n_sets=10, center=True, index_out=False):
yield a_out, asarray(out)
else:
yield a_out
def outerprod_centering(aat, ret_mn=True):
"""Returns mean centered symmetric outerproduct matrix.
"""
n = aat.shape[0]
h = aat.sum(0)[:,newaxis]
h = (h - mean(h)/2)/n
mn_a = h + h.T
aatc = aat - mn_a
if ret_mn:
return aatc, aat.mean(0)
return aat - mn_a

View File

@ -1,20 +1,24 @@
"""This module implements some common validation schemes from pca and pls.
"""
from scipy import ones,mean,sqrt,dot,newaxis,zeros,sum,empty,\
apply_along_axis,eye, kron
apply_along_axis,eye,kron,array,sort
from scipy.stats import median
from scipy.linalg import triu,inv,svd,norm
from select_generators import w_pls_gen,w_pls_gen_jk,pls_gen,pca_gen,diag_pert
from engines import w_simpls,pls,bridge,pca
from pylab import *
from cx_utils import m_shape
def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'):
"""RMSEP calc for pls with wide X.
"""Returns and RMSEP for pls tailored for wide X.
"""
k, l = Y.shape
k, l = m_shape(Y)
PRESS = zeros((l, amax+1), dtype='f')
# X,Y are centered
# X,Y are centered0
if n_blocks==None:
n_blocks = Y.shape[0]
V = w_pls_gen(dot(X, X.T), Y, n_blocks=n_blocks, center=True)
XXt = dot(X, X.T)
V = w_pls_gen(XXt, Y, n_blocks=n_blocks, center=True)
for Din, Doi, Yin, Yout in V:
ym = -sum(Yout, 0)[newaxis]/(1.0*Yin.shape[0])
Yin = Yin - ym
@ -24,7 +28,6 @@ def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'):
Q, U, H = dat['Q'], dat['U'], dat['H']
That = dot(Doi, dot(U, inv(triu(dot(H.T,U))) ))
else:
"Other algo-support comming soon"
raise NotImplementedError
#Yhat = empty((amax, k, l),dtype='<f8')
Yhat = []
@ -34,13 +37,14 @@ def w_pls_cv_val(X, Y, amax, n_blocks=None, algo='simpls'):
E = E + sum(E, 0)/Din.shape[0]
PRESS[j,1:] = PRESS[j,1:] + sum(E**2, 0)
#Yhat = Y - dot(That,Q.T)
return sqrt(PRESS/Y.shape[0])
rmsep = sqrt(PRESS/Y.shape[0])
aopt = find_aopt_from_sep(rmsep)
return rmsep, aopt
def pls_val(X, Y, amax=2, n_blocks=10,algo='pls'):
""" Validation results of pls model.
"""
k, l = Y.shape
k, l = m_shape(Y)
PRESS = zeros((l, amax+1), dtype='<f8')
EE = zeros((amax, k, l), dtype='<f8')
Yhat = zeros((amax, k, l), dtype='<f8')
@ -50,6 +54,7 @@ def pls_val(X, Y, amax=2, n_blocks=10,algo='pls'):
ym = -sum(Yout,0)[newaxis]/Yin.shape[0]
Yin = (Yin - ym)
PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)
if algo=='pls':
dat = pls(Xin, Yin, amax, mode='normal')
elif algo=='bridge':
@ -62,7 +67,9 @@ def pls_val(X, Y, amax=2, n_blocks=10,algo='pls'):
EE[a,out,:] = E
PRESS[:,a+1] = PRESS[:,a+1] + sum(E**2,0)
return sqrt(PRESS/(k-1.)), EE, Yhat
rmsep = sqrt(PRESS/(k-1.))
aopt = find_aopt_from_sep(rmsep)
return rmsep, aopt
def pca_alter_val(a, amax, n_sets=10, method='diag'):
"""Pca validation by altering elements in X.
@ -79,18 +86,27 @@ def pca_alter_val(a, amax, n_sets=10,method='diag'):
EE = a_sub - Xhat.ravel().take(ind)
tot = (a_sub**2).sum()
sep[i,j] = (EE**2).sum()/tot
return sqrt(sep.mean(0))
#return sep
sep = sqrt(sep)
aopt = find_aopt_from_sep(sep)
return sep, aopt
def pca_cv_val(X, amax, n_sets):
""" Cross validation of pca using random sets crossval.
def pca_cv_val(a, amax, n_sets):
""" Returns PRESS from cross-validated pca using random segments.
input:
-- a, data matrix (m x n)
-- amax, maximum nuber of components used
-- n_sets, number of segments to calculate
output:
-- sep, (amax x m x n), squared error of prediction (press)
-- aopt, guestimated optimal number of components
"""
m, n = X.shape
xtot = (X**2).sum()
V = pca_gen(X, n_sets=7, center=True, index_out=True)
m, n = a.shape
E = empty((amax, m, n), dtype='f')
xtot = (a**2).sum() # this needs centering
V = pca_gen(a, n_sets=7, center=True, index_out=True)
for xi, xout, ind in V:
dat_i = pca(xi, amax, mode='detailed')
dat_i = pca(xi, amax, mode='fast')
Pi = dat_i['P']
for a in xrange(amax):
Pia = Pi[:,:a+1]
@ -99,7 +115,9 @@ def pca_cv_val(X, amax, n_sets):
sep = []
for a in xrange(amax):
sep.append(E[a].sum()/xtot)
return sqrt(sep.mean(0))
sep = array(sep)
aopt = find_aopt_from_sep(sep)
return sep, aopt
def pls_jkW(a, b, amax, n_blocks=None, algo='pls', use_pack=True):
""" Returns CV-segments of paramter W for wide X.
@ -128,7 +146,20 @@ def pls_jkW(a, b, amax, n_blocks=None, algo='pls', use_pack=True):
return WW
def pca_jkP(a, aopt, n_blocks=None):
""" Returns CV-segments of paramter P.
"""Returns loading from PCA on CV-segments.
input:
-- a, data matrix (n x m)
-- aopt, number of components in model.
-- nblocks, number of segments
output:
-- PP, loadings collected in a three way matrix
(n_segments, m, aopt)
comments:
* The loadings are scaled with the (1/samples)*eigenvalues.
* Crossvalidation method is currently set to random blocks of samples.
todo: add support for T
fixme: more efficient to add this in validation loop
"""
@ -138,8 +169,30 @@ def pca_jkP(a, aopt, n_blocks=None):
PP = empty((n_blocks, a.shape[1], aopt), dtype='f')
V = pca_gen(a, n_sets=n_blocks, center=True)
for nn,(a_in, a_out) in enumerate(V):
dat = pca(a_in, aopt, mode='fast')
dat = pca(a_in, aopt, mode='fast', scale='loads')
P = dat['P']
PP[nn,:,:] = P
return PP
def find_aopt_from_sep(sep, method='75perc'):
"""Returns an estimate of optimal number of components from rmsecv.
"""
if method=='vanilla':
# min rmsep
rmsecv = sqrt(sep.mean(0))
return rmsecv.argmin() + 1
elif method=='75perc':
prct = .75 #percentile
ind = 1.*sep.shape[0]*prct
med = median(sep)
prc_75 = []
for col in sep.T:
col.sort()
prc_75.append(col[int(ind)])
prc_75 = array(prc_75)
for i in range(1, sep.shape[1], 1):
if med[i-1]<prc_75[i]:
return i
return len(med)