diff --git a/pyblm/crossvalidation.py b/pyblm/crossvalidation.py
index a824068..ab7aded 100644
--- a/pyblm/crossvalidation.py
+++ b/pyblm/crossvalidation.py
@@ -140,7 +140,7 @@ def pca_jk(a, aopt, n_blocks=None):
 def pls_jk(X, Y, a_opt, nsets=None, center=True, verbose=False):
     """ Returns jack-knife segments of W.

-    *Parameters*:
+    *Parameters*:

     X : {array}
         Main data matrix (m, n)
@@ -251,12 +251,14 @@ def find_aopt_from_sep(sep, method='vanilla'):
     closely before deciding on the optimal number of components.

     *Parameters*:
+
     sep : {array}
         Squared error of prediction
     method : ['vanilla', '75perc']
         Method used to estimate the optimal number of components

     *Returns*:
+
     aopt : {integer}
         A guess on the optimal number of components
     """
@@ -297,12 +299,14 @@ def cv(N, K, randomise=True, sequential=False):
         Use sequential sampling

     *Returns*:
+
     training : {array-like}
         Training indices
     validation : {array-like}
         Validation indices

     *Notes*:
+
     If randomise is true, a copy of the index is shuffled before partitioning,
     otherwise its order is preserved in training and validation.

@@ -334,9 +338,10 @@ def cv(N, K, randomise=True, sequential=False):
 def diag_cv(shape, nsets=9):
     """Generates K (training, validation) index pairs.

-    Parameters:
-    N : {integer}
-        alpha -- scalar, approx. portion of data perturbed
+    *Parameters*:
+
+    shape : {tuple}
+        Shape (m, n) of the data matrix to be segmented
+    nsets : {integer}
+        Number of cross-validation segments
     """
     try:
         m, n = shape
@@ -371,7 +376,7 @@ def class_error(Yhat, Y, method='vanilla'):
     return Yhat_c, err

-def class_errorII(T, Y, method='lda'):
+def _class_errorII(T, Y, method='lda'):
     """ Not used ...
     """
     pass
diff --git a/pyblm/models.py b/pyblm/models.py
new file mode 100644
index 0000000..17c6730
--- /dev/null
+++ b/pyblm/models.py
@@ -0,0 +1,282 @@
+"""Bilinear models"""
+
+from numpy import expand_dims, ones, dot, empty, random
+
+from engines import pca as pcaengine
+
+def mean_center(x, axis=0):
+    """Mean center across axis."""
+    return expand_dims(-x.mean(axis), axis)
+
+def scale(x, axis=0):
+    """Scale across axis."""
+    weights = ones((x.shape[axis - 1],))
+    #weights = 1./x.std(axis)
+    return expand_dims(weights, axis)
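+
+# Doctest-style sanity check for the two helpers above (a minimal sketch;
+# the 5x3 array is only an illustration, not part of the module API).
+# mean_center returns the negated column means with an extra axis, so
+# adding the result back onto x centers its columns:
+#
+#     >>> from numpy import random, allclose
+#     >>> x = random.rand(5, 3)
+#     >>> allclose((x + mean_center(x, axis=0)).mean(0), 0.0)
+#     True
+#
+# scale currently returns unit weights, so x * scale(x) leaves x unchanged;
+# re-enabling the commented 1./x.std(axis) line would give autoscaling.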
+
+class Model(object):
+    def __init__(self, name="johndoe"):
+        self.name = name
+        self.options = {}
+
+    def save(self, filename='pca.ml'):
+        pass
+
+    def load(self):
+        pass
+
+    def clear(self):
+        # drop cached attributes of the form "_name" (but not "__name")
+        for param in self.__dict__.keys():
+            if param.startswith("_") and param[1] != "_":
+                delattr(self, param)
+
+    def clear_core(self):
+        # drop every underscore-prefixed attribute
+        for param in self.__dict__.keys():
+            if param.startswith("_"):
+                delattr(self, param)
+        #self.clear()
+
+
+class PCA(Model):
+    def __init__(self, x, amax=10):
+        Model.__init__(self, name="PCA")
+        self._x = x
+        self.amax = amax
+        self.aopt = amax
+
+    # properties
+    def amax():
+        doc = "maximum number of components"
+        def fget(self):
+            return self._amax
+        def fset(self, a):
+            assert a > 0
+            a = min(a, min(self.x.shape))
+            if hasattr(self, "_amax"):
+                if a > self._amax:
+                    # fixme: do a model update
+                    raise NotImplementedError
+            self._amax = a
+        def fdel(self):
+            pass
+        return locals()
+    amax = property(**amax())
+
+    def tot_var():
+        doc = "total variance"
+        def fget(self):
+            if not hasattr(self, "_tot_var"):
+                self._tot_var = (self.xw**2).sum()
+            return self._tot_var
+        def fset(self, tv):
+            self._tot_var = tv
+        def fdel(self):
+            del self._tot_var
+        return locals()
+    tot_var = property(**tot_var())
+
+    def scores():
+        doc = "pca scores"
+        def fget(self):
+            if not hasattr(self, "_scores"):
+                u, s, v, tot_var = pcaengine(self.xw, self.amax)
+                self._scores = u
+                self.singvals = s
+                self.loadings = v
+                self.tot_var = tot_var
+            return self._scores[:,:self.amax]
+        def fset(self, t):
+            self._scores = t
+        def fdel(self):
+            del self._scores
+        return locals()  # credit: David Niergarth
+    scores = property(**scores())
+
+    def loadings():
+        doc = "pca loadings"
+        def fget(self):
+            if not hasattr(self, "_loadings"):
+                u, s, v, tot_var = pcaengine(self.xw, self.amax)
+                self._loadings = v
+                self.scores = u
+                self.singvals = s
+                self.tot_var = tot_var
+            return self._loadings[:,:self.amax]
+        def fdel(self):
+            del self._loadings
+        def fset(self, p):
+            self._loadings = p
+        return locals()
+    loadings = property(**loadings())
+
+    def singvals():
+        doc = "singular values"
+        def fget(self):
+            if not hasattr(self, "_singvals"):
+                u, s, v, tot_var = pcaengine(self.xw, self.amax)
+                self._singvals = s
+                self.scores = u
+                self.loadings = v
+                self.tot_var = tot_var
+            return self._singvals[:self.amax]
+        def fset(self, w):
+            self._singvals = w
+        def fdel(self):
+            del self._singvals
+        return locals()
+    singvals = property(**singvals())
+
+    def x():
+        doc = "x is read-only and may not be deleted"
+        def fget(self):
+            return self._x
+        def fdel(self):
+            pass
+        return locals()
+    x = property(**x())
+
+    def xadd():
+        doc = "column means"
+        def fget(self):
+            if not hasattr(self, "_xadd"):
+                self._xadd = mean_center(self.x, axis=0)
+            return self._xadd
+        def fset(self, mnx):
+            if hasattr(self, "_xc"):
+                del self._xc
+            self._xadd = mnx
+        def fdel(self):
+            del self._xadd
+            if hasattr(self, "_xc"):
+                del self._xc
+        return locals()
+    xadd = property(**xadd())
+
+    def xc():
+        doc = "centered input data"
+        def fget(self):
+            if not hasattr(self, "_xc"):
+                self._xc = self.x + self.xadd
+            return self._xc
+        def fset(self, xc):
+            self._xc = xc
+        def fdel(self):
+            del self._xc
+        return locals()
+    xc = property(**xc())
+
+    def xw():
+        doc = "scaled input data"
+        def fget(self):
+            if not hasattr(self, "_xw"):
+                if self.x.shape == self.row_metric.shape:
+                    self._xw = dot(self.xc, self.row_metric)
+                else:
+                    self._xw = self.xc * self.row_metric
+            return self._xw
+        def fset(self, xw):
+            self._xw = xw
+        def fdel(self):
+            del self._xw
+        return locals()
+    xw = property(**xw())
+
+    def explained_variance():
+        doc = "explained variance"
+        def fget(self):
+            if not hasattr(self, "_explained_variance"):
+                self._explained_variance = 100*(self.singvals**2)/self.tot_var
+            return self._explained_variance[:self.amax]
+        def fset(self, ev):
+            self._explained_variance = ev
+        def fdel(self):
+            del self._explained_variance
+        return locals()
+    explained_variance = property(**explained_variance())
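+
+    # The property above encodes the standard PCA variance decomposition
+    # (an illustrative identity, not extra behaviour): component a explains
+    #
+    #     ev_a = 100 * singvals[a]**2 / tot_var
+    #
+    # percent of the total sum of squares of the preprocessed matrix, so,
+    # assuming the pca engine in engines.py returns the full decomposition,
+    # the ev_a summed over all min(m, n) components add up to 100.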
+
+    def residuals():
+        doc = "residuals"
+        def fget(self):
+            if not hasattr(self, "_residuals"):
+                res = empty((self.amax, self.x.shape[0], self.x.shape[1]))
+                for a in range(self.amax):
+                    res[a,:,:] = self.xw - dot(self.scores[:,:a+1], self.loadings[:,:a+1].T)
+                self._residuals = res
+            return self._residuals
+        def fset(self, e):
+            self._residuals = e
+        def fdel(self):
+            del self._residuals
+        return locals()
+    residuals = property(**residuals())
+
+    def leverage():
+        doc = "object leverage"
+        def fget(self):
+            if not hasattr(self, "_leverage"):
+                u = self.scores/self.singvals
+                self._leverage = empty((self.amax, u.shape[0]))
+                for i in range(self.amax):
+                    self._leverage[i,:] = 1./u.shape[0] + (u[:,:i+1]**2).sum(1)
+            return self._leverage[:self.amax,:]
+        def fset(self, lev):
+            self._leverage = lev
+        def fdel(self):
+            del self._leverage
+        return locals()
+    leverage = property(**leverage())
+
+    def row_metric():
+        doc = "row metric"
+        def fget(self):
+            if not hasattr(self, "_row_metric"):
+                self._row_metric = scale(self.xc, axis=0)
+            return self._row_metric
+        def fset(self, w):
+            self._row_metric = w
+        def fdel(self):
+            del self._row_metric
+            # the scaled data depends on the metric, so drop its cache
+            if hasattr(self, "_xw"):
+                del self._xw
+        return locals()
+    row_metric = property(**row_metric())
+
+    def column_metric():
+        doc = "column metric"
+        def fget(self):
+            if not hasattr(self, "_column_metric"):
+                self._column_metric = scale(self.xc, axis=1)
+            return self._column_metric
+        def fset(self, w):
+            self._column_metric = w
+            # fixme: update model
+        def fdel(self):
+            del self._column_metric
+            if hasattr(self, "_xw"):
+                del self._xw
+        return locals()
+    column_metric = property(**column_metric())
+
+    def blm_update(self, a, b):
+        pass
+
+    def append_columns(self, cols):
+        pass
+
+    def append_rows(self, rows):
+        pass
+
+    def delete_columns(self, index):
+        pass
+
+    def delete_rows(self, index):
+        pass
+
+    def reweight(self):
+        pass
+
+
+if __name__ == "__main__":
+    X = random.rand(4, 10)
+    pcaobj = PCA(X)
+    print "explained variance:", pcaobj.explained_variance
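+    # A few extra smoke tests (an illustrative sketch; the expected shapes
+    # assume the pca engine in engines.py returns amax components, with
+    # amax capped at min(m, n) = 4 for the 4x10 matrix above):
+    print "scores:", pcaobj.scores.shape        # expected (4, 4)
+    print "loadings:", pcaobj.loadings.shape    # expected (10, 4)
+    print "leverage:", pcaobj.leverage.shape    # expected (4, 4)
+    print "residuals:", pcaobj.residuals.shape  # expected (4, 4, 10)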