diff --git a/fluents/lib/cv_index.py b/fluents/lib/cv_index.py
new file mode 100644
index 0000000..75ca4f1
--- /dev/null
+++ b/fluents/lib/cv_index.py
@@ -0,0 +1,71 @@
+from numpy import array_split, arange, concatenate
+
+
+def cv(n, k, randomise=False, sequential=False):
+    """
+    Generates k (training, validation) index pairs.
+
+    Each pair is a partition of arange(n), where validation is an iterable
+    of length ~n/k.
+
+    If randomise is true, a copy of the index is shuffled before
+    partitioning, otherwise its order is preserved in training and
+    validation.
+
+    Randomise overrides the sequential argument: if randomise is true,
+    sequential is set to False.
+
+    If sequential is true the index is partitioned into contiguous blocks,
+    otherwise interleaved ordering is used.
+    """
+    index = range(n)
+    if randomise:
+        from random import shuffle
+        index = list(index)
+        shuffle(index)
+        sequential = False
+    if sequential:
+        for validation in array_split(index, k):
+            training = [i for i in index if i not in validation]
+            yield training, validation
+    else:
+        for fold in xrange(k):
+            training = [i for i in index if i % k != fold]
+            validation = [i for i in index if i % k == fold]
+            yield training, validation
+
+def shuffle_diag(shape, K, randomise=False, sequential=False):
+    """
+    Generates K index arrays into a flattened matrix of the given shape.
+
+    Each array is the union of strided diagonals through the (m, n)
+    matrix, giving a "speckled" hold-out pattern for cross-validating
+    a two-way data table.
+    """
+    m, n = shape
+
+    if K > m or K > n:
+        msg = "You may not use more subsets than min(n_rows, n_cols)"
+        raise ValueError(msg)
+
+    mon = max(m, n)
+    # starting flat indices, spaced m apart across the flattened matrix
+    index = [i for i in range(m*n) if i % m == 0]
+    if randomise:
+        from random import shuffle
+        shuffle(index)
+        sequential = False
+
+    if sequential:
+        # adjacent starting indices fall in the same subset
+        groups = array_split(index, K)
+    else:
+        # interleaved assignment: starting index i goes to subset i % K
+        groups = [[index[i] for i in xrange(n) if i % K == k]
+                  for k in xrange(K)]
+
+    for start_inds in groups:
+        # each start traces one diagonal with stride mon+1 through the
+        # flattened matrix; the subset is the union of these diagonals
+        yield concatenate([arange(start, n*m, mon + 1)
+                           for start in start_inds])
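
For orientation, a minimal sketch of how the two generators could be consumed once the new module is importable (the import path fluents.lib.cv_index comes from the file added above; the array shape, fold counts, and the zero-blanking step are made-up values for illustration):

    import numpy as np
    from fluents.lib.cv_index import cv, shuffle_diag

    X = np.random.rand(10, 8)      # hypothetical 10x8 data table

    # ordinary k-fold split over the 10 rows (interleaved ordering)
    for training, validation in cv(10, 5):
        X_train, X_val = X[training], X[validation]

    # diagonal ("speckled") hold-out over the flattened matrix
    for holdout in shuffle_diag(X.shape, 4, randomise=True):
        X_flat = X.flatten()
        X_flat[holdout] = 0.0      # e.g. blank held-out cells before refitting

Both functions are generators, so the index sets are produced lazily and can be iterated once per model fit.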