diff --git a/system/dataset.py b/system/dataset.py
index 5823980..5dfbe4e 100644
--- a/system/dataset.py
+++ b/system/dataset.py
@@ -1,4 +1,4 @@
-from scipy import atleast_2d,asarray,ArrayType,shape
+from scipy import atleast_2d,asarray,ArrayType,shape,nonzero
 from scipy import sort as array_sort
 from itertools import izip
 
@@ -182,7 +182,7 @@ class Dataset:
         else:
             index = [self._map[dim][key] for key in idents]
         return asarray(index)
-    
+
 class CategoryDataset(Dataset):
     """The category dataset class.
 
@@ -192,25 +192,30 @@ class CategoryDataset(Dataset):
     There is support for using a less memory demanding, and fast
     intersection look-ups by representing the binary matrix as a
     dictionary in each dimension.
+
+    Always has linked dimension in first dim:
+    ex matrix:
+    go_term1 go_term2 ...
+    gene_1
+    gene_2
+    gene_3
+    .
+    .
+    .
     """
 
-    def __init__(self):
-        Dataset.__init__(self)
+    def __init__(self,array,identifiers=None,name='A'):
+        Dataset.__init__(self,array,identifiers=identifiers,name=name)
+        self.has_dictlists = False
 
-    def as_collection(self,dim):
-        """Returns data as collection along dim"""
-        pass
-
-    def add_collection(self,input_dict):
-        """Adds a category data as collection.
-
-        A collection is a datastructure that contains a dictionary for
-        each pair of dimension in dataset, keyed by identifiers and
-        values is a set of identifiers in the other dimension
-        """
-        #build category data as double dicts
-        pass
-
+    def as_dict_lists(self):
+        """Returns data as dict of indices along first dim"""
+        data={}
+        for name,ind in self._map[self.get_dim_name(0)].items():
+            data[name] = list(nonzero(self._array[ind,:]))
+        self._dictlists = data
+        self.has_dictlists=True
+        return data
 
 class GraphDataset(Dataset):
     """The graph dataset class.