Complete rewrite of dataset class, with (all) the necessary updates

2006-04-24 09:53:07 +00:00
parent 53d0228074
commit a2e4392a72
9 changed files with 426 additions and 234 deletions
--- a/system/dataset.py
+++ b/system/dataset.py
@@ -1,136 +1,225 @@
 import logger
-from scipy import array,take,asarray,shape,nonzero
-import project
-from itertools import izip 
+from scipy import atleast_2d,asarray,ArrayType


 class Dataset:
-    """Dataset base class.
-
+    """The Dataset base class.
+    
    A Dataset is an n-way array with defined string identifiers across
    all dimensions.
+
+    example of use:
+
+    ---
+    dim_name_rows = 'rows'
+    names_rows = ('row_a','row_b')
+    ids_1 = [dim_name_rows, names_rows]
+
+    dim_name_cols = 'cols'
+    names_cols = ('col_a','col_b','col_c','col_d')
+    ids_2 = [dim_name_cols, names_cols]
+
+    Array_X = rand(2,4)
+    data = Dataset(Array_X,(ids_1,ids_2),name="Testing")
+
+    dim_names = [dim for dim in data]
+
+    column_identifiers = [id for id in data['cols'].keys()]
+    column_index = [index for index in data['cols'].values()]
+
+    'cols' in data -> True
+
+    ---
+
+    data = Dataset(rand(10,20)) (generates dims and ids (no links))
    """
-    def __init__(self, input_array, def_list, name="Unnamed data"):
-        self._name = name
-        self._data = asarray(input_array)
-        dims = shape(self._data)
-        self.def_list = def_list
-        self._ids_set = set()
-        self.ids={}
-        self._dim_num = {}
-        self._dim_names = []
-        if len(dims)==1: # a vector is defined to be column vector!
-            self.dims = (dims[0],1)
-        else:
-            self.dims = dims
-        if len(def_list)!=len(self.dims):
-            raise ValueError,"array dims and identifyer mismatch"
-        for axis,(dim_name,ids) in enumerate(def_list):
-            enum_ids = {}
-            #if dim_name not in project.c_p.dim_names:
-            #    dim_name = project.c_p.suggest_dim_name(dim_name)
-            if not ids:
-                logger.log('debug','Creating identifiers along: '+ str(dim_name))
-                ids = self._create_identifiers(axis)
-            for num,name in enumerate(ids):
-                enum_ids[name] = num
-            self.ids[dim_name] = enum_ids
-            self._ids_set = self._ids_set.union(set(ids))
-            self._dim_num[dim_name] = axis
-            self._dim_names.append(dim_name)
-                    
-        for (dimname, ids), d in izip(def_list,self.dims): #check that data and labels match 
-            if len(self.ids[dimname]) != d:
-                raise ValueError,"dim size and identifyer mismatch"
+    def __init__(self,array=None,identifiers=None,shape=None,all_dims=[],**kwds):
+        self._name = kwds.get("name","Unnamed data")
+        self._dims = [] #existing dimensions in this dataset
+        self._map = {} # internal mapping for dataset:  identifier <--> index
+        self.has_array = False
+        self.shape = None
+    
+        if array==None:
+            if shape == None:
+                raise ValueError, "Must define shape if array is None"
+            else:
+                self.shape = shape
+                if identifiers!=None:
+                    self._set_identifiers(identifiers,all_dims)
+                else:
+                    ids = self._create_identifiers(shape,all_dims)
+                    self._set_identifiers(ids,all_dims)
+        elif isinstance(array,ArrayType):
+            array = atleast_2d(asarray(array))
+            self.shape = array.shape
+            if shape != None:
+                if self.shape!=shape:
+                    #logger.log("debug","Dataset and input shape mismatch")
+                    raise ValueError
+            if identifiers!=None:
+                self._set_identifiers(identifiers,all_dims)
+            else:
+                ids = self._create_identifiers(self.shape,all_dims)
+                self._set_identifiers(ids,all_dims)
            
-    def get_name(self):
+            self._array = array
+            self.has_array = True
+            
+        else:
+            raise ValueError, "array input must be of ArrayType or None"
+
+        self._all_dims = all_dims
+                        
+    def __str__(self): # was misspelled "__str__self", so str(dataset) never reached it
        return self._name

-    def get_dim_names(self):
-        return self._dim_names
-    
-    def names(self,axis=0):
-        """Returns identifier names of a dimension.
-        NB: sorted by values!
-        OK? necessary?"""
+    def __iter__(self):
+        """Iterate over the dimension names of the dataset, in stored order."""
+        return iter(self._dims)
+
+    def __contains__(self,dim):
+        """Tell whether dim is one of the dimension names of the dataset."""
+        # membership is answered by the identifier map rather than self._dims
+        return dim in self._map
+
+    def __len__(self):
+        """Returns the number of dimensions in the dataset (entries in the identifier map)."""
+        return len(self._map)
+
+    def __getitem__(self,dim):
+        """Return the identifier -> index mapping (a dict) along the dimension dim."""
+        return self._map[dim]
+
+    def _create_identifiers(self,shape,all_dims):
+        """Creates dimension names and identifier names, and returns
+        identifiers."""
        
-        if type(axis)==int:
-            dim_name = self._dim_names[axis]
-        elif type(axis)==str:
-            dim_name = axis
-        if dim_name not in self._dim_names:
-            raise ValueError, dim_name + " not a dimension in dataset" 
-        items = self.ids[dim_name].items()
-        backitems=[ [v[1],v[0]] for v in items]
-        backitems.sort()
-        sorted_ids=[ backitems[i][1] for i in range(0,len(backitems))]
-        return sorted_ids
-    
-    def extract_data(self,ids,dim_name):
-        """Extracts data along a dimension by identifiers"""
-        new_def_list = self.def_list[:]
-        ids_index = [self.ids[dim_name][id_name] for id_name in ids]
-        dim_number = self._dim_num[dim_name]
-        try:
-            out_data = take(self._data,ids_index,axis=dim_number)
-        except:
-            raise ValueError
-        new_def_list[dim_number][1] = ids
-        extracted_data = Dataset(out_data,def_list=new_def_list,parents=self.parents)
-        return extracted_data
-
-    def _create_identifiers(self,axis):
-        """Creates identifiers along an axis"""
-        n_dim = self.dims[axis]
-        return [str(axis) + '_' + str(i) for i in range(n_dim)]
-
-    def extract_id_from_index(self,dim_name,index):
-        """Returns a set of ids from array/list of indexes."""
-        dim_ids = self.ids[dim_name]
-        if type(index)==int:
-            index = [index]
-            
-        return set([id for id,ind in dim_ids.items() if ind in index])
-
-    def extract_index_from_id(self,dim_name,id):
-        """Returns an array of indexes from a set/list of identifiers
-        (or a single id)"""
-        dim_ids = self.ids[dim_name]
-        return array([ind for name,ind in dim_ids.items() if name in id])
-        
-    
-class CategoryDataset(Dataset):
-    def __init__(self,array,def_list):
-        Dataset.__init__(self,array,def_list)
-
-    def get_elements_by_category(self,dim,category):
-        """Returns all elements along input dim belonging to category.
-        Assumes a two-dim category data only!
-        """
-        if type(category)!=list:
-            raise ValueError, "category must be list"
-        gene_ids = []
-        axis_dim = self._dim_num[dim]
-        cat_index = self.extract_index_from_id(category)
-        for ind in cat_index:
-            if axis_dim==0:
-                gene_indx = nonzero(self._data[:,ind])
-            elif axis_dim==1:
-                gene_indx = nonzero(self._data[ind,:])
+        dim_names = ['rows','cols'] 
+        ids = []
+        for axis,n in enumerate(shape):
+            if axis<2:
+                dim_suggestion = dim_names[axis]
            else:
-                ValueError, "Only support for 2-dim data"
-            gene_ids.append(self.extract_id_from_index(dim,gene_index))
-        return gene_ids
-                
-        
-        
-                
+                dim_suggestion = 'dim'
+            while dim_suggestion in all_dims:
+                dim_suggestion = self._suggest_dim_name(dim_suggestion,all_dims) 
+            identifier_creation = [str(axis) + "_" + i for i in map(str,range(n))]
+            ids.append((dim_suggestion,identifier_creation))
+            all_dims.append(dim_suggestion)
+        return ids
+
+    def _set_identifiers(self,identifiers,all_dims):
+        """Creates internal mapping of identifiers structure."""
+        for dim,ids in identifiers:
+            pos_map={}
+            if dim not in self._dims:
+                self._dims.append(dim)
+                all_dims.append(dim)
+            else:
+                raise ValueError, "Dimension names must be unique"
            
-        
-        
+            for pos,id in enumerate(ids):
+                pos_map[id] = pos
+            self._map[dim] = pos_map
+        shape_chk = [len(i) for j,i in identifiers]
+        if shape_chk != list(self.shape):
+            raise ValueError, "Shape input: %s and array: %s mismatch" %(shape_chk,self.shape)

+    def _suggest_dim_name(self,dim_name,all_dims):
+        """Return dim_name with "_<n>" suffixes appended until it is not in all_dims.
+        Suffixes accumulate on collisions: 'rows' -> 'rows_0' -> 'rows_0_1' ..."""
+        suffix = 0
+        while dim_name in all_dims:
+            dim_name, suffix = dim_name + "_" + str(suffix), suffix + 1
+        return dim_name
+        
+    def asarray(self):
+        """Return the numeric array (data) held by this dataset.
+        Raises ValueError if no array has been attached yet."""
+        if self.has_array:
+            return self._array
+        raise ValueError("Dataset is empty")
+
+    def add_array(self,array):
+        """Attaches array (converted with asarray) as the data of the dataset.
+        A one-dim array is first promoted to a two-dim row vector, then
+        checked against the dataset's dims and shape (ValueError on mismatch).
+        """
+        if self.has_array:
+            raise ValueError("Dataset has array")
+        array = atleast_2d(asarray(array)) # promote before checking, as documented
+        if len(self._map)!=len(array.shape):
+            raise ValueError("range(array_dims) and range(dim_names) mismatch")
+        if tuple(self.shape)!=tuple(array.shape): # shape may have been given as a list
+            raise ValueError("Input array must be of similar dimensions as dataset")
+        self._array = array
+        self.has_array = True
+
+    def get_name(self): # name taken from the constructor's name= keyword
+        return self._name
+    def get_all_dims(self): # the all_dims list handed to the constructor
+        return self._all_dims
+
+    def get_identifiers(self):
+        """Return [(dim, ids), ...] with dims in dataset order and the ids
+        of each dim sorted by stored index (dict.keys() order is arbitrary).
+        """
+        ids = []
+        for dim in self._dims:
+            pos_map = self._map[dim]
+            ids.append((dim,sorted(pos_map,key=pos_map.get)))
+        return ids
+
+class CategoryDataset(Dataset):
+    """The category dataset class.
+
+    A dataset for representing class information as binary
+    matrices (0/1-matrices).
+
+    There is support for using a less memory demanding, and
+    fast intersection look-ups by representing the binary matrix as a
+    dictionary in each dimension.
+    """
    
+    def __init__(self,*args,**kwds):
+        Dataset.__init__(self,*args,**kwds) # a bare call always raised: no array and no shape
+        self.has_collection = False
+            
+    def as_array(self):
+        """Returns data as binary matrix (stub: nothing is built or returned yet)"""
+        if not self.has_array and self.has_collection:
+            #build numeric array -- TODO: not implemented
+            pass

+    def as_collection(self,dim):
+        """Returns data as collection along dim (stub: not implemented yet)"""
+        pass
+    
+    def add_collection(self,input_dict):
+        """Adds category data as a collection (stub: not implemented yet).
+
+        A collection is a data structure that contains a dictionary for
+        each pair of dimensions in the dataset, keyed by identifiers; each
+        value is a set of identifiers in the other dimension.
+        """
+        #build category data as double dicts
+        pass
+
+
+class GraphDataset(Dataset):
+    """The graph dataset class.
+
+    A dataset class for representing graphs using an adjacency matrix
+    (i.e. restricted to square, symmetric, signed-integer matrices).
+    
+    If the library NetworkX is installed, the graph is also meant to be
+    representable as a NetworkX.Graph or NetworkX.XGraph (not done in this class yet).
+    """
+    def __init__(self,*args,**kwds):
+        Dataset.__init__(self,*args,**kwds) # "__init" was mangled to _GraphDataset__init
+        self.has_graph = False
+        
 class Selection:
    """Handles selected identifiers along each dimension of a dataset"""
    def __init__(self):