2006-04-25 11:53:35 +02:00
|
|
|
from system import logger
|
2006-04-24 11:53:07 +02:00
|
|
|
from scipy import atleast_2d,asarray,ArrayType
|
2006-04-17 00:57:50 +02:00
|
|
|
|
|
|
|
|
|
|
|
class Dataset:
    """The Dataset base class.

    A Dataset is an n-way array with defined string identifiers across
    all dimensions.

    example of use:

    ---
    dim_name_rows = 'rows'
    names_rows = ('row_a','row_b')
    ids_1 = [dim_name_rows, names_rows]

    dim_name_cols = 'cols'
    names_cols = ('col_a','col_b','col_c','col_d')
    ids_2 = [dim_name_cols, names_cols]

    Array_X = rand(2,4)
    data = Dataset(Array_X,(ids_1,ids_2),name="Testing")

    dim_names = [dim for dim in data]

    column_identifiers = [id for id in data['cols'].keys()]
    column_index = [index for index in data['cols'].values()]

    'cols' in data -> True

    ---

    data = Dataset(rand(10,20)) (generates dims and ids (no links))
    """

    def __init__(self, array=None, identifiers=None, shape=None,
                 all_dims=None, **kwds):
        """Create a dataset from an array and/or a shape.

        array       -- an ArrayType instance, or None for a dataset that
                       only carries identifiers (shape is then required).
                       Arrays are promoted to at least two dimensions.
        identifiers -- sequence of (dim_name, ids) pairs, one per axis.
                       When None, dimension names and identifiers are
                       generated from the shape.
        shape       -- expected shape; must agree with the array's shape
                       when both are given.
        all_dims    -- list of dimension names already in use. It is
                       modified in place: the names of this dataset are
                       added to it. When omitted, a fresh list private to
                       this dataset is used.
        name        -- (keyword) dataset name, default "Unnamed data".

        Raises ValueError on missing shape, unsupported array type, or
        any shape/identifier mismatch.
        """
        self._name = kwds.get("name", "Unnamed data")
        self._dims = []      # existing dimensions in this dataset
        self._map = {}       # internal mapping: dim -> {identifier: index}
        self._array = None   # defined even when no array is attached
        self.has_array = False
        self.shape = None

        # A literal [] default would be shared (and mutated) by every
        # dataset created without an explicit all_dims.
        if all_dims is None:
            all_dims = []

        # Identity tests: '== None' on an array compares element-wise.
        if array is None:
            if shape is None:
                raise ValueError("Must define shape if array is None")
            self.shape = tuple(shape)
        elif isinstance(array, ArrayType):
            array = atleast_2d(asarray(array))
            self.shape = array.shape
            # tuple() so that a list-valued shape can match the array.
            if shape is not None and self.shape != tuple(shape):
                raise ValueError("Dataset shape %s and input shape %s mismatch"
                                 % (self.shape, tuple(shape)))
        else:
            raise ValueError("array input must be of ArrayType or None")

        if identifiers is None:
            identifiers = self._create_identifiers(self.shape, all_dims)
        self._set_identifiers(identifiers, all_dims)

        if array is not None:
            self._array = array
            self.has_array = True

        self._all_dims = all_dims

    def __str__(self):
        """Returns the dataset name."""
        return self._name

    # The method was originally (mis)named __str__self; keep that name
    # available in case something calls it explicitly.
    __str__self = __str__

    def __iter__(self):
        """Returns an iterator over dimensions of dataset."""
        return iter(self._dims)

    def __contains__(self, dim):
        """Returns True if dim is a dimension name in dataset."""
        return dim in self._map

    def __len__(self):
        """Returns the number of dimensions in the dataset"""
        return len(self._map)

    def __getitem__(self, dim):
        """Return the identifiers along the dimension dim.

        The result is the internal {identifier: index} dictionary.
        Raises KeyError if dim is not a dimension of this dataset.
        """
        return self._map[dim]

    def _create_identifiers(self, shape, all_dims):
        """Creates dimension names and identifier names, and returns
        identifiers.

        The first two axes are called 'rows' and 'cols', later axes
        'dim'; a name already present in all_dims is made unique with
        _suggest_dim_name. Every chosen name is appended to all_dims so
        that later axes cannot pick it again. Identifiers along an axis
        are named '<axis>_<position>'.
        """
        dim_names = ['rows', 'cols']
        ids = []
        for axis, n in enumerate(shape):
            if axis < 2:
                dim_suggestion = dim_names[axis]
            else:
                dim_suggestion = 'dim'
            dim_suggestion = self._suggest_dim_name(dim_suggestion, all_dims)
            identifier_creation = ["%s_%s" % (axis, i) for i in range(n)]
            ids.append((dim_suggestion, identifier_creation))
            all_dims.append(dim_suggestion)
        return ids

    def _set_identifiers(self, identifiers, all_dims):
        """Creates internal mapping of identifiers structure.

        identifiers is a sequence of (dim_name, ids) pairs. Everything is
        validated before any state is touched, so a rejected input leaves
        both this dataset and all_dims unchanged.

        Raises ValueError if the identifier counts do not match
        self.shape or if a dimension name occurs twice.
        """
        identifiers = [(dim, list(ids)) for dim, ids in identifiers]

        shape_chk = [len(ids) for dim, ids in identifiers]
        if shape_chk != list(self.shape):
            raise ValueError("Shape input: %s and array: %s mismatch"
                             % (shape_chk, self.shape))

        seen = list(self._dims)
        for dim, ids in identifiers:
            if dim in seen:
                raise ValueError("Dimension names must be unique")
            seen.append(dim)

        for dim, ids in identifiers:
            self._dims.append(dim)
            # Generated names were already registered by
            # _create_identifiers; do not list them twice.
            if dim not in all_dims:
                all_dims.append(dim)
            self._map[dim] = dict([(ident, pos)
                                   for pos, ident in enumerate(ids)])

    def _suggest_dim_name(self, dim_name, all_dims):
        """Suggests a unique name for dim and returns it

        dim_name is returned unchanged when it is not in all_dims;
        otherwise '_0', '_1', ... are appended (cumulatively) until the
        result is unused.
        """
        c = 0
        while dim_name in all_dims:
            dim_name = dim_name + "_" + str(c)
            c += 1
        return dim_name

    def asarray(self):
        """Returns the numeric array (data) of dataset

        Raises ValueError if the dataset has no array.
        """
        if not self.has_array:
            raise ValueError("Dataset is empty")
        return self._array

    def add_array(self, array):
        """Adds array as an ArrayType object.

        A one-dim array is transformed to a two-dim array (row-vector)

        Raises ValueError if the dataset already has an array or if the
        (promoted) array does not match the dataset's dimensions.
        """
        if self.has_array:
            raise ValueError("Dataset has array")

        # Promote first: the checks below must see the shape that will
        # actually be stored, otherwise a 1-D input can never pass.
        array = atleast_2d(asarray(array))
        if len(self._map) != len(array.shape):
            raise ValueError("range(array_dims) and range(dim_names) mismatch")
        if tuple(self.shape) != array.shape:
            raise ValueError("Input array must be of similar dimensions as dataset")
        self._array = array
        self.has_array = True

    def get_name(self):
        """Returns dataset name"""
        return self._name

    def get_matrix(self):
        """Returns internal numeric matrix for dataset.

        Returns None when no array has been attached.
        """
        return self._array

    def get_all_dims(self):
        """Returns all dimensions in project"""
        return self._all_dims

    def get_dim_names(self):
        """Returns dim names"""
        return list(self._dims)

    def get_identifiers(self, dim, indices=None):
        """Returns identifiers along dim, sorted by position (index).

        You can optionally provide a list of indices to get only the
        identifiers of a given position.
        """
        pos_map = self._map[dim]
        sorted_ids = sorted(pos_map, key=pos_map.get)

        if indices is not None:
            return [sorted_ids[index] for index in indices]
        return sorted_ids

    def get_indices(self, dim, idents):
        """Get indices for identifiers along dimension.

        Raises KeyError for an unknown dimension or identifier.
        """
        pos_map = self._map[dim]
        return [pos_map[key] for key in idents]
|
2006-04-24 13:23:30 +02:00
|
|
|
|
2006-04-24 11:53:07 +02:00
|
|
|
|
|
|
|
class CategoryDataset(Dataset):
    """The category dataset class.

    A dataset for representing class information as binary
    matrices (0/1-matrices).

    There is support for using a less memory demanding, and
    fast intersection look-ups by representing the binary matrix as a
    dictionary in each dimension.
    """

    def __init__(self, array=None, identifiers=None, shape=None,
                 all_dims=None, **kwds):
        """Create a category dataset.

        The arguments are forwarded unchanged to Dataset.__init__, which
        requires at least an array or a shape (it raises ValueError
        otherwise).
        """
        # Hand the base class a real list: it appends to all_dims.
        if all_dims is None:
            all_dims = []
        Dataset.__init__(self, array, identifiers, shape, all_dims, **kwds)
        self.has_collection = False

    def as_array(self):
        """Returns data as binary matrix

        Not implemented yet: currently always returns None.
        """
        if not self.has_array and self.has_collection:
            # TODO: build numeric array from the collection
            pass

    def as_collection(self, dim):
        """Returns data as collection along dim

        Not implemented yet: currently always returns None.
        """
        pass

    def add_collection(self, input_dict):
        """Adds a category data as collection.

        A collection is a datastructure that contains a dictionary for
        each pair of dimension in dataset, keyed by identifiers and
        values is a set of identifiers in the other dimension

        Not implemented yet: input_dict is currently ignored.
        """
        # TODO: build category data as double dicts
        pass
|
|
|
|
|
2006-04-19 12:37:44 +02:00
|
|
|
|
2006-04-24 11:53:07 +02:00
|
|
|
class GraphDataset(Dataset):
    """The graph dataset class.

    A dataset class for representing graphs using an adjacency matrix
    (aka. restricted to square symmetric signed integers matrices)

    If the library NetworkX is installed, there is support for
    representing the graph as a NetworkX.Graph, or NetworkX.XGraph structure.
    """

    def __init__(self, array=None, identifiers=None, shape=None,
                 all_dims=None, **kwds):
        """Create a graph dataset.

        The arguments are forwarded unchanged to Dataset.__init__, which
        requires at least an array or a shape (it raises ValueError
        otherwise).
        """
        # Hand the base class a real list: it appends to all_dims.
        if all_dims is None:
            all_dims = []
        # Must be spelled __init__: a name like __init (no trailing
        # underscores) is name-mangled inside a class body and does not
        # exist on Dataset.
        Dataset.__init__(self, array, identifiers, shape, all_dims, **kwds)
        self.has_graph = False
|
|
|
|
|
2006-04-17 00:57:50 +02:00
|
|
|
class Selection:
    """Handles selected identifiers along each dimension of a dataset"""

    def __init__(self):
        # The selection starts out as an empty dictionary.
        self.current_selection = dict()
|