From 676ea4e0b99504e190c394ef981436e4a0ff02ee Mon Sep 17 00:00:00 2001 From: flatberg Date: Tue, 1 Aug 2006 09:45:53 +0000 Subject: [PATCH] removed shape in constructor, added all_dims to dataset base class, updated get_identifiers and get_indices Denne linjen og de som er under vil bli ignorert-- M dataset.py --- system/dataset.py | 172 ++++++++++++++++++++++++++-------------------- 1 file changed, 99 insertions(+), 73 deletions(-) diff --git a/system/dataset.py b/system/dataset.py index d4918be..6254e0d 100644 --- a/system/dataset.py +++ b/system/dataset.py @@ -1,5 +1,6 @@ -from scipy import atleast_2d,asarray,ArrayType - +from scipy import atleast_2d,asarray,ArrayType,shape +from scipy import sort as array_sort +from itertools import izip class Dataset: """The Dataset base class. @@ -32,45 +33,26 @@ class Dataset: data = Dataset(rand(10,20)) (generates dims and ids (no links)) """ - def __init__(self,array=None,identifiers=None,shape=None,all_dims=[],**kwds): - self._name = kwds.get("name","Unnamed data") + def __init__(self,array,identifiers=None,name='Unnamed dataset'): self._dims = [] #existing dimensions in this dataset self._map = {} # internal mapping for dataset: identifier <--> index - self.has_array = False - self.shape = None - - if array==None: - if shape == None: - raise ValueError, "Must define shape if array is None" - else: - self.shape = shape - if identifiers!=None: - self._set_identifiers(identifiers,all_dims) - else: - ids = self._create_identifiers(shape,all_dims) - self._set_identifiers(ids,all_dims) - elif isinstance(array,ArrayType): + self._name = name + if isinstance(array,ArrayType): array = atleast_2d(asarray(array)) self.shape = array.shape - if shape != None: - if self.shape!=shape: - raise ValueError, "Differing in array and provided. %s != %s" % (self.shape, shape) if identifiers!=None: - self._set_identifiers(identifiers,all_dims) + self._set_identifiers(identifiers,self._all_dims) else: - ids = self._create_identifiers(self.shape,all_dims) - self._set_identifiers(ids,all_dims) + ids = self._create_identifiers(self.shape,self._all_dims) + self._set_identifiers(ids,self._all_dims) self._array = array - self.has_array = True else: - raise ValueError, "array input must be of ArrayType or None" - - self._all_dims = all_dims + raise ValueError, "Array input must be of ArrayType" def __str__(self): - return self._name + ":" + self._dims.__str__() + return self._name + ":\n" + "Dim names: " + self._dims.__str__() def __iter__(self): """Returns an iterator over dimensions of dataset.""" @@ -100,11 +82,10 @@ class Dataset: dim_suggestion = dim_names[axis] else: dim_suggestion = 'dim' - while dim_suggestion in all_dims: - dim_suggestion = self._suggest_dim_name(dim_suggestion,all_dims) + dim_suggestion = self._suggest_dim_name(dim_suggestion,all_dims) identifier_creation = [str(axis) + "_" + i for i in map(str,range(n))] ids.append((dim_suggestion,identifier_creation)) - all_dims.append(dim_suggestion) + all_dims.add(dim_suggestion) return ids def _set_identifiers(self,identifiers,all_dims): @@ -113,24 +94,22 @@ class Dataset: pos_map={} if dim not in self._dims: self._dims.append(dim) - all_dims.append(dim) + all_dims.add(dim) else: - raise ValueError, "Dimension names must be unique" + raise ValueError, "Dimension names must be unique whitin dataset" for pos,id in enumerate(ids): pos_map[id] = pos self._map[dim] = pos_map - shape_chk = [len(i) for j,i in identifiers] - if shape_chk != list(self.shape): - raise ValueError, "Shape input: %s and array: %s mismatch" %(shape_chk,self.shape) - + def _suggest_dim_name(self,dim_name,all_dims): """Suggests a unique name for dim and returns it""" c = 0 - while dim_name in all_dims: - dim_name = dim_name + "_" + str(c) + new_name = dim_name + while new_name in all_dims: + new_name = dim_name + "_" + str(c) c+=1 - return dim_name + return new_name def asarray(self): """Returns the numeric array (data) of dataset""" @@ -144,15 +123,9 @@ class Dataset: A one-dim array is transformed to a two-dim array (row-vector) """ - if self.has_array: - raise ValueError, "Dataset has array" - else: - if (len(self._map)!=len(array.shape)): - raise ValueError, "range(array_dims) and range(dim_names) mismatch" - if self.shape!=array.shape: - raise ValueError, "Input array must be of similar dimensions as dataset" - self._array = atleast_2d(asarray(array)) - self.has_array = True + if self.shape!=array.shape: + raise ValueError, "Input array must be of similar dimensions as dataset" + self._array = atleast_2d(asarray(array)) def get_name(self): """Returns dataset name""" @@ -162,34 +135,50 @@ class Dataset: """Returns all dimensions in project""" return self._all_dims - def get_dim_names(self): - """Returns dim names""" - return [dim for dim in self._dims] + def get_dim_name(self,axis=None): + """Returns dim name for an axis, if no axis is provided it returns a list of dims""" + if type(axis)==int: + return self._dims[axis] + else: + return [dim for dim in self] - def get_identifiers(self, dim, indices=None): - """Returns identifiers along dim, sorted by position (index). + def get_identifiers(self, dim, indices=None,sorted=True): + """Returns identifiers along dim, sorted by position (index) is optional. You can optionally provide a list of indices to get only the identifiers of a given position. + + Identifiers are the unique names (strings) for a variable in a given dim. + Index (Indices) are the Identifiers position in a matrix in a given dim. """ - items = self._map[dim].items() - backitems=[ [v[1],v[0]] for v in items] - backitems.sort() - sorted_ids=[ backitems[i][1] for i in range(0,len(backitems))] - - # we use id as scipy-arrays return a new array on boolean - # operations - if id(indices) != id(None): - return [sorted_ids[index] for index in indices] + if sorted==True: + items = self._map[dim].items() + backitems = [ [v[1],v[0]] for v in items] + backitems.sort() + ids = [ backitems[i][1] for i in range(0,len(backitems))] + else: - return sorted_ids + ids = self._map[dim].keys() - def get_indices(self, dim, idents): - """Get indices for identifiers along dimension.""" - reverse = {} - for key, value in self._map[dim].items(): - reverse[value] = key - return [self._map[dim][key] for key in idents] + if indices != None: + ids = [self._map[index] for index in indices] + + return ids + + + def get_indices(self, dim, idents=None): + """Returns indices for identifiers along dimension. + + You can optionally provide a list of identifiers to retrieve a index subset. + + + Identifiers are the unique names (strings) for a variable in a given dim. + Index (Indices) are the Identifiers position in a matrix in a given dim.""" + if idents==None: + index = array_sort(self._map[dim].values()) + else: + index = [self.map[dim][key] for key in idents] + return asarray(index) class CategoryDataset(Dataset): """The category dataset class. @@ -236,10 +225,47 @@ class GraphDataset(Dataset): If the library NetworkX is installed, there is support for representing the graph as a NetworkX.Graph, or NetworkX.XGraph structure. """ - def __init__(self): - Dataset.__init(self) + def __init__(self,array=None,identifiers=None,shape=None,all_dims=[],**kwds): + Dataset.__init__(self,array=array,identifiers=identifiers,name='A') self.has_graph = False + def asnetworkx(self,nx_type='graph'): + dim = self.get_dim_names()[0] + ids = self.get_identifiers(dim) + adj_mat = self.asarray() + G = self._graph_from_adj_matrix(adj_mat,labels=ids) + self.has_graph = True + return G + + def _graph_from_adj_matrix(self,A,labels=None,nx_type='graph'): + """Creates a networkx graph class from adjacency matrix and ordered labels. + nx_type = ['graph',['xgraph']] + labels = None, results in string-numbered labels + + """ + import networkx as nx + m,n = shape(A)# adjacency matrix must be of type that evals to true/false for neigbours + if m!=n: + raise IOError, "Adjacency matrix must be square" + if nx_type=='graph': + G = nx.Graph() + elif nx_type=='x_graph': + G = nx.XGraph() + else: + raise IOError, "Unknown graph type: %s" %nx_type + + if labels==None: # if labels not provided mark vertices with numbers + labels = [str(i) for i in range(m)] + + + for nbrs,head in izip(A,labels): + for i,nbr in enumerate(nbrs): + if nbr: + tail = labels[i] + G.add_edge(head,tail) + return G +Dataset._all_dims=set() + class Selection: """Handles selected identifiers along each dimension of a dataset""" def __init__(self):