diff --git a/fluents/dataset.py b/fluents/dataset.py index 488f97f..d1db1d1 100644 --- a/fluents/dataset.py +++ b/fluents/dataset.py @@ -36,28 +36,30 @@ class Dataset: data = Dataset(rand(10,20)) (generates dims and ids (no links)) """ - def __init__(self,array,identifiers=None,name='Unnamed dataset'): + def __init__(self, array, identifiers=None, name='Unnamed dataset'): self._dims = [] #existing dimensions in this dataset self._map = {} # internal mapping for dataset: identifier <--> index self._name = name self._identifiers = identifiers self._type = 'n' - try: - array = atleast_2d(asarray(array)) - except: - print "Cant cast array as numpy-array" - return - # vectors are column vectors - if array.shape[0]==1: - array = array.T - self.shape = array.shape + if len(array.shape)==1: + array = atleast_2d(asarray(array)) + # vectors are column vectors + if array.shape[0]==1: + array = array.T + self.shape = array.shape if identifiers!=None: - self._set_identifiers(identifiers,self._all_dims) + identifier_shape = [len(i[1]) for i in identifiers] + if len(identifier_shape)!=len(self.shape): + raise ValueError, "Identifier list length must equal array dims" + for ni, na in zip(identifier_shape, self.shape): + if ni!=na: + raise ValueError, "identifier-array mismatch in %s: (idents: %s, array: %s)" %(self._name, ni, na) + self._set_identifiers(identifiers, self._all_dims) else: - self._identifiers = self._create_identifiers(self.shape,self._all_dims) - self._set_identifiers(self._identifiers,self._all_dims) - + self._identifiers = self._create_identifiers(self.shape, self._all_dims) + self._set_identifiers(self._identifiers, self._all_dims) self._array = array def __iter__(self): @@ -94,17 +96,16 @@ class Dataset: all_dims.add(dim_suggestion) return ids - def _set_identifiers(self,identifiers,all_dims): + def _set_identifiers(self, identifiers, all_dims): """Creates internal mapping of identifiers structure.""" - for dim,ids in identifiers: + for dim, ids in identifiers: pos_map = ReverseDict() if dim not in self._dims: self._dims.append(dim) all_dims.add(dim) else: raise ValueError, "Dimension names must be unique whitin dataset" - - for pos,id in enumerate(ids): + for pos, id in enumerate(ids): pos_map[id] = pos self._map[dim] = pos_map @@ -121,11 +122,10 @@ class Dataset: """Returns the numeric array (data) of dataset""" return self._array - def add_array(self,array): + def add_array(self, array): """Adds array as an ArrayType object. A one-dim array is transformed to a two-dim array (row-vector) """ - if self.shape!=array.shape: raise ValueError, "Input array must be of similar dimensions as dataset" self._array = atleast_2d(asarray(array)) @@ -138,7 +138,7 @@ class Dataset: """Returns all dimensions in project""" return self._all_dims - def get_dim_name(self,axis=None): + def get_dim_name(self, axis=None): """Returns dim name for an axis, if no axis is provided it returns a list of dims""" if type(axis)==int: @@ -178,7 +178,6 @@ class Dataset: You can optionally provide a list of identifiers to retrieve a index subset. - Identifiers are the unique names (strings) for a variable in a given dim. Index (Indices) are the Identifiers position in a matrix in a given dim. If none of the input identifiers are @@ -218,8 +217,8 @@ class CategoryDataset(Dataset): . """ - def __init__(self,array,identifiers=None,name='C'): - Dataset.__init__(self,array,identifiers=identifiers,name=name) + def __init__(self, array, identifiers=None, name='C'): + Dataset.__init__(self, array, identifiers=identifiers, name=name) self.has_dictlists = False self._type = 'c' @@ -229,7 +228,7 @@ class CategoryDataset(Dataset): ex: data['gene_id'] = ['map0030','map0010', ...] """ data={} - for name,ind in self._map[self.get_dim_name(0)].items(): + for name, ind in self._map[self.get_dim_name(0)].items(): data[name] = self.get_identifiers(self.get_dim_name(1), list(self._array[ind,:].nonzero())) self._dictlists = data @@ -240,7 +239,7 @@ class CategoryDataset(Dataset): """Returns data as a list of Selection objects. """ ret_list = [] - for cat_name,ind in self._map[self.get_dim_name(1)].items(): + for cat_name, ind in self._map[self.get_dim_name(1)].items(): ids = self.get_identifiers(self.get_dim_name(0), self._array[:,ind].nonzero()[0]) selection = Selection(cat_name) @@ -254,26 +253,26 @@ class GraphDataset(Dataset): A dataset class for representing graphs using an (weighted) adjacency matrix - (aka. restricted to square symmetric matrices) + (restricted to square symmetric matrices) If the library NetworkX is installed, there is support for representing the graph as a NetworkX.Graph, or NetworkX.XGraph structure. """ - def __init__(self,array=None,identifiers=None,shape=None,all_dims=[],**kwds): - Dataset.__init__(self,array=array,identifiers=identifiers,name='A') + def __init__(self, array=None, identifiers=None, shape=None, all_dims=[],**kwds): + Dataset.__init__(self, array=array, identifiers=identifiers, name='A') self._graph = None self._type = 'g' - def asnetworkx(self,nx_type='graph'): + def asnetworkx(self, nx_type='graph'): dim = self.get_dim_name()[0] - ids = self.get_identifiers(dim,sorted=True) + ids = self.get_identifiers(dim, sorted=True) adj_mat = self.asarray() - G = self._graph_from_adj_matrix(adj_mat,labels=ids) + G = self._graph_from_adj_matrix(adj_mat, labels=ids) self._graph = G return G - def _graph_from_adj_matrix(self,A,labels=None): + def _graph_from_adj_matrix(self, A, labels=None): """Creates a networkx graph class from adjacency (possibly weighted) matrix and ordered labels. @@ -286,7 +285,7 @@ class GraphDataset(Dataset): except: print "Failed in import of NetworkX" return - m,n = A.shape# adjacency matrix must be of type that evals to true/false for neigbours + m, n = A.shape# adjacency matrix must be of type that evals to true/false for neigbours if m!=n: raise IOError, "Adjacency matrix must be square" @@ -298,17 +297,18 @@ class GraphDataset(Dataset): if labels==None: # if labels not provided mark vertices with numbers labels = [str(i) for i in range(m)] - for nbrs,head in izip(A,labels): - for i,nbr in enumerate(nbrs): + for nbrs, head in izip(A, labels): + for i, nbr in enumerate(nbrs): if nbr: tail = labels[i] if type(G)==nx.XGraph: - G.add_edge(head,tail,nbr) + G.add_edge(head, tail, nbr) else: - G.add_edge(head,tail) + G.add_edge(head, tail) return G -Dataset._all_dims=set() +Dataset._all_dims = set() + class ReverseDict(dict): """ @@ -336,30 +336,34 @@ def to_file(filepath,dataset,name=None): """ if not name: name = dataset._name - data = shelve.open(filepath,flag='c',protocol=2) + data = shelve.open(filepath, flag='c', protocol=2) if data: #we have an append names = data.keys() if name in names: print "Data with name: %s overwritten" %dataset._name - sub_data = {'array':dataset._array,'idents':dataset._identifiers,'type':dataset._type} + + sub_data = {'array':dataset._array, + 'idents':dataset._identifiers, + 'type':dataset._type} data[name] = sub_data data.close() def from_file(filepath): - """Read dataset from file """ - data = shelve.open(filepath,flag='r') + """Read dataset(s) from file """ + data = shelve.open(filepath, flag='r') out_data = [] for name in data.keys(): sub_data = data[name] if sub_data['type']=='c': - out_data.append(CategoryDataset(sub_data['array'],identifiers=sub_data['idents'],name=name)) + out_data.append(CategoryDataset(sub_data['array'], identifiers=sub_data['idents'], name=name)) elif sub_data['type']=='g': - out_data.append(GraphDataset(sub_data['array'],identifiers=sub_data['idents'],name=name)) + out_data.append(GraphDataset(sub_data['array'], identifiers=sub_data['idents'], name=name)) else: - out_data.append(Dataset(sub_data['array'],identifiers=sub_data['idents'],name=name)) + out_data.append(Dataset(sub_data['array'], identifiers=sub_data['idents'], name=name)) return out_data - + + class Selection(dict): """Handles selected identifiers along each dimension of a dataset"""