diff --git a/fluents/dataset.py b/fluents/dataset.py
index 09fb58b..4bfe83e 100644
--- a/fluents/dataset.py
+++ b/fluents/dataset.py
@@ -1,4 +1,5 @@
-from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros,empty,sparse
+from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros,empty,sparse,\
+where
 from scipy import sort as array_sort
 from itertools import izip
 import shelve
@@ -6,7 +7,7 @@
 import copy
 import re
 
-class Dataset:
+class Dataset(object):
     """The Dataset base class.
 
     A Dataset is an n-way array with defined string identifiers across
@@ -273,20 +274,20 @@ class CategoryDataset(Dataset):
     def __init__(self, array, identifiers=None, name='C'):
         Dataset.__init__(self, array, identifiers=identifiers, name=name)
 
-    def asspmatrix(self):
+    def as_spmatrix(self):
         if isinstance(self._array, sparse.spmatrix):
             return self._array
         else:
             arr = self.asarray()
             return sparse.csr_matrix(arr.astype('i'))
 
-    def tospmatrix(self):
+    def to_spmatrix(self):
         if isinstance(self._array, sparse.spmatrix):
             self._array = self._array.tocsr()
         else:
             self._array = sparse.scr_matrix(self._array)
 
-    def as_dict_lists(self):
+    def as_dictlists(self):
        """Returns data as dict of identifiers along first dim.
 
        ex: data['gene_1'] = ['map0030','map0010', ...]
@@ -334,73 +335,163 @@ class CategoryDataset(Dataset):
 class GraphDataset(Dataset):
     """The graph dataset class.
 
-    A dataset class for representing graphs using an (weighted)
-    adjacency matrix
-    (restricted to square symmetric matrices)
+    A dataset class for representing graphs. The constructor may use an
+    incidence matrix (possibly sparse) or (if networkx is installed) a
+    networkx.(X)Graph structure.
 
-    If the library NetworkX is installed, there is support for
-    representing the graph as a NetworkX.Graph, or NetworkX.XGraph structure.
+    If the networkx library is installed, there is support for
+    representing the graph as a networkx.Graph, or networkx.XGraph structure.
     """
 
-    def __init__(self, array, identifiers=None, name='A'):
-        Dataset.__init__(self, array=array, identifiers=identifiers, name=name)
+    def __init__(self, input, identifiers=None, name='A', nodepos=None):
+        if isinstance(input, sparse.spmatrix):
+            arr = input
+        else:
+            try:
+                arr = asarray(input)
+            except:
+                raise ValueError("Could not identify input")
+        Dataset.__init__(self, array=arr, identifiers=identifiers, name=name)
         self._graph = None
-        self._pos = None
+        self.nodepos = nodepos
 
-    def asnetworkx(self, nx_type='graph'):
-        dim = self.get_dim_name()[0]
-        ids = self.get_identifiers(dim, sorted=True)
-        adj_mat = self.asarray()
-        G = self._graph_from_adj_matrix(adj_mat, labels=ids)
+    def as_spmatrix(self):
+        if isinstance(self._array, sparse.spmatrix):
+            return self._array
+        else:
+            arr = self.asarray()
+            return sparse.csr_matrix(arr.astype('i'))
+
+    def to_spmatrix(self):
+        if isinstance(self._array, sparse.spmatrix):
+            self._array = self._array.tocsr()
+        else:
+            self._array = sparse.csr_matrix(self._array)
+
+    def asnetworkx(self):
+        if self._graph != None:
+            return self._graph
+        dim0, dim1 = self.get_dim_name()
+        node_ids = self.get_identifiers(dim0, sorted=True)
+        edge_ids = self.get_identifiers(dim1, sorted=True)
+        G, weights = self._graph_from_incidence_matrix(self._array, node_ids=node_ids, edge_ids=edge_ids)
         self._graph = G
         return G
-
-    def _graph_from_adj_matrix(self, A, labels=None):
-        """Creates a networkx graph class from adjacency
-        (possibly weighted) matrix and ordered labels.
-        nx_type = ['graph',['xgraph']]
-        labels = None, results in string-numbered labels
+    def from_networkx(cls, G, node_dim, edge_dim, sp_format=True):
+        """Create graph dataset from networkx graph.
+
+        When G is a Graph/DiGraph, edge identifiers will be created;
+        otherwise (XGraph/XDiGraph) the edge data are assumed to be
+        the edge identifiers.
         """
+        import networkx as nx
+        n = G.number_of_nodes()
+        m = G.number_of_edges()
+
+        if isinstance(G, nx.DiGraph):
+            G = nx.XDiGraph(G)
+            G = G.to_directed()
+        elif isinstance(G, nx.Graph):
+            G = nx.XGraph(G)
+
+        edge_ids = [e[2] for e in G.edges()]
+        node_ids = map(str, G.nodes())
+        n2ind = {}
+        for ind, node in enumerate(node_ids):
+            n2ind[node] = ind
+
+        if sp_format:
+            I = sparse.lil_matrix((n, m))
+        else:
+            I = zeros((n, m), dtype='i')
+
+        for i, (h, t, eid) in enumerate(G.edges()):
+            if eid != None:
+                edge_ids[i] = eid
+            else:
+                edge_ids[i] = 'e_' + str(i)
+            hind = n2ind[str(h)]
+            tind = n2ind[str(t)]
+            I[hind, i] = 1
+            if G.is_directed():
+                I[tind, i] = -1
+            else:
+                I[tind, i] = 1
+        idents = [[node_dim, node_ids], [edge_dim, edge_ids]]
+        if G.name != '':
+            name = G.name
+        else:
+            name = 'A'
+        ds = GraphDataset(I, idents, name)
+        return ds
+
+    from_networkx = classmethod(from_networkx)
+
+    def _incidence2adjacency(self, I):
+        """Incidence to adjacency matrix.
+
+        I*I.T - eye(n)?
+        """
+        raise NotImplementedError
+
+    def _graph_from_incidence_matrix(self, I, node_ids, edge_ids):
+        """Creates a networkx graph from a (possibly weighted)
+        incidence matrix and ordered identifiers.
+
+        node_ids and edge_ids give the node and edge labels, in order.
+        """
         try:
             import networkx as nx
         except:
             print "Failed in import of NetworkX"
             return None
-        m, n = A.shape    # adjacency matrix must be of type that evals to true/false for neigbours
-        if m != n:
-            raise IOError, "Adjacency matrix must be square"
-        if A[A[:,0].nonzero()[0][0],0] == 1: #unweighted graph
-            G = nx.Graph()
-        else:
-            G = nx.XGraph()
+        m, n = I.shape
+        assert(m == len(node_ids))
+        assert(n == len(edge_ids))
+        weights = []
+        directed = False
+        G = nx.XDiGraph(name=self._name)
+        if isinstance(I, sparse.spmatrix):
+            I = I.tocsr()
+        for ename, col in izip(edge_ids, I.T):
+            if isinstance(I, sparse.spmatrix):
+                node_ind = col.indices
+                w1, w2 = col.data
+            else:
+                node_ind = where(col != 0)[0]
+                w1, w2 = col[node_ind]
+            node1 = node_ids[node_ind[0]]
+            node2 = node_ids[node_ind[1]]
+            if w1 < 0: # w1 is tail
+                directed = True
+                assert(w2 > 0 and (w1 + w2) == 0)
+                G.add_edge(node2, node1, ename)
+                weights.append(w2)
+            else: #w2 is tail or graph is undirected
+                assert(w1 > 0)
+                if w2 < 0:
+                    directed = True
+                G.add_edge(node1, node2, ename)
+                weights.append(w1)
+        if not directed:
+            G = G.to_undirected()
+        return G, asarray(weights)
 
-        if labels == None: # if labels not provided mark vertices with numbers
-            labels = [str(i) for i in range(m)]
-
-        for nbrs, head in izip(A, labels):
-            for i, nbr in enumerate(nbrs):
-                if nbr:
-                    tail = labels[i]
-                    if type(G)==nx.XGraph:
-                        G.add_edge(head, tail, nbr)
-                    else:
-                        G.add_edge(head, tail)
-        return G
-
 Dataset._all_dims = set()
 
 class ReverseDict(dict):
-    """
-    A dictionary which can lookup values by key, and keys by value.
+    """A dictionary which can look up values by key, and keys by value.
+
+    All values and keys must be hashable, and unique.
-
-    d = ReverseDict((['a',1],['b',2]))
-    print d['a'] --> 1
-    print d.reverse[1] --> 'a'
+
+    example:
+    >>> d = ReverseDict((['a',1],['b',2]))
+    >>> print d['a'] --> 1
+    >>> print d.reverse[1] --> 'a'
     """
     def __init__(self, *args, **kw):
         dict.__init__(self, *args, **kw)
@@ -436,6 +527,7 @@ class Selection(dict):
     def select(self, axis, labels):
         self[axis] = labels
 
+
 def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True):
     """Writes a dataset in fluents tab separated values (ftsv) form.
 
@@ -471,16 +563,23 @@ def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True):
     fd.write('# type: %s' %type + '\n')
 
     for dim in ds.get_dim_name():
-        print >> fd, "# dimension: %s" % dim,
-        for id in ds.get_identifiers(dim, None, True):
-            print >> fd, id,
-        print >> fd
-
-    print >> fd, "# name: %s" % ds.get_name()
+        fd.write("# dimension: %s" % dim)
+        for ident in ds.get_identifiers(dim, sorted=True):
+            fd.write(" " + ident)
+        fd.write("\n")
+    fd.write("# name: %s" % ds.get_name() + '\n')
+
+    # xy-node-positions
+    if getattr(ds, 'nodepos', None) != None:
+        fd.write("# nodepos:")
+        node_dim = ds.get_dim_name(0)
+        for ident in ds.get_identifiers(node_dim, sorted=True):
+            fd.write(" %s,%s" % ds.nodepos[ident])
+        fd.write("\n")
 
     # Write data
-    if hasattr(ds, "asspmatrix") and sp_format == True:
-        m = ds.asspmatrix()
+    if hasattr(ds, "as_spmatrix") and sp_format == True:
+        m = ds.as_spmatrix()
     else:
         m = ds.asarray()
     if isinstance(m, sparse.spmatrix):
@@ -491,6 +590,107 @@ def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True):
     if opened:
         fd.close()
 
+def read_ftsv(fd, sep=None):
+    """Read a dataset in fluents tab separated values (ftsv) form and return it.
+
+    @param fd: An open file descriptor.
+    @return: A Dataset, CategoryDataset or GraphDataset depending on the information
+    read.
+    """
+    opened = False
+    if isinstance(fd, str):
+        fd = open(fd)
+        opened = True
+
+    split_re = re.compile('^#\s*(\w+)\s*:\s*(.+)')
+    dimensions = []
+    identifiers = {}
+    type = 'dataset'
+    name = 'Unnamed dataset'
+    sp_format = False
+    nodepos = None
+    # graphtype = 'graph'
+
+    # Read header lines from file.
+    line = fd.readline()
+    while line:
+        m = split_re.match(line)
+        if m:
+            key, val = m.groups()
+
+            # The line is of the form:
+            # dimension: dimname id1 id2 id3 ...
+            if key == 'dimension':
+                values = [v.strip() for v in val.split(' ')]
+                dimensions.append(values[0])
+                identifiers[values[0]] = values[1:]
+
+            # Read type of dataset.
+            # Should be dataset, category, or network
+            elif key == 'type':
+                type = val
+
+            elif key == 'name':
+                name = val
+
+            # storage format
+            # if sp_format is True then use coordinate triplets
+            elif key == 'sp_format':
+                if val in ['False', 'false', '0', 'F', 'f',]:
+                    sp_format = False
+                elif val in ['True', 'true', '1', 'T', 't']:
+                    sp_format = True
+                else:
+                    raise ValueError("sp_format: %s not valid " % val)
+
+            elif key == 'nodepos':
+                node_dim = dimensions[0]
+                idents = identifiers[node_dim]
+                nodepos = {}
+                xys = val.split(" ")
+                for node_id, xy in zip(idents, xys):
+                    x, y = map(float, xy.split(","))
+                    nodepos[node_id] = (x, y)
+
+        else:
+            break
+        line = fd.readline()
+
+    # Dimensions in the form [(dim1, [id1, id2, id3, ...]), ...]
+ dims = [(x, identifiers[x]) for x in dimensions] + dim_lengths = [len(identifiers[x]) for x in dimensions] + + # Create matrix and assign element reader + if type == 'category': + if sp_format: + matrix = sparse.lil_matrix(dim_lengths) + else: + matrix = empty(dim_lengths, dtype='i') + else: + if sp_format: + matrix = sparse.lil_matrix(dim_lengths) + else: + matrix = empty(dim_lengths) + + if sp_format: + matrix = _read_sparse_elements(fd, matrix) + else: + matrix = _read_elements(fd, matrix) + + + # Create dataset of specified type + if type == 'category': + ds = CategoryDataset(matrix, dims, name) + elif type == 'network': + ds = GraphDataset(matrix, dims, name=name, nodepos=nodepos) + else: + ds = Dataset(matrix, dims, name) + + if opened: + fd.close() + + return ds + def _write_sparse_elements(fd, arr, fmt='%d', sep=None): """ Sparse coordinate format.""" fd.write('# sp_format: True\n\n') @@ -530,97 +730,3 @@ def _read_sparse_elements(fd, arr, sep=None): line = fd.readline() return arr.tocsr() -def read_ftsv(fd, sep=None): - """Read a dataset in fluents tab separated values (ftsv) form and return it. - - @param fd: An open file descriptor. - @return: A Dataset, CategoryDataset or GraphDataset depending on the information - read. - """ - opened = False - if isinstance(fd, str): - fd = open(fd) - opened = True - - split_re = re.compile('^#\s*(\w+)\s*:\s*(.+)') - dimensions = [] - identifiers = {} - type = 'dataset' - name = 'Unnamed dataset' - sp_format = False - # graphtype = 'graph' - - # Read header lines from file. - line = fd.readline() - while line: - m = split_re.match(line) - if m: - key, val = m.groups() - - # The line is on the form; - # dimension: dimname id1 id2 id3 ... - if key == 'dimension': - values = [v.strip() for v in val.split(' ')] - dimensions.append(values[0]) - identifiers[values[0]] = values[1:] - - # Read type of dataset. - # Should be dataset, category, or network - elif key == 'type': - type = val - - elif key == 'name': - name = val - - # storage format - # if sp_format is True then use coordinate triplets - elif key == 'sp_format': - if val in ['False', 'false', '0', 'F', 'f',]: - sp_format = False - elif val in ['True', 'true', '1', 'T', 't']: - sp_format = True - else: - raise ValueError("sp_format: %s not valid " %sp_format) - - # elif key == 'graphtype': - # graphtype = val - - else: - break - line = fd.readline() - - # Dimensions in the form [(dim1, [id1, id2, id3 ..) ...] - dims = [(x, identifiers[x]) for x in dimensions] - dim_lengths = [len(identifiers[x]) for x in dimensions] - - # Create matrix and assign element reader - if type == 'category': - if sp_format: - matrix = sparse.lil_matrix(dim_lengths) - else: - matrix = empty(dim_lengths, dtype='i') - elif type == 'network': - matrix = empty(dim_lengths) - else: - matrix = empty(dim_lengths) - - if sp_format: - matrix = _read_sparse_elements(fd, matrix) - else: - matrix = _read_elements(fd, matrix) - - - # Create dataset of specified type - if type == 'category': - ds = CategoryDataset(matrix, dims, name) - elif type == 'network': - ds = GraphDataset(matrix, dims, name) - else: - ds = Dataset(matrix, dims, name) - - if opened: - fd.close() - - return ds - -
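
A minimal usage sketch of the incidence-based GraphDataset and the ftsv round-trip added in this patch. The toy incidence matrix, the dimension names ('nodes', 'edges'), the node positions and the 'graph.ftsv' path are made up for illustration, and the snippet assumes the module is importable as fluents.dataset; columns follow the convention used by _graph_from_incidence_matrix above (+1 at the source node's row and -1 at the target node's row for a directed edge).

# Illustrative sketch only, not part of the patch.
from scipy import asarray
from fluents import dataset

# 3 nodes x 2 edges incidence matrix: e1 is n1 -> n2, e2 is n2 -> n3.
I = asarray([[ 1,  0],
             [-1,  1],
             [ 0, -1]])
idents = [('nodes', ['n1', 'n2', 'n3']), ('edges', ['e1', 'e2'])]
pos = {'n1': (0.0, 0.0), 'n2': (1.0, 0.0), 'n3': (1.0, 1.0)}
ds = dataset.GraphDataset(I, idents, name='toy', nodepos=pos)

# Round-trip through the ftsv format (dense element format here).
dataset.write_ftsv('graph.ftsv', ds, sp_format=False)
ds2 = dataset.read_ftsv('graph.ftsv')       # read back as a GraphDataset
G = ds2.asnetworkx()                        # networkx.XDiGraph (requires networkx)
print ds2.get_name(), ds2.get_dim_name()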