Sparse network support and nodepos read/write

2008-01-08 00:43:56 +00:00
parent bf29661af9
commit ab9c1ec84b
1 changed files with 258 additions and 152 deletions
--- a/fluents/dataset.py
+++ b/fluents/dataset.py
@@ -1,4 +1,5 @@
-from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros,empty,sparse
+from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros,empty,sparse,\
 where
 from scipy import sort as array_sort
 from itertools import izip
 import shelve
@@ -6,7 +7,7 @@ import copy
 import re
-class Dataset:
+class Dataset(object):
    """The Dataset base class.
    A Dataset is an n-way array with defined string identifiers across
@@ -273,20 +274,20 @@ class CategoryDataset(Dataset):
    def __init__(self, array, identifiers=None, name='C'):
        """Initialise the category dataset.

        All three arguments are forwarded unchanged to Dataset.__init__;
        the only thing set here is the default name 'C'.
        """
        Dataset.__init__(self, array, identifiers=identifiers, name=name)
-    def asspmatrix(self):
+    def as_spmatrix(self):
        if isinstance(self._array, sparse.spmatrix):
            return self._array
        else:
            arr = self.asarray()
            return sparse.csr_matrix(arr.astype('i'))
-    def tospmatrix(self):
+    def to_spmatrix(self):
        if isinstance(self._array, sparse.spmatrix):
            self._array = self._array.tocsr()
        else:
            self._array = sparse.scr_matrix(self._array)
-    def as_dict_lists(self):
+    def as_dictlists(self):
        """Returns data as dict of identifiers along first dim.
        ex: data['gene_1'] = ['map0030','map0010', ...]
@@ -334,73 +335,163 @@ class CategoryDataset(Dataset):
 class GraphDataset(Dataset):
    """The graph dataset class.
-    A dataset class for representing graphs using an (weighted)
+    A dataset class for representing graphs. The constructor may use an 
-    adjacency matrix
+    incidence matrix (possibly sparse) or (if networkx installed) a 
-    (restricted to square symmetric matrices)
+    networkx.(X)Graph structure.
-    If the library NetworkX is installed, there is support for
+    If the networkx library is installed, there is support for
-    representing the graph as a NetworkX.Graph, or NetworkX.XGraph structure.
+    representing the graph as a networkx.Graph, or networkx.XGraph structure.
    """
-    def __init__(self, array, identifiers=None, name='A'):
+    def __init__(self, input, identifiers=None, name='A', nodepos = None):      
-        Dataset.__init__(self, array=array, identifiers=identifiers, name=name)
+        if isinstance(input, sparse.spmatrix):
            arr = input
        else:
            try:
                arr = asarray(input)
            except:
                raise ValueError("Could not identify input")
        Dataset.__init__(self, array=arr, identifiers=identifiers, name=name)
        self._graph = None
-        self._pos = None
+        self.nodepos = nodepos
-    def asnetworkx(self, nx_type='graph'):
+    def as_spmatrix(self):
-        dim = self.get_dim_name()[0]
+        if isinstance(self._array, sparse.spmatrix):
-        ids = self.get_identifiers(dim, sorted=True)
+            return self._array
-        adj_mat = self.asarray()
+        else:
-        G = self._graph_from_adj_matrix(adj_mat, labels=ids)
+            arr = self.asarray()
            return sparse.csr_matrix(arr.astype('i'))
    def to_spmatrix(self):
        """Convert the stored array to a scipy CSR sparse matrix in place.

        An array that is already a scipy sparse matrix is converted with
        tocsr(); anything else is wrapped in sparse.csr_matrix.  The result
        replaces self._array and nothing is returned.
        """
        if isinstance(self._array, sparse.spmatrix):
            self._array = self._array.tocsr()
        else:
            # Was spelled "scr_matrix", a name scipy.sparse does not have,
            # so every dense input ended in an AttributeError.
            self._array = sparse.csr_matrix(self._array)
    def asnetworkx(self):
        """Return the dataset as a networkx graph, building it on first use.

        The graph is built by _graph_from_incidence_matrix from the stored
        array, using the sorted identifiers of the first dimension as node
        ids and those of the second dimension as edge ids.  The result is
        cached in self._graph and reused on later calls.

        Returns None (and caches nothing) when the builder returns None,
        which it does when networkx cannot be imported.
        """
        # Identity test, so the cached object's own __ne__ is never invoked.
        if self._graph is not None:
            return self._graph
        dim0, dim1 = self.get_dim_name()
        node_ids = self.get_identifiers(dim0, sorted=True)
        edge_ids = self.get_identifiers(dim1, sorted=True)
        built = self._graph_from_incidence_matrix(self._array,
                                                  node_ids=node_ids,
                                                  edge_ids=edge_ids)
        if built is None:
            return None
        # The builder hands back (graph, weights).  Only the graph belongs
        # in the cache; previously the whole tuple was stored and returned.
        if isinstance(built, tuple):
            G = built[0]
        else:
            G = built
        self._graph = G
        return G
-    def _graph_from_adj_matrix(self, A, labels=None):
+    def from_networkx(cls, G, node_dim, edge_dim, sp_format=True):
-        """Creates a networkx graph class from adjacency
+        """Create graph dataset from networkx graph.
        (possibly weighted) matrix and ordered labels.
-        nx_type = ['graph',['xgraph']]
+        When G is a Graph/Digraph edge identifiers will be created,
-        labels = None, results in string-numbered labels
+        else (XGraoh/XDigraph) it is assumed that edge attributes are
        the edge identifiers.
        """
        import networkx as nx
        n = G.number_of_nodes()
        m = G.number_of_edges()
        if isinstance(G, nx.DiGraph):
            G = nx.XDiGraph(G)
            G = G.to_directed()
        elif isinstance(G, nx.Graph):
            G = nx.XGraph(G)
        edge_ids = [e[2] for e in G.edges()]
        node_ids = map(str, G.nodes())
        n2ind = {}
        for ind, node in enumerate(node_ids):
            n2ind[node] = ind
        if sp_format:
            I = sparse.lil_matrix((n, m))
        else:
            I = zeros((m, n), dtype='i')
        for i, (h, t, eid) in enumerate(G.edges()):
            if eid != None:
                edge_ids[i] = eid
            else:
                edge_ids[i] = 'e_' + str(i)
            hind = n2ind[str(h)]
            tind = n2ind[str(t)]
            I[hind, i] = 1
            if G.is_directed():
                I[tind, i] = -1
            else:
                I[tind, i] = 1
        idents = [[node_dim, node_ids], [edge_dim, edge_ids]]
        if G.name != '':
            name = G.name
        else:
            name = 'A'
        ds = GraphDataset(I, idents, name)
        return ds
    from_networkx = classmethod(from_networkx)            
    def _incidence2adjacency(self, I):
        """Incidence to adjacency matrix.
        I*I.T - eye(n)?
        """
        raise NotImplementedError
    def _graph_from_incidence_matrix(self, I, node_ids, edge_ids):
        """Creates a networkx graph class from incidence
        (possibly weighted) matrix and ordered labels.
        labels = None, results in string-numbered labels
        """
        try:
            import networkx as nx
        except:
            print "Failed in import of NetworkX"
            return None
        m, n = A.shape # adjacency matrix must be of type that evals to true/false for neigbours
        if m != n:
            raise IOError, "Adjacency matrix must be square"
-        if A[A[:,0].nonzero()[0][0],0] == 1: #unweighted graph
+        m, n = I.shape
-            G = nx.Graph()
+        assert(m == len(node_ids))
        assert(n == len(edge_ids))
        weights = []
        directed = False
        G = nx.XDiGraph(name=self._name)
        if isinstance(I, sparse.spmatrix):
            I = I.tocsr()
        for ename, col in izip(edge_ids, I.T):
            if isinstance(I, sparse.spmatrix):
                node_ind = col.indices
                w1, w2 = col.data
            else:
-            G = nx.XGraph()
+                node_ind = where(col != 0)[0]
-
+                w1, w2 = col[node_ind]
-        if labels == None: # if labels not provided mark vertices with numbers
+            node1 = node_ids[node_ind[0]]
-            labels = [str(i) for i in range(m)]
+            node2 = node_ids[node_ind[1]]
-
+            if w1 < 0: # w1 is tail
-        for nbrs, head in izip(A, labels):
+                directed = True
-            for i, nbr in enumerate(nbrs):
+                assert(w2 > 0 and (w1 + w2) == 0)
-                if nbr:
+                G.add_edge(node2, node1, ename)
-                    tail = labels[i]
+                weights.append(w2)
-                    if type(G)==nx.XGraph:
+            else: #w2 is tail or graph is undirected
-                        G.add_edge(head, tail, nbr)
+                assert(w1 > 0)
-                    else:
+                if w2 < 0:
-                        G.add_edge(head, tail)
+                    directed = True
-        return G
+                G.add_edge(node1, node2, ename)
                weights.append(w1)
        if not directed:
            G = G.to_undirected()
        return G, asarray(weights)
 Dataset._all_dims = set()
 class ReverseDict(dict):
-    """
+    """A dictionary which can lookup values by key, and keys by value.
-    A dictionary which can lookup values by key, and keys by value.
+    
    All values and keys must be hashable, and unique.
-    d = ReverseDict((['a',1],['b',2]))
+    example:
-    print d['a'] --> 1
+    >>d = ReverseDict((['a',1],['b',2]))
-    print d.reverse[1] --> 'a'
+    >>print d['a'] --> 1
    >>print d.reverse[1] --> 'a'
    """
    def __init__(self, *args, **kw):
        dict.__init__(self, *args, **kw)
@@ -436,6 +527,7 @@ class Selection(dict):
    def select(self, axis, labels):
        """Store labels under the key axis, replacing any previous entry."""
        self[axis] = labels
 def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True):
    """Writes a dataset in fluents tab separated values (ftsv) form.
@@ -471,16 +563,23 @@ def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True):
    fd.write('# type: %s' %type + '\n')
    for dim in ds.get_dim_name():
-        print >> fd, "# dimension: %s" % dim,
+        fd.write("# dimension: %s" % dim)
-        for id in ds.get_identifiers(dim, None, True):
+        for ident in ds.get_identifiers(dim, sorted=True):
-            print >> fd, id,
+            fd.write(" " + ident)
-        print >> fd
+        fd.write("\n")
-    print >> fd, "# name: %s" % ds.get_name()
+    fd.write("# name: %s" % ds.get_name() + '\n')
    # xy-node-positions
    if ds.nodepos != None:
        fd.write("# nodepos:")
        node_dim = ds.get_dim_name(0)
        for ident in ds.get_identifiers(node_dim, sorted=True):
            fd.write(" %s,%s" %ds.nodepos[ident])
        fd.write("\n")
    # Write data
-    if hasattr(ds, "asspmatrix") and sp_format == True:
+    if hasattr(ds, "as_spmatrix") and sp_format == True:
-        m = ds.asspmatrix()
+        m = ds.as_spmatrix()
    else:
        m = ds.asarray()
    if isinstance(m, sparse.spmatrix):
@@ -491,6 +590,107 @@ def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True):
    if opened:
        fd.close()
 def read_ftsv(fd, sep=None):
    """Read a dataset in fluents tab separated values (ftsv) form and return it.

    The header is a run of lines of the form "# key: value".  Header
    reading stops at (and consumes) the first line that does not have that
    form; the rest of the file is handed to _read_sparse_elements or
    _read_elements.

    Recognised header keys:
      dimension -- "dimname id1 id2 ...", one line per dimension
      type      -- 'category' or 'network'; anything else gives a Dataset
      name      -- name of the dataset
      sp_format -- boolean flag selecting the sparse element reader
      nodepos   -- space separated "x,y" pairs, matched in order to the
                   identifiers of the first dimension read so far

    @param fd: An open file object, or a file name (str).  A file opened
    here from a name is always closed again, also when reading fails.
    @param sep: Accepted but not used by this function.
    @return: A Dataset, CategoryDataset or GraphDataset depending on the
    information read.
    @raise ValueError: If the sp_format value is not a recognised boolean.
    """
    opened = False
    if isinstance(fd, str):
        fd = open(fd)
        opened = True

    # Raw string, so the backslashes reach the regex engine untouched.
    split_re = re.compile(r'^#\s*(\w+)\s*:\s*(.+)')

    dimensions = []
    identifiers = {}
    ds_type = 'dataset'
    name = 'Unnamed dataset'
    sp_format = False
    nodepos = None

    try:
        # Read header lines from file.
        line = fd.readline()
        while line:
            m = split_re.match(line)
            if not m:
                break
            key, val = m.groups()

            # The line is on the form;
            # dimension: dimname id1 id2 id3 ...
            if key == 'dimension':
                values = [v.strip() for v in val.split(' ')]
                dimensions.append(values[0])
                identifiers[values[0]] = values[1:]

            # Read type of dataset.
            # Should be dataset, category, or network
            elif key == 'type':
                ds_type = val

            elif key == 'name':
                name = val

            # storage format
            # if sp_format is True then use coordinate triplets
            elif key == 'sp_format':
                if val in ('False', 'false', '0', 'F', 'f'):
                    sp_format = False
                elif val in ('True', 'true', '1', 'T', 't'):
                    sp_format = True
                else:
                    # Report the offending text, not the current flag.
                    raise ValueError("sp_format: %s not valid " % val)

            elif key == 'nodepos':
                node_dim = dimensions[0]
                idents = identifiers[node_dim]
                nodepos = {}
                for node_id, xy in zip(idents, val.split(" ")):
                    x, y = map(float, xy.split(","))
                    nodepos[node_id] = (x, y)

            line = fd.readline()

        # Dimensions in the form [(dim1, [id1, id2, id3 ..]) ...]
        dims = [(x, identifiers[x]) for x in dimensions]
        # A tuple, because the sparse constructor reads a list as data
        # rather than as a shape.
        shape = tuple([len(identifiers[x]) for x in dimensions])

        # Create matrix and pick the element reader
        if sp_format:
            matrix = _read_sparse_elements(fd, sparse.lil_matrix(shape))
        elif ds_type == 'category':
            matrix = _read_elements(fd, empty(shape, dtype='i'))
        else:
            matrix = _read_elements(fd, empty(shape))

        # Create dataset of specified type
        if ds_type == 'category':
            ds = CategoryDataset(matrix, dims, name)
        elif ds_type == 'network':
            ds = GraphDataset(matrix, dims, name=name, nodepos=nodepos)
        else:
            ds = Dataset(matrix, dims, name)
        return ds
    finally:
        if opened:
            fd.close()
 def _write_sparse_elements(fd, arr, fmt='%d', sep=None):
    """ Sparse coordinate format.""" 
    fd.write('# sp_format: True\n\n')
@@ -530,97 +730,3 @@ def _read_sparse_elements(fd, arr, sep=None):
        line = fd.readline()
    return arr.tocsr()
 def read_ftsv(fd, sep=None):
    """Read a dataset in fluents tab separated values (ftsv) form and return it.

    The header is a run of lines of the form "# key: value".  Header
    reading stops at (and consumes) the first line that does not have that
    form; the rest of the file is handed to _read_sparse_elements or
    _read_elements.  The keys acted upon are dimension, type, name and
    sp_format; any other key is skipped.

    @param fd: An open file object, or a file name (str).  A file opened
    here from a name is always closed again, also when reading fails.
    @param sep: Accepted but not used by this function.
    @return: A Dataset, CategoryDataset or GraphDataset depending on the
    information read.
    @raise ValueError: If the sp_format value is not a recognised boolean.
    """
    opened = False
    if isinstance(fd, str):
        fd = open(fd)
        opened = True

    # Raw string, so the backslashes reach the regex engine untouched.
    split_re = re.compile(r'^#\s*(\w+)\s*:\s*(.+)')

    dimensions = []
    identifiers = {}
    ds_type = 'dataset'
    name = 'Unnamed dataset'
    sp_format = False

    try:
        # Read header lines from file.
        line = fd.readline()
        while line:
            m = split_re.match(line)
            if not m:
                break
            key, val = m.groups()

            # The line is on the form;
            # dimension: dimname id1 id2 id3 ...
            if key == 'dimension':
                values = [v.strip() for v in val.split(' ')]
                dimensions.append(values[0])
                identifiers[values[0]] = values[1:]

            # Read type of dataset.
            # Should be dataset, category, or network
            elif key == 'type':
                ds_type = val

            elif key == 'name':
                name = val

            # storage format
            # if sp_format is True then use coordinate triplets
            elif key == 'sp_format':
                if val in ('False', 'false', '0', 'F', 'f'):
                    sp_format = False
                elif val in ('True', 'true', '1', 'T', 't'):
                    sp_format = True
                else:
                    # Report the offending text, not the current flag.
                    raise ValueError("sp_format: %s not valid " % val)

            line = fd.readline()

        # Dimensions in the form [(dim1, [id1, id2, id3 ..]) ...]
        dims = [(x, identifiers[x]) for x in dimensions]
        # A tuple, because the sparse constructor reads a list as data
        # rather than as a shape.
        shape = tuple([len(identifiers[x]) for x in dimensions])

        # Create matrix and pick the element reader.  The sparse reader
        # finishes with arr.tocsr(), so it needs a sparse matrix whatever
        # the dataset type; a dense array was passed for non-category types.
        if sp_format:
            matrix = _read_sparse_elements(fd, sparse.lil_matrix(shape))
        elif ds_type == 'category':
            matrix = _read_elements(fd, empty(shape, dtype='i'))
        else:
            matrix = _read_elements(fd, empty(shape))

        # Create dataset of specified type
        if ds_type == 'category':
            ds = CategoryDataset(matrix, dims, name)
        elif ds_type == 'network':
            ds = GraphDataset(matrix, dims, name)
        else:
            ds = Dataset(matrix, dims, name)
        return ds
    finally:
        if opened:
            fd.close()