Sparse network support and nodepos read/write

2008-01-08 00:43:56 +00:00
parent bf29661af9
commit ab9c1ec84b
1 changed files with 258 additions and 152 deletions
--- a/fluents/dataset.py
+++ b/fluents/dataset.py
@@ -1,4 +1,5 @@
-from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros,empty,sparse
+from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros,empty,sparse,\
+where
 from scipy import sort as array_sort
 from itertools import izip
 import shelve
@@ -6,7 +7,7 @@ import copy
 import re


-class Dataset:
+class Dataset(object):
    """The Dataset base class.
    
    A Dataset is an n-way array with defined string identifiers across
@@ -273,20 +274,20 @@ class CategoryDataset(Dataset):
    def __init__(self, array, identifiers=None, name='C'):
        Dataset.__init__(self, array, identifiers=identifiers, name=name)

-    def asspmatrix(self):
+    def as_spmatrix(self):
        if isinstance(self._array, sparse.spmatrix):
            return self._array
        else:
            arr = self.asarray()
            return sparse.csr_matrix(arr.astype('i'))

-    def tospmatrix(self):
+    def to_spmatrix(self):
        if isinstance(self._array, sparse.spmatrix):
            self._array = self._array.tocsr()
        else:
            self._array = sparse.scr_matrix(self._array)

-    def as_dict_lists(self):
+    def as_dictlists(self):
        """Returns data as dict of identifiers along first dim.

        ex: data['gene_1'] = ['map0030','map0010', ...]
@@ -334,73 +335,163 @@ class CategoryDataset(Dataset):
 class GraphDataset(Dataset):
    """The graph dataset class.

-    A dataset class for representing graphs using an (weighted)
-    adjacency matrix
-    (restricted to square symmetric matrices)
+    A dataset class for representing graphs. The constructor may use an 
+    incidence matrix (possibly sparse) or (if networkx installed) a 
+    networkx.(X)Graph structure.
    
-    If the library NetworkX is installed, there is support for
-    representing the graph as a NetworkX.Graph, or NetworkX.XGraph structure.
+    If the networkx library is installed, there is support for
+    representing the graph as a networkx.Graph, or networkx.XGraph structure.
    """
    
-    def __init__(self, array, identifiers=None, name='A'):
-        Dataset.__init__(self, array=array, identifiers=identifiers, name=name)
+    def __init__(self, input, identifiers=None, name='A', nodepos = None):      
+        if isinstance(input, sparse.spmatrix):
+            arr = input
+        else:
+            try:
+                arr = asarray(input)
+            except:
+                raise ValueError("Could not identify input")
+        Dataset.__init__(self, array=arr, identifiers=identifiers, name=name)
        self._graph = None
-        self._pos = None
+        self.nodepos = nodepos
        
-    def asnetworkx(self, nx_type='graph'):
-        dim = self.get_dim_name()[0]
-        ids = self.get_identifiers(dim, sorted=True)
-        adj_mat = self.asarray()
-        G = self._graph_from_adj_matrix(adj_mat, labels=ids)
+    def as_spmatrix(self):
+        if isinstance(self._array, sparse.spmatrix):
+            return self._array
+        else:
+            arr = self.asarray()
+            return sparse.csr_matrix(arr.astype('i'))
+
+    def to_spmatrix(self):
+        if isinstance(self._array, sparse.spmatrix):
+            self._array = self._array.tocsr()
+        else:
+            self._array = sparse.scr_matrix(self._array)
+    
+    def asnetworkx(self):
+        if self._graph != None:
+            return self._graph
+        dim0, dim1 = self.get_dim_name()
+        node_ids = self.get_identifiers(dim0, sorted=True)
+        edge_ids = self.get_identifiers(dim1, sorted=True)
+        G = self._graph_from_incidence_matrix(self._array, node_ids=node_ids, edge_ids=edge_ids)
        self._graph = G
        return G
        
-    def _graph_from_adj_matrix(self, A, labels=None):
-        """Creates a networkx graph class from adjacency
-        (possibly weighted) matrix and ordered labels.
+    def from_networkx(cls, G, node_dim, edge_dim, sp_format=True):
+        """Create graph dataset from networkx graph.
        
-        nx_type = ['graph',['xgraph']]
-        labels = None, results in string-numbered labels
+        When G is a Graph/DiGraph, edge identifiers will be created;
+        else (XGraph/XDiGraph) it is assumed that edge attributes are
+        the edge identifiers.
        """
        
+        import networkx as nx
+        n = G.number_of_nodes()
+        m = G.number_of_edges()
+        
+        if isinstance(G, nx.DiGraph):
+            G = nx.XDiGraph(G)
+            G = G.to_directed()
+        elif isinstance(G, nx.Graph):
+            G = nx.XGraph(G)
+        
+        edge_ids = [e[2] for e in G.edges()]
+        node_ids = map(str, G.nodes())
+        n2ind = {}
+        for ind, node in enumerate(node_ids):
+            n2ind[node] = ind
+        
+        if sp_format:
+            I = sparse.lil_matrix((n, m))
+        else:
+            I = zeros((m, n), dtype='i')
+        
+        for i, (h, t, eid) in enumerate(G.edges()):
+            if eid != None:
+                edge_ids[i] = eid
+            else:
+                edge_ids[i] = 'e_' + str(i)
+            hind = n2ind[str(h)]
+            tind = n2ind[str(t)]
+            I[hind, i] = 1
+            if G.is_directed():
+                I[tind, i] = -1
+            else:
+                I[tind, i] = 1
+        idents = [[node_dim, node_ids], [edge_dim, edge_ids]]
+        if G.name != '':
+            name = G.name
+        else:
+            name = 'A'
+        ds = GraphDataset(I, idents, name)
+        return ds
+    
+    from_networkx = classmethod(from_networkx)            
+    
+    def _incidence2adjacency(self, I):
+        """Incidence to adjacency matrix.
+        
+        I*I.T - eye(n)?
+        """
+        raise NotImplementedError
+    
+    def _graph_from_incidence_matrix(self, I, node_ids, edge_ids):
+        """Creates a networkx graph class from incidence
+        (possibly weighted) matrix and ordered labels.
+        
+        node_ids label the rows of I and edge_ids label its columns.
+        """
        try:
            import networkx as nx
        except:
            print "Failed in import of NetworkX"
            return None
-        m, n = A.shape # adjacency matrix must be of type that evals to true/false for neigbours
-        if m != n:
-            raise IOError, "Adjacency matrix must be square"

-        if A[A[:,0].nonzero()[0][0],0] == 1: #unweighted graph
-            G = nx.Graph()
-        else:
-            G = nx.XGraph()
-
-        if labels == None: # if labels not provided mark vertices with numbers
-            labels = [str(i) for i in range(m)]
-
-        for nbrs, head in izip(A, labels):
-            for i, nbr in enumerate(nbrs):
-                if nbr:
-                    tail = labels[i]
-                    if type(G)==nx.XGraph:
-                        G.add_edge(head, tail, nbr)
-                    else:
-                        G.add_edge(head, tail)
-        return G
+        m, n = I.shape
+        assert(m == len(node_ids))
+        assert(n == len(edge_ids))
+        weights = []
+        directed = False
+        G = nx.XDiGraph(name=self._name)
+        if isinstance(I, sparse.spmatrix):
+            I = I.tocsr()
+        for ename, col in izip(edge_ids, I.T):
+            if isinstance(I, sparse.spmatrix):
+                node_ind = col.indices
+                w1, w2 = col.data
+            else:
+                node_ind = where(col != 0)[0]
+                w1, w2 = col[node_ind]
+            node1 = node_ids[node_ind[0]]
+            node2 = node_ids[node_ind[1]]
+            if w1 < 0: # w1 is tail
+                directed = True
+                assert(w2 > 0 and (w1 + w2) == 0)
+                G.add_edge(node2, node1, ename)
+                weights.append(w2)
+            else: #w2 is tail or graph is undirected
+                assert(w1 > 0)
+                if w2 < 0:
+                    directed = True
+                G.add_edge(node1, node2, ename)
+                weights.append(w1)
+        if not directed:
+            G = G.to_undirected()
+        return G, asarray(weights)

 Dataset._all_dims = set()


 class ReverseDict(dict):
-    """
-    A dictionary which can lookup values by key, and keys by value.
+    """A dictionary which can lookup values by key, and keys by value.
+    
    All values and keys must be hashable, and unique.
    
-    d = ReverseDict((['a',1],['b',2]))
-    print d['a'] --> 1
-    print d.reverse[1] --> 'a'
+    example:
+    >>d = ReverseDict((['a',1],['b',2]))
+    >>print d['a'] --> 1
+    >>print d.reverse[1] --> 'a'
    """
    def __init__(self, *args, **kw):
        dict.__init__(self, *args, **kw)
@@ -436,6 +527,7 @@ class Selection(dict):
    def select(self, axis, labels):
        self[axis] = labels

+
 def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True):
    """Writes a dataset in fluents tab separated values (ftsv) form.
    
@@ -471,16 +563,23 @@ def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True):
    fd.write('# type: %s' %type + '\n')

    for dim in ds.get_dim_name():
-        print >> fd, "# dimension: %s" % dim,
-        for id in ds.get_identifiers(dim, None, True):
-            print >> fd, id,
-        print >> fd
+        fd.write("# dimension: %s" % dim)
+        for ident in ds.get_identifiers(dim, sorted=True):
+            fd.write(" " + ident)
+        fd.write("\n")

-    print >> fd, "# name: %s" % ds.get_name()
+    fd.write("# name: %s" % ds.get_name() + '\n')
+    # xy-node-positions
+    if ds.nodepos != None:
+        fd.write("# nodepos:")
+        node_dim = ds.get_dim_name(0)
+        for ident in ds.get_identifiers(node_dim, sorted=True):
+            fd.write(" %s,%s" %ds.nodepos[ident])
+        fd.write("\n")
    
    # Write data
-    if hasattr(ds, "asspmatrix") and sp_format == True:
-        m = ds.asspmatrix()
+    if hasattr(ds, "as_spmatrix") and sp_format == True:
+        m = ds.as_spmatrix()
    else:
        m = ds.asarray()
    if isinstance(m, sparse.spmatrix):
@@ -491,6 +590,107 @@ def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True):
    if opened:
        fd.close()

+def read_ftsv(fd, sep=None):
+    """Read a dataset in fluents tab separated values (ftsv) form and return it.
+    
+    @param fd: An open file object, or a filename (str) to be opened.
+    @return: A Dataset, CategoryDataset or GraphDataset depending on the information
+    read.
+    """
+    opened = False
+    if isinstance(fd, str):
+        fd = open(fd)
+        opened = True
+
+    split_re = re.compile('^#\s*(\w+)\s*:\s*(.+)')
+    dimensions = []
+    identifiers = {}
+    type = 'dataset'
+    name = 'Unnamed dataset'
+    sp_format = False
+    nodepos = None
+    # graphtype = 'graph'
+
+    # Read header lines from file.
+    line = fd.readline()
+    while line:
+        m = split_re.match(line)
+        if m:
+            key, val = m.groups()
+            
+            # The line is of the form:
+            # dimension: dimname id1 id2 id3 ...
+            if key == 'dimension':
+                values = [v.strip() for v in val.split(' ')]
+                dimensions.append(values[0])
+                identifiers[values[0]] = values[1:]
+
+            # Read type of dataset.
+            # Should be dataset, category, or network
+            elif key == 'type':
+                type = val
+            
+            elif key == 'name':
+                name = val
+            
+            # storage format
+            # if sp_format is True then use coordinate triplets
+            elif key == 'sp_format':
+                if val in ['False', 'false', '0', 'F', 'f',]:
+                    sp_format = False
+                elif val in ['True', 'true', '1', 'T', 't']:
+                    sp_format = True
+                else:
+                    raise ValueError("sp_format: %s not valid " %sp_format)
+            
+            elif key == 'nodepos':
+                node_dim = dimensions[0]
+                idents = identifiers[node_dim]
+                nodepos = {}
+                xys = val.split(" ")
+                for node_id, xy in zip(idents, xys):
+                    x, y = map(float, xy.split(","))
+                    nodepos[node_id] = (x, y)
+        
+        else:
+            break
+        line = fd.readline()
+
+    # Dimensions in the form [(dim1, [id1, id2, id3, ...]), ...]
+    dims = [(x, identifiers[x]) for x in dimensions]
+    dim_lengths = [len(identifiers[x]) for x in dimensions]
+
+    # Create matrix and assign element reader
+    if type == 'category':
+        if sp_format:
+            matrix = sparse.lil_matrix(dim_lengths)
+        else:
+            matrix = empty(dim_lengths, dtype='i')
+    else:
+        if sp_format:
+            matrix = sparse.lil_matrix(dim_lengths)
+        else:
+            matrix = empty(dim_lengths)
+
+    if sp_format:
+        matrix = _read_sparse_elements(fd, matrix)
+    else:
+        matrix = _read_elements(fd, matrix)
+    
+
+    # Create dataset of specified type
+    if type == 'category':
+        ds = CategoryDataset(matrix, dims, name)
+    elif type == 'network':
+        ds = GraphDataset(matrix, dims, name=name, nodepos=nodepos)
+    else:
+        ds = Dataset(matrix, dims, name)
+
+    if opened:
+        fd.close()
+
+    return ds
+
 def _write_sparse_elements(fd, arr, fmt='%d', sep=None):
    """ Sparse coordinate format.""" 
    fd.write('# sp_format: True\n\n')
@@ -530,97 +730,3 @@ def _read_sparse_elements(fd, arr, sep=None):
        line = fd.readline()
    return arr.tocsr()

-def read_ftsv(fd, sep=None):
-    """Read a dataset in fluents tab separated values (ftsv) form and return it.
-    
-    @param fd: An open file descriptor.
-    @return: A Dataset, CategoryDataset or GraphDataset depending on the information
-    read.
-    """
-    opened = False
-    if isinstance(fd, str):
-        fd = open(fd)
-        opened = True
-
-    split_re = re.compile('^#\s*(\w+)\s*:\s*(.+)')
-    dimensions = []
-    identifiers = {}
-    type = 'dataset'
-    name = 'Unnamed dataset'
-    sp_format = False
-    # graphtype = 'graph'
-
-    # Read header lines from file.
-    line = fd.readline()
-    while line:
-        m = split_re.match(line)
-        if m:
-            key, val = m.groups()
-            
-            # The line is on the form;
-            # dimension: dimname id1 id2 id3 ...
-            if key == 'dimension':
-                values = [v.strip() for v in val.split(' ')]
-                dimensions.append(values[0])
-                identifiers[values[0]] = values[1:]
-
-            # Read type of dataset.
-            # Should be dataset, category, or network
-            elif key == 'type':
-                type = val
-            
-            elif key == 'name':
-                name = val
-            
-            # storage format
-            # if sp_format is True then use coordinate triplets
-            elif key == 'sp_format':
-                if val in ['False', 'false', '0', 'F', 'f',]:
-                    sp_format = False
-                elif val in ['True', 'true', '1', 'T', 't']:
-                    sp_format = True
-                else:
-                    raise ValueError("sp_format: %s not valid " %sp_format)
-            
-            # elif key == 'graphtype':
-            #    graphtype = val
-
-        else:
-            break
-        line = fd.readline()
-
-    # Dimensions in the form [(dim1, [id1, id2, id3 ..) ...] 
-    dims = [(x, identifiers[x]) for x in dimensions]
-    dim_lengths = [len(identifiers[x]) for x in dimensions]
-
-    # Create matrix and assign element reader
-    if type == 'category':
-        if sp_format:
-            matrix = sparse.lil_matrix(dim_lengths)
-        else:
-            matrix = empty(dim_lengths, dtype='i')
-    elif type == 'network':
-        matrix = empty(dim_lengths)
-    else:
-        matrix = empty(dim_lengths)
-
-    if sp_format:
-        matrix = _read_sparse_elements(fd, matrix)
-    else:
-        matrix = _read_elements(fd, matrix)
-    
-
-    # Create dataset of specified type
-    if type == 'category':
-        ds = CategoryDataset(matrix, dims, name)
-    elif type == 'network':
-        ds = GraphDataset(matrix, dims, name)
-    else:
-        ds = Dataset(matrix, dims, name)
-
-    if opened:
-        fd.close()
-
-    return ds
-
-