Projects/laydi
Projects
/
laydi
Archived
7
0
Fork 0

Sparse network support and nodepos read/write

This commit is contained in:
Arnar Flatberg 2008-01-08 00:43:56 +00:00
parent bf29661af9
commit ab9c1ec84b
1 changed files with 258 additions and 152 deletions

View File

@ -1,4 +1,5 @@
from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros,empty,sparse
from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros,empty,sparse,\
where
from scipy import sort as array_sort
from itertools import izip
import shelve
@ -6,7 +7,7 @@ import copy
import re
class Dataset:
class Dataset(object):
"""The Dataset base class.
A Dataset is an n-way array with defined string identifiers across
@ -273,20 +274,20 @@ class CategoryDataset(Dataset):
def __init__(self, array, identifiers=None, name='C'):
Dataset.__init__(self, array, identifiers=identifiers, name=name)
def asspmatrix(self):
def as_spmatrix(self):
if isinstance(self._array, sparse.spmatrix):
return self._array
else:
arr = self.asarray()
return sparse.csr_matrix(arr.astype('i'))
def tospmatrix(self):
def to_spmatrix(self):
if isinstance(self._array, sparse.spmatrix):
self._array = self._array.tocsr()
else:
self._array = sparse.scr_matrix(self._array)
def as_dict_lists(self):
def as_dictlists(self):
"""Returns data as dict of identifiers along first dim.
ex: data['gene_1'] = ['map0030','map0010', ...]
@ -334,73 +335,163 @@ class CategoryDataset(Dataset):
class GraphDataset(Dataset):
"""The graph dataset class.
A dataset class for representing graphs using an (weighted)
adjacency matrix
(restricted to square symmetric matrices)
A dataset class for representing graphs. The constructor may use an
incidence matrix (possibly sparse) or (if networkx installed) a
networkx.(X)Graph structure.
If the library NetworkX is installed, there is support for
representing the graph as a NetworkX.Graph, or NetworkX.XGraph structure.
If the networkx library is installed, there is support for
representing the graph as a networkx.Graph, or networkx.XGraph structure.
"""
def __init__(self, array, identifiers=None, name='A'):
Dataset.__init__(self, array=array, identifiers=identifiers, name=name)
def __init__(self, input, identifiers=None, name='A', nodepos = None):
if isinstance(input, sparse.spmatrix):
arr = input
else:
try:
arr = asarray(input)
except:
raise ValueError("Could not identify input")
Dataset.__init__(self, array=arr, identifiers=identifiers, name=name)
self._graph = None
self._pos = None
self.nodepos = nodepos
def asnetworkx(self, nx_type='graph'):
dim = self.get_dim_name()[0]
ids = self.get_identifiers(dim, sorted=True)
adj_mat = self.asarray()
G = self._graph_from_adj_matrix(adj_mat, labels=ids)
def as_spmatrix(self):
if isinstance(self._array, sparse.spmatrix):
return self._array
else:
arr = self.asarray()
return sparse.csr_matrix(arr.astype('i'))
def to_spmatrix(self):
if isinstance(self._array, sparse.spmatrix):
self._array = self._array.tocsr()
else:
self._array = sparse.scr_matrix(self._array)
def asnetworkx(self):
if self._graph != None:
return self._graph
dim0, dim1 = self.get_dim_name()
node_ids = self.get_identifiers(dim0, sorted=True)
edge_ids = self.get_identifiers(dim1, sorted=True)
G = self._graph_from_incidence_matrix(self._array, node_ids=node_ids, edge_ids=edge_ids)
self._graph = G
return G
def _graph_from_adj_matrix(self, A, labels=None):
"""Creates a networkx graph class from adjacency
(possibly weighted) matrix and ordered labels.
def from_networkx(cls, G, node_dim, edge_dim, sp_format=True):
"""Create graph dataset from networkx graph.
nx_type = ['graph',['xgraph']]
labels = None, results in string-numbered labels
When G is a Graph/Digraph edge identifiers will be created,
else (XGraoh/XDigraph) it is assumed that edge attributes are
the edge identifiers.
"""
import networkx as nx
n = G.number_of_nodes()
m = G.number_of_edges()
if isinstance(G, nx.DiGraph):
G = nx.XDiGraph(G)
G = G.to_directed()
elif isinstance(G, nx.Graph):
G = nx.XGraph(G)
edge_ids = [e[2] for e in G.edges()]
node_ids = map(str, G.nodes())
n2ind = {}
for ind, node in enumerate(node_ids):
n2ind[node] = ind
if sp_format:
I = sparse.lil_matrix((n, m))
else:
I = zeros((m, n), dtype='i')
for i, (h, t, eid) in enumerate(G.edges()):
if eid != None:
edge_ids[i] = eid
else:
edge_ids[i] = 'e_' + str(i)
hind = n2ind[str(h)]
tind = n2ind[str(t)]
I[hind, i] = 1
if G.is_directed():
I[tind, i] = -1
else:
I[tind, i] = 1
idents = [[node_dim, node_ids], [edge_dim, edge_ids]]
if G.name != '':
name = G.name
else:
name = 'A'
ds = GraphDataset(I, idents, name)
return ds
from_networkx = classmethod(from_networkx)
def _incidence2adjacency(self, I):
"""Incidence to adjacency matrix.
I*I.T - eye(n)?
"""
raise NotImplementedError
def _graph_from_incidence_matrix(self, I, node_ids, edge_ids):
"""Creates a networkx graph class from incidence
(possibly weighted) matrix and ordered labels.
labels = None, results in string-numbered labels
"""
try:
import networkx as nx
except:
print "Failed in import of NetworkX"
return None
m, n = A.shape # adjacency matrix must be of type that evals to true/false for neigbours
if m != n:
raise IOError, "Adjacency matrix must be square"
if A[A[:,0].nonzero()[0][0],0] == 1: #unweighted graph
G = nx.Graph()
else:
G = nx.XGraph()
if labels == None: # if labels not provided mark vertices with numbers
labels = [str(i) for i in range(m)]
for nbrs, head in izip(A, labels):
for i, nbr in enumerate(nbrs):
if nbr:
tail = labels[i]
if type(G)==nx.XGraph:
G.add_edge(head, tail, nbr)
else:
G.add_edge(head, tail)
return G
m, n = I.shape
assert(m == len(node_ids))
assert(n == len(edge_ids))
weights = []
directed = False
G = nx.XDiGraph(name=self._name)
if isinstance(I, sparse.spmatrix):
I = I.tocsr()
for ename, col in izip(edge_ids, I.T):
if isinstance(I, sparse.spmatrix):
node_ind = col.indices
w1, w2 = col.data
else:
node_ind = where(col != 0)[0]
w1, w2 = col[node_ind]
node1 = node_ids[node_ind[0]]
node2 = node_ids[node_ind[1]]
if w1 < 0: # w1 is tail
directed = True
assert(w2 > 0 and (w1 + w2) == 0)
G.add_edge(node2, node1, ename)
weights.append(w2)
else: #w2 is tail or graph is undirected
assert(w1 > 0)
if w2 < 0:
directed = True
G.add_edge(node1, node2, ename)
weights.append(w1)
if not directed:
G = G.to_undirected()
return G, asarray(weights)
Dataset._all_dims = set()
class ReverseDict(dict):
"""
A dictionary which can lookup values by key, and keys by value.
"""A dictionary which can lookup values by key, and keys by value.
All values and keys must be hashable, and unique.
d = ReverseDict((['a',1],['b',2]))
print d['a'] --> 1
print d.reverse[1] --> 'a'
example:
>>d = ReverseDict((['a',1],['b',2]))
>>print d['a'] --> 1
>>print d.reverse[1] --> 'a'
"""
def __init__(self, *args, **kw):
dict.__init__(self, *args, **kw)
@ -436,6 +527,7 @@ class Selection(dict):
def select(self, axis, labels):
self[axis] = labels
def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True):
"""Writes a dataset in fluents tab separated values (ftsv) form.
@ -471,16 +563,23 @@ def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True):
fd.write('# type: %s' %type + '\n')
for dim in ds.get_dim_name():
print >> fd, "# dimension: %s" % dim,
for id in ds.get_identifiers(dim, None, True):
print >> fd, id,
print >> fd
fd.write("# dimension: %s" % dim)
for ident in ds.get_identifiers(dim, sorted=True):
fd.write(" " + ident)
fd.write("\n")
print >> fd, "# name: %s" % ds.get_name()
fd.write("# name: %s" % ds.get_name() + '\n')
# xy-node-positions
if ds.nodepos != None:
fd.write("# nodepos:")
node_dim = ds.get_dim_name(0)
for ident in ds.get_identifiers(node_dim, sorted=True):
fd.write(" %s,%s" %ds.nodepos[ident])
fd.write("\n")
# Write data
if hasattr(ds, "asspmatrix") and sp_format == True:
m = ds.asspmatrix()
if hasattr(ds, "as_spmatrix") and sp_format == True:
m = ds.as_spmatrix()
else:
m = ds.asarray()
if isinstance(m, sparse.spmatrix):
@ -491,6 +590,107 @@ def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True):
if opened:
fd.close()
def read_ftsv(fd, sep=None):
"""Read a dataset in fluents tab separated values (ftsv) form and return it.
@param fd: An open file descriptor.
@return: A Dataset, CategoryDataset or GraphDataset depending on the information
read.
"""
opened = False
if isinstance(fd, str):
fd = open(fd)
opened = True
split_re = re.compile('^#\s*(\w+)\s*:\s*(.+)')
dimensions = []
identifiers = {}
type = 'dataset'
name = 'Unnamed dataset'
sp_format = False
nodepos = None
# graphtype = 'graph'
# Read header lines from file.
line = fd.readline()
while line:
m = split_re.match(line)
if m:
key, val = m.groups()
# The line is on the form;
# dimension: dimname id1 id2 id3 ...
if key == 'dimension':
values = [v.strip() for v in val.split(' ')]
dimensions.append(values[0])
identifiers[values[0]] = values[1:]
# Read type of dataset.
# Should be dataset, category, or network
elif key == 'type':
type = val
elif key == 'name':
name = val
# storage format
# if sp_format is True then use coordinate triplets
elif key == 'sp_format':
if val in ['False', 'false', '0', 'F', 'f',]:
sp_format = False
elif val in ['True', 'true', '1', 'T', 't']:
sp_format = True
else:
raise ValueError("sp_format: %s not valid " %sp_format)
elif key == 'nodepos':
node_dim = dimensions[0]
idents = identifiers[node_dim]
nodepos = {}
xys = val.split(" ")
for node_id, xy in zip(idents, xys):
x, y = map(float, xy.split(","))
nodepos[node_id] = (x, y)
else:
break
line = fd.readline()
# Dimensions in the form [(dim1, [id1, id2, id3 ..) ...]
dims = [(x, identifiers[x]) for x in dimensions]
dim_lengths = [len(identifiers[x]) for x in dimensions]
# Create matrix and assign element reader
if type == 'category':
if sp_format:
matrix = sparse.lil_matrix(dim_lengths)
else:
matrix = empty(dim_lengths, dtype='i')
else:
if sp_format:
matrix = sparse.lil_matrix(dim_lengths)
else:
matrix = empty(dim_lengths)
if sp_format:
matrix = _read_sparse_elements(fd, matrix)
else:
matrix = _read_elements(fd, matrix)
# Create dataset of specified type
if type == 'category':
ds = CategoryDataset(matrix, dims, name)
elif type == 'network':
ds = GraphDataset(matrix, dims, name=name, nodepos=nodepos)
else:
ds = Dataset(matrix, dims, name)
if opened:
fd.close()
return ds
def _write_sparse_elements(fd, arr, fmt='%d', sep=None):
""" Sparse coordinate format."""
fd.write('# sp_format: True\n\n')
@ -530,97 +730,3 @@ def _read_sparse_elements(fd, arr, sep=None):
line = fd.readline()
return arr.tocsr()
def read_ftsv(fd, sep=None):
"""Read a dataset in fluents tab separated values (ftsv) form and return it.
@param fd: An open file descriptor.
@return: A Dataset, CategoryDataset or GraphDataset depending on the information
read.
"""
opened = False
if isinstance(fd, str):
fd = open(fd)
opened = True
split_re = re.compile('^#\s*(\w+)\s*:\s*(.+)')
dimensions = []
identifiers = {}
type = 'dataset'
name = 'Unnamed dataset'
sp_format = False
# graphtype = 'graph'
# Read header lines from file.
line = fd.readline()
while line:
m = split_re.match(line)
if m:
key, val = m.groups()
# The line is on the form;
# dimension: dimname id1 id2 id3 ...
if key == 'dimension':
values = [v.strip() for v in val.split(' ')]
dimensions.append(values[0])
identifiers[values[0]] = values[1:]
# Read type of dataset.
# Should be dataset, category, or network
elif key == 'type':
type = val
elif key == 'name':
name = val
# storage format
# if sp_format is True then use coordinate triplets
elif key == 'sp_format':
if val in ['False', 'false', '0', 'F', 'f',]:
sp_format = False
elif val in ['True', 'true', '1', 'T', 't']:
sp_format = True
else:
raise ValueError("sp_format: %s not valid " %sp_format)
# elif key == 'graphtype':
# graphtype = val
else:
break
line = fd.readline()
# Dimensions in the form [(dim1, [id1, id2, id3 ..) ...]
dims = [(x, identifiers[x]) for x in dimensions]
dim_lengths = [len(identifiers[x]) for x in dimensions]
# Create matrix and assign element reader
if type == 'category':
if sp_format:
matrix = sparse.lil_matrix(dim_lengths)
else:
matrix = empty(dim_lengths, dtype='i')
elif type == 'network':
matrix = empty(dim_lengths)
else:
matrix = empty(dim_lengths)
if sp_format:
matrix = _read_sparse_elements(fd, matrix)
else:
matrix = _read_elements(fd, matrix)
# Create dataset of specified type
if type == 'category':
ds = CategoryDataset(matrix, dims, name)
elif type == 'network':
ds = GraphDataset(matrix, dims, name)
else:
ds = Dataset(matrix, dims, name)
if opened:
fd.close()
return ds