Projects/laydi
Projects
/
laydi
Archived
7
0
Fork 0

Sparse network support and nodepos read/write

This commit is contained in:
Arnar Flatberg 2008-01-08 00:43:56 +00:00
parent bf29661af9
commit ab9c1ec84b
1 changed files with 258 additions and 152 deletions

View File

@ -1,4 +1,5 @@
from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros,empty,sparse from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros,empty,sparse,\
where
from scipy import sort as array_sort from scipy import sort as array_sort
from itertools import izip from itertools import izip
import shelve import shelve
@ -6,7 +7,7 @@ import copy
import re import re
class Dataset: class Dataset(object):
"""The Dataset base class. """The Dataset base class.
A Dataset is an n-way array with defined string identifiers across A Dataset is an n-way array with defined string identifiers across
@ -273,20 +274,20 @@ class CategoryDataset(Dataset):
def __init__(self, array, identifiers=None, name='C'): def __init__(self, array, identifiers=None, name='C'):
Dataset.__init__(self, array, identifiers=identifiers, name=name) Dataset.__init__(self, array, identifiers=identifiers, name=name)
def asspmatrix(self): def as_spmatrix(self):
if isinstance(self._array, sparse.spmatrix): if isinstance(self._array, sparse.spmatrix):
return self._array return self._array
else: else:
arr = self.asarray() arr = self.asarray()
return sparse.csr_matrix(arr.astype('i')) return sparse.csr_matrix(arr.astype('i'))
def tospmatrix(self): def to_spmatrix(self):
if isinstance(self._array, sparse.spmatrix): if isinstance(self._array, sparse.spmatrix):
self._array = self._array.tocsr() self._array = self._array.tocsr()
else: else:
self._array = sparse.scr_matrix(self._array) self._array = sparse.scr_matrix(self._array)
def as_dict_lists(self): def as_dictlists(self):
"""Returns data as dict of identifiers along first dim. """Returns data as dict of identifiers along first dim.
ex: data['gene_1'] = ['map0030','map0010', ...] ex: data['gene_1'] = ['map0030','map0010', ...]
@ -334,73 +335,163 @@ class CategoryDataset(Dataset):
class GraphDataset(Dataset): class GraphDataset(Dataset):
"""The graph dataset class. """The graph dataset class.
A dataset class for representing graphs using an (weighted) A dataset class for representing graphs. The constructor may use an
adjacency matrix incidence matrix (possibly sparse) or (if networkx installed) a
(restricted to square symmetric matrices) networkx.(X)Graph structure.
If the library NetworkX is installed, there is support for If the networkx library is installed, there is support for
representing the graph as a NetworkX.Graph, or NetworkX.XGraph structure. representing the graph as a networkx.Graph, or networkx.XGraph structure.
""" """
def __init__(self, array, identifiers=None, name='A'): def __init__(self, input, identifiers=None, name='A', nodepos = None):
Dataset.__init__(self, array=array, identifiers=identifiers, name=name) if isinstance(input, sparse.spmatrix):
arr = input
else:
try:
arr = asarray(input)
except:
raise ValueError("Could not identify input")
Dataset.__init__(self, array=arr, identifiers=identifiers, name=name)
self._graph = None self._graph = None
self._pos = None self.nodepos = nodepos
def asnetworkx(self, nx_type='graph'): def as_spmatrix(self):
dim = self.get_dim_name()[0] if isinstance(self._array, sparse.spmatrix):
ids = self.get_identifiers(dim, sorted=True) return self._array
adj_mat = self.asarray() else:
G = self._graph_from_adj_matrix(adj_mat, labels=ids) arr = self.asarray()
return sparse.csr_matrix(arr.astype('i'))
def to_spmatrix(self):
    """Convert the stored array to a scipy CSR sparse matrix, in place.

    If self._array is already a scipy sparse matrix it is converted with
    tocsr(); otherwise a CSR matrix is built from the stored array.
    The result replaces self._array; nothing is returned.
    """
    if isinstance(self._array, sparse.spmatrix):
        self._array = self._array.tocsr()
    else:
        # The constructor is csr_matrix (compressed sparse row); the
        # previous spelling "scr_matrix" does not exist in scipy.sparse.
        self._array = sparse.csr_matrix(self._array)
def asnetworkx(self):
if self._graph != None:
return self._graph
dim0, dim1 = self.get_dim_name()
node_ids = self.get_identifiers(dim0, sorted=True)
edge_ids = self.get_identifiers(dim1, sorted=True)
G = self._graph_from_incidence_matrix(self._array, node_ids=node_ids, edge_ids=edge_ids)
self._graph = G self._graph = G
return G return G
def _graph_from_adj_matrix(self, A, labels=None): def from_networkx(cls, G, node_dim, edge_dim, sp_format=True):
"""Creates a networkx graph class from adjacency """Create graph dataset from networkx graph.
(possibly weighted) matrix and ordered labels.
nx_type = ['graph',['xgraph']] When G is a Graph/Digraph edge identifiers will be created,
labels = None, results in string-numbered labels else (XGraph/XDigraph) it is assumed that edge attributes are
the edge identifiers.
""" """
import networkx as nx
n = G.number_of_nodes()
m = G.number_of_edges()
if isinstance(G, nx.DiGraph):
G = nx.XDiGraph(G)
G = G.to_directed()
elif isinstance(G, nx.Graph):
G = nx.XGraph(G)
edge_ids = [e[2] for e in G.edges()]
node_ids = map(str, G.nodes())
n2ind = {}
for ind, node in enumerate(node_ids):
n2ind[node] = ind
if sp_format:
I = sparse.lil_matrix((n, m))
else:
I = zeros((m, n), dtype='i')
for i, (h, t, eid) in enumerate(G.edges()):
if eid != None:
edge_ids[i] = eid
else:
edge_ids[i] = 'e_' + str(i)
hind = n2ind[str(h)]
tind = n2ind[str(t)]
I[hind, i] = 1
if G.is_directed():
I[tind, i] = -1
else:
I[tind, i] = 1
idents = [[node_dim, node_ids], [edge_dim, edge_ids]]
if G.name != '':
name = G.name
else:
name = 'A'
ds = GraphDataset(I, idents, name)
return ds
from_networkx = classmethod(from_networkx)
def _incidence2adjacency(self, I):
"""Incidence to adjacency matrix.
I*I.T - eye(n)?
"""
raise NotImplementedError
def _graph_from_incidence_matrix(self, I, node_ids, edge_ids):
"""Creates a networkx graph class from incidence
(possibly weighted) matrix and ordered labels.
labels = None, results in string-numbered labels
"""
try: try:
import networkx as nx import networkx as nx
except: except:
print "Failed in import of NetworkX" print "Failed in import of NetworkX"
return None return None
m, n = A.shape # adjacency matrix must be of type that evals to true/false for neighbours
if m != n:
raise IOError, "Adjacency matrix must be square"
if A[A[:,0].nonzero()[0][0],0] == 1: #unweighted graph m, n = I.shape
G = nx.Graph() assert(m == len(node_ids))
assert(n == len(edge_ids))
weights = []
directed = False
G = nx.XDiGraph(name=self._name)
if isinstance(I, sparse.spmatrix):
I = I.tocsr()
for ename, col in izip(edge_ids, I.T):
if isinstance(I, sparse.spmatrix):
node_ind = col.indices
w1, w2 = col.data
else: else:
G = nx.XGraph() node_ind = where(col != 0)[0]
w1, w2 = col[node_ind]
if labels == None: # if labels not provided mark vertices with numbers node1 = node_ids[node_ind[0]]
labels = [str(i) for i in range(m)] node2 = node_ids[node_ind[1]]
if w1 < 0: # w1 is tail
for nbrs, head in izip(A, labels): directed = True
for i, nbr in enumerate(nbrs): assert(w2 > 0 and (w1 + w2) == 0)
if nbr: G.add_edge(node2, node1, ename)
tail = labels[i] weights.append(w2)
if type(G)==nx.XGraph: else: #w2 is tail or graph is undirected
G.add_edge(head, tail, nbr) assert(w1 > 0)
else: if w2 < 0:
G.add_edge(head, tail) directed = True
return G G.add_edge(node1, node2, ename)
weights.append(w1)
if not directed:
G = G.to_undirected()
return G, asarray(weights)
Dataset._all_dims = set() Dataset._all_dims = set()
class ReverseDict(dict): class ReverseDict(dict):
""" """A dictionary which can lookup values by key, and keys by value.
A dictionary which can lookup values by key, and keys by value.
All values and keys must be hashable, and unique. All values and keys must be hashable, and unique.
d = ReverseDict((['a',1],['b',2])) example:
print d['a'] --> 1 >>d = ReverseDict((['a',1],['b',2]))
print d.reverse[1] --> 'a' >>print d['a'] --> 1
>>print d.reverse[1] --> 'a'
""" """
def __init__(self, *args, **kw): def __init__(self, *args, **kw):
dict.__init__(self, *args, **kw) dict.__init__(self, *args, **kw)
@ -436,6 +527,7 @@ class Selection(dict):
def select(self, axis, labels): def select(self, axis, labels):
self[axis] = labels self[axis] = labels
def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True): def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True):
"""Writes a dataset in fluents tab separated values (ftsv) form. """Writes a dataset in fluents tab separated values (ftsv) form.
@ -471,16 +563,23 @@ def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True):
fd.write('# type: %s' %type + '\n') fd.write('# type: %s' %type + '\n')
for dim in ds.get_dim_name(): for dim in ds.get_dim_name():
print >> fd, "# dimension: %s" % dim, fd.write("# dimension: %s" % dim)
for id in ds.get_identifiers(dim, None, True): for ident in ds.get_identifiers(dim, sorted=True):
print >> fd, id, fd.write(" " + ident)
print >> fd fd.write("\n")
print >> fd, "# name: %s" % ds.get_name() fd.write("# name: %s" % ds.get_name() + '\n')
# xy-node-positions
if ds.nodepos != None:
fd.write("# nodepos:")
node_dim = ds.get_dim_name(0)
for ident in ds.get_identifiers(node_dim, sorted=True):
fd.write(" %s,%s" %ds.nodepos[ident])
fd.write("\n")
# Write data # Write data
if hasattr(ds, "asspmatrix") and sp_format == True: if hasattr(ds, "as_spmatrix") and sp_format == True:
m = ds.asspmatrix() m = ds.as_spmatrix()
else: else:
m = ds.asarray() m = ds.asarray()
if isinstance(m, sparse.spmatrix): if isinstance(m, sparse.spmatrix):
@ -491,6 +590,107 @@ def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True):
if opened: if opened:
fd.close() fd.close()
def read_ftsv(fd, sep=None):
    """Read a dataset in fluents tab separated values (ftsv) form and return it.

    The file starts with header lines of the form ``# key: value``. The
    first line that does not match that form ends the header; the rest of
    the file is consumed by the element readers.

    Recognised header keys are ``dimension``, ``type``, ``name``,
    ``sp_format`` and ``nodepos``. Other keys are skipped.

    @param fd: An open file object, or a str that is passed to open().
    @param sep: Not used by this function.
    @return: A Dataset, CategoryDataset or GraphDataset depending on the
        ``type`` header.
    @raise ValueError: If the ``sp_format`` header holds an unknown value.
    """
    opened = False
    if isinstance(fd, str):
        fd = open(fd)
        opened = True

    # try/finally so a file opened here is closed even when parsing fails.
    try:
        header_re = re.compile(r'^#\s*(\w+)\s*:\s*(.+)')
        dimensions = []
        identifiers = {}
        ds_type = 'dataset'
        name = 'Unnamed dataset'
        sp_format = False
        nodepos = None

        # Read header lines from file.
        line = fd.readline()
        while line:
            m = header_re.match(line)
            if not m:
                break
            key, val = m.groups()

            if key == 'dimension':
                # The line is on the form;
                # dimension: dimname id1 id2 id3 ...
                values = [v.strip() for v in val.split(' ')]
                dimensions.append(values[0])
                identifiers[values[0]] = values[1:]
            elif key == 'type':
                # Should be dataset, category, or network
                ds_type = val
            elif key == 'name':
                name = val
            elif key == 'sp_format':
                # If sp_format is True the elements are stored in the
                # sparse format read by _read_sparse_elements.
                if val in ('False', 'false', '0', 'F', 'f'):
                    sp_format = False
                elif val in ('True', 'true', '1', 'T', 't'):
                    sp_format = True
                else:
                    # Report the value found in the file, not the default.
                    raise ValueError("sp_format: %s not valid " % val)
            elif key == 'nodepos':
                # One "x,y" pair per identifier of the first dimension,
                # in the order the identifiers were listed.
                node_dim = dimensions[0]
                nodepos = {}
                for node_id, xy in zip(identifiers[node_dim], val.split(" ")):
                    x, y = map(float, xy.split(","))
                    nodepos[node_id] = (x, y)

            line = fd.readline()

        # Dimensions in the form [(dim1, [id1, id2, id3 ..]) ...]
        dims = [(dim, identifiers[dim]) for dim in dimensions]
        dim_lengths = [len(identifiers[dim]) for dim in dimensions]

        # Create matrix and run the matching element reader.
        if sp_format:
            matrix = _read_sparse_elements(fd, sparse.lil_matrix(dim_lengths))
        elif ds_type == 'category':
            matrix = _read_elements(fd, empty(dim_lengths, dtype='i'))
        else:
            matrix = _read_elements(fd, empty(dim_lengths))

        # Create dataset of specified type
        if ds_type == 'category':
            ds = CategoryDataset(matrix, dims, name)
        elif ds_type == 'network':
            ds = GraphDataset(matrix, dims, name=name, nodepos=nodepos)
        else:
            ds = Dataset(matrix, dims, name)
    finally:
        if opened:
            fd.close()

    return ds
def _write_sparse_elements(fd, arr, fmt='%d', sep=None): def _write_sparse_elements(fd, arr, fmt='%d', sep=None):
""" Sparse coordinate format.""" """ Sparse coordinate format."""
fd.write('# sp_format: True\n\n') fd.write('# sp_format: True\n\n')
@ -530,97 +730,3 @@ def _read_sparse_elements(fd, arr, sep=None):
line = fd.readline() line = fd.readline()
return arr.tocsr() return arr.tocsr()
def read_ftsv(fd, sep=None):
    """Read a dataset in fluents tab separated values (ftsv) form and return it.

    Header lines have the form ``# key: value`` and are read until the
    first line that does not match. The recognised keys are
    ``dimension``, ``type``, ``name`` and ``sp_format``; other keys are
    skipped. The remaining lines are read by the element readers.

    @param fd: An open file object, or a str that is passed to open().
    @param sep: Not used by this function.
    @return: A Dataset, CategoryDataset or GraphDataset depending on the
        ``type`` header.
    @raise ValueError: If the ``sp_format`` header holds an unknown value.
    """
    opened = False
    if isinstance(fd, str):
        fd = open(fd)
        opened = True

    false_words = ('False', 'false', '0', 'F', 'f')
    true_words = ('True', 'true', '1', 'T', 't')

    # try/finally so a file opened here is closed even when parsing fails.
    try:
        split_re = re.compile(r'^#\s*(\w+)\s*:\s*(.+)')
        dimensions = []
        identifiers = {}
        ds_type = 'dataset'
        name = 'Unnamed dataset'
        sp_format = False

        # Read header lines from file.
        line = fd.readline()
        while line:
            m = split_re.match(line)
            if m is None:
                break
            key, val = m.groups()

            if key == 'dimension':
                # The line is on the form;
                # dimension: dimname id1 id2 id3 ...
                values = [v.strip() for v in val.split(' ')]
                dimensions.append(values[0])
                identifiers[values[0]] = values[1:]
            elif key == 'type':
                # Should be dataset, category, or network
                ds_type = val
            elif key == 'name':
                name = val
            elif key == 'sp_format':
                if val in false_words:
                    sp_format = False
                elif val in true_words:
                    sp_format = True
                else:
                    # Report the value found in the file, not the default.
                    raise ValueError("sp_format: %s not valid " % val)

            line = fd.readline()

        # Dimensions in the form [(dim1, [id1, id2, id3 ..]) ...]
        dims = [(x, identifiers[x]) for x in dimensions]
        dim_lengths = [len(identifiers[x]) for x in dimensions]

        # Create matrix and run the matching element reader. Sparse input
        # always gets a sparse container: _read_sparse_elements finishes
        # with arr.tocsr(), which a dense array does not provide.
        if sp_format:
            matrix = sparse.lil_matrix(dim_lengths)
            matrix = _read_sparse_elements(fd, matrix)
        else:
            if ds_type == 'category':
                matrix = empty(dim_lengths, dtype='i')
            else:
                matrix = empty(dim_lengths)
            matrix = _read_elements(fd, matrix)

        # Create dataset of specified type
        if ds_type == 'category':
            ds = CategoryDataset(matrix, dims, name)
        elif ds_type == 'network':
            ds = GraphDataset(matrix, dims, name)
        else:
            ds = Dataset(matrix, dims, name)
    finally:
        if opened:
            fd.close()

    return ds