Sparse network support and nodepos read/write
This commit is contained in:
parent
bf29661af9
commit
ab9c1ec84b
|
@ -1,4 +1,5 @@
|
||||||
from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros,empty,sparse
|
from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros,empty,sparse,\
|
||||||
|
where
|
||||||
from scipy import sort as array_sort
|
from scipy import sort as array_sort
|
||||||
from itertools import izip
|
from itertools import izip
|
||||||
import shelve
|
import shelve
|
||||||
|
@ -6,7 +7,7 @@ import copy
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
class Dataset:
|
class Dataset(object):
|
||||||
"""The Dataset base class.
|
"""The Dataset base class.
|
||||||
|
|
||||||
A Dataset is an n-way array with defined string identifiers across
|
A Dataset is an n-way array with defined string identifiers across
|
||||||
|
@ -273,20 +274,20 @@ class CategoryDataset(Dataset):
|
||||||
def __init__(self, array, identifiers=None, name='C'):
|
def __init__(self, array, identifiers=None, name='C'):
|
||||||
Dataset.__init__(self, array, identifiers=identifiers, name=name)
|
Dataset.__init__(self, array, identifiers=identifiers, name=name)
|
||||||
|
|
||||||
def asspmatrix(self):
|
def as_spmatrix(self):
|
||||||
if isinstance(self._array, sparse.spmatrix):
|
if isinstance(self._array, sparse.spmatrix):
|
||||||
return self._array
|
return self._array
|
||||||
else:
|
else:
|
||||||
arr = self.asarray()
|
arr = self.asarray()
|
||||||
return sparse.csr_matrix(arr.astype('i'))
|
return sparse.csr_matrix(arr.astype('i'))
|
||||||
|
|
||||||
def tospmatrix(self):
|
def to_spmatrix(self):
|
||||||
if isinstance(self._array, sparse.spmatrix):
|
if isinstance(self._array, sparse.spmatrix):
|
||||||
self._array = self._array.tocsr()
|
self._array = self._array.tocsr()
|
||||||
else:
|
else:
|
||||||
self._array = sparse.scr_matrix(self._array)
|
self._array = sparse.scr_matrix(self._array)
|
||||||
|
|
||||||
def as_dict_lists(self):
|
def as_dictlists(self):
|
||||||
"""Returns data as dict of identifiers along first dim.
|
"""Returns data as dict of identifiers along first dim.
|
||||||
|
|
||||||
ex: data['gene_1'] = ['map0030','map0010', ...]
|
ex: data['gene_1'] = ['map0030','map0010', ...]
|
||||||
|
@ -334,73 +335,163 @@ class CategoryDataset(Dataset):
|
||||||
class GraphDataset(Dataset):
|
class GraphDataset(Dataset):
|
||||||
"""The graph dataset class.
|
"""The graph dataset class.
|
||||||
|
|
||||||
A dataset class for representing graphs using an (weighted)
|
A dataset class for representing graphs. The constructor may use an
|
||||||
adjacency matrix
|
incidence matrix (possibly sparse) or (if networkx installed) a
|
||||||
(restricted to square symmetric matrices)
|
networkx.(X)Graph structure.
|
||||||
|
|
||||||
If the library NetworkX is installed, there is support for
|
If the networkx library is installed, there is support for
|
||||||
representing the graph as a NetworkX.Graph, or NetworkX.XGraph structure.
|
representing the graph as a networkx.Graph, or networkx.XGraph structure.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, array, identifiers=None, name='A'):
|
def __init__(self, input, identifiers=None, name='A', nodepos = None):
|
||||||
Dataset.__init__(self, array=array, identifiers=identifiers, name=name)
|
if isinstance(input, sparse.spmatrix):
|
||||||
|
arr = input
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
arr = asarray(input)
|
||||||
|
except:
|
||||||
|
raise ValueError("Could not identify input")
|
||||||
|
Dataset.__init__(self, array=arr, identifiers=identifiers, name=name)
|
||||||
self._graph = None
|
self._graph = None
|
||||||
self._pos = None
|
self.nodepos = nodepos
|
||||||
|
|
||||||
def asnetworkx(self, nx_type='graph'):
|
def as_spmatrix(self):
|
||||||
dim = self.get_dim_name()[0]
|
if isinstance(self._array, sparse.spmatrix):
|
||||||
ids = self.get_identifiers(dim, sorted=True)
|
return self._array
|
||||||
adj_mat = self.asarray()
|
else:
|
||||||
G = self._graph_from_adj_matrix(adj_mat, labels=ids)
|
arr = self.asarray()
|
||||||
|
return sparse.csr_matrix(arr.astype('i'))
|
||||||
|
|
||||||
|
def to_spmatrix(self):
    """Convert the stored array to scipy CSR sparse format, in place.

    If self._array is already a scipy sparse matrix it is converted with
    tocsr(); any other array is handed to the csr_matrix constructor.
    The result replaces self._array; nothing is returned.
    """
    if isinstance(self._array, sparse.spmatrix):
        self._array = self._array.tocsr()
    else:
        # The constructor is csr_matrix ("compressed sparse row"); the
        # previous spelling "scr_matrix" does not exist in scipy.sparse.
        self._array = sparse.csr_matrix(self._array)
|
||||||
|
|
||||||
|
def asnetworkx(self):
|
||||||
|
if self._graph != None:
|
||||||
|
return self._graph
|
||||||
|
dim0, dim1 = self.get_dim_name()
|
||||||
|
node_ids = self.get_identifiers(dim0, sorted=True)
|
||||||
|
edge_ids = self.get_identifiers(dim1, sorted=True)
|
||||||
|
G = self._graph_from_incidence_matrix(self._array, node_ids=node_ids, edge_ids=edge_ids)
|
||||||
self._graph = G
|
self._graph = G
|
||||||
return G
|
return G
|
||||||
|
|
||||||
def _graph_from_adj_matrix(self, A, labels=None):
|
def from_networkx(cls, G, node_dim, edge_dim, sp_format=True):
|
||||||
"""Creates a networkx graph class from adjacency
|
"""Create graph dataset from networkx graph.
|
||||||
(possibly weighted) matrix and ordered labels.
|
|
||||||
|
|
||||||
nx_type = ['graph',['xgraph']]
|
When G is a Graph/Digraph edge identifiers will be created,
|
||||||
labels = None, results in string-numbered labels
|
else (XGraph/XDiGraph) it is assumed that edge attributes are
|
||||||
|
the edge identifiers.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import networkx as nx
|
||||||
|
n = G.number_of_nodes()
|
||||||
|
m = G.number_of_edges()
|
||||||
|
|
||||||
|
if isinstance(G, nx.DiGraph):
|
||||||
|
G = nx.XDiGraph(G)
|
||||||
|
G = G.to_directed()
|
||||||
|
elif isinstance(G, nx.Graph):
|
||||||
|
G = nx.XGraph(G)
|
||||||
|
|
||||||
|
edge_ids = [e[2] for e in G.edges()]
|
||||||
|
node_ids = map(str, G.nodes())
|
||||||
|
n2ind = {}
|
||||||
|
for ind, node in enumerate(node_ids):
|
||||||
|
n2ind[node] = ind
|
||||||
|
|
||||||
|
if sp_format:
|
||||||
|
I = sparse.lil_matrix((n, m))
|
||||||
|
else:
|
||||||
|
I = zeros((m, n), dtype='i')
|
||||||
|
|
||||||
|
for i, (h, t, eid) in enumerate(G.edges()):
|
||||||
|
if eid != None:
|
||||||
|
edge_ids[i] = eid
|
||||||
|
else:
|
||||||
|
edge_ids[i] = 'e_' + str(i)
|
||||||
|
hind = n2ind[str(h)]
|
||||||
|
tind = n2ind[str(t)]
|
||||||
|
I[hind, i] = 1
|
||||||
|
if G.is_directed():
|
||||||
|
I[tind, i] = -1
|
||||||
|
else:
|
||||||
|
I[tind, i] = 1
|
||||||
|
idents = [[node_dim, node_ids], [edge_dim, edge_ids]]
|
||||||
|
if G.name != '':
|
||||||
|
name = G.name
|
||||||
|
else:
|
||||||
|
name = 'A'
|
||||||
|
ds = GraphDataset(I, idents, name)
|
||||||
|
return ds
|
||||||
|
|
||||||
|
from_networkx = classmethod(from_networkx)
|
||||||
|
|
||||||
|
def _incidence2adjacency(self, I):
|
||||||
|
"""Incidence to adjacency matrix.
|
||||||
|
|
||||||
|
I*I.T - eye(n)?
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def _graph_from_incidence_matrix(self, I, node_ids, edge_ids):
|
||||||
|
"""Creates a networkx graph class from incidence
|
||||||
|
(possibly weighted) matrix and ordered labels.
|
||||||
|
|
||||||
|
node_ids and edge_ids label the rows (nodes) and columns (edges) of I.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
import networkx as nx
|
import networkx as nx
|
||||||
except:
|
except:
|
||||||
print "Failed in import of NetworkX"
|
print "Failed in import of NetworkX"
|
||||||
return None
|
return None
|
||||||
m, n = A.shape # adjacency matrix must be of type that evals to true/false for neigbours
|
|
||||||
if m != n:
|
|
||||||
raise IOError, "Adjacency matrix must be square"
|
|
||||||
|
|
||||||
if A[A[:,0].nonzero()[0][0],0] == 1: #unweighted graph
|
m, n = I.shape
|
||||||
G = nx.Graph()
|
assert(m == len(node_ids))
|
||||||
else:
|
assert(n == len(edge_ids))
|
||||||
G = nx.XGraph()
|
weights = []
|
||||||
|
directed = False
|
||||||
if labels == None: # if labels not provided mark vertices with numbers
|
G = nx.XDiGraph(name=self._name)
|
||||||
labels = [str(i) for i in range(m)]
|
if isinstance(I, sparse.spmatrix):
|
||||||
|
I = I.tocsr()
|
||||||
for nbrs, head in izip(A, labels):
|
for ename, col in izip(edge_ids, I.T):
|
||||||
for i, nbr in enumerate(nbrs):
|
if isinstance(I, sparse.spmatrix):
|
||||||
if nbr:
|
node_ind = col.indices
|
||||||
tail = labels[i]
|
w1, w2 = col.data
|
||||||
if type(G)==nx.XGraph:
|
else:
|
||||||
G.add_edge(head, tail, nbr)
|
node_ind = where(col != 0)[0]
|
||||||
else:
|
w1, w2 = col[node_ind]
|
||||||
G.add_edge(head, tail)
|
node1 = node_ids[node_ind[0]]
|
||||||
return G
|
node2 = node_ids[node_ind[1]]
|
||||||
|
if w1 < 0: # w1 is tail
|
||||||
|
directed = True
|
||||||
|
assert(w2 > 0 and (w1 + w2) == 0)
|
||||||
|
G.add_edge(node2, node1, ename)
|
||||||
|
weights.append(w2)
|
||||||
|
else: #w2 is tail or graph is undirected
|
||||||
|
assert(w1 > 0)
|
||||||
|
if w2 < 0:
|
||||||
|
directed = True
|
||||||
|
G.add_edge(node1, node2, ename)
|
||||||
|
weights.append(w1)
|
||||||
|
if not directed:
|
||||||
|
G = G.to_undirected()
|
||||||
|
return G, asarray(weights)
|
||||||
|
|
||||||
Dataset._all_dims = set()
|
Dataset._all_dims = set()
|
||||||
|
|
||||||
|
|
||||||
class ReverseDict(dict):
|
class ReverseDict(dict):
|
||||||
"""
|
"""A dictionary which can lookup values by key, and keys by value.
|
||||||
A dictionary which can lookup values by key, and keys by value.
|
|
||||||
All values and keys must be hashable, and unique.
|
All values and keys must be hashable, and unique.
|
||||||
|
|
||||||
d = ReverseDict((['a',1],['b',2]))
|
example:
|
||||||
print d['a'] --> 1
|
>>d = ReverseDict((['a',1],['b',2]))
|
||||||
print d.reverse[1] --> 'a'
|
>>print d['a'] --> 1
|
||||||
|
>>print d.reverse[1] --> 'a'
|
||||||
"""
|
"""
|
||||||
def __init__(self, *args, **kw):
|
def __init__(self, *args, **kw):
|
||||||
dict.__init__(self, *args, **kw)
|
dict.__init__(self, *args, **kw)
|
||||||
|
@ -436,6 +527,7 @@ class Selection(dict):
|
||||||
def select(self, axis, labels):
|
def select(self, axis, labels):
|
||||||
self[axis] = labels
|
self[axis] = labels
|
||||||
|
|
||||||
|
|
||||||
def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True):
|
def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True):
|
||||||
"""Writes a dataset in fluents tab separated values (ftsv) form.
|
"""Writes a dataset in fluents tab separated values (ftsv) form.
|
||||||
|
|
||||||
|
@ -471,16 +563,23 @@ def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True):
|
||||||
fd.write('# type: %s' %type + '\n')
|
fd.write('# type: %s' %type + '\n')
|
||||||
|
|
||||||
for dim in ds.get_dim_name():
|
for dim in ds.get_dim_name():
|
||||||
print >> fd, "# dimension: %s" % dim,
|
fd.write("# dimension: %s" % dim)
|
||||||
for id in ds.get_identifiers(dim, None, True):
|
for ident in ds.get_identifiers(dim, sorted=True):
|
||||||
print >> fd, id,
|
fd.write(" " + ident)
|
||||||
print >> fd
|
fd.write("\n")
|
||||||
|
|
||||||
print >> fd, "# name: %s" % ds.get_name()
|
fd.write("# name: %s" % ds.get_name() + '\n')
|
||||||
|
# xy-node-positions
|
||||||
|
if ds.nodepos != None:
|
||||||
|
fd.write("# nodepos:")
|
||||||
|
node_dim = ds.get_dim_name(0)
|
||||||
|
for ident in ds.get_identifiers(node_dim, sorted=True):
|
||||||
|
fd.write(" %s,%s" %ds.nodepos[ident])
|
||||||
|
fd.write("\n")
|
||||||
|
|
||||||
# Write data
|
# Write data
|
||||||
if hasattr(ds, "asspmatrix") and sp_format == True:
|
if hasattr(ds, "as_spmatrix") and sp_format == True:
|
||||||
m = ds.asspmatrix()
|
m = ds.as_spmatrix()
|
||||||
else:
|
else:
|
||||||
m = ds.asarray()
|
m = ds.asarray()
|
||||||
if isinstance(m, sparse.spmatrix):
|
if isinstance(m, sparse.spmatrix):
|
||||||
|
@ -491,6 +590,107 @@ def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True):
|
||||||
if opened:
|
if opened:
|
||||||
fd.close()
|
fd.close()
|
||||||
|
|
||||||
|
def read_ftsv(fd, sep=None):
    """Read a dataset in fluents tab separated values (ftsv) form and return it.

    The file starts with header lines of the form ``# key: value``.  The
    recognised keys are ``dimension``, ``type``, ``name``, ``sp_format`` and
    ``nodepos``; header lines with any other key are ignored.  The first
    line that is not a header line ends the header (that line is consumed)
    and the remainder of the file is handed to the element reader.

    @param fd: An open file object, or the name of a file to open.  A file
        opened here from a name is always closed again, also when reading
        fails.
    @param sep: Not used by this function; kept so existing callers that
        pass it keep working.
    @return: A CategoryDataset when the type header is ``category``, a
        GraphDataset when it is ``network``, otherwise a Dataset.
    @raise ValueError: If the ``sp_format`` value is not a recognised
        boolean spelling, or a ``dimension`` / ``nodepos`` header is
        malformed.
    """
    opened = False
    if isinstance(fd, str):
        fd = open(fd)
        opened = True

    try:
        (dimensions, identifiers, ds_type,
         name, sp_format, nodepos) = _parse_ftsv_header(fd)

        # Dimensions in the form [(dim1, [id1, id2, id3, ...]), ...]
        dims = [(dim, identifiers[dim]) for dim in dimensions]
        # scipy.sparse only treats a *tuple* as a shape; a list would be
        # interpreted as dense data to convert.
        shape = tuple([len(identifiers[dim]) for dim in dimensions])

        # Create the matrix and fill it with the matching element reader.
        if sp_format:
            matrix = _read_sparse_elements(fd, sparse.lil_matrix(shape))
        else:
            if ds_type == 'category':
                matrix = empty(shape, dtype='i')
            else:
                matrix = empty(shape)
            matrix = _read_elements(fd, matrix)

        # Create dataset of specified type
        if ds_type == 'category':
            ds = CategoryDataset(matrix, dims, name)
        elif ds_type == 'network':
            ds = GraphDataset(matrix, dims, name=name, nodepos=nodepos)
        else:
            ds = Dataset(matrix, dims, name)
    finally:
        # Only close what was opened here; a caller-supplied file object
        # stays open.
        if opened:
            fd.close()

    return ds


def _parse_ftsv_header(fd):
    """Consume the ftsv header lines of fd and return the parsed fields.

    @param fd: An open file object positioned at the start of the header.
    @return: The tuple (dimensions, identifiers, ds_type, name, sp_format,
        nodepos).  ``dimensions`` lists dimension names in file order,
        ``identifiers`` maps each name to its list of identifiers, and
        ``nodepos`` is None or a dict mapping node identifier to (x, y).
    @raise ValueError: On an invalid ``sp_format`` value or a malformed
        ``dimension`` / ``nodepos`` header.
    """
    split_re = re.compile(r'^#\s*(\w+)\s*:\s*(.+)')
    dimensions = []
    identifiers = {}
    ds_type = 'dataset'
    name = 'Unnamed dataset'
    sp_format = False
    nodepos = None

    line = fd.readline()
    while line:
        m = split_re.match(line)
        if not m:
            # First non-header line: the header is finished.
            break
        key, val = m.groups()
        # Drop trailing whitespace (e.g. '\r' from CRLF files) so values
        # compare equal to the expected keywords.
        val = val.strip()

        if key == 'dimension':
            # The line is on the form;
            # dimension: dimname id1 id2 id3 ...
            values = val.split()
            if not values:
                raise ValueError("dimension header without a dimension name")
            dimensions.append(values[0])
            identifiers[values[0]] = values[1:]

        elif key == 'type':
            # Should be dataset, category, or network
            ds_type = val

        elif key == 'name':
            name = val

        elif key == 'sp_format':
            # Storage format: if True the elements are coordinate triplets.
            if val in ('False', 'false', '0', 'F', 'f'):
                sp_format = False
            elif val in ('True', 'true', '1', 'T', 't'):
                sp_format = True
            else:
                # Report the offending text, not the current flag value.
                raise ValueError("sp_format: %s not valid " % val)

        elif key == 'nodepos':
            # Positions are "x,y" pairs listed in the same order as the
            # identifiers of the first dimension.
            if not dimensions:
                raise ValueError(
                    "nodepos header found before any dimension header")
            idents = identifiers[dimensions[0]]
            xys = val.split()
            if len(xys) != len(idents):
                raise ValueError(
                    "nodepos has %d positions for %d identifiers"
                    % (len(xys), len(idents)))
            nodepos = {}
            for node_id, xy in zip(idents, xys):
                x, y = [float(coord) for coord in xy.split(",")]
                nodepos[node_id] = (x, y)

        line = fd.readline()

    return dimensions, identifiers, ds_type, name, sp_format, nodepos
|
||||||
|
|
||||||
def _write_sparse_elements(fd, arr, fmt='%d', sep=None):
|
def _write_sparse_elements(fd, arr, fmt='%d', sep=None):
|
||||||
""" Sparse coordinate format."""
|
""" Sparse coordinate format."""
|
||||||
fd.write('# sp_format: True\n\n')
|
fd.write('# sp_format: True\n\n')
|
||||||
|
@ -530,97 +730,3 @@ def _read_sparse_elements(fd, arr, sep=None):
|
||||||
line = fd.readline()
|
line = fd.readline()
|
||||||
return arr.tocsr()
|
return arr.tocsr()
|
||||||
|
|
||||||
def read_ftsv(fd, sep=None):
|
|
||||||
"""Read a dataset in fluents tab separated values (ftsv) form and return it.
|
|
||||||
|
|
||||||
@param fd: An open file descriptor.
|
|
||||||
@return: A Dataset, CategoryDataset or GraphDataset depending on the information
|
|
||||||
read.
|
|
||||||
"""
|
|
||||||
opened = False
|
|
||||||
if isinstance(fd, str):
|
|
||||||
fd = open(fd)
|
|
||||||
opened = True
|
|
||||||
|
|
||||||
split_re = re.compile('^#\s*(\w+)\s*:\s*(.+)')
|
|
||||||
dimensions = []
|
|
||||||
identifiers = {}
|
|
||||||
type = 'dataset'
|
|
||||||
name = 'Unnamed dataset'
|
|
||||||
sp_format = False
|
|
||||||
# graphtype = 'graph'
|
|
||||||
|
|
||||||
# Read header lines from file.
|
|
||||||
line = fd.readline()
|
|
||||||
while line:
|
|
||||||
m = split_re.match(line)
|
|
||||||
if m:
|
|
||||||
key, val = m.groups()
|
|
||||||
|
|
||||||
# The line is on the form;
|
|
||||||
# dimension: dimname id1 id2 id3 ...
|
|
||||||
if key == 'dimension':
|
|
||||||
values = [v.strip() for v in val.split(' ')]
|
|
||||||
dimensions.append(values[0])
|
|
||||||
identifiers[values[0]] = values[1:]
|
|
||||||
|
|
||||||
# Read type of dataset.
|
|
||||||
# Should be dataset, category, or network
|
|
||||||
elif key == 'type':
|
|
||||||
type = val
|
|
||||||
|
|
||||||
elif key == 'name':
|
|
||||||
name = val
|
|
||||||
|
|
||||||
# storage format
|
|
||||||
# if sp_format is True then use coordinate triplets
|
|
||||||
elif key == 'sp_format':
|
|
||||||
if val in ['False', 'false', '0', 'F', 'f',]:
|
|
||||||
sp_format = False
|
|
||||||
elif val in ['True', 'true', '1', 'T', 't']:
|
|
||||||
sp_format = True
|
|
||||||
else:
|
|
||||||
raise ValueError("sp_format: %s not valid " %sp_format)
|
|
||||||
|
|
||||||
# elif key == 'graphtype':
|
|
||||||
# graphtype = val
|
|
||||||
|
|
||||||
else:
|
|
||||||
break
|
|
||||||
line = fd.readline()
|
|
||||||
|
|
||||||
# Dimensions in the form [(dim1, [id1, id2, id3 ..) ...]
|
|
||||||
dims = [(x, identifiers[x]) for x in dimensions]
|
|
||||||
dim_lengths = [len(identifiers[x]) for x in dimensions]
|
|
||||||
|
|
||||||
# Create matrix and assign element reader
|
|
||||||
if type == 'category':
|
|
||||||
if sp_format:
|
|
||||||
matrix = sparse.lil_matrix(dim_lengths)
|
|
||||||
else:
|
|
||||||
matrix = empty(dim_lengths, dtype='i')
|
|
||||||
elif type == 'network':
|
|
||||||
matrix = empty(dim_lengths)
|
|
||||||
else:
|
|
||||||
matrix = empty(dim_lengths)
|
|
||||||
|
|
||||||
if sp_format:
|
|
||||||
matrix = _read_sparse_elements(fd, matrix)
|
|
||||||
else:
|
|
||||||
matrix = _read_elements(fd, matrix)
|
|
||||||
|
|
||||||
|
|
||||||
# Create dataset of specified type
|
|
||||||
if type == 'category':
|
|
||||||
ds = CategoryDataset(matrix, dims, name)
|
|
||||||
elif type == 'network':
|
|
||||||
ds = GraphDataset(matrix, dims, name)
|
|
||||||
else:
|
|
||||||
ds = Dataset(matrix, dims, name)
|
|
||||||
|
|
||||||
if opened:
|
|
||||||
fd.close()
|
|
||||||
|
|
||||||
return ds
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Reference in New Issue