2008-01-08 01:43:56 +01:00
|
|
|
from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros,empty,sparse,\
|
|
|
|
where
|
2006-08-01 11:45:53 +02:00
|
|
|
from scipy import sort as array_sort
|
|
|
|
from itertools import izip
|
2006-08-13 13:35:12 +02:00
|
|
|
import shelve
|
2006-09-16 12:36:57 +02:00
|
|
|
import copy
|
2007-01-10 18:35:58 +01:00
|
|
|
import re
|
2006-04-17 00:57:50 +02:00
|
|
|
|
2008-01-06 18:01:00 +01:00
|
|
|
|
2008-01-08 01:43:56 +01:00
|
|
|
class Dataset(object):
    """The Dataset base class.

    A Dataset is an n-way array with defined string identifiers across
    all dimensions.

    example of use:

    ---
    dim_name_rows = 'rows'
    names_rows = ('row_a','row_b')
    ids_1 = [dim_name_rows, names_rows]

    dim_name_cols = 'cols'
    names_cols = ('col_a','col_b','col_c','col_d')
    ids_2 = [dim_name_cols, names_cols]

    Array_X = rand(2,4)
    data = Dataset(Array_X,(ids_1,ids_2),name="Testing")

    dim_names = [dim for dim in data]

    column_identifiers = [id for id in data['cols'].keys()]
    column_index = [index for index in data['cols'].values()]

    'cols' in data -> True

    ---

    data = Dataset(rand(10,20)) (generates dims and ids (no links))
    """

    def __init__(self, array, identifiers=None, name='Unnamed dataset'):
        """Create a dataset from an array and optional identifiers.

        @param array: Dense array-like data or a scipy sparse matrix.
        @param identifiers: Sequence of (dim_name, ids) pairs, one per axis.
            When omitted, dimension names and identifiers are generated.
        @param name: Name of the dataset.
        """
        self._dims = []  # existing dimensions in this dataset
        self._map = {}  # internal mapping for dataset: identifier <--> index
        self._name = name
        self._identifiers = identifiers

        if not isinstance(array, sparse.spmatrix):
            array = atleast_2d(asarray(array))
            # a single row is treated as a vector and stored as a column
            if array.shape[0] == 1:
                array = array.T
        self.shape = array.shape

        if identifiers is not None:
            self._validate_identifiers(identifiers)
        else:
            identifiers = self._create_identifiers(self.shape, self._all_dims)
            self._identifiers = identifiers
        self._set_identifiers(identifiers, self._all_dims)
        self._array = array

    def __iter__(self):
        """Returns an iterator over dimensions of dataset."""
        return iter(self._dims)

    def __contains__(self, dim):
        """Returns True if dim is a dimension name in dataset."""
        return dim in self._map

    def __len__(self):
        """Returns the number of dimensions in the dataset"""
        return len(self._map)

    def __getitem__(self, dim):
        """Return the identifers along the dimension dim."""
        return self._map[dim]

    def _create_identifiers(self, shape, all_dims):
        """Creates dimension names and identifier names, and returns
        identifiers.

        Every generated dimension name is also added to all_dims.
        """
        dim_names = ['rows', 'cols']
        ids = []
        for axis, n in enumerate(shape):
            if axis < 2:
                dim_suggestion = dim_names[axis]
            else:
                dim_suggestion = 'dim'
            dim_suggestion = self._suggest_dim_name(dim_suggestion, all_dims)
            identifier_creation = ["%d_%d" % (axis, i) for i in range(n)]
            ids.append((dim_suggestion, identifier_creation))
            all_dims.add(dim_suggestion)
        return ids

    def _set_identifiers(self, identifiers, all_dims):
        """Creates internal mapping of identifiers structure.

        Raises ValueError when a dimension name occurs more than once.
        """
        for dim, ids in identifiers:
            if dim in self._dims:
                raise ValueError("Dimension names must be unique whitin dataset")
            self._dims.append(dim)
            all_dims.add(dim)
            pos_map = ReverseDict()
            for pos, ident in enumerate(ids):
                pos_map[ident] = pos
            self._map[dim] = pos_map

    def _suggest_dim_name(self, dim_name, all_dims):
        """Suggests a unique name for dim and returns it"""
        c = 0
        new_name = dim_name
        while new_name in all_dims:
            new_name = dim_name + "_" + str(c)
            c += 1
        return new_name

    def asarray(self):
        """Returns the numeric array (data) of dataset.

        Sparse storage is converted to a dense array.
        """
        if isinstance(self._array, sparse.spmatrix):
            return self._array.toarray()
        return self._array

    def set_array(self, array):
        """Replaces the data of the dataset.

        The new array must be an instance of the type of the current array
        and have the same shape as the dataset; otherwise ValueError is
        raised.
        """
        if not isinstance(array, type(self._array)):
            # the message is formatted before the exception is built;
            # applying '%' to the exception instance raised a TypeError
            raise ValueError(
                "Input array of type: %s does not match existing array type: %s"
                % (type(array), type(self._array)))
        if self.shape != array.shape:
            raise ValueError("Input array must be of similar dimensions as dataset")
        if isinstance(array, sparse.spmatrix):
            # asarray() on a sparse matrix yields a 0-d object array, so
            # sparse input is stored untouched
            self._array = array
        else:
            self._array = atleast_2d(asarray(array))

    def get_name(self):
        """Returns dataset name"""
        return self._name

    def get_all_dims(self):
        """Returns all dimensions in project"""
        return self._all_dims

    def get_dim_name(self, axis=None):
        """Returns dim name for an axis, if no axis is provided it
        returns a list of dims"""
        if isinstance(axis, int):
            return self._dims[axis]
        return list(self._dims)

    def common_dims(self, ds):
        """Returns a list of the common dimensions in the two datasets."""
        ds_dims = ds.get_dim_name()
        return [d for d in self.get_dim_name() if d in ds_dims]

    def get_identifiers(self, dim, indices=None, sorted=False):
        """Returns identifiers along dim, sorted by position (index)
        is optional.

        You can optionally provide a list/ndarray of indices to get
        only the identifiers of a given position.

        Identifiers are the unique names (strings) for a variable in a
        given dim. Index (Indices) are the Identifiers position in a
        matrix in a given dim.
        """
        # 'is not None' rather than '!= None': comparing an ndarray with
        # None is elementwise and has no single truth value
        if indices is not None:
            if len(indices) == 0:  # empty list or empty array
                return []
            reverse = self._map[dim].reverse
            return [reverse[i] for i in indices]
        if sorted:
            reverse = self._map[dim].reverse
            positions = array_sort(list(self._map[dim].values()))
            return [reverse[i] for i in positions]
        return list(self._map[dim].keys())

    def get_indices(self, dim, idents=None):
        """Returns indices for identifiers along dimension.

        You can optionally provide a list of identifiers to retrieve a
        index subset.

        Identifiers are the unique names (strings) for a variable in a
        given dim. Index (Indices) are the Identifiers position in a
        matrix in a given dim. If none of the input identifiers are
        found an empty index is returned
        """
        # None must be handled before the type check, otherwise the
        # default argument can never be used
        if idents is None:
            index = array_sort(list(self._map[dim].values()))
        else:
            if not isinstance(idents, (list, set)):
                raise ValueError("idents needs to be a list/set got: %s" % type(idents))
            index = [self._map[dim][key]
                     for key in idents if key in self._map[dim]]
        return asarray(index)

    def existing_identifiers(self, dim, idents):
        """Filters a list of identifiers to find those that are present in the
        dataset.

        The most common use of this function is to get a list of
        identifiers who correspond one to one with the list of indices produced
        when get_indices is given an identifier list. That is
        ds.get_indices(dim, idents) and ds.exisiting_identifiers(dim, idents)
        will have the same order.

        @param dim: A dimension present in the dataset.
        @param idents: A list of identifiers along the given dimension.
        @return: A list of identifiers in the same order as idents, but
        without elements not present in the dataset.
        """
        if not isinstance(idents, (list, set)):
            raise ValueError("idents needs to be a list/set got: %s" % type(idents))
        return [key for key in idents if key in self._map[dim]]

    def copy(self):
        """ Returns deepcopy of dataset.
        """
        return copy.deepcopy(self)

    def subdata(self, dim, idents):
        """Returns a new dataset based on dimension and given identifiers.

        Raises ValueError when none of the identifiers exist along dim.
        """
        ds = self.copy()
        indices = ds.get_indices(dim, idents)
        found = ds.get_identifiers(dim, indices=indices)
        if not found:
            # report the identifiers that were asked for, not the
            # (empty) list of matches
            raise ValueError("No of identifers from: \n%s \nfound in %s"
                             % (str(idents), ds._name))
        ax = ds._dims.index(dim)
        subarr = ds._array.take(indices, ax)
        ds._map[dim] = ReverseDict(zip(found, range(len(found))))
        # shape has to be updated first: set_array validates against it
        ds.shape = tuple(len(ds._map[d]) for d in ds._dims)
        ds.set_array(subarr)
        return ds

    def transpose(self):
        """Returns a copy of transpose of a dataset.

        As for the moment: only support for 2D-arrays.
        """
        assert(len(self.shape) == 2)
        ds = self.copy()
        ds._array = ds._array.T
        ds._dims.reverse()
        ds.shape = ds._array.shape
        return ds

    def _validate_identifiers(self, identifiers):
        """Checks that identifiers are unique and match the array shape.

        Raises ValueError on duplicate identifiers within a dimension, on
        a wrong number of dimensions, or on a length mismatch along an axis.
        """
        for dim_name, ids in identifiers:
            if len(set(ids)) != len(ids):
                raise ValueError("Identifiers not unique in : %s" % dim_name)
        identifier_shape = [len(i[1]) for i in identifiers]
        if len(identifier_shape) != len(self.shape):
            raise ValueError("Identifier list length must equal array dims")
        for ni, na in zip(identifier_shape, self.shape):
            if ni != na:
                raise ValueError(
                    "Identifier-array mismatch: %s: (idents: %s, array: %s)"
                    % (self._name, ni, na))
|
2007-06-22 17:35:45 +02:00
|
|
|
|
|
|
|
|
2006-04-24 11:53:07 +02:00
|
|
|
class CategoryDataset(Dataset):
    """The category dataset class.

    A dataset for representing class information as binary
    matrices (0/1-matrices).

    There is support for using a less memory demanding, sparse format. The
    prefered (default) format for a category dataset is the compressed sparse row
    format (csr)

    Always has linked dimension in first dim:
    ex matrix:
    .       go_term1 go_term2 ...
    gene_1
    gene_2
    gene_3
    .
    .
    .
    """

    def __init__(self, array, identifiers=None, name='C'):
        Dataset.__init__(self, array, identifiers=identifiers, name=name)

    def as_spmatrix(self):
        """Returns the data as a sparse matrix without changing the dataset.

        Sparse storage is returned as is; dense storage is cast to
        integers and converted to csr format.
        """
        if isinstance(self._array, sparse.spmatrix):
            return self._array
        arr = self.asarray()
        return sparse.csr_matrix(arr.astype('i'))

    def to_spmatrix(self):
        """Converts the internal storage of the dataset to csr format."""
        if isinstance(self._array, sparse.spmatrix):
            self._array = self._array.tocsr()
        else:
            # was sparse.scr_matrix, a name that does not exist
            self._array = sparse.csr_matrix(self._array)

    def as_dictlists(self):
        """Returns data as dict of identifiers along first dim.

        ex: data['gene_1'] = ['map0030','map0010', ...]

        Rows without any non-zero entry are left out. The result is also
        stored on the instance as _dictlists.

        fixme: Deprecated?
        """
        row_dim = self.get_dim_name(0)
        col_dim = self.get_dim_name(1)
        is_sparse = isinstance(self._array, sparse.spmatrix)
        array = self._array
        if is_sparse and not isinstance(array, sparse.csr_matrix):
            # convert once instead of once per row
            array = array.tocsr()

        data = {}
        for name, ind in self._map[row_dim].items():
            if is_sparse:
                indices = array[ind, :].indices
            else:
                indices = array[ind, :].nonzero()[0]
            if len(indices) == 0:  # should we allow categories with no members?
                continue
            data[name] = self.get_identifiers(col_dim, indices)
        self._dictlists = data
        return data

    def as_selections(self):
        """Returns data as a list of Selection objects.

        One selection is created per identifier along the second dim,
        holding the first-dim identifiers with a non-zero entry; columns
        without non-zero entries are left out.

        The list of selections is not ordered (sorted) by any means.

        Note: sparse storage that is not already csc is converted to csc
        and kept in that format on the instance.
        """
        row_dim = self.get_dim_name(0)
        col_dim = self.get_dim_name(1)
        is_sparse = isinstance(self._array, sparse.spmatrix)
        if is_sparse and not isinstance(self._array, sparse.csc_matrix):
            self._array = self._array.tocsc()

        ret_list = []
        for cat_name, ind in self._map[col_dim].items():
            if is_sparse:
                indices = self._array[:, ind].indices
            else:
                indices = self._array[:, ind].nonzero()[0]
            if len(indices) == 0:
                continue
            ids = self.get_identifiers(row_dim, indices)
            selection = Selection(cat_name)
            selection.select(row_dim, ids)
            ret_list.append(selection)
        return ret_list
|
|
|
|
|
2006-08-08 10:05:26 +02:00
|
|
|
|
2006-04-24 11:53:07 +02:00
|
|
|
class GraphDataset(Dataset):
    """The graph dataset class.

    A dataset class for representing graphs. The constructor may use an
    incidence matrix (possibly sparse) or (if networkx installed) a
    networkx.(X)Graph structure.

    If the networkx library is installed, there is support for
    representing the graph as a networkx.Graph, or networkx.XGraph structure.
    """

    def __init__(self, input, identifiers=None, name='A', nodepos=None):
        """Create a graph dataset from an incidence matrix.

        @param input: Incidence matrix (nodes x edges), dense or sparse.
        @param identifiers: Sequence of (dim_name, ids) pairs, or None.
        @param name: Name of the dataset.
        @param nodepos: Optional mapping of node identifier to (x, y).
        """
        if isinstance(input, sparse.spmatrix):
            arr = input
        else:
            try:
                arr = asarray(input)
            except Exception:
                raise ValueError("Could not identify input")
        Dataset.__init__(self, array=arr, identifiers=identifiers, name=name)
        self._graph = None
        self.nodepos = nodepos

    def as_spmatrix(self):
        """Returns the data as a sparse matrix without changing the dataset.

        Sparse storage is returned as is; dense storage is cast to
        integers and converted to csr format.
        """
        if isinstance(self._array, sparse.spmatrix):
            return self._array
        arr = self.asarray()
        return sparse.csr_matrix(arr.astype('i'))

    def to_spmatrix(self):
        """Converts the internal storage of the dataset to csr format."""
        if isinstance(self._array, sparse.spmatrix):
            self._array = self._array.tocsr()
        else:
            # was sparse.scr_matrix, a name that does not exist
            self._array = sparse.csr_matrix(self._array)

    def asnetworkx(self):
        """Returns the graph as a networkx graph, building it on first use.

        Returns None when networkx cannot be imported.
        """
        if self._graph is not None:
            return self._graph
        dim0, dim1 = self.get_dim_name()
        node_ids = self.get_identifiers(dim0, sorted=True)
        edge_ids = self.get_identifiers(dim1, sorted=True)
        result = self._graph_from_incidence_matrix(
            self._array, node_ids=node_ids, edge_ids=edge_ids)
        if result is None:
            # the helper returns None (not a pair) when the import fails
            return None
        self._graph = result[0]
        return self._graph

    @classmethod
    def from_networkx(cls, G, node_dim, edge_dim, sp_format=True):
        """Create graph dataset from networkx graph.

        When G is a Graph/Digraph edge identifiers will be created,
        else (XGraoh/XDigraph) it is assumed that edge attributes are
        the edge identifiers.
        """
        import networkx as nx
        n = G.number_of_nodes()
        m = G.number_of_edges()

        if isinstance(G, nx.DiGraph):
            G = nx.XDiGraph(G)
        elif isinstance(G, nx.Graph):
            G = nx.XGraph(G)

        # materialize: the node list is both enumerated and stored
        node_ids = [str(node) for node in G.nodes()]
        n2ind = dict((node, ind) for ind, node in enumerate(node_ids))

        # the incidence matrix is nodes x edges in both storage formats
        # (the dense variant used to be allocated transposed)
        if sp_format:
            I = sparse.lil_matrix((n, m))
        else:
            I = zeros((n, m), dtype='i')

        directed = G.is_directed()
        edge_ids = []
        for i, (h, t, eid) in enumerate(G.edges()):
            if eid is None:
                eid = 'e_' + str(i)
            edge_ids.append(eid)
            hind = n2ind[str(h)]
            tind = n2ind[str(t)]
            I[hind, i] = 1
            if directed:
                I[tind, i] = -1
            else:
                I[tind, i] = 1

        idents = [[node_dim, node_ids], [edge_dim, edge_ids]]
        if G.name != '':
            name = G.name
        else:
            name = 'A'
        return cls(I, idents, name)

    def _incidence2adjacency(self, I):
        """Incidence to adjacency matrix.

        I*I.T - eye(n)?
        """
        raise NotImplementedError

    def _graph_from_incidence_matrix(self, I, node_ids, edge_ids):
        """Creates a networkx graph class from incidence
        (possibly weighted) matrix and ordered labels.

        Every column of I is expected to hold exactly two non-zero
        entries (the two end nodes of an edge); a negative entry marks
        the tail of a directed edge.

        @return: (graph, weights) or None if networkx is not importable.
        """
        try:
            import networkx as nx
        except ImportError:
            print("Failed in import of NetworkX")
            return None

        m, n = I.shape
        assert(m == len(node_ids))
        assert(n == len(edge_ids))
        weights = []
        directed = False
        G = nx.XDiGraph(name=self._name)
        is_sparse = isinstance(I, sparse.spmatrix)
        if is_sparse:
            I = I.tocsr()
        for ename, col in zip(edge_ids, I.T):
            if is_sparse:
                node_ind = col.indices
                w1, w2 = col.data
            else:
                node_ind = where(col != 0)[0]
                w1, w2 = col[node_ind]
            node1 = node_ids[node_ind[0]]
            node2 = node_ids[node_ind[1]]
            if w1 < 0:  # w1 is tail
                directed = True
                assert(w2 > 0 and (w1 + w2) == 0)
                G.add_edge(node2, node1, ename)
                weights.append(w2)
            else:  # w2 is tail or graph is undirected
                assert(w1 > 0)
                if w2 < 0:
                    directed = True
                G.add_edge(node1, node2, ename)
                weights.append(w1)
        if not directed:
            G = G.to_undirected()
        return G, asarray(weights)
|
|
|
|
|
2007-01-31 12:54:54 +01:00
|
|
|
# Class-level set of dimension names, shared by all Dataset instances.
# Dataset.__init__ passes it to the identifier helpers, which add every
# dimension name to it and consult it when suggesting unique names.
Dataset._all_dims = set()
|
|
|
|
|
2006-08-30 01:57:21 +02:00
|
|
|
|
2006-08-08 10:05:26 +02:00
|
|
|
class ReverseDict(dict):
    """A dictionary which can lookup values by key, and keys by value.

    All values and keys must be hashable, and unique.

    The reverse mapping is kept in sync by the constructor, item
    assignment, item deletion and update(); other mutating dict methods
    (pop, setdefault, clear, ...) do not touch it.

    example:
    >>> d = ReverseDict((['a',1],['b',2]))
    >>> d['a']
    1
    >>> d.reverse[1]
    'a'
    """

    def __init__(self, *args, **kw):
        dict.__init__(self, *args, **kw)
        self.reverse = dict((v, k) for k, v in self.items())

    def __setitem__(self, key, value):
        try:
            reverse = self.reverse
        except AttributeError:
            # unpickling restores the items before the instance
            # attributes, so 'reverse' may not exist yet
            reverse = self.reverse = {}
        if key in self:
            # drop the reverse entry of the value being replaced
            stale = dict.__getitem__(self, key)
            if stale in reverse and reverse[stale] == key:
                del reverse[stale]
        dict.__setitem__(self, key, value)
        reverse[value] = key

    def __delitem__(self, key):
        value = dict.__getitem__(self, key)
        dict.__delitem__(self, key)
        if value in self.reverse and self.reverse[value] == key:
            del self.reverse[value]

    def update(self, *args, **kw):
        """Like dict.update, but keeps the reverse mapping in sync."""
        for key, value in dict(*args, **kw).items():
            self[key] = value
|
2006-08-13 13:35:12 +02:00
|
|
|
|
2007-01-31 12:54:54 +01:00
|
|
|
|
2006-08-30 01:57:21 +02:00
|
|
|
class Selection(dict):
    """Handles selected identifiers along each dimension of a dataset"""

    def __init__(self, title='Unnamed Selecton'):
        dict.__init__(self)
        self.title = title

    def __getitem__(self, key):
        """Returns the identifiers selected along key, or None if the
        dimension is not part of the selection."""
        return dict.get(self, key)

    def dims(self):
        """Returns a list of the dimensions that have a selection."""
        return list(self.keys())

    def axis_len(self, axis):
        """Returns the number of identifiers selected along axis
        (0 when nothing is selected along it)."""
        # the selection is stored in the dict itself; there is no
        # separate _selection attribute
        if axis in self:
            return len(dict.__getitem__(self, axis))
        return 0

    def select(self, axis, labels):
        """Sets the selected identifiers (labels) along axis."""
        self[axis] = labels
|
2007-01-10 18:35:58 +01:00
|
|
|
|
2008-01-08 01:43:56 +01:00
|
|
|
|
2008-01-07 14:13:27 +01:00
|
|
|
def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True):
    """Writes a dataset in laydi tab separated values (ftsv) form.

    @param fd: An open file descriptor to the output file, or a file name
        (the file is then opened for writing and closed again).
    @param ds: The dataset to be written.
    @param decimals: Number of decimals, only supported for dataset.
    @param sep: Value separator used for dense output.
    @param fmt: String formating. For a plain Dataset it is appended to
        '%.<decimals>' (e.g. 'f' or 'e'); for category and graph datasets
        it is used as given.
    @param sp_format: Write sparse coordinate format when the dataset
        offers as_spmatrix().
    The function handles datasets of these classes:
    Dataset, CategoryDataset and GraphDataset
    """
    opened = False
    if isinstance(fd, str):
        fd = open(fd, 'w')
        opened = True

    try:
        # Write header information
        if isinstance(ds, CategoryDataset):
            ds_type = 'category'
            if fmt is None:
                fmt = '%d'
        elif isinstance(ds, GraphDataset):
            ds_type = 'network'
            if fmt is None:
                fmt = '%d'
        elif isinstance(ds, Dataset):
            ds_type = 'dataset'
            if fmt is None:
                fmt = '%%.%df' % decimals
            else:
                fmt = '%%.%d' % decimals + fmt
        else:
            raise Exception("Unknown object type")
        fd.write('# type: %s' % ds_type + '\n')

        for dim in ds.get_dim_name():
            fd.write("# dimension: %s" % dim)
            for ident in ds.get_identifiers(dim, sorted=True):
                fd.write(" " + ident)
            fd.write("\n")

        fd.write("# name: %s" % ds.get_name() + '\n')

        # xy-node-positions
        if ds_type == 'network' and ds.nodepos is not None:
            fd.write("# nodepos:")
            node_dim = ds.get_dim_name(0)
            for ident in ds.get_identifiers(node_dim, sorted=True):
                fd.write(" %s,%s" % tuple(ds.nodepos[ident]))
            fd.write("\n")

        # Write data
        if sp_format and hasattr(ds, "as_spmatrix"):
            m = ds.as_spmatrix()
        else:
            m = ds.asarray()
        if isinstance(m, sparse.spmatrix):
            _write_sparse_elements(fd, m, fmt, sep)
        else:
            _write_elements(fd, m, fmt, sep)
    finally:
        # a file opened here is closed even when writing fails
        if opened:
            fd.close()
|
|
|
|
|
|
|
|
def read_ftsv(fd, sep=None):
    """Read a dataset in laydi tab separated values (ftsv) form and return it.

    @param fd: An open file descriptor, or a file name (the file is then
        opened for reading and closed again).
    @param sep: Accepted for interface compatibility; it is not used,
        values are split on whitespace.
    @return: A Dataset, CategoryDataset or GraphDataset depending on the information
    read.
    """
    opened = False
    if isinstance(fd, str):
        fd = open(fd)
        opened = True

    try:
        split_re = re.compile(r'^#\s*(\w+)\s*:\s*(.+)')
        dimensions = []
        identifiers = {}
        ds_type = 'dataset'
        name = 'Unnamed dataset'
        sp_format = False
        nodepos = None

        # Read header lines from file; the first line that is not a
        # '# key: value' line (normally a blank line) ends the header.
        line = fd.readline()
        while line:
            m = split_re.match(line)
            if not m:
                break
            key, val = m.groups()
            val = val.strip()  # drops trailing blanks and '\r'

            # The line is on the form;
            # dimension: dimname id1 id2 id3 ...
            if key == 'dimension':
                # split() instead of split(' '): repeated or trailing
                # blanks must not produce empty identifiers
                values = val.split()
                dimensions.append(values[0])
                identifiers[values[0]] = values[1:]

            # Read type of dataset.
            # Should be dataset, category, or network
            elif key == 'type':
                ds_type = val

            elif key == 'name':
                name = val

            # storage format
            # if sp_format is True then use coordinate triplets
            elif key == 'sp_format':
                if val in ('False', 'false', '0', 'F', 'f'):
                    sp_format = False
                elif val in ('True', 'true', '1', 'T', 't'):
                    sp_format = True
                else:
                    # report the offending text, not the current flag
                    raise ValueError("sp_format: %s not valid " % val)

            elif key == 'nodepos':
                node_dim = dimensions[0]
                idents = identifiers[node_dim]
                nodepos = {}
                for node_id, xy in zip(idents, val.split()):
                    x, y = [float(v) for v in xy.split(",")]
                    nodepos[node_id] = (x, y)

            line = fd.readline()

        # Dimensions in the form [(dim1, [id1, id2, id3 ..) ...]
        dims = [(x, identifiers[x]) for x in dimensions]
        # a tuple: a list would be read by lil_matrix as matrix data
        # rather than as a shape
        shape = tuple(len(identifiers[x]) for x in dimensions)

        # Create matrix; zeros() so that elements missing from the file
        # are well defined
        if sp_format:
            matrix = sparse.lil_matrix(shape)
        elif ds_type == 'category':
            matrix = zeros(shape, dtype='i')
        else:
            matrix = zeros(shape)

        if sp_format:
            matrix = _read_sparse_elements(fd, matrix)
        else:
            matrix = _read_elements(fd, matrix)

        # Create dataset of specified type
        if ds_type == 'category':
            ds = CategoryDataset(matrix, dims, name)
        elif ds_type == 'network':
            ds = GraphDataset(matrix, dims, name=name, nodepos=nodepos)
        else:
            ds = Dataset(matrix, dims, name)
    finally:
        # a file opened here is closed even when parsing fails
        if opened:
            fd.close()

    return ds
|
|
|
|
|
2009-02-10 00:05:09 +01:00
|
|
|
def write_csv(fd, ds, decimals=7, sep='\t'):
    """Write a dataset as comma/tab/whatever dilimited data.

    The first line holds the name of the row dimension followed by the
    column identifiers; every following line holds a row identifier
    followed by the values of that row. Each field, including the last
    one on a line, is followed by sep.

    @param fd: An open file descriptor to the output file, or a file name
        (the file is then opened for writing and closed again).
    @param ds: The dataset to be written.
    @param decimals: Number of decimals, only supported for dataset.
    @param sep: Value separator
    """
    ## Open file if a string is passed instead of a file descriptor
    opened = False
    if isinstance(fd, str):
        fd = open(fd, 'w')
        opened = True

    try:
        ## Get data
        rowdim, coldim = ds.get_dim_name()
        # sorted by position, so that label k belongs to row/column k
        rowids = ds.get_identifiers(rowdim, sorted=True)
        colids = ds.get_identifiers(coldim, sorted=True)
        a = ds.asarray()
        y, x = a.shape
        # '%.<decimals>f'; the former '%<decimals>f' set the field
        # width instead of the number of decimals
        fmt = '%%.%df' % decimals

        ## Write header
        fd.write(rowdim)
        fd.write(sep)
        for ident in colids:
            fd.write(ident)
            fd.write(sep)
        fd.write('\n')

        ## Write matrix data
        for j in range(y):
            fd.write(rowids[j])
            fd.write(sep)
            for i in range(x):
                fd.write(fmt % (a[j, i],))
                fd.write(sep)
            fd.write('\n')
    finally:
        ## If we opened the stream, close it
        if opened:
            fd.close()
|
|
|
|
|
2008-01-08 01:43:56 +01:00
|
|
|
def _write_sparse_elements(fd, arr, fmt='%d', sep=None):
|
|
|
|
""" Sparse coordinate format."""
|
|
|
|
fd.write('# sp_format: True\n\n')
|
|
|
|
fmt = '%d %d ' + fmt + '\n'
|
|
|
|
csr = arr.tocsr()
|
|
|
|
for ii in xrange(csr.size):
|
|
|
|
ir, ic = csr.rowcol(ii)
|
|
|
|
data = csr.getdata(ii)
|
|
|
|
fd.write(fmt % (ir, ic, data))
|
|
|
|
|
|
|
|
def _write_elements(fd, arr, fmt='%f', sep='\t'):
|
|
|
|
"""Standard value separated format."""
|
|
|
|
fmt = fmt + sep
|
|
|
|
fd.write('\n')
|
|
|
|
y, x = arr.shape
|
|
|
|
for j in range(y):
|
|
|
|
for i in range(x):
|
|
|
|
fd.write(fmt %arr[j, i])
|
|
|
|
fd.write('\n')
|
|
|
|
|
|
|
|
def _read_elements(fd, arr, sep=None):
|
|
|
|
line = fd.readline()
|
|
|
|
i = 0
|
|
|
|
while line:
|
|
|
|
values = line.split(sep)
|
|
|
|
for j, val in enumerate(values):
|
|
|
|
arr[i,j] = float(val)
|
|
|
|
i += 1
|
|
|
|
line = fd.readline()
|
|
|
|
return arr
|
|
|
|
|
|
|
|
def _read_sparse_elements(fd, arr, sep=None):
|
|
|
|
line = fd.readline()
|
|
|
|
while line:
|
|
|
|
i, j, val = line.split()
|
|
|
|
arr[int(i),int(j)] = float(val)
|
|
|
|
line = fd.readline()
|
|
|
|
return arr.tocsr()
|
2007-08-08 14:23:45 +02:00
|
|
|
|