Projects/laydi
Projects
/
laydi
Archived
7
0
Fork 0

Added support for sparse category-dataset

This commit is contained in:
Arnar Flatberg 2008-01-06 17:01:00 +00:00
parent a84731da30
commit bed280353b
3 changed files with 161 additions and 123 deletions

View File

@ -1,10 +1,11 @@
from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros,empty,sparse
from scipy import sort as array_sort from scipy import sort as array_sort
from itertools import izip from itertools import izip
import shelve import shelve
import copy import copy
import re import re
class Dataset: class Dataset:
"""The Dataset base class. """The Dataset base class.
@ -42,16 +43,15 @@ class Dataset:
self._map = {} # internal mapping for dataset: identifier <--> index self._map = {} # internal mapping for dataset: identifier <--> index
self._name = name self._name = name
self._identifiers = identifiers self._identifiers = identifiers
self._type = 'n'
if len(array.shape)==1: if not isinstance(array, sparse.spmatrix):
array = atleast_2d(asarray(array)) array = atleast_2d(asarray(array))
# vectors are column vectors # vector are column (array)
if array.shape[0]==1: if array.shape[0] == 1:
array = array.T array = array.T
self.shape = array.shape self.shape = array.shape
if identifiers!=None: if identifiers != None:
self._validate_identifiers(identifiers) self._validate_identifiers(identifiers)
self._set_identifiers(identifiers, self._all_dims) self._set_identifiers(identifiers, self._all_dims)
else: else:
@ -82,14 +82,14 @@ class Dataset:
dim_names = ['rows','cols'] dim_names = ['rows','cols']
ids = [] ids = []
for axis,n in enumerate(shape): for axis, n in enumerate(shape):
if axis<2: if axis < 2:
dim_suggestion = dim_names[axis] dim_suggestion = dim_names[axis]
else: else:
dim_suggestion = 'dim' dim_suggestion = 'dim'
dim_suggestion = self._suggest_dim_name(dim_suggestion,all_dims) dim_suggestion = self._suggest_dim_name(dim_suggestion, all_dims)
identifier_creation = [str(axis) + "_" + i for i in map(str,range(n))] identifier_creation = [str(axis) + "_" + i for i in map(str, range(n))]
ids.append((dim_suggestion,identifier_creation)) ids.append((dim_suggestion, identifier_creation))
all_dims.add(dim_suggestion) all_dims.add(dim_suggestion)
return ids return ids
@ -112,18 +112,22 @@ class Dataset:
new_name = dim_name new_name = dim_name
while new_name in all_dims: while new_name in all_dims:
new_name = dim_name + "_" + str(c) new_name = dim_name + "_" + str(c)
c+=1 c += 1
return new_name return new_name
def asarray(self): def asarray(self):
"""Returns the numeric array (data) of dataset""" """Returns the numeric array (data) of dataset"""
if isinstance(self._array, sparse.spmatrix):
return self._array.toarray()
return self._array return self._array
def set_array(self, array):
    """Replace the data array of the dataset.

    The new array must be an instance of the type of the array currently
    held and must have exactly the same shape as the dataset. Dense input
    is stored as an (at least) two-dimensional array; a sparse matrix is
    stored unchanged.

    @param array: The replacement data.
    @raise ValueError: If the type or the shape of `array` does not match
        the existing array.
    """
    if not isinstance(array, type(self._array)):
        # The message has to be formatted *before* the exception object
        # is built; applying '%' to the exception itself fails with a
        # TypeError and hides the real problem.
        raise ValueError(
            "Input array of type: %s does not match existing array type: %s"
            % (type(array), type(self._array)))
    if self.shape != array.shape:
        raise ValueError("Input array must be of similar dimensions as dataset")
    if isinstance(array, sparse.spmatrix):
        # asarray() does not densify a sparse matrix, so keep it as it is.
        self._array = array
    else:
        self._array = atleast_2d(asarray(array))
@ -138,7 +142,7 @@ class Dataset:
def get_dim_name(self, axis=None): def get_dim_name(self, axis=None):
"""Returns dim name for an axis, if no axis is provided it """Returns dim name for an axis, if no axis is provided it
returns a list of dims""" returns a list of dims"""
if type(axis)==int: if type(axis) == int:
return self._dims[axis] return self._dims[axis]
else: else:
return [dim for dim in self._dims] return [dim for dim in self._dims]
@ -149,7 +153,7 @@ class Dataset:
ds_dims = ds.get_dim_name() ds_dims = ds.get_dim_name()
return [d for d in dims if d in ds_dims] return [d for d in dims if d in ds_dims]
def get_identifiers(self, dim, indices=None,sorted=False): def get_identifiers(self, dim, indices=None, sorted=False):
"""Returns identifiers along dim, sorted by position (index) """Returns identifiers along dim, sorted by position (index)
is optional. is optional.
@ -163,7 +167,6 @@ class Dataset:
if indices != None: if indices != None:
if len(indices) == 0:# if empty list or empty array if len(indices) == 0:# if empty list or empty array
return [] return []
if indices != None: if indices != None:
# be sure to match intersection # be sure to match intersection
#indices = intersect1d(self.get_indices(dim),indices) #indices = intersect1d(self.get_indices(dim),indices)
@ -188,7 +191,7 @@ class Dataset:
""" """
if not isinstance(idents, list) and not isinstance(idents, set): if not isinstance(idents, list) and not isinstance(idents, set):
raise ValueError("idents needs to be a list/set got: %s" %type(idents)) raise ValueError("idents needs to be a list/set got: %s" %type(idents))
if idents==None: if idents == None:
index = array_sort(self._map[dim].values()) index = array_sort(self._map[dim].values())
else: else:
index = [self._map[dim][key] index = [self._map[dim][key]
@ -226,7 +229,7 @@ class Dataset:
As for the moment: only support for 2D-arrays. As for the moment: only support for 2D-arrays.
""" """
#assert(self._array==ndarray) assert(len(self.shape) == 2)
ds = self.copy() ds = self.copy()
ds._array = ds._array.T ds._array = ds._array.T
ds._dims.reverse() ds._dims.reverse()
@ -234,12 +237,11 @@ class Dataset:
return ds return ds
def _validate_identifiers(self, identifiers): def _validate_identifiers(self, identifiers):
for dim_name, ids in identifiers: for dim_name, ids in identifiers:
if len(set(ids)) != len(ids): if len(set(ids)) != len(ids):
raise ValueError("Identifiers not unique in : %s" %dim_name) raise ValueError("Identifiers not unique in : %s" %dim_name)
identifier_shape = [len(i[1]) for i in identifiers] identifier_shape = [len(i[1]) for i in identifiers]
if len(identifier_shape)!=len(self.shape): if len(identifier_shape) != len(self.shape):
raise ValueError("Identifier list length must equal array dims") raise ValueError("Identifier list length must equal array dims")
for ni, na in zip(identifier_shape, self.shape): for ni, na in zip(identifier_shape, self.shape):
if ni != na: if ni != na:
@ -252,9 +254,9 @@ class CategoryDataset(Dataset):
A dataset for representing class information as binary A dataset for representing class information as binary
matrices (0/1-matrices). matrices (0/1-matrices).
There is support for using a less memory demanding, and There is support for using a less memory demanding, sparse format. The
fast intersection look-ups by representing the binary matrix as a prefered (default) format for a category dataset is the compressed sparse row
dictionary in each dimension. format (csr)
Always has linked dimension in first dim: Always has linked dimension in first dim:
ex matrix: ex matrix:
@ -265,33 +267,51 @@ class CategoryDataset(Dataset):
. .
. .
. .
""" """
def __init__(self, array, identifiers=None, name='C'): def __init__(self, array, identifiers=None, name='C'):
Dataset.__init__(self, array, identifiers=identifiers, name=name) Dataset.__init__(self, array, identifiers=identifiers, name=name)
self.has_dictlists = False
self._type = 'c'
def as_dict_lists(self): def as_dict_lists(self):
"""Returns data as dict of indices along first dim. """Returns data as dict of identifiers along first dim.
ex: data['gene_id'] = ['map0030','map0010', ...] ex: data['gene_1'] = ['map0030','map0010', ...]
fixme: Deprecated?
""" """
data={} data = {}
for name, ind in self._map[self.get_dim_name(0)].items(): for name, ind in self._map[self.get_dim_name(0)].items():
data[name] = self.get_identifiers(self.get_dim_name(1), if isinstance(self._array, ndarray):
list(self._array[ind,:].nonzero())) indices = self._array[ind,:].nonzero()[0]
elif isinstance(self._array, sparse.spmatrix):
if not isinstance(self._array, sparse.csr_matrix):
array = self._array.tocsr()
else:
array = self._array
indices = array[ind,:].indices
if len(indices) == 0: # should we allow categories with no members?
continue
data[name] = self.get_identifiers(self.get_dim_name(1), indices)
self._dictlists = data self._dictlists = data
self.has_dictlists = True
return data return data
def as_selections(self): def as_selections(self):
"""Returns data as a list of Selection objects. """Returns data as a list of Selection objects.
The list of selections is not ordered (sorted) by any means.
""" """
ret_list = [] ret_list = []
for cat_name, ind in self._map[self.get_dim_name(1)].items(): for cat_name, ind in self._map[self.get_dim_name(1)].items():
ids = self.get_identifiers(self.get_dim_name(0), if isinstance(self._array, sparse.spmatrix):
self._array[:,ind].nonzero()[0]) if not isinstance(self._array, sparse.csc_matrix):
self._array = self._array.tocsc()
indices = self._array[:,ind].indices
else:
indices = self._array[:,ind].nonzero()[0]
if len(indices) == 0:
continue
ids = self.get_identifiers(self.get_dim_name(0), indices)
selection = Selection(cat_name) selection = Selection(cat_name)
selection.select(self.get_dim_name(0), ids) selection.select(self.get_dim_name(0), ids)
ret_list.append(selection) ret_list.append(selection)
@ -309,10 +329,10 @@ class GraphDataset(Dataset):
representing the graph as a NetworkX.Graph, or NetworkX.XGraph structure. representing the graph as a NetworkX.Graph, or NetworkX.XGraph structure.
""" """
def __init__(self, array=None, identifiers=None, shape=None, all_dims=[],**kwds): def __init__(self, array, identifiers=None, name='A'):
Dataset.__init__(self, array=array, identifiers=identifiers, name='A') Dataset.__init__(self, array=array, identifiers=identifiers, name=name)
self._graph = None self._graph = None
self._type = 'g' self._pos = None
def asnetworkx(self, nx_type='graph'): def asnetworkx(self, nx_type='graph'):
dim = self.get_dim_name()[0] dim = self.get_dim_name()[0]
@ -334,17 +354,17 @@ class GraphDataset(Dataset):
import networkx as nx import networkx as nx
except: except:
print "Failed in import of NetworkX" print "Failed in import of NetworkX"
return return None
m, n = A.shape# adjacency matrix must be of type that evals to true/false for neigbours m, n = A.shape # adjacency matrix must be of type that evals to true/false for neigbours
if m!=n: if m != n:
raise IOError, "Adjacency matrix must be square" raise IOError, "Adjacency matrix must be square"
if A[A[:,0].nonzero()[0][0],0]==1: #unweighted graph if A[A[:,0].nonzero()[0][0],0] == 1: #unweighted graph
G = nx.Graph() G = nx.Graph()
else: else:
G = nx.XGraph() G = nx.XGraph()
if labels==None: # if labels not provided mark vertices with numbers if labels == None: # if labels not provided mark vertices with numbers
labels = [str(i) for i in range(m)] labels = [str(i) for i in range(m)]
for nbrs, head in izip(A, labels): for nbrs, head in izip(A, labels):
@ -371,7 +391,7 @@ class ReverseDict(dict):
""" """
def __init__(self, *args, **kw): def __init__(self, *args, **kw):
dict.__init__(self, *args, **kw) dict.__init__(self, *args, **kw)
self.reverse = dict([[v,k] for k,v in self.items()]) self.reverse = dict([[v, k] for k, v in self.items()])
def __setitem__(self, key, value): def __setitem__(self, key, value):
dict.__setitem__(self, key, value) dict.__setitem__(self, key, value)
@ -380,39 +400,6 @@ class ReverseDict(dict):
except: except:
self.reverse = {value:key} self.reverse = {value:key}
def to_file(filepath, dataset, name=None):
    """Write a dataset to a shelve file.

    A file may contain multiple datasets, each stored under its own name.
    Writing to an existing file adds the dataset to it; an entry already
    stored under the same name is replaced and a notice is printed.

    @param filepath: Path of the shelve file (created if it does not exist).
    @param dataset: Object providing the attributes `_name`, `_array`,
        `_identifiers` and `_type`.
    @param name: Key to store the dataset under. Defaults to
        `dataset._name` when empty or None.
    """
    if not name:
        name = dataset._name
    data = shelve.open(filepath, flag='c', protocol=2)
    try:
        if name in data:
            # Report the key that is really replaced; it differs from the
            # dataset's own name when `name` is given explicitly.
            print("Data with name: %s overwritten" % name)
        data[name] = {'array': dataset._array,
                      'idents': dataset._identifiers,
                      'type': dataset._type}
    finally:
        # Release the shelf even if storing the entry fails.
        data.close()
def from_file(filepath):
    """Read all datasets stored in a shelve file.

    Every entry is expected to be a dict with the keys 'array', 'idents'
    and 'type'. The type code 'c' yields a CategoryDataset, 'g' a
    GraphDataset and anything else a plain Dataset.

    @param filepath: Path of the shelve file to read.
    @return: List with one dataset object per entry in the file.
    """
    # NOTE(review): shelve unpickles the stored entries; only open files
    # that come from a trusted source.
    constructors = {'c': CategoryDataset, 'g': GraphDataset}
    data = shelve.open(filepath, flag='r')
    try:
        out_data = []
        for name in data.keys():
            sub_data = data[name]
            make = constructors.get(sub_data['type'], Dataset)
            out_data.append(make(sub_data['array'],
                                 identifiers=sub_data['idents'],
                                 name=name))
        return out_data
    finally:
        # The shelf was previously left open; always release it.
        data.close()
class Selection(dict): class Selection(dict):
"""Handles selected identifiers along each dimension of a dataset""" """Handles selected identifiers along each dimension of a dataset"""
@ -436,31 +423,39 @@ class Selection(dict):
def select(self, axis, labels): def select(self, axis, labels):
self[axis] = labels self[axis] = labels
def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None):
def write_ftsv(fd, ds, decimals=7):
"""Writes a dataset in fluents tab separated values (ftsv) form. """Writes a dataset in fluents tab separated values (ftsv) form.
@param fd: An open file descriptor to the output file. @param fd: An open file descriptor to the output file.
@param ds: The dataset to be written. The function handles datasets @param ds: The dataset to be written.
of these classes: Dataset, CategoryDataset and GraphDataset @param decimals: Number of decimals, only supported for dataset.
@param fmt: String formating
The function handles datasets of these classes:
Dataset, CategoryDataset and GraphDataset
""" """
opened = False opened = False
if isinstance(fd, str): if isinstance(fd, str):
fd = open(fd, 'w') fd = open(fd, 'w')
opened = True opened = True
printstr = "%s\t"
# Write header information # Write header information
if isinstance(ds, CategoryDataset): if isinstance(ds, CategoryDataset):
type = 'category' type = 'category'
if fmt == None:
fmt = '%d'
elif isinstance(ds, GraphDataset): elif isinstance(ds, GraphDataset):
type = 'network' type = 'network'
if fmt == None:
fmt = '%d'
elif isinstance(ds, Dataset): elif isinstance(ds, Dataset):
type = 'dataset' type = 'dataset'
printstr = '%%.%df\t' % decimals if fmt == None:
fmt = '%%.%df' % decimals
else:
fmt = '%%.%d' %decimals + fmt
else: else:
raise Exception("Unknown object") raise Exception("Unknown object type")
print >> fd, "# type: %s" % type fd.write('# type: %s' %type + '\n')
for dim in ds.get_dim_name(): for dim in ds.get_dim_name():
print >> fd, "# dimension: %s" % dim, print >> fd, "# dimension: %s" % dim,
@ -469,23 +464,57 @@ def write_ftsv(fd, ds, decimals=7):
print >> fd print >> fd
print >> fd, "# name: %s" % ds.get_name() print >> fd, "# name: %s" % ds.get_name()
print >> fd
# Write data # Write data
m = ds.asarray() m = ds.asarray()
if type == 'category': if isinstance(m, sparse.spmatrix):
m = m.astype('i') _write_sparse_elements(fd, m, fmt, sep)
else:
y, x = m.shape _write_elements(fd, m, fmt, sep)
for j in range(y):
for i in range(x):
print >> fd, printstr % m[j, i],
print >> fd
if opened: if opened:
fd.close() fd.close()
def read_ftsv(fd): def _write_sparse_elements(fd, arr, fmt='%d', sep=None):
""" Sparse coordinate format."""
fd.write('# sp_format: True\n\n')
fmt = '%d %d ' + fmt + '\n'
csr = arr.tocsr()
for ii in xrange(csr.size):
ir, ic = csr.rowcol(ii)
data = csr.getdata(ii)
fd.write(fmt % (ir, ic, data))
def _write_elements(fd, arr, fmt='%f', sep='\t'):
    """Dump a two-dimensional array as plain separated text.

    An empty line is emitted first. After that every row of `arr` becomes
    one line of output in which each value is rendered with `fmt` and
    followed by `sep` (the last value of a row included).

    @param fd: Object with a write() method receiving the text.
    @param arr: Array exposing a two-element `shape` and [row, col] indexing.
    @param fmt: Format string applied to each single value.
    @param sep: Text appended after every value.
    """
    cell_fmt = fmt + sep
    emit = fd.write
    emit('\n')
    n_rows, n_cols = arr.shape
    for row in range(n_rows):
        for col in range(n_cols):
            emit(cell_fmt % arr[row, col])
        emit('\n')
def _read_elements(fd, arr, sep=None):
    """Fill `arr` row by row from lines of separated values.

    Each non-blank line read from `fd` supplies the values of one row;
    every value is converted with float(). Blank lines are skipped without
    consuming a row. When an explicit `sep` is given, a single empty field
    at the end of a line is dropped, so rows that end in a separator
    (the layout produced by _write_elements) can be read back.

    @param fd: Object with a readline() method returning text lines.
    @param arr: Pre-allocated array supporting arr[row, col] assignment.
    @param sep: Value separator passed to str.split(); None splits on
        any run of whitespace.
    @return: The same `arr` object, filled in place.
    """
    row = 0
    for line in iter(fd.readline, ''):
        fields = line.rstrip('\r\n').split(sep)
        if sep is not None and fields and not fields[-1].strip():
            # Artifact of a trailing separator, not a missing value.
            fields.pop()
        if not fields:
            continue
        for col, text in enumerate(fields):
            arr[row, col] = float(text)
        row += 1
    return arr
def _read_sparse_elements(fd, arr, sep=None):
    """Fill a sparse matrix from 'row col value' coordinate triplets.

    Each non-blank line read from `fd` must hold exactly three
    whitespace-separated fields: row index, column index and value.
    Blank lines are skipped instead of aborting the read.

    @param fd: Object with a readline() method returning text lines.
    @param arr: Matrix supporting arr[i, j] assignment and tocsr().
    @param sep: Unused; accepted so the signature matches _read_elements.
        Triplets are always split on whitespace.
    @return: The result of arr.tocsr() after all triplets are stored.
    @raise ValueError: If a non-blank line does not hold three fields.
    """
    for line in iter(fd.readline, ''):
        fields = line.split()
        if not fields:
            continue
        i, j, val = fields
        arr[int(i), int(j)] = float(val)
    return arr.tocsr()
def read_ftsv(fd, sep=None):
"""Read a dataset in fluents tab separated values (ftsv) form and return it. """Read a dataset in fluents tab separated values (ftsv) form and return it.
@param fd: An open file descriptor. @param fd: An open file descriptor.
@ -502,7 +531,8 @@ def read_ftsv(fd):
identifiers = {} identifiers = {}
type = 'dataset' type = 'dataset'
name = 'Unnamed dataset' name = 'Unnamed dataset'
graphtype = 'graph' sp_format = False
# graphtype = 'graph'
# Read header lines from file. # Read header lines from file.
line = fd.readline() line = fd.readline()
@ -526,8 +556,18 @@ def read_ftsv(fd):
elif key == 'name': elif key == 'name':
name = val name = val
elif key == 'graphtype': # storage format
graphtype = val # if sp_format is True then use coordinate triplets
elif key == 'sp_format':
if val in ['False', 'false', '0', 'F', 'f',]:
sp_format = False
elif val in ['True', 'true', '1', 'T', 't']:
sp_format = True
else:
raise ValueError("sp_format: %s not valid " %sp_format)
# elif key == 'graphtype':
# graphtype = val
else: else:
break break
@ -537,22 +577,20 @@ def read_ftsv(fd):
dims = [(x, identifiers[x]) for x in dimensions] dims = [(x, identifiers[x]) for x in dimensions]
dim_lengths = [len(identifiers[x]) for x in dimensions] dim_lengths = [len(identifiers[x]) for x in dimensions]
# Create matrix # Create matrix and assign element reader
if type == 'category': if type == 'category':
matrix = zeros(dim_lengths, dtype=bool) if sp_format:
matrix = sparse.lil_matrix(dim_lengths)
read_elements = _read_sparse_elements
else:
matrix = empty(dim_lengths, dtype='i')
read_elements = _read_elements
elif type == 'network': elif type == 'network':
matrix = zeros(dim_lengths) matrix = empty(dim_lengths)
else: else:
matrix = zeros(dim_lengths) matrix = empty(dim_lengths)
line = fd.readline() matrix = read_elements(fd, matrix, sep)
y = 0
while line:
values = line.split()
for x, v in enumerate(values):
matrix[y,x] = float(v)
y += 1
line = fd.readline()
# Create dataset of specified type # Create dataset of specified type
if type == 'category': if type == 'category':

View File

@ -392,7 +392,7 @@ class NavigatorMenu(gtk.Menu):
ds = self.dataset.copy() ds = self.dataset.copy()
ds._name = self.dataset._name + ".rsc" ds._name = self.dataset._name + ".rsc"
axis = 1 axis = 1
ds._array = ds._array/scipy.expand_dims(ds._array.std(axis), axis) ds._array = ds.asarray()/scipy.expand_dims(ds.asarray().std(axis), axis)
icon = fluents.icon_factory.get(ds) icon = fluents.icon_factory.get(ds)
project.data_tree_insert(self.tree_iter, ds.get_name(), ds, None, "black", icon) project.data_tree_insert(self.tree_iter, ds.get_name(), ds, None, "black", icon)
@ -401,21 +401,21 @@ class NavigatorMenu(gtk.Menu):
ds = self.dataset.copy() ds = self.dataset.copy()
ds._name = self.dataset._name + ".csc" ds._name = self.dataset._name + ".csc"
axis = 0 axis = 0
ds._array = ds._array/scipy.expand_dims(ds._array.std(axis), axis) ds._array = ds.asarray()/scipy.expand_dims(ds.asarray().std(axis), axis)
icon = fluents.icon_factory.get(ds) icon = fluents.icon_factory.get(ds)
project.data_tree_insert(self.tree_iter, ds.get_name(), ds, None, "black", icon) project.data_tree_insert(self.tree_iter, ds.get_name(), ds, None, "black", icon)
def on_log(self, item, navigator): def on_log(self, item, navigator):
project = main.project project = main.project
try: try:
if not scipy.all(self.dataset._array>0): if not scipy.all(self.dataset.asarray()>0):
raise ValueError raise ValueError
except: except:
logger.log('warning', 'Datasets needs to be strictly positive for a log transform') logger.log('warning', 'Datasets needs to be strictly positive for a log transform')
return return
ds = self.dataset.copy() ds = self.dataset.copy()
ds._array = scipy.log(ds._array) ds._array = scipy.log(ds.asarray())
icon = fluents.icon_factory.get(ds) icon = fluents.icon_factory.get(ds)
ds._name = ds._name + ".log" ds._name = ds._name + ".log"
project.data_tree_insert(self.tree_iter, ds.get_name(), ds, None, "black", icon) project.data_tree_insert(self.tree_iter, ds.get_name(), ds, None, "black", icon)

View File

@ -305,8 +305,8 @@ class ScatterMarkerPlot(Plot):
self.ms = s self.ms = s
x_index = dataset_1[sel_dim][id_1] x_index = dataset_1[sel_dim][id_1]
y_index = dataset_2[sel_dim][id_2] y_index = dataset_2[sel_dim][id_2]
self.xaxis_data = dataset_1._array[:, x_index] self.xaxis_data = dataset_1.asarray()[:, x_index]
self.yaxis_data = dataset_2._array[:, y_index] self.yaxis_data = dataset_2.asarray()[:, y_index]
# init draw # init draw
self._selection_line = None self._selection_line = None
@ -390,8 +390,8 @@ class ScatterPlot(Plot):
y_index = dataset_2[sel_dim_2][id_2] y_index = dataset_2[sel_dim_2][id_2]
else: else:
y_index = dataset_2[sel_dim][id_2] y_index = dataset_2[sel_dim][id_2]
self.xaxis_data = dataset_1._array[:, x_index] self.xaxis_data = dataset_1.asarray()[:, x_index]
self.yaxis_data = dataset_2._array[:, y_index] self.yaxis_data = dataset_2.asarray()[:, y_index]
# init draw # init draw
self.init_draw() self.init_draw()
@ -436,7 +436,7 @@ class ScatterPlot(Plot):
def set_absicca(self, sb): def set_absicca(self, sb):
self._absi = sb.get_value_as_int() - 1 self._absi = sb.get_value_as_int() - 1
xy = self.dataset_1._array[:,[self._absi, self._ordi]] xy = self.dataset_1.asarray()[:,[self._absi, self._ordi]]
self.xaxis_data = xy[:,0] self.xaxis_data = xy[:,0]
self.yaxis_data = xy[:,1] self.yaxis_data = xy[:,1]
self.sc._offsets = xy self.sc._offsets = xy
@ -446,7 +446,7 @@ class ScatterPlot(Plot):
def set_ordinate(self, sb): def set_ordinate(self, sb):
self._ordi = sb.get_value_as_int() - 1 self._ordi = sb.get_value_as_int() - 1
xy = self.dataset_1._array[:,[self._absi, self._ordi]] xy = self.dataset_1.asarray()[:,[self._absi, self._ordi]]
self.xaxis_data = xy[:,0] self.xaxis_data = xy[:,0]
self.yaxis_data = xy[:,1] self.yaxis_data = xy[:,1]
self.sc._offsets = xy self.sc._offsets = xy