From bed280353b2f2e15872852744dc0421c8e5e57bb Mon Sep 17 00:00:00 2001 From: flatberg Date: Sun, 6 Jan 2008 17:01:00 +0000 Subject: [PATCH] Added support for sparse category-dataset --- fluents/dataset.py | 264 +++++++++++++++++++++++++------------------ fluents/navigator.py | 8 +- fluents/plots.py | 12 +- 3 files changed, 161 insertions(+), 123 deletions(-) diff --git a/fluents/dataset.py b/fluents/dataset.py index acf2749..221c0c2 100644 --- a/fluents/dataset.py +++ b/fluents/dataset.py @@ -1,10 +1,11 @@ -from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros +from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros,empty,sparse from scipy import sort as array_sort from itertools import izip import shelve import copy import re + class Dataset: """The Dataset base class. @@ -42,16 +43,15 @@ class Dataset: self._map = {} # internal mapping for dataset: identifier <--> index self._name = name self._identifiers = identifiers - self._type = 'n' - if len(array.shape)==1: + if not isinstance(array, sparse.spmatrix): array = atleast_2d(asarray(array)) - # vectors are column vectors - if array.shape[0]==1: - array = array.T + # vector are column (array) + if array.shape[0] == 1: + array = array.T self.shape = array.shape - if identifiers!=None: + if identifiers != None: self._validate_identifiers(identifiers) self._set_identifiers(identifiers, self._all_dims) else: @@ -82,14 +82,14 @@ class Dataset: dim_names = ['rows','cols'] ids = [] - for axis,n in enumerate(shape): - if axis<2: + for axis, n in enumerate(shape): + if axis < 2: dim_suggestion = dim_names[axis] else: dim_suggestion = 'dim' - dim_suggestion = self._suggest_dim_name(dim_suggestion,all_dims) - identifier_creation = [str(axis) + "_" + i for i in map(str,range(n))] - ids.append((dim_suggestion,identifier_creation)) + dim_suggestion = self._suggest_dim_name(dim_suggestion, all_dims) + identifier_creation = [str(axis) + "_" + i for i in map(str, range(n))] + ids.append((dim_suggestion, identifier_creation)) all_dims.add(dim_suggestion) return ids @@ -112,18 +112,22 @@ class Dataset: new_name = dim_name while new_name in all_dims: new_name = dim_name + "_" + str(c) - c+=1 + c += 1 return new_name def asarray(self): """Returns the numeric array (data) of dataset""" + if isinstance(self._array, sparse.spmatrix): + return self._array.toarray() return self._array - def add_array(self, array): + def set_array(self, array): """Adds array as an ArrayType object. A one-dim array is transformed to a two-dim array (row-vector) """ - if self.shape!=array.shape: + if not isinstance(array, type(self._array)): + raise ValueError("Input array of type: %s does not match existing array type: %s") %(type(array), type(self._array)) + if self.shape != array.shape: raise ValueError, "Input array must be of similar dimensions as dataset" self._array = atleast_2d(asarray(array)) @@ -138,7 +142,7 @@ class Dataset: def get_dim_name(self, axis=None): """Returns dim name for an axis, if no axis is provided it returns a list of dims""" - if type(axis)==int: + if type(axis) == int: return self._dims[axis] else: return [dim for dim in self._dims] @@ -149,7 +153,7 @@ class Dataset: ds_dims = ds.get_dim_name() return [d for d in dims if d in ds_dims] - def get_identifiers(self, dim, indices=None,sorted=False): + def get_identifiers(self, dim, indices=None, sorted=False): """Returns identifiers along dim, sorted by position (index) is optional. @@ -163,7 +167,6 @@ class Dataset: if indices != None: if len(indices) == 0:# if empty list or empty array return [] - if indices != None: # be sure to match intersection #indices = intersect1d(self.get_indices(dim),indices) @@ -188,7 +191,7 @@ class Dataset: """ if not isinstance(idents, list) and not isinstance(idents, set): raise ValueError("idents needs to be a list/set got: %s" %type(idents)) - if idents==None: + if idents == None: index = array_sort(self._map[dim].values()) else: index = [self._map[dim][key] @@ -226,7 +229,7 @@ class Dataset: As for the moment: only support for 2D-arrays. """ - #assert(self._array==ndarray) + assert(len(self.shape) == 2) ds = self.copy() ds._array = ds._array.T ds._dims.reverse() @@ -234,12 +237,11 @@ class Dataset: return ds def _validate_identifiers(self, identifiers): - for dim_name, ids in identifiers: if len(set(ids)) != len(ids): raise ValueError("Identifiers not unique in : %s" %dim_name) identifier_shape = [len(i[1]) for i in identifiers] - if len(identifier_shape)!=len(self.shape): + if len(identifier_shape) != len(self.shape): raise ValueError("Identifier list length must equal array dims") for ni, na in zip(identifier_shape, self.shape): if ni != na: @@ -252,9 +254,9 @@ class CategoryDataset(Dataset): A dataset for representing class information as binary matrices (0/1-matrices). - There is support for using a less memory demanding, and - fast intersection look-ups by representing the binary matrix as a - dictionary in each dimension. + There is support for using a less memory demanding, sparse format. The + prefered (default) format for a category dataset is the compressed sparse row + format (csr) Always has linked dimension in first dim: ex matrix: @@ -265,33 +267,51 @@ class CategoryDataset(Dataset): . . . + """ def __init__(self, array, identifiers=None, name='C'): Dataset.__init__(self, array, identifiers=identifiers, name=name) - self.has_dictlists = False - self._type = 'c' def as_dict_lists(self): - """Returns data as dict of indices along first dim. + """Returns data as dict of identifiers along first dim. - ex: data['gene_id'] = ['map0030','map0010', ...] + ex: data['gene_1'] = ['map0030','map0010', ...] + + fixme: Deprecated? """ - data={} + data = {} for name, ind in self._map[self.get_dim_name(0)].items(): - data[name] = self.get_identifiers(self.get_dim_name(1), - list(self._array[ind,:].nonzero())) + if isinstance(self._array, ndarray): + indices = self._array[ind,:].nonzero()[0] + elif isinstance(self._array, sparse.spmatrix): + if not isinstance(self._array, sparse.csr_matrix): + array = self._array.tocsr() + else: + array = self._array + indices = array[ind,:].indices + if len(indices) == 0: # should we allow categories with no members? + continue + data[name] = self.get_identifiers(self.get_dim_name(1), indices) self._dictlists = data - self.has_dictlists = True return data def as_selections(self): """Returns data as a list of Selection objects. + + The list of selections is not ordered (sorted) by any means. """ ret_list = [] for cat_name, ind in self._map[self.get_dim_name(1)].items(): - ids = self.get_identifiers(self.get_dim_name(0), - self._array[:,ind].nonzero()[0]) + if isinstance(self._array, sparse.spmatrix): + if not isinstance(self._array, sparse.csc_matrix): + self._array = self._array.tocsc() + indices = self._array[:,ind].indices + else: + indices = self._array[:,ind].nonzero()[0] + if len(indices) == 0: + continue + ids = self.get_identifiers(self.get_dim_name(0), indices) selection = Selection(cat_name) selection.select(self.get_dim_name(0), ids) ret_list.append(selection) @@ -309,10 +329,10 @@ class GraphDataset(Dataset): representing the graph as a NetworkX.Graph, or NetworkX.XGraph structure. """ - def __init__(self, array=None, identifiers=None, shape=None, all_dims=[],**kwds): - Dataset.__init__(self, array=array, identifiers=identifiers, name='A') + def __init__(self, array, identifiers=None, name='A'): + Dataset.__init__(self, array=array, identifiers=identifiers, name=name) self._graph = None - self._type = 'g' + self._pos = None def asnetworkx(self, nx_type='graph'): dim = self.get_dim_name()[0] @@ -334,17 +354,17 @@ class GraphDataset(Dataset): import networkx as nx except: print "Failed in import of NetworkX" - return - m, n = A.shape# adjacency matrix must be of type that evals to true/false for neigbours - if m!=n: + return None + m, n = A.shape # adjacency matrix must be of type that evals to true/false for neigbours + if m != n: raise IOError, "Adjacency matrix must be square" - if A[A[:,0].nonzero()[0][0],0]==1: #unweighted graph + if A[A[:,0].nonzero()[0][0],0] == 1: #unweighted graph G = nx.Graph() else: G = nx.XGraph() - if labels==None: # if labels not provided mark vertices with numbers + if labels == None: # if labels not provided mark vertices with numbers labels = [str(i) for i in range(m)] for nbrs, head in izip(A, labels): @@ -371,7 +391,7 @@ class ReverseDict(dict): """ def __init__(self, *args, **kw): dict.__init__(self, *args, **kw) - self.reverse = dict([[v,k] for k,v in self.items()]) + self.reverse = dict([[v, k] for k, v in self.items()]) def __setitem__(self, key, value): dict.__setitem__(self, key, value) @@ -380,39 +400,6 @@ class ReverseDict(dict): except: self.reverse = {value:key} -def to_file(filepath,dataset,name=None): - """Write dataset to file. A file may contain multiple datasets. - append to file by using option mode='a' - """ - if not name: - name = dataset._name - data = shelve.open(filepath, flag='c', protocol=2) - if data: #we have an append - names = data.keys() - if name in names: - print "Data with name: %s overwritten" %dataset._name - - sub_data = {'array':dataset._array, - 'idents':dataset._identifiers, - 'type':dataset._type} - data[name] = sub_data - data.close() - -def from_file(filepath): - """Read dataset(s) from file """ - data = shelve.open(filepath, flag='r') - out_data = [] - for name in data.keys(): - sub_data = data[name] - if sub_data['type']=='c': - out_data.append(CategoryDataset(sub_data['array'], identifiers=sub_data['idents'], name=name)) - elif sub_data['type']=='g': - out_data.append(GraphDataset(sub_data['array'], identifiers=sub_data['idents'], name=name)) - else: - out_data.append(Dataset(sub_data['array'], identifiers=sub_data['idents'], name=name)) - - return out_data - class Selection(dict): """Handles selected identifiers along each dimension of a dataset""" @@ -435,32 +422,40 @@ class Selection(dict): def select(self, axis, labels): self[axis] = labels - -def write_ftsv(fd, ds, decimals=7): +def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None): """Writes a dataset in fluents tab separated values (ftsv) form. @param fd: An open file descriptor to the output file. - @param ds: The dataset to be written. The function handles datasets - of these classes: Dataset, CategoryDataset and GraphDataset + @param ds: The dataset to be written. + @param decimals: Number of decimals, only supported for dataset. + @param fmt: String formating + The function handles datasets of these classes: + Dataset, CategoryDataset and GraphDataset """ opened = False if isinstance(fd, str): fd = open(fd, 'w') opened = True - - printstr = "%s\t" + # Write header information if isinstance(ds, CategoryDataset): type = 'category' + if fmt == None: + fmt = '%d' elif isinstance(ds, GraphDataset): type = 'network' + if fmt == None: + fmt = '%d' elif isinstance(ds, Dataset): type = 'dataset' - printstr = '%%.%df\t' % decimals + if fmt == None: + fmt = '%%.%df' % decimals + else: + fmt = '%%.%d' %decimals + fmt else: - raise Exception("Unknown object") - print >> fd, "# type: %s" % type + raise Exception("Unknown object type") + fd.write('# type: %s' %type + '\n') for dim in ds.get_dim_name(): print >> fd, "# dimension: %s" % dim, @@ -469,23 +464,57 @@ def write_ftsv(fd, ds, decimals=7): print >> fd print >> fd, "# name: %s" % ds.get_name() - print >> fd # Write data m = ds.asarray() - if type == 'category': - m = m.astype('i') - - y, x = m.shape - for j in range(y): - for i in range(x): - print >> fd, printstr % m[j, i], - print >> fd + if isinstance(m, sparse.spmatrix): + _write_sparse_elements(fd, m, fmt, sep) + else: + _write_elements(fd, m, fmt, sep) if opened: fd.close() -def read_ftsv(fd): +def _write_sparse_elements(fd, arr, fmt='%d', sep=None): + """ Sparse coordinate format.""" + fd.write('# sp_format: True\n\n') + fmt = '%d %d ' + fmt + '\n' + csr = arr.tocsr() + for ii in xrange(csr.size): + ir, ic = csr.rowcol(ii) + data = csr.getdata(ii) + fd.write(fmt % (ir, ic, data)) + +def _write_elements(fd, arr, fmt='%f', sep='\t'): + """Standard value separated format.""" + fmt = fmt + sep + fd.write('\n') + y, x = arr.shape + for j in range(y): + for i in range(x): + fd.write(fmt %arr[j, i]) + fd.write('\n') + +def _read_elements(fd, arr, sep=None): + line = fd.readline() + i = 0 + while line: + values = line.split(sep) + for j, val in enumerate(values): + arr[i,j] = float(val) + i += 1 + line = fd.readline() + return arr + +def _read_sparse_elements(fd, arr, sep=None): + line = fd.readline() + while line: + i, j, val = line.split() + arr[int(i),int(j)] = float(val) + line = fd.readline() + return arr.tocsr() + +def read_ftsv(fd, sep=None): """Read a dataset in fluents tab separated values (ftsv) form and return it. @param fd: An open file descriptor. @@ -502,7 +531,8 @@ def read_ftsv(fd): identifiers = {} type = 'dataset' name = 'Unnamed dataset' - graphtype = 'graph' + sp_format = False + # graphtype = 'graph' # Read header lines from file. line = fd.readline() @@ -525,9 +555,19 @@ def read_ftsv(fd): elif key == 'name': name = val - - elif key == 'graphtype': - graphtype = val + + # storage format + # if sp_format is True then use coordinate triplets + elif key == 'sp_format': + if val in ['False', 'false', '0', 'F', 'f',]: + sp_format = False + elif val in ['True', 'true', '1', 'T', 't']: + sp_format = True + else: + raise ValueError("sp_format: %s not valid " %sp_format) + + # elif key == 'graphtype': + # graphtype = val else: break @@ -537,22 +577,20 @@ def read_ftsv(fd): dims = [(x, identifiers[x]) for x in dimensions] dim_lengths = [len(identifiers[x]) for x in dimensions] - # Create matrix + # Create matrix and assign element reader if type == 'category': - matrix = zeros(dim_lengths, dtype=bool) + if sp_format: + matrix = sparse.lil_matrix(dim_lengths) + read_elements = _read_sparse_elements + else: + matrix = empty(dim_lengths, dtype='i') + read_elements = _read_elements elif type == 'network': - matrix = zeros(dim_lengths) + matrix = empty(dim_lengths) else: - matrix = zeros(dim_lengths) - - line = fd.readline() - y = 0 - while line: - values = line.split() - for x, v in enumerate(values): - matrix[y,x] = float(v) - y += 1 - line = fd.readline() + matrix = empty(dim_lengths) + + matrix = read_elements(fd, matrix, sep) # Create dataset of specified type if type == 'category': diff --git a/fluents/navigator.py b/fluents/navigator.py index e4e71ae..d710b68 100644 --- a/fluents/navigator.py +++ b/fluents/navigator.py @@ -392,7 +392,7 @@ class NavigatorMenu(gtk.Menu): ds = self.dataset.copy() ds._name = self.dataset._name + ".rsc" axis = 1 - ds._array = ds._array/scipy.expand_dims(ds._array.std(axis), axis) + ds._array = ds.asarray()/scipy.expand_dims(ds.asarray().std(axis), axis) icon = fluents.icon_factory.get(ds) project.data_tree_insert(self.tree_iter, ds.get_name(), ds, None, "black", icon) @@ -401,21 +401,21 @@ class NavigatorMenu(gtk.Menu): ds = self.dataset.copy() ds._name = self.dataset._name + ".csc" axis = 0 - ds._array = ds._array/scipy.expand_dims(ds._array.std(axis), axis) + ds._array = ds.asarray()/scipy.expand_dims(ds.asarray().std(axis), axis) icon = fluents.icon_factory.get(ds) project.data_tree_insert(self.tree_iter, ds.get_name(), ds, None, "black", icon) def on_log(self, item, navigator): project = main.project try: - if not scipy.all(self.dataset._array>0): + if not scipy.all(self.dataset.asarray()>0): raise ValueError except: logger.log('warning', 'Datasets needs to be strictly positive for a log transform') return ds = self.dataset.copy() - ds._array = scipy.log(ds._array) + ds._array = scipy.log(ds.asarray()) icon = fluents.icon_factory.get(ds) ds._name = ds._name + ".log" project.data_tree_insert(self.tree_iter, ds.get_name(), ds, None, "black", icon) diff --git a/fluents/plots.py b/fluents/plots.py index 108d9d5..29bcbfe 100644 --- a/fluents/plots.py +++ b/fluents/plots.py @@ -305,8 +305,8 @@ class ScatterMarkerPlot(Plot): self.ms = s x_index = dataset_1[sel_dim][id_1] y_index = dataset_2[sel_dim][id_2] - self.xaxis_data = dataset_1._array[:, x_index] - self.yaxis_data = dataset_2._array[:, y_index] + self.xaxis_data = dataset_1.asarray()[:, x_index] + self.yaxis_data = dataset_2.asarray()[:, y_index] # init draw self._selection_line = None @@ -390,8 +390,8 @@ class ScatterPlot(Plot): y_index = dataset_2[sel_dim_2][id_2] else: y_index = dataset_2[sel_dim][id_2] - self.xaxis_data = dataset_1._array[:, x_index] - self.yaxis_data = dataset_2._array[:, y_index] + self.xaxis_data = dataset_1.asarray()[:, x_index] + self.yaxis_data = dataset_2.asarray()[:, y_index] # init draw self.init_draw() @@ -436,7 +436,7 @@ class ScatterPlot(Plot): def set_absicca(self, sb): self._absi = sb.get_value_as_int() - 1 - xy = self.dataset_1._array[:,[self._absi, self._ordi]] + xy = self.dataset_1.asarray()[:,[self._absi, self._ordi]] self.xaxis_data = xy[:,0] self.yaxis_data = xy[:,1] self.sc._offsets = xy @@ -446,7 +446,7 @@ class ScatterPlot(Plot): def set_ordinate(self, sb): self._ordi = sb.get_value_as_int() - 1 - xy = self.dataset_1._array[:,[self._absi, self._ordi]] + xy = self.dataset_1.asarray()[:,[self._absi, self._ordi]] self.xaxis_data = xy[:,0] self.yaxis_data = xy[:,1] self.sc._offsets = xy