Added support for sparse category-dataset

2008-01-06 17:01:00 +00:00 · 2008-01-06 17:01:00 +00:00 · bed280353b
parent a84731da30
commit bed280353b
3 changed files with 161 additions and 123 deletions
--- a/fluents/dataset.py
+++ b/fluents/dataset.py
@ -1,10 +1,11 @@
-from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros
+from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros,empty,sparse
 from scipy import sort as array_sort
 from itertools import izip
 import shelve
 import copy
 import re

+
 class Dataset:
    """The Dataset base class.
    
@ -42,16 +43,15 @@ class Dataset:
        self._map = {} # internal mapping for dataset:  identifier <--> index
        self._name = name
        self._identifiers = identifiers
-        self._type = 'n'
        
-        if len(array.shape)==1:
+        if not isinstance(array, sparse.spmatrix):
            array = atleast_2d(asarray(array))
-            # vectors are column vectors 
-            if array.shape[0]==1:
-                array = array.T
+        # vectors are stored as columns: a single-row array is transposed
+        if array.shape[0] == 1:
+            array = array.T
        self.shape = array.shape
        
-        if identifiers!=None:
+        if identifiers != None:
            self._validate_identifiers(identifiers)
            self._set_identifiers(identifiers, self._all_dims)
        else:
@ -82,14 +82,14 @@ class Dataset:
        
        dim_names = ['rows','cols'] 
        ids = []
-        for axis,n in enumerate(shape):
-            if axis<2:
+        for axis, n in enumerate(shape):
+            if axis < 2:
                dim_suggestion = dim_names[axis]
            else:
                dim_suggestion = 'dim'
-            dim_suggestion = self._suggest_dim_name(dim_suggestion,all_dims) 
-            identifier_creation = [str(axis) + "_" + i for i in map(str,range(n))]
-            ids.append((dim_suggestion,identifier_creation))
+            dim_suggestion = self._suggest_dim_name(dim_suggestion, all_dims) 
+            identifier_creation = [str(axis) + "_" + i for i in map(str, range(n))]
+            ids.append((dim_suggestion, identifier_creation))
            all_dims.add(dim_suggestion)
        return ids

@ -112,18 +112,22 @@ class Dataset:
        new_name = dim_name
        while new_name in all_dims:
            new_name = dim_name + "_" + str(c)
-            c+=1
+            c += 1
        return new_name
        
    def asarray(self):
        """Returns the numeric array (data) of dataset"""
+        if isinstance(self._array, sparse.spmatrix):
+            return self._array.toarray()
        return self._array

-    def add_array(self, array):
+    def set_array(self, array):
        """Adds array as an ArrayType object.
        A one-dim array is transformed to a two-dim array (row-vector)
        """
-        if self.shape!=array.shape:
+        # The replacement must be of the same type as the array it replaces.
+        if not isinstance(array, type(self._array)):
+            # Format the message before constructing the exception; applying
+            # % to the ValueError instance would raise TypeError instead.
+            raise ValueError("Input array of type: %s does not match existing array type: %s" %(type(array), type(self._array)))
+        if self.shape != array.shape:
            raise ValueError, "Input array must be of similar dimensions as dataset"
+        # NOTE(review): asarray() below is also applied when array is a sparse
+        # matrix -- confirm that is intended.
        self._array = atleast_2d(asarray(array))

@ -138,7 +142,7 @@ class Dataset:
    def get_dim_name(self, axis=None):
        """Returns dim name for an axis, if no axis is provided it
        returns a list of dims"""
-        if type(axis)==int:
+        if type(axis) == int:
            return self._dims[axis]
        else:
            return [dim for dim in self._dims]
@ -149,7 +153,7 @@ class Dataset:
        ds_dims = ds.get_dim_name()
        return [d for d in dims if d in ds_dims]
        
-    def get_identifiers(self, dim, indices=None,sorted=False):
+    def get_identifiers(self, dim, indices=None, sorted=False):
        """Returns identifiers along dim, sorted by position (index)
        is optional.
 	
@ -163,7 +167,6 @@ class Dataset:
        if indices != None:
            if len(indices) == 0:# if empty list or empty array
                return []
-        
        if indices != None:
            # be sure to match intersection
            #indices = intersect1d(self.get_indices(dim),indices)
@ -188,7 +191,7 @@ class Dataset:
        """
        if not isinstance(idents, list) and not isinstance(idents, set):
            raise ValueError("idents needs to be a list/set got: %s" %type(idents))
-        if idents==None:
+        if idents == None:
            index = array_sort(self._map[dim].values())
        else:
            index = [self._map[dim][key]
@ -226,7 +229,7 @@ class Dataset:
        As for the moment: only support for 2D-arrays.
        """
        
-        #assert(self._array==ndarray)
+        assert(len(self.shape) == 2)
        ds = self.copy()
        ds._array = ds._array.T
        ds._dims.reverse()
@ -234,12 +237,11 @@ class Dataset:
        return ds
    
    def _validate_identifiers(self, identifiers):
-        
        for dim_name, ids in identifiers: 
            if len(set(ids)) != len(ids):
                raise ValueError("Identifiers not unique in : %s" %dim_name)
        identifier_shape = [len(i[1]) for i in identifiers]
-        if len(identifier_shape)!=len(self.shape):
+        if len(identifier_shape) != len(self.shape):
            raise ValueError("Identifier list length must equal array dims")
        for ni, na in zip(identifier_shape, self.shape):
            if ni != na:
@ -252,9 +254,9 @@ class CategoryDataset(Dataset):
    A dataset for representing class information as binary
    matrices (0/1-matrices).

-    There is support for using a less memory demanding, and
-    fast intersection look-ups by representing the binary matrix as a
-    dictionary in each dimension.
+    There is support for using a less memory demanding, sparse format. The
+    preferred (default) format for a category dataset is the compressed
+    sparse row format (csr).

    Always has linked dimension in first dim:
    ex matrix:
@ -265,33 +267,51 @@ class CategoryDataset(Dataset):
    .
    .
    .
+    
    """
    
    def __init__(self, array, identifiers=None, name='C'):
        Dataset.__init__(self, array, identifiers=identifiers, name=name)
-        self.has_dictlists = False
-        self._type = 'c'
            
    def as_dict_lists(self):
-        """Returns data as dict of indices along first dim.
+        """Returns data as dict of identifiers along first dim.

-        ex: data['gene_id'] = ['map0030','map0010', ...]
+        ex: data['gene_1'] = ['map0030','map0010', ...]
+        
+        fixme: Deprecated?
        """
-        data={}
+        data = {}
        for name, ind in self._map[self.get_dim_name(0)].items():
-            data[name] = self.get_identifiers(self.get_dim_name(1),
-                                              list(self._array[ind,:].nonzero()))
+            if isinstance(self._array, ndarray):
+                indices = self._array[ind,:].nonzero()[0]
+            elif isinstance(self._array, sparse.spmatrix):
+                if not isinstance(self._array, sparse.csr_matrix):
+                    array = self._array.tocsr()
+                else:
+                    array = self._array
+                indices = array[ind,:].indices
+            if len(indices) == 0: # should we allow categories with no members?
+                continue
+            data[name] = self.get_identifiers(self.get_dim_name(1), indices)
        self._dictlists = data
-        self.has_dictlists = True
        return data

    def as_selections(self):
        """Returns data as a list of Selection objects.
+
+        The list of selections is not ordered (sorted) by any means.
        """
        ret_list = []
        for cat_name, ind in self._map[self.get_dim_name(1)].items():
-            ids = self.get_identifiers(self.get_dim_name(0),
-                                       self._array[:,ind].nonzero()[0])
+            if isinstance(self._array, sparse.spmatrix):
+                if not isinstance(self._array, sparse.csc_matrix):
+                    self._array = self._array.tocsc()
+                indices = self._array[:,ind].indices
+            else:
+                indices = self._array[:,ind].nonzero()[0]
+            if len(indices) == 0:
+                continue
+            ids = self.get_identifiers(self.get_dim_name(0), indices)
            selection = Selection(cat_name)
            selection.select(self.get_dim_name(0), ids)
            ret_list.append(selection)
@ -309,10 +329,10 @@ class GraphDataset(Dataset):
    representing the graph as a NetworkX.Graph, or NetworkX.XGraph structure.
    """
    
-    def __init__(self, array=None, identifiers=None, shape=None, all_dims=[],**kwds):
-        Dataset.__init__(self, array=array, identifiers=identifiers, name='A')
+    def __init__(self, array, identifiers=None, name='A'):
+        Dataset.__init__(self, array=array, identifiers=identifiers, name=name)
        self._graph = None
-        self._type = 'g'
+        self._pos = None
        
    def asnetworkx(self, nx_type='graph'):
        dim = self.get_dim_name()[0]
@ -334,17 +354,17 @@ class GraphDataset(Dataset):
            import networkx as nx
        except:
            print "Failed in import of NetworkX"
-            return
-        m, n = A.shape# adjacency matrix must be of type that evals to true/false for neigbours
-        if m!=n:
+            return None
+        m, n = A.shape # adjacency matrix must be of type that evals to true/false for neigbours
+        if m != n:
            raise IOError, "Adjacency matrix must be square"

-        if A[A[:,0].nonzero()[0][0],0]==1: #unweighted graph
+        if A[A[:,0].nonzero()[0][0],0] == 1: #unweighted graph
            G = nx.Graph()
        else:
            G = nx.XGraph()

-        if labels==None: # if labels not provided mark vertices with numbers
+        if labels == None: # if labels not provided mark vertices with numbers
            labels = [str(i) for i in range(m)]

        for nbrs, head in izip(A, labels):
@ -371,7 +391,7 @@ class ReverseDict(dict):
    """
    def __init__(self, *args, **kw):
        dict.__init__(self, *args, **kw)
-        self.reverse = dict([[v,k] for k,v in self.items()])
+        self.reverse = dict([[v, k] for k, v in self.items()])

    def __setitem__(self, key, value):
        dict.__setitem__(self, key, value)
@ -380,39 +400,6 @@ class ReverseDict(dict):
        except:
            self.reverse = {value:key}

-def to_file(filepath,dataset,name=None):
-    """Write dataset to file. A file may contain multiple datasets.
-    append to file by using option mode='a'
-    """
-    if not name:
-        name = dataset._name
-    data = shelve.open(filepath, flag='c', protocol=2)
-    if data: #we have an append 
-        names = data.keys()
-        if name in names:
-            print "Data with name: %s overwritten" %dataset._name
-    
-    sub_data = {'array':dataset._array,
-                'idents':dataset._identifiers,
-                'type':dataset._type}
-    data[name] = sub_data 
-    data.close()
-
-def from_file(filepath):
-    """Read dataset(s) from file """
-    data = shelve.open(filepath, flag='r')
-    out_data = []
-    for name in data.keys():
-        sub_data = data[name]
-        if sub_data['type']=='c':
-            out_data.append(CategoryDataset(sub_data['array'], identifiers=sub_data['idents'], name=name))
-        elif sub_data['type']=='g':
-            out_data.append(GraphDataset(sub_data['array'], identifiers=sub_data['idents'], name=name))
-        else:
-            out_data.append(Dataset(sub_data['array'], identifiers=sub_data['idents'], name=name)) 
-            
-    return out_data
-

 class Selection(dict):
    """Handles selected identifiers along each dimension of a dataset"""
@ -435,32 +422,40 @@ class Selection(dict):

    def select(self, axis, labels):
        self[axis] = labels
-    

-def write_ftsv(fd, ds, decimals=7):
+def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None):
    """Writes a dataset in fluents tab separated values (ftsv) form.
    
    @param fd: An open file descriptor to the output file.
-    @param ds: The dataset to be written. The function handles datasets
-    of these classes: Dataset, CategoryDataset and GraphDataset
+    @param ds: The dataset to be written. 
+    @param decimals: Number of decimals, only supported for dataset.
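+    @param fmt: Element format string. For CategoryDataset and GraphDataset
+    it is used as given (default '%d'); for Dataset it is appended to
+    '%.<decimals>' (default 'f').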
+    The function handles datasets of these classes: 
+    Dataset, CategoryDataset and GraphDataset
    """
    opened = False
    if isinstance(fd, str):
        fd = open(fd, 'w')
        opened = True
-
-    printstr = "%s\t"
+    
    # Write header information
    if isinstance(ds, CategoryDataset):
        type = 'category'
+        if fmt == None:
+            fmt = '%d'
    elif isinstance(ds, GraphDataset):
        type = 'network'
+        if fmt == None:
+            fmt = '%d'
    elif isinstance(ds, Dataset):
        type = 'dataset'
-        printstr = '%%.%df\t' % decimals
+        if fmt == None:
+            fmt = '%%.%df' % decimals
+        else:
+            fmt = '%%.%d' %decimals + fmt
    else:
-        raise Exception("Unknown object")
-    print >> fd, "# type: %s" % type
+        raise Exception("Unknown object type")
+    fd.write('# type: %s' %type + '\n')

    for dim in ds.get_dim_name():
        print >> fd, "# dimension: %s" % dim,
@ -469,23 +464,57 @@ def write_ftsv(fd, ds, decimals=7):
        print >> fd

    print >> fd, "# name: %s" % ds.get_name()
-    print >> fd

    # Write data
    m = ds.asarray()
-    if type == 'category':
-        m = m.astype('i')
-
-    y, x = m.shape
-    for j in range(y):
-        for i in range(x):
-            print >> fd, printstr % m[j, i], 
-        print >> fd
+    if isinstance(m, sparse.spmatrix):
+        _write_sparse_elements(fd, m, fmt, sep)
+    else:
+        _write_elements(fd, m, fmt, sep)

    if opened:
        fd.close()

-def read_ftsv(fd):
+def _write_sparse_elements(fd, arr, fmt='%d', sep=None):
+    """Write a matrix as coordinate triplets, one 'row col value' per line.
+
+    Emits the header line '# sp_format: True' followed by a blank line,
+    then one triplet line for every index below csr.size.
+
+    @param fd: Object with a write() method.
+    @param arr: Matrix providing tocsr() (presumably a scipy.sparse matrix).
+    @param fmt: printf-style format for the value field.
+    @param sep: Not used here; the fields are always joined by single spaces.
+    """
+    fd.write('# sp_format: True\n\n')
+    # Row and column indices are written as integers ahead of the value.
+    fmt = '%d %d ' + fmt + '\n'
+    csr = arr.tocsr()
+    # NOTE(review): rowcol()/getdata() appear to be index-based accessors of
+    # the scipy.sparse release this was written against -- confirm they exist
+    # in the installed version.
+    for ii in xrange(csr.size):
+        ir, ic = csr.rowcol(ii)
+        data = csr.getdata(ii)
+        fd.write(fmt % (ir, ic, data))
+
+def _write_elements(fd, arr, fmt='%f', sep='\t'):
+    """Write a 2D array as separated values, one matrix row per line.
+
+    A leading blank line is written first. Every element, including the
+    last one of a row, is followed by sep.
+
+    @param fd: Object with a write() method.
+    @param arr: Array with a two-element shape, indexable as arr[row, col].
+    @param fmt: printf-style format applied to each element.
+    @param sep: String appended after each formatted element.
+    """
+    cell = fmt + sep
+    emit = fd.write
+    emit('\n')
+    n_rows, n_cols = arr.shape
+    for row in range(n_rows):
+        for col in range(n_cols):
+            emit(cell % arr[row, col])
+        emit('\n')
+
+def _read_elements(fd, arr, sep=None):
+    """Fill arr row by row from the remaining lines of fd.
+
+    Each non-blank line is one row. Its fields are split on sep (any
+    whitespace when sep is None) and stored as floats in arr[row, col].
+    Trailing empty fields, such as the one left by the separator that
+    _write_elements puts after the last element, are dropped, and blank
+    lines do not consume a row.
+
+    @param fd: Object with a readline() method, positioned at the first
+    data line.
+    @param arr: Preallocated 2D array that receives the values.
+    @param sep: Field separator handed to str.split().
+    @return: arr, filled in place.
+    """
+    row = 0
+    line = fd.readline()
+    while line:
+        # Remove the line terminator first so it never reaches float() when
+        # an explicit separator is used.
+        values = line.rstrip('\r\n').split(sep)
+        while values and not values[-1].strip():
+            values.pop()
+        if values:
+            for col, val in enumerate(values):
+                arr[row, col] = float(val)
+            row += 1
+        line = fd.readline()
+    return arr
+
+def _read_sparse_elements(fd, arr, sep=None):
+    """Fill arr from 'row col value' triplet lines and return it as csr.
+
+    Every non-blank remaining line of fd must hold exactly three
+    whitespace-separated fields: integer row, integer column and a value
+    that is stored as a float. Blank lines are skipped.
+
+    @param fd: Object with a readline() method, positioned at the first
+    triplet line.
+    @param arr: Matrix supporting arr[i, j] assignment and tocsr().
+    @param sep: Not used here; fields are always split on whitespace.
+    @return: The result of arr.tocsr().
+    @raise ValueError: If a non-blank line does not hold three fields.
+    """
+    line = fd.readline()
+    while line:
+        fields = line.split()
+        # A blank line (for instance at the end of the file) carries no
+        # triplet and must not reach the three-way unpacking below.
+        if fields:
+            i, j, val = fields
+            arr[int(i), int(j)] = float(val)
+        line = fd.readline()
+    return arr.tocsr()
+
+def read_ftsv(fd, sep=None):
    """Read a dataset in fluents tab separated values (ftsv) form and return it.
    
    @param fd: An open file descriptor.
@ -502,7 +531,8 @@ def read_ftsv(fd):
    identifiers = {}
    type = 'dataset'
    name = 'Unnamed dataset'
-    graphtype = 'graph'
+    sp_format = False
+    # graphtype = 'graph'

    # Read header lines from file.
    line = fd.readline()
@ -525,9 +555,19 @@ def read_ftsv(fd):
            
            elif key == 'name':
                name = val
-
-            elif key == 'graphtype':
-                graphtype = val
+            
+            # storage format
+            # if sp_format is True then use coordinate triplets
+            elif key == 'sp_format':
+                if val in ['False', 'false', '0', 'F', 'f',]:
+                    sp_format = False
+                elif val in ['True', 'true', '1', 'T', 't']:
+                    sp_format = True
+                else:
+                    raise ValueError("sp_format: %s not valid " %sp_format)
+            
+            # elif key == 'graphtype':
+            #    graphtype = val

        else:
            break
@ -537,22 +577,20 @@ def read_ftsv(fd):
    dims = [(x, identifiers[x]) for x in dimensions]
    dim_lengths = [len(identifiers[x]) for x in dimensions]

-    # Create matrix
+    # Create matrix and assign element reader
    if type == 'category':
-        matrix = zeros(dim_lengths, dtype=bool)
+        if sp_format:
+            matrix = sparse.lil_matrix(dim_lengths)
+            read_elements = _read_sparse_elements
+        else:
+            matrix = empty(dim_lengths, dtype='i')
+            read_elements = _read_elements
    elif type == 'network':
-        matrix = zeros(dim_lengths)
+        matrix = empty(dim_lengths)
    else:
-        matrix = zeros(dim_lengths)
-
-    line = fd.readline()
-    y = 0
-    while line:
-        values = line.split()
-        for x, v in enumerate(values):
-            matrix[y,x] = float(v)
-        y += 1
-        line = fd.readline()
+        matrix = empty(dim_lengths)
+    
+    matrix = read_elements(fd, matrix, sep)

    # Create dataset of specified type
    if type == 'category':
--- a/fluents/navigator.py
+++ b/fluents/navigator.py
@ -392,7 +392,7 @@ class NavigatorMenu(gtk.Menu):
        ds = self.dataset.copy()
        ds._name = self.dataset._name + ".rsc"
        axis = 1
-        ds._array = ds._array/scipy.expand_dims(ds._array.std(axis), axis)
+        ds._array = ds.asarray()/scipy.expand_dims(ds.asarray().std(axis), axis)
        icon = fluents.icon_factory.get(ds)
        project.data_tree_insert(self.tree_iter, ds.get_name(), ds, None, "black", icon)

@ -401,21 +401,21 @@ class NavigatorMenu(gtk.Menu):
        ds = self.dataset.copy()
        ds._name = self.dataset._name + ".csc"
        axis = 0
-        ds._array = ds._array/scipy.expand_dims(ds._array.std(axis), axis)
+        ds._array = ds.asarray()/scipy.expand_dims(ds.asarray().std(axis), axis)
        icon = fluents.icon_factory.get(ds)
        project.data_tree_insert(self.tree_iter, ds.get_name(), ds, None, "black", icon)

    def on_log(self, item, navigator):
        project = main.project
        try:
-            if not scipy.all(self.dataset._array>0):
+            if not scipy.all(self.dataset.asarray()>0):
                raise ValueError
        except:
            logger.log('warning', 'Datasets needs to be strictly positive for a log transform')
            return
        
        ds = self.dataset.copy()
-        ds._array = scipy.log(ds._array)
+        ds._array = scipy.log(ds.asarray())
        icon = fluents.icon_factory.get(ds)
        ds._name = ds._name + ".log"
        project.data_tree_insert(self.tree_iter, ds.get_name(), ds, None, "black", icon)
--- a/fluents/plots.py
+++ b/fluents/plots.py
@ -305,8 +305,8 @@ class ScatterMarkerPlot(Plot):
        self.ms = s
        x_index = dataset_1[sel_dim][id_1]
        y_index = dataset_2[sel_dim][id_2]
-        self.xaxis_data = dataset_1._array[:, x_index]
-        self.yaxis_data = dataset_2._array[:, y_index]
+        self.xaxis_data = dataset_1.asarray()[:, x_index]
+        self.yaxis_data = dataset_2.asarray()[:, y_index]

        # init draw
        self._selection_line = None
@ -390,8 +390,8 @@ class ScatterPlot(Plot):
            y_index = dataset_2[sel_dim_2][id_2]
        else:
            y_index = dataset_2[sel_dim][id_2]
-        self.xaxis_data = dataset_1._array[:, x_index]
-        self.yaxis_data = dataset_2._array[:, y_index]
+        self.xaxis_data = dataset_1.asarray()[:, x_index]
+        self.yaxis_data = dataset_2.asarray()[:, y_index]

        # init draw
        self.init_draw()
@ -436,7 +436,7 @@ class ScatterPlot(Plot):

    def set_absicca(self, sb):
        self._absi = sb.get_value_as_int() - 1
-        xy = self.dataset_1._array[:,[self._absi, self._ordi]]
+        xy = self.dataset_1.asarray()[:,[self._absi, self._ordi]]
        self.xaxis_data = xy[:,0]
        self.yaxis_data = xy[:,1]
        self.sc._offsets = xy
@ -446,7 +446,7 @@ class ScatterPlot(Plot):
        
    def set_ordinate(self, sb):
        self._ordi = sb.get_value_as_int() - 1
-        xy = self.dataset_1._array[:,[self._absi, self._ordi]]
+        xy = self.dataset_1.asarray()[:,[self._absi, self._ordi]]
        self.xaxis_data = xy[:,0]
        self.yaxis_data = xy[:,1]
        self.sc._offsets = xy