from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros,empty,sparse from scipy import sort as array_sort from itertools import izip import shelve import copy import re class Dataset: """The Dataset base class. A Dataset is an n-way array with defined string identifiers across all dimensions. example of use: --- dim_name_rows = 'rows' names_rows = ('row_a','row_b') ids_1 = [dim_name_rows, names_rows] dim_name_cols = 'cols' names_cols = ('col_a','col_b','col_c','col_d') ids_2 = [dim_name_cols, names_cols] Array_X = rand(2,4) data = Dataset(Array_X,(ids_1,ids_2),name="Testing") dim_names = [dim for dim in data] column_identifiers = [id for id in data['cols'].keys()] column_index = [index for index in data['cols'].values()] 'cols' in data -> True --- data = Dataset(rand(10,20)) (generates dims and ids (no links)) """ def __init__(self, array, identifiers=None, name='Unnamed dataset'): self._dims = [] #existing dimensions in this dataset self._map = {} # internal mapping for dataset: identifier <--> index self._name = name self._identifiers = identifiers if not isinstance(array, sparse.spmatrix): array = atleast_2d(asarray(array)) # vector are column (array) if array.shape[0] == 1: array = array.T self.shape = array.shape if identifiers != None: self._validate_identifiers(identifiers) self._set_identifiers(identifiers, self._all_dims) else: self._identifiers = self._create_identifiers(self.shape, self._all_dims) self._set_identifiers(self._identifiers, self._all_dims) self._array = array def __iter__(self): """Returns an iterator over dimensions of dataset.""" return self._dims.__iter__() def __contains__(self,dim): """Returns True if dim is a dimension name in dataset.""" # return self._dims.__contains__(dim) return self._map.__contains__(dim) def __len__(self): """Returns the number of dimensions in the dataset""" return len(self._map) def __getitem__(self,dim): """Return the identifers along the dimension dim.""" return self._map[dim] def _create_identifiers(self, shape, all_dims): """Creates dimension names and identifier names, and returns identifiers.""" dim_names = ['rows','cols'] ids = [] for axis, n in enumerate(shape): if axis < 2: dim_suggestion = dim_names[axis] else: dim_suggestion = 'dim' dim_suggestion = self._suggest_dim_name(dim_suggestion, all_dims) identifier_creation = [str(axis) + "_" + i for i in map(str, range(n))] ids.append((dim_suggestion, identifier_creation)) all_dims.add(dim_suggestion) return ids def _set_identifiers(self, identifiers, all_dims): """Creates internal mapping of identifiers structure.""" for dim, ids in identifiers: pos_map = ReverseDict() if dim not in self._dims: self._dims.append(dim) all_dims.add(dim) else: raise ValueError, "Dimension names must be unique whitin dataset" for pos, id in enumerate(ids): pos_map[id] = pos self._map[dim] = pos_map def _suggest_dim_name(self,dim_name,all_dims): """Suggests a unique name for dim and returns it""" c = 0 new_name = dim_name while new_name in all_dims: new_name = dim_name + "_" + str(c) c += 1 return new_name def asarray(self): """Returns the numeric array (data) of dataset""" if isinstance(self._array, sparse.spmatrix): return self._array.toarray() return self._array def set_array(self, array): """Adds array as an ArrayType object. A one-dim array is transformed to a two-dim array (row-vector) """ if not isinstance(array, type(self._array)): raise ValueError("Input array of type: %s does not match existing array type: %s") %(type(array), type(self._array)) if self.shape != array.shape: raise ValueError, "Input array must be of similar dimensions as dataset" self._array = atleast_2d(asarray(array)) def get_name(self): """Returns dataset name""" return self._name def get_all_dims(self): """Returns all dimensions in project""" return self._all_dims def get_dim_name(self, axis=None): """Returns dim name for an axis, if no axis is provided it returns a list of dims""" if type(axis) == int: return self._dims[axis] else: return [dim for dim in self._dims] def common_dims(self, ds): """Returns a list of the common dimensions in the two datasets.""" dims = self.get_dim_name() ds_dims = ds.get_dim_name() return [d for d in dims if d in ds_dims] def get_identifiers(self, dim, indices=None, sorted=False): """Returns identifiers along dim, sorted by position (index) is optional. You can optionally provide a list/ndarray of indices to get only the identifiers of a given position. Identifiers are the unique names (strings) for a variable in a given dim. Index (Indices) are the Identifiers position in a matrix in a given dim. """ if indices != None: if len(indices) == 0:# if empty list or empty array return [] if indices != None: # be sure to match intersection #indices = intersect1d(self.get_indices(dim),indices) ids = [self._map[dim].reverse[i] for i in indices] else: if sorted == True: ids = [self._map[dim].reverse[i] for i in array_sort(self._map[dim].values())] else: ids = self._map[dim].keys() return ids def get_indices(self, dim, idents=None): """Returns indices for identifiers along dimension. You can optionally provide a list of identifiers to retrieve a index subset. Identifiers are the unique names (strings) for a variable in a given dim. Index (Indices) are the Identifiers position in a matrix in a given dim. If none of the input identifiers are found an empty index is returned """ if not isinstance(idents, list) and not isinstance(idents, set): raise ValueError("idents needs to be a list/set got: %s" %type(idents)) if idents == None: index = array_sort(self._map[dim].values()) else: index = [self._map[dim][key] for key in idents if self._map[dim].has_key(key)] return asarray(index) def existing_identifiers(self, dim, idents): """Filters a list of identifiers to find those that are present in the dataset. The most common use of this function is to get a list of identifiers who correspond one to one with the list of indices produced when get_indices is given an identifier list. That is ds.get_indices(dim, idents) and ds.exisiting_identifiers(dim, idents) will have the same order. @param dim: A dimension present in the dataset. @param idents: A list of identifiers along the given dimension. @return: A list of identifiers in the same order as idents, but without elements not present in the dataset. """ if not isinstance(idents, list) and not isinstance(idents, set): raise ValueError("idents needs to be a list/set got: %s" %type(idents)) return [key for key in idents if self._map[dim].has_key(key)] def copy(self): """ Returns deepcopy of dataset. """ return copy.deepcopy(self) def transpose(self): """Returns a copy of transpose of a dataset. As for the moment: only support for 2D-arrays. """ assert(len(self.shape) == 2) ds = self.copy() ds._array = ds._array.T ds._dims.reverse() ds.shape = ds._array.shape return ds def _validate_identifiers(self, identifiers): for dim_name, ids in identifiers: if len(set(ids)) != len(ids): raise ValueError("Identifiers not unique in : %s" %dim_name) identifier_shape = [len(i[1]) for i in identifiers] if len(identifier_shape) != len(self.shape): raise ValueError("Identifier list length must equal array dims") for ni, na in zip(identifier_shape, self.shape): if ni != na: raise ValueError, "Identifier-array mismatch: %s: (idents: %s, array: %s)" %(self._name, ni, na) class CategoryDataset(Dataset): """The category dataset class. A dataset for representing class information as binary matrices (0/1-matrices). There is support for using a less memory demanding, sparse format. The prefered (default) format for a category dataset is the compressed sparse row format (csr) Always has linked dimension in first dim: ex matrix: . go_term1 go_term2 ... gene_1 gene_2 gene_3 . . . """ def __init__(self, array, identifiers=None, name='C'): Dataset.__init__(self, array, identifiers=identifiers, name=name) def as_dict_lists(self): """Returns data as dict of identifiers along first dim. ex: data['gene_1'] = ['map0030','map0010', ...] fixme: Deprecated? """ data = {} for name, ind in self._map[self.get_dim_name(0)].items(): if isinstance(self._array, ndarray): indices = self._array[ind,:].nonzero()[0] elif isinstance(self._array, sparse.spmatrix): if not isinstance(self._array, sparse.csr_matrix): array = self._array.tocsr() else: array = self._array indices = array[ind,:].indices if len(indices) == 0: # should we allow categories with no members? continue data[name] = self.get_identifiers(self.get_dim_name(1), indices) self._dictlists = data return data def as_selections(self): """Returns data as a list of Selection objects. The list of selections is not ordered (sorted) by any means. """ ret_list = [] for cat_name, ind in self._map[self.get_dim_name(1)].items(): if isinstance(self._array, sparse.spmatrix): if not isinstance(self._array, sparse.csc_matrix): self._array = self._array.tocsc() indices = self._array[:,ind].indices else: indices = self._array[:,ind].nonzero()[0] if len(indices) == 0: continue ids = self.get_identifiers(self.get_dim_name(0), indices) selection = Selection(cat_name) selection.select(self.get_dim_name(0), ids) ret_list.append(selection) return ret_list class GraphDataset(Dataset): """The graph dataset class. A dataset class for representing graphs using an (weighted) adjacency matrix (restricted to square symmetric matrices) If the library NetworkX is installed, there is support for representing the graph as a NetworkX.Graph, or NetworkX.XGraph structure. """ def __init__(self, array, identifiers=None, name='A'): Dataset.__init__(self, array=array, identifiers=identifiers, name=name) self._graph = None self._pos = None def asnetworkx(self, nx_type='graph'): dim = self.get_dim_name()[0] ids = self.get_identifiers(dim, sorted=True) adj_mat = self.asarray() G = self._graph_from_adj_matrix(adj_mat, labels=ids) self._graph = G return G def _graph_from_adj_matrix(self, A, labels=None): """Creates a networkx graph class from adjacency (possibly weighted) matrix and ordered labels. nx_type = ['graph',['xgraph']] labels = None, results in string-numbered labels """ try: import networkx as nx except: print "Failed in import of NetworkX" return None m, n = A.shape # adjacency matrix must be of type that evals to true/false for neigbours if m != n: raise IOError, "Adjacency matrix must be square" if A[A[:,0].nonzero()[0][0],0] == 1: #unweighted graph G = nx.Graph() else: G = nx.XGraph() if labels == None: # if labels not provided mark vertices with numbers labels = [str(i) for i in range(m)] for nbrs, head in izip(A, labels): for i, nbr in enumerate(nbrs): if nbr: tail = labels[i] if type(G)==nx.XGraph: G.add_edge(head, tail, nbr) else: G.add_edge(head, tail) return G Dataset._all_dims = set() class ReverseDict(dict): """ A dictionary which can lookup values by key, and keys by value. All values and keys must be hashable, and unique. d = ReverseDict((['a',1],['b',2])) print d['a'] --> 1 print d.reverse[1] --> 'a' """ def __init__(self, *args, **kw): dict.__init__(self, *args, **kw) self.reverse = dict([[v, k] for k, v in self.items()]) def __setitem__(self, key, value): dict.__setitem__(self, key, value) try: self.reverse[value] = key except: self.reverse = {value:key} class Selection(dict): """Handles selected identifiers along each dimension of a dataset""" def __init__(self, title='Unnamed Selecton'): self.title = title def __getitem__(self, key): if not self.has_key(key): return None return dict.__getitem__(self, key) def dims(self): return self.keys() def axis_len(self, axis): if self._selection.has_key(axis): return len(self._selection[axis]) return 0 def select(self, axis, labels): self[axis] = labels def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None): """Writes a dataset in fluents tab separated values (ftsv) form. @param fd: An open file descriptor to the output file. @param ds: The dataset to be written. @param decimals: Number of decimals, only supported for dataset. @param fmt: String formating The function handles datasets of these classes: Dataset, CategoryDataset and GraphDataset """ opened = False if isinstance(fd, str): fd = open(fd, 'w') opened = True # Write header information if isinstance(ds, CategoryDataset): type = 'category' if fmt == None: fmt = '%d' elif isinstance(ds, GraphDataset): type = 'network' if fmt == None: fmt = '%d' elif isinstance(ds, Dataset): type = 'dataset' if fmt == None: fmt = '%%.%df' % decimals else: fmt = '%%.%d' %decimals + fmt else: raise Exception("Unknown object type") fd.write('# type: %s' %type + '\n') for dim in ds.get_dim_name(): print >> fd, "# dimension: %s" % dim, for id in ds.get_identifiers(dim, None, True): print >> fd, id, print >> fd print >> fd, "# name: %s" % ds.get_name() # Write data m = ds.asarray() if isinstance(m, sparse.spmatrix): _write_sparse_elements(fd, m, fmt, sep) else: _write_elements(fd, m, fmt, sep) if opened: fd.close() def _write_sparse_elements(fd, arr, fmt='%d', sep=None): """ Sparse coordinate format.""" fd.write('# sp_format: True\n\n') fmt = '%d %d ' + fmt + '\n' csr = arr.tocsr() for ii in xrange(csr.size): ir, ic = csr.rowcol(ii) data = csr.getdata(ii) fd.write(fmt % (ir, ic, data)) def _write_elements(fd, arr, fmt='%f', sep='\t'): """Standard value separated format.""" fmt = fmt + sep fd.write('\n') y, x = arr.shape for j in range(y): for i in range(x): fd.write(fmt %arr[j, i]) fd.write('\n') def _read_elements(fd, arr, sep=None): line = fd.readline() i = 0 while line: values = line.split(sep) for j, val in enumerate(values): arr[i,j] = float(val) i += 1 line = fd.readline() return arr def _read_sparse_elements(fd, arr, sep=None): line = fd.readline() while line: i, j, val = line.split() arr[int(i),int(j)] = float(val) line = fd.readline() return arr.tocsr() def read_ftsv(fd, sep=None): """Read a dataset in fluents tab separated values (ftsv) form and return it. @param fd: An open file descriptor. @return: A Dataset, CategoryDataset or GraphDataset depending on the information read. """ opened = False if isinstance(fd, str): fd = open(fd) opened = True split_re = re.compile('^#\s*(\w+)\s*:\s*(.+)') dimensions = [] identifiers = {} type = 'dataset' name = 'Unnamed dataset' sp_format = False # graphtype = 'graph' # Read header lines from file. line = fd.readline() while line: m = split_re.match(line) if m: key, val = m.groups() # The line is on the form; # dimension: dimname id1 id2 id3 ... if key == 'dimension': values = [v.strip() for v in val.split(' ')] dimensions.append(values[0]) identifiers[values[0]] = values[1:] # Read type of dataset. # Should be dataset, category, or network elif key == 'type': type = val elif key == 'name': name = val # storage format # if sp_format is True then use coordinate triplets elif key == 'sp_format': if val in ['False', 'false', '0', 'F', 'f',]: sp_format = False elif val in ['True', 'true', '1', 'T', 't']: sp_format = True else: raise ValueError("sp_format: %s not valid " %sp_format) # elif key == 'graphtype': # graphtype = val else: break line = fd.readline() # Dimensions in the form [(dim1, [id1, id2, id3 ..) ...] dims = [(x, identifiers[x]) for x in dimensions] dim_lengths = [len(identifiers[x]) for x in dimensions] # Create matrix and assign element reader if type == 'category': if sp_format: matrix = sparse.lil_matrix(dim_lengths) else: matrix = empty(dim_lengths, dtype='i') elif type == 'network': matrix = empty(dim_lengths) else: matrix = empty(dim_lengths) if sp_format: matrix = _read_sparse_elements(fd, matrix) else: matrix = _read_elements(fd, matrix) # Create dataset of specified type if type == 'category': ds = CategoryDataset(matrix, dims, name) elif type == 'network': ds = GraphDataset(matrix, dims, name) else: ds = Dataset(matrix, dims, name) if opened: fd.close() return ds