from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros,empty,sparse,\ where from scipy import sort as array_sort from itertools import izip import shelve import copy import re class Dataset(object): """The Dataset base class. A Dataset is an n-way array with defined string identifiers across all dimensions. example of use: --- dim_name_rows = 'rows' names_rows = ('row_a','row_b') ids_1 = [dim_name_rows, names_rows] dim_name_cols = 'cols' names_cols = ('col_a','col_b','col_c','col_d') ids_2 = [dim_name_cols, names_cols] Array_X = rand(2,4) data = Dataset(Array_X,(ids_1,ids_2),name="Testing") dim_names = [dim for dim in data] column_identifiers = [id for id in data['cols'].keys()] column_index = [index for index in data['cols'].values()] 'cols' in data -> True --- data = Dataset(rand(10,20)) (generates dims and ids (no links)) """ def __init__(self, array, identifiers=None, name='Unnamed dataset'): self._dims = [] #existing dimensions in this dataset self._map = {} # internal mapping for dataset: identifier <--> index self._name = name self._identifiers = identifiers if not isinstance(array, sparse.spmatrix): array = atleast_2d(asarray(array)) # vector are column (array) if array.shape[0] == 1: array = array.T self.shape = array.shape if identifiers != None: self._validate_identifiers(identifiers) self._set_identifiers(identifiers, self._all_dims) else: self._identifiers = self._create_identifiers(self.shape, self._all_dims) self._set_identifiers(self._identifiers, self._all_dims) self._array = array def __iter__(self): """Returns an iterator over dimensions of dataset.""" return self._dims.__iter__() def __contains__(self,dim): """Returns True if dim is a dimension name in dataset.""" # return self._dims.__contains__(dim) return self._map.__contains__(dim) def __len__(self): """Returns the number of dimensions in the dataset""" return len(self._map) def __getitem__(self,dim): """Return the identifers along the dimension dim.""" return self._map[dim] def _create_identifiers(self, shape, all_dims): """Creates dimension names and identifier names, and returns identifiers.""" dim_names = ['rows','cols'] ids = [] for axis, n in enumerate(shape): if axis < 2: dim_suggestion = dim_names[axis] else: dim_suggestion = 'dim' dim_suggestion = self._suggest_dim_name(dim_suggestion, all_dims) identifier_creation = [str(axis) + "_" + i for i in map(str, range(n))] ids.append((dim_suggestion, identifier_creation)) all_dims.add(dim_suggestion) return ids def _set_identifiers(self, identifiers, all_dims): """Creates internal mapping of identifiers structure.""" for dim, ids in identifiers: pos_map = ReverseDict() if dim not in self._dims: self._dims.append(dim) all_dims.add(dim) else: raise ValueError, "Dimension names must be unique whitin dataset" for pos, id in enumerate(ids): pos_map[id] = pos self._map[dim] = pos_map def _suggest_dim_name(self,dim_name,all_dims): """Suggests a unique name for dim and returns it""" c = 0 new_name = dim_name while new_name in all_dims: new_name = dim_name + "_" + str(c) c += 1 return new_name def asarray(self): """Returns the numeric array (data) of dataset""" if isinstance(self._array, sparse.spmatrix): return self._array.toarray() return self._array def set_array(self, array): """Adds array as an ArrayType object. A one-dim array is transformed to a two-dim array (row-vector) """ if not isinstance(array, type(self._array)): raise ValueError("Input array of type: %s does not match existing array type: %s") %(type(array), type(self._array)) if self.shape != array.shape: raise ValueError, "Input array must be of similar dimensions as dataset" self._array = atleast_2d(asarray(array)) def get_name(self): """Returns dataset name""" return self._name def get_all_dims(self): """Returns all dimensions in project""" return self._all_dims def get_dim_name(self, axis=None): """Returns dim name for an axis, if no axis is provided it returns a list of dims""" if type(axis) == int: return self._dims[axis] else: return [dim for dim in self._dims] def common_dims(self, ds): """Returns a list of the common dimensions in the two datasets.""" dims = self.get_dim_name() ds_dims = ds.get_dim_name() return [d for d in dims if d in ds_dims] def get_identifiers(self, dim, indices=None, sorted=False): """Returns identifiers along dim, sorted by position (index) is optional. You can optionally provide a list/ndarray of indices to get only the identifiers of a given position. Identifiers are the unique names (strings) for a variable in a given dim. Index (Indices) are the Identifiers position in a matrix in a given dim. """ if indices != None: if len(indices) == 0:# if empty list or empty array return [] if indices != None: # be sure to match intersection #indices = intersect1d(self.get_indices(dim),indices) ids = [self._map[dim].reverse[i] for i in indices] else: if sorted == True: ids = [self._map[dim].reverse[i] for i in array_sort(self._map[dim].values())] else: ids = self._map[dim].keys() return ids def get_indices(self, dim, idents=None): """Returns indices for identifiers along dimension. You can optionally provide a list of identifiers to retrieve a index subset. Identifiers are the unique names (strings) for a variable in a given dim. Index (Indices) are the Identifiers position in a matrix in a given dim. If none of the input identifiers are found an empty index is returned """ if not isinstance(idents, list) and not isinstance(idents, set): raise ValueError("idents needs to be a list/set got: %s" %type(idents)) if idents == None: index = array_sort(self._map[dim].values()) else: index = [self._map[dim][key] for key in idents if self._map[dim].has_key(key)] return asarray(index) def existing_identifiers(self, dim, idents): """Filters a list of identifiers to find those that are present in the dataset. The most common use of this function is to get a list of identifiers who correspond one to one with the list of indices produced when get_indices is given an identifier list. That is ds.get_indices(dim, idents) and ds.exisiting_identifiers(dim, idents) will have the same order. @param dim: A dimension present in the dataset. @param idents: A list of identifiers along the given dimension. @return: A list of identifiers in the same order as idents, but without elements not present in the dataset. """ if not isinstance(idents, list) and not isinstance(idents, set): raise ValueError("idents needs to be a list/set got: %s" %type(idents)) return [key for key in idents if self._map[dim].has_key(key)] def copy(self): """ Returns deepcopy of dataset. """ return copy.deepcopy(self) def subdata(self, dim, idents): """Returns a new dataset based on dimension and given identifiers. """ ds = self.copy() indices = ds.get_indices(dim, idents) ax = [i for i, name in enumerate(ds._dims) if name == dim][0] subarr = ds._array.take(indices, ax) for k, v in ds._map[dim].items(): if k not in idents: del ds._map[dim][k] ds.shape = tuple(len(ds._map[d]) for d in ds._dims) ds.set_array(subarr) return ds def transpose(self): """Returns a copy of transpose of a dataset. As for the moment: only support for 2D-arrays. """ assert(len(self.shape) == 2) ds = self.copy() ds._array = ds._array.T ds._dims.reverse() ds.shape = ds._array.shape return ds def _validate_identifiers(self, identifiers): for dim_name, ids in identifiers: if len(set(ids)) != len(ids): raise ValueError("Identifiers not unique in : %s" %dim_name) identifier_shape = [len(i[1]) for i in identifiers] if len(identifier_shape) != len(self.shape): raise ValueError("Identifier list length must equal array dims") for ni, na in zip(identifier_shape, self.shape): if ni != na: raise ValueError, "Identifier-array mismatch: %s: (idents: %s, array: %s)" %(self._name, ni, na) class CategoryDataset(Dataset): """The category dataset class. A dataset for representing class information as binary matrices (0/1-matrices). There is support for using a less memory demanding, sparse format. The prefered (default) format for a category dataset is the compressed sparse row format (csr) Always has linked dimension in first dim: ex matrix: . go_term1 go_term2 ... gene_1 gene_2 gene_3 . . . """ def __init__(self, array, identifiers=None, name='C'): Dataset.__init__(self, array, identifiers=identifiers, name=name) def as_spmatrix(self): if isinstance(self._array, sparse.spmatrix): return self._array else: arr = self.asarray() return sparse.csr_matrix(arr.astype('i')) def to_spmatrix(self): if isinstance(self._array, sparse.spmatrix): self._array = self._array.tocsr() else: self._array = sparse.scr_matrix(self._array) def as_dictlists(self): """Returns data as dict of identifiers along first dim. ex: data['gene_1'] = ['map0030','map0010', ...] fixme: Deprecated? """ data = {} for name, ind in self._map[self.get_dim_name(0)].items(): if isinstance(self._array, ndarray): indices = self._array[ind,:].nonzero()[0] elif isinstance(self._array, sparse.spmatrix): if not isinstance(self._array, sparse.csr_matrix): array = self._array.tocsr() else: array = self._array indices = array[ind,:].indices if len(indices) == 0: # should we allow categories with no members? continue data[name] = self.get_identifiers(self.get_dim_name(1), indices) self._dictlists = data return data def as_selections(self): """Returns data as a list of Selection objects. The list of selections is not ordered (sorted) by any means. """ ret_list = [] for cat_name, ind in self._map[self.get_dim_name(1)].items(): if isinstance(self._array, sparse.spmatrix): if not isinstance(self._array, sparse.csc_matrix): self._array = self._array.tocsc() indices = self._array[:,ind].indices else: indices = self._array[:,ind].nonzero()[0] if len(indices) == 0: continue ids = self.get_identifiers(self.get_dim_name(0), indices) selection = Selection(cat_name) selection.select(self.get_dim_name(0), ids) ret_list.append(selection) return ret_list class GraphDataset(Dataset): """The graph dataset class. A dataset class for representing graphs. The constructor may use an incidence matrix (possibly sparse) or (if networkx installed) a networkx.(X)Graph structure. If the networkx library is installed, there is support for representing the graph as a networkx.Graph, or networkx.XGraph structure. """ def __init__(self, input, identifiers=None, name='A', nodepos = None): if isinstance(input, sparse.spmatrix): arr = input else: try: arr = asarray(input) except: raise ValueError("Could not identify input") Dataset.__init__(self, array=arr, identifiers=identifiers, name=name) self._graph = None self.nodepos = nodepos def as_spmatrix(self): if isinstance(self._array, sparse.spmatrix): return self._array else: arr = self.asarray() return sparse.csr_matrix(arr.astype('i')) def to_spmatrix(self): if isinstance(self._array, sparse.spmatrix): self._array = self._array.tocsr() else: self._array = sparse.scr_matrix(self._array) def asnetworkx(self): if self._graph != None: return self._graph dim0, dim1 = self.get_dim_name() node_ids = self.get_identifiers(dim0, sorted=True) edge_ids = self.get_identifiers(dim1, sorted=True) G = self._graph_from_incidence_matrix(self._array, node_ids=node_ids, edge_ids=edge_ids) self._graph = G return G def from_networkx(cls, G, node_dim, edge_dim, sp_format=True): """Create graph dataset from networkx graph. When G is a Graph/Digraph edge identifiers will be created, else (XGraoh/XDigraph) it is assumed that edge attributes are the edge identifiers. """ import networkx as nx n = G.number_of_nodes() m = G.number_of_edges() if isinstance(G, nx.DiGraph): G = nx.XDiGraph(G) elif isinstance(G, nx.Graph): G = nx.XGraph(G) edge_ids = [e[2] for e in G.edges()] node_ids = map(str, G.nodes()) n2ind = {} for ind, node in enumerate(node_ids): n2ind[node] = ind if sp_format: I = sparse.lil_matrix((n, m)) else: I = zeros((m, n), dtype='i') for i, (h, t, eid) in enumerate(G.edges()): if eid != None: edge_ids[i] = eid else: edge_ids[i] = 'e_' + str(i) hind = n2ind[str(h)] tind = n2ind[str(t)] I[hind, i] = 1 if G.is_directed(): I[tind, i] = -1 else: I[tind, i] = 1 idents = [[node_dim, node_ids], [edge_dim, edge_ids]] if G.name != '': name = G.name else: name = 'A' ds = GraphDataset(I, idents, name) return ds from_networkx = classmethod(from_networkx) def _incidence2adjacency(self, I): """Incidence to adjacency matrix. I*I.T - eye(n)? """ raise NotImplementedError def _graph_from_incidence_matrix(self, I, node_ids, edge_ids): """Creates a networkx graph class from incidence (possibly weighted) matrix and ordered labels. labels = None, results in string-numbered labels """ try: import networkx as nx except: print "Failed in import of NetworkX" return None m, n = I.shape assert(m == len(node_ids)) assert(n == len(edge_ids)) weights = [] directed = False G = nx.XDiGraph(name=self._name) if isinstance(I, sparse.spmatrix): I = I.tocsr() for ename, col in izip(edge_ids, I.T): if isinstance(I, sparse.spmatrix): node_ind = col.indices w1, w2 = col.data else: node_ind = where(col != 0)[0] w1, w2 = col[node_ind] node1 = node_ids[node_ind[0]] node2 = node_ids[node_ind[1]] if w1 < 0: # w1 is tail directed = True assert(w2 > 0 and (w1 + w2) == 0) G.add_edge(node2, node1, ename) weights.append(w2) else: #w2 is tail or graph is undirected assert(w1 > 0) if w2 < 0: directed = True G.add_edge(node1, node2, ename) weights.append(w1) if not directed: G = G.to_undirected() return G, asarray(weights) Dataset._all_dims = set() class ReverseDict(dict): """A dictionary which can lookup values by key, and keys by value. All values and keys must be hashable, and unique. example: >>d = ReverseDict((['a',1],['b',2])) >>print d['a'] --> 1 >>print d.reverse[1] --> 'a' """ def __init__(self, *args, **kw): dict.__init__(self, *args, **kw) self.reverse = dict([[v, k] for k, v in self.items()]) def __setitem__(self, key, value): dict.__setitem__(self, key, value) try: self.reverse[value] = key except: self.reverse = {value:key} class Selection(dict): """Handles selected identifiers along each dimension of a dataset""" def __init__(self, title='Unnamed Selecton'): self.title = title def __getitem__(self, key): if not self.has_key(key): return None return dict.__getitem__(self, key) def dims(self): return self.keys() def axis_len(self, axis): if self._selection.has_key(axis): return len(self._selection[axis]) return 0 def select(self, axis, labels): self[axis] = labels def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True): """Writes a dataset in fluents tab separated values (ftsv) form. @param fd: An open file descriptor to the output file. @param ds: The dataset to be written. @param decimals: Number of decimals, only supported for dataset. @param fmt: String formating The function handles datasets of these classes: Dataset, CategoryDataset and GraphDataset """ opened = False if isinstance(fd, str): fd = open(fd, 'w') opened = True # Write header information if isinstance(ds, CategoryDataset): type = 'category' if fmt == None: fmt = '%d' elif isinstance(ds, GraphDataset): type = 'network' if fmt == None: fmt = '%d' elif isinstance(ds, Dataset): type = 'dataset' if fmt == None: fmt = '%%.%df' % decimals else: fmt = '%%.%d' %decimals + fmt else: raise Exception("Unknown object type") fd.write('# type: %s' %type + '\n') for dim in ds.get_dim_name(): fd.write("# dimension: %s" % dim) for ident in ds.get_identifiers(dim, sorted=True): fd.write(" " + ident) fd.write("\n") fd.write("# name: %s" % ds.get_name() + '\n') # xy-node-positions if type == 'network' and ds.nodepos != None: fd.write("# nodepos:") node_dim = ds.get_dim_name(0) for ident in ds.get_identifiers(node_dim, sorted=True): fd.write(" %s,%s" %ds.nodepos[ident]) fd.write("\n") # Write data if hasattr(ds, "as_spmatrix") and sp_format == True: m = ds.as_spmatrix() else: m = ds.asarray() if isinstance(m, sparse.spmatrix): _write_sparse_elements(fd, m, fmt, sep) else: _write_elements(fd, m, fmt, sep) if opened: fd.close() def read_ftsv(fd, sep=None): """Read a dataset in fluents tab separated values (ftsv) form and return it. @param fd: An open file descriptor. @return: A Dataset, CategoryDataset or GraphDataset depending on the information read. """ opened = False if isinstance(fd, str): fd = open(fd) opened = True split_re = re.compile('^#\s*(\w+)\s*:\s*(.+)') dimensions = [] identifiers = {} type = 'dataset' name = 'Unnamed dataset' sp_format = False nodepos = None # graphtype = 'graph' # Read header lines from file. line = fd.readline() while line: m = split_re.match(line) if m: key, val = m.groups() # The line is on the form; # dimension: dimname id1 id2 id3 ... if key == 'dimension': values = [v.strip() for v in val.split(' ')] dimensions.append(values[0]) identifiers[values[0]] = values[1:] # Read type of dataset. # Should be dataset, category, or network elif key == 'type': type = val elif key == 'name': name = val # storage format # if sp_format is True then use coordinate triplets elif key == 'sp_format': if val in ['False', 'false', '0', 'F', 'f',]: sp_format = False elif val in ['True', 'true', '1', 'T', 't']: sp_format = True else: raise ValueError("sp_format: %s not valid " %sp_format) elif key == 'nodepos': node_dim = dimensions[0] idents = identifiers[node_dim] nodepos = {} xys = val.split(" ") for node_id, xy in zip(idents, xys): x, y = map(float, xy.split(",")) nodepos[node_id] = (x, y) else: break line = fd.readline() # Dimensions in the form [(dim1, [id1, id2, id3 ..) ...] dims = [(x, identifiers[x]) for x in dimensions] dim_lengths = [len(identifiers[x]) for x in dimensions] # Create matrix and assign element reader if type == 'category': if sp_format: matrix = sparse.lil_matrix(dim_lengths) else: matrix = empty(dim_lengths, dtype='i') else: if sp_format: matrix = sparse.lil_matrix(dim_lengths) else: matrix = empty(dim_lengths) if sp_format: matrix = _read_sparse_elements(fd, matrix) else: matrix = _read_elements(fd, matrix) # Create dataset of specified type if type == 'category': ds = CategoryDataset(matrix, dims, name) elif type == 'network': ds = GraphDataset(matrix, dims, name=name, nodepos=nodepos) else: ds = Dataset(matrix, dims, name) if opened: fd.close() return ds def _write_sparse_elements(fd, arr, fmt='%d', sep=None): """ Sparse coordinate format.""" fd.write('# sp_format: True\n\n') fmt = '%d %d ' + fmt + '\n' csr = arr.tocsr() for ii in xrange(csr.size): ir, ic = csr.rowcol(ii) data = csr.getdata(ii) fd.write(fmt % (ir, ic, data)) def _write_elements(fd, arr, fmt='%f', sep='\t'): """Standard value separated format.""" fmt = fmt + sep fd.write('\n') y, x = arr.shape for j in range(y): for i in range(x): fd.write(fmt %arr[j, i]) fd.write('\n') def _read_elements(fd, arr, sep=None): line = fd.readline() i = 0 while line: values = line.split(sep) for j, val in enumerate(values): arr[i,j] = float(val) i += 1 line = fd.readline() return arr def _read_sparse_elements(fd, arr, sep=None): line = fd.readline() while line: i, j, val = line.split() arr[int(i),int(j)] = float(val) line = fd.readline() return arr.tocsr()