from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros from scipy import sort as array_sort from itertools import izip import shelve import copy import re class Dataset: """The Dataset base class. A Dataset is an n-way array with defined string identifiers across all dimensions. example of use: --- dim_name_rows = 'rows' names_rows = ('row_a','row_b') ids_1 = [dim_name_rows, names_rows] dim_name_cols = 'cols' names_cols = ('col_a','col_b','col_c','col_d') ids_2 = [dim_name_cols, names_cols] Array_X = rand(2,4) data = Dataset(Array_X,(ids_1,ids_2),name="Testing") dim_names = [dim for dim in data] column_identifiers = [id for id in data['cols'].keys()] column_index = [index for index in data['cols'].values()] 'cols' in data -> True --- data = Dataset(rand(10,20)) (generates dims and ids (no links)) """ def __init__(self, array, identifiers=None, name='Unnamed dataset'): self._dims = [] #existing dimensions in this dataset self._map = {} # internal mapping for dataset: identifier <--> index self._name = name self._identifiers = identifiers self._type = 'n' if len(array.shape)==1: array = atleast_2d(asarray(array)) # vectors are column vectors if array.shape[0]==1: array = array.T self.shape = array.shape if identifiers!=None: self._validate_identifiers(identifiers) self._set_identifiers(identifiers, self._all_dims) else: self._identifiers = self._create_identifiers(self.shape, self._all_dims) self._set_identifiers(self._identifiers, self._all_dims) self._array = array def __iter__(self): """Returns an iterator over dimensions of dataset.""" return self._dims.__iter__() def __contains__(self,dim): """Returns True if dim is a dimension name in dataset.""" # return self._dims.__contains__(dim) return self._map.__contains__(dim) def __len__(self): """Returns the number of dimensions in the dataset""" return len(self._map) def __getitem__(self,dim): """Return the identifers along the dimension dim.""" return self._map[dim] def _create_identifiers(self,shape,all_dims): """Creates dimension names and identifier names, and returns identifiers.""" dim_names = ['rows','cols'] ids = [] for axis,n in enumerate(shape): if axis<2: dim_suggestion = dim_names[axis] else: dim_suggestion = 'dim' dim_suggestion = self._suggest_dim_name(dim_suggestion,all_dims) identifier_creation = [str(axis) + "_" + i for i in map(str,range(n))] ids.append((dim_suggestion,identifier_creation)) all_dims.add(dim_suggestion) return ids def _set_identifiers(self, identifiers, all_dims): """Creates internal mapping of identifiers structure.""" for dim, ids in identifiers: pos_map = ReverseDict() if dim not in self._dims: self._dims.append(dim) all_dims.add(dim) else: raise ValueError, "Dimension names must be unique whitin dataset" for pos, id in enumerate(ids): pos_map[id] = pos self._map[dim] = pos_map def _suggest_dim_name(self,dim_name,all_dims): """Suggests a unique name for dim and returns it""" c = 0 new_name = dim_name while new_name in all_dims: new_name = dim_name + "_" + str(c) c+=1 return new_name def asarray(self): """Returns the numeric array (data) of dataset""" return self._array def add_array(self, array): """Adds array as an ArrayType object. A one-dim array is transformed to a two-dim array (row-vector) """ if self.shape!=array.shape: raise ValueError, "Input array must be of similar dimensions as dataset" self._array = atleast_2d(asarray(array)) def get_name(self): """Returns dataset name""" return self._name def get_all_dims(self): """Returns all dimensions in project""" return self._all_dims def get_dim_name(self, axis=None): """Returns dim name for an axis, if no axis is provided it returns a list of dims""" if type(axis)==int: return self._dims[axis] else: return [dim for dim in self] def common_dims(self, ds): """Returns a list of the common dimensions in the two datasets.""" dims = self.get_dim_name() ds_dims = ds.get_dim_name() return [d for d in dims if d in ds_dims] def get_identifiers(self, dim, indices=None,sorted=False): """Returns identifiers along dim, sorted by position (index) is optional. You can optionally provide a list/ndarray of indices to get only the identifiers of a given position. Identifiers are the unique names (strings) for a variable in a given dim. Index (Indices) are the Identifiers position in a matrix in a given dim. """ if indices != None: if len(indices) == 0:# if empty list or empty array return [] if indices != None: # be sure to match intersection #indices = intersect1d(self.get_indices(dim),indices) ids = [self._map[dim].reverse[i] for i in indices] else: if sorted == True: ids = [self._map[dim].reverse[i] for i in array_sort(self._map[dim].values())] else: ids = self._map[dim].keys() return ids def get_indices(self, dim, idents=None): """Returns indices for identifiers along dimension. You can optionally provide a list of identifiers to retrieve a index subset. Identifiers are the unique names (strings) for a variable in a given dim. Index (Indices) are the Identifiers position in a matrix in a given dim. If none of the input identifiers are found an empty index is returned """ if not isinstance(idents, list) and not isinstance(idents, set): raise ValueError("idents needs to be a list/set got: %s" %type(idents)) if idents==None: index = array_sort(self._map[dim].values()) else: index = [self._map[dim][key] for key in idents if self._map[dim].has_key(key)] return asarray(index) def existing_identifiers(self, dim, idents): if not isinstance(idents, list) and not isinstance(idents, set): raise ValueError("idents needs to be a list/set got: %s" %type(idents)) return [key for key in idents if self._map[dim].has_key(key)] def copy(self): """ Returns deepcopy of dataset. """ return copy.deepcopy(self) def _validate_identifiers(self, identifiers): for dim_name, ids in identifiers: if len(set(ids)) != len(ids): raise ValueError("Identifiers not unique in : %s" %dim_name) identifier_shape = [len(i[1]) for i in identifiers] if len(identifier_shape)!=len(self.shape): raise ValueError("Identifier list length must equal array dims") for ni, na in zip(identifier_shape, self.shape): if ni != na: raise ValueError, "Identifier-array mismatch: %s: (idents: %s, array: %s)" %(self._name, ni, na) class CategoryDataset(Dataset): """The category dataset class. A dataset for representing class information as binary matrices (0/1-matrices). There is support for using a less memory demanding, and fast intersection look-ups by representing the binary matrix as a dictionary in each dimension. Always has linked dimension in first dim: ex matrix: . go_term1 go_term2 ... gene_1 gene_2 gene_3 . . . """ def __init__(self, array, identifiers=None, name='C'): Dataset.__init__(self, array, identifiers=identifiers, name=name) self.has_dictlists = False self._type = 'c' def as_dict_lists(self): """Returns data as dict of indices along first dim. ex: data['gene_id'] = ['map0030','map0010', ...] """ data={} for name, ind in self._map[self.get_dim_name(0)].items(): data[name] = self.get_identifiers(self.get_dim_name(1), list(self._array[ind,:].nonzero())) self._dictlists = data self.has_dictlists = True return data def as_selections(self): """Returns data as a list of Selection objects. """ ret_list = [] for cat_name, ind in self._map[self.get_dim_name(1)].items(): ids = self.get_identifiers(self.get_dim_name(0), self._array[:,ind].nonzero()[0]) selection = Selection(cat_name) selection.select(self.get_dim_name(0), ids) ret_list.append(selection) return ret_list class GraphDataset(Dataset): """The graph dataset class. A dataset class for representing graphs using an (weighted) adjacency matrix (restricted to square symmetric matrices) If the library NetworkX is installed, there is support for representing the graph as a NetworkX.Graph, or NetworkX.XGraph structure. """ def __init__(self, array=None, identifiers=None, shape=None, all_dims=[],**kwds): Dataset.__init__(self, array=array, identifiers=identifiers, name='A') self._graph = None self._type = 'g' def asnetworkx(self, nx_type='graph'): dim = self.get_dim_name()[0] ids = self.get_identifiers(dim, sorted=True) adj_mat = self.asarray() G = self._graph_from_adj_matrix(adj_mat, labels=ids) self._graph = G return G def _graph_from_adj_matrix(self, A, labels=None): """Creates a networkx graph class from adjacency (possibly weighted) matrix and ordered labels. nx_type = ['graph',['xgraph']] labels = None, results in string-numbered labels """ try: import networkx as nx except: print "Failed in import of NetworkX" return m, n = A.shape# adjacency matrix must be of type that evals to true/false for neigbours if m!=n: raise IOError, "Adjacency matrix must be square" if A[A[:,0].nonzero()[0][0],0]==1: #unweighted graph G = nx.Graph() else: G = nx.XGraph() if labels==None: # if labels not provided mark vertices with numbers labels = [str(i) for i in range(m)] for nbrs, head in izip(A, labels): for i, nbr in enumerate(nbrs): if nbr: tail = labels[i] if type(G)==nx.XGraph: G.add_edge(head, tail, nbr) else: G.add_edge(head, tail) return G Dataset._all_dims = set() class ReverseDict(dict): """ A dictionary which can lookup values by key, and keys by value. All values and keys must be hashable, and unique. d = ReverseDict((['a',1],['b',2])) print d['a'] --> 1 print d.reverse[1] --> 'a' """ def __init__(self, *args, **kw): dict.__init__(self, *args, **kw) self.reverse = dict([[v,k] for k,v in self.items()]) def __setitem__(self, key, value): dict.__setitem__(self, key, value) try: self.reverse[value] = key except: self.reverse = {value:key} def to_file(filepath,dataset,name=None): """Write dataset to file. A file may contain multiple datasets. append to file by using option mode='a' """ if not name: name = dataset._name data = shelve.open(filepath, flag='c', protocol=2) if data: #we have an append names = data.keys() if name in names: print "Data with name: %s overwritten" %dataset._name sub_data = {'array':dataset._array, 'idents':dataset._identifiers, 'type':dataset._type} data[name] = sub_data data.close() def from_file(filepath): """Read dataset(s) from file """ data = shelve.open(filepath, flag='r') out_data = [] for name in data.keys(): sub_data = data[name] if sub_data['type']=='c': out_data.append(CategoryDataset(sub_data['array'], identifiers=sub_data['idents'], name=name)) elif sub_data['type']=='g': out_data.append(GraphDataset(sub_data['array'], identifiers=sub_data['idents'], name=name)) else: out_data.append(Dataset(sub_data['array'], identifiers=sub_data['idents'], name=name)) return out_data class Selection(dict): """Handles selected identifiers along each dimension of a dataset""" def __init__(self, title='Unnamed Selecton'): self.title = title def __getitem__(self, key): if not self.has_key(key): return None return dict.__getitem__(self, key) def dims(self): return self.keys() def axis_len(self, axis): if self._selection.has_key(axis): return len(self._selection[axis]) return 0 def select(self, axis, labels): self[axis] = labels def write_ftsv(fd, ds): # Write header information if isinstance(ds, CategoryDataset): type = 'category' elif isinstance(ds, GraphDataset): type = 'network' elif isinstance(ds, Dataset): type = 'dataset' else: raise Exception("Unknown object") print >> fd, "# type: %s" % type for dim in ds.get_dim_name(): print >> fd, "# dimension: %s" % dim, for id in ds.get_identifiers(dim, None, True): print >> fd, id, print >> fd print >> fd, "# name: %s" % ds.get_name() print >> fd # Write data m = ds.asarray() if type == 'category': m = m.astype('i') y, x = m.shape for j in range(y): for i in range(x): print >> fd, "%s\t" % m[j, i], print >> fd def read_ftsv(fd): split_re = re.compile('^#\s*(\w+)\s*:\s*(.+)') dimensions = [] identifiers = {} type = 'dataset' name = 'Unnamed dataset' graphtype = 'graph' # Read header lines from file. line = fd.readline() while line: m = split_re.match(line) if m: key, val = m.groups() # The line is on the form; # dimension: dimname id1 id2 id3 ... if key == 'dimension': values = [v.strip() for v in val.split(' ')] dimensions.append(values[0]) identifiers[values[0]] = values[1:] # Read type of dataset. # Should be dataset, category, or network elif key == 'type': type = val elif key == 'name': name = val elif key == 'graphtype': graphtype = val else: break line = fd.readline() # Dimensions in the form [(dim1, [id1, id2, id3 ..) ...] dims = [(x, identifiers[x]) for x in dimensions] dim_lengths = [len(identifiers[x]) for x in dimensions] # Create matrix if type == 'category': matrix = zeros(dim_lengths, dtype=bool) elif type == 'network': matrix = zeros(dim_lengths) else: matrix = zeros(dim_lengths) line = fd.readline() y = 0 while line: values = line.split() for x, v in enumerate(values): matrix[y,x] = float(v) y += 1 line = fd.readline() # Create dataset of specified type if type == 'category': ds = CategoryDataset(matrix, dims, name) elif type == 'network': ds = GraphDataset(matrix, dims, name) else: ds = Dataset(matrix, dims, name) return ds