2006-08-31 12:04:19 +02:00
|
|
|
from scipy import ndarray,atleast_2d,asarray
|
2006-08-01 11:45:53 +02:00
|
|
|
from scipy import sort as array_sort
|
|
|
|
from itertools import izip
|
2006-08-13 13:35:12 +02:00
|
|
|
import shelve
|
2006-04-17 00:57:50 +02:00
|
|
|
|
|
|
|
class Dataset:
    """The Dataset base class.

    A Dataset is an n-way array with defined string identifiers across
    all dimensions.

    example of use:

    ---
    dim_name_rows = 'rows'
    names_rows = ('row_a','row_b')
    ids_1 = [dim_name_rows, names_rows]

    dim_name_cols = 'cols'
    names_cols = ('col_a','col_b','col_c','col_d')
    ids_2 = [dim_name_cols, names_cols]

    Array_X = rand(2,4)
    data = Dataset(Array_X,(ids_1,ids_2),name="Testing")

    dim_names = [dim for dim in data]

    column_identifiers = [id for id in data['cols'].keys()]
    column_index = [index for index in data['cols'].values()]

    'cols' in data -> True

    ---
    data = Dataset(rand(10,20)) (generates dims and ids (no links))
    """

    # Dimension names used by all datasets; shared through the class so that
    # generated dimension names stay unique across datasets.  The module also
    # rebinds this attribute to an empty set after the class definitions.
    _all_dims = set()

    def __init__(self, array, identifiers=None, name='Unnamed dataset'):
        """Wrap ``array`` and register its identifiers.

        array       -- an ndarray; anything else raises ValueError.
        identifiers -- optional sequence of (dim_name, ids) pairs, one per
                       axis.  When omitted, names are generated.
        name        -- dataset name.

        Raises ValueError if ``array`` is not an ndarray or if a dimension
        name occurs twice within ``identifiers``.
        """
        self._dims = []  # existing dimensions in this dataset
        self._map = {}  # internal mapping for dataset: identifier <--> index
        self._name = name
        self._identifiers = identifiers
        self._type = 'n'

        if not isinstance(array, ndarray):
            raise ValueError("Array input must be of type ndarray")

        array = atleast_2d(asarray(array))
        # vectors are column vectors
        if array.shape[0] == 1:
            array = array.T
        self.shape = array.shape

        # ``is None`` rather than ``!= None``: identifiers may be any
        # sequence type, including ones that compare element-wise.
        if identifiers is None:
            self._identifiers = self._create_identifiers(self.shape,
                                                         self._all_dims)
        self._set_identifiers(self._identifiers, self._all_dims)
        self._array = array

    def __iter__(self):
        """Returns an iterator over dimensions of dataset."""
        return iter(self._dims)

    def __contains__(self, dim):
        """Returns True if dim is a dimension name in dataset."""
        return dim in self._map

    def __len__(self):
        """Returns the number of dimensions in the dataset"""
        return len(self._map)

    def __getitem__(self, dim):
        """Return the identifiers along the dimension dim."""
        return self._map[dim]

    def _create_identifiers(self, shape, all_dims):
        """Creates dimension names and identifier names, and returns
        identifiers.

        The first two axes are suggested as 'rows' and 'cols', any further
        axis as 'dim'; each suggestion is made unique against ``all_dims``
        and then added to it.  Identifiers are named '<axis>_<position>'.
        """
        dim_names = ['rows', 'cols']
        ids = []
        for axis, n in enumerate(shape):
            if axis < 2:
                suggestion = dim_names[axis]
            else:
                suggestion = 'dim'
            suggestion = self._suggest_dim_name(suggestion, all_dims)
            generated = ["%d_%d" % (axis, i) for i in range(n)]
            ids.append((suggestion, generated))
            all_dims.add(suggestion)
        return ids

    def _set_identifiers(self, identifiers, all_dims):
        """Creates internal mapping of identifiers structure.

        Raises ValueError when a dimension name is repeated.
        """
        for dim, ids in identifiers:
            if dim in self._dims:
                raise ValueError("Dimension names must be unique whitin dataset")
            self._dims.append(dim)
            all_dims.add(dim)

            pos_map = ReverseDict()
            for pos, ident in enumerate(ids):
                pos_map[ident] = pos
            self._map[dim] = pos_map

    def _suggest_dim_name(self, dim_name, all_dims):
        """Suggests a unique name for dim and returns it"""
        c = 0
        new_name = dim_name
        while new_name in all_dims:
            new_name = dim_name + "_" + str(c)
            c += 1
        return new_name

    def asarray(self):
        """Returns the numeric array (data) of dataset"""
        return self._array

    def add_array(self, array):
        """Replaces the data array of the dataset.

        The input is converted with atleast_2d(asarray(...)) and must then
        have the same shape as the dataset, otherwise ValueError is raised.
        """
        # Convert before comparing so that array-likes without a ``shape``
        # attribute are handled instead of failing with AttributeError.
        array = atleast_2d(asarray(array))
        if self.shape != array.shape:
            raise ValueError("Input array must be of similar dimensions as dataset")
        self._array = array

    def get_name(self):
        """Returns dataset name"""
        return self._name

    def get_all_dims(self):
        """Returns all dimensions in project"""
        return self._all_dims

    def get_dim_name(self, axis=None):
        """Returns dim name for an axis, if no axis is provided it
        returns a list of dims"""
        if isinstance(axis, int):
            return self._dims[axis]
        return list(self)

    def get_identifiers(self, dim, indices=None, sorted=False):
        """Returns identifiers along dim, sorted by position (index)
        is optional.

        You can optionally provide a list/ndarray of indices to get
        only the identifiers of a given position.

        Identifiers are the unique names (strings) for a variable in a
        given dim. Index (Indices) are the Identifiers position in a
        matrix in a given dim.
        """
        id_at = self._map[dim].reverse
        # Identity test: ``indices != None`` is evaluated element-wise for
        # ndarrays and cannot be used as a condition.  Empty lists and empty
        # arrays both yield an empty result here.
        if indices is not None:
            return [id_at[i] for i in indices]
        if sorted:
            # NOTE: the ``sorted`` parameter shadows the builtin in this
            # method, hence the array sort.
            positions = array_sort(list(self._map[dim].values()))
            return [id_at[i] for i in positions]
        return list(self._map[dim].keys())

    def get_indices(self, dim, idents=None):
        """Returns indices for identifiers along dimension.

        You can optionally provide a list of identifiers to retrieve a
        index subset.

        Identifiers are the unique names (strings) for a variable in a
        given dim. Index (Indices) are the Identifiers position in a
        matrix in a given dim. If none of the input identifiers are
        found an empty index is returned
        """
        pos_of = self._map[dim]
        if idents is None:
            index = sorted(pos_of.values())
        else:
            # identifiers unknown to this dimension are skipped silently
            index = [pos_of[key] for key in idents if key in pos_of]
        return asarray(index)
|
2006-09-08 20:25:03 +02:00
|
|
|
|
2006-08-13 13:35:12 +02:00
|
|
|
|
2006-04-24 11:53:07 +02:00
|
|
|
class CategoryDataset(Dataset):
    """The category dataset class.

    A dataset for representing class information as binary
    matrices (0/1-matrices).

    There is support for using a less memory demanding, and
    fast intersection look-ups by representing the binary matrix as a
    dictionary in each dimension.

    Always has linked dimension in first dim:
    ex matrix:
                 go_term1  go_term2 ...
        gene_1
        gene_2
        gene_3
          .
          .
          .
    """

    def __init__(self, array, identifiers=None, name='C'):
        """Build the dataset like Dataset does and tag it as type 'c'."""
        Dataset.__init__(self, array, identifiers=identifiers, name=name)
        self.has_dictlists = False
        self._type = 'c'

    def as_dict_lists(self):
        """Returns data as dict mapping each identifier of the first dim
        to the list of second-dim identifiers with a nonzero entry.

        ex: data['gene_id'] = ['map0030','map0010', ...]

        The result is also cached on the instance (``_dictlists``) and
        ``has_dictlists`` is set to True.
        """
        row_dim = self.get_dim_name(0)
        col_dim = self.get_dim_name(1)
        data = {}
        for name, ind in self._map[row_dim].items():
            # nonzero() returns a tuple with one index array per axis; the
            # row slice is one-dimensional, so take the first (only) entry.
            hits = self._array[ind, :].nonzero()[0]
            data[name] = self.get_identifiers(col_dim, hits)
        self._dictlists = data
        self.has_dictlists = True
        return data

    def as_selections(self):
        """Returns data as a list of Selection objects.

        One Selection is created per identifier of the second dim; it is
        titled with that identifier and selects the first-dim identifiers
        that have a nonzero entry in the corresponding column.
        """
        row_dim = self.get_dim_name(0)
        ret_list = []
        for cat_name, ind in self._map[self.get_dim_name(1)].items():
            ids = self.get_identifiers(row_dim,
                                       self._array[:, ind].nonzero()[0])
            selection = Selection(cat_name)
            selection.select(row_dim, ids)
            ret_list.append(selection)
        return ret_list
|
|
|
|
|
2006-08-08 10:05:26 +02:00
|
|
|
|
2006-04-24 11:53:07 +02:00
|
|
|
class GraphDataset(Dataset):
    """The graph dataset class.

    A dataset class for representing graphs using an adjacency matrix
    (aka. restricted to square symmetric signed integers matrices)

    If the library NetworkX is installed, there is support for
    representing the graph as a NetworkX.Graph, or NetworkX.XGraph structure.
    """

    def __init__(self, array=None, identifiers=None, shape=None,
                 all_dims=None, **kwds):
        """Build the dataset like Dataset does and tag it as type 'g'.

        ``shape`` and ``all_dims`` are accepted but not used.  A ``name``
        keyword is honoured (default 'A'); other keywords are ignored.
        Dataset.__init__ raises ValueError when ``array`` is not an ndarray,
        which includes the default None.
        """
        Dataset.__init__(self, array=array, identifiers=identifiers,
                         name=kwds.get('name', 'A'))
        self.has_graph = False
        self._type = 'g'

    def asnetworkx(self, nx_type='graph'):
        """Returns the dataset as a networkx graph of kind ``nx_type``.

        Vertices are labelled with the identifiers of the first dimension,
        in positional order.  Sets ``has_graph`` to True.
        """
        dim = self.get_dim_name()[0]
        ids = self.get_identifiers(dim, sorted=True)
        adj_mat = self.asarray()
        G = self._graph_from_adj_matrix(adj_mat, labels=ids, nx_type=nx_type)
        self.has_graph = True
        return G

    def _graph_from_adj_matrix(self, A, labels=None, nx_type='graph'):
        """Creates a networkx graph class from adjacency matrix and
        ordered labels.

        nx_type -- 'graph' for nx.Graph, 'x_graph' (or 'xgraph') for
                   nx.XGraph.
        labels  -- vertex labels in row order; None results in
                   string-numbered labels.

        Raises IOError if the matrix is not square or nx_type is unknown.
        """
        import networkx as nx

        # adjacency matrix must be of type that evals to true/false for
        # neighbours
        m, n = A.shape
        if m != n:
            raise IOError("Adjacency matrix must be square")

        if nx_type == 'graph':
            G = nx.Graph()
        elif nx_type in ('x_graph', 'xgraph'):
            # NOTE(review): XGraph presumably only exists in old NetworkX
            # releases - verify against the installed version.
            G = nx.XGraph()
        else:
            raise IOError("Unknown graph type: %s" % nx_type)

        if labels is None:
            # if labels not provided mark vertices with numbers
            labels = [str(i) for i in range(m)]

        for nbrs, head in zip(A, labels):
            for i, nbr in enumerate(nbrs):
                if nbr:
                    G.add_edge(head, labels[i])
        return G
|
2006-08-01 15:22:39 +02:00
|
|
|
|
2006-08-01 11:45:53 +02:00
|
|
|
# Set of dimension names stored on the Dataset class itself, so every
# dataset instance sees the same set; Dataset.__init__ hands it to the
# identifier helpers, which add each dimension name to it.
Dataset._all_dims=set()
|
2006-08-30 01:57:21 +02:00
|
|
|
|
2006-08-08 10:05:26 +02:00
|
|
|
class ReverseDict(dict):
    """
    A dictionary which can lookup values by key, and keys by value.
    All values and keys must be hashable, and unique.

    d = ReverseDict((['a',1],['b',2]))
    print d['a'] --> 1
    print d.reverse[1] --> 'a'

    Only item assignment and item deletion keep ``reverse`` in sync; other
    mutating dict methods (update, pop, clear, ...) do not touch it.
    """

    def __init__(self, *args, **kw):
        """Accepts the same arguments as dict and builds the reverse map."""
        dict.__init__(self, *args, **kw)
        self.reverse = dict((v, k) for k, v in self.items())

    def __setitem__(self, key, value):
        """Store key -> value and mirror it as value -> key in ``reverse``."""
        if key in self:
            # Re-assigning a key: drop the reverse entry of its old value so
            # that ``reverse`` does not keep pointing at a value that is gone.
            self._forget(key)
        dict.__setitem__(self, key, value)
        self.reverse[value] = key

    def __delitem__(self, key):
        """Remove key together with its entry in ``reverse``."""
        self._forget(key)
        dict.__delitem__(self, key)

    def _forget(self, key):
        """Drop the reverse entry that currently belongs to ``key``."""
        old_value = dict.__getitem__(self, key)
        # Only remove the entry when it really refers to this key; with
        # (unsupported) duplicate values it may belong to another key.
        if self.reverse.get(old_value) == key:
            del self.reverse[old_value]
|
2006-08-13 13:35:12 +02:00
|
|
|
|
|
|
|
def to_file(filepath,dataset,name=None):
    """Write dataset to file. A file may contain multiple datasets.

    The file is a shelve database which is created when missing; writing
    to an existing file adds the dataset to it.

    filepath -- path handed to shelve.open.
    dataset  -- object providing _name, _array, _identifiers and _type.
    name     -- key to store the dataset under; defaults to dataset._name.
                An existing entry with the same key is overwritten and a
                notice is printed.
    """
    if not name:
        name = dataset._name
    data = shelve.open(filepath, flag='c', protocol=2)
    try:
        if name in data:
            # report the key that is replaced, which may differ from
            # dataset._name when an explicit name was given
            print("Data with name: %s overwritten" % name)
        data[name] = {'array': dataset._array,
                      'idents': dataset._identifiers,
                      'type': dataset._type}
    finally:
        # always release the database, also when storing fails
        data.close()
|
|
|
|
|
|
|
|
def from_file(filepath):
    """Read datasets from file.

    Opens the shelve database at ``filepath`` read-only and returns a list
    with one dataset per stored entry: a CategoryDataset for type 'c', a
    GraphDataset for type 'g' and a plain Dataset otherwise.  Each dataset
    is given the key it was stored under as its name argument.
    """
    # NOTE: shelve unpickles the stored values; only open files from a
    # trusted source.
    data = shelve.open(filepath, flag='r')
    try:
        out_data = []
        for name in data.keys():
            sub_data = data[name]
            kind = sub_data['type']
            if kind == 'c':
                dataset_class = CategoryDataset
            elif kind == 'g':
                dataset_class = GraphDataset
            else:
                dataset_class = Dataset
            out_data.append(dataset_class(sub_data['array'],
                                          identifiers=sub_data['idents'],
                                          name=name))
    finally:
        # the shelf was previously left open; release it in every case
        data.close()
    return out_data
|
|
|
|
|
2006-08-30 01:57:21 +02:00
|
|
|
class Selection(dict):
    """Handles selected identifiers along each dimension of a dataset

    The selection is a dict that maps a dimension name to the identifiers
    selected along it.  Looking up a dimension that has no selection
    returns None instead of raising KeyError.
    """

    def __init__(self, title='Unnamed Selecton'):
        """Create an empty selection carrying ``title``."""
        dict.__init__(self)
        self.title = title

    def __getitem__(self, key):
        """Return the identifiers selected along key, or None if none."""
        if key not in self:
            return None
        return dict.__getitem__(self, key)

    def dims(self):
        """Return the dimension names that have a selection."""
        return list(self.keys())

    def axis_len(self, axis):
        """Return the number of identifiers selected along axis (0 if none)."""
        # The selection lives in the dict itself; there is no separate
        # ``_selection`` attribute.
        if axis in self:
            return len(dict.__getitem__(self, axis))
        return 0

    def select(self, axis, labels):
        """Set the identifiers selected along axis to labels."""
        self[axis] = labels
|
|
|
|
|