Complete rewrite of dataset class, with (all) the necessary updates
@@ -1,136 +1,225 @@
import logger
import project
from itertools import izip
from scipy import array, take, asarray, shape, nonzero, atleast_2d, ArrayType


class Dataset:
    """The Dataset base class.

    A Dataset is an n-way array with defined string identifiers across
    all dimensions.

    Example of use:

    ---
    dim_name_rows = 'rows'
    names_rows = ('row_a', 'row_b')
    ids_1 = [dim_name_rows, names_rows]

    dim_name_cols = 'cols'
    names_cols = ('col_a', 'col_b', 'col_c', 'col_d')
    ids_2 = [dim_name_cols, names_cols]

    Array_X = rand(2, 4)
    data = Dataset(Array_X, (ids_1, ids_2), name="Testing")

    dim_names = [dim for dim in data]

    column_identifiers = [id for id in data['cols'].keys()]
    column_index = [index for index in data['cols'].values()]

    'cols' in data -> True
    ---

    data = Dataset(rand(10, 20))  # generates dims and ids (no links)
    """
    # Constructor from the previous implementation (def_list based); the
    # keyword-based constructor below supersedes it.
    def __init__(self, input_array, def_list, name="Unnamed data"):
        self._name = name
        self._data = asarray(input_array)
        dims = shape(self._data)
        self.def_list = def_list
        self._ids_set = set()
        self.ids = {}
        self._dim_num = {}
        self._dim_names = []
        if len(dims) == 1:  # a vector is defined to be a column vector!
            self.dims = (dims[0], 1)
        else:
            self.dims = dims
        if len(def_list) != len(self.dims):
            raise ValueError, "array dims and identifier mismatch"
        for axis, (dim_name, ids) in enumerate(def_list):
            enum_ids = {}
            #if dim_name not in project.c_p.dim_names:
            #    dim_name = project.c_p.suggest_dim_name(dim_name)
            if not ids:
                logger.log('debug', 'Creating identifiers along: ' + str(dim_name))
                ids = self._create_identifiers(axis)
            for num, name in enumerate(ids):
                enum_ids[name] = num
            self.ids[dim_name] = enum_ids
            self._ids_set = self._ids_set.union(set(ids))
            self._dim_num[dim_name] = axis
            self._dim_names.append(dim_name)

        # check that data and labels match
        for (dimname, ids), d in izip(def_list, self.dims):
            if len(self.ids[dimname]) != d:
                raise ValueError, "dim size and identifier mismatch"

    def __init__(self, array=None, identifiers=None, shape=None, all_dims=[], **kwds):
        self._name = kwds.get("name", "Unnamed data")
        self._dims = []  # existing dimensions in this dataset
        self._map = {}   # internal mapping for dataset: identifier <--> index
        self.has_array = False
        self.shape = None

        if array is None:
            if shape is None:
                raise ValueError, "Must define shape if array is None"
            else:
                self.shape = shape
                if identifiers is not None:
                    self._set_identifiers(identifiers, all_dims)
                else:
                    ids = self._create_identifiers(shape, all_dims)
                    self._set_identifiers(ids, all_dims)
        elif isinstance(array, ArrayType):
            array = atleast_2d(asarray(array))
            self.shape = array.shape
            if shape is not None:
                if self.shape != shape:
                    #logger.log("debug","Dataset and input shape mismatch")
                    raise ValueError
            if identifiers is not None:
                self._set_identifiers(identifiers, all_dims)
            else:
                ids = self._create_identifiers(self.shape, all_dims)
                self._set_identifiers(ids, all_dims)

            self._array = array
            self.has_array = True

        else:
            raise ValueError, "array input must be of ArrayType or None"

        self._all_dims = all_dims

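    # A minimal construction sketch (illustrative only; `rand` and the example
    # identifier values are assumptions, not part of this module):
    #
    #   ids = [('rows', ['r0', 'r1']), ('cols', ['c0', 'c1', 'c2'])]
    #   data = Dataset(rand(2, 3), identifiers=ids, name="demo")
    #   'rows' in data        # -> True
    #   data['cols']['c2']    # -> 2 (identifier -> index)
    #
    #   # Without an array, a shape is required and identifiers are generated:
    #   empty = Dataset(shape=(2, 3), all_dims=[])
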
    def __str__(self):
        return self._name

    def get_dim_names(self):
        return self._dim_names

    def names(self, axis=0):
        """Returns the identifier names of a dimension.
        NB: sorted by index values! OK? necessary?
        """
        if type(axis) == int:
            dim_name = self._dim_names[axis]
        elif type(axis) == str:
            dim_name = axis
            if dim_name not in self._dim_names:
                raise ValueError, dim_name + " not a dimension in dataset"
        items = self.ids[dim_name].items()
        backitems = [[v[1], v[0]] for v in items]
        backitems.sort()
        sorted_ids = [backitems[i][1] for i in range(0, len(backitems))]
        return sorted_ids

    def __iter__(self):
        """Returns an iterator over the dimensions of the dataset."""
        return self._dims.__iter__()

    def __contains__(self, dim):
        """Returns True if dim is a dimension name in the dataset."""
        # return self._dims.__contains__(dim)
        return self._map.__contains__(dim)

    def __len__(self):
        """Returns the number of dimensions in the dataset."""
        return len(self._map)

    def __getitem__(self, dim):
        """Returns the identifiers along the dimension dim."""
        return self._map[dim]

    # Per-axis identifier creation from the previous implementation; the
    # shape-based version below supersedes it.
    def _create_identifiers(self, axis):
        """Creates identifiers along an axis."""
        n_dim = self.dims[axis]
        return [str(axis) + '_' + str(i) for i in range(n_dim)]

    def _create_identifiers(self, shape, all_dims):
        """Creates dimension names and identifier names, and returns
        the identifiers."""
        dim_names = ['rows', 'cols']
        ids = []
        for axis, n in enumerate(shape):
            if axis < 2:
                dim_suggestion = dim_names[axis]
            else:
                dim_suggestion = 'dim'
            while dim_suggestion in all_dims:
                dim_suggestion = self._suggest_dim_name(dim_suggestion, all_dims)
            identifier_creation = [str(axis) + "_" + i for i in map(str, range(n))]
            ids.append((dim_suggestion, identifier_creation))
            all_dims.append(dim_suggestion)
        return ids

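    # Sketch of the generated structure (derived from the loop above, assuming
    # an empty `all_dims` list):
    #
    #   self._create_identifiers((2, 3), [])
    #   # -> [('rows', ['0_0', '0_1']), ('cols', ['1_0', '1_1', '1_2'])]
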
    # Helpers from the previous implementation; they rely on its
    # self.ids / self._data attributes.
    def extract_data(self, ids, dim_name):
        """Extracts data along a dimension by identifiers."""
        new_def_list = self.def_list[:]
        ids_index = [self.ids[dim_name][id_name] for id_name in ids]
        dim_number = self._dim_num[dim_name]
        try:
            out_data = take(self._data, ids_index, axis=dim_number)
        except:
            raise ValueError
        new_def_list[dim_number][1] = ids
        extracted_data = Dataset(out_data, def_list=new_def_list, parents=self.parents)
        return extracted_data

    def extract_id_from_index(self, dim_name, index):
        """Returns a set of ids from an array/list of indexes."""
        dim_ids = self.ids[dim_name]
        if type(index) == int:
            index = [index]
        return set([id for id, ind in dim_ids.items() if ind in index])

    def extract_index_from_id(self, dim_name, id):
        """Returns an array of indexes from a set/list of identifiers
        (or a single id)."""
        dim_ids = self.ids[dim_name]
        return array([ind for name, ind in dim_ids.items() if name in id])

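    # Round-trip sketch (hedged; assumes a dataset built with the def_list
    # constructor above, whose 'rows' identifiers 'r0', 'r1', 'r2' map to
    # indices 0, 1, 2 -- the order of the returned arrays/sets may vary):
    #
    #   data.extract_index_from_id('rows', ['r0', 'r2'])  # -> array([0, 2])
    #   data.extract_id_from_index('rows', [0, 2])        # -> set(['r0', 'r2'])
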
    def _set_identifiers(self, identifiers, all_dims):
        """Creates internal mapping of the identifiers structure."""
        for dim, ids in identifiers:
            pos_map = {}
            if dim not in self._dims:
                self._dims.append(dim)
                all_dims.append(dim)
            else:
                raise ValueError, "Dimension names must be unique"

            for pos, id in enumerate(ids):
                pos_map[id] = pos
            self._map[dim] = pos_map

        shape_chk = [len(i) for j, i in identifiers]
        if shape_chk != list(self.shape):
            raise ValueError, "Shape input: %s and array: %s mismatch" % (shape_chk, self.shape)

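    # Resulting internal mapping (sketch, derived from the loop above): for
    # identifiers [('rows', ['r0', 'r1'])], self._map becomes
    #   {'rows': {'r0': 0, 'r1': 1}}
    # and 'rows' is appended to both self._dims and all_dims.
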
    def _suggest_dim_name(self, dim_name, all_dims):
        """Suggests a unique name for dim and returns it."""
        c = 0
        while dim_name in all_dims:
            dim_name = dim_name + "_" + str(c)
            c += 1
        return dim_name

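    # Example of the cumulative renaming above: with all_dims = ['rows'],
    # 'rows' becomes 'rows_0'; if 'rows_0' were also taken, the next try is
    # 'rows_0_1', and so on until an unused name is found.
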
    def asarray(self):
        """Returns the numeric array (data) of the dataset."""
        if not self.has_array:
            raise ValueError, "Dataset is empty"
        else:
            return self._array

    def add_array(self, array):
        """Adds array as an ArrayType object.

        A one-dim array is transformed to a two-dim array (row-vector).
        """
        if self.has_array:
            raise ValueError, "Dataset already has an array"
        else:
            if len(self._map) != len(array.shape):
                raise ValueError, "range(array_dims) and range(dim_names) mismatch"
            if self.shape != array.shape:
                raise ValueError, "Input array must have the same shape as the dataset"
            self._array = atleast_2d(asarray(array))
            self.has_array = True

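    # Usage sketch (hedged; `rand` is an assumption): create a shape-only
    # dataset, then attach the data afterwards.
    #
    #   data = Dataset(shape=(2, 3))
    #   data.add_array(rand(2, 3))
    #   data.asarray().shape    # -> (2, 3)
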
    def get_name(self):
        return self._name

    def get_all_dims(self):
        return self._all_dims

    def get_identifiers(self):
        """Returns (dim, identifiers) pairs in dimension order."""
        # return [n for n in self._map.iteritems()]
        # NB: dims come out in the correct order, but the identifiers from
        # keys() are not necessarily sorted by index.
        ids = []
        for dim in self._dims:
            ids.append((dim, self._map[dim].keys()))
        return ids

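    # Sketch: for a dataset built from [('rows', ['r0', 'r1'])], this returns
    #   [('rows', ['r0', 'r1'])]
    # (the identifier order within each dimension may vary, as noted above).
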

class CategoryDataset(Dataset):
    # CategoryDataset from the previous implementation; the rewritten class
    # below supersedes it.
    def __init__(self, array, def_list):
        Dataset.__init__(self, array, def_list)

    def get_elements_by_category(self, dim, category):
        """Returns all elements along the input dim belonging to category.
        Assumes two-dimensional category data only!
        """
        if type(category) != list:
            raise ValueError, "category must be a list"
        gene_ids = []
        axis_dim = self._dim_num[dim]
        # NB: assumes the category identifiers live along the other dimension
        cat_dim = [d for d in self._dim_names if d != dim][0]
        cat_index = self.extract_index_from_id(cat_dim, category)
        for ind in cat_index:
            if axis_dim == 0:
                gene_index = nonzero(self._data[:, ind])
            elif axis_dim == 1:
                gene_index = nonzero(self._data[ind, :])
            else:
                raise ValueError, "Only support for 2-dim data"
            gene_ids.append(self.extract_id_from_index(dim, gene_index))
        return gene_ids


class CategoryDataset(Dataset):
    """The category dataset class.

    A dataset for representing class information as binary (0/1) matrices.

    There is support for a less memory-demanding representation with fast
    intersection look-ups, storing the binary matrix as a dictionary in each
    dimension.
    """
    def __init__(self):
        Dataset.__init__(self)
        self.has_collection = False

    def as_array(self):
        """Returns the data as a binary matrix."""
        if not self.has_array and self.has_collection:
            # build the numeric array from the collection
            pass

    def as_collection(self, dim):
        """Returns the data as a collection along dim."""
        pass

    def add_collection(self, input_dict):
        """Adds category data as a collection.

        A collection is a data structure that contains a dictionary for
        each pair of dimensions in the dataset, keyed by identifiers,
        where each value is the set of identifiers in the other dimension.
        """
        # build the category data as nested (double) dicts
        pass

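    # Sketch of the intended collection structure (an assumption based on the
    # docstring above; add_collection is not implemented yet):
    #
    #   {'gene_a': set(['cat_1', 'cat_2']),
    #    'gene_b': set(['cat_2'])}
    #
    # i.e. one dict per dimension pair, keyed by identifiers, with each value
    # being the set of associated identifiers in the other dimension.
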

class GraphDataset(Dataset):
    """The graph dataset class.

    A dataset class for representing graphs using an adjacency matrix
    (i.e. restricted to square, symmetric, signed-integer matrices).

    If the NetworkX library is installed, there is support for representing
    the graph as a NetworkX.Graph or NetworkX.XGraph structure.
    """
    def __init__(self):
        Dataset.__init__(self)
        self.has_graph = False

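# Sketch of the kind of data GraphDataset is meant to hold (illustrative
# values only): a square, symmetric, signed-integer adjacency matrix, e.g.
#
#   array([[ 0,  1, -1],
#          [ 1,  0,  0],
#          [-1,  0,  0]])
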

class Selection:
    """Handles selected identifiers along each dimension of a dataset."""
    def __init__(self):
        pass