Complete rewrite of dataset class, with (all) the necessary updates

This commit is contained in:
2006-04-24 09:53:07 +00:00
parent 53d0228074
commit a2e4392a72
9 changed files with 426 additions and 234 deletions

View File

@@ -1,136 +1,225 @@
import logger
from scipy import array,take,asarray,shape,nonzero
import project
from itertools import izip
from scipy import atleast_2d,asarray,ArrayType
class Dataset:
"""Dataset base class.
"""The Dataset base class.
A Dataset is an n-way array with defined string identifiers across
all dimensions.
example of use:
---
dim_name_rows = 'rows'
names_rows = ('row_a','row_b')
ids_1 = [dim_name_rows, names_rows]
dim_name_cols = 'cols'
names_cols = ('col_a','col_b','col_c','col_d')
ids_2 = [dim_name_cols, names_cols]
Array_X = rand(2,4)
data = Dataset(Array_X,(ids_1,ids_2),name="Testing")
dim_names = [dim for dim in data]
column_identifiers = [id for id in data['cols'].keys()]
column_index = [index for index in data['cols'].values()]
'cols' in data -> True
---
data = Dataset(rand(10,20)) (generates dims and ids (no links))
"""
def __init__(self, input_array, def_list, name="Unnamed data"):
self._name = name
self._data = asarray(input_array)
dims = shape(self._data)
self.def_list = def_list
self._ids_set = set()
self.ids={}
self._dim_num = {}
self._dim_names = []
if len(dims)==1: # a vector is defined to be column vector!
self.dims = (dims[0],1)
else:
self.dims = dims
if len(def_list)!=len(self.dims):
raise ValueError,"array dims and identifyer mismatch"
for axis,(dim_name,ids) in enumerate(def_list):
enum_ids = {}
#if dim_name not in project.c_p.dim_names:
# dim_name = project.c_p.suggest_dim_name(dim_name)
if not ids:
logger.log('debug','Creating identifiers along: '+ str(dim_name))
ids = self._create_identifiers(axis)
for num,name in enumerate(ids):
enum_ids[name] = num
self.ids[dim_name] = enum_ids
self._ids_set = self._ids_set.union(set(ids))
self._dim_num[dim_name] = axis
self._dim_names.append(dim_name)
for (dimname, ids), d in izip(def_list,self.dims): #check that data and labels match
if len(self.ids[dimname]) != d:
raise ValueError,"dim size and identifyer mismatch"
def __init__(self,array=None,identifiers=None,shape=None,all_dims=[],**kwds):
self._name = kwds.get("name","Unnamed data")
self._dims = [] #existing dimensions in this dataset
self._map = {} # internal mapping for dataset: identifier <--> index
self.has_array = False
self.shape = None
if array==None:
if shape == None:
raise ValueError, "Must define shape if array is None"
else:
self.shape = shape
if identifiers!=None:
self._set_identifiers(identifiers,all_dims)
else:
ids = self._create_identifiers(shape,all_dims)
self._set_identifiers(ids,all_dims)
elif isinstance(array,ArrayType):
array = atleast_2d(asarray(array))
self.shape = array.shape
if shape != None:
if self.shape!=shape:
#logger.log("debug","Dataset and input shape mismatch")
raise ValueError
if identifiers!=None:
self._set_identifiers(identifiers,all_dims)
else:
ids = self._create_identifiers(self.shape,all_dims)
self._set_identifiers(ids,all_dims)
def get_name(self):
self._array = array
self.has_array = True
else:
raise ValueError, "array input must be of ArrayType or None"
self._all_dims = all_dims
def __str__(self):
    """Return the dataset name as the string representation.

    The method was originally spelled ``__str__self``, which Python never
    calls for ``str(dataset)``; it is now the real ``__str__`` hook.
    """
    return self._name


# Backward-compatible alias for the original, misspelled method name.
__str__self = __str__
def get_dim_names(self):
    """Return the list of dimension names held in self._dim_names."""
    dim_names = self._dim_names
    return dim_names
def names(self,axis=0):
"""Returns identifier names of a dimension.
NB: sorted by values!
OK? necessary?"""
def __iter__(self):
    """Iterate over the dimension names of the dataset.

    The names come from self._dims, in the order that list holds them.
    """
    return iter(self._dims)
def __contains__(self, dim):
    """Return True if dim is a dimension name in the dataset.

    Membership is tested against the keys of self._map (not self._dims).
    """
    return dim in self._map
def __len__(self):
    """Return the number of dimensions, i.e. the number of keys in self._map."""
    dim_map = self._map
    return len(dim_map)
def __getitem__(self, dim):
    """Return the identifier -> position mapping stored for dimension dim.

    Raises KeyError if dim is not a key of self._map.
    """
    dim_map = self._map
    return dim_map[dim]
def _create_identifiers(self,shape,all_dims):
"""Creates dimension names and identifier names, and returns
identifiers."""
if type(axis)==int:
dim_name = self._dim_names[axis]
elif type(axis)==str:
dim_name = axis
if dim_name not in self._dim_names:
raise ValueError, dim_name + " not a dimension in dataset"
items = self.ids[dim_name].items()
backitems=[ [v[1],v[0]] for v in items]
backitems.sort()
sorted_ids=[ backitems[i][1] for i in range(0,len(backitems))]
return sorted_ids
def extract_data(self,ids,dim_name):
"""Extracts data along a dimension by identifiers.

Looks up the index of every identifier in ids along dim_name, takes the
matching slices from self._data along that dimension's axis and returns
them wrapped in a new Dataset. Raises ValueError if the take() call fails.
"""
# Top-level copy only: the per-dimension entries are the same objects as in self.def_list.
new_def_list = self.def_list[:]
# An identifier that is unknown for dim_name raises KeyError here.
ids_index = [self.ids[dim_name][id_name] for id_name in ids]
dim_number = self._dim_num[dim_name]
try:
out_data = take(self._data,ids_index,axis=dim_number)
except:
# Bare except: any failure inside take() is re-raised as a message-less ValueError.
raise ValueError
# NOTE(review): because the copy above is shallow, this item assignment also
# changes the entry held by self.def_list when entries are lists -- confirm intended.
new_def_list[dim_number][1] = ids
# NOTE(review): self.parents is not assigned anywhere in the visible code, and the
# visible constructors declare no 'parents' parameter -- verify against the class.
extracted_data = Dataset(out_data,def_list=new_def_list,parents=self.parents)
return extracted_data
def _create_identifiers(self, axis):
    """Create default identifiers along an axis.

    Returns one string per position along self.dims[axis], formatted as
    '<axis>_<position>'.
    """
    prefix = str(axis) + '_'
    return [prefix + str(pos) for pos in range(self.dims[axis])]
def extract_id_from_index(self, dim_name, index):
    """Return the set of identifiers along dim_name located at index.

    index is either a single int or a container of ints; a single int is
    wrapped in a list before the lookup.
    """
    id_positions = self.ids[dim_name]
    wanted = [index] if type(index) is int else index
    selected = set()
    for identifier, position in id_positions.items():
        if position in wanted:
            selected.add(identifier)
    return selected
def extract_index_from_id(self, dim_name, id):
    """Return an array with the positions of the given identifiers.

    id is tested with the 'in' operator against every identifier stored
    for dim_name, so a set or list selects exact members, while a bare
    string selects every identifier that is a substring of it.
    """
    positions = []
    for name, position in self.ids[dim_name].items():
        if name in id:
            positions.append(position)
    return array(positions)
class CategoryDataset(Dataset):
def __init__(self,array,def_list):
Dataset.__init__(self,array,def_list)
def get_elements_by_category(self,dim,category):
"""Returns all elements along input dim belonging to category.
Assumes a two-dim category data only!
"""
if type(category)!=list:
raise ValueError, "category must be list"
gene_ids = []
axis_dim = self._dim_num[dim]
cat_index = self.extract_index_from_id(category)
for ind in cat_index:
if axis_dim==0:
gene_indx = nonzero(self._data[:,ind])
elif axis_dim==1:
gene_indx = nonzero(self._data[ind,:])
dim_names = ['rows','cols']
ids = []
for axis,n in enumerate(shape):
if axis<2:
dim_suggestion = dim_names[axis]
else:
ValueError, "Only support for 2-dim data"
gene_ids.append(self.extract_id_from_index(dim,gene_index))
return gene_ids
dim_suggestion = 'dim'
while dim_suggestion in all_dims:
dim_suggestion = self._suggest_dim_name(dim_suggestion,all_dims)
identifier_creation = [str(axis) + "_" + i for i in map(str,range(n))]
ids.append((dim_suggestion,identifier_creation))
all_dims.append(dim_suggestion)
return ids
def _set_identifiers(self, identifiers, all_dims):
    """Create the internal identifier -> position mapping for each dimension.

    identifiers is a sequence of (dim_name, ids) pairs, one per axis.
    Every new dimension name is appended to self._dims and to the shared
    all_dims list, and self._map[dim_name] maps each id to its position.

    Raises ValueError if a dimension name already exists in the dataset
    (or is repeated in identifiers), or if the number of ids per dimension
    does not match self.shape. All validation happens before anything is
    stored, so a failed call leaves self and all_dims untouched.
    """
    # Materialize once: the input is traversed several times below.
    identifiers = list(identifiers)

    seen = set(self._dims)
    for dim, ids in identifiers:
        if dim in seen:
            raise ValueError("Dimension names must be unique")
        seen.add(dim)

    shape_chk = [len(ids) for dim, ids in identifiers]
    if shape_chk != list(self.shape):
        raise ValueError("Shape input: %s and array: %s mismatch" % (shape_chk, self.shape))

    # Validation passed: commit the new dimensions.
    for dim, ids in identifiers:
        self._dims.append(dim)
        all_dims.append(dim)
        self._map[dim] = dict((id_, pos) for pos, id_ in enumerate(ids))
def _suggest_dim_name(self, dim_name, all_dims):
    """Suggest a dimension name that is not yet in all_dims and return it.

    dim_name is returned unchanged when it is free. Otherwise the first
    free name of the form '<dim_name>_<n>' (n = 0, 1, 2, ...) is returned.
    """
    candidate = dim_name
    c = 0
    while candidate in all_dims:
        # Always derive from the original base name so that suffixes do
        # not pile up ('dim_0_1_2') after repeated collisions.
        candidate = dim_name + "_" + str(c)
        c += 1
    return candidate
def asarray(self):
    """Return the numeric array (data) of the dataset.

    Raises ValueError if no array has been attached (self.has_array is
    false).
    """
    if self.has_array:
        return self._array
    raise ValueError("Dataset is empty")
def add_array(self, array):
    """Attach array as the data of a dataset that has none yet.

    The input is converted with asarray() and promoted with atleast_2d()
    before it is validated, so a one-dim array is accepted as a two-dim
    row vector, as intended.

    Raises ValueError if the dataset already has an array, if the number
    of array dimensions differs from the number of dataset dimensions, or
    if the array shape differs from self.shape.
    """
    if self.has_array:
        raise ValueError("Dataset has array")
    # Normalize first: validating the raw input rejected every 1-D array.
    array = atleast_2d(asarray(array))
    if len(self._map) != len(array.shape):
        raise ValueError("range(array_dims) and range(dim_names) mismatch")
    # self.shape may have been given as a list; compare as tuples.
    if tuple(self.shape) != array.shape:
        raise ValueError("Input array must be of similar dimensions as dataset")
    self._array = array
    self.has_array = True
def get_name(self):
    """Return the name of the dataset (self._name)."""
    name = self._name
    return name
def get_all_dims(self):
    """Return the all_dims list stored on the dataset (self._all_dims).

    The list object itself is returned, not a copy.
    """
    all_dims = self._all_dims
    return all_dims
def get_identifiers(self):
    """Return the identifiers of the dataset as (dim_name, ids) pairs.

    Dimensions follow the order of self._dims, and the ids of each
    dimension are listed by their stored position, so the result lines up
    with the axes of the array. (Plain dict.keys() gave no such
    guarantee.)
    """
    identifiers = []
    for dim in self._dims:
        pos_map = self._map[dim]
        # Order the identifiers by the position recorded in the mapping.
        ordered_ids = sorted(pos_map, key=pos_map.get)
        identifiers.append((dim, ordered_ids))
    return identifiers
class CategoryDataset(Dataset):
    """The category dataset class.

    A dataset for representing class information as binary
    matrices (0/1-matrices).

    There is support for using a less memory demanding, and
    fast intersection look-ups by representing the binary matrix as a
    dictionary in each dimension.
    """

    def __init__(self, *args, **kwds):
        """Initialize the dataset; all arguments are passed on to Dataset.

        The original constructor accepted no arguments, so the base class
        was always called without an array or a shape.
        """
        Dataset.__init__(self, *args, **kwds)
        self.has_collection = False

    def as_array(self):
        """Returns data as binary matrix.

        Not implemented yet: currently always returns None.
        """
        if not self.has_array and self.has_collection:
            # build numeric array
            pass

    def as_collection(self, dim):
        """Returns data as collection along dim.

        Not implemented yet: currently always returns None.
        """
        pass

    def add_collection(self, input_dict):
        """Adds a category data as collection.

        A collection is a datastructure that contains a dictionary for
        each pair of dimension in dataset, keyed by identifiers and
        values is a set of identifiers in the other dimension.

        Not implemented yet: input_dict is currently ignored.
        """
        # build category data as double dicts
        pass
class GraphDataset(Dataset):
    """The graph dataset class.

    A dataset class for representing graphs using an adjacency matrix
    (aka. restricted to square symmetric signed integers matrices)

    If the library NetworkX is installed, there is support for
    representing the graph as a NetworkX.Graph, or NetworkX.XGraph structure.
    """

    def __init__(self, *args, **kwds):
        """Initialize the dataset; all arguments are passed on to Dataset.

        The original called ``Dataset.__init`` (missing trailing
        underscores), which is name-mangled inside the class body and
        raises AttributeError.
        """
        Dataset.__init__(self, *args, **kwds)
        self.has_graph = False
class Selection:
"""Handles selected identifiers along each dimension of a dataset"""
def __init__(self):