laydi/system/dataset.py

from system import logger
from scipy import atleast_2d,asarray,ArrayType


class Dataset:
    """The Dataset base class.

    A Dataset is an n-way array with defined string identifiers across
    all dimensions.  The numeric array is optional: a dataset may be
    created from a shape alone and get its array later via add_array().

    example of use:

    ---
    dim_name_rows = 'rows'
    names_rows = ('row_a','row_b')
    ids_1 = [dim_name_rows, names_rows]

    dim_name_cols = 'cols'
    names_cols = ('col_a','col_b','col_c','col_d')
    ids_2 = [dim_name_cols, names_cols]

    Array_X = rand(2,4)
    data = Dataset(Array_X,(ids_1,ids_2),name="Testing")

    dim_names = [dim for dim in data]

    column_identifiers = [id for id in data['cols'].keys()]
    column_index = [index for index in data['cols'].values()]

    'cols' in data -> True

    ---

    data = Dataset(rand(10,20)) (generates dims and ids (no links))
    """
    def __init__(self, array=None, identifiers=None, shape=None, all_dims=None, **kwds):
        """Creates a dataset from an array and/or a shape.

        array       -- an ArrayType instance or None. Arrays are promoted
                       to at least two dimensions.
        identifiers -- sequence of (dim_name, ids) pairs, one per axis.
                       Generated automatically when None.
        shape       -- required when array is None; when both are given
                       it must equal the (promoted) array shape.
        all_dims    -- list of dimension names already in use. It is
                       mutated in place: the dims of this dataset are
                       added to it. A new list is created when None.
        name        -- keyword, dataset name (default "Unnamed data").

        Raises ValueError on missing shape, wrong array type, shape
        mismatch or duplicate dimension names.
        """
        # A list literal as default would be one object shared by every
        # call, leaking dim names between unrelated datasets.
        if all_dims is None:
            all_dims = []

        self._name = kwds.get("name", "Unnamed data")
        self._dims = []  # existing dimensions in this dataset
        self._map = {}   # internal mapping for dataset:  identifier <--> index
        self.has_array = False
        self.shape = None

        # Identity tests: '== None' on an array compares elementwise.
        if array is None:
            if shape is None:
                raise ValueError("Must define shape if array is None")
            # Stored as tuple so it compares equal to array.shape later on.
            self.shape = tuple(shape)
        elif isinstance(array, ArrayType):
            array = atleast_2d(asarray(array))
            self.shape = array.shape
            if shape is not None and self.shape != tuple(shape):
                raise ValueError("Dataset and input shape mismatch")
        else:
            raise ValueError("array input must be of ArrayType or None")

        if identifiers is None:
            identifiers = self._create_identifiers(self.shape, all_dims)
        self._set_identifiers(identifiers, all_dims)

        if array is not None:
            self._array = array
            self.has_array = True

        self._all_dims = all_dims

    def __str__(self):
        """Returns the dataset name."""
        return self._name

    def __iter__(self):
        """Returns an iterator over dimensions of dataset."""
        return iter(self._dims)

    def __contains__(self, dim):
        """Returns True if dim is a dimension name in dataset."""
        return dim in self._map

    def __len__(self):
        """Returns the number of dimensions in the dataset"""
        return len(self._map)

    def __getitem__(self, dim):
        """Return the identifiers along the dimension dim.

        The result is the internal dictionary mapping identifier to
        position; KeyError is raised for an unknown dim.
        """
        return self._map[dim]

    def _create_identifiers(self, shape, all_dims):
        """Creates dimension names and identifier names, and returns
        identifiers.

        The first two axes are named 'rows' and 'cols', further axes
        'dim'; names already present in all_dims are made unique.
        Identifiers along an axis are named '<axis>_<position>'.
        Every chosen dim name is appended to all_dims.
        """
        default_names = ['rows', 'cols']
        ids = []
        for axis, n in enumerate(shape):
            if axis < len(default_names):
                suggestion = default_names[axis]
            else:
                suggestion = 'dim'
            # Returns the suggestion untouched when it is still free.
            suggestion = self._suggest_dim_name(suggestion, all_dims)
            labels = ["%s_%s" % (axis, i) for i in range(n)]
            ids.append((suggestion, labels))
            # Registered right away so later axes of this same dataset
            # do not receive the same name.
            all_dims.append(suggestion)
        return ids

    def _set_identifiers(self, identifiers, all_dims):
        """Creates internal mapping of identifiers structure.

        identifiers is a sequence of (dim_name, ids) pairs. Raises
        ValueError if the number of ids per dim does not match
        self.shape or if a dimension name occurs twice. All checks run
        before anything is stored, so a failure leaves both the dataset
        and all_dims untouched.
        """
        identifiers = list(identifiers)

        shape_chk = [len(ids) for _dim, ids in identifiers]
        if shape_chk != list(self.shape):
            raise ValueError("Shape input: %s and array: %s mismatch" % (shape_chk, self.shape))

        seen = list(self._dims)
        for dim, _ids in identifiers:
            if dim in seen:
                raise ValueError("Dimension names must be unique")
            seen.append(dim)

        for dim, ids in identifiers:
            self._dims.append(dim)
            # Auto-generated names were already registered by
            # _create_identifiers; do not list them twice.
            if dim not in all_dims:
                all_dims.append(dim)
            pos_map = {}
            for pos, ident in enumerate(ids):
                pos_map[ident] = pos
            self._map[dim] = pos_map

    def _suggest_dim_name(self, dim_name, all_dims):
        """Suggests a unique name for dim and returns it

        dim_name is returned unchanged when it is not in all_dims.
        Otherwise '_<counter>' is appended repeatedly (rows, rows_0,
        rows_0_1, ...) until the name is free.
        """
        c = 0
        while dim_name in all_dims:
            dim_name = dim_name + "_" + str(c)
            c += 1
        return dim_name

    def asarray(self):
        """Returns the numeric array (data) of dataset

        Raises ValueError if no array has been attached.
        """
        if not self.has_array:
            raise ValueError("Dataset is empty")
        return self._array

    def add_array(self, array):
        """Adds array as an ArrayType object.
        A one-dim array is transformed to a two-dim array (row-vector)

        Raises ValueError if the dataset already holds an array or if
        the (promoted) array does not match the dataset dimensions.
        """
        if self.has_array:
            raise ValueError("Dataset has array")

        # Promote first: the checks below must see the row-vector form,
        # otherwise one-dim input could never be accepted.
        array = atleast_2d(asarray(array))
        if len(self._map) != len(array.shape):
            raise ValueError("range(array_dims) and range(dim_names) mismatch")
        if tuple(self.shape) != array.shape:
            raise ValueError("Input array must be of similar dimensions as dataset")
        self._array = array
        self.has_array = True

    def get_name(self):
        """Returns dataset name"""
        return self._name

    def get_matrix(self):
        """Returns internal numeric matrix for dataset.

        Unlike asarray() there is no has_array check: the attribute is
        only set once an array has been attached, so calling this on an
        empty dataset raises AttributeError.
        """
        return self._array

    def get_all_dims(self):
        """Returns all dimensions in project

        This is the very list object handed to the constructor as
        all_dims (or the one created there), not a copy.
        """
        return self._all_dims

    def get_dim_names(self):
        """Returns dim names (a new list, in axis order)"""
        return list(self._dims)

    def get_identifiers(self, dim, indices=None):
        """Returns identifiers along dim, sorted by position (index).

        You can optionally provide a list of indices to get only the
        identifiers of a given position.
        """
        pos_map = self._map[dim]
        sorted_ids = sorted(pos_map, key=pos_map.get)

        # Identity test: indices may be an array.
        if indices is None:
            return sorted_ids
        return [sorted_ids[index] for index in indices]

    def get_indices(self, dim, idents):
        """Get indices for identifiers along dimension.

        Raises KeyError for an unknown dim or identifier.
        """
        pos_map = self._map[dim]
        return [pos_map[key] for key in idents]
     

class CategoryDataset(Dataset):
    """The category dataset class.

    A dataset for representing class information as binary
    matrices (0/1-matrices).

    There is support for using a less memory demanding, and
    fast intersection look-ups by representing the binary matrix as a
    dictionary in each dimension.

    NOTE: as_array(), as_collection() and add_collection() are still
    placeholders without an implementation.
    """

    def __init__(self, *args, **kwds):
        """Creates a category dataset.

        All arguments are passed on unchanged to Dataset.__init__.
        The base constructor raises ValueError when it receives neither
        an array nor a shape, so a constructor taking no arguments at
        all could never succeed.
        """
        Dataset.__init__(self, *args, **kwds)
        self.has_collection = False

    def as_array(self):
        """Returns data as binary matrix

        Not implemented yet: currently always returns None.
        """
        if not self.has_array and self.has_collection:
            #build numeric array
            pass

    def as_collection(self,dim):
        """Returns data as collection along dim

        Not implemented yet: currently always returns None.
        """
        pass

    def add_collection(self,input_dict):
        """Adds a category data as collection.

        A collection is a datastructure that contains a dictionary for
        each pair of dimension in dataset, keyed by identifiers and
        values is a set of identifiers in the other dimension

        Not implemented yet: input_dict is currently ignored.
        """
        #build category data as double dicts
        pass


class GraphDataset(Dataset):
    """The graph dataset class.

    A dataset class for representing graphs using an adjacency matrix
    (aka. restricted to square symmetric signed integers matrices)

    If the library NetworkX is installed, there is support for
    representing the graph as a NetworkX.Graph, or NetworkX.XGraph structure.
    """
    def __init__(self, *args, **kwds):
        """Creates a graph dataset.

        All arguments are passed on unchanged to Dataset.__init__
        (which raises ValueError when given neither an array nor a
        shape). has_graph starts out False.
        """
        # The method name must be spelled out in full: '__init' without
        # the trailing underscores is name-mangled inside a class body
        # and the lookup on Dataset fails with AttributeError.
        Dataset.__init__(self, *args, **kwds)
        self.has_graph = False
        
class Selection:
    """Keeps track of the identifiers selected along each dimension of
    a dataset.

    The selection is held in the current_selection dictionary, which is
    empty for a new instance.
    """
    def __init__(self):
        # Nothing is selected on creation; every instance owns its dict.
        self.current_selection = dict()
Made system a package. 2006-04-25 11:53:35 +02:00			`from system import logger`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`from scipy import atleast_2d,asarray,ArrayType`
First commit 2006-04-17 00:57:50 +02:00

			`class Dataset:`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`"""The Dataset base class.`

op 2006-04-17 11:08:40 +02:00			`A Dataset is an n-way array with defined string identifiers across`
			`all dimensions.`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00
			`example of use:`

			`---`
			`dim_name_rows = 'rows'`
			`names_rows = ('row_a','row_b')`
			`ids_1 = [dim_name_rows, names_rows]`

			`dim_name_cols = 'cols'`
			`names_cols = ('col_a','col_b','col_c','col_d')`
			`ids_2 = [dim_name_cols, names_cols]`

			`Array_X = rand(2,4)`
			`data = Dataset(Array_X,(ids_1,ids_2),name="Testing")`

			`dim_names = [dim for dim in data]`

			`column_identifiers = [id for id in data['cols'].keys()]`
			`column_index = [index for index in data['cols'].values()]`

			`'cols' in data -> True`

			`---`

			`data = Dataset(rand(10,20)) (generates dims and ids (no links))`
First commit 2006-04-17 00:57:50 +02:00			`"""`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`def __init__(self,array=None,identifiers=None,shape=None,all_dims=[],**kwds):`
			`self._name = kwds.get("name","Unnamed data")`
			`self._dims = [] #existing dimensions in this dataset`
			`self._map = {} # internal mapping for dataset: identifier <--> index`
			`self.has_array = False`
			`self.shape = None`

			`if array==None:`
			`if shape == None:`
			`raise ValueError, "Must define shape if array is None"`
			`else:`
			`self.shape = shape`
			`if identifiers!=None:`
			`self._set_identifiers(identifiers,all_dims)`
			`else:`
			`ids = self._create_identifiers(shape,all_dims)`
			`self._set_identifiers(ids,all_dims)`
			`elif isinstance(array,ArrayType):`
			`array = atleast_2d(asarray(array))`
			`self.shape = array.shape`
			`if shape != None:`
			`if self.shape!=shape:`
			`#logger.log("debug","Dataset and input shape mismatch")`
			`raise ValueError`
			`if identifiers!=None:`
			`self._set_identifiers(identifiers,all_dims)`
			`else:`
			`ids = self._create_identifiers(self.shape,all_dims)`
			`self._set_identifiers(ids,all_dims)`
mainly play in plots 2006-04-18 16:25:46 +02:00
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`self._array = array`
			`self.has_array = True`

			`else:`
			`raise ValueError, "array input must be of ArrayType or None"`

			`self._all_dims = all_dims`

			`def __str__self(self):`
Now can get and set name in Dataset. Also added getter for dim_names. 2006-04-21 14:28:29 +02:00			`return self._name`

Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`def __iter__(self):`
			`"""Returns an iterator over dimensions of dataset."""`
			`return self._dims.__iter__()`

			`def __contains__(self,dim):`
			`"""Returns True if dim is a dimension name in dataset."""`
			`# return self._dims.__contains__(dim)`
			`return self._map.__contains__(dim)`

			`def __len__(self):`
			`"""Returns the number of dimensions in the dataset"""`
			`return len(self._map)`

			`def __getitem__(self,dim):`
			`"""Return the identifers along the dimension dim."""`
			`return self._map[dim]`

			`def _create_identifiers(self,shape,all_dims):`
			`"""Creates dimension names and identifier names, and returns`
			`identifiers."""`
op 2006-04-17 11:08:40 +02:00
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`dim_names = ['rows','cols']`
			`ids = []`
			`for axis,n in enumerate(shape):`
			`if axis<2:`
			`dim_suggestion = dim_names[axis]`
			`else:`
			`dim_suggestion = 'dim'`
			`while dim_suggestion in all_dims:`
			`dim_suggestion = self._suggest_dim_name(dim_suggestion,all_dims)`
			`identifier_creation = [str(axis) + "_" + i for i in map(str,range(n))]`
			`ids.append((dim_suggestion,identifier_creation))`
			`all_dims.append(dim_suggestion)`
			`return ids`

			`def _set_identifiers(self,identifiers,all_dims):`
			`"""Creates internal mapping of identifiers structure."""`
			`for dim,ids in identifiers:`
			`pos_map={}`
			`if dim not in self._dims:`
			`self._dims.append(dim)`
			`all_dims.append(dim)`
			`else:`
			`raise ValueError, "Dimension names must be unique"`
mainly overhaul of observers, and removal of project singleton 2006-04-20 12:27:58 +02:00
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`for pos,id in enumerate(ids):`
			`pos_map[id] = pos`
			`self._map[dim] = pos_map`
			`shape_chk = [len(i) for j,i in identifiers]`
			`if shape_chk != list(self.shape):`
			`raise ValueError, "Shape input: %s and array: %s mismatch" %(shape_chk,self.shape)`
category data and plot selection update\| 2006-04-19 12:37:44 +02:00
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`def _suggest_dim_name(self,dim_name,all_dims):`
			`"""Suggests a unique name for dim and returns it"""`
			`c = 0`
			`while dim_name in all_dims:`
			`dim_name = dim_name + "_" + str(c)`
			`c+=1`
			`return dim_name`
category data and plot selection update\| 2006-04-19 12:37:44 +02:00
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`def asarray(self):`
			`"""Returns the numeric array (data) of dataset"""`
			`if not self.has_array:`
			`raise ValueError, "Dataset is empty"`
			`else:`
			`return self._array`
category data and plot selection update\| 2006-04-19 12:37:44 +02:00
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`def add_array(self,array):`
			`"""Adds array as an ArrayType object.`
			`A one-dim array is transformed to a two-dim array (row-vector)`
category data and plot selection update\| 2006-04-19 12:37:44 +02:00			`"""`

Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`if self.has_array:`
			`raise ValueError, "Dataset has array"`
			`else:`
			`if (len(self._map)!=len(array.shape)):`
			`raise ValueError, "range(array_dims) and range(dim_names) mismatch"`
			`if self.shape!=array.shape:`
			`raise ValueError, "Input array must be of similar dimensions as dataset"`
			`self._array = atleast_2d(asarray(array))`
			`self.has_array = True`

			`def get_name(self):`
get_dim_names and get_identifiers added in dataset 2006-04-24 13:23:30 +02:00			`"""Returns dataset name"""`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`return self._name`
get_dim_names and get_identifiers added in dataset 2006-04-24 13:23:30 +02:00
Added matrix-getter to Dataset. Added rma-function to preprocessing incorporated in AffyMatrix-importer. Added PCAFunction to go_workflow for processing loaded affymatrix data. 2006-04-24 16:07:34 +02:00			`def get_matrix(self):`
			`"""Returns internal numeric matrix for dataset."""`
			`return self._array`

Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`def get_all_dims(self):`
get_dim_names and get_identifiers added in dataset 2006-04-24 13:23:30 +02:00			`"""Returns all dimensions in project"""`
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`return self._all_dims`

get_dim_names and get_identifiers added in dataset 2006-04-24 13:23:30 +02:00			`def get_dim_names(self):`
			`"""Returns dim names"""`
			`return [dim for dim in self._dims]`

Implemented Dataset.get_indices() and extended Dataset.get_identifiers() to convert between indices and identifiers and subsets thereof. Also added tests. 2006-05-02 16:06:48 +02:00			`def get_identifiers(self, dim, indices=None):`
			`"""Returns identifiers along dim, sorted by position (index).`

			`You can optionally provide a list of indices to get only the`
			`identifiers of a given position.`
			`"""`
get_dim_names and get_identifiers added in dataset 2006-04-24 13:23:30 +02:00			`items = self._map[dim].items()`
			`backitems=[ [v[1],v[0]] for v in items]`
			`backitems.sort()`
			`sorted_ids=[ backitems[i][1] for i in range(0,len(backitems))]`
Implemented Dataset.get_indices() and extended Dataset.get_identifiers() to convert between indices and identifiers and subsets thereof. Also added tests. 2006-05-02 16:06:48 +02:00
			`if indices != None:`
			`return [sorted_ids[index] for index in indices]`
			`else:`
			`return sorted_ids`

			`def get_indices(self, dim, idents):`
			`"""Get indices for identifiers along dimension."""`
			`reverse = {}`
			`for key, value in self._map[dim].items():`
			`reverse[value] = key`
			`return [self._map[dim][key] for key in idents]`
get_dim_names and get_identifiers added in dataset 2006-04-24 13:23:30 +02:00
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00
			`class CategoryDataset(Dataset):`
			`"""The category dataset class.`

			`A dataset for representing class information as binary`
			`matrices (0/1-matrices).`

			`There is support for using a less memory demanding, and`
			`fast intersection look-ups by representing the binary matrix as a`
			`dictionary in each dimension.`
			`"""`

			`def __init__(self):`
			`Dataset.__init__(self)`
			`self.has_collection = False`
category data and plot selection update\| 2006-04-19 12:37:44 +02:00
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`def as_array(self):`
			`"""Returns data as binary matrix"""`
			`if not self.has_array and self.has_collection:`
			`#build numeric array`
			`pass`
category data and plot selection update\| 2006-04-19 12:37:44 +02:00
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`def as_collection(self,dim):`
			`"""Returns data as collection along dim"""`
			`pass`
mainly play in plots 2006-04-18 16:25:46 +02:00
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`def add_collection(self,input_dict):`
			`"""Adds a category data as collection.`

			`A collection is a datastructure that contains a dictionary for`
			`each pair of dimension in dataset, keyed by identifiers and`
			`values is a set of identifiers in the other dimension`
			`"""`
			`#build category data as double dicts`
			`pass`

category data and plot selection update\| 2006-04-19 12:37:44 +02:00
Complete rewrite of dataset class, with (all) the necessary updates 2006-04-24 11:53:07 +02:00			`class GraphDataset(Dataset):`
			`"""The graph dataset class.`

			`A dataset class for representing graphs using an adjacency matrix`
			`(aka. restricted to square symmetric signed integers matrices)`

			`If the library NetworkX is installed, there is support for`
			`representing the graph as a NetworkX.Graph, or NetworkX.XGraph structure.`
			`"""`
			`def __init__(self):`
			`Dataset.__init(self)`
			`self.has_graph = False`

First commit 2006-04-17 00:57:50 +02:00			`class Selection:`
op 2006-04-17 11:08:40 +02:00			`"""Handles selected identifiers along each dimension of a dataset"""`
First commit 2006-04-17 00:57:50 +02:00			`def __init__(self):`
			`self.current_selection={}`