Complete rewrite of dataset class, with (all) the necessary updates
This commit is contained in:
parent
53d0228074
commit
a2e4392a72
|
@ -1,135 +1,224 @@
|
||||||
import logger
|
import logger
|
||||||
from scipy import array,take,asarray,shape,nonzero
|
from scipy import atleast_2d,asarray,ArrayType
|
||||||
import project
|
|
||||||
from itertools import izip
|
|
||||||
|
|
||||||
|
|
||||||
class Dataset:
|
class Dataset:
|
||||||
"""Dataset base class.
|
"""The Dataset base class.
|
||||||
|
|
||||||
A Dataset is an n-way array with defined string identifiers across
|
A Dataset is an n-way array with defined string identifiers across
|
||||||
all dimensions.
|
all dimensions.
|
||||||
"""
|
|
||||||
def __init__(self, input_array, def_list, name="Unnamed data"):
|
|
||||||
self._name = name
|
|
||||||
self._data = asarray(input_array)
|
|
||||||
dims = shape(self._data)
|
|
||||||
self.def_list = def_list
|
|
||||||
self._ids_set = set()
|
|
||||||
self.ids={}
|
|
||||||
self._dim_num = {}
|
|
||||||
self._dim_names = []
|
|
||||||
if len(dims)==1: # a vector is defined to be column vector!
|
|
||||||
self.dims = (dims[0],1)
|
|
||||||
else:
|
|
||||||
self.dims = dims
|
|
||||||
if len(def_list)!=len(self.dims):
|
|
||||||
raise ValueError,"array dims and identifyer mismatch"
|
|
||||||
for axis,(dim_name,ids) in enumerate(def_list):
|
|
||||||
enum_ids = {}
|
|
||||||
#if dim_name not in project.c_p.dim_names:
|
|
||||||
# dim_name = project.c_p.suggest_dim_name(dim_name)
|
|
||||||
if not ids:
|
|
||||||
logger.log('debug','Creating identifiers along: '+ str(dim_name))
|
|
||||||
ids = self._create_identifiers(axis)
|
|
||||||
for num,name in enumerate(ids):
|
|
||||||
enum_ids[name] = num
|
|
||||||
self.ids[dim_name] = enum_ids
|
|
||||||
self._ids_set = self._ids_set.union(set(ids))
|
|
||||||
self._dim_num[dim_name] = axis
|
|
||||||
self._dim_names.append(dim_name)
|
|
||||||
|
|
||||||
for (dimname, ids), d in izip(def_list,self.dims): #check that data and labels match
|
example of use:
|
||||||
if len(self.ids[dimname]) != d:
|
|
||||||
raise ValueError,"dim size and identifyer mismatch"
|
---
|
||||||
|
dim_name_rows = 'rows'
|
||||||
|
names_rows = ('row_a','row_b')
|
||||||
|
ids_1 = [dim_name_rows, names_rows]
|
||||||
|
|
||||||
|
dim_name_cols = 'cols'
|
||||||
|
names_cols = ('col_a','col_b','col_c','col_d')
|
||||||
|
ids_2 = [dim_name_cols, names_cols]
|
||||||
|
|
||||||
|
Array_X = rand(2,4)
|
||||||
|
data = Dataset(Array_X,(ids_1,ids_2),name="Testing")
|
||||||
|
|
||||||
|
dim_names = [dim for dim in data]
|
||||||
|
|
||||||
|
column_identifiers = [id for id in data['cols'].keys()]
|
||||||
|
column_index = [index for index in data['cols'].values()]
|
||||||
|
|
||||||
|
'cols' in data -> True
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
data = Dataset(rand(10,20)) (generates dims and ids (no links))
|
||||||
|
"""
|
||||||
|
def __init__(self,array=None,identifiers=None,shape=None,all_dims=[],**kwds):
|
||||||
|
self._name = kwds.get("name","Unnamed data")
|
||||||
|
self._dims = [] #existing dimensions in this dataset
|
||||||
|
self._map = {} # internal mapping for dataset: identifier <--> index
|
||||||
|
self.has_array = False
|
||||||
|
self.shape = None
|
||||||
|
|
||||||
|
if array==None:
|
||||||
|
if shape == None:
|
||||||
|
raise ValueError, "Must define shape if array is None"
|
||||||
|
else:
|
||||||
|
self.shape = shape
|
||||||
|
if identifiers!=None:
|
||||||
|
self._set_identifiers(identifiers,all_dims)
|
||||||
|
else:
|
||||||
|
ids = self._create_identifiers(shape,all_dims)
|
||||||
|
self._set_identifiers(ids,all_dims)
|
||||||
|
elif isinstance(array,ArrayType):
|
||||||
|
array = atleast_2d(asarray(array))
|
||||||
|
self.shape = array.shape
|
||||||
|
if shape != None:
|
||||||
|
if self.shape!=shape:
|
||||||
|
#logger.log("debug","Dataset and input shape mismatch")
|
||||||
|
raise ValueError
|
||||||
|
if identifiers!=None:
|
||||||
|
self._set_identifiers(identifiers,all_dims)
|
||||||
|
else:
|
||||||
|
ids = self._create_identifiers(self.shape,all_dims)
|
||||||
|
self._set_identifiers(ids,all_dims)
|
||||||
|
|
||||||
|
self._array = array
|
||||||
|
self.has_array = True
|
||||||
|
|
||||||
|
else:
|
||||||
|
raise ValueError, "array input must be of ArrayType or None"
|
||||||
|
|
||||||
|
self._all_dims = all_dims
|
||||||
|
|
||||||
|
def __str__self(self):
|
||||||
|
return self._name
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
"""Returns an iterator over dimensions of dataset."""
|
||||||
|
return self._dims.__iter__()
|
||||||
|
|
||||||
|
def __contains__(self,dim):
|
||||||
|
"""Returns True if dim is a dimension name in dataset."""
|
||||||
|
# return self._dims.__contains__(dim)
|
||||||
|
return self._map.__contains__(dim)
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
"""Returns the number of dimensions in the dataset"""
|
||||||
|
return len(self._map)
|
||||||
|
|
||||||
|
def __getitem__(self,dim):
|
||||||
|
"""Return the identifers along the dimension dim."""
|
||||||
|
return self._map[dim]
|
||||||
|
|
||||||
|
def _create_identifiers(self,shape,all_dims):
|
||||||
|
"""Creates dimension names and identifier names, and returns
|
||||||
|
identifiers."""
|
||||||
|
|
||||||
|
dim_names = ['rows','cols']
|
||||||
|
ids = []
|
||||||
|
for axis,n in enumerate(shape):
|
||||||
|
if axis<2:
|
||||||
|
dim_suggestion = dim_names[axis]
|
||||||
|
else:
|
||||||
|
dim_suggestion = 'dim'
|
||||||
|
while dim_suggestion in all_dims:
|
||||||
|
dim_suggestion = self._suggest_dim_name(dim_suggestion,all_dims)
|
||||||
|
identifier_creation = [str(axis) + "_" + i for i in map(str,range(n))]
|
||||||
|
ids.append((dim_suggestion,identifier_creation))
|
||||||
|
all_dims.append(dim_suggestion)
|
||||||
|
return ids
|
||||||
|
|
||||||
|
def _set_identifiers(self,identifiers,all_dims):
|
||||||
|
"""Creates internal mapping of identifiers structure."""
|
||||||
|
for dim,ids in identifiers:
|
||||||
|
pos_map={}
|
||||||
|
if dim not in self._dims:
|
||||||
|
self._dims.append(dim)
|
||||||
|
all_dims.append(dim)
|
||||||
|
else:
|
||||||
|
raise ValueError, "Dimension names must be unique"
|
||||||
|
|
||||||
|
for pos,id in enumerate(ids):
|
||||||
|
pos_map[id] = pos
|
||||||
|
self._map[dim] = pos_map
|
||||||
|
shape_chk = [len(i) for j,i in identifiers]
|
||||||
|
if shape_chk != list(self.shape):
|
||||||
|
raise ValueError, "Shape input: %s and array: %s mismatch" %(shape_chk,self.shape)
|
||||||
|
|
||||||
|
def _suggest_dim_name(self,dim_name,all_dims):
|
||||||
|
"""Suggests a unique name for dim and returns it"""
|
||||||
|
c = 0
|
||||||
|
while dim_name in all_dims:
|
||||||
|
dim_name = dim_name + "_" + str(c)
|
||||||
|
c+=1
|
||||||
|
return dim_name
|
||||||
|
|
||||||
|
def asarray(self):
|
||||||
|
"""Returns the numeric array (data) of dataset"""
|
||||||
|
if not self.has_array:
|
||||||
|
raise ValueError, "Dataset is empty"
|
||||||
|
else:
|
||||||
|
return self._array
|
||||||
|
|
||||||
|
def add_array(self,array):
|
||||||
|
"""Adds array as an ArrayType object.
|
||||||
|
A one-dim array is transformed to a two-dim array (row-vector)
|
||||||
|
"""
|
||||||
|
|
||||||
|
if self.has_array:
|
||||||
|
raise ValueError, "Dataset has array"
|
||||||
|
else:
|
||||||
|
if (len(self._map)!=len(array.shape)):
|
||||||
|
raise ValueError, "range(array_dims) and range(dim_names) mismatch"
|
||||||
|
if self.shape!=array.shape:
|
||||||
|
raise ValueError, "Input array must be of similar dimensions as dataset"
|
||||||
|
self._array = atleast_2d(asarray(array))
|
||||||
|
self.has_array = True
|
||||||
|
|
||||||
def get_name(self):
|
def get_name(self):
|
||||||
return self._name
|
return self._name
|
||||||
|
def get_all_dims(self):
|
||||||
|
return self._all_dims
|
||||||
|
|
||||||
def get_dim_names(self):
|
def get_identifiers(self):
|
||||||
return self._dim_names
|
#return [n for n in self._map.iteritems()]
|
||||||
|
|
||||||
def names(self,axis=0):
|
|
||||||
"""Returns identifier names of a dimension.
|
|
||||||
NB: sorted by values!
|
|
||||||
OK? necessary?"""
|
|
||||||
|
|
||||||
if type(axis)==int:
|
|
||||||
dim_name = self._dim_names[axis]
|
|
||||||
elif type(axis)==str:
|
|
||||||
dim_name = axis
|
|
||||||
if dim_name not in self._dim_names:
|
|
||||||
raise ValueError, dim_name + " not a dimension in dataset"
|
|
||||||
items = self.ids[dim_name].items()
|
|
||||||
backitems=[ [v[1],v[0]] for v in items]
|
|
||||||
backitems.sort()
|
|
||||||
sorted_ids=[ backitems[i][1] for i in range(0,len(backitems))]
|
|
||||||
return sorted_ids
|
|
||||||
|
|
||||||
def extract_data(self,ids,dim_name):
|
|
||||||
"""Extracts data along a dimension by identifiers"""
|
|
||||||
new_def_list = self.def_list[:]
|
|
||||||
ids_index = [self.ids[dim_name][id_name] for id_name in ids]
|
|
||||||
dim_number = self._dim_num[dim_name]
|
|
||||||
try:
|
|
||||||
out_data = take(self._data,ids_index,axis=dim_number)
|
|
||||||
except:
|
|
||||||
raise ValueError
|
|
||||||
new_def_list[dim_number][1] = ids
|
|
||||||
extracted_data = Dataset(out_data,def_list=new_def_list,parents=self.parents)
|
|
||||||
return extracted_data
|
|
||||||
|
|
||||||
def _create_identifiers(self,axis):
|
|
||||||
"""Creates identifiers along an axis"""
|
|
||||||
n_dim = self.dims[axis]
|
|
||||||
return [str(axis) + '_' + str(i) for i in range(n_dim)]
|
|
||||||
|
|
||||||
def extract_id_from_index(self,dim_name,index):
|
|
||||||
"""Returns a set of ids from array/list of indexes."""
|
|
||||||
dim_ids = self.ids[dim_name]
|
|
||||||
if type(index)==int:
|
|
||||||
index = [index]
|
|
||||||
|
|
||||||
return set([id for id,ind in dim_ids.items() if ind in index])
|
|
||||||
|
|
||||||
def extract_index_from_id(self,dim_name,id):
|
|
||||||
"""Returns an array of indexes from a set/list of identifiers
|
|
||||||
(or a single id)"""
|
|
||||||
dim_ids = self.ids[dim_name]
|
|
||||||
return array([ind for name,ind in dim_ids.items() if name in id])
|
|
||||||
|
|
||||||
|
# ensure correct order
|
||||||
|
# this has correct dims but not identifiers
|
||||||
|
ids = []
|
||||||
|
for dim in self._dims:
|
||||||
|
ids.append((dim,self._map[dim].keys()))
|
||||||
|
return ids
|
||||||
|
|
||||||
class CategoryDataset(Dataset):
|
class CategoryDataset(Dataset):
|
||||||
def __init__(self,array,def_list):
|
"""The category dataset class.
|
||||||
Dataset.__init__(self,array,def_list)
|
|
||||||
|
|
||||||
def get_elements_by_category(self,dim,category):
|
A dataset for representing class information as binary
|
||||||
"""Returns all elements along input dim belonging to category.
|
matrices (0/1-matrices).
|
||||||
Assumes a two-dim category data only!
|
|
||||||
|
There is support for using a less memory demanding, and
|
||||||
|
fast intersection look-ups by representing the binary matrix as a
|
||||||
|
dictionary in each dimension.
|
||||||
"""
|
"""
|
||||||
if type(category)!=list:
|
|
||||||
raise ValueError, "category must be list"
|
def __init__(self):
|
||||||
gene_ids = []
|
Dataset.__init__(self)
|
||||||
axis_dim = self._dim_num[dim]
|
self.has_collection = False
|
||||||
cat_index = self.extract_index_from_id(category)
|
|
||||||
for ind in cat_index:
|
def as_array(self):
|
||||||
if axis_dim==0:
|
"""Returns data as binary matrix"""
|
||||||
gene_indx = nonzero(self._data[:,ind])
|
if not self.has_array and self.has_collection:
|
||||||
elif axis_dim==1:
|
#build numeric array
|
||||||
gene_indx = nonzero(self._data[ind,:])
|
pass
|
||||||
else:
|
|
||||||
ValueError, "Only support for 2-dim data"
|
def as_collection(self,dim):
|
||||||
gene_ids.append(self.extract_id_from_index(dim,gene_index))
|
"""Returns data as collection along dim"""
|
||||||
return gene_ids
|
pass
|
||||||
|
|
||||||
|
def add_collection(self,input_dict):
|
||||||
|
"""Adds a category data as collection.
|
||||||
|
|
||||||
|
A collection is a datastructure that contains a dictionary for
|
||||||
|
each pair of dimension in dataset, keyed by identifiers and
|
||||||
|
values is a set of identifiers in the other dimension
|
||||||
|
"""
|
||||||
|
#build category data as double dicts
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class GraphDataset(Dataset):
|
||||||
|
"""The graph dataset class.
|
||||||
|
|
||||||
|
A dataset class for representing graphs using an adjacency matrix
|
||||||
|
(aka. restricted to square symmetric signed integers matrices)
|
||||||
|
|
||||||
|
If the library NetworkX is installed, there is support for
|
||||||
|
representing the graph as a NetworkX.Graph, or NetworkX.XGraph structure.
|
||||||
|
"""
|
||||||
|
def __init__(self):
|
||||||
|
Dataset.__init(self)
|
||||||
|
self.has_graph = False
|
||||||
|
|
||||||
class Selection:
|
class Selection:
|
||||||
"""Handles selected identifiers along each dimension of a dataset"""
|
"""Handles selected identifiers along each dimension of a dataset"""
|
||||||
|
|
|
@ -58,7 +58,7 @@ class CreateProjectDruid(gtk.Window):
|
||||||
|
|
||||||
for dir in wf_path:
|
for dir in wf_path:
|
||||||
for fn in os.listdir(dir):
|
for fn in os.listdir(dir):
|
||||||
if fn.endswith('.py'):
|
if fn.endswith('.py') and ('#' not in fn) :
|
||||||
wf_files.append(fn[:-3])
|
wf_files.append(fn[:-3])
|
||||||
|
|
||||||
# Try to load each file and look for Workflow derived classes
|
# Try to load each file and look for Workflow derived classes
|
||||||
|
@ -68,6 +68,7 @@ class CreateProjectDruid(gtk.Window):
|
||||||
for wf in wf_info:
|
for wf in wf_info:
|
||||||
store.insert_after(None, (getattr(wf, 'name'), wf))
|
store.insert_after(None, (getattr(wf, 'name'), wf))
|
||||||
except Exception, e:
|
except Exception, e:
|
||||||
|
wf_info = self.workflow_classes(fn)
|
||||||
logger.log('warning', 'Cannot load workflow: %s' % fn)
|
logger.log('warning', 'Cannot load workflow: %s' % fn)
|
||||||
|
|
||||||
return store
|
return store
|
||||||
|
|
|
@ -224,25 +224,27 @@ class SinePlot(Plot):
|
||||||
|
|
||||||
|
|
||||||
class ScatterPlot(Plot):
|
class ScatterPlot(Plot):
|
||||||
def __init__(self,project):
|
def __init__(self,project,dataset,id_dim,sel_dim,id_1,id_2):
|
||||||
Plot.__init__(self,project)
|
Plot.__init__(self,project)
|
||||||
|
self.project = project
|
||||||
fig = Figure(figsize=(5,4), dpi=72)
|
fig = Figure(figsize=(5,4), dpi=72)
|
||||||
self.ax = ax = fig.add_subplot(111)
|
self.ax = ax = fig.add_subplot(111)
|
||||||
|
self.current_dim = id_dim
|
||||||
# testing testing
|
# testing testing
|
||||||
self.x_dataset = project.datasets[0]
|
self.dataset = dataset
|
||||||
x = self.x_dataset._data
|
x_index = dataset[sel_dim][id_1]
|
||||||
self.xaxis_data = xaxis_data = x[:,0] + scipy.randn(scipy.shape(x)[0])
|
y_index = dataset[sel_dim][id_2]
|
||||||
self.yaxis_data = yaxis_data = x[:,1]
|
|
||||||
self.current_dim = self.x_dataset._dim_names[0]
|
self.xaxis_data = dataset._array[:,x_index]
|
||||||
ax.plot(xaxis_data,yaxis_data,'og')
|
self.yaxis_data = dataset._array[:,y_index]
|
||||||
|
ax.plot(self.xaxis_data,self.yaxis_data,'og')
|
||||||
###
|
###
|
||||||
|
|
||||||
self.canvas = FigureCanvas(fig)
|
self.canvas = FigureCanvas(fig)
|
||||||
self.add(self.canvas)
|
self.add(self.canvas)
|
||||||
rectprops = dict(facecolor='gray', edgecolor = 'black',
|
rectprops = dict(facecolor='gray', edgecolor = 'black',
|
||||||
alpha=0.2, fill=True) #cool
|
alpha=0.2, fill=True) #cool
|
||||||
sel = RectangleSelector(ax, self.rectangle_select_callback,
|
self.sel = RectangleSelector(ax, self.rectangle_select_callback,
|
||||||
drawtype='box',useblit=True,rectprops=rectprops)
|
drawtype='box',useblit=True,rectprops=rectprops)
|
||||||
|
|
||||||
self.canvas.show()
|
self.canvas.show()
|
||||||
|
@ -251,45 +253,36 @@ class ScatterPlot(Plot):
|
||||||
'event1 and event2 are the press and release events'
|
'event1 and event2 are the press and release events'
|
||||||
x1, y1 = event1.xdata, event1.ydata
|
x1, y1 = event1.xdata, event1.ydata
|
||||||
x2, y2 = event2.xdata, event2.ydata
|
x2, y2 = event2.xdata, event2.ydata
|
||||||
logger.log('debug', "(%3.2f, %3.2f) --> (%3.2f, %3.2f)"%(x1,y1,x2,y2))
|
|
||||||
logger.log('debug',"The button you used were:%s, %s "%(event1.button, event2.button))
|
|
||||||
# get all points within x1, y1, x2, y2
|
|
||||||
ydata = self.yaxis_data
|
ydata = self.yaxis_data
|
||||||
xdata = self.xaxis_data
|
xdata = self.xaxis_data
|
||||||
if x1>x2:
|
if x1>x2:
|
||||||
logger.log('debug','Selection x_start bigger than x_end')
|
|
||||||
if y1<y2:
|
if y1<y2:
|
||||||
logger.log('debug','Selection y_start less than y_end')
|
|
||||||
index =scipy.nonzero((xdata<x1) & (xdata>x2) & (ydata>y1) & (ydata<y2))
|
index =scipy.nonzero((xdata<x1) & (xdata>x2) & (ydata>y1) & (ydata<y2))
|
||||||
else:
|
else:
|
||||||
logger.log('debug','Selection y_start larger than y_end')
|
|
||||||
index =scipy.nonzero((xdata<x1) & (xdata>x2) & (ydata<y1) & (ydata>y2))
|
index =scipy.nonzero((xdata<x1) & (xdata>x2) & (ydata<y1) & (ydata>y2))
|
||||||
else:
|
else:
|
||||||
logger.log('debug','Selection x_start less than x_end')
|
#logger.log('debug','Selection x_start less than x_end')
|
||||||
if y1<y2:
|
if y1<y2:
|
||||||
logger.log('debug','Selection y_start less than y_end')
|
#logger.log('debug','Selection y_start less than y_end')
|
||||||
index =scipy.nonzero((xdata>x1) & (xdata<x2) & (ydata>y1) & (ydata<y2))
|
index =scipy.nonzero((xdata>x1) & (xdata<x2) & (ydata>y1) & (ydata<y2))
|
||||||
else:
|
else:
|
||||||
logger.log('debug','Selection y_start bigger than y_end')
|
#logger.log('debug','Selection y_start bigger than y_end')
|
||||||
index =scipy.nonzero((xdata>x1) & (xdata<x2) & (ydata<y1) & (ydata>y2))
|
index =scipy.nonzero((xdata>x1) & (xdata<x2) & (ydata<y1) & (ydata>y2))
|
||||||
|
|
||||||
|
|
||||||
if len(index)==0:
|
if len(index)==0:
|
||||||
logger.log('debug','No points selected!')
|
logger.log('debug','No points selected!')
|
||||||
else:
|
else:
|
||||||
logger.log('debug','Selected:\n%s'%index)
|
ids = [id for id,ind in self.dataset[self.current_dim].items() if ind in index]
|
||||||
ids = self.x_dataset.extract_id_from_index('samples',index)
|
|
||||||
#update selection object
|
|
||||||
self.project.set_selection(self.current_dim,ids)
|
self.project.set_selection(self.current_dim,ids)
|
||||||
logger.log('debug','Selected identifiers:\n%s'%ids)
|
|
||||||
|
|
||||||
def update(self,project,key):
|
def update(self,project,key):
|
||||||
curr_sel = project.get_selection() # get selection object
|
curr_sel = project.get_selection() # get selection object
|
||||||
ids = curr_sel[self.current_dim] # current identifiers
|
ids = curr_sel[self.current_dim] # current identifiers
|
||||||
index = self.x_dataset.extract_index_from_id(self.current_dim,ids) #conversion to index
|
|
||||||
|
index = [ind for id,ind in self.dataset[self.current_dim].items() if id in ids] #conversion to index
|
||||||
xdata_new = scipy.take(self.xaxis_data,index) #take data
|
xdata_new = scipy.take(self.xaxis_data,index) #take data
|
||||||
ydata_new = scipy.take(self.yaxis_data,index)
|
ydata_new = scipy.take(self.yaxis_data,index)
|
||||||
self.ax.plot(self.xaxis_data,self.yaxis_data,'ob')
|
self.ax.plot(self.xaxis_data,self.yaxis_data,'og')
|
||||||
self.ax.plot(xdata_new,ydata_new,'or')
|
self.ax.plot(xdata_new,ydata_new,'or')
|
||||||
self.canvas.draw()
|
self.canvas.draw()
|
||||||
|
|
||||||
|
|
|
@ -13,8 +13,8 @@ class Project:
|
||||||
self.name = name
|
self.name = name
|
||||||
self.dim_names = []
|
self.dim_names = []
|
||||||
self._observers = {}
|
self._observers = {}
|
||||||
self.current_data=None
|
self.current_data = None
|
||||||
self.datasets=[]
|
self.datasets = []
|
||||||
self.sel_obj = dataset.Selection()
|
self.sel_obj = dataset.Selection()
|
||||||
|
|
||||||
def attach(self,observer,key):
|
def attach(self,observer,key):
|
||||||
|
@ -69,15 +69,11 @@ class Project:
|
||||||
def add_dataset(self,dataset):
|
def add_dataset(self,dataset):
|
||||||
"""Appends a new Dataset to the project."""
|
"""Appends a new Dataset to the project."""
|
||||||
self.datasets.append(dataset)
|
self.datasets.append(dataset)
|
||||||
for dim_name in dataset.ids.keys():
|
for dim_name in dataset.get_all_dims():
|
||||||
if dim_name not in self.dim_names:
|
if dim_name not in self.dim_names:
|
||||||
self.dim_names.append(dim_name)
|
self.dim_names.append(dim_name)
|
||||||
|
self.sel_obj.current_selection[dim_name] = set()
|
||||||
|
|
||||||
def suggest_dim_name(self,dim_name):
|
|
||||||
"""Creates an arbitrary unique name for a new dimension."""
|
|
||||||
while dim_name in self.dim_names:
|
|
||||||
dim_name = dim_name + "_t"
|
|
||||||
return dim_name
|
|
||||||
|
|
||||||
def object_at(self, path):
|
def object_at(self, path):
|
||||||
"Returns the object at a given path in the tree."
|
"Returns the object at a given path in the tree."
|
||||||
|
|
|
@ -30,6 +30,9 @@ class Workflow:
|
||||||
print ' %s' % fun.name
|
print ' %s' % fun.name
|
||||||
|
|
||||||
def add_project(self,project):
|
def add_project(self,project):
|
||||||
|
if project == None:
|
||||||
|
logger.log('notice','Proejct is empty')
|
||||||
|
logger.log('notice','Project added in : %s' %self.name)
|
||||||
self.project = project
|
self.project = project
|
||||||
|
|
||||||
class EmptyWorkflow(Workflow):
|
class EmptyWorkflow(Workflow):
|
||||||
|
|
|
@ -5,28 +5,30 @@ from dataset import *
|
||||||
from scipy import rand,shape
|
from scipy import rand,shape
|
||||||
|
|
||||||
class DatasetTest(unittest.TestCase):
|
class DatasetTest(unittest.TestCase):
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.dim_0_ids = ['sample_a','sample_b']
|
dim_0_ids = ('sample_a','sample_b')
|
||||||
self.dim_1_ids = ['gene_a','gene_b','gene_c']
|
dim_1_ids = ('gene_a','gene_b','gene_c')
|
||||||
self.dim_labels = ['samples','genes']
|
dim_labels = ('samples','genes')
|
||||||
self.def_list = [[self.dim_labels[0],self.dim_0_ids],[self.dim_labels[1],self.dim_1_ids]]
|
identifiers= [(dim_labels[0],dim_0_ids),(dim_labels[1],dim_1_ids)]
|
||||||
self.array = rand(2,3)
|
self.array = rand(2,3)
|
||||||
self.testdata = Dataset(self.array,self.def_list)
|
self.testdata = Dataset(self.array,identifiers)
|
||||||
|
|
||||||
def testCreation(self):
|
def testCreation(self):
|
||||||
assert self.testdata._data == self.array
|
data = self.testdata
|
||||||
assert 'sample_a' in self.testdata.ids['samples'].keys()
|
assert data._array == self.array
|
||||||
assert 'gene_b' in self.testdata.ids['genes'].keys()
|
assert 'sample_a' in data['samples'].keys()
|
||||||
|
assert data['samples']['sample_b']==1
|
||||||
|
assert 'gene_c' in data['genes'].keys()
|
||||||
|
assert data['genes']['gene_c']==2
|
||||||
|
|
||||||
|
#def testExtraction(self):
|
||||||
|
# ids = ['gene_a','gene_b']
|
||||||
def testExtraction(self):
|
# dim_name = 'genes'
|
||||||
ids = ['gene_a','gene_b']
|
# subset = self.testdata.extract_data(ids,dim_name)
|
||||||
dim_name = 'genes'
|
# assert shape(subset._data) == (2,2)
|
||||||
subset = self.testdata.extract_data(ids,dim_name)
|
# assert subset.ids[dim_name].keys() == ids
|
||||||
assert shape(subset._data) == (2,2)
|
# assert subset.ids[dim_name].values() == [0,1]
|
||||||
assert subset.ids[dim_name].keys() == ids
|
|
||||||
assert subset.ids[dim_name].values() == [0,1]
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
|
@ -102,9 +102,7 @@ class TestDataFunction(Function):
|
||||||
def run(self, data):
|
def run(self, data):
|
||||||
logger.log('notice', 'Injecting foo test data')
|
logger.log('notice', 'Injecting foo test data')
|
||||||
x = randn(20,30)
|
x = randn(20,30)
|
||||||
axis_0 = ['rows',[]]
|
X = dataset.Dataset(x)
|
||||||
axis_1 = ['cols',[]]
|
|
||||||
X = dataset.Dataset(x,[axis_0,axis_1])
|
|
||||||
return [X, plots.SinePlot(None)]
|
return [X, plots.SinePlot(None)]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,23 +1,27 @@
|
||||||
import gtk
|
import gtk
|
||||||
import logger
|
import logger
|
||||||
from workflow import *
|
from workflow import *
|
||||||
from scipy import array,zeros
|
import pickle
|
||||||
from data import read_affy_annot,read_mootha,data_dict_to_matrix
|
from scipy import log2,transpose,dot,divide,shape,mean,resize,zeros
|
||||||
|
from scipy.linalg import svd,inv,norm,get_blas_funcs,eig
|
||||||
import plots
|
import plots
|
||||||
import dataset
|
import dataset
|
||||||
|
|
||||||
class PCAWorkflow(Workflow):
|
class PCAWorkflow(Workflow):
|
||||||
|
name = 'PCA Workflow'
|
||||||
|
description = 'PCA workflow. Uses real microarray data from a study of diabetes (Mootha et al.).'
|
||||||
|
|
||||||
def __init__(self, app):
|
def __init__(self, app):
|
||||||
Workflow.__init__(self, app)
|
Workflow.__init__(self, app)
|
||||||
self.name = 'PCAs Workflow'
|
#self.add_project(app.project)
|
||||||
|
#logger.log('notice','Current project added to: %s' %self.name)
|
||||||
|
|
||||||
load = Stage('load', 'Load Data')
|
load = Stage('load', 'Load Data')
|
||||||
load.add_function(LoadMoothaData())
|
load.add_function(LoadMoothaData())
|
||||||
self.add_stage(load)
|
self.add_stage(load)
|
||||||
|
|
||||||
preproc = Stage('preprocess', 'Preprocessing')
|
preproc = Stage('preprocess', 'Preprocessing')
|
||||||
preproc.add_function(Function('log2', 'Logarithm'))
|
preproc.add_function(Log2Function())
|
||||||
self.add_stage(preproc)
|
self.add_stage(preproc)
|
||||||
|
|
||||||
annot = Stage('annot', 'Affy annotations')
|
annot = Stage('annot', 'Affy annotations')
|
||||||
|
@ -25,7 +29,7 @@ class PCAWorkflow(Workflow):
|
||||||
self.add_stage(annot)
|
self.add_stage(annot)
|
||||||
|
|
||||||
model = Stage('model', 'Model')
|
model = Stage('model', 'Model')
|
||||||
model.add_function(Function('pca', 'PCA'))
|
model.add_function(PCAFunction(self))
|
||||||
self.add_stage(model)
|
self.add_stage(model)
|
||||||
|
|
||||||
logger.log('debug', '\tPCA\'s workflow is now active')
|
logger.log('debug', '\tPCA\'s workflow is now active')
|
||||||
|
@ -34,7 +38,6 @@ class LoadAnnotationsFunction(Function):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
Function.__init__(self, 'load', 'Load Annotations')
|
Function.__init__(self, 'load', 'Load Annotations')
|
||||||
self.annotations = None
|
|
||||||
|
|
||||||
def load_affy_file(self, filename):
|
def load_affy_file(self, filename):
|
||||||
f = open(filename)
|
f = open(filename)
|
||||||
|
@ -68,81 +71,188 @@ class LoadAnnotationsFunction(Function):
|
||||||
pathways = description[i_want]
|
pathways = description[i_want]
|
||||||
if not pathways[0][0]=='--':
|
if not pathways[0][0]=='--':
|
||||||
pass
|
pass
|
||||||
|
D = []
|
||||||
|
return [D]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
return [self.annotations]
|
|
||||||
|
|
||||||
class PCAFunction(Function):
|
class PCAFunction(Function):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self,workflow):
|
||||||
Function.__init__(self, 'X', 'a_opt')
|
Function.__init__(self, 'pca', 'PCA')
|
||||||
self.output = None
|
self.output = None
|
||||||
|
self.workflow = workflow
|
||||||
|
|
||||||
def run(self, data):
|
def run(self, data):
|
||||||
logger.log('debug', 'datatype: %s' % type(data))
|
logger.log('debug', 'datatype: %s' % type(data))
|
||||||
if not isinstance(data,dataset.Dataset):
|
if not isinstance(data,dataset.Dataset):
|
||||||
return None
|
return None
|
||||||
logger.log('debug', 'dimensions: %s' % data.dims)
|
#logger.log('debug', 'dimensions: %s' % data.dims)
|
||||||
|
|
||||||
## calculations
|
## calculations
|
||||||
T,P,E,tsq = pca(data._data,a_opt=2)
|
T,P,E,tsq = self.pca(data._array,5,tsq_loads=False)
|
||||||
comp_def = ['comp',['1','2']]
|
comp_def = ('comp',('1','2','3','4','5'))
|
||||||
singel_def = ['1',['s']]
|
singel_def = ('1',('s'))
|
||||||
col_def = [data._dim_names[0],data.names(0)]
|
|
||||||
row_def = [data._dim_names[1],data.names(1)]
|
# pull out input identifiers:
|
||||||
T = dataset.Dataset(T,[col_def,comp_def])
|
data_ids = []
|
||||||
P = dataset.Dataset(T,[row_def,comp_def])
|
for dim in data:
|
||||||
E = dataset.Dataset(E,[col_def,row_def])
|
data_ids.append((dim,data[dim].keys()))
|
||||||
tsq = dataset.Dataset(tsq,[row_def,sigel_def])
|
|
||||||
|
T = dataset.Dataset(T,(data_ids[0],comp_def))
|
||||||
|
P = dataset.Dataset(P,[data_ids[1],comp_def])
|
||||||
|
E = dataset.Dataset(E,data_ids)
|
||||||
|
#tsq = dataset.Dataset(tsq,[singel_def,data_ids[1])
|
||||||
|
|
||||||
## plots
|
## plots
|
||||||
loading_plot = plots.ScatterPlot()
|
loading_plot1 = plots.ScatterPlot(self.workflow.project,P,'genes','comp','1','2')
|
||||||
|
loading_plot2 = plots.ScatterPlot(self.workflow.project,P,'genes','comp','3','4')
|
||||||
|
score_plot = plots.ScatterPlot(self.workflow.project,T,'samples','comp','1','2')
|
||||||
|
|
||||||
|
return [T,P,E,loading_plot1,loading_plot2,score_plot]
|
||||||
|
|
||||||
|
def pca(self,X,a_opt,cent=True,scale='loads',tsq_loads=False):
|
||||||
|
"""Principal component analysis
|
||||||
|
|
||||||
|
input:
|
||||||
|
Xc -- matrix, data
|
||||||
|
a_opt -- scalar, max number of comp. to calculate
|
||||||
|
cent -- bool, centering [True]
|
||||||
|
crit -- string, pc criteria ['exp_var',['ief','rpv','average']]
|
||||||
|
scale -- string, scaling ['loads',['scores']]
|
||||||
|
tsq_loads -- bool, calculate t-squared? [True]
|
||||||
|
reg -- float, covariance regularizer for tsq calculations [0.2]
|
||||||
|
output:
|
||||||
|
T,P,E,r
|
||||||
|
|
||||||
|
"""
|
||||||
|
nSamples,nVarX = shape(X)
|
||||||
|
if cent:
|
||||||
|
Xc = self.mat_center(X)
|
||||||
|
else:
|
||||||
|
Xc = X
|
||||||
|
u,s,vh = self.esvd(Xc)
|
||||||
|
if scale=='scores':
|
||||||
|
T = u*s
|
||||||
|
T = T[:,:a_opt]
|
||||||
|
P = transpose(vh)
|
||||||
|
P = P[:,:a_opt]
|
||||||
|
elif scale=='loads':
|
||||||
|
T = u
|
||||||
|
T = T[:,:a_opt]
|
||||||
|
P = transpose(vh)*s
|
||||||
|
P = P[:,:a_opt]
|
||||||
|
|
||||||
|
E = Xc - dot(T,transpose(P))
|
||||||
|
varEach = s**2
|
||||||
|
totVar = sum(varEach)
|
||||||
|
r = divide(varEach,totVar)*100
|
||||||
|
return T,P,E,r
|
||||||
|
|
||||||
|
def mat_center(self,X,axis=0,ret_mn=False):
|
||||||
|
"""Mean center matrix along axis.
|
||||||
|
|
||||||
|
input:
|
||||||
|
X -- matrix, data
|
||||||
|
axis -- dim,
|
||||||
|
ret_mn -- bool, return mean
|
||||||
|
output:
|
||||||
|
Xc, [mnX]
|
||||||
|
|
||||||
|
NB: axis = 1 is column-centering, axis=0=row-centering
|
||||||
|
default is row centering (axis=0)
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
rows,cols = shape(X)
|
||||||
|
except ValueError:
|
||||||
|
print "The X data needs to be two-dimensional"
|
||||||
|
|
||||||
|
if axis==0:
|
||||||
|
mnX = mean(X,axis)
|
||||||
|
Xs = X - resize(mnX,(rows,cols))
|
||||||
|
|
||||||
|
elif axis==1:
|
||||||
|
mnX = mean(X,axis)
|
||||||
|
Xs = transpose(transpose(X) - resize(mnX,(cols,rows)))
|
||||||
|
if ret_mn:
|
||||||
|
return Xs,mnX
|
||||||
|
else:
|
||||||
|
return Xs
|
||||||
|
|
||||||
|
def esvd(self,data,economy=1):
|
||||||
|
"""SVD with the option of economy sized calculation
|
||||||
|
Calculate subspaces of X'X or XX' depending on the shape
|
||||||
|
of the matrix.
|
||||||
|
Good for extreme fat or thin matrices.
|
||||||
|
|
||||||
|
"""
|
||||||
|
mm = self.mm
|
||||||
|
m,n = shape(data)
|
||||||
|
if m>=n:
|
||||||
|
u,s,v = svd(mm(data,data,trans_a=1))
|
||||||
|
u = mm(data,v,trans_b=1)
|
||||||
|
for i in xrange(n):
|
||||||
|
s[i] = norm(u[:,i])
|
||||||
|
u[:,i] = u[:,i]/s[i]
|
||||||
|
else:
|
||||||
|
u,s,v = svd(mm(data,data,trans_b=1))
|
||||||
|
v = mm(u,data,trans_a=1)
|
||||||
|
for i in xrange(m):
|
||||||
|
s[i] = norm(v[i,:])
|
||||||
|
v[i,:] = v[i,:]/s[i]
|
||||||
|
return u,s,v
|
||||||
|
|
||||||
|
def mm(self,a,b, alpha=1.0, beta=0.0, c=None, trans_a=0,
|
||||||
|
trans_b=0):
|
||||||
|
"""Fast matrix multiplication
|
||||||
|
|
||||||
|
Return alpha*(a*b) + beta*c.
|
||||||
|
|
||||||
|
a,b,c : matrices
|
||||||
|
alpha, beta: scalars
|
||||||
|
trans_a : 0 (a not transposed),
|
||||||
|
1 (a transposed),
|
||||||
|
2 (a conjugate transposed)
|
||||||
|
trans_b : 0 (b not transposed),
|
||||||
|
1 (b transposed),
|
||||||
|
2 (b conjugate transposed)
|
||||||
|
"""
|
||||||
|
if c:
|
||||||
|
gemm,= get_blas_funcs(('gemm',),(a,b,c))
|
||||||
|
else:
|
||||||
|
gemm,= get_blas_funcs(('gemm',),(a,b))
|
||||||
|
|
||||||
|
return gemm(alpha, a, b, beta, c, trans_a, trans_b)
|
||||||
|
|
||||||
return [T,P,E,r]
|
|
||||||
|
|
||||||
class LoadMoothaData(Function):
|
class LoadMoothaData(Function):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
Function.__init__(self, 'load', 'Load diabetes data')
|
Function.__init__(self, 'load', 'Load diabetes data')
|
||||||
self.annotations = None
|
|
||||||
|
|
||||||
def load_expression_file(self, filename):
|
def run(self,data):
|
||||||
f = open(filename)
|
data_file = open('full_data.pickle','r')
|
||||||
logger.log('notice', 'Loading expression file: %s' % filename)
|
data = pickle.load(data_file)
|
||||||
self.file = f
|
data_file.close()
|
||||||
self.filename = filename
|
sample_file = open('sample_labels.pickle','r')
|
||||||
|
sample_names = pickle.load(sample_file)
|
||||||
def on_response(self, dialog, response):
|
sample_file.close()
|
||||||
if response == gtk.RESPONSE_OK:
|
typecode='f'
|
||||||
logger.log('notice', 'Reading file: %s' % dialog.get_filename())
|
nSamps = len(sample_names)
|
||||||
self.load_expression_file(dialog.get_filename())
|
nVars = len(data.keys())
|
||||||
|
|
||||||
def run(self, data):
|
|
||||||
btns = ('Open', gtk.RESPONSE_OK, \
|
|
||||||
'Cancel', gtk.RESPONSE_CANCEL)
|
|
||||||
dialog = gtk.FileChooserDialog('Open diabetes expression File',
|
|
||||||
buttons=btns)
|
|
||||||
dialog.connect('response', self.on_response)
|
|
||||||
dialog.run()
|
|
||||||
dialog.destroy()
|
|
||||||
|
|
||||||
### Reading and parsing here
|
|
||||||
d,sample_names = read_mootha()
|
|
||||||
n_samps = len(sample_names)
|
|
||||||
n_genes = len(d.keys())
|
|
||||||
typecode = 'f'
|
|
||||||
x = zeros((n_samps,n_genes),typecode)
|
|
||||||
gene_ids = []
|
gene_ids = []
|
||||||
for i,(id,desc) in enumerate(d.items()):
|
x = zeros((nSamps,nVars),typecode)
|
||||||
|
for i,(id,desc) in enumerate(data.items()):
|
||||||
gene_ids.append(id)
|
gene_ids.append(id)
|
||||||
x[:,i] = desc[0].astype(typecode)
|
x[:,i] = desc[0].astype(typecode)
|
||||||
gene_def = ['genes',gene_ids]
|
gene_def = ('genes',gene_ids)
|
||||||
sample_def = ['samples', sample_names]
|
sample_def = ('samples', sample_names)
|
||||||
X = dataset.Dataset(x,[sample_def,gene_def]) # samples x genes
|
X = dataset.Dataset(x,identifiers=[sample_def,gene_def]) # samples x genes
|
||||||
return [X]
|
return [X]
|
||||||
|
|
||||||
|
class Log2Function(Function):
|
||||||
|
def __init__(self):
|
||||||
|
Function.__init__(self, 'log', 'Log2')
|
||||||
|
|
||||||
|
def run(self,data):
|
||||||
|
x = log2(data._array)
|
||||||
|
ids = data.get_identifiers()
|
||||||
|
return [dataset.Dataset(x,identifiers=ids,name='Log2_X')]
|
||||||
PCAWorkflow.name = 'PCA Workflow'
|
PCAWorkflow.name = 'PCA Workflow'
|
||||||
|
|
Reference in New Issue