Complete rewrite of dataset class, with (all) the necessary updates
This commit is contained in:
parent
53d0228074
commit
a2e4392a72
@@ -1,135 +1,224 @@
import logger
from scipy import array,take,asarray,shape,nonzero
import project
from itertools import izip
from scipy import atleast_2d,asarray,ArrayType


class Dataset:
    """Dataset base class.
    """The Dataset base class.

    A Dataset is an n-way array with defined string identifiers across
    all dimensions.
    """
    def __init__(self, input_array, def_list, name="Unnamed data"):
        self._name = name
        self._data = asarray(input_array)
        dims = shape(self._data)
        self.def_list = def_list
        self._ids_set = set()
        self.ids = {}
        self._dim_num = {}
        self._dim_names = []
        if len(dims)==1: # a vector is defined to be a column vector!
            self.dims = (dims[0],1)
        else:
            self.dims = dims
        if len(def_list)!=len(self.dims):
            raise ValueError, "array dims and identifier mismatch"
        for axis,(dim_name,ids) in enumerate(def_list):
            enum_ids = {}
            #if dim_name not in project.c_p.dim_names:
            #    dim_name = project.c_p.suggest_dim_name(dim_name)
            if not ids:
                logger.log('debug','Creating identifiers along: ' + str(dim_name))
                ids = self._create_identifiers(axis)
            for num,name in enumerate(ids):
                enum_ids[name] = num
            self.ids[dim_name] = enum_ids
            self._ids_set = self._ids_set.union(set(ids))
            self._dim_num[dim_name] = axis
            self._dim_names.append(dim_name)

        for (dimname,ids),d in izip(def_list,self.dims): # check that data and labels match
            if len(self.ids[dimname]) != d:
                raise ValueError, "dim size and identifier mismatch"
    example of use:

    ---
    dim_name_rows = 'rows'
    names_rows = ('row_a','row_b')
    ids_1 = [dim_name_rows, names_rows]

    dim_name_cols = 'cols'
    names_cols = ('col_a','col_b','col_c','col_d')
    ids_2 = [dim_name_cols, names_cols]

    Array_X = rand(2,4)
    data = Dataset(Array_X,(ids_1,ids_2),name="Testing")

    dim_names = [dim for dim in data]

    column_identifiers = [id for id in data['cols'].keys()]
    column_index = [index for index in data['cols'].values()]

    'cols' in data  # -> True

    ---

    data = Dataset(rand(10,20))  # generates dims and ids (no links)
    """
    def __init__(self,array=None,identifiers=None,shape=None,all_dims=[],**kwds):
        self._name = kwds.get("name","Unnamed data")
        self._dims = []  # existing dimensions in this dataset
        self._map = {}   # internal mapping for dataset: identifier <--> index
        self.has_array = False
        self.shape = None

        if array==None:
            if shape==None:
                raise ValueError, "Must define shape if array is None"
            else:
                self.shape = shape
                if identifiers!=None:
                    self._set_identifiers(identifiers,all_dims)
                else:
                    ids = self._create_identifiers(shape,all_dims)
                    self._set_identifiers(ids,all_dims)
        elif isinstance(array,ArrayType):
            array = atleast_2d(asarray(array))
            self.shape = array.shape
            if shape!=None:
                if self.shape!=shape:
                    #logger.log("debug","Dataset and input shape mismatch")
                    raise ValueError
            if identifiers!=None:
                self._set_identifiers(identifiers,all_dims)
            else:
                ids = self._create_identifiers(self.shape,all_dims)
                self._set_identifiers(ids,all_dims)

            self._array = array
            self.has_array = True

        else:
            raise ValueError, "array input must be of ArrayType or None"

        self._all_dims = all_dims
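
        # A minimal usage sketch, assuming scipy's rand as in the docstring
        # example above:
        #
        #   X = rand(2,3)
        #   ids = [('rows',('r0','r1')),('cols',('c0','c1','c2'))]
        #   data = Dataset(X,identifiers=ids,name="demo")
        #   'rows' in data        # -> True
        #   data['cols']['c2']    # -> 2 (identifier maps to column index)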

    def __str__(self):
        return self._name

    def __iter__(self):
        """Returns an iterator over the dimensions of the dataset."""
        return self._dims.__iter__()

    def __contains__(self,dim):
        """Returns True if dim is a dimension name in the dataset."""
        # return self._dims.__contains__(dim)
        return self._map.__contains__(dim)

    def __len__(self):
        """Returns the number of dimensions in the dataset."""
        return len(self._map)

    def __getitem__(self,dim):
        """Returns the identifiers along the dimension dim."""
        return self._map[dim]

    def _create_identifiers(self,shape,all_dims):
        """Creates dimension names and identifier names, and returns
        the identifiers."""

        dim_names = ['rows','cols']
        ids = []
        for axis,n in enumerate(shape):
            if axis<2:
                dim_suggestion = dim_names[axis]
            else:
                dim_suggestion = 'dim'
            while dim_suggestion in all_dims:
                dim_suggestion = self._suggest_dim_name(dim_suggestion,all_dims)
            identifier_creation = [str(axis) + "_" + i for i in map(str,range(n))]
            ids.append((dim_suggestion,identifier_creation))
            all_dims.append(dim_suggestion)
        return ids
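
        # For shape (2,3) and no existing dims, this returns (sketch):
        #
        #   [('rows',['0_0','0_1']), ('cols',['1_0','1_1','1_2'])]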

    def _set_identifiers(self,identifiers,all_dims):
        """Creates the internal mapping of the identifiers structure."""
        for dim,ids in identifiers:
            pos_map = {}
            if dim not in self._dims:
                self._dims.append(dim)
                all_dims.append(dim)
            else:
                raise ValueError, "Dimension names must be unique"

            for pos,id in enumerate(ids):
                pos_map[id] = pos
            self._map[dim] = pos_map
        shape_chk = [len(ids) for dim,ids in identifiers]
        if shape_chk != list(self.shape):
            raise ValueError, "Shape input: %s and array: %s mismatch" % (shape_chk,self.shape)

    def _suggest_dim_name(self,dim_name,all_dims):
        """Suggests a unique name for dim and returns it."""
        c = 0
        while dim_name in all_dims:
            dim_name = dim_name + "_" + str(c)
            c += 1
        return dim_name

    def asarray(self):
        """Returns the numeric array (data) of the dataset."""
        if not self.has_array:
            raise ValueError, "Dataset is empty"
        else:
            return self._array

    def add_array(self,array):
        """Adds array as an ArrayType object.

        A one-dim array is transformed to a two-dim array (row-vector).
        """
        if self.has_array:
            raise ValueError, "Dataset already has an array"
        else:
            if len(self._map)!=len(array.shape):
                raise ValueError, "range(array_dims) and range(dim_names) mismatch"
            if self.shape!=array.shape:
                raise ValueError, "Input array must have the same shape as the dataset"
            self._array = atleast_2d(asarray(array))
            self.has_array = True

    def get_name(self):
        return self._name

    def get_all_dims(self):
        return self._all_dims

    def get_dim_names(self):
        return self._dim_names

    def names(self,axis=0):
        """Returns the identifier names of a dimension.

        NB: sorted by index values!
        OK? necessary?"""

        if type(axis)==int:
            dim_name = self._dim_names[axis]
        elif type(axis)==str:
            dim_name = axis
            if dim_name not in self._dim_names:
                raise ValueError, dim_name + " is not a dimension in dataset"
        items = self.ids[dim_name].items()
        backitems = [[v[1],v[0]] for v in items]
        backitems.sort()
        sorted_ids = [backitems[i][1] for i in range(0,len(backitems))]
        return sorted_ids
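
        # The sort works on inverted (index,identifier) pairs, so the
        # returned list is the identifiers in index order, e.g. (sketch):
        #
        #   {'b':1,'a':0} -> [[0,'a'],[1,'b']] -> ['a','b']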

    def extract_data(self,ids,dim_name):
        """Extracts data along a dimension by identifiers."""
        new_def_list = self.def_list[:]
        ids_index = [self.ids[dim_name][id_name] for id_name in ids]
        dim_number = self._dim_num[dim_name]
        try:
            out_data = take(self._data,ids_index,axis=dim_number)
        except:
            raise ValueError, "Could not extract data along: " + dim_name
        new_def_list[dim_number][1] = ids
        extracted_data = Dataset(out_data,def_list=new_def_list)
        return extracted_data

    def _create_identifiers(self,axis):
        """Creates identifiers along an axis."""
        n_dim = self.dims[axis]
        return [str(axis) + '_' + str(i) for i in range(n_dim)]

    def extract_id_from_index(self,dim_name,index):
        """Returns a set of ids from an array/list of indexes."""
        dim_ids = self.ids[dim_name]
        if type(index)==int:
            index = [index]

        return set([id for id,ind in dim_ids.items() if ind in index])

    def extract_index_from_id(self,dim_name,id):
        """Returns an array of indexes from a set/list of identifiers
        (or a single id)."""
        dim_ids = self.ids[dim_name]
        return array([ind for name,ind in dim_ids.items() if name in id])
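
        # These two methods are inverses over the id <-> index mapping,
        # e.g. with ids['genes'] = {'gene_a':0,'gene_b':1} (sketch):
        #
        #   extract_id_from_index('genes',[0])          # -> set(['gene_a'])
        #   extract_index_from_id('genes',['gene_b'])   # -> array([1])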
    def get_identifiers(self):
        """Returns the (dim,identifiers) pairs across all dimensions.

        NB: the identifier lists are dict keys, so they are in correct
        dimension order but not sorted by index.
        """
        #return [n for n in self._map.iteritems()]
        ids = []
        for dim in self._dims:
            ids.append((dim,self._map[dim].keys()))
        return ids


class CategoryDataset(Dataset):
    """The category dataset class.

    A dataset for representing class information as binary
    matrices (0/1-matrices).

    There is support for less memory demanding and faster
    intersection look-ups by representing the binary matrix as a
    dictionary in each dimension.
    """
    def __init__(self,array,def_list):
        Dataset.__init__(self,array,def_list)

    def get_elements_by_category(self,dim,category):
        """Returns all elements along input dim belonging to category.

        Assumes two-dim category data only!
        """
        if type(category)!=list:
            raise ValueError, "category must be a list"
        gene_ids = []
        axis_dim = self._dim_num[dim]
        # the categories live along the other dimension of the 2-dim data
        cat_dim = [d for d in self._dim_names if d!=dim][0]
        cat_index = self.extract_index_from_id(cat_dim,category)
        for ind in cat_index:
            if axis_dim==0:
                gene_index = nonzero(self._data[:,ind])
            elif axis_dim==1:
                gene_index = nonzero(self._data[ind,:])
            else:
                raise ValueError, "Only support for 2-dim data"
            gene_ids.append(self.extract_id_from_index(dim,gene_index))
        return gene_ids

    def __init__(self):
        Dataset.__init__(self)
        self.has_collection = False

    def as_array(self):
        """Returns data as a binary matrix."""
        if not self.has_array and self.has_collection:
            # build numeric array
            pass

    def as_collection(self,dim):
        """Returns data as a collection along dim."""
        pass

    def add_collection(self,input_dict):
        """Adds category data as a collection.

        A collection is a data structure that contains a dictionary for
        each pair of dimensions in the dataset, keyed by identifiers,
        whose values are sets of identifiers in the other dimension.
        """
        # build category data as double dicts
        pass
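
        # A sketch of the intended double-dict layout for a 'genes' x
        # 'categories' dataset (the identifiers and the _collection
        # attribute name here are assumed):
        #
        #   self._collection = {
        #       'genes': {'gene_a': set(['cat_1','cat_2'])},
        #       'categories': {'cat_1': set(['gene_a','gene_b'])},
        #   }
        #
        # Membership tests and intersections then become set operations
        # instead of scans over the 0/1 matrix.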


class GraphDataset(Dataset):
    """The graph dataset class.

    A dataset class for representing graphs using an adjacency matrix
    (i.e. restricted to square, symmetric, signed integer matrices).

    If the library NetworkX is installed, there is support for
    representing the graph as a NetworkX.Graph or NetworkX.XGraph structure.
    """
    def __init__(self):
        Dataset.__init__(self)
        self.has_graph = False
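
        # A sketch of how the adjacency matrix could become a NetworkX
        # graph (assumes a 2-dim dataset whose row and column identifiers
        # name the nodes; not part of the committed code):
        #
        #   import networkx
        #   G = networkx.Graph()
        #   rows,cols = self.get_identifiers()
        #   for i,a in enumerate(rows[1]):
        #       for j,b in enumerate(cols[1]):
        #           if self._array[i,j]:
        #               G.add_edge(a,b)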


class Selection:
    """Handles the selected identifiers along each dimension of a dataset."""

@@ -58,7 +58,7 @@ class CreateProjectDruid(gtk.Window):
        for dir in wf_path:
            for fn in os.listdir(dir):
                if fn.endswith('.py'):
                if fn.endswith('.py') and ('#' not in fn):
                    wf_files.append(fn[:-3])

        # Try to load each file and look for Workflow derived classes

@@ -68,6 +68,7 @@ class CreateProjectDruid(gtk.Window):
            for wf in wf_info:
                store.insert_after(None, (getattr(wf, 'name'), wf))
        except Exception, e:
            wf_info = self.workflow_classes(fn)
            logger.log('warning', 'Cannot load workflow: %s' % fn)

        return store

@@ -224,25 +224,27 @@ class SinePlot(Plot):

class ScatterPlot(Plot):
    def __init__(self,project):
    def __init__(self,project,dataset,id_dim,sel_dim,id_1,id_2):
        Plot.__init__(self,project)
        self.project = project
        fig = Figure(figsize=(5,4), dpi=72)
        self.ax = ax = fig.add_subplot(111)

        self.current_dim = id_dim
        # testing testing
        self.x_dataset = project.datasets[0]
        x = self.x_dataset._data
        self.xaxis_data = xaxis_data = x[:,0] + scipy.randn(scipy.shape(x)[0])
        self.yaxis_data = yaxis_data = x[:,1]
        self.current_dim = self.x_dataset._dim_names[0]
        ax.plot(xaxis_data,yaxis_data,'og')
        self.dataset = dataset
        x_index = dataset[sel_dim][id_1]
        y_index = dataset[sel_dim][id_2]

        self.xaxis_data = dataset._array[:,x_index]
        self.yaxis_data = dataset._array[:,y_index]
        ax.plot(self.xaxis_data,self.yaxis_data,'og')
        ###

        self.canvas = FigureCanvas(fig)
        self.add(self.canvas)
        rectprops = dict(facecolor='gray', edgecolor='black',
                         alpha=0.2, fill=True)  # cool
        sel = RectangleSelector(ax, self.rectangle_select_callback,
        self.sel = RectangleSelector(ax, self.rectangle_select_callback,
                                    drawtype='box',useblit=True,rectprops=rectprops)

        self.canvas.show()
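
        # Construction under the new signature (a sketch; 'genes' and the
        # component identifiers follow the PCA workflow later in this commit):
        #
        #   plot = ScatterPlot(project, P, 'genes', 'comp', '1', '2')
        #
        # id_dim names the dimension whose identifiers get selected;
        # sel_dim/id_1/id_2 pick the two columns to plot against each other.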

@@ -251,45 +253,36 @@ class ScatterPlot(Plot):
        'event1 and event2 are the press and release events'
        x1, y1 = event1.xdata, event1.ydata
        x2, y2 = event2.xdata, event2.ydata
        logger.log('debug', "(%3.2f, %3.2f) --> (%3.2f, %3.2f)" % (x1,y1,x2,y2))
        logger.log('debug', "The buttons you used were: %s, %s" % (event1.button, event2.button))
        # get all points within x1, y1, x2, y2
        ydata = self.yaxis_data
        xdata = self.xaxis_data
        if x1>x2:
            logger.log('debug','Selection x_start bigger than x_end')
            if y1<y2:
                logger.log('debug','Selection y_start less than y_end')
                index = scipy.nonzero((xdata<x1) & (xdata>x2) & (ydata>y1) & (ydata<y2))
            else:
                logger.log('debug','Selection y_start larger than y_end')
                index = scipy.nonzero((xdata<x1) & (xdata>x2) & (ydata<y1) & (ydata>y2))
        else:
            logger.log('debug','Selection x_start less than x_end')
            #logger.log('debug','Selection x_start less than x_end')
            if y1<y2:
                logger.log('debug','Selection y_start less than y_end')
                #logger.log('debug','Selection y_start less than y_end')
                index = scipy.nonzero((xdata>x1) & (xdata<x2) & (ydata>y1) & (ydata<y2))
            else:
                logger.log('debug','Selection y_start bigger than y_end')
                #logger.log('debug','Selection y_start bigger than y_end')
                index = scipy.nonzero((xdata>x1) & (xdata<x2) & (ydata<y1) & (ydata>y2))


        if len(index)==0:
            logger.log('debug','No points selected!')
        else:
            logger.log('debug','Selected:\n%s' % index)
            ids = self.x_dataset.extract_id_from_index('samples',index)
            # update selection object
            ids = [id for id,ind in self.dataset[self.current_dim].items() if ind in index]
            self.project.set_selection(self.current_dim,ids)
            logger.log('debug','Selected identifiers:\n%s' % ids)
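
            # The four branches above just normalize the drag direction;
            # a more compact equivalent (sketch) would be:
            #
            #   x_lo, x_hi = min(x1,x2), max(x1,x2)
            #   y_lo, y_hi = min(y1,y2), max(y1,y2)
            #   index = scipy.nonzero((xdata>x_lo) & (xdata<x_hi) &
            #                         (ydata>y_lo) & (ydata<y_hi))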

    def update(self,project,key):
        curr_sel = project.get_selection()  # get selection object
        ids = curr_sel[self.current_dim]    # current identifiers
        index = self.x_dataset.extract_index_from_id(self.current_dim,ids)  # conversion to index

        index = [ind for id,ind in self.dataset[self.current_dim].items() if id in ids]  # conversion to index
        xdata_new = scipy.take(self.xaxis_data,index)  # take data
        ydata_new = scipy.take(self.yaxis_data,index)
        self.ax.plot(self.xaxis_data,self.yaxis_data,'ob')
        self.ax.plot(self.xaxis_data,self.yaxis_data,'og')
        self.ax.plot(xdata_new,ydata_new,'or')
        self.canvas.draw()

@@ -13,8 +13,8 @@ class Project:
        self.name = name
        self.dim_names = []
        self._observers = {}
        self.current_data=None
        self.datasets=[]
        self.current_data = None
        self.datasets = []
        self.sel_obj = dataset.Selection()

    def attach(self,observer,key):

@@ -69,15 +69,11 @@ class Project:
    def add_dataset(self,dataset):
        """Appends a new Dataset to the project."""
        self.datasets.append(dataset)
        for dim_name in dataset.ids.keys():
        for dim_name in dataset.get_all_dims():
            if dim_name not in self.dim_names:
                self.dim_names.append(dim_name)
                self.sel_obj.current_selection[dim_name] = set()

    def suggest_dim_name(self,dim_name):
        """Creates an arbitrary unique name for a new dimension."""
        while dim_name in self.dim_names:
            dim_name = dim_name + "_t"
        return dim_name

    def object_at(self, path):
        "Returns the object at a given path in the tree."

@@ -30,6 +30,9 @@ class Workflow:
            print ' %s' % fun.name

    def add_project(self,project):
        if project == None:
            logger.log('notice','Project is empty')
        logger.log('notice','Project added in: %s' % self.name)
        self.project = project

class EmptyWorkflow(Workflow):

@@ -5,28 +5,30 @@ from dataset import *
from scipy import rand,shape

class DatasetTest(unittest.TestCase):

    def setUp(self):
        self.dim_0_ids = ['sample_a','sample_b']
        self.dim_1_ids = ['gene_a','gene_b','gene_c']
        self.dim_labels = ['samples','genes']
        self.def_list = [[self.dim_labels[0],self.dim_0_ids],[self.dim_labels[1],self.dim_1_ids]]
        dim_0_ids = ('sample_a','sample_b')
        dim_1_ids = ('gene_a','gene_b','gene_c')
        dim_labels = ('samples','genes')
        identifiers = [(dim_labels[0],dim_0_ids),(dim_labels[1],dim_1_ids)]
        self.array = rand(2,3)
        self.testdata = Dataset(self.array,self.def_list)
        self.testdata = Dataset(self.array,identifiers)

    def testCreation(self):
        assert self.testdata._data == self.array
        assert 'sample_a' in self.testdata.ids['samples'].keys()
        assert 'gene_b' in self.testdata.ids['genes'].keys()
        data = self.testdata
        assert data._array == self.array
        assert 'sample_a' in data['samples'].keys()
        assert data['samples']['sample_b']==1
        assert 'gene_c' in data['genes'].keys()
        assert data['genes']['gene_c']==2



    def testExtraction(self):
        ids = ['gene_a','gene_b']
        dim_name = 'genes'
        subset = self.testdata.extract_data(ids,dim_name)
        assert shape(subset._data) == (2,2)
        assert subset.ids[dim_name].keys() == ids
        assert subset.ids[dim_name].values() == [0,1]
    #def testExtraction(self):
    #    ids = ['gene_a','gene_b']
    #    dim_name = 'genes'
    #    subset = self.testdata.extract_data(ids,dim_name)
    #    assert shape(subset._data) == (2,2)
    #    assert subset.ids[dim_name].keys() == ids
    #    assert subset.ids[dim_name].values() == [0,1]


if __name__ == '__main__':

@@ -102,9 +102,7 @@ class TestDataFunction(Function):
    def run(self, data):
        logger.log('notice', 'Injecting foo test data')
        x = randn(20,30)
        axis_0 = ['rows',[]]
        axis_1 = ['cols',[]]
        X = dataset.Dataset(x,[axis_0,axis_1])
        X = dataset.Dataset(x)
        return [X, plots.SinePlot(None)]


@@ -1,23 +1,27 @@
import gtk
import logger
from workflow import *
from scipy import array,zeros
from data import read_affy_annot,read_mootha,data_dict_to_matrix
import pickle
from scipy import log2,transpose,dot,divide,shape,mean,resize,zeros
from scipy.linalg import svd,inv,norm,get_blas_funcs,eig
import plots
import dataset

class PCAWorkflow(Workflow):
    name = 'PCA Workflow'
    description = 'PCA workflow. Uses real microarray data from a study of diabetes (Mootha et al.).'

    def __init__(self, app):
        Workflow.__init__(self, app)
        self.name = 'PCAs Workflow'
        #self.add_project(app.project)
        #logger.log('notice','Current project added to: %s' % self.name)

        load = Stage('load', 'Load Data')
        load.add_function(LoadMoothaData())
        self.add_stage(load)

        preproc = Stage('preprocess', 'Preprocessing')
        preproc.add_function(Function('log2', 'Logarithm'))
        preproc.add_function(Log2Function())
        self.add_stage(preproc)

        annot = Stage('annot', 'Affy annotations')

@@ -25,7 +29,7 @@ class PCAWorkflow(Workflow):
        self.add_stage(annot)

        model = Stage('model', 'Model')
        model.add_function(Function('pca', 'PCA'))
        model.add_function(PCAFunction(self))
        self.add_stage(model)

        logger.log('debug', '\tPCA\'s workflow is now active')

@@ -34,7 +38,6 @@ class LoadAnnotationsFunction(Function):

    def __init__(self):
        Function.__init__(self, 'load', 'Load Annotations')
        self.annotations = None

    def load_affy_file(self, filename):
        f = open(filename)

@@ -68,81 +71,188 @@ class LoadAnnotationsFunction(Function):
            pathways = description[i_want]
            if not pathways[0][0]=='--':
                pass

        return [self.annotations]
        D = []
        return [D]

class PCAFunction(Function):

    def __init__(self):
        Function.__init__(self, 'X', 'a_opt')
    def __init__(self,workflow):
        Function.__init__(self, 'pca', 'PCA')
        self.output = None
        self.workflow = workflow

    def run(self, data):
        logger.log('debug', 'datatype: %s' % type(data))
        if not isinstance(data,dataset.Dataset):
            return None
        logger.log('debug', 'dimensions: %s' % data.dims)
        #logger.log('debug', 'dimensions: %s' % data.dims)

        ## calculations
        T,P,E,tsq = pca(data._data,a_opt=2)
        comp_def = ['comp',['1','2']]
        singel_def = ['1',['s']]
        col_def = [data._dim_names[0],data.names(0)]
        row_def = [data._dim_names[1],data.names(1)]
        T = dataset.Dataset(T,[col_def,comp_def])
        P = dataset.Dataset(P,[row_def,comp_def])
        E = dataset.Dataset(E,[col_def,row_def])
        tsq = dataset.Dataset(tsq,[row_def,singel_def])
        T,P,E,tsq = self.pca(data._array,5,tsq_loads=False)
        comp_def = ('comp',('1','2','3','4','5'))
        singel_def = ('1',('s',))

        # pull out input identifiers:
        data_ids = []
        for dim in data:
            data_ids.append((dim,data[dim].keys()))

        T = dataset.Dataset(T,(data_ids[0],comp_def))
        P = dataset.Dataset(P,[data_ids[1],comp_def])
        E = dataset.Dataset(E,data_ids)
        #tsq = dataset.Dataset(tsq,[singel_def,data_ids[1]])

        ## plots
        loading_plot = plots.ScatterPlot()
        loading_plot1 = plots.ScatterPlot(self.workflow.project,P,'genes','comp','1','2')
        loading_plot2 = plots.ScatterPlot(self.workflow.project,P,'genes','comp','3','4')
        score_plot = plots.ScatterPlot(self.workflow.project,T,'samples','comp','1','2')

        return [T,P,E,loading_plot1,loading_plot2,score_plot]
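
        # The data_ids loop above builds the same mapping that
        # Dataset.get_identifiers() returns, so an equivalent (sketch):
        #
        #   data_ids = data.get_identifiers()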

    def pca(self,X,a_opt,cent=True,scale='loads',tsq_loads=False):
        """Principal component analysis.

        input:
            X -- matrix, data
            a_opt -- scalar, max number of components to calculate
            cent -- bool, centering [True]
            crit -- string, pc criteria ['exp_var',['ief','rpv','average']]
            scale -- string, scaling ['loads',['scores']]
            tsq_loads -- bool, calculate t-squared? [True]
            reg -- float, covariance regularizer for tsq calculations [0.2]
        output:
            T,P,E,r
        """
        nSamples,nVarX = shape(X)
        if cent:
            Xc = self.mat_center(X)
        else:
            Xc = X
        u,s,vh = self.esvd(Xc)
        if scale=='scores':
            T = u*s
            T = T[:,:a_opt]
            P = transpose(vh)
            P = P[:,:a_opt]
        elif scale=='loads':
            T = u
            T = T[:,:a_opt]
            P = transpose(vh)*s
            P = P[:,:a_opt]
        else:
            raise ValueError, "scale must be 'scores' or 'loads'"

        E = Xc - dot(T,transpose(P))
        varEach = s**2
        totVar = sum(varEach)
        r = divide(varEach,totVar)*100
        return T,P,E,r
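
        # Shape check (a sketch, assuming scipy's randn): for X of shape
        # (20,30) and a_opt=5,
        #
        #   T,P,E,r = self.pca(randn(20,30),5)
        #   # T: (20,5) scores, P: (30,5) loadings,
        #   # E: (20,30) residuals, r: explained variance per component (%)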

    def mat_center(self,X,axis=0,ret_mn=False):
        """Mean center matrix along axis.

        input:
            X -- matrix, data
            axis -- dim, axis to center along
            ret_mn -- bool, return the mean as well
        output:
            Xc, [mnX]

        NB: axis=1 is column-centering, axis=0 is row-centering;
        the default is row centering (axis=0).
        """
        try:
            rows,cols = shape(X)
        except ValueError:
            print "The X data needs to be two-dimensional"
            raise

        if axis==0:
            mnX = mean(X,axis)
            Xs = X - resize(mnX,(rows,cols))

        elif axis==1:
            mnX = mean(X,axis)
            Xs = transpose(transpose(X) - resize(mnX,(cols,rows)))
        if ret_mn:
            return Xs,mnX
        else:
            return Xs

    def esvd(self,data,economy=1):
        """SVD with the option of an economy-sized calculation.

        Calculates the subspaces of X'X or XX' depending on the shape
        of the matrix. Good for extremely fat or thin matrices.
        """
        mm = self.mm
        m,n = shape(data)
        if m>=n:
            u,s,v = svd(mm(data,data,trans_a=1))
            u = mm(data,v,trans_b=1)
            for i in xrange(n):
                s[i] = norm(u[:,i])
                u[:,i] = u[:,i]/s[i]
        else:
            u,s,v = svd(mm(data,data,trans_b=1))
            v = mm(u,data,trans_a=1)
            for i in xrange(m):
                s[i] = norm(v[i,:])
                v[i,:] = v[i,:]/s[i]
        return u,s,v
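
        # Rationale: for an m x n matrix with m >= n, the SVD here runs on
        # the n x n Gram matrix data'*data instead of the full m x n input,
        # and the left vectors are recovered as data*v with renormalized
        # columns; the thin case (m < n) mirrors this with data*data'.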

    def mm(self,a,b, alpha=1.0, beta=0.0, c=None, trans_a=0,
           trans_b=0):
        """Fast matrix multiplication.

        Returns alpha*(a*b) + beta*c.

        a,b,c : matrices
        alpha, beta : scalars
        trans_a : 0 (a not transposed),
                  1 (a transposed),
                  2 (a conjugate transposed)
        trans_b : 0 (b not transposed),
                  1 (b transposed),
                  2 (b conjugate transposed)
        """
        if c:
            gemm,= get_blas_funcs(('gemm',),(a,b,c))
        else:
            gemm,= get_blas_funcs(('gemm',),(a,b))

        return gemm(alpha, a, b, beta, c, trans_a, trans_b)

        return [T,P,E,r]
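
    # Example use of mm inside esvd above: mm(data,data,trans_a=1)
    # computes data'*data through BLAS gemm, i.e. roughly equivalent to
    # dot(transpose(data),data) but without forming the transpose copy.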

class LoadMoothaData(Function):
    def __init__(self):
        Function.__init__(self, 'load', 'Load diabetes data')
        self.annotations = None

    def load_expression_file(self, filename):
        f = open(filename)
        logger.log('notice', 'Loading expression file: %s' % filename)
        self.file = f
        self.filename = filename

    def on_response(self, dialog, response):
        if response == gtk.RESPONSE_OK:
            logger.log('notice', 'Reading file: %s' % dialog.get_filename())
            self.load_expression_file(dialog.get_filename())

    def run(self, data):
        btns = ('Open', gtk.RESPONSE_OK, \
                'Cancel', gtk.RESPONSE_CANCEL)
        dialog = gtk.FileChooserDialog('Open diabetes expression File',
                                       buttons=btns)
        dialog.connect('response', self.on_response)
        dialog.run()
        dialog.destroy()

        ### Reading and parsing here
        d,sample_names = read_mootha()
        n_samps = len(sample_names)
        n_genes = len(d.keys())
        typecode = 'f'
        x = zeros((n_samps,n_genes),typecode)
    def run(self,data):
        data_file = open('full_data.pickle','r')
        data = pickle.load(data_file)
        data_file.close()
        sample_file = open('sample_labels.pickle','r')
        sample_names = pickle.load(sample_file)
        sample_file.close()
        typecode = 'f'
        nSamps = len(sample_names)
        nVars = len(data.keys())
        gene_ids = []
        for i,(id,desc) in enumerate(d.items()):
        x = zeros((nSamps,nVars),typecode)
        for i,(id,desc) in enumerate(data.items()):
            gene_ids.append(id)
            x[:,i] = desc[0].astype(typecode)
        gene_def = ['genes',gene_ids]
        sample_def = ['samples', sample_names]
        X = dataset.Dataset(x,[sample_def,gene_def])  # samples x genes
        gene_def = ('genes',gene_ids)
        sample_def = ('samples', sample_names)
        X = dataset.Dataset(x,identifiers=[sample_def,gene_def])  # samples x genes
        return [X]

class Log2Function(Function):
    def __init__(self):
        Function.__init__(self, 'log', 'Log2')

    def run(self,data):
        x = log2(data._array)
        ids = data.get_identifiers()
        return [dataset.Dataset(x,identifiers=ids,name='Log2_X')]

PCAWorkflow.name = 'PCA Workflow'
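
# A sketch of the stage chain this workflow now runs end-to-end:
#
#   X = LoadMoothaData().run(None)[0]          # samples x genes Dataset
#   X2 = Log2Function().run(X)[0]              # Log2_X, same identifiers
#   results = PCAFunction(workflow).run(X2)    # [T,P,E] + scatter plots
#
# ('workflow' here stands for the PCAWorkflow instance that owns the
# project; the plot constructors read self.workflow.project.)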