Projects/laydi
Projects
/
laydi
Archived
7
0
Fork 0

Complete rewrite of dataset class, with (all) the necessary updates

This commit is contained in:
Arnar Flatberg 2006-04-24 09:53:07 +00:00
parent 53d0228074
commit a2e4392a72
9 changed files with 426 additions and 234 deletions

View File

@ -1,135 +1,224 @@
import logger import logger
from scipy import array,take,asarray,shape,nonzero from scipy import atleast_2d,asarray,ArrayType
import project
from itertools import izip
class Dataset: class Dataset:
"""Dataset base class. """The Dataset base class.
A Dataset is an n-way array with defined string identifiers across A Dataset is an n-way array with defined string identifiers across
all dimensions. all dimensions.
"""
def __init__(self, input_array, def_list, name="Unnamed data"):
self._name = name
self._data = asarray(input_array)
dims = shape(self._data)
self.def_list = def_list
self._ids_set = set()
self.ids={}
self._dim_num = {}
self._dim_names = []
if len(dims)==1: # a vector is defined to be column vector!
self.dims = (dims[0],1)
else:
self.dims = dims
if len(def_list)!=len(self.dims):
raise ValueError,"array dims and identifyer mismatch"
for axis,(dim_name,ids) in enumerate(def_list):
enum_ids = {}
#if dim_name not in project.c_p.dim_names:
# dim_name = project.c_p.suggest_dim_name(dim_name)
if not ids:
logger.log('debug','Creating identifiers along: '+ str(dim_name))
ids = self._create_identifiers(axis)
for num,name in enumerate(ids):
enum_ids[name] = num
self.ids[dim_name] = enum_ids
self._ids_set = self._ids_set.union(set(ids))
self._dim_num[dim_name] = axis
self._dim_names.append(dim_name)
for (dimname, ids), d in izip(def_list,self.dims): #check that data and labels match example of use:
if len(self.ids[dimname]) != d:
raise ValueError,"dim size and identifyer mismatch" ---
dim_name_rows = 'rows'
names_rows = ('row_a','row_b')
ids_1 = [dim_name_rows, names_rows]
dim_name_cols = 'cols'
names_cols = ('col_a','col_b','col_c','col_d')
ids_2 = [dim_name_cols, names_cols]
Array_X = rand(2,4)
data = Dataset(Array_X,(ids_1,ids_2),name="Testing")
dim_names = [dim for dim in data]
column_identifiers = [id for id in data['cols'].keys()]
column_index = [index for index in data['cols'].values()]
'cols' in data -> True
---
data = Dataset(rand(10,20)) (generates dims and ids (no links))
"""
def __init__(self,array=None,identifiers=None,shape=None,all_dims=[],**kwds):
self._name = kwds.get("name","Unnamed data")
self._dims = [] #existing dimensions in this dataset
self._map = {} # internal mapping for dataset: identifier <--> index
self.has_array = False
self.shape = None
if array==None:
if shape == None:
raise ValueError, "Must define shape if array is None"
else:
self.shape = shape
if identifiers!=None:
self._set_identifiers(identifiers,all_dims)
else:
ids = self._create_identifiers(shape,all_dims)
self._set_identifiers(ids,all_dims)
elif isinstance(array,ArrayType):
array = atleast_2d(asarray(array))
self.shape = array.shape
if shape != None:
if self.shape!=shape:
#logger.log("debug","Dataset and input shape mismatch")
raise ValueError
if identifiers!=None:
self._set_identifiers(identifiers,all_dims)
else:
ids = self._create_identifiers(self.shape,all_dims)
self._set_identifiers(ids,all_dims)
self._array = array
self.has_array = True
else:
raise ValueError, "array input must be of ArrayType or None"
self._all_dims = all_dims
def __str__self(self):
return self._name
def __iter__(self):
"""Returns an iterator over dimensions of dataset."""
return self._dims.__iter__()
def __contains__(self,dim):
"""Returns True if dim is a dimension name in dataset."""
# return self._dims.__contains__(dim)
return self._map.__contains__(dim)
def __len__(self):
"""Returns the number of dimensions in the dataset"""
return len(self._map)
def __getitem__(self,dim):
"""Return the identifers along the dimension dim."""
return self._map[dim]
def _create_identifiers(self,shape,all_dims):
"""Creates dimension names and identifier names, and returns
identifiers."""
dim_names = ['rows','cols']
ids = []
for axis,n in enumerate(shape):
if axis<2:
dim_suggestion = dim_names[axis]
else:
dim_suggestion = 'dim'
while dim_suggestion in all_dims:
dim_suggestion = self._suggest_dim_name(dim_suggestion,all_dims)
identifier_creation = [str(axis) + "_" + i for i in map(str,range(n))]
ids.append((dim_suggestion,identifier_creation))
all_dims.append(dim_suggestion)
return ids
def _set_identifiers(self,identifiers,all_dims):
"""Creates internal mapping of identifiers structure."""
for dim,ids in identifiers:
pos_map={}
if dim not in self._dims:
self._dims.append(dim)
all_dims.append(dim)
else:
raise ValueError, "Dimension names must be unique"
for pos,id in enumerate(ids):
pos_map[id] = pos
self._map[dim] = pos_map
shape_chk = [len(i) for j,i in identifiers]
if shape_chk != list(self.shape):
raise ValueError, "Shape input: %s and array: %s mismatch" %(shape_chk,self.shape)
def _suggest_dim_name(self,dim_name,all_dims):
"""Suggests a unique name for dim and returns it"""
c = 0
while dim_name in all_dims:
dim_name = dim_name + "_" + str(c)
c+=1
return dim_name
def asarray(self):
    """Returns the numeric array (data) of dataset.

    Raises ValueError when no array has been attached to the dataset.
    """
    if not self.has_array:
        # Call form of raise: valid on Python 2 as well as Python 3.
        raise ValueError("Dataset is empty")
    return self._array
def add_array(self, array):
    """Adds array as an ArrayType object.

    A one-dim array is transformed to a two-dim array (row-vector)
    before it is compared with the shape of the dataset.

    Raises ValueError when the dataset already holds an array, or when
    the number of dimensions or the shape does not match the dataset.
    """
    if self.has_array:
        raise ValueError("Dataset has array")
    # Promote first: checking the raw input would reject every 1-D array,
    # since the dataset shape is always at least two-dimensional.
    array = atleast_2d(asarray(array))
    if len(self._map) != len(array.shape):
        raise ValueError("range(array_dims) and range(dim_names) mismatch")
    # Compare as tuples; the dataset shape may have been given as a list.
    if tuple(self.shape) != tuple(array.shape):
        raise ValueError("Input array must be of similar dimensions as dataset")
    self._array = array
    self.has_array = True
def get_name(self): def get_name(self):
return self._name return self._name
def get_all_dims(self):
    """Return the list of dimension names stored as self._all_dims.

    The list object itself is returned, not a copy.
    """
    known_dims = self._all_dims
    return known_dims
def get_dim_names(self): def get_identifiers(self):
return self._dim_names #return [n for n in self._map.iteritems()]
def names(self,axis=0):
"""Returns identifier names of a dimension.
NB: sorted by values!
OK? necessary?"""
if type(axis)==int:
dim_name = self._dim_names[axis]
elif type(axis)==str:
dim_name = axis
if dim_name not in self._dim_names:
raise ValueError, dim_name + " not a dimension in dataset"
items = self.ids[dim_name].items()
backitems=[ [v[1],v[0]] for v in items]
backitems.sort()
sorted_ids=[ backitems[i][1] for i in range(0,len(backitems))]
return sorted_ids
def extract_data(self,ids,dim_name):
"""Extracts data along a dimension by identifiers"""
new_def_list = self.def_list[:]
ids_index = [self.ids[dim_name][id_name] for id_name in ids]
dim_number = self._dim_num[dim_name]
try:
out_data = take(self._data,ids_index,axis=dim_number)
except:
raise ValueError
new_def_list[dim_number][1] = ids
extracted_data = Dataset(out_data,def_list=new_def_list,parents=self.parents)
return extracted_data
def _create_identifiers(self,axis):
"""Creates identifiers along an axis"""
n_dim = self.dims[axis]
return [str(axis) + '_' + str(i) for i in range(n_dim)]
def extract_id_from_index(self,dim_name,index):
"""Returns a set of ids from array/list of indexes."""
dim_ids = self.ids[dim_name]
if type(index)==int:
index = [index]
return set([id for id,ind in dim_ids.items() if ind in index])
def extract_index_from_id(self,dim_name,id):
"""Returns an array of indexes from a set/list of identifiers
(or a single id)"""
dim_ids = self.ids[dim_name]
return array([ind for name,ind in dim_ids.items() if name in id])
# ensure correct order
# this has correct dims but not identifiers
ids = []
for dim in self._dims:
ids.append((dim,self._map[dim].keys()))
return ids
class CategoryDataset(Dataset): class CategoryDataset(Dataset):
def __init__(self,array,def_list): """The category dataset class.
Dataset.__init__(self,array,def_list)
def get_elements_by_category(self,dim,category): A dataset for representing class information as binary
"""Returns all elements along input dim belonging to category. matrices (0/1-matrices).
Assumes a two-dim category data only!
There is support for using a less memory demanding, and
fast intersection look-ups by representing the binary matrix as a
dictionary in each dimension.
""" """
if type(category)!=list:
raise ValueError, "category must be list" def __init__(self):
gene_ids = [] Dataset.__init__(self)
axis_dim = self._dim_num[dim] self.has_collection = False
cat_index = self.extract_index_from_id(category)
for ind in cat_index: def as_array(self):
if axis_dim==0: """Returns data as binary matrix"""
gene_indx = nonzero(self._data[:,ind]) if not self.has_array and self.has_collection:
elif axis_dim==1: #build numeric array
gene_indx = nonzero(self._data[ind,:]) pass
else:
ValueError, "Only support for 2-dim data" def as_collection(self,dim):
gene_ids.append(self.extract_id_from_index(dim,gene_index)) """Returns data as collection along dim"""
return gene_ids pass
def add_collection(self,input_dict):
"""Adds a category data as collection.
A collection is a datastructure that contains a dictionary for
each pair of dimension in dataset, keyed by identifiers and
values is a set of identifiers in the other dimension
"""
#build category data as double dicts
pass
class GraphDataset(Dataset):
"""The graph dataset class.
A dataset class for representing graphs using an adjacency matrix
(aka. restricted to square symmetric signed integers matrices)
If the library NetworkX is installed, there is support for
representing the graph as a NetworkX.Graph, or NetworkX.XGraph structure.
"""
def __init__(self):
Dataset.__init(self)
self.has_graph = False
class Selection: class Selection:
"""Handles selected identifiers along each dimension of a dataset""" """Handles selected identifiers along each dimension of a dataset"""

View File

@ -58,7 +58,7 @@ class CreateProjectDruid(gtk.Window):
for dir in wf_path: for dir in wf_path:
for fn in os.listdir(dir): for fn in os.listdir(dir):
if fn.endswith('.py'): if fn.endswith('.py') and ('#' not in fn) :
wf_files.append(fn[:-3]) wf_files.append(fn[:-3])
# Try to load each file and look for Workflow derived classes # Try to load each file and look for Workflow derived classes
@ -68,6 +68,7 @@ class CreateProjectDruid(gtk.Window):
for wf in wf_info: for wf in wf_info:
store.insert_after(None, (getattr(wf, 'name'), wf)) store.insert_after(None, (getattr(wf, 'name'), wf))
except Exception, e: except Exception, e:
wf_info = self.workflow_classes(fn)
logger.log('warning', 'Cannot load workflow: %s' % fn) logger.log('warning', 'Cannot load workflow: %s' % fn)
return store return store

View File

@ -224,25 +224,27 @@ class SinePlot(Plot):
class ScatterPlot(Plot): class ScatterPlot(Plot):
def __init__(self,project): def __init__(self,project,dataset,id_dim,sel_dim,id_1,id_2):
Plot.__init__(self,project) Plot.__init__(self,project)
self.project = project
fig = Figure(figsize=(5,4), dpi=72) fig = Figure(figsize=(5,4), dpi=72)
self.ax = ax = fig.add_subplot(111) self.ax = ax = fig.add_subplot(111)
self.current_dim = id_dim
# testing testing # testing testing
self.x_dataset = project.datasets[0] self.dataset = dataset
x = self.x_dataset._data x_index = dataset[sel_dim][id_1]
self.xaxis_data = xaxis_data = x[:,0] + scipy.randn(scipy.shape(x)[0]) y_index = dataset[sel_dim][id_2]
self.yaxis_data = yaxis_data = x[:,1]
self.current_dim = self.x_dataset._dim_names[0] self.xaxis_data = dataset._array[:,x_index]
ax.plot(xaxis_data,yaxis_data,'og') self.yaxis_data = dataset._array[:,y_index]
ax.plot(self.xaxis_data,self.yaxis_data,'og')
### ###
self.canvas = FigureCanvas(fig) self.canvas = FigureCanvas(fig)
self.add(self.canvas) self.add(self.canvas)
rectprops = dict(facecolor='gray', edgecolor = 'black', rectprops = dict(facecolor='gray', edgecolor = 'black',
alpha=0.2, fill=True) #cool alpha=0.2, fill=True) #cool
sel = RectangleSelector(ax, self.rectangle_select_callback, self.sel = RectangleSelector(ax, self.rectangle_select_callback,
drawtype='box',useblit=True,rectprops=rectprops) drawtype='box',useblit=True,rectprops=rectprops)
self.canvas.show() self.canvas.show()
@ -251,45 +253,36 @@ class ScatterPlot(Plot):
'event1 and event2 are the press and release events' 'event1 and event2 are the press and release events'
x1, y1 = event1.xdata, event1.ydata x1, y1 = event1.xdata, event1.ydata
x2, y2 = event2.xdata, event2.ydata x2, y2 = event2.xdata, event2.ydata
logger.log('debug', "(%3.2f, %3.2f) --> (%3.2f, %3.2f)"%(x1,y1,x2,y2))
logger.log('debug',"The button you used were:%s, %s "%(event1.button, event2.button))
# get all points within x1, y1, x2, y2
ydata = self.yaxis_data ydata = self.yaxis_data
xdata = self.xaxis_data xdata = self.xaxis_data
if x1>x2: if x1>x2:
logger.log('debug','Selection x_start bigger than x_end')
if y1<y2: if y1<y2:
logger.log('debug','Selection y_start less than y_end')
index =scipy.nonzero((xdata<x1) & (xdata>x2) & (ydata>y1) & (ydata<y2)) index =scipy.nonzero((xdata<x1) & (xdata>x2) & (ydata>y1) & (ydata<y2))
else: else:
logger.log('debug','Selection y_start larger than y_end')
index =scipy.nonzero((xdata<x1) & (xdata>x2) & (ydata<y1) & (ydata>y2)) index =scipy.nonzero((xdata<x1) & (xdata>x2) & (ydata<y1) & (ydata>y2))
else: else:
logger.log('debug','Selection x_start less than x_end') #logger.log('debug','Selection x_start less than x_end')
if y1<y2: if y1<y2:
logger.log('debug','Selection y_start less than y_end') #logger.log('debug','Selection y_start less than y_end')
index =scipy.nonzero((xdata>x1) & (xdata<x2) & (ydata>y1) & (ydata<y2)) index =scipy.nonzero((xdata>x1) & (xdata<x2) & (ydata>y1) & (ydata<y2))
else: else:
logger.log('debug','Selection y_start bigger than y_end') #logger.log('debug','Selection y_start bigger than y_end')
index =scipy.nonzero((xdata>x1) & (xdata<x2) & (ydata<y1) & (ydata>y2)) index =scipy.nonzero((xdata>x1) & (xdata<x2) & (ydata<y1) & (ydata>y2))
if len(index)==0: if len(index)==0:
logger.log('debug','No points selected!') logger.log('debug','No points selected!')
else: else:
logger.log('debug','Selected:\n%s'%index) ids = [id for id,ind in self.dataset[self.current_dim].items() if ind in index]
ids = self.x_dataset.extract_id_from_index('samples',index)
#update selection object
self.project.set_selection(self.current_dim,ids) self.project.set_selection(self.current_dim,ids)
logger.log('debug','Selected identifiers:\n%s'%ids)
def update(self,project,key): def update(self,project,key):
curr_sel = project.get_selection() # get selection object curr_sel = project.get_selection() # get selection object
ids = curr_sel[self.current_dim] # current identifiers ids = curr_sel[self.current_dim] # current identifiers
index = self.x_dataset.extract_index_from_id(self.current_dim,ids) #conversion to index
index = [ind for id,ind in self.dataset[self.current_dim].items() if id in ids] #conversion to index
xdata_new = scipy.take(self.xaxis_data,index) #take data xdata_new = scipy.take(self.xaxis_data,index) #take data
ydata_new = scipy.take(self.yaxis_data,index) ydata_new = scipy.take(self.yaxis_data,index)
self.ax.plot(self.xaxis_data,self.yaxis_data,'ob') self.ax.plot(self.xaxis_data,self.yaxis_data,'og')
self.ax.plot(xdata_new,ydata_new,'or') self.ax.plot(xdata_new,ydata_new,'or')
self.canvas.draw() self.canvas.draw()

View File

@ -13,8 +13,8 @@ class Project:
self.name = name self.name = name
self.dim_names = [] self.dim_names = []
self._observers = {} self._observers = {}
self.current_data=None self.current_data = None
self.datasets=[] self.datasets = []
self.sel_obj = dataset.Selection() self.sel_obj = dataset.Selection()
def attach(self,observer,key): def attach(self,observer,key):
@ -69,15 +69,11 @@ class Project:
def add_dataset(self,dataset): def add_dataset(self,dataset):
"""Appends a new Dataset to the project.""" """Appends a new Dataset to the project."""
self.datasets.append(dataset) self.datasets.append(dataset)
for dim_name in dataset.ids.keys(): for dim_name in dataset.get_all_dims():
if dim_name not in self.dim_names: if dim_name not in self.dim_names:
self.dim_names.append(dim_name) self.dim_names.append(dim_name)
self.sel_obj.current_selection[dim_name] = set()
def suggest_dim_name(self,dim_name):
"""Creates an arbitrary unique name for a new dimension."""
while dim_name in self.dim_names:
dim_name = dim_name + "_t"
return dim_name
def object_at(self, path): def object_at(self, path):
"Returns the object at a given path in the tree." "Returns the object at a given path in the tree."

View File

@ -30,6 +30,9 @@ class Workflow:
print ' %s' % fun.name print ' %s' % fun.name
def add_project(self,project): def add_project(self,project):
if project == None:
logger.log('notice','Proejct is empty')
logger.log('notice','Project added in : %s' %self.name)
self.project = project self.project = project
class EmptyWorkflow(Workflow): class EmptyWorkflow(Workflow):

View File

@ -5,28 +5,30 @@ from dataset import *
from scipy import rand,shape from scipy import rand,shape
class DatasetTest(unittest.TestCase): class DatasetTest(unittest.TestCase):
def setUp(self): def setUp(self):
self.dim_0_ids = ['sample_a','sample_b'] dim_0_ids = ('sample_a','sample_b')
self.dim_1_ids = ['gene_a','gene_b','gene_c'] dim_1_ids = ('gene_a','gene_b','gene_c')
self.dim_labels = ['samples','genes'] dim_labels = ('samples','genes')
self.def_list = [[self.dim_labels[0],self.dim_0_ids],[self.dim_labels[1],self.dim_1_ids]] identifiers= [(dim_labels[0],dim_0_ids),(dim_labels[1],dim_1_ids)]
self.array = rand(2,3) self.array = rand(2,3)
self.testdata = Dataset(self.array,self.def_list) self.testdata = Dataset(self.array,identifiers)
def testCreation(self): def testCreation(self):
assert self.testdata._data == self.array data = self.testdata
assert 'sample_a' in self.testdata.ids['samples'].keys() assert data._array == self.array
assert 'gene_b' in self.testdata.ids['genes'].keys() assert 'sample_a' in data['samples'].keys()
assert data['samples']['sample_b']==1
assert 'gene_c' in data['genes'].keys()
assert data['genes']['gene_c']==2
#def testExtraction(self):
# ids = ['gene_a','gene_b']
def testExtraction(self): # dim_name = 'genes'
ids = ['gene_a','gene_b'] # subset = self.testdata.extract_data(ids,dim_name)
dim_name = 'genes' # assert shape(subset._data) == (2,2)
subset = self.testdata.extract_data(ids,dim_name) # assert subset.ids[dim_name].keys() == ids
assert shape(subset._data) == (2,2) # assert subset.ids[dim_name].values() == [0,1]
assert subset.ids[dim_name].keys() == ids
assert subset.ids[dim_name].values() == [0,1]
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -102,9 +102,7 @@ class TestDataFunction(Function):
def run(self, data): def run(self, data):
logger.log('notice', 'Injecting foo test data') logger.log('notice', 'Injecting foo test data')
x = randn(20,30) x = randn(20,30)
axis_0 = ['rows',[]] X = dataset.Dataset(x)
axis_1 = ['cols',[]]
X = dataset.Dataset(x,[axis_0,axis_1])
return [X, plots.SinePlot(None)] return [X, plots.SinePlot(None)]

View File

@ -1,23 +1,27 @@
import gtk import gtk
import logger import logger
from workflow import * from workflow import *
from scipy import array,zeros import pickle
from data import read_affy_annot,read_mootha,data_dict_to_matrix from scipy import log2,transpose,dot,divide,shape,mean,resize,zeros
from scipy.linalg import svd,inv,norm,get_blas_funcs,eig
import plots import plots
import dataset import dataset
class PCAWorkflow(Workflow): class PCAWorkflow(Workflow):
name = 'PCA Workflow'
description = 'PCA workflow. Uses real microarray data from a study of diabetes (Mootha et al.).'
def __init__(self, app): def __init__(self, app):
Workflow.__init__(self, app) Workflow.__init__(self, app)
self.name = 'PCAs Workflow' #self.add_project(app.project)
#logger.log('notice','Current project added to: %s' %self.name)
load = Stage('load', 'Load Data') load = Stage('load', 'Load Data')
load.add_function(LoadMoothaData()) load.add_function(LoadMoothaData())
self.add_stage(load) self.add_stage(load)
preproc = Stage('preprocess', 'Preprocessing') preproc = Stage('preprocess', 'Preprocessing')
preproc.add_function(Function('log2', 'Logarithm')) preproc.add_function(Log2Function())
self.add_stage(preproc) self.add_stage(preproc)
annot = Stage('annot', 'Affy annotations') annot = Stage('annot', 'Affy annotations')
@ -25,7 +29,7 @@ class PCAWorkflow(Workflow):
self.add_stage(annot) self.add_stage(annot)
model = Stage('model', 'Model') model = Stage('model', 'Model')
model.add_function(Function('pca', 'PCA')) model.add_function(PCAFunction(self))
self.add_stage(model) self.add_stage(model)
logger.log('debug', '\tPCA\'s workflow is now active') logger.log('debug', '\tPCA\'s workflow is now active')
@ -34,7 +38,6 @@ class LoadAnnotationsFunction(Function):
def __init__(self): def __init__(self):
Function.__init__(self, 'load', 'Load Annotations') Function.__init__(self, 'load', 'Load Annotations')
self.annotations = None
def load_affy_file(self, filename): def load_affy_file(self, filename):
f = open(filename) f = open(filename)
@ -68,81 +71,188 @@ class LoadAnnotationsFunction(Function):
pathways = description[i_want] pathways = description[i_want]
if not pathways[0][0]=='--': if not pathways[0][0]=='--':
pass pass
D = []
return [D]
return [self.annotations]
class PCAFunction(Function): class PCAFunction(Function):
def __init__(self): def __init__(self,workflow):
Function.__init__(self, 'X', 'a_opt') Function.__init__(self, 'pca', 'PCA')
self.output = None self.output = None
self.workflow = workflow
def run(self, data): def run(self, data):
logger.log('debug', 'datatype: %s' % type(data)) logger.log('debug', 'datatype: %s' % type(data))
if not isinstance(data,dataset.Dataset): if not isinstance(data,dataset.Dataset):
return None return None
logger.log('debug', 'dimensions: %s' % data.dims) #logger.log('debug', 'dimensions: %s' % data.dims)
## calculations ## calculations
T,P,E,tsq = pca(data._data,a_opt=2) T,P,E,tsq = self.pca(data._array,5,tsq_loads=False)
comp_def = ['comp',['1','2']] comp_def = ('comp',('1','2','3','4','5'))
singel_def = ['1',['s']] singel_def = ('1',('s'))
col_def = [data._dim_names[0],data.names(0)]
row_def = [data._dim_names[1],data.names(1)] # pull out input identifiers:
T = dataset.Dataset(T,[col_def,comp_def]) data_ids = []
P = dataset.Dataset(T,[row_def,comp_def]) for dim in data:
E = dataset.Dataset(E,[col_def,row_def]) data_ids.append((dim,data[dim].keys()))
tsq = dataset.Dataset(tsq,[row_def,sigel_def])
T = dataset.Dataset(T,(data_ids[0],comp_def))
P = dataset.Dataset(P,[data_ids[1],comp_def])
E = dataset.Dataset(E,data_ids)
#tsq = dataset.Dataset(tsq,[singel_def,data_ids[1])
## plots ## plots
loading_plot = plots.ScatterPlot() loading_plot1 = plots.ScatterPlot(self.workflow.project,P,'genes','comp','1','2')
loading_plot2 = plots.ScatterPlot(self.workflow.project,P,'genes','comp','3','4')
score_plot = plots.ScatterPlot(self.workflow.project,T,'samples','comp','1','2')
return [T,P,E,loading_plot1,loading_plot2,score_plot]
def pca(self, X, a_opt, cent=True, scale='loads', tsq_loads=False):
    """Principal component analysis

    input:
    X         -- matrix, data (two-dimensional)
    a_opt     -- scalar, max number of comp. to calculate
    cent      -- bool, centering [True]
    scale     -- string, 'loads': singular values are put on the
                 loadings; 'scores': on the scores ['loads']
    tsq_loads -- bool, accepted but not used by this implementation [False]

    output:
    T,P,E,r -- scores, loadings, residuals (Xc - T*P') and the
               percentage of variance carried by each component

    Raises ValueError for an unknown scale.
    """
    # Reject a bad scale up front; otherwise T and P are never bound and
    # the residual computation fails with a NameError.
    if scale not in ('scores', 'loads'):
        raise ValueError("scale must be 'scores' or 'loads', got %r" % (scale,))

    # Unpacking doubles as a check that X is two-dimensional.
    nSamples, nVarX = shape(X)
    if cent:
        Xc = self.mat_center(X)
    else:
        Xc = X
    u, s, vh = self.esvd(Xc)
    if scale == 'scores':
        T = u * s
        P = transpose(vh)
    else:
        T = u
        P = transpose(vh) * s
    T = T[:, :a_opt]
    P = P[:, :a_opt]

    E = Xc - dot(T, transpose(P))
    varEach = s ** 2
    totVar = sum(varEach)
    r = divide(varEach, totVar) * 100
    return T, P, E, r
def mat_center(self, X, axis=0, ret_mn=False):
    """Mean center matrix along axis.

    input:
    X      -- matrix, data (two-dimensional)
    axis   -- 0: the mean over the rows is subtracted, so every column
              gets zero mean; 1: every row gets zero mean [0]
    ret_mn -- bool, return mean [False]

    output:
    Xc, [mnX]

    Raises ValueError when X is not two-dimensional or axis is not 0/1.
    """
    try:
        rows, cols = shape(X)
    except ValueError:
        # Fail here with a clear message; merely reporting the problem
        # and carrying on would end in a NameError on rows/cols.
        raise ValueError("The X data needs to be two-dimensional")

    if axis == 0:
        mnX = mean(X, axis)
        Xs = X - resize(mnX, (rows, cols))
    elif axis == 1:
        mnX = mean(X, axis)
        Xs = transpose(transpose(X) - resize(mnX, (cols, rows)))
    else:
        raise ValueError("axis must be 0 or 1")

    if ret_mn:
        return Xs, mnX
    return Xs
def esvd(self, data, economy=1):
    """SVD with the option of economy sized calculation

    Calculate subspaces of X'X or XX' depending on the shape
    of the matrix.
    Good for extreme fat or thin matrices.

    input:
    data    -- matrix, two-dimensional
    economy -- accepted but not used by this implementation

    output:
    u,s,v -- such that data is approximately (u*s) v; s holds the norms
             of the un-normalised vectors. A direction with zero norm is
             left as a zero vector.
    """
    mm = self.mm
    m, n = shape(data)
    if m >= n:
        # Decompose the small n x n matrix X'X, then map back: u = X v'.
        u, s, v = svd(mm(data, data, trans_a=1))
        u = mm(data, v, trans_b=1)
        for i in range(n):
            s[i] = norm(u[:, i])
            # Rank-deficient data gives zero-norm columns; dividing
            # would fill them with NaN.
            if s[i] > 0:
                u[:, i] = u[:, i] / s[i]
    else:
        # Decompose the small m x m matrix XX', then map back: v = u'X.
        u, s, v = svd(mm(data, data, trans_b=1))
        v = mm(u, data, trans_a=1)
        for i in range(m):
            s[i] = norm(v[i, :])
            if s[i] > 0:
                v[i, :] = v[i, :] / s[i]
    return u, s, v
def mm(self, a, b, alpha=1.0, beta=0.0, c=None, trans_a=0,
       trans_b=0):
    """Fast matrix multiplication

    Return alpha*(a*b) + beta*c.

    a,b,c : matrices (c is optional; without it the result is alpha*(a*b))
    alpha, beta: scalars
    trans_a : 0 (a not transposed),
              1 (a transposed),
              2 (a conjugate transposed)
    trans_b : 0 (b not transposed),
              1 (b transposed),
              2 (b conjugate transposed)
    """
    # 'if c:' would ask for the truth value of a whole array; test for
    # presence instead.
    if c is None:
        gemm, = get_blas_funcs(('gemm',), (a, b))
        # No c: leave the optional beta/c arguments to the BLAS wrapper
        # rather than handing it None.
        return gemm(alpha, a, b, trans_a=trans_a, trans_b=trans_b)
    gemm, = get_blas_funcs(('gemm',), (a, b, c))
    return gemm(alpha, a, b, beta, c, trans_a, trans_b)
return [T,P,E,r]
class LoadMoothaData(Function): class LoadMoothaData(Function):
def __init__(self): def __init__(self):
Function.__init__(self, 'load', 'Load diabetes data') Function.__init__(self, 'load', 'Load diabetes data')
self.annotations = None
def load_expression_file(self, filename): def run(self,data):
f = open(filename) data_file = open('full_data.pickle','r')
logger.log('notice', 'Loading expression file: %s' % filename) data = pickle.load(data_file)
self.file = f data_file.close()
self.filename = filename sample_file = open('sample_labels.pickle','r')
sample_names = pickle.load(sample_file)
def on_response(self, dialog, response): sample_file.close()
if response == gtk.RESPONSE_OK: typecode='f'
logger.log('notice', 'Reading file: %s' % dialog.get_filename()) nSamps = len(sample_names)
self.load_expression_file(dialog.get_filename()) nVars = len(data.keys())
def run(self, data):
btns = ('Open', gtk.RESPONSE_OK, \
'Cancel', gtk.RESPONSE_CANCEL)
dialog = gtk.FileChooserDialog('Open diabetes expression File',
buttons=btns)
dialog.connect('response', self.on_response)
dialog.run()
dialog.destroy()
### Reading and parsing here
d,sample_names = read_mootha()
n_samps = len(sample_names)
n_genes = len(d.keys())
typecode = 'f'
x = zeros((n_samps,n_genes),typecode)
gene_ids = [] gene_ids = []
for i,(id,desc) in enumerate(d.items()): x = zeros((nSamps,nVars),typecode)
for i,(id,desc) in enumerate(data.items()):
gene_ids.append(id) gene_ids.append(id)
x[:,i] = desc[0].astype(typecode) x[:,i] = desc[0].astype(typecode)
gene_def = ['genes',gene_ids] gene_def = ('genes',gene_ids)
sample_def = ['samples', sample_names] sample_def = ('samples', sample_names)
X = dataset.Dataset(x,[sample_def,gene_def]) # samples x genes X = dataset.Dataset(x,identifiers=[sample_def,gene_def]) # samples x genes
return [X] return [X]
class Log2Function(Function):
    """Workflow function that takes the base-2 logarithm of a dataset."""

    def __init__(self):
        Function.__init__(self, 'log', 'Log2')

    def run(self, data):
        """Return a one-element list holding the log2-transformed dataset.

        The new dataset is built from log2 of data._array and the
        identifiers reported by data.get_identifiers(); it is named
        'Log2_X'.
        """
        transformed = log2(data._array)
        identifiers = data.get_identifiers()
        result = dataset.Dataset(transformed, identifiers=identifiers,
                                 name='Log2_X')
        return [result]
PCAWorkflow.name = 'PCA Workflow' PCAWorkflow.name = 'PCA Workflow'