147 lines
4.8 KiB
Python
147 lines
4.8 KiB
Python
import gtk
|
|
import logger
|
|
from workflow import *
|
|
from scipy import array,zeros
|
|
from data import read_affy_annot,read_mootha,data_dict_to_matrix
|
|
import plots
|
|
import dataset
|
|
|
|
class PCAWorkflow(Workflow):
    """Workflow with four stages: load, preprocess, annot and model."""

    def __init__(self, app):
        """Set the workflow name and register one function per stage.

        app is handed unchanged to Workflow.__init__.
        """
        Workflow.__init__(self, app)
        self.name = 'PCAs Workflow'

        # One row per stage, in registration order:
        #   (stage id, stage title, factory for the stage's only function)
        # Factories are used so that each function object is created after
        # its stage, exactly as in a hand-written sequence of statements.
        stage_table = (
            ('load', 'Load Data',
             lambda: LoadMoothaData()),
            ('preprocess', 'Preprocessing',
             lambda: Function('log2', 'Logarithm')),
            ('annot', 'Affy annotations',
             lambda: LoadAnnotationsFunction()),
            ('model', 'Model',
             lambda: Function('pca', 'PCA')),
        )
        for stage_id, stage_title, make_function in stage_table:
            stage = Stage(stage_id, stage_title)
            stage.add_function(make_function())
            self.add_stage(stage)

        logger.log('debug', '\tPCA\'s workflow is now active')
|
|
|
|
class LoadAnnotationsFunction(Function):
    """Workflow function that lets the user pick an Affy annotation file
    and reads it with read_affy_annot.

    NOTE(review): self.annotations is never assigned after __init__, so
    run() always returns [None]; the pathway extraction loop is still a
    stub.
    """

    def __init__(self):
        Function.__init__(self, 'load', 'Load Annotations')
        self.annotations = None
        # Open handle of the file picked in the dialog; None while no file
        # is pending, so run() can tell a cancelled dialog from a selection.
        self.file = None

    def load_affy_file(self, filename):
        """Open filename and keep the handle in self.file for run()."""
        f = open(filename)
        logger.log('notice', 'Loading annotation file: %s' % filename)
        # Do not leak a handle that was opened earlier and never consumed.
        if self.file is not None:
            self.file.close()
        self.file = f

    def on_response(self, dialog, response):
        """Dialog 'response' handler: open the chosen file on RESPONSE_OK."""
        if response == gtk.RESPONSE_OK:
            logger.log('notice', 'Reading file: %s' % dialog.get_filename())
            self.load_affy_file(dialog.get_filename())

    def run(self, data):
        """Show a file chooser, then read the selected annotation file.

        data must provide names('genes'); those identifiers are compared
        with the keys of the parsed annotations.

        Returns [self.annotations].  When no file was selected the parsing
        step is skipped and the same value is returned.
        """
        btns = ('Open', gtk.RESPONSE_OK,
                'Cancel', gtk.RESPONSE_CANCEL)
        dialog = gtk.FileChooserDialog('Open Affy Annotation File',
                                       buttons=btns)
        dialog.connect('response', self.on_response)
        dialog.run()
        dialog.destroy()

        # Cancelling the dialog leaves no file to parse; bail out instead
        # of failing on a missing handle.
        if self.file is None:
            logger.log('notice', 'No annotation file selected')
            return [self.annotations]

        ### Reading and parsing here
        try:
            annot = read_affy_annot(self.file)
            i_want = 'Pathway'
            nothing = '---'  # marker for an empty annotation field
            ids_in_data = set(data.names('genes'))  # assuming we have genes
            if not ids_in_data.issubset(annot.keys()):
                logger.log('debug','Some identifers in data does not exist in affy file!')
            # annot is used as a mapping (it has keys()), so iterate over
            # its (id, description) pairs rather than over the bare keys.
            for affy_id, description in annot.items():
                if affy_id in ids_in_data:
                    pathways = description[i_want]
                    if pathways[0][0] != nothing:
                        # TODO: collect the pathways into self.annotations
                        pass
        finally:
            # The handle is only needed while parsing; always release it.
            self.file.close()
            self.file = None

        return [self.annotations]
|
|
|
|
class PCAFunction(Function):
    """Workflow function that runs pca() with two components on a Dataset
    and wraps the results as Dataset objects."""

    def __init__(self):
        Function.__init__(self, 'X', 'a_opt')
        self.output = None

    def run(self, data):
        """Run the PCA on data and wrap the results.

        Returns None when data is not a dataset.Dataset; otherwise the list
        [T, P, E, tsq], each element wrapped in a dataset.Dataset.
        """
        logger.log('debug', 'datatype: %s' % type(data))
        if not isinstance(data, dataset.Dataset):
            return None
        # Wrap in a 1-tuple so a tuple-valued dims is formatted as a whole
        # instead of being unpacked as separate format arguments.
        logger.log('debug', 'dimensions: %s' % (data.dims,))

        ## calculations
        # NOTE(review): pca is not imported by name in the visible imports;
        # presumably it arrives through `from workflow import *` - confirm.
        n_components = 2
        T, P, E, tsq = pca(data._data, a_opt=n_components)

        # Dimension definitions: [dimension name, list of identifiers].
        comp_def = ['comp', [str(k + 1) for k in range(n_components)]]
        singel_def = ['1', ['s']]
        col_def = [data._dim_names[0], data.names(0)]
        row_def = [data._dim_names[1], data.names(1)]

        T = dataset.Dataset(T, [col_def, comp_def])
        # Wrap P itself; T has already been rebound to a Dataset above.
        P = dataset.Dataset(P, [row_def, comp_def])
        E = dataset.Dataset(E, [col_def, row_def])
        # NOTE(review): tsq is labelled with row_def (dimension 1) -
        # confirm that this matches the shape pca() returns.
        tsq = dataset.Dataset(tsq, [row_def, singel_def])

        ## plots
        # Not used yet; kept so the plot object is still constructed.
        loading_plot = plots.ScatterPlot()

        return [T, P, E, tsq]
|
|
|
|
class LoadMoothaData(Function):
    """Workflow function that loads the expression data returned by
    read_mootha() into a samples x genes Dataset.

    NOTE(review): the file picked in the dialog is opened but never passed
    to read_mootha(), which is called without arguments - confirm whether
    the selection is meant to be used.
    """

    def __init__(self):
        Function.__init__(self, 'load', 'Load diabetes data')
        self.annotations = None
        # Handle and path of the file picked in the dialog; None until a
        # file has been chosen.
        self.file = None
        self.filename = None

    def load_expression_file(self, filename):
        """Open filename and remember both the handle and the path."""
        f = open(filename)
        logger.log('notice', 'Loading expression file: %s' % filename)
        # Do not leak a handle that was opened by an earlier selection.
        if self.file is not None:
            self.file.close()
        self.file = f
        self.filename = filename

    def on_response(self, dialog, response):
        """Dialog 'response' handler: open the chosen file on RESPONSE_OK."""
        if response == gtk.RESPONSE_OK:
            logger.log('notice', 'Reading file: %s' % dialog.get_filename())
            self.load_expression_file(dialog.get_filename())

    def run(self, data):
        """Show a file chooser, then build the expression Dataset.

        The data argument is not used.  Loading proceeds whether or not a
        file was selected.

        Returns a one-element list holding a dataset.Dataset with
        dimensions ['samples', 'genes'].
        """
        btns = ('Open', gtk.RESPONSE_OK,
                'Cancel', gtk.RESPONSE_CANCEL)
        dialog = gtk.FileChooserDialog('Open diabetes expression File',
                                       buttons=btns)
        dialog.connect('response', self.on_response)
        dialog.run()
        dialog.destroy()

        ### Reading and parsing here
        try:
            d, sample_names = read_mootha()
            # Snapshot the items once so the gene count and the column
            # order used below are guaranteed to agree.
            gene_items = list(d.items())
            n_samps = len(sample_names)
            n_genes = len(gene_items)
            typecode = 'f'
            x = zeros((n_samps, n_genes), typecode)
            gene_ids = []
            for i, (gene_id, desc) in enumerate(gene_items):
                gene_ids.append(gene_id)
                # desc[0] is converted to typecode and stored as column i.
                x[:, i] = desc[0].astype(typecode)
        finally:
            # Nothing reads the handle opened by the dialog; release it.
            if self.file is not None:
                self.file.close()
                self.file = None

        gene_def = ['genes', gene_ids]
        sample_def = ['samples', sample_names]
        X = dataset.Dataset(x, [sample_def, gene_def])  # samples x genes
        return [X]
|