BREAKING STUFF!

Rename fluents to laydi.
2008-12-05 21:48:24 +00:00
parent 45a06fab7f
commit 27e4504bf6
31 changed files with 0 additions and 0 deletions

laydi/__init__.py Normal file (3 lines)

@@ -0,0 +1,3 @@
import main

laydi/annotations.py Normal file (105 lines)

@@ -0,0 +1,105 @@
from laydi import dataset
_dim_annotation_handlers = {}
def get_dim_annotations(dimname, annotation, ids):
"""Returns a list of annotations corresponding to the given ids in
dimension dimname"""
global _dim_annotation_handlers
if _dim_annotation_handlers.has_key(dimname):
return _dim_annotation_handlers[dimname].get_annotations(annotation, ids)
return None
def set_dim_handler(dimname, handler):
"""Set the handler for the given dimension."""
global _dim_annotation_handlers
_dim_annotation_handlers[dimname] = handler
def get_dim_handler(dimname):
"""Get the handler for the given dimension."""
global _dim_annotation_handlers
if _dim_annotation_handlers.has_key(dimname):
return _dim_annotation_handlers[dimname]
else:
return None
class AnnotationHandler:
def __init__(self):
pass
def get_annotations(self, annotationname, ids, default=None):
return None
def get_annotation_names(self):
return []
class DictAnnotationHandler(AnnotationHandler):
def __init__(self, d=None):
if d == None:
d = {}
self._dict = d
def get_annotations(self, annotationname, ids, default=None):
d = self._dict
retval = []
for id in ids:
if d[annotationname].has_key(id):
retval.append(d[annotationname][id])
else:
retval.append(default)
return retval
def add_annotations(self, annotationname, d):
self._dict[annotationname] = d
def get_annotation_names(self):
return self._dict.keys()
def read_annotations_file(filename):
"""Read annotations from file.
Reads annotations from a tab delimited file of the format::
dimname annotation_name1 annotation_name2 ...
id1 Foo 0.43
id2 Bar 0.59
"""
ann = DictAnnotationHandler()
dimname = None
annotation_dicts = []
annotation_names = []
fd = open(filename)
## Read the first line, which contains the dimension name and
## annotation names.
line = fd.readline()
values = [x.strip() for x in line.split('\t')]
dimname = values[0]
annotation_names = values[1:]
annotation_dicts = [{} for x in annotation_names]
## Read the lines containing the annotations. The first value on
## each line is an id along the dimension.
line = fd.readline()
while line:
values = [x.strip() for x in line.split('\t')]
for i, x in enumerate(values[1:]):
annotation_dicts[i][values[0]] = x
line = fd.readline()
fd.close()
## Add everything to the annotation object and add the object to
## the specified dimension.
for i, a in enumerate(annotation_names):
ann.add_annotations(a, annotation_dicts[i])
_dim_annotation_handlers[dimname] = ann
return ann
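A minimal usage sketch for the handler registry above (the dimension name 'genes' and the ids are illustrative; assumes the module is importable as laydi.annotations):

from laydi import annotations

# Build a handler mapping ids to a 'symbol' annotation.
handler = annotations.DictAnnotationHandler()
handler.add_annotations('symbol', {'gene_1': 'TP53', 'gene_2': 'BRCA1'})

# Register it for the 'genes' dimension and query it back.
annotations.set_dim_handler('genes', handler)
print annotations.get_dim_annotations('genes', 'symbol', ['gene_1', 'gene_2', 'gene_3'])
# -> ['TP53', 'BRCA1', None]   (ids without an annotation get the default)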

laydi/cfgparse.py Normal file (1762 lines)

File diff suppressed because it is too large

laydi/dataset.py Normal file (748 lines)

@@ -0,0 +1,748 @@
from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros,empty,sparse,\
where
from scipy import sort as array_sort
from itertools import izip
import shelve
import copy
import re
class Dataset(object):
"""The Dataset base class.
A Dataset is an n-way array with defined string identifiers across
all dimensions.
example of use:
---
dim_name_rows = 'rows'
names_rows = ('row_a','row_b')
ids_1 = [dim_name_rows, names_rows]
dim_name_cols = 'cols'
names_cols = ('col_a','col_b','col_c','col_d')
ids_2 = [dim_name_cols, names_cols]
Array_X = rand(2,4)
data = Dataset(Array_X,(ids_1,ids_2),name="Testing")
dim_names = [dim for dim in data]
column_identifiers = [id for id in data['cols'].keys()]
column_index = [index for index in data['cols'].values()]
'cols' in data -> True
---
data = Dataset(rand(10,20)) (generates dims and ids (no links))
"""
def __init__(self, array, identifiers=None, name='Unnamed dataset'):
self._dims = [] #existing dimensions in this dataset
self._map = {} # internal mapping for dataset: identifier <--> index
self._name = name
self._identifiers = identifiers
if not isinstance(array, sparse.spmatrix):
array = atleast_2d(asarray(array))
# vectors are stored as column arrays
if array.shape[0] == 1:
array = array.T
self.shape = array.shape
if identifiers != None:
self._validate_identifiers(identifiers)
self._set_identifiers(identifiers, self._all_dims)
else:
self._identifiers = self._create_identifiers(self.shape, self._all_dims)
self._set_identifiers(self._identifiers, self._all_dims)
self._array = array
def __iter__(self):
"""Returns an iterator over dimensions of dataset."""
return self._dims.__iter__()
def __contains__(self,dim):
"""Returns True if dim is a dimension name in dataset."""
# return self._dims.__contains__(dim)
return self._map.__contains__(dim)
def __len__(self):
"""Returns the number of dimensions in the dataset"""
return len(self._map)
def __getitem__(self,dim):
"""Return the identifers along the dimension dim."""
return self._map[dim]
def _create_identifiers(self, shape, all_dims):
"""Creates dimension names and identifier names, and returns
identifiers."""
dim_names = ['rows','cols']
ids = []
for axis, n in enumerate(shape):
if axis < 2:
dim_suggestion = dim_names[axis]
else:
dim_suggestion = 'dim'
dim_suggestion = self._suggest_dim_name(dim_suggestion, all_dims)
identifier_creation = [str(axis) + "_" + i for i in map(str, range(n))]
ids.append((dim_suggestion, identifier_creation))
all_dims.add(dim_suggestion)
return ids
def _set_identifiers(self, identifiers, all_dims):
"""Creates internal mapping of identifiers structure."""
for dim, ids in identifiers:
pos_map = ReverseDict()
if dim not in self._dims:
self._dims.append(dim)
all_dims.add(dim)
else:
raise ValueError, "Dimension names must be unique whitin dataset"
for pos, id in enumerate(ids):
pos_map[id] = pos
self._map[dim] = pos_map
def _suggest_dim_name(self,dim_name,all_dims):
"""Suggests a unique name for dim and returns it"""
c = 0
new_name = dim_name
while new_name in all_dims:
new_name = dim_name + "_" + str(c)
c += 1
return new_name
def asarray(self):
"""Returns the numeric array (data) of dataset"""
if isinstance(self._array, sparse.spmatrix):
return self._array.toarray()
return self._array
def set_array(self, array):
"""Adds array as an ArrayType object.
A one-dim array is transformed to a two-dim array (row-vector)
"""
if not isinstance(array, type(self._array)):
raise ValueError("Input array of type: %s does not match existing array type: %s" %(type(array), type(self._array)))
if self.shape != array.shape:
raise ValueError, "Input array must be of similar dimensions as dataset"
self._array = atleast_2d(asarray(array))
def get_name(self):
"""Returns dataset name"""
return self._name
def get_all_dims(self):
"""Returns all dimensions in project"""
return self._all_dims
def get_dim_name(self, axis=None):
"""Returns dim name for an axis, if no axis is provided it
returns a list of dims"""
if type(axis) == int:
return self._dims[axis]
else:
return [dim for dim in self._dims]
def common_dims(self, ds):
"""Returns a list of the common dimensions in the two datasets."""
dims = self.get_dim_name()
ds_dims = ds.get_dim_name()
return [d for d in dims if d in ds_dims]
def get_identifiers(self, dim, indices=None, sorted=False):
"""Returns identifiers along dim, sorted by position (index)
is optional.
You can optionally provide a list/ndarray of indices to get
only the identifiers of a given position.
Identifiers are the unique names (strings) for a variable in a
given dim. Index (Indices) are the Identifiers position in a
matrix in a given dim.
"""
if indices != None:
if len(indices) == 0:# if empty list or empty array
return []
if indices != None:
# be sure to match intersection
#indices = intersect1d(self.get_indices(dim),indices)
ids = [self._map[dim].reverse[i] for i in indices]
else:
if sorted == True:
ids = [self._map[dim].reverse[i] for i in array_sort(self._map[dim].values())]
else:
ids = self._map[dim].keys()
return ids
def get_indices(self, dim, idents=None):
"""Returns indices for identifiers along dimension.
You can optionally provide a list of identifiers to retrieve a
index subset.
Identifiers are the unique names (strings) for a variable in a
given dim. Index (Indices) are the Identifiers position in a
matrix in a given dim. If none of the input identifiers are
found an empty index is returned
"""
if not isinstance(idents, list) and not isinstance(idents, set):
raise ValueError("idents needs to be a list/set got: %s" %type(idents))
if idents == None:
index = array_sort(self._map[dim].values())
else:
index = [self._map[dim][key]
for key in idents if self._map[dim].has_key(key)]
return asarray(index)
def existing_identifiers(self, dim, idents):
"""Filters a list of identifiers to find those that are present in the
dataset.
The most common use of this function is to get a list of
identifiers who correspond one to one with the list of indices produced
when get_indices is given an identifier list. That is
ds.get_indices(dim, idents) and ds.existing_identifiers(dim, idents)
will have the same order.
@param dim: A dimension present in the dataset.
@param idents: A list of identifiers along the given dimension.
@return: A list of identifiers in the same order as idents, but
without elements not present in the dataset.
"""
if not isinstance(idents, list) and not isinstance(idents, set):
raise ValueError("idents needs to be a list/set got: %s" %type(idents))
return [key for key in idents if self._map[dim].has_key(key)]
def copy(self):
""" Returns deepcopy of dataset.
"""
return copy.deepcopy(self)
def subdata(self, dim, idents):
"""Returns a new dataset based on dimension and given identifiers.
"""
ds = self.copy()
indices = ds.get_indices(dim, idents)
idents = ds.get_identifiers(dim, indices=indices)
if not idents:
raise ValueError("No of identifers from: \n%s \nfound in %s" %(str(idents), ds._name))
ax = [i for i, name in enumerate(ds._dims) if name == dim][0]
subarr = ds._array.take(indices, ax)
new_indices = range(len(idents))
ds._map[dim] = ReverseDict(zip(idents, new_indices))
ds.shape = tuple(len(ds._map[d]) for d in ds._dims)
ds.set_array(subarr)
return ds
def transpose(self):
"""Returns a copy of transpose of a dataset.
For the moment there is only support for 2D arrays.
"""
assert(len(self.shape) == 2)
ds = self.copy()
ds._array = ds._array.T
ds._dims.reverse()
ds.shape = ds._array.shape
return ds
def _validate_identifiers(self, identifiers):
for dim_name, ids in identifiers:
if len(set(ids)) != len(ids):
raise ValueError("Identifiers not unique in : %s" %dim_name)
identifier_shape = [len(i[1]) for i in identifiers]
if len(identifier_shape) != len(self.shape):
raise ValueError("Identifier list length must equal array dims")
for ni, na in zip(identifier_shape, self.shape):
if ni != na:
raise ValueError, "Identifier-array mismatch: %s: (idents: %s, array: %s)" %(self._name, ni, na)
class CategoryDataset(Dataset):
"""The category dataset class.
A dataset for representing class information as binary
matrices (0/1-matrices).
There is support for using a less memory-demanding, sparse format. The
preferred (default) format for a category dataset is the compressed
sparse row format (csr).
Always has linked dimension in first dim:
ex matrix:
. go_term1 go_term2 ...
gene_1
gene_2
gene_3
.
.
.
"""
def __init__(self, array, identifiers=None, name='C'):
Dataset.__init__(self, array, identifiers=identifiers, name=name)
def as_spmatrix(self):
if isinstance(self._array, sparse.spmatrix):
return self._array
else:
arr = self.asarray()
return sparse.csr_matrix(arr.astype('i'))
def to_spmatrix(self):
if isinstance(self._array, sparse.spmatrix):
self._array = self._array.tocsr()
else:
self._array = sparse.csr_matrix(self._array)
def as_dictlists(self):
"""Returns data as dict of identifiers along first dim.
ex: data['gene_1'] = ['map0030','map0010', ...]
fixme: Deprecated?
"""
data = {}
for name, ind in self._map[self.get_dim_name(0)].items():
if isinstance(self._array, ndarray):
indices = self._array[ind,:].nonzero()[0]
elif isinstance(self._array, sparse.spmatrix):
if not isinstance(self._array, sparse.csr_matrix):
array = self._array.tocsr()
else:
array = self._array
indices = array[ind,:].indices
if len(indices) == 0: # should we allow categories with no members?
continue
data[name] = self.get_identifiers(self.get_dim_name(1), indices)
self._dictlists = data
return data
def as_selections(self):
"""Returns data as a list of Selection objects.
The list of selections is not ordered (sorted) by any means.
"""
ret_list = []
for cat_name, ind in self._map[self.get_dim_name(1)].items():
if isinstance(self._array, sparse.spmatrix):
if not isinstance(self._array, sparse.csc_matrix):
self._array = self._array.tocsc()
indices = self._array[:,ind].indices
else:
indices = self._array[:,ind].nonzero()[0]
if len(indices) == 0:
continue
ids = self.get_identifiers(self.get_dim_name(0), indices)
selection = Selection(cat_name)
selection.select(self.get_dim_name(0), ids)
ret_list.append(selection)
return ret_list
class GraphDataset(Dataset):
"""The graph dataset class.
A dataset class for representing graphs. The constructor accepts an
incidence matrix (possibly sparse). If the networkx library is
installed, there is also support for representing the graph as a
networkx.Graph or networkx.XGraph structure.
"""
def __init__(self, input, identifiers=None, name='A', nodepos = None):
if isinstance(input, sparse.spmatrix):
arr = input
else:
try:
arr = asarray(input)
except:
raise ValueError("Could not identify input")
Dataset.__init__(self, array=arr, identifiers=identifiers, name=name)
self._graph = None
self.nodepos = nodepos
def as_spmatrix(self):
if isinstance(self._array, sparse.spmatrix):
return self._array
else:
arr = self.asarray()
return sparse.csr_matrix(arr.astype('i'))
def to_spmatrix(self):
if isinstance(self._array, sparse.spmatrix):
self._array = self._array.tocsr()
else:
self._array = sparse.csr_matrix(self._array)
def asnetworkx(self):
if self._graph != None:
return self._graph
dim0, dim1 = self.get_dim_name()
node_ids = self.get_identifiers(dim0, sorted=True)
edge_ids = self.get_identifiers(dim1, sorted=True)
G, weights = self._graph_from_incidence_matrix(self._array, node_ids=node_ids, edge_ids=edge_ids)
self._graph = G
return G
def from_networkx(cls, G, node_dim, edge_dim, sp_format=True):
"""Create graph dataset from networkx graph.
When G is a Graph/DiGraph, edge identifiers will be created;
else (XGraph/XDiGraph) it is assumed that edge attributes are
the edge identifiers.
"""
import networkx as nx
n = G.number_of_nodes()
m = G.number_of_edges()
if isinstance(G, nx.DiGraph):
G = nx.XDiGraph(G)
elif isinstance(G, nx.Graph):
G = nx.XGraph(G)
edge_ids = [e[2] for e in G.edges()]
node_ids = map(str, G.nodes())
n2ind = {}
for ind, node in enumerate(node_ids):
n2ind[node] = ind
if sp_format:
I = sparse.lil_matrix((n, m))
else:
I = zeros((m, n), dtype='i')
for i, (h, t, eid) in enumerate(G.edges()):
if eid != None:
edge_ids[i] = eid
else:
edge_ids[i] = 'e_' + str(i)
hind = n2ind[str(h)]
tind = n2ind[str(t)]
I[hind, i] = 1
if G.is_directed():
I[tind, i] = -1
else:
I[tind, i] = 1
idents = [[node_dim, node_ids], [edge_dim, edge_ids]]
if G.name != '':
name = G.name
else:
name = 'A'
ds = GraphDataset(I, idents, name)
return ds
from_networkx = classmethod(from_networkx)
def _incidence2adjacency(self, I):
"""Incidence to adjacency matrix.
I*I.T - eye(n)?
"""
raise NotImplementedError
def _graph_from_incidence_matrix(self, I, node_ids, edge_ids):
"""Creates a networkx graph class from incidence
(possibly weighted) matrix and ordered labels.
labels = None, results in string-numbered labels
"""
try:
import networkx as nx
except:
print "Failed in import of NetworkX"
return None
m, n = I.shape
assert(m == len(node_ids))
assert(n == len(edge_ids))
weights = []
directed = False
G = nx.XDiGraph(name=self._name)
if isinstance(I, sparse.spmatrix):
I = I.tocsr()
for ename, col in izip(edge_ids, I.T):
if isinstance(I, sparse.spmatrix):
node_ind = col.indices
w1, w2 = col.data
else:
node_ind = where(col != 0)[0]
w1, w2 = col[node_ind]
node1 = node_ids[node_ind[0]]
node2 = node_ids[node_ind[1]]
if w1 < 0: # w1 is tail
directed = True
assert(w2 > 0 and (w1 + w2) == 0)
G.add_edge(node2, node1, ename)
weights.append(w2)
else: #w2 is tail or graph is undirected
assert(w1 > 0)
if w2 < 0:
directed = True
G.add_edge(node1, node2, ename)
weights.append(w1)
if not directed:
G = G.to_undirected()
return G, asarray(weights)
Dataset._all_dims = set()
class ReverseDict(dict):
"""A dictionary which can lookup values by key, and keys by value.
All values and keys must be hashable, and unique.
example:
>>d = ReverseDict((['a',1],['b',2]))
>>print d['a'] --> 1
>>print d.reverse[1] --> 'a'
"""
def __init__(self, *args, **kw):
dict.__init__(self, *args, **kw)
self.reverse = dict([[v, k] for k, v in self.items()])
def __setitem__(self, key, value):
dict.__setitem__(self, key, value)
try:
self.reverse[value] = key
except:
self.reverse = {value:key}
class Selection(dict):
"""Handles selected identifiers along each dimension of a dataset"""
def __init__(self, title='Unnamed Selection'):
self.title = title
def __getitem__(self, key):
if not self.has_key(key):
return None
return dict.__getitem__(self, key)
def dims(self):
return self.keys()
def axis_len(self, axis):
if self.has_key(axis):
return len(self[axis])
return 0
def select(self, axis, labels):
self[axis] = labels
def write_ftsv(fd, ds, decimals=7, sep='\t', fmt=None, sp_format=True):
"""Writes a dataset in fluents tab separated values (ftsv) form.
@param fd: An open file descriptor to the output file.
@param ds: The dataset to be written.
@param decimals: Number of decimals, only used for plain datasets.
@param fmt: String formatting
The function handles datasets of these classes:
Dataset, CategoryDataset and GraphDataset
"""
opened = False
if isinstance(fd, str):
fd = open(fd, 'w')
opened = True
# Write header information
if isinstance(ds, CategoryDataset):
type = 'category'
if fmt == None:
fmt = '%d'
elif isinstance(ds, GraphDataset):
type = 'network'
if fmt == None:
fmt = '%d'
elif isinstance(ds, Dataset):
type = 'dataset'
if fmt == None:
fmt = '%%.%df' % decimals
else:
fmt = '%%.%d' %decimals + fmt
else:
raise Exception("Unknown object type")
fd.write('# type: %s' %type + '\n')
for dim in ds.get_dim_name():
fd.write("# dimension: %s" % dim)
for ident in ds.get_identifiers(dim, sorted=True):
fd.write(" " + ident)
fd.write("\n")
fd.write("# name: %s" % ds.get_name() + '\n')
# xy-node-positions
if type == 'network' and ds.nodepos != None:
fd.write("# nodepos:")
node_dim = ds.get_dim_name(0)
for ident in ds.get_identifiers(node_dim, sorted=True):
fd.write(" %s,%s" %ds.nodepos[ident])
fd.write("\n")
# Write data
if hasattr(ds, "as_spmatrix") and sp_format == True:
m = ds.as_spmatrix()
else:
m = ds.asarray()
if isinstance(m, sparse.spmatrix):
_write_sparse_elements(fd, m, fmt, sep)
else:
_write_elements(fd, m, fmt, sep)
if opened:
fd.close()
def read_ftsv(fd, sep=None):
"""Read a dataset in fluents tab separated values (ftsv) form and return it.
@param fd: An open file descriptor.
@return: A Dataset, CategoryDataset or GraphDataset depending on the information
read.
"""
opened = False
if isinstance(fd, str):
fd = open(fd)
opened = True
split_re = re.compile('^#\s*(\w+)\s*:\s*(.+)')
dimensions = []
identifiers = {}
type = 'dataset'
name = 'Unnamed dataset'
sp_format = False
nodepos = None
# graphtype = 'graph'
# Read header lines from file.
line = fd.readline()
while line:
m = split_re.match(line)
if m:
key, val = m.groups()
# The line is on the form;
# dimension: dimname id1 id2 id3 ...
if key == 'dimension':
values = [v.strip() for v in val.split(' ')]
dimensions.append(values[0])
identifiers[values[0]] = values[1:]
# Read type of dataset.
# Should be dataset, category, or network
elif key == 'type':
type = val
elif key == 'name':
name = val
# storage format
# if sp_format is True then use coordinate triplets
elif key == 'sp_format':
if val in ['False', 'false', '0', 'F', 'f',]:
sp_format = False
elif val in ['True', 'true', '1', 'T', 't']:
sp_format = True
else:
raise ValueError("sp_format: %s not valid " %sp_format)
elif key == 'nodepos':
node_dim = dimensions[0]
idents = identifiers[node_dim]
nodepos = {}
xys = val.split(" ")
for node_id, xy in zip(idents, xys):
x, y = map(float, xy.split(","))
nodepos[node_id] = (x, y)
else:
break
line = fd.readline()
# Dimensions in the form [(dim1, [id1, id2, id3, ...]), ...]
dims = [(x, identifiers[x]) for x in dimensions]
dim_lengths = [len(identifiers[x]) for x in dimensions]
# Create matrix and assign element reader
if type == 'category':
if sp_format:
matrix = sparse.lil_matrix(dim_lengths)
else:
matrix = empty(dim_lengths, dtype='i')
else:
if sp_format:
matrix = sparse.lil_matrix(dim_lengths)
else:
matrix = empty(dim_lengths)
if sp_format:
matrix = _read_sparse_elements(fd, matrix)
else:
matrix = _read_elements(fd, matrix)
# Create dataset of specified type
if type == 'category':
ds = CategoryDataset(matrix, dims, name)
elif type == 'network':
ds = GraphDataset(matrix, dims, name=name, nodepos=nodepos)
else:
ds = Dataset(matrix, dims, name)
if opened:
fd.close()
return ds
def _write_sparse_elements(fd, arr, fmt='%d', sep=None):
""" Sparse coordinate format."""
fd.write('# sp_format: True\n\n')
fmt = '%d %d ' + fmt + '\n'
csr = arr.tocsr()
for ii in xrange(csr.size):
ir, ic = csr.rowcol(ii)
data = csr.getdata(ii)
fd.write(fmt % (ir, ic, data))
def _write_elements(fd, arr, fmt='%f', sep='\t'):
"""Standard value separated format."""
fmt = fmt + sep
fd.write('\n')
y, x = arr.shape
for j in range(y):
for i in range(x):
fd.write(fmt %arr[j, i])
fd.write('\n')
def _read_elements(fd, arr, sep=None):
line = fd.readline()
i = 0
while line:
values = line.split(sep)
for j, val in enumerate(values):
arr[i,j] = float(val)
i += 1
line = fd.readline()
return arr
def _read_sparse_elements(fd, arr, sep=None):
line = fd.readline()
while line:
i, j, val = line.split()
arr[int(i),int(j)] = float(val)
line = fd.readline()
return arr.tocsr()
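A small round-trip sketch for the ftsv reader/writer above (assumes the module is importable as laydi.dataset; the file name is arbitrary):

from laydi import dataset

# Two rows, three columns, with explicit identifiers per dimension.
ids = [('rows', ['r0', 'r1']), ('cols', ['c0', 'c1', 'c2'])]
ds = dataset.Dataset([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], ids, name='demo')

dataset.write_ftsv('demo.ftsv', ds)       # fd may be a filename or an open file
ds2 = dataset.read_ftsv('demo.ftsv')
print ds2.get_name(), ds2.get_dim_name()  # -> demo ['rows', 'cols']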

laydi/dialogs.py Normal file (108 lines)

@@ -0,0 +1,108 @@
import pygtk
# pygtk.require('2.0')
import gtk
import sys
import os
import gobject
import logger, project, workflow, main
DATADIR = os.path.dirname(sys.modules['laydi'].__file__)
GLADEFILENAME = os.path.join(DATADIR, 'fluents.glade')
class CreateProjectDruid(gtk.Window):
"""A druid for creating a new project.
The CreateProjectDruid gets a list of all classes derived from
Workflow, and asks the user to select one of these. A new project of
the selected class is added to the application."""
def __init__(self):
gtk.Window.__init__(self)
self.widget_tree = gtk.glade.XML(GLADEFILENAME, 'new_project_druid')
self.workflows = self.make_workflow_list()
self.selected = None
renderer = gtk.CellRendererText()
wf_name = gtk.TreeViewColumn('Workflow Name', renderer, text=0)
self['workflow_list'].insert_column(wf_name, 0)
self.wf_info = gtk.TextBuffer()
self['workflow_info'].set_buffer(self.wf_info)
def __getitem__(self, key):
return self.widget_tree.get_widget(key)
def make_workflow_list(self):
store = gtk.ListStore(gobject.TYPE_STRING, gobject.TYPE_PYOBJECT)
for wf in workflow.workflow_list():
store.insert_after(None, (wf.name, wf))
return store
def run(self):
self['workflow_list'].set_model(self.workflows)
self['druidpagestart1'].show()
self['druidpagefinish1'].show()
self['new_project_druid'].show()
self['druidpagefinish1'].connect('finish', self.finish)
self['workflow_list'].connect('cursor_changed', self.selection_updated)
self['druid'].connect('cancel', self.cancel)
self.connect('destroy', self.delete)
def delete(self, widget):
return False
def hide(self):
self['druidpagestart1'].hide()
self['druidpagefinish1'].hide()
self['new_project_druid'].hide()
gtk.Window.hide(self)
def finish(self, *rest):
tree, it = self['workflow_list'].get_selection().get_selected()
wf_class = self.workflows.get_value(it, 1)
proj = project.Project()
main.set_workflow(wf_class())
# self.app.set_workflow(wf(self.app))
# self.app.set_project(proj)
main.set_project(proj)
self.hide()
self.destroy()
def cancel(self, *ignored):
self.hide()
self.destroy()
def selection_updated(self, *rest):
tree, it = self['workflow_list'].get_selection().get_selected()
wf = self.workflows.get_value(it, 1)
self.wf_info.set_text(wf.description)
def get_text(title, text):
"""Allow user to type in a string for text."""
dlg = gtk.Dialog(title)
dlg.show()
text = gtk.Label(text)
text.show()
entry = gtk.Entry()
entry.show()
entry.set_activates_default(True)
dlg.vbox.pack_start(text)
dlg.vbox.pack_start(entry)
dlg.add_button(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL)
dlg.add_button(gtk.STOCK_OK, gtk.RESPONSE_OK)
dlg.set_default_response(gtk.RESPONSE_OK)
response = dlg.run()
retval = None
if response == gtk.RESPONSE_OK:
retval = entry.get_text()
dlg.destroy()
return retval
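A hedged sketch of the get_text helper above (requires a working GTK session; the title and prompt are illustrative):

# Returns the entered string, or None if the dialog is cancelled.
name = get_text('New project', 'Enter a project name:')
if name != None:
    print 'Creating project: %s' % name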

laydi/fluents.glade Normal file (1223 lines)

File diff suppressed because it is too large

laydi/fluents.py Normal file (402 lines)

@@ -0,0 +1,402 @@
#!/usr/bin/python
import os
import sys
import pygtk
pygtk.require('2.0')
import gobject
import gtk
import gtk.gdk
import gtk.glade
import gnome
import gnome.ui
import scipy
import pango
import project, workflow, dataset, view, navigator, dialogs, selections, plots, main
from logger import logger, LogView
PROGRAM_NAME = 'laydi'
VERSION = '0.1.0'
DATADIR = os.path.join(main.PYDIR, 'laydi')
#ICONDIR = os.path.join(DATADIR,"..","icons")
ICONDIR = main.ICONDIR
GLADEFILENAME = os.path.join(main.PYDIR, 'laydi/fluents.glade')
_icon_mapper = {dataset.Dataset: 'dataset',
dataset.CategoryDataset: 'category_dataset',
dataset.GraphDataset: 'graph_dataset',
plots.Plot: 'line_plot'}
class IconFactory:
"""Factory for icons that ensures that each icon is only loaded once."""
def __init__(self, path):
self._path = path
self._icons = {}
def get(self, iconname):
"""Returns the gdk loaded PixBuf for the given icon.
Reads the icon from file if necessary."""
# if iconname isn't a string, try to autoconvert
if not isinstance(iconname, str):
for cls in _icon_mapper.keys():
if isinstance(iconname, cls):
iconname = _icon_mapper[cls]
if self._icons.has_key(iconname):
return self._icons[iconname]
icon_fname = os.path.join(self._path, '%s.png' % iconname)
icon = gtk.gdk.pixbuf_new_from_file(icon_fname)
self._icons[iconname] = icon
return icon
icon_factory = IconFactory(ICONDIR)
class TableSizeSelection(gtk.Window):
def __init__(self):
self._SIZE = size = 5
gtk.Window.__init__(self, gtk.WINDOW_POPUP)
self._table = gtk.Table(size, size, True)
self._items = []
## Create a size x size table of EventBox objects, doubly stored because
## gtk.Table does not support indexed retrieval.
for y in range(size):
line = []
for x in range(size):
ebox = gtk.EventBox()
ebox.add(gtk.Frame())
ebox.set_size_request(20, 20)
ebox.set_visible_window(True)
self._table.attach(ebox, x, x+1, y, y+1, gtk.FILL, gtk.FILL)
line.append(ebox)
self._items.append(line)
self.set_border_width(5)
self.add(self._table)
self.connect_signals()
def _get_child_pos(self, child):
size = self._SIZE
for x in range(size):
for y in range(size):
if self._items[y][x] == child:
return (x, y)
return None
def connect_signals(self):
size = self._SIZE
for x in range(size):
for y in range(size):
self._items[y][x].add_events(gtk.gdk.ENTER_NOTIFY_MASK)
self._items[y][x].connect("enter-notify-event",
self._on_enter_notify)
self._items[y][x].connect("button-release-event",
self._on_button_release)
def _on_enter_notify(self, widget, event):
size = self._SIZE
x, y = self._get_child_pos(widget)
for i in range(size):
for j in range(size):
if i <= x and j <= y:
self._items[j][i].set_state(gtk.STATE_SELECTED)
else:
self._items[j][i].set_state(gtk.STATE_NORMAL)
self.x = x
self.y = y
def _on_button_release(self, widget, event):
size = self._SIZE
self.emit('table-size-set', self.x+1, self.y+1)
self.hide_all()
for x in range(size):
for y in range(size):
self._items[y][x].set_state(gtk.STATE_NORMAL)
class ViewFrameToolButton (gtk.ToolItem):
def __init__(self):
gtk.ToolItem.__init__(self)
fname = os.path.join(ICONDIR, "table_size.png")
image = gtk.Image()
image.set_from_file(fname)
self._button = gtk.Button()
self._button.set_image(image)
self._button.set_property("can-focus", False)
eb = gtk.EventBox()
eb.add(self._button)
self.add(eb)
self._item = TableSizeSelection()
self._button.connect("button-press-event", self._on_show_menu)
image.show()
self._image = image
self._item.connect("table-size-set", self._on_table_size_set)
self._button.set_relief(gtk.RELIEF_NONE)
self.show_all()
def _on_show_menu(self, widget, event):
x, y = self._image.window.get_origin()
x2, y2, w, h, b = self._image.window.get_geometry()
self._item.move(x, y+h)
self._item.show_all()
def _on_table_size_set(self, widget, width, height):
main.application['main_view'].resize_table(width, height)
class FluentApp:
def __init__(self): # Application variables
# self.project = None
self.current_data = None
self._last_view = None
self._plot_toolbar = None
self._toolbar_state = None
gtk.glade.set_custom_handler(self.custom_object_factory)
self.widget_tree = gtk.glade.XML(GLADEFILENAME, 'appwindow')
# self.workflow = wf
self.idlist_crt = selections.IdListController(self['identifier_list'])
self.sellist_crt = selections.SelectionListController(self['selection_tree'],
self.idlist_crt)
self.dimlist_crt = selections.DimListController(self['dim_list'],
self.sellist_crt)
self.sellist_crt.set_dimlist_controller(self.dimlist_crt)
def init_gui(self):
self['appwindow'].set_size_request(800, 600)
# Set up workflow
self.wf_view = workflow.WorkflowView(main.workflow)
self.wf_view.show()
self['workflow_vbox'].pack_end(self.wf_view)
self._wf_menu = workflow.WorkflowMenu(main.workflow)
self._wf_menu.show()
wf_menuitem = gtk.MenuItem('Fu_nctions')
wf_menuitem.set_submenu(self._wf_menu)
wf_menuitem.show()
self['menubar1'].insert(wf_menuitem, 2)
# Connect signals
signals = {'on_quit1_activate' : (gtk.main_quit),
'on_appwindow_delete_event' : (gtk.main_quit),
'on_zoom_in_button_clicked' : (self.on_single_view),
'on_zoom_out_button_clicked' : (self.on_multiple_view),
'on_new1_activate' : (self.on_create_project),
'on_button_new_clicked' : (self.on_create_project),
'on_workflow_refresh_clicked' : (self.on_workflow_refresh_clicked),
'on_index1_activate' : (self.on_help_index),
'on_about1_activate' : (self.on_help_about),
'on_report_bug1_activate' : (self.on_help_report_bug),
'on_small_view1_activate' : (self.on_multiple_view),
'on_large_view1_activate' : (self.on_single_view),
'on_left1_activate' : (self.on_left),
'on_right1_activate' : (self.on_right),
'on_up1_activate' : (self.on_up),
'on_down1_activate' : (self.on_down),
'on_navigator1_activate' : (self.on_show_navigator),
'on_workflow1_activate' : (self.on_show_workflow),
'on_information1_activate' : (self.on_show_infopane),
}
self.widget_tree.signal_autoconnect(signals)
self['main_view'].connect('view-changed', self.on_view_changed)
# Log that we've set up the app now
logger.debug('Program started')
# Add ViewFrame table size to toolbar
tb = ViewFrameToolButton()
self['toolbar'].add(tb)
def set_project(self, proj):
logger.notice('Welcome to your new project. Grasp That Data!')
self.navigator_view.add_project(proj)
self.dimlist_crt.set_project(proj)
self.sellist_crt.set_project(proj)
def set_workflow(self, workflow):
main.workflow = workflow
self.wf_view.set_workflow(main.workflow)
def show(self):
self.init_gui()
def change_plot(self, plot):
"""Sets the plot in the currently active ViewFrame. If the plot is
already shown in another ViewFrame it will be moved from there."""
# Set current selection in the plot before showing it.
plot.selection_changed(None, main.project.get_selection())
self['main_view'].insert_view(plot)
self._update_toolbar(plot)
def change_plots(self, plots):
"""Changes all plots."""
self['main_view'].set_all_plots(plots)
v = self.get_active_view_frame().get_view()
self._update_toolbar(v)
def get_active_view_frame(self):
return self['main_view'].get_active_view_frame()
def _update_toolbar(self, view):
"""Set the plot specific toolbar to the toolbar of the currently
active plot."""
# don't do anything on no change
if self._last_view == view:
return
self._last_view = view
logger.debug("view changed to %s" % view)
window = self['plot_toolbar_dock']
if self._plot_toolbar:
toolbar_state = self._plot_toolbar.get_mode()
window.remove(self._plot_toolbar)
else:
toolbar_state = "default"
if view:
self._plot_toolbar = view.get_toolbar()
self._plot_toolbar.set_mode(toolbar_state)
else:
self._plot_toolbar = None
if self._plot_toolbar:
window.add(self._plot_toolbar)
# Methods to create GUI widgets from CustomWidgets in the glade file.
# The custom_object_factory calls other functions to generate specific
# widgets.
def custom_object_factory(self, glade, fun_name, widget_name, s1, s2, i1, i2):
"Called by the glade file reader to create custom GUI widgets."
handler = getattr(self, fun_name)
return handler(s1, s2, i1, i2)
def create_logview(self, str1, str2, int1, int2):
self.log_view = LogView(logger)
self.log_view.show()
return self.log_view
def create_main_view(self, str1, str2, int1, int2):
self.main_view = view.MainView()
self.main_view.show()
return self.main_view
def create_navigator_view(self, str1, str2, int1, int2):
self.navigator_view = navigator.NavigatorView()
self.navigator_view.show()
return self.navigator_view
def create_dim_list(self, str1, str2, int1, int2):
self.dim_list = selections.DimList()
self.dim_list.show()
return self.dim_list
def create_selection_tree(self, str1, str2, int1, int2):
self.selection_tree = selections.SelectionTree()
self.selection_tree.show()
return self.selection_tree
def create_identifier_list(self, str1, str2, int1, int2):
self.identifier_list = selections.IdentifierList()
self.identifier_list.show()
return self.identifier_list
def __getitem__(self, key):
return self.widget_tree.get_widget(key)
# Event handlers.
# These methods are called by the gtk framework in response to events and
# should not be called directly.
def on_single_view(self, *ignored):
self['main_view'].goto_large()
def on_multiple_view(self, *ignored):
self['main_view'].goto_small()
def on_create_project(self, *rest):
d = dialogs.CreateProjectDruid()
d.run()
def on_help_about(self, *rest):
widget_tree = gtk.glade.XML(GLADEFILENAME, 'aboutdialog')
about = widget_tree.get_widget('aboutdialog')
about.run()
def on_help_index(self, *ignored):
gnome.help_display_uri('https://dev.pvv.org/projects/fluents/wiki/help')
def on_help_report_bug(self, *ignored):
gnome.help_display_uri('https://dev.pvv.org/projects/fluents/newticket')
def on_workflow_refresh_clicked(self, *ignored):
try:
reload(sys.modules[main.workflow.__class__.__module__])
except Exception, e:
logger.warning('Cannot reload workflow')
logger.warning(e)
else:
logger.notice('Successfully reloaded workflow')
def on_view_changed(self, widget, vf):
self._update_toolbar(vf.get_view())
def on_show_navigator(self, item):
if item.get_active():
self['data_vbox'].show()
else:
self['data_vbox'].hide()
def on_show_workflow(self, item):
if item.get_active():
self['workflow_vbox'].show()
else:
self['workflow_vbox'].hide()
def on_show_infopane(self, item):
if item.get_active():
self['bottom_notebook'].show()
else:
self['bottom_notebook'].hide()
def on_left(self, item):
self.main_view.move_focus_left()
def on_right(self, item):
self.main_view.move_focus_right()
def on_up(self, item):
self.main_view.move_focus_up()
def on_down(self, item):
self.main_view.move_focus_down()
gobject.signal_new('table-size-set', TableSizeSelection,
gobject.SIGNAL_RUN_LAST,
gobject.TYPE_NONE,
(gobject.TYPE_INT, gobject.TYPE_INT))

laydi/lib/R_utils.py Normal file (284 lines)

@@ -0,0 +1,284 @@
"""A collection of functions that use R.
Most functions use libraries from bioconductor
depends on:
(not updated)
-- bioconductor min. install
-- hgu133a
-- hgu133plus2
"""
import scipy
import Numeric as N
import rpy
silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
def get_locusid(probelist=None,org="hgu133a"):
"""Returns a dictionary of locus link id for each affy probeset
and reverse mapping
input:
[probelist] -- probelist of affy probesets
[org] -- chip type (organism)
out:
aff2loc, loc2aff
The mapping is one-to-one for affy->locus_id
However, there are several affy probesets for one locus_id
From bioc-mail-archive: BioC takes the GenBank ids associated
with the probes (provided by the manufacturer) and then maps them
to Entrez Gene ids using data from UniGene, Entrez Gene, and other
available data sources we trust. The Entrez Gene id a probe is
assigned to is determined by votes from all the sources used. If
there is no agreement among the sources, we take the smallest
Entrez Gene id.
"""
silent_eval("library("+org+")")
silent_eval('locus_ids = as.list('+org+'LOCUSID)')
silent_eval('pp<-as.list(locus_ids[!is.na(locus_ids)])')
loc_ids = rpy.r("pp")
for id in loc_ids:
loc_ids[id] = str(loc_ids[id])
aff2loc = {}
if probelist:
for pid in probelist:
try:
aff2loc[pid]=loc_ids[pid]
except:
print "Affy probeset: %s has no locus id" %pid
print "\nCONVERSION SUMMARY:\n \
Number of probesets input %s \n \
Number of translated locus ids: %s \n \
Number of missings: %s" %(len(probelist),len(aff2loc),len(probelist)-len(aff2loc))
else:
aff2loc = loc_ids
# reverse mapping
loc2aff = {}
for k,v in aff2loc.items():
if loc2aff.has_key(v):
loc2aff[v].append(k)
else:
loc2aff[v]=[k]
return aff2loc,loc2aff
def get_kegg_paths(org="hgu133plus2",id_type='aff',probelist=None):
"""Returns a dictionary of KEGG maps.
input:
org -- chip_type (see bioconductor.org)
id_type -- id ['aff','loc']
key: affy_id, value = list of kegg map id
example: '65884_at': ['00510', '00513']
"""
silent_eval("library("+org+")")
silent_eval('xx<-as.list('+org+'PATH)')
silent_eval('xp <- xx[!is.na(xx)]')
aff2path = rpy.r("xp")
dummy = rpy.r("xx")
if id_type=='loc':
aff2loc,loc2aff = get_locusid(org=org)
loc2path = {}
for id,path in aff2path.items():
loc = aff2loc.get(id)
if loc == None:
continue
if loc2path.has_key(loc):
path = path + loc2path[loc]
print "Found duplicate in path: %s" %path
loc2path[loc] = path
aff2path = loc2path
out = {}
if probelist:
for pid in probelist:
try:
out[pid]=aff2path[pid]
except:
print "Could not find id: %s" %pid
else:
out = aff2path
for k,v in out.items():
# if string, convert to list
try:
v + ''
out[k] = [v]
except:
out[k] = v
return out
def get_probe_list(org="hgu133plus2"):
rpy.r.library(org)
silent_eval('probe_list<-ls('+org+'ACCNUM )')
pl = rpy.r("probe_list")
return pl
def get_GO_from_aff(org="hgu133plus2",id_type='aff',probelist=None):
"""Returns a dictionary of GO terms.
input:
org -- chip_type (see bioconductor.org)
id_type -- id ['aff','loc']
key:
example: '65884_at':
"""
silent_eval("library("+org+")")
silent_eval('xx<-as.list('+org+'GO)')
silent_eval('xp <- xx[!is.na(xx)]')
aff2path = rpy.r("xp")
dummy = rpy.r("xx")
if id_type=='loc':
aff2loc, loc2aff = get_locusid(org=org)
loc2path = {}
for id,path in aff2path.items():
loc = aff2loc.get(id)
if loc == None:
continue
if loc2path.has_key(loc):
path = path + loc2path[loc]
print "Found duplicate in path: %s" %path
loc2path[loc] = path
aff2path = loc2path
out = {}
if probelist:
for pid in probelist:
try:
out[pid]=aff2path[pid]
except:
print "Could not find id: %s" %pid
else:
out = aff2path
return out
def get_kegg_as_category(org="hgu133plus2",id_type='aff',probelist=None):
"""Returns kegg pathway memberships in dummy (1/0) matrix (genes x maps)
"""
kegg = get_kegg_paths(org=org, id_type=id_type, probelist=probelist)
maps = set()
for kpth in kegg.values():
maps.update(kpth)
n_maps = len(maps)
n_genes = len(kegg)
gene2index = dict(zip(kegg.keys(), range(n_genes)))
map2index = dict(zip(maps, range(n_maps)))
C = scipy.zeros((n_genes, n_maps))
for k,v in kegg.items():
for m in v:
C[gene2index[k], map2index[m]]=1
return C, list(maps), kegg.keys()
def impute(X, k=10, rowmax=0.5, colmax=0.8, maxp=1500, seed=362436069):
"""
A function to impute missing expression data, using nearest
neighbor averaging. (from bioconductors impute)
input:
data: An expression matrix with genes in the rows, samples in the
columns
k: Number of neighbors to be used in the imputation (default=10)
rowmax: The maximum percent missing data allowed in any row (default
50%). Any rows with more than 'rowmax'% missing are
imputed using the overall mean per sample.
colmax: The maximum percent missing data allowed in any column
(default 80%). If any column has more than 'colmax'% missing
data, the program halts and reports an error.
maxp: The largest block of genes imputed using the knn algorithm
inside 'impute.knn' (default 1500); larger blocks are divided
by two-means clustering (recursively) prior to imputation. If
'maxp=p', only knn imputation is done
seed: The seed used for the random number generator (default
362436069) for reproducibility.
call:
impute(data ,k = 10, rowmax = 0.5, colmax = 0.8, maxp = 1500, rng.seed=362436069)
"""
rpy.r.library("impute")
X = N.asarray(X) # cast as numeric array
m, n = scipy.shape(X)
if m>n:
print "Warning (impute): more samples than variables. running transpose"
t_flag = True
else:
X = N.transpose(X)
t_flag = False
rpy.r.assign("X", X)
rpy.r.assign("k", k)
rpy.r.assign("rmax", rowmax)
rpy.r.assign("cmax", colmax)
rpy.r.assign("maxp", maxp)
call = "out<-impute.knn(X,k=k,rowmax=rmax,colmax=cmax,maxp=maxp)"
silent_eval(call)
out = rpy.r("out")
if not t_flag:
E = out['data']
E = scipy.asarray(E)
E = E.T
else:
E = out['data']
E = scipy.asarray(E)
return E
def get_chip_annotation(org="hgu133a",annot='pmid', id_type='loc',probelist=None):
"""Returns a dictionary of annoations.
input:
org -- chip_type (see bioconductor.org)
annot -- annotation ['genename', 'pmid', ' symbol']
id_type -- id ['aff','loc']
key: id, value = list of annoations
example: '65884_at': ['15672394', '138402']
"""
_valid_annot = ['genename', 'pmid', 'symbol', 'enzyme', 'chr', 'chrloc']
if annot.lower() not in _valid_annot:
raise ValueError("Annotation must be one of %s" %_valid_annot)
silent_eval("library("+org+")")
silent_eval("dummy<-as.list("+org+annot.upper()+")")
silent_eval('annotations <- dummy[!is.na(dummy)]')
aff2annot = rpy.r("annotations")
if id_type=='loc':
aff2loc, loc2aff = get_locusid(org=org)
loc2annot = {}
for geneid, annotation in aff2annot.items():
annotation = ensure_list(annotation)
print annotation
if loc2annot.has_key(geneid):
for extra in loc2annot[geneid]:
annotation.append(extra)
print "Found duplicate in gene: %s" %geneid
loc2annot[aff2loc[geneid]] = annotation
aff2annot = loc2annot
out = {}
if probelist:
for pid in probelist:
try:
out[pid] = aff2annot.get(pid, 'none')
except:
print "Could not find id: %s" %pid
else:
out = aff2annot
return out
def ensure_list(value):
if isinstance(value, list):
return value
else:
return [value]
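A usage sketch for the mapping helpers above (assumes a working rpy/R installation with the relevant Bioconductor annotation packages; the probeset ids are illustrative):

# Affy probeset -> LocusLink/Entrez ids, and the reverse mapping.
aff2loc, loc2aff = get_locusid(probelist=['1007_s_at', '1053_at'], org='hgu133a')

# KEGG pathway memberships for the same probesets.
kegg = get_kegg_paths(org='hgu133a', id_type='aff', probelist=['1007_s_at', '1053_at'])

# Dummy-coded (genes x maps) matrix, e.g. for building a CategoryDataset.
C, maps, genes = get_kegg_as_category(org='hgu133a', probelist=['1007_s_at', '1053_at'])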

laydi/lib/__init__.py Normal file (0 lines)

laydi/lib/blmfuncs.py Normal file (1378 lines)

File diff suppressed because it is too large

laydi/lib/blmplots.py Normal file (458 lines)

@@ -0,0 +1,458 @@
"""Specialised plots for functions defined in blmfuncs.py.
fixme:
-- If scatterplot is not inited with a colorvector there will be no
colorbar, but when adding colors the colorbar should be created.
"""
from matplotlib import cm,patches
import gtk
import laydi
from laydi import plots, main, logger
import scipy
from scipy import dot,sum,diag,arange,log,mean,newaxis,sqrt,apply_along_axis,empty
from scipy.stats import corrcoef
def correlation_loadings(data, T, test=True):
""" Returns correlation loadings.
:input:
- D: [nsamps, nvars], data (non-centered data)
- T: [nsamps, a_max], Scores
:output:
- R: [nvars, a_max], Correlation loadings
:notes:
"""
nsamps, nvars = data.shape
nsampsT, a_max = T.shape
if nsamps!=nsampsT: raise IOError("D/T mismatch")
# center
data = data - data.mean(0)
R = empty((nvars, a_max),'d')
for a in range(a_max):
for k in range(nvars):
R[k,a] = corrcoef(data[:,k], T[:,a])[0,1]
return R
class BlmScatterPlot(plots.ScatterPlot):
"""Scatter plot used for scores and loadings in bilinear models."""
def __init__(self, title, model, absi=0, ordi=1, part_name='T', color_by=None):
self.model = model
if model.model.has_key(part_name)!=True:
raise ValueError("Model part: %s not found in model" %mod_param)
self._T = model.model[part_name]
if self._T.shape[1]==1:
logger.log('notice', 'Scores have only one component')
absi= ordi = 0
self._absi = absi
self._ordi = ordi
self._cmap = cm.summer
dataset_1 = model.as_dataset(part_name)
id_dim = dataset_1.get_dim_name(0)
sel_dim = dataset_1.get_dim_name(1)
id_1, = dataset_1.get_identifiers(sel_dim, [absi])
id_2, = dataset_1.get_identifiers(sel_dim, [ordi])
col = 'b'
if model.model.has_key(color_by):
col = model.model[color_by].ravel()
plots.ScatterPlot.__init__(self, dataset_1, dataset_1, id_dim, sel_dim, id_1, id_2 ,c=col ,s=40 , name=title)
self._mappable.set_cmap(self._cmap)
self.sc = self._mappable
self.add_pc_spin_buttons(self._T.shape[1], absi, ordi)
def set_facecolor(self, colors):
"""Set patch facecolors.
"""
pass
def set_alphas(self, alphas):
"""Set alpha channel for all patches."""
pass
def set_sizes(self, sizes):
"""Set patch sizes."""
pass
def set_expvar_axlabels(self, param=None):
if param == None:
param = self._expvar_param
else:
self._expvar_param = param
if not self.model.model.has_key(param):
self.model.model[param] = None
if self.model.model[param]==None:
logger.log('notice', 'Param: %s not in model' %param)
print self.model.model.keys()
print self.model.model[param]
pass #fixme: do expvar calc here if not present
else:
expvar = self.model.model[param]
xstr = "Comp: %s , %.1f " %(self._absi, expvar[self._absi+1])
ystr = "Comp: %s , %.1f " %(self._ordi, expvar[self._ordi+1])
self.axes.set_xlabel(xstr)
self.axes.set_ylabel(ystr)
def add_pc_spin_buttons(self, amax, absi, ordi):
sb_a = gtk.SpinButton(climb_rate=1)
sb_a.set_range(1, amax)
sb_a.set_value(absi+1)
sb_a.set_increments(1, 5)
sb_a.connect('value_changed', self.set_absicca)
sb_o = gtk.SpinButton(climb_rate=1)
sb_o.set_range(1, amax)
sb_o.set_value(ordi+1)
sb_o.set_increments(1, 5)
sb_o.connect('value_changed', self.set_ordinate)
hbox = gtk.HBox()
gtk_label_a = gtk.Label("A:")
gtk_label_o = gtk.Label(" O:")
toolitem = gtk.ToolItem()
toolitem.set_expand(False)
toolitem.set_border_width(2)
toolitem.add(hbox)
hbox.pack_start(gtk_label_a)
hbox.pack_start(sb_a)
hbox.pack_start(gtk_label_o)
hbox.pack_start(sb_o)
self._toolbar.insert(toolitem, -1)
toolitem.set_tooltip(self._toolbar.tooltips, "Set Principal component")
self._toolbar.show_all() #do i need this?
def set_absicca(self, sb):
self._absi = sb.get_value_as_int() - 1
xy = self._T[:,[self._absi, self._ordi]]
self.xaxis_data = xy[:,0]
self.yaxis_data = xy[:,1]
self.sc._offsets = xy
self.selection_collection._offsets = xy
self.canvas.draw_idle()
pad = abs(self.xaxis_data.min()-self.xaxis_data.max())*0.05
new_lims = (self.xaxis_data.min() - pad, self.xaxis_data.max() + pad)
self.axes.set_xlim(new_lims, emit=True)
self.set_expvar_axlabels()
self.canvas.draw_idle()
def set_ordinate(self, sb):
self._ordi = sb.get_value_as_int() - 1
xy = self._T[:,[self._absi, self._ordi]]
self.xaxis_data = xy[:,0]
self.yaxis_data = xy[:,1]
self.sc._offsets = xy
self.selection_collection._offsets = xy
pad = abs(self.yaxis_data.min()-self.yaxis_data.max())*0.05
new_lims = (self.yaxis_data.min() - pad, self.yaxis_data.max() + pad)
self.axes.set_ylim(new_lims, emit=True)
self.set_expvar_axlabels()
self.canvas.draw_idle()
def show_labels(self, index=None):
if self._text_labels == None:
x = self.xaxis_data
y = self.yaxis_data
self._text_labels = {}
for name, n in self.dataset_1[self.current_dim].items():
txt = self.axes.text(x[n],y[n], name)
txt.set_visible(False)
self._text_labels[n] = txt
if index!=None:
self.hide_labels()
for indx,txt in self._text_labels.items():
if indx in index:
txt.set_visible(True)
self.canvas.draw_idle()
def hide_labels(self):
for txt in self._text_labels.values():
txt.set_visible(False)
self.canvas.draw_idle()
class PcaScreePlot(plots.BarPlot):
def __init__(self, model):
title = "Pca, (%s) Scree" %model._dataset['X'].get_name()
ds = model.as_dataset('eigvals')
if ds==None:
logger.log('notice', 'Model does not contain eigvals')
plots.BarPlot.__init__(self, ds, name=title)
class PcaScorePlot(BlmScatterPlot):
def __init__(self, model, absi=0, ordi=1):
title = "Pca scores (%s)" %model._dataset['X'].get_name()
BlmScatterPlot.__init__(self, title, model, absi, ordi, 'T')
self.set_expvar_axlabels(param="expvarx")
class PcaLoadingPlot(BlmScatterPlot):
def __init__(self, model, absi=0, ordi=1):
title = "Pca loadings (%s)" %model._dataset['X'].get_name()
BlmScatterPlot.__init__(self, title, model, absi, ordi, part_name='P', color_by='p_tsq')
self.set_expvar_axlabels(param="expvarx")
class PlsScorePlot(BlmScatterPlot):
def __init__(self, model, absi=0, ordi=1):
title = "Pls scores (%s)" %model._dataset['X'].get_name()
BlmScatterPlot.__init__(self, title, model, absi, ordi, 'T')
class PlsXLoadingPlot(BlmScatterPlot):
def __init__(self, model, absi=0, ordi=1):
title = "Pls x-loadings (%s)" %model._dataset['X'].get_name()
BlmScatterPlot.__init__(self, title, model, absi, ordi, part_name='P', color_by='w_tsq')
#self.set_expvar_axlabels(self, param="expvarx")
class PlsYLoadingPlot(BlmScatterPlot):
def __init__(self, model, absi=0, ordi=1):
title = "Pls y-loadings (%s)" %model._dataset['Y'].get_name()
BlmScatterPlot.__init__(self, title, model, absi, ordi, part_name='Q')
class PlsCorrelationLoadingPlot(BlmScatterPlot):
def __init__(self, model, absi=0, ordi=1):
title = "Pls correlation loadings (%s)" %model._dataset['X'].get_name()
BlmScatterPlot.__init__(self, title, model, absi, ordi, part_name='CP')
class LplsScorePlot(BlmScatterPlot):
def __init__(self, model, absi=0, ordi=1):
title = "L-pls scores (%s)" %model._dataset['X'].get_name()
BlmScatterPlot.__init__(self, title, model, absi, ordi, 'T')
self.set_expvar_axlabels("evx")
class LplsXLoadingPlot(BlmScatterPlot):
def __init__(self, model, absi=0, ordi=1):
title = "Lpls x-loadings (%s)" %model._dataset['X'].get_name()
BlmScatterPlot.__init__(self, title, model, absi, ordi, part_name='P', color_by='tsqx')
self.set_expvar_axlabels("evx")
class LplsZLoadingPlot(BlmScatterPlot, plots.PlotThresholder):
def __init__(self, model, absi=0, ordi=1):
title = "Lpls z-loadings (%s)" %model._dataset['Z'].get_name()
BlmScatterPlot.__init__(self, title, model, absi, ordi, part_name='L', color_by='tsqz')
self.set_expvar_axlabels(param="evz")
plots.PlotThresholder.__init__(self, "IC")
def _update_color_from_dataset(self, ds):
BlmScatterPlot._update_color_from_dataset(self, ds)
self.set_threshold_dataset(ds)
class LplsXCorrelationPlot(BlmScatterPlot):
def __init__(self, model, absi=0, ordi=1):
title = "Lpls x-corr. loads (%s)" %model._dataset['X'].get_name()
if not model.model.has_key('Rx'):
R = correlation_loadings(model._data['X'], model.model['T'])
model.model['Rx'] = R
BlmScatterPlot.__init__(self, title, model, absi, ordi, part_name='Rx')
self.set_expvar_axlabels("evx")
radius = 1
center = (0,0)
c100 = patches.Circle(center,radius=radius,
facecolor='gray',
alpha=.1,
zorder=1)
c50 = patches.Circle(center, radius= sqrt(radius/2.0),
facecolor='gray',
alpha=.1,
zorder=2)
self.axes.add_patch(c100)
self.axes.add_patch(c50)
self.axes.axhline(lw=1.5,color='k')
self.axes.axvline(lw=1.5,color='k')
self.axes.set_xlim([-1.05,1.05])
self.axes.set_ylim([-1.05, 1.05])
self.canvas.show()
class LplsZCorrelationPlot(BlmScatterPlot):
def __init__(self, model, absi=0, ordi=1):
title = "Lpls z-corr. loads (%s)" %model._dataset['Z'].get_name()
if not model.model.has_key('Rz'):
R = correlation_loadings(model._data['Z'].T, model.model['W'])
model.model['Rz'] = R
BlmScatterPlot.__init__(self, title, model, absi, ordi, part_name='Rz')
self.set_expvar_axlabels("evz")
radius = 1
center = (0,0)
c100 = patches.Circle(center,radius=radius,
facecolor='gray',
alpha=.1,
zorder=1)
c50 = patches.Circle(center, radius=sqrt(radius/2.0),
facecolor='gray',
alpha=.1,
zorder=2)
self.axes.add_patch(c100)
self.axes.add_patch(c50)
self.axes.axhline(lw=1.5,color='k')
self.axes.axvline(lw=1.5,color='k')
self.axes.set_xlim([-1.05,1.05])
self.axes.set_ylim([-1.05, 1.05])
self.canvas.show()
class LplsHypoidCorrelationPlot(BlmScatterPlot):
def __init__(self, model, absi=0, ordi=1):
title = "Hypoid correlations(%s)" %model._dataset['X'].get_name()
BlmScatterPlot.__init__(self, title, model, absi, ordi, part_name='W')
class LplsExplainedVariancePlot(plots.Plot):
def __init__(self, model):
self.model = model
plots.Plot.__init__(self, "Explained variance")
xax = scipy.arange(model.model['evx'].shape[0])
self.axes.plot(xax, model.model['evx'], 'b-', label='X', linewidth=1.5)
self.axes.plot(xax, model.model['evy'], 'k-', label='Y', linewidth=1.5)
self.axes.plot(xax, model.model['evz'], 'g-', label='Z', linewidth=1.5)
self.canvas.draw()
class LineViewXc(plots.LineViewPlot):
"""A line view of centered raw data
"""
def __init__(self, model, name='Profiles'):
dx = model._dataset['X']
plots.LineViewPlot.__init__(self, dx, 1, None, False,name)
self.add_center_check_button(self.data_is_centered)
def add_center_check_button(self, ticked):
"""Add a checker button for centerd view of data."""
cb = gtk.CheckButton("Center")
cb.set_active(ticked)
cb.connect('toggled', self._toggle_center)
toolitem = gtk.ToolItem()
toolitem.set_expand(False)
toolitem.set_border_width(2)
toolitem.add(cb)
self._toolbar.insert(toolitem, -1)
toolitem.set_tooltip(self._toolbar.tooltips, "Column center the line view")
self._toolbar.show_all() #do i need this?
def _toggle_center(self, active):
if self.data_is_centered:
self._data = self._data + self._mn_data
self.data_is_centered = False
else:
self._mn_data = self._data.mean(0)
self._data = self._data - self._mn_data
self.data_is_centered = True
self.make_lines()
self.set_background()
self.set_current_selection(main.project.get_selection())
class ParalellCoordinates(plots.Plot):
"""Parallell coordinates for score loads with many comp.
"""
def __init__(self, model, p='loads'):
pass
class PlsQvalScatter(plots.ScatterPlot):
"""A vulcano like plot of loads vs qvals
"""
def __init__(self, model, pc=0):
if not model.model.has_key('w_tsq'):
return None
self._W = model.model['W']
dataset_1 = model.as_dataset('W')
dataset_2 = model.as_dataset('w_tsq')
id_dim = dataset_1.get_dim_name(0) #genes
sel_dim = dataset_1.get_dim_name(1) #_comp
sel_dim_2 = dataset_2.get_dim_name(1) #_zero_dim
id_1, = dataset_1.get_identifiers(sel_dim, [0])
id_2, = dataset_2.get_identifiers(sel_dim_2, [0])
if model.model.has_key('w_tsq'):
col = model.model['w_tsq'].ravel()
#col = normalise(col)
else:
col = 'g'
plots.ScatterPlot.__init__(self, dataset_1, dataset_2,
id_dim, sel_dim, id_1, id_2,
c=col, s=20, sel_dim_2=sel_dim_2,
name='Load Volcano')
class PredictionErrorPlot(plots.Plot):
"""A boxplot of prediction error vs. comp. number.
"""
def __init__(self, model, name="Prediction Error"):
if not model.model.has_key('sep'):
logger.log('notice', 'Model has no calculations of sep')
return None
plots.Plot.__init__(self, name)
self._frozen = True
self.current_dim = 'johndoe'
self.axes = self.fig.add_subplot(111)
# draw
sep = model.model['sep']
aopt = model.model['aopt']
bx_plot_lines = self.axes.boxplot(sqrt(sep))
aopt_marker = self.axes.axvline(aopt, linewidth=10,
color='r',zorder=0,
alpha=.5)
# add canvas
self.add(self.canvas)
self.canvas.show()
def set_current_selection(self, selection):
pass
class TRBiplot(BlmScatterPlot):
def __init__(self, model, absi=0, ordi=1):
title = "Target rotation biplot(%s)" %model._dataset['X'].get_name()
BlmScatterPlot.__init__(self, title, model, absi, ordi, 'B')
B = model.model.get('B')
# normalize B
Bnorm = scipy.apply_along_axis(scipy.linalg.norm, 1, B)
x = model._dataset['X'].copy()
Xc = x._array - mean(x._array,0)[newaxis]
w_rot = B/Bnorm
t_rot = dot(Xc, w_rot)
class InfluencePlot(plots.ScatterPlot):
""" Returns a leverage vs resiudal scatter plot.
"""
def __init__(self, model, dim, name="Influence"):
if not model.model.has_key('levx'):
logger.log('notice', 'Model has no calculations of leverages')
return
if not model.model.has_key('ssqx'):
logger.log('notice', 'Model has no calculations of residuals')
return
ds1 = model.as_dataset('levx')
ds2 = model.as_dataset('ssqx')
id_dim = ds1.get_dim_name(0)
sel_dim = ds1.get_dim_name(1)
sel_dim_2 = ds2.get_dim_name(1)
id_1, = ds1.get_identifiers(sel_dim, [0])
id_2, = ds2.get_identifiers(sel_dim_2, [0])
plots.ScatterPlot.__init__(self, ds1, ds2,
id_dim, sel_dim, id_1, id_2,
s=20, sel_dim_2=sel_dim_2,
name=name)
class RMSEPPlot(plots.BarPlot):
def __init__(self, model, name="RMSEP"):
if not model.model.has_key('rmsep'):
logger.log('notice', 'Model has no calculations of sep')
return
dataset = model.as_dataset('rmsep')
plots.BarPlot.__init__(self, dataset, name=name)
def normalise(x):
"""Scale vector x to [0,1]
"""
x = x - x.min()
x = x/x.max()
return x
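A small numeric sketch of correlation_loadings and normalise above (random data, no GUI needed; assumes numpy is available alongside scipy):

from numpy.random import rand

X = rand(20, 5)           # 20 samples, 5 variables (non-centered)
T = rand(20, 2)           # scores for a 2-component model
R = correlation_loadings(X, T)
print R.shape             # -> (5, 2): one correlation per variable/component
print normalise(R[:, 0])  # scaled to [0, 1]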

laydi/lib/cv_index.py Normal file (66 lines)

@@ -0,0 +1,66 @@
from numpy import array_split,arange
def cv(n, k, randomise=False, sequential=False):
"""
Generates k (training, validation) index pairs.
Each pair is a partition of arange(n), where validation is an iterable
of length ~n/k.
If randomise is true, a copy of the index is shuffled before partitioning;
otherwise its order is preserved in training and validation.
Randomise overrides the sequential argument: if randomise is true,
sequential is forced to False.
If sequential is true the index is partitioned in contiguous blocks,
otherwise interleaved ordering is used.
"""
index = xrange(n)
if randomise:
from random import shuffle
index = list(index)
shuffle(index)
sequential = False
if sequential:
for validation in array_split(index, k):
training = [i for i in index if i not in validation]
yield training, validation
else:
for fold in xrange(k):
training = [i for i in index if i % k != fold]
validation = [i for i in index if i % k == fold]
yield training, validation
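# Usage sketch (illustrative): with randomise=False and sequential=False the
# folds are deterministic and interleaved.
#
#   >>> for train, val in cv(9, 3):
#   ...     print val
#   [0, 3, 6]
#   [1, 4, 7]
#   [2, 5, 8]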
def shuffle_diag(shape, K, randomise=False, sequential=False):
"""
Generates K index sets along (shifted) diagonals of an (m, n) matrix.
"""
m, n = shape
if K>m or K>n:
msg = "You may not use more subsets than max(n_rows, n_cols)"
raise ValueError, msg
mon = max(m, n)
#index = xrange(n)
index = [i for i in range(m*n) if i % m == 0]
if randomise:
from random import shuffle
index = list(index)
shuffle(index)
sequential = False
if sequential:
start_ind_sets = array_split(index, K)
else:
start_ind_sets = [[index[i] for i in xrange(n) if i % K == k]
for k in xrange(K)]
for start_inds in start_ind_sets:
for start in start_inds:
ind = arange(start, n*m, mon+1)
yield ind

438
laydi/lib/cx_stats.py Normal file
View File

@@ -0,0 +1,438 @@
import time
import cPickle
from scipy import zeros,zeros_like,sqrt,dot,trace,sign,round_,argmax,\
sort,ravel,newaxis,asarray,diag,sum,outer,argsort,arange,ones_like,\
all,apply_along_axis,eye,atleast_2d,empty
from scipy.linalg import svd,inv,norm,det,sqrtm
from scipy.stats import mean,median
#import plots_lpls
from cx_utils import mat_center
from validation import pls_jkW, lpls_jk
from select_generators import shuffle_1d
from engines import pca, pls, bridge
from engines import nipals_lpls as lpls
def hotelling(Pcv, P, p_center='med', cov_center='med',
alpha=0.3, crot=True, strict=False):
"""Returns regularized hotelling T^2.
alpha -- regularisation towards pooled cov estimates
beta -- regularisation for unstable eigenvalues
p_center -- location method for submodels
cov_center -- location method for sub coviariances
alpha -- regularisation
crot -- rotate submodels toward full?
strict -- only rotate 90 degree ?
"""
m, n = P.shape
n_sets, n, amax = Pcv.shape
# allocate
T_sq = empty((n, ),dtype='d')
Cov_i = zeros((n, amax, amax),dtype='d')
# rotate sub_models to full model
if crot:
for i, Pi in enumerate(Pcv):
Pcv[i] = procrustes(P, Pi, strict=strict)
# center of pnull
if p_center=='med':
P_ctr = median(Pcv, 0)
elif p_center=='mean':
# fixme: mean is unstable
P_ctr = mean(Pcv, 0)
else: #use full
P_ctr = P
for i in xrange(n):
Pi = Pcv[:,i,:] # (n_sets x amax)
Pi_ctr = P_ctr[i,:] # (1 x amax)
Pim = (Pi - Pi_ctr[newaxis])*sqrt(n_sets-1)
Cov_i[i] = (1./n_sets)*dot(Pim.T, Pim)
if cov_center == 'med':
Cov = median(Cov_i, 0)
else:
Cov = mean(Cov_i, 0)
reg_cov = (1. - alpha)*Cov_i + alpha*Cov
for i in xrange(n):
#Pc = P_ctr[i,:][:,newaxis]
Pc = P_ctr[i,:]
sigma = reg_cov[i]
# T_sq[i] = (dot(Pc, inv(sigma) )*Pc).sum() #slow
T_sq[i] = dot(dot(Pc, inv(sigma)), Pc) # dont need to care about transposes
#T_sq[i] = dot(dot(Pc.T, inv(sigma)), Pc).ravel()
return T_sq
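# Usage sketch (illustrative), mirroring the pattern in pls_qvals below:
# jackknifed weights from pls_jkW plus the full-model weights give a
# regularised T^2 per variable. ac, bc are centered data matrices.
#
#   >>> dat = pls(ac, bc, aopt, 'loads', 'fast')
#   >>> Wcv = pls_jkW(ac, bc, aopt)
#   >>> tsq = hotelling(Wcv, dat['W'], alpha=0.3)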
def procrustes(A, B, strict=True, center=False, verbose=False):
"""Rotation of B to A.
strict -- Only do flipping and shuffling
center -- Center before rotation, translate back after
verbose -- Print ssq
No scaling calculated.
Output B_rot = Rotated B
"""
if center:
A,mn_A = mat_center(A, ret_mn=True)
B,mn_B = mat_center(B, ret_mn=True)
u,s,vh = svd(dot(B.T, A))
v = vh.T
Cm = dot(u, v.T) #orthogonal rotation matrix
if strict: # just inverting and flipping
Cm = ensure_strict(Cm)
b_rot = dot(B, Cm)
if verbose:
print Cm.round()
fit = sum(ravel(B - b_rot)**2)
print "Sum of squares: %s" %fit
if center:
return mn_B + b_rot
else:
return b_rot
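# Sanity-check sketch (illustrative): a pure column flip is undone by a
# strict procrustes rotation, so B_rot should equal A.
#
#   >>> from scipy import rand
#   >>> A = rand(10, 3)
#   >>> B = A.copy(); B[:,0] = -B[:,0]
#   >>> B_rot = procrustes(A, B, strict=True)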
def expl_var_x(Xc, T):
"""Returns explained variance of X.
T should carry variance in length, Xc has zero col-mean.
"""
exp_var_x = diag(dot(T.T, T))*100/(sum(Xc**2))
return exp_var_x
def expl_var_y(Y, T, Q):
"""Returns explained variance of Y.
"""
# centered Y
exp_var_y = zeros((Q.shape[1], ))
for a in range(Q.shape[1]):
Ya = outer(T[:,a], Q[:,a])
exp_var_y[a] = 100*sum(Ya**2)/sum(Y**2)
return exp_var_y
def pls_qvals(a, b, aopt=None, alpha=.3,
n_iter=20, algo='pls',
center=True,
sim_method='shuffle',
p_center='med', cov_center='med',
crot=True, strict=False):
"""Returns qvals for pls model.
input:
a -- data matrix
b -- data matrix
aopt -- scalar, opt. number of components
alpha -- [0,1] regularisation parameter for T2-test
n_iter -- number of permutations
sim_method -- permutation method ['shuffle']
p_center -- location estimator for sub models ['med']
cov_center -- location estimator for covariance of submodels ['med']
crot -- bool, use rotations of sub models?
strict -- bool, use strict (rot/flips only) rotations?
"""
m, n = a.shape
TSQ = zeros((n, n_iter), dtype='d') # (nvars x n_subsets)
n_false = zeros((n, n_iter), dtype='d')
#full model
if center:
ac = a - a.mean(0)
bc = b - b.mean(0)
else:
ac, bc = a, b
if algo=='bridge':
dat = bridge(ac, bc, aopt, 'loads', 'fast')
else:
dat = pls(ac, bc, aopt, 'loads', 'fast')
Wcv = pls_jkW(a, b, aopt, n_blocks=None, algo=algo,center=True)
tsq_full = hotelling(Wcv, dat['W'], p_center=p_center,
alpha=alpha, crot=crot, strict=strict,
cov_center=cov_center)
#t0 = time.time()
Vs = shuffle_1d(bc, n_iter, axis=0)
for i, b_shuff in enumerate(Vs):
#t1 = time.time()
if algo=='bridge':
dat = bridge(ac, b_shuff, aopt, 'loads','fast')
else:
dat = pls(ac, b_shuff, aopt, 'loads', 'fast')
Wcv = pls_jkW(a, b_shuff, aopt, n_blocks=None, algo=algo)
TSQ[:,i] = hotelling(Wcv, dat['W'], p_center=p_center,
alpha=alpha, crot=crot, strict=strict,
cov_center=cov_center)
#print time.time() - t1
return fdr(tsq_full, TSQ, 'median')
def ensure_strict(C, only_flips=True):
"""Ensure that a rotation matrix does only 90 degree rotations.
In multiplication with pcs this allows flips and reordering.
if only_flips is True only flips are allowed
"""
Cm = C
S = sign(C) # signs
if only_flips==True:
C = eye(Cm.shape[0])*S
return C
Cm = zeros_like(C)
Cm.putmask(1.,abs(C)>.6)
if det(Cm)>1:
raise ValueError,"Implement this!"
return Cm*S
def pls_qvals_II(a, b, aopt=None, center=True, alpha=.3,
n_iter=20, algo='pls',
sim_method='shuffle',
p_center='med', cov_center='med',
crot=True, strict=False):
"""Returns qvals for pls model.
Shuffling of variables in X.
Null model is 'If I put genes randomly on the network' ... if they are still
significant, this is due to network structure and not covariance with the response.
input:
a -- data matrix
b -- data matrix
aopt -- scalar, opt. number of components
alpha -- [0,1] regularisation parameter for T2-test
n_iter -- number of permutations
sim_method -- permutation method ['shuffle']
p_center -- location estimator for sub models ['med']
cov_center -- location estimator for covariance of submodels ['med']
crot -- bool, use rotations of sub models?
strict -- bool, use strict (rot/flips only) rotations?
"""
m, n = a.shape
TSQ = zeros((n, n_iter), dtype='<f8') # (nvars x n_subsets)
n_false = zeros((n, n_iter), dtype='<f8')
#full model
# center?
if center==True:
ac = a - a.mean(0)
bc = b - b.mean(0)
else:
ac, bc = a, b
if algo=='bridge':
dat = bridge(ac, bc, aopt, 'loads', 'fast')
else:
dat = pls(ac, bc, aopt, 'loads', 'fast')
Wcv = pls_jkW(a, b, aopt, n_blocks=None, algo=algo)
tsq_full = hotelling(Wcv, dat['W'], p_center=p_center,
alpha=alpha, crot=crot, strict=strict,
cov_center=cov_center)
t0 = time.time()
Vs = shuffle_1d(a, n_iter, 1)
for i, a_shuff in enumerate(Vs):
t1 = time.time()
a = a_shuff - a_shuff.mean(0)
if algo=='bridge':
dat = bridge(a, b, aopt, 'loads','fast')
else:
dat = pls(a, b, aopt, 'loads', 'fast')
Wcv = pls_jkW(a, b, aopt, n_blocks=None, algo=algo)
TSQ[:,i] = hotelling(Wcv, dat['W'], p_center=p_center,
alpha=alpha, crot=crot, strict=strict,
cov_center=cov_center)
print time.time() - t1
sort_index = argsort(tsq_full)[::-1]
back_sort_index = sort_index.argsort()
print time.time() - t0
# count false positives
tsq_full_sorted = tsq_full.take(sort_index)
for i in xrange(n_iter):
for j in xrange(n):
n_false[j,i] = sum(TSQ[:,i]>=tsq_full[j])
false_pos = median(n_false, 1)
ll = arange(1, len(false_pos)+1, 1)
sort_qval = false_pos.take(sort_index)/ll
qval = false_pos/ll.take(back_sort_index)
print time.time() - t0
#return qval, false_pos, TSQ, tsq_full
return qval
def leverage(aopt=1,*args):
"""Returns leverages
input : aopt, number of components to base leverage calculations on
*args, matrices of normed blm-paramters
output: leverages
For PCA typical inputs are normalised T or normalised P
For PLSR typical inputs are normalised T or normalised W
"""
if aopt<1:
raise ValueError,"Leverages only make sense for aopt>0"
lev = []
for u in args:
lev_u = 1./u.shape[0] + dot(u[:,:aopt], u[:,:aopt].T).diagonal()
lev.append(lev_u)
return lev
def variances(a, t, p):
"""Returns explained variance and ind. var from blm-params.
input:
a -- full centered matrix
t,p -- parameters from a bilinear approx of the above matrix.
output:
var -- variance of each component
var_exp -- cumulative explained variance in percentage
Typical inputs are: X(centered),T,P for PCA or
X(centered),T,P / Y(centered),T,Q for PLSR.
"""
tot_var = sum(a**2)
var = 100*(sum(p**2, 0)*sum(t**2, 0))/tot_var
var_exp = var.cumsum()
return var, var_exp
def residual_diagnostics(Y, Yhat, aopt=1):
"""Root mean errors and press values.
R2 vals
"""
pass
def ssq(E, axis=0, weights=None):
"""Sum of squares, supports weights."""
n = E.shape[axis]
if weights==None:
weights = eye(n)
else:
weights = diag(weights)
if axis==0:
Ew = dot(weights, E)
elif axis==1:
Ew = dot(E, weights)
else:
raise NotImplementedError, "Higher order modes not supported"
return pow(Ew,2).sum(axis)
def vnorm(x):
"""Returns the euclidian norm of a vector.
This is considerably faster than linalg.norm
"""
return sqrt(dot(x,x.conj()))
def mahalanobis(a, loc=None, acov=None, invcov=None):
"""Returns the distance of each observation in a
from the location estimate (loc) of the data,
relative to the shape of the data.
a : data matrix (n observations in rows, p variables in columns)
loc : location estimate of the data (p-dimensional vector)
acov or invcov : scatter estimate of the data, or the inverse of the scatter estimate (p x p matrix)
:Returns:
A vector containing the distances of all the observations to loc.
"""
n, p = a.shape
if loc==None:
loc = a.mean(0)
loc = atleast_2d(loc)
if loc.shape[1]==1:
loc = loc.T; #ensure rowvector
assert(loc.shape[1]==p)
xc = a - loc
if acov==None and invcov==None:
acov = dot(xc.T, xc)
if invcov != None:
covmat = atleast_2d(invcov)
if min(covmat.shape)==1:
covmat = diag(invcov.ravel())
else:
covmat = atleast_2d(acov)
if min(covmat.shape)==1:
covmat = diag(covmat.ravel())
covmat = inv(covmat)
# mdist = diag(dot(dot(xc, covmat),xc.T))
mdist = (dot(xc, covmat)*xc).sum(1)
return mdist
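# Usage sketch (illustrative): with the defaults, loc is the column mean and
# the scatter is estimated from the data itself.
#
#   >>> from scipy import rand
#   >>> a = rand(20, 3)
#   >>> d = mahalanobis(a)   # vector of 20 squared distances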
def lpls_qvals(a, b, c, aopt=None, alpha=.3, zx_alpha=.5, n_iter=20,
sim_method='shuffle',p_center='med', cov_center='med',crot=True,
strict=False, mean_ctr=[2,0,2], nsets=None):
"""Returns qvals for l-pls model.
input:
a -- data matrix
b -- data matrix
c -- data matrix
aopt -- scalar, opt. number of components
alpha -- [0,1] regularisation parameter for T2-test
zx_alpha -- [0,1] how much Z information to include
n_iter -- number of permutations
sim_method -- permutation method ['shuffle']
p_center -- location estimator for sub models ['med']
cov_center -- location estimator for covariance of submodels ['med']
crot -- bool, use rotations of sub models?
strict -- bool, use strict (rot/flips only) rotations?
"""
m, n = a.shape
p, k = c.shape
pert_tsq_x = zeros((n, n_iter), dtype='d') # (nxvars x n_subsets)
pert_tsq_z = zeros((p, n_iter), dtype='d') # (nzvars x n_subsets)
# Full model
#print "Full model start"
dat = lpls(a, b, c, aopt, scale='loads', mean_ctr=mean_ctr)
Wc, Lc = lpls_jk(a, b, c , aopt, nsets=nsets)
#print "Full hot"
cal_tsq_x = hotelling(Wc, dat['W'], alpha = alpha)
cal_tsq_z = hotelling(Lc, dat['L'], alpha = 0)
# Perturbations
Vs = shuffle_1d(b, n_iter, axis=0)
for i, b_shuff in enumerate(Vs):
print i
dat = lpls(a, b_shuff,c, aopt, scale='loads', mean_ctr=mean_ctr)
Wi, Li = lpls_jk(a, b_shuff, c, aopt, nsets=nsets)
pert_tsq_x[:,i] = hotelling(Wi, dat['W'], alpha=alpha)
pert_tsq_z[:,i] = hotelling(Li, dat['L'], alpha=alpha)
return cal_tsq_z, pert_tsq_z, cal_tsq_x, pert_tsq_x
def fdr(tsq, tsqp, loc_method='mean'):
n, = tsq.shape
k, m = tsqp.shape
assert(n==k)
n_false = empty((n, m), 'd')
sort_index = argsort(tsq)[::-1]
r_index = argsort(sort_index)
for i in xrange(m):
for j in xrange(n):
n_false[j,i] = (tsqp[:,i]>tsq[j]).sum()
#cPickle.dump(n_false, open("/tmp/nfalse.dat_"+str(n), "w"))
if loc_method=='mean':
fp = mean(n_false,1)
elif loc_method == 'median':
fp = median(n_false.T)
else:
raise ValueError
n_signif = (arange(n) + 1.0)[r_index]
fd_rate = fp/n_signif
return fd_rate
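# Usage sketch (illustrative): q-values from a calibrated statistic and its
# permutation distribution, as built in pls_qvals above.
#
#   >>> qvals = fdr(tsq_full, TSQ, loc_method='median')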

115
laydi/lib/cx_utils.py Normal file
View File

@@ -0,0 +1,115 @@
from scipy import apply_along_axis,newaxis,zeros,\
round_,nonzero,dot,argmax,any,sqrt,ndarray,\
trace,zeros_like,sign,sort,real,argsort,rand,array,\
matrix,nan
from scipy.linalg import norm,svd,inv,eig
from scipy.stats import median,mean
def normalise(a, axis=0, return_scales=False):
s = apply_along_axis(norm, axis, a)
if axis==0:
s = s[newaxis]
else:
s = s[:,newaxis]
a_s = a/s
if return_scales:
return a_s, s
return a_s
def sub2ind(shape, i, j):
"""Indices from subscripts. Only support for 2d"""
row,col = shape
ind = []
for k in xrange(len(i)):
for m in xrange(len(j)):
ind.append(i[k]*col + j[m])
return ind
def sorted_eig(a, b=None,sort_by='sm'):
"""
Just eig with real part of output sorted:
This is for convenience only, not general!
sort_by='sm': return the eigenvectors by eigenvalues
of smallest magnitude first. (default)
'lm': returns largest eigenvalues first
output: just as eig with 2 outputs
-- s,v (eigvals,eigenvectors)
(This is reversed output compared to matlab)
"""
s,v = eig(a, b)
s = real(s) # dont expect any imaginary part
v = real(v)
ind = argsort(s)
if sort_by=='lm':
ind = ind[::-1]
v = v.take(ind, 1)
s = s.take(ind)
return s,v
def str2num(string_number):
"""Convert input (string number) into number, if float(string_number) fails, a nan is inserted.
"""
missings = ['','nan','NaN','NA']
try:
num = float(string_number)
except:
if string_number in missings:
num = nan
else:
print "Found strange entry: %s" %string_number
raise
return num
def randperm(n):
"""Return a random permutation of range(n)."""
# argsort of i.i.d. uniforms is a uniformly random permutation
return rand(n).argsort().astype('i')
def mat_center(X,axis=0,ret_mn=False):
"""Mean center matrix along axis.
X -- matrix, data
axis -- dim,
ret_mn -- bool, return mean
output:
Xc, [mnX]
NB: axis = 1 is column-centering, axis=0=row-centering
default is row centering (axis=0)
"""
try:
rows,cols = X.shape
except ValueError:
print "The X data needs to be two-dimensional"
if axis==0:
mnX = mean(X,axis)[newaxis]
Xs = X - mnX
elif axis==1:
mnX = mean(X,axis)[newaxis]
Xs = (X.T - mnX).T
if ret_mn:
return Xs,mnX
else:
return Xs
def m_shape(array):
"""Returns the array shape on the form of a numpy.matrix."""
return matrix(array).shape

879
laydi/lib/engines.py Normal file
View File

@@ -0,0 +1,879 @@
"""Module contain algorithms for low-rank models.
There is almost no typechecking of any kind here, just focus on speed
"""
import math
import warnings
from scipy.linalg import svd,inv
from scipy import dot,empty,eye,newaxis,zeros,sqrt,diag,\
apply_along_axis,mean,ones,randn,empty_like,outer,r_,c_,\
rand,sum,cumsum,matrix, expand_dims,minimum,where,arange,inner,tile
has_sym = True
has_arpack = True
try:
from symeig import symeig
except ImportError:
has_sym = False
try:
from scipy.sandbox import arpack
except ImportError:
has_arpack = False
def pca(a, aopt,scale='scores',mode='normal',center_axis=0):
""" Principal Component Analysis.
Performs PCA on given matrix and returns results in a dictionary.
:Parameters:
a : array
Data measurement matrix, (samples x variables)
aopt : int
Number of components to use, aopt<=min(samples, variables)
:Returns:
results : dict
keys -- values, T -- scores, P -- loadings, E -- residuals,
lev --leverages, ssq -- sum of squares, expvar -- cumulative
explained variance, aopt -- number of components used
:OtherParameters:
mode : str
Amount of info retained, ('fast', 'normal', 'detailed')
center_axis : int
Center along given axis. If neg.: no centering (-inf,..., matrix modes)
:SeeAlso:
- pcr : other blm
- pls : other blm
- lpls : other blm
Notes
-----
Uses kernel speed-up if m>>n or m<<n.
If residuals turn rank deficient, a lower number of components than given
in input will be used. The number of components used is given in
results-dict.
Examples
--------
>>> import scipy,engines
>>> a=scipy.asarray([[1,2,3],[2,4,5]])
>>> dat=engines.pca(a, 2)
>>> dat['expvarx']
array([0.,99.8561562, 100.])
"""
m, n = a.shape
assert(aopt<=min(m,n))
if center_axis>=0:
a = a - expand_dims(a.mean(center_axis), center_axis)
if m>(n+100) or n>(m+100):
u, s, v = esvd(a, amax=None) # fixme:amax option need to work with expl.var
else:
u, s, vt = svd(a, 0)
v = vt.T
e = s**2
tol = 1e-10
eff_rank = sum(s>s[0]*tol)
aopt = minimum(aopt, eff_rank)
T = u*s
s = s[:aopt]
T = T[:,:aopt]
P = v[:,:aopt]
if scale=='loads':
T = T/s
P = P*s
if mode == 'fast':
return {'T':T, 'P':P, 'aopt':aopt}
if mode=='detailed':
E = empty((aopt, m, n))
ssq = []
lev = []
for ai in range(aopt):
E[ai,:,:] = a - dot(T[:,:ai+1], P[:,:ai+1].T)
ssq.append([(E[ai,:,:]**2).mean(0), (E[ai,:,:]**2).mean(1)])
if scale=='loads':
lev.append([((s*T)**2).sum(1), (P**2).sum(1)])
else:
lev.append([(T**2).sum(1), ((s*P)**2).sum(1)])
else:
# residuals
E = a - dot(T, P.T)
#E = a
SEP = E**2
ssq = [SEP.sum(0), SEP.sum(1)]
# leverages
if scale=='loads':
lev = [(1./m)+(T**2).sum(1), (1./n)+((P/s)**2).sum(1)]
else:
lev = [(1./m)+((T/s)**2).sum(1), (1./n)+(P**2).sum(1)]
# variances
expvarx = r_[0, 100*e.cumsum()/e.sum()][:aopt+1]
return {'T':T, 'P':P, 'E':E, 'expvarx':expvarx, 'levx':lev, 'ssqx':ssq, 'aopt':aopt, 'eigvals': e[:aopt,newaxis]}
def pcr(a, b, aopt, scale='scores',mode='normal',center_axis=0):
""" Principal Component Regression.
Performs PCR on given matrix and returns results in a dictionary.
:Parameters:
a : array
Data measurement matrix, (samples x variables)
b : array
Data response matrix, (samples x responses)
aopt : int
Number of components to use, aopt<=min(samples, variables)
:Returns:
results : dict
keys -- values, T -- scores, P -- loadings, E -- residuals,
levx -- leverages, ssqx -- sum of squares, expvarx -- cumulative
explained variance, aopt -- number of components used
:OtherParameters:
mode : str
Amount of info retained, ('fast', 'normal', 'detailed')
center_axis : int
Center along given axis. If neg.: no centering (-inf,..., matrix modes)
:SeeAlso:
- pca : other blm
- pls : other blm
- lpls : other blm
Notes
-----
Uses kernel speed-up if m>>n or m<<n.
If residuals turn rank deficient, a lower number of components than given
in input will be used. The number of components used is given in results-dict.
Examples
--------
>>> import scipy,engines
>>> a=scipy.asarray([[1,2,3],[2,4,5]])
>>> b=scipy.asarray([[1,1],[2,3]])
>>> dat=engines.pcr(a, b, 2)
>>> dat['expvarx']
array([0.,99.8561562, 100.])
"""
k, l = m_shape(b)
if center_axis>=0:
b = b - expand_dims(b.mean(center_axis), center_axis)
dat = pca(a, aopt=aopt, scale=scale, mode=mode, center_axis=center_axis)
T = dat['T']
weights = apply_along_axis(vnorm, 0, T)**2
if scale=='loads':
Q = dot(b.T, T*weights)
else:
Q = dot(b.T, T/weights)
if mode=='fast':
dat.update({'Q':Q})
return dat
if mode=='detailed':
F = empty((aopt, k, l))
for i in range(aopt):
F[i,:,:] = b - dot(T[:,:i+1], Q[:,:i+1].T)
else:
F = b - dot(T, Q.T)
expvary = r_[0, 100*((T**2).sum(0)*(Q**2).sum(0)/(b**2).sum()).cumsum()[:aopt]]
#fixme: Y-var leverages
dat.update({'Q':Q, 'F':F, 'expvary':expvary})
return dat
def pls(a, b, aopt=2, scale='scores', mode='normal', center_axis=-1, ab=None):
"""Partial Least Squares Regression.
Performs PLS on given matrix and returns results in a dictionary.
:Parameters:
a : array
Data measurement matrix, (samples x variables)
b : array
Data response matrix, (samples x responses)
aopt : int
Number of components to use, aopt<=min(samples, variables)
:Returns:
results : dict
keys -- values, T -- scores, P -- loadings, E -- residuals,
levx -- leverages, ssqx -- sum of squares, expvarx -- cumulative
explained variance of descriptors, expvary -- cumulative explained
variance of responses, aopt -- number of components used
:OtherParameters:
mode : str
Amount of info retained, ('fast', 'normal', 'detailed')
center_axis : int
Center along given axis. If neg.: no centering (-inf,..., matrix modes)
:SeeAlso:
- pca : other blm
- pcr : other blm
- lpls : other blm
Notes
-----
Uses kernel speed-up if m>>n or m<<n.
If residuals turn rank deficient, a lower number of components than given
in input will be used. The number of components used is given in results-dict.
Examples
--------
>>> import scipy,engines
>>> a=scipy.asarray([[1,2,3],[2,4,5]])
>>> b=scipy.asarray([[1,1],[2,3]])
>>> dat=engines.pls(a, b, 2)
>>> dat['expvarx']
array([0.,99.8561562, 100.])
"""
m, n = m_shape(a)
if ab!=None:
mm, l = m_shape(ab)
assert(m==mm)
else:
k, l = m_shape(b)
if center_axis>=0:
a = a - expand_dims(a.mean(center_axis), center_axis)
b = b - expand_dims(b.mean(center_axis), center_axis)
W = empty((n, aopt))
P = empty((n, aopt))
R = empty((n, aopt))
Q = empty((l, aopt))
T = empty((m, aopt))
B = empty((aopt, n, l))
tt = empty((aopt,))
if ab==None:
ab = dot(a.T, b)
for i in range(aopt):
if ab.shape[1]==1: #pls 1
w = ab.reshape(n, l)
w = w/vnorm(w)
elif n<l: # more yvars than xvars
if has_sym:
s, w = symeig(dot(ab, ab.T),range=[n,n],overwrite=True)
else:
w, s, vh = svd(dot(ab, ab.T))
w = w[:,:1]
else: # standard wide xdata
if has_sym:
s, q = symeig(dot(ab.T, ab),range=[l,l],overwrite=True)
else:
q, s, vh = svd(dot(ab.T, ab))
q = q[:,:1]
w = dot(ab, q)
w = w/vnorm(w)
r = w.copy()
if i>0:
for j in range(0, i, 1):
r = r - dot(P[:,j].T, w)*R[:,j][:,newaxis]
t = dot(a, r)
tt[i] = tti = dot(t.T, t).ravel()
p = dot(a.T, t)/tti
q = dot(r.T, ab).T/tti
ab = ab - dot(p, q.T)*tti
T[:,i] = t.ravel()
W[:,i] = w.ravel()
if mode=='fast' and i==aopt-1:
if scale=='loads':
tnorm = sqrt(tt)
T = T/tnorm
W = W*tnorm
return {'T':T, 'W':W}
P[:,i] = p.ravel()
R[:,i] = r.ravel()
Q[:,i] = q.ravel()
#B[i] = dot(R[:,:i+1], Q[:,:i+1].T)
qnorm = apply_along_axis(vnorm, 0, Q)
tnorm = sqrt(tt)
pp = (P**2).sum(0)
if mode=='detailed':
E = empty((aopt, m, n))
F = empty((aopt, k, l))
ssqx, ssqy = [], []
leverage = empty((aopt, m))
h2x = [] #hotellings T^2
h2y = []
for ai in range(aopt):
E[ai,:,:] = a - dot(T[:,:ai+1], P[:,:ai+1].T)
F[ai] = b - dot(T[:,:ai+1], Q[:,:ai+1].T)
ssqx.append([(E[ai,:,:]**2).mean(0), (E[ai,:,:]**2).mean(1)])
ssqy.append([(F[ai,:,:]**2).mean(0), (F[ai,:,:]**2).mean(1)])
leverage[ai,:] = 1./m + ((T[:,:ai+1]/tnorm[:ai+1])**2).sum(1)
h2y.append(1./k + ((Q[:,:ai+1]/qnorm[:ai+1])**2).sum(1))
else:
# residuals
E = a - dot(T, P.T)
F = b - dot(T, Q.T)
sepx = E**2
ssqx = [sepx.sum(0), sepx.sum(1)]
sepy = F**2
ssqy = [sepy.sum(0), sepy.sum(1)]
# leverage
leverage = 1./m + ((T/tnorm)**2).sum(1)
h2x = []
h2y = []
# variances
tp= tt*pp
tq = tt*qnorm*qnorm
expvarx = r_[0, 100*tp/(a*a).sum()]
expvary = r_[0, 100*tq/(b*b).sum()]
if scale=='loads':
T = T/tnorm
W = W*tnorm
Q = Q*tnorm
P = P*tnorm
return {'Q':Q, 'P':P, 'T':T, 'W':W, 'R':R, 'E':E, 'F':F,
'expvarx':expvarx, 'expvary':expvary, 'ssqx':ssqx, 'ssqy':ssqy,
'leverage':leverage, 'h2':h2x}
def w_simpls(aat, b, aopt):
""" Simpls for wide matrices.
Fast pls for crossval, used in calc rmsep for wide X
There is no P or W. T is normalised
"""
bb = b.copy()
m, m = aat.shape
U = empty((m, aopt)) # W
T = empty((m, aopt))
H = empty((m, aopt)) # R
PROJ = empty((m, aopt)) # P?
for i in range(aopt):
q, s, vh = svd(dot(dot(b.T, aat), b), full_matrices=0)
u = dot(b, q[:,:1]) #y-factor scores
U[:,i] = u.ravel()
t = dot(aat, u)
t = t/vnorm(t)
T[:,i] = t.ravel()
h = dot(aat, t) #score-weights
H[:,i] = h.ravel()
PROJ[:,:i+1] = dot(T[:,:i+1], inv(dot(T[:,:i+1].T, H[:,:i+1])) )
if i<aopt:
b = b - dot(PROJ[:,:i+1], dot(H[:,:i+1].T,b) )
C = dot(bb.T, T)
return {'T':T, 'U':U, 'Q':C, 'H':H}
def w_pls(aat, b, aopt):
""" Pls for wide matrices.
Fast pls for crossval, used in calc rmsep for wide X
There is no P or W. T is normalised
aat = centered kernel matrix
b = centered y
"""
bb = b.copy()
k, l = m_shape(b)
m, m = m_shape(aat)
U = empty((m, aopt)) # W
T = empty((m, aopt))
R = empty((m, aopt)) # R
PROJ = empty((m, aopt)) # P?
for i in range(aopt):
if has_sym:
s, q = symeig(dot(dot(b.T, aat), b), range=(l,l),overwrite=True)
else:
q, s, vh = svd(dot(dot(b.T, aat), b), full_matrices=0)
q = q[:,:1]
u = dot(b , q) #y-factor scores
U[:,i] = u.ravel()
t = dot(aat, u)
t = t/vnorm(t)
T[:,i] = t.ravel()
r = dot(aat, t)#score-weights
#r = r/vnorm(r)
R[:,i] = r.ravel()
PROJ[:,: i+1] = dot(T[:,:i+1], inv(dot(T[:,:i+1].T, R[:,:i+1])) )
if i<aopt:
b = b - dot(PROJ[:,:i+1], dot(R[:,:i+1].T, b) )
C = dot(bb.T, T)
return {'T':T, 'U':U, 'Q':C, 'R':R}
def bridge(a, b, aopt, scale='scores', mode='normal', r=0):
"""Undeflated Ridged svd(X'Y)
"""
m, n = m_shape(a)
k, l = m_shape(b)
u, s, vt = svd(b, full_matrices=0)
g0 = dot(u*s, u.T)
g = (1 - r)*g0 + r*eye(m)
ag = dot(a.T, g)
u, s, vt = svd(ag, full_matrices=0)
W = u[:,:aopt]
K = vt[:aopt,:].T
T = dot(a, W)
tnorm = apply_along_axis(vnorm, 0, T) # norm of T-columns
if mode == 'fast':
if scale=='loads':
T = T/tnorm
W = W*tnorm
return {'T':T, 'W':W}
U = dot(g0, K) #fixme check this
Q = dot(b.T, dot(T, inv(dot(T.T, T)) ))
B = zeros((aopt, n, l), dtype='f')
for i in range(aopt):
B[i] = dot(W[:,:i+1], Q[:,:i+1].T)
if mode == 'detailed':
E = empty((aopt, m, n))
F = empty((aopt, k, l))
for i in range(aopt):
E[i] = a - dot(T[:,:i+1], W[:,:i+1].T)
F[i] = b - dot(a, B[i])
else: #normal
F = b - dot(a, B[-1])
E = a - dot(T, W.T)
if scale=='loads':
T = T/tnorm
W = W*tnorm
Q = Q*tnorm
return {'B':B, 'W':W, 'T':T, 'Q':Q, 'E':E, 'F':F, 'U':U, 'P':W}
def nipals_lpls(X, Y, Z, a_max, alpha=.7, mean_ctr=[2, 0, 1], scale='scores', verbose=False):
""" L-shaped Partial Least Sqaures Regression by the nipals algorithm.
(X!Z)->Y
:input:
X : data matrix (m, n)
Y : data matrix (m, l)
Z : data matrix (n, o)
:output:
T : X-scores
W : X-weights/Z-weights
P : X-loadings
Q : Y-loadings
U : X-Y relation
L : Z-scores
K : Z-loads
B : Regression coefficients X->Y
b0: Regression coefficient intercept
evx : X-explained variance
evy : Y-explained variance
evz : Z-explained variance
mnx : X location
mny : Y location
mnz : Z location
:Notes:
"""
if mean_ctr!=None:
xctr, yctr, zctr = mean_ctr
X, mnX = center(X, xctr)
Y, mnY = center(Y, yctr)
Z, mnZ = center(Z, zctr)
varX = (X**2).sum()
varY = (Y**2).sum()
varZ = (Z**2).sum()
m, n = X.shape
k, l = Y.shape
u, o = Z.shape
# initialize
U = empty((k, a_max))
Q = empty((l, a_max))
T = empty((m, a_max))
W = empty((n, a_max))
P = empty((n, a_max))
K = empty((o, a_max))
L = empty((u, a_max))
B = empty((a_max, n, l))
#b0 = empty((a_max, 1, l))
var_x = empty((a_max,))
var_y = empty((a_max,))
var_z = empty((a_max,))
MAX_ITER = 250
LIM = 1e-1
for a in range(a_max):
if verbose:
print "\nWorking on comp. %s" %a
u = Y[:,:1]
diff = 1
niter = 0
while (diff>LIM and niter<MAX_ITER):
niter += 1
u1 = u.copy()
w = dot(X.T, u)
w = w/sqrt(dot(w.T, w))
#w = w/dot(w.T, w)
l = dot(Z, w)
k = dot(Z.T, l)
k = k/sqrt(dot(k.T, k))
#k = k/dot(k.T, k)
w = alpha*k + (1-alpha)*w
#print sqrt(dot(w.T, w))
w = w/sqrt(dot(w.T, w))
t = dot(X, w)
c = dot(Y.T, t)
c = c/sqrt(dot(c.T, c))
u = dot(Y, c)
diff = dot((u-u1).T, (u-u1))
if verbose:
print "Converged after %s iterations" %niter
print "Error: %.2E" %diff
tt = dot(t.T, t)
p = dot(X.T, t)/tt
q = dot(Y.T, t)/tt
l = dot(Z, w)
U[:,a] = u.ravel()
W[:,a] = w.ravel()
P[:,a] = p.ravel()
T[:,a] = t.ravel()
Q[:,a] = q.ravel()
L[:,a] = l.ravel()
K[:,a] = k.ravel()
X = X - dot(t, p.T)
Y = Y - dot(t, q.T)
Z = (Z.T - dot(w, l.T)).T
var_x[a] = pow(X, 2).sum()
var_y[a] = pow(Y, 2).sum()
var_z[a] = pow(Z, 2).sum()
B[a] = dot(dot(W[:,:a+1], inv(dot(P[:,:a+1].T, W[:,:a+1]))), Q[:,:a+1].T)
#b0[a] = mnY - dot(mnX, B[a])
# variance explained
evx = 100.0*(1 - var_x/varX)
evy = 100.0*(1 - var_y/varY)
evz = 100.0*(1 - var_z/varZ)
if scale=='loads':
tnorm = apply_along_axis(vnorm, 0, T)
T = T/tnorm
W = W*tnorm
Q = Q*tnorm
knorm = apply_along_axis(vnorm, 0, K)
L = L*knorm
K = K/knorm
return {'T':T, 'W':W, 'P':P, 'Q':Q, 'U':U, 'L':L, 'K':K, 'B':B, 'evx':evx, 'evy':evy, 'evz':evz,'mnx': mnX, 'mny': mnY, 'mnz': mnZ}
def nipals_pls(X, Y, a_max, alpha=.7, ax_center=0, mode='normal', scale='scores', verbose=False):
"""Partial Least Sqaures Regression by the nipals algorithm.
(X!Z)->Y
:input:
X : data matrix (m, n)
Y : data matrix (m, l)
:output:
T : X-scores
W : X-weights
P : X-loadings
Q : Y-loadings
U : X-Y relation
B : Regression coefficients X->Y
b0: Regression coefficient intercept
evx : X-explained variance
evy : Y-explained variance
:Notes:
"""
if ax_center>=0:
mn_x = expand_dims(X.mean(ax_center), ax_center)
mn_y = expand_dims(Y.mean(ax_center), ax_center)
X = X - mn_x
Y = Y - mn_y
else:
mn_x = zeros((1, X.shape[1]))
mn_y = zeros((1, Y.shape[1]))
varX = pow(X, 2).sum()
varY = pow(Y, 2).sum()
m, n = X.shape
k, l = Y.shape
# initialize
U = empty((k, a_max))
Q = empty((l, a_max))
T = empty((m, a_max))
W = empty((n, a_max))
P = empty((n, a_max))
B = empty((a_max, n, l))
b0 = empty((a_max, 1, l)) # intercept is (1, l) per component
var_x = empty((a_max,))
var_y = empty((a_max,))
t1 = X[:,:1]
for a in range(a_max):
if verbose:
print "\n Working on comp. %s" %a
u = Y[:,:1]
diff = 1
MAX_ITER = 100
lim = 1e-16
niter = 0
while (diff>lim and niter<MAX_ITER):
niter += 1
#u1 = u.copy()
w = dot(X.T, u)
w = w/sqrt(dot(w.T, w))
#l = dot(Z, w)
#k = dot(Z.T, l)
#k = k/sqrt(dot(k.T, k))
#w = alpha*k + (1-alpha)*w
#w = w/sqrt(dot(w.T, w))
t = dot(X, w)
q = dot(Y.T, t)
q = q/sqrt(dot(q.T, q))
u = dot(Y, q)
diff = vnorm(t1 - t)
t1 = t.copy()
if verbose:
print "Converged after %s iterations" %niter
#tt = dot(t.T, t)
#p = dot(X.T, t)/tt
#q = dot(Y.T, t)/tt
#l = dot(Z, w)
p = dot(X.T, t)/dot(t.T, t)
p_norm = vnorm(p)
t = t*p_norm
w = w*p_norm
p = p/p_norm
U[:,a] = u.ravel()
W[:,a] = w.ravel()
P[:,a] = p.ravel()
T[:,a] = t.ravel()
Q[:,a] = q.ravel()
X = X - dot(t, p.T)
Y = Y - dot(t, q.T)
var_x[a] = pow(X, 2).sum()
var_y[a] = pow(Y, 2).sum()
B[a] = dot(dot(W[:,:a+1], inv(dot(P[:,:a+1].T, W[:,:a+1]))), Q[:,:a+1].T)
b0[a] = mn_y - dot(mn_x, B[a])
# variance explained
evx = 100.0*(1 - var_x/varX)
evy = 100.0*(1 - var_y/varY)
if scale=='loads':
tnorm = apply_along_axis(vnorm, 0, T)
T = T/tnorm
W = W*tnorm
Q = Q*tnorm
return {'T':T, 'W':W, 'P':P, 'Q':Q, 'U':U, 'B':B, 'b0':b0, 'evx':evx, 'evy':evy,
'mnx': mn_x, 'mny': mn_y, 'xc': X, 'yc': Y}
########### Helper routines #########
def m_shape(array):
return matrix(array).shape
def esvd(data, amax=None):
"""SVD with the option of economy sized calculation
Calculate subspaces of X'X or XX' depending on the shape
of the matrix.
Good for extreme fat or thin matrices
:notes:
Numpy supports this by setting full_matrices=0
"""
has_arpack = True
try:
import arpack
except ImportError:
has_arpack = False
m, n = data.shape
if m>=n:
kernel = dot(data.T, data)
if has_arpack:
if amax==None:
amax = n
s, v = arpack.eigen_symmetric(kernel,k=amax, which='LM',
maxiter=200,tol=1e-5)
elif has_sym:
if amax==None:
amax = n
pcrange = None
else:
pcrange = [n-amax, n]
s, v = symeig(kernel, range=pcrange, overwrite=True)
s = s[::-1].real
v = v[:,::-1].real
else:
u, s, vt = svd(kernel)
v = vt.T
s = sqrt(s)
u = dot(data, v)/s
else:
kernel = dot(data, data.T)
if has_sym:
if amax==None:
amax = m
pcrange = None
else:
pcrange = [m-amax, m]
s, u = symeig(kernel, range=pcrange, overwrite=True)
s = s[::-1]
u = u[:,::-1]
else:
u, s, vt = svd(kernel)
s = sqrt(s)
v = dot(data.T, u)/s
# some use of symeig returns the 0 imaginary part
return u.real, s.real, v.real
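# Sanity-check sketch (illustrative): for a fat matrix the kernel trick in
# esvd should reproduce the singular values of a plain svd.
#
#   >>> from scipy import rand
#   >>> a = rand(5, 200)
#   >>> u, s, v = esvd(a)
#   >>> u2, s2, v2t = svd(a, 0)   # s and s2 should agree closely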
def vnorm(x):
# assume column arrays (or vectors)
return math.sqrt(dot(x.T, x))
def center(a, axis):
# 0 = col center, 1 = row center, 2 = double center
# -1 = nothing
# check if we have a vector
is_vec = len(a.shape)==1
if not is_vec:
is_vec = a.shape[0]==1 or a.shape[1]==1
if is_vec:
if axis==2:
warnings.warn("Double centering of vecor ignored, using ordinary centering")
if axis==-1:
mn = 0
else:
mn = a.mean()
return a - mn, mn
# !!!fixme: use broadcasting
if axis==-1:
mn = zeros((1,a.shape[1],))
#mn = tile(mn, (a.shape[0], 1))
elif axis==0:
mn = a.mean(0)[newaxis]
#mn = tile(mn, (a.shape[0], 1))
elif axis==1:
mn = a.mean(1)[:,newaxis]
#mn = tile(mn, (1, a.shape[1]))
elif axis==2:
mn = a.mean(0)[newaxis] + a.mean(1)[:,newaxis] - a.mean()
return a - mn , a.mean(0)[newaxis]
else:
raise IOError("input error: axis must be in [-1,0,1,2]")
return a - mn, mn
def scale(a, axis):
"""Scale array to unit variance along axis (-1: no scaling)."""
if axis==-1:
sc = ones((a.shape[1],))
elif axis==0:
sc = a.std(0)
elif axis==1:
sc = a.std(1)[:,newaxis]
else:
raise IOError("input error: axis must be in [-1,0,1]")
return a/sc, sc
## #PCA CALCS
## % Calculate Q limit using unused eigenvalues
## temp = diag(s);
## if n < m
## emod = temp(lv+1:n,:);
## else
## emod = temp(lv+1:m,:);
## end
## th1 = sum(emod);
## th2 = sum(emod.^2);
## th3 = sum(emod.^3);
## h0 = 1 - ((2*th1*th3)/(3*th2^2));
## if h0 <= 0.0
## h0 = .0001;
## disp(' ')
## disp('Warning: Distribution of unused eigenvalues indicates that')
## disp(' you should probably retain more PCs in the model.')
## end
## q = th1*(((1.65*sqrt(2*th2*h0^2)/th1) + 1 + th2*h0*(h0-1)/th1^2)^(1/h0));
## disp(' ')
## disp('The 95% Q limit is')
## disp(q)
## if plots >= 1
## lim = [q q];
## plot(scl,res,scllim,lim,'--b')
## str = sprintf('Process Residual Q with 95 Percent Limit Based on %g PC Model',lv);
## title(str)
## xlabel('Sample Number')
## ylabel('Residual')
## pause
## end
## % Calculate T^2 limit using ftest routine
## if lv > 1
## if m > 300
## tsq = (lv*(m-1)/(m-lv))*ftest(.95,300,lv,2);
## else
## tsq = (lv*(m-1)/(m-lv))*ftest(.95,m-lv,lv,2);
## end
## disp(' ')
## disp('The 95% T^2 limit is')
## disp(tsq)
## % Calculate the value of T^2 by normalizing the scores to
## % unit variance and summing them up
## if plots >= 1.0
## temp2 = scores*inv(diag(ssq(1:lv,2).^.5));
## tsqvals = sum((temp2.^2)');
## tlim = [tsq tsq];
## plot(scl,tsqvals,scllim,tlim,'--b')
## str = sprintf('Value of T^2 with 95 Percent Limit Based on %g PC Model',lv);
## title(str)
## xlabel('Sample Number')
## ylabel('Value of T^2')
## end
## else
## disp('T^2 not calculated when number of latent variables = 1')
## tsq = 1.96^2;
## end

95
laydi/lib/hypergeom.py Normal file
View File

@@ -0,0 +1,95 @@
import scipy
try:
# FIXME: remove rpy in a more proper way
import rpy_does_not_exist
has_rpy = True
silent_eval = rpy.with_mode(rpy.NO_CONVERSION, rpy.r)
except:
has_rpy = False
def gene_hypergeo_test(selection, category_dataset):
"""Returns the pvals from a hypergeometric test of significance.
input:
-- selection: list of selected identifiers along 0 dim of cat.set
-- category dataset, categories along dim 1 (cols)
"""
gene_dim_name = category_dataset.get_dim_name(0)
category_dim_name = category_dataset.get_dim_name(1)
#categories
all_cats = category_dataset.get_identifiers(category_dim_name, sorted=True)
# gene_ids universe
all_genes = category_dataset.get_identifiers(gene_dim_name)
# significant genes
good_genes_all = list(selection)
gg_index = category_dataset.get_indices(gene_dim_name, good_genes_all)
# significant genes pr. category
good_genes_cat = []
for col in category_dataset.asarray().T:
index = scipy.where(col==1)[0]
index = scipy.intersect1d(index, gg_index)
if index.size==0:
good_genes_cat.append([])
else:
good_genes_cat.append(category_dataset.get_identifiers(gene_dim_name, index))
count = map(len, good_genes_cat)
count = scipy.asarray([max(i, 0) for i in count])
cat_count = category_dataset.asarray().sum(0)
if has_rpy:
rpy.r.assign("x", count - 1) #number of sign. genes in category i
rpy.r.assign("m", len(good_genes_all)) # number of sign. genes tot
rpy.r.assign("n", len(all_genes)-len(good_genes_all) ) # num. genes not sign.
rpy.r.assign("k", cat_count) #num. genes in cat i
silent_eval('pvals <- phyper(x, m, n, k, lower.tail=FALSE)')
pvals = rpy.r("pvals")
else:
pvals = p_hyper_geom(count, len(good_genes_all),
len(all_genes)-len(good_genes_all),
cat_count)
pvals = scipy.where(cat_count==0, 2, pvals)
pvals = scipy.where(scipy.isnan(pvals), 2, pvals)
out = {}
for i in range(pvals.size):
out[str(all_cats[i])] = (count[i], cat_count[i], pvals[i])
return out
def p_hyper_geom(x, m, n, k):
"""Distribution function for the hypergeometric distribution.
Inputs:
-- x: vector of quantiles representing the number of white balls
drawn without replacement from an urn which contains both
black and white balls.
-- m: the number of white balls in the urn.
-- n: the number of black balls in the urn.
-- k: [vector] the number of balls drawn from the urn
Comments:
Similar to R's phyper with lower.tail=FALSE
"""
M = m + n
multiple_draws = False
if isinstance(k, scipy.ndarray) and k.size>1:
multiple_draws = True
n_draws = k.size
if n_draws<x.size:
print "n_draws: %d and n_found: %d Length mismatch, zero padded" %(k.size, x.size)
N = k
n = m
if not multiple_draws:
out = scipy.stats.hypergeom.pmf(x, M, n, N).cumsum()
else:
out = scipy.zeros((max(n_draws, x.size),))
for i in xrange(N.size):
out[i] = scipy.stats.hypergeom.pmf(x, M, n, N[i]).cumsum()[i]
return out
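# Usage sketch (illustrative): 3 of the 10 genes in a category are among the
# 100 significant genes, in a universe of 1000 genes (900 not significant).
#
#   >>> p = p_hyper_geom(scipy.asarray([3]), 100, 900, 10)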

567
laydi/lib/nx_utils.py Normal file
View File

@@ -0,0 +1,567 @@
import os,sys
from itertools import izip
import networkx as NX
from scipy import shape,diag,dot,asarray,sqrt,real,zeros,eye,exp,maximum,\
outer,sum,atleast_2d,mean,array,rand
from scipy.linalg import eig,svd,inv,expm,norm
from cx_utils import sorted_eig
import numpy
eps = numpy.finfo(float).eps.item()
feps = numpy.finfo(numpy.single).eps.item()
_array_precision = {'f': 0, 'd': 1, 'F': 0, 'D': 1,'i': 1}
class NXUTILSException(Exception): pass
def xgraph_to_graph(G):
"""Convert an Xgraph to an ordinary graph.
Edge attributes, mult.edges and self-loops are lost in the process.
"""
GG = NX.convert.from_dict_of_lists(NX.convert.to_dict_of_lists(G))
return GG
def get_affinity_matrix(G, data, ids, dist='e', mask=None, weight=None, t=0, out='dist'):
"""
Function for calculating a general affinity matrix, based upon distances.
Affinity = 1 - distance ((10-1) 1 is far apart)
INPUT
data:
gene expression data, type dict data[gene] = expression-vector
G:
The network (networkx.base.Graph object)
mask:
The array mask shows which data are missing. If mask[i][j]==0, then
data[i][j] is missing.
weights:
The array weight contains the weights to be used when calculating distances.
transpose:
If transpose==0, then genes are clustered. If transpose==1, microarrays are
clustered.
dist:
The character dist defines the distance function to be used:
dist=='e': Euclidean distance
dist=='b': City Block distance
dist=='h': Harmonically summed Euclidean distance
dist=='c': Pearson correlation
dist=='a': absolute value of the correlation
dist=='u': uncentered correlation
dist=='x': absolute uncentered correlation
dist=='s': Spearman's rank correlation
dist=='k': Kendall's tau
For other values of dist, the default (Euclidean distance) is used.
OUTPUT
D :
Similarity matrix (nGenes x nGenes), symmetric, d_ij in [0,1]
Normalized so max weight = 1.0
"""
try:
from Bio import Cluster as CLS
except:
raise NXUTILSError("Import of Biopython failed")
n_var = len(data)
n_samp = len(data[data.keys()[0]])
X = zeros((n_var, n_samp), dtype='<f8')
for i, gene in enumerate(ids): #this should be right!!
X[i,:] = data[gene]
#X = transpose(X) # distancematrix needs matrix as (nGenes,nSamples)
D_list = CLS.distancematrix(X, dist=dist)
D = zeros((n_var, n_var), dtype='<f8')
for i,row in enumerate(D_list):
if i>0:
D[i,:len(row)]=row
D = D + D.T
MAX = 30.0
D_max = D.max()/MAX
D_n = D/D_max #normalised (max = 10.0)
D_n = (MAX+1.) - D_n #using correlation (inverse distance for dists)
A = NX.adj_matrix(G, nodelist=ids)
if out=='dist':
return D_n*A
elif out=='heat_kernel':
t=1.0
K = exp(-t*D*A)
return K
elif out=='complete':
return D_n
else:
return []
def remove_one_degree_nodes(G, iter=True):
"""Removes all nodes with only one neighbour. These nodes does
not contribute to community structure.
input:
G -- graph
iter -- True/False iteratively remove?
"""
G_copy = G.copy()
if iter==True:
while 1:
bad_nodes=[]
for node in G_copy.nodes():
if len(G_copy.neighbors(node))==1:
bad_nodes.append(node)
if len(bad_nodes)>0:
G_copy.delete_nodes_from(bad_nodes)
else:
break
else:
bad_nodes=[]
for node in G_copy.nodes():
if len(G_copy.neighbors(node))==1:
bad_nodes.append(node)
if len(bad_nodes)>0:
G_copy.delete_nodes_from(bad_nodes)
print "Deleted %s nodes from network" %(len(G)-len(G_copy))
return G_copy
def key_players(G, n=1, with_labels=False):
"""
Resilience measure
Identification of key nodes by fraction of nodes in
disconnected subgraph when the node is removed.
output:
fraction of nodes disconnected when node i is removed
"""
i=0
frac=[]
labels = {}
for node in G.nodes():
i+=1
print i
T = G.copy()
T.delete_node(node)
n_nodes = T.number_of_nodes()
sub_graphs = NX.connected_component_subgraphs(T)
n = len(sub_graphs)
if n>1:
strong_comp = sub_graphs[0]
fraction = 1.0 - 1.0*strong_comp.number_of_nodes()/n_nodes
frac.append(fraction)
labels[node]=fraction
else:
frac.append(0.0)
labels[node]=0.0
out = 1.0 - array(frac)
if with_labels==True:
return out,labels
else:
return out
def node_weighted_adj_matrix(G, weights=None, ave_type='harmonic', with_labels=False):
"""Return a weighted adjacency matrix of graph. The weights are
node weights.
input: G -- graph
weights -- dict, keys: nodes, values: weights
with_labels -- True/False, return labels?
output: A -- weighted adjacency matrix
[index] -- node labels
"""
n=G.order()
# make an dictionary that maps vertex name to position
index={}
count=0
for node in G.nodes():
index[node]=count
count = count+1
a = zeros((n,n))
if type(G)=='networkx.xbase.XGraph':
raise
for head,tail in G.edges():
if ave_type == 'geometric':
a[index[head],index[tail]]= sqrt(weights[head]*weights[tail])
a[index[tail],index[head]]= a[index[head],index[tail]]
elif ave_type == 'harmonic':
# harmonic mean of the two node weights
w = 2.0*weights[head]*weights[tail]/(weights[head] + weights[tail])
a[index[head],index[tail]] = w
a[index[tail],index[head]] = w
if with_labels:
return a,index
else:
return a
def weighted_adj_matrix(G, with_labels=False):
"""Adjacency matrix of an XGraph whos weights are given in edges.
"""
A, labels = NX.adj_matrix(G, with_labels=True)
W = A.astype('<f8')
for orf, i in labels.items():
for orf2, j in labels.items():
if G.has_edge(orf, orf2):
edge_weight = G.get_edge(orf, orf2)
W[i,j] = edge_weight
W[j,i] = edge_weight
if with_labels==True:
return W, labels
else:
return W
def assortative_index(G):
"""Ouputs two vectors: the degree and the neighbor average degree.
Used to measure the assortative mixing. If the average degree is
pos. correlated with the degree we know that hubs tend to connect
to other hubs.
input: G, graph connected!!
ouput: d,mn_d: degree, and average degree of neighb.
(degree sorting from degree(with_labels=True))
"""
d = G.degree(with_labels=True)
out=[]
for node in G.nodes():
nn = G.neighbors(node)
if len(nn)>0:
nn_d = mean([float(d[i]) for i in nn])
out.append((d[node], nn_d))
return array(out).T
def struct_equivalence(G,n1,n2):
"""Returns the structural equivalence of a node pair. Two nodes
are structural equal if they share the same neighbors.
x_s = [ne(n1) union ne(n2) - ne(n1) intersection ne(n2)]/[ne(n1)
union ne(n2) + ne(n1) intersection ne(n2)]
ref: Brun et.al 2003
"""
#[ne(n1) union ne(n2) - ne(n1) intersection ne(n2
s1 = set(G.neighbors(n1))
s2 = set(G.neighbors(n2))
num_union = len(s1.union(s2))
num_intersection = len(s1.intersection(s2))
if (num_union + num_intersection) == 0:
xs = 0.0
else:
xs = float(num_union - num_intersection)/(num_union + num_intersection)
return xs
def struct_equivalence_all(G):
"""Not finnished.
"""
A,labels = NX.adj_matrix(G,with_labels=True)
pass
def hamming_distance(n1,n2):
"""Not finnsihed.
"""
pass
def graph_corrcoeff(G, vec=None, nodelist=None, sim='corr'):
"""Returns the correlation coefficient for each node. The
correlation coefficient is between the node and its neighbours.
"""
if nodelist==None:
nodelist=G.nodes()
if vec == None:
vec = G.degree(nodelist)
if len(vec)!=len(nodelist):
raise NXUTILSError("The node value vector is not of same length (%s) as the nodelist(%s)") %(len(vec), len(nodelist))
A = NX.ad_matrix(G, nodelist=nodelist)
for i, node in enumerate(nodelist):
nei_i = A[i,:]==1
vec_i = vec[nei_i]
def weighted_laplacian(G,with_labels=False):
"""Return standard Laplacian of graph from a weighted adjacency matrix."""
n= G.order()
I = eye(n)
A = weighted_adj_matrix(G)
D = I*sum(A, 0)
L = D-A
if with_labels:
A,index = weighted_adj_matrix(G, with_labels=True)
return L, index
else:
return L
def grow_subnetworks(G, T2):
"""Return the highest scoring (T2-test) subgraph og G.
Use simulated annealing to identify highly grow subgraphs.
ref: -- Ideker et.al (Bioinformatics 18, 2002)
-- Patil and Nielsen (PNAS 2006)
"""
N = 1000
states = [(node, False) for node in G.nodes()]
t2_last = 0.0
for i in xrange(N):
if i==0: #assign random states
states = [(state[0], True) for state in states if rand(1)>.5]
sub_nodes = [state[0] for state in states if state[1]]
Gsub = NX.subgraph(G, sub_nodes)
Gsub = NX.connected_components_subgraphs(Gsub)[0]
t2 = [T2[node] for node in Gsub]
if t2>t2_last:
pass
else:
p = numpy.exp()
"""Below are methods for calculating graph metrics
Four main decompositions :
0.) Adjacency diffusion kernel expm(A),
1.) von neumann kernels (diagonalisation of adjacency matrix)
2.) laplacian kernels (geometric series of adj.)
3.) diffusion kernels (exponential series of adj.)
---- Kv
von_neumann : Kv = (I-alpha*A)^-1 (mod: A(I-alpha*A)^-1)? ,
geom. series
---- Kl
laplacian: Kl = (I-alpha*L)^-1 , geom. series
---- Kd
laplacian_diffusion: Kd = expm(-alpha*L)
exp. series
---- Ke
Exponential diffusion.
Ke = expm(A) .... expm(-A)?
"""
# TODO:
# check for numerical unstable eigenvalues and set to zero
# othervise some inverses wil explode ->ok ..using pinv for inverses
#
# This gives results that look numerical unstable
#
# -- divided adj by sum(A[:]), check this one (paper by Lebart scales with number of edges)
#
#
#
# the neumann kernel is defined in Kandola to be K = A*(I-A)^-1
# lowest eigenvectors are same as the highest of K = A*A ?
# this needs clarification
# diffusion is still wrong! ... ok
# diff needs normalisation?! check the meaning of exp(-s) = exp(1/s) -L = 1/degree ... etc
# Is it the negative of exp. of adj. metrix in Kandola?
#
# Normalised=False returns only nans (no idea why!!) ... fixed ok
# 31.1: diff is ok exp(0)=1 not zero!
# 07.03.2005: normalisation is ok: -> normalisation will emphasize high degree nodes
# 10.03.2005: symeig is unstable an returns nans of some eigenvectors? switching back to eig
# 14.05.2006: diffusion returns negative values, using expm(-LL) instead (FIX)
# 13.09.2206: update for use in numpy
# 27.04.2007: diffusion now uses pade approximations to matrix exponential. Also the last
def K_expAdj(W, normalised=True, alpha=1.0):
"""Matrix exponential of adjacency matrix, mentioned in Kandola as a general diffusion kernel.
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m,n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
if normalised==True:
T = diag( sqrt( 1./(sum(W,0))) )
W = dot(dot(T, W), T)
e,vr = eig(W)
s = real(e)**2 # from eigenvalues to singularvalues
vri = inv(vr)
s = maximum.reduce(s) + s
cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = abs(cond*maximum.reduce(s))
psigma = eye(m)
for i in range(len(s)):
if abs(s[i]) > cutoff:
psigma[i,i] = .5*alpha*exp(s[i])
return dot(dot(vr,psigma),vri)
def K_vonNeumann(W, normalised=True, alpha=1.0):
""" The geometric series of path lengths.
Returns matrix square root of pseudo inverse of the adjacency matrix.
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m,n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
if normalised==True:
T = diag(sqrt(1./(sum(W,0))))
W = dot(dot(T,W),T)
e,vr = eig(W)
vri = inv(vr)
e = real(e) # we only work with real pos. eigvals
e = maximum.reduce(e) + e
cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = cond*maximum.reduce(e)
psigma = zeros((m,n),t)
for i in range(len(e)):
if e[i] > cutoff:
psigma[i,i] = 1.0/e[i] #these are eig.vals (=sqrt(sing.vals))
return dot(dot(vr,psigma),vri).astype(t)
def K_laplacian(W, normalised=True, alpha=1.0):
""" This is the matrix pseudo inverse of L.
Also known as the average commute time matrix.
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m,n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
D = diag(sum(W,0))
L = D - W
if normalised==True:
T = diag(sqrt(1./sum(W, 0)))
L = dot(dot(T, L), T)
e,vr = eig(L)
e = real(e)
vri = inv(vr)
cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = cond*maximum.reduce(e)
psigma = zeros((m,),t) # if s close to zero -> set 1/s = 0
for i in range(len(e)):
if e[i] > cutoff:
psigma[i] = 1.0/e[i]
K = dot(dot(vr, diag(psigma)), vri).astype(t)
K = real(K)
I = eye(n)
K = (1-alpha)*I + alpha*K
return K
def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5, use_cut=False):
"""Returns diffusion kernel.
input:
-- W, adj. matrix
-- normalised [True/False]
-- alpha, [0,1] (degree of network influence)
-- beta, [0->), (diffusion degree)
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m, n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
D = diag(W.sum(0))
L = D - W
if normalised==True:
T = diag(sqrt(1./W.sum(0)))
L = dot(dot(T, L), T)
e, vr = eig(L)
vri = inv(vr) #inv
cond = 1.0*{0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = 1.*abs(cond*maximum.reduce(e))
psigma = eye(m) # if eigvals are 0 exp(0)=1 (unnecessary)
#psigma = zeros((m,n), dtype='<f8')
for i in range(len(e)):
if abs(e[i]) > cutoff:
psigma[i,i] = exp(-beta*e[i])
#else:
# psigma[i,i] = 0.0
K = real(dot(dot(vr, psigma), vri))
I = eye(n, dtype='<f8')
K = (1. - alpha)*I + alpha*K
return K
def K_diffusion2(W, normalised=True, alpha=1.0, beta=0.5, ncomp=None):
"""Returns diffusion kernel, using fast pade approximation.
input:
-- W, adj. matrix
-- normalised [True/False]
-- beta, [0->), (diffusion degree)
"""
D = diag(W.sum(0))
L = D - W
if normalised==True:
T = diag(sqrt(1./W.sum(0)))
L = dot(dot(T, L), T)
return expm(-beta*L)
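# Usage sketch (illustrative): both diffusion variants operate on a (possibly
# weighted) adjacency matrix; K_diffusion2 uses the pade approximation of expm.
#
#   >>> A = asarray(NX.adj_matrix(G))
#   >>> Kd = K_diffusion(A, normalised=True, alpha=1.0, beta=0.5)
#   >>> Kd2 = K_diffusion2(A, normalised=True, beta=0.5)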
def K_modularity(W, alpha=1.0):
""" Returns the matrix square root of Newmans modularity."""
W = asarray(W)
t = W.dtype.char
m, n = W.shape
d = sum(W, 0)
m = 1.*sum(d)
B = W - (outer(d, d)/m)
s,v = sorted_eig(B, sort_by='lm')
psigma = zeros( (n, n), dtype='<f8' )
for i in range(len(s)):
if s[i]>1e-7:
psigma[i,i] = sqrt(s[i])
#psigma[i,i] = s[i]
K = dot(dot(v, psigma), v.T)
I = eye(n)
K = (1 - alpha)*I + alpha*K
return K
def kernel_score(K, W):
"""Returns the modularity score.
K -- (modularity) kernel
W -- adjacency matrix (possibly weighted)
"""
# normalize W (: W'W=I)
m, n = shape(W)
for i in range(n):
W[:,i] = W[:,i]/norm(W[:,i])
score = diag(dot(W.T, dot(K, W)))
tot = sum(score)
return score, tot
def modularity_matrix(G, nodelist=None):
if not nodelist:
nodelist = G.nodes()
else:
G = NX.subgraph(G, nodelist)
A = NX.adj_matrix(G, nodelist=nodelist)
d = atleast_2d(G.degree(nbunch=nodelist))
m = 1.*G.number_of_edges()
B = A - dot(d.T, d)/m
return B

28
laydi/lib/packer.py Normal file
View File

@@ -0,0 +1,28 @@
from scipy import dot, argmin
from scipy.linalg import svd
class Packer:
"""A compression object used to speed up model calculations.
Often used in conjunction with crossvalidation and perturbations
analysis.
"""
def __init__(self,array):
self._shape = array.shape
self._array = array
self._packed_data = None
self._inflater = None
def expand(self,a):
if self._inflater!=None:
return dot(self._inflater,a)
def collapse(self,axis=None,mode='svd'):
if not axis:
axis = argmin(self._array.shape) # default is the smallest dim
if axis == 1:
self._array = self._array.T
u, s, vt = svd(self._array,full_matrices=0)
self._inflater = vt.T
self._packed_data = u*s
return self._packed_data
def get_packed_data(self):
return self._packed_data
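# Usage sketch (illustrative): pack a wide matrix before heavy resampling,
# then map results back to the original variable space. wp is a hypothetical
# (10 x a) result computed in the packed space.
#
#   >>> from scipy import rand
#   >>> packer = Packer(rand(10, 500))
#   >>> xp = packer.collapse()      # (10 x 10) packed data
#   >>> w = packer.expand(wp)       # back to the (500 x a) original space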

223
laydi/lib/select_generators.py Normal file
View File

@@ -0,0 +1,223 @@
"""Matrix cross validation selection generators
"""
from scipy import take,arange,ceil,repeat,newaxis,mean,asarray,dot,ones,\
random,array_split,floor,vstack,asarray,minimum
from cx_utils import randperm
def w_pls_gen(aat,b,n_blocks=None,center=True,index_out=False):
"""Random block crossvalidation for wide (XX.T) trick in PLS.
Leave-one-out is a subset, with n_blocks equals nSamples
aat -- outerproduct of X
b -- Y
n_blocks -- number of blocks (default: number of samples, i.e. leave-one-out)
center -- use centering of calibration sets; (aat_in, b_in) are centered
Returns:
-- aat_in,aat_out,b_in,b_out,[out]
"""
m, n = aat.shape
index = randperm(m)
if n_blocks==None: n_blocks = m
nValuesInBlock = int(ceil(float(m)/n_blocks))
if n_blocks==m:
index = arange(m)
out_ind = [index[i*nValuesInBlock:(i+1)*nValuesInBlock] for i in range(n_blocks)]
for out in out_ind:
inn = [i for i in index if i not in out]
aat_in = aat[inn,:][:,inn]
aat_out = aat[out,:][:,inn]
b_in = b[inn,:]
b_out = b[out,:]
if center:
aat_in, mn = outerprod_centering(aat_in)
b_in = b_in - b_in.mean(0) # b_in + b_out/(b_in.shape[0])
if index_out:
yield aat_in,aat_out,b_in,b_out,out
else:
yield aat_in,aat_out,b_in,b_out
def pls_gen(a, b, n_blocks=None, center=False, index_out=False,axis=0):
"""Random block crossvalidation
Leave-one-out is a subset, with n_blocks equals a.shape[-1]
"""
index = randperm(a.shape[axis])
#index = arange(a.shape[axis])
if n_blocks==None:
n_blocks = a.shape[axis]
n_in_set = int(ceil(float(a.shape[axis])/n_blocks))
out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_blocks)]
for out in out_ind_sets:
inn = [i for i in index if i not in out]
acal = a.take(inn, 0)
atrue = a.take(out, 0)
bcal = b.take(inn, 0)
btrue = b.take(out, 0)
if center:
mn_a = acal.mean(0)[newaxis]
acal = acal - mn_a
atrue = atrue - mn_a
mn_b = bcal.mean(0)[newaxis]
bcal = bcal - mn_b
btrue = btrue - mn_b
if index_out:
yield acal, atrue, bcal, btrue, out
else:
yield acal, atrue, bcal, btrue
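# Usage sketch (illustrative): 5-block crossvalidation with per-block centering.
#
#   >>> from scipy import rand
#   >>> for acal, atrue, bcal, btrue in pls_gen(rand(20, 100), rand(20, 1),
#   ...                                         n_blocks=5, center=True):
#   ...     pass   # fit on (acal, bcal), predict (atrue, btrue)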
def pca_gen(a, n_sets=None, center=False, index_out=False, axis=0):
"""Returns a generator of crossvalidation sample segments.
input:
-- a, data matrix (m x n)
-- n_sets, number of segments/subsets to generate.
-- center, bool, choice of centering each subset
-- index_out, bool, return subset index
-- axis, int, which axis to get subset from
output:
-- V, generator with n_sets members (subsets)
"""
m = a.shape[axis]
index = randperm(m)
if n_sets==None:
n_sets = m
n_in_set = int(ceil(float(m)/n_sets))
out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_sets)]
for out in out_ind_sets:
inn = [i for i in index if i not in out]
acal = a.take(inn, 0)
atrue = a.take(out, 0)
if center:
mn_a = acal.mean(0)[newaxis]
acal = acal - mn_a
atrue = atrue - mn_a
if index_out:
yield acal, atrue, out
else:
yield acal, atrue
def w_pls_gen_jk(a, b, n_sets=None, center=True,
index_out=False, axis=0):
"""Random block crossvalidation for wide X (m>>n)
Leave-one-out is a subset, with n_sets equals a.shape[-1]
Returns : X_m and X_m'Y_m
"""
m = a.shape[axis]
ab = dot(a.T, b)
index = randperm(m)
if n_sets==None:
n_sets = m
n_in_set = int(ceil(float(m)/n_sets))
out_ind_sets = [index[i*n_in_set:(i+1)*n_in_set] for i in range(n_sets)]
for out in out_ind_sets:
inn = [i for i in index if i not in out]
nin = len(inn)
nout = len(out)
a_in = a[inn,:]
mn_a = 0
mAB = 0
if center:
mn_a = a_in.mean(0)[newaxis]
mAin = dot(-ones((1,nout)), a[out,:])/nin
mBin = dot(-ones((1,nout)), b[out,:])/nin
mAB = dot(mAin.T, (mBin*nin))
ab_in = ab - dot(a[out,].T, b[out,:]) - mAB
a_in = a_in - mn_a
if index_out:
yield a_in, ab_in, out
else:
yield a_in, ab_in
def shuffle_1d_block(a, n_sets=None, blocks=None, index_out=False, axis=0):
"""Random block shuffling along 1d axis
Returns : Shuffled a by axis
"""
m = a.shape[axis]
if blocks==None:
blocks = m
for ii in xrange(n_sets):
index = randperm(m)
if blocks==m:
a_out = a.take(index, axis)
else:
index = arange(m)
dummy = map(random.shuffle, array_split(index, blocks))
a_out = a.take(index, axis)
if index_out:
yield a_out, index
else:
yield a_out
def shuffle_1d(a, n_sets, axis=0):
"""Random shuffling along 1d axis.
Returns : Shuffled a by axis
"""
m = a.shape[axis]
for ii in xrange(n_sets):
index = randperm(m)
a = a.take(index, axis)
yield a
def diag_pert(a, n_sets=10, center=True, index_out=False):
"""Alter generator returning sets perturbed with means at diagonals.
input:
a -- matrix, data
n_sets -- scalar, number of perturbed sets generated
"""
m, n = a.shape
tr=False
if m>n:
a = a.T
m, n = a.shape
tr = True
if n_sets>m or n_sets>n:
msg = "You may not use more subsets than max(n_rows, n_cols)"
raise ValueError, msg
nm=n*m
start_inds = array_split(randperm(m),n_sets) # we use random start diags
if center:
a = a - mean(a, 0)[newaxis]
for v in range(n_sets):
a_out = a.copy()
out = []
for start in start_inds[v]:
ind = arange(start+v, nm, n+1)
out.extend(ind)
if center:
a_out.put(ind, a.mean()) # ndarray.put takes (indices, values)
else:
a_out.put(ind, 0)
if tr:
a_out = a_out.T
if index_out:
yield a_out, asarray(out)
else:
yield a_out
def outerprod_centering(aat, ret_mn=True):
"""Returns double centered symmetric outerproduct matrix.
"""
h = aat.mean(0)[newaxis]
h = h - 0.5*h.mean()
mn_a = h + h.T # beauty of broadcasting
aatc = aat - mn_a
if ret_mn:
return aatc, h
return aatc
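## Illustrative check (not part of the original file): double centering the
## outer product AA' equals the outer product of the column-centred A.
##
## >>> from numpy import dot, allclose
## >>> from numpy.random import rand
## >>> A = rand(5, 3)
## >>> Ac = A - A.mean(0)
## >>> aatc, mn = outerprod_centering(dot(A, A.T))
## >>> allclose(aatc, dot(Ac, Ac.T))
## True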

315
laydi/lib/validation.py Normal file

@@ -0,0 +1,315 @@
"""This module implements some common validation schemes from pca and pls.
"""
from scipy import ones,mean,sqrt,dot,newaxis,zeros,sum,empty,\
apply_along_axis,eye,kron,array,sort,zeros_like,argmax,atleast_2d
from scipy.stats import median
from scipy.linalg import triu,inv,svd,norm
from select_generators import w_pls_gen,w_pls_gen_jk,pls_gen,pca_gen,diag_pert
from engines import w_simpls,pls,bridge,pca,nipals_lpls
from cx_utils import m_shape
def w_pls_cv_val(X, Y, amax, n_blocks=None):
"""Returns rmsep and aopt for pls tailored for wide X.
The root mean square error of cross validation is calculated
based on random block cross-validation. With number of blocks equal to
number of samples [default] gives leave-one-out cv.
The pls model is based on the simpls algorithm for wide X.
:Parameters:
X : ndarray
column centered data matrix of size (samples x variables)
Y : ndarray
column centered response matrix of size (samples x responses)
amax : scalar
Maximum number of components
n_blocks : scalar
Number of blocks in cross validation
:Returns:
rmsep : ndarray
Root Mean Square Error of cross-validated Predictions
aopt : scalar
Guestimate of the optimal number of components
:SeeAlso:
- pls_cv_val : Same output, not optimised for wide X
- w_simpls : Simpls algorithm for wide X
Notes
-----
Based (cowardly translated) on m-files from the Chemoact toolbox
X, Y inputs need to be centered (fixme: check)
Examples
--------
>>> import numpy as n
>>> X = n.random.rand(10, 4); X = X - X.mean(0)
>>> Y = n.random.rand(10, 2); Y = Y - Y.mean(0)
>>> rmsep, aopt = w_pls_cv_val(X, Y, amax=3)
"""
k, l = m_shape(Y)
PRESS = zeros((l, amax+1), dtype='f')
if n_blocks==None:
n_blocks = Y.shape[0]
XXt = dot(X, X.T)
V = w_pls_gen(XXt, Y, n_blocks=n_blocks, center=True)
for Din, Doi, Yin, Yout in V:
ym = -sum(Yout, 0)[newaxis]/(1.0*Yin.shape[0])
PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)
dat = w_simpls(Din, Yin, amax)
Q, U, H = dat['Q'], dat['U'], dat['H']
That = dot(Doi, dot(U, inv(triu(dot(H.T, U))) ))
Yhat = []
for j in range(l):
TQ = dot(That, triu(dot(Q[j,:][:,newaxis], ones((1,amax)))) )
E = Yout[:,j][:,newaxis] - TQ
E = E + sum(E, 0)/Din.shape[0]
PRESS[j,1:] = PRESS[j,1:] + sum(E**2, 0)
#Yhat = Yin - dot(That,Q.T)
msep = PRESS/(Y.shape[0])
aopt = find_aopt_from_sep(msep)
return sqrt(msep), aopt
def pls_val(X, Y, amax=2, n_blocks=10, algo='pls'):
k, l = m_shape(Y)
PRESS = zeros((l, amax+1), dtype='<f8')
EE = zeros((amax, k, l), dtype='<f8')
Yhat = zeros((amax, k, l), dtype='<f8')
V = pls_gen(X, Y, n_blocks=n_blocks, center=True, index_out=True)
for Xin, Xout, Yin, Yout, out in V:
ym = -sum(Yout,0)[newaxis]/Yin.shape[0]
Yin = (Yin - ym)
PRESS[:,0] = PRESS[:,0] + ((Yout - ym)**2).sum(0)
if algo=='pls':
dat = pls(Xin, Yin, amax, mode='normal')
elif algo=='bridge':
dat = bridge(Xin, Yin, amax, mode='normal')
for a in range(amax):
Ba = dat['B'][a,:,:]
Yhat[a,out[:],:] = dot(Xout, Ba)
E = Yout - dot(Xout, Ba)
EE[a,out,:] = E
PRESS[:,a+1] = PRESS[:,a+1] + sum(E**2,0)
#rmsep = sqrt(PRESS/(k-1.))
msep = PRESS
aopt = find_aopt_from_sep(msep)
return msep, Yhat, aopt
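## Usage sketch (illustrative; X and Y are hypothetical, column-centred
## arrays with matching rows):
##
## msep, Yhat, aopt = pls_val(X, Y, amax=5, n_blocks=10)
## print 'suggested number of components:', aopt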
def lpls_val(X, Y, Z, a_max=2, nsets=None,alpha=.5, mean_ctr=[2,0,2]):
"""Performs crossvalidation to get generalisation error in lpls"""
assert(nsets<=X.shape[0])
cv_iter = pls_gen(X, Y, n_blocks=nsets,center=False,index_out=True)
k, l = Y.shape
Yc = empty((k, l), 'd')
Yhat = empty((a_max, k, l), 'd')
Yhatc = empty((a_max, k, l), 'd')
sep2 = empty((a_max, k, l), 'd')
for i, (xcal,xi,ycal,yi,ind) in enumerate(cv_iter):
print ind
dat = nipals_lpls(xcal,ycal,Z,
a_max=a_max,
alpha=alpha,
mean_ctr=mean_ctr,
verbose=False)
B = dat['B']
#b0 = dat['b0']
# centre the held-out data once per segment (not once per component)
if mean_ctr[0] in [0, 2]:
xi = xi - dat['mnx']
else:
xi = xi - xi.mean(1)[:,newaxis] #???: cheating?
if mean_ctr[1] in [0, 2]:
ym = dat['mny']
else:
ym = yi.mean(1)[:,newaxis] #???: check this
for a in range(a_max):
Yhat[a,ind,:] = atleast_2d(ym + dot(xi, B[a]))
#Yhat[a,ind,:] = atleast_2d(b0[a] + dot(xi, B[a]))
# todo: need a better support for class validation
y_is_class = Y.dtype.char.lower() in ['i','p', 'b', 'h','?']
#print Y.dtype.char
if y_is_class:
Yhat_class = zeros_like(Yhat)
for a in range(a_max):
for i in range(k):
Yhat_class[a,i,argmax(Yhat[a,i,:])] = 1.0
class_err = 100*((Yhat_class+Y)==2).sum(1)/Y.sum(0).astype('d')
sep = (Y - Yhat)**2
rmsep = sqrt(sep.mean(1)).T
#rmsep2 = sqrt(sep2.mean(1))
aopt = find_aopt_from_sep(rmsep)
return rmsep, Yhat, aopt
def pca_alter_val(a, amax, n_sets=10, method='diag'):
"""Pca validation by altering elements in X.
comments:
-- may do all jk estimates in this loop
"""
V = diag_pert(a, n_sets, center=True, index_out=True)
sep = empty((n_sets, amax), dtype='f')
for i, (xi, ind) in enumerate(V):
dat_i = pca(xi, amax, mode='detailed')
Ti, Pi = dat_i['T'],dat_i['P']
for j in xrange(amax):
Xhat = dot(Ti[:,:j+1], Pi[:,:j+1].T)
a_sub = a.ravel().take(ind)
EE = a_sub - Xhat.ravel().take(ind)
tot = (a_sub**2).sum()
sep[i,j] = (EE**2).sum()/tot
sep = sqrt(sep)
aopt = find_aopt_from_sep(sep)
return sep, aopt
def pca_cv_val(a, amax, n_sets):
""" Returns PRESS from cross-validated pca using random segments.
input:
-- a, data matrix (m x n)
-- amax, maximum number of components used
-- n_sets, number of segments to calculate
output:
-- sep, (amax x m x n), squared error of prediction (press)
-- aopt, guestimated optimal number of components
"""
m, n = a.shape
E = empty((amax, m, n), dtype='f')
xtot = (a**2).sum() # this needs centering
V = pca_gen(a, n_sets=n_sets, center=True, index_out=True)
for xi, xout, ind in V:
dat_i = pca(xi, amax, mode='fast')
Pi = dat_i['P']
for j in xrange(amax):
Pij = Pi[:,:j+1]
E[j][ind,:] = (xout - dot(xout, dot(Pij, Pij.T)))**2
sep = []
for a in xrange(amax):
sep.append(E[a].sum()/xtot)
sep = array(sep)
aopt = find_aopt_from_sep(sep)
return sep, aopt
def pls_jkW(a, b, amax, n_blocks=None, algo='pls', use_pack=True, center=True):
""" Returns CV-segments of paramter W for wide X.
todo: add support for T,Q and B
"""
if n_blocks == None:
n_blocks = b.shape[0]
Wcv = empty((n_blocks, a.shape[1], amax), dtype='d')
if use_pack:
u, s, inflater = svd(a, full_matrices=0)
a = u*s
V = pls_gen(a, b, n_blocks=n_blocks, center=center)
for nn,(a_in, a_out, b_in, b_out) in enumerate(V):
if algo=='pls':
dat = pls(a_in, b_in, amax, 'loads', 'fast')
elif algo=='bridge':
dat = bridge(a_in, b_in, amax, 'loads', 'fast')
W = dat['W']
if use_pack:
W = dot(inflater.T, W)
Wcv[nn,:,:] = W
return Wcv
def pca_jkP(a, aopt, n_blocks=None):
"""Returns loading from PCA on CV-segments.
input:
-- a, data matrix (n x m)
-- aopt, number of components in model.
-- n_blocks, number of segments
output:
-- PP, loadings collected in a three way matrix
(n_segments, m, aopt)
comments:
* The loadings are scaled with the (1/samples)*eigenvalues.
* Crossvalidation method is currently set to random blocks of samples.
todo: add support for T
fixme: more efficient to add this in validation loop
"""
if n_blocks == None:
n_blocks = a.shape[0]
PP = empty((n_blocks, a.shape[1], aopt), dtype='f')
V = pca_gen(a, n_sets=n_blocks, center=True)
for nn,(a_in, a_out) in enumerate(V):
dat = pca(a_in, aopt, mode='fast', scale='loads')
P = dat['P']
PP[nn,:,:] = P
return PP
def lpls_jk(X, Y, Z, a_max, nsets=None, xz_alpha=.5, mean_ctr=[2,0,2]):
cv_iter = pls_gen(X, Y, n_blocks=nsets,center=False,index_out=False)
m, n = X.shape
k, l = Y.shape
o, p = Z.shape
if nsets==None:
nsets = m
WWx = empty((nsets, n, a_max), 'd')
WWz = empty((nsets, o, a_max), 'd')
#WWy = empty((nsets, l, a_max), 'd')
for i, (xcal, xi, ycal, yi) in enumerate(cv_iter):
dat = nipals_lpls(xcal,ycal,Z,a_max=a_max,alpha=xz_alpha,
mean_ctr=mean_ctr,scale='loads',verbose=False)
WWx[i,:,:] = dat['W']
WWz[i,:,:] = dat['L']
#WWy[i,:,:] = dat['Q']
return WWx, WWz
def find_aopt_from_sep(sep, method='75perc'):
"""Returns an estimate of optimal number of components from rmsecv.
"""
sep = sep.copy()
if method=='vanilla':
# min rmsep
rmsecv = sqrt(sep.mean(0))
return rmsecv.argmin() + 1
elif method=='75perc':
prct = .75 #percentile
ind = 1.*sep.shape[0]*prct
med = median(sep)
prc_75 = []
for col in sep.T:
col.sort() #this is inplace -> ruins sep, so we are doing a copy
prc_75.append(col[int(ind)])
prc_75 = array(prc_75)
for i in range(1, sep.shape[1], 1):
if med[i-1]<prc_75[i]:
return i
return len(med)
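## Reading of the '75perc' rule above (descriptive note, not new logic):
## walking up through the components, return the first i for which the
## median error at component i-1 is already below the 75th percentile of
## the error at component i, i.e. the first place where an extra component
## no longer gives a convincing improvement; otherwise use all components.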

168
laydi/logger.py Normal file

@@ -0,0 +1,168 @@
import gobject
import gtk
import time
class Logger:
def __init__(self):
self.store = gtk.ListStore(gobject.TYPE_STRING,
gobject.TYPE_STRING,
gobject.TYPE_STRING)
self.levels = ['debug', 'notice', 'warning', 'error']
self.level_text = {'debug': 'Debug',
'notice': 'Notice',
'warning': 'Warning',
'error': 'Error'}
self.components = {}
self.colors = { 'debug': 'grey',
'notice': 'black',
'warning': 'brown',
'error': 'red' }
def log(self, level, message):
iter = self.store.append()
self.store.set_value(iter, 0, level)
self.store.set_value(iter, 1, message)
self.store.set_value(iter, 2, self.colors[level])
def level_number(self, level):
return self.levels.index(level)
def debug(self, message):
self.log('debug', message)
def notice(self, message):
self.log('notice', message)
def warning(self, message):
self.log('warning', message)
def error(self, message):
self.log('error', message)
class LogView(gtk.TreeView):
def __init__(self, logger=None, level='notice'):
self.logger = logger
self.model = logger.store
self.level = level
self.level_no = logger.level_number(level)
# Set up filter
self.filter = self.model.filter_new()
gtk.TreeView.__init__(self, self.filter)
self.filter.set_visible_func(self.level_filter)
# Set up log level column
renderer = gtk.CellRendererText()
self.level_col = gtk.TreeViewColumn('Level', renderer, text=0)
self.level_col.add_attribute(renderer, "foreground", 2)
self.append_column(self.level_col)
# Set up message column
renderer = gtk.CellRendererText()
self.message_col = gtk.TreeViewColumn('Message', renderer, text=1)
self.message_col.add_attribute(renderer, "foreground", 2)
self.append_column(self.message_col)
# Activate context menu
self.menu = LogMenu(self.logger, self)
self.connect('popup_menu', self.popup_menu)
self.connect('button_press_event', self.mouse_popup_menu)
# Make sure tree view displays bottom entry when entered
def scroll_to_last(model, path, it):
if path:
self.scroll_to_cell(path)
self.model.connect('row-changed', scroll_to_last)
def set_level(self, level):
self.level = level
self.level_no = self.logger.levels.index(level)
self.filter.refilter()
self.queue_draw()
def popup_menu(self, *rest):
self.menu.popup(None, None, None, 0, 0)
def mouse_popup_menu(self, widget, event):
if event.button == 3:
self.menu.popup(None, None, None, event.button, event.time)
def level_filter(self, store, iter):
if store.get_value(iter,0):
value = self.logger.levels.index(store.get_value(iter, 0))
return value >= self.level_no
else:
return False
class LogLevelMenu(gtk.Menu):
def __init__(self, logger, view):
self.logger = logger
self.view = view
items = []
gtk.Menu.__init__(self)
for level in logger.levels:
if len(items) == 0:
group = None
else:
group = items[0]
item = gtk.RadioMenuItem(group, logger.level_text[level], level)
item.connect('activate', self.set_log_level, level)
items.append(item)
self.append(item)
item.show()
item_no = logger.level_number(view.level)
items[item_no].set_active(True)
def set_log_level(self, widget, level, *rest):
if widget.active:
self.view.set_level(level)
class LogComponentMenu(gtk.Menu):
def __init__(self, logger, view):
gtk.Menu.__init__(self)
components = logger.components.keys()
components.sort()
for c in components:
item = gtk.MenuItem(c)
self.append(item)
item.show()
# for component in logger.components
class LogMenu(gtk.Menu):
def __init__(self, logger, view):
gtk.Menu.__init__(self)
self.logger = logger
# View Log Level
self.view_menu = LogLevelMenu(logger, view)
self.view_item = gtk.MenuItem('View Log Level')
self.view_item.set_submenu(self.view_menu)
self.append(self.view_item)
self.view_item.show()
# View Components
self.component_menu = LogComponentMenu(logger, view)
self.component_item = gtk.MenuItem('View Components')
self.component_item.set_submenu(self.component_menu)
self.append(self.component_item)
self.component_item.show()
# Clear Log
clear_item = gtk.MenuItem('Clear Log')
clear_item.connect('activate', self.activate_clear_button)
self.append(clear_item)
clear_item.show()
def activate_clear_button(self, item):
self.logger.store.clear()
logger = Logger()
log = logger.log
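## Usage sketch (illustrative): the module-level instance above is shared,
## so client modules typically do
##
## from logger import logger, log
## log('notice', 'dataset loaded')
## logger.warning('input matrix contains zeros')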

101
laydi/main.py Normal file

@@ -0,0 +1,101 @@
import sys
import os.path
import paths
# Site specific directories set by configure script.
PREFIX = paths.PREFIX
BINDIR = paths.BINDIR
DATADIR = paths.DATADIR
DOCDIR = paths.DOCDIR
PYDIR = paths.PYDIR
ICONDIR = os.path.join(DATADIR, 'icons')
#: Dictionary of observers
_observers = {}
#: The current Navigator object.
navigator = None
#: The current application
application = None
#: The current project
project = None
#: The current workflow
workflow = None
#: A cfgparse/optparse options object.
options = None
def notify_observers(name):
call = "%s_changed" % name
for s in _observers.get(name, []):
getattr(s, call)(getattr(sys.modules[__name__], name))
def _add_observer(name, observer):
"""Adds observer as an observer of the named object."""
if not _observers.has_key(name):
_observers[name] = []
_observers[name].append(observer)
def _remove_observer(name, observer):
"""Removes observer as an observer of the named object."""
if not _observers.has_key(name):
return
_observers[name].remove(observer)
def add_navigator_observer(observer):
_add_observer('navigator', observer)
def add_project_observer(observer):
_add_observer('project', observer)
def add_workflow_observer(observer):
_add_observer('workflow', observer)
def add_application_observer(observer):
_add_observer('application', observer)
def remove_navigator_observer(observer):
_remove_observer('navigator', observer)
def remove_project_observer(observer):
_remove_observer('project', observer)
def remove_workflow_observer(observer):
_remove_observer('workflow', observer)
def remove_application_observer(observer):
_remove_observer('application', observer)
def remove_options_observer(observer):
_remove_observer('options', observer)
def set_navigator(nav):
global navigator
navigator = nav
notify_observers('navigator')
def set_application(app):
global application
application = app
notify_observers('application')
def set_project(p):
global project
project = p
notify_observers('project')
def set_workflow(wf):
global workflow
workflow = wf
notify_observers('workflow')
def set_options(opt):
global options
options = opt
notify_observers('options')
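## Usage sketch (ProjectTracker is a hypothetical observer): an observer of
## the object named 'project' must provide a project_changed method, since
## notify_observers builds the callback name as '<name>_changed'.
##
## class ProjectTracker:
##     def project_changed(self, project):
##         print 'current project:', project.name
##
## add_project_observer(ProjectTracker())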

444
laydi/navigator.py Normal file

@@ -0,0 +1,444 @@
import gtk
import gobject
import plots
import time
import fluents
from logger import logger
import dataset, plots, project, workflow, main
import scipy
class NavigatorView (gtk.TreeView):
"""The NavigatorView is a tree view of the project.
There is always one NavigatorView, that shows the functions, plots and
datasets in the current project.
"""
def __init__(self):
if main.project:
self.data_tree = main.project.data_tree
else:
self.data_tree = None
gtk.TreeView.__init__(self)
# Various properties
self.set_enable_tree_lines(True)
self.set_headers_visible(False)
self.get_hadjustment().set_value(0)
# Selection Mode
self.get_selection().set_mode(gtk.SELECTION_MULTIPLE)
self.get_selection().connect('changed',self.on_selection_changed)
self._previous_selection = []
# Setting up TextRenderers etc
self.connect('row_activated', self.on_row_activated)
self.connect('cursor_changed', self.on_cursor_changed)
# Activate context menu
self.menu = NavigatorMenu(self)
self.connect('popup_menu', self.popup_menu)
self.connect('button_press_event', self.on_mouse_event)
self.textrenderer = textrenderer = gtk.CellRendererText()
pixbufrenderer = gtk.CellRendererPixbuf()
self.object_col = gtk.TreeViewColumn('Object')
self.object_col.pack_start(pixbufrenderer,expand=False)
self.object_col.pack_start(textrenderer,expand=False)
self.object_col.set_attributes(textrenderer, cell_background=3,
foreground=4, text=0)
self.object_col.set_attributes(pixbufrenderer, pixbuf=5)
self.append_column(self.object_col)
# send events to plots / itself
self.enable_model_drag_source(gtk.gdk.BUTTON1_MASK,
[("GTK_TREE_MODEL_ROW", gtk.TARGET_SAME_APP, 7)],
gtk.gdk.ACTION_LINK | gtk.gdk.ACTION_MOVE)
self.connect("drag-data-get",self.slot_drag_data)
logger.debug('Initializing navigator window.')
def slot_drag_data(self, treeview, context, selection, target_id, etime):
"""Sets the data for a drag event."""
treeselection = treeview.get_selection()
model, paths = treeselection.get_selected_rows()
if paths:
self.data_tree.drag_data_get(paths[0], selection)
def add_project(self, project):
"""Dependency injection."""
self.data_tree = project.data_tree
self.set_model(project.data_tree)
self.data_tree.connect('row-changed',self.on_row_changed)
def on_selection_changed(self, selection):
"""Update the list of currently selected datasets."""
# update prev selection right away in case of multiple events
model, paths = selection.get_selected_rows()
if not paths: # a plot is marked: do nothing
return
tmp = self._previous_selection
self._previous_selection = paths
tree = self.data_tree
# set timestamp on newly selected objects
[tree.set_value(tree.get_iter(path), 6, time.time())
for path in paths if path not in tmp]
objs = [tree.get_iter(path) for path in paths]
objs = [(tree[iter][6], tree[iter][2]) for iter in objs]
objs.sort()
objs = [obj for timestamp, obj in objs]
if objs and isinstance(objs[0], dataset.Dataset):
logger.debug('Selecting dataset')
main.project.current_data = objs
else:
logger.debug('Deselecting dataset')
main.project.current_data = []
def on_row_changed(self, treestore, pos, iter):
"""Set correct focus and colours when rows have changed."""
obj = treestore[iter][2]
obj_type = treestore[iter][1]
if not (obj or obj_type):
return
self.expand_to_path(pos)
if isinstance(obj, dataset.Dataset):
self.set_cursor(pos)
self.grab_focus()
def on_row_activated(self, widget, path, column):
tree_iter = self.data_tree.get_iter(path)
obj = self.data_tree.get_value(tree_iter, 2)
if isinstance(obj, plots.Plot):
logger.debug('Activating plot')
main.application.change_plot(obj)
elif isinstance(obj, dataset.Dataset):
pass
elif obj == None:
children = []
i = self.data_tree.iter_children(tree_iter)
while i:
child = self.data_tree.get(i, 2)[0]
if isinstance(child, plots.Plot):
children.append(child)
i = self.data_tree.iter_next(i)
main.application.change_plots(children)
else:
t = type(obj)
logger.notice('Activated datatype was %s. Don\'t know what to do.' % t)
def popup_menu(self, *rest):
self.menu.popup(None, None, None, 0, 0)
def on_mouse_event(self, widget, event):
path = widget.get_path_at_pos(int(event.x), int(event.y))
if path:
iter = self.data_tree.get_iter(path[0])
obj = self.data_tree.get_value(iter, 2)
else:
iter = None
obj = None
if isinstance(obj, dataset.Dataset):
self.menu.set_dataset(obj, iter)
else:
self.menu.set_dataset(None, iter)
if event.button == 3:
self.menu.popup(None, None, None, event.button, event.time)
def on_cursor_changed(self, widget):
"""Update statusbar to contain dataset information.
Lists the dimensions of a dataset in the statusbar of the program
if a dataset is focused in the navigator.
"""
path = widget.get_cursor()[0]
tree_iter = self.data_tree.get_iter(path)
obj = self.data_tree.get_value(tree_iter, 2)
if isinstance(obj, dataset.Dataset):
dims = zip(obj.get_dim_name(), obj.shape)
dim_text = ", ".join(["%s (%d)" % dim for dim in dims])
else:
dim_text = ""
main.application['appbar1'].push(dim_text)
class NavigatorMenu(gtk.Menu):
def __init__(self, navigator):
gtk.Menu.__init__(self)
self.navigator = navigator
self.dataset = None
self.tree_iter = None
# Populate main menu
self.load_item = gtk.MenuItem('Load dataset')
self.load_item.connect('activate', self.on_load_dataset, navigator)
self.append(self.load_item)
self.load_item.show()
self.save_item = gtk.MenuItem('Save dataset')
self.save_item.connect('activate', self.on_save_dataset, navigator)
self.append(self.save_item)
self.save_item.show()
self.delete_item = gtk.MenuItem('Delete')
self.delete_item.connect('activate', self.on_delete, navigator)
self.append(self.delete_item)
self.delete_item.show()
self.split_item = gtk.MenuItem('Split on selection')
self.split_item.connect('activate', self.on_split, navigator)
self.append(self.split_item)
self.split_item.show()
# Build transform sub menu
self.trans_menu = gtk.Menu()
self.trans_tr_item = gtk.MenuItem('Transpose')
self.trans_tr_item.connect('activate', self.on_transpose, navigator)
self.trans_menu.append(self.trans_tr_item)
self.trans_tr_item.show()
self.trans_stdr_item = gtk.MenuItem('Std. rows')
self.trans_stdr_item.connect('activate', self.on_standardise_rows, navigator)
self.trans_menu.append(self.trans_stdr_item)
self.trans_stdr_item.show()
self.trans_stdc_item = gtk.MenuItem('Std. cols')
self.trans_stdc_item.connect('activate', self.on_standardise_cols, navigator)
self.trans_menu.append(self.trans_stdc_item)
self.trans_stdc_item.show()
self.trans_log_item = gtk.MenuItem('Log')
self.trans_log_item.connect('activate', self.on_log, navigator)
self.trans_menu.append(self.trans_log_item)
self.trans_log_item.show()
self.trans_item = gtk.MenuItem("Transformation")
self.append(self.trans_item)
self.trans_item.set_submenu(self.trans_menu)
self.trans_item.show()
# Build plot sub menu
self.plot_menu = gtk.Menu()
self.plot_image_item = gtk.MenuItem('Image Plot')
self.plot_image_item.connect('activate', self.on_plot_image, navigator)
self.plot_menu.append(self.plot_image_item)
self.plot_image_item.show()
self.plot_hist_item = gtk.MenuItem('Histogram')
self.plot_hist_item.connect('activate', self.on_plot_hist, navigator)
self.plot_menu.append(self.plot_hist_item)
self.plot_hist_item.show()
self.plot_scatter_item = gtk.MenuItem('Scatter')
self.plot_scatter_item.connect('activate', self.on_plot_scatter, navigator)
self.plot_menu.append(self.plot_scatter_item)
self.plot_scatter_item.show()
self.plot_line_item = gtk.MenuItem('Line view')
self.plot_line_item.connect('activate', self.on_plot_line, navigator)
self.plot_menu.append(self.plot_line_item)
self.plot_line_item.show()
self.plot_item = gtk.MenuItem('Plot')
self.append(self.plot_item)
self.plot_item.set_submenu(self.plot_menu)
self.plot_item.show()
def set_dataset(self, ds, it):
self.dataset = ds
self.tree_iter = it
if ds == None:
self.save_item.set_property('sensitive', False)
self.plot_item.set_property('sensitive', False)
self.trans_item.set_property('sensitive', False)
else:
self.save_item.set_property('sensitive', True)
self.plot_item.set_property('sensitive', True)
self.trans_item.set_property('sensitive', True)
def load_dataset(self, filename):
"""Load the dataset from the given file and add it to the project."""
ds = dataset.read_ftsv(filename)
if isinstance(ds, dataset.GraphDataset):
icon = fluents.icon_factory.get("graph_dataset")
elif isinstance(ds, dataset.CategoryDataset):
icon = fluents.icon_factory.get("category_dataset")
else:
icon = fluents.icon_factory.get("dataset")
main.project.add_dataset(ds)
main.project.data_tree_insert(None, ds.get_name(), ds, None, "black", icon)
def on_load_dataset(self, item, navigator):
# Set up file chooser.
dialog = gtk.FileChooserDialog('Load dataset')
dialog.set_action(gtk.FILE_CHOOSER_ACTION_OPEN)
dialog.add_buttons(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
gtk.STOCK_OPEN, gtk.RESPONSE_OK)
dialog.set_select_multiple(True)
dialog.set_current_folder(main.options.datadir)
retval = dialog.run()
if retval in [gtk.RESPONSE_CANCEL, gtk.RESPONSE_DELETE_EVENT]:
pass
elif retval == gtk.RESPONSE_OK:
for filename in dialog.get_filenames():
self.load_dataset(filename)
else:
print "unknown; ", retval
dialog.destroy()
def on_save_dataset(self, item, navigator):
dialog = gtk.FileChooserDialog('Save dataset')
dialog.set_action(gtk.FILE_CHOOSER_ACTION_SAVE)
dialog.add_buttons(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL, gtk.STOCK_SAVE, gtk.RESPONSE_OK)
dialog.set_current_name("%s.ftsv" % self.dataset.get_name())
retval = dialog.run()
if retval in [gtk.RESPONSE_CANCEL, gtk.RESPONSE_DELETE_EVENT]:
logger.debug("Cancelled save dataset")
elif retval == gtk.RESPONSE_OK:
logger.debug("Saving dataset as: %s" % dialog.get_filename())
fd = open(dialog.get_filename(), 'w')
dataset.write_ftsv(fd, self.dataset)
fd.close()
else:
print "unknown; ", retval
dialog.destroy()
def on_delete(self, item, navigator):
tm, rows = navigator.get_selection().get_selected_rows()
iters = [tm.get_iter(r) for r in rows]
iters.reverse()
for i in iters:
main.project.delete_data(i)
# tm.remove(i)
def on_plot_image(self, item, navigator):
plot = plots.ImagePlot(self.dataset, name='Image Plot')
icon = fluents.icon_factory.get("line_plot")
main.project.data_tree_insert(self.tree_iter, 'Image Plot', plot, None, "black", icon)
# fixme: image plot selections are not well defined
#plot.set_selection_listener(project.set_selection)
#project._selection_observers.append(plot)
def on_plot_hist(self, item, navigator):
project = main.project
plot = plots.HistogramPlot(self.dataset, name='Histogram')
icon = fluents.icon_factory.get("line_plot")
project.data_tree_insert(self.tree_iter, 'Histogram', plot, None, "black", icon)
plot.set_selection_listener(project.set_selection)
project._selection_observers.append(plot)
def on_plot_scatter(self, item, navigator):
project = main.project
datasets = main.project.current_data
ds_major = datasets[0]
dims_major = ds_major.get_dim_name()
ids_major = ds_major.get_identifiers(dims_major[1], sorted=True)
if len(datasets) > 1:
# If there is more than one active dataset -> try to use the two first
ds_minor = datasets[1]
dims_minor = ds_minor.get_dim_name()
if dims_minor != dims_major or ds_minor.shape[0] != ds_major.shape[0]:
# the selected datasets are not matched -> use initial selected
ds_minor = ds_major
else:
#Only one dataset selected
ds_minor = ds_major
plot = plots.ScatterPlot(ds_major, ds_minor,
dims_major[0], dims_major[1],
ids_major[0], ids_major[1],
name='Scatter (%s)' % ds_major.get_name())
plot.add_axes_spin_buttons(len(ids_major), 0, 1)
icon = fluents.icon_factory.get("line_plot")
project.data_tree_insert(self.tree_iter, 'Scatter', plot, None, "black", icon)
plot.set_selection_listener(project.set_selection)
project._selection_observers.append(plot)
def on_plot_line(self, item, navigator):
project = main.project
ds = self.dataset
dims = ds.get_dim_name()
ids = ds.get_identifiers(dims[1])
plot = plots.LineViewPlot(ds, name='Line (%s)' % ds.get_name())
icon = fluents.icon_factory.get("line_plot")
project.data_tree_insert(self.tree_iter, 'Line view', plot, None, "black", icon)
plot.set_selection_listener(project.set_selection)
project._selection_observers.append(plot)
def on_transpose(self, item, navigator):
project = main.project
ds = self.dataset.transpose()
ds._name = ds._name + ".T"
icon = fluents.icon_factory.get(ds)
project.data_tree_insert(self.tree_iter, ds.get_name(), ds, None, "black", icon)
def on_standardise_rows(self, item, navigator):
project = main.project
ds = self.dataset.copy()
ds._name = self.dataset._name + ".rsc"
axis = 1
ds._array = ds.asarray()/scipy.expand_dims(ds.asarray().std(axis), axis)
icon = fluents.icon_factory.get(ds)
project.data_tree_insert(self.tree_iter, ds.get_name(), ds, None, "black", icon)
def on_standardise_cols(self, item, navigator):
project = main.project
ds = self.dataset.copy()
ds._name = self.dataset._name + ".csc"
axis = 0
ds._array = ds.asarray()/scipy.expand_dims(ds.asarray().std(axis), axis)
icon = fluents.icon_factory.get(ds)
project.data_tree_insert(self.tree_iter, ds.get_name(), ds, None, "black", icon)
def on_log(self, item, navigator):
project = main.project
if not scipy.all(self.dataset.asarray()>0):
logger.log('warning', 'Dataset needs to be strictly positive for a log transform')
return
ds = self.dataset.copy()
ds._array = scipy.log(ds.asarray())
icon = fluents.icon_factory.get(ds)
ds._name = ds._name + ".log"
project.data_tree_insert(self.tree_iter, ds.get_name(), ds, None, "black", icon)
def on_split(self, item, navigator):
if self.dataset is None:
logger.warn("Only datasets can be split.")
return
dim = self.dataset.get_dim_name(0)
project = main.project
sel_ids = set(project.get_selection()[dim])
sel_ds = self.dataset.subdata(dim, sel_ids)
unsel_ids = set(self.dataset.get_identifiers(dim)) - set(sel_ids)
unsel_ds = self.dataset.subdata(dim, unsel_ids)
icon = fluents.icon_factory.get(self.dataset)
project.data_tree_insert(self.tree_iter, 'Selected', sel_ds, None, "black", icon)
project.data_tree_insert(self.tree_iter, 'Unselected', unsel_ds, None, "black", icon)

7
laydi/paths.py.m4 Normal file
View File

@@ -0,0 +1,7 @@
PREFIX = "M4_PREFIX"
BINDIR = "M4_BINDIR"
DATADIR = "M4_DATADIR"
DOCDIR = "M4_DOCDIR"
PYDIR = "M4_PYDIR"

1138
laydi/pca_options.glade Normal file

File diff suppressed because it is too large

1195
laydi/plots.py Normal file

File diff suppressed because it is too large

1092
laydi/pls_options.glade Normal file

File diff suppressed because it is too large

154
laydi/project.py Normal file

@@ -0,0 +1,154 @@
import os
import scipy
import gobject
import gtk
import fluents
import logger, dataset, plots, main
class Project:
"""A Project contains datasets, selections etc.
The project, of which the application has only one at any given time,
is the container for all datasets, plots and selections in use. The data
in the project is organized in a gtk.TreeStore that is displayed in the
navigator.
"""
def __init__(self, name="Testing"):
self.data_tree = gtk.TreeStore(str,
str,
object,
str,
str,
gobject.TYPE_OBJECT,
float)
self.name = name
self.dim_names = []
self._selection_observers = []
self._dataset_observers = []
self.current_data = []
self.datasets = []
self.sel_obj = dataset.Selection('Current Selection')
self.selections = []
self._last_selection = None
self._dataset_iter_map = {}
def add_selection_observer(self, observer):
self._selection_observers.append(observer)
observer.selection_changed(None, self.get_selection())
def notify_selection_listeners(self, dim_name):
"""Notifies observers"""
for observer in self._selection_observers:
observer.selection_changed(dim_name, self.get_selection())
def add_dataset_observer(self, observer):
self._dataset_observers.append(observer)
observer.dataset_changed()
def notify_dataset_listeners(self):
"""Notifies observers when new datasets are added"""
for observer in self._dataset_observers:
observer.dataset_changed()
def set_selection(self, dim_name, selection):
"""Sets a current selection and notify observers"""
self.sel_obj[dim_name] = set(selection)
self.notify_selection_listeners(dim_name)
self._last_selection = selection
def get_selection(self):
"""Returns the current selection object"""
return self.sel_obj
def delete_data(self, it):
"""Delete elements from the project."""
child = self.data_tree.iter_children(it)
while child != None:
c = self.data_tree.iter_next(child)
self.delete_data(child)
child = c
main.application.main_view.remove_view(self.data_tree.get(it, 2)[0])
self.data_tree.remove(it)
def add_data(self, parents, data, fun='Function'):
"""Adds a set of data and plots to the navigator.
This method is usually called after a Function in a workflow
has finished and returns its output."""
if len(parents) > 0:
parent_iter = self._dataset_iter_map[parents[0]]
else:
parent_iter = None
# Add the function node to the tree
icon = fluents.icon_factory.get("folder_grey")
it = self.data_tree_insert(parent_iter, fun, None, None, "black", icon)
# Add all returned datasets/plots/selections
for d in data:
# Any kind of dataset
if isinstance(d, dataset.Dataset):
if isinstance(d, dataset.GraphDataset):
icon = fluents.icon_factory.get("graph_dataset")
elif isinstance(d, dataset.CategoryDataset):
icon = fluents.icon_factory.get("category_dataset")
else:
icon = fluents.icon_factory.get("dataset")
self.add_dataset(d)
self.data_tree_insert(it, d.get_name(), d, None, "black", icon)
# Any kind of plot
elif isinstance(d, plots.Plot):
icon = fluents.icon_factory.get("line_plot")
self.data_tree_insert(it, d.get_title(), d, None, "black", icon)
d.set_selection_listener(self.set_selection)
self._selection_observers.append(d)
# Selections are not added to the data tree
elif isinstance(d, dataset.Selection):
self.add_selection(d)
def data_tree_insert(self, parent, text, data, bg, fg, icon, selected = 0):
"""Inserts data into the tree view.
@param text: The title of the object.
@param data: A dataset, plot or function object.
@param bg: Background color.
@param fg: Foreground (font) color.
@param icon: Pixmap icon.
"""
tree = self.data_tree
it = tree.append(parent)
tree[it] = [text, type(data), data, bg, fg, icon, selected]
self._dataset_iter_map[data] = it
return it
def add_dataset(self, dataset):
"""Appends a new Dataset to the project."""
logger.log('debug','Adding dataset: %s' %dataset.get_name())
self.datasets.append(dataset)
for dim_name in dataset.get_all_dims():
if dim_name not in self.dim_names:
self.dim_names.append(dim_name)
self.sel_obj[dim_name] = set()
self.notify_selection_listeners(dim_name)
self.notify_dataset_listeners()
def add_selection(self, selection):
"""Adds a new selection to the project."""
self.selections.append(selection)
self.notify_dataset_listeners()
def object_at(self, path):
"""Returns the object at a given path in the tree."""
it = self.data_tree.get_iter(path)
obj = self.data_tree[it][2]
if obj:
obj.show()
return obj
#def set_current_data(self, obj):
# self.current_data = obj
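## Usage sketch (illustrative; my_dataset and my_plot are hypothetical
## objects): a finished workflow function hands its output to the project,
## which files everything under a function node in the navigator tree.
##
## proj = Project('demo')
## proj.add_data([], [my_dataset, my_plot], fun='My analysis')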

659
laydi/selections.py Normal file

@@ -0,0 +1,659 @@
import pygtk
import gtk
import gtk.gdk
import gtk.glade
import gnome
import gnome.ui
import gobject
import scipy
import logger, dataset, main
import annotations
from lib import hypergeom
class SimpleMenu(gtk.Menu):
def __init__(self):
gtk.Menu.__init__(self)
def add_simple_item(self, title, function, *args):
item = gtk.MenuItem(title)
item.connect('activate', function, *args)
self.append(item)
item.show()
class IdListController:
"""Controller class for the identifier list."""
def __init__(self, idlist):
self._idlist = idlist
self._idlist.get_selection().set_mode(gtk.SELECTION_MULTIPLE)
self._idlist.set_rubber_banding(True)
# dimname: current_annotation_name
self._annotation = {}
# current dimension
self._dimension = None
# id, annotation
self._idstore = gtk.ListStore(gobject.TYPE_STRING,
gobject.TYPE_STRING)
self._idstore.set_sort_func(0, self._numeric_compare)
# Annotation tree column
self._annotation_column = None
## Set up identifier list
idlist.set_model(self._idstore)
renderer = gtk.CellRendererText()
dim_column = gtk.TreeViewColumn('Identifiers', renderer, text=0)
dim_column.set_sort_indicator(True)
dim_column.set_sort_column_id(0)
dim_column.set_sort_order(gtk.SORT_ASCENDING)
idlist.insert_column(dim_column, 0)
idlist.connect('button-press-event', self._button_pressed)
## Enable dropping
idlist.drag_dest_set(gtk.DEST_DEFAULT_ALL,
[("GTK_TREE_MODEL_ROW", gtk.TARGET_SAME_APP, 7)],
gtk.gdk.ACTION_LINK)
idlist.connect('drag-data-received', self._drag_data_received)
## Set up identifier list context menu
menu = self._menu = SimpleMenu()
menu.add_simple_item('Import...', self._on_import_list)
menu.add_simple_item('Export...', self._on_export_list)
menu.add_simple_item('Add to selection', self._on_make_selection)
item = gtk.MenuItem('Show annotations')
menu.append(item)
item.show()
self._menu_ann = item
##
## Public interface
##
def set_dimension(self, dimname):
"""Set dimension"""
if dimname == self._dimension:
return
self._dimension = dimname
self.set_annotation(self._annotation.get(dimname, None))
if not self._annotation.has_key(dimname):
self._annotation[dimname] = None
def set_annotation(self, annotation):
"""Set the displayed annotation to annotation. If annotation is None,
the annotation column is hidden. Otherwise the annotation column is
shown and filled with values from the given annotation field."""
if annotation == None:
if self._annotation_column != None:
self._idlist.remove_column(self._annotation_column)
self._annotation_column = None
else:
idlist = [x[0] for x in self._idstore]
annlist = annotations.get_dim_annotations(self._dimension,
annotation,
idlist)
for i, x in enumerate(self._idstore):
x[1] = annlist[i]
if self._annotation_column == None:
renderer = gtk.CellRendererText()
col = gtk.TreeViewColumn(annotation, renderer, text=1)
col.set_sort_indicator(True)
col.set_sort_column_id(1)
col.set_sort_order(gtk.SORT_ASCENDING)
self._idlist.append_column(col)
self._annotation_column = col
self._annotation_column.set_title(annotation)
self._annotation[self._dimension] = annotation
def set_selection(self, selection):
"""Set the selection to be displayed.
The selection is not stored; the values are copied into the TreeStore."""
self._idstore.clear()
# Return if no selection
if selection == None:
return
# Otherwise show selection, possibly with annotations.
#id_list = list(selection[self._dimension])
idlist = list(selection[self._dimension])
if self._annotation[self._dimension] != None:
annlist = annotations.get_dim_annotations(self._dimension,
self._annotation[self._dimension],
idlist)
for id, ann in zip(idlist, annlist):
self._idstore.append((id, ann))
else:
for e in idlist:
self._idstore.append((e, None))
##
## Private interface
##
def _update_annotations_menu(self):
"""Updates the annotations menu with the available annotations for the
current dim."""
dim_h = annotations.get_dim_handler(self._dimension)
if not dim_h:
print "set_sensitive(False)"
self._menu_ann.set_sensitive(False)
else:
annotations_menu = gtk.Menu()
print "set_sensitive(True)"
self._menu_ann.set_sensitive(True)
dh = annotations.get_dim_handler(self._dimension)
ann_names = dh.get_annotation_names()
for ann in ann_names:
item = gtk.MenuItem(ann)
item.connect('activate', self._on_annotation_activated, ann)
annotations_menu.append(item)
item.show()
self._menu_ann.set_submenu(annotations_menu)
def import_annotation_file(self):
"""Pops up a file dialog and ask the user to select the annotation
file to be loaded. Only one file can be selected. The file is loaded
into a annotations.AnnotationDictHandler object"""
dialog = gtk.FileChooserDialog('Load annotations')
dialog.set_action(gtk.FILE_CHOOSER_ACTION_OPEN)
dialog.add_buttons(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
gtk.STOCK_OPEN, gtk.RESPONSE_OK)
dialog.set_select_multiple(True)
retval = dialog.run()
if retval in [gtk.RESPONSE_CANCEL, gtk.RESPONSE_DELETE_EVENT]:
pass
elif retval == gtk.RESPONSE_OK:
for filename in dialog.get_filenames():
annotations.read_annotations_file(filename)
else:
print "unknown; ", retval
dialog.destroy()
def export_annotations(self):
"""Pops up a file dialog and ask the user to select a file to save
the currently displayed annotations to.
"""
dialog = gtk.FileChooserDialog('Load annotations')
dialog.set_action(gtk.FILE_CHOOSER_ACTION_SAVE)
dialog.add_buttons(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
gtk.STOCK_SAVE, gtk.RESPONSE_OK)
retval = dialog.run()
if retval in [gtk.RESPONSE_CANCEL, gtk.RESPONSE_DELETE_EVENT]:
pass
elif retval == gtk.RESPONSE_OK:
filename = dialog.get_filename()
fd = open(filename, 'w')
dim = self._dimension
print >> fd, "%s\t%s" % (dim, self._annotation[dim])
for id, value in self._idstore:
print >> fd, "%s\t%s" % (id, value)
fd.close()
else:
print "unknown; ", retval
dialog.destroy()
def set_rank(self, ds):
print "Set rank."
ra = scipy.sum(ds.asarray(), 1)
ranks = {}
dim = ds.get_dim_name()[0]
for key, value in ds[dim].items():
ranks[key] = ra[value]
ann_h = annotations.get_dim_handler(self._dimension)
if ann_h is None:
ann_h = annotations.DictAnnotationHandler()
annotations.set_dim_handler(self._dimension, ann_h)
ann_h.add_annotations('Rank', ranks)
##
## GTK Callbacks
##
def _numeric_compare(self, treemodel, iter1, iter2):
column = treemodel.get_sort_column_id()[0]
item1 = treemodel.get_value(iter1, column)
item2 = treemodel.get_value(iter2, column)
try:
item1 = float(item1)
item2 = float(item2)
except:
logger.log("notice", "Could not convert to float: %s, %s" %(item1, item2))
return cmp(item1, item2)
def _popup_menu(self, *rest):
self._update_annotations_menu()
self._menu.popup(None, None, None, 0, 0)
def _on_annotation_activated(self, menuitem, annotation):
self.set_annotation(annotation)
def _button_pressed(self, widget, event):
if event.button == 3:
self._update_annotations_menu()
self._menu.popup(None, None, None, event.button, event.time)
def _on_export_list(self, menuitem):
self.export_annotations()
def _on_import_list(self, menuitem):
self.import_annotation_file()
def _on_make_selection(self, menuitem):
selection = self._idlist.get_selection()
model, paths = selection.get_selected_rows()
if paths==None: return
iters = [self._idstore.get_iter(p) for p in paths]
ids = [self._idstore.get_value(i, 0) for i in iters]
main.project.set_selection(self._dimension, ids)
def _drag_data_received(self, widget, drag_context, x, y,
selection, info, timestamp):
treestore, path = selection.tree_get_row_drag_data()
i = treestore.get_iter(path)
obj = treestore.get_value(i, 2)
if isinstance(obj, dataset.Dataset):
if self._dimension in obj.get_dim_name():
self.set_rank(obj)
widget.emit_stop_by_name('drag-data-received')
class SelectionListController:
def __init__(self, seltree, idlist_controller):
self._seltree = seltree
self._sel_stores = {}
self._detail_cols = []
self._dimension = None
self._idlist_controller = idlist_controller
self._details_on = False
# Selection column
renderer = gtk.CellRendererText()
sel_column = gtk.TreeViewColumn('Selection', renderer, text=0)
sel_column.set_resizable(True)
sel_column.set_max_width(200)
seltree.insert_column(sel_column, 0)
# Detail columns
cols = [('In CS', 3), ('All', 4), ('Rank', 5)]
for name, store_col_num in cols:
col = gtk.TreeViewColumn(name, renderer, text=store_col_num)
col.set_sort_indicator(True)
col.set_sort_column_id(store_col_num)
col.set_sort_order(gtk.SORT_ASCENDING)
self._detail_cols.append(col)
# Signals
seltree.connect('row-activated', self._on_row_activated)
seltree.connect('cursor-changed', self._on_cursor_changed)
seltree.connect('button-press-event', self._on_button_pressed)
seltree.drag_dest_set(gtk.DEST_DEFAULT_ALL,
[("GTK_TREE_MODEL_ROW", gtk.TARGET_SAME_APP, 7)],
gtk.gdk.ACTION_LINK)
seltree.connect('drag-data-received', self._drag_data_received)
# Selections context menu
self._seltree_menu = SimpleMenu()
self._seltree_menu.add_simple_item('Sort by selection',
self._on_seltree_sort)
self._seltree_menu.add_simple_item('Show details',
self._enable_details, True)
self._seltree_menu.add_simple_item('Hide details',
self._enable_details, False)
#
# Public interface
#
def activate(self):
self._seltree.set_cursor((0,))
def set_project(self, project):
"""Dependency injection."""
main.project.add_selection_observer(self)
def set_dimlist_controller(self, dimlist_controller):
"""Dependency injection of the dimension list controller."""
self._dimlist_controller = dimlist_controller
def set_dimension(self, dim):
"""Set the current dimension, changing the model of the treeview
to match dim. After this the current dimension of the identifier list
is updated."""
self._ensure_selection_store(dim)
self._seltree.set_model(self._sel_stores[dim])
self._idlist_controller.set_dimension(dim)
self._dimension = dim
def selection_changed(self, dimname, selection):
"""Callback function from Project."""
for dim in selection.dims():
self._ensure_selection_store(dim)
store = self._sel_stores[dim]
if not self._get_current_selection_iter(selection, dim):
n = len(selection[dim])
values = (selection.title, selection, dim, n, n, 0)
store.insert_after(None, None, values)
else:
# update size of current selection
for row in store:
if row[1]==selection:
row[3] = row[4] = len(selection[dim])
path = self._seltree.get_cursor()
if path and self._sel_stores.has_key(self._dimension):
it = self._sel_stores[self._dimension].get_iter(path[0])
sel = self._sel_stores[self._dimension].get_value(it, 1)
self._idlist_controller.set_selection(sel)
def add_dataset(self, dataset):
"""Converts a CategoryDataset to Selection objects and adds it to
the selection tree. The name of the dataset will be the parent
node in the tree, and the identifiers along the first axis will
be added as the names of the subselections."""
dim_name = dataset.get_dim_name(0)
self._ensure_selection_store(dim_name)
store = self._sel_stores[dim_name]
di = self._get_dataset_iter(dataset)
if not di:
n_tot = dataset.shape[0]
selection = main.project.get_selection().get(dim_name)
ds_idents = dataset.get_identifiers(dim_name)
n_cs = len(selection.intersection(ds_idents))
values = (dataset.get_name(), dataset, dim_name, n_cs, n_tot, 2)
i = store.insert_after(None, None, values)
for selection in dataset.as_selections():
n_sel = len(selection[dim_name])
values = (selection.title, selection, dim_name, 0, n_sel, 0)
store.insert_after(i, None, values)
#
# Private interface
#
def _add_selection_store(self, dim):
"""Add a new gtk.TreeStore for the selections on a dimension."""
# Create new store
# Two types of lines, one for CategoryDatasets and one for
# Selections. The elements are title, link to dataset or selection,
# name of dimension, num. members in selection, num. in
# intersection with current selection and the rank of selection.
store = gtk.TreeStore(gobject.TYPE_STRING,
gobject.TYPE_PYOBJECT,
gobject.TYPE_STRING,
gobject.TYPE_INT,
gobject.TYPE_INT,
gobject.TYPE_FLOAT)
# Set selection store for this dimension
self._sel_stores[dim] = store
def _ensure_selection_store(self, dim):
"""Ensure that the object has a gtk.TreeStore for the given dimension"""
# Do not overwrite existing stores
if self._sel_stores.has_key(dim):
return
self._add_selection_store(dim)
def _get_dataset_iter(self, ds):
"""Returns the iterator to the selection tree row containing a
given dataset."""
store = self._sel_stores[ds.get_dim_name(0)]
i = store.get_iter_first()
while i:
if store.get_value(i, 1) == ds:
return i
i = store.iter_next(i)
return None
def _get_current_selection_iter(self, selection, dimension):
if not self._sel_stores.has_key(dimension):
return None
store = self._sel_stores[dimension]
i = store.get_iter_first()
while i:
if store.get_value(i, 1) == selection:
if store.get_value(i, 2) == dimension:
return i
i = store.iter_next(i)
return None
def _sort_selections(self, dataset):
"""Ranks selections by intersection with current selection.
Ranks determined by the hypergeometric distribution.
"""
dim_name = dataset.get_dim_name(0)
sel_store = self._sel_stores[dim_name]
selection_obj = main.project.get_selection()
current_selection = selection_obj.get(dim_name)
if current_selection==None: return
pvals = hypergeom.gene_hypergeo_test(current_selection, dataset)
for row in sel_store:
if row[1]==dataset:
for child in row.iterchildren():
name = child[0]
child[3] = pvals[name][0]
child[4] = pvals[name][1]
child[5] = pvals[name][2]
sel_store.set_sort_column_id(5, gtk.SORT_ASCENDING)
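## Note (inferred from the loop above, not from the hypergeom module
## itself): gene_hypergeo_test is expected to map each subselection title
## to a (members in current selection, total members, p-value) tuple; the
## p-value in column 5 then becomes the ascending sort key.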
#
# GTK callbacks
#
def _enable_details(self, widget, bool):
if self._details_on == bool : return
self._details_on = bool
if bool==True:
for col in self._detail_cols:
self._seltree.insert_column(col, -1)
else:
for col in self._detail_cols:
self._seltree.remove_column(col)
def _drag_data_received(self, widget, drag_context, x, y,
selection, info, timestamp):
treestore, path = selection.tree_get_row_drag_data()
i = treestore.get_iter(path)
obj = treestore.get_value(i, 2)
if isinstance(obj, dataset.CategoryDataset):
self.add_dataset(obj)
self._dimlist_controller.set_dimension(obj.get_dim_name(0))
widget.emit_stop_by_name('drag-data-received')
def _on_cursor_changed(self, widget):
"Show the list of identifier strings."
store = self._sel_stores[self._dimension]
p = self._seltree.get_cursor()[0]
i = store.get_iter(p)
obj = store.get_value(i, 1)
if isinstance(obj, dataset.Selection):
self._idlist_controller.set_selection(obj)
else:
self._idlist_controller.set_selection(None)
def _on_row_activated(self, widget, path, column):
store = self._sel_stores[self._dimension]
i = store.get_iter(path)
obj = store.get_value(i, 1)
if isinstance(obj, dataset.Dataset):
seltree = self._seltree
if seltree.row_expanded(path):
seltree.collapse_row(path)
else:
seltree.expand_row(path, True)
elif isinstance(obj, dataset.Selection):
main.project.set_selection(self._dimension,
obj[self._dimension])
def _on_button_pressed(self, widget, event):
"""Button press callbak."""
if event.button == 3:
self._seltree_menu.popup(None, None, None, event.button, event.time)
def _on_seltree_sort(self, menuitem):
"""Sort selection tree if row is category dataset."""
store = self._sel_stores[self._dimension]
p = self._seltree.get_cursor()[0]
i = store.get_iter(p)
obj = store.get_value(i, 1)
if isinstance(obj, dataset.CategoryDataset):
self._sort_selections(obj)
class DimListController:
def __init__(self, dimlist, seltree_controller):
self._current_dim = None
self._seltree_controller = seltree_controller
self.show_hidden = False
## dimstore is a list of all dimensions in the application
self.dimstore = gtk.ListStore(gobject.TYPE_STRING)
# filter for hiding dims prefixed with underscore
self.dimstore_filter = self.dimstore.filter_new()
self.dimstore_filter.set_visible_func(self._dimension_filter)
## The widgets we are controlling
self.dimlist = dimlist
## Set up dimensions list
dimlist.set_model(self.dimstore_filter)
renderer = gtk.CellRendererText()
dim_column = gtk.TreeViewColumn('Dimension', renderer, text=0)
dimlist.insert_column(dim_column, 0)
# Signals
dimlist.connect('row-activated', self._dim_row_activated)
dimlist.connect('cursor-changed', self._dim_cursor_changed)
dimlist.connect('button-press-event', self._dimlist_button_pressed)
# Set up dimension context menu
self._dimlist_menu = SimpleMenu()
self._dimlist_menu.add_simple_item('Hide', self._on_dim_hide)
self._dimlist_menu.add_simple_item('Show all', self._on_dim_show)
##
## Public interface
##
def set_project(self, project):
"""Dependency injection."""
# self.project = project
self.dim_names = project.dim_names
self.update_dims()
project.add_dataset_observer(self)
def get_dimension(self, dim):
"""Returns the iterator to the dimension with the given name, or
None if not found."""
i = self.dimstore_filter.get_iter_first()
while i:
if self.dimstore_filter.get_value(i, 0) == dim:
return i
i = self.dimstore_filter.iter_next(i)
return None
def set_dimension(self, dimname):
"""Sets the current dimension."""
self._current_dim = dimname
dim = self.get_dimension(self._current_dim)
path = self.dimstore_filter.get_path(dim)
if self.dimlist.get_cursor()[0] != path:
self.dimlist.set_cursor(self.dimstore_filter.get_path(dim))
self._seltree_controller.set_dimension(dimname)
def dataset_changed(self):
"""Callback function from Project."""
self.update_dims()
def update_dims(self):
"""Update the list of dimensions shown"""
for dim in self.dim_names:
if not self.get_dimension(dim):
self.dimstore.insert_after(None, (dim,))
self.dimstore_filter.refilter()
#
# Private interface
#
def _dimension_filter(self, store, row):
"""Filters out dimensions with underscore prefix."""
if self.show_hidden:
return True
visible = False
name = store.get_value(row, 0)
if name != None:
visible = name[0]!="_"
return visible
#
# GTK Callbacks.
#
def _on_dim_hide(self, menuitem):
"""Menu item callback function which hides underscore prefixed
dimensions."""
self.show_hidden = False
self.dimstore_filter.refilter()
def _on_dim_show(self, menuitem):
"""Menu item callback function that shows underscore prefixed
dimension names."""
self.show_hidden = True
self.dimstore_filter.refilter()
def _dim_cursor_changed(self, widget):
cursor = self.dimlist.get_cursor()[0]
i = self.dimstore_filter.get_iter(cursor)
row = self.dimstore_filter.get_value(i, 0)
self.set_dimension(row)
self._seltree_controller.activate()
def _dim_row_activated(self, widget, path, column):
#self._seltree_controller.set_dimension(dim)
pass
def _dimlist_button_pressed(self, widget, event):
if event.button == 3:
self._dimlist_menu.popup(None, None, None, event.button, event.time)

1002
laydi/view.py Normal file

File diff suppressed because it is too large

480
laydi/workflow.py Normal file

@@ -0,0 +1,480 @@
import gtk, gobject
import sys
import os
import inspect
import logger
import fluents
import main
def _workflow_classes(modname):
"""Returns a list of all subclasses of Workflow in a given module"""
workflow_classes = []
module = __import__('%s' % (modname,))
d = module.__dict__
for wf in d.values():
try:
if issubclass(wf, Workflow):
workflow_classes.append(wf)
except TypeError, e:
pass
return workflow_classes
def workflow_list():
"""Returns a list containing all new workflows"""
retval = []
# List all .py files that can contain workflow classes
wf_path = sys.modules['workflows'].__path__
wf_files = []
for dir in wf_path:
for fn in os.listdir(dir):
if fn.endswith('.py') and ('#' not in fn):
wf_files.append(fn[:-3])
# Try to load each file and look for Workflow derived classes
for fn in wf_files:
try:
for wf in _workflow_classes(fn):
retval.append(wf)
except Exception, e:
logger.log('warning', 'Cannot load workflow: %s' % fn)
logger.log('warning', e)
return retval
def find_workflow(basename):
"""Searches for a workflow with a given filename."""
print "find_workflow"
# List all .py files that can contain workflow classes
wf_path = main.options.workflowdir.split(';')
wf_file = None
for dir in wf_path:
fn = os.path.join(dir, "%s.py" % basename)
if os.path.isfile(fn):
wf_file = fn
return _workflow_classes(basename)[0]
return None
class Workflow:
"""Defines a workflow that contains a set of analysis stages.
A Workflow is a set of analysis stages for a certain type of analysis.
Each stage contains some possible operations to do accomplish that
task.
"""
name = "Workflow"
ident = None
description = "Workflow Description"
def __init__(self):
print "Setting stages"
self.stages = []
self.stages_by_id = {}
def get_data_file_name(self, filename):
"""Checks if a file with the given name exists in the data directory.
Returns the file name if the file exists in the data directory, which
is defined as datadir/workflowname. If the file does not exist, or the
workflow does not have an identifier, this method returns None."""
if self.ident == None:
return None
fn = os.path.join(main.options.datadir, self.ident, filename)
print fn
if os.path.isfile(fn):
return fn
return None
def add_stage(self, stage):
self.stages.append(stage)
self.stages_by_id[stage.id] = stage
def print_tree(self):
print self.name
for stage in self.stages:
print ' %s' % stage.name
for fun in stage.functions:
print ' %s' % fun.name
# def add_project(self,project):
# if project == None:
# logger.log('notice','Project is empty')
# logger.log('notice','Project added in : %s' %self.name)
# self.project = project
class EmptyWorkflow(Workflow):
name = 'Empty Workflow'
def __init__(self):
print "initing empty workflow"
Workflow.__init__(self)
class Stage:
"""A stage is a part of the data analysis process.
Each stage contains a set of functions that can be used to
accomplish the task. A typical early stage is 'preprocessing', which
can be done in several ways, each represented by a function.
"""
def __init__(self, id, name):
self.id = id
self.name = name
self.functions = []
self.functions_by_id = {}
def add_function(self, fun):
self.functions.append(fun)
self.functions_by_id[fun.id] = fun
class Function:
"""A Function object encapsulates a function on a data set.
Each Function instance encapsulates some function that can be applied
to one or more types of data.
"""
def __init__(self, id, name):
self.id = id
self.name = name
    def validate_input(self):
        # Subclasses should override this; by default every input is
        # accepted.
        return Validation(True, "Validation Not Implemented")
def run(self):
pass
class Validation:
    def __init__(self, result, reason):
self.succeeded = result
self.reason = reason
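
## Illustrative sketch (kept commented out on purpose, since workflow
## discovery picks up every Workflow subclass it can import): how the
## classes above fit together. LogTransform, 'preprocess' and MyWorkflow
## are hypothetical names.
#
# class LogTransform(Function):
#     def __init__(self):
#         Function.__init__(self, 'log', 'Log transform')
#     def validate_input(self):
#         return Validation(True, 'ok')
#     def run(self, data):
#         return None
#
# class MyWorkflow(Workflow):
#     name = 'My Workflow'
#     ident = 'my_workflow'
#     def __init__(self):
#         Workflow.__init__(self)
#         stage = Stage('preprocess', 'Preprocessing')
#         stage.add_function(LogTransform())
#         self.add_stage(stage)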
class WorkflowView (gtk.VBox):
def __init__(self, wf):
gtk.VBox.__init__(self)
self.workflow = wf
self.setup_workflow(wf)
def setup_workflow(self, wf):
# Add stage in the process
for stage in wf.stages:
exp = gtk.Expander(stage.name)
btn_align = gtk.Alignment(xscale=0.9)
btn_align.set_padding(0,4,20,0)
btn_align.show()
btn_box = gtk.VBox()
btn_align.add(btn_box)
btn_box.show()
exp.add(btn_align)
# Add functions in each stage
for fun in stage.functions:
btn = gtk.Button(fun.name)
btn.connect('clicked',
lambda button, f=fun : run_function(f))
btn_box.add(btn)
btn.show()
exp.show()
self.pack_start(exp, expand=False, fill=False)
def remove_workflow(self):
for c in self.get_children():
c.hide()
self.remove(c)
def set_workflow(self, workflow):
self.workflow = workflow
self.remove_workflow()
self.setup_workflow(workflow)
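
## Sketch (hypothetical usage): the view is a plain gtk.VBox, so the
## application can pack it anywhere and swap workflows at runtime.
#
# view = WorkflowView(EmptyWorkflow())
# wf_class = find_workflow('mywf')
# if wf_class is not None:
#     view.set_workflow(wf_class())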
class Options(dict):
"""Options base class.
"""
    def __init__(self, *args, **kw):
dict.__init__(self, *args, **kw)
self['out_plots'] = []
self['out_data'] = []
self['all_plots'] = []
self['all_data'] = []
def _copy_from_list(self, key_list):
"""Returns suboptions (dictionary) from a list of keys.
"""
d = {}
for key in key_list:
d[key] = self.get(key, None)
return d
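
## Sketch (hypothetical subclass): function-specific options are plain
## dict subclasses. 'all_plots' and 'all_data' hold (object, name, use)
## tuples, which OptionsDialog below shows as toggleable rows; set_output()
## copies the toggled subsets into 'out_plots' and 'out_data'.
#
# class PcaOptions(Options):
#     def __init__(self):
#         Options.__init__(self)
#         self['n_components'] = 2
#         self['all_plots'] = [(None, 'Scores plot', True),
#                              (None, 'Loadings plot', False)]
#         self['all_data'] = [(None, 'Scores', True)]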
class OptionsDialog(gtk.Dialog):
"""The basic input/output dialog box.
This defines the first page of the function options-gui.
    Any function that invokes an options-gui will inherit from this class.
"""
def __init__(self, data, options, input_names=['X','Y']):
gtk.Dialog.__init__(self, 'Input-Output dialog',
None,
gtk.DIALOG_DESTROY_WITH_PARENT,
(gtk.STOCK_OK, gtk.RESPONSE_OK,
gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL))
self._options = options
self._data = data
self._editable = True
self.set_size_request(550,450)
# create notebook
self.nb = nb = gtk.Notebook()
# 1. page: input/output
#inputs
input_frame = gtk.Frame("Input")
hbox = gtk.HBox(True, 8)
align = gtk.Alignment(1, 1, 1, 1)
align.set_padding(8, 8, 8, 8)
align.add(hbox)
input_frame.add(align)
for i, name in enumerate(input_names):
frame = gtk.Frame(name)
frame.set_label_align(0.5, 0.5)
label = gtk.Label(data[i]._name + "\n" + str(data[i]._array.shape))
frame.add(label)
hbox.add(frame)
#outputs
output_frame = gtk.Frame("Output")
output_hbox = gtk.HBox(True,4)
output_align = gtk.Alignment(1, 1, 1, 1)
output_align.set_padding(8, 8, 8, 8) #left padding:8
output_align.add(output_hbox)
output_frame.add(output_align)
# plots
plot_list = gtk.ListStore(str, 'gboolean', gtk.gdk.Pixbuf)
plot_treeview = gtk.TreeView(plot_list)
        # Add plots
        plot_icon = fluents.icon_factory.get('line_plot')
        for plt, name, use in self._options['all_plots']:
            plot_list.append((name, use, plot_icon))
        # Renderer for icon
        icon_renderer = gtk.CellRendererPixbuf()
        icon_renderer.set_property('pixbuf', plot_icon)
# Renderer for active toggle.
active_renderer = gtk.CellRendererToggle()
active_renderer.set_property('mode', gtk.CELL_RENDERER_MODE_ACTIVATABLE)
active_renderer.connect('toggled', toggled, plot_list)
active_column = gtk.TreeViewColumn('Use', active_renderer, active=1)
# Renderer for plot title.
title_renderer = gtk.CellRendererText()
title_renderer.set_property('mode', gtk.CELL_RENDERER_MODE_EDITABLE)
title_column = gtk.TreeViewColumn('Plot', title_renderer, text=0)
title_column.pack_start(icon_renderer, expand=False)
# Add columns to tree view.
plot_treeview.append_column(active_column)
plot_treeview.append_column(title_column)
## datasets
dataset_list = gtk.ListStore(str, 'gboolean', gtk.gdk.Pixbuf)
dataset_treeview = gtk.TreeView(dataset_list)
# Add datasets
data_icon = fluents.icon_factory.get('dataset')
for dat, name, use in self._options['all_data']:
dataset_list.append((name, use, data_icon))
# Renderer for icon
icon_renderer = gtk.CellRendererPixbuf()
icon_renderer.set_property('pixbuf', data_icon)
# Renderer for active toggle.
active_renderer = gtk.CellRendererToggle()
active_renderer.set_property('mode', gtk.CELL_RENDERER_MODE_ACTIVATABLE)
active_renderer.connect('toggled', toggled, dataset_list)
active_column = gtk.TreeViewColumn('Use', active_renderer, active=1)
# Renderer for dataset title.
title_renderer = gtk.CellRendererText()
title_renderer.set_property('mode', gtk.CELL_RENDERER_MODE_EDITABLE)
title_column = gtk.TreeViewColumn('Dataset', title_renderer, text=0)
title_column.pack_start(icon_renderer, expand=False)
# Add columns to tree view.
dataset_treeview.append_column(active_column)
dataset_treeview.append_column(title_column)
# add treeviews to output frame
output_hbox.add(plot_treeview)
output_hbox.add(dataset_treeview)
# vbox for input/spacer/output
vbox1 = gtk.VBox()
vbox1.add(input_frame)
vbox1.add(gtk.HSeparator())
vbox1.add(output_frame)
# add vbox to notebook
nb.insert_page(vbox1, gtk.Label("Input/Output"), 0)
self.vbox.add(nb)
#keep ref to liststores
self.dataset_list = dataset_list
self.plot_list = plot_list
def run(self):
self.vbox.show_all()
return gtk.Dialog.run(self)
def set_options(self, options):
self._options = options
def update_options(self, options):
self._options.update(options)
def set_output(self):
# get toggled output data
        out_data = [item[0] for name, mark, ic in self.dataset_list
                    for item in self._options['all_data']
                    if mark and name == item[1]]
        # get toggled plots
        out_plots = [item[0] for name, mark, ic in self.plot_list
                     for item in self._options['all_plots']
                     if mark and name == item[1]]
# update options
self._options['out_data'] = out_data
self._options['out_plots'] = out_plots
    def set_editable(self, editable):
        self._editable = editable
def set_data(self, data):
self._data = data
def get_data(self):
return self._data
def get_options(self):
return self._options
    def add_page_from_glade(self, glade_file, widget_name, page_title):
        """Adds a new page to the existing notebook.

        The input widget (added as a page in the notebook) is defined
        in the glade file.

        input:
          glade_file -- path to glade file
          widget_name -- name of widget from glade file
        """
        try:
            self.wTree = gtk.glade.XML(glade_file)
        except Exception:
            logger.log('notice', 'Could not find glade file: %s' % glade_file)
            return
        widget = self.wTree.get_widget(widget_name)
        win = widget.get_parent()
        win.hide()
        widget.unparent()
        self.nb.insert_page(widget, gtk.Label(page_title), -1)
        self.nb.set_current_page(0)
def toggled(renderer, path, store):
it = store.get_iter(path)
old_value = store.get_value(it, 1)
store.set_value(it, 1, not old_value)
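
## Sketch of typical dialog usage (X, Y and PcaOptions are hypothetical):
## run() shows the notebook and blocks until a response, and set_output()
## copies the toggled rows back into the options dictionary.
#
# dialog = OptionsDialog([X, Y], PcaOptions())
# if dialog.run() == gtk.RESPONSE_OK:
#     dialog.set_output()
# options = dialog.get_options()
# dialog.destroy()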
class WorkflowMenu (gtk.Menu):
def __init__(self, workflow):
gtk.Menu.__init__(self)
self._workflow = workflow
for stage in workflow.stages:
self.append(self._create_stage_item(stage))
def _create_stage_item(self, stage):
stage_menu_item = gtk.MenuItem(stage.name)
stage_menu_item.show()
stage_menu = gtk.Menu()
stage_menu_item.set_submenu(stage_menu)
for fun in stage.functions:
stage_menu.append(self._create_function_item(fun))
return stage_menu_item
def _create_function_item(self, func):
menuitem = gtk.MenuItem(func.name)
menuitem.connect('activate',
lambda item, f=func : run_function(f))
menuitem.show()
return menuitem
def run_function(function):
logger.log('debug', 'Starting function: %s' % function.name)
parent_data = main.project.current_data
validation = function.validate_input()
if not validation.succeeded:
        logger.log('warning', 'Invalid input data: ' + str(validation.reason))
return
args, varargs, varkw, defaults = inspect.getargspec(function.run)
    # The first argument is 'self', and 'selection' (if present) is passed
    # separately as a keyword argument, so neither counts as a dataset.
args.remove('self')
if "selection" in args:
pass_selection = True
args.remove('selection')
else:
pass_selection = False
if varargs and len(parent_data) < len(args):
logger.log('warning', "Function requires minimum %d datasets selected." % len(args))
return
elif not varargs and args and len(args) != len(parent_data):
# functions requiring datasets have to have the right number
logger.log('warning', "Function requires %d datasets, but only %d selected." % (len(args), len(parent_data)))
return
    if not args:
        # we allow functions requiring no data to be run even if a
        # dataset is selected
data = []
else:
data = parent_data
if pass_selection:
# if the function has a 'selection' argument, we pass in
# the selection
new_data = function.run(selection=main.project.get_selection(), *data)
else:
new_data = function.run(*data)
    if new_data is not None:
main.project.add_data(parent_data, new_data, function.name)
logger.log('debug', 'Function ended: %s' % function.name)
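
## Sketch: run_function() inspects Function.run with inspect.getargspec,
## so a function that wants the current selection simply declares a
## 'selection' argument; the remaining arguments must match the number of
## selected datasets. MarkSelection is a hypothetical example:
#
# class MarkSelection(Function):
#     def __init__(self):
#         Function.__init__(self, 'mark', 'Mark selection')
#     def run(self, X, selection=None):
#         # invoked as run(X, selection=main.project.get_selection())
#         return None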