Projects/laydi
Projects
/
laydi
Archived
7
0
Fork 0

Implemented PhenotypeDataset with tests.

This commit is contained in:
Truls Alexander Tangstad 2006-05-03 14:04:28 +00:00
parent 56a6028547
commit b757da5929
3 changed files with 133 additions and 2 deletions

View File

@ -56,7 +56,7 @@ class Dataset:
if shape != None: if shape != None:
if self.shape!=shape: if self.shape!=shape:
#logger.log("debug","Dataset and input shape mismatch") #logger.log("debug","Dataset and input shape mismatch")
raise ValueError raise ValueError, "Differing in array and provided. %s != %s" % (self.shape, shape)
if identifiers!=None: if identifiers!=None:
self._set_identifiers(identifiers,all_dims) self._set_identifiers(identifiers,all_dims)
else: else:

View File

@ -0,0 +1,81 @@
import unittest
import sys
sys.path.append("../..")
from workflows.affy_workflow import PhenotypeDataset
class PhenotypeDatasetTest(unittest.TestCase):
    """Tests for building a PhenotypeDataset from tab-separated text.

    The fixture strings start at column 0 on purpose: any leading
    whitespace would become part of the parsed identifiers.
    """

    def testEmptyData(self):
        """A table with only the CEL column has no phenotype identifiers."""
        # we have a list of cel-files, but no categories
        cel_data = """\
CEL
02-05-33
03-07-38
"""
        dataset = PhenotypeDataset(cel_data)
        self.assertEqual(['CEL', 'phenotypes'], dataset.get_dim_names())
        self.assertEqual(['02-05-33', '03-07-38'],
                         dataset.get_identifiers('CEL'))
        self.assertEqual([], dataset.get_identifiers('phenotypes'))

    def testFloatData(self):
        """A numeric column is kept as one column named after the header."""
        cel_data = """\
CEL\tage
02-05-33\t8
03-07-38\t9
"""
        dataset = PhenotypeDataset(cel_data)
        self.assertEqual(['CEL', 'phenotypes'], dataset.get_dim_names())
        self.assertEqual(['age'], dataset.get_identifiers('phenotypes'))
        self.assertEqual([[8],
                          [9]], dataset.asarray().tolist())

    def testCategoryData(self):
        """Categories expand to one numeric column for each category choice."""
        cel_data = """\
CEL\tsick
02-05-33\tyes
03-07-38\tno
04-93-33\tyes
08-32-33\tmaybe
"""
        dataset = PhenotypeDataset(cel_data)
        self.assertEqual(['CEL', 'phenotypes'], dataset.get_dim_names())
        self.assertEqual(['sick-yes', 'sick-no', 'sick-maybe'],
                         dataset.get_identifiers('phenotypes'))
        self.assertEqual([[1, 0, 0],
                          [0, 1, 0],
                          [1, 0, 0],
                          [0, 0, 1]], dataset.asarray().tolist())

    def testMultipleCategoriesAndFloats(self):
        """Numeric and category columns keep their original column order."""
        cel_data = """\
CEL\tsex\tage\tinfected
02-05-33\tF\t8\tI
02-05-34\tF\t9\tN
02-05-35\tM\t8\tI
"""
        dataset = PhenotypeDataset(cel_data)
        self.assertEqual(['sex-F', 'sex-M', 'age', 'infected-I', 'infected-N'],
                         dataset.get_identifiers('phenotypes'))
        self.assertEqual([[1, 0, 8, 1, 0],
                          [1, 0, 9, 0, 1],
                          [0, 1, 8, 1, 0]], dataset.asarray().tolist())

    def testGetPhenotypeTable(self):
        """get_phenotype_table() returns the raw rows, header included."""
        cel_data = """\
CEL\tsex\tage\tinfected
02-05-33\tF\t8\tI
02-05-34\tF\t9\tN
02-05-35\tM\t8\tI
"""
        dataset = PhenotypeDataset(cel_data)
        self.assertEqual([['CEL', 'sex', 'age', 'infected'],
                          ['02-05-33', 'F', '8', 'I'],
                          ['02-05-34', 'F', '9', 'N'],
                          ['02-05-35', 'M', '8', 'I']],
                         dataset.get_phenotype_table())
# Run the test suite when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()

View File

@ -1,6 +1,6 @@
import gtk import gtk
from system import dataset, logger, plots, workflow from system import dataset, logger, plots, workflow
from scipy import randn from scipy import randn, array, transpose, zeros
import cPickle import cPickle
@ -185,3 +185,53 @@ class PCAFunction(workflow.Function):
return [T, P, loading_plot, score_plot] return [T, P, loading_plot, score_plot]
class PhenotypeDataset(dataset.Dataset):
    """Dataset built from a tab-separated phenotype table.

    The first line of the text is the header; every following line
    describes one CEL file.  The first field of a line is the CEL
    identifier and the remaining fields are phenotype values.

    A column whose values all parse as numbers becomes a single matrix
    column named after the header.  Any other column is expanded into one
    0/1 indicator column per distinct value, named "<column>-<value>", in
    order of first appearance.
    """

    def __init__(self, string):
        """Parse `string` and initialise the base Dataset from it.

        string -- the tab-separated table text, header row first.

        Raises IndexError when the text has no header or no data rows.
        """
        # The raw rows (header included) are kept for get_phenotype_table().
        self._table = rows = [line.split("\t") for line in string.splitlines()]

        # Transpose the data rows into columns.  list() keeps the result
        # indexable on interpreters where zip() returns an iterator.
        columns = list(zip(*rows[1:]))
        cel_names = columns[0]
        col_names = rows[0][1:]

        phenotypes = []   # output column names, in matrix order
        categories = {}   # output column name -> column values

        for col_name, column in zip(col_names, columns[1:]):
            # Try int first so all-integer columns keep integer values, then
            # float so values such as "8.5" are treated as numbers and not
            # as category labels.
            values = None
            for convert in (int, float):
                try:
                    values = [convert(entry) for entry in column]
                except ValueError:
                    continue
                break

            if values is not None:
                categories[col_name] = values
                phenotypes.append(col_name)
                continue

            # category-data: remember the row indices of every distinct
            # value, and the order in which the values first appear.
            keys = []
            entries = {}
            for i, entry in enumerate(column):
                if entry not in entries:
                    keys.append(entry)
                    entries[entry] = []
                entries[entry].append(i)

            for key in keys:
                indicator = zeros(len(column))
                for i in entries[key]:
                    indicator[i] = 1
                name = "%s-%s" % (col_name, key)
                phenotypes.append(name)
                categories[name] = indicator

        matrix_data = [categories[name] for name in phenotypes]
        if matrix_data:
            # matrix_data holds one entry per phenotype column; transpose so
            # that rows correspond to CEL files.
            a = transpose(array(matrix_data))
        else:
            a = None

        dataset.Dataset.__init__(self, a,
                                 identifiers=[('CEL', cel_names),
                                              ('phenotypes', phenotypes)],
                                 shape=(len(cel_names), len(phenotypes)))

    def get_phenotype_table(self):
        """Return the parsed table as a list of rows of strings, header first."""
        return self._table