Implemented PhenotypeDataset with tests.
This commit is contained in:
parent
56a6028547
commit
b757da5929
|
@ -56,7 +56,7 @@ class Dataset:
|
||||||
if shape != None:
|
if shape != None:
|
||||||
if self.shape!=shape:
|
if self.shape!=shape:
|
||||||
#logger.log("debug","Dataset and input shape mismatch")
|
#logger.log("debug","Dataset and input shape mismatch")
|
||||||
raise ValueError
|
raise ValueError, "Differing in array and provided. %s != %s" % (self.shape, shape)
|
||||||
if identifiers!=None:
|
if identifiers!=None:
|
||||||
self._set_identifiers(identifiers,all_dims)
|
self._set_identifiers(identifiers,all_dims)
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -0,0 +1,81 @@
|
||||||
|
import unittest
|
||||||
|
import sys
|
||||||
|
|
||||||
|
sys.path.append("../..")
|
||||||
|
from workflows.affy_workflow import PhenotypeDataset
|
||||||
|
|
||||||
|
class PhenotypeDatasetTest(unittest.TestCase):
    """Tests for parsing tab-separated phenotype text into a PhenotypeDataset.

    Fixtures are written as explicit string literals (one per table row) so
    their content does not depend on how this file is indented.
    """

    def testEmptyData(self):
        """A table with only the CEL column has no phenotype identifiers."""
        # we have a list of cel-files, but no categories
        cel_data = ("CEL\n"
                    "02-05-33\n"
                    "03-07-38\n")
        dataset = PhenotypeDataset(cel_data)
        self.assertEqual(['CEL', 'phenotypes'], dataset.get_dim_names())
        self.assertEqual(['02-05-33', '03-07-38'],
                         dataset.get_identifiers('CEL'))
        self.assertEqual([], dataset.get_identifiers('phenotypes'))

    def testFloatData(self):
        """A numeric column is kept as a single column of numbers."""
        cel_data = ("CEL\tage\n"
                    "02-05-33\t8\n"
                    "03-07-38\t9\n")
        dataset = PhenotypeDataset(cel_data)
        self.assertEqual(['CEL', 'phenotypes'], dataset.get_dim_names())
        self.assertEqual(['age'], dataset.get_identifiers('phenotypes'))
        self.assertEqual([[8],
                          [9]], dataset.asarray().tolist())

    def testCategoryData(self):
        """Categories expand to one numeric column for each category choice."""
        cel_data = ("CEL\tsick\n"
                    "02-05-33\tyes\n"
                    "03-07-38\tno\n"
                    "04-93-33\tyes\n"
                    "08-32-33\tmaybe\n")
        dataset = PhenotypeDataset(cel_data)
        self.assertEqual(['CEL', 'phenotypes'], dataset.get_dim_names())
        # Indicator columns appear in order of first occurrence of each value.
        self.assertEqual(['sick-yes', 'sick-no', 'sick-maybe'],
                         dataset.get_identifiers('phenotypes'))
        self.assertEqual([[1, 0, 0],
                          [0, 1, 0],
                          [1, 0, 0],
                          [0, 0, 1]], dataset.asarray().tolist())

    def testMultipleCategoriesAndFloats(self):
        """Numeric and categorical columns can be mixed in one table."""
        cel_data = ("CEL\tsex\tage\tinfected\n"
                    "02-05-33\tF\t8\tI\n"
                    "02-05-34\tF\t9\tN\n"
                    "02-05-35\tM\t8\tI\n")
        dataset = PhenotypeDataset(cel_data)
        self.assertEqual(['sex-F', 'sex-M', 'age', 'infected-I', 'infected-N'],
                         dataset.get_identifiers('phenotypes'))

        self.assertEqual([[1, 0, 8, 1, 0],
                          [1, 0, 9, 0, 1],
                          [0, 1, 8, 1, 0]], dataset.asarray().tolist())

    def testGetPhenotypeTable(self):
        """get_phenotype_table() returns the raw rows, header included."""
        cel_data = ("CEL\tsex\tage\tinfected\n"
                    "02-05-33\tF\t8\tI\n"
                    "02-05-34\tF\t9\tN\n"
                    "02-05-35\tM\t8\tI\n")
        dataset = PhenotypeDataset(cel_data)

        self.assertEqual([['CEL', 'sex', 'age', 'infected'],
                          ['02-05-33', 'F', '8', 'I'],
                          ['02-05-34', 'F', '9', 'N'],
                          ['02-05-35', 'M', '8', 'I']],
                         dataset.get_phenotype_table())
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Run the test suite only when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|
|
@ -1,6 +1,6 @@
|
||||||
import gtk
|
import gtk
|
||||||
from system import dataset, logger, plots, workflow
|
from system import dataset, logger, plots, workflow
|
||||||
from scipy import randn
|
from scipy import randn, array, transpose, zeros
|
||||||
import cPickle
|
import cPickle
|
||||||
|
|
||||||
|
|
||||||
|
@ -185,3 +185,53 @@ class PCAFunction(workflow.Function):
|
||||||
|
|
||||||
return [T, P, loading_plot, score_plot]
|
return [T, P, loading_plot, score_plot]
|
||||||
|
|
||||||
|
|
||||||
|
class PhenotypeDataset(dataset.Dataset):
    """Dataset of per-sample phenotype values parsed from tab-separated text.

    The first line of the text is a header: its first field labels the
    sample (CEL file) column and the remaining fields name the phenotype
    columns.  Every following non-blank line describes one sample.

    A column whose entries all parse as numbers becomes one numeric column.
    Any other column is treated as categorical and is expanded into one 0/1
    indicator column per distinct value, named "<column>-<value>", in order
    of first appearance.

    The resulting dataset has the dimensions 'CEL' and 'phenotypes'.
    """

    def __init__(self, string):
        """Parse `string` and initialise the underlying Dataset.

        string -- tab-separated phenotype table, header line first.
        """
        # Blank lines are skipped: a blank line would otherwise become a
        # one-field row and make the zip() below drop every phenotype column.
        rows = [line.split("\t") for line in string.splitlines()
                if line.strip()]
        self._table = rows

        # Transpose the data rows into columns.  list() keeps the result
        # indexable on interpreters where zip() is lazy.
        # NOTE(review): zip() truncates to the shortest row, so rows with
        # missing fields silently drop trailing columns.
        columns = list(zip(*rows[1:]))

        # Header-only or empty input has no columns / no header row.
        cel_names = columns[0] if columns else ()
        col_names = rows[0][1:] if rows else []

        phenotypes = []   # output column names, in output order
        values = {}       # output column name -> column data

        for col_name, column in zip(col_names, columns[1:]):
            numbers = self._as_numbers(column)
            if numbers is not None:
                values[col_name] = numbers
                phenotypes.append(col_name)
                continue

            # Categorical column: build one indicator vector per distinct
            # entry, remembering the order in which entries first appear.
            indicators = {}
            order = []
            for i, entry in enumerate(column):
                if entry not in indicators:
                    indicators[entry] = zeros(len(column))
                    order.append(entry)
                indicators[entry][i] = 1

            for entry in order:
                name = "%s-%s" % (col_name, entry)
                phenotypes.append(name)
                values[name] = indicators[entry]

        if phenotypes:
            # values are stored column-wise; transpose to samples x phenotypes.
            a = transpose(array([values[name] for name in phenotypes]))
        else:
            a = None

        dataset.Dataset.__init__(self, a,
                                 identifiers=[('CEL', cel_names),
                                              ('phenotypes', phenotypes)],
                                 shape=(len(cel_names), len(phenotypes)))

    @staticmethod
    def _as_numbers(column):
        """Return the entries of `column` as numbers, or None if not numeric.

        int is tried first so all-integer columns keep integer values; float
        is the fallback for columns containing decimals.  Note that float()
        also accepts spellings such as "nan" and "inf".
        """
        for convert in (int, float):
            try:
                return [convert(entry) for entry in column]
            except ValueError:
                continue
        return None

    def get_phenotype_table(self):
        """Return the parsed table as a list of rows (lists of strings).

        The header row is included as the first row.
        """
        return self._table
|
||||||
|
|
Reference in New Issue