Projects/laydi
Projects
/
laydi
Archived
7
0
Fork 0

Added read_ftsv in dataset.py. This method reads fluents tab separated values

files and returns a dataset.
This commit is contained in:
Einar Ryeng 2007-01-10 17:35:58 +00:00
parent 060732d980
commit 9274b044b7
2 changed files with 75 additions and 74 deletions

View File

@ -1,8 +1,9 @@
from scipy import ndarray,atleast_2d,asarray,intersect1d from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros
from scipy import sort as array_sort from scipy import sort as array_sort
from itertools import izip from itertools import izip
import shelve import shelve
import copy import copy
import re
class Dataset: class Dataset:
"""The Dataset base class. """The Dataset base class.
@ -380,4 +381,74 @@ class Selection(dict):
def select(self, axis, labels): def select(self, axis, labels):
self[axis] = labels self[axis] = labels
def read_ftsv(fd):
    """Read a dataset in fluents tab separated values (ftsv) form.

    The file starts with header lines of the form ``# key: value``.
    Recognised keys are ``dimension`` (``dimname id1 id2 id3 ...``) and
    ``type`` (``dataset``, ``category`` or ``network``).  ``name`` and
    ``graphtype`` headers are accepted but not used by this reader.

    The first line that is not a header line ends the header and is
    discarded.  Every following line is read as one matrix row of
    whitespace separated numbers.

    fd -- an open file-like object providing readline().

    Returns a CategoryDataset for type 'category', a GraphDataset for
    type 'network' and a Dataset otherwise.

    Raises ValueError if a dimension header does not name a dimension.
    """
    header_re = re.compile(r'^#\s*(\w+)\s*:\s*(.+)')
    dimensions = []
    identifiers = {}
    ds_type = 'dataset'

    # Read header lines from file.  iter(readline, '') stops at end of
    # file and leaves fd positioned right after the line that ended the
    # header, so the data loop below continues from there.
    for line in iter(fd.readline, ''):
        m = header_re.match(line)
        if not m:
            break
        key, val = m.groups()
        # '.+' keeps trailing blanks and '\r'; strip them so comparisons
        # such as ds_type == 'category' are not defeated by whitespace.
        val = val.strip()

        # The line is on the form;
        # dimension: dimname id1 id2 id3 ...
        if key == 'dimension':
            # split() without argument collapses repeated whitespace, so
            # no empty-string identifiers are produced.
            values = val.split()
            if not values:
                raise ValueError('dimension header without a dimension name')
            dimensions.append(values[0])
            identifiers[values[0]] = values[1:]

        # Read type of dataset.
        # Should be dataset, category, or network
        elif key == 'type':
            ds_type = val

    # Dimensions in the form [(dim1, [id1, id2, id3, ...]), ...]
    dims = [(x, identifiers[x]) for x in dimensions]
    dim_lengths = [len(identifiers[x]) for x in dimensions]

    # Create matrix; category data is stored as booleans.
    if ds_type == 'category':
        matrix = zeros(dim_lengths, dtype=bool)
    else:
        matrix = zeros(dim_lengths)

    # Fill the matrix row by row.  The [y, x] indexing means exactly two
    # dimensions must have been declared in the header.
    for y, line in enumerate(iter(fd.readline, '')):
        for x, v in enumerate(line.split()):
            matrix[y, x] = float(v)

    # Create dataset of specified type
    if ds_type == 'category':
        ds = CategoryDataset(matrix, dims)
    elif ds_type == 'network':
        ds = GraphDataset(matrix, dims)
    else:
        ds = Dataset(matrix, dims)

    return ds

View File

@ -4,6 +4,7 @@ import geneontology
#import gostat #import gostat
from scipy import array, randn, log, ones, zeros from scipy import array, randn, log, ones, zeros
import networkx import networkx
import re
EVIDENCE_CODES=[('IMP', 'Inferred from mutant phenotype'), EVIDENCE_CODES=[('IMP', 'Inferred from mutant phenotype'),
('IGI', 'Inferred from genetic interaction'), ('IGI', 'Inferred from genetic interaction'),
@ -137,81 +138,10 @@ class LoadTextDatasetFunction(workflow.Function):
def __init__(self): def __init__(self):
workflow.Function.__init__(self, 'load-text-ds', 'Load text dataset') workflow.Function.__init__(self, 'load-text-ds', 'Load text dataset')
def read_text_dataset(self, fd):
    """Read a text dataset from fd and return it.

    The file starts with header lines of the form ``# key: value``.
    Recognised keys are ``dimension`` (``dimname id1 id2 id3 ...``) and
    ``type`` (``dataset``, ``category`` or ``network``).  ``name`` and
    ``graphtype`` headers are accepted but not used.

    The first line that is not a header line ends the header and is
    discarded.  Every following line is read as one matrix row of
    whitespace separated numbers.

    fd -- an open file-like object providing readline().

    Returns a dataset.CategoryDataset for type 'category', a
    dataset.GraphDataset for type 'network' and a dataset.Dataset
    otherwise.

    Raises ValueError if a dimension header does not name a dimension.
    """
    # '(.+)' captures the whole value; a single '.' would keep only its
    # first character.
    header_re = re.compile(r'^#\s*(\w+)\s*:\s*(.+)')
    dimensions = []
    identifiers = {}
    ds_type = 'dataset'

    # Read header lines from file.
    for line in iter(fd.readline, ''):
        m = header_re.match(line)
        if not m:
            break
        key, val = m.groups()
        val = val.strip()

        # The line is on the form;
        # dimension: dimname id1 id2 id3 ...
        if key == 'dimension':
            values = val.split()
            if not values:
                raise ValueError('dimension header without a dimension name')
            dimensions.append(values[0])
            identifiers[values[0]] = values[1:]

        # Read type of dataset.
        # Should be dataset, category, or network
        elif key == 'type':
            ds_type = val

    # Dimensions in the form [(dim1, [id1, id2, id3, ...]), ...]
    dims = [(x, identifiers[x]) for x in dimensions]
    dim_lengths = [len(identifiers[x]) for x in dimensions]

    # Create matrix; category data is stored as booleans.
    if ds_type == 'category':
        matrix = zeros(dim_lengths, dtype=bool)
    else:
        matrix = zeros(dim_lengths)

    # Fill the matrix row by row: line number is the first index, column
    # number the second, so exactly two dimensions must be declared.
    for y, line in enumerate(iter(fd.readline, '')):
        for x, v in enumerate(line.split()):
            matrix[y, x] = float(v)

    # Create dataset of specified type from the filled matrix.
    if ds_type == 'category':
        ds = dataset.CategoryDataset(matrix, dims)
    elif ds_type == 'network':
        ds = dataset.GraphDataset(matrix, dims)
    else:
        ds = dataset.Dataset(matrix, dims)

    return ds
def run(self): def run(self):
f = open('/home/einarr/foodata.fcsv') f = open('/home/einarr/data/goa-condensed.ftsv')
return read_text_dataset(f) return [dataset.read_ftsv(f)]
class LoadAnnotationsFunction(workflow.Function): class LoadAnnotationsFunction(workflow.Function):