Added read_ftsv to dataset.py. This function reads a fluents tab-separated
values (ftsv) file and returns a dataset.
This commit is contained in:
parent
060732d980
commit
9274b044b7
|
@ -1,8 +1,9 @@
|
||||||
from scipy import ndarray,atleast_2d,asarray,intersect1d
|
from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros
|
||||||
from scipy import sort as array_sort
|
from scipy import sort as array_sort
|
||||||
from itertools import izip
|
from itertools import izip
|
||||||
import shelve
|
import shelve
|
||||||
import copy
|
import copy
|
||||||
|
import re
|
||||||
|
|
||||||
class Dataset:
|
class Dataset:
|
||||||
"""The Dataset base class.
|
"""The Dataset base class.
|
||||||
|
@ -381,3 +382,73 @@ class Selection(dict):
|
||||||
def select(self, axis, labels):
|
def select(self, axis, labels):
|
||||||
self[axis] = labels
|
self[axis] = labels
|
||||||
|
|
||||||
|
|
||||||
|
def read_ftsv(fd):
    """Read a fluents tab separated values (ftsv) stream and return a dataset.

    The stream starts with header lines of the form ``# key: value``.
    Recognised keys are:

    dimension -- ``dimname id1 id2 id3 ...``; one line per dimension, in order.
    type      -- ``dataset`` (default), ``category`` or ``network``.
    name      -- name of the dataset.
    graphtype -- graph type of a network dataset.

    Header parsing stops at the first line that is not a ``# key: value``
    line.  A blank line or a ``#`` comment line in that position is treated
    as a separator and discarded; any other line is taken as the first data
    row.  Every following line is one row of whitespace separated numbers.

    fd -- file-like object providing ``readline()``.

    Returns a CategoryDataset, GraphDataset or Dataset, depending on the
    ``type`` header.

    Raises ValueError if a dimension header has no name or a data value is
    not a number, and IndexError if the data does not fit the shape declared
    by the dimension headers.
    """
    header_re = re.compile(r'^#\s*(\w+)\s*:\s*(.+)')
    dimensions = []
    identifiers = {}
    ds_type = 'dataset'
    # name and graphtype are part of the header format but are not handed
    # to any of the constructors called below.
    name = 'Unnamed dataset'
    graphtype = 'graph'

    # Read header lines from file.
    line = fd.readline()
    while line:
        m = header_re.match(line)
        if not m:
            break
        key, val = m.groups()
        # '.+' runs to the end of the line, so drop trailing blanks / '\r'
        # before the value is compared or stored.
        val = val.strip()

        if key == 'dimension':
            # split() without argument copes with repeated separators;
            # split(' ') would yield empty-string identifiers.
            fields = val.split()
            if not fields:
                raise ValueError('dimension header without a dimension name')
            dimensions.append(fields[0])
            identifiers[fields[0]] = fields[1:]
        elif key == 'type':
            ds_type = val
        elif key == 'name':
            name = val
        elif key == 'graphtype':
            graphtype = val

        line = fd.readline()

    # Dimensions in the form [(dim1, [id1, id2, id3 ...]), ...]
    dims = [(dim, identifiers[dim]) for dim in dimensions]
    shape = [len(identifiers[dim]) for dim in dimensions]

    # Category datasets hold booleans; everything else uses the default dtype.
    if ds_type == 'category':
        matrix = zeros(shape, dtype=bool)
    else:
        matrix = zeros(shape)

    # The line that ended the header is skipped only when it is a separator
    # (blank, comment, or end of file).  Otherwise it is already the first
    # data row and must not be lost.
    terminator = line.strip()
    if not terminator or terminator.startswith('#'):
        line = fd.readline()

    # Values are stored with two indices (row, column).
    y = 0
    while line:
        for x, v in enumerate(line.split()):
            matrix[y, x] = float(v)
        y += 1
        line = fd.readline()

    # Create dataset of specified type.
    if ds_type == 'category':
        return CategoryDataset(matrix, dims)
    if ds_type == 'network':
        return GraphDataset(matrix, dims)
    return Dataset(matrix, dims)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -4,6 +4,7 @@ import geneontology
|
||||||
#import gostat
|
#import gostat
|
||||||
from scipy import array, randn, log, ones, zeros
|
from scipy import array, randn, log, ones, zeros
|
||||||
import networkx
|
import networkx
|
||||||
|
import re
|
||||||
|
|
||||||
EVIDENCE_CODES=[('IMP', 'Inferred from mutant phenotype'),
|
EVIDENCE_CODES=[('IMP', 'Inferred from mutant phenotype'),
|
||||||
('IGI', 'Inferred from genetic interaction'),
|
('IGI', 'Inferred from genetic interaction'),
|
||||||
|
@ -137,80 +138,9 @@ class LoadTextDatasetFunction(workflow.Function):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
workflow.Function.__init__(self, 'load-text-ds', 'Load text dataset')
|
workflow.Function.__init__(self, 'load-text-ds', 'Load text dataset')
|
||||||
|
|
||||||
def read_text_dataset(self, fd):
    """Read a text dataset from the open file object fd and return it.

    Parsing is delegated to dataset.read_ftsv so that there is a single
    implementation of the file format.  The previous inline copy could not
    run: it unpacked the regex match object directly, referred to the
    undefined names ``headers`` and ``f``, and never returned the dataset
    it built.

    fd -- file-like object providing ``readline()``.

    Returns whatever dataset.read_ftsv returns for the stream.
    """
    return dataset.read_ftsv(fd)
|
|
||||||
|
|
||||||
|
|
||||||
def run(self):
    """Load the hard-coded ftsv file and return the dataset in a list.

    The file is closed again whether or not parsing succeeds.
    """
    f = open('/home/einarr/data/goa-condensed.ftsv')
    try:
        # NOTE(review): the result is wrapped in a list, presumably because
        # the workflow expects a sequence of datasets -- confirm with caller.
        return [dataset.read_ftsv(f)]
    finally:
        f.close()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class LoadAnnotationsFunction(workflow.Function):
|
class LoadAnnotationsFunction(workflow.Function):
|
||||||
|
|
Reference in New Issue