Added read_ftsv in dataset.py. This function reads fluents tab-separated values
(ftsv) files and returns a dataset.
This commit is contained in:
parent
060732d980
commit
9274b044b7
|
@ -1,8 +1,9 @@
|
|||
from scipy import ndarray,atleast_2d,asarray,intersect1d
|
||||
from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros
|
||||
from scipy import sort as array_sort
|
||||
from itertools import izip
|
||||
import shelve
|
||||
import copy
|
||||
import re
|
||||
|
||||
class Dataset:
|
||||
"""The Dataset base class.
|
||||
|
@ -381,3 +382,73 @@ class Selection(dict):
|
|||
def select(self, axis, labels):
    """Store labels under the key axis, replacing any previous entry."""
    self[axis] = labels
|
||||
|
||||
|
||||
def read_ftsv(fd):
    """Read a fluents tab separated values (ftsv) file and return a dataset.

    The file starts with header lines of the form ``# key: value``.
    The keys acted upon are:

    dimension -- a dimension name followed by its identifiers,
                 separated by whitespace.
    type      -- ``dataset`` (the default), ``category`` or ``network``.

    Other keys (including ``name`` and ``graphtype``) are accepted but
    are not used when building the dataset.  The first line that is not
    a header line ends the header.  The remaining lines each hold one
    matrix row as whitespace-separated numbers.

    fd -- an open file-like object providing readline().

    Returns a CategoryDataset, GraphDataset or Dataset (chosen by the
    ``type`` header) built from the matrix and a list on the form
    [(dim1, [id1, id2, id3, ...]), ...].

    Raises ValueError if a ``dimension`` header has no dimension name or
    if a matrix entry is not a number.
    """
    split_re = re.compile(r'^#\s*(\w+)\s*:\s*(.+)')
    dimensions = []
    identifiers = {}
    ds_type = 'dataset'

    # Read header lines from file.
    line = fd.readline()
    while line:
        m = split_re.match(line)
        if not m:
            break
        key, val = m.groups()
        # '.' does not match '\n' but does match '\r' and trailing blanks;
        # strip them so that e.g. 'category\r' is still recognised.
        val = val.strip()

        # The line is on the form;
        # dimension: dimname id1 id2 id3 ...
        if key == 'dimension':
            # split() without arguments handles tabs and repeated spaces
            # without producing empty identifiers.
            values = val.split()
            if not values:
                raise ValueError('dimension header without a name: %r' % line)
            dimensions.append(values[0])
            identifiers[values[0]] = values[1:]

        # Read type of dataset.
        # Should be dataset, category, or network
        elif key == 'type':
            ds_type = val

        line = fd.readline()

    # The line that ended the header is only skipped when it carries no
    # data (blank separator, stray comment or end of file).  Otherwise it
    # is the first matrix row and must not be thrown away.
    stripped = line.strip()
    if not stripped or stripped.startswith('#'):
        line = fd.readline()

    # Dimensions in the form [(dim1, [id1, id2, id3 ..) ...]
    dims = [(x, identifiers[x]) for x in dimensions]
    dim_lengths = [len(identifiers[x]) for x in dimensions]

    # Create matrix; only category datasets use a boolean matrix.
    if ds_type == 'category':
        matrix = zeros(dim_lengths, dtype=bool)
    else:
        matrix = zeros(dim_lengths)

    # Fill the matrix row by row; y is the row, x the column.
    y = 0
    while line:
        for x, v in enumerate(line.split()):
            matrix[y, x] = float(v)
        y += 1
        line = fd.readline()

    # Create dataset of specified type
    if ds_type == 'category':
        return CategoryDataset(matrix, dims)
    if ds_type == 'network':
        return GraphDataset(matrix, dims)
    return Dataset(matrix, dims)
|
||||
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ import geneontology
|
|||
#import gostat
|
||||
from scipy import array, randn, log, ones, zeros
|
||||
import networkx
|
||||
import re
|
||||
|
||||
EVIDENCE_CODES=[('IMP', 'Inferred from mutant phenotype'),
|
||||
('IGI', 'Inferred from genetic interaction'),
|
||||
|
@ -137,80 +138,9 @@ class LoadTextDatasetFunction(workflow.Function):
|
|||
|
||||
def __init__(self):
    """Initialise the base workflow.Function for this loader.

    NOTE(review): 'load-text-ds' and 'Load text dataset' are presumably
    the function's identifier and display name -- confirm against
    workflow.Function.
    """
    workflow.Function.__init__(self, 'load-text-ds', 'Load text dataset')
|
||||
|
||||
def read_text_dataset(self, fd):
    """Read a dataset from the open text file fd and return it.

    The file is expected to hold ``# key: value`` header lines followed
    by rows of numbers.  Parsing is delegated to dataset.read_ftsv so
    that this format is parsed in one place only.

    fd -- an open file-like object providing readline().

    Returns the dataset object built by dataset.read_ftsv.
    """
    return dataset.read_ftsv(fd)
|
||||
|
||||
|
||||
def run(self):
    """Load the hard-coded ftsv file and return it in a one-element list.

    Raises IOError if the file cannot be opened.
    """
    f = open('/home/einarr/data/goa-condensed.ftsv')
    try:
        # read_ftsv consumes the whole file before returning, so the
        # file can safely be closed as soon as the call completes.
        return [dataset.read_ftsv(f)]
    finally:
        f.close()
|
||||
|
||||
|
||||
class LoadAnnotationsFunction(workflow.Function):
|
||||
|
|
Reference in New Issue