Added read_ftsv to dataset.py. This function reads fluents tab-separated values

(ftsv) files and returns a dataset.
2007-01-10 17:35:58 +00:00
parent 060732d980
commit 9274b044b7
2 changed files with 75 additions and 74 deletions
--- a/fluents/dataset.py
+++ b/fluents/dataset.py
@@ -1,8 +1,9 @@
-from scipy import ndarray,atleast_2d,asarray,intersect1d
+from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros
 from scipy import sort as array_sort
 from itertools import izip
 import shelve
 import copy
 import re
 class Dataset:
    """The Dataset base class.
@@ -381,3 +382,73 @@ class Selection(dict):
    def select(self, axis, labels):
        """Assign labels to the key axis in this mapping.

        Any value previously stored under axis is replaced by the
        assignment.
        """
        self[axis] = labels
 def read_ftsv(fd):
    """Read a fluents tab separated values (ftsv) file and return a dataset.

    fd is an open, readable file-like object. The file starts with header
    lines of the form

        # key: value

    Recognised keys are:

        dimension -- "dimname id1 id2 id3 ..."; one line per dimension, in
                     the order the dimensions index the matrix.
        type      -- 'dataset' (default), 'category' or 'network'.
        name      -- parsed but currently not used.
        graphtype -- parsed but currently not used.

    The header ends at the first line that is not of that form. A blank
    line or a '#' line at that position is skipped; anything else is
    treated as the first data row. Each data row is a whitespace separated
    list of numbers and fills one row of a two-dimensional matrix.

    Returns a CategoryDataset for type 'category' (boolean matrix), a
    GraphDataset for type 'network', and a Dataset otherwise.

    Raises ValueError if a data value cannot be converted to float, and
    IndexError if the data does not fit the declared dimensions.
    """
    header_re = re.compile(r'^#\s*(\w+)\s*:\s*(.+)')
    dimensions = []
    identifiers = {}
    header = {'type': 'dataset',
              'name': 'Unnamed dataset',
              'graphtype': 'graph'}

    # Read header lines from file.
    line = fd.readline()
    while line:
        m = header_re.match(line)
        if not m:
            break
        key, val = m.groups()
        if key == 'dimension':
            # split() without an argument ignores repeated and trailing
            # whitespace, so no empty identifiers are created.
            values = val.split()
            if values:
                dimensions.append(values[0])
                identifiers[values[0]] = values[1:]
        elif key in header:
            # Strip so that trailing blanks or a '\r' do not defeat the
            # comparisons against 'category' and 'network' below.
            header[key] = val.strip()
        line = fd.readline()

    # Dimensions in the form [(dim1, [id1, id2, id3 ...]), ...]
    dims = [(dim, identifiers[dim]) for dim in dimensions]
    dim_lengths = [len(ids) for _, ids in dims]

    # Category datasets are boolean, everything else is floating point.
    if header['type'] == 'category':
        matrix = zeros(dim_lengths, dtype=bool)
    else:
        matrix = zeros(dim_lengths)

    # The header is normally followed by a separator line. Skip it, but do
    # not throw away the line if it already holds the first data row.
    if not line.strip() or line.startswith('#'):
        line = fd.readline()

    row = 0
    while line:
        for col, v in enumerate(line.split()):
            matrix[row, col] = float(v)
        row += 1
        line = fd.readline()

    # Create dataset of specified type
    if header['type'] == 'category':
        return CategoryDataset(matrix, dims)
    if header['type'] == 'network':
        return GraphDataset(matrix, dims)
    return Dataset(matrix, dims)
--- a/workflows/go_workflow.py
+++ b/workflows/go_workflow.py
@@ -4,6 +4,7 @@ import geneontology
 #import gostat
 from scipy import array, randn, log, ones, zeros
 import networkx
 import re
 EVIDENCE_CODES=[('IMP', 'Inferred from mutant phenotype'),
                ('IGI', 'Inferred from genetic interaction'),
@@ -137,80 +138,9 @@ class LoadTextDatasetFunction(workflow.Function):
    def __init__(self):
        """Initialise the function through the workflow.Function initialiser.

        The strings 'load-text-ds' and 'Load text dataset' are passed on
        unchanged; presumably they are the function's identifier and its
        display name -- confirm against workflow.Function.
        """
        workflow.Function.__init__(self, 'load-text-ds', 'Load text dataset')
    def read_text_dataset(self, fd):
        """Read a text dataset from the open file fd and return it.

        The previous in-class parser could not run: it unpacked the regex
        match object directly, referenced the undefined names headers and
        f, captured only one character of each header value, filled the
        matrix transposed, and never returned the dataset. The parsing is
        delegated to dataset.read_ftsv so that there is a single
        implementation of the file format.

        Returns whatever dataset.read_ftsv returns for fd.
        """
        return dataset.read_ftsv(fd)
    def run(self):
-        f = open('/home/einarr/foodata.fcsv')
+        f = open('/home/einarr/data/goa-condensed.ftsv')
-        return read_text_dataset(f)
+        return [dataset.read_ftsv(f)]
 class LoadAnnotationsFunction(workflow.Function):