Added read_ftsv in dataset.py.

This function reads fluents tab separated values (ftsv) files and returns a dataset.
2007-01-10 17:35:58 +00:00
parent 060732d980
commit 9274b044b7
2 changed files with 75 additions and 74 deletions
--- a/fluents/dataset.py
+++ b/fluents/dataset.py
@@ -1,8 +1,9 @@
-from scipy import ndarray,atleast_2d,asarray,intersect1d
+from scipy import ndarray,atleast_2d,asarray,intersect1d,zeros
 from scipy import sort as array_sort
 from itertools import izip
 import shelve
 import copy
+import re

 class Dataset:
    """The Dataset base class.
@@ -381,3 +382,73 @@ class Selection(dict):
    def select(self, axis, labels):
        self[axis] = labels
    
+
+def read_ftsv(fd):
+    """Read a fluents tab separated values (ftsv) file into a dataset.
+
+    The file starts with header lines on the form '# key: value'.
+    Recognized keys are:
+
+      dimension -- dimension name followed by its identifiers; given
+                   once per dimension, in matrix axis order
+      type      -- 'dataset' (default), 'category' or 'network'
+      name      -- name of the dataset
+      graphtype -- type of graph
+
+    The header ends at the first line that is not a header line.  The
+    remaining lines hold whitespace separated numbers, one matrix row
+    per line.
+
+    fd is an open file-like object providing readline().
+
+    Returns a CategoryDataset, GraphDataset or Dataset, depending on
+    the type given in the header.
+
+    Raises ValueError if a dimension header has no name or a matrix
+    value cannot be converted to float, and IndexError if the matrix
+    data does not fit the declared dimensions.
+    """
+    split_re = re.compile(r'^#\s*(\w+)\s*:\s*(.+)')
+    dimensions = []
+    identifiers = {}
+    ds_type = 'dataset'
+    # NOTE: name and graphtype are parsed but not yet used when the
+    # dataset is created below.
+    name = 'Unnamed dataset'
+    graphtype = 'graph'
+
+    # Read header lines from file.
+    line = fd.readline()
+    while line:
+        m = split_re.match(line)
+        if not m:
+            break
+
+        key, val = m.groups()
+        # '.+' also captures trailing whitespace (e.g. '\r' from DOS
+        # line endings), which would make the comparisons below fail.
+        val = val.strip()
+
+        # The line is of the form:
+        # dimension: dimname id1 id2 id3 ...
+        if key == 'dimension':
+            # split() without arguments handles tabs and repeated
+            # spaces without producing empty identifiers.
+            values = val.split()
+            if not values:
+                raise ValueError('dimension header without a name')
+            dimensions.append(values[0])
+            identifiers[values[0]] = values[1:]
+
+        # Read type of dataset.
+        # Should be dataset, category, or network
+        elif key == 'type':
+            ds_type = val
+
+        elif key == 'name':
+            name = val
+
+        elif key == 'graphtype':
+            graphtype = val
+
+        line = fd.readline()
+
+    # Dimensions in the form [(dim1, [id1, id2, id3 ...]), ...]
+    dims = [(x, identifiers[x]) for x in dimensions]
+    dim_lengths = [len(identifiers[x]) for x in dimensions]
+
+    # Create matrix; category datasets hold booleans.
+    if ds_type == 'category':
+        matrix = zeros(dim_lengths, dtype=bool)
+    else:
+        matrix = zeros(dim_lengths)
+
+    # The line that ended the header is skipped when it is blank or a
+    # comment.  If it already holds data it is kept, so that the first
+    # matrix row is not silently lost.
+    if not line.strip() or line.lstrip().startswith('#'):
+        line = fd.readline()
+
+    y = 0
+    while line:
+        values = line.split()
+        for x, v in enumerate(values):
+            matrix[y,x] = float(v)
+        y += 1
+        line = fd.readline()
+
+    # Create dataset of specified type
+    if ds_type == 'category':
+        ds = CategoryDataset(matrix, dims)
+    elif ds_type == 'network':
+        ds = GraphDataset(matrix, dims)
+    else:
+        ds = Dataset(matrix, dims)
+
+    return ds
+
+
--- a/workflows/go_workflow.py
+++ b/workflows/go_workflow.py
@@ -4,6 +4,7 @@ import geneontology
 #import gostat
 from scipy import array, randn, log, ones, zeros
 import networkx
+import re

 EVIDENCE_CODES=[('IMP', 'Inferred from mutant phenotype'),
                ('IGI', 'Inferred from genetic interaction'),
@@ -137,80 +138,9 @@ class LoadTextDatasetFunction(workflow.Function):

    def __init__(self):
        workflow.Function.__init__(self, 'load-text-ds', 'Load text dataset')
-    
-    def read_text_dataset(self, fd):
-        split_re = re.compile('^#\s*(\w+)\s*:\s*(.)')
-        dimensions = []
-        identifiers = {}
-        type = 'dataset'
-        name = 'Unnamed dataset'
-        graphtype = 'graph'
-
-        # Read header lines from file.
-        line = fd.readline()
-        while line:
-            m = split_re.match(line)
-            if m:
-                key, val = m
-                
-                # The line is on the form;
-                # dimension: dimname id1 id2 id3 ...
-                if key == 'dimension':
-                    values = [v.strip() for v in val.split(' ')]
-                    dimensions.append(values[0])
-                    identifiers[values[0]] = values[1:]
-
-                    headers[key] = val.strip()
-
-                # Read type of dataset.
-                # Should be dataset, category, or network
-                elif key == 'type':
-                    type = val
-                
-                elif key == 'name':
-                    name = val
-
-                elif key == 'graphtype':
-                    graphtype = val
-
-            else:
-                break
-            line = f.readline()
-
-        # Dimensions in the form [(dim1, [id1, id2, id3 ..) ...] 
-        dims = [(x, identifiers[x]) for x in dimensions]
-        dim_lengths = [len(identifiers[x]) for x in dimensions]
-
-        # Create dataset of specified type
-        if type == 'category':
-            matrix = zeros(dim_lengths, dtype=bool)
-            ds = dataset.CategoryDataset(matrix, dims)
-        elif type == 'network':
-            matrix = zeros(dim_lengths)
-            ds = dataset.GraphDataset(matrix, dims)
-        else:
-            matrix = zeros(dim_lengths)
-            ds = dataset.Dataset(matrix, dims)
-
-        line = f.readline()
-        y = 0
-        while line:
-            values = line.split()
-            for x, v in enumerate(values):
-                matrix[x,y] = float(v)
-            y += 1
-            line = f.readline()
-
-        # Build NetowrkX graph from matrix.
-        if type == 'network':
-            matrix = zeros(dim_lengths)
-            ds = dataset.NetworkDataset(matrix, dims)
-
-
    def run(self):
-        f = open('/home/einarr/foodata.fcsv')
-        return read_text_dataset(f)
-        
+        f = open('/home/einarr/data/goa-condensed.ftsv')
+        return [dataset.read_ftsv(f)]
        

 class LoadAnnotationsFunction(workflow.Function):