Added txt2ftsv, a script to build ftsv category datasets from lists of identifiers.

Usage is txt2ftsv [-d dimension] [-o output.ftsv] [-n dsname] [FILE1 [FILE2 ...]]. Input and output can be - for STDIN/STDOUT respectively. The script is primarily meant as a hack to easily load selections. Data is only half sanitized.
2009-02-05 00:11:48 +00:00
parent fdf51c7c7f
commit b46c381c3e
1 changed files with 86 additions and 0 deletions
--- a/bin/txt2ftsv
+++ b/bin/txt2ftsv
@@ -0,0 +1,86 @@
 #!/usr/bin/env python
 import numpy
 import os.path
 import sys
 from laydi import dataset
 from getopt import getopt
 # Name given to the generated dataset; parse_options() fills it in when unset.
 ds_name = None
 # Destination of the ftsv output; '-' selects STDOUT.
 output_fn = '-'
 # Name of the dimension that holds the identifiers in the output dataset.
 dimension = 'dim_doe'
 def print_help():
    """Print the usage synopsis and the command line options to STDOUT."""
    help_lines = (
        # The synopsis matters: this text is all the user sees when the
        # script is started without any input file.
        'usage: txt2ftsv [-d DIM] [-n NAME] [-o FILE] [FILE1 [FILE2 ...]]',
        '',
        'options:',
        '    -h, --help             Show this help text.',
        '    -d, --dimension=DIM    Make output in dimension DIM',
        '    -n, --name=NAME        Set name of output dataset',
        '    -o, --output=FILE      Save output dataset in FILE',
        '',
    )
    # print(x) with a single string argument behaves the same under
    # Python 2 and Python 3, unlike the bare print statement.
    for line in help_lines:
        print(line)
 def parse_options():
    """Parse sys.argv and store the option values in the module globals.

    Sets dimension, ds_name and output_fn from -d/-n/-o or their long
    forms. When no name is given the dataset is named after the output
    file, or 'txt2ftsv' when the output goes to STDOUT ('-').

    Returns the list of non-option arguments (the input file names).
    Exits with status 0 after -h/--help, with 1 when no input file is
    given and with 2 when the options cannot be parsed.
    """
    # Imported locally: the module itself only imports the getopt() function.
    from getopt import GetoptError

    global dimension
    global ds_name
    global output_fn

    short_opts = 'hd:n:o:'
    # The trailing '=' marks long options that take a value; without it
    # getopt rejects '--dimension=DIM', '--name=NAME' and '--output=FILE'.
    long_opts = ['help', 'dimension=', 'name=', 'output=']
    try:
        options, params = getopt(sys.argv[1:], short_opts, long_opts)
    except GetoptError as err:
        # Report bad options as a usage error instead of a traceback.
        sys.stderr.write('txt2ftsv: %s\n' % err)
        print_help()
        sys.exit(2)

    for opt, val in options:
        if opt in ('-h', '--help'):
            print_help()
            sys.exit(0)
        elif opt in ('-d', '--dimension'):
            dimension = val
        elif opt in ('-n', '--name'):
            ds_name = val
        elif opt in ('-o', '--output'):
            output_fn = val

    if ds_name is None:
        # '-' only means "write to STDOUT" and is not a useful dataset
        # name, so fall back to the script name in that case.
        if output_fn not in (None, '-'):
            ds_name = output_fn
        else:
            ds_name = 'txt2ftsv'

    if not params:
        # Without input files there is nothing to convert.
        print_help()
        sys.exit(1)

    return params
 def read_file(fd):
    """Return the non-blank lines read from fd, stripped of surrounding whitespace."""
    entries = []
    for raw in fd.readlines():
        text = raw.strip()
        # Lines that are empty after stripping carry no identifier.
        if text != '':
            entries.append(text)
    return entries
 def build_dataset(dimension, id_lists, filenames):
    """Build a category dataset marking which identifiers occur in which list.

    dimension -- name of the dimension that holds the identifiers (rows).
    id_lists  -- sequence of identifier lists, one per input.
    filenames -- labels for the 'files' dimension (columns); expected to
                 hold one entry per list in id_lists.

    Returns a dataset.CategoryDataset, named after the module-level
    ds_name, whose matrix is 1 at (row of identifier, column of list) for
    every identifier contained in that list and 0 elsewhere.
    """
    # Collect the distinct identifiers; sorting gives the rows a
    # reproducible order instead of the arbitrary iteration order of a set.
    unique_ids = set()
    for idl in id_lists:
        unique_ids.update(idl)
    all_ids = sorted(unique_ids)

    # Map each identifier to its row so membership is written to the
    # identifier's own row, not to its position inside the input list.
    row_of = dict((ident, row) for row, ident in enumerate(all_ids))

    x = numpy.zeros((len(all_ids), len(id_lists)), 'b')
    for col, idl in enumerate(id_lists):
        for ident in idl:
            x[row_of[ident], col] = True

    return dataset.CategoryDataset(x, [(dimension, all_ids), ('files', filenames)], name=ds_name)
 if __name__ == '__main__':
    # Read every input into one identifier list and remember the name of
    # each input that was actually read, so that the 'files' labels stay
    # aligned with the columns of the dataset.
    id_lists = []
    loaded_names = []
    for fn in parse_options():
        if fn == '-':
            # '-' selects STDIN.
            ids = read_file(sys.stdin)
        elif os.path.exists(fn):
            # The with-block closes the file even if reading fails.
            with open(fn) as fd:
                ids = read_file(fd)
        else:
            # Keep skipping unreadable inputs, but say so.
            sys.stderr.write("txt2ftsv: skipping '%s': no such file\n" % fn)
            continue
        id_lists.append(ids)
        loaded_names.append(fn)

    if not id_lists:
        sys.stderr.write('txt2ftsv: no input could be read\n')
        sys.exit(1)

    ds = build_dataset(dimension, id_lists, loaded_names)

    # The summary goes to STDERR so it cannot end up inside the ftsv data
    # when the dataset itself is written to STDOUT.
    sys.stderr.write('%s\n' % (ds,))

    if output_fn == '-':
        dataset.write_ftsv(sys.stdout, ds)
    else:
        dataset.write_ftsv(output_fn, ds)