#!/usr/bin/env python
"""txt2ftsv - build ftsv category datasets from lists of identifiers.

Usage:
    txt2ftsv [-d dimension] [-o output.ftsv] [-n dsname] [FILE1 [FILE2 ...]]

Every input contributes one column to the resulting category dataset; a
row is set in that column when the identifier occurs in the input.  Input
and output can be ``-`` for STDIN/STDOUT respectively.

The script is primarily meant as a hack to easily load selections.  Data
is only half sanitized: lines are stripped and blank lines are dropped,
nothing more.
"""
# Provenance: added to bin/txt2ftsv by einarr on Thu, 5 Feb 2009
# (commit b46c381c3e247216a3c0bc2ac09f6dc16cd07500).

import numpy
import os.path
import sys

from laydi import dataset
from getopt import GetoptError, getopt

# Module-level settings; parse_options() overwrites them from the command
# line and build_dataset() reads ds_name.
dimension = 'dim_doe'
output_fn = '-'
ds_name = None


def print_help():
    """Print the usage line and the option summary to standard output."""
    print('')
    print('usage: txt2ftsv [-d dimension] [-o output.ftsv] [-n dsname] '
          '[FILE1 [FILE2 ...]]')
    print('')
    print('options:')
    print(' -h, --help Show this help text.')
    print(' -d, --dimension=DIM Make output in dimension DIM')
    print(' -n, --name=NAME Set name of output dataset')
    print(' -o, --output=FILE Save output dataset in FILE')
    print('')


def parse_options():
    """Parse sys.argv and return the list of input file names.

    Side effects: sets the module globals ``dimension``, ``output_fn`` and
    ``ds_name``.  When no name is given, the dataset is named after the
    output file, or 'txt2ftsv' when writing to standard output.

    Exits with status 0 after printing help for -h/--help, with status 1
    when no input files are given, and with status 2 when the options
    cannot be parsed.
    """
    global dimension
    global ds_name
    global output_fn

    short_opts = 'hd:n:o:'
    # Long options that take a value must end in '='; without it getopt
    # rejects "--dimension=X" and hands back '' for a bare "--dimension".
    long_opts = ['help', 'dimension=', 'name=', 'output=']
    try:
        options, params = getopt(sys.argv[1:], short_opts, long_opts)
    except GetoptError:
        # sys.exc_info() rather than "except ... as err" so the script
        # still parses on old Python 2 interpreters as well as Python 3.
        sys.stderr.write('txt2ftsv: %s\n' % sys.exc_info()[1])
        print_help()
        sys.exit(2)

    for opt, val in options:
        if opt in ('-h', '--help'):
            print_help()
            sys.exit(0)
        elif opt in ('-d', '--dimension'):
            dimension = val
        elif opt in ('-n', '--name'):
            ds_name = val
        elif opt in ('-o', '--output'):
            output_fn = val

    if ds_name is None:
        # output_fn is never None (it defaults to '-'), so test for the
        # stdout marker to make the generic fallback name reachable.
        if output_fn != '-':
            ds_name = output_fn
        else:
            ds_name = 'txt2ftsv'

    if not params:
        print_help()
        sys.exit(1)

    return params


def read_file(fd):
    """Return the non-blank lines of ``fd`` with whitespace stripped.

    ``fd`` is any object providing ``readlines()``.
    """
    stripped = [line.strip() for line in fd.readlines()]
    return [text for text in stripped if text != '']


def build_dataset(dimension, id_lists, filenames):
    """Build a category dataset marking which identifiers each input has.

    dimension -- name of the identifier (row) dimension.
    id_lists  -- one list of identifiers per input.
    filenames -- one label per entry of ``id_lists``; used as the
                 identifiers of the 'files' (column) dimension.

    Returns a ``dataset.CategoryDataset`` named after the module global
    ``ds_name``.  Rows are the sorted union of all identifiers; entry
    (r, c) is 1 when identifier r occurs in input c, and 0 otherwise.
    """
    all_ids = set()
    for id_list in id_lists:
        all_ids.update(id_list)

    # Fix the row order once and look rows up by identifier.  Indexing by
    # the position inside each input list would mark the wrong rows.
    ordered_ids = sorted(all_ids)
    row_of = dict((ident, row) for row, ident in enumerate(ordered_ids))

    x = numpy.zeros((len(ordered_ids), len(id_lists)), 'b')
    for col, id_list in enumerate(id_lists):
        for ident in id_list:
            x[row_of[ident], col] = True

    # NOTE(review): assumes CategoryDataset accepts a list of identifiers
    # per dimension, as it is given for 'files' -- confirm against laydi.
    identifiers = [(dimension, ordered_ids), ('files', filenames)]
    return dataset.CategoryDataset(x, identifiers, name=ds_name)


def main():
    """Read the inputs named on the command line and write the dataset."""
    filenames = parse_options()

    id_lists = []
    loaded = []  # names of the inputs actually read, parallel to id_lists
    for fn in filenames:
        if fn == '-':
            id_lists.append(read_file(sys.stdin))
        elif os.path.exists(fn):
            fd = open(fn)
            try:
                id_lists.append(read_file(fd))
            finally:
                fd.close()
        else:
            # Keep the best-effort behaviour of skipping missing files, but
            # say so and keep the column labels in step with the data.
            sys.stderr.write('txt2ftsv: skipping missing input %s\n' % fn)
            continue
        loaded.append(fn)

    if not id_lists:
        sys.stderr.write('txt2ftsv: no readable input files\n')
        sys.exit(1)

    ds = build_dataset(dimension, id_lists, loaded)
    # Diagnostic summary goes to stderr so it cannot corrupt ftsv output
    # that is being written to stdout.
    sys.stderr.write(str(ds) + '\n')

    if output_fn == '-':
        dataset.write_ftsv(sys.stdout, ds)
    else:
        dataset.write_ftsv(output_fn, ds)


if __name__ == '__main__':
    main()