#!/usr/bin/env python
"""Build a laydi category dataset from plain-text identifier lists.

Each input file (or standard input, given as '-') is read as one identifier
per non-blank line.  The resulting dataset has one row per distinct
identifier and one column per input file; a cell is set when the identifier
occurs in that file.  The dataset is written with ``dataset.write_ftsv`` to
standard output or to the file given with -o/--output.
"""

import numpy
import os.path
import sys
from laydi import dataset
from getopt import GetoptError, getopt

# Command-line configurable settings (see parse_options()).
dimension = 'dim_doe'   # name of the identifier (row) dimension
output_fn = '-'         # output file name; '-' means standard output
ds_name = None          # dataset name; derived in parse_options() if unset


def print_help():
    """Print the list of supported command-line options to stdout."""
    print('')
    print('options:')
    print(' -h, --help Show this help text.')
    print(' -d, --dimension=DIM Make output in dimension DIM')
    print(' -n, --name=NAME Set name of output dataset')
    print(' -o, --output=FILE Save output dataset in FILE')
    print('')


def parse_options():
    """Parse ``sys.argv`` and update the module-level settings.

    Sets the globals ``dimension``, ``ds_name`` and ``output_fn`` from the
    command line.  Exits with status 0 after printing help for -h/--help,
    with status 1 if no input files were given, and with status 2 if the
    options could not be parsed.

    Returns:
        The list of positional arguments (input file names).
    """
    global dimension
    global ds_name
    global output_fn

    short_opts = 'hd:n:o:'
    # A trailing '=' marks a long option as taking an argument; without it
    # getopt rejects e.g. ``--output=FILE``.
    long_opts = ['help', 'dimension=', 'name=', 'output=']

    try:
        options, params = getopt(sys.argv[1:], short_opts, long_opts)
    except GetoptError as err:
        sys.stderr.write('%s\n' % err)
        print_help()
        sys.exit(2)

    for opt, val in options:
        if opt in ('-h', '--help'):
            print_help()
            sys.exit(0)
        elif opt in ('-d', '--dimension'):
            dimension = val
        elif opt in ('-n', '--name'):
            ds_name = val
        elif opt in ('-o', '--output'):
            output_fn = val

    if ds_name is None:
        # Name the dataset after the output file, unless output goes to
        # stdout ('-'), in which case fall back to a generic name.
        if output_fn != '-':
            ds_name = output_fn
        else:
            ds_name = 'txt2ftsv'

    if not params:
        print_help()
        sys.exit(1)

    return params


def read_file(fd):
    """Return the non-blank lines of ``fd`` with surrounding whitespace removed.

    Args:
        fd: An open, readable file-like object.

    Returns:
        A list of stripped, non-empty line strings in file order.
    """
    stripped = (line.strip() for line in fd)
    return [line for line in stripped if line]


def build_dataset(dimension, id_lists, filenames):
    """Create a category dataset marking which identifiers occur in which list.

    Args:
        dimension: Name of the identifier (row) dimension.
        id_lists: One sequence of identifiers per input; duplicates allowed.
        filenames: Column labels, one per entry of ``id_lists``.

    Returns:
        A ``dataset.CategoryDataset`` named by the module-level ``ds_name``
        whose rows are the sorted distinct identifiers and whose columns
        are ``filenames``.
    """
    # Sort so the row order is deterministic and matches the labels passed
    # to the dataset (a bare set has no defined order).
    all_ids = sorted(set().union(*id_lists))
    row_of = dict((ident, row) for row, ident in enumerate(all_ids))

    x = numpy.zeros((len(all_ids), len(id_lists)), 'b')
    for col, ids in enumerate(id_lists):
        for ident in ids:
            # Index by the identifier's row, not its position in the file.
            x[row_of[ident], col] = True

    return dataset.CategoryDataset(x,
        [(dimension, all_ids), ('files', filenames)],
        name=ds_name)


if __name__ == '__main__':
    id_lists = []
    used_filenames = []  # kept in step with id_lists so columns line up

    for fn in parse_options():
        if os.path.exists(fn):
            with open(fn) as fd:
                id_lists.append(read_file(fd))
        elif fn == '-':
            id_lists.append(read_file(sys.stdin))
        else:
            # Skip unreadable inputs, but say so instead of silently
            # producing a dataset with mismatched columns.
            sys.stderr.write("warning: skipping '%s': no such file\n" % fn)
            continue
        used_filenames.append(fn)

    ds = build_dataset(dimension, id_lists, used_filenames)
    # NOTE(review): this goes to stdout, which is also where the ftsv data
    # is written when output_fn is '-' -- confirm that is intended.
    print(ds)

    if output_fn == '-':
        dataset.write_ftsv(sys.stdout, ds)
    else:
        dataset.write_ftsv(output_fn, ds)