#!/usr/bin/env python
"""Build a laydi dataset from plain-text identifier lists.

Every input file (or '-' for standard input) is read as a list of
identifiers, one per non-blank line.  The result is a matrix with one row
per distinct identifier and one column per input file; a cell is set when
the identifier occurs in that file.  The dataset is written with
``laydi.dataset.write_ftsv``.
"""

import numpy
import os.path
import sys

from laydi import dataset
from getopt import getopt, GetoptError

# Command-line settings; parse_options() overwrites these module globals.
dimension = 'dim_doe'
output_fn = '-'
ds_name = None
category = False
sparse = False


def print_help():
    """Print the command-line option summary to standard output."""
    # print('...') with a single argument behaves the same on Python 2 and 3.
    print('')
    print('options:')
    print(' -h, --help Show this help text.')
    print(' -c, --category Make category dataset')
    print(' -d, --dimension=DIM Make output in dimension DIM')
    print(' -n, --name=NAME Set name of output dataset')
    print(' -o, --output=FILE Save output dataset in FILE')
    print(' -s, --sparse Save output in sparse format')
    print('')


def parse_options():
    """Parse sys.argv, update the module-level settings, return the inputs.

    Returns:
        The list of positional arguments (input file names, '-' for stdin).

    Exits with status 0 after --help, 1 when no input is given and 2 when
    the options cannot be parsed.
    """
    global category, dimension, ds_name, output_fn, sparse

    # 's' must be listed for -s to be accepted, and long options that take
    # a value need a trailing '=' or getopt rejects '--name=VALUE'.
    short_opts = 'cd:hn:o:s'
    long_opts = ['help', 'category', 'dimension=', 'name=', 'output=',
                 'sparse']

    try:
        options, params = getopt(sys.argv[1:], short_opts, long_opts)
    except GetoptError as err:
        sys.stderr.write('error: %s\n' % err)
        print_help()
        sys.exit(2)

    for opt, val in options:
        if opt in ('-h', '--help'):
            print_help()
            sys.exit(0)
        elif opt in ('-c', '--category'):
            category = True
        elif opt in ('-d', '--dimension'):
            dimension = val
        elif opt in ('-n', '--name'):
            ds_name = val
        elif opt in ('-o', '--output'):
            output_fn = val
        elif opt in ('-s', '--sparse'):
            sparse = True

    if ds_name is None:
        # Name the dataset after the output file; '-' means stdout and is
        # not a useful name, so fall back to the generic default then.
        if output_fn not in (None, '-'):
            ds_name = output_fn
        else:
            ds_name = 'txt2ftsv'

    if not params:
        print_help()
        sys.exit(1)

    return params


def read_file(fd):
    """Return the non-blank lines of *fd*, stripped of surrounding whitespace.

    Args:
        fd: An iterable of text lines, e.g. an open file or sys.stdin.
    """
    stripped = (line.strip() for line in fd)
    return [line for line in stripped if line]


def build_dataset(dimension, id_lists, filenames):
    """Build the identifier-by-file membership dataset.

    Args:
        dimension: Name of the row dimension (the identifiers).
        id_lists: One list of identifiers per input file.
        filenames: Labels for the 'files' dimension, one per entry of
            id_lists.

    Returns:
        A dataset.CategoryDataset when the global ``category`` flag is set,
        otherwise a dataset.Dataset, named after the global ``ds_name``.
    """
    # Sorted so that the row order is reproducible between runs.
    all_ids = sorted(set().union(*[set(ids) for ids in id_lists]))

    # Map identifier -> row once instead of calling list.index() per cell.
    row_of = dict((ident, row) for row, ident in enumerate(all_ids))

    # 'b' is numpy's type code for a signed byte.
    x = numpy.zeros((len(all_ids), len(id_lists)), 'b')
    for col, ids in enumerate(id_lists):
        for ident in ids:
            x[row_of[ident], col] = True

    dataset_cls = dataset.CategoryDataset if category else dataset.Dataset
    return dataset_cls(x,
                       [(dimension, all_ids), ('files', filenames)],
                       name=ds_name)


def main():
    """Read every input, build the dataset and write it out."""
    filenames = parse_options()

    id_lists = []
    for fn in filenames:
        if os.path.exists(fn):
            with open(fn) as fd:
                id_lists.append(read_file(fd))
        elif fn == '-':
            id_lists.append(read_file(sys.stdin))
        else:
            # Skipping the file would leave the matrix with fewer columns
            # than there are labels in the 'files' dimension.
            sys.stderr.write("error: input file '%s' does not exist\n" % fn)
            sys.exit(1)

    ds = build_dataset(dimension, id_lists, filenames)

    target = sys.stdout if output_fn == '-' else output_fn
    dataset.write_ftsv(target, ds, sp_format=sparse)


if __name__ == '__main__':
    main()