2009-02-05 01:11:48 +01:00
|
|
|
#!/usr/bin/env python
|
|
|
|
|
|
|
|
import numpy
|
|
|
|
import os.path
|
|
|
|
import sys
|
|
|
|
|
|
|
|
from laydi import dataset
|
|
|
|
from getopt import getopt
|
|
|
|
|
|
|
|
# Default settings; parse_options() overwrites these from the command line.

# Boolean switches (-s/--sparse and -c/--category).
sparse = False
category = False

# Name of the output dataset (-n/--name); when left as None,
# parse_options() derives a name after option parsing.
ds_name = None

# Output destination (-o/--output); '-' makes the script write to stdout.
output_fn = '-'

# Name of the dimension holding the identifiers (-d/--dimension).
dimension = 'dim_doe'
|
2009-02-05 01:11:48 +01:00
|
|
|
|
|
|
|
def print_help():
    """Write the command-line option summary to standard output.

    The summary is framed by one empty line before and one after.
    """
    help_lines = (
        '',
        'options:',
        ' -h, --help Show this help text.',
        ' -c, --category Make category dataset',
        ' -d, --dimension=DIM Make output in dimension DIM',
        ' -n, --name=NAME Set name of output dataset',
        ' -o, --output=FILE Save output dataset in FILE',
        ' -s, --sparse Save output in sparse format',
        '',
    )
    for text in help_lines:
        sys.stdout.write(text + '\n')
|
|
|
|
|
|
|
|
def parse_options():
    """Parse ``sys.argv`` and store the chosen settings in module globals.

    Updates the module-level ``category``, ``dimension``, ``ds_name``,
    ``output_fn`` and ``sparse`` variables.  When no dataset name is given
    with ``-n``/``--name``, the output file name is used as the name, with
    ``'txt2ftsv'`` as the fallback if ``output_fn`` is None.

    Prints the help text and exits with status 0 for ``-h``/``--help``, and
    with status 1 when no input file is named.  Errors raised by ``getopt``
    for malformed options are not caught here.

    Returns the list of positional parameters (the input file names).
    """
    global category, dimension, ds_name, output_fn, sparse

    # A trailing ':' (short form) or '=' (long form) marks an option that
    # takes a value.  Every option shown by print_help() must be listed
    # here, otherwise getopt rejects it.
    short_opts = 'cd:hn:o:s'
    long_opts = ['help', 'category', 'dimension=', 'name=', 'output=',
                 'sparse']

    options, params = getopt(sys.argv[1:], short_opts, long_opts)

    for opt, val in options:
        if opt in ('-h', '--help'):
            print_help()
            sys.exit(0)
        elif opt in ('-c', '--category'):
            category = True
        elif opt in ('-d', '--dimension'):
            dimension = val
        elif opt in ('-n', '--name'):
            ds_name = val
        elif opt in ('-o', '--output'):
            output_fn = val
        elif opt in ('-s', '--sparse'):
            sparse = True

    # Derive a dataset name when none was given explicitly.
    if ds_name is None:
        if output_fn is not None:
            ds_name = output_fn
        else:
            ds_name = 'txt2ftsv'

    # At least one input file is required.
    if len(params) == 0:
        print_help()
        sys.exit(1)

    return params
|
|
|
|
|
|
|
|
def read_file(fd):
    """Return the non-blank lines read from ``fd``.

    Every line is stripped of leading and trailing whitespace; lines that
    are empty after stripping are left out of the returned list.
    """
    stripped = (raw.strip() for raw in fd.readlines())
    return [entry for entry in stripped if entry != '']
|
|
|
|
|
|
|
|
def build_dataset(dimension, id_lists, filenames):
    """Build a membership dataset from several lists of identifiers.

    The data matrix has one row per distinct identifier found in
    ``id_lists`` and one column per list; an entry is 1 when the row's
    identifier occurs in the column's list and 0 otherwise.

    dimension -- name given to the identifier (row) dimension
    id_lists  -- sequence of identifier lists, one per input
    filenames -- labels for the 'files' (column) dimension

    Reads the module-level ``category`` flag to choose between
    ``dataset.CategoryDataset`` and ``dataset.Dataset``, and passes the
    module-level ``ds_name`` as the dataset name.  Returns the new dataset.
    """
    # Collect the distinct identifiers; unlike a reduce() over set.union
    # this also copes with an empty id_lists.
    id_set = set()
    for idl in id_lists:
        id_set.update(idl)
    all_ids = list(id_set)

    # Map each identifier to its row once, instead of a linear
    # list.index() search for every single entry.
    row_of = dict((ident, row) for row, ident in enumerate(all_ids))

    x = numpy.zeros((len(all_ids), len(id_lists)), 'b')
    for col, idl in enumerate(id_lists):
        for ident in idl:
            x[row_of[ident], col] = True

    if category:
        dataset_class = dataset.CategoryDataset
    else:
        dataset_class = dataset.Dataset
    return dataset_class(x, [(dimension, all_ids), ('files', filenames)],
                         name=ds_name)
|
2009-02-05 01:11:48 +01:00
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: read every input named on the command line,
    # build the membership dataset and write it out in ftsv format.
    id_lists = []
    used_filenames = []

    filenames = parse_options()
    for fn in filenames:
        if os.path.exists(fn):
            fd = open(fn)
            try:
                id_lists.append(read_file(fd))
            finally:
                # Close the file even if reading fails.
                fd.close()
        elif fn == '-':
            # '-' stands for standard input unless a file of that name exists.
            id_lists.append(read_file(sys.stdin))
        else:
            sys.stderr.write('warning: skipping missing file: %s\n' % fn)
            continue
        # Record only inputs that were actually read, so the column labels
        # stay aligned with the columns of the data matrix.
        used_filenames.append(fn)

    if not id_lists:
        sys.stderr.write('error: no input could be read\n')
        sys.exit(1)

    ds = build_dataset(dimension, id_lists, used_filenames)

    if output_fn == '-':
        dataset.write_ftsv(sys.stdout, ds, sp_format=sparse)
    else:
        dataset.write_ftsv(output_fn, ds, sp_format=sparse)
|
2009-02-05 01:11:48 +01:00
|
|
|
|