#!/usr/bin/python import getopt import numpy import sys from laydi import dataset VERSION = "0.1.0" dataset_fn = "-" def print_help(): print "illumina2ftsv %s" % VERSION print print "Usage: illumina2ftsv [options] " print def parse_options(): s_opts = "d:h" l_opts = ["dataset", "help"] options, params = getopt.getopt(sys.argv[1:], s_opts, l_opts) for opt, val in options: if opt in ["-d", "--dataset"]: global dataset_fn dataset_fn = val elif opt in ["-h", "--help"]: print_help() sys.exit(0) if len(params) != 1: print_help() sys.exit(1) return params def read_illumina_file(fn): fd = open(fn) line = fd.readline() if line.strip() != "Illumina Inc. GenomeStudio version 1.7.0": raise Exception("File cannot be recognized as Illumina textual data") headers = {} line= fd.readline() while line.strip() != "": key, val = line.split("=", 1) headers[key.strip()] = val.strip() line = fd.readline() col_headers = fd.readline().split('\t') values = [] line = fd.readline() while line != "": values.append([x.strip() for x in line.split('\t')]) line = fd.readline() probe_col = col_headers.index("ProbeID") print "probe id column:" header_cols = [] samples = [] for i, colname in enumerate(col_headers): if colname.startswith("AVG_Signal-"): header_cols.append(i) samples.append(colname.split("-", 1)[1]) print header_cols print samples a = numpy.array(values) m = numpy.array(a[:,header_cols], dtype='d') print m probe_ids = list(a[:, probe_col]) print "samples: ", len(samples) print "probe_ids: ", len(probe_ids) print "shape: ", m.shape ds = dataset.Dataset(m.transpose(), [('samples', samples), ('probe-ids', probe_ids)], name="Average Expr.") dataset.write_ftsv("test.ftsv", ds) if __name__ == '__main__': fn = parse_options()[0] read_illumina_file(fn)