diff --git a/scripts/illumina2ftsv/illumina2ftsv b/scripts/illumina2ftsv/illumina2ftsv index adc9329..70b5ddb 100644 --- a/scripts/illumina2ftsv/illumina2ftsv +++ b/scripts/illumina2ftsv/illumina2ftsv @@ -1,6 +1,7 @@ #!/usr/bin/python import getopt +import numpy import sys from laydi import dataset @@ -16,7 +17,7 @@ def parse_options(): def read_illumina_file(fn): fd = open(fn) line = fd.readline() - if line != "Illumina Inc. GenomeStudio version 1.7.0": + if line.strip() != "Illumina Inc. GenomeStudio version 1.7.0": raise Exception("File cannot be recognized as Illumina textual data") headers = {} @@ -31,15 +32,38 @@ def read_illumina_file(fn): values = [] line = fd.readline() while line != "": - values.append[x.strip() for x in line.split('\t') + values.append([x.strip() for x in line.split('\t')]) + line = fd.readline() - probe_col = col_headers.find("ProbeID") - + probe_col = col_headers.index("ProbeID") + print "probe id column:" + header_cols = [] + samples = [] + for i, colname in enumerate(col_headers): + if colname.startswith("AVG_Signal-"): + header_cols.append(i) + samples.append(colname.split("-", 1)[1]) + print header_cols + print samples + + a = numpy.array(values) + m = numpy.array(a[:,header_cols], dtype='d') + print m + + probe_ids = list(a[:, probe_col]) + + print "samples: ", len(samples) + print "probe_ids: ", len(probe_ids) + print "shape: ", m.shape + + ds = dataset.Dataset(m.transpose(), [('samples', samples), ('probe-ids', probe_ids)], name="Average Expr.") + dataset.write_ftsv("test.ftsv", ds) if __name__ == '__main__': - fn = params[0] + + fn = parse_options()[0] read_illumina_file(fn)