2010-09-24 17:18:52 +02:00
|
|
|
#!/usr/bin/python
|
|
|
|
|
|
|
|
import getopt
|
2010-09-27 21:45:56 +02:00
|
|
|
import numpy
|
2010-09-24 17:18:52 +02:00
|
|
|
import sys
|
|
|
|
|
|
|
|
from laydi import dataset
|
|
|
|
|
2011-01-04 12:43:24 +01:00
|
|
|
# Program version, interpolated into the banner printed by print_help().
VERSION = "0.1.0"
|
|
|
|
|
|
|
|
# Overwritten by parse_options() with the value of the -d/--dataset option;
# "-" is the value it keeps when that option is not given.
dataset_fn = "-"
|
|
|
|
|
|
|
|
def print_help():
    """Write the program banner and the usage synopsis to standard output."""
    banner = "illumina2ftsv %s" % VERSION
    usage = "Usage: illumina2ftsv [options] <illumina_genome_studio_file>"
    # One entry per output line; the empty strings are the blank lines that
    # follow the banner and the usage line.
    for text in (banner, "", usage, ""):
        print(text)
|
|
|
|
|
|
|
|
|
2010-09-24 17:18:52 +02:00
|
|
|
def parse_options():
    """Parse sys.argv and return the list of positional parameters.

    Recognised options:
      -d VALUE, --dataset VALUE   store VALUE in the module-level dataset_fn
      -h, --help                  print the help text and exit with status 0

    Returns:
        The positional parameters as a list; it always holds exactly one
        element when this function returns.

    Exits (SystemExit):
        status 0 after printing help for -h/--help,
        status 1 after printing help when the number of positional
        parameters is not exactly one,
        status 2 after reporting an option that getopt cannot parse.
    """
    global dataset_fn

    s_opts = "d:h"
    # The trailing "=" tells getopt that --dataset takes a value, matching
    # the ":" after "d" in the short option string.  Without it the value
    # would be treated as a positional parameter.
    l_opts = ["dataset=", "help"]

    try:
        options, params = getopt.getopt(sys.argv[1:], s_opts, l_opts)
    except getopt.GetoptError as err:
        # Unknown option or missing option value: report it instead of
        # terminating with a traceback.
        sys.stderr.write("illumina2ftsv: %s\n" % err)
        print_help()
        sys.exit(2)

    for opt, val in options:
        if opt in ("-d", "--dataset"):
            dataset_fn = val
        elif opt in ("-h", "--help"):
            print_help()
            sys.exit(0)

    if len(params) != 1:
        print_help()
        sys.exit(1)

    return params
|
|
|
|
|
|
|
|
def read_illumina_file(fn):
    """Convert an Illumina GenomeStudio text export into an ftsv dataset file.

    The file named by fn is read as, in order:
      1. the signature line "Illumina Inc. GenomeStudio version 1.7.0";
      2. "key = value" header lines, ended by a blank line (or end of file);
      3. one line of tab-separated column names;
      4. tab-separated data rows up to end of file.

    The "ProbeID" column and every column whose name starts with
    "AVG_Signal-" are extracted; the signal columns are converted to a
    float matrix, wrapped in a dataset.Dataset and written to "test.ftsv"
    with dataset.write_ftsv.  Diagnostic output is printed along the way.

    Every print call takes a single pre-formatted argument so the output is
    the same whether print is a statement or a function.

    Raises:
        Exception: the first line is not the expected signature.
        ValueError: a header line contains no "=", or no column is named
            "ProbeID".
    """
    # The with-block closes the file on every path, including the raise.
    with open(fn) as fd:
        signature = fd.readline().strip()
        if signature != "Illumina Inc. GenomeStudio version 1.7.0":
            raise Exception("File cannot be recognized as Illumina textual data")

        # Header section.  The parsed pairs are not used further below; the
        # loop's job here is to consume the section up to the blank line.
        headers = {}
        line = fd.readline()
        while line.strip() != "":
            key, val = line.split("=", 1)
            headers[key.strip()] = val.strip()
            line = fd.readline()

        # Strip every column name: the last one still carries the line
        # ending, which would otherwise end up inside a sample name or keep
        # "ProbeID" from being found when it is the final column.
        col_headers = [name.strip() for name in fd.readline().split('\t')]

        # Whitespace-only lines (e.g. a trailing blank line) are skipped;
        # they would otherwise become one-field rows and make the table
        # ragged.
        values = [[x.strip() for x in line.split('\t')]
                  for line in iter(fd.readline, "")
                  if line.strip()]

    probe_col = col_headers.index("ProbeID")
    print("probe id column:")

    # Indices of the signal columns and the sample name taken from the part
    # of each column name after the first "-".
    header_cols = []
    samples = []
    for i, colname in enumerate(col_headers):
        if colname.startswith("AVG_Signal-"):
            header_cols.append(i)
            samples.append(colname.split("-", 1)[1])

    print(header_cols)
    print(samples)

    a = numpy.array(values)
    # One row per data line, one column per selected signal column.
    m = numpy.array(a[:, header_cols], dtype='d')
    print(m)

    probe_ids = list(a[:, probe_col])

    print("samples:  %d" % len(samples))
    print("probe_ids:  %d" % len(probe_ids))
    print("shape:  %s" % (m.shape,))

    # Transposed so that rows follow samples and columns follow probe ids,
    # matching the order of the two dimension descriptions.
    ds = dataset.Dataset(m.transpose(),
                         [('samples', samples), ('probe-ids', probe_ids)],
                         name="Average Expr.")
    # NOTE(review): the output path is fixed; the -d/--dataset value stored
    # in dataset_fn is not consulted here -- confirm whether it should be.
    dataset.write_ftsv("test.ftsv", ds)
|
2010-09-24 17:18:52 +02:00
|
|
|
|
|
|
|
if __name__ == '__main__':
    # parse_options() returns the positional parameters; the first one is
    # the Illumina file to convert.
    read_illumina_file(parse_options()[0])
|
|
|
|
|
|
|
|
|