This repository has been archived on 2024-07-04. You can view files and clone it, but cannot push or open issues or pull requests.
laydi/scripts/illumina/illumina2ftsv

93 lines
2.1 KiB
Plaintext
Raw Normal View History

#!/usr/bin/python
import getopt
2010-09-27 21:45:56 +02:00
import numpy
import sys
from laydi import dataset
# Program version; shown in the banner printed by print_help().
VERSION = "0.1.0"
# Value of the -d/--dataset command-line option, stored by parse_options().
# "-" is the default used when the option is not given.
dataset_fn = "-"
def print_help():
    """Print the program banner and a one-line usage summary to stdout."""
    # One entry per output line; the empty strings are the blank separator
    # lines that follow the banner and the usage line.
    help_lines = [
        "illumina2ftsv %s" % VERSION,
        "",
        "Usage: illumina2ftsv [options] <illumina_genome_studio_file>",
        "",
    ]
    print("\n".join(help_lines))
def parse_options():
    """Parse ``sys.argv`` and return the positional parameters.

    Recognized options:

    * ``-d FILE`` / ``--dataset FILE``: store FILE in the module-level
      ``dataset_fn``.
    * ``-h`` / ``--help``: print the help text and exit with status 0.

    Exactly one positional parameter (the Illumina input file) is required.

    Returns:
        A one-element list holding the positional parameter.

    Exits:
        Status 0 after ``--help``; status 1 when the number of positional
        parameters is not exactly one; status 2 on an unrecognized option or
        an option that is missing its value.
    """
    global dataset_fn

    s_opts = "d:h"
    # The trailing "=" marks --dataset as taking a value, matching the "d:"
    # short form.  Without it getopt treats --dataset as a bare flag and the
    # file name that follows is misread as an extra positional parameter.
    l_opts = ["dataset=", "help"]

    try:
        options, params = getopt.getopt(sys.argv[1:], s_opts, l_opts)
    except getopt.GetoptError as err:
        # Report bad command lines as a usage error instead of a traceback.
        sys.stderr.write("illumina2ftsv: %s\n" % err)
        print_help()
        sys.exit(2)

    for opt, val in options:
        if opt in ("-d", "--dataset"):
            dataset_fn = val
        elif opt in ("-h", "--help"):
            print_help()
            sys.exit(0)

    if len(params) != 1:
        print_help()
        sys.exit(1)

    return params
def read_illumina_file(fn, output_fn="test.ftsv"):
    """Convert an Illumina GenomeStudio text export to an ftsv dataset file.

    The input is read as follows:

    1. The first line must be the GenomeStudio 1.7.0 signature.
    2. ``key = value`` header lines follow, ended by a blank line.
    3. One tab-separated line of column names.
    4. Tab-separated data rows until end of file.

    The column named ``ProbeID`` supplies the probe identifiers.  Every
    column whose name starts with ``AVG_Signal-`` is taken as one sample;
    the text after the first ``-`` is the sample name.

    Args:
        fn: Path of the GenomeStudio text file to read.
        output_fn: Path handed to ``dataset.write_ftsv``.  Defaults to
            ``"test.ftsv"``.

    Returns:
        The ``dataset.Dataset`` that was written.

    Raises:
        Exception: If the signature line does not match, or the file holds
            no data rows.
        ValueError: If a header line has no ``=`` or there is no ``ProbeID``
            column.
    """
    # "with" guarantees the file is closed even when parsing fails.
    with open(fn) as fd:
        line = fd.readline()
        if line.strip() != "Illumina Inc. GenomeStudio version 1.7.0":
            raise Exception("File cannot be recognized as Illumina textual data")

        # key=value header section; a blank line (or end of file) ends it.
        headers = {}
        line = fd.readline()
        while line.strip() != "":
            key, val = line.split("=", 1)
            headers[key.strip()] = val.strip()
            line = fd.readline()

        # Strip every name: the last one would otherwise keep the line's
        # trailing newline and fail the "ProbeID" lookup or corrupt the
        # last sample name.
        col_headers = [name.strip() for name in fd.readline().split('\t')]

        values = []
        for line in iter(fd.readline, ""):
            if not line.strip():
                # A blank line would add a one-cell row and make the table
                # ragged, which breaks the array conversion below.
                continue
            values.append([cell.strip() for cell in line.split('\t')])

    if not values:
        raise Exception("Illumina file contains no data rows")

    probe_col = col_headers.index("ProbeID")
    print("probe id column: %d" % probe_col)

    header_cols = []
    samples = []
    for i, colname in enumerate(col_headers):
        if colname.startswith("AVG_Signal-"):
            header_cols.append(i)
            samples.append(colname.split("-", 1)[1])

    print(header_cols)
    print(samples)

    # String table of all cells, then a float matrix of the sample columns.
    a = numpy.array(values)
    m = numpy.array(a[:, header_cols], dtype='d')
    print(m)

    probe_ids = list(a[:, probe_col])

    print("samples:  %d" % len(samples))
    print("probe_ids:  %d" % len(probe_ids))
    print("shape:  %s" % (m.shape,))

    # m has one row per probe; transpose so the first axis is samples,
    # matching the order of the dimension list.
    ds = dataset.Dataset(m.transpose(), [('samples', samples), ('probe-ids', probe_ids)], name="Average Expr.")
    dataset.write_ftsv(output_fn, ds)
    return ds
if __name__ == '__main__':
    # parse_options() returns the positional parameters; the first one is
    # the Illumina input file to convert.
    read_illumina_file(parse_options()[0])