Renamed directory illumina2ftsv to illumina to gather all Illumina-relevant
scripts there.  Added laydi-annot-illumina script that generates Laydi
annotation files from Illumina text annotation files.
This commit is contained in:
2011-01-04 11:43:24 +00:00
parent f001d12584
commit 5f1f4d0dc2
2 changed files with 96 additions and 2 deletions

View File

@@ -0,0 +1,92 @@
#!/usr/bin/python
import getopt
import numpy
import sys
from laydi import dataset
# Program version, interpolated into the banner printed by print_help().
VERSION = "0.1.0"
# Output dataset file name; overwritten by parse_options() when -d/--dataset
# is given.
# NOTE(review): nothing in the visible code reads this value --
# read_illumina_file() always writes to "test.ftsv". Confirm intent.
dataset_fn = "-"
def print_help():
    """Write the program banner (name plus VERSION) and usage line to stdout."""
    banner = [
        "illumina2ftsv %s" % VERSION,
        "",
        "Usage: illumina2ftsv [options] <illumina_genome_studio_file>",
        "",
    ]
    # One write for the whole banner; every entry ends up on its own line.
    sys.stdout.write("\n".join(banner) + "\n")
def parse_options():
    """Parse the command line found in sys.argv.

    Recognized options:
      -d FILE, --dataset FILE   store FILE in the module-level dataset_fn
      -h, --help                print the help text and exit with status 0

    Exactly one positional parameter (the input file) is required.  When
    that is not the case, or when an option is invalid, the help text is
    printed and the process exits with status 1.

    Returns the list of positional parameters (always of length one).
    """
    global dataset_fn

    s_opts = "d:h"
    # The trailing "=" marks --dataset as taking a value, mirroring "d:" in
    # the short options.  Without it "--dataset FILE" is parsed as a flag
    # and FILE is mistaken for a second positional parameter.
    l_opts = ["dataset=", "help"]

    try:
        options, params = getopt.getopt(sys.argv[1:], s_opts, l_opts)
    except getopt.GetoptError as err:
        # Unknown option or missing option argument: report it like any
        # other usage error instead of dying with a traceback.
        sys.stderr.write("%s\n" % err)
        print_help()
        sys.exit(1)

    for opt, val in options:
        if opt in ("-d", "--dataset"):
            dataset_fn = val
        elif opt in ("-h", "--help"):
            print_help()
            sys.exit(0)

    if len(params) != 1:
        print_help()
        sys.exit(1)

    return params
def read_illumina_file(fn):
    """Convert an Illumina GenomeStudio text export into an ftsv dataset.

    The file named fn must consist of:
      1. the signature line "Illumina Inc. GenomeStudio version 1.7.0",
      2. "key = value" header lines, terminated by a blank line,
      3. one tab-separated row of column names,
      4. tab-separated data rows until end of file.

    Every column named "AVG_Signal-<sample>" is collected into a matrix of
    doubles, and the "ProbeID" column supplies the probe identifiers.  The
    transposed matrix (samples x probes) is wrapped in a dataset.Dataset
    named "Average Expr." and written to "test.ftsv".  Progress information
    is printed to stdout along the way.

    Raises Exception if the signature line does not match, and ValueError
    if a header line has no "=" or there is no "ProbeID" column.
    """
    # The context manager closes the input even when parsing fails.
    with open(fn) as fd:
        line = fd.readline()
        if line.strip() != "Illumina Inc. GenomeStudio version 1.7.0":
            raise Exception("File cannot be recognized as Illumina textual data")

        # key = value header block; ends at the first blank line (or EOF).
        # The parsed values are not used further in this function.
        headers = {}
        line = fd.readline()
        while line.strip() != "":
            key, val = line.split("=", 1)
            headers[key.strip()] = val.strip()
            line = fd.readline()

        # Strip each name: the last one would otherwise keep the line
        # terminator, corrupting the last sample name and hiding a trailing
        # "ProbeID" column from index() below.
        col_headers = [x.strip() for x in fd.readline().split('\t')]

        values = []
        for line in iter(fd.readline, ""):
            if line.strip() == "":
                # A blank (e.g. trailing) line would yield a one-element
                # row and make the array below ragged.
                continue
            values.append([x.strip() for x in line.split('\t')])

    probe_col = col_headers.index("ProbeID")
    print("probe id column:")

    # Column positions and sample names of all AVG_Signal-<sample> columns.
    header_cols = []
    samples = []
    for i, colname in enumerate(col_headers):
        if colname.startswith("AVG_Signal-"):
            header_cols.append(i)
            samples.append(colname.split("-", 1)[1])

    print(header_cols)
    print(samples)

    a = numpy.array(values)
    m = numpy.array(a[:, header_cols], dtype='d')
    print(m)

    probe_ids = list(a[:, probe_col])

    print("samples:  %d" % len(samples))
    print("probe_ids:  %d" % len(probe_ids))
    print("shape:  %s" % (m.shape,))

    ds = dataset.Dataset(m.transpose(),
                         [('samples', samples), ('probe-ids', probe_ids)],
                         name="Average Expr.")
    # NOTE(review): the output name is hard-coded; the -d/--dataset value
    # stored in dataset_fn is not used here.  Confirm how write_ftsv should
    # treat the default "-" before wiring it in.
    dataset.write_ftsv("test.ftsv", ds)
if __name__ == '__main__':
    # parse_options() exits unless exactly one input file was given, so the
    # single-element unpacking below always succeeds.
    (input_path,) = parse_options()
    read_illumina_file(input_path)

View File

@@ -0,0 +1,71 @@
#!/usr/bin/python
import getopt
import os, os.path
import sys
# Names of the input columns that convert_annotations() copies to the
# annotation file, in output order.  Each name is looked up in the column
# header row that follows the "[Probes]" marker of the input file.
OUTPUT_COLS = ["Probe_Id", "RefSeq_ID", "Unigene_ID", "Entrez_Gene_ID", "Accession", "Symbol", "Chromosome", "Definition"]
def print_help():
    """Write the program name, usage line and a short description to stdout."""
    help_lines = (
        "laydi-annot-illumina",
        "",
        "Usage: laydi-annot-illumina <illumina-annotation-file.txt>",
        "",
        "Description:",
        " Produce laydi annotation files from Illumina text annotation files",
        " Illumina files can be downloaded from:",
        " http://www.switchtoi.com/annotationfiles.ilmn",
        "",
    )
    # Emit the whole help text at once, one entry per output line.
    sys.stdout.write("\n".join(help_lines) + "\n")
def parse_cmdline():
    """Parse the command line found in sys.argv.

    The only option is -h/--help, which prints the help text and exits
    with status 0.  Exactly one positional parameter (the Illumina
    annotation file) is required.  When that is not the case, or when an
    option is invalid, the help text is printed and the process exits with
    status 1.

    Returns the single positional parameter as a string.
    """
    short_opts = "h"
    long_opts = ["help"]

    try:
        options, params = getopt.getopt(sys.argv[1:], short_opts, long_opts)
    except getopt.GetoptError as err:
        # Unknown option: report it as a usage error rather than letting
        # the exception escape as a traceback.
        sys.stderr.write("%s\n" % err)
        print_help()
        sys.exit(1)

    for key, val in options:
        if key in ("-h", "--help"):
            print_help()
            sys.exit(0)

    if len(params) != 1:
        print_help()
        sys.exit(1)

    return params[0]
def convert_annotations(fn_in, fn_out):
    """Copy the OUTPUT_COLS columns of an Illumina annotation file to fn_out.

    Everything in fn_in before the line starting with "[Probes]" is
    skipped.  The line after that marker holds the tab-separated column
    names; the lines after it are probe records, up to the next line
    starting with "[" or the end of the file.

    The output is tab-separated text: a header row (OUTPUT_COLS with the
    first name replaced by "probe-id") followed by one row per probe
    holding the OUTPUT_COLS values in that same order.

    Raises ValueError if fn_in has no "[Probes]" section or if its column
    header row lacks one of the OUTPUT_COLS names.
    """
    with open(fn_in) as fd_in:
        # Skip headers.  Also stop at end of file: readline() keeps
        # returning "" there, which would otherwise loop forever.
        line = fd_in.readline()
        while line != "" and not line.startswith("[Probes]"):
            line = fd_in.readline()
        if line == "":
            raise ValueError("No [Probes] section found in %s" % (fn_in,))

        # Drop the line terminator so the last column name can be matched.
        colnames = fd_in.readline().rstrip("\r\n").split("\t")
        export_colnums = [colnames.index(x) for x in OUTPUT_COLS]

        # The output is opened only after the input has been validated, so
        # a bad input file does not leave an empty output behind.
        with open(fn_out, "w") as fd_out:
            # Print output column headers.  They must describe the exported
            # columns, not every column of the input file.
            export_colnames = ["probe-id"] + list(OUTPUT_COLS[1:])
            fd_out.write("\t".join(export_colnames) + "\n")

            for line in iter(fd_in.readline, ""):
                if line.startswith("["):
                    # Start of the next section: no more probe records.
                    break
                record = line.rstrip("\r\n")
                if record == "":
                    continue
                values = record.split("\t")
                output_values = [values[x] for x in export_colnums]
                fd_out.write("\t".join(output_values) + "\n")
if __name__ == "__main__":
    source_path = parse_cmdline()

    # The output name is the input's base name (directory part dropped)
    # with its extension replaced by ".annot".
    stem = os.path.splitext(os.path.basename(source_path))[0]
    target_path = stem + ".annot"

    report = [
        "Reading: %s" % (source_path,),
        "Writing: %s" % (target_path,),
        "",
        "Annotations:",
        ", ".join(OUTPUT_COLS),
    ]
    sys.stdout.write("\n".join(report) + "\n")

    convert_annotations(source_path, target_path)