Renamed directory illumina2ftsv to illumina to gather all Illumina-relevant scripts there.

Added laydi-annot-illumina script that generates Laydi annotation files from Illumina text annotation files.
2011-01-04 11:43:24 +00:00
parent f001d12584
commit 5f1f4d0dc2
2 changed files with 96 additions and 2 deletions
--- a/scripts/illumina/illumina2ftsv
+++ b/scripts/illumina/illumina2ftsv
@@ -0,0 +1,145 @@
+#!/usr/bin/python
+
+import getopt
+import numpy
+import sys
+
+from laydi import dataset
+
+VERSION = "0.1.0"
+
+# Output file name set by -d/--dataset; the default "-" is treated as
+# "no output name given".
+dataset_fn = "-"
+
+# File written when no output name is given on the command line.
+DEFAULT_OUTPUT_FN = "test.ftsv"
+
+
+def print_help():
+    """Print the program name, version and usage summary to stdout."""
+    print "illumina2ftsv %s" % VERSION
+    print
+    print "Usage: illumina2ftsv [options] <illumina_genome_studio_file>"
+    print
+    print "Options:"
+    print "    -d, --dataset=FILE  write the dataset to FILE (default: %s)" % DEFAULT_OUTPUT_FN
+    print "    -h, --help          show this help and exit"
+    print
+
+
+def parse_options():
+    """Parse sys.argv and return the list of positional parameters.
+
+    Stores the value of -d/--dataset in the module-level dataset_fn.
+    Prints the usage text and exits with status 0 for -h/--help, and with
+    status 1 when an option is not recognized or when the number of
+    positional parameters is not exactly one.
+    """
+    global dataset_fn
+
+    s_opts = "d:h"
+    # The trailing "=" makes getopt accept a value for --dataset, matching
+    # the ":" after "d" in the short option string.
+    l_opts = ["dataset=", "help"]
+
+    try:
+        options, params = getopt.getopt(sys.argv[1:], s_opts, l_opts)
+    except getopt.GetoptError, err:
+        sys.stderr.write("illumina2ftsv: %s\n" % err)
+        print_help()
+        sys.exit(1)
+
+    for opt, val in options:
+        if opt in ["-d", "--dataset"]:
+            dataset_fn = val
+        elif opt in ["-h", "--help"]:
+            print_help()
+            sys.exit(0)
+
+    if len(params) != 1:
+        print_help()
+        sys.exit(1)
+
+    return params
+
+
+def read_illumina_file(fn, output_fn=DEFAULT_OUTPUT_FN):
+    """Convert an Illumina GenomeStudio text export to an ftsv dataset file.
+
+    fn is the path of the GenomeStudio file.  Its first line must be the
+    GenomeStudio 1.7.0 signature.  It is followed by "key = value" header
+    lines, a blank line, one tab-separated row of column names and then
+    the tab-separated data rows.  The "ProbeID" column supplies the probe
+    identifiers and every "AVG_Signal-<sample>" column supplies the values
+    of one sample.  The transposed value matrix (one row per sample) is
+    written to output_fn, and progress information is printed to stdout.
+
+    Raises Exception if the signature line does not match, and ValueError
+    if a header line has no "=" or there is no "ProbeID" column.
+    """
+    fd = open(fn)
+    try:
+        line = fd.readline()
+        if line.strip() != "Illumina Inc. GenomeStudio version 1.7.0":
+            raise Exception("File cannot be recognized as Illumina textual data")
+
+        # Key/value header section, ended by a blank line or end of file.
+        # The parsed values are not used further.
+        headers = {}
+        line = fd.readline()
+        while line.strip() != "":
+            key, val = line.split("=", 1)
+            headers[key.strip()] = val.strip()
+            line = fd.readline()
+
+        # Strip every name like the data cells below: the last name still
+        # carries the line terminator, which would break the lookups.
+        col_headers = [x.strip() for x in fd.readline().split('\t')]
+
+        values = []
+        line = fd.readline()
+        while line != "":
+            # Skip blank lines so that they cannot make the table ragged.
+            if line.strip() != "":
+                values.append([x.strip() for x in line.split('\t')])
+            line = fd.readline()
+    finally:
+        # Close the input even when parsing fails.
+        fd.close()
+
+    probe_col = col_headers.index("ProbeID")
+    print "probe id column:", probe_col
+
+    # Column indices and sample names of the AVG_Signal-<sample> columns.
+    header_cols = []
+    samples = []
+    for i, colname in enumerate(col_headers):
+        if colname.startswith("AVG_Signal-"):
+            header_cols.append(i)
+            samples.append(colname.split("-", 1)[1])
+
+    print header_cols
+    print samples
+
+    a = numpy.array(values)
+    m = numpy.array(a[:, header_cols], dtype='d')
+    print m
+
+    probe_ids = list(a[:, probe_col])
+
+    print "samples: ", len(samples)
+    print "probe_ids: ", len(probe_ids)
+    print "shape: ", m.shape
+
+    ds = dataset.Dataset(m.transpose(), [('samples', samples), ('probe-ids', probe_ids)], name="Average Expr.")
+    dataset.write_ftsv(output_fn, ds)
+
+
+if __name__ == '__main__':
+    fn = parse_options()[0]
+    # Keep writing the historical default file unless -d named another one.
+    if dataset_fn == "-":
+        read_illumina_file(fn)
+    else:
+        read_illumina_file(fn, dataset_fn)
--- a/scripts/illumina/laydi-annot-illumina
+++ b/scripts/illumina/laydi-annot-illumina
@@ -0,0 +1,116 @@
+#!/usr/bin/python
+
+import getopt
+import os, os.path
+import sys
+
+# Input columns copied to the annotation file, in output order.
+OUTPUT_COLS = ["Probe_Id", "RefSeq_ID", "Unigene_ID", "Entrez_Gene_ID", "Accession", "Symbol", "Chromosome", "Definition"]
+
+
+def print_help():
+    """Print the usage text and a short description to stdout."""
+    print "laydi-annot-illumina"
+    print
+    print "Usage: laydi-annot-illumina <illumina-annotation-file.txt>"
+    print
+    print "Description:"
+    print "    Produce laydi annotation files from Illumina text annotation files"
+    print "    Illumina files can be downloaded from:"
+    print "    http://www.switchtoi.com/annotationfiles.ilmn"
+    print
+
+
+def parse_cmdline():
+    """Parse sys.argv and return the single input file name.
+
+    Prints the usage text and exits with status 0 for -h/--help, and with
+    status 1 when an option is not recognized or when the number of
+    positional parameters is not exactly one.
+    """
+    short_opts = "h"
+    long_opts = ["help"]
+    try:
+        options, params = getopt.getopt(sys.argv[1:], short_opts, long_opts)
+    except getopt.GetoptError, err:
+        sys.stderr.write("laydi-annot-illumina: %s\n" % err)
+        print_help()
+        sys.exit(1)
+
+    for key, val in options:
+        if key in ["-h", "--help"]:
+            print_help()
+            sys.exit(0)
+
+    if len(params) != 1:
+        print_help()
+        sys.exit(1)
+
+    return params[0]
+
+
+def convert_annotations(fn_in, fn_out):
+    """Write the OUTPUT_COLS columns of an Illumina annotation file to fn_out.
+
+    fn_in is read up to its "[Probes]" line.  The next line holds the
+    tab-separated column names, and the lines after it are data rows up to
+    the next line starting with "[" or the end of the file.  fn_out gets a
+    tab-separated header row followed by one row per data row, holding the
+    OUTPUT_COLS columns in that order.
+
+    Raises ValueError if there is no "[Probes]" line or if one of the
+    OUTPUT_COLS names is missing from the column names, and IndexError if
+    a data row is too short to hold an exported column.
+    """
+    fd_in = open(fn_in)
+    try:
+        # Skip headers
+        line = fd_in.readline()
+        while not line.startswith("[Probes]"):
+            if line == "":
+                # readline() keeps returning "" at end of file, so stop
+                # here instead of looping forever.
+                raise ValueError("No [Probes] section found in %s" % (fn_in,))
+            line = fd_in.readline()
+
+        # Drop the line terminator first so that the last column name can
+        # be matched against OUTPUT_COLS.
+        colnames = fd_in.readline().rstrip("\r\n").split("\t")
+        export_colnums = [colnames.index(x) for x in OUTPUT_COLS]
+
+        # Open the output only now, so that no empty file is left behind
+        # when the input is rejected above.
+        fd_out = open(fn_out, "w")
+        try:
+            # Print output column headers: exactly the exported columns,
+            # with the probe id column renamed to "probe-id".
+            export_colnames = ["probe-id"] + OUTPUT_COLS[1:]
+            print >> fd_out, "\t".join(export_colnames)
+
+            line = fd_in.readline()
+            while line != "" and not line.startswith("["):
+                # Blank lines carry no probe and are skipped.
+                if line.strip() != "":
+                    values = line.rstrip("\r\n").split("\t")
+                    output_values = [values[x] for x in export_colnums]
+                    print >> fd_out, "\t".join(output_values)
+                line = fd_in.readline()
+        finally:
+            fd_out.close()
+    finally:
+        fd_in.close()
+
+
+if __name__ == "__main__":
+    fn_in = parse_cmdline()
+    # The output goes to the current directory: input base name + ".annot".
+    fn_out = os.path.split(fn_in)[1]
+    fn_out = os.path.splitext(fn_out)[0] + ".annot"
+
+    print "Reading: %s" % (fn_in,)
+    print "Writing: %s" % (fn_out,)
+    print
+    print "Annotations:"
+    print ", ".join(OUTPUT_COLS)
+
+    convert_annotations(fn_in, fn_out)