laydi/scripts/illumina/illumina2ftsv

#!/usr/bin/python

import getopt
import numpy
import sys

from laydi import dataset

# Version string shown in the banner printed by print_help().
VERSION = "0.1.0"

# File name supplied with -d/--dataset; assigned in parse_options().
# NOTE(review): nothing visible in this script reads the value after it is
# set. The default "-" presumably stands for standard output -- confirm.
dataset_fn = "-"

def print_help():
    """Print the program banner and a one-line usage summary to stdout.

    The output is four lines: the banner with the version, an empty line,
    the usage line and a trailing empty line.
    """
    banner = "illumina2ftsv %s" % VERSION
    usage = "Usage: illumina2ftsv [options] <illumina_genome_studio_file>"

    # A single parenthesized argument prints the same bytes whether print
    # is a statement (Python 2) or a function; "" yields an empty line.
    for text in (banner, "", usage, ""):
        print(text)


def parse_options():
    """Parse the command line found in ``sys.argv[1:]``.

    Recognized options:
      -d FILE, --dataset=FILE  store FILE in the module-level ``dataset_fn``
      -h, --help               print the usage text and exit with status 0

    Exactly one positional argument (the GenomeStudio file name) is
    required.  If the count differs, or an option is unknown or lacks its
    value, the usage text is printed and the process exits with status 1.

    Returns the list of positional arguments, which always has length one.
    """
    global dataset_fn

    s_opts = "d:h"
    # The trailing "=" makes --dataset consume a value, matching "d:" above.
    # Without it "--dataset FILE" stored "" and FILE became a positional.
    l_opts = ["dataset=", "help"]

    try:
        options, params = getopt.getopt(sys.argv[1:], s_opts, l_opts)
    except getopt.GetoptError as err:
        # Report bad options the same way as a wrong argument count instead
        # of letting the exception escape as a traceback.
        sys.stderr.write("illumina2ftsv: %s\n" % err)
        print_help()
        sys.exit(1)

    for opt, val in options:
        if opt in ("-d", "--dataset"):
            dataset_fn = val
        elif opt in ("-h", "--help"):
            print_help()
            sys.exit(0)

    if len(params) != 1:
        print_help()
        sys.exit(1)

    return params

def read_illumina_file(fn):
    """Convert an Illumina GenomeStudio text export into an ftsv dataset.

    The file is read in this order: the GenomeStudio 1.7.0 signature line,
    a header block that ends at the first blank line, one tab-separated row
    of column names, and then tab-separated data rows.  Each column named
    ``AVG_Signal-<sample>`` supplies the values for one sample and the
    ``ProbeID`` column supplies the probe identifiers.  Diagnostic output is
    printed to stdout and the resulting dataset is written to ``test.ftsv``.

    fn -- path of the GenomeStudio text file to read.

    Raises Exception when the signature line is not the expected one, and
    ValueError when there is no "ProbeID" column.
    """
    # The with-block closes the input file even when parsing fails.
    with open(fn) as fd:
        if fd.readline().strip() != "Illumina Inc. GenomeStudio version 1.7.0":
            raise Exception("File cannot be recognized as Illumina textual data")

        # Skip the header block; its contents are not used for the dataset.
        # It ends at the first blank line (readline() also returns "" at EOF).
        line = fd.readline()
        while line.strip() != "":
            line = fd.readline()

        # Strip every column name so the last one does not keep its newline,
        # which would break the "ProbeID" lookup and the last sample name.
        col_headers = [name.strip() for name in fd.readline().split('\t')]

        # Blank lines (e.g. a trailing newline at the end of the file) are
        # skipped; they would otherwise yield rows of the wrong length.
        values = [[cell.strip() for cell in row.split('\t')]
                  for row in fd if row.strip() != ""]

    probe_col = col_headers.index("ProbeID")
    print("probe id column: %d" % probe_col)

    # Collect the index and the sample name of every average-signal column.
    header_cols = []
    samples = []
    for i, colname in enumerate(col_headers):
        if colname.startswith("AVG_Signal-"):
            header_cols.append(i)
            samples.append(colname.split("-", 1)[1])

    print(header_cols)
    print(samples)

    # "a" holds every cell as text; "m" is the numeric signal sub-matrix
    # with one row per data line and one column per sample.
    a = numpy.array(values)
    m = numpy.array(a[:, header_cols], dtype='d')
    print(m)

    probe_ids = list(a[:, probe_col])

    print("samples:  %d" % len(samples))
    print("probe_ids:  %d" % len(probe_ids))
    print("shape:  %s" % (m.shape,))

    # Transposed so that the first axis runs over samples and the second
    # over probes, matching the order of the identifier lists below.
    ds = dataset.Dataset(m.transpose(), [('samples', samples), ('probe-ids', probe_ids)], name="Average Expr.")
    # NOTE(review): the output name is fixed; the value given with
    # -d/--dataset (module-level dataset_fn) is not consulted here -- confirm
    # whether it was meant to select this file.
    dataset.write_ftsv("test.ftsv", ds)

if __name__ == '__main__':
    # Script entry point: parse_options() returns the positional arguments
    # (it exits unless exactly one was given); the first is the input file.
    cli_args = parse_options()
    read_illumina_file(cli_args[0])
Begun writing illumina data to ftsv converter. 2010-09-24 17:18:52 +02:00			`#!/usr/bin/python`

			`import getopt`
Working version. 2010-09-27 21:45:56 +02:00			`import numpy`
Begun writing illumina data to ftsv converter. 2010-09-24 17:18:52 +02:00			`import sys`

			`from laydi import dataset`

Renamed directory illumina2ftsv to illumina to gather all Illumina-relevant scripts there. Added laydi-annot-illumina script that generates Laydi annotation files from Illumina text annotation files. 2011-01-04 12:43:24 +01:00			`VERSION = "0.1.0"`

			`dataset_fn = "-"`

			`def print_help():`
			`print "illumina2ftsv %s" % VERSION`
			`print`
			`print "Usage: illumina2ftsv [options] <illumina_genome_studio_file>"`
			`print`


Begun writing illumina data to ftsv converter. 2010-09-24 17:18:52 +02:00			`def parse_options():`
Renamed directory illumina2ftsv to illumina to gather all Illumina-relevant scripts there. Added laydi-annot-illumina script that generates Laydi annotation files from Illumina text annotation files. 2011-01-04 12:43:24 +01:00			`s_opts = "d:h"`
			`l_opts = ["dataset", "help"]`
Begun writing illumina data to ftsv converter. 2010-09-24 17:18:52 +02:00
			`options, params = getopt.getopt(sys.argv[1:], s_opts, l_opts)`

Renamed directory illumina2ftsv to illumina to gather all Illumina-relevant scripts there. Added laydi-annot-illumina script that generates Laydi annotation files from Illumina text annotation files. 2011-01-04 12:43:24 +01:00			`for opt, val in options:`
			`if opt in ["-d", "--dataset"]:`
			`global dataset_fn`
			`dataset_fn = val`
			`elif opt in ["-h", "--help"]:`
			`print_help()`
			`sys.exit(0)`

			`if len(params) != 1:`
			`print_help()`
			`sys.exit(1)`

Begun writing illumina data to ftsv converter. 2010-09-24 17:18:52 +02:00			`return params`

			`def read_illumina_file(fn):`
			`fd = open(fn)`
			`line = fd.readline()`
Working version. 2010-09-27 21:45:56 +02:00			`if line.strip() != "Illumina Inc. GenomeStudio version 1.7.0":`
Begun writing illumina data to ftsv converter. 2010-09-24 17:18:52 +02:00			`raise Exception("File cannot be recognized as Illumina textual data")`

			`headers = {}`
			`line= fd.readline()`
			`while line.strip() != "":`
			`key, val = line.split("=", 1)`
			`headers[key.strip()] = val.strip()`
			`line = fd.readline()`

			`col_headers = fd.readline().split('\t')`

			`values = []`
			`line = fd.readline()`
			`while line != "":`
Working version. 2010-09-27 21:45:56 +02:00			`values.append([x.strip() for x in line.split('\t')])`
			`line = fd.readline()`

			`probe_col = col_headers.index("ProbeID")`
			`print "probe id column:"`

			`header_cols = []`
			`samples = []`
			`for i, colname in enumerate(col_headers):`
			`if colname.startswith("AVG_Signal-"):`
			`header_cols.append(i)`
			`samples.append(colname.split("-", 1)[1])`
Begun writing illumina data to ftsv converter. 2010-09-24 17:18:52 +02:00
Working version. 2010-09-27 21:45:56 +02:00			`print header_cols`
			`print samples`
Begun writing illumina data to ftsv converter. 2010-09-24 17:18:52 +02:00
Working version. 2010-09-27 21:45:56 +02:00			`a = numpy.array(values)`
			`m = numpy.array(a[:,header_cols], dtype='d')`
			`print m`

			`probe_ids = list(a[:, probe_col])`

			`print "samples: ", len(samples)`
			`print "probe_ids: ", len(probe_ids)`
			`print "shape: ", m.shape`
Begun writing illumina data to ftsv converter. 2010-09-24 17:18:52 +02:00
Working version. 2010-09-27 21:45:56 +02:00			`ds = dataset.Dataset(m.transpose(), [('samples', samples), ('probe-ids', probe_ids)], name="Average Expr.")`
			`dataset.write_ftsv("test.ftsv", ds)`
Begun writing illumina data to ftsv converter. 2010-09-24 17:18:52 +02:00
			`if __name__ == '__main__':`
Working version. 2010-09-27 21:45:56 +02:00
			`fn = parse_options()[0]`
Begun writing illumina data to ftsv converter. 2010-09-24 17:18:52 +02:00			`read_illumina_file(fn)`