Script to generate mapping files between identifiers found in tab-separated files.

This commit is contained in:
Einar Ryeng 2011-03-24 08:46:17 +00:00
parent f1b3009f11
commit 6be624e872

View File

@ -0,0 +1,93 @@
#!/usr/bin/python
import getopt
import os, os.path
import sys
# List of column-name strings. Nothing in the visible part of this script
# references it.
# NOTE(review): these look like Illumina annotation column headers that were
# meant to be selectable as output columns -- confirm before relying on or
# removing it.
OUTPUT_COLS = ["Array_Address_Id", "Entrez_Gene_ID", "Accession", "ILMN_Gene", "Definition", ]
def print_help():
    """Write the usage/help text for the script to standard output.

    The text is emitted with ``sys.stdout.write`` instead of the ``print``
    statement so that the function is valid on both Python 2 and Python 3;
    the bytes written are the same as before.
    """
    help_lines = (
        "laydi-mapping-illumina",
        "",
        "Usage: laydi-mapping-illumina <illumina-annotation-file.txt> <from_dim> <to_dim>",
        "",
        "Description:",
        " Produce mapping files from Illumina text annotation files",
        " Illumina files can be downloaded from:",
        " http://www.switchtoi.com/annotationfiles.ilmn",
        "",
        " NOTE: <from_dim> and <to_dim> are the column names in the illumina text file,",
        " not laydi dimensions.",
        "",
    )
    # Every entry is one output line; the trailing "" yields the final
    # blank line that the original bare ``print`` produced.
    sys.stdout.write("\n".join(help_lines) + "\n")
def parse_cmdline():
    """Parse ``sys.argv`` and return the three positional arguments.

    Returns:
        A list ``[annotation_file, from_dim, to_dim]``.

    Exits:
        * status 0 after printing the help text when ``-h``/``--help`` is given;
        * status 1 after printing the help text when the number of positional
          arguments is not exactly three;
        * status 1 with a message on standard error when an unrecognised
          option is given (previously an uncaught ``getopt.GetoptError``
          traceback).
    """
    short_opts = "h"
    long_opts = ["help"]

    try:
        options, params = getopt.getopt(sys.argv[1:], short_opts, long_opts)
    except getopt.GetoptError as err:
        # Report bad options like any other usage error instead of letting
        # the exception escape as a traceback.
        sys.stderr.write("laydi-mapping-illumina: %s\n" % err)
        sys.stderr.write("Try 'laydi-mapping-illumina --help' for usage.\n")
        sys.exit(1)

    for key, _val in options:
        if key in ("-h", "--help"):
            print_help()
            sys.exit(0)

    if len(params) != 3:
        print_help()
        sys.exit(1)

    return params
def build_map(fn, from_dim, to_dim):
    """Build a one-to-many mapping between two columns of an annotation file.

    The file is scanned for a line consisting of ``[Probes]``.  The line
    after it is taken as the tab-separated column header, and every following
    line up to the next line starting with ``[`` (or end of file) is a data
    row.

    Args:
        fn: Path of the tab-separated annotation file.
        from_dim: Header name of the column whose values become the keys.
        to_dim: Header name of the column whose values are collected.

    Returns:
        A dict mapping each ``from_dim`` value to the list of ``to_dim``
        values seen for it, in file order; or ``None`` if the file has no
        ``[Probes]`` line.

    Raises:
        ValueError: if ``from_dim`` or ``to_dim`` is not a header column.
        IndexError: if a non-blank data row has fewer fields than needed.
    """
    retval = {}
    # The context manager closes the file on every exit path.
    with open(fn) as fd:
        # A single iterator is used for all reads: mixing ``for line in fd``
        # with ``fd.readline()`` is an error on Python 2 file objects.
        lines = iter(fd)

        for line in lines:
            if line.strip() == "[Probes]":
                break
        else:
            # End of file reached without finding the section marker.
            return None

        cols = [x.strip() for x in next(lines, "").split("\t")]
        for dim in (from_dim, to_dim):
            if dim not in cols:
                raise ValueError(
                    "column %r not found in [Probes] header of %s" % (dim, fn))
        from_col = cols.index(from_dim)
        to_col = cols.index(to_dim)

        for line in lines:
            stripped = line.strip()
            if stripped.startswith("["):
                # Start of the next section ends the probe table.
                break
            if not stripped:
                # Blank lines carry no data; indexing them would fail.
                continue
            # Drop only the line terminator so that a value taken from the
            # last column does not end in a newline.
            fields = line.rstrip("\r\n").split("\t")
            retval.setdefault(fields[from_col], []).append(fields[to_col])
    return retval
def write_map(fd, d, from_dim, to_dim):
    """Write a mapping in the text format produced by this script.

    The output is three ``#`` header lines (``from``, ``to`` and an empty
    ``description``), one blank line, then one line per key holding the key
    followed by its values, separated by single spaces.

    Args:
        fd: An object with a ``write`` method, or a ``str`` path.  A path is
            opened for writing here and closed again before returning, even
            if writing fails; a passed-in file object is left open.
        d: Dict mapping each key to an iterable of values.
        from_dim: Name written on the ``# from:`` header line.
        to_dim: Name written on the ``# to:`` header line.
    """
    opened_here = False
    if isinstance(fd, str):
        fd = open(fd, "w")
        opened_here = True

    try:
        fd.write("# from: %s\n" % from_dim)
        fd.write("# to: %s\n" % to_dim)
        fd.write("# description: \n")
        fd.write("\n")
        for k, v in d.items():
            row = [str(k)]
            row.extend(str(e) for e in v)
            fd.write(" ".join(row) + "\n")
    finally:
        # Only close what this function opened.
        if opened_here:
            fd.close()
if __name__ == '__main__':
    # Script entry point: read the three positional arguments, build the
    # mapping from the annotation file and write it to standard output.
    fn, from_dim, to_dim = parse_cmdline()

    try:
        m = build_map(fn, from_dim, to_dim)
    except (IOError, ValueError) as err:
        # IOError: the file could not be opened or read.
        # ValueError: a requested column name is not in the header line.
        sys.stderr.write("laydi-mapping-illumina: %s\n" % err)
        sys.exit(1)

    if m is None:
        # build_map returns None when the file has no "[Probes]" line;
        # passing that on to write_map would fail on None.items().
        sys.stderr.write(
            "laydi-mapping-illumina: no [Probes] section found in %s\n" % fn)
        sys.exit(1)

    write_map(sys.stdout, m, from_dim, to_dim)