#!/usr/bin/python
"""Print the GO terms annotated to Entrez gene ids given on the command line.

The annotations are read from a ``:::``-separated mapping file whose columns
are: probe id, Entrez gene ids, biological process terms, cellular component
terms and molecular function terms.  Multi-valued columns use ``///`` between
entries and ``//`` between the parts of one entry.
"""

import optparse
import os
import sys

# Default mapping file, resolved relative to the current working directory.
MAPPING_FILE = 'entrez-go-mapping.cccsv'

# Module-level lookup tables, all keyed by Entrez gene id.
probes = {}  # gene id -> list of probe ids
bp = {}      # gene id -> biological process term ids
cc = {}      # gene id -> cellular component term ids
mf = {}      # gene id -> molecular function term ids


def split_value(string):
    """Split a ``///``-delimited value from affymetrix csv files.

    Each entry is stripped of surrounding whitespace.  The placeholder
    ``---`` (meaning "no value") yields an empty list.
    """
    values = [x.strip() for x in string.strip().split('///')]
    if values == ['---']:
        return []
    return values


def split_subvalues(string):
    """Split one ``//``-delimited entry into its components.

    Each component is stripped of surrounding whitespace.  The placeholder
    ``--`` yields an empty list.
    """
    values = [x.strip() for x in string.strip().split('//')]
    if values == ['--']:
        return []
    return values


def set_probes(probe, entrez):
    """Record *probe* under every gene id listed in *entrez*."""
    probe = probe.strip()
    for gene_id in split_value(entrez):
        probes.setdefault(gene_id, []).append(probe)


def set_go(d, entrez, terms):
    """Append the leading component of every term to each gene's list in *d*.

    *entrez* and *terms* are ``///``-delimited strings.  Every gene gets an
    entry in *d* even when *terms* is empty, so a later lookup can tell a
    gene without annotations apart from a gene that was never seen.
    """
    term_ids = []
    for term in split_value(terms):
        parts = split_subvalues(term)
        # A '--' placeholder has no components; skip it instead of raising
        # IndexError on parts[0].
        if parts:
            term_ids.append(parts[0])
    for gene in split_value(entrez):
        d.setdefault(gene, []).extend(term_ids)


def parse_options(argv=None):
    """Parse the command line and return ``(options, args)``.

    *argv* is the list of arguments to parse; ``None`` (the default) makes
    optparse read ``sys.argv[1:]``.  The positional ``args`` are the gene ids
    to look up.
    """
    op = optparse.OptionParser()
    op.add_option('-b', '--biological-process', dest="bp",
                  help="Output annotations in the biological process tree.",
                  action="store_true", default=False)
    op.add_option('-c', '--cellular-component', dest="cc",
                  help="Output annotations in the cellular component tree.",
                  action="store_true", default=False)
    # NOTE(review): this option is parsed but not read anywhere in the
    # visible script.
    op.add_option('-d', '--output-dataset',
                  help="Export as ftsv (Fluents dataset) file.")
    op.add_option('-m', '--molecular-function', dest="mf",
                  help="Output annotations in the molecular function tree.",
                  action="store_true", default=False)
    op.add_option('-u', '--unique-terms-only', dest="only_terms",
                  help="Output only a list of all unique GO terms annotated to the genes",
                  action="store_true", default=False)
    return op.parse_args(argv)


def read_file(options, path=MAPPING_FILE):
    """Load the mapping file at *path* into the module-level tables.

    ``probes`` is always filled; ``bp``, ``cc`` and ``mf`` are filled only
    when the corresponding flag on *options* is set.  Blank lines are
    skipped.
    """
    # 'with' guarantees the file is closed even if a line fails to parse.
    with open(path) as fd:
        for line in fd:
            if not line.strip():
                continue
            values = line.split(':::')
            entrez = values[1]
            set_probes(values[0], entrez)
            if options.bp:
                set_go(bp, entrez, values[2])
            if options.cc:
                set_go(cc, entrez, values[3])
            if options.mf:
                set_go(mf, entrez, values[4])


def format_term(term):
    """Return *term* as a zero-padded GO accession such as ``GO:0006810``."""
    return "GO:%07d" % int(term)


def collect_terms(gene, options):
    """Return the terms of *gene* from every selected tree (bp, cc, mf order).

    Each tree is consulted through its own dictionary, so selecting only the
    cellular component or molecular function tree works without also
    selecting the biological process tree.
    """
    terms = []
    for enabled, tree in ((options.bp, bp), (options.cc, cc), (options.mf, mf)):
        if enabled:
            terms.extend(tree.get(gene, ()))
    return terms


def main(argv=None):
    """Run the script and return the process exit status."""
    options, genes = parse_options(argv)
    read_file(options)

    if options.only_terms:
        unique = set()
        for gene in genes:
            unique.update(collect_terms(gene, options))
        for term in unique:
            print(format_term(term))
        return 0

    for gene in genes:
        fields = [gene] + [format_term(t) for t in collect_terms(gene, options)]
        # One space-separated line per gene: the id followed by its terms.
        print(" ".join(fields))
    return 0


if __name__ == '__main__':
    sys.exit(main())