laydi/scripts/geneontology/entrez-go-mapping

#!/usr/bin/python

import optparse
import os
import sys

# Gene id -> list of probe ids recorded for that gene (filled by set_probes).
probes = {}
# Gene id -> list of GO term ids, one table per ontology tree. Each table is
# filled by set_go, and only when read_file sees the matching option set.
bp = {}  # biological process tree (options.bp)
cc = {}  # cellular component tree (options.cc)
mf = {}  # molecular function tree (options.mf)


def split_value(string):
    """Break an annotation field into its '///'-separated entries.

    Every entry is returned with surrounding whitespace removed.  A field
    consisting solely of the '---' placeholder yields an empty list.
    """
    entries = []
    for piece in string.strip().split('///'):
        entries.append(piece.strip())
    # A lone '---' is the "no data" marker rather than a real entry.
    if entries == ['---']:
        return []
    return entries

def split_subvalues(string):
    """Break a single entry into its '//'-separated components.

    Components are returned with surrounding whitespace removed.  An entry
    consisting solely of the '--' placeholder yields an empty list.
    """
    components = string.strip().split('//')
    components = [component.strip() for component in components]
    # Anything other than a lone '--' marker is real content.
    if components != ['--']:
        return components
    return []

def set_probes(probe, entrez):
    """Record *probe* under every gene id listed in *entrez*.

    probe  -- probe identifier; surrounding whitespace is removed.
    entrez -- raw '///'-separated gene id field, parsed with split_value().

    Updates the module-level ``probes`` table (gene id -> list of probe ids).
    """
    probe_id = probe.strip()
    for gene_id in split_value(entrez):
        # setdefault replaces the has_key() test, which Python 3 removed.
        probes.setdefault(gene_id, []).append(probe_id)

def set_go(d, entrez, terms):
    """Append the GO term ids found in *terms* to every gene in *entrez*.

    d      -- table to update (gene id -> list of term ids).
    entrez -- raw '///'-separated gene id field.
    terms  -- raw '///'-separated term field; each term is itself a
              '//'-separated record whose first component is the term id.

    Every gene listed in *entrez* gets an entry in *d*, even when *terms*
    holds no terms.
    """
    # The term ids do not depend on the gene, so extract them once.
    term_ids = []
    for term in split_value(terms):
        subvalues = split_subvalues(term)
        # A bare '--' placeholder splits to nothing; skip it instead of
        # failing on the [0] lookup.
        if subvalues:
            term_ids.append(subvalues[0])

    for gene in split_value(entrez):
        # setdefault replaces the has_key() test, which Python 3 removed.
        d.setdefault(gene, []).extend(term_ids)


def parse_options():
    """Parse the command line and return optparse's (options, args) pair.

    The boolean switches -b, -c, -m and -u are stored as options.bp,
    options.cc, options.mf and options.only_terms (all False by default).
    -d/--output-dataset takes a value.  Remaining positional arguments are
    returned in args.
    """
    parser = optparse.OptionParser()

    def add_flag(short_name, long_name, dest, help_text):
        # All on/off switches share the same action and default.
        parser.add_option(short_name, long_name, dest=dest, help=help_text,
                          action="store_true", default=False)

    add_flag('-b', '--biological-process', "bp",
             "Output annotations in the biological process tree.")
    add_flag('-c', '--cellular-component', "cc",
             "Output annotations in the cellular component tree.")
    parser.add_option('-d', '--output-dataset',
                      help="Export as ftsv (Fluents dataset) file.")
    add_flag('-m', '--molecular-function', "mf",
             "Output annotations in the molecular function tree.")
    add_flag('-u', '--unique-terms-only', "only_terms",
             "Output only a list of all unique GO terms annotated to the genes")

    parsed = parser.parse_args()
    return parsed

def read_file(options, filename='entrez-go-mapping.cccsv'):
    """Load the probe/gene/GO mapping file into the module-level tables.

    options  -- parsed options; the bp, cc and mf flags select which GO
                tables are filled.
    filename -- mapping file to read; defaults to the name the script has
                always used, resolved against the current directory.

    Each line holds ':::'-separated fields: probe id, gene ids, then the
    term fields for the bp, cc and mf tables, in that order.  Blank lines
    are skipped.  Raises IndexError if a line has too few fields for the
    requested tables.
    """
    fd = open(filename)
    # try/finally closes the file even when a malformed line raises.
    try:
        # Iterate the file directly rather than loading it all with readlines().
        for line in fd:
            if not line.strip():
                continue  # e.g. a trailing empty line; carries no record
            values = line.split(':::')

            entrez = values[1]
            set_probes(values[0], entrez)
            if options.bp:
                set_go(bp, entrez, values[2])
            if options.cc:
                set_go(cc, entrez, values[3])
            if options.mf:
                set_go(mf, entrez, values[4])
    finally:
        fd.close()

# Script entry point: print the GO terms annotated to each gene id given on
# the command line, restricted to the trees selected with -b/-c/-m.
if __name__ == '__main__':
    options, args = parse_options()
    read_file(options)

    # Tables to report, in output order.  Each tree is looked up in its own
    # table; the previous code tested membership in ``bp`` for all three, so
    # -c or -m without -b printed no terms.
    selected = []
    if options.bp:
        selected.append(bp)
    if options.cc:
        selected.append(cc)
    if options.mf:
        selected.append(mf)

    if options.only_terms:
        # -u: one line per distinct term across all requested genes.
        unique_terms = set()
        for gene in args:
            for table in selected:
                unique_terms.update(table.get(gene, []))

        for term in unique_terms:
            print("GO:%07d" % int(term))
        sys.exit(0)

    # Default: one line per gene, the gene id followed by its terms.
    for gene in args:
        fields = [gene]
        for table in selected:
            for term in table.get(gene, []):
                fields.append("GO:%07d" % int(term))
        # A single joined print gives the same space-separated line as the
        # old trailing-comma prints and is valid on Python 2 and 3.
        print(" ".join(fields))