laydi/scripts/geneontology/entrez-go-mapping

#!/usr/bin/python 

import optparse
import os
import sys

# Module-level lookup tables, filled in by read_file() through set_probes()
# and set_go(). All four are keyed by the gene ids that split_value()
# extracts from the second ':::' field of each input line.

# gene id -> list of probe ids (first field of every line naming that gene)
probes = {}
# gene id -> GO ids taken from the third field; filled only when -b is given
bp = {}
# gene id -> GO ids taken from the fourth field; filled only when -c is given
cc = {}
# gene id -> GO ids taken from the fifth field; filled only when -m is given
mf = {}


def split_value(string):
    """Break an annotation field into its '///'-separated entries.

    Surrounding whitespace is removed from the field and from every entry.
    A field that consists solely of the placeholder '---' yields an empty
    list; any other input (including the empty string) yields one entry per
    '///'-delimited piece.
    """
    entries = []
    for piece in string.strip().split('///'):
        entries.append(piece.strip())
    # A lone '---' marks "no data" for the whole field.
    if entries == ['---']:
        return []
    return entries

def split_subvalues(string):
    """Break a single entry into its '//'-separated components.

    Surrounding whitespace is removed from the entry and from every
    component. An entry that consists solely of '--' yields an empty list;
    any other input yields one component per '//'-delimited piece.
    """
    components = []
    for part in string.strip().split('//'):
        components.append(part.strip())
    # A lone '--' is treated as "no components".
    if components == ['--']:
        return []
    return components

def set_probes(probe, entrez):
    """Record *probe* under every gene id listed in *entrez*.

    probe  -- probe id string; surrounding whitespace is stripped.
    entrez -- '///'-separated gene id field, parsed with split_value().

    Appends the stripped probe id to the module-level ``probes`` table,
    creating the per-gene list on first use. Returns None.
    """
    probe_id = probe.strip()
    for gene_id in split_value(entrez):
        # setdefault replaces the deprecated dict.has_key() test-then-insert.
        probes.setdefault(gene_id, []).append(probe_id)

def set_go(d, entrez, terms):
    """Append the GO ids found in *terms* to every gene listed in *entrez*.

    d      -- dictionary mapping gene id -> list of GO ids; updated in place.
    entrez -- '///'-separated gene id field, parsed with split_value().
    terms  -- '///'-separated GO annotation field; each annotation is split
              with split_subvalues() and only its first component (the GO
              id) is kept.

    Every gene in *entrez* gets an entry in *d*, even when *terms* yields no
    ids. Returns None.
    """
    go_ids = []
    for term in split_value(terms):
        parts = split_subvalues(term)
        # split_subvalues() returns [] for a bare '--' and [''] for an empty
        # annotation; indexing the former raised IndexError and storing the
        # latter produced an id that cannot be converted with int() later.
        if parts and parts[0]:
            go_ids.append(parts[0])

    for gene in split_value(entrez):
        d.setdefault(gene, []).extend(go_ids)


def parse_options():
    """Parse the command line and return optparse's ``(options, args)`` pair.

    The switches -b, -c, -m and -u are booleans that default to False and
    are stored as ``bp``, ``cc``, ``mf`` and ``only_terms``. -d takes a
    value. Positional arguments are returned untouched in ``args``.
    """
    parser = optparse.OptionParser()
    # Keyword arguments shared by every on/off switch.
    switch = {'action': "store_true", 'default': False}

    parser.add_option(
        '-b', '--biological-process', dest="bp",
        help="Output annotations in the biological process tree.",
        **switch)
    parser.add_option(
        '-c', '--cellular-component', dest="cc",
        help="Output annotations in the cellular component tree.",
        **switch)
    parser.add_option(
        '-d', '--output-dataset',
        help="Export as ftsv (Fluents dataset) file.")
    parser.add_option(
        '-m', '--molecular-function', dest="mf",
        help="Output annotations in the molecular function tree.",
        **switch)
    parser.add_option(
        '-u', '--unique-terms-only', dest="only_terms",
        help="Output only a list of all unique GO terms annotated to the genes",
        **switch)

    return parser.parse_args()

def read_file(options):
    """Load 'entrez-go-mapping.cccsv' into the module-level lookup tables.

    The file is opened relative to the current working directory. Each
    non-blank line is split on ':::' into: probe id, gene ids, and three GO
    annotation fields that are stored in ``bp``, ``cc`` and ``mf``
    respectively.

    options -- parsed options object; ``options.bp``, ``options.cc`` and
               ``options.mf`` select which GO fields are stored. Probe ids
               are always recorded.

    Raises IOError if the file cannot be opened, and IndexError if a
    non-blank line has fewer fields than the selected options require.
    """
    fd = open('entrez-go-mapping.cccsv')
    try:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in fd:
            # A blank (e.g. trailing) line has no second field and used to
            # abort the whole run with IndexError.
            if not line.strip():
                continue

            values = line.split(':::')
            probeid = values[0]
            entrez = values[1]

            set_probes(probeid, entrez)
            if options.bp:
                set_go(bp, entrez, values[2])
            if options.cc:
                set_go(cc, entrez, values[3])
            if options.mf:
                set_go(mf, entrez, values[4])
    finally:
        # Close the handle even when a malformed line raises.
        fd.close()
        
def _selected_terms(gene, options):
    """Return the GO ids stored for *gene* in every tree enabled in *options*.

    Ids are returned in the order biological process, cellular component,
    molecular function. A tree contributes nothing when its option is off or
    when it has no entry for *gene*.
    """
    found = []
    for enabled, table in ((options.bp, bp), (options.cc, cc), (options.mf, mf)):
        # Each tree must be looked up in its own table. The previous code
        # tested `bp` for all three, so -c/-m alone printed nothing and a
        # gene missing from cc/mf could raise KeyError.
        if enabled and gene in table:
            found.extend(table[gene])
    return found


def _format_term(term):
    """Format a GO id as 'GO:' followed by the number zero-padded to 7 digits."""
    return "GO:%07d" % int(term)


if __name__ == '__main__':
    options, args = parse_options()
    read_file(options)

    if options.only_terms:
        # -u: print each distinct GO id once, one per line, for all genes
        # given on the command line.
        unique_terms = set()
        for gene in args:
            unique_terms.update(_selected_terms(gene, options))

        for term in unique_terms:
            sys.stdout.write(_format_term(term) + "\n")
        sys.exit(0)

    # Default: one line per gene, the gene id followed by its GO ids,
    # separated by single spaces.
    for gene in args:
        fields = [gene]
        fields.extend(_format_term(x) for x in _selected_terms(gene, options))
        sys.stdout.write(" ".join(fields) + "\n")
Added entrez-go-mapping that maps entrez IDs to GO terms based on a file of the form: affy_id ::: geneid_1 /// geneid_2 ::: go(bp) ::: go(cc) ::: go(mf) 2007-03-16 00:47:24 +01:00			`#!/usr/bin/python`

			`import optparse`
			`import os`
			`import sys`

			`probes = {}`
			`bp = {}`
			`cc = {}`
			`mf = {}`


			`def split_value(string):`
			`"""Splits a tab delimited value from affymetrix csv files"""`
			`string = string.strip()`
			`values = [x.strip() for x in string.split('///')]`
			`if len(values) == 1 and values[0] == '---':`
			`return []`
			`return values`

			`def split_subvalues(string):`
			`"""Splits a value into smaller components"""`
			`string = string.strip()`
			`values = [x.strip() for x in string.split('//')]`
			`if len(values) == 1 and values[0] == '--':`
			`return []`
			`return values`

			`def set_probes(probe, entrez):`
			`"""Set probe values for each entrez value."""`
			`for gene_id in split_value(entrez):`
			`if not probes.has_key(gene_id):`
			`probes[gene_id] = []`
			`probes[gene_id].append(probe.strip())`

			`def set_go(d, entrez, terms):`
			`genes = split_value(entrez)`
			`terms = split_value(terms)`
			`for gene in genes:`
			`if not d.has_key(gene):`
			`d[gene] = []`
			`for term in terms:`
			`d[gene].append(split_subvalues(term)[0])`


			`def parse_options():`
			`op = optparse.OptionParser()`
			`op.add_option('-b', '--biological-process', dest="bp",`
			`help="Output annotations in the biological process tree.",`
			`action="store_true", default=False)`
			`op.add_option('-c', '--cellular-component', dest="cc",`
			`help="Output annotations in the cellular component tree.",`
			`action="store_true", default=False)`
			`op.add_option('-d', '--output-dataset',`
			`help="Export as ftsv (Fluents dataset) file.")`
			`op.add_option('-m', '--molecular-function', dest="mf",`
			`help="Output annotations in the molecular function tree.",`
			`action="store_true", default=False)`
			`op.add_option('-u', '--unique-terms-only', dest="only_terms",`
			`help="Output only a list of all unique GO terms annotated to the genes",`
			`action="store_true", default=False)`
			`return op.parse_args()`

			`def read_file(options):`
			`fd = open('entrez-go-mapping.cccsv')`
			`for line in fd.readlines():`
			`values = line.split(':::')`

			`probeid = values[0]`
			`set_probes(probeid, values[1])`
			`if options.bp:`
			`set_go(bp, values[1], values[2])`
			`if options.cc:`
			`set_go(cc, values[1], values[3])`
			`if options.mf:`
			`set_go(mf, values[1], values[4])`
			`fd.close()`

			`if __name__ == '__main__':`
			`options, args = parse_options()`
			`read_file(options)`

			`if options.only_terms:`
			`s = set()`
			`for gene in args:`
			`if options.bp and bp.has_key(gene):`
			`for x in bp[gene]:`
			`s.add(x)`
			`if options.mf and bp.has_key(gene):`
			`for x in mf[gene]:`
			`s.add(x)`
			`if options.cc and bp.has_key(gene):`
			`for x in cc[gene]:`
			`s.add(x)`

			`for term in s:`
			`print "GO:%07d" % int(term)`
			`sys.exit(0)`

			`for gene in args:`
			`print gene,`
			`if options.bp and bp.has_key(gene):`
			`for x in bp[gene]:`
			`print "GO:%07d" % int(x),`
			`if options.cc and bp.has_key(gene):`
			`for x in cc[gene]:`
			`print "GO:%07d" % int(x),`
			`if options.mf and bp.has_key(gene):`
			`for x in mf[gene]:`
			`print "GO:%07d" % int(x),`
			`print`