Added entrez-go-mapping that maps entrez IDs to GO terms based on a file of the form:

affy_id ::: geneid_1 /// geneid_2 ::: go(bp) ::: go(cc) ::: go(mf)
This commit is contained in:
Einar Ryeng 2007-03-15 23:47:24 +00:00
parent 6fe4ff3c59
commit 3dc1867be4

View File

@ -0,0 +1,112 @@
#!/usr/bin/python
import optparse
import os
import sys
# Module-level lookup tables, filled in while the mapping file is read.
# probes: gene id -> list of probe ids annotated to that gene.
probes = dict()
# bp / cc / mf: gene id -> list of GO term ids, one table per ontology
# (biological process, cellular component, molecular function).
bp = dict()
cc = dict()
mf = dict()
def split_value(string):
    """Break a '///'-separated annotation field into its entries.

    Surrounding whitespace is removed from the field and from every entry.
    A field consisting only of the '---' placeholder yields an empty list.
    """
    entries = [piece.strip() for piece in string.strip().split('///')]
    # A lone '---' marks a field without any annotation.
    if entries == ['---']:
        return []
    return entries
def split_subvalues(string):
    """Break a single '//'-separated entry into its components.

    Surrounding whitespace is removed from the entry and from every
    component.  An entry consisting only of the '--' placeholder yields an
    empty list.
    """
    components = [piece.strip() for piece in string.strip().split('//')]
    # A lone '--' marks an entry without any components.
    if components == ['--']:
        return []
    return components
def set_probes(probe, entrez):
    """Register a probe under every gene id listed in an Entrez field.

    probe  -- probe identifier; surrounding whitespace is stripped.
    entrez -- '///'-separated gene id field, parsed with split_value().

    The stripped probe id is appended to the module-level ``probes`` table
    for each gene id; nothing is recorded when the field has no gene ids.
    """
    probe_id = probe.strip()
    for gene_id in split_value(entrez):
        # setdefault replaces the has_key() test, which no longer exists
        # in Python 3.
        probes.setdefault(gene_id, []).append(probe_id)
def set_go(d, entrez, terms):
    """Append the GO term ids of one annotation field to each listed gene.

    d      -- table (gene id -> list of term ids) to update in place.
    entrez -- '///'-separated gene id field, parsed with split_value().
    terms  -- '///'-separated GO annotation field; each entry is split with
              split_subvalues() and only its first component is stored.

    Every gene id gets an entry in ``d`` even when the field carries no
    terms.
    """
    # Parse the term field once; it is identical for every gene on the line.
    term_ids = []
    for term in split_value(terms):
        components = split_subvalues(term)
        # A '--' placeholder entry has no components; skip it rather than
        # failing on the [0] lookup.
        if components:
            term_ids.append(components[0])

    for gene in split_value(entrez):
        # setdefault replaces the has_key() test, which no longer exists
        # in Python 3.
        d.setdefault(gene, []).extend(term_ids)
def parse_options():
    """Parse the command line and return optparse's (options, args) pair.

    Three switches select the GO trees to report (options.bp, options.cc,
    options.mf), one switch requests a list of unique terms only
    (options.only_terms), and -d/--output-dataset takes a value.
    """
    parser = optparse.OptionParser()
    # Settings shared by every on/off switch.
    switch = dict(action="store_true", default=False)
    # Registered in this order so the generated help text lists them the
    # same way.
    option_table = (
        (('-b', '--biological-process'),
         dict(switch, dest="bp",
              help="Output annotations in the biological process tree.")),
        (('-c', '--cellular-component'),
         dict(switch, dest="cc",
              help="Output annotations in the cellular component tree.")),
        (('-d', '--output-dataset'),
         dict(help="Export as ftsv (Fluents dataset) file.")),
        (('-m', '--molecular-function'),
         dict(switch, dest="mf",
              help="Output annotations in the molecular function tree.")),
        (('-u', '--unique-terms-only'),
         dict(switch, dest="only_terms",
              help="Output only a list of all unique GO terms annotated to the genes")),
    )
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)
    return parser.parse_args()
def read_file(options):
    """Load 'entrez-go-mapping.cccsv' into the module-level tables.

    Each line is split on ':::' and its fields are used by position:
    probe id, gene ids, then the biological process, cellular component and
    molecular function annotations.  The probe table is always filled; a GO
    table is filled only when the matching flag (options.bp, options.cc,
    options.mf) is set.

    Blank lines are ignored.  The file is opened relative to the current
    working directory.
    """
    fd = open('entrez-go-mapping.cccsv')
    # try/finally guarantees the handle is released even if a line fails to
    # parse.
    try:
        for line in fd:
            # A blank line (for instance at the end of the file) has no
            # fields to index.
            if not line.strip():
                continue
            values = line.split(':::')
            probeid = values[0]
            set_probes(probeid, values[1])
            if options.bp:
                set_go(bp, values[1], values[2])
            if options.cc:
                set_go(cc, values[1], values[3])
            if options.mf:
                set_go(mf, values[1], values[4])
    finally:
        fd.close()
if __name__ == '__main__':
    # Command-line entry point: load the mapping file, then report the GO
    # terms of every gene id given as a positional argument.
    options, args = parse_options()
    read_file(options)

    # Tables of the requested ontologies, in output order (bp, cc, mf).
    # Each ontology is looked up in its own table, so -c or -m also work
    # when -b was not given.
    selected = []
    if options.bp:
        selected.append(bp)
    if options.cc:
        selected.append(cc)
    if options.mf:
        selected.append(mf)

    if options.only_terms:
        # Print every distinct term annotated to any of the genes, one per
        # line.
        unique_terms = set()
        for gene in args:
            for tree in selected:
                unique_terms.update(tree.get(gene, []))
        # Sorted numerically so the output order is reproducible.
        for term in sorted(unique_terms, key=int):
            print("GO:%07d" % int(term))
        sys.exit(0)

    # One line per gene: the gene id followed by its terms, separated by
    # single spaces.
    for gene in args:
        fields = [gene]
        for tree in selected:
            for term in tree.get(gene, []):
                fields.append("GO:%07d" % int(term))
        # A single parenthesised argument prints the same under Python 2
        # and Python 3.
        print(" ".join(fields))