Added entrez-go-mapping that maps entrez IDs to GO terms based on a file of the form:
affy_id ::: geneid_1 /// geneid_2 ::: go(bp) ::: go(cc) ::: go(mf)
This commit is contained in:
parent
6fe4ff3c59
commit
3dc1867be4
112
scripts/geneontology/entrez-go-mapping
Executable file
112
scripts/geneontology/entrez-go-mapping
Executable file
@ -0,0 +1,112 @@
|
||||
#!/usr/bin/python
|
||||
|
||||
import optparse
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Module-level lookup tables, all keyed by Entrez gene id.

# gene id -> list of probe ids; filled in by set_probes().
probes = dict()

# gene id -> list of GO term ids, one table per GO tree; filled in by
# set_go() for the trees that were requested on the command line.
bp = dict()  # biological process
cc = dict()  # cellular component
mf = dict()  # molecular function
|
||||
|
||||
|
||||
def split_value(string):
    """Split an annotation field on its '///' separators.

    Surrounding whitespace is removed from the field and from every piece.
    A field that consists solely of the placeholder '---' yields an empty
    list; any other input yields the list of stripped pieces.
    """
    pieces = []
    for raw_piece in string.strip().split('///'):
        pieces.append(raw_piece.strip())

    # A lone '---' is the "no value" marker, not a real entry.
    if pieces == ['---']:
        return []
    return pieces
|
||||
|
||||
def split_subvalues(string):
    """Split a single value on its '//' separators.

    Surrounding whitespace is removed from the value and from every
    component. A value that consists solely of the placeholder '--' yields
    an empty list; any other input yields the list of stripped components.
    """
    components = string.strip().split('//')
    components = [component.strip() for component in components]

    # A lone '--' is the "no sub-value" marker.
    is_placeholder = (components == ['--'])
    if is_placeholder:
        return []
    return components
|
||||
|
||||
def set_probes(probe, entrez):
    """Record *probe* under every gene id listed in *entrez*.

    probe  -- probe id; surrounding whitespace is stripped before storing.
    entrez -- '///'-separated gene ids, parsed with split_value(); a lone
              '---' placeholder means no genes and records nothing.

    Appends to the module-level ``probes`` dict (gene id -> list of probe
    ids). Returns None.
    """
    probe_id = probe.strip()
    for gene_id in split_value(entrez):
        # setdefault replaces the deprecated dict.has_key() check-then-insert.
        probes.setdefault(gene_id, []).append(probe_id)
|
||||
|
||||
def set_go(d, entrez, terms):
    """Append the GO term ids in *terms* to every gene listed in *entrez*.

    d      -- dict mapping gene id -> list of term ids; updated in place.
    entrez -- '///'-separated gene ids, parsed with split_value().
    terms  -- '///'-separated GO terms; each term is itself split with
              split_subvalues() and only its first component is stored.

    Every gene named in *entrez* gets an entry in *d*, even when *terms*
    contributes nothing. Returns None.
    """
    # The term list does not depend on the gene, so parse it only once.
    term_ids = []
    for term in split_value(terms):
        components = split_subvalues(term)
        # split_subvalues() returns [] for a bare '--' placeholder; indexing
        # [0] on that would raise IndexError, so such terms are skipped.
        if components:
            term_ids.append(components[0])

    for gene in split_value(entrez):
        # setdefault replaces the deprecated dict.has_key() check-then-insert.
        d.setdefault(gene, []).extend(term_ids)
|
||||
|
||||
|
||||
def parse_options():
    """Parse the command line with optparse.

    Returns the (options, args) pair produced by OptionParser.parse_args().
    The boolean switches are stored as options.bp, options.cc, options.mf
    and options.only_terms, all defaulting to False. The -d/--output-dataset
    option takes a value and is stored under optparse's derived destination
    name, options.output_dataset.
    """
    parser = optparse.OptionParser()

    def add_flag(short_name, long_name, dest, help_text):
        # Every boolean switch shares the same action and default.
        parser.add_option(short_name, long_name, dest=dest, help=help_text,
                          action="store_true", default=False)

    # Registration order is kept because it fixes the order of --help output.
    add_flag('-b', '--biological-process', "bp",
             "Output annotations in the biological process tree.")
    add_flag('-c', '--cellular-component', "cc",
             "Output annotations in the cellular component tree.")
    parser.add_option('-d', '--output-dataset',
                      help="Export as ftsv (Fluents dataset) file.")
    add_flag('-m', '--molecular-function', "mf",
             "Output annotations in the molecular function tree.")
    add_flag('-u', '--unique-terms-only', "only_terms",
             "Output only a list of all unique GO terms annotated to the genes")

    return parser.parse_args()
|
||||
|
||||
def read_file(options, filename='entrez-go-mapping.cccsv'):
    """Load the probe/gene/GO mapping file into the module-level tables.

    options  -- parsed command-line options; options.bp, options.cc and
                options.mf select which GO trees are loaded.
    filename -- path of the mapping file; defaults to the name the script
                has always read from the current directory.

    Each non-blank line holds ':::'-separated fields in the order
        probe id ::: gene ids ::: GO(bp) ::: GO(cc) ::: GO(mf)
    The probe table is always filled; a GO table is filled only when its
    option is set. Blank lines are ignored. A line with too few fields
    raises IndexError, as before.
    """
    fd = open(filename)
    # try/finally guarantees the handle is released even if a line fails to
    # parse; previously an exception skipped the close() call.
    try:
        # Iterate the file lazily instead of loading it all via readlines().
        for line in fd:
            if not line.strip():
                # A blank (e.g. trailing) line has no fields to index.
                continue

            values = line.split(':::')
            entrez = values[1]

            set_probes(values[0], entrez)
            if options.bp:
                set_go(bp, entrez, values[2])
            if options.cc:
                set_go(cc, entrez, values[3])
            if options.mf:
                set_go(mf, entrez, values[4])
    finally:
        fd.close()
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: the positional arguments are Entrez gene ids; the
    # GO terms annotated to them are printed for the trees selected with
    # -b / -c / -m.
    options, args = parse_options()
    read_file(options)

    if options.only_terms:
        # -u: print each distinct GO term once, one per line, then exit.
        unique_terms = set()
        for gene in args:
            # Each tree is checked against its OWN table. The mf and cc
            # branches used to test membership in bp, which dropped their
            # output when -b was not given and raised KeyError for genes
            # present in bp but absent from the other tree.
            if options.bp and gene in bp:
                unique_terms.update(bp[gene])
            if options.mf and gene in mf:
                unique_terms.update(mf[gene])
            if options.cc and gene in cc:
                unique_terms.update(cc[gene])

        for term in unique_terms:
            sys.stdout.write("GO:%07d\n" % int(term))
        sys.exit(0)

    # Default mode: one line per gene, the gene id followed by its GO terms
    # in bp, cc, mf order, separated by single spaces.
    selected_trees = (
        (options.bp, bp),
        (options.cc, cc),
        (options.mf, mf),
    )
    for gene in args:
        fields = [gene]
        for enabled, tree in selected_trees:
            # Membership is tested in the same table that is then read.
            if enabled and gene in tree:
                fields.extend(["GO:%07d" % int(x) for x in tree[gene]])
        sys.stdout.write(" ".join(fields) + "\n")
|
||||
|
Reference in New Issue
Block a user