113 lines
3.4 KiB
Plaintext
113 lines
3.4 KiB
Plaintext
|
#!/usr/bin/python
|
||
|
|
||
|
import optparse
|
||
|
import os
|
||
|
import sys
|
||
|
|
||
|
# Entrez gene id -> list of probe ids, filled in by set_probes().
probes = {}
# Entrez gene id -> list of GO term ids, one table per GO tree. read_file()
# fills each table through set_go() when the matching option is set:
# bp = biological process, cc = cellular component, mf = molecular function.
bp = {}
cc = {}
mf = {}
|
||
|
|
||
|
|
||
|
def split_value(string):
    """Split an affymetrix annotation field on its '///' separators.

    Surrounding whitespace is removed from the field as a whole and from
    every piece. A field that consists only of the '---' placeholder
    yields an empty list.
    """
    pieces = []
    for raw in string.strip().split('///'):
        pieces.append(raw.strip())
    # A lone '---' is the "no value" marker, not a real entry.
    if pieces == ['---']:
        return []
    return pieces
|
||
|
|
||
|
def split_subvalues(string):
    """Break a single value into its '//'-separated components.

    Each component is stripped of surrounding whitespace. A value that is
    nothing but the '--' placeholder yields an empty list.
    """
    components = [part.strip() for part in string.strip().split('//')]
    # A lone '--' marks an empty value; everything else is returned as is.
    return [] if components == ['--'] else components
|
||
|
|
||
|
def set_probes(probe, entrez):
    """Record a probe under every Entrez gene id it is annotated with.

    probe  -- probe id; surrounding whitespace is stripped before storing.
    entrez -- raw Entrez field, parsed with split_value(), so it may hold
              several gene ids or the '---' placeholder (no ids).

    Appends to the module-level ``probes`` table (gene id -> probe list).
    """
    probe_id = probe.strip()
    for gene_id in split_value(entrez):
        # setdefault replaces the has_key() check, which no longer exists
        # in Python 3.
        probes.setdefault(gene_id, []).append(probe_id)
|
||
|
|
||
|
def set_go(d, entrez, terms):
    """Add the GO term ids in *terms* to every gene listed in *entrez*.

    d      -- mapping of gene id -> list of term ids; updated in place.
    entrez -- raw Entrez field, parsed with split_value().
    terms  -- raw GO annotation field, parsed with split_value(); only the
              first component of each term (per split_subvalues()) is kept.

    Every gene in *entrez* gets an entry in *d*, even when *terms* holds
    no annotations.
    """
    term_ids = []
    for term in split_value(terms):
        components = split_subvalues(term)
        # A '--' placeholder term has no components; skip it instead of
        # failing on components[0].
        if components:
            term_ids.append(components[0])

    for gene in split_value(entrez):
        # setdefault replaces the has_key() check, which no longer exists
        # in Python 3.
        d.setdefault(gene, []).extend(term_ids)
|
||
|
|
||
|
|
||
|
def parse_options():
    """Parse the command line and return optparse's (options, args) pair.

    The -b/-c/-m switches select which GO trees to output, -u switches to
    a unique-term listing, and -d takes the name of an ftsv file.
    """
    parser = optparse.OptionParser()
    # Every boolean switch shares the same store_true/False behaviour.
    switch = {'action': "store_true", 'default': False}

    parser.add_option(
        '-b', '--biological-process', dest="bp",
        help="Output annotations in the biological process tree.",
        **switch)
    parser.add_option(
        '-c', '--cellular-component', dest="cc",
        help="Output annotations in the cellular component tree.",
        **switch)
    parser.add_option(
        '-d', '--output-dataset',
        help="Export as ftsv (Fluents dataset) file.")
    parser.add_option(
        '-m', '--molecular-function', dest="mf",
        help="Output annotations in the molecular function tree.",
        **switch)
    parser.add_option(
        '-u', '--unique-terms-only', dest="only_terms",
        help="Output only a list of all unique GO terms annotated to the genes",
        **switch)

    parsed = parser.parse_args()
    return parsed
|
||
|
|
||
|
def read_file(options):
    """Load 'entrez-go-mapping.cccsv' into the module-level tables.

    Each line holds ':::'-separated fields, read by position: probe id,
    Entrez gene ids, then the biological process, cellular component and
    molecular function annotations. The probe table is always filled; a
    GO table is filled only when its flag (options.bp / options.cc /
    options.mf) is set.

    Blank lines are skipped. A non-blank line with fewer fields than the
    requested trees need raises IndexError.
    """
    # The with-block closes the file even if a malformed line raises.
    with open('entrez-go-mapping.cccsv') as fd:
        # Iterate lazily instead of loading every line with readlines().
        for line in fd:
            if not line.strip():
                continue
            values = line.split(':::')

            probeid = values[0]
            entrez = values[1]
            set_probes(probeid, entrez)
            if options.bp:
                set_go(bp, entrez, values[2])
            if options.cc:
                set_go(cc, entrez, values[3])
            if options.mf:
                set_go(mf, entrez, values[4])
|
||
|
|
||
|
if __name__ == '__main__':
    # Usage: the positional arguments are the gene ids to look up; the
    # -b/-c/-m flags choose which GO trees are reported for them.
    options, args = parse_options()
    read_file(options)

    # Keep only the tables of the requested trees. Each gene is looked up
    # in the table of the tree being reported; testing membership against
    # a different tree's table raises KeyError or silently drops terms.
    selected = [table for enabled, table in
                ((options.bp, bp), (options.cc, cc), (options.mf, mf))
                if enabled]

    if options.only_terms:
        # Unique-terms mode: one GO id per line, in no particular order.
        unique_terms = set()
        for gene in args:
            for table in selected:
                unique_terms.update(table.get(gene, ()))

        for term in unique_terms:
            print("GO:%07d" % int(term))
        sys.exit(0)

    # Default mode: one line per gene, the gene id followed by its GO ids
    # (biological process, then cellular component, then molecular
    # function), separated by single spaces.
    for gene in args:
        fields = [gene]
        for table in selected:
            fields.extend("GO:%07d" % int(term)
                          for term in table.get(gene, ()))
        # print() with one string argument behaves the same on Python 2
        # and Python 3.
        print(" ".join(fields))
|
||
|
|