Added entrez-go-mapping that maps entrez IDs to GO terms based on a file of the form:

affy_id ::: geneid_1 /// geneid_2 ::: go(bp) ::: go(cc) ::: go(mf)
2007-03-15 23:47:24 +00:00 · 2007-03-15 23:47:24 +00:00 · 3dc1867be4
commit 3dc1867be4
parent 6fe4ff3c59
1 changed files with 112 additions and 0 deletions
--- a/scripts/geneontology/entrez-go-mapping
+++ b/scripts/geneontology/entrez-go-mapping
@ -0,0 +1,112 @@
+#!/usr/bin/python 
+
+import optparse
+import os
+import sys
+
+probes = {}  # entrez gene id -> list of probe ids (filled by set_probes)
+bp = {}  # gene id -> GO ids from the biological process column (set_go)
+cc = {}  # gene id -> GO ids from the cellular component column (set_go)
+mf = {}  # gene id -> GO ids from the molecular function column (set_go)
+
+
+def split_value(string):
+    """Splits a tab delimited value from affymetrix csv files"""
+    string = string.strip()
+    values = [x.strip() for x in string.split('///')]
+    if len(values) == 1 and values[0] == '---':
+        return []
+    return values
+
+def split_subvalues(string):
+    """Splits a value into smaller components"""
+    string = string.strip()
+    values = [x.strip() for x in string.split('//')]
+    if len(values) == 1 and values[0] == '--':
+        return []
+    return values
+
+def set_probes(probe, entrez):
+    """Set probe values for each entrez value."""
+    for gene_id in split_value(entrez):
+        if not probes.has_key(gene_id):
+            probes[gene_id] = []
+        probes[gene_id].append(probe.strip())
+
+def set_go(d, entrez, terms):
+    genes = split_value(entrez)
+    terms = split_value(terms)
+    for gene in genes:
+        if not d.has_key(gene):
+            d[gene] = []
+        for term in terms:
+            d[gene].append(split_subvalues(term)[0])
+
+
+def parse_options():
+    op = optparse.OptionParser()
+    op.add_option('-b', '--biological-process', dest="bp",
+                  help="Output annotations in the biological process tree.",
+                  action="store_true", default=False)
+    op.add_option('-c', '--cellular-component', dest="cc",
+                  help="Output annotations in the cellular component tree.",
+                  action="store_true", default=False)
+    op.add_option('-d', '--output-dataset',
+                  help="Export as ftsv (Fluents dataset) file.")
+    op.add_option('-m', '--molecular-function', dest="mf",
+                  help="Output annotations in the molecular function tree.",
+                  action="store_true", default=False)
+    op.add_option('-u', '--unique-terms-only', dest="only_terms",
+                  help="Output only a list of all unique GO terms annotated to the genes",
+                  action="store_true", default=False)
+    return op.parse_args()
+
+def read_file(options):
+    fd = open('entrez-go-mapping.cccsv')
+    for line in fd.readlines():
+        values = line.split(':::')
+        
+        probeid = values[0]
+        set_probes(probeid, values[1])
+        if options.bp:
+            set_go(bp, values[1], values[2])
+        if options.cc:
+            set_go(cc, values[1], values[3])
+        if options.mf:
+            set_go(mf, values[1], values[4])
+    fd.close()
+        
+if __name__ == '__main__':
+    options, args = parse_options()
+    read_file(options)
+
+    if options.only_terms:
+        s = set()
+        for gene in args:
+            if options.bp and bp.has_key(gene):
+                for x in bp[gene]:
+                    s.add(x)
+            if options.mf and bp.has_key(gene):
+                for x in mf[gene]:
+                    s.add(x)
+            if options.cc and bp.has_key(gene):
+                for x in cc[gene]:
+                    s.add(x)
+        
+        for term in s:
+            print "GO:%07d" % int(term)
+        sys.exit(0)
+
+    for gene in args:
+        print gene,
+        if options.bp and bp.has_key(gene):
+            for x in bp[gene]:
+                print "GO:%07d" % int(x),
+        if options.cc and bp.has_key(gene):
+            for x in cc[gene]:
+                print "GO:%07d" % int(x),
+        if options.mf and bp.has_key(gene):
+            for x in mf[gene]:
+                print "GO:%07d" % int(x),
+        print
+