go-gene-matrix takes a GO vs. GO distance matrix and a gene-go-mapping file
and makes a gene vs. go distance matrix based on the shortest distances found between each gene and go term.
This commit is contained in:
parent
1ab558248c
commit
335efe231a
80
scripts/geneontology/go-gene-matrix
Executable file
80
scripts/geneontology/go-gene-matrix
Executable file
@ -0,0 +1,80 @@
|
||||
#!/usr/bin/python
|
||||
|
||||
import os, sys
|
||||
import getopt
|
||||
sys.path.append('../..')
|
||||
from fluents import dataset
|
||||
import numpy
|
||||
|
||||
max_val = numpy.inf
|
||||
no_nan = False
|
||||
|
||||
def print_help():
|
||||
print
|
||||
print "Usage: go-gene-matrix <go-dist-matrix.ftsv> <gene-go-mapping.txt>"
|
||||
print
|
||||
print "Description:"
|
||||
print " Takes a GO term by GO term distance matrix and a file that"
|
||||
print " maps GO terms to genes as input arguments and produces a"
|
||||
print " dataset that contains the shortest distances between all"
|
||||
print " genes and GO terms."
|
||||
print
|
||||
print "Options:"
|
||||
print " -h, --help Show this help text."
|
||||
print " -m, --max-dist Trunkate all distances to this value."
|
||||
print
|
||||
|
||||
def get_parameters():
|
||||
global max_val
|
||||
short_opts = "hm:"
|
||||
long_opts = ["help", "max-dist="]
|
||||
|
||||
options, params = getopt.getopt(sys.argv[1:], short_opts, long_opts)
|
||||
for opt, val in options:
|
||||
if opt in ['-h', '--help']:
|
||||
print_help()
|
||||
sys.exit(0)
|
||||
elif opt in ['-m', '--max-dist']:
|
||||
max_val = int(val)
|
||||
|
||||
if len(params) < 2:
|
||||
print_help()
|
||||
sys.exit(1)
|
||||
|
||||
return params
|
||||
|
||||
if __name__ == '__main__':
|
||||
params = get_parameters()
|
||||
|
||||
# Read dataset
|
||||
fd = open(params[0])
|
||||
ds = dataset.read_ftsv(fd)
|
||||
array = ds.asarray()
|
||||
fd.close()
|
||||
|
||||
# Read mapping
|
||||
sorted_keys = []
|
||||
mapping = {}
|
||||
fd = open(params[1])
|
||||
lines = fd.readlines()
|
||||
for line in lines:
|
||||
values = line.split()
|
||||
if len(values) > 0:
|
||||
mapping[values[0]] = values[1:]
|
||||
sorted_keys.append(values[0])
|
||||
|
||||
# Create new dataset
|
||||
matrix = numpy.zeros((len(sorted_keys), ds.shape[0]))
|
||||
dim = ds.get_dim_name(0)
|
||||
for i, gene in enumerate(sorted_keys):
|
||||
for j, go in enumerate(ds[dim]):
|
||||
min = max_val
|
||||
for go2 in mapping[gene]:
|
||||
if ds[dim].has_key(go2) and array[j, ds[dim][go2]] < min:
|
||||
min = array[j, ds[dim][go2]]
|
||||
matrix[i, j] = min
|
||||
out_ds = dataset.Dataset(matrix,
|
||||
(('genes', sorted_keys), ('go-terms', ds[dim])),
|
||||
"Gene by GO matrix")
|
||||
dataset.write_ftsv(sys.stdout, out_ds)
|
||||
|
Reference in New Issue
Block a user