einarr
335efe231a
and makes a gene vs. GO distance matrix based on the shortest distances found between each gene and GO term.
81 lines
2.2 KiB
Python
Executable File
81 lines
2.2 KiB
Python
Executable File
#!/usr/bin/python
|
|
|
|
import os, sys
|
|
import getopt
|
|
sys.path.append('../..')
|
|
from fluents import dataset
|
|
import numpy
|
|
|
|
# Upper bound applied to every reported distance.  It stays infinite (no
# truncation) unless get_parameters() overrides it from -m/--max-dist.
max_val = numpy.inf

# Module-level flag; no code visible in this file reads it.
no_nan = False
|
|
|
|
def print_help():
    """Write the usage text for this script to standard output.

    The text is emitted through a single parenthesised print of one string,
    which produces the same output under Python 2 and Python 3.
    """
    help_lines = [
        "",
        "Usage: go-gene-matrix <go-dist-matrix.ftsv> <gene-go-mapping.txt>",
        "",
        "Description:",
        " Takes a GO term by GO term distance matrix and a file that",
        " maps GO terms to genes as input arguments and produces a",
        " dataset that contains the shortest distances between all",
        " genes and GO terms.",
        "",
        "Options:",
        " -h, --help Show this help text.",
        " -m, --max-dist Truncate all distances to this value.",
        "",
    ]
    # print() appends the final newline, so the trailing "" entry yields the
    # closing blank line.
    print("\n".join(help_lines))
|
|
|
|
def get_parameters():
    """Parse ``sys.argv`` and return the positional arguments.

    Recognised options:
        -h, --help            print the usage text and exit with status 0.
        -m, --max-dist VALUE  store VALUE in the module-level ``max_val``.

    VALUE is parsed as a float so that fractional limits such as ``2.5``
    are accepted as well as whole numbers.

    Returns:
        The list of positional arguments; it always has at least two
        entries.

    Exits:
        With status 1, after printing the usage text, when an option cannot
        be parsed, when VALUE is not a number, or when fewer than two
        positional arguments are given.
    """
    global max_val

    short_opts = "hm:"
    long_opts = ["help", "max-dist="]

    try:
        options, params = getopt.getopt(sys.argv[1:], short_opts, long_opts)
    except getopt.GetoptError as err:
        # Unknown option or missing option argument: report it instead of
        # dying with a traceback.
        sys.stderr.write("Error: %s\n" % err)
        print_help()
        sys.exit(1)

    for opt, val in options:
        if opt in ('-h', '--help'):
            print_help()
            sys.exit(0)
        elif opt in ('-m', '--max-dist'):
            try:
                max_val = float(val)
            except ValueError:
                sys.stderr.write(
                    "Error: --max-dist expects a number, got %r\n" % val)
                print_help()
                sys.exit(1)

    if len(params) < 2:
        print_help()
        sys.exit(1)

    return params
|
|
|
|
def read_gene_go_mapping(lines):
    """Parse gene-to-GO annotation lines.

    Each non-blank line is whitespace separated: a gene identifier followed
    by zero or more GO term identifiers.  Blank lines are ignored.

    Args:
        lines: iterable of text lines (an open file works).

    Returns:
        A ``(genes, mapping)`` tuple.  ``genes`` lists the gene identifiers
        in first-seen order and ``mapping`` maps each gene to its list of GO
        terms.  A gene that occurs on several lines is listed once and the
        terms of all its lines are concatenated, so no annotation is lost
        and no row label is duplicated.
    """
    genes = []
    mapping = {}
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        gene, terms = fields[0], fields[1:]
        if gene not in mapping:
            mapping[gene] = []
            genes.append(gene)
        mapping[gene].extend(terms)
    return genes, mapping


def gene_go_distances(array, term_index, genes, mapping, max_dist):
    """Return the gene-by-GO matrix of shortest distances.

    Entry ``[i, j]`` is the smallest ``array[j, c]`` over the columns ``c``
    of the GO terms annotated to ``genes[i]``, capped at ``max_dist``.
    Annotated terms that are missing from ``term_index`` are ignored, and a
    gene without any known term gets ``max_dist`` in its whole row.

    Args:
        array: two-dimensional GO-by-GO distance matrix.
        term_index: mapping from GO identifier to its column in ``array``.
        genes: gene identifiers, one per output row.
        mapping: dict from gene identifier to a list of GO identifiers.
        max_dist: upper bound for every entry of the result.

    Returns:
        A float numpy array of shape ``(len(genes), array.shape[0])``.
    """
    array = numpy.asarray(array, dtype=float)
    matrix = numpy.empty((len(genes), array.shape[0]))
    matrix.fill(max_dist)

    for i, gene in enumerate(genes):
        columns = [term_index[term] for term in mapping[gene]
                   if term in term_index]
        if not columns:
            continue
        candidates = array[:, columns]
        # A NaN distance must never win the minimum (a "<" comparison with
        # NaN is always false), so treat it as infinitely far away.
        candidates = numpy.where(numpy.isnan(candidates), numpy.inf,
                                 candidates)
        matrix[i, :] = numpy.minimum(candidates.min(axis=1), max_dist)

    return matrix


if __name__ == '__main__':
    params = get_parameters()

    # Read the GO-by-GO distance dataset.
    with open(params[0]) as fd:
        ds = dataset.read_ftsv(fd)
        array = ds.asarray()

    # Read the gene -> GO term mapping; the file is closed when the block
    # is left, even if parsing fails.
    with open(params[1]) as fd:
        sorted_keys, mapping = read_gene_go_mapping(fd)

    # Create the new dataset.
    dim = ds.get_dim_name(0)
    # NOTE(review): column j of the result is computed from row j of the
    # distance array while the columns are labelled with ds[dim]; this
    # assumes ds[dim] iterates in index order -- confirm against the
    # dataset module.
    matrix = gene_go_distances(array, ds[dim], sorted_keys, mapping, max_val)
    out_ds = dataset.Dataset(matrix,
                             (('genes', sorted_keys), ('go-terms', ds[dim])),
                             "Gene by GO matrix")
    dataset.write_ftsv(sys.stdout, out_ds)
|
|
|