2007-05-15 20:15:51 +02:00
|
|
|
#!/usr/bin/python
|
|
|
|
|
|
|
|
import os, sys
|
|
|
|
import getopt
|
|
|
|
sys.path.append('../..')
|
2008-12-05 23:07:56 +01:00
|
|
|
from laydi import dataset
|
2007-05-15 20:15:51 +02:00
|
|
|
import numpy
|
|
|
|
|
|
|
|
# Ceiling applied to every reported distance.  Positive infinity (the same
# value as numpy.inf) means "no ceiling"; get_parameters() overwrites it
# when -m/--max-dist is given on the command line.
max_val = float("inf")

# Flag that is not referenced anywhere else in the visible part of this file.
no_nan = False
|
|
|
|
|
|
|
|
def print_help():
    """Write the command-line usage text to standard output.

    The whole text is emitted with a single ``sys.stdout.write`` call, so the
    function produces the same output whether ``print`` is a statement or a
    function in the running interpreter.
    """
    help_lines = [
        "",
        "Usage: go-gene-matrix <go-dist-matrix.ftsv> <gene-go-mapping.txt>",
        "",
        "Description:",
        " Takes a GO term by GO term distance matrix and a file that",
        " maps GO terms to genes as input arguments and produces a",
        " dataset that contains the shortest distances between all",
        " genes and GO terms.",
        "",
        "Options:",
        " -h, --help Show this help text.",
        # Spelling fixed: the original text read "Trunkate".
        " -m, --max-dist Truncate all distances to this value.",
        "",
    ]
    # Every entry, including the empty ones, ends with exactly one newline.
    sys.stdout.write("\n".join(help_lines) + "\n")
|
|
|
|
|
|
|
|
def get_parameters():
    """Parse ``sys.argv`` and return the positional arguments.

    Recognised options:
        -h, --help       print the usage text and exit with status 0.
        -m, --max-dist   store the given number in the module-level
                         ``max_val``.

    Returns:
        The list of positional arguments left over after option parsing;
        it always holds at least two entries.

    Exits (``SystemExit``):
        status 0 after ``--help``; status 1 when fewer than two positional
        arguments are given; status 2 when an option is unknown, lacks its
        argument, or the ``--max-dist`` value is not a number.
    """
    global max_val

    short_opts = "hm:"
    long_opts = ["help", "max-dist="]

    try:
        options, params = getopt.getopt(sys.argv[1:], short_opts, long_opts)
    except getopt.GetoptError:
        # sys.exc_info() instead of "except ... as err" keeps this block
        # valid regardless of the interpreter version running the script.
        sys.stderr.write("Error: %s\n" % sys.exc_info()[1])
        print_help()
        sys.exit(2)

    for opt, val in options:
        if opt in ('-h', '--help'):
            print_help()
            sys.exit(0)
        elif opt in ('-m', '--max-dist'):
            try:
                # float() accepts whole numbers as well as fractional
                # limits; int() used to reject values such as "2.5".
                max_val = float(val)
            except ValueError:
                sys.stderr.write(
                    "Error: invalid value for %s: %r\n" % (opt, val))
                print_help()
                sys.exit(2)

    # Both the distance matrix and the mapping file are required.
    if len(params) < 2:
        print_help()
        sys.exit(1)

    return params
|
|
|
|
|
|
|
|
def read_mapping(path):
    """Read the gene/GO-term mapping file at *path*.

    Every non-blank line is split on whitespace; the first field is used as
    the gene key and the remaining fields as that gene's GO terms.

    Returns:
        A ``(genes, mapping)`` pair.  *genes* lists the first field of each
        non-blank line in file order; *mapping* maps a gene key to the GO
        terms of the last line that started with that key.
    """
    genes = []
    mapping = {}
    fd = open(path)
    try:
        for line in fd:
            values = line.split()
            if values:
                mapping[values[0]] = values[1:]
                genes.append(values[0])
    finally:
        # The handle is released even when reading fails.
        fd.close()
    return genes, mapping


def gene_go_distances(array, term_index, genes, mapping, limit):
    """Build the gene-by-GO-term matrix of shortest distances.

    Args:
        array: two-dimensional GO-term-by-GO-term distance array.
        term_index: object that iterates over GO identifiers, supports
            ``in`` and maps an identifier to its position in *array*.  The
            same positions address rows and columns, i.e. *array* is taken
            to be square over one identifier set, as the script always did.
        genes: gene keys, one output row each, in output order.
        mapping: maps each gene key to a list of GO identifiers.
        limit: upper bound for every entry; it is also the value used when
            none of a gene's GO terms occurs in *term_index*.

    Returns:
        A float array of shape ``(len(genes), number of GO terms)`` whose
        columns follow the iteration order of *term_index*.
    """
    terms = list(term_index)
    # Address rows through the identifier lookup (as was already done for
    # columns) so the result does not rely on term_index iterating in
    # positional order.
    rows = [term_index[term] for term in terms]

    matrix = numpy.zeros((len(genes), len(terms)))
    for i, gene in enumerate(genes):
        # Loop-invariant for this gene: columns of its known GO terms.
        cols = [term_index[go] for go in mapping[gene] if go in term_index]
        for j, row in enumerate(rows):
            best = limit
            for col in cols:
                # A NaN never compares smaller, so NaN entries are skipped.
                if array[row, col] < best:
                    best = array[row, col]
            matrix[i, j] = best
    return matrix


if __name__ == '__main__':
    params = get_parameters()

    # Read dataset
    fd = open(params[0])
    try:
        ds = dataset.read_ftsv(fd)
        array = ds.asarray()
    finally:
        fd.close()

    # Read mapping
    sorted_keys, mapping = read_mapping(params[1])

    # Create new dataset
    dim = ds.get_dim_name(0)
    matrix = gene_go_distances(array, ds[dim], sorted_keys, mapping, max_val)
    out_ds = dataset.Dataset(matrix,
                             (('genes', sorted_keys), ('go-terms', ds[dim])),
                             "Gene by GO matrix")
    dataset.write_ftsv(sys.stdout, out_ds)
|
|
|
|
|