This repository has been archived on 2024-07-04. You can view files and clone it, but cannot push or open issues or pull requests.
laydi/scripts/geneontology/go-gene-matrix

81 lines
2.2 KiB
Plaintext
Raw Normal View History

#!/usr/bin/python
import os, sys
import getopt
sys.path.append('../..')
from laydi import dataset
import numpy
# Upper bound used as the starting value when searching for the shortest
# distance, so no reported distance exceeds it. Defaults to infinity (no
# truncation); get_parameters() overwrites it when -m/--max-dist is given.
max_val = numpy.inf
# NOTE(review): not read anywhere in the visible part of this script;
# possibly a leftover from an unfinished option -- confirm before removing.
no_nan = False
def print_help():
    """Write the usage text for go-gene-matrix to standard output.

    The text is emitted with a single write instead of print statements so
    that the function produces the same output on Python 2 and Python 3.
    """
    help_lines = (
        "",
        "Usage: go-gene-matrix <go-dist-matrix.ftsv> <gene-go-mapping.txt>",
        "",
        "Description:",
        " Takes a GO term by GO term distance matrix and a file that",
        " maps GO terms to genes as input arguments and produces a",
        " dataset that contains the shortest distances between all",
        " genes and GO terms.",
        "",
        "Options:",
        " -h, --help Show this help text.",
        " -m, --max-dist Truncate all distances to this value.",
        "",
    )
    # Every entry becomes one newline-terminated line, matching what one
    # print statement per line used to produce.
    sys.stdout.write("\n".join(help_lines) + "\n")
def get_parameters():
    """Parse the command line and return the positional arguments.

    Recognised options:
      -h, --help          print the help text and exit with status 0.
      -m, --max-dist VAL  store VAL (as a float) in the module-level
                          ``max_val``.

    Returns:
        The list of positional arguments; it always has at least two entries.

    Exits with status 1 (after printing a message and/or the help text) when
    an option cannot be parsed, when the --max-dist value is not a number, or
    when fewer than two positional arguments are given.
    """
    global max_val

    short_opts = "hm:"
    long_opts = ["help", "max-dist="]
    try:
        options, params = getopt.getopt(sys.argv[1:], short_opts, long_opts)
    except getopt.GetoptError as err:
        # Unknown option or missing option argument: report it to the user
        # instead of dying with a traceback.
        sys.stderr.write("go-gene-matrix: %s\n" % err)
        print_help()
        sys.exit(1)

    for opt, val in options:
        if opt in ('-h', '--help'):
            print_help()
            sys.exit(0)
        elif opt in ('-m', '--max-dist'):
            try:
                # float() rather than int(): the default limit is the float
                # numpy.inf, and a fractional limit such as 2.5 is valid.
                max_val = float(val)
            except ValueError:
                sys.stderr.write(
                    "go-gene-matrix: invalid value for %s: %r\n" % (opt, val))
                sys.exit(1)

    if len(params) < 2:
        print_help()
        sys.exit(1)
    return params
def read_gene_mapping(path):
    """Read a whitespace-separated gene-to-GO-term mapping file.

    Every non-blank line holds a gene identifier followed by zero or more
    GO term identifiers.

    Returns:
        A ``(genes, mapping)`` tuple. ``genes`` lists the gene identifiers in
        file order, one entry per non-blank line (a gene that occurs on
        several lines is listed several times). ``mapping`` maps each gene to
        the list of terms from its last line in the file.
    """
    genes = []
    mapping = {}
    # The context manager closes the file even if reading fails.
    with open(path) as fd:
        for line in fd:
            fields = line.split()
            if fields:
                mapping[fields[0]] = fields[1:]
                genes.append(fields[0])
    return genes, mapping


def shortest_gene_term_distances(distances, term_index, n_terms, genes,
                                 mapping, max_dist):
    """Build the gene-by-term matrix of shortest distances.

    Args:
        distances: 2-d array of term-to-term distances, indexed
            ``distances[row, column]``.
        term_index: mapping from term identifier to its column in
            ``distances``; iterated once per gene to enumerate the rows.
        n_terms: number of columns of the returned matrix.
        genes: gene identifiers, one output row each.
        mapping: dict from gene identifier to the terms annotated to it;
            terms missing from ``term_index`` are ignored.
        max_dist: upper bound; entries never exceed it, and a gene without
            any known term gets ``max_dist`` in every column.

    Returns:
        A ``len(genes)`` x ``n_terms`` float array whose entry ``[i, j]`` is
        the smallest of ``max_dist`` and ``distances[j, c]`` over the columns
        ``c`` of the terms mapped to gene ``i``.
    """
    result = numpy.zeros((len(genes), n_terms))
    for i, gene in enumerate(genes):
        # The columns depend only on the gene, so resolve them once here
        # rather than once per (gene, term) pair.
        columns = [term_index[term] for term in mapping[gene]
                   if term in term_index]
        for j, _term in enumerate(term_index):
            shortest = max_dist
            for column in columns:
                # A NaN distance never compares as smaller, so it is skipped.
                if distances[j, column] < shortest:
                    shortest = distances[j, column]
            result[i, j] = shortest
    return result


if __name__ == '__main__':
    params = get_parameters()

    # Read the term-by-term distance dataset.
    with open(params[0]) as fd:
        ds = dataset.read_ftsv(fd)
        array = ds.asarray()

    # Read the gene-to-term mapping.
    sorted_keys, mapping = read_gene_mapping(params[1])

    # Create the new dataset.
    # NOTE(review): rows of `array` are addressed by the enumeration position
    # within ds[dim], and columns by ds[dim][term]; this presumes ds[dim]
    # iterates in row order and that the matrix is square with the same
    # identifiers on both axes -- confirm against laydi.dataset.
    dim = ds.get_dim_name(0)
    matrix = shortest_gene_term_distances(array, ds[dim], ds.shape[0],
                                          sorted_keys, mapping, max_val)
    out_ds = dataset.Dataset(matrix,
                             (('genes', sorted_keys), ('go-terms', ds[dim])),
                             "Gene by GO matrix")
    dataset.write_ftsv(sys.stdout, out_ds)