diff --git a/scripts/geneontology/go-gene-matrix b/scripts/geneontology/go-gene-matrix new file mode 100755 index 0000000..2aa7a17 --- /dev/null +++ b/scripts/geneontology/go-gene-matrix @@ -0,0 +1,80 @@ +#!/usr/bin/python + +import os, sys +import getopt +sys.path.append('../..') +from fluents import dataset +import numpy + +max_val = numpy.inf +no_nan = False + +def print_help(): + print + print "Usage: go-gene-matrix " + print + print "Description:" + print " Takes a GO term by GO term distance matrix and a file that" + print " maps GO terms to genes as input arguments and produces a" + print " dataset that contains the shortest distances between all" + print " genes and GO terms." + print + print "Options:" + print " -h, --help Show this help text." + print " -m, --max-dist Trunkate all distances to this value." + print + +def get_parameters(): + global max_val + short_opts = "hm:" + long_opts = ["help", "max-dist="] + + options, params = getopt.getopt(sys.argv[1:], short_opts, long_opts) + for opt, val in options: + if opt in ['-h', '--help']: + print_help() + sys.exit(0) + elif opt in ['-m', '--max-dist']: + max_val = int(val) + + if len(params) < 2: + print_help() + sys.exit(1) + + return params + +if __name__ == '__main__': + params = get_parameters() + + # Read dataset + fd = open(params[0]) + ds = dataset.read_ftsv(fd) + array = ds.asarray() + fd.close() + + # Read mapping + sorted_keys = [] + mapping = {} + fd = open(params[1]) + lines = fd.readlines() + for line in lines: + values = line.split() + if len(values) > 0: + mapping[values[0]] = values[1:] + sorted_keys.append(values[0]) + + # Create new dataset + matrix = numpy.zeros((len(sorted_keys), ds.shape[0])) + dim = ds.get_dim_name(0) + for i, gene in enumerate(sorted_keys): + for j, go in enumerate(ds[dim]): + min = max_val + for go2 in mapping[gene]: + if ds[dim].has_key(go2) and array[j, ds[dim][go2]] < min: + min = array[j, ds[dim][go2]] + matrix[i, j] = min + out_ds = dataset.Dataset(matrix, + (('genes', sorted_keys), ('go-terms', ds[dim])), + "Gene by GO matrix") + dataset.write_ftsv(sys.stdout, out_ds) +