#!/usr/bin/python import os, sys import getopt sys.path.append('../..') from fluents import dataset import numpy max_val = numpy.inf no_nan = False def print_help(): print print "Usage: go-gene-matrix " print print "Description:" print " Takes a GO term by GO term distance matrix and a file that" print " maps GO terms to genes as input arguments and produces a" print " dataset that contains the shortest distances between all" print " genes and GO terms." print print "Options:" print " -h, --help Show this help text." print " -m, --max-dist Trunkate all distances to this value." print def get_parameters(): global max_val short_opts = "hm:" long_opts = ["help", "max-dist="] options, params = getopt.getopt(sys.argv[1:], short_opts, long_opts) for opt, val in options: if opt in ['-h', '--help']: print_help() sys.exit(0) elif opt in ['-m', '--max-dist']: max_val = int(val) if len(params) < 2: print_help() sys.exit(1) return params if __name__ == '__main__': params = get_parameters() # Read dataset fd = open(params[0]) ds = dataset.read_ftsv(fd) array = ds.asarray() fd.close() # Read mapping sorted_keys = [] mapping = {} fd = open(params[1]) lines = fd.readlines() for line in lines: values = line.split() if len(values) > 0: mapping[values[0]] = values[1:] sorted_keys.append(values[0]) # Create new dataset matrix = numpy.zeros((len(sorted_keys), ds.shape[0])) dim = ds.get_dim_name(0) for i, gene in enumerate(sorted_keys): for j, go in enumerate(ds[dim]): min = max_val for go2 in mapping[gene]: if ds[dim].has_key(go2) and array[j, ds[dim][go2]] < min: min = array[j, ds[dim][go2]] matrix[i, j] = min out_ds = dataset.Dataset(matrix, (('genes', sorted_keys), ('go-terms', ds[dim])), "Gene by GO matrix") dataset.write_ftsv(sys.stdout, out_ds)