go-gene-matrix takes a GO vs. GO distance matrix and a gene-go-mapping file
and makes a gene vs. go distance matrix based on the shortest distances found between each gene and go term.
This commit is contained in:
		
							
								
								
									
										80
									
								
								scripts/geneontology/go-gene-matrix
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										80
									
								
								scripts/geneontology/go-gene-matrix
									
									
									
									
									
										Executable file
									
								
							@@ -0,0 +1,80 @@
 | 
				
			|||||||
 | 
					#!/usr/bin/python
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import os, sys
 | 
				
			||||||
 | 
					import getopt
 | 
				
			||||||
 | 
					sys.path.append('../..')
 | 
				
			||||||
 | 
					from fluents import dataset
 | 
				
			||||||
 | 
					import numpy
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					max_val = numpy.inf
 | 
				
			||||||
 | 
					no_nan = False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def print_help():
 | 
				
			||||||
 | 
					    print
 | 
				
			||||||
 | 
					    print "Usage: go-gene-matrix <go-dist-matrix.ftsv> <gene-go-mapping.txt>"
 | 
				
			||||||
 | 
					    print
 | 
				
			||||||
 | 
					    print "Description:"
 | 
				
			||||||
 | 
					    print "    Takes a GO term by GO term distance matrix and a file that"
 | 
				
			||||||
 | 
					    print "    maps GO terms to genes as input arguments and produces a"
 | 
				
			||||||
 | 
					    print "    dataset that contains the shortest distances between all"
 | 
				
			||||||
 | 
					    print "    genes and GO terms."
 | 
				
			||||||
 | 
					    print
 | 
				
			||||||
 | 
					    print "Options:"
 | 
				
			||||||
 | 
					    print "    -h, --help       Show this help text."
 | 
				
			||||||
 | 
					    print "    -m, --max-dist   Trunkate all distances to this value."
 | 
				
			||||||
 | 
					    print
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def get_parameters():
 | 
				
			||||||
 | 
					    global max_val
 | 
				
			||||||
 | 
					    short_opts = "hm:"
 | 
				
			||||||
 | 
					    long_opts = ["help", "max-dist="]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    options, params = getopt.getopt(sys.argv[1:], short_opts, long_opts)
 | 
				
			||||||
 | 
					    for opt, val in options:
 | 
				
			||||||
 | 
					        if opt in ['-h', '--help']:
 | 
				
			||||||
 | 
					            print_help()
 | 
				
			||||||
 | 
					            sys.exit(0)
 | 
				
			||||||
 | 
					        elif opt in ['-m', '--max-dist']:
 | 
				
			||||||
 | 
					            max_val = int(val)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if len(params) < 2:
 | 
				
			||||||
 | 
					        print_help()
 | 
				
			||||||
 | 
					        sys.exit(1)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return params
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if __name__ == '__main__':
 | 
				
			||||||
 | 
					    params = get_parameters()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Read dataset
 | 
				
			||||||
 | 
					    fd = open(params[0])
 | 
				
			||||||
 | 
					    ds = dataset.read_ftsv(fd)
 | 
				
			||||||
 | 
					    array = ds.asarray()
 | 
				
			||||||
 | 
					    fd.close()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Read mapping
 | 
				
			||||||
 | 
					    sorted_keys = []
 | 
				
			||||||
 | 
					    mapping = {}
 | 
				
			||||||
 | 
					    fd = open(params[1])
 | 
				
			||||||
 | 
					    lines = fd.readlines()
 | 
				
			||||||
 | 
					    for line in lines:
 | 
				
			||||||
 | 
					        values = line.split()
 | 
				
			||||||
 | 
					        if len(values) > 0:
 | 
				
			||||||
 | 
					            mapping[values[0]] = values[1:]
 | 
				
			||||||
 | 
					            sorted_keys.append(values[0])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Create new dataset
 | 
				
			||||||
 | 
					    matrix = numpy.zeros((len(sorted_keys), ds.shape[0]))
 | 
				
			||||||
 | 
					    dim = ds.get_dim_name(0)
 | 
				
			||||||
 | 
					    for i, gene in enumerate(sorted_keys):
 | 
				
			||||||
 | 
					        for j, go in enumerate(ds[dim]):
 | 
				
			||||||
 | 
					            min = max_val
 | 
				
			||||||
 | 
					            for go2 in mapping[gene]:
 | 
				
			||||||
 | 
					                if ds[dim].has_key(go2) and array[j, ds[dim][go2]] < min:
 | 
				
			||||||
 | 
					                    min = array[j, ds[dim][go2]]
 | 
				
			||||||
 | 
					            matrix[i, j] = min
 | 
				
			||||||
 | 
					    out_ds = dataset.Dataset(matrix, 
 | 
				
			||||||
 | 
					                             (('genes', sorted_keys), ('go-terms', ds[dim])),
 | 
				
			||||||
 | 
					                             "Gene by GO matrix")
 | 
				
			||||||
 | 
					    dataset.write_ftsv(sys.stdout, out_ds)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		Reference in New Issue
	
	Block a user