Gene ontolgy distance script in C. Will hopefully speed up calculations when
finished.
This commit is contained in:
parent
6f19fe1b4b
commit
0904a59310
12
scripts/geneontology/go-distance/Makefile
Normal file
12
scripts/geneontology/go-distance/Makefile
Normal file
@ -0,0 +1,12 @@
|
||||
|
||||
all: go-distance
|
||||
|
||||
godist.o: godist.c godist.h
|
||||
gcc -ggdb -c godist.c
|
||||
|
||||
go-distance: godist.o main.o
|
||||
gcc -ggdb -o go-distance godist.o main.o
|
||||
|
||||
clean:
|
||||
-rm go-distance godist.o main.o
|
||||
|
10503
scripts/geneontology/go-distance/go-terms.txt
Normal file
10503
scripts/geneontology/go-distance/go-terms.txt
Normal file
File diff suppressed because it is too large
Load Diff
17829
scripts/geneontology/go-distance/go-tree.txt
Normal file
17829
scripts/geneontology/go-distance/go-tree.txt
Normal file
File diff suppressed because it is too large
Load Diff
170
scripts/geneontology/go-distance/godist.c
Normal file
170
scripts/geneontology/go-distance/godist.c
Normal file
@ -0,0 +1,170 @@
|
||||
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <search.h>
|
||||
#include "godist.h"
|
||||
|
||||
void print_terms();
|
||||
void add_link(char*, char*);
|
||||
struct node* get_bp();
|
||||
|
||||
/* initialisation */
|
||||
int godist_init() {
|
||||
/* Initialize hash table and array */
|
||||
hcreate(MAX_NODES);
|
||||
term_array_size = 0;
|
||||
link_count = 0;
|
||||
|
||||
/* Read ontology terms from file */
|
||||
printf("Reading GO terms from go-terms.txt...");
|
||||
FILE *term_fd = fopen("go-terms.txt", "r");
|
||||
|
||||
if (term_fd == NULL) {
|
||||
printf("cannot open file: go-terms.txt\n");
|
||||
exit(errno);
|
||||
}
|
||||
|
||||
int i;
|
||||
while((i = godist_read_term(term_fd)) == 13) {
|
||||
/* printf("%d\n", i);*/
|
||||
}
|
||||
fclose(term_fd);
|
||||
printf(" %d terms\n", term_array_size);
|
||||
|
||||
/* Read ontology structure from file */
|
||||
printf("Reading GO structure from go-tree.txt...");
|
||||
FILE *tree_fd = fopen("go-tree.txt", "r");
|
||||
|
||||
if (tree_fd == NULL) {
|
||||
printf("cannot open file: go-tree.txt\n");
|
||||
exit(errno);
|
||||
}
|
||||
|
||||
while((i = godist_read_assoc(tree_fd)) == 2) {
|
||||
link_count++;
|
||||
}
|
||||
fclose(tree_fd);
|
||||
printf(" %d edges\n", link_count);
|
||||
|
||||
accumulate_evidence(get_bp());
|
||||
}
|
||||
|
||||
void godist_exit() {
|
||||
int i;
|
||||
for (i=0; i<term_array_size; i++) {
|
||||
free(term_array[i]);
|
||||
}
|
||||
}
|
||||
|
||||
int godist_read_assoc(FILE *fd) {
|
||||
char term1[11], term2[11];
|
||||
int retval;
|
||||
retval = fscanf(fd, " %10s %10s ", term1, term2);
|
||||
if (retval != EOF) {
|
||||
add_link(term1, term2);
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
int godist_read_term(FILE *fd) {
|
||||
char term[11];
|
||||
int ev[12];
|
||||
int i;
|
||||
ENTRY e, *res;
|
||||
int nread = fscanf(fd, " %10s %d %d %d %d %d %d %d %d %d %d %d %d ",
|
||||
term, &ev[0], &ev[1], &ev[2], &ev[3], &ev[4], &ev[5],
|
||||
&ev[6], &ev[7], &ev[8], &ev[9], &ev[10], &ev[11]);
|
||||
if (errno != 0) {
|
||||
printf("errno: %d\n", errno);
|
||||
}
|
||||
if (nread == 13) {
|
||||
struct node *n = (struct node*) malloc(sizeof(struct node));
|
||||
n->parentc = 0;
|
||||
n->childrenc = 0;
|
||||
for (i=0; i<12; i++) {
|
||||
n->evidence[i] = ev[i];
|
||||
n->acc_evidence[i] = 0;
|
||||
}
|
||||
strcpy(n->term, term);
|
||||
|
||||
/* add to hash table */
|
||||
e.key = n->term;
|
||||
res = hsearch(e, ENTER);
|
||||
|
||||
term_array[term_array_size++] = n;
|
||||
}
|
||||
|
||||
return nread;
|
||||
}
|
||||
|
||||
/* distance functions */
|
||||
float go_distance(char *term1, char *term2) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
void add_node(char *term, int parentc, struct node **parents) {
|
||||
|
||||
}
|
||||
|
||||
void add_link(char *parent_id, char *child_id) {
|
||||
ENTRY *ep, e;
|
||||
struct node *parent, *child;
|
||||
|
||||
char key[11];
|
||||
strcpy(key, parent_id);
|
||||
e.key = key;
|
||||
ep = hsearch(e, FIND);
|
||||
if (!ep) {
|
||||
printf("Cannot find term %s\n", e.key);
|
||||
return;
|
||||
}
|
||||
parent = (struct node*) ep->key;
|
||||
|
||||
strcpy(key, child_id);
|
||||
e.key = key;
|
||||
ep = hsearch(e, FIND);
|
||||
if (!ep) {
|
||||
printf("Cannot find term %s\n", e.key);
|
||||
return;
|
||||
}
|
||||
child = (struct node*) ep->key;
|
||||
|
||||
if (parent->childrenc +1 > MAX_CHILDREN) {
|
||||
printf("FIXME: increase child count");
|
||||
return;
|
||||
}
|
||||
parent->children[parent->childrenc++] = child;
|
||||
child->parents[child->parentc++] = parent;
|
||||
}
|
||||
|
||||
struct node *get_bp() {
|
||||
ENTRY e, *ep;
|
||||
e.key = "GO:0008150";
|
||||
ep = hsearch(e, FIND);
|
||||
if (ep)
|
||||
return ep;
|
||||
if (ep == ESRCH)
|
||||
printf("ESRCH\n");
|
||||
if (ep == ESRCH)
|
||||
printf("ENOMEM\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void accumulate_evidence(struct node *n) {
|
||||
int i, j;
|
||||
for (i=0; i<(n->childrenc); i++) {
|
||||
accumulate_evidence(n->children[i]);
|
||||
for (j=0; j<12; j++)
|
||||
n->evidence[j] += n->children[i]->evidence[j];
|
||||
}
|
||||
}
|
||||
|
||||
void print_terms() {
|
||||
int i;
|
||||
for (i=0; i<term_array_size; i++) {
|
||||
printf("%s\n", term_array[i]->term);
|
||||
}
|
||||
}
|
||||
|
47
scripts/geneontology/go-distance/godist.h
Normal file
47
scripts/geneontology/go-distance/godist.h
Normal file
@ -0,0 +1,47 @@
|
||||
#ifndef GODIST_H
|
||||
#define GODIST_H
|
||||
|
||||
#include <search.h>
|
||||
|
||||
#define MAX_NODES 15000
|
||||
#define MAX_PARENTS 100
|
||||
#define MAX_CHILDREN 100
|
||||
|
||||
struct node;
|
||||
struct node {
|
||||
/* GO term id. E.g: "GO:0005180" */
|
||||
char term[11];
|
||||
/* Information content */
|
||||
float ic;
|
||||
/* Depth in tree */
|
||||
int depth;
|
||||
/* Evidence codes */
|
||||
int evidence[12];
|
||||
/* Accumulated evidence codes */
|
||||
int acc_evidence[12];
|
||||
|
||||
/* Parent count and parents */
|
||||
int parentc;
|
||||
struct node *parents[MAX_PARENTS];
|
||||
|
||||
/* Child count and children */
|
||||
int childrenc;
|
||||
struct node *children[MAX_CHILDREN];
|
||||
};
|
||||
|
||||
struct node* term_array[MAX_NODES];
|
||||
long term_array_size;
|
||||
int link_count;
|
||||
|
||||
/* Ontology initialisation functions. */
|
||||
int godist_init();
|
||||
int godist_read_assoc(FILE *fd);
|
||||
int godist_read_term(FILE *fd);
|
||||
void accumulate_evidence(struct node*);
|
||||
|
||||
/* Distance metric functions */
|
||||
float resnik_distance(char *term1, char *term2);
|
||||
float fussimeg_distance(char *term1, char *term2);
|
||||
|
||||
#endif
|
||||
|
31
scripts/geneontology/go-distance/main.c
Normal file
31
scripts/geneontology/go-distance/main.c
Normal file
@ -0,0 +1,31 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include "godist.h"
|
||||
|
||||
extern char *optarg;
|
||||
extern int optind, opterr, optopt;
|
||||
|
||||
#define _GNU_SOURCE
|
||||
#include <getopt.h>
|
||||
|
||||
void print_help() {
|
||||
printf("go-distance 0.1.0\n\n");
|
||||
printf("Usage: go-distance [hr] <go-terms>\n\n");
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
int i;
|
||||
while ((i = getopt(argc, argv, "h")) != -1) {
|
||||
switch(i) {
|
||||
case 104:
|
||||
print_help();
|
||||
exit(0);
|
||||
break;
|
||||
};
|
||||
}
|
||||
godist_init();
|
||||
godist_exit();
|
||||
}
|
||||
|
Reference in New Issue
Block a user