360 lines
8.3 KiB
C
360 lines
8.3 KiB
C
|
|
#include <math.h>
|
|
#include <string.h>
|
|
#include <errno.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <search.h>
|
|
#include "godist.h"
|
|
|
|
void print_terms();
|
|
void add_link(char*, char*);
|
|
struct node* get_bp();
|
|
struct node* get_term(char *);
|
|
void calc_ic(struct node *, unsigned int);
|
|
struct node *common_subsumer(struct node *, struct node *);
|
|
float resnik(struct node *, struct node *);
|
|
|
|
|
|
/* initialisation */
|
|
int godist_init() {
|
|
/* Initialize hash table and array */
|
|
hcreate(MAX_NODES);
|
|
term_array_size = 0;
|
|
link_count = 0;
|
|
struct node *n;
|
|
|
|
/* Read ontology terms from file */
|
|
printf("Reading GO terms from go-terms.txt...");
|
|
FILE *term_fd = fopen("go-terms.txt", "r");
|
|
|
|
if (term_fd == NULL) {
|
|
printf("cannot open file: go-terms.txt\n");
|
|
exit(errno);
|
|
}
|
|
|
|
int i;
|
|
while((i = godist_read_term(term_fd)) == 13) {
|
|
/* printf("%d\n", i);*/
|
|
}
|
|
fclose(term_fd);
|
|
printf(" %d terms\n", term_array_size);
|
|
|
|
/* Read ontology structure from file */
|
|
printf("Reading GO structure from go-tree.txt...");
|
|
FILE *tree_fd = fopen("go-tree.txt", "r");
|
|
|
|
if (tree_fd == NULL) {
|
|
printf("cannot open file: go-tree.txt\n");
|
|
exit(errno);
|
|
}
|
|
|
|
while((i = godist_read_assoc(tree_fd)) == 2) {
|
|
link_count++;
|
|
}
|
|
fclose(tree_fd);
|
|
printf(" %d edges\n", link_count);
|
|
|
|
printf("Calculating accumulated evidence...");
|
|
fflush(stdout);
|
|
for (i=0; i<term_array_size; i++) {
|
|
clear_flags(get_bp());
|
|
accumulate_evidence(term_array[i]);
|
|
}
|
|
printf("\n");
|
|
|
|
evidence = 0xff;
|
|
total_ann = 0;
|
|
n = get_bp();
|
|
for (i=0; i<12; i++)
|
|
if (evidence & 1<<i)
|
|
total_ann += n->acc_evidence[i];
|
|
printf("Using %d annotations.\n", total_ann);
|
|
|
|
print_term(get_term("GO:0006006"));
|
|
print_term(get_term("GO:0019318"));
|
|
print_term(get_term("GO:0005996"));
|
|
print_term(get_bp());
|
|
/*
|
|
print_term(get_term("GO:0040007"));
|
|
print_term(get_term("GO:0007275"));
|
|
print_term(get_term("GO:0007582"));
|
|
print_term(get_term("GO:0043473"));
|
|
print_term(get_term("GO:0000004"));
|
|
print_term(get_term("GO:0051704"));
|
|
print_term(get_term("GO:0000003"));
|
|
print_term(get_term("GO:0016032"));
|
|
print_term(get_term("GO:0009987"));
|
|
print_term(get_term("GO:0050896"));
|
|
print_term(get_term("GO:0050789"));
|
|
*/
|
|
printf("Calculation information content...");
|
|
fflush(stdout);
|
|
calculate_ics(0xffff);
|
|
printf("\n");
|
|
/* calc_ic(get_bp(), 0xffff);*/
|
|
/* find_multi_parented();*/
|
|
common_subsumer(get_term("GO:0000003"), get_term("GO:0000004"));
|
|
/** should return go:0016032 */
|
|
common_subsumer(get_term("GO:0019081"), get_term("GO:0050434"));
|
|
|
|
printf("Resnik: %f\n", resnik(get_term("GO:0000003"), get_term("GO:0000004")));
|
|
|
|
}
|
|
|
|
void godist_exit() {
|
|
int i;
|
|
for (i=0; i<term_array_size; i++) {
|
|
free(term_array[i]);
|
|
}
|
|
}
|
|
|
|
int godist_read_assoc(FILE *fd) {
|
|
char term1[11], term2[11];
|
|
int retval;
|
|
retval = fscanf(fd, " %10s %10s ", term1, term2);
|
|
if (retval != EOF) {
|
|
add_link(term1, term2);
|
|
}
|
|
return retval;
|
|
}
|
|
|
|
int godist_read_term(FILE *fd) {
|
|
char term[11];
|
|
int ev[12];
|
|
int i;
|
|
ENTRY e, *res;
|
|
int nread = fscanf(fd, " %10s %d %d %d %d %d %d %d %d %d %d %d %d ",
|
|
term, &ev[0], &ev[1], &ev[2], &ev[3], &ev[4], &ev[5],
|
|
&ev[6], &ev[7], &ev[8], &ev[9], &ev[10], &ev[11]);
|
|
if (errno != 0) {
|
|
printf("errno: %d\n", errno);
|
|
}
|
|
if (nread == 13) {
|
|
struct node *n = (struct node*) malloc(sizeof(struct node));
|
|
n->parentc = 0;
|
|
n->childrenc = 0;
|
|
n->visited = 0;
|
|
for (i=0; i<12; i++) {
|
|
n->evidence[i] = ev[i];
|
|
n->acc_evidence[i] = 0;
|
|
}
|
|
strcpy(n->term, term);
|
|
|
|
/* add to hash table */
|
|
e.key = n->term;
|
|
e.data = (void*)n;
|
|
res = hsearch(e, ENTER);
|
|
|
|
term_array[term_array_size++] = n;
|
|
}
|
|
|
|
return nread;
|
|
}
|
|
|
|
/* distance functions */
|
|
float go_distance(char *term1, char *term2) {
|
|
return 0.0;
|
|
}
|
|
|
|
void clear_flags(struct node *n) {
|
|
int i, j;
|
|
for (i=0; i<term_array_size; i++) {
|
|
term_array[i]->visited = 0;
|
|
for (j=0; j<12; j++)
|
|
term_array[i]->temp_acc[j] = 0;
|
|
}
|
|
}
|
|
|
|
void add_link(char *parent_id, char *child_id) {
|
|
ENTRY *ep, e;
|
|
struct node *parent, *child;
|
|
|
|
char key[11];
|
|
strcpy(key, parent_id);
|
|
e.key = key;
|
|
ep = hsearch(e, FIND);
|
|
if (!ep) {
|
|
printf("Cannot find term %s\n", e.key);
|
|
return;
|
|
}
|
|
parent = (struct node*) ep->key;
|
|
|
|
strcpy(key, child_id);
|
|
e.key = key;
|
|
ep = hsearch(e, FIND);
|
|
if (!ep) {
|
|
printf("Cannot find term %s\n", e.key);
|
|
return;
|
|
}
|
|
child = (struct node*) ep->key;
|
|
|
|
if (parent->childrenc +1 > MAX_CHILDREN) {
|
|
printf("FIXME: increase child count");
|
|
return;
|
|
}
|
|
parent->children[parent->childrenc] = child;
|
|
parent->childrenc++;
|
|
child->parents[child->parentc] = parent;
|
|
child->parentc++;
|
|
}
|
|
|
|
struct node *get_bp() {
|
|
return get_term("GO:0008150");
|
|
}
|
|
|
|
struct node *get_term(char *term) {
|
|
ENTRY e, *ep;
|
|
e.key = term;
|
|
ep = hsearch(e, FIND);
|
|
|
|
if (ep) {
|
|
return ep->data;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
void accumulate_evidence(struct node *n) {
|
|
int i;
|
|
acc_ev(n);
|
|
for (i=0; i<12; i++) {
|
|
n->acc_evidence[i] = n->temp_acc[i];
|
|
}
|
|
}
|
|
|
|
void acc_ev(struct node *n) {
|
|
int i, j;
|
|
if (n->visited)
|
|
return;
|
|
n->visited = 1;
|
|
|
|
for (i=0; i<12; i++)
|
|
n->temp_acc[i] = n->evidence[i];
|
|
|
|
for (i=0; i<(n->childrenc); i++) {
|
|
if (!n->children[i]->visited) {
|
|
acc_ev(n->children[i]);
|
|
for (j=0; j<12; j++)
|
|
n->temp_acc[j] += n->children[i]->temp_acc[j];
|
|
}
|
|
}
|
|
}
|
|
|
|
void print_terms() {
|
|
int i;
|
|
for (i=0; i<term_array_size; i++) {
|
|
printf("%s\n", term_array[i]->term);
|
|
}
|
|
}
|
|
|
|
void print_term(struct node *n) {
|
|
int i;
|
|
printf("%s\n", n->term);
|
|
printf(" children: %d\n", n->childrenc);
|
|
printf(" parents: %d\n", n->parentc);
|
|
printf(" evidence: ");
|
|
for (i=0; i<12; i++)
|
|
printf("%d ", n->evidence[i]);
|
|
printf("\n");
|
|
printf(" accumulated evidence: ");
|
|
for (i=0; i<12; i++)
|
|
printf("%d ", n->acc_evidence[i]);
|
|
printf("\n");
|
|
}
|
|
|
|
void find_multi_parented() {
|
|
int i;
|
|
for (i=0; i<term_array_size; i++) {
|
|
if (term_array[i]->parentc > 1)
|
|
printf("%s -- %d\n", term_array[i]->term, term_array[i]->parentc);
|
|
}
|
|
}
|
|
|
|
void calculate_ics(unsigned int evidence) {
|
|
int i;
|
|
for (i=0; i<term_array_size; i++)
|
|
calc_ic(term_array[i], evidence);
|
|
}
|
|
|
|
void calc_ic(struct node *n, unsigned int evidence) {
|
|
int i;
|
|
float ann=0.0;
|
|
for (i=0; i<12; i++)
|
|
if (evidence & 1<<i)
|
|
ann += (float) n->acc_evidence[i];
|
|
n->ic = -log(ann/total_ann);
|
|
/* printf("%f\n", n->ic);*/
|
|
}
|
|
|
|
struct node *common_subsumer(struct node *n1, struct node *n2) {
|
|
struct node *anc1[MAX_NODES];
|
|
struct node *anc2[MAX_NODES];
|
|
int ancc1=0, ancc2=0;
|
|
int i, j;
|
|
struct node *retval=NULL;
|
|
|
|
add_ancestors(&ancc1, anc1, n1);
|
|
add_ancestors(&ancc2, anc2, n2);
|
|
for (i=0; i<ancc1; i++)
|
|
for (j=0; j<ancc2; j++)
|
|
if (anc1[i] == anc2[j])
|
|
if ((!retval) || (anc1[i]->ic > retval->ic))
|
|
retval = anc1[i];
|
|
if (retval)
|
|
;// printf("Retval: %s\n", retval->term);
|
|
else
|
|
printf("No value to return");
|
|
return retval;
|
|
}
|
|
|
|
void add_ancestors(int *ancc, struct node *anc[], struct node *n) {
|
|
int i=0;
|
|
anc[(*ancc)++] = n;
|
|
for (i=0; i<n->parentc; i++)
|
|
add_ancestors(ancc, anc, n->parents[i]);
|
|
}
|
|
|
|
float resnik(struct node *n1, struct node *n2) {
|
|
struct node *subsumer = common_subsumer(n1, n2);
|
|
if (!subsumer)
|
|
return 20;
|
|
else
|
|
return n1->ic + n2->ic - 2.0 * subsumer->ic;
|
|
}
|
|
|
|
|
|
int read_terms(FILE *fd, struct node *terms[], int *termc) {
|
|
char term[11];
|
|
int retval;
|
|
printf("read_terms\n");
|
|
retval = fscanf(fd, " %10s ", term);
|
|
while (retval != EOF) {
|
|
printf(".");
|
|
fflush(stdout);
|
|
terms[(*termc)++] = get_term(term);
|
|
retval = fscanf(fd, " %10s ", term);
|
|
}
|
|
return retval;
|
|
}
|
|
|
|
void build_dataset() {
|
|
struct node *terms[MAX_NODES];
|
|
int termc = 0;
|
|
int i, j;
|
|
|
|
FILE *fd = fopen("dimension", "r");
|
|
|
|
read_terms(fd, terms, &termc);
|
|
|
|
for (i=0; i<termc; i++) {
|
|
for (j=0; j<termc; j++) {
|
|
printf("%f ", resnik(terms[i], terms[j]));
|
|
}
|
|
printf("\n");
|
|
}
|
|
|
|
|
|
fclose(fd);
|
|
}
|
|
|