#include #include #include #include #include #include #include "godist.h" void print_terms(); void add_link(char*, char*); struct node* get_bp(); struct node* get_term(char *); void calc_ic(struct node *, unsigned int); struct node *common_subsumer(struct node *, struct node *); float resnik(struct node *, struct node *); /* initialisation */ int godist_init() { /* Initialize hash table and array */ hcreate(MAX_NODES); term_array_size = 0; link_count = 0; struct node *n; /* Read ontology terms from file */ printf("Reading GO terms from go-terms.txt..."); FILE *term_fd = fopen("go-terms.txt", "r"); if (term_fd == NULL) { printf("cannot open file: go-terms.txt\n"); exit(errno); } int i; while((i = godist_read_term(term_fd)) == 13) { /* printf("%d\n", i);*/ } fclose(term_fd); printf(" %d terms\n", term_array_size); /* Read ontology structure from file */ printf("Reading GO structure from go-tree.txt..."); FILE *tree_fd = fopen("go-tree.txt", "r"); if (tree_fd == NULL) { printf("cannot open file: go-tree.txt\n"); exit(errno); } while((i = godist_read_assoc(tree_fd)) == 2) { link_count++; } fclose(tree_fd); printf(" %d edges\n", link_count); printf("Calculating accumulated evidence..."); fflush(stdout); for (i=0; iacc_evidence[i]; printf("Using %d annotations.\n", total_ann); print_term(get_term("GO:0006006")); print_term(get_term("GO:0019318")); print_term(get_term("GO:0005996")); print_term(get_bp()); /* print_term(get_term("GO:0040007")); print_term(get_term("GO:0007275")); print_term(get_term("GO:0007582")); print_term(get_term("GO:0043473")); print_term(get_term("GO:0000004")); print_term(get_term("GO:0051704")); print_term(get_term("GO:0000003")); print_term(get_term("GO:0016032")); print_term(get_term("GO:0009987")); print_term(get_term("GO:0050896")); print_term(get_term("GO:0050789")); */ printf("Calculation information content..."); fflush(stdout); calculate_ics(0xffff); printf("\n"); /* calc_ic(get_bp(), 0xffff);*/ /* find_multi_parented();*/ common_subsumer(get_term("GO:0000003"), get_term("GO:0000004")); /** should return go:0016032 */ common_subsumer(get_term("GO:0019081"), get_term("GO:0050434")); printf("Resnik: %f\n", resnik(get_term("GO:0000003"), get_term("GO:0000004"))); } void godist_exit() { int i; for (i=0; iparentc = 0; n->childrenc = 0; n->visited = 0; for (i=0; i<12; i++) { n->evidence[i] = ev[i]; n->acc_evidence[i] = 0; } strcpy(n->term, term); /* add to hash table */ e.key = n->term; e.data = (void*)n; res = hsearch(e, ENTER); term_array[term_array_size++] = n; } return nread; } /* distance functions */ float go_distance(char *term1, char *term2) { return 0.0; } void clear_flags(struct node *n) { int i, j; for (i=0; ivisited = 0; for (j=0; j<12; j++) term_array[i]->temp_acc[j] = 0; } } void add_link(char *parent_id, char *child_id) { ENTRY *ep, e; struct node *parent, *child; char key[11]; strcpy(key, parent_id); e.key = key; ep = hsearch(e, FIND); if (!ep) { printf("Cannot find term %s\n", e.key); return; } parent = (struct node*) ep->key; strcpy(key, child_id); e.key = key; ep = hsearch(e, FIND); if (!ep) { printf("Cannot find term %s\n", e.key); return; } child = (struct node*) ep->key; if (parent->childrenc +1 > MAX_CHILDREN) { printf("FIXME: increase child count"); return; } parent->children[parent->childrenc] = child; parent->childrenc++; child->parents[child->parentc] = parent; child->parentc++; } struct node *get_bp() { return get_term("GO:0008150"); } struct node *get_term(char *term) { ENTRY e, *ep; e.key = term; ep = hsearch(e, FIND); if (ep) { return ep->data; } return NULL; } void accumulate_evidence(struct node *n) { int i; acc_ev(n); for (i=0; i<12; i++) { n->acc_evidence[i] = n->temp_acc[i]; } } void acc_ev(struct node *n) { int i, j; if (n->visited) return; n->visited = 1; for (i=0; i<12; i++) n->temp_acc[i] = n->evidence[i]; for (i=0; i<(n->childrenc); i++) { if (!n->children[i]->visited) { acc_ev(n->children[i]); for (j=0; j<12; j++) n->temp_acc[j] += n->children[i]->temp_acc[j]; } } } void print_terms() { int i; for (i=0; iterm); } } void print_term(struct node *n) { int i; printf("%s\n", n->term); printf(" children: %d\n", n->childrenc); printf(" parents: %d\n", n->parentc); printf(" evidence: "); for (i=0; i<12; i++) printf("%d ", n->evidence[i]); printf("\n"); printf(" accumulated evidence: "); for (i=0; i<12; i++) printf("%d ", n->acc_evidence[i]); printf("\n"); } void find_multi_parented() { int i; for (i=0; iparentc > 1) printf("%s -- %d\n", term_array[i]->term, term_array[i]->parentc); } } void calculate_ics(unsigned int evidence) { int i; for (i=0; iacc_evidence[i]; n->ic = -log(ann/total_ann); /* printf("%f\n", n->ic);*/ } struct node *common_subsumer(struct node *n1, struct node *n2) { struct node *anc1[MAX_NODES]; struct node *anc2[MAX_NODES]; int ancc1=0, ancc2=0; int i, j; struct node *retval=NULL; add_ancestors(&ancc1, anc1, n1); add_ancestors(&ancc2, anc2, n2); for (i=0; iic > retval->ic)) retval = anc1[i]; if (retval) ;// printf("Retval: %s\n", retval->term); else printf("No value to return"); return retval; } void add_ancestors(int *ancc, struct node *anc[], struct node *n) { int i=0; anc[(*ancc)++] = n; for (i=0; iparentc; i++) add_ancestors(ancc, anc, n->parents[i]); } float resnik(struct node *n1, struct node *n2) { struct node *subsumer = common_subsumer(n1, n2); if (!subsumer) return 20; else return n1->ic + n2->ic - 2.0 * subsumer->ic; } int read_terms(FILE *fd, struct node *terms[], int *termc) { char term[11]; int retval; printf("read_terms\n"); retval = fscanf(fd, " %10s ", term); while (retval != EOF) { printf("."); fflush(stdout); terms[(*termc)++] = get_term(term); retval = fscanf(fd, " %10s ", term); } return retval; } void build_dataset() { struct node *terms[MAX_NODES]; int termc = 0; int i, j; FILE *fd = fopen("dimension", "r"); read_terms(fd, terms, &termc); for (i=0; i