Mercurial > data_structures
changeset 15:d11dfc49b559
Seems to be working, but I need to prove the algorithms and also make ctrie_remove compress down, too. See the final output where the standalone 'n' node could be pushed into it's children
author | Eris Caffee <discordia@eldalin.com> |
---|---|
date | Wed, 03 Oct 2012 01:07:04 -0500 |
parents | ef73b96fdcae |
children | 5d8158ad2d61 |
files | src/ctrie.c src/ctrie.h src/ctrie_test.c |
diffstat | 3 files changed, 242 insertions(+), 122 deletions(-) [+] |
line diff
1.1 --- a/src/ctrie.c Tue Oct 02 20:00:38 2012 -0500 1.2 +++ b/src/ctrie.c Wed Oct 03 01:07:04 2012 -0500 1.3 @@ -61,7 +61,7 @@ 1.4 ctrie->root->data = NULL; 1.5 ctrie->size = ctrie->count = 0; 1.6 1.7 - printf("The root node is %p\n", ctrie->root); 1.8 + // printf("The root node is %p\n", (void *) ctrie->root); 1.9 return ctrie; 1.10 } 1.11 1.12 @@ -137,8 +137,7 @@ 1.13 if ( (NULL == ctrie) || (NULL == key) ) 1.14 return NULL; 1.15 1.16 - printf("root node is "); 1.17 - ctrie_print_node(ctrie->root); 1.18 + // printf("root node is "); ctrie_print_node(ctrie->root); 1.19 return ctrie_insert_node(ctrie, ctrie->root, key, data); 1.20 } 1.21 1.22 @@ -151,74 +150,81 @@ 1.23 1.24 struct ctrie_node * p = ctrie_longest_matching_child(node, key); 1.25 1.26 - /* 1.27 - printf("Longest match is "); 1.28 + 1.29 + // printf("Longest match is "); 1.30 if (NULL != p) 1.31 ctrie_print_node(p); 1.32 else 1.33 printf("(nil)\n"); 1.34 - */ 1.35 + 1.36 1.37 if (NULL == p) 1.38 { 1.39 // No matching child found. Create new(key, data) as child of node 1.40 struct ctrie_node * new = ctrie_node_new(key, data); 1.41 if (NULL == new) 1.42 - return NULL; 1.43 + return NULL; 1.44 new->parent = node; 1.45 1.46 if (NULL == node->child) 1.47 - node->child = new; 1.48 - else 1.49 - { 1.50 - struct ctrie_node * s = node->child; 1.51 - while (NULL != s->next_sibling) 1.52 - s = s->next_sibling; 1.53 - s->next_sibling = new; 1.54 - new->prev_sibling = s; 1.55 - } 1.56 + node->child = new; 1.57 + else 1.58 + { 1.59 + struct ctrie_node * s = node->child; 1.60 + while (NULL != s->next_sibling) 1.61 + s = s->next_sibling; 1.62 + s->next_sibling = new; 1.63 + new->prev_sibling = s; 1.64 + } 1.65 1.66 ctrie->size++; 1.67 ctrie->count++; 1.68 1.69 - /* 1.70 - printf("Added "); 1.71 - ctrie_print_node(new); 1.72 - */ 1.73 + 1.74 + // printf("Added "); ctrie_print_node(new); 1.75 + 1.76 1.77 return key; 1.78 } 1.79 else if (ctrie_str_common(p->key, key) == strlen(p->key)) 1.80 { 1.81 // Found a node whose key matches for it's entire length 1.82 - if (0 == strcmp(p->key, key)) 1.83 - { 1.84 - // Key already exists. Error 1.85 - return NULL; 1.86 - } 1.87 + // Does it match the entire rest of the key we are inserting? 1.88 + if (ctrie_str_common(p->key, key) == strlen(key)) 1.89 + { 1.90 + if (p->data) 1.91 + { 1.92 + // printf("This node already exists\n"); 1.93 + // Key already exists. Error 1.94 + return NULL; 1.95 + } 1.96 + else 1.97 + { 1.98 + p->data = data; 1.99 + ctrie->count++; 1.100 + return key; 1.101 + } 1.102 + } 1.103 else 1.104 - { 1.105 - // The node found has a key that is a prefix of the key we are inserting. 1.106 - // Recurse with the remainder of the key 1.107 - // printf("Recursing because node's entire key %s is a prefix of %s\n", p->key, key); 1.108 - return ctrie_insert_node(ctrie, p, key+strlen(p->key), data); 1.109 - } 1.110 + { 1.111 + // The node found has a key that is a prefix of the key we are inserting. 1.112 + // Recurse with the remainder of the key 1.113 + // printf("Recursing because node's entire key %s is a prefix of %s\n", p->key, key); 1.114 + return ctrie_insert_node(ctrie, p, key+strlen(p->key), data); 1.115 + } 1.116 } 1.117 else 1.118 { 1.119 // Found a node that matches for only part of it's length. 1.120 // Split the node and then recurse. 1.121 1.122 - /* 1.123 - printf("splitting this node: "); 1.124 - ctrie_print_node(p); 1.125 - */ 1.126 + // printf("splitting this node: "); ctrie_print_node(p); 1.127 1.128 // Get the new shorter key for p. 1.129 int matchlen = ctrie_str_common(p->key, key); 1.130 char * prefix = malloc(matchlen+1); 1.131 if (NULL == prefix) 1.132 - return NULL; 1.133 + return NULL; 1.134 1.135 strncpy(prefix, key, matchlen); 1.136 prefix[matchlen] = '\0'; 1.137 @@ -249,8 +255,8 @@ 1.138 */ 1.139 1.140 // Now add the user requested new key as a child of p 1.141 - // printf("Recursing with key %s as a prefix of %s\n", p->key, key); 1.142 - return ctrie_insert_node(ctrie, p, key + matchlen, data); 1.143 + // printf("Recursing with node %p key %s as a prefix of %s\n", p->parent, p->parent->key, key); 1.144 + return ctrie_insert_node(ctrie, p->parent, key, data); 1.145 } 1.146 1.147 // should not get here 1.148 @@ -265,13 +271,13 @@ 1.149 int len = 0; 1.150 int l; 1.151 1.152 - printf("Searching for match in children of %p\n", node); 1.153 + //printf("Searching for >%s< in children of %p (%s)\n", key, (void *) node, node->key); 1.154 1.155 struct ctrie_node * p = node->child; 1.156 while (NULL != p) 1.157 { 1.158 l = ctrie_str_common(p->key, key); 1.159 - printf("common length is %d\n", l); 1.160 + //printf(" Checking >%s< common length is %d\n", p->key, l); 1.161 if (l > len) 1.162 { 1.163 len = l; 1.164 @@ -279,6 +285,8 @@ 1.165 } 1.166 p = p->next_sibling; 1.167 } 1.168 + 1.169 + //printf("Returning longest match %p\n", longest); 1.170 return longest; 1.171 } 1.172 1.173 @@ -294,10 +302,14 @@ 1.174 int 1.175 ctrie_str_common(char * s1, char * s2) 1.176 { 1.177 + // printf("s1 >%s< s1 >%s<\n", s1, s2); 1.178 int i = 0; 1.179 - while ( (*s1 != '\0') && (*s2 != '\0') && 1.180 + while ( (*(s1+i) != '\0') && (*(s2+i) != '\0') && 1.181 (*(s1+i) == *(s2+i)) ) 1.182 + { 1.183 + // printf("*s1+%d %c *s2+%d %c\n", i, (*(s1+i) == '\0' ? "." : *(s1+i)), i, (*(s2+i) == '\0' ? "." : *(s2+i))); 1.184 i++; 1.185 + } 1.186 1.187 return i; 1.188 } 1.189 @@ -309,7 +321,9 @@ 1.190 if (NULL == node) 1.191 printf("(nil node)\n"); 1.192 else 1.193 - printf("%s \t%p\t parent:%p \tchild:%p \tprev:%p \tnext:%p \tdata:%p\n", node->key, node, node->parent, node->child, node->prev_sibling, node->next_sibling, node->data); 1.194 + printf("%s \t%p\t parent:%p \tchild:%p \tprev:%p \tnext:%p \tdata:%p\n", 1.195 + node->key, (void *) node, (void *) node->parent, (void *) node->child, 1.196 + (void *) node->prev_sibling, (void *) node->next_sibling, node->data); 1.197 1.198 } 1.199 1.200 @@ -376,8 +390,10 @@ 1.201 void 1.202 ctrie_visit_node(struct ctrie * ctrie, struct ctrie_node * node, void (*op)(char * k, void * d), char * prefix) 1.203 { 1.204 + // printf("Visiting node %p with prefix >%s<\n", node, prefix); 1.205 if (NULL != node->data) 1.206 { 1.207 + // printf("node has data\n"); 1.208 char * k = malloc(strlen(prefix) + strlen(node->key)); 1.209 if (NULL == k) 1.210 return; 1.211 @@ -390,6 +406,7 @@ 1.212 1.213 if(NULL != node->child) 1.214 { 1.215 + // printf("node has children\n"); 1.216 char * k = malloc(strlen(prefix) + strlen(node->key)); 1.217 if (NULL == k) 1.218 return; 1.219 @@ -426,17 +443,15 @@ 1.220 size_t len = strlen(key); 1.221 for (int i = 0; i <= len; i += strlen(node->key)) 1.222 { 1.223 - node = ctrie_search_children(node, key+i); 1.224 + node = ctrie_longest_matching_child(node, key+i); 1.225 if (NULL == node) 1.226 return NULL; 1.227 - if (NULL != node->data) 1.228 + if (strlen(key+i) == strlen(node->key)) 1.229 return node; 1.230 } 1.231 1.232 // Should never get here 1.233 return NULL; 1.234 - 1.235 - struct ctrie_node * p = ctrie_longest_matching_child(node, key); 1.236 } 1.237 1.238 //////////////////////////////////////////////////////////////////////////////// 1.239 @@ -453,73 +468,21 @@ 1.240 return p; 1.241 } 1.242 1.243 - 1.244 - 1.245 - 1.246 - 1.247 - 1.248 - 1.249 - 1.250 - 1.251 - 1.252 - 1.253 - 1.254 - 1.255 - 1.256 - 1.257 - 1.258 - 1.259 - 1.260 - 1.261 - 1.262 -//////////////////////////////////////////////////////////////////////////////// 1.263 -void * 1.264 -ctrie_remove(struct ctrie * ctrie, char * key) 1.265 - { 1.266 - if ( (NULL == ctrie) || (NULL == key) ) 1.267 - return NULL; 1.268 - 1.269 - struct ctrie_node * node = ctrie_find_node(ctrie, key); 1.270 - if (NULL == node) 1.271 - return NULL; 1.272 - 1.273 - void * data = node->data; 1.274 - 1.275 - while ( (NULL == node->prev_sibling) && (NULL == node->prev_sibling) ) 1.276 - { 1.277 - struct ctrie_node * p = node; 1.278 - node = node->parent; 1.279 - free(p); 1.280 - ctrie->size--; 1.281 - } 1.282 - 1.283 - if (node == ctrie->root) 1.284 - return data; 1.285 - 1.286 - if (node->prev_sibling) 1.287 - node->prev_sibling->next_sibling = node->next_sibling; 1.288 - else 1.289 - node->parent->child = node->next_sibling; 1.290 - 1.291 - if (node->next_sibling) 1.292 - node->next_sibling->prev_sibling = node->prev_sibling; 1.293 - 1.294 - free(node); 1.295 - 1.296 - ctrie->count--; 1.297 - 1.298 - return data; 1.299 - } 1.300 - 1.301 //////////////////////////////////////////////////////////////////////////////// 1.302 void 1.303 -ctrie_subtree_list_push(char * k, void * d) 1.304 +ctrie_subtree_list_push(char * key, void * data) 1.305 { 1.306 if (NULL == subkey_list) 1.307 return; 1.308 - 1.309 + char * k = malloc(strlen(key)+1); 1.310 + if (NULL == k) 1.311 + return; 1.312 + strcpy(k, key); 1.313 if ( (0 == subkey_list_limit) || (list_size(subkey_list) < subkey_list_limit) ) 1.314 + { 1.315 list_push_front(subkey_list, k); 1.316 + // printf("list push >%s<\n", k); 1.317 + } 1.318 } 1.319 1.320 //////////////////////////////////////////////////////////////////////////////// 1.321 @@ -532,25 +495,47 @@ 1.322 if ( (NULL == ctrie) || (NULL == prefix) ) 1.323 return NULL; 1.324 1.325 - struct ctrie_node * node = ctrie_find_node(ctrie, prefix); 1.326 + // struct ctrie_node * node = ctrie_find_node(ctrie, prefix); 1.327 + 1.328 + struct ctrie_node * p = NULL; 1.329 + struct ctrie_node * node = ctrie->root; 1.330 + size_t len = strlen(prefix); 1.331 + // printf("len %d\n", len); 1.332 + int index; 1.333 + for (int i = 0; i < len; i += ctrie_str_common(prefix+i, node->key)) 1.334 + { 1.335 + // printf("i %d\n", i); 1.336 + p = node; 1.337 + node = ctrie_longest_matching_child(node, prefix+i); 1.338 + // printf("longest returned %p %d\n", node, strlen(node->key)); 1.339 + if (NULL == node) 1.340 + { 1.341 + // printf("breaking\n"); 1.342 + break; 1.343 + } 1.344 + index = i; 1.345 + } 1.346 + 1.347 if (NULL == node) 1.348 return NULL; 1.349 1.350 + // printf("Found longest match as %p >%s< index is %d\n", node, node->key, index); 1.351 + 1.352 subkey_list = list_new(); 1.353 if (NULL == subkey_list) 1.354 return NULL; 1.355 subkey_list_limit = limit; 1.356 1.357 - size_t len = strlen(prefix); 1.358 + len = strlen(prefix); 1.359 char * pref = malloc(len+1); 1.360 if (NULL == pref) 1.361 return NULL; 1.362 - strncpy(pref, prefix, len-1); 1.363 - pref[len-1] = '\0'; 1.364 - 1.365 - ctrie_visit_node(ctrie, node->parent, ctrie_subtree_list_push, pref); 1.366 + strncpy(pref, prefix, index); 1.367 + pref[index] = '\0'; 1.368 + // printf("Built pref as >%s<\n", pref); 1.369 + ctrie_visit_node(ctrie, node, ctrie_subtree_list_push, pref); 1.370 free(pref); 1.371 - 1.372 + // printf("Returning\n"); 1.373 return subkey_list; 1.374 } 1.375 1.376 @@ -568,3 +553,83 @@ 1.377 1.378 list_delete(list); 1.379 } 1.380 + 1.381 +//////////////////////////////////////////////////////////////////////////////// 1.382 +void * 1.383 +ctrie_remove(struct ctrie * ctrie, char * key) 1.384 + { 1.385 + if ( (NULL == ctrie) || (NULL == key) ) 1.386 + return NULL; 1.387 + 1.388 + // printf("Looking for %s\n", key); 1.389 + struct ctrie_node * node = ctrie_find_node(ctrie->root, key); 1.390 + if (NULL == node) 1.391 + return NULL; 1.392 + 1.393 + // printf("Found node %p ", node); ctrie_print_node(node); 1.394 + 1.395 + // We found a node, but is it an actual data node or just a prefix? 1.396 + if (NULL == node->data) 1.397 + return NULL; 1.398 + 1.399 + // OK, so it's a data node. If it has no children we can delete it. 1.400 + // Otherwise we just remove the data and maybe collapse nodes 1.401 + 1.402 + void * data = node->data; 1.403 + struct ctrie_node * parent = node->parent; 1.404 + 1.405 + if (NULL == node->child) 1.406 + { 1.407 + // Remove from sibling list. 1.408 + if (node->prev_sibling) 1.409 + node->prev_sibling->next_sibling = node->next_sibling; 1.410 + if (node->next_sibling) 1.411 + node->next_sibling->prev_sibling = node->prev_sibling; 1.412 + 1.413 + // Point the parent to the new first child, if needed. 1.414 + if (parent->child == node) 1.415 + { 1.416 + if (node->prev_sibling) 1.417 + parent->child = node->prev_sibling; 1.418 + else 1.419 + parent->child = node->next_sibling; 1.420 + } 1.421 + 1.422 + free(node->key); 1.423 + free(node); 1.424 + ctrie->size--; 1.425 + ctrie->count--; 1.426 + } 1.427 + else 1.428 + { 1.429 + node->data = NULL; 1.430 + ctrie->count--; 1.431 + } 1.432 + 1.433 + 1.434 + // printf("merging nodes\n"); ctrie_dump_raw(ctrie); 1.435 + 1.436 + // At this point we may have nodes that need to be collapsed together. 1.437 + // This happens when the parent has only one child. 1.438 + if (NULL == parent->child->next_sibling) 1.439 + { 1.440 + char * k = malloc(strlen(parent->key) + strlen(parent->child->key) + 1); 1.441 + // UGH! I need a real error signalling mechanism to handle this. 1.442 + if (NULL == k) 1.443 + return data; 1.444 + k[0] = '\0'; 1.445 + strcat(k, parent->key); 1.446 + strcat(k, parent->child->key); 1.447 + free(parent->key); 1.448 + parent->key = k; 1.449 + 1.450 + node = parent->child; 1.451 + parent->child = node->child; 1.452 + free(node->key); 1.453 + free(node); 1.454 + ctrie->size--; 1.455 + } 1.456 + 1.457 + return data; 1.458 + } 1.459 +
2.1 --- a/src/ctrie.h Tue Oct 02 20:00:38 2012 -0500 2.2 +++ b/src/ctrie.h Wed Oct 03 01:07:04 2012 -0500 2.3 @@ -1,7 +1,7 @@ 2.4 #ifndef CTRIE_H_ 2.5 #define CTRIE_H_ 2.6 2.7 -// Compact trie - internal nodes are collapsed together if they have no siblings. 2.8 +// Compact trie - internal nodes are collapsed together if they have no siblings.F 2.9 2.10 struct ctrie; 2.11
3.1 --- a/src/ctrie_test.c Tue Oct 02 20:00:38 2012 -0500 3.2 +++ b/src/ctrie_test.c Wed Oct 03 01:07:04 2012 -0500 3.3 @@ -7,7 +7,7 @@ 3.4 3.5 void print_val(char * key, void * data) 3.6 { 3.7 - printf("%s\t\t%s\n", key, data); 3.8 + printf("%s\t\t%s\n", key, (char *) data); 3.9 } 3.10 3.11 int main(int argc, char ** argv) 3.12 @@ -40,12 +40,16 @@ 3.13 i += 2; 3.14 } 3.15 3.16 - ctrie_dump_raw(ctrie); 3.17 ctrie_dump(ctrie); 3.18 ctrie_walk_keys(ctrie, print_val); 3.19 + ctrie_dump_raw(ctrie); 3.20 printf("ctrie is storing %zd words in %zd keys\n", ctrie_count(ctrie), ctrie_size(ctrie)); 3.21 3.22 - exit(EXIT_SUCCESS); 3.23 + printf("\n\n\n"); 3.24 + ctrie_insert(ctrie, "a", "indefinite article. Used before an initial consonant sound."); 3.25 + ctrie_dump_raw(ctrie); 3.26 + printf("ctrie is storing %zd words in %zd keys\n", ctrie_count(ctrie), ctrie_size(ctrie)); 3.27 + 3.28 3.29 //////////////////////////////////////////// 3.30 printf("===================================================\n"); 3.31 @@ -69,12 +73,12 @@ 3.32 printf("Nothing found for '%s'\n", word); 3.33 else 3.34 { 3.35 - printf("list size is %d\n", list_size(list)); 3.36 + printf("list size is %zd\n", list_size(list)); 3.37 struct list_iterator * it = list_begin(list); 3.38 struct list_iterator * next = it; 3.39 while (NULL != next) 3.40 { 3.41 - printf("%s\n", list_value(it)); 3.42 + printf("%s\n", (char *) list_value(it)); 3.43 next = list_next(it); 3.44 } 3.45 ctrie_free_subkey_list(list); 3.46 @@ -89,23 +93,67 @@ 3.47 printf("Nothing found for '%s'\n", word); 3.48 else 3.49 { 3.50 - printf("list size is %d\n", list_size(list)); 3.51 + printf("list size is %zd\n", list_size(list)); 3.52 struct list_iterator * it = list_begin(list); 3.53 struct list_iterator * next = it; 3.54 while (NULL != next) 3.55 { 3.56 - printf("%s\n", list_value(it)); 3.57 + printf("%s\n", (char *) list_value(it)); 3.58 next = list_next(it); 3.59 } 3.60 ctrie_free_subkey_list(list); 3.61 } 3.62 3.63 + 3.64 + //////////////////////////////////////////// 3.65 + printf("===================================================\n"); 3.66 + strcpy(word, "he"); 3.67 + printf("Looking for completions of '%s':\n", word); 3.68 + list = ctrie_get_subkeys(ctrie, word, 0); 3.69 + if (NULL == list) 3.70 + printf("Nothing found for '%s'\n", word); 3.71 + else 3.72 + { 3.73 + printf("list size is %zd\n", list_size(list)); 3.74 + struct list_iterator * it = list_begin(list); 3.75 + struct list_iterator * next = it; 3.76 + while (NULL != next) 3.77 + { 3.78 + printf("%s\n", (char *) list_value(it)); 3.79 + next = list_next(it); 3.80 + } 3.81 + ctrie_free_subkey_list(list); 3.82 + } 3.83 + 3.84 + 3.85 + //////////////////////////////////////////// 3.86 + printf("===================================================\n"); 3.87 + strcpy(word, "helpe"); 3.88 + printf("Looking for completions of '%s':\n", word); 3.89 + list = ctrie_get_subkeys(ctrie, word, 0); 3.90 + if (NULL == list) 3.91 + printf("Nothing found for '%s'\n", word); 3.92 + else 3.93 + { 3.94 + printf("list size is %zd\n", list_size(list)); 3.95 + struct list_iterator * it = list_begin(list); 3.96 + struct list_iterator * next = it; 3.97 + while (NULL != next) 3.98 + { 3.99 + printf("%s\n", (char *) list_value(it)); 3.100 + next = list_next(it); 3.101 + } 3.102 + ctrie_free_subkey_list(list); 3.103 + } 3.104 + 3.105 + 3.106 //////////////////////////////////////////// 3.107 printf("===================================================\n"); 3.108 strcpy(word, "anybody"); 3.109 printf("remove '%s'\n", word); 3.110 ctrie_remove(ctrie, word); 3.111 3.112 + // remove a word that is not in the dictionary, but is a prefix of other words. Should do nothing. 3.113 strcpy(word, "ann"); 3.114 printf("remove '%s'\n", word); 3.115 ctrie_remove(ctrie, word); 3.116 @@ -118,9 +166,16 @@ 3.117 printf("remove '%s'\n", word); 3.118 ctrie_remove(ctrie, word); 3.119 3.120 - ctrie_dump(ctrie); 3.121 + strcpy(word, "an"); 3.122 + printf("remove '%s'\n", word); 3.123 + ctrie_remove(ctrie, word); 3.124 + 3.125 + ctrie_dump_raw(ctrie); 3.126 printf("ctrie is storing %zd words in %zd keys\n", ctrie_count(ctrie), ctrie_size(ctrie)); 3.127 3.128 + 3.129 + exit(EXIT_SUCCESS); 3.130 + 3.131 //////////////////////////////////////////// 3.132 printf("===================================================\n"); 3.133 strcpy(word, "anybody");