view string_search/kmp.c @ 12:5b67bf33fccc

Added comments. Broke out file read toa function slurp_file().
author Eris Caffee <discordia@eldalin.com>
date Tue, 18 Aug 2015 14:26:12 -0500
parents b8cfd6af18b3
children
line source
1 /* Demo client to compare brute force search against Knuth-Morris-Pratt string search.
2 *
3 * gcc --std=c99 kmp.c
4 *
5 * a.out "string to find" file_to_search
6 *
7 */
9 #define _GNU_SOURCE 1
10 #include <stdio.h>
11 #include <string.h>
12 #include <stddef.h>
13 #include <stdlib.h>
14 #include <sys/time.h>
16 /******************************************************************************/
17 /* Knuth-Morris-Pratt algorithm
18 *
19 * As described on Wikipedia
20 * https://en.wikipedia.org/wiki/Knuth%E2%80%93Morris%E2%80%93Pratt_algorithm
21 */
23 size_t kmp_search( char *haystack, char *needle ) {
25 size_t haystack_length = strlen(haystack);
26 size_t needle_length = strlen(needle);
28 /* Build the backup table */
30 int pos = 2;
31 int cnd = 0;
33 needle_length = strlen( needle );
35 int *table = malloc( needle_length * sizeof(int) );
36 table[0] = -1;
37 table[1] = 0;
39 while ( pos < needle_length ) {
40 if ( needle[pos-1] == needle[cnd] ) {
41 cnd++;
42 table[pos] = cnd;
43 pos++;
44 }
45 else if ( cnd > 0 ) {
46 cnd = table[cnd];
47 }
48 else {
49 table[pos] = 0;
50 pos++;
51 }
52 }
54 /* Do the actual search */
56 size_t m = 0;
57 size_t i = 0;
59 while ( m + i < haystack_length ) {
60 if ( needle[i] == haystack[m+i] ) {
61 if ( i == needle_length - 1 ) {
62 return m;
63 }
64 i++;
65 }
66 else {
67 if ( table[i] > -1 ) {
68 m = m + i - table[i];
69 i = table[i];
70 } else {
71 i = 0;
72 m++;
73 }
74 }
75 }
77 return -1;
78 }
80 /******************************************************************************/
81 /* brute force search
82 */
84 size_t find_string ( char *haystack, char * needle ) {
86 int needle_length = strlen( needle );
87 int haystack_length = strlen( haystack );
88 for ( size_t i = 0; i < haystack_length - needle_length; i++) {
89 if ( strncmp( needle, &haystack[i], needle_length ) == 0 ) {
90 return i;
91 }
92 }
93 return -1;
94 }
96 /******************************************************************************/
97 /* Calculate the difference between two struct timeval's.
98 *
99 * OUT: struct timeval *elapsed The calculated difference.
100 * IN: struct timeval *start The starting time;
101 * IN: struct timeval *end The ending time;
102 */
104 void timevaldiff( struct timeval *elapsed, struct timeval *start, struct timeval *end ) {
105 time_t sec;
106 suseconds_t usec;
108 sec = end->tv_sec - start->tv_sec;
109 usec = end->tv_usec - start->tv_usec;
111 while ( usec > 1000000 ) {
112 sec += 1;
113 usec -= 1000000;
114 }
116 elapsed->tv_sec = sec;
117 elapsed->tv_usec = usec;
118 }
120 /******************************************************************************/
121 /* Read the entire contents of a file into memory.
122 *
123 * IN: char *filename The name of the file to read. It will be open in "rb"
124 * mode.
125 * IN: char **buffer The address of a char * which will be set to the start
126 * of the memory buffer containing the contents of the
127 * file. You should call free() on this when you are done.
128 *
129 * Return value: The amount of data read. Note that the data in the
130 * buffer will be null terminated, but this may be
131 * meaningless if the file is a binary file, so you should
132 * use the return value when iterating through the data.
133 */
135 size_t slurp_file( char *filename, char **buffer ) {
137 FILE * file;
138 char temp[65536];
139 size_t bytes_read;
140 size_t buffer_top = 0;
141 size_t buffer_max = 1;
143 *buffer = malloc( sizeof( char ) );
145 file = fopen( filename, "rb" );
146 while ( bytes_read = fread( temp, sizeof( char ), sizeof( temp ), file ) ) {
147 while ( buffer_max - buffer_top < bytes_read ) {
148 size_t new_buffer_max = 2 * buffer_max;
149 char * new_buffer = malloc( new_buffer_max );
150 for ( size_t i = 0; i < buffer_max; i++ ) {
151 new_buffer[i] = (*buffer)[i];
152 }
153 free( *buffer );
154 *buffer = new_buffer;
155 buffer_max = new_buffer_max;
156 }
157 memcpy( &(*buffer)[ buffer_top ], temp, bytes_read );
158 buffer_top += bytes_read;
159 }
161 (*buffer)[ buffer_top + 1 ] = 0;
163 fclose( file );
165 return buffer_top;
166 }
168 /******************************************************************************/
170 int main( int argc, char **argv ) {
171 char *def_needle = "ABCDABD";
172 char *def_haystack = "ABC ABCDAB ABCDABCDABDE";
174 char * needle;
175 char * haystack;
176 size_t haystack_max;
177 size_t haystack_top = 0;
179 if ( argc >= 2 ) {
180 needle = argv[1];
181 }
182 else {
183 needle = def_needle;
184 }
186 size_t filesize;
187 if ( argc >= 3 ) {
188 filesize = slurp_file( argv[2], &haystack );
189 }
190 else {
191 haystack = strdup( def_haystack );
192 filesize = strlen( def_haystack );
193 }
195 struct timeval start;
196 struct timeval end;
197 struct timeval elapsed;
198 size_t pos;
199 char * found;
201 gettimeofday( &start, NULL );
202 pos = kmp_search( haystack, needle );
203 gettimeofday( &end, NULL );
204 timevaldiff( &elapsed, &start, &end );
206 printf( "pos %d\n", pos );
207 found = strndup( &haystack[pos], strlen(needle)+80 );
208 printf( "found >%s<\n", found );
209 printf( "kmp found in %d.%06d seconds\n", elapsed.tv_sec, elapsed.tv_usec );
212 gettimeofday( &start, NULL );
213 pos = find_string( haystack, needle );
214 gettimeofday( &end, NULL );
215 timevaldiff( &elapsed, &start, &end );
217 printf( "pos %d\n", pos );
218 found = strndup( &haystack[pos], strlen(needle)+80 );
219 printf( "found >%s<\n", found );
220 printf( "brute force found in %d.%06d seconds\n", elapsed.tv_sec, elapsed.tv_usec );
222 free(haystack);
224 exit( EXIT_SUCCESS );
225 }