diff --git a/Makefile b/Makefile
index f1f4f1396..499239fd9 100644
--- a/Makefile
+++ b/Makefile
@@ -131,6 +131,7 @@ SUBDIR += tests/pcre-flags
 SUBDIR += tests/pcre-repeat
 SUBDIR += tests/pred
 SUBDIR += tests/re_literal
+SUBDIR += tests/re_strings
 SUBDIR += tests/reverse
 SUBDIR += tests/trim
 SUBDIR += tests/union
diff --git a/include/fsm/fsm.h b/include/fsm/fsm.h
index 7c3883749..2b4c438f1 100644
--- a/include/fsm/fsm.h
+++ b/include/fsm/fsm.h
@@ -207,6 +207,12 @@ fsm_setend(struct fsm *fsm, fsm_state_t state, int end);
 int
 fsm_setendid(struct fsm *fsm, fsm_end_id_t id);
 
+/* Associate a numeric ID with a specific end state in an fsm.
+ * Returns 1 on success, 0 on error.
+ * */
+int
+fsm_setendidstate(struct fsm *fsm, fsm_state_t end_state, fsm_end_id_t id);
+
 /* Get the end IDs associated with an end state, if any.
  * If id_buf has enough cells to store all the end IDs (according
  * to id_buf_count) then they are written into id_buf[] and
diff --git a/include/re/strings.h b/include/re/strings.h
index 06387a54c..fe4f7adc2 100644
--- a/include/re/strings.h
+++ b/include/re/strings.h
@@ -7,7 +7,7 @@
 #ifndef RE_STRINGS_H
 #define RE_STRINGS_H
 
-struct fsm;
+#include <fsm/fsm.h>
 struct fsm_options;
 struct re_strings;
 
@@ -42,10 +42,10 @@ void
 re_strings_free(struct re_strings *g);
 
 int
-re_strings_add_raw(struct re_strings *g, const void *p, size_t n);
+re_strings_add_raw(struct re_strings *g, const void *p, size_t n, const fsm_end_id_t *endid);
 
 int
-re_strings_add_str(struct re_strings *g, const char *s);
+re_strings_add_str(struct re_strings *g, const char *s, const fsm_end_id_t *endid);
 
 struct fsm *
 re_strings_build(struct re_strings *g,
diff --git a/src/libfsm/endids.c b/src/libfsm/endids.c
index 444ccbc2e..1fc98ae68 100644
--- a/src/libfsm/endids.c
+++ b/src/libfsm/endids.c
@@ -84,6 +84,16 @@ fsm_setendid(struct fsm *fsm, fsm_end_id_t id)
 	return 1;
 }
 
+int
+fsm_setendidstate(struct fsm *fsm, fsm_state_t end_state, fsm_end_id_t id)
+{
+	enum fsm_endid_set_res sres = fsm_endid_set(fsm, end_state, id);
+	if (sres == FSM_ENDID_SET_ERROR_ALLOC_FAIL) {
+		return 0;
+	}
+	return 1;
+}
+
 enum fsm_getendids_res
 fsm_getendids(const struct fsm *fsm, fsm_state_t end_state,
     size_t id_buf_count, fsm_end_id_t *id_buf,
diff --git a/src/libfsm/libfsm.syms b/src/libfsm/libfsm.syms
index 415bffbea..a2570b8c9 100644
--- a/src/libfsm/libfsm.syms
+++ b/src/libfsm/libfsm.syms
@@ -89,6 +89,7 @@ fsm_getendids
 fsm_setendid
 fsm_mapendids
 fsm_increndids
+fsm_setendidstate
 
 fsm_countedges
 fsm_countstates
diff --git a/src/libre/ac.c b/src/libre/ac.c
--- a/src/libre/ac.c
+++ b/src/libre/ac.c
@@ -13,6 +13,7 @@
 
 #include <fsm/fsm.h>
 #include <fsm/options.h>
+#include <adt/stateset.h>
 
 #include "ac.h"
 
@@ -21,6 +22,9 @@ enum { POOL_BLOCK_SIZE = 256 };
 struct trie_state {
 	struct trie_state *children[256];
 	struct trie_state *fail;
+	/* use a state set as an endid set */
+	struct state_set *endids;
+
 	fsm_state_t st;
 	unsigned int index;
 	unsigned int output:1;
@@ -73,6 +77,7 @@ newstate(struct trie_graph *g)
 
 	st->index = ++g->nstates;
 	st->output = 0;
+	st->endids = NULL;
 
 	return st;
 }
@@ -86,6 +91,10 @@ cleanup_pool(struct trie_graph *g)
 		p = g->pool;
 		g->pool = p->next;
 
+		for (size_t i = 0; i < p->n; i++) {
+			state_set_free(p->states[i].endids);
+		}
+
 		free(p->states);
 		free(p);
 	}
@@ -126,7 +135,7 @@ trie_create(void)
 }
 
 struct trie_graph *
-trie_add_word(struct trie_graph *g, const char *w, size_t n)
+trie_add_word(struct trie_graph *g, const char *w, size_t n, const fsm_end_id_t *endid)
 {
 	struct trie_state *st;
 	size_t i;
@@ -159,6 +168,13 @@ trie_add_word(struct trie_graph *g, const char *w, size_t n)
 		g->depth = n;
 	}
 
+	if (endid != NULL) {
+		/* a zero return means the id was not stored; callers
+		 * treat a NULL result from this function as failure */
+		if (!state_set_add(&st->endids, NULL, (fsm_state_t)*endid)) {
+			return NULL;
+		}
+	}
 	return g;
 }
 
@@ -278,7 +294,7 @@ trie_to_fsm_state(struct trie_state *ts, struct fsm *fsm,
 	assert(fsm != NULL);
 	assert(q != NULL);
 
-	if (ts->output && have_end) {
+	if (ts->output && have_end && state_set_empty(ts->endids)) {
 		*q = single_end;
 		return 1;
 	}
@@ -315,6 +331,16 @@ trie_to_fsm_state(struct trie_state *ts, struct fsm *fsm,
 
 	if (ts->output) {
 		fsm_setend(fsm, st, 1);
+
+		struct state_iter si;
+		fsm_state_t state;
+		state_set_reset(ts->endids, &si);
+		while (state_set_next(&si, &state)) {
+			fsm_end_id_t endid = (fsm_end_id_t)state;
+			if (!fsm_setendidstate(fsm, st, endid)) {
+				return 0;
+			}
+		}
 	}
 
 	*q = st;
diff --git a/src/libre/ac.h b/src/libre/ac.h
index 480c76bfb..edabca27c 100644
--- a/src/libre/ac.h
+++ b/src/libre/ac.h
@@ -7,6 +7,8 @@
 #ifndef AC_H
 #define AC_H
 
+#include "fsm/fsm.h"
+
 struct fsm;
 struct fsm_state;
 struct fsm_options;
@@ -20,7 +22,8 @@ void
 trie_free(struct trie_graph *g);
 
 struct trie_graph *
-trie_add_word(struct trie_graph *g, const char *w, size_t n);
+trie_add_word(struct trie_graph *g, const char *w, size_t n,
+	const fsm_end_id_t *endid);
 
 int
 trie_add_failure_edges(struct trie_graph *g);
diff --git a/src/libre/re_strings.c b/src/libre/re_strings.c
index 06b7dc772..b2f04fec1 100644
--- a/src/libre/re_strings.c
+++ b/src/libre/re_strings.c
@@ -32,7 +32,7 @@ re_strings(const struct fsm_options *opt, const char *a[], size_t n,
 	}
 
 	for (i = 0; i < n; i++) {
-		if (!re_strings_add_str(g, a[i])) {
+		if (!re_strings_add_str(g, a[i], NULL)) {
 			goto error;
 		}
 	}
@@ -64,20 +64,20 @@ re_strings_free(struct re_strings *g)
 }
 
 int
-re_strings_add_raw(struct re_strings *g, const void *p, size_t n)
+re_strings_add_raw(struct re_strings *g, const void *p, size_t n, const fsm_end_id_t *endid)
 {
 	assert(p != NULL);
 	assert(n > 0);
 
-	return trie_add_word((struct trie_graph *) g, p, n) != NULL;
+	return trie_add_word((struct trie_graph *) g, p, n, endid) != NULL;
 }
 
 int
-re_strings_add_str(struct re_strings *g, const char *s)
+re_strings_add_str(struct re_strings *g, const char *s, const fsm_end_id_t *endid)
 {
 	assert(s != NULL);
 
-	return re_strings_add_raw(g, s, strlen(s));
+	return re_strings_add_raw(g, s, strlen(s), endid);
 }
 
 struct fsm *
diff --git a/tests/re_strings/Makefile b/tests/re_strings/Makefile
new file mode 100644
index 000000000..7fc7f2548
--- /dev/null
+++ b/tests/re_strings/Makefile
@@ -0,0 +1,26 @@
+.include "../../share/mk/top.mk"
+
+TEST.tests/re_strings != ls -1 tests/re_strings/re_strings*.c
+TEST_SRCDIR.tests/re_strings = tests/re_strings
+TEST_OUTDIR.tests/re_strings = ${BUILD}/tests/re_strings
+
+.for n in ${TEST.tests/re_strings:T:R:C/^re_strings//}
+test:: ${TEST_OUTDIR.tests/re_strings}/res${n}
+SRC += ${TEST_SRCDIR.tests/re_strings}/re_strings${n}.c
+CFLAGS.${TEST_SRCDIR.tests/re_strings}/re_strings${n}.c = -UNDEBUG
+
+${TEST_OUTDIR.tests/re_strings}/run${n}: ${TEST_OUTDIR.tests/re_strings}/re_strings${n}.o ${TEST_OUTDIR.tests/re_strings}/testutil.o
+	${CC} ${CFLAGS} -o ${TEST_OUTDIR.tests/re_strings}/run${n} ${TEST_OUTDIR.tests/re_strings}/re_strings${n}.o ${TEST_OUTDIR.tests/re_strings}/testutil.o ${BUILD}/lib/libfsm.a ${BUILD}/lib/libre.a
+
+${TEST_OUTDIR.tests/re_strings}/re_strings${n}.o: tests/re_strings/testutil.h
+
+${TEST_OUTDIR.tests/re_strings}/res${n}: ${TEST_OUTDIR.tests/re_strings}/run${n}
+	( ${TEST_OUTDIR.tests/re_strings}/run${n} 1>&2 && echo PASS || echo FAIL ) > ${TEST_OUTDIR.tests/re_strings}/res${n}
+
+.for lib in ${LIB:Mlibfsm} ${LIB:Mlibre}
+${TEST_OUTDIR.tests/re_strings}/run${n}: ${BUILD}/lib/${lib:R}.a
+.endfor
+.endfor
+
+${TEST_OUTDIR.tests/re_strings}/testutil.o: tests/re_strings/testutil.c
+	${CC} ${CFLAGS} -c -o ${TEST_OUTDIR.tests/re_strings}/testutil.o tests/re_strings/testutil.c
diff --git a/tests/re_strings/re_strings1.c b/tests/re_strings/re_strings1.c
new file mode 100644
index 000000000..44f10d41d
--- /dev/null
+++ b/tests/re_strings/re_strings1.c
@@ -0,0 +1,21 @@
+#include "testutil.h"
+
+const char *strings[] = {
+	"aa",
+	"ab",
+	"ac",
+	"ba",
+	"bb",
+	"bc",
+	"ca",
+	"cb",
+	"cc",
+	NULL,
+};
+
+int main(int argc, char **argv)
+{
+	(void)argc;
+	(void)argv;
+	return run_test(strings);
+}
diff --git a/tests/re_strings/re_strings2.c b/tests/re_strings/re_strings2.c
new file mode 100644
index 000000000..6e4c80459
--- /dev/null
+++ b/tests/re_strings/re_strings2.c
@@ -0,0 +1,17 @@
+#include "testutil.h"
+
+const char *strings[] = {
+	"first",
+	"duplicate",
+	"duplicate",
+	"duplicate",
+	"last",
+	NULL,
+};
+
+int main(int argc, char **argv)
+{
+	(void)argc;
+	(void)argv;
+	return run_test(strings);
+}
diff --git a/tests/re_strings/re_strings3.c b/tests/re_strings/re_strings3.c
new file mode 100644
index 000000000..3f4f1d052
--- /dev/null
+++ b/tests/re_strings/re_strings3.c
@@ -0,0 +1,15 @@
+#include "testutil.h"
+
+const char *strings[] = {
+	"duplicate",
+	"duplicate",
+	"duplicate",
+	NULL,
+};
+
+int main(int argc, char **argv)
+{
+	(void)argc;
+	(void)argv;
+	return run_test(strings);
+}
diff --git a/tests/re_strings/re_strings4.c b/tests/re_strings/re_strings4.c
new file mode 100644
index 000000000..b1e00e70f
--- /dev/null
+++ b/tests/re_strings/re_strings4.c
@@ -0,0 +1,13 @@
+#include "testutil.h"
+
+const char *strings[] = {
+	/* empty */
+	NULL,
+};
+
+int main(int argc, char **argv)
+{
+	(void)argc;
+	(void)argv;
+	return run_test(strings);
+}
diff --git a/tests/re_strings/testutil.c b/tests/re_strings/testutil.c
new file mode 100644
--- /dev/null
+++ b/tests/re_strings/testutil.c
@@ -0,0 +1,73 @@
+#include "testutil.h"
+
+#include <stddef.h>
+#include <stdio.h>
+
+#include "fsm/fsm.h"
+#include "fsm/options.h"
+
+#include "re/re.h"
+#include "re/strings.h"
+
+static struct fsm_options opt;
+
+#define MAX_INPUTS 100
+static fsm_end_id_t id_buf[MAX_INPUTS];
+
+int
+run_test(const char **strings)
+{
+	struct re_strings *s = re_strings_new();
+	assert(s != NULL);
+
+	fsm_end_id_t id = 0;
+	const char **input = strings;
+	while (*input != NULL) {
+		if (!re_strings_add_str(s, *input, &id)) {
+			assert(!"re_strings_add_str");
+		}
+
+		input++;
+		id++;
+		assert(id < MAX_INPUTS);
+	}
+
+	const int flags = 0; /* not anchored */
+
+	struct fsm *fsm = re_strings_build(s, &opt, flags);
+	assert(fsm != NULL);
+
+	/* Each literal string input should match, and the set of
+	 * matching endids should include the expected one. */
+	id = 0;
+	input = strings;
+	while (*input != NULL) {
+		fsm_state_t end;
+		/* fsm_sgetc advances the pointer it is handed, so walk a
+		 * local cursor rather than the caller's array element. */
+		const char *cursor = *input;
+		const int res = fsm_exec(fsm, fsm_sgetc, &cursor, &end, NULL);
+		assert(res > 0); /* match */
+
+		size_t written;
+		enum fsm_getendids_res eres = fsm_getendids(fsm, end,
+		    MAX_INPUTS, id_buf, &written);
+		assert(eres == FSM_GETENDIDS_FOUND);
+		bool found = false;
+		for (size_t i = 0; i < written; i++) {
+			if (id_buf[i] == id) {
+				found = true;
+				break;
+			}
+		}
+		assert(found);
+
+		input++;
+		id++;
+	}
+
+	re_strings_free(s);
+	fsm_free(fsm);
+
+	return EXIT_SUCCESS;
+}
diff --git a/tests/re_strings/testutil.h b/tests/re_strings/testutil.h
new file mode 100644
--- /dev/null
+++ b/tests/re_strings/testutil.h
@@ -0,0 +1,15 @@
+#ifndef TESTUTIL_H
+#define TESTUTIL_H
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdlib.h>
+
+/* Build an unanchored matcher from a NULL-terminated array of strings,
+ * tagging each string with its array index as an end id, then check that
+ * every string matches and that its end state's ids include that index.
+ * Failures are reported via assert(); returns EXIT_SUCCESS otherwise. */
+int
+run_test(const char **strings);
+
+#endif