diff --git a/Makefile b/Makefile index a4d6c6c72..b356c4f0f 100644 --- a/Makefile +++ b/Makefile @@ -116,6 +116,7 @@ SUBDIR += tests/intersect SUBDIR += tests/eclosure SUBDIR += tests/equals SUBDIR += tests/subtract +SUBDIR += tests/detect_required SUBDIR += tests/determinise SUBDIR += tests/eager_output SUBDIR += tests/endids diff --git a/include/adt/bitmap.h b/include/adt/bitmap.h index ac4a94995..61c4f9a57 100644 --- a/include/adt/bitmap.h +++ b/include/adt/bitmap.h @@ -8,6 +8,9 @@ #define ADT_BITMAP_H #include +#include + +#include #include "print/esc.h" struct fsm_state; @@ -23,6 +26,9 @@ bm_get(const struct bm *bm, size_t i); void bm_set(struct bm *bm, size_t i); +void +bm_unset(struct bm *bm, size_t i); + /* Get a writeable pointer to the Nth word of the char set bitmap, * or NULL if out of bounds. */ uint64_t * @@ -51,5 +57,17 @@ bm_snprint(const struct bm *bm, const struct fsm_options *opt, int boxed, escputc *escputc); +void +bm_copy(struct bm *dst, const struct bm *src); + +void +bm_intersect(struct bm *dst, const struct bm *src); + +void +bm_union(struct bm *dst, const struct bm *src); + +int +bm_any(const struct bm *bm); + #endif diff --git a/include/fsm/walk.h b/include/fsm/walk.h index b433380d5..e1ab5f29e 100644 --- a/include/fsm/walk.h +++ b/include/fsm/walk.h @@ -7,6 +7,8 @@ #ifndef FSM_WALK_H #define FSM_WALK_H +#include + struct fsm; struct fsm_state; @@ -128,5 +130,34 @@ fsm_generate_matches_cb fsm_generate_cb_printf; * to escape all characters or just nonprintable ones. */ fsm_generate_matches_cb fsm_generate_cb_printf_escaped; +/* Walk a DFA and detect which characters MUST appear in the input for a + * match to be possible. For example, if input for the DFA corresponding + * to /^(abc|dbe)$/ does not contain 'b' at all, there's no way it can + * ever match, so executing the regex is unnecessary. This does not detect + * which characters must appear before/after others or how many times, just + * which must be present. 
+ * + * The input must be a DFA. When run with EXPENSIVE_CHECKS this will + * check and return ERROR_MISUSE if it is not, otherwise this is an + * unchecked error. + * + * The character map will be cleared before populating. If count is + * non-NULL it will be updated with how many required characters were + * found. + * + * There is an optional step_limit -- if this is reached, then it will + * return FSM_DETECT_REQUIRED_CHARACTERS_STEP_LIMIT_REACHED and a + * cleared bitmap, because any partial information could still have been + * contradicted later. If the step_limit is 0 it will be ignored. */ +enum fsm_detect_required_characters_res { + FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN, + FSM_DETECT_REQUIRED_CHARACTERS_STEP_LIMIT_REACHED, + FSM_DETECT_REQUIRED_CHARACTERS_ERROR_MISUSE = -1, + FSM_DETECT_REQUIRED_CHARACTERS_ERROR_ALLOC = -2, +}; +enum fsm_detect_required_characters_res +fsm_detect_required_characters(const struct fsm *dfa, size_t step_limit, + uint64_t charmap[4], size_t *count); + #endif diff --git a/man/fsm.1/fsm.1.xml b/man/fsm.1/fsm.1.xml index 6df32e186..776aaaab9 100644 --- a/man/fsm.1/fsm.1.xml +++ b/man/fsm.1/fsm.1.xml @@ -33,6 +33,7 @@ -G &length.arg;"> -k &io.arg;"> -i &iterations.arg;"> + -S &limit.arg;"> -U &charset.arg;"> -X"> @@ -325,6 +326,14 @@ + + &S.opt; + + + Set a step limit for long-running operations. + + + &t.opt; @@ -487,6 +496,13 @@ of each state in the &fsm;. Printed to &stdout.lit;; exit status is always true. + + requiredchars + – + Determine characters that must appear in any + inputs that could match the &fsm;. Exit status is true + unless the step limit was reached. 
+ diff --git a/src/adt/bitmap.c b/src/adt/bitmap.c index 434dfc222..1fb016083 100644 --- a/src/adt/bitmap.c +++ b/src/adt/bitmap.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -34,6 +35,15 @@ bm_set(struct bm *bm, size_t i) u64bitset_set(bm->map, i); } +void +bm_unset(struct bm *bm, size_t i) +{ + assert(bm != NULL); + assert(i <= UCHAR_MAX); + + u64bitset_clear(bm->map, i); +} + uint64_t * bm_nth_word(struct bm *bm, size_t n) { @@ -325,3 +335,34 @@ bm_snprint(const struct bm *bm, const struct fsm_options *opt, return -1; } + +void +bm_copy(struct bm *dst, const struct bm *src) +{ + memcpy(dst, src, sizeof(*src)); +} + +void +bm_intersect(struct bm *dst, const struct bm *src) +{ + for (size_t i = 0; i < sizeof(src->map)/sizeof(src->map[0]); i++) { + dst->map[i] &= src->map[i]; + } +} + +void +bm_union(struct bm *dst, const struct bm *src) +{ + for (size_t i = 0; i < sizeof(src->map)/sizeof(src->map[0]); i++) { + dst->map[i] |= src->map[i]; + } +} + +int +bm_any(const struct bm *bm) +{ + for (size_t i = 0; i < sizeof(bm->map)/sizeof(bm->map[0]); i++) { + if (bm->map[i]) { return 1; } + } + return 0; +} diff --git a/src/fsm/main.c b/src/fsm/main.c index 194a18bcd..da65791dd 100644 --- a/src/fsm/main.c +++ b/src/fsm/main.c @@ -25,6 +25,7 @@ #include #include /* XXX */ +#include #include "libfsm/internal.h" /* XXX */ @@ -101,6 +102,16 @@ query_epsilonclosure(const struct fsm *fsm, fsm_state_t state) abort(); } +static int +query_required_chars(const struct fsm *fsm, fsm_state_t state) +{ + (void) fsm; + (void) state; + + /* never called */ + abort(); +} + static void usage(void) { @@ -227,7 +238,9 @@ static int { "hasambiguity", fsm_has, fsm_hasnondeterminism }, { "hasnondeterminism", fsm_has, fsm_hasnondeterminism }, { "hasepsilons", fsm_has, fsm_hasepsilons }, - { "epsilons", fsm_has, fsm_hasepsilons } + { "epsilons", fsm_has, fsm_hasepsilons }, + { "requiredchars", NULL, query_required_chars }, + { "chars", NULL, query_required_chars 
}, }; assert(name != NULL); @@ -378,6 +391,7 @@ main(int argc, char *argv[]) int xfiles; int r; size_t generate_bounds = 0; + size_t step_limit = 0; int (*query)(const struct fsm *, fsm_state_t); int (*walk )(const struct fsm *, @@ -404,7 +418,7 @@ main(int argc, char *argv[]) { int c; - while (c = getopt(argc, argv, "h" "aCcgwXe:k:i:" "xpq:l:dG:mrt:EU:W:"), c != -1) { + while (c = getopt(argc, argv, "h" "aCcgwXe:k:i:" "xpq:l:dG:mrt:ES:U:W:"), c != -1) { switch (c) { case 'a': opt.anonymous_states = 1; break; case 'c': opt.consolidate_edges = 1; break; @@ -451,6 +465,10 @@ main(int argc, char *argv[]) } break; + case 'S': + step_limit = strtoul(optarg, NULL, 10); + break; /* can be 0 */ + case 'h': usage(); exit(EXIT_SUCCESS); @@ -669,6 +687,34 @@ main(int argc, char *argv[]) closure_free(fsm, closures, fsm->statecount); return 0; + } else if (query == query_required_chars) { + assert(walk == NULL); + uint64_t charmap[4] = {0}; /* not left indeterminate if the call fails early */ + size_t count = 0; /* only written by the callee on success */ + enum fsm_detect_required_characters_res res; + res = fsm_detect_required_characters(fsm, step_limit, charmap, &count); + if (res == FSM_DETECT_REQUIRED_CHARACTERS_STEP_LIMIT_REACHED) { + fprintf(stderr, "fsm_detect_required_characters: step limit reached (%zu)\n", step_limit); + exit(EXIT_FAILURE); + } else { + assert(res == FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN); + char buf[257] = {0}; + size_t used = 0; + for (size_t i = 0; i < 256; i++) { + if (u64bitset_get(charmap, i)) { + buf[used++] = (char)i; + } + } + printf("%zu ", count); /* size_t takes %zu, not %zd */ + for (size_t i = 0; i < used; i++) { + c_escputc_str(stdout, &opt, buf[i]); + } + printf("\n"); + + fsm_free(fsm); + fsm_to_cleanup = NULL; + return EXIT_SUCCESS; + } } else { assert(walk != NULL); r |= !walk(fsm, query); diff --git a/src/libfsm/Makefile b/src/libfsm/Makefile index ca056518a..c7782f0ff 100644 --- a/src/libfsm/Makefile +++ b/src/libfsm/Makefile @@ -7,6 +7,7 @@ SRC += src/libfsm/complete.c SRC += src/libfsm/consolidate.c SRC += src/libfsm/clone.c SRC += src/libfsm/closure.c +SRC += 
src/libfsm/detect_required.c SRC += src/libfsm/eager_output.c SRC += src/libfsm/edge.c SRC += src/libfsm/empty.c diff --git a/src/libfsm/detect_required.c b/src/libfsm/detect_required.c new file mode 100644 index 000000000..8488376fc --- /dev/null +++ b/src/libfsm/detect_required.c @@ -0,0 +1,275 @@ +/* + * Copyright 2024 Scott Vokes + * + * See LICENCE for the full copyright terms. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include "internal.h" + +#define LOG_BASE 0 +#define LOG_PROGRESS (LOG_BASE + 0) +#define LOG_STEPS (LOG_BASE + 0) + +/* More than one label */ +#define LABEL_GROUP ((uint16_t)-1) + +struct dr_env { + const struct fsm *dfa; + size_t steps; + + /* Number of times a unique label has been required -- this is a count so that going + * from 0 <-> 1 can set/clear the accumulator, but going from 1 -> 2 etc. does not. */ + size_t counts[256]; + struct bm current; + bool first_end_state; + struct bm overall; + + struct dr_stack { + size_t used; + size_t ceil; + struct stack_frame { + fsm_state_t state; + uint16_t label; /* unique label followed to get here, or LABEL_GROUP */ + struct edge_group_iter iter; + } *frames; + } stack; +}; + +#define DEF_STACK_FRAMES 16 + +/* Check symbols[]: if there's more than one bit set, then set label to + * LABEL_GROUP, otherwise set it to the single bit's character value. + * At least one bit must be set. 
*/ +static void check_symbols(const struct edge_group_iter_info *info, uint16_t *label) +{ + bool any = false; + + for (size_t i = 0; i < 256/64; i++) { + uint64_t w = info->symbols[i]; + if (w == 0) { continue; } + + /* get position of lowest set bit */ + for (size_t b = 0; b < 64; b++) { + const uint64_t bit = 1ULL << b; + if (w & bit) { + if (any) { + *label = LABEL_GROUP; + return; + } + + /* clear it, check if there's anything else set */ + w &= ~bit; + if (w != 0) { + *label = LABEL_GROUP; + return; + } + + *label = 64*i + b; + any = true; + break; + } + } + } + + /* there must be at least one bit set */ + assert(any); +} + +/* Walk a DFA and attempt to detect which characters must appear in any input to match. + * This finds the intersection of characters required on any start->end paths (tracking + * edges with only one label that must be followed by all matches), so it can take + * prohibitively long for large/complex DFAs. */ +enum fsm_detect_required_characters_res +fsm_detect_required_characters(const struct fsm *dfa, size_t step_limit, + uint64_t charmap[4], size_t *count) +{ + assert(dfa != NULL); + assert(charmap != NULL); + + #if EXPENSIVE_CHECKS + if (!fsm_all(dfa, fsm_isdfa)) { + return FSM_DETECT_REQUIRED_CHARACTERS_ERROR_MISUSE; + } + #endif + + enum fsm_detect_required_characters_res res = FSM_DETECT_REQUIRED_CHARACTERS_ERROR_ALLOC; + + struct dr_env env = { + .dfa = dfa, + .first_end_state = true, + }; + + assert(env.counts[0] == 0); + + const size_t state_count = fsm_countstates(dfa); + fsm_state_t start_state; + if (!fsm_getstart(dfa, &start_state)) { + res = FSM_DETECT_REQUIRED_CHARACTERS_ERROR_MISUSE; + goto cleanup; + } + + #if EXPENSIVE_CHECKS + for (fsm_state_t s = 0; s < state_count; s++) { + assert(!dfa->states[s].visited); + } + #endif + + for (size_t i = 0; i < 4; i++) { charmap[i] = 0; } + + /* If the start state is also an end state, then + * it matches the empty string, so we're done. 
 */ + if (fsm_isend(dfa, start_state)) { + res = FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN; + if (count != NULL) { + *count = 0; + } + goto cleanup; + } + + env.stack.frames = f_malloc(dfa->alloc, DEF_STACK_FRAMES * sizeof(env.stack.frames[0])); + if (env.stack.frames == NULL) { goto cleanup; } + env.stack.ceil = DEF_STACK_FRAMES; + + { /* set up start state's stack frame */ + struct stack_frame *sf0 = &env.stack.frames[0]; + sf0->state = start_state; + sf0->label = LABEL_GROUP; + + dfa->states[start_state].visited = true; + + edge_set_group_iter_reset(dfa->states[start_state].edges, + EDGE_GROUP_ITER_ALL, &sf0->iter); + env.stack.used = 1; + } + + while (env.stack.used > 0) { + struct stack_frame *sf = &env.stack.frames[env.stack.used - 1]; + struct edge_group_iter_info info; + env.steps++; + if (LOG_STEPS > 1) { + fprintf(stderr, "-- steps %zu/%zu\n", env.steps, step_limit); + } + if (env.steps == step_limit) { + res = FSM_DETECT_REQUIRED_CHARACTERS_STEP_LIMIT_REACHED; + goto cleanup; + } + + if (edge_set_group_iter_next(&sf->iter, &info)) { + assert(info.to < state_count); + if (dfa->states[info.to].visited) { + continue; /* skip visited state */ + } + + if (env.stack.used == env.stack.ceil) { /* grow stack */ + const size_t nceil = 2*env.stack.ceil; + assert(nceil > env.stack.ceil); + struct stack_frame *nframes = f_realloc(dfa->alloc, + env.stack.frames, nceil * sizeof(nframes[0])); + if (nframes == NULL) { + goto cleanup; /* res is still ERROR_ALLOC; cleanup frees the old frames and resets visited flags */ + } + + env.stack.frames = nframes; + env.stack.ceil = nceil; + } + + /* enter state */ + dfa->states[info.to].visited = true; + + struct stack_frame *nsf = &env.stack.frames[env.stack.used]; + nsf->state = info.to; + check_symbols(&info, &nsf->label); + + if (nsf->label != LABEL_GROUP) { + size_t offset = (nsf->label & 0xff); + const size_t label_count = ++env.counts[offset]; + if (label_count == 1) { + bm_set(&env.current, offset); + } + } + + edge_set_group_iter_reset(dfa->states[info.to].edges, + 
 EDGE_GROUP_ITER_ALL, &nsf->iter); + env.stack.used++; + + if (fsm_isend(dfa, info.to)) { + if (env.first_end_state) { + bm_copy(&env.overall, &env.current); + env.first_end_state = false; + } else { /* intersect */ + bm_intersect(&env.overall, &env.current); + } + + if (LOG_PROGRESS) { + fprintf(stderr, "-- current: "); + bm_print(stderr, NULL, &env.current, 0, fsm_escputc); + fprintf(stderr, ", overall: "); + bm_print(stderr, NULL, &env.overall, 0, fsm_escputc); + fprintf(stderr, "\n"); + } + + /* Intersecting with the empty set will always be empty, so + * further exploration is unnecessary. */ + if (!bm_any(&env.overall)) { + res = FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN; + break; + } + } + + } else { /* done with state */ + /* If this state was reached via a unique label, then + * reduce the count. If the count returns to 0, remove + * it from the constraint set. */ + if (sf->label != LABEL_GROUP) { + size_t offset = (sf->label & 0xff); + const size_t label_count = --env.counts[offset]; + if (label_count == 0) { + bm_unset(&env.current, offset); + } + } + + /* clear visited */ + dfa->states[sf->state].visited = false; + + env.stack.used--; + } + } + + if (LOG_STEPS) { + fprintf(stderr, "%s: finished in %zu/%zu steps\n", __func__, env.steps, step_limit); + } + + for (size_t i = 0; i < 4; i++) { + charmap[i] = *bm_nth_word(&env.overall, i); + } + + if (count != NULL) { + *count = bm_count(&env.overall); + } + + res = FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN; + +cleanup: + f_free(dfa->alloc, env.stack.frames); + + for (fsm_state_t s = 0; s < state_count; s++) { + dfa->states[s].visited = false; + } + + return res; +} diff --git a/src/libfsm/libfsm.syms b/src/libfsm/libfsm.syms index a1a8c72a9..75d89cd5c 100644 --- a/src/libfsm/libfsm.syms +++ b/src/libfsm/libfsm.syms @@ -17,6 +17,7 @@ fsm_reachableall fsm_reachableany fsm_walk_edges fsm_walk_states +fsm_detect_required_characters # fsm_epsilonsonly diff --git a/tests/detect_required/Makefile 
b/tests/detect_required/Makefile new file mode 100644 index 000000000..06d76277e --- /dev/null +++ b/tests/detect_required/Makefile @@ -0,0 +1,43 @@ +.include "../../share/mk/top.mk" + +TEST.tests/detect_required != ls -1 tests/detect_required/detect_required*.c +TEST.tests/detect_required != ls -1 tests/detect_required/out*.txt +TEST_SRCDIR.tests/detect_required = tests/detect_required +TEST_OUTDIR.tests/detect_required = ${BUILD}/tests/detect_required + +FSM=${BUILD}/bin/fsm +RE=${BUILD}/bin/re + +${TEST_OUTDIR.tests/detect_required}/testutil.o: tests/detect_required/testutil.c + ${CC} ${CFLAGS} -c -o ${TEST_OUTDIR.tests/detect_required}/testutil.o tests/detect_required/testutil.c + +.for n in ${TEST.tests/detect_required:T:Mout*.txt:R:C/^out//} + +${TEST_OUTDIR.tests/detect_required}/got${n}.txt: ${TEST_SRCDIR.tests/detect_required}/in${n}.re + ( ${RE} -r pcre -l fsm -p -y ${.ALLSRC:M*.re} | ${FSM} -q chars ) \ + > $@ + +${TEST_OUTDIR.tests/detect_required}/res${n}: \ + ${TEST_SRCDIR.tests/detect_required}/out${n}.txt \ + ${TEST_OUTDIR.tests/detect_required}/got${n}.txt + +TXTTEST_RESULT += ${TEST_OUTDIR.tests/detect_required}/res${n} +.endfor + + +test:: ${TEST_OUTDIR.tests/detect_required}/res_step_limit +SRC += ${TEST_SRCDIR.tests/detect_required}/detect_required_step_limit.c +CFLAGS.${TEST_SRCDIR.tests/detect_required}/detect_required_step_limit.c = -UNDEBUG + +${TEST_OUTDIR.tests/detect_required}/run_step_limit: ${TEST_OUTDIR.tests/detect_required}/detect_required_step_limit.o ${TEST_OUTDIR.tests/detect_required}/testutil.o + ${CC} ${CFLAGS} -o ${TEST_OUTDIR.tests/detect_required}/run_step_limit ${TEST_OUTDIR.tests/detect_required}/detect_required_step_limit.o ${TEST_OUTDIR.tests/detect_required}/testutil.o ${BUILD}/lib/libfsm.a ${BUILD}/lib/libre.a + +${TEST_OUTDIR.tests/detect_required}/detect_required_step_limit.o: tests/detect_required/testutil.h + +${TEST_OUTDIR.tests/detect_required}/res_step_limit: ${TEST_OUTDIR.tests/detect_required}/run_step_limit 
+ ( ${TEST_OUTDIR.tests/detect_required}/run_step_limit 1>&2 && echo PASS || echo FAIL ) > ${TEST_OUTDIR.tests/detect_required}/res_step_limit + +.for lib in ${LIB:Mlibfsm} ${LIB:Mlibre} +${TEST_OUTDIR.tests/detect_required}/run_step_limit: ${BUILD}/lib/${lib:R}.a +.endfor + diff --git a/tests/detect_required/detect_required_step_limit.c b/tests/detect_required/detect_required_step_limit.c new file mode 100644 index 000000000..743173f90 --- /dev/null +++ b/tests/detect_required/detect_required_step_limit.c @@ -0,0 +1,56 @@ +#include "testutil.h" + +#include +#include +#include +#include +#include + +int main() +{ + enum re_flags flags = 0; + struct re_err err; + const char *regex = "^abcde$"; + + struct fsm *fsm = re_comp(RE_PCRE, fsm_sgetc, &regex, NULL, flags, &err); + assert(fsm != NULL); + + if (!fsm_determinise(fsm)) { + assert(!"determinise"); + return EXIT_FAILURE; + } + if (!fsm_minimise(fsm)) { + assert(!"minimise"); + return EXIT_FAILURE; + } + + uint64_t charmap[4]; + + /* keep decreasing the step limit until it's hit, and check that + * the bitmap is cleared. 
+ */ + bool hit_step_limit = false; + size_t step_limit = 25; + while (!hit_step_limit) { + assert(step_limit > 0); + + const enum fsm_detect_required_characters_res res = fsm_detect_required_characters(fsm, step_limit, charmap, NULL); + if (res == FSM_DETECT_REQUIRED_CHARACTERS_STEP_LIMIT_REACHED) { + hit_step_limit = true; + + /* this should not contain any partially complete information */ + for (size_t i = 0; i < 4; i++) { + if (charmap[i] != 0) { + fprintf(stderr, "-- Test failure: partial information set when step limit reached\n"); + return EXIT_FAILURE; + } + } + } + + step_limit--; + } + assert(hit_step_limit); + printf("-- successfully hit step limit at %zu\n", step_limit + 1); /* the loop decremented once more after the hit */ + + fsm_free(fsm); + return EXIT_SUCCESS; +} diff --git a/tests/detect_required/in0.re b/tests/detect_required/in0.re new file mode 100644 index 000000000..5d762054a --- /dev/null +++ b/tests/detect_required/in0.re @@ -0,0 +1 @@ +^$ \ No newline at end of file diff --git a/tests/detect_required/in1.re b/tests/detect_required/in1.re new file mode 100644 index 000000000..25081a78c --- /dev/null +++ b/tests/detect_required/in1.re @@ -0,0 +1 @@ +^a$ \ No newline at end of file diff --git a/tests/detect_required/in2.re b/tests/detect_required/in2.re new file mode 100644 index 000000000..fe1ec125a --- /dev/null +++ b/tests/detect_required/in2.re @@ -0,0 +1 @@ +^abcde$ \ No newline at end of file diff --git a/tests/detect_required/in3.re b/tests/detect_required/in3.re new file mode 100644 index 000000000..bed2c040e --- /dev/null +++ b/tests/detect_required/in3.re @@ -0,0 +1 @@ +^(ab|cd)$ \ No newline at end of file diff --git a/tests/detect_required/in4.re b/tests/detect_required/in4.re new file mode 100644 index 000000000..72c6c30d0 --- /dev/null +++ b/tests/detect_required/in4.re @@ -0,0 +1 @@ +^(ab|cd|ef)$ \ No newline at end of file diff --git a/tests/detect_required/in5.re b/tests/detect_required/in5.re new file mode 100644 index 000000000..454fb8261 --- /dev/null +++ 
b/tests/detect_required/in5.re @@ -0,0 +1 @@ +^(abc|def)$ \ No newline at end of file diff --git a/tests/detect_required/in6.re b/tests/detect_required/in6.re new file mode 100644 index 000000000..bdbaa3c85 --- /dev/null +++ b/tests/detect_required/in6.re @@ -0,0 +1 @@ +^(abc|dbf)$ \ No newline at end of file diff --git a/tests/detect_required/in7.re b/tests/detect_required/in7.re new file mode 100644 index 000000000..1957f292a --- /dev/null +++ b/tests/detect_required/in7.re @@ -0,0 +1 @@ +^abc(def)*ghi$ \ No newline at end of file diff --git a/tests/detect_required/in8.re b/tests/detect_required/in8.re new file mode 100644 index 000000000..9ec28ce00 --- /dev/null +++ b/tests/detect_required/in8.re @@ -0,0 +1 @@ +^abc(def)+ghi$ \ No newline at end of file diff --git a/tests/detect_required/in9.re b/tests/detect_required/in9.re new file mode 100644 index 000000000..e3f42fa55 --- /dev/null +++ b/tests/detect_required/in9.re @@ -0,0 +1 @@ +^ghi(def)abc$ \ No newline at end of file diff --git a/tests/detect_required/out0.txt b/tests/detect_required/out0.txt new file mode 100644 index 000000000..ff95acde3 --- /dev/null +++ b/tests/detect_required/out0.txt @@ -0,0 +1 @@ +0 diff --git a/tests/detect_required/out1.txt b/tests/detect_required/out1.txt new file mode 100644 index 000000000..8ba4a7a8d --- /dev/null +++ b/tests/detect_required/out1.txt @@ -0,0 +1 @@ +1 a diff --git a/tests/detect_required/out2.txt b/tests/detect_required/out2.txt new file mode 100644 index 000000000..7a88aba43 --- /dev/null +++ b/tests/detect_required/out2.txt @@ -0,0 +1 @@ +5 abcde diff --git a/tests/detect_required/out3.txt b/tests/detect_required/out3.txt new file mode 100644 index 000000000..ff95acde3 --- /dev/null +++ b/tests/detect_required/out3.txt @@ -0,0 +1 @@ +0 diff --git a/tests/detect_required/out4.txt b/tests/detect_required/out4.txt new file mode 100644 index 000000000..ff95acde3 --- /dev/null +++ b/tests/detect_required/out4.txt @@ -0,0 +1 @@ +0 diff --git 
a/tests/detect_required/out5.txt b/tests/detect_required/out5.txt new file mode 100644 index 000000000..ff95acde3 --- /dev/null +++ b/tests/detect_required/out5.txt @@ -0,0 +1 @@ +0 diff --git a/tests/detect_required/out6.txt b/tests/detect_required/out6.txt new file mode 100644 index 000000000..bacfc4097 --- /dev/null +++ b/tests/detect_required/out6.txt @@ -0,0 +1 @@ +1 b diff --git a/tests/detect_required/out7.txt b/tests/detect_required/out7.txt new file mode 100644 index 000000000..099f9b056 --- /dev/null +++ b/tests/detect_required/out7.txt @@ -0,0 +1 @@ +6 abcghi diff --git a/tests/detect_required/out8.txt b/tests/detect_required/out8.txt new file mode 100644 index 000000000..19fa3c37b --- /dev/null +++ b/tests/detect_required/out8.txt @@ -0,0 +1 @@ +9 abcdefghi diff --git a/tests/detect_required/out9.txt b/tests/detect_required/out9.txt new file mode 100644 index 000000000..19fa3c37b --- /dev/null +++ b/tests/detect_required/out9.txt @@ -0,0 +1 @@ +9 abcdefghi diff --git a/tests/detect_required/testutil.c b/tests/detect_required/testutil.c new file mode 100644 index 000000000..a6ec8c296 --- /dev/null +++ b/tests/detect_required/testutil.c @@ -0,0 +1,85 @@ +#include "testutil.h" + +#include + +#include +#include +#include +#include +#include +#include + +#include + +bool +run_test(const struct testcase *tc) +{ + bool test_res = false; + + enum re_flags flags = 0; + struct re_err err; + char *regex = (char *)tc->regex; + const char *required = tc->required ? tc->required : ""; + const size_t step_limit = tc->step_limit ? 
tc->step_limit : DEF_STEP_LIMIT; + + fprintf(stderr, "-- test: regex '%s', required '%s'\n", tc->regex, required); + + struct fsm *fsm = re_comp(RE_PCRE, fsm_sgetc, &regex, NULL, flags, &err); + if (fsm == NULL) { + return false; + } + + if (!fsm_determinise(fsm)) { + assert(!"determinise"); + return false; + } + if (!fsm_minimise(fsm)) { + assert(!"minimise"); + return false; + } + + uint64_t charmap[4]; + + { + const size_t statecount = fsm_countstates(fsm); + size_t ends = 0; + for (size_t i = 0; i < statecount; i++) { + if (fsm_isend(fsm, i)) { + ends++; + } + } + fprintf(stderr, "-- statecount %zu, %zu ends\n", statecount, ends); + } + + + const enum fsm_detect_required_characters_res res = fsm_detect_required_characters(fsm, step_limit, charmap, NULL); + if (res == FSM_DETECT_REQUIRED_CHARACTERS_STEP_LIMIT_REACHED) { + fprintf(stderr, "-- step limit reached, halting\n"); + goto cleanup; + } + assert(res == FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN); + + char buf[257] = {0}; + size_t used = 0; + assert(!u64bitset_get(charmap, 0)); /* does not contain 0x00 */ + + for (size_t i = 0; i < 256; i++) { + if (u64bitset_get(charmap, i)) { + buf[used++] = (char)i; + } + } + + if (0 != strcmp(required, buf)) { + fprintf(stderr, "Error: mismatch\n"); + fprintf(stderr, "-- expected: [%s]\n", required); + fprintf(stderr, "-- got: [%s]\n", buf); + goto cleanup; + } + + test_res = true; + +cleanup: + fsm_free(fsm); + + return test_res; +} diff --git a/tests/detect_required/testutil.h b/tests/detect_required/testutil.h new file mode 100644 index 000000000..f9378c190 --- /dev/null +++ b/tests/detect_required/testutil.h @@ -0,0 +1,21 @@ +#ifndef TESTUTIL_H +#define TESTUTIL_H + +#include +#include +#include +#include + +#define DEF_STEP_LIMIT 100000 + +struct testcase { + const char *regex; + const char *required; + size_t max_gen_buffer; /* 0: default */ + size_t step_limit; +}; + +bool +run_test(const struct testcase *tc); + +#endif