From e3e868f9002246ac398be38e558f19810193e1dd Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 9 Oct 2024 16:08:15 -0400 Subject: [PATCH 1/8] Temporary interface: Add fsm_determinise_with_config. This works, but we may want to make some naming changes. I'd prefer to use a struct rather than a single parameter here -- as long as other fields have a sensible default when zeroed, the callers will only need to make the interface change once. --- include/fsm/fsm.h | 15 ++++++++++++++ src/libfsm/determinise.c | 43 +++++++++++++++++++++++++++++++++------- src/libfsm/libfsm.syms | 1 + 3 files changed, 52 insertions(+), 7 deletions(-) diff --git a/include/fsm/fsm.h b/include/fsm/fsm.h index 877d5c1bf..1dd710d0e 100644 --- a/include/fsm/fsm.h +++ b/include/fsm/fsm.h @@ -385,6 +385,21 @@ fsm_remove_epsilons(struct fsm *fsm); int fsm_determinise(struct fsm *fsm); +/* Determinise, with a passed in configuration + * and a distinct return value for reaching + * the state limit. */ +struct fsm_determinise_config { + size_t state_limit; /* 0: no limit */ +}; +enum fsm_determinise_with_config_res { + FSM_DETERMINISE_WITH_CONFIG_OK, + FSM_DETERMINISE_WITH_CONFIG_STATE_LIMIT_REACHED, + FSM_DETERMINISE_WITH_CONFIG_ERRNO, +}; +enum fsm_determinise_with_config_res +fsm_determinise_with_config(struct fsm *fsm, + const struct fsm_determinise_config *config); + /* * Make a DFA complete, as per fsm_iscomplete. 
*/ diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c index 42992b6bc..0190e65ad 100644 --- a/src/libfsm/determinise.c +++ b/src/libfsm/determinise.c @@ -17,16 +17,20 @@ dump_labels(FILE *f, const uint64_t labels[4]) } } -int -fsm_determinise(struct fsm *nfa) +enum fsm_determinise_with_config_res +fsm_determinise_with_config(struct fsm *nfa, + const struct fsm_determinise_config *config) { - int res = 0; + enum fsm_determinise_with_config_res res = FSM_DETERMINISE_WITH_CONFIG_ERRNO; struct mappingstack *stack = NULL; struct interned_state_set_pool *issp = NULL; struct map map = { NULL, 0, 0, NULL }; struct mapping *curr = NULL; size_t dfacount = 0; + const size_t state_limit = config == NULL + ? 0 + : config->state_limit; struct analyze_closures_env ac_env = { 0 }; @@ -40,7 +44,7 @@ fsm_determinise(struct fsm *nfa) */ if (fsm_has(nfa, fsm_hasepsilons)) { if (!fsm_remove_epsilons(nfa)) { - return 0; + return FSM_DETERMINISE_WITH_CONFIG_ERRNO; } } @@ -52,7 +56,12 @@ fsm_determinise(struct fsm *nfa) issp = interned_state_set_pool_alloc(nfa->alloc); if (issp == NULL) { - return 0; + return FSM_DETERMINISE_WITH_CONFIG_ERRNO; + } + + if (state_limit != 0 && fsm_countstates(nfa) > state_limit) { + res = FSM_DETERMINISE_WITH_CONFIG_STATE_LIMIT_REACHED; + goto cleanup; } { @@ -74,7 +83,7 @@ fsm_determinise(struct fsm *nfa) */ if (!fsm_getstart(nfa, &start)) { - res = 1; + res = FSM_DETERMINISE_WITH_CONFIG_OK; goto cleanup; } @@ -150,6 +159,11 @@ fsm_determinise(struct fsm *nfa) assert(m->dfastate < dfacount); } else { /* not found -- add a new one and push it to the stack for processing */ + + if (state_limit != 0 && dfacount >= state_limit) { + res = FSM_DETERMINISE_WITH_CONFIG_STATE_LIMIT_REACHED; + goto cleanup; + } if (!map_add(&map, dfacount, iss, &m)) { goto cleanup; } @@ -260,7 +274,7 @@ fsm_determinise(struct fsm *nfa) assert(fsm_all(nfa, fsm_isdfa)); #endif - res = 1; + res = FSM_DETERMINISE_WITH_CONFIG_OK; cleanup: map_free(&map); @@ -311,6 +325,21 @@
fsm_determinise(struct fsm *nfa) return res; } +int +fsm_determinise(struct fsm *nfa) +{ + enum fsm_determinise_with_config_res res = fsm_determinise_with_config(nfa, NULL); + switch (res) { + case FSM_DETERMINISE_WITH_CONFIG_OK: + return 1; + case FSM_DETERMINISE_WITH_CONFIG_STATE_LIMIT_REACHED: + /* unreachable */ + return 0; + case FSM_DETERMINISE_WITH_CONFIG_ERRNO: + return 0; + } +} + /* Add DFA_state to the list for NFA_state. */ static int add_reverse_mapping(const struct fsm_alloc *alloc, diff --git a/src/libfsm/libfsm.syms b/src/libfsm/libfsm.syms index 34be09e77..67497d00a 100644 --- a/src/libfsm/libfsm.syms +++ b/src/libfsm/libfsm.syms @@ -101,6 +101,7 @@ fsm_countstates fsm_trim fsm_reverse fsm_determinise +fsm_determinise_with_config fsm_remove_epsilons fsm_complete fsm_minimise From 07e1a0ce6a0c6d4d11b4790435217cac8c371bbf Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 10 Oct 2024 11:44:01 -0400 Subject: [PATCH 2/8] Add 'default:' (should be unreachable) for switch case. This is getting rejected by CI. --- src/libfsm/determinise.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c index 0190e65ad..559230175 100644 --- a/src/libfsm/determinise.c +++ b/src/libfsm/determinise.c @@ -336,6 +336,7 @@ fsm_determinise(struct fsm *nfa) /* unreachable */ return 0; case FSM_DETERMINISE_WITH_CONFIG_ERRNO: + default: return 0; } } From a61a9daa0a9614a58def3defcc5035950c8cd581 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 9 Oct 2024 09:48:21 -0400 Subject: [PATCH 3/8] fsm_generate_matches: Add seed argument, random choice if non-zero. There isn't a flag to set the seed in fsm or re yet. Update callers, add a 0 seed argument to default to the existing behavior. Also, no end states means nothing to do, so exit right away. 
--- include/fsm/walk.h | 6 +++- src/fsm/main.c | 2 +- src/libfsm/gen.c | 69 ++++++++++++++++++++++++++++++++++++++++++---- src/re/main.c | 2 +- tests/gen/gen1.c | 1 + tests/gen/gen2.c | 2 +- tests/gen/gen3.c | 2 +- 7 files changed, 74 insertions(+), 10 deletions(-) diff --git a/include/fsm/walk.h b/include/fsm/walk.h index e1ab5f29e..ea5a223e0 100644 --- a/include/fsm/walk.h +++ b/include/fsm/walk.h @@ -90,6 +90,10 @@ fsm_walk_edges(const struct fsm *fsm, void *opaque, * functionally equivalent cases makes testing dramatically faster, * but exploring every edge could be added later. * + * If seed is zero then it will generate the first label in the label + * set, otherwise a label from the set will be chosen using rand() + * (favoring printable characters). + * * Note: fsm is non-const because it calls fsm_trim on the FSM * internally. This records the shortest distance from each state to an * end state, which is used to prune branches that would not produce @@ -114,7 +118,7 @@ fsm_generate_matches_cb(const struct fsm *fsm, const char *input, size_t input_length, fsm_state_t end_state, void *opaque); int -fsm_generate_matches(struct fsm *fsm, size_t max_length, +fsm_generate_matches(struct fsm *fsm, size_t max_length, unsigned seed, fsm_generate_matches_cb *cb, void *opaque); /* Callback provided for the most basic use case for diff --git a/src/fsm/main.c b/src/fsm/main.c index da65791dd..f9d1bb3b0 100644 --- a/src/fsm/main.c +++ b/src/fsm/main.c @@ -770,7 +770,7 @@ main(int argc, char *argv[]) } if (generate_bounds > 0) { - r = fsm_generate_matches(fsm, generate_bounds, fsm_generate_cb_printf_escaped, &opt); + r = fsm_generate_matches(fsm, generate_bounds, 0, fsm_generate_cb_printf_escaped, &opt); } fsm_free(fsm); diff --git a/src/libfsm/gen.c b/src/libfsm/gen.c index 26a77be1c..9f78e67db 100644 --- a/src/libfsm/gen.c +++ b/src/libfsm/gen.c @@ -77,6 +77,7 @@ struct gen_ctx { fsm_generate_matches_cb *cb; bool done; + bool randomized; size_t buf_ceil; size_t 
buf_used; @@ -139,7 +140,7 @@ static bool grow_stack(struct gen_ctx *ctx); int -fsm_generate_matches(struct fsm *fsm, size_t max_length, +fsm_generate_matches(struct fsm *fsm, size_t max_length, unsigned seed, fsm_generate_matches_cb *cb, void *opaque) { if (max_length == 0) { @@ -147,9 +148,13 @@ fsm_generate_matches(struct fsm *fsm, size_t max_length, return 0; } + if (!fsm_has(fsm, fsm_isend)) { + return 1; /* no end state -> nothing to do */ + } + INIT_TIMERS(); TIME(&pre); - int res = gen_init_outer(fsm, max_length, cb, opaque, false, 0); + int res = gen_init_outer(fsm, max_length, cb, opaque, seed != 0, seed); TIME(&post); DIFF_MSEC("fsm_generate_matches", pre, post, NULL); @@ -208,8 +213,9 @@ gen_init_outer(struct fsm *fsm, size_t max_length, assert(fsm_all(fsm, fsm_isdfa)); /* DFA-only */ - assert(!randomized); /* not yet supported */ - (void)seed; + if (randomized) { + srand(seed); + } #if LOG_GEN > 1 fprintf(stderr, "%s: %u states\n", __func__, fsm_countstates(fsm)); @@ -224,6 +230,7 @@ gen_init_outer(struct fsm *fsm, size_t max_length, .max_length = max_length, .cb = cb, .opaque = opaque, + .randomized = randomized, }; if (!gen_init(&ctx, fsm)) { @@ -524,6 +531,55 @@ first_symbol(const uint64_t *symbols) return 0; } +static unsigned char +random_symbol(const uint64_t *symbols) +{ + bool has_zero = false; + unsigned i = 0; + + /* printable and non-printable character choices */ + size_t choice_count = 0; + unsigned char choices[256]; + size_t np_choice_count = 0; + unsigned char np_choices[256]; + + while (i < 256) { + const uint64_t w = symbols[i/64]; + if ((i & 63) == 0 && w == 0) { + i += 64; + continue; + } + if (w & (1ULL << (i & 63))) { + if (i == 0) { + has_zero = true; + } else if (isprint(i)) { + choices[choice_count++] = (unsigned char)i; + } else { + np_choices[np_choice_count++] = (unsigned char)i; + } + } + i++; + } + + if (choice_count > 0) { + const size_t c = rand() % choice_count; + return choices[c]; + } + + if (np_choice_count > 0) { + 
const size_t c = rand() % np_choice_count; + return np_choices[c]; + } + + /* Prefer anything besides 0x00 if present, since that will truncate the string. */ + if (has_zero) { + return 0; + } + + assert(!"empty set"); + return 0; +} + #if DUMP_EDGES static void dump_edges(fsm_state_t state, struct edge_set *edges) @@ -585,7 +641,9 @@ sfs_step_edges(struct gen_ctx *ctx, struct gen_stack_frame *sf) struct edge_group_iter_info eg; if (iter_next_transition(ctx, sf, &eg)) { - const unsigned char symbol = first_symbol(eg.symbols); + const unsigned char symbol = ctx->randomized + ? random_symbol(eg.symbols) + : first_symbol(eg.symbols); const fsm_state_t state = eg.to; LOG(2, "sfs_step_edges: got edge 0x%x ('%c')\n", diff --git a/src/re/main.c b/src/re/main.c index 62e51f78d..9059af96e 100644 --- a/src/re/main.c +++ b/src/re/main.c @@ -1047,7 +1047,7 @@ main(int argc, char *argv[]) } if (generate_bounds > 0) { - if (!fsm_generate_matches(fsm, generate_bounds, fsm_generate_cb_printf_escaped, &opt)) { + if (!fsm_generate_matches(fsm, generate_bounds, 0, fsm_generate_cb_printf_escaped, &opt)) { exit(EXIT_FAILURE); } diff --git a/tests/gen/gen1.c b/tests/gen/gen1.c index 25b30b82b..b25ab2bbc 100644 --- a/tests/gen/gen1.c +++ b/tests/gen/gen1.c @@ -34,6 +34,7 @@ int main(void) { assert(fsm != NULL); if (!fsm_generate_matches(fsm, MAX_EXP_MATCH + 1 /* for \0 */, + 0, gtest_matches_cb, &matches)) { fprintf(stderr, "fsm_generate_matches: error\n"); exit(EXIT_FAILURE); diff --git a/tests/gen/gen2.c b/tests/gen/gen2.c index 02faa5e50..a475e5395 100644 --- a/tests/gen/gen2.c +++ b/tests/gen/gen2.c @@ -28,7
+28,7 @@ int main(void) { struct fsm *fsm = gtest_fsm_of_matches(&matches); assert(fsm != NULL); - if (!fsm_generate_matches(fsm, MAX_EXP_MATCH + 1, gtest_matches_cb, &matches)) { + if (!fsm_generate_matches(fsm, MAX_EXP_MATCH + 1, 0, gtest_matches_cb, &matches)) { fprintf(stderr, "fsm_generate_matches: error\n"); exit(EXIT_FAILURE); } diff --git a/tests/gen/gen3.c b/tests/gen/gen3.c index f24217622..7aa0aebce 100644 --- a/tests/gen/gen3.c +++ b/tests/gen/gen3.c @@ -146,7 +146,7 @@ int main(void) { struct fsm *fsm = build(); assert(fsm != NULL); - if (!fsm_generate_matches(fsm, 11, matches_cb, NULL)) { + if (!fsm_generate_matches(fsm, 11, 0, matches_cb, NULL)) { fprintf(stderr, "fsm_generate_matches: error\n"); exit(EXIT_FAILURE); } From ffe6f69fa68b5aaa5f8dd0d1515ece3f4ec341ca Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 11 Sep 2024 10:20:47 -0400 Subject: [PATCH 4/8] common.h: TRACK_TIMES should be 0 when BUILD_FOR_FUZZER is set. It fills up the fuzzer logs with lots of uninteresting output. --- include/adt/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/adt/common.h b/include/adt/common.h index 8604edacd..a83604c97 100644 --- a/include/adt/common.h +++ b/include/adt/common.h @@ -36,7 +36,7 @@ /* If non-zero, expand the timer macros defined below, otherwise * they compile away. */ #ifndef TRACK_TIMES -#define TRACK_TIMES 0 +#define TRACK_TIMES (0 && !BUILD_FOR_FUZZER) #endif #if EXPENSIVE_CHECKS && TRACK_TIMES From b2f9d3d93c858231e8b4de4abf5d49bd01c9226b Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 10 Sep 2024 13:51:20 -0400 Subject: [PATCH 5/8] determinise: Remove stale comment. 
--- src/libfsm/determinise.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c index 559230175..fc7c68ba4 100644 --- a/src/libfsm/determinise.c +++ b/src/libfsm/determinise.c @@ -185,8 +185,6 @@ fsm_determinise_with_config(struct fsm *nfa, } ac_env.output_count = 0; - - /* All elements in sclosures[] are interned, so they will be freed later. */ } while ((curr = stack_pop(stack))); { From 35892eb03cf1e866991fd8d7de81d0c26b296bba Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 9 Sep 2024 08:29:49 -0400 Subject: [PATCH 6/8] trim's integrity_check should only run with EXPENSIVE_CHECKS. --- src/libfsm/trim.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/libfsm/trim.c b/src/libfsm/trim.c index 4f45607bd..c37965fd8 100644 --- a/src/libfsm/trim.c +++ b/src/libfsm/trim.c @@ -462,6 +462,10 @@ integrity_check(const char *descr, const struct fsm *fsm) return; #endif +#if !EXPENSIVE_CHECKS + return; +#endif + if (LOG_TRIM > 1) { fprintf(stderr, "integrity check: %s...\n", descr); } From cf8fc651ea87b2f3b44071161f42db8b7d9aecaa Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Fri, 11 Oct 2024 13:50:28 -0400 Subject: [PATCH 7/8] Remove direct call to srand(), just recommend the caller use it. Instead of taking a seed argument, just treat `int randomized` like a flag. (I'd use `bool`, but the header consistently uses `int` for boolean arguments.) --- include/fsm/walk.h | 9 +++++---- src/libfsm/gen.c | 12 ++++-------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/include/fsm/walk.h b/include/fsm/walk.h index ea5a223e0..cb97e989a 100644 --- a/include/fsm/walk.h +++ b/include/fsm/walk.h @@ -90,9 +90,10 @@ fsm_walk_edges(const struct fsm *fsm, void *opaque, * functionally equivalent cases makes testing dramatically faster, * but exploring every edge could be added later. 
* - * If seed is zero then it will generate the first label in the label - * set, otherwise a label from the set will be chosen using rand() - * (favoring printable characters). + * If randomized is zero then it will generate the first label in the + * label set, otherwise a label from the set will be chosen using rand() + * (favoring printable characters). The caller can use srand() + * beforehand to set a PRNG seed. * * Note: fsm is non-const because it calls fsm_trim on the FSM * internally. This records the shortest distance from each state to an @@ -118,7 +119,7 @@ fsm_generate_matches_cb(const struct fsm *fsm, const char *input, size_t input_length, fsm_state_t end_state, void *opaque); int -fsm_generate_matches(struct fsm *fsm, size_t max_length, unsigned seed, +fsm_generate_matches(struct fsm *fsm, size_t max_length, int randomized, fsm_generate_matches_cb *cb, void *opaque); /* Callback provided for the most basic use case for diff --git a/src/libfsm/gen.c b/src/libfsm/gen.c index 9f78e67db..8b8551489 100644 --- a/src/libfsm/gen.c +++ b/src/libfsm/gen.c @@ -107,7 +107,7 @@ struct gen_ctx { static bool gen_init_outer(struct fsm *fsm, size_t max_length, fsm_generate_matches_cb *cb, void *opaque, - bool randomized, unsigned seed); + bool randomized); static bool gen_init(struct gen_ctx *ctx, struct fsm *fsm); @@ -140,7 +140,7 @@ static bool grow_stack(struct gen_ctx *ctx); int -fsm_generate_matches(struct fsm *fsm, size_t max_length, unsigned seed, +fsm_generate_matches(struct fsm *fsm, size_t max_length, int randomized, fsm_generate_matches_cb *cb, void *opaque) { if (max_length == 0) { @@ -154,7 +154,7 @@ fsm_generate_matches(struct fsm *fsm, size_t max_length, unsigned seed, INIT_TIMERS(); TIME(&pre); - int res = gen_init_outer(fsm, max_length, cb, opaque, seed != 0, seed); + int res = gen_init_outer(fsm, max_length, cb, opaque, randomized != 0); TIME(&post); DIFF_MSEC("fsm_generate_matches", pre, post, NULL); @@ -204,7 +204,7 @@ 
fsm_generate_cb_printf(const struct fsm *fsm, static bool gen_init_outer(struct fsm *fsm, size_t max_length, fsm_generate_matches_cb *cb, void *opaque, - bool randomized, unsigned seed) + bool randomized) { int res = false; if (fsm == NULL || cb == NULL || max_length == 0) { @@ -213,10 +213,6 @@ gen_init_outer(struct fsm *fsm, size_t max_length, assert(fsm_all(fsm, fsm_isdfa)); /* DFA-only */ - if (randomized) { - srand(seed); - } - #if LOG_GEN > 1 fprintf(stderr, "%s: %u states\n", __func__, fsm_countstates(fsm)); #endif From 92e6e827e2d69eb28110a26628b15d353ec498a6 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 16 Sep 2024 16:00:21 -0400 Subject: [PATCH 8/8] Add re_is_anchored interface. --- include/re/re.h | 16 ++++++++++++++++ src/libre/libre.syms | 1 + src/libre/re.c | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+) diff --git a/include/re/re.h b/include/re/re.h index 20408e98a..841e4e946 100644 --- a/include/re/re.h +++ b/include/re/re.h @@ -136,6 +136,22 @@ re_comp(enum re_dialect dialect, const struct fsm_alloc *alloc, enum re_flags flags, struct re_err *err); +/* Parse and analyze the regex enough to determine whether it is + * anchored at the start and/or end. + * + * As long as the result is checked for RE_IS_ANCHORED_ERROR first, + * the result can be used like a bitset. */ +enum re_is_anchored_res { + RE_IS_ANCHORED_NONE = 0x00, + RE_IS_ANCHORED_START = 0x01, + RE_IS_ANCHORED_END = 0x02, + RE_IS_ANCHORED_BOTH = 0x03, + RE_IS_ANCHORED_ERROR = 0xFFFF, +}; +enum re_is_anchored_res +re_is_anchored(enum re_dialect dialect, re_getchar_fun *f, void *opaque, + enum re_flags flags, struct re_err *err); + /* * Return a human-readable string describing a given error code. The string * returned has static storage, and must not be freed. 
diff --git a/src/libre/libre.syms b/src/libre/libre.syms index a4f1a223b..9d381cb0f 100644 --- a/src/libre/libre.syms +++ b/src/libre/libre.syms @@ -3,6 +3,7 @@ re_is_literal re_flags re_strerror re_perror +re_is_anchored ast_print ast_print_dot diff --git a/src/libre/re.c b/src/libre/re.c index 15af848b5..013e2b58c 100644 --- a/src/libre/re.c +++ b/src/libre/re.c @@ -335,3 +335,40 @@ re_is_literal(enum re_dialect dialect, int (*getc)(void *opaque), void *opaque, return -1; } +enum re_is_anchored_res +re_is_anchored(enum re_dialect dialect, re_getchar_fun *getc, void *opaque, + enum re_flags flags, struct re_err *err) +{ + /* FIXME: copy/pasted from above, factor out common code later. */ + + struct ast *ast; + const struct dialect *m; + int unsatisfiable; + + assert(getc != NULL); + + m = re_dialect(dialect); + if (m == NULL) { + if (err != NULL) { err->e = RE_EBADDIALECT; } + return RE_IS_ANCHORED_ERROR; + } + + flags |= m->flags; + + ast = re_parse(dialect, getc, opaque, flags, err, &unsatisfiable); + if (ast == NULL) { + return RE_IS_ANCHORED_ERROR; + } + + /* Copy anchoring flags, ending up with NONE, START, END, or BOTH. */ + enum re_is_anchored_res res = RE_IS_ANCHORED_NONE; + if (ast->expr->flags & AST_FLAG_ANCHORED_START) { + res |= RE_IS_ANCHORED_START; + } + if (ast->expr->flags & AST_FLAG_ANCHORED_END) { + res |= RE_IS_ANCHORED_END; + } + + ast_free(ast); + return res; +}