From 4a32885726915980244ee89d957147cc97d63815 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 27 Aug 2024 14:08:39 -0400 Subject: [PATCH] experimental: Add eager outputs, similar to endids but eagerly matched. When combining several unanchored regexes it becomes VERY expensive to handle combinations of matches via the end state -- essentially, the whole reachable DFA gets separate matching and non-matching copies for each pattern, leading to a DFA whose size is proportional to the number of *possible combinations* of matches. With eager outputs, we can set a flag for matching as we reach the end of the original pattern (before looping back and possibly also matching other patterns), which keeps the state count from blowing up in fsm_determinise. To see how much difference this makes, the test tests/eager_output/run7 combines 26 different patterns. It should finish very quickly (~50 msec, just now). Try running it with `env FORCE_ENDIDS=N` for N increasing from 4 to 26. Around 10-11 it will start taking several seconds, and memory usage will roughly double with each step. This PR adds `fsm_union_repeated_pattern_group`, a variant of `fsm_union_array` that combines a set of DFAs into a single NFA, but correctly handles a mix of anchored and unanchored ends without the state count blowing up. It currently needs flags passed in for each fsm indicating whether the start and/or end are anchored, and there is a hacky special case that removes self-edges from states with eager outputs and instead connects them to a single overall unanchored end loop. I haven't yet figured out how to handle this properly in the general case, but it works for this specific use case, provided all the DFAs are combined at once. (Combining multiple DFAs each produced by determinising fsm_union_repeated_pattern_group's result probably won't work correctly.) 
I have tried detecting and ignoring those edges in fsm_determinise, after epsilon removal, but so far either it still causes the graph size to blow up or subtly breaks something else. This is still experimental, and the code generation for `-lc` here is quite hacky -- it expects the caller to define a `FSM_SET_EAGER_OUTPUT` macro, since the code generation interface doesn't define where the match info will go yet. A later PR will add a new code generation mode with better support for eager outputs, and I plan to eventually integrate this better with rx, AMBIG_MULTIPLE, and so on. (This squashes down a couple false starts.) --- Makefile | 1 + fuzz/target.c | 524 +++++++++++++++++- include/fsm/bool.h | 10 + include/fsm/fsm.h | 46 ++ include/fsm/print.h | 3 + include/re/re.h | 15 + src/libfsm/Makefile | 1 + src/libfsm/clone.c | 38 ++ src/libfsm/consolidate.c | 46 ++ src/libfsm/determinise.c | 131 ++++- src/libfsm/determinise_internal.h | 10 +- src/libfsm/eager_output.c | 403 ++++++++++++++ src/libfsm/eager_output.h | 46 ++ src/libfsm/epsilons.c | 147 ++++- src/libfsm/exec.c | 55 +- src/libfsm/fsm.c | 11 + src/libfsm/internal.h | 5 + src/libfsm/libfsm.syms | 11 + src/libfsm/merge.c | 42 ++ src/libfsm/minimise.c | 79 ++- src/libfsm/print/c.c | 13 + src/libfsm/print/ir.c | 35 ++ src/libfsm/print/ir.h | 5 + src/libfsm/state.c | 8 + src/libfsm/union.c | 233 ++++++++ src/libre/libre.syms | 1 + src/libre/re.c | 34 ++ tests/eager_output/Makefile | 22 + tests/eager_output/eager_output1.c | 12 + tests/eager_output/eager_output2.c | 17 + tests/eager_output/eager_output3.c | 16 + tests/eager_output/eager_output4.c | 13 + tests/eager_output/eager_output5.c | 14 + tests/eager_output/eager_output6.c | 34 ++ tests/eager_output/eager_output7.c | 103 ++++ tests/eager_output/eager_output_at_start.c | 12 + tests/eager_output/eager_output_fr1.c | 13 + tests/eager_output/eager_output_fr2.c | 13 + tests/eager_output/eager_output_fr3.c | 13 + .../eager_output_mixed_anchored_unanchored.c | 46 
++ tests/eager_output/utils.c | 278 ++++++++++ tests/eager_output/utils.h | 64 +++ 42 files changed, 2587 insertions(+), 36 deletions(-) create mode 100644 src/libfsm/eager_output.c create mode 100644 src/libfsm/eager_output.h create mode 100644 tests/eager_output/Makefile create mode 100644 tests/eager_output/eager_output1.c create mode 100644 tests/eager_output/eager_output2.c create mode 100644 tests/eager_output/eager_output3.c create mode 100644 tests/eager_output/eager_output4.c create mode 100644 tests/eager_output/eager_output5.c create mode 100644 tests/eager_output/eager_output6.c create mode 100644 tests/eager_output/eager_output7.c create mode 100644 tests/eager_output/eager_output_at_start.c create mode 100644 tests/eager_output/eager_output_fr1.c create mode 100644 tests/eager_output/eager_output_fr2.c create mode 100644 tests/eager_output/eager_output_fr3.c create mode 100644 tests/eager_output/eager_output_mixed_anchored_unanchored.c create mode 100644 tests/eager_output/utils.c create mode 100644 tests/eager_output/utils.h diff --git a/Makefile b/Makefile index 514f80bba..b356c4f0f 100644 --- a/Makefile +++ b/Makefile @@ -118,6 +118,7 @@ SUBDIR += tests/equals SUBDIR += tests/subtract SUBDIR += tests/detect_required SUBDIR += tests/determinise +SUBDIR += tests/eager_output SUBDIR += tests/endids SUBDIR += tests/epsilons SUBDIR += tests/fsm diff --git a/fuzz/target.c b/fuzz/target.c index 543891bb9..d56a9bf82 100644 --- a/fuzz/target.c +++ b/fuzz/target.c @@ -26,10 +26,21 @@ /* 10 seconds */ #define TIMEOUT_USEC (10ULL * 1000 * 1000) +static bool verbosity_checked = false; +static bool verbose = false; + +#define LOG(...) 
\ + do { \ + if (verbose) { \ + fprintf(stderr, __VA_ARGS__); \ + } \ + } while (0) \ + enum run_mode { MODE_DEFAULT, MODE_SHUFFLE_MINIMISE, MODE_ALL_PRINT_FUNCTIONS, + MODE_EAGER_OUTPUT, }; @@ -344,6 +355,508 @@ fuzz_all_print_functions(FILE *f, const char *pattern, bool det, bool min, const return EXIT_SUCCESS; } +#define MAX_PATTERNS 4 +struct eager_output_cb_info { + size_t used; + fsm_output_id_t ids[MAX_PATTERNS]; +}; + +static void +reset_eager_output_info(struct eager_output_cb_info *info) +{ + info->used = 0; +} + +struct feo_env { + bool ok; + size_t pattern_count; + size_t fsm_count; + size_t max_match_count; + size_t max_steps; + + char *patterns[MAX_PATTERNS]; + struct fsm *fsms[MAX_PATTERNS]; + struct fsm *combined; + + /* which pattern is being used for generation, (size_t)-1 for combined */ + size_t current_pattern; + + struct eager_output_cb_info outputs; + struct eager_output_cb_info outputs_combined; +}; + +void +append_eager_output_cb(fsm_output_id_t id, void *opaque) +{ + struct eager_output_cb_info *info = (struct eager_output_cb_info *)opaque; + + for (size_t i = 0; i < info->used; i++) { + if (info->ids[i] == id) { + return; /* already present */ + } + } + + assert(info->used < MAX_PATTERNS); + info->ids[info->used++] = id; +} + +static enum fsm_generate_matches_cb_res +gen_combined_check_individual_cb(const struct fsm *fsm, + size_t depth, size_t match_count, size_t steps, + const char *input, size_t input_length, + fsm_state_t end_state, void *opaque); + +static enum fsm_generate_matches_cb_res +gen_individual_check_combined_cb(const struct fsm *fsm, + size_t depth, size_t match_count, size_t steps, + const char *input, size_t input_length, + fsm_state_t end_state, void *opaque); + +#define DEF_MAX_STEPS 100000 +#define DEF_MAX_MATCH_COUNT 1000 + +/* This isn't part of the public interface, per se. 
*/ +void +fsm_eager_output_dump(FILE *f, const struct fsm *fsm); + +static int +fuzz_eager_output(const uint8_t *data, size_t size) +{ + struct feo_env env = { + .ok = true, + .pattern_count = 0, + .max_steps = DEF_MAX_STEPS, + .max_match_count = DEF_MAX_MATCH_COUNT, + }; + + { + const char *steps = getenv("STEPS"); + const char *matches = getenv("MATCHES"); + if (steps != NULL) { + env.max_steps = strtoul(steps, NULL, 10); + assert(env.max_steps > 0); + } + if (matches != NULL) { + env.max_match_count = strtoul(matches, NULL, 10); + assert(env.max_match_count > 0); + } + } + + int ret = 0; + + size_t max_pattern_length = 0; + + /* chop data into a series of patterns */ + { + size_t prev = 0; + size_t offset = 0; + + /* Patterns with lots of '.' can take a while to determinise. + * That slows down fuzzer coverage, but isn't interesting here. */ + size_t dots = 0; + + while (offset < size && env.pattern_count < MAX_PATTERNS) { +#define MAX_DOTS 4 + if (data[offset] == '.') { dots++; } + + if (data[offset] == '\0' || data[offset] == '\n' || offset == size - 1) { + size_t len = offset - prev; + + if (dots > MAX_DOTS) { + /* ignored */ + prev = offset; + } else if (len > 0) { + char *pattern = malloc(len + 1); + assert(pattern != NULL); + + memcpy(pattern, &data[prev], len); + if (len > 0 && pattern[len] == '\n') { + len--; /* drop trailing newline */ + } + pattern[len] = '\0'; + bool keep = true; + + if (len > 0) { + for (size_t i = 0; i < len - 1; i++) { + if (pattern[i] == '\\' && pattern[i + 1] == 'x') { + /* ignore unhandled parser errors from "\x", see #386 */ + keep = false; + } + } + } + + if (keep) { + env.patterns[env.pattern_count++] = pattern; + + if (len > max_pattern_length) { + max_pattern_length = len; + } + } else { + free(pattern); + } + prev = offset; + dots = 0; + } + } + + offset++; + } + } + + struct re_anchoring_info anchorage[MAX_PATTERNS] = {0}; + + /* for each pattern, attempt to compile to a DFA */ + for (size_t p_i = 0; p_i < 
env.pattern_count; p_i++) { + const char *p = env.patterns[p_i]; + + if (!re_is_anchored(RE_PCRE, fsm_sgetc, &p, 0, NULL, &anchorage[p_i])) { + continue; /* unsupported regex */ + } + + p = env.patterns[p_i]; + struct fsm *fsm = re_comp(RE_PCRE, fsm_sgetc, &p, NULL, 0, NULL); + + LOG("%s: pattern %zd: '%s' => %p\n", __func__, p_i, env.patterns[p_i], (void *)fsm); + + if (fsm == NULL) { + continue; /* invalid regex */ + } + + const fsm_output_id_t endid = (fsm_output_id_t)p_i; + ret = fsm_seteageroutputonends(fsm, endid); + assert(ret == 1); + + if (verbose) { + fprintf(stderr, "==== pattern %zd, pre det\n", p_i); + fsm_dump(stderr, fsm); + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "====\n"); + + fsm_state_t c = fsm_countstates(fsm); + for (fsm_state_t i = 0; i < c; i++) { + fprintf(stderr, "-- %d: end? %d\n", i, fsm_isend(fsm, i)); + } + } + + ret = fsm_determinise(fsm); + assert(ret == 1); + + ret = fsm_minimise(fsm); + assert(ret == 1); + + fsm_state_t start; + if (!fsm_getstart(fsm, &start)) { + fsm_free(fsm); + continue; + } + + if (verbose) { + fprintf(stderr, "==== pattern %zd, post det\n", p_i); + fsm_dump(stderr, fsm); + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "====\n"); + + fsm_state_t c = fsm_countstates(fsm); + for (fsm_state_t i = 0; i < c; i++) { + fprintf(stderr, "-- %d: end? 
%d\n", i, fsm_isend(fsm, i)); + } + } + + fsm_eager_output_set_cb(fsm, append_eager_output_cb, &env.outputs); + env.fsms[env.fsm_count++] = fsm; + } + + /* don't bother checking combined behavior unless there's multiple DFAs */ + if (env.fsm_count < 2) { goto cleanup; } + + /* copy and combine fsms into one DFA */ + { + size_t used = 0; + struct fsm_union_entry entries[MAX_PATTERNS] = {0}; + + for (size_t i = 0; i < env.fsm_count; i++) { + /* there can be gaps, fsms[] lines up with patterns[] */ + if (env.fsms[i] == NULL) { continue; } + + fsm_state_t start; + if (!fsm_getstart(env.fsms[i], &start)) { + assert(!"hit"); + } + + struct fsm *cp = fsm_clone(env.fsms[i]); + assert(cp != NULL); + + if (verbose) { + fprintf(stderr, "==== cp %zd\n", i); + fsm_dump(stderr, cp); + fsm_eager_output_dump(stderr, cp); + fprintf(stderr, "====\n"); + + fsm_state_t c = fsm_countstates(cp); + for (fsm_state_t i = 0; i < c; i++) { + fprintf(stderr, "-- %d: end? %d\n", i, fsm_isend(cp, i)); + } + } + + entries[used].fsm = cp; + entries[used].anchored_start = anchorage[i].start; + entries[used].anchored_end = anchorage[i].end; + used++; + } + + if (used == 0) { + goto cleanup; /* nothing to do */ + } + + /* consumes entries[] */ + struct fsm *fsm = fsm_union_repeated_pattern_group(used, entries, NULL); + assert(fsm != NULL); + + if (verbose) { + fprintf(stderr, "==== combined (pre-det)\n"); + fsm_dump(stderr, fsm); + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } + + if (!fsm_determinise(fsm)) { + assert(!"failed to determinise"); + } + + if (!fsm_minimise(fsm)) { + assert(!"failed to minimise"); + } + + LOG("%s: combined state_count %d\n", __func__, fsm_countstates(fsm)); + env.combined = fsm; + /* fsm_eager_output_set_cb(fsm, append_eager_output_cb, &env.outputs_combined); */ + + if (verbose) { + fprintf(stderr, "==== combined\n"); + fsm_dump(stderr, env.combined); + fsm_eager_output_dump(stderr, env.combined); + fprintf(stderr, "====\n"); + } + + } + + /* Use 
fsm_generate_matches to check for matches that got lost + * and false positives introduced while combining the DFAs. + * Use the combined DFA to generate matches, check that the + * match behavior agrees with the individual DFA copies. */ + env.current_pattern = (size_t)-1; + if (!fsm_generate_matches(env.combined, max_pattern_length, gen_combined_check_individual_cb, &env)) { + goto cleanup; + } + + if (!env.ok) { goto cleanup; } + + /* Likewise, use every individual DFA to generate matches and */ + /* check behavior against the combined DFA. */ + for (size_t i = 0; i < env.pattern_count; i++) { + env.current_pattern = i; + if (!fsm_generate_matches(env.combined, max_pattern_length, gen_individual_check_combined_cb, &env)) { + goto cleanup; + } + } + + ret = env.ok ? EXIT_SUCCESS : EXIT_FAILURE; +cleanup: + for (size_t i = 0; i < MAX_PATTERNS; i++) { + if (env.patterns[i] != NULL) { + free(env.patterns[i]); + env.patterns[i] = NULL; + } + if (env.fsms[i] != NULL) { + fsm_free(env.fsms[i]); + } + } + if (env.combined != NULL) { + fsm_free(env.combined); + } + + return ret; +} + +static int +cmp_output_id(const void *pa, const void *pb) +{ + const fsm_output_id_t a = *(fsm_output_id_t *)pa; + const fsm_output_id_t b = *(fsm_output_id_t *)pb; + return a < b ? -1 : a > b ? 
1 : 0; +} + +static bool +match_input_get_eager_outputs(struct fsm *fsm, const char *input, size_t input_length, + struct eager_output_cb_info *dst) +{ + (void)input_length; + fsm_state_t end; + + reset_eager_output_info(dst); + + fsm_eager_output_set_cb(fsm, append_eager_output_cb, dst); + const int ret = fsm_exec(fsm, fsm_sgetc, &input, &end, NULL); + if (ret == 0) { + return false; /* no match */ + } else { + assert(ret == 1); /* match */ + } + + /* sort the IDs, to make comparison cheaper */ + qsort(dst->ids, dst->used, sizeof(dst->ids[0]), cmp_output_id); + return true; /* match */ +} + +/* For a given matching input generated by the combined DFA, check that + * only the expected individual source DFAs match. */ +static enum fsm_generate_matches_cb_res +gen_combined_check_individual_cb(const struct fsm *fsm, + size_t depth, size_t match_count, size_t steps, + const char *input, size_t input_length, + fsm_state_t end_state, void *opaque) +{ + (void)fsm; + (void)depth; + (void)end_state; + + struct feo_env *env = opaque; + assert(env->current_pattern == (size_t)-1); + + if (match_count > env->max_match_count) { return FSM_GENERATE_MATCHES_CB_RES_HALT; } + if (steps > env->max_steps) { return FSM_GENERATE_MATCHES_CB_RES_HALT; } + + /* execute, to set eager outputs */ + if (!match_input_get_eager_outputs(env->combined, input, input_length, &env->outputs_combined)) { + env->ok = false; + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + + size_t individual_outputs_used = 0; + fsm_output_id_t individual_outputs[MAX_PATTERNS]; + + for (size_t i = 0; i < env->pattern_count; i++) { + struct fsm *fsm = env->fsms[i]; + if (fsm == NULL) { continue; } + + if (!match_input_get_eager_outputs(fsm, input, input_length, &env->outputs)) { + env->ok = false; + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + + if (env->outputs.used > 0) { + assert(env->outputs.used == 1); + individual_outputs[individual_outputs_used++] = env->outputs.ids[0]; + } + } + + bool match = true; + if 
(env->outputs_combined.used != individual_outputs_used) { + match = false; + } + + for (size_t cmb_i = 0; cmb_i < env->outputs_combined.used; cmb_i++) { + const fsm_output_id_t cur = env->outputs_combined.ids[cmb_i]; + assert(env->fsms[cmb_i] != NULL); + bool found = false; + for (size_t i = 0; i < individual_outputs_used; i++) { + if (individual_outputs[i] == cur) { + found = true; + break; + } + } + if (!found) { + match = false; + break; + } + } + + if (!match) { + fprintf(stderr, "%s: combined <-> individual mismatch for input '%s'(%zd)!\n", __func__, input, input_length); + + fprintf(stderr, "-- combined: %zu IDs:", env->outputs_combined.used); + for (size_t cmb_i = 0; cmb_i < env->outputs_combined.used; cmb_i++) { + fprintf(stderr, " %d", env->outputs_combined.ids[cmb_i]); + } + fprintf(stderr, "\n"); + fprintf(stderr, "-- individiual: %zu IDs:", individual_outputs_used); + for (size_t i = 0; i < individual_outputs_used; i++) { + fprintf(stderr, " %d", individual_outputs[i]); + } + fprintf(stderr, "\n"); + goto fail; + } + + return FSM_GENERATE_MATCHES_CB_RES_CONTINUE; + +fail: + env->ok = false; + return FSM_GENERATE_MATCHES_CB_RES_HALT; +} + +/* For a given matching input generated by one of the source DFAs, check that + * the combined DFA also matches, and that the only other source DFAs that match + * are ones that should according to the combined DFA. 
*/ +static enum fsm_generate_matches_cb_res +gen_individual_check_combined_cb(const struct fsm *fsm, + size_t depth, size_t match_count, size_t steps, + const char *input, size_t input_length, + fsm_state_t end_state, void *opaque) +{ + (void)fsm; + (void)depth; + (void)end_state; + + struct feo_env *env = opaque; + assert(env->current_pattern < env->pattern_count); + if (match_count > env->max_match_count) { return FSM_GENERATE_MATCHES_CB_RES_HALT; } + if (steps > env->max_steps) { return FSM_GENERATE_MATCHES_CB_RES_HALT; } + + struct fsm *cur_fsm = env->fsms[env->current_pattern]; + if (cur_fsm == NULL) { return FSM_GENERATE_MATCHES_CB_RES_CONTINUE; } + + /* execute, to set eager outputs */ + if (!match_input_get_eager_outputs(cur_fsm, input, input_length, &env->outputs)) { + goto fail; + } + if (!match_input_get_eager_outputs(env->combined, input, input_length, &env->outputs_combined)) { + goto fail; + } + + assert(env->outputs.used == 1); + + bool found = false; + for (size_t i = 0; i < env->outputs_combined.used; i++) { + if (env->outputs_combined.ids[i] == env->outputs.ids[0]) { + found = true; + break; + } + } + + if (!found) { + fprintf(stderr, "%s: combined <-> individual mismatch for input '%s'(%zd)!\n", __func__, input, input_length); + + fprintf(stderr, "-- combined: %zu IDs:", env->outputs_combined.used); + for (size_t cmb_i = 0; cmb_i < env->outputs_combined.used; cmb_i++) { + fprintf(stderr, " %d", env->outputs_combined.ids[cmb_i]); + } + fprintf(stderr, "\n"); + fprintf(stderr, "-- pattern %zd: %zu IDs:", env->current_pattern, env->outputs.used); + for (size_t i = 0; i < env->outputs.used; i++) { + fprintf(stderr, " %d", env->outputs.ids[i]); + } + fprintf(stderr, "\n"); + goto fail; + } + + return FSM_GENERATE_MATCHES_CB_RES_CONTINUE; + +fail: + env->ok = false; + return FSM_GENERATE_MATCHES_CB_RES_HALT; +} +#undef MAX_PATTERNS + #define MAX_FUZZER_DATA (64 * 1024) static uint8_t data_buf[MAX_FUZZER_DATA + 1]; @@ -358,6 +871,7 @@ get_run_mode(void) 
switch (mode[0]) { case 'm': return MODE_SHUFFLE_MINIMISE; case 'p': return MODE_ALL_PRINT_FUNCTIONS; + case 'E': return MODE_EAGER_OUTPUT; case 'd': default: return MODE_DEFAULT; @@ -373,6 +887,11 @@ harness_fuzzer_target(const uint8_t *data, size_t size) return EXIT_SUCCESS; } + if (!verbosity_checked) { + verbosity_checked = true; + verbose = getenv("VERBOSE") != NULL; + } + /* Ensure that input is '\0'-terminated. */ if (size > MAX_FUZZER_DATA) { size = MAX_FUZZER_DATA; @@ -392,6 +911,9 @@ harness_fuzzer_target(const uint8_t *data, size_t size) case MODE_SHUFFLE_MINIMISE: return shuffle_minimise(pattern); + case MODE_EAGER_OUTPUT: + return fuzz_eager_output(data, size); + case MODE_ALL_PRINT_FUNCTIONS: { if (dev_null == NULL) { @@ -403,7 +925,7 @@ harness_fuzzer_target(const uint8_t *data, size_t size) const bool det = b0 & 0x1; const bool min = b0 & 0x2; const enum fsm_io io_mode = (b0 >> 2) % 3; - + const char *shifted_pattern = (const char *)&data_buf[1]; int res = fuzz_all_print_functions(dev_null, shifted_pattern, det, min, io_mode); return res; diff --git a/include/fsm/bool.h b/include/fsm/bool.h index d92518297..4d9f1889a 100644 --- a/include/fsm/bool.h +++ b/include/fsm/bool.h @@ -52,6 +52,16 @@ struct fsm * fsm_union_array(size_t fsm_count, struct fsm **fsms, struct fsm_combined_base_pair *bases); +struct fsm_union_entry { + struct fsm *fsm; + bool anchored_start; + bool anchored_end; +}; + +struct fsm * +fsm_union_repeated_pattern_group(size_t entry_count, + struct fsm_union_entry *entries, struct fsm_combined_base_pair *bases); + struct fsm * fsm_intersect(struct fsm *a, struct fsm *b); diff --git a/include/fsm/fsm.h b/include/fsm/fsm.h index 877d5c1bf..701efe70b 100644 --- a/include/fsm/fsm.h +++ b/include/fsm/fsm.h @@ -7,6 +7,7 @@ #ifndef FSM_H #define FSM_H +#include #include struct fsm; @@ -27,6 +28,9 @@ typedef unsigned int fsm_state_t; * original FSM(s) matched when executing a combined FSM. 
*/ typedef unsigned int fsm_end_id_t; +/* Eager output ID. */ +typedef unsigned int fsm_output_id_t; + #define FSM_END_ID_MAX UINT_MAX /* @@ -266,6 +270,39 @@ fsm_mapendids(struct fsm * fsm, fsm_endid_remap_fun remap, void *opaque); void fsm_increndids(struct fsm * fsm, int delta); +/* Associate an eagerly matched numeric ID with the end states in an fsm. + * + * This is similar to fsm_setendid, but has different performance + * trade-offs. In particular, it can become extremely expensive to + * combine multiple DFAs with endids on their end states when they + * represent regexes with unanchored ends, because the FSM has to + * explicitly represent all the possible combinations of matches by + * copying the entire path to every reachable end state. Eager endids + * are associated with the edge leaving the main pattern match. + * + * Returns 1 on success, 0 on error. + * */ +int +fsm_seteagerendid(struct fsm *fsm, fsm_end_id_t id); + +/* Set an eager output ID to emit every time the state is entered. + * This turns the automaton into a Moore machine. */ +int +fsm_seteageroutput(struct fsm *fsm, fsm_state_t state, fsm_output_id_t id); + +/* Set an eager output ID on all current end states. */ +int +fsm_seteageroutputonends(struct fsm *fsm, fsm_output_id_t id); + +/* HACK */ +typedef void +fsm_eager_output_cb(fsm_output_id_t id, void *opaque); +void +fsm_eager_output_set_cb(struct fsm *fsm, fsm_eager_output_cb *cb, void *opaque); + +void +fsm_eager_output_get_cb(const struct fsm *fsm, fsm_eager_output_cb **cb, void **opaque); + /* * Find the state (if there is just one), or add epsilon edges from all states, * for which the given predicate is true. 
@@ -436,6 +473,15 @@ fsm_shortest(const struct fsm *fsm, fsm_state_t start, fsm_state_t goal, unsigned (*cost)(fsm_state_t from, fsm_state_t to, char c)); +/* HACK */ +typedef void +fsm_eager_endid_cb(fsm_end_id_t id, void *opaque); +void +fsm_eager_endid_set_cb(struct fsm *fsm, fsm_eager_endid_cb *cb, void *opaque); + +void +fsm_eager_endid_get_cb(const struct fsm *fsm, fsm_eager_endid_cb **cb, void **opaque); + /* * Execute an FSM reading input from the user-specified callback fsm_getc(). * fsm_getc() is passed the opaque pointer given, and is expected to return diff --git a/include/fsm/print.h b/include/fsm/print.h index 9f7264e81..10244129b 100644 --- a/include/fsm/print.h +++ b/include/fsm/print.h @@ -45,6 +45,9 @@ enum fsm_print_lang { struct fsm_state_metadata { const fsm_end_id_t *end_ids; size_t end_id_count; + + const fsm_output_id_t *eager_output_ids; + size_t eager_output_count; }; /* diff --git a/include/re/re.h b/include/re/re.h index 20408e98a..a3e1f7e0c 100644 --- a/include/re/re.h +++ b/include/re/re.h @@ -136,6 +136,21 @@ re_comp(enum re_dialect dialect, const struct fsm_alloc *alloc, enum re_flags flags, struct re_err *err); +struct re_anchoring_info { + int start; + int end; + /* FIXME: this could also check for AST_FLAG_NULLABLE, AST_FLAG_UNSATISFIABLE, + * AST_FLAG_ALWAYS_CONSUMES, AST_FLAG_CAN_CONSUME */ +}; + +/* Parse and analyze the regex enough to determine whether it is + * anchored at the start and/or end. Returns 0 if the regex is not + * supported, otherwise returns 1 and writes anchoring flags into *info. */ +int +re_is_anchored(enum re_dialect dialect, re_getchar_fun *f, void *opaque, + enum re_flags flags, struct re_err *err, + struct re_anchoring_info *info); + /* * Return a human-readable string describing a given error code. The string * returned has static storage, and must not be freed. 
diff --git a/src/libfsm/Makefile b/src/libfsm/Makefile index 5e2ed57e3..c7782f0ff 100644 --- a/src/libfsm/Makefile +++ b/src/libfsm/Makefile @@ -8,6 +8,7 @@ SRC += src/libfsm/consolidate.c SRC += src/libfsm/clone.c SRC += src/libfsm/closure.c SRC += src/libfsm/detect_required.c +SRC += src/libfsm/eager_output.c SRC += src/libfsm/edge.c SRC += src/libfsm/empty.c SRC += src/libfsm/end.c diff --git a/src/libfsm/clone.c b/src/libfsm/clone.c index 9fd236a4d..2161599ae 100644 --- a/src/libfsm/clone.c +++ b/src/libfsm/clone.c @@ -19,6 +19,7 @@ #include "internal.h" #include "capture.h" #include "endids.h" +#include "eager_output.h" #define LOG_CLONE_ENDIDS 0 @@ -28,6 +29,9 @@ copy_capture_actions(struct fsm *dst, const struct fsm *src); static int copy_end_ids(struct fsm *dst, const struct fsm *src); +static int +copy_eager_output_ids(struct fsm *dst, const struct fsm *src); + struct fsm * fsm_clone(const struct fsm *fsm) { @@ -80,6 +84,12 @@ fsm_clone(const struct fsm *fsm) fsm_free(new); return NULL; } + + /* does not copy callback */ + if (!copy_eager_output_ids(new, fsm)) { + fsm_free(new); + return NULL; + } } return new; @@ -159,3 +169,31 @@ copy_end_ids(struct fsm *dst, const struct fsm *src) return env.ok; } + +struct copy_eager_output_ids_env { + bool ok; + struct fsm *dst; +}; + +static int +copy_eager_output_ids_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + struct copy_eager_output_ids_env *env = opaque; + if (!fsm_seteageroutput(env->dst, state, id)) { + env->ok = false; + return 0; + } + + return 1; +} + +static int +copy_eager_output_ids(struct fsm *dst, const struct fsm *src) +{ + struct copy_eager_output_ids_env env; + env.dst = dst; + env.ok = true; + + fsm_eager_output_iter_all(src, copy_eager_output_ids_cb, &env); + return env.ok; +} diff --git a/src/libfsm/consolidate.c b/src/libfsm/consolidate.c index 236a4f6f5..b7a8905b2 100644 --- a/src/libfsm/consolidate.c +++ b/src/libfsm/consolidate.c @@ -25,6 +25,7 @@ #include "internal.h" 
#include "capture.h" #include "endids.h" +#include "eager_output.h" #define LOG_MAPPING 0 #define LOG_CONSOLIDATE_CAPTURES 0 @@ -53,6 +54,10 @@ static int consolidate_end_ids(struct fsm *dst, const struct fsm *src, const fsm_state_t *mapping, size_t mapping_count); +static int +consolidate_eager_output_ids(struct fsm *dst, const struct fsm *src, + const fsm_state_t *mapping, size_t mapping_count); + static fsm_state_t mapping_cb(fsm_state_t id, const void *opaque) { @@ -154,6 +159,10 @@ fsm_consolidate(const struct fsm *src, } } + if (!consolidate_eager_output_ids(dst, src, mapping, mapping_count)) { + goto cleanup; + } + f_free(src->alloc, seen); return dst; @@ -270,3 +279,40 @@ consolidate_end_ids(struct fsm *dst, const struct fsm *src, return ret; } + +struct consolidate_eager_output_ids_env { + bool ok; + struct fsm *dst; + const fsm_state_t *mapping; + size_t mapping_count; +}; + +static int +consolidate_eager_output_ids_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + struct consolidate_eager_output_ids_env *env = opaque; + assert(state < env->mapping_count); + const fsm_state_t dst_state = env->mapping[state]; + + if (!fsm_seteageroutput(env->dst, dst_state, id)) { + env->ok = false; + return 0; + } + + return 1; +} + +static int +consolidate_eager_output_ids(struct fsm *dst, const struct fsm *src, + const fsm_state_t *mapping, size_t mapping_count) +{ + struct consolidate_eager_output_ids_env env = { + .ok = true, + .dst = dst, + .mapping = mapping, + .mapping_count = mapping_count, + }; + fsm_eager_output_iter_all(src, consolidate_eager_output_ids_cb, &env); + return env.ok; +} + diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c index 42992b6bc..8978ce06c 100644 --- a/src/libfsm/determinise.c +++ b/src/libfsm/determinise.c @@ -6,6 +6,9 @@ #include "determinise_internal.h" +#include +#include + static void dump_labels(FILE *f, const uint64_t labels[4]) { @@ -253,6 +256,10 @@ fsm_determinise(struct fsm *nfa) goto cleanup; } + if 
(!remap_eager_outputs(&map, issp, dfa, nfa)) { + goto cleanup; + } + fsm_move(nfa, dfa); } @@ -334,6 +341,22 @@ add_reverse_mapping(const struct fsm_alloc *alloc, return 1; } +static void +free_reverse_mappings(const struct fsm_alloc *alloc, size_t map_count, struct reverse_mapping *rmaps) +{ + if (rmaps == NULL) { return; } + + for (size_t map_i = 0; map_i < map_count; map_i++) { + struct reverse_mapping *rmap = &rmaps[map_i]; + /* Each mapping owns a single list (possibly NULL): free that one + * allocation; it is not an array of per-entry lists. */ + if (rmap->list != NULL) { f_free(alloc, rmap->list); } + rmap->list = NULL; + rmap->count = 0; + } + f_free(alloc, rmaps); +} + static int det_copy_capture_actions_cb(fsm_state_t state, enum capture_action_type type, unsigned capture_id, fsm_state_t to, @@ -405,7 +428,7 @@ hash_iss(interned_state_set_id iss) } static struct mapping * -map_first(struct map *map, struct map_iter *iter) +map_first(const struct map *map, struct map_iter *iter) { iter->m = map; iter->i = 0; @@ -641,22 +664,14 @@ stack_pop(struct mappingstack *stack) return item; } -static int -remap_capture_actions(struct map *map, struct interned_state_set_pool *issp, - struct fsm *dst_dfa, struct fsm *src_nfa) +static struct reverse_mapping * +build_reverse_mappings(const struct map *map, struct interned_state_set_pool *issp, + struct fsm *dst_dfa, const struct fsm *src_nfa) { + struct reverse_mapping *reverse_mappings = NULL; struct map_iter it; struct state_iter si; struct mapping *m; - struct reverse_mapping *reverse_mappings; - fsm_state_t state; - const size_t capture_count = fsm_countcaptures(src_nfa); - size_t i, j; - int res = 0; - - if (capture_count == 0) { - return 1; - } /* This is not 1 to 1 -- if state X is now represented by multiple * states Y in the DFA, and state X has action(s) when transitioning @@ -667,9 +682,7 @@ remap_capture_actions(struct map *map, struct interned_state_set_pool *issp, * checking reachability from every X, but the actual path * handling later will also check reachability.
*/ reverse_mappings = f_calloc(dst_dfa->alloc, src_nfa->statecount, sizeof(reverse_mappings[0])); - if (reverse_mappings == NULL) { - return 0; - } + if (reverse_mappings == NULL) { goto cleanup; } /* build reverse mappings table: for every NFA state X, if X is part * of the new DFA state Y, then add Y to a list for X */ @@ -679,6 +692,7 @@ remap_capture_actions(struct map *map, struct interned_state_set_pool *issp, assert(m->dfastate < dst_dfa->statecount); ss = interned_state_set_get_state_set(issp, iss_id); + fsm_state_t state; for (state_set_reset(ss, &si); state_set_next(&si, &state); ) { if (!add_reverse_mapping(dst_dfa->alloc, reverse_mappings, @@ -688,33 +702,47 @@ remap_capture_actions(struct map *map, struct interned_state_set_pool *issp, } } -#if LOG_DETERMINISE_CAPTURES +#if LOG_BUILD_REVERSE_MAPPING fprintf(stderr, "#### reverse mapping for %zu states\n", src_nfa->statecount); - for (i = 0; i < src_nfa->statecount; i++) { + for (size_t i = 0; i < src_nfa->statecount; i++) { struct reverse_mapping *rm = &reverse_mappings[i]; fprintf(stderr, "%lu:", i); - for (j = 0; j < rm->count; j++) { + for (size_t j = 0; j < rm->count; j++) { fprintf(stderr, " %u", rm->list[j]); } fprintf(stderr, "\n"); } -#else - (void)j; #endif + return reverse_mappings; + +cleanup: + free_reverse_mappings(dst_dfa->alloc, src_nfa->statecount, reverse_mappings); + return NULL; +} + +static int +remap_capture_actions(struct map *map, struct interned_state_set_pool *issp, + struct fsm *dst_dfa, struct fsm *src_nfa) +{ + const size_t capture_count = fsm_countcaptures(src_nfa); + int res = 0; + + if (capture_count == 0) { + return 1; + } + + struct reverse_mapping *reverse_mappings = build_reverse_mappings(map, issp, dst_dfa, src_nfa); + if (reverse_mappings == NULL) { goto cleanup; } + if (!det_copy_capture_actions(reverse_mappings, dst_dfa, src_nfa)) { goto cleanup; } res = 1; -cleanup: - for (i = 0; i < src_nfa->statecount; i++) { - if (reverse_mappings[i].list != NULL) { - 
f_free(dst_dfa->alloc, reverse_mappings[i].list); - } - } - f_free(dst_dfa->alloc, reverse_mappings); +cleanup: + free_reverse_mappings(dst_dfa->alloc, src_nfa->statecount, reverse_mappings); return res; } @@ -2528,3 +2556,50 @@ analyze_closures__grow_outputs(struct analyze_closures_env *env) env->output_ceil = nceil; return 1; } + +struct remap_eager_output_env { + bool ok; + struct fsm *dst; + fsm_state_t dst_state; +}; + +static int +remap_eager_output_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + (void)state; + struct remap_eager_output_env *env = opaque; + if (!fsm_seteageroutput(env->dst, env->dst_state, id)) { + env->ok = false; + return 0; + } + + return 1; +} + +static int +remap_eager_outputs(const struct map *map, struct interned_state_set_pool *issp, + struct fsm *dst_dfa, const struct fsm *src_nfa) +{ + /* For each DFA state, get the set of NFA states corresponding to it from the + * map and issp, then copy every eager output ID over. */ + struct map_iter iter; + for (struct mapping *b = map_first(map, &iter); b != NULL; b = map_next(&iter)) { + struct state_set *ss = interned_state_set_get_state_set(issp, b->iss); + assert(ss != NULL); + + struct state_iter it; + fsm_state_t s; + state_set_reset(ss, &it); + while (state_set_next(&it, &s)) { + struct remap_eager_output_env env = { + .ok = true, + .dst = dst_dfa, + .dst_state = b->dfastate, + }; + fsm_eager_output_iter_state(src_nfa, s, remap_eager_output_cb, &env); + if (!env.ok) { return 0; } + } + } + + return 1; +} diff --git a/src/libfsm/determinise_internal.h b/src/libfsm/determinise_internal.h index cfd4ea663..2e925d28c 100644 --- a/src/libfsm/determinise_internal.h +++ b/src/libfsm/determinise_internal.h @@ -23,6 +23,7 @@ #include "internal.h" #include "capture.h" #include "endids.h" +#include "eager_output.h" #include @@ -35,6 +36,7 @@ #define LOG_AC 0 #define LOG_GROUPING 0 #define LOG_ANALYSIS_STATS 0 +#define LOG_BUILD_REVERSE_MAPPING 0 #if LOG_DETERMINISE_CAPTURES || 
LOG_INPUT #include @@ -72,7 +74,7 @@ struct map { }; struct map_iter { - struct map *m; + const struct map *m; size_t i; }; @@ -304,7 +306,7 @@ static void map_free(struct map *map); static struct mapping * -map_first(struct map *map, struct map_iter *iter); +map_first(const struct map *map, struct map_iter *iter); static struct mapping * map_next(struct map_iter *iter); @@ -325,6 +327,10 @@ static int remap_capture_actions(struct map *map, struct interned_state_set_pool *issp, struct fsm *dst_dfa, struct fsm *src_nfa); +static int +remap_eager_outputs(const struct map *map, struct interned_state_set_pool *issp, + struct fsm *dst_dfa, const struct fsm *src_nfa); + static struct mappingstack * stack_init(const struct fsm_alloc *alloc); diff --git a/src/libfsm/eager_output.c b/src/libfsm/eager_output.c new file mode 100644 index 000000000..e37a8a4bf --- /dev/null +++ b/src/libfsm/eager_output.c @@ -0,0 +1,403 @@ +/* + * Copyright 2024 Scott Vokes + * + * See LICENCE for the full copyright terms. 
+ */ + +#include +#include + +#include "internal.h" + +#include +#include + +#include +#include +#include + +#include "eager_output.h" + +#define LOG_LEVEL 0 + +/* must be a power of 2 */ +#define DEF_BUCKET_COUNT 4 +#define DEF_ENTRY_CEIL 2 + +struct eager_output_info { + fsm_eager_output_cb *cb; + void *opaque; + + struct eager_output_htab { + size_t bucket_count; + size_t buckets_used; + /* empty if entry is NULL, otherwise keyed by state */ + struct eager_output_bucket { + fsm_state_t state; + struct eager_output_entry { + unsigned used; + unsigned ceil; + fsm_end_id_t ids[]; + } *entry; + } *buckets; + } htab; +}; + +void +fsm_eager_output_set_cb(struct fsm *fsm, fsm_eager_output_cb *cb, void *opaque) +{ +#if LOG_LEVEL > 2 + fprintf(stderr, "-- fsm_eager_output_set_cb %p\n", (void *)fsm); +#endif + assert(fsm != NULL); + assert(fsm->eager_output_info != NULL); + fsm->eager_output_info->cb = cb; + fsm->eager_output_info->opaque = opaque; +} + +void +fsm_eager_output_get_cb(const struct fsm *fsm, fsm_eager_output_cb **cb, void **opaque) +{ + *cb = fsm->eager_output_info->cb; + *opaque = fsm->eager_output_info->opaque; +} + +int +fsm_eager_output_init(struct fsm *fsm) +{ + struct eager_output_info *ei = f_calloc(fsm->alloc, 1, sizeof(*ei)); + + if (ei == NULL) { return 0; } + + struct eager_output_bucket *buckets = f_calloc(fsm->alloc, + DEF_BUCKET_COUNT, sizeof(buckets[0])); + if (buckets == NULL) { + f_free(fsm->alloc, ei); + return 0; + } + +#if LOG_LEVEL > 2 + fprintf(stderr, "-- fsm_eager_output_init %p\n", (void *)fsm); +#endif + + ei->htab.buckets = buckets; + ei->htab.bucket_count = DEF_BUCKET_COUNT; + + fsm->eager_output_info = ei; + return 1; +} + +void +fsm_eager_output_free(struct fsm *fsm) +{ + if (fsm == NULL || fsm->eager_output_info == NULL) { return; } + + for (size_t i = 0; i < fsm->eager_output_info->htab.bucket_count; i++) { + struct eager_output_bucket *b = &fsm->eager_output_info->htab.buckets[i]; + if (b->entry == NULL) { continue; } + 
f_free(fsm->alloc, b->entry); + } + f_free(fsm->alloc, fsm->eager_output_info->htab.buckets); + + f_free(fsm->alloc, fsm->eager_output_info); +#if LOG_LEVEL > 2 + fprintf(stderr, "-- fsm_eager_output_free %p\n", (void *)fsm); +#endif + fsm->eager_output_info = NULL; +} + +int +fsm_seteageroutputonends(struct fsm *fsm, fsm_output_id_t id) +{ + assert(fsm != NULL); + const size_t count = fsm_countstates(fsm); + for (size_t i = 0; i < count; i++) { + if (fsm_isend(fsm, i)) { + if (!fsm_seteageroutput(fsm, i, id)) { return 0; } + } + } + return 1; +} + +static bool +grow_htab(const struct fsm_alloc *alloc, struct eager_output_htab *htab) +{ + const size_t nbucket_count = 2*htab->bucket_count; + assert(nbucket_count != 0); + + struct eager_output_bucket *nbuckets = f_calloc(alloc, nbucket_count, + sizeof(nbuckets[0])); + if (nbuckets == NULL) { return false; } + + const uint64_t nmask = nbucket_count - 1; + assert((nmask & nbucket_count) == 0); /* power of 2 */ + + for (size_t ob_i = 0; ob_i < htab->bucket_count; ob_i++) { + struct eager_output_bucket *ob = &htab->buckets[ob_i]; + if (ob->entry == NULL) { continue; } + + const uint64_t hash = hash_id(ob->state); + for (size_t probes = 0; probes < nbucket_count; probes++) { + const size_t nb_i = (hash + probes) & nmask; + struct eager_output_bucket *nb = &nbuckets[nb_i]; + if (nb->entry == NULL) { + nb->state = ob->state; + nb->entry = ob->entry; + break; + } else { + assert(nb->state != ob->state); + } + } + } + + f_free(alloc, htab->buckets); + htab->bucket_count = nbucket_count; + htab->buckets = nbuckets; + return true; +} + +int +fsm_seteageroutput(struct fsm *fsm, fsm_state_t state, fsm_output_id_t id) +{ + assert(fsm != NULL); + + struct eager_output_info *info = fsm->eager_output_info; + assert(info->htab.bucket_count > 0); + + if (info->htab.buckets_used >= info->htab.bucket_count/2) { + if (!grow_htab(fsm->alloc, &info->htab)) { return 0; } + } + + const uint64_t hash = hash_id(state); + const uint64_t mask = 
info->htab.bucket_count - 1; + assert((mask & info->htab.bucket_count) == 0); /* power of 2 */ + + /* fprintf(stderr, "%s: bucket_count %zd\n", __func__, info->htab.bucket_count); */ + for (size_t probes = 0; probes < info->htab.bucket_count; probes++) { + const size_t b_i = (hash + probes) & mask; + struct eager_output_bucket *b = &info->htab.buckets[b_i]; + /* fprintf(stderr, "%s: state %d -> b_i %zd, state %d, entry %p\n", */ + /* __func__, state, b_i, b->state, (void *)b->entry); */ + struct eager_output_entry *e = b->entry; + if (e == NULL) { /* empty */ + /* add */ + const size_t alloc_sz = sizeof(*e) + + DEF_ENTRY_CEIL * sizeof(e->ids[0]); + e = f_calloc(fsm->alloc, 1, alloc_sz); + if (e == NULL) { + return 0; + } + e->ceil = DEF_ENTRY_CEIL; + b->state = state; + b->entry = e; + info->htab.buckets_used++; + /* fprintf(stderr, "%s: buckets_used %zd\n", __func__, info->htab.buckets_used); */ + /* fprintf(stderr, "%s: saved new entry in bucket %zd\n", __func__, b_i); */ + } else if (b->state != state) { /* collision */ + continue; + } + + if (e->used == e->ceil) { + const size_t nceil = 2 * e->ceil; + const size_t nsize = sizeof(*e) + + nceil * sizeof(e->ids[0]); + struct eager_output_entry *nentry = f_realloc(fsm->alloc, e, nsize); + if (nentry == NULL) { return 0; } + nentry->ceil = nceil; + b->entry = nentry; + e = b->entry; + } + + /* ignore duplicates */ + for (size_t i = 0; i < e->used; i++) { + if (e->ids[i] == id) { return 1; } + } + + e->ids[e->used++] = id; + /* fprintf(stderr, "%s: e->ids_used %u\n", __func__, e->used); */ + fsm->states[state].has_eager_outputs = 1; + return 1; + } + + return 1; +} + +bool +fsm_eager_output_has_eager_output(const struct fsm *fsm) +{ + assert(fsm->eager_output_info != NULL); + const struct eager_output_htab *htab = &fsm->eager_output_info->htab; + + for (size_t b_i = 0; b_i < htab->bucket_count; b_i++) { + struct eager_output_bucket *b = &htab->buckets[b_i]; + if (b->entry == NULL) { continue; } + if (b->entry->used > 
0) { return 1; } + } + return 0; +} + +bool +fsm_eager_output_state_has_eager_output(const struct fsm *fsm, fsm_state_t state) +{ + assert(state < fsm->statecount); + return fsm->states[state].has_eager_outputs; +} + +void +fsm_eager_output_iter_state(const struct fsm *fsm, + fsm_state_t state, fsm_eager_output_iter_cb *cb, void *opaque) +{ + assert(fsm != NULL); + assert(cb != NULL); + + const uint64_t hash = hash_id(state); + + struct eager_output_info *info = fsm->eager_output_info; + const uint64_t mask = info->htab.bucket_count - 1; + assert((mask & info->htab.bucket_count) == 0); /* power of 2 */ + + for (size_t probes = 0; probes < info->htab.bucket_count; probes++) { + const size_t b_i = (hash + probes) & mask; + struct eager_output_bucket *b = &info->htab.buckets[b_i]; + /* fprintf(stderr, "%s: state %d -> b_i %zd, state %d, entry %p\n", */ + /* __func__, state, b_i, b->state, (void *)b->entry); */ + struct eager_output_entry *e = b->entry; + if (e == NULL) { /* empty */ + return; + } else if (b->state != state) { /* collision */ + continue; + } + + assert(e->used == 0 || fsm->states[state].has_eager_outputs); + + for (size_t i = 0; i < e->used; i++) { + if (!cb(state, e->ids[i], opaque)) { return; } + } + } +} + +void +fsm_eager_output_iter_all(const struct fsm *fsm, + fsm_eager_output_iter_cb *cb, void *opaque) +{ + assert(fsm != NULL); + assert(cb != NULL); + assert(fsm->eager_output_info != NULL); + + struct eager_output_info *info = fsm->eager_output_info; + + /* fprintf(stderr, "%s: bucket_count %zd\n", __func__, info->htab.bucket_count); */ + for (size_t b_i = 0; b_i < info->htab.bucket_count; b_i++) { + struct eager_output_bucket *b = &info->htab.buckets[b_i]; + struct eager_output_entry *e = b->entry; + /* fprintf(stderr, "%s: b_i %zd, state %d, entry %p\n", */ + /* __func__, b_i, b->state, (void *)b->entry); */ + if (e == NULL) { /* empty */ + continue; + } + assert(e->used == 0 || fsm->states[b->state].has_eager_outputs); + + for (size_t i = 0; 
i < e->used; i++) { + if (!cb(b->state, e->ids[i], opaque)) { return; } + } + } +} + +struct dump_env { + FILE *f; + size_t count; +}; + +static int +dump_cb(fsm_state_t state, fsm_end_id_t id, void *opaque) + +{ + struct dump_env *env = opaque; + fprintf(env->f, "-- %d: id %d\n", state, id); + env->count++; + return 1; +} + +void +fsm_eager_output_dump(FILE *f, const struct fsm *fsm) +{ + struct dump_env env = { .f = f }; + fprintf(f, "%s:\n", __func__); + fsm_eager_output_iter_all(fsm, dump_cb, (void *)&env); + fprintf(f, "== %zu total\n", env.count); +} + +static int +inc_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + (void)state; + (void)id; + size_t *count = opaque; + (*count)++; + return 1; +} + +bool +fsm_eager_output_has_any(const struct fsm *fsm, + fsm_state_t state, size_t *count) +{ + size_t c = 0; + fsm_eager_output_iter_state(fsm, state, &inc_cb, &c); + if (count != NULL) { *count = c; } + return c > 0; +} + +int +fsm_eager_output_compact(struct fsm *fsm, fsm_state_t *mapping, size_t mapping_count) +{ + /* Don't reallocate unless something has actually changed. 
*/ + bool changes = false; + for (size_t i = 0; i < mapping_count; i++) { + if (mapping[i] != i) { + changes = true; + break; + } + } + + /* nothing to do */ + if (!changes) { return 1; } + + struct eager_output_info *eoi = fsm->eager_output_info; + + struct eager_output_bucket *nbuckets = f_calloc(fsm->alloc, + eoi->htab.bucket_count, sizeof(nbuckets[0])); + if (nbuckets == NULL) { + return 0; + } + + const uint64_t mask = eoi->htab.bucket_count - 1; + assert((eoi->htab.bucket_count & mask) == 0); + + for (size_t ob_i = 0; ob_i < eoi->htab.bucket_count; ob_i++) { + const struct eager_output_bucket *ob = &eoi->htab.buckets[ob_i]; + if (ob->entry == NULL) { continue; } + + assert(ob->state < mapping_count); + const fsm_state_t nstate = mapping[ob->state]; + if (nstate == FSM_STATE_REMAP_NO_STATE) { /* state was removed: free its entry instead of leaking it */ + f_free(fsm->alloc, ob->entry); eoi->htab.buckets_used--; continue; } + const uint64_t hash = hash_id(nstate); + + bool placed = false; + for (size_t probes = 0; probes < eoi->htab.bucket_count; probes++) { + const size_t nb_i = (hash + probes) & mask; + struct eager_output_bucket *nb = &nbuckets[nb_i]; + if (nb->entry == NULL) { + nb->state = nstate; + nb->entry = ob->entry; + placed = true; + break; + } + } + assert(placed); + } + + f_free(fsm->alloc, eoi->htab.buckets); + eoi->htab.buckets = nbuckets; + return 1; +} diff --git a/src/libfsm/eager_output.h b/src/libfsm/eager_output.h new file mode 100644 index 000000000..1b48ba4c4 --- /dev/null +++ b/src/libfsm/eager_output.h @@ -0,0 +1,46 @@ +#ifndef EAGER_OUTPUT_H +#define EAGER_OUTPUT_H + +#include +#include +#include + +struct eager_output_info; + +int +fsm_eager_output_init(struct fsm *fsm); + +void +fsm_eager_output_free(struct fsm *fsm); + +bool +fsm_eager_output_has_eager_output(const struct fsm *fsm); + +bool +fsm_eager_output_state_has_eager_output(const struct fsm *fsm, fsm_state_t state); + +void +fsm_eager_output_dump(FILE *f, const struct fsm *fsm); + +/* Callback for fsm_eager_output_iter_*. + * The return value indicates whether iteration should continue.
+ * The results may not be sorted in any particular order. */ +typedef int +fsm_eager_output_iter_cb(fsm_state_t state, fsm_output_id_t id, void *opaque); + +void +fsm_eager_output_iter_state(const struct fsm *fsm, + fsm_state_t state, fsm_eager_output_iter_cb *cb, void *opaque); + +void +fsm_eager_output_iter_all(const struct fsm *fsm, + fsm_eager_output_iter_cb *cb, void *opaque); + +bool +fsm_eager_output_has_any(const struct fsm *fsm, + fsm_state_t state, size_t *count); + +int +fsm_eager_output_compact(struct fsm *fsm, fsm_state_t *mapping, size_t mapping_count); + +#endif diff --git a/src/libfsm/epsilons.c b/src/libfsm/epsilons.c index 9394a2d9b..adfcdec2a 100644 --- a/src/libfsm/epsilons.c +++ b/src/libfsm/epsilons.c @@ -9,24 +9,42 @@ #include #include #include +#include #include #include +#include #include #include #include #include +#include #include "internal.h" #include "capture.h" #include "endids.h" +#include "eager_output.h" #define DUMP_EPSILON_CLOSURES 0 #define DEF_PENDING_CAPTURE_ACTIONS_CEIL 2 #define LOG_RM_EPSILONS_CAPTURES 0 #define DEF_CARRY_ENDIDS_COUNT 2 +#define LOG_LEVEL 0 + +#if LOG_LEVEL > 0 +static bool log_it; +#define LOG(LVL, ...) \ + do { \ + if (log_it && LVL <= LOG_LEVEL) { \ + fprintf(stderr, __VA_ARGS__); \ + } \ + } while (0) +#else +#define LOG(_LVL, ...) +#endif + struct remap_env { #ifndef NDEBUG char tag; @@ -57,6 +75,49 @@ static int carry_endids(struct fsm *fsm, struct state_set *states, fsm_state_t s); +static void +mark_states_reachable_by_label(const struct fsm *nfa, uint64_t *reachable_by_label); + +struct eager_output_buf { +#define DEF_EAGER_OUTPUT_BUF_CEIL 8 + bool ok; + const struct fsm_alloc *alloc; + size_t ceil; + size_t used; + fsm_output_id_t *ids; +}; + +static bool +append_eager_output_id(struct eager_output_buf *buf, fsm_output_id_t id) +{ + if (buf->used == buf->ceil) { + const size_t nceil = buf->ceil == 0 ? 
DEF_EAGER_OUTPUT_BUF_CEIL : 2*buf->ceil; + fsm_output_id_t *nids = f_realloc(buf->alloc, buf->ids, nceil * sizeof(nids[0])); + if (nids == NULL) { + buf->ok = false; + return false; + } + buf->ids = nids; + buf->ceil = nceil; + } + + for (size_t i = 0; i < buf->used; i++) { + /* avoid duplicates */ + if (buf->ids[i] == id) { return true; } + } + + buf->ids[buf->used++] = id; + return true; +} + +static int +collect_eager_output_ids_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + (void)state; + struct eager_output_buf *buf = opaque; + return append_eager_output_id(buf, id) ? 1 : 0; +} + int fsm_remove_epsilons(struct fsm *nfa) { @@ -64,9 +125,20 @@ fsm_remove_epsilons(struct fsm *nfa) int res = 0; struct state_set **eclosures = NULL; fsm_state_t s; + struct eager_output_buf eager_output_buf = { + .ok = true, + .alloc = nfa->alloc, + }; + uint64_t *reachable_by_label = NULL; + + LOG(2, "%s: starting\n", __func__); INIT_TIMERS(); +#if LOG_LEVEL > 0 + log_it = getenv("LOG") != NULL; +#endif + assert(nfa != NULL); TIME(&pre); @@ -94,6 +166,17 @@ fsm_remove_epsilons(struct fsm *nfa) } #endif + const size_t state_words = u64bitset_words(state_count); + reachable_by_label = f_calloc(nfa->alloc, state_words, sizeof(reachable_by_label[0])); + if (reachable_by_label == NULL) { goto cleanup; } + + mark_states_reachable_by_label(nfa, reachable_by_label); + + fsm_state_t start; + if (!fsm_getstart(nfa, &start)) { + goto cleanup; /* no start state */ + } + for (s = 0; s < state_count; s++) { struct state_iter si; fsm_state_t es_id; @@ -101,6 +184,12 @@ fsm_remove_epsilons(struct fsm *nfa) struct edge_group_iter egi; struct edge_group_iter_info info; + /* If the state isn't reachable by a label and isn't the start state, + * skip processing -- it will soon become garbage. */ + if (!u64bitset_get(reachable_by_label, s) && s != start) { + continue; + } + /* Process the epsilon closure. 
*/ state_set_reset(eclosures[s], &si); while (state_set_next(&si, &es_id)) { @@ -129,6 +218,16 @@ fsm_remove_epsilons(struct fsm *nfa) } } + /* Collect every eager output ID from any state + * in the current state's epsilon closure to the + * current state. These will be added at the end. */ + { + if (fsm_eager_output_has_any(nfa, es_id, NULL)) { + fsm_eager_output_iter_state(nfa, es_id, collect_eager_output_ids_cb, &eager_output_buf); + if (!eager_output_buf.ok) { goto cleanup; } + } + } + /* For every state in this state's transitive * epsilon closure, add all of their sets of * labeled edges. */ @@ -144,6 +243,13 @@ fsm_remove_epsilons(struct fsm *nfa) } } } + + for (size_t i = 0; i < eager_output_buf.used; i++) { + if (!fsm_seteageroutput(nfa, s, eager_output_buf.ids[i])) { + goto cleanup; + } + } + eager_output_buf.used = 0; /* clear */ } /* Remove the epsilon-edge state sets from everything. @@ -170,13 +276,53 @@ fsm_remove_epsilons(struct fsm *nfa) res = 1; cleanup: + LOG(2, "%s: finishing\n", __func__); if (eclosures != NULL) { closure_free(nfa, eclosures, state_count); } + f_free(nfa->alloc, reachable_by_label); + f_free(nfa->alloc, eager_output_buf.ids); return res; } +/* For every state, mark every state reached by a labeled edge as + * reachable. This doesn't check that the FROM state is reachable from + * the start state (trim will do that soon enough), it's just used to + * check which states will become unreachable once epsilon edges are + * removed. We don't need to add eager endids for them, because they + * will soon be disconnected from the epsilon-free NFA. 
*/ +static void +mark_states_reachable_by_label(const struct fsm *nfa, uint64_t *reachable_by_label) +{ + fsm_state_t start; + if (!fsm_getstart(nfa, &start)) { + return; /* nothing reachable */ + } + u64bitset_set(reachable_by_label, start); + + const fsm_state_t state_count = fsm_countstates(nfa); + + for (size_t s_i = 0; s_i < state_count; s_i++) { + struct edge_group_iter egi; + struct edge_group_iter_info info; + + struct fsm_state *s = &nfa->states[s_i]; + + /* Clear the visited flag, it will be used to avoid cycles. */ +#if 1 + assert(s->visited == 0); /* stale */ +#endif + s->visited = 0; + + edge_set_group_iter_reset(s->edges, EDGE_GROUP_ITER_ALL, &egi); + while (edge_set_group_iter_next(&egi, &info)) { + LOG(1, "%s: reachable: %d\n", __func__, info.to); + u64bitset_set(reachable_by_label, info.to); + } + } +} + static int remap_capture_actions(struct fsm *nfa, struct state_set **eclosures) { @@ -425,4 +571,3 @@ carry_endids(struct fsm *fsm, struct state_set *states, return env.ok; } - diff --git a/src/libfsm/exec.c b/src/libfsm/exec.c index 9f7b21802..077494b8f 100644 --- a/src/libfsm/exec.c +++ b/src/libfsm/exec.c @@ -20,9 +20,12 @@ #include "internal.h" #include "capture.h" +#include "eager_output.h" #define LOG_EXEC 0 +#define LOG_EAGER 0 + static int transition(const struct fsm *fsm, fsm_state_t state, int c, size_t offset, struct fsm_capture *captures, @@ -43,6 +46,44 @@ transition(const struct fsm *fsm, fsm_state_t state, int c, return 1; } +struct check_eager_outputs_for_state_env { + const struct fsm *fsm; + fsm_eager_output_cb *cb; + void *opaque; +}; + +static int +match_eager_outputs_for_state_cb(fsm_state_t state, fsm_end_id_t id, void *opaque) +{ + /* HACK update the types here once it's working */ + (void)state; + struct check_eager_outputs_for_state_env *env = opaque; +#if LOG_EAGER + fprintf(stderr, "%s: state %d, id %d\n", __func__, state, id); +#endif + env->cb(id, env->opaque); + return 1; +} + +static int 
+match_eager_outputs_for_state(const struct fsm *fsm, fsm_state_t state) +{ + /* HACK update the types here once it's working */ + fsm_eager_output_cb *cb = NULL; + void *opaque = NULL; + fsm_eager_output_get_cb(fsm, &cb, &opaque); + if (cb == NULL) { return 1; } /* nothing to do */ + + struct check_eager_outputs_for_state_env env = { + .fsm = fsm, + .cb = cb, + .opaque = opaque, + }; + fsm_eager_output_iter_state(fsm, + state, match_eager_outputs_for_state_cb, &env); + return 1; +} + int fsm_exec(const struct fsm *fsm, int (*fsm_getc)(void *opaque), void *opaque, @@ -73,6 +114,7 @@ fsm_exec(const struct fsm *fsm, errno = EINVAL; return -1; } + const fsm_state_t start = state; for (i = 0; i < capture_count; i++) { captures[i].pos[0] = FSM_CAPTURE_NO_POS; @@ -83,6 +125,12 @@ fsm_exec(const struct fsm *fsm, fprintf(stderr, "fsm_exec: starting at %d\n", state); #endif + if (fsm->states[start].has_eager_outputs) { + if (!match_eager_outputs_for_state(fsm, start)) { + return 0; + } + } + while (c = fsm_getc(opaque), c != EOF) { if (!transition(fsm, state, c, offset, captures, &state)) { #if LOG_EXEC @@ -91,6 +139,12 @@ fsm_exec(const struct fsm *fsm, return 0; } + if (fsm->states[state].has_eager_outputs) { + if (!match_eager_outputs_for_state(fsm, state)) { + return 0; + } + } + #if LOG_EXEC fprintf(stderr, "fsm_exec: @ %zu, input '%c', new state %u\n", offset, c, state); @@ -113,4 +167,3 @@ fsm_exec(const struct fsm *fsm, *end = state; return 1; } - diff --git a/src/libfsm/fsm.c b/src/libfsm/fsm.c index ba2d2db26..c442c8262 100644 --- a/src/libfsm/fsm.c +++ b/src/libfsm/fsm.c @@ -21,6 +21,7 @@ #include "internal.h" #include "capture.h" #include "endids.h" +#include "eager_output.h" /* guess for default state allocation */ #define FSM_DEFAULT_STATEALLOC 128 @@ -39,6 +40,7 @@ free_contents(struct fsm *fsm) fsm_capture_free(fsm); fsm_endid_free(fsm); + fsm_eager_output_free(fsm); f_free(fsm->alloc, fsm->states); } @@ -92,6 +94,14 @@ fsm_new_statealloc(const struct 
fsm_alloc *alloc, size_t statealloc) return NULL; } + if (!fsm_eager_output_init(new)) { + fsm_capture_free(new); /* release sub-structures while `new` is still valid */ + fsm_endid_free(new); + f_free(new->alloc, new->states); + f_free(new->alloc, new); + return NULL; + } + return new; } @@ -133,6 +143,7 @@ fsm_move(struct fsm *dst, struct fsm *src) dst->capture_info = src->capture_info; dst->endid_info = src->endid_info; + dst->eager_output_info = src->eager_output_info; f_free(src->alloc, src); } diff --git a/src/libfsm/internal.h b/src/libfsm/internal.h index f84bbef0f..46997c82a 100644 --- a/src/libfsm/internal.h +++ b/src/libfsm/internal.h @@ -60,6 +60,10 @@ struct fsm_state { /* meaningful within one particular transformation only */ unsigned int visited:1; + + /* If 0, then this state has no need for checking + * the fsm->eager_output_info struct. */ + unsigned int has_eager_outputs:1; }; struct fsm { @@ -75,6 +79,7 @@ struct fsm { struct fsm_capture_info *capture_info; struct endid_info *endid_info; + struct eager_output_info *eager_output_info; }; struct fsm * diff --git a/src/libfsm/libfsm.syms b/src/libfsm/libfsm.syms index 34be09e77..75c20eb64 100644 --- a/src/libfsm/libfsm.syms +++ b/src/libfsm/libfsm.syms @@ -2,6 +2,7 @@ fsm_complement fsm_union fsm_union_array +fsm_union_repeated_pattern_group fsm_intersect fsm_intersect_charset @@ -72,6 +73,8 @@ fsm_removestate fsm_shuffle fsm_vacuum +fsm_new_statealloc + fsm_addedge_any fsm_addedge_epsilon fsm_addedge_literal @@ -95,6 +98,14 @@ fsm_setendid fsm_mapendids fsm_increndids +fsm_endid_dump + +fsm_seteageroutput +fsm_seteageroutputonends +# short term hack +fsm_eager_output_set_cb +fsm_eager_output_dump + fsm_countedges fsm_countstates diff --git a/src/libfsm/merge.c b/src/libfsm/merge.c index 8c972c145..ccc1568ff 100644 --- a/src/libfsm/merge.c +++ b/src/libfsm/merge.c @@ -22,6 +22,7 @@ #include "capture.h" #include "internal.h" #include "endids.h" +#include "eager_output.h" #define LOG_MERGE_ENDIDS 0 @@ -39,6 +40,9 @@ copy_capture_actions(struct fsm *dst,
struct fsm *src); static int copy_end_ids(struct fsm *dst, struct fsm *src, fsm_state_t base_src); +static int +copy_eager_output_ids(struct fsm *dst, struct fsm *src, fsm_state_t base_src); + static struct fsm * merge(struct fsm *dst, struct fsm *src, fsm_state_t *base_dst, fsm_state_t *base_src, @@ -113,6 +117,11 @@ merge(struct fsm *dst, struct fsm *src, return NULL; } + if (!copy_eager_output_ids(dst, src, *base_src)) { + /* non-recoverable -- destructive operation */ + return NULL; + } + f_free(src->alloc, src->states); src->states = NULL; src->statealloc = 0; @@ -194,6 +203,39 @@ copy_end_ids(struct fsm *dst, struct fsm *src, fsm_state_t base_src) return fsm_endid_iter_bulk(src, copy_end_ids_cb, &env); } +struct copy_eager_output_ids_env { + bool ok; + struct fsm *dst; + struct fsm *src; + fsm_state_t base_src; +}; + +static int +copy_eager_output_ids_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + struct copy_eager_output_ids_env *env = opaque; + if (!fsm_seteageroutput(env->dst, state + env->base_src, id)) { + env->ok = false; + return 0; + } + + return 1; + +} + +static int +copy_eager_output_ids(struct fsm *dst, struct fsm *src, fsm_state_t base_src) +{ + struct copy_eager_output_ids_env env = { + .ok = true, + .dst = dst, + .src = src, + .base_src = base_src, + }; + fsm_eager_output_iter_all(src, copy_eager_output_ids_cb, &env); + return env.ok; +} + struct fsm * fsm_mergeab(struct fsm *a, struct fsm *b, fsm_state_t *base_b) diff --git a/src/libfsm/minimise.c b/src/libfsm/minimise.c index a8d53c57e..86f00b46f 100644 --- a/src/libfsm/minimise.c +++ b/src/libfsm/minimise.c @@ -25,6 +25,8 @@ #include "internal.h" #include "capture.h" +#include "eager_output.h" +#include "endids.h" #define LOG_MAPPINGS 0 #define LOG_STEPS 0 @@ -54,12 +56,21 @@ struct end_metadata { unsigned count; fsm_end_id_t *ids; } end; + + struct end_metadata_eager_outputs { + unsigned count; + fsm_output_id_t *ids; + } eager_outputs; }; static int collect_end_ids(const 
struct fsm *fsm, fsm_state_t s, struct end_metadata_end *e); +static int +collect_eager_output_ids(const struct fsm *fsm, fsm_state_t s, + struct end_metadata_eager_outputs *e); + int fsm_minimise(struct fsm *fsm) { @@ -122,6 +133,10 @@ fsm_minimise(struct fsm *fsm) /* Minimisation should never add states. */ assert(minimised_states <= orig_states); + for (size_t i = 0; i < fsm->statecount; i++) { + assert(mapping[i] < fsm->statecount); + } + /* Use the mapping to consolidate the current states * into a new DFA, combining states that could not be * proven distinguishable. */ @@ -693,6 +708,9 @@ same_end_metadata(const struct end_metadata *a, const struct end_metadata *b) if (a->end.count != b->end.count) { return 0; } + if (a->eager_outputs.count != b->eager_outputs.count) { + return 0; + } /* compare -- these must be sorted */ @@ -702,6 +720,12 @@ same_end_metadata(const struct end_metadata *a, const struct end_metadata *b) } } + for (size_t i = 0; i < a->eager_outputs.count; i++) { + if (a->eager_outputs.ids[i] != b->eager_outputs.ids[i]) { + return 0; + } + } + return 1; } @@ -750,14 +774,21 @@ split_ecs_by_end_metadata(struct min_env *env, const struct fsm *fsm) #endif while (s != NO_ID) { struct end_metadata *e = &end_md[s]; - if (!fsm_isend(fsm, s)) { - break; /* this EC has non-end states, skip */ + const bool is_end = fsm_isend(fsm, s); + const bool has_eager_outputs = fsm_eager_output_state_has_eager_output(fsm, s); + + if (!is_end && !has_eager_outputs) { + break; /* skip */ } if (!collect_end_ids(fsm, s, &e->end)) { goto cleanup; } + if (!collect_eager_output_ids(fsm, s, &e->eager_outputs)) { + goto cleanup; + } + s = env->jump[s]; } } @@ -789,6 +820,10 @@ split_ecs_by_end_metadata(struct min_env *env, const struct fsm *fsm) incremental_hash_of_ids(&hash, s_md->end.ids[eid_i]); } + for (size_t eo_i = 0; eo_i < s_md->eager_outputs.count; eo_i++) { + incremental_hash_of_ids(&hash, s_md->eager_outputs.ids[eo_i]); + } + for (size_t b_i = 0; b_i < 
bucket_count; b_i++) { fsm_state_t *b = &htab[(b_i + hash) & mask]; const fsm_state_t other = *b; @@ -932,6 +967,9 @@ split_ecs_by_end_metadata(struct min_env *env, const struct fsm *fsm) if (e->end.ids != NULL) { f_free(fsm->alloc, e->end.ids); } + if (e->eager_outputs.ids != NULL) { + f_free(fsm->alloc, e->eager_outputs.ids); + } } f_free(fsm->alloc, end_md); } @@ -959,7 +997,7 @@ collect_end_ids(const struct fsm *fsm, fsm_state_t s, #if LOG_ECS fprintf(stderr, "%d:", s); - for (size_t i = 0; i < written; i++) { + for (size_t i = 0; i < e->count; i++) { fprintf(stderr, " %u", e->ids[i]); } fprintf(stderr, "\n"); @@ -968,6 +1006,41 @@ collect_end_ids(const struct fsm *fsm, fsm_state_t s, return 1; } +static int +collect_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + (void)state; + struct end_metadata_eager_outputs *e = opaque; + e->ids[e->count++] = id; + return 1; +} + +static int cmp_eager_output_id(const void *pa, const void *pb) +{ + const fsm_output_id_t a = *(fsm_output_id_t *)pa; + const fsm_output_id_t b = *(fsm_output_id_t *)pb; + return a < b ? -1 : a > b ? 
1 : 0; +} + +static int +collect_eager_output_ids(const struct fsm *fsm, fsm_state_t state, + struct end_metadata_eager_outputs *e) +{ + size_t count = 0; + if (!fsm_eager_output_has_any(fsm, state, &count)) { + return 1; /* nothing to do */ + } + + e->ids = f_malloc(fsm->alloc, count * sizeof(e->ids[0])); + if (e->ids == NULL) { return 0; } + + fsm_eager_output_iter_state(fsm, state, collect_cb, e); + + /* sort, to normalize set */ + qsort(e->ids, e->count, sizeof(e->ids[0]), cmp_eager_output_id); + return 1; +} + #if EXPENSIVE_CHECKS static void check_done_ec_offset(const struct min_env *env) diff --git a/src/libfsm/print/c.c b/src/libfsm/print/c.c index 22b03963e..cc3927dc6 100644 --- a/src/libfsm/print/c.c +++ b/src/libfsm/print/c.c @@ -222,6 +222,14 @@ print_case(FILE *f, const struct ir *ir, assert(f != NULL); assert(cs != NULL); + if (cs->eager_outputs != NULL && opt->fragment) { + /* If .fragment is set and the state has eager outputs, then emit a call to a + * macro (the caller is expected to define). This is a temporary interface. */ + for (size_t i = 0; i < cs->eager_outputs->count; i++) { + fprintf(f, "\t\t\tFSM_SET_EAGER_OUTPUT(%u);\n", cs->eager_outputs->ids[i]); + } + } + switch (cs->strategy) { case IR_NONE: fprintf(f, "\t\t\t"); @@ -377,6 +385,11 @@ print_endstates(FILE *f, const struct fsm_state_metadata state_metadata = { .end_ids = ir->states[i].endids.ids, .end_id_count = ir->states[i].endids.count, + + .eager_output_count = (ir->states[i].eager_outputs == NULL + ? 0 : ir->states[i].eager_outputs->count), + .eager_output_ids = (ir->states[i].eager_outputs == NULL + ? 
NULL : ir->states[i].eager_outputs->ids), }; if (-1 == print_hook_accept(f, opt, hooks, diff --git a/src/libfsm/print/ir.c b/src/libfsm/print/ir.c index 457716dcc..81d5890e0 100644 --- a/src/libfsm/print/ir.c +++ b/src/libfsm/print/ir.c @@ -26,6 +26,7 @@ #include #include "libfsm/internal.h" +#include "libfsm/eager_output.h" #include "ir.h" @@ -505,6 +506,23 @@ make_example(const struct fsm *fsm, fsm_state_t s, char **example) return 0; } +static int +append_eager_output_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + struct ir_state_eager_output *outputs = opaque; + (void)state; + outputs->ids[outputs->count++] = id; + return 1; +} + +static int +cmp_fsm_output_id_t(const void *pa, const void *pb) +{ + const fsm_output_id_t a = *(fsm_output_id_t *)pa; + const fsm_output_id_t b = *(fsm_output_id_t *)pb; + return a < b ? -1 : a > b ? 1 : 0; +} + struct ir * make_ir(const struct fsm *fsm, const struct fsm_options *opt) { @@ -544,6 +562,8 @@ make_ir(const struct fsm *fsm, const struct fsm_options *opt) ir->states[i].endids.ids = NULL; ir->states[i].endids.count = 0; + ir->states[i].eager_outputs = NULL; + if (fsm_isend(fsm, i)) { fsm_end_id_t *ids; size_t count; @@ -567,6 +587,20 @@ make_ir(const struct fsm *fsm, const struct fsm_options *opt) ir->states[i].endids.count = count; } + size_t count; + if (fsm_eager_output_has_any(fsm, i, &count)) { + struct ir_state_eager_output *outputs = f_malloc(fsm->alloc, + sizeof(*outputs) + count * sizeof(outputs->ids[0])); + if (outputs == NULL) { + goto error; + } + outputs->count = 0; + fsm_eager_output_iter_state(fsm, i, append_eager_output_cb, outputs); + assert(outputs->count == count); + qsort(outputs->ids, outputs->count, sizeof(outputs->ids[0]), cmp_fsm_output_id_t); + ir->states[i].eager_outputs = outputs; + } + if (make_state(fsm, i, &ir->states[i]) == -1) { goto error; } @@ -630,6 +664,7 @@ free_ir(const struct fsm *fsm, struct ir *ir) for (i = 0; i < ir->n; i++) { f_free(fsm->alloc, (void *) 
ir->states[i].example); f_free(fsm->alloc, (void *) ir->states[i].endids.ids); + f_free(fsm->alloc, (void *) ir->states[i].eager_outputs); switch (ir->states[i].strategy) { case IR_TABLE: diff --git a/src/libfsm/print/ir.h b/src/libfsm/print/ir.h index b375ba850..7678d3f35 100644 --- a/src/libfsm/print/ir.h +++ b/src/libfsm/print/ir.h @@ -59,6 +59,11 @@ struct ir_state { size_t count; } endids; + struct ir_state_eager_output { + size_t count; + fsm_output_id_t ids[]; + } *eager_outputs; /* NULL -> 0 */ + unsigned int isend:1; enum ir_strategy strategy; diff --git a/src/libfsm/state.c b/src/libfsm/state.c index c845cbe46..d96c33653 100644 --- a/src/libfsm/state.c +++ b/src/libfsm/state.c @@ -19,6 +19,7 @@ #include "internal.h" #include "endids.h" +#include "eager_output.h" int fsm_addstate(struct fsm *fsm, fsm_state_t *state) @@ -44,6 +45,7 @@ fsm_addstate(struct fsm *fsm, fsm_state_t *state) for (i = fsm->statealloc; i < n; i++) { tmp[i].has_capture_actions = 0; + tmp[i].has_eager_outputs = 0; } fsm->statealloc = n; @@ -87,6 +89,8 @@ fsm_addstate_bulk(struct fsm *fsm, size_t n) new->visited = 0; new->epsilons = NULL; new->edges = NULL; + + new->has_eager_outputs = 0; } fsm->statecount += n; @@ -259,6 +263,10 @@ fsm_compact_states(struct fsm *fsm, if (!fsm_endid_compact(fsm, mapping, orig_statecount)) { return 0; } + if (!fsm_eager_output_compact(fsm, mapping, orig_statecount)) { + return 0; + } + assert(dst == kept); assert(kept == fsm->statecount); diff --git a/src/libfsm/union.c b/src/libfsm/union.c index a3b4b230c..0b18cd30c 100644 --- a/src/libfsm/union.c +++ b/src/libfsm/union.c @@ -15,9 +15,14 @@ #include #include #include +#include +#include #include "internal.h" +#include +#include "eager_output.h" + #define LOG_UNION_ARRAY 0 struct fsm * @@ -151,3 +156,231 @@ fsm_union_array(size_t fsm_count, return res; } + +#define LOG_UNION_REPEATED_PATTERN_GROUP 0 + +/* Combine an array of FSMs into a single FSM in one pass, with an extra loop + * so that more than one 
pattern with eager outputs can match. */ +struct fsm * +fsm_union_repeated_pattern_group(size_t entry_count, + struct fsm_union_entry *entries, struct fsm_combined_base_pair *bases) +{ + const struct fsm_alloc *alloc = entries[0].fsm->alloc; + const bool log = 0 || LOG_UNION_REPEATED_PATTERN_GROUP; + + if (entry_count == 1) { + return entries[0].fsm; + } + + size_t est_total_states = 0; + for (size_t i = 0; i < entry_count; i++) { + assert(entries[i].fsm); + if (entries[i].fsm->alloc != alloc) { + errno = EINVAL; + return NULL; + } + const size_t count = fsm_countstates(entries[i].fsm); + est_total_states += count; + } + + est_total_states += 5; /* new start and end, new unanchored start and end loops */ + + struct fsm *res = fsm_new_statealloc(alloc, est_total_states); + if (res == NULL) { return NULL; } + + /* collected end states */ + struct ends_buf { + size_t ceil; + size_t used; + fsm_state_t *states; + } ends = { .ceil = 0 }; + + /* The new overall start state, which will have an epsilon edge to... */ + fsm_state_t global_start; + if (!fsm_addstate(res, &global_start)) { goto fail; } + + /* states linking to the starts of unanchored and anchored subgraphs, respectively. */ + fsm_state_t global_start_loop, global_start_anchored; + if (!fsm_addstate(res, &global_start_loop)) { goto fail; } + if (!fsm_addstate(res, &global_start_anchored)) { goto fail; } + + /* The unanchored end loop state, and an end state with no outgoing edges. 
*/ + fsm_state_t global_end_loop, global_end; + if (!fsm_addstate(res, &global_end)) { goto fail; } + if (!fsm_addstate(res, &global_end_loop)) { goto fail; } + + /* link the start to the start loop and anchored start, and the start loop to itself */ + if (log) { + fprintf(stderr, "link_before: global_start %d -> global_start_loop %d and global_start_anchored %d\n", + global_start, global_start_loop, global_start_anchored); + } + if (!fsm_addedge_epsilon(res, global_start, global_start_loop)) { goto fail; } + if (!fsm_addedge_epsilon(res, global_start, global_start_anchored)) { goto fail; } + if (!fsm_addedge_any(res, global_start_loop, global_start_loop)) { goto fail; } + + /* link the end loop and end */ + if (log) { + fprintf(stderr, "link_before: global_end_loop %d -> global_end %d (and -> self)\n", global_end_loop, global_end); + } + if (!fsm_addedge_epsilon(res, global_end_loop, global_end)) { goto fail; } + if (!fsm_addedge_any(res, global_end_loop, global_end_loop)) { goto fail; } + + if (bases != NULL) { + memset(bases, 0x00, entry_count * sizeof(bases[0])); + } + + for (size_t fsm_i = 0; fsm_i < entry_count; fsm_i++) { + ends.used = 0; /* reset */ + + struct fsm *fsm = entries[fsm_i].fsm; + entries[fsm_i].fsm = NULL; /* transfer ownership */ + + const size_t state_count = fsm_countstates(fsm); + + fsm_state_t fsm_start; + if (!fsm_getstart(fsm, &fsm_start)) { + fsm_free(fsm); /* no start, just discard */ + continue; + } + + for (fsm_state_t s_i = 0; s_i < state_count; s_i++) { + if (fsm_isend(fsm, s_i)) { + if (ends.used == ends.ceil) { /* grow? */ + size_t nceil = (ends.ceil == 0 ? 
4 : 2*ends.ceil); + fsm_state_t *nstates = f_realloc(alloc, + ends.states, nceil * sizeof(nstates[0])); + if (nstates == NULL) { goto fail; } + ends.ceil = nceil; + ends.states = nstates; + } + ends.states[ends.used++] = s_i; + } + } + + if (ends.used == 0) { + fsm_free(fsm); /* no ends, just discard */ + continue; + } + + /* When combining these, remove self-edges from any states on the FSMs to be + * combined that also have eager output IDs. We are about to add an epsilon edge + * from each to a shared state that won't have eager output IDs. + * + * Eager output matching should be idempotent, so carrying it to other reachable + * state is redundant, and it leads to a combinatorial explosion that blows up the + * state count while determinising the combined FSM otherwise. + * + * For example, if /aaa/, /bbb/, and /ccc/ are combined into a DFA that repeats + * the sub-patterns (like `^.*(?:(aaa)|(bbb)|(ccc))+.*$`), the self-edge at each + * eager output state would combine with every reachable state from then on, + * leading to a copy of the whole reachable subgraph colored by every + * combination of eager output IDs: aaa, bbb, ccc, aaa+bbb, aaa+ccc, + * bbb+ccc, aaa+bbb+ccc. Instead of three relatively separate subgraphs + * that set the eager output at their last state, one for each pattern, + * it leads to 8 (2**3) subgraph clusters because it encodes _each + * distinct combination_ in the DFA. This becomes incredibly expensive + * as the combined pattern count increases; it's essentially what I'm + * trying to avoid by adding eager output support in the first place. + * + * FIXME: instead of actively removing these, filter in fsm_determinise? 
*/ + if (fsm_eager_output_has_eager_output(fsm)) { + /* for any state that has eager outputs and a self edge, + * remove the self edge before further linkage */ + for (fsm_state_t s = 0; s < fsm->statecount; s++) { + if (!fsm_eager_output_has_any(fsm, s, NULL)) { continue; } + struct edge_set *edges = fsm->states[s].edges; + struct edge_set *new = edge_set_new(); + + struct edge_group_iter iter; + struct edge_group_iter_info info; + edge_set_group_iter_reset(edges, EDGE_GROUP_ITER_ALL, &iter); + while (edge_set_group_iter_next(&iter, &info)) { + if (info.to != s) { + if (!edge_set_add_bulk(&new, fsm->alloc, + info.symbols, info.to)) { + goto fail; + } + } + } + edge_set_free(fsm->alloc, edges); + fsm->states[s].edges = new; + } + } + + /* call fsm_merge; we really don't care which is which */ + struct fsm_combine_info combine_info; + struct fsm *merged = fsm_merge(res, fsm, &combine_info); + if (merged == NULL) { goto fail; } + + /* update offsets if res had its state IDs shifted forward */ + global_start += combine_info.base_a; + global_start_loop += combine_info.base_a; + global_start_anchored += combine_info.base_a;; + global_end += combine_info.base_a; + global_end_loop += combine_info.base_a; + + /* also update offsets for the FSM's states */ + fsm_start += combine_info.base_b; + for (size_t i = 0; i < ends.used; i++) { + ends.states[i] += combine_info.base_b; + } + + if (bases != NULL) { + bases[fsm_i].state = combine_info.base_b; + bases[fsm_i].capture = combine_info.capture_base_b; + } + + if (log) { + fprintf(stderr, "%s: fsm[%zd].start: %d\n", __func__, fsm_i, fsm_start); + for (size_t i = 0; i < ends.used; i++) { + fprintf(stderr, "%s: fsm[%zd].ends[%zd]: %d\n", __func__, fsm_i, i, ends.states[i]); + } + } + + /* link to the FSM's start state */ + const fsm_state_t start_src = entries[fsm_i].anchored_start ? 
global_start_anchored : global_start_loop; + if (!fsm_addedge_epsilon(merged, start_src, fsm_start)) { goto fail; } + if (log) { + fprintf(stderr, "%s: linking %s %d to fsm[%zd]'s start %d (anchored? %d)\n", + __func__, + entries[fsm_i].anchored_start ? "global_start_anchored" : "global_start_loop", + start_src, fsm_i, fsm_start, entries[fsm_i].anchored_start); + } + + /* link from the FSM's ends */ + const fsm_state_t end_dst = entries[fsm_i].anchored_end ? global_end : global_end_loop; + for (size_t i = 0; i < ends.used; i++) { + if (log) { + fprintf(stderr, "%s: linking fsm[%zd]'s end[%zd] %d (anchored? %d) to %s %d\n", + __func__, fsm_i, i, ends.states[i], entries[fsm_i].anchored_end, + entries[fsm_i].anchored_end ? "global_end" : "global_end_loop", + end_dst); + } + if (!fsm_addedge_epsilon(merged, ends.states[i], end_dst)) { goto fail; } + } + + res = merged; + } + + /* Link from the global_end_loop to the global_start_loop, so patterns with an + * unanchored start can follow other patterns with an unanchored end. */ + if (log) { + fprintf(stderr, "%s: g_start %d, g_start_loop %d, g_start_anchored %d, g_end_loop %d, g_end %d (after all merging)\n", + __func__, global_start, global_start_loop, global_start_anchored, global_end_loop, global_end); + fprintf(stderr, "%s: linking global_end_loop %d to global_start_loop %d\n", + __func__, global_end_loop, global_start_loop); + fprintf(stderr, "%s: setting global_start %d and end %d\n", __func__, global_start, global_end); + } + if (!fsm_addedge_epsilon(res, global_end_loop, global_start_loop)) { goto fail; } + + /* This needs to be set after merging, because that clears the start state. 
*/ + fsm_setstart(res, global_start); + fsm_setend(res, global_end, 1); + + f_free(alloc, ends.states); + return res; + +fail: + f_free(alloc, ends.states); + return NULL; +} diff --git a/src/libre/libre.syms b/src/libre/libre.syms index a4f1a223b..9d381cb0f 100644 --- a/src/libre/libre.syms +++ b/src/libre/libre.syms @@ -3,6 +3,7 @@ re_is_literal re_flags re_strerror re_perror +re_is_anchored ast_print ast_print_dot diff --git a/src/libre/re.c b/src/libre/re.c index 15af848b5..c19183dcc 100644 --- a/src/libre/re.c +++ b/src/libre/re.c @@ -335,3 +335,37 @@ re_is_literal(enum re_dialect dialect, int (*getc)(void *opaque), void *opaque, return -1; } +/* FIXME: placeholder interface */ +int +re_is_anchored(enum re_dialect dialect, re_getchar_fun *getc, void *opaque, + enum re_flags flags, struct re_err *err, + struct re_anchoring_info *info) +{ + /* FIXME: copy/pasted from above, factor out common */ + + struct ast *ast; + const struct dialect *m; + int unsatisfiable; + + assert(getc != NULL); + assert(info != NULL); + + m = re_dialect(dialect); + if (m == NULL) { + if (err != NULL) { err->e = RE_EBADDIALECT; } + return 0; + } + + flags |= m->flags; + + ast = re_parse(dialect, getc, opaque, flags, err, &unsatisfiable); + if (ast == NULL) { + return 0; + } + + info->start = (ast->expr->flags & AST_FLAG_ANCHORED_START) != 0; + info->end = (ast->expr->flags & AST_FLAG_ANCHORED_END) != 0; + + ast_free(ast); + return 1; +} diff --git a/tests/eager_output/Makefile b/tests/eager_output/Makefile new file mode 100644 index 000000000..a650bf802 --- /dev/null +++ b/tests/eager_output/Makefile @@ -0,0 +1,22 @@ +.include "../../share/mk/top.mk" + +TEST.tests/eager_output != ls -1 tests/eager_output/eager_output*.c +TEST_SRCDIR.tests/eager_output = tests/eager_output +TEST_OUTDIR.tests/eager_output = ${BUILD}/tests/eager_output + +.for n in ${TEST.tests/eager_output:T:R:C/^eager_output//} +INCDIR.${TEST_SRCDIR.tests/eager_output}/eager_output${n}.c += src/adt +.endfor + +SRC += 
${TEST_SRCDIR.tests/eager_output}/utils.c + +.for n in ${TEST.tests/eager_output:T:R:C/^eager_output//} +test:: ${TEST_OUTDIR.tests/eager_output}/res${n} +SRC += ${TEST_SRCDIR.tests/eager_output}/eager_output${n}.c +CFLAGS.${TEST_SRCDIR.tests/eager_output}/eager_output${n}.c += -UNDEBUG + +${TEST_OUTDIR.tests/eager_output}/run${n}: ${TEST_OUTDIR.tests/eager_output}/eager_output${n}.o ${TEST_OUTDIR.tests/eager_output}/utils.o ${BUILD}/lib/libfsm.a ${BUILD}/lib/libre.a + ${CC} ${CFLAGS} ${CFLAGS.${TEST_SRCDIR.tests/eager_output}/eager_output${n}.c} -o ${TEST_OUTDIR.tests/eager_output}/run${n} ${TEST_OUTDIR.tests/eager_output}/eager_output${n}.o ${TEST_OUTDIR.tests/eager_output}/utils.o ${BUILD}/lib/libfsm.a ${BUILD}/lib/libre.a +${TEST_OUTDIR.tests/eager_output}/res${n}: ${TEST_OUTDIR.tests/eager_output}/run${n} + ( ${TEST_OUTDIR.tests/eager_output}/run${n} 1>&2 && echo PASS || echo FAIL ) > ${TEST_OUTDIR.tests/eager_output}/res${n} +.endfor diff --git a/tests/eager_output/eager_output1.c b/tests/eager_output/eager_output1.c new file mode 100644 index 000000000..f20ef77b7 --- /dev/null +++ b/tests/eager_output/eager_output1.c @@ -0,0 +1,12 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { "abc" }, + .inputs = { + { .input = "abc", .expected_ids = { 1 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output2.c b/tests/eager_output/eager_output2.c new file mode 100644 index 000000000..cdac204e2 --- /dev/null +++ b/tests/eager_output/eager_output2.c @@ -0,0 +1,17 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { "ab(c|d|e)" }, + .inputs = { + { .input = "abc", .expected_ids = { 1 } }, + { .input = "abd", .expected_ids = { 1 } }, + { .input = "abe", .expected_ids = { 1 } }, + { .input = "Xabe", .expected_ids = { 1 } }, + { .input = "abeX", .expected_ids = { 1 } }, + { .input = "XabeX", .expected_ids = { 1 } }, + }, + }; + return 
run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output3.c b/tests/eager_output/eager_output3.c new file mode 100644 index 000000000..c11bc58a4 --- /dev/null +++ b/tests/eager_output/eager_output3.c @@ -0,0 +1,16 @@ +#include "utils.h" + +/* test that eager outputs are correctly propagated through fsm_determinise() and fsm_minimise() */ +int main(void) +{ + struct eager_output_test test = { + .patterns = { "ab(c|d|e)?" }, + .inputs = { + { .input = "ab", .expected_ids = { 1 } }, + { .input = "abc", .expected_ids = { 1 } }, + { .input = "abd", .expected_ids = { 1 } }, + { .input = "abe", .expected_ids = { 1 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output4.c b/tests/eager_output/eager_output4.c new file mode 100644 index 000000000..47cd32029 --- /dev/null +++ b/tests/eager_output/eager_output4.c @@ -0,0 +1,13 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { "abcde$" }, + .inputs = { + { .input = "abcde", .expected_ids = { 1 } }, + { .input = "Xabcde", .expected_ids = { 1 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output5.c b/tests/eager_output/eager_output5.c new file mode 100644 index 000000000..4551c68b1 --- /dev/null +++ b/tests/eager_output/eager_output5.c @@ -0,0 +1,14 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { "^abc$", "^ab*c$" }, + .inputs = { + { .input = "ac", .expected_ids = { 2 } }, + { .input = "abc", .expected_ids = { 1, 2 } }, + { .input = "abbc", .expected_ids = { 2 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output6.c b/tests/eager_output/eager_output6.c new file mode 100644 index 000000000..5431d0981 --- /dev/null +++ b/tests/eager_output/eager_output6.c @@ -0,0 +1,34 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { + "apple", 
+ "banana", + "carrot", + "durian", + "eggplant", + "fig", + "grapefruit", + "hazelnut", + "iceberg lettuce", + "jicama", + }, + .inputs = { + { .input = "apple", .expected_ids = { 1 } }, + { .input = "banana", .expected_ids = { 2 } }, + { .input = "carrot", .expected_ids = { 3 } }, + { .input = "durian", .expected_ids = { 4 } }, + { .input = "eggplant", .expected_ids = { 5 } }, + { .input = "fig", .expected_ids = { 6 } }, + { .input = "grapefruit", .expected_ids = { 7 } }, + { .input = "hazelnut", .expected_ids = { 8 } }, + { .input = "iceberg lettuce", .expected_ids = { 9 } }, + { .input = "jicama", .expected_ids = { 10 } }, + { .input = "apple banana carrot", .expected_ids = { 1, 2, 3 } }, + }, + }; + + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output7.c b/tests/eager_output/eager_output7.c new file mode 100644 index 000000000..3d123878b --- /dev/null +++ b/tests/eager_output/eager_output7.c @@ -0,0 +1,103 @@ +#include "utils.h" + +int main(void) +{ + /* Run this test with env FORCE_ENDIDS=N ... to see how much more + * expensive it is to combine the first N patterns using endids, + * rather than eager_outputs. It becomes VERY slow for >= 9 or so. + * (Note that the checks probably will not pass for N < 4, because + * it will start skipping patterns that appear in the early test inputs.) 
*/ + bool force_endids = false; + size_t force_endid_count = 0; + { + const char *str = getenv("FORCE_ENDIDS"); + if (str != NULL) { + force_endid_count = atoi(str); + if (force_endid_count == 0) { + force_endid_count = 26; + } + force_endids = true; + } + } + + struct eager_output_test test = { + .patterns = { + [0] = "apple", + [1] = "banana", + [2] = "carrot", + [3] = "durian", + [4] = "eggplant", + [5] = "fig", + [6] = "grapefruit", + [7] = "hazelnut", + [8] = "iceberg lettuce", + [9] = "jicama", + [10] = "kiwano", + [11] = "lemon", + [12] = "mango", + [13] = "nectarine", + [14] = "orange", + [15] = "plum", + [16] = "quince", + [17] = "radish", + [18] = "strawberry", + [19] = "turnip", + [20] = "ube", + [21] = "vanilla", + [22] = "watermelon", + [23] = "xigua watermelon", + [24] = "yam", + [25] = "zucchini", + }, + .inputs = { + /* Note: expected IDs are shifted by 1, it's 0-terminated. */ + { .input = "apple", .expected_ids = { 1 } }, + { .input = "banana", .expected_ids = { 2 } }, + { .input = "carrot", .expected_ids = { 3 } }, + { .input = "apple banana", .expected_ids = { 1, 2 } }, + { .input = "carrot durian apple", .expected_ids = { 1, 3, 4 } }, + { .input = "carrot fig apple", .expected_ids = { 1, 3, 6 } }, + + /* leading characters and an incomplete trailing match */ + { .input = "mumble mumble fig hazelnut banana xigua watermelo", .expected_ids = { 2, 6, 8 } }, + + /* redundant matches */ + { .input = "ube ube ube ube ube", .expected_ids = { 21 } }, + + /* everything */ + { .input = + "apple banana carrot durian eggplant fig grapefruit " + "hazelnut iceberg lettuce jicamaa kiwano lemon mango " + "nectarine orange plum quince radish strawberry " + "turnip ube vanilla watermelon xigua watermelon yam zucchini", + .expected_ids = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + }, + }, + /* everything, only spaces appearing in patterns */ + { .input = + "applebananacarrotdurianeggplantfiggrapefruit" + 
"hazelnuticeberg lettucejicamaakiwanolemonmango" + "nectarineorangeplumquinceradishstrawberry" + "turnipubevanillawatermelonxigua watermelonyamzucchini", + .expected_ids = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + }, + }, + }, + }; + + /* truncate patterns to the first N */ + if (force_endids) { + assert(force_endid_count > 0 && force_endid_count <= 26); + test.patterns[force_endid_count] = NULL; + + /* truncate test inputs to just the first couple, since + * later inputs use later patterns */ + test.inputs[5].input = NULL; + } + + return run_test(&test, false, force_endids); +} diff --git a/tests/eager_output/eager_output_at_start.c b/tests/eager_output/eager_output_at_start.c new file mode 100644 index 000000000..407aa4e77 --- /dev/null +++ b/tests/eager_output/eager_output_at_start.c @@ -0,0 +1,12 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { "" }, + .inputs = { + { .input = "", .expected_ids = { 1 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output_fr1.c b/tests/eager_output/eager_output_fr1.c new file mode 100644 index 000000000..e8e5f3395 --- /dev/null +++ b/tests/eager_output/eager_output_fr1.c @@ -0,0 +1,13 @@ +#include "utils.h" + +/* Fuzzer regression */ +int main(void) +{ + struct eager_output_test test = { + .patterns = { "ab", "" }, + .inputs = { + { .input = "ab", .expected_ids = { 1, 2 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output_fr2.c b/tests/eager_output/eager_output_fr2.c new file mode 100644 index 000000000..404e98644 --- /dev/null +++ b/tests/eager_output/eager_output_fr2.c @@ -0,0 +1,13 @@ +#include "utils.h" + +/* Fuzzer regression */ +int main(void) +{ + struct eager_output_test test = { + .patterns = { "", "" }, + .inputs = { + { .input = "", .expected_ids = { 1, 2 } }, + }, + }; + return run_test(&test, false, false); +} 
diff --git a/tests/eager_output/eager_output_fr3.c b/tests/eager_output/eager_output_fr3.c new file mode 100644 index 000000000..c7e4127a6 --- /dev/null +++ b/tests/eager_output/eager_output_fr3.c @@ -0,0 +1,13 @@ +#include "utils.h" + +/* Fuzzer regression */ +int main(void) +{ + struct eager_output_test test = { + .patterns = { "^", "" }, + .inputs = { + { .input = "", .expected_ids = { 1, 2 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output_mixed_anchored_unanchored.c b/tests/eager_output/eager_output_mixed_anchored_unanchored.c new file mode 100644 index 000000000..a586f9840 --- /dev/null +++ b/tests/eager_output/eager_output_mixed_anchored_unanchored.c @@ -0,0 +1,46 @@ +#include "utils.h" + +int main(void) +{ + /* fprintf(stderr, "%s: skipping for now, this doesn't pass yet.\n", __FILE__); */ + /* return EXIT_SUCCESS; */ + + struct eager_output_test test = { + .patterns = { + "^abc$", + "def", + "^ghi", + "jkl$", + "mno", + }, + .inputs = { + { .input = "abc", .expected_ids = { 1 } }, + { .input = "def", .expected_ids = { 2 } }, + { .input = "ghi", .expected_ids = { 3 } }, + { .input = "jkl", .expected_ids = { 4 } }, + { .input = "mno", .expected_ids = { 5 } }, + + { .input = "defmno", .expected_ids = { 2, 5 } }, + { .input = " def mno ", .expected_ids = { 2, 5 } }, + + /* Matching a start-anchored pattern followed by + * unanchored ones should just work. */ + { .input = "ghi def", .expected_ids = { 2, 3 } }, + + /* An unanchored pattern before a start-anchored pattern + * should only match the unanchored pattern. */ + { .input = "def ghi", .expected_ids = { 2 } }, + + /* Matching an unanchored pattern before an + * end-anchored one is fine. */ + { .input = "mno jkl", .expected_ids = { 4, 5 } }, + + /* This should match "mno" with the "jkl" prefix + * ignored by the unanchored start, which does + * not count as a match for "jkl$". 
*/ + { .input = "jkl mno", .expected_ids = { 5 } }, + }, + }; + + return run_test(&test, false, false); +} diff --git a/tests/eager_output/utils.c b/tests/eager_output/utils.c new file mode 100644 index 000000000..4bee8d848 --- /dev/null +++ b/tests/eager_output/utils.c @@ -0,0 +1,278 @@ +#include "utils.h" + +void +fsm_eager_output_dump(FILE *f, const struct fsm *fsm); + +void +fsm_endid_dump(FILE *f, const struct fsm *fsm); + +void +append_eager_output_cb(fsm_output_id_t id, void *opaque) +{ + struct cb_info *info = (struct cb_info *)opaque; + assert(info->used < MAX_IDS); + + for (size_t i = 0; i < info->used; i++) { + if (info->ids[i] == id) { + return; /* already present */ + } + } + + info->ids[info->used++] = id; +} + +int +cmp_output(const void *pa, const void *pb) +{ + const fsm_output_id_t a = *(fsm_output_id_t *)pa; + const fsm_output_id_t b = *(fsm_output_id_t *)pb; + return a < b ? -1 : a > b ? 1 : 0; +} + +struct fsm_options print_options = { + .consolidate_edges = 1, + .comments = 0, + .group_edges = 1, +}; + +void +dump(const struct fsm *fsm) +{ + fsm_print(stderr, fsm, + &print_options, NULL, FSM_PRINT_DOT); +} + +int +run_test(const struct eager_output_test *test, bool allow_extra_outputs, bool force_endids) +{ + struct fsm_union_entry entries[MAX_PATTERNS] = {0}; + + allow_extra_outputs = false; + + size_t fsms_used = 0; + int ret = 0; + + int log = 0; + { + const char *logstr = getenv("LOG"); + if (logstr != NULL) { + if (logstr[0] == 'y') { /* make "y" or "yes" non-zero */ + logstr = "1"; + } + log = atoi(logstr); + } + } + + for (size_t i = 0; i < MAX_PATTERNS; i++) { + const char *p = test->patterns[i]; + if (test->patterns[i] == NULL) { break; } + const size_t len = strlen(p); + struct fsm_union_entry *e = &entries[fsms_used]; + + /* For sake of these patterns, they are anchored if the first/last + * character is '^' and '$', respectively. This is too simplistic + * for the general case, though. 
*/ + if (len > 0) { + if (p[0] == '^') { e->anchored_start = true; } + if (p[len - 1] == '$') { e->anchored_end = true; } + /* fprintf(stderr, "%s: p[%zd]: '%s', start %d, end %d\n", */ + /* __func__, fsms_used, p, e->anchored_start, e->anchored_end); */ + } + + struct fsm *fsm = re_comp(RE_PCRE, fsm_sgetc, &p, NULL, 0, NULL); + assert(fsm != NULL); + + /* Zero is used to terminate expected_ids, so don't use it here. */ + const fsm_output_id_t output_id = (fsm_output_id_t) (i + 1); + const fsm_end_id_t end_id = (fsm_end_id_t) (i + 1); + + /* Set either an end ID or an eager output ID, depending on + * whether the fsm is anchored at the end or not. */ + if (e->anchored_end || force_endids) { + ret = fsm_setendid(fsm, end_id); + } else { + ret = fsm_seteageroutputonends(fsm, output_id); + } + assert(ret == 1); + + if (log) { + fprintf(stderr, "==== source DFA %zd (pre det+min)\n", i); + if (log > 1) { dump(fsm); } + fsm_eager_output_dump(stderr, fsm); + fsm_endid_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } + + ret = fsm_determinise(fsm); + assert(ret == 1); + + if (log) { + fprintf(stderr, "==== source DFA %zd (post det)\n", i); + if (log > 1) { dump(fsm); } + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } + + ret = fsm_minimise(fsm); + assert(ret == 1); + + if (log) { + fprintf(stderr, "==== source DFA %zd (post det+min)\n", i); + if (log > 1) { dump(fsm); } + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } + + e->fsm = fsm; + fsms_used++; + } + + /* If there's only one pattern this just returns fsms[0]. */ + struct fsm *fsm = fsm_union_repeated_pattern_group(fsms_used, entries, NULL); + assert(fsm != NULL); + + if (log) { + fprintf(stderr, "==== combined (pre det+min)\n"); + if (log > 1) { dump(fsm); } + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "--- endids:\n"); + fsm_endid_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } + + if (log) { + fprintf(stderr, "=== determinising combined... 
NFA has %u states\n", fsm_countstates(fsm)); + } + ret = fsm_determinise(fsm); + assert(ret == 1); + if (log) { + fprintf(stderr, "=== determinising combined...done, DFA has %u states\n", fsm_countstates(fsm)); + } + + if (log) { + fprintf(stderr, "==== combined (post det)\n"); + if (log > 1) { dump(fsm); } + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } + + ret = fsm_minimise(fsm); + if (log) { + fprintf(stderr, "=== minimised combined...done, DFA has %u states\n", fsm_countstates(fsm)); + } + assert(ret == 1); + + if (log) { + fprintf(stderr, "==== combined (post det+min)\n"); + if (log > 1) { dump(fsm); } + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "--- endids:\n"); + fsm_endid_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } + + struct cb_info outputs = { 0 }; + fsm_eager_output_set_cb(fsm, append_eager_output_cb, &outputs); + + for (size_t i_i = 0; i_i < MAX_INPUTS; i_i++) { + outputs.used = 0; + const char *input = test->inputs[i_i].input; + if (input == NULL) { break; } + + size_t expected_id_count = 0; + for (size_t id_i = 0; id_i < MAX_ENDIDS; id_i++) { + const fsm_output_id_t id = test->inputs[i_i].expected_ids[id_i]; + if (id == 0) { break; } + expected_id_count++; + + /* must be ascending */ + if (id_i > 0) { + assert(id > test->inputs[i_i].expected_ids[id_i - 1]); + } + } + + if (log) { + fprintf(stderr, "%s: input %zd: \"%s\", expecting %zd ids:", + __func__, i_i, input, expected_id_count); + for (size_t i = 0; i < expected_id_count; i++) { + fprintf(stderr, " %d", test->inputs[i_i].expected_ids[i]); + } + } + + if (test->inputs[i_i].expect_fail) { + expected_id_count = 0; + } + + fsm_state_t end; /* only set on match */ + ret = fsm_exec(fsm, fsm_sgetc, &input, &end, NULL); + + if (ret == 1) { +#define ENDID_BUF_SIZE 32 + fsm_end_id_t endid_buf[ENDID_BUF_SIZE] = {0}; + const size_t endid_count = fsm_endid_count(fsm, end); + /* fprintf(stderr, "%s: endid_count %zd for state %d\n", __func__, endid_count, end); */ + 
assert(endid_count < ENDID_BUF_SIZE); + if (!fsm_endid_get(fsm, end, /*ENDID_BUF_SIZE*/ endid_count, endid_buf)) { + assert(!"fsm_endid_get failed"); + } + + /* Copy endid outputs into outputs.ids[], since for testing + * purposes we don't care about the difference between eager + * output and endids here -- the values don't overlap. */ + assert(outputs.used + endid_count <= MAX_IDS); + for (size_t endid_i = 0; endid_i < endid_count; endid_i++) { + if (log) { + fprintf(stderr, "-- adding endid %zd: %d\n", endid_i, endid_buf[endid_i]); + } + outputs.ids[outputs.used++] = (fsm_output_id_t)endid_buf[endid_i]; + } + } + + if (ret == 0) { + /* if it didn't match, ignore the eager output IDs. this should + * eventually happen internal to fsm_exec or codegen. */ + outputs.used = 0; + } + + /* NEXT match IDs, sort outputs[] buffer first */ + qsort(outputs.ids, outputs.used, sizeof(outputs.ids[0]), cmp_output); + + if (log) { + fprintf(stderr, "-- got %zd:", outputs.used); + for (size_t i = 0; i < outputs.used; i++) { + fprintf(stderr, " %d", outputs.ids[i]); + } + fprintf(stderr, "\n"); + } + + if (expected_id_count == 0) { + assert(ret == 0 || outputs.used == 0); /* no match */ + continue; + } else { + assert(ret == 1); + } + + if (!allow_extra_outputs) { + assert(outputs.used == expected_id_count); + } else { + assert(outputs.used >= expected_id_count); + } + + size_t floor = 0; + for (size_t exp_i = 0; exp_i < outputs.used; exp_i++) { + bool found = false; + for (size_t got_i = floor; got_i < outputs.used; got_i++) { + if (outputs.ids[got_i] == test->inputs[i_i].expected_ids[exp_i]) { + floor = got_i + 1; + found = true; + break; + } + } + assert(found); + } + } + + fsm_free(fsm); + + return EXIT_SUCCESS;; +} diff --git a/tests/eager_output/utils.h b/tests/eager_output/utils.h new file mode 100644 index 000000000..672c01977 --- /dev/null +++ b/tests/eager_output/utils.h @@ -0,0 +1,64 @@ +#ifndef UTILS_H +#define UTILS_H + +#include +#include +#include +#include 
+#include + +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#define MAX_IDS 32 + +#include + +#include + +#define MAX_PATTERNS 150 +#define MAX_INPUTS 64 +#define MAX_ENDIDS 32 + +struct eager_output_test { + const char *patterns[MAX_PATTERNS]; + + struct { + const char *input; + bool expect_fail; + /* Terminated by 0. pattern[i] => id of i+1. Must be sorted. */ + fsm_output_id_t expected_ids[MAX_ENDIDS]; + } inputs[MAX_INPUTS]; +}; + +void +append_eager_output_cb(fsm_output_id_t id, void *opaque); + +int +cmp_output(const void *pa, const void *pb); + +int +run_test(const struct eager_output_test *test, bool allow_extra_outputs, bool force_endids); + +struct cb_info { + size_t used; + fsm_end_id_t ids[MAX_IDS]; +}; + +void +dump(const struct fsm *fsm); + +void +append_eager_output_cb(fsm_end_id_t id, void *opaque); + +#endif