From 3d4beb1c476fc9111a51f929d410440497e7d0a3 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 22 Apr 2024 16:55:47 -0400 Subject: [PATCH] re_strings: add support for endids. This adds an extra parameter to `re_strings_add_str` and `re_strings_add_raw` that (if non-NULL) will associate a single endid with the string being added. When `re_strings_build` constructs the DFA it will produce a separate end state for each end. This needs further testing with multiple overlapping patterns. When multiple literal strings appear in the input only the latest match will be reported. --- include/fsm/fsm.h | 6 ++++++ include/re/strings.h | 6 +++--- src/libfsm/endids.c | 10 ++++++++++ src/libfsm/libfsm.syms | 1 + src/libre/ac.c | 12 ++++++++++-- src/libre/ac.h | 5 ++++- src/libre/re_strings.c | 10 +++++----- 7 files changed, 39 insertions(+), 11 deletions(-) diff --git a/include/fsm/fsm.h b/include/fsm/fsm.h index 7c3883749..2b4c438f1 100644 --- a/include/fsm/fsm.h +++ b/include/fsm/fsm.h @@ -207,6 +207,12 @@ fsm_setend(struct fsm *fsm, fsm_state_t state, int end); int fsm_setendid(struct fsm *fsm, fsm_end_id_t id); +/* Associate a numeric ID with a specific end state in an fsm. + * Returns 1 on success, 0 on error. + * */ +int +fsm_setendidstate(struct fsm *fsm, fsm_state_t end_state, fsm_end_id_t id); + /* Get the end IDs associated with an end state, if any. 
* If id_buf has enough cells to store all the end IDs (according * to id_buf_count) then they are written into id_buf[] and diff --git a/include/re/strings.h b/include/re/strings.h index 06387a54c..fe4f7adc2 100644 --- a/include/re/strings.h +++ b/include/re/strings.h @@ -7,7 +7,7 @@ #ifndef RE_STRINGS_H #define RE_STRINGS_H -struct fsm; +#include <fsm/fsm.h> struct fsm_options; struct re_strings; @@ -42,10 +42,10 @@ void re_strings_free(struct re_strings *g); int -re_strings_add_raw(struct re_strings *g, const void *p, size_t n); +re_strings_add_raw(struct re_strings *g, const void *p, size_t n, const fsm_end_id_t *endid); int -re_strings_add_str(struct re_strings *g, const char *s); +re_strings_add_str(struct re_strings *g, const char *s, const fsm_end_id_t *endid); struct fsm * re_strings_build(struct re_strings *g, diff --git a/src/libfsm/endids.c b/src/libfsm/endids.c index 444ccbc2e..1fc98ae68 100644 --- a/src/libfsm/endids.c +++ b/src/libfsm/endids.c @@ -84,6 +84,16 @@ fsm_setendid(struct fsm *fsm, fsm_end_id_t id) return 1; } +int +fsm_setendidstate(struct fsm *fsm, fsm_state_t end_state, fsm_end_id_t id) +{ + enum fsm_endid_set_res sres = fsm_endid_set(fsm, end_state, id); + if (sres == FSM_ENDID_SET_ERROR_ALLOC_FAIL) { + return 0; + } + return 1; +} + enum fsm_getendids_res fsm_getendids(const struct fsm *fsm, fsm_state_t end_state, size_t id_buf_count, fsm_end_id_t *id_buf, diff --git a/src/libfsm/libfsm.syms b/src/libfsm/libfsm.syms index 415bffbea..a2570b8c9 100644 --- a/src/libfsm/libfsm.syms +++ b/src/libfsm/libfsm.syms @@ -89,6 +89,7 @@ fsm_getendids fsm_setendid fsm_mapendids fsm_increndids +fsm_setendidstate fsm_countedges fsm_countstates diff --git a/src/libre/ac.c b/src/libre/ac.c index 2ebdb4f7e..d5f8e6f4b 100644 --- a/src/libre/ac.c +++ b/src/libre/ac.c @@ -16,6 +16,7 @@ #include "ac.h" +#define ENDID_NONE ((fsm_end_id_t)-1) enum { POOL_BLOCK_SIZE = 256 }; struct trie_state { @@ -25,6 +26,7 @@ struct trie_state { unsigned int index; unsigned int output:1; 
unsigned int have_st:1; + fsm_end_id_t endid; /* or ENDID_NONE */ }; struct trie_pool { @@ -126,7 +128,7 @@ trie_create(void) } struct trie_graph * -trie_add_word(struct trie_graph *g, const char *w, size_t n) +trie_add_word(struct trie_graph *g, const char *w, size_t n, const fsm_end_id_t *endid) { struct trie_state *st; size_t i; @@ -159,6 +161,7 @@ trie_add_word(struct trie_graph *g, const char *w, size_t n) g->depth = n; } + st->endid = (endid == NULL ? ENDID_NONE : *endid); return g; } @@ -278,7 +281,7 @@ trie_to_fsm_state(struct trie_state *ts, struct fsm *fsm, assert(fsm != NULL); assert(q != NULL); - if (ts->output && have_end) { + if (ts->output && have_end && ts->endid == ENDID_NONE) { *q = single_end; return 1; } @@ -315,6 +318,11 @@ trie_to_fsm_state(struct trie_state *ts, struct fsm *fsm, if (ts->output) { fsm_setend(fsm, st, 1); + if (ts->endid != ENDID_NONE) { + if (!fsm_setendidstate(fsm, st, ts->endid)) { + return 0; + } + } } *q = st; diff --git a/src/libre/ac.h b/src/libre/ac.h index 480c76bfb..edabca27c 100644 --- a/src/libre/ac.h +++ b/src/libre/ac.h @@ -7,6 +7,8 @@ #ifndef AC_H #define AC_H +#include "fsm/fsm.h" + struct fsm; struct fsm_state; struct fsm_options; @@ -20,7 +22,8 @@ void trie_free(struct trie_graph *g); struct trie_graph * -trie_add_word(struct trie_graph *g, const char *w, size_t n); +trie_add_word(struct trie_graph *g, const char *w, size_t n, + const fsm_end_id_t *endid); int trie_add_failure_edges(struct trie_graph *g); diff --git a/src/libre/re_strings.c b/src/libre/re_strings.c index 06b7dc772..b2f04fec1 100644 --- a/src/libre/re_strings.c +++ b/src/libre/re_strings.c @@ -32,7 +32,7 @@ re_strings(const struct fsm_options *opt, const char *a[], size_t n, } for (i = 0; i < n; i++) { - if (!re_strings_add_str(g, a[i])) { + if (!re_strings_add_str(g, a[i], NULL)) { goto error; } } @@ -64,20 +64,20 @@ re_strings_free(struct re_strings *g) } int -re_strings_add_raw(struct re_strings *g, const void *p, size_t n) 
+re_strings_add_raw(struct re_strings *g, const void *p, size_t n, const fsm_end_id_t *endid) { assert(p != NULL); assert(n > 0); - return trie_add_word((struct trie_graph *) g, p, n) != NULL; + return trie_add_word((struct trie_graph *) g, p, n, endid) != NULL; } int -re_strings_add_str(struct re_strings *g, const char *s) +re_strings_add_str(struct re_strings *g, const char *s, const fsm_end_id_t *endid) { assert(s != NULL); - return re_strings_add_raw(g, s, strlen(s)); + return re_strings_add_raw(g, s, strlen(s), endid); } struct fsm *