Skip to content

Commit

Permalink
Merge pull request #31 from fastly/sv/upstream-sync-re-is-anchored
Browse files Browse the repository at this point in the history
Upstream sync: `re_is_anchored` and a few more misc. changes
  • Loading branch information
silentbicycle authored Oct 12, 2024
2 parents 8b53634 + 09a917a commit 592b613
Show file tree
Hide file tree
Showing 15 changed files with 186 additions and 23 deletions.
2 changes: 1 addition & 1 deletion include/adt/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
/* If non-zero, expand the timer macros defined below, otherwise
* they compile away. */
#ifndef TRACK_TIMES
#define TRACK_TIMES 0
#define TRACK_TIMES (0 && !BUILD_FOR_FUZZER)
#endif

#if EXPENSIVE_CHECKS && TRACK_TIMES
Expand Down
15 changes: 15 additions & 0 deletions include/fsm/fsm.h
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,21 @@ fsm_remove_epsilons(struct fsm *fsm);
int
fsm_determinise(struct fsm *fsm);

/* Determinise, with a passed in configuration
* and a distinct return value for reaching
* the state limit. */
/* Options accepted by fsm_determinise_with_config. */
struct fsm_determinise_config {
/* Upper bound on the number of states tolerated during
 * determinisation; exceeding it makes the call stop and report
 * FSM_DETERMINISE_WITH_CONFIG_STATE_LIMIT_REACHED. */
size_t state_limit; /* 0: no limit */
};
/* Result codes returned by fsm_determinise_with_config. */
enum fsm_determinise_with_config_res {
/* Determinisation completed. */
FSM_DETERMINISE_WITH_CONFIG_OK,
/* Stopped early because the configured state_limit was exceeded. */
FSM_DETERMINISE_WITH_CONFIG_STATE_LIMIT_REACHED,
/* Any other failure (e.g. an allocation failing).
 * NOTE(review): the name suggests errno describes the cause -- confirm. */
FSM_DETERMINISE_WITH_CONFIG_ERRNO,
};
/* config may be NULL, which is treated the same as a state_limit
 * of 0 (no limit). */
enum fsm_determinise_with_config_res
fsm_determinise_with_config(struct fsm *fsm,
const struct fsm_determinise_config *config);

/*
* Make a DFA complete, as per fsm_iscomplete.
*/
Expand Down
7 changes: 6 additions & 1 deletion include/fsm/walk.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,11 @@ fsm_walk_edges(const struct fsm *fsm, void *opaque,
* functionally equivalent cases makes testing dramatically faster,
* but exploring every edge could be added later.
*
* If randomized is zero then it will generate the first label in the
* label set, otherwise a label from the set will be chosen using rand()
* (favoring printable characters). The caller can use srand()
* beforehand to set a PRNG seed.
*
* Note: fsm is non-const because it calls fsm_trim on the FSM
* internally. This records the shortest distance from each state to an
* end state, which is used to prune branches that would not produce
Expand All @@ -114,7 +119,7 @@ fsm_generate_matches_cb(const struct fsm *fsm,
const char *input, size_t input_length,
fsm_state_t end_state, void *opaque);
int
fsm_generate_matches(struct fsm *fsm, size_t max_length,
fsm_generate_matches(struct fsm *fsm, size_t max_length, int randomized,
fsm_generate_matches_cb *cb, void *opaque);

/* Callback provided for the most basic use case for
Expand Down
16 changes: 16 additions & 0 deletions include/re/re.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,22 @@ re_comp(enum re_dialect dialect,
const struct fsm_alloc *alloc,
enum re_flags flags, struct re_err *err);

/* Parse and analyze the regex enough to determine whether it is
* anchored at the start and/or end.
*
* As long as the result is checked for RE_IS_ANCHORED_ERROR first,
* the result can be used like a bitset. */
enum re_is_anchored_res {
RE_IS_ANCHORED_NONE = 0x00, /* neither bit set */
RE_IS_ANCHORED_START = 0x01, /* bit: anchored at the start */
RE_IS_ANCHORED_END = 0x02, /* bit: anchored at the end */
RE_IS_ANCHORED_BOTH = 0x03, /* RE_IS_ANCHORED_START | RE_IS_ANCHORED_END */
/* Failure sentinel. Its value also has the START and END bits set,
 * so compare against it before testing individual bits. */
RE_IS_ANCHORED_ERROR = 0xFFFF,
};
enum re_is_anchored_res
re_is_anchored(enum re_dialect dialect, re_getchar_fun *f, void *opaque,
enum re_flags flags, struct re_err *err);

/*
* Return a human-readable string describing a given error code. The string
* returned has static storage, and must not be freed.
Expand Down
2 changes: 1 addition & 1 deletion src/fsm/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -770,7 +770,7 @@ main(int argc, char *argv[])
}

if (generate_bounds > 0) {
r = fsm_generate_matches(fsm, generate_bounds, fsm_generate_cb_printf_escaped, &opt);
r = fsm_generate_matches(fsm, generate_bounds, 0, fsm_generate_cb_printf_escaped, &opt);
}

fsm_free(fsm);
Expand Down
46 changes: 37 additions & 9 deletions src/libfsm/determinise.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,20 @@ dump_labels(FILE *f, const uint64_t labels[4])
}
}

int
fsm_determinise(struct fsm *nfa)
enum fsm_determinise_with_config_res
fsm_determinise_with_config(struct fsm *nfa,
const struct fsm_determinise_config *config)
{
int res = 0;
enum fsm_determinise_with_config_res res = FSM_DETERMINISE_WITH_CONFIG_ERRNO;
struct mappingstack *stack = NULL;

struct interned_state_set_pool *issp = NULL;
struct map map = { NULL, 0, 0, NULL };
struct mapping *curr = NULL;
size_t dfacount = 0;
const size_t state_limit = config == NULL
? 0
: config->state_limit;

struct analyze_closures_env ac_env = { 0 };

Expand All @@ -40,7 +44,7 @@ fsm_determinise(struct fsm *nfa)
*/
if (fsm_has(nfa, fsm_hasepsilons)) {
if (!fsm_remove_epsilons(nfa)) {
return 0;
return FSM_DETERMINISE_WITH_CONFIG_ERRNO;
}
}

Expand All @@ -52,7 +56,12 @@ fsm_determinise(struct fsm *nfa)

issp = interned_state_set_pool_alloc(nfa->alloc);
if (issp == NULL) {
return 0;
return FSM_DETERMINISE_WITH_CONFIG_ERRNO;
}

if (state_limit != 0 && fsm_countstates(nfa) > state_limit) {
res = FSM_DETERMINISE_WITH_CONFIG_STATE_LIMIT_REACHED;
goto cleanup;
}

{
Expand All @@ -74,7 +83,7 @@ fsm_determinise(struct fsm *nfa)
*/

if (!fsm_getstart(nfa, &start)) {
res = 1;
res = FSM_DETERMINISE_WITH_CONFIG_OK;
goto cleanup;
}

Expand Down Expand Up @@ -150,6 +159,11 @@ fsm_determinise(struct fsm *nfa)
assert(m->dfastate < dfacount);
} else {
/* not found -- add a new one and push it to the stack for processing */

if (state_limit != 0 && dfacount > state_limit) {
res = FSM_DETERMINISE_WITH_CONFIG_STATE_LIMIT_REACHED;
goto cleanup;
}
if (!map_add(&map, dfacount, iss, &m)) {
goto cleanup;
}
Expand All @@ -171,8 +185,6 @@ fsm_determinise(struct fsm *nfa)
}

ac_env.output_count = 0;

/* All elements in sclosures[] are interned, so they will be freed later. */
} while ((curr = stack_pop(stack)));

{
Expand Down Expand Up @@ -260,7 +272,7 @@ fsm_determinise(struct fsm *nfa)
assert(fsm_all(nfa, fsm_isdfa));
#endif

res = 1;
res = FSM_DETERMINISE_WITH_CONFIG_OK;

cleanup:
map_free(&map);
Expand Down Expand Up @@ -311,6 +323,22 @@ fsm_determinise(struct fsm *nfa)
return res;
}

int
fsm_determinise(struct fsm *nfa)
{
enum fsm_determinise_with_config_res res = fsm_determinise_with_config(nfa, NULL);
switch (res) {
case FSM_DETERMINISE_WITH_CONFIG_OK:
return 1;
case FSM_DETERMINISE_WITH_CONFIG_STATE_LIMIT_REACHED:
/* unreachable */
return 0;
case FSM_DETERMINISE_WITH_CONFIG_ERRNO:
default:
return 0;
}
}

/* Add DFA_state to the list for NFA_state. */
static int
add_reverse_mapping(const struct fsm_alloc *alloc,
Expand Down
71 changes: 63 additions & 8 deletions src/libfsm/gen.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ struct gen_ctx {
fsm_generate_matches_cb *cb;

bool done;
bool randomized;

size_t buf_ceil;
size_t buf_used;
Expand Down Expand Up @@ -106,7 +107,7 @@ struct gen_ctx {
static bool
gen_init_outer(struct fsm *fsm, size_t max_length,
fsm_generate_matches_cb *cb, void *opaque,
bool randomized, unsigned seed);
bool randomized);

static bool
gen_init(struct gen_ctx *ctx, struct fsm *fsm);
Expand Down Expand Up @@ -139,17 +140,21 @@ static bool
grow_stack(struct gen_ctx *ctx);

int
fsm_generate_matches(struct fsm *fsm, size_t max_length,
fsm_generate_matches(struct fsm *fsm, size_t max_length, int randomized,
fsm_generate_matches_cb *cb, void *opaque)
{
if (max_length == 0) {
errno = EINVAL;
return 0;
}

if (!fsm_has(fsm, fsm_isend)) {
return 1; /* no end state -> nothing to do */
}

INIT_TIMERS();
TIME(&pre);
int res = gen_init_outer(fsm, max_length, cb, opaque, false, 0);
int res = gen_init_outer(fsm, max_length, cb, opaque, randomized != 0);
TIME(&post);

DIFF_MSEC("fsm_generate_matches", pre, post, NULL);
Expand Down Expand Up @@ -199,7 +204,7 @@ fsm_generate_cb_printf(const struct fsm *fsm,
static bool
gen_init_outer(struct fsm *fsm, size_t max_length,
fsm_generate_matches_cb *cb, void *opaque,
bool randomized, unsigned seed)
bool randomized)
{
int res = false;
if (fsm == NULL || cb == NULL || max_length == 0) {
Expand All @@ -208,9 +213,6 @@ gen_init_outer(struct fsm *fsm, size_t max_length,

assert(fsm_all(fsm, fsm_isdfa)); /* DFA-only */

assert(!randomized); /* not yet supported */
(void)seed;

#if LOG_GEN > 1
fprintf(stderr, "%s: %u states\n", __func__, fsm_countstates(fsm));
#endif
Expand All @@ -224,6 +226,7 @@ gen_init_outer(struct fsm *fsm, size_t max_length,
.max_length = max_length,
.cb = cb,
.opaque = opaque,
.randomized = randomized,
};

if (!gen_init(&ctx, fsm)) {
Expand Down Expand Up @@ -524,6 +527,55 @@ first_symbol(const uint64_t *symbols)
return 0;
}

/* Pick one member of a 256-entry label set using rand().
 *
 * symbols points at four 64-bit words; bit (N % 64) of word (N / 64)
 * being set means byte value N is in the set.
 *
 * Selection order:
 *  1. a random printable character (per isprint), if the set has any;
 *  2. otherwise a random non-printable, non-zero byte, if it has any;
 *  3. otherwise 0x00, which is the least preferred choice because it
 *     would truncate the generated string.
 *
 * The set must not be empty (checked with an assert). */
static unsigned char
random_symbol(const uint64_t *symbols)
{
	unsigned char printable[256];
	unsigned char other[256];
	size_t printable_n = 0;
	size_t other_n = 0;
	bool saw_nul = false;

	for (unsigned word = 0; word < 4; word++) {
		const uint64_t bits = symbols[word];

		if (bits == 0) {
			continue;	/* none of these 64 values are present */
		}

		for (unsigned bit = 0; bit < 64; bit++) {
			if ((bits & (1ULL << bit)) == 0) {
				continue;
			}

			const unsigned value = 64 * word + bit;
			if (value == 0) {
				saw_nul = true;
			} else if (isprint(value)) {
				printable[printable_n++] = (unsigned char)value;
			} else {
				other[other_n++] = (unsigned char)value;
			}
		}
	}

	if (printable_n > 0) {
		const size_t pick = rand() % printable_n;
		return printable[pick];
	}

	if (other_n > 0) {
		const size_t pick = rand() % other_n;
		return other[pick];
	}

	/* Only 0x00 can remain at this point; anything else means the
	 * caller passed an empty set. */
	if (!saw_nul) {
		assert(!"empty set");
	}
	return 0;
}

#if DUMP_EDGES
static void
dump_edges(fsm_state_t state, struct edge_set *edges)
Expand All @@ -538,6 +590,7 @@ dump_edges(fsm_state_t state, struct edge_set *edges)
size_t i = 0;
while (edge_set_group_iter_next(&ei, &eg)) {
const unsigned char symbol = first_symbol(eg.symbols);
const unsigned char symbol = random_symbol(eg.symbols);
fprintf(stderr, "%s: %d -- %zu/%zu -- 0x%02x (%c) -> %d\n",
__func__, state, i, count,
symbol, isprint(symbol) ? symbol : '.', eg.to);
Expand Down Expand Up @@ -585,7 +638,9 @@ sfs_step_edges(struct gen_ctx *ctx, struct gen_stack_frame *sf)
struct edge_group_iter_info eg;

if (iter_next_transition(ctx, sf, &eg)) {
const unsigned char symbol = first_symbol(eg.symbols);
const unsigned char symbol = ctx->randomized
? random_symbol(eg.symbols)
: first_symbol(eg.symbols);
const fsm_state_t state = eg.to;

LOG(2, "sfs_step_edges: got edge 0x%x ('%c')\n",
Expand Down
1 change: 1 addition & 0 deletions src/libfsm/libfsm.syms
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ fsm_countstates
fsm_trim
fsm_reverse
fsm_determinise
fsm_determinise_with_config
fsm_remove_epsilons
fsm_complete
fsm_minimise
Expand Down
4 changes: 4 additions & 0 deletions src/libfsm/trim.c
Original file line number Diff line number Diff line change
Expand Up @@ -462,6 +462,10 @@ integrity_check(const char *descr, const struct fsm *fsm)
return;
#endif

#if !EXPENSIVE_CHECKS
return;
#endif

if (LOG_TRIM > 1) {
fprintf(stderr, "integrity check: %s...\n", descr);
}
Expand Down
1 change: 1 addition & 0 deletions src/libre/libre.syms
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ re_is_literal
re_flags
re_strerror
re_perror
re_is_anchored

ast_print
ast_print_dot
Expand Down
Loading

0 comments on commit 592b613

Please sign in to comment.