Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Upstream sync: re_is_anchored and a few more misc. changes #31

Merged
merged 11 commits into from
Oct 12, 2024
Merged
2 changes: 1 addition & 1 deletion include/adt/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
/* If non-zero, expand the timer macros defined below, otherwise
* they compile away. */
#ifndef TRACK_TIMES
#define TRACK_TIMES 0
#define TRACK_TIMES (0 && !BUILD_FOR_FUZZER)
#endif

#if EXPENSIVE_CHECKS && TRACK_TIMES
Expand Down
15 changes: 15 additions & 0 deletions include/fsm/fsm.h
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,21 @@ fsm_remove_epsilons(struct fsm *fsm);
int
fsm_determinise(struct fsm *fsm);

/* Determinise, with a passed in configuration
* and a distinct return value for reaching
* the state limit. */
/* Options accepted by fsm_determinise_with_config.
 * Passing a NULL config pointer behaves like state_limit == 0. */
struct fsm_determinise_config {
/* Upper bound on the number of states; when it is exceeded,
 * determinisation stops and reports
 * FSM_DETERMINISE_WITH_CONFIG_STATE_LIMIT_REACHED. */
size_t state_limit; /* 0: no limit */
};
/* Result codes returned by fsm_determinise_with_config. */
enum fsm_determinise_with_config_res {
/* Determinisation finished (also returned when the FSM has no start state). */
FSM_DETERMINISE_WITH_CONFIG_OK,
/* A non-zero state_limit was configured and the state count exceeded it. */
FSM_DETERMINISE_WITH_CONFIG_STATE_LIMIT_REACHED,
/* Any other failure, e.g. an allocation or epsilon-removal failure.
 * NOTE(review): the name suggests errno describes the cause -- confirm. */
FSM_DETERMINISE_WITH_CONFIG_ERRNO,
};
enum fsm_determinise_with_config_res
fsm_determinise_with_config(struct fsm *fsm,
const struct fsm_determinise_config *config);

/*
* Make a DFA complete, as per fsm_iscomplete.
*/
Expand Down
7 changes: 6 additions & 1 deletion include/fsm/walk.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,11 @@ fsm_walk_edges(const struct fsm *fsm, void *opaque,
* functionally equivalent cases makes testing dramatically faster,
* but exploring every edge could be added later.
*
* If randomized is zero then it will generate the first label in the
* label set, otherwise a label from the set will be chosen using rand()
* (favoring printable characters). The caller can use srand()
* beforehand to set a PRNG seed.
*
* Note: fsm is non-const because it calls fsm_trim on the FSM
* internally. This records the shortest distance from each state to an
* end state, which is used to prune branches that would not produce
Expand All @@ -114,7 +119,7 @@ fsm_generate_matches_cb(const struct fsm *fsm,
const char *input, size_t input_length,
fsm_state_t end_state, void *opaque);
int
fsm_generate_matches(struct fsm *fsm, size_t max_length,
fsm_generate_matches(struct fsm *fsm, size_t max_length, int randomized,
fsm_generate_matches_cb *cb, void *opaque);

/* Callback provided for the most basic use case for
Expand Down
16 changes: 16 additions & 0 deletions include/re/re.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,22 @@ re_comp(enum re_dialect dialect,
const struct fsm_alloc *alloc,
enum re_flags flags, struct re_err *err);

/* Parse and analyze the regex enough to determine whether it is
* anchored at the start and/or end.
*
* As long as the result is checked for RE_IS_ANCHORED_ERROR first,
* the result can be used like a bitset. */
/* Result of re_is_anchored. Apart from RE_IS_ANCHORED_ERROR, the values
 * are bit flags: bit 0 means anchored at the start, bit 1 means anchored
 * at the end. */
enum re_is_anchored_res {
RE_IS_ANCHORED_NONE = 0x00, /* neither bit set */
RE_IS_ANCHORED_START = 0x01, /* anchored at the start */
RE_IS_ANCHORED_END = 0x02, /* anchored at the end */
RE_IS_ANCHORED_BOTH = 0x03, /* RE_IS_ANCHORED_START | RE_IS_ANCHORED_END */
/* Not a flag combination: its low bits overlap START and END, so it
 * must be compared for equality before testing individual bits. */
RE_IS_ANCHORED_ERROR = 0xFFFF,
};
enum re_is_anchored_res
re_is_anchored(enum re_dialect dialect, re_getchar_fun *f, void *opaque,
enum re_flags flags, struct re_err *err);

/*
* Return a human-readable string describing a given error code. The string
* returned has static storage, and must not be freed.
Expand Down
2 changes: 1 addition & 1 deletion src/fsm/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -770,7 +770,7 @@ main(int argc, char *argv[])
}

if (generate_bounds > 0) {
r = fsm_generate_matches(fsm, generate_bounds, fsm_generate_cb_printf_escaped, &opt);
r = fsm_generate_matches(fsm, generate_bounds, 0, fsm_generate_cb_printf_escaped, &opt);
}

fsm_free(fsm);
Expand Down
46 changes: 37 additions & 9 deletions src/libfsm/determinise.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,20 @@ dump_labels(FILE *f, const uint64_t labels[4])
}
}

int
fsm_determinise(struct fsm *nfa)
enum fsm_determinise_with_config_res
fsm_determinise_with_config(struct fsm *nfa,
const struct fsm_determinise_config *config)
{
int res = 0;
enum fsm_determinise_with_config_res res = FSM_DETERMINISE_WITH_CONFIG_ERRNO;
struct mappingstack *stack = NULL;

struct interned_state_set_pool *issp = NULL;
struct map map = { NULL, 0, 0, NULL };
struct mapping *curr = NULL;
size_t dfacount = 0;
const size_t state_limit = config == NULL
? 0
: config->state_limit;

struct analyze_closures_env ac_env = { 0 };

Expand All @@ -40,7 +44,7 @@ fsm_determinise(struct fsm *nfa)
*/
if (fsm_has(nfa, fsm_hasepsilons)) {
if (!fsm_remove_epsilons(nfa)) {
return 0;
return FSM_DETERMINISE_WITH_CONFIG_ERRNO;
}
}

Expand All @@ -52,7 +56,12 @@ fsm_determinise(struct fsm *nfa)

issp = interned_state_set_pool_alloc(nfa->alloc);
if (issp == NULL) {
return 0;
return FSM_DETERMINISE_WITH_CONFIG_ERRNO;
}

if (state_limit != 0 && fsm_countstates(nfa) > state_limit) {
res = FSM_DETERMINISE_WITH_CONFIG_STATE_LIMIT_REACHED;
goto cleanup;
}

{
Expand All @@ -74,7 +83,7 @@ fsm_determinise(struct fsm *nfa)
*/

if (!fsm_getstart(nfa, &start)) {
res = 1;
res = FSM_DETERMINISE_WITH_CONFIG_OK;
goto cleanup;
}

Expand Down Expand Up @@ -150,6 +159,11 @@ fsm_determinise(struct fsm *nfa)
assert(m->dfastate < dfacount);
} else {
/* not found -- add a new one and push it to the stack for processing */

if (state_limit != 0 && dfacount > state_limit) {
res = FSM_DETERMINISE_WITH_CONFIG_STATE_LIMIT_REACHED;
goto cleanup;
}
if (!map_add(&map, dfacount, iss, &m)) {
goto cleanup;
}
Expand All @@ -171,8 +185,6 @@ fsm_determinise(struct fsm *nfa)
}

ac_env.output_count = 0;

/* All elements in sclosures[] are interned, so they will be freed later. */
} while ((curr = stack_pop(stack)));

{
Expand Down Expand Up @@ -260,7 +272,7 @@ fsm_determinise(struct fsm *nfa)
assert(fsm_all(nfa, fsm_isdfa));
#endif

res = 1;
res = FSM_DETERMINISE_WITH_CONFIG_OK;

cleanup:
map_free(&map);
Expand Down Expand Up @@ -311,6 +323,22 @@ fsm_determinise(struct fsm *nfa)
return res;
}

/*
 * Determinise an FSM in place with no state limit.
 *
 * Thin wrapper around fsm_determinise_with_config that keeps the
 * original int-returning interface: 1 on success, 0 on any failure.
 */
int
fsm_determinise(struct fsm *nfa)
{
	/* A NULL config selects a state limit of 0 (no limit), so
	 * FSM_DETERMINISE_WITH_CONFIG_STATE_LIMIT_REACHED is not expected
	 * here; like every other non-OK result it is reported as 0. */
	if (fsm_determinise_with_config(nfa, NULL) == FSM_DETERMINISE_WITH_CONFIG_OK) {
		return 1;
	}

	return 0;
}

/* Add DFA_state to the list for NFA_state. */
static int
add_reverse_mapping(const struct fsm_alloc *alloc,
Expand Down
71 changes: 63 additions & 8 deletions src/libfsm/gen.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ struct gen_ctx {
fsm_generate_matches_cb *cb;

bool done;
bool randomized;

size_t buf_ceil;
size_t buf_used;
Expand Down Expand Up @@ -106,7 +107,7 @@ struct gen_ctx {
static bool
gen_init_outer(struct fsm *fsm, size_t max_length,
fsm_generate_matches_cb *cb, void *opaque,
bool randomized, unsigned seed);
bool randomized);

static bool
gen_init(struct gen_ctx *ctx, struct fsm *fsm);
Expand Down Expand Up @@ -139,17 +140,21 @@ static bool
grow_stack(struct gen_ctx *ctx);

int
fsm_generate_matches(struct fsm *fsm, size_t max_length,
fsm_generate_matches(struct fsm *fsm, size_t max_length, int randomized,
fsm_generate_matches_cb *cb, void *opaque)
{
if (max_length == 0) {
errno = EINVAL;
return 0;
}

if (!fsm_has(fsm, fsm_isend)) {
return 1; /* no end state -> nothing to do */
}

INIT_TIMERS();
TIME(&pre);
int res = gen_init_outer(fsm, max_length, cb, opaque, false, 0);
int res = gen_init_outer(fsm, max_length, cb, opaque, randomized != 0);
TIME(&post);

DIFF_MSEC("fsm_generate_matches", pre, post, NULL);
Expand Down Expand Up @@ -199,7 +204,7 @@ fsm_generate_cb_printf(const struct fsm *fsm,
static bool
gen_init_outer(struct fsm *fsm, size_t max_length,
fsm_generate_matches_cb *cb, void *opaque,
bool randomized, unsigned seed)
bool randomized)
{
int res = false;
if (fsm == NULL || cb == NULL || max_length == 0) {
Expand All @@ -208,9 +213,6 @@ gen_init_outer(struct fsm *fsm, size_t max_length,

assert(fsm_all(fsm, fsm_isdfa)); /* DFA-only */

assert(!randomized); /* not yet supported */
(void)seed;

#if LOG_GEN > 1
fprintf(stderr, "%s: %u states\n", __func__, fsm_countstates(fsm));
#endif
Expand All @@ -224,6 +226,7 @@ gen_init_outer(struct fsm *fsm, size_t max_length,
.max_length = max_length,
.cb = cb,
.opaque = opaque,
.randomized = randomized,
};

if (!gen_init(&ctx, fsm)) {
Expand Down Expand Up @@ -524,6 +527,55 @@ first_symbol(const uint64_t *symbols)
return 0;
}

/*
 * Choose one member of a 256-bit label set using rand().
 *
 * symbols points at four 64-bit words; bit (N % 64) of word (N / 64)
 * being set means byte value N is in the set.
 *
 * Preference order:
 *   1. a printable, non-zero byte (uniformly via rand() % count),
 *   2. otherwise a non-printable, non-zero byte (same selection),
 *   3. otherwise 0x00, if it is in the set.
 *
 * The set must not be empty; that case trips an assertion.
 */
static unsigned char
random_symbol(const uint64_t *symbols)
{
	unsigned char printable[256];
	unsigned char unprintable[256];
	size_t printable_count = 0;
	size_t unprintable_count = 0;
	bool nul_present = false;

	/* Bucket every member of the set, in ascending byte order. */
	for (unsigned word = 0; word < 4; word++) {
		const uint64_t bits = symbols[word];

		if (bits == 0) {
			continue; /* none of these 64 values are present */
		}

		for (unsigned bit = 0; bit < 64; bit++) {
			const unsigned value = 64 * word + bit;

			if ((bits & (1ULL << bit)) == 0) {
				continue;
			}

			if (value == 0) {
				nul_present = true;
			} else if (isprint(value)) {
				printable[printable_count++] = (unsigned char)value;
			} else {
				unprintable[unprintable_count++] = (unsigned char)value;
			}
		}
	}

	if (printable_count > 0) {
		return printable[rand() % printable_count];
	}

	if (unprintable_count > 0) {
		return unprintable[rand() % unprintable_count];
	}

	/* 0x00 is the last resort, since it would truncate the
	 * generated string. */
	if (nul_present) {
		return 0;
	}

	assert(!"empty set");
	return 0;
}

#if DUMP_EDGES
static void
dump_edges(fsm_state_t state, struct edge_set *edges)
Expand All @@ -538,6 +590,7 @@ dump_edges(fsm_state_t state, struct edge_set *edges)
size_t i = 0;
while (edge_set_group_iter_next(&ei, &eg)) {
const unsigned char symbol = first_symbol(eg.symbols);
const unsigned char symbol = random_symbol(eg.symbols);
fprintf(stderr, "%s: %d -- %zu/%zu -- 0x%02x (%c) -> %d\n",
__func__, state, i, count,
symbol, isprint(symbol) ? symbol : '.', eg.to);
Expand Down Expand Up @@ -585,7 +638,9 @@ sfs_step_edges(struct gen_ctx *ctx, struct gen_stack_frame *sf)
struct edge_group_iter_info eg;

if (iter_next_transition(ctx, sf, &eg)) {
const unsigned char symbol = first_symbol(eg.symbols);
const unsigned char symbol = ctx->randomized
? random_symbol(eg.symbols)
: first_symbol(eg.symbols);
const fsm_state_t state = eg.to;

LOG(2, "sfs_step_edges: got edge 0x%x ('%c')\n",
Expand Down
1 change: 1 addition & 0 deletions src/libfsm/libfsm.syms
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ fsm_countstates
fsm_trim
fsm_reverse
fsm_determinise
fsm_determinise_with_config
fsm_remove_epsilons
fsm_complete
fsm_minimise
Expand Down
4 changes: 4 additions & 0 deletions src/libfsm/trim.c
Original file line number Diff line number Diff line change
Expand Up @@ -462,6 +462,10 @@ integrity_check(const char *descr, const struct fsm *fsm)
return;
#endif

#if !EXPENSIVE_CHECKS
return;
#endif

if (LOG_TRIM > 1) {
fprintf(stderr, "integrity check: %s...\n", descr);
}
Expand Down
1 change: 1 addition & 0 deletions src/libre/libre.syms
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ re_is_literal
re_flags
re_strerror
re_perror
re_is_anchored

ast_print
ast_print_dot
Expand Down
Loading
Loading