diff --git a/Makefile b/Makefile
index 499239fd9..e6d578f2c 100644
--- a/Makefile
+++ b/Makefile
@@ -114,6 +114,7 @@ SUBDIR += tests/intersect
 SUBDIR += tests/eclosure
 SUBDIR += tests/equals
 SUBDIR += tests/subtract
+SUBDIR += tests/detect_required
 SUBDIR += tests/determinise
 SUBDIR += tests/endids
 SUBDIR += tests/epsilons
diff --git a/include/fsm/walk.h b/include/fsm/walk.h
index b433380d5..bc030a017 100644
--- a/include/fsm/walk.h
+++ b/include/fsm/walk.h
@@ -7,6 +7,8 @@
 #ifndef FSM_WALK_H
 #define FSM_WALK_H
 
+#include
+
 struct fsm;
 struct fsm_state;
 
@@ -128,5 +130,34 @@ fsm_generate_matches_cb fsm_generate_cb_printf;
  * to escape all characters or just nonprintable ones. */
 fsm_generate_matches_cb fsm_generate_cb_printf_escaped;
 
+/* Walk a DFA and detect which characters MUST appear in the input for a
+ * match to be possible. For example, if input for the DFA corresponding
+ * to /^(abc|dbe)$/ does not contain 'b' at all, there's no way it can
+ * ever match, so executing the regex is unnecessary. This does not detect
+ * which characters must appear before/after others or how many times, just
+ * which must be present.
+ *
+ * The input must be a DFA. When run with EXPENSIVE_CHECKS this will
+ * check and return ERROR_MISUSE if it is not, otherwise this is an
+ * unchecked error.
+ *
+ * The bitmap will be cleared before populating. Afterward,
+ * bm_count(bitmap) will return how many required characters were
+ * found.
+ *
+ * There is an optional step_limit -- if this is reached, then it will
+ * return FSM_DETECT_REQUIRED_CHARACTERS_STEP_LIMIT_REACHED and a
+ * cleared bitmap, because any partial information could still have been
+ * contradicted later. If the step_limit is 0 it will be ignored. */
+enum fsm_detect_required_characters_res {
+	FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN,
+	FSM_DETECT_REQUIRED_CHARACTERS_STEP_LIMIT_REACHED,
+	FSM_DETECT_REQUIRED_CHARACTERS_ERROR_MISUSE = -1,
+	FSM_DETECT_REQUIRED_CHARACTERS_ERROR_ALLOC = -2,
+};
+enum fsm_detect_required_characters_res
+fsm_detect_required_characters(const struct fsm *dfa, size_t step_limit,
+    struct bm *bitmap);
+
 #endif
diff --git a/src/libfsm/Makefile b/src/libfsm/Makefile
index 9af51a5a4..1fe70bec4 100644
--- a/src/libfsm/Makefile
+++ b/src/libfsm/Makefile
@@ -6,6 +6,7 @@ SRC += src/libfsm/complete.c
 SRC += src/libfsm/consolidate.c
 SRC += src/libfsm/clone.c
 SRC += src/libfsm/closure.c
+SRC += src/libfsm/detect_required.c
 SRC += src/libfsm/edge.c
 SRC += src/libfsm/empty.c
 SRC += src/libfsm/end.c
diff --git a/src/libfsm/detect_required.c b/src/libfsm/detect_required.c
new file mode 100644
index 000000000..3d6bf9edd
--- /dev/null
+++ b/src/libfsm/detect_required.c
@@ -0,0 +1,562 @@
+/*
+ * Copyright 2024 Scott Vokes
+ *
+ * See LICENCE for the full copyright terms.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include "internal.h"
+
+#define END_OF_FREELIST ((fsm_state_t)-1)
+#define DEF_CURSOR_CEIL 1 /* force frequent realloc */
+
+#define LOG_BASE 0
+#define LOG_RMAP (LOG_BASE + 0)
+#define LOG_QUEUE (LOG_BASE + 0)
+#define LOG_MERGE (LOG_BASE + 0)
+#define LOG_CURSOR (LOG_BASE + 0)
+#define LOG_PROGRESS (LOG_BASE + 0)
+
+#define USE_UNIQUE_ID 1
+
+struct drc_cursor {
+	struct bm bitmap;
+	fsm_state_t state; /* or freelist id, or END_OF_FREELIST */
+#if USE_UNIQUE_ID
+	size_t unique_id;
+#endif
+	uint64_t visited[/* u64bitset_words(state_count) */];
+};
+
+struct drc_state {
+	const struct fsm_alloc *alloc;
+	const struct fsm *dfa;
+	const size_t state_count;
+	const size_t visited_words;
+
+#if USE_UNIQUE_ID
+	size_t unique_id_counter;
+#endif
+
+	struct {
+		size_t ceil;
+		struct drc_cursor **cursors;
+
+		/* This (and the cursor->state field) are used as a
+		 * freelist. There can never be more cursors than
+		 * there are end states, so the cursor ID must fit
+		 * in an fsm_state_t. */
+		fsm_state_t freelist;
+	} cursor;
+
+	size_t edge_count;
+	size_t unique_edge_count;
+	struct edge_alist {
+		fsm_state_t from;
+		fsm_state_t to;
+		bool unique;
+		uint8_t unique_char;
+	} *rmap;
+
+	/* Accumulator for intersection of bitmaps. Set to the bitmap of
+	 * the first cursor to reach the start state, and intersected
+	 * thereafter. */
+	struct bm accum;
+};
+
+/* qsort comparator for the reverse map: order by destination state, then
+ * by source state, so every edge leading into a state is contiguous. */
+static int
+cmp_rmap_by_to(const void *pa, const void *pb)
+{
+	const struct edge_alist *a = (const struct edge_alist *)pa;
+	const struct edge_alist *b = (const struct edge_alist *)pb;
+
+	/* this is a reverse mapping, so sort by the to state */
+	if (a->to < b->to) { return -1; }
+	if (a->to > b->to) { return 1; }
+
+	if (a->from < b->from) { return -1; }
+	if (a->from > b->from) { return 1; }
+
+	/* shouldn't get here: should be unique */
+	return 0;
+}
+
+/* Build state->rmap: one entry per (from, to) edge group, sorted by
+ * destination, recording whether the group carries exactly one label
+ * and, if so, which one. Self-edges are skipped. Returns false on
+ * allocation failure. With no edges, state->rmap is left NULL. */
+static bool
+init_rmap(const struct fsm *dfa, struct drc_state *state)
+{
+	struct edge_group_iter iter;
+	struct edge_group_iter_info info;
+
+	state->edge_count = 0;
+	state->unique_edge_count = 0;
+
+	/* first pass: count edges */
+	for (fsm_state_t s_i = 0; s_i < state->state_count; s_i++) {
+		edge_set_group_iter_reset(dfa->states[s_i].edges,
+		    EDGE_GROUP_ITER_ALL, &iter);
+		while (edge_set_group_iter_next(&iter, &info)) {
+			if (info.to == s_i) { continue; } /* ignored */
+			state->edge_count++;
+		}
+	}
+
+	/* A zero-byte allocation may return NULL, which must not be
+	 * reported as an allocation failure. */
+	if (state->edge_count == 0) {
+		state->rmap = NULL;
+		return true;
+	}
+
+	struct edge_alist *rmap = f_malloc(state->alloc,
+	    state->edge_count * sizeof(rmap[0]));
+	if (rmap == NULL) { return false; }
+
+	/* second pass: populate */
+	size_t rmap_used = 0;
+	for (fsm_state_t s_i = 0; s_i < state->state_count; s_i++) {
+		edge_set_group_iter_reset(dfa->states[s_i].edges,
+		    EDGE_GROUP_ITER_ALL, &iter);
+		while (edge_set_group_iter_next(&iter, &info)) {
+			/* filter self-edges, they don't impact the result */
+			if (info.to == s_i) { continue; }
+
+			struct edge_alist *elt = &rmap[rmap_used];
+			elt->from = s_i;
+			elt->to = info.to;
+
+			size_t label_count = 0;
+			for (size_t i = 0; i < 4; i++) {
+				label_count += (size_t)u64bitset_popcount(info.symbols[i]);
+			}
+			assert(label_count > 0);
+
+			if (label_count == 1) {
+				state->unique_edge_count++;
+				elt->unique = true;
+				bool unique_char_found = false;
+				for (size_t i = 0; i < 4 && !unique_char_found; i++) {
+					const uint64_t w = info.symbols[i];
+					if (w == 0) { continue; }
+					for (uint64_t bit_i = 0; bit_i < 64; bit_i++) {
+						if (w & (1ULL << bit_i)) {
+							elt->unique_char = (uint8_t)(64*i + bit_i);
+							unique_char_found = true;
+							break;
+						}
+					}
+				}
+				assert(unique_char_found);
+			} else {
+				elt->unique = false;
+				elt->unique_char = 0;
+			}
+
+			rmap_used++;
+		}
+	}
+
+	/* invert mapping */
+	qsort(rmap, state->edge_count, sizeof(rmap[0]), cmp_rmap_by_to);
+
+#if LOG_RMAP
+	for (size_t i = 0; i < rmap_used; i++) {
+		struct edge_alist *elt = &rmap[i];
+		fprintf(stderr, "%s: rmap[%zu]: from %u, to %u, unique ? %d",
+		    __func__, i, elt->from, elt->to, elt->unique);
+		if (elt->unique) {
+			fprintf(stderr, " -- 0x%02x\n", elt->unique_char);
+		} else {
+			fprintf(stderr, "\n");
+		}
+	}
+#endif
+
+	state->rmap = rmap;
+	return true;
+}
+
+/* Take a cursor off the freelist, growing the pool when it is empty, and
+ * hand back its ID with a cleared bitmap and visited set. Returns false
+ * on allocation failure, leaving the pool in a state cleanup can free. */
+static bool
+request_cursor(struct drc_state *state, fsm_state_t *cursor_id)
+{
+	fsm_state_t freelist = state->cursor.freelist;
+	if (freelist == END_OF_FREELIST) {
+		const size_t oceil = state->cursor.ceil;
+		const size_t nceil = oceil == 0
+		    ? DEF_CURSOR_CEIL
+		    : 2*state->cursor.ceil;
+
+#if LOG_CURSOR
+		fprintf(stderr, "%s: growing %zu -> %zu\n", __func__, oceil, nceil);
+#endif
+
+		struct drc_cursor **ncursors = f_realloc(state->alloc,
+		    state->cursor.cursors, nceil * sizeof(ncursors[0]));
+		if (ncursors == NULL) { return false; }
+
+		/* The old array may have been moved or freed by the
+		 * realloc, so record the new one before anything else
+		 * can fail. ceil is only raised once every new slot is
+		 * populated, so cleanup never sees an unset pointer. */
+		state->cursor.cursors = ncursors;
+
+		/* allocate new cursors */
+		for (size_t i = oceil; i < nceil; i++) {
+			struct drc_cursor *c = f_malloc(state->alloc, sizeof(*c)
+			    + state->visited_words * sizeof(c->visited[0]));
+			if (c == NULL) {
+				/* undo this round's partial allocations */
+				while (i > oceil) {
+					i--;
+					f_free(state->alloc, ncursors[i]);
+				}
+				return false;
+			}
+			ncursors[i] = c;
+		}
+
+		/* link on freelist */
+		for (size_t i = oceil; i < nceil; i++) {
+			struct drc_cursor *c = ncursors[i];
+			fsm_state_t next = (fsm_state_t)(i + 1);
+			if (next == nceil) { next = END_OF_FREELIST; }
+			c->state = next;
+		}
+
+		state->cursor.ceil = nceil;
+		state->cursor.freelist = oceil;
+		freelist = state->cursor.freelist;
+	}
+
+	assert(freelist < state->cursor.ceil);
+	struct drc_cursor *c = state->cursor.cursors[freelist];
+	state->cursor.freelist = c->state; /* next link */
+	c->state = (fsm_state_t)-2;
+	bm_clear(&c->bitmap);
+	memset(c->visited, 0x00, state->visited_words * sizeof(c->visited[0]));
+
+#if LOG_CURSOR > 1
+	fprintf(stderr, "%s: requested cursor_id %u\n", __func__, freelist);
+#endif
+	*cursor_id = freelist;
+	return true;
+}
+
+/* Return a cursor to the freelist; its contents are reset on reuse. */
+static void
+release_cursor(struct drc_state *state, fsm_state_t cursor_id)
+{
+	assert(cursor_id < state->cursor.ceil);
+	struct drc_cursor *c = state->cursor.cursors[cursor_id];
+	c->state = state->cursor.freelist;
+
+#if LOG_CURSOR > 1
+	fprintf(stderr, "%s: released cursor_id %u\n", __func__, cursor_id);
+#endif
+	state->cursor.freelist = cursor_id;
+}
+
+/* Find the first reverse-map entry whose destination is `state`, or
+ * edge_count if there is none. rmap is sorted by destination (see
+ * cmp_rmap_by_to), so this is a lower-bound binary search. */
+static size_t
+rmap_seek(const struct edge_alist *rmap, size_t edge_count, fsm_state_t state)
+{
+	size_t lo = 0;
+	size_t hi = edge_count;
+
+	while (lo < hi) {
+		const size_t mid = lo + (hi - lo)/2;
+		if (rmap[mid].to < state) {
+			lo = mid + 1;
+		} else {
+			hi = mid;
+		}
+	}
+
+	if (lo < edge_count && rmap[lo].to == state) { return lo; }
+	return edge_count; /* not found */
+}
+
+static struct drc_cursor *
+get_cursor(struct drc_state *state, fsm_state_t cursor_id)
+{
+	/* this function exists to wrap the assert */
+	assert(cursor_id < state->cursor.ceil);
+	return state->cursor.cursors[cursor_id];
+}
+
+/* Public entry point; the contract is documented alongside the
+ * declaration in fsm/walk.h. Returns ERROR_MISUSE (with the bitmap
+ * cleared) when the DFA has no start state. */
+enum fsm_detect_required_characters_res
+fsm_detect_required_characters(const struct fsm *dfa, size_t step_limit, struct bm *bitmap)
+{
+	assert(dfa != NULL);
+	assert(bitmap != NULL);
+
+	#if EXPENSIVE_CHECKS
+	if (!fsm_all(dfa, fsm_isdfa)) {
+		return FSM_DETECT_REQUIRED_CHARACTERS_ERROR_MISUSE;
+	}
+	#endif
+
+	enum fsm_detect_required_characters_res res = FSM_DETECT_REQUIRED_CHARACTERS_ERROR_MISUSE;
+
+	const size_t state_count = fsm_countstates(dfa);
+
+	/* Everything the cleanup path touches is initialized before the
+	 * first goto, so it never reads indeterminate values. */
+	struct drc_state state = {
+		.alloc = dfa->opt->alloc,
+		.dfa = dfa,
+		.state_count = state_count,
+		.visited_words = u64bitset_words(state_count),
+		.cursor.freelist = END_OF_FREELIST,
+	};
+	struct queue *q = NULL;
+
+	bm_clear(bitmap);
+
+	fsm_state_t start_state;
+	if (!fsm_getstart(dfa, &start_state)) {
+		goto cleanup;
+	}
+
+	q = queue_new_dynamic(dfa->opt->alloc, state_count);
+	if (q == NULL) {
+		res = FSM_DETECT_REQUIRED_CHARACTERS_ERROR_ALLOC;
+		goto cleanup;
+	}
+
+	if (!init_rmap(dfa, &state)) {
+		res = FSM_DETECT_REQUIRED_CHARACTERS_ERROR_ALLOC;
+		goto cleanup;
+	}
+
+	/* If the DFA doesn't have any single-label edges, then walking
+	 * the paths from every end state won't add any constraints. */
+	if (state.unique_edge_count == 0) {
+		res = FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN;
+		goto cleanup;
+	}
+
+	size_t steps = 0;
+
+	/* Do the analysis for each end state, adding extra cursors
+	 * wherever the path diverges, and intersect the requirement
+	 * bitmaps across all cursors for all end states. The total cost
+	 * is proportional to the number of states and the number of end
+	 * states. */
+	bool first_path = true;
+	for (fsm_state_t s_i = 0; s_i < state_count; s_i++) {
+		if (!fsm_isend(dfa, s_i)) { continue; }
+
+#if LOG_PROGRESS
+		fprintf(stderr, "-- analyzing end-state %u\n", s_i);
+#endif
+
+		fsm_state_t s;
+		assert(!queue_pop(q, &s)); /* empty */
+
+		/* This is managed by ID rather than pointer because the
+		 * pointers become stale whenever the array is reallocated. */
+		fsm_state_t cursor_id;
+		if (!request_cursor(&state, &cursor_id)) {
+			res = FSM_DETECT_REQUIRED_CHARACTERS_ERROR_ALLOC;
+			goto cleanup;
+		}
+
+		{
+			struct drc_cursor *cursor = get_cursor(&state, cursor_id);
+			cursor->state = s_i;
+
+#if LOG_QUEUE
+			fprintf(stderr, "%s: queue pushing %u (at end state)\n", __func__, cursor_id);
+#endif
+			if (!queue_push(q, cursor_id)) {
+				res = FSM_DETECT_REQUIRED_CHARACTERS_ERROR_ALLOC;
+				goto cleanup;
+			}
+
+#if LOG_PROGRESS > 1
+			fprintf(stderr, "%s: marking end state %u visited on cursor %u\n", __func__, s_i, cursor_id);
+#endif
+			u64bitset_set(cursor->visited, s_i);
+		}
+
+		while (queue_pop(q, &cursor_id)) {
+			steps++;
+#if LOG_PROGRESS
+			if ((steps % 10000) == 0) {
+				fprintf(stderr, " -- %zu steps...\n", steps);
+			}
+#endif
+			if (steps == step_limit) {
+				res = FSM_DETECT_REQUIRED_CHARACTERS_STEP_LIMIT_REACHED;
+				/* Note: this does not copy the partial info, since
+				 * further processing might find alternate routes that
+				 * clear the currently set constraints. */
+				goto cleanup;
+			}
+
+			struct drc_cursor *cursor = get_cursor(&state, cursor_id);
+#if LOG_QUEUE
+			fprintf(stderr, "%s: queue popped %u -- state %u\n", __func__, cursor_id, cursor->state);
+#endif
+			assert(cursor->state < state_count);
+
+			if (cursor->state == start_state) {
+#if LOG_MERGE
+				fprintf(stderr, "%s: cursor %u reached start_state %u with bitmap ",
+				    __func__, cursor_id, start_state);
+				bm_print(stderr, state.dfa->opt, &cursor->bitmap, 0, fsm_escputc);
+				fprintf(stderr, "\n");
+#endif
+				if (first_path) {
+					bm_copy(&state.accum, &cursor->bitmap);
+					first_path = false;
+				} else {
+					bm_intersect(&state.accum, &cursor->bitmap);
+				}
+
+#if LOG_MERGE
+				fprintf(stderr, "%s: merged accumulator is now ", __func__);
+				bm_print(stderr, state.dfa->opt, &state.accum, 0, fsm_escputc);
+				fprintf(stderr, "\n");
+#endif
+
+				if (!bm_any(&state.accum)) {
+					/* unconstrained path found -- further work cannot
+					 * add any new information, so we're done */
+#if LOG_PROGRESS
+					fprintf(stderr, "%s: unconstrained path found, we're done\n", __func__);
+#endif
+					res = FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN;
+					goto cleanup;
+				}
+
+				release_cursor(&state, cursor_id);
+				continue;
+			}
+
+			/* start of reverse edges */
+			size_t offset = rmap_seek(state.rmap, state.edge_count, cursor->state);
+
+			while (offset < state.edge_count) {
+				struct edge_alist *elt = &state.rmap[offset];
+
+#if LOG_PROGRESS > 1
+				fprintf(stderr, "%s: offset %zu, elt->from %u, elt->to %u, cursor->state %u, cursor->bitmap ",
+				    __func__, offset, elt->from, elt->to, cursor->state);
+				bm_print(stderr, state.dfa->opt, &cursor->bitmap, 0, fsm_escputc);
+				fprintf(stderr, "\n");
+#endif
+
+				if (elt->to != cursor->state) {
+					break;
+				}
+				assert(elt->to != elt->from); /* self-edges were filtered before */
+
+				if (u64bitset_get(cursor->visited, elt->from)) {
+#if LOG_PROGRESS > 2
+					fprintf(stderr, "%s: skipping %u, visited\n", __func__, elt->from);
+#endif
+					offset++;
+					continue;
+				}
+
+				fsm_state_t other_cursor_id;
+				if (!request_cursor(&state, &other_cursor_id)) {
+					res = FSM_DETECT_REQUIRED_CHARACTERS_ERROR_ALLOC;
+					goto cleanup;
+				}
+
+				struct drc_cursor *ocursor = get_cursor(&state, other_cursor_id);
+				ocursor->state = elt->from;
+				bm_copy(&ocursor->bitmap, &cursor->bitmap);
+				memcpy(ocursor->visited, cursor->visited,
+				    state.visited_words * sizeof(cursor->visited[0]));
+
+#if USE_UNIQUE_ID
+				ocursor->unique_id = state.unique_id_counter++;
+#endif
+
+#if LOG_PROGRESS > 1
+				fprintf(stderr, "%s: marking %u visited on cursor %u\n", __func__, elt->from, other_cursor_id);
+#endif
+				u64bitset_set(ocursor->visited, elt->from);
+
+				if (elt->unique) {
+#if LOG_PROGRESS
+					fprintf(stderr, "%s: marking 0x%02x (%c) as required on cursor %u\n",
+					    __func__, elt->unique_char,
+					    isprint(elt->unique_char) ? elt->unique_char : '.',
+					    other_cursor_id);
+#endif
+					bm_set(&ocursor->bitmap, (size_t)elt->unique_char);
+				}
+
+#if LOG_QUEUE
+				fprintf(stderr, "%s: queue pushing %u, state %u (backlink %u -> %u)\n",
+				    __func__, other_cursor_id, ocursor->state, elt->from, elt->to);
+#endif
+				/* fprintf(stdout, "-- %u <- %u, %zu to %zu\n", elt->to, elt->from, cursor->unique_id, ocursor->unique_id); */
+
+				if (!queue_push(q, other_cursor_id)) {
+					res = FSM_DETECT_REQUIRED_CHARACTERS_ERROR_ALLOC;
+					goto cleanup;
+				}
+
+				offset++;
+			}
+
+			release_cursor(&state, cursor_id);
+		}
+	}
+
+	/* The final result is the intersection of every bitmap
+	 * reaching the start state. */
+	bm_copy(bitmap, &state.accum);
+
+#if LOG_PROGRESS
+	fprintf(stderr, "%s: final result: ", __func__);
+	bm_print(stderr, state.dfa->opt, &state.accum, 0, fsm_escputc);
+	fprintf(stderr, "\n");
+#endif
+
+	res = FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN;
+
+cleanup:
+	f_free(state.alloc, state.rmap);
+	for (size_t i = 0; i < state.cursor.ceil; i++) {
+		f_free(state.alloc, state.cursor.cursors[i]);
+	}
+	f_free(state.alloc, state.cursor.cursors);
+	if (q != NULL) { queue_free(q); }
+
+	return res;
+}
diff --git a/src/libfsm/libfsm.syms b/src/libfsm/libfsm.syms
index a2570b8c9..1004bea42 100644
--- a/src/libfsm/libfsm.syms
+++ b/src/libfsm/libfsm.syms
@@ -15,6 +15,7 @@ fsm_reachableall
 fsm_reachableany
 fsm_walk_edges
 fsm_walk_states
+fsm_detect_required_characters
 
 # fsm_epsilonsonly
 
diff --git a/tests/detect_required/Makefile b/tests/detect_required/Makefile
new file mode 100644
index 000000000..34214f07e
--- /dev/null
+++ b/tests/detect_required/Makefile
@@ -0,0 +1,26 @@
+.include "../../share/mk/top.mk"
+
+TEST.tests/detect_required != ls -1 tests/detect_required/detect_required*.c
+TEST_SRCDIR.tests/detect_required = tests/detect_required
+TEST_OUTDIR.tests/detect_required = ${BUILD}/tests/detect_required
+
+.for n in ${TEST.tests/detect_required:T:R:C/^detect_required//}
+test:: ${TEST_OUTDIR.tests/detect_required}/res${n}
+SRC += ${TEST_SRCDIR.tests/detect_required}/detect_required${n}.c
+CFLAGS.${TEST_SRCDIR.tests/detect_required}/detect_required${n}.c = -UNDEBUG
+
+${TEST_OUTDIR.tests/detect_required}/run${n}: ${TEST_OUTDIR.tests/detect_required}/detect_required${n}.o ${TEST_OUTDIR.tests/detect_required}/testutil.o
+	${CC} ${CFLAGS} -o ${TEST_OUTDIR.tests/detect_required}/run${n} ${TEST_OUTDIR.tests/detect_required}/detect_required${n}.o ${TEST_OUTDIR.tests/detect_required}/testutil.o ${BUILD}/lib/libfsm.a ${BUILD}/lib/libre.a
+
+${TEST_OUTDIR.tests/detect_required}/detect_required${n}.o: tests/detect_required/testutil.h
+
+${TEST_OUTDIR.tests/detect_required}/res${n}: ${TEST_OUTDIR.tests/detect_required}/run${n}
+	( ${TEST_OUTDIR.tests/detect_required}/run${n} 1>&2 && echo PASS || echo FAIL ) > ${TEST_OUTDIR.tests/detect_required}/res${n}
+
+.for lib in ${LIB:Mlibfsm} ${LIB:Mlibre}
+${TEST_OUTDIR.tests/detect_required}/run${n}: ${BUILD}/lib/${lib:R}.a
+.endfor
+.endfor
+
+${TEST_OUTDIR.tests/detect_required}/testutil.o: tests/detect_required/testutil.c
+	${CC} ${CFLAGS} -c -o ${TEST_OUTDIR.tests/detect_required}/testutil.o tests/detect_required/testutil.c
diff --git a/tests/detect_required/detect_required1.c b/tests/detect_required/detect_required1.c
new file mode 100644
index 000000000..52bb83477
--- /dev/null
+++ b/tests/detect_required/detect_required1.c
@@ -0,0 +1,32 @@
+#include "testutil.h"
+
+const struct testcase tests[] = {
+	{ .regex = "^$", .required = "" },
+	{ .regex = "^a$", .required = "a" },
+	{ .regex = "^abcde$", .required = "abcde" },
+	{ .regex = "^(ab|cd)$", .required = "" },
+	{ .regex = "^(ab|cd|ef)$", .required = "" },
+	{ .regex = "^(abc|def)$", .required = "" },
+	{ .regex = "^(abc|dbf)$", .required = "b" },
+	{ .regex = "^abc(def)*ghi$", .required = "abcghi" },
+	{ .regex = "^abc(def)+ghi$", .required = "abcdefghi" },
+	{ .regex = "^ghi(def)abc$", .required = "abcdefghi" },
+};
+
+int main(void)
+{
+	const bool first_fail = getenv("FIRST_FAIL") != NULL;
+	const size_t testcount = sizeof(tests)/sizeof(tests[0]);
+
+	size_t failures = 0;
+	for (size_t i = 0; i < testcount; i++) {
+		if (!run_test(&tests[i])) {
+			failures++;
+			if (first_fail) { break; }
+		}
+	}
+
+	return failures == 0
+	    ? EXIT_SUCCESS
+	    : EXIT_FAILURE;
+}
diff --git a/tests/detect_required/detect_required_step_limit.c b/tests/detect_required/detect_required_step_limit.c
new file mode 100644
index 000000000..6e5808b1e
--- /dev/null
+++ b/tests/detect_required/detect_required_step_limit.c
@@ -0,0 +1,63 @@
+#include "testutil.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+static const struct fsm_options opt;
+
+int main(void)
+{
+	enum re_flags flags = 0;
+	struct re_err err;
+	const char *regex = "^abcde$";
+
+	struct fsm *fsm = re_comp(RE_PCRE, fsm_sgetc, &regex, &opt, flags, &err);
+	assert(fsm != NULL);
+
+	if (!fsm_determinise(fsm)) {
+		assert(!"determinise");
+		return EXIT_FAILURE;
+	}
+	if (!fsm_minimise(fsm)) {
+		assert(!"minimise");
+		return EXIT_FAILURE;
+	}
+
+	struct bm bitmap;
+
+	/* keep decreasing the step limit until it's hit, and check that
+	 * the bitmap is cleared. */
+	bool hit_step_limit = false;
+	size_t step_limit = 25;
+	while (!hit_step_limit) {
+		assert(step_limit > 0);
+
+		const enum fsm_detect_required_characters_res res = fsm_detect_required_characters(fsm, step_limit, &bitmap);
+		/* Any other result (such as an allocation error) would
+		 * otherwise just be retried with a smaller limit. */
+		assert(res == FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN
+		    || res == FSM_DETECT_REQUIRED_CHARACTERS_STEP_LIMIT_REACHED);
+		if (res == FSM_DETECT_REQUIRED_CHARACTERS_STEP_LIMIT_REACHED) {
+			hit_step_limit = true;
+
+			/* this should not contain any partially complete information */
+			for (size_t i = 0; i < 4; i++) {
+				const uint64_t *w = bm_nth_word(&bitmap, i);
+				if (*w != 0) {
+					fprintf(stderr, "-- Test failure: partial information set when step limit reached\n");
+					fsm_free(fsm);
+					return EXIT_FAILURE;
+				}
+			}
+		}
+
+		step_limit--;
+	}
+
+	fsm_free(fsm);
+	return EXIT_SUCCESS;
+}
diff --git a/tests/detect_required/testutil.c b/tests/detect_required/testutil.c
new file mode 100644
index 000000000..c08160355
--- /dev/null
+++ b/tests/detect_required/testutil.c
@@ -0,0 +1,106 @@
+#include "testutil.h"
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+static const struct fsm_options opt = {
+	.group_edges = 1,
+};
+
+bool
+run_test(const struct testcase *tc)
+{
+	bool test_res = false;
+
+	enum re_flags flags = 0;
+	struct re_err err;
+	const char *regex = tc->regex;
+	const char *required = tc->required ? tc->required : "";
+	const size_t step_limit = tc->step_limit ? tc->step_limit : DEF_STEP_LIMIT;
+
+	fprintf(stderr, "-- test: regex '%s', required '%s'\n", tc->regex, required);
+
+	struct fsm *fsm = re_comp(RE_PCRE, fsm_sgetc, &regex, &opt, flags, &err);
+	if (fsm == NULL) {
+		return false;
+	}
+	/* assert(fsm != NULL); */
+
+	if (!fsm_determinise(fsm)) {
+		assert(!"determinise");
+		goto cleanup;
+	}
+	if (!fsm_minimise(fsm)) {
+		assert(!"minimise");
+		goto cleanup;
+	}
+
+	if (getenv("PRINT_DOT")) {
+		fsm_print_dot(stderr, fsm);
+	}
+	if (getenv("PRINT_FSM")) {
+		fsm_print_fsm(stderr, fsm);
+	}
+
+	struct bm bitmap;
+	bm_clear(&bitmap);
+
+	{
+		const size_t statecount = fsm_countstates(fsm);
+		size_t ends = 0;
+		for (size_t i = 0; i < statecount; i++) {
+			if (fsm_isend(fsm, i)) {
+				ends++;
+			}
+		}
+		fprintf(stderr, "-- statecount %zu, %zu ends\n", statecount, ends);
+	}
+
+
+	const enum fsm_detect_required_characters_res res = fsm_detect_required_characters(fsm, step_limit, &bitmap);
+	if (res == FSM_DETECT_REQUIRED_CHARACTERS_STEP_LIMIT_REACHED) {
+		fprintf(stderr, "-- step limit reached, halting\n");
+		goto cleanup;
+	}
+	assert(res == FSM_DETECT_REQUIRED_CHARACTERS_WRITTEN);
+
+	char buf[257] = {0};
+	size_t used = 0;
+	assert(!bm_get(&bitmap, 0)); /* does not contain 0x00 */
+
+	int i = 0;
+	for (;;) {
+		const size_t next = bm_next(&bitmap, i, 1);
+		if (next > UCHAR_MAX) { break; }
+		buf[used++] = (char)next;
+		i = (int)next;
+	}
+
+	if (0 != strcmp(required, buf)) {
+		fprintf(stderr, "Error: mismatch\n");
+		fprintf(stderr, "-- expected: [%s]\n", required);
+		fprintf(stderr, "-- got: [%s]\n", buf);
+		goto cleanup;
+	}
+
+	/* TODO: use fsm_generate_matches to check. it just yields one
+	 * character from an edge, so it won't indicate whether that
+	 * specific character is required, but if it generates without
+	 * something the test says is required that probably means the
+	 * test is wrong. */
+
+	test_res = true;
+
+cleanup:
+	fsm_free(fsm);
+
+	return test_res;
+}
diff --git a/tests/detect_required/testutil.h b/tests/detect_required/testutil.h
new file mode 100644
index 000000000..f9378c190
--- /dev/null
+++ b/tests/detect_required/testutil.h
@@ -0,0 +1,21 @@
+#ifndef TESTUTIL_H
+#define TESTUTIL_H
+
+#include
+#include
+#include
+#include
+
+#define DEF_STEP_LIMIT 100000
+
+struct testcase {
+	const char *regex;
+	const char *required;
+	size_t max_gen_buffer; /* 0: default */
+	size_t step_limit;
+};
+
+bool
+run_test(const struct testcase *tc);
+
+#endif