diff --git a/Makefile b/Makefile index 514f80bba..b356c4f0f 100644 --- a/Makefile +++ b/Makefile @@ -118,6 +118,7 @@ SUBDIR += tests/equals SUBDIR += tests/subtract SUBDIR += tests/detect_required SUBDIR += tests/determinise +SUBDIR += tests/eager_output SUBDIR += tests/endids SUBDIR += tests/epsilons SUBDIR += tests/fsm diff --git a/fuzz/target.c b/fuzz/target.c index 543891bb9..d56a9bf82 100644 --- a/fuzz/target.c +++ b/fuzz/target.c @@ -26,10 +26,21 @@ /* 10 seconds */ #define TIMEOUT_USEC (10ULL * 1000 * 1000) +static bool verbosity_checked = false; +static bool verbose = false; + +#define LOG(...) \ + do { \ + if (verbose) { \ + fprintf(stderr, __VA_ARGS__); \ + } \ + } while (0) \ + enum run_mode { MODE_DEFAULT, MODE_SHUFFLE_MINIMISE, MODE_ALL_PRINT_FUNCTIONS, + MODE_EAGER_OUTPUT, }; @@ -344,6 +355,508 @@ fuzz_all_print_functions(FILE *f, const char *pattern, bool det, bool min, const return EXIT_SUCCESS; } +#define MAX_PATTERNS 4 +struct eager_output_cb_info { + size_t used; + fsm_output_id_t ids[MAX_PATTERNS]; +}; + +static void +reset_eager_output_info(struct eager_output_cb_info *info) +{ + info->used = 0; +} + +struct feo_env { + bool ok; + size_t pattern_count; + size_t fsm_count; + size_t max_match_count; + size_t max_steps; + + char *patterns[MAX_PATTERNS]; + struct fsm *fsms[MAX_PATTERNS]; + struct fsm *combined; + + /* which pattern is being used for generation, (size_t)-1 for combined */ + size_t current_pattern; + + struct eager_output_cb_info outputs; + struct eager_output_cb_info outputs_combined; +}; + +void +append_eager_output_cb(fsm_output_id_t id, void *opaque) +{ + struct eager_output_cb_info *info = (struct eager_output_cb_info *)opaque; + + for (size_t i = 0; i < info->used; i++) { + if (info->ids[i] == id) { + return; /* already present */ + } + } + + assert(info->used < MAX_PATTERNS); + info->ids[info->used++] = id; +} + +static enum fsm_generate_matches_cb_res +gen_combined_check_individual_cb(const struct fsm *fsm, + size_t depth, 
size_t match_count, size_t steps, + const char *input, size_t input_length, + fsm_state_t end_state, void *opaque); + +static enum fsm_generate_matches_cb_res +gen_individual_check_combined_cb(const struct fsm *fsm, + size_t depth, size_t match_count, size_t steps, + const char *input, size_t input_length, + fsm_state_t end_state, void *opaque); + +#define DEF_MAX_STEPS 100000 +#define DEF_MAX_MATCH_COUNT 1000 + +/* This isn't part of the public interface, per se. */ +void +fsm_eager_output_dump(FILE *f, const struct fsm *fsm); + +static int +fuzz_eager_output(const uint8_t *data, size_t size) +{ + struct feo_env env = { + .ok = true, + .pattern_count = 0, + .max_steps = DEF_MAX_STEPS, + .max_match_count = DEF_MAX_MATCH_COUNT, + }; + + { + const char *steps = getenv("STEPS"); + const char *matches = getenv("MATCHES"); + if (steps != NULL) { + env.max_steps = strtoul(steps, NULL, 10); + assert(env.max_steps > 0); + } + if (matches != NULL) { + env.max_match_count = strtoul(matches, NULL, 10); + assert(env.max_match_count > 0); + } + } + + int ret = 0; + + size_t max_pattern_length = 0; + + /* chop data into a series of patterns */ + { + size_t prev = 0; + size_t offset = 0; + + /* Patterns with lots of '.' can take a while to determinise. + * That slows down fuzzer coverage, but isn't interesting here. 
*/ + size_t dots = 0; + + while (offset < size && env.pattern_count < MAX_PATTERNS) { +#define MAX_DOTS 4 + if (data[offset] == '.') { dots++; } + + if (data[offset] == '\0' || data[offset] == '\n' || offset == size - 1) { + size_t len = offset - prev; + + if (dots > MAX_DOTS) { + /* ignored */ + prev = offset; + } else if (len > 0) { + char *pattern = malloc(len + 1); + assert(pattern != NULL); + + memcpy(pattern, &data[prev], len); + if (len > 0 && pattern[len] == '\n') { + len--; /* drop trailing newline */ + } + pattern[len] = '\0'; + bool keep = true; + + if (len > 0) { + for (size_t i = 0; i < len - 1; i++) { + if (pattern[i] == '\\' && pattern[i + 1] == 'x') { + /* ignore unhandled parser errors from "\x", see #386 */ + keep = false; + } + } + } + + if (keep) { + env.patterns[env.pattern_count++] = pattern; + + if (len > max_pattern_length) { + max_pattern_length = len; + } + } else { + free(pattern); + } + prev = offset; + dots = 0; + } + } + + offset++; + } + } + + struct re_anchoring_info anchorage[MAX_PATTERNS] = {0}; + + /* for each pattern, attempt to compile to a DFA */ + for (size_t p_i = 0; p_i < env.pattern_count; p_i++) { + const char *p = env.patterns[p_i]; + + if (!re_is_anchored(RE_PCRE, fsm_sgetc, &p, 0, NULL, &anchorage[p_i])) { + continue; /* unsupported regex */ + } + + p = env.patterns[p_i]; + struct fsm *fsm = re_comp(RE_PCRE, fsm_sgetc, &p, NULL, 0, NULL); + + LOG("%s: pattern %zd: '%s' => %p\n", __func__, p_i, env.patterns[p_i], (void *)fsm); + + if (fsm == NULL) { + continue; /* invalid regex */ + } + + const fsm_output_id_t endid = (fsm_output_id_t)p_i; + ret = fsm_seteageroutputonends(fsm, endid); + assert(ret == 1); + + if (verbose) { + fprintf(stderr, "==== pattern %zd, pre det\n", p_i); + fsm_dump(stderr, fsm); + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "====\n"); + + fsm_state_t c = fsm_countstates(fsm); + for (fsm_state_t i = 0; i < c; i++) { + fprintf(stderr, "-- %d: end? 
%d\n", i, fsm_isend(fsm, i)); + } + } + + ret = fsm_determinise(fsm); + assert(ret == 1); + + ret = fsm_minimise(fsm); + assert(ret == 1); + + fsm_state_t start; + if (!fsm_getstart(fsm, &start)) { + fsm_free(fsm); + continue; + } + + if (verbose) { + fprintf(stderr, "==== pattern %zd, post det\n", p_i); + fsm_dump(stderr, fsm); + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "====\n"); + + fsm_state_t c = fsm_countstates(fsm); + for (fsm_state_t i = 0; i < c; i++) { + fprintf(stderr, "-- %d: end? %d\n", i, fsm_isend(fsm, i)); + } + } + + fsm_eager_output_set_cb(fsm, append_eager_output_cb, &env.outputs); + env.fsms[env.fsm_count++] = fsm; + } + + /* don't bother checking combined behavior unless there's multiple DFAs */ + if (env.fsm_count < 2) { goto cleanup; } + + /* copy and combine fsms into one DFA */ + { + size_t used = 0; + struct fsm_union_entry entries[MAX_PATTERNS] = {0}; + + for (size_t i = 0; i < env.fsm_count; i++) { + /* there can be gaps, fsms[] lines up with patterns[] */ + if (env.fsms[i] == NULL) { continue; } + + fsm_state_t start; + if (!fsm_getstart(env.fsms[i], &start)) { + assert(!"hit"); + } + + struct fsm *cp = fsm_clone(env.fsms[i]); + assert(cp != NULL); + + if (verbose) { + fprintf(stderr, "==== cp %zd\n", i); + fsm_dump(stderr, cp); + fsm_eager_output_dump(stderr, cp); + fprintf(stderr, "====\n"); + + fsm_state_t c = fsm_countstates(cp); + for (fsm_state_t i = 0; i < c; i++) { + fprintf(stderr, "-- %d: end? 
%d\n", i, fsm_isend(cp, i)); + } + } + + entries[used].fsm = cp; + entries[used].anchored_start = anchorage[i].start; + entries[used].anchored_end = anchorage[i].end; + used++; + } + + if (used == 0) { + goto cleanup; /* nothing to do */ + } + + /* consumes entries[] */ + struct fsm *fsm = fsm_union_repeated_pattern_group(used, entries, NULL); + assert(fsm != NULL); + + if (verbose) { + fprintf(stderr, "==== combined (pre-det)\n"); + fsm_dump(stderr, fsm); + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } + + if (!fsm_determinise(fsm)) { + assert(!"failed to determinise"); + } + + if (!fsm_minimise(fsm)) { + assert(!"failed to minimise"); + } + + LOG("%s: combined state_count %d\n", __func__, fsm_countstates(fsm)); + env.combined = fsm; + /* fsm_eager_output_set_cb(fsm, append_eager_output_cb, &env.outputs_combined); */ + + if (verbose) { + fprintf(stderr, "==== combined\n"); + fsm_dump(stderr, env.combined); + fsm_eager_output_dump(stderr, env.combined); + fprintf(stderr, "====\n"); + } + + } + + /* Use fsm_generate_matches to check for matches that got lost + * and false positives introduced while combining the DFAs. + * Use the combined DFA to generate matches, check that the + * match behavior agrees with the individual DFA copies. */ + env.current_pattern = (size_t)-1; + if (!fsm_generate_matches(env.combined, max_pattern_length, gen_combined_check_individual_cb, &env)) { + goto cleanup; + } + + if (!env.ok) { goto cleanup; } + + /* Likewise, use every individual DFA to generate matches and */ + /* check behavior against the combined DFA. */ + for (size_t i = 0; i < env.pattern_count; i++) { + env.current_pattern = i; + if (!fsm_generate_matches(env.combined, max_pattern_length, gen_individual_check_combined_cb, &env)) { + goto cleanup; + } + } + + ret = env.ok ? 
EXIT_SUCCESS : EXIT_FAILURE; +cleanup: + for (size_t i = 0; i < MAX_PATTERNS; i++) { + if (env.patterns[i] != NULL) { + free(env.patterns[i]); + env.patterns[i] = NULL; + } + if (env.fsms[i] != NULL) { + fsm_free(env.fsms[i]); + } + } + if (env.combined != NULL) { + fsm_free(env.combined); + } + + return ret; +} + +static int +cmp_output_id(const void *pa, const void *pb) +{ + const fsm_output_id_t a = *(fsm_output_id_t *)pa; + const fsm_output_id_t b = *(fsm_output_id_t *)pb; + return a < b ? -1 : a > b ? 1 : 0; +} + +static bool +match_input_get_eager_outputs(struct fsm *fsm, const char *input, size_t input_length, + struct eager_output_cb_info *dst) +{ + (void)input_length; + fsm_state_t end; + + reset_eager_output_info(dst); + + fsm_eager_output_set_cb(fsm, append_eager_output_cb, dst); + const int ret = fsm_exec(fsm, fsm_sgetc, &input, &end, NULL); + if (ret == 0) { + return false; /* no match */ + } else { + assert(ret == 1); /* match */ + } + + /* sort the IDs, to make comparison cheaper */ + qsort(dst->ids, dst->used, sizeof(dst->ids[0]), cmp_output_id); + return true; /* match */ +} + +/* For a given matching input generated by the combined DFA, check that + * only the expected individual source DFAs match. 
*/ +static enum fsm_generate_matches_cb_res +gen_combined_check_individual_cb(const struct fsm *fsm, + size_t depth, size_t match_count, size_t steps, + const char *input, size_t input_length, + fsm_state_t end_state, void *opaque) +{ + (void)fsm; + (void)depth; + (void)end_state; + + struct feo_env *env = opaque; + assert(env->current_pattern == (size_t)-1); + + if (match_count > env->max_match_count) { return FSM_GENERATE_MATCHES_CB_RES_HALT; } + if (steps > env->max_steps) { return FSM_GENERATE_MATCHES_CB_RES_HALT; } + + /* execute, to set eager outputs */ + if (!match_input_get_eager_outputs(env->combined, input, input_length, &env->outputs_combined)) { + env->ok = false; + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + + size_t individual_outputs_used = 0; + fsm_output_id_t individual_outputs[MAX_PATTERNS]; + + for (size_t i = 0; i < env->pattern_count; i++) { + struct fsm *fsm = env->fsms[i]; + if (fsm == NULL) { continue; } + + if (!match_input_get_eager_outputs(fsm, input, input_length, &env->outputs)) { + env->ok = false; + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + + if (env->outputs.used > 0) { + assert(env->outputs.used == 1); + individual_outputs[individual_outputs_used++] = env->outputs.ids[0]; + } + } + + bool match = true; + if (env->outputs_combined.used != individual_outputs_used) { + match = false; + } + + for (size_t cmb_i = 0; cmb_i < env->outputs_combined.used; cmb_i++) { + const fsm_output_id_t cur = env->outputs_combined.ids[cmb_i]; + assert(env->fsms[cmb_i] != NULL); + bool found = false; + for (size_t i = 0; i < individual_outputs_used; i++) { + if (individual_outputs[i] == cur) { + found = true; + break; + } + } + if (!found) { + match = false; + break; + } + } + + if (!match) { + fprintf(stderr, "%s: combined <-> individual mismatch for input '%s'(%zd)!\n", __func__, input, input_length); + + fprintf(stderr, "-- combined: %zu IDs:", env->outputs_combined.used); + for (size_t cmb_i = 0; cmb_i < env->outputs_combined.used; cmb_i++) { + 
fprintf(stderr, " %d", env->outputs_combined.ids[cmb_i]); + } + fprintf(stderr, "\n"); + fprintf(stderr, "-- individiual: %zu IDs:", individual_outputs_used); + for (size_t i = 0; i < individual_outputs_used; i++) { + fprintf(stderr, " %d", individual_outputs[i]); + } + fprintf(stderr, "\n"); + goto fail; + } + + return FSM_GENERATE_MATCHES_CB_RES_CONTINUE; + +fail: + env->ok = false; + return FSM_GENERATE_MATCHES_CB_RES_HALT; +} + +/* For a given matching input generated by one of the source DFAs, check that + * the combined DFA also matches, and that the only other source DFAs that match + * are ones that should according to the combined DFA. */ +static enum fsm_generate_matches_cb_res +gen_individual_check_combined_cb(const struct fsm *fsm, + size_t depth, size_t match_count, size_t steps, + const char *input, size_t input_length, + fsm_state_t end_state, void *opaque) +{ + (void)fsm; + (void)depth; + (void)end_state; + + struct feo_env *env = opaque; + assert(env->current_pattern < env->pattern_count); + if (match_count > env->max_match_count) { return FSM_GENERATE_MATCHES_CB_RES_HALT; } + if (steps > env->max_steps) { return FSM_GENERATE_MATCHES_CB_RES_HALT; } + + struct fsm *cur_fsm = env->fsms[env->current_pattern]; + if (cur_fsm == NULL) { return FSM_GENERATE_MATCHES_CB_RES_CONTINUE; } + + /* execute, to set eager outputs */ + if (!match_input_get_eager_outputs(cur_fsm, input, input_length, &env->outputs)) { + goto fail; + } + if (!match_input_get_eager_outputs(env->combined, input, input_length, &env->outputs_combined)) { + goto fail; + } + + assert(env->outputs.used == 1); + + bool found = false; + for (size_t i = 0; i < env->outputs_combined.used; i++) { + if (env->outputs_combined.ids[i] == env->outputs.ids[0]) { + found = true; + break; + } + } + + if (!found) { + fprintf(stderr, "%s: combined <-> individual mismatch for input '%s'(%zd)!\n", __func__, input, input_length); + + fprintf(stderr, "-- combined: %zu IDs:", env->outputs_combined.used); + for 
(size_t cmb_i = 0; cmb_i < env->outputs_combined.used; cmb_i++) { + fprintf(stderr, " %d", env->outputs_combined.ids[cmb_i]); + } + fprintf(stderr, "\n"); + fprintf(stderr, "-- pattern %zd: %zu IDs:", env->current_pattern, env->outputs.used); + for (size_t i = 0; i < env->outputs.used; i++) { + fprintf(stderr, " %d", env->outputs.ids[i]); + } + fprintf(stderr, "\n"); + goto fail; + } + + return FSM_GENERATE_MATCHES_CB_RES_CONTINUE; + +fail: + env->ok = false; + return FSM_GENERATE_MATCHES_CB_RES_HALT; +} +#undef MAX_PATTERNS + #define MAX_FUZZER_DATA (64 * 1024) static uint8_t data_buf[MAX_FUZZER_DATA + 1]; @@ -358,6 +871,7 @@ get_run_mode(void) switch (mode[0]) { case 'm': return MODE_SHUFFLE_MINIMISE; case 'p': return MODE_ALL_PRINT_FUNCTIONS; + case 'E': return MODE_EAGER_OUTPUT; case 'd': default: return MODE_DEFAULT; @@ -373,6 +887,11 @@ harness_fuzzer_target(const uint8_t *data, size_t size) return EXIT_SUCCESS; } + if (!verbosity_checked) { + verbosity_checked = true; + verbose = getenv("VERBOSE") != NULL; + } + /* Ensure that input is '\0'-terminated. 
/* One input to fsm_union_repeated_pattern_group: an fsm plus flags
 * saying whether its source pattern is anchored at the start and/or
 * the end.
 * NOTE(review): the fuzzer fills the flags from re_is_anchored's
 * struct re_anchoring_info -- confirm that is the intended source. */
struct fsm_union_entry {
	struct fsm *fsm;
	bool anchored_start;
	bool anchored_end;
};

/* Combine entry_count entries into a single fsm.
 * NOTE(review): the fuzzer passes NULL for bases, treats a NULL result
 * as failure, and describes the call as consuming entries[] -- confirm
 * ownership and error semantics against the implementation. */
struct fsm *
fsm_union_repeated_pattern_group(size_t entry_count,
	struct fsm_union_entry *entries, struct fsm_combined_base_pair *bases);

/* Eager output ID. */
typedef unsigned int fsm_output_id_t;
/* Associate an eagerly matched numeric ID with the end states in an fsm.
 *
 * This is similar to fsm_setendid, but has different performance
 * trade-offs. In particular, it can become extremely expensive to
 * combine multiple DFAs with endids on their end states when they
 * represent regexes with unanchored ends, because the FSM has to
 * explicitly represent all the possible combinations of matches by
 * copying the entire path to every reachable end state. Eager endids
 * are associated with the edge leaving the main pattern match.
 *
 * Returns 1 on success, 0 on error.
 * */
int
fsm_seteagerendid(struct fsm *fsm, fsm_end_id_t id);

/* Set an eager output ID to emit every time the state is entered.
 * This turns the automaton into a Moore machine.
 *
 * Setting an ID that the state already has is a no-op.
 * Returns 1 on success, 0 on allocation failure. */
int
fsm_seteageroutput(struct fsm *fsm, fsm_state_t state, fsm_output_id_t id);

/* Set an eager output ID on all current end states.
 * Returns 1 on success, 0 as soon as setting it on any state fails. */
int
fsm_seteageroutputonends(struct fsm *fsm, fsm_output_id_t id);

/* HACK */
/* Callback receiving an eager output ID and the opaque pointer that was
 * registered with it.
 * NOTE(review): presumably invoked during execution whenever a state
 * with eager outputs is entered -- confirm against fsm_exec. */
typedef void
fsm_eager_output_cb(fsm_output_id_t id, void *opaque);

/* Register the eager output callback and its opaque pointer on fsm. */
void
fsm_eager_output_set_cb(struct fsm *fsm, fsm_eager_output_cb *cb, void *opaque);

/* Read back the registered callback and opaque pointer into *cb and
 * *opaque. */
void
fsm_eager_output_get_cb(const struct fsm *fsm, fsm_eager_output_cb **cb, void **opaque);

/* HACK */
/* Eager endid counterparts of the eager output callback interface.
 * NOTE(review): no implementation is visible here -- confirm these are
 * still defined and used. */
typedef void
fsm_eager_endid_cb(fsm_end_id_t id, void *opaque);
void
fsm_eager_endid_set_cb(struct fsm *fsm, fsm_eager_endid_cb *cb, void *opaque);

void
fsm_eager_endid_get_cb(const struct fsm *fsm, fsm_eager_endid_cb **cb, void **opaque);
/* Per-state metadata: the state's end IDs and its eager output IDs,
 * each as a pointer plus element count.
 * NOTE(review): presumably handed to the print routines -- confirm
 * where it is populated and who owns the arrays. */
struct fsm_state_metadata {
	const fsm_end_id_t *end_ids;
	size_t end_id_count;

	const fsm_output_id_t *eager_output_ids;
	size_t eager_output_count;
};

/* Anchoring flags written by re_is_anchored: start is for the start of
 * the regex, end for its end.
 * NOTE(review): presumably nonzero means anchored -- confirm. */
struct re_anchoring_info {
	int start;
	int end;
	/* FIXME: this could also check for AST_FLAG_NULLABLE, AST_FLAG_UNSATISFIABLE,
	 * AST_FLAG_ALWAYS_CONSUMES, AST_FLAG_CAN_CONSUME */
};

/* Parse and analyze the regex enough to determine whether it is
 * anchored at the start and/or end. Returns 0 if the regex is not
 * supported, otherwise returns 1 and writes anchoring flags into *info. */
int
re_is_anchored(enum re_dialect dialect, re_getchar_fun *f, void *opaque,
	enum re_flags flags, struct re_err *err,
	struct re_anchoring_info *info);
diff --git a/src/libfsm/Makefile b/src/libfsm/Makefile index 5e2ed57e3..c7782f0ff 100644 --- a/src/libfsm/Makefile +++ b/src/libfsm/Makefile @@ -8,6 +8,7 @@ SRC += src/libfsm/consolidate.c SRC += src/libfsm/clone.c SRC += src/libfsm/closure.c SRC += src/libfsm/detect_required.c +SRC += src/libfsm/eager_output.c SRC += src/libfsm/edge.c SRC += src/libfsm/empty.c SRC += src/libfsm/end.c diff --git a/src/libfsm/clone.c b/src/libfsm/clone.c index 9fd236a4d..2161599ae 100644 --- a/src/libfsm/clone.c +++ b/src/libfsm/clone.c @@ -19,6 +19,7 @@ #include "internal.h" #include "capture.h" #include "endids.h" +#include "eager_output.h" #define LOG_CLONE_ENDIDS 0 @@ -28,6 +29,9 @@ copy_capture_actions(struct fsm *dst, const struct fsm *src); static int copy_end_ids(struct fsm *dst, const struct fsm *src); +static int +copy_eager_output_ids(struct fsm *dst, const struct fsm *src); + struct fsm * fsm_clone(const struct fsm *fsm) { @@ -80,6 +84,12 @@ fsm_clone(const struct fsm *fsm) fsm_free(new); return NULL; } + + /* does not copy callback */ + if (!copy_eager_output_ids(new, fsm)) { + fsm_free(new); + return NULL; + } } return new; @@ -159,3 +169,31 @@ copy_end_ids(struct fsm *dst, const struct fsm *src) return env.ok; } + +struct copy_eager_output_ids_env { + bool ok; + struct fsm *dst; +}; + +static int +copy_eager_output_ids_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + struct copy_eager_output_ids_env *env = opaque; + if (!fsm_seteageroutput(env->dst, state, id)) { + env->ok = false; + return 0; + } + + return 1; +} + +static int +copy_eager_output_ids(struct fsm *dst, const struct fsm *src) +{ + struct copy_eager_output_ids_env env; + env.dst = dst; + env.ok = true; + + fsm_eager_output_iter_all(src, copy_eager_output_ids_cb, &env); + return env.ok; +} diff --git a/src/libfsm/consolidate.c b/src/libfsm/consolidate.c index 236a4f6f5..b7a8905b2 100644 --- a/src/libfsm/consolidate.c +++ b/src/libfsm/consolidate.c @@ -25,6 +25,7 @@ #include "internal.h" 
#include "capture.h" #include "endids.h" +#include "eager_output.h" #define LOG_MAPPING 0 #define LOG_CONSOLIDATE_CAPTURES 0 @@ -53,6 +54,10 @@ static int consolidate_end_ids(struct fsm *dst, const struct fsm *src, const fsm_state_t *mapping, size_t mapping_count); +static int +consolidate_eager_output_ids(struct fsm *dst, const struct fsm *src, + const fsm_state_t *mapping, size_t mapping_count); + static fsm_state_t mapping_cb(fsm_state_t id, const void *opaque) { @@ -154,6 +159,10 @@ fsm_consolidate(const struct fsm *src, } } + if (!consolidate_eager_output_ids(dst, src, mapping, mapping_count)) { + goto cleanup; + } + f_free(src->alloc, seen); return dst; @@ -270,3 +279,40 @@ consolidate_end_ids(struct fsm *dst, const struct fsm *src, return ret; } + +struct consolidate_eager_output_ids_env { + bool ok; + struct fsm *dst; + const fsm_state_t *mapping; + size_t mapping_count; +}; + +static int +consolidate_eager_output_ids_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + struct consolidate_eager_output_ids_env *env = opaque; + assert(state < env->mapping_count); + const fsm_state_t dst_state = env->mapping[state]; + + if (!fsm_seteageroutput(env->dst, dst_state, id)) { + env->ok = false; + return 0; + } + + return 1; +} + +static int +consolidate_eager_output_ids(struct fsm *dst, const struct fsm *src, + const fsm_state_t *mapping, size_t mapping_count) +{ + struct consolidate_eager_output_ids_env env = { + .ok = true, + .dst = dst, + .mapping = mapping, + .mapping_count = mapping_count, + }; + fsm_eager_output_iter_all(src, consolidate_eager_output_ids_cb, &env); + return env.ok; +} + diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c index 42992b6bc..8978ce06c 100644 --- a/src/libfsm/determinise.c +++ b/src/libfsm/determinise.c @@ -6,6 +6,9 @@ #include "determinise_internal.h" +#include +#include + static void dump_labels(FILE *f, const uint64_t labels[4]) { @@ -253,6 +256,10 @@ fsm_determinise(struct fsm *nfa) goto cleanup; } + if 
(!remap_eager_outputs(&map, issp, dfa, nfa)) { + goto cleanup; + } + fsm_move(nfa, dfa); } @@ -334,6 +341,22 @@ add_reverse_mapping(const struct fsm_alloc *alloc, return 1; } +static void +free_reverse_mappings(const struct fsm_alloc *alloc, size_t map_count, struct reverse_mapping *rmaps) +{ + if (rmaps == NULL) { return; } + + for (size_t map_i = 0; map_i < map_count; map_i++) { + struct reverse_mapping *rmap = &rmaps[map_i]; + for (size_t i = 0; i < rmap->count; i++) { + f_free(alloc, rmap[i].list); + rmap->count = 0; + rmap[i].list = NULL; + } + } + f_free(alloc, rmaps); +} + static int det_copy_capture_actions_cb(fsm_state_t state, enum capture_action_type type, unsigned capture_id, fsm_state_t to, @@ -405,7 +428,7 @@ hash_iss(interned_state_set_id iss) } static struct mapping * -map_first(struct map *map, struct map_iter *iter) +map_first(const struct map *map, struct map_iter *iter) { iter->m = map; iter->i = 0; @@ -641,22 +664,14 @@ stack_pop(struct mappingstack *stack) return item; } -static int -remap_capture_actions(struct map *map, struct interned_state_set_pool *issp, - struct fsm *dst_dfa, struct fsm *src_nfa) +static struct reverse_mapping * +build_reverse_mappings(const struct map *map, struct interned_state_set_pool *issp, + struct fsm *dst_dfa, const struct fsm *src_nfa) { + struct reverse_mapping *reverse_mappings = NULL; struct map_iter it; struct state_iter si; struct mapping *m; - struct reverse_mapping *reverse_mappings; - fsm_state_t state; - const size_t capture_count = fsm_countcaptures(src_nfa); - size_t i, j; - int res = 0; - - if (capture_count == 0) { - return 1; - } /* This is not 1 to 1 -- if state X is now represented by multiple * states Y in the DFA, and state X has action(s) when transitioning @@ -667,9 +682,7 @@ remap_capture_actions(struct map *map, struct interned_state_set_pool *issp, * checking reachability from every X, but the actual path * handling later will also check reachability. 
*/ reverse_mappings = f_calloc(dst_dfa->alloc, src_nfa->statecount, sizeof(reverse_mappings[0])); - if (reverse_mappings == NULL) { - return 0; - } + if (reverse_mappings == NULL) { goto cleanup; } /* build reverse mappings table: for every NFA state X, if X is part * of the new DFA state Y, then add Y to a list for X */ @@ -679,6 +692,7 @@ remap_capture_actions(struct map *map, struct interned_state_set_pool *issp, assert(m->dfastate < dst_dfa->statecount); ss = interned_state_set_get_state_set(issp, iss_id); + fsm_state_t state; for (state_set_reset(ss, &si); state_set_next(&si, &state); ) { if (!add_reverse_mapping(dst_dfa->alloc, reverse_mappings, @@ -688,33 +702,47 @@ remap_capture_actions(struct map *map, struct interned_state_set_pool *issp, } } -#if LOG_DETERMINISE_CAPTURES +#if LOG_BUILD_REVERSE_MAPPING fprintf(stderr, "#### reverse mapping for %zu states\n", src_nfa->statecount); - for (i = 0; i < src_nfa->statecount; i++) { + for (size_t i = 0; i < src_nfa->statecount; i++) { struct reverse_mapping *rm = &reverse_mappings[i]; fprintf(stderr, "%lu:", i); - for (j = 0; j < rm->count; j++) { + for (size_t j = 0; j < rm->count; j++) { fprintf(stderr, " %u", rm->list[j]); } fprintf(stderr, "\n"); } -#else - (void)j; #endif + return reverse_mappings; + +cleanup: + free_reverse_mappings(dst_dfa->alloc, src_nfa->statecount, reverse_mappings); + return NULL; +} + +static int +remap_capture_actions(struct map *map, struct interned_state_set_pool *issp, + struct fsm *dst_dfa, struct fsm *src_nfa) +{ + const size_t capture_count = fsm_countcaptures(src_nfa); + int res = 0; + + if (capture_count == 0) { + return 1; + } + + struct reverse_mapping *reverse_mappings = build_reverse_mappings(map, issp, dst_dfa, src_nfa); + if (reverse_mappings == NULL) { goto cleanup; } + if (!det_copy_capture_actions(reverse_mappings, dst_dfa, src_nfa)) { goto cleanup; } res = 1; -cleanup: - for (i = 0; i < src_nfa->statecount; i++) { - if (reverse_mappings[i].list != NULL) { - 
f_free(dst_dfa->alloc, reverse_mappings[i].list); - } - } - f_free(dst_dfa->alloc, reverse_mappings); +cleanup: + free_reverse_mappings(dst_dfa->alloc, src_nfa->statecount, reverse_mappings); return res; } @@ -2528,3 +2556,50 @@ analyze_closures__grow_outputs(struct analyze_closures_env *env) env->output_ceil = nceil; return 1; } + +struct remap_eager_output_env { + bool ok; + struct fsm *dst; + fsm_state_t dst_state; +}; + +static int +remap_eager_output_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + (void)state; + struct remap_eager_output_env *env = opaque; + if (!fsm_seteageroutput(env->dst, env->dst_state, id)) { + env->ok = false; + return 0; + } + + return 1; +} + +static int +remap_eager_outputs(const struct map *map, struct interned_state_set_pool *issp, + struct fsm *dst_dfa, const struct fsm *src_nfa) +{ + /* For each DFA state, get the set of NFA states corresponding to it from the + * map and issp, then copy every eager output ID over. */ + struct map_iter iter; + for (struct mapping *b = map_first(map, &iter); b != NULL; b = map_next(&iter)) { + struct state_set *ss = interned_state_set_get_state_set(issp, b->iss); + assert(ss != NULL); + + struct state_iter it; + fsm_state_t s; + state_set_reset(ss, &it); + while (state_set_next(&it, &s)) { + struct remap_eager_output_env env = { + .ok = true, + .dst = dst_dfa, + .dst_state = b->dfastate, + }; + fsm_eager_output_iter_state(src_nfa, s, remap_eager_output_cb, &env); + if (!env.ok) { return 0; } + } + } + + return 1; +} diff --git a/src/libfsm/determinise_internal.h b/src/libfsm/determinise_internal.h index cfd4ea663..2e925d28c 100644 --- a/src/libfsm/determinise_internal.h +++ b/src/libfsm/determinise_internal.h @@ -23,6 +23,7 @@ #include "internal.h" #include "capture.h" #include "endids.h" +#include "eager_output.h" #include @@ -35,6 +36,7 @@ #define LOG_AC 0 #define LOG_GROUPING 0 #define LOG_ANALYSIS_STATS 0 +#define LOG_BUILD_REVERSE_MAPPING 0 #if LOG_DETERMINISE_CAPTURES || 
LOG_INPUT #include @@ -72,7 +74,7 @@ struct map { }; struct map_iter { - struct map *m; + const struct map *m; size_t i; }; @@ -304,7 +306,7 @@ static void map_free(struct map *map); static struct mapping * -map_first(struct map *map, struct map_iter *iter); +map_first(const struct map *map, struct map_iter *iter); static struct mapping * map_next(struct map_iter *iter); @@ -325,6 +327,10 @@ static int remap_capture_actions(struct map *map, struct interned_state_set_pool *issp, struct fsm *dst_dfa, struct fsm *src_nfa); +static int +remap_eager_outputs(const struct map *map, struct interned_state_set_pool *issp, + struct fsm *dst_dfa, const struct fsm *src_nfa); + static struct mappingstack * stack_init(const struct fsm_alloc *alloc); diff --git a/src/libfsm/eager_output.c b/src/libfsm/eager_output.c new file mode 100644 index 000000000..e37a8a4bf --- /dev/null +++ b/src/libfsm/eager_output.c @@ -0,0 +1,403 @@ +/* + * Copyright 2024 Scott Vokes + * + * See LICENCE for the full copyright terms. 
+ */ + +#include +#include + +#include "internal.h" + +#include +#include + +#include +#include +#include + +#include "eager_output.h" + +#define LOG_LEVEL 0 + +/* must be a power of 2 */ +#define DEF_BUCKET_COUNT 4 +#define DEF_ENTRY_CEIL 2 + +struct eager_output_info { + fsm_eager_output_cb *cb; + void *opaque; + + struct eager_output_htab { + size_t bucket_count; + size_t buckets_used; + /* empty if entry is NULL, otherwise keyed by state */ + struct eager_output_bucket { + fsm_state_t state; + struct eager_output_entry { + unsigned used; + unsigned ceil; + fsm_end_id_t ids[]; + } *entry; + } *buckets; + } htab; +}; + +void +fsm_eager_output_set_cb(struct fsm *fsm, fsm_eager_output_cb *cb, void *opaque) +{ +#if LOG_LEVEL > 2 + fprintf(stderr, "-- fsm_eager_output_set_cb %p\n", (void *)fsm); +#endif + assert(fsm != NULL); + assert(fsm->eager_output_info != NULL); + fsm->eager_output_info->cb = cb; + fsm->eager_output_info->opaque = opaque; +} + +void +fsm_eager_output_get_cb(const struct fsm *fsm, fsm_eager_output_cb **cb, void **opaque) +{ + *cb = fsm->eager_output_info->cb; + *opaque = fsm->eager_output_info->opaque; +} + +int +fsm_eager_output_init(struct fsm *fsm) +{ + struct eager_output_info *ei = f_calloc(fsm->alloc, 1, sizeof(*ei)); + + if (ei == NULL) { return 0; } + + struct eager_output_bucket *buckets = f_calloc(fsm->alloc, + DEF_BUCKET_COUNT, sizeof(buckets[0])); + if (buckets == NULL) { + f_free(fsm->alloc, ei); + return 0; + } + +#if LOG_LEVEL > 2 + fprintf(stderr, "-- fsm_eager_output_init %p\n", (void *)fsm); +#endif + + ei->htab.buckets = buckets; + ei->htab.bucket_count = DEF_BUCKET_COUNT; + + fsm->eager_output_info = ei; + return 1; +} + +void +fsm_eager_output_free(struct fsm *fsm) +{ + if (fsm == NULL || fsm->eager_output_info == NULL) { return; } + + for (size_t i = 0; i < fsm->eager_output_info->htab.bucket_count; i++) { + struct eager_output_bucket *b = &fsm->eager_output_info->htab.buckets[i]; + if (b->entry == NULL) { continue; } + 
f_free(fsm->alloc, b->entry); + } + f_free(fsm->alloc, fsm->eager_output_info->htab.buckets); + + f_free(fsm->alloc, fsm->eager_output_info); +#if LOG_LEVEL > 2 + fprintf(stderr, "-- fsm_eager_output_free %p\n", (void *)fsm); +#endif + fsm->eager_output_info = NULL; +} + +/* Attach eager output `id` to every end state of fsm. Returns 1 on success, 0 as soon as one fsm_seteageroutput call fails. */ int +fsm_seteageroutputonends(struct fsm *fsm, fsm_output_id_t id) +{ + assert(fsm != NULL); + const size_t count = fsm_countstates(fsm); + for (size_t i = 0; i < count; i++) { + if (fsm_isend(fsm, i)) { + if (!fsm_seteageroutput(fsm, i, id)) { return 0; } + } + } + return 1; +} + +/* Double the bucket array and re-insert every occupied bucket by linear probing; the entry pointers are moved, not copied. Returns false, leaving the table untouched, if the new array cannot be allocated. */ static bool +grow_htab(const struct fsm_alloc *alloc, struct eager_output_htab *htab) +{ + const size_t nbucket_count = 2*htab->bucket_count; + assert(nbucket_count != 0); + + struct eager_output_bucket *nbuckets = f_calloc(alloc, nbucket_count, + sizeof(nbuckets[0])); + if (nbuckets == NULL) { return false; } + + const uint64_t nmask = nbucket_count - 1; + assert((nmask & nbucket_count) == 0); /* power of 2 */ + + for (size_t ob_i = 0; ob_i < htab->bucket_count; ob_i++) { + struct eager_output_bucket *ob = &htab->buckets[ob_i]; + if (ob->entry == NULL) { continue; } + + const uint64_t hash = hash_id(ob->state); + for (size_t probes = 0; probes < nbucket_count; probes++) { + const size_t nb_i = (hash + probes) & nmask; + struct eager_output_bucket *nb = &nbuckets[nb_i]; + if (nb->entry == NULL) { + nb->state = ob->state; + nb->entry = ob->entry; + break; + } else { + assert(nb->state != ob->state); + } + } + } + + f_free(alloc, htab->buckets); + htab->bucket_count = nbucket_count; + htab->buckets = nbuckets; + return true; +} + +/* Add eager output `id` to `state`; an ID the state already has is ignored. The table is grown first once at least half of its buckets are in use. Returns 1 on success and 0 on allocation failure or if no bucket could be found for the state. */ int +fsm_seteageroutput(struct fsm *fsm, fsm_state_t state, fsm_output_id_t id) +{ + assert(fsm != NULL); + + struct eager_output_info *info = fsm->eager_output_info; + assert(info->htab.bucket_count > 0); + + if (info->htab.buckets_used >= info->htab.bucket_count/2) { + if (!grow_htab(fsm->alloc, &info->htab)) { return 0; } + } + + const uint64_t hash = hash_id(state); + const uint64_t mask = 
info->htab.bucket_count - 1; + assert((mask & info->htab.bucket_count) == 0); /* power of 2 */ + + /* fprintf(stderr, "%s: bucket_count %zd\n", __func__, info->htab.bucket_count); */ + for (size_t probes = 0; probes < info->htab.bucket_count; probes++) { + const size_t b_i = (hash + probes) & mask; + struct eager_output_bucket *b = &info->htab.buckets[b_i]; + /* fprintf(stderr, "%s: state %d -> b_i %zd, state %d, entry %p\n", */ + /* __func__, state, b_i, b->state, (void *)b->entry); */ + struct eager_output_entry *e = b->entry; + if (e == NULL) { /* empty */ + /* add */ + const size_t alloc_sz = sizeof(*e) + + DEF_ENTRY_CEIL * sizeof(e->ids[0]); + e = f_calloc(fsm->alloc, 1, alloc_sz); + if (e == NULL) { + return 0; + } + e->ceil = DEF_ENTRY_CEIL; + b->state = state; + b->entry = e; + info->htab.buckets_used++; + /* fprintf(stderr, "%s: buckets_used %zd\n", __func__, info->htab.buckets_used); */ + /* fprintf(stderr, "%s: saved new entry in bucket %zd\n", __func__, b_i); */ + } else if (b->state != state) { /* collision */ + continue; + } + + /* ignore duplicates -- checked before growing, so re-adding a known ID never reallocates */ + for (size_t i = 0; i < e->used; i++) { + if (e->ids[i] == id) { return 1; } + } + + if (e->used == e->ceil) { + const size_t nceil = 2 * e->ceil; + const size_t nsize = sizeof(*e) + + nceil * sizeof(e->ids[0]); + struct eager_output_entry *nentry = f_realloc(fsm->alloc, e, nsize); + if (nentry == NULL) { return 0; } + nentry->ceil = nceil; + b->entry = nentry; + e = b->entry; + } + + e->ids[e->used++] = id; + /* fprintf(stderr, "%s: e->ids_used %u\n", __func__, e->used); */ + fsm->states[state].has_eager_outputs = 1; + return 1; + } + + return 0; /* every bucket was probed without finding an empty or matching one, so the ID was not stored: report failure rather than success */ +} + +/* Report whether any bucket in fsm's table holds at least one eager output ID. fsm->eager_output_info must be non-NULL (asserted). */ bool +fsm_eager_output_has_eager_output(const struct fsm *fsm) +{ + assert(fsm->eager_output_info != NULL); + const struct eager_output_htab *htab = &fsm->eager_output_info->htab; + + for (size_t b_i = 0; b_i < htab->bucket_count; b_i++) { + struct eager_output_bucket *b = &htab->buckets[b_i]; + if (b->entry == NULL) { continue; } + if (b->entry->used > 
0) { return 1; } + } + return 0; +} + +/* Report whether `state` has eager outputs by reading its has_eager_outputs flag; the hash table is not consulted. `state` must be below fsm->statecount (asserted). */ bool +fsm_eager_output_state_has_eager_output(const struct fsm *fsm, fsm_state_t state) +{ + assert(state < fsm->statecount); + return fsm->states[state].has_eager_outputs; +} + +/* Call cb(state, id, opaque) for each eager output ID stored for `state`. The table is searched by linear probing from hash_id(state): an empty bucket ends the search, and a zero return from cb stops the iteration early. */ void +fsm_eager_output_iter_state(const struct fsm *fsm, + fsm_state_t state, fsm_eager_output_iter_cb *cb, void *opaque) +{ + assert(fsm != NULL); + assert(cb != NULL); + + const uint64_t hash = hash_id(state); + + struct eager_output_info *info = fsm->eager_output_info; + const uint64_t mask = info->htab.bucket_count - 1; + assert((mask & info->htab.bucket_count) == 0); /* power of 2 */ + + for (size_t probes = 0; probes < info->htab.bucket_count; probes++) { + const size_t b_i = (hash + probes) & mask; + struct eager_output_bucket *b = &info->htab.buckets[b_i]; + /* fprintf(stderr, "%s: state %d -> b_i %zd, state %d, entry %p\n", */ + /* __func__, state, b_i, b->state, (void *)b->entry); */ + struct eager_output_entry *e = b->entry; + if (e == NULL) { /* empty */ + return; + } else if (b->state != state) { /* collision */ + continue; + } + + assert(e->used == 0 || fsm->states[state].has_eager_outputs); + + for (size_t i = 0; i < e->used; i++) { + if (!cb(state, e->ids[i], opaque)) { return; } + } /* no return after the matching bucket: probing carries on until an empty bucket is found or every bucket has been visited */ + } +} + +/* Call cb(state, id, opaque) for every ID of every occupied bucket, in bucket order; empty buckets are skipped and a zero return from cb stops the iteration. fsm, cb and fsm->eager_output_info must be non-NULL (asserted). */ void +fsm_eager_output_iter_all(const struct fsm *fsm, + fsm_eager_output_iter_cb *cb, void *opaque) +{ + assert(fsm != NULL); + assert(cb != NULL); + assert(fsm->eager_output_info != NULL); + + struct eager_output_info *info = fsm->eager_output_info; + + /* fprintf(stderr, "%s: bucket_count %zd\n", __func__, info->htab.bucket_count); */ + for (size_t b_i = 0; b_i < info->htab.bucket_count; b_i++) { + struct eager_output_bucket *b = &info->htab.buckets[b_i]; + struct eager_output_entry *e = b->entry; + /* fprintf(stderr, "%s: b_i %zd, state %d, entry %p\n", */ + /* __func__, b_i, b->state, (void *)b->entry); */ + if (e == NULL) { /* empty */ + continue; + } + assert(e->used == 0 || fsm->states[b->state].has_eager_outputs); + + for (size_t i = 0; 
i < e->used; i++) { + if (!cb(b->state, e->ids[i], opaque)) { return; } + } + } +} + +/* Accumulator for fsm_eager_output_dump: the output stream and a count of the pairs printed. */ struct dump_env { + FILE *f; + size_t count; +}; + +/* Iteration callback: print one (state, id) pair to env->f and count it; always continues. */ static int +dump_cb(fsm_state_t state, fsm_end_id_t id, void *opaque) + +{ + struct dump_env *env = opaque; + fprintf(env->f, "-- %d: id %d\n", state, id); + env->count++; + return 1; +} + +/* Print every (state, id) pair in fsm's eager output table to f, followed by the total. */ void +fsm_eager_output_dump(FILE *f, const struct fsm *fsm) +{ + struct dump_env env = { .f = f }; + fprintf(f, "%s:\n", __func__); + fsm_eager_output_iter_all(fsm, dump_cb, (void *)&env); + fprintf(f, "== %zu total\n", env.count); +} + +/* Iteration callback: increment the size_t counter passed as opaque; always continues. */ static int +inc_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + (void)state; + (void)id; + size_t *count = opaque; + (*count)++; + return 1; +} + +/* Count the eager output IDs stored for `state`. The count is written to *count when count is non-NULL; returns whether it is greater than zero. */ bool +fsm_eager_output_has_any(const struct fsm *fsm, + fsm_state_t state, size_t *count) +{ + size_t c = 0; + fsm_eager_output_iter_state(fsm, state, &inc_cb, &c); + if (count != NULL) { *count = c; } + return c > 0; +} + +/* Re-key the table after states have been renumbered: each occupied bucket for old state s is re-inserted under mapping[s], and the entry of a state mapped to FSM_STATE_REMAP_NO_STATE is freed. Returns 1 on success, 0 if the replacement bucket array cannot be allocated (the table is left unchanged in that case). */ int +fsm_eager_output_compact(struct fsm *fsm, fsm_state_t *mapping, size_t mapping_count) +{ + /* Don't reallocate unless something has actually changed. 
*/ + bool changes = false; + for (size_t i = 0; i < mapping_count; i++) { + if (mapping[i] != i) { + changes = true; + break; + } + } + + /* nothing to do */ + if (!changes) { return 1; } + + struct eager_output_info *eoi = fsm->eager_output_info; + + struct eager_output_bucket *nbuckets = f_calloc(fsm->alloc, + eoi->htab.bucket_count, sizeof(nbuckets[0])); + if (nbuckets == NULL) { + return 0; + } + + const uint64_t mask = eoi->htab.bucket_count - 1; + assert((eoi->htab.bucket_count & mask) == 0); + + for (size_t ob_i = 0; ob_i < eoi->htab.bucket_count; ob_i++) { + const struct eager_output_bucket *ob = &eoi->htab.buckets[ob_i]; + if (ob->entry == NULL) { continue; } + + assert(ob->state < mapping_count); + const fsm_state_t nstate = mapping[ob->state]; + if (nstate == FSM_STATE_REMAP_NO_STATE) { /* state was removed: the old bucket array is freed below and nothing else points at this entry, so release it here instead of leaking it, and keep buckets_used in step */ assert(eoi->htab.buckets_used > 0); eoi->htab.buckets_used--; f_free(fsm->alloc, ob->entry); continue; } + + const uint64_t hash = hash_id(nstate); + + bool placed = false; + for (size_t probes = 0; probes < eoi->htab.bucket_count; probes++) { + const size_t nb_i = (hash + probes) & mask; + struct eager_output_bucket *nb = &nbuckets[nb_i]; + if (nb->entry == NULL) { + nb->state = nstate; + nb->entry = ob->entry; + placed = true; + break; + } + } + assert(placed); + } + + f_free(fsm->alloc, eoi->htab.buckets); + eoi->htab.buckets = nbuckets; + return 1; +} diff --git a/src/libfsm/eager_output.h b/src/libfsm/eager_output.h new file mode 100644 index 000000000..1b48ba4c4 --- /dev/null +++ b/src/libfsm/eager_output.h @@ -0,0 +1,46 @@ +#ifndef EAGER_OUTPUT_H +#define EAGER_OUTPUT_H + +#include +#include +#include + +struct eager_output_info; + +int +fsm_eager_output_init(struct fsm *fsm); + +void +fsm_eager_output_free(struct fsm *fsm); + +bool +fsm_eager_output_has_eager_output(const struct fsm *fsm); + +bool +fsm_eager_output_state_has_eager_output(const struct fsm *fsm, fsm_state_t state); + +void +fsm_eager_output_dump(FILE *f, const struct fsm *fsm); + +/* Callback for fsm_eager_output_iter_*. + * The return value indicates whether iteration should continue. 
+ * The results may not be sorted in any particular order. */ +typedef int +fsm_eager_output_iter_cb(fsm_state_t state, fsm_output_id_t id, void *opaque); + +void +fsm_eager_output_iter_state(const struct fsm *fsm, + fsm_state_t state, fsm_eager_output_iter_cb *cb, void *opaque); + +void +fsm_eager_output_iter_all(const struct fsm *fsm, + fsm_eager_output_iter_cb *cb, void *opaque); + +bool +fsm_eager_output_has_any(const struct fsm *fsm, + fsm_state_t state, size_t *count); + +int +fsm_eager_output_compact(struct fsm *fsm, fsm_state_t *mapping, size_t mapping_count); + +#endif diff --git a/src/libfsm/epsilons.c b/src/libfsm/epsilons.c index 9394a2d9b..adfcdec2a 100644 --- a/src/libfsm/epsilons.c +++ b/src/libfsm/epsilons.c @@ -9,24 +9,42 @@ #include #include #include +#include #include #include +#include #include #include #include #include +#include #include "internal.h" #include "capture.h" #include "endids.h" +#include "eager_output.h" #define DUMP_EPSILON_CLOSURES 0 #define DEF_PENDING_CAPTURE_ACTIONS_CEIL 2 #define LOG_RM_EPSILONS_CAPTURES 0 #define DEF_CARRY_ENDIDS_COUNT 2 +#define LOG_LEVEL 0 + +#if LOG_LEVEL > 0 +static bool log_it; +#define LOG(LVL, ...) \ + do { \ + if (log_it && LVL <= LOG_LEVEL) { \ + fprintf(stderr, __VA_ARGS__); \ + } \ + } while (0) +#else +#define LOG(_LVL, ...) +#endif + struct remap_env { #ifndef NDEBUG char tag; @@ -57,6 +75,49 @@ static int carry_endids(struct fsm *fsm, struct state_set *states, fsm_state_t s); +static void +mark_states_reachable_by_label(const struct fsm *nfa, uint64_t *reachable_by_label); + +struct eager_output_buf { +#define DEF_EAGER_OUTPUT_BUF_CEIL 8 + bool ok; + const struct fsm_alloc *alloc; + size_t ceil; + size_t used; + fsm_output_id_t *ids; +}; + +static bool +append_eager_output_id(struct eager_output_buf *buf, fsm_output_id_t id) +{ + if (buf->used == buf->ceil) { + const size_t nceil = buf->ceil == 0 ? 
DEF_EAGER_OUTPUT_BUF_CEIL : 2*buf->ceil; + fsm_output_id_t *nids = f_realloc(buf->alloc, buf->ids, nceil * sizeof(nids[0])); + if (nids == NULL) { + buf->ok = false; + return false; + } + buf->ids = nids; + buf->ceil = nceil; + } + + for (size_t i = 0; i < buf->used; i++) { + /* avoid duplicates */ + if (buf->ids[i] == id) { return true; } + } + + buf->ids[buf->used++] = id; + return true; +} + +static int +collect_eager_output_ids_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + (void)state; + struct eager_output_buf *buf = opaque; + return append_eager_output_id(buf, id) ? 1 : 0; +} + int fsm_remove_epsilons(struct fsm *nfa) { @@ -64,9 +125,20 @@ fsm_remove_epsilons(struct fsm *nfa) int res = 0; struct state_set **eclosures = NULL; fsm_state_t s; + struct eager_output_buf eager_output_buf = { + .ok = true, + .alloc = nfa->alloc, + }; + uint64_t *reachable_by_label = NULL; + + LOG(2, "%s: starting\n", __func__); INIT_TIMERS(); +#if LOG_LEVEL > 0 + log_it = getenv("LOG") != NULL; +#endif + assert(nfa != NULL); TIME(&pre); @@ -94,6 +166,17 @@ fsm_remove_epsilons(struct fsm *nfa) } #endif + const size_t state_words = u64bitset_words(state_count); + reachable_by_label = f_calloc(nfa->alloc, state_words, sizeof(reachable_by_label[0])); + if (reachable_by_label == NULL) { goto cleanup; } + + mark_states_reachable_by_label(nfa, reachable_by_label); + + fsm_state_t start; + if (!fsm_getstart(nfa, &start)) { + goto cleanup; /* no start state */ + } + for (s = 0; s < state_count; s++) { struct state_iter si; fsm_state_t es_id; @@ -101,6 +184,12 @@ fsm_remove_epsilons(struct fsm *nfa) struct edge_group_iter egi; struct edge_group_iter_info info; + /* If the state isn't reachable by a label and isn't the start state, + * skip processing -- it will soon become garbage. */ + if (!u64bitset_get(reachable_by_label, s) && s != start) { + continue; + } + /* Process the epsilon closure. 
*/ state_set_reset(eclosures[s], &si); while (state_set_next(&si, &es_id)) { @@ -129,6 +218,16 @@ fsm_remove_epsilons(struct fsm *nfa) } } + /* Collect every eager output ID from any state + * in the current state's epsilon closure to the + * current state. These will be added at the end. */ + { + if (fsm_eager_output_has_any(nfa, es_id, NULL)) { + fsm_eager_output_iter_state(nfa, es_id, collect_eager_output_ids_cb, &eager_output_buf); + if (!eager_output_buf.ok) { goto cleanup; } + } + } + /* For every state in this state's transitive * epsilon closure, add all of their sets of * labeled edges. */ @@ -144,6 +243,13 @@ fsm_remove_epsilons(struct fsm *nfa) } } } + + for (size_t i = 0; i < eager_output_buf.used; i++) { + if (!fsm_seteageroutput(nfa, s, eager_output_buf.ids[i])) { + goto cleanup; + } + } + eager_output_buf.used = 0; /* clear */ } /* Remove the epsilon-edge state sets from everything. @@ -170,13 +276,53 @@ fsm_remove_epsilons(struct fsm *nfa) res = 1; cleanup: + LOG(2, "%s: finishing\n", __func__); if (eclosures != NULL) { closure_free(nfa, eclosures, state_count); } + f_free(nfa->alloc, reachable_by_label); + f_free(nfa->alloc, eager_output_buf.ids); return res; } +/* For every state, mark every state reached by a labeled edge as + * reachable. This doesn't check that the FROM state is reachable from + * the start state (trim will do that soon enough), it's just used to + * check which states will become unreachable once epsilon edges are + * removed. We don't need to add eager endids for them, because they + * will soon be disconnected from the epsilon-free NFA. 
*/ +static void +mark_states_reachable_by_label(const struct fsm *nfa, uint64_t *reachable_by_label) +{ + fsm_state_t start; + if (!fsm_getstart(nfa, &start)) { + return; /* nothing reachable */ + } + u64bitset_set(reachable_by_label, start); + + const fsm_state_t state_count = fsm_countstates(nfa); + + for (size_t s_i = 0; s_i < state_count; s_i++) { + struct edge_group_iter egi; + struct edge_group_iter_info info; + + struct fsm_state *s = &nfa->states[s_i]; + + /* Clear the visited flag, it will be used to avoid cycles. */ +#if 1 + assert(s->visited == 0); /* stale */ +#endif + s->visited = 0; + + edge_set_group_iter_reset(s->edges, EDGE_GROUP_ITER_ALL, &egi); + while (edge_set_group_iter_next(&egi, &info)) { + LOG(1, "%s: reachable: %d\n", __func__, info.to); + u64bitset_set(reachable_by_label, info.to); + } + } +} + static int remap_capture_actions(struct fsm *nfa, struct state_set **eclosures) { @@ -425,4 +571,3 @@ carry_endids(struct fsm *fsm, struct state_set *states, return env.ok; } - diff --git a/src/libfsm/exec.c b/src/libfsm/exec.c index 9f7b21802..077494b8f 100644 --- a/src/libfsm/exec.c +++ b/src/libfsm/exec.c @@ -20,9 +20,12 @@ #include "internal.h" #include "capture.h" +#include "eager_output.h" #define LOG_EXEC 0 +#define LOG_EAGER 0 + static int transition(const struct fsm *fsm, fsm_state_t state, int c, size_t offset, struct fsm_capture *captures, @@ -43,6 +46,44 @@ transition(const struct fsm *fsm, fsm_state_t state, int c, return 1; } +struct check_eager_outputs_for_state_env { + const struct fsm *fsm; + fsm_eager_output_cb *cb; + void *opaque; +}; + +static int +match_eager_outputs_for_state_cb(fsm_state_t state, fsm_end_id_t id, void *opaque) +{ + /* HACK update the types here once it's working */ + (void)state; + struct check_eager_outputs_for_state_env *env = opaque; +#if LOG_EAGER + fprintf(stderr, "%s: state %d, id %d\n", __func__, state, id); +#endif + env->cb(id, env->opaque); + return 1; +} + +static int 
+match_eager_outputs_for_state(const struct fsm *fsm, fsm_state_t state) +{ + /* HACK update the types here once it's working */ + fsm_eager_output_cb *cb = NULL; + void *opaque = NULL; + fsm_eager_output_get_cb(fsm, &cb, &opaque); + if (cb == NULL) { return 1; } /* nothing to do */ + + struct check_eager_outputs_for_state_env env = { + .fsm = fsm, + .cb = cb, + .opaque = opaque, + }; + fsm_eager_output_iter_state(fsm, + state, match_eager_outputs_for_state_cb, &env); + return 1; +} + int fsm_exec(const struct fsm *fsm, int (*fsm_getc)(void *opaque), void *opaque, @@ -73,6 +114,7 @@ fsm_exec(const struct fsm *fsm, errno = EINVAL; return -1; } + const fsm_state_t start = state; for (i = 0; i < capture_count; i++) { captures[i].pos[0] = FSM_CAPTURE_NO_POS; @@ -83,6 +125,12 @@ fsm_exec(const struct fsm *fsm, fprintf(stderr, "fsm_exec: starting at %d\n", state); #endif + if (fsm->states[start].has_eager_outputs) { + if (!match_eager_outputs_for_state(fsm, start)) { + return 0; + } + } + while (c = fsm_getc(opaque), c != EOF) { if (!transition(fsm, state, c, offset, captures, &state)) { #if LOG_EXEC @@ -91,6 +139,12 @@ fsm_exec(const struct fsm *fsm, return 0; } + if (fsm->states[state].has_eager_outputs) { + if (!match_eager_outputs_for_state(fsm, state)) { + return 0; + } + } + #if LOG_EXEC fprintf(stderr, "fsm_exec: @ %zu, input '%c', new state %u\n", offset, c, state); @@ -113,4 +167,3 @@ fsm_exec(const struct fsm *fsm, *end = state; return 1; } - diff --git a/src/libfsm/fsm.c b/src/libfsm/fsm.c index ba2d2db26..c442c8262 100644 --- a/src/libfsm/fsm.c +++ b/src/libfsm/fsm.c @@ -21,6 +21,7 @@ #include "internal.h" #include "capture.h" #include "endids.h" +#include "eager_output.h" /* guess for default state allocation */ #define FSM_DEFAULT_STATEALLOC 128 @@ -39,6 +40,7 @@ free_contents(struct fsm *fsm) fsm_capture_free(fsm); fsm_endid_free(fsm); + fsm_eager_output_free(fsm); f_free(fsm->alloc, fsm->states); } @@ -92,6 +94,14 @@ fsm_new_statealloc(const struct 
fsm_alloc *alloc, size_t statealloc) return NULL; } + if (!fsm_eager_output_init(new)) { /* release the sub-structures first (same order as free_contents) and the fsm itself last: the capture/endid free routines take `new`, so it must not have been freed yet */ + fsm_capture_free(new); + fsm_endid_free(new); + f_free(new->alloc, new->states); + f_free(new->alloc, new); + return NULL; + } + return new; } @@ -133,6 +143,7 @@ fsm_move(struct fsm *dst, struct fsm *src) dst->capture_info = src->capture_info; dst->endid_info = src->endid_info; + dst->eager_output_info = src->eager_output_info; f_free(src->alloc, src); } diff --git a/src/libfsm/internal.h b/src/libfsm/internal.h index f84bbef0f..46997c82a 100644 --- a/src/libfsm/internal.h +++ b/src/libfsm/internal.h @@ -60,6 +60,10 @@ struct fsm_state { /* meaningful within one particular transformation only */ unsigned int visited:1; + + /* If 0, then this state has no need for checking + * the fsm->eager_output_info struct. */ + unsigned int has_eager_outputs:1; }; struct fsm { @@ -75,6 +79,7 @@ struct fsm { struct fsm_capture_info *capture_info; struct endid_info *endid_info; + struct eager_output_info *eager_output_info; }; struct fsm * diff --git a/src/libfsm/libfsm.syms b/src/libfsm/libfsm.syms index 34be09e77..75c20eb64 100644 --- a/src/libfsm/libfsm.syms +++ b/src/libfsm/libfsm.syms @@ -2,6 +2,7 @@ fsm_complement fsm_union fsm_union_array +fsm_union_repeated_pattern_group fsm_intersect fsm_intersect_charset @@ -72,6 +73,8 @@ fsm_removestate fsm_shuffle fsm_vacuum +fsm_new_statealloc + fsm_addedge_any fsm_addedge_epsilon fsm_addedge_literal @@ -95,6 +98,14 @@ fsm_setendid fsm_mapendids fsm_increndids +fsm_endid_dump + +fsm_seteageroutput +fsm_seteageroutputonends +# short term hack +fsm_eager_output_set_cb +fsm_eager_output_dump + fsm_countedges fsm_countstates diff --git a/src/libfsm/merge.c b/src/libfsm/merge.c index 8c972c145..ccc1568ff 100644 --- a/src/libfsm/merge.c +++ b/src/libfsm/merge.c @@ -22,6 +22,7 @@ #include "capture.h" #include "internal.h" #include "endids.h" +#include "eager_output.h" #define LOG_MERGE_ENDIDS 0 @@ -39,6 +40,9 @@ copy_capture_actions(struct fsm *dst, 
struct fsm *src); static int copy_end_ids(struct fsm *dst, struct fsm *src, fsm_state_t base_src); +static int +copy_eager_output_ids(struct fsm *dst, struct fsm *src, fsm_state_t base_src); + static struct fsm * merge(struct fsm *dst, struct fsm *src, fsm_state_t *base_dst, fsm_state_t *base_src, @@ -113,6 +117,11 @@ merge(struct fsm *dst, struct fsm *src, return NULL; } + if (!copy_eager_output_ids(dst, src, *base_src)) { + /* non-recoverable -- destructive operation */ + return NULL; + } + f_free(src->alloc, src->states); src->states = NULL; src->statealloc = 0; @@ -194,6 +203,39 @@ copy_end_ids(struct fsm *dst, struct fsm *src, fsm_state_t base_src) return fsm_endid_iter_bulk(src, copy_end_ids_cb, &env); } +struct copy_eager_output_ids_env { + bool ok; + struct fsm *dst; + struct fsm *src; + fsm_state_t base_src; +}; + +static int +copy_eager_output_ids_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + struct copy_eager_output_ids_env *env = opaque; + if (!fsm_seteageroutput(env->dst, state + env->base_src, id)) { + env->ok = false; + return 0; + } + + return 1; + +} + +static int +copy_eager_output_ids(struct fsm *dst, struct fsm *src, fsm_state_t base_src) +{ + struct copy_eager_output_ids_env env = { + .ok = true, + .dst = dst, + .src = src, + .base_src = base_src, + }; + fsm_eager_output_iter_all(src, copy_eager_output_ids_cb, &env); + return env.ok; +} + struct fsm * fsm_mergeab(struct fsm *a, struct fsm *b, fsm_state_t *base_b) diff --git a/src/libfsm/minimise.c b/src/libfsm/minimise.c index a8d53c57e..86f00b46f 100644 --- a/src/libfsm/minimise.c +++ b/src/libfsm/minimise.c @@ -25,6 +25,8 @@ #include "internal.h" #include "capture.h" +#include "eager_output.h" +#include "endids.h" #define LOG_MAPPINGS 0 #define LOG_STEPS 0 @@ -54,12 +56,21 @@ struct end_metadata { unsigned count; fsm_end_id_t *ids; } end; + + struct end_metadata_eager_outputs { + unsigned count; + fsm_output_id_t *ids; + } eager_outputs; }; static int collect_end_ids(const 
struct fsm *fsm, fsm_state_t s, struct end_metadata_end *e); +static int +collect_eager_output_ids(const struct fsm *fsm, fsm_state_t s, + struct end_metadata_eager_outputs *e); + int fsm_minimise(struct fsm *fsm) { @@ -122,6 +133,10 @@ fsm_minimise(struct fsm *fsm) /* Minimisation should never add states. */ assert(minimised_states <= orig_states); + for (size_t i = 0; i < fsm->statecount; i++) { + assert(mapping[i] < fsm->statecount); + } + /* Use the mapping to consolidate the current states * into a new DFA, combining states that could not be * proven distinguishable. */ @@ -693,6 +708,9 @@ same_end_metadata(const struct end_metadata *a, const struct end_metadata *b) if (a->end.count != b->end.count) { return 0; } + if (a->eager_outputs.count != b->eager_outputs.count) { + return 0; + } /* compare -- these must be sorted */ @@ -702,6 +720,12 @@ same_end_metadata(const struct end_metadata *a, const struct end_metadata *b) } } + for (size_t i = 0; i < a->eager_outputs.count; i++) { + if (a->eager_outputs.ids[i] != b->eager_outputs.ids[i]) { + return 0; + } + } + return 1; } @@ -750,14 +774,21 @@ split_ecs_by_end_metadata(struct min_env *env, const struct fsm *fsm) #endif while (s != NO_ID) { struct end_metadata *e = &end_md[s]; - if (!fsm_isend(fsm, s)) { - break; /* this EC has non-end states, skip */ + const bool is_end = fsm_isend(fsm, s); + const bool has_eager_outputs = fsm_eager_output_state_has_eager_output(fsm, s); + + if (!is_end && !has_eager_outputs) { + break; /* skip */ } if (!collect_end_ids(fsm, s, &e->end)) { goto cleanup; } + if (!collect_eager_output_ids(fsm, s, &e->eager_outputs)) { + goto cleanup; + } + s = env->jump[s]; } } @@ -789,6 +820,10 @@ split_ecs_by_end_metadata(struct min_env *env, const struct fsm *fsm) incremental_hash_of_ids(&hash, s_md->end.ids[eid_i]); } + for (size_t eo_i = 0; eo_i < s_md->eager_outputs.count; eo_i++) { + incremental_hash_of_ids(&hash, s_md->eager_outputs.ids[eo_i]); + } + for (size_t b_i = 0; b_i < 
bucket_count; b_i++) { fsm_state_t *b = &htab[(b_i + hash) & mask]; const fsm_state_t other = *b; @@ -932,6 +967,9 @@ split_ecs_by_end_metadata(struct min_env *env, const struct fsm *fsm) if (e->end.ids != NULL) { f_free(fsm->alloc, e->end.ids); } + if (e->eager_outputs.ids != NULL) { + f_free(fsm->alloc, e->eager_outputs.ids); + } } f_free(fsm->alloc, end_md); } @@ -959,7 +997,7 @@ collect_end_ids(const struct fsm *fsm, fsm_state_t s, #if LOG_ECS fprintf(stderr, "%d:", s); - for (size_t i = 0; i < written; i++) { + for (size_t i = 0; i < e->count; i++) { fprintf(stderr, " %u", e->ids[i]); } fprintf(stderr, "\n"); @@ -968,6 +1006,41 @@ collect_end_ids(const struct fsm *fsm, fsm_state_t s, return 1; } +static int +collect_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + (void)state; + struct end_metadata_eager_outputs *e = opaque; + e->ids[e->count++] = id; + return 1; +} + +static int cmp_eager_output_id(const void *pa, const void *pb) +{ + const fsm_output_id_t a = *(fsm_output_id_t *)pa; + const fsm_output_id_t b = *(fsm_output_id_t *)pb; + return a < b ? -1 : a > b ? 
1 : 0; +} + +static int +collect_eager_output_ids(const struct fsm *fsm, fsm_state_t state, + struct end_metadata_eager_outputs *e) +{ + size_t count = 0; + if (!fsm_eager_output_has_any(fsm, state, &count)) { + return 1; /* nothing to do */ + } + + e->ids = f_malloc(fsm->alloc, count * sizeof(e->ids[0])); + if (e->ids == NULL) { return 0; } + + fsm_eager_output_iter_state(fsm, state, collect_cb, e); + + /* sort, to normalize set */ + qsort(e->ids, e->count, sizeof(e->ids[0]), cmp_eager_output_id); + return 1; +} + #if EXPENSIVE_CHECKS static void check_done_ec_offset(const struct min_env *env) diff --git a/src/libfsm/print/c.c b/src/libfsm/print/c.c index 22b03963e..cc3927dc6 100644 --- a/src/libfsm/print/c.c +++ b/src/libfsm/print/c.c @@ -222,6 +222,14 @@ print_case(FILE *f, const struct ir *ir, assert(f != NULL); assert(cs != NULL); + if (cs->eager_outputs != NULL && opt->fragment) { + /* If .fragment is set and the state has eager outputs, then emit a call to a + * macro (the caller is expected to define). This is a temporary interface. */ + for (size_t i = 0; i < cs->eager_outputs->count; i++) { + fprintf(f, "\t\t\tFSM_SET_EAGER_OUTPUT(%u);\n", cs->eager_outputs->ids[i]); + } + } + switch (cs->strategy) { case IR_NONE: fprintf(f, "\t\t\t"); @@ -377,6 +385,11 @@ print_endstates(FILE *f, const struct fsm_state_metadata state_metadata = { .end_ids = ir->states[i].endids.ids, .end_id_count = ir->states[i].endids.count, + + .eager_output_count = (ir->states[i].eager_outputs == NULL + ? 0 : ir->states[i].eager_outputs->count), + .eager_output_ids = (ir->states[i].eager_outputs == NULL + ? 
NULL : ir->states[i].eager_outputs->ids), }; if (-1 == print_hook_accept(f, opt, hooks, diff --git a/src/libfsm/print/ir.c b/src/libfsm/print/ir.c index 457716dcc..81d5890e0 100644 --- a/src/libfsm/print/ir.c +++ b/src/libfsm/print/ir.c @@ -26,6 +26,7 @@ #include #include "libfsm/internal.h" +#include "libfsm/eager_output.h" #include "ir.h" @@ -505,6 +506,23 @@ make_example(const struct fsm *fsm, fsm_state_t s, char **example) return 0; } +static int +append_eager_output_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + struct ir_state_eager_output *outputs = opaque; + (void)state; + outputs->ids[outputs->count++] = id; + return 1; +} + +static int +cmp_fsm_output_id_t(const void *pa, const void *pb) +{ + const fsm_output_id_t a = *(fsm_output_id_t *)pa; + const fsm_output_id_t b = *(fsm_output_id_t *)pb; + return a < b ? -1 : a > b ? 1 : 0; +} + struct ir * make_ir(const struct fsm *fsm, const struct fsm_options *opt) { @@ -544,6 +562,8 @@ make_ir(const struct fsm *fsm, const struct fsm_options *opt) ir->states[i].endids.ids = NULL; ir->states[i].endids.count = 0; + ir->states[i].eager_outputs = NULL; + if (fsm_isend(fsm, i)) { fsm_end_id_t *ids; size_t count; @@ -567,6 +587,20 @@ make_ir(const struct fsm *fsm, const struct fsm_options *opt) ir->states[i].endids.count = count; } + size_t count; + if (fsm_eager_output_has_any(fsm, i, &count)) { + struct ir_state_eager_output *outputs = f_malloc(fsm->alloc, + sizeof(*outputs) + count * sizeof(outputs->ids[0])); + if (outputs == NULL) { + goto error; + } + outputs->count = 0; + fsm_eager_output_iter_state(fsm, i, append_eager_output_cb, outputs); + assert(outputs->count == count); + qsort(outputs->ids, outputs->count, sizeof(outputs->ids[0]), cmp_fsm_output_id_t); + ir->states[i].eager_outputs = outputs; + } + if (make_state(fsm, i, &ir->states[i]) == -1) { goto error; } @@ -630,6 +664,7 @@ free_ir(const struct fsm *fsm, struct ir *ir) for (i = 0; i < ir->n; i++) { f_free(fsm->alloc, (void *) 
ir->states[i].example); f_free(fsm->alloc, (void *) ir->states[i].endids.ids); + f_free(fsm->alloc, (void *) ir->states[i].eager_outputs); switch (ir->states[i].strategy) { case IR_TABLE: diff --git a/src/libfsm/print/ir.h b/src/libfsm/print/ir.h index b375ba850..7678d3f35 100644 --- a/src/libfsm/print/ir.h +++ b/src/libfsm/print/ir.h @@ -59,6 +59,11 @@ struct ir_state { size_t count; } endids; + struct ir_state_eager_output { + size_t count; + fsm_output_id_t ids[]; + } *eager_outputs; /* NULL -> 0 */ + unsigned int isend:1; enum ir_strategy strategy; diff --git a/src/libfsm/state.c b/src/libfsm/state.c index c845cbe46..d96c33653 100644 --- a/src/libfsm/state.c +++ b/src/libfsm/state.c @@ -19,6 +19,7 @@ #include "internal.h" #include "endids.h" +#include "eager_output.h" int fsm_addstate(struct fsm *fsm, fsm_state_t *state) @@ -44,6 +45,7 @@ fsm_addstate(struct fsm *fsm, fsm_state_t *state) for (i = fsm->statealloc; i < n; i++) { tmp[i].has_capture_actions = 0; + tmp[i].has_eager_outputs = 0; } fsm->statealloc = n; @@ -87,6 +89,8 @@ fsm_addstate_bulk(struct fsm *fsm, size_t n) new->visited = 0; new->epsilons = NULL; new->edges = NULL; + + new->has_eager_outputs = 0; } fsm->statecount += n; @@ -259,6 +263,10 @@ fsm_compact_states(struct fsm *fsm, if (!fsm_endid_compact(fsm, mapping, orig_statecount)) { return 0; } + if (!fsm_eager_output_compact(fsm, mapping, orig_statecount)) { + return 0; + } + assert(dst == kept); assert(kept == fsm->statecount); diff --git a/src/libfsm/union.c b/src/libfsm/union.c index a3b4b230c..0b18cd30c 100644 --- a/src/libfsm/union.c +++ b/src/libfsm/union.c @@ -15,9 +15,14 @@ #include #include #include +#include +#include #include "internal.h" +#include +#include "eager_output.h" + #define LOG_UNION_ARRAY 0 struct fsm * @@ -151,3 +156,231 @@ fsm_union_array(size_t fsm_count, return res; } + +#define LOG_UNION_REPEATED_PATTERN_GROUP 0 + +/* Combine an array of FSMs into a single FSM in one pass, with an extra loop + * so that more than one 
pattern with eager outputs can match. */ +struct fsm * +fsm_union_repeated_pattern_group(size_t entry_count, + struct fsm_union_entry *entries, struct fsm_combined_base_pair *bases) +{ + const struct fsm_alloc *alloc = entries[0].fsm->alloc; + const bool log = 0 || LOG_UNION_REPEATED_PATTERN_GROUP; + + if (entry_count == 1) { + return entries[0].fsm; + } + + size_t est_total_states = 0; + for (size_t i = 0; i < entry_count; i++) { + assert(entries[i].fsm); + if (entries[i].fsm->alloc != alloc) { + errno = EINVAL; + return NULL; + } + const size_t count = fsm_countstates(entries[i].fsm); + est_total_states += count; + } + + est_total_states += 5; /* new start and end, new unanchored start and end loops */ + + struct fsm *res = fsm_new_statealloc(alloc, est_total_states); + if (res == NULL) { return NULL; } + + /* collected end states */ + struct ends_buf { + size_t ceil; + size_t used; + fsm_state_t *states; + } ends = { .ceil = 0 }; + + /* The new overall start state, which will have an epsilon edge to... */ + fsm_state_t global_start; + if (!fsm_addstate(res, &global_start)) { goto fail; } + + /* states linking to the starts of unanchored and anchored subgraphs, respectively. */ + fsm_state_t global_start_loop, global_start_anchored; + if (!fsm_addstate(res, &global_start_loop)) { goto fail; } + if (!fsm_addstate(res, &global_start_anchored)) { goto fail; } + + /* The unanchored end loop state, and an end state with no outgoing edges. 
*/ + fsm_state_t global_end_loop, global_end; + if (!fsm_addstate(res, &global_end)) { goto fail; } + if (!fsm_addstate(res, &global_end_loop)) { goto fail; } + + /* link the start to the start loop and anchored start, and the start loop to itself */ + if (log) { + fprintf(stderr, "link_before: global_start %d -> global_start_loop %d and global_start_anchored %d\n", + global_start, global_start_loop, global_start_anchored); + } + if (!fsm_addedge_epsilon(res, global_start, global_start_loop)) { goto fail; } + if (!fsm_addedge_epsilon(res, global_start, global_start_anchored)) { goto fail; } + if (!fsm_addedge_any(res, global_start_loop, global_start_loop)) { goto fail; } + + /* link the end loop and end */ + if (log) { + fprintf(stderr, "link_before: global_end_loop %d -> global_end %d (and -> self)\n", global_end_loop, global_end); + } + if (!fsm_addedge_epsilon(res, global_end_loop, global_end)) { goto fail; } + if (!fsm_addedge_any(res, global_end_loop, global_end_loop)) { goto fail; } + + if (bases != NULL) { + memset(bases, 0x00, entry_count * sizeof(bases[0])); + } + + for (size_t fsm_i = 0; fsm_i < entry_count; fsm_i++) { + ends.used = 0; /* reset */ + + struct fsm *fsm = entries[fsm_i].fsm; + entries[fsm_i].fsm = NULL; /* transfer ownership */ + + const size_t state_count = fsm_countstates(fsm); + + fsm_state_t fsm_start; + if (!fsm_getstart(fsm, &fsm_start)) { + fsm_free(fsm); /* no start, just discard */ + continue; + } + + for (fsm_state_t s_i = 0; s_i < state_count; s_i++) { + if (fsm_isend(fsm, s_i)) { + if (ends.used == ends.ceil) { /* grow? */ + size_t nceil = (ends.ceil == 0 ? 
4 : 2*ends.ceil); + fsm_state_t *nstates = f_realloc(alloc, + ends.states, nceil * sizeof(nstates[0])); + if (nstates == NULL) { goto fail; } + ends.ceil = nceil; + ends.states = nstates; + } + ends.states[ends.used++] = s_i; + } + } + + if (ends.used == 0) { + fsm_free(fsm); /* no ends, just discard */ + continue; + } + + /* When combining these, remove self-edges from any states on the FSMs to be + * combined that also have eager output IDs. We are about to add an epsilon edge + * from each to a shared state that won't have eager output IDs. + * + * Eager output matching should be idempotent, so carrying it to other reachable + * state is redundant, and it leads to a combinatorial explosion that blows up the + * state count while determinising the combined FSM otherwise. + * + * For example, if /aaa/, /bbb/, and /ccc/ are combined into a DFA that repeats + * the sub-patterns (like `^.*(?:(aaa)|(bbb)|(ccc))+.*$`), the self-edge at each + * eager output state would combine with every reachable state from then on, + * leading to a copy of the whole reachable subgraph colored by every + * combination of eager output IDs: aaa, bbb, ccc, aaa+bbb, aaa+ccc, + * bbb+ccc, aaa+bbb+ccc. Instead of three relatively separate subgraphs + * that set the eager output at their last state, one for each pattern, + * it leads to 8 (2**3) subgraph clusters because it encodes _each + * distinct combination_ in the DFA. This becomes incredibly expensive + * as the combined pattern count increases; it's essentially what I'm + * trying to avoid by adding eager output support in the first place. + * + * FIXME: instead of actively removing these, filter in fsm_determinise? 
*/ + if (fsm_eager_output_has_eager_output(fsm)) { + /* for any state that has eager outputs and a self edge, + * remove the self edge before further linkage */ + for (fsm_state_t s = 0; s < fsm->statecount; s++) { + if (!fsm_eager_output_has_any(fsm, s, NULL)) { continue; } + struct edge_set *edges = fsm->states[s].edges; + struct edge_set *new = edge_set_new(); + + struct edge_group_iter iter; + struct edge_group_iter_info info; + edge_set_group_iter_reset(edges, EDGE_GROUP_ITER_ALL, &iter); + while (edge_set_group_iter_next(&iter, &info)) { + if (info.to != s) { + if (!edge_set_add_bulk(&new, fsm->alloc, + info.symbols, info.to)) { + goto fail; + } + } + } + edge_set_free(fsm->alloc, edges); + fsm->states[s].edges = new; + } + } + + /* call fsm_merge; we really don't care which is which */ + struct fsm_combine_info combine_info; + struct fsm *merged = fsm_merge(res, fsm, &combine_info); + if (merged == NULL) { goto fail; } + + /* update offsets if res had its state IDs shifted forward */ + global_start += combine_info.base_a; + global_start_loop += combine_info.base_a; + global_start_anchored += combine_info.base_a;; + global_end += combine_info.base_a; + global_end_loop += combine_info.base_a; + + /* also update offsets for the FSM's states */ + fsm_start += combine_info.base_b; + for (size_t i = 0; i < ends.used; i++) { + ends.states[i] += combine_info.base_b; + } + + if (bases != NULL) { + bases[fsm_i].state = combine_info.base_b; + bases[fsm_i].capture = combine_info.capture_base_b; + } + + if (log) { + fprintf(stderr, "%s: fsm[%zd].start: %d\n", __func__, fsm_i, fsm_start); + for (size_t i = 0; i < ends.used; i++) { + fprintf(stderr, "%s: fsm[%zd].ends[%zd]: %d\n", __func__, fsm_i, i, ends.states[i]); + } + } + + /* link to the FSM's start state */ + const fsm_state_t start_src = entries[fsm_i].anchored_start ? 
global_start_anchored : global_start_loop; + if (!fsm_addedge_epsilon(merged, start_src, fsm_start)) { goto fail; } + if (log) { + fprintf(stderr, "%s: linking %s %d to fsm[%zd]'s start %d (anchored? %d)\n", + __func__, + entries[fsm_i].anchored_start ? "global_start_anchored" : "global_start_loop", + start_src, fsm_i, fsm_start, entries[fsm_i].anchored_start); + } + + /* link from the FSM's ends */ + const fsm_state_t end_dst = entries[fsm_i].anchored_end ? global_end : global_end_loop; + for (size_t i = 0; i < ends.used; i++) { + if (log) { + fprintf(stderr, "%s: linking fsm[%zd]'s end[%zd] %d (anchored? %d) to %s %d\n", + __func__, fsm_i, i, ends.states[i], entries[fsm_i].anchored_end, + entries[fsm_i].anchored_end ? "global_end" : "global_end_loop", + end_dst); + } + if (!fsm_addedge_epsilon(merged, ends.states[i], end_dst)) { goto fail; } + } + + res = merged; + } + + /* Link from the global_end_loop to the global_start_loop, so patterns with an + * unanchored start can follow other patterns with an unanchored end. */ + if (log) { + fprintf(stderr, "%s: g_start %d, g_start_loop %d, g_start_anchored %d, g_end_loop %d, g_end %d (after all merging)\n", + __func__, global_start, global_start_loop, global_start_anchored, global_end_loop, global_end); + fprintf(stderr, "%s: linking global_end_loop %d to global_start_loop %d\n", + __func__, global_end_loop, global_start_loop); + fprintf(stderr, "%s: setting global_start %d and end %d\n", __func__, global_start, global_end); + } + if (!fsm_addedge_epsilon(res, global_end_loop, global_start_loop)) { goto fail; } + + /* This needs to be set after merging, because that clears the start state. 
*/ + fsm_setstart(res, global_start); + fsm_setend(res, global_end, 1); + + f_free(alloc, ends.states); + return res; + +fail: + f_free(alloc, ends.states); + return NULL; +} diff --git a/src/libre/libre.syms b/src/libre/libre.syms index a4f1a223b..9d381cb0f 100644 --- a/src/libre/libre.syms +++ b/src/libre/libre.syms @@ -3,6 +3,7 @@ re_is_literal re_flags re_strerror re_perror +re_is_anchored ast_print ast_print_dot diff --git a/src/libre/re.c b/src/libre/re.c index 15af848b5..c19183dcc 100644 --- a/src/libre/re.c +++ b/src/libre/re.c @@ -335,3 +335,37 @@ re_is_literal(enum re_dialect dialect, int (*getc)(void *opaque), void *opaque, return -1; } +/* FIXME: placeholder interface */ +int +re_is_anchored(enum re_dialect dialect, re_getchar_fun *getc, void *opaque, + enum re_flags flags, struct re_err *err, + struct re_anchoring_info *info) +{ + /* FIXME: copy/pasted from above, factor out common */ + + struct ast *ast; + const struct dialect *m; + int unsatisfiable; + + assert(getc != NULL); + assert(info != NULL); + + m = re_dialect(dialect); + if (m == NULL) { + if (err != NULL) { err->e = RE_EBADDIALECT; } + return 0; + } + + flags |= m->flags; + + ast = re_parse(dialect, getc, opaque, flags, err, &unsatisfiable); + if (ast == NULL) { + return 0; + } + + info->start = (ast->expr->flags & AST_FLAG_ANCHORED_START) != 0; + info->end = (ast->expr->flags & AST_FLAG_ANCHORED_END) != 0; + + ast_free(ast); + return 1; +} diff --git a/tests/eager_output/Makefile b/tests/eager_output/Makefile new file mode 100644 index 000000000..a650bf802 --- /dev/null +++ b/tests/eager_output/Makefile @@ -0,0 +1,22 @@ +.include "../../share/mk/top.mk" + +TEST.tests/eager_output != ls -1 tests/eager_output/eager_output*.c +TEST_SRCDIR.tests/eager_output = tests/eager_output +TEST_OUTDIR.tests/eager_output = ${BUILD}/tests/eager_output + +.for n in ${TEST.tests/eager_output:T:R:C/^eager_output//} +INCDIR.${TEST_SRCDIR.tests/eager_output}/eager_output${n}.c += src/adt +.endfor + +SRC += 
${TEST_SRCDIR.tests/eager_output}/utils.c + +.for n in ${TEST.tests/eager_output:T:R:C/^eager_output//} +test:: ${TEST_OUTDIR.tests/eager_output}/res${n} +SRC += ${TEST_SRCDIR.tests/eager_output}/eager_output${n}.c +CFLAGS.${TEST_SRCDIR.tests/eager_output}/eager_output${n}.c += -UNDEBUG + +${TEST_OUTDIR.tests/eager_output}/run${n}: ${TEST_OUTDIR.tests/eager_output}/eager_output${n}.o ${TEST_OUTDIR.tests/eager_output}/utils.o ${BUILD}/lib/libfsm.a ${BUILD}/lib/libre.a + ${CC} ${CFLAGS} ${CFLAGS.${TEST_SRCDIR.tests/eager_output}/eager_output${n}.c} -o ${TEST_OUTDIR.tests/eager_output}/run${n} ${TEST_OUTDIR.tests/eager_output}/eager_output${n}.o ${TEST_OUTDIR.tests/eager_output}/utils.o ${BUILD}/lib/libfsm.a ${BUILD}/lib/libre.a +${TEST_OUTDIR.tests/eager_output}/res${n}: ${TEST_OUTDIR.tests/eager_output}/run${n} + ( ${TEST_OUTDIR.tests/eager_output}/run${n} 1>&2 && echo PASS || echo FAIL ) > ${TEST_OUTDIR.tests/eager_output}/res${n} +.endfor diff --git a/tests/eager_output/eager_output1.c b/tests/eager_output/eager_output1.c new file mode 100644 index 000000000..f20ef77b7 --- /dev/null +++ b/tests/eager_output/eager_output1.c @@ -0,0 +1,12 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { "abc" }, + .inputs = { + { .input = "abc", .expected_ids = { 1 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output2.c b/tests/eager_output/eager_output2.c new file mode 100644 index 000000000..cdac204e2 --- /dev/null +++ b/tests/eager_output/eager_output2.c @@ -0,0 +1,17 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { "ab(c|d|e)" }, + .inputs = { + { .input = "abc", .expected_ids = { 1 } }, + { .input = "abd", .expected_ids = { 1 } }, + { .input = "abe", .expected_ids = { 1 } }, + { .input = "Xabe", .expected_ids = { 1 } }, + { .input = "abeX", .expected_ids = { 1 } }, + { .input = "XabeX", .expected_ids = { 1 } }, + }, + }; + return 
run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output3.c b/tests/eager_output/eager_output3.c new file mode 100644 index 000000000..c11bc58a4 --- /dev/null +++ b/tests/eager_output/eager_output3.c @@ -0,0 +1,16 @@ +#include "utils.h" + +/* test that eager endids are correctly propagated through fsm_determinise() and fsm_minimise() */ +int main(void) +{ + struct eager_output_test test = { + .patterns = { "ab(c|d|e)?" }, + .inputs = { + { .input = "ab", .expected_ids = { 1 } }, + { .input = "abc", .expected_ids = { 1 } }, + { .input = "abd", .expected_ids = { 1 } }, + { .input = "abe", .expected_ids = { 1 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output4.c b/tests/eager_output/eager_output4.c new file mode 100644 index 000000000..47cd32029 --- /dev/null +++ b/tests/eager_output/eager_output4.c @@ -0,0 +1,13 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { "abcde$" }, + .inputs = { + { .input = "abcde", .expected_ids = { 1 } }, + { .input = "Xabcde", .expected_ids = { 1 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output5.c b/tests/eager_output/eager_output5.c new file mode 100644 index 000000000..4551c68b1 --- /dev/null +++ b/tests/eager_output/eager_output5.c @@ -0,0 +1,14 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { "^abc$", "^ab*c$" }, + .inputs = { + { .input = "ac", .expected_ids = { 2 } }, + { .input = "abc", .expected_ids = { 1, 2 } }, + { .input = "abbc", .expected_ids = { 2 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output6.c b/tests/eager_output/eager_output6.c new file mode 100644 index 000000000..5431d0981 --- /dev/null +++ b/tests/eager_output/eager_output6.c @@ -0,0 +1,34 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { + "apple", 
+ "banana", + "carrot", + "durian", + "eggplant", + "fig", + "grapefruit", + "hazelnut", + "iceberg lettuce", + "jicama", + }, + .inputs = { + { .input = "apple", .expected_ids = { 1 } }, + { .input = "banana", .expected_ids = { 2 } }, + { .input = "carrot", .expected_ids = { 3 } }, + { .input = "durian", .expected_ids = { 4 } }, + { .input = "eggplant", .expected_ids = { 5 } }, + { .input = "fig", .expected_ids = { 6 } }, + { .input = "grapefruit", .expected_ids = { 7 } }, + { .input = "hazelnut", .expected_ids = { 8 } }, + { .input = "iceberg lettuce", .expected_ids = { 9 } }, + { .input = "jicama", .expected_ids = { 10 } }, + { .input = "apple banana carrot", .expected_ids = { 1, 2, 3 } }, + }, + }; + + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output7.c b/tests/eager_output/eager_output7.c new file mode 100644 index 000000000..3d123878b --- /dev/null +++ b/tests/eager_output/eager_output7.c @@ -0,0 +1,103 @@ +#include "utils.h" + +int main(void) +{ + /* Run this test with env FORCE_ENDIDS=N ... to see how much more + * expensive it is to combine the first N patterns using endids, + * rather than eager_outputs. It becomes VERY slow for >= 9 or so. + * (Note that the checks probably will not pass for N < 4, because + * it will start skipping appear in the early test inputs.) 
*/ + bool force_endids = false; + size_t force_endid_count = 0; + { + const char *str = getenv("FORCE_ENDIDS"); + if (str != NULL) { + force_endid_count = atoi(str); + if (force_endid_count == 0) { + force_endid_count = 26; + } + force_endids = true; + } + } + + struct eager_output_test test = { + .patterns = { + [0] = "apple", + [1] = "banana", + [2] = "carrot", + [3] = "durian", + [4] = "eggplant", + [5] = "fig", + [6] = "grapefruit", + [7] = "hazelnut", + [8] = "iceberg lettuce", + [9] = "jicama", + [10] = "kiwano", + [11] = "lemon", + [12] = "mango", + [13] = "nectarine", + [14] = "orange", + [15] = "plum", + [16] = "quince", + [17] = "radish", + [18] = "strawberry", + [19] = "turnip", + [20] = "ube", + [21] = "vanilla", + [22] = "watermelon", + [23] = "xigua watermelon", + [24] = "yam", + [25] = "zucchini", + }, + .inputs = { + /* Note: expected IDs are shifted by 1, it's 0-terminated. */ + { .input = "apple", .expected_ids = { 1 } }, + { .input = "banana", .expected_ids = { 2 } }, + { .input = "carrot", .expected_ids = { 3 } }, + { .input = "apple banana", .expected_ids = { 1, 2 } }, + { .input = "carrot durian apple", .expected_ids = { 1, 3, 4 } }, + { .input = "carrot fig apple", .expected_ids = { 1, 3, 6 } }, + + /* leading characters and an incomplete trailing match */ + { .input = "mumble mumble fig hazelnut banana xigua watermelo", .expected_ids = { 2, 6, 8 } }, + + /* redundant matches */ + { .input = "ube ube ube ube ube", .expected_ids = { 21 } }, + + /* everything */ + { .input = + "apple banana carrot durian eggplant fig grapefruit " + "hazelnut iceberg lettuce jicamaa kiwano lemon mango " + "nectarine orange plum quince radish strawberry " + "turnip ube vanilla watermelon xigua watermelon yam zucchini", + .expected_ids = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + }, + }, + /* everything, only spaces appearing in patterns */ + { .input = + "applebananacarrotdurianeggplantfiggrapefruit" + 
"hazelnuticeberg lettucejicamaakiwanolemonmango" + "nectarineorangeplumquinceradishstrawberry" + "turnipubevanillawatermelonxigua watermelonyamzucchini", + .expected_ids = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + }, + }, + }, + }; + + /* truncate patterns to the first N */ + if (force_endids) { + assert(force_endid_count > 0 && force_endid_count <= 26); + test.patterns[force_endid_count] = NULL; + + /* truncate test inputs to just the first couple, since + * later inputs use later patterns */ + test.inputs[5].input = NULL; + } + + return run_test(&test, false, force_endids); +} diff --git a/tests/eager_output/eager_output_at_start.c b/tests/eager_output/eager_output_at_start.c new file mode 100644 index 000000000..407aa4e77 --- /dev/null +++ b/tests/eager_output/eager_output_at_start.c @@ -0,0 +1,12 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { "" }, + .inputs = { + { .input = "", .expected_ids = { 1 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output_fr1.c b/tests/eager_output/eager_output_fr1.c new file mode 100644 index 000000000..e8e5f3395 --- /dev/null +++ b/tests/eager_output/eager_output_fr1.c @@ -0,0 +1,13 @@ +#include "utils.h" + +/* Fuzzer regresison */ +int main(void) +{ + struct eager_output_test test = { + .patterns = { "ab", "" }, + .inputs = { + { .input = "ab", .expected_ids = { 1, 2 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output_fr2.c b/tests/eager_output/eager_output_fr2.c new file mode 100644 index 000000000..404e98644 --- /dev/null +++ b/tests/eager_output/eager_output_fr2.c @@ -0,0 +1,13 @@ +#include "utils.h" + +/* Fuzzer regresison */ +int main(void) +{ + struct eager_output_test test = { + .patterns = { "", "" }, + .inputs = { + { .input = "", .expected_ids = { 1, 2 } }, + }, + }; + return run_test(&test, false, false); +} 
diff --git a/tests/eager_output/eager_output_fr3.c b/tests/eager_output/eager_output_fr3.c new file mode 100644 index 000000000..c7e4127a6 --- /dev/null +++ b/tests/eager_output/eager_output_fr3.c @@ -0,0 +1,13 @@ +#include "utils.h" + +/* Fuzzer regresison */ +int main(void) +{ + struct eager_output_test test = { + .patterns = { "^", "" }, + .inputs = { + { .input = "", .expected_ids = { 1, 2 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output_mixed_anchored_unanchored.c b/tests/eager_output/eager_output_mixed_anchored_unanchored.c new file mode 100644 index 000000000..a586f9840 --- /dev/null +++ b/tests/eager_output/eager_output_mixed_anchored_unanchored.c @@ -0,0 +1,46 @@ +#include "utils.h" + +int main(void) +{ + /* fprintf(stderr, "%s: skipping for now, this doesn't pass yet.\n", __FILE__); */ + /* return EXIT_SUCCESS; */ + + struct eager_output_test test = { + .patterns = { + "^abc$", + "def", + "^ghi", + "jkl$", + "mno", + }, + .inputs = { + { .input = "abc", .expected_ids = { 1 } }, + { .input = "def", .expected_ids = { 2 } }, + { .input = "ghi", .expected_ids = { 3 } }, + { .input = "jkl", .expected_ids = { 4 } }, + { .input = "mno", .expected_ids = { 5 } }, + + { .input = "defmno", .expected_ids = { 2, 5 } }, + { .input = " def mno ", .expected_ids = { 2, 5 } }, + + /* Matching a start-anchored pattern followed by + * unanchored ones should just work. */ + { .input = "ghi def", .expected_ids = { 2, 3 } }, + + /* An unanchored pattern before a start-anchored pattern + * should only match the unanchored pattern. */ + { .input = "def ghi", .expected_ids = { 2 } }, + + /* Matching an unanchored pattern before an + * end-anchored one is fine. */ + { .input = "mno jkl", .expected_ids = { 4, 5 } }, + + /* This should match "mno" with the "jkl" prefix + * ignored by the unanchored start, which does + * not count as a match for "jkl$". 
*/ + { .input = "jkl mno", .expected_ids = { 5 } }, + }, + }; + + return run_test(&test, false, false); +} diff --git a/tests/eager_output/utils.c b/tests/eager_output/utils.c new file mode 100644 index 000000000..4bee8d848 --- /dev/null +++ b/tests/eager_output/utils.c @@ -0,0 +1,278 @@ +#include "utils.h" + +void +fsm_eager_output_dump(FILE *f, const struct fsm *fsm); + +void +fsm_endid_dump(FILE *f, const struct fsm *fsm); + +void +append_eager_output_cb(fsm_output_id_t id, void *opaque) +{ + struct cb_info *info = (struct cb_info *)opaque; + assert(info->used < MAX_IDS); + + for (size_t i = 0; i < info->used; i++) { + if (info->ids[i] == id) { + return; /* already present */ + } + } + + info->ids[info->used++] = id; +} + +int +cmp_output(const void *pa, const void *pb) +{ + const fsm_output_id_t a = *(fsm_output_id_t *)pa; + const fsm_output_id_t b = *(fsm_output_id_t *)pb; + return a < b ? -1 : a > b ? 1 : 0; +} + +struct fsm_options print_options = { + .consolidate_edges = 1, + .comments = 0, + .group_edges = 1, +}; + +void +dump(const struct fsm *fsm) +{ + fsm_print(stderr, fsm, + &print_options, NULL, FSM_PRINT_DOT); +} + +int +run_test(const struct eager_output_test *test, bool allow_extra_outputs, bool force_endids) +{ + struct fsm_union_entry entries[MAX_PATTERNS] = {0}; + + allow_extra_outputs = false; + + size_t fsms_used = 0; + int ret = 0; + + int log = 0; + { + const char *logstr = getenv("LOG"); + if (logstr != NULL) { + if (logstr[0] == 'y') { /* make "y" or "yes" non-zero */ + logstr = "1"; + } + log = atoi(logstr); + } + } + + for (size_t i = 0; i < MAX_PATTERNS; i++) { + const char *p = test->patterns[i]; + if (test->patterns[i] == NULL) { break; } + const size_t len = strlen(p); + struct fsm_union_entry *e = &entries[fsms_used]; + + /* For sake of these patterns, they are anchored if the first/last + * character is '^' and '$', respectively. This is too simplistic + * for the general case, though. 
*/ + if (len > 0) { + if (p[0] == '^') { e->anchored_start = true; } + if (p[len - 1] == '$') { e->anchored_end = true; } + /* fprintf(stderr, "%s: p[%zd]: '%s', start %d, end %d\n", */ + /* __func__, fsms_used, p, e->anchored_start, e->anchored_end); */ + } + + struct fsm *fsm = re_comp(RE_PCRE, fsm_sgetc, &p, NULL, 0, NULL); + assert(fsm != NULL); + + /* Zero is used to terminate expected_ids, so don't use it here. */ + const fsm_output_id_t output_id = (fsm_output_id_t) (i + 1); + const fsm_end_id_t end_id = (fsm_end_id_t) (i + 1); + + /* Set either an end ID or an eager output ID, depending on + * whether the fsm is anchored at the end or not. */ + if (e->anchored_end || force_endids) { + ret = fsm_setendid(fsm, end_id); + } else { + ret = fsm_seteageroutputonends(fsm, output_id); + } + assert(ret == 1); + + if (log) { + fprintf(stderr, "==== source DFA %zd (pre det+min)\n", i); + if (log > 1) { dump(fsm); } + fsm_eager_output_dump(stderr, fsm); + fsm_endid_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } + + ret = fsm_determinise(fsm); + assert(ret == 1); + + if (log) { + fprintf(stderr, "==== source DFA %zd (post det)\n", i); + if (log > 1) { dump(fsm); } + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } + + ret = fsm_minimise(fsm); + assert(ret == 1); + + if (log) { + fprintf(stderr, "==== source DFA %zd (post det+min)\n", i); + if (log > 1) { dump(fsm); } + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } + + e->fsm = fsm; + fsms_used++; + } + + /* If there's only one pattern this just returns fsms[0]. */ + struct fsm *fsm = fsm_union_repeated_pattern_group(fsms_used, entries, NULL); + assert(fsm != NULL); + + if (log) { + fprintf(stderr, "==== combined (pre det+min)\n"); + if (log > 1) { dump(fsm); } + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "--- endids:\n"); + fsm_endid_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } + + if (log) { + fprintf(stderr, "=== determinising combined... 
NFA has %u states\n", fsm_countstates(fsm)); + } + ret = fsm_determinise(fsm); + assert(ret == 1); + if (log) { + fprintf(stderr, "=== determinising combined...done, DFA has %u states\n", fsm_countstates(fsm)); + } + + if (log) { + fprintf(stderr, "==== combined (post det)\n"); + if (log > 1) { dump(fsm); } + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } + + ret = fsm_minimise(fsm); + if (log) { + fprintf(stderr, "=== minimised combined...done, DFA has %u states\n", fsm_countstates(fsm)); + } + assert(ret == 1); + + if (log) { + fprintf(stderr, "==== combined (post det+min)\n"); + if (log > 1) { dump(fsm); } + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "--- endids:\n"); + fsm_endid_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } + + struct cb_info outputs = { 0 }; + fsm_eager_output_set_cb(fsm, append_eager_output_cb, &outputs); + + for (size_t i_i = 0; i_i < MAX_INPUTS; i_i++) { + outputs.used = 0; + const char *input = test->inputs[i_i].input; + if (input == NULL) { break; } + + size_t expected_id_count = 0; + for (size_t id_i = 0; id_i < MAX_ENDIDS; id_i++) { + const fsm_output_id_t id = test->inputs[i_i].expected_ids[id_i]; + if (id == 0) { break; } + expected_id_count++; + + /* must be ascending */ + if (id_i > 0) { + assert(id > test->inputs[i_i].expected_ids[id_i - 1]); + } + } + + if (log) { + fprintf(stderr, "%s: input %zd: \"%s\", expecting %zd ids:", + __func__, i_i, input, expected_id_count); + for (size_t i = 0; i < expected_id_count; i++) { + fprintf(stderr, " %d", test->inputs[i_i].expected_ids[i]); + } + } + + if (test->inputs[i_i].expect_fail) { + expected_id_count = 0; + } + + fsm_state_t end; /* only set on match */ + ret = fsm_exec(fsm, fsm_sgetc, &input, &end, NULL); + + if (ret == 1) { +#define ENDID_BUF_SIZE 32 + fsm_end_id_t endid_buf[ENDID_BUF_SIZE] = {0}; + const size_t endid_count = fsm_endid_count(fsm, end); + /* fprintf(stderr, "%s: endid_count %zd for state %d\n", __func__, endid_count, end); */ + 
assert(endid_count < ENDID_BUF_SIZE); + if (!fsm_endid_get(fsm, end, /*ENDID_BUF_SIZE*/ endid_count, endid_buf)) { + assert(!"fsm_endid_get failed"); + } + + /* Copy endid outputs into outputs.ids[], since for testing + * purposes we don't care about the difference between eager + * output and endids here -- the values don't overlap. */ + assert(outputs.used + endid_count <= MAX_IDS); + for (size_t endid_i = 0; endid_i < endid_count; endid_i++) { + if (log) { + fprintf(stderr, "-- adding endid %zd: %d\n", endid_i, endid_buf[endid_i]); + } + outputs.ids[outputs.used++] = (fsm_output_id_t)endid_buf[endid_i]; + } + } + + if (ret == 0) { + /* if it didn't match, ignore the eager output IDs. this should + * eventually happen internal to fsm_exec or codegen. */ + outputs.used = 0; + } + + /* NEXT match IDs, sort outputs[] buffer first */ + qsort(outputs.ids, outputs.used, sizeof(outputs.ids[0]), cmp_output); + + if (log) { + fprintf(stderr, "-- got %zd:", outputs.used); + for (size_t i = 0; i < outputs.used; i++) { + fprintf(stderr, " %d", outputs.ids[i]); + } + fprintf(stderr, "\n"); + } + + if (expected_id_count == 0) { + assert(ret == 0 || outputs.used == 0); /* no match */ + continue; + } else { + assert(ret == 1); + } + + if (!allow_extra_outputs) { + assert(outputs.used == expected_id_count); + } else { + assert(outputs.used >= expected_id_count); + } + + size_t floor = 0; + for (size_t exp_i = 0; exp_i < outputs.used; exp_i++) { + bool found = false; + for (size_t got_i = floor; got_i < outputs.used; got_i++) { + if (outputs.ids[got_i] == test->inputs[i_i].expected_ids[exp_i]) { + floor = got_i + 1; + found = true; + break; + } + } + assert(found); + } + } + + fsm_free(fsm); + + return EXIT_SUCCESS;; +} diff --git a/tests/eager_output/utils.h b/tests/eager_output/utils.h new file mode 100644 index 000000000..672c01977 --- /dev/null +++ b/tests/eager_output/utils.h @@ -0,0 +1,64 @@ +#ifndef UTILS_H +#define UTILS_H + +#include +#include +#include +#include 
+#include + +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#define MAX_IDS 32 + +#include + +#include + +#define MAX_PATTERNS 150 +#define MAX_INPUTS 64 +#define MAX_ENDIDS 32 + +struct eager_output_test { + const char *patterns[MAX_PATTERNS]; + + struct { + const char *input; + bool expect_fail; + /* Terminated by 0. pattern[i] => id of i+1. Must be sorted. */ + fsm_output_id_t expected_ids[MAX_ENDIDS]; + } inputs[MAX_INPUTS]; +}; + +void +append_eager_output_cb(fsm_output_id_t id, void *opaque); + +int +cmp_output(const void *pa, const void *pb); + +int +run_test(const struct eager_output_test *test, bool allow_extra_outputs, bool force_endids); + +struct cb_info { + size_t used; + fsm_end_id_t ids[MAX_IDS]; +}; + +void +dump(const struct fsm *fsm); + +void +append_eager_output_cb(fsm_end_id_t id, void *opaque); + +#endif