Skip to content

Commit

Permalink
Experimental initial support for CDATA output mode.
Browse files Browse the repository at this point in the history
This is an alternative for -lc and -lvmc that avoids very expensive
compilation when the resulting C output is quite large. For this
mode, most of the output is C data literals (a couple of struct tables),
followed by a very small (~50 loc) interpreter for the data. This is
much faster to compile -- for a data set I'm working with now, it's
30 seconds to build compared to several hours and/or gcc exhausting
memory.

Generating output with comments enabled will include inline comments
about the format, along with per-state comments showing labels,
endids, and eager outputs. It will only generate code for endids
and eager outputs if the DFA has them.

This is experimental. I expect the interfaces will change a bit in
the near future, and I am still working on performance tuning.

There is some code to detect and reuse repeated runs of IDs in the
output tables, but there is a bug leading to them not being
terminated properly (possibly causing false positives), so it's
currently disabled.

To see a good example of the format, with comments, run:
    build/bin/re -rpcre -lcdata -u '^abc'
  • Loading branch information
silentbicycle committed Oct 10, 2024
1 parent 8fd728e commit 98e9cb4
Show file tree
Hide file tree
Showing 12 changed files with 1,311 additions and 10 deletions.
2 changes: 2 additions & 0 deletions include/fsm/print.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ enum fsm_print_lang {
FSM_PRINT_VMC, /* ISO C90 code, VM style */
FSM_PRINT_VMDOT, /* Graphviz Dot format, showing VM opcodes */

FSM_PRINT_CDATA, /* C data tables and small interpreter */

FSM_PRINT_VMOPS_C, /* VM opcodes as a datastructure */
FSM_PRINT_VMOPS_H,
FSM_PRINT_VMOPS_MAIN
Expand Down
6 changes: 5 additions & 1 deletion include/fsm/walk.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ fsm_walk_edges(const struct fsm *fsm, void *opaque,
* functionally equivalent cases makes testing dramatically faster,
* but exploring every edge could be added later.
*
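* If seed is zero then it will generate the first label in the label
* set, otherwise a label from the set will be chosen using rand().
* A printable character is always chosen when the set contains one;
* failing that, a non-printable character other than 0x00 is chosen,
* and 0x00 is only used when it is the sole member of the set.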
*
* Note: fsm is non-const because it calls fsm_trim on the FSM
* internally. This records the shortest distance from each state to an
* end state, which is used to prune branches that would not produce
Expand All @@ -114,7 +118,7 @@ fsm_generate_matches_cb(const struct fsm *fsm,
const char *input, size_t input_length,
fsm_state_t end_state, void *opaque);
int
fsm_generate_matches(struct fsm *fsm, size_t max_length,
fsm_generate_matches(struct fsm *fsm, size_t max_length, unsigned seed,
fsm_generate_matches_cb *cb, void *opaque);

/* Callback provided for the most basic use case for
Expand Down
2 changes: 1 addition & 1 deletion src/fsm/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -770,7 +770,7 @@ main(int argc, char *argv[])
}

if (generate_bounds > 0) {
r = fsm_generate_matches(fsm, generate_bounds, fsm_generate_cb_printf_escaped, &opt);
r = fsm_generate_matches(fsm, generate_bounds, 0, fsm_generate_cb_printf_escaped, &opt);
}

fsm_free(fsm);
Expand Down
65 changes: 60 additions & 5 deletions src/libfsm/gen.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ struct gen_ctx {
fsm_generate_matches_cb *cb;

bool done;
bool randomized;

size_t buf_ceil;
size_t buf_used;
Expand Down Expand Up @@ -139,7 +140,7 @@ static bool
grow_stack(struct gen_ctx *ctx);

int
fsm_generate_matches(struct fsm *fsm, size_t max_length,
fsm_generate_matches(struct fsm *fsm, size_t max_length, unsigned seed,
fsm_generate_matches_cb *cb, void *opaque)
{
if (max_length == 0) {
Expand All @@ -153,7 +154,7 @@ fsm_generate_matches(struct fsm *fsm, size_t max_length,

INIT_TIMERS();
TIME(&pre);
int res = gen_init_outer(fsm, max_length, cb, opaque, false, 0);
int res = gen_init_outer(fsm, max_length, cb, opaque, seed != 0, seed);
TIME(&post);

DIFF_MSEC("fsm_generate_matches", pre, post, NULL);
Expand Down Expand Up @@ -212,8 +213,9 @@ gen_init_outer(struct fsm *fsm, size_t max_length,

assert(fsm_all(fsm, fsm_isdfa)); /* DFA-only */

assert(!randomized); /* not yet supported */
(void)seed;
if (randomized) {
srand(seed);
}

#if LOG_GEN > 1
fprintf(stderr, "%s: %u states\n", __func__, fsm_countstates(fsm));
Expand All @@ -228,6 +230,7 @@ gen_init_outer(struct fsm *fsm, size_t max_length,
.max_length = max_length,
.cb = cb,
.opaque = opaque,
.randomized = randomized,
};

if (!gen_init(&ctx, fsm)) {
Expand Down Expand Up @@ -528,6 +531,55 @@ first_symbol(const uint64_t *symbols)
return 0;
}

/* Pick one member of a 256-entry symbol set, using rand().
 *
 * The set is read as four 64-bit words, where bit (c & 63) of
 * word (c / 64) is set when byte value c is a member.
 *
 * Selection is by strict priority:
 *  1. a printable character (according to isprint), if any;
 *  2. otherwise a non-printable character other than 0x00;
 *  3. otherwise 0x00 -- it is the last resort because a NUL byte
 *     will truncate the string.
 *
 * rand() is called exactly once for cases 1 and 2, and not at all
 * for case 3. An empty set trips an assertion. */
static unsigned char
random_symbol(const uint64_t *symbols)
{
	enum { WORD_BITS = 64, WORD_COUNT = 4 };

	/* Candidates are collected in ascending byte order. */
	unsigned char printable[256];
	unsigned char unprintable[256];
	size_t printable_used = 0;
	size_t unprintable_used = 0;
	bool saw_nul = false;

	for (unsigned word_id = 0; word_id < WORD_COUNT; word_id++) {
		const uint64_t word = symbols[word_id];
		if (word == 0) {
			continue; /* nothing set in this run of 64 symbols */
		}

		for (unsigned bit = 0; bit < WORD_BITS; bit++) {
			if ((word & ((uint64_t)1 << bit)) == 0) {
				continue;
			}

			const unsigned c = word_id * WORD_BITS + bit;
			if (c == 0) {
				saw_nul = true;
			} else if (isprint(c)) {
				printable[printable_used] = (unsigned char)c;
				printable_used++;
			} else {
				unprintable[unprintable_used] = (unsigned char)c;
				unprintable_used++;
			}
		}
	}

	if (printable_used > 0) {
		return printable[rand() % printable_used];
	}

	if (unprintable_used > 0) {
		return unprintable[rand() % unprintable_used];
	}

	/* Only 0x00 can remain at this point; anything else is an empty set. */
	if (!saw_nul) {
		assert(!"empty set");
	}
	return 0;
}

#if DUMP_EDGES
static void
dump_edges(fsm_state_t state, struct edge_set *edges)
Expand All @@ -542,6 +594,7 @@ dump_edges(fsm_state_t state, struct edge_set *edges)
size_t i = 0;
while (edge_set_group_iter_next(&ei, &eg)) {
const unsigned char symbol = first_symbol(eg.symbols);
const unsigned char symbol = random_symbol(eg.symbols);
fprintf(stderr, "%s: %d -- %zu/%zu -- 0x%02x (%c) -> %d\n",
__func__, state, i, count,
symbol, isprint(symbol) ? symbol : '.', eg.to);
Expand Down Expand Up @@ -589,7 +642,9 @@ sfs_step_edges(struct gen_ctx *ctx, struct gen_stack_frame *sf)
struct edge_group_iter_info eg;

if (iter_next_transition(ctx, sf, &eg)) {
const unsigned char symbol = first_symbol(eg.symbols);
const unsigned char symbol = ctx->randomized
? random_symbol(eg.symbols)
: first_symbol(eg.symbols);
const fsm_state_t state = eg.to;

LOG(2, "sfs_step_edges: got edge 0x%x ('%c')\n",
Expand Down
2 changes: 2 additions & 0 deletions src/libfsm/print.c
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,8 @@ fsm_print(FILE *f, const struct fsm *fsm,
case FSM_PRINT_VMC: print_vm = fsm_print_vmc; break;
case FSM_PRINT_VMDOT: print_vm = fsm_print_vmdot; break;

case FSM_PRINT_CDATA: print_ir = fsm_print_cdata; break;

case FSM_PRINT_VMOPS_C: print_vm = fsm_print_vmops_c; break;
case FSM_PRINT_VMOPS_H: print_vm = fsm_print_vmops_h; break;
case FSM_PRINT_VMOPS_MAIN: print_vm = fsm_print_vmops_main; break;
Expand Down
1 change: 1 addition & 0 deletions src/libfsm/print.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ vm_print_f fsm_print_llvm;
vm_print_f fsm_print_rust;
vm_print_f fsm_print_sh;
vm_print_f fsm_print_vmc;
ir_print_f fsm_print_cdata;

vm_print_f fsm_print_vmdot;
vm_print_f fsm_print_vmops_c;
Expand Down
1 change: 1 addition & 0 deletions src/libfsm/print/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ SRC += src/libfsm/print/irdot.c
SRC += src/libfsm/print/irjson.c
SRC += src/libfsm/print/json.c
SRC += src/libfsm/print/llvm.c
SRC += src/libfsm/print/cdata.c
SRC += src/libfsm/print/rust.c
SRC += src/libfsm/print/sh.c
SRC += src/libfsm/print/vmasm.c
Expand Down
Loading

0 comments on commit 98e9cb4

Please sign in to comment.