Skip to content

Commit

Permalink
Refactor further to expose the simpler escape search possible
Browse files Browse the repository at this point in the history
  • Loading branch information
byroot committed Feb 1, 2025
1 parent 61cda86 commit e03515a
Showing 1 changed file with 143 additions and 97 deletions.
240 changes: 143 additions & 97 deletions ext/json/ext/generator/generator.c
Original file line number Diff line number Diff line change
Expand Up @@ -103,20 +103,20 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
/* Bit masks for the per-byte escape table entries defined in this file.
 * Table entries are either 0, a small count (1-6), or 9, i.e.
 * ESCAPE_MASK | 1. */

/* NOTE(review): presumably selects the low three bits of a table entry, which
 * appear to hold the byte length of the character — confirm at the use sites. */
static const unsigned char CHAR_LENGTH_MASK = 7;
/* Flag bit tested on table entries by the escape search
 * (`ch_len & ESCAPE_MASK`). */
static const unsigned char ESCAPE_MASK = 8;

/* Lookup table indexed by an unsigned byte value.
 *
 * Only the first 128 entries are listed; the remaining ones (0x80-0xFF) are
 * implicitly zero-initialized. An entry of 9 (ESCAPE_MASK | 1) marks the
 * ASCII control characters 0x00-0x1F, the double quote and the backslash.
 * Every other byte maps to 0.
 */
static const unsigned char escape_table[256] = {
// ASCII Control Characters
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
// ASCII Characters
0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
/* Scan state shared by the escape-search and escape-writing helpers. */
typedef struct _search_state {
/* Current scan position; advanced by the search and escape helpers. */
const char *ptr;
/* Scanning stops once ptr reaches this address. */
const char *end;
/* Start of the bytes in [cursor, ptr) that have not yet been appended to
 * buffer; search_flush() appends them and moves cursor up to ptr. */
const char *cursor;
/* Destination the output bytes are appended to. */
FBuffer *buffer;
} search_state;

static const unsigned char ascii_only_escape_table[256] = {
/* Appends the not-yet-written bytes in [search->cursor, search->ptr) to
 * search->buffer, then moves the cursor up to the current scan position. */
static inline void search_flush(search_state *search)
{
    const char *pending = search->cursor;

    fbuffer_append(search->buffer, pending, search->ptr - pending);
    search->cursor = search->ptr;
}

static const unsigned char escape_table_basic[256] = {
// ASCII Control Characters
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
Expand All @@ -127,20 +127,105 @@ static const unsigned char ascii_only_escape_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// Continuation byte
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// First byte of a 2-byte code point
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
// First byte of a 3-byte code point
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
//First byte of a 4+ byte code point
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
};

/* Advances search->ptr until it reaches a byte whose escape_table_basic entry
 * is non-zero, or until search->end.
 *
 * In both cases the bytes scanned so far are flushed to the buffer before
 * returning. Returns 1 when search->ptr is left on a byte that has a non-zero
 * table entry, and 0 when the end of the input was reached.
 */
static inline unsigned char search_escape_basic(search_state *search)
{
    unsigned char found = 0;

    for (; search->ptr < search->end; search->ptr++) {
        const unsigned char byte = (const unsigned char)*search->ptr;

        if (RB_UNLIKELY(escape_table_basic[byte])) {
            /* Leave ptr on the flagged byte: break skips the increment. */
            found = 1;
            break;
        }
    }

    search_flush(search);
    return found;
}

/* Appends the JSON escape for the single byte at search->ptr, then steps over
 * that byte.
 *
 * The double quote, backslash, forward slash, \b, \f, \n, \r and \t get their
 * two-character escape; any other byte is written as \u00XX using lowercase
 * hexadecimal digits. On return search->ptr and search->cursor both point at
 * the byte following the escaped one.
 */
static inline void escape_UTF8_char_basic(search_state *search) {
    static const char hex_digits[] = "0123456789abcdef";
    const unsigned char byte = (unsigned char)*search->ptr;
    char unicode_escape[6] = { '\\', 'u', '0', '0', 0, 0 };
    const char *escaped = unicode_escape;
    unsigned long escaped_len = 2;

    switch (byte) {
      case '"':  escaped = "\\\""; break;
      case '\\': escaped = "\\\\"; break;
      case '/':  escaped = "\\/";  break;
      case '\b': escaped = "\\b";  break;
      case '\f': escaped = "\\f";  break;
      case '\n': escaped = "\\n";  break;
      case '\r': escaped = "\\r";  break;
      case '\t': escaped = "\\t";  break;
      default:
        /* No short form: fill in the two hex digits of \u00XX. */
        unicode_escape[4] = hex_digits[(byte >> 4) & 0xf];
        unicode_escape[5] = hex_digits[byte & 0xf];
        escaped_len = 6;
        break;
    }

    fbuffer_append(search->buffer, escaped, escaped_len);

    /* Skip the source byte so it is not also copied verbatim by a later flush. */
    search->cursor = ++search->ptr;
}

/* Converts in_string to a JSON string (without the wrapping '"'
* characters) in FBuffer out_buffer.
*
* Characters are JSON-escaped according to:
*
* - Always: ASCII control characters (0x00-0x1F), dquote, and
* backslash.
*
* - If out_ascii_only: non-ASCII characters (>0x7F)
*
* - If script_safe: forwardslash (/), line separator (U+2028), and
* paragraph separator (U+2029)
*
* Everything else (should be UTF-8) is just passed through and
* appended to the result.
*/
static inline void convert_UTF8_to_JSON(search_state *search)
{
    for (;;) {
        /* search_escape_basic() flushes the bytes it scanned and returns 0
         * once the end of the input has been reached. */
        if (!search_escape_basic(search)) {
            return;
        }

        /* search->ptr is on a byte flagged by the table: write its escape
         * and resume scanning right after it. */
        escape_UTF8_char_basic(search);
    }
}

/* Appends the JSON escape for the character at search->ptr, then skips the
 * ch_len bytes it occupies.
 *
 *   - ch_len == 1: the byte is written as a two-character escape
 *     (\" \\ \/ \b \f \n \r \t) or, for any other value, as \u00XX with
 *     lowercase hexadecimal digits.
 *   - ch_len == 3: \u2029 is written when the lowest bit of the third byte
 *     is set, \u2028 otherwise.
 *   - any other ch_len: nothing is appended; the bytes are only skipped.
 *
 * On return search->ptr and search->cursor both point just past the character.
 */
static inline void escape_UTF8_char(search_state *search, unsigned char ch_len) {
    if (ch_len == 1) {
        static const char hex_digits[] = "0123456789abcdef";
        const unsigned char byte = (unsigned char)*search->ptr;
        char unicode_escape[6] = { '\\', 'u', '0', '0', 0, 0 };
        const char *escaped = unicode_escape;
        unsigned long escaped_len = 2;

        switch (byte) {
          case '"':  escaped = "\\\""; break;
          case '\\': escaped = "\\\\"; break;
          case '/':  escaped = "\\/";  break;
          case '\b': escaped = "\\b";  break;
          case '\f': escaped = "\\f";  break;
          case '\n': escaped = "\\n";  break;
          case '\r': escaped = "\\r";  break;
          case '\t': escaped = "\\t";  break;
          default:
            /* No short form: fill in the two hex digits of \u00XX. */
            unicode_escape[4] = hex_digits[(byte >> 4) & 0xf];
            unicode_escape[5] = hex_digits[byte & 0xf];
            escaped_len = 6;
            break;
        }

        fbuffer_append(search->buffer, escaped, escaped_len);
    } else if (ch_len == 3) {
        /* The lowest bit of the third byte picks between the two escapes. */
        const char *separator = (search->ptr[2] & 1) ? "\\u2029" : "\\u2028";

        fbuffer_append(search->buffer, separator, 6);
    }

    search->ptr += ch_len;
    search->cursor = search->ptr;
}

static const unsigned char script_safe_escape_table[256] = {
// ASCII Control Characters
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
Expand All @@ -166,25 +251,11 @@ static const unsigned char script_safe_escape_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
};


/* Scan state shared by the escape-search and escape-writing helpers. */
typedef struct _search_state {
/* Current scan position; advanced by the search and escape helpers. */
const char *ptr;
/* Scanning stops once ptr reaches this address. */
const char *end;
/* Start of the bytes in [cursor, ptr) that have not yet been appended to
 * buffer; search_flush() appends them and moves cursor up to ptr. */
const char *cursor;
/* Destination the output bytes are appended to. */
FBuffer *buffer;
} search_state;

/* Appends the not-yet-written bytes in [search->cursor, search->ptr) to
 * search->buffer, then moves the cursor up to the current scan position. */
static inline void search_flush(search_state *search)
{
    const char *pending = search->cursor;

    fbuffer_append(search->buffer, pending, search->ptr - pending);
    search->cursor = search->ptr;
}

static inline unsigned char search_escape(search_state *search, const unsigned char escape_table[256])
static inline unsigned char search_script_safe_escape(search_state *search)
{
while (search->ptr < search->end) {
unsigned char ch = (unsigned char)*search->ptr;
unsigned char ch_len = escape_table[ch];
unsigned char ch_len = script_safe_escape_table[ch];

if (RB_UNLIKELY(ch_len)) {
if (ch_len & ESCAPE_MASK) {
Expand All @@ -208,66 +279,39 @@ static inline unsigned char search_escape(search_state *search, const unsigned c
return 0;
}

/* Appends the JSON escape for the character at search->ptr, then skips the
 * ch_len bytes it occupies.
 *
 *   - ch_len == 1: the byte is written as a two-character escape
 *     (\" \\ \/ \b \f \n \r \t) or, for any other value, as \u00XX with
 *     lowercase hexadecimal digits.
 *   - ch_len == 3: \u2029 is written when the lowest bit of the third byte
 *     is set, \u2028 otherwise.
 *   - any other ch_len: nothing is appended; the bytes are only skipped.
 *
 * On return search->ptr and search->cursor both point just past the character.
 */
static inline void fast_escape_UTF8_char(search_state *search, unsigned char ch_len) {
    if (ch_len == 1) {
        static const char hex_digits[] = "0123456789abcdef";
        const unsigned char byte = (unsigned char)*search->ptr;
        char unicode_escape[6] = { '\\', 'u', '0', '0', 0, 0 };
        const char *escaped = unicode_escape;
        unsigned long escaped_len = 2;

        switch (byte) {
          case '"':  escaped = "\\\""; break;
          case '\\': escaped = "\\\\"; break;
          case '/':  escaped = "\\/";  break;
          case '\b': escaped = "\\b";  break;
          case '\f': escaped = "\\f";  break;
          case '\n': escaped = "\\n";  break;
          case '\r': escaped = "\\r";  break;
          case '\t': escaped = "\\t";  break;
          default:
            /* No short form: fill in the two hex digits of \u00XX. */
            unicode_escape[4] = hex_digits[(byte >> 4) & 0xf];
            unicode_escape[5] = hex_digits[byte & 0xf];
            escaped_len = 6;
            break;
        }

        fbuffer_append(search->buffer, escaped, escaped_len);
    } else if (ch_len == 3) {
        /* The lowest bit of the third byte picks between the two escapes. */
        const char *separator = (search->ptr[2] & 1) ? "\\u2029" : "\\u2028";

        fbuffer_append(search->buffer, separator, 6);
    }

    search->ptr += ch_len;
    search->cursor = search->ptr;
}

/* Converts in_string to a JSON string (without the wrapping '"'
* characters) in FBuffer out_buffer.
*
* Characters are JSON-escaped according to:
*
* - Always: ASCII control characters (0x00-0x1F), dquote, and
* backslash.
*
* - If out_ascii_only: non-ASCII characters (>0x7F)
*
* - If script_safe: forwardslash (/), line separator (U+2028), and
* paragraph separator (U+2029)
*
* Everything else (should be UTF-8) is just passed through and
* appended to the result.
*/
static inline void convert_UTF8_to_JSON(search_state *search, const unsigned char escape_table[256])
static void convert_UTF8_to_script_safe_JSON(search_state *search)
{
unsigned char ch_len;
while ((ch_len = search_escape(search, escape_table))) {
fast_escape_UTF8_char(search, ch_len);
while ((ch_len = search_script_safe_escape(search))) {
escape_UTF8_char(search, ch_len);
}
}

/* Lookup table indexed by an unsigned byte value, with all 256 entries listed.
 *
 * ASCII range (0x00-0x7F): 9 (ESCAPE_MASK | 1) for the control characters,
 * the double quote and the backslash; 0 for every other byte.
 *
 * Non-ASCII range (0x80-0xFF): every entry is non-zero. Continuation bytes
 * (0x80-0xBF) hold 1; lead bytes hold 2 (0xC0-0xDF), 3 (0xE0-0xEF),
 * 4 (0xF0-0xF7), 5 (0xF8-0xFB) or 6 (0xFC-0xFD); 0xFE and 0xFF hold 9.
 *
 * NOTE(review): the low three bits presumably encode the byte length of the
 * sequence (see CHAR_LENGTH_MASK) — confirm against the search function that
 * consumes this table.
 */
static const unsigned char ascii_only_escape_table[256] = {
// ASCII Control Characters
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
// ASCII Characters
0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// Continuation byte
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// First byte of a 2-byte code point
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
// First byte of a 3-byte code point
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
// First byte of a 4+ byte code point
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
};

static inline unsigned char search_ascii_only_escape(search_state *search, const unsigned char escape_table[256])
{
while (search->ptr < search->end) {
Expand Down Expand Up @@ -934,8 +978,10 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat
case ENC_CODERANGE_VALID:
if (RB_UNLIKELY(state->ascii_only)) {
convert_UTF8_to_ASCII_only_JSON(&search, state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
} else if (RB_UNLIKELY(state->script_safe)) {
convert_UTF8_to_script_safe_JSON(&search);
} else {
convert_UTF8_to_JSON(&search, state->script_safe ? script_safe_escape_table : escape_table);
convert_UTF8_to_JSON(&search);
}
break;
default:
Expand Down

0 comments on commit e03515a

Please sign in to comment.