Skip to content

Commit

Permalink
Use bit twiddling to speed up JSON generation.
Browse files Browse the repository at this point in the history
  • Loading branch information
samyron committed Jan 29, 2025
1 parent 39ef430 commit f75aa8c
Show file tree
Hide file tree
Showing 2 changed files with 165 additions and 47 deletions.
176 changes: 129 additions & 47 deletions ext/json/ext/generator/generator.c
Original file line number Diff line number Diff line change
Expand Up @@ -192,61 +192,143 @@ static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const un
unsigned long beg = 0, pos = 0;

/*
 * FLUSH_POS(bytes): append the pending, not-yet-copied span ptr[beg, pos) to
 * out_buffer (if it is non-empty), then step over `bytes` input bytes and
 * restart the pending span at the new position.
 *
 * Relies on `out_buffer`, `ptr`, `beg` and `pos` being in scope at the
 * expansion site. Wrapped in do { } while (0) so that it expands to exactly
 * one statement and stays correct under an unbraced if/else.
 */
#define FLUSH_POS(bytes) do { \
    if (pos > beg) { \
        fbuffer_append(out_buffer, &ptr[beg], pos - beg); \
    } \
    pos += (bytes); \
    beg = pos; \
} while (0)
/*
 * PROCESS_BYTE: handle the character that starts at ptr[pos].
 *
 * Relies on these names being in scope at the expansion site: `ch` (the byte
 * at ptr[pos]), `ch_len` (its escape_table entry), `pos`, `beg`, `ptr`,
 * `out_buffer`, `scratch` and `hexdig`.
 *
 *   ch_len == 0   plain byte: advance pos by one; it stays in the pending span.
 *   ch_len == 9   single byte that must be escaped: flush the pending span,
 *                 then emit either a two-character escape or a six-byte
 *                 sequence built in `scratch` ("00" plus two hex digits are
 *                 written to scratch[2..5]).
 *                 NOTE(review): scratch[0..1] are presumably pre-filled with
 *                 "\\u" by the caller -- confirm.
 *   ch_len == 11  lead byte whose following bytes 0x80 0xA8 / 0x80 0xA9 are
 *                 emitted as \u2028 / \u2029; anything else is treated as a
 *                 three-byte character and skipped via the default case.
 *                 NOTE(review): ptr[pos + 1] and ptr[pos + 2] are read without
 *                 a check against len; presumably the input is already known
 *                 to be well-formed -- confirm.
 *   otherwise     advance pos by ch_len without flushing.
 *
 * Wrapped in do { } while (0) so that `PROCESS_BYTE;` is exactly one statement
 * and is safe under an unbraced if/else. Every `break` below targets one of
 * the macro's own switch statements, never an enclosing loop.
 */
#define PROCESS_BYTE do { \
    if (RB_UNLIKELY(ch_len)) { \
        switch (ch_len) { \
            case 9: { \
                FLUSH_POS(1); \
                switch (ch) { \
                    case '"':  fbuffer_append(out_buffer, "\\\"", 2); break; \
                    case '\\': fbuffer_append(out_buffer, "\\\\", 2); break; \
                    case '/':  fbuffer_append(out_buffer, "\\/", 2); break; \
                    case '\b': fbuffer_append(out_buffer, "\\b", 2); break; \
                    case '\f': fbuffer_append(out_buffer, "\\f", 2); break; \
                    case '\n': fbuffer_append(out_buffer, "\\n", 2); break; \
                    case '\r': fbuffer_append(out_buffer, "\\r", 2); break; \
                    case '\t': fbuffer_append(out_buffer, "\\t", 2); break; \
                    default: { \
                        scratch[2] = '0'; \
                        scratch[3] = '0'; \
                        scratch[4] = hexdig[(ch >> 4) & 0xf]; \
                        scratch[5] = hexdig[ch & 0xf]; \
                        fbuffer_append(out_buffer, scratch, 6); \
                        break; \
                    } \
                } \
                break; \
            } \
            case 11: { \
                unsigned char b2 = ptr[pos + 1]; \
                if (RB_UNLIKELY(b2 == 0x80)) { \
                    unsigned char b3 = ptr[pos + 2]; \
                    if (b3 == 0xA8) { \
                        FLUSH_POS(3); \
                        fbuffer_append(out_buffer, "\\u2028", 6); \
                        break; \
                    } else if (b3 == 0xA9) { \
                        FLUSH_POS(3); \
                        fbuffer_append(out_buffer, "\\u2029", 6); \
                        break; \
                    } \
                } \
                ch_len = 3; \
            } \
            /* fallthrough */ \
            default: \
                pos += ch_len; \
                break; \
        } \
    } else { \
        pos++; \
    } \
} while (0)

while (pos < len) {
unsigned char ch = ptr[pos];
unsigned char ch_len = escape_table[ch];
/* JSON encoding */
if (escape_table != script_safe_escape_table) {
// Taken from: https://github.com/ruby/ruby/blob/96a5da67864a15eea7b79e552c7684ddd182f76c/string.c#L671-L748
// TODO revisit for the compiler version check

// These macros from https://graphics.stanford.edu/~seander/bithacks.html
/*
 * Word-at-a-time helpers, selected by the width of uintptr_t.
 *
 *   hasless(x, n)  non-zero when some byte of the word x is less than n
 *                  (per the bithacks reference; valid for 0 <= n <= 128).
 *   haszero(v)     non-zero when some byte of the word v is zero.
 *   MASK_*         one byte value repeated in every byte of the word. XOR-ing
 *                  a chunk with a mask turns each matching byte into zero, so
 *                  haszero() on the result reports whether that byte occurs.
 *
 * NOTE(review): 0x22 is the ASCII double quote, but 0x5c is the ASCII
 * backslash ('\\'), not a forward slash (0x2f) -- MASK_FORWARDSLASH looks like
 * a misnomer; consider renaming it together with its users.
 * NOTE(review): only hasless/haszero are #undef'd later in the visible code;
 * the MASK_* macros appear to stay defined -- confirm that is intended.
 */
# if SIZEOF_UINTPTR_T == 8
#define hasless(x,n) (((x)-~0ULL/255*(n))&~(x)&~0ULL/255*128)
#define haszero(v) (((v) - 0x0101010101010101ULL) & ~(v) & 0x8080808080808080ULL)
#define MASK_DOUBLEQUOTE 0x2222222222222222ULL
#define MASK_FORWARDSLASH 0x5c5c5c5c5c5c5c5cULL
# elif SIZEOF_UINTPTR_T == 4
#define hasless(x,n) (((x)-~0UL/255*(n))&~(x)&~0UL/255*128)
#define haszero(v) (((v) - 0x01010101UL) & ~(v) & 0x80808080UL)
#define MASK_DOUBLEQUOTE 0x22222222UL
#define MASK_FORWARDSLASH 0x5c5c5c5cUL
# else
/* Only 4- and 8-byte uintptr_t are supported by the masks above. */
# error "don't know what to do."
#endif

if (RB_UNLIKELY(ch_len)) {
switch (ch_len) {
case 9: {
FLUSH_POS(1);
switch (ch) {
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
default: {
scratch[2] = '0';
scratch[3] = '0';
scratch[4] = hexdig[(ch >> 4) & 0xf];
scratch[5] = hexdig[ch & 0xf];
fbuffer_append(out_buffer, scratch, 6);
break;
}
if ((pos + SIZEOF_UINTPTR_T*2) < len) {
/*
* Align the pointer to a sizeof(uintptr_t)-byte boundary.
* TODO: Use the same alignment technique as https://github.com/ruby/ruby/blob/96a5da67864a15eea7b79e552c7684ddd182f76c/string.c#L671-L748
*/
char *char_ptr;
for (char_ptr = (char *) ptr; pos < len && (uintptr_t) char_ptr % SIZEOF_UINTPTR_T != 0;) {
unsigned long start = pos;
char ch = *char_ptr;
unsigned char ch_len = escape_table[(uint8_t) ch];
PROCESS_BYTE;
// PROCESS_BYTE might process more than one byte. Ensure we increment char_ptr appropriately.
char_ptr += (pos - start);
}

uintptr_t *lp = (uintptr_t *) char_ptr;

while (pos + SIZEOF_UINTPTR_T < len) {
uintptr_t chunk = *lp;

uintptr_t has_less_than_0x20 = hasless(chunk, ' ');

/*
* This is effectively a memchr(3). Look for both a double quote (0x22)
* and a backslash (0x5c).
*/
uintptr_t tmp1 = chunk ^ MASK_DOUBLEQUOTE;
uintptr_t haszero1 = haszero(tmp1);

uintptr_t tmp2 = chunk ^ MASK_FORWARDSLASH;
uintptr_t haszero2 = haszero(tmp2);

if ((has_less_than_0x20 | haszero1 | haszero2) != 0) {
for(size_t i=0; i<sizeof(uintptr_t); i++) {
unsigned char ch = ptr[pos];
unsigned char ch_len = escape_table[ch];
PROCESS_BYTE;
}
break;
}
case 11: {
unsigned char b2 = ptr[pos + 1];
if (RB_UNLIKELY(b2 == 0x80)) {
unsigned char b3 = ptr[pos + 2];
if (b3 == 0xA8) {
FLUSH_POS(3);
fbuffer_append(out_buffer, "\\u2028", 6);
break;
} else if (b3 == 0xA9) {
FLUSH_POS(3);
fbuffer_append(out_buffer, "\\u2029", 6);
break;
}
/*
* Realign lp as the previous loop may have left us in a position that's
* no longer aligned on a sizeof(uintptr_t) boundary.
*/
for (char_ptr = (char *) ptr+pos; pos < len && (uintptr_t) char_ptr % SIZEOF_UINTPTR_T != 0;) {
unsigned long start = pos;
char ch = *char_ptr;
unsigned char ch_len = escape_table[(uint8_t) ch];
PROCESS_BYTE;
// This might process more than one byte. Ensure we increment char_ptr appropriately.
char_ptr += (pos - start);
}
ch_len = 3;
// fallthrough
lp = (uintptr_t *) char_ptr;
} else {
lp++;
pos += SIZEOF_UINTPTR_T;
continue;
}
default:
pos += ch_len;
break;
}
} else {
pos++;
}
}

#undef hasless
#undef haszero

while (pos < len) {
unsigned char ch = ptr[pos];
unsigned char ch_len = escape_table[ch];
/* JSON encoding */

PROCESS_BYTE;
}
#undef FLUSH_POS

if (beg < len) {
Expand Down
36 changes: 36 additions & 0 deletions test/json/json_generator_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,10 @@ def test_backslash
json = '["/"]'
assert_equal json, generate(data)
#
data = [ '////////////////////////////////////////////////////////////////////////////////////' ]
json = '["////////////////////////////////////////////////////////////////////////////////////"]'
assert_equal json, generate(data)
#
data = [ '/' ]
json = '["\/"]'
assert_equal json, generate(data, :script_safe => true)
Expand All @@ -455,13 +459,45 @@ def test_backslash
json = '["\""]'
assert_equal json, generate(data)
#
data = [ '///////////' ]
json = '["\/\/\/\/\/\/\/\/\/\/\/"]'
assert_equal json, generate(data, :script_safe => true)
#
data = [ '///////////////////////////////////////////////////////' ]
json = '["\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/"]'
assert_equal json, generate(data, :script_safe => true)
#
data = ["'"]
json = '["\\\'"]'
assert_equal '["\'"]', generate(data)
#
data = ["倩", "瀨"]
json = '["倩","瀨"]'
assert_equal json, generate(data, script_safe: true)
#
data = '["This is a "test" of the emergency broadcast system."]'
json = "\"[\\\"This is a \\\"test\\\" of the emergency broadcast system.\\\"]\""
assert_equal json, generate(data)
#
data = '\tThis is a test of the emergency broadcast system.'
json = "\"\\\\tThis is a test of the emergency broadcast system.\""
assert_equal json, generate(data)
#
data = 'This\tis a test of the emergency broadcast system.'
json = "\"This\\\\tis a test of the emergency broadcast system.\""
assert_equal json, generate(data)
#
data = 'This is\ta test of the emergency broadcast system.'
json = "\"This is\\\\ta test of the emergency broadcast system.\""
assert_equal json, generate(data)
#
data = 'This is a test of the emergency broadcast\tsystem.'
json = "\"This is a test of the emergency broadcast\\\\tsystem.\""
assert_equal json, generate(data)
#
data = 'This is a test of the emergency broadcast\tsystem.\n'
json = "\"This is a test of the emergency broadcast\\\\tsystem.\\\\n\""
assert_equal json, generate(data)
end

def test_string_subclass
Expand Down

0 comments on commit f75aa8c

Please sign in to comment.