Skip to content

Commit

Permalink
tests/refactor: state update and improved JSON parsing (#11)
Browse files Browse the repository at this point in the history
* add: `reddit_response.json`

* refactor tests + add failing case

* easier fix

* test: parse to key

* tests: key parsing

* bug: `next_end_of_kv` on read `:`

* fix: `end_of_kv` bug

* test: find value

* tests: `inside_value` and `inside_value_to_exit`

* test: parse to NEXT key

* parses JSON with two string keys

* WIP: value inside value

* comment

* refactor (#10)

* wip: start with bitmask

* WIP: time to start testing

* tests: `ArrayAdd` and `ArrayMul`

* tests passing

* update comments

* feat: 2 key depth 1 json

* 2 kv json and all tests passing

* nested json works!!!

* reduce constraints

* cleanup

* rename variables

* more cleaning

* more cleanup

* make comments clean

* WAYLON NITPICKING ME LOL
  • Loading branch information
Autoparallel authored Aug 15, 2024
1 parent 274d15a commit ed2c440
Show file tree
Hide file tree
Showing 14 changed files with 648 additions and 107 deletions.
24 changes: 24 additions & 0 deletions circuits.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,30 @@
21
]
},
"test_extract_two_key": {
"file": "extract",
"template": "Extract",
"params": [
4,
40
]
},
"test_extract_depth": {
"file": "extract",
"template": "Extract",
"params": [
4,
64
]
},
"test_extract_sambhav": {
"file": "extract",
"template": "Extract",
"params": [
4,
105
]
},
"test_extract_hard": {
"file": "extract",
"template": "Extract",
Expand Down
35 changes: 14 additions & 21 deletions circuits/extract.circom
Original file line number Diff line number Diff line change
Expand Up @@ -24,35 +24,28 @@ template Extract(KEY_BYTES, DATA_BYTES) {
// Initialize the parser
component State[DATA_BYTES];
State[0] = StateUpdate();
State[0].byte <== data[0];
State[0].tree_depth <== 0;
State[0].parsing_to_key <== 1; // Initialize by saying we are parsing to the first key
State[0].inside_key <== 0;
State[0].parsing_to_value <== 0;
State[0].inside_value <== 0;
State[0].escaping <== 0;
State[0].end_of_kv <== 0;
State[0].byte <== data[0];
State[0].tree_depth <== 0;
State[0].parsing_key <== 0;
State[0].inside_key <== 0;
State[0].parsing_value <== 0;
State[0].inside_value <== 0;

for(var data_pointer = 1; data_pointer < DATA_BYTES; data_pointer++) {
State[data_pointer] = StateUpdate();
State[data_pointer].byte <== data[data_pointer];
State[data_pointer].tree_depth <== State[data_pointer - 1].next_tree_depth;
State[data_pointer].parsing_to_key <== State[data_pointer - 1].next_parsing_to_key;
State[data_pointer].inside_key <== State[data_pointer - 1].next_inside_key;
State[data_pointer].parsing_to_value <== State[data_pointer - 1].next_parsing_to_value;
State[data_pointer].inside_value <== State[data_pointer - 1].next_inside_value;
State[data_pointer].end_of_kv <== State[data_pointer - 1].next_end_of_kv;
// TODO: For the next state, we should use `next_`, this is only to make this compile for now.
State[data_pointer].escaping <== State[data_pointer - 1].escaping;

State[data_pointer].byte <== data[data_pointer];
State[data_pointer].tree_depth <== State[data_pointer - 1].next_tree_depth;
State[data_pointer].parsing_key <== State[data_pointer - 1].next_parsing_key;
State[data_pointer].inside_key <== State[data_pointer - 1].next_inside_key;
State[data_pointer].parsing_value <== State[data_pointer - 1].next_parsing_value;
State[data_pointer].inside_value <== State[data_pointer - 1].next_inside_value;

// Debugging
log("State[", data_pointer, "].tree_depth", "= ", State[data_pointer].tree_depth);
log("State[", data_pointer, "].parsing_to_key", "= ", State[data_pointer].parsing_to_key);
log("State[", data_pointer, "].parsing_key", "= ", State[data_pointer].parsing_key);
log("State[", data_pointer, "].inside_key", "= ", State[data_pointer].inside_key);
log("State[", data_pointer, "].parsing_to_value", "= ", State[data_pointer].parsing_to_value);
log("State[", data_pointer, "].parsing_value", "= ", State[data_pointer].parsing_value);
log("State[", data_pointer, "].inside_value", "= ", State[data_pointer].inside_value);
log("State[", data_pointer, "].end_of_kv", "= ", State[data_pointer].end_of_kv);
log("---");
}

Expand Down
20 changes: 20 additions & 0 deletions circuits/operators.circom
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,23 @@ template Contains(n) {
// Apply `not` to this by 1-x
out <== 1 - someEqual.out;
}

// Elementwise addition of two signal arrays of length `n`.
//   lhs, rhs : input arrays
//   out      : out[k] is constrained to equal lhs[k] + rhs[k] for every k in [0, n)
template ArrayAdd(n) {
    signal input  lhs[n];
    signal input  rhs[n];
    signal output out[n];

    // One assignment-plus-constraint per array slot.
    for (var k = 0; k < n; k++) {
        out[k] <== lhs[k] + rhs[k];
    }
}

// Elementwise (Hadamard) product of two signal arrays of length `n`.
//   lhs, rhs : input arrays
//   out      : out[k] is constrained to equal lhs[k] * rhs[k] for every k in [0, n)
template ArrayMul(n) {
    signal input  lhs[n];
    signal input  rhs[n];
    signal output out[n];

    // One assignment-plus-constraint per array slot.
    for (var k = 0; k < n; k++) {
        out[k] <== lhs[k] * rhs[k];
    }
}
171 changes: 105 additions & 66 deletions circuits/parser.circom
Original file line number Diff line number Diff line change
Expand Up @@ -22,40 +22,30 @@ State[20]| " | COMPLETE WITH KV PARSING
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
State[20].next_tree_depth == 0 | VALID JSON
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Notes:
- If there is no comma after leaving a value, then we should not be parsing to key. If anything breaks here, JSON was bad.
*/

/*
TODO
*/
template StateUpdate() {
signal input byte;

signal input tree_depth; // STATUS_INDICATOR -- how deep in a JSON branch we are, e.g., `user.balance.value` key should be at depth `3`.
// Should always be greater than or equal to `0` (TODO: implement this constraint).
signal input byte;

signal input parsing_to_key; // BIT_FLAG -- whether we are currently parsing bytes until we find the next key (mutally exclusive with `inside_key` and both `*_value flags).
signal input inside_key; // BIT_FLAG -- whether we are currently inside a key (mutually exclusive with `parsing_to_key` and both `*_value` flags).

signal input parsing_to_value; // BIT_FLAG -- whether we are currently parsing bytes until we find the next value (mutually exclusive with `inside_value` and both `*_key` flags).
signal input inside_value; // BIT_FLAG -- whether we are currently inside a value (mutually exclusive with `parsing_to_value` and both `*_key` flags).

signal input escaping; // BIT_FLAG -- whether we have hit an escape ASCII symbol inside of a key or value.
signal input tree_depth; // STATUS_INDICATOR -- how deep in a JSON branch we are, e.g., `user.balance.value` key should be at depth `3`.
// constrained to be greater than or equal to `0`.
signal input parsing_key; // BIT_FLAG -- whether we are currently parsing bytes until we find the next key (mutally exclusive with `inside_key` and both `*_value flags).
signal input inside_key; // BIT_FLAG -- whether we are currently inside a key (mutually exclusive with `parsing_key` and both `*_value` flags).
signal input parsing_value; // BIT_FLAG -- whether we are currently parsing bytes until we find the next value (mutually exclusive with `inside_value` and both `*_key` flags).
signal input inside_value; // BIT_FLAG -- whether we are currently inside a value (mutually exclusive with `parsing_value` and both `*_key` flags).

signal input end_of_kv; // BIT_FLAG -- reached end of key-value sequence, looking for comma delimiter or end of file signified by `tree_depth == 0`.
signal output next_tree_depth; // STATUS_INDICATOR -- next state for `tree_depth`.
signal output next_parsing_key; // BIT_FLAG -- next state for `parsing_key`.
signal output next_inside_key; // BIT_FLAG -- next state for `inside_key`.
signal output next_parsing_value; // BIT_FLAG -- next state for `parsing_value`.
signal output next_inside_value; // BIT_FLAG -- next state for `inside_value`.

signal output next_tree_depth; // BIT_FLAG -- next state for `tree_depth`.
signal output next_parsing_to_key; // BIT_FLAG -- next state for `parsing_to_key`.
signal output next_inside_key; // BIT_FLAG -- next state for `inside_key`.
signal output next_parsing_to_value; // BIT_FLAG -- next state for `parsing_to_value`.
signal output next_inside_value; // BIT_FLAG -- next state for `inside_value`.
signal output next_end_of_kv; // BIT_FLAG -- next state for `end_of_kv`.

// signal output escaping; // TODO: Add this in!
// TODO: Add this in!
// signal input escaping; // BIT_FLAG -- whether we have hit an escape ASCII symbol inside of a key or value.
// signal output escaping;

//--------------------------------------------------------------------------------------------//
//-Delimeters---------------------------------------------------------------------------------//
Expand Down Expand Up @@ -85,51 +75,67 @@ template StateUpdate() {
var escape = 92;
//--------------------------------------------------------------------------------------------//

//--------------------------------------------------------------------------------------------//
//-MACHINE INSTRUCTIONS-----------------------------------------------------------------------//
// TODO: ADD CASE FOR `is_number` for in range 48-57 https://www.ascii-code.com since a value may just be a number
// Output management
component matcher = Switch(8, 3);
var do_nothing[3] = [ 0, 0, 0]; // Command returned by switch if we want to do nothing, e.g. read a whitespace char while looking for a key
var increase_depth[3] = [ 1, 0, 0]; // Command returned by switch if we hit a start brace `{`
var decrease_depth[3] = [-1, 0, 0]; // Command returned by switch if we hit a end brace `}`
var hit_quote[3] = [ 0, 1, 0]; // Command returned by switch if we hit a quote `"`
var hit_colon[3] = [ 0, 0, 1]; // Command returned by switch if we hit a colon `:`

matcher.branches <== [start_brace, end_brace, quote, colon, start_bracket, end_bracket, comma, escape ];
matcher.vals <== [increase_depth, decrease_depth, hit_quote, hit_colon, do_nothing, do_nothing, do_nothing, do_nothing];
matcher.case <== byte;


// TODO: These could likely go into a switch statement with the output of the `Switch` above.
// TODO: Also could probably clean up things with de Morgan's laws or whatever.
// An `IF ELSE` template would also be handy!
next_inside_key <== inside_key + (parsing_to_key - inside_key) * matcher.out[1]; // IF (`parsing_to_key` AND `hit_quote`) THEN `next_inside_key <== 1` ELSEIF (`inside_key` AND `hit_quote`) THEN `next_inside_key <== 0`
// - note: can rewrite as -> `inside_key * (1-matcher.out[1]) + parsing_to_key * matcher.out[1]`, but this will not be quadratic (according to circom)
next_parsing_to_key <== parsing_to_key * (1 - matcher.out[1]); // IF (`parsing_to_key` AND `hit_quote`) THEN `parsing_to_key <== 0`

next_inside_value <== inside_value + (parsing_to_value - inside_value) * matcher.out[1]; // IF (`parsing_to_value` AND `hit_quote`) THEN `next_inside_value <== 1` ELSEIF (`inside_value` AND `hit_quote`) THEN `next_inside_value <==0`
// -note: can rewrite as -> `(1 - inside_value) * matcher_out[1] + parsing_to_value * matcher.out[1]
//--------------------------------------------------------------------------------------------//
//-Instructions for ASCII---------------------------------------------------------------------//
var state[5] = [tree_depth, parsing_key, inside_key, parsing_value, inside_value];
var do_nothing[5] = [ 0, 0, 0, 0, 0 ]; // Command returned by switch if we want to do nothing, e.g. read a whitespace char while looking for a key
var hit_start_brace[5] = [ 1, 1, 0, -1, 0 ]; // Command returned by switch if we hit a start brace `{`
var hit_end_brace[5] = [-1, 0, 0, 0, 0 ]; // Command returned by switch if we hit a end brace `}`
var hit_quote[5] = [ 0, 0, 1, 0, 1 ]; // Command returned by switch if we hit a quote `"`
var hit_colon[5] = [ 0, -1, 0, 1, 0 ]; // Command returned by switch if we hit a colon `:`
var hit_comma[5] = [ 0, 1, 0, -1, 0 ]; // Command returned by switch if we hit a comma `,`
//--------------------------------------------------------------------------------------------//

signal NOT_PARSING_TO_KEY_AND_NOT_INSIDE_KEY <== (1 - parsing_to_key) * (1 - inside_key); // (NOT `parsing_to_key`) AND (NOT `inside_key`)
signal PARSING_TO_VALUE_AND_NOT_HIT_QUOTE <== parsing_to_value * (1 - matcher.out[1]); // `parsing_to_value` AND (NOT `hit_quote`)
next_parsing_to_value <== PARSING_TO_VALUE_AND_NOT_HIT_QUOTE + NOT_PARSING_TO_KEY_AND_NOT_INSIDE_KEY * matcher.out[2]; // IF (`parsing_to_value` AND (NOT `hit_quote`)) THEN `next_parsing_to_value <== 1 ELSEIF ((NOT `parsing_to_value` AND (NOT `inside_value)) AND `hit_colon`) THEN `next_parsing_to_value <== 1`

signal NOT_PARSING_TO_VALUE_AND_NOT_INSIDE_VALUE <== (1 - parsing_to_value) * (1 - inside_value); // (NOT `parsing_to_value`) AND (NOT `inside_value`)
next_end_of_kv <== NOT_PARSING_TO_KEY_AND_NOT_INSIDE_KEY * NOT_PARSING_TO_VALUE_AND_NOT_INSIDE_VALUE; // IF ((NOT `parsing_to_key`) AND (NOT `inside_key`)) AND (NOT(`parsing_to_value`) AND NOT( `inside_value)) THEN `next_end_of_kv <== 1`


// TODO: Assert this never goes below zero (mod p)
next_tree_depth <== tree_depth + (parsing_to_key + next_end_of_kv) * matcher.out[0]; // IF ((`parsing_to_key` OR `next_end_of_kv`) AND `read_brace` THEN `increase/decrease_depth`
//--------------------------------------------------------------------------------------------//
//-State machine updating---------------------------------------------------------------------//
// * yield instruction based on what byte we read *
component matcher = Switch(5, 5);
matcher.branches <== [start_brace, end_brace, quote, colon, comma ];
matcher.vals <== [hit_start_brace, hit_end_brace, hit_quote, hit_colon, hit_comma];
matcher.case <== byte;
// * get the instruction mask based on current state *
component mask = StateToMask();
mask.state <== state;
// * multiply the mask array elementwise with the instruction array *
component mulMaskAndOut = ArrayMul(5);
mulMaskAndOut.lhs <== mask.mask;
mulMaskAndOut.rhs <== matcher.out;
// * add the masked instruction to the state to get new state *
component addToState = ArrayAdd(5);
addToState.lhs <== state;
addToState.rhs <== mulMaskAndOut.out;
// * set the new state *
next_tree_depth <== addToState.out[0];
next_parsing_key <== addToState.out[1];
next_inside_key <== addToState.out[2];
next_parsing_value <== addToState.out[3];
next_inside_value <== addToState.out[4];
//--------------------------------------------------------------------------------------------//

// Constrain bit flags
next_parsing_to_key * (1 - next_parsing_to_key) === 0; // - constrain that `next_parsing_to_key` remain a bit flag
next_inside_key * (1 - next_inside_key) === 0; // - constrain that `next_inside_key` remain a bit flag
next_parsing_to_value * (1 - next_parsing_to_value) === 0; // - constrain that `next_parsing_to_value` remain a bit flag
next_inside_value * (1 - next_inside_value) === 0; // - constrain that `next_inside_value` remain a bit flag
next_end_of_kv * (1 - next_end_of_kv) === 0; // - constrain that `next_end_of_kv` remain a bit flag
//--------------------------------------------------------------------------------------------//
// // DEBUGGING: internal state
// for(var i = 0; i<5; i++) {
// log("-----------------------");
// log("mask[",i,"]: ", mask.mask[i]);
// log("mulMaskAndOut[",i,"]:", mulMaskAndOut.out[i]);
// log("state[",i,"]: ", state[i]);
// log("next_state[",i,"]: ", addToState.out[i]);
// }
//--------------------------------------------------------------------------------------------//

// TODO: Can hit comma and then be sent to next KV, so comma will engage `parsing_to_key`
//--------------------------------------------------------------------------------------------//
//-Constraints--------------------------------------------------------------------------------//
// * constrain bit flags *
next_parsing_key * (1 - next_parsing_key) === 0; // - constrain that `next_parsing_key` remain a bit flag
next_inside_key * (1 - next_inside_key) === 0; // - constrain that `next_inside_key` remain a bit flag
next_parsing_value * (1 - next_parsing_value) === 0; // - constrain that `next_parsing_value` remain a bit flag
next_inside_value * (1 - next_inside_value) === 0; // - constrain that `next_inside_value` remain a bit flag
// * constrain `next_tree_depth` to never equal -1 (TODO: should it always move in increments of 1?)
component isMinusOne = IsEqual();
isMinusOne.in[0] <== -1;
isMinusOne.in[1] <== next_tree_depth;
isMinusOne.out === 0;
//--------------------------------------------------------------------------------------------//
}

Expand Down Expand Up @@ -177,4 +183,37 @@ template Switch(m, n) {
match <== matchChecker.out;

out <== sum;
}

// Computes a per-field multiplier ("mask") from the current parser state.
// `StateUpdate` multiplies this mask elementwise with the instruction vector selected
// by the byte just read and adds the product to the state, so a mask entry of 0
// suppresses the instruction for that field, 1 applies it as-is, and a negative entry
// flips its sign.
//
// Inputs:
//   state[5] -- [tree_depth, parsing_key, inside_key, parsing_value, inside_value]
// Outputs:
//   mask[5]  -- multiplier for the corresponding entry of the instruction vector
//
// NOTE(review): this template does not itself constrain the four flags to be bits, nor
// does it enforce mutual exclusion between them; the value tables in the comments below
// assume bit-valued flags -- confirm this is guaranteed by the caller.
// TODO: Note at the moment mask 2 and 4 are the same, so this can be removed if it maintains.
// NOTE(review): as written below, `mask[2]` and `mask[4]` are different expressions. The TODO
// may be counting from 1 (i.e. `mask[1]` and `mask[3]`, which differ only by the
// `(1 - init_tree.out)` factor) -- confirm before removing anything.
template StateToMask() {
signal input state[5];
signal output mask[5];

// Named aliases for the packed state entries (compile-time `var`s, not new signals).
var tree_depth = state[0];
var parsing_key = state[1];
var inside_key = state[2];
var parsing_value = state[3];
var inside_value = state[4];

// Intermediate products, kept as separate signals so every constraint stays quadratic.
signal NOT_INSIDE_KEY_AND_NOT_INSIDE_VALUE <== (1 - inside_key) * (1 - inside_value);
signal NOT_PARSING_VALUE_NOT_INSIDE_VALUE <== (1 - parsing_value) * (1 - inside_value);

// `init_tree.out` is presumably 1 exactly when `tree_depth == 0` (circomlib-style `IsZero`;
// the template is defined outside this file view -- confirm).
component init_tree = IsZero();
init_tree.in <== tree_depth;

// `tree_depth` mask: plain sum of three terms (tree at depth zero, `parsing_key`, `parsing_value`).
// It is a 0/1 value only when at most one of the three terms is set; nothing here enforces that.
mask[0] <== init_tree.out + parsing_key + parsing_value; // TODO: Make sure these are never both 1!

// `parsing_key` mask: 1 `IF ((NOT inside_key) AND (NOT inside_value))`, else 0.
// (`parsing_value` does not enter this expression.)
mask[1] <== NOT_INSIDE_KEY_AND_NOT_INSIDE_VALUE;

// `inside_key` mask, with G = (NOT parsing_value) AND (NOT inside_value):
//   G = 1, inside_key = 0 ->  1   (instruction applied as-is)
//   G = 1, inside_key = 1 -> -1   (instruction sign flipped)
//   G = 0, inside_key = 0 ->  0   (instruction suppressed)
//   G = 0, inside_key = 1 -> -2   (NOTE(review): presumably unreachable if key/value flags are mutually exclusive -- confirm)
mask[2] <== NOT_PARSING_VALUE_NOT_INSIDE_VALUE - 2 * inside_key;

// `parsing_value` mask: 1 `IF ((NOT inside_key) AND (NOT inside_value))` and `init_tree.out` is 0
// (i.e. presumably `tree_depth != 0`), else 0.
mask[3] <== NOT_INSIDE_KEY_AND_NOT_INSIDE_VALUE * (1 - init_tree.out);

// `inside_value` mask:
//   parsing_value = 1, inside_value = 0 ->  1
//   parsing_value = 1, inside_value = 1 -> -1
//   parsing_value = 0, inside_value = 0 ->  0
//   parsing_value = 0, inside_value = 1 -> -2   (NOTE(review): relies on `parsing_value` staying set while inside a value -- confirm)
mask[4] <== parsing_value - 2 * inside_value;
}
Loading

0 comments on commit ed2c440

Please sign in to comment.