Skip to content

Commit

Permalink
fix[parser]: fix bad tokenization of hex strings (#4406)
Browse files Browse the repository at this point in the history
this commit fixes parsing of hex strings. there were several issues
with the hex string pre-parser, including:

- modification of the string locations
- incorrectly not exiting the state machine if a non-string token is
  encountered.

this commit fixes the state machine, changes the pre-parser to leave
the locations of hex strings unmodified as to minimize the changes to
locations in the reformatted code vs source code. to see the effect,
print out the reformatted code of the test cases included in this PR
before and after this commit.

this commit additionally adds several sanity checks to the pre-parser so
that the chance of future tokenization bugs is minimized.
  • Loading branch information
charles-cooper authored Jan 9, 2025
1 parent 66272e6 commit 9db1546
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 21 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/pull-request.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ jobs:
# lang: language changes
# stdlib: changes to the stdlib
# ux: language changes (UX)
# parser: parser changes
# tool: integration
# ir: (old) IR/codegen changes
# codegen: lowering from vyper AST to codegen
Expand All @@ -46,6 +47,7 @@ jobs:
lang
stdlib
ux
parser
tool
ir
codegen
Expand Down
15 changes: 15 additions & 0 deletions tests/functional/codegen/types/test_bytes.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pytest

from vyper.compiler import compile_code
from vyper.exceptions import TypeMismatch


Expand Down Expand Up @@ -281,6 +282,20 @@ def test2(l: Bytes[{m}] = x"{val}") -> bool:
assert c.test2(vyper_literal) is True


def test_hex_literal_parser_edge_case():
# see GH issue 4405 example 2
code = """
interface FooBar:
def test(a: Bytes[2], b: String[4]): payable
@deploy
def __init__(ext: FooBar):
extcall ext.test(x'6161', x'6161') #ext.test(b'\x61\61', '6161') gets called
"""
with pytest.raises(TypeMismatch):
compile_code(code)


def test_zero_padding_with_private(get_contract):
code = """
counter: uint256
Expand Down
30 changes: 29 additions & 1 deletion tests/functional/syntax/test_bytes.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,17 @@ def test() -> Bytes[1]:
"""
@external
def test() -> Bytes[2]:
a: Bytes[2] = x"abc"
a: Bytes[2] = x"abc" # non-hex nibbles
return a
""",
SyntaxException,
),
(
"""
@external
def test() -> Bytes[10]:
# GH issue 4405 example 1
a: Bytes[10] = x x x x x x"61" # messed up hex prefix
return a
""",
SyntaxException,
Expand All @@ -107,6 +117,24 @@ def test_bytes_fail(bad_code):
compiler.compile_code(bad_code)


@pytest.mark.xfail
def test_hexbytes_offset():
good_code = """
event X:
a: Bytes[2]
@deploy
def __init__():
# GH issue 4405, example 1
#
# log changes offset of HexString, and the hex_string_locations tracked
# location is incorrect when visiting ast
log X(a = x"6161")
"""
# move this to valid list once it passes.
assert compiler.compile_code(good_code) is not None


valid_list = [
"""
@external
Expand Down
5 changes: 5 additions & 0 deletions vyper/ast/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,10 @@ def _parse_to_ast_with_settings(
# postcondition: consumed all the for loop annotations
assert len(pre_parser.for_loop_annotations) == 0

# postcondition: we have used all the hex strings found by the
# pre-parser
assert len(pre_parser.hex_string_locations) == 0

# Convert to Vyper AST.
module = vy_ast.get_node(py_ast)
assert isinstance(module, vy_ast.Module) # mypy hint
Expand Down Expand Up @@ -440,6 +444,7 @@ def visit_Constant(self, node):
node.col_offset,
)
node.ast_type = "HexBytes"
self._pre_parser.hex_string_locations.remove(key)
else:
node.ast_type = "Str"
elif isinstance(node.value, bytes):
Expand Down
43 changes: 23 additions & 20 deletions vyper/ast/pre_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,37 +109,40 @@ def consume(self, token):
class HexStringParser:
def __init__(self):
self.locations = []
self._current_x = None
self._tokens = []
self._state = ParserState.NOT_RUNNING

def consume(self, token, result):
# prepare to check if the next token is a STRING
if token.type == NAME and token.string == "x":
self._state = ParserState.RUNNING
self._current_x = token
return True

if self._state == ParserState.NOT_RUNNING:
if token.type == NAME and token.string == "x":
self._tokens.append(token)
self._state = ParserState.RUNNING
return True

return False

if self._state == ParserState.RUNNING:
current_x = self._current_x
self._current_x = None
self._state = ParserState.NOT_RUNNING
assert self._state == ParserState.RUNNING, "unreachable"

toks = [current_x]
self._state = ParserState.NOT_RUNNING

# drop the leading x token if the next token is a STRING to avoid a python
# parser error
if token.type == STRING:
self.locations.append(current_x.start)
toks = [TokenInfo(STRING, token.string, current_x.start, token.end, token.line)]
result.extend(toks)
return True
if token.type != STRING:
# flush the tokens we have accumulated and move on
result.extend(self._tokens)
self._tokens = []
return False

result.extend(toks)
# mark hex string in locations for later processing
self.locations.append(token.start)

return False
# discard the `x` token and apply sanity checks -
# we should only be discarding one token.
assert len(self._tokens) == 1
assert (x_tok := self._tokens[0]).type == NAME and x_tok.string == "x"
self._tokens = [] # discard tokens

result.append(token)
return True


# compound statements that are replaced with `class`
Expand Down

0 comments on commit 9db1546

Please sign in to comment.