fix[parser]: fix bad tokenization of hex strings (#4406)

this commit fixes parsing of hex strings. there were several issues with the hex string pre-parser, including: - modification of the string locations - incorrectly not exiting the state machine if a non-string token is encountered. this commit fixes the state machine, changes the pre-parser to leave the locations of hex strings unmodified as to minimize the changes to locations in the reformatted code vs source code. to see the effect, print out the reformatted code of the test cases included in this PR before and after this commit. this commit additionally adds several sanity checks to the pre-parser so that the chance of future tokenization bugs is minimized.
vyperlang · Jan 9, 2025 · 9db1546 · 9db1546
1 parent 66272e6
commit 9db1546
Show file tree

Hide file tree

Showing 5 changed files with 74 additions and 21 deletions.
diff --git a/.github/workflows/pull-request.yaml b/.github/workflows/pull-request.yaml
@@ -34,6 +34,7 @@ jobs:
           # lang: language changes
           # stdlib: changes to the stdlib
           # ux: language changes (UX)
+          # parser: parser changes
           # tool: integration
           # ir: (old) IR/codegen changes
           # codegen: lowering from vyper AST to codegen
@@ -46,6 +47,7 @@ jobs:
             lang
             stdlib
             ux
+            parser
             tool
             ir
             codegen

diff --git a/tests/functional/codegen/types/test_bytes.py b/tests/functional/codegen/types/test_bytes.py
@@ -1,5 +1,6 @@
 import pytest
 
+from vyper.compiler import compile_code
 from vyper.exceptions import TypeMismatch
 
 
@@ -281,6 +282,20 @@ def test2(l: Bytes[{m}] = x"{val}") -> bool:
     assert c.test2(vyper_literal) is True
 
 
+def test_hex_literal_parser_edge_case():
+    # see GH issue 4405 example 2
+    code = """
+interface FooBar:
+    def test(a: Bytes[2], b: String[4]): payable
+
+@deploy
+def __init__(ext: FooBar):
+    extcall ext.test(x'6161', x'6161')  #ext.test(b'\x61\61', '6161') gets called
+    """
+    with pytest.raises(TypeMismatch):
+        compile_code(code)
+
+
 def test_zero_padding_with_private(get_contract):
     code = """
 counter: uint256

diff --git a/tests/functional/syntax/test_bytes.py b/tests/functional/syntax/test_bytes.py
@@ -81,7 +81,17 @@ def test() -> Bytes[1]:
         """
 @external
 def test() -> Bytes[2]:
-    a: Bytes[2] = x"abc"
+    a: Bytes[2] = x"abc"  # non-hex nibbles
+    return a
+    """,
+        SyntaxException,
+    ),
+    (
+        """
+@external
+def test() -> Bytes[10]:
+    # GH issue 4405 example 1
+    a: Bytes[10] = x x x x x x"61"  # messed up hex prefix
     return a
     """,
         SyntaxException,
@@ -107,6 +117,24 @@ def test_bytes_fail(bad_code):
             compiler.compile_code(bad_code)
 
 
+@pytest.mark.xfail
+def test_hexbytes_offset():
+    good_code = """
+    event X:
+    a: Bytes[2]
+
+@deploy
+def __init__():
+    # GH issue 4405, example 1
+    #
+    # log changes offset of HexString, and the hex_string_locations tracked
+    # location is incorrect when visiting ast
+    log X(a = x"6161")
+    """
+    # move this to valid list once it passes.
+    assert compiler.compile_code(good_code) is not None
+
+
 valid_list = [
     """
 @external

diff --git a/vyper/ast/parse.py b/vyper/ast/parse.py
@@ -117,6 +117,10 @@ def _parse_to_ast_with_settings(
     # postcondition: consumed all the for loop annotations
     assert len(pre_parser.for_loop_annotations) == 0
 
+    # postcondition: we have used all the hex strings found by the
+    # pre-parser
+    assert len(pre_parser.hex_string_locations) == 0
+
     # Convert to Vyper AST.
     module = vy_ast.get_node(py_ast)
     assert isinstance(module, vy_ast.Module)  # mypy hint
@@ -440,6 +444,7 @@ def visit_Constant(self, node):
                         node.col_offset,
                     )
                 node.ast_type = "HexBytes"
+                self._pre_parser.hex_string_locations.remove(key)
             else:
                 node.ast_type = "Str"
         elif isinstance(node.value, bytes):

diff --git a/vyper/ast/pre_parser.py b/vyper/ast/pre_parser.py
@@ -109,37 +109,40 @@ def consume(self, token):
 class HexStringParser:
     def __init__(self):
         self.locations = []
-        self._current_x = None
+        self._tokens = []
         self._state = ParserState.NOT_RUNNING
 
     def consume(self, token, result):
         # prepare to check if the next token is a STRING
-        if token.type == NAME and token.string == "x":
-            self._state = ParserState.RUNNING
-            self._current_x = token
-            return True
-
         if self._state == ParserState.NOT_RUNNING:
+            if token.type == NAME and token.string == "x":
+                self._tokens.append(token)
+                self._state = ParserState.RUNNING
+                return True
+
             return False
 
-        if self._state == ParserState.RUNNING:
-            current_x = self._current_x
-            self._current_x = None
-            self._state = ParserState.NOT_RUNNING
+        assert self._state == ParserState.RUNNING, "unreachable"
 
-            toks = [current_x]
+        self._state = ParserState.NOT_RUNNING
 
-            # drop the leading x token if the next token is a STRING to avoid a python
-            # parser error
-            if token.type == STRING:
-                self.locations.append(current_x.start)
-                toks = [TokenInfo(STRING, token.string, current_x.start, token.end, token.line)]
-                result.extend(toks)
-                return True
+        if token.type != STRING:
+            # flush the tokens we have accumulated and move on
+            result.extend(self._tokens)
+            self._tokens = []
+            return False
 
-            result.extend(toks)
+        # mark hex string in locations for later processing
+        self.locations.append(token.start)
 
-        return False
+        # discard the `x` token and apply sanity checks -
+        # we should only be discarding one token.
+        assert len(self._tokens) == 1
+        assert (x_tok := self._tokens[0]).type == NAME and x_tok.string == "x"
+        self._tokens = []  # discard tokens
+
+        result.append(token)
+        return True
 
 
 # compound statements that are replaced with `class`