diff --git a/CHANGELOG.md b/CHANGELOG.md index 8d0f9f46..cd5dcf3c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,22 +6,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added - * Added rudimentary support for `clojure.stacktrace` with `print-cause-trace` (part of #721). + * Added rudimentary support for `clojure.stacktrace` with `print-cause-trace` (part of #721) + * Added support for `bytes` literals using a `#b` prefix (#732) ### Fixed - * Fix issue with `case` evaluating all of its clauses expressions (#699). - * Fix issue with relative paths dropping their first character on MS-Windows (#703). - * Fix incompatibility with `(str nil)` returning "nil" (#706). - * Fix `sort-by` support for maps and boolean comparator fns (#709). - * Fix `sort` support for maps and boolean comparator fns (#711). - * Fix `(is (= exp act))` should only evaluate its args once on failure (#712). - * Fix issue with `with` failing with a traceback error when an exception is thrown (#714). - * Fix issue with `sort-*` family of funtions returning an error on an empty seq (#716). - * Fix issue with `intern` failing when used (#725). - * Fix issue with `ns` not being available after `in-ns` on the REPL (#718). - * Fixed issue with import modules aliasing using ns eval (#719). - * Fix issue with `ns-resolve` throwing an error on macros (#720). - * Fix issue with py module `readerwritelock` locks handling (#722). 
+ * Fix issue with `case` evaluating all of its clauses expressions (#699) + * Fix issue with relative paths dropping their first character on MS-Windows (#703) + * Fix incompatibility with `(str nil)` returning "nil" (#706) + * Fix `sort-by` support for maps and boolean comparator fns (#709) + * Fix `sort` support for maps and boolean comparator fns (#711) + * Fix `(is (= exp act))` should only evaluate its args once on failure (#712) + * Fix issue with `with` failing with a traceback error when an exception is thrown (#714) + * Fix issue with `sort-*` family of functions returning an error on an empty seq (#716) + * Fix issue with `intern` failing when used (#725) + * Fix issue with `ns` not being available after `in-ns` on the REPL (#718) + * Fixed issue with import modules aliasing using ns eval (#719) + * Fix issue with `ns-resolve` throwing an error on macros (#720) + * Fix issue with py module `readerwritelock` locks handling (#722) ## [v0.1.0a2] ### Added diff --git a/docs/reader.rst b/docs/reader.rst index 0fbc2977..1ebadd63 100644 --- a/docs/reader.rst +++ b/docs/reader.rst @@ -96,6 +96,38 @@ Strings are denoted as a series of characters enclosed by ``"`` quotation marks. If a string needs to contain a quotation mark literal, that quotation mark should be escaped as ``\"``. Strings may be multi-line by default and only a closing ``"`` will terminate reading a string. Strings correspond to the Python ``str`` type. +String literals are always read with the UTF-8 encoding. + +String literals may contain the following escape sequences: ``\\``, ``\a``, ``\b``, ``\f``, ``\n``, ``\r``, ``\t``, ``\v``. +Their meanings match the equivalent escape sequences supported in `Python string literals <https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals>`_\.
+ + +Byte Strings +------------ + +:: + + basilisp.user=> #b "" + #b "" + basilisp.user=> #b "this is a string" + #b "this is a string" + basilisp.user=> (type #b "") + <class 'bytes'> + +Byte strings are denoted as a series of ASCII characters enclosed by ``"`` quotation marks and preceded by a ``#b``. +If a byte string needs to contain a quotation mark literal, that quotation mark should be escaped as ``\"``. +Byte strings may be multi-line by default and only a closing ``"`` will terminate reading a byte string. +Byte strings correspond to the Python ``bytes`` type. + +Byte string literals may contain the following escape sequences: ``\\``, ``\a``, ``\b``, ``\f``, ``\n``, ``\r``, ``\t``, ``\v``. +Byte strings may also include characters using a hex escape code as ``\xhh`` where ``hh`` is a hexadecimal value. +Their meanings match the equivalent escape sequences supported in `Python byte string literals <https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals>`_\. + + +.. warning:: + + As in Python, byte string literals may not include any characters outside of the ASCII range. + .. _character_literals: diff --git a/src/basilisp/core.lpy b/src/basilisp/core.lpy index 5a733187..8d1907cd 100644 --- a/src/basilisp/core.lpy +++ b/src/basilisp/core.lpy @@ -1321,11 +1321,6 @@ [x] (and (integer? x) (neg? x))) -(defn ^:inline nil? - "Return ``true`` if ``x`` is ``nil``\\, otherwise ``false``\\." - [x] - (operator/is- x nil)) - (defn ^:inline some? "Return ``true`` if ``x`` is not ``nil``\\, otherwise ``false`` s."
[x] @@ -3603,7 +3598,7 @@ ;; pairs - pairs of bindings; either symbol/seq pairs or modifier pairs gen-iter (fn gen-iter [pairs] (if (seq pairs) - (let [for-iter (gensym "for") + (let [for-iter (gensym "for") seq-arg (gensym "seq") pair (first pairs) binding (first pair) diff --git a/src/basilisp/lang/compiler/analyzer.py b/src/basilisp/lang/compiler/analyzer.py index 4247113b..1bf00015 100644 --- a/src/basilisp/lang/compiler/analyzer.py +++ b/src/basilisp/lang/compiler/analyzer.py @@ -3583,6 +3583,7 @@ def _const_node_type(_: Any) -> ConstType: for tp, const_type in { bool: ConstType.BOOL, + bytes: ConstType.BYTES, complex: ConstType.NUMBER, datetime: ConstType.INST, Decimal: ConstType.DECIMAL, @@ -3612,6 +3613,7 @@ def _const_node_type(_: Any) -> ConstType: @_analyze_form.register(bool) +@_analyze_form.register(bytes) @_analyze_form.register(complex) @_analyze_form.register(datetime) @_analyze_form.register(Decimal) @@ -3649,6 +3651,7 @@ def _const_node(form: ReaderForm, ctx: AnalyzerContext) -> Const: form, ( bool, + bytes, complex, datetime, Decimal, diff --git a/src/basilisp/lang/compiler/generator.py b/src/basilisp/lang/compiler/generator.py index dca3d74b..e61078b2 100644 --- a/src/basilisp/lang/compiler/generator.py +++ b/src/basilisp/lang/compiler/generator.py @@ -3275,6 +3275,7 @@ def _const_meta_kwargs_ast( @_const_val_to_py_ast.register(bool) +@_const_val_to_py_ast.register(bytes) @_const_val_to_py_ast.register(type(None)) @_const_val_to_py_ast.register(complex) @_const_val_to_py_ast.register(float) diff --git a/src/basilisp/lang/compiler/nodes.py b/src/basilisp/lang/compiler/nodes.py index bb565f7c..ad92bd86 100644 --- a/src/basilisp/lang/compiler/nodes.py +++ b/src/basilisp/lang/compiler/nodes.py @@ -271,6 +271,7 @@ class ConstType(Enum): SET = kw.keyword("set") VECTOR = kw.keyword("vector") BOOL = kw.keyword("bool") + BYTES = kw.keyword("bytes") KEYWORD = kw.keyword("keyword") SYMBOL = kw.keyword("symbol") STRING = kw.keyword("string") diff --git 
a/src/basilisp/lang/obj.py b/src/basilisp/lang/obj.py
index 8d766987..cdb69ed6 100644
--- a/src/basilisp/lang/obj.py
+++ b/src/basilisp/lang/obj.py
@@ -202,6 +202,17 @@ def _lrepr_bool(o: bool, **_) -> str:
     return repr(o).lower()
 
 
+@lrepr.register(bytes)
+def _lrepr_bytes(o: bytes, **_) -> str:
+    """Return the `#b "..."` reader literal for the byte string `o`."""
+    # Escape the bytes directly instead of slicing `repr(o)`: `repr` chooses its
+    # quote character from the content and never escapes `"`, so b'"' would
+    # print as `#b """`, which the reader cannot read back.
+    v = o.decode("latin-1").encode("unicode_escape").decode("ascii")
+    v = v.replace('"', '\\"')
+    return f'#b "{v}"'
+
+
 @lrepr.register(type(None))
 def _lrepr_nil(_: None, **__) -> str:
     return "nil"
diff --git a/src/basilisp/lang/reader.py b/src/basilisp/lang/reader.py
index 9eecf874..cac1b305 100644
--- a/src/basilisp/lang/reader.py
+++ b/src/basilisp/lang/reader.py
@@ -843,7 +843,7 @@ def _read_num( # noqa: C901 # pylint: disable=too-many-statements
 
 
 def _read_str(ctx: ReaderContext, allow_arbitrary_escapes: bool = False) -> str:
-    """Return a string from the input stream.
+    """Return a UTF-8 encoded string from the input stream.
 
     If allow_arbitrary_escapes is True, do not throw a SyntaxError if an unknown
     escape sequence is encountered."""
@@ -869,6 +869,75 @@ def _read_str(ctx: ReaderContext, allow_arbitrary_escapes: bool = False) -> str:
         s.append(token)
 
 
+_BYTES_ESCAPE_CHARS = {
+    '"': b'"',
+    "\\": b"\\",
+    "a": b"\a",
+    "b": b"\b",
+    "f": b"\f",
+    "n": b"\n",
+    "r": b"\r",
+    "t": b"\t",
+    "v": b"\v",
+}
+
+
+def _read_hex_byte(ctx: ReaderContext) -> bytes:
+    """Read a byte with a 2 digit hex code such as `\\xff`."""
+    c1 = ctx.reader.next_token()
+    c2 = ctx.reader.next_token()
+    # Validate each digit explicitly: `int()` alone would also accept "_" and
+    # trailing whitespace, so e.g. `\x_f` would silently read as `\x0f`.
+    if not all(c and c in "0123456789abcdefABCDEF" for c in (c1, c2)):
+        raise ctx.syntax_error(
+            f"Invalid byte representation for base 16: 0x{c1}{c2}"
+        )
+    return bytes([int(c1 + c2, base=16)])
+
+
+def _read_byte_str(ctx: ReaderContext) -> bytes:
+    """Return a byte string from the input stream.
+
+    Byte strings have the same restrictions and semantics as byte literals in Python.
+    Individual characters must be within the ASCII range or must be valid escape sequences.
+    """
+    reader = ctx.reader
+
+    token = reader.peek()
+    while whitespace_chars.match(token):
+        token = reader.next_token()
+
+    if token != '"':
+        raise ctx.syntax_error(f"Expected '\"'; got '{token}' instead")
+
+    b: List[bytes] = []
+    while True:
+        token = reader.next_token()
+        escaped = token == "\\"
+        if escaped:
+            token = reader.next_token()
+        if token == "":
+            raise ctx.eof_error("Unexpected EOF in byte string")
+        # The range check also covers the character following a backslash; a
+        # non-ASCII character is malformed input, not truncated input (EOF).
+        if ord(token) < 1 or ord(token) > 127:
+            raise ctx.syntax_error("Byte strings must contain only ASCII characters")
+        if escaped:
+            if token in _BYTES_ESCAPE_CHARS:
+                b.append(_BYTES_ESCAPE_CHARS[token])
+            elif token == "x":
+                b.append(_read_hex_byte(ctx))
+            else:
+                # In Python, invalid escape sequences entered into byte strings are
+                # retained with backslash for debugging purposes, so we do the same.
+                b.append(b"\\" + token.encode("ascii"))
+        elif token == '"':
+            reader.next_token()
+            return b"".join(b)
+        else:
+            b.append(token.encode("ascii"))
+
+
 @_with_loc
 def _read_sym(ctx: ReaderContext) -> MaybeSymbol:
     """Return a symbol from the input stream.
@@ -1380,7 +1449,7 @@ def _load_record_or_type( raise ctx.syntax_error("Records may only be constructed from Vectors and Maps") -def _read_reader_macro(ctx: ReaderContext) -> LispReaderForm: +def _read_reader_macro(ctx: ReaderContext) -> LispReaderForm: # noqa: MC0001 """Return a data structure evaluated as a reader macro from the input stream.""" start = ctx.reader.advance() @@ -1408,6 +1477,9 @@ def _read_reader_macro(ctx: ReaderContext) -> LispReaderForm: return _read_reader_conditional(ctx) elif token == "#": return _read_numeric_constant(ctx) + elif token == "b": + ctx.reader.advance() + return _read_byte_str(ctx) elif ns_name_chars.match(token): s = _read_sym(ctx) assert isinstance(s, sym.Symbol) diff --git a/src/basilisp/lang/typing.py b/src/basilisp/lang/typing.py index e9402b15..7aa69a75 100644 --- a/src/basilisp/lang/typing.py +++ b/src/basilisp/lang/typing.py @@ -21,6 +21,7 @@ LispNumber = Union[int, float, Fraction] LispForm = Union[ bool, + bytes, complex, datetime, Decimal, diff --git a/tests/basilisp/lrepr_test.py b/tests/basilisp/lrepr_test.py index 32def295..f69bbbb8 100644 --- a/tests/basilisp/lrepr_test.py +++ b/tests/basilisp/lrepr_test.py @@ -173,6 +173,11 @@ def test_print_readably(lcompile: CompileFn): '#uuid "81f35603-0408-4b3d-bbc0-462e3702747f"', '(pr-str #uuid "81f35603-0408-4b3d-bbc0-462e3702747f")', ), + ('#b ""', '(pr-str #b "")'), + ( + r'#b "\x7fELF\x01\x01\x01\x00"', + r'(pr-str #b "\x7f\x45\x4c\x46\x01\x01\x01\x00")', + ), ('#"\\s"', '(pr-str #"\\s")'), ( '#inst "2018-11-28T12:43:25.477000+00:00"', @@ -205,6 +210,11 @@ def test_lrepr(lcompile: CompileFn, repr: str, code: str): (-float("inf"), "(read-string (pr-str ##-Inf))"), ("hi", '(read-string (pr-str "hi"))'), ("Hello\nworld!", '(read-string (pr-str "Hello\nworld!"))'), + (b"", '(read-string (pr-str #b ""))'), + ( + b"\x7fELF\x01\x01\x01\x00", + r'(read-string (pr-str #b "\x7f\x45\x4c\x46\x01\x01\x01\x00"))', + ), ( uuid.UUID("81f35603-0408-4b3d-bbc0-462e3702747f"), 
'(read-string (pr-str #uuid "81f35603-0408-4b3d-bbc0-462e3702747f"))', @@ -253,6 +263,11 @@ def test_lrepr_round_trip_special_cases(lcompile: CompileFn): ("##-Inf", "(print-str ##-Inf)"), ("hi", '(print-str "hi")'), ("Hello\nworld!", '(print-str "Hello\nworld!")'), + ('#b ""', '(print-str #b "")'), + ( + r'#b "\x7fELF\x01\x01\x01\x00"', + r'(print-str #b "\x7f\x45\x4c\x46\x01\x01\x01\x00")', + ), # In Clojure, (print-str #uuid "...") produces '#uuid "..."' but in Basilisp # print-str is tied directly to str (which in Clojure simply returns the string # part of the UUID). diff --git a/tests/basilisp/reader_test.py b/tests/basilisp/reader_test.py index 8482097a..1bafc917 100644 --- a/tests/basilisp/reader_test.py +++ b/tests/basilisp/reader_test.py @@ -387,6 +387,61 @@ def test_missing_terminating_quote(self): read_str_first('"Start of a string') +class TestByteString: + def test_must_include_quote(self): + with pytest.raises(reader.SyntaxError): + read_str_first(r"#b []") + + @pytest.mark.parametrize( + "v,raw", + [ + (b"", '#b""'), + (b"", '#b ""'), + (b'"', r'#b "\""'), + (b"\\", r'#b "\\"'), + (b"\a", r'#b "\a"'), + (b"\b", r'#b "\b"'), + (b"\f", r'#b "\f"'), + (b"\n", r'#b "\n"'), + (b"\r", r'#b "\r"'), + (b"\t", r'#b "\t"'), + (b"\v", r'#b "\v"'), + ( + b"\x7f\x45\x4c\x46\x01\x01\x01\x00", + r'#b "\x7f\x45\x4c\x46\x01\x01\x01\x00"', + ), + (b"\x7fELF\x01\x01\x01\x00", r'#b "\x7fELF\x01\x01\x01\x00"'), + (b"Regular string but with bytes", '#b "Regular string but with bytes"'), + ( + b"Byte string with 'inner string'", + "#b \"Byte string with 'inner string'\"", + ), + ( + b'Byte string with "inner string"', + r'#b "Byte string with \"inner string\""', + ), + ], + ) + def test_legal_byte_string(self, v: str, raw: str): + assert v == read_str_first(raw) + + def test_cannot_include_non_ascii(self): + with pytest.raises(reader.SyntaxError): + read_str_first(rf'#b "{chr(432)}"') + + def test_invalid_escape_remains(self): + assert b"\q" == read_str_first(r'#b "\q"') + 
+ @pytest.mark.parametrize("v", [r'#b "\xjj"', r'#b "\xf"', r'#b "\x"']) + def test_invalid_hex_escape_sequence(self, v: str): + with pytest.raises(reader.SyntaxError): + read_str_first(v) + + def test_missing_terminating_quote(self): + with pytest.raises(reader.SyntaxError): + read_str_first('#b "Start of a string') + + @pytest.mark.parametrize("s", ["", " ", "\t"]) def test_whitespace(s: str): assert read_str_first(s) is None