Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correctly handle carriage return characters according to the spec #12661

Merged
merged 5 commits into from
Feb 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/langref.html.in
Original file line number Diff line number Diff line change
Expand Up @@ -11551,7 +11551,8 @@ fn readU32Be() u32 {}
</p>
<p>
Each LF may be immediately preceded by a single CR (byte value 0x0d, code point U+000d, {#syntax#}'\r'{#endsyntax#})
to form a Windows style line ending, but this is discouraged.
to form a Windows style line ending, but this is discouraged. Note that in multiline strings, CRLF sequences will
be encoded as LF when compiled into a Zig program.
A CR in any other context is not allowed.
</p>
<p>
Expand Down
2 changes: 1 addition & 1 deletion lib/std/zig/Ast.zig
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ pub fn tokenSlice(tree: Ast, token_index: TokenIndex) []const u8 {
.index = token_starts[token_index],
.pending_invalid_token = null,
};
const token = tokenizer.next();
const token = tokenizer.findTagAtCurrentIndex(token_tag);
assert(token.tag == token_tag);
return tree.source[token.loc.start..token.loc.end];
}
Expand Down
50 changes: 45 additions & 5 deletions lib/std/zig/tokenizer.zig
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,38 @@ pub const Tokenizer = struct {
saw_at_sign,
};

/// Workaround: while tokenizing literals the tokenizer may queue up a
/// 'pending_invalid_token', so locating a token with a given tag at the
/// current index can require re-scanning from the beginning of the current
/// line — the token we want may be an invalid-character token generated
/// partway through literal tokenization. Ideally this re-scan would be
/// pushed to the AST parser or another later stage, both to give more
/// useful error messages with that extra context and to make it possible
/// to remove this workaround entirely.
pub fn findTagAtCurrentIndex(self: *Tokenizer, tag: Token.Tag) Token {
    // Fast path: every tag other than .invalid is produced directly by the
    // next scan, so no back-tracking is required.
    if (tag != .invalid) return self.next();

    const target_index = self.index;

    // Walk backwards to the newline (or buffer start) that begins the
    // current line; re-tokenizing from there replays any pending invalid
    // tokens that literal scanning may have queued.
    var scan_start = target_index;
    while (scan_start > 0 and self.buffer[scan_start] != '\n') {
        scan_start -= 1;
    }

    self.index = scan_start;
    while (self.index <= target_index or self.pending_invalid_token != null) {
        const token = self.next();
        if (token.loc.start == target_index and token.tag == tag) {
            return token;
        }
    }
    // The caller asserts that an .invalid token exists at target_index, so
    // the replay above must have produced it.
    unreachable;
}

pub fn next(self: *Tokenizer) Token {
if (self.pending_invalid_token) |token| {
self.pending_invalid_token = null;
Expand Down Expand Up @@ -1127,7 +1159,7 @@ pub const Tokenizer = struct {
state = .start;
result.loc.start = self.index + 1;
},
'\t', '\r' => state = .line_comment,
'\t' => state = .line_comment,
else => {
state = .line_comment;
self.checkLiteralCharacter();
Expand All @@ -1141,7 +1173,7 @@ pub const Tokenizer = struct {
result.tag = .doc_comment;
break;
},
'\t', '\r' => {
'\t' => {
state = .doc_comment;
result.tag = .doc_comment;
},
Expand All @@ -1163,12 +1195,12 @@ pub const Tokenizer = struct {
state = .start;
result.loc.start = self.index + 1;
},
'\t', '\r' => {},
'\t' => {},
else => self.checkLiteralCharacter(),
},
.doc_comment => switch (c) {
0, '\n' => break,
'\t', '\r' => {},
'\t' => {},
else => self.checkLiteralCharacter(),
},
.int => switch (c) {
Expand Down Expand Up @@ -1239,7 +1271,15 @@ pub const Tokenizer = struct {
fn getInvalidCharacterLength(self: *Tokenizer) u3 {
const c0 = self.buffer[self.index];
if (std.ascii.isASCII(c0)) {
if (std.ascii.isControl(c0)) {
if (c0 == '\r') {
if (self.index + 1 < self.buffer.len and self.buffer[self.index + 1] == '\n') {
// Carriage returns are *only* allowed just before a linefeed as part of a CRLF pair, otherwise
// they constitute an illegal byte!
return 0;
} else {
return 1;
}
} else if (std.ascii.isControl(c0)) {
// ascii control codes are never allowed
// (note that \n was checked before we got here)
return 1;
Expand Down
6 changes: 4 additions & 2 deletions src/AstGen.zig
Original file line number Diff line number Diff line change
Expand Up @@ -10491,14 +10491,16 @@ fn strLitNodeAsString(astgen: *AstGen, node: Ast.Node.Index) !IndexSlice {
var tok_i = start;
{
const slice = tree.tokenSlice(tok_i);
const line_bytes = slice[2 .. slice.len - 1];
const carriage_return_ending: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
const line_bytes = slice[2 .. slice.len - carriage_return_ending];
try string_bytes.appendSlice(gpa, line_bytes);
tok_i += 1;
}
// Following lines: each line prepends a newline.
while (tok_i <= end) : (tok_i += 1) {
const slice = tree.tokenSlice(tok_i);
const line_bytes = slice[2 .. slice.len - 1];
const carriage_return_ending: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
const line_bytes = slice[2 .. slice.len - carriage_return_ending];
try string_bytes.ensureUnusedCapacity(gpa, line_bytes.len + 1);
string_bytes.appendAssumeCapacity('\n');
string_bytes.appendSliceAssumeCapacity(line_bytes);
Expand Down
9 changes: 9 additions & 0 deletions test/compare_output.zig
Original file line number Diff line number Diff line change
Expand Up @@ -535,4 +535,13 @@ pub fn addCases(cases: *tests.CompareOutputContext) void {
\\debug: free - len: 5
\\
);

cases.add("valid carriage return example", "const io = @import(\"std\").io;\r\n" ++ // Testing CRLF line endings are valid
"\r\n" ++
"pub \r fn main() void {\r\n" ++ // Testing isolated carriage return as whitespace is valid
" const stdout = io.getStdOut().writer();\r\n" ++
" stdout.print(\\\\A Multiline\r\n" ++ // testing CRLF at end of multiline string line is valid and normalises to \n in the output
" \\\\String\r\n" ++
" , .{}) catch unreachable;\r\n" ++
"}\r\n", "A Multiline\nString");
}
10 changes: 10 additions & 0 deletions test/compile_errors.zig
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,16 @@ pub fn addCases(ctx: *TestContext) !void {
});
}

{
const case = ctx.obj("isolated carriage return in multiline string literal", .{});
case.backend = .stage2;

case.addError("const foo = \\\\\test\r\r rogue carriage return\n;", &[_][]const u8{
":1:19: error: expected ';' after declaration",
":1:20: note: invalid byte: '\\r'",
});
}

{
const case = ctx.obj("missing semicolon at EOF", .{});
case.addError(
Expand Down