Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correctly handle carriage return characters according to the spec #12661

Merged
merged 5 commits into from
Feb 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/langref.html.in
Original file line number Diff line number Diff line change
Expand Up @@ -11551,7 +11551,8 @@ fn readU32Be() u32 {}
</p>
<p>
Each LF may be immediately preceded by a single CR (byte value 0x0d, code point U+000d, {#syntax#}'\r'{#endsyntax#})
to form a Windows style line ending, but this is discouraged.
to form a Windows style line ending, but this is discouraged. Note that in multiline strings, CRLF sequences will
be encoded as LF when compiled into a Zig program.
A CR in any other context is not allowed.
</p>
<p>
Expand Down
2 changes: 1 addition & 1 deletion lib/std/zig/Ast.zig
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ pub fn tokenSlice(tree: Ast, token_index: TokenIndex) []const u8 {
.index = token_starts[token_index],
.pending_invalid_token = null,
};
const token = tokenizer.next();
const token = tokenizer.findTagAtCurrentIndex(token_tag);
assert(token.tag == token_tag);
return tree.source[token.loc.start..token.loc.end];
}
Expand Down
50 changes: 45 additions & 5 deletions lib/std/zig/tokenizer.zig
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,38 @@ pub const Tokenizer = struct {
saw_at_sign,
};

/// Workaround: while tokenizing literals the tokenizer may queue up a
/// 'pending_invalid_token', so locating a token with a given tag at the
/// current index can require re-scanning from the beginning of the current
/// line — the token we want may be an invalid-character token generated
/// partway through literal tokenization. Ideally this re-scan would be
/// pushed to the AST parser or another later stage, both to give more
/// useful error messages with that extra context and to make it possible
/// to remove this workaround entirely.
pub fn findTagAtCurrentIndex(self: *Tokenizer, tag: Token.Tag) Token {
    // Fast path: every tag other than .invalid is produced directly by the
    // next scan, so no back-tracking is required.
    if (tag != .invalid) return self.next();

    const target_index = self.index;

    // Walk backwards to the newline (or buffer start) that begins the
    // current line; re-tokenizing from there replays any pending invalid
    // tokens that literal scanning may have queued.
    var scan_start = target_index;
    while (scan_start > 0 and self.buffer[scan_start] != '\n') {
        scan_start -= 1;
    }

    self.index = scan_start;
    while (self.index <= target_index or self.pending_invalid_token != null) {
        const token = self.next();
        if (token.loc.start == target_index and token.tag == tag) {
            return token;
        }
    }
    // The caller asserts that an .invalid token exists at target_index, so
    // the replay above must have produced it.
    unreachable;
}

pub fn next(self: *Tokenizer) Token {
if (self.pending_invalid_token) |token| {
self.pending_invalid_token = null;
Expand Down Expand Up @@ -1127,7 +1159,7 @@ pub const Tokenizer = struct {
state = .start;
result.loc.start = self.index + 1;
},
'\t', '\r' => state = .line_comment,
'\t' => state = .line_comment,
else => {
state = .line_comment;
self.checkLiteralCharacter();
Expand All @@ -1141,7 +1173,7 @@ pub const Tokenizer = struct {
result.tag = .doc_comment;
break;
},
'\t', '\r' => {
'\t' => {
state = .doc_comment;
result.tag = .doc_comment;
},
Expand All @@ -1163,12 +1195,12 @@ pub const Tokenizer = struct {
state = .start;
result.loc.start = self.index + 1;
},
'\t', '\r' => {},
'\t' => {},
else => self.checkLiteralCharacter(),
},
.doc_comment => switch (c) {
0, '\n' => break,
'\t', '\r' => {},
'\t' => {},
else => self.checkLiteralCharacter(),
},
.int => switch (c) {
Expand Down Expand Up @@ -1239,7 +1271,15 @@ pub const Tokenizer = struct {
fn getInvalidCharacterLength(self: *Tokenizer) u3 {
const c0 = self.buffer[self.index];
if (std.ascii.isASCII(c0)) {
if (std.ascii.isControl(c0)) {
if (c0 == '\r') {
if (self.index + 1 < self.buffer.len and self.buffer[self.index + 1] == '\n') {
// Carriage returns are *only* allowed just before a linefeed as part of a CRLF pair, otherwise
// they constitute an illegal byte!
return 0;
} else {
return 1;
}
} else if (std.ascii.isControl(c0)) {
// ascii control codes are never allowed
// (note that \n was checked before we got here)
return 1;
Expand Down
6 changes: 4 additions & 2 deletions src/AstGen.zig
Original file line number Diff line number Diff line change
Expand Up @@ -10491,14 +10491,16 @@ fn strLitNodeAsString(astgen: *AstGen, node: Ast.Node.Index) !IndexSlice {
var tok_i = start;
{
const slice = tree.tokenSlice(tok_i);
const line_bytes = slice[2 .. slice.len - 1];
const carriage_return_ending: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
const line_bytes = slice[2 .. slice.len - carriage_return_ending];
try string_bytes.appendSlice(gpa, line_bytes);
tok_i += 1;
}
// Following lines: each line prepends a newline.
while (tok_i <= end) : (tok_i += 1) {
const slice = tree.tokenSlice(tok_i);
const line_bytes = slice[2 .. slice.len - 1];
const carriage_return_ending: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
const line_bytes = slice[2 .. slice.len - carriage_return_ending];
try string_bytes.ensureUnusedCapacity(gpa, line_bytes.len + 1);
string_bytes.appendAssumeCapacity('\n');
string_bytes.appendSliceAssumeCapacity(line_bytes);
Expand Down
9 changes: 9 additions & 0 deletions test/compare_output.zig
Original file line number Diff line number Diff line change
Expand Up @@ -535,4 +535,13 @@ pub fn addCases(cases: *tests.CompareOutputContext) void {
\\debug: free - len: 5
\\
);

cases.add("valid carriage return example", "const io = @import(\"std\").io;\r\n" ++ // Testing CRLF line endings are valid
"\r\n" ++
"pub \r fn main() void {\r\n" ++ // Testing isolated carriage return as whitespace is valid
" const stdout = io.getStdOut().writer();\r\n" ++
" stdout.print(\\\\A Multiline\r\n" ++ // testing CRLF at end of multiline string line is valid and normalises to \n in the output
" \\\\String\r\n" ++
" , .{}) catch unreachable;\r\n" ++
"}\r\n", "A Multiline\nString");
}
10 changes: 10 additions & 0 deletions test/compile_errors.zig
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,16 @@ pub fn addCases(ctx: *TestContext) !void {
});
}

{
const case = ctx.obj("isolated carriage return in multiline string literal", .{});
case.backend = .stage2;

case.addError("const foo = \\\\\test\r\r rogue carriage return\n;", &[_][]const u8{
":1:19: error: expected ';' after declaration",
":1:20: note: invalid byte: '\\r'",
});
}

{
const case = ctx.obj("missing semicolon at EOF", .{});
case.addError(
Expand Down