Scan from line start when finding tag in tokenizer
This resolves a crash that can occur for invalid bytes, such as carriage
returns, which are valid characters when not parsed from within literals.

This could potentially resolve other edge cases as well, since the calling
code (Ast.tokenSlice) didn't account for any 'pending_invalid_token's that
the tokenizer could queue up from within another state.
moosichu committed Sep 5, 2022
1 parent 3deb33f commit 533901a
Showing 2 changed files with 33 additions and 1 deletion.
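
The crash path is the assert in Ast.tokenSlice shown below: it rebuilds a
tokenizer at the token's start offset and expects next() to reproduce the same
tag, which fails for an .invalid token that was queued as a
pending_invalid_token while a literal was being tokenized. The following
sketch of that mismatch is not part of the commit; the source string and
driver are illustrative and assume the std.zig.Tokenizer API as of this
change.

const std = @import("std");

pub fn main() void {
    // A carriage return inside a string literal is an invalid byte; the
    // tokenizer queues it as a 'pending_invalid_token' while scanning the
    // literal and returns it as an .invalid token from a later next() call.
    const source: [:0]const u8 = "const s = \"a\rb\";";

    var tokenizer = std.zig.Tokenizer.init(source);
    var queued: ?std.zig.Token = null;
    while (true) {
        const tok = tokenizer.next();
        if (tok.tag == .invalid) queued = tok;
        if (tok.tag == .eof) break;
    }
    const invalid = queued orelse return;

    // What the pre-fix Ast.tokenSlice did: rebuild a tokenizer at the
    // token's start offset and call next(). Outside the literal the carriage
    // return is no longer an invalid byte, so the returned tag can differ
    // from .invalid, tripping assert(token.tag == token_tag).
    var retok: std.zig.Tokenizer = .{
        .buffer = source,
        .index = invalid.loc.start,
        .pending_invalid_token = null,
    };
    const reparsed = retok.next();
    std.debug.print("queued tag: {s}, re-tokenized tag: {s}\n", .{
        @tagName(invalid.tag), @tagName(reparsed.tag),
    });
}
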
2 changes: 1 addition & 1 deletion lib/std/zig/Ast.zig
@@ -116,7 +116,7 @@ pub fn tokenSlice(tree: Ast, token_index: TokenIndex) []const u8 {
         .index = token_starts[token_index],
         .pending_invalid_token = null,
     };
-    const token = tokenizer.next();
+    const token = tokenizer.findTagAtCurrentIndex(token_tag);
     assert(token.tag == token_tag);
     return tree.source[token.loc.start..token.loc.end];
 }
32 changes: 32 additions & 0 deletions lib/std/zig/tokenizer.zig
@@ -422,6 +422,38 @@ pub const Tokenizer = struct {
         saw_at_sign,
     };

+    /// This is a workaround for the fact that the tokenizer can queue up
+    /// 'pending_invalid_token's when parsing literals, which means that we
+    /// need to scan from the start of the current line to find a matching
+    /// tag - just in case it was an invalid character generated during
+    /// literal tokenization. Ideally this processing would be pushed to the
+    /// AST parser or another later stage, both to give more useful error
+    /// messages with that extra context and to be able to remove this
+    /// workaround.
+    pub fn findTagAtCurrentIndex(self: *Tokenizer, tag: Token.Tag) Token {
+        if (tag == .invalid) {
+            const target_index = self.index;
+            var starting_index = target_index;
+            while (starting_index > 0) {
+                if (self.buffer[starting_index] == '\n') {
+                    break;
+                }
+                starting_index -= 1;
+            }
+
+            self.index = starting_index;
+            while (self.index <= target_index or self.pending_invalid_token != null) {
+                const result = self.next();
+                if (result.loc.start == target_index and result.tag == tag) {
+                    return result;
+                }
+            }
+            unreachable;
+        } else {
+            return self.next();
+        }
+    }
+
     pub fn next(self: *Tokenizer) Token {
         if (self.pending_invalid_token) |token| {
             self.pending_invalid_token = null;
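
As a usage sketch of the new helper (again not part of the commit; the input
is made up and it assumes a std.zig build that contains this patch), this
mirrors the updated call site in Ast.tokenSlice:

const std = @import("std");

pub fn main() void {
    const source: [:0]const u8 = "const s = \"a\rb\";";

    // Locate the queued .invalid token for the carriage return so we have a
    // realistic start offset to hand to findTagAtCurrentIndex.
    var scan = std.zig.Tokenizer.init(source);
    var invalid_start: ?usize = null;
    while (true) {
        const tok = scan.next();
        if (tok.tag == .invalid) invalid_start = tok.loc.start;
        if (tok.tag == .eof) break;
    }
    const start = invalid_start orelse return;

    // What the patched Ast.tokenSlice now does: position a tokenizer at the
    // token's start offset and ask for a token with the expected tag.
    // findTagAtCurrentIndex rescans from the beginning of the current line,
    // so the pending invalid token generated inside the literal is replayed
    // and the matching .invalid token is found at exactly this offset.
    var tokenizer: std.zig.Tokenizer = .{
        .buffer = source,
        .index = start,
        .pending_invalid_token = null,
    };
    const token = tokenizer.findTagAtCurrentIndex(.invalid);
    std.debug.print("found {s} at {d}..{d}\n", .{
        @tagName(token.tag), token.loc.start, token.loc.end,
    });
}

For any tag other than .invalid the helper simply forwards to next(), so the
common tokenSlice path is unchanged.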
