Skip to content

Commit

Permalink
Only accept carriage returns before line feeds
Browse files Browse the repository at this point in the history
The previous commit was much less strict about this. This change more closely
matches the desired spec: CR characters are only allowed as part of a CRLF
pair, and are rejected everywhere else.
  • Loading branch information
moosichu committed Sep 5, 2022
1 parent 2f2d7cf commit 3121238
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 30 deletions.
34 changes: 27 additions & 7 deletions lib/std/zig/tokenizer.zig
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,7 @@ pub const Tokenizer = struct {
switch (state) {
.start => switch (c) {
0 => break,
' ', '\n', '\t', '\r' => {
' ', '\n', '\t' => {
result.loc.start = self.index + 1;
},
'"' => {
Expand Down Expand Up @@ -597,6 +597,18 @@ pub const Tokenizer = struct {
state = .int_literal_dec;
result.tag = .integer_literal;
},
'\r' => {
// Carriage returns are *only* allowed just before a linefeed as part of a CRLF pair, otherwise
// they constitute an illegal byte!
if (self.index + 1 < self.buffer.len and self.buffer[self.index + 1] == '\n') {
result.loc.start = self.index + 1;
} else {
result.tag = .invalid;
result.loc.end = self.index;
self.index += 1;
return result;
}
},
else => {
result.tag = .invalid;
result.loc.end = self.index;
Expand Down Expand Up @@ -935,7 +947,7 @@ pub const Tokenizer = struct {
self.index += 1;
break;
},
'\t', '\r' => {},
'\t' => {},
else => self.checkLiteralCharacter(),
},

Expand Down Expand Up @@ -1169,7 +1181,7 @@ pub const Tokenizer = struct {
state = .start;
result.loc.start = self.index + 1;
},
'\t', '\r' => state = .line_comment,
'\t' => state = .line_comment,
else => {
state = .line_comment;
self.checkLiteralCharacter();
Expand All @@ -1183,7 +1195,7 @@ pub const Tokenizer = struct {
result.tag = .doc_comment;
break;
},
'\t', '\r' => {
'\t' => {
state = .doc_comment;
result.tag = .doc_comment;
},
Expand All @@ -1199,12 +1211,12 @@ pub const Tokenizer = struct {
state = .start;
result.loc.start = self.index + 1;
},
'\t', '\r' => {},
'\t' => {},
else => self.checkLiteralCharacter(),
},
.doc_comment => switch (c) {
0, '\n' => break,
'\t', '\r' => {},
'\t' => {},
else => self.checkLiteralCharacter(),
},
.zero => switch (c) {
Expand Down Expand Up @@ -1465,7 +1477,15 @@ pub const Tokenizer = struct {
fn getInvalidCharacterLength(self: *Tokenizer) u3 {
const c0 = self.buffer[self.index];
if (std.ascii.isASCII(c0)) {
if (std.ascii.isCntrl(c0)) {
if (c0 == '\r') {
if (self.index + 1 < self.buffer.len and self.buffer[self.index + 1] == '\n') {
// Carriage returns are *only* allowed just before a linefeed as part of a CRLF pair, otherwise
// they constitute an illegal byte!
return 0;
} else {
return 1;
}
} else if (std.ascii.isCntrl(c0)) {
// ascii control codes are never allowed
// (note that \n was checked before we got here)
return 1;
Expand Down
30 changes: 7 additions & 23 deletions src/AstGen.zig
Original file line number Diff line number Diff line change
Expand Up @@ -9976,35 +9976,19 @@ fn strLitNodeAsString(astgen: *AstGen, node: Ast.Node.Index) !IndexSlice {
var tok_i = start;
{
const slice = tree.tokenSlice(tok_i);
const line_bytes = slice[2 .. slice.len - 1];
const carriage_return_count = mem.count(u8, line_bytes, "\r");
if (carriage_return_count > 0) {
try string_bytes.ensureUnusedCapacity(gpa, line_bytes.len - carriage_return_count);
for (line_bytes) |line_byte| {
if (line_byte == '\r') continue;
string_bytes.appendAssumeCapacity(line_byte);
}
} else {
try string_bytes.appendSlice(gpa, line_bytes);
}
const carriage_return_ending: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
const line_bytes = slice[2 .. slice.len - carriage_return_ending];
try string_bytes.appendSlice(gpa, line_bytes);
tok_i += 1;
}
// Following lines: each line prepends a newline.
while (tok_i <= end) : (tok_i += 1) {
const slice = tree.tokenSlice(tok_i);
const line_bytes = slice[2 .. slice.len - 1];

const carriage_return_count = mem.count(u8, line_bytes, "\r");
try string_bytes.ensureUnusedCapacity(gpa, line_bytes.len - carriage_return_count + 1);
const carriage_return_ending: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
const line_bytes = slice[2 .. slice.len - carriage_return_ending];
try string_bytes.ensureUnusedCapacity(gpa, line_bytes.len + 1);
string_bytes.appendAssumeCapacity('\n');
if (carriage_return_count > 0) {
for (line_bytes) |line_byte| {
if (line_byte == '\r') continue;
string_bytes.appendAssumeCapacity(line_byte);
}
} else {
string_bytes.appendSliceAssumeCapacity(line_bytes);
}
string_bytes.appendSliceAssumeCapacity(line_bytes);
}
const len = string_bytes.items.len - str_index;
try string_bytes.append(gpa, 0);
Expand Down

0 comments on commit 3121238

Please sign in to comment.