Skip to content

Commit

Permalink
Fix a decent amount of string literal edge cases
Browse files Browse the repository at this point in the history
Fixes some data string literals, DLGINCLUDE strings, and accelerator string edge cases.
  • Loading branch information
squeek502 committed Mar 28, 2024
1 parent 28b222f commit fda2c68
Show file tree
Hide file tree
Showing 11 changed files with 737 additions and 130 deletions.
2 changes: 2 additions & 0 deletions build.zig
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,8 @@ pub fn build(b: *std.Build) void {
_ = addFuzzyTest(b, "stringtable", mode, target, resinator, all_fuzzy_tests_step, test_options);
_ = addFuzzyTest(b, "fonts", mode, target, resinator, all_fuzzy_tests_step, test_options);
_ = addFuzzyTest(b, "dlginclude", mode, target, resinator, all_fuzzy_tests_step, test_options);
_ = addFuzzyTest(b, "strings", mode, target, resinator, all_fuzzy_tests_step, test_options);
_ = addFuzzyTest(b, "accelerators", mode, target, resinator, all_fuzzy_tests_step, test_options);

_ = addFuzzer(b, "fuzz_rc", &.{}, resinator, target);

Expand Down
72 changes: 37 additions & 35 deletions src/compile.zig
Original file line number Diff line number Diff line change
Expand Up @@ -401,19 +401,12 @@ pub const Compiler = struct {
return first_error orelse error.FileNotFound;
}

/// Returns a Windows-1252 encoded string regardless of the current output code page.
/// All codepoints are encoded as a maximum of 2 bytes, where unescaped codepoints
/// >= 0x10000 are encoded as `??` and everything else is encoded as 1 byte.
pub fn parseDlgIncludeString(self: *Compiler, token: Token) ![]u8 {
// For the purposes of parsing, we want to strip the L prefix
// if it exists since we want escaped integers to be limited to
// their ascii string range.
//
// We keep track of whether or not there was an L prefix, though,
// since there's more weirdness to come.
var bytes = self.sourceBytesForToken(token);
var was_wide_string = false;
if (bytes.slice[0] == 'L' or bytes.slice[0] == 'l') {
was_wide_string = true;
bytes.slice = bytes.slice[1..];
}
const bytes = self.sourceBytesForToken(token);
const output_code_page = self.output_code_pages.getForToken(token);

var buf = try std.ArrayList(u8).initCapacity(self.allocator, bytes.slice.len);
errdefer buf.deinit();
Expand All @@ -423,34 +416,38 @@ pub const Compiler = struct {
.diagnostics = .{ .diagnostics = self.diagnostics, .token = token },
});

// No real idea what's going on here, but this matches the rc.exe behavior
// This is similar to the logic in parseQuotedString, but ends up with everything
// encoded as Windows-1252. This effectively consolidates the two-step process
// of rc.exe into one step, since rc.exe's preprocessor converts to UTF-16 (this
// is when invalid sequences are replaced by the replacement character (U+FFFD)),
// and then that's run through the parser. Our preprocessor keeps things in their
// original encoding, meaning we emulate the <encoding> -> UTF-16 -> Windows-1252
// results all at once.
while (try iterative_parser.next()) |parsed| {
const c = parsed.codepoint;
switch (was_wide_string) {
true => {
switch (c) {
0...0x7F, 0xA0...0xFF => try buf.append(@intCast(c)),
0x80...0x9F => {
if (windows1252.bestFitFromCodepoint(c)) |_| {
try buf.append(@intCast(c));
} else {
try buf.append('?');
}
},
else => {
if (windows1252.bestFitFromCodepoint(c)) |best_fit| {
try buf.append(best_fit);
} else if (c < 0x10000 or c == code_pages.Codepoint.invalid) {
try buf.append('?');
} else {
try buf.appendSlice("??");
}
},
switch (iterative_parser.declared_string_type) {
.wide => {
if (windows1252.bestFitFromCodepoint(c)) |best_fit| {
try buf.append(best_fit);
} else if (c < 0x10000 or c == code_pages.Codepoint.invalid or parsed.escaped_surrogate_pair) {
try buf.append('?');
} else {
try buf.appendSlice("??");
}
},
false => {
.ascii => {
if (parsed.from_escaped_integer) {
try buf.append(@truncate(c));
const truncated: u8 = @truncate(c);
switch (output_code_page) {
.utf8 => switch (truncated) {
0...0x7F => try buf.append(truncated),
else => try buf.append('?'),
},
.windows1252 => {
try buf.append(truncated);
},
else => unreachable, // unsupported code page
}
} else {
if (windows1252.bestFitFromCodepoint(c)) |best_fit| {
try buf.append(best_fit);
Expand Down Expand Up @@ -484,6 +481,10 @@ pub const Compiler = struct {
const parsed_filename_terminated = std.mem.sliceTo(parsed_filename, 0);

header.applyMemoryFlags(node.common_resource_attributes, self.source);
// This is effectively limited by `max_string_literal_codepoints` which is a u15.
// Each codepoint within a DLGINCLUDE string is encoded as a maximum of
// 2 bytes, which means that the maximum byte length of a DLGINCLUDE string is
// (including the NUL terminator): 32,767 * 2 + 1 = 65,535 or exactly the u16 max.
header.data_size = @intCast(parsed_filename_terminated.len + 1);
try header.write(writer, .{ .diagnostics = self.diagnostics, .token = node.id });
try writer.writeAll(parsed_filename_terminated);
Expand Down Expand Up @@ -1298,6 +1299,7 @@ pub const Compiler = struct {
return res.parseAcceleratorKeyString(bytes, is_virt, .{
.start_column = column,
.diagnostics = .{ .diagnostics = self.diagnostics, .token = literal.token },
.output_code_page = self.output_code_pages.getForToken(literal.token),
});
}
}
Expand Down
172 changes: 130 additions & 42 deletions src/literals.zig
Original file line number Diff line number Diff line change
Expand Up @@ -98,12 +98,24 @@ pub const IterativeStringParser = struct {

pub const ParsedCodepoint = struct {
codepoint: u21,
/// Note: If this is true, `codepoint` will be a value with a max of maxInt(u16).
/// This is enforced by using saturating arithmetic, so in e.g. a wide string literal the
/// octal escape sequence \7777777 (2,097,151) will be parsed into the value 0xFFFF (65,535).
/// If the value needs to be truncated to a smaller integer (for ASCII string literals), then that
/// must be done by the caller.
/// Note: If this is true, `codepoint` will have an effective maximum value
/// of 0xFFFF, as `codepoint` is calculated using wrapping arithmetic on a u16.
/// If the value needs to be truncated to a smaller integer (e.g. for ASCII string
/// literals), then that must be done by the caller.
from_escaped_integer: bool = false,
/// Denotes that the codepoint is:
/// - Escaped (has a \ in front of it), and
/// - Has a value >= U+10000, meaning it would be encoded as a surrogate
/// pair in UTF-16, and
/// - Is part of a wide string literal
///
/// Normally in wide string literals, invalid escapes are omitted
/// during parsing (the codepoints are not returned at all during
/// the `next` call), but this is a special case in which the
/// escape only applies to the high surrogate pair of the codepoint.
///
/// TODO: Maybe just return the low surrogate codepoint by itself in this case.
escaped_surrogate_pair: bool = false,
};

pub fn next(self: *IterativeStringParser) std.mem.Allocator.Error!?ParsedCodepoint {
Expand Down Expand Up @@ -269,7 +281,63 @@ pub const IterativeStringParser = struct {
backtrack = true;
},
else => switch (self.declared_string_type) {
.wide => {}, // invalid escape sequences are skipped in wide strings
.wide => {
// All invalid escape sequences are skipped in wide strings,
// but there is a special case around \<tab> where the \
// is skipped but the tab character is processed.
// It's actually a bit weirder than that, though, since
// the preprocessor is the one that does the <tab> -> spaces
// conversion, so it goes something like this:
//
// Before preprocessing: L"\<tab>"
// After preprocessing: L"\ "
//
// So the parser only sees an escaped space character followed
// by some other number of spaces >= 0.
//
// However, our preprocessor keeps tab characters intact, so we emulate
// the above behavior by skipping the \ and then outputting one less
// space than normal for the <tab> character.
if (c == '\t') {
// Only warn about a tab getting converted to spaces once per string
if (self.diagnostics != null and !self.seen_tab) {
try self.diagnostics.?.diagnostics.append(ErrorDetails{
.err = .tab_converted_to_spaces,
.type = .warning,
.token = self.diagnostics.?.token,
});
try self.diagnostics.?.diagnostics.append(ErrorDetails{
.err = .tab_converted_to_spaces,
.type = .note,
.token = self.diagnostics.?.token,
.print_source_line = false,
});
self.seen_tab = true;
}

const cols = columnsUntilTabStop(self.column, 8);
// If the tab character would only be converted to a single space,
// then we can just skip both the \ and the <tab> and move on.
if (cols > 1) {
self.num_pending_spaces = @intCast(cols - 2);
self.index += codepoint.byte_len;
return .{ .codepoint = ' ' };
}
}
// There's a second special case when the codepoint would be encoded
// as a surrogate pair in UTF-16, as the escape 'applies' to the
// high surrogate pair only in this instance. This is a side-effect
// of the Win32 RC compiler preprocessor outputting UTF-16 and the
// compiler itself seemingly working on code units instead of code points
// in this particular instance.
//
// We emulate this behavior by emitting the codepoint, but with a marker
// that indicates that it needs to be handled specially.
if (c >= 0x10000 and c != code_pages.Codepoint.invalid) {
self.index += codepoint.byte_len;
return .{ .codepoint = c, .escaped_surrogate_pair = true };
}
},
.ascii => {
// we intentionally avoid incrementing self.index
// to handle the current char in the next call,
Expand Down Expand Up @@ -303,6 +371,9 @@ pub const IterativeStringParser = struct {
},
.escaped_octal => switch (c) {
'0'...'7' => {
// Note: We use wrapping arithmetic on a u16 here since there's been no observed
// string parsing scenario where an escaped integer with a value >= the u16
// max is interpreted as anything but the truncated u16 value.
string_escape_n *%= 8;
string_escape_n +%= std.fmt.charToDigit(@intCast(c), 8) catch unreachable;
string_escape_i += 1;
Expand Down Expand Up @@ -389,46 +460,51 @@ pub fn parseQuotedString(

while (try iterative_parser.next()) |parsed| {
const c = parsed.codepoint;
if (parsed.from_escaped_integer) {
// We truncate here to get the correct behavior for ascii strings
try buf.append(std.mem.nativeToLittle(T, @truncate(c)));
} else {
switch (literal_type) {
.ascii => switch (options.output_code_page) {
.windows1252 => {
if (windows1252.bestFitFromCodepoint(c)) |best_fit| {
try buf.append(best_fit);
} else if (c < 0x10000 or c == code_pages.Codepoint.invalid) {
try buf.append('?');
} else {
try buf.appendSlice("??");
}
},
.utf8 => {
var codepoint_to_encode = c;
if (c == code_pages.Codepoint.invalid) {
codepoint_to_encode = '�';
}
var utf8_buf: [4]u8 = undefined;
const utf8_len = std.unicode.utf8Encode(codepoint_to_encode, &utf8_buf) catch unreachable;
try buf.appendSlice(utf8_buf[0..utf8_len]);
},
else => unreachable, // Unsupported code page
},
.wide => {
if (c == code_pages.Codepoint.invalid) {
try buf.append(std.mem.nativeToLittle(u16, '�'));
} else if (c < 0x10000) {
const short: u16 = @intCast(c);
try buf.append(std.mem.nativeToLittle(u16, short));
switch (literal_type) {
.ascii => switch (options.output_code_page) {
.windows1252 => {
if (parsed.from_escaped_integer) {
try buf.append(@truncate(c));
} else if (windows1252.bestFitFromCodepoint(c)) |best_fit| {
try buf.append(best_fit);
} else if (c < 0x10000 or c == code_pages.Codepoint.invalid) {
try buf.append('?');
} else {
try buf.appendSlice("??");
}
},
.utf8 => {
var codepoint_to_encode = c;
if (parsed.from_escaped_integer) {
codepoint_to_encode = @as(T, @truncate(c));
}
const escaped_integer_outside_ascii_range = parsed.from_escaped_integer and codepoint_to_encode > 0x7F;
if (escaped_integer_outside_ascii_range or c == code_pages.Codepoint.invalid) {
codepoint_to_encode = '�';
}
var utf8_buf: [4]u8 = undefined;
const utf8_len = std.unicode.utf8Encode(codepoint_to_encode, &utf8_buf) catch unreachable;
try buf.appendSlice(utf8_buf[0..utf8_len]);
},
else => unreachable, // Unsupported code page
},
.wide => {
if (parsed.from_escaped_integer) {
try buf.append(std.mem.nativeToLittle(u16, @truncate(c)));
} else if (c == code_pages.Codepoint.invalid) {
try buf.append(std.mem.nativeToLittle(u16, '�'));
} else if (c < 0x10000) {
const short: u16 = @intCast(c);
try buf.append(std.mem.nativeToLittle(u16, short));
} else {
if (!parsed.escaped_surrogate_pair) {
const high = @as(u16, @intCast((c - 0x10000) >> 10)) + 0xD800;
try buf.append(std.mem.nativeToLittle(u16, high));
const low = @as(u16, @intCast(c & 0x3FF)) + 0xDC00;
try buf.append(std.mem.nativeToLittle(u16, low));
}
},
}
const low = @as(u16, @intCast(c & 0x3FF)) + 0xDC00;
try buf.append(std.mem.nativeToLittle(u16, low));
}
},
}
}

Expand Down Expand Up @@ -652,6 +728,18 @@ test "parse quoted ascii string with utf8 code page" {
));
}

test "parse quoted string with different input/output code pages" {
var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator);
defer arena_allocator.deinit();
const arena = arena_allocator.allocator();

try std.testing.expectEqualSlices(u8, "€���\x60\x7F", try parseQuotedAsciiString(
arena,
.{ .slice = "\"\x80\\x8a\\600\\612\\540\\577\"", .code_page = .windows1252 },
.{ .output_code_page = .utf8 },
));
}

test "parse quoted wide string" {
var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator);
defer arena_allocator.deinit();
Expand Down
Loading

0 comments on commit fda2c68

Please sign in to comment.