Fix a decent amount of string literal edge cases

Fixes some data string literals, DLGINCLUDE strings, and accelerator string edge cases.
squeek502 · Mar 28, 2024 · fda2c68 · fda2c68
1 parent 28b222f
commit fda2c68
Show file tree

Hide file tree

Showing 11 changed files with 737 additions and 130 deletions.
diff --git a/build.zig b/build.zig
@@ -122,6 +122,8 @@ pub fn build(b: *std.Build) void {
     _ = addFuzzyTest(b, "stringtable", mode, target, resinator, all_fuzzy_tests_step, test_options);
     _ = addFuzzyTest(b, "fonts", mode, target, resinator, all_fuzzy_tests_step, test_options);
     _ = addFuzzyTest(b, "dlginclude", mode, target, resinator, all_fuzzy_tests_step, test_options);
+    _ = addFuzzyTest(b, "strings", mode, target, resinator, all_fuzzy_tests_step, test_options);
+    _ = addFuzzyTest(b, "accelerators", mode, target, resinator, all_fuzzy_tests_step, test_options);
 
     _ = addFuzzer(b, "fuzz_rc", &.{}, resinator, target);
 

diff --git a/src/compile.zig b/src/compile.zig
@@ -401,19 +401,12 @@ pub const Compiler = struct {
         return first_error orelse error.FileNotFound;
     }
 
+    /// Returns a Windows-1252 encoded string regardless of the current output code page.
+    /// All codepoints are encoded as a maximum of 2 bytes, where unescaped codepoints
+    /// >= 0x10000 are encoded as `??` and everything else is encoded as 1 byte.
     pub fn parseDlgIncludeString(self: *Compiler, token: Token) ![]u8 {
-        // For the purposes of parsing, we want to strip the L prefix
-        // if it exists since we want escaped integers to be limited to
-        // their ascii string range.
-        //
-        // We keep track of whether or not there was an L prefix, though,
-        // since there's more weirdness to come.
-        var bytes = self.sourceBytesForToken(token);
-        var was_wide_string = false;
-        if (bytes.slice[0] == 'L' or bytes.slice[0] == 'l') {
-            was_wide_string = true;
-            bytes.slice = bytes.slice[1..];
-        }
+        const bytes = self.sourceBytesForToken(token);
+        const output_code_page = self.output_code_pages.getForToken(token);
 
         var buf = try std.ArrayList(u8).initCapacity(self.allocator, bytes.slice.len);
         errdefer buf.deinit();
@@ -423,34 +416,38 @@ pub const Compiler = struct {
             .diagnostics = .{ .diagnostics = self.diagnostics, .token = token },
         });
 
-        // No real idea what's going on here, but this matches the rc.exe behavior
+        // This is similar to the logic in parseQuotedString, but ends up with everything
+        // encoded as Windows-1252. This effectively consolidates the two-step process
+        // of rc.exe into one step, since rc.exe's preprocessor converts to UTF-16 (this
+        // is when invalid sequences are replaced by the replacement character (U+FFFD)),
+        // and then that's run through the parser. Our preprocessor keeps things in their
+        // original encoding, meaning we emulate the <encoding> -> UTF-16 -> Windows-1252
+        // results all at once.
         while (try iterative_parser.next()) |parsed| {
             const c = parsed.codepoint;
-            switch (was_wide_string) {
-                true => {
-                    switch (c) {
-                        0...0x7F, 0xA0...0xFF => try buf.append(@intCast(c)),
-                        0x80...0x9F => {
-                            if (windows1252.bestFitFromCodepoint(c)) |_| {
-                                try buf.append(@intCast(c));
-                            } else {
-                                try buf.append('?');
-                            }
-                        },
-                        else => {
-                            if (windows1252.bestFitFromCodepoint(c)) |best_fit| {
-                                try buf.append(best_fit);
-                            } else if (c < 0x10000 or c == code_pages.Codepoint.invalid) {
-                                try buf.append('?');
-                            } else {
-                                try buf.appendSlice("??");
-                            }
-                        },
+            switch (iterative_parser.declared_string_type) {
+                .wide => {
+                    if (windows1252.bestFitFromCodepoint(c)) |best_fit| {
+                        try buf.append(best_fit);
+                    } else if (c < 0x10000 or c == code_pages.Codepoint.invalid or parsed.escaped_surrogate_pair) {
+                        try buf.append('?');
+                    } else {
+                        try buf.appendSlice("??");
                     }
                 },
-                false => {
+                .ascii => {
                     if (parsed.from_escaped_integer) {
-                        try buf.append(@truncate(c));
+                        const truncated: u8 = @truncate(c);
+                        switch (output_code_page) {
+                            .utf8 => switch (truncated) {
+                                0...0x7F => try buf.append(truncated),
+                                else => try buf.append('?'),
+                            },
+                            .windows1252 => {
+                                try buf.append(truncated);
+                            },
+                            else => unreachable, // unsupported code page
+                        }
                     } else {
                         if (windows1252.bestFitFromCodepoint(c)) |best_fit| {
                             try buf.append(best_fit);
@@ -484,6 +481,10 @@ pub const Compiler = struct {
             const parsed_filename_terminated = std.mem.sliceTo(parsed_filename, 0);
 
             header.applyMemoryFlags(node.common_resource_attributes, self.source);
+            // This is effectively limited by `max_string_literal_codepoints` which is a u15.
+            // Each codepoint within a DLGINCLUDE string is encoded as a maximum of
+            // 2 bytes, which means that the maximum byte length of a DLGINCLUDE string is
+            // (including the NUL terminator): 32,767 * 2 + 1 = 65,535 or exactly the u16 max.
             header.data_size = @intCast(parsed_filename_terminated.len + 1);
             try header.write(writer, .{ .diagnostics = self.diagnostics, .token = node.id });
             try writer.writeAll(parsed_filename_terminated);
@@ -1298,6 +1299,7 @@ pub const Compiler = struct {
             return res.parseAcceleratorKeyString(bytes, is_virt, .{
                 .start_column = column,
                 .diagnostics = .{ .diagnostics = self.diagnostics, .token = literal.token },
+                .output_code_page = self.output_code_pages.getForToken(literal.token),
             });
         }
     }

diff --git a/src/literals.zig b/src/literals.zig
@@ -98,12 +98,24 @@ pub const IterativeStringParser = struct {
 
     pub const ParsedCodepoint = struct {
         codepoint: u21,
-        /// Note: If this is true, `codepoint` will be a value with a max of maxInt(u16).
-        /// This is enforced by using saturating arithmetic, so in e.g. a wide string literal the
-        /// octal escape sequence \7777777 (2,097,151) will be parsed into the value 0xFFFF (65,535).
-        /// If the value needs to be truncated to a smaller integer (for ASCII string literals), then that
-        /// must be done by the caller.
+        /// Note: If this is true, `codepoint` will have an effective maximum value
+        /// of 0xFFFF, as `codepoint` is calculated using wrapping arithmetic on a u16.
+        /// If the value needs to be truncated to a smaller integer (e.g. for ASCII string
+        /// literals), then that must be done by the caller.
         from_escaped_integer: bool = false,
+        /// Denotes that the codepoint is:
+        /// - Escaped (has a \ in front of it), and
+        /// - Has a value >= U+10000, meaning it would be encoded as a surrogate
+        ///   pair in UTF-16, and
+        /// - Is part of a wide string literal
+        ///
+        /// Normally in wide string literals, invalid escapes are omitted
+        /// during parsing (the codepoints are not returned at all during
+        /// the `next` call), but this is a special case in which the
+        /// escape only applies to the high surrogate pair of the codepoint.
+        ///
+        /// TODO: Maybe just return the low surrogate codepoint by itself in this case.
+        escaped_surrogate_pair: bool = false,
     };
 
     pub fn next(self: *IterativeStringParser) std.mem.Allocator.Error!?ParsedCodepoint {
@@ -269,7 +281,63 @@ pub const IterativeStringParser = struct {
                                 backtrack = true;
                             },
                             else => switch (self.declared_string_type) {
-                                .wide => {}, // invalid escape sequences are skipped in wide strings
+                                .wide => {
+                                    // All invalid escape sequences are skipped in wide strings,
+                                    // but there is a special case around \<tab> where the \
+                                    // is skipped but the tab character is processed.
+                                    // It's actually a bit weirder than that, though, since
+                                    // the preprocessor is the one that does the <tab> -> spaces
+                                    // conversion, so it goes something like this:
+                                    //
+                                    // Before preprocessing: L"\<tab>"
+                                    // After preprocessing:  L"\     "
+                                    //
+                                    // So the parser only sees an escaped space character followed
+                                    // by some other number of spaces >= 0.
+                                    //
+                                    // However, our preprocessor keeps tab characters intact, so we emulate
+                                    // the above behavior by skipping the \ and then outputting one less
+                                    // space than normal for the <tab> character.
+                                    if (c == '\t') {
+                                        // Only warn about a tab getting converted to spaces once per string
+                                        if (self.diagnostics != null and !self.seen_tab) {
+                                            try self.diagnostics.?.diagnostics.append(ErrorDetails{
+                                                .err = .tab_converted_to_spaces,
+                                                .type = .warning,
+                                                .token = self.diagnostics.?.token,
+                                            });
+                                            try self.diagnostics.?.diagnostics.append(ErrorDetails{
+                                                .err = .tab_converted_to_spaces,
+                                                .type = .note,
+                                                .token = self.diagnostics.?.token,
+                                                .print_source_line = false,
+                                            });
+                                            self.seen_tab = true;
+                                        }
+
+                                        const cols = columnsUntilTabStop(self.column, 8);
+                                        // If the tab character would only be converted to a single space,
+                                        // then we can just skip both the \ and the <tab> and move on.
+                                        if (cols > 1) {
+                                            self.num_pending_spaces = @intCast(cols - 2);
+                                            self.index += codepoint.byte_len;
+                                            return .{ .codepoint = ' ' };
+                                        }
+                                    }
+                                    // There's a second special case when the codepoint would be encoded
+                                    // as a surrogate pair in UTF-16, as the escape 'applies' to the
+                                    // high surrogate pair only in this instance. This is a side-effect
+                                    // of the Win32 RC compiler preprocessor outputting UTF-16 and the
+                                    // compiler itself seemingly working on code units instead of code points
+                                    // in this particular instance.
+                                    //
+                                    // We emulate this behavior by emitting the codepoint, but with a marker
+                                    // that indicates that it needs to be handled specially.
+                                    if (c >= 0x10000 and c != code_pages.Codepoint.invalid) {
+                                        self.index += codepoint.byte_len;
+                                        return .{ .codepoint = c, .escaped_surrogate_pair = true };
+                                    }
+                                },
                                 .ascii => {
                                     // we intentionally avoid incrementing self.index
                                     // to handle the current char in the next call,
@@ -303,6 +371,9 @@ pub const IterativeStringParser = struct {
                 },
                 .escaped_octal => switch (c) {
                     '0'...'7' => {
+                        // Note: We use wrapping arithmetic on a u16 here since there's been no observed
+                        // string parsing scenario where an escaped integer with a value >= the u16
+                        // max is interpreted as anything but the truncated u16 value.
                         string_escape_n *%= 8;
                         string_escape_n +%= std.fmt.charToDigit(@intCast(c), 8) catch unreachable;
                         string_escape_i += 1;
@@ -389,46 +460,51 @@ pub fn parseQuotedString(
 
     while (try iterative_parser.next()) |parsed| {
         const c = parsed.codepoint;
-        if (parsed.from_escaped_integer) {
-            // We truncate here to get the correct behavior for ascii strings
-            try buf.append(std.mem.nativeToLittle(T, @truncate(c)));
-        } else {
-            switch (literal_type) {
-                .ascii => switch (options.output_code_page) {
-                    .windows1252 => {
-                        if (windows1252.bestFitFromCodepoint(c)) |best_fit| {
-                            try buf.append(best_fit);
-                        } else if (c < 0x10000 or c == code_pages.Codepoint.invalid) {
-                            try buf.append('?');
-                        } else {
-                            try buf.appendSlice("??");
-                        }
-                    },
-                    .utf8 => {
-                        var codepoint_to_encode = c;
-                        if (c == code_pages.Codepoint.invalid) {
-                            codepoint_to_encode = '�';
-                        }
-                        var utf8_buf: [4]u8 = undefined;
-                        const utf8_len = std.unicode.utf8Encode(codepoint_to_encode, &utf8_buf) catch unreachable;
-                        try buf.appendSlice(utf8_buf[0..utf8_len]);
-                    },
-                    else => unreachable, // Unsupported code page
-                },
-                .wide => {
-                    if (c == code_pages.Codepoint.invalid) {
-                        try buf.append(std.mem.nativeToLittle(u16, '�'));
-                    } else if (c < 0x10000) {
-                        const short: u16 = @intCast(c);
-                        try buf.append(std.mem.nativeToLittle(u16, short));
+        switch (literal_type) {
+            .ascii => switch (options.output_code_page) {
+                .windows1252 => {
+                    if (parsed.from_escaped_integer) {
+                        try buf.append(@truncate(c));
+                    } else if (windows1252.bestFitFromCodepoint(c)) |best_fit| {
+                        try buf.append(best_fit);
+                    } else if (c < 0x10000 or c == code_pages.Codepoint.invalid) {
+                        try buf.append('?');
                     } else {
+                        try buf.appendSlice("??");
+                    }
+                },
+                .utf8 => {
+                    var codepoint_to_encode = c;
+                    if (parsed.from_escaped_integer) {
+                        codepoint_to_encode = @as(T, @truncate(c));
+                    }
+                    const escaped_integer_outside_ascii_range = parsed.from_escaped_integer and codepoint_to_encode > 0x7F;
+                    if (escaped_integer_outside_ascii_range or c == code_pages.Codepoint.invalid) {
+                        codepoint_to_encode = '�';
+                    }
+                    var utf8_buf: [4]u8 = undefined;
+                    const utf8_len = std.unicode.utf8Encode(codepoint_to_encode, &utf8_buf) catch unreachable;
+                    try buf.appendSlice(utf8_buf[0..utf8_len]);
+                },
+                else => unreachable, // Unsupported code page
+            },
+            .wide => {
+                if (parsed.from_escaped_integer) {
+                    try buf.append(std.mem.nativeToLittle(u16, @truncate(c)));
+                } else if (c == code_pages.Codepoint.invalid) {
+                    try buf.append(std.mem.nativeToLittle(u16, '�'));
+                } else if (c < 0x10000) {
+                    const short: u16 = @intCast(c);
+                    try buf.append(std.mem.nativeToLittle(u16, short));
+                } else {
+                    if (!parsed.escaped_surrogate_pair) {
                         const high = @as(u16, @intCast((c - 0x10000) >> 10)) + 0xD800;
                         try buf.append(std.mem.nativeToLittle(u16, high));
-                        const low = @as(u16, @intCast(c & 0x3FF)) + 0xDC00;
-                        try buf.append(std.mem.nativeToLittle(u16, low));
                     }
-                },
-            }
+                    const low = @as(u16, @intCast(c & 0x3FF)) + 0xDC00;
+                    try buf.append(std.mem.nativeToLittle(u16, low));
+                }
+            },
         }
     }
 
@@ -652,6 +728,18 @@ test "parse quoted ascii string with utf8 code page" {
     ));
 }
 
+test "parse quoted string with different input/output code pages" {
+    var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator);
+    defer arena_allocator.deinit();
+    const arena = arena_allocator.allocator();
+
+    try std.testing.expectEqualSlices(u8, "€���\x60\x7F", try parseQuotedAsciiString(
+        arena,
+        .{ .slice = "\"\x80\\x8a\\600\\612\\540\\577\"", .code_page = .windows1252 },
+        .{ .output_code_page = .utf8 },
+    ));
+}
+
 test "parse quoted wide string" {
     var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator);
     defer arena_allocator.deinit();