diff --git a/build.zig b/build.zig index 0e8d18e..12fdbe7 100644 --- a/build.zig +++ b/build.zig @@ -122,6 +122,8 @@ pub fn build(b: *std.Build) void { _ = addFuzzyTest(b, "stringtable", mode, target, resinator, all_fuzzy_tests_step, test_options); _ = addFuzzyTest(b, "fonts", mode, target, resinator, all_fuzzy_tests_step, test_options); _ = addFuzzyTest(b, "dlginclude", mode, target, resinator, all_fuzzy_tests_step, test_options); + _ = addFuzzyTest(b, "strings", mode, target, resinator, all_fuzzy_tests_step, test_options); + _ = addFuzzyTest(b, "accelerators", mode, target, resinator, all_fuzzy_tests_step, test_options); _ = addFuzzer(b, "fuzz_rc", &.{}, resinator, target); diff --git a/src/compile.zig b/src/compile.zig index ebe741e..f074f08 100644 --- a/src/compile.zig +++ b/src/compile.zig @@ -401,19 +401,12 @@ pub const Compiler = struct { return first_error orelse error.FileNotFound; } + /// Returns a Windows-1252 encoded string regardless of the current output code page. + /// All codepoints are encoded as a maximum of 2 bytes, where unescaped codepoints + /// >= 0x10000 are encoded as `??` and everything else is encoded as 1 byte. pub fn parseDlgIncludeString(self: *Compiler, token: Token) ![]u8 { - // For the purposes of parsing, we want to strip the L prefix - // if it exists since we want escaped integers to be limited to - // their ascii string range. - // - // We keep track of whether or not there was an L prefix, though, - // since there's more weirdness to come. 
- var bytes = self.sourceBytesForToken(token); - var was_wide_string = false; - if (bytes.slice[0] == 'L' or bytes.slice[0] == 'l') { - was_wide_string = true; - bytes.slice = bytes.slice[1..]; - } + const bytes = self.sourceBytesForToken(token); + const output_code_page = self.output_code_pages.getForToken(token); var buf = try std.ArrayList(u8).initCapacity(self.allocator, bytes.slice.len); errdefer buf.deinit(); @@ -423,34 +416,38 @@ pub const Compiler = struct { .diagnostics = .{ .diagnostics = self.diagnostics, .token = token }, }); - // No real idea what's going on here, but this matches the rc.exe behavior + // This is similar to the logic in parseQuotedString, but ends up with everything + // encoded as Windows-1252. This effectively consolidates the two-step process + // of rc.exe into one step, since rc.exe's preprocessor converts to UTF-16 (this + // is when invalid sequences are replaced by the replacement character (U+FFFD)), + // and then that's run through the parser. Our preprocessor keeps things in their + // original encoding, meaning we emulate the <encoding> -> UTF-16 -> Windows-1252 + // results all at once. 
while (try iterative_parser.next()) |parsed| { const c = parsed.codepoint; - switch (was_wide_string) { - true => { - switch (c) { - 0...0x7F, 0xA0...0xFF => try buf.append(@intCast(c)), - 0x80...0x9F => { - if (windows1252.bestFitFromCodepoint(c)) |_| { - try buf.append(@intCast(c)); - } else { - try buf.append('?'); - } - }, - else => { - if (windows1252.bestFitFromCodepoint(c)) |best_fit| { - try buf.append(best_fit); - } else if (c < 0x10000 or c == code_pages.Codepoint.invalid) { - try buf.append('?'); - } else { - try buf.appendSlice("??"); - } - }, + switch (iterative_parser.declared_string_type) { + .wide => { + if (windows1252.bestFitFromCodepoint(c)) |best_fit| { + try buf.append(best_fit); + } else if (c < 0x10000 or c == code_pages.Codepoint.invalid or parsed.escaped_surrogate_pair) { + try buf.append('?'); + } else { + try buf.appendSlice("??"); } }, - false => { + .ascii => { if (parsed.from_escaped_integer) { - try buf.append(@truncate(c)); + const truncated: u8 = @truncate(c); + switch (output_code_page) { + .utf8 => switch (truncated) { + 0...0x7F => try buf.append(truncated), + else => try buf.append('?'), + }, + .windows1252 => { + try buf.append(truncated); + }, + else => unreachable, // unsupported code page + } } else { if (windows1252.bestFitFromCodepoint(c)) |best_fit| { try buf.append(best_fit); @@ -484,6 +481,10 @@ pub const Compiler = struct { const parsed_filename_terminated = std.mem.sliceTo(parsed_filename, 0); header.applyMemoryFlags(node.common_resource_attributes, self.source); + // This is effectively limited by `max_string_literal_codepoints` which is a u15. + // Each codepoint within a DLGINCLUDE string is encoded as a maximum of + // 2 bytes, which means that the maximum byte length of a DLGINCLUDE string is + // (including the NUL terminator): 32,767 * 2 + 1 = 65,535 or exactly the u16 max. 
header.data_size = @intCast(parsed_filename_terminated.len + 1); try header.write(writer, .{ .diagnostics = self.diagnostics, .token = node.id }); try writer.writeAll(parsed_filename_terminated); @@ -1298,6 +1299,7 @@ pub const Compiler = struct { return res.parseAcceleratorKeyString(bytes, is_virt, .{ .start_column = column, .diagnostics = .{ .diagnostics = self.diagnostics, .token = literal.token }, + .output_code_page = self.output_code_pages.getForToken(literal.token), }); } } diff --git a/src/literals.zig b/src/literals.zig index b653e08..dcbd61d 100644 --- a/src/literals.zig +++ b/src/literals.zig @@ -98,12 +98,24 @@ pub const IterativeStringParser = struct { pub const ParsedCodepoint = struct { codepoint: u21, - /// Note: If this is true, `codepoint` will be a value with a max of maxInt(u16). - /// This is enforced by using saturating arithmetic, so in e.g. a wide string literal the - /// octal escape sequence \7777777 (2,097,151) will be parsed into the value 0xFFFF (65,535). - /// If the value needs to be truncated to a smaller integer (for ASCII string literals), then that - /// must be done by the caller. + /// Note: If this is true, `codepoint` will have an effective maximum value + /// of 0xFFFF, as `codepoint` is calculated using wrapping arithmetic on a u16. + /// If the value needs to be truncated to a smaller integer (e.g. for ASCII string + /// literals), then that must be done by the caller. from_escaped_integer: bool = false, + /// Denotes that the codepoint is: + /// - Escaped (has a \ in front of it), and + /// - Has a value >= U+10000, meaning it would be encoded as a surrogate + /// pair in UTF-16, and + /// - Is part of a wide string literal + /// + /// Normally in wide string literals, invalid escapes are omitted + /// during parsing (the codepoints are not returned at all during + /// the `next` call), but this is a special case in which the + /// escape only applies to the high surrogate pair of the codepoint. 
+ /// + /// TODO: Maybe just return the low surrogate codepoint by itself in this case. + escaped_surrogate_pair: bool = false, }; pub fn next(self: *IterativeStringParser) std.mem.Allocator.Error!?ParsedCodepoint { @@ -269,7 +281,63 @@ pub const IterativeStringParser = struct { backtrack = true; }, else => switch (self.declared_string_type) { - .wide => {}, // invalid escape sequences are skipped in wide strings + .wide => { + // All invalid escape sequences are skipped in wide strings, + // but there is a special case around \<tab> where the \ + // is skipped but the tab character is processed. + // It's actually a bit weirder than that, though, since + // the preprocessor is the one that does the <tab> -> spaces + // conversion, so it goes something like this: + // + // Before preprocessing: L"\<tab>" + // After preprocessing: L"\ " + // + // So the parser only sees an escaped space character followed + // by some other number of spaces >= 0. + // + // However, our preprocessor keeps tab characters intact, so we emulate + // the above behavior by skipping the \ and then outputting one less + // space than normal for the <tab> character. + if (c == '\t') { + // Only warn about a tab getting converted to spaces once per string + if (self.diagnostics != null and !self.seen_tab) { + try self.diagnostics.?.diagnostics.append(ErrorDetails{ + .err = .tab_converted_to_spaces, + .type = .warning, + .token = self.diagnostics.?.token, + }); + try self.diagnostics.?.diagnostics.append(ErrorDetails{ + .err = .tab_converted_to_spaces, + .type = .note, + .token = self.diagnostics.?.token, + .print_source_line = false, + }); + self.seen_tab = true; + } + + const cols = columnsUntilTabStop(self.column, 8); + // If the tab character would only be converted to a single space, + // then we can just skip both the \ and the <tab> and move on. 
+ if (cols > 1) { + self.num_pending_spaces = @intCast(cols - 2); + self.index += codepoint.byte_len; + return .{ .codepoint = ' ' }; + } + } + // There's a second special case when the codepoint would be encoded + // as a surrogate pair in UTF-16, as the escape 'applies' to the + // high surrogate pair only in this instance. This is a side-effect + // of the Win32 RC compiler preprocessor outputting UTF-16 and the + // compiler itself seemingly working on code units instead of code points + // in this particular instance. + // + // We emulate this behavior by emitting the codepoint, but with a marker + // that indicates that it needs to be handled specially. + if (c >= 0x10000 and c != code_pages.Codepoint.invalid) { + self.index += codepoint.byte_len; + return .{ .codepoint = c, .escaped_surrogate_pair = true }; + } + }, .ascii => { // we intentionally avoid incrementing self.index // to handle the current char in the next call, @@ -303,6 +371,9 @@ pub const IterativeStringParser = struct { }, .escaped_octal => switch (c) { '0'...'7' => { + // Note: We use wrapping arithmetic on a u16 here since there's been no observed + // string parsing scenario where an escaped integer with a value >= the u16 + // max is interpreted as anything but the truncated u16 value. 
string_escape_n *%= 8; string_escape_n +%= std.fmt.charToDigit(@intCast(c), 8) catch unreachable; string_escape_i += 1; @@ -389,46 +460,51 @@ pub fn parseQuotedString( while (try iterative_parser.next()) |parsed| { const c = parsed.codepoint; - if (parsed.from_escaped_integer) { - // We truncate here to get the correct behavior for ascii strings - try buf.append(std.mem.nativeToLittle(T, @truncate(c))); - } else { - switch (literal_type) { - .ascii => switch (options.output_code_page) { - .windows1252 => { - if (windows1252.bestFitFromCodepoint(c)) |best_fit| { - try buf.append(best_fit); - } else if (c < 0x10000 or c == code_pages.Codepoint.invalid) { - try buf.append('?'); - } else { - try buf.appendSlice("??"); - } - }, - .utf8 => { - var codepoint_to_encode = c; - if (c == code_pages.Codepoint.invalid) { - codepoint_to_encode = '�'; - } - var utf8_buf: [4]u8 = undefined; - const utf8_len = std.unicode.utf8Encode(codepoint_to_encode, &utf8_buf) catch unreachable; - try buf.appendSlice(utf8_buf[0..utf8_len]); - }, - else => unreachable, // Unsupported code page - }, - .wide => { - if (c == code_pages.Codepoint.invalid) { - try buf.append(std.mem.nativeToLittle(u16, '�')); - } else if (c < 0x10000) { - const short: u16 = @intCast(c); - try buf.append(std.mem.nativeToLittle(u16, short)); + switch (literal_type) { + .ascii => switch (options.output_code_page) { + .windows1252 => { + if (parsed.from_escaped_integer) { + try buf.append(@truncate(c)); + } else if (windows1252.bestFitFromCodepoint(c)) |best_fit| { + try buf.append(best_fit); + } else if (c < 0x10000 or c == code_pages.Codepoint.invalid) { + try buf.append('?'); } else { + try buf.appendSlice("??"); + } + }, + .utf8 => { + var codepoint_to_encode = c; + if (parsed.from_escaped_integer) { + codepoint_to_encode = @as(T, @truncate(c)); + } + const escaped_integer_outside_ascii_range = parsed.from_escaped_integer and codepoint_to_encode > 0x7F; + if (escaped_integer_outside_ascii_range or c == 
code_pages.Codepoint.invalid) { + codepoint_to_encode = '�'; + } + var utf8_buf: [4]u8 = undefined; + const utf8_len = std.unicode.utf8Encode(codepoint_to_encode, &utf8_buf) catch unreachable; + try buf.appendSlice(utf8_buf[0..utf8_len]); + }, + else => unreachable, // Unsupported code page + }, + .wide => { + if (parsed.from_escaped_integer) { + try buf.append(std.mem.nativeToLittle(u16, @truncate(c))); + } else if (c == code_pages.Codepoint.invalid) { + try buf.append(std.mem.nativeToLittle(u16, '�')); + } else if (c < 0x10000) { + const short: u16 = @intCast(c); + try buf.append(std.mem.nativeToLittle(u16, short)); + } else { + if (!parsed.escaped_surrogate_pair) { const high = @as(u16, @intCast((c - 0x10000) >> 10)) + 0xD800; try buf.append(std.mem.nativeToLittle(u16, high)); - const low = @as(u16, @intCast(c & 0x3FF)) + 0xDC00; - try buf.append(std.mem.nativeToLittle(u16, low)); } - }, - } + const low = @as(u16, @intCast(c & 0x3FF)) + 0xDC00; + try buf.append(std.mem.nativeToLittle(u16, low)); + } + }, } } @@ -652,6 +728,18 @@ test "parse quoted ascii string with utf8 code page" { )); } +test "parse quoted string with different input/output code pages" { + var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator); + defer arena_allocator.deinit(); + const arena = arena_allocator.allocator(); + + try std.testing.expectEqualSlices(u8, "€���\x60\x7F", try parseQuotedAsciiString( + arena, + .{ .slice = "\"\x80\\x8a\\600\\612\\540\\577\"", .code_page = .windows1252 }, + .{ .output_code_page = .utf8 }, + )); +} + test "parse quoted wide string" { var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator); defer arena_allocator.deinit(); diff --git a/src/res.zig b/src/res.zig index 991e0b8..0b42a65 100644 --- a/src/res.zig +++ b/src/res.zig @@ -603,12 +603,34 @@ pub const AcceleratorModifiers = struct { const AcceleratorKeyCodepointTranslator = struct { string_type: literals.StringType, + output_code_page: CodePage, pub fn 
translate(self: @This(), maybe_parsed: ?literals.IterativeStringParser.ParsedCodepoint) ?u21 { const parsed = maybe_parsed orelse return null; if (parsed.codepoint == Codepoint.invalid) return 0xFFFD; - if (parsed.from_escaped_integer and self.string_type == .ascii) { - return windows1252.toCodepoint(@truncate(parsed.codepoint)); + if (parsed.from_escaped_integer) { + switch (self.string_type) { + .ascii => { + const truncated: u8 = @truncate(parsed.codepoint); + switch (self.output_code_page) { + .utf8 => switch (truncated) { + 0...0x7F => return truncated, + else => return 0xFFFD, + }, + .windows1252 => return windows1252.toCodepoint(truncated), + else => unreachable, // unsupported code page + } + }, + .wide => { + const truncated: u16 = @truncate(parsed.codepoint); + return truncated; + }, + } + } + if (parsed.escaped_surrogate_pair) { + // The codepoint of only the low surrogate + const low = @as(u16, @intCast(parsed.codepoint & 0x3FF)) + 0xDC00; + return low; } return parsed.codepoint; } @@ -623,14 +645,17 @@ pub fn parseAcceleratorKeyString(bytes: SourceBytes, is_virt: bool, options: lit } var parser = literals.IterativeStringParser.init(bytes, options); - var translator = AcceleratorKeyCodepointTranslator{ .string_type = parser.declared_string_type }; + var translator = AcceleratorKeyCodepointTranslator{ + .string_type = parser.declared_string_type, + .output_code_page = options.output_code_page, + }; const first_codepoint = translator.translate(try parser.next()) orelse return error.EmptyAccelerator; // 0 is treated as a terminator, so this is equivalent to an empty string if (first_codepoint == 0) return error.EmptyAccelerator; if (first_codepoint == '^') { - // Note: Emitting this warning unconditonally whenever ^ is the first character + // Note: Emitting this warning unconditionally whenever ^ is the first character // matches the Win32 RC behavior, but it's questionable whether or not // the warning should be emitted for ^^ since that results in the 
ASCII // character ^ being written to the .res. @@ -643,6 +668,12 @@ pub fn parseAcceleratorKeyString(bytes: SourceBytes, is_virt: bool, options: lit } const c = translator.translate(try parser.next()) orelse return error.InvalidControlCharacter; + + const third_codepoint = translator.translate(try parser.next()); + // 0 is treated as a terminator, so a 0 in the third position is fine but + // anything else is too many codepoints for an accelerator + if (third_codepoint != null and third_codepoint.? != 0) return error.InvalidControlCharacter; + switch (c) { '^' => return '^', // special case 'a'...'z', 'A'...'Z' => return std.ascii.toUpper(@intCast(c)) - 0x40, @@ -873,6 +904,47 @@ test "accelerator keys" { false, .{}, )); + + // Misc special cases + try std.testing.expectEqual(@as(u16, 0xFFFD), try parseAcceleratorKeyString( + .{ .slice = "\"\\777\"", .code_page = .utf8 }, + false, + .{ .output_code_page = .utf8 }, + )); + try std.testing.expectEqual(@as(u16, 0xFFFF), try parseAcceleratorKeyString( + .{ .slice = "L\"\\7777777\"", .code_page = .utf8 }, + false, + .{ .output_code_page = .utf8 }, + )); + try std.testing.expectEqual(@as(u16, 0x01), try parseAcceleratorKeyString( + .{ .slice = "L\"\\200001\"", .code_page = .utf8 }, + false, + .{ .output_code_page = .utf8 }, + )); + // Escape of a codepoint >= 0x10000 omits the high surrogate pair + try std.testing.expectEqual(@as(u16, 0xDF48), try parseAcceleratorKeyString( + .{ .slice = "L\"\\𐍈\"", .code_page = .utf8 }, + false, + .{ .output_code_page = .utf8 }, + )); + // Invalid escape code is skipped, allows for 2 codepoints afterwards + try std.testing.expectEqual(@as(u16, 0x7878), try parseAcceleratorKeyString( + .{ .slice = "L\"\\kxx\"", .code_page = .utf8 }, + false, + .{ .output_code_page = .utf8 }, + )); + // Escape of a codepoint >= 0x10000 allows for a codepoint afterwards + try std.testing.expectEqual(@as(u16, 0x4878), try parseAcceleratorKeyString( + .{ .slice = "L\"\\𐍈x\"", .code_page = .utf8 }, + false, 
+ .{ .output_code_page = .utf8 }, + )); + // Input code page of 1252, output code page of utf-8 + try std.testing.expectEqual(@as(u16, 0xFFFD), try parseAcceleratorKeyString( + .{ .slice = "\"\\270\"", .code_page = .windows1252 }, + false, + .{ .output_code_page = .utf8 }, + )); } pub const ForcedOrdinal = struct { diff --git a/test/compile.zig b/test/compile.zig index be75d58..9797475 100644 --- a/test/compile.zig +++ b/test/compile.zig @@ -1222,6 +1222,24 @@ test "separate input and output code pages" { ); } +test "input code page windows-1252, output code page utf8" { + const source = "#pragma code_page(1252)\n1 RCDATA { \"\xd1\\x8a\\603\\106\xf6\\xb5\x90\x84\\506|\\x09\\102\x8c\\x9b\\754\" }"; + const expected = "\x00\x00\x00\x00 \x00\x00\x00\xff\xff\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1f\x00\x00\x00 \x00\x00\x00\xff\xff\n\x00\xff\xff\x01\x00\x00\x00\x00\x000\x00\t\x04\x00\x00\x00\x00\x00\x00\x00\x00\xc3\x91\xef\xbf\xbd\xef\xbf\xbdF\xc3\xb6\xef\xbf\xbd\xc2\x90\xe2\x80\x9eF|\tB\xc5\x92\xef\xbf\xbd\xef\xbf\xbd\x00"; + try testCompileWithOutputAndOptions(source, expected, .{ + .cwd = std.fs.cwd(), + .default_code_page = .utf8, + }); +} + +test "input code page utf8, output code page windows-1252" { + const source = "#pragma code_page(65001)\n1 RCDATA { \"\xd1\\x8a\\603\\106\xf6\\xb5\x90\x84\\506|\\x09\\102\x8c\\x9b\\754\" }"; + const expected = "\x00\x00\x00\x00 \x00\x00\x00\xff\xff\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f\x00\x00\x00 \x00\x00\x00\xff\xff\n\x00\xff\xff\x01\x00\x00\x00\x00\x000\x00\t\x04\x00\x00\x00\x00\x00\x00\x00\x00?\x8a\x83F?\xb5??F|\tB?\x9b\xec\x00"; + try testCompileWithOutputAndOptions(source, expected, .{ + .cwd = std.fs.cwd(), + .default_code_page = .windows1252, + }); +} + test "tab within a string literal" { try testCompileErrorDetails( &.{ @@ -1236,14 +1254,39 @@ test "tab within a string literal" { ); } +test "escaped tab characters 
within strings" { + // `source` corresponds to the following: + // + // 1 RCDATA { + // "\<tab>" + // L"\<tab>" + // "\<tab>" + // L"\<tab>" + // } + // 2 DLGINCLUDE "\<tab>" + // 3 DLGINCLUDE L"\<tab>" + // 4 DLGINCLUDE "\<tab>" + // 5 DLGINCLUDE L"\<tab>" + // + // where the <tab>s are literal tabstop characters. + // The intention is to test cases where the tabstop is + // replaced by multiple spaces and a single space. + const source = "1 RCDATA {\r\n \"\\\t\"\r\nL\"\\\t\"\r\n \"\\\t\"\r\n L\"\\\t\"\r\n}\r\n2 DLGINCLUDE \"\\\t\"\r\n3 DLGINCLUDE L\"\\\t\"\r\n4 DLGINCLUDE \"\\\t\"\r\n5 DLGINCLUDE L\"\\\t\""; + const expected = "\x00\x00\x00\x00 \x00\x00\x00\xff\xff\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x00\x00 \x00\x00\x00\xff\xff\n\x00\xff\xff\x01\x00\x00\x00\x00\x000\x00\t\x04\x00\x00\x00\x00\x00\x00\x00\x00\\ \x00 \x00 \x00 \x00\\ \n\x00\x00\x00 \x00\x00\x00\xff\xff\x11\x00\xff\xff\x02\x00\x00\x00\x00\x000\x10\t\x04\x00\x00\x00\x00\x00\x00\x00\x00\\ \x00\x00\x00\x08\x00\x00\x00 \x00\x00\x00\xff\xff\x11\x00\xff\xff\x03\x00\x00\x00\x00\x000\x10\t\x04\x00\x00\x00\x00\x00\x00\x00\x00 \x00\x03\x00\x00\x00 \x00\x00\x00\xff\xff\x11\x00\xff\xff\x04\x00\x00\x00\x00\x000\x10\t\x04\x00\x00\x00\x00\x00\x00\x00\x00\\ \x00\x00\x01\x00\x00\x00 \x00\x00\x00\xff\xff\x11\x00\xff\xff\x05\x00\x00\x00\x00\x000\x10\t\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"; + + try testCompileWithOutput(source, expected, std.fs.cwd()); +} + test "DLGINCLUDE strings" { + // \400 truncates to \x00 in the ascii literal + // In the wide literal, \400 is Ā (U+0100) which is converted to A via the Windows-1252 best fit const source = - \\1 DLGINCLUDE "\400\777\xff\x80\x81кириллица" - \\2 DLGINCLUDE L"\400\777\xff\x80\x81кириллица" + \\1 DLGINCLUDE "\x41\501\777\xff\x80\x81кириллица\400" + \\2 DLGINCLUDE L"\x41\501\777\xff\x80\x81кириллица\400" ; - const windows1252_expected_output = "\x00\x00\x00\x00 
\x00\x00\x00\xff\xff\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00 \x00\x00\x00\xff\xff\x11\x00\xff\xff\x01\x00\x00\x00\x00\x000\x10\t\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x18\x00\x00\x00 \x00\x00\x00\xff\xff\x11\x00\xff\xff\x02\x00\x00\x00\x00\x000\x10\t\x04\x00\x00\x00\x00\x00\x00\x00\x00A?\xff?\x81\xd0\xba\xd0\xb8\xd1\x80\xd0\xb8\xd0\xbb\xd0\xbb\xd0\xb8\xd1\x86\xd0\xb0\x00"; - const utf8_expected_output = "\x00\x00\x00\x00 \x00\x00\x00\xff\xff\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00 \x00\x00\x00\xff\xff\x11\x00\xff\xff\x01\x00\x00\x00\x00\x000\x10\t\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f\x00\x00\x00 \x00\x00\x00\xff\xff\x11\x00\xff\xff\x02\x00\x00\x00\x00\x000\x10\t\x04\x00\x00\x00\x00\x00\x00\x00\x00A?\xff?\x81?????????\x00\x00"; + const windows1252_expected_output = "\x00\x00\x00\x00 \x00\x00\x00\xff\xff\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x19\x00\x00\x00 \x00\x00\x00\xff\xff\x11\x00\xff\xff\x01\x00\x00\x00\x00\x000\x10\t\x04\x00\x00\x00\x00\x00\x00\x00\x00AA\xff\xff\x80\x81\xd0\xba\xd0\xb8\xd1\x80\xd0\xb8\xd0\xbb\xd0\xbb\xd0\xb8\xd1\x86\xd0\xb0\x00\x00\x00\x00\x1a\x00\x00\x00 \x00\x00\x00\xff\xff\x11\x00\xff\xff\x02\x00\x00\x00\x00\x000\x10\t\x04\x00\x00\x00\x00\x00\x00\x00\x00AL?\xff?\x81\xd0\xba\xd0\xb8\xd1\x80\xd0\xb8\xd0\xbb\xd0\xbb\xd0\xb8\xd1\x86\xd0\xb0A\x00\x00\x00"; + const utf8_expected_output = "\x00\x00\x00\x00 \x00\x00\x00\xff\xff\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x00\x00 \x00\x00\x00\xff\xff\x11\x00\xff\xff\x01\x00\x00\x00\x00\x000\x10\t\x04\x00\x00\x00\x00\x00\x00\x00\x00AA?????????????\x00\x11\x00\x00\x00 \x00\x00\x00\xff\xff\x11\x00\xff\xff\x02\x00\x00\x00\x00\x000\x10\t\x04\x00\x00\x00\x00\x00\x00\x00\x00AL?\xff?\x81?????????A\x00\x00\x00\x00"; try testCompileWithOutput(source, 
windows1252_expected_output, std.fs.cwd()); try testCompileWithOutputAndOptions(source, utf8_expected_output, .{ @@ -1252,6 +1295,69 @@ test "DLGINCLUDE strings" { }); } +test "DLGINCLUDE input code page windows-1252, output code page utf8" { + const source = "#pragma code_page(1252)\n1 DLGINCLUDE \"\xd1\\x8a\\603\\106\xf6\\xb5\x90\x84\\506|\\x09\\102\x8c\\x9b\\754\""; + const expected = "\x00\x00\x00\x00 \x00\x00\x00\xff\xff\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x00\x00 \x00\x00\x00\xff\xff\x11\x00\xff\xff\x01\x00\x00\x00\x00\x000\x10\t\x04\x00\x00\x00\x00\x00\x00\x00\x00\xd1??F\xf6?\x90\x84F|\tB\x8c??\x00"; + try testCompileWithOutputAndOptions(source, expected, .{ + .cwd = std.fs.cwd(), + .default_code_page = .utf8, + }); +} + +test "DLGINCLUDE input code page utf8, output code page windows-1252" { + const source = "#pragma code_page(65001)\n1 DLGINCLUDE \"\xd1\\x8a\\603\\106\xf6\\xb5\x90\x84\\506|\\x09\\102\x8c\\x9b\\754\""; + const expected = "\x00\x00\x00\x00 \x00\x00\x00\xff\xff\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x00\x00 \x00\x00\x00\xff\xff\x11\x00\xff\xff\x01\x00\x00\x00\x00\x000\x10\t\x04\x00\x00\x00\x00\x00\x00\x00\x00?\x8a\x83F?\xb5??F|\tB?\x9b\xec\x00"; + try testCompileWithOutputAndOptions(source, expected, .{ + .cwd = std.fs.cwd(), + .default_code_page = .windows1252, + }); +} + +test "DLGINCLUDE wide windows-1252" { + const source = + \\1 DLGINCLUDE L"€𐍈ह\x58bc\200001\777\xff\x80\x81кириллица\400" + ; + const expected = "\x00\x00\x00\x00 \x00\x00\x00\xff\xff\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00$\x00\x00\x00 \x00\x00\x00\xff\xff\x11\x00\xff\xff\x01\x00\x00\x00\x00\x000\x10\t\x04\x00\x00\x00\x00\x00\x00\x00\x00\xe2\x82\xac\xf0\x90\x8d\x88\xe0\xa4\xb9?\x01?\xff?\x81\xd0\xba\xd0\xb8\xd1\x80\xd0\xb8\xd0\xbb\xd0\xbb\xd0\xb8\xd1\x86\xd0\xb0A\x00"; + try 
testCompileWithOutputAndOptions(source, expected, .{ + .cwd = std.fs.cwd(), + }); +} + +test "DLGINCLUDE wide utf8" { + const source = + \\1 DLGINCLUDE L"€𐍈ह\x58bc\200001\777\xff\x80\x81кириллица\400" + ; + const expected = "\x00\x00\x00\x00 \x00\x00\x00\xff\xff\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x15\x00\x00\x00 \x00\x00\x00\xff\xff\x11\x00\xff\xff\x01\x00\x00\x00\x00\x000\x10\t\x04\x00\x00\x00\x00\x00\x00\x00\x00\x80????\x01?\xff?\x81?????????A\x00\x00\x00\x00"; + try testCompileWithOutputAndOptions(source, expected, .{ + .cwd = std.fs.cwd(), + .default_code_page = .utf8, + }); +} + +test "DLGINCLUDE wide input code page windows-1252, output code page utf8" { + const source = + \\#pragma code_page(1252) + \\1 DLGINCLUDE L"€𐍈ह\x58bc\200001\777\xff\x80\x81кириллица\400" + ; + const expected = "\x00\x00\x00\x00 \x00\x00\x00\xff\xff\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00$\x00\x00\x00 \x00\x00\x00\xff\xff\x11\x00\xff\xff\x01\x00\x00\x00\x00\x000\x10\t\x04\x00\x00\x00\x00\x00\x00\x00\x00\xe2\x82\xac\xf0\x90\x8d\x88\xe0\xa4\xb9?\x01?\xff?\x81\xd0\xba\xd0\xb8\xd1\x80\xd0\xb8\xd0\xbb\xd0\xbb\xd0\xb8\xd1\x86\xd0\xb0A\x00"; + try testCompileWithOutputAndOptions(source, expected, .{ + .cwd = std.fs.cwd(), + .default_code_page = .utf8, + }); +} + +test "DLGINCLUDE wide input code page utf8, output code page windows-1252" { + const source = + \\#pragma code_page(65001) + \\1 DLGINCLUDE L"€𐍈ह\x58bc\200001\777\xff\x80\x81кириллица\400" + ; + const expected = "\x00\x00\x00\x00 \x00\x00\x00\xff\xff\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x15\x00\x00\x00 \x00\x00\x00\xff\xff\x11\x00\xff\xff\x01\x00\x00\x00\x00\x000\x10\t\x04\x00\x00\x00\x00\x00\x00\x00\x00\x80????\x01?\xff?\x81?????????A\x00\x00\x00\x00"; + try testCompileWithOutputAndOptions(source, expected, .{ + .cwd = std.fs.cwd(), + .default_code_page = .windows1252, + }); +} + fn 
testCompileWithOutput(source: []const u8, expected_output: []const u8, cwd: std.fs.Dir) !void { return testCompileWithOutputAndOptions(source, expected_output, .{ .cwd = cwd }); } diff --git a/test/fuzzy_accelerators.zig b/test/fuzzy_accelerators.zig new file mode 100644 index 0000000..168edf0 --- /dev/null +++ b/test/fuzzy_accelerators.zig @@ -0,0 +1,93 @@ +const std = @import("std"); +const resinator = @import("resinator"); +const utils = @import("utils.zig"); +const fuzzy_options = @import("fuzzy_options"); +const iterations = fuzzy_options.max_iterations; + +test "fuzz" { + const allocator = std.testing.allocator; + var random = std.rand.DefaultPrng.init(0); + const rand = random.random(); + + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + + const tmp_path = try tmp.dir.realpathAlloc(allocator, "."); + defer allocator.free(tmp_path); + + var source_buffer = std.ArrayList(u8).init(allocator); + defer source_buffer.deinit(); + + var cache_path_buffer = std.ArrayList(u8).init(allocator); + defer cache_path_buffer.deinit(); + + var i: u64 = 0; + while (iterations == 0 or i < iterations) : (i += 1) { + const num_sequences = rand.uintAtMostBiased(u16, 3) + 1; + const string_type: resinator.literals.StringType = if (i % 2 == 0) .wide else .ascii; + const control = rand.boolean(); + const literal = literal: { + const literal = switch (string_type) { + .ascii => try utils.randomStringLiteralExact(.ascii, allocator, rand, num_sequences), + .wide => try utils.randomStringLiteralExact(.wide, allocator, rand, num_sequences), + }; + if (!control) break :literal literal; + defer allocator.free(literal); + const insert_i: usize = if (string_type == .ascii) 1 else 2; + const control_literal = try allocator.alloc(u8, literal.len + 1); + @memcpy(control_literal[0..insert_i], literal[0..insert_i]); + control_literal[insert_i] = '^'; + @memcpy(control_literal[insert_i + 1 ..], literal[insert_i..]); + break :literal control_literal; + }; + defer allocator.free(literal); 
+ + const Variation = enum { + io_windows1252, + in_1252_out_utf8, + io_utf8, + in_utf8_out_1252, + }; + for (std.enums.values(Variation)) |variation| { + source_buffer.clearRetainingCapacity(); + var source_writer = source_buffer.writer(); + // Swap the input code page using #pragma code_page + // (this only works if it's the first #pragma code_page in the file) + switch (variation) { + .in_1252_out_utf8 => try source_writer.writeAll("#pragma code_page(1252)\n"), + .in_utf8_out_1252 => try source_writer.writeAll("#pragma code_page(65001)\n"), + else => {}, + } + try source_writer.print("1 ACCELERATORS {{ {s}, 0x1 }}", .{literal}); + + const source = source_buffer.items; + + utils.expectSameResOutput(allocator, source, .{ + .cwd = tmp.dir, + .cwd_path = tmp_path, + .default_code_page = switch (variation) { + // The default code page affects the output code page only if the first + // #pragma code_page changes the code page + .io_windows1252, .in_utf8_out_1252 => .windows1252, + .io_utf8, .in_1252_out_utf8 => .utf8, + }, + }) catch |err| { + cache_path_buffer.clearRetainingCapacity(); + try cache_path_buffer.appendSlice("zig-cache/tmp/fuzzy_accelerators_"); + try utils.appendNumberStr(&cache_path_buffer, i); + try cache_path_buffer.append('_'); + try cache_path_buffer.appendSlice(@tagName(string_type)); + try cache_path_buffer.append('_'); + try cache_path_buffer.appendSlice(@tagName(variation)); + try cache_path_buffer.append('_'); + try utils.appendNumberStr(&cache_path_buffer, num_sequences); + try cache_path_buffer.append('_'); + try cache_path_buffer.appendSlice(@errorName(err)); + try cache_path_buffer.appendSlice(".rc"); + + // write out the source file to disk for debugging + try std.fs.cwd().writeFile(cache_path_buffer.items, source); + }; + } + } +} diff --git a/test/fuzzy_ascii_strings.zig b/test/fuzzy_ascii_strings.zig index 85e505d..13c4fb2 100644 --- a/test/fuzzy_ascii_strings.zig +++ b/test/fuzzy_ascii_strings.zig @@ -78,7 +78,7 @@ test "fuzz" { var i: 
u64 = 0; while (iterations == 0 or i < iterations) : (i += 1) { source_buffer.shrinkRetainingCapacity(0); - const literal = try utils.randomAsciiStringLiteral(allocator, rand); + const literal = try utils.randomStringLiteral(.ascii, allocator, rand, 256); defer allocator.free(literal); var source_writer = source_buffer.writer(); try source_writer.print("1 RCDATA {{ {s} }}", .{literal}); diff --git a/test/fuzzy_dlginclude.zig b/test/fuzzy_dlginclude.zig index 2ec7131..b9fdaee 100644 --- a/test/fuzzy_dlginclude.zig +++ b/test/fuzzy_dlginclude.zig @@ -1,8 +1,86 @@ const std = @import("std"); +const resinator = @import("resinator"); const utils = @import("utils.zig"); const fuzzy_options = @import("fuzzy_options"); const iterations = fuzzy_options.max_iterations; +test "fuzz" { + const allocator = std.testing.allocator; + var random = std.rand.DefaultPrng.init(0); + const rand = random.random(); + + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + + const tmp_path = try tmp.dir.realpathAlloc(allocator, "."); + defer allocator.free(tmp_path); + + var source_buffer = std.ArrayList(u8).init(allocator); + defer source_buffer.deinit(); + + var cache_path_buffer = std.ArrayList(u8).init(allocator); + defer cache_path_buffer.deinit(); + + var i: u64 = 0; + while (iterations == 0 or i < iterations) : (i += 1) { + const num_sequences = rand.uintAtMostBiased(u16, 512); + const string_type: resinator.literals.StringType = if (i % 2 == 0) .wide else .ascii; + const literal = switch (string_type) { + .ascii => try utils.randomStringLiteralExact(.ascii, allocator, rand, num_sequences), + .wide => try utils.randomStringLiteralExact(.wide, allocator, rand, num_sequences), + }; + defer allocator.free(literal); + + const Variation = enum { + io_windows1252, + in_1252_out_utf8, + io_utf8, + in_utf8_out_1252, + }; + for (std.enums.values(Variation)) |variation| { + source_buffer.clearRetainingCapacity(); + var source_writer = source_buffer.writer(); + // Swap the input code 
page using #pragma code_page + // (this only works if its the first #pragma code_page in the file) + switch (variation) { + .in_1252_out_utf8 => try source_writer.writeAll("#pragma code_page(1252)\n"), + .in_utf8_out_1252 => try source_writer.writeAll("#pragma code_page(65001)\n"), + else => {}, + } + try source_writer.print("1 DLGINCLUDE {s}", .{literal}); + + const source = source_buffer.items; + + utils.expectSameResOutput(allocator, source, .{ + .cwd = tmp.dir, + .cwd_path = tmp_path, + .default_code_page = switch (variation) { + // The default code page affects the output code page only if the first + // #pragma code_page changes the code page + .io_windows1252, .in_utf8_out_1252 => .windows1252, + .io_utf8, .in_1252_out_utf8 => .utf8, + }, + }) catch |err| { + cache_path_buffer.clearRetainingCapacity(); + try cache_path_buffer.appendSlice("zig-cache/tmp/fuzzy_dlginclude_"); + try utils.appendNumberStr(&cache_path_buffer, i); + try cache_path_buffer.append('_'); + try cache_path_buffer.appendSlice(@tagName(string_type)); + try cache_path_buffer.append('_'); + try cache_path_buffer.appendSlice(@tagName(variation)); + try cache_path_buffer.append('_'); + try utils.appendNumberStr(&cache_path_buffer, num_sequences); + try cache_path_buffer.append('_'); + try cache_path_buffer.appendSlice(@errorName(err)); + try cache_path_buffer.appendSlice(".rc"); + + // write out the source file to disk for debugging + try std.fs.cwd().writeFile(cache_path_buffer.items, source); + }; + } + } +} + test "octal escapes, ascii string literal" { const allocator = std.testing.allocator; diff --git a/test/fuzzy_strings.zig b/test/fuzzy_strings.zig new file mode 100644 index 0000000..13dfa65 --- /dev/null +++ b/test/fuzzy_strings.zig @@ -0,0 +1,83 @@ +const std = @import("std"); +const resinator = @import("resinator"); +const utils = @import("utils.zig"); +const fuzzy_options = @import("fuzzy_options"); +const iterations = fuzzy_options.max_iterations; + +test "fuzz" { + const 
allocator = std.testing.allocator; + var random = std.rand.DefaultPrng.init(0); + const rand = random.random(); + + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + + const tmp_path = try tmp.dir.realpathAlloc(allocator, "."); + defer allocator.free(tmp_path); + + var source_buffer = std.ArrayList(u8).init(allocator); + defer source_buffer.deinit(); + + var cache_path_buffer = std.ArrayList(u8).init(allocator); + defer cache_path_buffer.deinit(); + + var i: u64 = 0; + while (iterations == 0 or i < iterations) : (i += 1) { + const num_sequences = rand.uintAtMostBiased(u16, 32); + const string_type: resinator.literals.StringType = if (i % 2 == 0) .wide else .ascii; + const literal = switch (string_type) { + .ascii => try utils.randomStringLiteralExact(.ascii, allocator, rand, num_sequences), + .wide => try utils.randomStringLiteralExact(.wide, allocator, rand, num_sequences), + }; + defer allocator.free(literal); + + const Variation = enum { + io_windows1252, + in_1252_out_utf8, + io_utf8, + in_utf8_out_1252, + }; + for (std.enums.values(Variation)) |variation| { + source_buffer.clearRetainingCapacity(); + var source_writer = source_buffer.writer(); + // Swap the input code page using #pragma code_page + // (this only works if its the first #pragma code_page in the file) + switch (variation) { + .in_1252_out_utf8 => try source_writer.writeAll("#pragma code_page(1252)\n"), + .in_utf8_out_1252 => try source_writer.writeAll("#pragma code_page(65001)\n"), + else => {}, + } + try source_writer.print("1 RCDATA {{ {s} }}", .{literal}); + + const source = source_buffer.items; + + utils.expectSameResOutput(allocator, source, .{ + .cwd = tmp.dir, + .cwd_path = tmp_path, + .default_code_page = switch (variation) { + // The default code page affects the output code page only if the first + // #pragma code_page changes the code page + .io_windows1252, .in_utf8_out_1252 => .windows1252, + .io_utf8, .in_1252_out_utf8 => .utf8, + }, + }) catch |err| { + 
cache_path_buffer.clearRetainingCapacity(); + try cache_path_buffer.appendSlice("zig-cache/tmp/fuzzy_strings_"); + try utils.appendNumberStr(&cache_path_buffer, i); + try cache_path_buffer.append('_'); + try cache_path_buffer.appendSlice(@tagName(string_type)); + try cache_path_buffer.append('_'); + try cache_path_buffer.appendSlice(@tagName(variation)); + try cache_path_buffer.append('_'); + try utils.appendNumberStr(&cache_path_buffer, num_sequences); + try cache_path_buffer.append('_'); + try cache_path_buffer.appendSlice(@errorName(err)); + try cache_path_buffer.appendSlice(".rc"); + + // write out the source file to disk for debugging + try std.fs.cwd().writeFile(cache_path_buffer.items, source); + return err; + }; + } + } +} diff --git a/test/fuzzy_stringtable.zig b/test/fuzzy_stringtable.zig index 1bc3df8..fa2a68b 100644 --- a/test/fuzzy_stringtable.zig +++ b/test/fuzzy_stringtable.zig @@ -20,7 +20,7 @@ test "fuzz" { var i: u64 = 0; while (iterations == 0 or i < iterations) : (i += 1) { source_buffer.shrinkRetainingCapacity(0); - const literal = try utils.randomAsciiStringLiteral(allocator, rand); + const literal = try utils.randomStringLiteral(.ascii, allocator, rand, 256); defer allocator.free(literal); var source_writer = source_buffer.writer(); try source_writer.print("STRINGTABLE {{ 1, {s} }}\n", .{literal}); diff --git a/test/utils.zig b/test/utils.zig index 44a7743..ce683da 100644 --- a/test/utils.zig +++ b/test/utils.zig @@ -115,6 +115,7 @@ fn inputContainsKnownPreprocessorDifference(data: []const u8) bool { pub const GetResultOptions = struct { cwd: std.fs.Dir, cwd_path: []const u8, + default_code_page: enum { windows1252, utf8 } = .windows1252, /// Only used in the Win32 version output_path: ?[]const u8 = null, }; @@ -153,6 +154,10 @@ pub fn getResinatorResultFromFile(allocator: Allocator, input_filepath: []const .cwd = options.cwd, .diagnostics = &result.diagnostics, .source_mappings = &mapping_results.mappings, + .default_code_page = switch 
(options.default_code_page) { + .windows1252 => .windows1252, + .utf8 => .utf8, + }, // TODO: Make this configurable .ignore_include_env_var = true, }; @@ -196,6 +201,10 @@ pub fn getWin32ResultFromFile(allocator: Allocator, input_path: []const u8, opti .argv = &[_][]const u8{ // Note: This relies on `rc.exe` being in the PATH "rc.exe", + switch (options.default_code_page) { + .windows1252 => "/c1252", + .utf8 => "/c65001", + }, // TODO: Make this configurable "/x", // ignore INCLUDE env var "/fo", @@ -319,53 +328,118 @@ pub fn randomOperator(rand: std.rand.Random) u8 { return dict[index]; } -pub fn randomAsciiStringLiteral(allocator: Allocator, rand: std.rand.Random) ![]const u8 { - // max string literal length is 4097 so this will generate some invalid string literals - // need at least two for the "" - const slice_len = rand.uintAtMostBiased(u16, 256) + 2; - var buf = try allocator.alloc(u8, slice_len); - errdefer allocator.free(buf); - - buf[0] = '"'; - var i: usize = 1; - while (i < slice_len - 1) : (i += 1) { - var byte = rand.int(u8); - switch (byte) { - // these are currently invalid within string literals, so swap them out - 0x00, 0x1A, 0x7F => byte += 1, - // \r is a mess, so just change it to a space - // (clang's preprocessor converts \r to \n, rc skips them entirely) - '\r' => byte = ' ', - // \n within string literals are similarly fraught but they mostly expose - // bugs within the Windows RC compiler (where "\n\x01" causes a compile error, - // but "<anything besides newlines>\x01" doesn't). - // For sanity's sake, just don't put newlines in for now. - '\n' => byte = ' ', - '\\' => { - // backslash at the very end of the string leads to \" which is - // currently disallowed, so avoid that. 
- if (i + 1 == slice_len - 1) { - byte += 1; +pub fn appendRandomStringLiteralSequence(rand: std.rand.Random, buf: *std.ArrayList(u8), is_last: bool, comptime string_type: resinator.literals.StringType) !void { + const SequenceType = enum { byte, non_ascii_codepoint, octal, hex }; + const sequence_type = rand.enumValue(SequenceType); + switch (sequence_type) { + .byte => { + while (true) { + const byte = rand.int(u8); + switch (byte) { + // these are currently invalid within string literals + 0x00, 0x1A, 0x7F => continue, + // \r is a mess, so just change it to a space + // (clang's preprocessor converts \r to \n, rc skips them entirely) + '\r' => continue, + // \n within string literals are similarly fraught but they mostly expose + // bugs within the Windows RC compiler (where "\n\x01" causes a compile error, + // but "<anything besides newlines>\x01" doesn't). + // For sanity's sake, just don't put newlines in for now. + '\n' => continue, + // backslash at the very end of the string leads to \" which is + // currently disallowed, so avoid that. + '\\' => if (is_last) continue, + // Need to escape double quotes as "", but don't want to create a \"" sequence. + '"' => if (buf.items.len > 0 and buf.items[buf.items.len - 1] == '\\') continue, + else => {}, } - }, - '"' => { - // Double quotes need to be escaped to keep this a single string literal - // so try to add one before this char if it'll create an escaped quote ("") - // but not a \"" sequence. - // Otherwise, just swap it out by incrementing it. 
- if (i >= 2 and buf[i - 1] != '"' and buf[i - 2] != '\\') { - buf[i - 1] = '"'; - } else { - byte += 1; + try buf.append(byte); + // Escape double quotes by appending a second double quote + if (byte == '"') try buf.append(byte); + break; + } + }, + .non_ascii_codepoint => { + while (true) { + const codepoint = rand.intRangeAtMost(u21, 0x80, 0x10FFFF); + if (!std.unicode.utf8ValidCodepoint(codepoint)) continue; + switch (codepoint) { + // disallowed private use character + '\u{E000}' => continue, + // disallowed BOM + '\u{FEFF}' => continue, + else => {}, } - }, - else => {}, - } - buf[i] = byte; + const codepoint_sequence_length = std.unicode.utf8CodepointSequenceLength(codepoint) catch unreachable; + const start_index = buf.items.len; + try buf.resize(buf.items.len + codepoint_sequence_length); + _ = std.unicode.utf8Encode(codepoint, buf.items[start_index..]) catch unreachable; + break; + } + }, + .octal => { + const max_val = switch (string_type) { + .ascii => 0o777, + .wide => 0o7777777, + }; + const max_digits = switch (string_type) { + .ascii => 3, + .wide => 7, + }; + const val = rand.uintAtMost(u21, max_val); + const width = rand.intRangeAtMost(u21, 1, max_digits); + try buf.ensureUnusedCapacity(max_digits + 1); + const unused_slice = buf.unusedCapacitySlice(); + const written = std.fmt.bufPrint(unused_slice, "\\{o:0>[1]}", .{ val, width }) catch unreachable; + buf.items.len += written.len; + }, + .hex => { + const max_val = switch (string_type) { + .ascii => 0xFF, + .wide => 0xFFFF, + }; + const max_digits = switch (string_type) { + .ascii => 2, + .wide => 4, + }; + const val = rand.uintAtMost(u16, max_val); + const width = rand.intRangeAtMost(u16, 1, max_digits); + try buf.ensureUnusedCapacity(max_digits + 2); + const unused_slice = buf.unusedCapacitySlice(); + const written = std.fmt.bufPrint(unused_slice, "\\x{x:0>[1]}", .{ val, width }) catch unreachable; + buf.items.len += written.len; + }, } - buf[slice_len - 1] = '"'; +} - return buf; +pub fn 
randomStringLiteral( + comptime string_type: resinator.literals.StringType, + allocator: Allocator, + rand: std.rand.Random, + max_num_sequences: u16, +) ![]const u8 { + const num_sequences = rand.uintAtMostBiased(u16, max_num_sequences); + return randomStringLiteralExact(string_type, allocator, rand, num_sequences); +} + +pub fn randomStringLiteralExact( + comptime string_type: resinator.literals.StringType, + allocator: Allocator, + rand: std.rand.Random, + num_sequences: u16, +) ![]const u8 { + var buf = try std.ArrayList(u8).initCapacity(allocator, 2 + num_sequences * 4); + errdefer buf.deinit(); + + if (string_type == .wide) try buf.append('L'); + try buf.append('"'); + var i: usize = 0; + while (i < num_sequences) : (i += 1) { + try appendRandomStringLiteralSequence(rand, &buf, i == num_sequences - 1, string_type); + } + try buf.append('"'); + + return try buf.toOwnedSlice(); } /// Alphanumeric ASCII + any bytes >= 128 @@ -394,6 +468,15 @@ pub fn randomAlphanumExtendedBytes(allocator: Allocator, rand: std.rand.Random) return buf; } +pub fn appendNumberStr(buf: *std.ArrayList(u8), num: anytype) !void { + const num_digits = if (num == 0) 1 else std.math.log10_int(num) + 1; + try buf.ensureUnusedCapacity(num_digits); + const unused_slice = buf.unusedCapacitySlice(); + const written = std.fmt.bufPrint(unused_slice, "{}", .{num}) catch unreachable; + std.debug.assert(written.len == num_digits); + buf.items.len += num_digits; +} + /// Iterates all K-permutations of the given size `n` where k varies from (0..n), /// or (0..max_k) if specified via `initMax`. /// e.g. for AllKPermutationsIterator(3) the returns from `next` will be (in this order):