From 524e7346f5fc1ace4d8de11c3f7674245d8f3cc8 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Mon, 29 Jan 2024 21:52:34 -0500 Subject: [PATCH 1/2] Some possible optimizations. --- src/UTF8_validation.cs | 234 ++++++++++++++++++++++++----------------- 1 file changed, 137 insertions(+), 97 deletions(-) diff --git a/src/UTF8_validation.cs b/src/UTF8_validation.cs index e929c1d..3e2a690 100644 --- a/src/UTF8_validation.cs +++ b/src/UTF8_validation.cs @@ -19,55 +19,65 @@ public static class Vector256Extensions // Gets the second lane of the current vector and the first lane of the previous vector and returns, then shift it right by an appropriate number of bytes (less than 16, or less than 128 bits) [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector256 Prev(this Vector256 current, Vector256 prev, int N = 1) + public static Vector256 Prev1(this Vector256 current, Vector256 prev) { // Permute2x128 takes two 128-bit lane of two 256-bit vector and fuse them into a single vector // 0x21 = 00 10 00 01 translates into a fusing of // second 128-bit lane of first source, // first 128bit lane of second source, + // Compiles to: + // vperm2i128 ymm1, ymm1, ymm0, 33 + // vpalignr ymm0, ymm0, ymm1, 15 Vector256 shuffle = Avx2.Permute2x128(prev, current, 0x21); - return Avx2.AlignRight(current, shuffle, (byte)(16 - N)); //shifts right by a certain amount + return Avx2.AlignRight(current, shuffle, (byte)(16 - 1)); //shifts right by a certain amount } - public static Vector256 Lookup16(this Vector256 source, Vector256 lookupTable) + public static Vector256 Prev2(this Vector256 current, Vector256 prev) { - return Avx2.Shuffle(lookupTable, source); + + // Permute2x128 takes two 128-bit lane of two 256-bit vector and fuse them into a single vector + // 0x21 = 00 10 00 01 translates into a fusing of + // second 128-bit lane of first source, + // first 128bit lane of second source, + // Compiles to + // vperm2i128 ymm1, ymm1, ymm0, 33 + // vpalignr ymm0, ymm0, 
ymm1, 14 + Vector256 shuffle = Avx2.Permute2x128(prev, current, 0x21); + return Avx2.AlignRight(current, shuffle, (byte)(16 - 2)); //shifts right by a certain amount } - public static Vector256 Lookup16(this Vector256 source, - byte replace0, byte replace1, byte replace2, byte replace3, - byte replace4, byte replace5, byte replace6, byte replace7, - byte replace8, byte replace9, byte replace10, byte replace11, - byte replace12, byte replace13, byte replace14, byte replace15) - { - // if (!Avx2.IsSupported) - // { - // throw new PlatformNotSupportedException("AVX2 is not supported on this processor."); - // } - Vector256 lookupTable = Vector256.Create( - replace0, replace1, replace2, replace3, - replace4, replace5, replace6, replace7, - replace8, replace9, replace10, replace11, - replace12, replace13, replace14, replace15, - // Repeat the pattern for the remaining elements - replace0, replace1, replace2, replace3, - replace4, replace5, replace6, replace7, - replace8, replace9, replace10, replace11, - replace12, replace13, replace14, replace15 - ); + public static Vector256 Prev3(this Vector256 current, Vector256 prev) + { + // Permute2x128 takes two 128-bit lane of two 256-bit vector and fuse them into a single vector + // 0x21 = 00 10 00 01 translates into a fusing of + // second 128-bit lane of first source, + // first 128bit lane of second source, + // Compiles to + // vperm2i128 ymm1, ymm1, ymm0, 33 + // vpalignr ymm0, ymm0, ymm1, 13 + Vector256 shuffle = Avx2.Permute2x128(prev, current, 0x21); + return Avx2.AlignRight(current, shuffle, (byte)(16 - 3)); //shifts right by a certain amount + } + public static Vector256 Lookup16(this Vector256 source, Vector256 lookupTable) + { + // Compiles to + // vpshufb ymm0, ymm0, ymmword ptr[rdx] return Avx2.Shuffle(lookupTable, source); } - public static Vector256 ShiftRightLogical(this Vector256 vector, byte shiftAmount) + + public static Vector256 ShiftRightLogical4(this Vector256 vector) { + // Compiles to + // vpsrlw 
ymm0, ymm0, 4 Vector256 extended = vector.AsUInt16(); // Perform the shift operation on each 16-bit element - Vector256 shifted = Avx2.ShiftRightLogical(extended, shiftAmount); + Vector256 shifted = Avx2.ShiftRightLogical(extended, 4); Vector256 narrowed = shifted.AsByte(); @@ -119,7 +129,7 @@ public static unsafe class Utf8Utility return pInputBuffer; } - var checker = new SimdUnicode.utf8_validation.utf8_checker(); + var checker = new SimdUnicode.Utf8Validation.utf8_checker(); int processedLength = 0; // Helpers.CheckForGCCollections("Before AVX2 procession"); @@ -129,7 +139,7 @@ public static unsafe class Utf8Utility Vector256 currentBlock = Avx.LoadVector256(pInputBuffer + processedLength); // Helpers.CheckForGCCollections($"Before check_next_input:{processedLength}"); - checker.check_next_input(currentBlock); + checker.CheckNextInput(currentBlock); // Helpers.CheckForGCCollections($"After check_next_input:{processedLength}"); processedLength += 32; @@ -149,7 +159,7 @@ public static unsafe class Utf8Utility Vector256 remainingBlock = Vector256.Create(remainingBytes.ToArray()); - checker.check_next_input(remainingBlock); + checker.CheckNextInput(remainingBlock); processedLength += inputLength - processedLength; } @@ -172,8 +182,8 @@ public static unsafe class Utf8Utility // } - checker.check_eof(); - if (checker.errors()) + checker.CheckEof(); + if (checker.Errors()) { return pInputBuffer + processedLength; } @@ -184,7 +194,7 @@ public static unsafe class Utf8Utility // C# docs suggests that classes are allocated on the heap: // it doesnt seem to do much in this case but I thought the suggestion to be sensible. - public struct utf8_validation + public struct Utf8Validation { public struct utf8_checker { @@ -208,7 +218,7 @@ public utf8_checker() // This is the simplest least time-consuming implementation. 
[MethodImpl(MethodImplOptions.AggressiveInlining)] - public void check_next_input(Vector256 input) + public void CheckNextInput(Vector256 input) { // Check if the entire 256-bit vector is ASCII @@ -218,8 +228,8 @@ public void check_next_input(Vector256 input) { // Contains non-ASCII characters, process the vector - check_utf8_bytes(input, prev_input_block); - prev_incomplete = is_incomplete(input); + CheckUtf8Bytes(input, prev_input_block); + prev_incomplete = IsIncomplete(input); } @@ -232,76 +242,87 @@ public void check_next_input(Vector256 input) [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void check_utf8_bytes(Vector256 input, Vector256 prev_input) + public void CheckUtf8Bytes(Vector256 input, Vector256 prevInput) { - Vector256 prev1 = input.Prev(prev_input, 1); + Vector256 prev1 = input.Prev1(prevInput); // check 1-2 bytes character - Vector256 sc = check_special_cases(input, prev1); + Vector256 sc = CheckSpecialCases(input, prev1); // Console.WriteLine("Special_case Vector before check_multibyte_lengths: " + VectorToString(error)); // All remaining checks are for invalid 3-4 byte sequences, which either have too many continuations // or not enough (section 6.2 of the paper) - error = Avx2.Or(error, check_multibyte_lengths(input, prev_input, sc)); + error = Avx2.Or(error, CheckMultibyteLengths(input, prevInput, sc)); // Console.WriteLine("Error Vector after check_utf8_bytes/after check_multibyte_lengths: " + VectorToString(error)); } // [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool errors() + public bool Errors() { // Console.WriteLine("Error Vector at the end: " + VectorToString(error)); - + // compiles to: + // vptest ymm0, ymm0 + // setne al + // movzx rax, al return !Avx2.TestZ(error, error); } // [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void check_eof() + public void CheckEof() { // Console.WriteLine("Error Vector before check_eof(): " + VectorToString(error)); // Console.WriteLine("prev_incomplete 
Vector in check_eof(): " + VectorToString(prev_incomplete)); - + // Compiles to: + // vpor ymm0, ymm0, ymmword ptr [rcx+0x40] error = Avx2.Or(error, prev_incomplete); // Console.WriteLine("Error Vector before check_eof(): " + VectorToString(error)); } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - - // This corresponds to section 6.1 e.g Table 6 of the paper e.g. 1-2 bytes - private Vector256 check_special_cases(Vector256 input, Vector256 prev1) - { - - // define bits that indicate error code - // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) - // Bit 1 = Too Long (ASCII followed by continuation) - // Bit 2 = Overlong 3-byte - // Bit 4 = Surrogate - // Bit 5 = Overlong 2-byte - // Bit 7 = Two Continuations - const byte TOO_SHORT = 1 << 0; - const byte TOO_LONG = 1 << 1; - const byte OVERLONG_3 = 1 << 2; - const byte SURROGATE = 1 << 4; - const byte OVERLONG_2 = 1 << 5; - const byte TWO_CONTS = 1 << 7; - const byte TOO_LARGE = 1 << 3; - const byte TOO_LARGE_1000 = 1 << 6; - const byte OVERLONG_4 = 1 << 6; - const byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; - - Vector256 byte_1_high = prev1.ShiftRightLogical(4).Lookup16( + const byte TOO_SHORT = 1 << 0; + const byte TOO_LONG = 1 << 1; + const byte OVERLONG_3 = 1 << 2; + const byte SURROGATE = 1 << 4; + const byte OVERLONG_2 = 1 << 5; + const byte TWO_CONTS = 1 << 7; + const byte TOO_LARGE = 1 << 3; + const byte TOO_LARGE_1000 = 1 << 6; + const byte OVERLONG_4 = 1 << 6; + const byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; + + static readonly Vector256 shuf1 = Vector256.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + TOO_SHORT | OVERLONG_2, + TOO_SHORT, + TOO_SHORT | OVERLONG_3 | SURROGATE, + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, TOO_SHORT | OVERLONG_2, TOO_SHORT, 
TOO_SHORT | OVERLONG_3 | SURROGATE, - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 - ); + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); - Vector256 byte_1_low = (prev1 & Vector256.Create((byte)0x0F)).Lookup16( + static readonly Vector256 shuf2 = Vector256.Create(CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + CARRY | OVERLONG_2, + CARRY, + CARRY, + CARRY | TOO_LARGE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, CARRY | OVERLONG_2, CARRY, @@ -317,39 +338,61 @@ private Vector256 check_special_cases(Vector256 input, Vector256 byte_2_high = input.ShiftRightLogical(4).Lookup16( + static readonly Vector256 shuf3 = Vector256.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT - ); + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + + // This corresponds to 
section 6.1 e.g Table 6 of the paper e.g. 1-2 bytes + private static Vector256 CheckSpecialCases(Vector256 input, Vector256 prev1) + { + + // define bits that indicate error code + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + + Vector256 byte_1_high = prev1.ShiftRightLogical4().Lookup16(shuf1); + + Vector256 byte_1_low = (prev1 & Vector256.Create((byte)0x0F)).Lookup16(shuf2); + + Vector256 byte_2_high = input.ShiftRightLogical4().Lookup16(shuf3); return Avx2.And(Avx2.And(byte_1_high, byte_1_low), byte_2_high); } [MethodImpl(MethodImplOptions.AggressiveInlining)] - - private Vector256 check_multibyte_lengths(Vector256 input, Vector256 prev_input, Vector256 sc) + private static Vector256 CheckMultibyteLengths(Vector256 input, Vector256 prev_input, Vector256 sc) { // Console.WriteLine("sc: " + VectorToString(sc)); // Console.WriteLine("Input: " + VectorToString(input)); // Console.WriteLine("Input(Binary): " + VectorToBinary(input)); - Vector256 prev2 = input.Prev(prev_input, 2); + Vector256 prev2 = input.Prev2(prev_input); // Console.WriteLine("Prev2: " + VectorToBinary(prev2)); - Vector256 prev3 = input.Prev(prev_input, 3); + Vector256 prev3 = input.Prev3(prev_input); // Console.WriteLine("Prev3: " + VectorToBinary(prev3)); - Vector256 must23 = must_be_2_3_continuation(prev2, prev3); + Vector256 must23 = Must_be_2_3_continuation(prev2, prev3); // Console.WriteLine("must be 2 3 continuation: " + VectorToString(must23)); Vector256 must23_80 = Avx2.And(must23, Vector256.Create((byte)0x80)); @@ -358,8 +401,7 @@ private Vector256 check_multibyte_lengths(Vector256 input, Vector256 } [MethodImpl(MethodImplOptions.AggressiveInlining)] - - private Vector256 must_be_2_3_continuation(Vector256 prev2, Vector256 prev3) + private static Vector256 Must_be_2_3_continuation(Vector256 prev2, 
Vector256 prev3) { Vector256 is_third_byte = Avx2.SubtractSaturate(prev2, Vector256.Create((byte)(0b11100000u - 0x80))); Vector256 is_fourth_byte = Avx2.SubtractSaturate(prev3, Vector256.Create((byte)(0b11110000u - 0x80))); @@ -376,25 +418,21 @@ private Vector256 must_be_2_3_continuation(Vector256 prev2, Vector25 } - private static readonly byte[] MaxArray = new byte[32] - { - 255, 255, 255, 255, 255, 255, 255, 255, + static readonly Vector256 maxValue = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1 - }; - Vector256 maxValue = Vector256.Create(MaxArray); + 255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1); - // private static readonly Vector256 maxValue = Vector256.Create( - // 255, 255, 255, 255, 255, 255, 255, 255, - // 255, 255, 255, 255, 255, 255, 255, 255, - // 255, 255, 255, 255, 255, 255, 255, 255, - // 255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1); + // private static readonly Vector256 maxValue = Vector256.Create( + // 255, 255, 255, 255, 255, 255, 255, 255, + // 255, 255, 255, 255, 255, 255, 255, 255, + // 255, 255, 255, 255, 255, 255, 255, 255, + // 255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1); [MethodImpl(MethodImplOptions.AggressiveInlining)] - private Vector256 is_incomplete(Vector256 input) + private static Vector256 IsIncomplete(Vector256 input) { // Console.WriteLine("Input Vector is_incomplete: " + VectorToString(input)); // byte[] maxArray = new byte[32] @@ -414,8 +452,10 @@ private Vector256 is_incomplete(Vector256 input) [MethodImpl(MethodImplOptions.AggressiveInlining)] - private Vector256 SaturatingSubtractUnsigned(Vector256 left, Vector256 right) + private static Vector256 SaturatingSubtractUnsigned(Vector256 left, Vector256 right) { + // Compiles to + // vpsubusw ymm0, ymm0, ymmword ptr [r8] if 
(!Avx2.IsSupported) { throw new PlatformNotSupportedException("AVX2 is not supported on this processor."); From 7bc999b874431b4bce10cafd2c4d128b3b34d4c7 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Tue, 30 Jan 2024 00:03:26 -0500 Subject: [PATCH 2/2] More optimizations. --- benchmark/Benchmark.cs | 6 +- src/UTF8_validation.cs | 302 +++++++++++++++++++++++++++-------------- 2 files changed, 201 insertions(+), 107 deletions(-) diff --git a/benchmark/Benchmark.cs b/benchmark/Benchmark.cs index 273d5ac..6354f9f 100644 --- a/benchmark/Benchmark.cs +++ b/benchmark/Benchmark.cs @@ -95,7 +95,7 @@ protected void IntroduceError(byte[] utf8, Random random) { utf8[position + 1] = (byte)((utf8[position + 1] & 0b11000011) | (s << 2)); errorIntroduced = true; - break; // Just introduce one surrogate error + break; // Just introduce one surrogate error // TODO: having a loop that breaks immediately does not make much sense !!!!!!!!!!!!!!!!!!!!!! } } break; @@ -433,7 +433,7 @@ public class RealDataBenchmark : BenchmarkBase @"data/turkish.utf8.txt", @"data/german.utf8.txt", @"data/japanese.utf8.txt")] - public string FileName; + public string? 
FileName; private string[] _lines = Array.Empty(); private byte[][] _linesUtf8 = Array.Empty(); @@ -607,7 +607,7 @@ public static void Main(string[] args) } // Create a BenchmarkDotNet config with a custom maximum parameter column width - var config = DefaultConfig.Instance.With(SummaryStyle.Default.WithMaxParameterColumnWidth(100)); + var config = DefaultConfig.Instance.With(summaryStyle: SummaryStyle.Default.WithMaxParameterColumnWidth(100)); // Check if a specific argument (e.g., "runall") is provided if (args.Length > 0 && args[0] == "runall") diff --git a/src/UTF8_validation.cs b/src/UTF8_validation.cs index 3e2a690..b91550d 100644 --- a/src/UTF8_validation.cs +++ b/src/UTF8_validation.cs @@ -3,6 +3,8 @@ using System.Runtime.Intrinsics.X86; using System.Linq; using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics.Arm; +using static System.Net.Mime.MediaTypeNames; // C# already have something that is *more or less* equivalent to our C++ simd class: // Vector256 https://learn.microsoft.com/en-us/dotnet/api/system.runtime.intrinsics.vector256-1?view=net-7.0 @@ -46,8 +48,7 @@ public static Vector256 Prev2(this Vector256 current, Vector256 shuffle = Avx2.Permute2x128(prev, current, 0x21); return Avx2.AlignRight(current, shuffle, (byte)(16 - 2)); //shifts right by a certain amount } - - + public static Vector256 Prev3(this Vector256 current, Vector256 prev) { @@ -116,20 +117,25 @@ public static unsafe class Utf8Utility // return string.Join(" ", binaryStrings); // } - - - - // Returns a pointer to the first invalid byte in the input buffer if it's invalid, or a pointer to the end if it's valid. // [MethodImpl(MethodImplOptions.AggressiveInlining)] public static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength) { + //////////////// + // TODO: I recommend taking this code and calling it something + // else. 
Then have the current function (GetPointerToFirstInvalidByte) + // call the SIMD function only if inputLength is sufficiently large (maybe 64 bytes), + // otherwise, use the scalar function. + //////////////// if (pInputBuffer == null || inputLength <= 0) { return pInputBuffer; } + Vector256 error = Vector256.Zero; + Vector256 prev_input_block = Vector256.Zero; + Vector256 prev_incomplete = Vector256.Zero; + - var checker = new SimdUnicode.Utf8Validation.utf8_checker(); int processedLength = 0; // Helpers.CheckForGCCollections("Before AVX2 procession"); @@ -139,7 +145,7 @@ public static unsafe class Utf8Utility Vector256 currentBlock = Avx.LoadVector256(pInputBuffer + processedLength); // Helpers.CheckForGCCollections($"Before check_next_input:{processedLength}"); - checker.CheckNextInput(currentBlock); + Utf8Validation.utf8_checker.CheckNextInput(currentBlock, ref prev_input_block, ref prev_incomplete, ref error); // Helpers.CheckForGCCollections($"After check_next_input:{processedLength}"); processedLength += 32; @@ -147,10 +153,13 @@ public static unsafe class Utf8Utility } // Helpers.CheckForGCCollections("After AVX2 procession"); - - + if (processedLength < inputLength) { + // Unfortunalely, this approach with stackalloc might be expensive. + // TODO: replace it by a simple scalar routine. You need to handle + // prev_incomplete but it should be doable. 
+ Span remainingBytes = stackalloc byte[32]; for (int i = 0; i < inputLength - processedLength; i++) { @@ -158,8 +167,7 @@ public static unsafe class Utf8Utility } Vector256 remainingBlock = Vector256.Create(remainingBytes.ToArray()); - - checker.CheckNextInput(remainingBlock); + Utf8Validation.utf8_checker.CheckNextInput(remainingBlock, ref prev_input_block, ref prev_incomplete, ref error); processedLength += inputLength - processedLength; } @@ -171,7 +179,7 @@ public static unsafe class Utf8Utility // { // // Directly call the scalar function on the remaining part of the buffer // byte* invalidBytePointer = GetPointerToFirstInvalidByte(pInputBuffer + processedLength, inputLength - processedLength -1); - + // // You can then use `invalidBytePointer` as needed, for example: // // if (invalidBytePointer != pInputBuffer + inputLength) { // // // Handle the case where an invalid byte is found @@ -180,10 +188,10 @@ public static unsafe class Utf8Utility // // Update processedLength to reflect the processing done by the scalar function // processedLength += (int)(invalidBytePointer - pInputBuffer); // } - - checker.CheckEof(); - if (checker.Errors()) + + Utf8Validation.utf8_checker.CheckEof(ref error, prev_incomplete); + if (Utf8Validation.utf8_checker.Errors(error)) { return pInputBuffer + processedLength; } @@ -198,37 +206,67 @@ public struct Utf8Validation { public struct utf8_checker { - Vector256 error; - Vector256 prev_input_block; - Vector256 prev_incomplete; - - - public utf8_checker() - { - error = Vector256.Zero; - prev_input_block = Vector256.Zero; - prev_incomplete = Vector256.Zero; - } - // This is the first point of entry for this function // The original C++ implementation is much more extensive and assumes a 512 bit stream as well as several implementations // In this case I focus solely on AVX2 instructions for prototyping and benchmarking purposes. // This is the simplest least time-consuming implementation. 
[MethodImpl(MethodImplOptions.AggressiveInlining)] - public void CheckNextInput(Vector256 input) + public static void CheckNextInput(Vector256 input, ref Vector256 prev_input_block, ref Vector256 prev_incomplete, ref Vector256 error) { + // Compiles to: + /* + G_M000_IG02: ;; offset=0x0003 + vmovups ymm0, ymmword ptr [rcx] + vpmovmskb eax, ymm0 + test eax, eax + je G_M000_IG04 + + G_M000_IG03: ;; offset=0x0013 + vmovups ymm1, ymmword ptr [rdx] + vperm2i128 ymm1, ymm1, ymm0, 33 + vpalignr ymm2, ymm0, ymm1, 15 + vpsrlw ymm3, ymm2, 4 + vmovups ymm4, ymmword ptr [reloc @RWD00] + vpshufb ymm3, ymm4, ymm3 + vpand ymm2, ymm2, ymmword ptr [reloc @RWD32] + vmovups ymm4, ymmword ptr [reloc @RWD64] + vpshufb ymm2, ymm4, ymm2 + vpand ymm2, ymm3, ymm2 + vpsrlw ymm3, ymm0, 4 + vmovups ymm4, ymmword ptr [reloc @RWD96] + vpshufb ymm3, ymm4, ymm3 + vpand ymm2, ymm2, ymm3 + vmovups ymm3, ymmword ptr [r9] + vpalignr ymm4, ymm0, ymm1, 14 + vpsubusb ymm4, ymm4, ymmword ptr [reloc @RWD128] + vpalignr ymm0, ymm0, ymm1, 13 + vpsubusb ymm0, ymm0, ymmword ptr [reloc @RWD160] + vpor ymm0, ymm4, ymm0 + vpand ymm0, ymm0, ymmword ptr [reloc @RWD192] + vpxor ymm0, ymm0, ymm2 + vpor ymm0, ymm3, ymm0 + vmovups ymmword ptr [r9], ymm0 + vmovups ymm0, ymmword ptr [rcx] + vpsubusw ymm0, ymm0, ymmword ptr [reloc @RWD224] + vmovups ymmword ptr [r8], ymm0 + + G_M000_IG04: ;; offset=0x00AF + vmovups ymm0, ymmword ptr [rcx] + vmovups ymmword ptr [rdx], ymm0 + */ // Check if the entire 256-bit vector is ASCII - + + Vector256 inputSBytes = input.AsSByte(); // Reinterpret the byte vector as sbyte int mask = Avx2.MoveMask(inputSBytes.AsByte()); if (mask != 0) { // Contains non-ASCII characters, process the vector - CheckUtf8Bytes(input, prev_input_block); + CheckUtf8Bytes(input, prev_input_block, ref error); prev_incomplete = IsIncomplete(input); } @@ -242,8 +280,33 @@ public void CheckNextInput(Vector256 input) [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void CheckUtf8Bytes(Vector256 input, 
Vector256 prevInput) + public static void CheckUtf8Bytes(Vector256 input, Vector256 prevInput, ref Vector256 error) { + // compiles to + // vmovups ymm0, ymmword ptr [rcx] + // vmovups ymm1, ymmword ptr[rdx] + // vperm2i128 ymm1, ymm1, ymm0, 33 + // vpalignr ymm2, ymm0, ymm1, 15 + // vpsrlw ymm3, ymm2, 4 + // vmovups ymm4, ymmword ptr[reloc @RWD00] + // vpshufb ymm3, ymm4, ymm3 + // vpand ymm2, ymm2, ymmword ptr[reloc @RWD32] + // vmovups ymm4, ymmword ptr[reloc @RWD64] + // vpshufb ymm2, ymm4, ymm2 + // vpand ymm2, ymm3, ymm2 + // vpsrlw ymm3, ymm0, 4 + // vmovups ymm4, ymmword ptr[reloc @RWD96] + // vpshufb ymm3, ymm4, ymm3 + // vpand ymm2, ymm2, ymm3 + // vmovups ymm3, ymmword ptr[r8] + // vpalignr ymm4, ymm0, ymm1, 14 + // vpsubusb ymm4, ymm4, ymmword ptr[reloc @RWD128] + // vpalignr ymm0, ymm0, ymm1, 13 + // vpsubusb ymm0, ymm0, ymmword ptr[reloc @RWD160] + // vpor ymm0, ymm4, ymm0 + // vpand ymm0, ymm0, ymmword ptr[reloc @RWD192] + // vpxor ymm0, ymm0, ymm2 + // vpor ymm0, ymm3, ymm0 Vector256 prev1 = input.Prev1(prevInput); // check 1-2 bytes character Vector256 sc = CheckSpecialCases(input, prev1); @@ -258,7 +321,7 @@ public void CheckUtf8Bytes(Vector256 input, Vector256 prevInput) // [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool Errors() + public static bool Errors(Vector256 error) { // Console.WriteLine("Error Vector at the end: " + VectorToString(error)); // compiles to: @@ -270,7 +333,7 @@ public bool Errors() // [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void CheckEof() + public static void CheckEof(ref Vector256 error, Vector256 prev_incomplete) { // Console.WriteLine("Error Vector before check_eof(): " + VectorToString(error)); // Console.WriteLine("prev_incomplete Vector in check_eof(): " + VectorToString(prev_incomplete)); @@ -292,67 +355,6 @@ public void CheckEof() const byte OVERLONG_4 = 1 << 6; const byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; - static readonly Vector256 shuf1 = Vector256.Create(TOO_LONG, TOO_LONG, 
TOO_LONG, TOO_LONG, - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - TOO_SHORT | OVERLONG_2, - TOO_SHORT, - TOO_SHORT | OVERLONG_3 | SURROGATE, - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4, - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - TOO_SHORT | OVERLONG_2, - TOO_SHORT, - TOO_SHORT | OVERLONG_3 | SURROGATE, - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); - - static readonly Vector256 shuf2 = Vector256.Create(CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - CARRY | OVERLONG_2, - CARRY, - CARRY, - CARRY | TOO_LARGE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - CARRY | OVERLONG_2, - CARRY, - CARRY, - CARRY | TOO_LARGE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000); - - static readonly Vector256 shuf3 = Vector256.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | 
OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -367,7 +369,82 @@ private static Vector256 CheckSpecialCases(Vector256 input, Vector25 // Bit 4 = Surrogate // Bit 5 = Overlong 2-byte // Bit 7 = Two Continuations - + // Compiles to + // vmovups ymm0, ymmword ptr [r8] + // vpsrlw ymm1, ymm0, 4 + // vmovups ymm2, ymmword ptr[reloc @RWD00] + // vpshufb ymm1, ymm2, ymm1 + // vpand ymm0, ymm0, ymmword ptr[reloc @RWD32] + // vmovups ymm2, ymmword ptr[reloc @RWD64] + // vpshufb ymm0, ymm2, ymm0 + // vpand ymm0, ymm1, ymm0 + // vmovups ymm1, ymmword ptr[rdx] + // vpsrlw ymm1, ymm1, 4 + // vmovups ymm2, ymmword ptr[reloc @RWD96] + // vpshufb ymm1, ymm2, ymm1 + // vpand ymm0, ymm0, ymm1 + + Vector256 shuf1 = Vector256.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + TOO_SHORT | OVERLONG_2, + TOO_SHORT, + TOO_SHORT | OVERLONG_3 | SURROGATE, + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + TOO_SHORT | OVERLONG_2, + TOO_SHORT, + TOO_SHORT | OVERLONG_3 | SURROGATE, + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + + Vector256 shuf2 = Vector256.Create(CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + CARRY | OVERLONG_2, + CARRY, + CARRY, + CARRY | TOO_LARGE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | 
TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + CARRY | OVERLONG_2, + CARRY, + CARRY, + CARRY | TOO_LARGE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + + Vector256 shuf3 = Vector256.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); Vector256 byte_1_high = prev1.ShiftRightLogical4().Lookup16(shuf1); Vector256 byte_1_low = (prev1 & Vector256.Create((byte)0x0F)).Lookup16(shuf2); @@ -384,6 +461,14 @@ private static Vector256 CheckMultibyteLengths(Vector256 input, Vect // Console.WriteLine("Input: " + 
VectorToString(input)); // Console.WriteLine("Input(Binary): " + VectorToBinary(input)); + // compiles to: + // vperm2i128 ymm1, ymm1, ymm0, 33 + // vpalignr ymm2, ymm0, ymm1, 14 + // vpsubusb ymm2, ymm2, ymmword ptr[reloc @RWD00] + // vpalignr ymm0, ymm0, ymm1, 13 + // vpsubusb ymm0, ymm0, ymmword ptr[reloc @RWD32] + // vpor ymm0, ymm2, ymm0 + // vpand ymm0, ymm0, ymmword ptr[reloc @RWD64] Vector256 prev2 = input.Prev2(prev_input); // Console.WriteLine("Prev2: " + VectorToBinary(prev2)); @@ -392,7 +477,7 @@ private static Vector256 CheckMultibyteLengths(Vector256 input, Vect // Console.WriteLine("Prev3: " + VectorToBinary(prev3)); - Vector256 must23 = Must_be_2_3_continuation(prev2, prev3); + Vector256 must23 = MustBe23Continuation(prev2, prev3); // Console.WriteLine("must be 2 3 continuation: " + VectorToString(must23)); Vector256 must23_80 = Avx2.And(must23, Vector256.Create((byte)0x80)); @@ -401,8 +486,16 @@ private static Vector256 CheckMultibyteLengths(Vector256 input, Vect } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector256 Must_be_2_3_continuation(Vector256 prev2, Vector256 prev3) + private static Vector256 MustBe23Continuation(Vector256 prev2, Vector256 prev3) { + // Compiles to + // vmovups ymm0, ymmword ptr [rdx] + // vpsubusb ymm0, ymm0, ymmword ptr [reloc @RWD00] + // vmovups ymm1, ymmword ptr[r8] + // vpsubusb ymm1, ymm1, ymmword ptr[reloc @RWD32] + // vpor ymm0, ymm0, ymm1 + + Vector256 is_third_byte = Avx2.SubtractSaturate(prev2, Vector256.Create((byte)(0b11100000u - 0x80))); Vector256 is_fourth_byte = Avx2.SubtractSaturate(prev3, Vector256.Create((byte)(0b11110000u - 0x80))); @@ -418,11 +511,6 @@ private static Vector256 Must_be_2_3_continuation(Vector256 prev2, V } - static readonly Vector256 maxValue = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1); - // 
private static readonly Vector256 maxValue = Vector256.Create( // 255, 255, 255, 255, 255, 255, 255, 255, // 255, 255, 255, 255, 255, 255, 255, 255, @@ -443,7 +531,13 @@ private static Vector256 IsIncomplete(Vector256 input) // 255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1 // }; // Vector256 max_value = Vector256.Create(maxArray); - + // Compiles to + // vmovups ymm0, ymmword ptr [rdx] + // vpsubusw ymm0, ymm0, ymmword ptr[reloc @RWD00] + Vector256 maxValue = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1); Vector256 result = SaturatingSubtractUnsigned(input, maxValue); // Console.WriteLine("Result Vector is_incomplete: " + VectorToString(result));