From 0bccb1ece5db6a0a073f083bf6ed5a141ee0be74 Mon Sep 17 00:00:00 2001 From: Paulo Matos Date: Wed, 22 Jan 2025 10:04:25 +0100 Subject: [PATCH 1/3] Ensure predicate cache is reset when control flow leaves block Whenever the control flow leaves the block, it might clobber the predicate register so we reset the cache whenever that happens. Fixes #4264 --- .../Core/ArchHelpers/Arm64Emitter.cpp | 19 ++++++++------- .../Interface/Core/ArchHelpers/Arm64Emitter.h | 10 ++++++++ .../Source/Interface/Core/JIT/MemoryOps.cpp | 24 +++++++------------ .../Interface/Core/OpcodeDispatcher.cpp | 7 ++---- FEXCore/Source/Interface/IR/IR.json | 15 +++++------- FEXCore/Source/Interface/IR/IREmitter.cpp | 1 + .../IR/Passes/x87StackOptimizationPass.cpp | 7 +++--- 7 files changed, 42 insertions(+), 41 deletions(-) diff --git a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp index d54a467e3f..b49abc0e54 100644 --- a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp +++ b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp @@ -1,7 +1,6 @@ // SPDX-License-Identifier: MIT #include "Interface/Core/ArchHelpers/Arm64Emitter.h" #include "FEXCore/Core/X86Enums.h" -#include "FEXCore/Utils/AllocatorHooks.h" #include "Interface/Core/Dispatcher/Dispatcher.h" #include "Interface/Context/Context.h" @@ -60,8 +59,8 @@ namespace x64 { // p6 and p7 registers are used as temporaries no not added here for RA // See PREF_TMP_16B and PREF_TMP_32B // p0-p1 are also used in the jit as temps. - // Also p8-p15 cannot be used can only encode p0-p7, so we're left with p2-p5. - constexpr std::array PR = {ARMEmitter::PReg::p2, ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; + // Also p8-p15 cannot be used can only encode p0-p7, p2 is a special register, so we're left with p3-p5. 
+ constexpr std::array PR = {ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; constexpr unsigned RAPairs = 6; @@ -100,6 +99,7 @@ namespace x64 { ARMEmitter::Reg::r20, ARMEmitter::Reg::r21, ARMEmitter::Reg::r22, + // PF/AF must be last. REG_PF, REG_AF, }; @@ -112,8 +112,8 @@ namespace x64 { // p6 and p7 registers are used as temporaries no not added here for RA // See PREF_TMP_16B and PREF_TMP_32B // p0-p1 are also used in the jit as temps. - // Also p8-p15 cannot be used can only encode p0-p7, so we're left with p2-p5. - constexpr std::array PR = {ARMEmitter::PReg::p2, ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; + // Also p8-p15 cannot be used can only encode p0-p7, p2 is a special register, so we're left with p3-p5. + constexpr std::array PR = {ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; constexpr unsigned RAPairs = 6; @@ -249,8 +249,8 @@ namespace x32 { // p6 and p7 registers are used as temporaries no not added here for RA // See PREF_TMP_16B and PREF_TMP_32B // p0-p1 are also used in the jit as temps. - // Also p8-p15 cannot be used can only encode p0-p7, so we're left with p2-p5. - constexpr std::array PR = {ARMEmitter::PReg::p2, ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; + // Also p8-p15 cannot be used can only encode p0-p7, p2 is a special register, so we're left with p3-p5. + constexpr std::array PR = {ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; // All are caller saved constexpr std::array SRAFPR = { @@ -631,7 +631,7 @@ void Arm64Emitter::FillSpecialRegs(ARMEmitter::Register TmpReg, ARMEmitter::Regi } #endif - if (SetPredRegs) { + if (SetPredRegs && (EmitterCTX->HostFeatures.SupportsSVE256 || EmitterCTX->HostFeatures.SupportsSVE128)) { // Set up predicate registers. // We don't bother spilling these in SpillStaticRegs, // since all that matters is we restore them on a fill. 
@@ -643,6 +643,9 @@ void Arm64Emitter::FillSpecialRegs(ARMEmitter::Register TmpReg, ARMEmitter::Regi if (EmitterCTX->HostFeatures.SupportsSVE128) { ptrue(ARMEmitter::SubRegSize::i8Bit, PRED_TMP_16B, ARMEmitter::PredicatePattern::SVE_VL16); } + + // Fill in the predicate register for the x87 ldst SVE optimization. + ptrue(ARMEmitter::SubRegSize::i16Bit, PRED_X87_SVEOPT, ARMEmitter::PredicatePattern::SVE_VL5); } } diff --git a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h index c30b582bd9..d61f14a8da 100644 --- a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h +++ b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h @@ -46,6 +46,10 @@ constexpr auto REG_AF = ARMEmitter::Reg::r27; // Vector temporaries constexpr auto VTMP1 = ARMEmitter::VReg::v0; constexpr auto VTMP2 = ARMEmitter::VReg::v1; + +// Predicate register for X87 SVE Optimization +constexpr auto SVE_OPT_PRED = ARMEmitter::PReg::p2; + #else constexpr auto TMP1 = ARMEmitter::XReg::x10; constexpr auto TMP2 = ARMEmitter::XReg::x11; @@ -65,6 +69,9 @@ constexpr auto VTMP2 = ARMEmitter::VReg::v17; constexpr auto EC_CALL_CHECKER_PC_REG = ARMEmitter::XReg::x9; constexpr auto EC_ENTRY_CPUAREA_REG = ARMEmitter::XReg::x17; +// Predicate register for X87 SVE Optimization +constexpr auto SVE_OPT_PRED = ARMEmitter::PReg::p2; + // These structures are not included in the standard Windows headers, define the offsets of members we care about for EC here. constexpr size_t TEB_CPU_AREA_OFFSET = 0x1788; constexpr size_t TEB_PEB_OFFSET = 0x60; @@ -79,6 +86,9 @@ constexpr uint64_t EC_CODE_BITMAP_MAX_ADDRESS = 1ULL << 47; // Will force one single instruction block to be generated first if set when entering the JIT filling SRA. 
constexpr auto ENTRY_FILL_SRA_SINGLE_INST_REG = TMP1; +// Predicate to use in the X87 SVE optimization +constexpr ARMEmitter::PRegister PRED_X87_SVEOPT = ARMEmitter::PReg::p2; + // Predicate register temporaries (used when AVX support is enabled) // PRED_TMP_16B indicates a predicate register that indicates the first 16 bytes set to 1. // PRED_TMP_32B indicates a predicate register that indicates the first 32 bytes set to 1. diff --git a/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp b/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp index b819aeecb5..47fdacfedf 100644 --- a/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp @@ -8,6 +8,7 @@ tags: backend|arm64 #include "FEXCore/Core/X86Enums.h" #include "FEXCore/Utils/LogManager.h" #include "Interface/Context/Context.h" +#include "Interface/Core/ArchHelpers/Arm64Emitter.h" #include "Interface/Core/CPUID.h" #include "Interface/Core/JIT/JITClass.h" #include "Interface/IR/IR.h" @@ -1590,21 +1591,14 @@ DEF_OP(StoreMem) { } } -DEF_OP(InitPredicate) { - const auto Op = IROp->C(); - const auto OpSize = IROp->Size; - ptrue(ConvertSubRegSize16(OpSize), GetPReg(Node), static_cast(Op->Pattern)); -} +DEF_OP(StoreMemX87SVEOptPredicate) { + const auto Op = IROp->C(); + const auto Predicate = PRED_X87_SVEOPT; -DEF_OP(StoreMemPredicate) { - const auto Op = IROp->C(); - const auto Predicate = GetPReg(Op->Mask.ID()); + LOGMAN_THROW_A_FMT(HostSupportsSVE128 || HostSupportsSVE256, "StoreMemX87SVEOptPredicate needs SVE support"); const auto RegData = GetVReg(Op->Value.ID()); const auto MemReg = GetReg(Op->Addr.ID()); - - LOGMAN_THROW_A_FMT(HostSupportsSVE128 || HostSupportsSVE256, "StoreMemPredicate needs SVE support"); - const auto MemDst = ARMEmitter::SVEMemOperand(MemReg.X(), 0); switch (IROp->ElementSize) { @@ -1628,13 +1622,13 @@ DEF_OP(StoreMemPredicate) { } } -DEF_OP(LoadMemPredicate) { - const auto Op = IROp->C(); +DEF_OP(LoadMemX87SVEOptPredicate) { + const auto Op = 
IROp->C(); const auto Dst = GetVReg(Node); - const auto Predicate = GetPReg(Op->Mask.ID()); + const auto Predicate = PRED_X87_SVEOPT; const auto MemReg = GetReg(Op->Addr.ID()); - LOGMAN_THROW_A_FMT(HostSupportsSVE128 || HostSupportsSVE256, "LoadMemPredicate needs SVE support"); + LOGMAN_THROW_A_FMT(HostSupportsSVE128 || HostSupportsSVE256, "LoadMemX87SVEOptPredicate needs SVE support"); const auto MemDst = ARMEmitter::SVEMemOperand(MemReg.X(), 0); diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index db1b5e38d0..edd51ae629 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -4313,9 +4313,7 @@ Ref OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, const X86T if (OpSize == OpSize::f80Bit) { Ref MemSrc = LoadEffectiveAddress(A, true); if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { - // Using SVE we can load this with a single instruction. - auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); - return _LoadMemPredicate(OpSize::i128Bit, OpSize::i16Bit, PReg, MemSrc); + return _LoadMemX87SVEOptPredicate(OpSize::i128Bit, OpSize::i16Bit, MemSrc); } else { // For X87 extended doubles, Split the load. auto Res = _LoadMem(Class, OpSize::i64Bit, MemSrc, Align == OpSize::iInvalid ? 
OpSize : Align); @@ -4448,8 +4446,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl if (OpSize == OpSize::f80Bit) { Ref MemStoreDst = LoadEffectiveAddress(A, true); if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { - auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); - _StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, Src, PReg, MemStoreDst); + _StoreMemX87SVEOptPredicate(OpSize::i128Bit, OpSize::i16Bit, Src, MemStoreDst); } else { // For X87 extended doubles, split before storing _StoreMem(FPRClass, OpSize::i64Bit, MemStoreDst, Src, Align); diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json index c2eff80377..d265046b90 100644 --- a/FEXCore/Source/Interface/IR/IR.json +++ b/FEXCore/Source/Interface/IR/IR.json @@ -567,19 +567,16 @@ ] }, - "PRED = InitPredicate OpSize:#Size, u8:$Pattern": { - "Desc": ["Initialize predicate register from Pattern"], - "DestSize": "Size" - }, - - "StoreMemPredicate OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Value, PRED:$Mask, GPR:$Addr": { - "Desc": [ "Stores a value to memory using SVE predicate mask." ], + "StoreMemX87SVEOptPredicate OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Value, GPR:$Addr": { + "Desc": [ "Stores a value to memory using SVE predicate mask that's designed", + "specifically for use in the X87 SVE Ldst optimization." ], "DestSize": "RegisterSize", "HasSideEffects": true, "ElementSize": "ElementSize" }, - "FPR = LoadMemPredicate OpSize:#RegisterSize, OpSize:#ElementSize, PRED:$Mask, GPR:$Addr": { - "Desc": [ "Loads a value to memory using SVE predicate mask." ], + "FPR = LoadMemX87SVEOptPredicate OpSize:#RegisterSize, OpSize:#ElementSize, GPR:$Addr": { + "Desc": [ "Loads a value to memory using SVE predicate mask that's designed", + "specifically for use in the X87 SVE Ldst optimization." 
], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, diff --git a/FEXCore/Source/Interface/IR/IREmitter.cpp b/FEXCore/Source/Interface/IR/IREmitter.cpp index 61d7d4bcb6..a65c58ea69 100644 --- a/FEXCore/Source/Interface/IR/IREmitter.cpp +++ b/FEXCore/Source/Interface/IR/IREmitter.cpp @@ -41,6 +41,7 @@ FEXCore::IR::RegisterClassType IREmitter::WalkFindRegClass(Ref Node) { case FPRClass: case GPRFixedClass: case FPRFixedClass: + case PREDClass: case InvalidClass: return Class; default: break; } diff --git a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp index 98c43f1e3b..8fc28cb9c5 100644 --- a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp @@ -6,7 +6,7 @@ #include "FEXCore/IR/IR.h" #include "FEXCore/Utils/Profiler.h" #include "FEXCore/Core/HostFeatures.h" -#include "CodeEmitter/Emitter.h" +#include "Interface/Core/ArchHelpers/Arm64Emitter.h" #include #include @@ -838,13 +838,12 @@ void X87StackOptimization::Run(IREmitter* Emit) { if (Op->StoreSize != OpSize::f80Bit) { // if it's not 80bits then convert StackNode = IREmit->_F80CVT(Op->StoreSize, StackNode); } - if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize() + if (Op->StoreSize == OpSize::f80Bit) { if (Features.SupportsSVE128 || Features.SupportsSVE256) { - auto PReg = IREmit->_InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); if (!IsZero(Offset)) { AddrNode = IREmit->_Add(OpSize::i64Bit, AddrNode, Offset); } - IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PReg, AddrNode); + IREmit->_StoreMemX87SVEOptPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, AddrNode); } else { // For X87 extended doubles, split before storing IREmit->_StoreMem(FPRClass, OpSize::i64Bit, StackNode, AddrNode, Offset, OpSize::iInvalid, MEM_OFFSET_SXTX, 1); From 
3dc7b8d90a128e40b4a2f7dad0cdb6b88c82e909 Mon Sep 17 00:00:00 2001 From: Paulo Matos Date: Wed, 22 Jan 2025 10:07:12 +0100 Subject: [PATCH 2/3] asm_tests: Ensure predicate cache is reset when control flow leaves block --- unittests/ASM/X87/MemcopyWithCPUID.asm | 36 ++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 unittests/ASM/X87/MemcopyWithCPUID.asm diff --git a/unittests/ASM/X87/MemcopyWithCPUID.asm b/unittests/ASM/X87/MemcopyWithCPUID.asm new file mode 100644 index 0000000000..62e7558747 --- /dev/null +++ b/unittests/ASM/X87/MemcopyWithCPUID.asm @@ -0,0 +1,36 @@ +%ifdef CONFIG +{ + "RegData": { + "RBX": "0x8000000000000000", + "RCX": "0x3fff" + } +} +%endif + +; Related to #4274 - ensures that if cpuid clobbers the predicate register, +; we reset the predicate cache. + +section .data +align 8 + +data: + dt 1.0 + +section .bss +align 8 + +data2: + resb 10 + +section .text +lea r8, [rel data] +fld tword [r8] + +mov rax, 0x0 +cpuid ; Will this instruction clobber the predicate register? 
+ +fstp tword [rel data2] + +mov rbx, [rel data2] +mov rcx, [rel data2+8] +hlt From ddd241fe39973802f16c17ff3152ecb808e17e1f Mon Sep 17 00:00:00 2001 From: Paulo Matos Date: Wed, 22 Jan 2025 10:07:36 +0100 Subject: [PATCH 3/3] instcount: Ensure predicate cache is reset when control flow leaves block --- unittests/InstructionCountCI/X87ldst-SVE.json | 34 ++++--------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/unittests/InstructionCountCI/X87ldst-SVE.json b/unittests/InstructionCountCI/X87ldst-SVE.json index 81b6b205cd..a4a0fc187d 100644 --- a/unittests/InstructionCountCI/X87ldst-SVE.json +++ b/unittests/InstructionCountCI/X87ldst-SVE.json @@ -14,13 +14,12 @@ }, "Instructions": { "fstp tword [rax]": { - "ExpectedInstructionCount": 13, + "ExpectedInstructionCount": 12, "Comment": "Single 80-bit store.", "ExpectedArm64ASM": [ "ldrb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x4]", "ldrb w21, [x28, #1298]", "mov w22, #0x1", @@ -34,7 +33,7 @@ }, "2-store 80bit": { "x86InstructionCount": 2, - "ExpectedInstructionCount": 25, + "ExpectedInstructionCount": 23, "x86Insts": [ "fstp tword [rax]", "fstp tword [rax+10]" @@ -43,7 +42,6 @@ "ldrb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x4]", "ldrb w21, [x28, #1298]", "mov w22, #0x1", @@ -55,7 +53,6 @@ "strb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "add x21, x4, #0xa (10)", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", @@ -69,7 +66,7 @@ }, "8-store 80bit": { "x86InstructionCount": 8, - "ExpectedInstructionCount": 97, + "ExpectedInstructionCount": 89, "x86Insts": [ "fstp tword [rax]", "fstp tword [rax+10]", @@ -84,7 +81,6 @@ "ldrb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x4]", "ldrb w21, [x28, #1298]", "mov w22, #0x1", @@ -96,7 +92,6 @@ "strb w20, 
[x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "add x21, x4, #0xa (10)", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", @@ -108,7 +103,6 @@ "strb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "add x21, x4, #0x14 (20)", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", @@ -120,7 +114,6 @@ "strb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "add x21, x4, #0x1e (30)", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", @@ -132,7 +125,6 @@ "strb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "add x21, x4, #0x28 (40)", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", @@ -144,7 +136,6 @@ "strb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "add x21, x4, #0x32 (50)", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", @@ -156,7 +147,6 @@ "strb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "add x21, x4, #0x3c (60)", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", @@ -168,7 +158,6 @@ "strb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "add x21, x4, #0x46 (70)", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", @@ -181,10 +170,9 @@ ] }, "fld tword [rax]": { - "ExpectedInstructionCount": 13, + "ExpectedInstructionCount": 12, "Comment": "Single 80-bit store.", "ExpectedArm64ASM": [ - "ptrue p2.h, vl5", "ld1h {z2.h}, p2/z, [x4]", "ldrb w20, [x28, #1019]", "mov w21, #0x1", @@ -201,16 +189,14 @@ }, "2-load 80bit": { "x86InstructionCount": 2, - "ExpectedInstructionCount": 22, + "ExpectedInstructionCount": 20, "x86Insts": [ "fld tword [rax]", "fld tword [rax+10]" ], "ExpectedArm64ASM": [ - "ptrue p2.h, vl5", "ld1h {z2.h}, p2/z, [x4]", "add x20, x4, #0xa (10)", - "ptrue p2.h, vl5", "ld1h {z3.h}, p2/z, [x20]", "ldrb w20, [x28, #1019]", "sub w20, w20, #0x2 
(2)", @@ -233,7 +219,7 @@ }, "8-load 80bit": { "x86InstructionCount": 8, - "ExpectedInstructionCount": 59, + "ExpectedInstructionCount": 51, "x86Insts": [ "fld tword [rax]", "fld tword [rax+10]", @@ -245,28 +231,20 @@ "fld tword [rax+70]" ], "ExpectedArm64ASM": [ - "ptrue p2.h, vl5", "ld1h {z2.h}, p2/z, [x4]", "add x20, x4, #0xa (10)", - "ptrue p2.h, vl5", "ld1h {z3.h}, p2/z, [x20]", "add x20, x4, #0x14 (20)", - "ptrue p2.h, vl5", "ld1h {z4.h}, p2/z, [x20]", "add x20, x4, #0x1e (30)", - "ptrue p2.h, vl5", "ld1h {z5.h}, p2/z, [x20]", "add x20, x4, #0x28 (40)", - "ptrue p2.h, vl5", "ld1h {z6.h}, p2/z, [x20]", "add x20, x4, #0x32 (50)", - "ptrue p2.h, vl5", "ld1h {z7.h}, p2/z, [x20]", "add x20, x4, #0x3c (60)", - "ptrue p2.h, vl5", "ld1h {z8.h}, p2/z, [x20]", "add x20, x4, #0x46 (70)", - "ptrue p2.h, vl5", "ld1h {z9.h}, p2/z, [x20]", "ldrb w20, [x28, #1019]", "sub w20, w20, #0x8 (8)",