diff --git a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp index d54a467e3f..b49abc0e54 100644 --- a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp +++ b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp @@ -1,7 +1,6 @@ // SPDX-License-Identifier: MIT #include "Interface/Core/ArchHelpers/Arm64Emitter.h" #include "FEXCore/Core/X86Enums.h" -#include "FEXCore/Utils/AllocatorHooks.h" #include "Interface/Core/Dispatcher/Dispatcher.h" #include "Interface/Context/Context.h" @@ -60,8 +59,8 @@ namespace x64 { // p6 and p7 registers are used as temporaries no not added here for RA // See PREF_TMP_16B and PREF_TMP_32B // p0-p1 are also used in the jit as temps. - // Also p8-p15 cannot be used can only encode p0-p7, so we're left with p2-p5. - constexpr std::array PR = {ARMEmitter::PReg::p2, ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; + // Also p8-p15 cannot be used can only encode p0-p7, p2 is a special register, so we're left with p3-p5. + constexpr std::array PR = {ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; constexpr unsigned RAPairs = 6; @@ -100,6 +99,7 @@ namespace x64 { ARMEmitter::Reg::r20, ARMEmitter::Reg::r21, ARMEmitter::Reg::r22, + // PF/AF must be last. REG_PF, REG_AF, }; @@ -112,8 +112,8 @@ namespace x64 { // p6 and p7 registers are used as temporaries no not added here for RA // See PREF_TMP_16B and PREF_TMP_32B // p0-p1 are also used in the jit as temps. - // Also p8-p15 cannot be used can only encode p0-p7, so we're left with p2-p5. - constexpr std::array PR = {ARMEmitter::PReg::p2, ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; + // Also p8-p15 cannot be used can only encode p0-p7, p2 is a special register, so we're left with p3-p5. 
+ constexpr std::array PR = {ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; constexpr unsigned RAPairs = 6; @@ -249,8 +249,8 @@ namespace x32 { // p6 and p7 registers are used as temporaries no not added here for RA // See PREF_TMP_16B and PREF_TMP_32B // p0-p1 are also used in the jit as temps. - // Also p8-p15 cannot be used can only encode p0-p7, so we're left with p2-p5. - constexpr std::array PR = {ARMEmitter::PReg::p2, ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; + // Also p8-p15 cannot be used can only encode p0-p7, p2 is a special register, so we're left with p3-p5. + constexpr std::array PR = {ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; // All are caller saved constexpr std::array SRAFPR = { @@ -631,7 +631,7 @@ void Arm64Emitter::FillSpecialRegs(ARMEmitter::Register TmpReg, ARMEmitter::Regi } #endif - if (SetPredRegs) { + if (SetPredRegs && (EmitterCTX->HostFeatures.SupportsSVE256 || EmitterCTX->HostFeatures.SupportsSVE128)) { // Set up predicate registers. // We don't bother spilling these in SpillStaticRegs, // since all that matters is we restore them on a fill. @@ -643,6 +643,9 @@ void Arm64Emitter::FillSpecialRegs(ARMEmitter::Register TmpReg, ARMEmitter::Regi if (EmitterCTX->HostFeatures.SupportsSVE128) { ptrue(ARMEmitter::SubRegSize::i8Bit, PRED_TMP_16B, ARMEmitter::PredicatePattern::SVE_VL16); } + + // Fill in the predicate register for the x87 ldst SVE optimization. 
+ ptrue(ARMEmitter::SubRegSize::i16Bit, PRED_X87_SVEOPT, ARMEmitter::PredicatePattern::SVE_VL5); } } diff --git a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h index c30b582bd9..d61f14a8da 100644 --- a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h +++ b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h @@ -46,6 +46,10 @@ constexpr auto REG_AF = ARMEmitter::Reg::r27; // Vector temporaries constexpr auto VTMP1 = ARMEmitter::VReg::v0; constexpr auto VTMP2 = ARMEmitter::VReg::v1; + +// Predicate register for X87 SVE Optimization +constexpr auto SVE_OPT_PRED = ARMEmitter::PReg::p2; + #else constexpr auto TMP1 = ARMEmitter::XReg::x10; constexpr auto TMP2 = ARMEmitter::XReg::x11; @@ -65,6 +69,9 @@ constexpr auto VTMP2 = ARMEmitter::VReg::v17; constexpr auto EC_CALL_CHECKER_PC_REG = ARMEmitter::XReg::x9; constexpr auto EC_ENTRY_CPUAREA_REG = ARMEmitter::XReg::x17; +// Predicate register for X87 SVE Optimization +constexpr auto SVE_OPT_PRED = ARMEmitter::PReg::p2; + // These structures are not included in the standard Windows headers, define the offsets of members we care about for EC here. constexpr size_t TEB_CPU_AREA_OFFSET = 0x1788; constexpr size_t TEB_PEB_OFFSET = 0x60; @@ -79,6 +86,9 @@ constexpr uint64_t EC_CODE_BITMAP_MAX_ADDRESS = 1ULL << 47; // Will force one single instruction block to be generated first if set when entering the JIT filling SRA. constexpr auto ENTRY_FILL_SRA_SINGLE_INST_REG = TMP1; +// Predicate to use in the X87 SVE optimization +constexpr ARMEmitter::PRegister PRED_X87_SVEOPT = ARMEmitter::PReg::p2; + // Predicate register temporaries (used when AVX support is enabled) // PRED_TMP_16B indicates a predicate register that indicates the first 16 bytes set to 1. // PRED_TMP_32B indicates a predicate register that indicates the first 32 bytes set to 1. 
diff --git a/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp b/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp index b819aeecb5..47fdacfedf 100644 --- a/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp @@ -8,6 +8,7 @@ tags: backend|arm64 #include "FEXCore/Core/X86Enums.h" #include "FEXCore/Utils/LogManager.h" #include "Interface/Context/Context.h" +#include "Interface/Core/ArchHelpers/Arm64Emitter.h" #include "Interface/Core/CPUID.h" #include "Interface/Core/JIT/JITClass.h" #include "Interface/IR/IR.h" @@ -1590,21 +1591,14 @@ DEF_OP(StoreMem) { } } -DEF_OP(InitPredicate) { - const auto Op = IROp->C(); - const auto OpSize = IROp->Size; - ptrue(ConvertSubRegSize16(OpSize), GetPReg(Node), static_cast(Op->Pattern)); -} +DEF_OP(StoreMemX87SVEOptPredicate) { + const auto Op = IROp->C(); + const auto Predicate = PRED_X87_SVEOPT; -DEF_OP(StoreMemPredicate) { - const auto Op = IROp->C(); - const auto Predicate = GetPReg(Op->Mask.ID()); + LOGMAN_THROW_A_FMT(HostSupportsSVE128 || HostSupportsSVE256, "StoreMemX87SVEOptPredicate needs SVE support"); const auto RegData = GetVReg(Op->Value.ID()); const auto MemReg = GetReg(Op->Addr.ID()); - - LOGMAN_THROW_A_FMT(HostSupportsSVE128 || HostSupportsSVE256, "StoreMemPredicate needs SVE support"); - const auto MemDst = ARMEmitter::SVEMemOperand(MemReg.X(), 0); switch (IROp->ElementSize) { @@ -1628,13 +1622,13 @@ DEF_OP(StoreMemPredicate) { } } -DEF_OP(LoadMemPredicate) { - const auto Op = IROp->C(); +DEF_OP(LoadMemX87SVEOptPredicate) { + const auto Op = IROp->C(); const auto Dst = GetVReg(Node); - const auto Predicate = GetPReg(Op->Mask.ID()); + const auto Predicate = PRED_X87_SVEOPT; const auto MemReg = GetReg(Op->Addr.ID()); - LOGMAN_THROW_A_FMT(HostSupportsSVE128 || HostSupportsSVE256, "LoadMemPredicate needs SVE support"); + LOGMAN_THROW_A_FMT(HostSupportsSVE128 || HostSupportsSVE256, "LoadMemX87SVEOptPredicate needs SVE support"); const auto MemDst = 
ARMEmitter::SVEMemOperand(MemReg.X(), 0); diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index db1b5e38d0..edd51ae629 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -4313,9 +4313,7 @@ Ref OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, const X86T if (OpSize == OpSize::f80Bit) { Ref MemSrc = LoadEffectiveAddress(A, true); if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { - // Using SVE we can load this with a single instruction. - auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); - return _LoadMemPredicate(OpSize::i128Bit, OpSize::i16Bit, PReg, MemSrc); + return _LoadMemX87SVEOptPredicate(OpSize::i128Bit, OpSize::i16Bit, MemSrc); } else { // For X87 extended doubles, Split the load. auto Res = _LoadMem(Class, OpSize::i64Bit, MemSrc, Align == OpSize::iInvalid ? 
OpSize : Align); @@ -4448,8 +4446,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl if (OpSize == OpSize::f80Bit) { Ref MemStoreDst = LoadEffectiveAddress(A, true); if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { - auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); - _StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, Src, PReg, MemStoreDst); + _StoreMemX87SVEOptPredicate(OpSize::i128Bit, OpSize::i16Bit, Src, MemStoreDst); } else { // For X87 extended doubles, split before storing _StoreMem(FPRClass, OpSize::i64Bit, MemStoreDst, Src, Align); diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json index c2eff80377..d265046b90 100644 --- a/FEXCore/Source/Interface/IR/IR.json +++ b/FEXCore/Source/Interface/IR/IR.json @@ -567,19 +567,16 @@ ] }, - "PRED = InitPredicate OpSize:#Size, u8:$Pattern": { - "Desc": ["Initialize predicate register from Pattern"], - "DestSize": "Size" - }, - - "StoreMemPredicate OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Value, PRED:$Mask, GPR:$Addr": { - "Desc": [ "Stores a value to memory using SVE predicate mask." ], + "StoreMemX87SVEOptPredicate OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Value, GPR:$Addr": { + "Desc": [ "Stores a value to memory using SVE predicate mask that's designed", + "specifically for use in the X87 SVE Ldst optimization." ], "DestSize": "RegisterSize", "HasSideEffects": true, "ElementSize": "ElementSize" }, - "FPR = LoadMemPredicate OpSize:#RegisterSize, OpSize:#ElementSize, PRED:$Mask, GPR:$Addr": { - "Desc": [ "Loads a value to memory using SVE predicate mask." ], + "FPR = LoadMemX87SVEOptPredicate OpSize:#RegisterSize, OpSize:#ElementSize, GPR:$Addr": { + "Desc": [ "Loads a value from memory using SVE predicate mask that's designed", + "specifically for use in the X87 SVE Ldst optimization." 
], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, diff --git a/FEXCore/Source/Interface/IR/IREmitter.cpp b/FEXCore/Source/Interface/IR/IREmitter.cpp index 61d7d4bcb6..a65c58ea69 100644 --- a/FEXCore/Source/Interface/IR/IREmitter.cpp +++ b/FEXCore/Source/Interface/IR/IREmitter.cpp @@ -41,6 +41,7 @@ FEXCore::IR::RegisterClassType IREmitter::WalkFindRegClass(Ref Node) { case FPRClass: case GPRFixedClass: case FPRFixedClass: + case PREDClass: case InvalidClass: return Class; default: break; } diff --git a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp index 98c43f1e3b..8fc28cb9c5 100644 --- a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp @@ -6,7 +6,7 @@ #include "FEXCore/IR/IR.h" #include "FEXCore/Utils/Profiler.h" #include "FEXCore/Core/HostFeatures.h" -#include "CodeEmitter/Emitter.h" +#include "Interface/Core/ArchHelpers/Arm64Emitter.h" #include #include @@ -838,13 +838,12 @@ void X87StackOptimization::Run(IREmitter* Emit) { if (Op->StoreSize != OpSize::f80Bit) { // if it's not 80bits then convert StackNode = IREmit->_F80CVT(Op->StoreSize, StackNode); } - if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize() + if (Op->StoreSize == OpSize::f80Bit) { if (Features.SupportsSVE128 || Features.SupportsSVE256) { - auto PReg = IREmit->_InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); if (!IsZero(Offset)) { AddrNode = IREmit->_Add(OpSize::i64Bit, AddrNode, Offset); } - IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PReg, AddrNode); + IREmit->_StoreMemX87SVEOptPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, AddrNode); } else { // For X87 extended doubles, split before storing IREmit->_StoreMem(FPRClass, OpSize::i64Bit, StackNode, AddrNode, Offset, OpSize::iInvalid, MEM_OFFSET_SXTX, 1); diff 
--git a/unittests/ASM/X87/MemcopyWithCPUID.asm b/unittests/ASM/X87/MemcopyWithCPUID.asm new file mode 100644 index 0000000000..62e7558747 --- /dev/null +++ b/unittests/ASM/X87/MemcopyWithCPUID.asm @@ -0,0 +1,36 @@ +%ifdef CONFIG +{ + "RegData": { + "RBX": "0x8000000000000000", + "RCX": "0x3fff" + } +} +%endif + +; Related to #4274 - ensures that if cpuid clobbers the predicate register, +; we reset the predicate cache. + +section .data +align 8 + +data: + dt 1.0 + +section .bss +align 8 + +data2: + resb 10 + +section .text +lea r8, [rel data] +fld tword [r8] + +mov rax, 0x0 +cpuid ; Will this instruction clobber the predicate register? + +fstp tword [rel data2] + +mov rbx, [rel data2] +mov rcx, [rel data2+8] +hlt diff --git a/unittests/InstructionCountCI/X87ldst-SVE.json b/unittests/InstructionCountCI/X87ldst-SVE.json index 81b6b205cd..a4a0fc187d 100644 --- a/unittests/InstructionCountCI/X87ldst-SVE.json +++ b/unittests/InstructionCountCI/X87ldst-SVE.json @@ -14,13 +14,12 @@ }, "Instructions": { "fstp tword [rax]": { - "ExpectedInstructionCount": 13, + "ExpectedInstructionCount": 12, "Comment": "Single 80-bit store.", "ExpectedArm64ASM": [ "ldrb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x4]", "ldrb w21, [x28, #1298]", "mov w22, #0x1", @@ -34,7 +33,7 @@ }, "2-store 80bit": { "x86InstructionCount": 2, - "ExpectedInstructionCount": 25, + "ExpectedInstructionCount": 23, "x86Insts": [ "fstp tword [rax]", "fstp tword [rax+10]" @@ -43,7 +42,6 @@ "ldrb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x4]", "ldrb w21, [x28, #1298]", "mov w22, #0x1", @@ -55,7 +53,6 @@ "strb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "add x21, x4, #0xa (10)", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", @@ -69,7 +66,7 @@ }, "8-store 80bit": { "x86InstructionCount": 8, - "ExpectedInstructionCount": 97, + 
"ExpectedInstructionCount": 89, "x86Insts": [ "fstp tword [rax]", "fstp tword [rax+10]", @@ -84,7 +81,6 @@ "ldrb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x4]", "ldrb w21, [x28, #1298]", "mov w22, #0x1", @@ -96,7 +92,6 @@ "strb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "add x21, x4, #0xa (10)", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", @@ -108,7 +103,6 @@ "strb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "add x21, x4, #0x14 (20)", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", @@ -120,7 +114,6 @@ "strb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "add x21, x4, #0x1e (30)", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", @@ -132,7 +125,6 @@ "strb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "add x21, x4, #0x28 (40)", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", @@ -144,7 +136,6 @@ "strb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "add x21, x4, #0x32 (50)", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", @@ -156,7 +147,6 @@ "strb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "add x21, x4, #0x3c (60)", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", @@ -168,7 +158,6 @@ "strb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "add x21, x4, #0x46 (70)", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", @@ -181,10 +170,9 @@ ] }, "fld tword [rax]": { - "ExpectedInstructionCount": 13, + "ExpectedInstructionCount": 12, "Comment": "Single 80-bit store.", "ExpectedArm64ASM": [ - "ptrue p2.h, vl5", "ld1h {z2.h}, p2/z, [x4]", "ldrb w20, [x28, #1019]", "mov w21, #0x1", @@ -201,16 +189,14 @@ }, "2-load 80bit": { "x86InstructionCount": 2, - 
"ExpectedInstructionCount": 22, + "ExpectedInstructionCount": 20, "x86Insts": [ "fld tword [rax]", "fld tword [rax+10]" ], "ExpectedArm64ASM": [ - "ptrue p2.h, vl5", "ld1h {z2.h}, p2/z, [x4]", "add x20, x4, #0xa (10)", - "ptrue p2.h, vl5", "ld1h {z3.h}, p2/z, [x20]", "ldrb w20, [x28, #1019]", "sub w20, w20, #0x2 (2)", @@ -233,7 +219,7 @@ }, "8-load 80bit": { "x86InstructionCount": 8, - "ExpectedInstructionCount": 59, + "ExpectedInstructionCount": 51, "x86Insts": [ "fld tword [rax]", "fld tword [rax+10]", @@ -245,28 +231,20 @@ "fld tword [rax+70]" ], "ExpectedArm64ASM": [ - "ptrue p2.h, vl5", "ld1h {z2.h}, p2/z, [x4]", "add x20, x4, #0xa (10)", - "ptrue p2.h, vl5", "ld1h {z3.h}, p2/z, [x20]", "add x20, x4, #0x14 (20)", - "ptrue p2.h, vl5", "ld1h {z4.h}, p2/z, [x20]", "add x20, x4, #0x1e (30)", - "ptrue p2.h, vl5", "ld1h {z5.h}, p2/z, [x20]", "add x20, x4, #0x28 (40)", - "ptrue p2.h, vl5", "ld1h {z6.h}, p2/z, [x20]", "add x20, x4, #0x32 (50)", - "ptrue p2.h, vl5", "ld1h {z7.h}, p2/z, [x20]", "add x20, x4, #0x3c (60)", - "ptrue p2.h, vl5", "ld1h {z8.h}, p2/z, [x20]", "add x20, x4, #0x46 (70)", - "ptrue p2.h, vl5", "ld1h {z9.h}, p2/z, [x20]", "ldrb w20, [x28, #1019]", "sub w20, w20, #0x8 (8)",