From c61d9a979efa92d7bae559f97a44d1ca15c260de Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Thu, 9 Jun 2022 20:30:21 +0800 Subject: [PATCH 01/12] Use Top-Down scheduling policy in AMDGPU backend --- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 1728f4725..6e3842fe6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -645,7 +645,7 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, // Enabling both top down and bottom up scheduling seems to give us less // register spills than just using one of these approaches on its own. - Policy.OnlyTopDown = false; + Policy.OnlyTopDown = true; Policy.OnlyBottomUp = false; // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler. From a4f8ff6a4555ad39d0cb7c2a05013e29be63e668 Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Thu, 9 Jun 2022 20:31:26 +0800 Subject: [PATCH 02/12] Disable Post- and Post-RA- machine scheduler in AMDGPU backend --- llvm/lib/Target/AMDGPU/SISchedule.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index b24c061af..54a3a2011 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -75,7 +75,7 @@ class SISchedMachineModel : SchedMachineModel { // to the register pressure analysis. let MicroOpBufferSize = 1; let IssueWidth = 1; - let PostRAScheduler = 1; + let PostRAScheduler = 0; // FIXME:Approximate 2 * branch cost. Try to hack around bad // early-ifcvt heuristics. 
These need improvement to avoid the OOE From a14f3e6549f5290502fb7cb95099340dc10d4276 Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Thu, 9 Jun 2022 20:33:32 +0800 Subject: [PATCH 03/12] Remove unnecessary dependency added by LoadClusterMutation --- llvm/lib/CodeGen/MachineScheduler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index 9c9574bc1..ed01b38b4 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -1599,7 +1599,7 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps( continue; LLVM_DEBUG(dbgs() << " Copy Succ SU(" << Succ.getSUnit()->NodeNum << ")\n"); - DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial)); + // DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial)); } } else { // Copy predecessor edges from SUb to SUa to avoid the SUnits that From 63f279178f782b843879edd3afe50a899930de21 Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Thu, 9 Jun 2022 20:34:52 +0800 Subject: [PATCH 04/12] Remove unnecessary instruction prioritizing logic --- llvm/lib/CodeGen/MachineScheduler.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index ed01b38b4..40c2f502e 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -3148,12 +3148,12 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand, DAG->MF)) return; - // Avoid increasing the max critical pressure in the scheduled region. - if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax, - Cand.RPDelta.CriticalMax, - TryCand, Cand, RegCritical, TRI, - DAG->MF)) - return; + // // Avoid increasing the max critical pressure in the scheduled region. 
+ // if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax, + // Cand.RPDelta.CriticalMax, + // TryCand, Cand, RegCritical, TRI, + // DAG->MF)) + // return; // We only compare a subset of features when comparing nodes between // Top and Bottom boundary. Some properties are simply incomparable, in many From 6ee7ef5034bdaa59c3c9379dd2a724e1efa7fcc0 Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Thu, 9 Jun 2022 20:36:41 +0800 Subject: [PATCH 05/12] Shorten 16-pass MFMA instruction latency down to 16 cycles --- llvm/lib/Target/AMDGPU/SISchedule.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index 54a3a2011..f5eff6c83 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -157,8 +157,8 @@ multiclass SICommonWriteRes { def : HWWriteRes; let ResourceCycles = [8] in def : HWWriteRes; - let ResourceCycles = [16] in - def : HWWriteRes; + let ResourceCycles = [4] in + def : HWWriteRes; def : ReadAdvance; From 7afa78bf2386ede0b8f8f801e1b5c3033b7e0742 Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Thu, 9 Jun 2022 20:40:24 +0800 Subject: [PATCH 06/12] Mark "HWVMEM" as reserved resource --- llvm/lib/Target/AMDGPU/SISchedule.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index f5eff6c83..b6d0f5dfa 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -102,7 +102,7 @@ def HWSALU : ProcResource<1> { let BufferSize = 1; } def HWVMEM : ProcResource<1> { - let BufferSize = 15; // Taken from S_WAITCNT + let BufferSize = 0; // Taken from S_WAITCNT } def HWVALU : ProcResource<1> { let BufferSize = 1; From 7c1bd8ebe177a1dbb9bb7e6d34247a5bffb966e3 Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Thu, 9 Jun 2022 20:41:15 +0800 Subject: [PATCH 07/12] Enlarge HWVMEM latency up to 64 
cycles --- llvm/lib/Target/AMDGPU/SISchedule.td | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index b6d0f5dfa..d9f73ef35 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -142,6 +142,7 @@ multiclass SICommonWriteRes { def : HWWriteRes; // Can be between 2 and 64 def : HWWriteRes; def : HWWriteRes; + let ResourceCycles = [16] in def : HWWriteRes; def : HWWriteRes; // XXX: Guessed ??? From 48e8f1f0aeddc37e21f38164ccfa700b130f7c40 Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Fri, 17 Jun 2022 00:34:16 +0800 Subject: [PATCH 08/12] Revert "Shorten 16-pass MFMA instruction latency down to 16 cycles" This reverts commit 6ee7ef5034bdaa59c3c9379dd2a724e1efa7fcc0. --- llvm/lib/Target/AMDGPU/SISchedule.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index d9f73ef35..13062dfef 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -158,8 +158,8 @@ multiclass SICommonWriteRes { def : HWWriteRes; let ResourceCycles = [8] in def : HWWriteRes; - let ResourceCycles = [4] in - def : HWWriteRes; + let ResourceCycles = [16] in + def : HWWriteRes; def : ReadAdvance; From 0844d2e66395c3e7bdc7e0a61c1ee94e87c2eef3 Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Fri, 17 Jun 2022 00:36:31 +0800 Subject: [PATCH 09/12] Set VMEM's ResourceCycles same as its latency --- llvm/lib/Target/AMDGPU/SISchedule.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index 13062dfef..e83f86a37 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -142,7 +142,7 @@ multiclass SICommonWriteRes { def : HWWriteRes; // Can be between 2 and 64 def : HWWriteRes; def : HWWriteRes; - let ResourceCycles = [16] in 
+ let ResourceCycles = [80] in def : HWWriteRes; def : HWWriteRes; // XXX: Guessed ??? From 3b6d0e1e362c94638478417c44b2031d7c125811 Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Fri, 17 Jun 2022 07:46:51 +0800 Subject: [PATCH 10/12] Only change the latency for BUFFER_LOAD* instructions --- llvm/lib/Target/AMDGPU/SISchedule.td | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index e83f86a37..9242ddf93 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -22,6 +22,7 @@ def WriteLDS : SchedWrite; def WriteSALU : SchedWrite; def WriteSMEM : SchedWrite; def WriteVMEM : SchedWrite; +def ReadVMEM : SchedWrite; def WriteBarrier : SchedWrite; def MIVGPRRead : SchedRead; @@ -142,8 +143,9 @@ multiclass SICommonWriteRes { def : HWWriteRes; // Can be between 2 and 64 def : HWWriteRes; def : HWWriteRes; - let ResourceCycles = [80] in def : HWWriteRes; + let ResourceCycles = [80] in + def : HWWriteRes; def : HWWriteRes; // XXX: Guessed ??? def : HWVALUWriteRes; @@ -213,6 +215,8 @@ def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_..._4X4X")>; def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_..._16X16X")>; def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>; +def : InstRW<[ReadVMEM], (instregex "^BUFFER_LOAD")>; + } // End SchedModel = SIQuarterSpeedModel let SchedModel = SIDPFullSpeedModel in { From 9cf240ae17cfa653ca82609f05b65a8ee44fea13 Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Fri, 17 Jun 2022 08:07:00 +0800 Subject: [PATCH 11/12] Revert "Only change the latency for BUFFER_LOAD* instructions" This reverts commit 3b6d0e1e362c94638478417c44b2031d7c125811. 
--- llvm/lib/Target/AMDGPU/SISchedule.td | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index 9242ddf93..e83f86a37 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -22,7 +22,6 @@ def WriteLDS : SchedWrite; def WriteSALU : SchedWrite; def WriteSMEM : SchedWrite; def WriteVMEM : SchedWrite; -def ReadVMEM : SchedWrite; def WriteBarrier : SchedWrite; def MIVGPRRead : SchedRead; @@ -143,9 +142,8 @@ multiclass SICommonWriteRes { def : HWWriteRes; // Can be between 2 and 64 def : HWWriteRes; def : HWWriteRes; - def : HWWriteRes; let ResourceCycles = [80] in - def : HWWriteRes; + def : HWWriteRes; def : HWWriteRes; // XXX: Guessed ??? def : HWVALUWriteRes; @@ -215,8 +213,6 @@ def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_..._4X4X")>; def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_..._16X16X")>; def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>; -def : InstRW<[ReadVMEM], (instregex "^BUFFER_LOAD")>; - } // End SchedModel = SIQuarterSpeedModel let SchedModel = SIDPFullSpeedModel in { From b4797e1a18a58740da985b2d803b6926abbc5446 Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Fri, 17 Jun 2022 16:37:30 +0800 Subject: [PATCH 12/12] Set ResourceCycles=5 for HWLGKM --- llvm/lib/Target/AMDGPU/SISchedule.td | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index e83f86a37..5c5d954ca 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -96,7 +96,7 @@ def HWExport : ProcResource<1> { let BufferSize = 7; // Taken from S_WAITCNT } def HWLGKM : ProcResource<1> { - let BufferSize = 31; // Taken from S_WAITCNT + let BufferSize = 0; // Taken from S_WAITCNT } def HWSALU : ProcResource<1> { let BufferSize = 1; @@ -139,6 +139,7 @@ multiclass 
SICommonWriteRes { def : HWWriteRes; def : HWWriteRes; + let ResourceCycles = [5] in def : HWWriteRes; // Can be between 2 and 64 def : HWWriteRes; def : HWWriteRes;