From 6dc1d3145831814071bbd50e3c6aa2f1e1f88853 Mon Sep 17 00:00:00 2001 From: guochen2 Date: Mon, 27 Jan 2025 17:05:47 -0500 Subject: [PATCH] true16 codegen for v_cmp --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 3 +- llvm/lib/Target/AMDGPU/VOPCInstructions.td | 57 +- llvm/lib/Target/AMDGPU/VOPInstructions.td | 4 + .../inst-select-amdgcn.fcmp.constants.w32.mir | 12 +- .../inst-select-amdgcn.fcmp.constants.w64.mir | 12 +- llvm/test/CodeGen/AMDGPU/fcmp.f16.ll | 2362 +++++++++++------ llvm/test/CodeGen/AMDGPU/fp-classify.ll | 117 +- llvm/test/CodeGen/AMDGPU/icmp.i16.ll | 44 + 8 files changed, 1784 insertions(+), 827 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 40eaba2c09209dd..3bbbbcf71d8aecd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1207,9 +1207,8 @@ static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size, unsigned FakeS16Opc, unsigned S32Opc, unsigned S64Opc) { if (Size == 16) - // FIXME-TRUE16 use TrueS16Opc when realtrue16 is supported for CMP code return ST.hasTrue16BitInsts() - ? ST.useRealTrue16Insts() ? FakeS16Opc : FakeS16Opc + ? ST.useRealTrue16Insts() ? 
TrueS16Opc : FakeS16Opc : S16Opc; if (Size == 32) return S32Opc; diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index e16ac4423265ec8..00a3381b3fd49ec 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -1035,6 +1035,20 @@ multiclass VOPCClassPat64 { >; } +multiclass VOPCClassPat64_t16 { + defvar inst = !cast(inst_name#"_t16_e64"); + defvar P = inst.Pfl; + def : GCNPat < + (i1:$sdst + (AMDGPUfp_class + (P.Src0VT (VOP3ModsNonCanonicalizing P.Src0VT:$src0, i32:$src0_modifiers)), + i32:$src1)), + (inst i32:$src0_modifiers, VSrcT_f16:$src0, + 0 /* src1_modifiers */, (f16 (EXTRACT_SUBREG VGPR_32:$src1, lo16)), + 0) /* op_sel */ + >; +} + multiclass VOPCClassPat64_fake16 { defvar inst = !cast(inst_name#"_fake16_e64"); defvar P = inst.Pfl; @@ -1158,6 +1172,7 @@ multiclass VOPC_CLASS_F16 { } let True16Predicate = UseRealTrue16Insts in { defm _t16 : VOPC_Class_Pseudos ; + defm : VOPCClassPat64_t16; } let True16Predicate = UseFakeTrue16Insts in { defm _fake16 : VOPC_Class_Pseudos ; @@ -1207,27 +1222,30 @@ defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">; // We need to use COPY_TO_REGCLASS to w/a the problem when ReplaceAllUsesWith() // complaints it cannot replace i1 <-> i64/i32 if node was not morphed in place. -multiclass ICMP_Pattern { +multiclass ICMP_Pattern { let WaveSizePredicate = isWave64 in def : GCNPat < (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), - (i64 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_64)) + (i64 (COPY_TO_REGCLASS dstInst, SReg_64)) >; let WaveSizePredicate = isWave32 in { def : GCNPat < (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), - (i32 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_32)) + (i32 (COPY_TO_REGCLASS dstInst, SReg_32)) >; // Support codegen of i64 setcc in wave32 mode. 
def : GCNPat < (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), - (i64 (REG_SEQUENCE SReg_64, (inst $src0, $src1), sub0, (S_MOV_B32 (i32 0)), sub1)) + (i64 (REG_SEQUENCE SReg_64, dstInst, sub0, (S_MOV_B32 (i32 0)), sub1)) >; } } +multiclass ICMP_Pattern_t16 + : ICMP_Pattern; + defm : ICMP_Pattern ; defm : ICMP_Pattern ; defm : ICMP_Pattern ; @@ -1250,6 +1268,19 @@ defm : ICMP_Pattern ; defm : ICMP_Pattern ; defm : ICMP_Pattern ; +let True16Predicate = UseRealTrue16Insts in { +defm : ICMP_Pattern_t16 ; +defm : ICMP_Pattern_t16 ; +defm : ICMP_Pattern_t16 ; +defm : ICMP_Pattern_t16 ; +defm : ICMP_Pattern_t16 ; +defm : ICMP_Pattern_t16 ; +defm : ICMP_Pattern_t16 ; +defm : ICMP_Pattern_t16 ; +defm : ICMP_Pattern_t16 ; +defm : ICMP_Pattern_t16 ; +} // End True16Predicate = UseRealTrue16Insts + let True16Predicate = UseFakeTrue16Insts in { defm : ICMP_Pattern ; defm : ICMP_Pattern ; @@ -1335,6 +1366,24 @@ defm : FCMP_Pattern ; defm : FCMP_Pattern ; defm : FCMP_Pattern ; +let True16Predicate = UseRealTrue16Insts in { +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; + +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +defm : FCMP_Pattern ; +} // End True16Predicate = UseRealTrue16Insts + let True16Predicate = UseFakeTrue16Insts in { defm : FCMP_Pattern ; defm : FCMP_Pattern ; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 3b5358b737aa4ca..19c39a168e0c9fb 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -2012,6 +2012,10 @@ def : ClassPat { let True16Predicate = NotHasTrue16BitInsts; } +def : ClassPat_t16 { + let True16Predicate = UseRealTrue16Insts; +} + def : ClassPat_t16 { let True16Predicate = UseFakeTrue16Insts; } diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir index cdb67caea12cf0e..49383135ab0c587 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir @@ -17,11 +17,9 @@ body: | ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] - ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_fake16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec - ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_fake16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_t16_e64 0, [[V_CVT_F16_F32_t16_e64_]], 0, [[V_CVT_F16_F32_t16_e64_1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] ; ; GFX11-FAKE16-LABEL: name: fcmp_false_f16 ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 @@ -55,11 +53,9 @@ body: | ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] ; GFX11-TRUE16-NEXT: 
[[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] - ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_fake16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec - ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_fake16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_t16_e64 0, [[V_CVT_F16_F32_t16_e64_]], 0, [[V_CVT_F16_F32_t16_e64_1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] ; ; GFX11-FAKE16-LABEL: name: fcmp_true_f16 ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir index ed811d37c3d0fc0..828eb5d3fb40ac7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir @@ -17,11 +17,9 @@ body: | ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] - ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_fake16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec - ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit 
[[V_CMP_F_F16_fake16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_t16_e64 0, [[V_CVT_F16_F32_t16_e64_]], 0, [[V_CVT_F16_F32_t16_e64_1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] ; ; GFX11-FAKE16-LABEL: name: fcmp_false_f16 ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 @@ -55,11 +53,9 @@ body: | ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] - ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_fake16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec - ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_fake16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_t16_e64 0, [[V_CVT_F16_F32_t16_e64_]], 0, [[V_CVT_F16_F32_t16_e64_1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] ; ; GFX11-FAKE16-LABEL: name: fcmp_true_f16 ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll index 23b54c6741e5125..a25c183dca0a1f0 100644 --- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | 
FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11-FAKE16 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX12 %s define amdgpu_kernel void @fcmp_f16_lt( @@ -55,30 +56,57 @@ define amdgpu_kernel void @fcmp_f16_lt( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_f16_lt: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc 
-; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_f16_lt: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_f16_lt: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: 
s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_f16_lt: ; GFX12: ; %bb.0: ; %entry @@ -167,31 +195,58 @@ define amdgpu_kernel void @fcmp_f16_lt_abs( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_f16_lt_abs: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_cmp_lt_f16_e64 s2, |v0|, |v1| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, s2 -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_f16_lt_abs: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 
s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s2, |v0.l|, |v0.h| +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, s2 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_f16_lt_abs: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e64 s2, |v0|, |v1| +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, s2 +; 
GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_f16_lt_abs: ; GFX12: ; %bb.0: ; %entry @@ -285,30 +340,57 @@ define amdgpu_kernel void @fcmp_f16_eq( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_f16_eq: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_f16_eq: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_f16_eq: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_f16_eq: ; GFX12: ; %bb.0: ; %entry @@ -397,30 +479,57 @@ define amdgpu_kernel void @fcmp_f16_le( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_f16_le: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: 
s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_f16_le: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_le_f16_e32 vcc_lo, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_f16_le: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; 
GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_f16_le: ; GFX12: ; %bb.0: ; %entry @@ -509,30 +618,57 @@ define amdgpu_kernel void @fcmp_f16_gt( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_f16_gt: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: 
fcmp_f16_gt: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_f16_gt: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: 
s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_f16_gt: ; GFX12: ; %bb.0: ; %entry @@ -621,30 +757,57 @@ define amdgpu_kernel void @fcmp_f16_lg( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_f16_lg: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_f16_lg: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, 
s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_f16_lg: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_f16_lg: ; GFX12: ; %bb.0: ; %entry @@ -733,30 +896,57 @@ define amdgpu_kernel void @fcmp_f16_ge( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_f16_ge: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], 
s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_f16_ge: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_f16_ge: +; GFX11-FAKE16: ; %bb.0: 
; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_f16_ge: ; GFX12: ; %bb.0: ; %entry @@ -845,30 +1035,57 @@ define amdgpu_kernel void @fcmp_f16_o( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_f16_o: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; 
GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_f16_o: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_f16_o: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_f16_o: ; GFX12: ; %bb.0: ; %entry @@ -957,30 +1174,57 @@ define amdgpu_kernel void @fcmp_f16_u( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_f16_u: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_f16_u: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_f16_u: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_f16_u: ; GFX12: ; %bb.0: ; %entry @@ -1069,30 +1313,57 @@ define amdgpu_kernel void @fcmp_f16_nge( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_f16_nge: -; 
GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_f16_nge: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; 
GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_f16_nge: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_f16_nge: ; GFX12: ; %bb.0: ; %entry @@ -1181,30 +1452,57 @@ define amdgpu_kernel void @fcmp_f16_nlg( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_f16_nlg: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, 
s[4:7], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_f16_nlg: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_f16_nlg: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_f16_nlg: ; GFX12: ; %bb.0: ; %entry @@ -1293,30 +1591,57 @@ define amdgpu_kernel void @fcmp_f16_ngt( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_f16_ngt: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_f16_ngt: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_f16_ngt: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_f16_ngt: 
; GFX12: ; %bb.0: ; %entry @@ -1405,30 +1730,57 @@ define amdgpu_kernel void @fcmp_f16_nle( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_f16_nle: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_f16_nle: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_f16_nle: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_f16_nle: ; GFX12: ; %bb.0: ; %entry @@ -1517,30 +1869,57 @@ define amdgpu_kernel void @fcmp_f16_neq( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_f16_neq: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: 
s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_f16_neq: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_f16_neq: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: 
s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_f16_neq: ; GFX12: ; %bb.0: ; %entry @@ -1629,30 +2008,57 @@ define amdgpu_kernel void @fcmp_f16_nlt( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_f16_nlt: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_f16_nlt: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: 
s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_f16_nlt: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_f16_nlt: ; GFX12: ; %bb.0: ; %entry @@ -1751,35 +2157,65 @@ define amdgpu_kernel void @fcmp_v2f16_lt( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_v2f16_lt: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_v2f16_lt: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_v2f16_lt: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: 
v_cmp_lt_f16_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_v2f16_lt: ; GFX12: ; %bb.0: ; %entry @@ -1884,35 +2320,65 @@ define amdgpu_kernel void @fcmp_v2f16_eq( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_v2f16_eq: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_v2f16_eq: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; 
GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_v2f16_eq: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, v1, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_v2f16_eq: ; GFX12: ; %bb.0: ; %entry @@ -2016,35 +2482,65 @@ define amdgpu_kernel void @fcmp_v2f16_le( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_v2f16_le: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_v2f16_le: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_le_f16_e32 vcc_lo, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_le_f16_e32 vcc_lo, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_v2f16_le: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_le_f16_e32 vcc_lo, v1, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, 
vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_le_f16_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_v2f16_le: ; GFX12: ; %bb.0: ; %entry @@ -2148,35 +2644,65 @@ define amdgpu_kernel void @fcmp_v2f16_gt( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_v2f16_gt: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_v2f16_gt: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_v2f16_gt: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, v1, v0 +; 
GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_v2f16_gt: ; GFX12: ; %bb.0: ; %entry @@ -2281,35 +2807,65 @@ define amdgpu_kernel void @fcmp_v2f16_lg( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_v2f16_lg: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_v2f16_lg: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_v2f16_lg: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: 
v_cmp_lg_f16_e32 vcc_lo, v1, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_v2f16_lg: ; GFX12: ; %bb.0: ; %entry @@ -2414,35 +2970,65 @@ define amdgpu_kernel void @fcmp_v2f16_ge( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_v2f16_ge: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_v2f16_ge: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_v2f16_ge: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v3, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, v1, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_v2f16_ge: ; GFX12: ; %bb.0: ; %entry @@ -2547,35 +3133,65 @@ define amdgpu_kernel void @fcmp_v2f16_o( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_v2f16_o: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_v2f16_o: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: 
s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_v2f16_o: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_v2f16_o: ; GFX12: ; %bb.0: ; %entry @@ -2680,35 +3296,65 @@ define amdgpu_kernel void @fcmp_v2f16_u( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_v2f16_u: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_v2f16_u: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 
s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_v2f16_u: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v1, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_v2f16_u: ; GFX12: ; %bb.0: ; %entry @@ -2812,35 +3458,65 @@ define amdgpu_kernel void @fcmp_v2f16_nge( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_v2f16_nge: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_v2f16_nge: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_nge_f16_e32 vcc_lo, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_nge_f16_e32 vcc_lo, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_v2f16_nge: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_nge_f16_e32 vcc_lo, v1, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_nge_f16_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_v2f16_nge: ; GFX12: ; %bb.0: ; %entry @@ -2944,35 +3620,65 @@ define amdgpu_kernel void @fcmp_v2f16_nlg( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_v2f16_nlg: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_v2f16_nlg: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 
0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_v2f16_nlg: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v1, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_v2f16_nlg: ; GFX12: ; %bb.0: ; %entry @@ -3077,35 +3783,65 @@ define amdgpu_kernel void @fcmp_v2f16_ngt( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_v2f16_ngt: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_v2f16_ngt: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: 
s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_v2f16_ngt: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 
s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_v2f16_ngt: ; GFX12: ; %bb.0: ; %entry @@ -3209,35 +3945,65 @@ define amdgpu_kernel void @fcmp_v2f16_nle( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_v2f16_nle: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_v2f16_nle: +; 
GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_v2f16_nle: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 
v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_v2f16_nle: ; GFX12: ; %bb.0: ; %entry @@ -3341,35 +4107,65 @@ define amdgpu_kernel void @fcmp_v2f16_neq( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_v2f16_neq: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: 
s_endpgm +; GFX11-TRUE16-LABEL: fcmp_v2f16_neq: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_neq_f16_e32 vcc_lo, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_neq_f16_e32 vcc_lo, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_v2f16_neq: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, 
off, s[4:7], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_neq_f16_e32 vcc_lo, v1, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_neq_f16_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_v2f16_neq: ; GFX12: ; %bb.0: ; %entry @@ -3473,35 +4269,65 @@ define amdgpu_kernel void @fcmp_v2f16_nlt( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fcmp_v2f16_nlt: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX11-NEXT: 
buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fcmp_v2f16_nlt: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fcmp_v2f16_nlt: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: 
s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: fcmp_v2f16_nlt: ; GFX12: ; %bb.0: ; %entry diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll index e7c425a2d2752dd..cc11e256d5544ef 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s declare float @llvm.fabs.f32(float) #1 declare double @llvm.fabs.f64(double) #1 @@ -619,18 +620,32 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr 
addrspace(1) nocapture %ou ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: test_isinf_pattern_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x204 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_isinf_pattern_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_class_f16_e64 s2, v0.l, 0x204 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_isinf_pattern_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 0x204 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %fabs = tail call half @llvm.fabs.f16(half %x) #1 %cmp = fcmp oeq half %fabs, 0xH7C00 %ext = zext i1 %cmp to i32 @@ -669,18 +684,32 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: 
s_endpgm ; -; GFX11-LABEL: test_isfinite_pattern_0_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_isfinite_pattern_0_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_class_f16_e64 s2, v0.l, 0x1f8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_isfinite_pattern_0_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %ord = fcmp ord half %x, 0.0 %x.fabs = tail call half @llvm.fabs.f16(half %x) #1 %ninf = fcmp une half %x.fabs, 0xH7C00 @@ -718,18 +747,32 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: test_isfinite_pattern_4_f16: -; 
GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_isfinite_pattern_4_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_class_f16_e64 s2, v0.l, 0x1f8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_isfinite_pattern_4_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %ord = fcmp ord half %x, 0.0 %x.fabs = tail call half @llvm.fabs.f16(half %x) #1 %ninf = fcmp one half %x.fabs, 0xH7C00 diff --git a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll index c1a074a81b2aa2b..790a457c2b3371c 100644 --- a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll @@ -1,5 +1,9 @@ ; RUN: llc -mtriple=amdgcn 
-mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s| FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s| FileCheck -check-prefixes=GCN,GFX11-FAKE16 %s +; FIXME-TRUE16. In the true16 flow, the codegen introduces an additional s2v copy and mov, and reverses the operand order, thus picking different cmp instructions. +; This should be corrected after the additional mov/copy is removed. +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s| FileCheck -check-prefixes=GCN,GFX11-TRUE16 %s ;;;==========================================================================;;; ;; 16-bit integer comparisons @@ -8,6 +12,8 @@ ; GCN-LABEL: {{^}}i16_eq: ; VI: v_cmp_eq_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_eq_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-FAKE16: v_cmp_eq_u16_e32 vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_cmp_eq_u16_e32 vcc_lo, v{{[0-9]+}}.l, v{{[0-9]+}}.h define amdgpu_kernel void @i16_eq(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -26,6 +32,8 @@ entry: ; GCN-LABEL: {{^}}i16_ne: ; VI: v_cmp_ne_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ne_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-FAKE16: v_cmp_ne_u16_e32 vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_cmp_ne_u16_e32 vcc_lo, v{{[0-9]+}}.l, v{{[0-9]+}}.h define amdgpu_kernel void @i16_ne(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -44,6 +52,8 @@ entry: ; GCN-LABEL: {{^}}i16_ugt: ; VI: v_cmp_gt_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-FAKE16: v_cmp_gt_u16_e32 vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_cmp_gt_u16_e32 vcc_lo, v{{[0-9]+}}.l, v{{[0-9]+}}.h 
define amdgpu_kernel void @i16_ugt(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -62,6 +72,8 @@ entry: ; GCN-LABEL: {{^}}i16_uge: ; VI: v_cmp_ge_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-FAKE16: v_cmp_ge_u16_e32 vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_cmp_ge_u16_e32 vcc_lo, v{{[0-9]+}}.l, v{{[0-9]+}}.h define amdgpu_kernel void @i16_uge(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -80,6 +92,8 @@ entry: ; GCN-LABEL: {{^}}i16_ult: ; VI: v_cmp_lt_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-FAKE16: v_cmp_lt_u16_e32 vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_cmp_lt_u16_e32 vcc_lo, v{{[0-9]+}}.l, v{{[0-9]+}}.h define amdgpu_kernel void @i16_ult(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -98,6 +112,8 @@ entry: ; GCN-LABEL: {{^}}i16_ule: ; VI: v_cmp_le_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-FAKE16: v_cmp_le_u16_e32 vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_cmp_le_u16_e32 vcc_lo, v{{[0-9]+}}.l, v{{[0-9]+}}.h define amdgpu_kernel void @i16_ule(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -117,6 +133,8 @@ entry: ; GCN-LABEL: {{^}}i16_sgt: ; VI: v_cmp_gt_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-FAKE16: v_cmp_gt_i16_e32 vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_cmp_gt_i16_e32 vcc_lo, v{{[0-9]+}}.l, v{{[0-9]+}}.h define amdgpu_kernel void @i16_sgt(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { entry: %tid = call i32 
@llvm.amdgcn.workitem.id.x() @@ -135,6 +153,8 @@ entry: ; GCN-LABEL: {{^}}i16_sge: ; VI: v_cmp_ge_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-FAKE16: v_cmp_ge_i16_e32 vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_cmp_ge_i16_e32 vcc_lo, v{{[0-9]+}}.l, v{{[0-9]+}}.h define amdgpu_kernel void @i16_sge(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -153,6 +173,8 @@ entry: ; GCN-LABEL: {{^}}i16_slt: ; VI: v_cmp_lt_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-FAKE16: v_cmp_lt_i16_e32 vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_cmp_lt_i16_e32 vcc_lo, v{{[0-9]+}}.l, v{{[0-9]+}}.h define amdgpu_kernel void @i16_slt(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -171,6 +193,8 @@ entry: ; GCN-LABEL: {{^}}i16_sle: ; VI: v_cmp_le_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-FAKE16: v_cmp_le_i16_e32 vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_cmp_le_i16_e32 vcc_lo, v{{[0-9]+}}.l, v{{[0-9]+}}.h define amdgpu_kernel void @i16_sle(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -190,6 +214,8 @@ entry: ; GCN-LABEL: {{^}}i16_eq_v_s: ; VI: v_cmp_eq_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_eq_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} +; GFX11-FAKE16: v_cmp_eq_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_cmp_eq_u16_e32 vcc_lo, v{{[0-9]+}}.h, v{{[0-9]+}}.l define amdgpu_kernel void @i16_eq_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -206,6 +232,8 @@ entry: ; GCN-LABEL: {{^}}i16_ne_v_s: ; VI: v_cmp_ne_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: 
v_cmp_ne_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} +; GFX11-FAKE16: v_cmp_ne_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_cmp_ne_u16_e32 vcc_lo, v{{[0-9]+}}.h, v{{[0-9]+}}.l define amdgpu_kernel void @i16_ne_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -222,6 +250,8 @@ entry: ; GCN-LABEL: {{^}}i16_ugt_v_s: ; VI: v_cmp_lt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} +; GFX11-FAKE16: v_cmp_lt_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_cmp_gt_u16_e32 vcc_lo, v{{[0-9]+}}.h, v{{[0-9]+}}.l define amdgpu_kernel void @i16_ugt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -238,6 +268,8 @@ entry: ; GCN-LABEL: {{^}}i16_uge_v_s: ; VI: v_cmp_le_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} +; GFX11-FAKE16: v_cmp_le_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_cmp_ge_u16_e32 vcc_lo, v{{[0-9]+}}.h, v{{[0-9]+}}.l define amdgpu_kernel void @i16_uge_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -254,6 +286,8 @@ entry: ; GCN-LABEL: {{^}}i16_ult_v_s: ; VI: v_cmp_gt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} +; GFX11-FAKE16: v_cmp_gt_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_cmp_lt_u16_e32 vcc_lo, v{{[0-9]+}}.h, v{{[0-9]+}}.l define amdgpu_kernel void @i16_ult_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -270,6 +304,8 @@ entry: ; GCN-LABEL: {{^}}i16_ule_v_s: ; VI: v_cmp_ge_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} +; GFX11-FAKE16: v_cmp_ge_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_cmp_le_u16_e32 vcc_lo, v{{[0-9]+}}.h, v{{[0-9]+}}.l 
define amdgpu_kernel void @i16_ule_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -286,6 +322,8 @@ entry: ; GCN-LABEL: {{^}}i16_sgt_v_s: ; VI: v_cmp_lt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} +; GFX11-FAKE16: v_cmp_lt_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_cmp_gt_i16_e32 vcc_lo, v{{[0-9]+}}.h, v{{[0-9]+}}.l define amdgpu_kernel void @i16_sgt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -302,6 +340,8 @@ entry: ; GCN-LABEL: {{^}}i16_sge_v_s: ; VI: v_cmp_le_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} +; GFX11-FAKE16: v_cmp_le_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_cmp_ge_i16_e32 vcc_lo, v{{[0-9]+}}.h, v{{[0-9]+}}.l define amdgpu_kernel void @i16_sge_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -318,6 +358,8 @@ entry: ; GCN-LABEL: {{^}}i16_slt_v_s: ; VI: v_cmp_gt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} +; GFX11-FAKE16: v_cmp_gt_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_cmp_lt_i16_e32 vcc_lo, v{{[0-9]+}}.h, v{{[0-9]+}}.l define amdgpu_kernel void @i16_slt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -334,6 +376,8 @@ entry: ; GCN-LABEL: {{^}}i16_sle_v_s: ; VI: v_cmp_ge_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} +; GFX11-FAKE16: v_cmp_ge_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_cmp_le_i16_e32 vcc_lo, v{{[0-9]+}}.h, v{{[0-9]+}}.l define amdgpu_kernel void @i16_sle_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x()