diff --git a/hc/src/hc_amdgcn.ll b/hc/src/hc_amdgcn.ll index 631c38a2..f7e1d94c 100644 --- a/hc/src/hc_amdgcn.ll +++ b/hc/src/hc_amdgcn.ll @@ -192,20 +192,20 @@ define i32 @__atomic_wrapdec(i32 addrspace(4)* nocapture %addr, i32 %val) #1 { ; llvm.amdgcn.atomic.dec.i32.p4i32 declare i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* nocapture, i32, i32, i32, i1) #4 -define i64 @__clock_u64() #1 { +define i64 @__clock_u64() #8 { %ret = tail call i64 @llvm.amdgcn.s.memrealtime() ret i64 %ret } -declare i64 @llvm.amdgcn.s.memrealtime() #1 +declare i64 @llvm.amdgcn.s.memrealtime() #8 -define i64 @__cycle_u64() #1 { +define i64 @__cycle_u64() #8 { %ret = tail call i64 @llvm.amdgcn.s.memtime() ret i64 %ret } -declare i64 @llvm.amdgcn.s.memtime() #1 +declare i64 @llvm.amdgcn.s.memtime() #8 define i32 @get_group_segment_size() #0 { %1 = call i32 @llvm.amdgcn.s.getreg(i32 17158) #0 @@ -253,4 +253,5 @@ attributes #4 = { convergent nounwind } attributes #5 = { alwaysinline nounwind } attributes #6 = { alwaysinline norecurse nounwind readnone } attributes #7 = { norecurse nounwind readnone } +attributes #8 = { alwaysinline nounwind inaccessiblememonly norecurse } attributes #9 = { convergent nounwind readnone } diff --git a/hc/src/hc_kernel.cl b/hc/src/hc_kernel.cl index 3bbd1c77..98fd1bca 100644 --- a/hc/src/hc_kernel.cl +++ b/hc/src/hc_kernel.cl @@ -13,18 +13,18 @@ amp_get_global_id(int dim) switch(dim) { case 0: - l = __llvm_amdgcn_workitem_id_x(); - g = __llvm_amdgcn_workgroup_id_x(); + l = __builtin_amdgcn_workitem_id_x(); + g = __builtin_amdgcn_workgroup_id_x(); s = p->workgroup_size_x; break; case 1: - l = __llvm_amdgcn_workitem_id_y(); - g = __llvm_amdgcn_workgroup_id_y(); + l = __builtin_amdgcn_workitem_id_y(); + g = __builtin_amdgcn_workgroup_id_y(); s = p->workgroup_size_y; break; case 2: - l = __llvm_amdgcn_workitem_id_z(); - g = __llvm_amdgcn_workgroup_id_z(); + l = __builtin_amdgcn_workitem_id_z(); + g = __builtin_amdgcn_workgroup_id_z(); s = p->workgroup_size_z; break; default: @@ -59,11 +59,11 @@ amp_get_local_id(int dim) { switch(dim) { case 0: - return __llvm_amdgcn_workitem_id_x(); + return __builtin_amdgcn_workitem_id_x(); case 1: - return __llvm_amdgcn_workitem_id_y(); + return __builtin_amdgcn_workitem_id_y(); case 2: - return __llvm_amdgcn_workitem_id_z(); + return __builtin_amdgcn_workitem_id_z(); default: return 0; } @@ -102,11 +102,11 @@ amp_get_group_id(int dim) { switch(dim) { case 0: - return __llvm_amdgcn_workgroup_id_x(); + return __builtin_amdgcn_workgroup_id_x(); case 1: - return __llvm_amdgcn_workgroup_id_y(); + return __builtin_amdgcn_workgroup_id_y(); case 2: - return __llvm_amdgcn_workgroup_id_z(); + return __builtin_amdgcn_workgroup_id_z(); default: return 0; } @@ -115,24 +115,7 @@ amp_get_group_id(int dim) ATTR uint amp_get_local_size(int dim) { - __constant hsa_kernel_dispatch_packet_t *p = __llvm_amdgcn_dispatch_ptr(); - uint d; - - switch(dim) { - case 0: - d = p->workgroup_size_x; - break; - case 1: - d = p->workgroup_size_y; - break; - case 2: - d = p->workgroup_size_z; - break; - default: - d = 1; - break; - } - return d; + return __ockl_get_local_size(dim); } ATTR uint @@ -176,10 +159,10 @@ hc_work_group_barrier(cl_mem_fence_flags flags, memory_scope scope) { if (flags) { atomic_work_item_fence(flags, memory_order_release, scope); - __llvm_amdgcn_s_barrier(); + __builtin_amdgcn_s_barrier(); atomic_work_item_fence(flags, memory_order_acquire, scope); } else { - __llvm_amdgcn_s_barrier(); + __builtin_amdgcn_s_barrier(); } } diff --git a/irif/inc/irif.h b/irif/inc/irif.h index 9121bb39..0ac816a2 100644 --- a/irif/inc/irif.h +++ b/irif/inc/irif.h @@ -99,27 +99,6 @@ extern __attribute__((const)) float __llvm_canonicalize_f32(float) __asm("llvm.c extern __attribute__((const)) double __llvm_canonicalize_f64(double) __asm("llvm.canonicalize.f64"); // Intrinsics requiring wrapping -extern bool __llvm_sadd_with_overflow_i16(short, short, __private short*); -extern bool __llvm_uadd_with_overflow_i16(ushort, ushort, __private ushort*); -extern bool __llvm_sadd_with_overflow_i32(int, int, __private int*); -extern bool __llvm_uadd_with_overflow_i32(uint, uint, __private uint*); -extern bool __llvm_sadd_with_overflow_i64(long, long, __private long*); -extern bool __llvm_uadd_with_overflow_i64(ulong, ulong, __private ulong*); - -extern bool __llvm_ssub_with_overflow_i16(short, short, __private short*); -extern bool __llvm_usub_with_overflow_i16(ushort, ushort, __private ushort*); -extern bool __llvm_ssub_with_overflow_i32(int, int, __private int*); -extern bool __llvm_usub_with_overflow_i32(uint, uint, __private uint*); -extern bool __llvm_ssub_with_overflow_i64(long, long, __private long*); -extern bool __llvm_usub_with_overflow_i64(ulong, ulong, __private ulong*); - -extern bool __llvm_smul_with_overflow_i16(short, short, __private short*); -extern bool __llvm_umul_with_overflow_i16(ushort, ushort, __private ushort*); -extern bool __llvm_smul_with_overflow_i32(int, int, __private int*); -extern bool __llvm_umul_with_overflow_i32(uint, uint, __private uint*); -extern bool __llvm_smul_with_overflow_i64(long, long, __private long*); -extern bool __llvm_umul_with_overflow_i64(ulong, ulong, __private ulong*); - extern __attribute__((const)) uchar __llvm_ctlz_i8(uchar); extern __attribute__((const)) ushort __llvm_ctlz_i16(ushort); extern __attribute__((const)) uint __llvm_ctlz_i32(uint); @@ -313,18 +292,6 @@ extern __attribute__((const)) double __llvm_amdgcn_trig_preop_f64(double, int) _ extern __attribute__((const)) half __llvm_amdgcn_fmed3_f16(half, half, half) __asm("llvm.amdgcn.fmed3.f16"); extern __attribute__((const)) float __llvm_amdgcn_fmed3_f32(float, float, float) __asm("llvm.amdgcn.fmed3.f32"); -extern void __llvm_amdgcn_s_sendmsg(uint, uint) __asm("llvm.amdgcn.s.sendmsg"); -extern void __llvm_amdgcn_s_barrier(void) __asm("llvm.amdgcn.s.barrier"); -extern void __llvm_amdgcn_wave_barrier(void) __asm("llvm.amdgcn.wave.barrier"); - -extern __attribute__((const)) uint __llvm_amdgcn_workitem_id_x(void) __asm("llvm.amdgcn.workitem.id.x"); -extern __attribute__((const)) uint __llvm_amdgcn_workitem_id_y(void) __asm("llvm.amdgcn.workitem.id.y"); -extern __attribute__((const)) uint __llvm_amdgcn_workitem_id_z(void) __asm("llvm.amdgcn.workitem.id.z"); - -extern __attribute__((const)) uint __llvm_amdgcn_workgroup_id_x(void) __asm("llvm.amdgcn.workgroup.id.x"); -extern __attribute__((const)) uint __llvm_amdgcn_workgroup_id_y(void) __asm("llvm.amdgcn.workgroup.id.y"); -extern __attribute__((const)) uint __llvm_amdgcn_workgroup_id_z(void) __asm("llvm.amdgcn.workgroup.id.z"); - extern __attribute__((const)) __constant void *__llvm_amdgcn_dispatch_ptr(void) __asm("llvm.amdgcn.dispatch.ptr"); extern __attribute__((const)) __constant void *__llvm_amdgcn_queue_ptr(void) __asm("llvm.amdgcn.queue.ptr"); extern __attribute__((const)) __constant void *__llvm_amdgcn_kernarg_segment_ptr(void) __asm("llvm.amdgcn.kernarg.segment.ptr"); diff --git a/irif/src/overflow.ll b/irif/src/overflow.ll deleted file mode 100644 index cb069b1c..00000000 --- a/irif/src/overflow.ll +++ /dev/null @@ -1,183 +0,0 @@ -; ===-------------------------------------------------------------------------- -; ROCm Device Libraries -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; ===------------------------------------------------------------------------*/ - -target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" -target triple = "amdgcn--amdhsa" - -declare {i16, i1} @llvm.sadd.with.overflow.i16(i16, i16) -declare {i16, i1} @llvm.uadd.with.overflow.i16(i16, i16) - -define zeroext i1 @__llvm_sadd_with_overflow_i16(i16, i16, i16* nocapture) #0 { - %4 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %0, i16 %1) - %5 = extractvalue {i16, i1} %4, 0 - store i16 %5, i16* %2, align 4 - %6 = extractvalue {i16, i1} %4, 1 - ret i1 %6 -} - -define zeroext i1 @__llvm_uadd_with_overflow_i16(i16, i16, i16* nocapture) #0 { - %4 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %0, i16 %1) - %5 = extractvalue {i16, i1} %4, 0 - store i16 %5, i16* %2, align 4 - %6 = extractvalue {i16, i1} %4, 1 - ret i1 %6 -} - -declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) -declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) - -define zeroext i1 @__llvm_sadd_with_overflow_i32(i32, i32, i32* nocapture) #0 { - %4 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %0, i32 %1) - %5 = extractvalue {i32, i1} %4, 0 - store i32 %5, i32* %2, align 4 - %6 = extractvalue {i32, i1} %4, 1 - ret i1 %6 -} - -define zeroext i1 @__llvm_uadd_with_overflow_i32(i32, i32, i32* nocapture) #0 { - %4 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %0, i32 %1) - %5 = extractvalue {i32, i1} %4, 0 - store i32 %5, i32* %2, align 4 - %6 = extractvalue {i32, i1} %4, 1 - ret i1 %6 -} - -declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) -declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64) - -define zeroext i1 @__llvm_sadd_with_overflow_i64(i64, i64, i64* nocapture) #0 { - %4 = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %0, i64 %1) - %5 = extractvalue {i64, i1} %4, 0 - store i64 %5, i64* %2, align 4 - %6 = extractvalue {i64, i1} %4, 1 - ret i1 %6 -} - -define zeroext i1 @__llvm_uadd_with_overflow_i64(i64, i64, i64* nocapture) #0 { - %4 = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %0, i64 %1) - %5 = extractvalue {i64, i1} %4, 0 - store i64 %5, i64* %2, align 4 - %6 = extractvalue {i64, i1} %4, 1 - ret i1 %6 -} - -declare {i16, i1} @llvm.ssub.with.overflow.i16(i16, i16) -declare {i16, i1} @llvm.usub.with.overflow.i16(i16, i16) - -define zeroext i1 @__llvm_ssub_with_overflow_i16(i16, i16, i16* nocapture) #0 { - %4 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %0, i16 %1) - %5 = extractvalue {i16, i1} %4, 0 - store i16 %5, i16* %2, align 4 - %6 = extractvalue {i16, i1} %4, 1 - ret i1 %6 -} - -define zeroext i1 @__llvm_usub_with_overflow_i16(i16, i16, i16* nocapture) #0 { - %4 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %0, i16 %1) - %5 = extractvalue {i16, i1} %4, 0 - store i16 %5, i16* %2, align 4 - %6 = extractvalue {i16, i1} %4, 1 - ret i1 %6 -} - -declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) -declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32) - -define zeroext i1 @__llvm_ssub_with_overflow_i32(i32, i32, i32* nocapture) #0 { - %4 = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %0, i32 %1) - %5 = extractvalue {i32, i1} %4, 0 - store i32 %5, i32* %2, align 4 - %6 = extractvalue {i32, i1} %4, 1 - ret i1 %6 -} - -define zeroext i1 @__llvm_usub_with_overflow_i32(i32, i32, i32* nocapture) #0 { - %4 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %0, i32 %1) - %5 = extractvalue {i32, i1} %4, 0 - store i32 %5, i32* %2, align 4 - %6 = extractvalue {i32, i1} %4, 1 - ret i1 %6 -} - -declare {i64, i1} @llvm.ssub.with.overflow.i64(i64, i64) -declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64) - -define zeroext i1 @__llvm_ssub_with_overflow_i64(i64, i64, i64* nocapture) #0 { - %4 = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %0, i64 %1) - %5 = extractvalue {i64, i1} %4, 0 - store i64 %5, i64* %2, align 4 - %6 = extractvalue {i64, i1} %4, 1 - ret i1 %6 -} - -define zeroext i1 @__llvm_usub_with_overflow_i64(i64, i64, i64* nocapture) #0 { - %4 = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %0, i64 %1) - %5 = extractvalue {i64, i1} %4, 0 - store i64 %5, i64* %2, align 4 - %6 = extractvalue {i64, i1} %4, 1 - ret i1 %6 -} - -declare {i16, i1} @llvm.smul.with.overflow.i16(i16, i16) -declare {i16, i1} @llvm.umul.with.overflow.i16(i16, i16) - -define zeroext i1 @__llvm_smul_with_overflow_i16(i16, i16, i16* nocapture) #0 { - %4 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %0, i16 %1) - %5 = extractvalue {i16, i1} %4, 0 - store i16 %5, i16* %2, align 4 - %6 = extractvalue {i16, i1} %4, 1 - ret i1 %6 -} - -define zeroext i1 @__llvm_umul_with_overflow_i16(i16, i16, i16* nocapture) #0 { - %4 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %0, i16 %1) - %5 = extractvalue {i16, i1} %4, 0 - store i16 %5, i16* %2, align 4 - %6 = extractvalue {i16, i1} %4, 1 - ret i1 %6 -} - -declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) -declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) - -define zeroext i1 @__llvm_smul_with_overflow_i32(i32, i32, i32* nocapture) #0 { - %4 = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %0, i32 %1) - %5 = extractvalue {i32, i1} %4, 0 - store i32 %5, i32* %2, align 4 - %6 = extractvalue {i32, i1} %4, 1 - ret i1 %6 -} - -define zeroext i1 @__llvm_umul_with_overflow_i32(i32, i32, i32* nocapture) #0 { - %4 = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %0, i32 %1) - %5 = extractvalue {i32, i1} %4, 0 - store i32 %5, i32* %2, align 4 - %6 = extractvalue {i32, i1} %4, 1 - ret i1 %6 -} - -declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) -declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) - -define zeroext i1 @__llvm_smul_with_overflow_i64(i64, i64, i64* nocapture) #0 { - %4 = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %0, i64 %1) - %5 = extractvalue {i64, i1} %4, 0 - store i64 %5, i64* %2, align 4 - %6 = extractvalue {i64, i1} %4, 1 - ret i1 %6 -} - -define zeroext i1 @__llvm_umul_with_overflow_i64(i64, i64, i64* nocapture) #0 { - %4 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %0, i64 %1) - %5 = extractvalue {i64, i1} %4, 0 - store i64 %5, i64* %2, align 4 - %6 = extractvalue {i64, i1} %4, 1 - ret i1 %6 -} - -attributes #0 = { alwaysinline argmemonly norecurse nounwind } - diff --git a/ockl/src/add_sat.cl b/ockl/src/add_sat.cl index 2b33791f..d17e64c1 100644 --- a/ockl/src/add_sat.cl +++ b/ockl/src/add_sat.cl @@ -12,7 +12,7 @@ __attribute__((always_inline)) int OCKL_MANGLE_I32(add_sat)(int x, int y) { int s; - bool c = __llvm_sadd_with_overflow_i32(x, y, &s); + bool c = __builtin_sadd_overflow(x, y, &s); int lim = (x >> 31) ^ INT_MAX; return c ? lim : s; } @@ -21,7 +21,7 @@ __attribute__((always_inline)) uint OCKL_MANGLE_U32(add_sat)(uint x, uint y) { uint s; - bool c = __llvm_uadd_with_overflow_i32(x, y, &s); + bool c = __builtin_uadd_overflow(x, y, &s); return c ? UINT_MAX : s; } @@ -29,7 +29,7 @@ __attribute__((always_inline)) long OCKL_MANGLE_I64(add_sat)(long x, long y) { long s; - bool c = __llvm_sadd_with_overflow_i64(x, y, &s); + bool c = __builtin_saddl_overflow(x, y, &s); long lim = (x >> 63) ^ LONG_MAX; return c ? lim : s; } @@ -38,7 +38,7 @@ __attribute__((always_inline)) ulong OCKL_MANGLE_U64(add_sat)(ulong x, ulong y) { ulong s; - bool c = __llvm_uadd_with_overflow_i64(x, y, &s); + bool c = __builtin_uaddl_overflow(x, y, &s); return c ? ULONG_MAX : s; } diff --git a/ockl/src/hsaqs.cl b/ockl/src/hsaqs.cl index a39fc495..de82f52a 100644 --- a/ockl/src/hsaqs.cl +++ b/ockl/src/hsaqs.cl @@ -66,7 +66,7 @@ update_mbox(const __global amd_signal_t *sig) if (mb) { uint id = sig->event_id; atomic_store_explicit(mb, id, memory_order_release, memory_scope_all_svm_devices); - __llvm_amdgcn_s_sendmsg(1 | (0 << 4), __llvm_amdgcn_readfirstlane(id) & 0xff); + __builtin_amdgcn_s_sendmsg(1 | (0 << 4), __llvm_amdgcn_readfirstlane(id) & 0xff); } } diff --git a/ockl/src/sub_sat.cl b/ockl/src/sub_sat.cl index 9b39d9ac..2cb506b7 100644 --- a/ockl/src/sub_sat.cl +++ b/ockl/src/sub_sat.cl @@ -12,7 +12,7 @@ __attribute__((always_inline, const)) int OCKL_MANGLE_I32(sub_sat)(int x, int y) { int s; - bool c = __llvm_ssub_with_overflow_i32(x, y, &s); + bool c = __builtin_ssub_overflow(x, y, &s); int lim = (x >> 31) ^ INT_MAX; return c ? lim : s; } @@ -21,7 +21,7 @@ __attribute__((always_inline, const)) uint OCKL_MANGLE_U32(sub_sat)(uint x, uint y) { uint s; - bool c = __llvm_usub_with_overflow_i32(x, y, &s); + bool c = __builtin_usub_overflow(x, y, &s); return c ? 0U : s; } @@ -29,7 +29,7 @@ __attribute__((always_inline, const)) long OCKL_MANGLE_I64(sub_sat)(long x, long y) { long s; - bool c = __llvm_ssub_with_overflow_i64(x, y, &s); + bool c = __builtin_ssubl_overflow(x, y, &s); long lim = (x >> 63) ^ LONG_MAX; return c ? lim : s; } @@ -38,7 +38,7 @@ __attribute__((always_inline, const)) ulong OCKL_MANGLE_U64(sub_sat)(ulong x, ulong y) { ulong s; - bool c = __llvm_usub_with_overflow_i64(x, y, &s); + bool c = __builtin_usubl_overflow(x, y, &s); return c ? 0UL : s; } diff --git a/ockl/src/workitem.cl b/ockl/src/workitem.cl index 6e315a6b..c03a9f88 100644 --- a/ockl/src/workitem.cl +++ b/ockl/src/workitem.cl @@ -34,18 +34,18 @@ __ockl_get_global_id(uint dim) switch(dim) { case 0: - l = __llvm_amdgcn_workitem_id_x(); - g = __llvm_amdgcn_workgroup_id_x(); + l = __builtin_amdgcn_workitem_id_x(); + g = __builtin_amdgcn_workgroup_id_x(); s = p->workgroup_size_x; break; case 1: - l = __llvm_amdgcn_workitem_id_y(); - g = __llvm_amdgcn_workgroup_id_y(); + l = __builtin_amdgcn_workitem_id_y(); + g = __builtin_amdgcn_workgroup_id_y(); s = p->workgroup_size_y; break; case 2: - l = __llvm_amdgcn_workitem_id_z(); - g = __llvm_amdgcn_workgroup_id_z(); + l = __builtin_amdgcn_workitem_id_z(); + g = __builtin_amdgcn_workgroup_id_z(); s = p->workgroup_size_z; break; default: @@ -63,11 +63,11 @@ __ockl_get_local_id(uint dim) { switch(dim) { case 0: - return __llvm_amdgcn_workitem_id_x(); + return __builtin_amdgcn_workitem_id_x(); case 1: - return __llvm_amdgcn_workitem_id_y(); + return __builtin_amdgcn_workitem_id_y(); case 2: - return __llvm_amdgcn_workitem_id_z(); + return __builtin_amdgcn_workitem_id_z(); default: return 0; } @@ -78,11 +78,11 @@ __ockl_get_group_id(uint dim) { switch(dim) { case 0: - return __llvm_amdgcn_workgroup_id_x(); + return __builtin_amdgcn_workgroup_id_x(); case 1: - return __llvm_amdgcn_workgroup_id_y(); + return __builtin_amdgcn_workgroup_id_y(); case 2: - return __llvm_amdgcn_workgroup_id_z(); + return __builtin_amdgcn_workgroup_id_z(); default: return 0; } @@ -110,38 +110,32 @@ __ockl_get_local_size(uint dim) { // TODO save some effort if -cl-uniform-work-group-size is used __constant hsa_kernel_dispatch_packet_t *p = __llvm_amdgcn_dispatch_ptr(); - uint l, g, n, d; + uint g, n, d; switch(dim) { case 0: - l = __llvm_amdgcn_workitem_id_x(); - g = __llvm_amdgcn_workgroup_id_x(); + g = __builtin_amdgcn_workgroup_id_x(); n = p->grid_size_x; d = p->workgroup_size_x; break; case 1: - l = __llvm_amdgcn_workitem_id_y(); - g = __llvm_amdgcn_workgroup_id_y(); + g = __builtin_amdgcn_workgroup_id_y(); n = p->grid_size_y; d = p->workgroup_size_y; break; case 2: - l = __llvm_amdgcn_workitem_id_z(); - g = __llvm_amdgcn_workgroup_id_z(); + g = __builtin_amdgcn_workgroup_id_z(); n = p->grid_size_z; d = p->workgroup_size_z; break; default: - l = 0; g = 0; n = 0; d = 1; break; } - uint q = n / d; - uint r = n - q*d; - uint i = g*d + l; - return (r > 0) & (i >= n-r) ? r : d; + uint r = n - g*d; + return (r < d) ? r : d; } ATTR size_t @@ -208,17 +202,17 @@ __ockl_get_global_linear_id(void) switch (p->setup) { case 1: { - uint l0 = __llvm_amdgcn_workitem_id_x(); - uint g0 = __llvm_amdgcn_workgroup_id_x(); + uint l0 = __builtin_amdgcn_workitem_id_x(); + uint g0 = __builtin_amdgcn_workgroup_id_x(); uint s0 = p->workgroup_size_x; return g0*s0 + l0; } case 2: { - uint l0 = __llvm_amdgcn_workitem_id_x(); - uint l1 = __llvm_amdgcn_workitem_id_y(); - uint g0 = __llvm_amdgcn_workgroup_id_x(); - uint g1 = __llvm_amdgcn_workgroup_id_y(); + uint l0 = __builtin_amdgcn_workitem_id_x(); + uint l1 = __builtin_amdgcn_workitem_id_y(); + uint g0 = __builtin_amdgcn_workgroup_id_x(); + uint g1 = __builtin_amdgcn_workgroup_id_y(); uint s0 = p->workgroup_size_x; uint s1 = p->workgroup_size_y; uint n0 = p->grid_size_x; @@ -228,12 +222,12 @@ __ockl_get_global_linear_id(void) } case 3: { - uint l0 = __llvm_amdgcn_workitem_id_x(); - uint l1 = __llvm_amdgcn_workitem_id_y(); - uint l2 = __llvm_amdgcn_workitem_id_z(); - uint g0 = __llvm_amdgcn_workgroup_id_x(); - uint g1 = __llvm_amdgcn_workgroup_id_y(); - uint g2 = __llvm_amdgcn_workgroup_id_z(); + uint l0 = __builtin_amdgcn_workitem_id_x(); + uint l1 = __builtin_amdgcn_workitem_id_y(); + uint l2 = __builtin_amdgcn_workitem_id_z(); + uint g0 = __builtin_amdgcn_workgroup_id_x(); + uint g1 = __builtin_amdgcn_workgroup_id_y(); + uint g2 = __builtin_amdgcn_workgroup_id_z(); uint s0 = p->workgroup_size_x; uint s1 = p->workgroup_size_y; uint s2 = p->workgroup_size_z; @@ -253,7 +247,7 @@ ATTR size_t __ockl_get_local_linear_id(void) { __constant hsa_kernel_dispatch_packet_t *p = __llvm_amdgcn_dispatch_ptr(); - return (__llvm_amdgcn_workitem_id_z()*p->workgroup_size_y + - __llvm_amdgcn_workitem_id_y()) * p->workgroup_size_x + __llvm_amdgcn_workitem_id_x(); + return (__builtin_amdgcn_workitem_id_z()*p->workgroup_size_y + + __builtin_amdgcn_workitem_id_y()) * p->workgroup_size_x + __builtin_amdgcn_workitem_id_x(); } diff --git a/opencl/src/pipes/pipes.h b/opencl/src/pipes/pipes.h index 16ab22fd..010368f1 100644 --- a/opencl/src/pipes/pipes.h +++ b/opencl/src/pipes/pipes.h @@ -71,14 +71,14 @@ wave_reserve_1(volatile __global atomic_size_t *pi, size_t lim) } } - __llvm_amdgcn_wave_barrier(); + __builtin_amdgcn_wave_barrier(); // Broadcast the result; the ctz tells us which lane has active lane id 0 uint k = (uint)__llvm_cttz_i64(__llvm_amdgcn_read_exec()); i = ((size_t)__llvm_amdgcn_readlane((uint)(i >> 32), k) << 32) | (size_t)__llvm_amdgcn_readlane((uint)i, k); - __llvm_amdgcn_wave_barrier(); + __builtin_amdgcn_wave_barrier(); if (i != ~(size_t)0) i += l; diff --git a/opencl/src/pipes/wresvnp.cl b/opencl/src/pipes/wresvnp.cl index 2b4f2fa4..d686931b 100644 --- a/opencl/src/pipes/wresvnp.cl +++ b/opencl/src/pipes/wresvnp.cl @@ -120,19 +120,19 @@ __amd_wresvn(volatile __global atomic_size_t *pidx, size_t lim, size_t n) slid = 63 - (int)clz(smask); t = __llvm_amdgcn_ds_bpermute(slid << 2, sum); sum += slid < 0 ? 0 : t; - __llvm_amdgcn_wave_barrier(); + __builtin_amdgcn_wave_barrier(); size_t idx = 0; if (l == 63 - (int)clz(__llvm_amdgcn_read_exec())) { idx = reserve(pidx, lim, (size_t)sum); } - __llvm_amdgcn_wave_barrier(); + __builtin_amdgcn_wave_barrier(); // Broadcast uint k = 63u - (uint)clz(__llvm_amdgcn_read_exec()); idx = ((size_t)__llvm_amdgcn_readlane((uint)(idx >> 32), k) << 32) | (size_t)__llvm_amdgcn_readlane((uint)idx, k); - __llvm_amdgcn_wave_barrier(); + __builtin_amdgcn_wave_barrier(); rid = idx + (size_t)(sum - (uint)n); rid = idx != ~(size_t)0 ? rid : idx; diff --git a/opencl/src/subgroup/subbar.cl b/opencl/src/subgroup/subbar.cl index 73de6850..77e8f77a 100644 --- a/opencl/src/subgroup/subbar.cl +++ b/opencl/src/subgroup/subbar.cl @@ -17,7 +17,7 @@ __attribute__((overloadable, always_inline)) void sub_group_barrier(cl_mem_fence_flags flags, memory_scope scope) { // This barrier is a no-op to ensure this function remains convergent - __llvm_amdgcn_wave_barrier(); + __builtin_amdgcn_wave_barrier(); if (flags) atomic_work_item_fence(flags, memory_order_acq_rel, scope); diff --git a/opencl/src/workgroup/wgbarrier.cl b/opencl/src/workgroup/wgbarrier.cl index 0e03210e..d399028d 100644 --- a/opencl/src/workgroup/wgbarrier.cl +++ b/opencl/src/workgroup/wgbarrier.cl @@ -5,8 +5,6 @@ * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ -#include "irif.h" - __attribute__((overloadable, always_inline)) void barrier(cl_mem_fence_flags flags) { @@ -23,11 +21,19 @@ __attribute__((overloadable, always_inline)) void work_group_barrier(cl_mem_fence_flags flags, memory_scope scope) { if (flags) { - atomic_work_item_fence(flags, memory_order_release, scope); - __llvm_amdgcn_s_barrier(); - atomic_work_item_fence(flags, memory_order_acquire, scope); + atomic_work_item_fence(flags, + flags == (CLK_GLOBAL_MEM_FENCE|CLK_LOCAL_MEM_FENCE) ? + memory_order_seq_cst : memory_order_release, + scope); + + __builtin_amdgcn_s_barrier(); + + atomic_work_item_fence(flags, + flags == (CLK_GLOBAL_MEM_FENCE|CLK_LOCAL_MEM_FENCE) ? + memory_order_seq_cst : memory_order_acquire, + scope); } else { - __llvm_amdgcn_s_barrier(); + __builtin_amdgcn_s_barrier(); } }