From 94b5167ce3880c1654f9d24bd5bbe2b45f507a1e Mon Sep 17 00:00:00 2001 From: Brian Sumner Date: Fri, 15 Sep 2017 05:13:35 -0700 Subject: [PATCH] Pipe functions Change-Id: I47368f5e3d7b1083d0e7ba8a9b355cd7f5433f19 --- opencl/CMakeLists.txt | 1 + opencl/src/pipes/commitp.cl | 93 +++++++++++++++ opencl/src/pipes/getp.cl | 45 +++++++ opencl/src/pipes/memcpyia.cl | 55 +++++++++ opencl/src/pipes/pipes.h | 109 +++++++++++++++++ opencl/src/pipes/readp.cl | 75 ++++++++++++ opencl/src/pipes/reservep.cl | 219 +++++++++++++++++++++++++++++++++++ opencl/src/pipes/validp.cl | 14 +++ opencl/src/pipes/wresvnp.cl | 148 +++++++++++++++++++++++ opencl/src/pipes/writep.cl | 65 +++++++++++ 10 files changed, 824 insertions(+) create mode 100644 opencl/src/pipes/commitp.cl create mode 100644 opencl/src/pipes/getp.cl create mode 100644 opencl/src/pipes/memcpyia.cl create mode 100644 opencl/src/pipes/pipes.h create mode 100644 opencl/src/pipes/readp.cl create mode 100644 opencl/src/pipes/reservep.cl create mode 100644 opencl/src/pipes/validp.cl create mode 100644 opencl/src/pipes/wresvnp.cl create mode 100644 opencl/src/pipes/writep.cl diff --git a/opencl/CMakeLists.txt b/opencl/CMakeLists.txt index 5b707605..8da642aa 100644 --- a/opencl/CMakeLists.txt +++ b/opencl/CMakeLists.txt @@ -14,6 +14,7 @@ file(GLOB cl_sources ${CMAKE_CURRENT_SOURCE_DIR}/src/math/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/media/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/misc/*.cl + ${CMAKE_CURRENT_SOURCE_DIR}/src/pipes/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/relational/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/subgroup/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/vldst/*.cl diff --git a/opencl/src/pipes/commitp.cl b/opencl/src/pipes/commitp.cl new file mode 100644 index 00000000..51528cb8 --- /dev/null +++ b/opencl/src/pipes/commitp.cl @@ -0,0 +1,93 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * 
License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "pipes.h"
+
+#define ATTR __attribute__((always_inline))
+// Pipe commit entry points. Every function body in this file is empty, so a commit performs no work here.
+#define COMMIT_READ_PIPE_SIZE(SIZE, STYPE) \
+ATTR void \
+__commit_read_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+// DO_PIPE_SIZE(COMMIT_READ_PIPE_SIZE) -- left disabled, so the per-size variants are not instantiated.
+// Generic read commit for reservation rid on pipe p; size and align are not used.
+ATTR void
+__commit_read_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
+{
+}
+
+#define COMMIT_WRITE_PIPE_SIZE(SIZE, STYPE) \
+ATTR void \
+__commit_write_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+// DO_PIPE_SIZE(COMMIT_WRITE_PIPE_SIZE) -- left disabled, so the per-size variants are not instantiated.
+// Generic write commit for reservation rid on pipe p; size and align are not used.
+ATTR void
+__commit_write_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
+{
+}
+
+// Work group functions: same signatures as above with a __work_group_ prefix, bodies are empty as well.
+
+#define WORK_GROUP_COMMIT_READ_PIPE_SIZE(SIZE, STYPE) \
+ATTR void \
+__work_group_commit_read_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+// DO_PIPE_SIZE(WORK_GROUP_COMMIT_READ_PIPE_SIZE) -- left disabled.
+// Work-group read commit for reservation rid; no parameter is used.
+ATTR void
+__work_group_commit_read_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
+{
+}
+
+#define WORK_GROUP_COMMIT_WRITE_PIPE_SIZE(SIZE, STYPE) \
+ATTR void \
+__work_group_commit_write_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+// DO_PIPE_SIZE(WORK_GROUP_COMMIT_WRITE_PIPE_SIZE) -- left disabled.
+// Work-group write commit for reservation rid; no parameter is used.
+ATTR void
+__work_group_commit_write_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
+{
+}
+
+// sub group functions: same signatures with a __sub_group_ prefix, bodies are empty as well.
+
+#define SUB_GROUP_COMMIT_READ_PIPE_SIZE(SIZE, STYPE) \
+ATTR void \
+__sub_group_commit_read_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+// DO_PIPE_SIZE(SUB_GROUP_COMMIT_READ_PIPE_SIZE) -- left disabled.
+// Sub-group read commit for reservation rid; no parameter is used.
+ATTR void
+__sub_group_commit_read_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
+{
+}
+
+#define SUB_GROUP_COMMIT_WRITE_PIPE_SIZE(SIZE, STYPE) \
+ATTR void \
+__sub_group_commit_write_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+// DO_PIPE_SIZE(SUB_GROUP_COMMIT_WRITE_PIPE_SIZE) -- left disabled.
+// Sub-group write commit for reservation rid; the body is empty, so nothing is done.
+ATTR void
+__sub_group_commit_write_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
+{
+}
+
diff --git a/opencl/src/pipes/getp.cl b/opencl/src/pipes/getp.cl
new file mode 100644
index 00000000..d5531996
--- /dev/null
+++ b/opencl/src/pipes/getp.cl
@@ -0,0 +1,45 @@
+/*===--------------------------------------------------------------------------
+ * ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "pipes.h"
+
+#define ATTR __attribute__((always_inline, pure))
+// Pipe size queries. The two indices are read with separate relaxed loads, so the packet count is only a snapshot.
+#define GET_PIPE_NUM_PACKETS_SIZE(SIZE, STYPE) \
+ATTR uint \
+__get_pipe_num_packets_##SIZE(__global struct pipeimp* p) \
+{ \
+    size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+    size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+    return wi > ri ? (uint)(wi - ri) : 0u; /* clamp: the loads may straddle an index reset */ \
+}
+
+// DO_PIPE_SIZE(GET_PIPE_NUM_PACKETS_SIZE) -- left disabled, so the per-size variants are not instantiated.
+// Returns write_idx - read_idx, or 0 when the snapshot is inconsistent (wi < ri); size and align are not used.
+ATTR uint
+__get_pipe_num_packets(__global struct pipeimp* p, uint size, uint align)
+{
+    size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device);
+    size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device);
+    return wi > ri ? (uint)(wi - ri) : 0u; // readers may zero write_idx between the two loads; never report a wrapped count
+}
+
+#define GET_PIPE_MAX_PACKETS_SIZE(SIZE, STYPE) \
+ATTR uint \
+__get_pipe_max_packets_##SIZE(__global struct pipeimp* p) \
+{ \
+    return (uint)p->end_idx; \
+}
+
+// DO_PIPE_SIZE(GET_PIPE_MAX_PACKETS_SIZE) -- left disabled, so the per-size variants are not instantiated.
+// Returns end_idx truncated to uint; size and align are not used.
+ATTR uint
+__get_pipe_max_packets(__global struct pipeimp* p, uint size, uint align)
+{
+    return (uint)p->end_idx;
+}
+
diff --git a/opencl/src/pipes/memcpyia.cl b/opencl/src/pipes/memcpyia.cl
new file mode 100644
index 00000000..f536d044
--- /dev/null
+++ b/opencl/src/pipes/memcpyia.cl
@@ -0,0 +1,55 @@
+/*===--------------------------------------------------------------------------
+ * ROCm Device
Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+// Copy size bytes from s to d in units picked from align (2, 4, 8, 16 or 32 bytes; bytewise for any other align). NOTE(review): a tail of size % unit bytes is not copied; callers presumably pass size as a multiple of align - confirm.
+__attribute__((always_inline, weak)) void
+__memcpy_internal_aligned(void *d, const void *s, size_t size, size_t align)
+{
+    if (align == 2) {
+        short *d2 = (short *)d;
+        const short *s2 = (const short *)s;
+        const short *e2 = s2 + size/2;
+
+        while (s2 < e2)
+            *d2++ = *s2++;
+    } else if (align == 4) {
+        int *d4 = (int *)d;
+        const int *s4 = (const int *)s;
+        const int *e4 = s4 + size/4;
+
+        while (s4 < e4)
+            *d4++ = *s4++;
+    } else if (align == 8) {
+        long *d8 = (long *)d;
+        const long *s8 = (const long *)s;
+        const long *e8 = s8 + size/8;
+
+        while (s8 < e8)
+            *d8++ = *s8++;
+    } else if (align == 16) {
+        long2 *d16 = (long2 *)d;
+        const long2 *s16 = (const long2 *)s;
+        const long2 *e16 = s16 + size/16;
+
+        while (s16 < e16)
+            *d16++ = *s16++;
+    } else if (align == 32 || align == 64 || align == 128) {
+        long4 *d32 = (long4 *)d;
+        const long4 *s32 = (const long4 *)s;
+        const long4 *e32 = s32 + size/32;
+        // 64- and 128-byte alignments are copied in the same 32-byte long4 units
+        while (s32 < e32)
+            *d32++ = *s32++;
+    } else {
+        char *d1 = (char *)d;
+        const char *s1 = (const char *)s;
+        const char *e1 = s1 + size;
+
+        while (s1 < e1)
+            *d1++ = *s1++;
+    }
+}
+
diff --git a/opencl/src/pipes/pipes.h b/opencl/src/pipes/pipes.h
new file mode 100644
index 00000000..16ab22fd
--- /dev/null
+++ b/opencl/src/pipes/pipes.h
@@ -0,0 +1,109 @@
+/*===--------------------------------------------------------------------------
+ * ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "irif.h"
+
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+
+extern size_t __amd_wresvn(volatile __global atomic_size_t *pidx, size_t lim, size_t n);
+// X-macro: applies F(packet size in bytes, unsigned type of that size) once per fixed packet size from 1 to 128 bytes.
+#define DO_PIPE_SIZE(F) \
+F(1,uchar) \
+F(2,ushort) \
+F(4,uint) \
+F(8,ulong) \
+F(16,ulong2) \
+F(32,ulong4) \
+F(64,ulong8) \
+F(128,ulong16)
+// Pipe object: atomic read/write indices, the capacity end_idx, padding, then packet storage. The pad puts packets at byte 128 if atomic_size_t has the size of size_t - TODO confirm.
+struct pipeimp {
+    atomic_size_t read_idx;
+    atomic_size_t write_idx;
+    size_t end_idx;
+    uchar pad[128 - 3*sizeof(size_t)];
+    uchar packets[1];
+};
+
+extern void __memcpy_internal_aligned(void *, const void *, size_t, size_t);
+// Atomically advance *pi by n while the new value stays <= lim. Returns the index before the advance, or ~(size_t)0 when i + n > lim.
+static __attribute__((always_inline)) size_t
+reserve(volatile __global atomic_size_t *pi, size_t lim, size_t n)
+{
+    size_t i = __opencl_atomic_load(pi, memory_order_relaxed, memory_scope_device);
+
+    for (;;) {
+        if (i + n > lim)
+            return ~(size_t)0;
+        // A failed compare-exchange refreshes i with the current value, so the limit check is redone on retry
+        if (__opencl_atomic_compare_exchange_strong(pi, &i, i + n, memory_order_relaxed, memory_order_relaxed, memory_scope_device))
+            break;
+    }
+
+    return i;
+}
+// Reserve one slot per active lane of the wave with a single CAS loop. Returns this lane's index, or ~(size_t)0 if no slot below lim could be reserved.
+static inline size_t
+wave_reserve_1(volatile __global atomic_size_t *pi, size_t lim)
+{
+    size_t n = (size_t)(__llvm_ctpop_i32(__llvm_amdgcn_read_exec_lo()) +
+                        __llvm_ctpop_i32(__llvm_amdgcn_read_exec_hi()));
+    uint l = __llvm_amdgcn_mbcnt_hi(__llvm_amdgcn_read_exec_hi(),
+                                    __llvm_amdgcn_mbcnt_lo(__llvm_amdgcn_read_exec_lo(), 0u));
+    size_t i = 0;
+    // n = population count of the EXEC mask; l = mbcnt over EXEC (the "active lane id" used below). Only l == 0 touches the index.
+    if (l == 0) {
+        i = __opencl_atomic_load(pi, memory_order_relaxed, memory_scope_device);
+
+        for (;;) {
+            if (i + n > lim) {
+                i = ~(size_t)0;
+                break;
+            }
+
+            if (__opencl_atomic_compare_exchange_strong(pi, &i, i + n, memory_order_relaxed, memory_order_relaxed, memory_scope_device))
+                break;
+        }
+    }
+
+    __llvm_amdgcn_wave_barrier();
+
+    // Broadcast the result; the ctz tells us which lane has active lane id 0
+    uint k = (uint)__llvm_cttz_i64(__llvm_amdgcn_read_exec());
+    i = ((size_t)__llvm_amdgcn_readlane((uint)(i >> 32), k) << 32) |
+        (size_t)__llvm_amdgcn_readlane((uint)i, k);
+
+    __llvm_amdgcn_wave_barrier();
+    // The 64-bit base was read back as two 32-bit halves; each lane takes base + l on success
+    if (i != ~(size_t)0)
+        i += l;
+    else {
+        // The entire group didn't fit, have to handle one by one
+        i = reserve(pi, lim, (size_t)1);
+    }
+
+    return i;
+}
+// Returns i reduced modulo n, using 32-bit arithmetic when the upper 32 bits of i are zero.
+static inline size_t
+wrap(size_t i, size_t n)
+{
+    // Assume end_i < 2^32
+    size_t ret;
+    if (as_uint2(i).y == 0U) {
+        uint j = (uint)i;
+        uint m = (uint)n;
+        if (j < m)
+            ret = i;
+        else
+            ret = (ulong)(j % m);
+    } else
+        ret = i % n;
+    return ret;
+}
+
diff --git a/opencl/src/pipes/readp.cl b/opencl/src/pipes/readp.cl
new file mode 100644
index 00000000..1808ad3a
--- /dev/null
+++ b/opencl/src/pipes/readp.cl
@@ -0,0 +1,75 @@
+/*===--------------------------------------------------------------------------
+ * ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "pipes.h"
+
+#define ATTR __attribute__((always_inline))
+// Unreserved read of one SIZE-byte packet into *ptr; returns 0 on success, -1 when no packet could be reserved.
+#define READ_PIPE_SIZE(SIZE, STYPE) \
+ATTR int \
+__read_pipe_2_##SIZE(__global struct pipeimp* p, STYPE* ptr) \
+{ \
+    size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+    size_t ri = wave_reserve_1(&p->read_idx, wi); \
+    if (ri == ~(size_t)0) \
+        return -1; \
+ \
+    size_t pi = wrap(ri, p->end_idx); \
+    *ptr = ((__global STYPE *)p->packets)[pi]; \
+ \
+    if (ri == wi-1) { \
+        __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device); \
+        __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+    }\
+\
+    return 0; \
+}
+
+DO_PIPE_SIZE(READ_PIPE_SIZE)
+// Unreserved read of one packet of size bytes into ptr; returns 0 on success, -1 when no packet could be reserved.
+ATTR int
+__read_pipe_2(__global struct pipeimp* p, void* ptr, uint size, uint align)
+{
+    size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device);
+    size_t ri = wave_reserve_1(&p->read_idx, wi);
+    if (ri == ~(size_t)0)
+        return -1;
+
+    size_t pi = wrap(ri, p->end_idx);
+    __memcpy_internal_aligned(ptr, p->packets + pi*size, size, align);
+    // Reading the last packet seen in the wi snapshot resets both indices. NOTE(review): these are plain stores; a write_idx advance made after the snapshot looks like it would be overwritten - confirm.
+    if (ri == wi-1) {
+        __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device);
+        __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+    }
+
+    return 0;
+}
+// Reserved read: copies the SIZE-byte packet at slot wrap(rid + i, end_idx) into *ptr; always returns 0.
+#define READ_PIPE_RESERVED_SIZE(SIZE, STYPE) \
+ATTR int \
+__read_pipe_4_##SIZE(__global struct pipeimp* p, size_t rid, uint i, STYPE* ptr) \
+{ \
+    rid += i; \
+    size_t pi = wrap(rid, p->end_idx); \
+    *ptr = ((__global STYPE *)p->packets)[pi]; \
+ \
+    return 0; \
+}
+
+DO_PIPE_SIZE(READ_PIPE_RESERVED_SIZE)
+// Reserved read: copies the packet at slot wrap(rid + i, end_idx) into ptr; always returns 0 and does not validate rid or i.
+ATTR int
+__read_pipe_4(__global struct pipeimp* p, size_t rid, uint i, void *ptr, uint size, uint align)
+{
+    rid += i;
+    size_t pi = wrap(rid, p->end_idx);
+    __memcpy_internal_aligned(ptr, p->packets + pi*size, size, align);
+
+    return 0;
+}
+
diff --git a/opencl/src/pipes/reservep.cl b/opencl/src/pipes/reservep.cl
new file mode 100644
index 00000000..18e073be
--- /dev/null
+++ b/opencl/src/pipes/reservep.cl
@@ -0,0 +1,219 @@
+/*===--------------------------------------------------------------------------
+ * ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#include "pipes.h"
+#include "../workgroup/wg.h"
+
+#define ATTR __attribute__((always_inline))
+// Reservation entry points. Each returns the first reserved index, or ~(size_t)0 when the request does not fit.
+#define RESERVE_READ_PIPE_SIZE(SIZE, STYPE) \
+ATTR size_t \
+__reserve_read_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+    size_t rid = __amd_wresvn(&p->read_idx, wi, num_packets); \
+ \
+    if (rid != ~(size_t)0 && rid + num_packets == wi) { \
+        __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device); \
+        __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+    } \
+ \
+    return rid; \
+}
+
+// DO_PIPE_SIZE(RESERVE_READ_PIPE_SIZE) -- left disabled, so the per-size variants are not instantiated.
+// Reserve num_packets packets for reading, bounded by the current write_idx; size and align are not used.
+ATTR size_t
+__reserve_read_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align)
+{
+    size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device);
+    size_t rid = __amd_wresvn(&p->read_idx, wi, num_packets);
+    // Reset both indices only when a successful reservation drains the pipe; the failure sentinel would wrap in the sum and could match wi
+    if (rid != ~(size_t)0 && rid + num_packets == wi) {
+        __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device);
+        __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+    }
+
+    return rid;
+}
+
+#define RESERVE_WRITE_PIPE_SIZE(SIZE, STYPE) \
+ATTR size_t \
+__reserve_write_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+    size_t ei = p->end_idx; \
+    return __amd_wresvn(&p->write_idx, ri + ei, num_packets); \
+}
+
+// DO_PIPE_SIZE(RESERVE_WRITE_PIPE_SIZE) -- left disabled, so the per-size variants are not instantiated.
+// Reserve num_packets packets for writing; write_idx may advance up to read_idx + end_idx.
+ATTR size_t
+__reserve_write_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align)
+{
+    size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device);
+    size_t ei = p->end_idx;
+    return __amd_wresvn(&p->write_idx, ri + ei, num_packets);
+}
+
+// Work group functions: work-item 0 reserves, the id is passed through scratch LDS and read after a barrier.
+
+#define WORK_GROUP_RESERVE_READ_PIPE_SIZE(SIZE, STYPE) \
+ATTR size_t \
+__work_group_reserve_read_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    __local size_t *t = (__local size_t *)__get_scratch_lds(); \
+ \
+    if ((int)get_local_linear_id() == 0) { \
+        size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+        size_t rid = reserve(&p->read_idx, wi, num_packets); \
+ \
+        if (rid != ~(size_t)0 && rid + num_packets == wi) { \
+            __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device); \
+            __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+        } \
+ \
+        *t = rid; \
+    } \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ \
+    return *t; \
+}
+
+// DO_PIPE_SIZE(WORK_GROUP_RESERVE_READ_PIPE_SIZE) -- left disabled.
+// Work-group read reservation: one reserve() call by work-item 0, result returned to every work-item.
+ATTR size_t
+__work_group_reserve_read_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align)
+{
+    __local size_t *t = (__local size_t *)__get_scratch_lds();
+
+    if ((int)get_local_linear_id() == 0) {
+        size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device);
+        size_t rid = reserve(&p->read_idx, wi, num_packets);
+        // Skip the drain reset for a failed reservation: ~(size_t)0 + num_packets wraps and could equal wi
+        if (rid != ~(size_t)0 && rid + num_packets == wi) {
+            __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device);
+            __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+        }
+
+        *t = rid;
+    }
+
+    work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+    return *t;
+}
+
+#define WORK_GROUP_RESERVE_WRITE_PIPE_SIZE(SIZE, STYPE) \
+ATTR size_t \
+__work_group_reserve_write_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    __local size_t *t = (__local size_t *)__get_scratch_lds(); \
+ \
+    if ((int)get_local_linear_id() == 0) { \
+        size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+        size_t ei = p->end_idx; \
+        *t = reserve(&p->write_idx, ri + ei, num_packets); \
+    } \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ \
+    return *t; \
+}
+
+// DO_PIPE_SIZE(WORK_GROUP_RESERVE_WRITE_PIPE_SIZE) -- left disabled.
+// Work-group write reservation: one reserve() call by work-item 0, result returned to every work-item.
+ATTR size_t
+__work_group_reserve_write_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align)
+{
+    __local size_t *t = (__local size_t *)__get_scratch_lds();
+
+    if ((int)get_local_linear_id() == 0) {
+        size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device);
+        size_t ei = p->end_idx;
+        *t = reserve(&p->write_idx, ri + ei, num_packets);
+    }
+
+    work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+    return *t;
+}
+
+// sub group functions: sub-group lane 0 reserves and the id is broadcast from lane 0.
+
+#define SUB_GROUP_RESERVE_READ_PIPE_SIZE(SIZE, STYPE) \
+ATTR size_t \
+__sub_group_reserve_read_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    size_t rid = ~(size_t)0; \
+ \
+    if (get_sub_group_local_id() == 0) { \
+        size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+        rid = reserve(&p->read_idx, wi, num_packets); \
+ \
+        if (rid != ~(size_t)0 && rid + num_packets == wi) { \
+            __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device); \
+            __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+        } \
+    } \
+ \
+    return sub_group_broadcast(rid, 0); \
+}
+
+// DO_PIPE_SIZE(SUB_GROUP_RESERVE_READ_PIPE_SIZE) -- left disabled.
+// Sub-group read reservation: one reserve() call by lane 0, result broadcast to the sub-group.
+ATTR size_t
+__sub_group_reserve_read_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align)
+{
+    size_t rid = ~(size_t)0;
+
+    if (get_sub_group_local_id() == 0) {
+        size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device);
+        rid = reserve(&p->read_idx, wi, num_packets);
+        // Skip the drain reset for a failed reservation: ~(size_t)0 + num_packets wraps and could equal wi
+        if (rid != ~(size_t)0 && rid + num_packets == wi) {
+            __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device);
+            __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+        }
+    }
+
+    return sub_group_broadcast(rid, 0);
+}
+
+#define SUB_GROUP_RESERVE_WRITE_PIPE_SIZE(SIZE, STYPE) \
+ATTR size_t \
+__sub_group_reserve_write_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    size_t rid = ~(size_t)0; \
+ \
+    if (get_sub_group_local_id() == 0) { \
+        size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+        size_t ei = p->end_idx; \
+        rid = reserve(&p->write_idx, ri + ei, num_packets); \
+    } \
+ \
+    return sub_group_broadcast(rid, 0); \
+}
+
+// DO_PIPE_SIZE(SUB_GROUP_RESERVE_WRITE_PIPE_SIZE) -- left disabled.
+// Sub-group write reservation: one reserve() call by lane 0, result broadcast to the sub-group.
+ATTR size_t
+__sub_group_reserve_write_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align)
+{
+    size_t rid = ~(size_t)0;
+
+    if (get_sub_group_local_id() == 0) {
+        size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device);
+        size_t ei = p->end_idx;
+        rid = reserve(&p->write_idx, ri + ei, num_packets);
+    }
+
+    return sub_group_broadcast(rid, 0);
+}
+
diff --git a/opencl/src/pipes/validp.cl b/opencl/src/pipes/validp.cl
new file mode 100644
index 00000000..5397dfce
--- /dev/null
+++ b/opencl/src/pipes/validp.cl
@@ -0,0 +1,14 @@
+/*===--------------------------------------------------------------------------
+ * ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+// A reserve id is valid unless its bits equal the all-ones value ~(size_t)0.
+__attribute__((overloadable, always_inline)) bool
+is_valid_reserve_id(reserve_id_t rid)
+{
+    return as_ulong(rid) != ~(size_t)0;
+}
+
diff --git a/opencl/src/pipes/wresvnp.cl b/opencl/src/pipes/wresvnp.cl
new file mode 100644
index 00000000..2b4f2fa4
--- /dev/null
+++ b/opencl/src/pipes/wresvnp.cl
@@ -0,0 +1,148 @@
+/*===--------------------------------------------------------------------------
+ * ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "pipes.h"
+// Wave-wide reservation: each active lane asks for n slots of *pidx below lim; one lane reserves the wave total and each lane gets its own start index, or ~(size_t)0 on failure.
+size_t
+__amd_wresvn(volatile __global atomic_size_t *pidx, size_t lim, size_t n)
+{
+    uint alc = (size_t)(__llvm_ctpop_i32(__llvm_amdgcn_read_exec_lo()) +
+                        __llvm_ctpop_i32(__llvm_amdgcn_read_exec_hi()));
+    uint l = __llvm_amdgcn_mbcnt_hi(-1, __llvm_amdgcn_mbcnt_lo(-1, 0u));
+    size_t rid;
+    // alc >= 1 because the calling lane is active. The mask uses a right shift: OpenCL C reduces shift counts modulo 64, so (1UL << 64) - 1 was 0 and a full wave never matched.
+    if (__llvm_amdgcn_read_exec() == (~0UL >> (64u - alc))) {
+        // Handle fully active subgroup (EXEC is exactly the low alc lanes, including all 64)
+        uint sum = sub_group_scan_inclusive_add((uint)n);
+        size_t idx = 0;
+        if (l == alc-1) {
+            idx = reserve(pidx, lim, (size_t)sum);
+        }
+        idx = sub_group_broadcast(idx, alc-1);
+        rid = idx + (size_t)(sum - (uint)n);
+        rid = idx != ~(size_t)0 ? rid : idx;
+    } else {
+        // Inclusive add scan with not all lanes active
+        const ulong nomsb = 0x7fffffffffffffffUL;
+        // slid becomes -1 once smask is empty; the (0x1UL << slid) & nomsb terms depend on the modulo-64 shift to yield 0 then
+        // Step 1
+        ulong smask = __llvm_amdgcn_read_exec() & ((0x1UL << l) - 0x1UL);
+        int slid = 63 - (int)clz(smask);
+        uint t = __llvm_amdgcn_ds_bpermute(slid << 2, n);
+        uint sum = n + (slid < 0 ? 0 : t);
+        smask ^= (0x1UL << slid) & nomsb;
+
+        // Step 2
+        slid = 63 - (int)clz(smask);
+        t = __llvm_amdgcn_ds_bpermute(slid << 2, sum);
+        sum += slid < 0 ? 0 : t;
+
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+
+        // Step 3
+        slid = 63 - (int)clz(smask);
+        t = __llvm_amdgcn_ds_bpermute(slid << 2, sum);
+        sum += slid < 0 ? 0 : t;
+
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+
+        // Step 4
+        slid = 63 - (int)clz(smask);
+        t = __llvm_amdgcn_ds_bpermute(slid << 2, sum);
+        sum += slid < 0 ? 0 : t;
+
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+
+        // Step 5
+        slid = 63 - (int)clz(smask);
+        t = __llvm_amdgcn_ds_bpermute(slid << 2, sum);
+        sum += slid < 0 ? 0 : t;
+
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+
+        // Step 6
+        slid = 63 - (int)clz(smask);
+        t = __llvm_amdgcn_ds_bpermute(slid << 2, sum);
+        sum += slid < 0 ? 0 : t;
+        __llvm_amdgcn_wave_barrier();
+        // The highest active lane reserves with its own sum, which is the wave total if the scan above is complete
+        size_t idx = 0;
+        if (l == 63 - (int)clz(__llvm_amdgcn_read_exec())) {
+            idx = reserve(pidx, lim, (size_t)sum);
+        }
+        __llvm_amdgcn_wave_barrier();
+
+        // Broadcast
+        uint k = 63u - (uint)clz(__llvm_amdgcn_read_exec());
+        idx = ((size_t)__llvm_amdgcn_readlane((uint)(idx >> 32), k) << 32) |
+              (size_t)__llvm_amdgcn_readlane((uint)idx, k);
+        __llvm_amdgcn_wave_barrier();
+        // sum - n is this lane's exclusive prefix, i.e. its offset inside the block reserved for the wave
+        rid = idx + (size_t)(sum - (uint)n);
+        rid = idx != ~(size_t)0 ? rid : idx;
+    }
+
+    if (rid == ~(size_t)0) {
+        // Try again one at a time
+        rid = reserve(pidx, lim, n);
+    }
+
+    return rid;
+}
+
diff --git a/opencl/src/pipes/writep.cl b/opencl/src/pipes/writep.cl
new file mode 100644
index 00000000..e07026cd
--- /dev/null
+++ b/opencl/src/pipes/writep.cl
@@ -0,0 +1,65 @@
+/*===--------------------------------------------------------------------------
+ * ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "pipes.h"
+
+#define ATTR __attribute__((always_inline))
+// Unreserved write of one SIZE-byte packet from *ptr; returns 0 on success, -1 when no slot could be reserved.
+#define WRITE_PIPE_SIZE(SIZE, STYPE) \
+ATTR int \
+__write_pipe_2_##SIZE(__global struct pipeimp* p, const STYPE* ptr) \
+{ \
+    size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+    size_t ei = p->end_idx; \
+    size_t wi = wave_reserve_1(&p->write_idx, ri+ei); \
+    if (wi == ~(size_t)0) \
+        return -1; \
+ \
+    size_t pi = wrap(wi, ei); \
+    ((__global STYPE *)p->packets)[pi] = *ptr; \
+    return 0; \
+}
+
+DO_PIPE_SIZE(WRITE_PIPE_SIZE)
+// Unreserved write of one packet of size bytes from ptr; returns 0 on success, -1 when no slot could be reserved.
+ATTR int
+__write_pipe_2(__global struct pipeimp* p, const void* ptr, uint size, uint align)
+{
+    size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device);
+    size_t ei = p->end_idx;
+    size_t wi = wave_reserve_1(&p->write_idx, ri+ei);
+    if (wi == ~(size_t)0)
+        return -1;
+    // write_idx may run at most end_idx ahead of the read_idx snapshot; the slot is the reserved index modulo end_idx
+    size_t pi = wrap(wi, ei);
+    __memcpy_internal_aligned(p->packets + pi*size, ptr, size, align);
+
+    return 0;
+}
+// Reserved write: stores *ptr as the SIZE-byte packet at slot wrap(rid + i, end_idx); always returns 0.
+#define WRITE_PIPE_RESERVED_SIZE(SIZE, STYPE) \
+ATTR int \
+__write_pipe_4_##SIZE(__global struct pipeimp* p, size_t rid, uint i, const STYPE* ptr) \
+{ \
+    rid += i; \
+    size_t pi = wrap(rid, p->end_idx); \
+    ((__global STYPE *)p->packets)[pi] = *ptr; \
+    return 0; \
+}
+
+DO_PIPE_SIZE(WRITE_PIPE_RESERVED_SIZE)
+// Reserved write: copies size bytes from ptr to slot wrap(rid + i, end_idx); always returns 0 and does not validate rid or i.
+ATTR int
+__write_pipe_4(__global struct pipeimp* p, size_t rid, uint i, const void *ptr, uint size, uint align)
+{
+    rid += i;
+    size_t pi = wrap(rid, p->end_idx);
+    __memcpy_internal_aligned(p->packets + pi*size, ptr, size, align);
+
+    return 0;
+}
+