Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CUDA atomic add benchmark #2

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 127 additions & 0 deletions CUDA/atomic_add.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#include <cstdlib>
#include <cuda.h>
#include <stdint.h>
#include <time.h>

#include <chrono>
using namespace std::chrono;
using nano_double = duration<double, std::nano>;

#ifdef _WIN32
#define EXPORT_API __declspec(dllexport)
#else
#define EXPORT_API
#endif

#define DAT double

#define source(i) _source[i]
#define indices(i, j) _indices[j * n + i]
#define target1(i) _target1[i]
#define target2(i) _target2[i]

__global__ void no_atomic_kernel(DAT *_target1, DAT *_target2, DAT *_source, int *_indices, const int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int i1 = indices(i, 0);
int i2 = indices(i, 1);
int i3 = indices(i, 2);
int i4 = indices(i, 3);
DAT v = source(i);
target1(i1) += v;
target1(i2) += v;
target1(i3) += v;
target1(i4) += v;
target2(i1) += v;
target2(i2) += v;
target2(i3) += v;
target2(i4) += v;
}

__global__ void atomic_kernel(DAT *_target1, DAT *_target2, DAT *_source, int *_indices, const int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int i1 = indices(i, 0);
int i2 = indices(i, 1);
int i3 = indices(i, 2);
int i4 = indices(i, 3);
DAT v = source(i);
atomicAdd(&target1(i1), v);
atomicAdd(&target1(i2), v);
atomicAdd(&target1(i3), v);
atomicAdd(&target1(i4), v);
atomicAdd(&target2(i1), v);
atomicAdd(&target2(i2), v);
atomicAdd(&target2(i3), v);
atomicAdd(&target2(i4), v);
}

extern "C" EXPORT_API void run_benchmark(double *times, const int nsamples) {
int i;
const int n = 1024;
const int bins = 64;
DAT *target1, *target2, *source;
int *indices;

srand((unsigned) time(NULL));

DAT *target1_h = (DAT *)malloc(bins * sizeof(DAT));
for (i = 0; i < bins; i++) {
target1_h[i] = (DAT)0.0;
}
DAT *target2_h = (DAT *)malloc(bins * sizeof(DAT));
for (i = 0; i < bins; i++) {
target2_h[i] = (DAT)0.0;
}
DAT *source_h = (DAT *)malloc(n * sizeof(DAT));
for (i = 0; i < n; i++) {
source_h[i] = static_cast<DAT>(rand()) / static_cast<DAT>(RAND_MAX);
}
int *indices_h = (int *)malloc(n * 4 * sizeof(int));
for (i = 0; i < (n * 4); i++) {
indices_h[i] = std::rand() % bins;
}

cudaMalloc(&target1, bins * sizeof(DAT));
cudaMalloc(&target2, bins * sizeof(DAT));
cudaMalloc(&source, n * sizeof(DAT));
cudaMalloc(&indices, n * 4 * sizeof(int));

cudaMemcpy(target1, target1_h, bins * sizeof(DAT), cudaMemcpyHostToDevice);
cudaMemcpy(target2, target2_h, bins * sizeof(DAT), cudaMemcpyHostToDevice);
cudaMemcpy( source, source_h, n * sizeof(DAT), cudaMemcpyHostToDevice);
cudaMemcpy(indices, indices_h, n * 4 * sizeof(int), cudaMemcpyHostToDevice);

cudaStream_t stream;
cudaStreamCreate(&stream);

dim3 nthreads(256);
dim3 nblocks((n + nthreads.x - 1) / nthreads.x);

// for (int isample = 0; isample < nsamples; ++isample) {
// auto timer = high_resolution_clock::now();
// hipLaunchKernelGGL(no_atomic_kernel, nblocks, nthreads, 0, stream, target1, target2, source, indices, n);
// hipStreamSynchronize(stream);
// auto elapsed = high_resolution_clock::now() - timer;
// auto time_total = duration_cast<nano_double>(elapsed).count();
// times[isample] = time_total;
// }

for (int isample = 0; isample < nsamples; ++isample) {
auto timer = high_resolution_clock::now();
atomic_kernel<<<nblocks, nthreads, 0, stream>>>(target1, target2, source, indices, n);
cudaStreamSynchronize(stream);
auto elapsed = high_resolution_clock::now() - timer;
auto time_total = duration_cast<nano_double>(elapsed).count();
times[isample] = time_total;
}

free(target1_h);
free(target2_h);
free(source_h);
free(indices_h);
cudaFree(target1);
cudaFree(target2);
cudaFree(source);
cudaFree(indices);

cudaStreamDestroy(stream);
}
100 changes: 100 additions & 0 deletions CUDA/atomic_add.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
using CUDA
using KernelAbstractions
using BenchmarkTools
using Libdl

function make_c_trial(nsamples)
c_times = zeros(Float64, nsamples)
c_gctimes = zeros(Float64, nsamples)
c_memory = 0::Int64
c_allocs = 0::Int64
c_params = BenchmarkTools.DEFAULT_PARAMETERS
c_params.samples = nsamples
return BenchmarkTools.Trial(c_params, c_times, c_gctimes, c_memory, c_allocs)
end

INPUTS = Dict()

INPUTS["atomic_add"] = (
c_samples=2000,
)

function cuda_atomic_add!(target1, target2, source, indices)
i = threadIdx().x + (blockIdx().x - 1) * gridDim().x
i1, i2, i3, i4 = indices[i, 1], indices[i, 2], indices[i, 3], indices[i, 4]
v = source[i]
CUDA.@atomic target1[i1] += v
CUDA.@atomic target1[i2] += v
CUDA.@atomic target1[i3] += v
CUDA.@atomic target1[i4] += v
CUDA.@atomic target2[i1] += v
CUDA.@atomic target2[i2] += v
CUDA.@atomic target2[i3] += v
CUDA.@atomic target2[i4] += v
return
end

@kernel function ka_atomic_add!(target1, target2, source, indices)
i = @index(Global, Linear)
i1, i2, i3, i4 = indices[i, 1], indices[i, 2], indices[i, 3], indices[i, 4]
v = source[i]
KernelAbstractions.@atomic target1[i1] += v
KernelAbstractions.@atomic target1[i2] += v
KernelAbstractions.@atomic target1[i3] += v
KernelAbstractions.@atomic target1[i4] += v
KernelAbstractions.@atomic target2[i1] += v
KernelAbstractions.@atomic target2[i2] += v
KernelAbstractions.@atomic target2[i3] += v
KernelAbstractions.@atomic target2[i4] += v
end

function run_julia_benchmarks(::Type{DAT}) where DAT
n, bins = 1024, 64
target1 = CuArray(zeros(DAT, bins))
target2 = CuArray(zeros(DAT, bins))
source = CuArray(rand(DAT, n))
indices = CuArray(rand(1:bins, n, 4))

nthreads = 256
nblocks = cld.(n, nthreads)

bm = @benchmark begin
@cuda threads=$nthreads blocks=$nblocks cuda_atomic_add!($target1, $target2, $source, $indices)
CUDA.synchronize()
end

bm_ka = @benchmark begin
ka_atomic_add!($CUDABackend(), 256, $n)($target1, $target2, $source, $indices)
KernelAbstractions.synchronize(CUDABackend())
end

CUDA.unsafe_free!(source)
CUDA.unsafe_free!(indices)
CUDA.unsafe_free!(target1)
CUDA.unsafe_free!(target2)

return (bm, bm_ka)
end

function run_c_benchmarks(lib, nsamples)
trial = make_c_trial(nsamples)

sym = Libdl.dlsym(lib, :run_benchmark)
@ccall $sym(trial.times::Ptr{Cdouble}, nsamples::Cint)::Cvoid

return trial
end

# Compile C benchmark
libext = Sys.iswindows() ? "dll" : "so"
libname = "atomic_add." * libext
run(`nvcc -O3 -o $libname --shared -Xcompiler -fPIC -arch=sm_60 atomic_add.cu`)

Libdl.dlopen("./$libname") do lib
group_n = BenchmarkGroup()
jb = run_julia_benchmarks(Float32)
group_n["julia"] = jb[1]
group_n["julia-ka"] = jb[2]
group_n["reference"] = run_c_benchmarks(lib, INPUTS["atomic_add"].c_samples)
display(group_n)
end