Skip to content

Commit

Permalink
Improve setting launching bounds on preset kernels
Browse files Browse the repository at this point in the history
- Replace metadata if already set (CUDA)
- Add test kernel_preset_bounds
  • Loading branch information
ggeorgakoudis committed Dec 18, 2024
1 parent 7cfa5a7 commit 9858438
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 4 deletions.
25 changes: 21 additions & 4 deletions lib/JitEngineDeviceCUDA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
//===----------------------------------------------------------------------===//

#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
#include <llvm/Linker/Linker.h>
Expand Down Expand Up @@ -118,10 +119,26 @@ void JitEngineDeviceCUDA::setLaunchBoundsForKernel(Module &M, Function &F,
// properties.
// TODO: set min GridSize.
int MaxThreads = std::min(1024, BlockSize);
Metadata *MDVals[] = {ConstantAsMetadata::get(&F),
MDString::get(M.getContext(), "maxntidx"),
ConstantAsMetadata::get(ConstantInt::get(
Type::getInt32Ty(M.getContext()), MaxThreads))};
auto *FuncMetadata = ConstantAsMetadata::get(&F);
auto *MaxntidxMetadata = MDString::get(M.getContext(), "maxntidx");
auto *MaxThreadsMetadata = ConstantAsMetadata::get(
ConstantInt::get(Type::getInt32Ty(M.getContext()), MaxThreads));

// Replace if the metadata exists.
for (auto *MetadataNode : NvvmAnnotations->operands()) {
// Expecting 3 operands ptr, desc, i32 value.
assert(MetadataNode->getNumOperands() == 3);

auto *PtrMetadata = MetadataNode->getOperand(0).get();
auto *DescMetadata = MetadataNode->getOperand(1).get();
if (PtrMetadata == FuncMetadata && MaxntidxMetadata == DescMetadata) {
MetadataNode->replaceOperandWith(2, MaxThreadsMetadata);
return;
}
}

// Otherwise create the metadata and insert.
Metadata *MDVals[] = {FuncMetadata, MaxntidxMetadata, MaxThreadsMetadata};
NvvmAnnotations->addOperand(MDNode::get(M.getContext(), MDVals));
}

Expand Down
1 change: 1 addition & 0 deletions lib/JitEngineDeviceHIP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ void JitEngineDeviceHIP::setLaunchBoundsForKernel(Module &M, Function &F,
// TODO: find maximum (hardcoded 1024) from device info.
// TODO: Setting as 1, BlockSize to replicate launch bounds settings
// Does setting it as BlockSize, BlockSize help?
// Setting the attribute override any previous setting.
F.addFnAttr("amdgpu-flat-work-group-size",
"1," + std::to_string(std::min(1024, BlockSize)));
// TODO: find warp size (hardcoded 64) from device info.
Expand Down
2 changes: 2 additions & 0 deletions tests/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ CREATE_GPU_TEST(types types.cpp)
CREATE_GPU_TEST(kernel_unused_gvar kernel_unused_gvar.cpp kernel_unused_gvar_def.cpp)
CREATE_GPU_TEST(kernel_repeat kernel_repeat.cpp)
CREATE_GPU_TEST(kernel_launch_exception kernel_launch_exception.cpp)
CREATE_GPU_TEST(kernel_preset_bounds kernel_preset_bounds.cpp)

CREATE_GPU_TEST_RDC(kernel kernel.cpp)
CREATE_GPU_TEST_RDC(kernel_cache kernel_cache.cpp)
Expand All @@ -194,6 +195,7 @@ CREATE_GPU_TEST_RDC(types types.cpp)
CREATE_GPU_TEST_RDC(kernel_calls_func kernel_calls_func.cpp device_func.cpp)
CREATE_GPU_TEST_RDC(kernel_repeat kernel_repeat.cpp)
CREATE_GPU_TEST_RDC(kernel_launch_exception kernel_launch_exception.cpp)
CREATE_GPU_TEST_RDC(kernel_preset_bounds kernel_preset_bounds.cpp)

CREATE_GPU_LIBRARY(device_func_lib device_func.cpp)
CREATE_GPU_TEST_RDC_LIBS(kernel_calls_func_lib device_func_lib kernel_calls_func_lib.cpp)
26 changes: 26 additions & 0 deletions tests/gpu/kernel_preset_bounds.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// clang-format off
// RUN: ./kernel_preset_bounds.%ext | FileCheck %s --check-prefixes=CHECK,CHECK-FIRST
// Second run uses the object cache.
// RUN: ./kernel_preset_bounds.%ext | FileCheck %s --check-prefixes=CHECK,CHECK-SECOND
// clang-format on
#include <climits>
#include <cstdio>

#include "gpu_common.h"

__global__ __attribute__((annotate("jit")))
__launch_bounds__(128, 4) void kernel() {
printf("Kernel\n");
}

int main() {
kernel<<<1, 1>>>();
gpuErrCheck(gpuDeviceSynchronize());
return 0;
}

// CHECK: Kernel
// CHECK: JitCache hits 0 total 1
// CHECK: HashValue {{[0-9]+}} NumExecs 1 NumHits 0
// CHECK-FIRST: JitStorageCache hits 0 total 1
// CHECK-SECOND: JitStorageCache hits 1 total 1

0 comments on commit 9858438

Please sign in to comment.