Skip to content

Commit

Permalink
updates gpu initialization (adds hip); renames file initialize_cuda.cu to initialize_gpu.cu
Browse files Browse the repository at this point in the history
  • Loading branch information
danielpeter committed Apr 13, 2021
1 parent 7981920 commit cc2d50a
Show file tree
Hide file tree
Showing 10 changed files with 312 additions and 38 deletions.
28 changes: 26 additions & 2 deletions src/gpu/helper_functions.cu
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,35 @@ void gpuCopy_todevice_realw(void** d_array_addr_ptr,realw* h_array,int size){
#endif
}


/* ----------------------------------------------------------------------------------------------- */
// GPU reset
/* ----------------------------------------------------------------------------------------------- */

// GPU reset
//
// Resets the GPU device for whichever backend is both compiled in (USE_CUDA / USE_HIP)
// and enabled at run time (run_cuda / run_hip). Does nothing when neither flag is set.
// The return codes of the reset calls are not checked here.
void gpuReset() {
// releases previous contexts

// cuda version
#ifdef USE_CUDA
if (run_cuda) {
// toolkits older than CUDA 4.0 are handled with cudaThreadExit() instead of cudaDeviceReset()
#if CUDA_VERSION < 4000 || (defined (__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ < 4))
cudaThreadExit();
#else
cudaDeviceReset();
#endif
}
#endif

// hip version
#ifdef USE_HIP
if (run_hip) {
hipDeviceReset();
}
#endif
}


/* ----------------------------------------------------------------------------------------------- */
// GPU synchronization
/* ----------------------------------------------------------------------------------------------- */

void gpuSynchronize() {
Expand Down
298 changes: 272 additions & 26 deletions src/gpu/initialize_cuda.cu → src/gpu/initialize_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,14 @@

#include "mesh_constants_gpu.h"

// gpu runtime flags
//
// Both default to 0 (backend disabled). initialize_gpu_device() sets run_cuda = 1 when
// compiled with USE_CUDA and run_hip = 1 when compiled with USE_HIP; gpuReset() reads
// them to decide which backend's reset call to issue.
int run_cuda = 0;
int run_hip = 0;

/* ----------------------------------------------------------------------------------------------- */
// CUDA initialization
/* ----------------------------------------------------------------------------------------------- */

// CUDA version output
#ifdef USE_CUDA

Expand All @@ -47,20 +55,9 @@
#pragma message ("\n\nCompiling for CUDA version < 4.0\n")
#endif

#endif

// gpu runtime flags
int run_cuda = 0;

/* ----------------------------------------------------------------------------------------------- */
void initialize_cuda_device(int* myrank_f,int* ncuda_devices) {

// GPU initialization

/* ----------------------------------------------------------------------------------------------- */

extern EXTERN_LANG
void FC_FUNC_(initialize_cuda_device,
INITIALIZE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
TRACE("initialize_cuda_device");

int device;
Expand Down Expand Up @@ -131,8 +128,10 @@ void FC_FUNC_(initialize_cuda_device,
exit_on_error("CUDA runtime error: cudaGetDeviceCount failed\n\nplease check if driver and runtime libraries work together\nor on cluster environments enable MPS (Multi-Process Service) to use single GPU with multiple MPI processes\n\nexiting...\n");
}

// returns device count to fortran
// checks if CUDA devices available
if (device_count == 0) exit_on_error("CUDA runtime error: there is no device supporting CUDA\n");

// returns device count to fortran
*ncuda_devices = device_count;

// Sets the active device
Expand All @@ -142,29 +141,25 @@ void FC_FUNC_(initialize_cuda_device,
// "setting the device when a process is active is not allowed"

// releases previous contexts
#if CUDA_VERSION < 4000
cudaThreadExit();
#else
cudaDeviceReset();
#endif
gpuReset();

//printf("rank %d: cuda device count = %d sets device = %d \n",myrank,device_count,myrank % device_count);
//MPI_Barrier(MPI_COMM_WORLD);

// sets active device
#ifdef CUDA_DEVICE_ID
// uses fixed device id when compile with e.g.: -DCUDA_DEVICE_ID=1
device = CUDA_DEVICE_ID;
if (myrank == 0) printf("setting cuda devices with id = %d for all processes by -DCUDA_DEVICE_ID\n\n",device);
#ifdef GPU_DEVICE_ID
// uses fixed device id when compile with e.g.: -DGPU_DEVICE_ID=1
device = GPU_DEVICE_ID;
if (myrank == 0) printf("setting CUDA devices with id = %d for all processes by -DGPU_DEVICE_ID\n\n",device);

cudaSetDevice( device );
exit_on_gpu_error("cudaSetDevice has invalid device");

// double check that device was properly selected
cudaGetDevice(&device);
if (device != CUDA_DEVICE_ID ){
printf("error rank: %d devices: %d \n",myrank,device_count);
printf(" cudaSetDevice()=%d\n cudaGetDevice()=%d\n",CUDA_DEVICE_ID,device);
if (device != GPU_DEVICE_ID ){
printf("Error rank: %d devices: %d \n",myrank,device_count);
printf(" cudaSetDevice()=%d\n cudaGetDevice()=%d\n",GPU_DEVICE_ID,device);
exit_on_error("CUDA set/get device error: device id conflict \n");
}
#else
Expand All @@ -178,7 +173,7 @@ void FC_FUNC_(initialize_cuda_device,
// double check that device was properly selected
cudaGetDevice(&device);
if (device != (myrank % device_count) ){
printf("error rank: %d devices: %d \n",myrank,device_count);
printf("Error rank: %d devices: %d \n",myrank,device_count);
printf(" cudaSetDevice()=%d\n cudaGetDevice()=%d\n",myrank%device_count,device);
exit_on_error("CUDA set/get device error: device id conflict \n");
}
Expand Down Expand Up @@ -311,5 +306,256 @@ void FC_FUNC_(initialize_cuda_device,
}
}
#endif
}
#endif // USE_CUDA



/* ----------------------------------------------------------------------------------------------- */
// HIP initialization
/* ----------------------------------------------------------------------------------------------- */

#ifdef USE_HIP

// Initializes the HIP device used by the calling MPI process.
//
// Steps:
//   1. queries the number of HIP devices; aborts through exit_on_error() if the query
//      fails or if no device is found
//   2. resets the device, then selects one: the fixed GPU_DEVICE_ID when that macro is
//      defined at compile time, otherwise (myrank % device_count)
//   3. re-reads the active device to verify the selection
//   4. writes device properties, version numbers and memory usage to a text file under
//      OUTPUT_FILES (rank 0 only, or every rank when DEBUG is set)
//   5. prints hints about LAUNCH_MIN_BLOCKS when USE_LAUNCH_BOUNDS is defined
//
// parameters:
//   myrank_f      - (in)  pointer to the rank of the calling MPI process
//   ncuda_devices - (out) receives the number of HIP devices found
void initialize_hip_device(int* myrank_f,int* ncuda_devices) {

  TRACE("initialize_hip_device");

  int device;
  int device_count;

  // Gets rank number of MPI process
  int myrank = *myrank_f;

  // first HIP call
  //
  // explicit initialization
  // (not necessary, most HIP APIs implicitly initialize the HIP runtime)
  //hipError_t status = hipInit(0);
  //if (status != hipSuccess) exit_on_error("HIP initialization failed\n");
  //
  // gets number of devices
  device_count = 0;
  hipGetDeviceCount(&device_count);
  hipError_t err = hipGetLastError();

  // adds quick check on versions
  // NOTE(review): the major/minor decoding below assumes a (1000 * major + 10 * minor)
  //               encoding - confirm that this matches what the HIP runtime returns
  int driverVersion = 0, runtimeVersion = 0;
  hipDriverGetVersion(&driverVersion);
  hipRuntimeGetVersion(&runtimeVersion);

  // exit in case first HIP call failed
  if (err != hipSuccess){
    fprintf (stderr,"Error after hipGetDeviceCount: %s\n", hipGetErrorString(err));
    fprintf (stderr,"HIP Device count: %d\n",device_count);
    fprintf (stderr,"HIP Driver Version / Runtime Version: %d.%d / %d.%d\n",
             driverVersion / 1000, (driverVersion % 100) / 10,
             runtimeVersion / 1000, (runtimeVersion % 100) / 10);

    exit_on_error("HIP runtime error: hipGetDeviceCount failed\n\nPlease check if any HIP devices are available\n\nexiting...\n");
  }

  // checks if HIP devices available
  if (device_count == 0) exit_on_error("HIP runtime error: no HIP devices available\n");

  // returns device count to fortran
  *ncuda_devices = device_count;

  // Sets the active device
  if (device_count >= 1) {
    // generalized for more GPUs per node
    // note: without previous context release, hipSetDevice will complain with the cuda error
    //       "setting the device when a process is active is not allowed"

    // releases previous contexts
    // (no CUDA_VERSION distinction needed here: HIP has a single reset call)
    hipDeviceReset();

    //printf("rank %d: cuda device count = %d sets device = %d \n",myrank,device_count,myrank % device_count);
    //MPI_Barrier(MPI_COMM_WORLD);

    // sets active device
#ifdef GPU_DEVICE_ID
    // uses fixed device id when compile with e.g.: -DGPU_DEVICE_ID=1
    device = GPU_DEVICE_ID;
    if (myrank == 0) printf("setting HIP devices with id = %d for all processes by -DGPU_DEVICE_ID\n\n",device);

    hipSetDevice( device );
    exit_on_gpu_error("hipSetDevice has invalid device");

    // double check that device was properly selected
    hipGetDevice(&device);
    if (device != GPU_DEVICE_ID ){
      printf("Error rank: %d devices: %d \n",myrank,device_count);
      printf("  hipSetDevice()=%d\n  hipGetDevice()=%d\n",GPU_DEVICE_ID,device);
      exit_on_error("HIP set/get device error: device id conflict \n");
    }
#else
    // device changes for different mpi processes according to number of device per node
    // (assumes that number of devices per node is the same for different compute nodes)
    device = myrank % device_count;

    hipSetDevice( device );
    exit_on_gpu_error("hipSetDevice has invalid device");

    // double check that device was properly selected
    hipGetDevice(&device);
    if (device != (myrank % device_count) ){
      printf("Error rank: %d devices: %d \n",myrank,device_count);
      printf("  hipSetDevice()=%d\n  hipGetDevice()=%d\n",myrank%device_count,device);
      exit_on_error("HIP set/get device error: device id conflict \n");
    }
#endif
  }

  // returns a handle to the active device
  hipGetDevice(&device);
  exit_on_gpu_error("hipGetDevice failed");

  // get device properties
  struct hipDeviceProp_t deviceProp;
  hipGetDeviceProperties(&deviceProp,device);
  exit_on_gpu_error("hipGetDeviceProperties failed");

  // memory usage
  double free_db,used_db,total_db;
  get_free_memory(&free_db,&used_db,&total_db);

  // outputs device infos to file
  char filename[BUFSIZ];
  FILE* fp;
  int do_output_info;

  // by default, only master process outputs device infos to avoid file cluttering
  // (snprintf bounds the write to the size of the filename buffer)
  do_output_info = 0;
  if (myrank == 0){
    do_output_info = 1;
    snprintf(filename,sizeof(filename),OUTPUT_FILES"/gpu_device_info.txt");
  }
  // debugging: every process writes its own file, overriding the rank 0 file name
  if (DEBUG){
    do_output_info = 1;
    snprintf(filename,sizeof(filename),OUTPUT_FILES"/gpu_device_info_proc_%06d.txt",myrank);
  }

  // output to file
  // (if the file cannot be opened, the output is silently skipped)
  if (do_output_info ){
    fp = fopen(filename,"w");
    if (fp != NULL){
      // display device properties
      fprintf (fp, "Device Name = %s\n", deviceProp.name);
      fprintf (fp, "memory:\n");
      fprintf (fp, "  totalGlobalMem (in MB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f));
      fprintf (fp, "  totalGlobalMem (in GB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f * 1024.f));
      fprintf (fp, "  totalConstMem (in bytes): %lu\n",(unsigned long) deviceProp.totalConstMem); // seems to be same as GlobalMem
      //fprintf (fp, "  Maximum 1D texture size (in bytes): %lu\n",(unsigned long) deviceProp.maxTexture1D); // not available?
      fprintf (fp, "  sharedMemPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.sharedMemPerBlock);
      fprintf (fp, "  regsPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.regsPerBlock);
      fprintf (fp, "blocks:\n");
      fprintf (fp, "  Maximum number of threads per block: %d\n",deviceProp.maxThreadsPerBlock);
      fprintf (fp, "  Maximum size of each dimension of a block: %d x %d x %d\n",
               deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]);
      fprintf (fp, "  Maximum sizes of each dimension of a grid: %d x %d x %d\n",
               deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);
      fprintf (fp, "features:\n");
      fprintf (fp, "  Compute capability of the device = %d.%d\n", deviceProp.major, deviceProp.minor);
      fprintf (fp, "  multiProcessorCount: %d\n",deviceProp.multiProcessorCount);
      if (deviceProp.canMapHostMemory){
        fprintf (fp, "  canMapHostMemory: TRUE\n");
      }else{
        fprintf (fp, "  canMapHostMemory: FALSE\n");
      }
      if (deviceProp.concurrentKernels){
        fprintf (fp, "  concurrentKernels: TRUE\n");
      }else{
        fprintf (fp, "  concurrentKernels: FALSE\n");
      }

      fprintf(fp,"HIP Device count: %d\n",device_count);
      fprintf(fp,"HIP Driver Version / Runtime Version          %d.%d / %d.%d\n",
              driverVersion / 1000, (driverVersion % 100) / 10,
              runtimeVersion / 1000, (runtimeVersion % 100) / 10);

      // outputs initial memory infos via hipMemGetInfo()
      fprintf(fp,"memory usage:\n");
      fprintf(fp,"  rank %d: GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n",myrank,
              used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);

      // closes output file
      fclose(fp);
    }
  }

  /* daniel todo: check in case this applies...
  // we use pinned memory for asynchronous copy
  if (! deviceProp.canMapHostMemory){
    fprintf(stderr,"Device capability should allow to map host memory, exiting...\n");
    exit_on_error("CUDA Device capability canMapHostMemory should be TRUE\n");
  }
  */

  // checks kernel optimization setting
#ifdef USE_LAUNCH_BOUNDS
  // see: mesh_constants_cuda.h
  // performance statistics: main kernel Kernel_2_**_impl():
  //       shared memory per block = 6200    for Kepler: total = 49152 -> limits active blocks to 7
  //       registers per thread    = 72                                   (limited by LAUNCH_MIN_BLOCKS 7)
  //       registers per block     = 9216                total = 65536    (limited by LAUNCH_MIN_BLOCKS 7)

  // shared memory
  if (deviceProp.sharedMemPerBlock > 49152 && LAUNCH_MIN_BLOCKS <= 7){
    if (myrank == 0){
      printf("GPU non-optimal settings: your setting of using LAUNCH_MIN_BLOCK %i is too low and limits the register usage\n",
             LAUNCH_MIN_BLOCKS);
    }
  }

  // registers
  if (deviceProp.regsPerBlock > 65536 && LAUNCH_MIN_BLOCKS <= 7){
    if (myrank == 0){
      printf("GPU non-optimal settings: your setting of using LAUNCH_MIN_BLOCK %i is too low and limits the register usage\n",
             LAUNCH_MIN_BLOCKS);
    }
  }
#endif

}
#endif // USE_HIP

/* ----------------------------------------------------------------------------------------------- */

// GPU initialization

/* ----------------------------------------------------------------------------------------------- */

// GPU initialization entry point, exported through the FC_FUNC_ name wrapper.
//
// Enables the backend selected at compile time and delegates to its initializer:
//   USE_CUDA -> sets run_cuda = 1 and calls initialize_cuda_device()
//   USE_HIP  -> sets run_hip = 1  and calls initialize_hip_device()
// A build with both USE_CUDA and USE_HIP defined is rejected at run time: rank 0 prints
// an error message and every process exits with status 1.
//
// parameters:
//   myrank_f      - (in)  pointer to the rank of the calling process
//   ncuda_devices - (out) receives the device count reported by the backend initializer
extern EXTERN_LANG
void FC_FUNC_(initialize_gpu_device,
              INITIALIZE_GPU_DEVICE)(int* myrank_f,int* ncuda_devices) {

  TRACE("initialize_gpu_device");

  // check if compiled with both CUDA and HIP support
#if defined(USE_CUDA) && defined(USE_HIP)
  if (*myrank_f == 0) {
    printf("Error: GPU version compilation with both USE_CUDA and USE_HIP not supported yet.\nPlease only use one for now...\n\n");
  }
  exit(1);
#endif

  // initializes gpu cards
#ifdef USE_CUDA
  run_cuda = 1;
  if (run_cuda) {
    initialize_cuda_device(myrank_f, ncuda_devices);
  }
#endif
#ifdef USE_HIP
  run_hip = 1;
  if (run_hip) {
    initialize_hip_device(myrank_f, ncuda_devices);
  }
#endif
}
Loading

0 comments on commit cc2d50a

Please sign in to comment.