updates gpu initialization (adds hip); renames file initialize_cuda.cu to initialize_gpu.cu
cenjinzhiyu · Apr 13, 2021 · cc2d50a · cc2d50a
1 parent 7981920
commit cc2d50a
Show file tree

Hide file tree

Showing 10 changed files with 312 additions and 38 deletions.
diff --git a/src/gpu/helper_functions.cu b/src/gpu/helper_functions.cu
@@ -96,11 +96,35 @@ void gpuCopy_todevice_realw(void** d_array_addr_ptr,realw* h_array,int size){
 #endif
 }
 
-
+/* ----------------------------------------------------------------------------------------------- */
+// GPU reset
 /* ----------------------------------------------------------------------------------------------- */
 
-// GPU synchronization
+// Releases previous GPU contexts for the backend that is active at runtime.
+// Nothing is done unless run_cuda (CUDA build) or run_hip (HIP build) is set.
+void gpuReset() {
+
+#if defined(USE_CUDA)
+  // CUDA backend
+  if (run_cuda) {
+#if CUDA_VERSION >= 4000 && (!defined (__CUDACC_VER_MAJOR__) || (__CUDACC_VER_MAJOR__ >= 4))
+    // current API
+    cudaDeviceReset();
+#else
+    // legacy path: CUDA_VERSION below 4000 or nvcc major version below 4
+    cudaThreadExit();
+#endif
+  }
+#endif
 
+#if defined(USE_HIP)
+  // HIP backend
+  if (run_hip) {
+    hipDeviceReset();
+  }
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+// GPU synchronization
 /* ----------------------------------------------------------------------------------------------- */
 
 void gpuSynchronize() {

diff --git a/src/gpu/initialize_cuda.cu → src/gpu/initialize_gpu.cu b/src/gpu/initialize_cuda.cu → src/gpu/initialize_gpu.cu
@@ -29,6 +29,14 @@
 
 #include "mesh_constants_gpu.h"
 
+// gpu runtime flags
+int run_cuda = 0;
+int run_hip = 0;
+
+/* ----------------------------------------------------------------------------------------------- */
+// CUDA initialization
+/* ----------------------------------------------------------------------------------------------- */
+
 // CUDA version output
 #ifdef USE_CUDA
 
@@ -47,20 +55,9 @@
 #pragma message ("\n\nCompiling for CUDA version < 4.0\n")
 #endif
 
-#endif
-
-// gpu runtime flags
-int run_cuda = 0;
 
-/* ----------------------------------------------------------------------------------------------- */
+void initialize_cuda_device(int* myrank_f,int* ncuda_devices) {
 
-// GPU initialization
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern EXTERN_LANG
-void FC_FUNC_(initialize_cuda_device,
-              INITIALIZE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
   TRACE("initialize_cuda_device");
 
   int device;
@@ -131,8 +128,10 @@ void FC_FUNC_(initialize_cuda_device,
     exit_on_error("CUDA runtime error: cudaGetDeviceCount failed\n\nplease check if driver and runtime libraries work together\nor on cluster environments enable MPS (Multi-Process Service) to use single GPU with multiple MPI processes\n\nexiting...\n");
   }
 
-  // returns device count to fortran
+  // checks if CUDA devices available
   if (device_count == 0) exit_on_error("CUDA runtime error: there is no device supporting CUDA\n");
+
+  // returns device count to fortran
   *ncuda_devices = device_count;
 
   // Sets the active device
@@ -142,29 +141,25 @@ void FC_FUNC_(initialize_cuda_device,
     //         "setting the device when a process is active is not allowed"
 
     // releases previous contexts
-#if CUDA_VERSION < 4000
-    cudaThreadExit();
-#else
-    cudaDeviceReset();
-#endif
+    gpuReset();
 
     //printf("rank %d: cuda device count = %d sets device = %d \n",myrank,device_count,myrank % device_count);
     //MPI_Barrier(MPI_COMM_WORLD);
 
     // sets active device
-#ifdef CUDA_DEVICE_ID
-    // uses fixed device id when compile with e.g.: -DCUDA_DEVICE_ID=1
-    device = CUDA_DEVICE_ID;
-    if (myrank == 0) printf("setting cuda devices with id = %d for all processes by -DCUDA_DEVICE_ID\n\n",device);
+#ifdef GPU_DEVICE_ID
+    // uses fixed device id when compile with e.g.: -DGPU_DEVICE_ID=1
+    device = GPU_DEVICE_ID;
+    if (myrank == 0) printf("setting CUDA devices with id = %d for all processes by -DGPU_DEVICE_ID\n\n",device);
 
     cudaSetDevice( device );
     exit_on_gpu_error("cudaSetDevice has invalid device");
 
     // double check that device was  properly selected
     cudaGetDevice(&device);
-    if (device != CUDA_DEVICE_ID ){
-       printf("error rank: %d devices: %d \n",myrank,device_count);
-       printf("  cudaSetDevice()=%d\n  cudaGetDevice()=%d\n",CUDA_DEVICE_ID,device);
+    if (device != GPU_DEVICE_ID ){
+       printf("Error rank: %d devices: %d \n",myrank,device_count);
+       printf("  cudaSetDevice()=%d\n  cudaGetDevice()=%d\n",GPU_DEVICE_ID,device);
        exit_on_error("CUDA set/get device error: device id conflict \n");
     }
 #else
@@ -178,7 +173,7 @@ void FC_FUNC_(initialize_cuda_device,
     // double check that device was  properly selected
     cudaGetDevice(&device);
     if (device != (myrank % device_count) ){
-       printf("error rank: %d devices: %d \n",myrank,device_count);
+       printf("Error rank: %d devices: %d \n",myrank,device_count);
        printf("  cudaSetDevice()=%d\n  cudaGetDevice()=%d\n",myrank%device_count,device);
        exit_on_error("CUDA set/get device error: device id conflict \n");
     }
@@ -311,5 +306,256 @@ void FC_FUNC_(initialize_cuda_device,
     }
   }
 #endif
+}
+#endif // USE_CUDA
+
+
+
+/* ----------------------------------------------------------------------------------------------- */
+// HIP initialization
+/* ----------------------------------------------------------------------------------------------- */
+
+#ifdef USE_HIP
+
+// Selects the HIP device used by this MPI process and writes a device report file.
+//
+// myrank_f      - (in)  pointer to the MPI rank of the calling process
+// ncuda_devices - (out) receives the device count returned by hipGetDeviceCount
+//
+// The device is either the compile-time GPU_DEVICE_ID (when defined) or
+// myrank % device_count. Failures are reported through exit_on_error() /
+// exit_on_gpu_error().
+void initialize_hip_device(int* myrank_f,int* ncuda_devices) {
+
+  TRACE("initialize_hip_device");
+
+  int device;
+  int device_count;
+
+  // Gets rank number of MPI process
+  int myrank = *myrank_f;
+
+  // first HIP call
+  //
+  // explicit initialization
+  // (not necessary, most HIP APIs implicitly initialize the HIP runtime)
+  //hipError_t status = hipInit(0);
+  //if (status != hipSuccess) exit_on_error("HIP initialization failed\n");
+  //
+  // gets number of devices
+  device_count = 0;
+  hipGetDeviceCount(&device_count);
+  hipError_t err = hipGetLastError();
+
+  // adds quick check on versions
+  // NOTE(review): the major/minor decoding used for printing below follows the CUDA version
+  //               encoding (1000*major + 10*minor); HIP may encode its versions differently - TODO confirm
+  int driverVersion = 0, runtimeVersion = 0;
+  hipDriverGetVersion(&driverVersion);
+  hipRuntimeGetVersion(&runtimeVersion);
+
+  // exit in case first HIP call failed
+  if (err != hipSuccess){
+    fprintf (stderr,"Error after hipGetDeviceCount: %s\n", hipGetErrorString(err));
+    fprintf (stderr,"HIP Device count: %d\n",device_count);
+    fprintf (stderr,"HIP Driver Version / Runtime Version: %d.%d / %d.%d\n",
+                    driverVersion / 1000, (driverVersion % 100) / 10,
+                    runtimeVersion / 1000, (runtimeVersion % 100) / 10);
+
+    exit_on_error("HIP runtime error: hipGetDeviceCount failed\n\nPlease check if any HIP devices are available\n\nexiting...\n");
+  }
 
+  // checks if HIP devices available
+  if (device_count == 0) exit_on_error("HIP runtime error: no HIP devices available\n");
+
+  // returns device count to fortran
+  *ncuda_devices = device_count;
+
+  // Sets the active device
+  if (device_count >= 1) {
+    // generalized for more GPUs per node
+    // note: without previous context release, hipSetDevice will complain with the cuda error
+    //         "setting the device when a process is active is not allowed"
+
+    // releases previous contexts
+    // (HIP has a single reset call; no CUDA toolkit version switch is needed here)
+    hipDeviceReset();
+
+    //printf("rank %d: cuda device count = %d sets device = %d \n",myrank,device_count,myrank % device_count);
+    //MPI_Barrier(MPI_COMM_WORLD);
+
+    // sets active device
+#ifdef GPU_DEVICE_ID
+    // uses fixed device id when compile with e.g.: -DGPU_DEVICE_ID=1
+    device = GPU_DEVICE_ID;
+    if (myrank == 0) printf("setting HIP devices with id = %d for all processes by -DGPU_DEVICE_ID\n\n",device);
+
+    hipSetDevice( device );
+    exit_on_gpu_error("hipSetDevice has invalid device");
+
+    // double check that device was properly selected
+    hipGetDevice(&device);
+    if (device != GPU_DEVICE_ID ){
+       printf("Error rank: %d devices: %d \n",myrank,device_count);
+       printf("  hipSetDevice()=%d\n  hipGetDevice()=%d\n",GPU_DEVICE_ID,device);
+       exit_on_error("HIP set/get device error: device id conflict \n");
+    }
+#else
+    // device changes for different mpi processes according to number of device per node
+    // (assumes that number of devices per node is the same for different compute nodes)
+    device = myrank % device_count;
+
+    hipSetDevice( device );
+    exit_on_gpu_error("hipSetDevice has invalid device");
+
+    // double check that device was properly selected
+    hipGetDevice(&device);
+    if (device != (myrank % device_count) ){
+       printf("Error rank: %d devices: %d \n",myrank,device_count);
+       printf("  hipSetDevice()=%d\n  hipGetDevice()=%d\n",myrank%device_count,device);
+       exit_on_error("HIP set/get device error: device id conflict \n");
+    }
+#endif
+  }
+
+  // returns a handle to the active device
+  hipGetDevice(&device);
+  exit_on_gpu_error("hipGetDevice failed");
+
+  // get device properties
+  struct hipDeviceProp_t deviceProp;
+  hipGetDeviceProperties(&deviceProp,device);
+  exit_on_gpu_error("hipGetDeviceProperties failed");
+
+  // memory usage
+  double free_db,used_db,total_db;
+  get_free_memory(&free_db,&used_db,&total_db);
+
+  // outputs device infos to file
+  char filename[BUFSIZ];
+  FILE* fp;
+  int do_output_info;
+
+  // by default, only master process outputs device infos to avoid file cluttering
+  do_output_info = 0;
+  if (myrank == 0){
+    do_output_info = 1;
+    // bounded write: never writes past the end of filename
+    snprintf(filename,sizeof(filename),OUTPUT_FILES"/gpu_device_info.txt");
+  }
+  // debugging (every rank writes its own file; overrides the rank-0 file name)
+  if (DEBUG){
+    do_output_info = 1;
+    snprintf(filename,sizeof(filename),OUTPUT_FILES"/gpu_device_info_proc_%06d.txt",myrank);
+  }
+
+  // output to file
+  // (best effort: the report is skipped when the file cannot be opened)
+  if (do_output_info ){
+    fp = fopen(filename,"w");
+    if (fp != NULL){
+      // display device properties
+      fprintf (fp, "Device Name = %s\n", deviceProp.name);
+      fprintf (fp, "memory:\n");
+      // sizes converted in double precision to avoid rounding of large byte counts
+      fprintf (fp, "  totalGlobalMem (in MB): %f\n",(double) deviceProp.totalGlobalMem / (1024.0 * 1024.0));
+      fprintf (fp, "  totalGlobalMem (in GB): %f\n",(double) deviceProp.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
+      fprintf (fp, "  totalConstMem (in bytes): %lu\n",(unsigned long) deviceProp.totalConstMem); // seems to be same as GlobalMem
+      //fprintf (fp, "  Maximum 1D texture size (in bytes): %lu\n",(unsigned long) deviceProp.maxTexture1D); // not available?
+      fprintf (fp, "  sharedMemPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.sharedMemPerBlock);
+      fprintf (fp, "  regsPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.regsPerBlock);
+      fprintf (fp, "blocks:\n");
+      fprintf (fp, "  Maximum number of threads per block: %d\n",deviceProp.maxThreadsPerBlock);
+      fprintf (fp, "  Maximum size of each dimension of a block: %d x %d x %d\n",
+                       deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]);
+      fprintf (fp, "  Maximum sizes of each dimension of a grid: %d x %d x %d\n",
+                       deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);
+      fprintf (fp, "features:\n");
+      fprintf (fp, "  Compute capability of the device = %d.%d\n", deviceProp.major, deviceProp.minor);
+      fprintf (fp, "  multiProcessorCount: %d\n",deviceProp.multiProcessorCount);
+      if (deviceProp.canMapHostMemory){
+        fprintf (fp, "  canMapHostMemory: TRUE\n");
+      }else{
+        fprintf (fp, "  canMapHostMemory: FALSE\n");
+      }
+      if (deviceProp.concurrentKernels){
+        fprintf (fp, "  concurrentKernels: TRUE\n");
+      }else{
+        fprintf (fp, "  concurrentKernels: FALSE\n");
+      }
+
+      fprintf(fp,"HIP Device count: %d\n",device_count);
+      fprintf(fp,"HIP Driver Version / Runtime Version          %d.%d / %d.%d\n",
+              driverVersion / 1000, (driverVersion % 100) / 10,
+              runtimeVersion / 1000, (runtimeVersion % 100) / 10);
+
+      // outputs initial memory infos via hipMemGetInfo()
+      fprintf(fp,"memory usage:\n");
+      fprintf(fp,"  rank %d: GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n",myrank,
+              used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
+
+      // closes output file
+      fclose(fp);
+    }
+  }
+
+  /* daniel todo: check in case this applies...
+  // we use pinned memory for asynchronous copy
+  if (! deviceProp.canMapHostMemory){
+    fprintf(stderr,"Device capability should allow to map host memory, exiting...\n");
+    exit_on_error("CUDA Device capability canMapHostMemory should be TRUE\n");
+  }
+  */
+
+  // checks kernel optimization setting
+#ifdef USE_LAUNCH_BOUNDS
+  // see: mesh_constants_cuda.h
+  // performance statistics: main kernel Kernel_2_**_impl():
+  //       shared memory per block = 6200    for Kepler: total = 49152 -> limits active blocks to 7
+  //       registers per thread    = 72                                   (limited by LAUNCH_MIN_BLOCKS 7)
+  //       registers per block     = 9216                total = 65536    (limited by LAUNCH_MIN_BLOCKS 7)
+
+  // shared memory
+  if (deviceProp.sharedMemPerBlock > 49152 && LAUNCH_MIN_BLOCKS <= 7){
+    if (myrank == 0){
+      printf("GPU non-optimal settings: your setting of using LAUNCH_MIN_BLOCK %i is too low and limits the register usage\n",
+             LAUNCH_MIN_BLOCKS);
+    }
+  }
+
+  // registers
+  if (deviceProp.regsPerBlock > 65536 && LAUNCH_MIN_BLOCKS <= 7){
+    if (myrank == 0){
+      printf("GPU non-optimal settings: your setting of using LAUNCH_MIN_BLOCK %i is too low and limits the register usage\n",
+             LAUNCH_MIN_BLOCKS);
+    }
+  }
+#endif
+
+}
+#endif // USE_HIP
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// GPU initialization
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Fortran-callable entry point: initializes the GPU backend selected at compile time.
+//
+// myrank_f      - (in)  pointer to the MPI rank of the calling process
+// ncuda_devices - (out) device count, filled in by the backend initialization routine
+//
+// Sets run_cuda / run_hip for the compiled backend before calling its initialization
+// routine. Builds that define both USE_CUDA and USE_HIP are rejected at runtime with exit(1).
+extern EXTERN_LANG
+void FC_FUNC_(initialize_gpu_device,
+              INITIALIZE_GPU_DEVICE)(int* myrank_f,int* ncuda_devices) {
+
+  TRACE("initialize_gpu_device");
+
+  // check if compiled with both CUDA and HIP support
+#if defined(USE_CUDA) && defined(USE_HIP)
+  // only the master process prints the message; all processes exit
+  if (*myrank_f == 0) {
+    printf("Error: GPU version compilation with both USE_CUDA and USE_HIP not supported yet.\nPlease only use one for now...\n\n");
+  }
+  exit(1);
+#endif
+
+  // initializes gpu cards
+#ifdef USE_CUDA
+  run_cuda = 1;
+  if (run_cuda) {
+    initialize_cuda_device(myrank_f, ncuda_devices);
+  }
+#endif
+#ifdef USE_HIP
+  run_hip = 1;
+  if (run_hip) {
+    initialize_hip_device(myrank_f, ncuda_devices);
+  }
+#endif
 }