diff --git a/src/gpu/helper_functions.cu b/src/gpu/helper_functions.cu index 9fbaa1b83..733f1d059 100644 --- a/src/gpu/helper_functions.cu +++ b/src/gpu/helper_functions.cu @@ -96,11 +96,35 @@ void gpuCopy_todevice_realw(void** d_array_addr_ptr,realw* h_array,int size){ #endif } - +/* ----------------------------------------------------------------------------------------------- */ +// GPU reset /* ----------------------------------------------------------------------------------------------- */ -// GPU synchronization +void gpuReset() { + // releases previous contexts + + // cuda version +#ifdef USE_CUDA + if (run_cuda) { +#if CUDA_VERSION < 4000 || (defined (__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ < 4)) + cudaThreadExit(); +#else + cudaDeviceReset(); +#endif + } +#endif + // hip version +#ifdef USE_HIP + if (run_hip) { + hipDeviceReset(); + } +#endif +} + + +/* ----------------------------------------------------------------------------------------------- */ +// GPU synchronization /* ----------------------------------------------------------------------------------------------- */ void gpuSynchronize() { diff --git a/src/gpu/initialize_cuda.cu b/src/gpu/initialize_gpu.cu similarity index 55% rename from src/gpu/initialize_cuda.cu rename to src/gpu/initialize_gpu.cu index 8b315f3b7..4c049b829 100644 --- a/src/gpu/initialize_cuda.cu +++ b/src/gpu/initialize_gpu.cu @@ -29,6 +29,14 @@ #include "mesh_constants_gpu.h" +// gpu runtime flags +int run_cuda = 0; +int run_hip = 0; + +/* ----------------------------------------------------------------------------------------------- */ +// CUDA initialization +/* ----------------------------------------------------------------------------------------------- */ + // CUDA version output #ifdef USE_CUDA @@ -47,20 +55,9 @@ #pragma message ("\n\nCompiling for CUDA version < 4.0\n") #endif -#endif - -// gpu runtime flags -int run_cuda = 0; -/* 
----------------------------------------------------------------------------------------------- */ +void initialize_cuda_device(int* myrank_f,int* ncuda_devices) { -// GPU initialization - -/* ----------------------------------------------------------------------------------------------- */ - -extern EXTERN_LANG -void FC_FUNC_(initialize_cuda_device, - INITIALIZE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) { TRACE("initialize_cuda_device"); int device; @@ -131,8 +128,10 @@ void FC_FUNC_(initialize_cuda_device, exit_on_error("CUDA runtime error: cudaGetDeviceCount failed\n\nplease check if driver and runtime libraries work together\nor on cluster environments enable MPS (Multi-Process Service) to use single GPU with multiple MPI processes\n\nexiting...\n"); } - // returns device count to fortran + // checks if CUDA devices available if (device_count == 0) exit_on_error("CUDA runtime error: there is no device supporting CUDA\n"); + + // returns device count to fortran *ncuda_devices = device_count; // Sets the active device @@ -142,29 +141,25 @@ void FC_FUNC_(initialize_cuda_device, // "setting the device when a process is active is not allowed" // releases previous contexts -#if CUDA_VERSION < 4000 - cudaThreadExit(); -#else - cudaDeviceReset(); -#endif + gpuReset(); //printf("rank %d: cuda device count = %d sets device = %d \n",myrank,device_count,myrank % device_count); //MPI_Barrier(MPI_COMM_WORLD); // sets active device -#ifdef CUDA_DEVICE_ID - // uses fixed device id when compile with e.g.: -DCUDA_DEVICE_ID=1 - device = CUDA_DEVICE_ID; - if (myrank == 0) printf("setting cuda devices with id = %d for all processes by -DCUDA_DEVICE_ID\n\n",device); +#ifdef GPU_DEVICE_ID + // uses fixed device id when compile with e.g.: -DGPU_DEVICE_ID=1 + device = GPU_DEVICE_ID; + if (myrank == 0) printf("setting CUDA devices with id = %d for all processes by -DGPU_DEVICE_ID\n\n",device); cudaSetDevice( device ); exit_on_gpu_error("cudaSetDevice has invalid device"); // double 
check that device was properly selected cudaGetDevice(&device); - if (device != CUDA_DEVICE_ID ){ - printf("error rank: %d devices: %d \n",myrank,device_count); - printf(" cudaSetDevice()=%d\n cudaGetDevice()=%d\n",CUDA_DEVICE_ID,device); + if (device != GPU_DEVICE_ID ){ + printf("Error rank: %d devices: %d \n",myrank,device_count); + printf(" cudaSetDevice()=%d\n cudaGetDevice()=%d\n",GPU_DEVICE_ID,device); exit_on_error("CUDA set/get device error: device id conflict \n"); } #else @@ -178,7 +173,7 @@ void FC_FUNC_(initialize_cuda_device, // double check that device was properly selected cudaGetDevice(&device); if (device != (myrank % device_count) ){ - printf("error rank: %d devices: %d \n",myrank,device_count); + printf("Error rank: %d devices: %d \n",myrank,device_count); printf(" cudaSetDevice()=%d\n cudaGetDevice()=%d\n",myrank%device_count,device); exit_on_error("CUDA set/get device error: device id conflict \n"); } @@ -311,5 +306,256 @@ void FC_FUNC_(initialize_cuda_device, } } #endif +} +#endif // USE_CUDA + + + +/* ----------------------------------------------------------------------------------------------- */ +// HIP initialization +/* ----------------------------------------------------------------------------------------------- */ + +#ifdef USE_HIP + +void initialize_hip_device(int* myrank_f,int* ncuda_devices) { + + TRACE("initialize_hip_device"); + + int device; + int device_count; + + // Gets rank number of MPI process + int myrank = *myrank_f; + + // first HIP call + // + // explicit initialization + // (not necessary, most HIP APIs implicitly initialize the HIP runtime) + //hipError_t status = hipInit(0); + //if (status != hipSuccess) exit_on_error("HIP initialization failed\n"); + // + // gets number of devices + device_count = 0; + hipGetDeviceCount(&device_count); + hipError_t err = hipGetLastError(); + + // adds quick check on versions + int driverVersion = 0, runtimeVersion = 0; + hipDriverGetVersion(&driverVersion); + 
hipRuntimeGetVersion(&runtimeVersion);
+
+  // exit in case first HIP call failed
+  if (err != hipSuccess){
+    fprintf (stderr,"Error after hipGetDeviceCount: %s\n", hipGetErrorString(err));
+    fprintf (stderr,"HIP Device count: %d\n",device_count);
+    fprintf (stderr,"HIP Driver Version / Runtime Version: %d.%d / %d.%d\n",
+             driverVersion / 1000, (driverVersion % 100) / 10,
+             runtimeVersion / 1000, (runtimeVersion % 100) / 10);
+
+    exit_on_error("HIP runtime error: hipGetDeviceCount failed\n\nPlease check if any HIP devices are available\n\nexiting...\n");
+  }
+  // checks if HIP devices available
+  if (device_count == 0) exit_on_error("HIP runtime error: no HIP devices available\n");
+
+  // returns device count to fortran
+  *ncuda_devices = device_count;
+
+  // Sets the active device
+  if (device_count >= 1) {
+    // generalized for more GPUs per node
+    // note: without previous context release, hipSetDevice will complain with the cuda error
+    //       "setting the device when a process is active is not allowed"
+
+    // releases previous contexts
+    // note: uses the common gpuReset() helper, as initialize_cuda_device() does. the former
+    //       CUDA_VERSION switch called hipDeviceReset() in both branches and was a leftover from the CUDA code.
+    //       gpuReset() calls hipDeviceReset() only if run_hip is set; initialize_gpu_device() sets it
+    //       before calling this routine.
+    gpuReset();
+
+    //printf("rank %d: cuda device count = %d sets device = %d \n",myrank,device_count,myrank % device_count);
+    //MPI_Barrier(MPI_COMM_WORLD);
+
+    // sets active device
+#ifdef GPU_DEVICE_ID
+    // uses fixed device id when compile with e.g.: -DGPU_DEVICE_ID=1
+    device = GPU_DEVICE_ID;
+    if (myrank == 0) printf("setting HIP devices with id = %d for all processes by -DGPU_DEVICE_ID\n\n",device);
+
+    hipSetDevice( device );
+    exit_on_gpu_error("hipSetDevice has invalid device");
+
+    // double check that device was properly selected
+    hipGetDevice(&device);
+    if (device != GPU_DEVICE_ID ){
+       printf("Error rank: %d devices: %d \n",myrank,device_count);
+       printf(" hipSetDevice()=%d\n hipGetDevice()=%d\n",GPU_DEVICE_ID,device);
+       exit_on_error("HIP set/get device error: device id conflict \n");
+    }
+#else
+    // device changes for different mpi processes according 
to number of device per node
+    // (assumes that number of devices per node is the same for different compute nodes)
+    device = myrank % device_count;
+
+    hipSetDevice( device );
+    exit_on_gpu_error("hipSetDevice has invalid device");
+
+    // double check that device was properly selected
+    hipGetDevice(&device);
+    if (device != (myrank % device_count) ){
+       printf("Error rank: %d devices: %d \n",myrank,device_count);
+       printf(" hipSetDevice()=%d\n hipGetDevice()=%d\n",myrank%device_count,device);
+       exit_on_error("HIP set/get device error: device id conflict \n");
+    }
+#endif
+  }
+
+  // returns a handle to the active device
+  hipGetDevice(&device);
+  exit_on_gpu_error("hipGetDevice failed");
+
+  // get device properties (error message names the API call that was checked)
+  struct hipDeviceProp_t deviceProp;
+  hipGetDeviceProperties(&deviceProp,device);
+  exit_on_gpu_error("hipGetDeviceProperties failed");
+
+  // memory usage
+  double free_db,used_db,total_db;
+  get_free_memory(&free_db,&used_db,&total_db);
+
+  // outputs device infos to file
+  char filename[BUFSIZ];
+  FILE* fp;
+  int do_output_info;
+
+  // by default, only master process outputs device infos to avoid file cluttering
+  do_output_info = 0;
+  if (myrank == 0){
+    do_output_info = 1;
+    sprintf(filename,OUTPUT_FILES"/gpu_device_info.txt");
+  }
+  // debugging
+  if (DEBUG){
+    do_output_info = 1;
+    sprintf(filename,OUTPUT_FILES"/gpu_device_info_proc_%06d.txt",myrank);
+  }
+
+  // output to file
+  if (do_output_info ){
+    fp = fopen(filename,"w");
+    if (fp != NULL){
+      // display device properties
+      fprintf (fp, "Device Name = %s\n", deviceProp.name);
+      fprintf (fp, "memory:\n");
+      fprintf (fp, " totalGlobalMem (in MB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f));
+      fprintf (fp, " totalGlobalMem (in GB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f * 1024.f));
+      fprintf (fp, " totalConstMem (in bytes): %lu\n",(unsigned long) deviceProp.totalConstMem); // seems to be same as GlobalMem
+      //fprintf (fp, " Maximum 1D 
texture size (in bytes): %lu\n",(unsigned long) deviceProp.maxTexture1D); // not available? + fprintf (fp, " sharedMemPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.sharedMemPerBlock); + fprintf (fp, " regsPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.regsPerBlock); + fprintf (fp, "blocks:\n"); + fprintf (fp, " Maximum number of threads per block: %d\n",deviceProp.maxThreadsPerBlock); + fprintf (fp, " Maximum size of each dimension of a block: %d x %d x %d\n", + deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]); + fprintf (fp, " Maximum sizes of each dimension of a grid: %d x %d x %d\n", + deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]); + fprintf (fp, "features:\n"); + fprintf (fp, " Compute capability of the device = %d.%d\n", deviceProp.major, deviceProp.minor); + fprintf (fp, " multiProcessorCount: %d\n",deviceProp.multiProcessorCount); + if (deviceProp.canMapHostMemory){ + fprintf (fp, " canMapHostMemory: TRUE\n"); + }else{ + fprintf (fp, " canMapHostMemory: FALSE\n"); + } + if (deviceProp.concurrentKernels){ + fprintf (fp, " concurrentKernels: TRUE\n"); + }else{ + fprintf (fp, " concurrentKernels: FALSE\n"); + } + + fprintf(fp,"HIP Device count: %d\n",device_count); + fprintf(fp,"HIP Driver Version / Runtime Version %d.%d / %d.%d\n", + driverVersion / 1000, (driverVersion % 100) / 10, + runtimeVersion / 1000, (runtimeVersion % 100) / 10); + + // outputs initial memory infos via hipMemGetInfo() + fprintf(fp,"memory usage:\n"); + fprintf(fp," rank %d: GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n",myrank, + used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0); + + // closes output file + fclose(fp); + } + } + + /* daniel todo: check in case this applies... + // we use pinned memory for asynchronous copy + if (! 
deviceProp.canMapHostMemory){
+    fprintf(stderr,"Device capability should allow to map host memory, exiting...\n");
+    exit_on_error("CUDA Device capability canMapHostMemory should be TRUE\n");
+  }
+  */
+
+  // checks kernel optimization setting
+#ifdef USE_LAUNCH_BOUNDS
+  // see: mesh_constants_cuda.h
+  // performance statistics: main kernel Kernel_2_**_impl():
+  //       shared memory per block = 6200    for Kepler: total = 49152 -> limits active blocks to 7
+  //       registers per thread    = 72                                   (limited by LAUNCH_MIN_BLOCKS 7)
+  //       registers per block     = 9216                total = 65536    (limited by LAUNCH_MIN_BLOCKS 7)
+
+  // shared memory
+  if (deviceProp.sharedMemPerBlock > 49152 && LAUNCH_MIN_BLOCKS <= 7){
+    if (myrank == 0){
+      printf("GPU non-optimal settings: your setting of using LAUNCH_MIN_BLOCK %i is too low and limits the register usage\n",
+             LAUNCH_MIN_BLOCKS);
+    }
+  }
+
+  // registers
+  if (deviceProp.regsPerBlock > 65536 && LAUNCH_MIN_BLOCKS <= 7){
+    if (myrank == 0){
+      printf("GPU non-optimal settings: your setting of using LAUNCH_MIN_BLOCK %i is too low and limits the register usage\n",
+             LAUNCH_MIN_BLOCKS);
+    }
+  }
+#endif
+
+}
+#endif // USE_HIP
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// GPU initialization: Fortran-callable entry point, sets the run_cuda/run_hip flag and calls the matching device setup
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern EXTERN_LANG
+void FC_FUNC_(initialize_gpu_device,
+              INITIALIZE_GPU_DEVICE)(int* myrank_f,int* ncuda_devices) {
+
+  TRACE("initialize_gpu_device");
+
+  // check if compiled with both CUDA and HIP support (rank 0 prints the message, all ranks exit)
+#if defined(USE_CUDA) && defined(USE_HIP)
+  if (*myrank_f == 0) {
+    printf("Error: GPU version compilation with both USE_CUDA and USE_HIP not supported yet.\nPlease only use one for now...\n\n");
+  }
+  exit(1);
+#endif
+
+  // initializes gpu cards
+#ifdef USE_CUDA
+  run_cuda = 1;
+  if (run_cuda) {
+    initialize_cuda_device(myrank_f, ncuda_devices);
+  }
+#endif
+#ifdef USE_HIP
+  run_hip = 1;
+  if 
(run_hip) { + initialize_hip_device(myrank_f, ncuda_devices); + } +#endif } diff --git a/src/gpu/mesh_constants_gpu.h b/src/gpu/mesh_constants_gpu.h index 763b85e74..ae84a469d 100644 --- a/src/gpu/mesh_constants_gpu.h +++ b/src/gpu/mesh_constants_gpu.h @@ -809,7 +809,8 @@ typedef struct mesh_ { void gpuCopy_todevice_int(void** d_array_addr_ptr,int* h_array,int size); void gpuCopy_todevice_realw(void** d_array_addr_ptr,realw* h_array,int size); -void gpuSynchronize (); +void gpuReset(); +void gpuSynchronize(); void exit_on_gpu_error(const char* kernel_name); void exit_on_error(const char* info); diff --git a/src/gpu/prepare_mesh_constants_cuda.cu b/src/gpu/prepare_mesh_constants_cuda.cu index 78b3d7f90..362d27b0d 100644 --- a/src/gpu/prepare_mesh_constants_cuda.cu +++ b/src/gpu/prepare_mesh_constants_cuda.cu @@ -1721,6 +1721,9 @@ TRACE("prepare_cleanup_device"); if (*NOISE_TOMOGRAPHY == 3) cudaFree(mp->d_sigma_kl); } + // releases previous contexts + gpuReset(); + // mesh pointer - not needed anymore free(mp); } diff --git a/src/gpu/rules.mk b/src/gpu/rules.mk index bbc054a83..d063fee5a 100644 --- a/src/gpu/rules.mk +++ b/src/gpu/rules.mk @@ -51,7 +51,7 @@ gpu_specfem3D_OBJECTS = \ $O/compute_stacey_viscoelastic_cuda.o \ $O/fault_solver_dynamics.o \ $O/helper_functions.o \ - $O/initialize_cuda.o \ + $O/initialize_gpu.o \ $O/noise_tomography_cuda.o \ $O/prepare_mesh_constants_cuda.o \ $O/save_and_compare_cpu_vs_gpu.o \ diff --git a/src/gpu/specfem3D_gpu_cuda_method_stubs.c b/src/gpu/specfem3D_gpu_cuda_method_stubs.c index 4fd030b21..ea946f158 100644 --- a/src/gpu/specfem3D_gpu_cuda_method_stubs.c +++ b/src/gpu/specfem3D_gpu_cuda_method_stubs.c @@ -433,12 +433,12 @@ void FC_FUNC_(pause_for_debug, // -// src/gpu/initialize_cuda.cu +// src/gpu/initialize_gpu.cu // -void FC_FUNC_(initialize_cuda_device, - INITIALIZE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) { - fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA Support. 
To enable GPU support, reconfigure with --with-cuda flag.\n"); +void FC_FUNC_(initialize_gpu_device, + INITIALIZE_GPU_DEVICE)(int* myrank_f,int* ncuda_devices) { + fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA/HIP Support. To enable GPU support, reconfigure with --with-cuda or --with-hip flag.\n"); exit(1); } diff --git a/src/specfem3D/initialize_simulation.F90 b/src/specfem3D/initialize_simulation.F90 index 677542397..8fcf317b6 100644 --- a/src/specfem3D/initialize_simulation.F90 +++ b/src/specfem3D/initialize_simulation.F90 @@ -543,7 +543,7 @@ subroutine initialize_GPU() endif ! initializes GPU and outputs info to files for all processes - call initialize_cuda_device(num_device,ncuda_devices) + call initialize_gpu_device(num_device,ncuda_devices) ! collects min/max of local devices found for statistics call synchronize_all() diff --git a/src/tomography/postprocess_sensitivity_kernels/smooth_sem.F90 b/src/tomography/postprocess_sensitivity_kernels/smooth_sem.F90 index 3a649de84..a3c7ad6e4 100644 --- a/src/tomography/postprocess_sensitivity_kernels/smooth_sem.F90 +++ b/src/tomography/postprocess_sensitivity_kernels/smooth_sem.F90 @@ -204,7 +204,7 @@ program smooth_sem call parse_kernel_names(kernel_names_comma_delimited,kernel_names,nker) kernel_name = kernel_names(1) - if (USE_GPU) call initialize_cuda_device(myrank,ncuda_devices) + if (USE_GPU) call initialize_gpu_device(myrank,ncuda_devices) if (nker > 1) then if (myrank == 0) then diff --git a/utils/GPU_tools/check_hip_device.cpp b/utils/GPU_tools/check_hip_device.cpp index 7a7db0f85..137a8dd72 100644 --- a/utils/GPU_tools/check_hip_device.cpp +++ b/utils/GPU_tools/check_hip_device.cpp @@ -84,7 +84,7 @@ int main(int argc, char* const argv[]) { // first HIP call // // explicit initialization - // not necessary, most HIP APIs implicitly initialize the HIP runtime) + // (not necessary, most HIP APIs implicitly initialize the HIP runtime) //hipError_t status = hipInit(0); //if (status != hipSuccess) 
exit_on_error("HIP initialization failed\n"); // diff --git a/utils/create_specfem3D_gpu_cuda_method_stubs.pl b/utils/create_specfem3D_gpu_cuda_method_stubs.pl index 06c184acb..4f13df220 100755 --- a/utils/create_specfem3D_gpu_cuda_method_stubs.pl +++ b/utils/create_specfem3D_gpu_cuda_method_stubs.pl @@ -116,7 +116,7 @@ END # function declaration if($line =~ /{/){ # function declaration ends - if( $line =~ /INITIALIZE_CUDA_DEVICE/ ){ + if( $line =~ /INITIALIZE_GPU_DEVICE/ ){ # adds warning print IOUT "$line\n$warning\}\n\n"; }else{