Skip to content

Commit

Permalink
renames gpu helper functions (similar to globe version)
Browse files Browse the repository at this point in the history
  • Loading branch information
danielpeter committed Apr 13, 2021
1 parent ca1ad83 commit 7981920
Show file tree
Hide file tree
Showing 14 changed files with 413 additions and 256 deletions.
10 changes: 5 additions & 5 deletions src/gpu/assemble_MPI_scalar_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ TRACE("transfer_boun_pot_from_device");
//GPU_ERROR_CHECKING("after prepare_boundary_potential_on_device");

// synchronizes
//synchronize_cuda();
//gpuSynchronize();
// explicitly waits until previous compute stream finishes
// (cudaMemcpy implicitly synchronizes all other cuda operations)
cudaStreamSynchronize(mp->compute_stream);
Expand Down Expand Up @@ -122,7 +122,7 @@ TRACE("transfer_asmbl_pot_to_device");

// Cuda timing
//cudaEvent_t start, stop;
//start_timing_cuda(&start,&stop);
//start_timing_gpu(&start,&stop);

// checks if anything to do
if (mp->size_mpi_buffer_potential > 0){
Expand Down Expand Up @@ -151,7 +151,7 @@ TRACE("transfer_asmbl_pot_to_device");
}

// synchronizes
synchronize_cuda();
gpuSynchronize();

// copies buffer onto GPU
print_CUDA_error_if_any(cudaMemcpy(d_send_buffer, buffer_recv_scalar_ext_mesh,
Expand All @@ -165,8 +165,8 @@ TRACE("transfer_asmbl_pot_to_device");
mp->d_nibool_interfaces_ext_mesh,
mp->d_ibool_interfaces_ext_mesh);
}
// Cuda timing
//stop_timing_cuda(&start,&stop,"assemble_boundary_potential_on_device");
// kernel timing
//stop_timing_gpu(&start,&stop,"assemble_boundary_potential_on_device");

GPU_ERROR_CHECKING("transfer_asmbl_pot_to_device");
}
Expand Down
18 changes: 9 additions & 9 deletions src/gpu/assemble_MPI_vector_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ TRACE("\ttransfer_boun_accel_from_device");
}

// Cuda timing
//cudaEvent_t start, stop;
//start_timing_cuda(&start,&stop);
//gpu_event start, stop;
//start_timing_gpu(&start,&stop);

// fills mpi boundary buffer
prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(d_accel,d_send_buffer,
Expand All @@ -83,7 +83,7 @@ TRACE("\ttransfer_boun_accel_from_device");
mp->d_nibool_interfaces_ext_mesh,
mp->d_ibool_interfaces_ext_mesh);
// synchronizes
//synchronize_cuda();
//gpuSynchronize();
// explicitly waits until previous compute stream finishes
// (cudaMemcpy implicitly synchronizes all other cuda operations)
cudaStreamSynchronize(mp->compute_stream);
Expand All @@ -92,9 +92,9 @@ TRACE("\ttransfer_boun_accel_from_device");
print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,d_send_buffer,
mp->size_mpi_buffer*sizeof(realw),cudaMemcpyDeviceToHost),97001);

// Cuda timing
// kernel timing
// finish timing of kernel+memcpy
//stop_timing_cuda(&start,&stop,"prepare_boundary_accel_on_device");
//stop_timing_gpu(&start,&stop,"prepare_boundary_accel_on_device");
}

GPU_ERROR_CHECKING("transfer_boun_accel_from_device");
Expand Down Expand Up @@ -130,7 +130,7 @@ void FC_FUNC_(transfer_boundary_from_device_a,
mp->d_nibool_interfaces_ext_mesh,
mp->d_ibool_interfaces_ext_mesh);
// waits until kernel is finished before starting async memcpy
//synchronize_cuda();
//gpuSynchronize();
// waits until previous compute stream finishes
cudaStreamSynchronize(mp->compute_stream);

Expand Down Expand Up @@ -196,7 +196,7 @@ TRACE("\ttransfer_asmbl_accel_to_device");
else if (*FORWARD_OR_ADJOINT == 3){
// explicitly synchronizes
// (cudaMemcpy implicitly synchronizes all other cuda operations)
synchronize_cuda();
gpuSynchronize();

print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer, buffer_recv_vector_ext_mesh,
mp->size_mpi_buffer*sizeof(realw),cudaMemcpyHostToDevice),97001);
Expand Down Expand Up @@ -283,7 +283,7 @@ TRACE("\ttransfer_sync_accel_to_device");
else if (*FORWARD_OR_ADJOINT == 3){
// explicitly synchronizes
// (cudaMemcpy implicitly synchronizes all other cuda operations)
synchronize_cuda();
gpuSynchronize();

print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer, buffer_recv_vector_ext_mesh,
mp->size_mpi_buffer*sizeof(realw),cudaMemcpyHostToDevice),97001);
Expand Down Expand Up @@ -402,7 +402,7 @@ void FC_FUNC_(sync_copy_from_device,
Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper

// Wait until async-memcpy of outer elements is finished and start MPI.
if (*iphase != 2){ exit_on_cuda_error("sync_copy_from_device must be called for iphase == 2"); }
if (*iphase != 2){ exit_on_gpu_error("sync_copy_from_device must be called for iphase == 2"); }

if (mp->size_mpi_buffer > 0){
// waits for asynchronous copy to finish
Expand Down
6 changes: 3 additions & 3 deletions src/gpu/check_fields_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ realw get_device_array_maximum_value(realw* array, int size){

// explicitly wait for cuda kernels to finish
// (cudaMemcpy implicitly synchronizes all other cuda operations)
synchronize_cuda();
gpuSynchronize();

h_array = (realw*)calloc(size,sizeof(realw));
print_CUDA_error_if_any(cudaMemcpy(h_array,array,sizeof(realw)*size,cudaMemcpyDeviceToHost),33001);
Expand Down Expand Up @@ -276,7 +276,7 @@ void FC_FUNC_(get_norm_acoustic_from_device,
GPU_ERROR_CHECKING("kernel get_maximum_field_kernel");

// synchronizes
//synchronize_cuda();
//gpuSynchronize();
// explicitly waits for stream to finish
// (cudaMemcpy implicitly synchronizes all other cuda operations)
cudaStreamSynchronize(mp->compute_stream);
Expand Down Expand Up @@ -393,7 +393,7 @@ void FC_FUNC_(get_norm_elastic_from_device,
GPU_ERROR_CHECKING("kernel get_norm_elastic_from_device");

// synchronizes
//synchronize_cuda();
//gpuSynchronize();
// explicitly waits for stream to finish
// (cudaMemcpy implicitly synchronizes all other cuda operations)
cudaStreamSynchronize(mp->compute_stream);
Expand Down
4 changes: 2 additions & 2 deletions src/gpu/compute_add_sources_acoustic_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -172,13 +172,13 @@ void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container

// checks
if (*nadj_rec_local != mp->nadj_rec_local) exit_on_cuda_error("add_sources_ac_sim_type_2_or_3: nadj_rec_local not equal\n");
if (*nadj_rec_local != mp->nadj_rec_local) exit_on_gpu_error("add_sources_ac_sim_type_2_or_3: nadj_rec_local not equal\n");

// note: for acoustic simulations with fused wavefields, NB_RUNS_ACOUSTIC_GPU > 1
// and thus the number of adjoint sources might become different in future
// todo: not implemented yet for adjoint/kernel simulation
//if (*nadj_rec_local/NB_RUNS_ACOUSTIC_GPU != mp->nadj_rec_local)
// exit_on_cuda_error("add_sources_ac_sim_type_2_or_3: nadj_rec_local not equal\n");
// exit_on_gpu_error("add_sources_ac_sim_type_2_or_3: nadj_rec_local not equal\n");

// checks if anything to do
if (mp->nadj_rec_local == 0) return;
Expand Down
8 changes: 4 additions & 4 deletions src/gpu/compute_forces_acoustic_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,10 @@ void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase,
// note: for computational efficiency, the FORWARD_OR_ADJOINT variable here can have a special case (== 0)
// to combine forward and backward wavefield in the same kernel call

// Cuda timing
cudaEvent_t start, stop;
// kernel timing
gpu_event start, stop;
if (CUDA_TIMING ){
start_timing_cuda(&start,&stop);
start_timing_gpu(&start,&stop);
}
int nb_field = mp->simulation_type == 3 ? 2 : 1 ;

Expand Down Expand Up @@ -141,7 +141,7 @@ void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase,
// Cuda timing
if (CUDA_TIMING ){
realw flops,time;
stop_timing_cuda(&start,&stop,"Kernel_2_acoustic_impl",&time);
stop_timing_gpu(&start,&stop,"Kernel_2_acoustic_impl",&time);
// time in seconds
time = time / 1000.;
// performance
Expand Down
16 changes: 8 additions & 8 deletions src/gpu/compute_forces_viscoelastic_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,10 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);

// Cuda timing
cudaEvent_t start,stop;
// kernel timing
gpu_event start,stop;
if (CUDA_TIMING ){
start_timing_cuda(&start,&stop);
start_timing_gpu(&start,&stop);
}

// defines local parameters for forward/adjoint function calls
Expand Down Expand Up @@ -563,19 +563,19 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,
// Cuda timing
if (CUDA_TIMING ){
if (ATTENUATION ){
stop_timing_cuda(&start,&stop,"Kernel_2_att_impl");
stop_timing_gpu(&start,&stop,"Kernel_2_att_impl");
}else{
if (ANISOTROPY ){
stop_timing_cuda(&start,&stop,"Kernel_2_noatt_ani_impl");
stop_timing_gpu(&start,&stop,"Kernel_2_noatt_ani_impl");
}else{
if (mp->gravity ){
stop_timing_cuda(&start,&stop,"Kernel_2_noatt_iso_grav_impl");
stop_timing_gpu(&start,&stop,"Kernel_2_noatt_iso_grav_impl");
}else{
if (COMPUTE_AND_STORE_STRAIN ){
stop_timing_cuda(&start,&stop,"Kernel_2_noatt_iso_strain_impl");
stop_timing_gpu(&start,&stop,"Kernel_2_noatt_iso_strain_impl");
}else{
realw time;
stop_timing_cuda(&start,&stop,"Kernel_2_noatt_iso_impl",&time);
stop_timing_gpu(&start,&stop,"Kernel_2_noatt_iso_impl",&time);
// time in seconds
time = time / 1000.;
// performance
Expand Down
72 changes: 41 additions & 31 deletions src/gpu/fault_solver_dynamics.cu
Original file line number Diff line number Diff line change
Expand Up @@ -110,12 +110,17 @@ void FC_FUNC_(initialize_fault_data_gpu,
/* ----------------------------------------------------------------------------------------------- */

// copies realw array from CPU host to GPU device
void copy_todevice_realw_test(void** d_array_addr_ptr,realw* h_array,int size) {
// copies a realw array from CPU host to GPU device.
// - allocates the device buffer and returns its address in *d_array_addr_ptr;
//   the caller owns the allocation and must release it (cudaFree/hipFree).
// - size is the element count, not a byte count.
// note: return codes of the malloc/memcpy calls are not checked here,
//       matching the existing CUDA path; callers rely on GPU_ERROR_CHECKING.
void gpuCopy_todevice_realw_test(void** d_array_addr_ptr,realw* h_array,int size) {

#ifdef USE_CUDA
  // allocates memory on GPU
  cudaMalloc((void**)d_array_addr_ptr,size*sizeof(realw));
  // copies values onto GPU
  cudaMemcpy((realw*) *d_array_addr_ptr,h_array,size*sizeof(realw),cudaMemcpyHostToDevice);
#endif
#ifdef USE_HIP
  // HIP equivalent of the CUDA path above
  // (replaces non-compiling placeholder text left in the HIP branch)
  hipMalloc((void**)d_array_addr_ptr,size*sizeof(realw));
  hipMemcpy((realw*) *d_array_addr_ptr,h_array,size*sizeof(realw),hipMemcpyHostToDevice);
#endif
}

/* ----------------------------------------------------------------------------------------------- */
Expand All @@ -129,12 +134,17 @@ void copy_tohost_realw_test(void** d_array_addr_ptr,realw* h_array,int size) {
/* ----------------------------------------------------------------------------------------------- */

// copies integer array from CPU host to GPU device
void copy_todevice_int_test(void** d_array_addr_ptr,int* h_array,int size) {
// copies an integer array from CPU host to GPU device.
// - allocates the device buffer and returns its address in *d_array_addr_ptr;
//   the caller owns the allocation and must release it (cudaFree/hipFree).
// - size is the element count, not a byte count.
// note: return codes of the malloc/memcpy calls are not checked here,
//       matching the existing CUDA path; callers rely on GPU_ERROR_CHECKING.
void gpuCopy_todevice_int_test(void** d_array_addr_ptr,int* h_array,int size) {

#ifdef USE_CUDA
  // allocates memory on GPU
  cudaMalloc((void**)d_array_addr_ptr,size*sizeof(int));
  // copies values onto GPU
  // (cast fixed from (realw*) to (int*) for type consistency; cudaMemcpy
  //  takes void*, so this was cosmetic but misleading)
  cudaMemcpy((int*) *d_array_addr_ptr,h_array,size*sizeof(int),cudaMemcpyHostToDevice);
#endif
#ifdef USE_HIP
  // HIP equivalent of the CUDA path above
  // (replaces non-compiling placeholder text left in the HIP branch)
  hipMalloc((void**)d_array_addr_ptr,size*sizeof(int));
  hipMemcpy((int*) *d_array_addr_ptr,h_array,size*sizeof(int),hipMemcpyHostToDevice);
#endif
}

/* ----------------------------------------------------------------------------------------------- */
Expand Down Expand Up @@ -188,21 +198,21 @@ void FC_FUNC_(transfer_fault_data_to_device,

// copies data to GPU
if (*NGLOB_FLT > 0){
copy_todevice_realw_test((void **)&(Flt->B),B,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(Flt->R),R,(*NGLOB_FLT)*9);
copy_todevice_realw_test((void **)&(Flt->Z),Z,(*NGLOB_FLT));
gpuCopy_todevice_realw_test((void **)&(Flt->B),B,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(Flt->R),R,(*NGLOB_FLT)*9);
gpuCopy_todevice_realw_test((void **)&(Flt->Z),Z,(*NGLOB_FLT));

copy_todevice_realw_test((void **)&(Flt->D),D,(*NGLOB_FLT)*3);
copy_todevice_realw_test((void **)&(Flt->V),V0,(*NGLOB_FLT)*3);
gpuCopy_todevice_realw_test((void **)&(Flt->D),D,(*NGLOB_FLT)*3);
gpuCopy_todevice_realw_test((void **)&(Flt->V),V0,(*NGLOB_FLT)*3);

copy_todevice_realw_test((void **)&(Flt->T0),T0,(*NGLOB_FLT)*3);
copy_todevice_realw_test((void **)&(Flt->T),T,(*NGLOB_FLT)*3);
gpuCopy_todevice_realw_test((void **)&(Flt->T0),T0,(*NGLOB_FLT)*3);
gpuCopy_todevice_realw_test((void **)&(Flt->T),T,(*NGLOB_FLT)*3);

copy_todevice_realw_test((void **)&(Flt->invM1),invM1,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(Flt->invM2),invM2,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(Flt->invM1),invM1,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(Flt->invM2),invM2,*NGLOB_FLT);

copy_todevice_int_test((void **)&(Flt->ibulk1),ibulk1,(*NGLOB_FLT));
copy_todevice_int_test((void **)&(Flt->ibulk2),ibulk2,(*NGLOB_FLT));
gpuCopy_todevice_int_test((void **)&(Flt->ibulk1),ibulk1,(*NGLOB_FLT));
gpuCopy_todevice_int_test((void **)&(Flt->ibulk2),ibulk2,(*NGLOB_FLT));
}

GPU_ERROR_CHECKING("transfer_fault_data_to_device");
Expand Down Expand Up @@ -310,18 +320,18 @@ void FC_FUNC_(transfer_rsf_data_todevice,

// copies arrays onto GPU
if (*NGLOB_FLT > 0){
copy_todevice_realw_test((void **)&(rsf->V0),V0,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(rsf->f0),f0,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(rsf->V_init),V_init,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(rsf->a),a,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(rsf->b),b,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(rsf->L),L,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(rsf->theta),theta,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(rsf->T),T,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(rsf->Coh),C,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(rsf->fw),fw,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(rsf->Vw),Vw,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(rsf->Fload),Fload,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->V0),V0,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->f0),f0,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->V_init),V_init,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->a),a,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->b),b,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->L),L,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->theta),theta,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->T),T,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->Coh),C,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->fw),fw,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->Vw),Vw,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->Fload),Fload,*NGLOB_FLT);
}

GPU_ERROR_CHECKING("transfer_rsf_data_todevice");
Expand Down Expand Up @@ -350,12 +360,12 @@ void FC_FUNC_(transfer_swf_data_todevice,
if (Fsolver->RATE_AND_STATE){ exit_on_error("Error with SWF setup, RATE_AND_STATE flag is on; please check fault setup and rerun\n");}

if (*NGLOB_FLT > 0){
copy_todevice_realw_test((void **)&(swf->Dc),Dc,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(swf->mus),mus,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(swf->mud),mud,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(swf->Coh),C,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(swf->T),T,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(swf->theta),theta,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(swf->Dc),Dc,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(swf->mus),mus,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(swf->mud),mud,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(swf->Coh),C,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(swf->T),T,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(swf->theta),theta,*NGLOB_FLT);
}

GPU_ERROR_CHECKING("transfer_swf_data_todevice");
Expand Down
Loading

0 comments on commit 7981920

Please sign in to comment.