Skip to content

Commit

Permalink
renames gpu helper functions (similar to globe version)
Browse files Browse the repository at this point in the history
  • Loading branch information
danielpeter committed Apr 13, 2021
1 parent ca1ad83 commit 7981920
Show file tree
Hide file tree
Showing 14 changed files with 413 additions and 256 deletions.
10 changes: 5 additions & 5 deletions src/gpu/assemble_MPI_scalar_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ TRACE("transfer_boun_pot_from_device");
//GPU_ERROR_CHECKING("after prepare_boundary_potential_on_device");

// synchronizes
//synchronize_cuda();
//gpuSynchronize();
// explicitly waits until previous compute stream finishes
// (cudaMemcpy implicitly synchronizes all other cuda operations)
cudaStreamSynchronize(mp->compute_stream);
Expand Down Expand Up @@ -122,7 +122,7 @@ TRACE("transfer_asmbl_pot_to_device");

// Cuda timing
//cudaEvent_t start, stop;
//start_timing_cuda(&start,&stop);
//start_timing_gpu(&start,&stop);

// checks if anything to do
if (mp->size_mpi_buffer_potential > 0){
Expand Down Expand Up @@ -151,7 +151,7 @@ TRACE("transfer_asmbl_pot_to_device");
}

// synchronizes
synchronize_cuda();
gpuSynchronize();

// copies buffer onto GPU
print_CUDA_error_if_any(cudaMemcpy(d_send_buffer, buffer_recv_scalar_ext_mesh,
Expand All @@ -165,8 +165,8 @@ TRACE("transfer_asmbl_pot_to_device");
mp->d_nibool_interfaces_ext_mesh,
mp->d_ibool_interfaces_ext_mesh);
}
// Cuda timing
//stop_timing_cuda(&start,&stop,"assemble_boundary_potential_on_device");
// kernel timing
//stop_timing_gpu(&start,&stop,"assemble_boundary_potential_on_device");

GPU_ERROR_CHECKING("transfer_asmbl_pot_to_device");
}
Expand Down
18 changes: 9 additions & 9 deletions src/gpu/assemble_MPI_vector_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ TRACE("\ttransfer_boun_accel_from_device");
}

// Cuda timing
//cudaEvent_t start, stop;
//start_timing_cuda(&start,&stop);
//gpu_event start, stop;
//start_timing_gpu(&start,&stop);

// fills mpi boundary buffer
prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(d_accel,d_send_buffer,
Expand All @@ -83,7 +83,7 @@ TRACE("\ttransfer_boun_accel_from_device");
mp->d_nibool_interfaces_ext_mesh,
mp->d_ibool_interfaces_ext_mesh);
// synchronizes
//synchronize_cuda();
//gpuSynchronize();
// explicitly waits until previous compute stream finishes
// (cudaMemcpy implicitly synchronizes all other cuda operations)
cudaStreamSynchronize(mp->compute_stream);
Expand All @@ -92,9 +92,9 @@ TRACE("\ttransfer_boun_accel_from_device");
print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,d_send_buffer,
mp->size_mpi_buffer*sizeof(realw),cudaMemcpyDeviceToHost),97001);

// Cuda timing
// kernel timing
// finish timing of kernel+memcpy
//stop_timing_cuda(&start,&stop,"prepare_boundary_accel_on_device");
//stop_timing_gpu(&start,&stop,"prepare_boundary_accel_on_device");
}

GPU_ERROR_CHECKING("transfer_boun_accel_from_device");
Expand Down Expand Up @@ -130,7 +130,7 @@ void FC_FUNC_(transfer_boundary_from_device_a,
mp->d_nibool_interfaces_ext_mesh,
mp->d_ibool_interfaces_ext_mesh);
// waits until kernel is finished before starting async memcpy
//synchronize_cuda();
//gpuSynchronize();
// waits until previous compute stream finishes
cudaStreamSynchronize(mp->compute_stream);

Expand Down Expand Up @@ -196,7 +196,7 @@ TRACE("\ttransfer_asmbl_accel_to_device");
else if (*FORWARD_OR_ADJOINT == 3){
// explicitly synchronizes
// (cudaMemcpy implicitly synchronizes all other cuda operations)
synchronize_cuda();
gpuSynchronize();

print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer, buffer_recv_vector_ext_mesh,
mp->size_mpi_buffer*sizeof(realw),cudaMemcpyHostToDevice),97001);
Expand Down Expand Up @@ -283,7 +283,7 @@ TRACE("\ttransfer_sync_accel_to_device");
else if (*FORWARD_OR_ADJOINT == 3){
// explicitly synchronizes
// (cudaMemcpy implicitly synchronizes all other cuda operations)
synchronize_cuda();
gpuSynchronize();

print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer, buffer_recv_vector_ext_mesh,
mp->size_mpi_buffer*sizeof(realw),cudaMemcpyHostToDevice),97001);
Expand Down Expand Up @@ -402,7 +402,7 @@ void FC_FUNC_(sync_copy_from_device,
Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper

// Wait until async-memcpy of outer elements is finished and start MPI.
if (*iphase != 2){ exit_on_cuda_error("sync_copy_from_device must be called for iphase == 2"); }
if (*iphase != 2){ exit_on_gpu_error("sync_copy_from_device must be called for iphase == 2"); }

if (mp->size_mpi_buffer > 0){
// waits for asynchronous copy to finish
Expand Down
6 changes: 3 additions & 3 deletions src/gpu/check_fields_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ realw get_device_array_maximum_value(realw* array, int size){

// explicitly wait for cuda kernels to finish
// (cudaMemcpy implicitly synchronizes all other cuda operations)
synchronize_cuda();
gpuSynchronize();

h_array = (realw*)calloc(size,sizeof(realw));
print_CUDA_error_if_any(cudaMemcpy(h_array,array,sizeof(realw)*size,cudaMemcpyDeviceToHost),33001);
Expand Down Expand Up @@ -276,7 +276,7 @@ void FC_FUNC_(get_norm_acoustic_from_device,
GPU_ERROR_CHECKING("kernel get_maximum_field_kernel");

// synchronizes
//synchronize_cuda();
//gpuSynchronize();
// explicitly waits for stream to finish
// (cudaMemcpy implicitly synchronizes all other cuda operations)
cudaStreamSynchronize(mp->compute_stream);
Expand Down Expand Up @@ -393,7 +393,7 @@ void FC_FUNC_(get_norm_elastic_from_device,
GPU_ERROR_CHECKING("kernel get_norm_elastic_from_device");

// synchronizes
//synchronize_cuda();
//gpuSynchronize();
// explicitly waits for stream to finish
// (cudaMemcpy implicitly synchronizes all other cuda operations)
cudaStreamSynchronize(mp->compute_stream);
Expand Down
4 changes: 2 additions & 2 deletions src/gpu/compute_add_sources_acoustic_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -172,13 +172,13 @@ void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container

// checks
if (*nadj_rec_local != mp->nadj_rec_local) exit_on_cuda_error("add_sources_ac_sim_type_2_or_3: nadj_rec_local not equal\n");
if (*nadj_rec_local != mp->nadj_rec_local) exit_on_gpu_error("add_sources_ac_sim_type_2_or_3: nadj_rec_local not equal\n");

// note: for acoustic simulations with fused wavefields, NB_RUNS_ACOUSTIC_GPU > 1
// and thus the number of adjoint sources might become different in future
// todo: not implemented yet for adjoint/kernel simulation
//if (*nadj_rec_local/NB_RUNS_ACOUSTIC_GPU != mp->nadj_rec_local)
// exit_on_cuda_error("add_sources_ac_sim_type_2_or_3: nadj_rec_local not equal\n");
// exit_on_gpu_error("add_sources_ac_sim_type_2_or_3: nadj_rec_local not equal\n");

// checks if anything to do
if (mp->nadj_rec_local == 0) return;
Expand Down
8 changes: 4 additions & 4 deletions src/gpu/compute_forces_acoustic_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,10 @@ void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase,
// note: for computational efficiency, the FORWARD_OR_ADJOINT variable here can have a special case (== 0)
// to combine forward and backward wavefield in the same kernel call

// Cuda timing
cudaEvent_t start, stop;
// kernel timing
gpu_event start, stop;
if (CUDA_TIMING ){
start_timing_cuda(&start,&stop);
start_timing_gpu(&start,&stop);
}
int nb_field = mp->simulation_type == 3 ? 2 : 1 ;

Expand Down Expand Up @@ -141,7 +141,7 @@ void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase,
// Cuda timing
if (CUDA_TIMING ){
realw flops,time;
stop_timing_cuda(&start,&stop,"Kernel_2_acoustic_impl",&time);
stop_timing_gpu(&start,&stop,"Kernel_2_acoustic_impl",&time);
// time in seconds
time = time / 1000.;
// performance
Expand Down
16 changes: 8 additions & 8 deletions src/gpu/compute_forces_viscoelastic_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,10 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);

// Cuda timing
cudaEvent_t start,stop;
// kernel timing
gpu_event start,stop;
if (CUDA_TIMING ){
start_timing_cuda(&start,&stop);
start_timing_gpu(&start,&stop);
}

// defines local parameters for forward/adjoint function calls
Expand Down Expand Up @@ -563,19 +563,19 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,
// Cuda timing
if (CUDA_TIMING ){
if (ATTENUATION ){
stop_timing_cuda(&start,&stop,"Kernel_2_att_impl");
stop_timing_gpu(&start,&stop,"Kernel_2_att_impl");
}else{
if (ANISOTROPY ){
stop_timing_cuda(&start,&stop,"Kernel_2_noatt_ani_impl");
stop_timing_gpu(&start,&stop,"Kernel_2_noatt_ani_impl");
}else{
if (mp->gravity ){
stop_timing_cuda(&start,&stop,"Kernel_2_noatt_iso_grav_impl");
stop_timing_gpu(&start,&stop,"Kernel_2_noatt_iso_grav_impl");
}else{
if (COMPUTE_AND_STORE_STRAIN ){
stop_timing_cuda(&start,&stop,"Kernel_2_noatt_iso_strain_impl");
stop_timing_gpu(&start,&stop,"Kernel_2_noatt_iso_strain_impl");
}else{
realw time;
stop_timing_cuda(&start,&stop,"Kernel_2_noatt_iso_impl",&time);
stop_timing_gpu(&start,&stop,"Kernel_2_noatt_iso_impl",&time);
// time in seconds
time = time / 1000.;
// performance
Expand Down
72 changes: 41 additions & 31 deletions src/gpu/fault_solver_dynamics.cu
Original file line number Diff line number Diff line change
Expand Up @@ -110,12 +110,17 @@ void FC_FUNC_(initialize_fault_data_gpu,
/* ----------------------------------------------------------------------------------------------- */

// copies realw array from CPU host to GPU device
void copy_todevice_realw_test(void** d_array_addr_ptr,realw* h_array,int size) {
// copies a realw array from CPU host to GPU device.
// - allocates the device buffer and returns its address in *d_array_addr_ptr;
//   the caller owns the allocation and must release it (cudaFree/hipFree).
// - size is the element count, not a byte count.
// note: return codes of the malloc/memcpy calls are not checked here,
//       matching the existing CUDA path; callers rely on GPU_ERROR_CHECKING.
void gpuCopy_todevice_realw_test(void** d_array_addr_ptr,realw* h_array,int size) {

#ifdef USE_CUDA
  // allocates memory on GPU
  cudaMalloc((void**)d_array_addr_ptr,size*sizeof(realw));
  // copies values onto GPU
  cudaMemcpy((realw*) *d_array_addr_ptr,h_array,size*sizeof(realw),cudaMemcpyHostToDevice);
#endif
#ifdef USE_HIP
  // HIP equivalent of the CUDA path above
  // (replaces non-compiling placeholder text left in the HIP branch)
  hipMalloc((void**)d_array_addr_ptr,size*sizeof(realw));
  hipMemcpy((realw*) *d_array_addr_ptr,h_array,size*sizeof(realw),hipMemcpyHostToDevice);
#endif
}

/* ----------------------------------------------------------------------------------------------- */
Expand All @@ -129,12 +134,17 @@ void copy_tohost_realw_test(void** d_array_addr_ptr,realw* h_array,int size) {
/* ----------------------------------------------------------------------------------------------- */

// copies integer array from CPU host to GPU device
void copy_todevice_int_test(void** d_array_addr_ptr,int* h_array,int size) {
// copies an integer array from CPU host to GPU device.
// - allocates the device buffer and returns its address in *d_array_addr_ptr;
//   the caller owns the allocation and must release it (cudaFree/hipFree).
// - size is the element count, not a byte count.
// note: return codes of the malloc/memcpy calls are not checked here,
//       matching the existing CUDA path; callers rely on GPU_ERROR_CHECKING.
void gpuCopy_todevice_int_test(void** d_array_addr_ptr,int* h_array,int size) {

#ifdef USE_CUDA
  // allocates memory on GPU
  cudaMalloc((void**)d_array_addr_ptr,size*sizeof(int));
  // copies values onto GPU
  // (cast fixed from (realw*) to (int*) for type consistency; cudaMemcpy
  //  takes void*, so this was cosmetic but misleading)
  cudaMemcpy((int*) *d_array_addr_ptr,h_array,size*sizeof(int),cudaMemcpyHostToDevice);
#endif
#ifdef USE_HIP
  // HIP equivalent of the CUDA path above
  // (replaces non-compiling placeholder text left in the HIP branch)
  hipMalloc((void**)d_array_addr_ptr,size*sizeof(int));
  hipMemcpy((int*) *d_array_addr_ptr,h_array,size*sizeof(int),hipMemcpyHostToDevice);
#endif
}

/* ----------------------------------------------------------------------------------------------- */
Expand Down Expand Up @@ -188,21 +198,21 @@ void FC_FUNC_(transfer_fault_data_to_device,

// copies data to GPU
if (*NGLOB_FLT > 0){
copy_todevice_realw_test((void **)&(Flt->B),B,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(Flt->R),R,(*NGLOB_FLT)*9);
copy_todevice_realw_test((void **)&(Flt->Z),Z,(*NGLOB_FLT));
gpuCopy_todevice_realw_test((void **)&(Flt->B),B,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(Flt->R),R,(*NGLOB_FLT)*9);
gpuCopy_todevice_realw_test((void **)&(Flt->Z),Z,(*NGLOB_FLT));

copy_todevice_realw_test((void **)&(Flt->D),D,(*NGLOB_FLT)*3);
copy_todevice_realw_test((void **)&(Flt->V),V0,(*NGLOB_FLT)*3);
gpuCopy_todevice_realw_test((void **)&(Flt->D),D,(*NGLOB_FLT)*3);
gpuCopy_todevice_realw_test((void **)&(Flt->V),V0,(*NGLOB_FLT)*3);

copy_todevice_realw_test((void **)&(Flt->T0),T0,(*NGLOB_FLT)*3);
copy_todevice_realw_test((void **)&(Flt->T),T,(*NGLOB_FLT)*3);
gpuCopy_todevice_realw_test((void **)&(Flt->T0),T0,(*NGLOB_FLT)*3);
gpuCopy_todevice_realw_test((void **)&(Flt->T),T,(*NGLOB_FLT)*3);

copy_todevice_realw_test((void **)&(Flt->invM1),invM1,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(Flt->invM2),invM2,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(Flt->invM1),invM1,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(Flt->invM2),invM2,*NGLOB_FLT);

copy_todevice_int_test((void **)&(Flt->ibulk1),ibulk1,(*NGLOB_FLT));
copy_todevice_int_test((void **)&(Flt->ibulk2),ibulk2,(*NGLOB_FLT));
gpuCopy_todevice_int_test((void **)&(Flt->ibulk1),ibulk1,(*NGLOB_FLT));
gpuCopy_todevice_int_test((void **)&(Flt->ibulk2),ibulk2,(*NGLOB_FLT));
}

GPU_ERROR_CHECKING("transfer_fault_data_to_device");
Expand Down Expand Up @@ -310,18 +320,18 @@ void FC_FUNC_(transfer_rsf_data_todevice,

// copies arrays onto GPU
if (*NGLOB_FLT > 0){
copy_todevice_realw_test((void **)&(rsf->V0),V0,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(rsf->f0),f0,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(rsf->V_init),V_init,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(rsf->a),a,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(rsf->b),b,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(rsf->L),L,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(rsf->theta),theta,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(rsf->T),T,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(rsf->Coh),C,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(rsf->fw),fw,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(rsf->Vw),Vw,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(rsf->Fload),Fload,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->V0),V0,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->f0),f0,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->V_init),V_init,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->a),a,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->b),b,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->L),L,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->theta),theta,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->T),T,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->Coh),C,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->fw),fw,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->Vw),Vw,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(rsf->Fload),Fload,*NGLOB_FLT);
}

GPU_ERROR_CHECKING("transfer_rsf_data_todevice");
Expand Down Expand Up @@ -350,12 +360,12 @@ void FC_FUNC_(transfer_swf_data_todevice,
if (Fsolver->RATE_AND_STATE){ exit_on_error("Error with SWF setup, RATE_AND_STATE flag is on; please check fault setup and rerun\n");}

if (*NGLOB_FLT > 0){
copy_todevice_realw_test((void **)&(swf->Dc),Dc,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(swf->mus),mus,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(swf->mud),mud,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(swf->Coh),C,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(swf->T),T,*NGLOB_FLT);
copy_todevice_realw_test((void **)&(swf->theta),theta,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(swf->Dc),Dc,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(swf->mus),mus,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(swf->mud),mud,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(swf->Coh),C,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(swf->T),T,*NGLOB_FLT);
gpuCopy_todevice_realw_test((void **)&(swf->theta),theta,*NGLOB_FLT);
}

GPU_ERROR_CHECKING("transfer_swf_data_todevice");
Expand Down
Loading

0 comments on commit 7981920

Please sign in to comment.