Skip to content

Commit

Permalink
Merge pull request SPECFEM#1345 from danielpeter/devel
Browse files Browse the repository at this point in the history
fixes attenuation kernel in gpu version
  • Loading branch information
danielpeter authored Sep 9, 2019
2 parents 1603deb + 85a601b commit e2d28c9
Show file tree
Hide file tree
Showing 15 changed files with 807 additions and 556 deletions.
4 changes: 2 additions & 2 deletions src/cuda/assemble_MPI_scalar_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ __global__ void prepare_boundary_potential_on_device(field* d_potential_dot_dot_
const int* d_nibool_interfaces_ext_mesh,
const int* d_ibool_interfaces_ext_mesh) {

int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;
int ientry,iglob;

for(int iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
Expand Down Expand Up @@ -156,7 +156,7 @@ __global__ void assemble_boundary_potential_on_device(field* d_potential_dot_dot
const int* d_nibool_interfaces_ext_mesh,
const int* d_ibool_interfaces_ext_mesh) {

int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;
int ientry,iglob;

for( int iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
Expand Down
7 changes: 4 additions & 3 deletions src/cuda/assemble_MPI_vector_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ __global__ void prepare_boundary_accel_on_device(realw* d_accel, realw* d_send_a
const int* d_nibool_interfaces_ext_mesh,
const int* d_ibool_interfaces_ext_mesh) {

int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;
int ientry,iglob;

for( int iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
Expand Down Expand Up @@ -216,7 +216,7 @@ __global__ void assemble_boundary_accel_on_device(realw* d_accel, realw* d_send_

//int bx = blockIdx.y*gridDim.x+blockIdx.x;
//int tx = threadIdx.x;
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;

int ientry,iglob;

Expand Down Expand Up @@ -257,7 +257,8 @@ __global__ void synchronize_boundary_accel_on_device(realw* d_accel, realw* d_se

//int bx = blockIdx.y*gridDim.x+blockIdx.x;
//int tx = threadIdx.x;
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;

// printf("inside the synchronization!\n");

int ientry,iglob;
Expand Down
7 changes: 6 additions & 1 deletion src/cuda/check_fields_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -634,7 +634,8 @@ __global__ void get_maximum_vector_kernel(realw* array, int size, realw* d_max){
// load shared mem
unsigned int tid = threadIdx.x;
unsigned int bx = blockIdx.y*gridDim.x+blockIdx.x;
unsigned int i = tid + bx*blockDim.x;
//unsigned int i = tid + bx*blockDim.x;
unsigned int i = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;

// loads values into shared memory: assume array is a vector array
sdata[tid] = (i < size) ? (array[i*3]*array[i*3] + array[i*3+1]*array[i*3+1] + array[i*3+2]*array[i*3+2]) : 0.0 ;
Expand Down Expand Up @@ -722,6 +723,10 @@ void FC_FUNC_(get_norm_elastic_from_device,
// determines max for all blocks
max = h_max[0];
for(int i=1;i<num_blocks_x*num_blocks_y;i++) {
//debug: denormals can show up here in debugging checks like
// -ffpe-trap=overflow,underflow,zero,invalid,denormal
// when values are zero, avoid the denormal check
// printf("debug: %d h = %f %d\n",i,h_max[i],num_blocks_x * num_blocks_y);
if (max < h_max[i]) max = h_max[i];
}
res = sqrt(max);
Expand Down
698 changes: 349 additions & 349 deletions src/cuda/compute_forces_viscoelastic_cuda.cu

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/cuda/initialize_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ void FC_FUNC_(initialize_cuda_device,
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess){
fprintf(stderr,"Error after cudaGetDeviceCount: %s\n", cudaGetErrorString(err));
exit_on_error("CUDA runtime error: cudaGetDeviceCount failed\n\nplease check if driver and runtime libraries work together\nor on titan enable environment: CRAY_CUDA_PROXY=1 to use single GPU with multiple MPI processes\n\nexiting...\n");
exit_on_error("CUDA runtime error: cudaGetDeviceCount failed\n\nplease check if driver and runtime libraries work together\nor on cluster environments enable MPS (Multi-Process Service) to use single GPU with multiple MPI processes\n\nexiting...\n");
}

// returns device count to fortran
Expand Down
40 changes: 36 additions & 4 deletions src/cuda/mesh_constants_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,16 +164,18 @@
// leads up to ~1% performance increase
//#define MANUALLY_UNROLLED_LOOPS

// compiler specifications
// CUDA compiler specifications
// (optional) use launch_bounds specification to increase compiler optimization
// (depending on GPU type, register spilling might slow down the performance)
// (uncomment if desired)
//#define USE_LAUNCH_BOUNDS

// elastic kernel
#ifdef GPU_DEVICE_K20
// specifics see: https://docs.nvidia.com/cuda/kepler-tuning-guide/index.html
// maximum shared memory per thread block 48KB
//
// note: main kernel is Kernel_2_***_impl() which is limited by shared memory usage to 8 active blocks
// while register usage might use up to 9 blocks
//
//
// performance statistics: kernel Kernel_2_noatt_impl():
// shared memory per block = 1700 for Kepler: total = 49152 -> limits active blocks to 16
// registers per thread = 48
Expand All @@ -183,7 +185,35 @@
// shared memory per block = 6100 for Kepler: total = 49152 -> limits active blocks to 8
// registers per thread = 59
// registers per block = 8192 total = 65536 -> limits active blocks to 8
//
// (uncomment if desired)
//#define USE_LAUNCH_BOUNDS
#define LAUNCH_MIN_BLOCKS 10
//#pragma message ("\nCompiling with: USE_LAUNCH_BOUNDS enabled for K20\n")
#endif

// add more card specific values
#ifdef GPU_DEVICE_Maxwell
// specifics see: https://docs.nvidia.com/cuda/maxwell-tuning-guide/index.html
// register file size 64k 32-bit registers per SM
// shared memory 64KB for GM107 and 96KB for GM204
#undef USE_LAUNCH_BOUNDS
#endif

#ifdef GPU_DEVICE_Pascal
// specifics see: https://docs.nvidia.com/cuda/pascal-tuning-guide/index.html
// register file size 64k 32-bit registers per SM
// shared memory 64KB for GP100 and 96KB for GP104
#undef USE_LAUNCH_BOUNDS
#endif

#ifdef GPU_DEVICE_Volta
// specifics see: https://docs.nvidia.com/cuda/volta-tuning-guide/index.html
// register file size 64k 32-bit registers per SM
// shared memory size 96KB per SM (maximum shared memory per thread block)
// maximum registers 255 per thread
#undef USE_LAUNCH_BOUNDS
#endif

// acoustic kernel
// performance statistics: kernel Kernel_2_acoustic_impl():
Expand Down Expand Up @@ -369,6 +399,8 @@ typedef struct mesh_ {

// mesh resolution
int NSPEC_AB;
int NSPEC_IRREGULAR;

int NGLOB_AB;

// mpi process
Expand Down
31 changes: 16 additions & 15 deletions src/cuda/prepare_mesh_constants_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -110,13 +110,17 @@ void FC_FUNC_(prepare_constants_device,

// sets global parameters
mp->NSPEC_AB = *NSPEC_AB;
mp->NSPEC_IRREGULAR = *NSPEC_IRREGULAR;
mp->NGLOB_AB = *NGLOB_AB;

// constants
mp->simulation_type = *SIMULATION_TYPE;
mp->absorbing_conditions = *ABSORBING_CONDITIONS;
mp->save_forward = *SAVE_FORWARD;

// DK DK August 2018: adding this test, following a suggestion by Etienne Bachmann
if (*h_NGLLX != NGLLX) exit_on_error("make sure that the NGLL constants are equal in the two files setup/constants.h and src/cuda/mesh_constants_cuda.h and then please re-compile; also make sure that the value of NGLL3_PADDED is consistent with the value of NGLL\n");

// sets constant arrays
setConst_hprime_xx(h_hprime_xx,mp);
// setConst_hprime_yy(h_hprime_yy,mp); // only needed if NGLLX != NGLLY != NGLLZ
Expand Down Expand Up @@ -154,10 +158,7 @@ void FC_FUNC_(prepare_constants_device,

// mesh
// Assuming NGLLX=5. Padded is then 128 (5^3+3)
int size_padded = NGLL3_PADDED * (*NSPEC_IRREGULAR > 0 ? *NSPEC_IRREGULAR : 1);

// DK DK August 2018: adding this test, following a suggestion by Etienne Bachmann
if (*h_NGLLX != NGLLX) exit_on_error("make sure that the NGLL constants are equal in the two files setup/constants.h and src/cuda/mesh_constants_cuda.h and then please re-compile; also make sure that the value of NGLL3_PADDED is consistent with the value of NGLL\n");
int size_padded = NGLL3_PADDED * (mp->NSPEC_IRREGULAR > 0 ? *NSPEC_IRREGULAR : 1);

print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xix, size_padded*sizeof(realw)),1001);
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiy, size_padded*sizeof(realw)),1002);
Expand All @@ -172,7 +173,7 @@ void FC_FUNC_(prepare_constants_device,
// transfer constant element data with padding
/*
// way 1: slow...
for(int i=0;i < mp->NSPEC_AB;i++) {
for(int i=0;i < mp->NSPEC_IRREGULAR;i++) {
print_CUDA_error_if_any(cudaMemcpy(mp->d_xix + i*NGLL3_PADDED, &h_xix[i*NGLL3],
NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1501);
print_CUDA_error_if_any(cudaMemcpy(mp->d_xiy+i*NGLL3_PADDED, &h_xiy[i*NGLL3],
Expand Down Expand Up @@ -201,34 +202,34 @@ void FC_FUNC_(prepare_constants_device,
if (*NSPEC_IRREGULAR > 0 ){
print_CUDA_error_if_any(cudaMemcpy2D(mp->d_xix, NGLL3_PADDED*sizeof(realw),
h_xix, NGLL3*sizeof(realw), NGLL3*sizeof(realw),
mp->NSPEC_AB, cudaMemcpyHostToDevice),1501);
mp->NSPEC_IRREGULAR, cudaMemcpyHostToDevice),1501);
print_CUDA_error_if_any(cudaMemcpy2D(mp->d_xiy, NGLL3_PADDED*sizeof(realw),
h_xiy, NGLL3*sizeof(realw), NGLL3*sizeof(realw),
mp->NSPEC_AB, cudaMemcpyHostToDevice),1502);
mp->NSPEC_IRREGULAR, cudaMemcpyHostToDevice),1502);
print_CUDA_error_if_any(cudaMemcpy2D(mp->d_xiz, NGLL3_PADDED*sizeof(realw),
h_xiz, NGLL3*sizeof(realw), NGLL3*sizeof(realw),
mp->NSPEC_AB, cudaMemcpyHostToDevice),1503);
mp->NSPEC_IRREGULAR, cudaMemcpyHostToDevice),1503);
print_CUDA_error_if_any(cudaMemcpy2D(mp->d_etax, NGLL3_PADDED*sizeof(realw),
h_etax, NGLL3*sizeof(realw), NGLL3*sizeof(realw),
mp->NSPEC_AB, cudaMemcpyHostToDevice),1504);
mp->NSPEC_IRREGULAR, cudaMemcpyHostToDevice),1504);
print_CUDA_error_if_any(cudaMemcpy2D(mp->d_etay, NGLL3_PADDED*sizeof(realw),
h_etay, NGLL3*sizeof(realw), NGLL3*sizeof(realw),
mp->NSPEC_AB, cudaMemcpyHostToDevice),1505);
mp->NSPEC_IRREGULAR, cudaMemcpyHostToDevice),1505);
print_CUDA_error_if_any(cudaMemcpy2D(mp->d_etaz, NGLL3_PADDED*sizeof(realw),
h_etaz, NGLL3*sizeof(realw), NGLL3*sizeof(realw),
mp->NSPEC_AB, cudaMemcpyHostToDevice),1506);
mp->NSPEC_IRREGULAR, cudaMemcpyHostToDevice),1506);
print_CUDA_error_if_any(cudaMemcpy2D(mp->d_gammax, NGLL3_PADDED*sizeof(realw),
h_gammax, NGLL3*sizeof(realw), NGLL3*sizeof(realw),
mp->NSPEC_AB, cudaMemcpyHostToDevice),1507);
mp->NSPEC_IRREGULAR, cudaMemcpyHostToDevice),1507);
print_CUDA_error_if_any(cudaMemcpy2D(mp->d_gammay, NGLL3_PADDED*sizeof(realw),
h_gammay, NGLL3*sizeof(realw), NGLL3*sizeof(realw),
mp->NSPEC_AB, cudaMemcpyHostToDevice),1508);
mp->NSPEC_IRREGULAR, cudaMemcpyHostToDevice),1508);
print_CUDA_error_if_any(cudaMemcpy2D(mp->d_gammaz, NGLL3_PADDED*sizeof(realw),
h_gammaz, NGLL3*sizeof(realw), NGLL3*sizeof(realw),
mp->NSPEC_AB, cudaMemcpyHostToDevice),1509);
mp->NSPEC_IRREGULAR, cudaMemcpyHostToDevice),1509);
}
size_padded = NGLL3_PADDED * (mp->NSPEC_AB);

size_padded = NGLL3_PADDED * (mp->NSPEC_AB);

// global indexing (padded)
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool, size_padded*sizeof(int)),1600);
Expand Down
16 changes: 10 additions & 6 deletions src/cuda/update_displacement_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ __global__ void UpdateDispVeloc_kernel(realw* displ,
realw deltatover2) {

// two dimensional array of blocks on grid where each block has one dimensional array of threads
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;

realw acc = accel[id];
// because of block and grid sizing problems, there is a small
// amount of buffer at the end of the calculation
Expand Down Expand Up @@ -155,7 +156,7 @@ __global__ void UpdatePotential_kernel(field* potential_acoustic,
realw deltatsqover2,
realw deltatover2) {

int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;

// because of block and grid sizing problems, there is a small
// amount of buffer at the end of the calculation
Expand Down Expand Up @@ -282,7 +283,8 @@ __global__ void kernel_3_cuda_device(realw* veloc,
realw* rmassy,
realw* rmassz) {

int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;

realw rx,ry,rz;
realw ax,ay,az;
// because of block and grid sizing problems, there is a small
Expand Down Expand Up @@ -331,7 +333,8 @@ __global__ void kernel_3_accel_cuda_device(realw* accel,
realw* rmassx,
realw* rmassy,
realw* rmassz) {
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;

int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;

realw rx,ry,rz;
realw ax,ay,az;
Expand Down Expand Up @@ -369,7 +372,7 @@ __global__ void kernel_3_veloc_cuda_device(realw* veloc,
int size,
realw deltatover2) {

int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;

// because of block and grid sizing problems, there is a small
// amount of buffer at the end of the calculation
Expand Down Expand Up @@ -493,7 +496,8 @@ __global__ void kernel_3_acoustic_cuda_device(field* potential_dot_acoustic,
realw b_deltatover2,
realw* rmass_acoustic) {

int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;

realw rmass;
field p_dot_dot;
// because of block and grid sizing problems, there is a small
Expand Down
85 changes: 79 additions & 6 deletions src/generate_databases/calc_jacobian.f90
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,10 @@ subroutine check_element_regularity(xelm,yelm,zelm,any_regular_elem,cube_edge_si
double precision :: dist1_sq,dist2_sq,dist3_sq
double precision :: threshold
double precision,parameter :: threshold_percentage = 1.e-5
logical :: eqx1,eqx2,eqx3,eqx4,eqx5,eqx6
logical :: eqy1,eqy2,eqy3,eqy4,eqy5,eqy6
logical :: eqz1,eqz2,eqz3,eqz4,eqz5,eqz6
logical, external :: is_equal_number

! by default, we assume to have a perfect regular shape (cube)

Expand All @@ -201,13 +205,57 @@ subroutine check_element_regularity(xelm,yelm,zelm,any_regular_elem,cube_edge_si
return
endif

! number convention:
!
! 8 7
! o ------------- o
! /| /|
! / | / |
! 5o---------------o6 |
! | | | |
! | 4o----------- |--o3
! | / | /
! |/ |/
! z o---------------o
! | y 1 2
! |/
! o---> x
!

! checks x-plane (using epsilon() function to determine almost negligible differences)
eqx1 = is_equal_number(xelm(1),xelm(4))
eqx2 = is_equal_number(xelm(1),xelm(5))
eqx3 = is_equal_number(xelm(1),xelm(8))
eqx4 = is_equal_number(xelm(2),xelm(3))
eqx5 = is_equal_number(xelm(2),xelm(6))
eqx6 = is_equal_number(xelm(2),xelm(7))
! checks y-plane
eqy1 = is_equal_number(yelm(1),yelm(2))
eqy2 = is_equal_number(yelm(1),yelm(5))
eqy3 = is_equal_number(yelm(1),yelm(6))
eqy4 = is_equal_number(yelm(3),yelm(4))
eqy5 = is_equal_number(yelm(3),yelm(7))
eqy6 = is_equal_number(yelm(3),yelm(8))
! checks z-plane
eqz1 = is_equal_number(zelm(1),zelm(2))
eqz2 = is_equal_number(zelm(1),zelm(3))
eqz3 = is_equal_number(zelm(1),zelm(4))
eqz4 = is_equal_number(zelm(5),zelm(6))
eqz5 = is_equal_number(zelm(5),zelm(7))
eqz6 = is_equal_number(zelm(5),zelm(8))

! checks if the element is a cube (following numbering convention in a 8 nodes element)
if (xelm(1) == xelm(4) .and. xelm(1) == xelm(5) .and. xelm(1) == xelm(8) .and. &
xelm(2) == xelm(3) .and. xelm(2) == xelm(6) .and. xelm(2) == xelm(7) .and. &
yelm(1) == yelm(2) .and. yelm(1) == yelm(5) .and. yelm(1) == yelm(6) .and. &
yelm(3) == yelm(4) .and. yelm(3) == yelm(7) .and. yelm(3) == yelm(8) .and. &
zelm(1) == zelm(2) .and. zelm(1) == zelm(3) .and. zelm(1) == zelm(4) .and. &
zelm(5) == zelm(6) .and. zelm(5) == zelm(7) .and. zelm(5) == zelm(8) ) then
! with numerically negligible differences (to avoid representation issues of float values)
if (eqx1 .and. eqx2 .and. eqx3 .and. eqx4 .and. eqx5 .and. eqx6 .and. &
eqy1 .and. eqy2 .and. eqy3 .and. eqy4 .and. eqy5 .and. eqy6 .and. &
eqz1 .and. eqz2 .and. eqz3 .and. eqz4 .and. eqz5 .and. eqz6 ) then
! or direct float comparisons
!if (xelm(1) == xelm(4) .and. xelm(1) == xelm(5) .and. xelm(1) == xelm(8) .and. &
! xelm(2) == xelm(3) .and. xelm(2) == xelm(6) .and. xelm(2) == xelm(7) .and. &
! yelm(1) == yelm(2) .and. yelm(1) == yelm(5) .and. yelm(1) == yelm(6) .and. &
! yelm(3) == yelm(4) .and. yelm(3) == yelm(7) .and. yelm(3) == yelm(8) .and. &
! zelm(1) == zelm(2) .and. zelm(1) == zelm(3) .and. zelm(1) == zelm(4) .and. &
! zelm(5) == zelm(6) .and. zelm(5) == zelm(7) .and. zelm(5) == zelm(8) ) then

dist2_sq = (xelm(5)-xelm(1))**2 + (yelm(5)-yelm(1))**2 + (zelm(5)-zelm(1))**2
dist3_sq = (xelm(4)-xelm(1))**2 + (yelm(4)-yelm(1))**2 + (zelm(4)-zelm(1))**2
Expand Down Expand Up @@ -235,3 +283,28 @@ subroutine check_element_regularity(xelm,yelm,zelm,any_regular_elem,cube_edge_si
endif

end subroutine check_element_regularity

!
!-------------------------------------------------------------------------------------------------
!

function is_equal_number(a,b)

! compares two real values and determines if they are equal (within numerical precision)
! to avoid direct comparisons a == b which can fail for rounding issues

implicit none

real, intent(in) :: a,b
logical :: is_equal_number

! note: epsilon() intrinsic function
! returns a positive number that is almost negligible
! example: If x is of type REAL(4), EPSILON (X) has the value 2**-23
if (abs(a - b) <= epsilon(a)) then
is_equal_number = .true.
else
is_equal_number = .false.
endif

end function is_equal_number
Loading

0 comments on commit e2d28c9

Please sign in to comment.