diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index 7650e89dc..8116ec9cd 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -141,7 +141,8 @@ contains
     subroutine s_apply_fourier_filter(q_cons_vf)

         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-
+        real(c_double), pointer :: p_real(:)
+        complex(c_double_complex), pointer :: p_cmplx(:), p_fltr_cmplx(:)
         integer :: i, j, k, l !< Generic loop iterators

         ! Restrict filter to processors that have cells adjacent to axis
@@ -166,11 +167,16 @@ contains
             end do
         end do

-!$acc host_data use_device(data_real_gpu, data_cmplx_gpu)
+        p_real => data_real_gpu
+        p_cmplx => data_cmplx_gpu
+        p_fltr_cmplx => data_fltr_cmplx_gpu
+
+!$acc data attach(p_real, p_cmplx, p_fltr_cmplx)
+!$acc host_data use_device(p_real, p_cmplx, p_fltr_cmplx)
 #if defined(__PGI)
         ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
 #else
-        ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(data_real_gpu), c_loc(data_cmplx_gpu))
+        ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx))
         call hipCheck(hipDeviceSynchronize())
 #endif
 !$acc end host_data
@@ -186,11 +192,11 @@ contains
             end do
         end do

-!$acc host_data use_device(data_real_gpu, data_fltr_cmplx_gpu)
+!$acc host_data use_device(p_real, p_fltr_cmplx)
 #if defined(__PGI)
         ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
 #else
-        ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(data_fltr_cmplx_gpu), c_loc(data_real_gpu))
+        ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real))
         call hipCheck(hipDeviceSynchronize())
 #endif
 !$acc end host_data
@@ -225,11 +231,11 @@ contains
             end do
         end do

-!$acc host_data use_device(data_real_gpu, data_cmplx_gpu)
+!$acc host_data use_device(p_real, p_cmplx)
 #if defined(__PGI)
         ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
 #else
-        ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(data_real_gpu), c_loc(data_cmplx_gpu))
+        ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx))
         call hipCheck(hipDeviceSynchronize())
 #endif
 !$acc end host_data
@@ -246,11 +252,11 @@ contains
             end do
         end do

-!$acc host_data use_device(data_real_gpu, data_fltr_cmplx_gpu)
+!$acc host_data use_device(p_real, p_fltr_cmplx)
 #if defined(__PGI)
         ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
 #else
-        ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(data_fltr_cmplx_gpu), c_loc(data_real_gpu))
+        ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real))
         call hipCheck(hipDeviceSynchronize())
 #endif
 !$acc end host_data
@@ -297,8 +303,8 @@ contains
             end do
         end do
 #endif
-
-    end subroutine s_apply_fourier_filter
+!$acc end data
+    end subroutine s_apply_fourier_filter ! --------------------------------

     !> The purpose of this subroutine is to destroy the fftw plan
     !! that will be used in the forward and backward DFTs when
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index 292fe51a8..ae4363ac5 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -866,6 +866,8 @@ contains
         integer :: pack_offsets(1:3), unpack_offsets(1:3)
         integer :: pack_offset, unpack_offset

+        real(kind(0d0)), pointer, dimension(:) :: p_send, p_recv
+        integer, pointer, dimension(:) :: p_i_send, p_i_recv

 #ifdef MFC_MPI

@@ -1065,8 +1067,11 @@ contains
         ! Send/Recv
         #:for rdma_mpi in [False, True]
             if (rdma_mpi .eqv. ${'.true.' if rdma_mpi else '.false.'}$) then
+                p_send => q_cons_buff_send
+                p_recv => q_cons_buff_recv
                 #:if rdma_mpi
-                    !$acc host_data use_device(q_cons_buff_recv, q_cons_buff_send, ib_buff_recv, ib_buff_send)
+                    !$acc data attach(p_send, p_recv)
+                    !$acc host_data use_device(p_send, p_recv)
                 #:else
                     !$acc update host(q_cons_buff_send, ib_buff_send)
                 #:endif
@@ -1078,6 +1083,7 @@ contains

                 #:if rdma_mpi
                     !$acc end host_data
+                    !$acc end data
                     !$acc wait
                 #:else
                     !$acc update device(q_cons_buff_recv)
@@ -1268,6 +1274,7 @@ contains
         integer, intent(in) :: gp_layers

         integer :: i, j, k, l, r !< Generic loop iterators
+        integer, pointer, dimension(:) :: p_i_send, p_i_recv

 #ifdef MFC_MPI

@@ -1309,19 +1316,24 @@ contains

 #if defined(MFC_OpenACC)
         if (rdma_mpi) then
-            !$acc host_data use_device( ib_buff_recv, ib_buff_send, ib_buff_recv, ib_buff_send)
+            p_i_send => ib_buff_send
+            p_i_recv => ib_buff_recv
+
+            !$acc data attach(p_i_send, p_i_recv)
+            !$acc host_data use_device(p_i_send, p_i_recv)

             ! Send/receive buffer to/from bc_x%end/bc_x%beg
             call MPI_SENDRECV( &
-                ib_buff_send(0), &
+                p_i_send(0), &
                 gp_layers*(n + 1)*(p + 1), &
                 MPI_INTEGER, bc_x%end, 0, &
-                ib_buff_recv(0), &
+                p_i_recv(0), &
                 gp_layers*(n + 1)*(p + 1), &
                 MPI_INTEGER, bc_x%beg, 0, &
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

             !$acc end host_data
+            !$acc end data
             !$acc wait
         else
 #endif
@@ -1359,19 +1371,24 @@ contains

 #if defined(MFC_OpenACC)
         if (rdma_mpi) then
-            !$acc host_data use_device( ib_buff_recv, ib_buff_send )
+            p_i_send => ib_buff_send
+            p_i_recv => ib_buff_recv
+
+            !$acc data attach(p_i_send, p_i_recv)
+            !$acc host_data use_device(p_i_send, p_i_recv)

             ! Send/receive buffer to/from bc_x%end/bc_x%beg
             call MPI_SENDRECV( &
-                ib_buff_send(0), &
+                p_i_send(0), &
                 gp_layers*(n + 1)*(p + 1), &
                 MPI_INTEGER, bc_x%beg, 1, &
-                ib_buff_recv(0), &
+                p_i_recv(0), &
                 gp_layers*(n + 1)*(p + 1), &
                 MPI_INTEGER, bc_x%beg, 0, &
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

             !$acc end host_data
+            !$acc end data
             !$acc wait
         else
 #endif
@@ -1431,19 +1448,24 @@ contains

 #if defined(MFC_OpenACC)
         if (rdma_mpi) then
-            !$acc host_data use_device( ib_buff_recv, ib_buff_send )
+            p_i_send => ib_buff_send
+            p_i_recv => ib_buff_recv
+
+            !$acc data attach(p_i_send, p_i_recv)
+            !$acc host_data use_device(p_i_send, p_i_recv)

             ! Send/receive buffer to/from bc_x%end/bc_x%beg
             call MPI_SENDRECV( &
-                ib_buff_send(0), &
+                p_i_send(0), &
                 gp_layers*(n + 1)*(p + 1), &
                 MPI_INTEGER, bc_x%beg, 1, &
-                ib_buff_recv(0), &
+                p_i_recv(0), &
                 gp_layers*(n + 1)*(p + 1), &
                 MPI_INTEGER, bc_x%end, 1, &
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

             !$acc end host_data
+            !$acc end data
             !$acc wait
         else
 #endif
@@ -1479,19 +1501,24 @@ contains

 #if defined(MFC_OpenACC)
         if (rdma_mpi) then
-            !$acc host_data use_device( ib_buff_recv, ib_buff_send )
+            p_i_send => ib_buff_send
+            p_i_recv => ib_buff_recv
+
+            !$acc data attach(p_i_send, p_i_recv)
+            !$acc host_data use_device(p_i_send, p_i_recv)

             ! Send/receive buffer to/from bc_x%end/bc_x%beg
             call MPI_SENDRECV( &
-                ib_buff_send(0), &
+                p_i_send(0), &
                 gp_layers*(n + 1)*(p + 1), &
                 MPI_INTEGER, bc_x%end, 0, &
-                ib_buff_recv(0), &
+                p_i_recv(0), &
                 gp_layers*(n + 1)*(p + 1), &
                 MPI_INTEGER, bc_x%end, 1, &
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

             !$acc end host_data
+            !$acc end data
             !$acc wait
         else
 #endif
@@ -1553,19 +1580,24 @@ contains

 #if defined(MFC_OpenACC)
         if (rdma_mpi) then
-            !$acc host_data use_device( ib_buff_recv, ib_buff_send )
+            p_i_send => ib_buff_send
+            p_i_recv => ib_buff_recv
+
+            !$acc data attach(p_i_send, p_i_recv)
+            !$acc host_data use_device(p_i_send, p_i_recv)

             ! Send/receive buffer to/from bc_x%end/bc_x%beg
             call MPI_SENDRECV( &
-                ib_buff_send(0), &
+                p_i_send(0), &
                 gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
                 MPI_INTEGER, bc_y%end, 0, &
-                ib_buff_recv(0), &
+                p_i_recv(0), &
                 gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
                 MPI_INTEGER, bc_y%beg, 0, &
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

             !$acc end host_data
+            !$acc end data
             !$acc wait
         else
 #endif
@@ -1604,19 +1636,24 @@ contains

 #if defined(MFC_OpenACC)
         if (rdma_mpi) then
-            !$acc host_data use_device( ib_buff_recv, ib_buff_send )
+            p_i_send => ib_buff_send
+            p_i_recv => ib_buff_recv
+
+            !$acc data attach(p_i_send, p_i_recv)
+            !$acc host_data use_device(p_i_send, p_i_recv)

             ! Send/receive buffer to/from bc_x%end/bc_x%beg
             call MPI_SENDRECV( &
-                ib_buff_send(0), &
+                p_i_send(0), &
                 gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
                 MPI_INTEGER, bc_y%beg, 1, &
-                ib_buff_recv(0), &
+                p_i_recv(0), &
                 gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
                 MPI_INTEGER, bc_y%beg, 0, &
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

             !$acc end host_data
+            !$acc end data
             !$acc wait
         else
 #endif
@@ -1679,19 +1716,24 @@ contains

 #if defined(MFC_OpenACC)
         if (rdma_mpi) then
-            !$acc host_data use_device( ib_buff_recv, ib_buff_send )
+            p_i_send => ib_buff_send
+            p_i_recv => ib_buff_recv
+
+            !$acc data attach(p_i_send, p_i_recv)
+            !$acc host_data use_device(p_i_send, p_i_recv)

             ! Send/receive buffer to/from bc_x%end/bc_x%beg
             call MPI_SENDRECV( &
-                ib_buff_send(0), &
+                p_i_send(0), &
                 gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
                 MPI_INTEGER, bc_y%beg, 1, &
-                ib_buff_recv(0), &
+                p_i_recv(0), &
                 gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
                 MPI_INTEGER, bc_y%end, 1, &
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

             !$acc end host_data
+            !$acc end data
             !$acc wait
         else
 #endif
@@ -1730,19 +1772,24 @@ contains

 #if defined(MFC_OpenACC)
         if (rdma_mpi) then
-            !$acc host_data use_device( ib_buff_recv, ib_buff_send )
+            p_i_send => ib_buff_send
+            p_i_recv => ib_buff_recv
+
+            !$acc data attach(p_i_send, p_i_recv)
+            !$acc host_data use_device(p_i_send, p_i_recv)

             ! Send/receive buffer to/from bc_x%end/bc_x%beg
             call MPI_SENDRECV( &
-                ib_buff_send(0), &
+                p_i_send(0), &
                 gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
                 MPI_INTEGER, bc_y%end, 0, &
-                ib_buff_recv(0), &
+                p_i_recv(0), &
                 gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
                 MPI_INTEGER, bc_y%end, 1, &
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

             !$acc end host_data
+            !$acc end data
             !$acc wait
         else
 #endif
@@ -1808,19 +1855,24 @@ contains

 #if defined(MFC_OpenACC)
         if (rdma_mpi) then
-            !$acc host_data use_device( ib_buff_recv, ib_buff_send )
+            p_i_send => ib_buff_send
+            p_i_recv => ib_buff_recv
+
+            !$acc data attach(p_i_send, p_i_recv)
+            !$acc host_data use_device(p_i_send, p_i_recv)

             ! Send/receive buffer to/from bc_x%end/bc_x%beg
             call MPI_SENDRECV( &
-                ib_buff_send(0), &
+                p_i_send(0), &
                 gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
                 MPI_INTEGER, bc_z%end, 0, &
-                ib_buff_recv(0), &
+                p_i_recv(0), &
                 gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
                 MPI_INTEGER, bc_z%beg, 0, &
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

             !$acc end host_data
+            !$acc end data
             !$acc wait
         else
 #endif
@@ -1859,19 +1911,24 @@ contains

 #if defined(MFC_OpenACC)
         if (rdma_mpi) then
-            !$acc host_data use_device( ib_buff_recv, ib_buff_send )
+            p_i_send => ib_buff_send
+            p_i_recv => ib_buff_recv
+
+            !$acc data attach(p_i_send, p_i_recv)
+            !$acc host_data use_device(p_i_send, p_i_recv)

             ! Send/receive buffer to/from bc_x%end/bc_x%beg
             call MPI_SENDRECV( &
-                ib_buff_send(0), &
+                p_i_send(0), &
                 gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
                 MPI_INTEGER, bc_z%beg, 1, &
-                ib_buff_recv(0), &
+                p_i_recv(0), &
                 gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
                 MPI_INTEGER, bc_z%beg, 0, &
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

             !$acc end host_data
+            !$acc end data
             !$acc wait
         else
 #endif
@@ -1935,19 +1992,24 @@ contains

 #if defined(MFC_OpenACC)
         if (rdma_mpi) then
-            !$acc host_data use_device( ib_buff_recv, ib_buff_send )
+            p_i_send => ib_buff_send
+            p_i_recv => ib_buff_recv
+
+            !$acc data attach(p_i_send, p_i_recv)
+            !$acc host_data use_device(p_i_send, p_i_recv)

             ! Send/receive buffer to/from bc_x%end/bc_x%beg
             call MPI_SENDRECV( &
-                ib_buff_send(0), &
+                p_i_send(0), &
                 gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
                 MPI_INTEGER, bc_z%beg, 1, &
-                ib_buff_recv(0), &
+                p_i_recv(0), &
                 gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
                 MPI_INTEGER, bc_z%end, 1, &
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

             !$acc end host_data
+            !$acc end data
             !$acc wait
         else
 #endif
@@ -1986,19 +2048,24 @@ contains

 #if defined(MFC_OpenACC)
         if (rdma_mpi) then
-            !$acc host_data use_device( ib_buff_recv, ib_buff_send )
+            p_i_send => ib_buff_send
+            p_i_recv => ib_buff_recv
+
+            !$acc data attach(p_i_send, p_i_recv)
+            !$acc host_data use_device(p_i_send, p_i_recv)

             ! Send/receive buffer to/from bc_x%end/bc_x%beg
             call MPI_SENDRECV( &
-                ib_buff_send(0), &
+                p_i_send(0), &
                 gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
                 MPI_INTEGER, bc_z%end, 0, &
-                ib_buff_recv(0), &
+                p_i_recv(0), &
                 gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
                 MPI_INTEGER, bc_z%end, 1, &
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

             !$acc end host_data
+            !$acc end data
             !$acc wait
         else
 #endif
@@ -2064,6 +2131,7 @@ contains
         integer :: pack_offsets(1:3), unpack_offsets(1:3)
         integer :: pack_offset, unpack_offset

+        real(kind(0d0)), pointer, dimension(:) :: p_send, p_recv

 #ifdef MFC_MPI

@@ -2158,19 +2226,24 @@ contains
         ! Send/Recv
         #:for rdma_mpi in [False, True]
             if (rdma_mpi .eqv. ${'.true.' if rdma_mpi else '.false.'}$) then
+                p_send => c_divs_buff_send
+                p_recv => c_divs_buff_recv
+
                 #:if rdma_mpi
-                    !$acc host_data use_device(c_divs_buff_recv, c_divs_buff_send)
+                    !$acc data attach(p_send, p_recv)
+                    !$acc host_data use_device(p_send, p_recv)
                 #:else
                     !$acc update host(c_divs_buff_send)
                 #:endif

                 call MPI_SENDRECV( &
-                    c_divs_buff_send(0), buffer_count, MPI_DOUBLE_PRECISION, dst_proc, send_tag, &
-                    c_divs_buff_recv(0), buffer_count, MPI_DOUBLE_PRECISION, src_proc, recv_tag, &
+                    p_send, buffer_count, MPI_DOUBLE_PRECISION, dst_proc, send_tag, &
+                    p_recv, buffer_count, MPI_DOUBLE_PRECISION, src_proc, recv_tag, &
                     MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

                 #:if rdma_mpi
                     !$acc end host_data
+                    !$acc end data
                     !$acc wait
                 #:else
                     !$acc update device(c_divs_buff_recv)
diff --git a/toolchain/modules b/toolchain/modules
index 0dc9576c8..97046657c 100644
--- a/toolchain/modules
+++ b/toolchain/modules
@@ -47,11 +47,9 @@ p-cpu gcc/12.3.0 openmpi/4.1.5
 p-gpu nvhpc/24.5 hpcx/2.19-cuda cuda/12.1.1

 f OLCF Frontier
-f-gpu rocm/5.5.1 craype-accel-amd-gfx90a
-f-all cpe/23.09
-f-all cray-fftw cray-hdf5 cray-mpich/8.1.26 cce/16.0.1
-f-all rocm/5.5.1 cray-python omniperf
-f-cpu
+f-all cce/18.0.0 cpe/24.07 rocm/6.2.0 cray-mpich/8.1.28
+f-all cray-fftw cray-hdf5 cray-python omniperf
+f-gpu craype-accel-amd-gfx90a

 d NCSA Delta
 d-all python/3.11.6