From 86620317b0d0ea2d3cee6134fb3bbe16ab407e59 Mon Sep 17 00:00:00 2001 From: Henry LE BERRE Date: Sat, 24 Aug 2024 20:42:24 -0400 Subject: [PATCH] CCE 18 migration on OLCF Frontier Co-authored-by: Steve Abbott --- .github/workflows/frontier/build.sh | 2 +- .github/workflows/frontier/test.sh | 5 +- src/common/m_finite_differences.fpp | 5 + src/simulation/m_fftw.fpp | 34 ++--- src/simulation/m_mpi_proxy.fpp | 169 ++++++++++++++++++------- src/simulation/m_viscous.fpp | 188 ++++++++++++++++++++++++++++ toolchain/mfc/run/run.py | 6 +- toolchain/mfc/test/case.py | 2 +- toolchain/mfc/test/test.py | 4 +- toolchain/modules | 8 +- 10 files changed, 350 insertions(+), 73 deletions(-) diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index a6a51b65f0..e04f321e15 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -1,4 +1,4 @@ #!/bin/bash . ./mfc.sh load -c f -m g -./mfc.sh build -j 8 --gpu +./mfc.sh build -j 8 --gpu --sys-hdf5 --sys-fftw diff --git a/.github/workflows/frontier/test.sh b/.github/workflows/frontier/test.sh index fab53ef8e2..4ecae46b88 100644 --- a/.github/workflows/frontier/test.sh +++ b/.github/workflows/frontier/test.sh @@ -1,3 +1,6 @@ #!/bin/bash -./mfc.sh test -j 4 -a -- -c frontier +gpus=`rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' '` +ngpus=`echo "$gpus" | tr -d '[:space:]' | wc -c` + +./mfc.sh test -j $ngpus --sys-hdf5 --sys-fftw --gpus $gpus -- -c frontier diff --git a/src/common/m_finite_differences.fpp b/src/common/m_finite_differences.fpp index e5b9462802..6ef081f213 100644 --- a/src/common/m_finite_differences.fpp +++ b/src/common/m_finite_differences.fpp @@ -4,6 +4,11 @@ module m_finite_differences implicit none + private; + public :: s_compute_fd_laplacian, & + s_compute_fd_divergence, & + s_compute_finite_difference_coefficients + contains !> Computes the scalar gradient fields via finite differences diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp index 7650e89dce..58fb51be77 100644 --- a/src/simulation/m_fftw.fpp +++ b/src/simulation/m_fftw.fpp @@ -57,9 +57,9 @@ module m_fftw @:CRAY_DECLARE_GLOBAL(complex(kind(0d0)), dimension(:), data_fltr_cmplx_gpu) !$acc declare link(data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu) #else - real(kind(0d0)), allocatable :: data_real_gpu(:) - complex(kind(0d0)), allocatable :: data_cmplx_gpu(:) - complex(kind(0d0)), allocatable :: data_fltr_cmplx_gpu(:) + real(kind(0d0)), allocatable, target :: data_real_gpu(:) + complex(kind(0d0)), allocatable, target :: data_cmplx_gpu(:) + complex(kind(0d0)), allocatable, target :: data_fltr_cmplx_gpu(:) !$acc declare create(data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu) #endif @@ -141,7 +141,8 @@ contains subroutine s_apply_fourier_filter(q_cons_vf) type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf - + real(c_double), pointer :: p_real(:) + complex(c_double_complex), pointer :: p_cmplx(:), p_fltr_cmplx(:) integer :: i, j, k, l !< Generic loop iterators ! 
Restrict filter to processors that have cells adjacent to axis @@ -166,11 +167,16 @@ contains end do end do -!$acc host_data use_device(data_real_gpu, data_cmplx_gpu) + p_real => data_real_gpu + p_cmplx => data_cmplx_gpu + p_fltr_cmplx => data_fltr_cmplx_gpu + +!$acc data attach(p_real, p_cmplx, p_fltr_cmplx) +!$acc host_data use_device(p_real, p_cmplx, p_fltr_cmplx) #if defined(__PGI) ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu) #else - ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(data_real_gpu), c_loc(data_cmplx_gpu)) + ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx)) call hipCheck(hipDeviceSynchronize()) #endif !$acc end host_data @@ -186,11 +192,11 @@ contains end do end do -!$acc host_data use_device(data_real_gpu, data_fltr_cmplx_gpu) +!$acc host_data use_device(p_real, p_fltr_cmplx) #if defined(__PGI) ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu) #else - ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(data_fltr_cmplx_gpu), c_loc(data_real_gpu)) + ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real)) call hipCheck(hipDeviceSynchronize()) #endif !$acc end host_data @@ -225,11 +231,11 @@ contains end do end do -!$acc host_data use_device(data_real_gpu, data_cmplx_gpu) +!$acc host_data use_device(p_real, p_cmplx) #if defined(__PGI) ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu) #else - ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(data_real_gpu), c_loc(data_cmplx_gpu)) + ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx)) call hipCheck(hipDeviceSynchronize()) #endif !$acc end host_data @@ -246,11 +252,11 @@ contains end do end do -!$acc host_data use_device(data_real_gpu, data_fltr_cmplx_gpu) +!$acc host_data use_device(p_real, p_fltr_cmplx) #if defined(__PGI) ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu) #else - ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(data_fltr_cmplx_gpu), c_loc(data_real_gpu)) + ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real)) call hipCheck(hipDeviceSynchronize()) #endif !$acc end host_data @@ -297,8 +303,8 @@ contains end do end do #endif - - end subroutine s_apply_fourier_filter +!$acc end data + end subroutine s_apply_fourier_filter ! -------------------------------- !> The purpose of this subroutine is to destroy the fftw plan !! that will be used in the forward and backward DFTs when diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp index 292fe51a86..109d1e7a7f 100644 --- a/src/simulation/m_mpi_proxy.fpp +++ b/src/simulation/m_mpi_proxy.fpp @@ -43,32 +43,32 @@ module m_mpi_proxy !$acc declare link(ib_buff_send, ib_buff_recv) !$acc declare link(c_divs_buff_send, c_divs_buff_recv) #else - real(kind(0d0)), private, allocatable, dimension(:) :: q_cons_buff_send !< + real(kind(0d0)), private, allocatable, dimension(:), target :: q_cons_buff_send !< !! This variable is utilized to pack and send the buffer of the cell-average !! conservative variables, for a single computational domain boundary at the !! time, to the relevant neighboring processor. - real(kind(0d0)), private, allocatable, dimension(:) :: q_cons_buff_recv !< + real(kind(0d0)), private, allocatable, dimension(:), target :: q_cons_buff_recv !< !! q_cons_buff_recv is utilized to receive and unpack the buffer of the cell- !! average conservative variables, for a single computational domain boundary !! at the time, from the relevant neighboring processor. 
- real(kind(0d0)), private, allocatable, dimension(:) :: c_divs_buff_send !< + real(kind(0d0)), private, allocatable, dimension(:), target :: c_divs_buff_send !< !! c_divs_buff_send is utilized to send and unpack the buffer of the cell- !! centered color function derivatives, for a single computational domain !! boundary at the time, to the the relevant neighboring processor - real(kind(0d0)), private, allocatable, dimension(:) :: c_divs_buff_recv + real(kind(0d0)), private, allocatable, dimension(:), target :: c_divs_buff_recv !! c_divs_buff_recv is utilized to receiver and unpack the buffer of the cell- !! centered color function derivatives, for a single computational domain !! boundary at the time, from the relevant neighboring processor - integer, private, allocatable, dimension(:) :: ib_buff_send !< + integer, private, allocatable, dimension(:), target :: ib_buff_send !< !! This variable is utilized to pack and send the buffer of the immersed !! boundary markers, for a single computational domain boundary at the !! time, to the relevant neighboring processor. - integer, private, allocatable, dimension(:) :: ib_buff_recv !< + integer, private, allocatable, dimension(:), target :: ib_buff_recv !< !! q_cons_buff_recv is utilized to receive and unpack the buffer of the !! immersed boundary markers, for a single computational domain boundary !! at the time, from the relevant neighboring processor. @@ -866,6 +866,8 @@ contains integer :: pack_offsets(1:3), unpack_offsets(1:3) integer :: pack_offset, unpack_offset + real(kind(0d0)), pointer :: p_send, p_recv + integer, pointer, dimension(:) :: p_i_send, p_i_recv #ifdef MFC_MPI @@ -1065,19 +1067,23 @@ contains ! Send/Recv #:for rdma_mpi in [False, True] if (rdma_mpi .eqv. ${'.true.' if rdma_mpi else '.false.'}$) then + p_send => q_cons_buff_send(0) + p_recv => q_cons_buff_recv(0) #:if rdma_mpi - !$acc host_data use_device(q_cons_buff_recv, q_cons_buff_send, ib_buff_recv, ib_buff_send) + !$acc data attach(p_send, p_recv) + !$acc host_data use_device(p_send, p_recv) #:else !$acc update host(q_cons_buff_send, ib_buff_send) #:endif call MPI_SENDRECV( & - q_cons_buff_send(0), buffer_count, MPI_DOUBLE_PRECISION, dst_proc, send_tag, & - q_cons_buff_recv(0), buffer_count, MPI_DOUBLE_PRECISION, src_proc, recv_tag, & + p_send, buffer_count, MPI_DOUBLE_PRECISION, dst_proc, send_tag, & + p_recv, buffer_count, MPI_DOUBLE_PRECISION, src_proc, recv_tag, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) #:if rdma_mpi !$acc end host_data + !$acc end data !$acc wait #:else !$acc update device(q_cons_buff_recv) @@ -1268,6 +1274,7 @@ contains integer, intent(in) :: gp_layers integer :: i, j, k, l, r !< Generic loop iterators + integer, pointer, dimension(:) :: p_i_send, p_i_recv #ifdef MFC_MPI @@ -1309,19 +1316,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send, ib_buff_recv, ib_buff_send) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! 
Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(n + 1)*(p + 1), & MPI_INTEGER, bc_x%end, 0, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(n + 1)*(p + 1), & MPI_INTEGER, bc_x%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -1359,19 +1371,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send ) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(n + 1)*(p + 1), & MPI_INTEGER, bc_x%beg, 1, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(n + 1)*(p + 1), & MPI_INTEGER, bc_x%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -1431,19 +1448,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send ) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(n + 1)*(p + 1), & MPI_INTEGER, bc_x%beg, 1, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(n + 1)*(p + 1), & MPI_INTEGER, bc_x%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -1479,19 +1501,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send ) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(n + 1)*(p + 1), & MPI_INTEGER, bc_x%end, 0, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(n + 1)*(p + 1), & MPI_INTEGER, bc_x%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -1553,19 +1580,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send ) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(m + 2*gp_layers + 1)*(p + 1), & MPI_INTEGER, bc_y%end, 0, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(m + 2*gp_layers + 1)*(p + 1), & MPI_INTEGER, bc_y%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -1604,19 +1636,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send ) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! 
Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(m + 2*gp_layers + 1)*(p + 1), & MPI_INTEGER, bc_y%beg, 1, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(m + 2*gp_layers + 1)*(p + 1), & MPI_INTEGER, bc_y%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -1679,19 +1716,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send ) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(m + 2*gp_layers + 1)*(p + 1), & MPI_INTEGER, bc_y%beg, 1, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(m + 2*gp_layers + 1)*(p + 1), & MPI_INTEGER, bc_y%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -1730,19 +1772,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send ) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(m + 2*gp_layers + 1)*(p + 1), & MPI_INTEGER, bc_y%end, 0, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(m + 2*gp_layers + 1)*(p + 1), & MPI_INTEGER, bc_y%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -1808,19 +1855,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send ) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), & MPI_INTEGER, bc_z%end, 0, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), & MPI_INTEGER, bc_z%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -1859,19 +1911,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send ) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), & MPI_INTEGER, bc_z%beg, 1, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), & MPI_INTEGER, bc_z%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -1935,19 +1992,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send ) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! 
Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), & MPI_INTEGER, bc_z%beg, 1, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), & MPI_INTEGER, bc_z%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -1986,19 +2048,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send ) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), & MPI_INTEGER, bc_z%end, 0, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), & MPI_INTEGER, bc_z%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -2064,6 +2131,7 @@ contains integer :: pack_offsets(1:3), unpack_offsets(1:3) integer :: pack_offset, unpack_offset + real(kind(0d0)), pointer :: p_send, p_recv #ifdef MFC_MPI @@ -2158,19 +2226,24 @@ contains ! Send/Recv #:for rdma_mpi in [False, True] if (rdma_mpi .eqv. ${'.true.' if rdma_mpi else '.false.'}$) then + p_send => c_divs_buff_send(0) + p_recv => c_divs_buff_recv(0) + #:if rdma_mpi - !$acc host_data use_device(c_divs_buff_recv, c_divs_buff_send) + !$acc data attach(p_send, p_recv) + !$acc host_data use_device(p_send, p_recv) #:else !$acc update host(c_divs_buff_send) #:endif call MPI_SENDRECV( & - c_divs_buff_send(0), buffer_count, MPI_DOUBLE_PRECISION, dst_proc, send_tag, & - c_divs_buff_recv(0), buffer_count, MPI_DOUBLE_PRECISION, src_proc, recv_tag, & + p_send, buffer_count, MPI_DOUBLE_PRECISION, dst_proc, send_tag, & + p_recv, buffer_count, MPI_DOUBLE_PRECISION, src_proc, recv_tag, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) #:if rdma_mpi !$acc end host_data + !$acc end data !$acc wait #:else !$acc update device(c_divs_buff_recv) diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp index 078b1c6020..0e008c932c 100644 --- a/src/simulation/m_viscous.fpp +++ b/src/simulation/m_viscous.fpp @@ -1305,6 +1305,194 @@ contains end subroutine s_apply_scalar_divergence_theorem + !> Computes the scalar gradient fields via finite differences + !! @param var Variable to compute derivative of + !! @param grad_x First coordinate direction component of the derivative + !! @param grad_y Second coordinate direction component of the derivative + !! @param grad_z Third coordinate direction component of the derivative + !! 
@param norm Norm of the gradient vector + subroutine s_compute_fd_gradient(var, grad_x, grad_y, grad_z, & + ix, iy, iz, buff_size_in) + + type(scalar_field), intent(in) :: var + type(scalar_field), intent(inout) :: grad_x + type(scalar_field), intent(inout) :: grad_y + type(scalar_field), intent(inout) :: grad_z + type(int_bounds_info), intent(inout) :: ix, iy, iz + integer, intent(in) :: buff_size_in + + integer :: j, k, l !< Generic loop iterators + + ix%beg = -buff_size_in; ix%end = m + buff_size_in; + if (n > 0) then + iy%beg = -buff_size_in; iy%end = n + buff_size_in + else + iy%beg = -1; iy%end = 1 + end if + + if (p > 0) then + iz%beg = -buff_size_in; iz%end = p + buff_size_in + else + iz%beg = -1; iz%end = 1 + end if + + is1_viscous = ix; is2_viscous = iy; is3_viscous = iz + + !$acc update device(is1_viscous, is2_viscous, is3_viscous) + + !$acc parallel loop collapse(3) gang vector default(present) + do l = is3_viscous%beg + 1, is3_viscous%end - 1 + do k = is2_viscous%beg + 1, is2_viscous%end - 1 + do j = is1_viscous%beg + 1, is1_viscous%end - 1 + grad_x%sf(j, k, l) = & + (var%sf(j + 1, k, l) - var%sf(j - 1, k, l))/ & + (x_cc(j + 1) - x_cc(j - 1)) + end do + end do + end do + + if (n > 0) then + !$acc parallel loop collapse(3) gang vector + do l = is3_viscous%beg + 1, is3_viscous%end - 1 + do k = is2_viscous%beg + 1, is2_viscous%end - 1 + do j = is1_viscous%beg + 1, is1_viscous%end - 1 + grad_y%sf(j, k, l) = & + (var%sf(j, k + 1, l) - var%sf(j, k - 1, l))/ & + (y_cc(k + 1) - y_cc(k - 1)) + end do + end do + end do + end if + + if (p > 0) then + !$acc parallel loop collapse(3) gang vector + do l = is3_viscous%beg + 1, is3_viscous%end - 1 + do k = is2_viscous%beg + 1, is2_viscous%end - 1 + do j = is1_viscous%beg + 1, is1_viscous%end - 1 + grad_z%sf(j, k, l) = & + (var%sf(j, k, l + 1) - var%sf(j, k, l - 1))/ & + (z_cc(l + 1) - z_cc(l - 1)) + end do + end do + end do + end if + + is1_viscous%beg = -buff_size_in; is1_viscous%end = m + buff_size_in; + if (n > 0) then + is2_viscous%beg = -buff_size_in; is2_viscous%end = n + buff_size_in + else + is2_viscous%beg = 0; is2_viscous%end = 0 + end if + + if (p > 0) then + is3_viscous%beg = -buff_size_in; is3_viscous%end = p + buff_size_in + else + is3_viscous%beg = 0; is3_viscous%end = 0 + end if + + !$acc update device(is1_viscous, is2_viscous, is3_viscous) + + !$acc parallel loop collapse(2) gang vector default(present) + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg, is2_viscous%end + grad_x%sf(is1_viscous%beg, k, l) = & + (-3d0*var%sf(is1_viscous%beg, k, l) + 4d0*var%sf(is1_viscous%beg + 1, k, l) - var%sf(is1_viscous%beg + 2, k, l))/ & + (x_cc(is1_viscous%beg + 2) - x_cc(is1_viscous%beg)) + grad_x%sf(is1_viscous%end, k, l) = & + (3d0*var%sf(is1_viscous%end, k, l) - 4d0*var%sf(is1_viscous%end - 1, k, l) + var%sf(is1_viscous%end - 2, k, l))/ & + (x_cc(is1_viscous%end) - x_cc(is1_viscous%end - 2)) + end do + end do + if (n > 0) then + !$acc parallel loop collapse(2) gang vector default(present) + do l = is3_viscous%beg, is3_viscous%end + do j = is1_viscous%beg, is1_viscous%end + grad_y%sf(j, is2_viscous%beg, l) = & + (-3d0*var%sf(j, is2_viscous%beg, l) + 4d0*var%sf(j, is2_viscous%beg + 1, l) - var%sf(j, is2_viscous%beg + 2, l))/ & + (y_cc(is2_viscous%beg + 2) - y_cc(is2_viscous%beg)) + grad_y%sf(j, is2_viscous%end, l) = & + (3d0*var%sf(j, is2_viscous%end, l) - 4d0*var%sf(j, is2_viscous%end - 1, l) + var%sf(j, is2_viscous%end - 2, l))/ & + (y_cc(is2_viscous%end) - y_cc(is2_viscous%end - 2)) + end do + end do + if (p > 0) then 
+ !$acc parallel loop collapse(2) gang vector default(present) + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end + grad_z%sf(j, k, is3_viscous%beg) = & + (-3d0*var%sf(j, k, is3_viscous%beg) + 4d0*var%sf(j, k, is3_viscous%beg + 1) - var%sf(j, k, is3_viscous%beg + 2))/ & + (z_cc(is3_viscous%beg + 2) - z_cc(is3_viscous%beg)) + grad_z%sf(j, k, is3_viscous%end) = & + (3d0*var%sf(j, k, is3_viscous%end) - 4d0*var%sf(j, k, is3_viscous%end - 1) + var%sf(j, k, is3_viscous%end - 2))/ & + (z_cc(is3_viscous%end) - z_cc(is3_viscous%end - 2)) + end do + end do + end if + end if + + if (bc_x%beg <= -3) then + !$acc parallel loop collapse(2) gang vector default(present) + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg, is2_viscous%end + grad_x%sf(0, k, l) = (-3d0*var%sf(0, k, l) + 4d0*var%sf(1, k, l) - var%sf(2, k, l))/ & + (x_cc(2) - x_cc(0)) + end do + end do + end if + if (bc_x%end <= -3) then + !$acc parallel loop collapse(2) gang vector default(present) + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg, is2_viscous%end + grad_x%sf(m, k, l) = (3d0*var%sf(m, k, l) - 4d0*var%sf(m - 1, k, l) + var%sf(m - 2, k, l))/ & + (x_cc(m) - x_cc(m - 2)) + end do + end do + end if + if (n > 0) then + if (bc_y%beg <= -3 .and. bc_y%beg /= -13) then + !$acc parallel loop collapse(2) gang vector default(present) + do l = is3_viscous%beg, is3_viscous%end + do j = is1_viscous%beg, is1_viscous%end + grad_y%sf(j, 0, l) = (-3d0*var%sf(j, 0, l) + 4d0*var%sf(j, 1, l) - var%sf(j, 2, l))/ & + (y_cc(2) - y_cc(0)) + end do + end do + end if + if (bc_y%end <= -3) then + !$acc parallel loop collapse(2) gang vector default(present) + do l = is3_viscous%beg, is3_viscous%end + do j = is1_viscous%beg, is1_viscous%end + grad_y%sf(j, n, l) = (3d0*var%sf(j, n, l) - 4d0*var%sf(j, n - 1, l) + var%sf(j, n - 2, l))/ & + (y_cc(n) - y_cc(n - 2)) + end do + end do + end if + if (p > 0) then + if (bc_z%beg <= -3) then + !$acc parallel loop collapse(2) gang vector default(present) + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end + grad_z%sf(j, k, 0) = & + (-3d0*var%sf(j, k, 0) + 4d0*var%sf(j, k, 1) - var%sf(j, k, 2))/ & + (z_cc(2) - z_cc(0)) + end do + end do + end if + if (bc_z%end <= -3) then + !$acc parallel loop collapse(2) gang vector default(present) + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end + grad_z%sf(j, k, p) = & + (3d0*var%sf(j, k, p) - 4d0*var%sf(j, k, p - 1) + var%sf(j, k, p - 2))/ & + (z_cc(p) - z_cc(p - 2)) + end do + end do + end if + end if + end if + + end subroutine s_compute_fd_gradient + subroutine s_finalize_viscous_module() integer :: i diff --git a/toolchain/mfc/run/run.py b/toolchain/mfc/run/run.py index 96d39877e2..58cbdf9d07 100644 --- a/toolchain/mfc/run/run.py +++ b/toolchain/mfc/run/run.py @@ -93,7 +93,11 @@ def __get_template() -> Template: def __generate_job_script(targets, case: input.MFCInputFile): env = {} if ARG('gpus') is not None: - env['CUDA_VISIBLE_DEVICES'] = ','.join([str(_) for _ in ARG('gpus')]) + gpu_ids = ','.join([str(_) for _ in ARG('gpus')]) + env.update({ + 'CUDA_VISIBLE_DEVICES': gpu_ids, + 'HIP_VISIBLE_DEVICES': gpu_ids + }) content = __get_template().render( **{**ARGS(), 'targets': targets}, diff --git a/toolchain/mfc/test/case.py b/toolchain/mfc/test/case.py index d4fa948ef4..164ded4b7d 100644 --- a/toolchain/mfc/test/case.py +++ b/toolchain/mfc/test/case.py @@ -128,7 +128,7 @@ def run(self, targets: typing.List[typing.Union[str, MFCTarget]], gpus: 
typing.S *jobs, "-t", *target_names, *gpus_select, *ARG("--") ] - return common.system(command, print_cmd=False, text=True, capture_output=True) + return common.system(command, print_cmd=False, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) def get_uuid(self) -> str: return trace_to_uuid(self.trace) diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index e2417d4ea7..0d300e58bb 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -191,9 +191,9 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): h5dump = f"{HDF5.get_install_dirpath(MFCInputFile(os.path.basename(case.get_filepath()), case.get_dirpath(), case.get_parameters()))}/bin/h5dump" - if ARG("sys_hdf5"): + if not os.path.exists(h5dump or ""): if not does_command_exist("h5dump"): - raise MFCException("--sys-hdf5 was specified and h5dump couldn't be found.") + raise MFCException("h5dump couldn't be found.") h5dump = shutil.which("h5dump") diff --git a/toolchain/modules b/toolchain/modules index 0dc9576c87..d73c600773 100644 --- a/toolchain/modules +++ b/toolchain/modules @@ -47,11 +47,9 @@ p-cpu gcc/12.3.0 openmpi/4.1.5 p-gpu nvhpc/24.5 hpcx/2.19-cuda cuda/12.1.1 f OLCF Frontier -f-gpu rocm/5.5.1 craype-accel-amd-gfx90a -f-all cpe/23.09 -f-all cray-fftw cray-hdf5 cray-mpich/8.1.26 cce/16.0.1 -f-all rocm/5.5.1 cray-python omniperf -f-cpu +f-all cce/18.0.0 cpe/24.07 rocm/6.1.3 cray-mpich/8.1.28 +f-all cray-fftw cray-hdf5 cray-python omniperf +f-gpu craype-accel-amd-gfx90a d NCSA Delta d-all python/3.11.6
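The recurring change in m_fftw.fpp and m_mpi_proxy.fpp follows one pattern: the allocatable module arrays gain the TARGET attribute, a local Fortran pointer is associated with each array, the pointer is attached to the device data environment with !$acc data attach(...), and the device address is then taken from the pointer inside !$acc host_data use_device(...) rather than from the allocatable array itself. The sketch below condenses that pattern in one compilable unit; it is illustrative only, assumes an OpenACC Fortran compiler, and its names (attach_sketch, buf, p_buf, dev_addr) are placeholders rather than MFC symbols.

program attach_sketch
    use iso_c_binding, only: c_loc, c_ptr, c_double
    implicit none
    ! In the patch, the module buffers (data_real_gpu, q_cons_buff_send, ...) gain TARGET.
    real(c_double), allocatable, target :: buf(:)
    real(c_double), pointer :: p_buf(:)
    type(c_ptr) :: dev_addr

    allocate (buf(1024)); buf = 0d0
    !$acc enter data create(buf)

    ! Associate a pointer with the target array, attach it, and expose its
    ! device address inside a host_data region.
    p_buf => buf
    !$acc data attach(p_buf)
    !$acc host_data use_device(p_buf)
    ! Take the device address from the attached pointer; this stands in for
    ! the external calls made in the patch at this point.
    dev_addr = c_loc(p_buf)
    !$acc end host_data
    !$acc end data

    !$acc exit data delete(buf)
    deallocate (buf)
end program attach_sketch

In the patch itself, dev_addr corresponds to the c_loc(p_real)/c_loc(p_cmplx)/c_loc(p_fltr_cmplx) arguments handed to hipfftExecD2Z/hipfftExecZ2D, and the same attach-then-use_device sequence wraps the p_send/p_recv and p_i_send/p_i_recv buffers handed to MPI_SENDRECV on the RDMA path.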