From 86620317b0d0ea2d3cee6134fb3bbe16ab407e59 Mon Sep 17 00:00:00 2001 From: Henry LE BERRE Date: Sat, 24 Aug 2024 20:42:24 -0400 Subject: [PATCH] CCE 18 migration on OLCF Frontier Co-authored-by: Steve Abbott --- .github/workflows/frontier/build.sh | 2 +- .github/workflows/frontier/test.sh | 5 +- src/common/m_finite_differences.fpp | 5 + src/simulation/m_fftw.fpp | 34 ++--- src/simulation/m_mpi_proxy.fpp | 169 ++++++++++++++++++------- src/simulation/m_viscous.fpp | 188 ++++++++++++++++++++++++++++ toolchain/mfc/run/run.py | 6 +- toolchain/mfc/test/case.py | 2 +- toolchain/mfc/test/test.py | 4 +- toolchain/modules | 8 +- 10 files changed, 350 insertions(+), 73 deletions(-) diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index a6a51b65f0..e04f321e15 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -1,4 +1,4 @@ #!/bin/bash . ./mfc.sh load -c f -m g -./mfc.sh build -j 8 --gpu +./mfc.sh build -j 8 --gpu --sys-hdf5 --sys-fftw diff --git a/.github/workflows/frontier/test.sh b/.github/workflows/frontier/test.sh index fab53ef8e2..4ecae46b88 100644 --- a/.github/workflows/frontier/test.sh +++ b/.github/workflows/frontier/test.sh @@ -1,3 +1,6 @@ #!/bin/bash -./mfc.sh test -j 4 -a -- -c frontier +gpus=`rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' '` +ngpus=`echo "$gpus" | tr -d '[:space:]' | wc -c` + +./mfc.sh test -j $ngpus --sys-hdf5 --sys-fftw --gpus $gpus -- -c frontier diff --git a/src/common/m_finite_differences.fpp b/src/common/m_finite_differences.fpp index e5b9462802..6ef081f213 100644 --- a/src/common/m_finite_differences.fpp +++ b/src/common/m_finite_differences.fpp @@ -4,6 +4,11 @@ module m_finite_differences implicit none + private; + public :: s_compute_fd_laplacian, & + s_compute_fd_divergence, & + s_compute_finite_difference_coefficients + contains !> Computes the scalar gradient fields via finite differences diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp index 7650e89dce..58fb51be77 100644 --- a/src/simulation/m_fftw.fpp +++ b/src/simulation/m_fftw.fpp @@ -57,9 +57,9 @@ module m_fftw @:CRAY_DECLARE_GLOBAL(complex(kind(0d0)), dimension(:), data_fltr_cmplx_gpu) !$acc declare link(data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu) #else - real(kind(0d0)), allocatable :: data_real_gpu(:) - complex(kind(0d0)), allocatable :: data_cmplx_gpu(:) - complex(kind(0d0)), allocatable :: data_fltr_cmplx_gpu(:) + real(kind(0d0)), allocatable, target :: data_real_gpu(:) + complex(kind(0d0)), allocatable, target :: data_cmplx_gpu(:) + complex(kind(0d0)), allocatable, target :: data_fltr_cmplx_gpu(:) !$acc declare create(data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu) #endif @@ -141,7 +141,8 @@ contains subroutine s_apply_fourier_filter(q_cons_vf) type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf - + real(c_double), pointer :: p_real(:) + complex(c_double_complex), pointer :: p_cmplx(:), p_fltr_cmplx(:) integer :: i, j, k, l !< Generic loop iterators ! 
Restrict filter to processors that have cells adjacent to axis @@ -166,11 +167,16 @@ contains end do end do -!$acc host_data use_device(data_real_gpu, data_cmplx_gpu) + p_real => data_real_gpu + p_cmplx => data_cmplx_gpu + p_fltr_cmplx => data_fltr_cmplx_gpu + +!$acc data attach(p_real, p_cmplx, p_fltr_cmplx) +!$acc host_data use_device(p_real, p_cmplx, p_fltr_cmplx) #if defined(__PGI) ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu) #else - ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(data_real_gpu), c_loc(data_cmplx_gpu)) + ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx)) call hipCheck(hipDeviceSynchronize()) #endif !$acc end host_data @@ -186,11 +192,11 @@ contains end do end do -!$acc host_data use_device(data_real_gpu, data_fltr_cmplx_gpu) +!$acc host_data use_device(p_real, p_fltr_cmplx) #if defined(__PGI) ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu) #else - ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(data_fltr_cmplx_gpu), c_loc(data_real_gpu)) + ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real)) call hipCheck(hipDeviceSynchronize()) #endif !$acc end host_data @@ -225,11 +231,11 @@ contains end do end do -!$acc host_data use_device(data_real_gpu, data_cmplx_gpu) +!$acc host_data use_device(p_real, p_cmplx) #if defined(__PGI) ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu) #else - ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(data_real_gpu), c_loc(data_cmplx_gpu)) + ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx)) call hipCheck(hipDeviceSynchronize()) #endif !$acc end host_data @@ -246,11 +252,11 @@ contains end do end do -!$acc host_data use_device(data_real_gpu, data_fltr_cmplx_gpu) +!$acc host_data use_device(p_real, p_fltr_cmplx) #if defined(__PGI) ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu) #else - ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(data_fltr_cmplx_gpu), c_loc(data_real_gpu)) + ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real)) call hipCheck(hipDeviceSynchronize()) #endif !$acc end host_data @@ -297,8 +303,8 @@ contains end do end do #endif - - end subroutine s_apply_fourier_filter +!$acc end data + end subroutine s_apply_fourier_filter ! -------------------------------- !> The purpose of this subroutine is to destroy the fftw plan !! that will be used in the forward and backward DFTs when diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp index 292fe51a86..109d1e7a7f 100644 --- a/src/simulation/m_mpi_proxy.fpp +++ b/src/simulation/m_mpi_proxy.fpp @@ -43,32 +43,32 @@ module m_mpi_proxy !$acc declare link(ib_buff_send, ib_buff_recv) !$acc declare link(c_divs_buff_send, c_divs_buff_recv) #else - real(kind(0d0)), private, allocatable, dimension(:) :: q_cons_buff_send !< + real(kind(0d0)), private, allocatable, dimension(:), target :: q_cons_buff_send !< !! This variable is utilized to pack and send the buffer of the cell-average !! conservative variables, for a single computational domain boundary at the !! time, to the relevant neighboring processor. - real(kind(0d0)), private, allocatable, dimension(:) :: q_cons_buff_recv !< + real(kind(0d0)), private, allocatable, dimension(:), target :: q_cons_buff_recv !< !! q_cons_buff_recv is utilized to receive and unpack the buffer of the cell- !! average conservative variables, for a single computational domain boundary !! at the time, from the relevant neighboring processor. 
- real(kind(0d0)), private, allocatable, dimension(:) :: c_divs_buff_send !< + real(kind(0d0)), private, allocatable, dimension(:), target :: c_divs_buff_send !< !! c_divs_buff_send is utilized to send and unpack the buffer of the cell- !! centered color function derivatives, for a single computational domain !! boundary at the time, to the the relevant neighboring processor - real(kind(0d0)), private, allocatable, dimension(:) :: c_divs_buff_recv + real(kind(0d0)), private, allocatable, dimension(:), target :: c_divs_buff_recv !! c_divs_buff_recv is utilized to receiver and unpack the buffer of the cell- !! centered color function derivatives, for a single computational domain !! boundary at the time, from the relevant neighboring processor - integer, private, allocatable, dimension(:) :: ib_buff_send !< + integer, private, allocatable, dimension(:), target :: ib_buff_send !< !! This variable is utilized to pack and send the buffer of the immersed !! boundary markers, for a single computational domain boundary at the !! time, to the relevant neighboring processor. - integer, private, allocatable, dimension(:) :: ib_buff_recv !< + integer, private, allocatable, dimension(:), target :: ib_buff_recv !< !! q_cons_buff_recv is utilized to receive and unpack the buffer of the !! immersed boundary markers, for a single computational domain boundary !! at the time, from the relevant neighboring processor. @@ -866,6 +866,8 @@ contains integer :: pack_offsets(1:3), unpack_offsets(1:3) integer :: pack_offset, unpack_offset + real(kind(0d0)), pointer :: p_send, p_recv + integer, pointer, dimension(:) :: p_i_send, p_i_recv #ifdef MFC_MPI @@ -1065,19 +1067,23 @@ contains ! Send/Recv #:for rdma_mpi in [False, True] if (rdma_mpi .eqv. ${'.true.' if rdma_mpi else '.false.'}$) then + p_send => q_cons_buff_send(0) + p_recv => q_cons_buff_recv(0) #:if rdma_mpi - !$acc host_data use_device(q_cons_buff_recv, q_cons_buff_send, ib_buff_recv, ib_buff_send) + !$acc data attach(p_send, p_recv) + !$acc host_data use_device(p_send, p_recv) #:else !$acc update host(q_cons_buff_send, ib_buff_send) #:endif call MPI_SENDRECV( & - q_cons_buff_send(0), buffer_count, MPI_DOUBLE_PRECISION, dst_proc, send_tag, & - q_cons_buff_recv(0), buffer_count, MPI_DOUBLE_PRECISION, src_proc, recv_tag, & + p_send, buffer_count, MPI_DOUBLE_PRECISION, dst_proc, send_tag, & + p_recv, buffer_count, MPI_DOUBLE_PRECISION, src_proc, recv_tag, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) #:if rdma_mpi !$acc end host_data + !$acc end data !$acc wait #:else !$acc update device(q_cons_buff_recv) @@ -1268,6 +1274,7 @@ contains integer, intent(in) :: gp_layers integer :: i, j, k, l, r !< Generic loop iterators + integer, pointer, dimension(:) :: p_i_send, p_i_recv #ifdef MFC_MPI @@ -1309,19 +1316,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send, ib_buff_recv, ib_buff_send) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! 
Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(n + 1)*(p + 1), & MPI_INTEGER, bc_x%end, 0, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(n + 1)*(p + 1), & MPI_INTEGER, bc_x%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -1359,19 +1371,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send ) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(n + 1)*(p + 1), & MPI_INTEGER, bc_x%beg, 1, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(n + 1)*(p + 1), & MPI_INTEGER, bc_x%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -1431,19 +1448,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send ) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(n + 1)*(p + 1), & MPI_INTEGER, bc_x%beg, 1, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(n + 1)*(p + 1), & MPI_INTEGER, bc_x%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -1479,19 +1501,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send ) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(n + 1)*(p + 1), & MPI_INTEGER, bc_x%end, 0, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(n + 1)*(p + 1), & MPI_INTEGER, bc_x%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -1553,19 +1580,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send ) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(m + 2*gp_layers + 1)*(p + 1), & MPI_INTEGER, bc_y%end, 0, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(m + 2*gp_layers + 1)*(p + 1), & MPI_INTEGER, bc_y%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -1604,19 +1636,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send ) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! 
Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(m + 2*gp_layers + 1)*(p + 1), & MPI_INTEGER, bc_y%beg, 1, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(m + 2*gp_layers + 1)*(p + 1), & MPI_INTEGER, bc_y%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -1679,19 +1716,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send ) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(m + 2*gp_layers + 1)*(p + 1), & MPI_INTEGER, bc_y%beg, 1, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(m + 2*gp_layers + 1)*(p + 1), & MPI_INTEGER, bc_y%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -1730,19 +1772,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send ) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(m + 2*gp_layers + 1)*(p + 1), & MPI_INTEGER, bc_y%end, 0, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(m + 2*gp_layers + 1)*(p + 1), & MPI_INTEGER, bc_y%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -1808,19 +1855,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send ) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), & MPI_INTEGER, bc_z%end, 0, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), & MPI_INTEGER, bc_z%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -1859,19 +1911,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send ) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), & MPI_INTEGER, bc_z%beg, 1, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), & MPI_INTEGER, bc_z%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -1935,19 +1992,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send ) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! 
Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), & MPI_INTEGER, bc_z%beg, 1, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), & MPI_INTEGER, bc_z%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -1986,19 +2048,24 @@ contains #if defined(MFC_OpenACC) if (rdma_mpi) then - !$acc host_data use_device( ib_buff_recv, ib_buff_send ) + p_i_send => ib_buff_send + p_i_recv => ib_buff_recv + + !$acc data attach(p_i_send, p_i_recv) + !$acc host_data use_device(p_i_send, p_i_recv) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & - ib_buff_send(0), & + p_i_send(0), & gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), & MPI_INTEGER, bc_z%end, 0, & - ib_buff_recv(0), & + p_i_recv(0), & gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), & MPI_INTEGER, bc_z%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data + !$acc end data !$acc wait else #endif @@ -2064,6 +2131,7 @@ contains integer :: pack_offsets(1:3), unpack_offsets(1:3) integer :: pack_offset, unpack_offset + real(kind(0d0)), pointer :: p_send, p_recv #ifdef MFC_MPI @@ -2158,19 +2226,24 @@ contains ! Send/Recv #:for rdma_mpi in [False, True] if (rdma_mpi .eqv. ${'.true.' if rdma_mpi else '.false.'}$) then + p_send => c_divs_buff_send(0) + p_recv => c_divs_buff_recv(0) + #:if rdma_mpi - !$acc host_data use_device(c_divs_buff_recv, c_divs_buff_send) + !$acc data attach(p_send, p_recv) + !$acc host_data use_device(p_send, p_recv) #:else !$acc update host(c_divs_buff_send) #:endif call MPI_SENDRECV( & - c_divs_buff_send(0), buffer_count, MPI_DOUBLE_PRECISION, dst_proc, send_tag, & - c_divs_buff_recv(0), buffer_count, MPI_DOUBLE_PRECISION, src_proc, recv_tag, & + p_send, buffer_count, MPI_DOUBLE_PRECISION, dst_proc, send_tag, & + p_recv, buffer_count, MPI_DOUBLE_PRECISION, src_proc, recv_tag, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) #:if rdma_mpi !$acc end host_data + !$acc end data !$acc wait #:else !$acc update device(c_divs_buff_recv) diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp index 078b1c6020..0e008c932c 100644 --- a/src/simulation/m_viscous.fpp +++ b/src/simulation/m_viscous.fpp @@ -1305,6 +1305,194 @@ contains end subroutine s_apply_scalar_divergence_theorem + !> Computes the scalar gradient fields via finite differences + !! @param var Variable to compute derivative of + !! @param grad_x First coordinate direction component of the derivative + !! @param grad_y Second coordinate direction component of the derivative + !! @param grad_z Third coordinate direction component of the derivative + !! 
@param norm Norm of the gradient vector + subroutine s_compute_fd_gradient(var, grad_x, grad_y, grad_z, & + ix, iy, iz, buff_size_in) + + type(scalar_field), intent(in) :: var + type(scalar_field), intent(inout) :: grad_x + type(scalar_field), intent(inout) :: grad_y + type(scalar_field), intent(inout) :: grad_z + type(int_bounds_info), intent(inout) :: ix, iy, iz + integer, intent(in) :: buff_size_in + + integer :: j, k, l !< Generic loop iterators + + ix%beg = -buff_size_in; ix%end = m + buff_size_in; + if (n > 0) then + iy%beg = -buff_size_in; iy%end = n + buff_size_in + else + iy%beg = -1; iy%end = 1 + end if + + if (p > 0) then + iz%beg = -buff_size_in; iz%end = p + buff_size_in + else + iz%beg = -1; iz%end = 1 + end if + + is1_viscous = ix; is2_viscous = iy; is3_viscous = iz + + !$acc update device(is1_viscous, is2_viscous, is3_viscous) + + !$acc parallel loop collapse(3) gang vector default(present) + do l = is3_viscous%beg + 1, is3_viscous%end - 1 + do k = is2_viscous%beg + 1, is2_viscous%end - 1 + do j = is1_viscous%beg + 1, is1_viscous%end - 1 + grad_x%sf(j, k, l) = & + (var%sf(j + 1, k, l) - var%sf(j - 1, k, l))/ & + (x_cc(j + 1) - x_cc(j - 1)) + end do + end do + end do + + if (n > 0) then + !$acc parallel loop collapse(3) gang vector + do l = is3_viscous%beg + 1, is3_viscous%end - 1 + do k = is2_viscous%beg + 1, is2_viscous%end - 1 + do j = is1_viscous%beg + 1, is1_viscous%end - 1 + grad_y%sf(j, k, l) = & + (var%sf(j, k + 1, l) - var%sf(j, k - 1, l))/ & + (y_cc(k + 1) - y_cc(k - 1)) + end do + end do + end do + end if + + if (p > 0) then + !$acc parallel loop collapse(3) gang vector + do l = is3_viscous%beg + 1, is3_viscous%end - 1 + do k = is2_viscous%beg + 1, is2_viscous%end - 1 + do j = is1_viscous%beg + 1, is1_viscous%end - 1 + grad_z%sf(j, k, l) = & + (var%sf(j, k, l + 1) - var%sf(j, k, l - 1))/ & + (z_cc(l + 1) - z_cc(l - 1)) + end do + end do + end do + end if + + is1_viscous%beg = -buff_size_in; is1_viscous%end = m + buff_size_in; + if (n > 0) then + is2_viscous%beg = -buff_size_in; is2_viscous%end = n + buff_size_in + else + is2_viscous%beg = 0; is2_viscous%end = 0 + end if + + if (p > 0) then + is3_viscous%beg = -buff_size_in; is3_viscous%end = p + buff_size_in + else + is3_viscous%beg = 0; is3_viscous%end = 0 + end if + + !$acc update device(is1_viscous, is2_viscous, is3_viscous) + + !$acc parallel loop collapse(2) gang vector default(present) + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg, is2_viscous%end + grad_x%sf(is1_viscous%beg, k, l) = & + (-3d0*var%sf(is1_viscous%beg, k, l) + 4d0*var%sf(is1_viscous%beg + 1, k, l) - var%sf(is1_viscous%beg + 2, k, l))/ & + (x_cc(is1_viscous%beg + 2) - x_cc(is1_viscous%beg)) + grad_x%sf(is1_viscous%end, k, l) = & + (3d0*var%sf(is1_viscous%end, k, l) - 4d0*var%sf(is1_viscous%end - 1, k, l) + var%sf(is1_viscous%end - 2, k, l))/ & + (x_cc(is1_viscous%end) - x_cc(is1_viscous%end - 2)) + end do + end do + if (n > 0) then + !$acc parallel loop collapse(2) gang vector default(present) + do l = is3_viscous%beg, is3_viscous%end + do j = is1_viscous%beg, is1_viscous%end + grad_y%sf(j, is2_viscous%beg, l) = & + (-3d0*var%sf(j, is2_viscous%beg, l) + 4d0*var%sf(j, is2_viscous%beg + 1, l) - var%sf(j, is2_viscous%beg + 2, l))/ & + (y_cc(is2_viscous%beg + 2) - y_cc(is2_viscous%beg)) + grad_y%sf(j, is2_viscous%end, l) = & + (3d0*var%sf(j, is2_viscous%end, l) - 4d0*var%sf(j, is2_viscous%end - 1, l) + var%sf(j, is2_viscous%end - 2, l))/ & + (y_cc(is2_viscous%end) - y_cc(is2_viscous%end - 2)) + end do + end do + if (p > 0) then 
+ !$acc parallel loop collapse(2) gang vector default(present) + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end + grad_z%sf(j, k, is3_viscous%beg) = & + (-3d0*var%sf(j, k, is3_viscous%beg) + 4d0*var%sf(j, k, is3_viscous%beg + 1) - var%sf(j, k, is3_viscous%beg + 2))/ & + (z_cc(is3_viscous%beg + 2) - z_cc(is3_viscous%beg)) + grad_z%sf(j, k, is3_viscous%end) = & + (3d0*var%sf(j, k, is3_viscous%end) - 4d0*var%sf(j, k, is3_viscous%end - 1) + var%sf(j, k, is3_viscous%end - 2))/ & + (z_cc(is3_viscous%end) - z_cc(is3_viscous%end - 2)) + end do + end do + end if + end if + + if (bc_x%beg <= -3) then + !$acc parallel loop collapse(2) gang vector default(present) + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg, is2_viscous%end + grad_x%sf(0, k, l) = (-3d0*var%sf(0, k, l) + 4d0*var%sf(1, k, l) - var%sf(2, k, l))/ & + (x_cc(2) - x_cc(0)) + end do + end do + end if + if (bc_x%end <= -3) then + !$acc parallel loop collapse(2) gang vector default(present) + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg, is2_viscous%end + grad_x%sf(m, k, l) = (3d0*var%sf(m, k, l) - 4d0*var%sf(m - 1, k, l) + var%sf(m - 2, k, l))/ & + (x_cc(m) - x_cc(m - 2)) + end do + end do + end if + if (n > 0) then + if (bc_y%beg <= -3 .and. bc_y%beg /= -13) then + !$acc parallel loop collapse(2) gang vector default(present) + do l = is3_viscous%beg, is3_viscous%end + do j = is1_viscous%beg, is1_viscous%end + grad_y%sf(j, 0, l) = (-3d0*var%sf(j, 0, l) + 4d0*var%sf(j, 1, l) - var%sf(j, 2, l))/ & + (y_cc(2) - y_cc(0)) + end do + end do + end if + if (bc_y%end <= -3) then + !$acc parallel loop collapse(2) gang vector default(present) + do l = is3_viscous%beg, is3_viscous%end + do j = is1_viscous%beg, is1_viscous%end + grad_y%sf(j, n, l) = (3d0*var%sf(j, n, l) - 4d0*var%sf(j, n - 1, l) + var%sf(j, n - 2, l))/ & + (y_cc(n) - y_cc(n - 2)) + end do + end do + end if + if (p > 0) then + if (bc_z%beg <= -3) then + !$acc parallel loop collapse(2) gang vector default(present) + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end + grad_z%sf(j, k, 0) = & + (-3d0*var%sf(j, k, 0) + 4d0*var%sf(j, k, 1) - var%sf(j, k, 2))/ & + (z_cc(2) - z_cc(0)) + end do + end do + end if + if (bc_z%end <= -3) then + !$acc parallel loop collapse(2) gang vector default(present) + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end + grad_z%sf(j, k, p) = & + (3d0*var%sf(j, k, p) - 4d0*var%sf(j, k, p - 1) + var%sf(j, k, p - 2))/ & + (z_cc(p) - z_cc(p - 2)) + end do + end do + end if + end if + end if + + end subroutine s_compute_fd_gradient + subroutine s_finalize_viscous_module() integer :: i diff --git a/toolchain/mfc/run/run.py b/toolchain/mfc/run/run.py index 96d39877e2..58cbdf9d07 100644 --- a/toolchain/mfc/run/run.py +++ b/toolchain/mfc/run/run.py @@ -93,7 +93,11 @@ def __get_template() -> Template: def __generate_job_script(targets, case: input.MFCInputFile): env = {} if ARG('gpus') is not None: - env['CUDA_VISIBLE_DEVICES'] = ','.join([str(_) for _ in ARG('gpus')]) + gpu_ids = ','.join([str(_) for _ in ARG('gpus')]) + env.update({ + 'CUDA_VISIBLE_DEVICES': gpu_ids, + 'HIP_VISIBLE_DEVICES': gpu_ids + }) content = __get_template().render( **{**ARGS(), 'targets': targets}, diff --git a/toolchain/mfc/test/case.py b/toolchain/mfc/test/case.py index d4fa948ef4..164ded4b7d 100644 --- a/toolchain/mfc/test/case.py +++ b/toolchain/mfc/test/case.py @@ -128,7 +128,7 @@ def run(self, targets: typing.List[typing.Union[str, MFCTarget]], gpus: 
typing.S *jobs, "-t", *target_names, *gpus_select, *ARG("--") ] - return common.system(command, print_cmd=False, text=True, capture_output=True) + return common.system(command, print_cmd=False, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) def get_uuid(self) -> str: return trace_to_uuid(self.trace) diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index e2417d4ea7..0d300e58bb 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -191,9 +191,9 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): h5dump = f"{HDF5.get_install_dirpath(MFCInputFile(os.path.basename(case.get_filepath()), case.get_dirpath(), case.get_parameters()))}/bin/h5dump" - if ARG("sys_hdf5"): + if not os.path.exists(h5dump or ""): if not does_command_exist("h5dump"): - raise MFCException("--sys-hdf5 was specified and h5dump couldn't be found.") + raise MFCException("h5dump couldn't be found.") h5dump = shutil.which("h5dump") diff --git a/toolchain/modules b/toolchain/modules index 0dc9576c87..d73c600773 100644 --- a/toolchain/modules +++ b/toolchain/modules @@ -47,11 +47,9 @@ p-cpu gcc/12.3.0 openmpi/4.1.5 p-gpu nvhpc/24.5 hpcx/2.19-cuda cuda/12.1.1 f OLCF Frontier -f-gpu rocm/5.5.1 craype-accel-amd-gfx90a -f-all cpe/23.09 -f-all cray-fftw cray-hdf5 cray-mpich/8.1.26 cce/16.0.1 -f-all rocm/5.5.1 cray-python omniperf -f-cpu +f-all cce/18.0.0 cpe/24.07 rocm/6.1.3 cray-mpich/8.1.28 +f-all cray-fftw cray-hdf5 cray-python omniperf +f-gpu craype-accel-amd-gfx90a d NCSA Delta d-all python/3.11.6
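The recurring change in m_fftw.fpp and m_mpi_proxy.fpp follows one pattern: the allocatable module arrays gain the TARGET attribute, a local Fortran pointer is associated with each array, the pointer is attached to the device data environment with !$acc data attach(...), and the device address is then taken from the pointer inside !$acc host_data use_device(...) rather than from the allocatable array itself. The sketch below condenses that pattern in one compilable unit; it is illustrative only, assumes an OpenACC Fortran compiler, and its names (attach_sketch, buf, p_buf, dev_addr) are placeholders rather than MFC symbols.

program attach_sketch
    use iso_c_binding, only: c_loc, c_ptr, c_double
    implicit none
    ! In the patch, the module buffers (data_real_gpu, q_cons_buff_send, ...) gain TARGET.
    real(c_double), allocatable, target :: buf(:)
    real(c_double), pointer :: p_buf(:)
    type(c_ptr) :: dev_addr

    allocate (buf(1024)); buf = 0d0
    !$acc enter data create(buf)

    ! Associate a pointer with the target array, attach it, and expose its
    ! device address inside a host_data region.
    p_buf => buf
    !$acc data attach(p_buf)
    !$acc host_data use_device(p_buf)
    ! Take the device address from the attached pointer; this stands in for
    ! the external calls made in the patch at this point.
    dev_addr = c_loc(p_buf)
    !$acc end host_data
    !$acc end data

    !$acc exit data delete(buf)
    deallocate (buf)
end program attach_sketch

In the patch itself, dev_addr corresponds to the c_loc(p_real)/c_loc(p_cmplx)/c_loc(p_fltr_cmplx) arguments handed to hipfftExecD2Z/hipfftExecZ2D, and the same attach-then-use_device sequence wraps the p_send/p_recv and p_i_send/p_i_recv buffers handed to MPI_SENDRECV on the RDMA path.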