From ace0a44d5bddc36b05067816368869142cdbed46 Mon Sep 17 00:00:00 2001
From: Igor Sfiligoi <isfiligoi@sdsc.edu>
Date: Wed, 30 Oct 2024 10:09:50 -0700
Subject: [PATCH] Optimize rhs by removing large pre-computer arrays (#415)

* Switch id and ic in dtheta and icd_c, to improve memory access on GPUs

* Remove icd_c and dtheta, compute on the fly. 5x faster than original

* Make inner rhs loop iterative, remove complex int ops to improve CPU performance

* Remove blocks and add omp/acc private statements. Blocks not supported by nvfortran

* Minor cpu optimization, conditionally move loop

* Properly cleanup thfac_itor

---------

Co-authored-by: Igor Sfiligoi <igor.sfiligoi@ga.com>
---
 cgyro/src/cgyro_check_memory.F90 |   3 -
 cgyro/src/cgyro_cleanup.F90      |  16 ++---
 cgyro/src/cgyro_globals.F90      |   4 +-
 cgyro/src/cgyro_init_arrays.F90  |  49 +++++++------
 cgyro/src/cgyro_init_manager.F90 |   3 -
 cgyro/src/cgyro_rhs.F90          | 119 ++++++++++++++++++++++++-------
 6 files changed, 123 insertions(+), 71 deletions(-)

diff --git a/cgyro/src/cgyro_check_memory.F90 b/cgyro/src/cgyro_check_memory.F90
index fe44f9e21..0d52afbe4 100644
--- a/cgyro/src/cgyro_check_memory.F90
+++ b/cgyro/src/cgyro_check_memory.F90
@@ -24,9 +24,6 @@ subroutine cgyro_check_memory(datafile)
      call cgyro_alloc_add(io,nc*4.0,'it_c')
      call cgyro_alloc_add(io,n_radial*n_theta*4.0,'ic_c')
      call cgyro_alloc_add(io,n_energy*n_xi*n_species*4.0,'iv_v')
-     call cgyro_alloc_add(io,nc*(2*nup_theta+1)*nt_loc*4.0,'icd_c')
-     call cgyro_alloc_add_3d(io,(2*nup_theta+1),nc,nt_loc,16,'dtheta')
-     call cgyro_alloc_add_3d(io,(2*nup_theta+1),nc,nt_loc,16,'dtheta_up')
 
      write(io,*)
      write(io,*) 'Fields and field solve'
diff --git a/cgyro/src/cgyro_cleanup.F90 b/cgyro/src/cgyro_cleanup.F90
index d868bd20a..0f3d0dd68 100644
--- a/cgyro/src/cgyro_cleanup.F90
+++ b/cgyro/src/cgyro_cleanup.F90
@@ -125,22 +125,14 @@ subroutine cgyro_cleanup
   if(allocated(cflux_tave))          deallocate(cflux_tave)
   if(allocated(gflux_tave))          deallocate(gflux_tave)
   if(allocated(recv_status))         deallocate(recv_status)
-  if(allocated(icd_c))  then
-     ccl_del_device(icd_c)     
-     deallocate(icd_c)
-  endif
-  if(allocated(dtheta)) then
-     ccl_del_device(dtheta)     
-     deallocate(dtheta)
-  endif
-  if(allocated(dtheta_up))  then
-     ccl_del_device(dtheta_up)     
-     deallocate(dtheta_up)
-  endif
   if(allocated(source)) then
       ccl_del_device(source)      
      deallocate(source)
   endif
+  if(allocated(thfac_itor)) then
+      ccl_del_device(thfac_itor)
+     deallocate(thfac_itor)
+  endif
   if(allocated(h0_old)) then
      ccl_del_device(h0_old)      
      deallocate(h0_old)
diff --git a/cgyro/src/cgyro_globals.F90 b/cgyro/src/cgyro_globals.F90
index c91c3bdf0..a990e88f1 100644
--- a/cgyro/src/cgyro_globals.F90
+++ b/cgyro/src/cgyro_globals.F90
@@ -328,9 +328,7 @@ module cgyro_globals
   real, dimension(:), allocatable :: theta
   real, dimension(:), allocatable :: uderiv
   real, dimension(:), allocatable :: cderiv
-  integer, dimension(:,:,:), allocatable :: icd_c
-  complex, dimension(:,:,:), allocatable :: dtheta
-  complex, dimension(:,:,:), allocatable :: dtheta_up
+  complex, dimension(:,:), allocatable :: thfac_itor
   !
   ! Wavenumber advection
   integer :: source_flag
diff --git a/cgyro/src/cgyro_init_arrays.F90 b/cgyro/src/cgyro_init_arrays.F90
index e4b156a23..b5318f4f2 100644
--- a/cgyro/src/cgyro_init_arrays.F90
+++ b/cgyro/src/cgyro_init_arrays.F90
@@ -17,7 +17,7 @@ subroutine cgyro_init_arrays
   integer :: i_field
   integer :: l,ll
   integer :: iltheta_min
-  complex :: thfac,carg
+  complex :: carg
   complex :: jval
   real, dimension(:,:,:), allocatable :: res_loc
   real, dimension(:,:,:), allocatable :: jloc_c
@@ -365,32 +365,35 @@ subroutine cgyro_init_arrays
   !-------------------------------------------------------------------------
   ! Streaming coefficient arrays
   !
+  allocate(thfac_itor(0:2,nt1:nt2))
+
   do itor=nt1,nt2
-   do ir=1,n_radial
-     do it=1,n_theta
-        do id=-nup_theta,nup_theta
-           jt = modulo(it+id-1,n_theta)+1
-           if (it+id < 1) then
-              thfac = exp(2*pi*i_c*k_theta_base*itor*rmin)
-              jr = modulo(ir-itor*box_size*sign_qs-1,n_radial)+1
-           else if (it+id > n_theta) then
-              thfac = exp(-2*pi*i_c*k_theta_base*itor*rmin)
-              jr = modulo(ir+itor*box_size*sign_qs-1,n_radial)+1
-           else
-              thfac = (1.0,0.0)
-              jr = ir
-           endif
-           dtheta(id, ic_c(ir,it), itor)    = cderiv(id)*thfac
-           dtheta_up(id, ic_c(ir,it), itor) = uderiv(id)*thfac*up_theta
-           icd_c(id, ic_c(ir,it), itor)     = ic_c(jr,modulo(it+id-1,n_theta)+1)
-        enddo
-     enddo
-   enddo
+   thfac_itor(0,itor) = exp(2*pi*i_c*k_theta_base*itor*rmin)
+   thfac_itor(1,itor) = (1.0,0.0)
+   thfac_itor(2,itor) = exp(-2*pi*i_c*k_theta_base*itor*rmin)
+   !do ir=1,n_radial
+   !  do it=1,n_theta
+   !     do id=-nup_theta,nup_theta
+   !        ! jt = modulo(it+id-1,n_theta)+1
+   !        if (it+id < 1) then
+   !           jr = modulo(ir-itor*box_size*sign_qs-1,n_radial)+1
+   !        else if (it+id > n_theta) then
+   !           jr = modulo(ir+itor*box_size*sign_qs-1,n_radial)+1
+   !        else
+   !           jr = ir
+   !        endif
+   !        thfac = thfac_itor((n_theta+it+id-1)/n_theta,itor)
+   !        dtheta(ic_c(ir,it), id, itor)    = cderiv(id)*thfac
+   !        dtheta_up(ic_c(ir,it), id, itor) = uderiv(id)*thfac*up_theta
+   !        icd_c(ic_c(ir,it), id, itor)     = ic_c(jr,modulo(it+id-1,n_theta)+1)
+   !     enddo
+   !  enddo
+   !enddo
   enddo
 #if defined(OMPGPU)
-!$omp target enter data map(to:dtheta,dtheta_up,icd_c,c_wave)
+!$omp target enter data map(to:c_wave,thfac_itor,cderiv,uderiv)
 #elif defined(_OPENACC)
-!$acc enter data copyin(dtheta,dtheta_up,icd_c,c_wave)
+!$acc enter data copyin(c_wave,thfac_itor,cderiv,uderiv)
 #endif
 
   ! Streaming coefficients (for speed optimization)
diff --git a/cgyro/src/cgyro_init_manager.F90 b/cgyro/src/cgyro_init_manager.F90
index aec0a8149..2c8b1ba3b 100644
--- a/cgyro/src/cgyro_init_manager.F90
+++ b/cgyro/src/cgyro_init_manager.F90
@@ -183,9 +183,6 @@ subroutine cgyro_init_manager
      
      allocate(recv_status(MPI_STATUS_SIZE))
 
-     allocate(icd_c(-nup_theta:nup_theta, nc ,nt1:nt2))
-     allocate(dtheta(-nup_theta:nup_theta, nc ,nt1:nt2))
-     allocate(dtheta_up(-nup_theta:nup_theta, nc,nt1:nt2))
      allocate(source(n_theta,nv_loc,nt1:nt2))
 
 #if defined(OMPGPU)
diff --git a/cgyro/src/cgyro_rhs.F90 b/cgyro/src/cgyro_rhs.F90
index f42e6d12b..e8b538340 100644
--- a/cgyro/src/cgyro_rhs.F90
+++ b/cgyro/src/cgyro_rhs.F90
@@ -84,13 +84,16 @@ subroutine cgyro_rhs_comp1(ij,update_cap)
 #if defined(OMPGPU)
   ! no async for OMPGPU for now
 !$omp target teams distribute parallel do simd collapse(3) &
+!$omp&  firstprivate(nv2,nv1,nt2,nt1,nc,update_cap,ij) &
 !$omp&  private(iv,ic,iv_loc,rhs_el,h_el,is,cap_el,my_psi)
 #elif defined(_OPENACC)
-!$acc  parallel loop gang vector collapse(3) & 
-!$acc& private(iv,ic,iv_loc,rhs_el,h_el,is,cap_el,my_psi) async(1)
+!$acc parallel loop gang vector collapse(3) async(1) &
+!$acc&  firstprivate(nv2,nv1,nt2,nt1,nc,update_cap,ij) &
+!$acc&  private(iv,ic,iv_loc,rhs_el,h_el,is,cap_el,my_psi)
 #else
 !$omp parallel do collapse(2) &
-!$omp& private(iv,iv_loc,itor,is,ic,rhs_el,h_el,cap_el,my_psi) 
+!$omp&  firstprivate(nv2,nv1,nt2,nt1,nc,update_cap,ij) &
+!$omp&  private(iv,iv_loc,itor,is,ic,rhs_el,h_el,cap_el,my_psi) 
 #endif
   do itor=nt1,nt2
    do iv=nv1,nv2
@@ -141,9 +144,21 @@ subroutine cgyro_rhs_comp2(ij)
 
   integer, intent(in) :: ij
   !--------------------------------
-  integer :: is,itor
-  integer :: id,jc
-  real :: rval,rval2
+  integer :: itor,ir,it
+  ! ir loop specific
+  integer :: itorbox
+  !integer :: iv_loc
+  integer :: is
+  integer :: jr0(0:2)   ! n_theta*(pre-compute jr-1)
+  real :: vel_xi
+  ! it loop specific
+  !integer :: ic
+  integer :: id
+  integer :: itd   ! precompute modulo(it+id-1,n_theta)+1, use for iteration
+  integer :: itd_class
+  integer :: jc
+  real :: rval,rval2,rval2s
+  complex :: thfac
   complex :: rhs_stream
 
 
@@ -157,40 +172,90 @@ subroutine cgyro_rhs_comp2(ij)
 !$acc& present(cap_h_c) &
 !$acc& present(is_v,ix_v,ie_v,it_c) &
 !$acc& present(omega_stream,xi,vel) &
-!$acc& present(dtheta,dtheta_up,icd_c)
+!$acc& present(thfac_itor,cderiv,uderiv)
 
 #endif
   ! add stream to rhs
 #if defined(OMPGPU)
   ! no async for OMPGPU for now
-!$omp target teams distribute parallel do simd collapse(3) &
-!$omp&  private(iv,ic,iv_loc,is,rval,rval2,rhs_stream,id,jc)
+!$omp target teams distribute parallel do simd collapse(4) &
+!$omp&  firstprivate(n_radial,nv2,nv1,nt2,nt1,n_theta) &
+!$omp&  firstprivate(sign_qs,nup_theta,ij,box_size,up_theta) &
+!$omp&  private(itor,iv,ir,it) &
+!$omp&  private(itorbox,iv_loc,is,jr0,vel_xi) &
+!$omp&  private(ic,id,itd,itd_class,jc,rval,rval2,rval2s,thfac,rhs_stream)
 #elif defined(_OPENACC)
-!$acc  parallel loop gang vector collapse(3) & 
-!$acc& private(iv,ic,iv_loc,is,rval,rval2,rhs_stream,id,jc) async(1)
+!$acc  parallel loop gang vector collapse(4) async(1) &
+!$acc&  firstprivate(n_radial,nv2,nv1,nt2,nt1,n_theta) &
+!$acc&  firstprivate(sign_qs,nup_theta,ij,box_size,up_theta) &
+!$acc&  private(itor,iv,ir,it) &
+!$acc&  private(itorbox,iv_loc,is,jr0,vel_xi) &
+!$acc&  private(ic,id,itd,itd_class,jc,rval,rval2,rval2s,thfac,rhs_stream)
 #else
-!$omp parallel do collapse(2) &
-!$omp& private(itor,iv,iv_loc,is,ic,rval,rval2,rhs_stream,id,jc) 
+!$omp parallel do collapse(3) &
+!$omp&  firstprivate(n_radial,nv2,nv1,nt2,nt1,n_theta) &
+!$omp&  firstprivate(sign_qs,nup_theta,ij,box_size,up_theta) &
+!$omp&  private(itor,iv,ir,it) &
+!$omp&  private(itorbox,iv_loc,is,jr0,vel_xi)
 #endif
   do itor=nt1,nt2
    do iv=nv1,nv2
-     do ic=1,nc
+    do ir=1,n_radial
+#if defined(OMPGPU) || defined(_OPENACC)
+     ! keep loop high for maximal collapse
+     do it=1,n_theta
+#endif
+        itorbox = itor*box_size*sign_qs
         iv_loc = iv-nv1+1
-        is = is_v(iv)
-        ! Parallel streaming with upwind dissipation 
-        rval  = omega_stream(it_c(ic),is,itor)*vel(ie_v(iv))*xi(ix_v(iv))
-        rval2 = abs(omega_stream(it_c(ic),is,itor))
-
-        rhs_stream = 0.0
-        do id=-nup_theta,nup_theta
-           jc = icd_c(id, ic, itor)
-           rhs_stream = rhs_stream &
-                -rval*dtheta(id,ic,itor)*cap_h_c(jc,iv_loc,itor)  &
-                -rval2*dtheta_up(id,ic,itor)*g_x(jc,iv_loc,itor)
-        enddo
 
-        rhs(ic,iv_loc,itor,ij) = rhs(ic,iv_loc,itor,ij) + rhs_stream
+        is = is_v(iv)
+        vel_xi = vel(ie_v(iv))*xi(ix_v(iv))
+        jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial)
+        jr0(1) = n_theta*(ir-1)
+        jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial)
+
+#if !(defined(OMPGPU) || defined(_OPENACC))
+        ! loop as late as possible, to minimize recompute
+!$omp loop &
+!$omp&  private(ic,id,itd,itd_class,jc,rval,rval2,rval2s,thfac,rhs_stream)
+        do it=1,n_theta
+#endif
+          ic = (ir-1)*n_theta + it ! ic_c(ir,it)
+
+          ! Parallel streaming with upwind dissipation 
+          rval2s = omega_stream(it,is,itor)
+          rval  = rval2s*vel_xi
+          rval2 = abs(rval2s)
+
+          rhs_stream = 0.0
+
+          !icd_c(ic, id, itor)     = ic_c(jr,modulo(it+id-1,n_theta)+1)
+          !jc = icd_c(ic, id, itor)
+          !dtheta(ic, id, itor)    := cderiv(id)*thfac
+          !dtheta_up(ic, id, itor) := uderiv(id)*thfac*up_theta
+          itd = n_theta+it-nup_theta
+          itd_class = 0
+          jc = jr0(itd_class)+itd
+          thfac = thfac_itor(itd_class,itor)
+          do id=-nup_theta,nup_theta
+              if (itd > n_theta) then
+                ! move to next itd_class of compute
+                itd = itd - n_theta
+                itd_class = itd_class + 1
+                jc = jr0(itd_class)+itd
+                thfac = thfac_itor(itd_class,itor)
+              endif
+              rhs_stream = rhs_stream &
+                - thfac  &
+                  * ( rval*cderiv(id)*cap_h_c(jc,iv_loc,itor)  &
+                    + rval2*uderiv(id)*up_theta*g_x(jc,iv_loc,itor) )
+              itd = itd + 1
+              jc = jc + 1
+          enddo
+
+          rhs(ic,iv_loc,itor,ij) = rhs(ic,iv_loc,itor,ij) + rhs_stream
      enddo
+    enddo
    enddo
   enddo