From 78d21f43e6ab0f0f147c9c0c7397bf8e35f185dd Mon Sep 17 00:00:00 2001
From: Igor Sfiligoi
Date: Tue, 11 Jun 2024 19:39:11 -0500
Subject: [PATCH] Optimize the upwind for CPU, by avoiding false sharing on write

---
 cgyro/src/cgyro_upwind.F90 | 96 ++++++++++++++++++++++++++++++++++----
 1 file changed, 88 insertions(+), 8 deletions(-)

diff --git a/cgyro/src/cgyro_upwind.F90 b/cgyro/src/cgyro_upwind.F90
index 623c09506..7adb1e244 100644
--- a/cgyro/src/cgyro_upwind.F90
+++ b/cgyro/src/cgyro_upwind.F90
@@ -29,18 +29,19 @@ subroutine cgyro_upwind_prepare_async_r64
 
   call timer_lib_in('str')
 
+#if defined(OMPGPU) || defined(_OPENACC)
+
+  ! running on the GPU
+  ! The vector/SIMT nature of the GPU allows us to efficiently compute both together
 #if defined(OMPGPU)
 ! No async for OMPGPU for now
 !$omp target teams distribute parallel do simd collapse(3) &
 !$omp& private(res_loc,iv,iv_loc,g_val)
-#elif defined(_OPENACC)
+#else
 !$acc parallel loop collapse(3) gang vector independent async(1) &
 !$acc& private(res_loc,iv,iv_loc,g_val) &
 !$acc& present(g_x,h_x,z,temp,jvec_c,field,upfac1,is_v,upwind_res_loc) &
 !$acc& present(nt1,nt2,ns1,ns2,nc,nv1,nv2,n_field) default(none)
-#else
-!$omp parallel do collapse(3) &
-!$omp& private(res_loc,iv,iv_loc)
 #endif
   do itor=nt1,nt2
      do is=ns1,ns2
@@ -63,6 +64,45 @@ subroutine cgyro_upwind_prepare_async_r64
      enddo
   enddo
 
+#else
+
+  ! running on the CPU
+  ! Split into two loops, to avoid false sharing on write
+!$omp parallel do collapse(3) &
+!$omp& private(g_val,iv,iv_loc,is)
+  do itor=nt1,nt2
+     do iv=nv1,nv2
+        do ic=1,nc
+           is = is_v(iv)
+           iv_loc = iv-nv1+1
+           g_val = h_x(ic,iv_loc,itor)
+           if (n_field > 1) then
+              g_val = g_val + &
+                   (z(is)/temp(is))*jvec_c(2,ic,iv_loc,itor)*field(2,ic,itor)
+           endif
+           g_x(ic,iv_loc,itor) = g_val
+        enddo
+     enddo
+  enddo
+!$omp parallel do collapse(3) &
+!$omp& private(res_loc,iv,iv_loc,is)
+  do itor=nt1,nt2
+     do is=ns1,ns2
+        do ic=1,nc
+           res_loc = (0.0,0.0)
+           do iv=nv1,nv2
+              iv_loc = iv-nv1+1
+              if (is == is_v(iv)) then
+                 res_loc = res_loc+upfac1(ic,iv_loc,itor)*g_x(ic,iv_loc,itor)
+              endif
+           enddo
+           upwind_res_loc(ic,is,itor) = res_loc
+        enddo
+     enddo
+  enddo
+
+#endif
+
   call timer_lib_out('str')
 
 end subroutine cgyro_upwind_prepare_async_r64
@@ -127,18 +167,19 @@ subroutine cgyro_upwind_prepare_async_r32
 
   call timer_lib_in('str')
 
+#if defined(OMPGPU) || defined(_OPENACC)
+
+  ! running on the GPU
+  ! The vector/SIMT nature of the GPU allows us to efficiently compute both together
 #if defined(OMPGPU)
 ! no sync for OMPGPU for now
 !$omp target teams distribute parallel do simd collapse(3) &
 !$omp& private(res_loc,iv,iv_loc,g_val)
-#elif defined(_OPENACC)
+#else
 !$acc parallel loop collapse(3) gang vector independent async(1) &
 !$acc& private(res_loc,iv,iv_loc,g_val) &
 !$acc& present(g_x,h_x,z,temp,jvec_c,field,upfac1,is_v,upwind32_res_loc) &
 !$acc& present(nt1,nt2,ns1,ns2,nc,nv1,nv2,n_field) default(none)
-#else
-!$omp parallel do collapse(3) &
-!$omp& private(res_loc,iv,iv_loc,g_val)
 #endif
   do itor=nt1,nt2
      do is=ns1,ns2
@@ -162,6 +203,45 @@ subroutine cgyro_upwind_prepare_async_r32
      enddo
   enddo
 
+#else
+
+  ! running on the CPU
+  ! Split into two loops, to avoid false sharing on write
+!$omp parallel do collapse(3) &
+!$omp& private(g_val,iv,iv_loc,is)
+  do itor=nt1,nt2
+     do iv=nv1,nv2
+        do ic=1,nc
+           is = is_v(iv)
+           iv_loc = iv-nv1+1
+           g_val = h_x(ic,iv_loc,itor)
+           if (n_field > 1) then
+              g_val = g_val + &
+                   (z(is)/temp(is))*jvec_c(2,ic,iv_loc,itor)*field(2,ic,itor)
+           endif
+           g_x(ic,iv_loc,itor) = g_val
+        enddo
+     enddo
+  enddo
+!$omp parallel do collapse(3) &
+!$omp& private(res_loc,iv,iv_loc,is)
+  do itor=nt1,nt2
+     do is=ns1,ns2
+        do ic=1,nc
+           res_loc = (0.0,0.0)
+           do iv=nv1,nv2
+              iv_loc = iv-nv1+1
+              if (is == is_v(iv)) then
+                 res_loc = res_loc+upfac1(ic,iv_loc,itor)*g_x(ic,iv_loc,itor)
+              endif
+           enddo
+           upwind32_res_loc(ic,is,itor) = res_loc
+        enddo
+     enddo
+  enddo
+
+#endif
+
   call timer_lib_out('str')
 
 end subroutine cgyro_upwind_prepare_async_r32
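
The split-loop pattern used in the CPU branches above can be sketched in isolation. Below is a minimal, self-contained OpenMP Fortran program illustrating the same idea; it is not part of the patch, and the array names (h, g, res, species) and sizes are hypothetical stand-ins for the CGYRO variables. The first pass writes g in its memory order (first index fastest), so each thread writes a contiguous block and threads do not share cache lines on write; the second pass keeps the reduction in a thread-private scalar and writes only the final value to res.

  ! Minimal sketch of avoiding false sharing by splitting a fused loop
  ! into two passes (hypothetical names, not CGYRO code).
  program split_loop_sketch
    implicit none
    integer, parameter :: nc = 64, nv = 32, ns = 4, nt = 2
    integer :: species(nv)                 ! plays the role of is_v
    complex :: h(nc,nv,nt), g(nc,nv,nt)    ! play the roles of h_x, g_x
    complex :: res(nc,ns,nt)               ! plays the role of upwind_res_loc
    complex :: acc
    integer :: itor, iv, ic, is

    ! hypothetical setup: assign each velocity index to a species, fill h
    species = [( mod(iv-1,ns)+1, iv=1,nv )]
    h = (1.0,0.0)

    ! pass 1: build g; the collapsed loop order (itor,iv,ic) matches the
    ! column-major layout of g, so each thread writes a contiguous slab
  !$omp parallel do collapse(3) private(itor,iv,ic)
    do itor=1,nt
       do iv=1,nv
          do ic=1,nc
             g(ic,iv,itor) = 2.0*h(ic,iv,itor)
          enddo
       enddo
    enddo

    ! pass 2: reduce over iv into a thread-private scalar; only the final
    ! result is written to res, again in memory order
  !$omp parallel do collapse(3) private(itor,is,ic,iv,acc)
    do itor=1,nt
       do is=1,ns
          do ic=1,nc
             acc = (0.0,0.0)
             do iv=1,nv
                if (species(iv) == is) acc = acc + g(ic,iv,itor)
             enddo
             res(ic,is,itor) = acc
          enddo
       enddo
    enddo

    print *, 'res(1,1,1) =', res(1,1,1)
  end program split_loop_sketch

The split costs a second sweep over g in exchange for contiguous, thread-local writes. The patch makes that trade only on the CPU path, where false sharing is expensive; the GPU path keeps the fused loop, since (as the added comment notes) the vector/SIMT execution model handles the combined computation efficiently.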