diff --git a/cgyro/src/cgyro_check_memory.F90 b/cgyro/src/cgyro_check_memory.F90
index 84a821f28..304b7e798 100644
--- a/cgyro/src/cgyro_check_memory.F90
+++ b/cgyro/src/cgyro_check_memory.F90
@@ -106,6 +106,9 @@ subroutine cgyro_check_memory(datafile)
      call cgyro_alloc_add_4d(io,n_field,nc,nv_loc,nt_loc,16,'omega_ss')
      call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'omega_cap_h')
      call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'omega_h')
+     if (triad_print_flag == 1) then
+       call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'diss_r')
+     endif
      call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'h_x')
      call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'h0_x')
      call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'g_x')
@@ -133,6 +136,12 @@ subroutine cgyro_check_memory(datafile)
         call cgyro_alloc_add_3d(io,n_radial,nt_loc,nsplitA*n_toroidal_procs,16,'fpackA')
         call cgyro_alloc_add_3d(io,n_radial,nt_loc,nsplitB*n_toroidal_procs,16,'fpackB')
         call cgyro_alloc_add_4d(io,n_field,n_radial,n_jtheta,n_toroidal,16,'gpack')
+        if (triad_print_flag == 1) then
+          call cgyro_alloc_add_4d(io,n_radial,nt_loc,nsplitA,n_toroidal_procs,16,'eA_nl')
+          call cgyro_alloc_add_4d(io,n_radial,nt_loc,nsplitB,n_toroidal_procs,16,'eB_nl')
+          call cgyro_alloc_add_3d(io,n_radial,nt_loc,nsplitA*n_toroidal_procs,16,'epackA')
+          call cgyro_alloc_add_3d(io,n_radial,nt_loc,nsplitB*n_toroidal_procs,16,'epackB')
+        endif
      endif
 
      write(io,*)
diff --git a/cgyro/src/cgyro_cleanup.F90 b/cgyro/src/cgyro_cleanup.F90
index 7828e8bf5..177021040 100644
--- a/cgyro/src/cgyro_cleanup.F90
+++ b/cgyro/src/cgyro_cleanup.F90
@@ -43,21 +43,33 @@ subroutine cgyro_cleanup
      ccl_del_device(vel2)
      deallocate(vel2)
   endif
-  if(allocated(w_e))           deallocate(w_e)
+  if(allocated(w_e))   then
+     ccl_del_device(w_e)
+     deallocate(w_e)
+  endif
   if(allocated(e_deriv1_mat))  deallocate(e_deriv1_mat)
   if(allocated(e_deriv1_rot_mat))  deallocate(e_deriv1_rot_mat)
   if(allocated(xi))            then
      ccl_del_device(xi)
      deallocate(xi)
   endif
-  if(allocated(w_xi))          deallocate(w_xi)
-  if(allocated(w_exi))         deallocate(w_exi)
+  if(allocated(w_xi))   then
+     ccl_del_device(w_xi)
+     deallocate(w_xi)
+  endif
+  if(allocated(w_exi))   then
+     ccl_del_device(w_exi)
+     deallocate(w_exi)
+  endif
   if(allocated(xi_lor_mat))    deallocate(xi_lor_mat)
   if(allocated(xi_deriv_mat))  deallocate(xi_deriv_mat)
   
   if(allocated(theta))          deallocate(theta)
   if(allocated(thetab))         deallocate(thetab)
-  if(allocated(w_theta))        deallocate(w_theta)
+  if(allocated(w_theta))   then
+     ccl_del_device(w_theta)
+     deallocate(w_theta)
+  endif
   if(allocated(g_theta))        deallocate(g_theta)
   if(allocated(g_theta_geo))    deallocate(g_theta_geo)
   if(allocated(bmag))           deallocate(bmag)
@@ -83,7 +95,10 @@ subroutine cgyro_cleanup
   if(allocated(lambda_rot))          deallocate(lambda_rot)
   if(allocated(dlambda_rot))         deallocate(dlambda_rot)
   if(allocated(dens_rot))            deallocate(dens_rot)
-  if(allocated(dens2_rot))           deallocate(dens2_rot)
+  if(allocated(dens2_rot))   then
+     ccl_del_device(dens2_rot)
+     deallocate(dens2_rot)
+  endif
   if(allocated(dens_ele_rot))        deallocate(dens_ele_rot)
   if(allocated(dens_avg_rot))        deallocate(dens_avg_rot)
   if(allocated(dlnndr_avg_rot))      deallocate(dlnndr_avg_rot)
@@ -137,6 +152,18 @@ subroutine cgyro_cleanup
       ccl_del_device(source)      
      deallocate(source)
   endif
+  if(allocated(triad)) then
+      ccl_del_device(triad)      
+     deallocate(triad)
+  endif
+  if(allocated(triad_loc)) then
+      ccl_del_device(triad_loc)      
+     deallocate(triad_loc)
+  endif
+  if(allocated(triad_loc_old)) then
+      ccl_del_device(triad_loc_old)      
+     deallocate(triad_loc_old)
+  endif
   if(allocated(thfac_itor)) then
       ccl_del_device(thfac_itor)
      deallocate(thfac_itor)
@@ -201,6 +228,10 @@ subroutine cgyro_cleanup
      ccl_del_device(omega_sbeta)        
      deallocate(omega_sbeta)
   endif
+  if(allocated(diss_r)) then
+     ccl_del_device(diss_r)        
+     deallocate(diss_r)
+  endif
   if(allocated(jvec_c))  then
      ccl_del_device(jvec_c)     
      deallocate(jvec_c)
@@ -261,6 +292,14 @@ subroutine cgyro_cleanup
      ccl_del_device(g_nl)
      deallocate(g_nl)
   endif
+  if(allocated(eA_nl))   then
+     ccl_del_device(eA_nl)
+     deallocate(eA_nl)
+  endif
+  if(allocated(eB_nl))   then
+     ccl_del_device(eB_nl)
+     deallocate(eB_nl)
+  endif
   if(allocated(fpackA))   then
      ccl_del_device(fpackA)
      deallocate(fpackA)
@@ -273,6 +312,14 @@ subroutine cgyro_cleanup
      ccl_del_device(gpack)
      deallocate(gpack)
   endif
+  if(allocated(epackA))   then
+     ccl_del_device(epackA)
+     deallocate(epackA)
+  endif
+  if(allocated(epackB))   then
+     ccl_del_device(epackB)
+     deallocate(epackB)
+  endif
   if (allocated(cmat)) then
      ccl_del_bigdevice(cmat)
      deallocate(cmat)
diff --git a/cgyro/src/cgyro_equilibrium.F90 b/cgyro/src/cgyro_equilibrium.F90
index 453ea1681..a4edcfd6a 100644
--- a/cgyro/src/cgyro_equilibrium.F90
+++ b/cgyro/src/cgyro_equilibrium.F90
@@ -277,9 +277,9 @@ subroutine cgyro_equilibrium
   enddo
 
 #if defined(OMPGPU)
-!$omp target enter data map(to:xi,omega_stream)
+!$omp target enter data map(to:xi,omega_stream,w_theta,dens2_rot)
 #elif defined(_OPENACC)
-!$acc enter data copyin(xi,omega_stream)
+!$acc enter data copyin(xi,omega_stream,w_theta,dens2_rot)
 #endif
 
 #if defined(OMPGPU) || defined(_OPENACC)
diff --git a/cgyro/src/cgyro_flux.f90 b/cgyro/src/cgyro_flux.f90
index bfdc2f728..b9622a165 100644
--- a/cgyro/src/cgyro_flux.f90
+++ b/cgyro/src/cgyro_flux.f90
@@ -194,8 +194,9 @@ subroutine cgyro_flux
   ! 2-1. Compute Triad energy transfer
   !-------------------------------------------------------------
 
-     kx = 2*pi*rho/length
-     do is=1,n_species
+     if (triad_print_flag == 1) then
+      kx = 2*pi*rho/length
+      do is=1,n_species
         ! Triad energy transfer   :   T_k
         triad_loc_old(is,:,itor,1) = triad_loc(is,:,itor,1) *temp(is)/dlntdr(is_ele)
         ! From Nonzonal Triad energy transfer   :   T_k [NZ(k',k")->k]
@@ -214,7 +215,8 @@ subroutine cgyro_flux
         triad_loc_old(is,:,itor,7) = triad_loc(is,:,itor,6) *temp(is)/dlntdr(is_ele)
         ! Diss. (Coll. )
         triad_loc_old(is,:,itor,8) = triad_loc(is,:,itor,7) *temp(is)/dlntdr(is_ele)
-     enddo
+      enddo
+     endif
 
 
      !-----------------------------------------------------
@@ -279,14 +281,16 @@ subroutine cgyro_flux
        NEW_COMM_1, &
        i_err)
 
-  ! Reduced complex triad(ns,kx), below, is still distributed over n 
-  call MPI_ALLREDUCE(triad_loc_old(:,:,:,:), &
+  if (triad_print_flag == 1) then
+   ! Reduced complex triad(ns,kx), below, is still distributed over n 
+   call MPI_ALLREDUCE(triad_loc_old(:,:,:,:), &
        triad, &
        size(triad), &
        MPI_DOUBLE_COMPLEX, &
        MPI_SUM, &
        NEW_COMM_1, &
        i_err)
+  endif
 
 
   tave_step = tave_step + 1
diff --git a/cgyro/src/cgyro_init_arrays.F90 b/cgyro/src/cgyro_init_arrays.F90
index 7b0b63b46..09670ca24 100644
--- a/cgyro/src/cgyro_init_arrays.F90
+++ b/cgyro/src/cgyro_init_arrays.F90
@@ -448,12 +448,14 @@ subroutine cgyro_init_arrays
                 + abs(omega_rot_drift_r(it,is)) &
                 + abs(omega_rot_edrift_r(it)))          
 
-            ! (d/dr) upwind dissipation for triad energy transfer diagnostics
-            diss_r(ic,iv_loc,itor) = - (n_radial/length)*spectraldiss(u,nup_radial)*up_radial &
-             * (abs(omega_rdrift(it,is))*energy(ie)*(1.0+xi(ix)**2) &
-             + abs(omega_cdrift_r(it,is)*xi(ix))*vel(ie) &
-             + abs(omega_rot_drift_r(it,is)) &
-             + abs(omega_rot_edrift_r(it))) 
+            if (triad_print_flag == 1) then
+              ! (d/dr) upwind dissipation for triad energy transfer diagnostics
+              diss_r(ic,iv_loc,itor) = - (n_radial/length)*spectraldiss(u,nup_radial)*up_radial &
+               * (abs(omega_rdrift(it,is))*energy(ie)*(1.0+xi(ix)**2) &
+               + abs(omega_cdrift_r(it,is)*xi(ix))*vel(ie) &
+               + abs(omega_rot_drift_r(it,is)) &
+               + abs(omega_rot_edrift_r(it))) 
+            endif
 
            ! omega_star 
            carg = &
@@ -487,6 +489,13 @@ subroutine cgyro_init_arrays
 #elif defined(_OPENACC)
 !$acc enter data copyin(omega_cap_h,omega_h,omega_s,omega_ss,omega_sbeta)
 #endif
+  if (triad_print_flag == 1) then
+#if defined(OMPGPU)
+!$omp target enter data map(to:diss_r)
+#elif defined(_OPENACC)
+!$acc enter data copyin(diss_r)
+#endif
+  endif
   !-------------------------------------------------------------------------
 
   deallocate(gdlnndr,gdlntdr)
diff --git a/cgyro/src/cgyro_init_manager.F90 b/cgyro/src/cgyro_init_manager.F90
index 8a2644e75..7316e6951 100644
--- a/cgyro/src/cgyro_init_manager.F90
+++ b/cgyro/src/cgyro_init_manager.F90
@@ -111,6 +111,12 @@ subroutine cgyro_init_manager
      enddo
   enddo
 
+#if defined(OMPGPU)
+!$omp target enter data map(to:w_exi,w_e,w_xi)
+#elif defined(_OPENACC)
+!$acc enter data copyin(w_exi,w_e,w_xi)
+#endif
+
   allocate(theta(n_theta))
   allocate(thetab(n_theta,n_radial/box_size))
   allocate(w_theta(n_theta))
@@ -179,10 +185,6 @@ subroutine cgyro_init_manager
      allocate(    gflux(0:n_global,n_species,4,n_field,nt1:nt2))
      allocate(gflux_loc(0:n_global,n_species,4,n_field,nt1:nt2))
 
-     allocate(    triad(n_species,n_radial,nt1:nt2,8))
-     allocate(triad_loc(n_species,n_radial,nt1:nt2,7))
-     allocate(triad_loc_old(n_species,n_radial,nt1:nt2,8))
-
      allocate(cflux_tave(n_species,4))
      allocate(gflux_tave(n_species,4))
 
@@ -195,6 +197,16 @@ subroutine cgyro_init_manager
 #elif defined(_OPENACC)
 !$acc enter data create(fcoef,gcoef,field,field_loc,source)
 #endif
+     if (triad_print_flag == 1) then
+        allocate(    triad(n_species,n_radial,nt1:nt2,8))
+        allocate(triad_loc(n_species,n_radial,nt1:nt2,7))
+        allocate(triad_loc_old(n_species,n_radial,nt1:nt2,8))
+#if defined(OMPGPU)
+!$omp target enter data map(alloc:triad,triad_loc,triad_loc_old)
+#elif defined(_OPENACC)
+!$acc enter data create(triad,triad_loc,triad_loc_old)
+#endif
+     endif
 
      if ((collision_model /= 5) .AND. (collision_field_model == 1)) then
        ! nc and nc_loc must be last, since it will be collated     
@@ -262,7 +274,9 @@ subroutine cgyro_init_manager
      allocate(cap_h_v(nc_loc,nt1:nt2,nv))
      allocate(omega_cap_h(nc,nv_loc,nt1:nt2))
      allocate(omega_h(nc,nv_loc,nt1:nt2))
-     allocate(diss_r(nc,nv_loc,nt1:nt2))
+     if (triad_print_flag == 1) then
+        allocate(diss_r(nc,nv_loc,nt1:nt2))
+     endif
      allocate(omega_s(n_field,nc,nv_loc,nt1:nt2))
      allocate(omega_ss(n_field,nc,nv_loc,nt1:nt2))
      allocate(omega_sbeta(nc,nv_loc,nt1:nt2))
@@ -302,10 +316,8 @@ subroutine cgyro_init_manager
      ! Nonlinear arrays
      if (nonlinear_flag == 1) then
         allocate(fA_nl(n_radial,nt_loc,nsplitA,n_toroidal_procs))
-        allocate(eA_nl(n_radial,nt_loc,nsplitA,n_toroidal_procs))
         allocate(g_nl(n_field,n_radial,n_jtheta,n_toroidal))
         allocate(fpackA(n_radial,nt_loc,nsplitA*n_toroidal_procs))
-        allocate(epackA(n_radial,nt_loc,nsplitA*n_toroidal_procs))
         allocate(gpack(n_field,n_radial,n_jtheta,n_toroidal))
         allocate(jvec_c_nl(n_field,n_radial,n_jtheta,nv_loc,n_toroidal))
 #if defined(OMPGPU)
@@ -313,16 +325,32 @@ subroutine cgyro_init_manager
 #elif defined(_OPENACC)
 !$acc enter data create(fpackA,gpack,fA_nl,g_nl,jvec_c_nl)
 #endif
+        if (triad_print_flag == 1) then
+          allocate(eA_nl(n_radial,nt_loc,nsplitA,n_toroidal_procs))
+          allocate(epackA(n_radial,nt_loc,nsplitA*n_toroidal_procs))
+#if defined(OMPGPU)
+!$omp target enter data map(alloc:epackA,eA_nl)
+#elif defined(_OPENACC)
+!$acc enter data create(epackA,eA_nl)
+#endif
+        endif
         if (nsplitB > 0) then ! nsplitB can be zero at large MPI
           allocate(fB_nl(n_radial,nt_loc,nsplitB,n_toroidal_procs))
           allocate(fpackB(n_radial,nt_loc,nsplitB*n_toroidal_procs))
-          allocate(eB_nl(n_radial,nt_loc,nsplitB,n_toroidal_procs))
-          allocate(epackB(n_radial,nt_loc,nsplitB*n_toroidal_procs))
 #if defined(OMPGPU)
 !$omp target enter data map(alloc:fpackB,fB_nl)
 #elif defined(_OPENACC)
 !$acc enter data create(fpackB,fB_nl)
 #endif
+          if (triad_print_flag == 1) then
+            allocate(epackB(n_radial,nt_loc,nsplitB*n_toroidal_procs))
+            allocate(eB_nl(n_radial,nt_loc,nsplitB,n_toroidal_procs))
+#if defined(OMPGPU)
+!$omp target enter data map(alloc:epackB,eB_nl)
+#elif defined(_OPENACC)
+!$acc enter data create(epackB,eB_nl)
+#endif
+          endif
         endif
      endif
 
diff --git a/cgyro/src/cgyro_kernel.F90 b/cgyro/src/cgyro_kernel.F90
index 95c08273f..ee3bc3c24 100644
--- a/cgyro/src/cgyro_kernel.F90
+++ b/cgyro/src/cgyro_kernel.F90
@@ -135,6 +135,13 @@ subroutine cgyro_kernel
         ! wait for cap_h_c to be synched into system memory, used by cgyro_write_timedata
 !$acc wait(4)
 #endif
+       if (triad_print_flag == 1) then
+#if defined(OMPGPU)
+!$omp target update from(triad_loc)
+#elif defined(_OPENACC)
+!$acc update host(triad_loc)
+#endif
+       endif
        call timer_lib_out('coll_mem')
 
        call timer_lib_in('io')
diff --git a/cgyro/src/cgyro_nl_comm.F90 b/cgyro/src/cgyro_nl_comm.F90
index 4e0e74fae..a4cd1d28f 100644
--- a/cgyro/src/cgyro_nl_comm.F90
+++ b/cgyro/src/cgyro_nl_comm.F90
@@ -326,9 +326,24 @@ subroutine cgyro_nl_fftw_comm1_r_triad(ij)
   real :: dv,dvr,dvp,rval,rval2
   complex :: cprod,cprod2,thfac
 
-  triad_loc_old(:,:,nt1:nt2,3) = triad_loc(:,:,nt1:nt2,3)
-  triad_loc_old(:,:,nt1:nt2,4) = triad_loc(:,:,nt1:nt2,4)
-  triad_loc(:,:,nt1:nt2,:) = 0.0
+#if defined(OMPGPU)
+!$omp target teams distribute parallel do simd collapse(3)
+#elif defined(_OPENACC)
+!$acc parallel loop collapse(3) gang vector independent &
+!$acc&         present(triad_loc_old,triad_loc) &
+!$acc&         present(nt1,nt2,n_radial,n_species) default(none)
+#else
+!$omp parallel do private(ir,is)
+#endif
+  do itor=nt1,nt2
+      do ir=1,n_radial
+        do is=1,n_species
+          triad_loc_old(is,ir,itor,3) = triad_loc(is,ir,itor,3)
+          triad_loc_old(is,ir,itor,4) = triad_loc(is,ir,itor,4)
+          triad_loc(is,ir,itor,:) = 0.0
+        enddo
+      enddo
+  enddo
 
   call timer_lib_in('nl_comm')
   call parallel_slib_r_nc_wait(nsplitA,fA_nl,fpackA,fA_req)
@@ -351,11 +366,16 @@ subroutine cgyro_nl_fftw_comm1_r_triad(ij)
 #if defined(OMPGPU)
 !$omp target teams distribute parallel do simd collapse(4) &
 !$omp&         private(iexch0,itor0,isplit0,iexch_base) &
+!$omp&         private(id,itorbox,jr0,jc,itd,itd_class,thfac) &
 !$omp&         private(ic_loc_m,my_psi)
 #elif defined(_OPENACC)
 !$acc parallel loop collapse(4) gang vector independent private(ic_loc_m,my_psi) &
 !$acc&         private(iexch0,itor0,isplit0,iexch_base) &
-!$acc&         present(ic_c,px,rhs,fpackA,fpackB) copyin(psi_mul,zf_scale) &
+!$acc&         private(id,itorbox,jr0,jc,itd,itd_class,thfac) &
+!$acc&         present(h_x,g_x,cap_h_c,cap_h_ct,field,dvjvec_c,jvec_c) &
+!$acc&         present(ic_c,is_v,ix_v,ie_v,w_exi,w_theta,dens2_rot,z,temp) &
+!$acc&         present(omega_stream,vel,xi,thfac_itor,cderiv,uderiv) &
+!$acc&         present(px,rhs,fpackA,fpackB,epackA,epackB,diss_r,triad_loc) copyin(psi_mul,zf_scale) &
 !$acc&         present(nt1,nt2,nv_loc,n_theta,n_radial,nsplit,nsplitA,nsplitB) copyin(ij) default(none)
 #else
 !$omp parallel do collapse(2) private(ic_loc_m,my_psi) &
@@ -365,12 +385,12 @@ subroutine cgyro_nl_fftw_comm1_r_triad(ij)
   do itor=nt1,nt2
     do iv_loc_m=1,nv_loc
       do ir=1,n_radial
-        itorbox = itor*box_size*sign_qs
-        jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial)
-        jr0(1) = n_theta*(ir-1)
-        jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial)
-
         do it=1,n_theta
+           itorbox = itor*box_size*sign_qs
+           jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial)
+           jr0(1) = n_theta*(ir-1)
+           jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial)
+
            ic_loc_m = ic_c(ir,it)
 
            is = is_v(iv_loc_m +nv1 -1 )
@@ -490,11 +510,16 @@ subroutine cgyro_nl_fftw_comm1_r_triad(ij)
 #if defined(OMPGPU)
 !$omp target teams distribute parallel do simd collapse(4) &
 !$omp&         private(iexch0,itor0,isplit0,iexch_base) &
+!$omp&         private(id,itorbox,jr0,jc,itd,itd_class,thfac) &
 !$omp&         private(ic_loc_m,my_psi)
 #elif defined(_OPENACC)
 !$acc parallel loop collapse(4) gang vector independent private(ic_loc_m,my_psi) &
 !$acc&         private(iexch0,itor0,isplit0,iexch_base) &
-!$acc&         present(ic_c,px,rhs,fpackA) copyin(psi_mul,zf_scale) &
+!$acc&         private(id,itorbox,jr0,jc,itd,itd_class,thfac) &
+!$acc&         present(h_x,g_x,cap_h_c,cap_h_ct,field,dvjvec_c,jvec_c) &
+!$acc&         present(ic_c,is_v,ix_v,ie_v,w_exi,w_theta,dens2_rot,z,temp) &
+!$acc&         present(omega_stream,vel,xi,thfac_itor,cderiv,uderiv) &
+!$acc&         present(px,rhs,fpackA,epackA,diss_r,triad_loc) copyin(psi_mul,zf_scale) &
 !$acc&         present(nt1,nt2,nv_loc,n_theta,n_radial,nsplit,nsplitA) copyin(ij) default(none)
 #else
 !$omp parallel do collapse(2) private(ic_loc_m,my_psi) &
@@ -504,12 +529,12 @@ subroutine cgyro_nl_fftw_comm1_r_triad(ij)
   do itor=nt1,nt2
     do iv_loc_m=1,nv_loc
       do ir=1,n_radial
-        itorbox = itor*box_size*sign_qs
-        jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial)
-        jr0(1) = n_theta*(ir-1)
-        jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial)
-
         do it=1,n_theta
+           itorbox = itor*box_size*sign_qs
+           jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial)
+           jr0(1) = n_theta*(ir-1)
+           jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial)
+
            ic_loc_m = ic_c(ir,it)
 
            is = is_v(iv_loc_m +nv1 -1 )
diff --git a/cgyro/src/cgyro_nl_fftw.F90 b/cgyro/src/cgyro_nl_fftw.F90
index a2e065fde..a4c58206a 100644
--- a/cgyro/src/cgyro_nl_fftw.F90
+++ b/cgyro/src/cgyro_nl_fftw.F90
@@ -378,7 +378,63 @@ subroutine cgyro_nl_fftw_mul(sz,uvm,uxm,vym,uym,vxm,inv_nxny)
 
 end subroutine
 
-subroutine cgyro_nl_fftw(ij)
+subroutine cgyro_nl_fftw_mul_sub_mean(ny,nx,nt,uvm,uxm,vym,uym,vxm,inv_nxny)
+  implicit none
+
+  integer, intent(in) :: ny,nx,nt
+  real, dimension(*),intent(out) :: uvm
+  real, dimension(*),intent(in) :: uxm,vym,uym,vxm
+  real, intent(in) :: inv_nxny
+
+  integer :: iy,ix,it
+  integer :: sz,sz_t
+  integer :: i,i_xt
+  real :: y_mean_ux, y_mean_uy, y_mean_vx, y_mean_vy
+  real :: r_ux, r_uy, r_vx, r_vy
+  real :: inv_ny
+
+  sz = ny*nx*nt
+  sz_t = ny*nx
+  inv_ny = 1.0/ny
+
+#if defined(OMPGPU)
+!$omp target teams distribute parallel do collapse(2) &
+!$omp&  private(iy,i_xt,y_mean_ux,y_mean_uy,y_mean_vx,y_mean_vy) &
+!$omp&  private(i,r_ux,r_uy,r_vx,r_vy) &
+!$omp&  map(to:uxm(1:sz),vym(1:sz),uym(1:sz),vxm(1:sz)) &
+!$omp&  map(from:uvm(1:sz))
+#else
+!$acc parallel loop independent gang collapse(2) &
+!$acc&         private(iy,i_xt,y_mean_ux,y_mean_uy,y_mean_vx,y_mean_vy) &
+!$acc&         present(uvm,uxm,vym,uym,vxm)
+#endif
+  do it=0,nt-1
+   do ix=0,nx-1
+    i_xt = (it*sz_t) + ix*ny;
+    y_mean_ux = sum(uxm(i_xt+1:i_xt+ny) ) * inv_ny
+    y_mean_uy = sum(uym(i_xt+1:i_xt+ny) ) * inv_ny
+    y_mean_vx = sum(vxm(i_xt+1:i_xt+ny) ) * inv_ny
+    y_mean_vy = sum(vym(i_xt+1:i_xt+ny) ) * inv_ny
+#if (!defined(OMPGPU)) && defined(_OPENACC)
+!$acc loop vector private(i,r_ux,r_uy,r_vx,r_vy)
+#endif
+    do iy=1,ny
+      i = i_xt+iy
+      ! remove ky=0
+      r_ux = uxm(i) - y_mean_ux
+      r_uy = uym(i) - y_mean_uy
+      r_vx = vxm(i) - y_mean_vx
+      r_vy = vym(i) - y_mean_vy
+
+      ! compute and save uv
+      uvm(i) = (uxm(i)*vym(i)-uym(i)*vxm(i))*inv_nxny
+    enddo
+   enddo
+  enddo
+
+end subroutine
+
+subroutine cgyro_nl_fftw(i_triad)
 
 #if defined(HIPGPU)
   use hipfort
@@ -398,7 +454,7 @@ subroutine cgyro_nl_fftw(ij)
   include 'fftw/fftw3.f'
 #endif
   !-----------------------------------
-  integer, intent(in) :: ij
+  integer, intent(in) :: i_triad
   !-----------------------------------
   integer :: j,p,iexch
   integer :: it,ir,itm,itl,ix,iy
@@ -477,7 +533,7 @@ subroutine cgyro_nl_fftw(ij)
 
   call timer_lib_in('nl')
 #if !defined(OMPGPU)
-!$acc  data present(fA_nl)  &
+!$acc  data present(fA_nl,eA_nl)  &
 !$acc&      present(fxmany,fymany,gxmany,gymany) &
 !$acc&      present(uxmany,uymany,vxmany,vymany) &
 !$acc&      present(uvmany)
@@ -929,8 +985,99 @@ subroutine cgyro_nl_fftw(ij)
    enddo
   enddo
 
+  if (i_triad == 1) then
+    call timer_lib_out('nl_mem')
+    call timer_lib_in('nl')
+    ! 2. Poisson bracket in real space with Non_Zonal pairs
+
+    call cgyro_nl_fftw_mul_sub_mean(size(uvmany,1),size(uvmany,2),nsplitA, &
+                         uvmany, &
+                         uxmany,vymany(:,:,1:nsplitA), &
+                         uymany,vxmany(:,:,1:nsplitA), &
+                         inv_nxny)
+
+    ! make sure reqs progress
+    call cgyro_nl_fftw_comm_test()
+
+    ! ------------------
+    ! Transform uv to fx
+    ! ------------------
+
+#if defined(OMPGPU)
+
+#if defined(MKLGPU)
+!$omp target data map(tofrom: uvmany,fxmany)
+#else
+!$omp target data use_device_ptr(uvmany,fxmany)
+#endif
+
+#else
+!$acc wait
+!$acc  host_data use_device(uvmany,fxmany)
+#endif
+
+#if defined(HIPGPU)
+    rc = hipfftExecD2Z(hip_plan_r2c_manyA,c_loc(uvmany),c_loc(fxmany))
+#elif defined(MKLGPU)
+  !$omp dispatch
+    call dfftw_execute_dft_r2c(dfftw_plan_r2c_manyA,uvmany,fxmany)
+    rc = 0
+#else
+    rc = cufftExecD2Z(cu_plan_r2c_manyA,uvmany,fxmany)
+#endif
+
+#ifdef HIPGPU
+    ! hipfftExec is asynchronous, will need the results below
+    rc = hipDeviceSynchronize()
+#endif
+
+#if defined(OMPGPU)
+!$omp end target data
+#else
+!$acc wait
+!$acc end host_data
+#endif
+
+    ! make sure reqs progress
+    call cgyro_nl_fftw_comm_test()
+
+    call timer_lib_out('nl')
+    call timer_lib_in('nl_mem')
+
+    ! tile for performance, since this is effectively a transpose
+#if defined(OMPGPU)
+!$omp target teams distribute parallel do collapse(5) &
+!$omp&   private(iy,ir,itm,itl,ix)
+#else
+!$acc parallel loop independent collapse(5) gang &
+!$acc&         private(iy,ir,itm,itl,ix) present(eA_nl,fxmany)
+#endif
+    do j=1,nsplitA
+     do iy0=0,n_toroidal+(R_TORTILE-1)-1,R_TORTILE  ! round up
+      do ir0=0,n_radial+(R_RADTILE-1)-1,R_RADTILE  ! round up
+       do iy1=0,(R_TORTILE-1)   ! tile
+        do ir1=0,(R_RADTILE-1)  ! tile
+         iy = iy0 + iy1
+         ir = 1 + ir0 + ir1
+         if ((iy < n_toroidal) .and. (ir <= n_radial)) then
+           ! itor = iy+1
+           itm = 1 + iy/nt_loc
+           itl = 1 + modulo(iy,nt_loc)
+           ix = ir-1-nx0/2
+           if (ix < 0) ix = ix+nx
+
+           eA_nl(ir,itl,j,itm) = fxmany(iy,ix,j)
+          endif
+        enddo
+       enddo
+      enddo
+     enddo
+    enddo
+
+  endif ! i_triad==1
+
 #if !defined(OMPGPU)
-  ! end data fA_nl
+  ! end data fA_nl,eA_nl
 !$acc end data
 #endif
 
@@ -978,6 +1125,9 @@ subroutine cgyro_nl_fftw(ij)
   ! start the async reverse comm
   ! can reuse the same req, no overlap with forward fA_req
   call parallel_slib_r_nc_async(nsplitA,fA_nl,fpackA,fA_req)
+  if (i_triad == 1) then
+     call parallel_slib_r_nc_async(nsplitA,eA_nl,epackA,eA_req)
+  end if
   fA_req_valid = .TRUE.
 
   if (nsplitB > 0) then
@@ -993,7 +1143,7 @@ subroutine cgyro_nl_fftw(ij)
 
   call timer_lib_in('nl')
 #if !defined(OMPGPU)
-!$acc  data present(fB_nl)  &
+!$acc  data present(fB_nl,eB_nl)  &
 !$acc&      present(fxmany,fymany,gxmany,gymany) &
 !$acc&      present(uxmany,uymany,vxmany,vymany) &
 !$acc&      present(uvmany)
@@ -1242,8 +1392,105 @@ subroutine cgyro_nl_fftw(ij)
    enddo
   enddo
 
+  if (i_triad == 1) then
+    call timer_lib_out('nl_mem')
+    call timer_lib_in('nl')
+    ! 2. Poisson bracket in real space with Non_Zonal pairs
+    call cgyro_nl_fftw_mul(size(uvmany,1)*size(uvmany,2)*nsplitB, &
+                         uvmany, &
+                         uxmany,vymany(:,:,(nsplitA+1):nsplit), &
+                         uymany,vxmany(:,:,(nsplitA+1):nsplit), &
+                         inv_nxny)
+
+    ! make sure reqs progress
+    call cgyro_nl_fftw_comm_test()
+
+    ! ------------------
+    ! Transform uv to fx
+    ! ------------------
+
+#if defined(OMPGPU)
+
+#if defined(MKLGPU)
+!$omp target data map(tofrom: uvmany,fxmany)
+#else
+!$omp target data use_device_ptr(uvmany,fxmany)
+#endif
+
+#else
+!$acc wait
+!$acc  host_data use_device(uvmany,fxmany)
+#endif
+
+#if defined(HIPGPU)
+    rc = hipfftExecD2Z(hip_plan_r2c_manyB,c_loc(uvmany),c_loc(fxmany))
+#elif defined(MKLGPU)
+    !$omp dispatch
+    call dfftw_execute_dft_r2c(dfftw_plan_r2c_manyB,uvmany,fxmany)
+    rc = 0
+#else
+    rc = cufftExecD2Z(cu_plan_r2c_manyB,uvmany,fxmany)
+#endif
+
+#ifdef HIPGPU
+    ! make sure reqs progress
+    call cgyro_nl_fftw_comm_test()
+    ! hipfftExec is asynchronous, will need the results below
+    rc = hipDeviceSynchronize()
+#endif
+
+#if defined(OMPGPU)
+!$omp end target data
+#else
+    ! make sure reqs progress
+    call cgyro_nl_fftw_comm_test()
+!$acc wait
+!$acc end host_data
+#endif
+
+    ! make sure reqs progress
+    call cgyro_nl_fftw_comm_test()
+
+    call timer_lib_out('nl')
+    call timer_lib_in('nl_mem')
+
+    ! NOTE: The FFT will generate an unwanted n=0,p=-nr/2 component
+    ! that will be filtered in the main time-stepping loop
+
+    ! tile for performance, since this is effectively a transpose
+#if defined(OMPGPU)
+!$omp target teams distribute parallel do collapse(5) &
+!$omp&   private(iy,ir,itm,itl,ix)
+#else
+!$acc parallel loop independent collapse(5) gang &
+!$acc&         private(iy,ir,itm,itl,ix) present(eB_nl,fxmany)
+#endif
+    do j=1,nsplitB
+     do iy0=0,n_toroidal+(R_TORTILE-1)-1,R_TORTILE  ! round up
+      do ir0=0,n_radial+(R_RADTILE-1)-1,R_RADTILE  ! round up
+       do iy1=0,(R_TORTILE-1)   ! tile
+        do ir1=0,(R_RADTILE-1)  ! tile
+         iy = iy0 + iy1
+         ir = 1 + ir0 + ir1
+         if ((iy < n_toroidal) .and. (ir <= n_radial)) then
+           ! itor = iy+1
+           itm = 1 + iy/nt_loc
+           itl = 1 + modulo(iy,nt_loc)
+           ix = ir-1-nx0/2
+           if (ix < 0) ix = ix+nx
+
+           eB_nl(ir,itl,j,itm) = fxmany(iy,ix,j)
+         endif
+        enddo
+       enddo
+      enddo
+     enddo
+    enddo
+
+  endif ! if i_triad
+
 #if !defined(OMPGPU)
-  ! end data fB_nl
+  ! end data fB_nl,eB_nl
 !$acc end data
 #endif
 
@@ -1253,6 +1500,9 @@ subroutine cgyro_nl_fftw(ij)
   ! start the async reverse comm
   ! can reuse the same req, no overlap with forward fB_req
   call parallel_slib_r_nc_async(nsplitB,fB_nl,fpackB,fB_req)
+  if (i_triad == 1) then
+     call parallel_slib_r_nc_async(nsplitB,eB_nl,epackB,eB_req)
+  end if
   fB_req_valid = .TRUE.
   ! make sure reqs progress
   call cgyro_nl_fftw_comm_test()
@@ -1272,16 +1522,24 @@ subroutine cgyro_nl_fftw_stepr(g_j, f_j, nl_idx, i_omp)
 
   implicit none
 
+  !-----------------------------------
   integer, intent(in) :: g_j, f_j
   integer,intent(in) :: nl_idx ! 1=>A, 2=>B
   integer,intent(in) :: i_omp
+  !-----------------------------------
   integer :: ix,iy
   integer :: ir,itm,itl,itor
+  real :: inv_nxny
 
   include 'fftw3.f03'
 
+  inv_nxny = 1.0/(nx*ny)
   ! Poisson bracket in real space
-  uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny)
+  do ix=0,nx-1
+    do iy=0,ny-1
+      uv(iy,ix,i_omp) = (uxmany(iy,ix,f_j)*vymany(iy,ix,g_j)-uymany(iy,ix,f_j)*vxmany(iy,ix,g_j)) * inv_nxny
+    enddo
+  enddo
 
   call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp))
 
@@ -1307,6 +1565,7 @@ subroutine cgyro_nl_fftw_stepr(g_j, f_j, nl_idx, i_omp)
 
 end subroutine cgyro_nl_fftw_stepr
 
+  ! assumes cgyro_nl_fftw_stepr has already been called
 subroutine cgyro_nl_fftw_stepr_triad(g_j, f_j, nl_idx, i_omp)
 
   use timer_lib
@@ -1315,73 +1574,47 @@ subroutine cgyro_nl_fftw_stepr_triad(g_j, f_j, nl_idx, i_omp)
 
   implicit none
 
+  !-----------------------------------
   integer, intent(in) :: g_j, f_j
   integer,intent(in) :: nl_idx ! 1=>A, 2=>B
   integer,intent(in) :: i_omp
-  real :: y_mean(nx)
+  !-----------------------------------
+  real :: y_mean_ux, y_mean_uy, y_mean_vx, y_mean_vy
+  real :: r_ux, r_uy, r_vx, r_vy
   integer :: ix,iy
   integer :: ir,itm,itl,itor
+  real :: inv_nxny,inv_ny
 
   include 'fftw3.f03'
 
-  ! 1. Poisson bracket in real space
-  uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny)
-
-  call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp))
+  inv_ny = 1.0/ny
+  inv_nxny = 1.0/(nx*ny)
+  ! 1. Poisson bracket in real space was done in cgyro_nl_fftw_stepr
 
-  ! NOTE: The FFT will generate an unwanted n=0,p=-nr/2 component
-  ! that will be filtered in the main time-stepping loop
+  ! 2. Poisson bracket in real space with Non_Zonal pairs
 
-  ! this should really be accounted against nl_mem, but hard to do with OMP
-  do itm=1,n_toroidal_procs
-   do itl=1,nt_loc
-    itor=itl + (itm-1)*nt_loc
-    do ir=1,n_radial
-     ix = ir-1-nx0/2
-     if (ix < 0) ix = ix+nx
-     iy = itor-1
-     if (nl_idx==1) then
-       fA_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp)
-     else
-       fB_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp)
-     endif
+  do ix=0,nx-1
+    y_mean_ux = sum(uxmany(:,ix,f_j) ,dim=1 ) * inv_ny
+    y_mean_uy = sum(uymany(:,ix,f_j) ,dim=1 ) * inv_ny
+    y_mean_vx = sum(vxmany(:,ix,f_j) ,dim=1 ) * inv_ny
+    y_mean_vy = sum(vymany(:,ix,f_j) ,dim=1 ) * inv_ny
+    do iy=0,ny-1
+      ! remove ky=0
+      r_ux = uxmany(iy,ix,f_j) - y_mean_ux
+      r_uy = uymany(iy,ix,f_j) - y_mean_uy
+      r_vx = vxmany(iy,ix,f_j) - y_mean_vx
+      r_vy = vymany(iy,ix,f_j) - y_mean_vy
+      ! we could save, but we do not really need to (not used outside from this function)
+      !uxmany(iy,ix,f_j) = r_ux
+      !uymany(iy,ix,f_j) = r_y
+      !vxmany(iy,ix,f_j) = r_vx
+      !vymany(iy,ix,f_j) = r_vy
+
+      ! compute and save uv
+      uv(iy,ix,i_omp) = (r_ux*r_vy-r_uy*r_vx) * inv_nxny
     enddo
-   enddo
   enddo
 
-
-  ! 2. Poisson bracket in real space with Non_Zonal pairs
-
-  ! remove ky=0 in uxmany
-  y_mean = sum(uxmany(:,:,f_j) ,dim=1 ) / ny
-  do iy=0,ny-1
-    uxmany(iy,:,f_j) = uxmany(iy,:,f_j) -y_mean
-
-  end do
-
-  ! remove ky=0 in uymany
-  y_mean = sum(uymany(:,:,f_j) ,dim=1 ) / ny ! =0
-  do iy=0,ny-1
-    uymany(iy,:,f_j) = uymany(iy,:,f_j) -y_mean
-
-  end do
-
-  ! remove ky=0 in vx
-  y_mean = sum(vxmany(:,:,g_j) ,dim=1 ) / ny
-  do iy=0,ny-1
-    vxmany(iy,:,g_j) = vxmany(iy,:,g_j) -y_mean
-
-  end do
-
-  ! remove ky=0 in vy
-  y_mean = sum(vymany(:,:,g_j) ,dim=1 ) / ny ! =0
-  do iy=0,ny-1
-    vymany(iy,:,g_j) = vymany(iy,:,g_j) -y_mean
-
-  end do
-
-  uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny)
-
   call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp))
 
   do itm=1,n_toroidal_procs
@@ -1403,7 +1636,7 @@ subroutine cgyro_nl_fftw_stepr_triad(g_j, f_j, nl_idx, i_omp)
 end subroutine cgyro_nl_fftw_stepr_triad
 
 ! NOTE: call cgyro_nl_fftw_comm1 before cgyro_nl_fftw
-subroutine cgyro_nl_fftw(ij)
+subroutine cgyro_nl_fftw(i_triad)
 
   use timer_lib
   use parallel_lib
@@ -1414,9 +1647,9 @@ subroutine cgyro_nl_fftw(ij)
 
 
   !-----------------------------------
-  integer, intent(in) :: ij
+  integer, intent(in) :: i_triad
   !-----------------------------------
-  integer :: ix,iy,i_triad=0
+  integer :: ix,iy
   integer :: ir,it,itm,itl,it_loc
   integer :: itor,mytm
   integer :: j,p
@@ -1429,10 +1662,6 @@ subroutine cgyro_nl_fftw(ij)
 
   include 'fftw3.f03'
 
-  if (triad_print_flag == 1 .and. ij == 3) then
-   i_triad=1
-  endif
-
   call cgyro_nl_fftw_comm_test()
 
   ! time to wait for the FA_nl to become avaialble
@@ -1574,10 +1803,14 @@ subroutine cgyro_nl_fftw(ij)
         endif
 
         if (j<=nsplitA) then
+           call cgyro_nl_fftw_stepr(j, j, 1, i_omp)
+           if (i_omp==1) then
+             ! use the main thread to progress the async MPI
+             call cgyro_nl_fftw_comm_test()
+           endif
+
            if (i_triad == 1) then
               call cgyro_nl_fftw_stepr_triad(j, j, 1, i_omp)
-           else
-              call cgyro_nl_fftw_stepr(j, j, 1, i_omp)
            endif
         endif
         ! else we will do it in the next loop
@@ -1660,10 +1893,14 @@ subroutine cgyro_nl_fftw(ij)
           call cgyro_nl_fftw_comm_test()
         endif
 
+        call cgyro_nl_fftw_stepr(nsplitA+j, j, 2, i_omp)
+        if (i_omp==1) then
+         ! use the main thread to progress the async MPI
+          call cgyro_nl_fftw_comm_test()
+        endif
+
         if (i_triad == 1) then
           call cgyro_nl_fftw_stepr_triad(nsplitA+j, j, 2, i_omp)
-        else
-          call cgyro_nl_fftw_stepr(nsplitA+j, j, 2, i_omp)
         endif
    enddo ! j
 
diff --git a/cgyro/src/cgyro_rhs.F90 b/cgyro/src/cgyro_rhs.F90
index b83770355..b566b4328 100644
--- a/cgyro/src/cgyro_rhs.F90
+++ b/cgyro/src/cgyro_rhs.F90
@@ -393,6 +393,7 @@ subroutine cgyro_rhs(ij,update_cap)
   integer, intent(in) :: ij
   logical, intent(in) :: update_cap
   !--------------------------------
+  integer :: i_triad
 
   ! fields is ready by now
   call cgyro_rhs_comm_async_fd
@@ -412,9 +413,13 @@ subroutine cgyro_rhs(ij,update_cap)
 
   ! Nonlinear evaluation [f,g]
   if (nonlinear_flag == 1) then
+     i_triad=0
+     if (triad_print_flag == 1 .and. ij == 3) then
+       i_triad=1
+     endif
      ! assumes someone already started the input comm
      ! and will finish the output comm
-     call cgyro_nl_fftw(ij)
+     call cgyro_nl_fftw(i_triad)
   endif
 
   call cgyro_upwind_complete
diff --git a/cgyro/src/cgyro_step_collision.F90 b/cgyro/src/cgyro_step_collision.F90
index c2297993c..eb97d6207 100644
--- a/cgyro/src/cgyro_step_collision.F90
+++ b/cgyro/src/cgyro_step_collision.F90
@@ -1049,6 +1049,38 @@ subroutine cgyro_step_collision_gpu(use_simple)
   call timer_lib_in('coll')
 
   ! Compute H given h and [phi(h), apar(h)]
+ if (triad_print_flag == 1 ) then
+
+#if defined(OMPGPU)
+!$omp target teams distribute parallel do simd collapse(3) &
+!$omp&         private(iv_loc,is,my_psi,my_ch)
+#else
+!$acc parallel loop collapse(3) gang vector private(iv_loc,is,my_psi,my_ch) &
+!$acc&         present(is_v,cap_h_c,cap_h_ct,cap_h_c,jvec_c,field,z,temp,h_x) &
+!$acc&         present(nt1,nt2,nv1,nv2,nc) default(none)
+#endif
+  do itor=nt1,nt2
+   do iv=nv1,nv2
+     do ic=1,nc
+        iv_loc = iv-nv1+1
+        is = is_v(iv)
+
+        ! Save collisional diss. 
+        my_ch = cap_h_ct(iv_loc,itor,ic)
+        my_psi = sum(jvec_c(:,ic,iv_loc,itor)*field(:,ic,itor))
+           
+        my_psi = my_ch-my_psi*(z(is)/temp(is))
+        cap_h_ct(iv_loc,itor,ic) = (cap_h_c(ic,iv_loc,itor) + my_ch) / 2.0
+        cap_h_ct(iv_loc,itor,ic) = conjg(cap_h_ct(iv_loc,itor,ic)) &
+          * ( my_psi - h_x(ic,iv_loc,itor) )
+
+        h_x(ic,iv_loc,itor) = my_psi
+        cap_h_c(ic,iv_loc,itor) = my_ch
+     enddo
+   enddo
+  enddo
+
+ else
 
 #if defined(OMPGPU)
 !$omp target teams distribute parallel do simd collapse(3) &
@@ -1071,6 +1103,8 @@ subroutine cgyro_step_collision_gpu(use_simple)
    enddo
   enddo
 
+ end if ! (triad_print_flag == 1 )
+
   call timer_lib_out('coll')
 
 end subroutine cgyro_step_collision_gpu