diff --git a/cgyro/src/cgyro_flux.f90 b/cgyro/src/cgyro_flux.f90
index 52599e119..b5c8be3df 100644
--- a/cgyro/src/cgyro_flux.f90
+++ b/cgyro/src/cgyro_flux.f90
@@ -44,6 +44,7 @@ subroutine cgyro_flux
   complex :: cprod
   real, parameter :: x_fraction=0.2
   real :: u
+  real :: kx
 
 !$omp parallel do private(iv_loc,iv,is,ix,ie,dv,vpar,ic,ir,it,erot,cprod,cn) &
 !$omp&            private(prod1,prod2,prod3,l,icl,dvr,u,flux_norm) &
@@ -188,6 +189,34 @@ subroutine cgyro_flux
         cflux_loc(:,:,:,itor) = cflux_loc(:,:,:,itor)+2*sin(u)*real(gflux_loc(l,:,:,:,itor))/u
      enddo
 
+
+  !-------------------------------------------------------------
+  ! 2-1. Compute Triad energy transfer
+  !-------------------------------------------------------------
+
+     kx = 2*pi*rho/length
+     do is=1,n_species
+        ! Triad energy transfer   :   T_k
+        triad_loc_old(is,:,itor,1) = triad_loc(is,:,itor,1) *temp(is)/dlntdr(is_ele)
+        ! From Nonzonal Triad energy transfer   :   T_k [NZ(k',k")->k]
+        triad_loc_old(is,:,itor,2) = triad_loc(is,:,itor,2) *temp(is)/dlntdr(is_ele)
+        ! dEntropy / dt   :   dS_k/dt 
+        triad_loc_old(is,:,itor,3) = (triad_loc(is,:,itor,3)-triad_loc_old(is,:,itor,3))/delta_t
+        triad_loc_old(is,:,itor,3) = triad_loc_old(is,:,itor,3) *temp(is)/dlntdr(is_ele)*0.5
+        ! dWk_perp / dt    , (ky = 0)  
+        triad_loc_old(is,:,itor,4) = (triad_loc(is,:,itor,4)-triad_loc_old(is,:,itor,4))/delta_t
+        triad_loc_old(is,:,itor,4) = triad_loc_old(is,:,itor,4) *(kx*lambda_star)**2/dlntdr(is_ele)*0.5
+        ! Entropy   :   S_k
+        triad_loc_old(is,:,itor,5) = triad_loc(is,:,itor,3) *temp(is)/dlntdr(is_ele)*0.5
+        ! Diss. (radial)
+        triad_loc_old(is,:,itor,6) = triad_loc(is,:,itor,5) *temp(is)/dlntdr(is_ele)
+        ! Diss. (theta )
+        triad_loc_old(is,:,itor,7) = triad_loc(is,:,itor,6) *temp(is)/dlntdr(is_ele)
+        ! Diss. (Coll. )
+        triad_loc_old(is,:,itor,8) = triad_loc(is,:,itor,7) *temp(is)/dlntdr(is_ele)
+     enddo
+
+
      !-----------------------------------------------------
      ! 3. Renormalize fluxes to GB or quasilinear forms
      !~----------------------------------------------------
@@ -216,7 +245,7 @@ subroutine cgyro_flux
         ! GyroBohm normalizations
         gflux_loc(:,:,:,:,itor) = gflux_loc(:,:,:,:,itor)/rho**2
         cflux_loc(:,:,:,itor) = cflux_loc(:,:,:,itor)/rho**2
-
+        triad_loc_old(:,:,itor,:)  = triad_loc_old(:,:,itor,:)/rho**2
      endif
   enddo
 
@@ -240,6 +269,15 @@ subroutine cgyro_flux
        NEW_COMM_1, &
        i_err)
 
+  ! Reduced complex triad(ns,kx), below, is still distributed over n 
+  call MPI_ALLREDUCE(triad_loc_old(:,:,:,:), &
+       triad, &
+       size(triad), &
+       MPI_DOUBLE_COMPLEX, &
+       MPI_SUM, &
+       NEW_COMM_1, &
+       i_err)
+
   ! Reduced real cflux(ky), below, is still distributed over n 
 
   call MPI_ALLREDUCE(cflux_loc, &
diff --git a/cgyro/src/cgyro_globals.F90 b/cgyro/src/cgyro_globals.F90
index a990e88f1..8b6a1f893 100644
--- a/cgyro/src/cgyro_globals.F90
+++ b/cgyro/src/cgyro_globals.F90
@@ -80,6 +80,7 @@ module cgyro_globals
   integer :: moment_print_flag
   integer :: gflux_print_flag
   integer :: field_print_flag
+  integer :: triad_print_flag
   real :: amp0
   real :: amp
   real :: gamma_e
@@ -192,7 +193,7 @@ module cgyro_globals
   integer :: nsplit,nsplitA,nsplitB
   integer :: ns1,ns2
   integer, dimension(:), allocatable :: recv_status
-  integer :: fA_req, fB_req, g_req
+  integer :: fA_req, fB_req, eA_req, eB_req, g_req
   logical :: fA_req_valid, fB_req_valid, g_req_valid
   ! Thetas present in the process after NL AllToAll
   integer :: n_jtheta
@@ -243,6 +244,7 @@ module cgyro_globals
   character(len=12) :: binfile_hb      = 'bin.cgyro.hb'
   character(len=17) :: binfile_ky_flux = 'bin.cgyro.ky_flux'
   character(len=18) :: binfile_ky_cflux = 'bin.cgyro.ky_cflux'
+  character(len=15) :: binfile_triad   = 'bin.cgyro.triad'
   character(len=15), dimension(3) :: binfile_fieldb = &
        (/'bin.cgyro.phib ','bin.cgyro.aparb','bin.cgyro.bparb'/)
   character(len=16), dimension(3) :: binfile_kxky = &
@@ -344,11 +346,14 @@ module cgyro_globals
   complex, dimension(:,:,:), allocatable :: h0_x
   complex, dimension(:,:,:), allocatable :: h0_old
   complex, dimension(:,:,:,:), allocatable :: fA_nl,fB_nl
+  complex, dimension(:,:,:,:), allocatable :: eA_nl,eB_nl
   complex, dimension(:,:,:,:), allocatable :: g_nl
   complex, dimension(:,:,:), allocatable :: fpackA,fpackB
+  complex, dimension(:,:,:), allocatable :: epackA,epackB
   complex, dimension(:,:,:,:), allocatable :: gpack
   complex, dimension(:,:,:), allocatable :: omega_cap_h
   complex, dimension(:,:,:), allocatable :: omega_h
+  complex, dimension(:,:,:), allocatable :: diss_r
   complex, dimension(:,:,:,:), allocatable :: omega_s,omega_ss
   complex, dimension(:,:,:), allocatable :: omega_sbeta
   complex, dimension(:,:,:), allocatable :: cap_h_c
@@ -382,6 +387,7 @@ module cgyro_globals
   real, dimension(:,:,:,:), allocatable :: cflux
   complex, dimension(:,:,:,:,:), allocatable :: gflux_loc
   complex, dimension(:,:,:,:,:), allocatable :: gflux
+  complex, dimension(:,:,:,:), allocatable :: triad,triad_loc,triad_loc_old
   real, dimension(:,:), allocatable :: cflux_tave, gflux_tave
   real :: tave_min, tave_max
   integer :: tave_step
diff --git a/cgyro/src/cgyro_init_arrays.F90 b/cgyro/src/cgyro_init_arrays.F90
index b5318f4f2..3908f6156 100644
--- a/cgyro/src/cgyro_init_arrays.F90
+++ b/cgyro/src/cgyro_init_arrays.F90
@@ -448,7 +448,14 @@ subroutine cgyro_init_arrays
                 + abs(omega_cdrift_r(it,is)*xi(ix))*vel(ie) &
                 + abs(omega_rot_drift_r(it,is)) &
                 + abs(omega_rot_edrift_r(it)))          
-             
+
+            ! (d/dr) upwind diss.
+            diss_r(ic,iv_loc,itor) = - (n_radial/length)*spectraldiss(u,nup_radial)*up_radial &
+             * (abs(omega_rdrift(it,is))*energy(ie)*(1.0+xi(ix)**2) &
+             + abs(omega_cdrift_r(it,is)*xi(ix))*vel(ie) &
+             + abs(omega_rot_drift_r(it,is)) &
+             + abs(omega_rot_edrift_r(it))) 
+
            ! omega_star 
            carg = &
                 -i_c*k_theta_base*itor*rho*(dlnndr(is)+dlntdr(is)*(energy(ie)-1.5)) &
diff --git a/cgyro/src/cgyro_init_manager.F90 b/cgyro/src/cgyro_init_manager.F90
index 2c8b1ba3b..dd14d6ab9 100644
--- a/cgyro/src/cgyro_init_manager.F90
+++ b/cgyro/src/cgyro_init_manager.F90
@@ -178,6 +178,11 @@ subroutine cgyro_init_manager
      allocate(cflux_loc(n_species,4,n_field,nt1:nt2))
      allocate(    gflux(0:n_global,n_species,4,n_field,nt1:nt2))
      allocate(gflux_loc(0:n_global,n_species,4,n_field,nt1:nt2))
+
+     allocate(    triad(n_species,n_radial,nt1:nt2,8))
+     allocate(triad_loc(n_species,n_radial,nt1:nt2,7))
+     allocate(triad_loc_old(n_species,n_radial,nt1:nt2,8))
+
      allocate(cflux_tave(n_species,4))
      allocate(gflux_tave(n_species,4))
      
@@ -245,6 +250,7 @@ subroutine cgyro_init_manager
      allocate(cap_h_v(nc_loc,nt1:nt2,nv))
      allocate(omega_cap_h(nc,nv_loc,nt1:nt2))
      allocate(omega_h(nc,nv_loc,nt1:nt2))
+     allocate(diss_r(nc,nv_loc,nt1:nt2))
      allocate(omega_s(n_field,nc,nv_loc,nt1:nt2))
      allocate(omega_ss(n_field,nc,nv_loc,nt1:nt2))
      allocate(omega_sbeta(nc,nv_loc,nt1:nt2))
@@ -285,8 +291,10 @@ subroutine cgyro_init_manager
      ! Nonlinear arrays
      if (nonlinear_flag == 1) then
         allocate(fA_nl(n_radial,nt_loc,nsplitA,n_toroidal_procs))
+        allocate(eA_nl(n_radial,nt_loc,nsplitA,n_toroidal_procs))
         allocate(g_nl(n_field,n_radial,n_jtheta,n_toroidal))
         allocate(fpackA(n_radial,nt_loc,nsplitA*n_toroidal_procs))
+        allocate(epackA(n_radial,nt_loc,nsplitA*n_toroidal_procs))
         allocate(gpack(n_field,n_radial,n_jtheta,n_toroidal))
         allocate(jvec_c_nl(n_field,n_radial,n_jtheta,nv_loc,n_toroidal))
 #if defined(OMPGPU)
@@ -297,6 +305,8 @@ subroutine cgyro_init_manager
         if (nsplitB > 0) then ! nsplitB can be zero at large MPI
           allocate(fB_nl(n_radial,nt_loc,nsplitB,n_toroidal_procs))
           allocate(fpackB(n_radial,nt_loc,nsplitB*n_toroidal_procs))
+          allocate(eB_nl(n_radial,nt_loc,nsplitB,n_toroidal_procs))
+          allocate(epackB(n_radial,nt_loc,nsplitB*n_toroidal_procs))
 #if defined(OMPGPU)
 !$omp target enter data map(alloc:fpackB,fB_nl)
 #elif defined(_OPENACC)
diff --git a/cgyro/src/cgyro_nl_comm.F90 b/cgyro/src/cgyro_nl_comm.F90
index 674d75e94..6392d334d 100644
--- a/cgyro/src/cgyro_nl_comm.F90
+++ b/cgyro/src/cgyro_nl_comm.F90
@@ -304,6 +304,316 @@ subroutine cgyro_nl_fftw_comm1_r(ij)
 end subroutine cgyro_nl_fftw_comm1_r
 
 
+subroutine cgyro_nl_fftw_comm1_r_triad(ij)
+  use timer_lib
+  use parallel_lib
+  use cgyro_globals
+
+  implicit none
+
+  !-----------------------------------
+  integer, intent(in) :: ij
+  !-----------------------------------
+  
+  integer :: is,ix,ie
+  integer :: id,itd,itd_class,jr0(0:2),itorbox,jc
+  integer :: ir,it,iv_loc_m,ic_loc_m,itor
+  integer :: iexch0,itor0,isplit0,iexch_base
+  complex :: my_psi
+  real :: psi_mul
+
+  real :: dv,dvr,dvp,rval,rval2
+  complex :: cprod,cprod2,thfac
+
+  triad_loc_old(:,:,:,3)=triad_loc(:,:,:,3)
+  triad_loc_old(:,:,:,4)=triad_loc(:,:,:,4)
+  triad_loc(:,:,:,:)=0.0
+
+  call timer_lib_in('nl_comm')
+  call parallel_slib_r_nc_wait(nsplitA,fA_nl,fpackA,fA_req)
+  call parallel_slib_r_nc_wait(nsplitA,eA_nl,epackA,eA_req)
+  fA_req_valid = .FALSE.
+  if (nsplitB > 0) then
+    ! no major compute to overlap
+    call parallel_slib_r_nc_wait(nsplitB,fB_nl,fpackB,fB_req)
+    call parallel_slib_r_nc_wait(nsplitB,eB_nl,epackB,eB_req)
+    fB_req_valid = .FALSE.
+  endif
+  call timer_lib_out('nl_comm')
+
+  call timer_lib_in('nl')
+
+  psi_mul = (q*rho/rmin)*(2*pi/length)
+
+  if (nsplitB > 0) then
+
+#if defined(OMPGPU)
+!$omp target teams distribute parallel do simd collapse(4) &
+!$omp&         private(iexch0,itor0,isplit0,iexch_base) &
+!$omp&         private(ic_loc_m,my_psi)
+#elif defined(_OPENACC)
+!$acc parallel loop collapse(4) gang vector independent private(ic_loc_m,my_psi) &
+!$acc&         private(iexch0,itor0,isplit0,iexch_base) &
+!$acc&         present(ic_c,px,rhs,fpackA,fpackB) copyin(psi_mul,zf_scale) &
+!$acc&         present(nt1,nt2,nv_loc,n_theta,n_radial,nsplit,nsplitA,nsplitB) copyin(ij) default(none)
+#else
+!$omp parallel do collapse(2) private(ic_loc_m,my_psi) &
+!$omp&         private(iexch0,itor0,isplit0,iexch_base,is,ix,ie,dv,dvr,dvp,rval,rval2,cprod,cprod2) &
+!$omp&         private(id,itorbox,jr0,jc,itd,itd_class,thfac)
+#endif
+  do itor=nt1,nt2
+    do iv_loc_m=1,nv_loc
+      do ir=1,n_radial
+        itorbox = itor*box_size*sign_qs
+        jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial)
+        jr0(1) = n_theta*(ir-1)
+        jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial)
+
+        do it=1,n_theta
+           ic_loc_m = ic_c(ir,it)
+
+           is = is_v(iv_loc_m +nv1 -1 )
+           ix = ix_v(iv_loc_m +nv1 -1 )
+           ie = ie_v(iv_loc_m +nv1 -1 )
+           dv = w_exi(ie,ix)
+           dvr  = w_theta(it)*dens2_rot(it,is)*dv
+           dvp = w_theta(it)*dv*(ir-1-nx0/2)**2
+
+           ! Density moment
+           cprod = w_theta(it)*cap_h_c(ic_loc_m,iv_loc_m,itor)*dvjvec_c(1,ic_loc_m,iv_loc_m,itor)/z(is)
+           cprod = -(dvr*z(is)/temp(is)*field(1,ic_loc_m,itor)-cprod)
+           cprod2= ( jvec_c(1,ic_loc_m,iv_loc_m,itor)*z(is)/temp(is) )*conjg(field(1,ic_loc_m,itor) )
+
+           iexch0 = (iv_loc_m-1) + (it-1)*nv_loc
+           itor0 = iexch0/nsplit
+           isplit0 = modulo(iexch0,nsplit)
+           if (isplit0 < nsplitA) then
+              iexch_base = 1+itor0*nsplitA
+              my_psi = fpackA(ir,itor-nt1+1,iexch_base+isplit0)
+
+              ! 1. Triad energy transfer (all)
+              triad_loc(is,ir,itor,1) = triad_loc(is,ir,itor,1) &
+               + fpackA(ir,itor-nt1+1,iexch_base+isplit0)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul
+              ! 2. Triad energy transfer ( {NZF-NZF} coupling )
+              if (itor == 0) then
+                ! New : Diag. direct ZF production [A. Ishizawa PRL 2019 ]
+                triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2)  &
+               + epackA(ir,itor-nt1+1,iexch_base+isplit0)*cprod2*dvr*psi_mul
+              else
+              triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2) &
+               + epackA(ir,itor-nt1+1,iexch_base+isplit0)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul
+              endif
+           else
+              iexch_base = 1+itor0*nsplitB
+              my_psi = fpackB(ir,itor-nt1+1,iexch_base+(isplit0-nsplitA))
+
+              ! 1. Triad energy transfer (all)
+              triad_loc(is,ir,itor,1) = triad_loc(is,ir,itor,1) &
+               + fpackB(ir,itor-nt1+1,iexch_base+(isplit0-nsplitA))*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul
+              ! 2. Triad energy transfer ( {NZF-NZF} coupling )
+              if (itor == 0) then
+                ! New : Diag. direct ZF production [A. Ishizawa PRL 2019 ]
+                triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2)  &
+               + epackB(ir,itor-nt1+1,iexch_base+(isplit0-nsplitA))*cprod2*dvr*psi_mul
+              else
+                triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2)  &
+               + epackB(ir,itor-nt1+1,iexch_base+(isplit0-nsplitA))*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul
+              endif
+           endif
+
+           ! 3. Entropy 
+           triad_loc(is,ir,itor,3) = triad_loc(is,ir,itor,3) &
+                + cap_h_c(ic_loc_m,iv_loc_m,itor)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr &
+                - field(1,ic_loc_m,itor)*conjg(field(1,ic_loc_m,itor))*(z(is)/temp(is))**2*dvr &
+                - 2.0*cprod*conjg(field(1,ic_loc_m,itor))*(z(is)/temp(is))
+           ! 4. Field potential
+           triad_loc(is,ir,itor,4) = triad_loc(is,ir,itor,4)  & 
+                + sum( field(:,ic_loc_m,itor)*conjg(field(:,ic_loc_m,itor)) )*dvp
+           ! 5. Diss. (radial)
+           triad_loc(is,ir,itor,5) = triad_loc(is,ir,itor,5)  &  
+                + diss_r(ic_loc_m,iv_loc_m,itor)*h_x(ic_loc_m,iv_loc_m,itor)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr
+           ! 6. Diss. (theta )
+           rval = omega_stream(it,is,itor)*vel(ie)*xi(ix)
+           rval2 = abs(omega_stream(it,is,itor))
+           cprod = 0.0 
+           cprod2= 0.0
+
+          !icd_c(ic, id, itor)     = ic_c(jr,modulo(it+id-1,n_theta)+1)
+          !jc = icd_c(ic, id, itor)
+          !dtheta(ic, id, itor)    := cderiv(id)*thfac
+          !dtheta_up(ic, id, itor) := uderiv(id)*thfac*up_theta
+          itd = n_theta+it-nup_theta
+          itd_class = 0
+          jc = jr0(itd_class)+itd
+          thfac = thfac_itor(itd_class,itor)
+
+           do id=-nup_theta,nup_theta
+              if (itd > n_theta) then
+                ! move to next itd_class of compute
+                itd = itd - n_theta
+                itd_class = itd_class + 1
+                jc = jr0(itd_class)+itd
+                thfac = thfac_itor(itd_class,itor)
+              endif
+
+              cprod2 = cprod2 - rval* thfac*cderiv(id) *cap_h_c(jc,iv_loc_m,itor)
+              cprod = cprod - rval2* uderiv(id)*up_theta *g_x(jc,iv_loc_m,itor)
+              itd = itd + 1
+              jc = jc + 1
+           enddo 
+
+           triad_loc(is,ir,itor,6) = triad_loc(is,ir,itor,6) + cprod*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr
+           ! 7. Diss. (Coll. = Implicit advance - theta_streaming )
+           triad_loc(is,ir,itor,7) = triad_loc(is,ir,itor,7) &
+                 + ( cap_h_ct(iv_loc_m,itor,ic_loc_m)/delta_t + cprod2*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor)) )*dvr
+
+
+           if ( (itor == 0) .and.  (ir == 1 .or. px(ir) == 0) ) then
+              ! filter
+              my_psi = (0.0,0.0)
+           endif
+ 
+           if (itor == 0) then
+              my_psi = my_psi*zf_scale
+           endif
+
+           ! RHS -> -[f,g] = [f,g]_{r,-alpha}
+           rhs(ic_loc_m,iv_loc_m,itor,ij) = rhs(ic_loc_m,iv_loc_m,itor,ij)+psi_mul*my_psi
+        enddo
+      enddo
+    enddo
+  enddo
+
+  else ! nsplitB==0
+
+#if defined(OMPGPU)
+!$omp target teams distribute parallel do simd collapse(4) &
+!$omp&         private(iexch0,itor0,isplit0,iexch_base) &
+!$omp&         private(ic_loc_m,my_psi)
+#elif defined(_OPENACC)
+!$acc parallel loop collapse(4) gang vector independent private(ic_loc_m,my_psi) &
+!$acc&         private(iexch0,itor0,isplit0,iexch_base) &
+!$acc&         present(ic_c,px,rhs,fpackA) copyin(psi_mul,zf_scale) &
+!$acc&         present(nt1,nt2,nv_loc,n_theta,n_radial,nsplit,nsplitA) copyin(ij) default(none)
+#else
+!$omp parallel do collapse(2) private(ic_loc_m,my_psi) &
+!$omp&         private(iexch0,itor0,isplit0,iexch_base,is,ix,ie,dv,dvr,dvp,rval,rval2,cprod,cprod2) &
+!$omp&         private(id,itorbox,jr0,jc,itd,itd_class,thfac)
+#endif
+  do itor=nt1,nt2
+    do iv_loc_m=1,nv_loc
+      do ir=1,n_radial
+        itorbox = itor*box_size*sign_qs
+        jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial)
+        jr0(1) = n_theta*(ir-1)
+        jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial)
+
+        do it=1,n_theta
+           ic_loc_m = ic_c(ir,it)
+
+           is = is_v(iv_loc_m +nv1 -1 )
+           ix = ix_v(iv_loc_m +nv1 -1 )
+           ie = ie_v(iv_loc_m +nv1 -1 )
+           dv = w_exi(ie,ix)
+           dvr  = w_theta(it)*dens2_rot(it,is)*dv
+           dvp = w_theta(it)*dv*(ir-1-nx0/2)**2
+
+           ! Density moment
+           cprod = w_theta(it)*cap_h_c(ic_loc_m,iv_loc_m,itor)*dvjvec_c(1,ic_loc_m,iv_loc_m,itor)/z(is)
+           cprod = -(dvr*z(is)/temp(is)*field(1,ic_loc_m,itor)-cprod)
+           cprod2= ( jvec_c(1,ic_loc_m,iv_loc_m,itor)*z(is)/temp(is) )*conjg(field(1,ic_loc_m,itor) )
+
+
+           iexch0 = (iv_loc_m-1) + (it-1)*nv_loc
+           itor0 = iexch0/nsplit
+           isplit0 = modulo(iexch0,nsplit)
+           iexch_base = 1+itor0*nsplitA
+           my_psi = fpackA(ir,itor-nt1+1,iexch_base+isplit0)
+     
+
+           ! 1. Triad energy transfer (all)
+           triad_loc(is,ir,itor,1) = triad_loc(is,ir,itor,1) &
+               + fpackA(ir,itor-nt1+1,iexch_base+isplit0)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul
+           ! 2. Triad energy transfer ( {NZF-NZF} coupling )
+           if (itor == 0) then
+             ! New : Diag. direct ZF production [A. Ishizawa PRL 2019 ]
+             triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2)  &
+               + epackA(ir,itor-nt1+1,iexch_base+isplit0)*cprod2*dvr*psi_mul
+           else
+           triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2) &
+               + epackA(ir,itor-nt1+1,iexch_base+isplit0)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul
+           endif
+           
+           ! 3. Entropy 
+           triad_loc(is,ir,itor,3) = triad_loc(is,ir,itor,3) &
+                + cap_h_c(ic_loc_m,iv_loc_m,itor)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr &
+                - field(1,ic_loc_m,itor)*conjg(field(1,ic_loc_m,itor))*(z(is)/temp(is))**2*dvr &
+                - 2.0*cprod*conjg(field(1,ic_loc_m,itor))*(z(is)/temp(is))
+           ! 4. Field potential
+           triad_loc(is,ir,itor,4) = triad_loc(is,ir,itor,4)  & 
+                + sum( field(:,ic_loc_m,itor)*conjg(field(:,ic_loc_m,itor)) )*dvp
+           ! 5. Diss. (radial)
+           triad_loc(is,ir,itor,5) = triad_loc(is,ir,itor,5)  &  
+                + diss_r(ic_loc_m,iv_loc_m,itor)*h_x(ic_loc_m,iv_loc_m,itor)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr
+           ! 6. Diss. (theta )
+           rval = omega_stream(it,is,itor)*vel(ie)*xi(ix)
+           rval2 = abs(omega_stream(it,is,itor))
+           cprod = 0.0 
+           cprod2= 0.0
+
+          !icd_c(ic, id, itor)     = ic_c(jr,modulo(it+id-1,n_theta)+1)
+          !jc = icd_c(ic, id, itor)
+          !dtheta(ic, id, itor)    := cderiv(id)*thfac
+          !dtheta_up(ic, id, itor) := uderiv(id)*thfac*up_theta
+          itd = n_theta+it-nup_theta
+          itd_class = 0
+          jc = jr0(itd_class)+itd
+          thfac = thfac_itor(itd_class,itor)
+
+           do id=-nup_theta,nup_theta
+              if (itd > n_theta) then
+                ! move to next itd_class of compute
+                itd = itd - n_theta
+                itd_class = itd_class + 1
+                jc = jr0(itd_class)+itd
+                thfac = thfac_itor(itd_class,itor)
+              endif
+
+              cprod2 = cprod2 - rval* thfac*cderiv(id) *cap_h_c(jc,iv_loc_m,itor)
+              cprod = cprod - rval2* uderiv(id)*up_theta *g_x(jc,iv_loc_m,itor)
+              itd = itd + 1
+              jc = jc + 1
+           enddo 
+
+           triad_loc(is,ir,itor,6) = triad_loc(is,ir,itor,6) + cprod*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr
+           ! 7. Diss. (Coll. = Implicit advance - theta_streaming )
+           triad_loc(is,ir,itor,7) = triad_loc(is,ir,itor,7) &
+                 + ( cap_h_ct(iv_loc_m,itor,ic_loc_m)/delta_t + cprod2*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor)) )*dvr
+
+
+           if ( (itor == 0) .and.  (ir == 1 .or. px(ir) == 0) ) then
+              ! filter
+              my_psi = (0.0,0.0)
+           endif
+
+           if (itor == 0) then
+              my_psi = my_psi*zf_scale
+           endif
+           
+           ! RHS -> -[f,g] = [f,g]_{r,-alpha}
+           rhs(ic_loc_m,iv_loc_m,itor,ij) = rhs(ic_loc_m,iv_loc_m,itor,ij)+psi_mul*my_psi
+        enddo
+      enddo
+    enddo
+  enddo
+
+  endif ! if nsplitB>0
+
+  call timer_lib_out('nl')
+
+end subroutine cgyro_nl_fftw_comm1_r_triad
+
+
 !
 ! Comm2 is a transpose
 ! Reminder: nc ~= n_radial*n_theta
diff --git a/cgyro/src/cgyro_nl_fftw.f90 b/cgyro/src/cgyro_nl_fftw.f90
index 0fbe6a200..d56143bbf 100644
--- a/cgyro/src/cgyro_nl_fftw.f90
+++ b/cgyro/src/cgyro_nl_fftw.f90
@@ -64,6 +64,102 @@ subroutine cgyro_nl_fftw_stepr(g_j, f_j, nl_idx, i_omp)
 
 end subroutine cgyro_nl_fftw_stepr
 
+subroutine cgyro_nl_fftw_stepr_triad(g_j, f_j, nl_idx, i_omp)
+
+  use timer_lib
+  use parallel_lib
+  use cgyro_globals
+
+  implicit none
+
+  integer, intent(in) :: g_j, f_j
+  integer,intent(in) :: nl_idx ! 1=>A, 2=>B
+  integer,intent(in) :: i_omp
+  real :: y_mean(nx)
+  integer :: ix,iy
+  integer :: ir,itm,itl,itor
+
+  include 'fftw3.f03'
+
+  ! 1. Poisson bracket in real space
+  uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny)
+
+  call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp))
+
+  ! NOTE: The FFT will generate an unwanted n=0,p=-nr/2 component
+  ! that will be filtered in the main time-stepping loop
+
+  ! this should really be accounted against nl_mem, but hard to do with OMP
+  do itm=1,n_toroidal_procs
+   do itl=1,nt_loc
+    itor=itl + (itm-1)*nt_loc
+    do ir=1,n_radial
+     ix = ir-1-nx0/2
+     if (ix < 0) ix = ix+nx
+     iy = itor-1
+     if (nl_idx==1) then
+       fA_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp)
+     else
+       fB_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp)
+     endif
+    enddo
+   enddo
+  enddo
+
+
+  ! 2. Poisson bracket in real space with Non_Zonal pairs
+
+  ! remove ky=0 in uxmany
+  y_mean = sum(uxmany(:,:,f_j) ,dim=1 ) / ny
+  do iy=0,ny-1
+    uxmany(iy,:,f_j) = uxmany(iy,:,f_j) -y_mean
+
+  end do
+
+  ! remove ky=0 in uymany
+  y_mean = sum(uymany(:,:,f_j) ,dim=1 ) / ny ! =0
+  do iy=0,ny-1
+    uymany(iy,:,f_j) = uymany(iy,:,f_j) -y_mean
+
+  end do
+
+  ! remove ky=0 in vx
+  y_mean = sum(vxmany(:,:,g_j) ,dim=1 ) / ny
+  do iy=0,ny-1
+    vxmany(iy,:,g_j) = vxmany(iy,:,g_j) -y_mean
+
+  end do
+
+  ! remove ky=0 in vy
+  y_mean = sum(vymany(:,:,g_j) ,dim=1 ) / ny ! =0
+  do iy=0,ny-1
+    vymany(iy,:,g_j) = vymany(iy,:,g_j) -y_mean
+
+  end do
+
+  uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny)
+
+  call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp))
+
+  do itm=1,n_toroidal_procs
+   do itl=1,nt_loc
+    itor=itl + (itm-1)*nt_loc
+    do ir=1,n_radial
+     ix = ir-1-nx0/2
+     if (ix < 0) ix = ix+nx
+     iy = itor-1
+     if (nl_idx==1) then
+       eA_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp)
+     else
+       eB_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp)
+     endif
+    enddo
+   enddo
+  enddo
+
+end subroutine cgyro_nl_fftw_stepr_triad
+
+
 ! NOTE: call cgyro_nl_fftw_comm1 before cgyro_nl_fftw
 subroutine cgyro_nl_fftw(ij)
 
@@ -326,3 +422,268 @@ subroutine cgyro_nl_fftw(ij)
   endif ! if nsplitB>0
 
 end subroutine cgyro_nl_fftw
+
+
+subroutine cgyro_nl_fftw_triad(ij)
+
+  use timer_lib
+  use parallel_lib
+  use cgyro_nl_comm
+  use cgyro_globals
+
+  implicit none
+
+  !-----------------------------------
+  integer, intent(in) :: ij
+  !-----------------------------------
+  integer :: ix,iy
+  integer :: ir,it,itm,itl,it_loc
+  integer :: itor,mytm
+  integer :: j,p
+  integer :: i_omp
+  integer :: jtheta_min
+
+  complex :: f0,g0
+
+  integer, external :: omp_get_thread_num
+
+  include 'fftw3.f03'
+  
+  call cgyro_nl_fftw_comm_test()
+
+  ! time to wait for the FA_nl to become avaialble
+  call timer_lib_in('nl_comm')
+  call parallel_slib_f_nc_wait(nsplitA,fpackA,fA_nl,fA_req)
+  fA_req_valid = .FALSE.
+  ! make sure reqs progress
+  call cgyro_nl_fftw_comm_test()
+  call timer_lib_out('nl_comm')
+
+  call timer_lib_in('nl')
+
+! f_nl is (radial, nt_loc, theta, nv_loc1, toroidal_procs)
+! where nv_loc1 * toroidal_procs >= nv_loc
+!$omp parallel do schedule(dynamic,1) private(itm,itl,itor,iy,ir,p,ix,f0,i_omp,j)
+  do j=1,nsplitA
+        i_omp = omp_get_thread_num()+1
+
+        ! zero elements not otherwise set below
+        fx(0:ny2,nx2:nx0-1,i_omp) = 0.0
+        fy(0:ny2,nx2:nx0-1,i_omp) = 0.0
+
+        ! Array mapping
+        do ir=1,n_radial
+           p  = ir-1-nx0/2
+           ix = p
+           if (ix < 0) ix = ix+nx
+           do itm=1,n_toroidal_procs
+            do itl=1,nt_loc
+              itor=itl + (itm-1)*nt_loc
+              iy = itor-1
+              f0 = i_c*fA_nl(ir,itl,j,itm)
+              fx(iy,ix,i_omp) = p*f0
+              fy(iy,ix,i_omp) = iy*f0
+            enddo
+           enddo
+           if ((ix/=0) .and. (ix<(nx/2))) then ! happens after ix>nx/2
+             ! Average elements so as to ensure
+             !   f(kx,ky=0) = f(-kx,ky=0)^*
+             ! This symmetry is required for complex input to c2r
+             f0 = 0.5*( fx(0,ix,i_omp)+conjg(fx(0,nx-ix,i_omp)) )
+             fx(0,ix   ,i_omp) = f0
+             fx(0,nx-ix,i_omp) = conjg(f0)
+           endif
+           fx(n_toroidal:ny2,ix,i_omp) = 0.0
+           fy(n_toroidal:ny2,ix,i_omp) = 0.0
+        enddo
+
+        if (i_omp==1) then
+         ! use the main thread to progress the async MPI
+          call cgyro_nl_fftw_comm_test()
+        endif
+
+        call fftw_execute_dft_c2r(plan_c2r,fx(:,:,i_omp),uxmany(:,:,j))
+        if (i_omp==1) then
+         ! use the main thread to progress the async MPI
+          call cgyro_nl_fftw_comm_test()
+        endif
+
+        call fftw_execute_dft_c2r(plan_c2r,fy(:,:,i_omp),uymany(:,:,j))
+  enddo ! j
+
+  call timer_lib_out('nl')
+
+  call timer_lib_in('nl_comm')
+  ! time to wait for the g_nl to become avaialble
+  call parallel_slib_f_fd_wait(n_field,n_radial,n_jtheta,gpack,g_nl,g_req)
+  g_req_valid = .FALSE.
+  ! make sure reqs progress
+  call cgyro_nl_fftw_comm_test()
+  call timer_lib_out('nl_comm')
+
+  call timer_lib_in('nl')
+
+! g_nl      is (n_field,n_radial,n_jtheta,nt_loc,n_toroidal_procs)
+! jcev_c_nl is (n_field,n_radial,n_jtheta,nv_loc,nt_loc,n_toroidal_procs)
+!$omp parallel do schedule(dynamic,1) &
+!$omp& private(itor,mytm,itm,itl,iy,ir,p,ix,g0,i_omp,j,it,iv_loc,it_loc,jtheta_min)
+  do j=1,nsplit
+        i_omp = omp_get_thread_num()+1
+
+        ! zero elements not otherwise set below
+        gx(0:ny2,nx2:nx0-1,i_omp) = 0.0
+        gy(0:ny2,nx2:nx0-1,i_omp) = 0.0
+
+        ! Array mapping
+        do ir=1,n_radial
+           p  = ir-1-nx0/2
+           ix = p
+           if (ix < 0) ix = ix+nx
+           do itm=1,n_toroidal_procs
+            do itl=1,nt_loc
+              itor = itl + (itm-1)*nt_loc
+              mytm = 1 + nt1/nt_loc !my toroidal proc number
+              it = 1+((mytm-1)*nsplit+j-1)/nv_loc
+              iv_loc = 1+modulo((mytm-1)*nsplit+j-1,nv_loc)
+              jtheta_min = 1+((mytm-1)*nsplit)/nv_loc
+              it_loc = it-jtheta_min+1
+
+              iy = itor-1
+              if (it > n_theta) then
+                 g0 = (0.0,0.0)
+              else
+                 g0 = i_c*sum( jvec_c_nl(:,ir,it_loc,iv_loc,itor)*g_nl(:,ir,it_loc,itor))
+              endif
+              gx(iy,ix,i_omp) = p*g0
+              gy(iy,ix,i_omp) = iy*g0
+            enddo
+           enddo
+           if ((ix/=0) .and. (ix<(nx/2))) then ! happens after ix>nx/2
+              ! Average elements so as to ensure
+              !   g(kx,ky=0) = g(-kx,ky=0)^*
+              ! This symmetry is required for complex input to c2r
+              g0 = 0.5*( gx(0,ix,i_omp)+conjg(gx(0,nx-ix,i_omp)) )
+              gx(0,ix   ,i_omp) = g0
+              gx(0,nx-ix,i_omp) = conjg(g0)
+           endif
+           gx(n_toroidal:ny2,ix,i_omp) = 0.0
+           gy(n_toroidal:ny2,ix,i_omp) = 0.0
+        enddo
+
+        if (i_omp==1) then
+         ! use the main thread to progress the async MPI
+          call cgyro_nl_fftw_comm_test()
+        endif
+
+        call fftw_execute_dft_c2r(plan_c2r,gx(:,:,i_omp),vxmany(:,:,j))
+
+        if (i_omp==1) then
+         ! use the main thread to progress the async MPI
+          call cgyro_nl_fftw_comm_test()
+        endif
+
+        call fftw_execute_dft_c2r(plan_c2r,gy(:,:,i_omp),vymany(:,:,j))
+
+        if (i_omp==1) then
+         ! use the main thread to progress the async MPI
+          call cgyro_nl_fftw_comm_test()
+        endif
+
+        if (j<=nsplitA) then
+           call cgyro_nl_fftw_stepr_triad(j, j, 1, i_omp)
+        endif
+        ! else we will do it in the next loop
+  enddo ! j
+
+  call timer_lib_out('nl')
+
+  call timer_lib_in('nl_comm')
+  ! start the async reverse comm
+  ! can reuse the same req, no overlap with forward fA_req
+  call parallel_slib_r_nc_async(nsplitA,fA_nl,fpackA,fA_req)
+  call parallel_slib_r_nc_async(nsplitA,eA_nl,epackA,eA_req)
+  fA_req_valid = .TRUE.
+
+  if (nsplitB > 0) then
+    ! time to wait for the 2nd half of F_nl to become avaialble
+    call parallel_slib_f_nc_wait(nsplitB,fpackB,fB_nl,fB_req)
+    fB_req_valid = .FALSE.
+  endif
+
+  ! make sure reqs progress
+  call cgyro_nl_fftw_comm_test()
+  call timer_lib_out('nl_comm')
+
+  if (nsplitB > 0) then
+   call timer_lib_in('nl')
+
+! f_nl is (radial, nt_loc, theta, nv_loc1, toroidal_procs)
+! where nv_loc1 * toroidal_procs >= nv_loc
+!$omp parallel do schedule(dynamic,1) private(itm,itl,itor,iy,ir,p,ix,f0,i_omp,j)
+   do j=1,nsplitB
+        i_omp = omp_get_thread_num()+1
+
+        ! zero elements not otherwise set below
+        fx(0:ny2,nx2:nx0-1,i_omp) = 0.0
+        fy(0:ny2,nx2:nx0-1,i_omp) = 0.0
+
+        ! Array mapping
+        do ir=1,n_radial
+           p  = ir-1-nx0/2
+           ix = p
+           if (ix < 0) ix = ix+nx
+           do itm=1,n_toroidal_procs
+            do itl=1,nt_loc
+              itor=itl + (itm-1)*nt_loc
+              iy = itor-1
+              f0 = i_c*fB_nl(ir,itl,j,itm)
+              fx(iy,ix,i_omp) = p*f0
+              fy(iy,ix,i_omp) = iy*f0
+            enddo
+           enddo
+           if ((ix/=0) .and. (ix<(nx/2))) then ! happens after ix>nx/2
+             ! Average elements so as to ensure
+             !   f(kx,ky=0) = f(-kx,ky=0)^*
+             ! This symmetry is required for complex input to c2r
+             f0 = 0.5*( fx(0,ix,i_omp)+conjg(fx(0,nx-ix,i_omp)) )
+             fx(0,ix   ,i_omp) = f0
+             fx(0,nx-ix,i_omp) = conjg(f0)
+           endif
+           fx(n_toroidal:ny2,ix,i_omp) = 0.0
+           fy(n_toroidal:ny2,ix,i_omp) = 0.0
+        enddo
+
+        if (i_omp==1) then
+         ! use the main thread to progress the async MPI
+          call cgyro_nl_fftw_comm_test()
+        endif
+
+        call fftw_execute_dft_c2r(plan_c2r,fx(:,:,i_omp),uxmany(:,:,j))
+        if (i_omp==1) then
+         ! use the main thread to progress the async MPI
+          call cgyro_nl_fftw_comm_test()
+        endif
+
+        call fftw_execute_dft_c2r(plan_c2r,fy(:,:,i_omp),uymany(:,:,j))
+        if (i_omp==1) then
+         ! use the main thread to progress the async MPI
+          call cgyro_nl_fftw_comm_test()
+        endif
+
+        call cgyro_nl_fftw_stepr_triad(nsplitA+j, j, 2, i_omp)
+   enddo ! j
+
+   call timer_lib_out('nl')
+
+   call timer_lib_in('nl_comm')
+   ! start the async reverse comm
+   ! can reuse the same req, no overlap with forward fB_req
+   call parallel_slib_r_nc_async(nsplitB,fB_nl,fpackB,fB_req)
+   call parallel_slib_r_nc_async(nsplitB,eB_nl,epackB,eB_req)
+   fB_req_valid = .TRUE.
+   ! make sure reqs progress
+   call cgyro_nl_fftw_comm_test()
+   call timer_lib_out('nl_comm')
+  endif ! if nsplitB>0
+
+end subroutine cgyro_nl_fftw_triad
diff --git a/cgyro/src/cgyro_read_input.f90 b/cgyro/src/cgyro_read_input.f90
index be77de7dd..bfbe4bf71 100644
--- a/cgyro/src/cgyro_read_input.f90
+++ b/cgyro/src/cgyro_read_input.f90
@@ -70,6 +70,7 @@ subroutine cgyro_read_input
   call cgyro_readbc_int(moment_print_flag,'MOMENT_PRINT_FLAG')
   call cgyro_readbc_int(gflux_print_flag,'GFLUX_PRINT_FLAG')
   call cgyro_readbc_int(field_print_flag,'FIELD_PRINT_FLAG')
+  call cgyro_readbc_int(triad_print_flag,'TRIAD_PRINT_FLAG')
   call cgyro_readbc_real(amp0)
   call cgyro_readbc_real(amp)
   call cgyro_readbc_real(gamma_e)
diff --git a/cgyro/src/cgyro_rhs.F90 b/cgyro/src/cgyro_rhs.F90
index 2f762bd16..b12c6cca5 100644
--- a/cgyro/src/cgyro_rhs.F90
+++ b/cgyro/src/cgyro_rhs.F90
@@ -35,7 +35,11 @@ subroutine cgyro_rhs_r_comm_async(ij)
   integer, intent(in) :: ij
 
   if (nonlinear_flag == 1) then
-       call cgyro_nl_fftw_comm1_r(ij)
+     if (triad_print_flag == 1 .and. ij == 4) then
+        call cgyro_nl_fftw_comm1_r_triad(ij)
+     else
+        call cgyro_nl_fftw_comm1_r(ij)
+     endif
   endif
 
 end subroutine cgyro_rhs_r_comm_async
@@ -410,7 +414,11 @@ subroutine cgyro_rhs(ij,update_cap)
   if (nonlinear_flag == 1) then
      ! assumes someone already started the input comm
      ! and will finish the output comm
-     call cgyro_nl_fftw()
+     if (triad_print_flag == 1 .and. ij == 4) then
+        call cgyro_nl_fftw_triad()
+     else
+        call cgyro_nl_fftw()
+     endif
   endif
 
   call cgyro_upwind_complete
diff --git a/cgyro/src/cgyro_step_collision.F90 b/cgyro/src/cgyro_step_collision.F90
index f422e88a4..228360d2f 100644
--- a/cgyro/src/cgyro_step_collision.F90
+++ b/cgyro/src/cgyro_step_collision.F90
@@ -334,6 +334,7 @@ subroutine cgyro_step_collision_cpu(use_simple)
 
   ! Compute H given h and [phi(h), apar(h)]
 
+ if (triad_print_flag == 1 ) then
 !$omp parallel do collapse(2) &
 !$omp&            private(iv_loc,is,ic,iv,my_psi,my_ch) firstprivate(nc)
   do itor=nt1,nt2
@@ -341,14 +342,44 @@ subroutine cgyro_step_collision_cpu(use_simple)
      iv_loc = iv-nv1+1
      is = is_v(iv)
      do ic=1,nc
+  
+        ! Save collisional diss. 
+        my_ch = cap_h_ct(iv_loc,itor,ic)
+        my_psi = sum(jvec_c(:,ic,iv_loc,itor)*field(:,ic,itor))
+           
+        my_psi = my_ch-my_psi*(z(is)/temp(is))
+        cap_h_ct(iv_loc,itor,ic) = (cap_h_c(ic,iv_loc,itor) + my_ch) / 2.0
+        cap_h_ct(iv_loc,itor,ic) = conjg(cap_h_ct(iv_loc,itor,ic)) &
+          * ( my_psi - h_x(ic,iv_loc,itor) )
+
+        h_x(ic,iv_loc,itor) = my_psi
+        cap_h_c(ic,iv_loc,itor) = my_ch
+
+     enddo
+   enddo
+  enddo
+
+ else
+
+!$omp parallel do collapse(2) &
+!$omp&            private(iv_loc,is,ic,iv,my_psi,my_ch) firstprivate(nc)
+  do itor=nt1,nt2
+   do iv=nv1,nv2
+     iv_loc = iv-nv1+1
+     is = is_v(iv)
+     do ic=1,nc
+
         my_ch = cap_h_ct(iv_loc,itor,ic)
         my_psi = sum(jvec_c(:,ic,iv_loc,itor)*field(:,ic,itor))
         h_x(ic,iv_loc,itor) = my_ch-my_psi*(z(is)/temp(is))
         cap_h_c(ic,iv_loc,itor) = my_ch
+
      enddo
    enddo
   enddo
 
+ end if ! (triad_print_flag == 1 )
+
   call timer_lib_out('coll')
 
 end subroutine cgyro_step_collision_cpu
diff --git a/cgyro/src/cgyro_write_timedata.f90 b/cgyro/src/cgyro_write_timedata.f90
index 3d6b808a1..36864a9ab 100644
--- a/cgyro/src/cgyro_write_timedata.f90
+++ b/cgyro/src/cgyro_write_timedata.f90
@@ -70,6 +70,14 @@ subroutine cgyro_write_timedata
      enddo
   endif
 
+  if (nonlinear_flag == 1 .and. triad_print_flag == 1) then
+      ! Triad energy transfer for all kxky
+        call cgyro_write_distributed_bcomplex(&
+             trim(path)//binfile_triad,&
+             size(triad(:,:,:,:)),&
+             triad(:,:,:,:))
+  endif
+
   if (field_print_flag == 1) then
      p_field = n_field
   else