diff --git a/cgyro/src/cgyro_cleanup.F90 b/cgyro/src/cgyro_cleanup.F90
index 716a01cc7..1cdaa2e74 100644
--- a/cgyro/src/cgyro_cleanup.F90
+++ b/cgyro/src/cgyro_cleanup.F90
@@ -283,10 +283,10 @@ subroutine cgyro_cleanup
   if(allocated(gx))                deallocate(gx)
   if(allocated(fy))                deallocate(fy)
   if(allocated(gy))                deallocate(gy)
-  if(allocated(uxmany))            deallocate(uxmany)
-  if(allocated(uymany))            deallocate(uymany)
-  if(allocated(vx))                deallocate(vx)
-  if(allocated(vy))                deallocate(vy)
+  if(allocated(vxmany))            deallocate(vxmany)
+  if(allocated(vymany))            deallocate(vymany)
+  if(allocated(ux))                deallocate(ux)
+  if(allocated(uy))                deallocate(uy)
   if(allocated(uv))                deallocate(uv)
 #endif  
 
diff --git a/cgyro/src/cgyro_globals.F90 b/cgyro/src/cgyro_globals.F90
index 76a378b2e..eb248901d 100644
--- a/cgyro/src/cgyro_globals.F90
+++ b/cgyro/src/cgyro_globals.F90
@@ -422,10 +422,10 @@ module cgyro_globals
   !
   ! 2D FFT work arrays
 #ifndef CGYRO_GPU_FFT
-  real, dimension(:,:,:), allocatable :: uxmany
-  real, dimension(:,:,:), allocatable :: uymany
-  real, dimension(:,:,:), allocatable :: vx
-  real, dimension(:,:,:), allocatable :: vy
+  real, dimension(:,:,:), allocatable :: vxmany
+  real, dimension(:,:,:), allocatable :: vymany
+  real, dimension(:,:,:), allocatable :: ux
+  real, dimension(:,:,:), allocatable :: uy
   real, dimension(:,:,:), allocatable :: uv
   complex, dimension(:,:,:),allocatable :: fx
   complex, dimension(:,:,:),allocatable :: fy
diff --git a/cgyro/src/cgyro_init_manager.F90 b/cgyro/src/cgyro_init_manager.F90
index c50fec0d2..7a2499952 100644
--- a/cgyro/src/cgyro_init_manager.F90
+++ b/cgyro/src/cgyro_init_manager.F90
@@ -432,14 +432,14 @@ subroutine cgyro_init_manager
   allocate(fy(0:ny2,0:nx-1,n_omp))
   allocate(gy(0:ny2,0:nx-1,n_omp))
 
-  allocate(uxmany(0:ny-1,0:nx-1,nsplit))
-  allocate(uymany(0:ny-1,0:nx-1,nsplit))
-  allocate(vx(0:ny-1,0:nx-1,n_omp))
-  allocate(vy(0:ny-1,0:nx-1,n_omp))
+  allocate(vxmany(0:ny-1,0:nx-1,nsplit))
+  allocate(vymany(0:ny-1,0:nx-1,nsplit))
+  allocate(ux(0:ny-1,0:nx-1,n_omp))
+  allocate(uy(0:ny-1,0:nx-1,n_omp))
   allocate(uv(0:ny-1,0:nx-1,n_omp))
 
   ! Create plans once and for all, with global arrays fx,ux
-  plan_c2r = fftw_plan_dft_c2r_2d(nx,ny,gx(:,:,1),vx(:,:,1),FFTW_PATIENT)
+  plan_c2r = fftw_plan_dft_c2r_2d(nx,ny,fx(:,:,1),ux(:,:,1),FFTW_PATIENT)
   plan_r2c = fftw_plan_dft_r2c_2d(nx,ny,uv(:,:,1),fx(:,:,1),FFTW_PATIENT)
 #endif
 
diff --git a/cgyro/src/cgyro_nl_fftw.f90 b/cgyro/src/cgyro_nl_fftw.f90
index 6d0388d23..f8a9acd76 100644
--- a/cgyro/src/cgyro_nl_fftw.f90
+++ b/cgyro/src/cgyro_nl_fftw.f90
@@ -26,7 +26,7 @@ subroutine cgyro_nl_fftw_stepr(j, i_omp)
 
   ! Poisson bracket in real space
 
-  uv(:,:,i_omp) = (uxmany(:,:,j)*vy(:,:,i_omp)-uymany(:,:,j)*vx(:,:,i_omp))/(nx*ny)
+  uv(:,:,i_omp) = (ux(:,:,i_omp)*vymany(:,:,j)-uy(:,:,i_omp)*vxmany(:,:,j))/(nx*ny)
 
   call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp))
 
@@ -76,36 +76,35 @@ subroutine cgyro_nl_fftw(ij)
 
   include 'fftw3.f03'
   
-  ! no staggering, will need all threads for FFTW in one pass
   force_early_comm2 = (n_omp>=(4*nsplit)) 
   one_pass_fft = force_early_comm2
 
-  ! time to wait for the F_nl to become avaialble
+  ! time to wait for the g_nl to become avaialble
   call timer_lib_in('nl_comm')
-  call parallel_slib_f_nc_wait(fpack,f_nl,f_req)
-  ! make sure g_req progresses
-  call parallel_slib_test(g_req)
-  call timer_lib_out('nl_comm')
-
+  call parallel_slib_f_fd_wait(n_field,n_radial,n_jtheta,gpack,g_nl,g_req)
   if (force_early_comm2) then
-     ! time to wait for the g_nl to become avaialble
-     call timer_lib_in('nl_comm')
-     call parallel_slib_f_fd_wait(n_field,n_radial,n_jtheta,gpack,g_nl,g_req)
-     call timer_lib_out('nl_comm')
+     ! wait also for the f_nl to become avaialble
+     call parallel_slib_f_nc_wait(fpack,f_nl,f_req)
+  else
+     ! make sure f_req progresses
+     call parallel_slib_test(f_req)
   endif
+  call timer_lib_out('nl_comm')
+
 
   call timer_lib_in('nl')
 
-! f_nl is (radial, nt_loc, theta, nv_loc1, toroidal_procs)
-! where nv_loc1 * toroidal_procs >= nv_loc
-  if (n_omp<=nsplit) then
-!$omp parallel do private(itm,itl,itor,iy,ir,p,ix,f0,i_omp,j)
+! g_nl      is (n_field,n_radial,n_jtheta,nt_loc,n_toroidal_procs)
+! jcev_c_nl is (n_field,n_radial,n_jtheta,nv_loc,nt_loc,n_toroidal_procs)
+  if (n_omp <= nsplit) then
+!$omp parallel do schedule(dynamic,1) &
+!$omp& private(itor,mytm,itm,itl,iy,ir,p,ix,g0,i_omp,j,it,iv_loc,it_loc,jtheta_min)
      do j=1,nsplit
         i_omp = omp_get_thread_num()+1
 
         ! zero elements not otherwise set below
-        fx(0:ny2,nx2:nx0-1,i_omp) = 0.0
-        fy(0:ny2,nx2:nx0-1,i_omp) = 0.0
+        gx(0:ny2,nx2:nx0-1,i_omp) = 0.0
+        gy(0:ny2,nx2:nx0-1,i_omp) = 0.0
 
         ! Array mapping
         do ir=1,n_radial
@@ -114,27 +113,48 @@ subroutine cgyro_nl_fftw(ij)
            if (ix < 0) ix = ix+nx
            do itm=1,n_toroidal_procs
             do itl=1,nt_loc
-              itor=itl + (itm-1)*nt_loc
+              itor = itl + (itm-1)*nt_loc
+              mytm = 1 + nt1/nt_loc !my toroidal proc number
+              it = 1+((mytm-1)*nsplit+j-1)/nv_loc
+              iv_loc = 1+modulo((mytm-1)*nsplit+j-1,nv_loc)
+              jtheta_min = 1+((mytm-1)*nsplit)/nv_loc
+              it_loc = it-jtheta_min+1
+
               iy = itor-1
-              f0 = i_c*f_nl(ir,itl,j,itm)
-              fx(iy,ix,i_omp) = p*f0
-              fy(iy,ix,i_omp) = iy*f0
+              if (it > n_theta) then
+                 g0 = (0.0,0.0)
+              else
+                 g0 = i_c*sum( jvec_c_nl(:,ir,it_loc,iv_loc,itor)*g_nl(:,ir,it_loc,itor))
+              endif
+              gx(iy,ix,i_omp) = p*g0
+              gy(iy,ix,i_omp) = iy*g0
             enddo
            enddo
            if ((ix/=0) .and. (ix<(nx/2))) then ! happens after ix>nx/2
-             ! Average elements so as to ensure
-             !   f(kx,ky=0) = f(-kx,ky=0)^*
-             ! This symmetry is required for complex input to c2r
-             f0 = 0.5*( fx(0,ix,i_omp)+conjg(fx(0,nx-ix,i_omp)) )
-             fx(0,ix   ,i_omp) = f0
-             fx(0,nx-ix,i_omp) = conjg(f0)
+              ! Average elements so as to ensure
+              !   g(kx,ky=0) = g(-kx,ky=0)^*
+              ! This symmetry is required for complex input to c2r
+              g0 = 0.5*( gx(0,ix,i_omp)+conjg(gx(0,nx-ix,i_omp)) )
+              gx(0,ix   ,i_omp) = g0
+              gx(0,nx-ix,i_omp) = conjg(g0)
            endif
-           fx(n_toroidal:ny2,ix,i_omp) = 0.0
-           fy(n_toroidal:ny2,ix,i_omp) = 0.0
+           gx(n_toroidal:ny2,ix,i_omp) = 0.0
+           gy(n_toroidal:ny2,ix,i_omp) = 0.0
         enddo
 
-        call fftw_execute_dft_c2r(plan_c2r,fx(:,:,i_omp),uxmany(:,:,j))
-        call fftw_execute_dft_c2r(plan_c2r,fy(:,:,i_omp),uymany(:,:,j))
+        if (i_omp==1) then
+         ! use the main thread to progress the async MPI
+          call parallel_slib_test(f_req)
+        endif
+
+        call fftw_execute_dft_c2r(plan_c2r,gx(:,:,i_omp),vxmany(:,:,j))
+
+        if (i_omp==1) then
+         ! use the main thread to progress the async MPI
+          call parallel_slib_test(f_req)
+        endif
+
+        call fftw_execute_dft_c2r(plan_c2r,gy(:,:,i_omp),vymany(:,:,j))
      enddo ! j
   else ! (n_omp>nsplit), increase parallelism
      num_one_pass = 2
@@ -148,58 +168,6 @@ subroutine cgyro_nl_fftw(ij)
 
            select case(o)
            case (1)
-              ! zero elements not otherwise set below
-              fx(0:ny2,nx2:nx0-1,i_omp) = 0.0
-
-              ! Array mapping
-              do ir=1,n_radial
-                 p  = ir-1-nx0/2
-                 ix = p
-                 if (ix < 0) ix = ix+nx
-                 do itm=1,n_toroidal_procs
-                   do itl=1,nt_loc
-                    itor=itl + (itm-1)*nt_loc
-                    iy = itor-1
-                    f0 = i_c*f_nl(ir,itl,j,itm)
-                    fx(iy,ix,i_omp) = p*f0
-                   enddo
-                 enddo
-                 if ((ix/=0) .and. (ix<(nx/2))) then ! happens after ix>nx/2
-                    ! Average elements so as to ensure
-                    !   f(kx,ky=0) = f(-kx,ky=0)^*
-                    ! This symmetry is required for complex input to c2r
-                    f0 = 0.5*( fx(0,ix,i_omp)+conjg(fx(0,nx-ix,i_omp)) )
-                    fx(0,ix   ,i_omp) = f0
-                    fx(0,nx-ix,i_omp) = conjg(f0)
-                 endif
-                 fx(n_toroidal:ny2,ix,i_omp) = 0.0
-              enddo
-
-              call fftw_execute_dft_c2r(plan_c2r,fx(:,:,i_omp),uxmany(:,:,j))
-
-           case (2)
-              ! zero elements not otherwise set below
-              fy(0:ny2,nx2:nx0-1,i_omp) = 0.0
-
-              ! Array mapping
-              do ir=1,n_radial
-                 p  = ir-1-nx0/2
-                 ix = p
-                 if (ix < 0) ix = ix+nx
-                 do itm=1,n_toroidal_procs
-                   do itl=1,nt_loc
-                    itor=itl + (itm-1)*nt_loc
-                    iy = itor-1
-                    f0 = i_c*f_nl(ir,itl,j,itm)
-                    fy(iy,ix,i_omp) = iy*f0
-                   enddo
-                 enddo
-                 fy(n_toroidal:ny2,ix,i_omp) = 0.0
-              enddo
-
-              call fftw_execute_dft_c2r(plan_c2r,fy(:,:,i_omp),uymany(:,:,j))
-
-           case (3)
               ! zero elements not otherwise set below
               gx(0:ny2,nx2:nx0-1,i_omp) = 0.0
 
@@ -237,9 +205,9 @@ subroutine cgyro_nl_fftw(ij)
                  gx(n_toroidal:ny2,ix,i_omp) = 0.0
               enddo
 
-              call fftw_execute_dft_c2r(plan_c2r,gx(:,:,i_omp),vx(:,:,i_omp))
+              call fftw_execute_dft_c2r(plan_c2r,gx(:,:,i_omp),vxmany(:,:,j))
 
-           case (4)
+           case (2)
               ! zero elements not otherwise set below
               gy(0:ny2,nx2:nx0-1,i_omp) = 0.0
 
@@ -269,7 +237,59 @@ subroutine cgyro_nl_fftw(ij)
                  gy(n_toroidal:ny2,ix,i_omp) = 0.0
               enddo
 
-              call fftw_execute_dft_c2r(plan_c2r,gy(:,:,i_omp),vy(:,:,i_omp))
+              call fftw_execute_dft_c2r(plan_c2r,gy(:,:,i_omp),vymany(:,:,j))
+
+           case (3)
+              ! zero elements not otherwise set below
+              fx(0:ny2,nx2:nx0-1,i_omp) = 0.0
+
+              ! Array mapping
+              do ir=1,n_radial
+                 p  = ir-1-nx0/2
+                 ix = p
+                 if (ix < 0) ix = ix+nx
+                 do itm=1,n_toroidal_procs
+                   do itl=1,nt_loc
+                    itor=itl + (itm-1)*nt_loc
+                    iy = itor-1
+                    f0 = i_c*f_nl(ir,itl,j,itm)
+                    fx(iy,ix,i_omp) = p*f0
+                   enddo
+                 enddo
+                 if ((ix/=0) .and. (ix<(nx/2))) then ! happens after ix>nx/2
+                    ! Average elements so as to ensure
+                    !   f(kx,ky=0) = f(-kx,ky=0)^*
+                    ! This symmetry is required for complex input to c2r
+                    f0 = 0.5*( fx(0,ix,i_omp)+conjg(fx(0,nx-ix,i_omp)) )
+                    fx(0,ix   ,i_omp) = f0
+                    fx(0,nx-ix,i_omp) = conjg(f0)
+                 endif
+                 fx(n_toroidal:ny2,ix,i_omp) = 0.0
+              enddo
+
+              call fftw_execute_dft_c2r(plan_c2r,fx(:,:,i_omp),ux(:,:,i_omp))
+
+           case (4)
+              ! zero elements not otherwise set below
+              fy(0:ny2,nx2:nx0-1,i_omp) = 0.0
+
+              ! Array mapping
+              do ir=1,n_radial
+                 p  = ir-1-nx0/2
+                 ix = p
+                 if (ix < 0) ix = ix+nx
+                 do itm=1,n_toroidal_procs
+                   do itl=1,nt_loc
+                    itor=itl + (itm-1)*nt_loc
+                    iy = itor-1
+                    f0 = i_c*f_nl(ir,itl,j,itm)
+                    fy(iy,ix,i_omp) = iy*f0
+                   enddo
+                 enddo
+                 fy(n_toroidal:ny2,ix,i_omp) = 0.0
+              enddo
+
+              call fftw_execute_dft_c2r(plan_c2r,fy(:,:,i_omp),uy(:,:,i_omp))
 
            end select
         enddo ! o
@@ -277,80 +297,68 @@ subroutine cgyro_nl_fftw(ij)
   endif
 
   call timer_lib_out('nl')
-  
-  ! stagger comm2, to load ballance network traffic
+
   if (.not. force_early_comm2) then
-     ! time to wait for the g_nl to become avaialble
+     ! time to wait for the f_nl to become avaialble
      call timer_lib_in('nl_comm')
-     call parallel_slib_f_fd_wait(n_field,n_radial,n_jtheta,gpack,g_nl,g_req)
+     call parallel_slib_f_nc_wait(fpack,f_nl,f_req)
      call timer_lib_out('nl_comm')
   endif
 
   call timer_lib_in('nl')
 
-! g_nl      is (n_field,n_radial,n_jtheta,nt_loc,n_toroidal_procs)
-! jcev_c_nl is (n_field,n_radial,n_jtheta,nv_loc,nt_loc,n_toroidal_procs)
+! f_nl is (radial, nt_loc, theta, nv_loc1, toroidal_procs)
+! where nv_loc1 * toroidal_procs >= nv_loc
   if (n_omp <= nsplit) then
-!$omp parallel do private(itor,mytm,itm,itl,iy,ir,p,ix,g0,i_omp,j,it,iv_loc,it_loc,jtheta_min)
+!$omp parallel do private(itm,itl,itor,iy,ir,p,ix,f0,i_omp,j)
      do j=1,nsplit
         i_omp = omp_get_thread_num()+1
 
         ! zero elements not otherwise set below
-        gx(0:ny2,nx2:nx0-1,i_omp) = 0.0
-        gy(0:ny2,nx2:nx0-1,i_omp) = 0.0
+        fx(0:ny2,nx2:nx0-1,i_omp) = 0.0
+        fy(0:ny2,nx2:nx0-1,i_omp) = 0.0
 
         ! Array mapping
         do ir=1,n_radial
            p  = ir-1-nx0/2
            ix = p
-           if (ix < 0) ix = ix+nx  
+           if (ix < 0) ix = ix+nx
            do itm=1,n_toroidal_procs
             do itl=1,nt_loc
-              itor = itl + (itm-1)*nt_loc
-              mytm = 1 + nt1/nt_loc !my toroidal proc number
-              it = 1+((mytm-1)*nsplit+j-1)/nv_loc
-              iv_loc = 1+modulo((mytm-1)*nsplit+j-1,nv_loc)
-              jtheta_min = 1+((mytm-1)*nsplit)/nv_loc
-              it_loc = it-jtheta_min+1
-
+              itor=itl + (itm-1)*nt_loc
               iy = itor-1
-              if (it > n_theta) then
-                 g0 = (0.0,0.0)
-              else
-                 g0 = i_c*sum( jvec_c_nl(:,ir,it_loc,iv_loc,itor)*g_nl(:,ir,it_loc,itor))
-              endif
-              gx(iy,ix,i_omp) = p*g0
-              gy(iy,ix,i_omp) = iy*g0
+              f0 = i_c*f_nl(ir,itl,j,itm)
+              fx(iy,ix,i_omp) = p*f0
+              fy(iy,ix,i_omp) = iy*f0
             enddo
            enddo
            if ((ix/=0) .and. (ix<(nx/2))) then ! happens after ix>nx/2
-              ! Average elements so as to ensure
-              !   g(kx,ky=0) = g(-kx,ky=0)^*
-              ! This symmetry is required for complex input to c2r
-              g0 = 0.5*( gx(0,ix,i_omp)+conjg(gx(0,nx-ix,i_omp)) )
-              gx(0,ix   ,i_omp) = g0
-              gx(0,nx-ix,i_omp) = conjg(g0)
+             ! Average elements so as to ensure
+             !   f(kx,ky=0) = f(-kx,ky=0)^*
+             ! This symmetry is required for complex input to c2r
+             f0 = 0.5*( fx(0,ix,i_omp)+conjg(fx(0,nx-ix,i_omp)) )
+             fx(0,ix   ,i_omp) = f0
+             fx(0,nx-ix,i_omp) = conjg(f0)
            endif
-           gx(n_toroidal:ny2,ix,i_omp) = 0.0
-           gy(n_toroidal:ny2,ix,i_omp) = 0.0
+           fx(n_toroidal:ny2,ix,i_omp) = 0.0
+           fy(n_toroidal:ny2,ix,i_omp) = 0.0
         enddo
 
-        call fftw_execute_dft_c2r(plan_c2r,gx(:,:,i_omp),vx(:,:,i_omp))
-        call fftw_execute_dft_c2r(plan_c2r,gy(:,:,i_omp),vy(:,:,i_omp))
+        call fftw_execute_dft_c2r(plan_c2r,fx(:,:,i_omp),ux(:,:,i_omp))
+        call fftw_execute_dft_c2r(plan_c2r,fy(:,:,i_omp),uy(:,:,i_omp))
 
         call cgyro_nl_fftw_stepr(j, i_omp)
      enddo ! j
   else ! n_omp>nsplit
      if (.not. one_pass_fft) then
-!$omp parallel do collapse(2) &
-!$omp&            private(itm,itl,itor,mytm,iy,ir,p,ix,g0,i_omp,it,iv_loc,it_loc,jtheta_min)
+!$omp parallel do collapse(2) private(itl,itm,itor,mytm,iy,ir,p,ix,f0,g0,i_omp,j,o,it,iv_loc,it_loc,jtheta_min)
         do j=1,nsplit
            do o=1,2
               i_omp = j ! j<n_omp in this branch, so we can do it
 
               if (o == 1) then
                  ! zero elements not otherwise set below
-                 gx(0:ny2,nx2:nx0-1,i_omp) = 0.0
+                 fx(0:ny2,nx2:nx0-1,i_omp) = 0.0
 
                  ! Array mapping
                  do ir=1,n_radial
@@ -360,36 +368,27 @@ subroutine cgyro_nl_fftw(ij)
                     do itm=1,n_toroidal_procs
                      do itl=1,nt_loc
                        itor = itl + (itm-1)*nt_loc
-                       mytm  = 1 + nt1/nt_loc !my toroidal proc number
-                       it = 1+((mytm-1)*nsplit+j-1)/nv_loc
-                       iv_loc = 1+modulo((mytm-1)*nsplit+j-1,nv_loc)
-                       jtheta_min = 1+((mytm-1)*nsplit)/nv_loc
-                       it_loc = it-jtheta_min+1
-
                        iy = itor-1
-                       if (it > n_theta) then
-                          g0 = (0.0,0.0)
-                       else
-                          g0 = i_c*sum( jvec_c_nl(:,ir,it_loc,iv_loc,itor)*g_nl(:,ir,it_loc,itor))
-                       endif
-                       gx(iy,ix,i_omp) = p*g0
+                       f0 = i_c*f_nl(ir,itl,j,itm)
+                       fx(iy,ix,i_omp) = p*f0
                      enddo
                     enddo
                     if ((ix/=0) .and. (ix<(nx/2))) then ! happens after ix>nx/2
                        ! Average elements so as to ensure
-                       !   g(kx,ky=0) = g(-kx,ky=0)^*
+                       !   f(kx,ky=0) = f(-kx,ky=0)^*
                        ! This symmetry is required for complex input to c2r
-                       g0 = 0.5*( gx(0,ix,i_omp)+conjg(gx(0,nx-ix,i_omp)) )
-                       gx(0,ix   ,i_omp) = g0
-                       gx(0,nx-ix,i_omp) = conjg(g0)
+                       f0 = 0.5*( fx(0,ix,i_omp)+conjg(fx(0,nx-ix,i_omp)) )
+                       fx(0,ix   ,i_omp) = f0
+                       fx(0,nx-ix,i_omp) = conjg(f0)
                     endif
-                    gx(n_toroidal:ny2,ix,i_omp) = 0.0
+                    fx(n_toroidal:ny2,ix,i_omp) = 0.0
                  enddo
 
-                 call fftw_execute_dft_c2r(plan_c2r,gx(:,:,i_omp),vx(:,:,i_omp))
+                 call fftw_execute_dft_c2r(plan_c2r,fx(:,:,i_omp),ux(:,:,i_omp))
+
               else
                  ! zero elements not otherwise set below
-                 gy(0:ny2,nx2:nx0-1,i_omp) = 0.0
+                 fy(0:ny2,nx2:nx0-1,i_omp) = 0.0
 
                  ! Array mapping
                  do ir=1,n_radial
@@ -399,25 +398,16 @@ subroutine cgyro_nl_fftw(ij)
                     do itm=1,n_toroidal_procs
                      do itl=1,nt_loc
                        itor = itl + (itm-1)*nt_loc
-                       mytm  = 1 + nt1/nt_loc !my toroidal proc number
-                       it = 1+((mytm-1)*nsplit+j-1)/nv_loc
-                       iv_loc = 1+modulo((mytm-1)*nsplit+j-1,nv_loc)
-                       jtheta_min = 1+((mytm-1)*nsplit)/nv_loc
-                       it_loc = it-jtheta_min+1
-
                        iy = itor-1
-                       if (it > n_theta) then
-                          g0 = (0.0,0.0)
-                       else
-                          g0 = i_c*sum( jvec_c_nl(:,ir,it_loc,iv_loc,itor)*g_nl(:,ir,it_loc,itor))
-                       endif
-                       gy(iy,ix,i_omp) = iy*g0
+                       f0 = i_c*f_nl(ir,itl,j,itm)
+                       fy(iy,ix,i_omp) = iy*f0
                      enddo
                     enddo
-                    gy(n_toroidal:ny2,ix,i_omp) = 0.0
+                    fy(n_toroidal:ny2,ix,i_omp) = 0.0
                  enddo
 
-                 call fftw_execute_dft_c2r(plan_c2r,gy(:,:,i_omp),vy(:,:,i_omp))
+                 call fftw_execute_dft_c2r(plan_c2r,fy(:,:,i_omp),uy(:,:,i_omp))
+
               endif
            enddo ! o
         enddo ! j
diff --git a/cgyro/src/cgyro_nl_fftw.gpu.F90 b/cgyro/src/cgyro_nl_fftw.gpu.F90
index 623fa35d8..d8b8fb80f 100644
--- a/cgyro/src/cgyro_nl_fftw.gpu.F90
+++ b/cgyro/src/cgyro_nl_fftw.gpu.F90
@@ -72,10 +72,10 @@ subroutine cgyro_nl_fftw(ij)
 
   call timer_lib_in('nl_mem')
   ! make sure reqs progress
-  call parallel_slib_test(f_req)
   call parallel_slib_test(g_req)
+  call parallel_slib_test(f_req)
 
-  ! we can zero the elements we know are zero while we wait
+  ! we can zero the elements we know are zero while we wait for comm
 #if defined(OMPGPU)
   !no async for OMPGPU for now
 !$omp target teams distribute parallel do simd collapse(3)
@@ -87,8 +87,8 @@ subroutine cgyro_nl_fftw(ij)
   do j=1,nsplit
      do ix=nx2,nx0-1
        do iy=0,ny2
-         fxmany(iy,ix,j) = 0
-         fymany(iy,ix,j) = 0
+         gxmany(iy,ix,j) = 0
+         gymany(iy,ix,j) = 0
        enddo
      enddo
   enddo
@@ -104,107 +104,109 @@ subroutine cgyro_nl_fftw(ij)
   do j=1,nsplit
      do ix=0,nx-1
        do iy=n_toroidal,ny2
-         fxmany(iy,ix,j) = 0
-         fymany(iy,ix,j) = 0
+         gxmany(iy,ix,j) = 0
+         gymany(iy,ix,j) = 0
        enddo
      enddo
   enddo
   call timer_lib_out('nl_mem')
 
-  ! time to wait for the F_nl to become avaialble
   call timer_lib_in('nl_comm')
-  call parallel_slib_f_nc_wait(fpack,f_nl,f_req)
-  ! make sure g_req progresses
-  call parallel_slib_test(g_req)
+  ! time to wait for the g_nl to become avaialble
+  call parallel_slib_f_fd_wait(n_field,n_radial,n_jtheta,gpack,g_nl,g_req)
+  ! make sure reqs progress
+  call parallel_slib_test(f_req)
   call timer_lib_out('nl_comm')
 
   call timer_lib_in('nl')
-#if !defined(OMPGPU)
-!$acc  data present(f_nl)  &
-!$acc&      present(fxmany,fymany,gxmany,gymany) &
-!$acc&      present(uxmany,uymany,vxmany,vymany) &
-!$acc&      present(uvmany)
-#endif
 
-! f_nl is (radial, nt_loc, theta, nv_loc1, toroidal_procs)
-! where nv_loc1 * toroidal_procs >= nv_loc
+! g_nl      is (n_field,n_radial,n_jtheta,nt_loc,n_toroidal_procs)
+! jcev_c_nl is (n_field,n_radial,n_jtheta,nv_loc,nt_loc,n_toroidal_procs)
+
 #if defined(OMPGPU)
   !no async for OMPGPU for now
 !$omp target teams distribute parallel do simd collapse(4) &
-!$omp&  private(j,ir,p,ix,itor,iy,f0,g0,itm,itl,irbase,irmax)
+!$omp&   private(j,ir,p,ix,itor,mytm,iy,g0,it,iv_loc,it_loc,jtheta_min,itm,itl,irbase,irmax)
 #else
 !$acc parallel loop gang vector independent collapse(4) async(2) &
-!$acc&         private(j,ir,p,ix,itor,iy,f0,g0,itm,itl,irbase,irmax)
+!$acc&         private(j,ir,p,ix,itor,mytm,iy,g0,it,iv_loc,it_loc,jtheta_min,itm,itl,irbase,irmax) &
+!$acc&         present(g_nl,jvec_c_nl) &
+!$acc&         present(nsplit,n_radial,n_toroidal_procs,nt_loc,nt1,n_theta,nv_loc,nx0)
 #endif
   do j=1,nsplit
-     do irbase=1,n_radial,1
+     do irbase=1,n_radial,tile_size
        do itm=1,n_toroidal_procs
          do itl=1,nt_loc
-           itor=itl + (itm-1)*nt_loc
+           itor = itl + (itm-1)*nt_loc
+           mytm = 1 + nt1/nt_loc !my toroidal proc number
+           it = 1+((mytm-1)*nsplit+j-1)/nv_loc
+           iv_loc = 1+modulo((mytm-1)*nsplit+j-1,nv_loc)
+           jtheta_min = 1+((mytm-1)*nsplit)/nv_loc
+           it_loc = it-jtheta_min+1
            iy = itor-1
-           ! process several ir to improve f_nl cache reuse
+
+           ! process several ir to improve jvec_nl and g_nl cache reuse
            irmax=min(irbase+(tile_size-1),n_radial)
            do ir=irbase,irmax
               p  = ir-1-nx0/2
               ix = p
               if (ix < 0) ix = ix+nx
 
-              f0 = i_c*f_nl(ir,itl,j,itm)
-              fxmany(iy,ix,j) = p*f0
-              fymany(iy,ix,j) = iy*f0
+              if (it > n_theta) then
+                 g0 = (0.0,0.0)
+              else
+                 g0 = i_c*sum( jvec_c_nl(1:n_field,ir,it_loc,iv_loc,itor)*g_nl(1:n_field,ir,it_loc,itor))
+              endif
+              gxmany(iy,ix,j) = p*g0
+              gymany(iy,ix,j) = iy*g0
            enddo
          enddo
        enddo
      enddo
   enddo
 
-  ! make sure g_req progresses
-  call parallel_slib_test(g_req)
-
 #if defined(OMPGPU)
   !no async for OMPGPU for now
 #else
+  ! make sure reqs progress
+  call parallel_slib_test(f_req)
 !$acc wait(2)
 #endif
 
+  ! make sure reqs progress
+  call parallel_slib_test(f_req)
+
   ! Average elements so as to ensure
-  !   f(kx,ky=0) = f(-kx,ky=0)^*
+  !   g(kx,ky=0) = g(-kx,ky=0)^*
   ! This symmetry is required for complex input to c2r
 #if defined(OMPGPU)
   !no async for OMPGPU for now
 !$omp target teams distribute parallel do simd collapse(2) &
-!$omp&  private(j,ix,f0)
+!$omp&   private(j,ix,g0)
 #else
 !$acc parallel loop gang vector independent collapse(2) async(2) &
-!$acc&         private(j,ix,f0) &
-!$acc&         present(fxmany) &
+!$acc&         private(j,ix,g0) &
+!$acc&         present(gxmany) &
 !$acc&         present(nsplit,nx)
 #endif
   do j=1,nsplit
     do ix=1,nx/2-1
-      f0 = 0.5*( fxmany(0,ix,j)+conjg(fxmany(0,nx-ix,j)) )
-      fxmany(0,ix   ,j) = f0
-      fxmany(0,nx-ix,j) = conjg(f0)
+      g0 = 0.5*( gxmany(0,ix,j)+conjg(gxmany(0,nx-ix,j)) )
+      gxmany(0,ix   ,j) = g0
+      gxmany(0,nx-ix,j) = conjg(g0)
     enddo
   enddo
 
-  ! make sure g_req progresses
-  call parallel_slib_test(g_req)
-
-     ! --------------------------------------
-     ! perform many Fourier Transforms at once
-     ! --------------------------------------
-
 #if defined(OMPGPU)
-!$omp target data use_device_ptr(fymany,uymany)
+!$omp target data use_device_ptr(gymany,vymany)
 #else
-!$acc  host_data use_device(fymany,uymany)
+!$acc  host_data use_device(gymany,vymany)
 #endif
 
 #ifdef HIPGPU
-  rc = hipfftExecZ2D(hip_plan_c2r_many,c_loc(fymany),c_loc(uymany))
+  rc = hipfftExecZ2D(hip_plan_c2r_many,c_loc(gymany),c_loc(vymany))
 #else
-  rc = cufftExecZ2D(cu_plan_c2r_many,fymany,uymany)
+  rc = cufftExecZ2D(cu_plan_c2r_many,gymany,vymany)
 #endif
 
 #if defined(OMPGPU)
@@ -213,41 +215,43 @@ subroutine cgyro_nl_fftw(ij)
 !$acc end host_data
 #endif
 
-  ! make sure g_req progresses
-  call parallel_slib_test(g_req)
-
 #if defined(OMPGPU)
   !no async for OMPGPU for now
 #else
+  ! make sure reqs progress
+  call parallel_slib_test(f_req)
 !$acc wait(2)
 #endif
-  ! fxmany is complete now
 
-  ! make sure g_req progresses
-  call parallel_slib_test(g_req)
+  ! make sure reqs progress
+  call parallel_slib_test(f_req)
+  ! gxmany is complete now
 
 #if defined(OMPGPU)
-!$omp target data use_device_ptr(fxmany,uxmany)
+!$omp target data use_device_ptr(gxmany,vxmany)
 #else
-!$acc  host_data use_device(fxmany,uxmany)
+!$acc  host_data use_device(gxmany,vxmany)
 #endif
 
 #ifdef HIPGPU
-  rc = hipfftExecZ2D(hip_plan_c2r_many,c_loc(fxmany),c_loc(uxmany))
+  rc = hipfftExecZ2D(hip_plan_c2r_many,c_loc(gxmany),c_loc(vxmany))
 #else
-  rc = cufftExecZ2D(cu_plan_c2r_many,fxmany,uxmany)
+  rc = cufftExecZ2D(cu_plan_c2r_many,gxmany,vxmany)
 #endif
 
 #if defined(OMPGPU)
 !$omp end target data
 #else
+  ! make sure reqs progress
+  call parallel_slib_test(f_req)
+!$acc wait
 !$acc end host_data
 #endif
 
-  ! make sure g_req progresses
-  call parallel_slib_test(g_req)
+  ! make sure reqs progress
+  call parallel_slib_test(f_req)
 
-  ! we can zero the elements we know are zero while we wait for comm
+  ! we can zero the elements we know are zero while we wait
 #if defined(OMPGPU)
   !no async for OMPGPU for now
 !$omp target teams distribute parallel do simd collapse(3)
@@ -259,8 +263,8 @@ subroutine cgyro_nl_fftw(ij)
   do j=1,nsplit
      do ix=nx2,nx0-1
        do iy=0,ny2
-         gxmany(iy,ix,j) = 0
-         gymany(iy,ix,j) = 0
+         fxmany(iy,ix,j) = 0
+         fymany(iy,ix,j) = 0
        enddo
      enddo
   enddo
@@ -276,59 +280,52 @@ subroutine cgyro_nl_fftw(ij)
   do j=1,nsplit
      do ix=0,nx-1
        do iy=n_toroidal,ny2
-         gxmany(iy,ix,j) = 0
-         gymany(iy,ix,j) = 0
+         fxmany(iy,ix,j) = 0
+         fymany(iy,ix,j) = 0
        enddo
      enddo
   enddo
   call timer_lib_out('nl')
 
-  ! time to wait for the g_nl to become avaialble
+  ! time to wait for the F_nl to become avaialble
   call timer_lib_in('nl_comm')
-  call parallel_slib_f_fd_wait(n_field,n_radial,n_jtheta,gpack,g_nl,g_req)
+  call parallel_slib_f_nc_wait(fpack,f_nl,f_req)
   call timer_lib_out('nl_comm')
 
   call timer_lib_in('nl')
+#if !defined(OMPGPU)
+!$acc  data present(f_nl)  &
+!$acc&      present(fxmany,fymany,gxmany,gymany) &
+!$acc&      present(uxmany,uymany,vxmany,vymany) &
+!$acc&      present(uvmany)
+#endif
 
-! g_nl      is (n_field,n_radial,n_jtheta,nt_loc,n_toroidal_procs)
-! jcev_c_nl is (n_field,n_radial,n_jtheta,nv_loc,nt_loc,n_toroidal_procs)
-
+! f_nl is (radial, nt_loc, theta, nv_loc1, toroidal_procs)
+! where nv_loc1 * toroidal_procs >= nv_loc
 #if defined(OMPGPU)
   !no async for OMPGPU for now
 !$omp target teams distribute parallel do simd collapse(4) &
-!$omp&   private(j,ir,p,ix,itor,mytm,iy,g0,it,iv_loc,it_loc,jtheta_min,itm,itl,irbase,irmax)
+!$omp&  private(j,ir,p,ix,itor,iy,f0,g0,itm,itl,irbase,irmax)
 #else
 !$acc parallel loop gang vector independent collapse(4) async(2) &
-!$acc&         private(j,ir,p,ix,itor,mytm,iy,g0,it,iv_loc,it_loc,jtheta_min,itm,itl,irbase,irmax) &
-!$acc&         present(g_nl,jvec_c_nl) &
-!$acc&         present(nsplit,n_radial,n_toroidal_procs,nt_loc,nt1,n_theta,nv_loc,nx0)
+!$acc&         private(j,ir,p,ix,itor,iy,f0,g0,itm,itl,irbase,irmax)
 #endif
   do j=1,nsplit
-     do irbase=1,n_radial,tile_size
+     do irbase=1,n_radial,1
        do itm=1,n_toroidal_procs
          do itl=1,nt_loc
-           itor = itl + (itm-1)*nt_loc
-           mytm = 1 + nt1/nt_loc !my toroidal proc number
-           it = 1+((mytm-1)*nsplit+j-1)/nv_loc
-           iv_loc = 1+modulo((mytm-1)*nsplit+j-1,nv_loc)
-           jtheta_min = 1+((mytm-1)*nsplit)/nv_loc
-           it_loc = it-jtheta_min+1
+           itor=itl + (itm-1)*nt_loc
            iy = itor-1
-
-           ! process several ir to improve jvec_nl and g_nl cache reuse
+           ! process several ir to improve f_nl cache reuse
            irmax=min(irbase+(tile_size-1),n_radial)
            do ir=irbase,irmax
               p  = ir-1-nx0/2
               ix = p
               if (ix < 0) ix = ix+nx
 
-              if (it > n_theta) then
-                 g0 = (0.0,0.0)
-              else
-                 g0 = i_c*sum( jvec_c_nl(1:n_field,ir,it_loc,iv_loc,itor)*g_nl(1:n_field,ir,it_loc,itor))
-              endif
-              gxmany(iy,ix,j) = p*g0
-              gymany(iy,ix,j) = iy*g0
+              f0 = i_c*f_nl(ir,itl,j,itm)
+              fxmany(iy,ix,j) = p*f0
+              fymany(iy,ix,j) = iy*f0
            enddo
          enddo
        enddo
@@ -342,36 +339,40 @@ subroutine cgyro_nl_fftw(ij)
 #endif
 
   ! Average elements so as to ensure
-  !   g(kx,ky=0) = g(-kx,ky=0)^*
+  !   f(kx,ky=0) = f(-kx,ky=0)^*
   ! This symmetry is required for complex input to c2r
 #if defined(OMPGPU)
   !no async for OMPGPU for now
 !$omp target teams distribute parallel do simd collapse(2) &
-!$omp&   private(j,ix,g0)
+!$omp&  private(j,ix,f0)
 #else
 !$acc parallel loop gang vector independent collapse(2) async(2) &
-!$acc&         private(j,ix,g0) &
-!$acc&         present(gxmany) &
+!$acc&         private(j,ix,f0) &
+!$acc&         present(fxmany) &
 !$acc&         present(nsplit,nx)
 #endif
   do j=1,nsplit
     do ix=1,nx/2-1
-      g0 = 0.5*( gxmany(0,ix,j)+conjg(gxmany(0,nx-ix,j)) )
-      gxmany(0,ix   ,j) = g0
-      gxmany(0,nx-ix,j) = conjg(g0)
+      f0 = 0.5*( fxmany(0,ix,j)+conjg(fxmany(0,nx-ix,j)) )
+      fxmany(0,ix   ,j) = f0
+      fxmany(0,nx-ix,j) = conjg(f0)
     enddo
   enddo
 
+     ! --------------------------------------
+     ! perform many Fourier Transforms at once
+     ! --------------------------------------
+
 #if defined(OMPGPU)
-!$omp target data use_device_ptr(gymany,vymany)
+!$omp target data use_device_ptr(fymany,uymany)
 #else
-!$acc  host_data use_device(gymany,vymany)
+!$acc  host_data use_device(fymany,uymany)
 #endif
 
 #ifdef HIPGPU
-  rc = hipfftExecZ2D(hip_plan_c2r_many,c_loc(gymany),c_loc(vymany))
+  rc = hipfftExecZ2D(hip_plan_c2r_many,c_loc(fymany),c_loc(uymany))
 #else
-  rc = cufftExecZ2D(cu_plan_c2r_many,gymany,vymany)
+  rc = cufftExecZ2D(cu_plan_c2r_many,fymany,uymany)
 #endif
 
 #if defined(OMPGPU)
@@ -385,25 +386,23 @@ subroutine cgyro_nl_fftw(ij)
 #else
 !$acc wait(2)
 #endif
-
-  ! gxmany is complete now
+  ! fxmany is complete now
 
 #if defined(OMPGPU)
-!$omp target data use_device_ptr(gxmany,vxmany)
+!$omp target data use_device_ptr(fxmany,uxmany)
 #else
-!$acc  host_data use_device(gxmany,vxmany)
+!$acc  host_data use_device(fxmany,uxmany)
 #endif
 
 #ifdef HIPGPU
-  rc = hipfftExecZ2D(hip_plan_c2r_many,c_loc(gxmany),c_loc(vxmany))
+  rc = hipfftExecZ2D(hip_plan_c2r_many,c_loc(fxmany),c_loc(uxmany))
 #else
-  rc = cufftExecZ2D(cu_plan_c2r_many,gxmany,vxmany)
+  rc = cufftExecZ2D(cu_plan_c2r_many,fxmany,uxmany)
 #endif
 
 #if defined(OMPGPU)
 !$omp end target data
 #else
-!$acc wait
 !$acc end host_data
 #endif
 
diff --git a/cgyro/src/cgyro_rhs.f90 b/cgyro/src/cgyro_rhs.f90
index a32e07e43..2183ba5d2 100644
--- a/cgyro/src/cgyro_rhs.f90
+++ b/cgyro/src/cgyro_rhs.f90
@@ -52,9 +52,6 @@ subroutine cgyro_rhs(ij)
   complex, dimension(:), allocatable   :: bvec_trap
   integer :: nj_loc
 
-  ! h_x is not modified after this and before nl_fftw
-  call cgyro_rhs_comm_async(1)
-
   ! Prepare suitable distribution (g, not h) for conservative upwind method
   if (n_field > 1) then
      call timer_lib_in('str')
@@ -74,17 +71,23 @@ subroutine cgyro_rhs(ij)
   else
      call timer_lib_in('str_mem')
 
-     g_x(:,:,:) = h_x(:,:,:)
+!$omp parallel do collapse(2) private(iv_loc)
+     do itor=nt1,nt2
+      do iv=nv1,nv2
+        iv_loc = iv-nv1+1
+        g_x(:,iv_loc,itor) = h_x(:,iv_loc,itor)
+      enddo
+     enddo
 
      call timer_lib_out('str_mem')
   endif
-  call cgyro_rhs_comm_test(1)
 
   ! Correct g_x for number conservation
   call cgyro_upwind
 
-  call cgyro_rhs_comm_test(1)
   call cgyro_rhs_comm_async(2)
+  ! comm2 is much smaller, so get that out of the way first
+  call cgyro_rhs_comm_async(1)
 
   call timer_lib_in('str')
 
@@ -120,6 +123,7 @@ subroutine cgyro_rhs(ij)
      enddo
    enddo
   enddo
+  call cgyro_rhs_comm_test(2)
   call cgyro_rhs_comm_test(1)
 
   ! Explicit trapping term
@@ -185,6 +189,7 @@ subroutine cgyro_rhs(ij)
   endif
   
   call timer_lib_out('str')
+  call cgyro_rhs_comm_test(2)
   call cgyro_rhs_comm_test(1)
 
   ! Wavenumber advection shear terms
diff --git a/cgyro/src/cgyro_rhs.gpu.F90 b/cgyro/src/cgyro_rhs.gpu.F90
index 1cb08e49f..4f51ed876 100644
--- a/cgyro/src/cgyro_rhs.gpu.F90
+++ b/cgyro/src/cgyro_rhs.gpu.F90
@@ -49,9 +49,6 @@ subroutine cgyro_rhs(ij)
   real :: rval,rval2
   complex :: rhs_stream,rhs_el
 
-  ! h_x is not modified after this and before nl_fftw
-  call cgyro_rhs_comm_async(1)
-
   call timer_lib_in('str_mem')
 
 #if (!defined(OMPGPU)) && defined(_OPENACC)
@@ -89,10 +86,8 @@ subroutine cgyro_rhs(ij)
      enddo
 
 #if (!defined(OMPGPU)) && defined(_OPENACC)
-     call cgyro_rhs_comm_test(1)
 !$acc wait(1)
 #endif
-     call cgyro_rhs_comm_test(1)
 
      call timer_lib_out('str')
   else
@@ -118,18 +113,18 @@ subroutine cgyro_rhs(ij)
      enddo
 
 #if (!defined(OMPGPU)) && defined(_OPENACC)
-     call cgyro_rhs_comm_test(1)
 !$acc wait(1)
 #endif
-     call cgyro_rhs_comm_test(1)
 
      call timer_lib_out('str_mem')
   endif
 
   call cgyro_upwind
 
-  call cgyro_rhs_comm_test(1)
   call cgyro_rhs_comm_async(2)
+  ! comm2 is much smaller, so get that out of the way first
+  call cgyro_rhs_comm_async(1)
+
 
 #if (!defined(OMPGPU)) && defined(_OPENACC)
   call timer_lib_in('str_mem')
@@ -184,11 +179,11 @@ subroutine cgyro_rhs(ij)
   enddo
 
 #if (!defined(OMPGPU)) && defined(_OPENACC)
-  call cgyro_rhs_comm_test(1)
+  call cgyro_rhs_comm_test(2)
 !$acc wait(1)
 #endif
-  call cgyro_rhs_comm_test(1)
   call cgyro_rhs_comm_test(2)
+  call cgyro_rhs_comm_test(1)
 
   call timer_lib_out('str')