diff --git a/cgyro/src/cgyro_cleanup.F90 b/cgyro/src/cgyro_cleanup.F90 index 716a01cc7..1cdaa2e74 100644 --- a/cgyro/src/cgyro_cleanup.F90 +++ b/cgyro/src/cgyro_cleanup.F90 @@ -283,10 +283,10 @@ subroutine cgyro_cleanup if(allocated(gx)) deallocate(gx) if(allocated(fy)) deallocate(fy) if(allocated(gy)) deallocate(gy) - if(allocated(uxmany)) deallocate(uxmany) - if(allocated(uymany)) deallocate(uymany) - if(allocated(vx)) deallocate(vx) - if(allocated(vy)) deallocate(vy) + if(allocated(vxmany)) deallocate(vxmany) + if(allocated(vymany)) deallocate(vymany) + if(allocated(ux)) deallocate(ux) + if(allocated(uy)) deallocate(uy) if(allocated(uv)) deallocate(uv) #endif diff --git a/cgyro/src/cgyro_globals.F90 b/cgyro/src/cgyro_globals.F90 index 76a378b2e..eb248901d 100644 --- a/cgyro/src/cgyro_globals.F90 +++ b/cgyro/src/cgyro_globals.F90 @@ -422,10 +422,10 @@ module cgyro_globals ! ! 2D FFT work arrays #ifndef CGYRO_GPU_FFT - real, dimension(:,:,:), allocatable :: uxmany - real, dimension(:,:,:), allocatable :: uymany - real, dimension(:,:,:), allocatable :: vx - real, dimension(:,:,:), allocatable :: vy + real, dimension(:,:,:), allocatable :: vxmany + real, dimension(:,:,:), allocatable :: vymany + real, dimension(:,:,:), allocatable :: ux + real, dimension(:,:,:), allocatable :: uy real, dimension(:,:,:), allocatable :: uv complex, dimension(:,:,:),allocatable :: fx complex, dimension(:,:,:),allocatable :: fy diff --git a/cgyro/src/cgyro_init_manager.F90 b/cgyro/src/cgyro_init_manager.F90 index c50fec0d2..7a2499952 100644 --- a/cgyro/src/cgyro_init_manager.F90 +++ b/cgyro/src/cgyro_init_manager.F90 @@ -432,14 +432,14 @@ subroutine cgyro_init_manager allocate(fy(0:ny2,0:nx-1,n_omp)) allocate(gy(0:ny2,0:nx-1,n_omp)) - allocate(uxmany(0:ny-1,0:nx-1,nsplit)) - allocate(uymany(0:ny-1,0:nx-1,nsplit)) - allocate(vx(0:ny-1,0:nx-1,n_omp)) - allocate(vy(0:ny-1,0:nx-1,n_omp)) + allocate(vxmany(0:ny-1,0:nx-1,nsplit)) + allocate(vymany(0:ny-1,0:nx-1,nsplit)) + allocate(ux(0:ny-1,0:nx-1,n_omp)) + allocate(uy(0:ny-1,0:nx-1,n_omp)) allocate(uv(0:ny-1,0:nx-1,n_omp)) ! Create plans once and for all, with global arrays fx,ux - plan_c2r = fftw_plan_dft_c2r_2d(nx,ny,gx(:,:,1),vx(:,:,1),FFTW_PATIENT) + plan_c2r = fftw_plan_dft_c2r_2d(nx,ny,fx(:,:,1),ux(:,:,1),FFTW_PATIENT) plan_r2c = fftw_plan_dft_r2c_2d(nx,ny,uv(:,:,1),fx(:,:,1),FFTW_PATIENT) #endif diff --git a/cgyro/src/cgyro_nl_fftw.f90 b/cgyro/src/cgyro_nl_fftw.f90 index 6d0388d23..f8a9acd76 100644 --- a/cgyro/src/cgyro_nl_fftw.f90 +++ b/cgyro/src/cgyro_nl_fftw.f90 @@ -26,7 +26,7 @@ subroutine cgyro_nl_fftw_stepr(j, i_omp) ! Poisson bracket in real space - uv(:,:,i_omp) = (uxmany(:,:,j)*vy(:,:,i_omp)-uymany(:,:,j)*vx(:,:,i_omp))/(nx*ny) + uv(:,:,i_omp) = (ux(:,:,i_omp)*vymany(:,:,j)-uy(:,:,i_omp)*vxmany(:,:,j))/(nx*ny) call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp)) @@ -76,36 +76,35 @@ subroutine cgyro_nl_fftw(ij) include 'fftw3.f03' - ! no staggering, will need all threads for FFTW in one pass force_early_comm2 = (n_omp>=(4*nsplit)) one_pass_fft = force_early_comm2 - ! time to wait for the F_nl to become avaialble + ! time to wait for the g_nl to become avaialble call timer_lib_in('nl_comm') - call parallel_slib_f_nc_wait(fpack,f_nl,f_req) - ! make sure g_req progresses - call parallel_slib_test(g_req) - call timer_lib_out('nl_comm') - + call parallel_slib_f_fd_wait(n_field,n_radial,n_jtheta,gpack,g_nl,g_req) if (force_early_comm2) then - ! time to wait for the g_nl to become avaialble - call timer_lib_in('nl_comm') - call parallel_slib_f_fd_wait(n_field,n_radial,n_jtheta,gpack,g_nl,g_req) - call timer_lib_out('nl_comm') + ! wait also for the f_nl to become avaialble + call parallel_slib_f_nc_wait(fpack,f_nl,f_req) + else + ! make sure f_req progresses + call parallel_slib_test(f_req) endif + call timer_lib_out('nl_comm') + call timer_lib_in('nl') -! f_nl is (radial, nt_loc, theta, nv_loc1, toroidal_procs) -! where nv_loc1 * toroidal_procs >= nv_loc - if (n_omp<=nsplit) then -!$omp parallel do private(itm,itl,itor,iy,ir,p,ix,f0,i_omp,j) +! g_nl is (n_field,n_radial,n_jtheta,nt_loc,n_toroidal_procs) +! jcev_c_nl is (n_field,n_radial,n_jtheta,nv_loc,nt_loc,n_toroidal_procs) + if (n_omp <= nsplit) then +!$omp parallel do schedule(dynamic,1) & +!$omp& private(itor,mytm,itm,itl,iy,ir,p,ix,g0,i_omp,j,it,iv_loc,it_loc,jtheta_min) do j=1,nsplit i_omp = omp_get_thread_num()+1 ! zero elements not otherwise set below - fx(0:ny2,nx2:nx0-1,i_omp) = 0.0 - fy(0:ny2,nx2:nx0-1,i_omp) = 0.0 + gx(0:ny2,nx2:nx0-1,i_omp) = 0.0 + gy(0:ny2,nx2:nx0-1,i_omp) = 0.0 ! Array mapping do ir=1,n_radial @@ -114,27 +113,48 @@ subroutine cgyro_nl_fftw(ij) if (ix < 0) ix = ix+nx do itm=1,n_toroidal_procs do itl=1,nt_loc - itor=itl + (itm-1)*nt_loc + itor = itl + (itm-1)*nt_loc + mytm = 1 + nt1/nt_loc !my toroidal proc number + it = 1+((mytm-1)*nsplit+j-1)/nv_loc + iv_loc = 1+modulo((mytm-1)*nsplit+j-1,nv_loc) + jtheta_min = 1+((mytm-1)*nsplit)/nv_loc + it_loc = it-jtheta_min+1 + iy = itor-1 - f0 = i_c*f_nl(ir,itl,j,itm) - fx(iy,ix,i_omp) = p*f0 - fy(iy,ix,i_omp) = iy*f0 + if (it > n_theta) then + g0 = (0.0,0.0) + else + g0 = i_c*sum( jvec_c_nl(:,ir,it_loc,iv_loc,itor)*g_nl(:,ir,it_loc,itor)) + endif + gx(iy,ix,i_omp) = p*g0 + gy(iy,ix,i_omp) = iy*g0 enddo enddo if ((ix/=0) .and. (ix<(nx/2))) then ! happens after ix>nx/2 - ! Average elements so as to ensure - ! f(kx,ky=0) = f(-kx,ky=0)^* - ! This symmetry is required for complex input to c2r - f0 = 0.5*( fx(0,ix,i_omp)+conjg(fx(0,nx-ix,i_omp)) ) - fx(0,ix ,i_omp) = f0 - fx(0,nx-ix,i_omp) = conjg(f0) + ! Average elements so as to ensure + ! g(kx,ky=0) = g(-kx,ky=0)^* + ! This symmetry is required for complex input to c2r + g0 = 0.5*( gx(0,ix,i_omp)+conjg(gx(0,nx-ix,i_omp)) ) + gx(0,ix ,i_omp) = g0 + gx(0,nx-ix,i_omp) = conjg(g0) endif - fx(n_toroidal:ny2,ix,i_omp) = 0.0 - fy(n_toroidal:ny2,ix,i_omp) = 0.0 + gx(n_toroidal:ny2,ix,i_omp) = 0.0 + gy(n_toroidal:ny2,ix,i_omp) = 0.0 enddo - call fftw_execute_dft_c2r(plan_c2r,fx(:,:,i_omp),uxmany(:,:,j)) - call fftw_execute_dft_c2r(plan_c2r,fy(:,:,i_omp),uymany(:,:,j)) + if (i_omp==1) then + ! use the main thread to progress the async MPI + call parallel_slib_test(f_req) + endif + + call fftw_execute_dft_c2r(plan_c2r,gx(:,:,i_omp),vxmany(:,:,j)) + + if (i_omp==1) then + ! use the main thread to progress the async MPI + call parallel_slib_test(f_req) + endif + + call fftw_execute_dft_c2r(plan_c2r,gy(:,:,i_omp),vymany(:,:,j)) enddo ! j else ! (n_omp>nsplit), increase parallelism num_one_pass = 2 @@ -148,58 +168,6 @@ subroutine cgyro_nl_fftw(ij) select case(o) case (1) - ! zero elements not otherwise set below - fx(0:ny2,nx2:nx0-1,i_omp) = 0.0 - - ! Array mapping - do ir=1,n_radial - p = ir-1-nx0/2 - ix = p - if (ix < 0) ix = ix+nx - do itm=1,n_toroidal_procs - do itl=1,nt_loc - itor=itl + (itm-1)*nt_loc - iy = itor-1 - f0 = i_c*f_nl(ir,itl,j,itm) - fx(iy,ix,i_omp) = p*f0 - enddo - enddo - if ((ix/=0) .and. (ix<(nx/2))) then ! happens after ix>nx/2 - ! Average elements so as to ensure - ! f(kx,ky=0) = f(-kx,ky=0)^* - ! This symmetry is required for complex input to c2r - f0 = 0.5*( fx(0,ix,i_omp)+conjg(fx(0,nx-ix,i_omp)) ) - fx(0,ix ,i_omp) = f0 - fx(0,nx-ix,i_omp) = conjg(f0) - endif - fx(n_toroidal:ny2,ix,i_omp) = 0.0 - enddo - - call fftw_execute_dft_c2r(plan_c2r,fx(:,:,i_omp),uxmany(:,:,j)) - - case (2) - ! zero elements not otherwise set below - fy(0:ny2,nx2:nx0-1,i_omp) = 0.0 - - ! Array mapping - do ir=1,n_radial - p = ir-1-nx0/2 - ix = p - if (ix < 0) ix = ix+nx - do itm=1,n_toroidal_procs - do itl=1,nt_loc - itor=itl + (itm-1)*nt_loc - iy = itor-1 - f0 = i_c*f_nl(ir,itl,j,itm) - fy(iy,ix,i_omp) = iy*f0 - enddo - enddo - fy(n_toroidal:ny2,ix,i_omp) = 0.0 - enddo - - call fftw_execute_dft_c2r(plan_c2r,fy(:,:,i_omp),uymany(:,:,j)) - - case (3) ! zero elements not otherwise set below gx(0:ny2,nx2:nx0-1,i_omp) = 0.0 @@ -237,9 +205,9 @@ subroutine cgyro_nl_fftw(ij) gx(n_toroidal:ny2,ix,i_omp) = 0.0 enddo - call fftw_execute_dft_c2r(plan_c2r,gx(:,:,i_omp),vx(:,:,i_omp)) + call fftw_execute_dft_c2r(plan_c2r,gx(:,:,i_omp),vxmany(:,:,j)) - case (4) + case (2) ! zero elements not otherwise set below gy(0:ny2,nx2:nx0-1,i_omp) = 0.0 @@ -269,7 +237,59 @@ subroutine cgyro_nl_fftw(ij) gy(n_toroidal:ny2,ix,i_omp) = 0.0 enddo - call fftw_execute_dft_c2r(plan_c2r,gy(:,:,i_omp),vy(:,:,i_omp)) + call fftw_execute_dft_c2r(plan_c2r,gy(:,:,i_omp),vymany(:,:,j)) + + case (3) + ! zero elements not otherwise set below + fx(0:ny2,nx2:nx0-1,i_omp) = 0.0 + + ! Array mapping + do ir=1,n_radial + p = ir-1-nx0/2 + ix = p + if (ix < 0) ix = ix+nx + do itm=1,n_toroidal_procs + do itl=1,nt_loc + itor=itl + (itm-1)*nt_loc + iy = itor-1 + f0 = i_c*f_nl(ir,itl,j,itm) + fx(iy,ix,i_omp) = p*f0 + enddo + enddo + if ((ix/=0) .and. (ix<(nx/2))) then ! happens after ix>nx/2 + ! Average elements so as to ensure + ! f(kx,ky=0) = f(-kx,ky=0)^* + ! This symmetry is required for complex input to c2r + f0 = 0.5*( fx(0,ix,i_omp)+conjg(fx(0,nx-ix,i_omp)) ) + fx(0,ix ,i_omp) = f0 + fx(0,nx-ix,i_omp) = conjg(f0) + endif + fx(n_toroidal:ny2,ix,i_omp) = 0.0 + enddo + + call fftw_execute_dft_c2r(plan_c2r,fx(:,:,i_omp),ux(:,:,i_omp)) + + case (4) + ! zero elements not otherwise set below + fy(0:ny2,nx2:nx0-1,i_omp) = 0.0 + + ! Array mapping + do ir=1,n_radial + p = ir-1-nx0/2 + ix = p + if (ix < 0) ix = ix+nx + do itm=1,n_toroidal_procs + do itl=1,nt_loc + itor=itl + (itm-1)*nt_loc + iy = itor-1 + f0 = i_c*f_nl(ir,itl,j,itm) + fy(iy,ix,i_omp) = iy*f0 + enddo + enddo + fy(n_toroidal:ny2,ix,i_omp) = 0.0 + enddo + + call fftw_execute_dft_c2r(plan_c2r,fy(:,:,i_omp),uy(:,:,i_omp)) end select enddo ! o @@ -277,80 +297,68 @@ subroutine cgyro_nl_fftw(ij) endif call timer_lib_out('nl') - - ! stagger comm2, to load ballance network traffic + if (.not. force_early_comm2) then - ! time to wait for the g_nl to become avaialble + ! time to wait for the f_nl to become avaialble call timer_lib_in('nl_comm') - call parallel_slib_f_fd_wait(n_field,n_radial,n_jtheta,gpack,g_nl,g_req) + call parallel_slib_f_nc_wait(fpack,f_nl,f_req) call timer_lib_out('nl_comm') endif call timer_lib_in('nl') -! g_nl is (n_field,n_radial,n_jtheta,nt_loc,n_toroidal_procs) -! jcev_c_nl is (n_field,n_radial,n_jtheta,nv_loc,nt_loc,n_toroidal_procs) +! f_nl is (radial, nt_loc, theta, nv_loc1, toroidal_procs) +! where nv_loc1 * toroidal_procs >= nv_loc if (n_omp <= nsplit) then -!$omp parallel do private(itor,mytm,itm,itl,iy,ir,p,ix,g0,i_omp,j,it,iv_loc,it_loc,jtheta_min) +!$omp parallel do private(itm,itl,itor,iy,ir,p,ix,f0,i_omp,j) do j=1,nsplit i_omp = omp_get_thread_num()+1 ! zero elements not otherwise set below - gx(0:ny2,nx2:nx0-1,i_omp) = 0.0 - gy(0:ny2,nx2:nx0-1,i_omp) = 0.0 + fx(0:ny2,nx2:nx0-1,i_omp) = 0.0 + fy(0:ny2,nx2:nx0-1,i_omp) = 0.0 ! Array mapping do ir=1,n_radial p = ir-1-nx0/2 ix = p - if (ix < 0) ix = ix+nx + if (ix < 0) ix = ix+nx do itm=1,n_toroidal_procs do itl=1,nt_loc - itor = itl + (itm-1)*nt_loc - mytm = 1 + nt1/nt_loc !my toroidal proc number - it = 1+((mytm-1)*nsplit+j-1)/nv_loc - iv_loc = 1+modulo((mytm-1)*nsplit+j-1,nv_loc) - jtheta_min = 1+((mytm-1)*nsplit)/nv_loc - it_loc = it-jtheta_min+1 - + itor=itl + (itm-1)*nt_loc iy = itor-1 - if (it > n_theta) then - g0 = (0.0,0.0) - else - g0 = i_c*sum( jvec_c_nl(:,ir,it_loc,iv_loc,itor)*g_nl(:,ir,it_loc,itor)) - endif - gx(iy,ix,i_omp) = p*g0 - gy(iy,ix,i_omp) = iy*g0 + f0 = i_c*f_nl(ir,itl,j,itm) + fx(iy,ix,i_omp) = p*f0 + fy(iy,ix,i_omp) = iy*f0 enddo enddo if ((ix/=0) .and. (ix<(nx/2))) then ! happens after ix>nx/2 - ! Average elements so as to ensure - ! g(kx,ky=0) = g(-kx,ky=0)^* - ! This symmetry is required for complex input to c2r - g0 = 0.5*( gx(0,ix,i_omp)+conjg(gx(0,nx-ix,i_omp)) ) - gx(0,ix ,i_omp) = g0 - gx(0,nx-ix,i_omp) = conjg(g0) + ! Average elements so as to ensure + ! f(kx,ky=0) = f(-kx,ky=0)^* + ! This symmetry is required for complex input to c2r + f0 = 0.5*( fx(0,ix,i_omp)+conjg(fx(0,nx-ix,i_omp)) ) + fx(0,ix ,i_omp) = f0 + fx(0,nx-ix,i_omp) = conjg(f0) endif - gx(n_toroidal:ny2,ix,i_omp) = 0.0 - gy(n_toroidal:ny2,ix,i_omp) = 0.0 + fx(n_toroidal:ny2,ix,i_omp) = 0.0 + fy(n_toroidal:ny2,ix,i_omp) = 0.0 enddo - call fftw_execute_dft_c2r(plan_c2r,gx(:,:,i_omp),vx(:,:,i_omp)) - call fftw_execute_dft_c2r(plan_c2r,gy(:,:,i_omp),vy(:,:,i_omp)) + call fftw_execute_dft_c2r(plan_c2r,fx(:,:,i_omp),ux(:,:,i_omp)) + call fftw_execute_dft_c2r(plan_c2r,fy(:,:,i_omp),uy(:,:,i_omp)) call cgyro_nl_fftw_stepr(j, i_omp) enddo ! j else ! n_omp>nsplit if (.not. one_pass_fft) then -!$omp parallel do collapse(2) & -!$omp& private(itm,itl,itor,mytm,iy,ir,p,ix,g0,i_omp,it,iv_loc,it_loc,jtheta_min) +!$omp parallel do collapse(2) private(itl,itm,itor,mytm,iy,ir,p,ix,f0,g0,i_omp,j,o,it,iv_loc,it_loc,jtheta_min) do j=1,nsplit do o=1,2 i_omp = j ! j n_theta) then - g0 = (0.0,0.0) - else - g0 = i_c*sum( jvec_c_nl(:,ir,it_loc,iv_loc,itor)*g_nl(:,ir,it_loc,itor)) - endif - gx(iy,ix,i_omp) = p*g0 + f0 = i_c*f_nl(ir,itl,j,itm) + fx(iy,ix,i_omp) = p*f0 enddo enddo if ((ix/=0) .and. (ix<(nx/2))) then ! happens after ix>nx/2 ! Average elements so as to ensure - ! g(kx,ky=0) = g(-kx,ky=0)^* + ! f(kx,ky=0) = f(-kx,ky=0)^* ! This symmetry is required for complex input to c2r - g0 = 0.5*( gx(0,ix,i_omp)+conjg(gx(0,nx-ix,i_omp)) ) - gx(0,ix ,i_omp) = g0 - gx(0,nx-ix,i_omp) = conjg(g0) + f0 = 0.5*( fx(0,ix,i_omp)+conjg(fx(0,nx-ix,i_omp)) ) + fx(0,ix ,i_omp) = f0 + fx(0,nx-ix,i_omp) = conjg(f0) endif - gx(n_toroidal:ny2,ix,i_omp) = 0.0 + fx(n_toroidal:ny2,ix,i_omp) = 0.0 enddo - call fftw_execute_dft_c2r(plan_c2r,gx(:,:,i_omp),vx(:,:,i_omp)) + call fftw_execute_dft_c2r(plan_c2r,fx(:,:,i_omp),ux(:,:,i_omp)) + else ! zero elements not otherwise set below - gy(0:ny2,nx2:nx0-1,i_omp) = 0.0 + fy(0:ny2,nx2:nx0-1,i_omp) = 0.0 ! Array mapping do ir=1,n_radial @@ -399,25 +398,16 @@ subroutine cgyro_nl_fftw(ij) do itm=1,n_toroidal_procs do itl=1,nt_loc itor = itl + (itm-1)*nt_loc - mytm = 1 + nt1/nt_loc !my toroidal proc number - it = 1+((mytm-1)*nsplit+j-1)/nv_loc - iv_loc = 1+modulo((mytm-1)*nsplit+j-1,nv_loc) - jtheta_min = 1+((mytm-1)*nsplit)/nv_loc - it_loc = it-jtheta_min+1 - iy = itor-1 - if (it > n_theta) then - g0 = (0.0,0.0) - else - g0 = i_c*sum( jvec_c_nl(:,ir,it_loc,iv_loc,itor)*g_nl(:,ir,it_loc,itor)) - endif - gy(iy,ix,i_omp) = iy*g0 + f0 = i_c*f_nl(ir,itl,j,itm) + fy(iy,ix,i_omp) = iy*f0 enddo enddo - gy(n_toroidal:ny2,ix,i_omp) = 0.0 + fy(n_toroidal:ny2,ix,i_omp) = 0.0 enddo - call fftw_execute_dft_c2r(plan_c2r,gy(:,:,i_omp),vy(:,:,i_omp)) + call fftw_execute_dft_c2r(plan_c2r,fy(:,:,i_omp),uy(:,:,i_omp)) + endif enddo ! o enddo ! j diff --git a/cgyro/src/cgyro_nl_fftw.gpu.F90 b/cgyro/src/cgyro_nl_fftw.gpu.F90 index 623fa35d8..d8b8fb80f 100644 --- a/cgyro/src/cgyro_nl_fftw.gpu.F90 +++ b/cgyro/src/cgyro_nl_fftw.gpu.F90 @@ -72,10 +72,10 @@ subroutine cgyro_nl_fftw(ij) call timer_lib_in('nl_mem') ! make sure reqs progress - call parallel_slib_test(f_req) call parallel_slib_test(g_req) + call parallel_slib_test(f_req) - ! we can zero the elements we know are zero while we wait + ! we can zero the elements we know are zero while we wait for comm #if defined(OMPGPU) !no async for OMPGPU for now !$omp target teams distribute parallel do simd collapse(3) @@ -87,8 +87,8 @@ subroutine cgyro_nl_fftw(ij) do j=1,nsplit do ix=nx2,nx0-1 do iy=0,ny2 - fxmany(iy,ix,j) = 0 - fymany(iy,ix,j) = 0 + gxmany(iy,ix,j) = 0 + gymany(iy,ix,j) = 0 enddo enddo enddo @@ -104,107 +104,109 @@ subroutine cgyro_nl_fftw(ij) do j=1,nsplit do ix=0,nx-1 do iy=n_toroidal,ny2 - fxmany(iy,ix,j) = 0 - fymany(iy,ix,j) = 0 + gxmany(iy,ix,j) = 0 + gymany(iy,ix,j) = 0 enddo enddo enddo call timer_lib_out('nl_mem') - ! time to wait for the F_nl to become avaialble call timer_lib_in('nl_comm') - call parallel_slib_f_nc_wait(fpack,f_nl,f_req) - ! make sure g_req progresses - call parallel_slib_test(g_req) + ! time to wait for the g_nl to become avaialble + call parallel_slib_f_fd_wait(n_field,n_radial,n_jtheta,gpack,g_nl,g_req) + ! make sure reqs progress + call parallel_slib_test(f_req) call timer_lib_out('nl_comm') call timer_lib_in('nl') -#if !defined(OMPGPU) -!$acc data present(f_nl) & -!$acc& present(fxmany,fymany,gxmany,gymany) & -!$acc& present(uxmany,uymany,vxmany,vymany) & -!$acc& present(uvmany) -#endif -! f_nl is (radial, nt_loc, theta, nv_loc1, toroidal_procs) -! where nv_loc1 * toroidal_procs >= nv_loc +! g_nl is (n_field,n_radial,n_jtheta,nt_loc,n_toroidal_procs) +! jcev_c_nl is (n_field,n_radial,n_jtheta,nv_loc,nt_loc,n_toroidal_procs) + #if defined(OMPGPU) !no async for OMPGPU for now !$omp target teams distribute parallel do simd collapse(4) & -!$omp& private(j,ir,p,ix,itor,iy,f0,g0,itm,itl,irbase,irmax) +!$omp& private(j,ir,p,ix,itor,mytm,iy,g0,it,iv_loc,it_loc,jtheta_min,itm,itl,irbase,irmax) #else !$acc parallel loop gang vector independent collapse(4) async(2) & -!$acc& private(j,ir,p,ix,itor,iy,f0,g0,itm,itl,irbase,irmax) +!$acc& private(j,ir,p,ix,itor,mytm,iy,g0,it,iv_loc,it_loc,jtheta_min,itm,itl,irbase,irmax) & +!$acc& present(g_nl,jvec_c_nl) & +!$acc& present(nsplit,n_radial,n_toroidal_procs,nt_loc,nt1,n_theta,nv_loc,nx0) #endif do j=1,nsplit - do irbase=1,n_radial,1 + do irbase=1,n_radial,tile_size do itm=1,n_toroidal_procs do itl=1,nt_loc - itor=itl + (itm-1)*nt_loc + itor = itl + (itm-1)*nt_loc + mytm = 1 + nt1/nt_loc !my toroidal proc number + it = 1+((mytm-1)*nsplit+j-1)/nv_loc + iv_loc = 1+modulo((mytm-1)*nsplit+j-1,nv_loc) + jtheta_min = 1+((mytm-1)*nsplit)/nv_loc + it_loc = it-jtheta_min+1 iy = itor-1 - ! process several ir to improve f_nl cache reuse + + ! process several ir to improve jvec_nl and g_nl cache reuse irmax=min(irbase+(tile_size-1),n_radial) do ir=irbase,irmax p = ir-1-nx0/2 ix = p if (ix < 0) ix = ix+nx - f0 = i_c*f_nl(ir,itl,j,itm) - fxmany(iy,ix,j) = p*f0 - fymany(iy,ix,j) = iy*f0 + if (it > n_theta) then + g0 = (0.0,0.0) + else + g0 = i_c*sum( jvec_c_nl(1:n_field,ir,it_loc,iv_loc,itor)*g_nl(1:n_field,ir,it_loc,itor)) + endif + gxmany(iy,ix,j) = p*g0 + gymany(iy,ix,j) = iy*g0 enddo enddo enddo enddo enddo - ! make sure g_req progresses - call parallel_slib_test(g_req) - #if defined(OMPGPU) !no async for OMPGPU for now #else + ! make sure reqs progress + call parallel_slib_test(f_req) !$acc wait(2) #endif + ! make sure reqs progress + call parallel_slib_test(f_req) + ! Average elements so as to ensure - ! f(kx,ky=0) = f(-kx,ky=0)^* + ! g(kx,ky=0) = g(-kx,ky=0)^* ! This symmetry is required for complex input to c2r #if defined(OMPGPU) !no async for OMPGPU for now !$omp target teams distribute parallel do simd collapse(2) & -!$omp& private(j,ix,f0) +!$omp& private(j,ix,g0) #else !$acc parallel loop gang vector independent collapse(2) async(2) & -!$acc& private(j,ix,f0) & -!$acc& present(fxmany) & +!$acc& private(j,ix,g0) & +!$acc& present(gxmany) & !$acc& present(nsplit,nx) #endif do j=1,nsplit do ix=1,nx/2-1 - f0 = 0.5*( fxmany(0,ix,j)+conjg(fxmany(0,nx-ix,j)) ) - fxmany(0,ix ,j) = f0 - fxmany(0,nx-ix,j) = conjg(f0) + g0 = 0.5*( gxmany(0,ix,j)+conjg(gxmany(0,nx-ix,j)) ) + gxmany(0,ix ,j) = g0 + gxmany(0,nx-ix,j) = conjg(g0) enddo enddo - ! make sure g_req progresses - call parallel_slib_test(g_req) - - ! -------------------------------------- - ! perform many Fourier Transforms at once - ! -------------------------------------- - #if defined(OMPGPU) -!$omp target data use_device_ptr(fymany,uymany) +!$omp target data use_device_ptr(gymany,vymany) #else -!$acc host_data use_device(fymany,uymany) +!$acc host_data use_device(gymany,vymany) #endif #ifdef HIPGPU - rc = hipfftExecZ2D(hip_plan_c2r_many,c_loc(fymany),c_loc(uymany)) + rc = hipfftExecZ2D(hip_plan_c2r_many,c_loc(gymany),c_loc(vymany)) #else - rc = cufftExecZ2D(cu_plan_c2r_many,fymany,uymany) + rc = cufftExecZ2D(cu_plan_c2r_many,gymany,vymany) #endif #if defined(OMPGPU) @@ -213,41 +215,43 @@ subroutine cgyro_nl_fftw(ij) !$acc end host_data #endif - ! make sure g_req progresses - call parallel_slib_test(g_req) - #if defined(OMPGPU) !no async for OMPGPU for now #else + ! make sure reqs progress + call parallel_slib_test(f_req) !$acc wait(2) #endif - ! fxmany is complete now - ! make sure g_req progresses - call parallel_slib_test(g_req) + ! make sure reqs progress + call parallel_slib_test(f_req) + ! gxmany is complete now #if defined(OMPGPU) -!$omp target data use_device_ptr(fxmany,uxmany) +!$omp target data use_device_ptr(gxmany,vxmany) #else -!$acc host_data use_device(fxmany,uxmany) +!$acc host_data use_device(gxmany,vxmany) #endif #ifdef HIPGPU - rc = hipfftExecZ2D(hip_plan_c2r_many,c_loc(fxmany),c_loc(uxmany)) + rc = hipfftExecZ2D(hip_plan_c2r_many,c_loc(gxmany),c_loc(vxmany)) #else - rc = cufftExecZ2D(cu_plan_c2r_many,fxmany,uxmany) + rc = cufftExecZ2D(cu_plan_c2r_many,gxmany,vxmany) #endif #if defined(OMPGPU) !$omp end target data #else + ! make sure reqs progress + call parallel_slib_test(f_req) +!$acc wait !$acc end host_data #endif - ! make sure g_req progresses - call parallel_slib_test(g_req) + ! make sure reqs progress + call parallel_slib_test(f_req) - ! we can zero the elements we know are zero while we wait for comm + ! we can zero the elements we know are zero while we wait #if defined(OMPGPU) !no async for OMPGPU for now !$omp target teams distribute parallel do simd collapse(3) @@ -259,8 +263,8 @@ subroutine cgyro_nl_fftw(ij) do j=1,nsplit do ix=nx2,nx0-1 do iy=0,ny2 - gxmany(iy,ix,j) = 0 - gymany(iy,ix,j) = 0 + fxmany(iy,ix,j) = 0 + fymany(iy,ix,j) = 0 enddo enddo enddo @@ -276,59 +280,52 @@ subroutine cgyro_nl_fftw(ij) do j=1,nsplit do ix=0,nx-1 do iy=n_toroidal,ny2 - gxmany(iy,ix,j) = 0 - gymany(iy,ix,j) = 0 + fxmany(iy,ix,j) = 0 + fymany(iy,ix,j) = 0 enddo enddo enddo call timer_lib_out('nl') - ! time to wait for the g_nl to become avaialble + ! time to wait for the F_nl to become avaialble call timer_lib_in('nl_comm') - call parallel_slib_f_fd_wait(n_field,n_radial,n_jtheta,gpack,g_nl,g_req) + call parallel_slib_f_nc_wait(fpack,f_nl,f_req) call timer_lib_out('nl_comm') call timer_lib_in('nl') +#if !defined(OMPGPU) +!$acc data present(f_nl) & +!$acc& present(fxmany,fymany,gxmany,gymany) & +!$acc& present(uxmany,uymany,vxmany,vymany) & +!$acc& present(uvmany) +#endif -! g_nl is (n_field,n_radial,n_jtheta,nt_loc,n_toroidal_procs) -! jcev_c_nl is (n_field,n_radial,n_jtheta,nv_loc,nt_loc,n_toroidal_procs) - +! f_nl is (radial, nt_loc, theta, nv_loc1, toroidal_procs) +! where nv_loc1 * toroidal_procs >= nv_loc #if defined(OMPGPU) !no async for OMPGPU for now !$omp target teams distribute parallel do simd collapse(4) & -!$omp& private(j,ir,p,ix,itor,mytm,iy,g0,it,iv_loc,it_loc,jtheta_min,itm,itl,irbase,irmax) +!$omp& private(j,ir,p,ix,itor,iy,f0,g0,itm,itl,irbase,irmax) #else !$acc parallel loop gang vector independent collapse(4) async(2) & -!$acc& private(j,ir,p,ix,itor,mytm,iy,g0,it,iv_loc,it_loc,jtheta_min,itm,itl,irbase,irmax) & -!$acc& present(g_nl,jvec_c_nl) & -!$acc& present(nsplit,n_radial,n_toroidal_procs,nt_loc,nt1,n_theta,nv_loc,nx0) +!$acc& private(j,ir,p,ix,itor,iy,f0,g0,itm,itl,irbase,irmax) #endif do j=1,nsplit - do irbase=1,n_radial,tile_size + do irbase=1,n_radial,1 do itm=1,n_toroidal_procs do itl=1,nt_loc - itor = itl + (itm-1)*nt_loc - mytm = 1 + nt1/nt_loc !my toroidal proc number - it = 1+((mytm-1)*nsplit+j-1)/nv_loc - iv_loc = 1+modulo((mytm-1)*nsplit+j-1,nv_loc) - jtheta_min = 1+((mytm-1)*nsplit)/nv_loc - it_loc = it-jtheta_min+1 + itor=itl + (itm-1)*nt_loc iy = itor-1 - - ! process several ir to improve jvec_nl and g_nl cache reuse + ! process several ir to improve f_nl cache reuse irmax=min(irbase+(tile_size-1),n_radial) do ir=irbase,irmax p = ir-1-nx0/2 ix = p if (ix < 0) ix = ix+nx - if (it > n_theta) then - g0 = (0.0,0.0) - else - g0 = i_c*sum( jvec_c_nl(1:n_field,ir,it_loc,iv_loc,itor)*g_nl(1:n_field,ir,it_loc,itor)) - endif - gxmany(iy,ix,j) = p*g0 - gymany(iy,ix,j) = iy*g0 + f0 = i_c*f_nl(ir,itl,j,itm) + fxmany(iy,ix,j) = p*f0 + fymany(iy,ix,j) = iy*f0 enddo enddo enddo @@ -342,36 +339,40 @@ subroutine cgyro_nl_fftw(ij) #endif ! Average elements so as to ensure - ! g(kx,ky=0) = g(-kx,ky=0)^* + ! f(kx,ky=0) = f(-kx,ky=0)^* ! This symmetry is required for complex input to c2r #if defined(OMPGPU) !no async for OMPGPU for now !$omp target teams distribute parallel do simd collapse(2) & -!$omp& private(j,ix,g0) +!$omp& private(j,ix,f0) #else !$acc parallel loop gang vector independent collapse(2) async(2) & -!$acc& private(j,ix,g0) & -!$acc& present(gxmany) & +!$acc& private(j,ix,f0) & +!$acc& present(fxmany) & !$acc& present(nsplit,nx) #endif do j=1,nsplit do ix=1,nx/2-1 - g0 = 0.5*( gxmany(0,ix,j)+conjg(gxmany(0,nx-ix,j)) ) - gxmany(0,ix ,j) = g0 - gxmany(0,nx-ix,j) = conjg(g0) + f0 = 0.5*( fxmany(0,ix,j)+conjg(fxmany(0,nx-ix,j)) ) + fxmany(0,ix ,j) = f0 + fxmany(0,nx-ix,j) = conjg(f0) enddo enddo + ! -------------------------------------- + ! perform many Fourier Transforms at once + ! -------------------------------------- + #if defined(OMPGPU) -!$omp target data use_device_ptr(gymany,vymany) +!$omp target data use_device_ptr(fymany,uymany) #else -!$acc host_data use_device(gymany,vymany) +!$acc host_data use_device(fymany,uymany) #endif #ifdef HIPGPU - rc = hipfftExecZ2D(hip_plan_c2r_many,c_loc(gymany),c_loc(vymany)) + rc = hipfftExecZ2D(hip_plan_c2r_many,c_loc(fymany),c_loc(uymany)) #else - rc = cufftExecZ2D(cu_plan_c2r_many,gymany,vymany) + rc = cufftExecZ2D(cu_plan_c2r_many,fymany,uymany) #endif #if defined(OMPGPU) @@ -385,25 +386,23 @@ subroutine cgyro_nl_fftw(ij) #else !$acc wait(2) #endif - - ! gxmany is complete now + ! fxmany is complete now #if defined(OMPGPU) -!$omp target data use_device_ptr(gxmany,vxmany) +!$omp target data use_device_ptr(fxmany,uxmany) #else -!$acc host_data use_device(gxmany,vxmany) +!$acc host_data use_device(fxmany,uxmany) #endif #ifdef HIPGPU - rc = hipfftExecZ2D(hip_plan_c2r_many,c_loc(gxmany),c_loc(vxmany)) + rc = hipfftExecZ2D(hip_plan_c2r_many,c_loc(fxmany),c_loc(uxmany)) #else - rc = cufftExecZ2D(cu_plan_c2r_many,gxmany,vxmany) + rc = cufftExecZ2D(cu_plan_c2r_many,fxmany,uxmany) #endif #if defined(OMPGPU) !$omp end target data #else -!$acc wait !$acc end host_data #endif diff --git a/cgyro/src/cgyro_rhs.f90 b/cgyro/src/cgyro_rhs.f90 index a32e07e43..2183ba5d2 100644 --- a/cgyro/src/cgyro_rhs.f90 +++ b/cgyro/src/cgyro_rhs.f90 @@ -52,9 +52,6 @@ subroutine cgyro_rhs(ij) complex, dimension(:), allocatable :: bvec_trap integer :: nj_loc - ! h_x is not modified after this and before nl_fftw - call cgyro_rhs_comm_async(1) - ! Prepare suitable distribution (g, not h) for conservative upwind method if (n_field > 1) then call timer_lib_in('str') @@ -74,17 +71,23 @@ subroutine cgyro_rhs(ij) else call timer_lib_in('str_mem') - g_x(:,:,:) = h_x(:,:,:) +!$omp parallel do collapse(2) private(iv_loc) + do itor=nt1,nt2 + do iv=nv1,nv2 + iv_loc = iv-nv1+1 + g_x(:,iv_loc,itor) = h_x(:,iv_loc,itor) + enddo + enddo call timer_lib_out('str_mem') endif - call cgyro_rhs_comm_test(1) ! Correct g_x for number conservation call cgyro_upwind - call cgyro_rhs_comm_test(1) call cgyro_rhs_comm_async(2) + ! comm2 is much smaller, so get that out of the way first + call cgyro_rhs_comm_async(1) call timer_lib_in('str') @@ -120,6 +123,7 @@ subroutine cgyro_rhs(ij) enddo enddo enddo + call cgyro_rhs_comm_test(2) call cgyro_rhs_comm_test(1) ! Explicit trapping term @@ -185,6 +189,7 @@ subroutine cgyro_rhs(ij) endif call timer_lib_out('str') + call cgyro_rhs_comm_test(2) call cgyro_rhs_comm_test(1) ! Wavenumber advection shear terms diff --git a/cgyro/src/cgyro_rhs.gpu.F90 b/cgyro/src/cgyro_rhs.gpu.F90 index 1cb08e49f..4f51ed876 100644 --- a/cgyro/src/cgyro_rhs.gpu.F90 +++ b/cgyro/src/cgyro_rhs.gpu.F90 @@ -49,9 +49,6 @@ subroutine cgyro_rhs(ij) real :: rval,rval2 complex :: rhs_stream,rhs_el - ! h_x is not modified after this and before nl_fftw - call cgyro_rhs_comm_async(1) - call timer_lib_in('str_mem') #if (!defined(OMPGPU)) && defined(_OPENACC) @@ -89,10 +86,8 @@ subroutine cgyro_rhs(ij) enddo #if (!defined(OMPGPU)) && defined(_OPENACC) - call cgyro_rhs_comm_test(1) !$acc wait(1) #endif - call cgyro_rhs_comm_test(1) call timer_lib_out('str') else @@ -118,18 +113,18 @@ subroutine cgyro_rhs(ij) enddo #if (!defined(OMPGPU)) && defined(_OPENACC) - call cgyro_rhs_comm_test(1) !$acc wait(1) #endif - call cgyro_rhs_comm_test(1) call timer_lib_out('str_mem') endif call cgyro_upwind - call cgyro_rhs_comm_test(1) call cgyro_rhs_comm_async(2) + ! comm2 is much smaller, so get that out of the way first + call cgyro_rhs_comm_async(1) + #if (!defined(OMPGPU)) && defined(_OPENACC) call timer_lib_in('str_mem') @@ -184,11 +179,11 @@ subroutine cgyro_rhs(ij) enddo #if (!defined(OMPGPU)) && defined(_OPENACC) - call cgyro_rhs_comm_test(1) + call cgyro_rhs_comm_test(2) !$acc wait(1) #endif - call cgyro_rhs_comm_test(1) call cgyro_rhs_comm_test(2) + call cgyro_rhs_comm_test(1) call timer_lib_out('str')