diff --git a/cgyro/src/cgyro_check_memory.F90 b/cgyro/src/cgyro_check_memory.F90 index 84a821f28..304b7e798 100644 --- a/cgyro/src/cgyro_check_memory.F90 +++ b/cgyro/src/cgyro_check_memory.F90 @@ -106,6 +106,9 @@ subroutine cgyro_check_memory(datafile) call cgyro_alloc_add_4d(io,n_field,nc,nv_loc,nt_loc,16,'omega_ss') call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'omega_cap_h') call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'omega_h') + if (triad_print_flag == 1) then + call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'diss_r') + endif call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'h_x') call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'h0_x') call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'g_x') @@ -133,6 +136,12 @@ subroutine cgyro_check_memory(datafile) call cgyro_alloc_add_3d(io,n_radial,nt_loc,nsplitA*n_toroidal_procs,16,'fpackA') call cgyro_alloc_add_3d(io,n_radial,nt_loc,nsplitB*n_toroidal_procs,16,'fpackB') call cgyro_alloc_add_4d(io,n_field,n_radial,n_jtheta,n_toroidal,16,'gpack') + if (triad_print_flag == 1) then + call cgyro_alloc_add_4d(io,n_radial,nt_loc,nsplitA,n_toroidal_procs,16,'eA_nl') + call cgyro_alloc_add_4d(io,n_radial,nt_loc,nsplitB,n_toroidal_procs,16,'eB_nl') + call cgyro_alloc_add_3d(io,n_radial,nt_loc,nsplitA*n_toroidal_procs,16,'epackA') + call cgyro_alloc_add_3d(io,n_radial,nt_loc,nsplitB*n_toroidal_procs,16,'epackB') + endif endif write(io,*) diff --git a/cgyro/src/cgyro_cleanup.F90 b/cgyro/src/cgyro_cleanup.F90 index 7828e8bf5..177021040 100644 --- a/cgyro/src/cgyro_cleanup.F90 +++ b/cgyro/src/cgyro_cleanup.F90 @@ -43,21 +43,33 @@ subroutine cgyro_cleanup ccl_del_device(vel2) deallocate(vel2) endif - if(allocated(w_e)) deallocate(w_e) + if(allocated(w_e)) then + ccl_del_device(w_e) + deallocate(w_e) + endif if(allocated(e_deriv1_mat)) deallocate(e_deriv1_mat) if(allocated(e_deriv1_rot_mat)) deallocate(e_deriv1_rot_mat) if(allocated(xi)) then ccl_del_device(xi) deallocate(xi) endif - if(allocated(w_xi)) deallocate(w_xi) - if(allocated(w_exi)) deallocate(w_exi) + if(allocated(w_xi)) then + ccl_del_device(w_xi) + deallocate(w_xi) + endif + if(allocated(w_exi)) then + ccl_del_device(w_exi) + deallocate(w_exi) + endif if(allocated(xi_lor_mat)) deallocate(xi_lor_mat) if(allocated(xi_deriv_mat)) deallocate(xi_deriv_mat) if(allocated(theta)) deallocate(theta) if(allocated(thetab)) deallocate(thetab) - if(allocated(w_theta)) deallocate(w_theta) + if(allocated(w_theta)) then + ccl_del_device(w_theta) + deallocate(w_theta) + endif if(allocated(g_theta)) deallocate(g_theta) if(allocated(g_theta_geo)) deallocate(g_theta_geo) if(allocated(bmag)) deallocate(bmag) @@ -83,7 +95,10 @@ subroutine cgyro_cleanup if(allocated(lambda_rot)) deallocate(lambda_rot) if(allocated(dlambda_rot)) deallocate(dlambda_rot) if(allocated(dens_rot)) deallocate(dens_rot) - if(allocated(dens2_rot)) deallocate(dens2_rot) + if(allocated(dens2_rot)) then + ccl_del_device(dens2_rot) + deallocate(dens2_rot) + endif if(allocated(dens_ele_rot)) deallocate(dens_ele_rot) if(allocated(dens_avg_rot)) deallocate(dens_avg_rot) if(allocated(dlnndr_avg_rot)) deallocate(dlnndr_avg_rot) @@ -137,6 +152,18 @@ subroutine cgyro_cleanup ccl_del_device(source) deallocate(source) endif + if(allocated(triad)) then + ccl_del_device(triad) + deallocate(triad) + endif + if(allocated(triad_loc)) then + ccl_del_device(triad_loc) + deallocate(triad_loc) + endif + if(allocated(triad_loc_old)) then + ccl_del_device(triad_loc_old) + deallocate(triad_loc_old) + endif if(allocated(thfac_itor)) then ccl_del_device(thfac_itor) deallocate(thfac_itor) @@ -201,6 +228,10 @@ subroutine cgyro_cleanup ccl_del_device(omega_sbeta) deallocate(omega_sbeta) endif + if(allocated(diss_r)) then + ccl_del_device(diss_r) + deallocate(diss_r) + endif if(allocated(jvec_c)) then ccl_del_device(jvec_c) deallocate(jvec_c) @@ -261,6 +292,14 @@ subroutine cgyro_cleanup ccl_del_device(g_nl) deallocate(g_nl) endif + if(allocated(eA_nl)) then + ccl_del_device(eA_nl) + deallocate(eA_nl) + endif + if(allocated(eB_nl)) then + ccl_del_device(eB_nl) + deallocate(eB_nl) + endif if(allocated(fpackA)) then ccl_del_device(fpackA) deallocate(fpackA) @@ -273,6 +312,14 @@ subroutine cgyro_cleanup ccl_del_device(gpack) deallocate(gpack) endif + if(allocated(epackA)) then + ccl_del_device(epackA) + deallocate(epackA) + endif + if(allocated(epackB)) then + ccl_del_device(epackB) + deallocate(epackB) + endif if (allocated(cmat)) then ccl_del_bigdevice(cmat) deallocate(cmat) diff --git a/cgyro/src/cgyro_equilibrium.F90 b/cgyro/src/cgyro_equilibrium.F90 index 453ea1681..a4edcfd6a 100644 --- a/cgyro/src/cgyro_equilibrium.F90 +++ b/cgyro/src/cgyro_equilibrium.F90 @@ -277,9 +277,9 @@ subroutine cgyro_equilibrium enddo #if defined(OMPGPU) -!$omp target enter data map(to:xi,omega_stream) +!$omp target enter data map(to:xi,omega_stream,w_theta,dens2_rot) #elif defined(_OPENACC) -!$acc enter data copyin(xi,omega_stream) +!$acc enter data copyin(xi,omega_stream,w_theta,dens2_rot) #endif #if defined(OMPGPU) || defined(_OPENACC) diff --git a/cgyro/src/cgyro_flux.f90 b/cgyro/src/cgyro_flux.f90 index bfdc2f728..b9622a165 100644 --- a/cgyro/src/cgyro_flux.f90 +++ b/cgyro/src/cgyro_flux.f90 @@ -194,8 +194,9 @@ subroutine cgyro_flux ! 2-1. Compute Triad energy transfer !------------------------------------------------------------- - kx = 2*pi*rho/length - do is=1,n_species + if (triad_print_flag == 1) then + kx = 2*pi*rho/length + do is=1,n_species ! Triad energy transfer : T_k triad_loc_old(is,:,itor,1) = triad_loc(is,:,itor,1) *temp(is)/dlntdr(is_ele) ! From Nonzonal Triad energy transfer : T_k [NZ(k',k")->k] @@ -214,7 +215,8 @@ subroutine cgyro_flux triad_loc_old(is,:,itor,7) = triad_loc(is,:,itor,6) *temp(is)/dlntdr(is_ele) ! Diss. (Coll. ) triad_loc_old(is,:,itor,8) = triad_loc(is,:,itor,7) *temp(is)/dlntdr(is_ele) - enddo + enddo + endif !----------------------------------------------------- @@ -279,14 +281,16 @@ subroutine cgyro_flux NEW_COMM_1, & i_err) - ! Reduced complex triad(ns,kx), below, is still distributed over n - call MPI_ALLREDUCE(triad_loc_old(:,:,:,:), & + if (triad_print_flag == 1) then + ! Reduced complex triad(ns,kx), below, is still distributed over n + call MPI_ALLREDUCE(triad_loc_old(:,:,:,:), & triad, & size(triad), & MPI_DOUBLE_COMPLEX, & MPI_SUM, & NEW_COMM_1, & i_err) + endif tave_step = tave_step + 1 diff --git a/cgyro/src/cgyro_init_arrays.F90 b/cgyro/src/cgyro_init_arrays.F90 index 7b0b63b46..09670ca24 100644 --- a/cgyro/src/cgyro_init_arrays.F90 +++ b/cgyro/src/cgyro_init_arrays.F90 @@ -448,12 +448,14 @@ subroutine cgyro_init_arrays + abs(omega_rot_drift_r(it,is)) & + abs(omega_rot_edrift_r(it))) - ! (d/dr) upwind dissipation for triad energy transfer diagnostics - diss_r(ic,iv_loc,itor) = - (n_radial/length)*spectraldiss(u,nup_radial)*up_radial & - * (abs(omega_rdrift(it,is))*energy(ie)*(1.0+xi(ix)**2) & - + abs(omega_cdrift_r(it,is)*xi(ix))*vel(ie) & - + abs(omega_rot_drift_r(it,is)) & - + abs(omega_rot_edrift_r(it))) + if (triad_print_flag == 1) then + ! (d/dr) upwind dissipation for triad energy transfer diagnostics + diss_r(ic,iv_loc,itor) = - (n_radial/length)*spectraldiss(u,nup_radial)*up_radial & + * (abs(omega_rdrift(it,is))*energy(ie)*(1.0+xi(ix)**2) & + + abs(omega_cdrift_r(it,is)*xi(ix))*vel(ie) & + + abs(omega_rot_drift_r(it,is)) & + + abs(omega_rot_edrift_r(it))) + endif ! omega_star carg = & @@ -487,6 +489,13 @@ subroutine cgyro_init_arrays #elif defined(_OPENACC) !$acc enter data copyin(omega_cap_h,omega_h,omega_s,omega_ss,omega_sbeta) #endif + if (triad_print_flag == 1) then +#if defined(OMPGPU) +!$omp target enter data map(to:diss_r) +#elif defined(_OPENACC) +!$acc enter data copyin(diss_r) +#endif + endif !------------------------------------------------------------------------- deallocate(gdlnndr,gdlntdr) diff --git a/cgyro/src/cgyro_init_manager.F90 b/cgyro/src/cgyro_init_manager.F90 index 8a2644e75..7316e6951 100644 --- a/cgyro/src/cgyro_init_manager.F90 +++ b/cgyro/src/cgyro_init_manager.F90 @@ -111,6 +111,12 @@ subroutine cgyro_init_manager enddo enddo +#if defined(OMPGPU) +!$omp target enter data map(to:w_exi,w_e,w_xi) +#elif defined(_OPENACC) +!$acc enter data copyin(w_exi,w_e,w_xi) +#endif + allocate(theta(n_theta)) allocate(thetab(n_theta,n_radial/box_size)) allocate(w_theta(n_theta)) @@ -179,10 +185,6 @@ subroutine cgyro_init_manager allocate( gflux(0:n_global,n_species,4,n_field,nt1:nt2)) allocate(gflux_loc(0:n_global,n_species,4,n_field,nt1:nt2)) - allocate( triad(n_species,n_radial,nt1:nt2,8)) - allocate(triad_loc(n_species,n_radial,nt1:nt2,7)) - allocate(triad_loc_old(n_species,n_radial,nt1:nt2,8)) - allocate(cflux_tave(n_species,4)) allocate(gflux_tave(n_species,4)) @@ -195,6 +197,16 @@ subroutine cgyro_init_manager #elif defined(_OPENACC) !$acc enter data create(fcoef,gcoef,field,field_loc,source) #endif + if (triad_print_flag == 1) then + allocate( triad(n_species,n_radial,nt1:nt2,8)) + allocate(triad_loc(n_species,n_radial,nt1:nt2,7)) + allocate(triad_loc_old(n_species,n_radial,nt1:nt2,8)) +#if defined(OMPGPU) +!$omp target enter data map(alloc:triad,triad_loc,triad_loc_old) +#elif defined(_OPENACC) +!$acc enter data create(triad,triad_loc,triad_loc_old) +#endif + endif if ((collision_model /= 5) .AND. (collision_field_model == 1)) then ! nc and nc_loc must be last, since it will be collated @@ -262,7 +274,9 @@ subroutine cgyro_init_manager allocate(cap_h_v(nc_loc,nt1:nt2,nv)) allocate(omega_cap_h(nc,nv_loc,nt1:nt2)) allocate(omega_h(nc,nv_loc,nt1:nt2)) - allocate(diss_r(nc,nv_loc,nt1:nt2)) + if (triad_print_flag == 1) then + allocate(diss_r(nc,nv_loc,nt1:nt2)) + endif allocate(omega_s(n_field,nc,nv_loc,nt1:nt2)) allocate(omega_ss(n_field,nc,nv_loc,nt1:nt2)) allocate(omega_sbeta(nc,nv_loc,nt1:nt2)) @@ -302,10 +316,8 @@ subroutine cgyro_init_manager ! Nonlinear arrays if (nonlinear_flag == 1) then allocate(fA_nl(n_radial,nt_loc,nsplitA,n_toroidal_procs)) - allocate(eA_nl(n_radial,nt_loc,nsplitA,n_toroidal_procs)) allocate(g_nl(n_field,n_radial,n_jtheta,n_toroidal)) allocate(fpackA(n_radial,nt_loc,nsplitA*n_toroidal_procs)) - allocate(epackA(n_radial,nt_loc,nsplitA*n_toroidal_procs)) allocate(gpack(n_field,n_radial,n_jtheta,n_toroidal)) allocate(jvec_c_nl(n_field,n_radial,n_jtheta,nv_loc,n_toroidal)) #if defined(OMPGPU) @@ -313,16 +325,32 @@ subroutine cgyro_init_manager #elif defined(_OPENACC) !$acc enter data create(fpackA,gpack,fA_nl,g_nl,jvec_c_nl) #endif + if (triad_print_flag == 1) then + allocate(eA_nl(n_radial,nt_loc,nsplitA,n_toroidal_procs)) + allocate(epackA(n_radial,nt_loc,nsplitA*n_toroidal_procs)) +#if defined(OMPGPU) +!$omp target enter data map(alloc:epackA,eA_nl) +#elif defined(_OPENACC) +!$acc enter data create(epackA,eA_nl) +#endif + endif if (nsplitB > 0) then ! nsplitB can be zero at large MPI allocate(fB_nl(n_radial,nt_loc,nsplitB,n_toroidal_procs)) allocate(fpackB(n_radial,nt_loc,nsplitB*n_toroidal_procs)) - allocate(eB_nl(n_radial,nt_loc,nsplitB,n_toroidal_procs)) - allocate(epackB(n_radial,nt_loc,nsplitB*n_toroidal_procs)) #if defined(OMPGPU) !$omp target enter data map(alloc:fpackB,fB_nl) #elif defined(_OPENACC) !$acc enter data create(fpackB,fB_nl) #endif + if (triad_print_flag == 1) then + allocate(epackB(n_radial,nt_loc,nsplitB*n_toroidal_procs)) + allocate(eB_nl(n_radial,nt_loc,nsplitB,n_toroidal_procs)) +#if defined(OMPGPU) +!$omp target enter data map(alloc:epackB,eB_nl) +#elif defined(_OPENACC) +!$acc enter data create(epackB,eB_nl) +#endif + endif endif endif diff --git a/cgyro/src/cgyro_kernel.F90 b/cgyro/src/cgyro_kernel.F90 index 95c08273f..ee3bc3c24 100644 --- a/cgyro/src/cgyro_kernel.F90 +++ b/cgyro/src/cgyro_kernel.F90 @@ -135,6 +135,13 @@ subroutine cgyro_kernel ! wait for cap_h_c to be synched into system memory, used by cgyro_write_timedata !$acc wait(4) #endif + if (triad_print_flag == 1) then +#if defined(OMPGPU) +!$omp target update from(triad_loc) +#elif defined(_OPENACC) +!$acc update host(triad_loc) +#endif + endif call timer_lib_out('coll_mem') call timer_lib_in('io') diff --git a/cgyro/src/cgyro_nl_comm.F90 b/cgyro/src/cgyro_nl_comm.F90 index 4e0e74fae..a4cd1d28f 100644 --- a/cgyro/src/cgyro_nl_comm.F90 +++ b/cgyro/src/cgyro_nl_comm.F90 @@ -326,9 +326,24 @@ subroutine cgyro_nl_fftw_comm1_r_triad(ij) real :: dv,dvr,dvp,rval,rval2 complex :: cprod,cprod2,thfac - triad_loc_old(:,:,nt1:nt2,3) = triad_loc(:,:,nt1:nt2,3) - triad_loc_old(:,:,nt1:nt2,4) = triad_loc(:,:,nt1:nt2,4) - triad_loc(:,:,nt1:nt2,:) = 0.0 +#if defined(OMPGPU) +!$omp target teams distribute parallel do simd collapse(3) +#elif defined(_OPENACC) +!$acc parallel loop collapse(3) gang vector independent & +!$acc& present(triad_loc_old,triad_loc) & +!$acc& present(nt1,nt2,n_radial,n_species) default(none) +#else +!$omp parallel do private(ir,is) +#endif + do itor=nt1,nt2 + do ir=1,n_radial + do is=1,n_species + triad_loc_old(is,ir,itor,3) = triad_loc(is,ir,itor,3) + triad_loc_old(is,ir,itor,4) = triad_loc(is,ir,itor,4) + triad_loc(is,ir,itor,:) = 0.0 + enddo + enddo + enddo call timer_lib_in('nl_comm') call parallel_slib_r_nc_wait(nsplitA,fA_nl,fpackA,fA_req) @@ -351,11 +366,16 @@ subroutine cgyro_nl_fftw_comm1_r_triad(ij) #if defined(OMPGPU) !$omp target teams distribute parallel do simd collapse(4) & !$omp& private(iexch0,itor0,isplit0,iexch_base) & +!$omp& private(id,itorbox,jr0,jc,itd,itd_class,thfac) & !$omp& private(ic_loc_m,my_psi) #elif defined(_OPENACC) !$acc parallel loop collapse(4) gang vector independent private(ic_loc_m,my_psi) & !$acc& private(iexch0,itor0,isplit0,iexch_base) & -!$acc& present(ic_c,px,rhs,fpackA,fpackB) copyin(psi_mul,zf_scale) & +!$acc& private(id,itorbox,jr0,jc,itd,itd_class,thfac) & +!$acc& present(h_x,g_x,cap_h_c,cap_h_ct,field,dvjvec_c,jvec_c) & +!$acc& present(ic_c,is_v,ix_v,ie_v,w_exi,w_theta,dens2_rot,z,temp) & +!$acc& present(omega_stream,vel,xi,thfac_itor,cderiv,uderiv) & +!$acc& present(px,rhs,fpackA,fpackB,epackA,epackB,diss_r,triad_loc) copyin(psi_mul,zf_scale) & !$acc& present(nt1,nt2,nv_loc,n_theta,n_radial,nsplit,nsplitA,nsplitB) copyin(ij) default(none) #else !$omp parallel do collapse(2) private(ic_loc_m,my_psi) & @@ -365,12 +385,12 @@ subroutine cgyro_nl_fftw_comm1_r_triad(ij) do itor=nt1,nt2 do iv_loc_m=1,nv_loc do ir=1,n_radial - itorbox = itor*box_size*sign_qs - jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial) - jr0(1) = n_theta*(ir-1) - jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial) - do it=1,n_theta + itorbox = itor*box_size*sign_qs + jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial) + jr0(1) = n_theta*(ir-1) + jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial) + ic_loc_m = ic_c(ir,it) is = is_v(iv_loc_m +nv1 -1 ) @@ -490,11 +510,16 @@ subroutine cgyro_nl_fftw_comm1_r_triad(ij) #if defined(OMPGPU) !$omp target teams distribute parallel do simd collapse(4) & !$omp& private(iexch0,itor0,isplit0,iexch_base) & +!$omp& private(id,itorbox,jr0,jc,itd,itd_class,thfac) & !$omp& private(ic_loc_m,my_psi) #elif defined(_OPENACC) !$acc parallel loop collapse(4) gang vector independent private(ic_loc_m,my_psi) & !$acc& private(iexch0,itor0,isplit0,iexch_base) & -!$acc& present(ic_c,px,rhs,fpackA) copyin(psi_mul,zf_scale) & +!$acc& private(id,itorbox,jr0,jc,itd,itd_class,thfac) & +!$acc& present(h_x,g_x,cap_h_c,cap_h_ct,field,dvjvec_c,jvec_c) & +!$acc& present(ic_c,is_v,ix_v,ie_v,w_exi,w_theta,dens2_rot,z,temp) & +!$acc& present(omega_stream,vel,xi,thfac_itor,cderiv,uderiv) & +!$acc& present(px,rhs,fpackA,epackA,diss_r,triad_loc) copyin(psi_mul,zf_scale) & !$acc& present(nt1,nt2,nv_loc,n_theta,n_radial,nsplit,nsplitA) copyin(ij) default(none) #else !$omp parallel do collapse(2) private(ic_loc_m,my_psi) & @@ -504,12 +529,12 @@ subroutine cgyro_nl_fftw_comm1_r_triad(ij) do itor=nt1,nt2 do iv_loc_m=1,nv_loc do ir=1,n_radial - itorbox = itor*box_size*sign_qs - jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial) - jr0(1) = n_theta*(ir-1) - jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial) - do it=1,n_theta + itorbox = itor*box_size*sign_qs + jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial) + jr0(1) = n_theta*(ir-1) + jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial) + ic_loc_m = ic_c(ir,it) is = is_v(iv_loc_m +nv1 -1 ) diff --git a/cgyro/src/cgyro_nl_fftw.F90 b/cgyro/src/cgyro_nl_fftw.F90 index a2e065fde..a4c58206a 100644 --- a/cgyro/src/cgyro_nl_fftw.F90 +++ b/cgyro/src/cgyro_nl_fftw.F90 @@ -378,7 +378,63 @@ subroutine cgyro_nl_fftw_mul(sz,uvm,uxm,vym,uym,vxm,inv_nxny) end subroutine -subroutine cgyro_nl_fftw(ij) +subroutine cgyro_nl_fftw_mul_sub_mean(ny,nx,nt,uvm,uxm,vym,uym,vxm,inv_nxny) + implicit none + + integer, intent(in) :: ny,nx,nt + real, dimension(*),intent(out) :: uvm + real, dimension(*),intent(in) :: uxm,vym,uym,vxm + real, intent(in) :: inv_nxny + + integer :: iy,ix,it + integer :: sz,sz_t + integer :: i,i_xt + real :: y_mean_ux, y_mean_uy, y_mean_vx, y_mean_vy + real :: r_ux, r_uy, r_vx, r_vy + real :: inv_ny + + sz = ny*nx*nt + sz_t = ny*nx + inv_ny = 1.0/ny + +#if defined(OMPGPU) +!$omp target teams distribute parallel do collapse(2) & +!$omp& private(iy,i_xt,y_mean_ux,y_mean_uy,y_mean_vx,y_mean_vy) & +!$omp& private(i,r_ux,r_uy,r_vx,r_vy) & +!$omp& map(to:uxm(1:sz),vym(1:sz),uym(1:sz),vxm(1:sz)) & +!$omp& map(from:uvm(1:sz)) +#else +!$acc parallel loop independent gang collapse(2) & +!$acc& private(iy,i_xt,y_mean_ux,y_mean_uy,y_mean_vx,y_mean_vy) & +!$acc& present(uvm,uxm,vym,uym,vxm) +#endif + do it=0,nt-1 + do ix=0,nx-1 + i_xt = (it*sz_t) + ix*ny; + y_mean_ux = sum(uxm(i_xt+1:i_xt+ny) ) * inv_ny + y_mean_uy = sum(uym(i_xt+1:i_xt+ny) ) * inv_ny + y_mean_vx = sum(vxm(i_xt+1:i_xt+ny) ) * inv_ny + y_mean_vy = sum(vym(i_xt+1:i_xt+ny) ) * inv_ny +#if (!defined(OMPGPU)) && defined(_OPENACC) +!$acc loop vector private(i,r_ux,r_uy,r_vx,r_vy) +#endif + do iy=1,ny + i = i_xt+iy + ! remove ky=0 + r_ux = uxm(i) - y_mean_ux + r_uy = uym(i) - y_mean_uy + r_vx = vxm(i) - y_mean_vx + r_vy = vym(i) - y_mean_vy + + ! compute and save uv + uvm(i) = (uxm(i)*vym(i)-uym(i)*vxm(i))*inv_nxny + enddo + enddo + enddo + +end subroutine + +subroutine cgyro_nl_fftw(i_triad) #if defined(HIPGPU) use hipfort @@ -398,7 +454,7 @@ subroutine cgyro_nl_fftw(ij) include 'fftw/fftw3.f' #endif !----------------------------------- - integer, intent(in) :: ij + integer, intent(in) :: i_triad !----------------------------------- integer :: j,p,iexch integer :: it,ir,itm,itl,ix,iy @@ -477,7 +533,7 @@ subroutine cgyro_nl_fftw(ij) call timer_lib_in('nl') #if !defined(OMPGPU) -!$acc data present(fA_nl) & +!$acc data present(fA_nl,eA_nl) & !$acc& present(fxmany,fymany,gxmany,gymany) & !$acc& present(uxmany,uymany,vxmany,vymany) & !$acc& present(uvmany) @@ -929,8 +985,99 @@ subroutine cgyro_nl_fftw(ij) enddo enddo + if (i_triad == 1) then + call timer_lib_out('nl_mem') + call timer_lib_in('nl') + ! 2. Poisson bracket in real space with Non_Zonal pairs + + call cgyro_nl_fftw_mul_sub_mean(size(uvmany,1),size(uvmany,2),nsplitA, & + uvmany, & + uxmany,vymany(:,:,1:nsplitA), & + uymany,vxmany(:,:,1:nsplitA), & + inv_nxny) + + ! make sure reqs progress + call cgyro_nl_fftw_comm_test() + + ! ------------------ + ! Transform uv to fx + ! ------------------ + +#if defined(OMPGPU) + +#if defined(MKLGPU) +!$omp target data map(tofrom: uvmany,fxmany) +#else +!$omp target data use_device_ptr(uvmany,fxmany) +#endif + +#else +!$acc wait +!$acc host_data use_device(uvmany,fxmany) +#endif + +#if defined(HIPGPU) + rc = hipfftExecD2Z(hip_plan_r2c_manyA,c_loc(uvmany),c_loc(fxmany)) +#elif defined(MKLGPU) + !$omp dispatch + call dfftw_execute_dft_r2c(dfftw_plan_r2c_manyA,uvmany,fxmany) + rc = 0 +#else + rc = cufftExecD2Z(cu_plan_r2c_manyA,uvmany,fxmany) +#endif + +#ifdef HIPGPU + ! hipfftExec is asynchronous, will need the results below + rc = hipDeviceSynchronize() +#endif + +#if defined(OMPGPU) +!$omp end target data +#else +!$acc wait +!$acc end host_data +#endif + + ! make sure reqs progress + call cgyro_nl_fftw_comm_test() + + call timer_lib_out('nl') + call timer_lib_in('nl_mem') + + ! tile for performance, since this is effectively a transpose +#if defined(OMPGPU) +!$omp target teams distribute parallel do collapse(5) & +!$omp& private(iy,ir,itm,itl,ix) +#else +!$acc parallel loop independent collapse(5) gang & +!$acc& private(iy,ir,itm,itl,ix) present(eA_nl,fxmany) +#endif + do j=1,nsplitA + do iy0=0,n_toroidal+(R_TORTILE-1)-1,R_TORTILE ! round up + do ir0=0,n_radial+(R_RADTILE-1)-1,R_RADTILE ! round up + do iy1=0,(R_TORTILE-1) ! tile + do ir1=0,(R_RADTILE-1) ! tile + iy = iy0 + iy1 + ir = 1 + ir0 + ir1 + if ((iy < n_toroidal) .and. (ir <= n_radial)) then + ! itor = iy+1 + itm = 1 + iy/nt_loc + itl = 1 + modulo(iy,nt_loc) + ix = ir-1-nx0/2 + if (ix < 0) ix = ix+nx + + eA_nl(ir,itl,j,itm) = fxmany(iy,ix,j) + endif + enddo + enddo + enddo + enddo + enddo + + endif ! i_triad==1 + #if !defined(OMPGPU) - ! end data fA_nl + ! end data fA_nl,eA_nl !$acc end data #endif @@ -978,6 +1125,9 @@ subroutine cgyro_nl_fftw(ij) ! start the async reverse comm ! can reuse the same req, no overlap with forward fA_req call parallel_slib_r_nc_async(nsplitA,fA_nl,fpackA,fA_req) + if (i_triad == 1) then + call parallel_slib_r_nc_async(nsplitA,eA_nl,epackA,eA_req) + end if fA_req_valid = .TRUE. if (nsplitB > 0) then @@ -993,7 +1143,7 @@ subroutine cgyro_nl_fftw(ij) call timer_lib_in('nl') #if !defined(OMPGPU) -!$acc data present(fB_nl) & +!$acc data present(fB_nl,eB_nl) & !$acc& present(fxmany,fymany,gxmany,gymany) & !$acc& present(uxmany,uymany,vxmany,vymany) & !$acc& present(uvmany) @@ -1242,8 +1392,105 @@ subroutine cgyro_nl_fftw(ij) enddo enddo + if (i_triad == 1) then + call timer_lib_out('nl_mem') + call timer_lib_in('nl') + ! 2. Poisson bracket in real space with Non_Zonal pairs + call cgyro_nl_fftw_mul(size(uvmany,1)*size(uvmany,2)*nsplitB, & + uvmany, & + uxmany,vymany(:,:,(nsplitA+1):nsplit), & + uymany,vxmany(:,:,(nsplitA+1):nsplit), & + inv_nxny) + + ! make sure reqs progress + call cgyro_nl_fftw_comm_test() + + ! ------------------ + ! Transform uv to fx + ! ------------------ + +#if defined(OMPGPU) + +#if defined(MKLGPU) +!$omp target data map(tofrom: uvmany,fxmany) +#else +!$omp target data use_device_ptr(uvmany,fxmany) +#endif + +#else +!$acc wait +!$acc host_data use_device(uvmany,fxmany) +#endif + +#if defined(HIPGPU) + rc = hipfftExecD2Z(hip_plan_r2c_manyB,c_loc(uvmany),c_loc(fxmany)) +#elif defined(MKLGPU) + !$omp dispatch + call dfftw_execute_dft_r2c(dfftw_plan_r2c_manyB,uvmany,fxmany) + rc = 0 +#else + rc = cufftExecD2Z(cu_plan_r2c_manyB,uvmany,fxmany) +#endif + +#ifdef HIPGPU + ! make sure reqs progress + call cgyro_nl_fftw_comm_test() + ! hipfftExec is asynchronous, will need the results below + rc = hipDeviceSynchronize() +#endif + +#if defined(OMPGPU) +!$omp end target data +#else + ! make sure reqs progress + call cgyro_nl_fftw_comm_test() +!$acc wait +!$acc end host_data +#endif + + ! make sure reqs progress + call cgyro_nl_fftw_comm_test() + + call timer_lib_out('nl') + call timer_lib_in('nl_mem') + + ! NOTE: The FFT will generate an unwanted n=0,p=-nr/2 component + ! that will be filtered in the main time-stepping loop + + ! tile for performance, since this is effectively a transpose +#if defined(OMPGPU) +!$omp target teams distribute parallel do collapse(5) & +!$omp& private(iy,ir,itm,itl,ix) +#else +!$acc parallel loop independent collapse(5) gang & +!$acc& private(iy,ir,itm,itl,ix) present(eB_nl,fxmany) +#endif + do j=1,nsplitB + do iy0=0,n_toroidal+(R_TORTILE-1)-1,R_TORTILE ! round up + do ir0=0,n_radial+(R_RADTILE-1)-1,R_RADTILE ! round up + do iy1=0,(R_TORTILE-1) ! tile + do ir1=0,(R_RADTILE-1) ! tile + iy = iy0 + iy1 + ir = 1 + ir0 + ir1 + if ((iy < n_toroidal) .and. (ir <= n_radial)) then + ! itor = iy+1 + itm = 1 + iy/nt_loc + itl = 1 + modulo(iy,nt_loc) + ix = ir-1-nx0/2 + if (ix < 0) ix = ix+nx + + eB_nl(ir,itl,j,itm) = fxmany(iy,ix,j) + endif + enddo + enddo + enddo + enddo + enddo + + endif ! if i_triad + #if !defined(OMPGPU) - ! end data fB_nl + ! end data fB_nl,eB_nl !$acc end data #endif @@ -1253,6 +1500,9 @@ subroutine cgyro_nl_fftw(ij) ! start the async reverse comm ! can reuse the same req, no overlap with forward fB_req call parallel_slib_r_nc_async(nsplitB,fB_nl,fpackB,fB_req) + if (i_triad == 1) then + call parallel_slib_r_nc_async(nsplitB,eB_nl,epackB,eB_req) + end if fB_req_valid = .TRUE. ! make sure reqs progress call cgyro_nl_fftw_comm_test() @@ -1272,16 +1522,24 @@ subroutine cgyro_nl_fftw_stepr(g_j, f_j, nl_idx, i_omp) implicit none + !----------------------------------- integer, intent(in) :: g_j, f_j integer,intent(in) :: nl_idx ! 1=>A, 2=>B integer,intent(in) :: i_omp + !----------------------------------- integer :: ix,iy integer :: ir,itm,itl,itor + real :: inv_nxny include 'fftw3.f03' + inv_nxny = 1.0/(nx*ny) ! Poisson bracket in real space - uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny) + do ix=0,nx-1 + do iy=0,ny-1 + uv(iy,ix,i_omp) = (uxmany(iy,ix,f_j)*vymany(iy,ix,g_j)-uymany(iy,ix,f_j)*vxmany(iy,ix,g_j)) * inv_nxny + enddo + enddo call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp)) @@ -1307,6 +1565,7 @@ subroutine cgyro_nl_fftw_stepr(g_j, f_j, nl_idx, i_omp) end subroutine cgyro_nl_fftw_stepr + ! assumes cgyro_nl_fftw_stepr has already been called subroutine cgyro_nl_fftw_stepr_triad(g_j, f_j, nl_idx, i_omp) use timer_lib @@ -1315,73 +1574,47 @@ subroutine cgyro_nl_fftw_stepr_triad(g_j, f_j, nl_idx, i_omp) implicit none + !----------------------------------- integer, intent(in) :: g_j, f_j integer,intent(in) :: nl_idx ! 1=>A, 2=>B integer,intent(in) :: i_omp - real :: y_mean(nx) + !----------------------------------- + real :: y_mean_ux, y_mean_uy, y_mean_vx, y_mean_vy + real :: r_ux, r_uy, r_vx, r_vy integer :: ix,iy integer :: ir,itm,itl,itor + real :: inv_nxny,inv_ny include 'fftw3.f03' - ! 1. Poisson bracket in real space - uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny) - - call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp)) + inv_ny = 1.0/ny + inv_nxny = 1.0/(nx*ny) + ! 1. Poisson bracket in real space was done in cgyro_nl_fftw_stepr - ! NOTE: The FFT will generate an unwanted n=0,p=-nr/2 component - ! that will be filtered in the main time-stepping loop + ! 2. Poisson bracket in real space with Non_Zonal pairs - ! this should really be accounted against nl_mem, but hard to do with OMP - do itm=1,n_toroidal_procs - do itl=1,nt_loc - itor=itl + (itm-1)*nt_loc - do ir=1,n_radial - ix = ir-1-nx0/2 - if (ix < 0) ix = ix+nx - iy = itor-1 - if (nl_idx==1) then - fA_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp) - else - fB_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp) - endif + do ix=0,nx-1 + y_mean_ux = sum(uxmany(:,ix,f_j) ,dim=1 ) * inv_ny + y_mean_uy = sum(uymany(:,ix,f_j) ,dim=1 ) * inv_ny + y_mean_vx = sum(vxmany(:,ix,f_j) ,dim=1 ) * inv_ny + y_mean_vy = sum(vymany(:,ix,f_j) ,dim=1 ) * inv_ny + do iy=0,ny-1 + ! remove ky=0 + r_ux = uxmany(iy,ix,f_j) - y_mean_ux + r_uy = uymany(iy,ix,f_j) - y_mean_uy + r_vx = vxmany(iy,ix,f_j) - y_mean_vx + r_vy = vymany(iy,ix,f_j) - y_mean_vy + ! we could save, but we do not really need to (not used outside from this function) + !uxmany(iy,ix,f_j) = r_ux + !uymany(iy,ix,f_j) = r_y + !vxmany(iy,ix,f_j) = r_vx + !vymany(iy,ix,f_j) = r_vy + + ! compute and save uv + uv(iy,ix,i_omp) = (r_ux*r_vy-r_uy*r_vx) * inv_nxny enddo - enddo enddo - - ! 2. Poisson bracket in real space with Non_Zonal pairs - - ! remove ky=0 in uxmany - y_mean = sum(uxmany(:,:,f_j) ,dim=1 ) / ny - do iy=0,ny-1 - uxmany(iy,:,f_j) = uxmany(iy,:,f_j) -y_mean - - end do - - ! remove ky=0 in uymany - y_mean = sum(uymany(:,:,f_j) ,dim=1 ) / ny ! =0 - do iy=0,ny-1 - uymany(iy,:,f_j) = uymany(iy,:,f_j) -y_mean - - end do - - ! remove ky=0 in vx - y_mean = sum(vxmany(:,:,g_j) ,dim=1 ) / ny - do iy=0,ny-1 - vxmany(iy,:,g_j) = vxmany(iy,:,g_j) -y_mean - - end do - - ! remove ky=0 in vy - y_mean = sum(vymany(:,:,g_j) ,dim=1 ) / ny ! =0 - do iy=0,ny-1 - vymany(iy,:,g_j) = vymany(iy,:,g_j) -y_mean - - end do - - uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny) - call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp)) do itm=1,n_toroidal_procs @@ -1403,7 +1636,7 @@ subroutine cgyro_nl_fftw_stepr_triad(g_j, f_j, nl_idx, i_omp) end subroutine cgyro_nl_fftw_stepr_triad ! NOTE: call cgyro_nl_fftw_comm1 before cgyro_nl_fftw -subroutine cgyro_nl_fftw(ij) +subroutine cgyro_nl_fftw(i_triad) use timer_lib use parallel_lib @@ -1414,9 +1647,9 @@ subroutine cgyro_nl_fftw(ij) !----------------------------------- - integer, intent(in) :: ij + integer, intent(in) :: i_triad !----------------------------------- - integer :: ix,iy,i_triad=0 + integer :: ix,iy integer :: ir,it,itm,itl,it_loc integer :: itor,mytm integer :: j,p @@ -1429,10 +1662,6 @@ subroutine cgyro_nl_fftw(ij) include 'fftw3.f03' - if (triad_print_flag == 1 .and. ij == 3) then - i_triad=1 - endif - call cgyro_nl_fftw_comm_test() ! time to wait for the FA_nl to become avaialble @@ -1574,10 +1803,14 @@ subroutine cgyro_nl_fftw(ij) endif if (j<=nsplitA) then + call cgyro_nl_fftw_stepr(j, j, 1, i_omp) + if (i_omp==1) then + ! use the main thread to progress the async MPI + call cgyro_nl_fftw_comm_test() + endif + if (i_triad == 1) then call cgyro_nl_fftw_stepr_triad(j, j, 1, i_omp) - else - call cgyro_nl_fftw_stepr(j, j, 1, i_omp) endif endif ! else we will do it in the next loop @@ -1660,10 +1893,14 @@ subroutine cgyro_nl_fftw(ij) call cgyro_nl_fftw_comm_test() endif + call cgyro_nl_fftw_stepr(nsplitA+j, j, 2, i_omp) + if (i_omp==1) then + ! use the main thread to progress the async MPI + call cgyro_nl_fftw_comm_test() + endif + if (i_triad == 1) then call cgyro_nl_fftw_stepr_triad(nsplitA+j, j, 2, i_omp) - else - call cgyro_nl_fftw_stepr(nsplitA+j, j, 2, i_omp) endif enddo ! j diff --git a/cgyro/src/cgyro_rhs.F90 b/cgyro/src/cgyro_rhs.F90 index b83770355..b566b4328 100644 --- a/cgyro/src/cgyro_rhs.F90 +++ b/cgyro/src/cgyro_rhs.F90 @@ -393,6 +393,7 @@ subroutine cgyro_rhs(ij,update_cap) integer, intent(in) :: ij logical, intent(in) :: update_cap !-------------------------------- + integer :: i_triad ! fields is ready by now call cgyro_rhs_comm_async_fd @@ -412,9 +413,13 @@ subroutine cgyro_rhs(ij,update_cap) ! Nonlinear evaluation [f,g] if (nonlinear_flag == 1) then + i_triad=0 + if (triad_print_flag == 1 .and. ij == 3) then + i_triad=1 + endif ! assumes someone already started the input comm ! and will finish the output comm - call cgyro_nl_fftw(ij) + call cgyro_nl_fftw(i_triad) endif call cgyro_upwind_complete diff --git a/cgyro/src/cgyro_step_collision.F90 b/cgyro/src/cgyro_step_collision.F90 index c2297993c..eb97d6207 100644 --- a/cgyro/src/cgyro_step_collision.F90 +++ b/cgyro/src/cgyro_step_collision.F90 @@ -1049,6 +1049,38 @@ subroutine cgyro_step_collision_gpu(use_simple) call timer_lib_in('coll') ! Compute H given h and [phi(h), apar(h)] + if (triad_print_flag == 1 ) then + +#if defined(OMPGPU) +!$omp target teams distribute parallel do simd collapse(3) & +!$omp& private(iv_loc,is,my_psi,my_ch) +#else +!$acc parallel loop collapse(3) gang vector private(iv_loc,is,my_psi,my_ch) & +!$acc& present(is_v,cap_h_c,cap_h_ct,cap_h_c,jvec_c,field,z,temp,h_x) & +!$acc& present(nt1,nt2,nv1,nv2,nc) default(none) +#endif + do itor=nt1,nt2 + do iv=nv1,nv2 + do ic=1,nc + iv_loc = iv-nv1+1 + is = is_v(iv) + + ! Save collisional diss. + my_ch = cap_h_ct(iv_loc,itor,ic) + my_psi = sum(jvec_c(:,ic,iv_loc,itor)*field(:,ic,itor)) + + my_psi = my_ch-my_psi*(z(is)/temp(is)) + cap_h_ct(iv_loc,itor,ic) = (cap_h_c(ic,iv_loc,itor) + my_ch) / 2.0 + cap_h_ct(iv_loc,itor,ic) = conjg(cap_h_ct(iv_loc,itor,ic)) & + * ( my_psi - h_x(ic,iv_loc,itor) ) + + h_x(ic,iv_loc,itor) = my_psi + cap_h_c(ic,iv_loc,itor) = my_ch + enddo + enddo + enddo + + else #if defined(OMPGPU) !$omp target teams distribute parallel do simd collapse(3) & @@ -1071,6 +1103,8 @@ subroutine cgyro_step_collision_gpu(use_simple) enddo enddo + end if ! (triad_print_flag == 1 ) + call timer_lib_out('coll') end subroutine cgyro_step_collision_gpu