diff --git a/cgyro/src/cgyro_flux.f90 b/cgyro/src/cgyro_flux.f90 index 52599e119..b5c8be3df 100644 --- a/cgyro/src/cgyro_flux.f90 +++ b/cgyro/src/cgyro_flux.f90 @@ -44,6 +44,7 @@ subroutine cgyro_flux complex :: cprod real, parameter :: x_fraction=0.2 real :: u + real :: kx !$omp parallel do private(iv_loc,iv,is,ix,ie,dv,vpar,ic,ir,it,erot,cprod,cn) & !$omp& private(prod1,prod2,prod3,l,icl,dvr,u,flux_norm) & @@ -188,6 +189,34 @@ subroutine cgyro_flux cflux_loc(:,:,:,itor) = cflux_loc(:,:,:,itor)+2*sin(u)*real(gflux_loc(l,:,:,:,itor))/u enddo + + !------------------------------------------------------------- + ! 2-1. Compute Triad energy transfer + !------------------------------------------------------------- + + kx = 2*pi*rho/length + do is=1,n_species + ! Triad energy transfer : T_k + triad_loc_old(is,:,itor,1) = triad_loc(is,:,itor,1) *temp(is)/dlntdr(is_ele) + ! From Nonzonal Triad energy transfer : T_k [NZ(k',k")->k] + triad_loc_old(is,:,itor,2) = triad_loc(is,:,itor,2) *temp(is)/dlntdr(is_ele) + ! dEntropy / dt : dS_k/dt + triad_loc_old(is,:,itor,3) = (triad_loc(is,:,itor,3)-triad_loc_old(is,:,itor,3))/delta_t + triad_loc_old(is,:,itor,3) = triad_loc_old(is,:,itor,3) *temp(is)/dlntdr(is_ele)*0.5 + ! dWk_perp / dt , (ky = 0) + triad_loc_old(is,:,itor,4) = (triad_loc(is,:,itor,4)-triad_loc_old(is,:,itor,4))/delta_t + triad_loc_old(is,:,itor,4) = triad_loc_old(is,:,itor,4) *(kx*lambda_star)**2/dlntdr(is_ele)*0.5 + ! Entropy : S_k + triad_loc_old(is,:,itor,5) = triad_loc(is,:,itor,3) *temp(is)/dlntdr(is_ele)*0.5 + ! Diss. (radial) + triad_loc_old(is,:,itor,6) = triad_loc(is,:,itor,5) *temp(is)/dlntdr(is_ele) + ! Diss. (theta ) + triad_loc_old(is,:,itor,7) = triad_loc(is,:,itor,6) *temp(is)/dlntdr(is_ele) + ! Diss. (Coll. ) + triad_loc_old(is,:,itor,8) = triad_loc(is,:,itor,7) *temp(is)/dlntdr(is_ele) + enddo + + !----------------------------------------------------- ! 3. Renormalize fluxes to GB or quasilinear forms !~---------------------------------------------------- @@ -216,7 +245,7 @@ subroutine cgyro_flux ! GyroBohm normalizations gflux_loc(:,:,:,:,itor) = gflux_loc(:,:,:,:,itor)/rho**2 cflux_loc(:,:,:,itor) = cflux_loc(:,:,:,itor)/rho**2 - + triad_loc_old(:,:,itor,:) = triad_loc_old(:,:,itor,:)/rho**2 endif enddo @@ -240,6 +269,15 @@ subroutine cgyro_flux NEW_COMM_1, & i_err) + ! Reduced complex triad(ns,kx), below, is still distributed over n + call MPI_ALLREDUCE(triad_loc_old(:,:,:,:), & + triad, & + size(triad), & + MPI_DOUBLE_COMPLEX, & + MPI_SUM, & + NEW_COMM_1, & + i_err) + ! Reduced real cflux(ky), below, is still distributed over n call MPI_ALLREDUCE(cflux_loc, & diff --git a/cgyro/src/cgyro_globals.F90 b/cgyro/src/cgyro_globals.F90 index a990e88f1..8b6a1f893 100644 --- a/cgyro/src/cgyro_globals.F90 +++ b/cgyro/src/cgyro_globals.F90 @@ -80,6 +80,7 @@ module cgyro_globals integer :: moment_print_flag integer :: gflux_print_flag integer :: field_print_flag + integer :: triad_print_flag real :: amp0 real :: amp real :: gamma_e @@ -192,7 +193,7 @@ module cgyro_globals integer :: nsplit,nsplitA,nsplitB integer :: ns1,ns2 integer, dimension(:), allocatable :: recv_status - integer :: fA_req, fB_req, g_req + integer :: fA_req, fB_req, eA_req, eB_req, g_req logical :: fA_req_valid, fB_req_valid, g_req_valid ! Thetas present in the process after NL AllToAll integer :: n_jtheta @@ -243,6 +244,7 @@ module cgyro_globals character(len=12) :: binfile_hb = 'bin.cgyro.hb' character(len=17) :: binfile_ky_flux = 'bin.cgyro.ky_flux' character(len=18) :: binfile_ky_cflux = 'bin.cgyro.ky_cflux' + character(len=15) :: binfile_triad = 'bin.cgyro.triad' character(len=15), dimension(3) :: binfile_fieldb = & (/'bin.cgyro.phib ','bin.cgyro.aparb','bin.cgyro.bparb'/) character(len=16), dimension(3) :: binfile_kxky = & @@ -344,11 +346,14 @@ module cgyro_globals complex, dimension(:,:,:), allocatable :: h0_x complex, dimension(:,:,:), allocatable :: h0_old complex, dimension(:,:,:,:), allocatable :: fA_nl,fB_nl + complex, dimension(:,:,:,:), allocatable :: eA_nl,eB_nl complex, dimension(:,:,:,:), allocatable :: g_nl complex, dimension(:,:,:), allocatable :: fpackA,fpackB + complex, dimension(:,:,:), allocatable :: epackA,epackB complex, dimension(:,:,:,:), allocatable :: gpack complex, dimension(:,:,:), allocatable :: omega_cap_h complex, dimension(:,:,:), allocatable :: omega_h + complex, dimension(:,:,:), allocatable :: diss_r complex, dimension(:,:,:,:), allocatable :: omega_s,omega_ss complex, dimension(:,:,:), allocatable :: omega_sbeta complex, dimension(:,:,:), allocatable :: cap_h_c @@ -382,6 +387,7 @@ module cgyro_globals real, dimension(:,:,:,:), allocatable :: cflux complex, dimension(:,:,:,:,:), allocatable :: gflux_loc complex, dimension(:,:,:,:,:), allocatable :: gflux + complex, dimension(:,:,:,:), allocatable :: triad,triad_loc,triad_loc_old real, dimension(:,:), allocatable :: cflux_tave, gflux_tave real :: tave_min, tave_max integer :: tave_step diff --git a/cgyro/src/cgyro_init_arrays.F90 b/cgyro/src/cgyro_init_arrays.F90 index b5318f4f2..3908f6156 100644 --- a/cgyro/src/cgyro_init_arrays.F90 +++ b/cgyro/src/cgyro_init_arrays.F90 @@ -448,7 +448,14 @@ subroutine cgyro_init_arrays + abs(omega_cdrift_r(it,is)*xi(ix))*vel(ie) & + abs(omega_rot_drift_r(it,is)) & + abs(omega_rot_edrift_r(it))) - + + ! (d/dr) upwind diss. + diss_r(ic,iv_loc,itor) = - (n_radial/length)*spectraldiss(u,nup_radial)*up_radial & + * (abs(omega_rdrift(it,is))*energy(ie)*(1.0+xi(ix)**2) & + + abs(omega_cdrift_r(it,is)*xi(ix))*vel(ie) & + + abs(omega_rot_drift_r(it,is)) & + + abs(omega_rot_edrift_r(it))) + ! omega_star carg = & -i_c*k_theta_base*itor*rho*(dlnndr(is)+dlntdr(is)*(energy(ie)-1.5)) & diff --git a/cgyro/src/cgyro_init_manager.F90 b/cgyro/src/cgyro_init_manager.F90 index 2c8b1ba3b..dd14d6ab9 100644 --- a/cgyro/src/cgyro_init_manager.F90 +++ b/cgyro/src/cgyro_init_manager.F90 @@ -178,6 +178,11 @@ subroutine cgyro_init_manager allocate(cflux_loc(n_species,4,n_field,nt1:nt2)) allocate( gflux(0:n_global,n_species,4,n_field,nt1:nt2)) allocate(gflux_loc(0:n_global,n_species,4,n_field,nt1:nt2)) + + allocate( triad(n_species,n_radial,nt1:nt2,8)) + allocate(triad_loc(n_species,n_radial,nt1:nt2,7)) + allocate(triad_loc_old(n_species,n_radial,nt1:nt2,8)) + allocate(cflux_tave(n_species,4)) allocate(gflux_tave(n_species,4)) @@ -245,6 +250,7 @@ subroutine cgyro_init_manager allocate(cap_h_v(nc_loc,nt1:nt2,nv)) allocate(omega_cap_h(nc,nv_loc,nt1:nt2)) allocate(omega_h(nc,nv_loc,nt1:nt2)) + allocate(diss_r(nc,nv_loc,nt1:nt2)) allocate(omega_s(n_field,nc,nv_loc,nt1:nt2)) allocate(omega_ss(n_field,nc,nv_loc,nt1:nt2)) allocate(omega_sbeta(nc,nv_loc,nt1:nt2)) @@ -285,8 +291,10 @@ subroutine cgyro_init_manager ! Nonlinear arrays if (nonlinear_flag == 1) then allocate(fA_nl(n_radial,nt_loc,nsplitA,n_toroidal_procs)) + allocate(eA_nl(n_radial,nt_loc,nsplitA,n_toroidal_procs)) allocate(g_nl(n_field,n_radial,n_jtheta,n_toroidal)) allocate(fpackA(n_radial,nt_loc,nsplitA*n_toroidal_procs)) + allocate(epackA(n_radial,nt_loc,nsplitA*n_toroidal_procs)) allocate(gpack(n_field,n_radial,n_jtheta,n_toroidal)) allocate(jvec_c_nl(n_field,n_radial,n_jtheta,nv_loc,n_toroidal)) #if defined(OMPGPU) @@ -297,6 +305,8 @@ subroutine cgyro_init_manager if (nsplitB > 0) then ! nsplitB can be zero at large MPI allocate(fB_nl(n_radial,nt_loc,nsplitB,n_toroidal_procs)) allocate(fpackB(n_radial,nt_loc,nsplitB*n_toroidal_procs)) + allocate(eB_nl(n_radial,nt_loc,nsplitB,n_toroidal_procs)) + allocate(epackB(n_radial,nt_loc,nsplitB*n_toroidal_procs)) #if defined(OMPGPU) !$omp target enter data map(alloc:fpackB,fB_nl) #elif defined(_OPENACC) diff --git a/cgyro/src/cgyro_nl_comm.F90 b/cgyro/src/cgyro_nl_comm.F90 index 674d75e94..6392d334d 100644 --- a/cgyro/src/cgyro_nl_comm.F90 +++ b/cgyro/src/cgyro_nl_comm.F90 @@ -304,6 +304,316 @@ subroutine cgyro_nl_fftw_comm1_r(ij) end subroutine cgyro_nl_fftw_comm1_r +subroutine cgyro_nl_fftw_comm1_r_triad(ij) + use timer_lib + use parallel_lib + use cgyro_globals + + implicit none + + !----------------------------------- + integer, intent(in) :: ij + !----------------------------------- + + integer :: is,ix,ie + integer :: id,itd,itd_class,jr0(0:2),itorbox,jc + integer :: ir,it,iv_loc_m,ic_loc_m,itor + integer :: iexch0,itor0,isplit0,iexch_base + complex :: my_psi + real :: psi_mul + + real :: dv,dvr,dvp,rval,rval2 + complex :: cprod,cprod2,thfac + + triad_loc_old(:,:,:,3)=triad_loc(:,:,:,3) + triad_loc_old(:,:,:,4)=triad_loc(:,:,:,4) + triad_loc(:,:,:,:)=0.0 + + call timer_lib_in('nl_comm') + call parallel_slib_r_nc_wait(nsplitA,fA_nl,fpackA,fA_req) + call parallel_slib_r_nc_wait(nsplitA,eA_nl,epackA,eA_req) + fA_req_valid = .FALSE. + if (nsplitB > 0) then + ! no major compute to overlap + call parallel_slib_r_nc_wait(nsplitB,fB_nl,fpackB,fB_req) + call parallel_slib_r_nc_wait(nsplitB,eB_nl,epackB,eB_req) + fB_req_valid = .FALSE. + endif + call timer_lib_out('nl_comm') + + call timer_lib_in('nl') + + psi_mul = (q*rho/rmin)*(2*pi/length) + + if (nsplitB > 0) then + +#if defined(OMPGPU) +!$omp target teams distribute parallel do simd collapse(4) & +!$omp& private(iexch0,itor0,isplit0,iexch_base) & +!$omp& private(ic_loc_m,my_psi) +#elif defined(_OPENACC) +!$acc parallel loop collapse(4) gang vector independent private(ic_loc_m,my_psi) & +!$acc& private(iexch0,itor0,isplit0,iexch_base) & +!$acc& present(ic_c,px,rhs,fpackA,fpackB) copyin(psi_mul,zf_scale) & +!$acc& present(nt1,nt2,nv_loc,n_theta,n_radial,nsplit,nsplitA,nsplitB) copyin(ij) default(none) +#else +!$omp parallel do collapse(2) private(ic_loc_m,my_psi) & +!$omp& private(iexch0,itor0,isplit0,iexch_base,is,ix,ie,dv,dvr,dvp,rval,rval2,cprod,cprod2) & +!$omp& private(id,itorbox,jr0,jc,itd,itd_class,thfac) +#endif + do itor=nt1,nt2 + do iv_loc_m=1,nv_loc + do ir=1,n_radial + itorbox = itor*box_size*sign_qs + jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial) + jr0(1) = n_theta*(ir-1) + jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial) + + do it=1,n_theta + ic_loc_m = ic_c(ir,it) + + is = is_v(iv_loc_m +nv1 -1 ) + ix = ix_v(iv_loc_m +nv1 -1 ) + ie = ie_v(iv_loc_m +nv1 -1 ) + dv = w_exi(ie,ix) + dvr = w_theta(it)*dens2_rot(it,is)*dv + dvp = w_theta(it)*dv*(ir-1-nx0/2)**2 + + ! Density moment + cprod = w_theta(it)*cap_h_c(ic_loc_m,iv_loc_m,itor)*dvjvec_c(1,ic_loc_m,iv_loc_m,itor)/z(is) + cprod = -(dvr*z(is)/temp(is)*field(1,ic_loc_m,itor)-cprod) + cprod2= ( jvec_c(1,ic_loc_m,iv_loc_m,itor)*z(is)/temp(is) )*conjg(field(1,ic_loc_m,itor) ) + + iexch0 = (iv_loc_m-1) + (it-1)*nv_loc + itor0 = iexch0/nsplit + isplit0 = modulo(iexch0,nsplit) + if (isplit0 < nsplitA) then + iexch_base = 1+itor0*nsplitA + my_psi = fpackA(ir,itor-nt1+1,iexch_base+isplit0) + + ! 1. Triad energy transfer (all) + triad_loc(is,ir,itor,1) = triad_loc(is,ir,itor,1) & + + fpackA(ir,itor-nt1+1,iexch_base+isplit0)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul + ! 2. Triad energy transfer ( {NZF-NZF} coupling ) + if (itor == 0) then + ! New : Diag. direct ZF production [A. Ishizawa PRL 2019 ] + triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2) & + + epackA(ir,itor-nt1+1,iexch_base+isplit0)*cprod2*dvr*psi_mul + else + triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2) & + + epackA(ir,itor-nt1+1,iexch_base+isplit0)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul + endif + else + iexch_base = 1+itor0*nsplitB + my_psi = fpackB(ir,itor-nt1+1,iexch_base+(isplit0-nsplitA)) + + ! 1. Triad energy transfer (all) + triad_loc(is,ir,itor,1) = triad_loc(is,ir,itor,1) & + + fpackB(ir,itor-nt1+1,iexch_base+(isplit0-nsplitA))*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul + ! 2. Triad energy transfer ( {NZF-NZF} coupling ) + if (itor == 0) then + ! New : Diag. direct ZF production [A. Ishizawa PRL 2019 ] + triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2) & + + epackB(ir,itor-nt1+1,iexch_base+(isplit0-nsplitA))*cprod2*dvr*psi_mul + else + triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2) & + + epackB(ir,itor-nt1+1,iexch_base+(isplit0-nsplitA))*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul + endif + endif + + ! 3. Entropy + triad_loc(is,ir,itor,3) = triad_loc(is,ir,itor,3) & + + cap_h_c(ic_loc_m,iv_loc_m,itor)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr & + - field(1,ic_loc_m,itor)*conjg(field(1,ic_loc_m,itor))*(z(is)/temp(is))**2*dvr & + - 2.0*cprod*conjg(field(1,ic_loc_m,itor))*(z(is)/temp(is)) + ! 4. Field potential + triad_loc(is,ir,itor,4) = triad_loc(is,ir,itor,4) & + + sum( field(:,ic_loc_m,itor)*conjg(field(:,ic_loc_m,itor)) )*dvp + ! 5. Diss. (radial) + triad_loc(is,ir,itor,5) = triad_loc(is,ir,itor,5) & + + diss_r(ic_loc_m,iv_loc_m,itor)*h_x(ic_loc_m,iv_loc_m,itor)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr + ! 6. Diss. (theta ) + rval = omega_stream(it,is,itor)*vel(ie)*xi(ix) + rval2 = abs(omega_stream(it,is,itor)) + cprod = 0.0 + cprod2= 0.0 + + !icd_c(ic, id, itor) = ic_c(jr,modulo(it+id-1,n_theta)+1) + !jc = icd_c(ic, id, itor) + !dtheta(ic, id, itor) := cderiv(id)*thfac + !dtheta_up(ic, id, itor) := uderiv(id)*thfac*up_theta + itd = n_theta+it-nup_theta + itd_class = 0 + jc = jr0(itd_class)+itd + thfac = thfac_itor(itd_class,itor) + + do id=-nup_theta,nup_theta + if (itd > n_theta) then + ! move to next itd_class of compute + itd = itd - n_theta + itd_class = itd_class + 1 + jc = jr0(itd_class)+itd + thfac = thfac_itor(itd_class,itor) + endif + + cprod2 = cprod2 - rval* thfac*cderiv(id) *cap_h_c(jc,iv_loc_m,itor) + cprod = cprod - rval2* uderiv(id)*up_theta *g_x(jc,iv_loc_m,itor) + itd = itd + 1 + jc = jc + 1 + enddo + + triad_loc(is,ir,itor,6) = triad_loc(is,ir,itor,6) + cprod*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr + ! 7. Diss. (Coll. = Implicit advance - theta_streaming ) + triad_loc(is,ir,itor,7) = triad_loc(is,ir,itor,7) & + + ( cap_h_ct(iv_loc_m,itor,ic_loc_m)/delta_t + cprod2*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor)) )*dvr + + + if ( (itor == 0) .and. (ir == 1 .or. px(ir) == 0) ) then + ! filter + my_psi = (0.0,0.0) + endif + + if (itor == 0) then + my_psi = my_psi*zf_scale + endif + + ! RHS -> -[f,g] = [f,g]_{r,-alpha} + rhs(ic_loc_m,iv_loc_m,itor,ij) = rhs(ic_loc_m,iv_loc_m,itor,ij)+psi_mul*my_psi + enddo + enddo + enddo + enddo + + else ! nsplitB==0 + +#if defined(OMPGPU) +!$omp target teams distribute parallel do simd collapse(4) & +!$omp& private(iexch0,itor0,isplit0,iexch_base) & +!$omp& private(ic_loc_m,my_psi) +#elif defined(_OPENACC) +!$acc parallel loop collapse(4) gang vector independent private(ic_loc_m,my_psi) & +!$acc& private(iexch0,itor0,isplit0,iexch_base) & +!$acc& present(ic_c,px,rhs,fpackA) copyin(psi_mul,zf_scale) & +!$acc& present(nt1,nt2,nv_loc,n_theta,n_radial,nsplit,nsplitA) copyin(ij) default(none) +#else +!$omp parallel do collapse(2) private(ic_loc_m,my_psi) & +!$omp& private(iexch0,itor0,isplit0,iexch_base,is,ix,ie,dv,dvr,dvp,rval,rval2,cprod,cprod2) & +!$omp& private(id,itorbox,jr0,jc,itd,itd_class,thfac) +#endif + do itor=nt1,nt2 + do iv_loc_m=1,nv_loc + do ir=1,n_radial + itorbox = itor*box_size*sign_qs + jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial) + jr0(1) = n_theta*(ir-1) + jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial) + + do it=1,n_theta + ic_loc_m = ic_c(ir,it) + + is = is_v(iv_loc_m +nv1 -1 ) + ix = ix_v(iv_loc_m +nv1 -1 ) + ie = ie_v(iv_loc_m +nv1 -1 ) + dv = w_exi(ie,ix) + dvr = w_theta(it)*dens2_rot(it,is)*dv + dvp = w_theta(it)*dv*(ir-1-nx0/2)**2 + + ! Density moment + cprod = w_theta(it)*cap_h_c(ic_loc_m,iv_loc_m,itor)*dvjvec_c(1,ic_loc_m,iv_loc_m,itor)/z(is) + cprod = -(dvr*z(is)/temp(is)*field(1,ic_loc_m,itor)-cprod) + cprod2= ( jvec_c(1,ic_loc_m,iv_loc_m,itor)*z(is)/temp(is) )*conjg(field(1,ic_loc_m,itor) ) + + + iexch0 = (iv_loc_m-1) + (it-1)*nv_loc + itor0 = iexch0/nsplit + isplit0 = modulo(iexch0,nsplit) + iexch_base = 1+itor0*nsplitA + my_psi = fpackA(ir,itor-nt1+1,iexch_base+isplit0) + + + ! 1. Triad energy transfer (all) + triad_loc(is,ir,itor,1) = triad_loc(is,ir,itor,1) & + + fpackA(ir,itor-nt1+1,iexch_base+isplit0)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul + ! 2. Triad energy transfer ( {NZF-NZF} coupling ) + if (itor == 0) then + ! New : Diag. direct ZF production [A. Ishizawa PRL 2019 ] + triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2) & + + epackA(ir,itor-nt1+1,iexch_base+isplit0)*cprod2*dvr*psi_mul + else + triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2) & + + epackA(ir,itor-nt1+1,iexch_base+isplit0)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul + endif + + ! 3. Entropy + triad_loc(is,ir,itor,3) = triad_loc(is,ir,itor,3) & + + cap_h_c(ic_loc_m,iv_loc_m,itor)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr & + - field(1,ic_loc_m,itor)*conjg(field(1,ic_loc_m,itor))*(z(is)/temp(is))**2*dvr & + - 2.0*cprod*conjg(field(1,ic_loc_m,itor))*(z(is)/temp(is)) + ! 4. Field potential + triad_loc(is,ir,itor,4) = triad_loc(is,ir,itor,4) & + + sum( field(:,ic_loc_m,itor)*conjg(field(:,ic_loc_m,itor)) )*dvp + ! 5. Diss. (radial) + triad_loc(is,ir,itor,5) = triad_loc(is,ir,itor,5) & + + diss_r(ic_loc_m,iv_loc_m,itor)*h_x(ic_loc_m,iv_loc_m,itor)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr + ! 6. Diss. (theta ) + rval = omega_stream(it,is,itor)*vel(ie)*xi(ix) + rval2 = abs(omega_stream(it,is,itor)) + cprod = 0.0 + cprod2= 0.0 + + !icd_c(ic, id, itor) = ic_c(jr,modulo(it+id-1,n_theta)+1) + !jc = icd_c(ic, id, itor) + !dtheta(ic, id, itor) := cderiv(id)*thfac + !dtheta_up(ic, id, itor) := uderiv(id)*thfac*up_theta + itd = n_theta+it-nup_theta + itd_class = 0 + jc = jr0(itd_class)+itd + thfac = thfac_itor(itd_class,itor) + + do id=-nup_theta,nup_theta + if (itd > n_theta) then + ! move to next itd_class of compute + itd = itd - n_theta + itd_class = itd_class + 1 + jc = jr0(itd_class)+itd + thfac = thfac_itor(itd_class,itor) + endif + + cprod2 = cprod2 - rval* thfac*cderiv(id) *cap_h_c(jc,iv_loc_m,itor) + cprod = cprod - rval2* uderiv(id)*up_theta *g_x(jc,iv_loc_m,itor) + itd = itd + 1 + jc = jc + 1 + enddo + + triad_loc(is,ir,itor,6) = triad_loc(is,ir,itor,6) + cprod*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr + ! 7. Diss. (Coll. = Implicit advance - theta_streaming ) + triad_loc(is,ir,itor,7) = triad_loc(is,ir,itor,7) & + + ( cap_h_ct(iv_loc_m,itor,ic_loc_m)/delta_t + cprod2*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor)) )*dvr + + + if ( (itor == 0) .and. (ir == 1 .or. px(ir) == 0) ) then + ! filter + my_psi = (0.0,0.0) + endif + + if (itor == 0) then + my_psi = my_psi*zf_scale + endif + + ! RHS -> -[f,g] = [f,g]_{r,-alpha} + rhs(ic_loc_m,iv_loc_m,itor,ij) = rhs(ic_loc_m,iv_loc_m,itor,ij)+psi_mul*my_psi + enddo + enddo + enddo + enddo + + endif ! if nsplitB>0 + + call timer_lib_out('nl') + +end subroutine cgyro_nl_fftw_comm1_r_triad + + ! ! Comm2 is a transpose ! Reminder: nc ~= n_radial*n_theta diff --git a/cgyro/src/cgyro_nl_fftw.f90 b/cgyro/src/cgyro_nl_fftw.f90 index 0fbe6a200..d56143bbf 100644 --- a/cgyro/src/cgyro_nl_fftw.f90 +++ b/cgyro/src/cgyro_nl_fftw.f90 @@ -64,6 +64,102 @@ subroutine cgyro_nl_fftw_stepr(g_j, f_j, nl_idx, i_omp) end subroutine cgyro_nl_fftw_stepr +subroutine cgyro_nl_fftw_stepr_triad(g_j, f_j, nl_idx, i_omp) + + use timer_lib + use parallel_lib + use cgyro_globals + + implicit none + + integer, intent(in) :: g_j, f_j + integer,intent(in) :: nl_idx ! 1=>A, 2=>B + integer,intent(in) :: i_omp + real :: y_mean(nx) + integer :: ix,iy + integer :: ir,itm,itl,itor + + include 'fftw3.f03' + + ! 1. Poisson bracket in real space + uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny) + + call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp)) + + ! NOTE: The FFT will generate an unwanted n=0,p=-nr/2 component + ! that will be filtered in the main time-stepping loop + + ! this should really be accounted against nl_mem, but hard to do with OMP + do itm=1,n_toroidal_procs + do itl=1,nt_loc + itor=itl + (itm-1)*nt_loc + do ir=1,n_radial + ix = ir-1-nx0/2 + if (ix < 0) ix = ix+nx + iy = itor-1 + if (nl_idx==1) then + fA_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp) + else + fB_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp) + endif + enddo + enddo + enddo + + + ! 2. Poisson bracket in real space with Non_Zonal pairs + + ! remove ky=0 in uxmany + y_mean = sum(uxmany(:,:,f_j) ,dim=1 ) / ny + do iy=0,ny-1 + uxmany(iy,:,f_j) = uxmany(iy,:,f_j) -y_mean + + end do + + ! remove ky=0 in uymany + y_mean = sum(uymany(:,:,f_j) ,dim=1 ) / ny ! =0 + do iy=0,ny-1 + uymany(iy,:,f_j) = uymany(iy,:,f_j) -y_mean + + end do + + ! remove ky=0 in vx + y_mean = sum(vxmany(:,:,g_j) ,dim=1 ) / ny + do iy=0,ny-1 + vxmany(iy,:,g_j) = vxmany(iy,:,g_j) -y_mean + + end do + + ! remove ky=0 in vy + y_mean = sum(vymany(:,:,g_j) ,dim=1 ) / ny ! =0 + do iy=0,ny-1 + vymany(iy,:,g_j) = vymany(iy,:,g_j) -y_mean + + end do + + uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny) + + call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp)) + + do itm=1,n_toroidal_procs + do itl=1,nt_loc + itor=itl + (itm-1)*nt_loc + do ir=1,n_radial + ix = ir-1-nx0/2 + if (ix < 0) ix = ix+nx + iy = itor-1 + if (nl_idx==1) then + eA_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp) + else + eB_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp) + endif + enddo + enddo + enddo + +end subroutine cgyro_nl_fftw_stepr_triad + + ! NOTE: call cgyro_nl_fftw_comm1 before cgyro_nl_fftw subroutine cgyro_nl_fftw(ij) @@ -326,3 +422,268 @@ subroutine cgyro_nl_fftw(ij) endif ! if nsplitB>0 end subroutine cgyro_nl_fftw + + +subroutine cgyro_nl_fftw_triad(ij) + + use timer_lib + use parallel_lib + use cgyro_nl_comm + use cgyro_globals + + implicit none + + !----------------------------------- + integer, intent(in) :: ij + !----------------------------------- + integer :: ix,iy + integer :: ir,it,itm,itl,it_loc + integer :: itor,mytm + integer :: j,p + integer :: i_omp + integer :: jtheta_min + + complex :: f0,g0 + + integer, external :: omp_get_thread_num + + include 'fftw3.f03' + + call cgyro_nl_fftw_comm_test() + + ! time to wait for the FA_nl to become avaialble + call timer_lib_in('nl_comm') + call parallel_slib_f_nc_wait(nsplitA,fpackA,fA_nl,fA_req) + fA_req_valid = .FALSE. + ! make sure reqs progress + call cgyro_nl_fftw_comm_test() + call timer_lib_out('nl_comm') + + call timer_lib_in('nl') + +! f_nl is (radial, nt_loc, theta, nv_loc1, toroidal_procs) +! where nv_loc1 * toroidal_procs >= nv_loc +!$omp parallel do schedule(dynamic,1) private(itm,itl,itor,iy,ir,p,ix,f0,i_omp,j) + do j=1,nsplitA + i_omp = omp_get_thread_num()+1 + + ! zero elements not otherwise set below + fx(0:ny2,nx2:nx0-1,i_omp) = 0.0 + fy(0:ny2,nx2:nx0-1,i_omp) = 0.0 + + ! Array mapping + do ir=1,n_radial + p = ir-1-nx0/2 + ix = p + if (ix < 0) ix = ix+nx + do itm=1,n_toroidal_procs + do itl=1,nt_loc + itor=itl + (itm-1)*nt_loc + iy = itor-1 + f0 = i_c*fA_nl(ir,itl,j,itm) + fx(iy,ix,i_omp) = p*f0 + fy(iy,ix,i_omp) = iy*f0 + enddo + enddo + if ((ix/=0) .and. (ix<(nx/2))) then ! happens after ix>nx/2 + ! Average elements so as to ensure + ! f(kx,ky=0) = f(-kx,ky=0)^* + ! This symmetry is required for complex input to c2r + f0 = 0.5*( fx(0,ix,i_omp)+conjg(fx(0,nx-ix,i_omp)) ) + fx(0,ix ,i_omp) = f0 + fx(0,nx-ix,i_omp) = conjg(f0) + endif + fx(n_toroidal:ny2,ix,i_omp) = 0.0 + fy(n_toroidal:ny2,ix,i_omp) = 0.0 + enddo + + if (i_omp==1) then + ! use the main thread to progress the async MPI + call cgyro_nl_fftw_comm_test() + endif + + call fftw_execute_dft_c2r(plan_c2r,fx(:,:,i_omp),uxmany(:,:,j)) + if (i_omp==1) then + ! use the main thread to progress the async MPI + call cgyro_nl_fftw_comm_test() + endif + + call fftw_execute_dft_c2r(plan_c2r,fy(:,:,i_omp),uymany(:,:,j)) + enddo ! j + + call timer_lib_out('nl') + + call timer_lib_in('nl_comm') + ! time to wait for the g_nl to become avaialble + call parallel_slib_f_fd_wait(n_field,n_radial,n_jtheta,gpack,g_nl,g_req) + g_req_valid = .FALSE. + ! make sure reqs progress + call cgyro_nl_fftw_comm_test() + call timer_lib_out('nl_comm') + + call timer_lib_in('nl') + +! g_nl is (n_field,n_radial,n_jtheta,nt_loc,n_toroidal_procs) +! jcev_c_nl is (n_field,n_radial,n_jtheta,nv_loc,nt_loc,n_toroidal_procs) +!$omp parallel do schedule(dynamic,1) & +!$omp& private(itor,mytm,itm,itl,iy,ir,p,ix,g0,i_omp,j,it,iv_loc,it_loc,jtheta_min) + do j=1,nsplit + i_omp = omp_get_thread_num()+1 + + ! zero elements not otherwise set below + gx(0:ny2,nx2:nx0-1,i_omp) = 0.0 + gy(0:ny2,nx2:nx0-1,i_omp) = 0.0 + + ! Array mapping + do ir=1,n_radial + p = ir-1-nx0/2 + ix = p + if (ix < 0) ix = ix+nx + do itm=1,n_toroidal_procs + do itl=1,nt_loc + itor = itl + (itm-1)*nt_loc + mytm = 1 + nt1/nt_loc !my toroidal proc number + it = 1+((mytm-1)*nsplit+j-1)/nv_loc + iv_loc = 1+modulo((mytm-1)*nsplit+j-1,nv_loc) + jtheta_min = 1+((mytm-1)*nsplit)/nv_loc + it_loc = it-jtheta_min+1 + + iy = itor-1 + if (it > n_theta) then + g0 = (0.0,0.0) + else + g0 = i_c*sum( jvec_c_nl(:,ir,it_loc,iv_loc,itor)*g_nl(:,ir,it_loc,itor)) + endif + gx(iy,ix,i_omp) = p*g0 + gy(iy,ix,i_omp) = iy*g0 + enddo + enddo + if ((ix/=0) .and. (ix<(nx/2))) then ! happens after ix>nx/2 + ! Average elements so as to ensure + ! g(kx,ky=0) = g(-kx,ky=0)^* + ! This symmetry is required for complex input to c2r + g0 = 0.5*( gx(0,ix,i_omp)+conjg(gx(0,nx-ix,i_omp)) ) + gx(0,ix ,i_omp) = g0 + gx(0,nx-ix,i_omp) = conjg(g0) + endif + gx(n_toroidal:ny2,ix,i_omp) = 0.0 + gy(n_toroidal:ny2,ix,i_omp) = 0.0 + enddo + + if (i_omp==1) then + ! use the main thread to progress the async MPI + call cgyro_nl_fftw_comm_test() + endif + + call fftw_execute_dft_c2r(plan_c2r,gx(:,:,i_omp),vxmany(:,:,j)) + + if (i_omp==1) then + ! use the main thread to progress the async MPI + call cgyro_nl_fftw_comm_test() + endif + + call fftw_execute_dft_c2r(plan_c2r,gy(:,:,i_omp),vymany(:,:,j)) + + if (i_omp==1) then + ! use the main thread to progress the async MPI + call cgyro_nl_fftw_comm_test() + endif + + if (j<=nsplitA) then + call cgyro_nl_fftw_stepr_triad(j, j, 1, i_omp) + endif + ! else we will do it in the next loop + enddo ! j + + call timer_lib_out('nl') + + call timer_lib_in('nl_comm') + ! start the async reverse comm + ! can reuse the same req, no overlap with forward fA_req + call parallel_slib_r_nc_async(nsplitA,fA_nl,fpackA,fA_req) + call parallel_slib_r_nc_async(nsplitA,eA_nl,epackA,eA_req) + fA_req_valid = .TRUE. + + if (nsplitB > 0) then + ! time to wait for the 2nd half of F_nl to become avaialble + call parallel_slib_f_nc_wait(nsplitB,fpackB,fB_nl,fB_req) + fB_req_valid = .FALSE. + endif + + ! make sure reqs progress + call cgyro_nl_fftw_comm_test() + call timer_lib_out('nl_comm') + + if (nsplitB > 0) then + call timer_lib_in('nl') + +! f_nl is (radial, nt_loc, theta, nv_loc1, toroidal_procs) +! where nv_loc1 * toroidal_procs >= nv_loc +!$omp parallel do schedule(dynamic,1) private(itm,itl,itor,iy,ir,p,ix,f0,i_omp,j) + do j=1,nsplitB + i_omp = omp_get_thread_num()+1 + + ! zero elements not otherwise set below + fx(0:ny2,nx2:nx0-1,i_omp) = 0.0 + fy(0:ny2,nx2:nx0-1,i_omp) = 0.0 + + ! Array mapping + do ir=1,n_radial + p = ir-1-nx0/2 + ix = p + if (ix < 0) ix = ix+nx + do itm=1,n_toroidal_procs + do itl=1,nt_loc + itor=itl + (itm-1)*nt_loc + iy = itor-1 + f0 = i_c*fB_nl(ir,itl,j,itm) + fx(iy,ix,i_omp) = p*f0 + fy(iy,ix,i_omp) = iy*f0 + enddo + enddo + if ((ix/=0) .and. (ix<(nx/2))) then ! happens after ix>nx/2 + ! Average elements so as to ensure + ! f(kx,ky=0) = f(-kx,ky=0)^* + ! This symmetry is required for complex input to c2r + f0 = 0.5*( fx(0,ix,i_omp)+conjg(fx(0,nx-ix,i_omp)) ) + fx(0,ix ,i_omp) = f0 + fx(0,nx-ix,i_omp) = conjg(f0) + endif + fx(n_toroidal:ny2,ix,i_omp) = 0.0 + fy(n_toroidal:ny2,ix,i_omp) = 0.0 + enddo + + if (i_omp==1) then + ! use the main thread to progress the async MPI + call cgyro_nl_fftw_comm_test() + endif + + call fftw_execute_dft_c2r(plan_c2r,fx(:,:,i_omp),uxmany(:,:,j)) + if (i_omp==1) then + ! use the main thread to progress the async MPI + call cgyro_nl_fftw_comm_test() + endif + + call fftw_execute_dft_c2r(plan_c2r,fy(:,:,i_omp),uymany(:,:,j)) + if (i_omp==1) then + ! use the main thread to progress the async MPI + call cgyro_nl_fftw_comm_test() + endif + + call cgyro_nl_fftw_stepr_triad(nsplitA+j, j, 2, i_omp) + enddo ! j + + call timer_lib_out('nl') + + call timer_lib_in('nl_comm') + ! start the async reverse comm + ! can reuse the same req, no overlap with forward fB_req + call parallel_slib_r_nc_async(nsplitB,fB_nl,fpackB,fB_req) + call parallel_slib_r_nc_async(nsplitB,eB_nl,epackB,eB_req) + fB_req_valid = .TRUE. + ! make sure reqs progress + call cgyro_nl_fftw_comm_test() + call timer_lib_out('nl_comm') + endif ! if nsplitB>0 + +end subroutine cgyro_nl_fftw_triad diff --git a/cgyro/src/cgyro_read_input.f90 b/cgyro/src/cgyro_read_input.f90 index be77de7dd..bfbe4bf71 100644 --- a/cgyro/src/cgyro_read_input.f90 +++ b/cgyro/src/cgyro_read_input.f90 @@ -70,6 +70,7 @@ subroutine cgyro_read_input call cgyro_readbc_int(moment_print_flag,'MOMENT_PRINT_FLAG') call cgyro_readbc_int(gflux_print_flag,'GFLUX_PRINT_FLAG') call cgyro_readbc_int(field_print_flag,'FIELD_PRINT_FLAG') + call cgyro_readbc_int(triad_print_flag,'TRIAD_PRINT_FLAG') call cgyro_readbc_real(amp0) call cgyro_readbc_real(amp) call cgyro_readbc_real(gamma_e) diff --git a/cgyro/src/cgyro_rhs.F90 b/cgyro/src/cgyro_rhs.F90 index 2f762bd16..b12c6cca5 100644 --- a/cgyro/src/cgyro_rhs.F90 +++ b/cgyro/src/cgyro_rhs.F90 @@ -35,7 +35,11 @@ subroutine cgyro_rhs_r_comm_async(ij) integer, intent(in) :: ij if (nonlinear_flag == 1) then - call cgyro_nl_fftw_comm1_r(ij) + if (triad_print_flag == 1 .and. ij == 4) then + call cgyro_nl_fftw_comm1_r_triad(ij) + else + call cgyro_nl_fftw_comm1_r(ij) + endif endif end subroutine cgyro_rhs_r_comm_async @@ -410,7 +414,11 @@ subroutine cgyro_rhs(ij,update_cap) if (nonlinear_flag == 1) then ! assumes someone already started the input comm ! and will finish the output comm - call cgyro_nl_fftw() + if (triad_print_flag == 1 .and. ij == 4) then + call cgyro_nl_fftw_triad() + else + call cgyro_nl_fftw() + endif endif call cgyro_upwind_complete diff --git a/cgyro/src/cgyro_step_collision.F90 b/cgyro/src/cgyro_step_collision.F90 index f422e88a4..228360d2f 100644 --- a/cgyro/src/cgyro_step_collision.F90 +++ b/cgyro/src/cgyro_step_collision.F90 @@ -334,6 +334,7 @@ subroutine cgyro_step_collision_cpu(use_simple) ! Compute H given h and [phi(h), apar(h)] + if (triad_print_flag == 1 ) then !$omp parallel do collapse(2) & !$omp& private(iv_loc,is,ic,iv,my_psi,my_ch) firstprivate(nc) do itor=nt1,nt2 @@ -341,14 +342,44 @@ subroutine cgyro_step_collision_cpu(use_simple) iv_loc = iv-nv1+1 is = is_v(iv) do ic=1,nc + + ! Save collisional diss. + my_ch = cap_h_ct(iv_loc,itor,ic) + my_psi = sum(jvec_c(:,ic,iv_loc,itor)*field(:,ic,itor)) + + my_psi = my_ch-my_psi*(z(is)/temp(is)) + cap_h_ct(iv_loc,itor,ic) = (cap_h_c(ic,iv_loc,itor) + my_ch) / 2.0 + cap_h_ct(iv_loc,itor,ic) = conjg(cap_h_ct(iv_loc,itor,ic)) & + * ( my_psi - h_x(ic,iv_loc,itor) ) + + h_x(ic,iv_loc,itor) = my_psi + cap_h_c(ic,iv_loc,itor) = my_ch + + enddo + enddo + enddo + + else + +!$omp parallel do collapse(2) & +!$omp& private(iv_loc,is,ic,iv,my_psi,my_ch) firstprivate(nc) + do itor=nt1,nt2 + do iv=nv1,nv2 + iv_loc = iv-nv1+1 + is = is_v(iv) + do ic=1,nc + my_ch = cap_h_ct(iv_loc,itor,ic) my_psi = sum(jvec_c(:,ic,iv_loc,itor)*field(:,ic,itor)) h_x(ic,iv_loc,itor) = my_ch-my_psi*(z(is)/temp(is)) cap_h_c(ic,iv_loc,itor) = my_ch + enddo enddo enddo + end if ! (triad_print_flag == 1 ) + call timer_lib_out('coll') end subroutine cgyro_step_collision_cpu diff --git a/cgyro/src/cgyro_write_timedata.f90 b/cgyro/src/cgyro_write_timedata.f90 index 3d6b808a1..36864a9ab 100644 --- a/cgyro/src/cgyro_write_timedata.f90 +++ b/cgyro/src/cgyro_write_timedata.f90 @@ -70,6 +70,14 @@ subroutine cgyro_write_timedata enddo endif + if (nonlinear_flag == 1 .and. triad_print_flag == 1) then + ! Triad energy transfer for all kxky + call cgyro_write_distributed_bcomplex(& + trim(path)//binfile_triad,& + size(triad(:,:,:,:)),& + triad(:,:,:,:)) + endif + if (field_print_flag == 1) then p_field = n_field else