diff --git a/cgyro/src/Makefile b/cgyro/src/Makefile index ad5293929..b7af86877 100644 --- a/cgyro/src/Makefile +++ b/cgyro/src/Makefile @@ -59,6 +59,7 @@ OBJECTS = cgyro_globals.o \ cgyro_step_gk_ck.o \ cgyro_step_gk_bs5.o \ cgyro_step_gk_v76.o \ + cgyro_stress.o \ cgyro_write_initdata.o \ cgyro_write_timedata.o \ cgyro_write_hosts.o \ diff --git a/cgyro/src/cgyro_cleanup.F90 b/cgyro/src/cgyro_cleanup.F90 index d868bd20a..91211e7f0 100644 --- a/cgyro/src/cgyro_cleanup.F90 +++ b/cgyro/src/cgyro_cleanup.F90 @@ -122,9 +122,16 @@ subroutine cgyro_cleanup if(allocated(cflux_loc)) deallocate(cflux_loc) if(allocated(gflux)) deallocate(gflux) if(allocated(gflux_loc)) deallocate(gflux_loc) + if(allocated(stress_integrated)) deallocate(stress_integrated) + if(allocated(stress_integrated_loc)) deallocate(stress_integrated_loc) if(allocated(cflux_tave)) deallocate(cflux_tave) if(allocated(gflux_tave)) deallocate(gflux_tave) if(allocated(recv_status)) deallocate(recv_status) + if(allocated(stress)) then + ccl_del_device(stress) + deallocate(stress) + endif + if(allocated(icd_c)) then ccl_del_device(icd_c) deallocate(icd_c) diff --git a/cgyro/src/cgyro_flux.f90 b/cgyro/src/cgyro_flux.f90 index 5d61efae2..1bac8f546 100644 --- a/cgyro/src/cgyro_flux.f90 +++ b/cgyro/src/cgyro_flux.f90 @@ -30,7 +30,6 @@ subroutine cgyro_flux use mpi use cgyro_globals - use cgyro_nl_comm implicit none @@ -48,7 +47,7 @@ subroutine cgyro_flux !$omp parallel do private(iv_loc,iv,is,ix,ie,dv,vpar,ic,ir,it,erot,cprod,cn) & !$omp& private(prod1,prod2,prod3,l,icl,dvr,u,flux_norm) & -!$omp& shared(moment_loc,gflux_loc,cflux_loc) +!$omp& shared(moment_loc,gflux_loc,cflux_loc,stress_integrated_loc) do itor=nt1,nt2 !----------------------------------------------------- @@ -219,20 +218,8 @@ subroutine cgyro_flux cflux_loc(:,:,:,itor) = cflux_loc(:,:,:,itor)/rho**2 if (stress_print_flag .eq. 1) then - stress(:, :, :, :) = 0.0 - if (i_time .ne. 0) then - - ! Set up stress for each field - do i_field=1, n_field - call cgyro_nl_fftw_comm1_async - call cgyro_nl_fftw_comm2_async_stress(i_field) - call cgyro_nl_fftw - call cgyro_nl_fftw_comm1_r_stress(i_field) - end do - - end if - ! stress(kx-theta, vel_coord, ky, phi) - stress_integrated_loc = 0.0 + stress_integrated_loc(:, :, :, itor, :) = 0.0 + ! stress(kx-theta, vel_coord, ky, phi) iv_loc = 0 do iv=nv1,nv2 @@ -248,9 +235,9 @@ subroutine cgyro_flux ir = ir_c(ic) it = it_c(ic) stress_integrated_loc(ir, it, is, itor, :) = stress_integrated_loc(ir, it, is, itor, :) + stress(ic, iv_loc, itor, :) * dv - end do + enddo enddo - end if + endif endif enddo diff --git a/cgyro/src/cgyro_init_manager.F90 b/cgyro/src/cgyro_init_manager.F90 index d42c0dc9a..1edca6422 100644 --- a/cgyro/src/cgyro_init_manager.F90 +++ b/cgyro/src/cgyro_init_manager.F90 @@ -312,9 +312,9 @@ subroutine cgyro_init_manager allocate(stress_integrated_loc(n_radial,n_theta,n_species,nt1:nt2,n_field)) allocate(stress_integrated(n_radial,n_theta,n_species,nt1:nt2,n_field)) #if defined(OMPGPU) -!$omp target enter data map(alloc:stress,stress_integrated,stress_integrated_loc) +!$omp target enter data map(alloc:stress) #elif defined(_OPENACC) -!$acc enter data create(stress,stress_integrated,stress_integrated_loc) +!$acc enter data create(stress) #endif end if diff --git a/cgyro/src/cgyro_kernel.F90 b/cgyro/src/cgyro_kernel.F90 index 95c08273f..84fdbf963 100644 --- a/cgyro/src/cgyro_kernel.F90 +++ b/cgyro/src/cgyro_kernel.F90 @@ -19,9 +19,11 @@ subroutine cgyro_kernel use cgyro_io use cgyro_restart + implicit none integer, parameter :: analysis = 0 + integer :: i_field if (test_flag == 1) return @@ -139,6 +141,8 @@ subroutine cgyro_kernel call timer_lib_in('io') + if (stress_print_flag .eq. 1) call cgyro_stress + ! Write simulation data call cgyro_write_timedata diff --git a/cgyro/src/cgyro_nl_comm.F90 b/cgyro/src/cgyro_nl_comm.F90 index da2126a2c..ed4801736 100644 --- a/cgyro/src/cgyro_nl_comm.F90 +++ b/cgyro/src/cgyro_nl_comm.F90 @@ -399,14 +399,15 @@ subroutine cgyro_nl_fftw_comm2_async_stress(itf) call timer_lib_in('nl_mem') #if defined(OMPGPU) -!$omp target teams distribute parallel do simd collapse(5) & +!$omp target teams distribute parallel do simd collapse(4) & !$omp& private(itor,it,iltheta_min,mytor,gval) #elif defined(_OPENACC) -!$acc parallel loop gang vector collapse(5) independent & +!$acc parallel loop gang vector collapse(4) independent & !$acc& private(itor,it,iltheta_min,mytor,gval) & !$acc& present(field,gpack) & !$acc& present(n_toroidal_procs,nt_loc,n_jtheta,nv_loc,nt1) & !$acc& present(n_theta,n_radial,n_field,nsplit) & +!$acc& copyin(itf) & !$acc& default(none) #else !$omp parallel do collapse(3) & @@ -443,9 +444,6 @@ subroutine cgyro_nl_fftw_comm2_async_stress(itf) end subroutine cgyro_nl_fftw_comm2_async_stress -end module cgyro_nl_comm - - subroutine cgyro_nl_fftw_comm1_r_stress(itf) use timer_lib use parallel_lib @@ -485,7 +483,7 @@ subroutine cgyro_nl_fftw_comm1_r_stress(itf) #elif defined(_OPENACC) !$acc parallel loop collapse(4) gang vector independent private(ic_loc_m,my_psi) & !$acc& private(iexch0,itor0,isplit0,iexch_base) & -!$acc& present(ic_c,px,rhs,fpackA,fpackB) copyin(psi_mul,zf_scale) & +!$acc& present(ic_c,px,stress,fpackA,fpackB) copyin(psi_mul,zf_scale) & !$acc& present(nt1,nt2,nv_loc,n_theta,n_radial,nsplit,nsplitA,nsplitB) copyin(itf) default(none) #else !$omp parallel do collapse(2) private(ic_loc_m,my_psi) & @@ -516,10 +514,9 @@ subroutine cgyro_nl_fftw_comm1_r_stress(itf) endif stress(ic_loc_m,iv_loc_m,itor,itf) = stress(ic_loc_m,iv_loc_m,itor,itf)+psi_mul*my_psi enddo + enddo enddo - enddo - enddo - + enddo else ! nsplitB==0 #if defined(OMPGPU) @@ -529,8 +526,8 @@ subroutine cgyro_nl_fftw_comm1_r_stress(itf) #elif defined(_OPENACC) !$acc parallel loop collapse(4) gang vector independent private(ic_loc_m,my_psi) & !$acc& private(iexch0,itor0,isplit0,iexch_base) & -!$acc& present(ic_c,px,rhs,fpackA) copyin(psi_mul,zf_scale) & -!$acc& present(nt1,nt2,nv_loc,n_theta,n_radial,nsplit,nsplitA) copyin(ij) default(none) +!$acc& present(ic_c,px,stress,fpackA) copyin(psi_mul,zf_scale) & +!$acc& present(nt1,nt2,nv_loc,n_theta,n_radial,nsplit,nsplitA) copyin(itf) default(none) #else !$omp parallel do collapse(2) private(ic_loc_m,my_psi) & !$omp& private(iexch0,itor0,isplit0,iexch_base) @@ -565,3 +562,6 @@ subroutine cgyro_nl_fftw_comm1_r_stress(itf) end subroutine cgyro_nl_fftw_comm1_r_stress + +end module cgyro_nl_comm + diff --git a/cgyro/src/cgyro_stress.F90 b/cgyro/src/cgyro_stress.F90 new file mode 100644 index 000000000..4bf5058dd --- /dev/null +++ b/cgyro/src/cgyro_stress.F90 @@ -0,0 +1,55 @@ +!--------------------------------------------------------------------------- +! cgyro_stress.f90 +! +! PURPOSE: +! Compute Reynolds/Maxwell stresses from the fields as a +! function of (kx,theta,species,ky,field). +! +! NOTES: +! +! The stress terms are just the nonlinear term evaluated for each +! field individually +! +! To trigger output of the stresses +! set STRESS_PRINT_FLAG=1 + +!--------------------------------------------------------------------------- + +subroutine cgyro_stress + + use timer_lib + use mpi + use cgyro_globals + use cgyro_nl_comm + + implicit none + + integer :: i_field + + ! Set up stress for each field + ! Not sure if this can be with async given the NL calc set up + do i_field=1, n_field + ! Set up first half h_x + call cgyro_nl_fftw_comm1_async + + ! Set up fields + call cgyro_nl_fftw_comm2_async_stress(i_field) + + ! Set up second half h_x (if split) + call cgyro_nl_fftw_comm3_async + + ! Calculate nonlinear term + call cgyro_nl_fftw + call cgyro_nl_fftw_comm_test + + ! Set stress array from nonlinear output + call cgyro_nl_fftw_comm1_r_stress(i_field) + call cgyro_nl_fftw_comm_test + end do +#if defined(OMPGPU) +!$omp target update from(stress) +#elif defined(_OPENACC) +!$acc update host(stress) +#endif + +end subroutine cgyro_stress