From ace0a44d5bddc36b05067816368869142cdbed46 Mon Sep 17 00:00:00 2001 From: Igor Sfiligoi Date: Wed, 30 Oct 2024 10:09:50 -0700 Subject: [PATCH] Optimize rhs by removing large pre-computer arrays (#415) * Switch id and ic in dtheta and icd_c, to improve memory access on GPUs * Remove icd_c and dtheta, compute on the fly. 5x faster than original * Make inner rhs loop iterative, remove complex int ops to improve CPU performance * Remove blocks and add omp/acc private statements. Blocks not supported by nvfortran * Minor cpu optimization, conditionally move loop * Properly cleanup thfac_itor --------- Co-authored-by: Igor Sfiligoi --- cgyro/src/cgyro_check_memory.F90 | 3 - cgyro/src/cgyro_cleanup.F90 | 16 ++--- cgyro/src/cgyro_globals.F90 | 4 +- cgyro/src/cgyro_init_arrays.F90 | 49 +++++++------ cgyro/src/cgyro_init_manager.F90 | 3 - cgyro/src/cgyro_rhs.F90 | 119 ++++++++++++++++++++++++------- 6 files changed, 123 insertions(+), 71 deletions(-) diff --git a/cgyro/src/cgyro_check_memory.F90 b/cgyro/src/cgyro_check_memory.F90 index fe44f9e21..0d52afbe4 100644 --- a/cgyro/src/cgyro_check_memory.F90 +++ b/cgyro/src/cgyro_check_memory.F90 @@ -24,9 +24,6 @@ subroutine cgyro_check_memory(datafile) call cgyro_alloc_add(io,nc*4.0,'it_c') call cgyro_alloc_add(io,n_radial*n_theta*4.0,'ic_c') call cgyro_alloc_add(io,n_energy*n_xi*n_species*4.0,'iv_v') - call cgyro_alloc_add(io,nc*(2*nup_theta+1)*nt_loc*4.0,'icd_c') - call cgyro_alloc_add_3d(io,(2*nup_theta+1),nc,nt_loc,16,'dtheta') - call cgyro_alloc_add_3d(io,(2*nup_theta+1),nc,nt_loc,16,'dtheta_up') write(io,*) write(io,*) 'Fields and field solve' diff --git a/cgyro/src/cgyro_cleanup.F90 b/cgyro/src/cgyro_cleanup.F90 index d868bd20a..0f3d0dd68 100644 --- a/cgyro/src/cgyro_cleanup.F90 +++ b/cgyro/src/cgyro_cleanup.F90 @@ -125,22 +125,14 @@ subroutine cgyro_cleanup if(allocated(cflux_tave)) deallocate(cflux_tave) if(allocated(gflux_tave)) deallocate(gflux_tave) if(allocated(recv_status)) deallocate(recv_status) - if(allocated(icd_c)) then - ccl_del_device(icd_c) - deallocate(icd_c) - endif - if(allocated(dtheta)) then - ccl_del_device(dtheta) - deallocate(dtheta) - endif - if(allocated(dtheta_up)) then - ccl_del_device(dtheta_up) - deallocate(dtheta_up) - endif if(allocated(source)) then ccl_del_device(source) deallocate(source) endif + if(allocated(thfac_itor)) then + ccl_del_device(thfac_itor) + deallocate(thfac_itor) + endif if(allocated(h0_old)) then ccl_del_device(h0_old) deallocate(h0_old) diff --git a/cgyro/src/cgyro_globals.F90 b/cgyro/src/cgyro_globals.F90 index c91c3bdf0..a990e88f1 100644 --- a/cgyro/src/cgyro_globals.F90 +++ b/cgyro/src/cgyro_globals.F90 @@ -328,9 +328,7 @@ module cgyro_globals real, dimension(:), allocatable :: theta real, dimension(:), allocatable :: uderiv real, dimension(:), allocatable :: cderiv - integer, dimension(:,:,:), allocatable :: icd_c - complex, dimension(:,:,:), allocatable :: dtheta - complex, dimension(:,:,:), allocatable :: dtheta_up + complex, dimension(:,:), allocatable :: thfac_itor ! ! Wavenumber advection integer :: source_flag diff --git a/cgyro/src/cgyro_init_arrays.F90 b/cgyro/src/cgyro_init_arrays.F90 index e4b156a23..b5318f4f2 100644 --- a/cgyro/src/cgyro_init_arrays.F90 +++ b/cgyro/src/cgyro_init_arrays.F90 @@ -17,7 +17,7 @@ subroutine cgyro_init_arrays integer :: i_field integer :: l,ll integer :: iltheta_min - complex :: thfac,carg + complex :: carg complex :: jval real, dimension(:,:,:), allocatable :: res_loc real, dimension(:,:,:), allocatable :: jloc_c @@ -365,32 +365,35 @@ subroutine cgyro_init_arrays !------------------------------------------------------------------------- ! Streaming coefficient arrays ! + allocate(thfac_itor(0:2,nt1:nt2)) + do itor=nt1,nt2 - do ir=1,n_radial - do it=1,n_theta - do id=-nup_theta,nup_theta - jt = modulo(it+id-1,n_theta)+1 - if (it+id < 1) then - thfac = exp(2*pi*i_c*k_theta_base*itor*rmin) - jr = modulo(ir-itor*box_size*sign_qs-1,n_radial)+1 - else if (it+id > n_theta) then - thfac = exp(-2*pi*i_c*k_theta_base*itor*rmin) - jr = modulo(ir+itor*box_size*sign_qs-1,n_radial)+1 - else - thfac = (1.0,0.0) - jr = ir - endif - dtheta(id, ic_c(ir,it), itor) = cderiv(id)*thfac - dtheta_up(id, ic_c(ir,it), itor) = uderiv(id)*thfac*up_theta - icd_c(id, ic_c(ir,it), itor) = ic_c(jr,modulo(it+id-1,n_theta)+1) - enddo - enddo - enddo + thfac_itor(0,itor) = exp(2*pi*i_c*k_theta_base*itor*rmin) + thfac_itor(1,itor) = (1.0,0.0) + thfac_itor(2,itor) = exp(-2*pi*i_c*k_theta_base*itor*rmin) + !do ir=1,n_radial + ! do it=1,n_theta + ! do id=-nup_theta,nup_theta + ! ! jt = modulo(it+id-1,n_theta)+1 + ! if (it+id < 1) then + ! jr = modulo(ir-itor*box_size*sign_qs-1,n_radial)+1 + ! else if (it+id > n_theta) then + ! jr = modulo(ir+itor*box_size*sign_qs-1,n_radial)+1 + ! else + ! jr = ir + ! endif + ! thfac = thfac_itor((n_theta+it+id-1)/n_theta,itor) + ! dtheta(ic_c(ir,it), id, itor) = cderiv(id)*thfac + ! dtheta_up(ic_c(ir,it), id, itor) = uderiv(id)*thfac*up_theta + ! icd_c(ic_c(ir,it), id, itor) = ic_c(jr,modulo(it+id-1,n_theta)+1) + ! enddo + ! enddo + !enddo enddo #if defined(OMPGPU) -!$omp target enter data map(to:dtheta,dtheta_up,icd_c,c_wave) +!$omp target enter data map(to:c_wave,thfac_itor,cderiv,uderiv) #elif defined(_OPENACC) -!$acc enter data copyin(dtheta,dtheta_up,icd_c,c_wave) +!$acc enter data copyin(c_wave,thfac_itor,cderiv,uderiv) #endif ! Streaming coefficients (for speed optimization) diff --git a/cgyro/src/cgyro_init_manager.F90 b/cgyro/src/cgyro_init_manager.F90 index aec0a8149..2c8b1ba3b 100644 --- a/cgyro/src/cgyro_init_manager.F90 +++ b/cgyro/src/cgyro_init_manager.F90 @@ -183,9 +183,6 @@ subroutine cgyro_init_manager allocate(recv_status(MPI_STATUS_SIZE)) - allocate(icd_c(-nup_theta:nup_theta, nc ,nt1:nt2)) - allocate(dtheta(-nup_theta:nup_theta, nc ,nt1:nt2)) - allocate(dtheta_up(-nup_theta:nup_theta, nc,nt1:nt2)) allocate(source(n_theta,nv_loc,nt1:nt2)) #if defined(OMPGPU) diff --git a/cgyro/src/cgyro_rhs.F90 b/cgyro/src/cgyro_rhs.F90 index f42e6d12b..e8b538340 100644 --- a/cgyro/src/cgyro_rhs.F90 +++ b/cgyro/src/cgyro_rhs.F90 @@ -84,13 +84,16 @@ subroutine cgyro_rhs_comp1(ij,update_cap) #if defined(OMPGPU) ! no async for OMPGPU for now !$omp target teams distribute parallel do simd collapse(3) & +!$omp& firstprivate(nv2,nv1,nt2,nt1,nc,update_cap,ij) & !$omp& private(iv,ic,iv_loc,rhs_el,h_el,is,cap_el,my_psi) #elif defined(_OPENACC) -!$acc parallel loop gang vector collapse(3) & -!$acc& private(iv,ic,iv_loc,rhs_el,h_el,is,cap_el,my_psi) async(1) +!$acc parallel loop gang vector collapse(3) async(1) & +!$acc& firstprivate(nv2,nv1,nt2,nt1,nc,update_cap,ij) & +!$acc& private(iv,ic,iv_loc,rhs_el,h_el,is,cap_el,my_psi) #else !$omp parallel do collapse(2) & -!$omp& private(iv,iv_loc,itor,is,ic,rhs_el,h_el,cap_el,my_psi) +!$omp& firstprivate(nv2,nv1,nt2,nt1,nc,update_cap,ij) & +!$omp& private(iv,iv_loc,itor,is,ic,rhs_el,h_el,cap_el,my_psi) #endif do itor=nt1,nt2 do iv=nv1,nv2 @@ -141,9 +144,21 @@ subroutine cgyro_rhs_comp2(ij) integer, intent(in) :: ij !-------------------------------- - integer :: is,itor - integer :: id,jc - real :: rval,rval2 + integer :: itor,ir,it + ! ir loop specific + integer :: itorbox + !integer :: iv_loc + integer :: is + integer :: jr0(0:2) ! n_theta*(pre-compute jr-1) + real :: vel_xi + ! it loop specific + !integer :: ic + integer :: id + integer :: itd ! precompute modulo(it+id-1,n_theta)+1, use for iteration + integer :: itd_class + integer :: jc + real :: rval,rval2,rval2s + complex :: thfac complex :: rhs_stream @@ -157,40 +172,90 @@ subroutine cgyro_rhs_comp2(ij) !$acc& present(cap_h_c) & !$acc& present(is_v,ix_v,ie_v,it_c) & !$acc& present(omega_stream,xi,vel) & -!$acc& present(dtheta,dtheta_up,icd_c) +!$acc& present(thfac_itor,cderiv,uderiv) #endif ! add stream to rhs #if defined(OMPGPU) ! no async for OMPGPU for now -!$omp target teams distribute parallel do simd collapse(3) & -!$omp& private(iv,ic,iv_loc,is,rval,rval2,rhs_stream,id,jc) +!$omp target teams distribute parallel do simd collapse(4) & +!$omp& firstprivate(n_radial,nv2,nv1,nt2,nt1,n_theta) & +!$omp& firstprivate(sign_qs,nup_theta,ij,box_size,up_theta) & +!$omp& private(itor,iv,ir,it) & +!$omp& private(itorbox,iv_loc,is,jr0,vel_xi) & +!$omp& private(ic,id,itd,itd_class,jc,rval,rval2,rval2s,thfac,rhs_stream) #elif defined(_OPENACC) -!$acc parallel loop gang vector collapse(3) & -!$acc& private(iv,ic,iv_loc,is,rval,rval2,rhs_stream,id,jc) async(1) +!$acc parallel loop gang vector collapse(4) async(1) & +!$acc& firstprivate(n_radial,nv2,nv1,nt2,nt1,n_theta) & +!$acc& firstprivate(sign_qs,nup_theta,ij,box_size,up_theta) & +!$acc& private(itor,iv,ir,it) & +!$acc& private(itorbox,iv_loc,is,jr0,vel_xi) & +!$acc& private(ic,id,itd,itd_class,jc,rval,rval2,rval2s,thfac,rhs_stream) #else -!$omp parallel do collapse(2) & -!$omp& private(itor,iv,iv_loc,is,ic,rval,rval2,rhs_stream,id,jc) +!$omp parallel do collapse(3) & +!$omp& firstprivate(n_radial,nv2,nv1,nt2,nt1,n_theta) & +!$omp& firstprivate(sign_qs,nup_theta,ij,box_size,up_theta) & +!$omp& private(itor,iv,ir,it) & +!$omp& private(itorbox,iv_loc,is,jr0,vel_xi) #endif do itor=nt1,nt2 do iv=nv1,nv2 - do ic=1,nc + do ir=1,n_radial +#if defined(OMPGPU) || defined(_OPENACC) + ! keep loop high for maximal collapse + do it=1,n_theta +#endif + itorbox = itor*box_size*sign_qs iv_loc = iv-nv1+1 - is = is_v(iv) - ! Parallel streaming with upwind dissipation - rval = omega_stream(it_c(ic),is,itor)*vel(ie_v(iv))*xi(ix_v(iv)) - rval2 = abs(omega_stream(it_c(ic),is,itor)) - - rhs_stream = 0.0 - do id=-nup_theta,nup_theta - jc = icd_c(id, ic, itor) - rhs_stream = rhs_stream & - -rval*dtheta(id,ic,itor)*cap_h_c(jc,iv_loc,itor) & - -rval2*dtheta_up(id,ic,itor)*g_x(jc,iv_loc,itor) - enddo - rhs(ic,iv_loc,itor,ij) = rhs(ic,iv_loc,itor,ij) + rhs_stream + is = is_v(iv) + vel_xi = vel(ie_v(iv))*xi(ix_v(iv)) + jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial) + jr0(1) = n_theta*(ir-1) + jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial) + +#if !(defined(OMPGPU) || defined(_OPENACC)) + ! loop as late as possible, to minimize recompute +!$omp loop & +!$omp& private(ic,id,itd,itd_class,jc,rval,rval2,rval2s,thfac,rhs_stream) + do it=1,n_theta +#endif + ic = (ir-1)*n_theta + it ! ic_c(ir,it) + + ! Parallel streaming with upwind dissipation + rval2s = omega_stream(it,is,itor) + rval = rval2s*vel_xi + rval2 = abs(rval2s) + + rhs_stream = 0.0 + + !icd_c(ic, id, itor) = ic_c(jr,modulo(it+id-1,n_theta)+1) + !jc = icd_c(ic, id, itor) + !dtheta(ic, id, itor) := cderiv(id)*thfac + !dtheta_up(ic, id, itor) := uderiv(id)*thfac*up_theta + itd = n_theta+it-nup_theta + itd_class = 0 + jc = jr0(itd_class)+itd + thfac = thfac_itor(itd_class,itor) + do id=-nup_theta,nup_theta + if (itd > n_theta) then + ! move to next itd_class of compute + itd = itd - n_theta + itd_class = itd_class + 1 + jc = jr0(itd_class)+itd + thfac = thfac_itor(itd_class,itor) + endif + rhs_stream = rhs_stream & + - thfac & + * ( rval*cderiv(id)*cap_h_c(jc,iv_loc,itor) & + + rval2*uderiv(id)*up_theta*g_x(jc,iv_loc,itor) ) + itd = itd + 1 + jc = jc + 1 + enddo + + rhs(ic,iv_loc,itor,ij) = rhs(ic,iv_loc,itor,ij) + rhs_stream enddo + enddo enddo enddo