Skip to content

Commit

Permalink
Optimize rhs by removing large pre-computer arrays (#415)
Browse files Browse the repository at this point in the history
* Switch id and ic in dtheta and icd_c, to improve memory access on GPUs

* Remove icd_c and dtheta, compute on the fly. 5x faster than original

* Make inner rhs loop iterative, remove complex int ops to improve CPU performance

* Remove blocks and add omp/acc private statements. Blocks not supported by nvfortran

* Minor cpu optimization, conditionally move loop

* Properly cleanup thfac_itor

---------

Co-authored-by: Igor Sfiligoi <[email protected]>
  • Loading branch information
sfiligoi and Igor Sfiligoi authored Oct 30, 2024
1 parent e7ead93 commit ace0a44
Show file tree
Hide file tree
Showing 6 changed files with 123 additions and 71 deletions.
3 changes: 0 additions & 3 deletions cgyro/src/cgyro_check_memory.F90
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,6 @@ subroutine cgyro_check_memory(datafile)
call cgyro_alloc_add(io,nc*4.0,'it_c')
call cgyro_alloc_add(io,n_radial*n_theta*4.0,'ic_c')
call cgyro_alloc_add(io,n_energy*n_xi*n_species*4.0,'iv_v')
call cgyro_alloc_add(io,nc*(2*nup_theta+1)*nt_loc*4.0,'icd_c')
call cgyro_alloc_add_3d(io,(2*nup_theta+1),nc,nt_loc,16,'dtheta')
call cgyro_alloc_add_3d(io,(2*nup_theta+1),nc,nt_loc,16,'dtheta_up')

write(io,*)
write(io,*) 'Fields and field solve'
Expand Down
16 changes: 4 additions & 12 deletions cgyro/src/cgyro_cleanup.F90
Original file line number Diff line number Diff line change
Expand Up @@ -125,22 +125,14 @@ subroutine cgyro_cleanup
if(allocated(cflux_tave)) deallocate(cflux_tave)
if(allocated(gflux_tave)) deallocate(gflux_tave)
if(allocated(recv_status)) deallocate(recv_status)
if(allocated(icd_c)) then
ccl_del_device(icd_c)
deallocate(icd_c)
endif
if(allocated(dtheta)) then
ccl_del_device(dtheta)
deallocate(dtheta)
endif
if(allocated(dtheta_up)) then
ccl_del_device(dtheta_up)
deallocate(dtheta_up)
endif
if(allocated(source)) then
ccl_del_device(source)
deallocate(source)
endif
if(allocated(thfac_itor)) then
ccl_del_device(thfac_itor)
deallocate(thfac_itor)
endif
if(allocated(h0_old)) then
ccl_del_device(h0_old)
deallocate(h0_old)
Expand Down
4 changes: 1 addition & 3 deletions cgyro/src/cgyro_globals.F90
Original file line number Diff line number Diff line change
Expand Up @@ -328,9 +328,7 @@ module cgyro_globals
real, dimension(:), allocatable :: theta
real, dimension(:), allocatable :: uderiv
real, dimension(:), allocatable :: cderiv
integer, dimension(:,:,:), allocatable :: icd_c
complex, dimension(:,:,:), allocatable :: dtheta
complex, dimension(:,:,:), allocatable :: dtheta_up
complex, dimension(:,:), allocatable :: thfac_itor
!
! Wavenumber advection
integer :: source_flag
Expand Down
49 changes: 26 additions & 23 deletions cgyro/src/cgyro_init_arrays.F90
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ subroutine cgyro_init_arrays
integer :: i_field
integer :: l,ll
integer :: iltheta_min
complex :: thfac,carg
complex :: carg
complex :: jval
real, dimension(:,:,:), allocatable :: res_loc
real, dimension(:,:,:), allocatable :: jloc_c
Expand Down Expand Up @@ -365,32 +365,35 @@ subroutine cgyro_init_arrays
!-------------------------------------------------------------------------
! Streaming coefficient arrays
!
allocate(thfac_itor(0:2,nt1:nt2))

do itor=nt1,nt2
do ir=1,n_radial
do it=1,n_theta
do id=-nup_theta,nup_theta
jt = modulo(it+id-1,n_theta)+1
if (it+id < 1) then
thfac = exp(2*pi*i_c*k_theta_base*itor*rmin)
jr = modulo(ir-itor*box_size*sign_qs-1,n_radial)+1
else if (it+id > n_theta) then
thfac = exp(-2*pi*i_c*k_theta_base*itor*rmin)
jr = modulo(ir+itor*box_size*sign_qs-1,n_radial)+1
else
thfac = (1.0,0.0)
jr = ir
endif
dtheta(id, ic_c(ir,it), itor) = cderiv(id)*thfac
dtheta_up(id, ic_c(ir,it), itor) = uderiv(id)*thfac*up_theta
icd_c(id, ic_c(ir,it), itor) = ic_c(jr,modulo(it+id-1,n_theta)+1)
enddo
enddo
enddo
thfac_itor(0,itor) = exp(2*pi*i_c*k_theta_base*itor*rmin)
thfac_itor(1,itor) = (1.0,0.0)
thfac_itor(2,itor) = exp(-2*pi*i_c*k_theta_base*itor*rmin)
!do ir=1,n_radial
! do it=1,n_theta
! do id=-nup_theta,nup_theta
! ! jt = modulo(it+id-1,n_theta)+1
! if (it+id < 1) then
! jr = modulo(ir-itor*box_size*sign_qs-1,n_radial)+1
! else if (it+id > n_theta) then
! jr = modulo(ir+itor*box_size*sign_qs-1,n_radial)+1
! else
! jr = ir
! endif
! thfac = thfac_itor((n_theta+it+id-1)/n_theta,itor)
! dtheta(ic_c(ir,it), id, itor) = cderiv(id)*thfac
! dtheta_up(ic_c(ir,it), id, itor) = uderiv(id)*thfac*up_theta
! icd_c(ic_c(ir,it), id, itor) = ic_c(jr,modulo(it+id-1,n_theta)+1)
! enddo
! enddo
!enddo
enddo
#if defined(OMPGPU)
!$omp target enter data map(to:dtheta,dtheta_up,icd_c,c_wave)
!$omp target enter data map(to:c_wave,thfac_itor,cderiv,uderiv)
#elif defined(_OPENACC)
!$acc enter data copyin(dtheta,dtheta_up,icd_c,c_wave)
!$acc enter data copyin(c_wave,thfac_itor,cderiv,uderiv)
#endif

! Streaming coefficients (for speed optimization)
Expand Down
3 changes: 0 additions & 3 deletions cgyro/src/cgyro_init_manager.F90
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,6 @@ subroutine cgyro_init_manager

allocate(recv_status(MPI_STATUS_SIZE))

allocate(icd_c(-nup_theta:nup_theta, nc ,nt1:nt2))
allocate(dtheta(-nup_theta:nup_theta, nc ,nt1:nt2))
allocate(dtheta_up(-nup_theta:nup_theta, nc,nt1:nt2))
allocate(source(n_theta,nv_loc,nt1:nt2))

#if defined(OMPGPU)
Expand Down
119 changes: 92 additions & 27 deletions cgyro/src/cgyro_rhs.F90
Original file line number Diff line number Diff line change
Expand Up @@ -84,13 +84,16 @@ subroutine cgyro_rhs_comp1(ij,update_cap)
#if defined(OMPGPU)
! no async for OMPGPU for now
!$omp target teams distribute parallel do simd collapse(3) &
!$omp& firstprivate(nv2,nv1,nt2,nt1,nc,update_cap,ij) &
!$omp& private(iv,ic,iv_loc,rhs_el,h_el,is,cap_el,my_psi)
#elif defined(_OPENACC)
!$acc parallel loop gang vector collapse(3) &
!$acc& private(iv,ic,iv_loc,rhs_el,h_el,is,cap_el,my_psi) async(1)
!$acc parallel loop gang vector collapse(3) async(1) &
!$acc& firstprivate(nv2,nv1,nt2,nt1,nc,update_cap,ij) &
!$acc& private(iv,ic,iv_loc,rhs_el,h_el,is,cap_el,my_psi)
#else
!$omp parallel do collapse(2) &
!$omp& private(iv,iv_loc,itor,is,ic,rhs_el,h_el,cap_el,my_psi)
!$omp& firstprivate(nv2,nv1,nt2,nt1,nc,update_cap,ij) &
!$omp& private(iv,iv_loc,itor,is,ic,rhs_el,h_el,cap_el,my_psi)
#endif
do itor=nt1,nt2
do iv=nv1,nv2
Expand Down Expand Up @@ -141,9 +144,21 @@ subroutine cgyro_rhs_comp2(ij)

integer, intent(in) :: ij
!--------------------------------
integer :: is,itor
integer :: id,jc
real :: rval,rval2
integer :: itor,ir,it
! ir loop specific
integer :: itorbox
!integer :: iv_loc
integer :: is
integer :: jr0(0:2) ! n_theta*(pre-compute jr-1)
real :: vel_xi
! it loop specific
!integer :: ic
integer :: id
integer :: itd ! precompute modulo(it+id-1,n_theta)+1, use for iteration
integer :: itd_class
integer :: jc
real :: rval,rval2,rval2s
complex :: thfac
complex :: rhs_stream


Expand All @@ -157,40 +172,90 @@ subroutine cgyro_rhs_comp2(ij)
!$acc& present(cap_h_c) &
!$acc& present(is_v,ix_v,ie_v,it_c) &
!$acc& present(omega_stream,xi,vel) &
!$acc& present(dtheta,dtheta_up,icd_c)
!$acc& present(thfac_itor,cderiv,uderiv)

#endif
! add stream to rhs
#if defined(OMPGPU)
! no async for OMPGPU for now
!$omp target teams distribute parallel do simd collapse(3) &
!$omp& private(iv,ic,iv_loc,is,rval,rval2,rhs_stream,id,jc)
!$omp target teams distribute parallel do simd collapse(4) &
!$omp& firstprivate(n_radial,nv2,nv1,nt2,nt1,n_theta) &
!$omp& firstprivate(sign_qs,nup_theta,ij,box_size,up_theta) &
!$omp& private(itor,iv,ir,it) &
!$omp& private(itorbox,iv_loc,is,jr0,vel_xi) &
!$omp& private(ic,id,itd,itd_class,jc,rval,rval2,rval2s,thfac,rhs_stream)
#elif defined(_OPENACC)
!$acc parallel loop gang vector collapse(3) &
!$acc& private(iv,ic,iv_loc,is,rval,rval2,rhs_stream,id,jc) async(1)
!$acc parallel loop gang vector collapse(4) async(1) &
!$acc& firstprivate(n_radial,nv2,nv1,nt2,nt1,n_theta) &
!$acc& firstprivate(sign_qs,nup_theta,ij,box_size,up_theta) &
!$acc& private(itor,iv,ir,it) &
!$acc& private(itorbox,iv_loc,is,jr0,vel_xi) &
!$acc& private(ic,id,itd,itd_class,jc,rval,rval2,rval2s,thfac,rhs_stream)
#else
!$omp parallel do collapse(2) &
!$omp& private(itor,iv,iv_loc,is,ic,rval,rval2,rhs_stream,id,jc)
!$omp parallel do collapse(3) &
!$omp& firstprivate(n_radial,nv2,nv1,nt2,nt1,n_theta) &
!$omp& firstprivate(sign_qs,nup_theta,ij,box_size,up_theta) &
!$omp& private(itor,iv,ir,it) &
!$omp& private(itorbox,iv_loc,is,jr0,vel_xi)
#endif
do itor=nt1,nt2
do iv=nv1,nv2
do ic=1,nc
do ir=1,n_radial
#if defined(OMPGPU) || defined(_OPENACC)
! keep loop high for maximal collapse
do it=1,n_theta
#endif
itorbox = itor*box_size*sign_qs
iv_loc = iv-nv1+1
is = is_v(iv)
! Parallel streaming with upwind dissipation
rval = omega_stream(it_c(ic),is,itor)*vel(ie_v(iv))*xi(ix_v(iv))
rval2 = abs(omega_stream(it_c(ic),is,itor))

rhs_stream = 0.0
do id=-nup_theta,nup_theta
jc = icd_c(id, ic, itor)
rhs_stream = rhs_stream &
-rval*dtheta(id,ic,itor)*cap_h_c(jc,iv_loc,itor) &
-rval2*dtheta_up(id,ic,itor)*g_x(jc,iv_loc,itor)
enddo

rhs(ic,iv_loc,itor,ij) = rhs(ic,iv_loc,itor,ij) + rhs_stream
is = is_v(iv)
vel_xi = vel(ie_v(iv))*xi(ix_v(iv))
jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial)
jr0(1) = n_theta*(ir-1)
jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial)

#if !(defined(OMPGPU) || defined(_OPENACC))
! loop as late as possible, to minimize recompute
!$omp loop &
!$omp& private(ic,id,itd,itd_class,jc,rval,rval2,rval2s,thfac,rhs_stream)
do it=1,n_theta
#endif
ic = (ir-1)*n_theta + it ! ic_c(ir,it)

! Parallel streaming with upwind dissipation
rval2s = omega_stream(it,is,itor)
rval = rval2s*vel_xi
rval2 = abs(rval2s)

rhs_stream = 0.0

!icd_c(ic, id, itor) = ic_c(jr,modulo(it+id-1,n_theta)+1)
!jc = icd_c(ic, id, itor)
!dtheta(ic, id, itor) := cderiv(id)*thfac
!dtheta_up(ic, id, itor) := uderiv(id)*thfac*up_theta
itd = n_theta+it-nup_theta
itd_class = 0
jc = jr0(itd_class)+itd
thfac = thfac_itor(itd_class,itor)
do id=-nup_theta,nup_theta
if (itd > n_theta) then
! move to next itd_class of compute
itd = itd - n_theta
itd_class = itd_class + 1
jc = jr0(itd_class)+itd
thfac = thfac_itor(itd_class,itor)
endif
rhs_stream = rhs_stream &
- thfac &
* ( rval*cderiv(id)*cap_h_c(jc,iv_loc,itor) &
+ rval2*uderiv(id)*up_theta*g_x(jc,iv_loc,itor) )
itd = itd + 1
jc = jc + 1
enddo

rhs(ic,iv_loc,itor,ij) = rhs(ic,iv_loc,itor,ij) + rhs_stream
enddo
enddo
enddo
enddo

Expand Down

0 comments on commit ace0a44

Please sign in to comment.