Skip to content

Commit

Permalink
Merge pull request #352 from sfiligoi/cmat32
Browse files Browse the repository at this point in the history
Add support for COLLISION_PRECISION_MODE 32 and 64
  • Loading branch information
jcandy authored Mar 1, 2024
2 parents 29354c6 + dab1026 commit 5d10cfd
Show file tree
Hide file tree
Showing 5 changed files with 230 additions and 25 deletions.
8 changes: 5 additions & 3 deletions cgyro/src/cgyro_check_memory.F90
Original file line number Diff line number Diff line change
Expand Up @@ -139,12 +139,14 @@ subroutine cgyro_check_memory(datafile)
if(collision_model == 5) then
call cgyro_alloc_add(io,(8.0*n_xi)*n_xi*n_species*n_energy*n_theta*nt_loc,'cmat')
else
if (collision_precision_mode == 0) then
call cgyro_alloc_add_4d(io,nv,nv,nc_loc,nt_loc,8,'cmat')
else
if (collision_precision_mode == 1) then
call cgyro_alloc_add_4d(io,nv,nv,nc_loc,nt_loc,4,'cmat_fp32')
call cgyro_alloc_add(io,4.0*n_xi*n_species*(n_energy-n_low_energy)*n_xi*nc_loc*nt_loc,'cmat_stripes')
call cgyro_alloc_add(io,4.0*n_xi*n_species*nv*nc_loc*n_low_energy*nt_loc,'cmat_e1')
else if (collision_precision_mode == 32) then
call cgyro_alloc_add_4d(io,nv,nv,nc_loc,nt_loc,4,'cmat_fp32')
else
call cgyro_alloc_add_4d(io,nv,nv,nc_loc,nt_loc,8,'cmat')
endif
#if defined(OMPGPU) || defined(_OPENACC)
if (gpu_bigmem_flag /= 1) then
Expand Down
8 changes: 4 additions & 4 deletions cgyro/src/cgyro_globals.F90
Original file line number Diff line number Diff line change
Expand Up @@ -457,10 +457,10 @@ module cgyro_globals
!
! Collision operator
integer :: n_low_energy
real, dimension(:,:,:,:), allocatable :: cmat ! only used if collision_precision_mode=0
real(KIND=REAL32), dimension(:,:,:,:), allocatable :: cmat_fp32 ! only used if collision_precision_mod/=0
real(KIND=REAL32), dimension(:,:,:,:,:,:), allocatable :: cmat_stripes ! only used if collision_precision_mod/=0
real(KIND=REAL32), dimension(:,:,:,:,:,:), allocatable :: cmat_e1 ! only used if collision_precision_mod/=0
real, dimension(:,:,:,:), allocatable :: cmat ! only used if collision_precision_mode=0 & 64
real(KIND=REAL32), dimension(:,:,:,:), allocatable :: cmat_fp32 ! only used if collision_precision_mode=1 & 32
real(KIND=REAL32), dimension(:,:,:,:,:,:), allocatable :: cmat_stripes ! only used if collision_precision_mode=1
real(KIND=REAL32), dimension(:,:,:,:,:,:), allocatable :: cmat_e1 ! only used if collision_precision_mode=1
real, dimension(:,:,:,:,:,:), allocatable :: cmat_simple ! only used in collision_model=5
!
! Equilibrium/geometry arrays
Expand Down
13 changes: 11 additions & 2 deletions cgyro/src/cgyro_init_collision.F90
Original file line number Diff line number Diff line change
Expand Up @@ -741,7 +741,7 @@ subroutine cgyro_init_collision


! result in amat, transfer to the right cmat matrix
if (collision_precision_mode /= 0) then
if (collision_precision_mode == 1) then
! keep all cmat in fp32 precision
cmat_fp32(:,:,ic_loc,itor) = amat(:,:)
! keep the remaining precision for select elements
Expand Down Expand Up @@ -791,6 +791,9 @@ subroutine cgyro_init_collision
! use 19 as absolute counter for normalization
cmap_fp32_error_abs_cnt_loc(19) = cmap_fp32_error_abs_cnt_loc(19) + 1
enddo
else if (collision_precision_mode == 32) then
! keep all cmat in fp32 precision
cmat_fp32(:,:,ic_loc,itor) = amat(:,:)
else
! keep all cmat in full precision
cmat(:,:,ic_loc,itor) = amat(:,:)
Expand All @@ -807,7 +810,7 @@ subroutine cgyro_init_collision
end if


if (collision_precision_mode /= 0) then
if (collision_precision_mode == 1) then
#if defined(OMPGPU)
! no async for OMPGPU for now
!$omp target enter data map(to:cmat_fp32,cmat_stripes,cmat_e1) if (gpu_bigmem_flag == 1)
Expand Down Expand Up @@ -880,6 +883,12 @@ subroutine cgyro_init_collision
endif
#if (!defined(OMPGPU)) && defined(_OPENACC)
!$acc wait
#endif
else if (collision_precision_mode == 32) then
#if defined(OMPGPU)
!$omp target enter data map(to:cmat_fp32) if (gpu_bigmem_flag == 1)
#elif defined(_OPENACC)
!$acc enter data copyin(cmat_fp32) if (gpu_bigmem_flag == 1)
#endif
else
#if defined(OMPGPU)
Expand Down
4 changes: 3 additions & 1 deletion cgyro/src/cgyro_init_manager.F90
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ subroutine cgyro_init_manager
if (collision_model == 5) then
allocate(cmat_simple(n_xi,n_xi,n_energy,n_species,n_theta,nt1:nt2))
else
if (collision_precision_mode /= 0) then
if (collision_precision_mode == 1) then
! the lowest energy(s) has the most spread, so treat differently
n_low_energy = 1
do ie=2,n_energy
Expand All @@ -348,6 +348,8 @@ subroutine cgyro_init_manager

write (msg, "(A,I1,A)") "Using fp32 collision precision except e<=",n_low_energy," or same e&s."
call cgyro_info(msg)
else if (collision_precision_mode == 32) then
allocate(cmat_fp32(nv,nv,nc_loc,nt1:nt2))
else
allocate(cmat(nv,nv,nc_loc,nt1:nt2))
endif
Expand Down
222 changes: 207 additions & 15 deletions cgyro/src/cgyro_step_collision.F90
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,58 @@
#define CGYRO_GPU_ROUTINES
#endif

subroutine cgyro_calc_collision_cpu_fp32(nj_loc)

use parallel_lib
use cgyro_globals

! --------------------------------------------------
implicit none
!
integer, intent(in) :: nj_loc
!
integer :: ivp,j,k,itor
complex, dimension(nv) :: bvec,cvec
real :: cvec_re,cvec_im
real :: cval
! --------------------------------------------------

!$omp parallel do collapse(2) &
!$omp& private(ic,ic_loc,iv,ivp,cvec,bvec,cvec_re,cvec_im,cval,j,k) &
!$omp& shared(cap_h_v,fsendf,cmat_fp32)
do itor=nt1,nt2
do ic=nc1,nc2

ic_loc = ic-nc1+1
! Set-up the RHS: H = f + ze/T G phi

do iv=1,nv
cvec(iv) = cap_h_v(ic_loc,itor,iv)
enddo

bvec(:) = (0.0,0.0)

! This is a key loop for performance
do ivp=1,nv
cvec_re = real(cvec(ivp))
cvec_im = aimag(cvec(ivp))
do iv=1,nv
cval = cmat_fp32(iv,ivp,ic_loc,itor)
bvec(iv) = bvec(iv)+ cmplx(cval*cvec_re, cval*cvec_im)
enddo
enddo

do k=1,nproc
do j=1,nj_loc
fsendf(j,itor,ic_loc,k) = bvec(j+(k-1)*nj_loc)
enddo
enddo

enddo
enddo

end subroutine cgyro_calc_collision_cpu_fp32

subroutine cgyro_calc_collision_cpu_fp64(nj_loc)

use parallel_lib
Expand Down Expand Up @@ -64,7 +116,7 @@ subroutine cgyro_calc_collision_cpu_fp64(nj_loc)

end subroutine cgyro_calc_collision_cpu_fp64

subroutine cgyro_calc_collision_cpu_fp32(nj_loc)
subroutine cgyro_calc_collision_cpu_m1(nj_loc)

use parallel_lib
use cgyro_globals
Expand Down Expand Up @@ -128,7 +180,7 @@ subroutine cgyro_calc_collision_cpu_fp32(nj_loc)

enddo
enddo
end subroutine cgyro_calc_collision_cpu_fp32
end subroutine cgyro_calc_collision_cpu_m1

subroutine cgyro_calc_collision_cpu(nj_loc)

Expand All @@ -140,10 +192,12 @@ subroutine cgyro_calc_collision_cpu(nj_loc)
integer, intent(in) :: nj_loc
! --------------------------------------------------

if (collision_precision_mode == 0) then
call cgyro_calc_collision_cpu_fp64(nj_loc)
else
if (collision_precision_mode == 1) then
call cgyro_calc_collision_cpu_m1(nj_loc)
else if (collision_precision_mode == 32) then
call cgyro_calc_collision_cpu_fp32(nj_loc)
else
call cgyro_calc_collision_cpu_fp64(nj_loc)
endif

end subroutine cgyro_calc_collision_cpu
Expand Down Expand Up @@ -304,6 +358,140 @@ end subroutine cgyro_step_collision_cpu

! ==================================================

subroutine cgyro_calc_collision_gpu_fp32(nj_loc)

use parallel_lib
use cgyro_globals

! --------------------------------------------------
implicit none
!
integer, intent(in) :: nj_loc
!

integer :: j,k,ivp,itor
real :: b_re,b_im
real :: cval
! --------------------------------------------------

#if defined(OMPGPU)
!$omp target teams distribute parallel do simd collapse(4) &
!$omp& private(b_re,b_im,cval,ivp,iv) firstprivate(nproc,nj_loc,nv) &
!$omp& private(k,ic,j,ic_loc) private(cval)
#else
!$acc parallel loop collapse(4) gang vector &
!$acc& private(b_re,b_im,cval,ivp,iv) firstprivate(nproc,nj_loc,nv) &
!$acc& present(cmat_fp32,cap_h_v,fsendf) private(k,ic,j,ic_loc)
#endif
do itor=nt1,nt2
do ic=nc1,nc2
do k=1,nproc
do j=1,nj_loc
ic_loc = ic-nc1+1
iv = j+(k-1)*nj_loc
b_re = 0.0
b_im = 0.0
#if (!defined(OMPGPU)) && defined(_OPENACC)
!$acc loop seq private(cval)
#endif
do ivp=1,nv
cval = cmat_fp32(iv,ivp,ic_loc,itor)
b_re = b_re + cval*real(cap_h_v(ic_loc,itor,ivp))
b_im = b_im + cval*aimag(cap_h_v(ic_loc,itor,ivp))
enddo

fsendf(j,itor,ic_loc,k) = cmplx(b_re,b_im)
enddo
enddo
enddo
enddo
end subroutine cgyro_calc_collision_gpu_fp32

subroutine cgyro_calc_collision_gpu_b2_fp32(nj_loc)

use parallel_lib
use cgyro_globals

! --------------------------------------------------
implicit none
!
integer, intent(in) :: nj_loc
!

integer :: bsplit
integer :: j,k,ivp,b,itor
integer :: n_ic_loc,d_ic_loc
integer, dimension((gpu_bigmem_flag*2)+1) :: bic
integer :: bs,be,bb
real :: b_re,b_im
real :: cval
! --------------------------------------------------

bsplit = gpu_bigmem_flag*2

n_ic_loc = nc2-nc1+1
d_ic_loc = n_ic_loc/bsplit

bic(1) = nc1
do b=2,bsplit
bic(b) = bic(b-1) + d_ic_loc
enddo
bic(bsplit+1) = nc2+1

do itor=nt1,nt2
do b=1,bsplit
bs = bic(b)-nc1+1
be = bic(b+1)-nc1
! by keeping only 2 alive at any time, we limit GPU memory use
bb = modulo(b+itor,2)+2
#if defined(OMPGPU)
! not using async for OMP for now
!$omp target teams distribute parallel do simd collapse(3) &
!$omp& private(b_re,b_im,cval,ivp,iv) firstprivate(nproc,nj_loc,nv) &
!$omp& map(to:cmat_fp32(:,:,bs:be,itor)) &
!$omp& private(k,j,ic_loc) private(cval)
#else
! ensure there is not another even/odd already runnning
!$acc wait(bb)
! now launch myself
!$acc parallel loop gang vector collapse(3) &
!$acc& private(b_re,b_im,cval,ivp,iv) firstprivate(nproc,nj_loc,nv) &
!$acc& copyin(cmat_fp32(:,:,bs:be,itor)) present(cap_h_v,fsendf) private(k,j,ic_loc) async(bb)
#endif
do ic_loc=bs,be
do k=1,nproc
do j=1,nj_loc
iv = j+(k-1)*nj_loc
b_re = 0.0
b_im = 0.0
#if (!defined(OMPGPU)) && defined(_OPENACC)
!$acc loop seq private(cval)
#endif
do ivp=1,nv
cval = cmat_fp32(iv,ivp,ic_loc,itor)
b_re = b_re + cval*real(cap_h_v(ic_loc,itor,ivp))
b_im = b_im + cval*aimag(cap_h_v(ic_loc,itor,ivp))
enddo

fsendf(j,itor,ic_loc,k) = cmplx(b_re,b_im)
enddo
enddo

enddo ! ic

enddo ! b
enddo ! itor

#if defined(OMPGPU)
! not using async for OMP for now
#else
! wait for all the async kernels to terminate
!$acc wait(2)
!$acc wait(3)
#endif

end subroutine cgyro_calc_collision_gpu_b2_fp32

subroutine cgyro_calc_collision_gpu_fp64(nj_loc)

use parallel_lib
Expand Down Expand Up @@ -438,7 +626,7 @@ subroutine cgyro_calc_collision_gpu_b2_fp64(nj_loc)

end subroutine cgyro_calc_collision_gpu_b2_fp64

subroutine cgyro_calc_collision_gpu_fp32(nj_loc)
subroutine cgyro_calc_collision_gpu_m1(nj_loc)

use parallel_lib
use cgyro_globals
Expand Down Expand Up @@ -506,9 +694,9 @@ subroutine cgyro_calc_collision_gpu_fp32(nj_loc)

enddo
enddo
end subroutine cgyro_calc_collision_gpu_fp32
end subroutine cgyro_calc_collision_gpu_m1

subroutine cgyro_calc_collision_gpu_b2_fp32(nj_loc)
subroutine cgyro_calc_collision_gpu_b2_m1(nj_loc)

use parallel_lib
use cgyro_globals
Expand Down Expand Up @@ -614,7 +802,7 @@ subroutine cgyro_calc_collision_gpu_b2_fp32(nj_loc)
!$acc wait(3)
#endif

end subroutine cgyro_calc_collision_gpu_b2_fp32
end subroutine cgyro_calc_collision_gpu_b2_m1

subroutine cgyro_calc_collision_gpu(nj_loc)

Expand All @@ -626,10 +814,12 @@ subroutine cgyro_calc_collision_gpu(nj_loc)
integer, intent(in) :: nj_loc
! --------------------------------------------------

if (collision_precision_mode == 0) then
call cgyro_calc_collision_gpu_fp64(nj_loc)
else
if (collision_precision_mode == 1) then
call cgyro_calc_collision_gpu_m1(nj_loc)
else if (collision_precision_mode == 32) then
call cgyro_calc_collision_gpu_fp32(nj_loc)
else
call cgyro_calc_collision_gpu_fp64(nj_loc)
endif

end subroutine cgyro_calc_collision_gpu
Expand All @@ -644,10 +834,12 @@ subroutine cgyro_calc_collision_gpu_b2(nj_loc)
integer, intent(in) :: nj_loc
! --------------------------------------------------

if (collision_precision_mode == 0) then
call cgyro_calc_collision_gpu_b2_fp64(nj_loc)
else
if (collision_precision_mode == 1) then
call cgyro_calc_collision_gpu_b2_m1(nj_loc)
else if (collision_precision_mode == 32) then
call cgyro_calc_collision_gpu_b2_fp32(nj_loc)
else
call cgyro_calc_collision_gpu_b2_fp64(nj_loc)
endif

end subroutine cgyro_calc_collision_gpu_b2
Expand Down

0 comments on commit 5d10cfd

Please sign in to comment.