Skip to content

Commit

Permalink
Merge branch 'master' into imas
Browse files Browse the repository at this point in the history
  • Loading branch information
jcandy committed Sep 3, 2024
2 parents 0444b8f + 47d3dcf commit 8f30ce5
Show file tree
Hide file tree
Showing 10 changed files with 170 additions and 24 deletions.
2 changes: 2 additions & 0 deletions cgyro/install/make.ext.LUMI
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# LUMI-specific rule: build cgyro_nl_fftw.o from the GPU source file
# cgyro_nl_fftw.gpu.F90, compiling with $(FMATH) and $(FFLAGS).
cgyro_nl_fftw.o : cgyro_nl_fftw.gpu.F90
$(FC) $(FMATH) $(FFLAGS) -o cgyro_nl_fftw.o -c cgyro_nl_fftw.gpu.F90
4 changes: 1 addition & 3 deletions cgyro/src/cgyro_init_arrays.F90
Original file line number Diff line number Diff line change
Expand Up @@ -111,9 +111,7 @@ subroutine cgyro_init_arrays
!$acc enter data copyin(jvec_c)
#endif

do i_field=1,n_field
call parallel_lib_rtrans_real(jvec_c(i_field,:,:,:),jvec_v(i_field,:,:,:))
enddo
call parallel_lib_rtrans_real(jvec_c,jvec_v)

if (nonlinear_flag == 1) then
!
Expand Down
2 changes: 1 addition & 1 deletion cgyro/src/cgyro_mpi_grid.F90
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ subroutine cgyro_mpi_grid

! ni -> nc
! nj -> nv
call parallel_lib_init(nc,nv,nt1,nt_loc,nc_loc,nv_loc,NEW_COMM_1)
call parallel_lib_init(nc,nv,nt1,nt_loc,n_field,nc_loc,nv_loc,NEW_COMM_1)

nv1 = 1+i_proc_1*nv_loc
nv2 = (1+i_proc_1)*nv_loc
Expand Down
31 changes: 19 additions & 12 deletions cgyro/src/cgyro_parallel_lib.F90
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@ module parallel_lib
integer, private :: nk1,nk2
integer, private :: lib_comm
integer, private :: nsend
integer, private :: n_field
integer, private :: nsend_real

real, dimension(:,:,:,:), allocatable, private :: fsendr_real
real, dimension(:,:,:,:,:), allocatable, private :: fsendr_real

! (expose these)
complex, dimension(:,:,:,:), allocatable :: fsendf
Expand Down Expand Up @@ -146,14 +148,14 @@ module parallel_lib
! parallel_lib_r -> g(nj_loc,ni) -> f(ni_loc,nj)
!=========================================================

subroutine parallel_lib_init(ni_in,nj_in,nk1_in,nk_loc_in,ni_loc_out,nj_loc_out,comm)
subroutine parallel_lib_init(ni_in,nj_in,nk1_in,nk_loc_in,n_field_in,ni_loc_out,nj_loc_out,comm)

use mpi

implicit none

integer, intent(in) :: ni_in,nj_in
integer, intent(in) :: nk1_in,nk_loc_in
integer, intent(in) :: nk1_in,nk_loc_in,n_field_in
integer, intent(in) :: comm
integer, intent(inout) :: ni_loc_out,nj_loc_out
integer, external :: parallel_dim
Expand Down Expand Up @@ -183,7 +185,10 @@ subroutine parallel_lib_init(ni_in,nj_in,nk1_in,nk_loc_in,ni_loc_out,nj_loc_out,

allocate(fsendf(nj_loc,nk1:nk2,ni_loc,nproc))
allocate(fsendr(ni_loc,nk1:nk2,nj_loc,nproc))
if (.not. allocated(fsendr_real)) allocate(fsendr_real(ni_loc,nk1:nk2,nj_loc,nproc))

n_field = n_field_in
nsend_real = n_field*nsend
if (.not. allocated(fsendr_real)) allocate(fsendr_real(n_field,ni_loc,nk1:nk2,nj_loc,nproc))

#if defined(OMPGPU)
!$omp target enter data map(alloc:fsendf,fsendr)
Expand Down Expand Up @@ -419,33 +424,35 @@ subroutine parallel_lib_rtrans_real(fin,f)

implicit none

real, intent(in), dimension(:,:,:) :: fin
real, intent(inout), dimension(:,:,:) :: f
integer :: ierr,j_loc,i,j,k,j1,j2,itor
real, intent(in), dimension(:,:,:,:) :: fin
real, intent(inout), dimension(:,:,:,:) :: f
integer :: ierr,j_loc,i,j,k,j1,j2,itor,fi

j1 = 1+iproc*nj_loc
j2 = (1+iproc)*nj_loc

!$omp parallel do collapse(2) if (size(fsendr_real) >= default_size) default(none) &
!$omp& shared(nproc,j1,j2,ni_loc,nk1,nk2) &
!$omp& private(j,j_loc,i) &
!$omp& firstprivate(nproc,j1,j2,ni_loc,nk1,nk2,n_field) &
!$omp& private(j,j_loc,i,fi) &
!$omp& shared(fin,fsendr_real)
do k=1,nproc
do itor=nk1,nk2
do j=j1,j2
j_loc = j-j1+1
do i=1,ni_loc
fsendr_real(i,itor,j_loc,k) = fin(i+(k-1)*ni_loc,j_loc,1+(itor-nk1))
do fi=1,n_field
fsendr_real(fi,i,itor,j_loc,k) = fin(fi,i+(k-1)*ni_loc,j_loc,1+(itor-nk1))
enddo
enddo
enddo
enddo
enddo

call MPI_ALLTOALL(fsendr_real, &
nsend, &
nsend_real, &
MPI_DOUBLE_PRECISION,&
f, &
nsend, &
nsend_real, &
MPI_DOUBLE_PRECISION, &
lib_comm, &
ierr)
Expand Down
2 changes: 1 addition & 1 deletion platform/build/make.inc.FRONTIER
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ FDEBUG =-O0 -g
F2PY = f2py --fcompiler=pg

# System math libraries
LMATH=-L${ROCM_PATH}/lib -L${HIPFORT_DIR}/lib -lhipfort-amdgcn -lhipfft -lamdhip64
LMATH=-L${HIPFORT_DIR}/lib -L${ROCM_PATH}/lib -lhipfort-amdgcn -lhipfft -lamdhip64

# NetCDF
NETCDF=-L${NETCDF_DIR}/lib -lnetcdff -lnetcdf
Expand Down
39 changes: 39 additions & 0 deletions platform/build/make.inc.LUMI
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#----------------------------------------------------------
# Cray (lumi.csc.fi) [GPU nodes]
#
# - 7*2*4 CPU cores (AMD Epyc) + 4x2 GPUs (MI250X)
#
# Build settings for the LUMI platform: compiler commands,
# GPU-offload/OpenMP/optimization flags and link libraries.
# Requires GACODE_ROOT, HIPFORT_DIR and ROCM_PATH to be set
# in the environment; GACODE_OMPGPU selects the offload model.
#----------------------------------------------------------

# Platform identity and node layout.
# CORES_PER_NODE equals the 7*2*4 figure quoted in the header.
# NOTE(review): NUMAS_PER_NODE=8 presumably corresponds to the 4x2 GPUs
# (one NUMA group per GPU) -- confirm against the launch tooling.
IDENTITY="LUMI"
CORES_PER_NODE=56
NUMAS_PER_NODE=8

# Fortran 90/95 compiler
# (Cray ftn wrapper; -J directs module files to ${GACODE_ROOT}/modules)
FC = ftn -J ${GACODE_ROOT}/modules
# Alternative compiler (ROCm flang), currently disabled:
#FC = /opt/rocm/llvm/bin/flang -J ${GACODE_ROOT}/modules
# Fortran 77 compiler (same command as FC)
F77 = ${FC}

# Compiler options/flags
#
# FACC: GPU offload flags.
#  - GACODE_OMPGPU != 1 : OpenACC build (-hacc plus -hacc_model tuning)
#  - GACODE_OMPGPU == 1 : OpenMP-offload build (-DOMPGPU, no -hacc)
# Both variants define HIPGPU and GACODE_GPU_AMD and add the hipfort
# amdgcn include directory.
ifneq ($(GACODE_OMPGPU),1)
FACC =-hacc -DHIPGPU -DGACODE_GPU_AMD -I${HIPFORT_DIR}/include/hipfort/amdgcn -hacc_model=auto_async_none:no_fast_addr:no_deep_copy
else
FACC = -DOMPGPU -DHIPGPU -DGACODE_GPU_AMD -I${HIPFORT_DIR}/include/hipfort/amdgcn
endif
# FOMP: enable OpenMP.
FOMP =-homp
# FMATH: make default REAL 64-bit (Cray ftn "-s real64").
FMATH =-s real64
# FOPT / FDEBUG: optimized and debug flag sets.
FOPT =-Ofast
FDEBUG =-O0 -g
# NOTE(review): --fcompiler=pg selects the PGI compiler in f2py; this looks
# inherited from another platform file -- confirm it is intended with ftn.
F2PY = f2py --fcompiler=pg


# System math libraries
# (hipfort, hipFFT and the HIP runtime, searched in ROCm and hipfort lib dirs)
LMATH=-L${ROCM_PATH}/lib -L${HIPFORT_DIR}/lib -lhipfort-amdgcn -lhipfft -lamdhip64

# NetCDF
# (disabled on this platform: both settings below are commented out)
#NETCDF=-L${NETCDF_DIR}/lib -lnetcdff -lnetcdf
#NETCDF_INC = ${NETCDF_DIR}/include

# Archive
# (ar cr: create the archive if needed and insert/replace members)
ARCH = ar cr

14 changes: 7 additions & 7 deletions platform/env/env.FRONTIER
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,16 @@ if [ -n "$SSH_TTY" ] ; then
fi
fi

module load PrgEnv-cray
module load craype-accel-amd-gfx90a
module load rocm
#module load cray-mpich
module load cpe/23.09
module load craype-accel-amd-gfx90a rocm
module load cray-python
module load cray-mpich
module use /lustre/orion/stf243/world-shared/hagertnl/test_environment/modulefiles/
module load hipfort/5.5.1


#export MPICH_SMP_SINGLE_COPY_MODE=NONE
module load cray-mpich/8.1.25
export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"

export MPICH_GPU_SUPPORT_ENABLED=1
export HIPFORT_DIR=/lustre/orion/proj-shared/fus140/hipfort/cce_15.0.0-rocm_5.3.0
export HIPFORT_DIR=${OLCF_HIPFORT_ROOT}
export ROCFFT_RTC_CACHE_PATH=/dev/null
31 changes: 31 additions & 0 deletions platform/env/env.LUMI
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# GACODE environment setup for LUMI (GPU partition).
#
# Selects the GPU offload model, loads the Cray/ROCm module stack and exports
# the MPI/hipfort/rocFFT settings used on this platform.
# NOTE(review): presumably meant to be sourced from a login shell (it has no
# shebang and only exports variables) -- confirm against the caller.
#
# Inputs (environment):
#   GACODE_OMPGPU    1 = OpenMP-offload build, anything else = OpenACC
#   GACODE_PLATFORM  platform name, used only in the banner
#   SSH_TTY          banner is printed only when this is non-empty

# Default to OpenACC when GACODE_OMPGPU is unset or empty.
if [ -z "${GACODE_OMPGPU}" ]; then
    export GACODE_OMPGPU=0
fi

# Keep non-SSH-tty sessions quiet: print the banner only when SSH_TTY is set.
if [ -n "$SSH_TTY" ] ; then
    echo "Setting up $GACODE_PLATFORM environment for gacode"
    if [ "${GACODE_OMPGPU}" = "1" ]; then
        echo "Using OMPGPU offload setup"
    else
        echo "Using OpenACC offload setup"
    fi
fi

# Start from a clean module environment, then load the LUMI GPU stack.
module --force purge
module load LUMI/23.09 partition/G
module load PrgEnv-cray
module load rocm
module load cray-mpich
module load cray-python

#export MPICH_SMP_SINGLE_COPY_MODE=NONE

# Prepend the Cray library directories to the runtime search path.
# An empty element in LD_LIBRARY_PATH (leading/trailing ':' or '::') makes the
# dynamic loader search the current directory, so only add the ':' separator
# when there is an existing value, and skip the update entirely when there is
# nothing to prepend.
if [ -n "${CRAY_LD_LIBRARY_PATH}" ]; then
    export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
fi

# GPU-aware MPI.
export MPICH_GPU_SUPPORT_ENABLED=1
# hipfort installation (the path name indicates a CCE 16.0.1 / ROCm 5.6.0 build).
export HIPFORT_DIR=/scratch/project_462000507/cce_16.0.1-rocm_5.6.0
#export HIPFORT_DIR=/opt/rocm/hipfort
# Point the rocFFT runtime-compile cache at /dev/null.
export ROCFFT_RTC_CACHE_PATH=/dev/null
45 changes: 45 additions & 0 deletions platform/exec/exec.LUMI
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#! /usr/bin/env bash
# GACODE Parallel execution script (LUMI)
#
# Usage: exec.LUMI <simdir> <nmpi> <exec> <nomp> <numa> <mpinuma>
#
# Changes to <simdir> and launches <exec> through srun, one wrapper process
# (platform/exec/wrap.$GACODE_PLATFORM) per MPI task, with explicit CPU masks.
# Exits with status 1 if <simdir> cannot be entered; otherwise the exit status
# is that of srun.

simdir=${1}
nmpi=${2}
exec=${3}
nomp=${4}
numa=${5}
mpinuma=${6}

# nmpi = MPI tasks
# nomp = OpenMP threads per MPI task
# numa = NUMAs active per node
# mpinuma = MPI tasks per active NUMA

. "$GACODE_ROOT/shared/bin/gacode_mpi_tool"

# Abort rather than launching the job from the wrong directory.
if ! cd "$simdir" ; then
    echo "ERROR: cannot change to simulation directory '$simdir'" >&2
    exit 1
fi

# NOTE(review): not referenced anywhere below in this script; kept in case
# other tooling relies on it.
proc_per_node=8

export MPICH_MAX_THREAD_SAFETY=funneled
export OMP_NUM_THREADS=$nomp
export OMP_STACKSIZE=400M
export MPICH_GPU_SUPPORT_ENABLED=1

#export SLURM_CPU_BIND="cores"
ulimit -c unlimited

#
# As recommended by
# https://docs.lumi-supercomputer.eu/runjobs/scheduled-jobs/lumig-job/
#
# Eight CPU masks, one per node-local task in the order listed. Each mask is
# 0xfe shifted by a multiple of 8 bits, i.e. seven cores of an 8-core group
# with the lowest core of the group left out.

CPU_BIND="mask_cpu:fe000000000000,fe00000000000000"
CPU_BIND="${CPU_BIND},fe0000,fe000000"
CPU_BIND="${CPU_BIND},fe,fe00"
CPU_BIND="${CPU_BIND},fe00000000,fe0000000000"

#echo "> srun -n$nmpi -c$nomp --gpus-per-task=1 --partition=standard-g --account=project_462000507 --gpu-bind=closest $exec"
#srun -n$nmpi -c$nomp --gpus-per-task=1 --account=project_462000507 --partition=standard-g --gres=gpu:4 --gpu-bind=closest $exec
#$exec

# $exec is left unquoted on purpose so that a command carrying arguments is
# still split into words (presumably how callers pass it -- confirm).
echo "> srun -n $nmpi --cpu-bind=${CPU_BIND} $GACODE_ROOT/platform/exec/wrap.${GACODE_PLATFORM} $exec"
srun -n "$nmpi" --cpu-bind="${CPU_BIND}" "$GACODE_ROOT/platform/exec/wrap.${GACODE_PLATFORM}" $exec
24 changes: 24 additions & 0 deletions platform/exec/wrap.LUMI
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#! /usr/bin/env bash
# GACODE Parallel execution script (LUMI)
#
# Per-task wrapper: selects a GPU from the task's node-local Slurm index,
# prints a one-line placement report, then replaces itself with the command
# given as arguments.

#
# As recommended by
# https://docs.lumi-supercomputer.eu/runjobs/scheduled-jobs/lumig-job/
#

export MPICH_GPU_SUPPORT_ENABLED=1

#env 1>&2

#echo $SLURM_LOCALID
# Use the node-local task index as the GPU index. Fall back to 0 when
# SLURM_LOCALID is unset or empty (e.g. when run outside srun), so that
# HIP_VISIBLE_DEVICES is never exported as an empty string.
# NOTE(review): HIP_VISIBLE_DEVICES limits the process to one device while
# ACC_DEVICE_NUM/OMP_DEFAULT_DEVICE carry the same (unrenumbered) index --
# confirm the runtimes resolve this as intended.
ACC_DEVICE_NUM=${SLURM_LOCALID:-0}
export ACC_DEVICE_NUM
export OMP_DEFAULT_DEVICE=${ACC_DEVICE_NUM}
export HIP_VISIBLE_DEVICES=${ACC_DEVICE_NUM}

# Placement report: host, global rank, local rank, chosen device, CPU affinity.
echo "$(uname -n) $SLURM_PROCID $SLURM_LOCALID $ACC_DEVICE_NUM $(taskset -pc $$)"
#echo "$(uname -n) $SLURM_LOCALID LL $LD_LIBRARY_PATH"

# no MPS
exec "$@"

0 comments on commit 8f30ce5

Please sign in to comment.