Merge branch 'master' into imas

gafusion · Sep 3, 2024 · 8f30ce5 · 8f30ce5
2 parents 0444b8f + 47d3dcf
commit 8f30ce5
Show file tree

Hide file tree

Showing 10 changed files with 170 additions and 24 deletions.
diff --git a/cgyro/install/make.ext.LUMI b/cgyro/install/make.ext.LUMI
@@ -0,0 +1,2 @@
+cgyro_nl_fftw.o : cgyro_nl_fftw.gpu.F90
+	$(FC) $(FMATH) $(FFLAGS) -o cgyro_nl_fftw.o -c cgyro_nl_fftw.gpu.F90
diff --git a/cgyro/src/cgyro_init_arrays.F90 b/cgyro/src/cgyro_init_arrays.F90
@@ -111,9 +111,7 @@ subroutine cgyro_init_arrays
 !$acc enter data copyin(jvec_c)
 #endif
 
-  do i_field=1,n_field
-     call parallel_lib_rtrans_real(jvec_c(i_field,:,:,:),jvec_v(i_field,:,:,:))
-  enddo
+  call parallel_lib_rtrans_real(jvec_c,jvec_v)
 
   if (nonlinear_flag == 1) then
 !

diff --git a/cgyro/src/cgyro_mpi_grid.F90 b/cgyro/src/cgyro_mpi_grid.F90
@@ -281,7 +281,7 @@ subroutine cgyro_mpi_grid
 
   ! ni -> nc
   ! nj -> nv  
-  call parallel_lib_init(nc,nv,nt1,nt_loc,nc_loc,nv_loc,NEW_COMM_1)
+  call parallel_lib_init(nc,nv,nt1,nt_loc,n_field,nc_loc,nv_loc,NEW_COMM_1)
 
   nv1 = 1+i_proc_1*nv_loc
   nv2 = (1+i_proc_1)*nv_loc

diff --git a/cgyro/src/cgyro_parallel_lib.F90 b/cgyro/src/cgyro_parallel_lib.F90
@@ -21,8 +21,10 @@ module parallel_lib
   integer, private :: nk1,nk2
   integer, private :: lib_comm
   integer, private :: nsend
+  integer, private :: n_field
+  integer, private :: nsend_real
 
-  real, dimension(:,:,:,:), allocatable, private :: fsendr_real
+  real, dimension(:,:,:,:,:), allocatable, private :: fsendr_real
 
   ! (expose these)
   complex, dimension(:,:,:,:), allocatable :: fsendf
@@ -146,14 +148,14 @@ module parallel_lib
   !  parallel_lib_r -> g(nj_loc,ni) -> f(ni_loc,nj)
   !=========================================================
 
-  subroutine parallel_lib_init(ni_in,nj_in,nk1_in,nk_loc_in,ni_loc_out,nj_loc_out,comm)
+  subroutine parallel_lib_init(ni_in,nj_in,nk1_in,nk_loc_in,n_field_in,ni_loc_out,nj_loc_out,comm)
 
     use mpi
 
     implicit none
 
     integer, intent(in) :: ni_in,nj_in
-    integer, intent(in) :: nk1_in,nk_loc_in
+    integer, intent(in) :: nk1_in,nk_loc_in,n_field_in
     integer, intent(in) :: comm
     integer, intent(inout) :: ni_loc_out,nj_loc_out
     integer, external :: parallel_dim
@@ -183,7 +185,10 @@ subroutine parallel_lib_init(ni_in,nj_in,nk1_in,nk_loc_in,ni_loc_out,nj_loc_out,
 
     allocate(fsendf(nj_loc,nk1:nk2,ni_loc,nproc))
     allocate(fsendr(ni_loc,nk1:nk2,nj_loc,nproc))
-    if (.not. allocated(fsendr_real)) allocate(fsendr_real(ni_loc,nk1:nk2,nj_loc,nproc))
+
+    n_field = n_field_in
+    nsend_real = n_field*nsend
+    if (.not. allocated(fsendr_real)) allocate(fsendr_real(n_field,ni_loc,nk1:nk2,nj_loc,nproc))
 
 #if defined(OMPGPU)
 !$omp target enter data map(alloc:fsendf,fsendr)
@@ -419,33 +424,35 @@ subroutine parallel_lib_rtrans_real(fin,f)
 
     implicit none
 
-    real, intent(in), dimension(:,:,:) :: fin
-    real, intent(inout), dimension(:,:,:) :: f
-    integer :: ierr,j_loc,i,j,k,j1,j2,itor
+    real, intent(in), dimension(:,:,:,:) :: fin
+    real, intent(inout), dimension(:,:,:,:) :: f
+    integer :: ierr,j_loc,i,j,k,j1,j2,itor,fi
 
     j1 = 1+iproc*nj_loc
     j2 = (1+iproc)*nj_loc
 
 !$omp parallel do collapse(2) if (size(fsendr_real) >= default_size) default(none) &
-!$omp& shared(nproc,j1,j2,ni_loc,nk1,nk2) &
-!$omp& private(j,j_loc,i) &
+!$omp& firstprivate(nproc,j1,j2,ni_loc,nk1,nk2,n_field) &
+!$omp& private(j,j_loc,i,fi) &
 !$omp& shared(fin,fsendr_real)
     do k=1,nproc
      do itor=nk1,nk2
        do j=j1,j2
           j_loc = j-j1+1
           do i=1,ni_loc
-             fsendr_real(i,itor,j_loc,k) = fin(i+(k-1)*ni_loc,j_loc,1+(itor-nk1)) 
+             do fi=1,n_field
+                fsendr_real(fi,i,itor,j_loc,k) = fin(fi,i+(k-1)*ni_loc,j_loc,1+(itor-nk1)) 
+             enddo
           enddo
        enddo
      enddo
     enddo
 
     call MPI_ALLTOALL(fsendr_real, &
-         nsend, &
+         nsend_real, &
          MPI_DOUBLE_PRECISION,&
          f, &
-         nsend, &
+         nsend_real, &
          MPI_DOUBLE_PRECISION, &
          lib_comm, &
          ierr)

diff --git a/platform/build/make.inc.FRONTIER b/platform/build/make.inc.FRONTIER
@@ -27,7 +27,7 @@ FDEBUG =-O0 -g
 F2PY   = f2py --fcompiler=pg
 
 # System math libraries
-LMATH=-L${ROCM_PATH}/lib -L${HIPFORT_DIR}/lib -lhipfort-amdgcn -lhipfft -lamdhip64
+LMATH=-L${HIPFORT_DIR}/lib -L${ROCM_PATH}/lib -lhipfort-amdgcn -lhipfft -lamdhip64
 
 # NetCDF
 NETCDF=-L${NETCDF_DIR}/lib -lnetcdff -lnetcdf

diff --git a/platform/build/make.inc.LUMI b/platform/build/make.inc.LUMI
@@ -0,0 +1,39 @@
+#----------------------------------------------------------
+# Cray (lumi.csc.fi) [GPU nodes]
+#
+# - 7*2*4 = 56 CPU cores (AMD Epyc) + 4x2 = 8 GPUs (MI250X)
+#----------------------------------------------------------
+
+IDENTITY="LUMI"
+CORES_PER_NODE=56
+NUMAS_PER_NODE=8
+
+# Fortran 90/95 compiler
+FC = ftn -J ${GACODE_ROOT}/modules
+#FC = /opt/rocm/llvm/bin/flang -J ${GACODE_ROOT}/modules
+# Fortran 77 compiler
+F77 = ${FC}
+
+# Compiler options/flags
+ifneq ($(GACODE_OMPGPU),1)
+FACC   =-hacc -DHIPGPU -DGACODE_GPU_AMD -I${HIPFORT_DIR}/include/hipfort/amdgcn -hacc_model=auto_async_none:no_fast_addr:no_deep_copy
+else
+FACC = -DOMPGPU -DHIPGPU -DGACODE_GPU_AMD -I${HIPFORT_DIR}/include/hipfort/amdgcn
+endif
+FOMP   =-homp
+FMATH  =-s real64
+FOPT   =-Ofast
+FDEBUG =-O0 -g
+F2PY   = f2py --fcompiler=pg
+
+
+# System math libraries
+LMATH=-L${ROCM_PATH}/lib -L${HIPFORT_DIR}/lib -lhipfort-amdgcn -lhipfft -lamdhip64
+
+# NetCDF
+#NETCDF=-L${NETCDF_DIR}/lib -lnetcdff -lnetcdf
+#NETCDF_INC = ${NETCDF_DIR}/include
+
+# Archive 
+ARCH = ar cr
+
diff --git a/platform/env/env.FRONTIER b/platform/env/env.FRONTIER
@@ -15,16 +15,16 @@ if [ -n "$SSH_TTY" ] ; then
  fi
 fi
 
-module load PrgEnv-cray
-module load craype-accel-amd-gfx90a
-module load rocm
-#module load cray-mpich
+module load cpe/23.09
+module load craype-accel-amd-gfx90a rocm
 module load cray-python
+module load cray-mpich
+module use /lustre/orion/stf243/world-shared/hagertnl/test_environment/modulefiles/
+module load hipfort/5.5.1
+
 
-#export MPICH_SMP_SINGLE_COPY_MODE=NONE
-module load cray-mpich/8.1.25
 export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"
 
 export MPICH_GPU_SUPPORT_ENABLED=1
-export HIPFORT_DIR=/lustre/orion/proj-shared/fus140/hipfort/cce_15.0.0-rocm_5.3.0
+export HIPFORT_DIR=${OLCF_HIPFORT_ROOT}
 export ROCFFT_RTC_CACHE_PATH=/dev/null
diff --git a/platform/env/env.LUMI b/platform/env/env.LUMI
@@ -0,0 +1,31 @@
+if [ -n "$SSH_TTY" ] ; then
+   echo "Setting up $GACODE_PLATFORM environment for gacode"
+fi
+
+if [ "x${GACODE_OMPGPU}" == "x" ]; then
+   # default to OpenACC if not defined
+   export GACODE_OMPGPU=0
+fi
+
+if [ -n "$SSH_TTY" ] ; then
+ if [ "x${GACODE_OMPGPU}" == "x1" ]; then
+   echo "Using OMPGPU offload setup"
+ else
+   echo "Using OpenACC offload setup"
+ fi
+fi
+
+module --force purge
+module load LUMI/23.09 partition/G 
+module load PrgEnv-cray
+module load rocm
+module load cray-mpich
+module load cray-python
+
+#export MPICH_SMP_SINGLE_COPY_MODE=NONE
+export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"
+
+export MPICH_GPU_SUPPORT_ENABLED=1
+export HIPFORT_DIR=/scratch/project_462000507/cce_16.0.1-rocm_5.6.0
+#export HIPFORT_DIR=/opt/rocm/hipfort
+export ROCFFT_RTC_CACHE_PATH=/dev/null
diff --git a/platform/exec/exec.LUMI b/platform/exec/exec.LUMI
@@ -0,0 +1,45 @@
+#! /usr/bin/env bash 
+# GACODE Parallel execution script
+
+simdir=${1}
+nmpi=${2}
+exec=${3}
+nomp=${4}
+numa=${5}
+mpinuma=${6}
+
+# nmpi = MPI tasks
+# nomp = OpenMP threads per MPI task
+# numa = NUMAs active per node
+# mpinuma = MPI tasks per active NUMA 
+
+. $GACODE_ROOT/shared/bin/gacode_mpi_tool
+
+cd $simdir
+
+let proc_per_node=8
+
+export MPICH_MAX_THREAD_SAFETY=funneled
+export OMP_NUM_THREADS=$nomp
+export OMP_STACKSIZE=400M
+export MPICH_GPU_SUPPORT_ENABLED=1
+
+#export SLURM_CPU_BIND="cores"
+ulimit -c unlimited
+
+#
+# As recommended by
+# https://docs.lumi-supercomputer.eu/runjobs/scheduled-jobs/lumig-job/
+#
+
+CPU_BIND="mask_cpu:fe000000000000,fe00000000000000"
+CPU_BIND="${CPU_BIND},fe0000,fe000000"
+CPU_BIND="${CPU_BIND},fe,fe00"
+CPU_BIND="${CPU_BIND},fe00000000,fe0000000000"
+
+#echo "> srun -n$nmpi -c$nomp --gpus-per-task=1 --partition=standard-g --account=project_462000507 --gpu-bind=closest $exec"
+#srun -n$nmpi -c$nomp --gpus-per-task=1 --account=project_462000507 --partition=standard-g --gres=gpu:4 --gpu-bind=closest $exec
+#$exec
+
+echo "> srun -n $nmpi --cpu-bind=${CPU_BIND} $GACODE_ROOT/platform/exec/wrap.${GACODE_PLATFORM} $exec"
+srun -n $nmpi --cpu-bind=${CPU_BIND} $GACODE_ROOT/platform/exec/wrap.${GACODE_PLATFORM} $exec
diff --git a/platform/exec/wrap.LUMI b/platform/exec/wrap.LUMI
@@ -0,0 +1,24 @@
+#! /usr/bin/env bash
+# GACODE Parallel execution script (LUMI)
+
+#
+# As recommended by
+# https://docs.lumi-supercomputer.eu/runjobs/scheduled-jobs/lumig-job/
+#
+
+export MPICH_GPU_SUPPORT_ENABLED=1
+
+#env 1>&2
+
+#echo $SLURM_LOCALID
+let ACC_DEVICE_NUM=$SLURM_LOCALID
+export ACC_DEVICE_NUM
+export OMP_DEFAULT_DEVICE=${ACC_DEVICE_NUM}
+export HIP_VISIBLE_DEVICES=${ACC_DEVICE_NUM}
+
+echo "`uname -n` $SLURM_PROCID $SLURM_LOCALID $ACC_DEVICE_NUM `taskset -pc $$`"
+#echo "`uname -n` $SLURM_LOCALID LL $LD_LIBRARY_PATH"
+
+# no MPS
+exec  "$@"
+
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		cgyro_nl_fftw.o : cgyro_nl_fftw.gpu.F90
		$(FC) $(FMATH) $(FFLAGS) -o cgyro_nl_fftw.o -c cgyro_nl_fftw.gpu.F90