From d0a5243f90ba0ba7999f77c13870caa04d5dbfcc Mon Sep 17 00:00:00 2001
From: Klaus Hallatschek <Klaus.Hallatschek@ipp.mpg.de>
Date: Fri, 6 Sep 2024 20:39:52 +0200
Subject: [PATCH 1/9] Loop adapted to strict compiler scrutiny

in DIMATCOPY opt out case. (Was previously traditional
fortran practice.)
---
 cgyro/src/cgyro_init_collision_landau.F90 | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/cgyro/src/cgyro_init_collision_landau.F90 b/cgyro/src/cgyro_init_collision_landau.F90
index 836e287db..3387f624e 100644
--- a/cgyro/src/cgyro_init_collision_landau.F90
+++ b/cgyro/src/cgyro_init_collision_landau.F90
@@ -731,9 +731,13 @@ subroutine cgyro_init_landau()
 #endif
 #else
           !alternative to domatcopy
-          do k=1,n_xi*n_energy
-             do j=1,n_xi*n_energy
-                gyrocolmat(j,1,k,1,ib,ia,ik)=gyrocolmat(k,1,j,1,ia,ib,ik)
+          do k=1,n_xi
+             do l=1,n_energy
+                do j=1,n_xi
+                   do m=1,n_energy
+                      gyrocolmat(m,j,l,k,ib,ia,ik)=gyrocolmat(l,k,m,j,ia,ib,ik)
+                   end do
+                end do
              end do
           end do
 #endif

From 21a271e7f87f7a19a914a7c46a47693e3e5296f2 Mon Sep 17 00:00:00 2001
From: Klaus Hallatschek <Klaus.Hallatschek@ipp.mpg.de>
Date: Thu, 7 Nov 2024 14:30:59 +0100
Subject: [PATCH 2/9] std=f2008 for debug option for tumbleweed

---
 platform/build/make.inc.TUMBLEWEED | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/platform/build/make.inc.TUMBLEWEED b/platform/build/make.inc.TUMBLEWEED
index c502f5b36..5fc8e26c5 100644
--- a/platform/build/make.inc.TUMBLEWEED
+++ b/platform/build/make.inc.TUMBLEWEED
@@ -14,14 +14,14 @@ NUMAS_PER_NODE=1
 
 # Compilers
 
-FC     = mpifort -DDIMATCOPY -std=gnu -fallow-argument-mismatch -fall-intrinsics -fimplicit-none -J $(GACODE_ROOT)/modules #-fPIC
+FC     = mpifort -DDIMATCOPY -fallow-argument-mismatch -fall-intrinsics -fimplicit-none -J $(GACODE_ROOT)/modules #-fPIC
 #FC     = mpifort -std=f2008 -fall-intrinsics -fimplicit-none -J $(GACODE_ROOT)/modules #-fPIC
 F77    = mpifort -fimplicit-none -fallow-argument-mismatch
 FOMP   =-fopenmp
 FMATH  =-fdefault-real-8 -fdefault-double-8 -I$(FFTW_INC)
-FOPT   =-O3 -fexternal-blas -march=native -ffree-line-length-0
+FOPT   =-O3 -std=gnu -fexternal-blas -march=native -ffree-line-length-0
 # -floop-nest-optimize -floop-parallelize-all -flto
-FDEBUG =-Wall -W -fcheck=all -g -fbacktrace  -ffpe-trap=invalid,zero,overflow -finit-real=snan # -std=f2003 -fall-intrinsics
+FDEBUG =-Wall -W -fcheck=all -g -fbacktrace  -ffpe-trap=invalid,zero,overflow -finit-real=snan -std=f2008 -fall-intrinsics
 F2PY   = f2py
 
 # System math libraries

From 68084a1c44fe8f60e8a3f68de25a9163df88274d Mon Sep 17 00:00:00 2001
From: Klaus Hallatschek <Klaus.Hallatschek@ipp.mpg.de>
Date: Thu, 7 Nov 2024 18:31:31 +0100
Subject: [PATCH 3/9] no unused variable, rel comp, int div warning

---
 platform/build/make.inc.TUMBLEWEED | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/platform/build/make.inc.TUMBLEWEED b/platform/build/make.inc.TUMBLEWEED
index 5fc8e26c5..19712de78 100644
--- a/platform/build/make.inc.TUMBLEWEED
+++ b/platform/build/make.inc.TUMBLEWEED
@@ -21,7 +21,7 @@ FOMP   =-fopenmp
 FMATH  =-fdefault-real-8 -fdefault-double-8 -I$(FFTW_INC)
 FOPT   =-O3 -std=gnu -fexternal-blas -march=native -ffree-line-length-0
 # -floop-nest-optimize -floop-parallelize-all -flto
-FDEBUG =-Wall -W -fcheck=all -g -fbacktrace  -ffpe-trap=invalid,zero,overflow -finit-real=snan -std=f2008 -fall-intrinsics
+FDEBUG =-Wall -W -fcheck=all -Wno-compare-reals  -Wno-integer-division -Wno-unused-variable -g -fbacktrace  -ffpe-trap=invalid,zero,overflow -finit-real=snan -std=f2008 -fall-intrinsics
 F2PY   = f2py
 
 # System math libraries

From 88ba3b6cb379eddfc5fb229c77667cf3b9c329a4 Mon Sep 17 00:00:00 2001
From: Klaus Hallatschek <Klaus.Hallatschek@ipp.mpg.de>
Date: Thu, 7 Nov 2024 18:41:26 +0100
Subject: [PATCH 4/9] liquidat. obsol. statement fct in half_hermite.f90

---
 shared/math/half_hermite.f90 | 48 +++++++++++++++++++++++++++---------
 1 file changed, 37 insertions(+), 11 deletions(-)

diff --git a/shared/math/half_hermite.f90 b/shared/math/half_hermite.f90
index f81a47b85..e294605cf 100644
--- a/shared/math/half_hermite.f90
+++ b/shared/math/half_hermite.f90
@@ -90,7 +90,6 @@ subroutine half_hermite_norm(n,xmin,xmax,sign,alpha,a1,b1,c1,a,bsq,logg,verbos&
     real :: t00,t10
     real, parameter :: mori=.5,epsmori=1e-6,epsbase=5e-16,epsgauss=1e-18
     integer ngauss
-    real w,sech2,sech,sechb,tanhp1,tanhm1,tanhb
     real eps
     integer i,j,k,ia
     integer leftnew,rightnew ! How many points of the left or right summation have to be computed
@@ -164,15 +163,6 @@ subroutine half_hermite_norm(n,xmin,xmax,sign,alpha,a1,b1,c1,a,bsq,logg,verbos&
     ! n(i+1)=|-a(i)n(i)pn(i,x=0)-b(i)n(i-1)pn(i,x=0)|
     ! But we normalise to <P^2>
 
-
-    w(x)=x**alpha*exp(-sign*x*x) ! inline function for weight
-    sechb(x)=2.*x/(1.+x*x)
-    sech(x)=sechb(exp(-x))
-    sech2(x)=sech(abs(x))**2
-    tanhb(x)=2*x/(1+x)
-    tanhp1(x)=tanhb(exp(2*x))   ! tanh(x)+1 for x<0 approx. 2*exp(2*x) for x<<-1
-    tanhm1(x)=-tanhb(exp(-2*x)) ! tanh(x)-1 for x>0
-
     if (n==0) return
     nn=n
     vb=present(verbose)
@@ -182,7 +172,7 @@ subroutine half_hermite_norm(n,xmin,xmax,sign,alpha,a1,b1,c1,a,bsq,logg,verbos&
     t00=t0
     t10=t1
     if (vb) then
-       ngauss=n+xmax**2+xmax*sqrt(-2*log(epsgauss))-log(epsgauss)/3
+       ngauss=n+floor(xmax**2+xmax*sqrt(-2*log(epsgauss))-log(epsgauss)/3)
        ngauss=n+ceiling(10*xmax)+ceiling(alpha/2)
        print *,'ngauss=',ngauss
        allocate(gw(ngauss),gp(ngauss),gw2(ngauss*2),gp2(ngauss*2))
@@ -671,6 +661,42 @@ subroutine half_hermite_norm(n,xmin,xmax,sign,alpha,a1,b1,c1,a,bsq,logg,verbos&
     enddo
     if (vb) print *,'hhn used mem',2*(nsafe+nintervals)
     deallocate(mw,mp,poly,poly1) !deallocation may be necessary because these are pointers.
+  contains
+    elemental real function w(x)
+      implicit none
+      real,intent(in) :: x
+      w=x**alpha*exp(-sign*x*x) ! inline function for weight
+    end function w
+    elemental real function sechb(x)
+      implicit none
+      real,intent(in) :: x
+      sechb=2.*x/(1.+x*x)
+    end function sechb
+    elemental real function sech(x)
+      implicit none
+      real,intent(in) :: x
+      sech=sechb(exp(-x))
+    end function sech
+    elemental real function sech2(x)
+      implicit none
+      real,intent(in) :: x
+      sech2=sech(abs(x))**2
+    end function sech2
+    elemental real function tanhb(x)
+      implicit none
+      real,intent(in) :: x
+      tanhb=2*x/(1+x)
+    end function tanhb
+    elemental real function tanhp1(x)
+      implicit none
+      real,intent(in) :: x
+      tanhp1=tanhb(exp(2*x))   ! tanh(x)+1 for x<0 approx. 2*exp(2*x) for x<<-1
+    end function tanhp1
+    elemental real function tanhm1(x)
+      implicit none
+      real,intent(in) :: x
+      tanhm1=-tanhb(exp(-2*x)) ! tanh(x)-1 for x>0
+    end function tanhm1
   end subroutine half_hermite_norm
 
   subroutine gauss_nodes_functions(n,a,bsq,sp,ortho_f)

From b7f0217ee22528013e082d62c982fa4d88ce49d7 Mon Sep 17 00:00:00 2001
From: Klaus Hallatschek <Klaus.Hallatschek@ipp.mpg.de>
Date: Thu, 7 Nov 2024 18:32:47 +0100
Subject: [PATCH 5/9] liquid. obsolescent statement fct in landau.F90

---
 shared/landau/landau.F90 | 126 +++++++++++++++++++--------------------
 1 file changed, 63 insertions(+), 63 deletions(-)

diff --git a/shared/landau/landau.F90 b/shared/landau/landau.F90
index 6532f9b88..a305eed95 100644
--- a/shared/landau/landau.F90
+++ b/shared/landau/landau.F90
@@ -1,26 +1,28 @@
 !  gfortran -march=native -O3 -fno-stack-arrays -fimplicit-none -fdefault-real-8 landau.F90  -c
 !intel:
 !ifort -stand f15 -warn all -march=native -O3 -heap-arrays 10 -implicitnone -real-size 64 landau.F90 -c
-module landau
 #ifndef __PGI
   !can't use quad precision with frumpy pgi compiler
-  real(16), parameter,private :: pi1=atan(1._16)*4
+#define HIREAL real(16)
+#define HRS _16
+#else
+#define HIREAL real
+#define HRS
+#endif
+module landau
+  HIREAL, parameter,private :: pi1=atan(1.HRS)*4
   ! real, parameter :: intnorm=pi1**(-1.5)*4*pi1
   ! int gphys(v) d^3v=1=int g(x) x^2 dx *intnor
   ! Maybe one should normalise this once for a given xmax and density 1?
-  real, parameter :: vunit=sqrt(2._16)
+  real, parameter :: vunit=real(sqrt(2.HRS))
   ! v*vunit is v in units of sqrt(T/m)
   ! v itself is v in units of sqrt(2T/m) ("collision operator units")
-#else
-  real, parameter,private :: pi1=atan(1.)*4
-  real, parameter :: vunit=sqrt(2.)
-#endif
-  real, parameter :: intnorm=4/sqrt(pi1)
-  real, parameter :: normcol=sqrt(8/pi1)
+  real, parameter :: intnorm=real(4/sqrt(pi1))
+  real, parameter :: normcol=real(sqrt(8/pi1))
   ! Normalisation of ctest/emat cfield/emat for the self-collisions of the
   ! reference species without caa or cab.
   ! Note: must take into account muref^3=sqrt(1/8), since mu^2=m/(2T)
-  real, parameter :: normcolcgyro=4/sqrt(pi1)
+  real, parameter :: normcolcgyro=real(4/sqrt(pi1))
 
   integer :: verbose=0
 contains
@@ -56,7 +58,7 @@ real function tau_ab_inv(mua,mub,nb,za,zb,Ta,Tb)
     if (present(nb)) nb1=nb
     if (present(mua)) mua1=mua
     if (present(mub)) mub1=mub
-    tau_ab_inv=sqrt(8./pi1)/3*za1**2*zb1**2*nb1/(mua1*Ta1**2)*&
+    tau_ab_inv=real(sqrt(8./pi1))/3*za1**2*zb1**2*nb1/(mua1*Ta1**2)*&
          (1+mua1**2*Ta1/(mub1**2*Tb1))/(1+mua1**2/(mub1**2))**1.5
   end function tau_ab_inv
     
@@ -71,36 +73,13 @@ subroutine gentestkernel(n,a1,b1,c1,xmax,beta,gp,gw,ngauss,lor_int,dif_int,addcu
     optional :: t1t2_int ! only calculate that term if it is present.
     optional :: addcutoff
     !allocate(lor_int(n,n),dif_int(n,n))
-    real p0,p1,pi,q0,q1,qi,r0,r1,ri,s0,s1,si,x,x1,xmax1
+    real p0,p1,pi,q0,q1,qi,r0,r1,ri,s0,s1,si,x,xmax1
     real f1,f2
     integer i,j,k
-#ifndef __PGI
-    real(16) fct1,fct2,fct1lo,fct1hi,fct2lo,fct2hi !inline functions
-    real(16) w,xmaxx
-#else
-    real fct1,fct2,fct1lo,fct1hi,fct2lo,fct2hi !inline functions
-    real w,xmaxx
-#endif
+    HIREAL xmaxx,x1
     logical t1t2,addcutoff1
     real t1,t2 ! for timing
-#ifndef __PGI
-    fct1(w)=.125_16*w**(-3)*(exp(-w*w)*w+(2*w*w-1)*.5_16*sqrt(pi1)*erf(w))
-    fct2(w)=.25_16*w**(-1)*(.5_16*sqrt(pi1)*erf(w)-w*exp(-w**2))
-    fct2hi(w)=-w**(-1)*((1._16/12)*xmaxx*exp(-xmaxx**2)*(3+2*xmaxx**2)-(sqrt(pi1)/8)*erf(xmaxx))
-#else
-    fct1(w)=.125*w**(-3)*(exp(-w*w)*w+(2*w*w-1)*.5*sqrt(pi1)*erf(w))
-    fct2(w)=.25*w**(-1)*(.5*sqrt(pi1)*erf(w)-w*exp(-w**2))
-    fct2hi(w)=-w**(-1)*((1./12)*xmaxx*exp(-xmaxx**2)*(3+2*xmaxx**2)-(sqrt(pi1)/8)*erf(xmaxx))
-#endif
-    fct1lo(w)=fct1(w)-exp(-xmaxx**2)/6  ! for w<xmax
-    fct2lo(w)=fct2(w)-w*w*exp(-xmaxx**2)/6 !for w<xmax
-    !  fct1hi(w)=w**(-3)*((1./24)*exp(-xmaxx**2)*xmaxx*(3-6*w**2+2*xmaxx**2)+(sqrt(pi1)/16)*(2*w**2-1)*erf(xmaxx))
-    fct1hi(w)=-w**(-3)/48*((-2*exp(-xmaxx**2)*xmaxx*(3+2*xmaxx**2)+3*sqrt(pi1)*erf(xmaxx))+&
-         w**2*(12*exp(-xmaxx**2)*xmaxx-6*sqrt(pi1)*erf(xmaxx)))
 
-    !Inaccuracy for
-    !./hhv 5 .5 0 50 1 1
-    !was due to inaccuracies in this routine, likely due to cancellations in the fcts.
     xmaxx=xmax
     t1t2=present(t1t2_int)
     addcutoff1=.true.
@@ -197,23 +176,13 @@ subroutine gentestkernel(n,a1,b1,c1,xmax,beta,gp,gw,ngauss,lor_int,dif_int,addcu
     do k=1,ngauss
        x=gp(k)*xmax1
        x1=x*beta
-#ifndef __PGI
-       if (addcutoff1) then
-          f1=fct1lo(real(x1,16))
-          f2=fct2lo(real(x1,16))
-       else
-          f1=fct1(real(x1,16))
-          f2=fct2(real(x1,16))
-       endif
-#else
        if (addcutoff1) then
-          f1=fct1lo(x1)
-          f2=fct2lo(x1)
+          f1=real(fct1lo(x1))
+          f2=real(fct2lo(x1))
        else
-          f1=fct1(x1)
-          f2=fct2(x1)
+          f1=real(fct1(x1))
+          f2=real(fct2(x1))
        endif
-#endif
        do j=1,n
           if (j==1) then
              q1=0
@@ -234,7 +203,7 @@ subroutine gentestkernel(n,a1,b1,c1,xmax,beta,gp,gw,ngauss,lor_int,dif_int,addcu
           s1=0
           si=0
           lor_int(j,i)=lor_int(j,i)+pi*ri*f1
-          if (t1t2) t1t2_int(j,i)=t1t2_int(j,i)+2*qi*ri*x1*f2
+          if (t1t2) t1t2_int(j,i)=t1t2_int(j,i)+2*qi*ri*real(x1)*f2
           ! ^^ vgl 1.8.19 (2)
           do i=2,n
              s0=si
@@ -245,21 +214,16 @@ subroutine gentestkernel(n,a1,b1,c1,xmax,beta,gp,gw,ngauss,lor_int,dif_int,addcu
              r1=r0
              lor_int(j,i)=lor_int(j,i)+pi*ri*f1
              dif_int(j,i)=dif_int(j,i)+qi*si*f2
-             if (t1t2) t1t2_int(j,i)=t1t2_int(j,i)+2*qi*ri*x1*f2
+             if (t1t2) t1t2_int(j,i)=t1t2_int(j,i)+2*qi*ri*real(x1)*f2
           enddo
        enddo
     enddo
     if (addcutoff1 .and. beta>1) then
        do k=1,ngauss
           x1=xmax*(1+(beta-1)*gp(k))
-          x=x1/beta
-#ifndef __PGI
-          f1=fct1hi(real(x1,16))
-          f2=fct2hi(real(x1,16))
-#else
-          f1=fct1hi(x1)
-          f2=fct2hi(x1)
-#endif
+          x=real(x1)/beta
+          f1=real(fct1hi(x1))
+          f2=real(fct2hi(x1))
 
           do j=1,n
              if (j==1) then
@@ -281,7 +245,7 @@ subroutine gentestkernel(n,a1,b1,c1,xmax,beta,gp,gw,ngauss,lor_int,dif_int,addcu
              s1=0
              si=0
              lor_int(j,i)=lor_int(j,i)+pi*ri*f1
-             if (t1t2) t1t2_int(j,i)=t1t2_int(j,i)+2*qi*ri*x1*f2
+             if (t1t2) t1t2_int(j,i)=t1t2_int(j,i)+2*qi*ri*real(x1)*f2
              ! ^^ vgl 1.8.19 (2)
              do i=2,n
                 s0=si
@@ -292,7 +256,7 @@ subroutine gentestkernel(n,a1,b1,c1,xmax,beta,gp,gw,ngauss,lor_int,dif_int,addcu
                 r1=r0
                 lor_int(j,i)=lor_int(j,i)+pi*ri*f1
                 dif_int(j,i)=dif_int(j,i)+qi*si*f2
-                if (t1t2) t1t2_int(j,i)=t1t2_int(j,i)+2*qi*ri*x1*f2
+                if (t1t2) t1t2_int(j,i)=t1t2_int(j,i)+2*qi*ri*real(x1)*f2
              enddo
           enddo
        enddo
@@ -320,8 +284,44 @@ subroutine gentestkernel(n,a1,b1,c1,xmax,beta,gp,gw,ngauss,lor_int,dif_int,addcu
     ! i.e. we multiply with beta!!
     if (t1t2) t1t2_int=t1t2_int*beta**(-4) ! ***<--- must be multiplied with (Ta/Tb-1)
     call cpu_time(t2)
-    
+
     if (verbose>0) print '(A,G13.3)','gentestkernel took ',t2-t1
+  contains
+    elemental HIREAL function fct1(w)
+      implicit none
+      HIREAL,intent(in) :: w
+      fct1=.125HRS*w**(-3)*(exp(-w*w)*w+(2*w*w-1)*.5HRS*sqrt(pi1)*erf(w))
+    end function fct1
+    elemental HIREAL function fct2(w)
+      implicit none
+      HIREAL,intent(in) :: w
+      fct2=.25HRS*w**(-1)*(.5HRS*sqrt(pi1)*erf(w)-w*exp(-w**2))
+    end function fct2
+    elemental HIREAL function fct1lo(w)
+      implicit none
+      HIREAL,intent(in) :: w
+      fct1lo=fct1(w)-exp(-xmaxx**2)/6  ! for w<xmax
+    end function fct1lo
+    elemental HIREAL function fct1hi(w)
+      implicit none
+      HIREAL,intent(in) :: w
+      !  fct1hi(w)=w**(-3)*((1./24)*exp(-xmaxx**2)*xmaxx*(3-6*w**2+2*xmaxx**2)+(sqrt(pi1)/16)*(2*w**2-1)*erf(xmaxx))
+      fct1hi=-w**(-3)/48*((-2*exp(-xmaxx**2)*xmaxx*(3+2*xmaxx**2)+3*sqrt(pi1)*erf(xmaxx))+&
+           w**2*(12*exp(-xmaxx**2)*xmaxx-6*sqrt(pi1)*erf(xmaxx)))
+      !Inaccuracy for
+      !./hhv 5 .5 0 50 1 1
+      !was due to inaccuracies in this routine, likely due to cancellations in the fcts.
+    end function fct1hi
+    elemental HIREAL function fct2lo(w)
+      implicit none
+      HIREAL,intent(in) :: w
+      fct2lo=fct2(w)-w*w*exp(-xmaxx**2)/6 !for w<xmax
+    end function fct2lo
+    elemental HIREAL function fct2hi(w)
+      implicit none
+      HIREAL,intent(in) :: w
+      fct2hi=-w**(-1)*((1.HRS/12)*xmaxx*exp(-xmaxx**2)*(3+2*xmaxx**2)-(sqrt(pi1)/8)*erf(xmaxx))
+    end function fct2hi
   end subroutine gentestkernel
 
   subroutine genintkernel(n,lmax,a1,b1,c1,xmax,beta,t1t2ratio,gp,gw,ngauss,gp2,gw2,ng2,addcutoff,intkernel,intlokernel)

From e7ff35ebf2da7934c179f2eae11105e751b5180b Mon Sep 17 00:00:00 2001
From: Klaus Hallatschek <Klaus.Hallatschek@ipp.mpg.de>
Date: Sat, 23 Nov 2024 10:00:22 +0100
Subject: [PATCH 6/9] c._in._coll._landau.F90: MPI_COMM..->CGYRO_COMM..

---
 cgyro/src/cgyro_init_collision_landau.F90 | 46 +++++++++++------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/cgyro/src/cgyro_init_collision_landau.F90 b/cgyro/src/cgyro_init_collision_landau.F90
index 3387f624e..ff58c8e72 100644
--- a/cgyro/src/cgyro_init_collision_landau.F90
+++ b/cgyro/src/cgyro_init_collision_landau.F90
@@ -25,7 +25,7 @@ module cgyro_init_collision_landau
   subroutine cgyro_init_landau()
     ! populate cmat with Galerkin based gyrokinetic Landau operator.
     ! cmat1 is only for comparison purposes
-    use cgyro_globals, only : vth,temp,mass,dens,temp_ele,mass_ele,dens_ele,rho,z,&
+    use cgyro_globals, only : CGYRO_COMM_WORLD,vth,temp,mass,dens,temp_ele,mass_ele,dens_ele,rho,z,&
          n_energy,e_max,n_xi,n_radial,n_theta,n_species,n_toroidal,nt1,nt2,nc_loc,nc1,nc2,nc,nv,&
          nu_ee,&
          xi,w_xi,& !needed for projleg calc
@@ -119,7 +119,7 @@ subroutine cgyro_init_landau()
       gtvb=1
     end if
 
-    !$    call MPI_Barrier(MPI_COMM_WORLD,ierror) ! may improve timing
+    !$    call MPI_Barrier(CGYRO_COMM_WORLD,ierror) ! may improve timing
     call cpu_time(t1)
     ns=ispec(n_species,n_species) !number of non-redundant species pairs
     xmax=sqrt(e_max) !cut off at exp(-xmax^2)
@@ -146,7 +146,7 @@ subroutine cgyro_init_landau()
     end do
     ! kperp_bmag_max is not completely global, there is still the n dependence.
     ! we need to maximize over the toroidal mode numbers:
-    call MPI_ALLREDUCE(MPI_IN_PLACE,kperp_bmag_max,1,MPI_REAL8,MPI_MAX,MPI_COMM_WORLD,ierror)
+    call MPI_ALLREDUCE(MPI_IN_PLACE,kperp_bmag_max,1,MPI_REAL8,MPI_MAX,CGYRO_COMM_WORLD,ierror)
     rhomax=maxval(abs(rho_spec([(i,i=1,n_species)])))*xmax
     kperprhomax=kperp_bmag_max*rhomax
     if (verbose>0 .and. i_proc==0) print 6,'using kperprhomax=',kperprhomax
@@ -911,13 +911,13 @@ subroutine cgyro_init_landau()
        print 7,'pre_scatter timing:'
        do i=1,n_proc
           if (i>1) then
-             call MPI_Recv(t,11,MPI_REAL8,i-1,i-1,MPI_COMM_WORLD,status,ierror)
+             call MPI_Recv(t,11,MPI_REAL8,i-1,i-1,CGYRO_COMM_WORLD,status,ierror)
           end if
 5         format("init_collision_landau: ",A,I0,A,7G24.16E3,A,I0,A,G24.16E3)
           print 5,'i_proc=',i-1,' took',t(1:7),' load ',load(i),' rel',t(3)/load(i)
        end do
     else
-       call MPI_Send(t,11,MPI_REAL8,0,i_proc,MPI_COMM_WORLD,ierror)
+       call MPI_Send(t,11,MPI_REAL8,0,i_proc,CGYRO_COMM_WORLD,ierror)
     end if
     call cpu_time(t1)
     ! Now do the scatter
@@ -931,17 +931,17 @@ subroutine cgyro_init_landau()
        ib=idx+1
        if (proc(ik,ia,ib)/=0) then
 !!$          do j=1,n_proc
-!!$             call MPI_BARRIER(MPI_COMM_WORLD,ierror)
+!!$             call MPI_BARRIER(CGYRO_COMM_WORLD,ierror)
 !!$!          if (i_proc==0 .and. verbose>100) then
 !!$             if (i_proc==j-1) print *,'bcasting (ik,ia,ib,proc)=',ik,ia,ib,proc(ik,ia,ib)-1,'ip',i_proc
 !!$          !          end if
-!!$             call MPI_BARRIER(MPI_COMM_WORLD,ierror)
+!!$             call MPI_BARRIER(CGYRO_COMM_WORLD,ierror)
 !!$          end do
           call MPI_Bcast(gyrocolmat(:,:,:,:,ia,ib,ik),n_xi**2*n_energy**2,&
-               MPI_REAL8,proc(ik,ia,ib)-1,MPI_COMM_WORLD,ierror)
+               MPI_REAL8,proc(ik,ia,ib)-1,CGYRO_COMM_WORLD,ierror)
           if (ia>ib .and. temp(ia)==temp(ib)) then
              call MPI_Bcast(gyrocolmat(:,:,:,:,ib,ia,ik),n_xi**2*n_energy**2,&
-                  MPI_REAL8,proc(ik,ia,ib)-1,MPI_COMM_WORLD,ierror)
+                  MPI_REAL8,proc(ik,ia,ib)-1,CGYRO_COMM_WORLD,ierror)
           end if
        end if
     enddo
@@ -1003,12 +1003,12 @@ subroutine cgyro_init_landau()
     if (i_proc==0) then
        do i=1,n_proc
           if (i>1) then
-             call MPI_Recv(t,11,MPI_REAL8,i-1,i-1,MPI_COMM_WORLD,status,ierror)
+             call MPI_Recv(t,11,MPI_REAL8,i-1,i-1,CGYRO_COMM_WORLD,status,ierror)
           end if
           print *,'i_proc=',i-1,'took',t(1:10),'load',load(i),'rel',t(3)/load(i)
        end do
     else
-       call MPI_Send(t,11,MPI_REAL8,0,i_proc,MPI_COMM_WORLD,ierror)
+       call MPI_Send(t,11,MPI_REAL8,0,i_proc,CGYRO_COMM_WORLD,ierror)
     end if
     
     coltestmode: if(collision_test_mode==1) then
@@ -1036,10 +1036,10 @@ subroutine cgyro_init_landau()
        nt2_proc(i_proc+1)=nt2
        proc_c=0 ! dummy value if no processor is responsible
        do i=1,n_proc
-          call MPI_BCAST(nc1_proc(i),1,MPI_INTEGER,i-1,MPI_COMM_WORLD,ierror)
-          call MPI_BCAST(nc2_proc(i),1,MPI_INTEGER,i-1,MPI_COMM_WORLD,ierror)
-          call MPI_BCAST(nt1_proc(i),1,MPI_INTEGER,i-1,MPI_COMM_WORLD,ierror)
-          call MPI_BCAST(nt2_proc(i),1,MPI_INTEGER,i-1,MPI_COMM_WORLD,ierror)
+          call MPI_BCAST(nc1_proc(i),1,MPI_INTEGER,i-1,CGYRO_COMM_WORLD,ierror)
+          call MPI_BCAST(nc2_proc(i),1,MPI_INTEGER,i-1,CGYRO_COMM_WORLD,ierror)
+          call MPI_BCAST(nt1_proc(i),1,MPI_INTEGER,i-1,CGYRO_COMM_WORLD,ierror)
+          call MPI_BCAST(nt2_proc(i),1,MPI_INTEGER,i-1,CGYRO_COMM_WORLD,ierror)
           ! this assigns the processor with the highest number to the respective
           ! nc and nt range
           proc_c(nc1_proc(i):nc2_proc(i),nt1_proc(i):nt2_proc(i))=i
@@ -1094,11 +1094,11 @@ subroutine cgyro_init_landau()
                               **2,L2xi,n_xi,0.,c(:,:,:,:,l),n_xi*n_energy**2)
                       end do
                       if (i_proc/=0) then
-                         call MPI_SEND(c,size(c),MPI_REAL8,0,1234,MPI_COMM_WORLD,ierror)
+                         call MPI_SEND(c,size(c),MPI_REAL8,0,1234,CGYRO_COMM_WORLD,ierror)
                       end if
                    else
                       if (i_proc==0) then
-                         call MPI_RECV(c,size(c),MPI_REAL8,proc_c(ic,itor)-1,1234,MPI_COMM_WORLD,status,ierror)
+                         call MPI_RECV(c,size(c),MPI_REAL8,proc_c(ic,itor)-1,1234,CGYRO_COMM_WORLD,status,ierror)
                       end if
                    end if
 !!$                  block
@@ -1227,11 +1227,11 @@ subroutine cgyro_init_landau()
                               **2,L2xi,n_xi,0.,c(:,:,:,:,l),n_xi*n_energy**2)
                       end do
                       if (i_proc/=0) then
-                         call MPI_SEND(c,size(c),MPI_REAL8,0,1234,MPI_COMM_WORLD,ierror)
+                         call MPI_SEND(c,size(c),MPI_REAL8,0,1234,CGYRO_COMM_WORLD,ierror)
                       end if
                    else
                       if (i_proc==0) then
-                         call MPI_RECV(c,size(c),MPI_REAL8,proc_c(ic,itor)-1,1234,MPI_COMM_WORLD,status,ierror)
+                         call MPI_RECV(c,size(c),MPI_REAL8,proc_c(ic,itor)-1,1234,CGYRO_COMM_WORLD,status,ierror)
                       end if
                    end if
 
@@ -1285,18 +1285,18 @@ subroutine cgyro_init_landau()
              end do
           enddo
        end if
-       call MPI_reduce(md,d,1,MPI_REAL8,MPI_MAX,0,MPI_COMM_WORLD,ierror)
+       call MPI_reduce(md,d,1,MPI_REAL8,MPI_MAX,0,CGYRO_COMM_WORLD,ierror)
        if (i_proc==0) print 11,'Max. deviation over all processors:',d
 11      format ('cgyro_in._col.: ',A,G23.16)
 
-       call MPI_Barrier(MPI_COMM_WORLD,ierror)
+       call MPI_Barrier(CGYRO_COMM_WORLD,ierror)
        call MPI_finalize(ierror)
        stop
     end if coltestmode
 
-!!$    call MPI_Barrier(MPI_COMM_WORLD,ierror)
+!!$    call MPI_Barrier(CGYRO_COMM_WORLD,ierror)
 !!$    print *,'i_proc',i_proc,'done with init_landau'
-!!$    call MPI_Barrier(MPI_COMM_WORLD,ierror)
+!!$    call MPI_Barrier(CGYRO_COMM_WORLD,ierror)
 
   contains
     elemental real function sinc(target_k,halfperiod)

From 03d5d4dff45753f50de26f64c993cb8bddd00bbc Mon Sep 17 00:00:00 2001
From: Klaus Hallatschek <Klaus.Hallatschek@ipp.mpg.de>
Date: Sat, 23 Nov 2024 10:11:50 +0100
Subject: [PATCH 7/9] ifdef PGI -> ifdef LANDAU_PREC16

Higher precision mode for comparison purposes
---
 cgyro/src/cgyro_init_collision_landau.F90 |  3 ---
 shared/landau/landau.F90                  | 10 ++++++----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/cgyro/src/cgyro_init_collision_landau.F90 b/cgyro/src/cgyro_init_collision_landau.F90
index ff58c8e72..49f8b4447 100644
--- a/cgyro/src/cgyro_init_collision_landau.F90
+++ b/cgyro/src/cgyro_init_collision_landau.F90
@@ -106,9 +106,6 @@ subroutine cgyro_init_landau()
     if (i_proc==0) print 1,'WARNING: dens_rot not yet implemented!!'
     if (i_proc==0) print 1,'WARNING: nu_global not yet implemented!!'
     if (i_proc==0 .and. collision_test_mode==1) print 1,'collision_test_mode==1, comparing ...'
-#ifdef __PGI
-    if (i_proc==0) print 1,'WARNING: precision loss in landau.F90 - can''t use quad precision in PGI!!'
-#endif
     if (i_proc==0 .and. maxval(abs(temp(1:n_species)-temp(1)))/=0) then
        print 1,'Warning: Landau not yet working for different species temperatures!!'
     end if
diff --git a/shared/landau/landau.F90 b/shared/landau/landau.F90
index a305eed95..c4a6fbf42 100644
--- a/shared/landau/landau.F90
+++ b/shared/landau/landau.F90
@@ -1,14 +1,17 @@
 !  gfortran -march=native -O3 -fno-stack-arrays -fimplicit-none -fdefault-real-8 landau.F90  -c
 !intel:
 !ifort -stand f15 -warn all -march=native -O3 -heap-arrays 10 -implicitnone -real-size 64 landau.F90 -c
-#ifndef __PGI
-  !can't use quad precision with frumpy pgi compiler
+
+#ifdef LANDAU_PREC16
+!at least in the past couldn't use quad precision with frumpy pgi compiler
+!for comparison purposes
 #define HIREAL real(16)
 #define HRS _16
 #else
 #define HIREAL real
 #define HRS
 #endif
+
 module landau
   HIREAL, parameter,private :: pi1=atan(1.HRS)*4
   ! real, parameter :: intnorm=pi1**(-1.5)*4*pi1
@@ -85,14 +88,13 @@ subroutine gentestkernel(n,a1,b1,c1,xmax,beta,gp,gw,ngauss,lor_int,dif_int,addcu
     addcutoff1=.true.
     if (present(addcutoff)) addcutoff1=addcutoff
 
-#ifndef __PGI
 2   format (A,3G25.16)
     
     if (verbose>2) then
        print 2,'1,lo,hi',fct1lo(xmaxx),fct1hi(xmaxx),fct1(xmaxx)
        print 2,'2,lo,hi',fct2lo(xmaxx),fct2hi(xmaxx),fct2(xmaxx)
     end if
-#endif
+
     ! lor_int and dif_int are symmetric in i,j
     ! t1t2_int is not symmetric (but not antisymmetric.)
     ! The first index is in that case the output index

From fd36c77d24698b3ee13e0b92b3ae688f65f7d494 Mon Sep 17 00:00:00 2001
From: Klaus Hallatschek <Klaus.Hallatschek@ipp.mpg.de>
Date: Sat, 23 Nov 2024 10:23:28 +0100
Subject: [PATCH 8/9] Added LANDAU_PREC16 comment to TUMBLEWEED

---
 platform/build/make.inc.TUMBLEWEED | 1 +
 1 file changed, 1 insertion(+)

diff --git a/platform/build/make.inc.TUMBLEWEED b/platform/build/make.inc.TUMBLEWEED
index 19712de78..fd15f0dec 100644
--- a/platform/build/make.inc.TUMBLEWEED
+++ b/platform/build/make.inc.TUMBLEWEED
@@ -15,6 +15,7 @@ NUMAS_PER_NODE=1
 # Compilers
 
 FC     = mpifort -DDIMATCOPY -fallow-argument-mismatch -fall-intrinsics -fimplicit-none -J $(GACODE_ROOT)/modules #-fPIC
+#FC     = mpifort -DDIMATCOPY -DLANDAU_PREC16 -fallow-argument-mismatch -fall-intrinsics -fimplicit-none -J $(GACODE_ROOT)/modules #-fPIC
 #FC     = mpifort -std=f2008 -fall-intrinsics -fimplicit-none -J $(GACODE_ROOT)/modules #-fPIC
 F77    = mpifort -fimplicit-none -fallow-argument-mismatch
 FOMP   =-fopenmp

From 4bf8353ab8373b8659ee175ca9452a259a201447 Mon Sep 17 00:00:00 2001
From: Klaus Hallatschek <Klaus.Hallatschek@ipp.mpg.de>
Date: Thu, 28 Nov 2024 14:14:16 +0100
Subject: [PATCH 9/9] use kind parameter instead of macros in landau.F90

to accomodate nvidia compiler preprocessor problems
---
 shared/landau/landau.F90 | 47 ++++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 24 deletions(-)

diff --git a/shared/landau/landau.F90 b/shared/landau/landau.F90
index c4a6fbf42..e265f851f 100644
--- a/shared/landau/landau.F90
+++ b/shared/landau/landau.F90
@@ -3,21 +3,20 @@
 !ifort -stand f15 -warn all -march=native -O3 -heap-arrays 10 -implicitnone -real-size 64 landau.F90 -c
 
 #ifdef LANDAU_PREC16
-!at least in the past couldn't use quad precision with frumpy pgi compiler
-!for comparison purposes
-#define HIREAL real(16)
-#define HRS _16
+!still cannot use quad precision with nvhpc/24.3 compiler version
+#define LP 16
 #else
-#define HIREAL real
-#define HRS
+#define LP 8
 #endif
 
+
 module landau
-  HIREAL, parameter,private :: pi1=atan(1.HRS)*4
+  integer, parameter,private :: PREC=LP
+  real(PREC), parameter,private :: pi1=atan(1._PREC)*4
   ! real, parameter :: intnorm=pi1**(-1.5)*4*pi1
   ! int gphys(v) d^3v=1=int g(x) x^2 dx *intnor
   ! Maybe one should normalise this once for a given xmax and density 1?
-  real, parameter :: vunit=real(sqrt(2.HRS))
+  real, parameter :: vunit=real(sqrt(2._PREC))
   ! v*vunit is v in units of sqrt(T/m)
   ! v itself is v in units of sqrt(2T/m) ("collision operator units")
   real, parameter :: intnorm=real(4/sqrt(pi1))
@@ -79,7 +78,7 @@ subroutine gentestkernel(n,a1,b1,c1,xmax,beta,gp,gw,ngauss,lor_int,dif_int,addcu
     real p0,p1,pi,q0,q1,qi,r0,r1,ri,s0,s1,si,x,xmax1
     real f1,f2
     integer i,j,k
-    HIREAL xmaxx,x1
+    real(PREC) xmaxx,x1
     logical t1t2,addcutoff1
     real t1,t2 ! for timing
 
@@ -289,24 +288,24 @@ subroutine gentestkernel(n,a1,b1,c1,xmax,beta,gp,gw,ngauss,lor_int,dif_int,addcu
 
     if (verbose>0) print '(A,G13.3)','gentestkernel took ',t2-t1
   contains
-    elemental HIREAL function fct1(w)
+    elemental real(PREC) function fct1(w)
       implicit none
-      HIREAL,intent(in) :: w
-      fct1=.125HRS*w**(-3)*(exp(-w*w)*w+(2*w*w-1)*.5HRS*sqrt(pi1)*erf(w))
+      real(PREC),intent(in) :: w
+      fct1=.125_PREC*w**(-3)*(exp(-w*w)*w+(2*w*w-1)*.5_PREC*sqrt(pi1)*erf(w))
     end function fct1
-    elemental HIREAL function fct2(w)
+    elemental real(PREC) function fct2(w)
       implicit none
-      HIREAL,intent(in) :: w
-      fct2=.25HRS*w**(-1)*(.5HRS*sqrt(pi1)*erf(w)-w*exp(-w**2))
+      real(PREC),intent(in) :: w
+      fct2=.25_PREC*w**(-1)*(.5_PREC*sqrt(pi1)*erf(w)-w*exp(-w**2))
     end function fct2
-    elemental HIREAL function fct1lo(w)
+    elemental real(PREC) function fct1lo(w)
       implicit none
-      HIREAL,intent(in) :: w
+      real(PREC),intent(in) :: w
       fct1lo=fct1(w)-exp(-xmaxx**2)/6  ! for w<xmax
     end function fct1lo
-    elemental HIREAL function fct1hi(w)
+    elemental real(PREC) function fct1hi(w)
       implicit none
-      HIREAL,intent(in) :: w
+      real(PREC),intent(in) :: w
       !  fct1hi(w)=w**(-3)*((1./24)*exp(-xmaxx**2)*xmaxx*(3-6*w**2+2*xmaxx**2)+(sqrt(pi1)/16)*(2*w**2-1)*erf(xmaxx))
       fct1hi=-w**(-3)/48*((-2*exp(-xmaxx**2)*xmaxx*(3+2*xmaxx**2)+3*sqrt(pi1)*erf(xmaxx))+&
            w**2*(12*exp(-xmaxx**2)*xmaxx-6*sqrt(pi1)*erf(xmaxx)))
@@ -314,15 +313,15 @@ elemental HIREAL function fct1hi(w)
       !./hhv 5 .5 0 50 1 1
       !was due to inaccuracies in this routine, likely due to cancellations in the fcts.
     end function fct1hi
-    elemental HIREAL function fct2lo(w)
+    elemental real(PREC) function fct2lo(w)
       implicit none
-      HIREAL,intent(in) :: w
+      real(PREC),intent(in) :: w
       fct2lo=fct2(w)-w*w*exp(-xmaxx**2)/6 !for w<xmax
     end function fct2lo
-    elemental HIREAL function fct2hi(w)
+    elemental real(PREC) function fct2hi(w)
       implicit none
-      HIREAL,intent(in) :: w
-      fct2hi=-w**(-1)*((1.HRS/12)*xmaxx*exp(-xmaxx**2)*(3+2*xmaxx**2)-(sqrt(pi1)/8)*erf(xmaxx))
+      real(PREC),intent(in) :: w
+      fct2hi=-w**(-1)*((1._PREC/12)*xmaxx*exp(-xmaxx**2)*(3+2*xmaxx**2)-(sqrt(pi1)/8)*erf(xmaxx))
     end function fct2hi
   end subroutine gentestkernel