From d0a5243f90ba0ba7999f77c13870caa04d5dbfcc Mon Sep 17 00:00:00 2001 From: Klaus Hallatschek Date: Fri, 6 Sep 2024 20:39:52 +0200 Subject: [PATCH 1/9] Loop adapted to strict compiler scrutiny in DIMATCOPY opt out case. (Was previously traditional fortran practice.) --- cgyro/src/cgyro_init_collision_landau.F90 | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cgyro/src/cgyro_init_collision_landau.F90 b/cgyro/src/cgyro_init_collision_landau.F90 index 836e287db..3387f624e 100644 --- a/cgyro/src/cgyro_init_collision_landau.F90 +++ b/cgyro/src/cgyro_init_collision_landau.F90 @@ -731,9 +731,13 @@ subroutine cgyro_init_landau() #endif #else !alternative to domatcopy - do k=1,n_xi*n_energy - do j=1,n_xi*n_energy - gyrocolmat(j,1,k,1,ib,ia,ik)=gyrocolmat(k,1,j,1,ia,ib,ik) + do k=1,n_xi + do l=1,n_energy + do j=1,n_xi + do m=1,n_energy + gyrocolmat(m,j,l,k,ib,ia,ik)=gyrocolmat(l,k,m,j,ia,ib,ik) + end do + end do end do end do #endif From 21a271e7f87f7a19a914a7c46a47693e3e5296f2 Mon Sep 17 00:00:00 2001 From: Klaus Hallatschek Date: Thu, 7 Nov 2024 14:30:59 +0100 Subject: [PATCH 2/9] std=f2008 for debug option for tumbleweed --- platform/build/make.inc.TUMBLEWEED | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/platform/build/make.inc.TUMBLEWEED b/platform/build/make.inc.TUMBLEWEED index c502f5b36..5fc8e26c5 100644 --- a/platform/build/make.inc.TUMBLEWEED +++ b/platform/build/make.inc.TUMBLEWEED @@ -14,14 +14,14 @@ NUMAS_PER_NODE=1 # Compilers -FC = mpifort -DDIMATCOPY -std=gnu -fallow-argument-mismatch -fall-intrinsics -fimplicit-none -J $(GACODE_ROOT)/modules #-fPIC +FC = mpifort -DDIMATCOPY -fallow-argument-mismatch -fall-intrinsics -fimplicit-none -J $(GACODE_ROOT)/modules #-fPIC #FC = mpifort -std=f2008 -fall-intrinsics -fimplicit-none -J $(GACODE_ROOT)/modules #-fPIC F77 = mpifort -fimplicit-none -fallow-argument-mismatch FOMP =-fopenmp FMATH =-fdefault-real-8 -fdefault-double-8 -I$(FFTW_INC) -FOPT =-O3 -fexternal-blas -march=native -ffree-line-length-0 +FOPT =-O3 -std=gnu -fexternal-blas -march=native -ffree-line-length-0 # -floop-nest-optimize -floop-parallelize-all -flto -FDEBUG =-Wall -W -fcheck=all -g -fbacktrace -ffpe-trap=invalid,zero,overflow -finit-real=snan # -std=f2003 -fall-intrinsics +FDEBUG =-Wall -W -fcheck=all -g -fbacktrace -ffpe-trap=invalid,zero,overflow -finit-real=snan -std=f2008 -fall-intrinsics F2PY = f2py # System math libraries From 68084a1c44fe8f60e8a3f68de25a9163df88274d Mon Sep 17 00:00:00 2001 From: Klaus Hallatschek Date: Thu, 7 Nov 2024 18:31:31 +0100 Subject: [PATCH 3/9] no unused variable, rel comp, int div warning --- platform/build/make.inc.TUMBLEWEED | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/platform/build/make.inc.TUMBLEWEED b/platform/build/make.inc.TUMBLEWEED index 5fc8e26c5..19712de78 100644 --- a/platform/build/make.inc.TUMBLEWEED +++ b/platform/build/make.inc.TUMBLEWEED @@ -21,7 +21,7 @@ FOMP =-fopenmp FMATH =-fdefault-real-8 -fdefault-double-8 -I$(FFTW_INC) FOPT =-O3 -std=gnu -fexternal-blas -march=native -ffree-line-length-0 # -floop-nest-optimize -floop-parallelize-all -flto -FDEBUG =-Wall -W -fcheck=all -g -fbacktrace -ffpe-trap=invalid,zero,overflow -finit-real=snan -std=f2008 -fall-intrinsics +FDEBUG =-Wall -W -fcheck=all -Wno-compare-reals -Wno-integer-division -Wno-unused-variable -g -fbacktrace -ffpe-trap=invalid,zero,overflow -finit-real=snan -std=f2008 -fall-intrinsics F2PY = f2py # System math libraries From 88ba3b6cb379eddfc5fb229c77667cf3b9c329a4 Mon Sep 17 00:00:00 2001 From: Klaus Hallatschek Date: Thu, 7 Nov 2024 18:41:26 +0100 Subject: [PATCH 4/9] liquidat. obsol. statement fct in half_hermite.f90 --- shared/math/half_hermite.f90 | 48 +++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/shared/math/half_hermite.f90 b/shared/math/half_hermite.f90 index f81a47b85..e294605cf 100644 --- a/shared/math/half_hermite.f90 +++ b/shared/math/half_hermite.f90 @@ -90,7 +90,6 @@ subroutine half_hermite_norm(n,xmin,xmax,sign,alpha,a1,b1,c1,a,bsq,logg,verbos& real :: t00,t10 real, parameter :: mori=.5,epsmori=1e-6,epsbase=5e-16,epsgauss=1e-18 integer ngauss - real w,sech2,sech,sechb,tanhp1,tanhm1,tanhb real eps integer i,j,k,ia integer leftnew,rightnew ! How many points of the left or right summation have to be computed @@ -164,15 +163,6 @@ subroutine half_hermite_norm(n,xmin,xmax,sign,alpha,a1,b1,c1,a,bsq,logg,verbos& ! n(i+1)=|-a(i)n(i)pn(i,x=0)-b(i)n(i-1)pn(i,x=0)| ! But we normalise to - - w(x)=x**alpha*exp(-sign*x*x) ! inline function for weight - sechb(x)=2.*x/(1.+x*x) - sech(x)=sechb(exp(-x)) - sech2(x)=sech(abs(x))**2 - tanhb(x)=2*x/(1+x) - tanhp1(x)=tanhb(exp(2*x)) ! tanh(x)+1 for x<0 approx. 2*exp(2*x) for x<<-1 - tanhm1(x)=-tanhb(exp(-2*x)) ! tanh(x)-1 for x>0 - if (n==0) return nn=n vb=present(verbose) @@ -182,7 +172,7 @@ subroutine half_hermite_norm(n,xmin,xmax,sign,alpha,a1,b1,c1,a,bsq,logg,verbos& t00=t0 t10=t1 if (vb) then - ngauss=n+xmax**2+xmax*sqrt(-2*log(epsgauss))-log(epsgauss)/3 + ngauss=n+floor(xmax**2+xmax*sqrt(-2*log(epsgauss))-log(epsgauss)/3) ngauss=n+ceiling(10*xmax)+ceiling(alpha/2) print *,'ngauss=',ngauss allocate(gw(ngauss),gp(ngauss),gw2(ngauss*2),gp2(ngauss*2)) @@ -671,6 +661,42 @@ subroutine half_hermite_norm(n,xmin,xmax,sign,alpha,a1,b1,c1,a,bsq,logg,verbos& enddo if (vb) print *,'hhn used mem',2*(nsafe+nintervals) deallocate(mw,mp,poly,poly1) !deallocation may be necessary because these are pointers. + contains + elemental real function w(x) + implicit none + real,intent(in) :: x + w=x**alpha*exp(-sign*x*x) ! inline function for weight + end function w + elemental real function sechb(x) + implicit none + real,intent(in) :: x + sechb=2.*x/(1.+x*x) + end function sechb + elemental real function sech(x) + implicit none + real,intent(in) :: x + sech=sechb(exp(-x)) + end function sech + elemental real function sech2(x) + implicit none + real,intent(in) :: x + sech2=sech(abs(x))**2 + end function sech2 + elemental real function tanhb(x) + implicit none + real,intent(in) :: x + tanhb=2*x/(1+x) + end function tanhb + elemental real function tanhp1(x) + implicit none + real,intent(in) :: x + tanhp1=tanhb(exp(2*x)) ! tanh(x)+1 for x<0 approx. 2*exp(2*x) for x<<-1 + end function tanhp1 + elemental real function tanhm1(x) + implicit none + real,intent(in) :: x + tanhm1=-tanhb(exp(-2*x)) ! tanh(x)-1 for x>0 + end function tanhm1 end subroutine half_hermite_norm subroutine gauss_nodes_functions(n,a,bsq,sp,ortho_f) From b7f0217ee22528013e082d62c982fa4d88ce49d7 Mon Sep 17 00:00:00 2001 From: Klaus Hallatschek Date: Thu, 7 Nov 2024 18:32:47 +0100 Subject: [PATCH 5/9] liquid. obsolescent statement fct in landau.F90 --- shared/landau/landau.F90 | 126 +++++++++++++++++++-------------------- 1 file changed, 63 insertions(+), 63 deletions(-) diff --git a/shared/landau/landau.F90 b/shared/landau/landau.F90 index 6532f9b88..a305eed95 100644 --- a/shared/landau/landau.F90 +++ b/shared/landau/landau.F90 @@ -1,26 +1,28 @@ ! gfortran -march=native -O3 -fno-stack-arrays -fimplicit-none -fdefault-real-8 landau.F90 -c !intel: !ifort -stand f15 -warn all -march=native -O3 -heap-arrays 10 -implicitnone -real-size 64 landau.F90 -c -module landau #ifndef __PGI !can't use quad precision with frumpy pgi compiler - real(16), parameter,private :: pi1=atan(1._16)*4 +#define HIREAL real(16) +#define HRS _16 +#else +#define HIREAL real +#define HRS +#endif +module landau + HIREAL, parameter,private :: pi1=atan(1.HRS)*4 ! real, parameter :: intnorm=pi1**(-1.5)*4*pi1 ! int gphys(v) d^3v=1=int g(x) x^2 dx *intnor ! Maybe one should normalise this once for a given xmax and density 1? - real, parameter :: vunit=sqrt(2._16) + real, parameter :: vunit=real(sqrt(2.HRS)) ! v*vunit is v in units of sqrt(T/m) ! v itself is v in units of sqrt(2T/m) ("collision operator units") -#else - real, parameter,private :: pi1=atan(1.)*4 - real, parameter :: vunit=sqrt(2.) -#endif - real, parameter :: intnorm=4/sqrt(pi1) - real, parameter :: normcol=sqrt(8/pi1) + real, parameter :: intnorm=real(4/sqrt(pi1)) + real, parameter :: normcol=real(sqrt(8/pi1)) ! Normalisation of ctest/emat cfield/emat for the self-collisions of the ! reference species without caa or cab. ! Note: must take into account muref^3=sqrt(1/8), since mu^2=m/(2T) - real, parameter :: normcolcgyro=4/sqrt(pi1) + real, parameter :: normcolcgyro=real(4/sqrt(pi1)) integer :: verbose=0 contains @@ -56,7 +58,7 @@ real function tau_ab_inv(mua,mub,nb,za,zb,Ta,Tb) if (present(nb)) nb1=nb if (present(mua)) mua1=mua if (present(mub)) mub1=mub - tau_ab_inv=sqrt(8./pi1)/3*za1**2*zb1**2*nb1/(mua1*Ta1**2)*& + tau_ab_inv=real(sqrt(8./pi1))/3*za1**2*zb1**2*nb1/(mua1*Ta1**2)*& (1+mua1**2*Ta1/(mub1**2*Tb1))/(1+mua1**2/(mub1**2))**1.5 end function tau_ab_inv @@ -71,36 +73,13 @@ subroutine gentestkernel(n,a1,b1,c1,xmax,beta,gp,gw,ngauss,lor_int,dif_int,addcu optional :: t1t2_int ! only calculate that term if it is present. optional :: addcutoff !allocate(lor_int(n,n),dif_int(n,n)) - real p0,p1,pi,q0,q1,qi,r0,r1,ri,s0,s1,si,x,x1,xmax1 + real p0,p1,pi,q0,q1,qi,r0,r1,ri,s0,s1,si,x,xmax1 real f1,f2 integer i,j,k -#ifndef __PGI - real(16) fct1,fct2,fct1lo,fct1hi,fct2lo,fct2hi !inline functions - real(16) w,xmaxx -#else - real fct1,fct2,fct1lo,fct1hi,fct2lo,fct2hi !inline functions - real w,xmaxx -#endif + HIREAL xmaxx,x1 logical t1t2,addcutoff1 real t1,t2 ! for timing -#ifndef __PGI - fct1(w)=.125_16*w**(-3)*(exp(-w*w)*w+(2*w*w-1)*.5_16*sqrt(pi1)*erf(w)) - fct2(w)=.25_16*w**(-1)*(.5_16*sqrt(pi1)*erf(w)-w*exp(-w**2)) - fct2hi(w)=-w**(-1)*((1._16/12)*xmaxx*exp(-xmaxx**2)*(3+2*xmaxx**2)-(sqrt(pi1)/8)*erf(xmaxx)) -#else - fct1(w)=.125*w**(-3)*(exp(-w*w)*w+(2*w*w-1)*.5*sqrt(pi1)*erf(w)) - fct2(w)=.25*w**(-1)*(.5*sqrt(pi1)*erf(w)-w*exp(-w**2)) - fct2hi(w)=-w**(-1)*((1./12)*xmaxx*exp(-xmaxx**2)*(3+2*xmaxx**2)-(sqrt(pi1)/8)*erf(xmaxx)) -#endif - fct1lo(w)=fct1(w)-exp(-xmaxx**2)/6 ! for w1) then do k=1,ngauss x1=xmax*(1+(beta-1)*gp(k)) - x=x1/beta -#ifndef __PGI - f1=fct1hi(real(x1,16)) - f2=fct2hi(real(x1,16)) -#else - f1=fct1hi(x1) - f2=fct2hi(x1) -#endif + x=real(x1)/beta + f1=real(fct1hi(x1)) + f2=real(fct2hi(x1)) do j=1,n if (j==1) then @@ -281,7 +245,7 @@ subroutine gentestkernel(n,a1,b1,c1,xmax,beta,gp,gw,ngauss,lor_int,dif_int,addcu s1=0 si=0 lor_int(j,i)=lor_int(j,i)+pi*ri*f1 - if (t1t2) t1t2_int(j,i)=t1t2_int(j,i)+2*qi*ri*x1*f2 + if (t1t2) t1t2_int(j,i)=t1t2_int(j,i)+2*qi*ri*real(x1)*f2 ! ^^ vgl 1.8.19 (2) do i=2,n s0=si @@ -292,7 +256,7 @@ subroutine gentestkernel(n,a1,b1,c1,xmax,beta,gp,gw,ngauss,lor_int,dif_int,addcu r1=r0 lor_int(j,i)=lor_int(j,i)+pi*ri*f1 dif_int(j,i)=dif_int(j,i)+qi*si*f2 - if (t1t2) t1t2_int(j,i)=t1t2_int(j,i)+2*qi*ri*x1*f2 + if (t1t2) t1t2_int(j,i)=t1t2_int(j,i)+2*qi*ri*real(x1)*f2 enddo enddo enddo @@ -320,8 +284,44 @@ subroutine gentestkernel(n,a1,b1,c1,xmax,beta,gp,gw,ngauss,lor_int,dif_int,addcu ! i.e. we multiply with beta!! if (t1t2) t1t2_int=t1t2_int*beta**(-4) ! ***<--- must be multiplied with (Ta/Tb-1) call cpu_time(t2) - + if (verbose>0) print '(A,G13.3)','gentestkernel took ',t2-t1 + contains + elemental HIREAL function fct1(w) + implicit none + HIREAL,intent(in) :: w + fct1=.125HRS*w**(-3)*(exp(-w*w)*w+(2*w*w-1)*.5HRS*sqrt(pi1)*erf(w)) + end function fct1 + elemental HIREAL function fct2(w) + implicit none + HIREAL,intent(in) :: w + fct2=.25HRS*w**(-1)*(.5HRS*sqrt(pi1)*erf(w)-w*exp(-w**2)) + end function fct2 + elemental HIREAL function fct1lo(w) + implicit none + HIREAL,intent(in) :: w + fct1lo=fct1(w)-exp(-xmaxx**2)/6 ! for w Date: Sat, 23 Nov 2024 10:00:22 +0100 Subject: [PATCH 6/9] c._in._coll._landau.F90: MPI_COMM..->CGYRO_COMM.. --- cgyro/src/cgyro_init_collision_landau.F90 | 46 +++++++++++------------ 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/cgyro/src/cgyro_init_collision_landau.F90 b/cgyro/src/cgyro_init_collision_landau.F90 index 3387f624e..ff58c8e72 100644 --- a/cgyro/src/cgyro_init_collision_landau.F90 +++ b/cgyro/src/cgyro_init_collision_landau.F90 @@ -25,7 +25,7 @@ module cgyro_init_collision_landau subroutine cgyro_init_landau() ! populate cmat with Galerkin based gyrokinetic Landau operator. ! cmat1 is only for comparison purposes - use cgyro_globals, only : vth,temp,mass,dens,temp_ele,mass_ele,dens_ele,rho,z,& + use cgyro_globals, only : CGYRO_COMM_WORLD,vth,temp,mass,dens,temp_ele,mass_ele,dens_ele,rho,z,& n_energy,e_max,n_xi,n_radial,n_theta,n_species,n_toroidal,nt1,nt2,nc_loc,nc1,nc2,nc,nv,& nu_ee,& xi,w_xi,& !needed for projleg calc @@ -119,7 +119,7 @@ subroutine cgyro_init_landau() gtvb=1 end if - !$ call MPI_Barrier(MPI_COMM_WORLD,ierror) ! may improve timing + !$ call MPI_Barrier(CGYRO_COMM_WORLD,ierror) ! may improve timing call cpu_time(t1) ns=ispec(n_species,n_species) !number of non-redundant species pairs xmax=sqrt(e_max) !cut off at exp(-xmax^2) @@ -146,7 +146,7 @@ subroutine cgyro_init_landau() end do ! kperp_bmag_max is not completely global, there is still the n dependence. ! we need to maximize over the toroidal mode numbers: - call MPI_ALLREDUCE(MPI_IN_PLACE,kperp_bmag_max,1,MPI_REAL8,MPI_MAX,MPI_COMM_WORLD,ierror) + call MPI_ALLREDUCE(MPI_IN_PLACE,kperp_bmag_max,1,MPI_REAL8,MPI_MAX,CGYRO_COMM_WORLD,ierror) rhomax=maxval(abs(rho_spec([(i,i=1,n_species)])))*xmax kperprhomax=kperp_bmag_max*rhomax if (verbose>0 .and. i_proc==0) print 6,'using kperprhomax=',kperprhomax @@ -911,13 +911,13 @@ subroutine cgyro_init_landau() print 7,'pre_scatter timing:' do i=1,n_proc if (i>1) then - call MPI_Recv(t,11,MPI_REAL8,i-1,i-1,MPI_COMM_WORLD,status,ierror) + call MPI_Recv(t,11,MPI_REAL8,i-1,i-1,CGYRO_COMM_WORLD,status,ierror) end if 5 format("init_collision_landau: ",A,I0,A,7G24.16E3,A,I0,A,G24.16E3) print 5,'i_proc=',i-1,' took',t(1:7),' load ',load(i),' rel',t(3)/load(i) end do else - call MPI_Send(t,11,MPI_REAL8,0,i_proc,MPI_COMM_WORLD,ierror) + call MPI_Send(t,11,MPI_REAL8,0,i_proc,CGYRO_COMM_WORLD,ierror) end if call cpu_time(t1) ! Now do the scatter @@ -931,17 +931,17 @@ subroutine cgyro_init_landau() ib=idx+1 if (proc(ik,ia,ib)/=0) then !!$ do j=1,n_proc -!!$ call MPI_BARRIER(MPI_COMM_WORLD,ierror) +!!$ call MPI_BARRIER(CGYRO_COMM_WORLD,ierror) !!$! if (i_proc==0 .and. verbose>100) then !!$ if (i_proc==j-1) print *,'bcasting (ik,ia,ib,proc)=',ik,ia,ib,proc(ik,ia,ib)-1,'ip',i_proc !!$ ! end if -!!$ call MPI_BARRIER(MPI_COMM_WORLD,ierror) +!!$ call MPI_BARRIER(CGYRO_COMM_WORLD,ierror) !!$ end do call MPI_Bcast(gyrocolmat(:,:,:,:,ia,ib,ik),n_xi**2*n_energy**2,& - MPI_REAL8,proc(ik,ia,ib)-1,MPI_COMM_WORLD,ierror) + MPI_REAL8,proc(ik,ia,ib)-1,CGYRO_COMM_WORLD,ierror) if (ia>ib .and. temp(ia)==temp(ib)) then call MPI_Bcast(gyrocolmat(:,:,:,:,ib,ia,ik),n_xi**2*n_energy**2,& - MPI_REAL8,proc(ik,ia,ib)-1,MPI_COMM_WORLD,ierror) + MPI_REAL8,proc(ik,ia,ib)-1,CGYRO_COMM_WORLD,ierror) end if end if enddo @@ -1003,12 +1003,12 @@ subroutine cgyro_init_landau() if (i_proc==0) then do i=1,n_proc if (i>1) then - call MPI_Recv(t,11,MPI_REAL8,i-1,i-1,MPI_COMM_WORLD,status,ierror) + call MPI_Recv(t,11,MPI_REAL8,i-1,i-1,CGYRO_COMM_WORLD,status,ierror) end if print *,'i_proc=',i-1,'took',t(1:10),'load',load(i),'rel',t(3)/load(i) end do else - call MPI_Send(t,11,MPI_REAL8,0,i_proc,MPI_COMM_WORLD,ierror) + call MPI_Send(t,11,MPI_REAL8,0,i_proc,CGYRO_COMM_WORLD,ierror) end if coltestmode: if(collision_test_mode==1) then @@ -1036,10 +1036,10 @@ subroutine cgyro_init_landau() nt2_proc(i_proc+1)=nt2 proc_c=0 ! dummy value if no processor is responsible do i=1,n_proc - call MPI_BCAST(nc1_proc(i),1,MPI_INTEGER,i-1,MPI_COMM_WORLD,ierror) - call MPI_BCAST(nc2_proc(i),1,MPI_INTEGER,i-1,MPI_COMM_WORLD,ierror) - call MPI_BCAST(nt1_proc(i),1,MPI_INTEGER,i-1,MPI_COMM_WORLD,ierror) - call MPI_BCAST(nt2_proc(i),1,MPI_INTEGER,i-1,MPI_COMM_WORLD,ierror) + call MPI_BCAST(nc1_proc(i),1,MPI_INTEGER,i-1,CGYRO_COMM_WORLD,ierror) + call MPI_BCAST(nc2_proc(i),1,MPI_INTEGER,i-1,CGYRO_COMM_WORLD,ierror) + call MPI_BCAST(nt1_proc(i),1,MPI_INTEGER,i-1,CGYRO_COMM_WORLD,ierror) + call MPI_BCAST(nt2_proc(i),1,MPI_INTEGER,i-1,CGYRO_COMM_WORLD,ierror) ! this assigns the processor with the highest number to the respective ! nc and nt range proc_c(nc1_proc(i):nc2_proc(i),nt1_proc(i):nt2_proc(i))=i @@ -1094,11 +1094,11 @@ subroutine cgyro_init_landau() **2,L2xi,n_xi,0.,c(:,:,:,:,l),n_xi*n_energy**2) end do if (i_proc/=0) then - call MPI_SEND(c,size(c),MPI_REAL8,0,1234,MPI_COMM_WORLD,ierror) + call MPI_SEND(c,size(c),MPI_REAL8,0,1234,CGYRO_COMM_WORLD,ierror) end if else if (i_proc==0) then - call MPI_RECV(c,size(c),MPI_REAL8,proc_c(ic,itor)-1,1234,MPI_COMM_WORLD,status,ierror) + call MPI_RECV(c,size(c),MPI_REAL8,proc_c(ic,itor)-1,1234,CGYRO_COMM_WORLD,status,ierror) end if end if !!$ block @@ -1227,11 +1227,11 @@ subroutine cgyro_init_landau() **2,L2xi,n_xi,0.,c(:,:,:,:,l),n_xi*n_energy**2) end do if (i_proc/=0) then - call MPI_SEND(c,size(c),MPI_REAL8,0,1234,MPI_COMM_WORLD,ierror) + call MPI_SEND(c,size(c),MPI_REAL8,0,1234,CGYRO_COMM_WORLD,ierror) end if else if (i_proc==0) then - call MPI_RECV(c,size(c),MPI_REAL8,proc_c(ic,itor)-1,1234,MPI_COMM_WORLD,status,ierror) + call MPI_RECV(c,size(c),MPI_REAL8,proc_c(ic,itor)-1,1234,CGYRO_COMM_WORLD,status,ierror) end if end if @@ -1285,18 +1285,18 @@ subroutine cgyro_init_landau() end do enddo end if - call MPI_reduce(md,d,1,MPI_REAL8,MPI_MAX,0,MPI_COMM_WORLD,ierror) + call MPI_reduce(md,d,1,MPI_REAL8,MPI_MAX,0,CGYRO_COMM_WORLD,ierror) if (i_proc==0) print 11,'Max. deviation over all processors:',d 11 format ('cgyro_in._col.: ',A,G23.16) - call MPI_Barrier(MPI_COMM_WORLD,ierror) + call MPI_Barrier(CGYRO_COMM_WORLD,ierror) call MPI_finalize(ierror) stop end if coltestmode -!!$ call MPI_Barrier(MPI_COMM_WORLD,ierror) +!!$ call MPI_Barrier(CGYRO_COMM_WORLD,ierror) !!$ print *,'i_proc',i_proc,'done with init_landau' -!!$ call MPI_Barrier(MPI_COMM_WORLD,ierror) +!!$ call MPI_Barrier(CGYRO_COMM_WORLD,ierror) contains elemental real function sinc(target_k,halfperiod) From 03d5d4dff45753f50de26f64c993cb8bddd00bbc Mon Sep 17 00:00:00 2001 From: Klaus Hallatschek Date: Sat, 23 Nov 2024 10:11:50 +0100 Subject: [PATCH 7/9] ifdef PGI -> ifdef LANDAU_PREC16 Higher precision mode for comparison purposes --- cgyro/src/cgyro_init_collision_landau.F90 | 3 --- shared/landau/landau.F90 | 10 ++++++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cgyro/src/cgyro_init_collision_landau.F90 b/cgyro/src/cgyro_init_collision_landau.F90 index ff58c8e72..49f8b4447 100644 --- a/cgyro/src/cgyro_init_collision_landau.F90 +++ b/cgyro/src/cgyro_init_collision_landau.F90 @@ -106,9 +106,6 @@ subroutine cgyro_init_landau() if (i_proc==0) print 1,'WARNING: dens_rot not yet implemented!!' if (i_proc==0) print 1,'WARNING: nu_global not yet implemented!!' if (i_proc==0 .and. collision_test_mode==1) print 1,'collision_test_mode==1, comparing ...' -#ifdef __PGI - if (i_proc==0) print 1,'WARNING: precision loss in landau.F90 - can''t use quad precision in PGI!!' -#endif if (i_proc==0 .and. maxval(abs(temp(1:n_species)-temp(1)))/=0) then print 1,'Warning: Landau not yet working for different species temperatures!!' end if diff --git a/shared/landau/landau.F90 b/shared/landau/landau.F90 index a305eed95..c4a6fbf42 100644 --- a/shared/landau/landau.F90 +++ b/shared/landau/landau.F90 @@ -1,14 +1,17 @@ ! gfortran -march=native -O3 -fno-stack-arrays -fimplicit-none -fdefault-real-8 landau.F90 -c !intel: !ifort -stand f15 -warn all -march=native -O3 -heap-arrays 10 -implicitnone -real-size 64 landau.F90 -c -#ifndef __PGI - !can't use quad precision with frumpy pgi compiler + +#ifdef LANDAU_PREC16 +!at least in the past couldn't use quad precision with frumpy pgi compiler +!for comparison purposes #define HIREAL real(16) #define HRS _16 #else #define HIREAL real #define HRS #endif + module landau HIREAL, parameter,private :: pi1=atan(1.HRS)*4 ! real, parameter :: intnorm=pi1**(-1.5)*4*pi1 @@ -85,14 +88,13 @@ subroutine gentestkernel(n,a1,b1,c1,xmax,beta,gp,gw,ngauss,lor_int,dif_int,addcu addcutoff1=.true. if (present(addcutoff)) addcutoff1=addcutoff -#ifndef __PGI 2 format (A,3G25.16) if (verbose>2) then print 2,'1,lo,hi',fct1lo(xmaxx),fct1hi(xmaxx),fct1(xmaxx) print 2,'2,lo,hi',fct2lo(xmaxx),fct2hi(xmaxx),fct2(xmaxx) end if -#endif + ! lor_int and dif_int are symmetric in i,j ! t1t2_int is not symmetric (but not antisymmetric.) ! The first index is in that case the output index From fd36c77d24698b3ee13e0b92b3ae688f65f7d494 Mon Sep 17 00:00:00 2001 From: Klaus Hallatschek Date: Sat, 23 Nov 2024 10:23:28 +0100 Subject: [PATCH 8/9] Added LANDAU_PREC16 comment to TUMBLEWEED --- platform/build/make.inc.TUMBLEWEED | 1 + 1 file changed, 1 insertion(+) diff --git a/platform/build/make.inc.TUMBLEWEED b/platform/build/make.inc.TUMBLEWEED index 19712de78..fd15f0dec 100644 --- a/platform/build/make.inc.TUMBLEWEED +++ b/platform/build/make.inc.TUMBLEWEED @@ -15,6 +15,7 @@ NUMAS_PER_NODE=1 # Compilers FC = mpifort -DDIMATCOPY -fallow-argument-mismatch -fall-intrinsics -fimplicit-none -J $(GACODE_ROOT)/modules #-fPIC +#FC = mpifort -DDIMATCOPY -DLANDAU_PREC16 -fallow-argument-mismatch -fall-intrinsics -fimplicit-none -J $(GACODE_ROOT)/modules #-fPIC #FC = mpifort -std=f2008 -fall-intrinsics -fimplicit-none -J $(GACODE_ROOT)/modules #-fPIC F77 = mpifort -fimplicit-none -fallow-argument-mismatch FOMP =-fopenmp From 4bf8353ab8373b8659ee175ca9452a259a201447 Mon Sep 17 00:00:00 2001 From: Klaus Hallatschek Date: Thu, 28 Nov 2024 14:14:16 +0100 Subject: [PATCH 9/9] use kind parameter instead of macros in landau.F90 to accomodate nvidia compiler preprocessor problems --- shared/landau/landau.F90 | 47 ++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/shared/landau/landau.F90 b/shared/landau/landau.F90 index c4a6fbf42..e265f851f 100644 --- a/shared/landau/landau.F90 +++ b/shared/landau/landau.F90 @@ -3,21 +3,20 @@ !ifort -stand f15 -warn all -march=native -O3 -heap-arrays 10 -implicitnone -real-size 64 landau.F90 -c #ifdef LANDAU_PREC16 -!at least in the past couldn't use quad precision with frumpy pgi compiler -!for comparison purposes -#define HIREAL real(16) -#define HRS _16 +!still cannot use quad precision with nvhpc/24.3 compiler version +#define LP 16 #else -#define HIREAL real -#define HRS +#define LP 8 #endif + module landau - HIREAL, parameter,private :: pi1=atan(1.HRS)*4 + integer, parameter,private :: PREC=LP + real(PREC), parameter,private :: pi1=atan(1._PREC)*4 ! real, parameter :: intnorm=pi1**(-1.5)*4*pi1 ! int gphys(v) d^3v=1=int g(x) x^2 dx *intnor ! Maybe one should normalise this once for a given xmax and density 1? - real, parameter :: vunit=real(sqrt(2.HRS)) + real, parameter :: vunit=real(sqrt(2._PREC)) ! v*vunit is v in units of sqrt(T/m) ! v itself is v in units of sqrt(2T/m) ("collision operator units") real, parameter :: intnorm=real(4/sqrt(pi1)) @@ -79,7 +78,7 @@ subroutine gentestkernel(n,a1,b1,c1,xmax,beta,gp,gw,ngauss,lor_int,dif_int,addcu real p0,p1,pi,q0,q1,qi,r0,r1,ri,s0,s1,si,x,xmax1 real f1,f2 integer i,j,k - HIREAL xmaxx,x1 + real(PREC) xmaxx,x1 logical t1t2,addcutoff1 real t1,t2 ! for timing @@ -289,24 +288,24 @@ subroutine gentestkernel(n,a1,b1,c1,xmax,beta,gp,gw,ngauss,lor_int,dif_int,addcu if (verbose>0) print '(A,G13.3)','gentestkernel took ',t2-t1 contains - elemental HIREAL function fct1(w) + elemental real(PREC) function fct1(w) implicit none - HIREAL,intent(in) :: w - fct1=.125HRS*w**(-3)*(exp(-w*w)*w+(2*w*w-1)*.5HRS*sqrt(pi1)*erf(w)) + real(PREC),intent(in) :: w + fct1=.125_PREC*w**(-3)*(exp(-w*w)*w+(2*w*w-1)*.5_PREC*sqrt(pi1)*erf(w)) end function fct1 - elemental HIREAL function fct2(w) + elemental real(PREC) function fct2(w) implicit none - HIREAL,intent(in) :: w - fct2=.25HRS*w**(-1)*(.5HRS*sqrt(pi1)*erf(w)-w*exp(-w**2)) + real(PREC),intent(in) :: w + fct2=.25_PREC*w**(-1)*(.5_PREC*sqrt(pi1)*erf(w)-w*exp(-w**2)) end function fct2 - elemental HIREAL function fct1lo(w) + elemental real(PREC) function fct1lo(w) implicit none - HIREAL,intent(in) :: w + real(PREC),intent(in) :: w fct1lo=fct1(w)-exp(-xmaxx**2)/6 ! for w