Merge pull request SPECFEM#1345 from danielpeter/devel

fixes attenuation kernel in gpu version
cenjinzhiyu · Sep 9, 2019 · e2d28c9 · e2d28c9
2 parents 1603deb + 85a601b
commit e2d28c9
Show file tree

Hide file tree

Showing 15 changed files with 807 additions and 556 deletions.
diff --git a/src/cuda/assemble_MPI_scalar_cuda.cu b/src/cuda/assemble_MPI_scalar_cuda.cu
@@ -44,7 +44,7 @@ __global__ void prepare_boundary_potential_on_device(field* d_potential_dot_dot_
                                                      const int* d_nibool_interfaces_ext_mesh,
                                                      const int* d_ibool_interfaces_ext_mesh) {
 
-  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;
   int ientry,iglob;
 
   for(int iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
@@ -156,7 +156,7 @@ __global__ void assemble_boundary_potential_on_device(field* d_potential_dot_dot
                                                       const int* d_nibool_interfaces_ext_mesh,
                                                       const int* d_ibool_interfaces_ext_mesh) {
 
-  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;
   int ientry,iglob;
 
   for( int iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {

diff --git a/src/cuda/assemble_MPI_vector_cuda.cu b/src/cuda/assemble_MPI_vector_cuda.cu
@@ -44,7 +44,7 @@ __global__ void prepare_boundary_accel_on_device(realw* d_accel, realw* d_send_a
                                                  const int* d_nibool_interfaces_ext_mesh,
                                                  const int* d_ibool_interfaces_ext_mesh) {
 
-  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;
   int ientry,iglob;
 
   for( int iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
@@ -216,7 +216,7 @@ __global__ void assemble_boundary_accel_on_device(realw* d_accel, realw* d_send_
 
   //int bx = blockIdx.y*gridDim.x+blockIdx.x;
   //int tx = threadIdx.x;
-  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;
 
   int ientry,iglob;
 
@@ -257,7 +257,8 @@ __global__ void synchronize_boundary_accel_on_device(realw* d_accel, realw* d_se
 
   //int bx = blockIdx.y*gridDim.x+blockIdx.x;
   //int tx = threadIdx.x;
-  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;
+
   // printf("inside the synchronization!\n");
 
   int ientry,iglob;

diff --git a/src/cuda/check_fields_cuda.cu b/src/cuda/check_fields_cuda.cu
@@ -634,7 +634,8 @@ __global__ void get_maximum_vector_kernel(realw* array, int size, realw* d_max){
   // load shared mem
   unsigned int tid = threadIdx.x;
   unsigned int bx = blockIdx.y*gridDim.x+blockIdx.x;
-  unsigned int i = tid + bx*blockDim.x;
+  //unsigned int i = tid + bx*blockDim.x;
+  unsigned int i = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;
 
   // loads values into shared memory: assume array is a vector array
   sdata[tid] = (i < size) ? (array[i*3]*array[i*3] + array[i*3+1]*array[i*3+1] + array[i*3+2]*array[i*3+2]) : 0.0 ;
@@ -722,6 +723,10 @@ void FC_FUNC_(get_norm_elastic_from_device,
   // determines max for all blocks
   max = h_max[0];
   for(int i=1;i<num_blocks_x*num_blocks_y;i++) {
+    //debug: denormals can show up here in debugging checks like
+    //       -ffpe-trap=overflow,underflow,zero,invalid,denormal
+    //       when values are zero, avoid the denormal check
+    // printf("debug: %d h = %f %d\n",i,h_max[i],num_blocks_x * num_blocks_y);
     if (max < h_max[i]) max = h_max[i];
   }
   res = sqrt(max);

diff --git a/src/cuda/compute_forces_viscoelastic_cuda.cu b/src/cuda/compute_forces_viscoelastic_cuda.cu
diff --git a/src/cuda/initialize_cuda.cu b/src/cuda/initialize_cuda.cu
@@ -90,7 +90,7 @@ void FC_FUNC_(initialize_cuda_device,
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess){
     fprintf(stderr,"Error after cudaGetDeviceCount: %s\n", cudaGetErrorString(err));
-    exit_on_error("CUDA runtime error: cudaGetDeviceCount failed\n\nplease check if driver and runtime libraries work together\nor on titan enable environment: CRAY_CUDA_PROXY=1 to use single GPU with multiple MPI processes\n\nexiting...\n");
+    exit_on_error("CUDA runtime error: cudaGetDeviceCount failed\n\nplease check if driver and runtime libraries work together\nor on cluster environments enable MPS (Multi-Process Service) to use single GPU with multiple MPI processes\n\nexiting...\n");
   }
 
   // returns device count to fortran

diff --git a/src/cuda/mesh_constants_cuda.h b/src/cuda/mesh_constants_cuda.h
@@ -164,16 +164,18 @@
 // leads up to ~1% performance increase
 //#define MANUALLY_UNROLLED_LOOPS
 
-// compiler specifications
+// CUDA compiler specifications
 // (optional) use launch_bounds specification to increase compiler optimization
 // (depending on GPU type, register spilling might slow down the performance)
-// (uncomment if desired)
-//#define USE_LAUNCH_BOUNDS
-
 // elastic kernel
+#ifdef GPU_DEVICE_K20
+// specifics see: https://docs.nvidia.com/cuda/kepler-tuning-guide/index.html
+// maximum shared memory per thread block 48KB
+//
 // note: main kernel is Kernel_2_***_impl() which is limited by shared memory usage to 8 active blocks
 //       while register usage might use up to 9 blocks
 //
+//
 // performance statistics: kernel Kernel_2_noatt_impl():
 //       shared memory per block = 1700    for Kepler: total = 49152 -> limits active blocks to 16
 //       registers per thread    = 48
@@ -183,7 +185,35 @@
 //       shared memory per block = 6100    for Kepler: total = 49152 -> limits active blocks to 8
 //       registers per thread    = 59
 //       registers per block     = 8192                total = 65536 -> limits active blocks to 8
+//
+// (uncomment if desired)
+//#define USE_LAUNCH_BOUNDS
 #define LAUNCH_MIN_BLOCKS 10
+//#pragma message ("\nCompiling with: USE_LAUNCH_BOUNDS enabled for K20\n")
+#endif
+
+// add more card specific values
+#ifdef GPU_DEVICE_Maxwell
+// specifics see: https://docs.nvidia.com/cuda/maxwell-tuning-guide/index.html
+// register file size 64k 32-bit registers per SM
+// shared memory 64KB for GM107 and 96KB for GM204
+#undef USE_LAUNCH_BOUNDS
+#endif
+
+#ifdef GPU_DEVICE_Pascal
+// specifics see: https://docs.nvidia.com/cuda/pascal-tuning-guide/index.html
+// register file size 64k 32-bit registers per SM
+// shared memory 64KB for GP100 and 96KB for GP104
+#undef USE_LAUNCH_BOUNDS
+#endif
+
+#ifdef GPU_DEVICE_Volta
+// specifics see: https://docs.nvidia.com/cuda/volta-tuning-guide/index.html
+// register file size 64k 32-bit registers per SM
+// shared memory size 96KB per SM (maximum shared memory per thread block)
+// maximum registers 255 per thread
+#undef USE_LAUNCH_BOUNDS
+#endif
 
 // acoustic kernel
 // performance statistics: kernel Kernel_2_acoustic_impl():
@@ -369,6 +399,8 @@ typedef struct mesh_ {
 
   // mesh resolution
   int NSPEC_AB;
+  int NSPEC_IRREGULAR;
+
   int NGLOB_AB;
 
   // mpi process

diff --git a/src/cuda/prepare_mesh_constants_cuda.cu b/src/cuda/prepare_mesh_constants_cuda.cu
@@ -110,13 +110,17 @@ void FC_FUNC_(prepare_constants_device,
 
   // sets global parameters
   mp->NSPEC_AB = *NSPEC_AB;
+  mp->NSPEC_IRREGULAR = *NSPEC_IRREGULAR;
   mp->NGLOB_AB = *NGLOB_AB;
 
   // constants
   mp->simulation_type = *SIMULATION_TYPE;
   mp->absorbing_conditions = *ABSORBING_CONDITIONS;
   mp->save_forward = *SAVE_FORWARD;
 
+// DK DK August 2018: adding this test, following a suggestion by Etienne Bachmann
+  if (*h_NGLLX != NGLLX) exit_on_error("make sure that the NGLL constants are equal in the two files setup/constants.h and src/cuda/mesh_constants_cuda.h and then please re-compile; also make sure that the value of NGLL3_PADDED is consistent with the value of NGLL\n");
+
   // sets constant arrays
   setConst_hprime_xx(h_hprime_xx,mp);
   // setConst_hprime_yy(h_hprime_yy,mp); // only needed if NGLLX != NGLLY != NGLLZ
@@ -154,10 +158,7 @@ void FC_FUNC_(prepare_constants_device,
 
   // mesh
   // Assuming NGLLX=5. Padded is then 128 (5^3+3)
-  int size_padded = NGLL3_PADDED * (*NSPEC_IRREGULAR > 0 ? *NSPEC_IRREGULAR : 1);
-
-// DK DK August 2018: adding this test, following a suggestion by Etienne Bachmann
-  if (*h_NGLLX != NGLLX) exit_on_error("make sure that the NGLL constants are equal in the two files setup/constants.h and src/cuda/mesh_constants_cuda.h and then please re-compile; also make sure that the value of NGLL3_PADDED is consistent with the value of NGLL\n");
+  int size_padded = NGLL3_PADDED * (mp->NSPEC_IRREGULAR > 0 ? *NSPEC_IRREGULAR : 1);
 
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xix, size_padded*sizeof(realw)),1001);
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiy, size_padded*sizeof(realw)),1002);
@@ -172,7 +173,7 @@ void FC_FUNC_(prepare_constants_device,
   // transfer constant element data with padding
   /*
   // way 1: slow...
-  for(int i=0;i < mp->NSPEC_AB;i++) {
+  for(int i=0;i < mp->NSPEC_IRREGULAR;i++) {
     print_CUDA_error_if_any(cudaMemcpy(mp->d_xix + i*NGLL3_PADDED, &h_xix[i*NGLL3],
                                        NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1501);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_xiy+i*NGLL3_PADDED,   &h_xiy[i*NGLL3],
@@ -201,34 +202,34 @@ void FC_FUNC_(prepare_constants_device,
   if (*NSPEC_IRREGULAR > 0 ){
     print_CUDA_error_if_any(cudaMemcpy2D(mp->d_xix, NGLL3_PADDED*sizeof(realw),
                                          h_xix, NGLL3*sizeof(realw), NGLL3*sizeof(realw),
-                                         mp->NSPEC_AB, cudaMemcpyHostToDevice),1501);
+                                         mp->NSPEC_IRREGULAR, cudaMemcpyHostToDevice),1501);
     print_CUDA_error_if_any(cudaMemcpy2D(mp->d_xiy, NGLL3_PADDED*sizeof(realw),
                                          h_xiy, NGLL3*sizeof(realw), NGLL3*sizeof(realw),
-                                         mp->NSPEC_AB, cudaMemcpyHostToDevice),1502);
+                                         mp->NSPEC_IRREGULAR, cudaMemcpyHostToDevice),1502);
     print_CUDA_error_if_any(cudaMemcpy2D(mp->d_xiz, NGLL3_PADDED*sizeof(realw),
                                          h_xiz, NGLL3*sizeof(realw), NGLL3*sizeof(realw),
-                                         mp->NSPEC_AB, cudaMemcpyHostToDevice),1503);
+                                         mp->NSPEC_IRREGULAR, cudaMemcpyHostToDevice),1503);
     print_CUDA_error_if_any(cudaMemcpy2D(mp->d_etax, NGLL3_PADDED*sizeof(realw),
                                          h_etax, NGLL3*sizeof(realw), NGLL3*sizeof(realw),
-                                         mp->NSPEC_AB, cudaMemcpyHostToDevice),1504);
+                                         mp->NSPEC_IRREGULAR, cudaMemcpyHostToDevice),1504);
     print_CUDA_error_if_any(cudaMemcpy2D(mp->d_etay, NGLL3_PADDED*sizeof(realw),
                                          h_etay, NGLL3*sizeof(realw), NGLL3*sizeof(realw),
-                                         mp->NSPEC_AB, cudaMemcpyHostToDevice),1505);
+                                         mp->NSPEC_IRREGULAR, cudaMemcpyHostToDevice),1505);
     print_CUDA_error_if_any(cudaMemcpy2D(mp->d_etaz, NGLL3_PADDED*sizeof(realw),
                                          h_etaz, NGLL3*sizeof(realw), NGLL3*sizeof(realw),
-                                         mp->NSPEC_AB, cudaMemcpyHostToDevice),1506);
+                                         mp->NSPEC_IRREGULAR, cudaMemcpyHostToDevice),1506);
     print_CUDA_error_if_any(cudaMemcpy2D(mp->d_gammax, NGLL3_PADDED*sizeof(realw),
                                          h_gammax, NGLL3*sizeof(realw), NGLL3*sizeof(realw),
-                                         mp->NSPEC_AB, cudaMemcpyHostToDevice),1507);
+                                         mp->NSPEC_IRREGULAR, cudaMemcpyHostToDevice),1507);
     print_CUDA_error_if_any(cudaMemcpy2D(mp->d_gammay, NGLL3_PADDED*sizeof(realw),
                                          h_gammay, NGLL3*sizeof(realw), NGLL3*sizeof(realw),
-                                         mp->NSPEC_AB, cudaMemcpyHostToDevice),1508);
+                                         mp->NSPEC_IRREGULAR, cudaMemcpyHostToDevice),1508);
     print_CUDA_error_if_any(cudaMemcpy2D(mp->d_gammaz, NGLL3_PADDED*sizeof(realw),
                                          h_gammaz, NGLL3*sizeof(realw), NGLL3*sizeof(realw),
-                                         mp->NSPEC_AB, cudaMemcpyHostToDevice),1509);
+                                         mp->NSPEC_IRREGULAR, cudaMemcpyHostToDevice),1509);
   }
-  size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
 
+  size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
 
   // global indexing (padded)
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool, size_padded*sizeof(int)),1600);

diff --git a/src/cuda/update_displacement_cuda.cu b/src/cuda/update_displacement_cuda.cu
@@ -45,7 +45,8 @@ __global__ void UpdateDispVeloc_kernel(realw* displ,
                                        realw deltatover2) {
 
   // two dimensional array of blocks on grid where each block has one dimensional array of threads
-  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;
+
   realw acc = accel[id];
   // because of block and grid sizing problems, there is a small
   // amount of buffer at the end of the calculation
@@ -155,7 +156,7 @@ __global__ void UpdatePotential_kernel(field* potential_acoustic,
                                        realw deltatsqover2,
                                        realw deltatover2) {
 
-  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;
 
   // because of block and grid sizing problems, there is a small
   // amount of buffer at the end of the calculation
@@ -282,7 +283,8 @@ __global__ void kernel_3_cuda_device(realw* veloc,
                                      realw* rmassy,
                                      realw* rmassz) {
 
-  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;
+
   realw rx,ry,rz;
   realw ax,ay,az;
   // because of block and grid sizing problems, there is a small
@@ -331,7 +333,8 @@ __global__ void kernel_3_accel_cuda_device(realw* accel,
                                            realw* rmassx,
                                            realw* rmassy,
                                            realw* rmassz) {
-  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+  int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;
 
   realw rx,ry,rz;
   realw ax,ay,az;
@@ -369,7 +372,7 @@ __global__ void kernel_3_veloc_cuda_device(realw* veloc,
                                            int size,
                                            realw deltatover2) {
 
-  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;
 
   // because of block and grid sizing problems, there is a small
   // amount of buffer at the end of the calculation
@@ -493,7 +496,8 @@ __global__ void kernel_3_acoustic_cuda_device(field* potential_dot_acoustic,
                                                 realw b_deltatover2,
                                                 realw* rmass_acoustic) {
 
-  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;
+
   realw rmass;
   field p_dot_dot;
   // because of block and grid sizing problems, there is a small

diff --git a/src/generate_databases/calc_jacobian.f90 b/src/generate_databases/calc_jacobian.f90
@@ -184,6 +184,10 @@ subroutine check_element_regularity(xelm,yelm,zelm,any_regular_elem,cube_edge_si
   double precision :: dist1_sq,dist2_sq,dist3_sq
   double precision :: threshold
   double precision,parameter :: threshold_percentage = 1.e-5
+  logical :: eqx1,eqx2,eqx3,eqx4,eqx5,eqx6
+  logical :: eqy1,eqy2,eqy3,eqy4,eqy5,eqy6
+  logical :: eqz1,eqz2,eqz3,eqz4,eqz5,eqz6
+  logical, external :: is_equal_number
 
   ! by default, we assume to have a perfect regular shape (cube)
 
@@ -201,13 +205,57 @@ subroutine check_element_regularity(xelm,yelm,zelm,any_regular_elem,cube_edge_si
     return
   endif
 
+  ! number convention:
+  !
+  !              8                 7
+  !              o ------------- o
+  !             /|              /|
+  !            / |             / |
+  !          5o---------------o6 |
+  !           |  |            |  |
+  !           | 4o----------- |--o3
+  !           | /             | /
+  !           |/              |/
+  !     z     o---------------o
+  !     | y   1                 2
+  !     |/
+  !     o---> x
+  !
+
+  ! checks x-plane (using epsilon() function to determine almost negligible differences)
+  eqx1 = is_equal_number(xelm(1),xelm(4))
+  eqx2 = is_equal_number(xelm(1),xelm(5))
+  eqx3 = is_equal_number(xelm(1),xelm(8))
+  eqx4 = is_equal_number(xelm(2),xelm(3))
+  eqx5 = is_equal_number(xelm(2),xelm(6))
+  eqx6 = is_equal_number(xelm(2),xelm(7))
+  ! checks y-plane
+  eqy1 = is_equal_number(yelm(1),yelm(2))
+  eqy2 = is_equal_number(yelm(1),yelm(5))
+  eqy3 = is_equal_number(yelm(1),yelm(6))
+  eqy4 = is_equal_number(yelm(3),yelm(4))
+  eqy5 = is_equal_number(yelm(3),yelm(7))
+  eqy6 = is_equal_number(yelm(3),yelm(8))
+  ! checks z-plane
+  eqz1 = is_equal_number(zelm(1),zelm(2))
+  eqz2 = is_equal_number(zelm(1),zelm(3))
+  eqz3 = is_equal_number(zelm(1),zelm(4))
+  eqz4 = is_equal_number(zelm(5),zelm(6))
+  eqz5 = is_equal_number(zelm(5),zelm(7))
+  eqz6 = is_equal_number(zelm(5),zelm(8))
+
   ! checks if the element is a cube (following numbering convention in a 8 nodes element)
-  if (xelm(1) == xelm(4) .and. xelm(1) == xelm(5) .and. xelm(1) == xelm(8) .and. &
-      xelm(2) == xelm(3) .and. xelm(2) == xelm(6) .and. xelm(2) == xelm(7) .and. &
-      yelm(1) == yelm(2) .and. yelm(1) == yelm(5) .and. yelm(1) == yelm(6) .and. &
-      yelm(3) == yelm(4) .and. yelm(3) == yelm(7) .and. yelm(3) == yelm(8) .and. &
-      zelm(1) == zelm(2) .and. zelm(1) == zelm(3) .and. zelm(1) == zelm(4) .and. &
-      zelm(5) == zelm(6) .and. zelm(5) == zelm(7) .and. zelm(5) == zelm(8) ) then
+  ! with numerically negligible differences (to avoid representation issues of float values)
+  if (eqx1 .and. eqx2 .and. eqx3 .and. eqx4 .and. eqx5 .and. eqx6 .and. &
+      eqy1 .and. eqy2 .and. eqy3 .and. eqy4 .and. eqy5 .and. eqy6 .and. &
+      eqz1 .and. eqz2 .and. eqz3 .and. eqz4 .and. eqz5 .and. eqz6 ) then
+  ! or direct float comparisons
+  !if (xelm(1) == xelm(4) .and. xelm(1) == xelm(5) .and. xelm(1) == xelm(8) .and. &
+  !    xelm(2) == xelm(3) .and. xelm(2) == xelm(6) .and. xelm(2) == xelm(7) .and. &
+  !    yelm(1) == yelm(2) .and. yelm(1) == yelm(5) .and. yelm(1) == yelm(6) .and. &
+  !    yelm(3) == yelm(4) .and. yelm(3) == yelm(7) .and. yelm(3) == yelm(8) .and. &
+  !    zelm(1) == zelm(2) .and. zelm(1) == zelm(3) .and. zelm(1) == zelm(4) .and. &
+  !    zelm(5) == zelm(6) .and. zelm(5) == zelm(7) .and. zelm(5) == zelm(8) ) then
 
     dist2_sq = (xelm(5)-xelm(1))**2 + (yelm(5)-yelm(1))**2 + (zelm(5)-zelm(1))**2
     dist3_sq = (xelm(4)-xelm(1))**2 + (yelm(4)-yelm(1))**2 + (zelm(4)-zelm(1))**2
@@ -235,3 +283,28 @@ subroutine check_element_regularity(xelm,yelm,zelm,any_regular_elem,cube_edge_si
   endif
 
   end subroutine check_element_regularity
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  function is_equal_number(a,b)
+
+! compares two real values and determines if they are equal (within numerical precision)
+! to avoid direct comparisons a == b which can fail for rounding issues
+
+  implicit none
+
+  real, intent(in) :: a,b
+  logical :: is_equal_number
+
+  ! note: epsilon() intrinsic function
+  !       returns a positive number that is almost negligible
+  !       example: If x is of type REAL(4), EPSILON (X) has the value 2**-23
+  if (abs(a - b) <= epsilon(a)) then
+    is_equal_number = .true.
+  else
+    is_equal_number = .false.
+  endif
+
+  end function is_equal_number