Lassen regression test update (#1094)

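This PR updates the regression tests on Lassen because the machine's default CUDA version has been upgraded from 10.1.243 to 11.2.0. Along with regenerating the saved reference outputs, it drops the `-qmaxmem=-1` XL compiler flag from the build configurations, points `UMPIRE_DIR` at an Umpire build made with CUDA 11.2, loads `cmake/3.16.8` before the CMake build tests, and reduces the 27-point vendor SpGEMM benchmark case from a 128x128x128 grid to 80x80x80.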
hypre-space · May 3, 2024 · 94a9b16 · 94a9b16
1 parent d475cdf
commit 94a9b16
Show file tree

Hide file tree

Showing 11 changed files with 145 additions and 141 deletions.
diff --git a/AUTOTEST/machine-lassen.sh b/AUTOTEST/machine-lassen.sh
@@ -47,57 +47,57 @@ save="lassen"
 
 ######################
 ##   DEFAULT CUDA   ##
-##  (cuda/10.1.243) ##
+##  (cuda/11.2.0)   ##
 ######################
 
 module -q load cuda
 module -q load xl
 
 # CUDA with UM in debug mode [ij, ams, struct, sstruct]
-co="--with-cuda --enable-unified-memory --enable-persistent --enable-debug --with-gpu-arch=70 --with-memory-tracker --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'"
+co="--with-cuda --enable-unified-memory --enable-persistent --enable-debug --with-gpu-arch=70 --with-memory-tracker --with-extra-CFLAGS=\\'-qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qsuppress=1500-029\\'"
 ro="-ij-gpu -ams -struct -sstruct -rt -mpibind -save ${save} -rtol ${rtol} -atol ${atol}"
 eo="-gpu -rt -mpibind -save ${save} -rtol ${rtol} -atol ${atol}"
 ./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro -eo: $eo
 ./renametest.sh basic $output_dir/basic-cuda-um
 
 # CUDA with UM in debug mode [ij, ams, struct, sstruct]
-co="--with-cuda --enable-unified-memory --enable-persistent --enable-debug --with-print-errors --with-gpu-arch=70 --with-memory-tracker --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'"
+co="--with-cuda --enable-unified-memory --enable-persistent --enable-debug --with-print-errors --with-gpu-arch=70 --with-memory-tracker --with-extra-CFLAGS=\\'-qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qsuppress=1500-029\\'"
 ro="-error -rt -mpibind -save ${save} -rtol ${rtol} -atol ${atol}"
 ./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro
 ./renametest.sh basic $output_dir/basic-cuda-um-with-errors
 
 # CUDA with UM and mixed-int
-co="--with-cuda --enable-unified-memory --enable-mixedint --enable-debug --with-gpu-arch=70 --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'"
+co="--with-cuda --enable-unified-memory --enable-mixedint --enable-debug --with-gpu-arch=70 --with-extra-CFLAGS=\\'-qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qsuppress=1500-029\\'"
 ro="-ij-mixed -ams -struct -sstruct-mixed -rt -mpibind -save ${save} -rtol ${rtol} -atol ${atol}"
 ./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro
 ./renametest.sh basic $output_dir/basic-cuda-um-mixedint
 
 # CUDA with UM with shared library
-co="--with-cuda --enable-unified-memory --with-openmp --enable-hopscotch --enable-shared --with-gpu-arch=70 --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'"
+co="--with-cuda --enable-unified-memory --with-openmp --enable-hopscotch --enable-shared --with-gpu-arch=70 --with-extra-CFLAGS=\\'-qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qsuppress=1500-029\\'"
 ro="-gpumemcheck -rt -mpibind -cudamemcheck -save ${save}"
 ./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro
 ./renametest.sh basic $output_dir/basic-cuda-um-shared
 
 # CUDA with UM and single precision
-co="--with-cuda --enable-unified-memory --enable-single --enable-debug --with-gpu-arch=70 --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'"
+co="--with-cuda --enable-unified-memory --enable-single --enable-debug --with-gpu-arch=70 --with-extra-CFLAGS=\\'-qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qsuppress=1500-029\\'"
 ro="-single -rt -mpibind -save ${save}"
 ./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: ${ro}
 ./renametest.sh basic $output_dir/basic-cuda-um-single
 
 # CUDA with UM without MPI [no run]
-#co="--with-cuda --enable-unified-memory --without-MPI --with-gpu-arch=70 --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'"
+#co="--with-cuda --enable-unified-memory --without-MPI --with-gpu-arch=70 --with-extra-CXXFLAGS=\\'-qsuppress=1500-029\\'"
 #./test.sh basic.sh $src_dir -co: $co -mo: $mo
 #./renametest.sh basic $output_dir/basic-cuda-um-without-MPI
 
 # CUDA without UM with device memory pool [struct]
-co="--with-cuda --enable-device-memory-pool --with-gpu-arch=70 --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'"
+co="--with-cuda --enable-device-memory-pool --with-gpu-arch=70 --with-extra-CFLAGS=\\'-qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qsuppress=1500-029\\'"
 ro="-struct -rt -mpibind -save ${save}"
 ./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro
 ./renametest.sh basic $output_dir/basic-cuda-nonum
 
 # CUDA without UM with umpire [benchmark]
-UMPIRE_DIR=/usr/workspace/hypre/ext-libs/Umpire/2022.03.1-nvcc10.1.243-sm_70-xl2021.09.22
-co="--with-cuda --with-gpu-arch=70 --with-umpire --with-umpire-include=${UMPIRE_DIR}/include --with-umpire-lib-dirs=${UMPIRE_DIR}/lib --with-umpire-libs=umpire --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'"
+UMPIRE_DIR=/usr/workspace/hypre/ext-libs/Umpire/2022.03.1-nvcc11.2-sm_70-xl2023.06.28-cuda-11.2.0-gcc-8.3.1
+co="--with-cuda --with-gpu-arch=70 --with-umpire --with-umpire-include=${UMPIRE_DIR}/include --with-umpire-lib-dirs=${UMPIRE_DIR}/lib --with-umpire-libs=umpire --with-extra-CFLAGS=\\'-qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qsuppress=1500-029\\'"
 ro="-bench -rt -mpibind -save ${save}"
 ./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro
 ./renametest.sh basic $output_dir/basic-cuda-bench
@@ -113,34 +113,36 @@ ro="-ij-noilu -ams -struct -sstruct -rt -mpibind -save lassen_cpu"
 ############
 
 # OMP 4.5 with UM with shared library [no run]
-#co="--with-device-openmp --enable-unified-memory --enable-shared --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029:1500-030:1501-308\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029:1500-030:1501-308\\'"
+#co="--with-device-openmp --enable-unified-memory --enable-shared --with-extra-CFLAGS=\\'-qsuppress=1500-029:1500-030:1501-308\\' --with-extra-CXXFLAGS=\\'-qsuppress=1500-029:1500-030:1501-308\\'"
 #./test.sh basic.sh $src_dir -co: $co -mo: $mo
 #./renametest.sh basic $output_dir/basic-deviceomp-um-shared
 
 # OMP 4.5 without UM in debug mode [struct]
-co="--with-device-openmp --enable-debug --with-gpu-arch=70 --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'"
+co="--with-device-openmp --enable-debug --with-gpu-arch=70 --with-extra-CFLAGS=\\'-qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qsuppress=1500-029\\'"
 ro="-struct -rt -mpibind -save ${save}"
 ./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro
 ./renametest.sh basic $output_dir/basic-deviceomp-nonum-debug-struct
 
 #####################################
 ## CUDA + CMake build (only) tests ##
 #####################################
+module -q load cmake/3.16.8
+module list cmake/3.16.8 |& grep "None found"
 
 mo="-j"
 
 # CUDA with UM + CMake
-co="-DCMAKE_C_COMPILER=$(which xlc) -DCMAKE_CXX_COMPILER=$(which xlc++) -DCMAKE_CUDA_COMPILER=$(which nvcc) -DMPI_C_COMPILER=$(which mpicc) -DMPI_CXX_COMPILER=$(which mpicxx) -DHYPRE_WITH_CUDA=ON -DHYPRE_ENABLE_UNIFIED_MEMORY=ON -DCMAKE_BUILD_TYPE=Debug -DHYPRE_ENABLE_PERSISTENT_COMM=ON -DHYPRE_ENABLE_DEVICE_POOL=ON -DHYPRE_WITH_EXTRA_CFLAGS="\'"-qmaxmem=-1 -qsuppress=1500-029"\'" -DHYPRE_WITH_EXTRA_CXXFLAGS="\'"-qmaxmem=-1 -qsuppress=1500-029"\'" -DHYPRE_CUDA_SM=70"
+co="-DCMAKE_C_COMPILER=$(which xlc) -DCMAKE_CXX_COMPILER=$(which xlc++) -DCMAKE_CUDA_COMPILER=$(which nvcc) -DMPI_C_COMPILER=$(which mpicc) -DMPI_CXX_COMPILER=$(which mpicxx) -DHYPRE_WITH_CUDA=ON -DHYPRE_ENABLE_UNIFIED_MEMORY=ON -DCMAKE_BUILD_TYPE=Debug -DHYPRE_ENABLE_PERSISTENT_COMM=ON -DHYPRE_ENABLE_DEVICE_POOL=ON -DHYPRE_WITH_EXTRA_CFLAGS="\'"-qsuppress=1500-029"\'" -DHYPRE_WITH_EXTRA_CXXFLAGS="\'"-qsuppress=1500-029"\'" -DHYPRE_CUDA_SM=70"
 ./test.sh cmake.sh $src_dir -co: $co -mo: $mo
 ./renametest.sh cmake $output_dir/cmake-cuda-um-ij
 
 # CUDA with UM [shared library] + CMake
-co="-DCMAKE_C_COMPILER=$(which xlc) -DCMAKE_CXX_COMPILER=$(which xlc++) -DCMAKE_CUDA_COMPILER=$(which nvcc) -DMPI_C_COMPILER=$(which mpicc) -DMPI_CXX_COMPILER=$(which mpicxx) -DHYPRE_WITH_CUDA=ON -DHYPRE_ENABLE_UNIFIED_MEMORY=ON -DCMAKE_BUILD_TYPE=Debug -DHYPRE_WITH_OPENMP=ON -DHYPRE_ENABLE_HOPSCOTCH=ON -DHYPRE_ENABLE_SHARED=ON -DHYPRE_WITH_EXTRA_CFLAGS="\'"-qmaxmem=-1 -qsuppress=1500-029"\'" -DHYPRE_WITH_EXTRA_CXXFLAGS="\'"-qmaxmem=-1 -qsuppress=1500-029 "\'" -DHYPRE_CUDA_SM=70"
+co="-DCMAKE_C_COMPILER=$(which xlc) -DCMAKE_CXX_COMPILER=$(which xlc++) -DCMAKE_CUDA_COMPILER=$(which nvcc) -DMPI_C_COMPILER=$(which mpicc) -DMPI_CXX_COMPILER=$(which mpicxx) -DHYPRE_WITH_CUDA=ON -DHYPRE_ENABLE_UNIFIED_MEMORY=ON -DCMAKE_BUILD_TYPE=Debug -DHYPRE_WITH_OPENMP=ON -DHYPRE_ENABLE_HOPSCOTCH=ON -DHYPRE_ENABLE_SHARED=ON -DHYPRE_WITH_EXTRA_CFLAGS="\'"-qsuppress=1500-029"\'" -DHYPRE_WITH_EXTRA_CXXFLAGS="\'"-qsuppress=1500-029 "\'" -DHYPRE_CUDA_SM=70"
 ./test.sh cmake.sh $src_dir -co: $co -mo: $mo
 ./renametest.sh cmake $output_dir/cmake-cuda-um-shared
 
 # CUDA w.o UM + CMake
-co="-DCMAKE_C_COMPILER=$(which xlc) -DCMAKE_CXX_COMPILER=$(which xlc++) -DCMAKE_CUDA_COMPILER=$(which nvcc) -DMPI_C_COMPILER=$(which mpicc) -DMPI_CXX_COMPILER=$(which mpicxx) -DHYPRE_WITH_CUDA=ON -DCMAKE_BUILD_TYPE=Debug -DHYPRE_WITH_EXTRA_CFLAGS="\'"-qmaxmem=-1 -qsuppress=1500-029"\'" -DHYPRE_WITH_EXTRA_CXXFLAGS="\'"-qmaxmem=-1 -qsuppress=1500-029"\'" -DHYPRE_CUDA_SM=70"
+co="-DCMAKE_C_COMPILER=$(which xlc) -DCMAKE_CXX_COMPILER=$(which xlc++) -DCMAKE_CUDA_COMPILER=$(which nvcc) -DMPI_C_COMPILER=$(which mpicc) -DMPI_CXX_COMPILER=$(which mpicxx) -DHYPRE_WITH_CUDA=ON -DCMAKE_BUILD_TYPE=Debug -DHYPRE_WITH_EXTRA_CFLAGS="\'"-qsuppress=1500-029"\'" -DHYPRE_WITH_EXTRA_CXXFLAGS="\'"-qsuppress=1500-029"\'" -DHYPRE_CUDA_SM=70"
 ./test.sh cmake.sh $src_dir -co: $co -mo: $mo
 ./renametest.sh cmake $output_dir/cmake-cuda-nonum-struct
 

diff --git a/src/test/TEST_ams/solvers.saved.lassen b/src/test/TEST_ams/solvers.saved.lassen
@@ -1,117 +1,117 @@
 # Output file: solvers.out.0
-    Cycle 58   2.180962e-05    0.801100     1.624257e-06
-    Cycle 59   1.747511e-05    0.801257     1.301447e-06
-    Cycle 60   1.399773e-05    0.801009     1.042471e-06
-    Cycle 61   1.122074e-05    0.801612     8.356570e-07
+    Cycle 58   2.180961e-05    0.801052     1.624256e-06 
+    Cycle 59   1.747620e-05    0.801308     1.301528e-06 
+    Cycle 60   1.400242e-05    0.801228     1.042821e-06 
+    Cycle 61   1.121910e-05    0.801226     8.355349e-07 
 
 
- Average Convergence Factor = 0.794989
+ Average Convergence Factor = 0.794987
 
 # Output file: solvers.out.1
-    Cycle 58   2.180962e-05    0.801100     1.624257e-06
-    Cycle 59   1.747511e-05    0.801257     1.301447e-06
-    Cycle 60   1.399773e-05    0.801009     1.042471e-06
-    Cycle 61   1.122074e-05    0.801612     8.356570e-07
+    Cycle 58   2.180961e-05    0.801052     1.624256e-06 
+    Cycle 59   1.747620e-05    0.801308     1.301528e-06 
+    Cycle 60   1.400242e-05    0.801228     1.042821e-06 
+    Cycle 61   1.121910e-05    0.801226     8.355349e-07 
 
 
- Average Convergence Factor = 0.794989
+ Average Convergence Factor = 0.794987
 
 # Output file: solvers.out.2
-    Cycle 97   1.783837e-05    0.873468     1.328500e-06
-    Cycle 98   1.558153e-05    0.873484     1.160424e-06
-    Cycle 99   1.361003e-05    0.873472     1.013597e-06
-    Cycle 100   1.188958e-05    0.873590     8.854686e-07
+    Cycle 97   1.784019e-05    0.873467     1.328636e-06 
+    Cycle 98   1.557979e-05    0.873297     1.160294e-06 
+    Cycle 99   1.360847e-05    0.873470     1.013482e-06 
+    Cycle 100   1.188950e-05    0.873684     8.854624e-07 
 
 
  Average Convergence Factor = 0.869905
 
 # Output file: solvers.out.3
-    Cycle 97   1.783837e-05    0.873468     1.328500e-06
-    Cycle 98   1.558153e-05    0.873484     1.160424e-06
-    Cycle 99   1.361003e-05    0.873472     1.013597e-06
-    Cycle 100   1.188958e-05    0.873590     8.854686e-07
+    Cycle 97   1.784019e-05    0.873467     1.328636e-06 
+    Cycle 98   1.557979e-05    0.873297     1.160294e-06 
+    Cycle 99   1.360847e-05    0.873470     1.013482e-06 
+    Cycle 100   1.188950e-05    0.873684     8.854624e-07 
 
 
  Average Convergence Factor = 0.869905
 
 # Output file: solvers.out.4
 
 Iterations = 13
-Final Relative Residual Norm = 4.939012e-07
+Final Relative Residual Norm = 4.939034e-07
 
 # Output file: solvers.out.5
 
 Iterations = 13
-Final Relative Residual Norm = 4.939012e-07
+Final Relative Residual Norm = 4.939034e-07
 
 # Output file: solvers.out.6
 
 Iterations = 16
-Final Relative Residual Norm = 6.410688e-07
+Final Relative Residual Norm = 6.410653e-07
 
 # Output file: solvers.out.7
 
 Iterations = 16
-Final Relative Residual Norm = 6.410688e-07
+Final Relative Residual Norm = 6.410653e-07
 
 # Output file: solvers.out.12
 
 Iterations = 18
-Final Relative Residual Norm = 2.433909e+01
+Final Relative Residual Norm = 2.433983e+01
 
 # Output file: solvers.out.8
 
-Eigenvalue lambda   3.02357653920195e+01
-Eigenvalue lambda   3.03135374702301e+01
-Eigenvalue lambda   3.85013899430578e+01
-Eigenvalue lambda   5.14395940112140e+01
-Eigenvalue lambda   5.15742481824047e+01
-Residual   7.45352589377429e-05
-Residual   7.43880033246762e-05
-Residual   1.26594022210671e-04
-Residual   8.99904407865315e-05
-Residual   9.30548924090266e-05
+Eigenvalue lambda   3.02357653920079e+01
+Eigenvalue lambda   3.03135374702236e+01
+Eigenvalue lambda   3.85013899430559e+01
+Eigenvalue lambda   5.14395940112081e+01
+Eigenvalue lambda   5.15742481823856e+01
+Residual   7.45352588578718e-05
+Residual   7.43880032620007e-05
+Residual   1.26594022215394e-04
+Residual   8.99904406834779e-05
+Residual   9.30548921427010e-05
 
 24 iterations
 # Output file: solvers.out.9
 
-Eigenvalue lambda   3.02357653920195e+01
-Eigenvalue lambda   3.03135374702301e+01
-Eigenvalue lambda   3.85013899430578e+01
-Eigenvalue lambda   5.14395940112140e+01
-Eigenvalue lambda   5.15742481824047e+01
-Residual   7.45352589377429e-05
-Residual   7.43880033246762e-05
-Residual   1.26594022210671e-04
-Residual   8.99904407865315e-05
-Residual   9.30548924090266e-05
+Eigenvalue lambda   3.02357653920079e+01
+Eigenvalue lambda   3.03135374702236e+01
+Eigenvalue lambda   3.85013899430559e+01
+Eigenvalue lambda   5.14395940112081e+01
+Eigenvalue lambda   5.15742481823856e+01
+Residual   7.45352588578718e-05
+Residual   7.43880032620007e-05
+Residual   1.26594022215394e-04
+Residual   8.99904406834779e-05
+Residual   9.30548921427010e-05
 
 24 iterations
 # Output file: solvers.out.10
 
-Eigenvalue lambda   3.02357653945254e+01
-Eigenvalue lambda   3.03135374737707e+01
-Eigenvalue lambda   3.85013899480154e+01
-Eigenvalue lambda   5.14395940359891e+01
-Eigenvalue lambda   5.15742483525312e+01
-Residual   8.77563343325342e-05
-Residual   1.06541588442403e-04
-Residual   8.94426356809917e-05
-Residual   1.08234820725642e-04
-Residual   1.17206146877284e-04
+Eigenvalue lambda   3.02357653967707e+01
+Eigenvalue lambda   3.03135374800728e+01
+Eigenvalue lambda   3.85013899397799e+01
+Eigenvalue lambda   5.14395940101665e+01
+Eigenvalue lambda   5.15742480845151e+01
+Residual   8.77562666069003e-05
+Residual   1.06541449357735e-04
+Residual   8.94425865221178e-05
+Residual   1.08234881163452e-04
+Residual   1.17205182508686e-04
 
 35 iterations
 # Output file: solvers.out.11
 
-Eigenvalue lambda   3.02357653945254e+01
-Eigenvalue lambda   3.03135374737707e+01
-Eigenvalue lambda   3.85013899480154e+01
-Eigenvalue lambda   5.14395940359891e+01
-Eigenvalue lambda   5.15742483525312e+01
-Residual   8.77563343325342e-05
-Residual   1.06541588442403e-04
-Residual   8.94426356809917e-05
-Residual   1.08234820725642e-04
-Residual   1.17206146877284e-04
+Eigenvalue lambda   3.02357653967707e+01
+Eigenvalue lambda   3.03135374800728e+01
+Eigenvalue lambda   3.85013899397799e+01
+Eigenvalue lambda   5.14395940101665e+01
+Eigenvalue lambda   5.15742480845151e+01
+Residual   8.77562666069003e-05
+Residual   1.06541449357735e-04
+Residual   8.94425865221178e-05
+Residual   1.08234881163452e-04
+Residual   1.17205182508686e-04
 
 35 iterations
diff --git a/src/test/TEST_bench/benchmark_spgemm.jobs b/src/test/TEST_bench/benchmark_spgemm.jobs
@@ -26,7 +26,7 @@ mpirun -np 1 ./ij_mm -n 1024 1024 1  -5pt -verify 1 -spgemm_binned 0 -spgemmalg
 mpirun -np 1 ./ij_mm -n 1024 1024 1  -9pt -verify 1 -spgemm_binned 0 -spgemmalg 3 -rep 10 > benchmark_spgemm.out.12
 
 mpirun -np 1 ./ij_mm -n 128 128 128  -7pt -verify 1 -vendor 1                     -rep 10 > benchmark_spgemm.out.13
-mpirun -np 1 ./ij_mm -n 128 128 128 -27pt -verify 1 -vendor 1                     -rep 10 > benchmark_spgemm.out.14
+mpirun -np 1 ./ij_mm -n 80   80  80 -27pt -verify 1 -vendor 1                     -rep 10 > benchmark_spgemm.out.14
 mpirun -np 1 ./ij_mm -n 1024 1024 1  -5pt -verify 1 -vendor 1                     -rep 10 > benchmark_spgemm.out.15
 mpirun -np 1 ./ij_mm -n 1024 1024 1  -9pt -verify 1 -vendor 1                     -rep 10 > benchmark_spgemm.out.16
 

diff --git a/src/test/TEST_bench/benchmark_spgemm.perf.saved.lassen b/src/test/TEST_bench/benchmark_spgemm.perf.saved.lassen
@@ -23,23 +23,23 @@ Device Parcsr Matrix-by-Matrix wall clock time = 0.008427 seconds
 # Output file: benchmark_spgemm.out.12
 Device Parcsr Matrix-by-Matrix wall clock time = 0.011918 seconds
 # Output file: benchmark_spgemm.out.13
-Device Parcsr Matrix-by-Matrix wall clock time = 0.122758 seconds
+Device Parcsr Matrix-by-Matrix wall clock time = 0.027048 seconds
 # Output file: benchmark_spgemm.out.14
-Device Parcsr Matrix-by-Matrix wall clock time = 0.654239 seconds
+Device Parcsr Matrix-by-Matrix wall clock time = 0.064765 seconds
 # Output file: benchmark_spgemm.out.15
-Device Parcsr Matrix-by-Matrix wall clock time = 0.041012 seconds
+Device Parcsr Matrix-by-Matrix wall clock time = 0.006528 seconds
 # Output file: benchmark_spgemm.out.16
-Device Parcsr Matrix-by-Matrix wall clock time = 0.075197 seconds
+Device Parcsr Matrix-by-Matrix wall clock time = 0.018399 seconds
 # Output file: benchmark_spgemm.out.17
-Device Parcsr Matrix-by-Matrix wall clock time = 0.001930 seconds
+Device Parcsr Matrix-by-Matrix wall clock time = 0.001937 seconds
 # Output file: benchmark_spgemm.out.18
-Device Parcsr Matrix-by-Matrix wall clock time = 0.005978 seconds
+Device Parcsr Matrix-by-Matrix wall clock time = 0.006223 seconds
 # Output file: benchmark_spgemm.out.19
 Device Parcsr Matrix-by-Matrix wall clock time = 0.001117 seconds
 # Output file: benchmark_spgemm.out.20
 Device Parcsr Matrix-by-Matrix wall clock time = 0.001318 seconds
 # Output file: benchmark_spgemm.out.21
-Device Parcsr Matrix-by-Matrix wall clock time = 0.031280 seconds
+Device Parcsr Matrix-by-Matrix wall clock time = 0.026565 seconds
 # Output file: benchmark_spgemm.out.22
 Device Parcsr Matrix-by-Matrix wall clock time = 0.138814 seconds
 # Output file: benchmark_spgemm.out.23

diff --git a/src/test/TEST_bench/benchmark_spgemm.perf.saved.tioga b/src/test/TEST_bench/benchmark_spgemm.perf.saved.tioga
@@ -25,7 +25,7 @@ Device Parcsr Matrix-by-Matrix wall clock time = 0.019611 seconds
 # Output file: benchmark_spgemm.out.13
 Device Parcsr Matrix-by-Matrix wall clock time = 0.018017 seconds
 # Output file: benchmark_spgemm.out.14
-Device Parcsr Matrix-by-Matrix wall clock time = 0.130224 seconds
+Device Parcsr Matrix-by-Matrix wall clock time = 0.014804 seconds
 # Output file: benchmark_spgemm.out.15
 Device Parcsr Matrix-by-Matrix wall clock time = 0.006545 seconds
 # Output file: benchmark_spgemm.out.16

diff --git a/src/test/TEST_bench/benchmark_spgemm.saved.lassen b/src/test/TEST_bench/benchmark_spgemm.saved.lassen
@@ -38,8 +38,8 @@ B 1048576 x 1048576, NNZ 26152996, RNZ 24
 A^2: 2097152 x 2097152, nnz [CPU 51742208, GPU 51742208], CPU-GPU err 0.000000e+00
 B 2097152 x 2097152, NNZ 51742208, RNZ 24
 # Output file: benchmark_spgemm.out.14
-A^2: 2097152 x 2097152, nnz [CPU 254840104, GPU 254840104], CPU-GPU err 0.000000e+00
-B 2097152 x 2097152, NNZ 254840104, RNZ 121
+A^2: 512000 x 512000, nnz [CPU 61162984, GPU 61162984], CPU-GPU err 0.000000e+00
+B 512000 x 512000, NNZ 61162984, RNZ 119
 # Output file: benchmark_spgemm.out.15
 A^2: 1048576 x 1048576, nnz [CPU 13611012, GPU 13611012], CPU-GPU err 0.000000e+00
 B 1048576 x 1048576, NNZ 13611012, RNZ 12

diff --git a/src/test/TEST_bench/benchmark_spgemm.saved.tioga b/src/test/TEST_bench/benchmark_spgemm.saved.tioga
@@ -38,8 +38,8 @@ B 1048576 x 1048576, NNZ 26152996, RNZ 24
 A^2: 2097152 x 2097152, nnz [CPU 51742208, GPU 51742208], CPU-GPU err 0.000000e+00
 B 2097152 x 2097152, NNZ 51742208, RNZ 24
 # Output file: benchmark_spgemm.out.14
-A^2: 2097152 x 2097152, nnz [CPU 254840104, GPU 254840104], CPU-GPU err 0.000000e+00
-B 2097152 x 2097152, NNZ 254840104, RNZ 121
+A^2: 512000 x 512000, nnz [CPU 61162984, GPU 61162984], CPU-GPU err 0.000000e+00
+B 512000 x 512000, NNZ 61162984, RNZ 119
 # Output file: benchmark_spgemm.out.15
 A^2: 1048576 x 1048576, nnz [CPU 13611012, GPU 13611012], CPU-GPU err 0.000000e+00
 B 1048576 x 1048576, NNZ 13611012, RNZ 12