Skip to content

Commit

Permalink
Merge pull request #20 from spcl/numba-validation-fixes
Browse files Browse the repository at this point in the history
Various fixes
  • Loading branch information
alexnick83 authored Feb 3, 2025
2 parents 0bc108b + 4eba4df commit 1a9e6f8
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 25 deletions.
3 changes: 2 additions & 1 deletion frameworks.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ The base `Framework` class (found in [`npbench/infrastructure/framework.py`](npb
- impl_files: Returns a list of the framework's implementation files for the input benchmark. Each element in the list is a tuple of the implementation filename and a description (e.g. `default` or `nopython-parallel`).
- implementations: Returns a list of the framework's implementations for the input benchmark. Each element in the list is a tuple of the implementation method and a description (as above).
- args: Returns a list with the names of the input arguments for running the input implementation of the input benchmark.
- out_args: Returns a list with the input arguments for running the input implementation of the input benchmark **and** have to be copied(for example, because they may be modified during benchmark execution).
- mutable_args: Returns a list with the input arguments for running the input implementation of the input benchmark that **also** have to be copied (for example, because they may be modified during benchmark execution).
- inout_args: Returns a list with the input arguments that are also output, i.e., they must be validated.
- arg_str: Returns the argument-string needed to call the input implementation of the input benchmark.
- out_arg_str: Returns the argument-string with the input arguments that must be copied.
- setup_str: Returns the setup-string of the code that should be executed for, e.g., copying data, before executing the benchmark implementation.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def kernel(A):

A[0, 0] = np.sqrt(A[0, 0])
for i in range(1, A.shape[0]):
for j in nb.prange(i):
for j in range(i):
A[i, j] -= np.dot(A[i, :j], A[j, :j])
A[i, j] /= A[j, j]
A[i, i] -= np.dot(A[i, :i], A[i, :i])
Expand Down
25 changes: 13 additions & 12 deletions npbench/infrastructure/dace_framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,14 +185,14 @@ def parallelize(sdfg):
try:

def autoopt(sdfg, device, symbols): #, nofuse):
# Mark arrays as on the GPU
if device == dtypes.DeviceType.GPU:
for k, v in sdfg.arrays.items():
if not v.transient and type(v) == dace.data.Array:
v.storage = dace.dtypes.StorageType.GPU_Global
# # Mark arrays as on the GPU
# if device == dtypes.DeviceType.GPU:
# for k, v in sdfg.arrays.items():
# if not v.transient and type(v) == dace.data.Array:
# v.storage = dace.dtypes.StorageType.GPU_Global

# Auto-optimize SDFG
opt.auto_optimize(auto_opt_sdfg, device, symbols=symbols)
opt.auto_optimize(auto_opt_sdfg, device, symbols=symbols, use_gpu_storage=True)

auto_opt_sdfg = copy.deepcopy(strict_sdfg)
auto_opt_sdfg._name = 'auto_opt'
Expand Down Expand Up @@ -229,9 +229,10 @@ def vectorize(sdfg, vec_len=None):
dace.Config.set('library', 'blas', 'default_implementation', value='cuBLAS')

def copy_to_gpu(sdfg):
for k, v in sdfg.arrays.items():
if not v.transient and isinstance(v, dace.data.Array):
v.storage = dace.dtypes.StorageType.GPU_Global
opt.apply_gpu_storage(sdfg)
# for k, v in sdfg.arrays.items():
# if not v.transient and isinstance(v, dace.data.Array):
# v.storage = dace.dtypes.StorageType.GPU_Global

if self.info["arch"] == "gpu":
import cupy as cp
Expand All @@ -242,9 +243,9 @@ def copy_to_gpu(sdfg):
fe_time = t
if sdfg._name != 'auto_opt':
device = dtypes.DeviceType.GPU if self.info["arch"] == "gpu" else dtypes.DeviceType.CPU
if self.info["arch"] == "cpu":
# GPUTransform will set GPU schedules by itself
opt.set_fast_implementations(sdfg, device)
# if self.info["arch"] == "cpu":
# # GPUTransform will set GPU schedules by itself
opt.set_fast_implementations(sdfg, device)
if self.info["arch"] == "gpu":
if sdfg._name in ['strict', 'parallel', 'fusion']:
_, gpu_time1 = util.benchmark("copy_to_gpu(sdfg)",
Expand Down
16 changes: 12 additions & 4 deletions npbench/infrastructure/framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,17 +91,25 @@ def args(self, bench: Benchmark, impl: Callable = None):
for a in bench.info["input_args"]
]

def out_args(self, bench: Benchmark, impl: Callable = None):
def mutable_args(self, bench: Benchmark, impl: Callable = None):
""" Generates the input/output arguments that should be copied during
the setup.
:param bench: A benchmark.
:param impl: A benchmark implementation.
"""

return ["__npb_{pr}_{a}".format(pr=self.info["prefix"], a=a) for a in bench.info["array_args"]]


# def params(self, bench: Benchmark, impl: Callable = None):
# return list(bench.info["input_params"])
def inout_args(self, bench: Benchmark, impl: Callable = None):
""" Generates the input/output arguments that should be checked during
validation.
:param bench: A benchmark.
:param impl: A benchmark implementation.
"""

return ["__npb_{pr}_{a}".format(pr=self.info["prefix"], a=a) for a in bench.info["output_args"]]


def arg_str(self, bench: Benchmark, impl: Callable = None):
""" Generates the argument-string that should be used for calling
Expand All @@ -119,7 +127,7 @@ def out_arg_str(self, bench: Benchmark, impl: Callable = None):
:param impl: A benchmark implementation.
"""

output_args = self.out_args(bench, impl)
output_args = self.mutable_args(bench, impl)
return ", ".join(output_args)

def setup_str(self, bench: Benchmark, impl: Callable = None):
Expand Down
7 changes: 5 additions & 2 deletions npbench/infrastructure/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,11 @@ def _execute(self, frmwrk: Framework, impl: Callable, impl_name: str, mode: str,
out = [out]
else:
out = []
if "out_args" in self.bench.info.keys():
out += [ldict[a] for a in self.frmwrk.args(self.bench)]
if "output_args" in self.bench.info.keys():
num_return_args = len(out)
num_output_args = len(self.bench.info["output_args"])
out += [ldict[a] for a in frmwrk.inout_args(self.bench)]
assert len(out) == num_return_args + num_output_args, "Number of output arguments does not match."
return out, timelist

def run(self, preset: str, validate: bool, repeat: int, timeout: float = 200.0, ignore_errors: bool = True):
Expand Down
14 changes: 9 additions & 5 deletions npbench/infrastructure/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,16 +134,20 @@ def inner(_it, _timer{init}):

def benchmark(stmt, setup="pass", out_text="", repeat=1, context={}, output=None, verbose=True):

timeit.template = timeit_tmpl.format(init='{init}', setup='{setup}', stmt='{stmt}', output=output)

ldict = {**context}
output = timeit.repeat(stmt, setup=setup, repeat=repeat, number=1, globals=ldict)
res = output[0][1]
raw_time_list = [a for a, _ in output]
raw_time_list = timeit.repeat(stmt, setup=setup, repeat=repeat, number=1, globals=ldict)
raw_time = np.median(raw_time_list)
ms_time = time_to_ms(raw_time)
if verbose:
print("{}: {}ms".format(out_text, ms_time))

if output is not None:
exec(setup, context)
exec(stmt, context)
res = context[output]
else:
res = None

return res, raw_time_list


Expand Down

0 comments on commit 1a9e6f8

Please sign in to comment.