Skip to content

Commit

Permalink
Merge pull request #20 from spcl/numba-validation-fixes
Browse files Browse the repository at this point in the history
Various fixes
  • Loading branch information
alexnick83 authored Feb 3, 2025
2 parents 0bc108b + 4eba4df commit 1a9e6f8
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 25 deletions.
3 changes: 2 additions & 1 deletion frameworks.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ The base `Framework` class (found in [`npbench/infrastructure/framework.py`](npb
- impl_files: Returns a list of the framework's implementation files for the input benchmark. Each element in the list is a tuple of the implementation filename and a description (e.g. `default` or `nopython-parallel`).
- implementations: Returns a list of the framework's implementations for the input benchmark. Each element in the list is a tuple of the implementation method and a description (as above).
- args: Returns a list with the names of the input arguments for running the input implementation of the input benchmark.
- out_args: Returns a list with the input arguments for running the input implementation of the input benchmark **and** have to be copied(for example, because they may be modified during benchmark execution).
- mutable_args: Returns a list with the input arguments for running the input implementation of the input benchmark that **also** have to be copied (for example, because they may be modified during benchmark execution).
- inout_args: Returns a list with the input arguments that are also output, i.e., they must be validated.
- arg_str: Returns the argument-string needed to call the input implementation of the input benchmark.
- out_arg_str: Returns the argument-string with the input arguments that must be copied.
- setup_str: Returns the setup-string of the code that should be executed for, e.g., copying data, before executing the benchmark implementation.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def kernel(A):

A[0, 0] = np.sqrt(A[0, 0])
for i in range(1, A.shape[0]):
for j in nb.prange(i):
for j in range(i):
A[i, j] -= np.dot(A[i, :j], A[j, :j])
A[i, j] /= A[j, j]
A[i, i] -= np.dot(A[i, :i], A[i, :i])
Expand Down
25 changes: 13 additions & 12 deletions npbench/infrastructure/dace_framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,14 +185,14 @@ def parallelize(sdfg):
try:

def autoopt(sdfg, device, symbols): #, nofuse):
# Mark arrays as on the GPU
if device == dtypes.DeviceType.GPU:
for k, v in sdfg.arrays.items():
if not v.transient and type(v) == dace.data.Array:
v.storage = dace.dtypes.StorageType.GPU_Global
# # Mark arrays as on the GPU
# if device == dtypes.DeviceType.GPU:
# for k, v in sdfg.arrays.items():
# if not v.transient and type(v) == dace.data.Array:
# v.storage = dace.dtypes.StorageType.GPU_Global

# Auto-optimize SDFG
opt.auto_optimize(auto_opt_sdfg, device, symbols=symbols)
opt.auto_optimize(auto_opt_sdfg, device, symbols=symbols, use_gpu_storage=True)

auto_opt_sdfg = copy.deepcopy(strict_sdfg)
auto_opt_sdfg._name = 'auto_opt'
Expand Down Expand Up @@ -229,9 +229,10 @@ def vectorize(sdfg, vec_len=None):
dace.Config.set('library', 'blas', 'default_implementation', value='cuBLAS')

def copy_to_gpu(sdfg):
for k, v in sdfg.arrays.items():
if not v.transient and isinstance(v, dace.data.Array):
v.storage = dace.dtypes.StorageType.GPU_Global
opt.apply_gpu_storage(sdfg)
# for k, v in sdfg.arrays.items():
# if not v.transient and isinstance(v, dace.data.Array):
# v.storage = dace.dtypes.StorageType.GPU_Global

if self.info["arch"] == "gpu":
import cupy as cp
Expand All @@ -242,9 +243,9 @@ def copy_to_gpu(sdfg):
fe_time = t
if sdfg._name != 'auto_opt':
device = dtypes.DeviceType.GPU if self.info["arch"] == "gpu" else dtypes.DeviceType.CPU
if self.info["arch"] == "cpu":
# GPUTransform will set GPU schedules by itself
opt.set_fast_implementations(sdfg, device)
# if self.info["arch"] == "cpu":
# # GPUTransform will set GPU schedules by itself
opt.set_fast_implementations(sdfg, device)
if self.info["arch"] == "gpu":
if sdfg._name in ['strict', 'parallel', 'fusion']:
_, gpu_time1 = util.benchmark("copy_to_gpu(sdfg)",
Expand Down
16 changes: 12 additions & 4 deletions npbench/infrastructure/framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,17 +91,25 @@ def args(self, bench: Benchmark, impl: Callable = None):
for a in bench.info["input_args"]
]

def out_args(self, bench: Benchmark, impl: Callable = None):
def mutable_args(self, bench: Benchmark, impl: Callable = None):
""" Generates the input/output arguments that should be copied during
the setup.
:param bench: A benchmark.
:param impl: A benchmark implementation.
"""

return ["__npb_{pr}_{a}".format(pr=self.info["prefix"], a=a) for a in bench.info["array_args"]]


# def params(self, bench: Benchmark, impl: Callable = None):
# return list(bench.info["input_params"])
def inout_args(self, bench: Benchmark, impl: Callable = None):
""" Generates the input/output arguments that should be checked during
validation.
:param bench: A benchmark.
:param impl: A benchmark implementation.
"""

return ["__npb_{pr}_{a}".format(pr=self.info["prefix"], a=a) for a in bench.info["output_args"]]


def arg_str(self, bench: Benchmark, impl: Callable = None):
""" Generates the argument-string that should be used for calling
Expand All @@ -119,7 +127,7 @@ def out_arg_str(self, bench: Benchmark, impl: Callable = None):
:param impl: A benchmark implementation.
"""

output_args = self.out_args(bench, impl)
output_args = self.mutable_args(bench, impl)
return ", ".join(output_args)

def setup_str(self, bench: Benchmark, impl: Callable = None):
Expand Down
7 changes: 5 additions & 2 deletions npbench/infrastructure/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,11 @@ def _execute(self, frmwrk: Framework, impl: Callable, impl_name: str, mode: str,
out = [out]
else:
out = []
if "out_args" in self.bench.info.keys():
out += [ldict[a] for a in self.frmwrk.args(self.bench)]
if "output_args" in self.bench.info.keys():
num_return_args = len(out)
num_output_args = len(self.bench.info["output_args"])
out += [ldict[a] for a in frmwrk.inout_args(self.bench)]
assert len(out) == num_return_args + num_output_args, "Number of output arguments does not match."
return out, timelist

def run(self, preset: str, validate: bool, repeat: int, timeout: float = 200.0, ignore_errors: bool = True):
Expand Down
14 changes: 9 additions & 5 deletions npbench/infrastructure/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,16 +134,20 @@ def inner(_it, _timer{init}):

def benchmark(stmt, setup="pass", out_text="", repeat=1, context={}, output=None, verbose=True):

timeit.template = timeit_tmpl.format(init='{init}', setup='{setup}', stmt='{stmt}', output=output)

ldict = {**context}
output = timeit.repeat(stmt, setup=setup, repeat=repeat, number=1, globals=ldict)
res = output[0][1]
raw_time_list = [a for a, _ in output]
raw_time_list = timeit.repeat(stmt, setup=setup, repeat=repeat, number=1, globals=ldict)
raw_time = np.median(raw_time_list)
ms_time = time_to_ms(raw_time)
if verbose:
print("{}: {}ms".format(out_text, ms_time))

if output is not None:
exec(setup, context)
exec(stmt, context)
res = context[output]
else:
res = None

return res, raw_time_list


Expand Down

0 comments on commit 1a9e6f8

Please sign in to comment.