Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add raja perf benchmarks #2

Open
wants to merge 21 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "RAJAPerf"]
path = benchmarks/RAJAPerf
url = https://github.com/Olympus-HPC/RAJAPerf.git
136 changes: 100 additions & 36 deletions benchmarks.toml
Original file line number Diff line number Diff line change
@@ -1,65 +1,129 @@
[build]
[build.nvidia]
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(not for this PR) @johnbowen42 can we have build variants too? (e.g., build.nvidia.aot)

command = ["make"]
[build.nvidia.clean]
command = "make clean"
[build.amd]
command = ["make"]
[build.amd.clean]
command = "make clean"

[adam]
[adam.nvidia]
aot = "benchmarks/hecbench/cuda/adam"
proteus = "benchmarks/hecbench/cuda/adam"
jitify = "benchmarks/hecbench/cuda-jitify/adam"
[adam.amd]
aot = "benchmarks/hecbench/hip/adam"
proteus = "benchmarks/hecbench/hip/adam"
[adam.nvidia.aot]
path = "benchmarks/hecbench/cuda/adam"
exe = "adam-aot.x"
[adam.nvidia.proteus]
path = "benchmarks/hecbench/cuda/adam"
exe = "adam-proteus.x"
[adam.nvidia.jitify]
path = "benchmarks/hecbench/cuda-jitify/adam"
exe = "adam-jitify.x"
[adam.amd.aot]
path = "benchmarks/hecbench/hip/adam"
exe = "adam-aot.x"
[adam.amd.proteus]
path = "benchmarks/hecbench/hip/adam"
exe = "adam-proteus.x"
[adam.inputs]
default = "160000 1600 1000"

[feynman-kac]
[feynman-kac.nvidia]
aot = "benchmarks/hecbench/cuda/feynman-kac"
proteus = "benchmarks/hecbench/cuda/feynman-kac"
jitify = "benchmarks/hecbench/cuda-jitify/feynman-kac"
[feynman-kac.amd]
aot = "benchmarks/hecbench/hip/feynman-kac"
proteus = "benchmarks/hecbench/hip/feynman-kac"
[feynman-kac.nvidia.aot]
path = "benchmarks/hecbench/cuda/feynman-kac"
exe = "feynman-kac-aot.x"
[feynman-kac.nvidia.proteus]
path = "benchmarks/hecbench/cuda/feynman-kac"
exe = "feynman-kac-proteus.x"
[feynman-kac.nvidia.jitify]
path = "benchmarks/hecbench/cuda-jitify/feynman-kac"
exe = "feynman-kac-jitify.x"
[feynman-kac.amd.aot]
path = "benchmarks/hecbench/hip/feynman-kac"
exe = "feynman-kac-aot.x"
[feynman-kac.amd.proteus]
path = "benchmarks/hecbench/hip/feynman-kac"
exe = "feynman-kac-proteus.x"
[feynman-kac.inputs]
default = "1"

[lulesh]
[lulesh.nvidia]
aot = "benchmarks/hecbench/cuda/LULESH"
proteus = "benchmarks/hecbench/cuda/LULESH"
jitify = "benchmarks/hecbench/cuda-jitify/LULESH"
[lulesh.amd]
aot = "benchmarks/hecbench/hip/LULESH"
proteus = "benchmarks/hecbench/hip/LULESH"
[lulesh.nvidia.aot]
path = "benchmarks/hecbench/cuda/LULESH"
exe = "lulesh-aot.x"
[lulesh.nvidia.proteus]
path = "benchmarks/hecbench/cuda/LULESH"
exe = "lulesh-proteus.x"
[lulesh.nvidia.jitify]
path = "benchmarks/hecbench/cuda-jitify/LULESH"
exe = "lulesh-jitify.x"
[lulesh.amd.aot]
path = "benchmarks/hecbench/hip/LULESH"
exe = "lulesh-aot.x"
[lulesh.amd.proteus]
path = "benchmarks/hecbench/hip/LULESH"
exe = "lulesh-proteus.x"
[lulesh.inputs]
default = "-s 128 -i 1000"

[rsbench]
[rsbench.nvidia]
aot = "benchmarks/hecbench/cuda/rsbench"
proteus = "benchmarks/hecbench/cuda/rsbench"
jitify = "benchmarks/hecbench/cuda-jitify/rsbench"
[rsbench.amd]
aot = "benchmarks/hecbench/hip/rsbench"
proteus = "benchmarks/hecbench/hip/rsbench"
[rsbench.nvidia.aot]
path = "benchmarks/hecbench/cuda/rsbench"
exe = "rsbench-aot.x"
[rsbench.nvidia.proteus]
path = "benchmarks/hecbench/cuda/rsbench"
exe = "rsbench-proteus.x"
[rsbench.nvidia.jitify]
path = "benchmarks/hecbench/cuda-jitify/rsbench"
exe = "rsbench-jitify.x"
[rsbench.amd.aot]
path = "benchmarks/hecbench/hip/rsbench"
exe = "rsbench-aot.x"
[rsbench.amd.proteus]
path = "benchmarks/hecbench/hip/rsbench"
exe = "rsbench-proteus.x"
[rsbench.inputs]
default = "-s large -m event"

[sw4ck]
[sw4ck.nvidia]
aot = "benchmarks/hecbench/cuda/sw4ck"
proteus = "benchmarks/hecbench/cuda/sw4ck"
jitify = "benchmarks/hecbench/cuda-jitify/sw4ck"
[sw4ck.amd]
aot = "benchmarks/hecbench/hip/sw4ck"
proteus = "benchmarks/hecbench/hip/sw4ck"
[sw4ck.nvidia.aot]
path = "benchmarks/hecbench/cuda/sw4ck"
exe = "sw4ck-aot.x"
[sw4ck.nvidia.proteus]
path = "benchmarks/hecbench/cuda/sw4ck"
exe = "sw4ck-proteus.x"
[sw4ck.nvidia.jitify]
path = "benchmarks/hecbench/cuda-jitify/sw4ck"
exe = "sw4ck-jitify.x"
[sw4ck.amd.aot]
path = "benchmarks/hecbench/hip/sw4ck"
exe = "sw4ck-aot.x"
[sw4ck.amd.proteus]
path = "benchmarks/hecbench/hip/sw4ck"
exe = "sw4ck-proteus.x"
[sw4ck.inputs]
default = "sw4ck.in 100"

[wsm5]
[wsm5.nvidia]
aot = "benchmarks/hecbench/cuda/wsm5"
proteus = "benchmarks/hecbench/cuda/wsm5"
jitify = "benchmarks/hecbench/cuda-jitify/wsm5"
[wsm5.amd]
aot = "benchmarks/hecbench/hip/wsm5"
proteus = "benchmarks/hecbench/hip/wsm5"
[wsm5.nvidia.aot]
path = "benchmarks/hecbench/cuda/wsm5"
exe = "wsm5-aot.x"
[wsm5.nvidia.proteus]
path = "benchmarks/hecbench/cuda/wsm5"
exe = "wsm5-proteus.x"
[wsm5.nvidia.jitify]
path = "benchmarks/hecbench/cuda-jitify/wsm5"
exe = "wsm5-jitify.x"
[wsm5.amd.aot]
path = "benchmarks/hecbench/hip/wsm5"
exe = "wsm5-aot.x"
[wsm5.amd.proteus]
path = "benchmarks/hecbench/hip/wsm5"
exe = "wsm5-proteus.x"
[wsm5.inputs]
default = "10"
1 change: 1 addition & 0 deletions benchmarks/RAJAPerf
Submodule RAJAPerf added at d185b0
91 changes: 74 additions & 17 deletions driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,21 @@
import tomllib


def demangle(potentially_mangled_name):
    """Demangle a C++ symbol name using the llvm-cxxfilt tool.

    Replaces the cxxfilt Python package: the name is passed straight
    through to llvm-cxxfilt, which echoes it back unchanged if it is not
    actually mangled.

    Args:
        potentially_mangled_name: Symbol name, possibly Itanium-mangled.

    Returns:
        The demangled name as a string, without the trailing newline
        that llvm-cxxfilt appends to its output.

    Raises:
        subprocess.CalledProcessError: if llvm-cxxfilt exits non-zero
            (diagnostics are printed before re-raising).
        FileNotFoundError: if llvm-cxxfilt is not on PATH.
    """
    try:
        # Pass the name as an argv element with shell=False: mangled C++
        # names can contain shell metacharacters ($, parentheses, ...),
        # so interpolating them into a shell string is unsafe and fragile.
        p = subprocess.run(
            ["llvm-cxxfilt", potentially_mangled_name],
            check=True,
            text=True,
            capture_output=True,
        )
    except subprocess.CalledProcessError as e:
        print("Failed cmd", e.cmd)
        print("ret", e.returncode)
        print("stdout\n", e.stdout)
        print("stderr\n", e.stderr)
        print(e)
        raise

    # llvm-cxxfilt terminates its output with a newline; strip it so the
    # result matches what cxxfilt.demangle used to return (callers store
    # this directly in DataFrame Name/Hash columns).
    return p.stdout.strip()

class ProteusConfig:
def check_valid(self, key, values):
if key not in self.valid_keys:
Expand Down Expand Up @@ -91,7 +106,7 @@ def parse(self, fn):
def get_hash(x):
try:
hash_pos = 2
return cxxfilt.demangle(x.split("$")[hash_pos])
return demangle(x.split("$")[hash_pos])
except IndexError:
return None

Expand All @@ -101,7 +116,7 @@ def get_hash(x):
df["Duration"] = df["EndNs"] - df["BeginNs"]
df["Name"] = df["Name"].str.replace(" [clone .kd]", "", regex=False)
df["Hash"] = df.Name.apply(lambda x: get_hash(x))
df["Name"] = df.Name.apply(lambda x: cxxfilt.demangle(x.split("$")[0]))
df["Name"] = df.Name.apply(lambda x: demangle(x.split("$")[0]))
return df


Expand All @@ -120,7 +135,7 @@ def parse(self, fn):
def get_hash(x):
try:
hash_pos = 2
return cxxfilt.demangle(x.split("$")[hash_pos])
return demangle(x.split("$")[hash_pos])
except IndexError:
return None

Expand All @@ -132,24 +147,35 @@ def get_hash(x):
df = df[1:]
# Nvprof with metrics tracks only kernels.
if self.metrics:
df["Kernel"] = df.Kernel.apply(lambda x: cxxfilt.demangle(x.split("$")[0]))
df["Kernel"] = df.Kernel.apply(lambda x: demangle(x.split("$")[0]))
df.rename(columns={"Kernel": "Name"}, inplace=True)
else:
df["Hash"] = df.Name.apply(lambda x: get_hash(x))
df["Name"] = df.Name.apply(lambda x: cxxfilt.demangle(x.split("$")[0]))
df["Name"] = df.Name.apply(lambda x: demangle(x.split("$")[0]))

return df


class Executor:
def __init__(self, benchmark, path, exemode, inputs, cc, proteus_path, env_configs):
def __init__(self, benchmark, path, executable_name, extra_args, exemode,
build_command, clean_command, inputs, cc, proteus_path, env_configs,
build_once, already_built):
self.benchmark = benchmark
self.path = path
self.executable_name = executable_name
self.extra_args = extra_args
self.exemode = exemode
# the build command is meant to be a full bash command to build the benchmark, eg
# `cmake -DCMAKE_BUILD_TYPE=Debug --build` or `make benchmark`
# If none is provided, it will default to `make`
self.build_command = 'make' if build_command == None else build_command
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove the default. User should define explicitly. We should use the TOML dictionary hierarchically, so user can provide a build command that applies to a group of programs.

self.clean_command = clean_command
self.inputs = inputs
self.cc = cc
self.proteus_path = proteus_path
self.env_configs = env_configs
self.build_once = build_once
self.already_built = already_built

def __str__(self):
return f"{self.benchmark} {self.path} {self.exemode}"
Expand Down Expand Up @@ -178,25 +204,30 @@ def execute_command(self, cmd, **kwargs):

def clean(self):
os.chdir(self.path)
cmd = "make clean"
self.execute_command(cmd)
if self.clean_command is not None:
self.execute_command(self.clean_command)

def build(self, do_jit):
os.chdir(self.path)
cmd = "make"
env = os.environ.copy()
env["ENABLE_PROTEUS"] = "yes" if do_jit else "no"
env["PROTEUS_PATH"] = self.proteus_path
env["CC"] = self.cc
if self.build_once and self.already_built:
print(self.benchmark)
return 0
t1 = time.perf_counter()
print(
"Build command",
cmd,
self.build_command,
"CC=" + env["CC"],
"PROTEUS_PATH=" + env["PROTEUS_PATH"],
"ENABLE_PROTEUS=" + env["ENABLE_PROTEUS"],
)
self.execute_command(cmd, env=env)
if not isinstance(self.build_command, list):
self.build_command = [self.build_command]
for cmd in self.build_command:
self.execute_command(cmd, env=env)
t2 = time.perf_counter()
return t2 - t1

Expand All @@ -211,12 +242,10 @@ def build_and_run(self, reps, profiler=None):
or self.exemode == "jitify"
), "Expected aot or proteus or jitify for exemode"

exe = f"{self.benchmark}-{self.exemode}.x"
self.clean()
print("BUILD", self.path, "type", self.exemode)

ctime = self.build(self.exemode != "aot")
exe_size = Path(f"{self.path}/{exe}").stat().st_size
exe_size = Path(f"{self.path}/{self.executable_name}").stat().st_size
print("=> BUILT")

for repeat in range(0, reps):
Expand All @@ -225,7 +254,7 @@ def build_and_run(self, reps, profiler=None):
cmd_env = os.environ.copy()
for k, v in env.items():
cmd_env[k] = v
cmd = f"./{exe} {args}"
cmd = f"{self.executable_name} {args} {self.extra_args}"

set_launch_bounds = (
False if env["ENV_PROTEUS_SET_LAUNCH_BOUNDS"] == "0" else True
Expand Down Expand Up @@ -461,21 +490,49 @@ def main():
env_configs = JitifyConfig().get_env_configs()
else:
raise Exception(f"Invalid exemode {args.exemode}")

proteus_install = args.proteus_path
assert os.path.exists(proteus_install), f"Error: Proteus install path '{proteus_install}' does not exist!"
for env in env_configs:
env["PROTEUS_INSTALL_PATH"] = proteus_install
Comment on lines +495 to +496
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where is that used?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

build instructions for RAJA perf

experiments = []
build_command = None
build_once = False
already_built = False
if "build" in benchmark_configs and "build_once" in benchmark_configs["build"]:
build_once = True

# custom toml wide level build command specified
if "build" in benchmark_configs and "command" in benchmark_configs["build"][args.machine]:
build_command = benchmark_configs["build"][args.machine]["command"]
else:
raise Exception(
"Build instructions must be supplied on a toml-wide level"
)

for benchmark in args.bench if args.bench else benchmark_configs:
if benchmark == "build":
continue
config = benchmark_configs[benchmark]
extra_args = config[args.machine][args.exemode]["args"] if "args" in config[args.machine][args.exemode] else ""
clean_command = benchmark_configs["build"]["clean"] if "build" in benchmark_configs and "clean" in benchmark_configs["build"] else None
experiments.append(
Executor(
benchmark,
Path.cwd() / Path(config[args.machine][args.exemode]),
Path.cwd() / Path(config[args.machine][args.exemode]["path"]),
Path(config[args.machine][args.exemode]["exe"]),
extra_args,
args.exemode,
build_command,
clean_command,
config["inputs"],
args.compiler,
args.proteus_path,
env_configs,
build_once,
already_built
)
)
already_built = True

def gather_profiler_results(metrics):
if args.machine == "amd":
Expand Down
Loading