CI for Benchmarking #34

Draft: wants to merge 36 commits into base branch dev from ci_benchmarking

Commits (36)
0aec7f0
Check-in core benchmarking code
fpjentzsch Jan 29, 2025
cc61f00
Pull in new Transformer flow
fpjentzsch Jan 29, 2025
47cb5ac
Fix imports
fpjentzsch Jan 29, 2025
cb71529
Fix imports
fpjentzsch Jan 29, 2025
7d8a5f1
Add convformer, workaround streamlining
fpjentzsch Jan 29, 2025
51a5fdf
Combine test and benchmark CI defs
fpjentzsch Jan 29, 2025
35e9425
Merge branch 'dev' into ci_benchmarking
fpjentzsch Jan 29, 2025
941984e
Refactor DUTs
fpjentzsch Jan 30, 2025
f6d196b
Fix bench class lookup
fpjentzsch Jan 30, 2025
c6ae70f
Fix cfgs
fpjentzsch Jan 30, 2025
3d4e7a6
Fix misc
fpjentzsch Jan 30, 2025
a5bd7ab
Unify fifosizing settings
fpjentzsch Jan 30, 2025
e6998fb
Use correct Singularity image for benchmarks
fpjentzsch Jan 30, 2025
be19a1a
Select Singularity image in child pipeline
fpjentzsch Jan 30, 2025
5db60fa
Fix img
fpjentzsch Jan 30, 2025
9817609
Try fix for Transformer streamlining
fpjentzsch Jan 31, 2025
dbeb3a0
Display .sif file name
fpjentzsch Jan 31, 2025
c4dbd34
Disable fifo reduction testing for big models
fpjentzsch Jan 31, 2025
349995f
Try 2nd streamlining fix
fpjentzsch Jan 31, 2025
139c624
ResNet disable inferdatalayouts
fpjentzsch Jan 31, 2025
358a2c6
Use dotenv artifact
fpjentzsch Jan 31, 2025
cda98a6
Try without optional
fpjentzsch Jan 31, 2025
91da4f5
Try optional again
fpjentzsch Jan 31, 2025
0b92591
Workaround optional artifact
fpjentzsch Jan 31, 2025
b7145aa
Revert RN-50 removal of inferdatalayouts
fpjentzsch Feb 2, 2025
503f73e
Sweep over fifosim n
fpjentzsch Feb 2, 2025
2c0903d
Log partial results in failure
fpjentzsch Feb 2, 2025
9d71a4a
Fix typo
fpjentzsch Feb 2, 2025
6c744f8
Fifo testcase extension
fpjentzsch Feb 4, 2025
b17cc23
Missing change from merge branch
fpjentzsch Feb 5, 2025
b950895
Merge branch 'dev' into ci_benchmarking
fpjentzsch Feb 5, 2025
20274be
Merge branch 'dev' into ci_benchmarking
fpjentzsch Feb 6, 2025
7956a58
Increase stack size, NUM_WORKERS
fpjentzsch Feb 6, 2025
76a780b
Adapt transformer flow to new FINN+ dev
fpjentzsch Feb 6, 2025
81d2b3b
Merge branch 'dev' into ci_benchmarking
fpjentzsch Feb 6, 2025
e1671b2
Enable Transformer benchmarks
fpjentzsch Feb 6, 2025
47 changes: 47 additions & 0 deletions .gitlab-ci.yml
@@ -178,3 +178,50 @@ FINN Test Suite 2024.1:
extends: FINN Test Suite 2022.2
variables:
FINN_XILINX_VERSION: "2024.1"

Bench (Manual):
stage: test
rules:
# Do not run on a schedule
- if: $CI_PIPELINE_SOURCE == "schedule"
when: never
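# Trigger the manual benchmark child pipeline only when a config path is provided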
- if: $MANUAL_CFG_PATH != ""
trigger:
include: benchmarking/bench-ci.yml
strategy: depend
forward:
pipeline_variables: true
variables:
PARENT_PIPELINE_ID: $CI_PIPELINE_ID
BENCH_CFG: "manual"

Bench:
stage: test
rules:
# Do not run on a schedule
- if: $CI_PIPELINE_SOURCE == "schedule"
when: never
- if: $MANUAL_CFG_PATH == ""
trigger:
include: benchmarking/bench-ci.yml
strategy: depend
forward:
pipeline_variables: true
variables:
PARENT_PIPELINE_ID: $CI_PIPELINE_ID
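# Spawn one child benchmark pipeline per BENCH_CFG listed in the matrix below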
parallel:
matrix:
- BENCH_CFG: [mvau_test, resnet50_test, metafi_test, transformer_test, transformer_radioml_all]

#dev: mvau_test
#fifo: fifosizing_test, metafi_fifosizing_test, resnet50_fifosizing_test
#transformer: transformer_test, transformer_radioml_all

#TODO: add selector for none, reduced, full benchmark suite

#TODO: introduce result collect job on parent level for easier visualization/excel interfacing
#TODO: more control via (optional) variables
#TODO: move power measurement from polling-based script to its own job/runner
#TODO: ensure a freshly initialized workdir at the job/runner level (directories created by previous runs appear to persist)
#TODO: (optionally) save ALL build artifacts/logs/temporary files to artifacts or PFS for debugging (maybe via the Jacamar feature for setting individual persistent workdirs?)
#TODO: fix clock frequency discrepancies between setting, synth, and driver
58 changes: 58 additions & 0 deletions benchmarking/bench-ci.yml
@@ -0,0 +1,58 @@
stages:
- synth
- measure
- collect

variables:
BENCH_CFG:
description: "Select config, usually provided by parent pipeline"
value: ""

workflow:
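# Name the child pipeline after the selected benchmark config (e.g. bench_mvau_test)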
name: "bench_$BENCH_CFG"

FINN Build:
id_tokens:
CI_JOB_JWT:
aud: https://git.uni-paderborn.de
stage: synth
needs:
- job: Fetch Repos
pipeline: $PARENT_PIPELINE_ID
variables:
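# Submit as a Slurm job array with one task per parallel benchmark job (array indices 0 .. PARALLEL_JOBS-1)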
SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )"
NUM_DEFAULT_WORKERS: "$CPU_CORES"
PYTEST_PARALLEL: "$CPU_CORES"
before_script:
- cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. RAMdisk)
- cd $PATH_WORKDIR/finn-plus
- module load system singularity
- ulimit -s unlimited # Increase stack size limit
- export FINN_SINGULARITY=$PATH_SINGULARITY_IMG/finn-plus/$SINGULARITY_IMG_SELECT
script:
- ./run-docker.sh python benchmarking/bench.py $BENCH_CFG
cache:
key: $CI_COMMIT_SHA
policy: pull
paths:
- deps
artifacts:
name: "bench_artifacts"
when: always
paths:
- bench_artifacts/

Result Collection:
id_tokens:
CI_JOB_JWT:
aud: https://git.uni-paderborn.de
stage: collect
tags:
- image_build
script:
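# Aggregate the per-task JSON logs from bench_artifacts/tasks_output into a single results file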
- python benchmarking/collect.py bench_artifacts/tasks_output bench_results.json
artifacts:
name: "bench_results"
when: always
paths:
- bench_results.json
180 changes: 180 additions & 0 deletions benchmarking/bench.py
@@ -0,0 +1,180 @@
import itertools
import sys
import os
import json
import time
import traceback
import onnxruntime as ort

from dut.mvau import bench_mvau
from dut.resnet50 import bench_resnet50
from dut.metafi import bench_metafi
from dut.synthetic_nonlinear import bench_synthetic_nonlinear

dut = dict()
dut["mvau"] = bench_mvau
dut["resnet50"] = bench_resnet50
dut["metafi"] = bench_metafi
dut["synthetic_nonlinear"] = bench_synthetic_nonlinear

# TODO: remove guard once transformer support has been fully merged
try:
from dut.transformer import bench_transformer
dut["transformer"] = bench_transformer
except ImportError:
pass

def main(config_name):
exit_code = 0
# Attempt to work around onnxruntime issue on Slurm-managed clusters:
# See https://github.com/microsoft/onnxruntime/issues/8313
# This seems to happen only when assigned CPU cores are not contiguous
_default_session_options = ort.capi._pybind_state.get_default_session_options()
def get_default_session_options_new():
_default_session_options.inter_op_num_threads = 1
_default_session_options.intra_op_num_threads = 1
return _default_session_options
ort.capi._pybind_state.get_default_session_options = get_default_session_options_new

# Gather job array info
job_id = int(os.environ["SLURM_JOB_ID"])
#TODO: allow portable execution on any platform by making as many env vars as possible optional
print("Job launched with ID: %d" % (job_id))
try:
array_id = int(os.environ["SLURM_ARRAY_JOB_ID"])
task_id = int(os.environ["SLURM_ARRAY_TASK_ID"])
task_count = int(os.environ["SLURM_ARRAY_TASK_COUNT"])
print(
"Launched as job array (Array ID: %d, Task ID: %d, Task count: %d)"
% (array_id, task_id, task_count)
)
except KeyError:
array_id = job_id
task_id = 0
task_count = 1
print("Launched as single job")

# Prepare result directory
# experiment_dir = os.environ.get("EXPERIMENT_DIR") # original experiment dir (before potential copy to ramdisk)
experiment_dir = os.environ.get("CI_PROJECT_DIR")

artifacts_dir = os.path.join(experiment_dir, "bench_artifacts")
print("Collecting results in path: %s" % artifacts_dir)
os.makedirs(os.path.join(artifacts_dir, "tasks_output"), exist_ok=True)
log_path = os.path.join(artifacts_dir, "tasks_output", "task_%d.json" % (task_id))

# local save dir for large artifacts (e.g., build output, tmp dir dump for debugging)
if job_id == 0:
#DEBUG mode
save_dir = experiment_dir + "_save"
else:
save_dir = os.path.join(os.environ.get("LOCAL_ARTIFACT_DIR"),
"CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + os.environ.get("CI_PIPELINE_NAME"))
print("Saving additional artifacts in path: %s" % save_dir)
os.makedirs(save_dir, exist_ok=True)

# Gather benchmarking configs
if config_name == "manual":
configs_path, config_select = os.path.split(os.environ.get("MANUAL_CFG_PATH"))
else:
configs_path = os.path.join(os.path.dirname(__file__), "cfg")
config_select = config_name + ".json"

# Load config
config_path = os.path.join(configs_path, config_select)
print("Loading config %s" % (config_path))
if os.path.exists(config_path):
with open(config_path, "r") as f:
config = json.load(f)
else:
print("ERROR: config file not found")
return 1

# Expand all specified config combinations (gridsearch)
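# e.g. a param_set of {"dut": ["mvau"], "clk_ns": [5, 10]} ("clk_ns" is only an illustrative key)
# expands to [{"dut": "mvau", "clk_ns": 5}, {"dut": "mvau", "clk_ns": 10}]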
config_expanded = []
for param_set in config:
param_set_expanded = list(
dict(zip(param_set.keys(), x)) for x in itertools.product(*param_set.values())
)
config_expanded.extend(param_set_expanded)

# Save config (only first job of array) for logging purposes
if task_id == 0:
with open(os.path.join(artifacts_dir, "bench_config.json"), "w") as f:
json.dump(config, f, indent=2)
with open(os.path.join(artifacts_dir, "bench_config_exp.json"), "w") as f:
json.dump(config_expanded, f, indent=2)

# Determine which runs this job will work on
total_runs = len(config_expanded)
if total_runs <= task_count:
if task_id < total_runs:
selected_runs = [task_id]
else:
return
else:
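# More runs than array tasks: assign runs round-robin, i.e. task k handles
# runs k, k + task_count, k + 2 * task_count, ...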
selected_runs = []
idx = task_id
while idx < total_runs:
selected_runs.append(idx)
idx = idx + task_count
print("This job will perform %d out of %d total runs" % (len(selected_runs), total_runs))

# Run benchmark
# TODO: integrate this loop (especially status logging) into the bench class
# TODO: log additional info as artifact or directly into info section of json (e.g. dut, versions, date)
# TODO: log stdout of individual tasks of the job array into separate files as artifacts (the combined output in the GitLab web interface is not readable)
log = []
for run, run_id in enumerate(selected_runs):
print(
"Starting run %d/%d (id %d of %d total runs)"
% (run + 1, len(selected_runs), run_id, total_runs)
)

params = config_expanded[run_id]
print("Run parameters: %s" % (str(params)))

log_dict = {"run_id": run_id, "task_id": task_id, "params": params}

# Create bench object for respective DUT
if "dut" in params:
if params["dut"] in dut:
bench_object = dut[params["dut"]](params, task_id, run_id, artifacts_dir, save_dir)
else:
print("ERROR: unknown DUT specified")
return 1
else:
print("ERROR: no DUT specified")
return 1

start_time = time.time()
try:
bench_object.run()
if not bench_object.output_dict:
log_dict["status"] = "skipped"
print("Run skipped")
else:
log_dict["status"] = "ok"
print("Run completed")
except Exception:
log_dict["status"] = "failed"
print("Run failed: " + traceback.format_exc())
exit_code = 1
# TODO: the catch-all exception handler in the builder prevents internal failures from being caught here

log_dict["total_time"] = int(time.time() - start_time)
log_dict["output"] = bench_object.output_dict
log.append(log_dict)
# overwrite output log file every time to allow early abort
with open(log_path, "w") as f:
json.dump(log, f, indent=2)

# save local artifacts of this run (e.g., detailed debug info)
bench_object.save_local_artifacts_collection()
print("Stopping job")
return exit_code
#TODO: add additional exit codes (e.g. when some verification within the run failed)?

if __name__ == "__main__":
exit_code = main(sys.argv[1])
sys.exit(exit_code)