diff --git a/.github/workflows/sycl-benchmark-aggregate.yml b/.github/workflows/sycl-benchmark-aggregate.yml new file mode 100644 index 0000000000000..feaf803faa245 --- /dev/null +++ b/.github/workflows/sycl-benchmark-aggregate.yml @@ -0,0 +1,108 @@ +name: Aggregate compute-benchmark averages from historical data + +# The benchmarking workflow in sycl-linux-run-tests.yml passes or fails based on +# how the benchmark results compare to a historical average: This historical +# average is calculated in this workflow, which aggregates historical data and +# produces measures of central tendency (median in this case) used for this +# purpose. + +on: + workflow_dispatch: + inputs: + cutoff_timestamp: + description: | + Timestamp indicating the age limit of data used in average calculation: + Any benchmark results created before this timestamp is excluded from + being aggregated. + + Any valid date string supported by GNU coreutils is valid here: + https://www.gnu.org/software/coreutils/manual/html_node/Date-input-formats.html + type: string + required: false + workflow_call: + inputs: + cutoff_timestamp: + type: string + required: false + +permissions: + contents: read + +jobs: + aggregate: + name: Aggregate average (median) value for all metrics + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + path: llvm + sparse-checkout: | + devops/scripts/benchmarking + devops/benchmarking + - name: Load benchmarking configuration + run: | + CONFIG_FILE="$PWD/llvm/devops/benchmarking/benchmark-ci.conf" + + # Load default values from configuration file + . "$PWD/llvm/devops/scripts/benchmarking/utils.sh" + # utils.sh contains functions to sanitize config file settings + load_config_constants "$CONFIG_FILE" + echo "PERF_RES_GIT_REPO=$PERF_RES_GIT_REPO" >> $GITHUB_ENV + echo "PERF_RES_BRANCH=$PERF_RES_BRANCH" >> $GITHUB_ENV + echo "PERF_RES_PATH=$PERF_RES_PATH" >> $GITHUB_ENV + + # Determine a "cutoff timestamp" used by the aggregator script + # + # This timestamp controls which historical results are used to compute + # measures of central tendency: Any files timestamped *before* this time + # will be *excluded* from the central tendency calculation. + + echo "TIMESTAMP_FORMAT=$TIMESTAMP_FORMAT" >> $GITHUB_ENV + if [ -z '${{ inputs.cutoff_timestamp }}' ]; then + # No time given, use default time period from config file: + echo "CUTOFF_TIMESTAMP=$(date --date="$AVERAGE_CUTOFF_RANGE" +"$TIMESTAMP_FORMAT")" >> $GITHUB_ENV + else + # If the provided time is a valid GNU coreutils date string, convert + # the time to our format: + _converted_timestamp="$(date --date '${{ inputs.cutoff_timestamp }}' +"$TIMESTAMP_FORMAT" 2> /dev/null)" + if [ -n "$_converted_timestamp" ]; then + echo "CUTOFF_TIMESTAMP=$_converted_timestamp" >> $GITHUB_ENV + else + # If not a valid GNU date string, it could be in our timestamp format already. + # aggregate.py will ensure the timestamp is in the proper format, so we can pass the + # time forward regardless: + echo 'CUTOFF_TIMESTAMP=${{ inputs.cutoff_timestamp }}' >> $GITHUB_ENV + fi + fi + - name: Checkout historical performance results repository + run: | + git clone -b $PERF_RES_BRANCH https://github.com/$PERF_RES_GIT_REPO $PERF_RES_PATH + - name: Run aggregator on historical results + run: | + # The current format of the historical results respository is: + # + # /// + # + # Thus, a min/max depth of 3 is used to enumerate all test cases in the + # repository. Test name is also derived from here. 
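+        # For example (hypothetical runner name), a test-case directory
+        # might look like:
+        #
+        #   ./llvm-ci-perf-res/level_zero-gpu/my_gen12_runner/api_overhead_benchmark_sycl
+        #
+        # where the basename, api_overhead_benchmark_sycl, is the test name
+        # passed to aggregate.py below.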
+ for dir in $(find "$PERF_RES_PATH" -mindepth 3 -maxdepth 3 -type d ! -path '*.git*'); do + test_name="$(basename $dir)" + python llvm/devops/scripts/benchmarking/aggregate.py "$test_name" "$dir" "$CUTOFF_TIMESTAMP" + done + - name: Upload average to the repo + env: + GITHUB_TOKEN: ${{ secrets.LLVM_SYCL_BENCHMARK_TOKEN }} + run: | + # TODO -- waiting on security clearance + cd "$PERF_RES_PATH" + git config user.name "SYCL Benchmarking Bot" + git config user.email "sys_sycl_benchmarks@intel.com" + git add . + git commit -m "[GHA] Aggregate median data from $CUTOFF_TIMESTAMP to $(date +"$TIMESTAMP_FORMAT")" + git push "https://$GITHUB_TOKEN@github.com/$PERF_RES_GIT_REPO.git" "$PERF_RES_BRANCH" + - name: Archive new medians + if: always() + uses: actions/upload-artifact@v4 + with: + name: llvm-ci-perf-results new medians + path: ${{ env.PERF_RES_PATH }}/**/*-median.csv \ No newline at end of file diff --git a/.github/workflows/sycl-linux-run-tests.yml b/.github/workflows/sycl-linux-run-tests.yml index b3b4f62e370db..1869e435afb79 100644 --- a/.github/workflows/sycl-linux-run-tests.yml +++ b/.github/workflows/sycl-linux-run-tests.yml @@ -25,7 +25,7 @@ on: required: False tests_selector: description: | - Two possible options: "e2e" and "cts". + Three possible options: "e2e", "cts", and "benchmark". type: string default: "e2e" @@ -150,6 +150,7 @@ on: options: - e2e - cts + - benchmark env: description: | @@ -314,3 +315,11 @@ jobs: sycl_cts_artifact: ${{ inputs.sycl_cts_artifact }} target_devices: ${{ inputs.target_devices }} retention-days: ${{ inputs.retention-days }} + + - name: Run compute-benchmarks on SYCL + if: inputs.tests_selector == 'benchmark' + uses: ./devops/actions/run-tests/benchmark + with: + target_devices: ${{ inputs.target_devices }} + env: + GITHUB_TOKEN: ${{ secrets.LLVM_SYCL_BENCHMARK_TOKEN }} diff --git a/.github/workflows/sycl-nightly.yml b/.github/workflows/sycl-nightly.yml index 5485719d60141..569b90d277eb0 100644 --- a/.github/workflows/sycl-nightly.yml +++ b/.github/workflows/sycl-nightly.yml @@ -238,6 +238,38 @@ jobs: sycl_toolchain_decompress_command: ${{ needs.ubuntu2204_build.outputs.artifact_decompress_command }} sycl_cts_artifact: sycl_cts_bin + aggregate_benchmark_results: + if: always() && !cancelled() + name: Aggregate benchmark results and produce historical averages + uses: ./.github/workflows/sycl-benchmark-aggregate.yml + + run-sycl-benchmarks: + needs: [ubuntu2204_build, aggregate_benchmark_results] + if: ${{ always() && !cancelled() && needs.ubuntu2204_build.outputs.build_conclusion == 'success' }} + strategy: + fail-fast: false + matrix: + include: + - name: Run compute-benchmarks on L0 Gen12 + runner: '["Linux", "gen12"]' + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest + image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN + target_devices: level_zero:gpu + reset_intel_gpu: true + uses: ./.github/workflows/sycl-linux-run-tests.yml + with: + name: ${{ matrix.name }} + runner: ${{ matrix.runner }} + image: ${{ matrix.image }} + image_options: ${{ matrix.image_options }} + target_devices: ${{ matrix.target_devices }} + tests_selector: benchmark + reset_intel_gpu: ${{ matrix.reset_intel_gpu }} + ref: ${{ github.sha }} + sycl_toolchain_artifact: sycl_linux_default + sycl_toolchain_archive: ${{ needs.ubuntu2204_build.outputs.artifact_archive_name }} + sycl_toolchain_decompress_command: ${{ needs.ubuntu2204_build.outputs.artifact_decompress_command }} + nightly_build_upload: name: Nightly 
Build Upload if: ${{ github.ref_name == 'sycl' }} diff --git a/devops/actions/run-tests/benchmark/action.yml b/devops/actions/run-tests/benchmark/action.yml new file mode 100644 index 0000000000000..4dd4ca66d765f --- /dev/null +++ b/devops/actions/run-tests/benchmark/action.yml @@ -0,0 +1,63 @@ +name: 'Run compute-benchmarks' + +# Run compute-benchmarks on SYCL +# +# This action assumes SYCL is in $PWD/toolchain, and that /devops has been +# checked out in $PWD/devops. This action also assumes that GITHUB_TOKEN +# was properly set in env, because according to Github, that's apparently the +# recommended way to pass a secret into a github action: +# +# https://docs.github.com/en/actions/security-for-github-actions/security-guides/using-secrets-in-github-actions#accessing-your-secrets +# + +inputs: + target_devices: + required: true + +runs: + using: "composite" + steps: + - name: Run compute-benchmarks + shell: bash + run: | + cat << EOF + # + # NOTE TO DEVELOPERS: + # + + Check latter steps of the workflow: This job produces an artifact with: + - benchmark results from passing/failing tests + - log containing all failing (too slow) benchmarks + - log containing all erroring benchmarks + + While this step in the workflow provides debugging output describing this + information, it might be easier to inspect the logs from the artifact + instead. + + EOF + export ONEAPI_DEVICE_SELECTOR="${{ inputs.target_devices }}" + export CMPLR_ROOT=$PWD/toolchain + sycl-ls + echo "-----" + ./devops/scripts/benchmarking/benchmark.sh -n '${{ runner.name }}' -s + - name: Push compute-benchmarks results + shell: bash + run: | + # TODO -- waiting on security clearance + # Load configuration values + . "$PWD/devops/scripts/benchmarking/utils.sh" + CONFIG_FILE="$PWD/devops/benchmarking/benchmark-ci.conf" + load_config_constants "$CONFIG_FILE" + + cd "$PERF_RES_PATH" + git config user.name "SYCL Benchmarking Bot" + git config user.email "sys_sycl_benchmarks@intel.com" + git add . + git commit -m "[GHA] Upload compute-benchmarks results from https://github.com/intel/llvm/actions/runs/${{ github.run_id }}" + git push "https://$GITHUB_TOKEN@github.com/$PERF_RES_GIT_REPO.git" "$PERF_RES_BRANCH" + - name: Archive compute-benchmark results + if: always() + uses: actions/upload-artifact@v4 + with: + name: Compute-benchmark results (${{ runner.name }}) + path: ./artifact diff --git a/devops/benchmarking/benchmark-ci.conf b/devops/benchmarking/benchmark-ci.conf new file mode 100644 index 0000000000000..ba39b37cd1e92 --- /dev/null +++ b/devops/benchmarking/benchmark-ci.conf @@ -0,0 +1,75 @@ +# +# Configuration Options +# + +# Compile flags used to build compute-benchmarks +COMPUTE_BENCH_COMPILE_FLAGS="-j2" +# Number of iterations to run tests for +COMPUTE_BENCH_ITERATIONS="100" + +# Metrics to benchmark, and their allowed variance, as a Python dictionary +#METRICS_VARIANCE='{"Median": 0.5, "StdDev": 4.0}' +METRICS_VARIANCE='{"Median": 0.5}' +# Metrics to record using aggregate.py +METRICS_RECORDED='["Median", "StdDev"]' + +# Default period of time to aggregate for the average +AVERAGE_CUTOFF_RANGE="7-days-ago" +# Accepts all valid date strings accepted by GNU coreutils `date` extension: +# +# https://www.gnu.org/software/coreutils/manual/html_node/Date-input-formats.html +# +# Relative timestamps are okay, but replace ' ' with '-', as whitespace gets +# Threshold to store benchmark files before benchmarking +# TODO reconsider this +AVERAGE_THRESHOLD=3 +# removed when config file entries are sanitized. 
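+# For example (hypothetical value), AVERAGE_CUTOFF_RANGE="2-weeks-ago" would
+# aggregate only results produced within the last two weeks.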
+
+# Enabled ONEAPI_DEVICE_SELECTOR backends
+DEVICE_SELECTOR_ENABLED_BACKENDS="level_zero,opencl,cuda,hip"
+# Disabled backends: native_cpu
+
+# Enabled ONEAPI_DEVICE_SELECTOR devices
+DEVICE_SELECTOR_ENABLED_DEVICES="cpu,gpu"
+# Disabled devices: fpga
+
+
+#
+# Constants
+#
+
+# Constants used throughout the benchmarking workflow -- do not reconfigure
+# these arbitrarily
+
+# GitHub repo + branch settings for the repo storing benchmark results
+PERF_RES_GIT_REPO="ianayl/llvm-ci-perf-results"
+PERF_RES_BRANCH="test-compute-bench"
+
+# GitHub repo + branch settings for compute-benchmarks itself
+COMPUTE_BENCH_GIT_REPO="ianayl/compute-benchmarks"
+COMPUTE_BENCH_BRANCH="update-sycl"
+
+# Path to clone benchmark results repo
+PERF_RES_PATH="./llvm-ci-perf-res"
+
+# Path to clone and build compute-benchmarks
+COMPUTE_BENCH_PATH="./compute-benchmarks"
+
+# Format of timestamps used (unix `date` format string)
+TIMESTAMP_FORMAT="%Y%m%d_%H%M%S"
+
+# Path to root folder storing benchmark CI artifact
+ARTIFACT_PATH="./artifact"
+
+# Path to temporarily cache compute-benchmark results
+OUTPUT_CACHE="./artifact/failed_tests"
+# If a test result does not get moved out of this catch-all cache path, it is
+# considered to have failed
+
+# Path to cache passing compute-benchmark results
+PASSING_CACHE="./artifact/passing_tests"
+
+# Log file for test cases that perform over the allowed variance
+BENCHMARK_SLOW_LOG="./artifact/benchmarks_failed.log"
+# Log file for test cases that errored / failed to build
+BENCHMARK_ERROR_LOG="./artifact/benchmarks_errored.log"
diff --git a/devops/benchmarking/enabled_tests.conf b/devops/benchmarking/enabled_tests.conf
new file mode 100644
index 0000000000000..0f6e21f93f67b
--- /dev/null
+++ b/devops/benchmarking/enabled_tests.conf
@@ -0,0 +1,8 @@
+# Test cases to be enabled:
+api_overhead_benchmark_sycl
+memory_benchmark_sycl
+miscellaneous_benchmark_sycl
+ulls_benchmark_sycl
+
+# As of January 2025, these are all compute-benchmark tests with a SYCL
+# implementation.
\ No newline at end of file
diff --git a/devops/scripts/benchmarking/aggregate.py b/devops/scripts/benchmarking/aggregate.py
new file mode 100644
index 0000000000000..4b3918118af75
--- /dev/null
+++ b/devops/scripts/benchmarking/aggregate.py
@@ -0,0 +1,126 @@
+import csv
+import sys
+from pathlib import Path
+import heapq
+import statistics
+
+import common
+
+
+# Simple median calculation
+class SimpleMedian:
+
+    def __init__(self):
+        self.elements = []
+
+    def add(self, n: float):
+        self.elements.append(n)
+
+    def get_median(self) -> float:
+        return statistics.median(self.elements)
+
+
+# Calculate the median incrementally using two heaps: useful when dealing with
+# a large number of samples.
+#
+# TODO how many samples are we going to realistically get? I had written this
+# with precommit in mind, but if this only runs nightly, it would actually be
+# faster to do a normal median calculation.
+class StreamingMedian:
+
+    def __init__(self):
+        # Gist: we keep a max-heap of the smaller half of the samples and a
+        # min-heap of the larger half, and read the median off the top of the
+        # max-heap. Each new element is pushed onto one of the heaps depending
+        # on whether it is larger than the current median; the heaps are then
+        # rebalanced so their sizes never differ by more than one.
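+        # For example, after add(3), add(1), add(4): the smaller half {1, 3}
+        # sits in maxheap_smaller (stored negated), the larger half {4} sits
+        # in minheap_larger, and get_median() returns 3.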
+        self.minheap_larger = []
+        self.maxheap_smaller = []
+
+        # Note: numbers on maxheap should be negative, as heapq
+        # is a minheap by default
+
+    def add(self, n: float):
+        if len(self.maxheap_smaller) == 0 or -self.maxheap_smaller[0] >= n:
+            heapq.heappush(self.maxheap_smaller, -n)
+        else:
+            heapq.heappush(self.minheap_larger, n)
+
+        # Rebalance: maxheap_smaller may hold at most one element more than
+        # minheap_larger
+        if len(self.maxheap_smaller) > len(self.minheap_larger) + 1:
+            heapq.heappush(self.minheap_larger, -heapq.heappop(self.maxheap_smaller))
+        elif len(self.maxheap_smaller) < len(self.minheap_larger):
+            heapq.heappush(self.maxheap_smaller, -heapq.heappop(self.minheap_larger))
+
+    def get_median(self) -> float:
+        if len(self.maxheap_smaller) == len(self.minheap_larger):
+            # Equal number of elements smaller and larger than the "median":
+            # there are two middle values, so the median is their average.
+            return (-self.maxheap_smaller[0] + self.minheap_larger[0]) / 2.0
+        else:
+            # Otherwise the median is the top of maxheap_smaller, which holds
+            # the extra element.
+            return -self.maxheap_smaller[0]
+
+
+def aggregate_median(test_name: str, test_dir: str, cutoff: str):
+
+    # Get all .csv samples for the requested test folder
+    def csv_samples() -> list[str]:
+        # TODO check that the path below is a valid directory
+        cache_dir = Path(test_dir)
+        # TODO check for time range; What time range do I want?
+        return filter(
+            lambda f: f.is_file()
+            and common.valid_timestamp(str(f)[-19:-4])
+            and str(f)[-19:-4] > cutoff,
+            cache_dir.glob(f"{test_name}-*_*.csv"),
+        )
+
+    # Calculate median of every desired metric:
+    aggregate_s = dict()
+    for sample_path in csv_samples():
+        with open(sample_path, "r") as sample_file:
+            for s in csv.DictReader(sample_file):
+                test_case = s["TestCase"]
+                # Construct entry in aggregate_s for test case if it does not
+                # exist already:
+                if test_case not in aggregate_s:
+                    aggregate_s[test_case] = {
+                        metric: SimpleMedian() for metric in common.metrics_variance
+                    }
+
+                for metric in common.metrics_variance:
+                    aggregate_s[test_case][metric].add(common.sanitize(s[metric]))
+
+    # Write calculated median (aggregate_s) as a new .csv file:
+    with open(f"{test_dir}/{test_name}-median.csv", "w") as output_csv:
+        writer = csv.DictWriter(
+            output_csv, fieldnames=["TestCase", *common.metrics_variance.keys()]
+        )
+        writer.writeheader()
+        for test_case in aggregate_s:
+            writer.writerow(
+                {"TestCase": test_case}
+                | {
+                    metric: aggregate_s[test_case][metric].get_median()
+                    for metric in common.metrics_variance
+                }
+            )
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 4:
+        print(f"Usage: {sys.argv[0]} <test_name> <test_dir> <cutoff_timestamp>")
+        exit(1)
+    if not common.valid_timestamp(sys.argv[3]):
+        print(sys.argv)
+        print("Bad cutoff timestamp, please use YYYYMMDD_HHMMSS.")
+        exit(1)
+    common.load_configs()
+
+    aggregate_median(sys.argv[1], sys.argv[2], sys.argv[3])
diff --git a/devops/scripts/benchmarking/benchmark.sh b/devops/scripts/benchmarking/benchmark.sh
new file mode 100755
index 0000000000000..ca707af09ce04
--- /dev/null
+++ b/devops/scripts/benchmarking/benchmark.sh
@@ -0,0 +1,308 @@
+#!/bin/sh
+
+#
+# benchmark.sh: Benchmark dpcpp using compute-benchmarks
+#
+
+usage () {
+    >&2 echo "Usage: $0 -n <runner_name> [options]
+    -n <runner_name>      GitHub runner name -- Required
+    -p <path>             Path to clone and build compute-benchmarks in
+    -r <owner/repo>       GitHub repo to use for compute-benchmarks origin, in format <owner>/<repo>
+    -b <branch>           Git branch to use within compute-benchmarks
+    -f <compile_flags>    Compile flags
passed into building compute-benchmarks + -c Clean up working directory + -C Clean up working directory and exit + -s Cache results + +This script builds and runs benchmarks from compute-benchmarks." + exit 1 +} + +clone_perf_res() { + echo "### Cloning llvm-ci-perf-res ($PERF_RES_GIT_REPO:$PERF_RES_BRANCH) ###" + mkdir -p "$(dirname "$PERF_RES_PATH")" + git clone -b $PERF_RES_BRANCH https://github.com/$PERF_RES_GIT_REPO $PERF_RES_PATH + [ "$?" -ne 0 ] && exit $? +} + +clone_compute_bench() { + echo "### Cloning compute-benchmarks ($COMPUTE_BENCH_GIT_REPO:$COMPUTE_BENCH_BRANCH) ###" + mkdir -p "$(dirname "$COMPUTE_BENCH_PATH")" + git clone -b $COMPUTE_BENCH_BRANCH \ + --recurse-submodules https://github.com/$COMPUTE_BENCH_GIT_REPO \ + $COMPUTE_BENCH_PATH + [ "$?" -ne 0 ] && exit $? +} + +build_compute_bench() { + echo "### Building compute-benchmarks ($COMPUTE_BENCH_GIT_REPO:$COMPUTE_BENCH_BRANCH) ###" + mkdir $COMPUTE_BENCH_PATH/build && cd $COMPUTE_BENCH_PATH/build && + # No reason to turn on ccache, if this docker image will be disassembled later on + cmake .. -DBUILD_SYCL=ON -DBUILD_L0=OFF -DBUILD=OCL=OFF -DCCACHE_ALLOWED=FALSE # && cmake --build . $COMPUTE_BENCH_COMPILE_FLAGS + # TODO enable mechanism for opting into L0 and OCL -- the concept is to + # subtract OCL/L0 times from SYCL times in hopes of deriving SYCL runtime + # overhead, but this is mostly an idea that needs to be mulled upon. + + if [ "$?" -eq 0 ]; then + while IFS= read -r case; do + # Skip lines starting with '#' + [ "${case##\#*}" ] || continue + make $COMPUTE_BENCH_COMPILE_FLAGS "$case" + done < "$TESTS_CONFIG" + fi + #compute_bench_build_stat=$? + cd - + #[ "$compute_bench_build_stat" -ne 0 ] && exit $compute_bench_build_stat +} + +# print_bench_res() { +# # Usage: print_bench_res +# if [ ! -s $1 ]; then +# printf "NO OUTPUT! (Status $2)\n" | tee -a $3 +# return # Do not proceed if file is empty +# fi +# +# get_csv_col_index $1 run-time-mean +# tmp_run_time_mean_i=$tmp_csv_col_i +# get_csv_col_index $1 run-time-median +# tmp_run_time_median_i=$tmp_csv_col_i +# get_csv_col_index $1 run-time-throughput +# tmp_run_time_throughput_i=$tmp_csv_col_i +# +# # `sycl-bench` output seems to like inserting the header multiple times. +# # Here we cache the header to make sure it prints only once: +# tmp_header_title="$(cat $1 | head -n 1 | sed 's/^\# Benchmark name/benchmark/')" +# tmp_result="$(cat $1 | grep '^[^\#]')" +# +# printf "%s\n%s" "$tmp_header_title" "$tmp_result" \ +# | awk -F',' -v me="$tmp_run_time_mean_i" \ +# -v md="$tmp_run_time_median_i" \ +# -v th="$tmp_run_time_throughput_i" \ +# '{printf "%-57s %-13s %-15s %-20s\n", $1, $me, $md, $th }' \ +# | tee -a $3 # Print to summary file +# } + +# Check if the number of samples for a given test case is less than a threshold +# set in benchmark-ci.conf +# +# Usage: +samples_under_threshold () { + [ ! -d "$PERF_RES_PATH/$1" ] && return 1 # Directory doesn't exist + file_count="$(find "$PERF_RES_PATH/$1" -maxdepth 1 -type f | wc -l )" + [ "$file_count" -lt "$AVERAGE_THRESHOLD" ] +} + +# Check for a regression via compare.py +# +# Usage: check_regression +check_regression() { + csv_relpath="$(dirname "$1")" + csv_name="$(basename "$1")" + if samples_under_threshold "$csv_relpath"; then + echo "Not enough samples to construct a good average, performance\ + check skipped!" + return 0 # Success status + fi + DEVOPS_PATH="$DEVOPS_PATH" \ + python "$DEVOPS_PATH/scripts/benchmarking/compare.py" \ + "$csv_relpath" "$csv_name" + return $? 
+} + +# Move the results of our benchmark into the git repo, and save benchmark +# results to artifact archive +# +# Usage: cache +cache() { + mkdir -p "$(dirname "$PASSING_CACHE/$1")" "$(dirname "$PERF_RES_PATH/$1")" + cp "$OUTPUT_CACHE/$1" "$PASSING_CACHE/$1" + mv "$OUTPUT_CACHE/$1" "$PERF_RES_PATH/$1" +} + +# Check for a regression + cache if no regression found +# +# Usage: check_and_cache +check_and_cache() { + echo "Checking $1..." + if check_regression $1; then + if [ "$CACHE_RESULTS" -eq "1" ]; then + echo "Caching $1..." + cache $1 + fi + else + [ "$CACHE_RESULTS" -eq "1" ] && echo "Regression found -- Not caching!" + fi +} + +# Run and process the results of each enabled benchmark in enabled_tests.conf +process_benchmarks() { + mkdir -p "$PERF_RES_PATH" + + echo "### Running and processing selected benchmarks ###" + if [ -z "$TESTS_CONFIG" ]; then + echo "Setting tests to run via cli is not currently supported." + exit 1 + else + rm "$BENCHMARK_ERROR_LOG" "$BENCHMARK_SLOW_LOG" 2> /dev/null + mkdir -p "$(dirname "$BENCHMARK_ERROR_LOG")" "$(dirname "$BENCHMARK_SLOW_LOG")" + # Loop through each line of enabled_tests.conf, but ignore lines in the + # test config starting with #'s: + grep "^[^#]" "$TESTS_CONFIG" | while read -r testcase; do + echo "# Running $testcase..." + + # The benchmark results git repo and this script's output both share + # the following directory structure: + # + # /// + # + # Instead of specifying 2 paths with a slightly different root + # folder name for every function we use, we can use a relative path + # to represent the file in both folders. + # + # Figure out the relative path of our testcase result: + test_dir_relpath="$DEVICE_SELECTOR_DIRNAME/$RUNNER/$testcase" + output_csv_relpath="$test_dir_relpath/$testcase-$TIMESTAMP.csv" + mkdir -p "$OUTPUT_CACHE/$test_dir_relpath" # Ensure directory exists + # TODO generate runner config txt if not exist + + output_csv="$OUTPUT_CACHE/$output_csv_relpath" + $COMPUTE_BENCH_PATH/build/bin/$testcase --csv \ + --iterations="$COMPUTE_BENCH_ITERATIONS" \ + | tail +8 > "$output_csv" + # The tail +8 filters out header lines not in csv format + + exit_status="$?" + if [ "$exit_status" -eq 0 ] && [ -s "$output_csv" ]; then + check_and_cache $output_csv_relpath + else + # TODO consider capturing stderr for logging + echo "[ERROR] $testcase returned exit status $exit_status" + echo "-- $testcase: error $exit_status" >> "$BENCHMARK_ERROR_LOG" + fi + done + fi +} + +# Handle failures + produce a report on what failed +process_results() { + fail=0 + if [ -s "$BENCHMARK_SLOW_LOG" ]; then + printf "\n### Tests performing over acceptable range of average: ###\n" + cat "$BENCHMARK_SLOW_LOG" + echo "" + fail=2 + fi + if [ -s "$BENCHMARK_ERROR_LOG" ]; then + printf "\n### Tests that failed to run: ###\n" + cat "$BENCHMARK_ERROR_LOG" + echo "" + fail=1 + fi + exit $fail +} + +cleanup() { + echo "### Cleaning up compute-benchmark builds from prior runs ###" + rm -rf $COMPUTE_BENCH_PATH + rm -rf $PERF_RES_PATH + [ ! -z "$_exit_after_cleanup" ] && exit +} + +load_configs() { + # This script needs to know where the intel/llvm "/devops" directory is, + # containing all the configuration files and the compare script. + # + # If this is not provided, this function tries to guess where the files + # are based on how the script is called, and verifies that all necessary + # configs and scripts are reachable. 
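+    #
+    # For example (hypothetical runner name), the benchmark CI action invokes
+    # this script roughly as:
+    #
+    #   CMPLR_ROOT=$PWD/toolchain ONEAPI_DEVICE_SELECTOR=level_zero:gpu \
+    #     ./devops/scripts/benchmarking/benchmark.sh -n my-runner -s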
+ + # This benchmarking script is usually at: + # + # /devops/scripts/benchmarking/benchmark.sh + # + # Derive /devops based on location of this script: + [ -z "$DEVOPS_PATH" ] && DEVOPS_PATH="$(dirname "$0")/../.." + + BENCHMARK_CI_CONFIG="$(realpath $DEVOPS_PATH/benchmarking/benchmark-ci.conf)" + TESTS_CONFIG="$(realpath $DEVOPS_PATH/benchmarking/enabled_tests.conf)" + COMPARE_PATH="$(realpath $DEVOPS_PATH/scripts/benchmarking/compare.py)" + UTILS_PATH="$(realpath $DEVOPS_PATH/scripts/benchmarking/utils.sh)" + + for file in \ + "$BENCHMARK_CI_CONFIG" "$TESTS_CONFIG" "$COMPARE_PATH" "$UTILS_PATH" + do + if [ ! -f "$file" ]; then + echo "Please provide path to DEVOPS_PATH." + exit -1 + fi + done + + . "$UTILS_PATH" + load_config_options "$BENCHMARK_CI_CONFIG" + load_config_constants "$BENCHMARK_CI_CONFIG" +} + +##### + +load_configs + +COMPUTE_BENCH_COMPILE_FLAGS="" +CACHE_RESULTS="0" +TIMESTAMP="$(date +"$TIMESTAMP_FORMAT")" + +# CLI flags + overrides to configuration options: +while getopts "p:b:r:f:n:cCs" opt; do + case $opt in + p) COMPUTE_BENCH_PATH=$OPTARG ;; + r) COMPUTE_BENCH_GIT_REPO=$OPTARG ;; + b) COMPUTE_BENCH_BRANCH=$OPTARG ;; + f) COMPUTE_BENCH_COMPILE_FLAGS=$OPTARG ;; + n) RUNNER=$OPTARG ;; + # Cleanup status is saved in a var to ensure all arguments are processed before + # performing cleanup + c) _cleanup=1 ;; + C) _cleanup=1 && _exit_after_cleanup=1 ;; + s) CACHE_RESULTS="1";; + \?) usage ;; + esac +done + +# Check all necessary variables exist: +if [ -z "$CMPLR_ROOT" ]; then + echo "Please set CMPLR_ROOT first; it is needed by compute-benchmarks to build." + exit 1 +elif [ -z "$ONEAPI_DEVICE_SELECTOR" ]; then + echo "Please set ONEAPI_DEVICE_SELECTOR first to specify which device to use." + exit 1 +elif [ -z "$RUNNER" ]; then + echo "Please specify runner name using -n first; it is needed for storing/comparing benchmark results." + exit 1 +fi + +# Make sure ONEAPI_DEVICE_SELECTOR doesn't try to enable multiple devices at the +# same time, or use specific device id's +_dev_sel_backend_re="$(echo "$DEVICE_SELECTOR_ENABLED_BACKENDS" | sed 's/,/|/g')" +_dev_sel_device_re="$(echo "$DEVICE_SELECTOR_ENABLED_DEVICES" | sed 's/,/|/g')" +_dev_sel_re="s/($_dev_sel_backend_re):($_dev_sel_device_re)//" +if [ -n "$(echo "$ONEAPI_DEVICE_SELECTOR" | sed -E "$_dev_sel_re")" ]; then + echo "Unsupported ONEAPI_DEVICE_SELECTOR value: please ensure only one \ +device is selected, and devices are not selected by indices." + echo "Enabled backends: $DEVICE_SELECTOR_ENABLED_BACKENDS" + echo "Enabled device types: $DEVICE_SELECTOR_ENABLED_DEVICES" + exit 1 +fi +# ONEAPI_DEVICE_SELECTOR values are not valid directory names in unix: this +# value lets us use ONEAPI_DEVICE_SELECTOR as actual directory names +DEVICE_SELECTOR_DIRNAME="$(echo "$ONEAPI_DEVICE_SELECTOR" | sed 's/:/-/')" + +# Clean up and delete all cached files if specified: +[ ! -z "$_cleanup" ] && cleanup +# Clone and build only if they aren't already cached/deleted: +[ ! -d "$PERF_RES_PATH" ] && clone_perf_res +[ ! -d "$COMPUTE_BENCH_PATH" ] && clone_compute_bench +[ ! 
-d "$COMPUTE_BENCH_PATH/build" ] && build_compute_bench +# Process benchmarks: +process_benchmarks +process_results diff --git a/devops/scripts/benchmarking/common.py b/devops/scripts/benchmarking/common.py new file mode 100644 index 0000000000000..75f236066fd98 --- /dev/null +++ b/devops/scripts/benchmarking/common.py @@ -0,0 +1,67 @@ +import os +import re +import ast + +# Globals definition +PERF_RES_PATH, metrics_variance, metrics_recorded = None, None, None +BENCHMARK_SLOW_LOG, BENCHMARK_ERROR_LOG = None, None + + +def sanitize(stat: str) -> float: + # Get rid of % + if stat[-1] == "%": + stat = stat[:-1] + return float(stat) + + +def load_configs(): + DEVOPS_PATH = os.getenv("DEVOPS_PATH") + if DEVOPS_PATH is None: + # Try to predict where /devops is based on executable + DEVOPS_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) + + benchmarking_ci_conf_path = f"{DEVOPS_PATH}/benchmarking/benchmark-ci.conf" + if not os.path.isfile(benchmarking_ci_conf_path): + raise Exception(f"Please provide path to a valid DEVOPS_PATH.") + + global PERF_RES_PATH, OUTPUT_CACHE, metrics_variance, metrics_recorded + global BENCHMARK_ERROR_LOG, BENCHMARK_SLOW_LOG + perf_res_re = re.compile(r"^PERF_RES_PATH=(.*)$", re.M) + output_cache_re = re.compile(r"^OUTPUT_CACHE=(.*)$", re.M) + m_variance_re = re.compile(r"^METRICS_VARIANCE=(.*)$", re.M) + m_recorded_re = re.compile(r"^METRICS_RECORDED=(.*)$", re.M) + b_slow_re = re.compile(r"^BENCHMARK_SLOW_LOG=(.*)$", re.M) + b_error_re = re.compile(r"^BENCHMARK_ERROR_LOG=(.*)$", re.M) + + with open(benchmarking_ci_conf_path, "r") as configs_file: + configs_str = configs_file.read() + + for m_variance in m_variance_re.findall(configs_str): + metrics_variance = ast.literal_eval(m_variance.strip()[1:-1]) + if not isinstance(metrics_variance, dict): + raise TypeError("Error in benchmark-ci.conf: METRICS_VARIANCE is not a python dict.") + + for m_recorded in m_recorded_re.findall(configs_str): + metrics_recorded = ast.literal_eval(m_recorded.strip()[1:-1]) + if not isinstance(metrics_recorded, list): + raise TypeError("Error in benchmark-ci.conf: METRICS_RECORDED is not a python list.") + + for perf_res in perf_res_re.findall(configs_str): + PERF_RES_PATH = str(perf_res[1:-1]) + + for output_cache in output_cache_re.findall(configs_str): + OUTPUT_CACHE = str(output_cache[1:-1]) + + for b_slow_log in b_slow_re.findall(configs_str): + BENCHMARK_SLOW_LOG = str(b_slow_log[1:-1]) + + for b_error_log in b_error_re.findall(configs_str): + BENCHMARK_ERROR_LOG = str(b_error_log[1:-1]) + + +def valid_timestamp(timestamp: str) -> bool: + timestamp_re = re.compile( + # YYYYMMDD_HHMMSS + r"^\d{4}(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])_(0[0-9]|1[0-9]|2[0-3])[0-5][0-9][0-5][0-9]$" + ) + return timestamp_re.match(timestamp) is not None \ No newline at end of file diff --git a/devops/scripts/benchmarking/compare.py b/devops/scripts/benchmarking/compare.py new file mode 100644 index 0000000000000..7974fcc130397 --- /dev/null +++ b/devops/scripts/benchmarking/compare.py @@ -0,0 +1,73 @@ +import os +import csv +import sys +from pathlib import Path + +import common + +def compare_to_median(test_name: str, median_path: str, test_csv_path: str): + median = dict() # stores actual median of current testcase + with open(median_path, "r") as median_csv: + for stat in csv.DictReader(median_csv): + median[stat["TestCase"]] = { + metric: float(stat[metric]) for metric in common.metrics_variance + } + + # TODO read status codes from a config file instead? 
+ status = 0 + failure_counts = {metric: 0 for metric in common.metrics_variance} + with open(test_csv_path, "r") as sample_csv: + for sample in csv.DictReader(sample_csv): + test_case = sample["TestCase"] + + # Ignore test cases we haven't profiled before + if test_case not in median: + continue + hist_median = median[test_case] + for metric, threshold in common.metrics_variance.items(): + max_tolerated = hist_median[metric] * (1 + threshold) + sample_value = common.sanitize(sample[metric]) + if sample_value > max_tolerated: + print("vvv FAILED vvv") + print(test_case) + print( + f"{metric}: {sample_value} -- Historic avg. {hist_median[metric]} (max tolerance {threshold*100}%: {max_tolerated})" + ) + print("^^^^^^^^^^^^^^") + with open(common.BENCHMARK_SLOW_LOG, "a") as slow_log: + slow_log.write( + f"-- {test_name}::{test_case}\n" + f" {metric}: {sample_value} -- Historic avg. {hist_median[metric]} (max tol. {threshold*100}%: {max_tolerated})\n" + ) + status = 1 + failure_counts[metric] += 1 + if status != 0: + print(f"Failure counts: {failure_counts}") + return status + + +if __name__ == "__main__": + if len(sys.argv) < 3: + print(f"Usage: {sys.argv[0]} ") + exit(1) + # Both benchmark results git repo and benchmark.sh output are structured + # like so: + # /// + # This relative path is sys.argv[1], while the name of the csv file we are + # comparing against is sys.argv[2]. + common.load_configs() + test_name = os.path.basename(sys.argv[1]) + test_csv_path = f"{common.OUTPUT_CACHE}/{sys.argv[1]}/{sys.argv[2]}" + median_path = f"{common.PERF_RES_PATH}/{sys.argv[1]}/{test_name}-median.csv" + + if not os.path.isfile(test_csv_path): + print("Invalid test file provided: " + test_csv_path) + exit(1) + if not os.path.isfile(median_path): + print( + f"Median file for test {test_name} not found at {median_path}.\n" + + "Please calculate the median using the aggregate workflow." + ) + exit(1) + + exit(compare_to_median(test_name, median_path, test_csv_path)) diff --git a/devops/scripts/benchmarking/utils.sh b/devops/scripts/benchmarking/utils.sh new file mode 100644 index 0000000000000..ccff01b572add --- /dev/null +++ b/devops/scripts/benchmarking/utils.sh @@ -0,0 +1,111 @@ +#!/bin/sh + +# +# utils.sh: Utilities for benchmarking scripts +# + +# Usage: _sanitize_configs +_sanitize_configs() { + # Trim quotes if any + trim_quotes="$(printf "%s" "$2" | tr -d "\n" | sed 's/^"//; s/"$//')" + check_illegal_chars="$(printf "%s" "$trim_quotes" | sed 's/[a-zA-Z0-9_.,:/%-]//g')" + + if [ -n "$check_illegal_chars" ]; then + # Throw if unallowed characters are spotted + printf "" + else + # Return the trimmed string + printf "%s" "$trim_quotes" + fi +} + +_preprocess_config() { + # Remove comments + _tmp1="$(mktemp)" + grep '^[^#]' "$1" > "$_tmp1" + # Skip values intended for python + _tmp2="$(mktemp)" + grep -E -v '^METRICS_(VARIANCE|RECORDED)' "$_tmp1" > "$_tmp2" + rm "$_tmp1" + # Return + echo "$_tmp2" +} + +# Sanitize + load all known configuration options +# Usage: load_config_options +load_config_options() { + processed_config="$(_preprocess_config $1)" + # Strict loading of configuration options by name: + while IFS='=' read -r key value; do + sanitized_value=$(_sanitize_configs "$key" "$value") + if [ -z "$sanitized_value" ]; then + echo "Bad configuration value for $key: $value" + echo "Ensure $value is within character range [a-zA-Z0-9_.,:/%-]." 
+ exit 1 + fi + + case "$key" in + 'COMPUTE_BENCH_COMPILE_FLAGS') + export COMPUTE_BENCH_COMPILE_FLAGS="$sanitized_value" ;; + 'COMPUTE_BENCH_ITERATIONS') + export COMPUTE_BENCH_ITERATIONS="$sanitized_value" ;; + 'AVERAGE_THRESHOLD') + export AVERAGE_THRESHOLD="$sanitized_value" ;; + 'AVERAGE_CUTOFF_RANGE') + export AVERAGE_CUTOFF_RANGE="$sanitized_value" ;; + 'DEVICE_SELECTOR_ENABLED_BACKENDS') + export DEVICE_SELECTOR_ENABLED_BACKENDS="$sanitized_value" ;; + 'DEVICE_SELECTOR_ENABLED_DEVICES') + export DEVICE_SELECTOR_ENABLED_DEVICES="$sanitized_value" ;; + esac + done < "$processed_config" +} + +# Sanitize + load all (known) constants from the configuration file +# Usage: load_config_constants +load_config_constants() { + processed_config="$(_preprocess_config $1)" + # Strict loading of configuration options by name: + while IFS='=' read -r key value; do + sanitized_value=$(_sanitize_configs "$key" "$value") + if [ -z "$sanitized_value" ]; then + echo "Bad configuration value for $key: $value" + echo "Ensure $value is within character range [a-zA-Z0-9_.,:/%-]." + exit 1 + fi + + case "$key" in + 'PERF_RES_GIT_REPO') + export PERF_RES_GIT_REPO="$sanitized_value" ;; + 'PERF_RES_BRANCH') + export PERF_RES_BRANCH="$sanitized_value" ;; + 'PERF_RES_PATH') + export PERF_RES_PATH="$sanitized_value" ;; + 'COMPUTE_BENCH_GIT_REPO') + export COMPUTE_BENCH_GIT_REPO="$sanitized_value" ;; + 'COMPUTE_BENCH_BRANCH') + export COMPUTE_BENCH_BRANCH="$sanitized_value" ;; + 'COMPUTE_BENCH_PATH') + export COMPUTE_BENCH_PATH="$sanitized_value" ;; + 'OUTPUT_CACHE') + export OUTPUT_CACHE="$sanitized_value" ;; + 'ARTIFACT_PATH') + export ARTIFACT_PATH="$sanitized_value" ;; + 'PASSING_CACHE') + export PASSING_CACHE="$sanitized_value" ;; + 'TIMESTAMP_FORMAT') + export TIMESTAMP_FORMAT="$sanitized_value" ;; + 'BENCHMARK_SLOW_LOG') + export BENCHMARK_SLOW_LOG="$sanitized_value" ;; + 'BENCHMARK_ERROR_LOG') + export BENCHMARK_ERROR_LOG="$sanitized_value" ;; + esac + done < "$processed_config" +} + +# # Sanitize + load a single configuration value +# # Usage: load_single_config +# load_single_config() { +# _val="$(_sanitize_configs "$(grep "^$2=" "$1" | sed "s/^$2=//")")" +# export "$2=$_val" +# }
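Both the aggregate workflow and the benchmark action consume these helpers the
same way: source utils.sh, then call a loader against benchmark-ci.conf, after
which the sanitized values are available as exported variables. A minimal
sketch of that pattern (assuming it is run from the root of an intel/llvm
checkout; the echo is purely illustrative):

    #!/bin/sh
    # Load the sanitized constants and show where benchmark results would be
    # cloned from and to.
    . "$PWD/devops/scripts/benchmarking/utils.sh"
    load_config_constants "$PWD/devops/benchmarking/benchmark-ci.conf"
    echo "Results repo: $PERF_RES_GIT_REPO ($PERF_RES_BRANCH) -> $PERF_RES_PATH"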