From dddd1b7a7fbd5a7ce47000ac1a5d6e1ce75a535c Mon Sep 17 00:00:00 2001
From: jnsbck <65561470+jnsbck@users.noreply.github.com>
Date: Tue, 3 Dec 2024 18:58:08 +0100
Subject: [PATCH] Confidence bounds for regression tests (#540)

* enh: add t-test to regression tests.

* fix: fix wrong type of baseline env variable and better printouts

* fix: small fixes

* fix: change command to update baseline

* fix: conf region changed to <=
---
 .../workflows/update_regression_baseline.yml |  4 +-
 .gitignore                                   |  1 +
 tests/conftest.py                            | 16 +++--
 tests/test_regression.py                     | 67 ++++++++++---------
 4 files changed, 48 insertions(+), 40 deletions(-)

diff --git a/.github/workflows/update_regression_baseline.yml b/.github/workflows/update_regression_baseline.yml
index c2d6bf65..4190da6c 100644
--- a/.github/workflows/update_regression_baseline.yml
+++ b/.github/workflows/update_regression_baseline.yml
@@ -12,8 +12,8 @@ jobs:
   update_regression_tests:
     name: update_regression_tests
     runs-on: ubuntu-20.04
-    # Trigger from a comment that contains '/update_regression_baselines'
-    if: github.event.issue.pull_request && contains(github.event.comment.body, '/update_regression_baselines')
+    # Trigger from a comment that contains '/update_regression_baseline'
+    if: github.event.issue.pull_request && contains(github.event.comment.body, '/update_regression_baseline')
     # workflow needs permissions to write to the PR
     permissions:
       contents: write
diff --git a/.gitignore b/.gitignore
index d5638eb6..789570e4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -57,6 +57,7 @@ coverage.xml
 .pytest_cache/
 tests/regression_test_results.json
 tests/regression_test_baselines.json
+tests/regression_test_report.txt
 
 # Translations
 *.mo
diff --git a/tests/conftest.py b/tests/conftest.py
index afe41e30..fd503c01 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -209,7 +209,9 @@ def get_or_compute_swc2jaxley_params(
 @pytest.fixture(scope="session", autouse=True)
 def print_session_report(request, pytestconfig):
     """Cleanup a testing directory once we are finished."""
-    NEW_BASELINE = os.environ["NEW_BASELINE"] if "NEW_BASELINE" in os.environ else 0
+    NEW_BASELINE = (
+        int(os.environ["NEW_BASELINE"]) if "NEW_BASELINE" in os.environ else 0
+    )
 
     dirname = os.path.dirname(__file__)
     baseline_fname = os.path.join(dirname, "regression_test_baselines.json")
@@ -220,11 +222,10 @@ def print_session_report(request, pytestconfig):
     ]
 
     def update_baseline():
-        if NEW_BASELINE:
-            results = load_json(results_fname)
-            with open(baseline_fname, "w") as f:
-                json.dump(results, f, indent=2)
-            os.remove(results_fname)
+        results = load_json(results_fname)
+        with open(baseline_fname, "w") as f:
+            json.dump(results, f, indent=2)
+        os.remove(results_fname)
 
     def print_regression_report():
         baselines = load_json(baseline_fname)
@@ -243,5 +244,6 @@ def print_regression_report():
         print(report)
 
     if len(collected_regression_tests) > 0:
-        request.addfinalizer(update_baseline)
+        if NEW_BASELINE:
+            request.addfinalizer(update_baseline)
         request.addfinalizer(print_regression_report)
diff --git a/tests/test_regression.py b/tests/test_regression.py
index a1ccafe1..66dca95e 100644
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -10,12 +10,15 @@
 import numpy as np
 import pytest
 from jax import jit
+from scipy.stats import t as t_dist
 
 import jaxley as jx
 from jaxley.channels import HH
 from jaxley.connect import sparse_connect
 from jaxley.synapses import IonotropicSynapse
 
+pytestmark = pytest.mark.regression  # mark all tests as regression tests in this file
+
 # Every runtime test needs to have the following structure:
 #
 # @compare_to_baseline()
@@ -38,24 +41,12 @@
 # takes into account the input_kwargs of the test, the name of the test and the runtimes
 # of each part.
 
+NEW_BASELINE = int(os.environ["NEW_BASELINE"]) if "NEW_BASELINE" in os.environ else 0
+CONFIDENCE = 0.95
 
-def load_json(fpath):
-    dct = {}
-    if os.path.exists(fpath):
-        with open(fpath, "r") as f:
-            dct = json.load(f)
-    return dct
-
-
-pytestmark = pytest.mark.regression  # mark all tests as regression tests in this file
-NEW_BASELINE = os.environ["NEW_BASELINE"] if "NEW_BASELINE" in os.environ else 0
 dirname = os.path.dirname(__file__)
 fpath_baselines = os.path.join(dirname, "regression_test_baselines.json")
 fpath_results = os.path.join(dirname, "regression_test_results.json")
-
-tolerance = 0.2
-
-baselines = load_json(fpath_baselines)
 with open(fpath_results, "w") as f:  # clear previous results
     f.write("{}")
 
@@ -83,14 +74,14 @@ def generate_regression_report(base_results, new_results):
         diff = None if base_time is None else ((new_time - base_time) / base_time)
 
         status = ""
-        if diff is None:
+        if base_time is None:
             status = "🆕"
-        elif diff > tolerance:
+        elif new_time <= base_time:
+            status = "🟢" if diff is not None and diff < -0.05 else "🟠"
+        elif new_time > base_time:
             status = "🔴"
-        elif diff < 0:
-            status = "🟢"
         else:
-            status = "⚪"
+            status = "❌"  # This should never happen.
 
         time_str = (
             f"({new_time:.3f}s)"
@@ -111,6 +102,20 @@ def generate_unique_key(d):
     return str(hash)
 
 
+def compute_conf_bounds(X):
+    df = len(X) - 1  # degrees of freedom = n-1
+    critical_value = t_dist.ppf(CONFIDENCE, df)
+    return np.mean(X) + critical_value * np.std(X, ddof=1)  # sample std
+
+
+def load_json(fpath):
+    dct = {}
+    if os.path.exists(fpath):
+        with open(fpath, "r") as f:
+            dct = json.load(f)
+    return dct
+
+
 def append_to_json(fpath, test_name, input_kwargs, runtimes):
     header = {"test_name": test_name, "input_kwargs": input_kwargs}
     data = {generate_unique_key(header): {**header, "runtimes": runtimes}}
@@ -124,9 +129,10 @@
 
 
 class compare_to_baseline:
-    def __init__(self, baseline_iters=3, test_iters=1):
+    def __init__(self, baseline_iters=5, test_iters=1):
         self.baseline_iters = baseline_iters
         self.test_iters = test_iters
+        self.baselines = load_json(fpath_baselines)
 
     def __call__(self, func):
         @wraps(func)  # ensures kwargs exposed to pytest
@@ -139,24 +145,23 @@ def test_wrapper(**kwargs):
             for _ in range(num_iters):
                 runtimes = func(**kwargs)
                 runs.append(runtimes)
-            runtimes = {k: np.mean([d[k] for d in runs]) for k in runs[0]}
+
+            # the baseline time is taken as the upper bound of the confidence interval,
+            # while the runtimes that we test against the baseline are taken as the mean
+            agg = compute_conf_bounds if NEW_BASELINE else np.mean
+            runtimes = {k: agg([d[k] for d in runs]) for k in runs[0]}
 
             append_to_json(
                 fpath_results, header["test_name"], header["input_kwargs"], runtimes
             )
 
             if not NEW_BASELINE:
-                assert key in baselines, f"No basline found for {header}"
-                func_baselines = baselines[key]["runtimes"]
+                assert key in self.baselines, f"No baseline found for {header}"
+                func_baselines = self.baselines[key]["runtimes"]
                 for key, baseline in func_baselines.items():
-                    diff = (
-                        float("nan")
-                        if np.isclose(baseline, 0)
-                        else (runtimes[key] - baseline) / baseline
-                    )
-                    assert runtimes[key] <= baseline * (
-                        1 + tolerance
-                    ), f"{key} is {diff:.2%} slower than the baseline."
+                    assert (
+                        runtimes[key] <= baseline
+                    ), f"{key} is significantly slower than the baseline at {runtimes[key]:.3f}s vs. {baseline:.3f}s."
 
         return test_wrapper
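
Note (illustrative sketch, not part of the diff above): with NEW_BASELINE set, the patch stores
mean(X) + t_{0.95, n-1} * std(X, ddof=1) over the baseline runs as the new baseline (what the
in-line comment calls the upper bound of the confidence interval), and a later test run fails
when the mean of its runtimes exceeds that stored value. The snippet below replays this logic
outside of pytest; the timing values and the names `upper_conf_bound`, `baseline_runs`, and
`new_runs` are hypothetical and only mirror what `compute_conf_bounds` and the final assert in
`test_wrapper` do.

    import numpy as np
    from scipy.stats import t as t_dist

    CONFIDENCE = 0.95

    def upper_conf_bound(samples):
        # One-sided bound: sample mean plus t critical value times sample std.
        df = len(samples) - 1
        t_crit = t_dist.ppf(CONFIDENCE, df)
        return np.mean(samples) + t_crit * np.std(samples, ddof=1)

    baseline_runs = [1.02, 0.98, 1.05, 1.01, 0.99]  # runtimes (s) of 5 baseline iterations
    new_runs = [1.03]                               # runtimes (s) of 1 test iteration

    baseline = upper_conf_bound(baseline_runs)  # value a NEW_BASELINE run would store
    assert np.mean(new_runs) <= baseline, "runtime regression"

Raising CONFIDENCE (e.g. to 0.99) widens the bound, which makes the check less prone to flaky
failures at the cost of missing smaller slowdowns.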