Confidence bounds for regression tests (#540)
* enh: add t-test to regression tests.

* fix: fix wrong type of baseline env variable and better printouts

* fix: small fixes

* fix: change command to update baseline

* fix: conf region changed to <=
jnsbck authored Dec 3, 2024
1 parent dd477f2 commit dddd1b7
Showing 4 changed files with 48 additions and 40 deletions.
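At its core, the commit changes how a regression is detected: instead of letting each runtime exceed its baseline by a fixed relative tolerance, the baseline stored for every timed part is now the upper end of a one-sided t-distribution confidence interval over several baseline runs, and the mean runtime of a test run must come in at or below that bound (the "<=" comparison mentioned in the commit message). A minimal sketch of that check; the helper name and the runtime numbers are illustrative, not the repository's API:

import numpy as np
from scipy.stats import t as t_dist

CONFIDENCE = 0.95

def upper_conf_bound(runtimes, confidence=CONFIDENCE):
    # One-sided upper confidence bound on the mean of a few baseline runs,
    # mirroring compute_conf_bounds in tests/test_regression.py below.
    runtimes = np.asarray(runtimes, dtype=float)
    df = len(runtimes) - 1                       # degrees of freedom = n - 1
    critical_value = t_dist.ppf(confidence, df)  # one-sided t quantile
    return runtimes.mean() + critical_value * runtimes.std(ddof=1)

# Hypothetical numbers, purely for illustration (seconds over 5 baseline iterations):
baseline_runs = [1.02, 0.98, 1.05, 1.01, 0.99]
baseline = upper_conf_bound(baseline_runs)  # what gets stored as the baseline
new_mean_runtime = 1.00                     # mean over the test iterations
assert new_mean_runtime <= baseline, "significantly slower than the baseline"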
4 changes: 2 additions & 2 deletions .github/workflows/update_regression_baseline.yml
@@ -12,8 +12,8 @@ jobs:
update_regression_tests:
name: update_regression_tests
runs-on: ubuntu-20.04
# Trigger from a comment that contains '/update_regression_baselines'
if: github.event.issue.pull_request && contains(github.event.comment.body, '/update_regression_baselines')
# Trigger from a comment that contains '/update_regression_baseline'
if: github.event.issue.pull_request && contains(github.event.comment.body, '/update_regression_baseline')
# workflow needs permissions to write to the PR
permissions:
contents: write
1 change: 1 addition & 0 deletions .gitignore
@@ -57,6 +57,7 @@ coverage.xml
.pytest_cache/
tests/regression_test_results.json
tests/regression_test_baselines.json
tests/regression_test_report.txt

# Translations
*.mo
16 changes: 9 additions & 7 deletions tests/conftest.py
@@ -209,7 +209,9 @@ def get_or_compute_swc2jaxley_params(
@pytest.fixture(scope="session", autouse=True)
def print_session_report(request, pytestconfig):
"""Cleanup a testing directory once we are finished."""
NEW_BASELINE = os.environ["NEW_BASELINE"] if "NEW_BASELINE" in os.environ else 0
NEW_BASELINE = (
int(os.environ["NEW_BASELINE"]) if "NEW_BASELINE" in os.environ else 0
)

dirname = os.path.dirname(__file__)
baseline_fname = os.path.join(dirname, "regression_test_baselines.json")
@@ -220,11 +222,10 @@ def print_session_report(request, pytestconfig):
]

def update_baseline():
if NEW_BASELINE:
results = load_json(results_fname)
with open(baseline_fname, "w") as f:
json.dump(results, f, indent=2)
os.remove(results_fname)
results = load_json(results_fname)
with open(baseline_fname, "w") as f:
json.dump(results, f, indent=2)
os.remove(results_fname)

def print_regression_report():
baselines = load_json(baseline_fname)
@@ -243,5 +244,6 @@ def print_regression_report():
print(report)

if len(collected_regression_tests) > 0:
request.addfinalizer(update_baseline)
if NEW_BASELINE:
request.addfinalizer(update_baseline)
request.addfinalizer(print_regression_report)
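A note on the int cast in the conftest.py hunk above: values read from os.environ are strings, and a non-empty string such as "0" is truthy, so the previous fallback expression could treat NEW_BASELINE=0 as if the flag were set. A standalone illustration of the difference (not the project's code):

import os

os.environ["NEW_BASELINE"] = "0"

# Before the fix: the raw environment string is used, and "0" is truthy.
flag = os.environ["NEW_BASELINE"] if "NEW_BASELINE" in os.environ else 0
print(bool(flag))   # True

# After the fix: casting to int restores the intended on/off semantics.
flag = int(os.environ["NEW_BASELINE"]) if "NEW_BASELINE" in os.environ else 0
print(bool(flag))   # False

This also matches why update_baseline no longer checks the flag itself: the finalizer is only registered when NEW_BASELINE is truthy.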
67 changes: 36 additions & 31 deletions tests/test_regression.py
@@ -10,12 +10,15 @@
import numpy as np
import pytest
from jax import jit
from scipy.stats import t as t_dist

import jaxley as jx
from jaxley.channels import HH
from jaxley.connect import sparse_connect
from jaxley.synapses import IonotropicSynapse

pytestmark = pytest.mark.regression # mark all tests as regression tests in this file

# Every runtime test needs to have the following structure:
#
# @compare_to_baseline()
@@ -38,24 +41,12 @@
# takes into account the input_kwargs of the test, the name of the test and the runtimes
# of each part.

NEW_BASELINE = int(os.environ["NEW_BASELINE"]) if "NEW_BASELINE" in os.environ else 0
CONFIDENCE = 0.95

def load_json(fpath):
dct = {}
if os.path.exists(fpath):
with open(fpath, "r") as f:
dct = json.load(f)
return dct


pytestmark = pytest.mark.regression # mark all tests as regression tests in this file
NEW_BASELINE = os.environ["NEW_BASELINE"] if "NEW_BASELINE" in os.environ else 0
dirname = os.path.dirname(__file__)
fpath_baselines = os.path.join(dirname, "regression_test_baselines.json")
fpath_results = os.path.join(dirname, "regression_test_results.json")

tolerance = 0.2

baselines = load_json(fpath_baselines)
with open(fpath_results, "w") as f: # clear previous results
f.write("{}")

@@ -83,14 +74,14 @@ def generate_regression_report(base_results, new_results):
diff = None if base_time is None else ((new_time - base_time) / base_time)

status = ""
if diff is None:
if base_time is None:
status = "🆕"
elif diff > tolerance:
elif new_time <= base_time:
status = "🟢" if diff is not None and diff < -0.05 else "🟠"
elif new_time > base_time:
status = "🔴"
elif diff < 0:
status = "🟢"
else:
status = "⚪"
status = "❌" # This should never happen.

time_str = (
f"({new_time:.3f}s)"
@@ -111,6 +102,20 @@ def generate_unique_key(d):
return str(hash)


def compute_conf_bounds(X):
df = len(X) - 1 # degrees of freedom = n-1
critical_value = t_dist.ppf(CONFIDENCE, df)
return np.mean(X) + critical_value * np.std(X, ddof=1) # sample std


def load_json(fpath):
dct = {}
if os.path.exists(fpath):
with open(fpath, "r") as f:
dct = json.load(f)
return dct


def append_to_json(fpath, test_name, input_kwargs, runtimes):
header = {"test_name": test_name, "input_kwargs": input_kwargs}
data = {generate_unique_key(header): {**header, "runtimes": runtimes}}
@@ -124,9 +129,10 @@ def append_to_json(fpath, test_name, input_kwargs, runtimes):


class compare_to_baseline:
def __init__(self, baseline_iters=3, test_iters=1):
def __init__(self, baseline_iters=5, test_iters=1):
self.baseline_iters = baseline_iters
self.test_iters = test_iters
self.baselines = load_json(fpath_baselines)

def __call__(self, func):
@wraps(func) # ensures kwargs exposed to pytest
@@ -139,24 +145,23 @@ def test_wrapper(**kwargs):
for _ in range(num_iters):
runtimes = func(**kwargs)
runs.append(runtimes)
runtimes = {k: np.mean([d[k] for d in runs]) for k in runs[0]}

# the baseline time is taken as the upper bound of the confidence interval,
# while the runtimes that we test against the baseline are taken as the mean
agg = compute_conf_bounds if NEW_BASELINE else np.mean
runtimes = {k: agg([d[k] for d in runs]) for k in runs[0]}

append_to_json(
fpath_results, header["test_name"], header["input_kwargs"], runtimes
)

if not NEW_BASELINE:
assert key in baselines, f"No baseline found for {header}"
func_baselines = baselines[key]["runtimes"]
assert key in self.baselines, f"No baseline found for {header}"
func_baselines = self.baselines[key]["runtimes"]
for key, baseline in func_baselines.items():
diff = (
float("nan")
if np.isclose(baseline, 0)
else (runtimes[key] - baseline) / baseline
)
assert runtimes[key] <= baseline * (
1 + tolerance
), f"{key} is {diff:.2%} slower than the baseline."
assert (
runtimes[key] <= baseline
), f"{key} is significantly slower than the baseline at {runtimes[key]:.3f}s vs. {baseline:.3f}s."

return test_wrapper

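For context, the (partly collapsed) comment block near the top of tests/test_regression.py describes the shape every runtime test must have: decorated with @compare_to_baseline(), collected as a regression test via the module-level pytestmark, and returning a dict that maps each timed part of the run to its runtime; the baseline key is derived from the test name and its input kwargs. A hypothetical test of that shape, with the workload, parameter, and part names invented for illustration:

import time

import pytest

@pytest.mark.parametrize("num_cells", [10])
@compare_to_baseline()   # the decorator defined in this file
def test_runtime_example(num_cells):
    runtimes = {}

    start = time.perf_counter()
    # ... build the model (hypothetical workload) ...
    runtimes["build_time"] = time.perf_counter() - start

    start = time.perf_counter()
    # ... run the simulation (hypothetical workload) ...
    runtimes["run_time"] = time.perf_counter() - start

    return runtimes

With NEW_BASELINE=1 the decorator aggregates the per-part runtimes with compute_conf_bounds and writes them to regression_test_results.json, which the conftest finalizer promotes to the baseline file; otherwise it aggregates with np.mean and asserts that each part stays at or below its stored bound.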
