Confidence bounds for regression tests #540

Merged 5 commits on Dec 3, 2024
.github/workflows/update_regression_baseline.yml (4 changes: 2 additions & 2 deletions)
@@ -12,8 +12,8 @@ jobs:
   update_regression_tests:
     name: update_regression_tests
     runs-on: ubuntu-20.04
-    # Trigger from a comment that contains '/update_regression_baselines'
-    if: github.event.issue.pull_request && contains(github.event.comment.body, '/update_regression_baselines')
+    # Trigger from a comment that contains '/update_regression_baseline'
+    if: github.event.issue.pull_request && contains(github.event.comment.body, '/update_regression_baseline')
     # workflow needs permissions to write to the PR
     permissions:
       contents: write
.gitignore (1 change: 1 addition & 0 deletions)
@@ -57,6 +57,7 @@ coverage.xml
 .pytest_cache/
 tests/regression_test_results.json
 tests/regression_test_baselines.json
+tests/regression_test_report.txt

 # Translations
 *.mo
tests/conftest.py (16 changes: 9 additions & 7 deletions)
@@ -209,7 +209,9 @@ def get_or_compute_swc2jaxley_params(
 @pytest.fixture(scope="session", autouse=True)
 def print_session_report(request, pytestconfig):
     """Cleanup a testing directory once we are finished."""
-    NEW_BASELINE = os.environ["NEW_BASELINE"] if "NEW_BASELINE" in os.environ else 0
+    NEW_BASELINE = (
+        int(os.environ["NEW_BASELINE"]) if "NEW_BASELINE" in os.environ else 0
+    )

     dirname = os.path.dirname(__file__)
     baseline_fname = os.path.join(dirname, "regression_test_baselines.json")
@@ -220,11 +222,10 @@ def print_session_report(request, pytestconfig):
     ]

     def update_baseline():
-        if NEW_BASELINE:
-            results = load_json(results_fname)
-            with open(baseline_fname, "w") as f:
-                json.dump(results, f, indent=2)
-            os.remove(results_fname)
+        results = load_json(results_fname)
+        with open(baseline_fname, "w") as f:
+            json.dump(results, f, indent=2)
+        os.remove(results_fname)

     def print_regression_report():
         baselines = load_json(baseline_fname)
@@ -243,5 +244,6 @@ def print_regression_report():
         print(report)

     if len(collected_regression_tests) > 0:
-        request.addfinalizer(update_baseline)
+        if NEW_BASELINE:
+            request.addfinalizer(update_baseline)
         request.addfinalizer(print_regression_report)
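With this change, the update_baseline finalizer is only registered when the NEW_BASELINE environment variable is set, so an ordinary test run can no longer overwrite the stored baselines. As a hedged sketch (the exact invocation used by the CI workflow is not part of this diff), the baselines could be regenerated locally along these lines, assuming the regression tests live under tests/ and carry the regression marker set in the tests/test_regression.py diff below:

import os
import sys

import pytest

# Hypothetical helper script, not part of the PR.
# NEW_BASELINE=1 makes conftest.py register the update_baseline finalizer,
# which promotes tests/regression_test_results.json to
# tests/regression_test_baselines.json at the end of the session.
os.environ["NEW_BASELINE"] = "1"

# Run only the tests marked as regression tests.
sys.exit(pytest.main(["-m", "regression", "tests/"]))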
tests/test_regression.py (67 changes: 36 additions & 31 deletions)
@@ -10,12 +10,15 @@
 import numpy as np
 import pytest
 from jax import jit
+from scipy.stats import t as t_dist

 import jaxley as jx
 from jaxley.channels import HH
 from jaxley.connect import sparse_connect
 from jaxley.synapses import IonotropicSynapse

+pytestmark = pytest.mark.regression  # mark all tests as regression tests in this file
+
 # Every runtime test needs to have the following structure:
 #
 # @compare_to_baseline()
@@ -38,24 +41,12 @@
 # takes into account the input_kwargs of the test, the name of the test and the runtimes
 # of each part.

+NEW_BASELINE = int(os.environ["NEW_BASELINE"]) if "NEW_BASELINE" in os.environ else 0
+CONFIDENCE = 0.95
-
-def load_json(fpath):
-    dct = {}
-    if os.path.exists(fpath):
-        with open(fpath, "r") as f:
-            dct = json.load(f)
-    return dct
-
-
-pytestmark = pytest.mark.regression  # mark all tests as regression tests in this file
-NEW_BASELINE = os.environ["NEW_BASELINE"] if "NEW_BASELINE" in os.environ else 0
 dirname = os.path.dirname(__file__)
 fpath_baselines = os.path.join(dirname, "regression_test_baselines.json")
 fpath_results = os.path.join(dirname, "regression_test_results.json")

-tolerance = 0.2
-
-baselines = load_json(fpath_baselines)
 with open(fpath_results, "w") as f:  # clear previous results
     f.write("{}")

@@ -83,14 +74,14 @@ def generate_regression_report(base_results, new_results):
             diff = None if base_time is None else ((new_time - base_time) / base_time)

             status = ""
-            if diff is None:
+            if base_time is None:
                 status = "🆕"
-            elif diff > tolerance:
+            elif new_time <= base_time:
+                status = "🟢" if diff is not None and diff < -0.05 else "🟠"
+            elif new_time > base_time:
                 status = "🔴"
-            elif diff < 0:
-                status = "🟢"
             else:
-                status = "⚪"
+                status = "❌"  # This should never happen.

             time_str = (
                 f"({new_time:.3f}s)"
@@ -111,6 +102,20 @@ def generate_unique_key(d):
     return str(hash)


+def compute_conf_bounds(X):
+    df = len(X) - 1  # degrees of freedom = n-1
+    critical_value = t_dist.ppf(CONFIDENCE, df)
+    return np.mean(X) + critical_value * np.std(X, ddof=1)  # sample std
+
+
+def load_json(fpath):
+    dct = {}
+    if os.path.exists(fpath):
+        with open(fpath, "r") as f:
+            dct = json.load(f)
+    return dct
+
+
 def append_to_json(fpath, test_name, input_kwargs, runtimes):
     header = {"test_name": test_name, "input_kwargs": input_kwargs}
     data = {generate_unique_key(header): {**header, "runtimes": runtimes}}
@@ -124,9 +129,10 @@ def append_to_json(fpath, test_name, input_kwargs, runtimes):


 class compare_to_baseline:
-    def __init__(self, baseline_iters=3, test_iters=1):
+    def __init__(self, baseline_iters=5, test_iters=1):
         self.baseline_iters = baseline_iters
         self.test_iters = test_iters
+        self.baselines = load_json(fpath_baselines)

     def __call__(self, func):
         @wraps(func)  # ensures kwargs exposed to pytest
@@ -139,24 +145,23 @@ def test_wrapper(**kwargs):
             for _ in range(num_iters):
                 runtimes = func(**kwargs)
                 runs.append(runtimes)
-            runtimes = {k: np.mean([d[k] for d in runs]) for k in runs[0]}

+            # the baseline time is taken as the upper bound of the confidence interval,
+            # while the runtimes that we test against the baseline are taken as the mean
+            agg = compute_conf_bounds if NEW_BASELINE else np.mean
+            runtimes = {k: agg([d[k] for d in runs]) for k in runs[0]}
+
             append_to_json(
                 fpath_results, header["test_name"], header["input_kwargs"], runtimes
             )

             if not NEW_BASELINE:
-                assert key in baselines, f"No basline found for {header}"
-                func_baselines = baselines[key]["runtimes"]
+                assert key in self.baselines, f"No basline found for {header}"
+                func_baselines = self.baselines[key]["runtimes"]
                 for key, baseline in func_baselines.items():
-                    diff = (
-                        float("nan")
-                        if np.isclose(baseline, 0)
-                        else (runtimes[key] - baseline) / baseline
-                    )
-                    assert runtimes[key] <= baseline * (
-                        1 + tolerance
-                    ), f"{key} is {diff:.2%} slower than the baseline."
+                    assert (
+                        runtimes[key] <= baseline
+                    ), f"{key} is significantly slower than the baseline at {runtimes[key]:.3f}s vs. {baseline:.3f}s."

         return test_wrapper

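The comment block at the top of tests/test_regression.py spells out the required structure of a runtime test: a function decorated with @compare_to_baseline() that returns a dict mapping each timed part to its runtime, keyed in the baseline file by a hash of the test name and its input kwargs. A minimal sketch of such a test, assuming it lives in tests/test_regression.py next to the compare_to_baseline decorator; the test name, the parametrization, and the part names ("build_time", "run_time") are illustrative and not taken from the PR:

import time

import pytest


# Hypothetical test following the structure described in the module comment; only
# the decorator contract (return a dict of part -> runtime) is taken from the PR.
@pytest.mark.parametrize("num_cells", [10])
@compare_to_baseline(baseline_iters=5)
def test_runtime_build_and_run(num_cells: int) -> dict:
    start = time.time()
    # ... build a network with `num_cells` cells here ...
    build_time = time.time() - start

    start = time.time()
    # ... jit-compile and simulate the network here ...
    run_time = time.time() - start

    # The decorator aggregates these values over its iterations, writes them to
    # regression_test_results.json, and (outside of baseline mode) asserts that
    # each one stays below the stored confidence bound.
    return {"build_time": build_time, "run_time": run_time}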
Loading
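The statistical change at the heart of the PR replaces the old fixed threshold (tolerance = 0.2) with a one-sided confidence bound: when baselines are regenerated, each part is timed baseline_iters times and the stored baseline is the sample mean plus the 95% t-critical value times the sample standard deviation (compute_conf_bounds), while regular test runs compare their mean runtime directly against that bound. A standalone sketch of the arithmetic; compute_conf_bounds is copied from the diff above, but the runtime samples are made up:

import numpy as np
from scipy.stats import t as t_dist

CONFIDENCE = 0.95


def compute_conf_bounds(X):
    # As in the PR: mean plus the one-sided t critical value times the sample std.
    df = len(X) - 1  # degrees of freedom = n - 1
    critical_value = t_dist.ppf(CONFIDENCE, df)
    return np.mean(X) + critical_value * np.std(X, ddof=1)


# Made-up runtimes (in seconds) from five baseline iterations of one test part.
baseline_runs = [0.101, 0.098, 0.105, 0.097, 0.102]
baseline = compute_conf_bounds(baseline_runs)  # this upper bound is stored as the baseline

# A later test run is aggregated with np.mean and must stay below the bound.
new_runs = [0.103]
assert np.mean(new_runs) <= baseline, "significantly slower than the baseline"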