From dddd1b7a7fbd5a7ce47000ac1a5d6e1ce75a535c Mon Sep 17 00:00:00 2001
From: jnsbck <65561470+jnsbck@users.noreply.github.com>
Date: Tue, 3 Dec 2024 18:58:08 +0100
Subject: [PATCH] Confidence bounds for regression tests (#540)

* enh: add t-test to regression tests.

* fix: fix wrong type of baseline env variable and better printouts

* fix: small fixes

* fix: change command to update baseline

* fix: conf region changed to <=
---
 .../workflows/update_regression_baseline.yml |  4 +-
 .gitignore                                   |  1 +
 tests/conftest.py                            | 16 +++--
 tests/test_regression.py                     | 67 ++++++++++---------
 4 files changed, 48 insertions(+), 40 deletions(-)

diff --git a/.github/workflows/update_regression_baseline.yml b/.github/workflows/update_regression_baseline.yml
index c2d6bf65..4190da6c 100644
--- a/.github/workflows/update_regression_baseline.yml
+++ b/.github/workflows/update_regression_baseline.yml
@@ -12,8 +12,8 @@ jobs:
   update_regression_tests:
     name: update_regression_tests
     runs-on: ubuntu-20.04
-    # Trigger from a comment that contains '/update_regression_baselines'
-    if: github.event.issue.pull_request && contains(github.event.comment.body, '/update_regression_baselines')
+    # Trigger from a comment that contains '/update_regression_baseline'
+    if: github.event.issue.pull_request && contains(github.event.comment.body, '/update_regression_baseline')
     # workflow needs permissions to write to the PR
     permissions:
       contents: write
diff --git a/.gitignore b/.gitignore
index d5638eb6..789570e4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -57,6 +57,7 @@ coverage.xml
 .pytest_cache/
 tests/regression_test_results.json
 tests/regression_test_baselines.json
+tests/regression_test_report.txt
 
 # Translations
 *.mo
diff --git a/tests/conftest.py b/tests/conftest.py
index afe41e30..fd503c01 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -209,7 +209,9 @@ def get_or_compute_swc2jaxley_params(
 @pytest.fixture(scope="session", autouse=True)
 def print_session_report(request, pytestconfig):
     """Cleanup a testing directory once we are finished."""
-    NEW_BASELINE = os.environ["NEW_BASELINE"] if "NEW_BASELINE" in os.environ else 0
+    NEW_BASELINE = (
+        int(os.environ["NEW_BASELINE"]) if "NEW_BASELINE" in os.environ else 0
+    )
 
     dirname = os.path.dirname(__file__)
     baseline_fname = os.path.join(dirname, "regression_test_baselines.json")
@@ -220,11 +222,10 @@ def print_session_report(request, pytestconfig):
     ]
 
     def update_baseline():
-        if NEW_BASELINE:
-            results = load_json(results_fname)
-            with open(baseline_fname, "w") as f:
-                json.dump(results, f, indent=2)
-            os.remove(results_fname)
+        results = load_json(results_fname)
+        with open(baseline_fname, "w") as f:
+            json.dump(results, f, indent=2)
+        os.remove(results_fname)
 
     def print_regression_report():
         baselines = load_json(baseline_fname)
@@ -243,5 +244,6 @@ def print_regression_report():
         print(report)
 
     if len(collected_regression_tests) > 0:
-        request.addfinalizer(update_baseline)
+        if NEW_BASELINE:
+            request.addfinalizer(update_baseline)
         request.addfinalizer(print_regression_report)
diff --git a/tests/test_regression.py b/tests/test_regression.py
index a1ccafe1..66dca95e 100644
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -10,12 +10,15 @@
 import numpy as np
 import pytest
 from jax import jit
+from scipy.stats import t as t_dist
 
 import jaxley as jx
 from jaxley.channels import HH
 from jaxley.connect import sparse_connect
 from jaxley.synapses import IonotropicSynapse
 
+pytestmark = pytest.mark.regression  # mark all tests as regression tests in this file
+
 # Every runtime test needs to have the following structure:
 #
 # @compare_to_baseline()
@@ -38,24 +41,12 @@
 # takes into account the input_kwargs of the test, the name of the test and the runtimes
 # of each part.
 
+NEW_BASELINE = int(os.environ["NEW_BASELINE"]) if "NEW_BASELINE" in os.environ else 0
+CONFIDENCE = 0.95
 
-def load_json(fpath):
-    dct = {}
-    if os.path.exists(fpath):
-        with open(fpath, "r") as f:
-            dct = json.load(f)
-    return dct
-
-
-pytestmark = pytest.mark.regression  # mark all tests as regression tests in this file
-NEW_BASELINE = os.environ["NEW_BASELINE"] if "NEW_BASELINE" in os.environ else 0
 dirname = os.path.dirname(__file__)
 fpath_baselines = os.path.join(dirname, "regression_test_baselines.json")
 fpath_results = os.path.join(dirname, "regression_test_results.json")
-
-tolerance = 0.2
-
-baselines = load_json(fpath_baselines)
 with open(fpath_results, "w") as f:  # clear previous results
     f.write("{}")
 
@@ -83,14 +74,14 @@ def generate_regression_report(base_results, new_results):
         diff = None if base_time is None else ((new_time - base_time) / base_time)
 
         status = ""
-        if diff is None:
+        if base_time is None:
             status = "🆕"
-        elif diff > tolerance:
+        elif new_time <= base_time:
+            status = "🟢" if diff is not None and diff < -0.05 else "🟠"
+        elif new_time > base_time:
             status = "🔴"
-        elif diff < 0:
-            status = "🟢"
         else:
-            status = "⚪"
+            status = "❌"  # This should never happen.
 
         time_str = (
             f"({new_time:.3f}s)"
@@ -111,6 +102,20 @@ def generate_unique_key(d):
     return str(hash)
 
 
+def compute_conf_bounds(X):
+    df = len(X) - 1  # degrees of freedom = n-1
+    critical_value = t_dist.ppf(CONFIDENCE, df)
+    return np.mean(X) + critical_value * np.std(X, ddof=1)  # sample std
+
+
+def load_json(fpath):
+    dct = {}
+    if os.path.exists(fpath):
+        with open(fpath, "r") as f:
+            dct = json.load(f)
+    return dct
+
+
 def append_to_json(fpath, test_name, input_kwargs, runtimes):
     header = {"test_name": test_name, "input_kwargs": input_kwargs}
     data = {generate_unique_key(header): {**header, "runtimes": runtimes}}
@@ -124,9 +129,10 @@
 
 
 class compare_to_baseline:
-    def __init__(self, baseline_iters=3, test_iters=1):
+    def __init__(self, baseline_iters=5, test_iters=1):
         self.baseline_iters = baseline_iters
         self.test_iters = test_iters
+        self.baselines = load_json(fpath_baselines)
 
     def __call__(self, func):
         @wraps(func)  # ensures kwargs exposed to pytest
@@ -139,24 +145,23 @@ def test_wrapper(**kwargs):
             for _ in range(num_iters):
                 runtimes = func(**kwargs)
                 runs.append(runtimes)
-            runtimes = {k: np.mean([d[k] for d in runs]) for k in runs[0]}
+
+            # the baseline time is taken as the upper bound of the confidence interval,
+            # while the runtimes that we test against the baseline are taken as the mean
+            agg = compute_conf_bounds if NEW_BASELINE else np.mean
+            runtimes = {k: agg([d[k] for d in runs]) for k in runs[0]}
 
             append_to_json(
                 fpath_results, header["test_name"], header["input_kwargs"], runtimes
             )
 
             if not NEW_BASELINE:
-                assert key in baselines, f"No basline found for {header}"
-                func_baselines = baselines[key]["runtimes"]
+                assert key in self.baselines, f"No baseline found for {header}"
+                func_baselines = self.baselines[key]["runtimes"]
                 for key, baseline in func_baselines.items():
-                    diff = (
-                        float("nan")
-                        if np.isclose(baseline, 0)
-                        else (runtimes[key] - baseline) / baseline
-                    )
-                    assert runtimes[key] <= baseline * (
-                        1 + tolerance
-                    ), f"{key} is {diff:.2%} slower than the baseline."
+                    assert (
+                        runtimes[key] <= baseline
+                    ), f"{key} is significantly slower than the baseline at {runtimes[key]:.3f}s vs. {baseline:.3f}s."
 
         return test_wrapper
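
Note (illustrative sketch, not part of the diff above): with NEW_BASELINE set, the patch stores
mean(X) + t_{0.95, n-1} * std(X, ddof=1) over the baseline runs as the new baseline (what the
in-line comment calls the upper bound of the confidence interval), and a later test run fails
when the mean of its runtimes exceeds that stored value. The snippet below replays this logic
outside of pytest; the timing values and the names `upper_conf_bound`, `baseline_runs`, and
`new_runs` are hypothetical and only mirror what `compute_conf_bounds` and the final assert in
`test_wrapper` do.

    import numpy as np
    from scipy.stats import t as t_dist

    CONFIDENCE = 0.95

    def upper_conf_bound(samples):
        # One-sided bound: sample mean plus t critical value times sample std.
        df = len(samples) - 1
        t_crit = t_dist.ppf(CONFIDENCE, df)
        return np.mean(samples) + t_crit * np.std(samples, ddof=1)

    baseline_runs = [1.02, 0.98, 1.05, 1.01, 0.99]  # runtimes (s) of 5 baseline iterations
    new_runs = [1.03]                               # runtimes (s) of 1 test iteration

    baseline = upper_conf_bound(baseline_runs)  # value a NEW_BASELINE run would store
    assert np.mean(new_runs) <= baseline, "runtime regression"

Raising CONFIDENCE (e.g. to 0.99) widens the bound, which makes the check less prone to flaky
failures at the cost of missing smaller slowdowns.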