Merge pull request #4043 from jobh/flaky-exceptiongroup
Make Flaky into an ExceptionGroup
Zac-HD authored Jul 13, 2024
2 parents 272185c + b99e702 commit 317958d
Showing 13 changed files with 189 additions and 59 deletions.
1 change: 1 addition & 0 deletions hypothesis-python/.coveragerc
@@ -34,3 +34,4 @@ exclude_lines =
if "[\w\.]+" in sys\.modules:
if .+ := sys\.modules\.get\("[\w\.]+"\)
@overload
if .*\bnot .*provider.avoid_realization
4 changes: 4 additions & 0 deletions hypothesis-python/RELEASE.rst
@@ -0,0 +1,4 @@
RELEASE_TYPE: minor

This patch changes most Flaky errors to use an ExceptionGroup, which
makes the representation of these errors easier to understand.
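
As a rough sketch of what the new representation means for callers (the test body below is illustrative and not part of this diff), a test that fails only on its first invocation should now surface as a FlakyFailure, an ExceptionGroup whose exceptions attribute carries the error(s) seen on the failing run(s):

import pytest

from hypothesis import given, strategies as st
from hypothesis.errors import FlakyFailure

first_call = [True]

@given(st.integers())
def test_fails_only_once(i):
    if first_call[0]:
        first_call[0] = False
        raise AssertionError("only raised on the very first call")

with pytest.raises(FlakyFailure) as excinfo:
    test_fails_only_once()

# The group should contain the AssertionError from the failing first run.
print([type(e).__name__ for e in excinfo.value.exceptions])
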
57 changes: 42 additions & 15 deletions hypothesis-python/src/hypothesis/core.py
@@ -58,7 +58,8 @@
DeadlineExceeded,
DidNotReproduce,
FailedHealthCheck,
Flaky,
FlakyFailure,
FlakyReplay,
Found,
HypothesisException,
HypothesisWarning,
@@ -999,12 +1000,27 @@ def run(data):
)
else:
report("Failed to reproduce exception. Expected: \n" + traceback)
raise Flaky(
raise FlakyFailure(
f"Hypothesis {text_repr} produces unreliable results: "
"Falsified on the first call but did not on a subsequent one"
) from exception
"Falsified on the first call but did not on a subsequent one",
[exception],
)
return result

def _flaky_replay_to_failure(
self, err: FlakyReplay, context: BaseException
) -> FlakyFailure:
interesting_examples = [
self._runner.interesting_examples[io]
for io in err._interesting_origins
if io
]
exceptions = [
ie.extra_information._expected_exception for ie in interesting_examples
]
exceptions.append(context) # the offending assume (or whatever)
return FlakyFailure(err.reason, exceptions)

def _execute_once_for_engine(self, data: ConjectureData) -> None:
"""Wrapper around ``execute_once`` that intercepts test failure
exceptions and single-test control exceptions, and turns them into
@@ -1037,7 +1053,12 @@ def _execute_once_for_engine(self, data: ConjectureData) -> None:
except UnsatisfiedAssumption as e:
# An "assume" check failed, so instead we inform the engine that
# this test run was invalid.
data.mark_invalid(e.reason)
try:
data.mark_invalid(e.reason)
except FlakyReplay as err:
# This was unexpected, meaning that the assume was flaky.
# Report it as such.
raise self._flaky_replay_to_failure(err, e) from None
except StopTest:
# The engine knows how to handle this control exception, so it's
# OK to re-raise it.
@@ -1217,22 +1238,28 @@ def run_engine(self):
info._expected_traceback,
),
)
except StopTest:
err = Flaky(
"Inconsistent results: An example which failed on the "
"first run now succeeds (or fails with another error)."
)
except StopTest as e:
# Link the expected exception from the first run. Not sure
# how to access the current exception, if it failed
# differently on this run.
err.__cause__ = err.__context__ = info._expected_exception
# differently on this run. In fact, in the only known
# reproducer, the StopTest is caused by OVERRUN before the
# test is even executed. Possibly because all initial examples
# failed until the final non-traced replay, and something was
# exhausted? Possibly a FIXME, but sufficiently weird to
# ignore for now.
err = FlakyFailure(
"Inconsistent results: An example failed on the "
"first run but now succeeds (or fails with another "
"error, or is for some reason not runnable).",
[info._expected_exception or e], # (note: e is a BaseException)
)
errors_to_report.append((fragments, err))
except UnsatisfiedAssumption as e: # pragma: no cover # ironically flaky
err = Flaky(
err = FlakyFailure(
"Unreliable assumption: An example which satisfied "
"assumptions on the first run now fails it."
"assumptions on the first run now fails it.",
[e],
)
err.__cause__ = err.__context__ = e
errors_to_report.append((fragments, err))
except BaseException as e:
# If we have anything for explain-mode, this is the time to report.
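
To make the new _flaky_replay_to_failure helper concrete, here is a small standalone sketch (with made-up exception values, not taken from this diff) of the bundling it performs: the exceptions expected from the earlier interesting runs and the exception that triggered the flaky replay are combined into a single FlakyFailure group.

from hypothesis.errors import FlakyFailure

# Stand-ins for what the helper collects: the failure recorded on the first
# run, plus the offending assume (or whatever) raised during the replay.
expected_from_first_run = AssertionError("failure recorded on the first run")
offending_context = ValueError("raised on the replay instead")

err = FlakyFailure(
    "Inconsistent results from replaying a test case!",
    [expected_from_first_run, offending_context],
)
print([type(e).__name__ for e in err.exceptions])  # ['AssertionError', 'ValueError']
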
50 changes: 49 additions & 1 deletion hypothesis-python/src/hypothesis/errors.py
@@ -8,6 +8,8 @@
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

from hypothesis.internal.compat import ExceptionGroup


class HypothesisException(Exception):
"""Generic parent class for exceptions thrown by Hypothesis."""
@@ -52,9 +54,42 @@ class Unsatisfiable(_Trimmable):


class Flaky(_Trimmable):
"""Base class for indeterministic failures. Usually one of the more
specific subclasses (FlakyFailure or FlakyStrategyDefinition) is raised."""


class FlakyReplay(Flaky):
"""Internal error raised by the conjecture engine if flaky failures are
detected during replay.
Carries information allowing the runner to reconstruct the flakiness as
a FlakyFailure exception group for final presentation.
"""

def __init__(self, reason, interesting_origins=None):
super().__init__(reason)
self.reason = reason
self._interesting_origins = interesting_origins


class FlakyStrategyDefinition(Flaky):
"""This function appears to cause inconsistent data generation.
Common causes for this problem are:
1. The strategy depends on external state. e.g. it uses an external
random number generator. Try to make a version that passes all the
relevant state in from Hypothesis.
"""


class _WrappedBaseException(Exception):
"""Used internally for wrapping BaseExceptions as components of FlakyFailure."""


class FlakyFailure(ExceptionGroup, Flaky):
"""This function appears to fail non-deterministically: We have seen it
fail when passed this example at least once, but a subsequent invocation
did not fail.
did not fail, or caused a distinct error.
Common causes for this problem are:
1. The function depends on external state. e.g. it uses an external
@@ -67,6 +102,19 @@ class Flaky(_Trimmable):
don't do that and testing those instead.
"""

def __new__(cls, msg, group):
# The Exception mixin forces this to be an ExceptionGroup (only accepting
# Exceptions, not BaseException). Usually BaseException is raised
# directly and will hence not be part of a FlakyFailure, but I'm not
# sure this assumption holds everywhere. So wrap any BaseExceptions.
group = list(group)
for i, exc in enumerate(group):
if not isinstance(exc, Exception):
err = _WrappedBaseException()
err.__cause__ = err.__context__ = exc
group[i] = err
return ExceptionGroup.__new__(cls, msg, group)


class InvalidArgument(_Trimmable, TypeError):
"""Used to indicate that the arguments to a Hypothesis function were in
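
A brief sketch of the hierarchy introduced above (the assertions below are editorial illustration, not part of this diff): FlakyFailure and FlakyStrategyDefinition both derive from Flaky, so handlers written against the old Flaky class still catch them, and any BaseException passed into the group is wrapped so that the ExceptionGroup invariant of Exception-only members holds.

from hypothesis.errors import Flaky, FlakyFailure, FlakyStrategyDefinition

assert issubclass(FlakyFailure, Flaky)
assert issubclass(FlakyStrategyDefinition, Flaky)

# A KeyboardInterrupt (a BaseException) is wrapped rather than stored raw,
# but stays reachable through __cause__ on the wrapper.
err = FlakyFailure("unreliable results", [ValueError("boom"), KeyboardInterrupt()])
wrapper = err.exceptions[1]
assert isinstance(wrapper, Exception)
assert isinstance(wrapper.__cause__, KeyboardInterrupt)

try:
    raise err
except Flaky:
    pass  # pre-existing `except Flaky` handlers still catch the new group type
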
29 changes: 19 additions & 10 deletions hypothesis-python/src/hypothesis/internal/charmap.py
@@ -19,6 +19,7 @@
from typing import Dict, Tuple

from hypothesis.configuration import storage_directory
from hypothesis.control import _current_build_context
from hypothesis.errors import InvalidArgument
from hypothesis.internal.intervalsets import IntervalSet

@@ -214,10 +215,14 @@ def _query_for_key(key):
>>> _query_for_key(('Zl', 'Zp', 'Co'))
((8232, 8233), (57344, 63743), (983040, 1048573), (1048576, 1114109))
"""
try:
return category_index_cache[key]
except KeyError:
pass
context = _current_build_context.value
if context is None or not context.data.provider.avoid_realization:
try:
return category_index_cache[key]
except KeyError:
pass
elif not key: # pragma: no cover # only on alternative backends
return ()
assert key
if set(key) == set(categories()):
result = IntervalSet([(0, sys.maxunicode)])
Expand All @@ -226,7 +231,8 @@ def _query_for_key(key):
IntervalSet(charmap()[key[-1]])
)
assert isinstance(result, IntervalSet)
category_index_cache[key] = result.intervals
if context is None or not context.data.provider.avoid_realization:
category_index_cache[key] = result.intervals
return result.intervals


@@ -268,15 +274,18 @@ def query(
character_intervals.intervals,
exclude_intervals.intervals,
)
try:
return limited_category_index_cache[qkey]
except KeyError:
pass
context = _current_build_context.value
if context is None or not context.data.provider.avoid_realization:
try:
return limited_category_index_cache[qkey]
except KeyError:
pass
base = _query_for_key(catkey)
result = []
for u, v in base:
if v >= min_codepoint and u <= max_codepoint:
result.append((max(u, min_codepoint), min(v, max_codepoint)))
result = (IntervalSet(result) | character_intervals) - exclude_intervals
limited_category_index_cache[qkey] = result
if context is None or not context.data.provider.avoid_realization:
limited_category_index_cache[qkey] = result
return result
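
The charmap changes above all follow one pattern: skip both the read and the write of the module-level caches whenever the active provider sets avoid_realization (as on alternative backends), so backend-specific values are never realized just to populate a global cache. Below is a self-contained sketch of that guard with illustrative names, not the real charmap internals.

from dataclasses import dataclass

@dataclass
class Provider:
    avoid_realization: bool = False

_cache = {}

def cached_query(key, provider=None):
    # Only consult or populate the cache when realization is acceptable.
    use_cache = provider is None or not provider.avoid_realization
    if use_cache and key in _cache:
        return _cache[key]
    result = len(key)  # stand-in for the expensive interval computation
    if use_cache:
        _cache[key] = result
    return result

print(cached_query(("Lu", "Ll")))  # computed once, then served from the cache
print(cached_query(("Lu", "Ll"), Provider(avoid_realization=True)))  # bypasses the cache
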
20 changes: 14 additions & 6 deletions hypothesis-python/src/hypothesis/internal/conjecture/datatree.py
@@ -14,7 +14,12 @@

import attr

from hypothesis.errors import Flaky, HypothesisException, StopTest
from hypothesis.errors import (
FlakyReplay,
FlakyStrategyDefinition,
HypothesisException,
StopTest,
)
from hypothesis.internal import floats as flt
from hypothesis.internal.compat import int_to_bytes
from hypothesis.internal.conjecture.data import (
@@ -45,7 +50,7 @@ class PreviouslyUnseenBehaviour(HypothesisException):


def inconsistent_generation():
raise Flaky(
raise FlakyStrategyDefinition(
"Inconsistent data generation! Data generation behaved differently "
"between different runs. Is your data generation depending on external "
"state?"
@@ -469,7 +474,7 @@ def split_at(self, i):
Splits the tree so that it can incorporate a decision at the draw call
corresponding to the node at position i.
Raises Flaky if node i was forced.
Raises FlakyStrategyDefinition if node i was forced.
"""

if i in self.forced:
@@ -1142,10 +1147,13 @@ def conclude_test(self, status, interesting_origin):
node.transition.status != Status.INTERESTING
or new_transition.status != Status.VALID
):
raise Flaky(
old_origin = node.transition.interesting_origin
new_origin = new_transition.interesting_origin
raise FlakyReplay(
f"Inconsistent results from replaying a test case!\n"
f" last: {node.transition.status.name} from {node.transition.interesting_origin}\n"
f" this: {new_transition.status.name} from {new_transition.interesting_origin}"
f" last: {node.transition.status.name} from {old_origin}\n"
f" this: {new_transition.status.name} from {new_origin}",
(old_origin, new_origin),
)
else:
node.transition = new_transition
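
For context on the renamed error, here is a hedged user-level sketch (not taken from this diff) of the failure mode FlakyStrategyDefinition reports: a strategy whose draw structure depends on external state can generate differently when Hypothesis replays an earlier choice sequence, which the datatree detects as inconsistent generation.

from hypothesis import given, strategies as st

calls = [0]

@st.composite
def erratic(draw):
    calls[0] += 1
    # The number of draws depends on external state, so replays of the same
    # choices can take a different shape.
    return [draw(st.integers()) for _ in range(calls[0] % 3)]

@given(erratic())
def test_with_erratic_strategy(xs):
    pass

# Calling test_with_erratic_strategy() may raise FlakyStrategyDefinition
# ("Inconsistent data generation!") once a replay diverges from the recorded tree.
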
30 changes: 21 additions & 9 deletions hypothesis-python/src/hypothesis/internal/conjecture/engine.py
@@ -10,6 +10,7 @@

import importlib
import math
import textwrap
import time
from collections import defaultdict
from contextlib import contextmanager
@@ -39,7 +40,12 @@
from hypothesis import HealthCheck, Phase, Verbosity, settings as Settings
from hypothesis._settings import local_settings
from hypothesis.database import ExampleDatabase
from hypothesis.errors import Flaky, HypothesisException, InvalidArgument, StopTest
from hypothesis.errors import (
FlakyReplay,
HypothesisException,
InvalidArgument,
StopTest,
)
from hypothesis.internal.cache import LRUReusedCache
from hypothesis.internal.compat import (
NotRequired,
@@ -297,7 +303,7 @@ def __tree_is_exhausted(self) -> bool:

def __stoppable_test_function(self, data: ConjectureData) -> None:
"""Run ``self._test_function``, but convert a ``StopTest`` exception
into a normal return and avoid raising Flaky for RecursionErrors.
into a normal return and avoid raising anything flaky for RecursionErrors.
"""
# We ensure that the test has this much stack space remaining, no
# matter the size of the stack when called, to de-flake RecursionErrors
@@ -528,18 +534,24 @@ def test_function(self, data: ConjectureData) -> None:
# drive the ir tree through the test function to convert it
# to a buffer
initial_origin = data.interesting_origin
initial_traceback = data.extra_information._expected_traceback # type: ignore
data = ConjectureData.for_ir_tree(data.examples.ir_tree_nodes)
self.__stoppable_test_function(data)
data.freeze()
# we'd like to use expected_failure machinery here from
# StateForActualGivenExecution for better diagnostic reports of eg
# flaky deadlines, but we're too low down in the engine for that.
# for now a worse generic flaky error will have to do.
# TODO: Convert to FlakyFailure on the way out. Should same-origin
# also be checked?
if data.status != Status.INTERESTING:
raise Flaky(
desc_new_status = {
data.status.VALID: "passed",
data.status.INVALID: "failed filters",
data.status.OVERRUN: "overran",
}[data.status]
wrapped_tb = textwrap.indent(initial_traceback, " | ")
raise FlakyReplay(
f"Inconsistent results from replaying a failing test case!\n"
f" last: {Status.INTERESTING.name} from {initial_origin}\n"
f" this: {data.status.name}"
f"{wrapped_tb}on backend={self.settings.backend!r} but "
f"{desc_new_status} under backend='hypothesis'",
interesting_origins=[initial_origin],
)

self._cache(data)
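
As a tiny standalone illustration of the textwrap.indent call imported above (the exact prefix used in the source may differ from what is shown here), each line of the captured traceback is prefixed so that it reads as an indented block inside the FlakyReplay message.

import textwrap

tb = "Traceback (most recent call last):\n  ...\nAssertionError: boom\n"
print(textwrap.indent(tb, "  | "))
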
4 changes: 2 additions & 2 deletions hypothesis-python/tests/conjecture/test_engine.py
@@ -26,7 +26,7 @@
strategies as st,
)
from hypothesis.database import ExampleDatabase, InMemoryExampleDatabase
from hypothesis.errors import FailedHealthCheck, Flaky
from hypothesis.errors import FailedHealthCheck, FlakyStrategyDefinition
from hypothesis.internal.compat import bit_count, int_from_bytes
from hypothesis.internal.conjecture import engine as engine_module
from hypothesis.internal.conjecture.data import ConjectureData, IRNode, Overrun, Status
@@ -338,7 +338,7 @@ def test(data):
def test_erratic_draws():
n = [0]

with pytest.raises(Flaky):
with pytest.raises(FlakyStrategyDefinition):

@run_to_buffer
def x(data):
6 changes: 3 additions & 3 deletions hypothesis-python/tests/cover/test_deadline.py
@@ -14,7 +14,7 @@
import pytest

from hypothesis import given, settings, strategies as st
from hypothesis.errors import DeadlineExceeded, Flaky, InvalidArgument
from hypothesis.errors import DeadlineExceeded, FlakyFailure, InvalidArgument

from tests.common.utils import assert_falsifying_output, fails_with

@@ -56,7 +56,7 @@ def test_flaky_slow(i):
once[0] = False
time.sleep(1)

with pytest.raises(Flaky):
with pytest.raises(FlakyFailure):
test_flaky_slow()


@@ -110,7 +110,7 @@ def slow_once(i):
once[0] = False
time.sleep(0.2)

with pytest.raises(Flaky) as err:
with pytest.raises(FlakyFailure) as err:
slow_once()
assert "Unreliable test timing" in "\n".join(err.value.__notes__)
assert "took 2" in "\n".join(err.value.__notes__)