Merge pull request #4043 from jobh/flaky-exceptiongroup
Make Flaky into an ExceptionGroup
Zac-HD authored Jul 13, 2024
2 parents 272185c + b99e702 commit 317958d
Showing 13 changed files with 189 additions and 59 deletions.
1 change: 1 addition & 0 deletions hypothesis-python/.coveragerc
@@ -34,3 +34,4 @@ exclude_lines =
if "[\w\.]+" in sys\.modules:
if .+ := sys\.modules\.get\("[\w\.]+"\)
@overload
if .*\bnot .*provider.avoid_realization
4 changes: 4 additions & 0 deletions hypothesis-python/RELEASE.rst
@@ -0,0 +1,4 @@
RELEASE_TYPE: minor

This patch changes most Flaky errors to use an ExceptionGroup, which
makes the representation of these errors easier to understand.
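
As a rough sketch of what the new representation means for callers (the test body below is illustrative and not part of this diff), a test that fails only on its first invocation should now surface as a FlakyFailure, an ExceptionGroup whose exceptions attribute carries the error(s) seen on the failing run(s):

import pytest

from hypothesis import given, strategies as st
from hypothesis.errors import FlakyFailure

first_call = [True]

@given(st.integers())
def test_fails_only_once(i):
    if first_call[0]:
        first_call[0] = False
        raise AssertionError("only raised on the very first call")

with pytest.raises(FlakyFailure) as excinfo:
    test_fails_only_once()

# The group should contain the AssertionError from the failing first run.
print([type(e).__name__ for e in excinfo.value.exceptions])
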
57 changes: 42 additions & 15 deletions hypothesis-python/src/hypothesis/core.py
@@ -58,7 +58,8 @@
DeadlineExceeded,
DidNotReproduce,
FailedHealthCheck,
Flaky,
FlakyFailure,
FlakyReplay,
Found,
HypothesisException,
HypothesisWarning,
@@ -999,12 +1000,27 @@ def run(data):
)
else:
report("Failed to reproduce exception. Expected: \n" + traceback)
raise Flaky(
raise FlakyFailure(
f"Hypothesis {text_repr} produces unreliable results: "
"Falsified on the first call but did not on a subsequent one"
) from exception
"Falsified on the first call but did not on a subsequent one",
[exception],
)
return result

def _flaky_replay_to_failure(
self, err: FlakyReplay, context: BaseException
) -> FlakyFailure:
interesting_examples = [
self._runner.interesting_examples[io]
for io in err._interesting_origins
if io
]
exceptions = [
ie.extra_information._expected_exception for ie in interesting_examples
]
exceptions.append(context) # the offending assume (or whatever)
return FlakyFailure(err.reason, exceptions)

def _execute_once_for_engine(self, data: ConjectureData) -> None:
"""Wrapper around ``execute_once`` that intercepts test failure
exceptions and single-test control exceptions, and turns them into
@@ -1037,7 +1053,12 @@ def _execute_once_for_engine(self, data: ConjectureData) -> None:
except UnsatisfiedAssumption as e:
# An "assume" check failed, so instead we inform the engine that
# this test run was invalid.
data.mark_invalid(e.reason)
try:
data.mark_invalid(e.reason)
except FlakyReplay as err:
# This was unexpected, meaning that the assume was flaky.
# Report it as such.
raise self._flaky_replay_to_failure(err, e) from None
except StopTest:
# The engine knows how to handle this control exception, so it's
# OK to re-raise it.
@@ -1217,22 +1238,28 @@ def run_engine(self):
info._expected_traceback,
),
)
except StopTest:
err = Flaky(
"Inconsistent results: An example which failed on the "
"first run now succeeds (or fails with another error)."
)
except StopTest as e:
# Link the expected exception from the first run. Not sure
# how to access the current exception, if it failed
# differently on this run.
err.__cause__ = err.__context__ = info._expected_exception
# differently on this run. In fact, in the only known
# reproducer, the StopTest is caused by OVERRUN before the
# test is even executed. Possibly because all initial examples
# failed until the final non-traced replay, and something was
# exhausted? Possibly a FIXME, but sufficiently weird to
# ignore for now.
err = FlakyFailure(
"Inconsistent results: An example failed on the "
"first run but now succeeds (or fails with another "
"error, or is for some reason not runnable).",
[info._expected_exception or e], # (note: e is a BaseException)
)
errors_to_report.append((fragments, err))
except UnsatisfiedAssumption as e: # pragma: no cover # ironically flaky
err = Flaky(
err = FlakyFailure(
"Unreliable assumption: An example which satisfied "
"assumptions on the first run now fails it."
"assumptions on the first run now fails it.",
[e],
)
err.__cause__ = err.__context__ = e
errors_to_report.append((fragments, err))
except BaseException as e:
# If we have anything for explain-mode, this is the time to report.
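
To make the new _flaky_replay_to_failure helper concrete, here is a small standalone sketch (with made-up exception values, not taken from this diff) of the bundling it performs: the exceptions expected from the earlier interesting runs and the exception that triggered the flaky replay are combined into a single FlakyFailure group.

from hypothesis.errors import FlakyFailure

# Stand-ins for what the helper collects: the failure recorded on the first
# run, plus the offending assume (or whatever) raised during the replay.
expected_from_first_run = AssertionError("failure recorded on the first run")
offending_context = ValueError("raised on the replay instead")

err = FlakyFailure(
    "Inconsistent results from replaying a test case!",
    [expected_from_first_run, offending_context],
)
print([type(e).__name__ for e in err.exceptions])  # ['AssertionError', 'ValueError']
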
50 changes: 49 additions & 1 deletion hypothesis-python/src/hypothesis/errors.py
@@ -8,6 +8,8 @@
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

from hypothesis.internal.compat import ExceptionGroup


class HypothesisException(Exception):
"""Generic parent class for exceptions thrown by Hypothesis."""
@@ -52,9 +54,42 @@ class Unsatisfiable(_Trimmable):


class Flaky(_Trimmable):
"""Base class for indeterministic failures. Usually one of the more
specific subclasses (FlakyFailure or FlakyStrategyDefinition) is raised."""


class FlakyReplay(Flaky):
"""Internal error raised by the conjecture engine if flaky failures are
detected during replay.
Carries information allowing the runner to reconstruct the flakiness as
a FlakyFailure exception group for final presentation.
"""

def __init__(self, reason, interesting_origins=None):
super().__init__(reason)
self.reason = reason
self._interesting_origins = interesting_origins


class FlakyStrategyDefinition(Flaky):
"""This function appears to cause inconsistent data generation.
Common causes for this problem are:
1. The strategy depends on external state. e.g. it uses an external
random number generator. Try to make a version that passes all the
relevant state in from Hypothesis.
"""


class _WrappedBaseException(Exception):
"""Used internally for wrapping BaseExceptions as components of FlakyFailure."""


class FlakyFailure(ExceptionGroup, Flaky):
"""This function appears to fail non-deterministically: We have seen it
fail when passed this example at least once, but a subsequent invocation
did not fail.
did not fail, or caused a distinct error.
Common causes for this problem are:
1. The function depends on external state. e.g. it uses an external
@@ -67,6 +102,19 @@ class Flaky(_Trimmable):
don't do that and testing those instead.
"""

def __new__(cls, msg, group):
# The Exception mixin forces this to be an ExceptionGroup (only accepting
# Exceptions, not BaseException). Usually BaseException is raised
# directly and will hence not be part of a FlakyFailure, but I'm not
# sure this assumption holds everywhere. So wrap any BaseExceptions.
group = list(group)
for i, exc in enumerate(group):
if not isinstance(exc, Exception):
err = _WrappedBaseException()
err.__cause__ = err.__context__ = exc
group[i] = err
return ExceptionGroup.__new__(cls, msg, group)


class InvalidArgument(_Trimmable, TypeError):
"""Used to indicate that the arguments to a Hypothesis function were in
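
A brief sketch of the hierarchy introduced above (the assertions below are editorial illustration, not part of this diff): FlakyFailure and FlakyStrategyDefinition both derive from Flaky, so handlers written against the old Flaky class still catch them, and any BaseException passed into the group is wrapped so that the ExceptionGroup invariant of Exception-only members holds.

from hypothesis.errors import Flaky, FlakyFailure, FlakyStrategyDefinition

assert issubclass(FlakyFailure, Flaky)
assert issubclass(FlakyStrategyDefinition, Flaky)

# A KeyboardInterrupt (a BaseException) is wrapped rather than stored raw,
# but stays reachable through __cause__ on the wrapper.
err = FlakyFailure("unreliable results", [ValueError("boom"), KeyboardInterrupt()])
wrapper = err.exceptions[1]
assert isinstance(wrapper, Exception)
assert isinstance(wrapper.__cause__, KeyboardInterrupt)

try:
    raise err
except Flaky:
    pass  # pre-existing `except Flaky` handlers still catch the new group type
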
29 changes: 19 additions & 10 deletions hypothesis-python/src/hypothesis/internal/charmap.py
@@ -19,6 +19,7 @@
from typing import Dict, Tuple

from hypothesis.configuration import storage_directory
from hypothesis.control import _current_build_context
from hypothesis.errors import InvalidArgument
from hypothesis.internal.intervalsets import IntervalSet

@@ -214,10 +215,14 @@ def _query_for_key(key):
>>> _query_for_key(('Zl', 'Zp', 'Co'))
((8232, 8233), (57344, 63743), (983040, 1048573), (1048576, 1114109))
"""
try:
return category_index_cache[key]
except KeyError:
pass
context = _current_build_context.value
if context is None or not context.data.provider.avoid_realization:
try:
return category_index_cache[key]
except KeyError:
pass
elif not key: # pragma: no cover # only on alternative backends
return ()
assert key
if set(key) == set(categories()):
result = IntervalSet([(0, sys.maxunicode)])
Expand All @@ -226,7 +231,8 @@ def _query_for_key(key):
IntervalSet(charmap()[key[-1]])
)
assert isinstance(result, IntervalSet)
category_index_cache[key] = result.intervals
if context is None or not context.data.provider.avoid_realization:
category_index_cache[key] = result.intervals
return result.intervals


@@ -268,15 +274,18 @@ def query(
character_intervals.intervals,
exclude_intervals.intervals,
)
try:
return limited_category_index_cache[qkey]
except KeyError:
pass
context = _current_build_context.value
if context is None or not context.data.provider.avoid_realization:
try:
return limited_category_index_cache[qkey]
except KeyError:
pass
base = _query_for_key(catkey)
result = []
for u, v in base:
if v >= min_codepoint and u <= max_codepoint:
result.append((max(u, min_codepoint), min(v, max_codepoint)))
result = (IntervalSet(result) | character_intervals) - exclude_intervals
limited_category_index_cache[qkey] = result
if context is None or not context.data.provider.avoid_realization:
limited_category_index_cache[qkey] = result
return result
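
The charmap changes above all follow one pattern: skip both the read and the write of the module-level caches whenever the active provider sets avoid_realization (as on alternative backends), so backend-specific values are never realized just to populate a global cache. Below is a self-contained sketch of that guard with illustrative names, not the real charmap internals.

from dataclasses import dataclass

@dataclass
class Provider:
    avoid_realization: bool = False

_cache = {}

def cached_query(key, provider=None):
    # Only consult or populate the cache when realization is acceptable.
    use_cache = provider is None or not provider.avoid_realization
    if use_cache and key in _cache:
        return _cache[key]
    result = len(key)  # stand-in for the expensive interval computation
    if use_cache:
        _cache[key] = result
    return result

print(cached_query(("Lu", "Ll")))  # computed once, then served from the cache
print(cached_query(("Lu", "Ll"), Provider(avoid_realization=True)))  # bypasses the cache
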
20 changes: 14 additions & 6 deletions hypothesis-python/src/hypothesis/internal/conjecture/datatree.py
@@ -14,7 +14,12 @@

import attr

from hypothesis.errors import Flaky, HypothesisException, StopTest
from hypothesis.errors import (
FlakyReplay,
FlakyStrategyDefinition,
HypothesisException,
StopTest,
)
from hypothesis.internal import floats as flt
from hypothesis.internal.compat import int_to_bytes
from hypothesis.internal.conjecture.data import (
@@ -45,7 +50,7 @@ class PreviouslyUnseenBehaviour(HypothesisException):


def inconsistent_generation():
raise Flaky(
raise FlakyStrategyDefinition(
"Inconsistent data generation! Data generation behaved differently "
"between different runs. Is your data generation depending on external "
"state?"
@@ -469,7 +474,7 @@ def split_at(self, i):
Splits the tree so that it can incorporate a decision at the draw call
corresponding to the node at position i.
Raises Flaky if node i was forced.
Raises FlakyStrategyDefinition if node i was forced.
"""

if i in self.forced:
@@ -1142,10 +1147,13 @@ def conclude_test(self, status, interesting_origin):
node.transition.status != Status.INTERESTING
or new_transition.status != Status.VALID
):
raise Flaky(
old_origin = node.transition.interesting_origin
new_origin = new_transition.interesting_origin
raise FlakyReplay(
f"Inconsistent results from replaying a test case!\n"
f" last: {node.transition.status.name} from {node.transition.interesting_origin}\n"
f" this: {new_transition.status.name} from {new_transition.interesting_origin}"
f" last: {node.transition.status.name} from {old_origin}\n"
f" this: {new_transition.status.name} from {new_origin}",
(old_origin, new_origin),
)
else:
node.transition = new_transition
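
For context on the renamed error, here is a hedged user-level sketch (not taken from this diff) of the failure mode FlakyStrategyDefinition reports: a strategy whose draw structure depends on external state can generate differently when Hypothesis replays an earlier choice sequence, which the datatree detects as inconsistent generation.

from hypothesis import given, strategies as st

calls = [0]

@st.composite
def erratic(draw):
    calls[0] += 1
    # The number of draws depends on external state, so replays of the same
    # choices can take a different shape.
    return [draw(st.integers()) for _ in range(calls[0] % 3)]

@given(erratic())
def test_with_erratic_strategy(xs):
    pass

# Calling test_with_erratic_strategy() may raise FlakyStrategyDefinition
# ("Inconsistent data generation!") once a replay diverges from the recorded tree.
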
30 changes: 21 additions & 9 deletions hypothesis-python/src/hypothesis/internal/conjecture/engine.py
@@ -10,6 +10,7 @@

import importlib
import math
import textwrap
import time
from collections import defaultdict
from contextlib import contextmanager
@@ -39,7 +40,12 @@
from hypothesis import HealthCheck, Phase, Verbosity, settings as Settings
from hypothesis._settings import local_settings
from hypothesis.database import ExampleDatabase
from hypothesis.errors import Flaky, HypothesisException, InvalidArgument, StopTest
from hypothesis.errors import (
FlakyReplay,
HypothesisException,
InvalidArgument,
StopTest,
)
from hypothesis.internal.cache import LRUReusedCache
from hypothesis.internal.compat import (
NotRequired,
@@ -297,7 +303,7 @@ def __tree_is_exhausted(self) -> bool:

def __stoppable_test_function(self, data: ConjectureData) -> None:
"""Run ``self._test_function``, but convert a ``StopTest`` exception
into a normal return and avoid raising Flaky for RecursionErrors.
into a normal return and avoid raising anything flaky for RecursionErrors.
"""
# We ensure that the test has this much stack space remaining, no
# matter the size of the stack when called, to de-flake RecursionErrors
@@ -528,18 +534,24 @@ def test_function(self, data: ConjectureData) -> None:
# drive the ir tree through the test function to convert it
# to a buffer
initial_origin = data.interesting_origin
initial_traceback = data.extra_information._expected_traceback # type: ignore
data = ConjectureData.for_ir_tree(data.examples.ir_tree_nodes)
self.__stoppable_test_function(data)
data.freeze()
# we'd like to use expected_failure machinery here from
# StateForActualGivenExecution for better diagnostic reports of eg
# flaky deadlines, but we're too low down in the engine for that.
# for now a worse generic flaky error will have to do.
# TODO: Convert to FlakyFailure on the way out. Should same-origin
# also be checked?
if data.status != Status.INTERESTING:
raise Flaky(
desc_new_status = {
data.status.VALID: "passed",
data.status.INVALID: "failed filters",
data.status.OVERRUN: "overran",
}[data.status]
wrapped_tb = textwrap.indent(initial_traceback, " | ")
raise FlakyReplay(
f"Inconsistent results from replaying a failing test case!\n"
f" last: {Status.INTERESTING.name} from {initial_origin}\n"
f" this: {data.status.name}"
f"{wrapped_tb}on backend={self.settings.backend!r} but "
f"{desc_new_status} under backend='hypothesis'",
interesting_origins=[initial_origin],
)

self._cache(data)
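
As a tiny standalone illustration of the textwrap.indent call imported above (the exact prefix used in the source may differ from what is shown here), each line of the captured traceback is prefixed so that it reads as an indented block inside the FlakyReplay message.

import textwrap

tb = "Traceback (most recent call last):\n  ...\nAssertionError: boom\n"
print(textwrap.indent(tb, "  | "))
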
4 changes: 2 additions & 2 deletions hypothesis-python/tests/conjecture/test_engine.py
@@ -26,7 +26,7 @@
strategies as st,
)
from hypothesis.database import ExampleDatabase, InMemoryExampleDatabase
from hypothesis.errors import FailedHealthCheck, Flaky
from hypothesis.errors import FailedHealthCheck, FlakyStrategyDefinition
from hypothesis.internal.compat import bit_count, int_from_bytes
from hypothesis.internal.conjecture import engine as engine_module
from hypothesis.internal.conjecture.data import ConjectureData, IRNode, Overrun, Status
@@ -338,7 +338,7 @@ def test(data):
def test_erratic_draws():
n = [0]

with pytest.raises(Flaky):
with pytest.raises(FlakyStrategyDefinition):

@run_to_buffer
def x(data):
6 changes: 3 additions & 3 deletions hypothesis-python/tests/cover/test_deadline.py
@@ -14,7 +14,7 @@
import pytest

from hypothesis import given, settings, strategies as st
from hypothesis.errors import DeadlineExceeded, Flaky, InvalidArgument
from hypothesis.errors import DeadlineExceeded, FlakyFailure, InvalidArgument

from tests.common.utils import assert_falsifying_output, fails_with

@@ -56,7 +56,7 @@ def test_flaky_slow(i):
once[0] = False
time.sleep(1)

with pytest.raises(Flaky):
with pytest.raises(FlakyFailure):
test_flaky_slow()


@@ -110,7 +110,7 @@ def slow_once(i):
once[0] = False
time.sleep(0.2)

with pytest.raises(Flaky) as err:
with pytest.raises(FlakyFailure) as err:
slow_once()
assert "Unreliable test timing" in "\n".join(err.value.__notes__)
assert "took 2" in "\n".join(err.value.__notes__)