diff --git a/docs/changelog.md b/docs/changelog.md index 508613fab..bd25a544c 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,9 @@ # Changelog +### 2024-12-05 + +- Bugfix to no longer stop the reconciliation when a runner's health check fails. + ### 2024-12-04 - Clean up corresponding OpenStack runner resources when a unit of the charm is removed. diff --git a/github-runner-manager/src-docs/errors.md b/github-runner-manager/src-docs/errors.md index ec864dc62..941b39cd3 100644 --- a/github-runner-manager/src-docs/errors.md +++ b/github-runner-manager/src-docs/errors.md @@ -260,3 +260,14 @@ Base class for all reconcile errors. +--- + + + +## class `OpenstackHealthCheckError` +Base class for all health check errors. + + + + + diff --git a/github-runner-manager/src-docs/manager.cloud_runner_manager.md b/github-runner-manager/src-docs/manager.cloud_runner_manager.md index 4fad301b9..14ab97e8b 100644 --- a/github-runner-manager/src-docs/manager.cloud_runner_manager.md +++ b/github-runner-manager/src-docs/manager.cloud_runner_manager.md @@ -28,7 +28,7 @@ Health state of the runners. --- - + ## class `CloudRunnerState` Represent state of the instance hosting the runner. @@ -51,7 +51,7 @@ Represent state of the instance hosting the runner. --- - + ## class `CloudInitStatus` Represents the state of cloud-init script. @@ -77,7 +77,7 @@ Refer to the official documentation on cloud-init status: https://cloudinit.read --- - + ## class `GitHubRunnerConfig` Configuration for GitHub runner spawned. @@ -107,7 +107,7 @@ __init__(github_path: GitHubOrg | GitHubRepo, labels: list[str]) → None --- - + ## class `SupportServiceConfig` Configuration for supporting services for runners. @@ -144,7 +144,7 @@ __init__( --- - + ## class `CloudRunnerInstance` Information on the runner on the cloud. @@ -181,7 +181,7 @@ __init__( --- - + ## class `CloudRunnerManager` Manage runner instance on cloud. @@ -203,7 +203,7 @@ Get the name prefix of the self-hosted runners. --- - + ### method `cleanup` @@ -223,7 +223,7 @@ Perform health check on runner and delete the runner if it fails. --- - + ### method `create_runner` @@ -241,7 +241,7 @@ Create a self-hosted runner. --- - + ### method `delete_runner` @@ -260,7 +260,7 @@ Delete self-hosted runner. --- - + ### method `flush_runners` @@ -279,7 +279,7 @@ Stop all runners. --- - + ### method `get_runner` @@ -297,7 +297,7 @@ Get a self-hosted runner by instance id. --- - + ### method `get_runners` diff --git a/github-runner-manager/src/github_runner_manager/errors.py b/github-runner-manager/src/github_runner_manager/errors.py index d358ea1d6..3aa229df9 100644 --- a/github-runner-manager/src/github_runner_manager/errors.py +++ b/github-runner-manager/src/github_runner_manager/errors.py @@ -95,3 +95,7 @@ class KeyfileError(SSHError): class ReconcileError(Exception): """Base class for all reconcile errors.""" + + +class OpenstackHealthCheckError(Exception): + """Base class for all health check errors.""" diff --git a/github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py b/github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py index e9d25cd5d..a8d9eacb0 100644 --- a/github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py +++ b/github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py @@ -35,6 +35,20 @@ class HealthState(Enum): UNHEALTHY = auto() UNKNOWN = auto() + @staticmethod + def from_value(health: bool | None) -> "HealthState": + """Create from a health value. + + Args: + health: The health value as boolean or None. + + Returns: + The health state. + """ + if health is None: + return HealthState.UNKNOWN + return HealthState.HEALTHY if health else HealthState.UNHEALTHY + class CloudRunnerState(str, Enum): """Represent state of the instance hosting the runner. diff --git a/github-runner-manager/src/github_runner_manager/openstack_cloud/health_checks.py b/github-runner-manager/src/github_runner_manager/openstack_cloud/health_checks.py index 754a48669..4a241dc51 100644 --- a/github-runner-manager/src/github_runner_manager/openstack_cloud/health_checks.py +++ b/github-runner-manager/src/github_runner_manager/openstack_cloud/health_checks.py @@ -9,7 +9,7 @@ import invoke from fabric import Connection as SSHConnection -from github_runner_manager.errors import KeyfileError, SSHError +from github_runner_manager.errors import KeyfileError, OpenstackHealthCheckError, SSHError from github_runner_manager.manager.cloud_runner_manager import CloudInitStatus, CloudRunnerState from github_runner_manager.openstack_cloud.constants import ( METRICS_EXCHANGE_PATH, @@ -26,6 +26,10 @@ _HealthCheckResult = bool | None # None indicates that the check can not determine health status +class _SSHError(Exception): + """Error on SSH command execution.""" + + def check_runner(openstack_cloud: OpenstackCloud, instance: OpenstackInstance) -> bool: """Run a general health check on a runner instance. @@ -47,9 +51,17 @@ def check_runner(openstack_cloud: OpenstackCloud, instance: OpenstackInstance) - logger.exception( "Health check failed due to unable to find keyfile for %s", instance.server_name ) + # KeyfileError indicates that we'll never be able to ssh into the unit, + # so we mark it as unhealthy. return False - except SSHError: - logger.exception("SSH Failed on %s, marking as unhealthy.") + except _SSHError: + logger.exception( + "Unable to get SSH connection for instance %s, marking as unhealthy.", + instance.server_name, + ) + # We assume that the failure to get the SSH connection is not transient, and mark + # the runner as unhealthy. + # It is debatable whether we should throw an exception here instead. return False return check_active_runner(ssh_conn, instance) @@ -69,23 +81,31 @@ def check_active_runner( the flag, the health check would fail as it checks for running processes which would not be present in this case. + Raises: + OpenstackHealthCheckError: If the health check could not be completed. + Returns: Whether the runner should be considered healthy. """ - if (check_ok := _run_health_check_runner_installed(ssh_conn, instance)) is not None: - return check_ok - - if ( - check_ok := _run_health_check_cloud_init( - ssh_conn, instance.server_name, accept_finished_job - ) - ) is not None: - return check_ok + try: + if (check_ok := _run_health_check_runner_installed(ssh_conn, instance)) is not None: + return check_ok - if ( - check_ok := _run_health_check_runner_processes_running(ssh_conn, instance.server_name) - ) is not None: - return check_ok + if ( + check_ok := _run_health_check_cloud_init( + ssh_conn, instance.server_name, accept_finished_job + ) + ) is not None: + return check_ok + + if ( + check_ok := _run_health_check_runner_processes_running(ssh_conn, instance.server_name) + ) is not None: + return check_ok + except _SSHError as exc: + raise OpenstackHealthCheckError( + "Health check execution failed due to SSH command failure." + ) from exc return True @@ -101,7 +121,7 @@ def _get_ssh_connection( instance: The OpenStack instance to conduit the health check. Raises: - SSHError: Unable to get a SSH connection to the instance. + _SSHError: Unable to get a SSH connection to the instance. Returns: Whether the runner is healthy. @@ -109,11 +129,8 @@ def _get_ssh_connection( try: ssh_conn = openstack_cloud.get_ssh_connection(instance) - except SSHError: - logger.exception( - "SSH connection failure with %s during health check", instance.server_name - ) - raise + except SSHError as exc: + raise _SSHError(f"Unable to get SSH connection to {instance.server_name}") from exc return ssh_conn @@ -170,7 +187,7 @@ def _run_health_check_cloud_init( Returns: Whether the cloud-init status indicates the run is healthy or None. """ - result: invoke.runners.Result = ssh_conn.run("cloud-init status", warn=True, timeout=30) + result: invoke.runners.Result = _execute_ssh_command(ssh_conn, "cloud-init status") if not result.ok: logger.warning("cloud-init status command failed on %s: %s.", server_name, result.stderr) return False @@ -206,8 +223,8 @@ def _run_health_check_runner_installed( If the run can be considered healthy depending on the existence of the runner-installed.timestamp. """ - result = ssh_conn.run( - f"[ -f {METRICS_EXCHANGE_PATH}/runner-installed.timestamp ]", warn=True, timeout=30 + result = _execute_ssh_command( + ssh_conn, f"[ -f {METRICS_EXCHANGE_PATH}/runner-installed.timestamp ]" ) if not result.ok: logger.info( @@ -239,7 +256,7 @@ def _run_health_check_runner_processes_running( Returns: If the run can be considered healthy depending on the existence of the processes. """ - result = ssh_conn.run("ps aux", warn=True, timeout=30) + result = _execute_ssh_command(ssh_conn, "ps aux") if not result.ok: logger.warning("SSH run of `ps aux` failed on %s: %s", server_name, result.stderr) return False @@ -247,3 +264,24 @@ def _run_health_check_runner_processes_running( logger.warning("Runner process not found on %s", server_name) return False return None + + +def _execute_ssh_command(ssh_conn: SSHConnection, command: str) -> invoke.runners.Result: + """Run a command on the remote server. + + Args: + ssh_conn: The SSH connection to the runner. + command: The command to run. + + Returns: + The result of the command. + + Raises: + _SSHError: If the command execution failed. + """ + try: + return ssh_conn.run(command, warn=True, timeout=30) + except invoke.exceptions.CommandTimedOut as exc: + raise _SSHError( + f"SSH command execution timed out for command '{command}' on {ssh_conn.host}" + ) from exc diff --git a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py index 37a3f4fad..964d98ee5 100644 --- a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py +++ b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py @@ -23,6 +23,7 @@ KeyfileError, MissingServerConfigError, OpenStackError, + OpenstackHealthCheckError, RunnerCreateError, RunnerStartError, SSHError, @@ -70,6 +71,8 @@ OUTDATED_METRICS_STORAGE_IN_SECONDS = CREATE_SERVER_TIMEOUT + 30 # add a bit on top of the timeout +HEALTH_CHECK_ERROR_LOG_MSG = "Health check could not be completed for %s" + class _GithubRunnerRemoveError(Exception): """Represents an error while SSH into a runner and running the remove script.""" @@ -124,10 +127,12 @@ class _RunnerHealth: Attributes: healthy: The list of healthy runners. unhealthy: The list of unhealthy runners. + unknown: The list of runners whose health state could not be determined. """ healthy: tuple[OpenstackInstance, ...] unhealthy: tuple[OpenstackInstance, ...] + unknown: tuple[OpenstackInstance, ...] class OpenStackRunnerManager(CloudRunnerManager): @@ -233,15 +238,20 @@ def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance | None: logger.debug( "Runner info fetched, checking health %s %s", instance_id, instance.server_name ) - healthy = health_checks.check_runner( - openstack_cloud=self._openstack_cloud, instance=instance - ) - logger.debug("Runner health check completed %s %s", instance.server_name, healthy) + + try: + healthy = health_checks.check_runner( + openstack_cloud=self._openstack_cloud, instance=instance + ) + logger.debug("Runner health check completed %s %s", instance.server_name, healthy) + except OpenstackHealthCheckError: + logger.exception(HEALTH_CHECK_ERROR_LOG_MSG, instance.server_name) + healthy = None return ( CloudRunnerInstance( name=instance.server_name, instance_id=instance_id, - health=HealthState.HEALTHY if healthy else HealthState.UNHEALTHY, + health=HealthState.from_value(healthy), state=CloudRunnerState.from_openstack_server_status(instance.status), ) if instance is not None @@ -260,27 +270,29 @@ def get_runners( Returns: Information on the runner instances. """ - instance_list = self._openstack_cloud.get_instances() - instance_list = [ - CloudRunnerInstance( - name=instance.server_name, - instance_id=instance.instance_id, - health=( - HealthState.HEALTHY - if health_checks.check_runner( - openstack_cloud=self._openstack_cloud, instance=instance - ) - else HealthState.UNHEALTHY - ), - state=CloudRunnerState.from_openstack_server_status(instance.status), + instances = self._openstack_cloud.get_instances() + runners = [] + for instance in instances: + try: + healthy = health_checks.check_runner( + openstack_cloud=self._openstack_cloud, instance=instance + ) + except OpenstackHealthCheckError: + logger.exception(HEALTH_CHECK_ERROR_LOG_MSG, instance.server_name) + healthy = None + runners.append( + CloudRunnerInstance( + name=instance.server_name, + instance_id=instance.instance_id, + health=HealthState.from_value(healthy), + state=CloudRunnerState.from_openstack_server_status(instance.status), + ) ) - for instance in instance_list - ] if states is None: - return tuple(instance_list) + return tuple(runners) state_set = set(states) - return tuple(instance for instance in instance_list if instance.state in state_set) + return tuple(runner for runner in runners if runner.state in state_set) def delete_runner( self, instance_id: InstanceId, remove_token: str @@ -361,8 +373,10 @@ def cleanup(self, remove_token: str) -> Iterator[runner_metrics.RunnerMetrics]: healthy_runner_names = {runner.server_name for runner in runners.healthy} unhealthy_runner_names = {runner.server_name for runner in runners.unhealthy} + unknown_runner_names = {runner.server_name for runner in runners.unknown} logger.debug("Healthy runners: %s", healthy_runner_names) logger.debug("Unhealthy runners: %s", unhealthy_runner_names) + logger.debug("Unknown health runners: %s", unknown_runner_names) logger.debug("Deleting unhealthy runners.") for runner in runners.unhealthy: @@ -374,33 +388,31 @@ def cleanup(self, remove_token: str) -> Iterator[runner_metrics.RunnerMetrics]: logger.debug("Extracting metrics.") return self._cleanup_extract_metrics( metrics_storage_manager=self._metrics_storage_manager, - healthy_runner_names=healthy_runner_names, - unhealthy_runner_names=unhealthy_runner_names, + ignore_runner_names=healthy_runner_names | unknown_runner_names, + include_runner_names=unhealthy_runner_names, ) @staticmethod def _cleanup_extract_metrics( metrics_storage_manager: StorageManager, - healthy_runner_names: set[str], - unhealthy_runner_names: set[str], + ignore_runner_names: set[str], + include_runner_names: set[str], ) -> Iterator[runner_metrics.RunnerMetrics]: - """Extract metrics for unhealthy runners and dangling metrics storage. + """Extract metrics for certain runners and dangling metrics storage. Args: metrics_storage_manager: The metrics storage manager. - healthy_runner_names: The names of healthy runners. - unhealthy_runner_names: The names of unhealthy runners. + ignore_runner_names: The names of the runners whose metrics should not be extracted. + include_runner_names: The names of the runners whose metrics should be extracted. Returns: - Any metrics retrieved from unhealthy runners and dangling storage. + Any metrics retrieved from the include_runner_names and dangling storage. """ - # We want to extract metrics for unhealthy runners(runners to clean up). - # But there may be runners under construction - # (not marked as healthy and unhealthy because they do not yet exist in OpenStack) - # that should not be cleaned up. + # There may be runners under construction that are not included in the runner_names sets + # because they do not yet exist in OpenStack and that should not be cleaned up. # On the other hand, there could be storage for runners from the past that # should be cleaned up. - all_runner_names = healthy_runner_names | unhealthy_runner_names + all_runner_names = ignore_runner_names | include_runner_names unmatched_metrics_storage = ( ms for ms in metrics_storage_manager.list_all() @@ -414,7 +426,7 @@ def _cleanup_extract_metrics( } return runner_metrics.extract( metrics_storage_manager=metrics_storage_manager, - runners=unhealthy_runner_names | dangling_storage_runner_names, + runners=include_runner_names | dangling_storage_runner_names, include=True, ) @@ -462,13 +474,21 @@ def _get_runners_health(self) -> _RunnerHealth: """ runner_list = self._openstack_cloud.get_instances() - healthy, unhealthy = [], [] + healthy, unhealthy, unknown = [], [], [] for runner in runner_list: - if health_checks.check_runner(openstack_cloud=self._openstack_cloud, instance=runner): - healthy.append(runner) - else: - unhealthy.append(runner) - return _RunnerHealth(healthy=tuple(healthy), unhealthy=tuple(unhealthy)) + try: + if health_checks.check_runner( + openstack_cloud=self._openstack_cloud, instance=runner + ): + healthy.append(runner) + else: + unhealthy.append(runner) + except OpenstackHealthCheckError: + logger.exception(HEALTH_CHECK_ERROR_LOG_MSG, runner.server_name) + unknown.append(runner) + return _RunnerHealth( + healthy=tuple(healthy), unhealthy=tuple(unhealthy), unknown=tuple(unknown) + ) def _generate_cloud_init(self, instance_name: str, registration_token: str) -> str: """Generate cloud init userdata. @@ -668,15 +688,21 @@ def _wait_runner_running(self, instance: OpenstackInstance) -> None: f"Failed to SSH connect to {instance.server_name} openstack runner" ) from err - if not health_checks.check_active_runner( - ssh_conn=ssh_conn, instance=instance, accept_finished_job=True - ): - logger.info("Runner process not found on %s", instance.server_name) + try: + healthy = health_checks.check_active_runner( + ssh_conn=ssh_conn, instance=instance, accept_finished_job=True + ) + except OpenstackHealthCheckError as exc: + raise RunnerStartError( + f"Failed to check health of runner process on {instance.server_name}" + ) from exc + if not healthy: + logger.info("Runner %s not considered healthy", instance.server_name) raise RunnerStartError( - f"Runner process on {instance.server_name} failed to initialize on after starting" + f"Runner {instance.server_name} failed to initialize after starting" ) - logger.info("Runner process found to be healthy on %s", instance.server_name) + logger.info("Runner %s found to be healthy", instance.server_name) @staticmethod def _generate_instance_id() -> InstanceId: diff --git a/github-runner-manager/tests/unit/openstack_cloud/test_openstack_runner_manager.py b/github-runner-manager/tests/unit/openstack_cloud/test_openstack_runner_manager.py index 6dff55ba8..1000ce8d7 100644 --- a/github-runner-manager/tests/unit/openstack_cloud/test_openstack_runner_manager.py +++ b/github-runner-manager/tests/unit/openstack_cloud/test_openstack_runner_manager.py @@ -2,19 +2,66 @@ # See LICENSE file for licensing details. """Module for unit-testing OpenStack runner manager.""" +import secrets from datetime import datetime, timedelta from pathlib import Path from unittest.mock import MagicMock import pytest +from github_runner_manager.errors import OpenstackHealthCheckError +from github_runner_manager.manager.cloud_runner_manager import SupportServiceConfig from github_runner_manager.metrics import runner from github_runner_manager.metrics.storage import MetricsStorage, StorageManager -from github_runner_manager.openstack_cloud import openstack_runner_manager +from github_runner_manager.openstack_cloud import ( + health_checks, + openstack_cloud, + openstack_runner_manager, +) +from github_runner_manager.openstack_cloud.openstack_cloud import OpenstackCloud from github_runner_manager.openstack_cloud.openstack_runner_manager import ( OUTDATED_METRICS_STORAGE_IN_SECONDS, OpenStackRunnerManager, + OpenStackRunnerManagerConfig, ) +from tests.unit.factories import openstack_factory + +OPENSTACK_INSTANCE_PREFIX = "test" + + +@pytest.fixture(name="runner_manager") +def openstack_runner_manager_fixture(monkeypatch: pytest.MonkeyPatch) -> OpenStackRunnerManager: + """Mock required dependencies/configs and return an OpenStackRunnerManager instance.""" + monkeypatch.setattr( + "github_runner_manager.openstack_cloud.openstack_runner_manager.metrics_storage", + MagicMock(), + ) + monkeypatch.setattr( + "github_runner_manager.openstack_cloud.openstack_runner_manager.OpenstackCloud", + MagicMock(), + ) + + service_config_mock = MagicMock(spec=SupportServiceConfig) + service_config_mock.proxy_config = None + config = OpenStackRunnerManagerConfig( + name="test", + prefix="test", + credentials=MagicMock(), + server_config=MagicMock(), + runner_config=MagicMock(), + service_config=service_config_mock, + system_user_config=MagicMock(), + ) + + return OpenStackRunnerManager(config=config) + + +@pytest.fixture(name="runner_metrics_mock") +def runner_metrics_mock_fixture(monkeypatch: pytest.MonkeyPatch) -> MagicMock: + """Mock the runner_metrics module.""" + runner_metrics_mock = MagicMock(spec=runner) + monkeypatch.setattr(openstack_runner_manager, "runner_metrics", runner_metrics_mock) + return runner_metrics_mock @pytest.mark.parametrize( @@ -87,6 +134,7 @@ def test__cleanup_extract_metrics( undecided_runner_storage: set[tuple[str, datetime]], expected_storage_to_be_extracted: set[str], monkeypatch: pytest.MonkeyPatch, + runner_metrics_mock: MagicMock, ): """ arrange: Given different combinations of healthy, unhealthy and undecided runners. @@ -94,8 +142,6 @@ def test__cleanup_extract_metrics( assert: runner_metrics.extract is called with the expected storage to be extracted. """ metric_storage_manager = MagicMock(spec=StorageManager) - runner_metrics_mock = MagicMock(spec=runner) - monkeypatch.setattr(openstack_runner_manager, "runner_metrics", runner_metrics_mock) now = datetime.now() all_runner_name_metrics_storage = [ _create_metrics_storage(runner_name, now) @@ -111,14 +157,69 @@ def test__cleanup_extract_metrics( OpenStackRunnerManager._cleanup_extract_metrics( metrics_storage_manager=metric_storage_manager, - healthy_runner_names=healthy_runner_names, - unhealthy_runner_names=unhealthy_runner_names, + ignore_runner_names=healthy_runner_names, + include_runner_names=unhealthy_runner_names, ) assert runner_metrics_mock.extract.call_count == 1 assert runner_metrics_mock.extract.call_args[1]["runners"] == expected_storage_to_be_extracted +@pytest.mark.parametrize( + "healthy_count, unhealthy_count, unknown_count", + [ + pytest.param(1, 1, 1, id="one of each"), + pytest.param(2, 1, 1, id="two healthy"), + pytest.param(1, 2, 1, id="two unhealthy"), + pytest.param(1, 1, 2, id="two unknown"), + pytest.param(0, 0, 0, id="no runners"), + pytest.param(0, 0, 1, id="one unknown"), + pytest.param(0, 1, 0, id="one unhealthy"), + pytest.param(1, 0, 0, id="one healthy"), + ], +) +def test_cleanup_ignores_runners_with_health_check_errors( + healthy_count: int, + unhealthy_count: int, + unknown_count, + monkeypatch: pytest.MonkeyPatch, + runner_manager: OpenStackRunnerManager, + runner_metrics_mock: MagicMock, +): + """ + arrange: Given a combination of healthy/unhealthy/unknown(with a health check error) runners. + act: When the cleanup method is called. + assert: Only the unhealthy runners are deleted and their metrics are extracted. + """ + names = [ + f"test-{status}{i}" + for status, count in [ + ("healthy", healthy_count), + ("unhealthy", unhealthy_count), + ("unknown", unknown_count), + ] + for i in range(count) + ] + openstack_cloud_mock = _create_openstack_cloud_mock(names) + runner_manager._openstack_cloud = openstack_cloud_mock + health_checks_mock = _create_health_checks_mock() + monkeypatch.setattr( + "github_runner_manager.openstack_cloud.openstack_runner_manager.health_checks", + health_checks_mock, + ) + runner_manager.cleanup(secrets.token_hex(16)) + + assert openstack_cloud_mock.delete_instance.call_count == unhealthy_count + for name in names: + instance_id = name[len(OPENSTACK_INSTANCE_PREFIX) + 1 :] + if instance_id.startswith("unhealthy"): + openstack_cloud_mock.delete_instance.assert_any_call(instance_id) + assert runner_metrics_mock.extract.call_count == 1 + assert runner_metrics_mock.extract.call_args[1]["runners"] == { + names for names in names if names.startswith(f"{OPENSTACK_INSTANCE_PREFIX}-unhealthy") + } + + def _create_metrics_storage(runner_name: str, mtime: datetime) -> MetricsStorage: """ Create a metric storage object with a mocked mtime for the storage path. @@ -136,3 +237,42 @@ def _create_metrics_storage(runner_name: str, mtime: datetime) -> MetricsStorage stat.st_mtime = mtime.timestamp() metrics_storage.path.stat = stat_mock return metrics_storage + + +def _create_openstack_cloud_mock(server_names: list[str]) -> MagicMock: + """Create an OpenstackCloud mock which returns servers with a given list of server names.""" + openstack_cloud_mock = MagicMock(spec=OpenstackCloud) + openstack_cloud_mock.get_instances.return_value = [ + openstack_cloud.OpenstackInstance( + server=openstack_factory.ServerFactory( + status="ACTIVE", + name=name, + ), + prefix=OPENSTACK_INSTANCE_PREFIX, + ) + for name in server_names + ] + return openstack_cloud_mock + + +def _create_health_checks_mock() -> MagicMock: + """Create a health check mock that returns a boolean or raises an error. + + The logic is that if the server name starts with "test-healthy" it returns True, + if it starts with "test-unhealthy" it returns False, and raises an error otherwise. + """ + health_checks_mock = MagicMock(spec=health_checks) + + def _health_checks_side_effect(openstack_cloud, instance): + """Mock side effect for the health_checks.check_runner method. + + This implements the logic mentioned in the docstring above. + """ + if instance.server_name.startswith("test-healthy"): + return True + if instance.server_name.startswith("test-unhealthy"): + return False + raise OpenstackHealthCheckError("Health check failed") + + health_checks_mock.check_runner.side_effect = _health_checks_side_effect + return health_checks_mock