diff --git a/docs/changelog.md b/docs/changelog.md
index 508613fab..bd25a544c 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -1,5 +1,9 @@
# Changelog
+### 2024-12-05
+
+- Bugfix to no longer stop the reconciliation when a runner's health check fails.
+
### 2024-12-04
- Clean up corresponding OpenStack runner resources when a unit of the charm is removed.
diff --git a/github-runner-manager/src-docs/errors.md b/github-runner-manager/src-docs/errors.md
index ec864dc62..941b39cd3 100644
--- a/github-runner-manager/src-docs/errors.md
+++ b/github-runner-manager/src-docs/errors.md
@@ -260,3 +260,14 @@ Base class for all reconcile errors.
+---
+
+
+
+## class `OpenstackHealthCheckError`
+Base class for all health check errors.
+
+
+
+
+
diff --git a/github-runner-manager/src-docs/manager.cloud_runner_manager.md b/github-runner-manager/src-docs/manager.cloud_runner_manager.md
index 4fad301b9..14ab97e8b 100644
--- a/github-runner-manager/src-docs/manager.cloud_runner_manager.md
+++ b/github-runner-manager/src-docs/manager.cloud_runner_manager.md
@@ -28,7 +28,7 @@ Health state of the runners.
---
-
+
## class `CloudRunnerState`
Represent state of the instance hosting the runner.
@@ -51,7 +51,7 @@ Represent state of the instance hosting the runner.
---
-
+
## class `CloudInitStatus`
Represents the state of cloud-init script.
@@ -77,7 +77,7 @@ Refer to the official documentation on cloud-init status: https://cloudinit.read
---
-
+
## class `GitHubRunnerConfig`
Configuration for GitHub runner spawned.
@@ -107,7 +107,7 @@ __init__(github_path: GitHubOrg | GitHubRepo, labels: list[str]) → None
---
-
+
## class `SupportServiceConfig`
Configuration for supporting services for runners.
@@ -144,7 +144,7 @@ __init__(
---
-
+
## class `CloudRunnerInstance`
Information on the runner on the cloud.
@@ -181,7 +181,7 @@ __init__(
---
-
+
## class `CloudRunnerManager`
Manage runner instance on cloud.
@@ -203,7 +203,7 @@ Get the name prefix of the self-hosted runners.
---
-
+
### method `cleanup`
@@ -223,7 +223,7 @@ Perform health check on runner and delete the runner if it fails.
---
-
+
### method `create_runner`
@@ -241,7 +241,7 @@ Create a self-hosted runner.
---
-
+
### method `delete_runner`
@@ -260,7 +260,7 @@ Delete self-hosted runner.
---
-
+
### method `flush_runners`
@@ -279,7 +279,7 @@ Stop all runners.
---
-
+
### method `get_runner`
@@ -297,7 +297,7 @@ Get a self-hosted runner by instance id.
---
-
+
### method `get_runners`
diff --git a/github-runner-manager/src/github_runner_manager/errors.py b/github-runner-manager/src/github_runner_manager/errors.py
index d358ea1d6..3aa229df9 100644
--- a/github-runner-manager/src/github_runner_manager/errors.py
+++ b/github-runner-manager/src/github_runner_manager/errors.py
@@ -95,3 +95,7 @@ class KeyfileError(SSHError):
class ReconcileError(Exception):
"""Base class for all reconcile errors."""
+
+
+class OpenstackHealthCheckError(Exception):
+ """Base class for all health check errors."""
diff --git a/github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py b/github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py
index e9d25cd5d..a8d9eacb0 100644
--- a/github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py
+++ b/github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py
@@ -35,6 +35,20 @@ class HealthState(Enum):
UNHEALTHY = auto()
UNKNOWN = auto()
+ @staticmethod
+ def from_value(health: bool | None) -> "HealthState":
+ """Create from a health value.
+
+ Args:
+ health: The health value as boolean or None.
+
+ Returns:
+ The health state.
+ """
+ if health is None:
+ return HealthState.UNKNOWN
+ return HealthState.HEALTHY if health else HealthState.UNHEALTHY
+
class CloudRunnerState(str, Enum):
"""Represent state of the instance hosting the runner.
diff --git a/github-runner-manager/src/github_runner_manager/openstack_cloud/health_checks.py b/github-runner-manager/src/github_runner_manager/openstack_cloud/health_checks.py
index 754a48669..4a241dc51 100644
--- a/github-runner-manager/src/github_runner_manager/openstack_cloud/health_checks.py
+++ b/github-runner-manager/src/github_runner_manager/openstack_cloud/health_checks.py
@@ -9,7 +9,7 @@
import invoke
from fabric import Connection as SSHConnection
-from github_runner_manager.errors import KeyfileError, SSHError
+from github_runner_manager.errors import KeyfileError, OpenstackHealthCheckError, SSHError
from github_runner_manager.manager.cloud_runner_manager import CloudInitStatus, CloudRunnerState
from github_runner_manager.openstack_cloud.constants import (
METRICS_EXCHANGE_PATH,
@@ -26,6 +26,10 @@
_HealthCheckResult = bool | None # None indicates that the check can not determine health status
+class _SSHError(Exception):
+ """Error on SSH command execution."""
+
+
def check_runner(openstack_cloud: OpenstackCloud, instance: OpenstackInstance) -> bool:
"""Run a general health check on a runner instance.
@@ -47,9 +51,17 @@ def check_runner(openstack_cloud: OpenstackCloud, instance: OpenstackInstance) -
logger.exception(
"Health check failed due to unable to find keyfile for %s", instance.server_name
)
+ # KeyfileError indicates that we'll never be able to ssh into the unit,
+ # so we mark it as unhealthy.
return False
- except SSHError:
- logger.exception("SSH Failed on %s, marking as unhealthy.")
+ except _SSHError:
+ logger.exception(
+ "Unable to get SSH connection for instance %s, marking as unhealthy.",
+ instance.server_name,
+ )
+ # We assume that the failure to get the SSH connection is not transient, and mark
+ # the runner as unhealthy.
+ # It is debatable whether we should throw an exception here instead.
return False
return check_active_runner(ssh_conn, instance)
@@ -69,23 +81,31 @@ def check_active_runner(
the flag, the health check would fail as it checks for running processes
which would not be present in this case.
+ Raises:
+ OpenstackHealthCheckError: If the health check could not be completed.
+
Returns:
Whether the runner should be considered healthy.
"""
- if (check_ok := _run_health_check_runner_installed(ssh_conn, instance)) is not None:
- return check_ok
-
- if (
- check_ok := _run_health_check_cloud_init(
- ssh_conn, instance.server_name, accept_finished_job
- )
- ) is not None:
- return check_ok
+ try:
+ if (check_ok := _run_health_check_runner_installed(ssh_conn, instance)) is not None:
+ return check_ok
- if (
- check_ok := _run_health_check_runner_processes_running(ssh_conn, instance.server_name)
- ) is not None:
- return check_ok
+ if (
+ check_ok := _run_health_check_cloud_init(
+ ssh_conn, instance.server_name, accept_finished_job
+ )
+ ) is not None:
+ return check_ok
+
+ if (
+ check_ok := _run_health_check_runner_processes_running(ssh_conn, instance.server_name)
+ ) is not None:
+ return check_ok
+ except _SSHError as exc:
+ raise OpenstackHealthCheckError(
+ "Health check execution failed due to SSH command failure."
+ ) from exc
return True
@@ -101,7 +121,7 @@ def _get_ssh_connection(
instance: The OpenStack instance to conduit the health check.
Raises:
- SSHError: Unable to get a SSH connection to the instance.
+ _SSHError: Unable to get a SSH connection to the instance.
Returns:
Whether the runner is healthy.
@@ -109,11 +129,8 @@ def _get_ssh_connection(
try:
ssh_conn = openstack_cloud.get_ssh_connection(instance)
- except SSHError:
- logger.exception(
- "SSH connection failure with %s during health check", instance.server_name
- )
- raise
+ except SSHError as exc:
+ raise _SSHError(f"Unable to get SSH connection to {instance.server_name}") from exc
return ssh_conn
@@ -170,7 +187,7 @@ def _run_health_check_cloud_init(
Returns:
Whether the cloud-init status indicates the run is healthy or None.
"""
- result: invoke.runners.Result = ssh_conn.run("cloud-init status", warn=True, timeout=30)
+ result: invoke.runners.Result = _execute_ssh_command(ssh_conn, "cloud-init status")
if not result.ok:
logger.warning("cloud-init status command failed on %s: %s.", server_name, result.stderr)
return False
@@ -206,8 +223,8 @@ def _run_health_check_runner_installed(
If the run can be considered healthy depending on the existence of
the runner-installed.timestamp.
"""
- result = ssh_conn.run(
- f"[ -f {METRICS_EXCHANGE_PATH}/runner-installed.timestamp ]", warn=True, timeout=30
+ result = _execute_ssh_command(
+ ssh_conn, f"[ -f {METRICS_EXCHANGE_PATH}/runner-installed.timestamp ]"
)
if not result.ok:
logger.info(
@@ -239,7 +256,7 @@ def _run_health_check_runner_processes_running(
Returns:
If the run can be considered healthy depending on the existence of the processes.
"""
- result = ssh_conn.run("ps aux", warn=True, timeout=30)
+ result = _execute_ssh_command(ssh_conn, "ps aux")
if not result.ok:
logger.warning("SSH run of `ps aux` failed on %s: %s", server_name, result.stderr)
return False
@@ -247,3 +264,24 @@ def _run_health_check_runner_processes_running(
logger.warning("Runner process not found on %s", server_name)
return False
return None
+
+
+def _execute_ssh_command(ssh_conn: SSHConnection, command: str) -> invoke.runners.Result:
+ """Run a command on the remote server.
+
+ Args:
+ ssh_conn: The SSH connection to the runner.
+ command: The command to run.
+
+ Returns:
+ The result of the command.
+
+ Raises:
+ _SSHError: If the command execution failed.
+ """
+ try:
+ return ssh_conn.run(command, warn=True, timeout=30)
+ except invoke.exceptions.CommandTimedOut as exc:
+ raise _SSHError(
+ f"SSH command execution timed out for command '{command}' on {ssh_conn.host}"
+ ) from exc
diff --git a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py
index 37a3f4fad..964d98ee5 100644
--- a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py
+++ b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py
@@ -23,6 +23,7 @@
KeyfileError,
MissingServerConfigError,
OpenStackError,
+ OpenstackHealthCheckError,
RunnerCreateError,
RunnerStartError,
SSHError,
@@ -70,6 +71,8 @@
OUTDATED_METRICS_STORAGE_IN_SECONDS = CREATE_SERVER_TIMEOUT + 30 # add a bit on top of the timeout
+HEALTH_CHECK_ERROR_LOG_MSG = "Health check could not be completed for %s"
+
class _GithubRunnerRemoveError(Exception):
"""Represents an error while SSH into a runner and running the remove script."""
@@ -124,10 +127,12 @@ class _RunnerHealth:
Attributes:
healthy: The list of healthy runners.
unhealthy: The list of unhealthy runners.
+ unknown: The list of runners whose health state could not be determined.
"""
healthy: tuple[OpenstackInstance, ...]
unhealthy: tuple[OpenstackInstance, ...]
+ unknown: tuple[OpenstackInstance, ...]
class OpenStackRunnerManager(CloudRunnerManager):
@@ -233,15 +238,20 @@ def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance | None:
logger.debug(
"Runner info fetched, checking health %s %s", instance_id, instance.server_name
)
- healthy = health_checks.check_runner(
- openstack_cloud=self._openstack_cloud, instance=instance
- )
- logger.debug("Runner health check completed %s %s", instance.server_name, healthy)
+
+ try:
+ healthy = health_checks.check_runner(
+ openstack_cloud=self._openstack_cloud, instance=instance
+ )
+ logger.debug("Runner health check completed %s %s", instance.server_name, healthy)
+ except OpenstackHealthCheckError:
+ logger.exception(HEALTH_CHECK_ERROR_LOG_MSG, instance.server_name)
+ healthy = None
return (
CloudRunnerInstance(
name=instance.server_name,
instance_id=instance_id,
- health=HealthState.HEALTHY if healthy else HealthState.UNHEALTHY,
+ health=HealthState.from_value(healthy),
state=CloudRunnerState.from_openstack_server_status(instance.status),
)
if instance is not None
@@ -260,27 +270,29 @@ def get_runners(
Returns:
Information on the runner instances.
"""
- instance_list = self._openstack_cloud.get_instances()
- instance_list = [
- CloudRunnerInstance(
- name=instance.server_name,
- instance_id=instance.instance_id,
- health=(
- HealthState.HEALTHY
- if health_checks.check_runner(
- openstack_cloud=self._openstack_cloud, instance=instance
- )
- else HealthState.UNHEALTHY
- ),
- state=CloudRunnerState.from_openstack_server_status(instance.status),
+ instances = self._openstack_cloud.get_instances()
+ runners = []
+ for instance in instances:
+ try:
+ healthy = health_checks.check_runner(
+ openstack_cloud=self._openstack_cloud, instance=instance
+ )
+ except OpenstackHealthCheckError:
+ logger.exception(HEALTH_CHECK_ERROR_LOG_MSG, instance.server_name)
+ healthy = None
+ runners.append(
+ CloudRunnerInstance(
+ name=instance.server_name,
+ instance_id=instance.instance_id,
+ health=HealthState.from_value(healthy),
+ state=CloudRunnerState.from_openstack_server_status(instance.status),
+ )
)
- for instance in instance_list
- ]
if states is None:
- return tuple(instance_list)
+ return tuple(runners)
state_set = set(states)
- return tuple(instance for instance in instance_list if instance.state in state_set)
+ return tuple(runner for runner in runners if runner.state in state_set)
def delete_runner(
self, instance_id: InstanceId, remove_token: str
@@ -361,8 +373,10 @@ def cleanup(self, remove_token: str) -> Iterator[runner_metrics.RunnerMetrics]:
healthy_runner_names = {runner.server_name for runner in runners.healthy}
unhealthy_runner_names = {runner.server_name for runner in runners.unhealthy}
+ unknown_runner_names = {runner.server_name for runner in runners.unknown}
logger.debug("Healthy runners: %s", healthy_runner_names)
logger.debug("Unhealthy runners: %s", unhealthy_runner_names)
+ logger.debug("Unknown health runners: %s", unknown_runner_names)
logger.debug("Deleting unhealthy runners.")
for runner in runners.unhealthy:
@@ -374,33 +388,31 @@ def cleanup(self, remove_token: str) -> Iterator[runner_metrics.RunnerMetrics]:
logger.debug("Extracting metrics.")
return self._cleanup_extract_metrics(
metrics_storage_manager=self._metrics_storage_manager,
- healthy_runner_names=healthy_runner_names,
- unhealthy_runner_names=unhealthy_runner_names,
+ ignore_runner_names=healthy_runner_names | unknown_runner_names,
+ include_runner_names=unhealthy_runner_names,
)
@staticmethod
def _cleanup_extract_metrics(
metrics_storage_manager: StorageManager,
- healthy_runner_names: set[str],
- unhealthy_runner_names: set[str],
+ ignore_runner_names: set[str],
+ include_runner_names: set[str],
) -> Iterator[runner_metrics.RunnerMetrics]:
- """Extract metrics for unhealthy runners and dangling metrics storage.
+ """Extract metrics for certain runners and dangling metrics storage.
Args:
metrics_storage_manager: The metrics storage manager.
- healthy_runner_names: The names of healthy runners.
- unhealthy_runner_names: The names of unhealthy runners.
+ ignore_runner_names: The names of the runners whose metrics should not be extracted.
+ include_runner_names: The names of the runners whose metrics should be extracted.
Returns:
- Any metrics retrieved from unhealthy runners and dangling storage.
+ Any metrics retrieved from the include_runner_names and dangling storage.
"""
- # We want to extract metrics for unhealthy runners(runners to clean up).
- # But there may be runners under construction
- # (not marked as healthy and unhealthy because they do not yet exist in OpenStack)
- # that should not be cleaned up.
+ # There may be runners under construction that are not included in the runner_names sets
+ # because they do not yet exist in OpenStack and that should not be cleaned up.
# On the other hand, there could be storage for runners from the past that
# should be cleaned up.
- all_runner_names = healthy_runner_names | unhealthy_runner_names
+ all_runner_names = ignore_runner_names | include_runner_names
unmatched_metrics_storage = (
ms
for ms in metrics_storage_manager.list_all()
@@ -414,7 +426,7 @@ def _cleanup_extract_metrics(
}
return runner_metrics.extract(
metrics_storage_manager=metrics_storage_manager,
- runners=unhealthy_runner_names | dangling_storage_runner_names,
+ runners=include_runner_names | dangling_storage_runner_names,
include=True,
)
@@ -462,13 +474,21 @@ def _get_runners_health(self) -> _RunnerHealth:
"""
runner_list = self._openstack_cloud.get_instances()
- healthy, unhealthy = [], []
+ healthy, unhealthy, unknown = [], [], []
for runner in runner_list:
- if health_checks.check_runner(openstack_cloud=self._openstack_cloud, instance=runner):
- healthy.append(runner)
- else:
- unhealthy.append(runner)
- return _RunnerHealth(healthy=tuple(healthy), unhealthy=tuple(unhealthy))
+ try:
+ if health_checks.check_runner(
+ openstack_cloud=self._openstack_cloud, instance=runner
+ ):
+ healthy.append(runner)
+ else:
+ unhealthy.append(runner)
+ except OpenstackHealthCheckError:
+ logger.exception(HEALTH_CHECK_ERROR_LOG_MSG, runner.server_name)
+ unknown.append(runner)
+ return _RunnerHealth(
+ healthy=tuple(healthy), unhealthy=tuple(unhealthy), unknown=tuple(unknown)
+ )
def _generate_cloud_init(self, instance_name: str, registration_token: str) -> str:
"""Generate cloud init userdata.
@@ -668,15 +688,21 @@ def _wait_runner_running(self, instance: OpenstackInstance) -> None:
f"Failed to SSH connect to {instance.server_name} openstack runner"
) from err
- if not health_checks.check_active_runner(
- ssh_conn=ssh_conn, instance=instance, accept_finished_job=True
- ):
- logger.info("Runner process not found on %s", instance.server_name)
+ try:
+ healthy = health_checks.check_active_runner(
+ ssh_conn=ssh_conn, instance=instance, accept_finished_job=True
+ )
+ except OpenstackHealthCheckError as exc:
+ raise RunnerStartError(
+ f"Failed to check health of runner process on {instance.server_name}"
+ ) from exc
+ if not healthy:
+ logger.info("Runner %s not considered healthy", instance.server_name)
raise RunnerStartError(
- f"Runner process on {instance.server_name} failed to initialize on after starting"
+ f"Runner {instance.server_name} failed to initialize after starting"
)
- logger.info("Runner process found to be healthy on %s", instance.server_name)
+ logger.info("Runner %s found to be healthy", instance.server_name)
@staticmethod
def _generate_instance_id() -> InstanceId:
diff --git a/github-runner-manager/tests/unit/openstack_cloud/test_openstack_runner_manager.py b/github-runner-manager/tests/unit/openstack_cloud/test_openstack_runner_manager.py
index 6dff55ba8..1000ce8d7 100644
--- a/github-runner-manager/tests/unit/openstack_cloud/test_openstack_runner_manager.py
+++ b/github-runner-manager/tests/unit/openstack_cloud/test_openstack_runner_manager.py
@@ -2,19 +2,66 @@
# See LICENSE file for licensing details.
"""Module for unit-testing OpenStack runner manager."""
+import secrets
from datetime import datetime, timedelta
from pathlib import Path
from unittest.mock import MagicMock
import pytest
+from github_runner_manager.errors import OpenstackHealthCheckError
+from github_runner_manager.manager.cloud_runner_manager import SupportServiceConfig
from github_runner_manager.metrics import runner
from github_runner_manager.metrics.storage import MetricsStorage, StorageManager
-from github_runner_manager.openstack_cloud import openstack_runner_manager
+from github_runner_manager.openstack_cloud import (
+ health_checks,
+ openstack_cloud,
+ openstack_runner_manager,
+)
+from github_runner_manager.openstack_cloud.openstack_cloud import OpenstackCloud
from github_runner_manager.openstack_cloud.openstack_runner_manager import (
OUTDATED_METRICS_STORAGE_IN_SECONDS,
OpenStackRunnerManager,
+ OpenStackRunnerManagerConfig,
)
+from tests.unit.factories import openstack_factory
+
+OPENSTACK_INSTANCE_PREFIX = "test"
+
+
+@pytest.fixture(name="runner_manager")
+def openstack_runner_manager_fixture(monkeypatch: pytest.MonkeyPatch) -> OpenStackRunnerManager:
+ """Mock required dependencies/configs and return an OpenStackRunnerManager instance."""
+ monkeypatch.setattr(
+ "github_runner_manager.openstack_cloud.openstack_runner_manager.metrics_storage",
+ MagicMock(),
+ )
+ monkeypatch.setattr(
+ "github_runner_manager.openstack_cloud.openstack_runner_manager.OpenstackCloud",
+ MagicMock(),
+ )
+
+ service_config_mock = MagicMock(spec=SupportServiceConfig)
+ service_config_mock.proxy_config = None
+ config = OpenStackRunnerManagerConfig(
+ name="test",
+ prefix="test",
+ credentials=MagicMock(),
+ server_config=MagicMock(),
+ runner_config=MagicMock(),
+ service_config=service_config_mock,
+ system_user_config=MagicMock(),
+ )
+
+ return OpenStackRunnerManager(config=config)
+
+
+@pytest.fixture(name="runner_metrics_mock")
+def runner_metrics_mock_fixture(monkeypatch: pytest.MonkeyPatch) -> MagicMock:
+ """Mock the runner_metrics module."""
+ runner_metrics_mock = MagicMock(spec=runner)
+ monkeypatch.setattr(openstack_runner_manager, "runner_metrics", runner_metrics_mock)
+ return runner_metrics_mock
@pytest.mark.parametrize(
@@ -87,6 +134,7 @@ def test__cleanup_extract_metrics(
undecided_runner_storage: set[tuple[str, datetime]],
expected_storage_to_be_extracted: set[str],
monkeypatch: pytest.MonkeyPatch,
+ runner_metrics_mock: MagicMock,
):
"""
arrange: Given different combinations of healthy, unhealthy and undecided runners.
@@ -94,8 +142,6 @@ def test__cleanup_extract_metrics(
assert: runner_metrics.extract is called with the expected storage to be extracted.
"""
metric_storage_manager = MagicMock(spec=StorageManager)
- runner_metrics_mock = MagicMock(spec=runner)
- monkeypatch.setattr(openstack_runner_manager, "runner_metrics", runner_metrics_mock)
now = datetime.now()
all_runner_name_metrics_storage = [
_create_metrics_storage(runner_name, now)
@@ -111,14 +157,69 @@ def test__cleanup_extract_metrics(
OpenStackRunnerManager._cleanup_extract_metrics(
metrics_storage_manager=metric_storage_manager,
- healthy_runner_names=healthy_runner_names,
- unhealthy_runner_names=unhealthy_runner_names,
+ ignore_runner_names=healthy_runner_names,
+ include_runner_names=unhealthy_runner_names,
)
assert runner_metrics_mock.extract.call_count == 1
assert runner_metrics_mock.extract.call_args[1]["runners"] == expected_storage_to_be_extracted
+@pytest.mark.parametrize(
+ "healthy_count, unhealthy_count, unknown_count",
+ [
+ pytest.param(1, 1, 1, id="one of each"),
+ pytest.param(2, 1, 1, id="two healthy"),
+ pytest.param(1, 2, 1, id="two unhealthy"),
+ pytest.param(1, 1, 2, id="two unknown"),
+ pytest.param(0, 0, 0, id="no runners"),
+ pytest.param(0, 0, 1, id="one unknown"),
+ pytest.param(0, 1, 0, id="one unhealthy"),
+ pytest.param(1, 0, 0, id="one healthy"),
+ ],
+)
+def test_cleanup_ignores_runners_with_health_check_errors(
+ healthy_count: int,
+ unhealthy_count: int,
+ unknown_count,
+ monkeypatch: pytest.MonkeyPatch,
+ runner_manager: OpenStackRunnerManager,
+ runner_metrics_mock: MagicMock,
+):
+ """
+ arrange: Given a combination of healthy/unhealthy/unknown(with a health check error) runners.
+ act: When the cleanup method is called.
+ assert: Only the unhealthy runners are deleted and their metrics are extracted.
+ """
+ names = [
+ f"test-{status}{i}"
+ for status, count in [
+ ("healthy", healthy_count),
+ ("unhealthy", unhealthy_count),
+ ("unknown", unknown_count),
+ ]
+ for i in range(count)
+ ]
+ openstack_cloud_mock = _create_openstack_cloud_mock(names)
+ runner_manager._openstack_cloud = openstack_cloud_mock
+ health_checks_mock = _create_health_checks_mock()
+ monkeypatch.setattr(
+ "github_runner_manager.openstack_cloud.openstack_runner_manager.health_checks",
+ health_checks_mock,
+ )
+ runner_manager.cleanup(secrets.token_hex(16))
+
+ assert openstack_cloud_mock.delete_instance.call_count == unhealthy_count
+ for name in names:
+ instance_id = name[len(OPENSTACK_INSTANCE_PREFIX) + 1 :]
+ if instance_id.startswith("unhealthy"):
+ openstack_cloud_mock.delete_instance.assert_any_call(instance_id)
+ assert runner_metrics_mock.extract.call_count == 1
+ assert runner_metrics_mock.extract.call_args[1]["runners"] == {
+ names for names in names if names.startswith(f"{OPENSTACK_INSTANCE_PREFIX}-unhealthy")
+ }
+
+
def _create_metrics_storage(runner_name: str, mtime: datetime) -> MetricsStorage:
"""
Create a metric storage object with a mocked mtime for the storage path.
@@ -136,3 +237,42 @@ def _create_metrics_storage(runner_name: str, mtime: datetime) -> MetricsStorage
stat.st_mtime = mtime.timestamp()
metrics_storage.path.stat = stat_mock
return metrics_storage
+
+
+def _create_openstack_cloud_mock(server_names: list[str]) -> MagicMock:
+ """Create an OpenstackCloud mock which returns servers with a given list of server names."""
+ openstack_cloud_mock = MagicMock(spec=OpenstackCloud)
+ openstack_cloud_mock.get_instances.return_value = [
+ openstack_cloud.OpenstackInstance(
+ server=openstack_factory.ServerFactory(
+ status="ACTIVE",
+ name=name,
+ ),
+ prefix=OPENSTACK_INSTANCE_PREFIX,
+ )
+ for name in server_names
+ ]
+ return openstack_cloud_mock
+
+
+def _create_health_checks_mock() -> MagicMock:
+ """Create a health check mock that returns a boolean or raises an error.
+
+ The logic is that if the server name starts with "test-healthy" it returns True,
+ if it starts with "test-unhealthy" it returns False, and raises an error otherwise.
+ """
+ health_checks_mock = MagicMock(spec=health_checks)
+
+ def _health_checks_side_effect(openstack_cloud, instance):
+ """Mock side effect for the health_checks.check_runner method.
+
+ This implements the logic mentioned in the docstring above.
+ """
+ if instance.server_name.startswith("test-healthy"):
+ return True
+ if instance.server_name.startswith("test-unhealthy"):
+ return False
+ raise OpenstackHealthCheckError("Health check failed")
+
+ health_checks_mock.check_runner.side_effect = _health_checks_side_effect
+ return health_checks_mock