Skip to content

Commit

Permalink
fix: Continue reconciliation after a health check execution error (#415)
Browse files Browse the repository at this point in the history
* catch health check error and continue reconciliation

* catch health check error and continue reconciliation

* rename to unknown

* add metric storage check to unit test

* make unit test more readable

* refactor _cleanup_extract_metrics signature

* use constant for health check error log msg

* change health check exc handling

* use _SSHError in get_ssh_connection

* add changelog

* add explanatory comments
  • Loading branch information
cbartz authored Dec 6, 2024
1 parent 20b6513 commit e477fb8
Show file tree
Hide file tree
Showing 8 changed files with 328 additions and 91 deletions.
4 changes: 4 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Changelog

### 2024-12-05

- Bugfix to no longer stop the reconciliation when a runner's health check fails.

### 2024-12-04

- Clean up corresponding OpenStack runner resources when a unit of the charm is removed.
Expand Down
11 changes: 11 additions & 0 deletions github-runner-manager/src-docs/errors.md
Original file line number Diff line number Diff line change
Expand Up @@ -260,3 +260,14 @@ Base class for all reconcile errors.



---

<a href="../../github-runner-manager/src/github_runner_manager/errors.py#L100"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

## <kbd>class</kbd> `OpenstackHealthCheckError`
Base class for all health check errors.





24 changes: 12 additions & 12 deletions github-runner-manager/src-docs/manager.cloud_runner_manager.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ Health state of the runners.

---

<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L39"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>
<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L53"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

## <kbd>class</kbd> `CloudRunnerState`
Represent state of the instance hosting the runner.
Expand All @@ -51,7 +51,7 @@ Represent state of the instance hosting the runner.

---

<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L97"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>
<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L111"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

## <kbd>class</kbd> `CloudInitStatus`
Represents the state of cloud-init script.
Expand All @@ -77,7 +77,7 @@ Refer to the official documentation on cloud-init status: https://cloudinit.read

---

<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L124"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>
<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L138"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

## <kbd>class</kbd> `GitHubRunnerConfig`
Configuration for GitHub runner spawned.
Expand Down Expand Up @@ -107,7 +107,7 @@ __init__(github_path: GitHubOrg | GitHubRepo, labels: list[str]) → None

---

<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L137"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>
<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L151"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

## <kbd>class</kbd> `SupportServiceConfig`
Configuration for supporting services for runners.
Expand Down Expand Up @@ -144,7 +144,7 @@ __init__(

---

<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L154"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>
<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L168"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

## <kbd>class</kbd> `CloudRunnerInstance`
Information on the runner on the cloud.
Expand Down Expand Up @@ -181,7 +181,7 @@ __init__(

---

<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L171"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>
<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L185"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

## <kbd>class</kbd> `CloudRunnerManager`
Manage runner instance on cloud.
Expand All @@ -203,7 +203,7 @@ Get the name prefix of the self-hosted runners.

---

<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L227"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>
<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L241"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

### <kbd>method</kbd> `cleanup`

Expand All @@ -223,7 +223,7 @@ Perform health check on runner and delete the runner if it fails.

---

<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L183"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>
<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L197"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

### <kbd>method</kbd> `create_runner`

Expand All @@ -241,7 +241,7 @@ Create a self-hosted runner.

---

<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L208"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>
<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L222"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

### <kbd>method</kbd> `delete_runner`

Expand All @@ -260,7 +260,7 @@ Delete self-hosted runner.

---

<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L217"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>
<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L231"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

### <kbd>method</kbd> `flush_runners`

Expand All @@ -279,7 +279,7 @@ Stop all runners.

---

<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L191"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>
<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L205"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

### <kbd>method</kbd> `get_runner`

Expand All @@ -297,7 +297,7 @@ Get a self-hosted runner by instance id.

---

<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L199"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>
<a href="../../github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py#L213"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

### <kbd>method</kbd> `get_runners`

Expand Down
4 changes: 4 additions & 0 deletions github-runner-manager/src/github_runner_manager/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,7 @@ class KeyfileError(SSHError):

class ReconcileError(Exception):
"""Base class for all reconcile errors."""


class OpenstackHealthCheckError(Exception):
"""Base class for all health check errors."""
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,20 @@ class HealthState(Enum):
UNHEALTHY = auto()
UNKNOWN = auto()

@staticmethod
def from_value(health: bool | None) -> "HealthState":
"""Create from a health value.
Args:
health: The health value as boolean or None.
Returns:
The health state.
"""
if health is None:
return HealthState.UNKNOWN
return HealthState.HEALTHY if health else HealthState.UNHEALTHY


class CloudRunnerState(str, Enum):
"""Represent state of the instance hosting the runner.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import invoke
from fabric import Connection as SSHConnection

from github_runner_manager.errors import KeyfileError, SSHError
from github_runner_manager.errors import KeyfileError, OpenstackHealthCheckError, SSHError
from github_runner_manager.manager.cloud_runner_manager import CloudInitStatus, CloudRunnerState
from github_runner_manager.openstack_cloud.constants import (
METRICS_EXCHANGE_PATH,
Expand All @@ -26,6 +26,10 @@
_HealthCheckResult = bool | None # None indicates that the check can not determine health status


class _SSHError(Exception):
"""Error on SSH command execution."""


def check_runner(openstack_cloud: OpenstackCloud, instance: OpenstackInstance) -> bool:
"""Run a general health check on a runner instance.
Expand All @@ -47,9 +51,17 @@ def check_runner(openstack_cloud: OpenstackCloud, instance: OpenstackInstance) -
logger.exception(
"Health check failed due to unable to find keyfile for %s", instance.server_name
)
# KeyfileError indicates that we'll never be able to ssh into the unit,
# so we mark it as unhealthy.
return False
except SSHError:
logger.exception("SSH Failed on %s, marking as unhealthy.")
except _SSHError:
logger.exception(
"Unable to get SSH connection for instance %s, marking as unhealthy.",
instance.server_name,
)
# We assume that the failure to get the SSH connection is not transient, and mark
# the runner as unhealthy.
# It is debatable whether we should throw an exception here instead.
return False

return check_active_runner(ssh_conn, instance)
Expand All @@ -69,23 +81,31 @@ def check_active_runner(
the flag, the health check would fail as it checks for running processes
which would not be present in this case.
Raises:
OpenstackHealthCheckError: If the health check could not be completed.
Returns:
Whether the runner should be considered healthy.
"""
if (check_ok := _run_health_check_runner_installed(ssh_conn, instance)) is not None:
return check_ok

if (
check_ok := _run_health_check_cloud_init(
ssh_conn, instance.server_name, accept_finished_job
)
) is not None:
return check_ok
try:
if (check_ok := _run_health_check_runner_installed(ssh_conn, instance)) is not None:
return check_ok

if (
check_ok := _run_health_check_runner_processes_running(ssh_conn, instance.server_name)
) is not None:
return check_ok
if (
check_ok := _run_health_check_cloud_init(
ssh_conn, instance.server_name, accept_finished_job
)
) is not None:
return check_ok

if (
check_ok := _run_health_check_runner_processes_running(ssh_conn, instance.server_name)
) is not None:
return check_ok
except _SSHError as exc:
raise OpenstackHealthCheckError(
"Health check execution failed due to SSH command failure."
) from exc

return True

Expand All @@ -101,19 +121,16 @@ def _get_ssh_connection(
instance: The OpenStack instance to conduit the health check.
Raises:
SSHError: Unable to get a SSH connection to the instance.
_SSHError: Unable to get a SSH connection to the instance.
Returns:
Whether the runner is healthy.
"""
try:
ssh_conn = openstack_cloud.get_ssh_connection(instance)

except SSHError:
logger.exception(
"SSH connection failure with %s during health check", instance.server_name
)
raise
except SSHError as exc:
raise _SSHError(f"Unable to get SSH connection to {instance.server_name}") from exc
return ssh_conn


Expand Down Expand Up @@ -170,7 +187,7 @@ def _run_health_check_cloud_init(
Returns:
Whether the cloud-init status indicates the run is healthy or None.
"""
result: invoke.runners.Result = ssh_conn.run("cloud-init status", warn=True, timeout=30)
result: invoke.runners.Result = _execute_ssh_command(ssh_conn, "cloud-init status")
if not result.ok:
logger.warning("cloud-init status command failed on %s: %s.", server_name, result.stderr)
return False
Expand Down Expand Up @@ -206,8 +223,8 @@ def _run_health_check_runner_installed(
If the run can be considered healthy depending on the existence of
the runner-installed.timestamp.
"""
result = ssh_conn.run(
f"[ -f {METRICS_EXCHANGE_PATH}/runner-installed.timestamp ]", warn=True, timeout=30
result = _execute_ssh_command(
ssh_conn, f"[ -f {METRICS_EXCHANGE_PATH}/runner-installed.timestamp ]"
)
if not result.ok:
logger.info(
Expand Down Expand Up @@ -239,11 +256,32 @@ def _run_health_check_runner_processes_running(
Returns:
If the run can be considered healthy depending on the existence of the processes.
"""
result = ssh_conn.run("ps aux", warn=True, timeout=30)
result = _execute_ssh_command(ssh_conn, "ps aux")
if not result.ok:
logger.warning("SSH run of `ps aux` failed on %s: %s", server_name, result.stderr)
return False
if RUNNER_WORKER_PROCESS not in result.stdout and RUNNER_LISTENER_PROCESS not in result.stdout:
logger.warning("Runner process not found on %s", server_name)
return False
return None


def _execute_ssh_command(ssh_conn: SSHConnection, command: str) -> invoke.runners.Result:
"""Run a command on the remote server.
Args:
ssh_conn: The SSH connection to the runner.
command: The command to run.
Returns:
The result of the command.
Raises:
_SSHError: If the command execution failed.
"""
try:
return ssh_conn.run(command, warn=True, timeout=30)
except invoke.exceptions.CommandTimedOut as exc:
raise _SSHError(
f"SSH command execution timed out for command '{command}' on {ssh_conn.host}"
) from exc
Loading

0 comments on commit e477fb8

Please sign in to comment.