diff --git a/charms/knative-operator/requirements-integration.txt b/charms/knative-operator/requirements-integration.txt
index 8023576d..b1ca327d 100644
--- a/charms/knative-operator/requirements-integration.txt
+++ b/charms/knative-operator/requirements-integration.txt
@@ -241,6 +241,8 @@ sniffio==1.3.0
     # httpx
 stack-data==0.6.2
     # via ipython
+tenacity==8.2.2
+    # via -r ./requirements.txt
 theblues==0.5.2
     # via juju
 toml==0.10.2
diff --git a/charms/knative-operator/requirements-unit.txt b/charms/knative-operator/requirements-unit.txt
index e4e068b7..bbe4ab62 100644
--- a/charms/knative-operator/requirements-unit.txt
+++ b/charms/knative-operator/requirements-unit.txt
@@ -104,5 +104,7 @@ sniffio==1.3.0
     # anyio
     # httpcore
     # httpx
+tenacity==8.2.2
+    # via -r ./requirements.txt
 tomli==2.0.1
     # via pytest
diff --git a/charms/knative-operator/requirements.in b/charms/knative-operator/requirements.in
index f254511b..86e6d536 100644
--- a/charms/knative-operator/requirements.in
+++ b/charms/knative-operator/requirements.in
@@ -2,3 +2,4 @@ charmed-kubeflow-chisme
 jinja2
 ops
 lightkube
+tenacity
diff --git a/charms/knative-operator/requirements.txt b/charms/knative-operator/requirements.txt
index c77d070e..3c156044 100644
--- a/charms/knative-operator/requirements.txt
+++ b/charms/knative-operator/requirements.txt
@@ -57,3 +57,5 @@ sniffio==1.3.0
    # anyio
    # httpcore
    # httpx
+tenacity==8.2.2
+    # via -r ./requirements.in
diff --git a/charms/knative-operator/src/charm.py b/charms/knative-operator/src/charm.py
index 29c103fb..c812104d 100755
--- a/charms/knative-operator/src/charm.py
+++ b/charms/knative-operator/src/charm.py
@@ -6,7 +6,6 @@
 
 import glob
 import logging
-import time
 import traceback
 from pathlib import Path
 
@@ -14,12 +13,13 @@
 from charmed_kubeflow_chisme.kubernetes import KubernetesResourceHandler as KRH  # noqa N813
 from charmed_kubeflow_chisme.lightkube.batch import delete_many
 from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider
-from lightkube import ApiError
-from lightkube.resources.core_v1 import Service
+from lightkube import ApiError, Client
+from lightkube.resources.core_v1 import ConfigMap, Secret, Service
 from ops.charm import CharmBase
 from ops.main import main
 from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus
 from ops.pebble import ChangeError, Layer
+from tenacity import Retrying, stop_after_delay, wait_fixed
 
 REQUEST_LOG_TEMPLATE = '{"httpRequest": {"requestMethod": "{{.Request.Method}}", "requestUrl": "{{js .Request.RequestURI}}", "requestSize": "{{.Request.ContentLength}}", "status": {{.Response.Code}}, "responseSize": "{{.Response.Size}}", "userAgent": "{{js .Request.UserAgent}}", "remoteIp": "{{js .Request.RemoteAddr}}", "serverIp": "{{.Revision.PodIP}}", "referer": "{{js .Request.Referer}}", "latency": "{{.Response.Latency}}s", "protocol": "{{.Request.Proto}}"}, "traceId": "{{index .Request.Header "X-B3-Traceid"}}"}'  # noqa: E501
 
@@ -234,17 +234,8 @@ def _main(self, event):
         self.unit.status = MaintenanceStatus("Applying resources")
         self._apply_resources(resource_handler=self.resource_handler)
 
-        # TODO: There is a race condition between creating the configmaps and secrets that
-        # knative operator/webhook need, and starting their processes. Without these resources,
-        # their processes panic and exit with non-zero error codes quickly (<1s). Pebble sees
-        # this quick exit not as a start and fail, but as not starting at all (see
-        # https://github.com/canonical/pebble#viewing-starting-and-stopping-services), which
-        # means Pebble will not auto-restart the service. Health checks also do not seem to
-        # be able to restart the service.
-        # Need a better way to handle this.
-        sleep_time = 5
-        logger.info(f"Sleeping {sleep_time} seconds to allow resources to be available")
-        time.sleep(sleep_time)
+        # Handle [this race condition](https://github.com/canonical/knative-operators/issues/90)
+        wait_for_required_kubernetes_resources(self._namespace, DEFAULT_RETRYER)
 
         # Update Pebble configuration layer if it has changed
         self.unit.status = MaintenanceStatus("Configuring Pebble layers")
@@ -285,5 +276,29 @@ def _on_remove(self, _):
         self.unit.status = MaintenanceStatus("K8s resources removed")
 
 
+REQUIRED_CONFIGMAPS = ["config-observability", "config-logging"]
+REQUIRED_SECRETS = ["operator-webhook-certs"]
+DEFAULT_RETRYER = Retrying(
+    stop=stop_after_delay(15),
+    wait=wait_fixed(3),
+    reraise=True,
+)
+
+
+def wait_for_required_kubernetes_resources(namespace: str, retryer: Retrying):
+    """Waits for required kubernetes resources to be created, raising if we exceed a timeout."""
+    client = Client()
+    for attempt in retryer:
+        with attempt:
+            for cm in REQUIRED_CONFIGMAPS:
+                logger.info(f"Looking for required configmap {cm}")
+                client.get(ConfigMap, name=cm, namespace=namespace)
+                logger.info(f"Found required configmap {cm}")
+            for secret in REQUIRED_SECRETS:
+                logger.info(f"Looking for required secret {secret}")
+                client.get(Secret, name=secret, namespace=namespace)
+                logger.info(f"Found required secret {secret}")
+
+
 if __name__ == "__main__":
     main(KnativeOperatorCharm)
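A note on the tenacity pattern introduced in charm.py above: `Retrying` is driven as an iterator, and any exception raised inside the `with attempt:` block schedules another attempt until the stop condition is hit; `reraise=True` then surfaces the last underlying exception instead of tenacity's `RetryError`. Below is a minimal, self-contained sketch of the same loop, where `get_resource` is a hypothetical stand-in for the charm's lightkube `client.get` calls:

```python
import itertools

from tenacity import Retrying, stop_after_delay, wait_fixed

# Hypothetical stand-in for a lightkube client.get() that fails twice, then succeeds
_calls = itertools.count(1)


def get_resource():
    if next(_calls) < 3:
        raise RuntimeError("resource not yet available")


retryer = Retrying(
    stop=stop_after_delay(15),  # give up 15 seconds after the first attempt starts
    wait=wait_fixed(3),         # pause 3 seconds between attempts
    reraise=True,               # re-raise the last error instead of tenacity's RetryError
)

for attempt in retryer:
    with attempt:
        get_resource()

print("resource found")
```

Since `stop_after_delay(15)` measures time from the start of the retry loop, the charm now blocks for at most roughly 15 seconds before the last `ApiError` propagates and the hook fails visibly, instead of sleeping a fixed 5 seconds and hoping the resources exist.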
diff --git a/charms/knative-operator/tests/unit/test_charm.py b/charms/knative-operator/tests/unit/test_charm.py
index 9685ea42..1f36eae8 100644
--- a/charms/knative-operator/tests/unit/test_charm.py
+++ b/charms/knative-operator/tests/unit/test_charm.py
@@ -14,8 +14,16 @@
 from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus
 from ops.pebble import Change, ChangeError, ChangeID
 from ops.testing import Harness
-
-from charm import KNATIVE_OPERATOR, KNATIVE_OPERATOR_WEBHOOK, KnativeOperatorCharm
+from tenacity import Retrying, stop_after_attempt
+
+from charm import (
+    KNATIVE_OPERATOR,
+    KNATIVE_OPERATOR_WEBHOOK,
+    REQUIRED_CONFIGMAPS,
+    REQUIRED_SECRETS,
+    KnativeOperatorCharm,
+    wait_for_required_kubernetes_resources,
+)
 
 
 class _FakeChange:
@@ -93,12 +101,6 @@ def mocked_metrics_endpoint_provider(mocker):
 def test_events(harness, mocked_resource_handler, mocked_metrics_endpoint_provider, mocker):
     harness.begin()
     main = mocker.patch("charm.KnativeOperatorCharm._main")
-    pebble_ready_knative_operator = mocker.patch(
-        "charm.KnativeOperatorCharm._on_knative_operator_pebble_ready"
-    )
-    pebble_ready_knative_operator_webhook = mocker.patch(
-        "charm.KnativeOperatorCharm._on_knative_operator_webhook_pebble_ready"
-    )
     otel_relation_created = mocker.patch(
         "charm.KnativeOperatorCharm._on_otel_collector_relation_created"
     )
@@ -112,12 +114,12 @@ def test_events(harness, mocked_resource_handler, mocked_metrics_endpoint_provid
     main.reset_mock()
 
     harness.charm.on.knative_operator_pebble_ready.emit("knative-operator")
-    pebble_ready_knative_operator.assert_called_once()
-    pebble_ready_knative_operator.reset_mock()
+    main.assert_called_once()
+    main.reset_mock()
 
     harness.charm.on.knative_operator_webhook_pebble_ready.emit("knative-operator-webhook")
-    pebble_ready_knative_operator_webhook.assert_called_once()
-    pebble_ready_knative_operator_webhook.reset_mock()
+    main.assert_called_once()
+    main.reset_mock()
 
     rel_id = harness.add_relation("otel-collector", "app")
     harness.update_relation_data(rel_id, "app", {"some-key": "some-value"})
@@ -156,6 +158,9 @@ def test_update_layer_active(
 ):
     # The charm uses a service name that is the same as the container name
     service_name = container_name
+
+    mocker.patch("charm.wait_for_required_kubernetes_resources")
+
     harness.begin()
     # Check the initial Pebble plan is empty
     initial_plan = harness.get_container_pebble_plan(container_name)
@@ -320,3 +325,78 @@ def test_on_remove_failure(
     delete_many.side_effect = _FakeApiError()
     with pytest.raises(ApiError):
         harness.charm.on.remove.emit()
+
+
+@pytest.fixture()
+def mocked_lightkube_base_client(mocker):
+    """Mocks the actual Lightkube Client class rather than the resource handler's Client."""
+    mocked_lightkube_client = MagicMock()
+    mocked_lightkube_client_class = mocker.patch("charm.Client")
+    mocked_lightkube_client_class.return_value = mocked_lightkube_client
+    yield mocked_lightkube_client
+
+
+def test_wait_for_required_kubernetes_resources_success(mocked_lightkube_base_client):
+    """Tests the case where the required resources do exist after some time.
+
+    Given an environment that will eventually (after a few tries) have the required resources,
+    assert that the function returns successfully.
+    """
+    mocked_lightkube_client = mocked_lightkube_base_client
+
+    # Lightkube will fail the first 3 accesses, then return successfully afterwards
+    mocked_lightkube_client.get.side_effect = [
+        _FakeApiError(404),
+        _FakeApiError(404),
+        _FakeApiError(404),
+        None,
+        None,
+        None,
+    ]
+
+    retryer = Retrying(
+        stop=stop_after_attempt(15),
+        reraise=True,
+    )
+
+    wait_for_required_kubernetes_resources("", retryer=retryer)
+
+    # Assert we had 3 attempts that each failed after one get, and then a 4th attempt with
+    # 3 successful gets
+    assert mocked_lightkube_client.get.call_count == 6
+
+    # Assert that the last 3 calls of .get attempted to get the required resources
+    required_resource_names = REQUIRED_CONFIGMAPS + REQUIRED_SECRETS
+
+    # call_args_list returns a list of Call objects, each of which is a tuple of (args, kwargs).
+    # We assert that the kwargs have the correct resource names.
+    requested_resource_names = [
+        kwargs["name"] for (_, kwargs) in mocked_lightkube_client.get.call_args_list[-3:]
+    ]
+    for name in required_resource_names:
+        assert name in requested_resource_names
+
+
+def test_wait_for_required_kubernetes_failure(mocked_lightkube_base_client):
+    """Tests the case where the resources never exist and the wait raises an exception.
+
+    Given an environment that will never have the required resources,
+    assert that the function raises after the expected number of attempts.
+    """
+    mocked_lightkube_client = mocked_lightkube_base_client
+
+    # Lightkube's Client.get will always raise an exception
+    mocked_lightkube_client.get.side_effect = _FakeApiError(404)
+
+    n_attempts = 3
+    retryer = Retrying(
+        stop=stop_after_attempt(n_attempts),
+        reraise=True,
+    )
+
+    with pytest.raises(ApiError):
+        wait_for_required_kubernetes_resources("", retryer=retryer)
+
+    # Assert that the retryer gave up after n_attempts attempts, each of which
+    # failed on its first get
+    assert mocked_lightkube_client.get.call_count == n_attempts
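For reviewers checking the arithmetic in the new tests: a `MagicMock` with a `side_effect` list pairs naturally with tenacity's iterator API, because every call consumes one list entry and each raised entry ends the current attempt. A standalone sketch of that interaction (the resource names here are illustrative, not part of the charm):

```python
from unittest.mock import MagicMock

from tenacity import Retrying, stop_after_attempt

client = MagicMock()
# The first two get() calls raise, the third returns None (success)
client.get.side_effect = [RuntimeError("missing"), RuntimeError("missing"), None]

retryer = Retrying(stop=stop_after_attempt(5), reraise=True)
for attempt in retryer:
    with attempt:
        client.get("ConfigMap", name="config-logging", namespace="test")

assert client.get.call_count == 3  # two failed attempts plus one success
```

The same mechanics explain the `call_count == 6` assertion in the success test: three attempts fail on their first configmap lookup (consuming the three `_FakeApiError(404)` entries), and the fourth attempt makes three successful gets, for six calls in total.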