fix: handle the race condition between knative-operator and k8s resources

Adds a check that waits for the Kubernetes resources required by the knative operator/webhook to exist before configuring their Pebble services, replacing the previous fixed 5-second sleep.

Closes #90
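
In outline, the change polls the charm's namespace for the ConfigMaps and Secrets that the operator and webhook read at startup, retrying with tenacity until they appear or a timeout elapses. A condensed sketch of the approach (resource names and timings taken from the diff below; the actual implementation is in charms/knative-operator/src/charm.py):

from lightkube import Client
from lightkube.resources.core_v1 import ConfigMap, Secret
from tenacity import Retrying, stop_after_delay, wait_fixed

# Resources the knative operator/webhook processes need before they can start cleanly.
REQUIRED_CONFIGMAPS = ["config-observability", "config-logging"]
REQUIRED_SECRETS = ["operator-webhook-certs"]


def wait_for_resources(namespace: str) -> None:
    """Poll until the required ConfigMaps/Secrets exist; on timeout the last ApiError is re-raised."""
    client = Client()
    for attempt in Retrying(stop=stop_after_delay(15), wait=wait_fixed(3), reraise=True):
        with attempt:
            for name in REQUIRED_CONFIGMAPS:
                client.get(ConfigMap, name=name, namespace=namespace)
            for name in REQUIRED_SECRETS:
                client.get(Secret, name=name, namespace=namespace)
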
ca-scribner committed Mar 9, 2023
1 parent 063a029 commit b5c8a86
Showing 6 changed files with 128 additions and 26 deletions.
2 changes: 2 additions & 0 deletions charms/knative-operator/requirements-integration.txt
@@ -241,6 +241,8 @@ sniffio==1.3.0
# httpx
stack-data==0.6.2
# via ipython
tenacity==8.2.2
# via -r ./requirements.txt
theblues==0.5.2
# via juju
toml==0.10.2
2 changes: 2 additions & 0 deletions charms/knative-operator/requirements-unit.txt
@@ -104,5 +104,7 @@ sniffio==1.3.0
# anyio
# httpcore
# httpx
tenacity==8.2.2
# via -r ./requirements.txt
tomli==2.0.1
# via pytest
1 change: 1 addition & 0 deletions charms/knative-operator/requirements.in
@@ -2,3 +2,4 @@ charmed-kubeflow-chisme
jinja2
ops
lightkube
tenacity
2 changes: 2 additions & 0 deletions charms/knative-operator/requirements.txt
@@ -57,3 +57,5 @@ sniffio==1.3.0
# anyio
# httpcore
# httpx
tenacity==8.2.2
# via -r ./requirements.in
43 changes: 29 additions & 14 deletions charms/knative-operator/src/charm.py
@@ -6,20 +6,20 @@

import glob
import logging
import time
import traceback
from pathlib import Path

from charmed_kubeflow_chisme.exceptions import ErrorWithStatus
from charmed_kubeflow_chisme.kubernetes import KubernetesResourceHandler as KRH # noqa N813
from charmed_kubeflow_chisme.lightkube.batch import delete_many
from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider
from lightkube import ApiError
from lightkube.resources.core_v1 import Service
from lightkube import ApiError, Client
from lightkube.resources.core_v1 import ConfigMap, Secret, Service
from ops.charm import CharmBase
from ops.main import main
from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus
from ops.pebble import ChangeError, Layer
from tenacity import Retrying, stop_after_delay, wait_fixed

REQUEST_LOG_TEMPLATE = '{"httpRequest": {"requestMethod": "{{.Request.Method}}", "requestUrl": "{{js .Request.RequestURI}}", "requestSize": "{{.Request.ContentLength}}", "status": {{.Response.Code}}, "responseSize": "{{.Response.Size}}", "userAgent": "{{js .Request.UserAgent}}", "remoteIp": "{{js .Request.RemoteAddr}}", "serverIp": "{{.Revision.PodIP}}", "referer": "{{js .Request.Referer}}", "latency": "{{.Response.Latency}}s", "protocol": "{{.Request.Proto}}"}, "traceId": "{{index .Request.Header "X-B3-Traceid"}}"}' # noqa: E501

@@ -234,17 +234,8 @@ def _main(self, event):
self.unit.status = MaintenanceStatus("Applying resources")
self._apply_resources(resource_handler=self.resource_handler)

# TODO: There is a race condition between creating the configmaps and secrets that
# knative operator/webhook need, and starting their processes. Without these resources,
# their processes panic and exit with non-zero error codes quickly (<1s). Pebble sees
# this quick exit not as a start-and-then-fail, but as not starting at all (see
# https://github.com/canonical/pebble#viewing-starting-and-stopping-services), which
# means Pebble will not auto-restart the service. Health checks also do not seem to
# be able to restart the service.
# Need a better way to handle this.
sleep_time = 5
logger.info(f"Sleeping {sleep_time} seconds to allow resources to be available")
time.sleep(sleep_time)
# Handle [this race condition](https://github.com/canonical/knative-operators/issues/90)
wait_for_required_kubernetes_resources(self._namespace, DEFAULT_RETRYER)

# Update Pebble configuration layer if it has changed
self.unit.status = MaintenanceStatus("Configuring Pebble layers")
@@ -285,5 +276,29 @@ def _on_remove(self, _):
self.unit.status = MaintenanceStatus("K8s resources removed")


REQUIRED_CONFIGMAPS = ["config-observability", "config-logging"]
REQUIRED_SECRETS = ["operator-webhook-certs"]
DEFAULT_RETRYER = Retrying(
stop=stop_after_delay(15),
wait=wait_fixed(3),
reraise=True,
)


def wait_for_required_kubernetes_resources(namespace: str, retryer: Retrying):
"""Waits for required kubernetes resources to be created, raising if we exceed a timeout"""
client = Client()
for attempt in retryer:
with attempt:
for cm in REQUIRED_CONFIGMAPS:
logger.info(f"Looking for required configmap {cm}")
client.get(ConfigMap, name=cm, namespace=namespace)
logger.info(f"Found required configmap {cm}")
for secret in REQUIRED_SECRETS:
logger.info(f"Looking for required secret {secret}")
client.get(Secret, name=secret, namespace=namespace)
logger.info(f"Found required secret {cm}")


if __name__ == "__main__":
main(KnativeOperatorCharm)
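
For context on the retry pattern used above: iterating "for attempt in retryer" with a "with attempt:" block re-runs the body whenever it raises, waiting between tries, and because reraise=True the last exception itself propagates once the stop condition is reached (instead of tenacity's RetryError). A small self-contained illustration, using a hypothetical flaky() helper that is not part of the charm:

from tenacity import Retrying, stop_after_attempt, wait_fixed

calls = {"n": 0}


def flaky() -> str:
    """Hypothetical stand-in for the resource checks: fails twice, then succeeds."""
    calls["n"] += 1
    if calls["n"] < 3:
        raise RuntimeError(f"not ready yet (call {calls['n']})")
    return "ready"


# Same pattern as wait_for_required_kubernetes_resources, but bounded by attempts for the demo.
retryer = Retrying(stop=stop_after_attempt(5), wait=wait_fixed(0.1), reraise=True)
for attempt in retryer:
    with attempt:
        result = flaky()

print(result, calls["n"])  # prints: ready 3 -- the first two failures were retried
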
104 changes: 92 additions & 12 deletions charms/knative-operator/tests/unit/test_charm.py
@@ -14,8 +14,16 @@
from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus
from ops.pebble import Change, ChangeError, ChangeID
from ops.testing import Harness

from charm import KNATIVE_OPERATOR, KNATIVE_OPERATOR_WEBHOOK, KnativeOperatorCharm
from tenacity import Retrying, stop_after_attempt

from charm import (
KNATIVE_OPERATOR,
KNATIVE_OPERATOR_WEBHOOK,
REQUIRED_CONFIGMAPS,
REQUIRED_SECRETS,
KnativeOperatorCharm,
wait_for_required_kubernetes_resources,
)


class _FakeChange:
@@ -93,12 +101,6 @@ def mocked_metrics_endpoint_provider(mocker):
def test_events(harness, mocked_resource_handler, mocked_metrics_endpoint_provider, mocker):
harness.begin()
main = mocker.patch("charm.KnativeOperatorCharm._main")
pebble_ready_knative_operator = mocker.patch(
"charm.KnativeOperatorCharm._on_knative_operator_pebble_ready"
)
pebble_ready_knative_operator_webhook = mocker.patch(
"charm.KnativeOperatorCharm._on_knative_operator_webhook_pebble_ready"
)
otel_relation_created = mocker.patch(
"charm.KnativeOperatorCharm._on_otel_collector_relation_created"
)
@@ -112,12 +114,12 @@ def test_events(harness, mocked_resource_handler, mocked_metrics_endpoint_provid
main.reset_mock()

harness.charm.on.knative_operator_pebble_ready.emit("knative-operator")
pebble_ready_knative_operator.assert_called_once()
pebble_ready_knative_operator.reset_mock()
main.assert_called_once()
main.reset_mock()

harness.charm.on.knative_operator_webhook_pebble_ready.emit("knative-operator-webhook")
pebble_ready_knative_operator_webhook.assert_called_once()
pebble_ready_knative_operator_webhook.reset_mock()
main.assert_called_once()
main.reset_mock()

rel_id = harness.add_relation("otel-collector", "app")
harness.update_relation_data(rel_id, "app", {"some-key": "some-value"})
@@ -156,6 +158,9 @@ def test_update_layer_active(
):
# The charm uses a service name that is the same as the container name
service_name = container_name

mocker.patch("charm.wait_for_required_kubernetes_resources")

harness.begin()
# Check the initial Pebble plan is empty
initial_plan = harness.get_container_pebble_plan(container_name)
@@ -320,3 +325,78 @@ def test_on_remove_failure(
delete_many.side_effect = _FakeApiError()
with pytest.raises(ApiError):
harness.charm.on.remove.emit()


@pytest.fixture()
def mocked_lightkube_base_client(mocker):
"""Mocks the actual Lightkube Client class rather than the resource handler's Client."""
mocked_lightkube_client = MagicMock()
mocked_lightkube_client_class = mocker.patch("charm.Client")
mocked_lightkube_client_class.return_value = mocked_lightkube_client
yield mocked_lightkube_client


def test_wait_for_required_kubernetes_resources_success(mocked_lightkube_base_client):
"""Tests case where resources do exist after some time.
Given an environment that will eventually (after a few tries) have the required resources,
assert that the function returns successfully
"""
mocked_lightkube_client = mocked_lightkube_base_client

# Lightkube will fail the first 3 accesses, then return successfully afterwards
mocked_lightkube_client.get.side_effect = [
_FakeApiError(404),
_FakeApiError(404),
_FakeApiError(404),
None,
None,
None,
]

retryer = Retrying(
stop=stop_after_attempt(15),
reraise=True,
)

wait_for_required_kubernetes_resources("", retryer=retryer)

# Assert that the first 3 attempts each failed after a single get, and the 4th attempt made
# 3 successful gets (one per required resource)
assert mocked_lightkube_client.get.call_count == 6

# Assert that the last 3 calls of .get attempted to get the required resources
required_resource_names = REQUIRED_CONFIGMAPS + REQUIRED_SECRETS

# call_args_list returns a list of Call objects, each of which are a tuple of (args, kwargs).
# We assert that the kwargs has the correct resource name.
requested_resource_names = [
kwargs["name"] for (_, kwargs) in mocked_lightkube_client.get.call_args_list[-3:]
]
for name in required_resource_names:
assert name in requested_resource_names


def test_wait_for_required_kubernetes_failure(mocked_lightkube_base_client):
"""Tests case where resources do not exist and the wait should raise an exception.
Given an environment that will never have the required resources,
assert that the function raises after the expected number of attempts.
"""
mocked_lightkube_client = mocked_lightkube_base_client

# Lightkube's Client.get will always raise an exception
mocked_lightkube_client.get.side_effect = _FakeApiError(404)

n_attempts = 3
retryer = Retrying(
stop=stop_after_attempt(n_attempts),
reraise=True,
)

with pytest.raises(ApiError):
wait_for_required_kubernetes_resources("", retryer=retryer)

# Assert that every attempt failed on its first get call, so there is exactly one get per attempt
assert mocked_lightkube_client.get.call_count == n_attempts
