Skip to content

Commit

Permalink
Update scrape and remote_write libs for generic HostHealth rules (#660)
Browse files Browse the repository at this point in the history
* Update prometheus_scrape and prometheus_remote_write libs for generic host health rules
* Import central rule groups from cos-lib
* Support generic alert rules in MetricsEndpointAggregator

---------

Co-authored-by: Luca Bello <[email protected]>
  • Loading branch information
MichaelThamm and lucabello authored Feb 5, 2025
1 parent c6c40f3 commit 76f3889
Show file tree
Hide file tree
Showing 11 changed files with 145 additions and 164 deletions.
25 changes: 23 additions & 2 deletions lib/charms/prometheus_k8s/v0/prometheus_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ def _on_scrape_targets_changed(self, event):

import yaml
from cosl import JujuTopology
from cosl.rules import AlertRules
from cosl.rules import AlertRules, generic_alert_groups
from ops.charm import CharmBase, RelationRole
from ops.framework import (
BoundEvent,
Expand All @@ -362,7 +362,7 @@ def _on_scrape_targets_changed(self, event):

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 48
LIBPATCH = 49

PYDEPS = ["cosl"]

Expand Down Expand Up @@ -1531,6 +1531,9 @@ def set_scrape_job_spec(self, _=None):

alert_rules = AlertRules(query_type="promql", topology=self.topology)
alert_rules.add_path(self._alert_rules_path, recursive=True)
alert_rules.add(
generic_alert_groups.application_rules, group_name_prefix=self.topology.identifier
)
alert_rules_as_dict = alert_rules.as_dict()

for relation in self._charm.model.relations[self._relation_name]:
Expand Down Expand Up @@ -1776,6 +1779,7 @@ def __init__(
relation_names: Optional[dict] = None,
relabel_instance=True,
resolve_addresses=False,
path_to_own_alert_rules: Optional[str] = None,
):
"""Construct a `MetricsEndpointAggregator`.
Expand All @@ -1795,6 +1799,7 @@ def __init__(
resolve_addresses: A boolean flag indicating if the aggregator
should attempt to perform DNS lookups of targets and append
a `dns_name` label
path_to_own_alert_rules: Optionally supply a path for alert rule files
"""
self._charm = charm

Expand All @@ -1807,6 +1812,8 @@ def __init__(
self._alert_rules_relation = relation_names.get("alert_rules", "prometheus-rules")

super().__init__(charm, self._prometheus_relation)
self.topology = JujuTopology.from_charm(charm)

self._stored.set_default(jobs=[], alert_rules=[])

self._relabel_instance = relabel_instance
Expand All @@ -1816,6 +1823,8 @@ def __init__(
prometheus_events = self._charm.on[self._prometheus_relation]
self.framework.observe(prometheus_events.relation_joined, self._set_prometheus_data)

self.path_to_own_alert_rules = path_to_own_alert_rules

# manage list of Prometheus scrape jobs from related scrape targets
target_events = self._charm.on[self._target_relation]
self.framework.observe(target_events.relation_changed, self._on_prometheus_targets_changed)
Expand All @@ -1838,6 +1847,7 @@ def _set_prometheus_data(self, event):
if not self._charm.unit.is_leader():
return

# Gather the scrape jobs
jobs = [] + _type_convert_stored(
self._stored.jobs # pyright: ignore
) # list of scrape jobs, one per relation
Expand All @@ -1846,6 +1856,7 @@ def _set_prometheus_data(self, event):
if targets and relation.app:
jobs.append(self._static_scrape_job(targets, relation.app.name))

# Gather the alert rules
groups = [] + _type_convert_stored(
self._stored.alert_rules # pyright: ignore
) # list of alert rule groups
Expand All @@ -1856,7 +1867,17 @@ def _set_prometheus_data(self, event):
rules = self._label_alert_rules(unit_rules, appname)
group = {"name": self.group_name(appname), "rules": rules}
groups.append(group)
alert_rules = AlertRules(query_type="promql", topology=self.topology)
# Add alert rules from file
if self.path_to_own_alert_rules:
alert_rules.add_path(self.path_to_own_alert_rules, recursive=True)
# Add generic alert rules
alert_rules.add(
generic_alert_groups.application_rules, group_name_prefix=self.topology.identifier
)
groups.extend(alert_rules.as_dict()["groups"])

# Set scrape jobs and alert rules in relation data
event.relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs)
event.relation.data[self._charm.app]["alert_rules"] = json.dumps({"groups": groups})

Expand Down
7 changes: 5 additions & 2 deletions lib/charms/prometheus_k8s/v1/prometheus_remote_write.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

import yaml
from cosl import JujuTopology
from cosl.rules import AlertRules
from cosl.rules import AlertRules, generic_alert_groups
from ops.charm import (
CharmBase,
HookEvent,
Expand All @@ -46,7 +46,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 5
LIBPATCH = 6

PYDEPS = ["cosl"]

Expand Down Expand Up @@ -485,6 +485,9 @@ def _push_alerts_to_relation_databag(self, relation: Relation) -> None:

alert_rules = AlertRules(query_type="promql", topology=self.topology)
alert_rules.add_path(self._alert_rules_path)
alert_rules.add(
generic_alert_groups.aggregator_rules, group_name_prefix=self.topology.identifier
)

alert_rules_as_dict = alert_rules.as_dict()

Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cosl>=0.0.46
cosl
cryptography
jsonschema
ops
Expand Down
7 changes: 4 additions & 3 deletions tests/integration/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import logging

import pytest
from cosl.rules import generic_alert_groups
from helpers import (
check_prometheus_is_ready,
get_job_config_for,
Expand Down Expand Up @@ -51,7 +52,7 @@ async def test_prometheus_scrape_relation_with_prometheus_tester(
),
)

await ops_test.model.wait_for_idle(apps=app_names, status="active", wait_for_units=1)
await ops_test.model.wait_for_idle(apps=app_names, status="active", wait_for_exact_units=1)

assert initial_workload_is_ready(ops_test, app_names)
assert await check_prometheus_is_ready(ops_test, prometheus_app_name, 0)
Expand All @@ -73,7 +74,7 @@ async def test_prometheus_scrape_relation_with_prometheus_tester(

rules_with_relation = await get_prometheus_rules(ops_test, prometheus_app_name, 0)
tester_rules = get_rules_for(tester_app_name, rules_with_relation)
assert len(tester_rules) == 1
assert len(tester_rules) == 1 + len(generic_alert_groups.application_rules)


async def test_alert_rule_path_can_be_changed(ops_test, prometheus_tester_charm):
Expand Down Expand Up @@ -105,4 +106,4 @@ async def test_alert_rule_path_can_be_changed(ops_test, prometheus_tester_charm)

rules_with_relation = await get_prometheus_rules(ops_test, prometheus_app_name, 0)
tester_rules = get_rules_for(tester_app_name, rules_with_relation)
assert len(tester_rules) == 2
assert len(tester_rules) == 2 + len(generic_alert_groups.application_rules)
2 changes: 1 addition & 1 deletion tests/integration/test_prometheus_scrape_multiunit.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ async def test_prometheus_scrape_relation_with_prometheus_tester(
)

await ops_test.model.wait_for_idle(
apps=app_names, status="active", wait_for_units=num_units, timeout=600
apps=app_names, status="active", wait_for_exact_units=num_units, timeout=600
)
await asyncio.gather(
*[check_prometheus_is_ready(ops_test, prometheus_app_name, u) for u in range(num_units)]
Expand Down
5 changes: 3 additions & 2 deletions tests/integration/test_remote_write_grafana_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import logging

import pytest
from cosl.rules import generic_alert_groups
from helpers import (
check_prometheus_is_ready,
get_prometheus_rules,
Expand Down Expand Up @@ -55,7 +56,7 @@ async def test_remote_write_with_grafana_agent(
),
)

await ops_test.model.wait_for_idle(apps=apps, wait_for_units=1)
await ops_test.model.wait_for_idle(apps=apps, wait_for_exact_units=1)
assert await check_prometheus_is_ready(ops_test, prometheus_name, 0)

await asyncio.gather(
Expand Down Expand Up @@ -109,7 +110,7 @@ async def test_remote_write_alerts_deduplicate(ops_test):
# Make sure only one copy of the alerts is present
rules_with_relation = await get_prometheus_rules(ops_test, prometheus_name, 0)
tester_rules = get_rules_for(tester_name, rules_with_relation)[0]["rules"]
assert len(tester_rules) == 1
assert len(tester_rules) == 1 + len(generic_alert_groups.aggregator_rules)


@pytest.mark.abort_on_fail
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/test_remote_write_with_zinc.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ async def test_remote_write_with_zinc(ops_test, prometheus_charm):
),
)

await ops_test.model.wait_for_idle(apps=local_apps, status="active", wait_for_units=1)
await ops_test.model.wait_for_idle(apps=local_apps, status="active", wait_for_exact_units=1)
assert await check_prometheus_is_ready(ops_test, prometheus_name, 0)

await asyncio.gather(
Expand Down
5 changes: 3 additions & 2 deletions tests/integration/test_upgrade_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import pytest
import yaml
from cosl.rules import generic_alert_groups
from helpers import (
check_prometheus_is_ready,
get_prometheus_rules,
Expand Down Expand Up @@ -60,7 +61,7 @@ async def test_deploy_charm(ops_test, prometheus_tester_charm, prometheus_charm)
rules_with_relation = await get_prometheus_rules(ops_test, prometheus_app_name, 0)
tester_rules = get_rules_for(tester_app_name, rules_with_relation)

assert len(tester_rules) == 1
assert len(tester_rules) == 1 + len(generic_alert_groups.application_rules)


@pytest.mark.abort_on_fail
Expand Down Expand Up @@ -91,7 +92,7 @@ def get_config():
# Check only one alert rule exists
rules_with_relation = await get_prometheus_rules(ops_test, prometheus_app_name, 0)
tester_rules = get_rules_for(tester_app_name, rules_with_relation)
assert len(tester_rules) == 1
assert len(tester_rules) == 1 + len(generic_alert_groups.application_rules)


@pytest.mark.abort_on_fail
Expand Down
Loading

0 comments on commit 76f3889

Please sign in to comment.