diff --git a/.github/workflows/integration_tests.yaml b/.github/workflows/integration_tests.yaml index bfb691bd..1f5d2629 100644 --- a/.github/workflows/integration_tests.yaml +++ b/.github/workflows/integration_tests.yaml @@ -13,7 +13,7 @@ jobs: pre-run-script: scripts/pre-integration-test.sh provider: lxd test-tox-env: integration-juju3.1 - modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner"]' + modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics"]' integration-test-juju2: name: Integration test needs: integration-test-juju3 @@ -24,4 +24,4 @@ jobs: pre-run-script: scripts/pre-integration-test.sh provider: lxd test-tox-env: integration-juju2.9 - modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner"]' \ No newline at end of file + modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics"]' \ No newline at end of file diff --git a/.licenserc.yaml b/.licenserc.yaml index c3646090..5e0d7f17 100644 --- a/.licenserc.yaml +++ b/.licenserc.yaml @@ -22,4 +22,6 @@ header: - 'CODEOWNERS' - 'icon.svg' - 'LICENSE' + - '.pylintrc' + - '.woke.yaml' comment: on-failure diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 00000000..3eff9173 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,2 @@ +[MAIN] +extension-pkg-whitelist=pydantic # see https://github.com/pydantic/pydantic/issues/1961#issuecomment-759522422 diff --git a/.woke.yaml b/.woke.yaml new file mode 100644 index 00000000..18ecc91b --- /dev/null +++ b/.woke.yaml @@ -0,0 +1,3 @@ +ignore_files: + # Ignore pylintrc as it uses non compliant terminology: whitelist + - .pylintrc diff --git a/README.md b/README.md index ed91b523..2fdce278 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,34 @@ If there are more idle runners than configured, the 
oldest idle runners are unre During each time period, every unit will make one or more API calls to GitHub. The interval may need to be adjusted if the number of units is large enough to trigger [Rate Limiting](https://docs.github.com/en/rest/overview/resources-in-the-rest-api#rate-limiting). + +## COS +The charm is designed to provide comprehensive metrics and monitoring capabilities for both the Runners and the Charm itself. These metrics are made available through the `cos-agent` integration with the `cos_agent` interface. Additionally, a Grafana Dashboard is included to help visualize these metrics effectively. + +### Loki Integration +#### Loki Push API +The charm seamlessly integrates with Loki, a powerful log aggregation system, through the `cos_agent` interface. This integration allows the charm to push various metrics and logs related to the Runners and the Charm itself to a Loki instance. This provides valuable insights into the performance and behavior of your deployment. + +### Grafana Dashboard +To make monitoring even more accessible, the charm comes with a pre-configured Grafana Dashboard. This dashboard is designed to visualize the metrics collected by the charm, making it easier for operators to track the health and performance of the system. + +#### Automated Dashboard Deployment +You can automate the deployment of the Grafana Dashboard using the [cos-configuration-k8s](https://charmhub.io/cos-configuration-k8s) charm. This simplifies the setup process and ensures that your monitoring infrastructure is ready to go with minimal manual intervention. 
+ +#### Configuration Options +To enable the automated deployment of the Grafana Dashboard, you can provide the following configuration options when deploying the `cos-configuration-k8s` charm: + +```ini +git_repo=https://github.com/canonical/github-runner-operator +git_branch=main +git_depth=1 +grafana_dashboards_path=src/grafana_dashboard_metrics +``` + + + + + ## Development This charm uses black and flake8 for formatting. Both run with the lint stage of tox. diff --git a/lib/charms/grafana_agent/v0/cos_agent.py b/lib/charms/grafana_agent/v0/cos_agent.py new file mode 100644 index 00000000..d3130b2b --- /dev/null +++ b/lib/charms/grafana_agent/v0/cos_agent.py @@ -0,0 +1,842 @@ +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. + +r"""## Overview. + +This library can be used to manage the cos_agent relation interface: + +- `COSAgentProvider`: Use in machine charms that need to have a workload's metrics + or logs scraped, or forward rule files or dashboards to Prometheus, Loki or Grafana through + the Grafana Agent machine charm. + +- `COSAgentConsumer`: Used in the Grafana Agent machine charm to manage the requirer side of + the `cos_agent` interface. + + +## COSAgentProvider Library Usage + +Grafana Agent machine Charmed Operator interacts with its clients using the cos_agent library. +Charms seeking to send telemetry, must do so using the `COSAgentProvider` object from +this charm library. + +Using the `COSAgentProvider` object only requires instantiating it, +typically in the `__init__` method of your charm (the one which sends telemetry). 
+ +The constructor of `COSAgentProvider` has only one required and nine optional parameters: + +```python + def __init__( + self, + charm: CharmType, + relation_name: str = DEFAULT_RELATION_NAME, + metrics_endpoints: Optional[List[_MetricsEndpointDict]] = None, + metrics_rules_dir: str = "./src/prometheus_alert_rules", + logs_rules_dir: str = "./src/loki_alert_rules", + recurse_rules_dirs: bool = False, + log_slots: Optional[List[str]] = None, + dashboard_dirs: Optional[List[str]] = None, + refresh_events: Optional[List] = None, + scrape_configs: Optional[Union[List[Dict], Callable]] = None, + ): +``` + +### Parameters + +- `charm`: The instance of the charm that instantiates `COSAgentProvider`, typically `self`. + +- `relation_name`: If your charmed operator uses a relation name other than `cos-agent` to use + the `cos_agent` interface, this is where you have to specify that. + +- `metrics_endpoints`: In this parameter you can specify the metrics endpoints that Grafana Agent + machine Charmed Operator will scrape. The configs of this list will be merged with the configs + from `scrape_configs`. + +- `metrics_rules_dir`: The directory in which the Charmed Operator stores its metrics alert rules + files. + +- `logs_rules_dir`: The directory in which the Charmed Operator stores its logs alert rules files. + +- `recurse_rules_dirs`: This parameters set whether Grafana Agent machine Charmed Operator has to + search alert rules files recursively in the previous two directories or not. + +- `log_slots`: Snap slots to connect to for scraping logs in the form ["snap-name:slot", ...]. + +- `dashboard_dirs`: List of directories where the dashboards are stored in the Charmed Operator. + +- `refresh_events`: List of events on which to refresh relation data. + +- `scrape_configs`: List of standard scrape_configs dicts or a callable that returns the list in + case the configs need to be generated dynamically. 
The contents of this list will be merged + with the configs from `metrics_endpoints`. + + +### Example 1 - Minimal instrumentation: + +In order to use this object the following should be in the `charm.py` file. + +```python +from charms.grafana_agent.v0.cos_agent import COSAgentProvider +... +class TelemetryProviderCharm(CharmBase): + def __init__(self, *args): + ... + self._grafana_agent = COSAgentProvider(self) +``` + +### Example 2 - Full instrumentation: + +In order to use this object the following should be in the `charm.py` file. + +```python +from charms.grafana_agent.v0.cos_agent import COSAgentProvider +... +class TelemetryProviderCharm(CharmBase): + def __init__(self, *args): + ... + self._grafana_agent = COSAgentProvider( + self, + relation_name="custom-cos-agent", + metrics_endpoints=[ + # specify "path" and "port" to scrape from localhost + {"path": "/metrics", "port": 9000}, + {"path": "/metrics", "port": 9001}, + {"path": "/metrics", "port": 9002}, + ], + metrics_rules_dir="./src/alert_rules/prometheus", + logs_rules_dir="./src/alert_rules/loki", + recurse_rules_dirs=True, + log_slots=["my-app:slot"], + dashboard_dirs=["./src/dashboards_1", "./src/dashboards_2"], + refresh_events=["update-status", "upgrade-charm"], + scrape_configs=[ + { + "job_name": "custom_job", + "metrics_path": "/metrics", + "authorization": {"credentials": "bearer-token"}, + "static_configs": [ + { + "targets": ["localhost:9003"], + "labels": {"key": "value"}, + }, + ], + }, + ] + ) +``` + +### Example 3 - Dynamic scrape configs generation: + +Pass a function to the `scrape_configs` to decouple the generation of the configs +from the instantiation of the COSAgentProvider object. + +```python +from charms.grafana_agent.v0.cos_agent import COSAgentProvider +... 
+ +class TelemetryProviderCharm(CharmBase): + def generate_scrape_configs(self): + return [ + { + "job_name": "custom", + "metrics_path": "/metrics", + "static_configs": [{"targets": ["localhost:9000"]}], + }, + ] + + def __init__(self, *args): + ... + self._grafana_agent = COSAgentProvider( + self, + scrape_configs=self.generate_scrape_configs, + ) +``` + +## COSAgentConsumer Library Usage + +This object may be used by any Charmed Operator which gathers telemetry data by +implementing the consumer side of the `cos_agent` interface. +For instance Grafana Agent machine Charmed Operator. + +For this purpose the charm needs to instantiate the `COSAgentConsumer` object with one mandatory +and two optional arguments. + +### Parameters + +- `charm`: A reference to the parent (Grafana Agent machine) charm. + +- `relation_name`: The name of the relation that the charm uses to interact + with its clients that provides telemetry data using the `COSAgentProvider` object. + + If provided, this relation name must match a provided relation in metadata.yaml with the + `cos_agent` interface. + The default value of this argument is "cos-agent". + +- `refresh_events`: List of events on which to refresh relation data. + + +### Example 1 - Minimal instrumentation: + +In order to use this object the following should be in the `charm.py` file. + +```python +from charms.grafana_agent.v0.cos_agent import COSAgentConsumer +... +class GrafanaAgentMachineCharm(GrafanaAgentCharm) + def __init__(self, *args): + ... + self._cos = COSAgentRequirer(self) +``` + + +### Example 2 - Full instrumentation: + +In order to use this object the following should be in the `charm.py` file. + +```python +from charms.grafana_agent.v0.cos_agent import COSAgentConsumer +... +class GrafanaAgentMachineCharm(GrafanaAgentCharm) + def __init__(self, *args): + ... 
+ self._cos = COSAgentRequirer( + self, + relation_name="cos-agent-consumer", + refresh_events=["update-status", "upgrade-charm"], + ) +``` +""" + +import base64 +import json +import logging +import lzma +from collections import namedtuple +from itertools import chain +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Dict, List, Optional, Set, Union + +import pydantic +from cosl import JujuTopology +from cosl.rules import AlertRules +from ops.charm import RelationChangedEvent +from ops.framework import EventBase, EventSource, Object, ObjectEvents +from ops.model import Relation, Unit +from ops.testing import CharmType + +if TYPE_CHECKING: + try: + from typing import TypedDict + + class _MetricsEndpointDict(TypedDict): + path: str + port: int + + except ModuleNotFoundError: + _MetricsEndpointDict = Dict # pyright: ignore + +LIBID = "dc15fa84cef84ce58155fb84f6c6213a" +LIBAPI = 0 +LIBPATCH = 6 + +PYDEPS = ["cosl", "pydantic < 2"] + +DEFAULT_RELATION_NAME = "cos-agent" +DEFAULT_PEER_RELATION_NAME = "peers" +DEFAULT_SCRAPE_CONFIG = { + "static_configs": [{"targets": ["localhost:80"]}], + "metrics_path": "/metrics", +} + +logger = logging.getLogger(__name__) +SnapEndpoint = namedtuple("SnapEndpoint", "owner, name") + + +class GrafanaDashboard(str): + """Grafana Dashboard encoded json; lzma-compressed.""" + + # TODO Replace this with a custom type when pydantic v2 released (end of 2023 Q1?) 
+ # https://github.com/pydantic/pydantic/issues/4887 + @staticmethod + def _serialize(raw_json: Union[str, bytes]) -> "GrafanaDashboard": + if not isinstance(raw_json, bytes): + raw_json = raw_json.encode("utf-8") + encoded = base64.b64encode(lzma.compress(raw_json)).decode("utf-8") + return GrafanaDashboard(encoded) + + def _deserialize(self) -> Dict: + try: + raw = lzma.decompress(base64.b64decode(self.encode("utf-8"))).decode() + return json.loads(raw) + except json.decoder.JSONDecodeError as e: + logger.error("Invalid Dashboard format: %s", e) + return {} + + def __repr__(self): + """Return string representation of self.""" + return "" + + +class CosAgentProviderUnitData(pydantic.BaseModel): + """Unit databag model for `cos-agent` relation.""" + + # The following entries are the same for all units of the same principal. + # Note that the same grafana agent subordinate may be related to several apps. + # this needs to make its way to the gagent leader + metrics_alert_rules: dict + log_alert_rules: dict + dashboards: List[GrafanaDashboard] + subordinate: Optional[bool] + + # The following entries may vary across units of the same principal app. + # this data does not need to be forwarded to the gagent leader + metrics_scrape_jobs: List[Dict] + log_slots: List[str] + + # when this whole datastructure is dumped into a databag, it will be nested under this key. + # while not strictly necessary (we could have it 'flattened out' into the databag), + # this simplifies working with the model. + KEY: ClassVar[str] = "config" + + +class CosAgentPeersUnitData(pydantic.BaseModel): + """Unit databag model for `peers` cos-agent machine charm peer relation.""" + + # We need the principal unit name and relation metadata to be able to render identifiers + # (e.g. topology) on the leader side, after all the data moves into peer data (the grafana + # agent leader can only see its own principal, because it is a subordinate charm). 
+ principal_unit_name: str + principal_relation_id: str + principal_relation_name: str + + # The only data that is forwarded to the leader is data that needs to go into the app databags + # of the outgoing o11y relations. + metrics_alert_rules: Optional[dict] + log_alert_rules: Optional[dict] + dashboards: Optional[List[GrafanaDashboard]] + + # when this whole datastructure is dumped into a databag, it will be nested under this key. + # while not strictly necessary (we could have it 'flattened out' into the databag), + # this simplifies working with the model. + KEY: ClassVar[str] = "config" + + @property + def app_name(self) -> str: + """Parse out the app name from the unit name. + + TODO: Switch to using `model_post_init` when pydantic v2 is released? + https://github.com/pydantic/pydantic/issues/1729#issuecomment-1300576214 + """ + return self.principal_unit_name.split("/")[0] + + +class COSAgentProvider(Object): + """Integration endpoint wrapper for the provider side of the cos_agent interface.""" + + def __init__( + self, + charm: CharmType, + relation_name: str = DEFAULT_RELATION_NAME, + metrics_endpoints: Optional[List["_MetricsEndpointDict"]] = None, + metrics_rules_dir: str = "./src/prometheus_alert_rules", + logs_rules_dir: str = "./src/loki_alert_rules", + recurse_rules_dirs: bool = False, + log_slots: Optional[List[str]] = None, + dashboard_dirs: Optional[List[str]] = None, + refresh_events: Optional[List] = None, + *, + scrape_configs: Optional[Union[List[dict], Callable]] = None, + ): + """Create a COSAgentProvider instance. + + Args: + charm: The `CharmBase` instance that is instantiating this object. + relation_name: The name of the relation to communicate over. + metrics_endpoints: List of endpoints in the form [{"path": path, "port": port}, ...]. + This argument is a simplified form of the `scrape_configs`. + The contents of this list will be merged with the contents of `scrape_configs`. 
+ metrics_rules_dir: Directory where the metrics rules are stored. + logs_rules_dir: Directory where the logs rules are stored. + recurse_rules_dirs: Whether to recurse into rule paths. + log_slots: Snap slots to connect to for scraping logs + in the form ["snap-name:slot", ...]. + dashboard_dirs: Directory where the dashboards are stored. + refresh_events: List of events on which to refresh relation data. + scrape_configs: List of standard scrape_configs dicts or a callable + that returns the list in case the configs need to be generated dynamically. + The contents of this list will be merged with the contents of `metrics_endpoints`. + """ + super().__init__(charm, relation_name) + dashboard_dirs = dashboard_dirs or ["./src/grafana_dashboards"] + + self._charm = charm + self._relation_name = relation_name + self._metrics_endpoints = metrics_endpoints or [] + self._scrape_configs = scrape_configs or [] + self._metrics_rules = metrics_rules_dir + self._logs_rules = logs_rules_dir + self._recursive = recurse_rules_dirs + self._log_slots = log_slots or [] + self._dashboard_dirs = dashboard_dirs + self._refresh_events = refresh_events or [self._charm.on.config_changed] + + events = self._charm.on[relation_name] + self.framework.observe(events.relation_joined, self._on_refresh) + self.framework.observe(events.relation_changed, self._on_refresh) + for event in self._refresh_events: + self.framework.observe(event, self._on_refresh) + + def _on_refresh(self, event): + """Trigger the class to update relation data.""" + relations = self._charm.model.relations[self._relation_name] + + for relation in relations: + # Before a principal is related to the grafana-agent subordinate, we'd get + # ModelError: ERROR cannot read relation settings: unit "zk/2": settings not found + # Add a guard to make sure it doesn't happen. + if relation.data and self._charm.unit in relation.data: + # Subordinate relations can communicate only over unit data. 
+ try: + data = CosAgentProviderUnitData( + metrics_alert_rules=self._metrics_alert_rules, + log_alert_rules=self._log_alert_rules, + dashboards=self._dashboards, + metrics_scrape_jobs=self._scrape_jobs, + log_slots=self._log_slots, + subordinate=self._charm.meta.subordinate, + ) + relation.data[self._charm.unit][data.KEY] = data.json() + except ( + pydantic.ValidationError, + json.decoder.JSONDecodeError, + ) as e: + logger.error("Invalid relation data provided: %s", e) + + @property + def _scrape_jobs(self) -> List[Dict]: + """Return a prometheus_scrape-like data structure for jobs. + + The user-supplied `scrape_configs` and the module-level default are copied before + `job_name` is rewritten, so repeated calls do not compound the job name prefix. + + https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config + """ + if callable(self._scrape_configs): + scrape_configs = self._scrape_configs() + else: + # Copy every user scrape_config dict (not only the list), since "job_name" is + # rewritten below and must not leak back into the caller's objects + scrape_configs = [dict(config) for config in self._scrape_configs] + + # Convert "metrics_endpoints" to standard scrape_configs, and add them in + for endpoint in self._metrics_endpoints: + scrape_configs.append( + { + "metrics_path": endpoint["path"], + "static_configs": [{"targets": [f"localhost:{endpoint['port']}"]}], + } + ) + + # Copy the default so the module-level constant is never mutated + scrape_configs = scrape_configs or [dict(DEFAULT_SCRAPE_CONFIG)] + + # Augment job name to include the app name and a unique id (index) + for idx, scrape_config in enumerate(scrape_configs): + scrape_config["job_name"] = "_".join( + [self._charm.app.name, str(idx), scrape_config.get("job_name", "default")] + ) + + return scrape_configs + + @property + def _metrics_alert_rules(self) -> Dict: + """Use (for now) the prometheus_scrape AlertRules to initialize this.""" + alert_rules = AlertRules( + query_type="promql", topology=JujuTopology.from_charm(self._charm) + ) + alert_rules.add_path(self._metrics_rules, recursive=self._recursive) + return alert_rules.as_dict() + + @property + def _log_alert_rules(self) -> Dict: + """Use (for now) the loki_push_api AlertRules to initialize this.""" + alert_rules = 
AlertRules(query_type="logql", topology=JujuTopology.from_charm(self._charm)) + alert_rules.add_path(self._logs_rules, recursive=self._recursive) + return alert_rules.as_dict() + + @property + def _dashboards(self) -> List[GrafanaDashboard]: + dashboards: List[GrafanaDashboard] = [] + for d in self._dashboard_dirs: + for path in Path(d).glob("*"): + dashboard = GrafanaDashboard._serialize(path.read_bytes()) + dashboards.append(dashboard) + return dashboards + + +class COSAgentDataChanged(EventBase): + """Event emitted by `COSAgentRequirer` when relation data changes.""" + + +class COSAgentValidationError(EventBase): + """Event emitted by `COSAgentRequirer` when there is an error in the relation data.""" + + def __init__(self, handle, message: str = ""): + super().__init__(handle) + self.message = message + + def snapshot(self) -> Dict: + """Save COSAgentValidationError source information.""" + return {"message": self.message} + + def restore(self, snapshot): + """Restore COSAgentValidationError source information.""" + self.message = snapshot["message"] + + +class COSAgentRequirerEvents(ObjectEvents): + """`COSAgentRequirer` events.""" + + data_changed = EventSource(COSAgentDataChanged) + validation_error = EventSource(COSAgentValidationError) + + +class MultiplePrincipalsError(Exception): + """Custom exception for when there are multiple principal applications.""" + + pass + + +class COSAgentRequirer(Object): + """Integration endpoint wrapper for the Requirer side of the cos_agent interface.""" + + on = COSAgentRequirerEvents() # pyright: ignore + + def __init__( + self, + charm: CharmType, + *, + relation_name: str = DEFAULT_RELATION_NAME, + peer_relation_name: str = DEFAULT_PEER_RELATION_NAME, + refresh_events: Optional[List[str]] = None, + ): + """Create a COSAgentRequirer instance. + + Args: + charm: The `CharmBase` instance that is instantiating this object. + relation_name: The name of the relation to communicate over. 
+ peer_relation_name: The name of the peer relation to communicate over. + refresh_events: List of events on which to refresh relation data. + """ + super().__init__(charm, relation_name) + self._charm = charm + self._relation_name = relation_name + self._peer_relation_name = peer_relation_name + self._refresh_events = refresh_events or [self._charm.on.config_changed] + + events = self._charm.on[relation_name] + self.framework.observe( + events.relation_joined, self._on_relation_data_changed + ) # TODO: do we need this? + self.framework.observe(events.relation_changed, self._on_relation_data_changed) + for event in self._refresh_events: + self.framework.observe(event, self.trigger_refresh) # pyright: ignore + + # Peer relation events + # A peer relation is needed as it is the only mechanism for exchanging data across + # subordinate units. + # self.framework.observe( + # self.on[self._peer_relation_name].relation_joined, self._on_peer_relation_joined + # ) + peer_events = self._charm.on[peer_relation_name] + self.framework.observe(peer_events.relation_changed, self._on_peer_relation_changed) + + @property + def peer_relation(self) -> Optional["Relation"]: + """Helper function for obtaining the peer relation object. + + Returns: peer relation object + (NOTE: would return None if called too early, e.g. during install). + """ + return self.model.get_relation(self._peer_relation_name) + + def _on_peer_relation_changed(self, _): + # Peer data is used for forwarding data from principal units to the grafana agent + # subordinate leader, for updating the app data of the outgoing o11y relations. + if self._charm.unit.is_leader(): + self.on.data_changed.emit() # pyright: ignore + + def _on_relation_data_changed(self, event: RelationChangedEvent): + # Peer data is the only means of communication between subordinate units. 
+ if not self.peer_relation: + event.defer() + return + + cos_agent_relation = event.relation + if not event.unit or not cos_agent_relation.data.get(event.unit): + return + principal_unit = event.unit + + # Coherence check + units = cos_agent_relation.units + if len(units) > 1: + # should never happen + raise ValueError( + f"unexpected error: subordinate relation {cos_agent_relation} " + f"should have exactly one unit" + ) + + if not (raw := cos_agent_relation.data[principal_unit].get(CosAgentProviderUnitData.KEY)): + return + + if not (provider_data := self._validated_provider_data(raw)): + return + + # Copy data from the principal relation to the peer relation, so the leader could + # follow up. + # Save the originating unit name, so it could be used for topology later on by the leader. + data = CosAgentPeersUnitData( # peer relation databag model + principal_unit_name=event.unit.name, + principal_relation_id=str(event.relation.id), + principal_relation_name=event.relation.name, + metrics_alert_rules=provider_data.metrics_alert_rules, + log_alert_rules=provider_data.log_alert_rules, + dashboards=provider_data.dashboards, + ) + self.peer_relation.data[self._charm.unit][ + f"{CosAgentPeersUnitData.KEY}-{event.unit.name}" + ] = data.json() + + # We can't easily tell if the data that was changed is limited to only the data + # that goes into peer relation (in which case, if this is not a leader unit, we wouldn't + # need to emit `on.data_changed`), so we're emitting `on.data_changed` either way. 
+ self.on.data_changed.emit() # pyright: ignore + + def _validated_provider_data(self, raw) -> Optional[CosAgentProviderUnitData]: + try: + return CosAgentProviderUnitData(**json.loads(raw)) + except (pydantic.ValidationError, json.decoder.JSONDecodeError) as e: + self.on.validation_error.emit(message=str(e)) # pyright: ignore + return None + + def trigger_refresh(self, _): + """Trigger a refresh of relation data.""" + # FIXME: Figure out what we should do here + self.on.data_changed.emit() # pyright: ignore + + @property + def _principal_unit(self) -> Optional[Unit]: + """Return the principal unit for a relation. + + Assumes that the relation is of type subordinate. + Relies on the fact that, for subordinate relations, the only remote unit visible to + *this unit* is the principal unit that this unit is attached to. + """ + if relations := self._principal_relations: + # Technically it's a list, but for subordinates there can only be one relation + principal_relation = next(iter(relations)) + if units := principal_relation.units: + # Technically it's a list, but for subordinates there can only be one + return next(iter(units)) + + return None + + @property + def _principal_relations(self): + """Return the relations whose remote unit does not flag itself as a subordinate. + + Raises: + MultiplePrincipalsError: if more than one such relation is found. + """ + relations = [] + for relation in self._charm.model.relations[self._relation_name]: + # The lookup key must be the plain string; a list is unhashable and makes + # dict.get() raise TypeError. + if not json.loads(relation.data[next(iter(relation.units))]["config"]).get( + "subordinate", False + ): + relations.append(relation) + if len(relations) > 1: + logger.error( + "Multiple applications claiming to be principal. Update the cos-agent library in the client application charms." + ) + raise MultiplePrincipalsError("Multiple principal applications.") + return relations + + @property + def _remote_data(self) -> List[CosAgentProviderUnitData]: + """Return a list of remote data from each of the related units. + + Assumes that the relation is of type subordinate. 
+ Relies on the fact that, for subordinate relations, the only remote unit visible to + *this unit* is the principal unit that this unit is attached to. + """ + all_data = [] + + for relation in self._charm.model.relations[self._relation_name]: + if not relation.units: + continue + unit = next(iter(relation.units)) + if not (raw := relation.data[unit].get(CosAgentProviderUnitData.KEY)): + continue + if not (provider_data := self._validated_provider_data(raw)): + continue + all_data.append(provider_data) + + return all_data + + def _gather_peer_data(self) -> List[CosAgentPeersUnitData]: + """Collect data from the peers. + + Returns a trimmed-down list of CosAgentPeersUnitData. + """ + relation = self.peer_relation + + # Ensure that whatever context we're running this in, we take the necessary precautions: + if not relation or not relation.data or not relation.app: + return [] + + # Iterate over all peer unit data and only collect every principal once. + peer_data: List[CosAgentPeersUnitData] = [] + app_names: Set[str] = set() + + for unit in chain((self._charm.unit,), relation.units): + if not relation.data.get(unit): + continue + + for unit_name in relation.data.get(unit): # pyright: ignore + if not unit_name.startswith(CosAgentPeersUnitData.KEY): + continue + raw = relation.data[unit].get(unit_name) + if raw is None: + continue + data = CosAgentPeersUnitData(**json.loads(raw)) + # Have we already seen this principal app? + if (app_name := data.app_name) in app_names: + continue + peer_data.append(data) + app_names.add(app_name) + + return peer_data + + @property + def metrics_alerts(self) -> Dict[str, Any]: + """Fetch metrics alerts.""" + alert_rules = {} + + seen_apps: List[str] = [] + for data in self._gather_peer_data(): + if rules := data.metrics_alert_rules: + app_name = data.app_name + if app_name in seen_apps: + continue # dedup! 
+ seen_apps.append(app_name) + # This is only used for naming the file, so be as specific as we can be + identifier = JujuTopology( + model=self._charm.model.name, + model_uuid=self._charm.model.uuid, + application=app_name, + # For the topology unit, we could use `data.principal_unit_name`, but that unit + # name may not be very stable: `_gather_peer_data` de-duplicates by app name so + # the exact unit name that turns up first in the iterator may vary from time to + # time. So using the grafana-agent unit name instead. + unit=self._charm.unit.name, + ).identifier + + alert_rules[identifier] = rules + + return alert_rules + + @property + def metrics_jobs(self) -> List[Dict]: + """Parse the relation data contents and extract the metrics jobs.""" + scrape_jobs = [] + for data in self._remote_data: + for job in data.metrics_scrape_jobs: + # In #220, relation schema changed from a simplified dict to the standard + # `scrape_configs`. + # This is to ensure backwards compatibility with Providers older than v0.5. + if "path" in job and "port" in job and "job_name" in job: + job = { + "job_name": job["job_name"], + "metrics_path": job["path"], + "static_configs": [{"targets": [f"localhost:{job['port']}"]}], + } + + scrape_jobs.append(job) + + return scrape_jobs + + @property + def snap_log_endpoints(self) -> List[SnapEndpoint]: + """Fetch logging endpoints exposed by related snaps.""" + plugs = [] + for data in self._remote_data: + targets = data.log_slots + if targets: + for target in targets: + if target in plugs: + logger.warning( + f"plug {target} already listed. " + "The same snap is being passed from multiple " + "endpoints; this should not happen." + ) + else: + plugs.append(target) + + endpoints = [] + for plug in plugs: + if ":" not in plug: + logger.error(f"invalid plug definition received: {plug}. 
Ignoring...") + else: + endpoint = SnapEndpoint(*plug.split(":")) + endpoints.append(endpoint) + return endpoints + + @property + def logs_alerts(self) -> Dict[str, Any]: + """Fetch log alerts.""" + alert_rules = {} + seen_apps: List[str] = [] + + for data in self._gather_peer_data(): + if rules := data.log_alert_rules: + # This is only used for naming the file, so be as specific as we can be + app_name = data.app_name + if app_name in seen_apps: + continue # dedup! + seen_apps.append(app_name) + + identifier = JujuTopology( + model=self._charm.model.name, + model_uuid=self._charm.model.uuid, + application=app_name, + # For the topology unit, we could use `data.principal_unit_name`, but that unit + # name may not be very stable: `_gather_peer_data` de-duplicates by app name so + # the exact unit name that turns up first in the iterator may vary from time to + # time. So using the grafana-agent unit name instead. + unit=self._charm.unit.name, + ).identifier + + alert_rules[identifier] = rules + + return alert_rules + + @property + def dashboards(self) -> List[Dict[str, str]]: + """Fetch dashboards as encoded content. + + Dashboards are assumed not to vary across units of the same primary. + """ + dashboards: List[Dict[str, Any]] = [] + + seen_apps: List[str] = [] + for data in self._gather_peer_data(): + app_name = data.app_name + if app_name in seen_apps: + continue # dedup! 
+ seen_apps.append(app_name) + + for encoded_dashboard in data.dashboards or (): + content = GrafanaDashboard(encoded_dashboard)._deserialize() + + title = content.get("title", "no_title") + + dashboards.append( + { + "relation_id": data.principal_relation_id, + # We have the remote charm name - use it for the identifier + "charm": f"{data.principal_relation_name}-{app_name}", + "content": content, + "title": title, + } + ) + + return dashboards diff --git a/metadata.yaml b/metadata.yaml index 3fccf485..dd4d182e 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -20,3 +20,7 @@ description: | the reconciliation interval and the number of runners to maintain are configurable. series: - jammy + +provides: + cos-agent: + interface: cos_agent diff --git a/requirements.txt b/requirements.txt index ede5e44b..0bc10fe4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,5 @@ requests typing-extensions # Newer version does not work with default OpenSSL version on jammy. cryptography <= 38.0.4 +pydantic == 1.10.3 +cosl == 0.0.7 diff --git a/src-docs/charm.py.md b/src-docs/charm.py.md index 908394c1..212c27af 100644 --- a/src-docs/charm.py.md +++ b/src-docs/charm.py.md @@ -8,7 +8,7 @@ Charm for creating and managing GitHub self-hosted runner instances. --- - + ## function `catch_charm_errors` @@ -34,7 +34,7 @@ Catch common errors in charm. --- - + ## function `catch_action_errors` @@ -63,7 +63,7 @@ Catch common errors in actions. ## class `GithubRunnerCharm` Charm for managing GitHub self-hosted runners. - + ### function `__init__` diff --git a/src-docs/charm_state.py.md b/src-docs/charm_state.py.md new file mode 100644 index 00000000..a482c7cc --- /dev/null +++ b/src-docs/charm_state.py.md @@ -0,0 +1,40 @@ + + + + +# module `charm_state.py` +State of the Charm. + +**Global Variables** +--------------- +- **COS_AGENT_INTEGRATION_NAME** + + +--- + +## class `State` +The charm state. + +Attrs: is_metrics_logging_available: Whether the cos-agent integration has at least one relation.
+ + + + +--- + + + +### classmethod `from_charm` + +```python +from_charm(charm: CharmBase) → State +``` + +Initialize the state from charm. + + + +**Returns:** + Current state of the charm. + + diff --git a/src-docs/cos.py.md b/src-docs/cos.py.md new file mode 100644 index 00000000..3b18e5cd --- /dev/null +++ b/src-docs/cos.py.md @@ -0,0 +1,237 @@ + + + + +# module `cos.py` +The COS integration observer. + +**Global Variables** +--------------- +- **METRICS_LOGGING_INTEGRATION_NAME** +- **PROMTAIL_HEALTH_CHECK_INTERVAL_MINUTES** + + +--- + +## class `LokiEndpoint` +Information about the Loki endpoint. + +Attrs: url: The URL of the Loki endpoint. + + +--- + +#### property model_computed_fields + +Get the computed fields of this model instance. + + + +**Returns:** + A dictionary of computed field names and their corresponding `ComputedFieldInfo` objects. + +--- + +#### property model_extra + +Get extra fields set during validation. + + + +**Returns:** + A dictionary of extra fields, or `None` if `config.extra` is not set to `"allow"`. + +--- + +#### property model_fields_set + +Returns the set of fields that have been set on this model instance. + + + +**Returns:** + A set of strings representing the fields that have been set, i.e. that were not filled from defaults. + + + + +--- + +## class `LokiIntegrationData` +Represents Loki integration data. + +Attrs: endpoints: The Loki endpoints. promtail_binaries: The Promtail binaries. + + +--- + +#### property model_computed_fields + +Get the computed fields of this model instance. + + + +**Returns:** + A dictionary of computed field names and their corresponding `ComputedFieldInfo` objects. + +--- + +#### property model_extra + +Get extra fields set during validation. + + + +**Returns:** + A dictionary of extra fields, or `None` if `config.extra` is not set to `"allow"`. + +--- + +#### property model_fields_set + +Returns the set of fields that have been set on this model instance. 
+ + + +**Returns:** + A set of strings representing the fields that have been set, i.e. that were not filled from defaults. + + + + +--- + +## class `LokiIntegrationDataIncompleteError` +Indicates an error if the Loki integration data is not complete for Promtail startup. + + + +### function `__init__` + +```python +__init__(msg: str) +``` + +Initialize a new instance of the LokiIntegrationDataNotComplete exception. + + + +**Args:** + + - `msg`: Explanation of the error. + + + + + +--- + +## class `Observer` +COS integration observer. + + + +### function `__init__` + +```python +__init__(charm: CharmBase, state: State) +``` + +Initialize the COS observer and register event handlers. + + + +**Args:** + + - `charm`: The parent charm to attach the observer to. + - `state`: The charm state. + + +--- + +#### property model + +Shortcut for more simple access the model. + + + +--- + + + +### function `metrics_logging_available` + +```python +metrics_logging_available() → bool +``` + +Check that the metrics logging integration is set up correctly. + + + +**Returns:** + True if the integration is established, False otherwise. + + +--- + +## class `PromtailBinary` +Information about the Promtail binary. + +Attrs: url: The URL to download the Promtail binary from. zipsha: The SHA256 hash of the Promtail zip file. binsha: The SHA256 hash of the Promtail binary. + + +--- + +#### property model_computed_fields + +Get the computed fields of this model instance. + + + +**Returns:** + A dictionary of computed field names and their corresponding `ComputedFieldInfo` objects. + +--- + +#### property model_extra + +Get extra fields set during validation. + + + +**Returns:** + A dictionary of extra fields, or `None` if `config.extra` is not set to `"allow"`. + +--- + +#### property model_fields_set + +Returns the set of fields that have been set on this model instance. + + + +**Returns:** + A set of strings representing the fields that have been set, i.e. 
that were not filled from defaults. + + + + +--- + +## class `PromtailHealthCheckEvent` +Event representing a periodic check to ensure Promtail is running. + + + + + +--- + +## class `PromtailNotRunningError` +Indicates an error if Promtail is not running. + + + + + diff --git a/src-docs/errors.py.md b/src-docs/errors.py.md index d9c24dbd..b06eb9f1 100644 --- a/src-docs/errors.py.md +++ b/src-docs/errors.py.md @@ -7,6 +7,15 @@ Errors used by the charm. +--- + +## class `LogrotateSetupError` +Error raised when logrotate cannot be setup. + + + + + --- ## class `LxdError` diff --git a/src-docs/metrics.py.md b/src-docs/metrics.py.md new file mode 100644 index 00000000..4d97d266 --- /dev/null +++ b/src-docs/metrics.py.md @@ -0,0 +1,112 @@ + + + + +# module `metrics.py` +Models and functions for the metric events. + +**Global Variables** +--------------- +- **LOG_ROTATE_TIMER_SYSTEMD_SERVICE** +- **SYSTEMCTL_PATH** + +--- + + + +## function `issue_event` + +```python +issue_event(event: Event) → None +``` + +Issue a metric event. + +The metric event is logged to the metrics log. + + + +**Args:** + + - `event`: The metric event to log. + +**Raises:** + + - `OSError`: If an error occurs while writing the metrics log. + + +--- + + + +## function `setup_logrotate` + +```python +setup_logrotate() +``` + +Configure logrotate for the metrics log. + + + +**Raises:** + + - `LogrotateSetupError`: If the logrotate.timer cannot be enabled. + + +--- + +## class `Event` +Base class for metric events. + +Attrs: timestamp: The UNIX time stamp of the time at which the event was originally issued. event: The name of the event. Will be set to the class name in snake case if not provided. + + + +### function `__init__` + +```python +__init__(*args, **kwargs) +``` + +Initialize the event. + + + +**Args:** + + - `*args`: The positional arguments to pass to the base class. + - `**kwargs`: The keyword arguments to pass to the base class. These are used to set the specific fields. E.g. 
timestamp=12345 will set the timestamp field to 12345. + + + + + +--- + +## class `RunnerInstalled` +Metric event for when a runner is installed. + +Attrs: flavor: Describes the characteristics of the runner. The flavour could be for example "small". duration: The duration of the installation in seconds. + + + +### function `__init__` + +```python +__init__(*args, **kwargs) +``` + +Initialize the event. + + + +**Args:** + + - `*args`: The positional arguments to pass to the base class. + - `**kwargs`: The keyword arguments to pass to the base class. These are used to set the specific fields. E.g. timestamp=12345 will set the timestamp field to 12345. + + + + + diff --git a/src-docs/promtail.py.md b/src-docs/promtail.py.md new file mode 100644 index 00000000..10ea70f8 --- /dev/null +++ b/src-docs/promtail.py.md @@ -0,0 +1,128 @@ + + + + +# module `promtail.py` +Functions for operating Promtail. + +**Global Variables** +--------------- +- **PROMTAIL_BASE_URL** +- **SYSTEMCTL_PATH_STR** +- **PROMTAIL_BINARY_FILE_MODE** +- **JINJA2_TEMPLATE_PATH** + +--- + + + +## function `setup` + +```python +setup(config: Config) → None +``` + +Set up Promtail. + +Installs, configures and starts Promtail. + +If Promtail has not already been installed, it will be installed and configured to send logs to Loki. If Promtail is already running, it will be reconfigured and restarted. + + + +**Args:** + + - `config`: The configuration for Promtail. + + +--- + + + +## function `restart` + +```python +restart() → None +``` + +Restart Promtail. + + +--- + + + +## function `stop` + +```python +stop() → None +``` + +Stop Promtail. + + +--- + + + +## function `is_running` + +```python +is_running() → bool +``` + +Check if Promtail is running. + + + +**Returns:** + True if Promtail is running, False otherwise. + + +--- + +## class `Config` +Configuration options for Promtail. + +Attrs: loki_endpoint: The Loki endpoint to send logs to. proxies: Proxy settings. 
promtail_download_info: Information about the Promtail download. + + + + + +--- + +## class `PromtailDownloadInfo` +Information about the Promtail download. + +Attrs: url: The URL to download Promtail from. zip_sha256: The SHA256 hash of the Promtail zip file. bin_sha256: The SHA256 hash of the Promtail binary. + + + + + +--- + +## class `PromtailInstallationError` +Represents an error during installation of Promtail. + + + +### function `__init__` + +```python +__init__(msg: str) +``` + +Initialize a new instance of the PromtailInstallationError exception. + + + +**Args:** + + - `msg`: Explanation of the error. + + + + + diff --git a/src-docs/runner_manager.py.md b/src-docs/runner_manager.py.md index 247511f1..a6ff8201 100644 --- a/src-docs/runner_manager.py.md +++ b/src-docs/runner_manager.py.md @@ -23,7 +23,7 @@ Used as a returned type to method querying runner information. ## class `RunnerManager` Manage a group of runners according to configuration. - + ### function `__init__` @@ -32,7 +32,7 @@ __init__( app_name: str, unit: int, runner_manager_config: RunnerManagerConfig, - proxies: ProxySetting = {} + proxies: Optional[ProxySetting] = None ) → None ``` @@ -52,7 +52,7 @@ Construct RunnerManager object for creating and managing runners. --- - + ### function `check_runner_bin` @@ -69,7 +69,7 @@ Check if runner binary exists. --- - + ### function `flush` @@ -92,7 +92,7 @@ Remove existing runners. --- - + ### function `get_github_info` @@ -109,7 +109,7 @@ Get information on the runners from GitHub. --- - + ### function `get_latest_runner_bin_url` @@ -138,7 +138,7 @@ The runner binary URL changes when a new version is available. --- - + ### function `reconcile` @@ -162,7 +162,7 @@ Bring runners in line with target. --- - + ### function `update_runner_bin` @@ -186,7 +186,7 @@ Remove the existing runner binary to prevent it from being used. This is done to ## class `RunnerManagerConfig` Configuration of runner manager. 
-Attrs: path: GitHub repository path in the format '/', or the GitHub organization name. token: GitHub personal access token to register runner to the repository or organization. image: Name of the image for creating LXD instance. service_token: Token for accessing local service. lxd_storage_path: Path to be used as LXD storage. +Attrs: path: GitHub repository path in the format '/', or the GitHub organization name. token: GitHub personal access token to register runner to the repository or organization. image: Name of the image for creating LXD instance. service_token: Token for accessing local service. lxd_storage_path: Path to be used as LXD storage. issue_metrics: Whether to issue metrics. diff --git a/src-docs/utilities.py.md b/src-docs/utilities.py.md index dd447382..e5329ed2 100644 --- a/src-docs/utilities.py.md +++ b/src-docs/utilities.py.md @@ -106,9 +106,15 @@ The command is executed with `subprocess.run`, additional arguments can be passe Output on stdout, and the exit code. + +**Raises:** + + - `SubprocessError`: If `check_exit` is set and the exit code is non-zero. + + --- - + ## function `get_env_var` @@ -134,7 +140,7 @@ Looks for all upper-case and all low-case of the `env_var`. --- - + ## function `set_env_var` @@ -156,7 +162,7 @@ Set the all upper case and all low case of the `env_var`. 
--- - + ## function `bytes_with_unit_to_kib` diff --git a/src/charm.py b/src/charm.py index 95299b67..e75e0cd9 100755 --- a/src/charm.py +++ b/src/charm.py @@ -15,6 +15,7 @@ from typing import Any, Callable, Dict, Optional, Sequence, TypeVar import jinja2 +from charms.grafana_agent.v0.cos_agent import COSAgentProvider from ops.charm import ( ActionEvent, CharmBase, @@ -28,7 +29,10 @@ from ops.main import main from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus +import metrics +from charm_state import State from errors import ( + LogrotateSetupError, MissingConfigurationError, MissingRunnerBinaryError, RunnerBinaryError, @@ -130,6 +134,10 @@ def __init__(self, *args, **kargs) -> None: class. """ super().__init__(*args, **kargs) + + self._grafana_agent = COSAgentProvider(self) + self._state = State.from_charm(self) + if LXD_PROFILE_YAML.exists(): if self.config.get("test-mode") != "insecure": raise RuntimeError("lxd-profile.yaml detected outside test mode") @@ -271,7 +279,14 @@ def _get_runner_manager( return RunnerManager( app_name, unit, - RunnerManagerConfig(path, token, "jammy", self.service_token, self.ram_pool_path), + RunnerManagerConfig( + path=path, + token=token, + image="jammy", + service_token=self.service_token, + lxd_storage_path=self.ram_pool_path, + charm_state=self._state, + ), proxies=self.proxies, ) @@ -290,10 +305,15 @@ def _on_install(self, _event: InstallEvent) -> None: # The `_start_services`, `_install_deps` includes retry. self._install_deps() self._start_services() - except SubprocessError as err: + metrics.setup_logrotate() + except (LogrotateSetupError, SubprocessError) as err: logger.exception(err) - # The charm cannot proceed without dependencies. 
- self.unit.status = BlockedStatus("Failed to install dependencies") + + if isinstance(err, LogrotateSetupError): + msg = "Failed to setup logrotate" + else: + msg = "Failed to install dependencies" + self.unit.status = BlockedStatus(msg) return self._refresh_firewall() @@ -368,10 +388,15 @@ def _on_upgrade_charm(self, _event: UpgradeCharmEvent) -> None: # The `_start_services`, `_install_deps` includes retry. self._install_deps() self._start_services() - except SubprocessError as err: + metrics.setup_logrotate() + except (LogrotateSetupError, SubprocessError) as err: logger.exception(err) - # The charm cannot proceed without dependencies. - self.unit.status = BlockedStatus("Failed to install dependencies") + + if isinstance(err, LogrotateSetupError): + msg = "Failed to setup logrotate" + else: + msg = "Failed to install dependencies" + self.unit.status = BlockedStatus(msg) return self._refresh_firewall() diff --git a/src/charm_state.py b/src/charm_state.py new file mode 100644 index 00000000..cc6fab70 --- /dev/null +++ b/src/charm_state.py @@ -0,0 +1,36 @@ +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. + +"""State of the Charm.""" + +import dataclasses +import logging + +from ops import CharmBase + +logger = logging.getLogger(__name__) + +COS_AGENT_INTEGRATION_NAME = "cos-agent" + + +@dataclasses.dataclass(frozen=True) +class State: + """The charm state. + + Attrs: + is_metrics_logging_available: Whether the cos-agent integration has at + least one relation. + """ + + is_metrics_logging_available: bool + + @classmethod + def from_charm(cls, charm: CharmBase) -> "State": + """Initialize the state from charm. + + Returns: + Current state of the charm.
+ """ + return cls( + is_metrics_logging_available=bool(charm.model.relations[COS_AGENT_INTEGRATION_NAME]) + ) diff --git a/src/errors.py b/src/errors.py index 3657a51f..7222508b 100644 --- a/src/errors.py +++ b/src/errors.py @@ -92,3 +92,7 @@ def __init__( self.return_code = return_code self.stdout = stdout self.stderr = stderr + + +class LogrotateSetupError(Exception): + """Error raised when logrotate cannot be setup.""" diff --git a/src/grafana_dashboard_metrics/metrics.json b/src/grafana_dashboard_metrics/metrics.json new file mode 100644 index 00000000..a2d42ff6 --- /dev/null +++ b/src/grafana_dashboard_metrics/metrics.json @@ -0,0 +1,257 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 1, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "${lokids}" + }, + "description": "All aggregations are based on a 1-hour time period.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null 
+ }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${lokids}" + }, + "editorMode": "builder", + "expr": "avg_over_time({filename=\"/var/log/github-runner-metrics.log\"} | json event=\"event\", duration=\"duration\" | event = `runner_installed` | unwrap duration [1h])", + "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", + "legendFormat": "Average", + "queryType": "range", + "refId": "A" + }, + { + "datasource": { + "type": "loki", + "uid": "${lokids}" + }, + "editorMode": "builder", + "expr": "max_over_time({filename=\"/var/log/github-runner-metrics.log\"} | json event=\"event\", duration=\"duration\" | event = `runner_installed` | unwrap duration [1h])", + "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", + "legendFormat": "Max", + "queryType": "range", + "refId": "B" + }, + { + "datasource": { + "type": "loki", + "uid": "${lokids}" + }, + "editorMode": "builder", + "expr": "min_over_time({filename=\"/var/log/github-runner-metrics.log\"} | json event=\"event\", duration=\"duration\" | event = `runner_installed` | unwrap duration [1h])", + "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", + "legendFormat": "Min", + "queryType": "range", + "refId": "C" + } + ], + "title": "Runner Installation Duration in seconds", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "${lokids}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false,
+ "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${lokids}" + }, + "editorMode": "code", + "expr": "count_over_time({filename=\"/var/log/github-runner-metrics.log\"} | json event=\"event\" | event = `runner_installed` [$__range])", + "key": "Q-e82ee7ed-0742-4a37-a485-636e69760962-0", + "legendFormat": "Total Started Jobs", + "queryType": "range", + "refId": "A" + } + ], + "title": "Total Started Jobs", + "type": "timeseries" + } + ], + "schemaVersion": 37, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "GitHub Self-Hosted Runner Metrics", + "version": 0, + "weekStart": "" +} \ No newline at end of file diff --git a/src/metrics.py b/src/metrics.py new file mode 100644 index 00000000..e86ebe65 --- /dev/null +++ b/src/metrics.py @@ -0,0 +1,137 @@ +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details.
+ +"""Models and functions for the metric events.""" +import logging +from pathlib import Path + +from pydantic import BaseModel, NonNegativeInt + +from errors import LogrotateSetupError, SubprocessError +from utilities import execute_command + +LOG_ROTATE_TIMER_SYSTEMD_SERVICE = "logrotate.timer" + +SYSTEMCTL_PATH = "/usr/bin/systemctl" + +LOGROTATE_CONFIG = Path("/etc/logrotate.d/github-runner-metrics") +METRICS_LOG_PATH = Path("/var/log/github-runner-metrics.log") + + +logger = logging.getLogger(__name__) + + +class Event(BaseModel): + """Base class for metric events. + + Attrs: + timestamp: The UNIX time stamp of the time at which the event was originally issued. + event: The name of the event. Will be set to the class name in snake case if not provided. + """ + + timestamp: NonNegativeInt + event: str + + @staticmethod + def _camel_to_snake(camel_case_string: str) -> str: + """Convert a camel case string to snake case. + + Args: + camel_case_string: The string to convert. + Returns: + The converted string. + """ + snake_case_string = camel_case_string[0].lower() + for char in camel_case_string[1:]: + if char.isupper(): + snake_case_string += "_" + char.lower() + else: + snake_case_string += char + return snake_case_string + + def __init__(self, *args, **kwargs): + """Initialize the event. + + Args: + *args: The positional arguments to pass to the base class. + **kwargs: The keyword arguments to pass to the base class. These are used to set the + specific fields. E.g. timestamp=12345 will set the timestamp field to 12345. + """ + if "event" not in kwargs: + event = self._camel_to_snake(self.__class__.__name__) + kwargs["event"] = event + super().__init__(*args, **kwargs) + + +class RunnerInstalled(Event): + """Metric event for when a runner is installed. + + Attrs: + flavor: Describes the characteristics of the runner. + The flavour could be for example "small". + duration: The duration of the installation in seconds. 
+ """ + + flavor: str + duration: NonNegativeInt + + +def issue_event(event: Event) -> None: + """Issue a metric event. + + The metric event is logged to the metrics log. + + Args: + event: The metric event to log. + Raises: + OSError: If an error occurs while writing the metrics log. + """ + with METRICS_LOG_PATH.open(mode="a", encoding="utf-8") as metrics_file: + metrics_file.write(f"{event.json()}\n") + + +def _enable_logrotate() -> None: + """Enable and start the logrotate timer if it is not active. + + Raises: + SubprocessError: If the logrotate.timer cannot be enabled and started. + """ + execute_command([SYSTEMCTL_PATH, "enable", LOG_ROTATE_TIMER_SYSTEMD_SERVICE], check_exit=True) + + _, retcode = execute_command( + [SYSTEMCTL_PATH, "is-active", "--quiet", LOG_ROTATE_TIMER_SYSTEMD_SERVICE] + ) + if retcode != 0: + execute_command( + [SYSTEMCTL_PATH, "start", LOG_ROTATE_TIMER_SYSTEMD_SERVICE], check_exit=True + ) + + +def _configure_logrotate() -> None: + """Configure logrotate for the metrics log.""" + # Set rotate to 0 to not keep the old metrics log file to avoid sending the + # metrics to Loki twice, which may happen if there is a corrupt log scrape configuration. + LOGROTATE_CONFIG.write_text( + f"""{str(METRICS_LOG_PATH)} {{ + rotate 0 + missingok + notifempty + create +}} +""", + encoding="utf-8", + ) + + +def setup_logrotate(): + """Configure logrotate for the metrics log. + + Raises: + LogrotateSetupError: If the logrotate.timer cannot be enabled. 
+ """ + _configure_logrotate() + + try: + _enable_logrotate() + except SubprocessError as error: + raise LogrotateSetupError() from error diff --git a/src/runner_manager.py b/src/runner_manager.py index 0a392264..659bbf74 100644 --- a/src/runner_manager.py +++ b/src/runner_manager.py @@ -6,6 +6,7 @@ import hashlib import logging import tarfile +import time import urllib.request import uuid from dataclasses import dataclass @@ -21,6 +22,8 @@ from ghapi.page import pages from typing_extensions import assert_never +import metrics +from charm_state import State as CharmState from errors import RunnerBinaryError, RunnerCreateError from github_type import ( GitHubRunnerStatus, @@ -58,6 +61,7 @@ class RunnerManagerConfig: image: Name of the image for creating LXD instance. service_token: Token for accessing local service. lxd_storage_path: Path to be used as LXD storage. + issue_metrics: Whether to issue metrics. """ path: GitHubPath @@ -65,6 +69,7 @@ class RunnerManagerConfig: image: str service_token: str lxd_storage_path: Path + charm_state: CharmState @dataclass @@ -84,12 +89,12 @@ class RunnerManager: runner_bin_path = Path("/home/ubuntu/github-runner-app") - def __init__( + def __init__( # pylint: disable=too-many-arguments self, app_name: str, unit: int, runner_manager_config: RunnerManagerConfig, - proxies: ProxySetting = ProxySetting(), + proxies: Optional[ProxySetting] = None, ) -> None: """Construct RunnerManager object for creating and managing runners. @@ -102,7 +107,7 @@ def __init__( self.app_name = app_name self.instance_name = f"{app_name}-{unit}" self.config = runner_manager_config - self.proxies = proxies + self.proxies = proxies if proxies else ProxySetting() # Setting the env var to this process and any child process spawned. 
if "no_proxy" in self.proxies: @@ -290,6 +295,45 @@ def _get_runner_health_states(self) -> RunnerByHealth: return RunnerByHealth(healthy, unhealthy) + def _create_runner( + self, registration_token: str, resources: VirtualMachineResources, runner: Runner + ): + """Create a runner. + + Issues RunnerInstalled metric if metrics_logging is enabled. + + Args: + registration_token: Token for registering runner to GitHub. + resources: Configuration of the virtual machine resources. + runner: Runner to be created. + """ + if self.config.charm_state.is_metrics_logging_available: + ts_now = time.time() + runner.create( + self.config.image, + resources, + RunnerManager.runner_bin_path, + registration_token, + ) + ts_after = time.time() + try: + metrics.issue_event( + event=metrics.RunnerInstalled( + timestamp=ts_after, + flavor=self.app_name, + duration=ts_after - ts_now, + ), + ) + except OSError: + logger.exception("Failed to issue metrics") + else: + runner.create( + self.config.image, + resources, + RunnerManager.runner_bin_path, + registration_token, + ) + def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int: """Bring runners in line with target. @@ -357,12 +401,7 @@ def reconcile(self, quantity: int, resources: VirtualMachineResources) -> int: ) runner = Runner(self._clients, config, RunnerStatus()) try: - runner.create( - self.config.image, - resources, - RunnerManager.runner_bin_path, - registration_token, - ) + self._create_runner(registration_token, resources, runner) logger.info("Created runner: %s", runner.config.name) except RunnerCreateError: logger.error("Unable to create runner: %s", runner.config.name) diff --git a/src/utilities.py b/src/utilities.py index 1cb2e3c3..b8b0b3e0 100644 --- a/src/utilities.py +++ b/src/utilities.py @@ -146,6 +146,9 @@ def execute_command(cmd: Sequence[str], check_exit: bool = True, **kwargs) -> tu Returns: Output on stdout, and the exit code. 
+ + Raises: + SubprocessError: If `check_exit` is set and the exit code is non-zero. """ result = secure_run_subprocess(cmd, **kwargs) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index dfe5c4c0..450ce8cd 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -15,7 +15,7 @@ from juju.model import Model from pytest_operator.plugin import OpsTest -from tests.integration.helpers import wait_till_num_of_runners +from tests.integration.helpers import create_runner from tests.status_name import ACTIVE_STATUS_NAME @@ -146,11 +146,7 @@ async def app_no_runner( https_proxy: str, no_proxy: str, ) -> AsyncIterator[Application]: - """Application with no token. - - Test should ensure it returns with the application having no token and no - runner. - """ + """Application with no runner.""" subprocess.run(["sudo", "modprobe", "br_netfilter"]) await model.set_config( @@ -176,7 +172,7 @@ async def app_no_runner( "reconcile-interval": 60, }, ) - await model.wait_for_idle() + await model.wait_for_idle(timeout=60 * 30) yield application @@ -188,15 +184,7 @@ async def app(model: Model, app_no_runner: Application) -> AsyncIterator[Applica Test should ensure it returns with the application in a good state and has one runner. """ - unit = app_no_runner.units[0] - - await app_no_runner.set_config({"virtual-machines": "1"}) - action = await unit.run_action("reconcile-runners") - await action.wait() - await model.wait_for_idle(status=ACTIVE_STATUS_NAME) - - # Wait until there is one runner. 
- await wait_till_num_of_runners(unit, 1) + await create_runner(app=app_no_runner, model=model) yield app_no_runner diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 0e58ac96..d97e0ccc 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -9,10 +9,13 @@ import juju.version import yaml +from juju.application import Application +from juju.model import Model from juju.unit import Unit from runner import Runner from runner_manager import RunnerManager +from tests.status_name import ACTIVE_STATUS_NAME from utilities import retry @@ -236,3 +239,18 @@ async def start_test_http_server(unit: Unit, port: int): await sleep(3) else: assert False, "Timeout waiting for HTTP server to start up" + + +async def create_runner(app: Application, model: Model) -> None: + """Let the charm create a runner. + + Args: + app: The GitHub Runner Charm app to create the runner for. + model: The machine charm model. + """ + await app.set_config({"virtual-machines": "1"}) + unit = app.units[0] + action = await unit.run_action("reconcile-runners") + await action.wait() + await model.wait_for_idle(apps=[app.name], status=ACTIVE_STATUS_NAME) + await wait_till_num_of_runners(unit, 1) diff --git a/tests/integration/test_charm_fork_repo.py b/tests/integration/test_charm_fork_repo.py index cf7e47c5..9e3c64ff 100644 --- a/tests/integration/test_charm_fork_repo.py +++ b/tests/integration/test_charm_fork_repo.py @@ -20,7 +20,7 @@ from juju.application import Application from juju.model import Model -from tests.integration.helpers import get_runner_names, wait_till_num_of_runners +from tests.integration.helpers import create_runner, get_runner_names from tests.status_name import ACTIVE_STATUS_NAME DISPATCH_TEST_WORKFLOW_FILENAME = "workflow_dispatch_test.yaml" @@ -142,19 +142,12 @@ async def app_with_unsigned_commit_repo( Test should ensure it returns with the application in a good state and has one runner. 
""" - unit = app_no_runner.units[0] + app = app_no_runner # alias for readability as the app will have a runner during the test - await app_no_runner.set_config( - {"virtual-machines": "1", "path": forked_github_repository.full_name} - ) - action = await unit.run_action("reconcile-runners") - await action.wait() - await model.wait_for_idle(status=ACTIVE_STATUS_NAME) - - # Wait until there is one runner. - await wait_till_num_of_runners(unit, 1) + await app.set_config({"path": forked_github_repository.full_name}) + await create_runner(app=app, model=model) - yield app_no_runner + yield app @pytest.mark.asyncio diff --git a/tests/integration/test_charm_metrics.py b/tests/integration/test_charm_metrics.py new file mode 100644 index 00000000..51138d52 --- /dev/null +++ b/tests/integration/test_charm_metrics.py @@ -0,0 +1,57 @@ +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. + +"""Integration tests for metrics.""" +import json +import logging + +from juju.application import Application +from juju.model import Model +from juju.unit import Unit + +from metrics import METRICS_LOG_PATH +from tests.integration.helpers import create_runner +from tests.status_name import ACTIVE_STATUS_NAME + + +async def _get_metrics_log(unit: Unit) -> str: + """Retrieve the metrics log from the unit. + + Args: + unit: The unit to retrieve the metrics log from. + + Returns: + The metrics log. + """ + return ( + await unit.ssh( + f"if [ -f {METRICS_LOG_PATH} ]; then cat {METRICS_LOG_PATH}; else echo ''; fi" + ) + ).strip() + + +async def test_charm_issues_runner_installed_metric( + model: Model, + app_no_runner: Application, +): + """ + arrange: A charm without runners integrated with grafana-agent using the cos-agent integration. + act: Config the charm to contain one runner. + assert: The RunnerInstalled metric is logged. 
+ """ + app = app_no_runner # alias for readability as the app will have a runner during the test + grafana_agent = await model.deploy("grafana-agent", channel="latest/edge") + await model.relate(f"{app.name}:cos-agent", f"{grafana_agent.name}:cos-agent") + await model.wait_for_idle(apps=[app.name], status=ACTIVE_STATUS_NAME) + await model.wait_for_idle(apps=[grafana_agent.name]) + metrics_log = await _get_metrics_log(app.units[0]) + assert metrics_log == "".strip() + + await create_runner(app=app, model=model) + + metrics_log = await _get_metrics_log(app.units[0]) + logging.info("Metric log: %s", metrics_log) + metric_log = json.loads(metrics_log) + assert metric_log.get("flavor") == app.name + assert metric_log.get("event") == "runner_installed" + assert metric_log.get("duration") >= 0 diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 8fe7c8aa..eb23366f 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -9,8 +9,13 @@ from tests.unit.mock import MockGhapiClient, MockLxdClient, MockRepoPolicyComplianceClient +@pytest.fixture(name="exec_command") +def exec_command_fixture(): + return unittest.mock.MagicMock(return_value=("", 0)) + + @pytest.fixture(autouse=True) -def mocks(monkeypatch, tmp_path): +def mocks(monkeypatch, tmp_path, exec_command): monkeypatch.setattr( "charm.GithubRunnerCharm.service_token_path", Path(tmp_path / "mock_service_token") ) @@ -24,6 +29,10 @@ def mocks(monkeypatch, tmp_path): "firewall.Firewall.get_host_ip", unittest.mock.MagicMock(return_value="10.0.0.1") ) monkeypatch.setattr("firewall.Firewall.refresh_firewall", unittest.mock.MagicMock()) + monkeypatch.setattr("metrics.execute_command", exec_command) + monkeypatch.setattr("metrics.METRICS_LOG_PATH", Path(tmp_path / "metrics.log")) + monkeypatch.setattr("metrics.LOGROTATE_CONFIG", Path(tmp_path / "github-runner-metrics")) + monkeypatch.setattr("runner.time", unittest.mock.MagicMock()) monkeypatch.setattr("runner_manager.GhApi", MockGhapiClient) 
monkeypatch.setattr("runner_manager.jinja2", unittest.mock.MagicMock()) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 6caed4db..82ff35f3 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -11,11 +11,13 @@ from ops.testing import Harness from charm import GithubRunnerCharm -from errors import MissingConfigurationError, RunnerError, SubprocessError +from errors import LogrotateSetupError, MissingConfigurationError, RunnerError, SubprocessError from github_type import GitHubRunnerStatus from runner_manager import RunnerInfo, RunnerManagerConfig from runner_type import GitHubOrg, GitHubRepo, VirtualMachineResources +TEST_PROXY_SERVER_URL = "http://proxy.server:1234" + def raise_runner_error(*args, **kargs): raise RunnerError("mock error") @@ -51,18 +53,18 @@ class TestCharm(unittest.TestCase): @patch.dict( os.environ, { - "JUJU_CHARM_HTTPS_PROXY": "mock_https_proxy", - "JUJU_CHARM_HTTP_PROXY": "mock_http_proxy", - "JUJU_CHARM_NO_PROXY": "mock_no_proxy", + "JUJU_CHARM_HTTPS_PROXY": TEST_PROXY_SERVER_URL, + "JUJU_CHARM_HTTP_PROXY": TEST_PROXY_SERVER_URL, + "JUJU_CHARM_NO_PROXY": "127.0.0.1,localhost", }, ) def test_proxy_setting(self): harness = Harness(GithubRunnerCharm) harness.begin() - assert harness.charm.proxies["https"] == "mock_https_proxy" - assert harness.charm.proxies["http"] == "mock_http_proxy" - assert harness.charm.proxies["no_proxy"] == "mock_no_proxy" + assert harness.charm.proxies["https"] == TEST_PROXY_SERVER_URL + assert harness.charm.proxies["http"] == TEST_PROXY_SERVER_URL + assert harness.charm.proxies["no_proxy"] == "127.0.0.1,localhost" @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -110,6 +112,7 @@ def test_org_register(self, run, wt, mkdir, rm): image="jammy", service_token=token, lxd_storage_path=GithubRunnerCharm.ram_pool_path, + charm_state=harness.charm._state, ), proxies={}, ) @@ -135,6 +138,7 @@ def test_repo_register(self, run, wt, mkdir, rm): image="jammy", 
service_token=token, lxd_storage_path=GithubRunnerCharm.ram_pool_path, + charm_state=harness.charm._state, ), proxies={}, ) @@ -163,6 +167,7 @@ def test_update_config(self, run, wt, mkdir, rm): image="jammy", service_token=token, lxd_storage_path=GithubRunnerCharm.ram_pool_path, + charm_state=harness.charm._state, ), proxies={}, ) @@ -182,6 +187,7 @@ def test_update_config(self, run, wt, mkdir, rm): image="jammy", service_token=token, lxd_storage_path=GithubRunnerCharm.ram_pool_path, + charm_state=harness.charm._state, ), proxies={}, ) @@ -222,12 +228,13 @@ def test_get_runner_manager(self, run, wt, mkdir): # With invalid path. assert harness.charm._get_runner_manager("mocktoken", "mock/invalid/path") is None + @patch("charm.metrics.setup_logrotate") @patch("charm.RunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @patch("builtins.open") - def test_on_install_failure(self, open, run, wt, mkdir, rm): + def test_on_install_failure(self, open, run, wt, mkdir, rm, sr): """Test various error thrown during install.""" rm.return_value = mock_rm = MagicMock() @@ -241,6 +248,11 @@ def test_on_install_failure(self, open, run, wt, mkdir, rm): harness.charm.on.install.emit() assert harness.charm.unit.status == ActiveStatus() + sr.side_effect = LogrotateSetupError + harness.charm.on.install.emit() + assert harness.charm.unit.status == BlockedStatus("Failed to setup logrotate") + + sr.side_effect = None GithubRunnerCharm._install_deps = raise_subprocess_error harness.charm.on.install.emit() assert harness.charm.unit.status == BlockedStatus("Failed to install dependencies") diff --git a/tests/unit/test_charm_state.py b/tests/unit/test_charm_state.py new file mode 100644 index 00000000..554c5c1b --- /dev/null +++ b/tests/unit/test_charm_state.py @@ -0,0 +1,34 @@ +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. 
+ +from unittest.mock import MagicMock + +from charm_state import State + + +def test_metrics_logging_available_true(): + """ + arrange: Setup mocked charm to return an integration. + act: Access is_metrics_logging_available property. + assert: metrics_logging_available returns True. + """ + charm = MagicMock() + charm.model.relations.__getitem__.return_value = [MagicMock()] + + state = State.from_charm(charm) + + assert state.is_metrics_logging_available + + +def test_metrics_logging_available_false(): + """ + arrange: Setup mocked charm to return no integration. + act: Access is_metrics_logging_available property. + assert: metrics_logging_available returns False. + """ + charm = MagicMock() + charm.model.relations.__getitem__.return_value = [] + + state = State.from_charm(charm) + + assert not state.is_metrics_logging_available diff --git a/tests/unit/test_metrics.py b/tests/unit/test_metrics.py new file mode 100644 index 00000000..481cfba7 --- /dev/null +++ b/tests/unit/test_metrics.py @@ -0,0 +1,93 @@ +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. 
+import json +from pathlib import Path +from unittest.mock import MagicMock, call + +import pytest + +from errors import LogrotateSetupError, SubprocessError +from metrics import RunnerInstalled, issue_event, setup_logrotate + +TEST_LOKI_PUSH_API_URL = "http://loki:3100/api/prom/push" + + +def test_issue_metrics_logs_events(tmp_path: Path): + """ + arrange: Change path of the metrics log + act: Issue a metric event + assert: The expected metric log is created + """ + event = RunnerInstalled(timestamp=123, flavor="small", duration=456) + + issue_event(event) + + assert json.loads(tmp_path.joinpath("metrics.log").read_text()) == { + "event": "runner_installed", + "timestamp": 123, + "flavor": "small", + "duration": 456, + } + + +def test_setup_logrotate(tmp_path: Path): + """ + arrange: Change paths for the logrotate config and the log file + act: Setup logrotate + assert: The expected logrotate config is created + """ + + setup_logrotate() + + logrotate_path = tmp_path / "github-runner-metrics" + metrics_log_path = tmp_path / "metrics.log" + + expected_logrotate_config = f"""{metrics_log_path} {{ + rotate 0 + missingok + notifempty + create +}} +""" + assert logrotate_path.read_text() == expected_logrotate_config + + +def test_setup_logrotate_enables_logrotate_timer(exec_command: MagicMock): + """ + arrange: Mock execute command to return error for the is-active call and + non-error for the remaining calls. 
+ act: Setup logrotate + assert: The commands to enable and start the logrotate timer are called + """ + + def side_effect(*args, **kwargs): + if "is-active" in args[0]: + return "", 1 + return "", 0 + + exec_command.side_effect = side_effect + + setup_logrotate() + + assert ( + call(["/usr/bin/systemctl", "enable", "logrotate.timer"], check_exit=True) + in exec_command.mock_calls + ) + assert ( + call(["/usr/bin/systemctl", "start", "logrotate.timer"], check_exit=True) + in exec_command.mock_calls + ) + + +def test_setup_logrotate_raises_error(exec_command: MagicMock): + """ + arrange: Mock execute command to raise a SubprocessError + act: Setup logrotate + assert: The expected error is raised. + """ + exec_command.side_effect = SubprocessError( + cmd=["mock"], return_code=1, stdout="mock stdout", stderr="mock stderr" + ) + + with pytest.raises(LogrotateSetupError): + setup_logrotate() diff --git a/tests/unit/test_runner_manager.py b/tests/unit/test_runner_manager.py index 31c7863e..ea77ee10 100644 --- a/tests/unit/test_runner_manager.py +++ b/tests/unit/test_runner_manager.py @@ -8,19 +8,31 @@ from unittest.mock import MagicMock import pytest +from _pytest.monkeypatch import MonkeyPatch +from charm_state import State from errors import RunnerBinaryError +from metrics import RunnerInstalled from runner import Runner, RunnerStatus from runner_manager import RunnerManager, RunnerManagerConfig from runner_type import GitHubOrg, GitHubRepo, RunnerByHealth, VirtualMachineResources from tests.unit.mock import TEST_BINARY +TEST_LOKI_ENDPOINT = "http://test.loki" + @pytest.fixture(scope="function", name="token") def token_fixture(): return secrets.token_hex() +@pytest.fixture(scope="function", name="charm_state") +def charm_state_fixture(): + mock = MagicMock(spec=State) + mock.is_metrics_logging_available = False + return mock + + @pytest.fixture( scope="function", name="runner_manager", @@ -32,7 +44,7 @@ def token_fixture(): ), ], ) -def runner_manager_fixture(request, 
tmp_path, monkeypatch, token): +def runner_manager_fixture(request, tmp_path, monkeypatch, token, charm_state): monkeypatch.setattr( "runner_manager.RunnerManager.runner_bin_path", tmp_path / "mock_runner_binary" ) @@ -42,13 +54,28 @@ def runner_manager_fixture(request, tmp_path, monkeypatch, token): runner_manager = RunnerManager( "test app", "0", - RunnerManagerConfig(request.param[0], token, "jammy", secrets.token_hex(16), pool_path), + RunnerManagerConfig( + request.param[0], + token, + "jammy", + secrets.token_hex(16), + pool_path, + charm_state=charm_state, + ), proxies=request.param[1], ) runner_manager.runner_bin_path.write_bytes(TEST_BINARY) return runner_manager +@pytest.fixture(autouse=True, name="issue_event_mock") +def issue_event_mock_fixture(monkeypatch: MonkeyPatch) -> MagicMock: + """Mock the issue_event function.""" + issue_event_mock = MagicMock() + monkeypatch.setattr("metrics.issue_event", issue_event_mock) + return issue_event_mock + + def test_get_latest_runner_bin_url(runner_manager: RunnerManager): """ arrange: Nothing. @@ -186,3 +213,59 @@ def test_flush(runner_manager: RunnerManager, tmp_path: Path): runner_manager.flush() assert len(runner_manager._get_runners()) == 0 + + +def test_reconcile_issues_runner_installed_event( + runner_manager: RunnerManager, + monkeypatch: MonkeyPatch, + issue_event_mock: MagicMock, + charm_state: MagicMock, +): + """ + arrange: Enable issuing of metrics and mock timestamps. + act: Reconcile to create a runner. + assert: The expected event is issued. 
+ """ + charm_state.is_metrics_logging_available = True + t_mock = MagicMock(return_value=12345) + monkeypatch.setattr("runner_manager.time.time", t_mock) + + runner_manager.reconcile(1, VirtualMachineResources(2, "7GiB", "10Gib")) + + issue_event_mock.assert_called_once_with( + event=RunnerInstalled(timestamp=12345, flavor=runner_manager.app_name, duration=0) + ) + + +def test_reconcile_issues_no_runner_installed_event_if_metrics_disabled( + runner_manager: RunnerManager, issue_event_mock: MagicMock, charm_state: MagicMock +): + """ + arrange: Disable issuing of metrics. + act: Reconcile to create a runner. + assert: The expected event is not issued. + """ + charm_state.is_metrics_logging_available = False + + runner_manager.reconcile(1, VirtualMachineResources(2, "7GiB", "10Gib")) + + issue_event_mock.assert_not_called() + + +def test_reconcile_error_on_runner_installed_event_are_ignored( + runner_manager: RunnerManager, + issue_event_mock: MagicMock, + charm_state: MagicMock, +): + """ + arrange: Enable issuing of metrics and mock the metric issuing to raise an expected error. + act: Reconcile to create a runner. + assert: No error is raised. 
+ """ + charm_state.is_metrics_logging_available = True + + issue_event_mock.side_effect = OSError + + delta = runner_manager.reconcile(1, VirtualMachineResources(2, "7GiB", "10Gib")) + + assert delta == 1 diff --git a/tox.ini b/tox.ini index d6511bb4..4cd69663 100644 --- a/tox.ini +++ b/tox.ini @@ -14,7 +14,7 @@ all_path = {[vars]src_path} {[vars]tst_path} [testenv] basepython = python3.10 setenv = - PYTHONPATH={toxinidir}:{toxinidir}/src + PYTHONPATH={toxinidir}:{toxinidir}/lib:{toxinidir}/src PYTHONBREAKPOINT=ipdb.set_trace PY_COLOR=1 passenv = @@ -74,6 +74,7 @@ commands = description = Run unit tests deps = pytest + requests-mock coverage[toml] -r{toxinidir}/requirements.txt commands = @@ -114,8 +115,6 @@ commands = [testenv:src-docs] allowlist_externals=sh -setenv = - PYTHONPATH=src description = Generate documentation for src deps = lazydocs