From 314d3e12c9ea885a3ba562c1f9ad5fc1097c3628 Mon Sep 17 00:00:00 2001
From: Robert Gildein
Date: Thu, 18 Jul 2024 16:33:04 +0200
Subject: [PATCH 1/8] Add metrics-endpoint and grafana-dashboard

Add relation endpoints + update manifest to export metrics via service
port.

fixes: #475
---
 .../grafana_k8s/v0/grafana_dashboard.py       | 2014 ++++++++++++++
 .../prometheus_k8s/v0/prometheus_scrape.py    | 2378 +++++++++++++++++
 charms/istio-gateway/metadata.yaml            |    5 +
 charms/istio-gateway/requirements-unit.txt    |   16 +
 charms/istio-gateway/requirements.in          |    1 +
 charms/istio-gateway/requirements.txt         |   16 +
 charms/istio-gateway/src/charm.py             |   23 +
 charms/istio-gateway/src/manifest.yaml        |   22 +
 .../tests/unit/data/egress-example.yaml       |   22 +
 .../tests/unit/data/ingress-example.yaml      |   22 +
 charms/istio-gateway/tests/unit/test_charm.py |   27 +
 11 files changed, 4546 insertions(+)
 create mode 100644 charms/istio-gateway/lib/charms/grafana_k8s/v0/grafana_dashboard.py
 create mode 100644 charms/istio-gateway/lib/charms/prometheus_k8s/v0/prometheus_scrape.py

diff --git a/charms/istio-gateway/lib/charms/grafana_k8s/v0/grafana_dashboard.py b/charms/istio-gateway/lib/charms/grafana_k8s/v0/grafana_dashboard.py
new file mode 100644
index 00000000..dfc32ddc
--- /dev/null
+++ b/charms/istio-gateway/lib/charms/grafana_k8s/v0/grafana_dashboard.py
@@ -0,0 +1,2014 @@
+# Copyright 2021 Canonical Ltd.
+# See LICENSE file for licensing details.
+
+"""## Overview.
+
+This document explains how to integrate with the Grafana charm
+for the purpose of providing a dashboard which can be used by
+end users. It also explains the structure of the data
+expected by the `grafana-dashboard` interface, and may provide a
+mechanism or reference point for providing a compatible interface
+or library by providing a definitive reference guide to the
+structure of relation data which is shared between the Grafana
+charm and any charm providing datasource information.
+
+## Provider Library Usage
+
+The Grafana charm interacts with its dashboards using its charm
+library. The goal of this library is to be as simple to use as
+possible, and instantiation of the class with or without changing
+the default arguments provides a complete use case. For the simplest
+use case of a charm which bundles dashboards and provides a
+`provides: grafana-dashboard` interface,
+
+    provides:
+      grafana-dashboard:
+        interface: grafana_dashboard
+
+creation of a `GrafanaDashboardProvider` object with the default arguments is
+sufficient.
+
+:class:`GrafanaDashboardProvider` expects that bundled dashboards should
+be included in your charm with a default path of:
+
+    path/to/charm.py
+    path/to/src/grafana_dashboards/*.{json|json.tmpl|.tmpl}
+
+Where the files are Grafana dashboard JSON data either from the
+Grafana marketplace, or directly exported from a Grafana instance.
+Refer to the [official docs](https://grafana.com/tutorials/provision-dashboards-and-data-sources/)
+for more information.
+
+When constructing a dashboard that is intended to be consumed by COS, make sure to use variables
+for your datasources, and name them "prometheusds" and "lokids". You can also use the following
+juju topology variables in your dashboards: $juju_model, $juju_model_uuid, $juju_application
+and $juju_unit. Note, however, that if metrics are coming via peripheral charms (scrape-config
+or cos-config) then topology labels would not exist.
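+
+For example (a sketch; the panel fields are abbreviated), a templated panel
+in such a dashboard might scope its query with the topology variables:
+
+    {
+        "datasource": "${prometheusds}",
+        "targets": [
+            {"expr": "up{juju_model=~\"$juju_model\",juju_unit=~\"$juju_unit\"}"}
+        ]
+    }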
+
+The default constructor arguments are:
+
+    `charm`: `self` from the charm instantiating this library
+    `relation_name`: grafana-dashboard
+    `dashboards_path`: "/src/grafana_dashboards"
+
+If your configuration requires any changes from these defaults, they
+may be set from the class constructor. It may be instantiated as
+follows:
+
+    from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardProvider
+
+    class FooCharm:
+        def __init__(self, *args):
+            super().__init__(*args)
+            ...
+            self.grafana_dashboard_provider = GrafanaDashboardProvider(self)
+            ...
+
+The first argument (`self`) should be a reference to the parent (providing
+dashboards), as this charm's lifecycle events will be used to re-submit
+dashboard information if a charm is upgraded, the pod is restarted, or other
+events occur.
+
+An instantiated `GrafanaDashboardProvider` validates that the path specified
+in the constructor (or the default) exists, reads the file contents, then
+compresses them with LZMA and adds them to the application relation data
+when a relation is established with Grafana.
+
+Provided dashboards will be checked by Grafana, and a series of dropdown menus
+providing the ability to select query targets by Juju Model, application instance,
+and unit will be added if they do not exist.
+
+To avoid requiring `jinja` in `GrafanaDashboardProvider` users, template validation
+and rendering occurs on the other side of the relation, and relation data in
+the form of:
+
+    {
+        "event": {
+            "valid": `true|false`,
+            "errors": [],
+        }
+    }
+
+will be returned if rendering or validation fails. In this case, the
+`GrafanaDashboardProvider` object will emit a `dashboard_status_changed` event
+of the type :class:`GrafanaDashboardEvent`, which will contain information
+about the validation error.
+
+This information is added to the relation data for the charms as serialized JSON
+from a dict, with a structure of:
+```
+{
+    "application": {
+        "dashboards": {
+            "uuid": a uuid generated to ensure a relation event triggers,
+            "templates": {
+                "file:{hash}": {
+                    "content": `{compressed_template_data}`,
+                    "charm": `charm.meta.name`,
+                    "juju_topology": {
+                        "model": `charm.model.name`,
+                        "model_uuid": `charm.model.uuid`,
+                        "application": `charm.app.name`,
+                        "unit": `charm.unit.name`,
+                    }
+                },
+                "file:{other_file_hash}": {
+                    ...
+                },
+            },
+        },
+    },
+}
+```
+
+This is ingested by :class:`GrafanaDashboardConsumer`, and is sufficient for configuration.
+
+The [COS Configuration Charm](https://charmhub.io/cos-configuration-k8s) can be used to
+add dashboards which are not bundled with charms.
+
+## Consumer Library Usage
+
+The `GrafanaDashboardConsumer` object may be used by Grafana
+charms to manage relations with available dashboards. For this
+purpose, a charm consuming Grafana dashboard information should do
+the following things:
+
+1. Instantiate the `GrafanaDashboardConsumer` object by providing it a
+reference to the parent (Grafana) charm and, optionally, the name of
+the relation that the Grafana charm uses to interact with dashboards.
+This relation must conform to the `grafana-dashboard` interface.
+
+For example, a Grafana charm may instantiate the
+`GrafanaDashboardConsumer` in its constructor as follows:
+
+    from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardConsumer
+
+    def __init__(self, *args):
+        super().__init__(*args)
+        ...
+        self.grafana_dashboard_consumer = GrafanaDashboardConsumer(self)
+        ...
+
+2. A Grafana charm also needs to listen to the
+`GrafanaDashboardConsumer` events emitted by the `GrafanaDashboardConsumer`
+by adding itself as an observer for these events:
+
+    self.framework.observe(
+        self.grafana_dashboard_consumer.on.dashboards_changed,
+        self._on_dashboards_changed,
+    )
+
+Dashboards can be retrieved via the :meth:`dashboards` method.
+
+They will be returned in the following format:
+
+```
+[
+    {
+        "id": unique_id,
+        "relation_id": relation_id,
+        "charm": the name of the charm which provided the dashboard,
+        "content": compressed_template_data
+    },
+]
+```
+
+The consuming charm should decompress the dashboard.
+"""
+
+import base64
+import hashlib
+import json
+import logging
+import lzma
+import os
+import platform
+import re
+import subprocess
+import tempfile
+import uuid
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import yaml
+from ops.charm import (
+    CharmBase,
+    HookEvent,
+    RelationBrokenEvent,
+    RelationChangedEvent,
+    RelationCreatedEvent,
+    RelationEvent,
+    RelationRole,
+)
+from ops.framework import (
+    EventBase,
+    EventSource,
+    Object,
+    ObjectEvents,
+    StoredDict,
+    StoredList,
+    StoredState,
+)
+from ops.model import Relation
+
+# The unique Charmhub library identifier, never change it
+LIBID = "c49eb9c7dfef40c7b6235ebd67010a3f"
+
+# Increment this major API version when introducing breaking changes
+LIBAPI = 0
+
+# Increment this PATCH version before using `charmcraft publish-lib` or reset
+# to 0 if you are raising the major API version
+LIBPATCH = 36
+
+logger = logging.getLogger(__name__)
+
+
+DEFAULT_RELATION_NAME = "grafana-dashboard"
+DEFAULT_PEER_NAME = "grafana"
+RELATION_INTERFACE_NAME = "grafana_dashboard"
+
+TOPOLOGY_TEMPLATE_DROPDOWNS = [  # type: ignore
+    {
+        "allValue": ".*",
+        "datasource": "${prometheusds}",
+        "definition": "label_values(up,juju_model)",
+        "description": None,
+        "error": None,
+        "hide": 0,
+        "includeAll": True,
+        "label": "Juju model",
+        "multi": True,
+        "name": "juju_model",
+        "query": {
+            "query": "label_values(up,juju_model)",
+            "refId": "StandardVariableQuery",
+        },
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": False,
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": False,
+    },
+    {
+        "allValue": ".*",
+        "datasource": "${prometheusds}",
+        "definition": 'label_values(up{juju_model=~"$juju_model"},juju_model_uuid)',
+        "description": None,
+        "error": None,
+        "hide": 0,
+        "includeAll": True,
+        "label": "Juju model uuid",
+        "multi": True,
+        "name": "juju_model_uuid",
+        "query": {
+            "query": 'label_values(up{juju_model=~"$juju_model"},juju_model_uuid)',
+            "refId": "StandardVariableQuery",
+        },
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": False,
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": False,
+    },
+    {
+        "allValue": ".*",
+        "datasource": "${prometheusds}",
+        "definition": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid"},juju_application)',
+        "description": None,
+        "error": None,
+        "hide": 0,
+        "includeAll": True,
+        "label": "Juju application",
+        "multi": True,
+        "name": "juju_application",
+        "query": {
+            "query": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid"},juju_application)',
+            "refId": "StandardVariableQuery",
+        },
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": False,
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": False,
+    },
+    {
+        "allValue": ".*",
+        
"datasource": "${prometheusds}", + "definition": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid",juju_application=~"$juju_application"},juju_unit)', + "description": None, + "error": None, + "hide": 0, + "includeAll": True, + "label": "Juju unit", + "multi": True, + "name": "juju_unit", + "query": { + "query": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid",juju_application=~"$juju_application"},juju_unit)', + "refId": "StandardVariableQuery", + }, + "refresh": 1, + "regex": "", + "skipUrlSync": False, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": False, + }, +] + +DATASOURCE_TEMPLATE_DROPDOWNS = [ # type: ignore + { + "description": None, + "error": None, + "hide": 0, + "includeAll": True, + "label": "Prometheus datasource", + "multi": True, + "name": "prometheusds", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": False, + "type": "datasource", + }, + { + "description": None, + "error": None, + "hide": 0, + "includeAll": True, + "label": "Loki datasource", + "multi": True, + "name": "lokids", + "options": [], + "query": "loki", + "refresh": 1, + "regex": "", + "skipUrlSync": False, + "type": "datasource", + }, +] + +REACTIVE_CONVERTER = { # type: ignore + "allValue": None, + "datasource": "${prometheusds}", + "definition": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid",juju_application=~"$juju_application"},host)', + "description": None, + "error": None, + "hide": 0, + "includeAll": True, + "label": "hosts", + "multi": True, + "name": "host", + "options": [], + "query": { + "query": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid",juju_application=~"$juju_application"},host)', + "refId": "StandardVariableQuery", + }, + "refresh": 1, + "regex": "", + "skipUrlSync": False, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": False, +} + + +class RelationNotFoundError(Exception): + """Raised if there is no relation with the given name.""" + + def __init__(self, relation_name: str): + self.relation_name = relation_name + self.message = "No relation named '{}' found".format(relation_name) + + super().__init__(self.message) + + +class RelationInterfaceMismatchError(Exception): + """Raised if the relation with the given name has a different interface.""" + + def __init__( + self, + relation_name: str, + expected_relation_interface: str, + actual_relation_interface: str, + ): + self.relation_name = relation_name + self.expected_relation_interface = expected_relation_interface + self.actual_relation_interface = actual_relation_interface + self.message = ( + "The '{}' relation has '{}' as " + "interface rather than the expected '{}'".format( + relation_name, actual_relation_interface, expected_relation_interface + ) + ) + + super().__init__(self.message) + + +class RelationRoleMismatchError(Exception): + """Raised if the relation with the given name has a different direction.""" + + def __init__( + self, + relation_name: str, + expected_relation_role: RelationRole, + actual_relation_role: RelationRole, + ): + self.relation_name = relation_name + self.expected_relation_interface = expected_relation_role + self.actual_relation_role = actual_relation_role + self.message = "The '{}' relation has role '{}' rather than the expected '{}'".format( + relation_name, repr(actual_relation_role), repr(expected_relation_role) + ) + + 
super().__init__(self.message) + + +class InvalidDirectoryPathError(Exception): + """Raised if the grafana dashboards folder cannot be found or is otherwise invalid.""" + + def __init__( + self, + grafana_dashboards_absolute_path: str, + message: str, + ): + self.grafana_dashboards_absolute_path = grafana_dashboards_absolute_path + self.message = message + + super().__init__(self.message) + + +def _resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> str: + """Resolve the provided path items against the directory of the main file. + + Look up the directory of the charmed operator file being executed. This is normally + going to be the charm.py file of the charm including this library. Then, resolve + the provided path elements and return its absolute path. + + Raises: + InvalidDirectoryPathError if the resolved path does not exist or it is not a directory + + """ + charm_dir = Path(str(charm.charm_dir)) + if not charm_dir.exists() or not charm_dir.is_dir(): + # Operator Framework does not currently expose a robust + # way to determine the top level charm source directory + # that is consistent across deployed charms and unit tests + # Hence for unit tests the current working directory is used + # TODO: updated this logic when the following ticket is resolved + # https://github.com/canonical/operator/issues/643 + charm_dir = Path(os.getcwd()) + + dir_path = charm_dir.absolute().joinpath(*path_elements) + + if not dir_path.exists(): + raise InvalidDirectoryPathError(str(dir_path), "directory does not exist") + if not dir_path.is_dir(): + raise InvalidDirectoryPathError(str(dir_path), "is not a directory") + + return str(dir_path) + + +def _validate_relation_by_interface_and_direction( + charm: CharmBase, + relation_name: str, + expected_relation_interface: str, + expected_relation_role: RelationRole, +) -> None: + """Verifies that a relation has the necessary characteristics. + + Verifies that the `relation_name` provided: (1) exists in metadata.yaml, + (2) declares as interface the interface name passed as `relation_interface` + and (3) has the right "direction", i.e., it is a relation that `charm` + provides or requires. + + Args: + charm: a `CharmBase` object to scan for the matching relation. + relation_name: the name of the relation to be verified. + expected_relation_interface: the interface name to be matched by the + relation named `relation_name`. + expected_relation_role: whether the `relation_name` must be either + provided or required by `charm`. + + Raises: + RelationNotFoundError: If there is no relation in the charm's metadata.yaml + named like the value of the `relation_name` argument. + RelationInterfaceMismatchError: If the relation interface of the + relation named as the provided `relation_name` argument does not + match the `expected_relation_interface` argument. + RelationRoleMismatchError: If the relation named as the provided `relation_name` + argument has a different role than what is specified by the + `expected_relation_role` argument. 
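+
+    Example (a sketch of the call this library itself makes for the
+    provider side, using the names defined in this module):
+
+        _validate_relation_by_interface_and_direction(
+            charm, "grafana-dashboard", "grafana_dashboard", RelationRole.provides
+        )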
+ """ + if relation_name not in charm.meta.relations: + raise RelationNotFoundError(relation_name) + + relation = charm.meta.relations[relation_name] + + actual_relation_interface = relation.interface_name + if actual_relation_interface and actual_relation_interface != expected_relation_interface: + raise RelationInterfaceMismatchError( + relation_name, expected_relation_interface, actual_relation_interface + ) + + if expected_relation_role == RelationRole.provides: + if relation_name not in charm.meta.provides: + raise RelationRoleMismatchError( + relation_name, RelationRole.provides, RelationRole.requires + ) + elif expected_relation_role == RelationRole.requires: + if relation_name not in charm.meta.requires: + raise RelationRoleMismatchError( + relation_name, RelationRole.requires, RelationRole.provides + ) + else: + raise Exception("Unexpected RelationDirection: {}".format(expected_relation_role)) + + +def _encode_dashboard_content(content: Union[str, bytes]) -> str: + if isinstance(content, str): + content = bytes(content, "utf-8") + + return base64.b64encode(lzma.compress(content)).decode("utf-8") + + +def _decode_dashboard_content(encoded_content: str) -> str: + return lzma.decompress(base64.b64decode(encoded_content.encode("utf-8"))).decode() + + +def _convert_dashboard_fields(content: str, inject_dropdowns: bool = True) -> str: + """Make sure values are present for Juju topology. + + Inserts Juju topology variables and selectors into the template, as well as + a variable for Prometheus. + """ + dict_content = json.loads(content) + datasources = {} + existing_templates = False + + template_dropdowns = ( + TOPOLOGY_TEMPLATE_DROPDOWNS + DATASOURCE_TEMPLATE_DROPDOWNS # type: ignore + if inject_dropdowns + else DATASOURCE_TEMPLATE_DROPDOWNS + ) + + # If the dashboard has __inputs, get the names to replace them. These are stripped + # from reactive dashboards in GrafanaDashboardAggregator, but charm authors in + # newer charms may import them directly from the marketplace + if "__inputs" in dict_content: + for field in dict_content["__inputs"]: + if "type" in field and field["type"] == "datasource": + datasources[field["name"]] = field["pluginName"].lower() + del dict_content["__inputs"] + + # If no existing template variables exist, just insert our own + if "templating" not in dict_content: + dict_content["templating"] = {"list": list(template_dropdowns)} # type: ignore + else: + # Otherwise, set a flag so we can go back later + existing_templates = True + for template_value in dict_content["templating"]["list"]: + # Build a list of `datasource_name`: `datasource_type` mappings + # The "query" field is actually "prometheus", "loki", "influxdb", etc + if "type" in template_value and template_value["type"] == "datasource": + datasources[template_value["name"]] = template_value["query"].lower() + + # Put our own variables in the template + for d in template_dropdowns: # type: ignore + if d not in dict_content["templating"]["list"]: + dict_content["templating"]["list"].insert(0, d) + + dict_content = _replace_template_fields(dict_content, datasources, existing_templates) + return json.dumps(dict_content) + + +def _replace_template_fields( # noqa: C901 + dict_content: dict, datasources: dict, existing_templates: bool +) -> dict: + """Make templated fields get cleaned up afterwards. + + If existing datasource variables are present, try to substitute them. 
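+
+    For example, a template variable of datasource type "prometheus" maps to
+    "${prometheusds}" and one of type "loki" maps to "${lokids}" (see the
+    `replacements` table below); panels bound to those variables are rewritten
+    accordingly.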
+ """ + replacements = {"loki": "${lokids}", "prometheus": "${prometheusds}"} + used_replacements = [] # type: List[str] + + # If any existing datasources match types we know, or we didn't find + # any templating variables at all, template them. + if datasources or not existing_templates: + panels = dict_content.get("panels", {}) + if panels: + dict_content["panels"] = _template_panels( + panels, replacements, used_replacements, existing_templates, datasources + ) + + # Find panels nested under rows + rows = dict_content.get("rows", {}) + if rows: + for row_idx, row in enumerate(rows): + if "panels" in row.keys(): + rows[row_idx]["panels"] = _template_panels( + row["panels"], + replacements, + used_replacements, + existing_templates, + datasources, + ) + + dict_content["rows"] = rows + + # Finally, go back and pop off the templates we stubbed out + deletions = [] + for tmpl in dict_content["templating"]["list"]: + if tmpl["name"] and tmpl["name"] in used_replacements: + deletions.append(tmpl) + + for d in deletions: + dict_content["templating"]["list"].remove(d) + + return dict_content + + +def _template_panels( + panels: dict, + replacements: dict, + used_replacements: list, + existing_templates: bool, + datasources: dict, +) -> dict: + """Iterate through a `panels` object and template it appropriately.""" + # Go through all the panels. If they have a datasource set, AND it's one + # that we can convert to ${lokids} or ${prometheusds}, by stripping off the + # ${} templating and comparing the name to the list we built, replace it, + # otherwise, leave it alone. + # + for panel in panels: + if "datasource" not in panel or not panel.get("datasource"): + continue + if not existing_templates: + datasource = panel.get("datasource") + if isinstance(datasource, str): + if "loki" in datasource: + panel["datasource"] = "${lokids}" + elif "grafana" in datasource: + continue + else: + panel["datasource"] = "${prometheusds}" + elif isinstance(datasource, dict): + # In dashboards exported by Grafana 9, datasource type is dict + dstype = datasource.get("type", "") + if dstype == "loki": + panel["datasource"]["uid"] = "${lokids}" + elif dstype == "prometheus": + panel["datasource"]["uid"] = "${prometheusds}" + else: + logger.debug("Unrecognized datasource type '%s'; skipping", dstype) + continue + else: + logger.error("Unknown datasource format: skipping") + continue + else: + if isinstance(panel["datasource"], str): + if panel["datasource"].lower() in replacements.values(): + # Already a known template variable + continue + # Strip out variable characters and maybe braces + ds = re.sub(r"(\$|\{|\})", "", panel["datasource"]) + + if ds not in datasources.keys(): + # Unknown, non-templated datasource, potentially a Grafana builtin + continue + + replacement = replacements.get(datasources[ds], "") + if replacement: + used_replacements.append(ds) + panel["datasource"] = replacement or panel["datasource"] + elif isinstance(panel["datasource"], dict): + dstype = panel["datasource"].get("type", "") + if panel["datasource"].get("uid", "").lower() in replacements.values(): + # Already a known template variable + continue + # Strip out variable characters and maybe braces + ds = re.sub(r"(\$|\{|\})", "", panel["datasource"].get("uid", "")) + + if ds not in datasources.keys(): + # Unknown, non-templated datasource, potentially a Grafana builtin + continue + + replacement = replacements.get(datasources[ds], "") + if replacement: + used_replacements.append(ds) + panel["datasource"]["uid"] = replacement + else: + 
logger.error("Unknown datasource format: skipping")
+                continue
+    return panels
+
+
+def _inject_labels(content: str, topology: dict, transformer: "CosTool") -> str:
+    """Inject Juju topology into panel expressions via CosTool.
+
+    A dashboard will have a structure approximating:
+        {
+            "__inputs": [],
+            "templating": {
+                "list": [
+                    {
+                        "name": "prometheusds",
+                        "type": "prometheus"
+                    }
+                ]
+            },
+            "panels": [
+                {
+                    "foo": "bar",
+                    "targets": [
+                        {
+                            "some": "field",
+                            "expr": "up{job=\"foo\"}"
+                        },
+                        {
+                            "some_other": "field",
+                            "expr": "sum(http_requests_total{instance=\"$foo\"}[5m])"
+                        }
+                    ],
+                    "datasource": "${someds}"
+                }
+            ]
+        }
+
+    `templating` is used elsewhere in this library, but the structure is not rigid. It is
+    not guaranteed that a panel will actually have any targets (it could be a "spacer" with
+    no datasource, hence no expression). It could have only one target. It could have multiple
+    targets. It could have multiple targets of which only one has an `expr` to evaluate. We need
+    to try to handle all of these concisely.
+
+    `cos-tool` (`github.com/canonical/cos-tool` as a Go module in general)
+    does not know "Grafana-isms", such as using `[$_variable]` to modify the query from the user
+    interface, so we add placeholders (as `5y`, since it must parse, but a dashboard looking for
+    five years for a panel query would be unusual).
+
+    Args:
+        content: dashboard content as a string
+        topology: a dict containing topology values
+        transformer: a 'CosTool' instance
+    Returns:
+        dashboard content with replaced values.
+    """
+    dict_content = json.loads(content)
+
+    if "panels" not in dict_content.keys():
+        return json.dumps(dict_content)
+
+    # Go through all the panels and inject topology labels
+    # Panels may have more than one 'target' where the expressions live, so that must be
+    # accounted for. Additionally, `promql-transform` does not necessarily gracefully handle
+    # expressions with range queries including variables. Exclude these.
+    #
+    # It is not a certainty that the `datasource` field will necessarily reflect the type, so
+    # operate on all fields.
+    panels = dict_content["panels"]
+    topology_with_prefix = {"juju_{}".format(k): v for k, v in topology.items()}
+
+    # We need to use an index so we can insert the changed element back later
+    for panel_idx, panel in enumerate(panels):
+        if not isinstance(panel, dict):
+            continue
+
+        # Use the index to insert it back in the same location
+        panels[panel_idx] = _modify_panel(panel, topology_with_prefix, transformer)
+
+    return json.dumps(dict_content)
+
+
+def _modify_panel(panel: dict, topology: dict, transformer: "CosTool") -> dict:
+    """Inject Juju topology into panel expressions via CosTool.
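+
+    For example (sketch): given a topology of {"juju_model": "lma"}, an
+    expression such as `up{job="foo"}` would come back from the transformer
+    as `up{job="foo",juju_model="lma"}`.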
+
+    Args:
+        panel: a dashboard panel as a dict
+        topology: a dict containing topology values
+        transformer: a 'CosTool' instance
+    Returns:
+        the panel with injected values
+    """
+    if "targets" not in panel.keys():
+        return panel
+
+    # Pre-compile a regular expression to grab values from inside of []
+    range_re = re.compile(r"\[(?P<value>.*?)\]")
+    # Do the same for any offsets
+    offset_re = re.compile(r"offset\s+(?P<value>-?\s*[$\w]+)")
+
+    known_datasources = {"${prometheusds}": "promql", "${lokids}": "logql"}
+
+    targets = panel["targets"]
+
+    # We need to use an index so we can insert the changed element back later
+    for idx, target in enumerate(targets):
+        # If there's no expression, we don't need to do anything
+        if "expr" not in target.keys():
+            continue
+        expr = target["expr"]
+
+        if "datasource" not in panel.keys():
+            continue
+
+        if isinstance(panel["datasource"], str):
+            if panel["datasource"] not in known_datasources:
+                continue
+            querytype = known_datasources[panel["datasource"]]
+        elif isinstance(panel["datasource"], dict):
+            if panel["datasource"]["uid"] not in known_datasources:
+                continue
+            querytype = known_datasources[panel["datasource"]["uid"]]
+        else:
+            logger.error("Unknown datasource format: skipping")
+            continue
+
+        # Capture all values inside `[]` into a list which we'll iterate over later to
+        # put them back in-order. Then apply the regex again and replace everything with
+        # `[5y]` so promql/parser will take it.
+        #
+        # Then do it again for offsets
+        range_values = [m.group("value") for m in range_re.finditer(expr)]
+        expr = range_re.sub(r"[5y]", expr)
+
+        offset_values = [m.group("value") for m in offset_re.finditer(expr)]
+        expr = offset_re.sub(r"offset 5y", expr)
+        # Retrieve the new expression (which may be unchanged if there were no label
+        # matchers in the expression, or if it was unable to be parsed, like logql). It's
+        # virtually impossible to tell from any datasource "name" in a panel what the
+        # actual type is without re-implementing a complete dashboard parser, but no
+        # harm will come from passing invalid promql -- we'll just get the original back.
+        #
+        replacement = transformer.inject_label_matchers(expr, topology, querytype)
+
+        if replacement == target["expr"]:
+            # promql-transform caught an error. Move on
+            continue
+
+        # Go back and substitute values in [] which were pulled out
+        # Enumerate with an index... again. The same regex is ok, since it will still match
+        # `[(.*?)]`, which includes `[5y]`, our placeholder
+        for i, match in enumerate(range_re.finditer(replacement)):
+            # Replace one-by-one, starting from the left. We build the string back with
+            # `str.replace(string_to_replace, replacement_value, count)`. Limit the count
+            # to one, since we are going through one-by-one through the list we saved earlier
+            # in `range_values`.
+            replacement = replacement.replace(
+                "[{}]".format(match.group("value")),
+                "[{}]".format(range_values[i]),
+                1,
+            )
+
+        for i, match in enumerate(offset_re.finditer(replacement)):
+            # Replace one-by-one, starting from the left. We build the string back with
+            # `str.replace(string_to_replace, replacement_value, count)`. Limit the count
+            # to one, since we are going through one-by-one through the list we saved earlier
+            # in `offset_values`.
+ replacement = replacement.replace( + "offset {}".format(match.group("value")), + "offset {}".format(offset_values[i]), + 1, + ) + + # Use the index to insert it back in the same location + targets[idx]["expr"] = replacement + + panel["targets"] = targets + return panel + + +def _type_convert_stored(obj): + """Convert Stored* to their appropriate types, recursively.""" + if isinstance(obj, StoredList): + return list(map(_type_convert_stored, obj)) + if isinstance(obj, StoredDict): + rdict = {} # type: Dict[Any, Any] + for k in obj.keys(): + rdict[k] = _type_convert_stored(obj[k]) + return rdict + return obj + + +class GrafanaDashboardsChanged(EventBase): + """Event emitted when Grafana dashboards change.""" + + def __init__(self, handle, data=None): + super().__init__(handle) + self.data = data + + def snapshot(self) -> Dict: + """Save grafana source information.""" + return {"data": self.data} + + def restore(self, snapshot): + """Restore grafana source information.""" + self.data = snapshot["data"] + + +class GrafanaDashboardEvents(ObjectEvents): + """Events raised by :class:`GrafanaSourceEvents`.""" + + dashboards_changed = EventSource(GrafanaDashboardsChanged) + + +class GrafanaDashboardEvent(EventBase): + """Event emitted when Grafana dashboards cannot be resolved. + + Enables us to set a clear status on the provider. + """ + + def __init__(self, handle, errors: List[Dict[str, str]] = [], valid: bool = False): + super().__init__(handle) + self.errors = errors + self.error_message = "; ".join([error["error"] for error in errors if "error" in error]) + self.valid = valid + + def snapshot(self) -> Dict: + """Save grafana source information.""" + return { + "error_message": self.error_message, + "valid": self.valid, + "errors": json.dumps(self.errors), + } + + def restore(self, snapshot): + """Restore grafana source information.""" + self.error_message = snapshot["error_message"] + self.valid = snapshot["valid"] + self.errors = json.loads(str(snapshot["errors"])) + + +class GrafanaProviderEvents(ObjectEvents): + """Events raised by :class:`GrafanaSourceEvents`.""" + + dashboard_status_changed = EventSource(GrafanaDashboardEvent) + + +class GrafanaDashboardProvider(Object): + """An API to provide Grafana dashboards to a Grafana charm.""" + + _stored = StoredState() + on = GrafanaProviderEvents() # pyright: ignore + + def __init__( + self, + charm: CharmBase, + relation_name: str = DEFAULT_RELATION_NAME, + dashboards_path: str = "src/grafana_dashboards", + ) -> None: + """API to provide Grafana dashboard to a Grafana charmed operator. + + The :class:`GrafanaDashboardProvider` object provides an API + to upload dashboards to a Grafana charm. In its most streamlined + usage, the :class:`GrafanaDashboardProvider` is integrated in a + charmed operator as follows: + + self.grafana = GrafanaDashboardProvider(self) + + The :class:`GrafanaDashboardProvider` will look for dashboard + templates in the `/grafana_dashboards` folder. + Additionally, dashboard templates can be uploaded programmatically + via the :method:`GrafanaDashboardProvider.add_dashboard` method. + + To use the :class:`GrafanaDashboardProvider` API, you need a relation + defined in your charm operator's metadata.yaml as follows: + + provides: + grafana-dashboard: + interface: grafana_dashboard + + If you would like to use relation name other than `grafana-dashboard`, + you will need to specify the relation name via the `relation_name` + argument when instantiating the :class:`GrafanaDashboardProvider` object. 
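+
+        For example (a sketch, with a hypothetical relation name):
+
+            self.dashboards = GrafanaDashboardProvider(
+                self, relation_name="workload-grafana-dashboard"
+            )
+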
+ However, it is strongly advised to keep the default relation name, + so that people deploying your charm will have a consistent experience + with all other charms that provide Grafana dashboards. + + It is possible to provide a different file path for the Grafana dashboards + to be automatically managed by the :class:`GrafanaDashboardProvider` object + via the `dashboards_path` argument. This may be necessary when the directory + structure of your charmed operator repository is not the "usual" one as + generated by `charmcraft init`, for example when adding the charmed operator + in a Java repository managed by Maven or Gradle. However, unless there are + such constraints with other tooling, it is strongly advised to store the + Grafana dashboards in the default `/grafana_dashboards` + folder, in order to provide a consistent experience for other charmed operator + authors. + + Args: + charm: a :class:`CharmBase` object which manages this + :class:`GrafanaProvider` object. Generally this is + `self` in the instantiating class. + relation_name: a :string: name of the relation managed by this + :class:`GrafanaDashboardProvider`; it defaults to "grafana-dashboard". + dashboards_path: a filesystem path relative to the charm root + where dashboard templates can be located. By default, the library + expects dashboard files to be in the `/grafana_dashboards` + directory. + """ + _validate_relation_by_interface_and_direction( + charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.provides + ) + + try: + dashboards_path = _resolve_dir_against_charm_path(charm, dashboards_path) + except InvalidDirectoryPathError as e: + logger.warning( + "Invalid Grafana dashboards folder at %s: %s", + e.grafana_dashboards_absolute_path, + e.message, + ) + + super().__init__(charm, relation_name) + + self._charm = charm + self._relation_name = relation_name + self._dashboards_path = dashboards_path + + # No peer relation bucket we can rely on providers, keep StoredState here, too + self._stored.set_default(dashboard_templates={}) # type: ignore + + self.framework.observe(self._charm.on.leader_elected, self._update_all_dashboards_from_dir) + self.framework.observe(self._charm.on.upgrade_charm, self._update_all_dashboards_from_dir) + self.framework.observe(self._charm.on.config_changed, self._update_all_dashboards_from_dir) + + self.framework.observe( + self._charm.on[self._relation_name].relation_created, + self._on_grafana_dashboard_relation_created, + ) + self.framework.observe( + self._charm.on[self._relation_name].relation_changed, + self._on_grafana_dashboard_relation_changed, + ) + + def add_dashboard(self, content: str, inject_dropdowns: bool = True) -> None: + """Add a dashboard to the relation managed by this :class:`GrafanaDashboardProvider`. + + Args: + content: a string representing a Jinja template. Currently, no + global variables are added to the Jinja template evaluation + context. + inject_dropdowns: a :boolean: indicating whether topology dropdowns should be + added to the dashboard + """ + # Update of storage must be done irrespective of leadership, so + # that the stored state is there when this unit becomes leader. + stored_dashboard_templates: Any = self._stored.dashboard_templates # pyright: ignore + + encoded_dashboard = _encode_dashboard_content(content) + + # Use as id the first chars of the encoded dashboard, so that + # it is predictable across units. 
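+        # (Note: the slice below actually reads a fixed window near the end of
+        # the base64 payload; any fixed window works, provided every unit
+        # derives the same key from the same content.)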
+ id = "prog:{}".format(encoded_dashboard[-24:-16]) + + stored_dashboard_templates[id] = self._content_to_dashboard_object( + encoded_dashboard, inject_dropdowns + ) + stored_dashboard_templates[id]["dashboard_alt_uid"] = self._generate_alt_uid(id) + + if self._charm.unit.is_leader(): + for dashboard_relation in self._charm.model.relations[self._relation_name]: + self._upset_dashboards_on_relation(dashboard_relation) + + def remove_non_builtin_dashboards(self) -> None: + """Remove all dashboards to the relation added via :method:`add_dashboard`.""" + # Update of storage must be done irrespective of leadership, so + # that the stored state is there when this unit becomes leader. + stored_dashboard_templates: Any = self._stored.dashboard_templates # pyright: ignore + + for dashboard_id in list(stored_dashboard_templates.keys()): + if dashboard_id.startswith("prog:"): + del stored_dashboard_templates[dashboard_id] + self._stored.dashboard_templates = stored_dashboard_templates + + if self._charm.unit.is_leader(): + for dashboard_relation in self._charm.model.relations[self._relation_name]: + self._upset_dashboards_on_relation(dashboard_relation) + + def update_dashboards(self) -> None: + """Trigger the re-evaluation of the data on all relations.""" + if self._charm.unit.is_leader(): + for dashboard_relation in self._charm.model.relations[self._relation_name]: + self._upset_dashboards_on_relation(dashboard_relation) + + def _update_all_dashboards_from_dir( + self, _: Optional[HookEvent] = None, inject_dropdowns: bool = True + ) -> None: + """Scans the built-in dashboards and updates relations with changes.""" + # Update of storage must be done irrespective of leadership, so + # that the stored state is there when this unit becomes leader. + + # Ensure we do not leave outdated dashboards by removing from stored all + # the encoded dashboards that start with "file/". + if self._dashboards_path: + stored_dashboard_templates: Any = self._stored.dashboard_templates # pyright: ignore + + for dashboard_id in list(stored_dashboard_templates.keys()): + if dashboard_id.startswith("file:"): + del stored_dashboard_templates[dashboard_id] + + # Path.glob uses fnmatch on the backend, which is pretty limited, so use a + # custom function for the filter + def _is_dashboard(p: Path) -> bool: + return p.is_file() and p.name.endswith((".json", ".json.tmpl", ".tmpl")) + + for path in filter(_is_dashboard, Path(self._dashboards_path).glob("*")): + # path = Path(path) + id = "file:{}".format(path.stem) + stored_dashboard_templates[id] = self._content_to_dashboard_object( + _encode_dashboard_content(path.read_bytes()), inject_dropdowns + ) + stored_dashboard_templates[id]["dashboard_alt_uid"] = self._generate_alt_uid(id) + + self._stored.dashboard_templates = stored_dashboard_templates + + if self._charm.unit.is_leader(): + for dashboard_relation in self._charm.model.relations[self._relation_name]: + self._upset_dashboards_on_relation(dashboard_relation) + + def _generate_alt_uid(self, key: str) -> str: + """Generate alternative uid for dashboards. + + Args: + key: A string used (along with charm.meta.name) to build the hash uid. + + Returns: A hash string. + """ + raw_dashboard_alt_uid = "{}-{}".format(self._charm.meta.name, key) + return hashlib.shake_256(raw_dashboard_alt_uid.encode("utf-8")).hexdigest(8) + + def _reinitialize_dashboard_data(self, inject_dropdowns: bool = True) -> None: + """Triggers a reload of dashboard outside of an eventing workflow. 
+
+        Args:
+            inject_dropdowns: a :bool: used to indicate whether topology dropdowns should be added
+
+        This will destroy any existing relation data.
+        """
+        try:
+            _resolve_dir_against_charm_path(self._charm, self._dashboards_path)
+            self._update_all_dashboards_from_dir(inject_dropdowns=inject_dropdowns)
+
+        except InvalidDirectoryPathError as e:
+            logger.warning(
+                "Invalid Grafana dashboards folder at %s: %s",
+                e.grafana_dashboards_absolute_path,
+                e.message,
+            )
+            stored_dashboard_templates: Any = self._stored.dashboard_templates  # pyright: ignore
+
+            for dashboard_id in list(stored_dashboard_templates.keys()):
+                if dashboard_id.startswith("file:"):
+                    del stored_dashboard_templates[dashboard_id]
+            self._stored.dashboard_templates = stored_dashboard_templates
+
+            # With all the file-based dashboards cleared out, force a refresh
+            # of relation data
+            if self._charm.unit.is_leader():
+                for dashboard_relation in self._charm.model.relations[self._relation_name]:
+                    self._upset_dashboards_on_relation(dashboard_relation)
+
+    def _on_grafana_dashboard_relation_created(self, event: RelationCreatedEvent) -> None:
+        """Watch for a relation being created and automatically send dashboards.
+
+        Args:
+            event: The :class:`RelationCreatedEvent` sent when a
+                `grafana-dashboard` relation is created
+        """
+        if self._charm.unit.is_leader():
+            self._update_all_dashboards_from_dir()
+            self._upset_dashboards_on_relation(event.relation)
+
+    def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> None:
+        """Watch for changes so we know if there's an error to signal back to the parent charm.
+
+        Args:
+            event: The `RelationChangedEvent` that triggered this handler.
+        """
+        if self._charm.unit.is_leader():
+            data = json.loads(event.relation.data[event.app].get("event", "{}"))  # type: ignore
+
+            if not data:
+                return
+
+            valid = bool(data.get("valid", True))
+            errors = data.get("errors", [])
+            if valid and not errors:
+                self.on.dashboard_status_changed.emit(valid=valid)  # pyright: ignore
+            else:
+                self.on.dashboard_status_changed.emit(  # pyright: ignore
+                    valid=valid, errors=errors
+                )
+
+    def _upset_dashboards_on_relation(self, relation: Relation) -> None:
+        """Update the dashboards in the relation data bucket."""
+        # It's completely ridiculous to add a UUID, but if we don't have some
+        # pseudo-random value, this never makes it across 'juju set-state'
+        stored_data = {
+            "templates": _type_convert_stored(self._stored.dashboard_templates),  # pyright: ignore
+            "uuid": str(uuid.uuid4()),
+        }
+
+        relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data)
+
+    def _content_to_dashboard_object(self, content: str, inject_dropdowns: bool = True) -> Dict:
+        return {
+            "charm": self._charm.meta.name,
+            "content": content,
+            "juju_topology": self._juju_topology if inject_dropdowns else {},
+            "inject_dropdowns": inject_dropdowns,
+        }
+
+    # This is not actually used in the dashboards, but is present to provide a secondary
+    # salt to ensure uniqueness in the dict keys in case individual charm units provide
+    # dashboards
+    @property
+    def _juju_topology(self) -> Dict:
+        return {
+            "model": self._charm.model.name,
+            "model_uuid": self._charm.model.uuid,
+            "application": self._charm.app.name,
+            "unit": self._charm.unit.name,
+        }
+
+    @property
+    def dashboard_templates(self) -> List:
+        """Return a list of the known dashboard templates."""
+        return list(self._stored.dashboard_templates.values())  # type: ignore
+
+
+class GrafanaDashboardConsumer(Object):
+    """A consumer object for working 
with Grafana Dashboards.""" + + on = GrafanaDashboardEvents() # pyright: ignore + _stored = StoredState() + + def __init__( + self, + charm: CharmBase, + relation_name: str = DEFAULT_RELATION_NAME, + ) -> None: + """API to receive Grafana dashboards from charmed operators. + + The :class:`GrafanaDashboardConsumer` object provides an API + to consume dashboards provided by a charmed operator using the + :class:`GrafanaDashboardProvider` library. The + :class:`GrafanaDashboardConsumer` is integrated in a + charmed operator as follows: + + self.grafana = GrafanaDashboardConsumer(self) + + To use this library, you need a relation defined as follows in + your charm operator's metadata.yaml: + + requires: + grafana-dashboard: + interface: grafana_dashboard + + If you would like to use a different relation name than + `grafana-dashboard`, you need to specify the relation name via the + `relation_name` argument. However, it is strongly advised not to + change the default, so that people deploying your charm will have + a consistent experience with all other charms that consume Grafana + dashboards. + + Args: + charm: a :class:`CharmBase` object which manages this + :class:`GrafanaProvider` object. Generally this is + `self` in the instantiating class. + relation_name: a :string: name of the relation managed by this + :class:`GrafanaDashboardConsumer`; it defaults to "grafana-dashboard". + """ + _validate_relation_by_interface_and_direction( + charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.requires + ) + + super().__init__(charm, relation_name) + self._charm = charm + self._relation_name = relation_name + self._tranformer = CosTool(self._charm) + + self._stored.set_default(dashboards={}) # type: ignore + + self.framework.observe( + self._charm.on[self._relation_name].relation_changed, + self._on_grafana_dashboard_relation_changed, + ) + self.framework.observe( + self._charm.on[self._relation_name].relation_broken, + self._on_grafana_dashboard_relation_broken, + ) + self.framework.observe( + self._charm.on[DEFAULT_PEER_NAME].relation_changed, + self._on_grafana_peer_changed, + ) + + def get_dashboards_from_relation(self, relation_id: int) -> List: + """Get a list of known dashboards for one instance of the monitored relation. + + Args: + relation_id: the identifier of the relation instance, as returned by + :method:`ops.model.Relation.id`. + + Returns: a list of known dashboards coming from the provided relation instance. + """ + return [ + self._to_external_object(relation_id, dashboard) + for dashboard in self._get_stored_dashboards(relation_id) + ] + + def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> None: + """Handle relation changes in related providers. + + If there are changes in relations between Grafana dashboard consumers + and providers, this event handler (if the unit is the leader) will + get data for an incoming grafana-dashboard relation through a + :class:`GrafanaDashboardsChanged` event, and make the relation data + available in the app's datastore object. The Grafana charm can + then respond to the event to update its configuration. 
+ """ + changes = False + if self._charm.unit.is_leader(): + changes = self._render_dashboards_and_signal_changed(event.relation) + + if changes: + self.on.dashboards_changed.emit() # pyright: ignore + + def _on_grafana_peer_changed(self, _: RelationChangedEvent) -> None: + """Emit dashboard events on peer events so secondary charm data updates.""" + if self._charm.unit.is_leader(): + return + self.on.dashboards_changed.emit() # pyright: ignore + + def update_dashboards(self, relation: Optional[Relation] = None) -> None: + """Re-establish dashboards on one or more relations. + + If something changes between this library and a datasource, try to re-establish + invalid dashboards and invalidate active ones. + + Args: + relation: a specific relation for which the dashboards have to be + updated. If not specified, all relations managed by this + :class:`GrafanaDashboardConsumer` will be updated. + """ + if self._charm.unit.is_leader(): + relations = ( + [relation] if relation else self._charm.model.relations[self._relation_name] + ) + + for relation in relations: + self._render_dashboards_and_signal_changed(relation) + + def _on_grafana_dashboard_relation_broken(self, event: RelationBrokenEvent) -> None: + """Update job config when providers depart. + + When a Grafana dashboard provider departs, the configuration + for that provider is removed from the list of dashboards + """ + if not self._charm.unit.is_leader(): + return + + self._remove_all_dashboards_for_relation(event.relation) + + def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool: # type: ignore + """Validate a given dashboard. + + Verify that the passed dashboard data is able to be found in our list + of datasources and will render. If they do, let the charm know by + emitting an event. + + Args: + relation: Relation; The relation the dashboard is associated with. + + Returns: + a boolean indicating whether an event should be emitted + """ + other_app = relation.app + + raw_data = relation.data[other_app].get("dashboards", "") # pyright: ignore + + if not raw_data: + logger.warning( + "No dashboard data found in the %s:%s relation", + self._relation_name, + str(relation.id), + ) + return False + + data = json.loads(raw_data) + + # The only piece of data needed on this side of the relations is "templates" + templates = data.pop("templates") + + # The dashboards are WAY too big since this ultimately calls out to Juju to + # set the relation data, and it overflows the maximum argument length for + # subprocess, so we have to use b64, annoyingly. + # Worse, Python3 expects absolutely everything to be a byte, and a plain + # `base64.b64encode()` is still too large, so we have to go through hoops + # of encoding to byte, compressing with lzma, converting to base64 so it + # can be converted to JSON, then all the way back. 
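+        #
+        # A round-trip sketch using the helpers defined above in this module:
+        #
+        #     encoded = _encode_dashboard_content('{"title": "t"}')
+        #     _decode_dashboard_content(encoded)  # -> '{"title": "t"}'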
+ + rendered_dashboards = [] + relation_has_invalid_dashboards = False + + for _, (fname, template) in enumerate(templates.items()): + content = None + error = None + topology = template.get("juju_topology", {}) + try: + content = _decode_dashboard_content(template["content"]) + inject_dropdowns = template.get("inject_dropdowns", True) + content = self._manage_dashboard_uid(content, template) + content = _convert_dashboard_fields(content, inject_dropdowns) + + if topology: + content = _inject_labels(content, topology, self._tranformer) + + content = _encode_dashboard_content(content) + except lzma.LZMAError as e: + error = str(e) + relation_has_invalid_dashboards = True + except json.JSONDecodeError as e: + error = str(e.msg) + logger.warning("Invalid JSON in Grafana dashboard: {}".format(fname)) + continue + + # Prepend the relation name and ID to the dashboard ID to avoid clashes with + # multiple relations with apps from the same charm, or having dashboards with + # the same ids inside their charm operators + rendered_dashboards.append( + { + "id": "{}:{}/{}".format(relation.name, relation.id, fname), + "original_id": fname, + "content": content if content else None, + "template": template, + "valid": (error is None), + "error": error, + } + ) + + if relation_has_invalid_dashboards: + self._remove_all_dashboards_for_relation(relation) + + invalid_templates = [ + data["original_id"] for data in rendered_dashboards if not data["valid"] + ] + + logger.warning( + "Cannot add one or more Grafana dashboards from relation '{}:{}': the following " + "templates are invalid: {}".format( + relation.name, + relation.id, + invalid_templates, + ) + ) + + relation.data[self._charm.app]["event"] = json.dumps( + { + "errors": [ + { + "dashboard_id": rendered_dashboard["original_id"], + "error": rendered_dashboard["error"], + } + for rendered_dashboard in rendered_dashboards + if rendered_dashboard["error"] + ] + } + ) + + # Dropping dashboards for a relation needs to be signalled + return True + + stored_data = rendered_dashboards + currently_stored_data = self._get_stored_dashboards(relation.id) + + coerced_data = _type_convert_stored(currently_stored_data) if currently_stored_data else {} + + if not coerced_data == stored_data: + stored_dashboards = self.get_peer_data("dashboards") + stored_dashboards[relation.id] = stored_data + self.set_peer_data("dashboards", stored_dashboards) + return True + return None # type: ignore + + def _manage_dashboard_uid(self, dashboard: str, template: dict) -> str: + """Add an uid to the dashboard if it is not present.""" + dashboard_dict = json.loads(dashboard) + + if not dashboard_dict.get("uid", None) and "dashboard_alt_uid" in template: + dashboard_dict["uid"] = template["dashboard_alt_uid"] + + return json.dumps(dashboard_dict) + + def _remove_all_dashboards_for_relation(self, relation: Relation) -> None: + """If an errored dashboard is in stored data, remove it and trigger a deletion.""" + if self._get_stored_dashboards(relation.id): + stored_dashboards = self.get_peer_data("dashboards") + stored_dashboards.pop(str(relation.id)) + self.set_peer_data("dashboards", stored_dashboards) + self.on.dashboards_changed.emit() # pyright: ignore + + def _to_external_object(self, relation_id, dashboard): + return { + "id": dashboard["original_id"], + "relation_id": relation_id, + "charm": dashboard["template"]["charm"], + "content": _decode_dashboard_content(dashboard["content"]), + } + + @property + def dashboards(self) -> List[Dict]: + """Get a list of known dashboards 
across all instances of the monitored relation. + + Returns: a list of known dashboards. The JSON of each of the dashboards is available + in the `content` field of the corresponding `dict`. + """ + dashboards = [] + + for _, (relation_id, dashboards_for_relation) in enumerate( + self.get_peer_data("dashboards").items() + ): + for dashboard in dashboards_for_relation: + dashboards.append(self._to_external_object(relation_id, dashboard)) + + return dashboards + + def _get_stored_dashboards(self, relation_id: int) -> list: + """Pull stored dashboards out of the peer data bucket.""" + return self.get_peer_data("dashboards").get(str(relation_id), {}) + + def _set_default_data(self) -> None: + """Set defaults if they are not in peer relation data.""" + data = {"dashboards": {}} # type: ignore + for k, v in data.items(): + if not self.get_peer_data(k): + self.set_peer_data(k, v) + + def set_peer_data(self, key: str, data: Any) -> None: + """Put information into the peer data bucket instead of `StoredState`.""" + self._charm.peers.data[self._charm.app][key] = json.dumps(data) # type: ignore[attr-defined] + + def get_peer_data(self, key: str) -> Any: + """Retrieve information from the peer data bucket instead of `StoredState`.""" + data = self._charm.peers.data[self._charm.app].get(key, "") # type: ignore[attr-defined] + return json.loads(data) if data else {} + + +class GrafanaDashboardAggregator(Object): + """API to retrieve Grafana dashboards from machine dashboards. + + The :class:`GrafanaDashboardAggregator` object provides a way to + collate and aggregate Grafana dashboards from reactive/machine charms + and transport them into Charmed Operators, using Juju topology. + For detailed usage instructions, see the documentation for + :module:`cos-proxy-operator`, as this class is intended for use as a + single point of intersection rather than use in individual charms. + + Since :class:`GrafanaDashboardAggregator` serves as a bridge between + Canonical Observability Stack Charmed Operators and Reactive Charms, + deployed in a Reactive Juju model, both a target relation which is + used to collect events from Reactive charms and a `grafana_relation` + which is used to send the collected data back to the Canonical + Observability Stack are required. + + In its most streamlined usage, :class:`GrafanaDashboardAggregator` is + integrated in a charmed operator as follows: + self.grafana = GrafanaDashboardAggregator(self) + + Args: + charm: a :class:`CharmBase` object which manages this + :class:`GrafanaProvider` object. Generally this is + `self` in the instantiating class. + target_relation: a :string: name of a relation managed by this + :class:`GrafanaDashboardAggregator`, which is used to communicate + with reactive/machine charms it defaults to "dashboards". + grafana_relation: a :string: name of a relation used by this + :class:`GrafanaDashboardAggregator`, which is used to communicate + with charmed grafana. It defaults to "downstream-grafana-dashboard" + """ + + _stored = StoredState() + on = GrafanaProviderEvents() # pyright: ignore + + def __init__( + self, + charm: CharmBase, + target_relation: str = "dashboards", + grafana_relation: str = "downstream-grafana-dashboard", + ): + super().__init__(charm, grafana_relation) + + # Reactive charms may be RPC-ish and not leave reliable data around. 
Keep + # StoredState here + self._stored.set_default( # type: ignore + dashboard_templates={}, + id_mappings={}, + ) + + self._charm = charm + self._target_relation = target_relation + self._grafana_relation = grafana_relation + + self.framework.observe( + self._charm.on[self._grafana_relation].relation_joined, + self._update_remote_grafana, + ) + self.framework.observe( + self._charm.on[self._grafana_relation].relation_changed, + self._update_remote_grafana, + ) + self.framework.observe( + self._charm.on[self._target_relation].relation_changed, + self.update_dashboards, + ) + self.framework.observe( + self._charm.on[self._target_relation].relation_broken, + self.remove_dashboards, + ) + + def update_dashboards(self, event: RelationEvent) -> None: + """If we get a dashboard from a reactive charm, parse it out and update.""" + if self._charm.unit.is_leader(): + self._upset_dashboards_on_event(event) + + def _upset_dashboards_on_event(self, event: RelationEvent) -> None: + """Update the dashboards in the relation data bucket.""" + dashboards = self._handle_reactive_dashboards(event) + + if not dashboards: + logger.warning( + "Could not find dashboard data after a relation change for {}".format(event.app) + ) + return + + for id in dashboards: + self._stored.dashboard_templates[id] = self._content_to_dashboard_object( # type: ignore + dashboards[id], event + ) + + self._stored.id_mappings[event.app.name] = dashboards # type: ignore + self._update_remote_grafana(event) + + def _update_remote_grafana(self, _: Optional[RelationEvent] = None) -> None: + """Push dashboards to the downstream Grafana relation.""" + # It's still ridiculous to add a UUID here, but needed + stored_data = { + "templates": _type_convert_stored(self._stored.dashboard_templates), # pyright: ignore + "uuid": str(uuid.uuid4()), + } + + if self._charm.unit.is_leader(): + for grafana_relation in self.model.relations[self._grafana_relation]: + grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data) + + def remove_dashboards(self, event: RelationBrokenEvent) -> None: + """Remove a dashboard if the relation is broken.""" + app_ids = _type_convert_stored(self._stored.id_mappings.get(event.app.name, "")) # type: ignore + + if not app_ids: + logger.info("Could not look up stored dashboards for %s", event.app.name) # type: ignore + return + + del self._stored.id_mappings[event.app.name] # type: ignore + for id in app_ids: + del self._stored.dashboard_templates[id] # type: ignore + + stored_data = { + "templates": _type_convert_stored(self._stored.dashboard_templates), # pyright: ignore + "uuid": str(uuid.uuid4()), + } + + if self._charm.unit.is_leader(): + for grafana_relation in self.model.relations[self._grafana_relation]: + grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data) + + # Yes, this has a fair amount of branching. It's not that complex, though + def _strip_existing_datasources(self, dash: dict) -> dict: # noqa: C901 + """Remove existing reactive charm datasource templating out. + + This method iterates through *known* places where reactive charms may set + data in contributed dashboards and removes them. + + `dashboard["__inputs"]` is a property sometimes set when exporting dashboards from + the Grafana UI. It is not present in earlier Grafana versions, and can be disabled + in 5.3.4 and above (optionally). If set, any values present will be substituted on + import. Some reactive charms use this for Prometheus. 
COS uses dropdown selectors + for datasources, and leaving this present results in "default" datasource values + which are broken. + + Similarly, `dashboard["templating"]["list"][N]["name"] == "host"` can be used to + set a `host` variable for use in dashboards which is not meaningful in the context + of Juju topology and will yield broken dashboards. + + Further properties may be discovered. + """ + try: + if "list" in dash["templating"]: + for i in range(len(dash["templating"]["list"])): + if ( + "datasource" in dash["templating"]["list"][i] + and dash["templating"]["list"][i]["datasource"] is not None + ): + if "Juju" in dash["templating"]["list"][i].get("datasource", ""): + dash["templating"]["list"][i]["datasource"] = r"${prometheusds}" + + # Strip out newly-added 'juju_application' template variables which + # don't line up with our drop-downs + dash_mutable = dash + for i in range(len(dash["templating"]["list"])): + if ( + "name" in dash["templating"]["list"][i] + and dash["templating"]["list"][i].get("name", "") == "app" + ): + del dash_mutable["templating"]["list"][i] + + if dash_mutable: + dash = dash_mutable + except KeyError: + logger.debug("No existing templating data in dashboard") + + if "__inputs" in dash: + inputs = dash + for i in range(len(dash["__inputs"])): + if dash["__inputs"][i].get("pluginName", "") == "Prometheus": + del inputs["__inputs"][i] + if inputs: + dash["__inputs"] = inputs["__inputs"] + else: + del dash["__inputs"] + + return dash + + def _handle_reactive_dashboards(self, event: RelationEvent) -> Optional[Dict]: + """Look for a dashboard in relation data (during a reactive hook) or builtin by name.""" + if not self._charm.unit.is_leader(): + return {} + + templates = [] + id = "" + + # Reactive data can reliably be pulled out of events. In theory, if we got an event, + # it's on the bucket, but using event explicitly keeps the mental model in + # place for reactive + for k in event.relation.data[event.unit].keys(): # type: ignore + if k.startswith("request_"): + templates.append(json.loads(event.relation.data[event.unit][k])["dashboard"]) # type: ignore + + for k in event.relation.data[event.app].keys(): # type: ignore + if k.startswith("request_"): + templates.append(json.loads(event.relation.data[event.app][k])["dashboard"]) # type: ignore + + builtins = self._maybe_get_builtin_dashboards(event) + + if not templates and not builtins: + logger.warning("NOTHING!") + return {} + + dashboards = {} + for t in templates: + # This seems ridiculous, too, but to get it from a "dashboards" key in serialized JSON + # in the bucket back out to the actual "dashboard" we _need_, this is the way + # This is not a mistake -- there's a double nesting in reactive charms, and + # Grafana won't load it. We have to unbox: + # event.relation.data[event.]["request_*"]["dashboard"]["dashboard"], + # and the final unboxing is below. 
+ # + # Apparently SOME newer dashboards (such as Ceph) do not have this double nesting, so + # now we get to account for both :toot: + dash = t.get("dashboard", {}) or t + + # Replace values with LMA-style templating + dash = self._strip_existing_datasources(dash) + dash = json.dumps(dash) + + # Replace the old-style datasource templates + dash = re.sub(r"<< datasource >>", r"${prometheusds}", dash) + dash = re.sub(r'"datasource": "prom.*?"', r'"datasource": "${prometheusds}"', dash) + dash = re.sub( + r'"datasource": "\$datasource"', r'"datasource": "${prometheusds}"', dash + ) + dash = re.sub(r'"uid": "\$datasource"', r'"uid": "${prometheusds}"', dash) + dash = re.sub( + r'"datasource": "(!?\w)[\w|\s|-]+?Juju generated.*?"', + r'"datasource": "${prometheusds}"', + dash, + ) + + # Yank out "new"+old LMA topology + dash = re.sub( + r'(,?\s?juju_application=~)\\"\$app\\"', r'\1\\"$juju_application\\"', dash + ) + + # Replace old piechart panels + dash = re.sub(r'"type": "grafana-piechart-panel"', '"type": "piechart"', dash) + + from jinja2 import DebugUndefined, Template + + content = _encode_dashboard_content( + Template(dash, undefined=DebugUndefined).render(datasource=r"${prometheusds}") # type: ignore + ) + id = "prog:{}".format(content[-24:-16]) + + dashboards[id] = content + return {**builtins, **dashboards} + + def _maybe_get_builtin_dashboards(self, event: RelationEvent) -> Dict: + """Tries to match the event with an included dashboard. + + Scans dashboards packed with the charm instantiating this class, and tries to match + one with the event. There is no guarantee that any given event will match a builtin, + since each charm instantiating this class may include a different set of dashboards, + or none. + """ + builtins = {} + dashboards_path = None + + try: + dashboards_path = _resolve_dir_against_charm_path( + self._charm, "src/grafana_dashboards" + ) + except InvalidDirectoryPathError as e: + logger.warning( + "Invalid Grafana dashboards folder at %s: %s", + e.grafana_dashboards_absolute_path, + e.message, + ) + + if dashboards_path: + + def is_dashboard(p: Path) -> bool: + return p.is_file() and p.name.endswith((".json", ".json.tmpl", ".tmpl")) + + for path in filter(is_dashboard, Path(dashboards_path).glob("*")): + # path = Path(path) + if event.app.name in path.name: # type: ignore + id = "file:{}".format(path.stem) + builtins[id] = self._content_to_dashboard_object( + _encode_dashboard_content(path.read_bytes()), event + ) + + return builtins + + def _content_to_dashboard_object(self, content: str, event: RelationEvent) -> Dict: + return { + "charm": event.app.name, # type: ignore + "content": content, + "juju_topology": self._juju_topology(event), + "inject_dropdowns": True, + } + + # This is not actually used in the dashboards, but is present to provide a secondary + # salt to ensure uniqueness in the dict keys in case individual charm units provide + # dashboards + def _juju_topology(self, event: RelationEvent) -> Dict: + return { + "model": self._charm.model.name, + "model_uuid": self._charm.model.uuid, + "application": event.app.name, # type: ignore + "unit": event.unit.name, # type: ignore + } + + +class CosTool: + """Uses cos-tool to inject label matchers into alert rule expressions and validate rules.""" + + _path = None + _disabled = False + + def __init__(self, charm): + self._charm = charm + + @property + def path(self): + """Lazy lookup of the path of cos-tool.""" + if self._disabled: + return None + if not self._path: + self._path = self._get_tool_path() + if 
not self._path: + logger.debug("Skipping injection of juju topology as label matchers") + self._disabled = True + return self._path + + def apply_label_matchers(self, rules: dict, type: str) -> dict: + """Will apply label matchers to the expression of all alerts in all supplied groups.""" + if not self.path: + return rules + for group in rules["groups"]: + rules_in_group = group.get("rules", []) + for rule in rules_in_group: + topology = {} + # if the user for some reason has provided juju_unit, we'll need to honor it + # in most cases, however, this will be empty + for label in [ + "juju_model", + "juju_model_uuid", + "juju_application", + "juju_charm", + "juju_unit", + ]: + if label in rule["labels"]: + topology[label] = rule["labels"][label] + + rule["expr"] = self.inject_label_matchers(rule["expr"], topology, type) + return rules + + def validate_alert_rules(self, rules: dict) -> Tuple[bool, str]: + """Will validate correctness of alert rules, returning a boolean and any errors.""" + if not self.path: + logger.debug("`cos-tool` unavailable. Not validating alert correctness.") + return True, "" + + with tempfile.TemporaryDirectory() as tmpdir: + rule_path = Path(tmpdir + "/validate_rule.yaml") + + # Smash "our" rules format into what upstream actually uses, which is more like: + # + # groups: + # - name: foo + # rules: + # - alert: SomeAlert + # expr: up + # - alert: OtherAlert + # expr: up + transformed_rules = {"groups": []} # type: ignore + for rule in rules["groups"]: + transformed = {"name": str(uuid.uuid4()), "rules": [rule]} + transformed_rules["groups"].append(transformed) + + rule_path.write_text(yaml.dump(transformed_rules)) + + args = [str(self.path), "validate", str(rule_path)] + # noinspection PyBroadException + try: + self._exec(args) + return True, "" + except subprocess.CalledProcessError as e: + logger.debug("Validating the rules failed: %s", e.output) + return False, ", ".join([line for line in e.output if "error validating" in line]) + + def inject_label_matchers(self, expression: str, topology: dict, type: str) -> str: + """Add label matchers to an expression.""" + if not topology: + return expression + if not self.path: + logger.debug("`cos-tool` unavailable. 
Leaving expression unchanged: %s", expression) + return expression + args = [str(self.path), "--format", type, "transform"] + + variable_topology = {k: "${}".format(k) for k in topology.keys()} + args.extend( + [ + "--label-matcher={}={}".format(key, value) + for key, value in variable_topology.items() + ] + ) + + # Pass a leading "--" so expressions with a negation or subtraction aren't interpreted as + # flags + args.extend(["--", "{}".format(expression)]) + # noinspection PyBroadException + try: + return re.sub(r'="\$juju', r'=~"$juju', self._exec(args)) + except subprocess.CalledProcessError as e: + logger.debug('Applying the expression failed: "%s", falling back to the original', e) + return expression + + def _get_tool_path(self) -> Optional[Path]: + arch = platform.machine() + arch = "amd64" if arch == "x86_64" else arch + res = "cos-tool-{}".format(arch) + try: + path = Path(res).resolve() + path.chmod(0o777) + return path + except NotImplementedError: + logger.debug("System lacks support for chmod") + except FileNotFoundError: + logger.debug('Could not locate cos-tool at: "{}"'.format(res)) + return None + + def _exec(self, cmd) -> str: + result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE) + output = result.stdout.decode("utf-8").strip() + return output diff --git a/charms/istio-gateway/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/charms/istio-gateway/lib/charms/prometheus_k8s/v0/prometheus_scrape.py new file mode 100644 index 00000000..e3d35c6f --- /dev/null +++ b/charms/istio-gateway/lib/charms/prometheus_k8s/v0/prometheus_scrape.py @@ -0,0 +1,2378 @@ +# Copyright 2021 Canonical Ltd. +# See LICENSE file for licensing details. +"""Prometheus Scrape Library. + +## Overview + +This document explains how to integrate with the Prometheus charm +for the purpose of providing a metrics endpoint to Prometheus. It +also explains how alternative implementations of the Prometheus charms +may maintain the same interface and be backward compatible with all +currently integrated charms. Finally this document is the +authoritative reference on the structure of relation data that is +shared between Prometheus charms and any other charm that intends to +provide a scrape target for Prometheus. + +## Source code + +Source code can be found on GitHub at: + https://github.com/canonical/prometheus-k8s-operator/tree/main/lib/charms/prometheus_k8s + +## Provider Library Usage + +This Prometheus charm interacts with its scrape targets using its +charm library. Charms seeking to expose metric endpoints for the +Prometheus charm, must do so using the `MetricsEndpointProvider` +object from this charm library. For the simplest use cases, using the +`MetricsEndpointProvider` object only requires instantiating it, +typically in the constructor of your charm (the one which exposes a +metrics endpoint). The `MetricsEndpointProvider` constructor requires +the name of the relation over which a scrape target (metrics endpoint) +is exposed to the Prometheus charm. This relation must use the +`prometheus_scrape` interface. By default address of the metrics +endpoint is set to the unit IP address, by each unit of the +`MetricsEndpointProvider` charm. These units set their address in +response to the `PebbleReady` event of each container in the unit, +since container restarts of Kubernetes charms can result in change of +IP addresses. The default name for the metrics endpoint relation is +`metrics-endpoint`. 
It is strongly recommended to use the same
+relation name for consistency across charms and doing so obviates the
+need for an additional constructor argument. The
+`MetricsEndpointProvider` object may be instantiated as follows
+
+ from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider
+
+ def __init__(self, *args):
+ super().__init__(*args)
+ ...
+ self.metrics_endpoint = MetricsEndpointProvider(self)
+ ...
+
+Note that the first argument (`self`) to `MetricsEndpointProvider` is
+always a reference to the parent (scrape target) charm.
+
+An instantiated `MetricsEndpointProvider` object will ensure that each
+unit of its parent charm is a scrape target for the
+`MetricsEndpointConsumer` (Prometheus) charm. By default
+`MetricsEndpointProvider` assumes each unit of the provider charm
+exports its metrics at a path given by `/metrics` on port 80. These
+defaults may be changed by providing the `MetricsEndpointProvider`
+constructor an optional argument (`jobs`) that represents a
+Prometheus scrape job specification using Python standard data
+structures. This job specification is a subset of Prometheus' own
+[scrape
+configuration](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config)
+format but represented using Python data structures. More than one job
+may be provided using the `jobs` argument. Hence `jobs` accepts a list
+of dictionaries where each dictionary represents one `<scrape_config>`
+object as described in the Prometheus documentation. The currently
+supported configuration subset is: `job_name`, `metrics_path`,
+`static_configs`.
+
+Suppose it is required to change the port on which scraped metrics are
+exposed to 8000. This may be done by providing the following data
+structure as the value of `jobs`.
+
+```
+[
+ {
+ "static_configs": [
+ {
+ "targets": ["*:8000"]
+ }
+ ]
+ }
+]
+```
+
+The wildcard ("*") host specification implies that the scrape targets
+will automatically be set to the host addresses advertised by each
+unit of the provider charm.
+
+It is also possible to change the metrics path and scrape multiple
+ports, for example
+
+```
+[
+ {
+ "metrics_path": "/my-metrics-path",
+ "static_configs": [
+ {
+ "targets": ["*:8000", "*:8081"],
+ }
+ ]
+ }
+]
+```
+
+More complex scrape configurations are possible. For example
+
+```
+[
+ {
+ "static_configs": [
+ {
+ "targets": ["10.1.32.215:7000", "*:8000"],
+ "labels": {
+ "some_key": "some-value"
+ }
+ }
+ ]
+ }
+]
+```
+
+This example scrapes the target "10.1.32.215" at port 7000 in addition
+to scraping each unit at port 8000. There is, however, one difference
+between wildcard targets (specified using "*") and fully qualified
+targets (such as "10.1.32.215"). The Prometheus charm automatically
+associates labels with metrics generated by each target. These labels
+localise the source of metrics within the Juju topology by specifying
+its "model name", "model UUID", "application name" and "unit
+name". However, the unit name is associated only with wildcard targets and
+not with fully qualified targets.
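+
+As an illustration only (the model, application, unit and address values
+below are placeholders, not part of the interface), a wildcard job such as
+
+```
+[
+ {
+ "job_name": "foo",
+ "static_configs": [
+ {
+ "targets": ["*:8000"]
+ }
+ ]
+ }
+]
+```
+
+is expanded by the Prometheus charm into roughly one scrape job per unit,
+with the wildcard replaced by that unit's advertised address and the Juju
+topology attached as labels:
+
+```
+{
+ "job_name": "juju_mymodel_<uuid>_myapp_prometheus_scrape_foo-0",
+ "static_configs": [
+ {
+ "targets": ["10.1.32.215:8000"],
+ "labels": {
+ "juju_model": "mymodel",
+ "juju_model_uuid": "<uuid>",
+ "juju_application": "myapp",
+ "juju_unit": "myapp/0"
+ }
+ }
+ ]
+}
+```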
+
+Multiple jobs with different metrics paths and labels are allowed, but
+each job must be given a unique name:
+
+```
+[
+ {
+ "job_name": "my-first-job",
+ "metrics_path": "one-path",
+ "static_configs": [
+ {
+ "targets": ["*:7000"],
+ "labels": {
+ "some_key": "some-value"
+ }
+ }
+ ]
+ },
+ {
+ "job_name": "my-second-job",
+ "metrics_path": "another-path",
+ "static_configs": [
+ {
+ "targets": ["*:8000"],
+ "labels": {
+ "some_other_key": "some-other-value"
+ }
+ }
+ ]
+ }
+]
+```
+
+**Important:** `job_name` should be a fixed string (e.g. hardcoded literal).
+For instance, if you include variable elements, like your `unit.name`, it may break
+the continuity of the metrics time series gathered by Prometheus when the leader unit
+changes (e.g. on upgrade or rescale).
+
+Additionally, it is also technically possible, but **strongly discouraged**, to
+configure the following scrape-related settings, which behave as described by the
+[Prometheus documentation](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config):
+
+- `static_configs`
+- `scrape_interval`
+- `scrape_timeout`
+- `proxy_url`
+- `relabel_configs`
+- `metric_relabel_configs`
+- `sample_limit`
+- `label_limit`
+- `label_name_length_limit`
+- `label_value_length_limit`
+
+The settings above are supported by the `prometheus_scrape` library only for the sake of
+specialized facilities like the [Prometheus Scrape Config](https://charmhub.io/prometheus-scrape-config-k8s)
+charm. Virtually no charms should use these settings, and charmers definitely **should not**
+expose them to the Juju administrator via configuration options.
+
+## Consumer Library Usage
+
+The `MetricsEndpointConsumer` object may be used by Prometheus
+charms to manage relations with their scrape targets. For this
+purpose, a Prometheus charm needs to do two things:
+
+1. Instantiate the `MetricsEndpointConsumer` object by providing it a
+reference to the parent (Prometheus) charm and, optionally, the name of
+the relation that the Prometheus charm uses to interact with scrape
+targets. This relation must conform to the `prometheus_scrape`
+interface, and it is strongly recommended that this relation be named
+`metrics-endpoint`, which is its default value.
+
+For example, a Prometheus charm may instantiate the
+`MetricsEndpointConsumer` in its constructor as follows
+
+ from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointConsumer
+
+ def __init__(self, *args):
+ super().__init__(*args)
+ ...
+ self.metrics_consumer = MetricsEndpointConsumer(self)
+ ...
+
+2. A Prometheus charm also needs to respond to the
+`TargetsChangedEvent` event of the `MetricsEndpointConsumer` by adding itself as
+an observer for these events, as in
+
+ self.framework.observe(
+ self.metrics_consumer.on.targets_changed,
+ self._on_scrape_targets_changed,
+ )
+
+In responding to the `TargetsChangedEvent` event, the Prometheus
+charm must update the Prometheus configuration so that any new scrape
+targets are added and/or old ones removed from the list of scraped
+endpoints. For this purpose the `MetricsEndpointConsumer` object
+exposes a `jobs()` method that returns a list of scrape jobs. Each
+element of this list is the Prometheus scrape configuration for that
+job. In order to update the Prometheus configuration, the Prometheus
+charm needs to replace the current list of jobs with the list provided
+by `jobs()` as follows
+
+ def _on_scrape_targets_changed(self, event):
+ ...
+ scrape_jobs = self.metrics_consumer.jobs() + for job in scrape_jobs: + prometheus_scrape_config.append(job) + ... + +## Alerting Rules + +This charm library also supports gathering alerting rules from all +related `MetricsEndpointProvider` charms and enabling corresponding alerts within the +Prometheus charm. Alert rules are automatically gathered by `MetricsEndpointProvider` +charms when using this library, from a directory conventionally named +`prometheus_alert_rules`. This directory must reside at the top level +in the `src` folder of the consumer charm. Each file in this directory +is assumed to be in one of two formats: +- the official prometheus alert rule format, conforming to the +[Prometheus docs](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) +- a single rule format, which is a simplified subset of the official format, +comprising a single alert rule per file, using the same YAML fields. + +The file name must have one of the following extensions: +- `.rule` +- `.rules` +- `.yml` +- `.yaml` + +An example of the contents of such a file in the custom single rule +format is shown below. + +``` +alert: HighRequestLatency +expr: job:request_latency_seconds:mean5m{my_key=my_value} > 0.5 +for: 10m +labels: + severity: Medium + type: HighLatency +annotations: + summary: High request latency for {{ $labels.instance }}. +``` + +The `MetricsEndpointProvider` will read all available alert rules and +also inject "filtering labels" into the alert expressions. The +filtering labels ensure that alert rules are localised to the metrics +provider charm's Juju topology (application, model and its UUID). Such +a topology filter is essential to ensure that alert rules submitted by +one provider charm generates alerts only for that same charm. When +alert rules are embedded in a charm, and the charm is deployed as a +Juju application, the alert rules from that application have their +expressions automatically updated to filter for metrics coming from +the units of that application alone. This remove risk of spurious +evaluation, e.g., when you have multiple deployments of the same charm +monitored by the same Prometheus. + +Not all alerts one may want to specify can be embedded in a +charm. Some alert rules will be specific to a user's use case. This is +the case, for example, of alert rules that are based on business +constraints, like expecting a certain amount of requests to a specific +API every five minutes. Such alert rules can be specified via the +[COS Config Charm](https://charmhub.io/cos-configuration-k8s), +which allows importing alert rules and other settings like dashboards +from a Git repository. + +Gathering alert rules and generating rule files within the Prometheus +charm is easily done using the `alerts()` method of +`MetricsEndpointConsumer`. Alerts generated by Prometheus will +automatically include Juju topology labels in the alerts. These labels +indicate the source of the alert. The following labels are +automatically included with each alert + +- `juju_model` +- `juju_model_uuid` +- `juju_application` + +## Relation Data + +The Prometheus charm uses both application and unit relation data to +obtain information regarding its scrape jobs, alert rules and scrape +targets. This relation data is in JSON format and it closely resembles +the YAML structure of Prometheus [scrape configuration] +(https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config). 
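+
+Purely as an illustrative sketch (all values below are placeholders, and the
+exact keys are described next), the relation data set by a metrics provider
+may look roughly like:
+
+```
+# unit relation data
+{
+ "prometheus_scrape_unit_name": "myapp/0",
+ "prometheus_scrape_unit_address": "10.1.32.215"
+}
+
+# application relation data
+{
+ "scrape_metadata": "{... serialized Juju topology ...}",
+ "scrape_jobs": "[... serialized scrape jobs ...]",
+ "alert_rules": "{... serialized alert rule groups ...}"
+}
+```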
+ +Units of Metrics provider charms advertise their names and addresses +over unit relation data using the `prometheus_scrape_unit_name` and +`prometheus_scrape_unit_address` keys. While the `scrape_metadata`, +`scrape_jobs` and `alert_rules` keys in application relation data +of Metrics provider charms hold eponymous information. + +""" # noqa: W505 + +import copy +import hashlib +import ipaddress +import json +import logging +import os +import platform +import re +import socket +import subprocess +import tempfile +from collections import defaultdict +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from urllib.parse import urlparse + +import yaml +from cosl import JujuTopology +from cosl.rules import AlertRules +from ops.charm import CharmBase, RelationRole +from ops.framework import ( + BoundEvent, + EventBase, + EventSource, + Object, + ObjectEvents, + StoredDict, + StoredList, + StoredState, +) +from ops.model import Relation + +# The unique Charmhub library identifier, never change it +LIBID = "bc84295fef5f4049878f07b131968ee2" + +# Increment this major API version when introducing breaking changes +LIBAPI = 0 + +# Increment this PATCH version before using `charmcraft publish-lib` or reset +# to 0 if you are raising the major API version +LIBPATCH = 47 + +PYDEPS = ["cosl"] + +logger = logging.getLogger(__name__) + + +ALLOWED_KEYS = { + "job_name", + "metrics_path", + "static_configs", + "scrape_interval", + "scrape_timeout", + "proxy_url", + "relabel_configs", + "metric_relabel_configs", + "sample_limit", + "label_limit", + "label_name_length_limit", + "label_value_length_limit", + "scheme", + "basic_auth", + "tls_config", + "authorization", + "params", +} +DEFAULT_JOB = { + "metrics_path": "/metrics", + "static_configs": [{"targets": ["*:80"]}], +} + + +DEFAULT_RELATION_NAME = "metrics-endpoint" +RELATION_INTERFACE_NAME = "prometheus_scrape" + +DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules" + + +class PrometheusConfig: + """A namespace for utility functions for manipulating the prometheus config dict.""" + + # relabel instance labels so that instance identifiers are globally unique + # stable over unit recreation + topology_relabel_config = { + "source_labels": ["juju_model", "juju_model_uuid", "juju_application"], + "separator": "_", + "target_label": "instance", + "regex": "(.*)", + } + + topology_relabel_config_wildcard = { + "source_labels": ["juju_model", "juju_model_uuid", "juju_application", "juju_unit"], + "separator": "_", + "target_label": "instance", + "regex": "(.*)", + } + + @staticmethod + def sanitize_scrape_config(job: dict) -> dict: + """Restrict permissible scrape configuration options. + + If job is empty then a default job is returned. The + default job is + + ``` + { + "metrics_path": "/metrics", + "static_configs": [{"targets": ["*:80"]}], + } + ``` + + Args: + job: a dict containing a single Prometheus job + specification. + + Returns: + a dictionary containing a sanitized job specification. 
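+
+ For example (illustrative input; `honor_labels` is not in
+ `ALLOWED_KEYS` and is therefore dropped, while missing keys are
+ filled in from the default job):
+
+ ```
+ >>> PrometheusConfig.sanitize_scrape_config(
+ ... {"metrics_path": "/custom", "honor_labels": True}
+ ... )
+ {'metrics_path': '/custom', 'static_configs': [{'targets': ['*:80']}]}
+ ```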
+ """ + sanitized_job = DEFAULT_JOB.copy() + sanitized_job.update({key: value for key, value in job.items() if key in ALLOWED_KEYS}) + return sanitized_job + + @staticmethod + def sanitize_scrape_configs(scrape_configs: List[dict]) -> List[dict]: + """A vectorized version of `sanitize_scrape_config`.""" + return [PrometheusConfig.sanitize_scrape_config(job) for job in scrape_configs] + + @staticmethod + def prefix_job_names(scrape_configs: List[dict], prefix: str) -> List[dict]: + """Adds the given prefix to all the job names in the given scrape_configs list.""" + modified_scrape_configs = [] + for scrape_config in scrape_configs: + job_name = scrape_config.get("job_name") + modified = scrape_config.copy() + modified["job_name"] = prefix + "_" + job_name if job_name else prefix + modified_scrape_configs.append(modified) + + return modified_scrape_configs + + @staticmethod + def expand_wildcard_targets_into_individual_jobs( + scrape_jobs: List[dict], + hosts: Dict[str, Tuple[str, str]], + topology: Optional[JujuTopology] = None, + ) -> List[dict]: + """Extract wildcard hosts from the given scrape_configs list into separate jobs. + + Args: + scrape_jobs: list of scrape jobs. + hosts: a dictionary mapping host names to host address for + all units of the relation for which this job configuration + must be constructed. + topology: optional arg for adding topology labels to scrape targets. + """ + # hosts = self._relation_hosts(relation) + + modified_scrape_jobs = [] + for job in scrape_jobs: + static_configs = job.get("static_configs") + if not static_configs: + continue + + # When a single unit specified more than one wildcard target, then they are expanded + # into a static_config per target + non_wildcard_static_configs = [] + + for static_config in static_configs: + targets = static_config.get("targets") + if not targets: + continue + + # All non-wildcard targets remain in the same static_config + non_wildcard_targets = [] + + # All wildcard targets are extracted to a job per unit. If multiple wildcard + # targets are specified, they remain in the same static_config (per unit). + wildcard_targets = [] + + for target in targets: + match = re.compile(r"\*(?:(:\d+))?").match(target) + if match: + # This is a wildcard target. + # Need to expand into separate jobs and remove it from this job here + wildcard_targets.append(target) + else: + # This is not a wildcard target. Copy it over into its own static_config. + non_wildcard_targets.append(target) + + # All non-wildcard targets remain in the same static_config + if non_wildcard_targets: + non_wildcard_static_config = static_config.copy() + non_wildcard_static_config["targets"] = non_wildcard_targets + + if topology: + # When non-wildcard targets (aka fully qualified hostnames) are specified, + # there is no reliable way to determine the name (Juju topology unit name) + # for such a target. Therefore labeling with Juju topology, excluding the + # unit name. 
+ non_wildcard_static_config["labels"] = { + **topology.label_matcher_dict, + **non_wildcard_static_config.get("labels", {}), + } + + non_wildcard_static_configs.append(non_wildcard_static_config) + + # Extract wildcard targets into individual jobs + if wildcard_targets: + for unit_name, (unit_hostname, unit_path) in hosts.items(): + modified_job = job.copy() + modified_job["static_configs"] = [static_config.copy()] + modified_static_config = modified_job["static_configs"][0] + modified_static_config["targets"] = [ + target.replace("*", unit_hostname) for target in wildcard_targets + ] + + unit_num = unit_name.split("/")[-1] + job_name = modified_job.get("job_name", "unnamed-job") + "-" + unit_num + modified_job["job_name"] = job_name + modified_job["metrics_path"] = unit_path + ( + job.get("metrics_path") or "/metrics" + ) + + if topology: + # Add topology labels + modified_static_config["labels"] = { + **topology.label_matcher_dict, + **{"juju_unit": unit_name}, + **modified_static_config.get("labels", {}), + } + + # Instance relabeling for topology should be last in order. + modified_job["relabel_configs"] = modified_job.get( + "relabel_configs", [] + ) + [PrometheusConfig.topology_relabel_config_wildcard] + + modified_scrape_jobs.append(modified_job) + + if non_wildcard_static_configs: + modified_job = job.copy() + modified_job["static_configs"] = non_wildcard_static_configs + modified_job["metrics_path"] = modified_job.get("metrics_path") or "/metrics" + + if topology: + # Instance relabeling for topology should be last in order. + modified_job["relabel_configs"] = modified_job.get("relabel_configs", []) + [ + PrometheusConfig.topology_relabel_config + ] + + modified_scrape_jobs.append(modified_job) + + return modified_scrape_jobs + + @staticmethod + def render_alertmanager_static_configs(alertmanagers: List[str]): + """Render the alertmanager static_configs section from a list of URLs. + + Each target must be in the hostname:port format, and prefixes are specified in a separate + key. Therefore, with ingress in place, would need to extract the path into the + `path_prefix` key, which is higher up in the config hierarchy. + + https://prometheus.io/docs/prometheus/latest/configuration/configuration/#alertmanager_config + + Args: + alertmanagers: List of alertmanager URLs. + + Returns: + A dict representation for the static_configs section. + """ + # Make sure it's a valid url so urlparse could parse it. + scheme = re.compile(r"^https?://") + sanitized = [am if scheme.search(am) else "http://" + am for am in alertmanagers] + + # Create a mapping from paths to netlocs + # Group alertmanager targets into a dictionary of lists: + # {path: [netloc1, netloc2]} + paths = defaultdict(list) # type: Dict[Tuple[str, str], List[str]] + for parsed in map(urlparse, sanitized): + path = parsed.path or "/" + paths[(parsed.scheme, path)].append(parsed.netloc) + + return { + "alertmanagers": [ + { + # For https we still do not render a `tls_config` section because + # certs are expected to be made available by the charm via the + # `update-ca-certificates` mechanism. 
+ "scheme": scheme, + "path_prefix": path_prefix, + "static_configs": [{"targets": netlocs}], + } + for (scheme, path_prefix), netlocs in paths.items() + ] + } + + +class RelationNotFoundError(Exception): + """Raised if there is no relation with the given name is found.""" + + def __init__(self, relation_name: str): + self.relation_name = relation_name + self.message = "No relation named '{}' found".format(relation_name) + + super().__init__(self.message) + + +class RelationInterfaceMismatchError(Exception): + """Raised if the relation with the given name has a different interface.""" + + def __init__( + self, + relation_name: str, + expected_relation_interface: str, + actual_relation_interface: str, + ): + self.relation_name = relation_name + self.expected_relation_interface = expected_relation_interface + self.actual_relation_interface = actual_relation_interface + self.message = ( + "The '{}' relation has '{}' as interface rather than the expected '{}'".format( + relation_name, actual_relation_interface, expected_relation_interface + ) + ) + + super().__init__(self.message) + + +class RelationRoleMismatchError(Exception): + """Raised if the relation with the given name has a different role.""" + + def __init__( + self, + relation_name: str, + expected_relation_role: RelationRole, + actual_relation_role: RelationRole, + ): + self.relation_name = relation_name + self.expected_relation_interface = expected_relation_role + self.actual_relation_role = actual_relation_role + self.message = "The '{}' relation has role '{}' rather than the expected '{}'".format( + relation_name, repr(actual_relation_role), repr(expected_relation_role) + ) + + super().__init__(self.message) + + +class InvalidAlertRuleEvent(EventBase): + """Event emitted when alert rule files are not parsable. + + Enables us to set a clear status on the provider. + """ + + def __init__(self, handle, errors: str = "", valid: bool = False): + super().__init__(handle) + self.errors = errors + self.valid = valid + + def snapshot(self) -> Dict: + """Save alert rule information.""" + return { + "valid": self.valid, + "errors": self.errors, + } + + def restore(self, snapshot): + """Restore alert rule information.""" + self.valid = snapshot["valid"] + self.errors = snapshot["errors"] + + +class InvalidScrapeJobEvent(EventBase): + """Event emitted when alert rule files are not valid.""" + + def __init__(self, handle, errors: str = ""): + super().__init__(handle) + self.errors = errors + + def snapshot(self) -> Dict: + """Save error information.""" + return {"errors": self.errors} + + def restore(self, snapshot): + """Restore error information.""" + self.errors = snapshot["errors"] + + +class MetricsEndpointProviderEvents(ObjectEvents): + """Events raised by :class:`InvalidAlertRuleEvent`s.""" + + alert_rule_status_changed = EventSource(InvalidAlertRuleEvent) + invalid_scrape_job = EventSource(InvalidScrapeJobEvent) + + +def _type_convert_stored(obj): + """Convert Stored* to their appropriate types, recursively.""" + if isinstance(obj, StoredList): + return list(map(_type_convert_stored, obj)) + if isinstance(obj, StoredDict): + rdict = {} # type: Dict[Any, Any] + for k in obj.keys(): + rdict[k] = _type_convert_stored(obj[k]) + return rdict + return obj + + +def _validate_relation_by_interface_and_direction( + charm: CharmBase, + relation_name: str, + expected_relation_interface: str, + expected_relation_role: RelationRole, +): + """Verifies that a relation has the necessary characteristics. 
+ + Verifies that the `relation_name` provided: (1) exists in metadata.yaml, + (2) declares as interface the interface name passed as `relation_interface` + and (3) has the right "direction", i.e., it is a relation that `charm` + provides or requires. + + Args: + charm: a `CharmBase` object to scan for the matching relation. + relation_name: the name of the relation to be verified. + expected_relation_interface: the interface name to be matched by the + relation named `relation_name`. + expected_relation_role: whether the `relation_name` must be either + provided or required by `charm`. + + Raises: + RelationNotFoundError: If there is no relation in the charm's metadata.yaml + with the same name as provided via `relation_name` argument. + RelationInterfaceMismatchError: The relation with the same name as provided + via `relation_name` argument does not have the same relation interface + as specified via the `expected_relation_interface` argument. + RelationRoleMismatchError: If the relation with the same name as provided + via `relation_name` argument does not have the same role as specified + via the `expected_relation_role` argument. + """ + if relation_name not in charm.meta.relations: + raise RelationNotFoundError(relation_name) + + relation = charm.meta.relations[relation_name] + + actual_relation_interface = relation.interface_name + if actual_relation_interface != expected_relation_interface: + raise RelationInterfaceMismatchError( + relation_name, expected_relation_interface, actual_relation_interface or "None" + ) + + if expected_relation_role == RelationRole.provides: + if relation_name not in charm.meta.provides: + raise RelationRoleMismatchError( + relation_name, RelationRole.provides, RelationRole.requires + ) + elif expected_relation_role == RelationRole.requires: + if relation_name not in charm.meta.requires: + raise RelationRoleMismatchError( + relation_name, RelationRole.requires, RelationRole.provides + ) + else: + raise Exception("Unexpected RelationDirection: {}".format(expected_relation_role)) + + +class InvalidAlertRulePathError(Exception): + """Raised if the alert rules folder cannot be found or is otherwise invalid.""" + + def __init__( + self, + alert_rules_absolute_path: Path, + message: str, + ): + self.alert_rules_absolute_path = alert_rules_absolute_path + self.message = message + + super().__init__(self.message) + + +def _is_official_alert_rule_format(rules_dict: dict) -> bool: + """Are alert rules in the upstream format as supported by Prometheus. + + Alert rules in dictionary format are in "official" form if they + contain a "groups" key, since this implies they contain a list of + alert rule groups. + + Args: + rules_dict: a set of alert rules in Python dictionary format + + Returns: + True if alert rules are in official Prometheus file format. + """ + return "groups" in rules_dict + + +def _is_single_alert_rule_format(rules_dict: dict) -> bool: + """Are alert rules in single rule format. + + The Prometheus charm library supports reading of alert rules in a + custom format that consists of a single alert rule per file. This + does not conform to the official Prometheus alert rule file format + which requires that each alert rules file consists of a list of + alert rule groups and each group consists of a list of alert + rules. + + Alert rules in dictionary form are considered to be in single rule + format if in the least it contains two keys corresponding to the + alert rule name and alert expression. 
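+
+ For example (illustrative), the following dict is in single rule
+ format, since it contains both the `alert` and `expr` keys:
+
+ {"alert": "HighRequestLatency", "expr": "up < 1", "for": "10m"}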
+ + Returns: + True if alert rule is in single rule file format. + """ + # one alert rule per file + return set(rules_dict) >= {"alert", "expr"} + + +class TargetsChangedEvent(EventBase): + """Event emitted when Prometheus scrape targets change.""" + + def __init__(self, handle, relation_id): + super().__init__(handle) + self.relation_id = relation_id + + def snapshot(self): + """Save scrape target relation information.""" + return {"relation_id": self.relation_id} + + def restore(self, snapshot): + """Restore scrape target relation information.""" + self.relation_id = snapshot["relation_id"] + + +class MonitoringEvents(ObjectEvents): + """Event descriptor for events raised by `MetricsEndpointConsumer`.""" + + targets_changed = EventSource(TargetsChangedEvent) + + +class MetricsEndpointConsumer(Object): + """A Prometheus based Monitoring service.""" + + on = MonitoringEvents() # pyright: ignore + + def __init__(self, charm: CharmBase, relation_name: str = DEFAULT_RELATION_NAME): + """A Prometheus based Monitoring service. + + Args: + charm: a `CharmBase` instance that manages this + instance of the Prometheus service. + relation_name: an optional string name of the relation between `charm` + and the Prometheus charmed service. The default is "metrics-endpoint". + It is strongly advised not to change the default, so that people + deploying your charm will have a consistent experience with all + other charms that consume metrics endpoints. + + Raises: + RelationNotFoundError: If there is no relation in the charm's metadata.yaml + with the same name as provided via `relation_name` argument. + RelationInterfaceMismatchError: The relation with the same name as provided + via `relation_name` argument does not have the `prometheus_scrape` relation + interface. + RelationRoleMismatchError: If the relation with the same name as provided + via `relation_name` argument does not have the `RelationRole.requires` + role. + """ + _validate_relation_by_interface_and_direction( + charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.requires + ) + + super().__init__(charm, relation_name) + self._charm = charm + self._relation_name = relation_name + self._tool = CosTool(self._charm) + events = self._charm.on[relation_name] + self.framework.observe(events.relation_changed, self._on_metrics_provider_relation_changed) + self.framework.observe( + events.relation_departed, self._on_metrics_provider_relation_departed + ) + + def _on_metrics_provider_relation_changed(self, event): + """Handle changes with related metrics providers. + + Anytime there are changes in relations between Prometheus + and metrics provider charms the Prometheus charm is informed, + through a `TargetsChangedEvent` event. The Prometheus charm can + then choose to update its scrape configuration. + + Args: + event: a `CharmEvent` in response to which the Prometheus + charm must update its scrape configuration. + """ + rel_id = event.relation.id + + self.on.targets_changed.emit(relation_id=rel_id) + + def _on_metrics_provider_relation_departed(self, event): + """Update job config when a metrics provider departs. + + When a metrics provider departs the Prometheus charm is informed + through a `TargetsChangedEvent` event so that it can update its + scrape configuration to ensure that the departed metrics provider + is removed from the list of scrape jobs and + + Args: + event: a `CharmEvent` that indicates a metrics provider + unit has departed. 
+ """ + rel_id = event.relation.id + self.on.targets_changed.emit(relation_id=rel_id) + + def jobs(self) -> list: + """Fetch the list of scrape jobs. + + Returns: + A list consisting of all the static scrape configurations + for each related `MetricsEndpointProvider` that has specified + its scrape targets. + """ + scrape_jobs = [] + + for relation in self._charm.model.relations[self._relation_name]: + static_scrape_jobs = self._static_scrape_config(relation) + if static_scrape_jobs: + # Duplicate job names will cause validate_scrape_jobs to fail. + # Therefore we need to dedupe here and after all jobs are collected. + static_scrape_jobs = _dedupe_job_names(static_scrape_jobs) + try: + self._tool.validate_scrape_jobs(static_scrape_jobs) + except subprocess.CalledProcessError as e: + if self._charm.unit.is_leader(): + data = json.loads(relation.data[self._charm.app].get("event", "{}")) + data["scrape_job_errors"] = str(e) + relation.data[self._charm.app]["event"] = json.dumps(data) + else: + scrape_jobs.extend(static_scrape_jobs) + + scrape_jobs = _dedupe_job_names(scrape_jobs) + + return scrape_jobs + + @property + def alerts(self) -> dict: + """Fetch alerts for all relations. + + A Prometheus alert rules file consists of a list of "groups". Each + group consists of a list of alerts (`rules`) that are sequentially + executed. This method returns all the alert rules provided by each + related metrics provider charm. These rules may be used to generate a + separate alert rules file for each relation since the returned list + of alert groups are indexed by that relations Juju topology identifier. + The Juju topology identifier string includes substrings that identify + alert rule related metadata such as the Juju model, model UUID and the + application name from where the alert rule originates. Since this + topology identifier is globally unique, it may be used for instance as + the name for the file into which the list of alert rule groups are + written. For each relation, the structure of data returned is a dictionary + representation of a standard prometheus rules file: + + {"groups": [{"name": ...}, ...]} + + per official prometheus documentation + https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ + + The value of the `groups` key is such that it may be used to generate + a Prometheus alert rules file directly using `yaml.dump` but the + `groups` key itself must be included as this is required by Prometheus. + + For example the list of alert rule groups returned by this method may + be written into files consumed by Prometheus as follows + + ``` + for topology_identifier, alert_rule_groups in self.metrics_consumer.alerts().items(): + filename = "juju_" + topology_identifier + ".rules" + path = os.path.join(PROMETHEUS_RULES_DIR, filename) + rules = yaml.safe_dump(alert_rule_groups) + container.push(path, rules, make_dirs=True) + ``` + + Returns: + A dictionary mapping the Juju topology identifier of the source charm to + its list of alert rule groups. 
+ """ + alerts = {} # type: Dict[str, dict] # mapping b/w juju identifiers and alert rule files + for relation in self._charm.model.relations[self._relation_name]: + if not relation.units or not relation.app: + continue + + alert_rules = json.loads(relation.data[relation.app].get("alert_rules", "{}")) + if not alert_rules: + continue + + alert_rules = self._inject_alert_expr_labels(alert_rules) + + identifier, topology = self._get_identifier_by_alert_rules(alert_rules) + if not topology: + try: + scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"]) + identifier = JujuTopology.from_dict(scrape_metadata).identifier + + except KeyError as e: + logger.debug( + "Relation %s has no 'scrape_metadata': %s", + relation.id, + e, + ) + + if not identifier: + logger.error( + "Alert rules were found but no usable group or identifier was present." + ) + continue + + # We need to append the relation info to the identifier. This is to allow for cases for there are two + # relations which eventually scrape the same application. Issue #551. + identifier = f"{identifier}_{relation.name}_{relation.id}" + + alerts[identifier] = alert_rules + + _, errmsg = self._tool.validate_alert_rules(alert_rules) + if errmsg: + if alerts[identifier]: + del alerts[identifier] + if self._charm.unit.is_leader(): + data = json.loads(relation.data[self._charm.app].get("event", "{}")) + data["errors"] = errmsg + relation.data[self._charm.app]["event"] = json.dumps(data) + continue + + return alerts + + def _get_identifier_by_alert_rules( + self, rules: dict + ) -> Tuple[Union[str, None], Union[JujuTopology, None]]: + """Determine an appropriate dict key for alert rules. + + The key is used as the filename when writing alerts to disk, so the structure + and uniqueness is important. + + Args: + rules: a dict of alert rules + Returns: + A tuple containing an identifier, if found, and a JujuTopology, if it could + be constructed. + """ + if "groups" not in rules: + logger.debug("No alert groups were found in relation data") + return None, None + + # Construct an ID based on what's in the alert rules if they have labels + for group in rules["groups"]: + try: + labels = group["rules"][0]["labels"] + topology = JujuTopology( + # Don't try to safely get required constructor fields. There's already + # a handler for KeyErrors + model_uuid=labels["juju_model_uuid"], + model=labels["juju_model"], + application=labels["juju_application"], + unit=labels.get("juju_unit", ""), + charm_name=labels.get("juju_charm", ""), + ) + return topology.identifier, topology + except KeyError: + logger.debug("Alert rules were found but no usable labels were present") + continue + + logger.warning( + "No labeled alert rules were found, and no 'scrape_metadata' " + "was available. Using the alert group name as filename." + ) + try: + for group in rules["groups"]: + return group["name"], None + except KeyError: + logger.debug("No group name was found to use as identifier") + + return None, None + + def _inject_alert_expr_labels(self, rules: Dict[str, Any]) -> Dict[str, Any]: + """Iterate through alert rules and inject topology into expressions. 
+ + Args: + rules: a dict of alert rules + """ + if "groups" not in rules: + return rules + + modified_groups = [] + for group in rules["groups"]: + # Copy off rules, so we don't modify an object we're iterating over + rules_copy = group["rules"] + for idx, rule in enumerate(rules_copy): + labels = rule.get("labels") + + if labels: + try: + topology = JujuTopology( + # Don't try to safely get required constructor fields. There's already + # a handler for KeyErrors + model_uuid=labels["juju_model_uuid"], + model=labels["juju_model"], + application=labels["juju_application"], + unit=labels.get("juju_unit", ""), + charm_name=labels.get("juju_charm", ""), + ) + + # Inject topology and put it back in the list + rule["expr"] = self._tool.inject_label_matchers( + re.sub(r"%%juju_topology%%,?", "", rule["expr"]), + topology.alert_expression_dict, + ) + except KeyError: + # Some required JujuTopology key is missing. Just move on. + pass + + group["rules"][idx] = rule + + modified_groups.append(group) + + rules["groups"] = modified_groups + return rules + + def _static_scrape_config(self, relation) -> list: + """Generate the static scrape configuration for a single relation. + + If the relation data includes `scrape_metadata` then the value + of this key is used to annotate the scrape jobs with Juju + Topology labels before returning them. + + Args: + relation: an `ops.model.Relation` object whose static + scrape configuration is required. + + Returns: + A list (possibly empty) of scrape jobs. Each job is a + valid Prometheus scrape configuration for that job, + represented as a Python dictionary. + """ + if not relation.units: + return [] + + scrape_configs = json.loads(relation.data[relation.app].get("scrape_jobs", "[]")) + + if not scrape_configs: + return [] + + scrape_metadata = json.loads(relation.data[relation.app].get("scrape_metadata", "{}")) + + if not scrape_metadata: + return scrape_configs + + topology = JujuTopology.from_dict(scrape_metadata) + + job_name_prefix = "juju_{}_prometheus_scrape".format(topology.identifier) + scrape_configs = PrometheusConfig.prefix_job_names(scrape_configs, job_name_prefix) + scrape_configs = PrometheusConfig.sanitize_scrape_configs(scrape_configs) + + hosts = self._relation_hosts(relation) + + scrape_configs = PrometheusConfig.expand_wildcard_targets_into_individual_jobs( + scrape_configs, hosts, topology + ) + + # For https scrape targets we still do not render a `tls_config` section because certs + # are expected to be made available by the charm via the `update-ca-certificates` mechanism. + return scrape_configs + + def _relation_hosts(self, relation: Relation) -> Dict[str, Tuple[str, str]]: + """Returns a mapping from unit names to (address, path) tuples, for the given relation.""" + hosts = {} + for unit in relation.units: + # TODO deprecate and remove unit.name + unit_name = relation.data[unit].get("prometheus_scrape_unit_name") or unit.name + # TODO deprecate and remove "prometheus_scrape_host" + unit_address = relation.data[unit].get( + "prometheus_scrape_unit_address" + ) or relation.data[unit].get("prometheus_scrape_host") + unit_path = relation.data[unit].get("prometheus_scrape_unit_path", "") + if unit_name and unit_address: + hosts.update({unit_name: (unit_address, unit_path)}) + + return hosts + + def _target_parts(self, target) -> list: + """Extract host and port from a wildcard target. + + Args: + target: a string specifying a scrape target. A + scrape target is expected to have the format + "host:port". 
The host part may be a wildcard
+ "*" and the port part can be missing (along
+ with ":") in which case port is set to 80.
+
+ Returns:
+ a list with target host and port as in [host, port]
+ """
+ if ":" in target:
+ parts = target.split(":")
+ else:
+ parts = [target, "80"]
+
+ return parts
+
+
+def _dedupe_job_names(jobs: List[dict]):
+ """Deduplicate a list of dicts by appending a hash to the value of the 'job_name' key.
+
+ Additionally, fully de-duplicate any identical jobs.
+
+ Args:
+ jobs: A list of prometheus scrape jobs
+ """
+ jobs_copy = copy.deepcopy(jobs)
+
+ # Convert to a dict with job names as keys
+ # I think this line is O(n^2) but it should be okay given the list sizes
+ jobs_dict = {
+ job["job_name"]: list(filter(lambda x: x["job_name"] == job["job_name"], jobs_copy))
+ for job in jobs_copy
+ }
+
+ # If multiple jobs have the same name, convert the name to "name_<hash>"
+ for key in jobs_dict:
+ if len(jobs_dict[key]) > 1:
+ for job in jobs_dict[key]:
+ job_json = json.dumps(job)
+ hashed = hashlib.sha256(job_json.encode()).hexdigest()
+ job["job_name"] = "{}_{}".format(job["job_name"], hashed)
+ new_jobs = []
+ for key in jobs_dict:
+ new_jobs.extend(list(jobs_dict[key]))
+
+ # Deduplicate jobs which are equal
+ # Again this is O(n^2) but it should be okay
+ deduped_jobs = []
+ seen = []
+ for job in new_jobs:
+ job_json = json.dumps(job)
+ hashed = hashlib.sha256(job_json.encode()).hexdigest()
+ if hashed in seen:
+ continue
+ seen.append(hashed)
+ deduped_jobs.append(job)
+
+ return deduped_jobs
+
+
+def _resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> str:
+ """Resolve the provided path items against the directory of the main file.
+
+ Look up the directory of the `main.py` file being executed. This is normally
+ going to be the charm.py file of the charm including this library. Then, resolve
+ the provided path elements and, if the resulting path exists and is a directory,
+ return its absolute path; otherwise, raise an exception.
+
+ Raises:
+ InvalidAlertRulePathError, if the path does not exist or is not a directory.
+ """
+ charm_dir = Path(str(charm.charm_dir))
+ if not charm_dir.exists() or not charm_dir.is_dir():
+ # Operator Framework does not currently expose a robust
+ # way to determine the top level charm source directory
+ # that is consistent across deployed charms and unit tests
+ # Hence for unit tests the current working directory is used
+ # TODO: update this logic when the following ticket is resolved
+ # https://github.com/canonical/operator/issues/643
+ charm_dir = Path(os.getcwd())
+
+ alerts_dir_path = charm_dir.absolute().joinpath(*path_elements)
+
+ if not alerts_dir_path.exists():
+ raise InvalidAlertRulePathError(alerts_dir_path, "directory does not exist")
+ if not alerts_dir_path.is_dir():
+ raise InvalidAlertRulePathError(alerts_dir_path, "is not a directory")
+
+ return str(alerts_dir_path)
+
+
+class MetricsEndpointProvider(Object):
+ """A metrics endpoint for Prometheus."""
+
+ on = MetricsEndpointProviderEvents() # pyright: ignore
+
+ def __init__(
+ self,
+ charm,
+ relation_name: str = DEFAULT_RELATION_NAME,
+ jobs=None,
+ alert_rules_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH,
+ refresh_event: Optional[Union[BoundEvent, List[BoundEvent]]] = None,
+ external_url: str = "",
+ lookaside_jobs_callable: Optional[Callable] = None,
+ ):
+ """Construct a metrics provider for a Prometheus charm.
+
+ If your charm exposes a Prometheus metrics endpoint, the
+ `MetricsEndpointProvider` object enables your charm to easily
+ communicate how to reach that metrics endpoint.
+
+ By default, a charm instantiating this object has the metrics
+ endpoints of each of its units scraped by the related Prometheus
+ charms. The scraped metrics are automatically tagged by the
+ Prometheus charms with Juju topology data via the
+ `juju_model_name`, `juju_model_uuid`, `juju_application_name`
+ and `juju_unit` labels. To support such tagging, `MetricsEndpointProvider`
+ automatically forwards scrape metadata to a `MetricsEndpointConsumer`
+ (Prometheus charm).
+
+ Scrape targets provided by `MetricsEndpointProvider` can be
+ customized when instantiating this object. For example, in the
+ case of a charm exposing the metrics endpoint for each of its
+ units on port 8080 and the `/metrics` path, the
+ `MetricsEndpointProvider` can be instantiated as follows:
+
+ self.metrics_endpoint_provider = MetricsEndpointProvider(
+ self,
+ jobs=[{
+ "static_configs": [{"targets": ["*:8080"]}],
+ }])
+
+ The notation `*:<port>` means "scrape each unit of this charm on port
+ `<port>`".
+
+ In case the metrics endpoints are not on the standard `/metrics` path,
+ a custom path can be specified as follows:
+
+ self.metrics_endpoint_provider = MetricsEndpointProvider(
+ self,
+ jobs=[{
+ "metrics_path": "/my/strange/metrics/path",
+ "static_configs": [{"targets": ["*:8080"]}],
+ }])
+
+ Note how the `jobs` argument is a list: this allows you to expose multiple
+ combinations of paths "metrics_path" and "static_configs" in case your charm
+ exposes multiple endpoints, which could happen, for example, when you have
+ multiple workload containers, with applications in each needing to be scraped.
+ The structure of the objects in the `jobs` list is one-to-one with the
+ `scrape_config` configuration item of Prometheus' own configuration (see
+ https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config
+ ), but with only a subset of the fields allowed. The permitted fields are
+ listed in the `ALLOWED_KEYS` object in this charm library module.
+
+ It is also possible to specify alert rules. By default, this library will look
+ into the `<charm_parent_dir>/prometheus_alert_rules`, which in a standard charm
+ layout resolves to `src/prometheus_alert_rules`. Each alert rule goes into a
+ separate `*.rule` file. If the syntax of a rule is invalid,
+ the `MetricsEndpointProvider` logs an error and does not load the particular
+ rule.
+
+ To avoid false positives and negatives in the evaluation of alert rules,
+ all ingested alert rule expressions are automatically qualified using Juju
+ Topology filters. This ensures that alert rules provided by your charm trigger
+ alerts based only on data scraped from your charm. For example, an alert rule
+ such as the following
+
+ alert: UnitUnavailable
+ expr: up < 1
+ for: 0m
+
+ will be automatically transformed into something along the lines of the following
+
+ alert: UnitUnavailable
+ expr: up{juju_model=<model>, juju_model_uuid=<model_uuid>, juju_application=<application>} < 1
+ for: 0m
+
+ An attempt will be made to validate alert rules prior to loading them into Prometheus.
+ If they are invalid, an event will be emitted from this object which charms can respond
+ to in order to set a meaningful status for administrators.
+
+        This can be observed via `consumer.on.alert_rule_status_changed` which contains:
+            - The error(s) encountered when validating as `errors`
+            - A `valid` attribute, which can be used to reset the state of charms if alert rules
+              are updated via another mechanism (e.g. `cos-config`) and refreshed.
+
+        Args:
+            charm: a `CharmBase` object that manages this
+                `MetricsEndpointProvider` object. Typically, this is
+                `self` in the instantiating class.
+            relation_name: an optional string name of the relation between `charm`
+                and the Prometheus charmed service. The default is "metrics-endpoint".
+                It is strongly advised not to change the default, so that people
+                deploying your charm will have a consistent experience with all
+                other charms that provide metrics endpoints.
+            jobs: an optional list of dictionaries where each
+                dictionary represents the Prometheus scrape
+                configuration for a single job. When not provided, a
+                default scrape configuration is provided for the
+                `/metrics` endpoint polling all units of the charm on port `80`
+                using the `MetricsEndpointProvider` object.
+            alert_rules_path: an optional path for the location of alert rules
+                files. Defaults to "./prometheus_alert_rules",
+                resolved relative to the directory hosting the charm entry file.
+                The alert rules are automatically updated on charm upgrade.
+            refresh_event: an optional bound event or list of bound events which
+                will be observed to re-set scrape job data (IP address and others).
+            external_url: an optional argument that represents an external URL that
+                can be generated by an Ingress or a Proxy.
+            lookaside_jobs_callable: an optional `Callable` which should be invoked
+                when the job configuration is built as a secondary mapping. The callable
+                should return a `List[Dict]` which is syntactically identical to the
+                `jobs` parameter, but can be updated out of step with the initialization
+                of this library without disrupting the 'global' job spec.
+
+        Raises:
+            RelationNotFoundError: If there is no relation in the charm's metadata.yaml
+                with the same name as provided via the `relation_name` argument.
+            RelationInterfaceMismatchError: If the relation with the same name as provided
+                via the `relation_name` argument does not have the `prometheus_scrape`
+                relation interface.
+            RelationRoleMismatchError: If the relation with the same name as provided
+                via the `relation_name` argument does not have the `RelationRole.provides`
+                role.
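+
+        Example:
+            As a rough sketch, a charm could wire up a customized provider as
+            below; the port, refresh event and callable shown here are
+            illustrative assumptions, not defaults of this library:
+
+                self.metrics_endpoint = MetricsEndpointProvider(
+                    self,
+                    jobs=[{"static_configs": [{"targets": ["*:8080"]}]}],
+                    refresh_event=[self.on.config_changed],
+                    lookaside_jobs_callable=self._extra_scrape_jobs,
+                )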
+ """ + _validate_relation_by_interface_and_direction( + charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.provides + ) + + try: + alert_rules_path = _resolve_dir_against_charm_path(charm, alert_rules_path) + except InvalidAlertRulePathError as e: + logger.debug( + "Invalid Prometheus alert rules folder at %s: %s", + e.alert_rules_absolute_path, + e.message, + ) + + super().__init__(charm, relation_name) + self.topology = JujuTopology.from_charm(charm) + + self._charm = charm + self._alert_rules_path = alert_rules_path + self._relation_name = relation_name + # sanitize job configurations to the supported subset of parameters + jobs = [] if jobs is None else jobs + self._jobs = PrometheusConfig.sanitize_scrape_configs(jobs) + + if external_url: + external_url = ( + external_url if urlparse(external_url).scheme else ("http://" + external_url) + ) + self.external_url = external_url + self._lookaside_jobs = lookaside_jobs_callable + + events = self._charm.on[self._relation_name] + self.framework.observe(events.relation_changed, self._on_relation_changed) + + if not refresh_event: + # FIXME remove once podspec charms are verified. + # `self.set_scrape_job_spec()` is called every re-init so this should not be needed. + if len(self._charm.meta.containers) == 1: + if "kubernetes" in self._charm.meta.series: + # This is a podspec charm + refresh_event = [self._charm.on.update_status] + else: + # This is a sidecar/pebble charm + container = list(self._charm.meta.containers.values())[0] + refresh_event = [self._charm.on[container.name.replace("-", "_")].pebble_ready] + else: + logger.warning( + "%d containers are present in metadata.yaml and " + "refresh_event was not specified. Defaulting to update_status. " + "Metrics IP may not be set in a timely fashion.", + len(self._charm.meta.containers), + ) + refresh_event = [self._charm.on.update_status] + + else: + if not isinstance(refresh_event, list): + refresh_event = [refresh_event] + + self.framework.observe(events.relation_joined, self.set_scrape_job_spec) + for ev in refresh_event: + self.framework.observe(ev, self.set_scrape_job_spec) + + def _on_relation_changed(self, event): + """Check for alert rule messages in the relation data before moving on.""" + if self._charm.unit.is_leader(): + ev = json.loads(event.relation.data[event.app].get("event", "{}")) + + if ev: + valid = bool(ev.get("valid", True)) + errors = ev.get("errors", "") + + if valid and not errors: + self.on.alert_rule_status_changed.emit(valid=valid) + else: + self.on.alert_rule_status_changed.emit(valid=valid, errors=errors) + + scrape_errors = ev.get("scrape_job_errors", None) + if scrape_errors: + self.on.invalid_scrape_job.emit(errors=scrape_errors) + + def update_scrape_job_spec(self, jobs): + """Update scrape job specification.""" + self._jobs = PrometheusConfig.sanitize_scrape_configs(jobs) + self.set_scrape_job_spec() + + def set_scrape_job_spec(self, _=None): + """Ensure scrape target information is made available to prometheus. + + When a metrics provider charm is related to a prometheus charm, the + metrics provider sets specification and metadata related to its own + scrape configuration. This information is set using Juju application + data. In addition, each of the consumer units also sets its own + host address in Juju unit relation data. 
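+
+        As a rough sketch (values abridged), after this method runs the
+        application databag of each relation holds three JSON-serialized
+        entries:
+
+            scrape_metadata: '{"model": ..., "application": ...}'
+            scrape_jobs: '[{"metrics_path": "/metrics", ...}]'
+            alert_rules: '{"groups": [...]}'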
+ """ + self._set_unit_ip() + + if not self._charm.unit.is_leader(): + return + + alert_rules = AlertRules(query_type="promql", topology=self.topology) + alert_rules.add_path(self._alert_rules_path, recursive=True) + alert_rules_as_dict = alert_rules.as_dict() + + for relation in self._charm.model.relations[self._relation_name]: + relation.data[self._charm.app]["scrape_metadata"] = json.dumps(self._scrape_metadata) + relation.data[self._charm.app]["scrape_jobs"] = json.dumps(self._scrape_jobs) + + # Update relation data with the string representation of the rule file. + # Juju topology is already included in the "scrape_metadata" field above. + # The consumer side of the relation uses this information to name the rules file + # that is written to the filesystem. + relation.data[self._charm.app]["alert_rules"] = json.dumps(alert_rules_as_dict) + + def _set_unit_ip(self, _=None): + """Set unit host address. + + Each time a metrics provider charm container is restarted it updates its own + host address in the unit relation data for the prometheus charm. + + The only argument specified is an event, and it ignored. This is for expediency + to be able to use this method as an event handler, although no access to the + event is actually needed. + """ + for relation in self._charm.model.relations[self._relation_name]: + unit_ip = str(self._charm.model.get_binding(relation).network.bind_address) + + # TODO store entire url in relation data, instead of only select url parts. + + if self.external_url: + parsed = urlparse(self.external_url) + unit_address = parsed.hostname + path = parsed.path + elif self._is_valid_unit_address(unit_ip): + unit_address = unit_ip + path = "" + else: + unit_address = socket.getfqdn() + path = "" + + relation.data[self._charm.unit]["prometheus_scrape_unit_address"] = unit_address + relation.data[self._charm.unit]["prometheus_scrape_unit_path"] = path + relation.data[self._charm.unit]["prometheus_scrape_unit_name"] = str( + self._charm.model.unit.name + ) + + def _is_valid_unit_address(self, address: str) -> bool: + """Validate a unit address. + + At present only IP address validation is supported, but + this may be extended to DNS addresses also, as needed. + + Args: + address: a string representing a unit address + """ + try: + _ = ipaddress.ip_address(address) + except ValueError: + return False + + return True + + @property + def _scrape_jobs(self) -> list: + """Fetch list of scrape jobs. + + Returns: + A list of dictionaries, where each dictionary specifies a + single scrape job for Prometheus. + """ + jobs = self._jobs or [] + if callable(self._lookaside_jobs): + jobs.extend(PrometheusConfig.sanitize_scrape_configs(self._lookaside_jobs())) + return jobs or [DEFAULT_JOB] + + @property + def _scrape_metadata(self) -> dict: + """Generate scrape metadata. + + Returns: + Scrape configuration metadata for this metrics provider charm. + """ + return self.topology.as_dict() + + +class PrometheusRulesProvider(Object): + """Forward rules to Prometheus. + + This object may be used to forward rules to Prometheus. At present it only supports + forwarding alert rules. This is unlike :class:`MetricsEndpointProvider`, which + is used for forwarding both scrape targets and associated alert rules. This object + is typically used when there is a desire to forward rules that apply globally (across + all deployed charms and units) rather than to a single charm. All rule files are + forwarded using the same 'prometheus_scrape' interface that is also used by + `MetricsEndpointProvider`. 
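+
+    A minimal usage sketch (the relation name and directory below are this
+    library's defaults; adjust them to your charm):
+
+        self.rules_provider = PrometheusRulesProvider(
+            self,
+            relation_name="metrics-endpoint",
+            dir_path="./prometheus_alert_rules",
+            recursive=True,
+        )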
+ + Args: + charm: A charm instance that `provides` a relation with the `prometheus_scrape` interface. + relation_name: Name of the relation in `metadata.yaml` that + has the `prometheus_scrape` interface. + dir_path: Root directory for the collection of rule files. + recursive: Whether to scan for rule files recursively. + """ + + def __init__( + self, + charm: CharmBase, + relation_name: str = DEFAULT_RELATION_NAME, + dir_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH, + recursive=True, + ): + super().__init__(charm, relation_name) + self._charm = charm + self._relation_name = relation_name + self._recursive = recursive + + try: + dir_path = _resolve_dir_against_charm_path(charm, dir_path) + except InvalidAlertRulePathError as e: + logger.debug( + "Invalid Prometheus alert rules folder at %s: %s", + e.alert_rules_absolute_path, + e.message, + ) + self.dir_path = dir_path + + events = self._charm.on[self._relation_name] + event_sources = [ + events.relation_joined, + events.relation_changed, + self._charm.on.leader_elected, + self._charm.on.upgrade_charm, + ] + + for event_source in event_sources: + self.framework.observe(event_source, self._update_relation_data) + + def _reinitialize_alert_rules(self): + """Reloads alert rules and updates all relations.""" + self._update_relation_data(None) + + def _update_relation_data(self, _): + """Update application relation data with alert rules for all relations.""" + if not self._charm.unit.is_leader(): + return + + alert_rules = AlertRules(query_type="promql") + alert_rules.add_path(self.dir_path, recursive=self._recursive) + alert_rules_as_dict = alert_rules.as_dict() + + logger.info("Updating relation data with rule files from disk") + for relation in self._charm.model.relations[self._relation_name]: + relation.data[self._charm.app]["alert_rules"] = json.dumps( + alert_rules_as_dict, + sort_keys=True, # sort, to prevent unnecessary relation_changed events + ) + + +class MetricsEndpointAggregator(Object): + """Aggregate metrics from multiple scrape targets. + + `MetricsEndpointAggregator` collects scrape target information from one + or more related charms and forwards this to a `MetricsEndpointConsumer` + charm, which may be in a different Juju model. However, it is + essential that `MetricsEndpointAggregator` itself resides in the same + model as its scrape targets, as this is currently the only way to + ensure in Juju that the `MetricsEndpointAggregator` will be able to + determine the model name and uuid of the scrape targets. + + `MetricsEndpointAggregator` should be used in place of + `MetricsEndpointProvider` in the following two use cases: + + 1. Integrating one or more scrape targets that do not support the + `prometheus_scrape` interface. + + 2. Integrating one or more scrape targets through cross model + relations. Although the [Scrape Config Operator](https://charmhub.io/cos-configuration-k8s) + may also be used for the purpose of supporting cross model + relations. + + Using `MetricsEndpointAggregator` to build a Prometheus charm client + only requires instantiating it. Instantiating + `MetricsEndpointAggregator` is similar to `MetricsEndpointProvider` except + that it requires specifying the names of three relations: the + relation with scrape targets, the relation for alert rules, and + that with the Prometheus charms. 
For example + + ```python + self._aggregator = MetricsEndpointAggregator( + self, + { + "prometheus": "monitoring", + "scrape_target": "prometheus-target", + "alert_rules": "prometheus-rules" + } + ) + ``` + + `MetricsEndpointAggregator` assumes that each unit of a scrape target + sets in its unit-level relation data two entries with keys + "hostname" and "port". If it is required to integrate with charms + that do not honor these assumptions, it is always possible to + derive from `MetricsEndpointAggregator` overriding the `_get_targets()` + method, which is responsible for aggregating the unit name, host + address ("hostname") and port of the scrape target. + `MetricsEndpointAggregator` also assumes that each unit of a + scrape target sets in its unit-level relation data a key named + "groups". The value of this key is expected to be the string + representation of list of Prometheus Alert rules in YAML format. + An example of a single such alert rule is + + ```yaml + - alert: HighRequestLatency + expr: job:request_latency_seconds:mean5m{job="myjob"} > 0.5 + for: 10m + labels: + severity: page + annotations: + summary: High request latency + ``` + + Once again if it is required to integrate with charms that do not + honour these assumptions about alert rules then an object derived + from `MetricsEndpointAggregator` may be used by overriding the + `_get_alert_rules()` method. + + `MetricsEndpointAggregator` ensures that Prometheus scrape job + specifications and alert rules are annotated with Juju topology + information, just like `MetricsEndpointProvider` and + `MetricsEndpointConsumer` do. + + By default, `MetricsEndpointAggregator` ensures that Prometheus + "instance" labels refer to Juju topology. This ensures that + instance labels are stable over unit recreation. While it is not + advisable to change this option, if required it can be done by + setting the "relabel_instance" keyword argument to `False` when + constructing an aggregator object. + """ + + _stored = StoredState() + + def __init__( + self, + charm, + relation_names: Optional[dict] = None, + relabel_instance=True, + resolve_addresses=False, + ): + """Construct a `MetricsEndpointAggregator`. + + Args: + charm: a `CharmBase` object that manages this + `MetricsEndpointAggregator` object. Typically, this is + `self` in the instantiating class. + relation_names: a dictionary with three keys. The value + of the "scrape_target" and "alert_rules" keys are + the relation names over which scrape job and alert rule + information is gathered by this `MetricsEndpointAggregator`. + And the value of the "prometheus" key is the name of + the relation with a `MetricsEndpointConsumer` such as + the Prometheus charm. + relabel_instance: A boolean flag indicating if Prometheus + scrape job "instance" labels must refer to Juju Topology. 
+            resolve_addresses: A boolean flag indicating if the aggregator
+                should attempt to perform DNS lookups of targets and append
+                a `dns_name` label.
+        """
+        self._charm = charm
+
+        relation_names = relation_names or {}
+
+        self._prometheus_relation = relation_names.get(
+            "prometheus", "downstream-prometheus-scrape"
+        )
+        self._target_relation = relation_names.get("scrape_target", "prometheus-target")
+        self._alert_rules_relation = relation_names.get("alert_rules", "prometheus-rules")
+
+        super().__init__(charm, self._prometheus_relation)
+        self._stored.set_default(jobs=[], alert_rules=[])
+
+        self._relabel_instance = relabel_instance
+        self._resolve_addresses = resolve_addresses
+
+        # manage Prometheus charm relation events
+        prometheus_events = self._charm.on[self._prometheus_relation]
+        self.framework.observe(prometheus_events.relation_joined, self._set_prometheus_data)
+
+        # manage list of Prometheus scrape jobs from related scrape targets
+        target_events = self._charm.on[self._target_relation]
+        self.framework.observe(target_events.relation_changed, self._on_prometheus_targets_changed)
+        self.framework.observe(
+            target_events.relation_departed, self._on_prometheus_targets_departed
+        )
+
+        # manage alert rules for Prometheus from related scrape targets
+        alert_rule_events = self._charm.on[self._alert_rules_relation]
+        self.framework.observe(alert_rule_events.relation_changed, self._on_alert_rules_changed)
+        self.framework.observe(alert_rule_events.relation_departed, self._on_alert_rules_departed)
+
+    def _set_prometheus_data(self, event):
+        """Ensure every new Prometheus instance is updated.
+
+        Any time a new Prometheus unit joins the relation with
+        `MetricsEndpointAggregator`, that Prometheus unit is provided
+        with the complete set of existing scrape jobs and alert rules.
+        """
+        if not self._charm.unit.is_leader():
+            return
+
+        jobs = [] + _type_convert_stored(
+            self._stored.jobs  # pyright: ignore
+        )  # list of scrape jobs, one per relation
+        for relation in self.model.relations[self._target_relation]:
+            targets = self._get_targets(relation)
+            if targets and relation.app:
+                jobs.append(self._static_scrape_job(targets, relation.app.name))
+
+        groups = [] + _type_convert_stored(
+            self._stored.alert_rules  # pyright: ignore
+        )  # list of alert rule groups
+        for relation in self.model.relations[self._alert_rules_relation]:
+            unit_rules = self._get_alert_rules(relation)
+            if unit_rules and relation.app:
+                appname = relation.app.name
+                rules = self._label_alert_rules(unit_rules, appname)
+                group = {"name": self.group_name(appname), "rules": rules}
+                groups.append(group)
+
+        event.relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs)
+        event.relation.data[self._charm.app]["alert_rules"] = json.dumps({"groups": groups})
+
+    def _on_prometheus_targets_changed(self, event):
+        """Update scrape jobs in response to scrape target changes.
+
+        When there is any change in relation data with any scrape
+        target, the Prometheus scrape job for that specific target is
+        updated.
+        """
+        targets = self._get_targets(event.relation)
+        if not targets:
+            return
+
+        # new scrape job for the relation that has changed
+        self.set_target_job_data(targets, event.relation.app.name)
+
+    def set_target_job_data(self, targets: dict, app_name: str, **kwargs) -> None:
+        """Update scrape jobs in response to scrape target changes.
+
+        When there is any change in relation data with any scrape
+        target, the Prometheus scrape job for that specific target is
+        updated. 
Additionally, if this method is called manually, do the + same. + + Args: + targets: a `dict` containing target information + app_name: a `str` identifying the application + kwargs: a `dict` of the extra arguments passed to the function + """ + if not self._charm.unit.is_leader(): + return + + # new scrape job for the relation that has changed + updated_job = self._static_scrape_job(targets, app_name, **kwargs) + + for relation in self.model.relations[self._prometheus_relation]: + jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]")) + # list of scrape jobs that have not changed + jobs = [job for job in jobs if updated_job["job_name"] != job["job_name"]] + jobs.append(updated_job) + relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) + + if not _type_convert_stored(self._stored.jobs) == jobs: # pyright: ignore + self._stored.jobs = jobs + + def _on_prometheus_targets_departed(self, event): + """Remove scrape jobs when a target departs. + + Any time a scrape target departs, any Prometheus scrape job + associated with that specific scrape target is removed. + """ + job_name = self._job_name(event.relation.app.name) + unit_name = event.unit.name + self.remove_prometheus_jobs(job_name, unit_name) + + def remove_prometheus_jobs(self, job_name: str, unit_name: Optional[str] = ""): + """Given a job name and unit name, remove scrape jobs associated. + + The `unit_name` parameter is used for automatic, relation data bag-based + generation, where the unit name in labels can be used to ensure that jobs with + similar names (which are generated via the app name when scanning relation data + bags) are not accidentally removed, as their unit name labels will differ. + For NRPE, the job name is calculated from an ID sent via the NRPE relation, and is + sufficient to uniquely identify the target. + """ + if not self._charm.unit.is_leader(): + return + + for relation in self.model.relations[self._prometheus_relation]: + jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]")) + if not jobs: + continue + + changed_job = [j for j in jobs if j.get("job_name") == job_name] + if not changed_job: + continue + changed_job = changed_job[0] + + # list of scrape jobs that have not changed + jobs = [job for job in jobs if job.get("job_name") != job_name] + + # list of scrape jobs for units of the same application that still exist + configs_kept = [ + config + for config in changed_job["static_configs"] # type: ignore + if config.get("labels", {}).get("juju_unit") != unit_name + ] + + if configs_kept: + changed_job["static_configs"] = configs_kept # type: ignore + jobs.append(changed_job) + + relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) + + if not _type_convert_stored(self._stored.jobs) == jobs: # pyright: ignore + self._stored.jobs = jobs + + def _job_name(self, appname) -> str: + """Construct a scrape job name. + + Each relation has its own unique scrape job name. All units in + the relation are scraped as part of the same scrape job. + + Args: + appname: string name of a related application. + + Returns: + a string Prometheus scrape job name for the application. + """ + return "juju_{}_{}_{}_prometheus_scrape".format( + self.model.name, self.model.uuid[:7], appname + ) + + def _get_targets(self, relation) -> dict: + """Fetch scrape targets for a relation. + + Scrape target information is returned for each unit in the + relation. 
This information contains the unit name, network + hostname (or address) for that unit, and port on which a + metrics endpoint is exposed in that unit. + + Args: + relation: an `ops.model.Relation` object for which scrape + targets are required. + + Returns: + a dictionary whose keys are names of the units in the + relation. There values associated with each key is itself + a dictionary of the form + ``` + {"hostname": hostname, "port": port} + ``` + """ + targets = {} + for unit in relation.units: + port = relation.data[unit].get("port", 80) + hostname = relation.data[unit].get("hostname") + if hostname: + targets.update({unit.name: {"hostname": hostname, "port": port}}) + + return targets + + def _static_scrape_job(self, targets, application_name, **kwargs) -> dict: + """Construct a static scrape job for an application. + + Args: + targets: a dictionary providing hostname and port for all + scrape target. The keys of this dictionary are unit + names. Values corresponding to these keys are + themselves a dictionary with keys "hostname" and + "port". + application_name: a string name of the application for + which this static scrape job is being constructed. + kwargs: a `dict` of the extra arguments passed to the function + + Returns: + A dictionary corresponding to a Prometheus static scrape + job configuration for one application. The returned + dictionary may be transformed into YAML and appended to + the list of any existing list of Prometheus static configs. + """ + juju_model = self.model.name + juju_model_uuid = self.model.uuid + + job = { + "job_name": self._job_name(application_name), + "static_configs": [ + { + "targets": ["{}:{}".format(target["hostname"], target["port"])], + "labels": { + "juju_model": juju_model, + "juju_model_uuid": juju_model_uuid, + "juju_application": application_name, + "juju_unit": unit_name, + "host": target["hostname"], + # Expanding this will merge the dicts and replace the + # topology labels if any were present/found + **self._static_config_extra_labels(target), + }, + } + for unit_name, target in targets.items() + ], + "relabel_configs": self._relabel_configs + kwargs.get("relabel_configs", []), + } + job.update(kwargs.get("updates", {})) + + return job + + def _static_config_extra_labels(self, target: Dict[str, str]) -> Dict[str, str]: + """Build a list of extra static config parameters, if specified.""" + extra_info = {} + + if self._resolve_addresses: + try: + dns_name = socket.gethostbyaddr(target["hostname"])[0] + except OSError: + logger.debug("Could not perform DNS lookup for %s", target["hostname"]) + dns_name = target["hostname"] + extra_info["dns_name"] = dns_name + + return extra_info + + @property + def _relabel_configs(self) -> list: + """Create Juju topology relabeling configuration. + + Using Juju topology for instance labels ensures that these + labels are stable across unit recreation. + + Returns: + a list of Prometheus relabeling configurations. Each item in + this list is one relabel configuration. + """ + return ( + [ + { + "source_labels": [ + "juju_model", + "juju_model_uuid", + "juju_application", + "juju_unit", + ], + "separator": "_", + "target_label": "instance", + "regex": "(.*)", + } + ] + if self._relabel_instance + else [] + ) + + def _on_alert_rules_changed(self, event): + """Update alert rules in response to scrape target changes. + + When there is any change in alert rule relation data for any + scrape target, the list of alert rules for that specific + target is updated. 
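+
+        As an illustrative sketch, the unit databag consumed here is expected
+        to carry a "groups" key whose value is a YAML string of rules, e.g.
+        {"groups": "- alert: HighRequestLatency ..."}.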
+ """ + unit_rules = self._get_alert_rules(event.relation) + if not unit_rules: + return + + app_name = event.relation.app.name + self.set_alert_rule_data(app_name, unit_rules) + + def set_alert_rule_data(self, name: str, unit_rules: dict, label_rules: bool = True) -> None: + """Update alert rule data. + + The unit rules should be a dict, which is has additional Juju topology labels added. For + rules generated by the NRPE exporter, they are pre-labeled so lookups can be performed. + """ + if not self._charm.unit.is_leader(): + return + + if label_rules: + rules = self._label_alert_rules(unit_rules, name) + else: + rules = [unit_rules] + updated_group = {"name": self.group_name(name), "rules": rules} + + for relation in self.model.relations[self._prometheus_relation]: + alert_rules = json.loads(relation.data[self._charm.app].get("alert_rules", "{}")) + groups = alert_rules.get("groups", []) + # list of alert rule groups that have not changed + for group in groups: + if group["name"] == updated_group["name"]: + group["rules"] = [r for r in group["rules"] if r not in updated_group["rules"]] + group["rules"].extend(updated_group["rules"]) + + if updated_group["name"] not in [g["name"] for g in groups]: + groups.append(updated_group) + relation.data[self._charm.app]["alert_rules"] = json.dumps({"groups": groups}) + + if not _type_convert_stored(self._stored.alert_rules) == groups: # pyright: ignore + self._stored.alert_rules = groups + + def _on_alert_rules_departed(self, event): + """Remove alert rules for departed targets. + + Any time a scrape target departs any alert rules associated + with that specific scrape target is removed. + """ + group_name = self.group_name(event.relation.app.name) + unit_name = event.unit.name + self.remove_alert_rules(group_name, unit_name) + + def remove_alert_rules(self, group_name: str, unit_name: str) -> None: + """Remove an alert rule group from relation data.""" + if not self._charm.unit.is_leader(): + return + + for relation in self.model.relations[self._prometheus_relation]: + alert_rules = json.loads(relation.data[self._charm.app].get("alert_rules", "{}")) + if not alert_rules: + continue + + groups = alert_rules.get("groups", []) + if not groups: + continue + + changed_group = [group for group in groups if group["name"] == group_name] + if not changed_group: + continue + changed_group = changed_group[0] + + # list of alert rule groups that have not changed + groups = [group for group in groups if group["name"] != group_name] + + # list of alert rules not associated with departing unit + rules_kept = [ + rule + for rule in changed_group.get("rules") # type: ignore + if rule.get("labels").get("juju_unit") != unit_name + ] + + if rules_kept: + changed_group["rules"] = rules_kept # type: ignore + groups.append(changed_group) + + relation.data[self._charm.app]["alert_rules"] = ( + json.dumps({"groups": groups}) if groups else "{}" + ) + + if not _type_convert_stored(self._stored.alert_rules) == groups: # pyright: ignore + self._stored.alert_rules = groups + + def _get_alert_rules(self, relation) -> dict: + """Fetch alert rules for a relation. + + Each unit of the related scrape target may have its own + associated alert rules. Alert rules for all units are returned + indexed by unit name. + + Args: + relation: an `ops.model.Relation` object for which alert + rules are required. + + Returns: + a dictionary whose keys are names of the units in the + relation. There values associated with each key is a list + of alert rules. Each rule is in dictionary format. 
The + structure "rule dictionary" corresponds to single + Prometheus alert rule. + """ + rules = {} + for unit in relation.units: + unit_rules = yaml.safe_load(relation.data[unit].get("groups", "")) + if unit_rules: + rules.update({unit.name: unit_rules}) + + return rules + + def group_name(self, unit_name: str) -> str: + """Construct name for an alert rule group. + + Each unit in a relation may define its own alert rules. All + rules, for all units in a relation are grouped together and + given a single alert rule group name. + + Args: + unit_name: string name of a related application. + + Returns: + a string Prometheus alert rules group name for the unit. + """ + unit_name = re.sub(r"/", "_", unit_name) + return "juju_{}_{}_{}_alert_rules".format(self.model.name, self.model.uuid[:7], unit_name) + + def _label_alert_rules(self, unit_rules, app_name: str) -> list: + """Apply juju topology labels to alert rules. + + Args: + unit_rules: a list of alert rules, where each rule is in + dictionary format. + app_name: a string name of the application to which the + alert rules belong. + + Returns: + a list of alert rules with Juju topology labels. + """ + labeled_rules = [] + for unit_name, rules in unit_rules.items(): + for rule in rules: + # the new JujuTopology removed this, so build it up by hand + matchers = { + "juju_{}".format(k): v + for k, v in JujuTopology(self.model.name, self.model.uuid, app_name, unit_name) + .as_dict(excluded_keys=["charm_name"]) + .items() + } + rule["labels"].update(matchers.items()) + labeled_rules.append(rule) + + return labeled_rules + + +class CosTool: + """Uses cos-tool to inject label matchers into alert rule expressions and validate rules.""" + + _path = None + _disabled = False + + def __init__(self, charm): + self._charm = charm + + @property + def path(self): + """Lazy lookup of the path of cos-tool.""" + if self._disabled: + return None + if not self._path: + self._path = self._get_tool_path() + if not self._path: + logger.debug("Skipping injection of juju topology as label matchers") + self._disabled = True + return self._path + + def apply_label_matchers(self, rules) -> dict: + """Will apply label matchers to the expression of all alerts in all supplied groups.""" + if not self.path: + return rules + for group in rules["groups"]: + rules_in_group = group.get("rules", []) + for rule in rules_in_group: + topology = {} + # if the user for some reason has provided juju_unit, we'll need to honor it + # in most cases, however, this will be empty + for label in [ + "juju_model", + "juju_model_uuid", + "juju_application", + "juju_charm", + "juju_unit", + ]: + if label in rule["labels"]: + topology[label] = rule["labels"][label] + + rule["expr"] = self.inject_label_matchers(rule["expr"], topology) + return rules + + def validate_alert_rules(self, rules: dict) -> Tuple[bool, str]: + """Will validate correctness of alert rules, returning a boolean and any errors.""" + if not self.path: + logger.debug("`cos-tool` unavailable. 
Not validating alert correctness.") + return True, "" + + with tempfile.TemporaryDirectory() as tmpdir: + rule_path = Path(tmpdir + "/validate_rule.yaml") + rule_path.write_text(yaml.dump(rules)) + + args = [str(self.path), "validate", str(rule_path)] + # noinspection PyBroadException + try: + self._exec(args) + return True, "" + except subprocess.CalledProcessError as e: + logger.debug("Validating the rules failed: %s", e.output) + return False, ", ".join( + [ + line + for line in e.output.decode("utf8").splitlines() + if "error validating" in line + ] + ) + + def validate_scrape_jobs(self, jobs: list) -> bool: + """Validate scrape jobs using cos-tool.""" + if not self.path: + logger.debug("`cos-tool` unavailable. Not validating scrape jobs.") + return True + conf = {"scrape_configs": jobs} + with tempfile.NamedTemporaryFile() as tmpfile: + with open(tmpfile.name, "w") as f: + f.write(yaml.safe_dump(conf)) + try: + self._exec([str(self.path), "validate-config", tmpfile.name]) + except subprocess.CalledProcessError as e: + logger.error("Validating scrape jobs failed: {}".format(e.output)) + raise + return True + + def inject_label_matchers(self, expression, topology) -> str: + """Add label matchers to an expression.""" + if not topology: + return expression + if not self.path: + logger.debug("`cos-tool` unavailable. Leaving expression unchanged: %s", expression) + return expression + args = [str(self.path), "transform"] + args.extend( + ["--label-matcher={}={}".format(key, value) for key, value in topology.items()] + ) + + args.extend(["{}".format(expression)]) + # noinspection PyBroadException + try: + return self._exec(args) + except subprocess.CalledProcessError as e: + logger.debug('Applying the expression failed: "%s", falling back to the original', e) + return expression + + def _get_tool_path(self) -> Optional[Path]: + arch = platform.machine() + arch = "amd64" if arch == "x86_64" else arch + res = "cos-tool-{}".format(arch) + try: + path = Path(res).resolve() + path.chmod(0o777) + return path + except NotImplementedError: + logger.debug("System lacks support for chmod") + except FileNotFoundError: + logger.debug('Could not locate cos-tool at: "{}"'.format(res)) + return None + + def _exec(self, cmd) -> str: + result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + return result.stdout.decode("utf-8").strip() diff --git a/charms/istio-gateway/metadata.yaml b/charms/istio-gateway/metadata.yaml index 70c83b44..6990f678 100644 --- a/charms/istio-gateway/metadata.yaml +++ b/charms/istio-gateway/metadata.yaml @@ -22,5 +22,10 @@ requires: - service-port versions: [v1] __schema_source: https://raw.githubusercontent.com/canonical/operator-schemas/master/k8s-service.yaml +provides: + metrics-endpoint: + interface: prometheus_scrape + grafana-dashboard: + interface: grafana_dashboard assumes: - juju >= 3.1 diff --git a/charms/istio-gateway/requirements-unit.txt b/charms/istio-gateway/requirements-unit.txt index 9113f1ed..d9c80ccb 100644 --- a/charms/istio-gateway/requirements-unit.txt +++ b/charms/istio-gateway/requirements-unit.txt @@ -4,6 +4,8 @@ # # pip-compile requirements-unit.in # +annotated-types==0.7.0 + # via pydantic anyio==4.0.0 # via httpcore attrs==23.1.0 @@ -15,6 +17,8 @@ certifi==2023.7.22 # requests charset-normalizer==3.2.0 # via requests +cosl==0.0.13 + # via -r requirements.in coverage==7.3.0 # via -r requirements-unit.in exceptiongroup==1.1.3 @@ -56,6 +60,7 @@ ops==2.14.0 # via # -r requirements-unit.in # -r requirements.in + # cosl # 
serialized-data-interface packaging==23.1 # via pytest @@ -63,6 +68,10 @@ pkgutil-resolve-name==1.3.10 # via jsonschema pluggy==1.3.0 # via pytest +pydantic==2.8.2 + # via cosl +pydantic-core==2.20.1 + # via pydantic pyrsistent==0.19.3 # via jsonschema pytest==7.4.1 @@ -74,6 +83,7 @@ pytest-mock==3.11.1 pyyaml==6.0.1 # via # -r requirements-unit.in + # cosl # lightkube # ops # serialized-data-interface @@ -90,6 +100,12 @@ sniffio==1.3.0 # httpx tomli==2.0.1 # via pytest +typing-extensions==4.12.2 + # via + # annotated-types + # cosl + # pydantic + # pydantic-core urllib3==2.0.4 # via requests websocket-client==1.6.2 diff --git a/charms/istio-gateway/requirements.in b/charms/istio-gateway/requirements.in index 75674524..ae449274 100644 --- a/charms/istio-gateway/requirements.in +++ b/charms/istio-gateway/requirements.in @@ -6,3 +6,4 @@ lightkube lightkube-models<1.28 # We're trying to use a deprecated API and it needs to be fixed before KF 1.7 (because 1.7 must support k8s 1.25) oci-image +cosl diff --git a/charms/istio-gateway/requirements.txt b/charms/istio-gateway/requirements.txt index 0bb6493c..6f494a6d 100644 --- a/charms/istio-gateway/requirements.txt +++ b/charms/istio-gateway/requirements.txt @@ -4,6 +4,8 @@ # # pip-compile requirements.in # +annotated-types==0.7.0 + # via pydantic anyio==4.0.0 # via httpcore attrs==23.1.0 @@ -15,6 +17,8 @@ certifi==2023.7.22 # requests charset-normalizer==3.2.0 # via requests +cosl==0.0.13 + # via -r requirements.in exceptiongroup==1.1.3 # via anyio h11==0.14.0 @@ -47,13 +51,19 @@ oci-image==1.0.0 ops==2.14.0 # via # -r requirements.in + # cosl # serialized-data-interface pkgutil-resolve-name==1.3.10 # via jsonschema +pydantic==2.8.2 + # via cosl +pydantic-core==2.20.1 + # via pydantic pyrsistent==0.19.3 # via jsonschema pyyaml==6.0.1 # via + # cosl # lightkube # ops # serialized-data-interface @@ -68,6 +78,12 @@ sniffio==1.3.0 # anyio # httpcore # httpx +typing-extensions==4.12.2 + # via + # annotated-types + # cosl + # pydantic + # pydantic-core urllib3==2.0.4 # via requests websocket-client==1.6.2 diff --git a/charms/istio-gateway/src/charm.py b/charms/istio-gateway/src/charm.py index 03829693..6532397a 100755 --- a/charms/istio-gateway/src/charm.py +++ b/charms/istio-gateway/src/charm.py @@ -2,6 +2,8 @@ import logging +from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardProvider +from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider from jinja2 import Environment, FileSystemLoader from lightkube import Client, codecs from lightkube.core.exceptions import ApiError @@ -12,6 +14,9 @@ SUPPORTED_GATEWAY_SERVICE_TYPES = ["LoadBalancer", "ClusterIP", "NodePort"] +METRICS_PATH = "/stats/prometheus" +METRICS_PORT = 9090 + class Operator(CharmBase): def __init__(self, *args): @@ -30,6 +35,24 @@ def __init__(self, *args): self.framework.observe(event, self.start) self.framework.observe(self.on.remove, self.remove) + # metrics and dashboard relation configuration + self.prometheus_provider = MetricsEndpointProvider( + charm=self, + relation_name="metrics-endpoint", + jobs=[ + { + "metrics_path": METRICS_PATH, + # Note(rgildein): Service is defined in manifest.yaml and without using full + # path, the grafana-agent will be using IP of application pod instead of IP + # of workload deployment. 
+ "static_configs": [ + {"targets": [f"istio-gateway-metrics.{self.model.name}.svc:{9090}"]} + ], + } + ], + ) + self.dashboard_provider = GrafanaDashboardProvider(self) + def start(self, event): """Event handler for StartEevnt.""" try: diff --git a/charms/istio-gateway/src/manifest.yaml b/charms/istio-gateway/src/manifest.yaml index c4f5c227..5fb11c6c 100644 --- a/charms/istio-gateway/src/manifest.yaml +++ b/charms/istio-gateway/src/manifest.yaml @@ -308,3 +308,25 @@ spec: selector: istio: {{ kind }}gateway type: {{ 'ClusterIP' if kind == 'egress' else gateway_service_type }} +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: istio-{{ kind }}gateway + install.operator.istio.io/owning-resource: unknown + istio: {{ kind }}gateway + istio.io/rev: default + operator.istio.io/component: "{{ kind|capitalize }}Gateways" + release: istio + name: istio-gateway-metrics + namespace: {{ namespace }} +spec: + ports: + - name: metrics + port: 9090 + protocol: TCP + targetPort: 15020 + selector: + istio: {{ kind }}gateway + type: ClusterIP diff --git a/charms/istio-gateway/tests/unit/data/egress-example.yaml b/charms/istio-gateway/tests/unit/data/egress-example.yaml index 8810a14c..a60b4005 100644 --- a/charms/istio-gateway/tests/unit/data/egress-example.yaml +++ b/charms/istio-gateway/tests/unit/data/egress-example.yaml @@ -303,3 +303,25 @@ spec: selector: istio: egressgateway type: ClusterIP +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: istio-egressgateway + install.operator.istio.io/owning-resource: unknown + istio: egressgateway + istio.io/rev: default + operator.istio.io/component: "EgressGateways" + release: istio + name: istio-gateway-metrics + namespace: None +spec: + ports: + - name: metrics + port: 9090 + protocol: TCP + targetPort: 15020 + selector: + istio: egressgateway + type: ClusterIP diff --git a/charms/istio-gateway/tests/unit/data/ingress-example.yaml b/charms/istio-gateway/tests/unit/data/ingress-example.yaml index 81d0689f..56b30b0f 100644 --- a/charms/istio-gateway/tests/unit/data/ingress-example.yaml +++ b/charms/istio-gateway/tests/unit/data/ingress-example.yaml @@ -303,3 +303,25 @@ spec: selector: istio: ingressgateway type: LoadBalancer +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: istio-ingressgateway + install.operator.istio.io/owning-resource: unknown + istio: ingressgateway + istio.io/rev: default + operator.istio.io/component: "IngressGateways" + release: istio + name: istio-gateway-metrics + namespace: None +spec: + ports: + - name: metrics + port: 9090 + protocol: TCP + targetPort: 15020 + selector: + istio: ingressgateway + type: ClusterIP diff --git a/charms/istio-gateway/tests/unit/test_charm.py b/charms/istio-gateway/tests/unit/test_charm.py index 9e9ed09a..50c69f5d 100644 --- a/charms/istio-gateway/tests/unit/test_charm.py +++ b/charms/istio-gateway/tests/unit/test_charm.py @@ -1,3 +1,5 @@ +from unittest.mock import patch + import pytest import yaml from lightkube.core.exceptions import ApiError @@ -138,3 +140,28 @@ def test_service_type(configured_harness_only_ingress, gateway_service_type, moc assert workload_service["spec"].get("type") == gateway_service_type assert configured_harness_only_ingress.charm.model.unit.status == ActiveStatus("") + + +def test_metrics(harness): + """Test MetricsEndpointProvider initialization.""" + with patch("charm.MetricsEndpointProvider") as mock_metrics: + harness.begin() + mock_metrics.assert_called_once_with( + charm=harness.charm, + relation_name="metrics-endpoint", + jobs=[ + { + 
"metrics_path": "/stats/prometheus", + "static_configs": [ + {"targets": [f"istio-gateway-metrics.{harness.model.name}.svc:9090"]} + ], + } + ], + ) + + +def test_grafana_dashboard(harness): + """Test GrafanaDashboardProvider initialization.""" + with patch("charm.GrafanaDashboardProvider") as mock_grafana: + harness.begin() + mock_grafana.assert_called_once_with(harness.charm) From ec66b51f5283967cfb8d442698fae124a6b54615 Mon Sep 17 00:00:00 2001 From: Robert Gildein Date: Fri, 19 Jul 2024 09:30:39 +0200 Subject: [PATCH 2/8] Update integration tests with cos. --- requirements-integration.in | 2 + requirements-integration.txt | 58 +++++++++-- requirements-lint.in | 1 + requirements-lint.txt | 18 +++- tests/test_cos_integration.py | 176 ++++++++++++---------------------- tox.ini | 1 - 6 files changed, 129 insertions(+), 127 deletions(-) diff --git a/requirements-integration.in b/requirements-integration.in index 63e39988..e542af6a 100644 --- a/requirements-integration.in +++ b/requirements-integration.in @@ -7,3 +7,5 @@ lightkube pytest-operator PyYAML tenacity +# This is required due to the abstraction of cos integration +charmed-kubeflow-chisme>=0.4.0 diff --git a/requirements-integration.txt b/requirements-integration.txt index 59512a18..c8aa57a5 100644 --- a/requirements-integration.txt +++ b/requirements-integration.txt @@ -4,7 +4,7 @@ # # pip-compile requirements-integration.in # -aiohttp==3.8.5 +aiohttp==3.9.5 # via -r requirements-integration.in aiosignal==1.3.1 # via aiohttp @@ -17,7 +17,9 @@ async-timeout==4.0.3 asyncio==3.4.3 # via -r requirements-integration.in attrs==23.1.0 - # via aiohttp + # via + # aiohttp + # jsonschema backcall==0.2.0 # via ipython bcrypt==4.0.1 @@ -36,16 +38,18 @@ cffi==1.15.1 # via # cryptography # pynacl +charmed-kubeflow-chisme==0.4.2 + # via -r requirements-integration.in charset-normalizer==3.2.0 - # via - # aiohttp - # requests + # via requests cryptography==41.0.3 # via paramiko decorator==5.1.1 # via # ipdb # ipython +deepdiff==6.2.1 + # via charmed-kubeflow-chisme exceptiongroup==1.1.3 # via # anyio @@ -72,6 +76,8 @@ idna==3.4 # httpx # requests # yarl +importlib-resources==6.4.0 + # via jsonschema iniconfig==2.0.0 # via pytest ipdb==0.13.13 @@ -81,15 +87,22 @@ ipython==8.12.2 jedi==0.19.0 # via ipython jinja2==3.1.2 - # via pytest-operator + # via + # charmed-kubeflow-chisme + # pytest-operator +jsonschema==4.17.3 + # via serialized-data-interface juju==3.4.0.0 # via # -r requirements-integration.in + # charmed-kubeflow-chisme # pytest-operator kubernetes==27.2.0 # via juju lightkube==0.14.0 - # via -r requirements-integration.in + # via + # -r requirements-integration.in + # charmed-kubeflow-chisme lightkube-models==1.28.1.4 # via lightkube macaroonbakery==1.3.1 @@ -108,6 +121,12 @@ oauthlib==3.2.2 # via # kubernetes # requests-oauthlib +ops==2.14.1 + # via + # charmed-kubeflow-chisme + # serialized-data-interface +ordered-set==4.1.0 + # via deepdiff packaging==23.1 # via # juju @@ -120,6 +139,8 @@ pexpect==4.8.0 # via ipython pickleshare==0.7.5 # via ipython +pkgutil-resolve-name==1.3.10 + # via jsonschema pluggy==1.3.0 # via pytest prompt-toolkit==3.0.39 @@ -154,6 +175,8 @@ pyrfc3339==1.1 # via # juju # macaroonbakery +pyrsistent==0.20.0 + # via jsonschema pytest==7.4.2 # via # pytest-asyncio @@ -172,17 +195,26 @@ pyyaml==6.0.1 # juju # kubernetes # lightkube + # ops # pytest-operator + # serialized-data-interface requests==2.31.0 # via # hvac # kubernetes # macaroonbakery # requests-oauthlib + # serialized-data-interface 
requests-oauthlib==1.3.1 # via kubernetes rsa==4.9 # via google-auth +ruamel-yaml==0.18.6 + # via charmed-kubeflow-chisme +ruamel-yaml-clib==0.2.8 + # via ruamel-yaml +serialized-data-interface==0.7.0 + # via charmed-kubeflow-chisme six==1.16.0 # via # asttokens @@ -202,7 +234,9 @@ soupsieve==2.5 stack-data==0.6.2 # via ipython tenacity==8.2.3 - # via -r requirements-integration.in + # via + # -r requirements-integration.in + # charmed-kubeflow-chisme tomli==2.0.1 # via # ipdb @@ -227,8 +261,12 @@ urllib3==1.26.16 wcwidth==0.2.6 # via prompt-toolkit websocket-client==1.6.2 - # via kubernetes -websockets==8.1 + # via + # kubernetes + # ops +websockets==12.0 # via juju yarl==1.9.2 # via aiohttp +zipp==3.19.2 + # via importlib-resources diff --git a/requirements-lint.in b/requirements-lint.in index 914276e8..07a4a51c 100644 --- a/requirements-lint.in +++ b/requirements-lint.in @@ -1,3 +1,4 @@ +black codespell flake8 flake8-builtins diff --git a/requirements-lint.txt b/requirements-lint.txt index 82ea1e33..35059667 100644 --- a/requirements-lint.txt +++ b/requirements-lint.txt @@ -4,6 +4,10 @@ # # pip-compile requirements-lint.in # +black==24.4.2 + # via -r requirements-lint.in +click==8.1.7 + # via black codespell==2.2.5 # via -r requirements-lint.in flake8==6.0.0 @@ -20,8 +24,16 @@ isort==5.12.0 # via -r requirements-lint.in mccabe==0.7.0 # via flake8 +mypy-extensions==1.0.0 + # via black +packaging==24.1 + # via black +pathspec==0.12.1 + # via black pep8-naming==0.13.3 # via -r requirements-lint.in +platformdirs==4.2.2 + # via black pycodestyle==2.10.0 # via flake8 pyflakes==3.0.1 @@ -29,7 +41,11 @@ pyflakes==3.0.1 pyproject-flake8==6.0.0.post1 # via -r requirements-lint.in tomli==2.0.1 - # via pyproject-flake8 + # via + # black + # pyproject-flake8 +typing-extensions==4.12.2 + # via black # The following packages are considered to be unsafe in a requirements file: # setuptools diff --git a/tests/test_cos_integration.py b/tests/test_cos_integration.py index 2443dcb1..88fa4d0a 100644 --- a/tests/test_cos_integration.py +++ b/tests/test_cos_integration.py @@ -1,31 +1,30 @@ # Copyright 2023 Canonical Ltd. # See LICENSE file for licensing details. 
-import glob -import json import logging from pathlib import Path import pytest -import requests -import tenacity -import yaml +from charmed_kubeflow_chisme.testing import ( + APP_GRAFANA_DASHBOARD, + APP_METRICS_ENDPOINT, + GRAFANA_AGENT_APP, + GRAFANA_AGENT_GRAFANA_DASHBOARD, + GRAFANA_AGENT_METRICS_ENDPOINT, + assert_alert_rules, + assert_metrics_endpoint, + deploy_and_assert_grafana_agent, + get_alert_rules, +) from pytest_operator.plugin import OpsTest log = logging.getLogger(__name__) ISTIO_PILOT = "istio-pilot" +ISTIO_PILOT_ALER_RULES = Path("./charms/istio-pilot/src/prometheus_alert_rules") ISTIO_GATEWAY_APP_NAME = "istio-ingressgateway" -PROMETHEUS_K8S = "prometheus-k8s" -PROMETHEUS_K8S_CHANNEL = "latest/stable" -PROMETHEUS_K8S_TRUST = True -PROMETHEUS_SCRAPE_K8S = "prometheus-scrape-config-k8s" -PROMETHEUS_SCRAPE_K8S_CHANNEL = "latest/stable" -PROMETHEUS_SCRAPE_CONFIG = {"scrape_interval": "30s"} - - @pytest.mark.abort_on_fail async def test_build_and_deploy_istio_charms(ops_test: OpsTest): # Build, deploy, and relate istio charms @@ -43,7 +42,7 @@ async def test_build_and_deploy_istio_charms(ops_test: OpsTest): trust=True, ) - await ops_test.model.add_relation( + await ops_test.model.integrate( f"{ISTIO_PILOT}:istio-pilot", f"{ISTIO_GATEWAY_APP_NAME}:istio-pilot" ) @@ -52,110 +51,57 @@ async def test_build_and_deploy_istio_charms(ops_test: OpsTest): raise_on_blocked=False, timeout=90 * 10, ) - - -async def test_prometheus_grafana_integration_istio_pilot(ops_test: OpsTest): - """Deploy prometheus and required relations, then test the metrics.""" - await ops_test.model.deploy( - PROMETHEUS_K8S, - channel=PROMETHEUS_K8S_CHANNEL, - trust=PROMETHEUS_K8S_TRUST, + # Deploying grafana-agent-k8s and add all relations + await deploy_and_assert_grafana_agent( + ops_test.model, ISTIO_PILOT, metrics=True, dashboard=True, logging=False ) - await ops_test.model.deploy( - PROMETHEUS_SCRAPE_K8S, - channel=PROMETHEUS_SCRAPE_K8S_CHANNEL, - config=PROMETHEUS_SCRAPE_CONFIG, + # Note(rgildein): Using this until the [1] is not fixed. 
+ # [1]: https://github.com/canonical/charmed-kubeflow-chisme/issues/117 + log.info( + "Adding relation: %s:%s and %s:%s", + ISTIO_GATEWAY_APP_NAME, + APP_GRAFANA_DASHBOARD, + GRAFANA_AGENT_APP, + GRAFANA_AGENT_GRAFANA_DASHBOARD, ) - - await ops_test.model.add_relation("istio-pilot", PROMETHEUS_SCRAPE_K8S) - await ops_test.model.add_relation( - f"{PROMETHEUS_K8S}:metrics-endpoint", - f"{PROMETHEUS_SCRAPE_K8S}:metrics-endpoint", + await ops_test.model.integrate( + f"{ISTIO_GATEWAY_APP_NAME}:{APP_GRAFANA_DASHBOARD}", + f"{GRAFANA_AGENT_APP}:{GRAFANA_AGENT_GRAFANA_DASHBOARD}", + ) + log.info( + "Adding relation: %s:%s and %s:%s", + ISTIO_GATEWAY_APP_NAME, + APP_METRICS_ENDPOINT, + GRAFANA_AGENT_APP, + GRAFANA_AGENT_METRICS_ENDPOINT, + ) + await ops_test.model.integrate( + f"{ISTIO_GATEWAY_APP_NAME}:{APP_METRICS_ENDPOINT}", + f"{GRAFANA_AGENT_APP}:{GRAFANA_AGENT_METRICS_ENDPOINT}", + ) + await ops_test.model.wait_for_idle( + apps=[GRAFANA_AGENT_APP], status="blocked", timeout=5 * 60, idle_period=60 ) - - await ops_test.model.wait_for_idle(status="active", timeout=60 * 20) - status = await ops_test.model.get_status() - prometheus_unit_ip = status["applications"][PROMETHEUS_K8S]["units"][f"{PROMETHEUS_K8S}/0"][ - "address" - ] - log.info(f"Prometheus available at http://{prometheus_unit_ip}:9090") - - for attempt in retry_for_5_attempts: - log.info( - f"Testing prometheus deployment (attempt " f"{attempt.retry_state.attempt_number})" - ) - with attempt: - r = requests.get( - f"http://{prometheus_unit_ip}:9090/api/v1/query?" - f'query=up{{juju_application="{ISTIO_PILOT}"}}' - ) - response = json.loads(r.content.decode("utf-8")) - response_status = response["status"] - log.info(f"Response status is {response_status}") - assert response_status == "success" - - response_metric = response["data"]["result"][0]["metric"] - assert response_metric["juju_application"] == ISTIO_PILOT - assert response_metric["juju_model"] == ops_test.model_name - - -async def test_istio_pilot_alert_rules(ops_test: OpsTest): - """Test alert rules availability and match with what is found in the source code.""" - - status = await ops_test.model.get_status() - prometheus_unit_ip = status["applications"][PROMETHEUS_K8S]["units"][f"{PROMETHEUS_K8S}/0"][ - "address" - ] - - # Get targets and assert they are available - targets_url = f"http://{prometheus_unit_ip}:9090/api/v1/targets" - for attempt in retry_for_5_attempts: - log.info( - f"Reaching Prometheus targets... (attempt " f"{attempt.retry_state.attempt_number})" - ) - with attempt: - r = requests.get(targets_url) - targets_result = json.loads(r.content.decode("utf-8")) - assert targets_result is not None - assert targets_result["status"] == "success" - - # Verify that istio-pilot is in the target list - discovered_labels = targets_result["data"]["activeTargets"][0]["discoveredLabels"] - assert discovered_labels["juju_application"] == "istio-pilot" - - # Get available alert rules from Prometheus and assert they are available - rules_url = f"http://{prometheus_unit_ip}:9090/api/v1/rules" - for attempt in retry_for_5_attempts: - log.info( - f"Reaching Prometheus alert rules... 
(attempt "
-            f"{attempt.retry_state.attempt_number})"
-        )
-        with attempt:
-            r = requests.get(rules_url)
-            alert_rules_result = json.loads(r.content.decode("utf-8"))
-
-            assert alert_rules_result is not None
-            assert alert_rules_result["status"] == "success"
-            actual_rules = []
-            for group in alert_rules_result["data"]["groups"]:
-                actual_rules.append(group["rules"][0])
-
-    # Verify expected alerts vs actual alerts in Prometheus
-    istio_pilot_alert_rules = glob.glob("charms/istio-pilot/src/prometheus_alert_rules/*.rule")
-    expected_rules = []
-    for alert_rule in istio_pilot_alert_rules:
-        alert_object = yaml.safe_load(Path(alert_rule).read_text())
-        expected_rules.append(alert_object["alert"])
-    assert len(expected_rules) == len(actual_rules)
-
-    # Verify istio_pilot alert rules match the actual alert rules
-    for rule in actual_rules:
-        assert rule["name"] in expected_rules
-# Helper to retry calling a function over 30 seconds or 5 attempts
-retry_for_5_attempts = tenacity.Retrying(
-    stop=(tenacity.stop_after_attempt(5) | tenacity.stop_after_delay(30)),
-    wait=tenacity.wait_exponential(multiplier=1, min=1, max=10),
-    reraise=True,
+@pytest.mark.parametrize(
+    "charm, metrics_path, metrics_port",
+    [(ISTIO_PILOT, "/metrics", 15014), (ISTIO_GATEWAY_APP_NAME, "/stats/prometheus", 9090)],
 )
+async def test_metrics_enpoint(charm, metrics_path, metrics_port, ops_test):
+    """Test that metrics endpoints are defined in the relation data bag and are accessible.
+    This function gets all the metrics endpoints from the relation data bag, checks that
+    they are reachable through the grafana-agent-k8s charm, and finally compares them with
+    the ones provided to the function.
+    """
+    app = ops_test.model.applications[charm]
+    await assert_metrics_endpoint(app, metrics_port=metrics_port, metrics_path=metrics_path)
+
+
+@pytest.mark.parametrize("charm, path_to_alert_rules", [(ISTIO_PILOT, ISTIO_PILOT_ALER_RULES)])
+async def test_alert_rules(charm, path_to_alert_rules, ops_test):
+    """Test check charm alert rules and rules defined in relation data bag."""
+    app = ops_test.model.applications[charm]
+    alert_rules = get_alert_rules(path_to_alert_rules)
+    log.info("found alert_rules: %s", alert_rules)
+    await assert_alert_rules(app, alert_rules)
diff --git a/tox.ini b/tox.ini
index 0b31eecd..58ae8731 100644
--- a/tox.ini
+++ b/tox.ini
@@ -46,7 +46,6 @@ deps =
 description = Apply coding style standards to code
 
 [testenv:lint]
-allowlist_externals = black
 commands =
     # uncomment the following line if this charm owns a lib
     # codespell {[vars]lib_path}

From a3e82eb586716ab86ab4b5189e063570f31a624b Mon Sep 17 00:00:00 2001
From: Robert Gildein
Date: Fri, 19 Jul 2024 14:18:20 +0200
Subject: [PATCH 3/8] Add single alert rule from source [1]

There are more rules, but no metrics were found for them.
I also checked alert rules from [2].
The Grafana dashboards will be added to istio-pilot in a different PR,
since most of the metrics used in them come from istio-pilot.
--- [1]: https://samber.github.io/awesome-prometheus-alerts/rules.html#rule-istio-1-10 [2]: https://github.com/istio/tools/blob/release-1.14/perf/stability/alertmanager/prometheusrule.yaml --- .../prometheus_alert_rules/IstioPilotDuplicateEntry.rule | 8 ++++++++ tests/test_cos_integration.py | 5 ++++- 2 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 charms/istio-gateway/src/prometheus_alert_rules/IstioPilotDuplicateEntry.rule diff --git a/charms/istio-gateway/src/prometheus_alert_rules/IstioPilotDuplicateEntry.rule b/charms/istio-gateway/src/prometheus_alert_rules/IstioPilotDuplicateEntry.rule new file mode 100644 index 00000000..39301de5 --- /dev/null +++ b/charms/istio-gateway/src/prometheus_alert_rules/IstioPilotDuplicateEntry.rule @@ -0,0 +1,8 @@ +alert: IstioPilotDuplicateEntry +expr: sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0 +for: 0m +labels: + severity: critical +annotations: + summary: Istio Pilot Duplicate Entry (instance {{ $labels.instance }}) + description: "Istio pilot duplicate entry error.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/tests/test_cos_integration.py b/tests/test_cos_integration.py index 88fa4d0a..a1067de1 100644 --- a/tests/test_cos_integration.py +++ b/tests/test_cos_integration.py @@ -23,6 +23,7 @@ ISTIO_PILOT = "istio-pilot" ISTIO_PILOT_ALER_RULES = Path("./charms/istio-pilot/src/prometheus_alert_rules") ISTIO_GATEWAY_APP_NAME = "istio-ingressgateway" +ISTIO_GATEWAY_ALER_RULES = Path("./charms/istio-ingressgateway/src/prometheus_alert_rules") @pytest.mark.abort_on_fail @@ -98,7 +99,9 @@ async def test_metrics_enpoint(charm, metrics_path, metrics_port, ops_test): await assert_metrics_endpoint(app, metrics_port=metrics_port, metrics_path=metrics_path) -@pytest.mark.parametrize("charm, path_to_alert_rules", [(ISTIO_PILOT, ISTIO_PILOT_ALER_RULES)]) +@pytest.mark.parametrize("charm, path_to_alert_rules", [ + (ISTIO_PILOT, ISTIO_PILOT_ALER_RULES), (ISTIO_GATEWAY_APP_NAME, ISTIO_PILOT_ALER_RULES) +]) async def test_alert_rules(charm, path_to_alert_rules, ops_test): """Test check charm alert rules and rules defined in relation data bag.""" app = ops_test.model.applications[charm] From 2d168ab04f279fcb32d8a94b455fd3885ef8af99 Mon Sep 17 00:00:00 2001 From: Robert Gildein Date: Thu, 25 Jul 2024 11:05:56 +0200 Subject: [PATCH 4/8] bump cosl and jinja2 --- charms/istio-gateway/charmcraft.yaml | 1 + charms/istio-gateway/requirements-unit.txt | 3 +-- charms/istio-gateway/requirements.txt | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/charms/istio-gateway/charmcraft.yaml b/charms/istio-gateway/charmcraft.yaml index 0cfc6685..15391257 100644 --- a/charms/istio-gateway/charmcraft.yaml +++ b/charms/istio-gateway/charmcraft.yaml @@ -9,3 +9,4 @@ bases: parts: charm: charm-python-packages: [setuptools, pip] + build-packages: [git, rustc, cargo, libffi-dev, libssl-dev, pkg-config] diff --git a/charms/istio-gateway/requirements-unit.txt b/charms/istio-gateway/requirements-unit.txt index ca3c4af1..639bd76a 100644 --- a/charms/istio-gateway/requirements-unit.txt +++ b/charms/istio-gateway/requirements-unit.txt @@ -100,13 +100,12 @@ sniffio==1.3.1 tomli==2.0.1 # via pytest typing-extensions==4.12.2 - # via # annotated-types + # anyio # cosl # pydantic # pydantic-core - # via anyio urllib3==2.2.2 # via requests websocket-client==1.8.0 diff --git a/charms/istio-gateway/requirements.txt b/charms/istio-gateway/requirements.txt index 8d5daea9..5232c959 100644 --- a/charms/istio-gateway/requirements.txt +++ 
b/charms/istio-gateway/requirements.txt @@ -7,7 +7,7 @@ annotated-types==0.7.0 # via pydantic anyio==4.4.0 - # via httpcore + # via httpx attrs==23.2.0 # via jsonschema certifi==2024.7.4 @@ -17,7 +17,7 @@ certifi==2024.7.4 # requests charset-normalizer==3.3.2 # via requests -cosl==0.0.14 +cosl==0.0.15 # via -r requirements.in exceptiongroup==1.2.2 # via anyio @@ -80,6 +80,7 @@ sniffio==1.3.1 typing-extensions==4.12.2 # via # annotated-types + # anyio # cosl # pydantic # pydantic-core From 58cdde24f8efd7c7afaa79d65598fadf86d853b0 Mon Sep 17 00:00:00 2001 From: Robert Gildein Date: Thu, 25 Jul 2024 15:17:57 +0200 Subject: [PATCH 5/8] fix integration test for alert rules --- tests/test_cos_integration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_cos_integration.py b/tests/test_cos_integration.py index a1067de1..85ba561e 100644 --- a/tests/test_cos_integration.py +++ b/tests/test_cos_integration.py @@ -23,7 +23,7 @@ ISTIO_PILOT = "istio-pilot" ISTIO_PILOT_ALER_RULES = Path("./charms/istio-pilot/src/prometheus_alert_rules") ISTIO_GATEWAY_APP_NAME = "istio-ingressgateway" -ISTIO_GATEWAY_ALER_RULES = Path("./charms/istio-ingressgateway/src/prometheus_alert_rules") +ISTIO_GATEWAY_ALER_RULES = Path("./charms/istio-gateway/src/prometheus_alert_rules") @pytest.mark.abort_on_fail @@ -100,7 +100,7 @@ async def test_metrics_enpoint(charm, metrics_path, metrics_port, ops_test): @pytest.mark.parametrize("charm, path_to_alert_rules", [ - (ISTIO_PILOT, ISTIO_PILOT_ALER_RULES), (ISTIO_GATEWAY_APP_NAME, ISTIO_PILOT_ALER_RULES) + (ISTIO_PILOT, ISTIO_PILOT_ALER_RULES), (ISTIO_GATEWAY_APP_NAME, ISTIO_GATEWAY_ALER_RULES) ]) async def test_alert_rules(charm, path_to_alert_rules, ops_test): """Test check charm alert rules and rules defined in relation data bag.""" From cb8467849812075ab2090a700868ba49240f22b3 Mon Sep 17 00:00:00 2001 From: Robert Gildein Date: Tue, 30 Jul 2024 11:59:11 +0200 Subject: [PATCH 6/8] add alert rules from upstream --- .../IstioPilotDuplicateEntry.rule | 8 ------- .../src/prometheus_alert_rules/basic.rules | 18 +++++++++++++++ .../src/prometheus_alert_rules/workload.rules | 23 +++++++++++++++++++ 3 files changed, 41 insertions(+), 8 deletions(-) delete mode 100644 charms/istio-gateway/src/prometheus_alert_rules/IstioPilotDuplicateEntry.rule create mode 100644 charms/istio-gateway/src/prometheus_alert_rules/basic.rules create mode 100644 charms/istio-gateway/src/prometheus_alert_rules/workload.rules diff --git a/charms/istio-gateway/src/prometheus_alert_rules/IstioPilotDuplicateEntry.rule b/charms/istio-gateway/src/prometheus_alert_rules/IstioPilotDuplicateEntry.rule deleted file mode 100644 index 39301de5..00000000 --- a/charms/istio-gateway/src/prometheus_alert_rules/IstioPilotDuplicateEntry.rule +++ /dev/null @@ -1,8 +0,0 @@ -alert: IstioPilotDuplicateEntry -expr: sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0 -for: 0m -labels: - severity: critical -annotations: - summary: Istio Pilot Duplicate Entry (instance {{ $labels.instance }}) - description: "Istio pilot duplicate entry error.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/charms/istio-gateway/src/prometheus_alert_rules/basic.rules b/charms/istio-gateway/src/prometheus_alert_rules/basic.rules new file mode 100644 index 00000000..3644e2a2 --- /dev/null +++ b/charms/istio-gateway/src/prometheus_alert_rules/basic.rules @@ -0,0 +1,18 @@ +# Note(rgildein): Alert rules come from the source 
https://github.com/istio/tools/blob/6d537aee69ec7e9da007f311562496f7ac1cb691/perf/stability/alertmanager/prometheusrule.yaml#L11-L26
+groups:
+  - name: IstioGatewatBasic
+    rules:
+      - alert: IngressTrafficMissing
+        annotations:
+          summary: 'ingress gateway traffic missing'
+          description: '[Critical]: ingress gateway traffic missing, likely other monitors are misleading, check client logs'
+        expr: >
+          absent(istio_requests_total{reporter="source", source_workload=~"istio-(ingress|egress)gateway-workload"})==1
+        for: 5m
+      - alert: IstioMetricsMissing
+        annotations:
+          summary: 'Istio Metrics missing'
+          description: '[Critical]: Check prometheus deployment or whether the prometheus filters are applied correctly'
+        expr: >
+          absent(istio_requests_total)==1 or absent(istio_request_duration_milliseconds_bucket)==1
+        for: 5m
diff --git a/charms/istio-gateway/src/prometheus_alert_rules/workload.rules b/charms/istio-gateway/src/prometheus_alert_rules/workload.rules
new file mode 100644
index 00000000..bf674efe
--- /dev/null
+++ b/charms/istio-gateway/src/prometheus_alert_rules/workload.rules
@@ -0,0 +1,23 @@
+# Note(rgildein): Alert rules come from the source https://github.com/istio/tools/blob/6d537aee69ec7e9da007f311562496f7ac1cb691/perf/stability/alertmanager/prometheusrule.yaml#L28-L47
+groups:
+  - name: Workload
+    rules:
+      - alert: HTTP5xxRateHigh
+        annotations:
+          summary: '5xx rate too high'
+          description: 'The HTTP 5xx errors rate higher than 0.05 in 5 mins'
+        expr: >
+          sum(irate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(irate(istio_requests_total{reporter="destination"}[5m])) > 0.05
+        for: 5m
+      - alert: WorkloadLatencyP99High
+        expr: histogram_quantile(0.99, sum(irate(istio_request_duration_milliseconds_bucket{source_workload=~"svc.*"}[5m])) by (source_workload,namespace, le)) > 160
+        for: 10m
+        annotations:
+          description: 'The workload request latency P99 > 160ms '
+          message: "Request duration has slowed down for workload: {{`{{$labels.source_workload}}`}} in namespace: {{`{{$labels.namespace}}`}}. Response duration is {{`{{$value}}`}} milliseconds"
+      - alert: IngressLatencyP99High
+        expr: histogram_quantile(0.99, sum(irate(istio_request_duration_milliseconds_bucket{source_workload=~"istio.*"}[5m])) by (source_workload,namespace, le)) > 250
+        for: 10m
+        annotations:
+          description: 'The ingress latency P99 > 250ms '
+          message: "Request duration has slowed down for ingress: {{`{{$labels.source_workload}}`}} in namespace: {{`{{$labels.namespace}}`}}. Response duration is {{`{{$value}}`}} milliseconds"

From 73289d996ec28b0765fa5a36896c69e63d6922ff Mon Sep 17 00:00:00 2001
From: Robert Gildein
Date: Tue, 30 Jul 2024 12:32:46 +0200
Subject: [PATCH 7/8] drop dashboard relation

---
 .../grafana_k8s/v0/grafana_dashboard.py       | 2014 -----------------
 charms/istio-gateway/metadata.yaml            |    2 -
 charms/istio-gateway/src/charm.py             |    4 +-
 .../src/prometheus_alert_rules/basic.rules    |    2 +-
 .../src/prometheus_alert_rules/workload.rules |    2 +-
 charms/istio-gateway/tests/unit/test_charm.py |    7 -
 tests/test_cos_integration.py                 |   13 -
 7 files changed, 3 insertions(+), 2041 deletions(-)
 delete mode 100644 charms/istio-gateway/lib/charms/grafana_k8s/v0/grafana_dashboard.py

diff --git a/charms/istio-gateway/lib/charms/grafana_k8s/v0/grafana_dashboard.py b/charms/istio-gateway/lib/charms/grafana_k8s/v0/grafana_dashboard.py
deleted file mode 100644
index dfc32ddc..00000000
--- a/charms/istio-gateway/lib/charms/grafana_k8s/v0/grafana_dashboard.py
+++ /dev/null
@@ -1,2014 +0,0 @@
-# Copyright 2021 Canonical Ltd.
-# See LICENSE file for licensing details. - -"""## Overview. - -This document explains how to integrate with the Grafana charm -for the purpose of providing a dashboard which can be used by -end users. It also explains the structure of the data -expected by the `grafana-dashboard` interface, and may provide a -mechanism or reference point for providing a compatible interface -or library by providing a definitive reference guide to the -structure of relation data which is shared between the Grafana -charm and any charm providing datasource information. - -## Provider Library Usage - -The Grafana charm interacts with its dashboards using its charm -library. The goal of this library is to be as simple to use as -possible, and instantiation of the class with or without changing -the default arguments provides a complete use case. For the simplest -use case of a charm which bundles dashboards and provides a -`provides: grafana-dashboard` interface, - - requires: - grafana-dashboard: - interface: grafana_dashboard - -creation of a `GrafanaDashboardProvider` object with the default arguments is -sufficient. - -:class:`GrafanaDashboardProvider` expects that bundled dashboards should -be included in your charm with a default path of: - - path/to/charm.py - path/to/src/grafana_dashboards/*.{json|json.tmpl|.tmpl} - -Where the files are Grafana dashboard JSON data either from the -Grafana marketplace, or directly exported from a Grafana instance. -Refer to the [official docs](https://grafana.com/tutorials/provision-dashboards-and-data-sources/) -for more information. - -When constructing a dashboard that is intended to be consumed by COS, make sure to use variables -for your datasources, and name them "prometheusds" and "lokids". You can also use the following -juju topology variables in your dashboards: $juju_model, $juju_model_uuid, $juju_application -and $juju_unit. Note, however, that if metrics are coming via peripheral charms (scrape-config -or cos-config) then topology labels would not exist. - -The default constructor arguments are: - - `charm`: `self` from the charm instantiating this library - `relation_name`: grafana-dashboard - `dashboards_path`: "/src/grafana_dashboards" - -If your configuration requires any changes from these defaults, they -may be set from the class constructor. It may be instantiated as -follows: - - from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardProvider - - class FooCharm: - def __init__(self, *args): - super().__init__(*args, **kwargs) - ... - self.grafana_dashboard_provider = GrafanaDashboardProvider(self) - ... - -The first argument (`self`) should be a reference to the parent (providing -dashboards), as this charm's lifecycle events will be used to re-submit -dashboard information if a charm is upgraded, the pod is restarted, or other. - -An instantiated `GrafanaDashboardProvider` validates that the path specified -in the constructor (or the default) exists, reads the file contents, then -compresses them with LZMA and adds them to the application relation data -when a relation is established with Grafana. - -Provided dashboards will be checked by Grafana, and a series of dropdown menus -providing the ability to select query targets by Juju Model, application instance, -and unit will be added if they do not exist. 
- -To avoid requiring `jinja` in `GrafanaDashboardProvider` users, template validation -and rendering occurs on the other side of the relation, and relation data in -the form of: - - { - "event": { - "valid": `true|false`, - "errors": [], - } - } - -Will be returned if rendering or validation fails. In this case, the -`GrafanaDashboardProvider` object will emit a `dashboard_status_changed` event -of the type :class:`GrafanaDashboardEvent`, which will contain information -about the validation error. - -This information is added to the relation data for the charms as serialized JSON -from a dict, with a structure of: -``` -{ - "application": { - "dashboards": { - "uuid": a uuid generated to ensure a relation event triggers, - "templates": { - "file:{hash}": { - "content": `{compressed_template_data}`, - "charm": `charm.meta.name`, - "juju_topology": { - "model": `charm.model.name`, - "model_uuid": `charm.model.uuid`, - "application": `charm.app.name`, - "unit": `charm.unit.name`, - } - }, - "file:{other_file_hash}": { - ... - }, - }, - }, - }, -} -``` - -This is ingested by :class:`GrafanaDashboardConsumer`, and is sufficient for configuration. - -The [COS Configuration Charm](https://charmhub.io/cos-configuration-k8s) can be used to -add dashboards which are not bundled with charms. - -## Consumer Library Usage - -The `GrafanaDashboardConsumer` object may be used by Grafana -charms to manage relations with available dashboards. For this -purpose, a charm consuming Grafana dashboard information should do -the following things: - -1. Instantiate the `GrafanaDashboardConsumer` object by providing it a -reference to the parent (Grafana) charm and, optionally, the name of -the relation that the Grafana charm uses to interact with dashboards. -This relation must confirm to the `grafana-dashboard` interface. - -For example a Grafana charm may instantiate the -`GrafanaDashboardConsumer` in its constructor as follows - - from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardConsumer - - def __init__(self, *args): - super().__init__(*args) - ... - self.grafana_dashboard_consumer = GrafanaDashboardConsumer(self) - ... - -2. A Grafana charm also needs to listen to the -`GrafanaDashboardConsumer` events emitted by the `GrafanaDashboardConsumer` -by adding itself as an observer for these events: - - self.framework.observe( - self.grafana_source_consumer.on.sources_changed, - self._on_dashboards_changed, - ) - -Dashboards can be retrieved the :meth:`dashboards`: - -It will be returned in the format of: - -``` -[ - { - "id": unique_id, - "relation_id": relation_id, - "charm": the name of the charm which provided the dashboard, - "content": compressed_template_data - }, -] -``` - -The consuming charm should decompress the dashboard. 
-""" - -import base64 -import hashlib -import json -import logging -import lzma -import os -import platform -import re -import subprocess -import tempfile -import uuid -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union - -import yaml -from ops.charm import ( - CharmBase, - HookEvent, - RelationBrokenEvent, - RelationChangedEvent, - RelationCreatedEvent, - RelationEvent, - RelationRole, -) -from ops.framework import ( - EventBase, - EventSource, - Object, - ObjectEvents, - StoredDict, - StoredList, - StoredState, -) -from ops.model import Relation - -# The unique Charmhub library identifier, never change it -LIBID = "c49eb9c7dfef40c7b6235ebd67010a3f" - -# Increment this major API version when introducing breaking changes -LIBAPI = 0 - -# Increment this PATCH version before using `charmcraft publish-lib` or reset -# to 0 if you are raising the major API version - -LIBPATCH = 36 - -logger = logging.getLogger(__name__) - - -DEFAULT_RELATION_NAME = "grafana-dashboard" -DEFAULT_PEER_NAME = "grafana" -RELATION_INTERFACE_NAME = "grafana_dashboard" - -TOPOLOGY_TEMPLATE_DROPDOWNS = [ # type: ignore - { - "allValue": ".*", - "datasource": "${prometheusds}", - "definition": "label_values(up,juju_model)", - "description": None, - "error": None, - "hide": 0, - "includeAll": True, - "label": "Juju model", - "multi": True, - "name": "juju_model", - "query": { - "query": "label_values(up,juju_model)", - "refId": "StandardVariableQuery", - }, - "refresh": 1, - "regex": "", - "skipUrlSync": False, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": False, - }, - { - "allValue": ".*", - "datasource": "${prometheusds}", - "definition": 'label_values(up{juju_model=~"$juju_model"},juju_model_uuid)', - "description": None, - "error": None, - "hide": 0, - "includeAll": True, - "label": "Juju model uuid", - "multi": True, - "name": "juju_model_uuid", - "query": { - "query": 'label_values(up{juju_model=~"$juju_model"},juju_model_uuid)', - "refId": "StandardVariableQuery", - }, - "refresh": 1, - "regex": "", - "skipUrlSync": False, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": False, - }, - { - "allValue": ".*", - "datasource": "${prometheusds}", - "definition": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid"},juju_application)', - "description": None, - "error": None, - "hide": 0, - "includeAll": True, - "label": "Juju application", - "multi": True, - "name": "juju_application", - "query": { - "query": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid"},juju_application)', - "refId": "StandardVariableQuery", - }, - "refresh": 1, - "regex": "", - "skipUrlSync": False, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": False, - }, - { - "allValue": ".*", - "datasource": "${prometheusds}", - "definition": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid",juju_application=~"$juju_application"},juju_unit)', - "description": None, - "error": None, - "hide": 0, - "includeAll": True, - "label": "Juju unit", - "multi": True, - "name": "juju_unit", - "query": { - "query": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid",juju_application=~"$juju_application"},juju_unit)', - "refId": "StandardVariableQuery", - }, - "refresh": 1, - "regex": "", - "skipUrlSync": False, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - 
"tagsQuery": "", - "type": "query", - "useTags": False, - }, -] - -DATASOURCE_TEMPLATE_DROPDOWNS = [ # type: ignore - { - "description": None, - "error": None, - "hide": 0, - "includeAll": True, - "label": "Prometheus datasource", - "multi": True, - "name": "prometheusds", - "options": [], - "query": "prometheus", - "refresh": 1, - "regex": "", - "skipUrlSync": False, - "type": "datasource", - }, - { - "description": None, - "error": None, - "hide": 0, - "includeAll": True, - "label": "Loki datasource", - "multi": True, - "name": "lokids", - "options": [], - "query": "loki", - "refresh": 1, - "regex": "", - "skipUrlSync": False, - "type": "datasource", - }, -] - -REACTIVE_CONVERTER = { # type: ignore - "allValue": None, - "datasource": "${prometheusds}", - "definition": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid",juju_application=~"$juju_application"},host)', - "description": None, - "error": None, - "hide": 0, - "includeAll": True, - "label": "hosts", - "multi": True, - "name": "host", - "options": [], - "query": { - "query": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid",juju_application=~"$juju_application"},host)', - "refId": "StandardVariableQuery", - }, - "refresh": 1, - "regex": "", - "skipUrlSync": False, - "sort": 1, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": False, -} - - -class RelationNotFoundError(Exception): - """Raised if there is no relation with the given name.""" - - def __init__(self, relation_name: str): - self.relation_name = relation_name - self.message = "No relation named '{}' found".format(relation_name) - - super().__init__(self.message) - - -class RelationInterfaceMismatchError(Exception): - """Raised if the relation with the given name has a different interface.""" - - def __init__( - self, - relation_name: str, - expected_relation_interface: str, - actual_relation_interface: str, - ): - self.relation_name = relation_name - self.expected_relation_interface = expected_relation_interface - self.actual_relation_interface = actual_relation_interface - self.message = ( - "The '{}' relation has '{}' as " - "interface rather than the expected '{}'".format( - relation_name, actual_relation_interface, expected_relation_interface - ) - ) - - super().__init__(self.message) - - -class RelationRoleMismatchError(Exception): - """Raised if the relation with the given name has a different direction.""" - - def __init__( - self, - relation_name: str, - expected_relation_role: RelationRole, - actual_relation_role: RelationRole, - ): - self.relation_name = relation_name - self.expected_relation_interface = expected_relation_role - self.actual_relation_role = actual_relation_role - self.message = "The '{}' relation has role '{}' rather than the expected '{}'".format( - relation_name, repr(actual_relation_role), repr(expected_relation_role) - ) - - super().__init__(self.message) - - -class InvalidDirectoryPathError(Exception): - """Raised if the grafana dashboards folder cannot be found or is otherwise invalid.""" - - def __init__( - self, - grafana_dashboards_absolute_path: str, - message: str, - ): - self.grafana_dashboards_absolute_path = grafana_dashboards_absolute_path - self.message = message - - super().__init__(self.message) - - -def _resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> str: - """Resolve the provided path items against the directory of the main file. - - Look up the directory of the charmed operator file being executed. 
This is normally - going to be the charm.py file of the charm including this library. Then, resolve - the provided path elements and return its absolute path. - - Raises: - InvalidDirectoryPathError if the resolved path does not exist or it is not a directory - - """ - charm_dir = Path(str(charm.charm_dir)) - if not charm_dir.exists() or not charm_dir.is_dir(): - # Operator Framework does not currently expose a robust - # way to determine the top level charm source directory - # that is consistent across deployed charms and unit tests - # Hence for unit tests the current working directory is used - # TODO: updated this logic when the following ticket is resolved - # https://github.com/canonical/operator/issues/643 - charm_dir = Path(os.getcwd()) - - dir_path = charm_dir.absolute().joinpath(*path_elements) - - if not dir_path.exists(): - raise InvalidDirectoryPathError(str(dir_path), "directory does not exist") - if not dir_path.is_dir(): - raise InvalidDirectoryPathError(str(dir_path), "is not a directory") - - return str(dir_path) - - -def _validate_relation_by_interface_and_direction( - charm: CharmBase, - relation_name: str, - expected_relation_interface: str, - expected_relation_role: RelationRole, -) -> None: - """Verifies that a relation has the necessary characteristics. - - Verifies that the `relation_name` provided: (1) exists in metadata.yaml, - (2) declares as interface the interface name passed as `relation_interface` - and (3) has the right "direction", i.e., it is a relation that `charm` - provides or requires. - - Args: - charm: a `CharmBase` object to scan for the matching relation. - relation_name: the name of the relation to be verified. - expected_relation_interface: the interface name to be matched by the - relation named `relation_name`. - expected_relation_role: whether the `relation_name` must be either - provided or required by `charm`. - - Raises: - RelationNotFoundError: If there is no relation in the charm's metadata.yaml - named like the value of the `relation_name` argument. - RelationInterfaceMismatchError: If the relation interface of the - relation named as the provided `relation_name` argument does not - match the `expected_relation_interface` argument. - RelationRoleMismatchError: If the relation named as the provided `relation_name` - argument has a different role than what is specified by the - `expected_relation_role` argument. 
- """ - if relation_name not in charm.meta.relations: - raise RelationNotFoundError(relation_name) - - relation = charm.meta.relations[relation_name] - - actual_relation_interface = relation.interface_name - if actual_relation_interface and actual_relation_interface != expected_relation_interface: - raise RelationInterfaceMismatchError( - relation_name, expected_relation_interface, actual_relation_interface - ) - - if expected_relation_role == RelationRole.provides: - if relation_name not in charm.meta.provides: - raise RelationRoleMismatchError( - relation_name, RelationRole.provides, RelationRole.requires - ) - elif expected_relation_role == RelationRole.requires: - if relation_name not in charm.meta.requires: - raise RelationRoleMismatchError( - relation_name, RelationRole.requires, RelationRole.provides - ) - else: - raise Exception("Unexpected RelationDirection: {}".format(expected_relation_role)) - - -def _encode_dashboard_content(content: Union[str, bytes]) -> str: - if isinstance(content, str): - content = bytes(content, "utf-8") - - return base64.b64encode(lzma.compress(content)).decode("utf-8") - - -def _decode_dashboard_content(encoded_content: str) -> str: - return lzma.decompress(base64.b64decode(encoded_content.encode("utf-8"))).decode() - - -def _convert_dashboard_fields(content: str, inject_dropdowns: bool = True) -> str: - """Make sure values are present for Juju topology. - - Inserts Juju topology variables and selectors into the template, as well as - a variable for Prometheus. - """ - dict_content = json.loads(content) - datasources = {} - existing_templates = False - - template_dropdowns = ( - TOPOLOGY_TEMPLATE_DROPDOWNS + DATASOURCE_TEMPLATE_DROPDOWNS # type: ignore - if inject_dropdowns - else DATASOURCE_TEMPLATE_DROPDOWNS - ) - - # If the dashboard has __inputs, get the names to replace them. These are stripped - # from reactive dashboards in GrafanaDashboardAggregator, but charm authors in - # newer charms may import them directly from the marketplace - if "__inputs" in dict_content: - for field in dict_content["__inputs"]: - if "type" in field and field["type"] == "datasource": - datasources[field["name"]] = field["pluginName"].lower() - del dict_content["__inputs"] - - # If no existing template variables exist, just insert our own - if "templating" not in dict_content: - dict_content["templating"] = {"list": list(template_dropdowns)} # type: ignore - else: - # Otherwise, set a flag so we can go back later - existing_templates = True - for template_value in dict_content["templating"]["list"]: - # Build a list of `datasource_name`: `datasource_type` mappings - # The "query" field is actually "prometheus", "loki", "influxdb", etc - if "type" in template_value and template_value["type"] == "datasource": - datasources[template_value["name"]] = template_value["query"].lower() - - # Put our own variables in the template - for d in template_dropdowns: # type: ignore - if d not in dict_content["templating"]["list"]: - dict_content["templating"]["list"].insert(0, d) - - dict_content = _replace_template_fields(dict_content, datasources, existing_templates) - return json.dumps(dict_content) - - -def _replace_template_fields( # noqa: C901 - dict_content: dict, datasources: dict, existing_templates: bool -) -> dict: - """Make templated fields get cleaned up afterwards. - - If existing datasource variables are present, try to substitute them. 
- """ - replacements = {"loki": "${lokids}", "prometheus": "${prometheusds}"} - used_replacements = [] # type: List[str] - - # If any existing datasources match types we know, or we didn't find - # any templating variables at all, template them. - if datasources or not existing_templates: - panels = dict_content.get("panels", {}) - if panels: - dict_content["panels"] = _template_panels( - panels, replacements, used_replacements, existing_templates, datasources - ) - - # Find panels nested under rows - rows = dict_content.get("rows", {}) - if rows: - for row_idx, row in enumerate(rows): - if "panels" in row.keys(): - rows[row_idx]["panels"] = _template_panels( - row["panels"], - replacements, - used_replacements, - existing_templates, - datasources, - ) - - dict_content["rows"] = rows - - # Finally, go back and pop off the templates we stubbed out - deletions = [] - for tmpl in dict_content["templating"]["list"]: - if tmpl["name"] and tmpl["name"] in used_replacements: - deletions.append(tmpl) - - for d in deletions: - dict_content["templating"]["list"].remove(d) - - return dict_content - - -def _template_panels( - panels: dict, - replacements: dict, - used_replacements: list, - existing_templates: bool, - datasources: dict, -) -> dict: - """Iterate through a `panels` object and template it appropriately.""" - # Go through all the panels. If they have a datasource set, AND it's one - # that we can convert to ${lokids} or ${prometheusds}, by stripping off the - # ${} templating and comparing the name to the list we built, replace it, - # otherwise, leave it alone. - # - for panel in panels: - if "datasource" not in panel or not panel.get("datasource"): - continue - if not existing_templates: - datasource = panel.get("datasource") - if isinstance(datasource, str): - if "loki" in datasource: - panel["datasource"] = "${lokids}" - elif "grafana" in datasource: - continue - else: - panel["datasource"] = "${prometheusds}" - elif isinstance(datasource, dict): - # In dashboards exported by Grafana 9, datasource type is dict - dstype = datasource.get("type", "") - if dstype == "loki": - panel["datasource"]["uid"] = "${lokids}" - elif dstype == "prometheus": - panel["datasource"]["uid"] = "${prometheusds}" - else: - logger.debug("Unrecognized datasource type '%s'; skipping", dstype) - continue - else: - logger.error("Unknown datasource format: skipping") - continue - else: - if isinstance(panel["datasource"], str): - if panel["datasource"].lower() in replacements.values(): - # Already a known template variable - continue - # Strip out variable characters and maybe braces - ds = re.sub(r"(\$|\{|\})", "", panel["datasource"]) - - if ds not in datasources.keys(): - # Unknown, non-templated datasource, potentially a Grafana builtin - continue - - replacement = replacements.get(datasources[ds], "") - if replacement: - used_replacements.append(ds) - panel["datasource"] = replacement or panel["datasource"] - elif isinstance(panel["datasource"], dict): - dstype = panel["datasource"].get("type", "") - if panel["datasource"].get("uid", "").lower() in replacements.values(): - # Already a known template variable - continue - # Strip out variable characters and maybe braces - ds = re.sub(r"(\$|\{|\})", "", panel["datasource"].get("uid", "")) - - if ds not in datasources.keys(): - # Unknown, non-templated datasource, potentially a Grafana builtin - continue - - replacement = replacements.get(datasources[ds], "") - if replacement: - used_replacements.append(ds) - panel["datasource"]["uid"] = replacement - else: - 
logger.error("Unknown datasource format: skipping") - continue - return panels - - -def _inject_labels(content: str, topology: dict, transformer: "CosTool") -> str: - """Inject Juju topology into panel expressions via CosTool. - - A dashboard will have a structure approximating: - { - "__inputs": [], - "templating": { - "list": [ - { - "name": "prometheusds", - "type": "prometheus" - } - ] - }, - "panels": [ - { - "foo": "bar", - "targets": [ - { - "some": "field", - "expr": "up{job="foo"}" - }, - { - "some_other": "field", - "expr": "sum(http_requests_total{instance="$foo"}[5m])} - } - ], - "datasource": "${someds}" - } - ] - } - - `templating` is used elsewhere in this library, but the structure is not rigid. It is - not guaranteed that a panel will actually have any targets (it could be a "spacer" with - no datasource, hence no expression). It could have only one target. It could have multiple - targets. It could have multiple targets of which only one has an `expr` to evaluate. We need - to try to handle all of these concisely. - - `cos-tool` (`github.com/canonical/cos-tool` as a Go module in general) - does not know "Grafana-isms", such as using `[$_variable]` to modify the query from the user - interface, so we add placeholders (as `5y`, since it must parse, but a dashboard looking for - five years for a panel query would be unusual). - - Args: - content: dashboard content as a string - topology: a dict containing topology values - transformer: a 'CosTool' instance - Returns: - dashboard content with replaced values. - """ - dict_content = json.loads(content) - - if "panels" not in dict_content.keys(): - return json.dumps(dict_content) - - # Go through all the panels and inject topology labels - # Panels may have more than one 'target' where the expressions live, so that must be - # accounted for. Additionally, `promql-transform` does not necessarily gracefully handle - # expressions with range queries including variables. Exclude these. - # - # It is not a certainty that the `datasource` field will necessarily reflect the type, so - # operate on all fields. - panels = dict_content["panels"] - topology_with_prefix = {"juju_{}".format(k): v for k, v in topology.items()} - - # We need to use an index so we can insert the changed element back later - for panel_idx, panel in enumerate(panels): - if not isinstance(panel, dict): - continue - - # Use the index to insert it back in the same location - panels[panel_idx] = _modify_panel(panel, topology_with_prefix, transformer) - - return json.dumps(dict_content) - - -def _modify_panel(panel: dict, topology: dict, transformer: "CosTool") -> dict: - """Inject Juju topology into panel expressions via CosTool. 
- - Args: - panel: a dashboard panel as a dict - topology: a dict containing topology values - transformer: a 'CosTool' instance - Returns: - the panel with injected values - """ - if "targets" not in panel.keys(): - return panel - - # Pre-compile a regular expression to grab values from inside of [] - range_re = re.compile(r"\[(?P.*?)\]") - # Do the same for any offsets - offset_re = re.compile(r"offset\s+(?P-?\s*[$\w]+)") - - known_datasources = {"${prometheusds}": "promql", "${lokids}": "logql"} - - targets = panel["targets"] - - # We need to use an index so we can insert the changed element back later - for idx, target in enumerate(targets): - # If there's no expression, we don't need to do anything - if "expr" not in target.keys(): - continue - expr = target["expr"] - - if "datasource" not in panel.keys(): - continue - - if isinstance(panel["datasource"], str): - if panel["datasource"] not in known_datasources: - continue - querytype = known_datasources[panel["datasource"]] - elif isinstance(panel["datasource"], dict): - if panel["datasource"]["uid"] not in known_datasources: - continue - querytype = known_datasources[panel["datasource"]["uid"]] - else: - logger.error("Unknown datasource format: skipping") - continue - - # Capture all values inside `[]` into a list which we'll iterate over later to - # put them back in-order. Then apply the regex again and replace everything with - # `[5y]` so promql/parser will take it. - # - # Then do it again for offsets - range_values = [m.group("value") for m in range_re.finditer(expr)] - expr = range_re.sub(r"[5y]", expr) - - offset_values = [m.group("value") for m in offset_re.finditer(expr)] - expr = offset_re.sub(r"offset 5y", expr) - # Retrieve the new expression (which may be unchanged if there were no label - # matchers in the expression, or if tt was unable to be parsed like logql. It's - # virtually impossible to tell from any datasource "name" in a panel what the - # actual type is without re-implementing a complete dashboard parser, but no - # harm will some from passing invalid promql -- we'll just get the original back. - # - replacement = transformer.inject_label_matchers(expr, topology, querytype) - - if replacement == target["expr"]: - # promql-tranform caught an error. Move on - continue - - # Go back and substitute values in [] which were pulled out - # Enumerate with an index... again. The same regex is ok, since it will still match - # `[(.*?)]`, which includes `[5y]`, our placeholder - for i, match in enumerate(range_re.finditer(replacement)): - # Replace one-by-one, starting from the left. We build the string back with - # `str.replace(string_to_replace, replacement_value, count)`. Limit the count - # to one, since we are going through one-by-one through the list we saved earlier - # in `range_values`. - replacement = replacement.replace( - "[{}]".format(match.group("value")), - "[{}]".format(range_values[i]), - 1, - ) - - for i, match in enumerate(offset_re.finditer(replacement)): - # Replace one-by-one, starting from the left. We build the string back with - # `str.replace(string_to_replace, replacement_value, count)`. Limit the count - # to one, since we are going through one-by-one through the list we saved earlier - # in `range_values`. 
- replacement = replacement.replace( - "offset {}".format(match.group("value")), - "offset {}".format(offset_values[i]), - 1, - ) - - # Use the index to insert it back in the same location - targets[idx]["expr"] = replacement - - panel["targets"] = targets - return panel - - -def _type_convert_stored(obj): - """Convert Stored* to their appropriate types, recursively.""" - if isinstance(obj, StoredList): - return list(map(_type_convert_stored, obj)) - if isinstance(obj, StoredDict): - rdict = {} # type: Dict[Any, Any] - for k in obj.keys(): - rdict[k] = _type_convert_stored(obj[k]) - return rdict - return obj - - -class GrafanaDashboardsChanged(EventBase): - """Event emitted when Grafana dashboards change.""" - - def __init__(self, handle, data=None): - super().__init__(handle) - self.data = data - - def snapshot(self) -> Dict: - """Save grafana source information.""" - return {"data": self.data} - - def restore(self, snapshot): - """Restore grafana source information.""" - self.data = snapshot["data"] - - -class GrafanaDashboardEvents(ObjectEvents): - """Events raised by :class:`GrafanaSourceEvents`.""" - - dashboards_changed = EventSource(GrafanaDashboardsChanged) - - -class GrafanaDashboardEvent(EventBase): - """Event emitted when Grafana dashboards cannot be resolved. - - Enables us to set a clear status on the provider. - """ - - def __init__(self, handle, errors: List[Dict[str, str]] = [], valid: bool = False): - super().__init__(handle) - self.errors = errors - self.error_message = "; ".join([error["error"] for error in errors if "error" in error]) - self.valid = valid - - def snapshot(self) -> Dict: - """Save grafana source information.""" - return { - "error_message": self.error_message, - "valid": self.valid, - "errors": json.dumps(self.errors), - } - - def restore(self, snapshot): - """Restore grafana source information.""" - self.error_message = snapshot["error_message"] - self.valid = snapshot["valid"] - self.errors = json.loads(str(snapshot["errors"])) - - -class GrafanaProviderEvents(ObjectEvents): - """Events raised by :class:`GrafanaSourceEvents`.""" - - dashboard_status_changed = EventSource(GrafanaDashboardEvent) - - -class GrafanaDashboardProvider(Object): - """An API to provide Grafana dashboards to a Grafana charm.""" - - _stored = StoredState() - on = GrafanaProviderEvents() # pyright: ignore - - def __init__( - self, - charm: CharmBase, - relation_name: str = DEFAULT_RELATION_NAME, - dashboards_path: str = "src/grafana_dashboards", - ) -> None: - """API to provide Grafana dashboard to a Grafana charmed operator. - - The :class:`GrafanaDashboardProvider` object provides an API - to upload dashboards to a Grafana charm. In its most streamlined - usage, the :class:`GrafanaDashboardProvider` is integrated in a - charmed operator as follows: - - self.grafana = GrafanaDashboardProvider(self) - - The :class:`GrafanaDashboardProvider` will look for dashboard - templates in the `/grafana_dashboards` folder. - Additionally, dashboard templates can be uploaded programmatically - via the :method:`GrafanaDashboardProvider.add_dashboard` method. - - To use the :class:`GrafanaDashboardProvider` API, you need a relation - defined in your charm operator's metadata.yaml as follows: - - provides: - grafana-dashboard: - interface: grafana_dashboard - - If you would like to use relation name other than `grafana-dashboard`, - you will need to specify the relation name via the `relation_name` - argument when instantiating the :class:`GrafanaDashboardProvider` object. 
- However, it is strongly advised to keep the default relation name, - so that people deploying your charm will have a consistent experience - with all other charms that provide Grafana dashboards. - - It is possible to provide a different file path for the Grafana dashboards - to be automatically managed by the :class:`GrafanaDashboardProvider` object - via the `dashboards_path` argument. This may be necessary when the directory - structure of your charmed operator repository is not the "usual" one as - generated by `charmcraft init`, for example when adding the charmed operator - in a Java repository managed by Maven or Gradle. However, unless there are - such constraints with other tooling, it is strongly advised to store the - Grafana dashboards in the default `/grafana_dashboards` - folder, in order to provide a consistent experience for other charmed operator - authors. - - Args: - charm: a :class:`CharmBase` object which manages this - :class:`GrafanaProvider` object. Generally this is - `self` in the instantiating class. - relation_name: a :string: name of the relation managed by this - :class:`GrafanaDashboardProvider`; it defaults to "grafana-dashboard". - dashboards_path: a filesystem path relative to the charm root - where dashboard templates can be located. By default, the library - expects dashboard files to be in the `/grafana_dashboards` - directory. - """ - _validate_relation_by_interface_and_direction( - charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.provides - ) - - try: - dashboards_path = _resolve_dir_against_charm_path(charm, dashboards_path) - except InvalidDirectoryPathError as e: - logger.warning( - "Invalid Grafana dashboards folder at %s: %s", - e.grafana_dashboards_absolute_path, - e.message, - ) - - super().__init__(charm, relation_name) - - self._charm = charm - self._relation_name = relation_name - self._dashboards_path = dashboards_path - - # No peer relation bucket we can rely on providers, keep StoredState here, too - self._stored.set_default(dashboard_templates={}) # type: ignore - - self.framework.observe(self._charm.on.leader_elected, self._update_all_dashboards_from_dir) - self.framework.observe(self._charm.on.upgrade_charm, self._update_all_dashboards_from_dir) - self.framework.observe(self._charm.on.config_changed, self._update_all_dashboards_from_dir) - - self.framework.observe( - self._charm.on[self._relation_name].relation_created, - self._on_grafana_dashboard_relation_created, - ) - self.framework.observe( - self._charm.on[self._relation_name].relation_changed, - self._on_grafana_dashboard_relation_changed, - ) - - def add_dashboard(self, content: str, inject_dropdowns: bool = True) -> None: - """Add a dashboard to the relation managed by this :class:`GrafanaDashboardProvider`. - - Args: - content: a string representing a Jinja template. Currently, no - global variables are added to the Jinja template evaluation - context. - inject_dropdowns: a :boolean: indicating whether topology dropdowns should be - added to the dashboard - """ - # Update of storage must be done irrespective of leadership, so - # that the stored state is there when this unit becomes leader. - stored_dashboard_templates: Any = self._stored.dashboard_templates # pyright: ignore - - encoded_dashboard = _encode_dashboard_content(content) - - # Use as id the first chars of the encoded dashboard, so that - # it is predictable across units. 
- id = "prog:{}".format(encoded_dashboard[-24:-16]) - - stored_dashboard_templates[id] = self._content_to_dashboard_object( - encoded_dashboard, inject_dropdowns - ) - stored_dashboard_templates[id]["dashboard_alt_uid"] = self._generate_alt_uid(id) - - if self._charm.unit.is_leader(): - for dashboard_relation in self._charm.model.relations[self._relation_name]: - self._upset_dashboards_on_relation(dashboard_relation) - - def remove_non_builtin_dashboards(self) -> None: - """Remove all dashboards to the relation added via :method:`add_dashboard`.""" - # Update of storage must be done irrespective of leadership, so - # that the stored state is there when this unit becomes leader. - stored_dashboard_templates: Any = self._stored.dashboard_templates # pyright: ignore - - for dashboard_id in list(stored_dashboard_templates.keys()): - if dashboard_id.startswith("prog:"): - del stored_dashboard_templates[dashboard_id] - self._stored.dashboard_templates = stored_dashboard_templates - - if self._charm.unit.is_leader(): - for dashboard_relation in self._charm.model.relations[self._relation_name]: - self._upset_dashboards_on_relation(dashboard_relation) - - def update_dashboards(self) -> None: - """Trigger the re-evaluation of the data on all relations.""" - if self._charm.unit.is_leader(): - for dashboard_relation in self._charm.model.relations[self._relation_name]: - self._upset_dashboards_on_relation(dashboard_relation) - - def _update_all_dashboards_from_dir( - self, _: Optional[HookEvent] = None, inject_dropdowns: bool = True - ) -> None: - """Scans the built-in dashboards and updates relations with changes.""" - # Update of storage must be done irrespective of leadership, so - # that the stored state is there when this unit becomes leader. - - # Ensure we do not leave outdated dashboards by removing from stored all - # the encoded dashboards that start with "file/". - if self._dashboards_path: - stored_dashboard_templates: Any = self._stored.dashboard_templates # pyright: ignore - - for dashboard_id in list(stored_dashboard_templates.keys()): - if dashboard_id.startswith("file:"): - del stored_dashboard_templates[dashboard_id] - - # Path.glob uses fnmatch on the backend, which is pretty limited, so use a - # custom function for the filter - def _is_dashboard(p: Path) -> bool: - return p.is_file() and p.name.endswith((".json", ".json.tmpl", ".tmpl")) - - for path in filter(_is_dashboard, Path(self._dashboards_path).glob("*")): - # path = Path(path) - id = "file:{}".format(path.stem) - stored_dashboard_templates[id] = self._content_to_dashboard_object( - _encode_dashboard_content(path.read_bytes()), inject_dropdowns - ) - stored_dashboard_templates[id]["dashboard_alt_uid"] = self._generate_alt_uid(id) - - self._stored.dashboard_templates = stored_dashboard_templates - - if self._charm.unit.is_leader(): - for dashboard_relation in self._charm.model.relations[self._relation_name]: - self._upset_dashboards_on_relation(dashboard_relation) - - def _generate_alt_uid(self, key: str) -> str: - """Generate alternative uid for dashboards. - - Args: - key: A string used (along with charm.meta.name) to build the hash uid. - - Returns: A hash string. - """ - raw_dashboard_alt_uid = "{}-{}".format(self._charm.meta.name, key) - return hashlib.shake_256(raw_dashboard_alt_uid.encode("utf-8")).hexdigest(8) - - def _reinitialize_dashboard_data(self, inject_dropdowns: bool = True) -> None: - """Triggers a reload of dashboard outside of an eventing workflow. 
- - Args: - inject_dropdowns: a :bool: used to indicate whether topology dropdowns should be added - - This will destroy any existing relation data. - """ - try: - _resolve_dir_against_charm_path(self._charm, self._dashboards_path) - self._update_all_dashboards_from_dir(inject_dropdowns=inject_dropdowns) - - except InvalidDirectoryPathError as e: - logger.warning( - "Invalid Grafana dashboards folder at %s: %s", - e.grafana_dashboards_absolute_path, - e.message, - ) - stored_dashboard_templates: Any = self._stored.dashboard_templates # pyright: ignore - - for dashboard_id in list(stored_dashboard_templates.keys()): - if dashboard_id.startswith("file:"): - del stored_dashboard_templates[dashboard_id] - self._stored.dashboard_templates = stored_dashboard_templates - - # With all the file-based dashboards cleared out, force a refresh - # of relation data - if self._charm.unit.is_leader(): - for dashboard_relation in self._charm.model.relations[self._relation_name]: - self._upset_dashboards_on_relation(dashboard_relation) - - def _on_grafana_dashboard_relation_created(self, event: RelationCreatedEvent) -> None: - """Watch for a relation being created and automatically send dashboards. - - Args: - event: The :class:`RelationJoinedEvent` sent when a - `grafana_dashboaard` relationship is joined - """ - if self._charm.unit.is_leader(): - self._update_all_dashboards_from_dir() - self._upset_dashboards_on_relation(event.relation) - - def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> None: - """Watch for changes so we know if there's an error to signal back to the parent charm. - - Args: - event: The `RelationChangedEvent` that triggered this handler. - """ - if self._charm.unit.is_leader(): - data = json.loads(event.relation.data[event.app].get("event", "{}")) # type: ignore - - if not data: - return - - valid = bool(data.get("valid", True)) - errors = data.get("errors", []) - if valid and not errors: - self.on.dashboard_status_changed.emit(valid=valid) # pyright: ignore - else: - self.on.dashboard_status_changed.emit( # pyright: ignore - valid=valid, errors=errors - ) - - def _upset_dashboards_on_relation(self, relation: Relation) -> None: - """Update the dashboards in the relation data bucket.""" - # It's completely ridiculous to add a UUID, but if we don't have some - # pseudo-random value, this never makes it across 'juju set-state' - stored_data = { - "templates": _type_convert_stored(self._stored.dashboard_templates), # pyright: ignore - "uuid": str(uuid.uuid4()), - } - - relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data) - - def _content_to_dashboard_object(self, content: str, inject_dropdowns: bool = True) -> Dict: - return { - "charm": self._charm.meta.name, - "content": content, - "juju_topology": self._juju_topology if inject_dropdowns else {}, - "inject_dropdowns": inject_dropdowns, - } - - # This is not actually used in the dashboards, but is present to provide a secondary - # salt to ensure uniqueness in the dict keys in case individual charm units provide - # dashboards - @property - def _juju_topology(self) -> Dict: - return { - "model": self._charm.model.name, - "model_uuid": self._charm.model.uuid, - "application": self._charm.app.name, - "unit": self._charm.unit.name, - } - - @property - def dashboard_templates(self) -> List: - """Return a list of the known dashboard templates.""" - return list(self._stored.dashboard_templates.values()) # type: ignore - - -class GrafanaDashboardConsumer(Object): - """A consumer object for working 
with Grafana Dashboards.""" - - on = GrafanaDashboardEvents() # pyright: ignore - _stored = StoredState() - - def __init__( - self, - charm: CharmBase, - relation_name: str = DEFAULT_RELATION_NAME, - ) -> None: - """API to receive Grafana dashboards from charmed operators. - - The :class:`GrafanaDashboardConsumer` object provides an API - to consume dashboards provided by a charmed operator using the - :class:`GrafanaDashboardProvider` library. The - :class:`GrafanaDashboardConsumer` is integrated in a - charmed operator as follows: - - self.grafana = GrafanaDashboardConsumer(self) - - To use this library, you need a relation defined as follows in - your charm operator's metadata.yaml: - - requires: - grafana-dashboard: - interface: grafana_dashboard - - If you would like to use a different relation name than - `grafana-dashboard`, you need to specify the relation name via the - `relation_name` argument. However, it is strongly advised not to - change the default, so that people deploying your charm will have - a consistent experience with all other charms that consume Grafana - dashboards. - - Args: - charm: a :class:`CharmBase` object which manages this - :class:`GrafanaProvider` object. Generally this is - `self` in the instantiating class. - relation_name: a :string: name of the relation managed by this - :class:`GrafanaDashboardConsumer`; it defaults to "grafana-dashboard". - """ - _validate_relation_by_interface_and_direction( - charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.requires - ) - - super().__init__(charm, relation_name) - self._charm = charm - self._relation_name = relation_name - self._tranformer = CosTool(self._charm) - - self._stored.set_default(dashboards={}) # type: ignore - - self.framework.observe( - self._charm.on[self._relation_name].relation_changed, - self._on_grafana_dashboard_relation_changed, - ) - self.framework.observe( - self._charm.on[self._relation_name].relation_broken, - self._on_grafana_dashboard_relation_broken, - ) - self.framework.observe( - self._charm.on[DEFAULT_PEER_NAME].relation_changed, - self._on_grafana_peer_changed, - ) - - def get_dashboards_from_relation(self, relation_id: int) -> List: - """Get a list of known dashboards for one instance of the monitored relation. - - Args: - relation_id: the identifier of the relation instance, as returned by - :method:`ops.model.Relation.id`. - - Returns: a list of known dashboards coming from the provided relation instance. - """ - return [ - self._to_external_object(relation_id, dashboard) - for dashboard in self._get_stored_dashboards(relation_id) - ] - - def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> None: - """Handle relation changes in related providers. - - If there are changes in relations between Grafana dashboard consumers - and providers, this event handler (if the unit is the leader) will - get data for an incoming grafana-dashboard relation through a - :class:`GrafanaDashboardsChanged` event, and make the relation data - available in the app's datastore object. The Grafana charm can - then respond to the event to update its configuration. 
- """ - changes = False - if self._charm.unit.is_leader(): - changes = self._render_dashboards_and_signal_changed(event.relation) - - if changes: - self.on.dashboards_changed.emit() # pyright: ignore - - def _on_grafana_peer_changed(self, _: RelationChangedEvent) -> None: - """Emit dashboard events on peer events so secondary charm data updates.""" - if self._charm.unit.is_leader(): - return - self.on.dashboards_changed.emit() # pyright: ignore - - def update_dashboards(self, relation: Optional[Relation] = None) -> None: - """Re-establish dashboards on one or more relations. - - If something changes between this library and a datasource, try to re-establish - invalid dashboards and invalidate active ones. - - Args: - relation: a specific relation for which the dashboards have to be - updated. If not specified, all relations managed by this - :class:`GrafanaDashboardConsumer` will be updated. - """ - if self._charm.unit.is_leader(): - relations = ( - [relation] if relation else self._charm.model.relations[self._relation_name] - ) - - for relation in relations: - self._render_dashboards_and_signal_changed(relation) - - def _on_grafana_dashboard_relation_broken(self, event: RelationBrokenEvent) -> None: - """Update job config when providers depart. - - When a Grafana dashboard provider departs, the configuration - for that provider is removed from the list of dashboards - """ - if not self._charm.unit.is_leader(): - return - - self._remove_all_dashboards_for_relation(event.relation) - - def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool: # type: ignore - """Validate a given dashboard. - - Verify that the passed dashboard data is able to be found in our list - of datasources and will render. If they do, let the charm know by - emitting an event. - - Args: - relation: Relation; The relation the dashboard is associated with. - - Returns: - a boolean indicating whether an event should be emitted - """ - other_app = relation.app - - raw_data = relation.data[other_app].get("dashboards", "") # pyright: ignore - - if not raw_data: - logger.warning( - "No dashboard data found in the %s:%s relation", - self._relation_name, - str(relation.id), - ) - return False - - data = json.loads(raw_data) - - # The only piece of data needed on this side of the relations is "templates" - templates = data.pop("templates") - - # The dashboards are WAY too big since this ultimately calls out to Juju to - # set the relation data, and it overflows the maximum argument length for - # subprocess, so we have to use b64, annoyingly. - # Worse, Python3 expects absolutely everything to be a byte, and a plain - # `base64.b64encode()` is still too large, so we have to go through hoops - # of encoding to byte, compressing with lzma, converting to base64 so it - # can be converted to JSON, then all the way back. 
- - rendered_dashboards = [] - relation_has_invalid_dashboards = False - - for _, (fname, template) in enumerate(templates.items()): - content = None - error = None - topology = template.get("juju_topology", {}) - try: - content = _decode_dashboard_content(template["content"]) - inject_dropdowns = template.get("inject_dropdowns", True) - content = self._manage_dashboard_uid(content, template) - content = _convert_dashboard_fields(content, inject_dropdowns) - - if topology: - content = _inject_labels(content, topology, self._tranformer) - - content = _encode_dashboard_content(content) - except lzma.LZMAError as e: - error = str(e) - relation_has_invalid_dashboards = True - except json.JSONDecodeError as e: - error = str(e.msg) - logger.warning("Invalid JSON in Grafana dashboard: {}".format(fname)) - continue - - # Prepend the relation name and ID to the dashboard ID to avoid clashes with - # multiple relations with apps from the same charm, or having dashboards with - # the same ids inside their charm operators - rendered_dashboards.append( - { - "id": "{}:{}/{}".format(relation.name, relation.id, fname), - "original_id": fname, - "content": content if content else None, - "template": template, - "valid": (error is None), - "error": error, - } - ) - - if relation_has_invalid_dashboards: - self._remove_all_dashboards_for_relation(relation) - - invalid_templates = [ - data["original_id"] for data in rendered_dashboards if not data["valid"] - ] - - logger.warning( - "Cannot add one or more Grafana dashboards from relation '{}:{}': the following " - "templates are invalid: {}".format( - relation.name, - relation.id, - invalid_templates, - ) - ) - - relation.data[self._charm.app]["event"] = json.dumps( - { - "errors": [ - { - "dashboard_id": rendered_dashboard["original_id"], - "error": rendered_dashboard["error"], - } - for rendered_dashboard in rendered_dashboards - if rendered_dashboard["error"] - ] - } - ) - - # Dropping dashboards for a relation needs to be signalled - return True - - stored_data = rendered_dashboards - currently_stored_data = self._get_stored_dashboards(relation.id) - - coerced_data = _type_convert_stored(currently_stored_data) if currently_stored_data else {} - - if not coerced_data == stored_data: - stored_dashboards = self.get_peer_data("dashboards") - stored_dashboards[relation.id] = stored_data - self.set_peer_data("dashboards", stored_dashboards) - return True - return None # type: ignore - - def _manage_dashboard_uid(self, dashboard: str, template: dict) -> str: - """Add an uid to the dashboard if it is not present.""" - dashboard_dict = json.loads(dashboard) - - if not dashboard_dict.get("uid", None) and "dashboard_alt_uid" in template: - dashboard_dict["uid"] = template["dashboard_alt_uid"] - - return json.dumps(dashboard_dict) - - def _remove_all_dashboards_for_relation(self, relation: Relation) -> None: - """If an errored dashboard is in stored data, remove it and trigger a deletion.""" - if self._get_stored_dashboards(relation.id): - stored_dashboards = self.get_peer_data("dashboards") - stored_dashboards.pop(str(relation.id)) - self.set_peer_data("dashboards", stored_dashboards) - self.on.dashboards_changed.emit() # pyright: ignore - - def _to_external_object(self, relation_id, dashboard): - return { - "id": dashboard["original_id"], - "relation_id": relation_id, - "charm": dashboard["template"]["charm"], - "content": _decode_dashboard_content(dashboard["content"]), - } - - @property - def dashboards(self) -> List[Dict]: - """Get a list of known dashboards 
-
-        Returns: a list of known dashboards. The JSON of each of the dashboards is available
-            in the `content` field of the corresponding `dict`.
-        """
-        dashboards = []
-
-        for _, (relation_id, dashboards_for_relation) in enumerate(
-            self.get_peer_data("dashboards").items()
-        ):
-            for dashboard in dashboards_for_relation:
-                dashboards.append(self._to_external_object(relation_id, dashboard))
-
-        return dashboards
-
-    def _get_stored_dashboards(self, relation_id: int) -> list:
-        """Pull stored dashboards out of the peer data bucket."""
-        return self.get_peer_data("dashboards").get(str(relation_id), {})
-
-    def _set_default_data(self) -> None:
-        """Set defaults if they are not in peer relation data."""
-        data = {"dashboards": {}}  # type: ignore
-        for k, v in data.items():
-            if not self.get_peer_data(k):
-                self.set_peer_data(k, v)
-
-    def set_peer_data(self, key: str, data: Any) -> None:
-        """Put information into the peer data bucket instead of `StoredState`."""
-        self._charm.peers.data[self._charm.app][key] = json.dumps(data)  # type: ignore[attr-defined]
-
-    def get_peer_data(self, key: str) -> Any:
-        """Retrieve information from the peer data bucket instead of `StoredState`."""
-        data = self._charm.peers.data[self._charm.app].get(key, "")  # type: ignore[attr-defined]
-        return json.loads(data) if data else {}
-
-
-class GrafanaDashboardAggregator(Object):
-    """API to retrieve Grafana dashboards from machine dashboards.
-
-    The :class:`GrafanaDashboardAggregator` object provides a way to
-    collate and aggregate Grafana dashboards from reactive/machine charms
-    and transport them into Charmed Operators, using Juju topology.
-    For detailed usage instructions, see the documentation for
-    :module:`cos-proxy-operator`, as this class is intended for use as a
-    single point of intersection rather than use in individual charms.
-
-    Since :class:`GrafanaDashboardAggregator` serves as a bridge between
-    Canonical Observability Stack Charmed Operators and Reactive Charms,
-    deployed in a Reactive Juju model, both a target relation which is
-    used to collect events from Reactive charms and a `grafana_relation`
-    which is used to send the collected data back to the Canonical
-    Observability Stack are required.
-
-    In its most streamlined usage, :class:`GrafanaDashboardAggregator` is
-    integrated in a charmed operator as follows:
-        self.grafana = GrafanaDashboardAggregator(self)
-
-    Args:
-        charm: a :class:`CharmBase` object which manages this
-            :class:`GrafanaDashboardAggregator` object. Generally this is
-            `self` in the instantiating class.
-        target_relation: a :string: name of a relation managed by this
-            :class:`GrafanaDashboardAggregator`, which is used to communicate
-            with reactive/machine charms. It defaults to "dashboards".
-        grafana_relation: a :string: name of a relation used by this
-            :class:`GrafanaDashboardAggregator`, which is used to communicate
-            with charmed grafana. It defaults to "downstream-grafana-dashboard".
-    """
-
-    _stored = StoredState()
-    on = GrafanaProviderEvents()  # pyright: ignore
-
-    def __init__(
-        self,
-        charm: CharmBase,
-        target_relation: str = "dashboards",
-        grafana_relation: str = "downstream-grafana-dashboard",
-    ):
-        super().__init__(charm, grafana_relation)
-
-        # Reactive charms may be RPC-ish and not leave reliable data around. Keep
-        # StoredState here
-        self._stored.set_default(  # type: ignore
-            dashboard_templates={},
-            id_mappings={},
-        )
-
-        self._charm = charm
-        self._target_relation = target_relation
-        self._grafana_relation = grafana_relation
-
-        self.framework.observe(
-            self._charm.on[self._grafana_relation].relation_joined,
-            self._update_remote_grafana,
-        )
-        self.framework.observe(
-            self._charm.on[self._grafana_relation].relation_changed,
-            self._update_remote_grafana,
-        )
-        self.framework.observe(
-            self._charm.on[self._target_relation].relation_changed,
-            self.update_dashboards,
-        )
-        self.framework.observe(
-            self._charm.on[self._target_relation].relation_broken,
-            self.remove_dashboards,
-        )
-
-    def update_dashboards(self, event: RelationEvent) -> None:
-        """If we get a dashboard from a reactive charm, parse it out and update."""
-        if self._charm.unit.is_leader():
-            self._upset_dashboards_on_event(event)
-
-    def _upset_dashboards_on_event(self, event: RelationEvent) -> None:
-        """Update the dashboards in the relation data bucket."""
-        dashboards = self._handle_reactive_dashboards(event)
-
-        if not dashboards:
-            logger.warning(
-                "Could not find dashboard data after a relation change for {}".format(event.app)
-            )
-            return
-
-        for id in dashboards:
-            self._stored.dashboard_templates[id] = self._content_to_dashboard_object(  # type: ignore
-                dashboards[id], event
-            )
-
-        self._stored.id_mappings[event.app.name] = dashboards  # type: ignore
-        self._update_remote_grafana(event)
-
-    def _update_remote_grafana(self, _: Optional[RelationEvent] = None) -> None:
-        """Push dashboards to the downstream Grafana relation."""
-        # It's still ridiculous to add a UUID here, but needed
-        stored_data = {
-            "templates": _type_convert_stored(self._stored.dashboard_templates),  # pyright: ignore
-            "uuid": str(uuid.uuid4()),
-        }
-
-        if self._charm.unit.is_leader():
-            for grafana_relation in self.model.relations[self._grafana_relation]:
-                grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data)
-
-    def remove_dashboards(self, event: RelationBrokenEvent) -> None:
-        """Remove a dashboard if the relation is broken."""
-        app_ids = _type_convert_stored(self._stored.id_mappings.get(event.app.name, ""))  # type: ignore
-
-        if not app_ids:
-            logger.info("Could not look up stored dashboards for %s", event.app.name)  # type: ignore
-            return
-
-        del self._stored.id_mappings[event.app.name]  # type: ignore
-        for id in app_ids:
-            del self._stored.dashboard_templates[id]  # type: ignore
-
-        stored_data = {
-            "templates": _type_convert_stored(self._stored.dashboard_templates),  # pyright: ignore
-            "uuid": str(uuid.uuid4()),
-        }
-
-        if self._charm.unit.is_leader():
-            for grafana_relation in self.model.relations[self._grafana_relation]:
-                grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data)
-
-    # Yes, this has a fair amount of branching. It's not that complex, though
-    def _strip_existing_datasources(self, dash: dict) -> dict:  # noqa: C901
-        """Remove existing reactive charm datasource templating out.
-
-        This method iterates through *known* places where reactive charms may set
-        data in contributed dashboards and removes them.
-
-        `dashboard["__inputs"]` is a property sometimes set when exporting dashboards from
-        the Grafana UI. It is not present in earlier Grafana versions, and can be disabled
-        in 5.3.4 and above (optionally). If set, any values present will be substituted on
-        import. Some reactive charms use this for Prometheus. COS uses dropdown selectors
-        for datasources, and leaving this present results in "default" datasource values
-        which are broken.
-
-        Similarly, `dashboard["templating"]["list"][N]["name"] == "host"` can be used to
-        set a `host` variable for use in dashboards which is not meaningful in the context
-        of Juju topology and will yield broken dashboards.
-
-        Further properties may be discovered.
-        """
-        try:
-            if "list" in dash["templating"]:
-                for i in range(len(dash["templating"]["list"])):
-                    if (
-                        "datasource" in dash["templating"]["list"][i]
-                        and dash["templating"]["list"][i]["datasource"] is not None
-                    ):
-                        if "Juju" in dash["templating"]["list"][i].get("datasource", ""):
-                            dash["templating"]["list"][i]["datasource"] = r"${prometheusds}"
-
-                # Strip out newly-added 'juju_application' template variables which
-                # don't line up with our drop-downs
-                dash_mutable = dash
-                for i in range(len(dash["templating"]["list"])):
-                    if (
-                        "name" in dash["templating"]["list"][i]
-                        and dash["templating"]["list"][i].get("name", "") == "app"
-                    ):
-                        del dash_mutable["templating"]["list"][i]
-
-                if dash_mutable:
-                    dash = dash_mutable
-        except KeyError:
-            logger.debug("No existing templating data in dashboard")
-
-        if "__inputs" in dash:
-            inputs = dash
-            for i in range(len(dash["__inputs"])):
-                if dash["__inputs"][i].get("pluginName", "") == "Prometheus":
-                    del inputs["__inputs"][i]
-            if inputs:
-                dash["__inputs"] = inputs["__inputs"]
-            else:
-                del dash["__inputs"]
-
-        return dash
-
-    def _handle_reactive_dashboards(self, event: RelationEvent) -> Optional[Dict]:
-        """Look for a dashboard in relation data (during a reactive hook) or builtin by name."""
-        if not self._charm.unit.is_leader():
-            return {}
-
-        templates = []
-        id = ""
-
-        # Reactive data can reliably be pulled out of events. In theory, if we got an event,
-        # it's on the bucket, but using event explicitly keeps the mental model in
-        # place for reactive
-        for k in event.relation.data[event.unit].keys():  # type: ignore
-            if k.startswith("request_"):
-                templates.append(json.loads(event.relation.data[event.unit][k])["dashboard"])  # type: ignore
-
-        for k in event.relation.data[event.app].keys():  # type: ignore
-            if k.startswith("request_"):
-                templates.append(json.loads(event.relation.data[event.app][k])["dashboard"])  # type: ignore
-
-        builtins = self._maybe_get_builtin_dashboards(event)
-
-        if not templates and not builtins:
-            logger.warning("No dashboard templates found in relation data or bundled with the charm")
-            return {}
-
-        dashboards = {}
-        for t in templates:
-            # This seems ridiculous, too, but to get it from a "dashboards" key in serialized JSON
-            # in the bucket back out to the actual "dashboard" we _need_, this is the way
-            # This is not a mistake -- there's a double nesting in reactive charms, and
-            # Grafana won't load it. We have to unbox:
-            # event.relation.data[event.]["request_*"]["dashboard"]["dashboard"],
-            # and the final unboxing is below.
-            #
-            # Apparently SOME newer dashboards (such as Ceph) do not have this double nesting, so
-            # now we get to account for both :toot:
-            dash = t.get("dashboard", {}) or t
-
-            # Replace values with LMA-style templating
-            dash = self._strip_existing_datasources(dash)
-            dash = json.dumps(dash)
-
-            # Replace the old-style datasource templates
-            dash = re.sub(r"<< datasource >>", r"${prometheusds}", dash)
-            dash = re.sub(r'"datasource": "prom.*?"', r'"datasource": "${prometheusds}"', dash)
-            dash = re.sub(
-                r'"datasource": "\$datasource"', r'"datasource": "${prometheusds}"', dash
-            )
-            dash = re.sub(r'"uid": "\$datasource"', r'"uid": "${prometheusds}"', dash)
-            dash = re.sub(
-                r'"datasource": "(!?\w)[\w|\s|-]+?Juju generated.*?"',
-                r'"datasource": "${prometheusds}"',
-                dash,
-            )
-
-            # Yank out "new"+old LMA topology
-            dash = re.sub(
-                r'(,?\s?juju_application=~)\\"\$app\\"', r'\1\\"$juju_application\\"', dash
-            )
-
-            # Replace old piechart panels
-            dash = re.sub(r'"type": "grafana-piechart-panel"', '"type": "piechart"', dash)
-
-            from jinja2 import DebugUndefined, Template
-
-            content = _encode_dashboard_content(
-                Template(dash, undefined=DebugUndefined).render(datasource=r"${prometheusds}")  # type: ignore
-            )
-            id = "prog:{}".format(content[-24:-16])
-
-            dashboards[id] = content
-        return {**builtins, **dashboards}
-
-    def _maybe_get_builtin_dashboards(self, event: RelationEvent) -> Dict:
-        """Tries to match the event with an included dashboard.
-
-        Scans dashboards packed with the charm instantiating this class, and tries to match
-        one with the event. There is no guarantee that any given event will match a builtin,
-        since each charm instantiating this class may include a different set of dashboards,
-        or none.
-        """
-        builtins = {}
-        dashboards_path = None
-
-        try:
-            dashboards_path = _resolve_dir_against_charm_path(
-                self._charm, "src/grafana_dashboards"
-            )
-        except InvalidDirectoryPathError as e:
-            logger.warning(
-                "Invalid Grafana dashboards folder at %s: %s",
-                e.grafana_dashboards_absolute_path,
-                e.message,
-            )
-
-        if dashboards_path:
-
-            def is_dashboard(p: Path) -> bool:
-                return p.is_file() and p.name.endswith((".json", ".json.tmpl", ".tmpl"))
-
-            for path in filter(is_dashboard, Path(dashboards_path).glob("*")):
-                # path = Path(path)
-                if event.app.name in path.name:  # type: ignore
-                    id = "file:{}".format(path.stem)
-                    builtins[id] = self._content_to_dashboard_object(
-                        _encode_dashboard_content(path.read_bytes()), event
-                    )
-
-        return builtins
-
-    def _content_to_dashboard_object(self, content: str, event: RelationEvent) -> Dict:
-        return {
-            "charm": event.app.name,  # type: ignore
-            "content": content,
-            "juju_topology": self._juju_topology(event),
-            "inject_dropdowns": True,
-        }
-
-    # This is not actually used in the dashboards, but is present to provide a secondary
-    # salt to ensure uniqueness in the dict keys in case individual charm units provide
-    # dashboards
-    def _juju_topology(self, event: RelationEvent) -> Dict:
-        return {
-            "model": self._charm.model.name,
-            "model_uuid": self._charm.model.uuid,
-            "application": event.app.name,  # type: ignore
-            "unit": event.unit.name,  # type: ignore
-        }
-
-
-class CosTool:
-    """Uses cos-tool to inject label matchers into alert rule expressions and validate rules."""
-
-    _path = None
-    _disabled = False
-
-    def __init__(self, charm):
-        self._charm = charm
-
-    @property
-    def path(self):
-        """Lazy lookup of the path of cos-tool."""
-        if self._disabled:
-            return None
-        if not self._path:
-            self._path = self._get_tool_path()
-            if not self._path:
-                logger.debug("Skipping injection of juju topology as label matchers")
-                self._disabled = True
-        return self._path
-
-    def apply_label_matchers(self, rules: dict, type: str) -> dict:
-        """Will apply label matchers to the expression of all alerts in all supplied groups."""
-        if not self.path:
-            return rules
-        for group in rules["groups"]:
-            rules_in_group = group.get("rules", [])
-            for rule in rules_in_group:
-                topology = {}
-                # if the user for some reason has provided juju_unit, we'll need to honor it
-                # in most cases, however, this will be empty
-                for label in [
-                    "juju_model",
-                    "juju_model_uuid",
-                    "juju_application",
-                    "juju_charm",
-                    "juju_unit",
-                ]:
-                    if label in rule["labels"]:
-                        topology[label] = rule["labels"][label]
-
-                rule["expr"] = self.inject_label_matchers(rule["expr"], topology, type)
-        return rules
-
-    def validate_alert_rules(self, rules: dict) -> Tuple[bool, str]:
-        """Will validate correctness of alert rules, returning a boolean and any errors."""
-        if not self.path:
-            logger.debug("`cos-tool` unavailable. Not validating alert correctness.")
-            return True, ""
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            rule_path = Path(tmpdir + "/validate_rule.yaml")
-
-            # Smash "our" rules format into what upstream actually uses, which is more like:
-            #
-            # groups:
-            #   - name: foo
-            #     rules:
-            #       - alert: SomeAlert
-            #         expr: up
-            #       - alert: OtherAlert
-            #         expr: up
-            transformed_rules = {"groups": []}  # type: ignore
-            for rule in rules["groups"]:
-                transformed = {"name": str(uuid.uuid4()), "rules": [rule]}
-                transformed_rules["groups"].append(transformed)
-
-            rule_path.write_text(yaml.dump(transformed_rules))
-
-            args = [str(self.path), "validate", str(rule_path)]
-            # noinspection PyBroadException
-            try:
-                self._exec(args)
-                return True, ""
-            except subprocess.CalledProcessError as e:
-                logger.debug("Validating the rules failed: %s", e.output)
-                return False, ", ".join([line for line in e.output if "error validating" in line])
-
-    def inject_label_matchers(self, expression: str, topology: dict, type: str) -> str:
-        """Add label matchers to an expression."""
-        if not topology:
-            return expression
-        if not self.path:
-            logger.debug("`cos-tool` unavailable. Leaving expression unchanged: %s", expression)
-            return expression
-        args = [str(self.path), "--format", type, "transform"]
-
-        variable_topology = {k: "${}".format(k) for k in topology.keys()}
-        args.extend(
-            [
-                "--label-matcher={}={}".format(key, value)
-                for key, value in variable_topology.items()
-            ]
-        )
-
-        # Pass a leading "--" so expressions with a negation or subtraction aren't interpreted as
-        # flags
-        args.extend(["--", "{}".format(expression)])
-        # noinspection PyBroadException
-        try:
-            return re.sub(r'="\$juju', r'=~"$juju', self._exec(args))
-        except subprocess.CalledProcessError as e:
-            logger.debug('Applying the expression failed: "%s", falling back to the original', e)
-            return expression
-
-    def _get_tool_path(self) -> Optional[Path]:
-        arch = platform.machine()
-        arch = "amd64" if arch == "x86_64" else arch
-        res = "cos-tool-{}".format(arch)
-        try:
-            path = Path(res).resolve()
-            path.chmod(0o777)
-            return path
-        except NotImplementedError:
-            logger.debug("System lacks support for chmod")
-        except FileNotFoundError:
-            logger.debug('Could not locate cos-tool at: "{}"'.format(res))
-        return None
-
-    def _exec(self, cmd) -> str:
-        result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE)
-        output = result.stdout.decode("utf-8").strip()
-        return output
diff --git a/charms/istio-gateway/metadata.yaml b/charms/istio-gateway/metadata.yaml
index 6990f678..7712717c 100644
--- a/charms/istio-gateway/metadata.yaml
+++ b/charms/istio-gateway/metadata.yaml
@@ -25,7 +25,5 @@ requires:
 provides:
   metrics-endpoint:
     interface: prometheus_scrape
-  grafana-dashboard:
-    interface: grafana_dashboard
 assumes:
 - juju >= 3.1
diff --git a/charms/istio-gateway/src/charm.py b/charms/istio-gateway/src/charm.py
index 6532397a..3c258b01 100755
--- a/charms/istio-gateway/src/charm.py
+++ b/charms/istio-gateway/src/charm.py
@@ -2,7 +2,6 @@

 import logging

-from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardProvider
 from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider
 from jinja2 import Environment, FileSystemLoader
 from lightkube import Client, codecs
@@ -35,7 +34,7 @@ def __init__(self, *args):
             self.framework.observe(event, self.start)
         self.framework.observe(self.on.remove, self.remove)

-        # metrics and dashboard relation configuration
+        # metrics relation configuration
         self.prometheus_provider = MetricsEndpointProvider(
             charm=self,
             relation_name="metrics-endpoint",
@@ -51,7 +50,6 @@ def __init__(self, *args):
                 }
             ],
         )
-        self.dashboard_provider = GrafanaDashboardProvider(self)

     def start(self, event):
         """Event handler for StartEvent."""
diff --git a/charms/istio-gateway/src/prometheus_alert_rules/basic.rules b/charms/istio-gateway/src/prometheus_alert_rules/basic.rules
index 3644e2a2..6d64f95c 100644
--- a/charms/istio-gateway/src/prometheus_alert_rules/basic.rules
+++ b/charms/istio-gateway/src/prometheus_alert_rules/basic.rules
@@ -1,6 +1,6 @@
 # Note(rgildein): Alert rules come from the source https://github.com/istio/tools/blob/6d537aee69ec7e9da007f311562496f7ac1cb691/perf/stability/alertmanager/prometheusrule.yaml#L11-L26
 groups:
-- name: IstioGatewatBasic
+- name: IstioGatewayBasic
   rules:
   - alert: IngressTrafficMissing
     annotations:
diff --git a/charms/istio-gateway/src/prometheus_alert_rules/workload.rules b/charms/istio-gateway/src/prometheus_alert_rules/workload.rules
index bf674efe..12681310 100644
--- a/charms/istio-gateway/src/prometheus_alert_rules/workload.rules
+++ b/charms/istio-gateway/src/prometheus_alert_rules/workload.rules
@@ -1,6 +1,6 @@
 # Note(rgildein): Alert rules come from the source https://github.com/istio/tools/blob/6d537aee69ec7e9da007f311562496f7ac1cb691/perf/stability/alertmanager/prometheusrule.yaml#L28-L47
 groups:
-- name: Workload
+- name: IstioGatewayWorkload
   rules:
   - alert: HTTP5xxRateHigh
     annotations:
diff --git a/charms/istio-gateway/tests/unit/test_charm.py b/charms/istio-gateway/tests/unit/test_charm.py
index 50c69f5d..b271f2c0 100644
--- a/charms/istio-gateway/tests/unit/test_charm.py
+++ b/charms/istio-gateway/tests/unit/test_charm.py
@@ -158,10 +158,3 @@ def test_metrics(harness):
             }
         ],
     )
-
-
-def test_grafana_dashboard(harness):
-    """Test GrafanaDashboardProvider initialization."""
-    with patch("charm.GrafanaDashboardProvider") as mock_grafana:
-        harness.begin()
-        mock_grafana.assert_called_once_with(harness.charm)
diff --git a/tests/test_cos_integration.py b/tests/test_cos_integration.py
index 85ba561e..9c62c518 100644
--- a/tests/test_cos_integration.py
+++ b/tests/test_cos_integration.py
@@ -6,10 +6,8 @@

 import pytest
 from charmed_kubeflow_chisme.testing import (
-    APP_GRAFANA_DASHBOARD,
     APP_METRICS_ENDPOINT,
     GRAFANA_AGENT_APP,
-    GRAFANA_AGENT_GRAFANA_DASHBOARD,
     GRAFANA_AGENT_METRICS_ENDPOINT,
     assert_alert_rules,
     assert_metrics_endpoint,
@@ -58,17 +56,6 @@ async def test_build_and_deploy_istio_charms(ops_test: OpsTest):
     )
     # Note(rgildein): Using this until [1] is fixed.
     # [1]: https://github.com/canonical/charmed-kubeflow-chisme/issues/117
-    log.info(
-        "Adding relation: %s:%s and %s:%s",
-        ISTIO_GATEWAY_APP_NAME,
-        APP_GRAFANA_DASHBOARD,
-        GRAFANA_AGENT_APP,
-        GRAFANA_AGENT_GRAFANA_DASHBOARD,
-    )
-    await ops_test.model.integrate(
-        f"{ISTIO_GATEWAY_APP_NAME}:{APP_GRAFANA_DASHBOARD}",
-        f"{GRAFANA_AGENT_APP}:{GRAFANA_AGENT_GRAFANA_DASHBOARD}",
-    )
     log.info(
         "Adding relation: %s:%s and %s:%s",
         ISTIO_GATEWAY_APP_NAME,

From 6dd4efbe60fef926ad6da38b1cd61feb55977005 Mon Sep 17 00:00:00 2001
From: Robert Gildein
Date: Tue, 30 Jul 2024 15:59:58 +0200
Subject: [PATCH 8/8] Drop public-url config option

---
 tests/test_bundle.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/tests/test_bundle.py b/tests/test_bundle.py
index c6bf05bb..28b653ff 100644
--- a/tests/test_bundle.py
+++ b/tests/test_bundle.py
@@ -263,13 +263,15 @@ async def test_enable_ingress_auth(ops_test: OpsTest):
         OIDC_GATEKEEPER,
         channel=OIDC_GATEKEEPER_CHANNEL,
         trust=OIDC_GATEKEEPER_TRUST,
-        config={"public-url": regular_ingress_gateway_ip},
     )

-    await ops_test.model.add_relation(f"{ISTIO_PILOT}:ingress", f"{DEX_AUTH}:ingress")
-    await ops_test.model.add_relation(f"{ISTIO_PILOT}:ingress", f"{OIDC_GATEKEEPER}:ingress")
-    await ops_test.model.add_relation(f"{OIDC_GATEKEEPER}:oidc-client", f"{DEX_AUTH}:oidc-client")
-    await ops_test.model.add_relation(
+    await ops_test.model.integrate(f"{ISTIO_PILOT}:ingress", f"{DEX_AUTH}:ingress")
+    await ops_test.model.integrate(f"{ISTIO_PILOT}:ingress", f"{OIDC_GATEKEEPER}:ingress")
+    await ops_test.model.integrate(f"{OIDC_GATEKEEPER}:oidc-client", f"{DEX_AUTH}:oidc-client")
+    await ops_test.model.integrate(
+        f"{OIDC_GATEKEEPER}:dex-oidc-config", f"{DEX_AUTH}:dex-oidc-config"
+    )
+    await ops_test.model.integrate(
         f"{ISTIO_PILOT}:ingress-auth", f"{OIDC_GATEKEEPER}:ingress-auth"
     )
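With `public-url` gone, the bundle test wires OIDC Gatekeeper to Dex over the `dex-oidc-config` relation and uses `Model.integrate`, the replacement for the deprecated `add_relation`. A natural follow-up in a test like the one above is to block until the affected applications settle before asserting on ingress behaviour. A minimal sketch, assuming pytest-operator's `ops_test` fixture and the app-name constants defined earlier in the file; the app list and timeout are illustrative choices, not part of the patch:

```python
# Wait for the re-integrated applications to report active/idle before the
# test proceeds; raise_on_blocked stays False because OIDC Gatekeeper may
# briefly report blocked while the relation data is exchanged.
await ops_test.model.wait_for_idle(
    apps=[ISTIO_PILOT, DEX_AUTH, OIDC_GATEKEEPER],
    status="active",
    raise_on_blocked=False,
    timeout=60 * 10,
)
```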