From 33d29635e4c2f8cb46fca9a688479ce1e2b19f6a Mon Sep 17 00:00:00 2001
From: vprashar2929
Date: Mon, 4 Nov 2024 16:50:18 +0530
Subject: [PATCH] feat(validator): add support to validate kepler metrics

This commit adds support to validate essential metrics produced by Kepler

Signed-off-by: vprashar2929
---
 e2e/tools/validator/metric_validations.yaml   | 354 ++++++++++++++++++
 .../validator/scripts/regression-stressor.sh  |  44 +++
 .../validator/src/validator/cli/__init__.py   |  51 ++-
 .../validator/src/validator/cli/options.py    |   2 +-
 .../src/validator/stresser/__init__.py        |  43 +++
 .../src/validator/validations/__init__.py     |   1 +
 e2e/tools/validator/validator.yaml.sample     |   2 +-
 .../monitoring/prometheus/prometheus.yml      |   2 +-
 8 files changed, 495 insertions(+), 4 deletions(-)
 create mode 100644 e2e/tools/validator/metric_validations.yaml
 create mode 100644 e2e/tools/validator/scripts/regression-stressor.sh

diff --git a/e2e/tools/validator/metric_validations.yaml b/e2e/tools/validator/metric_validations.yaml
new file mode 100644
index 0000000000..0a5676958d
--- /dev/null
+++ b/e2e/tools/validator/metric_validations.yaml
@@ -0,0 +1,354 @@
+config:
+  mapping:
+    actual: latest
+    predicted: dev
+
+validations:
+  # node rapl comparison
+  - name: node-rapl - kepler-package
+    units: Watts
+    mapping:
+      actual: node-rapl
+      predicted: kepler-package
+
+    node-rapl: |
+      sum(
+        rate(
+          node_rapl_package_joules_total[{rate_interval}]
+        )
+      )
+
+    kepler-package: |
+      sum(
+        rate(
+          kepler_node_package_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 1.00
+
+  - name: node-rapl - kepler-core
+    units: Watts
+    mapping:
+      actual: node-rapl
+      predicted: kepler-core
+
+    node-rapl: |
+      sum(
+        rate(
+          node_rapl_core_joules_total[{rate_interval}]
+        )
+      )
+
+    kepler-core: |
+      sum(
+        rate(
+          kepler_node_core_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 1.00
+
+  - name: node-rapl - kepler-dram
+    units: Watts
+    mapping:
+      actual: node-rapl
+      predicted: kepler-dram
+
+    node-rapl: |
+      sum(
+        rate(
+          node_rapl_dram_joules_total[{rate_interval}]
+        )
+      )
+
+    kepler-dram: |
+      sum(
+        rate(
+          kepler_node_dram_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 1.00
+
+  # absolute power comparison
+  - name: Total - absolute
+    latest: |
+      sum(
+        rate(
+          kepler_process_joules_total{{
+            job="latest",
+          }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_joules_total{{
+            job="dev",
+          }}[{rate_interval}]
+        )
+      )
+
+    max_mae: 0.59
+
+  # CPU time comparison
+  - name: cpu-time
+    units: Milliseconds
+    latest: |
+      sum(
+        rate(
+          kepler_process_bpf_cpu_time_ms_total{{
+            job="latest"
+          }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_bpf_cpu_time_ms_total{{
+            job="dev",
+          }}[{rate_interval}]
+        )
+      )
+    # max_mae: 20.0
+
+  # process comparison
+  - name: platform - dynamic
+    latest: |
+      sum(
+        rate(
+          kepler_process_platform_joules_total{{
+            job="latest", mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_platform_joules_total{{
+            job="dev", mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+
+    max_mae: 0.59
+
+  - name: package - dynamic
+    units: Watts
+    latest: |
+      sum(
+        rate(
+          kepler_process_package_joules_total{{
+            job="latest",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_package_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 0.59
+
+  - name: core - dynamic
+    units: Watts
+    latest: |
+      sum(
+        rate(
+          kepler_process_core_joules_total{{
+            job="latest",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_core_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 0.59
+
+  - name: dram - dynamic
+    units: Watts
+    latest: |
+      sum(
+        rate(
+          kepler_process_dram_joules_total{{
+            job="latest",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_dram_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 0.59
+
+  - name: other - dynamic
+    units: Watts
+    latest: |
+      sum(
+        rate(
+          kepler_process_other_joules_total{{
+            job="latest",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_other_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 0.59
+
+  - name: uncore - dynamic
+    units: Watts
+    latest: |
+      sum(
+        rate(
+          kepler_process_uncore_joules_total{{
+            job="latest",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_uncore_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 0.59
+
+  # node comparison
+  - name: node platform - dynamic
+    units: Watts
+    latest: |
+      rate(kepler_node_platform_joules_total{{
+        job="latest",
+        mode="dynamic",
+      }}[{rate_interval}]
+      )
+
+    dev: |
+      rate(kepler_node_platform_joules_total{{
+        job="dev",
+        mode="dynamic",
+      }}[{rate_interval}]
+      )
+    max_mae: 0.59
+
+  - name: node package - dynamic
+    units: Watts
+    latest: |
+      rate(kepler_node_package_joules_total{{
+        job="latest",
+        mode="dynamic",
+      }}[{rate_interval}]
+      )
+
+    dev: |
+      rate(kepler_node_package_joules_total{{
+        job="dev",
+        mode="dynamic",
+      }}[{rate_interval}]
+      )
+    max_mae: 0.59
+
+  - name: node core - dynamic
+    units: Watts
+    latest: |
+      rate(kepler_node_core_joules_total{{
+        job="latest",
+        mode="dynamic",
+      }}[{rate_interval}]
+      )
+
+    dev: |
+      rate(kepler_node_core_joules_total{{
+        job="dev",
+        mode="dynamic",
+      }}[{rate_interval}]
+      )
+    max_mae: 0.59
+
+  - name: node dram - dynamic
+    units: Watts
+    latest: |
+      rate(kepler_node_dram_joules_total{{
+        job="latest",
+        mode="dynamic",
+      }}[{rate_interval}]
+      )
+
+    dev: |
+      rate(kepler_node_dram_joules_total{{
+        job="dev",
+        mode="dynamic",
+      }}[{rate_interval}]
+      )
+    max_mae: 0.59
+
+  - name: node other - dynamic
+    units: Watts
+    latest: |
+      rate(kepler_node_other_joules_total{{
+        job="latest",
+        mode="dynamic",
+      }}[{rate_interval}]
+      )
+
+    dev: |
+      rate(kepler_node_other_joules_total{{
+        job="dev",
+        mode="dynamic",
+      }}[{rate_interval}]
+      )
+    max_mae: 0.59
+
+  - name: node uncore - dynamic
+    units: Watts
+    latest: |
+      rate(kepler_node_uncore_joules_total{{
+        job="latest",
+        mode="dynamic",
+      }}[{rate_interval}]
+      )
+
+    dev: |
+      rate(kepler_node_uncore_joules_total{{
+        job="dev",
+        mode="dynamic",
+      }}[{rate_interval}]
+      )
+    max_mae: 0.59
diff --git a/e2e/tools/validator/scripts/regression-stressor.sh b/e2e/tools/validator/scripts/regression-stressor.sh
new file mode 100644
index 0000000000..5f6865e0dc
--- /dev/null
+++ b/e2e/tools/validator/scripts/regression-stressor.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+
+set -eu -o pipefail
+
+trap exit_all INT
+exit_all() {
+    pkill -P $$
+}
+
+run() {
+    echo "❯ $*"
+    "$@"
+    echo " ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾"
+}
+
+main() {
+
+    local cpus
+    cpus=$(nproc)
+
+    # load and time
+    local -a load_curve=(
+        0:5
+        10:20
+        25:20
+        50:20
+        75:20
+        50:20
+        25:20
+        10:20
+        0:5
+    )
+    # sleep 5 so that first run and the second run look the same
+    echo "Warmup .."
+    run stress-ng --cpu "$cpus" --cpu-method ackermann --cpu-load 0 --timeout 5
+
+    for x in "${load_curve[@]}"; do
+        local load="${x%%:*}"
+        local time="${x##*:}s"
+        run stress-ng --cpu "$cpus" --cpu-method ackermann --cpu-load "$load" --timeout "$time"
+    done
+}
+
+main "$@"
diff --git a/e2e/tools/validator/src/validator/cli/__init__.py b/e2e/tools/validator/src/validator/cli/__init__.py
index 3fc7c8cab2..716712d221 100644
--- a/e2e/tools/validator/src/validator/cli/__init__.py
+++ b/e2e/tools/validator/src/validator/cli/__init__.py
@@ -16,6 +16,7 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import numpy.typing as npt
+from click.exceptions import Exit
 from matplotlib import ticker
 from matplotlib.dates import DateFormatter
 
@@ -25,7 +26,7 @@
 from validator.prometheus import Comparator, PrometheusClient, Series, ValueOrError
 from validator.report import CustomEncoder, JsonTemplate
 from validator.specs import MachineSpec, get_host_spec, get_vm_spec
-from validator.stresser import Remote, ScriptResult
+from validator.stresser import Local, Remote, ScriptResult
 from validator.validations import Loader, QueryTemplate, Validation
 
 logger = logging.getLogger(__name__)
@@ -616,6 +617,54 @@ def validate_acpi(cfg: config.Validator, duration: datetime.timedelta, report_di
     return int(res.validations.passed)
 
 
+@validator.command()
+@click.option(
+    "--script-path",
+    "-s",
+    default="./scripts/regression-stressor.sh",
+    type=click.Path(exists=True),
+    show_default=True,
+)
+# ruff: noqa: S108 (Suppressed as we are intentionally using `/tmp` as reporting directory)
+@click.option(
+    "--report-dir",
+    "-o",
+    default="/tmp",
+    type=click.Path(exists=True, dir_okay=True, writable=True),
+    show_default=True,
+)
+@pass_config
+def validate_metrics(
+    cfg: config.Validator,
+    script_path: str,
+    report_dir: str,
+):
+    results_dir, tag = create_report_dir(report_dir)
+    res = TestResult(tag)
+    # res.end_time = datetime.datetime.now(tz=datetime.UTC) if not end else end
+    # res.start_time = res.end_time - duration if not start else start
+    click.secho(" * Generating build and node info ...", fg="green")
+    res.build_info, res.node_info = get_build_and_node_info(cfg.prometheus)
+    click.secho(" * Generating spec report ...", fg="green")
+    res.host_spec = get_host_spec()
+    local = Local()
+    stress_test = local.run_script(script_path)
+    res.start_time = stress_test.start_time
+    res.end_time = stress_test.end_time
+
+    # sleep a bit for prometheus to finish scraping
+    click.secho(" * Sleeping for 10 seconds ...", fg="green")
+    time.sleep(10)
+
+    # script_result = ScriptResult(res.start_time, res.end_time)
+    # res.validations = run_validations(cfg, stress_test, results_dir)
+    res.validations = run_validations(cfg, stress_test, results_dir)
+    click.secho(" * Generating validate metrics report file and dir", fg="green")
+    write_md_report(results_dir, res)
+
+    raise Exit(1) if not res.validations.passed else Exit(0)
+
+
 def write_json_report(results_dir: str, res: TestResult):
     pattern = re.compile(r'[{]?(\w+)=("[^"]*"|[^,]+)[},]?')
 
diff --git a/e2e/tools/validator/src/validator/cli/options.py b/e2e/tools/validator/src/validator/cli/options.py
index 27068afa20..1091d7d6d3 100644
--- a/e2e/tools/validator/src/validator/cli/options.py
+++ b/e2e/tools/validator/src/validator/cli/options.py
@@ -32,7 +32,7 @@ class Duration(click.ParamType):
     def convert(self, value, param, ctx):
         td = parse_timedelta("now", value)
         if not td:
-            self.self.fail(
+            self.fail(
                 "Expected duration format got " f"{value:r}",
                 param,
                 ctx,
diff --git a/e2e/tools/validator/src/validator/stresser/__init__.py b/e2e/tools/validator/src/validator/stresser/__init__.py
index a3fc747892..4cf51c6927 100644
--- a/e2e/tools/validator/src/validator/stresser/__init__.py
+++ b/e2e/tools/validator/src/validator/stresser/__init__.py
@@ -1,4 +1,7 @@
 import logging
+import os
+import shutil
+import subprocess
 from datetime import datetime
 from typing import NamedTuple
 
@@ -20,6 +23,46 @@ class RunResult(NamedTuple):
     exit_code: int
 
 
+class Local:
+    def copy(self, script_path, target_script):
+        logger.info("copying script %s - %s", script_path, target_script)
+        shutil.copy(script_path, target_script)
+        os.chmod(target_script, 0o755)
+        logger.info("copying script %s - %s - successful", script_path, target_script)
+
+    def run_script(self, script_path: str) -> ScriptResult:
+        logger.info("Running script %s ...", script_path)
+        # ruff: noqa: S108 (Suppressed hard-coded path because we want to intentionally copy stress.sh inside `/tmp` dir)
+        target_script = "/tmp/stress.sh"
+        self.copy(script_path, target_script)
+        # ruff: noqa: DTZ005 (Suppressed non-time-zone aware object creation as it is not necessary for this use case)
+        start_time = datetime.now()
+        process = subprocess.Popen([target_script], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        stdout, stderr = process.communicate()
+        end_time = datetime.now()
+
+        # Output stdout
+        print("stdout output:")
+        for line in stdout.decode().splitlines():
+            print(" ┊ ", line)
+
+        # Output stderr
+        print("\nstderr output:")
+        for line in stderr.decode().splitlines():
+            print(" ┊ ", line)
+        print("‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾\n\n")
+
+        if process.returncode != 0:
+            logger.warning("script execution failed")
+        else:
+            logger.info("script execution successful")
+
+        return ScriptResult(
+            start_time=start_time,
+            end_time=end_time,
+        )
+
+
 class Remote:
     def __init__(self, config: config.Remote):
         self.host = config.host
diff --git a/e2e/tools/validator/src/validator/validations/__init__.py b/e2e/tools/validator/src/validator/validations/__init__.py
index 21ffe8de01..d5d0d309e4 100644
--- a/e2e/tools/validator/src/validator/validations/__init__.py
+++ b/e2e/tools/validator/src/validator/validations/__init__.py
@@ -88,6 +88,7 @@ def validation_from_yaml(v: dict[str, Any]) -> Validation:
             predicted_label=predicted_label,
             units=v.get("units", ""),
             max_mape=v.get("max_mape"),
+            max_mae=v.get("max_mae"),
         )
 
     return [validation_from_yaml(v) for v in yml["validations"]]
diff --git a/e2e/tools/validator/validator.yaml.sample b/e2e/tools/validator/validator.yaml.sample
index 555407b5e9..cf423a7fb6 100644
--- a/e2e/tools/validator/validator.yaml.sample
+++ b/e2e/tools/validator/validator.yaml.sample
@@ -18,6 +18,6 @@
 prometheus:
   url: http://localhost:9090 # Prometheus server URL
   rate_interval: 20s # Rate interval for Promql, default is 20s, typically 4 x $scrape_interval
-  steps: 3s # Step duration for Prometheus range queries
+  step: 3s # Step duration for Prometheus range queries
 
 validations_file: ./validations.yaml # Path to the validations file, default is ./validations.yaml
diff --git a/manifests/compose/monitoring/prometheus/prometheus.yml b/manifests/compose/monitoring/prometheus/prometheus.yml
index 93456a0e36..ad7df712f1 100644
--- a/manifests/compose/monitoring/prometheus/prometheus.yml
+++ b/manifests/compose/monitoring/prometheus/prometheus.yml
@@ -1,5 +1,5 @@
 global:
-  scrape_interval: 5s # Set the scrape interval to every 5 seconds. Default is every 1 minute.
+  scrape_interval: 3s # Set the scrape interval to every 3 seconds. Default is every 1 minute.
   evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
   # scrape_timeout is set to the global default (10s).
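
Reviewer note (not part of the patch): a quick way to exercise the new command end to end. This is an illustrative sketch only — the `validator` console entry point and the dash-cased command name `validate-metrics` are assumptions based on the Click group and function name above; only the `--script-path/-s` and `--report-dir/-o` flags are actually defined by this patch.

    # Assumes both Kepler deployments ("dev" and "latest") are running, the compose
    # Prometheus (manifests/compose/monitoring) is scraping them, and the validator
    # config (see validator.yaml.sample) points validations_file at
    # ./metric_validations.yaml.
    cd e2e/tools/validator
    validator validate-metrics \
        -s ./scripts/regression-stressor.sh \
        -o /tmp

The command runs the stress-ng load curve locally through the new Local stresser, sleeps 10 seconds so Prometheus can complete its last scrape, writes a markdown report under a tagged directory inside the report dir, and exits non-zero when the validations do not pass, so it can be used as a CI gate.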