diff --git a/e2e/tools/validator/metric_validations.yaml b/e2e/tools/validator/metric_validations.yaml new file mode 100644 index 0000000000..2ab46a0a2a --- /dev/null +++ b/e2e/tools/validator/metric_validations.yaml @@ -0,0 +1,319 @@ +config: + mapping: + actual: latest + predicted: dev + +validations: + # absolute power comparison + - name: Total - absolute + latest: | + sum( + rate( + kepler_process_joules_total{{ + job="{latest_job_name}", + }}[{rate_interval}] + ) + ) + dev: | + sum( + rate( + kepler_process_joules_total{{ + job="{dev_job_name}", + }}[{rate_interval}] + ) + ) + + max_mae: 2.0 + + - name: platform - absolute + latest: | + sum( + rate( + kepler_process_platform_joules_total{{ + job="{latest_job_name}", + }}[{rate_interval}] + ) + ) + dev: | + sum( + rate( + kepler_process_platform_joules_total{{ + job="{dev_job_name}", + }}[{rate_interval}] + ) + ) + + max_mae: 2.0 + + - name: package - absolute + latest: | + sum( + rate( + kepler_process_package_joules_total{{ + job="{latest_job_name}", + }}[{rate_interval}] + ) + ) + dev: | + sum( + rate( + kepler_process_package_joules_total{{ + job="{dev_job_name}", + }}[{rate_interval}] + ) + ) + max_mae: 2.0 + + - name: dram - absolute + latest: | + sum( + rate( + kepler_process_dram_joules_total{{ + job="{latest_job_name}", + }}[{rate_interval}] + ) + ) + dev: | + sum( + rate( + kepler_process_dram_joules_total{{ + job="{dev_job_name}", + }}[{rate_interval}] + ) + ) + max_mae: 2.0 + + - name: core - absolute + units: Watts + latest: | + sum( + rate( + kepler_process_core_joules_total{{ + job="{latest_job_name}", + }}[{rate_interval}] + ) + ) + dev: | + sum( + rate( + kepler_process_core_joules_total{{ + job="{dev_job_name}", + }}[{rate_interval}] + ) + ) + max_mae: 2.0 + + - name: other - absolute + units: Watts + latest: | + sum( + rate( + kepler_process_other_joules_total{{ + job="{latest_job_name}", + }}[{rate_interval}] + ) + ) + dev: | + sum( + rate( + kepler_process_other_joules_total{{ + 
job="{dev_job_name}", + }}[{rate_interval}] + ) + ) + max_mae: 2.0 + + # CPU time comparison + - name: cpu-time + units: Milliseconds + latest: | + sum( + rate( + kepler_process_bpf_cpu_time_ms_total{{ + job="{latest_job_name}" + }}[{rate_interval}] + ) + ) + dev: | + sum( + rate( + kepler_process_bpf_cpu_time_ms_total{{ + job="{dev_job_name}", + }}[{rate_interval}] + ) + ) + # max_mae: 20.0 + + - name: package - dynamic + units: Watts + latest: | + sum( + rate( + kepler_process_package_joules_total{{ + job="{latest_job_name}", + mode="dynamic", + }}[{rate_interval}] + ) + ) + dev: | + sum( + rate( + kepler_process_package_joules_total{{ + job="{dev_job_name}", + mode="dynamic", + }}[{rate_interval}] + ) + ) + max_mae: 2.0 + + - name: core - dynamic + units: Watts + latest: | + sum( + rate( + kepler_process_core_joules_total{{ + job="{latest_job_name}", + mode="dynamic", + }}[{rate_interval}] + ) + ) + dev: | + sum( + rate( + kepler_process_core_joules_total{{ + job="{dev_job_name}", + mode="dynamic", + }}[{rate_interval}] + ) + ) + max_mae: 2.0 + + - name: dram - dynamic + units: Watts + latest: | + sum( + rate( + kepler_process_dram_joules_total{{ + job="{latest_job_name}", + mode="dynamic", + }}[{rate_interval}] + ) + ) + dev: | + sum( + rate( + kepler_process_dram_joules_total{{ + job="{dev_job_name}", + mode="dynamic", + }}[{rate_interval}] + ) + ) + max_mae: 2.0 + + - name: other - dynamic + units: Watts + latest: | + sum( + rate( + kepler_process_other_joules_total{{ + job="{latest_job_name}", + mode="dynamic", + }}[{rate_interval}] + ) + ) + dev: | + sum( + rate( + kepler_process_other_joules_total{{ + job="{dev_job_name}", + mode="dynamic", + }}[{rate_interval}] + ) + ) + max_mae: 2.0 + +# Node comparison + - name: node platform - dynamic + units: Watts + latest: | + rate(kepler_node_platform_joules_total{{ + job="{latest_job_name}", + mode="dynamic", + }}[{rate_interval}] + ) + + dev: | + rate(kepler_node_platform_joules_total{{ + job="{dev_job_name}", + 
@validator.command()
@click.option("--duration", "-d", type=options.Duration(), required=True)
@click.option(
    "--report-dir",
    "-o",
    # `/tmp` is intentionally the default reporting directory; suppress S108
    # on this line only instead of exempting the whole file.
    default="/tmp",  # noqa: S108
    # The option names a directory, so reject regular files up front.
    type=click.Path(exists=True, file_okay=False, dir_okay=True, writable=True),
    show_default=True,
)
@pass_config
def validate_metrics(cfg: config.Validator, duration: datetime.timedelta, report_dir: str):
    """Run the configured validations over the last DURATION and write a Markdown report.

    The validation window ends now (UTC) and starts DURATION earlier.
    Exits with status 0 when the validations pass and 1 otherwise.
    """
    results_dir, tag = create_report_dir(report_dir)
    res = TestResult(tag)

    # The window is anchored at "now" and looks back by `duration`.
    res.end_time = datetime.datetime.now(tz=datetime.UTC)
    res.start_time = res.end_time - duration

    click.secho("  * Generating build and node info ...", fg="green")
    res.build_info, res.node_info = get_build_and_node_info(cfg.prometheus)

    click.secho("  * Generating spec report ...", fg="green")
    res.host_spec = get_host_spec()

    script_result = ScriptResult(res.start_time, res.end_time)
    res.validations = run_validations(cfg, script_result, results_dir)

    click.secho("  * Generating validate metrics report file and dir", fg="green")
    write_md_report(results_dir, res)

    # Report the outcome through the process exit status.
    raise Exit(0 if res.validations.passed else 1)
100644 --- a/e2e/tools/validator/src/validator/validations/__init__.py +++ b/e2e/tools/validator/src/validator/validations/__init__.py @@ -88,6 +88,7 @@ def validation_from_yaml(v: dict[str, Any]) -> Validation: predicted_label=predicted_label, units=v.get("units", ""), max_mape=v.get("max_mape"), + max_mae=v.get("max_mae"), ) return [validation_from_yaml(v) for v in yml["validations"]] @@ -112,6 +113,8 @@ def load(self) -> list[Validation]: promql_vars["rate_interval"] = prom.rate_interval promql_vars["metal_job_name"] = prom.job.metal promql_vars["vm_job_name"] = prom.job.vm + promql_vars["latest_job_name"] = prom.job.latest + promql_vars["dev_job_name"] = prom.job.dev logger.debug("promql_vars: %s", promql_vars)