Skip to content

Commit

Permalink
feat(validator): add support to validate kepler metrics
Browse files Browse the repository at this point in the history
This commit adds support for validating the essential metrics produced by
Kepler.

Signed-off-by: vprashar2929 <[email protected]>
  • Loading branch information
vprashar2929 committed Nov 5, 2024
1 parent 75b9533 commit 564fa4c
Show file tree
Hide file tree
Showing 4 changed files with 355 additions and 0 deletions.
319 changes: 319 additions & 0 deletions e2e/tools/validator/metric_validations.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,319 @@
# Suite-wide settings for the metric validations in this file.
config:
  # Maps the comparison roles onto the per-validation query keys; every entry
  # under `validations` provides both a `latest` and a `dev` query.
  # NOTE(review): presumably `actual` is the reference series and `predicted`
  # the series being checked against it — confirm against the validator loader.
  mapping:
    actual: latest
    predicted: dev

# Each entry runs the same PromQL expression against two Prometheus jobs:
# `latest` uses {latest_job_name} and `dev` uses {dev_job_name}.
# `{rate_interval}` and the job names are template placeholders filled in by
# the validator; the doubled braces `{{ }}` presumably escape literal PromQL
# braces for that templating step — confirm against the loader.
# `max_mae` is the error threshold read by the loader for the entry.
# NOTE(review): presumably "maximum mean absolute error" between the two
# series — confirm against the validator.
validations:
# absolute power comparison
# rate() over a joules counter yields joules/second, so every entry in this
# group declares `units: Watts`, matching the core/other entries below.
- name: Total - absolute
  units: Watts
  latest: |
    sum(
      rate(
        kepler_process_joules_total{{
          job="{latest_job_name}",
        }}[{rate_interval}]
      )
    )
  dev: |
    sum(
      rate(
        kepler_process_joules_total{{
          job="{dev_job_name}",
        }}[{rate_interval}]
      )
    )
  max_mae: 2.0

- name: platform - absolute
  units: Watts
  latest: |
    sum(
      rate(
        kepler_process_platform_joules_total{{
          job="{latest_job_name}",
        }}[{rate_interval}]
      )
    )
  dev: |
    sum(
      rate(
        kepler_process_platform_joules_total{{
          job="{dev_job_name}",
        }}[{rate_interval}]
      )
    )
  max_mae: 2.0

- name: package - absolute
  units: Watts
  latest: |
    sum(
      rate(
        kepler_process_package_joules_total{{
          job="{latest_job_name}",
        }}[{rate_interval}]
      )
    )
  dev: |
    sum(
      rate(
        kepler_process_package_joules_total{{
          job="{dev_job_name}",
        }}[{rate_interval}]
      )
    )
  max_mae: 2.0

- name: dram - absolute
  units: Watts
  latest: |
    sum(
      rate(
        kepler_process_dram_joules_total{{
          job="{latest_job_name}",
        }}[{rate_interval}]
      )
    )
  dev: |
    sum(
      rate(
        kepler_process_dram_joules_total{{
          job="{dev_job_name}",
        }}[{rate_interval}]
      )
    )
  max_mae: 2.0

# Absolute (no `mode` filter) process-level power for the `core` and `other`
# counters, summed across all series of each job and compared latest vs. dev.
- name: core - absolute
  units: Watts
  latest: |
    sum(
      rate(
        kepler_process_core_joules_total{{
          job="{latest_job_name}",
        }}[{rate_interval}]
      )
    )
  dev: |
    sum(
      rate(
        kepler_process_core_joules_total{{
          job="{dev_job_name}",
        }}[{rate_interval}]
      )
    )
  max_mae: 2.0

- name: other - absolute
  units: Watts
  latest: |
    sum(
      rate(
        kepler_process_other_joules_total{{
          job="{latest_job_name}",
        }}[{rate_interval}]
      )
    )
  dev: |
    sum(
      rate(
        kepler_process_other_joules_total{{
          job="{dev_job_name}",
        }}[{rate_interval}]
      )
    )
  max_mae: 2.0

# CPU time comparison
# Compares the summed rate of the BPF CPU-time counter between the two jobs.
- name: cpu-time
  units: Milliseconds
  latest: |
    sum(
      rate(
        kepler_process_bpf_cpu_time_ms_total{{
          job="{latest_job_name}",
        }}[{rate_interval}]
      )
    )
  dev: |
    sum(
      rate(
        kepler_process_bpf_cpu_time_ms_total{{
          job="{dev_job_name}",
        }}[{rate_interval}]
      )
    )
  # No error threshold is enforced for this entry: the limit below is disabled.
  # NOTE(review): confirm how the validator treats an entry without a
  # threshold, and re-enable once a sensible limit is known.
  # max_mae: 20.0

# Dynamic power comparison (process level): the same process counters as the
# absolute group, restricted to series labelled mode="dynamic" and summed
# per job before comparing latest vs. dev.
- name: package - dynamic
  units: Watts
  latest: |
    sum(
      rate(
        kepler_process_package_joules_total{{
          job="{latest_job_name}",
          mode="dynamic",
        }}[{rate_interval}]
      )
    )
  dev: |
    sum(
      rate(
        kepler_process_package_joules_total{{
          job="{dev_job_name}",
          mode="dynamic",
        }}[{rate_interval}]
      )
    )
  max_mae: 2.0

- name: core - dynamic
  units: Watts
  latest: |
    sum(
      rate(
        kepler_process_core_joules_total{{
          job="{latest_job_name}",
          mode="dynamic",
        }}[{rate_interval}]
      )
    )
  dev: |
    sum(
      rate(
        kepler_process_core_joules_total{{
          job="{dev_job_name}",
          mode="dynamic",
        }}[{rate_interval}]
      )
    )
  max_mae: 2.0

- name: dram - dynamic
  units: Watts
  latest: |
    sum(
      rate(
        kepler_process_dram_joules_total{{
          job="{latest_job_name}",
          mode="dynamic",
        }}[{rate_interval}]
      )
    )
  dev: |
    sum(
      rate(
        kepler_process_dram_joules_total{{
          job="{dev_job_name}",
          mode="dynamic",
        }}[{rate_interval}]
      )
    )
  max_mae: 2.0

- name: other - dynamic
  units: Watts
  latest: |
    sum(
      rate(
        kepler_process_other_joules_total{{
          job="{latest_job_name}",
          mode="dynamic",
        }}[{rate_interval}]
      )
    )
  dev: |
    sum(
      rate(
        kepler_process_other_joules_total{{
          job="{dev_job_name}",
          mode="dynamic",
        }}[{rate_interval}]
      )
    )
  max_mae: 2.0

# Node comparison

Check warning on line 235 in e2e/tools/validator/metric_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

235:1 [comments-indentation] comment not indented like content
# Node-level dynamic power: rate() of each kepler_node_* joules counter with
# mode="dynamic", compared latest vs. dev. Unlike the process-level entries,
# these queries are not wrapped in sum().
# NOTE(review): presumably each job exposes a single node series per metric,
# so no aggregation is needed — confirm.
- name: node platform - dynamic
  units: Watts
  latest: |
    rate(kepler_node_platform_joules_total{{
        job="{latest_job_name}",
        mode="dynamic",
      }}[{rate_interval}]
    )
  dev: |
    rate(kepler_node_platform_joules_total{{
        job="{dev_job_name}",
        mode="dynamic",
      }}[{rate_interval}]
    )
  max_mae: 2.0

- name: node package - dynamic
  units: Watts
  latest: |
    rate(kepler_node_package_joules_total{{
        job="{latest_job_name}",
        mode="dynamic",
      }}[{rate_interval}]
    )
  dev: |
    rate(kepler_node_package_joules_total{{
        job="{dev_job_name}",
        mode="dynamic",
      }}[{rate_interval}]
    )
  max_mae: 2.0

- name: node core - dynamic
  units: Watts
  latest: |
    rate(kepler_node_core_joules_total{{
        job="{latest_job_name}",
        mode="dynamic",
      }}[{rate_interval}]
    )
  dev: |
    rate(kepler_node_core_joules_total{{
        job="{dev_job_name}",
        mode="dynamic",
      }}[{rate_interval}]
    )
  max_mae: 2.0

- name: node dram - dynamic
  units: Watts
  latest: |
    rate(kepler_node_dram_joules_total{{
        job="{latest_job_name}",
        mode="dynamic",
      }}[{rate_interval}]
    )
  dev: |
    rate(kepler_node_dram_joules_total{{
        job="{dev_job_name}",
        mode="dynamic",
      }}[{rate_interval}]
    )
  max_mae: 2.0

- name: node other - dynamic
  units: Watts
  latest: |
    rate(kepler_node_other_joules_total{{
        job="{latest_job_name}",
        mode="dynamic",
      }}[{rate_interval}]
    )
  dev: |
    rate(kepler_node_other_joules_total{{
        job="{dev_job_name}",
        mode="dynamic",
      }}[{rate_interval}]
    )
  max_mae: 2.0
29 changes: 29 additions & 0 deletions e2e/tools/validator/src/validator/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import matplotlib.pyplot as plt
import numpy as np
import numpy.typing as npt
from click.exceptions import Exit
from matplotlib import ticker
from matplotlib.dates import DateFormatter

Expand Down Expand Up @@ -610,6 +611,34 @@ def validate_acpi(cfg: config.Validator, duration: datetime.timedelta, report_di
return int(res.validations.passed)


@validator.command()
@click.option("--duration", "-d", type=options.Duration(), required=True)
@click.option(
    "--report-dir",
    "-o",
    # `/tmp` is intentionally the default reporting directory. The suppression
    # is scoped to this line only; a `# ruff: noqa: S108` comment would silence
    # the rule for the whole file.
    default="/tmp",  # noqa: S108
    type=click.Path(exists=True, dir_okay=True, writable=True),
    show_default=True,
)
@pass_config
def validate_metrics(cfg: config.Validator, duration: datetime.timedelta, report_dir: str):
    """Validate Kepler metrics over the last DURATION and write a report.

    Runs the configured validations for the time window that ends now and
    starts DURATION earlier, then writes a Markdown report under a freshly
    created directory inside REPORT_DIR.

    Exits with status 0 when all validations pass and 1 otherwise.
    """
    results_dir, tag = create_report_dir(report_dir)
    res = TestResult(tag)

    # The validated window is the `duration` immediately preceding "now" (UTC).
    res.end_time = datetime.datetime.now(tz=datetime.UTC)
    res.start_time = res.end_time - duration

    click.secho(" * Generating build and node info ...", fg="green")
    res.build_info, res.node_info = get_build_and_node_info(cfg.prometheus)

    click.secho(" * Generating spec report ...", fg="green")
    res.host_spec = get_host_spec()

    # No stress script is run here; the ScriptResult only carries the time
    # window that the validations are evaluated over.
    script_result = ScriptResult(res.start_time, res.end_time)
    res.validations = run_validations(cfg, script_result, results_dir)

    click.secho(" * Generating validate metrics report file and dir", fg="green")
    write_md_report(results_dir, res)

    # Signal the outcome through the process exit code so callers (e.g. CI)
    # can fail on a validation failure.
    raise Exit(0 if res.validations.passed else 1)


def write_json_report(results_dir: str, res: TestResult):
pattern = re.compile(r'[{]?(\w+)=("[^"]*"|[^,]+)[},]?')

Expand Down
4 changes: 4 additions & 0 deletions e2e/tools/validator/src/validator/config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ class Metal(NamedTuple):
class PrometheusJob(NamedTuple):
    """Names of the Prometheus jobs that validation queries select on.

    Field order is part of the tuple's positional interface; `load()` in this
    module constructs it with keyword arguments.
    """

    # Each field holds a job name; `load()` falls back to the field's own name
    # ("metal", "vm", "dev", "latest") when the config omits it.
    metal: str
    vm: str
    dev: str
    latest: str


class Prometheus(NamedTuple):
Expand Down Expand Up @@ -95,6 +97,8 @@ def load(config_file: str) -> Validator:
job = PrometheusJob(
metal=prom_job.get("metal", "metal"),
vm=prom_job.get("vm", "vm"),
latest=prom_job.get("latest", "latest"),
dev=prom_job.get("dev", "dev"),
)

prometheus = Prometheus(
Expand Down
3 changes: 3 additions & 0 deletions e2e/tools/validator/src/validator/validations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ def validation_from_yaml(v: dict[str, Any]) -> Validation:
predicted_label=predicted_label,
units=v.get("units", ""),
max_mape=v.get("max_mape"),
max_mae=v.get("max_mae"),
)

return [validation_from_yaml(v) for v in yml["validations"]]
Expand All @@ -112,6 +113,8 @@ def load(self) -> list[Validation]:
promql_vars["rate_interval"] = prom.rate_interval
promql_vars["metal_job_name"] = prom.job.metal
promql_vars["vm_job_name"] = prom.job.vm
promql_vars["latest_job_name"] = prom.job.latest
promql_vars["dev_job_name"] = prom.job.dev

logger.debug("promql_vars: %s", promql_vars)

Expand Down

0 comments on commit 564fa4c

Please sign in to comment.