From 27a90ff758ad1d7faaa7d4dbf9273e9efe4f1a8a Mon Sep 17 00:00:00 2001 From: sp98 Date: Thu, 4 Jan 2024 08:42:55 +0530 Subject: [PATCH] use correct units for MDSCacheUsageHigh alerts The ceph_mds_mem_rss metric, used in the MDSCacheUsageHigh alert, is reported in KB rather than bytes. This PR converts the ceph_mds_mem_rss value to bytes before comparing it with the MDS pod memory request. Signed-off-by: sp98 --- metrics/deploy/prometheus-ocs-rules.yaml | 2 +- metrics/mixin/alerts/perf.libsonnet | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/metrics/deploy/prometheus-ocs-rules.yaml b/metrics/deploy/prometheus-ocs-rules.yaml index 2bbfe59209..0a5d11745e 100644 --- a/metrics/deploy/prometheus-ocs-rules.yaml +++ b/metrics/deploy/prometheus-ocs-rules.yaml @@ -282,7 +282,7 @@ spec: message: High MDS cache usage for the daemon {{ $labels.ceph_daemon }}. severity_level: error expr: | - ceph_mds_mem_rss / on(ceph_daemon) group_left(job)(label_replace(kube_pod_container_resource_requests{container="mds", resource="memory"}, "ceph_daemon", "mds.$1", "pod", "rook-ceph-mds-(.*)-(.*)") * .5) > .95 + (ceph_mds_mem_rss * 1000) / on(ceph_daemon) group_left(job)(label_replace(kube_pod_container_resource_requests{container="mds", resource="memory"}, "ceph_daemon", "mds.$1", "pod", "rook-ceph-mds-(.*)-(.*)") * .5) > .95 for: 5m labels: severity: critical diff --git a/metrics/mixin/alerts/perf.libsonnet b/metrics/mixin/alerts/perf.libsonnet index ae385de236..0cc0ea4b18 100644 --- a/metrics/mixin/alerts/perf.libsonnet +++ b/metrics/mixin/alerts/perf.libsonnet @@ -7,7 +7,7 @@ { alert: 'MDSCacheUsageHigh', expr: ||| - ceph_mds_mem_rss / on(ceph_daemon) group_left(job)(label_replace(kube_pod_container_resource_requests{container="mds", resource="memory"}, "ceph_daemon", "mds.$1", "pod", "rook-ceph-mds-(.*)-(.*)") * .5) > .95 + (ceph_mds_mem_rss * 1000) / on(ceph_daemon) group_left(job)(label_replace(kube_pod_container_resource_requests{container="mds", resource="memory"}, "ceph_daemon", 
"mds.$1", "pod", "rook-ceph-mds-(.*)-(.*)") * .5) > .95 ||| % $._config, 'for': $._config.mdsCacheUsageAlertTime, labels: {