From b0e6b08e9d7185dd36e91a65edd7accb48936cd9 Mon Sep 17 00:00:00 2001 From: HyeockJinKim Date: Mon, 2 Dec 2024 15:59:08 +0900 Subject: [PATCH 1/6] feat: Add grafana system --- configs/prometheus/prometheus.yml | 31 +++++++++++++++++++++++++ docker-compose.dashboard.yml | 29 +++++++++++++++++++++++ docker-compose.halfstack-2409.yml | 22 ++++++++++++++++++ docker/grafana/Dockerfile | 9 +++++++ src/ai/backend/manager/metric/metric.py | 7 +++--- src/ai/backend/manager/server.py | 2 +- 6 files changed, 96 insertions(+), 4 deletions(-) create mode 100644 configs/prometheus/prometheus.yml create mode 100644 docker-compose.dashboard.yml create mode 100644 docker/grafana/Dockerfile diff --git a/configs/prometheus/prometheus.yml b/configs/prometheus/prometheus.yml new file mode 100644 index 0000000000..86067c179e --- /dev/null +++ b/configs/prometheus/prometheus.yml @@ -0,0 +1,31 @@ +# my global config +global: + scrape_interval: 1m # Global scrape interval (default: 1m) + evaluation_interval: 1m # Evaluate interval for rules (default: 1m) + +# Alertmanager configuration +alerting: + alertmanagers: + - static_configs: + - targets: + # - alertmanager:9093 + +# Load and evaluate rules with period `evaluation_interval`. +rule_files: + # - "first_rules.yml" + # - "second_rules.yml" + +# Scrape configurations +scrape_configs: + # Prometheus itself + # - metrics_path defaults to "/metrics" + # - scheme defaults to "http". + - job_name: prometheus + static_configs: + - targets: ['backendai-half-prometheus:9090'] + - job_name: manager + static_configs: + - targets: ['host.docker.internal:8091'] + - job_name: agent + static_configs: + - targets: ['host.docker.internal:6001'] diff --git a/docker-compose.dashboard.yml b/docker-compose.dashboard.yml new file mode 100644 index 0000000000..4bc56dddee --- /dev/null +++ b/docker-compose.dashboard.yml @@ -0,0 +1,29 @@ +services: + backendai-half-grafana: + build: ./docker/grafana + networks: + - half + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_USER=backend.ai + - GF_SECURITY_ADMIN_PASSWORD=backend.ai + volumes: + - "./volumes/${DATADIR_PREFIX:-.}/grafana-data:/var/lib/grafana:rw" + + backendai-half-prometheus: + image: prom/prometheus:latest + networks: + - half + command: + - '--config.file=/etc/prometheus/prometheus.yml' + + backendai-half-redis-exporter: + image: oliver006/redis_exporter:latest + environment: + - REDIS_ADDR=backendai-half-redis:6379 + depends_on: + - backendai-half-redis + +networks: + half: diff --git a/docker-compose.halfstack-2409.yml b/docker-compose.halfstack-2409.yml index 6397fe451a..34e0630d46 100644 --- a/docker-compose.halfstack-2409.yml +++ b/docker-compose.halfstack-2409.yml @@ -65,5 +65,27 @@ services: timeout: 3s retries: 10 + backendai-half-grafana: + image: grafana/grafana-enterprise:latest + networks: + - half + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin + volumes: + - "./volumes/${DATADIR_PREFIX:-.}/grafana-data:/var/lib/grafana:rw" + + backendai-half-prometheus: + image: prom/prometheus:latest + networks: + - half + volumes: + - "./configs/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml" + extra_hosts: + - "host.docker.internal:host-gateway" + command: + - '--config.file=/etc/prometheus/prometheus.yml' + networks: half: diff --git a/docker/grafana/Dockerfile b/docker/grafana/Dockerfile new file mode 100644 index 0000000000..2e159e3405 --- /dev/null +++ b/docker/grafana/Dockerfile @@ -0,0 +1,9 @@ +ARG GRAFANA_VERSION=latest +FROM grafana/grafana-enterprise:${GRAFANA_VERSION} + +# Install plugins +RUN grafana cli plugins install redis-datasource && \ + grafana cli plugins install fifemon-graphql-datasource && \ + grafana cli plugins install grafana-piechart-panel && \ + grafana cli plugins install marcusolsson-json-datasource && \ + grafana cli plugins install yesoreyeram-infinity-datasource diff --git a/src/ai/backend/manager/metric/metric.py b/src/ai/backend/manager/metric/metric.py index fdbc90340e..35827f39f0 100644 --- a/src/ai/backend/manager/metric/metric.py +++ b/src/ai/backend/manager/metric/metric.py @@ -12,10 +12,10 @@ def __init__(self) -> None: labelnames=["method", "endpoint", "status_code"], ) self._request_duration = Histogram( - name="backendai_api_request_duration_ms", + name="backendai_api_request_duration_sec", documentation="Duration of API requests in milliseconds", labelnames=["method", "endpoint", "status_code"], - buckets=[10, 50, 100, 200, 500, 1000, 2000, 5000, 10000], + buckets=[0.001, 0.01, 0.1, 0.5, 1, 2, 5, 10], ) @classmethod @@ -60,9 +60,10 @@ def __init__(self) -> None: labelnames=["event_type"], ) self._event_processing_time = Histogram( - name="backendai_event_processing_time", + name="backendai_event_processing_time_sec", documentation="Processing time of events in seconds", labelnames=["event_type", "status"], + buckets=[0.001, 0.01, 0.1, 0.5, 1, 2, 5, 10], ) @classmethod diff --git a/src/ai/backend/manager/server.py b/src/ai/backend/manager/server.py index 2562b3e732..ae55311c3b 100644 --- a/src/ai/backend/manager/server.py +++ b/src/ai/backend/manager/server.py @@ -212,7 +212,7 @@ async def prometheus_metrics(request: web.Request) -> web.Response: Returns the Prometheus metrics. """ root_ctx: RootContext = request.app["_root.context"] - metrics = str(root_ctx.metric_registry.to_prometheus()) + metrics = root_ctx.metric_registry.to_prometheus().decode("utf-8") return web.Response(text=metrics, content_type="text/plain") From db245e6a9b152391aaf92d2a19fbb6acde48e6a0 Mon Sep 17 00:00:00 2001 From: HyeockJinKim Date: Tue, 3 Dec 2024 13:41:19 +0900 Subject: [PATCH 2/6] Add bgtask metrics --- src/ai/backend/common/bgtask.py | 29 ++++++++++++++++- src/ai/backend/manager/metric/metric.py | 41 ++++++++++++++++++++++++- src/ai/backend/manager/server.py | 4 ++- tests/manager/metric/test_metric.py | 2 ++ 4 files changed, 73 insertions(+), 3 deletions(-) diff --git a/src/ai/backend/common/bgtask.py b/src/ai/backend/common/bgtask.py index 1a9a9df78f..873f61e75c 100644 --- a/src/ai/backend/common/bgtask.py +++ b/src/ai/backend/common/bgtask.py @@ -16,6 +16,7 @@ DefaultDict, Final, Literal, + Protocol, Set, Type, TypeAlias, @@ -108,17 +109,37 @@ async def _pipe_builder(r: Redis) -> Pipeline: BackgroundTask = Callable[Concatenate[ProgressReporter, ...], Awaitable[str | None]] +class BackgroundTaskMetric(Protocol): + def bgtask_started(self, *, task_name: str) -> None: ... + def bgtask_done(self, *, task_name: str, status: str, duration: float) -> None: ... + + +class NopBackgroundTaskMetric: + def bgtask_started(self, *, task_name: str) -> None: + pass + + def bgtask_done(self, *, task_name: str, status: str, duration: float) -> None: + pass + + class BackgroundTaskManager: event_producer: EventProducer ongoing_tasks: weakref.WeakSet[asyncio.Task] task_update_queues: DefaultDict[uuid.UUID, Set[asyncio.Queue[Sentinel | BgtaskEvents]]] dict_lock: asyncio.Lock + _metric: BackgroundTaskMetric - def __init__(self, event_producer: EventProducer) -> None: + def __init__( + self, + event_producer: EventProducer, + *, + metric: BackgroundTaskMetric = NopBackgroundTaskMetric(), + ) -> None: self.event_producer = event_producer self.ongoing_tasks = weakref.WeakSet() self.task_update_queues = defaultdict(set) self.dict_lock = asyncio.Lock() + self._metric = metric def register_event_handlers(self, event_dispatcher: EventDispatcher) -> None: """ @@ -280,6 +301,8 @@ async def _wrapper_task( event_cls: Type[BgtaskDoneEvent] | Type[BgtaskCancelledEvent] | Type[BgtaskFailedEvent] = ( BgtaskDoneEvent ) + self._metric.bgtask_started(task_name=task_name or func.__name__) + start = time.perf_counter() try: message = await func(reporter, **kwargs) or "" task_status = "bgtask_done" @@ -292,6 +315,10 @@ async def _wrapper_task( message = repr(e) log.exception("Task {} ({}): unhandled error", task_id, task_name) finally: + duration = time.perf_counter() - start + self._metric.bgtask_done( + task_name=task_name or func.__name__, status=task_status, duration=duration + ) redis_producer = self.event_producer.redis_client async def _pipe_builder(r: Redis): diff --git a/src/ai/backend/manager/metric/metric.py b/src/ai/backend/manager/metric/metric.py index 35827f39f0..23663bb997 100644 --- a/src/ai/backend/manager/metric/metric.py +++ b/src/ai/backend/manager/metric/metric.py @@ -1,4 +1,4 @@ -from prometheus_client import Counter, Histogram, generate_latest +from prometheus_client import Counter, Gauge, Histogram, generate_latest class APIMetrics: @@ -86,6 +86,44 @@ def update_failure_event_metric(self, *, event_type: str, duration: float) -> No ) +class BgTaskMetrics: + _bgtask_count: Gauge + _bgtask_done_count: Counter + _bgtask_processing_time: Histogram + + def __init__(self) -> None: + self._bgtask_count = Gauge( + name="backendai_bgtask_count", + documentation="Total number of background tasks processed", + labelnames=["task_name"], + ) + self._bgtask_done_count = Counter( + name="backendai_bgtask_done_count", + documentation="Number of completed background tasks", + labelnames=["task_name", "status"], + ) + self._bgtask_processing_time = Histogram( + name="backendai_bgtask_processing_time_sec", + documentation="Processing time of background tasks in seconds", + labelnames=["task_name", "status"], + buckets=[0.1, 1, 10, 30, 60, 300, 600], + ) + + @classmethod + def instance(cls): + if not hasattr(cls, "_instance"): + cls._instance = cls() + return cls._instance + + def bgtask_started(self, *, task_name: str) -> None: + self._bgtask_count.labels(task_name=task_name).inc() + + def bgtask_done(self, *, task_name: str, status: str, duration: float) -> None: + self._bgtask_count.labels(task_name=task_name).dec() + self._bgtask_processing_time.labels(task_name=task_name, status=status).observe(duration) + self._bgtask_done_count.labels(task_name=task_name, status=status).inc() + + class MetricRegistry: api: APIMetrics event: EventMetrics @@ -93,6 +131,7 @@ class MetricRegistry: def __init__(self) -> None: self.api = APIMetrics.instance() self.event = EventMetrics.instance() + self.bgtask = BgTaskMetrics.instance() def to_prometheus(self) -> bytes: return generate_latest() diff --git a/src/ai/backend/manager/server.py b/src/ai/backend/manager/server.py index ae55311c3b..feb0926150 100644 --- a/src/ai/backend/manager/server.py +++ b/src/ai/backend/manager/server.py @@ -717,7 +717,9 @@ def __init__(self, root_ctx: RootContext) -> None: self.root_ctx = root_ctx async def __aenter__(self) -> None: - self.root_ctx.background_task_manager = BackgroundTaskManager(self.root_ctx.event_producer) + self.root_ctx.background_task_manager = BackgroundTaskManager( + self.root_ctx.event_producer, metric=self.root_ctx.metric_registry.bgtask + ) async def __aexit__(self, *exc_info) -> None: pass diff --git a/tests/manager/metric/test_metric.py b/tests/manager/metric/test_metric.py index 5f34cd832e..4105c2cf8c 100644 --- a/tests/manager/metric/test_metric.py +++ b/tests/manager/metric/test_metric.py @@ -9,3 +9,5 @@ def test_metric_registry_instance(): assert registry1.api is registry2.api assert registry1.event is not None assert registry1.event is registry2.event + assert registry1.bgtask is not None + assert registry1.bgtask is registry2.bgtask From e179be9dde755bedc731873e3a8631c894443062 Mon Sep 17 00:00:00 2001 From: HyeockJinKim Date: Tue, 3 Dec 2024 14:16:54 +0900 Subject: [PATCH 3/6] add changelog --- changes/3191.feature.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/3191.feature.md diff --git a/changes/3191.feature.md b/changes/3191.feature.md new file mode 100644 index 0000000000..b8c8a44413 --- /dev/null +++ b/changes/3191.feature.md @@ -0,0 +1 @@ +Add grafana dashboard for monitoring metrics \ No newline at end of file From 6141b2fd431f81fbc73ac119b45b87ea71d6c9bb Mon Sep 17 00:00:00 2001 From: HyeockJinKim Date: Tue, 3 Dec 2024 18:38:19 +0900 Subject: [PATCH 4/6] Add grafana dashboard --- .../grafana/dashboards/manager-dashboard.json | 1249 +++++++++++++++++ .../provisioning/manager-dashboard.json | 1249 +++++++++++++++++ docker-compose.halfstack-2409.yml | 3 +- 3 files changed, 2500 insertions(+), 1 deletion(-) create mode 100644 configs/grafana/dashboards/manager-dashboard.json create mode 100644 configs/grafana/provisioning/manager-dashboard.json diff --git a/configs/grafana/dashboards/manager-dashboard.json b/configs/grafana/dashboards/manager-dashboard.json new file mode 100644 index 0000000000..152866f64c --- /dev/null +++ b/configs/grafana/dashboards/manager-dashboard.json @@ -0,0 +1,1249 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 3, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 3, + "panels": [], + "title": "Manager Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "sum(increase(backendai_api_request_count_total[5m])) by (method, endpoint, status_code)", + "legendFormat": "{{method}} : {{endpoint}} / {{status_code}}", + "range": true, + "refId": "A" + } + ], + "title": "Total API Request Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "sum(increase(backendai_api_request_count_total{status_code=~\"4..|500\"}[5m])) by (method, endpoint, status_code)", + "legendFormat": "{{method}} : {{endpoint}} / {{status_code}}", + "range": true, + "refId": "A" + } + ], + "title": "API Failure Request Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "sum by (method, endpoint) (increase(backendai_api_request_duration_sec_sum[5m])) \n/\nsum by (method, endpoint) (increase(backendai_api_request_duration_sec_count[5m]))", + "legendFormat": "{{method}} : {{endpoint}}", + "range": true, + "refId": "A" + } + ], + "title": "API Request Duration (Avg)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(backendai_api_request_duration_sec_bucket) by (le, method, endpoint))", + "legendFormat": "P95 {{method}} : {{endpoint}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.90, sum(backendai_api_request_duration_sec_bucket) by (le, method, endpoint))", + "hide": false, + "legendFormat": "P90 {{method}} : {{endpoint}}", + "range": true, + "refId": "B" + } + ], + "title": "API Request Duration (P90, P95)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "sum(increase(backendai_event_count_total[5m])) by (event_type)", + "legendFormat": "{{event_type}}", + "range": true, + "refId": "A" + } + ], + "title": "Total Event Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "sum(increase(backendai_event_failure_count[5m])) by (event_type)", + "legendFormat": "{{event_type}}", + "range": true, + "refId": "A" + } + ], + "title": "Event Failure Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 25 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "sum by (event_type) (increase(backendai_event_processing_time_sec_sum[5m])) \n/\nsum by (event_type) (increase(backendai_event_processing_time_sec_count[5m]))", + "legendFormat": "{{event_type}}", + "range": true, + "refId": "A" + } + ], + "title": "Event Processing Duration (Avg)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 25 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(backendai_event_processing_time_sec_bucket) by (le, event_type))", + "legendFormat": "P95 {{event_type}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.90, sum(backendai_event_processing_time_sec_bucket) by (le, event_type))", + "hide": false, + "legendFormat": "P90 {{event_type}}", + "range": true, + "refId": "B" + } + ], + "title": "Event Processing Duration (P90, P95)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 33 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "sum(backendai_bgtask_count) by (task_name)", + "legendFormat": "{{task_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Total BgTask Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 33 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "sum(increase(backendai_bgtask_done_count_total[5m])) by (task_name, status)", + "legendFormat": "{{task_name}} / {{status}}", + "range": true, + "refId": "A" + } + ], + "title": "Total BgTask Done Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 41 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "sum by (task_name, status) (increase(backendai_bgtask_processing_time_sec_sum[5m])) \n/\nsum by (task_name, status) (increase(backendai_bgtask_processing_time_sec_count[5m]))", + "legendFormat": "{{task_name}} / {{status}}", + "range": true, + "refId": "A" + } + ], + "title": "BgTask Duration (Avg)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 41 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(backendai_bgtask_processing_time_sec_bucket) by (le, task_name, status))", + "legendFormat": "P95 {{task_name}} / {{status}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.90, sum(backendai_bgtask_processing_time_sec_bucket) by (le, task_name, status))", + "hide": false, + "legendFormat": "P90 {{task_name}} / {{status}}", + "range": true, + "refId": "B" + } + ], + "title": "BgTask Duration (P90, P95)", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "1m", + "schemaVersion": 40, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-20m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Manager Dashboard", + "uid": "fe5pw9i1ftb0gc", + "version": 24, + "weekStart": "" +} \ No newline at end of file diff --git a/configs/grafana/provisioning/manager-dashboard.json b/configs/grafana/provisioning/manager-dashboard.json new file mode 100644 index 0000000000..152866f64c --- /dev/null +++ b/configs/grafana/provisioning/manager-dashboard.json @@ -0,0 +1,1249 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 3, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 3, + "panels": [], + "title": "Manager Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "sum(increase(backendai_api_request_count_total[5m])) by (method, endpoint, status_code)", + "legendFormat": "{{method}} : {{endpoint}} / {{status_code}}", + "range": true, + "refId": "A" + } + ], + "title": "Total API Request Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "sum(increase(backendai_api_request_count_total{status_code=~\"4..|500\"}[5m])) by (method, endpoint, status_code)", + "legendFormat": "{{method}} : {{endpoint}} / {{status_code}}", + "range": true, + "refId": "A" + } + ], + "title": "API Failure Request Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "sum by (method, endpoint) (increase(backendai_api_request_duration_sec_sum[5m])) \n/\nsum by (method, endpoint) (increase(backendai_api_request_duration_sec_count[5m]))", + "legendFormat": "{{method}} : {{endpoint}}", + "range": true, + "refId": "A" + } + ], + "title": "API Request Duration (Avg)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(backendai_api_request_duration_sec_bucket) by (le, method, endpoint))", + "legendFormat": "P95 {{method}} : {{endpoint}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.90, sum(backendai_api_request_duration_sec_bucket) by (le, method, endpoint))", + "hide": false, + "legendFormat": "P90 {{method}} : {{endpoint}}", + "range": true, + "refId": "B" + } + ], + "title": "API Request Duration (P90, P95)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "sum(increase(backendai_event_count_total[5m])) by (event_type)", + "legendFormat": "{{event_type}}", + "range": true, + "refId": "A" + } + ], + "title": "Total Event Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "sum(increase(backendai_event_failure_count[5m])) by (event_type)", + "legendFormat": "{{event_type}}", + "range": true, + "refId": "A" + } + ], + "title": "Event Failure Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 25 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "sum by (event_type) (increase(backendai_event_processing_time_sec_sum[5m])) \n/\nsum by (event_type) (increase(backendai_event_processing_time_sec_count[5m]))", + "legendFormat": "{{event_type}}", + "range": true, + "refId": "A" + } + ], + "title": "Event Processing Duration (Avg)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 25 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(backendai_event_processing_time_sec_bucket) by (le, event_type))", + "legendFormat": "P95 {{event_type}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.90, sum(backendai_event_processing_time_sec_bucket) by (le, event_type))", + "hide": false, + "legendFormat": "P90 {{event_type}}", + "range": true, + "refId": "B" + } + ], + "title": "Event Processing Duration (P90, P95)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 33 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "sum(backendai_bgtask_count) by (task_name)", + "legendFormat": "{{task_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Total BgTask Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 33 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "sum(increase(backendai_bgtask_done_count_total[5m])) by (task_name, status)", + "legendFormat": "{{task_name}} / {{status}}", + "range": true, + "refId": "A" + } + ], + "title": "Total BgTask Done Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 41 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "sum by (task_name, status) (increase(backendai_bgtask_processing_time_sec_sum[5m])) \n/\nsum by (task_name, status) (increase(backendai_bgtask_processing_time_sec_count[5m]))", + "legendFormat": "{{task_name}} / {{status}}", + "range": true, + "refId": "A" + } + ], + "title": "BgTask Duration (Avg)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 41 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(backendai_bgtask_processing_time_sec_bucket) by (le, task_name, status))", + "legendFormat": "P95 {{task_name}} / {{status}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "be3td1qwoaxhcc" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.90, sum(backendai_bgtask_processing_time_sec_bucket) by (le, task_name, status))", + "hide": false, + "legendFormat": "P90 {{task_name}} / {{status}}", + "range": true, + "refId": "B" + } + ], + "title": "BgTask Duration (P90, P95)", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "1m", + "schemaVersion": 40, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-20m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Manager Dashboard", + "uid": "fe5pw9i1ftb0gc", + "version": 24, + "weekStart": "" +} \ No newline at end of file diff --git a/docker-compose.halfstack-2409.yml b/docker-compose.halfstack-2409.yml index 34e0630d46..1e334de466 100644 --- a/docker-compose.halfstack-2409.yml +++ b/docker-compose.halfstack-2409.yml @@ -72,7 +72,8 @@ services: ports: - "3000:3000" environment: - - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_SECURITY_ADMIN_USER=backend + - GF_SECURITY_ADMIN_PASSWORD=develove volumes: - "./volumes/${DATADIR_PREFIX:-.}/grafana-data:/var/lib/grafana:rw" From 5566768aba7041eef096cfad17cbf9e33331fd0f Mon Sep 17 00:00:00 2001 From: HyeockJinKim Date: Wed, 4 Dec 2024 11:25:16 +0900 Subject: [PATCH 5/6] Add common metrics --- src/ai/backend/manager/metric/metric.py | 44 +++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src/ai/backend/manager/metric/metric.py b/src/ai/backend/manager/metric/metric.py index 23663bb997..2b8eaaed6d 100644 --- a/src/ai/backend/manager/metric/metric.py +++ b/src/ai/backend/manager/metric/metric.py @@ -1,3 +1,6 @@ +import asyncio +import time + from prometheus_client import Counter, Gauge, Histogram, generate_latest @@ -124,14 +127,55 @@ def bgtask_done(self, *, task_name: str, status: str, duration: float) -> None: self._bgtask_done_count.labels(task_name=task_name, status=status).inc() +class CommonMetric: + _up: Gauge + _up_time: Gauge + _coroutines: Gauge + _boot_time: float + + def __init__(self) -> None: + self._up = Gauge( + name="backendai_up", + documentation="BackendAI service is up", + ) + self._coroutines = Gauge( + name="backendai_coroutines", + documentation="Total number of coroutines running", + ) + self._boot_time = time.time() + self._up.set(1) + self._up_time = Gauge( + name="backendai_up_time", + documentation="BackendAI service is up time", + ) + + @classmethod + def instance(cls): + if not hasattr(cls, "_instance"): + cls._instance = cls() + return cls._instance + + def _update_coroutines(self) -> None: + count = len(asyncio.all_tasks()) + self._coroutines.set(count) + + def update(self) -> None: + self._update_coroutines() + self._up_time.set(time.time() - self._boot_time) + + class MetricRegistry: api: APIMetrics event: EventMetrics + bgtask: BgTaskMetrics + common: CommonMetric def __init__(self) -> None: self.api = APIMetrics.instance() self.event = EventMetrics.instance() self.bgtask = BgTaskMetrics.instance() + self.common = CommonMetric.instance() def to_prometheus(self) -> bytes: + self.common.update() return generate_latest() From 8a0a244c22c018c3220e365b1947f68c9bbbb705 Mon Sep 17 00:00:00 2001 From: HyeockJinKim Date: Wed, 4 Dec 2024 13:39:07 +0900 Subject: [PATCH 6/6] change scrape_interval to 20s --- configs/prometheus/prometheus.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/configs/prometheus/prometheus.yml b/configs/prometheus/prometheus.yml index 86067c179e..9aa53e686b 100644 --- a/configs/prometheus/prometheus.yml +++ b/configs/prometheus/prometheus.yml @@ -1,7 +1,6 @@ # my global config global: - scrape_interval: 1m # Global scrape interval (default: 1m) - evaluation_interval: 1m # Evaluate interval for rules (default: 1m) + scrape_interval: 20s # Global scrape interval (default: 1m) # Alertmanager configuration alerting: