diff --git a/context/otel-stack/dashboards/dashboard-apm.json b/context/otel-stack/dashboards/dashboard-apm.json new file mode 100644 index 000000000..4011920ae --- /dev/null +++ b/context/otel-stack/dashboards/dashboard-apm.json @@ -0,0 +1,1135 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "A language-agnostic application performance management(APM) with OpenTelemetry, Grafana, and Prometheus.", + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 19419, + "graphTooltip": 0, + "id": 120, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "green", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(duration_milliseconds_count{span_kind=\"SPAN_KIND_SERVER\", service_name=\"$app\", http_route=~\"$route\"}) by(service_name)", + "instant": false, + "legendFormat": "{{label_name}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(duration_milliseconds_count{span_kind=\"SPAN_KIND_SERVER\", service_name=\"$app\", http_route=~\"$route\"}) by(service_name)", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "B" + } + ], + "title": "Total Request", + "transformations": [ + { + "id": "seriesToRows", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "Time" + } + ] + } + } + ], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(duration_milliseconds_count{service_name=\"$app\", span_kind=\"SPAN_KIND_SERVER\", http_status_code=~\"^2.*\", http_route=~\"$route\"}) by(http_status_code)", + "format": "time_series", + "instant": false, + "legendFormat": "Http Status 2XX", + "range": true, + "refId": "2XX" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(duration_milliseconds_count{service_name=\"$app\", span_kind=\"SPAN_KIND_SERVER\", http_status_code=~\"^3.*\", http_route=~\"$route\"}) by(http_status_code)", + "format": "time_series", + "hide": false, + "instant": false, + "legendFormat": "Http Status 3XX", + "range": true, + "refId": "3XX" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(duration_milliseconds_count{service_name=\"$app\", span_kind=\"SPAN_KIND_SERVER\", http_status_code=~\"^4.*\", http_route=~\"$route\"}) by(http_status_code)", + "format": "time_series", + "hide": false, + "instant": false, + "legendFormat": "Http Status 4XX", + "range": true, + "refId": "4XX" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(duration_milliseconds_count{service_name=\"$app\", span_kind=\"SPAN_KIND_SERVER\", http_status_code=~\"^5.*\", http_route=~\"$route\"}) by(http_status_code)", + "format": "time_series", + "hide": false, + "instant": false, + "legendFormat": "Http Status 5XX", + "range": true, + "refId": "5XX" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(duration_milliseconds_count{service_name=\"$app\", span_kind=\"SPAN_KIND_SERVER\", http_status_code=~\"^2.*\", http_route=~\"$route\"}) by(http_status_code)", + "format": "time_series", + "hide": false, + "instant": true, + "legendFormat": "Http Status 2XX", + "range": false, + "refId": "2XX-instant" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(duration_milliseconds_count{service_name=\"$app\", span_kind=\"SPAN_KIND_SERVER\", http_status_code=~\"^3.*\", http_route=~\"$route\"}) by(http_status_code)", + "format": "time_series", + "hide": false, + "instant": false, + "legendFormat": "Http Status 3XX", + "range": true, + "refId": "3XX-instant" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(duration_milliseconds_count{service_name=\"$app\", span_kind=\"SPAN_KIND_SERVER\", http_status_code=~\"^4.*\", http_route=~\"$route\"}) by(http_status_code)", + "format": "time_series", + "hide": false, + "instant": false, + "legendFormat": "Http Status 4XX", + "range": true, + "refId": "4XX-instant" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(duration_milliseconds_count{service_name=\"$app\", span_kind=\"SPAN_KIND_SERVER\", http_status_code=~\"^5.*\", http_route=~\"$route\"}) by(http_status_code)", + "format": "time_series", + "hide": false, + "instant": false, + "legendFormat": "Http Status 5XX", + "range": true, + "refId": "5XX-instant" + } + ], + "title": "Requests Count", + "transformations": [ + { + "id": "seriesToRows", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "Time" + } + ] + } + }, + { + "id": "partitionByValues", + "options": { + "fields": ["Metric"], + "keepFields": false + } + } + ], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "request amount distribution", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [], + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 4, + "options": { + "displayLabels": ["percent"], + "legend": { + "displayMode": "list", + "placement": "right", + "showLegend": true, + "values": ["percent"] + }, + "pieType": "donut", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(duration_milliseconds_count{span_kind=\"SPAN_KIND_SERVER\", service_name=\"$app\", http_route=~\"$route\"}) by(span_name)", + "instant": false, + "legendFormat": "{{label_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Request Distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "cumulative latency distribution", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [], + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 7, + "options": { + "displayLabels": ["percent"], + "legend": { + "displayMode": "list", + "placement": "right", + "showLegend": true, + "values": ["percent"] + }, + "pieType": "donut", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(duration_milliseconds_sum{span_kind=\"SPAN_KIND_SERVER\", service_name=\"$app\", http_route=~\"$route\"}) by(span_name)", + "instant": true, + "legendFormat": "{{label_name}}", + "range": false, + "refId": "A" + } + ], + "title": "Loading Distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "overall request rate per minute over last 3 minutes", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "reqpm" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 4 + }, + "id": 8, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(duration_milliseconds_count{span_kind=\"SPAN_KIND_SERVER\", service_name=\"$app\", http_route=~\"$route\"}[3m])*60)", + "hide": false, + "instant": false, + "range": true, + "refId": "B" + } + ], + "title": "Overall Request Rate", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "percentage of HTTP status 5xx in all requests", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 3, + "y": 4 + }, + "id": 9, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(duration_milliseconds_count{span_kind=\"SPAN_KIND_SERVER\", service_name=\"$app\", http_status_code=~\"5.*|\", http_route=~\"$route\"})/sum(duration_milliseconds_count{span_kind=\"SPAN_KIND_SERVER\", service_name=\"$app\", http_route=~\"$route\"})", + "instant": false, + "range": true, + "refId": "A" + } + ], + "title": "Overall Error Rate", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "request rate per minute over last 3 minutes", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqpm" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 8 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(duration_milliseconds_count{service_name=\"$app\", span_kind=\"SPAN_KIND_SERVER\", http_route=~\"$route\"}[3m])*60) by(span_name)", + "hide": false, + "instant": false, + "legendFormat": "{{label_name}}", + "range": true, + "refId": "B" + } + ], + "title": "Request Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "PR95 latency over last 3 minutes", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 8 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(duration_milliseconds_bucket{span_kind=\"SPAN_KIND_SERVER\", service_name=\"$app\", http_route=~\"$route\"}[3m])) by (le, span_name))", + "instant": false, + "legendFormat": "{{label_name}}", + "range": true, + "refId": "A" + } + ], + "title": "PR95 Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "by route and http status code", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 8 + }, + "id": 6, + "options": { + "displayMode": "lcd", + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sort_desc(duration_milliseconds_sum{span_kind=\"SPAN_KIND_SERVER\", service_name=\"$app\", http_status_code!=\"\", http_route=~\"$route\"} / duration_milliseconds_count{span_kind=\"SPAN_KIND_SERVER\", service_name=\"$app\", http_status_code!=\"\", http_route=~\"$route\"})", + "instant": true, + "legendFormat": "[{{http_status_code}}] {{span_name}}", + "range": false, + "refId": "A" + } + ], + "title": "Average Latency", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Details of each API", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Request Rate" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "type": "sparkline" + } + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + }, + { + "id": "unit", + "value": "reqpm" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Error Rate" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "spanNulls": false, + "type": "sparkline" + } + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + }, + { + "id": "unit", + "value": "percentunit" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PR95" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "hideValue": false, + "type": "sparkline" + } + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + }, + { + "id": "unit", + "value": "ms" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 10, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "enablePagination": true, + "fields": "", + "reducer": ["sum"], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(duration_milliseconds_count{service_name=\"$app\", http_route=~\"$route\", span_kind=\"SPAN_KIND_SERVER\"}[3m])*60) by(service_name, http_method, http_route)", + "format": "time_series", + "hide": false, + "instant": false, + "range": true, + "refId": "Request Rate" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(duration_milliseconds_bucket{service_name=\"$app\", http_route=~\"$route\", span_kind=\"SPAN_KIND_SERVER\"}[3m])) by (le, service_name, http_method, http_route))", + "hide": false, + "instant": false, + "range": true, + "refId": "PR95" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(duration_milliseconds_count{service_name=\"$app\", http_route=~\"$route\", span_kind=\"SPAN_KIND_SERVER\", http_status_code!~\"2.*|3.*\"}) by(service_name, http_method, http_route) / sum(duration_milliseconds_count{service_name=\"$app\", http_route=~\"$route\", span_kind=\"SPAN_KIND_SERVER\"}) by(service_name, http_method, http_route)", + "format": "time_series", + "hide": false, + "instant": false, + "range": true, + "refId": "Error Rate" + } + ], + "title": "Details", + "transformations": [ + { + "id": "timeSeriesTable", + "options": {} + }, + { + "id": "merge", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "includeByName": {}, + "indexByName": { + "Time": 0, + "Trend #PR95 Trend": 7, + "Trend #Request Rate Trend": 5, + "Value #PR95": 8, + "Value #Request Rate": 6, + "http_method": 3, + "http_route": 2, + "http_status_code": 4, + "service_name": 1 + }, + "renameByName": { + "Trend": "Request Rate Trend", + "Trend #Error Rate": "Error Rate", + "Trend #Error Rate Trend": "Error Rate Trend", + "Trend #PR95": "PR95", + "Trend #PR95 Trend": "PR95 Latency Trend", + "Trend #Request Rate": "Request Rate", + "Trend #Request Rate Trend": "Request Rate Trend", + "Value": "Request Rate", + "Value #A": "Error Rate", + "Value #Error Rate": "Error Rate", + "Value #PR95": "PR95 Latency", + "Value #Request Rate": "Request Rate", + "http_method": "Method", + "http_route": "Route", + "http_status_code": "Status Code", + "service_name": "Application" + } + } + } + ], + "type": "table" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "BACKOFFICE", + "value": "BACKOFFICE" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(service_name)", + "hide": 0, + "includeAll": false, + "label": "Application", + "multi": false, + "name": "app", + "options": [], + "query": { + "query": "label_values(service_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(http_route)", + "hide": 0, + "includeAll": true, + "label": "Route", + "multi": true, + "name": "route", + "options": [], + "query": { + "query": "label_values(http_route)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "OpenTelemetry APM WIP", + "uid": "opentelemetry-apm-wip", + "version": 1, + "weekStart": "" +} diff --git a/context/otel-stack/dashboards/dashboard-top.json b/context/otel-stack/dashboards/dashboard-top.json new file mode 100644 index 000000000..0649007c5 --- /dev/null +++ b/context/otel-stack/dashboards/dashboard-top.json @@ -0,0 +1,1139 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "Spanmetrics way of demo application view.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 129, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 24, + "panels": [], + "title": "Service Level - Throughput and Latencies", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-BlYlRd" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + }, + { + "color": "green", + "value": 2 + }, + { + "color": "#EAB839", + "value": 64 + }, + { + "color": "orange", + "value": 128 + }, + { + "color": "red", + "value": 256 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 20, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 2, + "interval": "5m", + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "10.4.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "topk(7,histogram_quantile(0.50, sum(rate(traces_spanmetrics_duration_seconds_bucket{aws_account_name=~\"$aws_account_name\",service_name=~\"$service\", span_name=~\"$span_name\"}[$__rate_interval])) by (le,service_name)))", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "legendFormat": "{{service_name}}-quantile_0.50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "topk(7,histogram_quantile(0.95, sum(rate(traces_spanmetrics_duration_seconds_bucket{aws_account_name=~\"$aws_account_name\",service_name=~\"$service\", span_name=~\"$span_name\"}[$__range])) by (le,service_name)))", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "{{le}} - {{service_name}}", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "histogram_quantile(0.99, sum(rate(traces_spanmetrics_duration_seconds_bucket{aws_account_name=~\"$aws_account_name\",service_name=~\"$service\", span_name=~\"$span_name\"}[$__rate_interval])) by (le,service_name))", + "hide": true, + "interval": "", + "legendFormat": "quantile99", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "histogram_quantile(0.999, sum(rate(traces_spanmetrics_duration_seconds_bucket{aws_account_name=~\"$aws_account_name\",service_name=~\"$service\", span_name=~\"$span_name\"}[$__rate_interval])) by (le,service_name))", + "hide": true, + "interval": "", + "legendFormat": "quantile999", + "range": true, + "refId": "D" + } + ], + "title": "Top 3x3 - Service Latency - quantile95", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-BlYlRd" + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "super-light-blue", + "value": 1 + }, + { + "color": "#EAB839", + "value": 2 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 4, + "interval": "5m", + "options": { + "displayMode": "lcd", + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["mean"], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "text": {}, + "valueMode": "color" + }, + "pluginVersion": "10.4.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "topk(7,sum by (service_name) (rate(traces_spanmetrics_calls_total{aws_account_name=~\"$aws_account_name\",service_name=~\"$service\", span_name=~\"$span_name\"}[$__range])))", + "format": "time_series", + "instant": true, + "interval": "", + "legendFormat": "{{service_name}}", + "range": false, + "refId": "A" + } + ], + "title": "Top 7 Services Mean Rate over Range", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-reds" + }, + "decimals": 4, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 1 + }, + { + "color": "red", + "value": 15 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 15, + "interval": "5m", + "options": { + "displayMode": "lcd", + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + "orientation": "vertical", + "reduceOptions": { + "calcs": ["mean"], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "text": {}, + "valueMode": "color" + }, + "pluginVersion": "10.4.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "topk(7,sum(rate(traces_spanmetrics_calls_total{aws_account_name=~\"$aws_account_name\",status_code=\"STATUS_CODE_ERROR\",service_name=~\"$service\", span_name=~\"$span_name\"}[$__range])) by (service_name))", + "instant": true, + "interval": "", + "legendFormat": "{{service_name}}", + "range": false, + "refId": "A" + } + ], + "title": "Top 7 Services Mean ERROR Rate over Range", + "type": "bargauge" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 14, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "refId": "A" + } + ], + "title": "span_names Level - Throughput", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "bRate" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "mode": "lcd", + "type": "gauge" + } + }, + { + "id": "color", + "value": { + "mode": "continuous-BlYlRd" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "eRate" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "mode": "lcd", + "type": "gauge" + } + }, + { + "id": "color", + "value": { + "mode": "continuous-RdYlGr" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Error Rate" + }, + "properties": [ + { + "id": "custom.width", + "value": 663 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Rate" + }, + "properties": [ + { + "id": "custom.width", + "value": 667 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Service" + }, + "properties": [ + { + "id": "custom.width" + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 22, + "interval": "5m", + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "10.4.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "exemplar": false, + "expr": "topk(7, sum(rate(traces_spanmetrics_calls_total{aws_account_name=~\"$aws_account_name\",service_name=~\"$service\", span_name=~\"$span_name\"}[$__range])) by (span_name,service_name)) ", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "Rate" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "exemplar": false, + "expr": "topk(7, sum(rate(traces_spanmetrics_calls_total{aws_account_name=~\"$aws_account_name\",status_code=\"STATUS_CODE_ERROR\",service_name=~\"$service\", span_name=~\"$span_name\"}[$__range])) by (span_name,service_name))", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "Error Rate" + } + ], + "title": "Top 7 span_names and Errors (APM Table)", + "transformations": [ + { + "id": "seriesToColumns", + "options": { + "byField": "span_name" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time 1": true, + "Time 2": true + }, + "indexByName": {}, + "renameByName": { + "Value #Error Rate": "Error Rate", + "Value #Rate": "Rate", + "service_name 1": "Rate in Service", + "service_name 2": "Error Rate in Service" + } + } + }, + { + "id": "calculateField", + "options": { + "alias": "bRate", + "mode": "reduceRow", + "reduce": { + "include": ["Rate"], + "reducer": "sum" + } + } + }, + { + "id": "calculateField", + "options": { + "alias": "eRate", + "mode": "reduceRow", + "reduce": { + "include": ["Error Rate"], + "reducer": "sum" + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Error Rate": true, + "Rate": true, + "bRate": false + }, + "indexByName": { + "Error Rate": 4, + "Error Rate in Service": 6, + "Rate": 1, + "Rate in Service": 5, + "bRate": 2, + "eRate": 3, + "span_name": 0 + }, + "renameByName": { + "Rate in Service": "Service", + "bRate": "Rate", + "eRate": "Error Rate", + "span_name": "span_name Name" + } + } + }, + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "desc": true, + "field": "Rate" + } + ] + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 20, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "refId": "A" + } + ], + "title": "span_name Level - Latencies", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-BlYlRd" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + }, + { + "color": "green", + "value": 2 + }, + { + "color": "#EAB839", + "value": 64 + }, + { + "color": "orange", + "value": 128 + }, + { + "color": "red", + "value": 256 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 25, + "interval": "5m", + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "10.4.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "topk(7,histogram_quantile(0.50, sum(rate(traces_spanmetrics_duration_seconds_bucket{aws_account_name=~\"$aws_account_name\",service_name=~\"$service\", span_name=~\"$span_name\"}[$__rate_interval])) by (le,service_name)))", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "legendFormat": "{{service_name}}-quantile_0.50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "topk(7,histogram_quantile(0.95, sum(rate(traces_spanmetrics_duration_seconds_bucket{aws_account_name=~\"$aws_account_name\",service_name=~\"$service\", span_name=~\"$span_name\"}[$__range])) by (le,span_name)))", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "{{span_name}}", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "histogram_quantile(0.99, sum(rate(traces_spanmetrics_duration_seconds_bucket{aws_account_name=~\"$aws_account_name\",service_name=~\"$service\", span_name=~\"$span_name\"}[$__rate_interval])) by (le,service_name))", + "hide": true, + "interval": "", + "legendFormat": "quantile99", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "histogram_quantile(0.999, sum(rate(traces_spanmetrics_duration_seconds_bucket{aws_account_name=~\"$aws_account_name\",service_name=~\"$service\", span_name=~\"$span_name\"}[$__rate_interval])) by (le,service_name))", + "hide": true, + "interval": "", + "legendFormat": "quantile999", + "range": true, + "refId": "D" + } + ], + "title": "Top 3x3 - span_name Latency - quantile95", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-BlYlRd" + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 12, + "x": 12, + "y": 34 + }, + "id": 10, + "interval": "5m", + "options": { + "displayMode": "lcd", + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["mean"], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "10.4.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "topk(7,sum by (span_name,service_name)(increase(traces_spanmetrics_duration_seconds_sum{service_name=~\"$service\", span_name=~\"$span_name\"}[$__rate_interval]) / increase(traces_spanmetrics_duration_seconds_count{service_name=~\"$service\", span_name=~\"$span_name\"}[$__rate_interval])))", + "instant": true, + "interval": "", + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Top 7 Highest Endpoint Latencies Mean Over Range ", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 47 + }, + "id": 16, + "interval": "5m", + "options": { + "legend": { + "calcs": ["mean", "logmin", "max", "delta"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": true, + "expr": "topk(7,sum by (span_name,service_name)(increase(traces_spanmetrics_duration_seconds_sum{aws_account_name=~\"$aws_account_name\",service_name=~\"$service\", span_name=~\"$span_name\"}[$__rate_interval]) / increase(traces_spanmetrics_duration_seconds_count{service_name=~\"$service\", span_name=~\"$span_name\"}[$__rate_interval])))", + "instant": false, + "interval": "", + "legendFormat": "[{{service_name}}] {{span_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Top 7 Latencies Over Range ", + "type": "timeseries" + } + ], + "refresh": "5m", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": ["All"], + "value": ["$__all"] + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(up,aws_account_environment)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "aws_account_environment", + "options": [], + "query": { + "query": "label_values(up,aws_account_environment)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": true, + "text": ["All"], + "value": ["$__all"] + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(up{aws_account_environment=~\"$aws_account_environment\"},aws_region)", + "hide": 0, + "includeAll": true, + "label": "Region", + "multi": true, + "name": "aws_region", + "options": [], + "query": { + "query": "label_values(up{aws_account_environment=~\"$aws_account_environment\"},aws_region)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": true, + "text": ["ptl"], + "value": ["ptl"] + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(up{aws_account_environment=~\"$aws_account_environment\", aws_region=~\"$aws_region\"}, customer_name)", + "hide": 0, + "includeAll": true, + "label": "Customer", + "multi": true, + "name": "customer_name", + "options": [], + "query": { + "query": "label_values(up{aws_account_environment=~\"$aws_account_environment\", aws_region=~\"$aws_region\"}, customer_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": true, + "text": ["All"], + "value": ["$__all"] + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(up{aws_account_environment=~\"$aws_account_environment\", aws_region=~\"$aws_region\",customer_name=~\"$customer_name\"}, aws_account_id)", + "hide": 2, + "includeAll": true, + "label": "AWS Account ID", + "multi": true, + "name": "aws_account_id", + "options": [], + "query": { + "query": "label_values(up{aws_account_environment=~\"$aws_account_environment\", aws_region=~\"$aws_region\",customer_name=~\"$customer_name\"}, aws_account_id)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "ptl-sandbox", + "value": "ptl-sandbox" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(up{aws_account_environment=~\"$aws_account_environment\", aws_region=~\"$aws_region\",customer_name=~\"$customer_name\",aws_account_id=~\"$aws_account_id\"}, aws_account_name)", + "hide": 0, + "includeAll": false, + "label": "Project name", + "multi": false, + "name": "aws_account_name", + "options": [], + "query": { + "query": "label_values(up{aws_account_environment=~\"$aws_account_environment\", aws_region=~\"$aws_region\",customer_name=~\"$customer_name\",aws_account_id=~\"$aws_account_id\"}, aws_account_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": ["All"], + "value": ["$__all"] + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "query_result(count by (service_name)(count_over_time(traces_spanmetrics_calls_total[$__range])))", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "service", + "options": [], + "query": { + "query": "query_result(count by (service_name)(count_over_time(traces_spanmetrics_calls_total[$__range])))", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "/.*service_name=\"(.*)\".*/", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": ["All"], + "value": ["$__all"] + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "query_result(sum ({__name__=~\".*traces_spanmetrics_calls_total\",service_name=~\"$service\"}) by (span_name))", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "span_name", + "options": [], + "query": { + "query": "query_result(sum ({__name__=~\".*traces_spanmetrics_calls_total\",service_name=~\"$service\"}) by (span_name))", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "/.*span_name=\"(.*)\".*/", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-30d", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "SRE - APM Top Dashboard", + "uid": "W2gX2zHVk48", + "version": 51, + "weekStart": "" +} diff --git a/context/otel-stack/dashboards/dashboards.yaml b/context/otel-stack/dashboards/dashboards.yaml new file mode 100644 index 000000000..e69de29bb diff --git a/context/otel-stack/grafana-datasources.yaml b/context/otel-stack/grafana-datasources.yaml new file mode 100644 index 000000000..f730030ea --- /dev/null +++ b/context/otel-stack/grafana-datasources.yaml @@ -0,0 +1,30 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + uid: prometheus + access: proxy + orgId: 1 + url: http://prometheus:9090 + basicAuth: false + isDefault: false + version: 1 + editable: false + jsonData: + httpMethod: GET + - name: Tempo + type: tempo + access: proxy + orgId: 1 + url: http://tempo:3200 + basicAuth: false + isDefault: true + version: 1 + editable: false + apiVersion: 1 + uid: tempo + jsonData: + httpMethod: GET + serviceMap: + datasourceUid: prometheus diff --git a/context/otel-stack/otel-collector.yaml b/context/otel-stack/otel-collector.yaml new file mode 100644 index 000000000..439b6f37e --- /dev/null +++ b/context/otel-stack/otel-collector.yaml @@ -0,0 +1,16 @@ +receivers: + otlp: + protocols: + grpc: + http: + +exporters: + otlp: + endpoint: tempo:4317 + tls: + insecure: true +service: + pipelines: + traces: + receivers: [otlp] + exporters: [otlp] diff --git a/context/otel-stack/prometheus.yaml b/context/otel-stack/prometheus.yaml new file mode 100644 index 000000000..a987d7341 --- /dev/null +++ b/context/otel-stack/prometheus.yaml @@ -0,0 +1,11 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + - job_name: 'tempo' + static_configs: + - targets: ['tempo:3200'] diff --git a/context/otel-stack/tempo.yaml b/context/otel-stack/tempo.yaml new file mode 100644 index 000000000..d8e973389 --- /dev/null +++ b/context/otel-stack/tempo.yaml @@ -0,0 +1,65 @@ +stream_over_http_enabled: true +server: + http_listen_port: 3200 + log_level: info + +query_frontend: + search: + duration_slo: 5s + throughput_bytes_slo: 1.073741824e+09 + trace_by_id: + duration_slo: 5s + +distributor: + receivers: # this configuration will listen on all ports and protocols that tempo is capable of. + jaeger: # the receives all come from the OpenTelemetry collector. more configuration information can + protocols: # be found there: https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver + thrift_http: # + grpc: # for a production deployment you should only enable the receivers you need! + thrift_binary: + thrift_compact: + zipkin: + otlp: + protocols: + http: + grpc: + opencensus: + +ingester: + max_block_duration: 5m # cut the headblock when this much time passes. this is being set for demo purposes and should probably be left alone normally + +compactor: + compaction: + block_retention: 1h # overall Tempo trace retention. set for demo purposes + +metrics_generator: + registry: + external_labels: + source: tempo + cluster: docker-compose + storage: + path: /var/tempo/generator/wal + remote_write: + - url: http://prometheus:9090/api/v1/write + send_exemplars: true + traces_storage: + path: /var/tempo/generator/traces + +storage: + trace: + backend: local # backend configuration to use + wal: + path: /var/tempo/wal # where to store the wal locally + local: + path: /var/tempo/blocks + +overrides: + defaults: + global: + max_bytes_per_trace: 100000000 + ingestion: + max_traces_per_user: 10000 + read: + max_bytes_per_tag_values_query: 100000000 + metrics_generator: + processors: [service-graphs, span-metrics, local-blocks] # enables metrics generator diff --git a/generator/src/templates/nginx/http/gateway/otel-stack.server.conf.twig b/generator/src/templates/nginx/http/gateway/otel-stack.server.conf.twig new file mode 100644 index 000000000..315d533ff --- /dev/null +++ b/generator/src/templates/nginx/http/gateway/otel-stack.server.conf.twig @@ -0,0 +1,2 @@ +{% extends "nginx/http/gateway/server.conf.twig" %} +{% block upstream %}{{ upstream }}:8080{% endblock upstream %} diff --git a/generator/src/templates/service/otel-stack/default/otel-stack.yml.twig b/generator/src/templates/service/otel-stack/default/otel-stack.yml.twig new file mode 100644 index 000000000..6765381c3 --- /dev/null +++ b/generator/src/templates/service/otel-stack/default/otel-stack.yml.twig @@ -0,0 +1 @@ +{% include "service/#{engine}/latest/#{engine}.yml.twig" with _context only %} diff --git a/generator/src/templates/service/otel-stack/latest/otel-stack.yml.twig b/generator/src/templates/service/otel-stack/latest/otel-stack.yml.twig new file mode 100644 index 000000000..2ec45dc8c --- /dev/null +++ b/generator/src/templates/service/otel-stack/latest/otel-stack.yml.twig @@ -0,0 +1,67 @@ + + # Tempo runs as user 10001, and docker compose creates the volume as root. + # As such, we need to chown the volume in order for Tempo to start correctly. + tempo-init: + image: &tempoImage grafana/tempo:main-1a21818 + user: root + entrypoint: + - "chown" + - "10001:10001" + - "/var/tempo" + volumes: + - {{ serviceName }}-{{ serviceData['engine'] }}-data:/var/tempo:rw + + tempo: + image: *tempoImage + command: [ "-config.file=/etc/tempo.yaml" ] + volumes: + - ./${DEPLOYMENT_PATH}/context/otel-stack/tempo.yaml:/etc/tempo.yaml + - {{ serviceName }}-{{ serviceData['engine'] }}-data:/var/tempo:rw + networks: + - private + expose: + - "14268" # jaeger ingest + - "3200" # tempo + - "4317" # otlp grpc + - "4318" # otlp http + - "9411" # zipkin + depends_on: + - tempo-init + + collector: + image: otel/opentelemetry-collector:0.86.0 + command: [ "--config=/etc/otel-collector.yaml" ] + volumes: + - ./${DEPLOYMENT_PATH}/context/otel-stack/otel-collector.yaml:/etc/otel-collector.yaml + networks: + - private + + prometheus: + image: prom/prometheus:latest + command: + - --config.file=/etc/prometheus.yaml + - --web.enable-remote-write-receiver + - --enable-feature=exemplar-storage + networks: + - private + volumes: + - ./${DEPLOYMENT_PATH}/context/otel-stack/prometheus.yaml:/etc/prometheus.yaml + expose: + - "9090" + + grafana: + image: grafana/grafana:11.0.0 + volumes: + - ./${DEPLOYMENT_PATH}/context/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml + networks: + - public + - private + environment: + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin + - GF_AUTH_DISABLE_LOGIN_FORM=true + - GF_FEATURE_TOGGLES_ENABLE=traceqlEditor + + expose: + - "3000" +