From 74bbf39f7c8392d3234b60c2ec1154ed27fffd9d Mon Sep 17 00:00:00 2001 From: Arthur Dayton <103078673+arthur-observe@users.noreply.github.com> Date: Mon, 30 Sep 2024 12:50:49 -0700 Subject: [PATCH] feat: OB-36454 add pod metrics config (#232) - Add configuration for pod metrics to: - values.yaml - _cluster-metrics-config.tpl - Add example for pod metrics to helm-charts/examples/agent/pod_metrics --- charts/agent/Chart.yaml | 2 +- charts/agent/README.md | 10 +- .../templates/_cluster-metrics-config.tpl | 114 ++++++++++++++++++ charts/agent/templates/_monitor-config.tpl | 2 +- charts/agent/values.yaml | 10 ++ examples/agent/affinity/affinity-values.yaml | 9 +- examples/agent/pod_metrics/README.md | 70 +++++++++++ .../agent/pod_metrics/pod-metrics-values.yaml | 52 ++++++++ examples/agent/pod_metrics/sample-pod-no.yaml | 73 +++++++++++ examples/agent/pod_metrics/sample-pod.yaml | 73 +++++++++++ 10 files changed, 407 insertions(+), 8 deletions(-) create mode 100644 examples/agent/pod_metrics/README.md create mode 100644 examples/agent/pod_metrics/pod-metrics-values.yaml create mode 100644 examples/agent/pod_metrics/sample-pod-no.yaml create mode 100644 examples/agent/pod_metrics/sample-pod.yaml diff --git a/charts/agent/Chart.yaml b/charts/agent/Chart.yaml index f96341dc..12fff75c 100644 --- a/charts/agent/Chart.yaml +++ b/charts/agent/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: agent description: Chart to install K8s collection stack based on Observe Agent type: application -version: 0.17.0 +version: 0.18.0 appVersion: "1.1.0" dependencies: - name: opentelemetry-collector diff --git a/charts/agent/README.md b/charts/agent/README.md index 88822f97..4100550c 100644 --- a/charts/agent/README.md +++ b/charts/agent/README.md @@ -1,6 +1,6 @@ # agent -![Version: 0.17.0](https://img.shields.io/badge/Version-0.17.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 
1.1.0](https://img.shields.io/badge/AppVersion-1.1.0-informational?style=flat-square) +![Version: 0.18.0](https://img.shields.io/badge/Version-0.18.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.1.0](https://img.shields.io/badge/AppVersion-1.1.0-informational?style=flat-square) > [!CAUTION] > This chart is under active development and is not meant to be installed yet. @@ -166,6 +166,14 @@ Chart to install K8s collection stack based on Observe Agent | cluster.events.enabled | bool | `true` | | | cluster.events.pullInterval | string | `"20m"` | | | cluster.metrics.enabled | bool | `true` | | +| cluster.metrics.pod.action | string | `"keep"` | | +| cluster.metrics.pod.enabled | bool | `false` | | +| cluster.metrics.pod.interval | string | `"10s"` | | +| cluster.metrics.pod.metric_drop_regex | string | `".*bucket"` | | +| cluster.metrics.pod.metric_keep_regex | string | `"(.*)"` | | +| cluster.metrics.pod.namespace_drop_regex | string | `"(.*istio.*|.*ingress.*|kube-system|observe)"` | | +| cluster.metrics.pod.namespace_keep_regex | string | `"(.*)"` | | +| cluster.metrics.pod.port_keep_regex | string | `".*metrics"` | | | cluster.name | string | `"observe-agent-monitored-cluster"` | | | cluster.namespaceOverride.value | string | `"observe"` | | | cluster.uidOverride.value | string | `""` | | diff --git a/charts/agent/templates/_cluster-metrics-config.tpl b/charts/agent/templates/_cluster-metrics-config.tpl index 11aa3d35..b9266d35 100644 --- a/charts/agent/templates/_cluster-metrics-config.tpl +++ b/charts/agent/templates/_cluster-metrics-config.tpl @@ -23,6 +23,109 @@ receivers: metrics: k8s.node.condition: enabled: true + {{- if .Values.cluster.metrics.pod.enabled }} + prometheus/pod_metrics: + config: + scrape_configs: + - job_name: pod-metrics + scrape_interval: {{.Values.cluster.metrics.pod.interval}} + honor_labels: true + kubernetes_sd_configs: + - role: pod 
+ relabel_configs: + # this is defaulted to keep but if set to drop then pod metrics will not be collected + - action: {{.Values.cluster.metrics.pod.action}} + + # Drop anything matching the configured namespace. + - action: 'drop' + source_labels: ['__meta_kubernetes_namespace'] + regex: {{.Values.cluster.metrics.pod.namespace_drop_regex}} + + # Drop anything not matching the configured namespace. + - action: 'keep' + source_labels: ['__meta_kubernetes_namespace'] + regex: {{.Values.cluster.metrics.pod.namespace_keep_regex}} + + # Drop endpoints without one of: a port name suffixed with the configured regex, or an explicit prometheus port annotation. + - action: 'keep' + source_labels: ['__meta_kubernetes_pod_container_port_name', '__meta_kubernetes_pod_annotation_prometheus_io_port'] + regex: '({{.Values.cluster.metrics.pod.port_keep_regex}};|.*;\d+)' + + # Drop pods with phase Succeeded or Failed. + - action: 'drop' + regex: 'Succeeded|Failed' + source_labels: ['__meta_kubernetes_pod_phase'] + + ################################################################ + # Drop anything annotated with 'observeinc.com.scrape=false' or 'observeinc_com_scrape=false' . + - action: 'drop' + regex: 'false' + source_labels: ['__meta_kubernetes_pod_annotation_observeinc_com_scrape'] + + ################################################################ + # Prometheus Configs + # Drop anything annotated with 'prometheus.io.scrape=false'. + - action: 'drop' + regex: 'false' + source_labels: ['__meta_kubernetes_pod_annotation_prometheus_io_scrape'] + + # Allow pods to override the scrape scheme with 'prometheus.io.scheme=https'. + - action: 'replace' + regex: '(https?)' + replacement: '$1' + source_labels: ['__meta_kubernetes_pod_annotation_prometheus_io_scheme'] + target_label: '__scheme__' + + # Allow service to override the scrape path with 'prometheus.io.path=/other_metrics_path'. 
+ - action: 'replace' + regex: '(.+)' + replacement: '$1' + source_labels: ['__meta_kubernetes_pod_annotation_prometheus_io_path'] + target_label: '__metrics_path__' + + # Allow services to override the scrape port with 'prometheus.io.port=1234'. + - action: 'replace' + regex: '(.+?)(\:\d+)?;(\d+)' + replacement: '$1:$3' + source_labels: ['__address__', '__meta_kubernetes_pod_annotation_prometheus_io_port'] + target_label: '__address__' + + + ################################################################ + + #podAnnotations: { + # observeinc_com_scrape: 'true', + # observeinc_com_path: '/metrics', + # observeinc_com_port: '8080', + #} + + # set metrics_path (default is /metrics) to the metrics path specified in "observeinc_com_path: " annotation. + - source_labels: [__meta_kubernetes_pod_annotation_observeinc_com_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + + # set the scrapping port to the port specified in "observeinc_com_port: " annotation and set address accordingly. + - source_labels: [__address__, __meta_kubernetes_pod_annotation_observeinc_com_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $$1:$$2 + target_label: __address__ + + # Maps all Kubernetes pod labels to Prometheus labels with the prefix removed (e.g., __meta_kubernetes_pod_label_app becomes app). + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + + # adds new label + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + + # adds new label + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + {{ end }} processors: {{- include "config.processors.memory_limiter" . 
| nindent 2 }} @@ -39,6 +142,11 @@ processors: - key: debug_source action: insert value: cluster_metrics + attributes/debug_source_pod_metrics: + actions: + - key: debug_source + action: insert + value: pod_metrics service: extensions: [health_check] @@ -47,6 +155,12 @@ service: receivers: [k8s_cluster] processors: [memory_limiter, batch, k8sattributes, attributes/observe_common, attributes/debug_source_cluster_metrics] exporters: [prometheusremotewrite, debug/override] + {{- if .Values.cluster.metrics.pod.enabled }} + metrics/pod_metrics: + receivers: [prometheus/pod_metrics] + processors: [memory_limiter, batch, k8sattributes, attributes/observe_common, attributes/debug_source_pod_metrics] + exporters: [prometheusremotewrite, debug/override] + {{ end -}} {{- include "config.service.telemetry" . | nindent 2 }} {{- end }} diff --git a/charts/agent/templates/_monitor-config.tpl b/charts/agent/templates/_monitor-config.tpl index 3015fe5e..4543725b 100644 --- a/charts/agent/templates/_monitor-config.tpl +++ b/charts/agent/templates/_monitor-config.tpl @@ -31,7 +31,7 @@ receivers: action: keep regex: true # set metrics_path (default is /metrics) to the metrics path specified in "prometheus.io/path: " annotation. - - source_labels: [__meta_kubernetes_pod_annotationobserve_monitor_path] + - source_labels: [__meta_kubernetes_pod_annotation_observe_monitor_path] action: replace target_label: __metrics_path__ regex: (.+) diff --git a/charts/agent/values.yaml b/charts/agent/values.yaml index d91ba306..0cc44c20 100644 --- a/charts/agent/values.yaml +++ b/charts/agent/values.yaml @@ -21,6 +21,16 @@ cluster: enabled: true metrics: enabled: true + pod: + enabled: false + action: keep + interval: 10s + # add whatever namespace the collection is deployed to + namespace_drop_regex: (.*istio.*|.*ingress.*|kube-system|observe) + namespace_keep_regex: (.*) + port_keep_regex: .*metrics + metric_drop_regex: .*bucket + metric_keep_regex: (.*) namespaceOverride: # ! 
This needs to have same value as namespaceOverride in deployments and daemonsets below diff --git a/examples/agent/affinity/affinity-values.yaml b/examples/agent/affinity/affinity-values.yaml index 67e5c924..9a770827 100644 --- a/examples/agent/affinity/affinity-values.yaml +++ b/examples/agent/affinity/affinity-values.yaml @@ -30,7 +30,7 @@ config: debug: verbosity: normal -deployment-cluster-events: +cluster-events: enabled: true namespaceOverride: "k8smonitoring" affinity: @@ -47,15 +47,14 @@ deployment-cluster-events: operator: NotIn values: [windows] -deployment-cluster-metrics: +cluster-metrics: enabled: true namespaceOverride: "k8smonitoring" -daemonset-logs-metrics: +node-logs-metrics: enabled: true namespaceOverride: "k8smonitoring" - -deployment-agent-monitor: +monitor: enabled: true namespaceOverride: "k8smonitoring" diff --git a/examples/agent/pod_metrics/README.md b/examples/agent/pod_metrics/README.md new file mode 100644 index 00000000..0d02ef6b --- /dev/null +++ b/examples/agent/pod_metrics/README.md @@ -0,0 +1,70 @@ +# Prometheus pod metrics scrape example +This example deploys a sample container that emits prometheus metrics. Source is here - https://github.com/brancz/prometheus-example-app/blob/master/README.md + +It also creates a service - prometheus-example-app-service - and a cron job that calls the service at / every 10s and /err every 20s for a total of about 6 minutes per run (36 iterations with a 10s sleep). + +You can alter the length of time the cron job runs and sleep time between runs by altering the SLEEP_TIME and LOOP_COUNT environment variables in the sample-pod.yaml file. + +The sample-pod-no.yaml demonstrates using the observeinc_com_scrape: 'false' annotation - you can prove its effectiveness by setting it to 'true' and seeing the metrics show up. 
+ +## Deploy sample container +``` +kubectl apply -f sample-pod.yaml + +kubectl apply -f sample-pod-no.yaml + +``` + +## Port forward +``` +kubectl port-forward service/prometheus-example-app-service 8080:8080 + +kubectl port-forward service/prometheus-example-app-no-service 8080:8080 +``` + +## Call service locally +``` +# Get metrics +curl localhost:8080/metrics + +# Generate 200 count for metrics +curl localhost:8080/ + +# Generate error count for metrics +curl localhost:8080/err + +``` + +### Cleanup +``` +kubectl delete -f sample-pod.yaml + +kubectl delete -f sample-pod-no.yaml +``` + +## Deploy k8s monitoring with pod metrics collection enabled +The sample-pod.yaml is deployed in the default namespace unless you change the provided command. + +It is annotated with observeinc_com_port: '8080', which tells the scrape config to use port 8080 instead of the default port of 8888. + +The pod-metrics-values.yaml file restricts scraping to the default namespace with namespace_keep_regex and accepts the web port name as a valid metrics port with port_keep_regex. 
+ +``` +helm install pod-metrics-example -n k8smonitoring \ + --set observe.token.value=$TOKEN \ + --set observe.collectionEndpoint.value=$ENDPOINT \ + -f ./pod-metrics-values.yaml ../../../charts/agent + +helm upgrade pod-metrics-example -n k8smonitoring -f ./pod-metrics-values.yaml ../../../charts/agent +``` + +### Opal validation +Create a worksheet from your datastream and token, then use the following OPAL: +``` +make_col debug_source:string(EXTRA.debug_source) +filter OBSERVATION_KIND = "prometheus" +filter debug_source = "pod_metrics" +make_col metric:string(EXTRA.__name__) +make_col k8s_namespace_name:string(EXTRA.k8s_namespace_name) +make_col app_kubernetes_io_name:string(EXTRA.app_kubernetes_io_name) +``` diff --git a/examples/agent/pod_metrics/pod-metrics-values.yaml b/examples/agent/pod_metrics/pod-metrics-values.yaml new file mode 100644 index 00000000..4eab42c6 --- /dev/null +++ b/examples/agent/pod_metrics/pod-metrics-values.yaml @@ -0,0 +1,52 @@ +cluster: + name: observe-agent-monitored-cluster + events: + pullInterval: 20m + enabled: true + metrics: + enabled: true + pod: + enabled: true + namespace_keep_regex: (default) + metric_drop_regex: "" + port_keep_regex: .*metrics|web + + namespaceOverride: + value: k8smonitoring + +containers: + logs: + enabled: true + metrics: + enabled: false + +agent: + selfMonitor: + enabled: true + +config: +global: + debug: + verbosity: normal + +cluster-events: + enabled: true + namespaceOverride: "k8smonitoring" + tolerations: + - key: "deployObserve" + operator: "Equal" + value: "notAllowed" + effect: "NoSchedule" + +cluster-metrics: + enabled: true + namespaceOverride: "k8smonitoring" + +node-logs-metrics: + enabled: true + namespaceOverride: "k8smonitoring" + + +monitor: + enabled: true + namespaceOverride: "k8smonitoring" diff --git a/examples/agent/pod_metrics/sample-pod-no.yaml b/examples/agent/pod_metrics/sample-pod-no.yaml new file mode 100644 index 00000000..1725232a --- /dev/null +++ 
b/examples/agent/pod_metrics/sample-pod-no.yaml @@ -0,0 +1,73 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/name: prometheus-example-app-no + name: prometheus-example-app-no +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: prometheus-example-app-no + template: + metadata: + labels: + app.kubernetes.io/name: prometheus-example-app-no + annotations: + observeinc_com_scrape: 'false' + observeinc_com_path: '/metrics' + observeinc_com_port: '8080' + spec: + containers: + - name: prometheus-example-app-no + image: quay.io/brancz/prometheus-example-app:v0.3.0 + ports: + - name: web + containerPort: 8080 +--- +apiVersion: v1 +kind: Service +metadata: + name: prometheus-example-app-no-service +spec: + selector: + app.kubernetes.io/name: prometheus-example-app-no + ports: + - protocol: TCP + port: 8080 # Exposed service port + targetPort: 8080 + name: metrics +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: caller-cronjob +spec: + schedule: "*/1 * * * *" # Runs every minute + jobTemplate: + spec: + template: + spec: + containers: + - name: caller + image: curlimages/curl:latest # A lightweight curl image + env: + - name: SLEEP_TIME + value: "10" # Sleep time in seconds + - name: LOOP_COUNT + value: "36" # Number of iterations + command: + - /bin/sh + - -c + - | + for i in $(seq 1 $LOOP_COUNT); do + curl http://prometheus-example-app-no-service:8080; # Adjust the URL and port as necessary + + # Second call on even numbers + if [ $((i % 2)) -eq 0 ]; then + curl http://prometheus-example-app-no-service:8080/err; # Second target service + echo "Second call on even #$i made." 
+ fi + sleep $SLEEP_TIME; + done + restartPolicy: OnFailure diff --git a/examples/agent/pod_metrics/sample-pod.yaml b/examples/agent/pod_metrics/sample-pod.yaml new file mode 100644 index 00000000..3018d477 --- /dev/null +++ b/examples/agent/pod_metrics/sample-pod.yaml @@ -0,0 +1,73 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/name: prometheus-example-app + name: prometheus-example-app +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: prometheus-example-app + template: + metadata: + labels: + app.kubernetes.io/name: prometheus-example-app + annotations: + observeinc_com_scrape: 'true' + observeinc_com_path: '/metrics' + observeinc_com_port: '8080' + spec: + containers: + - name: prometheus-example-app + image: quay.io/brancz/prometheus-example-app:v0.3.0 + ports: + - name: web + containerPort: 8080 +--- +apiVersion: v1 +kind: Service +metadata: + name: prometheus-example-app-service +spec: + selector: + app.kubernetes.io/name: prometheus-example-app + ports: + - protocol: TCP + port: 8080 # Exposed service port + targetPort: 8080 + name: metrics +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: caller-cronjob +spec: + schedule: "*/1 * * * *" # Runs every minute + jobTemplate: + spec: + template: + spec: + containers: + - name: caller + image: curlimages/curl:latest # A lightweight curl image + env: + - name: SLEEP_TIME + value: "10" # Sleep time in seconds + - name: LOOP_COUNT + value: "36" # Number of iterations + command: + - /bin/sh + - -c + - | + for i in $(seq 1 $LOOP_COUNT); do + curl http://prometheus-example-app-service:8080; # Adjust the URL and port as necessary + + # Second call on even numbers + if [ $((i % 2)) -eq 0 ]; then + curl http://prometheus-example-app-service:8080/err; # Second target service + echo "Second call on even #$i made." + fi + sleep $SLEEP_TIME; + done + restartPolicy: OnFailure