From 74bbf39f7c8392d3234b60c2ec1154ed27fffd9d Mon Sep 17 00:00:00 2001
From: Arthur Dayton <103078673+arthur-observe@users.noreply.github.com>
Date: Mon, 30 Sep 2024 12:50:49 -0700
Subject: [PATCH] feat: OB-36454 add pod metrics config (#232)

- Add configuration for pod metrics to:
    - values.yaml
    - _cluster-metrics-config.tpl
- Add example for pod metrics to helm-charts/examples/agent/pod_metrics
---
 charts/agent/Chart.yaml                       |   2 +-
 charts/agent/README.md                        |  10 +-
 .../templates/_cluster-metrics-config.tpl     | 114 ++++++++++++++++++
 charts/agent/templates/_monitor-config.tpl    |   2 +-
 charts/agent/values.yaml                      |  10 ++
 examples/agent/affinity/affinity-values.yaml  |   9 +-
 examples/agent/pod_metrics/README.md          |  70 +++++++++++
 .../agent/pod_metrics/pod-metrics-values.yaml |  52 ++++++++
 examples/agent/pod_metrics/sample-pod-no.yaml |  73 +++++++++++
 examples/agent/pod_metrics/sample-pod.yaml    |  73 +++++++++++
 10 files changed, 407 insertions(+), 8 deletions(-)
 create mode 100644 examples/agent/pod_metrics/README.md
 create mode 100644 examples/agent/pod_metrics/pod-metrics-values.yaml
 create mode 100644 examples/agent/pod_metrics/sample-pod-no.yaml
 create mode 100644 examples/agent/pod_metrics/sample-pod.yaml

diff --git a/charts/agent/Chart.yaml b/charts/agent/Chart.yaml
index f96341dc..12fff75c 100644
--- a/charts/agent/Chart.yaml
+++ b/charts/agent/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
 name: agent
 description: Chart to install K8s collection stack based on Observe Agent
 type: application
-version: 0.17.0
+version: 0.18.0
 appVersion: "1.1.0"
 dependencies:
   - name: opentelemetry-collector
diff --git a/charts/agent/README.md b/charts/agent/README.md
index 88822f97..4100550c 100644
--- a/charts/agent/README.md
+++ b/charts/agent/README.md
@@ -1,6 +1,6 @@
 # agent
 
-![Version: 0.17.0](https://img.shields.io/badge/Version-0.17.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.1.0](https://img.shields.io/badge/AppVersion-1.1.0-informational?style=flat-square)
+![Version: 0.18.0](https://img.shields.io/badge/Version-0.18.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.1.0](https://img.shields.io/badge/AppVersion-1.1.0-informational?style=flat-square)
 
 > [!CAUTION]
 > This chart is under active development and is not meant to be installed yet.
@@ -166,6 +166,14 @@ Chart to install K8s collection stack based on Observe Agent
 | cluster.events.enabled | bool | `true` |  |
 | cluster.events.pullInterval | string | `"20m"` |  |
 | cluster.metrics.enabled | bool | `true` |  |
+| cluster.metrics.pod.action | string | `"keep"` |  |
+| cluster.metrics.pod.enabled | bool | `false` |  |
+| cluster.metrics.pod.interval | string | `"10s"` |  |
+| cluster.metrics.pod.metric_drop_regex | string | `".*bucket"` |  |
+| cluster.metrics.pod.metric_keep_regex | string | `"(.*)"` |  |
+| cluster.metrics.pod.namespace_drop_regex | string | `"(.*istio.*|.*ingress.*|kube-system|observe)"` |  |
+| cluster.metrics.pod.namespace_keep_regex | string | `"(.*)"` |  |
+| cluster.metrics.pod.port_keep_regex | string | `".*metrics"` |  |
 | cluster.name | string | `"observe-agent-monitored-cluster"` |  |
 | cluster.namespaceOverride.value | string | `"observe"` |  |
 | cluster.uidOverride.value | string | `""` |  |
diff --git a/charts/agent/templates/_cluster-metrics-config.tpl b/charts/agent/templates/_cluster-metrics-config.tpl
index 11aa3d35..b9266d35 100644
--- a/charts/agent/templates/_cluster-metrics-config.tpl
+++ b/charts/agent/templates/_cluster-metrics-config.tpl
@@ -23,6 +23,109 @@ receivers:
     metrics:
       k8s.node.condition:
         enabled: true
+  {{- if .Values.cluster.metrics.pod.enabled }}
+  prometheus/pod_metrics:
+      config:
+        scrape_configs:
+        - job_name: pod-metrics
+          scrape_interval: {{.Values.cluster.metrics.pod.interval}}
+          honor_labels: true
+          kubernetes_sd_configs:
+          - role: pod
+          relabel_configs:
+          # this is defaulted to keep but if set to drop then pod metrics will not be collected
+          - action: {{.Values.cluster.metrics.pod.action}}
+
+          # Drop pods whose namespace matches namespace_drop_regex (quoted so an empty value renders as "" rather than a YAML null).
+          - action: 'drop'
+            source_labels: ['__meta_kubernetes_namespace']
+            regex: {{ .Values.cluster.metrics.pod.namespace_drop_regex | quote }}
+
+          # Keep only pods whose namespace matches namespace_keep_regex (quoted for the same reason).
+          - action: 'keep'
+            source_labels: ['__meta_kubernetes_namespace']
+            regex: {{ .Values.cluster.metrics.pod.namespace_keep_regex | quote }}
+
+          # Keep only endpoints with a port name matching port_keep_regex (wrapped in its own group so alternations like 'a|b' stay scoped to the port name) or an explicit prometheus port annotation.
+          - action: 'keep'
+            source_labels: ['__meta_kubernetes_pod_container_port_name', '__meta_kubernetes_pod_annotation_prometheus_io_port']
+            regex: '((?:{{.Values.cluster.metrics.pod.port_keep_regex}});|.*;\d+)'
+
+          # Drop pods with phase Succeeded or Failed.
+          - action: 'drop'
+            regex: 'Succeeded|Failed'
+            source_labels: ['__meta_kubernetes_pod_phase']
+
+          ################################################################
+          # Drop anything annotated with 'observeinc.com.scrape=false' or 'observeinc_com_scrape=false'.
+          - action: 'drop'
+            regex: 'false'
+            source_labels: ['__meta_kubernetes_pod_annotation_observeinc_com_scrape']
+
+          ################################################################
+          # Prometheus Configs
+          # Drop anything annotated with 'prometheus.io.scrape=false'.
+          - action: 'drop'
+            regex: 'false'
+            source_labels: ['__meta_kubernetes_pod_annotation_prometheus_io_scrape']
+
+          # Allow pods to override the scrape scheme with 'prometheus.io.scheme=https'.
+          - action: 'replace'
+            regex: '(https?)'
+            replacement: '$$1'  # '$$' is the collector-config escape for a literal '$', matching the observeinc_com_port rule below
+            source_labels: ['__meta_kubernetes_pod_annotation_prometheus_io_scheme']
+            target_label: '__scheme__'
+
+          # Allow pods to override the scrape path with 'prometheus.io.path=/other_metrics_path'.
+          - action: 'replace'
+            regex: '(.+)'
+            replacement: '$$1'
+            source_labels: ['__meta_kubernetes_pod_annotation_prometheus_io_path']
+            target_label: '__metrics_path__'
+
+          # Allow pods to override the scrape port with 'prometheus.io.port=1234'.
+          - action: 'replace'
+            regex: '(.+?)(\:\d+)?;(\d+)'
+            replacement: '$$1:$$3'
+            source_labels: ['__address__', '__meta_kubernetes_pod_annotation_prometheus_io_port']
+            target_label: '__address__'
+
+
+          ################################################################
+
+          #podAnnotations: {
+          #  observeinc_com_scrape: 'true',
+          #  observeinc_com_path: '/metrics',
+          #  observeinc_com_port: '8080',
+          #}
+
+          # Set __metrics_path__ (default is /metrics) to the path given in the "observeinc_com_path: <metric path>" pod annotation; the rule only applies when the annotation is non-empty.
+          - source_labels: [__meta_kubernetes_pod_annotation_observeinc_com_path]
+            action: replace
+            target_label: __metrics_path__
+            regex: (.+)
+
+          # Set the scraping port to the one given in the "observeinc_com_port: <port>" pod annotation by rewriting __address__ to <host>:<port>.
+          - source_labels: [__address__, __meta_kubernetes_pod_annotation_observeinc_com_port]
+            action: replace
+            regex: ([^:]+)(?::\d+)?;(\d+)
+            replacement: $$1:$$2
+            target_label: __address__
+
+          # Maps all Kubernetes pod labels to Prometheus labels with the prefix removed (e.g., __meta_kubernetes_pod_label_app becomes app).
+          - action: labelmap
+            regex: __meta_kubernetes_pod_label_(.+)
+
+          # Copy the pod's namespace (__meta_kubernetes_namespace) into the kubernetes_namespace label.
+          - source_labels: [__meta_kubernetes_namespace]
+            action: replace
+            target_label: kubernetes_namespace
+
+          # Copy the pod's name (__meta_kubernetes_pod_name) into the kubernetes_pod_name label.
+          - source_labels: [__meta_kubernetes_pod_name]
+            action: replace
+            target_label: kubernetes_pod_name
+  {{ end }}
 
 processors:
 {{- include "config.processors.memory_limiter" . | nindent 2 }}
@@ -39,6 +142,11 @@ processors:
       - key: debug_source
         action: insert
         value: cluster_metrics
+  attributes/debug_source_pod_metrics:
+    actions:
+      - key: debug_source
+        action: insert
+        value: pod_metrics
 
 service:
   extensions: [health_check]
@@ -47,6 +155,12 @@ service:
         receivers: [k8s_cluster]
         processors: [memory_limiter, batch, k8sattributes, attributes/observe_common, attributes/debug_source_cluster_metrics]
         exporters: [prometheusremotewrite, debug/override]
+      {{- if .Values.cluster.metrics.pod.enabled }}
+      metrics/pod_metrics:
+        receivers: [prometheus/pod_metrics]
+        processors: [memory_limiter, batch, k8sattributes, attributes/observe_common, attributes/debug_source_pod_metrics]
+        exporters: [prometheusremotewrite, debug/override]
+      {{ end -}}
 {{- include "config.service.telemetry" . | nindent 2 }}
 
  {{- end }}
diff --git a/charts/agent/templates/_monitor-config.tpl b/charts/agent/templates/_monitor-config.tpl
index 3015fe5e..4543725b 100644
--- a/charts/agent/templates/_monitor-config.tpl
+++ b/charts/agent/templates/_monitor-config.tpl
@@ -31,7 +31,7 @@ receivers:
               action: keep
               regex: true
               # set metrics_path (default is /metrics) to the metrics path specified in "prometheus.io/path: <metric path>" annotation.
-            - source_labels: [__meta_kubernetes_pod_annotationobserve_monitor_path]
+            - source_labels: [__meta_kubernetes_pod_annotation_observe_monitor_path]
               action: replace
               target_label: __metrics_path__
               regex: (.+)
diff --git a/charts/agent/values.yaml b/charts/agent/values.yaml
index d91ba306..0cc44c20 100644
--- a/charts/agent/values.yaml
+++ b/charts/agent/values.yaml
@@ -21,6 +21,16 @@ cluster:
     enabled: true
   metrics:
     enabled: true
+    pod:
+      enabled: false
+      action: keep
+      interval: 10s
+      # add whatever namespace the collection is deployed to
+      namespace_drop_regex: (.*istio.*|.*ingress.*|kube-system|observe)
+      namespace_keep_regex: (.*)
+      port_keep_regex: .*metrics
+      metric_drop_regex: .*bucket
+      metric_keep_regex: (.*)
 
   namespaceOverride:
     # ! This needs to have same value as namespaceOverride in deployments and daemonsets below
diff --git a/examples/agent/affinity/affinity-values.yaml b/examples/agent/affinity/affinity-values.yaml
index 67e5c924..9a770827 100644
--- a/examples/agent/affinity/affinity-values.yaml
+++ b/examples/agent/affinity/affinity-values.yaml
@@ -30,7 +30,7 @@ config:
     debug:
       verbosity: normal
 
-deployment-cluster-events:
+cluster-events:
   enabled: true
   namespaceOverride: "k8smonitoring"
   affinity:
@@ -47,15 +47,14 @@ deployment-cluster-events:
                 operator: NotIn
                 values: [windows]
 
-deployment-cluster-metrics:
+cluster-metrics:
   enabled: true
   namespaceOverride: "k8smonitoring"
 
-daemonset-logs-metrics:
+node-logs-metrics:
   enabled: true
   namespaceOverride: "k8smonitoring"
 
-
-deployment-agent-monitor:
+monitor:
   enabled: true
   namespaceOverride: "k8smonitoring"
diff --git a/examples/agent/pod_metrics/README.md b/examples/agent/pod_metrics/README.md
new file mode 100644
index 00000000..0d02ef6b
--- /dev/null
+++ b/examples/agent/pod_metrics/README.md
@@ -0,0 +1,70 @@
+# Prometheus pod metrics scrape example
+This example deploys a sample container that emits prometheus metrics.  Source is here - https://github.com/brancz/prometheus-example-app/blob/master/README.md
+
+It also creates a service - prometheus-example-app-service - and a cron job that calls the service at / every 10s and /err every 20s for a total of about 6 minutes per run (36 iterations with a 10s sleep).
+
+You can alter the length of time the cron job runs and sleep time between runs by altering the SLEEP_TIME and LOOP_COUNT environment variables in the sample-pod.yaml file.
+
+The sample-pod-no.yaml demonstrates using the observeinc_com_scrape: 'false' annotation - you can prove its effectiveness by setting it to 'true' and seeing the metrics show up.
+
+## Deploy sample container
+```
+kubectl apply -f sample-pod.yaml
+
+kubectl apply -f sample-pod-no.yaml
+
+```
+
+## Port forward
+```
+kubectl port-forward service/prometheus-example-app-service 8080:8080
+
+kubectl port-forward service/prometheus-example-app-no-service 8080:8080
+```
+
+## Call service locally
+```
+# Get metrics
+curl localhost:8080/metrics
+
+# Generate 200 count for metrics
+curl localhost:8080/
+
+# Generate error count for metrics
+curl localhost:8080/err
+
+```
+
+### Cleanup
+```
+kubectl delete -f sample-pod.yaml
+
+kubectl delete -f sample-pod-no.yaml
+```
+
+## Deploy k8s monitoring with pod metrics collection enabled
+The sample-pod.yaml is deployed in the default namespace unless you change the provided command.
+
+It has annotations to change the metrics port to 8080 (observeinc_com_port: '8080') which will tell the scrape config to not use the default port of 8888.
+
+The pod-metrics-values.yaml file restricts scraping to the default namespace with namespace_keep_regex and adds web as an accepted port name with port_keep_regex.
+
+```
+helm install pod-metrics-example -n k8smonitoring \
+    --set observe.token.value=$TOKEN \
+    --set observe.collectionEndpoint.value=$ENDPOINT \
+     -f ./pod-metrics-values.yaml ../../../charts/agent
+
+helm upgrade pod-metrics-example -n k8smonitoring -f ./pod-metrics-values.yaml ../../../charts/agent
+```
+
+### Opal validation
+Create a worksheet from your datastream and token, then use the following OPAL:
+```
+make_col debug_source:string(EXTRA.debug_source)
+filter OBSERVATION_KIND = "prometheus"
+filter debug_source = "pod_metrics"
+make_col metric:string(EXTRA.__name__)
+make_col k8s_namespace_name:string(EXTRA.k8s_namespace_name)
+make_col app_kubernetes_io_name:string(EXTRA.app_kubernetes_io_name)
+```
diff --git a/examples/agent/pod_metrics/pod-metrics-values.yaml b/examples/agent/pod_metrics/pod-metrics-values.yaml
new file mode 100644
index 00000000..4eab42c6
--- /dev/null
+++ b/examples/agent/pod_metrics/pod-metrics-values.yaml
@@ -0,0 +1,52 @@
+cluster:
+  name: observe-agent-monitored-cluster
+  events:
+    pullInterval: 20m
+    enabled: true
+  metrics:
+    enabled: true
+    pod:
+      enabled: true
+      namespace_keep_regex: (default)
+      metric_drop_regex: ""
+      port_keep_regex: .*metrics|web
+
+  namespaceOverride:
+    value: k8smonitoring
+
+containers:
+  logs:
+    enabled: true
+  metrics:
+    enabled: false
+
+agent:
+  selfMonitor:
+    enabled: true
+
+config:
+  global:
+    debug:
+      verbosity: normal
+
+cluster-events:
+  enabled: true
+  namespaceOverride: "k8smonitoring"
+  tolerations:
+  - key: "deployObserve"
+    operator: "Equal"
+    value: "notAllowed"
+    effect: "NoSchedule"
+
+cluster-metrics:
+  enabled: true
+  namespaceOverride: "k8smonitoring"
+
+node-logs-metrics:
+  enabled: true
+  namespaceOverride: "k8smonitoring"
+
+
+monitor:
+  enabled: true
+  namespaceOverride: "k8smonitoring"
diff --git a/examples/agent/pod_metrics/sample-pod-no.yaml b/examples/agent/pod_metrics/sample-pod-no.yaml
new file mode 100644
index 00000000..1725232a
--- /dev/null
+++ b/examples/agent/pod_metrics/sample-pod-no.yaml
@@ -0,0 +1,73 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    app.kubernetes.io/name: prometheus-example-app-no
+  name: prometheus-example-app-no
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: prometheus-example-app-no
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: prometheus-example-app-no
+      annotations:
+        observeinc_com_scrape: 'false'
+        observeinc_com_path: '/metrics'
+        observeinc_com_port: '8080'
+    spec:
+      containers:
+      - name: prometheus-example-app-no
+        image: quay.io/brancz/prometheus-example-app:v0.3.0
+        ports:
+        - name: web
+          containerPort: 8080
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: prometheus-example-app-no-service
+spec:
+  selector:
+    app.kubernetes.io/name: prometheus-example-app-no
+  ports:
+    - protocol: TCP
+      port: 8080  # Exposed service port
+      targetPort: 8080
+      name: metrics
+---
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: caller-cronjob-no  # distinct from the CronJob in sample-pod.yaml so applying both files does not overwrite one with the other
+spec:
+  schedule: "*/1 * * * *"  # Runs every minute
+  jobTemplate:
+    spec:
+      template:
+        spec:
+          containers:
+          - name: caller
+            image: curlimages/curl:latest  # A lightweight curl image
+            env:
+              - name: SLEEP_TIME
+                value: "10"  # Sleep time in seconds
+              - name: LOOP_COUNT
+                value: "36"   # Number of iterations
+            command:
+              - /bin/sh
+              - -c
+              - |
+                for i in $(seq 1 $LOOP_COUNT); do
+                  curl http://prometheus-example-app-no-service:8080;  # Adjust the URL and port as necessary
+
+                  # Second call on even numbers
+                  if [ $((i % 2)) -eq 0 ]; then
+                    curl http://prometheus-example-app-no-service:8080/err;  # Second target service
+                    echo "Second call on even #$i made."
+                  fi
+                  sleep $SLEEP_TIME;
+                done
+          restartPolicy: OnFailure
diff --git a/examples/agent/pod_metrics/sample-pod.yaml b/examples/agent/pod_metrics/sample-pod.yaml
new file mode 100644
index 00000000..3018d477
--- /dev/null
+++ b/examples/agent/pod_metrics/sample-pod.yaml
@@ -0,0 +1,73 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    app.kubernetes.io/name: prometheus-example-app
+  name: prometheus-example-app
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: prometheus-example-app
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: prometheus-example-app
+      annotations:
+        observeinc_com_scrape: 'true'
+        observeinc_com_path: '/metrics'
+        observeinc_com_port: '8080'
+    spec:
+      containers:
+      - name: prometheus-example-app
+        image: quay.io/brancz/prometheus-example-app:v0.3.0
+        ports:
+        - name: web
+          containerPort: 8080
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: prometheus-example-app-service
+spec:
+  selector:
+    app.kubernetes.io/name: prometheus-example-app
+  ports:
+    - protocol: TCP
+      port: 8080  # Exposed service port
+      targetPort: 8080
+      name: metrics
+---
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: caller-cronjob
+spec:
+  schedule: "*/1 * * * *"  # Runs every minute
+  jobTemplate:
+    spec:
+      template:
+        spec:
+          containers:
+          - name: caller
+            image: curlimages/curl:latest  # A lightweight curl image
+            env:
+              - name: SLEEP_TIME
+                value: "10"  # Sleep time in seconds
+              - name: LOOP_COUNT
+                value: "36"   # Number of iterations
+            command:
+              - /bin/sh
+              - -c
+              - |
+                for i in $(seq 1 $LOOP_COUNT); do
+                  curl http://prometheus-example-app-service:8080;  # Adjust the URL and port as necessary
+
+                  # Second call on even numbers
+                  if [ $((i % 2)) -eq 0 ]; then
+                    curl http://prometheus-example-app-service:8080/err;  # Second target service
+                    echo "Second call on even #$i made."
+                  fi
+                  sleep $SLEEP_TIME;
+                done
+          restartPolicy: OnFailure