feat: add sending_queue and retry_on_failure options for otlp exporter

observeinc · Oct 17, 2024 · 6f3baae · 6f3baae
1 parent deacdf4
commit 6f3baae
Show file tree

Hide file tree

Showing 13 changed files with 60 additions and 32 deletions.
diff --git a/charts/agent/Chart.yaml b/charts/agent/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
 name: agent
 description: Chart to install K8s collection stack based on Observe Agent
 type: application
-version: 0.23.0
+version: 0.24.0
 appVersion: "1.1.0"
 dependencies:
   - name: opentelemetry-collector

diff --git a/charts/agent/README.md b/charts/agent/README.md
@@ -1,6 +1,6 @@
 # agent
 
-![Version: 0.23.0](https://img.shields.io/badge/Version-0.23.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.1.0](https://img.shields.io/badge/AppVersion-1.1.0-informational?style=flat-square)
+![Version: 0.24.0](https://img.shields.io/badge/Version-0.24.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.1.0](https://img.shields.io/badge/AppVersion-1.1.0-informational?style=flat-square)
 
 > [!CAUTION]
 > This chart is under active development and is not meant to be installed yet.
@@ -27,11 +27,16 @@ Chart to install K8s collection stack based on Observe Agent
 | Key | Type | Default | Description |
 |-----|------|---------|-------------|
 | agent.config.global.debug.verbosity | string | `"basic"` |  |
-| agent.config.global.processors.batch.send_batch_max_size | int | `4096` |  |
-| agent.config.global.processors.batch.send_batch_size | int | `4096` |  |
-| agent.config.global.service.telemetry.logging_encoding | string | `"console"` |  |
-| agent.config.global.service.telemetry.logging_level | string | `"WARN"` |  |
-| agent.config.global.service.telemetry.metrics_level | string | `"normal"` |  |
+| agent.config.global.exporters.retryOnFailure.enabled | bool | `true` |  |
+| agent.config.global.exporters.retryOnFailure.initialInterval | string | `"1s"` |  |
+| agent.config.global.exporters.retryOnFailure.maxElapsedTime | string | `"5m"` |  |
+| agent.config.global.exporters.retryOnFailure.maxInterval | string | `"30s"` |  |
+| agent.config.global.exporters.sendingQueue.enabled | bool | `true` |  |
+| agent.config.global.processors.batch.sendBatchMaxSize | int | `4096` |  |
+| agent.config.global.processors.batch.sendBatchSize | int | `4096` |  |
+| agent.config.global.service.telemetry.loggingEncoding | string | `"console"` |  |
+| agent.config.global.service.telemetry.loggingLevel | string | `"WARN"` |  |
+| agent.config.global.service.telemetry.metricsLevel | string | `"normal"` |  |
 | agent.selfMonitor.enabled | bool | `true` |  |
 | application.prometheusScrape.enabled | bool | `false` |  |
 | application.prometheusScrape.interval | string | `"60s"` |  |

diff --git a/charts/agent/templates/_cluster-events-config.tpl b/charts/agent/templates/_cluster-events-config.tpl
@@ -19,11 +19,10 @@ receivers:
         mode: pull
         interval: {{ .Values.cluster.events.pullInterval }}
 
-  # this pulls all objects listed below
+  # retrieves descriptions of all resources listed below
   k8sobjects/objects:
     auth_type: serviceAccount
     objects:
-      ## P1
       - {name: events, mode: pull, interval: 15m}
       - {name: events, mode: watch}
       - {name: pods, mode: pull, interval: 15m}
@@ -40,7 +39,6 @@ receivers:
       - {name: configmaps, mode: watch}
       - {name: endpoints, mode: pull, interval: 15m}
       - {name: endpoints, mode: watch}
-      ## P2
       - {name: jobs, mode: pull, interval: 15m}
       - {name: jobs, mode: watch}
       - {name: cronjobs, mode: pull, interval: 15m}

diff --git a/charts/agent/templates/_cluster-metrics-config.tpl b/charts/agent/templates/_cluster-metrics-config.tpl
@@ -170,12 +170,12 @@ service:
   pipelines:
       metrics:
         receivers: [k8s_cluster]
-        processors: [memory_limiter, batch, k8sattributes, attributes/observe_common, attributes/debug_source_cluster_metrics]
+        processors: [memory_limiter, k8sattributes, batch, attributes/observe_common, attributes/debug_source_cluster_metrics]
         exporters: [prometheusremotewrite, debug/override]
       {{- if .Values.application.prometheusScrape.enabled }}
       metrics/pod_metrics:
         receivers: [prometheus/pod_metrics]
-        processors: [memory_limiter, batch, k8sattributes, attributes/observe_common, attributes/debug_source_pod_metrics]
+        processors: [memory_limiter, k8sattributes, batch, attributes/observe_common, attributes/debug_source_pod_metrics]
         exporters: [prometheusremotewrite, debug/override]
       {{ end -}}
 {{- include "config.service.telemetry" . | nindent 2 }}

diff --git a/charts/agent/templates/_config-exporters.tpl b/charts/agent/templates/_config-exporters.tpl
@@ -3,6 +3,13 @@ otlphttp/observe/base:
     endpoint: "{{ .Values.observe.collectionEndpoint.value | toString | trimSuffix "/" }}/v2/otel"
     headers:
         authorization: "${env:OBSERVE_TOKEN}"
+    sending_queue:
+      enabled: {{ .Values.agent.config.global.exporters.sendingQueue.enabled }}
+    retry_on_failure:
+      enabled: {{ .Values.agent.config.global.exporters.retryOnFailure.enabled }}
+      initial_interval: {{ .Values.agent.config.global.exporters.retryOnFailure.initialInterval }}
+      max_interval: {{ .Values.agent.config.global.exporters.retryOnFailure.maxInterval }}
+      max_elapsed_time: {{ .Values.agent.config.global.exporters.retryOnFailure.maxElapsedTime }}
     compression: zstd
 {{- end -}}
 
@@ -11,6 +18,13 @@ otlphttp/observe/entity:
     logs_endpoint: "{{ .Values.observe.collectionEndpoint.value | toString | trimSuffix "/" }}/v1/kubernetes/v1/entity"
     headers:
         authorization: "Bearer ${env:ENTITY_TOKEN}"
+    sending_queue:
+      enabled: {{ .Values.agent.config.global.exporters.sendingQueue.enabled }}
+    retry_on_failure:
+      enabled: {{ .Values.agent.config.global.exporters.retryOnFailure.enabled }}
+      initial_interval: {{ .Values.agent.config.global.exporters.retryOnFailure.initialInterval }}
+      max_interval: {{ .Values.agent.config.global.exporters.retryOnFailure.maxInterval }}
+      max_elapsed_time: {{ .Values.agent.config.global.exporters.retryOnFailure.maxElapsedTime }}
     compression: zstd
 {{- end -}}
 

diff --git a/charts/agent/templates/_config-processors.tpl b/charts/agent/templates/_config-processors.tpl
@@ -7,8 +7,8 @@ resourcedetection/cloud:
 
 {{- define "config.processors.batch" -}}
 batch:
-  send_batch_size: {{ .Values.agent.config.global.processors.batch.send_batch_size }}
-  send_batch_max_size: {{ .Values.agent.config.global.processors.batch.send_batch_max_size }}
+  send_batch_size: {{ .Values.agent.config.global.processors.batch.sendBatchSize }}
+  send_batch_max_size: {{ .Values.agent.config.global.processors.batch.sendBatchMaxSize }}
 {{- end -}}
 
 {{- define "config.processors.attributes.k8sattributes" -}}

diff --git a/charts/agent/templates/_config-telemetry.tpl b/charts/agent/templates/_config-telemetry.tpl
@@ -1,9 +1,9 @@
 {{- define "config.service.telemetry" -}}
 telemetry:
     metrics:
-      level: {{ .Values.agent.config.global.service.telemetry.metrics_level }}
+      level: {{ .Values.agent.config.global.service.telemetry.metricsLevel }}
       address: {{ template "config.local_host"}}:8888
     logs:
-      level: {{ .Values.agent.config.global.service.telemetry.logging_level }}
-      encoding: {{ .Values.agent.config.global.service.telemetry.logging_encoding }}
+      level: {{ .Values.agent.config.global.service.telemetry.loggingLevel }}
+      encoding: {{ .Values.agent.config.global.service.telemetry.loggingEncoding }}
 {{- end -}}
diff --git a/charts/agent/templates/_monitor-config.tpl b/charts/agent/templates/_monitor-config.tpl
@@ -72,7 +72,7 @@ service:
   pipelines:
       metrics:
         receivers: [prometheus/collector]
-        processors: [memory_limiter, batch, attributes/observe_common, k8sattributes, attributes/debug_source_agent_monitor]
+        processors: [memory_limiter, k8sattributes, batch, attributes/observe_common, attributes/debug_source_agent_monitor]
         exporters: [prometheusremotewrite]
 {{- include "config.service.telemetry" . | nindent 2 }}
 

diff --git a/charts/agent/templates/_node-logs-metrics-config.tpl b/charts/agent/templates/_node-logs-metrics-config.tpl
@@ -168,19 +168,19 @@ service:
       {{- if .Values.node.containers.logs.enabled }}
       logs:
         receivers: [filelog]
-        processors: [memory_limiter, batch, resourcedetection/cloud, k8sattributes, attributes/observe_common, attributes/debug_source_pod_logs]
+        processors: [memory_limiter, k8sattributes, batch, resourcedetection/cloud, attributes/observe_common, attributes/debug_source_pod_logs]
         exporters: [otlphttp/observe/base, debug/override]
       {{- end -}}
       {{- if .Values.node.metrics.enabled }}
       metrics/hostmetrics:
         receivers: [hostmetrics]
-        processors: [memory_limiter, batch, resourcedetection/cloud, k8sattributes, attributes/observe_common, attributes/debug_source_hostmetrics]
+        processors: [memory_limiter, k8sattributes, batch, resourcedetection/cloud, attributes/observe_common, attributes/debug_source_hostmetrics]
         exporters: [prometheusremotewrite, debug/override]
       {{- end -}}
       {{- if .Values.node.containers.metrics.enabled }}
       metrics/kubeletstats:
         receivers: [kubeletstats]
-        processors: [memory_limiter, batch, resourcedetection/cloud, k8sattributes, attributes/observe_common, attributes/debug_source_kubletstats_metrics]
+        processors: [memory_limiter, k8sattributes, batch, resourcedetection/cloud, attributes/observe_common, attributes/debug_source_kubletstats_metrics]
         exporters: [prometheusremotewrite, debug/override]
       {{- end -}}
 {{- include "config.service.telemetry" . | nindent 2 }}

diff --git a/charts/agent/values.yaml b/charts/agent/values.yaml
@@ -101,13 +101,24 @@ agent:
     global:
       processors:
         batch:
-          send_batch_size: 4096
-          send_batch_max_size: 4096
+          sendBatchSize: 4096
+          sendBatchMaxSize: 4096
+      exporters:
+        sendingQueue:
+          enabled: true
+        retryOnFailure:
+          enabled: true
+          # Time to wait after the first failure before retrying.
+          initialInterval: 1s
+          # Upper bound on retry backoff interval. Once this value is reached the delay between consecutive retries will remain constant at the specified value.
+          maxInterval: 30s
+          # Maximum amount of time (including retries) spent trying to send a logs batch to a downstream consumer. Once this value is reached, the data is discarded. Retrying never stops if set to 0.
+          maxElapsedTime: 5m
       service:
         telemetry:
-          metrics_level: normal
-          logging_level: WARN
-          logging_encoding: console
+          metricsLevel: normal
+          loggingLevel: WARN
+          loggingEncoding: console
       debug:
         # values basic, normal, detailed
         verbosity: basic

diff --git a/integration/modules/deploy_helm/values/default.yaml b/integration/modules/deploy_helm/values/default.yaml
@@ -4,8 +4,8 @@ agent:
       global:
         service:
           telemetry:
-            logging_encoding: json
-            logging_level: INFO
+            loggingEncoding: json
+            loggingLevel: INFO
 
 #Common for namespace/url/token
 observe:

diff --git a/integration/modules/deploy_helm/values/node_affinity.yaml b/integration/modules/deploy_helm/values/node_affinity.yaml
@@ -4,8 +4,8 @@ agent:
       global:
         service:
           telemetry:
-            logging_encoding: json
-            logging_level: INFO
+            loggingEncoding: json
+            loggingLevel: INFO
 
 #Common for namespace/url/token
 observe:

diff --git a/integration/modules/deploy_helm/values/node_taint.yaml b/integration/modules/deploy_helm/values/node_taint.yaml
@@ -4,8 +4,8 @@ agent:
       global:
         service:
           telemetry:
-            logging_encoding: json
-            logging_level: INFO
+            loggingEncoding: json
+            loggingLevel: INFO
 
 #Common for namespace/url/token
 observe: