Skip to content

Commit

Permalink
feat: add send_queue and retry_on_failure options for otlp exporter
Browse files Browse the repository at this point in the history
  • Loading branch information
obs-gh-alexlew committed Oct 17, 2024
1 parent deacdf4 commit 6f3baae
Show file tree
Hide file tree
Showing 13 changed files with 60 additions and 32 deletions.
2 changes: 1 addition & 1 deletion charts/agent/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ apiVersion: v2
name: agent
description: Chart to install K8s collection stack based on Observe Agent
type: application
version: 0.23.0
version: 0.24.0
appVersion: "1.1.0"
dependencies:
- name: opentelemetry-collector
Expand Down
17 changes: 11 additions & 6 deletions charts/agent/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# agent

![Version: 0.23.0](https://img.shields.io/badge/Version-0.23.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.1.0](https://img.shields.io/badge/AppVersion-1.1.0-informational?style=flat-square)
![Version: 0.24.0](https://img.shields.io/badge/Version-0.24.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.1.0](https://img.shields.io/badge/AppVersion-1.1.0-informational?style=flat-square)

> [!CAUTION]
> This chart is under active development and is not meant to be installed yet.
Expand All @@ -27,11 +27,16 @@ Chart to install K8s collection stack based on Observe Agent
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| agent.config.global.debug.verbosity | string | `"basic"` | |
| agent.config.global.processors.batch.send_batch_max_size | int | `4096` | |
| agent.config.global.processors.batch.send_batch_size | int | `4096` | |
| agent.config.global.service.telemetry.logging_encoding | string | `"console"` | |
| agent.config.global.service.telemetry.logging_level | string | `"WARN"` | |
| agent.config.global.service.telemetry.metrics_level | string | `"normal"` | |
| agent.config.global.exporters.retryOnFailure.enabled | bool | `true` | |
| agent.config.global.exporters.retryOnFailure.initialInterval | string | `"1s"` | |
| agent.config.global.exporters.retryOnFailure.maxElapsedTime | string | `"5m"` | |
| agent.config.global.exporters.retryOnFailure.maxInterval | string | `"30s"` | |
| agent.config.global.exporters.sendingQueue.enabled | bool | `true` | |
| agent.config.global.processors.batch.sendBatchMaxSize | int | `4096` | |
| agent.config.global.processors.batch.sendBatchSize | int | `4096` | |
| agent.config.global.service.telemetry.loggingEncoding | string | `"console"` | |
| agent.config.global.service.telemetry.loggingLevel | string | `"WARN"` | |
| agent.config.global.service.telemetry.metricsLevel | string | `"normal"` | |
| agent.selfMonitor.enabled | bool | `true` | |
| application.prometheusScrape.enabled | bool | `false` | |
| application.prometheusScrape.interval | string | `"60s"` | |
Expand Down
4 changes: 1 addition & 3 deletions charts/agent/templates/_cluster-events-config.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,10 @@ receivers:
mode: pull
interval: {{ .Values.cluster.events.pullInterval }}

# this pulls all objects listed below
# retrieves descriptions of all resources listed below
k8sobjects/objects:
auth_type: serviceAccount
objects:
## P1
- {name: events, mode: pull, interval: 15m}
- {name: events, mode: watch}
- {name: pods, mode: pull, interval: 15m}
Expand All @@ -40,7 +39,6 @@ receivers:
- {name: configmaps, mode: watch}
- {name: endpoints, mode: pull, interval: 15m}
- {name: endpoints, mode: watch}
## P2
- {name: jobs, mode: pull, interval: 15m}
- {name: jobs, mode: watch}
- {name: cronjobs, mode: pull, interval: 15m}
Expand Down
4 changes: 2 additions & 2 deletions charts/agent/templates/_cluster-metrics-config.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -170,12 +170,12 @@ service:
pipelines:
metrics:
receivers: [k8s_cluster]
processors: [memory_limiter, batch, k8sattributes, attributes/observe_common, attributes/debug_source_cluster_metrics]
processors: [memory_limiter, k8sattributes, batch, attributes/observe_common, attributes/debug_source_cluster_metrics]
exporters: [prometheusremotewrite, debug/override]
{{- if .Values.application.prometheusScrape.enabled }}
metrics/pod_metrics:
receivers: [prometheus/pod_metrics]
processors: [memory_limiter, batch, k8sattributes, attributes/observe_common, attributes/debug_source_pod_metrics]
processors: [memory_limiter, k8sattributes, batch, attributes/observe_common, attributes/debug_source_pod_metrics]
exporters: [prometheusremotewrite, debug/override]
{{ end -}}
{{- include "config.service.telemetry" . | nindent 2 }}
Expand Down
14 changes: 14 additions & 0 deletions charts/agent/templates/_config-exporters.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,13 @@ otlphttp/observe/base:
endpoint: "{{ .Values.observe.collectionEndpoint.value | toString | trimSuffix "/" }}/v2/otel"
headers:
authorization: "${env:OBSERVE_TOKEN}"
sending_queue:
enabled: {{ .Values.agent.config.global.exporters.sendingQueue.enabled }}
retry_on_failure:
enabled: {{ .Values.agent.config.global.exporters.retryOnFailure.enabled }}
initial_interval: {{ .Values.agent.config.global.exporters.retryOnFailure.initialInterval }}
max_interval: {{ .Values.agent.config.global.exporters.retryOnFailure.maxInterval }}
max_elapsed_time: {{ .Values.agent.config.global.exporters.retryOnFailure.maxElapsedTime }}
compression: zstd
{{- end -}}

Expand All @@ -11,6 +18,13 @@ otlphttp/observe/entity:
logs_endpoint: "{{ .Values.observe.collectionEndpoint.value | toString | trimSuffix "/" }}/v1/kubernetes/v1/entity"
headers:
authorization: "Bearer ${env:ENTITY_TOKEN}"
sending_queue:
enabled: {{ .Values.agent.config.global.exporters.sendingQueue.enabled }}
retry_on_failure:
enabled: {{ .Values.agent.config.global.exporters.retryOnFailure.enabled }}
initial_interval: {{ .Values.agent.config.global.exporters.retryOnFailure.initialInterval }}
max_interval: {{ .Values.agent.config.global.exporters.retryOnFailure.maxInterval }}
max_elapsed_time: {{ .Values.agent.config.global.exporters.retryOnFailure.maxElapsedTime }}
compression: zstd
{{- end -}}

Expand Down
4 changes: 2 additions & 2 deletions charts/agent/templates/_config-processors.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ resourcedetection/cloud:

{{- define "config.processors.batch" -}}
batch:
send_batch_size: {{ .Values.agent.config.global.processors.batch.send_batch_size }}
send_batch_max_size: {{ .Values.agent.config.global.processors.batch.send_batch_max_size }}
send_batch_size: {{ .Values.agent.config.global.processors.batch.sendBatchSize }}
send_batch_max_size: {{ .Values.agent.config.global.processors.batch.sendBatchMaxSize }}
{{- end -}}

{{- define "config.processors.attributes.k8sattributes" -}}
Expand Down
6 changes: 3 additions & 3 deletions charts/agent/templates/_config-telemetry.tpl
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
{{- define "config.service.telemetry" -}}
telemetry:
metrics:
level: {{ .Values.agent.config.global.service.telemetry.metrics_level }}
level: {{ .Values.agent.config.global.service.telemetry.metricsLevel }}
address: {{ template "config.local_host"}}:8888
logs:
level: {{ .Values.agent.config.global.service.telemetry.logging_level }}
encoding: {{ .Values.agent.config.global.service.telemetry.logging_encoding }}
level: {{ .Values.agent.config.global.service.telemetry.loggingLevel }}
encoding: {{ .Values.agent.config.global.service.telemetry.loggingEncoding }}
{{- end -}}
2 changes: 1 addition & 1 deletion charts/agent/templates/_monitor-config.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ service:
pipelines:
metrics:
receivers: [prometheus/collector]
processors: [memory_limiter, batch, attributes/observe_common, k8sattributes, attributes/debug_source_agent_monitor]
processors: [memory_limiter, k8sattributes, batch, attributes/observe_common, attributes/debug_source_agent_monitor]
exporters: [prometheusremotewrite]
{{- include "config.service.telemetry" . | nindent 2 }}

Expand Down
6 changes: 3 additions & 3 deletions charts/agent/templates/_node-logs-metrics-config.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -168,19 +168,19 @@ service:
{{- if .Values.node.containers.logs.enabled }}
logs:
receivers: [filelog]
processors: [memory_limiter, batch, resourcedetection/cloud, k8sattributes, attributes/observe_common, attributes/debug_source_pod_logs]
processors: [memory_limiter, k8sattributes, batch, resourcedetection/cloud, attributes/observe_common, attributes/debug_source_pod_logs]
exporters: [otlphttp/observe/base, debug/override]
{{- end -}}
{{- if .Values.node.metrics.enabled }}
metrics/hostmetrics:
receivers: [hostmetrics]
processors: [memory_limiter, batch, resourcedetection/cloud, k8sattributes, attributes/observe_common, attributes/debug_source_hostmetrics]
processors: [memory_limiter, k8sattributes, batch, resourcedetection/cloud, attributes/observe_common, attributes/debug_source_hostmetrics]
exporters: [prometheusremotewrite, debug/override]
{{- end -}}
{{- if .Values.node.containers.metrics.enabled }}
metrics/kubeletstats:
receivers: [kubeletstats]
processors: [memory_limiter, batch, resourcedetection/cloud, k8sattributes, attributes/observe_common, attributes/debug_source_kubletstats_metrics]
processors: [memory_limiter, k8sattributes, batch, resourcedetection/cloud, attributes/observe_common, attributes/debug_source_kubletstats_metrics]
exporters: [prometheusremotewrite, debug/override]
{{- end -}}
{{- include "config.service.telemetry" . | nindent 2 }}
Expand Down
21 changes: 16 additions & 5 deletions charts/agent/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -101,13 +101,24 @@ agent:
global:
processors:
batch:
send_batch_size: 4096
send_batch_max_size: 4096
sendBatchSize: 4096
sendBatchMaxSize: 4096
exporters:
sendingQueue:
enabled: true
retryOnFailure:
enabled: true
# Time to wait after the first failure before retrying.
initialInterval: 1s
# Upper bound on retry backoff interval. Once this value is reached, the delay between consecutive retries will remain constant at the specified value.
maxInterval: 30s
# Maximum amount of time (including retries) spent trying to send a batch to a downstream consumer. Once this value is reached, the data is discarded. Retrying never stops if set to 0.
maxElapsedTime: 5m
service:
telemetry:
metrics_level: normal
logging_level: WARN
logging_encoding: console
metricsLevel: normal
loggingLevel: WARN
loggingEncoding: console
debug:
# values basic, normal, detailed
verbosity: basic
Expand Down
4 changes: 2 additions & 2 deletions integration/modules/deploy_helm/values/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ agent:
global:
service:
telemetry:
logging_encoding: json
logging_level: INFO
loggingEncoding: json
loggingLevel: INFO

# Common for namespace/url/token
observe:
Expand Down
4 changes: 2 additions & 2 deletions integration/modules/deploy_helm/values/node_affinity.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ agent:
global:
service:
telemetry:
logging_encoding: json
logging_level: INFO
loggingEncoding: json
loggingLevel: INFO

# Common for namespace/url/token
observe:
Expand Down
4 changes: 2 additions & 2 deletions integration/modules/deploy_helm/values/node_taint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ agent:
global:
service:
telemetry:
logging_encoding: json
logging_level: INFO
loggingEncoding: json
loggingLevel: INFO

# Common for namespace/url/token
observe:
Expand Down

0 comments on commit 6f3baae

Please sign in to comment.