diff --git a/docs/node-mixin/Makefile b/docs/node-mixin/Makefile index d04b37d009..d1775a9828 100644 --- a/docs/node-mixin/Makefile +++ b/docs/node-mixin/Makefile @@ -6,15 +6,15 @@ fmt: find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ xargs -n 1 -- $(JSONNET_FMT) -i -node_alerts.yaml: mixin.libsonnet config.libsonnet $(wildcard alerts/*) - jsonnet -S alerts.jsonnet > $@ +node_alerts.yaml: mixin.libsonnet lib/linux/config.libsonnet $(wildcard lib/linux/alerts/*) + jsonnet -J vendor -S -e 'std.manifestYamlDoc((import "mixin.libsonnet").prometheusAlerts)' > $@ -node_rules.yaml: mixin.libsonnet config.libsonnet $(wildcard rules/*) - jsonnet -S rules.jsonnet > $@ +node_rules.yaml: mixin.libsonnet lib/linux/config.libsonnet $(wildcard lib/linux/rules/*) + jsonnet -J vendor -S -e 'std.manifestYamlDoc((import "mixin.libsonnet").prometheusRules)' > $@ -dashboards_out: mixin.libsonnet config.libsonnet $(wildcard dashboards/*) - @mkdir -p dashboards_out - jsonnet -J vendor -m dashboards_out dashboards.jsonnet +dashboards_out: mixin.libsonnet lib/linux/config.libsonnet lib/linux/dashboards.libsonnet $(wildcard lib/linux/*) + @mkdir -p dashboards_out/linux + jsonnet -J vendor -m dashboards_out/linux -e '(import "mixin.libsonnet").grafanaDashboards' lint: node_alerts.yaml node_rules.yaml find . 
-name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ diff --git a/docs/node-mixin/alerts.jsonnet b/docs/node-mixin/alerts.jsonnet deleted file mode 100644 index 75e7c1b297..0000000000 --- a/docs/node-mixin/alerts.jsonnet +++ /dev/null @@ -1 +0,0 @@ -std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet deleted file mode 100644 index fc2ae2b56c..0000000000 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ /dev/null @@ -1,433 +0,0 @@ -{ - prometheusAlerts+:: { - groups+: [ - { - name: 'node-exporter-filesystem', - rules: [ - { - alert: 'NodeFilesystemSpaceFillingUp', - expr: ||| - ( - node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < %(fsSpaceFillingUpWarningThreshold)d - and - predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[%(fsSpaceFillingUpPredictionWindow)s], 24*60*60) < 0 - and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 - ) - ||| % $._config, - 'for': '1h', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Filesystem is predicted to run out of space within the next 24 hours.', - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.', - }, - }, - { - alert: 'NodeFilesystemSpaceFillingUp', - expr: ||| - ( - node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < %(fsSpaceFillingUpCriticalThreshold)d - and - 
predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 - ) - ||| % $._config, - 'for': '1h', - labels: { - severity: '%(nodeCriticalSeverity)s' % $._config, - }, - annotations: { - summary: 'Filesystem is predicted to run out of space within the next 4 hours.', - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.', - }, - }, - { - alert: 'NodeFilesystemAlmostOutOfSpace', - expr: ||| - ( - node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < %(fsSpaceAvailableWarningThreshold)d - and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 - ) - ||| % $._config, - 'for': '30m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Filesystem has less than %(fsSpaceAvailableWarningThreshold)d%% space left.' 
% $._config, - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', - }, - }, - { - alert: 'NodeFilesystemAlmostOutOfSpace', - expr: ||| - ( - node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < %(fsSpaceAvailableCriticalThreshold)d - and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 - ) - ||| % $._config, - 'for': '30m', - labels: { - severity: '%(nodeCriticalSeverity)s' % $._config, - }, - annotations: { - summary: 'Filesystem has less than %(fsSpaceAvailableCriticalThreshold)d%% space left.' % $._config, - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', - }, - }, - { - alert: 'NodeFilesystemFilesFillingUp', - expr: ||| - ( - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < 40 - and - predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 - ) - ||| % $._config, - 'for': '1h', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.', - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.', - }, - }, - { - alert: 'NodeFilesystemFilesFillingUp', - expr: ||| - ( - 
node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < 20 - and - predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 - ) - ||| % $._config, - 'for': '1h', - labels: { - severity: '%(nodeCriticalSeverity)s' % $._config, - }, - annotations: { - summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.', - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.', - }, - }, - { - alert: 'NodeFilesystemAlmostOutOfFiles', - expr: ||| - ( - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < 5 - and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 - ) - ||| % $._config, - 'for': '1h', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Filesystem has less than 5% inodes left.', - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.', - }, - }, - { - alert: 'NodeFilesystemAlmostOutOfFiles', - expr: ||| - ( - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < 3 - and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 - ) - ||| % $._config, - 'for': '1h', - labels: { - severity: '%(nodeCriticalSeverity)s' 
% $._config, - }, - annotations: { - summary: 'Filesystem has less than 3% inodes left.', - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.', - }, - }, - ], - }, - { - name: 'node-exporter', - rules: [ - { - alert: 'NodeNetworkReceiveErrs', - expr: ||| - rate(node_network_receive_errs_total{%(nodeExporterSelector)s}[2m]) / rate(node_network_receive_packets_total{%(nodeExporterSelector)s}[2m]) > 0.01 - ||| % $._config, - 'for': '1h', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Network interface is reporting many receive errors.', - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.', - }, - }, - { - alert: 'NodeNetworkTransmitErrs', - expr: ||| - rate(node_network_transmit_errs_total{%(nodeExporterSelector)s}[2m]) / rate(node_network_transmit_packets_total{%(nodeExporterSelector)s}[2m]) > 0.01 - ||| % $._config, - 'for': '1h', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Network interface is reporting many transmit errors.', - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.', - }, - }, - { - alert: 'NodeHighNumberConntrackEntriesUsed', - expr: ||| - (node_nf_conntrack_entries{%(nodeExporterSelector)s} / node_nf_conntrack_entries_limit) > 0.75 - ||| % $._config, - annotations: { - summary: 'Number of conntrack are getting close to the limit.', - description: '{{ $value | humanizePercentage }} of conntrack entries are used.', - }, - labels: { - severity: 'warning', - }, - }, - { - alert: 'NodeTextFileCollectorScrapeError', - expr: ||| - node_textfile_scrape_error{%(nodeExporterSelector)s} == 1 - ||| % $._config, - annotations: { - summary: 'Node Exporter text file collector failed to scrape.', - 
description: 'Node Exporter text file collector on {{ $labels.instance }} failed to scrape.', - }, - labels: { - severity: 'warning', - }, - }, - { - alert: 'NodeClockSkewDetected', - expr: ||| - ( - node_timex_offset_seconds{%(nodeExporterSelector)s} > 0.05 - and - deriv(node_timex_offset_seconds{%(nodeExporterSelector)s}[5m]) >= 0 - ) - or - ( - node_timex_offset_seconds{%(nodeExporterSelector)s} < -0.05 - and - deriv(node_timex_offset_seconds{%(nodeExporterSelector)s}[5m]) <= 0 - ) - ||| % $._config, - 'for': '10m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Clock skew detected.', - description: 'Clock at {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.', - }, - }, - { - alert: 'NodeClockNotSynchronising', - expr: ||| - min_over_time(node_timex_sync_status{%(nodeExporterSelector)s}[5m]) == 0 - and - node_timex_maxerror_seconds{%(nodeExporterSelector)s} >= 16 - ||| % $._config, - 'for': '10m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Clock not synchronising.', - description: 'Clock at {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.', - }, - }, - { - alert: 'NodeRAIDDegraded', - expr: ||| - node_md_disks_required{%(nodeExporterSelector)s,%(diskDeviceSelector)s} - ignoring (state) (node_md_disks{state="active",%(nodeExporterSelector)s,%(diskDeviceSelector)s}) > 0 - ||| % $._config, - 'for': '15m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'RAID Array is degraded.', - description: "RAID array '{{ $labels.device }}' at {{ $labels.instance }} is in degraded state due to one or more disks failures. 
Number of spare drives is insufficient to fix issue automatically.", - }, - }, - { - alert: 'NodeRAIDDiskFailure', - expr: ||| - node_md_disks{state="failed",%(nodeExporterSelector)s,%(diskDeviceSelector)s} > 0 - ||| % $._config, - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Failed device in RAID array.', - description: "At least one device in RAID array at {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.", - }, - }, - { - alert: 'NodeFileDescriptorLimit', - expr: ||| - ( - node_filefd_allocated{%(nodeExporterSelector)s} * 100 / node_filefd_maximum{%(nodeExporterSelector)s} > 70 - ) - ||| % $._config, - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Kernel is predicted to exhaust file descriptors limit soon.', - description: 'File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.', - }, - }, - { - alert: 'NodeFileDescriptorLimit', - expr: ||| - ( - node_filefd_allocated{%(nodeExporterSelector)s} * 100 / node_filefd_maximum{%(nodeExporterSelector)s} > 90 - ) - ||| % $._config, - 'for': '15m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'Kernel is predicted to exhaust file descriptors limit soon.', - description: 'File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.', - }, - }, - { - alert: 'NodeCPUHighUsage', - expr: ||| - sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode!="idle"}[2m]))) * 100 > %(cpuHighUsageThreshold)d - ||| % $._config, - 'for': '15m', - labels: { - severity: 'info', - }, - annotations: { - summary: 'High CPU usage.', - description: ||| - CPU usage at {{ $labels.instance }} has been above %(cpuHighUsageThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%. 
- ||| % $._config, - }, - }, - { - alert: 'NodeSystemSaturation', - expr: ||| - node_load1{%(nodeExporterSelector)s} - / count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}) > %(systemSaturationPerCoreThreshold)d - ||| % $._config, - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'System saturated, load per core is very high.', - description: ||| - System load per core at {{ $labels.instance }} has been above %(systemSaturationPerCoreThreshold)d for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. - This might indicate this instance resources saturation and can cause it becoming unresponsive. - ||| % $._config, - }, - }, - { - alert: 'NodeMemoryMajorPagesFaults', - expr: ||| - rate(node_vmstat_pgmajfault{%(nodeExporterSelector)s}[5m]) > %(memoryMajorPagesFaultsThreshold)d - ||| % $._config, - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Memory major page faults are occurring at very high rate.', - description: ||| - Memory major pages are occurring at very high rate at {{ $labels.instance }}, %(memoryMajorPagesFaultsThreshold)d major page faults per second for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. - Please check that there is enough memory available at this instance. - ||| % $._config, - }, - }, - { - alert: 'NodeMemoryHighUtilization', - expr: ||| - 100 - (node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} / node_memory_MemTotal_bytes{%(nodeExporterSelector)s} * 100) > %(memoryHighUtilizationThreshold)d - ||| % $._config, - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Host is running out of memory.', - description: ||| - Memory is filling up at {{ $labels.instance }}, has been above %(memoryHighUtilizationThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%. 
- ||| % $._config, - }, - }, - { - alert: 'NodeDiskIOSaturation', - expr: ||| - rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[5m]) > %(diskIOSaturationThreshold)d - ||| % $._config, - 'for': '30m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Disk IO queue is high.', - description: ||| - Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above %(diskIOSaturationThreshold)d for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. - This symptom might indicate disk saturation. - ||| % $._config, - }, - }, - { - alert: 'NodeSystemdServiceFailed', - expr: ||| - node_systemd_unit_state{%(nodeExporterSelector)s, state="failed"} == 1 - ||| % $._config, - 'for': '5m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Systemd service has entered failed state.', - description: 'Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }}', - }, - }, - { - alert: 'NodeBondingDegraded', - expr: ||| - (node_bonding_slaves - node_bonding_active) != 0 - ||| % $._config, - 'for': '5m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Bonding interface is degraded', - description: 'Bonding interface {{ $labels.master }} on {{ $labels.instance }} is in degraded state due to one or more slave failures.', - }, - }, - ], - }, - ], - }, -} diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet deleted file mode 100644 index 4e9cafadf2..0000000000 --- a/docs/node-mixin/config.libsonnet +++ /dev/null @@ -1,123 +0,0 @@ -{ - _config+:: { - // Selectors are inserted between {} in Prometheus queries. - - // Select the metrics coming from the node exporter. Note that all - // the selected metrics are shown stacked on top of each other in - // the 'USE Method / Cluster' dashboard. 
Consider disabling that - // dashboard if mixing up all those metrics in the same dashboard - // doesn't make sense (e.g. because they are coming from different - // clusters). - nodeExporterSelector: 'job="node"', - - // Select the fstype for filesystem-related queries. If left - // empty, all filesystems are selected. If you have unusual - // filesystem you don't want to include in dashboards and - // alerting, you can exclude them here, e.g. 'fstype!="tmpfs"'. - fsSelector: 'fstype!=""', - - // Select the mountpoint for filesystem-related queries. If left - // empty, all mountpoints are selected. For example if you have a - // special purpose tmpfs instance that has a fixed size and will - // always be 100% full, but you still want alerts and dashboards for - // other tmpfs instances, you can exclude those by mountpoint prefix - // like so: 'mountpoint!~"/var/lib/foo.*"'. - fsMountpointSelector: 'mountpoint!=""', - - // Select the device for disk-related queries. If left empty, all - // devices are selected. If you have unusual devices you don't - // want to include in dashboards and alerting, you can exclude - // them here, e.g. 'device!="tmpfs"'. - diskDeviceSelector: 'device!=""', - - // Some of the alerts are meant to fire if a critical failure of a - // node is imminent (e.g. the disk is about to run full). In a - // true “cloud native” setup, failures of a single node should be - // tolerated. Hence, even imminent failure of a single node is no - // reason to create a paging alert. However, in practice there are - // still many situations where operators like to get paged in time - // before a node runs out of disk space. nodeCriticalSeverity can - // be set to the desired severity for this kind of alerts. This - // can even be templated to depend on labels of the node, e.g. you - // could make this critical for traditional database masters but - // just a warning for K8s nodes. 
- nodeCriticalSeverity: 'critical', - - // CPU utilization (%) on which to trigger the - // 'NodeCPUHighUsage' alert. - cpuHighUsageThreshold: 90, - // Load average 1m (per core) on which to trigger the - // 'NodeSystemSaturation' alert. - systemSaturationPerCoreThreshold: 2, - - // Available disk space (%) thresholds on which to trigger the - // 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk - // usage grows in a way that it is predicted to run out in 4h or 1d - // and if the provided thresholds have been reached right now. - // In some cases you'll want to adjust these, e.g., by default, Kubernetes - // runs the image garbage collection when the disk usage reaches 85% - // of its available space. In that case, you'll want to reduce the - // critical threshold below to something like 14 or 15, otherwise - // the alert could fire under normal node usage. - // Additionally, the prediction window for the alert can be configured - // to account for environments where disk usage can fluctuate within - // a short time frame. By extending the prediction window, you can - // reduce false positives caused by temporary spikes, providing a - // more accurate prediction of disk space issues. - fsSpaceFillingUpWarningThreshold: 40, - fsSpaceFillingUpCriticalThreshold: 20, - fsSpaceFillingUpPredictionWindow: '6h', - - // Available disk space (%) thresholds on which to trigger the - // 'NodeFilesystemAlmostOutOfSpace' alerts. - fsSpaceAvailableWarningThreshold: 5, - fsSpaceAvailableCriticalThreshold: 3, - - // Memory utilzation (%) level on which to trigger the - // 'NodeMemoryHighUtilization' alert. - memoryHighUtilizationThreshold: 90, - - // Threshold for the rate of memory major page faults to trigger - // 'NodeMemoryMajorPagesFaults' alert. - memoryMajorPagesFaultsThreshold: 500, - - // Disk IO queue level above which to trigger - // 'NodeDiskIOSaturation' alert. 
- diskIOSaturationThreshold: 10, - - rateInterval: '5m', - // Opt-in for multi-cluster support. - showMultiCluster: false, - - clusterLabel: 'cluster', - - // groupLabels is a string with comma-separated - // labels that are common labels of instances belonging to the - // same logical group. Include not only enough labels to - // identify cluster members, but also all common labels you want - // to keep for resulting cluster-level alerts. - groupLabels: 'job', - // commaSeparated list of labels identifying a single instance: - instanceLabels: 'instance', - - dashboardNamePrefix: 'Node Exporter / ', - dashboardTags: ['node-exporter-mixin'], - dashboardRefresh: '30s', - dashboardTimezone: 'utc', - dashboardInterval: 'now-2h', - - // Grafana dashboard IDs are necessary for stable links for dashboards - grafanaDashboardIDs: { - 'node-rsrc-use.json': 'node-rsrc-use', - 'node-cluster-rsrc-use.json': 'node-cluster-rsrc-use', - 'node-multicluster-rsrc-use.json': 'node-multicluster-rsrc-use', - 'nodes.json': 'nodes', - 'nodes-darwin.json': 'nodes-darwin', - 'nodes-system.json': 'node-system', - 'nodes-memory.json': 'node-memory', - 'nodes-network.json': 'node-network', - 'nodes-disk.json': 'node-disk', - 'nodes-fleet.json': 'node-fleet', - }, - }, -} diff --git a/docs/node-mixin/dashboards.jsonnet b/docs/node-mixin/dashboards.jsonnet deleted file mode 100644 index 9d913ed3f1..0000000000 --- a/docs/node-mixin/dashboards.jsonnet +++ /dev/null @@ -1,6 +0,0 @@ -local dashboards = (import 'mixin.libsonnet').grafanaDashboards; - -{ - [name]: dashboards[name] - for name in std.objectFields(dashboards) -} diff --git a/docs/node-mixin/dashboards/dashboards.libsonnet b/docs/node-mixin/dashboards/dashboards.libsonnet deleted file mode 100644 index e6adbd4fa0..0000000000 --- a/docs/node-mixin/dashboards/dashboards.libsonnet +++ /dev/null @@ -1,2 +0,0 @@ -(import 'node.libsonnet') + -(import 'use.libsonnet') diff --git a/docs/node-mixin/dashboards/disk.libsonnet 
b/docs/node-mixin/dashboards/disk.libsonnet deleted file mode 100644 index 2f78c4da3e..0000000000 --- a/docs/node-mixin/dashboards/disk.libsonnet +++ /dev/null @@ -1,165 +0,0 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local dashboard = grafana.dashboard; -local row = grafana.row; -local prometheus = grafana.prometheus; -local template = grafana.template; -local graphPanel = grafana.graphPanel; -local nodePanels = import '../lib/panels/panels.libsonnet'; -local commonPanels = import '../lib/panels/common/panels.libsonnet'; -local nodeTimeseries = nodePanels.timeseries; -local common = import '../lib/common.libsonnet'; - -{ - - // https://www.robustperception.io/filesystem-metrics-from-the-node-exporter/ - new(config=null, platform=null):: { - local c = common.new(config=config, platform=platform), - local commonPromTarget = c.commonPromTarget, - local templates = c.templates, - local q = c.queries, - - local fsAvailable = - nodeTimeseries.new( - 'Filesystem Space Available', - description=||| - Filesystem space utilisation in bytes, by mountpoint. 
- ||| - ) - .withUnits('decbytes') - .withFillOpacity(5) - .addTarget(commonPromTarget( - expr=q.node_filesystem_avail_bytes, - legendFormat='{{ mountpoint }}', - )), - - local fsInodes = - nodeTimeseries.new( - 'Free inodes', - description='The inode is a data structure in a Unix-style file system that describes a file-system object such as a file or a directory.', - ) - .withUnits('short') - .addTarget(commonPromTarget( - expr=q.node_filesystem_files_free, - legendFormat='{{ mountpoint }}' - )) - .addTarget(commonPromTarget( - expr=q.node_filesystem_files, - legendFormat='{{ mountpoint }}' - )), - local fsInodesTotal = - nodeTimeseries.new( - 'Total inodes', - description='The inode is a data structure in a Unix-style file system that describes a file-system object such as a file or a directory.', - ) - .withUnits('short') - .addTarget(commonPromTarget( - expr=q.node_filesystem_files, - legendFormat='{{ mountpoint }}' - )), - local fsErrorsandRO = - nodeTimeseries.new('Filesystems with errors / read-only') - .withMax(1) - .addTarget(commonPromTarget( - expr=q.node_filesystem_readonly, - legendFormat='{{ mountpoint }}' - )) - .addTarget(commonPromTarget( - expr=q.node_filesystem_device_error, - legendFormat='{{ mountpoint }}' - )), - local fileDescriptors = - nodeTimeseries.new( - 'File Descriptors', - description=||| - File descriptor is a handle to an open file or input/output (I/O) resource, such as a network socket or a pipe. - The operating system uses file descriptors to keep track of open files and I/O resources, and provides a way for programs to read from and write to them. 
- ||| - ) - .addTarget(commonPromTarget( - expr=q.process_max_fds, - legendFormat='Maximum open file descriptors', - )) - .addTarget(commonPromTarget( - expr=q.process_open_fds, - legendFormat='Open file descriptors', - )), - - local diskIOcompleted = - nodeTimeseries.new( - title='Disk IOps completed', - description='The number (after merges) of I/O requests completed per second for the device' - ) - .withUnits('iops') - .withNegativeYByRegex('reads') - .withAxisLabel('read(-) | write(+)') - .addTarget(commonPromTarget( - expr=q.node_disk_reads_completed_total, - legendFormat='{{device}} reads completed', - )) - .addTarget(commonPromTarget( - expr=q.node_disk_writes_completed_total, - legendFormat='{{device}} writes completed', - )), - - local diskAvgWaitTime = - nodeTimeseries.new( - title='Disk Average Wait Time', - description='The average time for requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them.' - ) - .withUnits('s') - .withNegativeYByRegex('read') - .withAxisLabel('read(-) | write(+)') - .addTarget(commonPromTarget( - expr=q.diskWaitReadTime, - legendFormat='{{device}} read wait time avg', - )) - .addTarget(commonPromTarget( - expr=q.diskWaitWriteTime, - legendFormat='{{device}} write wait time avg', - )), - - local diskAvgQueueSize = - nodeTimeseries.new( - title='Average Queue Size (aqu-sz)', - description='The average queue length of the requests that were issued to the device.' 
- ) - .addTarget(commonPromTarget( - expr=q.diskAvgQueueSize, - legendFormat='{{device}}', - )), - - local panelsGrid = - [ - { type: 'row', title: 'Filesystem', gridPos: { y: 0 } }, - fsAvailable { gridPos: { x: 0, w: 12, h: 8, y: 0 } }, - c.panelsWithTargets.diskSpaceUsage { gridPos: { x: 12, w: 12, h: 8, y: 0 } }, - fsInodes { gridPos: { x: 0, w: 12, h: 8, y: 0 } }, - fsInodesTotal { gridPos: { x: 12, w: 12, h: 8, y: 0 } }, - fsErrorsandRO { gridPos: { x: 0, w: 12, h: 8, y: 0 } }, - fileDescriptors { gridPos: { x: 12, w: 12, h: 8, y: 0 } }, - { type: 'row', title: 'Disk', gridPos: { y: 25 } }, - c.panelsWithTargets.diskIO { gridPos: { x: 0, w: 12, h: 8, y: 25 } }, - diskIOcompleted { gridPos: { x: 12, w: 12, h: 8, y: 25 } }, - diskAvgWaitTime { gridPos: { x: 0, w: 12, h: 8, y: 25 } }, - diskAvgQueueSize { gridPos: { x: 12, w: 12, h: 8, y: 25 } }, - ], - - dashboard: if platform == 'Linux' then - dashboard.new( - '%sNode Filesystem and Disk' % config { nodeQuerySelector: c.nodeQuerySelector }.dashboardNamePrefix, - time_from=config.dashboardInterval, - tags=(config.dashboardTags), - timezone=config.dashboardTimezone, - refresh=config.dashboardRefresh, - graphTooltip='shared_crosshair', - uid=config.grafanaDashboardIDs['nodes-disk.json'] - ) - .addLink(c.links.fleetDash) - .addLink(c.links.nodeDash) - .addLink(c.links.otherDashes) - .addAnnotations(c.annotations) - .addTemplates(templates) - .addPanels(panelsGrid) - else if platform == 'Darwin' then {}, - }, -} diff --git a/docs/node-mixin/dashboards/fleet.libsonnet b/docs/node-mixin/dashboards/fleet.libsonnet deleted file mode 100644 index a9939e59e2..0000000000 --- a/docs/node-mixin/dashboards/fleet.libsonnet +++ /dev/null @@ -1,505 +0,0 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local dashboard = grafana.dashboard; -local row = grafana.row; -local prometheus = grafana.prometheus; -local template = grafana.template; -local graphPanel = grafana.graphPanel; -local 
nodePanels = import '../lib/panels/panels.libsonnet'; -local commonPanels = import '../lib/panels/common/panels.libsonnet'; -local nodeTimeseries = nodePanels.timeseries; -local common = import '../lib/common.libsonnet'; - -{ - - new(config=null, platform=null):: { - local c = common.new(config=config, platform=platform), - local commonPromTarget = c.commonPromTarget, - - - local templates = [ - if std.member(std.split(config.instanceLabels, ','), template.name) - then - template - { - allValue: '.+', - includeAll: true, - multi: true, - } - else template - for template in c.templates - ], - - local q = c.queries, - - local fleetTable = - nodePanels.table.new( - title='Linux Nodes Overview' - ) - .addTarget(commonPromTarget(expr=q.osInfo, format='table', instant=true) { refId: 'INFO' }) - .addTarget(commonPromTarget(expr=q.nodeInfo, format='table', instant=true) { refId: 'OS' }) - .addTarget(commonPromTarget(expr=q.uptime, format='table', instant=true) { refId: 'UPTIME' }) - .addTarget(commonPromTarget(expr=q.systemLoad1, format='table', instant=true) { refId: 'LOAD1' }) - .addTarget(commonPromTarget(expr=q.systemLoad5, format='table', instant=true) { refId: 'LOAD5' }) - .addTarget(commonPromTarget(expr=q.systemLoad15, format='table', instant=true) { refId: 'LOAD15' }) - .addTarget(commonPromTarget( - expr=q.cpuCount, - format='table', - instant=true, - ) { refId: 'CPUCOUNT' }) - .addTarget(commonPromTarget( - expr=q.cpuUsage, format='table', instant=true, - ) { refId: 'CPUUSAGE' }) - .addTarget(commonPromTarget(expr=q.memoryTotal, format='table', instant=true) { refId: 'MEMTOTAL' }) - .addTarget(commonPromTarget(expr=q.memoryUsage, format='table', instant=true) { refId: 'MEMUSAGE' }) - .addTarget(commonPromTarget(expr=q.fsSizeTotalRoot, format='table', instant=true) { refId: 'FSTOTAL' }) - .addTarget(commonPromTarget( - expr= - ||| - 100-(max by (%(instanceLabels)s) (node_filesystem_avail_bytes{%(nodeQuerySelector)s, fstype!="", mountpoint="/"}) - / - max by 
(%(instanceLabels)s) (node_filesystem_size_bytes{%(nodeQuerySelector)s, fstype!="", mountpoint="/"}) * 100) - ||| % config { nodeQuerySelector: c.nodeQuerySelector }, - format='table', - instant=true, - ) { refId: 'FSUSAGE' }) - .addTarget(commonPromTarget( - expr='count by (%(instanceLabels)s) (max_over_time(ALERTS{%(nodeQuerySelector)s, alertstate="firing", severity="critical"}[1m])) * group by (%(instanceLabels)s) (node_uname_info{%(nodeQuerySelector)s})' % config { nodeQuerySelector: c.nodeQuerySelector }, - format='table', - instant=true - ) { refId: 'CRITICAL' }) - .addTarget(commonPromTarget( - expr='count by (%(instanceLabels)s) (max_over_time(ALERTS{%(nodeQuerySelector)s, alertstate="firing", severity="warning"}[1m])) * group by (%(instanceLabels)s) (node_uname_info{%(nodeQuerySelector)s})' % config { nodeQuerySelector: c.nodeQuerySelector }, - format='table', - instant=true - ) { refId: 'WARNING' }) - .withTransform() - .joinByField(field=std.split(config.instanceLabels, ',')[0]) - .filterFieldsByName(std.split(config.instanceLabels, ',')[0] + '|nodename|Value.+') - .organize( - excludeByName={ - 'Value #OS': true, - 'Value #INFO': true, - 'Value #LOAD5': true, - 'Value #LOAD15': true, - }, - renameByName={ - instance: 'Instance', - pretty_name: 'OS', - nodename: 'Hostname', - release: 'Kernel version', - 'Value #LOAD1': 'Load 1m', - 'Value #LOAD5': 'Load 5m', - 'Value #LOAD15': 'Load 15m', - 'Value #CPUCOUNT': 'Cores', - 'Value #CPUUSAGE': 'CPU usage', - 'Value #MEMTOTAL': 'Memory total', - 'Value #MEMUSAGE': 'Memory usage', - 'Value #FSTOTAL': 'Root disk size', - 'Value #FSUSAGE': 'Root disk usage', - 'Value #UPTIME': 'Uptime', - 'Value #CRITICAL': 'Crit Alerts', - 'Value #WARNING': 'Warnings', - } - ) - .withFooter(reducer=['mean'], fields=[ - 'Value #LOAD1', - 'Value #MEMUSAGE', - 'Value #CPUUSAGE', - ]) - .addThresholdStep(color='light-blue', value=null) - .addThresholdStep(color='light-yellow', value=80) - .addThresholdStep(color='light-red', 
value=90) - .addOverride( - matcher={ - id: 'byName', - options: 'Instance', - }, - properties=[ - { - id: 'links', - value: [ - { - targetBlank: true, - title: c.links.instanceDataLinkForTable.title, - url: c.links.instanceDataLinkForTable.url, - }, - ], - }, - { - id: 'custom.filterable', - value: true, - }, - ] - ) - .addOverride( - matcher={ - id: 'byRegexp', - options: 'OS|Kernel version|Hostname', - }, - properties=[ - { - id: 'custom.filterable', - value: true, - }, - ] - ) - .addOverride( - matcher={ - id: 'byRegexp', - options: 'Memory total|Root disk size', - }, - properties=[ - { - id: 'unit', - value: 'bytes', - }, - { - id: 'decimals', - value: 0, - }, - ] - ) - .addOverride( - matcher={ - id: 'byName', - options: 'Cores', - }, - properties=[ - { - id: 'custom.width', - value: 60, - }, - ] - ) - .addOverride( - matcher={ - id: 'byRegexp', - options: 'Load.+', - }, - properties=[ - { - id: 'custom.width', - value: 60, - }, - ] - ) - .addOverride( - matcher={ - id: 'byName', - options: 'Uptime', - }, - properties=[ - { - id: 'unit', - value: 'dtdurations', - }, - { - id: 'custom.displayMode', - value: 'color-text', - }, - { - id: 'thresholds', - value: { - mode: 'absolute', - steps: [ - { - color: 'light-orange', - value: null, - }, - { - color: 'text', - value: 300, - }, - ], - }, - }, - ] - ) - .addOverride( - matcher={ - id: 'byRegexp', - options: 'CPU usage|Memory usage|Root disk usage', - }, - properties=[ - { - id: 'unit', - value: 'percent', - }, - // { - // id: 'custom.displayMode', - // value: 'gradient-gauge', - // }, - { - id: 'custom.displayMode', - value: 'basic', - }, - { - id: 'max', - value: 100, - }, - { - id: 'min', - value: 0, - }, - ] - ) - .sortBy('Instance') - , - - local memoryUsagePanel = - nodePanels.timeseries.new('Memory Usage', description='Top 25') - .withUnits('percent') - .withMin(0) - .withMax(100) - .withColor(mode='continuous-BlYlRd') - .withFillOpacity(1) - .withGradientMode('scheme') - .withLegend(mode='table', 
calcs=['mean', 'max', 'lastNotNull'], placement='right') - .addDataLink( - title=c.links.instanceDataLink.title, - url=c.links.instanceDataLink.url, - ) - .addTarget(commonPromTarget( - expr='topk(25, ' + q.memoryUsage + ')', - legendFormat=c.labelsToLegend(std.split(config.instanceLabels, ',')) - )) - .addTarget(commonPromTarget( - expr='avg(' + q.memoryUsage + ')', - legendFormat='Mean', - )) - .addOverride( - matcher={ - id: 'byName', - options: 'Mean', - - }, - properties=[ - { - id: 'custom.lineStyle', - value: { - fill: 'dash', - dash: [ - 10, - 10, - ], - }, - }, - { - id: 'custom.fillOpacity', - value: 0, - }, - { - id: 'color', - value: { - mode: 'fixed', - fixedColor: 'light-purple', - }, - }, - { - id: 'custom.lineWidth', - value: 2, - }, - ] - ), - - local cpuUsagePanel = - nodePanels.timeseries.new('CPU Usage', description='Top 25') - .withUnits('percent') - .withMin(0) - .withMax(100) - .withFillOpacity(1) - .withColor(mode='continuous-BlYlRd') - .withGradientMode('scheme') - .withLegend(mode='table', calcs=['mean', 'max', 'lastNotNull'], placement='right') - .addDataLink( - title=c.links.instanceDataLink.title, - url=c.links.instanceDataLink.url, - ) - .addTarget(commonPromTarget( - expr='topk(25, ' + q.cpuUsage + ')', - legendFormat=c.labelsToLegend(std.split(config.instanceLabels, ',')), - )) - .addTarget(commonPromTarget( - expr='avg(' + q.cpuUsage + ')', - legendFormat='Mean', - )) - .addOverride( - matcher={ - id: 'byName', - options: 'Mean', - - }, - properties=[ - { - id: 'custom.lineStyle', - value: { - fill: 'dash', - dash: [ - 10, - 10, - ], - }, - }, - { - id: 'custom.fillOpacity', - value: 0, - }, - { - id: 'color', - value: { - mode: 'fixed', - fixedColor: 'light-purple', - }, - }, - { - id: 'custom.lineWidth', - value: 2, - }, - ] - ), - - local diskIOPanel = - nodePanels.timeseries.new('Disks I/O', description='Top 25') - .withUnits('percentunit') - .withMin(0) - .withMax(1) - .withFillOpacity(1) - .withColor(mode='continuous-BlYlRd') 
- .withGradientMode('scheme') - .withLegend(mode='table', calcs=['mean', 'max', 'lastNotNull'], placement='right') - .addDataLink( - title=c.links.instanceDataLink.title, - url=c.links.instanceDataLink.url, - ) - .addTarget(commonPromTarget( - expr='topk(25, ' + q.diskIoTime + ')', - legendFormat=c.labelsToLegend(std.split(config.instanceLabels, ',')) + ': {{device}}', - )) - .addOverride( - matcher={ - id: 'byName', - options: 'Mean', - - }, - properties=[ - { - id: 'custom.lineStyle', - value: { - fill: 'dash', - dash: [ - 10, - 10, - ], - }, - }, - { - id: 'custom.fillOpacity', - value: 0, - }, - { - id: 'color', - value: { - mode: 'fixed', - fixedColor: 'light-purple', - }, - }, - { - id: 'custom.lineWidth', - value: 2, - }, - ] - ), - local diskSpacePanel = - nodePanels.timeseries.new('Disks Space Usage', description='Top 25') - .withUnits('percentunit') - .withMin(0) - .withMax(1) - .withFillOpacity(1) - .withColor(mode='continuous-BlYlRd') - .withGradientMode('scheme') - .withLegend(mode='table', calcs=['mean', 'max', 'lastNotNull'], placement='right') - .addDataLink( - title=c.links.instanceDataLink.title, - url=c.links.instanceDataLink.url, - ) - .addTarget(commonPromTarget( - expr='topk(25, ' + q.diskSpaceUsage + ')', - legendFormat=c.labelsToLegend(std.split(config.instanceLabels, ',')) + ': {{mountpoint}}', - )) - .addOverride( - matcher={ - id: 'byName', - options: 'Mean', - - }, - properties=[ - { - id: 'custom.lineStyle', - value: { - fill: 'dash', - dash: [ - 10, - 10, - ], - }, - }, - { - id: 'custom.fillOpacity', - value: 0, - }, - { - id: 'color', - value: { - mode: 'fixed', - fixedColor: 'light-purple', - }, - }, - { - id: 'custom.lineWidth', - value: 2, - }, - ] - ), - local networkErrorsDropsPanel = - nodePanels.timeseries.new('Network Errors and Dropped Packets', description='Top 25') - .withLegend(mode='table', calcs=['mean', 'max', 'lastNotNull'], placement='right') - .addTarget(commonPromTarget( - expr='topk(25, ' + 
q.networkReceiveErrorsPerSec + ' + ' + q.networkTransmitErrorsPerSec + ' + ' + q.networkReceiveDropsPerSec + ' + ' + q.networkTransmitDropsPerSec + ') > 0.5', - legendFormat=c.labelsToLegend(std.split(config.instanceLabels, ',')) + ': {{device}}', - )) - .withDecimals(1) - .withUnits('pps') - .withDrawStyle('points') - .withPointsSize(5) - .addDataLink( - title=c.links.instanceDataLink.title, - url=c.links.instanceDataLink.url, - ), - - local rows = - [ - row.new('Overview') - .addPanel(fleetTable { span: 12, height: '800px' }) - .addPanel(cpuUsagePanel { span: 12 }) - .addPanel(memoryUsagePanel { span: 12 }) - .addPanel(diskIOPanel { span: 6 }).addPanel(diskSpacePanel { span: 6 }) - .addPanel(networkErrorsDropsPanel { span: 12 }), - ], - - dashboard: if platform == 'Linux' then - dashboard.new( - '%sNode Fleet Overview' % config.dashboardNamePrefix, - time_from=config.dashboardInterval, - tags=(config.dashboardTags), - timezone=config.dashboardTimezone, - refresh=config.dashboardRefresh, - graphTooltip='shared_crosshair', - uid=config.grafanaDashboardIDs['nodes-fleet.json'], - ) - .addLink(c.links.otherDashes { includeVars: false }) - .addAnnotations(c.annotations) - .addTemplates(templates) - .addRows(rows) - else if platform == 'Darwin' then {}, - }, -} diff --git a/docs/node-mixin/dashboards/memory.libsonnet b/docs/node-mixin/dashboards/memory.libsonnet deleted file mode 100644 index 5b6e613851..0000000000 --- a/docs/node-mixin/dashboards/memory.libsonnet +++ /dev/null @@ -1,406 +0,0 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local dashboard = grafana.dashboard; -local row = grafana.row; -local prometheus = grafana.prometheus; -local template = grafana.template; -local graphPanel = grafana.graphPanel; -local nodePanels = import '../lib/panels/panels.libsonnet'; -local commonPanels = import '../lib/panels/common/panels.libsonnet'; -local nodeTimeseries = nodePanels.timeseries; -local common = import 
'../lib/common.libsonnet'; - -{ - - new(config=null, platform=null):: { - local c = common.new(config=config, platform=platform), - local commonPromTarget = c.commonPromTarget, - local templates = c.templates, - local q = c.queries, - - local memoryPagesInOut = - nodeTimeseries.new( - 'Memory Pages In / Out', - description=||| - Page-In - Return of pages to physical memory. This is a common and normal event. - - Page-Out - process of writing pages to disk. Unlike page-in, page-outs can indicate trouble. - When the kernel detects low memory, it attempts to free memory by paging out. - While occasional page-outs are normal, excessive and frequent page-outs can lead to thrashing. - Thrashing is a state in which the kernel spends more time managing paging activity than running applications, resulting in poor system performance. - ||| - ) - .withNegativeYByRegex('out') - .withAxisLabel('out(-) | in(+)') - .addTarget(commonPromTarget( - expr='irate(node_vmstat_pgpgin{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Page-In' - )) - .addTarget(commonPromTarget( - expr='irate(node_vmstat_pgpgout{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Page-Out' - )), - local memoryPagesSwapInOut = - nodeTimeseries.new( - 'Memory Pages Swapping In / Out', - description=||| - Compared to the speed of the CPU and main memory, writing pages out to disk is relatively slow. - Nonetheless, it is a preferable option to crashing or killing off processes. - - The process of writing pages out to disk to free memory is known as swapping-out. - If a page fault occurs because the page is on disk, in the swap area, rather than in memory, - the kernel will read the page back in from the disk to satisfy the page fault. - This is known as swapping-in. 
- ||| - ) - .withNegativeYByRegex('out') - .withAxisLabel('out(-) | in(+)') - .addTarget(commonPromTarget( - expr='irate(node_vmstat_pswpin{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Pages swapped in' - )) - .addTarget(commonPromTarget( - expr='irate(node_vmstat_pswpout{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Pages swapped out' - )), - - local memoryPagesFaults = - nodeTimeseries.new( - 'Memory Page Faults', - description=||| - A page fault is an exception raised by the memory when a process accesses a memory page without the necessary preparations, - requiring a mapping to be added to the process's virtual address space. The page contents may also need to be loaded from a backing store such as a disk. - While the MMU detects the page fault, the operating system's kernel handles the exception by either making the required page accessible in physical memory or denying an illegal memory access. - Valid page faults are common and necessary to increase memory availability in any operating system that uses virtual memory, including Windows, macOS, and the Linux kernel. - ||| - ) - .addTarget(commonPromTarget( - expr='irate(node_vmstat_pgmajfault{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Major page fault operations' - )) - .addTarget(commonPromTarget( - expr= - ||| - irate(node_vmstat_pgfault{%(nodeQuerySelector)s}[$__rate_interval]) - - - irate(node_vmstat_pgmajfault{%(nodeQuerySelector)s}[$__rate_interval]) - ||| % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Minor page fault operations' - )), - - local memoryOOMkiller = - nodeTimeseries.new( - 'OOM Killer', - description=||| - Out Of Memory Killer is a process used by the Linux kernel when the system is running critically low on memory. 
- This can happen when the kernel has allocated more memory than is available for its processes. - ||| - ) - .addTarget(commonPromTarget( - expr='increase(node_vmstat_oom_kill{%(nodeQuerySelector)s}[$__interval] offset -$__interval)' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='OOM killer invocations' - )), - - local memoryActiveInactive = - nodeTimeseries.new( - 'Memory Active / Inactive', - description=||| - Inactive: Memory which has been less recently used. It is more eligible to be reclaimed for other purposes. - Active: Memory that has been used more recently and usually not reclaimed unless absolutely necessary. - ||| - ) - .withUnits('decbytes') - .addTarget(commonPromTarget( - expr='node_memory_Inactive_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Inactive', - )) - .addTarget(commonPromTarget( - expr='node_memory_Active_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Active', - )), - - local memoryActiveInactiveDetail = - nodeTimeseries.new( - 'Memory Active / Inactive Details', - description=||| - Inactive_file: File-backed memory on inactive LRU list. - Inactive_anon: Anonymous and swap cache on inactive LRU list, including tmpfs (shmem). - Active_file: File-backed memory on active LRU list. - Active_anon: Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs. 
- ||| - ) - .withUnits('decbytes') - .addTarget(commonPromTarget( - expr='node_memory_Inactive_file_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Inactive_file', - )) - .addTarget(commonPromTarget( - expr='node_memory_Inactive_anon_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Inactive_anon', - )) - .addTarget(commonPromTarget( - expr='node_memory_Active_file_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Active_file', - )) - .addTarget(commonPromTarget( - expr='node_memory_Active_anon_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Active_anon', - )), - - local memoryCommited = - nodeTimeseries.new( - 'Memory Commited', - description=||| - Committed_AS - Amount of memory presently allocated on the system. - CommitLimit - Amount of memory currently available to be allocated on the system. - ||| - ) - .withUnits('decbytes') - .addTarget(commonPromTarget( - expr='node_memory_Committed_AS_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Committed_AS' - )) - .addTarget(commonPromTarget( - expr='node_memory_CommitLimit_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='CommitLimit' - )), - local memorySharedAndMapped = - nodeTimeseries.new( - 'Memory Shared and Mapped', - description=||| - Mapped: This refers to the memory used in mapped page files that have been memory mapped, such as libraries. - Shmem: This is the memory used by shared memory, which is shared between multiple processes, including RAM disks. - ShmemHugePages: This is the memory used by shared memory and tmpfs allocated with huge pages. - ShmemPmdMapped: This is the amount of shared memory (shmem/tmpfs) backed by huge pages. 
- ||| - ) - .withUnits('decbytes') - .addTarget(commonPromTarget( - expr='node_memory_Mapped_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Mapped' - )) - .addTarget(commonPromTarget( - expr='node_memory_Shmem_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Shmem' - )) - .addTarget(commonPromTarget( - expr='node_memory_ShmemHugePages_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='ShmemHugePages' - )) - .addTarget(commonPromTarget( - expr='node_memory_ShmemPmdMapped_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='ShmemPmdMapped' - )), - - local memoryWriteAndDirty = - nodeTimeseries.new( - 'Memory Writeback and Dirty', - description=||| - Writeback: This refers to the memory that is currently being actively written back to the disk. - WritebackTmp: This is the memory used by FUSE for temporary writeback buffers. - Dirty: This type of memory is waiting to be written back to the disk. - ||| - ) - .withUnits('decbytes') - .addTarget(commonPromTarget( - expr='node_memory_Writeback_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Writeback' - )) - .addTarget(commonPromTarget( - expr='node_memory_WritebackTmp_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='WritebackTmp' - )) - .addTarget(commonPromTarget( - expr='node_memory_Dirty_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Dirty' - )), - - local memoryVmalloc = - nodeTimeseries.new( - 'Memory Vmalloc', - description=||| - Virtual Memory Allocation is a type of memory allocation in Linux that allows a process to request a contiguous block of memory larger than the amount of physically available memory. 
This is achieved by mapping the requested memory to virtual addresses that are backed by a combination of physical memory and swap space on disk. - - VmallocChunk: Largest contiguous block of vmalloc area which is free. - VmallocTotal: Total size of vmalloc memory area. - VmallocUsed: Amount of vmalloc area which is used. - ||| - ) - .withUnits('decbytes') - .addTarget(commonPromTarget( - expr='node_memory_VmallocChunk_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='VmallocChunk' - )) - .addTarget(commonPromTarget( - expr='node_memory_VmallocTotal_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='VmallocTotal' - )) - .addTarget(commonPromTarget( - expr='node_memory_VmallocUsed_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='VmallocUsed' - )), - - local memorySlab = - nodeTimeseries.new('Memory Slab', - description=||| - Slab Allocation is a type of memory allocation in Linux that allows the kernel to efficiently manage the allocation and deallocation of small and frequently used data structures, such as network packets, file system objects, and process descriptors. - - The Slab Allocator maintains a cache of pre-allocated objects of a fixed size and type, called slabs. When an application requests an object of a particular size and type, the Slab Allocator checks if a pre-allocated object of that size and type is available in the cache. If an object is available, it is returned to the application; if not, a new slab of objects is allocated and added to the cache. - - SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure. - SReclaimable: Part of Slab, that might be reclaimed, such as caches. 
- |||) - .withUnits('decbytes') - .addTarget(commonPromTarget( - expr='node_memory_SUnreclaim_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='SUnreclaim' - )) - .addTarget(commonPromTarget( - expr='node_memory_SReclaimable_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='SReclaimable' - )), - - local memoryAnonymous = - nodeTimeseries.new( - 'Memory Anonymous', - description=||| - Memory Anonymous refers to the portion of the virtual memory that is used by a process for dynamically allocated memory that is not backed by any file or device. - - This type of memory is commonly used for heap memory allocation, which is used by programs to allocate and free memory dynamically during runtime. - - Memory Anonymous is different from Memory Mapped files, which refer to portions of the virtual memory space that are backed by a file or device, - and from Memory Shared with other processes, - which refers to memory regions that can be accessed and modified by multiple processes. - - AnonHugePages: Memory in anonymous huge pages. - AnonPages: Memory in user pages not backed by files. - ||| - ) - .withUnits('decbytes') - .addTarget(commonPromTarget( - expr='node_memory_AnonHugePages_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='AnonHugePages' - )) - .addTarget(commonPromTarget( - expr='node_memory_AnonPages_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='AnonPages' - )), - - local memoryHugePagesCounter = - nodeTimeseries.new( - 'Memory HugePages Counter', - description=||| - Huge Pages are a feature that allows for the allocation of larger memory pages than the standard 4KB page size. By using larger page sizes, the kernel can reduce the overhead associated with managing a large number of smaller pages, which can improve system performance for certain workloads. 
- - HugePages_Free: Huge pages in the pool that are not yet allocated. - HugePages_Rsvd: Huge pages for which a commitment to allocate from the pool has been made, but no allocation has yet been made. - HugePages_Surp: Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages. - ||| - ) - .addTarget(commonPromTarget( - expr='node_memory_HugePages_Free{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='HugePages_Free' - )) - .addTarget(commonPromTarget( - expr='node_memory_HugePages_Rsvd{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='HugePages_Rsvd' - )) - .addTarget(commonPromTarget( - expr='node_memory_HugePages_Surp{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='HugePages_Surp' - )), - local memoryHugePagesSize = - nodeTimeseries.new( - 'Memory HugePages Size', - description=||| - Huge Pages are a feature that allows for the allocation of larger memory pages than the standard 4KB page size. By using larger page sizes, the kernel can reduce the overhead associated with managing a large number of smaller pages, which can improve system performance for certain workloads. - ||| - ) - .withUnits('decbytes') - .addTarget(commonPromTarget( - expr='node_memory_HugePages_Total{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Huge pages total size' - )) - .addTarget(commonPromTarget( - expr='node_memory_Hugepagesize_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Huge page size' - )), - local memoryDirectMap = - nodeTimeseries.new( - 'Memory Direct Map', - description=||| - Direct Map memory refers to the portion of the kernel's virtual address space that is directly mapped to physical memory. 
This mapping is set up by the kernel during boot time and is used to provide fast access to certain critical kernel data structures, such as page tables and interrupt descriptor tables. - ||| - ) - .withUnits('decbytes') - .addTarget(commonPromTarget( - expr='node_memory_DirectMap1G_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='DirectMap1G' - )) - .addTarget(commonPromTarget( - expr='node_memory_DirectMap2M_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='DirectMap2M' - )) - .addTarget(commonPromTarget( - expr='node_memory_DirectMap4k_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='DirectMap4k' - )), - - local memoryBounce = - nodeTimeseries.new( - 'Memory Bounce', - description=||| - Memory bounce is a technique used in the Linux kernel to handle situations where direct memory access (DMA) is required but the physical memory being accessed is not contiguous. This can happen when a device, such as a network interface card or a disk controller, requires access to a large amount of memory that is not available as a single contiguous block. - - To handle this situation, the kernel uses a technique called memory bouncing. In memory bouncing, the kernel sets up a temporary buffer in physical memory that is large enough to hold the entire data block being transferred by the device. The data is then copied from the non-contiguous source memory to the temporary buffer, which is physically contiguous. - - Bounce: Memory used for block device bounce buffers. 
- ||| - ) - .withUnits('decbytes') - .addTarget(commonPromTarget( - expr='node_memory_Bounce_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Bounce' - )), - local panelsGrid = - [ - c.panelsWithTargets.memoryGauge { gridPos: { x: 0, w: 6, h: 6, y: 0 } }, - c.panelsWithTargets.memoryGraph { gridPos: { x: 6, w: 18, h: 6, y: 0 } }, - { type: 'row', title: 'Vmstat', gridPos: { y: 25 } }, - memoryPagesInOut { gridPos: { x: 0, w: 12, h: 8, y: 25 } }, - memoryPagesSwapInOut { gridPos: { x: 12, w: 12, h: 8, y: 25 } }, - memoryPagesFaults { gridPos: { x: 0, w: 12, h: 8, y: 25 } }, - memoryOOMkiller { gridPos: { x: 12, w: 12, h: 8, y: 25 } }, - { type: 'row', title: 'Memstat', gridPos: { y: 50 } }, - memoryActiveInactive { gridPos: { x: 0, w: 12, h: 8, y: 50 } }, - memoryActiveInactiveDetail { gridPos: { x: 12, w: 12, h: 8, y: 50 } }, - memoryCommited { gridPos: { x: 0, w: 12, h: 8, y: 50 } }, - memorySharedAndMapped { gridPos: { x: 12, w: 12, h: 8, y: 50 } }, - memoryWriteAndDirty { gridPos: { x: 0, w: 12, h: 8, y: 50 } }, - memoryVmalloc { gridPos: { x: 12, w: 12, h: 8, y: 50 } }, - memorySlab { gridPos: { x: 0, w: 12, h: 8, y: 50 } }, - memoryAnonymous { gridPos: { x: 12, w: 12, h: 8, y: 50 } }, - memoryHugePagesCounter { gridPos: { x: 0, w: 12, h: 8, y: 50 } }, - memoryHugePagesSize { gridPos: { x: 12, w: 12, h: 8, y: 50 } }, - memoryDirectMap { gridPos: { x: 0, w: 12, h: 8, y: 50 } }, - memoryBounce { gridPos: { x: 12, w: 12, h: 8, y: 50 } }, - ], - - dashboard: if platform == 'Linux' then - dashboard.new( - '%sNode Memory' % config { nodeQuerySelector: c.nodeQuerySelector }.dashboardNamePrefix, - time_from=config.dashboardInterval, - tags=(config.dashboardTags), - timezone=config.dashboardTimezone, - refresh=config.dashboardRefresh, - graphTooltip='shared_crosshair', - uid=config.grafanaDashboardIDs['nodes-memory.json'], - ) - .addLink(c.links.fleetDash) - .addLink(c.links.nodeDash) - .addLink(c.links.otherDashes) - 
.addAnnotations(c.annotations) - .addTemplates(templates) - .addPanels(panelsGrid) - else if platform == 'Darwin' then {}, - }, -} diff --git a/docs/node-mixin/dashboards/network.libsonnet b/docs/node-mixin/dashboards/network.libsonnet deleted file mode 100644 index ceacd13e42..0000000000 --- a/docs/node-mixin/dashboards/network.libsonnet +++ /dev/null @@ -1,796 +0,0 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local dashboard = grafana.dashboard; -local row = grafana.row; -local prometheus = grafana.prometheus; -local template = grafana.template; -local graphPanel = grafana.graphPanel; -local nodePanels = import '../lib/panels/panels.libsonnet'; -local commonPanels = import '../lib/panels/common/panels.libsonnet'; -local nodeTimeseries = nodePanels.timeseries; -local common = import '../lib/common.libsonnet'; - -{ - - new(config=null, platform=null):: { - local c = common.new(config=config, platform=platform), - local commonPromTarget = c.commonPromTarget, - local templates = c.templates, - local q = c.queries, - - local networkTrafficPanel = - commonPanels.networkTrafficGraph.new( - 'Network Traffic', - description=||| - Network interfaces utilisation by device and direction. - ||| - ) - .addTarget(commonPromTarget( - expr=q.networkReceiveBitsPerSec, - legendFormat='{{device}} received', - )) - .addTarget(commonPromTarget( - expr=q.networkTransmitBitsPerSec, - legendFormat='{{device}} transmitted', - )), - - local networkPacketsPanel = - nodeTimeseries.new( - 'Packets', - description=||| - packets received: Number of good packets received by the interface. - For hardware interfaces counts all good packets received from the device by the host, including packets which host had to drop at various stages of processing (even in the driver). - - packets transmitted: Number of packets successfully transmitted. 
- For hardware interfaces counts packets which host was able to successfully hand over to the device, - which does not necessarily mean that packets had been successfully transmitted out of the device, only that device acknowledged it copied them out of host memory. - - https://docs.kernel.org/networking/statistics.html - ||| - ) - .addTarget(commonPromTarget( - 'irate(node_network_receive_packets_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='{{device}} received', - )) - .addTarget(commonPromTarget( - 'irate(node_network_transmit_packets_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='{{device}} transmitted', - )) - .withDecimals(1) - .withUnits('pps') - .withNegativeYByRegex('transmit') - .withAxisLabel('out(-) | in(+)'), - - local networkErrorsPanel = - nodeTimeseries.new( - 'Network Errors', - description=||| - errors received: Total number of bad packets received on this network device. This counter must include events counted by rx_length_errors, rx_crc_errors, rx_frame_errors and other errors not otherwise counted. - - errors transmitted: Total number of transmit problems. This counter must include events counter by tx_aborted_errors, tx_carrier_errors, tx_fifo_errors, tx_heartbeat_errors, tx_window_errors and other errors not otherwise counted. - - https://docs.kernel.org/networking/statistics.html - ||| - ) - .addTarget(commonPromTarget( - expr=q.networkReceiveErrorsPerSec, - legendFormat='{{device}} received', - )) - .addTarget(commonPromTarget( - expr=q.networkTransmitErrorsPerSec, - legendFormat='{{device}} transmitted', - )) - .withDecimals(1) - .withUnits('pps') - .withNegativeYByRegex('transmit') - .withAxisLabel('out(-) | in(+)'), - - local networkDropsPanel = - nodeTimeseries.new( - 'Dropped Packets', - description=||| - drops received: Number of packets received but not processed, e.g. 
due to lack of resources or unsupported protocol. For hardware interfaces this counter may include packets discarded due to L2 address filtering but should not include packets dropped by the device due to buffer exhaustion which are counted separately in rx_missed_errors (since procfs folds those two counters together). - - drops transmitted: Number of packets dropped on their way to transmission, e.g. due to lack of resources. - https://docs.kernel.org/networking/statistics.html - ||| - ) - .addTarget(commonPromTarget( - expr=q.networkReceiveDropsPerSec, - legendFormat='{{device}} received', - )) - .addTarget(commonPromTarget( - expr=q.networkTransmitDropsPerSec, - legendFormat='{{device}} transmitted', - )) - .withDecimals(1) - .withUnits('pps') - .withNegativeYByRegex('transmit') - .withAxisLabel('out(-) | in(+)'), - local networkCompressedPanel = - nodeTimeseries.new( - 'Compressed Packets', - description=||| - compressed received: - Number of correctly received compressed packets. This counters is only meaningful for interfaces which support packet compression (e.g. CSLIP, PPP). - - compressed transmitted: - Number of transmitted compressed packets. This counters is only meaningful for interfaces which support packet compression (e.g. CSLIP, PPP). 
- - https://docs.kernel.org/networking/statistics.html - ||| - ) - .addTarget(commonPromTarget( - 'irate(node_network_receive_compressed_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='{{device}} received', - )) - .addTarget(commonPromTarget( - 'irate(node_network_transmit_compressed_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='{{device}} transmitted', - )) - .withDecimals(1) - .withUnits('pps') - .withNegativeYByRegex('transmit') - .withAxisLabel('out(-) | in(+)'), - - local networkMulticastPanel = - nodeTimeseries.new( - 'Multicast Packets', - description=||| - Multicast packets received and transmitted. - ||| - ) - .addTarget(commonPromTarget( - 'irate(node_network_receive_multicast_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='{{device}} received', - )) - .addTarget(commonPromTarget( - 'irate(node_network_transmit_multicast_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='{{device}} transmitted', - )) - .withDecimals(1) - .withUnits('pps') - .withNegativeYByRegex('transmit'), - - local networkFifoPanel = - nodeTimeseries.new( - 'Network FIFO', - description=||| - Network FIFO (First-In, First-Out) refers to a buffer used by the network stack to store packets in a queue. - It is a mechanism used to manage network traffic and ensure that packets are delivered to their destination in the order they were received. - Packets are stored in the FIFO buffer until they can be transmitted or processed further. 
- ||| - ) - .addTarget(commonPromTarget( - 'irate(node_network_receive_fifo_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='{{device}} received', - )) - .addTarget(commonPromTarget( - 'irate(node_network_transmit_fifo_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='{{device}} transmitted', - )) - .withDecimals(1) - .withUnits('pps') - .withNegativeYByRegex('transmit') - .withAxisLabel('out(-) | in(+)'), - - local networkNFConntrack = - nodeTimeseries.new( - 'NF Conntrack', - description=||| - NF Conntrack is a component of the Linux kernel's netfilter framework that provides stateful packet inspection to track and manage network connections, - enforce firewall rules, perform NAT, and manage network address/port translation. - ||| - ) - .addTarget(commonPromTarget( - 'node_nf_conntrack_entries{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='NF conntrack entries', - )) - .addTarget(commonPromTarget( - 'node_nf_conntrack_entries_limit{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='NF conntrack limits', - )) - .withFillOpacity(0), - - local networkSoftnetPanel = - nodeTimeseries.new( - 'Softnet Packets', - description=||| - Softnet packets are received by the network and queued for processing by the kernel's networking stack. - Softnet packets are usually generated by network traffic that is directed to the local host, and they are typically processed by the kernel's networking subsystem before being passed on to the relevant application. 
- ||| - ) - .addTarget(commonPromTarget( - 'irate(node_softnet_processed_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='CPU {{cpu }} proccessed', - )) - .addTarget(commonPromTarget( - 'irate(node_softnet_dropped_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='CPU {{cpu }} dropped', - )) - .withDecimals(1) - .withUnits('pps') - .withNegativeYByRegex('dropped') - .withAxisLabel('Dropped(-) | Processed(+)'), - - local networkSoftnetSqueezePanel = - nodeTimeseries.new( - 'Softnet Out of Quota', - description=||| - "Softnet Out of Quota" is a network-related metric in Linux that measures the number of times the kernel's softirq processing was unable to handle incoming network traffic due to insufficient softirq processing capacity. - This means that the kernel has reached its processing capacity limit for incoming packets, and any additional packets will be dropped or deferred. 
- ||| - ) - .addTarget(commonPromTarget( - 'irate(node_softnet_times_squeezed_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='CPU {{cpu}} out of quota', - )) - .withDecimals(1) - .withUnits('pps'), - - local networkInterfacesTable = - nodePanels.table.new( - title='Network Interfaces Overview' - ) - // "Value #A" - .addTarget(commonPromTarget( - expr='node_network_up{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - format='table', - instant=true, - )) - // "Value #B" - .addTarget(commonPromTarget( - expr='node_network_carrier{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - format='table', - instant=true, - )) - // "Value #C" - .addTarget(commonPromTarget( - expr=q.networkTransmitBitsPerSec, - format='table', - instant=true, - )) - // "Value #D" - .addTarget(commonPromTarget( - expr=q.networkReceiveBitsPerSec, - format='table', - instant=true, - )) - // "Value #E" - .addTarget(commonPromTarget( - expr='node_arp_entries{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - format='table', - instant=true, - )) - // "Value #F" - .addTarget(commonPromTarget( - expr='node_network_mtu_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - format='table', - instant=true, - )) - // "Value #G" - .addTarget(commonPromTarget( - expr='node_network_speed_bytes{%(nodeQuerySelector)s} * 8' % config { nodeQuerySelector: c.nodeQuerySelector }, - format='table', - instant=true, - )) - // "Value #H" - .addTarget(commonPromTarget( - expr='node_network_transmit_queue_length{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - format='table', - instant=true, - )) - // "VALUE #I" - .addTarget(commonPromTarget( - expr='node_network_info{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - format='table', - instant=true, - )) - .withTransform() - 
.joinByField(field='device') - .filterFieldsByName('device|address|duplex|Value.+') - .organize( - excludeByName={ - 'Value #I': true, - }, - renameByName= - { - device: 'Interface', - address: 'Address', - duplex: 'Duplex', - 'Value #A': 'Up', - 'Value #B': 'Carrier', - 'Value #C': 'Transmit', - 'Value #D': 'Receive', - 'Value #E': 'ARP entries', - 'Value #F': 'MTU', - 'Value #G': 'Speed', - 'Value #H': 'Queue length', - } - ) - .addOverride( - matcher={ - id: 'byRegexp', - options: 'Speed', - }, - properties=[ - { - id: 'unit', - value: 'bps', - }, - ] - ) - .addOverride( - matcher={ - id: 'byRegexp', - options: 'Carrier|Up', - }, - properties=[ - { - id: 'custom.displayMode', - value: 'color-text', - }, - { - id: 'mappings', - value: [ - { - type: 'value', - options: { - '0': { - text: 'Down', - color: 'light-red', - index: 1, - }, - '1': { - text: 'Up', - color: 'light-green', - index: 0, - }, - }, - }, - ], - }, - ] - ) - .addOverride( - matcher={ - id: 'byRegexp', - options: 'Transmit|Receive', - }, - properties=[ - { - id: 'unit', - value: 'bps', - }, - { - id: 'custom.displayMode', - value: 'gradient-gauge', - }, - { - id: 'color', - value: { - mode: 'continuous-BlYlRd', - }, - }, - { - id: 'max', - value: 1000 * 1000 * 100, - }, - ] - ) - , - - local networkOperStatus = - nodeTimeseries.new( - title='Network Interfaces Carrier Status', - description='Network Interfaces Carrier Status', - ) - .withColor(mode='palette-classic') - .withFillOpacity(100) - .withLegend(mode='list') - .addTarget(commonPromTarget( - expr='node_network_carrier{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='{{device}}' - )) - + { - maxDataPoints: 100, - type: 'status-history', - fieldConfig+: { - defaults+: { - mappings+: [ - { - type: 'value', - options: { - '1': { - text: 'Up', - color: 'light-green', - index: 1, - }, - }, - }, - { - type: 'value', - options: { - '0': { - text: 'Down', - color: 'light-red', - index: 0, - }, - }, - }, - 
- ], - }, - }, - }, - // https://github.com/prometheus/node_exporter/pull/2346/files#diff-3699c850869aecf912f8e8272958b556913fc266534206833a5dcb7d6cca3610 - local networkSockstatTCP = - nodeTimeseries.new( - title='Sockets TCP', - description=||| - TCP sockets are used for establishing and managing network connections between two endpoints over the TCP/IP protocol. - - Orphan sockets: If a process terminates unexpectedly or is terminated without closing its sockets properly, the sockets may become orphaned. - ||| - ) - .addTarget(commonPromTarget( - expr='node_sockstat_TCP_alloc{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Allocated' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_TCP6_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='IPv6 In use' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_TCP_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='IPv4 In use' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_TCP_orphan{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Orphan sockets' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_TCP_tw{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Time wait' - )), - - local networkSockstatUDP = - nodeTimeseries.new( - title='Sockets UDP', - description=||| - UDP (User Datagram Protocol) and UDPlite (UDP-Lite) sockets are used for transmitting and receiving data over the UDP and UDPlite protocols, respectively. - Both UDP and UDPlite are connectionless protocols that do not provide a reliable data delivery mechanism. 
- ||| - ) - .addTarget(commonPromTarget( - expr='node_sockstat_UDPLITE_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='IPv4 UDPLITE in use' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_UDP_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='IPv4 UDP in use' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_UDPLITE6_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='IPv6 UDPLITE in use' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_UDP6_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='IPv6 UDP in use' - )), - - local networkSockstatOther = - nodeTimeseries.new( - title='Sockets Other', - description=||| - FRAG (IP fragment) sockets: Used to receive and process fragmented IP packets. FRAG sockets are useful in network monitoring and analysis. - - RAW sockets: Allow applications to send and receive raw IP packets directly without the need for a transport protocol like TCP or UDP. 
- ||| - ) - .addTarget(commonPromTarget( - expr='node_sockstat_FRAG_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='IPv4 Frag sockets in use' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_FRAG6_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='IPv6 Frag sockets in use' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_RAW_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='IPv4 Raw sockets in use' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_RAW6_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='IPv6 Raw sockets in use' - )), - - - local networkSockstatMemory = - nodeTimeseries.new( - title='Sockets Memory', - description=||| - Memory currently in use for sockets. - ||| - ) - .withMaxDataPoints(100) - .addTarget(commonPromTarget( - expr='node_sockstat_TCP_mem{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Memory pages allocated for TCP sockets' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_UDP_mem{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Memory pages allocated for UDP sockets' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_TCP_mem_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Memory bytes allocated for TCP sockets' - )) - .addTarget(commonPromTarget( - expr='node_sockstat_UDP_mem_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Memory bytes allocated for UDP sockets' - )) - .addOverride( - matcher={ - id: 'byRegexp', - options: '/bytes/', - }, - properties=[ - { - id: 'unit', - value: 'bytes', - }, - { - id: 'custom.drawStyle', - value: 'lines', - }, - { - id: 'custom.drawStyle', - value: 'bars', - }, - { - id: 
'custom.stacking', - value: { - mode: 'normal', - group: 'A', - }, - }, - ] - ), - - local networkSockstatAll = - nodeTimeseries.new( - title='Sockets in use', - description='Number of sockets currently in use.', - ) - .addTarget(commonPromTarget( - expr='node_sockstat_sockets_used{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='IPv4 sockets in use' - )), - - local networkNetstatIP = - nodeTimeseries.new( - title='IP octets', - description='Rate of IP octets received and transmitted.' - ) - .withUnits('oct/s') - .withNegativeYByRegex('transmit') - .withAxisLabel('out(-) | in(+)') - .addTarget(commonPromTarget( - expr='irate(node_netstat_IpExt_InOctets{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Octets received' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_IpExt_OutOctets{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Octets transmitted' - )), - - - local networkNetstatTCP = - nodeTimeseries.new( - title='TCP segments', - description='Rate of TCP segments received and transmitted.' - ) - .withUnits('seg/s') - .withNegativeYByRegex('transmit') - .withAxisLabel('out(-) | in(+)') - .addTarget(commonPromTarget( - expr='irate(node_netstat_Tcp_InSegs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='TCP received' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Tcp_OutSegs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='TCP transmitted' - )), - - local networkNetstatTCPerrors = - nodeTimeseries.new( - title='TCP errors rate', - description='Rate of TCP errors.' 
- ) - .withUnits('err/s') - .addTarget(commonPromTarget( - expr='irate(node_netstat_TcpExt_ListenOverflows{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='TCP overflow' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_TcpExt_ListenDrops{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='TCP ListenDrops - SYNs to LISTEN sockets ignored' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_TcpExt_TCPSynRetrans{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='TCP SYN rentransmits' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Tcp_RetransSegs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='TCP retransmitted segments, containing one or more previously transmitted octets' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Tcp_InErrs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='TCP received with errors' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Tcp_OutRsts{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='TCP segments sent with RST flag' - )), - - local networkNetstatUDP = - nodeTimeseries.new( - title='UDP datagrams', - description='Rate of UDP datagrams received and transmitted.' 
- ) - .withUnits('dat/s') - .withNegativeYByRegex('transmit') - .withAxisLabel('out(-) | in(+)') - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp_InDatagrams{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP received' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp_OutDatagrams{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP transmitted' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp6_InDatagrams{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP6 received' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp6_OutDatagrams{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP6 transmitted' - )), - - local networkNetstatUDPerrors = - nodeTimeseries.new( - title='UDP errors rate', - description='Rate of UDP datagrams received and transmitted with errors.' 
- ) - .withUnits('err/s') - .addTarget(commonPromTarget( - expr='irate(node_netstat_UdpLite_InErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDPLite InErrors' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp_InErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP InErrors' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp6_InErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP6 InErrors' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp_NoPorts{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP NoPorts' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp6_NoPorts{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP6 NoPorts' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp_RcvbufErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP receive buffer errors' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp6_RcvbufErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP6 receive buffer errors' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp_SndbufErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP send buffer errors' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Udp6_SndbufErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='UDP6 send buffer errors' - )), - - - local networkNetstatICMP = - nodeTimeseries.new( - title='ICMP messages', 
- description="Rate of ICMP messages, like 'ping', received and transmitted." - ) - .withUnits('msg/s') - .withNegativeYByRegex('transmit') - .withAxisLabel('out(-) | in(+)') - .addTarget(commonPromTarget( - expr='irate(node_netstat_Icmp_InMsgs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='ICMP received' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Icmp_OutMsgs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='ICMP transmitted' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Icmp6_InMsgs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='ICMP6 received' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Icmp6_OutMsgs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='ICMP6 transmitted' - )), - - local networkNetstatICMPerrors = - nodeTimeseries.new( - title='ICMP errors rate', - description='Rate of ICMP messages received and transmitted with errors.' 
- ) - .withUnits('err/s') - .addTarget(commonPromTarget( - expr='irate(node_netstat_Icmp_InErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='ICMP Errors' - )) - .addTarget(commonPromTarget( - expr='irate(node_netstat_Icmp6_InErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='ICMP6 Errors' - )), - - - local rows = - [ - row.new('Network') - .addPanel(networkInterfacesTable { span: 12 }) - .addPanel(networkTrafficPanel { span: 6 }) - .addPanel(networkOperStatus { span: 6 }) - .addPanel(networkErrorsPanel { span: 6 }) - .addPanel(networkDropsPanel { span: 6 }) - .addPanel(networkPacketsPanel { span: 6 }) - .addPanel(networkMulticastPanel { span: 6 }) - .addPanel(networkFifoPanel { span: 6 }) - .addPanel(networkCompressedPanel { span: 6 }) - .addPanel(networkNFConntrack { span: 6 }) - .addPanel(networkSoftnetPanel { span: 6 }) - .addPanel(networkSoftnetSqueezePanel { span: 6 }), - row.new('Network Sockets') - .addPanel(networkSockstatAll { span: 12 }) - .addPanel(networkSockstatTCP { span: 6 }) - .addPanel(networkSockstatUDP { span: 6 }) - .addPanel(networkSockstatMemory { span: 6 }) - .addPanel(networkSockstatOther { span: 6 }), - - row.new('Network Netstat') - .addPanel(networkNetstatIP { span: 12 }) - .addPanel(networkNetstatTCP { span: 6 }) - .addPanel(networkNetstatTCPerrors { span: 6 }) - .addPanel(networkNetstatUDP { span: 6 }) - .addPanel(networkNetstatUDPerrors { span: 6 }) - .addPanel(networkNetstatICMP { span: 6 }) - .addPanel(networkNetstatICMPerrors { span: 6 }), - ], - - dashboard: if platform == 'Linux' then - dashboard.new( - '%sNode Network' % config { nodeQuerySelector: c.nodeQuerySelector }.dashboardNamePrefix, - time_from=config.dashboardInterval, - tags=(config.dashboardTags), - timezone=config.dashboardTimezone, - refresh=config.dashboardRefresh, - graphTooltip='shared_crosshair', - 
uid=config.grafanaDashboardIDs['nodes-network.json'] - ) - .addLink(c.links.fleetDash) - .addLink(c.links.nodeDash) - .addLink(c.links.otherDashes) - .addAnnotations(c.annotations) - .addTemplates(templates) - .addRows(rows) - else if platform == 'Darwin' then {}, - }, -} diff --git a/docs/node-mixin/dashboards/node.libsonnet b/docs/node-mixin/dashboards/node.libsonnet deleted file mode 100644 index a00eb1b9f7..0000000000 --- a/docs/node-mixin/dashboards/node.libsonnet +++ /dev/null @@ -1,19 +0,0 @@ -{ - local nodemixin = import './prom-mixin.libsonnet', - local cpu = import './cpu.libsonnet', - local system = import './system.libsonnet', - local memory = import './memory.libsonnet', - local disk = import './disk.libsonnet', - local network = import './network.libsonnet', - local fleet = import './fleet.libsonnet', - - grafanaDashboards+:: { - 'nodes.json': nodemixin.new(config=$._config, platform='Linux').dashboard, - 'nodes-darwin.json': nodemixin.new(config=$._config, platform='Darwin').dashboard, - 'nodes-system.json': system.new(config=$._config, platform='Linux').dashboard, - 'nodes-memory.json': memory.new(config=$._config, platform='Linux').dashboard, - 'nodes-network.json': network.new(config=$._config, platform='Linux').dashboard, - 'nodes-disk.json': disk.new(config=$._config, platform='Linux').dashboard, - 'nodes-fleet.json': fleet.new(config=$._config, platform='Linux').dashboard, - }, -} diff --git a/docs/node-mixin/dashboards/prom-mixin.libsonnet b/docs/node-mixin/dashboards/prom-mixin.libsonnet deleted file mode 100644 index a562844073..0000000000 --- a/docs/node-mixin/dashboards/prom-mixin.libsonnet +++ /dev/null @@ -1,180 +0,0 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local dashboard = grafana.dashboard; -local row = grafana.row; -local prometheus = grafana.prometheus; -local template = grafana.template; -local graphPanel = grafana.graphPanel; -local statPanel = grafana.statPanel; -local nodePanels 
= import '../lib/panels/panels.libsonnet'; -local commonPanels = import '../lib/panels/common/panels.libsonnet'; -local nodeTimeseries = nodePanels.timeseries; -local common = import '../lib/common.libsonnet'; -local nodeTemplates = common.templates; - -{ - - new(config=null, platform=null):: { - - local c = common.new(config=config, platform=platform), - local commonPromTarget = c.commonPromTarget, - local templates = c.templates, - local q = c.queries, - - local uptimePanel = - commonPanels.uptimeStat.new() - .addTarget(commonPromTarget(expr=q.uptime)), - - local cpuCountPanel = - commonPanels.infoStat.new('CPU Count') - .addTarget(commonPromTarget(expr=q.cpuCount)), - - local memoryTotalPanel = - commonPanels.infoStat.new('Memory Total') - .addTarget(commonPromTarget(expr=q.memoryTotal)) - .withUnits('bytes') - .withDecimals(0), - - local osPanel = - commonPanels.infoStat.new('OS') - .addTarget(commonPromTarget( - expr=q.osInfo, format='table' - )) { options+: { reduceOptions+: { fields: '/^pretty_name$/' } } }, - - local nodeNamePanel = - commonPanels.infoStat.new('Hostname') - .addTarget(commonPromTarget( - expr=q.nodeInfo, format='table' - )) - { options+: { reduceOptions+: { fields: '/^nodename$/' } } }, - - local kernelVersionPanel = - - commonPanels.infoStat.new('Kernel version') - .addTarget(commonPromTarget( - expr=q.nodeInfo, format='table' - )) - { options+: { reduceOptions+: { fields: '/^release$/' } } } - , - - local totalSwapPanel = - commonPanels.infoStat.new('Total swap') - .addTarget(commonPromTarget( - expr=q.memorySwapTotal - )) - .withUnits('bytes') - .withDecimals(0), - - local totalRootFSPanel = - commonPanels.infoStat.new('Root mount size') - .addTarget(commonPromTarget( - expr=q.fsSizeTotalRoot, - )) - .withUnits('bytes') - .withDecimals(0), - - local networkTrafficPanel = - commonPanels.networkTrafficGraph.new( - 'Network Traffic', description='Network transmitted and received (bits/s)', - ) - .addTarget(commonPromTarget( - 
expr=q.networkReceiveBitsPerSec, - legendFormat='{{device}} received', - )) - .addTarget(commonPromTarget( - expr=q.networkTransmitBitsPerSec, - legendFormat='{{device}} transmitted', - )), - - local networkErrorsDropsPanel = - nodePanels.timeseries.new( - 'Network Errors and Dropped Packets', - description=||| - errors received: Total number of bad packets received on this network device. This counter must include events counted by rx_length_errors, rx_crc_errors, rx_frame_errors and other errors not otherwise counted. - - errors transmitted: Total number of transmit problems. This counter must include events counter by tx_aborted_errors, tx_carrier_errors, tx_fifo_errors, tx_heartbeat_errors, tx_window_errors and other errors not otherwise counted. - - drops received: Number of packets received but not processed, e.g. due to lack of resources or unsupported protocol. For hardware interfaces this counter may include packets discarded due to L2 address filtering but should not include packets dropped by the device due to buffer exhaustion which are counted separately in rx_missed_errors (since procfs folds those two counters together). - - drops transmitted: Number of packets dropped on their way to transmission, e.g. due to lack of resources. 
- - https://docs.kernel.org/networking/statistics.html - ||| - ) - .addTarget(commonPromTarget( - expr=q.networkReceiveErrorsPerSec, - legendFormat='{{device}} errors received', - )) - .addTarget(commonPromTarget( - expr=q.networkTransmitErrorsPerSec, - legendFormat='{{device}} errors transmitted', - )) - .addTarget(commonPromTarget( - expr=q.networkReceiveDropsPerSec, - legendFormat='{{device}} drops received', - )) - .addTarget(commonPromTarget( - expr=q.networkTransmitDropsPerSec, - legendFormat='{{device}} drops transmitted', - )) - .withDecimals(1) - .withUnits('pps') - .withNegativeYByRegex('trasnmitted') - .withAxisLabel('out(-) | in(+)'), - - - local panelsGrid = - [ - // use negative gravity effect, max w=24, default h=8 - { type: 'row', title: 'Overview' }, - uptimePanel { gridPos: { x: 0, w: 6, h: 2 } }, - nodeNamePanel { gridPos: { x: 6, w: 6, h: 2 } }, - kernelVersionPanel { gridPos: { x: 12, w: 6, h: 2 } }, - osPanel { gridPos: { x: 18, w: 6, h: 2 } }, - cpuCountPanel { gridPos: { x: 0, w: 6, h: 2 } }, - memoryTotalPanel { gridPos: { x: 6, w: 6, h: 2 } }, - totalSwapPanel { gridPos: { x: 12, w: 6, h: 2 } }, - totalRootFSPanel { gridPos: { x: 18, w: 6, h: 2 } }, - { type: 'row', title: 'CPU' } { gridPos: { y: 25 } }, - c.panelsWithTargets.cpuStatPanel { gridPos: { x: 0, w: 6, h: 6, y: 25 } }, - c.panelsWithTargets.idleCPU { gridPos: { x: 6, w: 12, h: 6, y: 25 } }, - c.panelsWithTargets.systemLoad { gridPos: { x: 18, w: 6, h: 6, y: 25 } }, - { type: 'row', title: 'Memory' } { gridPos: { y: 50 } }, - c.panelsWithTargets.memoryGauge { gridPos: { x: 0, w: 6, h: 6, y: 50 } }, - c.panelsWithTargets.memoryGraph { gridPos: { x: 6, w: 18, h: 6, y: 50 } }, - { type: 'row', title: 'Disk' } { gridPos: { y: 75 } }, - c.panelsWithTargets.diskIO { gridPos: { x: 0, w: 12, h: 8, y: 75 } }, - c.panelsWithTargets.diskSpaceUsage { gridPos: { x: 12, w: 12, h: 8, y: 75 } }, - { type: 'row', title: 'Network' } { gridPos: { y: 100 } }, - networkTrafficPanel { gridPos: { x: 0, 
w: 12, h: 8, y: 100 } }, - networkErrorsDropsPanel { gridPos: { x: 12, w: 12, h: 8, y: 100 } }, - ], - dashboard: if platform == 'Linux' then - dashboard.new( - '%sNode Overview' % config { nodeQuerySelector: c.nodeQuerySelector }.dashboardNamePrefix, - time_from=config.dashboardInterval, - tags=(config.dashboardTags), - timezone=config.dashboardTimezone, - refresh=config.dashboardRefresh, - graphTooltip='shared_crosshair', - uid=config.grafanaDashboardIDs['nodes.json'], - ) - .addLink(c.links.fleetDash) - .addLink(c.links.otherDashes) - .addAnnotations(c.annotations) - .addTemplates(templates) - .addPanels(panelsGrid) - else if platform == 'Darwin' then - dashboard.new( - '%sMacOS' % config { nodeQuerySelector: c.nodeQuerySelector }.dashboardNamePrefix, - time_from=config.dashboardInterval, - tags=(config.dashboardTags), - timezone=config.dashboardTimezone, - refresh=config.dashboardRefresh, - graphTooltip='shared_crosshair', - uid=config.grafanaDashboardIDs['nodes-darwin.json'], - ) - .addTemplates(templates) - .addPanels(panelsGrid), - - }, -} diff --git a/docs/node-mixin/dashboards/system.libsonnet b/docs/node-mixin/dashboards/system.libsonnet deleted file mode 100644 index e1bd58d759..0000000000 --- a/docs/node-mixin/dashboards/system.libsonnet +++ /dev/null @@ -1,150 +0,0 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local dashboard = grafana.dashboard; -local row = grafana.row; -local prometheus = grafana.prometheus; -local template = grafana.template; -local graphPanel = grafana.graphPanel; -local nodePanels = import '../lib/panels/panels.libsonnet'; -local commonPanels = import '../lib/panels/common/panels.libsonnet'; -local nodeTimeseries = nodePanels.timeseries; -local common = import '../lib/common.libsonnet'; - -{ - - new(config=null, platform=null):: { - local c = common.new(config=config, platform=platform), - local commonPromTarget = c.commonPromTarget, - local templates = c.templates, - local q = 
c.queries, - - local cpuUsageModes = - nodeTimeseries.new( - 'CPU Usage', - description=||| - System: Processes executing in kernel mode. - User: Normal processes executing in user mode. - Nice: Niced processes executing in user mode. - Idle: Waiting for something to happen. - Iowait: Waiting for I/O to complete. - Irq: Servicing interrupts. - Softirq: Servicing softirqs. - Steal: Time spent in other operating systems when running in a virtualized environment. - ||| - ) - .withStacking('normal') - .withUnits('percent') - .withFillOpacity(100) - .withMax(100) - .withMin(0) - .addTarget(commonPromTarget( - expr=q.cpuUsageModes, - legendFormat='{{mode}}', - )), - local timeZoneOffset = - commonPanels.infoStat.new( - 'Timezone', - description='Timezone set on instance.' - ) - .addTarget(commonPromTarget( - expr=q.node_time_zone_offset_seconds, format='table' - )) - { options+: { reduceOptions+: { fields: '/^time_zone$/' } } }, - local timeSyncDrift = - nodeTimeseries.new( - 'Time Synchronized Drift', - description=||| - Time synchronization is essential to ensure accurate timekeeping, which is critical for many system operations such as logging, authentication, and network communication, as well as distributed systems or clusters where data consistency is important. - ||| - ) - .withUnits('s') - .addTarget(commonPromTarget( - expr=q.node_timex_estimated_error_seconds, - legendFormat='Estimated error in seconds', - )) - .addTarget(commonPromTarget( - expr=q.node_timex_offset_seconds, - legendFormat='Time offset in between local system and reference clock', - )) - .addTarget(commonPromTarget( - expr=q.node_timex_maxerror_seconds, - legendFormat='Maximum error in seconds' - )), - - local timeSynchronizedStatus = - nodeTimeseries.new( - 'Time Synchronized Status', - description='Status of time synchronization.' 
- ) - .withColor(mode='palette-classic') - .withFillOpacity(75) - .withLegend(show=false) - { - maxDataPoints: 100, - type: 'status-history', - fieldConfig+: { - defaults+: { - mappings+: [ - { - type: 'value', - options: { - '1': { - text: 'In sync', - color: 'light-green', - index: 1, - }, - }, - }, - { - type: 'value', - options: { - '0': { - text: 'Not in sync', - color: 'light-yellow', - index: 0, - }, - }, - }, - - ], - }, - }, - } - .addTarget(commonPromTarget( - expr=q.node_timex_sync_status, - legendFormat='Sync status', - )), - - local panelsGrid = - [ - //use negative gravity(skip y), max w=24, default h should be '6'. - c.panelsWithTargets.cpuStatPanel { gridPos: { x: 0, w: 6, h: 6 } }, - c.panelsWithTargets.idleCPU { gridPos: { x: 6, h: 6, w: 9 } }, - cpuUsageModes { gridPos: { x: 15, h: 6, w: 9 } }, - //pseudorow y:25 - c.panelsWithTargets.systemLoad { gridPos: { x: 0, h: 6, w: 12, y: 25 } }, - c.panelsWithTargets.systemContextSwitches { gridPos: { x: 12, h: 6, w: 12, y: 25 } }, - { type: 'row', title: 'Time', gridPos: { x: 0, w: 24, y: 75 } }, - timeZoneOffset { gridPos: { x: 0, h: 3, w: 3, y: 75 } }, - timeSynchronizedStatus { gridPos: { x: 3, h: 3, w: 21, y: 75 } }, - timeSyncDrift { gridPos: { x: 0, h: 6, w: 24, y: 80 } }, - ], - - dashboard: if platform == 'Linux' then - dashboard.new( - '%sNode CPU and System' % config { nodeQuerySelector: c.nodeQuerySelector }.dashboardNamePrefix, - time_from=config.dashboardInterval, - tags=(config.dashboardTags), - timezone=config.dashboardTimezone, - refresh=config.dashboardRefresh, - graphTooltip='shared_crosshair', - uid=config.grafanaDashboardIDs['nodes-system.json'], - ) - .addLink(c.links.fleetDash) - .addLink(c.links.nodeDash) - .addLink(c.links.otherDashes) - .addAnnotations(c.annotations) - .addTemplates(templates) - .addPanels(panelsGrid) - else if platform == 'Darwin' then {}, - }, -} diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet deleted file mode 
100644 index 9de0c4103a..0000000000 --- a/docs/node-mixin/dashboards/use.libsonnet +++ /dev/null @@ -1,476 +0,0 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local dashboard = grafana.dashboard; -local row = grafana.row; -local prometheus = grafana.prometheus; -local template = grafana.template; -local graphPanel = grafana.graphPanel; - -local datasourceTemplate = { - current: { - text: 'default', - value: 'default', - }, - hide: 0, - label: 'Data Source', - name: 'datasource', - options: [], - query: 'prometheus', - refresh: 1, - regex: '', - type: 'datasource', -}; - -local CPUUtilisation = - graphPanel.new( - 'CPU Utilisation', - description='Total CPU utilisation percent.', - datasource='$datasource', - span=6, - format='percentunit', - stack=true, - fill=10, - legend_show=false, - ) { tooltip+: { sort: 2 } }; - -local CPUSaturation = - // TODO: Is this a useful panel? At least there should be some explanation how load - // average relates to the "CPU saturation" in the title. - graphPanel.new( - 'CPU Saturation (Load1 per CPU)', - description='System load average over the last minute. A measurement of how many processes are waiting for CPU cycles. 
The value is as a percent compared to the number of CPU cores for the node.', - datasource='$datasource', - span=6, - format='percentunit', - stack=true, - fill=10, - legend_show=false, - ) { tooltip+: { sort: 2 } }; - -local memoryUtilisation = - graphPanel.new( - 'Memory Utilisation', - description='Total memory utilisation in percent.', - datasource='$datasource', - span=6, - format='percentunit', - stack=true, - fill=10, - legend_show=false, - ) { tooltip+: { sort: 2 } }; - -local memorySaturation = - graphPanel.new( - 'Memory Saturation (Major Page Faults)', - description='Rate of major memory page faults.', - datasource='$datasource', - span=6, - format='rds', - stack=true, - fill=10, - legend_show=false, - ) { tooltip+: { sort: 2 } }; - -local networkUtilisation = - graphPanel.new( - 'Network Utilisation (Bytes Receive/Transmit)', - description='Network Utilisation (Bytes Receive/Transmit)', - datasource='$datasource', - span=6, - format='Bps', - stack=true, - fill=10, - legend_show=false, - ) - .addSeriesOverride({ alias: '/Receive/', stack: 'A' }) - .addSeriesOverride({ alias: '/Transmit/', stack: 'B', transform: 'negative-Y' }) - { tooltip+: { sort: 2 } }; - -local networkSaturation = - graphPanel.new( - 'Network Saturation (Drops Receive/Transmit)', - description='Network Saturation (Drops Receive/Transmit)', - datasource='$datasource', - span=6, - format='Bps', - stack=true, - fill=10, - legend_show=false, - ) - .addSeriesOverride({ alias: '/ Receive/', stack: 'A' }) - .addSeriesOverride({ alias: '/ Transmit/', stack: 'B', transform: 'negative-Y' }) - { tooltip+: { sort: 2 } }; - -local diskIOUtilisation = - graphPanel.new( - 'Disk IO Utilisation', - description='Disk total IO seconds.', - datasource='$datasource', - span=6, - format='percentunit', - stack=true, - fill=10, - legend_show=false, - ) { tooltip+: { sort: 2 } }; - -local diskIOSaturation = - graphPanel.new( - 'Disk IO Saturation', - description='Disk saturation (weighted seconds spent, 1 
second rate)', - datasource='$datasource', - span=6, - format='percentunit', - stack=true, - fill=10, - legend_show=false, - ) { tooltip+: { sort: 2 } }; - -local diskSpaceUtilisation = - graphPanel.new( - 'Disk Space Utilisation', - description='Total disk utilisation percent', - datasource='$datasource', - span=12, - format='percentunit', - stack=true, - fill=10, - legend_show=false, - ) { tooltip+: { sort: 2 } }; - -{ - _clusterTemplate:: template.new( - name='cluster', - datasource='$datasource', - query='label_values(node_time_seconds, %s)' % $._config.clusterLabel, - current='', - hide=if $._config.showMultiCluster then '' else '2', - refresh=2, - includeAll=false, - sort=1 - ), - - grafanaDashboards+:: { - 'node-rsrc-use.json': - - dashboard.new( - '%sUSE Method / Node' % $._config.dashboardNamePrefix, - time_from=$._config.dashboardInterval, - tags=($._config.dashboardTags), - timezone=$._config.dashboardTimezone, - refresh=$._config.dashboardRefresh, - graphTooltip='shared_crosshair', - uid=$._config.grafanaDashboardIDs['node-rsrc-use.json'], - ) - .addTemplate(datasourceTemplate) - .addTemplate($._clusterTemplate) - .addTemplate( - template.new( - 'instance', - '$datasource', - 'label_values(node_exporter_build_info{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}, instance)' % $._config, - refresh='time', - sort=1 - ) - ) - .addRow( - row.new('CPU') - .addPanel(CPUUtilisation.addTarget(prometheus.target('instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Utilisation'))) - .addPanel(CPUSaturation.addTarget(prometheus.target('instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Saturation'))) - ) - .addRow( - row.new('Memory') - .addPanel(memoryUtilisation.addTarget(prometheus.target('instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, 
instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Utilisation'))) - .addPanel(memorySaturation.addTarget(prometheus.target('instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Major page Faults'))) - ) - .addRow( - row.new('Network') - .addPanel( - networkUtilisation - .addTarget(prometheus.target('instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Receive')) - .addTarget(prometheus.target('instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Transmit')) - ) - .addPanel( - networkSaturation - .addTarget(prometheus.target('instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Receive')) - .addTarget(prometheus.target('instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Transmit')) - ) - ) - .addRow( - row.new('Disk IO') - .addPanel(diskIOUtilisation.addTarget(prometheus.target('instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{device}}'))) - .addPanel(diskIOSaturation.addTarget(prometheus.target('instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{device}}'))) - ) - .addRow( - row.new('Disk Space') - .addPanel( - diskSpaceUtilisation.addTarget(prometheus.target( - ||| - 
sort_desc(1 - - ( - max without (mountpoint, fstype) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, fstype!="", instance="$instance", %(clusterLabel)s="$cluster"}) - / - max without (mountpoint, fstype) (node_filesystem_size_bytes{%(nodeExporterSelector)s, fstype!="", instance="$instance", %(clusterLabel)s="$cluster"}) - ) != 0 - ) - ||| % $._config, legendFormat='{{device}}' - )) - ) - ), - - 'node-cluster-rsrc-use.json': - dashboard.new( - '%sUSE Method / Cluster' % $._config.dashboardNamePrefix, - time_from=$._config.dashboardInterval, - tags=($._config.dashboardTags), - timezone=$._config.dashboardTimezone, - refresh=$._config.dashboardRefresh, - graphTooltip='shared_crosshair', - uid=$._config.grafanaDashboardIDs['node-cluster-rsrc-use.json'], - ) - .addTemplate(datasourceTemplate) - .addTemplate($._clusterTemplate) - .addRow( - row.new('CPU') - .addPanel( - CPUUtilisation - .addTarget(prometheus.target( - ||| - (( - instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} - * - instance:node_num_cpu:sum{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} - ) != 0 ) - / scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"})) - ||| % $._config, legendFormat='{{ instance }}' - )) - ) - .addPanel( - CPUSaturation - .addTarget(prometheus.target( - ||| - ( - instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} - / scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"})) - ) != 0 - ||| % $._config, legendFormat='{{instance}}' - )) - ) - ) - .addRow( - row.new('Memory') - .addPanel( - memoryUtilisation - .addTarget(prometheus.target( - ||| - ( - instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} - / scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"})) - ) != 0 - ||| % $._config, 
legendFormat='{{instance}}', - )) - ) - .addPanel(memorySaturation.addTarget(prometheus.target('instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}' % $._config, legendFormat='{{instance}}'))) - ) - .addRow( - row.new('Network') - .addPanel( - networkUtilisation - .addTarget(prometheus.target('instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Receive')) - .addTarget(prometheus.target('instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Transmit')) - ) - .addPanel( - networkSaturation - .addTarget(prometheus.target('instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Receive')) - .addTarget(prometheus.target('instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Transmit')) - ) - ) - .addRow( - row.new('Disk IO') - .addPanel( - diskIOUtilisation - .addTarget(prometheus.target( - ||| - ( - instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} - / scalar(count(instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"})) - ) != 0 - ||| % $._config, legendFormat='{{instance}} {{device}}' - )) - ) - .addPanel( - diskIOSaturation - .addTarget(prometheus.target( - ||| - ( - instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} - / scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, 
%(clusterLabel)s="$cluster"})) - ) != 0 - ||| % $._config, legendFormat='{{instance}} {{device}}' - )) - ) - ) - .addRow( - row.new('Disk Space') - .addPanel( - diskSpaceUtilisation - .addTarget(prometheus.target( - ||| - sum without (device) ( - max without (fstype, mountpoint) (( - node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s, %(clusterLabel)s="$cluster"} - - - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s, %(clusterLabel)s="$cluster"} - ) != 0) - ) - / scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s, %(clusterLabel)s="$cluster"}))) - ||| % $._config, legendFormat='{{instance}}' - )) - ) - ), - } + - if $._config.showMultiCluster then { - 'node-multicluster-rsrc-use.json': - dashboard.new( - '%sUSE Method / Multi-cluster' % $._config.dashboardNamePrefix, - time_from=$._config.dashboardInterval, - tags=($._config.dashboardTags), - timezone=$._config.dashboardTimezone, - refresh=$._config.dashboardRefresh, - graphTooltip='shared_crosshair', - uid=$._config.grafanaDashboardIDs['node-multicluster-rsrc-use.json'], - ) - .addTemplate(datasourceTemplate) - .addRow( - row.new('CPU') - .addPanel( - CPUUtilisation - .addTarget(prometheus.target( - ||| - sum( - (( - instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s} - * - instance:node_num_cpu:sum{%(nodeExporterSelector)s} - ) != 0) - / scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s})) - ) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config - )) - ) - .addPanel( - CPUSaturation - .addTarget(prometheus.target( - ||| - sum(( - instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s} - / scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s})) - ) != 0) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config - 
)) - ) - ) - .addRow( - row.new('Memory') - .addPanel( - memoryUtilisation - .addTarget(prometheus.target( - ||| - sum(( - instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s} - / scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s})) - ) != 0) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config - )) - ) - .addPanel( - memorySaturation - .addTarget(prometheus.target( - ||| - sum(( - instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s} - ) != 0) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config - )) - ) - ) - .addRow( - row.new('Network') - .addPanel( - networkUtilisation - .addTarget(prometheus.target( - ||| - sum(( - instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s} - ) != 0) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}} Receive' % $._config - )) - .addTarget(prometheus.target( - ||| - sum(( - instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s} - ) != 0) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}} Transmit' % $._config - )) - ) - .addPanel( - networkSaturation - .addTarget(prometheus.target( - ||| - sum(( - instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s} - ) != 0) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}} Receive' % $._config - )) - .addTarget(prometheus.target( - ||| - sum(( - instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s} - ) != 0) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}} Transmit' % $._config - )) - ) - ) - .addRow( - row.new('Disk IO') - .addPanel( - diskIOUtilisation - .addTarget(prometheus.target( - ||| - sum(( - instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s} - / 
scalar(count(instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s})) - ) != 0) by (%(clusterLabel)s, device) - ||| % $._config, legendFormat='{{%(clusterLabel)s}} {{device}}' % $._config - )) - ) - .addPanel( - diskIOSaturation - .addTarget(prometheus.target( - ||| - sum(( - instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s} - / scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s})) - ) != 0) by (%(clusterLabel)s, device) - ||| % $._config, legendFormat='{{%(clusterLabel)s}} {{device}}' % $._config - )) - ) - ) - .addRow( - row.new('Disk Space') - .addPanel( - diskSpaceUtilisation - .addTarget(prometheus.target( - ||| - sum ( - sum without (device) ( - max without (fstype, mountpoint, instance, pod) (( - node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s} - ) != 0) - ) - / scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}))) - ) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config - )) - ) - ), - } else {}, -} diff --git a/docs/node-mixin/jsonnetfile.json b/docs/node-mixin/jsonnetfile.json index 721d4833a0..7459652bfa 100644 --- a/docs/node-mixin/jsonnetfile.json +++ b/docs/node-mixin/jsonnetfile.json @@ -4,8 +4,17 @@ { "source": { "git": { - "remote": "https://github.com/grafana/grafonnet-lib.git", - "subdir": "grafonnet" + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-v11.0.0" + } + }, + "version": "main" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "common-lib" } }, "version": "master" @@ -13,12 +22,12 @@ { "source": { "git": { - "remote": 
"https://github.com/grafana/grafonnet-lib.git", - "subdir": "grafonnet-7.0" + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "logs-lib" } }, "version": "master" } ], - "legacyImports": false -} + "legacyImports": true +} \ No newline at end of file diff --git a/docs/node-mixin/lib/common.libsonnet b/docs/node-mixin/lib/common.libsonnet deleted file mode 100644 index f9ff9f9b22..0000000000 --- a/docs/node-mixin/lib/common.libsonnet +++ /dev/null @@ -1,707 +0,0 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local dashboard = grafana.dashboard; -local row = grafana.row; -local prometheus = grafana.prometheus; -local template = grafana.template; -local nodePanels = import '../lib/panels/panels.libsonnet'; -local commonPanels = import '../lib/panels/common/panels.libsonnet'; -local nodeTimeseries = nodePanels.timeseries; -{ - - new(config=null, platform=null):: { - - local c = self, - - local labelsToRegexSelector(labels) = - std.join(',', ['%s=~"$%s"' % [label, label] for label in labels]), - local labelsToLegend(labels) = - std.join('/', ['{{%s}}' % [label] for label in labels]), - - local labelsToURLvars(labels, prefix) = - std.join('&', ['var-%s=${%s%s}' % [label, prefix, label] for label in labels]), - // export - labelsToLegend:: labelsToLegend, - labelsToURLvars:: labelsToURLvars, - // add to all queries but not templates - local nodeQuerySelector = labelsToRegexSelector(std.split(config.groupLabels + ',' + config.instanceLabels, ',')), - nodeQuerySelector:: nodeQuerySelector, - - // common templates - local prometheusDatasourceTemplate = { - current: { - text: 'default', - value: 'default', - }, - hide: 0, - label: 'Data Source', - name: 'datasource', - options: [], - query: 'prometheus', - refresh: 1, - regex: '', - type: 'datasource', - }, - - local chainLabelsfold(prev, label) = { - chain: - if std.length(prev) > 0 - then - [[label] + prev.chain[0]] + prev.chain - else - [[label]], - }, - - 
local chainLabels(labels) = - [ - { - label: l[0:1][0], - chainSelector: labelsToRegexSelector(std.reverse(l[1:])), - } - for l in std.reverse(std.foldl(chainLabelsfold, labels, init={}).chain) - ], - - local groupTemplates = - [ - template.new( - name=label.label, - label=label.label, - datasource='$datasource', - query='', - current='', - refresh=2, - includeAll=true, - // do not use .*, will get series without such label at all when ALL is selected, ignoring nodeExporterSelector results - allValues=null, - multi=true, - sort=1 - ) - { - query: if platform == 'Darwin' then 'label_values(node_uname_info{sysname="Darwin", %(nodeExporterSelector)s, %(chainSelector)s}, %(label)s)' % config { label: label.label, chainSelector: label.chainSelector } - else 'label_values(node_uname_info{sysname!="Darwin", %(nodeExporterSelector)s, %(chainSelector)s}, %(label)s)' % config { label: label.label, chainSelector: label.chainSelector }, - } - for label in chainLabels(std.split(config.groupLabels, ',')) - ], - - local instanceTemplates = - [ - template.new( - label.label, - '$datasource', - 'label_values(node_uname_info{%(nodeExporterSelector)s, %(chainSelector)s}, %(label)s)' % config { label: label.label, chainSelector: labelsToRegexSelector(std.split(config.groupLabels, ',')) + ',' + label.chainSelector }, - sort=1, - refresh='time', - label=label.label, - ) - for label in chainLabels(std.split(config.instanceLabels, ',')) - ], - - // return common templates - templates: [prometheusDatasourceTemplate] + groupTemplates + instanceTemplates, - // return templates where instance select is not required - groupDashboardTemplates: [prometheusDatasourceTemplate] + groupTemplates, - - local rebootAnnotation = { - datasource: { - type: 'prometheus', - uid: '$datasource', - }, - enable: true, - hide: true, - expr: 'node_boot_time_seconds{%(nodeQuerySelector)s}*1000 > $__from < $__to' % config { nodeQuerySelector: nodeQuerySelector }, - name: 'Reboot', - iconColor: 'light-orange', - 
tagKeys: config.instanceLabels, - textFormat: '', - titleFormat: 'Reboot', - useValueForTime: 'on', - }, - local memoryOOMkillerAnnotation = { - datasource: { - type: 'prometheus', - uid: '$datasource', - }, - enable: true, - hide: true, - expr: 'increase(node_vmstat_oom_kill{%(nodeQuerySelector)s}[$__interval:] offset -$__interval)' % config { nodeQuerySelector: nodeQuerySelector }, - name: 'OOMkill', - iconColor: 'light-purple', - tagKeys: config.instanceLabels, - textFormat: '', - titleFormat: 'OOMkill', - }, - local newKernelAnnotation = { - datasource: { - type: 'prometheus', - uid: '$datasource', - }, - enable: true, - hide: true, - expr: ||| - changes( - sum by (%(instanceLabels)s) ( - group by (%(instanceLabels)s,release) (node_uname_info{%(nodeQuerySelector)s}) - ) - [$__interval:1m] offset -$__interval) > 1 - ||| % config { nodeQuerySelector: nodeQuerySelector }, - name: 'Kernel update', - iconColor: 'light-blue', - tagKeys: config.instanceLabels, - textFormat: '', - titleFormat: 'Kernel update', - step: '5m', // must be larger than possible scrape periods - }, - // return common annotations - annotations: [rebootAnnotation, memoryOOMkillerAnnotation, newKernelAnnotation], - - // return common prometheus target (with project defaults) - commonPromTarget( - expr=null, - intervalFactor=1, - datasource='$datasource', - legendFormat=null, - format='timeseries', - instant=null, - hide=null, - interval=null, - ):: - prometheus.target( - expr=expr, - intervalFactor=intervalFactor, - datasource=datasource, - legendFormat=legendFormat, - format=format, - instant=instant, - hide=hide, - interval=interval - ), - // link to fleet panel - links:: { - fleetDash:: grafana.link.dashboards( - asDropdown=false, - title='Back to Node Fleet Overview', - tags=[], - includeVars=false, - keepTime=true, - url='d/' + config.grafanaDashboardIDs['nodes-fleet.json'] - ) { type: 'link', icon: 'dashboard' }, - nodeDash:: grafana.link.dashboards( - asDropdown=false, - title='Back to 
Node Overview', - tags=[], - includeVars=true, - keepTime=true, - url='d/' + config.grafanaDashboardIDs['nodes.json'] - ) { type: 'link', icon: 'dashboard' }, - otherDashes:: grafana.link.dashboards( - asDropdown=true, - title='Other Node Dashboards', - includeVars=true, - keepTime=true, - tags=(config.dashboardTags), - ), - // used in fleet table - instanceDataLinkForTable:: { - title: 'Drill down to instance ${__data.fields.%s}' % std.split(config.instanceLabels, ',')[0], - url: 'd/' + config.grafanaDashboardIDs['nodes.json'] + '?' + labelsToURLvars(std.split(config.instanceLabels, ','), prefix='__data.fields.') + '&${__url_time_range}&var-datasource=${datasource}', - }, - // used in ts panels - instanceDataLink:: { - title: 'Drill down to instance ${__field.labels.%s}' % std.split(config.instanceLabels, ',')[0], - url: 'd/' + config.grafanaDashboardIDs['nodes.json'] + '?' + labelsToURLvars(std.split(config.instanceLabels, ','), prefix='__field.labels.') + '&${__url_time_range}&var-datasource=${datasource}', - }, - }, - // return common queries that could be used in multiple dashboards - queries:: { - systemLoad1:: 'avg by (%(instanceLabels)s) (node_load1{%(nodeQuerySelector)s})' % config { nodeQuerySelector: nodeQuerySelector }, - systemLoad5:: 'avg by (%(instanceLabels)s) (node_load5{%(nodeQuerySelector)s})' % config { nodeQuerySelector: nodeQuerySelector }, - systemLoad15:: 'avg by (%(instanceLabels)s) (node_load15{%(nodeQuerySelector)s})' % config { nodeQuerySelector: nodeQuerySelector }, - uptime:: 'time() - node_boot_time_seconds{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - cpuCount:: 'count by (%(instanceLabels)s) (node_cpu_seconds_total{%(nodeQuerySelector)s, mode="idle"})' % config { nodeQuerySelector: nodeQuerySelector }, - cpuUsage:: - ||| - (((count by (%(instanceLabels)s) (count(node_cpu_seconds_total{%(nodeQuerySelector)s}) by (cpu, %(instanceLabels)s))) - - - avg by (%(instanceLabels)s) (sum by (%(instanceLabels)s, 
mode)(irate(node_cpu_seconds_total{mode='idle',%(nodeQuerySelector)s}[$__rate_interval])))) * 100) - / - count by(%(instanceLabels)s) (count(node_cpu_seconds_total{%(nodeQuerySelector)s}) by (cpu, %(instanceLabels)s)) - ||| % config { nodeQuerySelector: nodeQuerySelector }, - cpuUsageModes:: - ||| - sum by(%(instanceLabels)s, mode) (irate(node_cpu_seconds_total{%(nodeQuerySelector)s}[$__rate_interval])) - / on(%(instanceLabels)s) - group_left sum by (%(instanceLabels)s)((irate(node_cpu_seconds_total{%(nodeQuerySelector)s}[$__rate_interval]))) * 100 - ||| % config { nodeQuerySelector: nodeQuerySelector }, - cpuUsagePerCore:: - ||| - ( - (1 - sum without (mode) (rate(node_cpu_seconds_total{%(nodeQuerySelector)s, mode=~"idle|iowait|steal"}[$__rate_interval]))) - / ignoring(cpu) group_left - count without (cpu, mode) (node_cpu_seconds_total{%(nodeQuerySelector)s, mode="idle"}) - ) * 100 - ||| % config { nodeQuerySelector: nodeQuerySelector }, - memoryTotal:: 'node_memory_MemTotal_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - memorySwapTotal:: 'node_memory_SwapTotal_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - memoryUsage:: - ||| - 100 - - ( - avg by (%(instanceLabels)s) (node_memory_MemAvailable_bytes{%(nodeQuerySelector)s}) / - avg by (%(instanceLabels)s) (node_memory_MemTotal_bytes{%(nodeQuerySelector)s}) - * 100 - ) - ||| % config { nodeQuerySelector: nodeQuerySelector }, - - process_max_fds:: 'process_max_fds{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - process_open_fds:: 'process_open_fds{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - - fsSizeTotalRoot:: 'node_filesystem_size_bytes{%(nodeQuerySelector)s, mountpoint="/",fstype!="rootfs"}' % config { nodeQuerySelector: nodeQuerySelector }, - osInfo:: 'node_os_info{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - nodeInfo:: 
'node_uname_info{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - node_disk_reads_completed_total:: 'irate(node_disk_reads_completed_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - node_disk_writes_completed_total:: 'irate(node_disk_writes_completed_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - diskReadTime:: 'rate(node_disk_read_bytes_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - diskWriteTime:: 'rate(node_disk_written_bytes_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - diskIoTime:: 'rate(node_disk_io_time_seconds_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - diskWaitReadTime:: - ||| - irate(node_disk_read_time_seconds_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval]) - / - irate(node_disk_reads_completed_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval]) - ||| % config { nodeQuerySelector: nodeQuerySelector }, - diskWaitWriteTime:: - ||| - irate(node_disk_write_time_seconds_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval]) - / - irate(node_disk_writes_completed_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval]) - ||| % config { nodeQuerySelector: nodeQuerySelector }, - diskAvgQueueSize:: 'irate(node_disk_io_time_weighted_seconds_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - diskSpaceUsage:: - ||| - sort_desc(1 - - ( - max by (job, %(instanceLabels)s, fstype, device, mountpoint) (node_filesystem_avail_bytes{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}) - / - max 
by (job, %(instanceLabels)s, fstype, device, mountpoint) (node_filesystem_size_bytes{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}) - ) != 0 - ) - ||| % config { nodeQuerySelector: nodeQuerySelector }, - node_filesystem_avail_bytes:: 'node_filesystem_avail_bytes{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - node_filesystem_files_free:: 'node_filesystem_files_free{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - node_filesystem_files:: 'node_filesystem_files{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - node_filesystem_readonly:: 'node_filesystem_readonly{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - node_filesystem_device_error:: 'node_filesystem_device_error{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - networkReceiveBitsPerSec:: 'irate(node_network_receive_bytes_total{%(nodeQuerySelector)s}[$__rate_interval])*8' % config { nodeQuerySelector: nodeQuerySelector }, - networkTransmitBitsPerSec:: 'irate(node_network_transmit_bytes_total{%(nodeQuerySelector)s}[$__rate_interval])*8' % config { nodeQuerySelector: nodeQuerySelector }, - networkReceiveErrorsPerSec:: 'irate(node_network_receive_errs_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - networkTransmitErrorsPerSec:: 'irate(node_network_transmit_errs_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - networkReceiveDropsPerSec:: 'irate(node_network_receive_drop_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - networkTransmitDropsPerSec:: 
'irate(node_network_transmit_drop_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - - systemContextSwitches:: 'irate(node_context_switches_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - systemInterrupts:: 'irate(node_intr_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, - - //time - node_timex_estimated_error_seconds:: 'node_timex_estimated_error_seconds{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - node_timex_offset_seconds:: 'node_timex_offset_seconds{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - node_timex_maxerror_seconds:: 'node_timex_maxerror_seconds{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - - node_timex_sync_status:: 'node_timex_sync_status{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - node_time_zone_offset_seconds:: 'node_time_zone_offset_seconds{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - node_systemd_units:: 'node_systemd_units{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, - - - }, - // share across dashboards - panelsWithTargets:: { - // cpu - idleCPU:: - nodePanels.timeseries.new( - 'CPU Usage', - description='Total CPU utilisation percent.' - ) - .withUnits('percent') - .withStacking('normal') - .withMin(0) - .withMax(100) - .addTarget(c.commonPromTarget( - expr=c.queries.cpuUsagePerCore, - legendFormat='cpu {{cpu}}', - )), - - systemLoad:: - nodePanels.timeseries.new( - 'Load Average', - description='System load average over the previous 1, 5, and 15 minute ranges. A measurement of how many processes are waiting for CPU cycles. 
The maximum number is the number of CPU cores for the node.', - ) - .withUnits('short') - .withMin(0) - .withFillOpacity(0) - .addTarget(c.commonPromTarget(c.queries.systemLoad1, legendFormat='1m load average')) - .addTarget(c.commonPromTarget(c.queries.systemLoad5, legendFormat='5m load average')) - .addTarget(c.commonPromTarget(c.queries.systemLoad15, legendFormat='15m load average')) - .addTarget(c.commonPromTarget(c.queries.cpuCount, legendFormat='logical cores')) - .addOverride( - matcher={ - id: 'byName', - options: 'logical cores', - }, - properties=[ - { - id: 'custom.lineStyle', - value: { - fill: 'dash', - dash: [ - 10, - 10, - ], - }, - }, - ] - ), - cpuStatPanel:: - commonPanels.percentUsageStat.new( - 'CPU Usage', - description='Total CPU utilisation percent.' - ) - .addTarget(c.commonPromTarget( - expr=c.queries.cpuUsage - )), - systemContextSwitches:: - nodePanels.timeseries.new( - 'Context Switches / Interrupts', - description=||| - Context switches occur when the operating system switches from running one process to another. - Interrupts are signals sent to the CPU by external devices to request its attention. - - A high number of context switches or interrupts can indicate that the system is overloaded or that there are problems with specific devices or processes. - ||| - ) - .addTarget(c.commonPromTarget(c.queries.systemContextSwitches, legendFormat='Context Switches')) - .addTarget(c.commonPromTarget(c.queries.systemInterrupts, legendFormat='Interrupts')), - - diskSpaceUsage:: - nodePanels.table.new( - title='Disk Space Usage', - description='Disk utilisation in percent, by mountpoint. 
Some duplication can occur if the same filesystem is mounted in multiple locations.', - ) - .setFieldConfig(unit='decbytes') - //.addThresholdStep(color='light-green', value=null) - .addThresholdStep(color='light-blue', value=null) - .addThresholdStep(color='light-yellow', value=0.8) - .addThresholdStep(color='light-red', value=0.9) - .addTarget(c.commonPromTarget( - ||| - max by (mountpoint) (node_filesystem_size_bytes{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}) - ||| % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='', - instant=true, - format='table' - )) - .addTarget(c.commonPromTarget( - ||| - max by (mountpoint) (node_filesystem_avail_bytes{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}) - ||| % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='', - instant=true, - format='table', - )) - .addOverride( - matcher={ - id: 'byName', - options: 'Mounted on', - }, - properties=[ - { - id: 'custom.width', - value: 260, - }, - ], - ) - .addOverride( - matcher={ - id: 'byName', - options: 'Size', - }, - properties=[ - - { - id: 'custom.width', - value: 93, - }, - - ], - ) - .addOverride( - matcher={ - id: 'byName', - options: 'Used', - }, - properties=[ - { - id: 'custom.width', - value: 72, - }, - ], - ) - .addOverride( - matcher={ - id: 'byName', - options: 'Available', - }, - properties=[ - { - id: 'custom.width', - value: 88, - }, - ], - ) - - .addOverride( - matcher={ - id: 'byName', - options: 'Used, %', - }, - properties=[ - { - id: 'unit', - value: 'percentunit', - }, - { - id: 'custom.displayMode', - value: 'basic', - }, - { - id: 'max', - value: 1, - }, - { - id: 'min', - value: 0, - }, - ] - ) - .sortBy('Mounted on') - + { - transformations+: [ - { - id: 'groupBy', - options: { - fields: { - 'Value #A': { - aggregations: [ - 'lastNotNull', - ], - operation: 'aggregate', - }, - 'Value #B': { - aggregations: [ - 'lastNotNull', - ], - operation: 'aggregate', - }, - mountpoint: { - 
aggregations: [], - operation: 'groupby', - }, - }, - }, - }, - { - id: 'merge', - options: {}, - }, - { - id: 'calculateField', - options: { - alias: 'Used', - binary: { - left: 'Value #A (lastNotNull)', - operator: '-', - reducer: 'sum', - right: 'Value #B (lastNotNull)', - }, - mode: 'binary', - reduce: { - reducer: 'sum', - }, - }, - }, - { - id: 'calculateField', - options: { - alias: 'Used, %', - binary: { - left: 'Used', - operator: '/', - reducer: 'sum', - right: 'Value #A (lastNotNull)', - }, - mode: 'binary', - reduce: { - reducer: 'sum', - }, - }, - }, - { - id: 'organize', - options: { - excludeByName: {}, - indexByName: {}, - renameByName: { - 'Value #A (lastNotNull)': 'Size', - 'Value #B (lastNotNull)': 'Available', - mountpoint: 'Mounted on', - }, - }, - }, - ], - }, - memoryGraphPanelPrototype:: - nodePanels.timeseries.new( - 'Memory Usage', - description='Memory usage by category, measured in bytes.', - ) - .withMin(0) - .withUnits('bytes'), - memoryGraph:: - if platform == 'Linux' then - self.memoryGraphPanelPrototype - { - description: ||| - Used: The amount of physical memory currently in use by the system. - Cached: The amount of physical memory used for caching data from disk. The Linux kernel uses available memory to cache data that is read from or written to disk. This helps speed up disk access times. - Free: The amount of physical memory that is currently not in use. - Buffers: The amount of physical memory used for temporary storage of data being transferred between devices or applications. - Available: The amount of physical memory that is available for use by applications. This takes into account memory that is currently being used for caching but can be freed up if needed. 
- |||, - } - { stack: true } - .addTarget(c.commonPromTarget( - ||| - ( - node_memory_MemTotal_bytes{%(nodeQuerySelector)s} - - - node_memory_MemFree_bytes{%(nodeQuerySelector)s} - - - node_memory_Buffers_bytes{%(nodeQuerySelector)s} - - - node_memory_Cached_bytes{%(nodeQuerySelector)s} - ) - ||| % config { nodeQuerySelector: c.nodeQuerySelector }, - legendFormat='Memory used' - )) - .addTarget(c.commonPromTarget('node_memory_Buffers_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Memory buffers')) - .addTarget(c.commonPromTarget('node_memory_Cached_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Memory cached')) - .addTarget(c.commonPromTarget('node_memory_MemFree_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Memory free')) - .addTarget(c.commonPromTarget('node_memory_MemAvailable_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Memory available')) - .addTarget(c.commonPromTarget('node_memory_MemTotal_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Memory total')) - else if platform == 'Darwin' then - // not useful to stack - self.memoryGraphPanelPrototype { stack: false } - .addTarget(c.commonPromTarget('node_memory_total_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Physical Memory')) - .addTarget(c.commonPromTarget( - ||| - ( - node_memory_internal_bytes{%(nodeQuerySelector)s} - - node_memory_purgeable_bytes{%(nodeQuerySelector)s} + - node_memory_wired_bytes{%(nodeQuerySelector)s} + - node_memory_compressed_bytes{%(nodeQuerySelector)s} - ) - ||| % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Memory Used' - )) - .addTarget(c.commonPromTarget( - ||| - ( - node_memory_internal_bytes{%(nodeQuerySelector)s} - - 
node_memory_purgeable_bytes{%(nodeQuerySelector)s} - ) - ||| % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='App Memory' - )) - .addTarget(c.commonPromTarget('node_memory_wired_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Wired Memory')) - .addTarget(c.commonPromTarget('node_memory_compressed_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Compressed')), - - // NOTE: avg() is used to circumvent a label change caused by a node_exporter rollout. - memoryGaugePanelPrototype:: - commonPanels.percentUsageStat.new( - 'Memory Usage', - description='Total memory utilisation.', - ), - - memoryGauge:: - if platform == 'Linux' then - self.memoryGaugePanelPrototype - - .addTarget(c.commonPromTarget(c.queries.memoryUsage)) - - else if platform == 'Darwin' then - self.memoryGaugePanelPrototype - .addTarget(c.commonPromTarget( - ||| - ( - ( - avg(node_memory_internal_bytes{%(nodeQuerySelector)s}) - - avg(node_memory_purgeable_bytes{%(nodeQuerySelector)s}) + - avg(node_memory_wired_bytes{%(nodeQuerySelector)s}) + - avg(node_memory_compressed_bytes{%(nodeQuerySelector)s}) - ) / - avg(node_memory_total_bytes{%(nodeQuerySelector)s}) - ) - * - 100 - ||| % config { nodeQuerySelector: c.nodeQuerySelector } - )), - diskIO:: - nodePanels.timeseries.new( - 'Disk I/O', - description='Disk read/writes in bytes, and total IO seconds.' 
- ) - .withFillOpacity(0) - .withMin(0) - .addTarget(c.commonPromTarget( - c.queries.diskReadTime, - legendFormat='{{device}} read', - )) - .addTarget(c.commonPromTarget( - c.queries.diskWriteTime, - legendFormat='{{device}} written', - )) - .addTarget(c.commonPromTarget( - c.queries.diskIoTime, - legendFormat='{{device}} io time', - )) - .addOverride( - matcher={ - id: 'byRegexp', - options: '/ read| written/', - }, - properties=[ - { - id: 'unit', - value: 'bps', - }, - ] - ) - .addOverride( - matcher={ - id: 'byRegexp', - options: '/ io time/', - }, - properties=[ - { - id: 'unit', - value: 'percentunit', - }, - { - id: 'custom.axisSoftMax', - value: 1, - }, - { - id: 'custom.drawStyle', - value: 'points', - }, - ] - ), - }, - }, - -} diff --git a/docs/node-observ-lib/g.libsonnet b/docs/node-mixin/lib/g.libsonnet similarity index 100% rename from docs/node-observ-lib/g.libsonnet rename to docs/node-mixin/lib/g.libsonnet diff --git a/docs/node-observ-lib/linux/README.md b/docs/node-mixin/lib/linux/README.md similarity index 91% rename from docs/node-observ-lib/linux/README.md rename to docs/node-mixin/lib/linux/README.md index 3582c0cbb9..07e0c4628c 100644 --- a/docs/node-observ-lib/linux/README.md +++ b/docs/node-mixin/lib/linux/README.md @@ -6,7 +6,7 @@ This jsonnet observability lib can be used to generate observability package for ```sh jb init -jb install https://github.com/grafana/node_exporter/docs/node-observ-lib +jb install https://github.com/prometheus/node_exporter/docs/node-mixin/lib/linux ``` ## Examples @@ -17,7 +17,7 @@ You can use observ-lib to fill in monitoring-mixin structure: ```jsonnet // mixin.libsonnet file -local nodelib = import 'node-observ-lib/linux/main.libsonnet'; +local nodelib = import 'linux/main.libsonnet'; local linux = nodelib.new() @@ -45,7 +45,7 @@ local linux = ```jsonnet // mixin.libsonnet file -local nodelib = import 'node-observ-lib/linux/main.libsonnet'; +local nodelib = import 'linux/main.libsonnet'; local linux = 
nodelib.new() @@ -82,7 +82,7 @@ local linux = // mixin.libsonnet file local configOverride = import './overrides.libsonnet'; -local nodelib = import 'node-observ-lib/linux/main.libsonnet'; +local nodelib = import 'linux/main.libsonnet'; local linux = nodelib.new() @@ -101,7 +101,7 @@ local linux = ```jsonnet local g = import './g.libsonnet'; // mixin.libsonnet file -local nodelib = import 'node-observ-lib/linux/main.libsonnet'; +local nodelib = import 'linux/main.libsonnet'; local linux = nodelib.new() diff --git a/docs/node-observ-lib/linux/alerts.libsonnet b/docs/node-mixin/lib/linux/alerts/alerts.libsonnet similarity index 99% rename from docs/node-observ-lib/linux/alerts.libsonnet rename to docs/node-mixin/lib/linux/alerts/alerts.libsonnet index 8cc89d8fdf..8a307dff9f 100644 --- a/docs/node-observ-lib/linux/alerts.libsonnet +++ b/docs/node-mixin/lib/linux/alerts/alerts.libsonnet @@ -10,7 +10,7 @@ ( node_filesystem_avail_bytes{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_size_bytes{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < %(fsSpaceFillingUpWarningThreshold)d and - predict_linear(node_filesystem_avail_bytes{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], 24*60*60) < 0 + predict_linear(node_filesystem_avail_bytes{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[%(fsSpaceFillingUpPredictionWindow)s], 24*60*60) < 0 and node_filesystem_readonly{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 ) diff --git a/docs/node-observ-lib/linux/annotations.libsonnet b/docs/node-mixin/lib/linux/annotations.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/annotations.libsonnet rename to docs/node-mixin/lib/linux/annotations.libsonnet diff --git a/docs/node-observ-lib/linux/config.libsonnet b/docs/node-mixin/lib/linux/config.libsonnet similarity index 89% rename from docs/node-observ-lib/linux/config.libsonnet rename to 
docs/node-mixin/lib/linux/config.libsonnet index 77a1bd10c5..5ce5548269 100644 --- a/docs/node-observ-lib/linux/config.libsonnet +++ b/docs/node-mixin/lib/linux/config.libsonnet @@ -6,14 +6,12 @@ // 'groupLabels' - one or more labels that can be used to identify 'group' of instances. In simple cases, can be 'job' or 'cluster'. // 'instanceLabels' - one or more labels that can be used to identify single entity of instances. In simple cases, can be 'instance' or 'pod'. // 'uid' - UID to prefix all dashboards original uids - filteringSelector: std.get(self, 'nodeExporterSelector', default='job="node"'), groupLabels: ['job'], instanceLabels: ['instance'], dashboardNamePrefix: 'Node exporter / ', uid: 'node', - - dashboardTags: [self.uid], + dashboardTags: ['node-exporter-mixin'], // Select the fstype for filesystem-related queries. If left // empty, all filesystems are selected. If you have unusual @@ -59,13 +57,19 @@ // 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk // usage grows in a way that it is predicted to run out in 4h or 1d // and if the provided thresholds have been reached right now. - // In some cases you'll want to adjust these, e.g. by default Kubernetes + // In some cases you'll want to adjust these, e.g., by default, Kubernetes // runs the image garbage collection when the disk usage reaches 85% // of its available space. In that case, you'll want to reduce the // critical threshold below to something like 14 or 15, otherwise // the alert could fire under normal node usage. + // Additionally, the prediction window for the alert can be configured + // to account for environments where disk usage can fluctuate within + // a short time frame. By extending the prediction window, you can + // reduce false positives caused by temporary spikes, providing a + // more accurate prediction of disk space issues. 
fsSpaceFillingUpWarningThreshold: 40, fsSpaceFillingUpCriticalThreshold: 20, + fsSpaceFillingUpPredictionWindow: '6h', // Available disk space (%) thresholds on which to trigger the // 'NodeFilesystemAlmostOutOfSpace' alerts. @@ -91,7 +95,7 @@ rateInterval: '5m', - dashboardPeriod: 'now-1h', + dashboardInterval: 'now-1h', dashboardTimezone: 'default', dashboardRefresh: '1m', @@ -100,7 +104,7 @@ //used in USE dashboards only. For others, add cluster label to groupLabels var. clusterLabel: 'cluster', - // logs lib related + // Loki logs related enableLokiLogs: false, extraLogLabels: ['transport', 'unit', 'level'], logsVolumeGroupBy: 'level', @@ -111,4 +115,5 @@ | label_format timestamp="{{__timestamp__}}" | line_format `{{ if eq "[[instance]]" ".*" }}{{alignLeft 25 .instance}}|{{alignLeft 25 .unit}}|{{else}}{{alignLeft 25 .unit}}|{{end}} {{__line__}}` |||, + } diff --git a/docs/node-observ-lib/linux/dashboards.libsonnet b/docs/node-mixin/lib/linux/dashboards.libsonnet similarity index 96% rename from docs/node-observ-lib/linux/dashboards.libsonnet rename to docs/node-mixin/lib/linux/dashboards.libsonnet index 5b442cd969..18f4810ad7 100644 --- a/docs/node-observ-lib/linux/dashboards.libsonnet +++ b/docs/node-mixin/lib/linux/dashboards.libsonnet @@ -10,7 +10,7 @@ local logslib = import 'github.com/grafana/jsonnet-libs/logs-lib/logs/main.libso local vars = this.grafana.variables.main; local annotations = this.grafana.annotations; local refresh = this.config.dashboardRefresh; - local period = this.config.dashboardPeriod; + local period = this.config.dashboardInterval; local timezone = this.config.dashboardTimezone; local panels = this.grafana.panels; local rows = this.grafana.rows; @@ -170,8 +170,8 @@ local logslib = import 'github.com/grafana/jsonnet-libs/logs-lib/logs/main.libso 'logs.json': logslib.new( prefix + 'logs', - datasourceName=this.grafana.variables.datasources.loki.name, - datasourceRegex=this.grafana.variables.datasources.loki.regex, +
datasourceName=vars.datasources.loki.name, + datasourceRegex=vars.datasources.loki.regex, filterSelector=this.config.logsFilteringSelector, labels=this.config.groupLabels + this.config.instanceLabels + this.config.extraLogLabels, formatParser=null, @@ -197,7 +197,7 @@ local logslib = import 'github.com/grafana/jsonnet-libs/logs-lib/logs/main.libso variables+: { // add prometheus datasource for annotations processing toArray+: [ - this.grafana.variables.datasources.prometheus { hide: 2 }, + vars.datasources.prometheus { hide: 2 }, ], }, }.dashboards.logs, diff --git a/docs/node-observ-lib/linux/links.libsonnet b/docs/node-mixin/lib/linux/links.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/links.libsonnet rename to docs/node-mixin/lib/linux/links.libsonnet diff --git a/docs/node-observ-lib/linux/main.libsonnet b/docs/node-mixin/lib/linux/main.libsonnet similarity index 55% rename from docs/node-observ-lib/linux/main.libsonnet rename to docs/node-mixin/lib/linux/main.libsonnet index d8bf92b532..f6db00f81c 100644 --- a/docs/node-observ-lib/linux/main.libsonnet +++ b/docs/node-mixin/lib/linux/main.libsonnet @@ -1,4 +1,4 @@ -local alerts = import './alerts.libsonnet'; +local alerts = import './alerts/alerts.libsonnet'; local annotations = import './annotations.libsonnet'; local config = import './config.libsonnet'; local dashboards = import './dashboards.libsonnet'; @@ -7,16 +7,12 @@ local g = import './g.libsonnet'; local links = import './links.libsonnet'; local panels = import './panels/main.libsonnet'; local rows = import './rows/main.libsonnet'; -local rules = import './rules.libsonnet'; +local rules = import './rules/rules.libsonnet'; local targets = import './targets/main.libsonnet'; local variables = import './variables.libsonnet'; local commonlib = import 'common-lib/common/main.libsonnet'; { - withConfigMixin(config): { - config+: config, - }, - new(): { local this = self, @@ -37,5 +33,26 @@ local commonlib = import 
'common-lib/common/main.libsonnet'; recordingRules: rules.new(this), }, + + }, + withConfigMixin(config): { + // Merges `config` into the lib config. Backward compatible: instanceLabels and groupLabels may each be + // either an array or a comma-separated string; a string is split on ',' (std.split(str, c)) into an array. + local _patch = + ( + if std.objectHasAll(config, 'instanceLabels') + then + { instanceLabels: if std.isString(config.instanceLabels) then std.split(config.instanceLabels, ',') else config.instanceLabels } + else {} + ) + + ( + if std.objectHasAll(config, 'groupLabels') + then + { + groupLabels: if std.isString(config.groupLabels) then std.split(config.groupLabels, ',') else config.groupLabels, + } + else {} + ), + config+: config + _patch, }, } diff --git a/docs/node-observ-lib/linux/panels/cpu.libsonnet b/docs/node-mixin/lib/linux/panels/cpu.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/panels/cpu.libsonnet rename to docs/node-mixin/lib/linux/panels/cpu.libsonnet diff --git a/docs/node-observ-lib/linux/panels/disk.libsonnet b/docs/node-mixin/lib/linux/panels/disk.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/panels/disk.libsonnet rename to docs/node-mixin/lib/linux/panels/disk.libsonnet diff --git a/docs/node-observ-lib/linux/panels/fleet.libsonnet b/docs/node-mixin/lib/linux/panels/fleet.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/panels/fleet.libsonnet rename to docs/node-mixin/lib/linux/panels/fleet.libsonnet diff --git a/docs/node-observ-lib/linux/panels/hardware.libsonnet b/docs/node-mixin/lib/linux/panels/hardware.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/panels/hardware.libsonnet rename to docs/node-mixin/lib/linux/panels/hardware.libsonnet diff --git a/docs/node-observ-lib/linux/panels/main.libsonnet b/docs/node-mixin/lib/linux/panels/main.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/panels/main.libsonnet
rename to docs/node-mixin/lib/linux/panels/main.libsonnet diff --git a/docs/node-observ-lib/linux/panels/memory.libsonnet b/docs/node-mixin/lib/linux/panels/memory.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/panels/memory.libsonnet rename to docs/node-mixin/lib/linux/panels/memory.libsonnet diff --git a/docs/node-observ-lib/linux/panels/network.libsonnet b/docs/node-mixin/lib/linux/panels/network.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/panels/network.libsonnet rename to docs/node-mixin/lib/linux/panels/network.libsonnet diff --git a/docs/node-observ-lib/linux/panels/system.libsonnet b/docs/node-mixin/lib/linux/panels/system.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/panels/system.libsonnet rename to docs/node-mixin/lib/linux/panels/system.libsonnet diff --git a/docs/node-observ-lib/linux/panels/use.libsonnet b/docs/node-mixin/lib/linux/panels/use.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/panels/use.libsonnet rename to docs/node-mixin/lib/linux/panels/use.libsonnet diff --git a/docs/node-observ-lib/linux/panels/useCluster.libsonnet b/docs/node-mixin/lib/linux/panels/useCluster.libsonnet similarity index 84% rename from docs/node-observ-lib/linux/panels/useCluster.libsonnet rename to docs/node-mixin/lib/linux/panels/useCluster.libsonnet index cb06b8f18b..9ae1a79e75 100644 --- a/docs/node-observ-lib/linux/panels/useCluster.libsonnet +++ b/docs/node-mixin/lib/linux/panels/useCluster.libsonnet @@ -10,10 +10,12 @@ local utils = commonlib.utils; local instanceLabel = this.config.instanceLabels[0], local instancePanels = this.grafana.panels.use, //for USE - cpuUtilization: instancePanels.cpuUtilization - + g.panel.timeSeries.queryOptions.withTargets([t.useCluster.cpuUtilization]), - cpuSaturation: instancePanels.cpuSaturation - + g.panel.timeSeries.queryOptions.withTargets([t.useCluster.cpuSaturation]), + cpuUtilization: + instancePanels.cpuUtilization + + 
g.panel.timeSeries.queryOptions.withTargets([t.useCluster.cpuUtilization]), + cpuSaturation: + instancePanels.cpuSaturation + + g.panel.timeSeries.queryOptions.withTargets([t.useCluster.cpuSaturation]), memoryUtilization: instancePanels.memoryUtilization diff --git a/docs/node-observ-lib/linux/panels/useClusterMulti.libsonnet b/docs/node-mixin/lib/linux/panels/useClusterMulti.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/panels/useClusterMulti.libsonnet rename to docs/node-mixin/lib/linux/panels/useClusterMulti.libsonnet diff --git a/docs/node-observ-lib/linux/rows/linux.libsonnet b/docs/node-mixin/lib/linux/rows/linux.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/rows/linux.libsonnet rename to docs/node-mixin/lib/linux/rows/linux.libsonnet diff --git a/docs/node-observ-lib/linux/rows/main.libsonnet b/docs/node-mixin/lib/linux/rows/main.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/rows/main.libsonnet rename to docs/node-mixin/lib/linux/rows/main.libsonnet diff --git a/docs/node-observ-lib/linux/rows/use.libsonnet b/docs/node-mixin/lib/linux/rows/use.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/rows/use.libsonnet rename to docs/node-mixin/lib/linux/rows/use.libsonnet diff --git a/docs/node-observ-lib/linux/rules.libsonnet b/docs/node-mixin/lib/linux/rules/rules.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/rules.libsonnet rename to docs/node-mixin/lib/linux/rules/rules.libsonnet diff --git a/docs/node-observ-lib/linux/targets/alerts.libsonnet b/docs/node-mixin/lib/linux/targets/alerts.libsonnet similarity index 96% rename from docs/node-observ-lib/linux/targets/alerts.libsonnet rename to docs/node-mixin/lib/linux/targets/alerts.libsonnet index 77101c7b2d..de4a23d931 100644 --- a/docs/node-observ-lib/linux/targets/alerts.libsonnet +++ b/docs/node-mixin/lib/linux/targets/alerts.libsonnet @@ -1,4 +1,4 @@ -local g = import '../g.libsonnet'; 
+local g = import '../../g.libsonnet'; local prometheusQuery = g.query.prometheus; local lokiQuery = g.query.loki; diff --git a/docs/node-observ-lib/linux/targets/cpu.libsonnet b/docs/node-mixin/lib/linux/targets/cpu.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/targets/cpu.libsonnet rename to docs/node-mixin/lib/linux/targets/cpu.libsonnet diff --git a/docs/node-observ-lib/linux/targets/disk.libsonnet b/docs/node-mixin/lib/linux/targets/disk.libsonnet similarity index 99% rename from docs/node-observ-lib/linux/targets/disk.libsonnet rename to docs/node-mixin/lib/linux/targets/disk.libsonnet index 4e91d619a4..036ecef63e 100644 --- a/docs/node-observ-lib/linux/targets/disk.libsonnet +++ b/docs/node-mixin/lib/linux/targets/disk.libsonnet @@ -1,4 +1,4 @@ -local g = import '../g.libsonnet'; +local g = import '../../g.libsonnet'; local prometheusQuery = g.query.prometheus; local lokiQuery = g.query.loki; diff --git a/docs/node-observ-lib/linux/targets/events.libsonnet b/docs/node-mixin/lib/linux/targets/events.libsonnet similarity index 98% rename from docs/node-observ-lib/linux/targets/events.libsonnet rename to docs/node-mixin/lib/linux/targets/events.libsonnet index 3cf73aa39e..3e95d4a57c 100644 --- a/docs/node-observ-lib/linux/targets/events.libsonnet +++ b/docs/node-mixin/lib/linux/targets/events.libsonnet @@ -1,4 +1,4 @@ -local g = import '../g.libsonnet'; +local g = import '../../g.libsonnet'; local prometheusQuery = g.query.prometheus; local lokiQuery = g.query.loki; diff --git a/docs/node-observ-lib/linux/targets/hardware.libsonnet b/docs/node-mixin/lib/linux/targets/hardware.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/targets/hardware.libsonnet rename to docs/node-mixin/lib/linux/targets/hardware.libsonnet diff --git a/docs/node-observ-lib/linux/targets/main.libsonnet b/docs/node-mixin/lib/linux/targets/main.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/targets/main.libsonnet rename 
to docs/node-mixin/lib/linux/targets/main.libsonnet diff --git a/docs/node-observ-lib/linux/targets/memory.libsonnet b/docs/node-mixin/lib/linux/targets/memory.libsonnet similarity index 99% rename from docs/node-observ-lib/linux/targets/memory.libsonnet rename to docs/node-mixin/lib/linux/targets/memory.libsonnet index b566b73b98..7f06662a23 100644 --- a/docs/node-observ-lib/linux/targets/memory.libsonnet +++ b/docs/node-mixin/lib/linux/targets/memory.libsonnet @@ -1,4 +1,4 @@ -local g = import '../g.libsonnet'; +local g = import '../../g.libsonnet'; local prometheusQuery = g.query.prometheus; local lokiQuery = g.query.loki; diff --git a/docs/node-observ-lib/linux/targets/network.libsonnet b/docs/node-mixin/lib/linux/targets/network.libsonnet similarity index 99% rename from docs/node-observ-lib/linux/targets/network.libsonnet rename to docs/node-mixin/lib/linux/targets/network.libsonnet index c5bdec6059..715bd8552c 100644 --- a/docs/node-observ-lib/linux/targets/network.libsonnet +++ b/docs/node-mixin/lib/linux/targets/network.libsonnet @@ -1,4 +1,4 @@ -local g = import '../g.libsonnet'; +local g = import '../../g.libsonnet'; local prometheusQuery = g.query.prometheus; local lokiQuery = g.query.loki; diff --git a/docs/node-observ-lib/linux/targets/system.libsonnet b/docs/node-mixin/lib/linux/targets/system.libsonnet similarity index 98% rename from docs/node-observ-lib/linux/targets/system.libsonnet rename to docs/node-mixin/lib/linux/targets/system.libsonnet index ac89ebaa31..04f6104857 100644 --- a/docs/node-observ-lib/linux/targets/system.libsonnet +++ b/docs/node-mixin/lib/linux/targets/system.libsonnet @@ -1,4 +1,4 @@ -local g = import '../g.libsonnet'; +local g = import '../../g.libsonnet'; local prometheusQuery = g.query.prometheus; local lokiQuery = g.query.loki; diff --git a/docs/node-observ-lib/linux/targets/use.libsonnet b/docs/node-mixin/lib/linux/targets/use.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/targets/use.libsonnet 
rename to docs/node-mixin/lib/linux/targets/use.libsonnet diff --git a/docs/node-observ-lib/linux/targets/useCluster.libsonnet b/docs/node-mixin/lib/linux/targets/useCluster.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/targets/useCluster.libsonnet rename to docs/node-mixin/lib/linux/targets/useCluster.libsonnet diff --git a/docs/node-observ-lib/linux/targets/useClusterMulti.libsonnet b/docs/node-mixin/lib/linux/targets/useClusterMulti.libsonnet similarity index 100% rename from docs/node-observ-lib/linux/targets/useClusterMulti.libsonnet rename to docs/node-mixin/lib/linux/targets/useClusterMulti.libsonnet diff --git a/docs/node-observ-lib/linux/variables.libsonnet b/docs/node-mixin/lib/linux/variables.libsonnet similarity index 89% rename from docs/node-observ-lib/linux/variables.libsonnet rename to docs/node-mixin/lib/linux/variables.libsonnet index 7eec70a84c..a30cf74f8b 100644 --- a/docs/node-observ-lib/linux/variables.libsonnet +++ b/docs/node-mixin/lib/linux/variables.libsonnet @@ -16,6 +16,7 @@ local xtd = import 'github.com/jsonnet-libs/xtd/main.libsonnet'; instanceLabels=this.config.instanceLabels, varMetric='node_uname_info', customAllValue='.+', + enableLokiLogs=this.config.enableLokiLogs, ), // used in USE cluster dashboard use: @@ -26,6 +27,7 @@ local xtd = import 'github.com/jsonnet-libs/xtd/main.libsonnet'; instanceLabels=this.config.instanceLabels, varMetric='instance:node_cpu_utilisation:rate5m', customAllValue='.+', + enableLokiLogs=this.config.enableLokiLogs, ), useCluster: commonlib.variables.new( @@ -34,6 +36,7 @@ local xtd = import 'github.com/jsonnet-libs/xtd/main.libsonnet'; instanceLabels=[], varMetric='instance:node_cpu_utilisation:rate5m', customAllValue='.+', + enableLokiLogs=this.config.enableLokiLogs, ), }, diff --git a/docs/node-observ-lib/macos/README.md b/docs/node-mixin/lib/macos/README.md similarity index 91% rename from docs/node-observ-lib/macos/README.md rename to docs/node-mixin/lib/macos/README.md 
index 815903ffc1..e0ce700729 100644 --- a/docs/node-observ-lib/macos/README.md +++ b/docs/node-mixin/lib/macos/README.md @@ -6,7 +6,7 @@ This jsonnet observability lib can be used to generate observability package for ```sh jb init -jb install https://github.com/grafana/node_exporter/docs/node-observ-lib +jb install https://github.com/prometheus/node_exporter/docs/node-mixin/lib/macos ``` ## Examples @@ -17,7 +17,7 @@ You can use observ-lib to fill in monitoring-mixin structure: ```jsonnet // mixin.libsonnet file -local macoslib = import 'node-observ-lib/macos/main.libsonnet'; +local macoslib = import 'macos/main.libsonnet'; local mac = macoslib.new() @@ -39,7 +39,7 @@ local mac = } ``` -For more examples see [node-observ-lib/linux](../linux). +For more examples see [node-mixin/lib/linux](../linux). ## Collectors used: diff --git a/docs/node-observ-lib/macos/alerts.libsonnet b/docs/node-mixin/lib/macos/alerts.libsonnet similarity index 100% rename from docs/node-observ-lib/macos/alerts.libsonnet rename to docs/node-mixin/lib/macos/alerts.libsonnet diff --git a/docs/node-mixin/lib/macos/config.libsonnet b/docs/node-mixin/lib/macos/config.libsonnet new file mode 100644 index 0000000000..e931a24f9d --- /dev/null +++ b/docs/node-mixin/lib/macos/config.libsonnet @@ -0,0 +1,28 @@ +{ + // Rest of the config is imported from linux + filteringSelector: 'job="integrations/macos-node"', + dashboardNamePrefix: 'MacOS / ', + uid: 'darwin', + + dashboardTags: [self.uid], + + + // Alerts to keep from the parent linux lib (formerly node-observ-lib): + alertsMacKeep: [ + 'NodeFilesystemAlmostOutOfSpace', + 'NodeNetworkReceiveErrs', + 'NodeNetworkTransmitErrs', + 'NodeTextFileCollectorScrapeError', + 'NodeFilesystemFilesFillingUp', + 'NodeFilesystemAlmostOutOfFiles', + ], + // logs lib related + enableLokiLogs: false, + extraLogLabels: ['filename', 'sender'], + logsVolumeGroupBy: 'sender', + showLogsVolume: true, + logsFilteringSelector: self.filteringSelector, + logsExtraFilters: '', + + +} diff --git
a/docs/node-observ-lib/macos/main.libsonnet b/docs/node-mixin/lib/macos/main.libsonnet similarity index 98% rename from docs/node-observ-lib/macos/main.libsonnet rename to docs/node-mixin/lib/macos/main.libsonnet index 94605e9928..18a418f221 100644 --- a/docs/node-observ-lib/macos/main.libsonnet +++ b/docs/node-mixin/lib/macos/main.libsonnet @@ -45,6 +45,7 @@ nodelib { 'logs.json': parentGrafana.dashboards['logs.json'], } + else {} ), }, prometheus+: { diff --git a/docs/node-observ-lib/macos/panels.libsonnet b/docs/node-mixin/lib/macos/panels.libsonnet similarity index 100% rename from docs/node-observ-lib/macos/panels.libsonnet rename to docs/node-mixin/lib/macos/panels.libsonnet diff --git a/docs/node-observ-lib/macos/targets.libsonnet b/docs/node-mixin/lib/macos/targets.libsonnet similarity index 98% rename from docs/node-observ-lib/macos/targets.libsonnet rename to docs/node-mixin/lib/macos/targets.libsonnet index 0392a4c17f..7433b3974d 100644 --- a/docs/node-observ-lib/macos/targets.libsonnet +++ b/docs/node-mixin/lib/macos/targets.libsonnet @@ -4,7 +4,7 @@ local lokiQuery = g.query.loki; { new(this): { - local variables = this.grafana.variables, + local variables = this.grafana.variables.main, local config = this.config, local prometheusDatasource = '${' + variables.datasources.prometheus.name + '}', local lokiDatasource = '${' + variables.datasources.loki.name + '}', diff --git a/docs/node-mixin/lib/panels/common/info.libsonnet b/docs/node-mixin/lib/panels/common/info.libsonnet deleted file mode 100644 index 3b54d39382..0000000000 --- a/docs/node-mixin/lib/panels/common/info.libsonnet +++ /dev/null @@ -1,30 +0,0 @@ -// Info panel text (number or text) -local statPanel = import '../stat.libsonnet'; -statPanel { - new( - title=null, - description=null, - datasource=null, - ):: - super.new( - title, - description, - datasource, - ) - + self.withColor(color='text') - + self.withTextSize(value=20) - + self.withGraphMode('none') - + - { - options+: { - 
reduceOptions: { - values: false, - calcs: [ - 'lastNotNull', - ], - fields: '', - }, - graphMode: 'none', - }, - }, -} diff --git a/docs/node-mixin/lib/panels/common/networktraffic.libsonnet b/docs/node-mixin/lib/panels/common/networktraffic.libsonnet deleted file mode 100644 index 09f3370f67..0000000000 --- a/docs/node-mixin/lib/panels/common/networktraffic.libsonnet +++ /dev/null @@ -1,18 +0,0 @@ -// Panels to graph network traffic in and out -local timeseries = import '../timeseries.libsonnet'; -timeseries { - new( - title=null, - description=null, - datasource=null, - ):: - super.new( - title, - description, - datasource, - ) - + self.withDecimals(1) - + self.withUnits('bps') - + self.withNegativeYByRegex('transmit|tx|out') - + self.withAxisLabel('out(-) | in(+)'), -} diff --git a/docs/node-mixin/lib/panels/common/panels.libsonnet b/docs/node-mixin/lib/panels/common/panels.libsonnet deleted file mode 100644 index 88fea17a6e..0000000000 --- a/docs/node-mixin/lib/panels/common/panels.libsonnet +++ /dev/null @@ -1,6 +0,0 @@ -{ - uptimeStat:: import 'uptime.libsonnet', - infoStat:: import 'info.libsonnet', - percentUsageStat:: import 'percentusage.libsonnet', - networkTrafficGraph:: import 'networktraffic.libsonnet', -} diff --git a/docs/node-mixin/lib/panels/common/percentusage.libsonnet b/docs/node-mixin/lib/panels/common/percentusage.libsonnet deleted file mode 100644 index 884878f673..0000000000 --- a/docs/node-mixin/lib/panels/common/percentusage.libsonnet +++ /dev/null @@ -1,30 +0,0 @@ -// Panels to display metrics that can go from 0 to 100%. (cpu utilization, memory utilization etc). Full utilization is considered an issue. 
-local statPanel = import '../stat.libsonnet'; -statPanel { - new( - title=null, - description=null, - datasource=null, - ):: - super.new( - title, - description, - datasource, - ) - + self.withDecimals(1) - + self.withUnits('percent') - + self.withMax(100) - + self.withMin(0) - + self.withColor(mode='continuous-BlYlRd') - { - options+: { - reduceOptions: { - values: false, - calcs: [ - 'lastNotNull', - ], - fields: '', - }, - }, - }, -} diff --git a/docs/node-mixin/lib/panels/common/uptime.libsonnet b/docs/node-mixin/lib/panels/common/uptime.libsonnet deleted file mode 100644 index a64a179faa..0000000000 --- a/docs/node-mixin/lib/panels/common/uptime.libsonnet +++ /dev/null @@ -1,43 +0,0 @@ -local statPanel = import '../stat.libsonnet'; -statPanel { - new( - title='Uptime', - description=null, - datasource=null, - ):: - super.new( - title, - description, - datasource, - ) - + self.withDecimals(1) - + self.withGraphMode('none') - + self.withTextSize(value=20) - + self.withUnits('dtdurations') - + self.withThresholds( - mode='absolute', - steps=[ - { - color: 'orange', - value: null, - }, - { - color: 'text', - value: 300, - }, - ] - ) - + self.withColor(mode='thresholds') - + - { - options+: { - reduceOptions: { - values: false, - calcs: [ - 'lastNotNull', - ], - fields: '', - }, - }, - }, -} diff --git a/docs/node-mixin/lib/panels/panel.libsonnet b/docs/node-mixin/lib/panels/panel.libsonnet deleted file mode 100644 index 8ede6ffe87..0000000000 --- a/docs/node-mixin/lib/panels/panel.libsonnet +++ /dev/null @@ -1,129 +0,0 @@ -// generic grafana dashboard -{ - //feed grafonnet panel - new():: {}, - - withUnits(unit):: self { - - fieldConfig+: { - defaults+: { - unit: unit, - }, - }, - }, - - withLegend(show=true, mode='table', placement='bottom', calcs=['min', 'mean', 'max', 'lastNotNull']):: self { - options+: { - legend: { - showLegend: show, - displayMode: mode, - placement: placement, - calcs: calcs, - }, - }, - }, - withDecimals(decimals):: self { - - 
fieldConfig+: { - defaults+: { - decimals: decimals, - }, - }, - }, - - withThresholds(mode='absolute', steps=null):: self { - - fieldConfig+: { - defaults+: { - thresholds: { - mode: mode, - steps: steps, - }, - }, - }, - }, - withMin(value):: self { - fieldConfig+: { - defaults+: { - min: value, - }, - }, - }, - withMax(value):: self { - fieldConfig+: { - defaults+: { - max: value, - }, - }, - }, - withColor(color=null, mode='fixed'):: self { - fieldConfig+: { - defaults+: { - color: { - mode: mode, - fixedColor: if mode == 'fixed' then color else null, - }, - }, - }, - }, - withMaxDataPoints(value):: self { - maxDataPoints: value, - }, - withTransform():: self { - - merge():: self - { - transformations+: [ - { - id: 'merge', - options: {}, - }, - ], - }, - filterFieldsByName(pattern=null):: self - { - transformations+: [ - { - id: 'filterFieldsByName', - options: { - include: { - pattern: pattern, - }, - }, - }, - ], - }, - joinByField( - mode='outer', - field=null - ):: self { - transformations+: [ - { - id: 'joinByField', - options: { - byField: field, - mode: mode, - }, - }, - ], - }, - organize( - excludeByName={}, - indexByName={}, - renameByName={}, - - ):: self - { - transformations+: [ - { - id: 'organize', - options: { - excludeByName: excludeByName, - indexByName: indexByName, - renameByName: renameByName, - }, - }, - ], - }, - }, -} diff --git a/docs/node-mixin/lib/panels/panels.libsonnet b/docs/node-mixin/lib/panels/panels.libsonnet deleted file mode 100644 index 19c9a1d896..0000000000 --- a/docs/node-mixin/lib/panels/panels.libsonnet +++ /dev/null @@ -1,5 +0,0 @@ -{ - timeseries:: import 'timeseries.libsonnet', - stat:: import 'stat.libsonnet', - table:: import 'table.libsonnet', -} diff --git a/docs/node-mixin/lib/panels/stat.libsonnet b/docs/node-mixin/lib/panels/stat.libsonnet deleted file mode 100644 index e3fa4172f3..0000000000 --- a/docs/node-mixin/lib/panels/stat.libsonnet +++ /dev/null @@ -1,28 +0,0 @@ -local grafana = import 
'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local genericPanel = import 'panel.libsonnet'; -genericPanel { - new( - title=null, - description=null, - datasource=null, - ):: self + - grafana.statPanel.new( - title=title, - description=description, - datasource=datasource, - ), - withGraphMode(mode='none'):: self { - options+: - { - graphMode: mode, - }, - }, - withTextSize(value='auto', title='auto'):: self { - options+: - { text: { - valueSize: value, - titleSize: title, - } }, - }, - -} diff --git a/docs/node-mixin/lib/panels/table.libsonnet b/docs/node-mixin/lib/panels/table.libsonnet deleted file mode 100644 index 4a9c36cc66..0000000000 --- a/docs/node-mixin/lib/panels/table.libsonnet +++ /dev/null @@ -1,37 +0,0 @@ -local grafana70 = import 'github.com/grafana/grafonnet-lib/grafonnet-7.0/grafana.libsonnet'; -local genericPanel = import 'panel.libsonnet'; -local table = grafana70.panel.table; -genericPanel -{ - new( - title=null, - description=null, - datasource=null, - ):: self + - table.new( - title=title, - description=description, - datasource=datasource, - ), - sortBy(field, desc=false):: self { - options+: { - sortBy: [ - { - displayName: field, - desc: desc, - }, - ], - }, - }, - withFooter(reducer=['mean'], fields=[]):: self { - - options+: { - footer: { - show: true, - reducer: reducer, - fields: fields, - }, - }, - }, - -} diff --git a/docs/node-mixin/lib/panels/timeseries.libsonnet b/docs/node-mixin/lib/panels/timeseries.libsonnet deleted file mode 100644 index 816ec49ad0..0000000000 --- a/docs/node-mixin/lib/panels/timeseries.libsonnet +++ /dev/null @@ -1,145 +0,0 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local genericPanel = import 'panel.libsonnet'; -genericPanel -{ - new( - title=null, - description=null, - datasource=null, - ):: self + - grafana.graphPanel.new( - title=title, - description=description, - datasource=datasource, - ) - + - { - type: 'timeseries', - } - + 
self.withFillOpacity(10) - + self.withGradientMode('opacity') - + self.withLineInterpolation('smooth') - + self.withShowPoints('never') - + self.withTooltip(mode='multi', sort='desc') - + self.withLegend(mode='list', calcs=[]), - withDrawStyle(style):: self { - fieldConfig+: { - defaults+: { - custom+: { - drawStyle: style, - }, - }, - }, - }, - withPointsSize(size):: self { - fieldConfig+: { - defaults+: { - custom+: { - pointSize: size, - }, - }, - }, - }, - withTooltip(mode=null, sort='none'):: self { - options+: { - tooltip: { - mode: 'multi', - sort: sort, - }, - }, - }, - withLineInterpolation(value):: self { - fieldConfig+: { - defaults+: { - custom+: { - lineInterpolation: value, - }, - }, - }, - }, - withShowPoints(value):: self { - fieldConfig+: { - defaults+: { - custom+: { - showPoints: value, - }, - }, - }, - }, - withStacking(stack='normal'):: self { - fieldConfig+: { - defaults+: { - custom+: { - stacking: { - mode: stack, - group: 'A', - }, - }, - }, - }, - }, - withGradientMode(mode):: self { - fieldConfig+: { - defaults+: { - custom+: { - gradientMode: mode, - }, - }, - }, - }, - addDataLink(title, url):: self { - - fieldConfig+: { - defaults+: { - links: [ - { - title: title, - url: url, - }, - ], - }, - }, - }, - withFillOpacity(opacity):: self { - fieldConfig+: { - defaults+: { - custom+: { - fillOpacity: opacity, - }, - }, - }, - - }, - - withAxisLabel(label):: self { - fieldConfig+: { - defaults+: { - custom+: { - axisLabel: label, - }, - }, - }, - }, - - withNegativeYByRegex(regex):: self { - fieldConfig+: { - overrides+: [ - { - matcher: { - id: 'byRegexp', - options: '/' + regex + '/', - }, - properties: [ - { - id: 'custom.transform', - value: 'negative-Y', - }, - ], - }, - - ], - }, - - - }, -} diff --git a/docs/node-mixin/mixin-mac.libsonnet b/docs/node-mixin/mixin-mac.libsonnet new file mode 100644 index 0000000000..66f2ec8bbf --- /dev/null +++ b/docs/node-mixin/mixin-mac.libsonnet @@ -0,0 +1,10 @@ +local macoslib = import 
'./lib/macos/main.libsonnet'; +{ + _config:: {}, + _macosLib:: + macoslib.new() + + macoslib.withConfigMixin(self._config), + grafanaDashboards+:: self._macosLib.grafana.dashboards, + prometheusAlerts+:: self._macosLib.prometheus.alerts, + prometheusRules+:: self._macosLib.prometheus.recordingRules, +} diff --git a/docs/node-mixin/mixin.libsonnet b/docs/node-mixin/mixin.libsonnet index b9831f9380..1659faf9d3 100644 --- a/docs/node-mixin/mixin.libsonnet +++ b/docs/node-mixin/mixin.libsonnet @@ -1,4 +1,11 @@ -(import 'config.libsonnet') + -(import 'alerts/alerts.libsonnet') + -(import 'dashboards/dashboards.libsonnet') + -(import 'rules/rules.libsonnet') +local nodelib = import './lib/linux/main.libsonnet'; + +{ + _config:: {}, + _linuxLib:: + nodelib.new() + + nodelib.withConfigMixin(self._config), + grafanaDashboards+:: self._linuxLib.grafana.dashboards, + prometheusAlerts+:: self._linuxLib.prometheus.alerts, + prometheusRules+:: self._linuxLib.prometheus.recordingRules, +} diff --git a/docs/node-mixin/rules.jsonnet b/docs/node-mixin/rules.jsonnet deleted file mode 100644 index dbe13f417b..0000000000 --- a/docs/node-mixin/rules.jsonnet +++ /dev/null @@ -1 +0,0 @@ -std.manifestYamlDoc((import 'mixin.libsonnet').prometheusRules) diff --git a/docs/node-mixin/rules/rules.libsonnet b/docs/node-mixin/rules/rules.libsonnet deleted file mode 100644 index 9c8eb90dd1..0000000000 --- a/docs/node-mixin/rules/rules.libsonnet +++ /dev/null @@ -1,119 +0,0 @@ -{ - prometheusRules+:: { - groups+: [ - { - name: 'node-exporter.rules', - rules: [ - { - // This rule gives the number of CPUs per node. - record: 'instance:node_num_cpu:sum', - expr: ||| - count without (cpu, mode) ( - node_cpu_seconds_total{%(nodeExporterSelector)s,mode="idle"} - ) - ||| % $._config, - }, - { - // CPU utilisation is % CPU without {idle,iowait,steal}. 
- record: 'instance:node_cpu_utilisation:rate%(rateInterval)s' % $._config, - expr: ||| - 1 - avg without (cpu) ( - sum without (mode) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode=~"idle|iowait|steal"}[%(rateInterval)s])) - ) - ||| % $._config, - }, - { - // This is CPU saturation: 1min avg run queue length / number of CPUs. - // Can go over 1. - // TODO: There are situation where a run queue >1/core is just normal and fine. - // We need to clarify how to read this metric and if its usage is helpful at all. - record: 'instance:node_load1_per_cpu:ratio', - expr: ||| - ( - node_load1{%(nodeExporterSelector)s} - / - instance:node_num_cpu:sum{%(nodeExporterSelector)s} - ) - ||| % $._config, - }, - { - // Memory utilisation (ratio of used memory per instance). - record: 'instance:node_memory_utilisation:ratio', - expr: ||| - 1 - ( - ( - node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} - or - ( - node_memory_Buffers_bytes{%(nodeExporterSelector)s} - + - node_memory_Cached_bytes{%(nodeExporterSelector)s} - + - node_memory_MemFree_bytes{%(nodeExporterSelector)s} - + - node_memory_Slab_bytes{%(nodeExporterSelector)s} - ) - ) - / - node_memory_MemTotal_bytes{%(nodeExporterSelector)s} - ) - ||| % $._config, - }, - { - record: 'instance:node_vmstat_pgmajfault:rate%(rateInterval)s' % $._config, - expr: ||| - rate(node_vmstat_pgmajfault{%(nodeExporterSelector)s}[%(rateInterval)s]) - ||| % $._config, - }, - { - // Disk utilisation (seconds spent, 1 second rate). - record: 'instance_device:node_disk_io_time_seconds:rate%(rateInterval)s' % $._config, - expr: ||| - rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[%(rateInterval)s]) - ||| % $._config, - }, - { - // Disk saturation (weighted seconds spent, 1 second rate). 
- record: 'instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s' % $._config, - expr: ||| - rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[%(rateInterval)s]) - ||| % $._config, - }, - { - record: 'instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s' % $._config, - expr: ||| - sum without (device) ( - rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, device!="lo"}[%(rateInterval)s]) - ) - ||| % $._config, - }, - { - record: 'instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s' % $._config, - expr: ||| - sum without (device) ( - rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, device!="lo"}[%(rateInterval)s]) - ) - ||| % $._config, - }, - // TODO: Find out if those drops ever happen on modern switched networks. - { - record: 'instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s' % $._config, - expr: ||| - sum without (device) ( - rate(node_network_receive_drop_total{%(nodeExporterSelector)s, device!="lo"}[%(rateInterval)s]) - ) - ||| % $._config, - }, - { - record: 'instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s' % $._config, - expr: ||| - sum without (device) ( - rate(node_network_transmit_drop_total{%(nodeExporterSelector)s, device!="lo"}[%(rateInterval)s]) - ) - ||| % $._config, - }, - ], - }, - ], - }, -} diff --git a/docs/node-observ-lib/.gitignore b/docs/node-observ-lib/.gitignore deleted file mode 100644 index f9bf6ba815..0000000000 --- a/docs/node-observ-lib/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -jsonnetfile.lock.json -vendor diff --git a/docs/node-observ-lib/jsonnetfile.json b/docs/node-observ-lib/jsonnetfile.json deleted file mode 100644 index b12b5dc0af..0000000000 --- a/docs/node-observ-lib/jsonnetfile.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "version": 1, - "dependencies": [ - { - "source": { - "git": { - "remote": "https://github.com/grafana/grafonnet.git", - "subdir": 
"gen/grafonnet-v10.0.0" - } - }, - "version": "main" - }, - { - "source": { - "git": { - "remote": "https://github.com/grafana/jsonnet-libs.git", - "subdir": "common-lib" - } - }, - "version": "master" - }, - { - "source": { - "git": { - "remote": "https://github.com/grafana/jsonnet-libs.git", - "subdir": "logs-lib" - } - }, - "version": "master" - } - ], - "legacyImports": true -} \ No newline at end of file diff --git a/docs/node-observ-lib/jsonnetfile.lock.json b/docs/node-observ-lib/jsonnetfile.lock.json deleted file mode 100644 index c10dd413b5..0000000000 --- a/docs/node-observ-lib/jsonnetfile.lock.json +++ /dev/null @@ -1,76 +0,0 @@ -{ - "version": 1, - "dependencies": [ - { - "source": { - "git": { - "remote": "https://github.com/grafana/grafonnet.git", - "subdir": "gen/grafonnet-v10.0.0" - } - }, - "version": "1ce5aec95ce32336fe47c8881361847c475b5254", - "sum": "xdcrJPJlpkq4+5LpGwN4tPAuheNNLXZjE6tDcyvFjr0=" - }, - { - "source": { - "git": { - "remote": "https://github.com/grafana/grafonnet.git", - "subdir": "gen/grafonnet-v11.0.0" - } - }, - "version": "1ce5aec95ce32336fe47c8881361847c475b5254", - "sum": "0BvzR0i4bS4hc2O3xDv6i9m52z7mPrjvqxtcPrGhynA=" - }, - { - "source": { - "git": { - "remote": "https://github.com/grafana/jsonnet-libs.git", - "subdir": "common-lib" - } - }, - "version": "2c38760394b41de9b7477e8ab26e9a24ed85b589", - "sum": "b/FOCPjMla8HXFMJNsjP+2/0UpWade7PDBzNARihj1U=" - }, - { - "source": { - "git": { - "remote": "https://github.com/grafana/jsonnet-libs.git", - "subdir": "logs-lib" - } - }, - "version": "2c38760394b41de9b7477e8ab26e9a24ed85b589", - "sum": "05RYR0TOrWza0n8lgF9K7naGY7kM6OTBePsoiJw2TUE=" - }, - { - "source": { - "git": { - "remote": "https://github.com/jsonnet-libs/docsonnet.git", - "subdir": "doc-util" - } - }, - "version": "6ac6c69685b8c29c54515448eaca583da2d88150", - "sum": "BrAL/k23jq+xy9oA7TWIhUx07dsA/QLm3g7ktCwe//U=" - }, - { - "source": { - "git": { - "remote": "https://github.com/jsonnet-libs/xtd.git", - "subdir": "" 
- } - }, - "version": "63d430b69a95741061c2f7fc9d84b1a778511d9c", - "sum": "qiZi3axUSXCVzKUF83zSAxklwrnitMmrDK4XAfjPMdE=" - }, - { - "source": { - "git": { - "remote": "https://github.com/yugui/jsonnetunit.git", - "subdir": "jsonnetunit" - } - }, - "version": "6927c58cae7624a00f368b977ccc477d4f74071f", - "sum": "9FFqqln65hooRF0l6rjICDtnTxUlmDj34+sKMh4sjPI=" - } - ], - "legacyImports": false -} diff --git a/docs/node-observ-lib/linux/targets.libsonnet b/docs/node-observ-lib/linux/targets.libsonnet deleted file mode 100644 index a12ad388a8..0000000000 --- a/docs/node-observ-lib/linux/targets.libsonnet +++ /dev/null @@ -1,1147 +0,0 @@ -local g = import '../g.libsonnet'; -local prometheusQuery = g.query.prometheus; -local lokiQuery = g.query.loki; - -{ - new(this): { - local variables = this.grafana.variables, - local config = this.config, - local prometheusDatasource = '${' + variables.datasources.prometheus.name + '}', - local lokiDatasource = '${' + variables.datasources.loki.name + '}', - uptimeQuery:: 'node_boot_time_seconds', - - reboot: - prometheusQuery.new( - prometheusDatasource, - self.uptimeQuery + '{%(queriesSelector)s}*1000 > $__from < $__to' % variables, - ), - - serviceFailed: - lokiQuery.new( - lokiDatasource, - '{%(queriesSelector)s, unit="init.scope"} |= "code=exited, status=1/FAILURE"' % variables - ), - // those events should be rare, so can be shown as annotations - criticalEvents: - lokiQuery.new( - lokiDatasource, - '{%(queriesSelector)s, transport="kernel", level="emerg"}' % variables - ), - memoryOOMkiller: - prometheusQuery.new( - prometheusDatasource, - 'increase(node_vmstat_oom_kill{%(queriesSelector)s}[$__interval:] offset -$__interval)' % variables, - ) - + prometheusQuery.withLegendFormat('OOM killer invocations'), - - kernelUpdate: - prometheusQuery.new( - prometheusDatasource, - expr=||| - changes( - sum by (%(instanceLabels)s) ( - group by (%(instanceLabels)s,release) (node_uname_info{%(queriesSelector)s}) - ) - [$__interval:1m] 
offset -$__interval) > 1 - ||| % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, - ), - - // new interactive session in logs: - sessionOpened: - lokiQuery.new( - lokiDatasource, - '{%(queriesSelector)s, unit="systemd-logind.service"}|= "New session"' % variables - ), - sessionClosed: - lokiQuery.new( - lokiDatasource, - '{%(queriesSelector)s, unit="systemd-logind.service"} |= "logged out"' % variables - ), - - alertsCritical: - prometheusQuery.new( - prometheusDatasource, - 'count by (%(instanceLabels)s) (max_over_time(ALERTS{%(queriesSelector)s, alertstate="firing", severity="critical"}[1m])) * group by (%(instanceLabels)s) (node_uname_info{%(queriesSelector)s})' % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, - ), - alertsWarning: - prometheusQuery.new( - prometheusDatasource, - 'count by (%(instanceLabels)s) (max_over_time(ALERTS{%(queriesSelector)s, alertstate="firing", severity="warning"}[1m])) * group by (%(instanceLabels)s) (node_uname_info{%(queriesSelector)s})' % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, - ), - - uptime: - prometheusQuery.new( - prometheusDatasource, - 'time() - ' + self.uptimeQuery + '{%(queriesSelector)s}' % variables - ), - cpuCount: - prometheusQuery.new( - prometheusDatasource, - 'count without (cpu) (node_cpu_seconds_total{%(queriesSelector)s, mode="idle"})' % variables - ) - + prometheusQuery.withLegendFormat('Cores'), - cpuUsage: - prometheusQuery.new( - prometheusDatasource, - ||| - (((count by (%(instanceLabels)s) (count(node_cpu_seconds_total{%(queriesSelector)s}) by (cpu, %(instanceLabels)s))) - - - avg by (%(instanceLabels)s) (sum by (%(instanceLabels)s, mode)(irate(node_cpu_seconds_total{mode='idle',%(queriesSelector)s}[$__rate_interval])))) * 100) - / - count by(%(instanceLabels)s) (count(node_cpu_seconds_total{%(queriesSelector)s}) by (cpu, %(instanceLabels)s)) - ||| % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, - 
) - + prometheusQuery.withLegendFormat('CPU usage'), - cpuUsagePerCore: - prometheusQuery.new( - prometheusDatasource, - ||| - ( - (1 - sum without (mode) (rate(node_cpu_seconds_total{%(queriesSelector)s, mode=~"idle|iowait|steal"}[$__rate_interval]))) - / ignoring(cpu) group_left - count without (cpu, mode) (node_cpu_seconds_total{%(queriesSelector)s, mode="idle"}) - ) * 100 - ||| % variables, - ) - + prometheusQuery.withLegendFormat('CPU {{cpu}}'), - cpuUsageByMode: - prometheusQuery.new( - prometheusDatasource, - ||| - sum by(%(instanceLabels)s, mode) (irate(node_cpu_seconds_total{%(queriesSelector)s}[$__rate_interval])) - / on(%(instanceLabels)s) - group_left sum by (%(instanceLabels)s)((irate(node_cpu_seconds_total{%(queriesSelector)s}[$__rate_interval]))) * 100 - ||| % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, - ) - + prometheusQuery.withLegendFormat('{{ mode }}'), - memoryTotalBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_MemTotal_bytes{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Memory total'), - memoryFreeBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_MemFree_bytes{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Memory free'), - memoryAvailableBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_MemAvailable_bytes{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Memory available'), - memoryCachedBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Cached_bytes{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Memory cached'), - memoryBuffersBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Buffers_bytes{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Memory buffers'), - memoryUsedBytes: - prometheusQuery.new( - prometheusDatasource, - ||| - ( - node_memory_MemTotal_bytes{%(queriesSelector)s} - 
- - node_memory_MemFree_bytes{%(queriesSelector)s} - - - node_memory_Buffers_bytes{%(queriesSelector)s} - - - node_memory_Cached_bytes{%(queriesSelector)s} - ) - ||| % variables - ) - + prometheusQuery.withLegendFormat('Memory used'), - memoryUsagePercent: - prometheusQuery.new( - prometheusDatasource, - ||| - 100 - - ( - avg by (%(instanceLabels)s) (node_memory_MemAvailable_bytes{%(queriesSelector)s}) / - avg by (%(instanceLabels)s) (node_memory_MemTotal_bytes{%(queriesSelector)s}) - * 100 - ) - ||| - % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, - ), - memorySwapTotal: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_SwapTotal_bytes{%(queriesSelector)s}' % variables - ), - memoryPagesIn: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_vmstat_pgpgin{%(queriesSelector)s}[$__rate_interval])' % variables, - ) - + prometheusQuery.withLegendFormat('Page-In'), - memoryPagesOut: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_vmstat_pgpgout{%(queriesSelector)s}[$__rate_interval])' % variables, - ) - + prometheusQuery.withLegendFormat('Page-Out'), - - memoryPagesSwapIn: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_vmstat_pswpin{%(queriesSelector)s}[$__rate_interval])' % variables, - ) - + prometheusQuery.withLegendFormat('Pages swapped in'), - memoryPagesSwapOut: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_vmstat_pswpout{%(queriesSelector)s}[$__rate_interval])' % variables, - ) - + prometheusQuery.withLegendFormat('Pages swapped out'), - - memoryPageMajorFaults: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_vmstat_pgmajfault{%(queriesSelector)s}[$__rate_interval])' % variables, - ) - + prometheusQuery.withLegendFormat('Major page fault operations'), - memoryPageMinorFaults: - prometheusQuery.new( - prometheusDatasource, - ||| - irate(node_vmstat_pgfault{%(queriesSelector)s}[$__rate_interval]) - - - 
irate(node_vmstat_pgmajfault{%(queriesSelector)s}[$__rate_interval]) - ||| % variables, - ) - + prometheusQuery.withLegendFormat('Minor page fault operations'), - - memoryInactiveBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Inactive_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Inactive'), - memoryActiveBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Active_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Active'), - - memoryInactiveFile: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Inactive_file_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Inactive_file'), - - memoryInactiveAnon: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Inactive_anon_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Inactive_anon'), - - memoryActiveFile: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Active_file_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Active_file'), - - memoryActiveAnon: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Active_anon_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Active_anon'), - - memoryCommitedAs: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Committed_AS_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Commited_AS'), - memoryCommitedLimit: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_CommitLimit_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('CommitLimit'), - - memoryMappedBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Mapped_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Mapped'), - memoryShmemBytes: - prometheusQuery.new( - prometheusDatasource, - 
'node_memory_Shmem_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Shmem'), - memoryShmemHugePagesBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_ShmemHugePages_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('ShmemHugePages'), - memoryShmemPmdMappedBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_ShmemPmdMapped_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('ShmemPmdMapped'), - memoryWriteback: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Writeback_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Writeback'), - memoryWritebackTmp: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_WritebackTmp_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('WritebackTmp'), - memoryDirty: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Dirty_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Dirty'), - - memoryVmallocChunk: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_VmallocChunk_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('VmallocChunk'), - memoryVmallocTotal: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_VmallocTotal_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('VmallocTotal'), - memoryVmallocUsed: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_VmallocUsed_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('VmallocUsed'), - memorySlabSUnreclaim: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_SUnreclaim_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('SUnreclaim'), - memorySlabSReclaimable: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_SReclaimable_bytes{%(queriesSelector)s}' % 
variables, - ) - + prometheusQuery.withLegendFormat('SReclaimable'), - - memoryAnonHugePages: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_AnonHugePages_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('AnonHugePages'), - memoryAnonPages: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_AnonPages_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('AnonPages'), - - memoryHugePages_Free: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_HugePages_Free{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('HugePages_Free'), - memoryHugePages_Rsvd: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_HugePages_Rsvd{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('HugePages_Rsvd'), - memoryHugePages_Surp: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_HugePages_Surp{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('HugePages_Surp'), - memoryHugePagesTotalSize: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_HugePages_Total{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Huge pages total size'), - memoryHugePagesSize: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Hugepagesize_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Huge page size'), - memoryDirectMap1G: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_DirectMap1G_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('DirectMap1G'), - memoryDirectMap2M: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_DirectMap2M_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('DirectMap2M'), - memoryDirectMap4k: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_DirectMap4k_bytes{%(queriesSelector)s}' % variables, - ) - + 
prometheusQuery.withLegendFormat('DirectMap4k'), - memoryBounce: - prometheusQuery.new( - prometheusDatasource, - 'node_memory_Bounce_bytes{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Bounce'), - - diskTotal: - prometheusQuery.new( - prometheusDatasource, - 'node_filesystem_size_bytes{%(fsSelector)s, %(fsMountpointSelector)s, %(queriesSelector)s}' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } - ), - diskTotalRoot: - prometheusQuery.new( - prometheusDatasource, - 'node_filesystem_size_bytes{%(queriesSelector)s, mountpoint="/", fstype!="rootfs"}' % variables, - ), - diskUsageRoot: - prometheusQuery.new( - prometheusDatasource, - 'node_filesystem_avail_bytes{%(queriesSelector)s, mountpoint="/",fstype!="rootfs"}' % variables - ), - diskUsageRootPercent: - prometheusQuery.new( - prometheusDatasource, - '100 - node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs", %(queriesSelector)s}/node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs", %(queriesSelector)s}*100' % variables - ), - diskFree: - prometheusQuery.new( - prometheusDatasource, - 'node_filesystem_avail_bytes{%(fsSelector)s, %(fsMountpointSelector)s, %(queriesSelector)s}' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } - ) - + prometheusQuery.withLegendFormat('{{ mountpoint }} free'), - diskUsagePercent: - prometheusQuery.new( - prometheusDatasource, - '100 - node_filesystem_avail_bytes{%(fsSelector)s, %(fsMountpointSelector)s, %(queriesSelector)s}/node_filesystem_size_bytes{%(fsSelector)s, %(fsMountpointSelector)s, %(queriesSelector)s}*100' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } - ) - + prometheusQuery.withLegendFormat('{{ mountpoint }} used, %'), - - diskInodesFree: - prometheusQuery.new( - prometheusDatasource, - 'node_filesystem_files_free{%(queriesSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % 
variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector }, - ) - + prometheusQuery.withLegendFormat('{{ mountpoint }} inodes free'), - diskInodesTotal: - prometheusQuery.new( - prometheusDatasource, - 'node_filesystem_files{%(queriesSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } - ) + prometheusQuery.withLegendFormat('{{ mountpoint }} inodes total'), - diskReadOnly: - prometheusQuery.new( - prometheusDatasource, - 'node_filesystem_readonly{%(queriesSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } - ) - + prometheusQuery.withLegendFormat('{{ mountpoint }} read-only'), - diskDeviceError: - prometheusQuery.new( - prometheusDatasource, - 'node_filesystem_device_error{%(queriesSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } - ) - + prometheusQuery.withLegendFormat('{{ mountpoint }} device error'), - // descriptors - processMaxFds: - prometheusQuery.new( - prometheusDatasource, - 'process_max_fds{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Maximum open file descriptors'), - processOpenFds: - prometheusQuery.new( - prometheusDatasource, - 'process_open_fds{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Open file descriptors'), - - // disk(device) - diskIOreadBytesPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_disk_read_bytes_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % variables { diskDeviceSelector: config.diskDeviceSelector }, - ) - + prometheusQuery.withLegendFormat('{{ device }} read'), - diskIOwriteBytesPerSec: - prometheusQuery.new( - prometheusDatasource, - 
'irate(node_disk_written_bytes_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % variables { diskDeviceSelector: config.diskDeviceSelector }, - ) - + prometheusQuery.withLegendFormat('{{ device }} written'), - diskIOutilization: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_disk_io_time_seconds_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % variables { diskDeviceSelector: config.diskDeviceSelector }, - ) - + prometheusQuery.withLegendFormat('{{ device }} io util'), - diskAvgQueueSize: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_disk_io_time_weighted_seconds_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % variables { diskDeviceSelector: config.diskDeviceSelector }, - ) - + prometheusQuery.withLegendFormat('{{ device }} avg queue'), - - diskIOWaitWriteTime: - prometheusQuery.new( - prometheusDatasource, - ||| - irate(node_disk_write_time_seconds_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) - / - irate(node_disk_writes_completed_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) - ||| % variables { diskDeviceSelector: config.diskDeviceSelector } - ) - + prometheusQuery.withLegendFormat('{{ device }} avg write time'), - diskIOWaitReadTime: - prometheusQuery.new( - prometheusDatasource, - ||| - irate(node_disk_read_time_seconds_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) - / - irate(node_disk_reads_completed_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) - ||| % variables { diskDeviceSelector: config.diskDeviceSelector } - ) - + prometheusQuery.withLegendFormat('{{ device }} avg read time'), - diskIOReads: - prometheusQuery.new( - prometheusDatasource, - ||| - irate(node_disk_reads_completed_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) - ||| % variables { diskDeviceSelector: config.diskDeviceSelector } - ) - + prometheusQuery.withLegendFormat('{{ 
device }} reads'), - diskIOWrites: - prometheusQuery.new( - prometheusDatasource, - ||| - irate(node_disk_writes_completed_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) - ||| % variables { diskDeviceSelector: config.diskDeviceSelector } - ) - + prometheusQuery.withLegendFormat('{{ device }} writes'), - - unameInfo: - prometheusQuery.new( - prometheusDatasource, - 'node_uname_info{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withFormat('table'), - osInfo: - prometheusQuery.new( - prometheusDatasource, - ||| - node_os_info{%(queriesSelector)s} - ||| % variables, - ) - + prometheusQuery.withFormat('table'), - osInfoCombined: - prometheusQuery.new( - prometheusDatasource, - ||| - node_uname_info{%(queriesSelector)s} - * on (%(groupLabels)s,%(instanceLabels)s) - group_left(pretty_name) - node_os_info{%(queriesSelector)s} - ||| % variables { - instanceLabels: std.join(',', this.config.instanceLabels), - groupLabels: std.join(',', this.config.groupLabels), - }, - ) - + prometheusQuery.withFormat('table'), - - osTimezone: //timezone label - prometheusQuery.new( - prometheusDatasource, - 'node_time_zone_offset_seconds{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withFormat('table'), - - systemLoad1: - prometheusQuery.new( - prometheusDatasource, - 'node_load1{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('1m'), - systemLoad5: - prometheusQuery.new( - prometheusDatasource, - 'node_load5{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('5m'), - systemLoad15: - prometheusQuery.new( - prometheusDatasource, - 'node_load15{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('15m'), - - systemContextSwitches: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_context_switches_total{%(queriesSelector)s}[$__rate_interval])' % variables, - ) - + prometheusQuery.withLegendFormat('Context switches'), - - systemInterrupts: - prometheusQuery.new( 
- prometheusDatasource, - 'irate(node_intr_total{%(queriesSelector)s}[$__rate_interval])' % variables, - ) - + prometheusQuery.withLegendFormat('Interrupts'), - - timeNtpStatus: - prometheusQuery.new( - prometheusDatasource, - 'node_timex_sync_status{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('NTP status'), - - timeOffset: - prometheusQuery.new( - prometheusDatasource, - 'node_timex_offset_seconds{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Time offset'), - - timeEstimatedError: - prometheusQuery.new( - prometheusDatasource, - 'node_timex_estimated_error_seconds{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Estimated error in seconds'), - timeMaxError: - prometheusQuery.new( - prometheusDatasource, - 'node_timex_maxerror_seconds{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('Maximum error in seconds'), - - networkUp: - prometheusQuery.new( - prometheusDatasource, - 'node_network_up{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('{{device}}'), - networkCarrier: - prometheusQuery.new( - prometheusDatasource, - 'node_network_carrier{%(queriesSelector)s}' % variables, - ) - + prometheusQuery.withLegendFormat('{{device}}'), - networkArpEntries: - prometheusQuery.new( - prometheusDatasource, - 'node_arp_entries{%(queriesSelector)s}' % variables, - ), - networkMtuBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_network_mtu_bytes{%(queriesSelector)s}' % variables, - ), - networkSpeedBitsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'node_network_speed_bytes{%(queriesSelector)s} * 8' % variables, - ), - networkTransmitQueueLength: - prometheusQuery.new( - prometheusDatasource, - 'node_network_transmit_queue_length{%(queriesSelector)s}' % variables, - ), - networkInfo: - prometheusQuery.new( - prometheusDatasource, - 'node_network_info{%(queriesSelector)s}' % variables, - ), - - 
networkOutBitPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_transmit_bytes_total{%(queriesSelector)s}[$__rate_interval])*8' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} transmitted'), - networkInBitPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_receive_bytes_total{%(queriesSelector)s}[$__rate_interval])*8' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} received'), - networkOutBitPerSecFiltered: - prometheusQuery.new( - prometheusDatasource, - ||| - irate(node_network_transmit_bytes_total{%(queriesSelector)s}[$__rate_interval])*8 - # only show interfaces that had traffic change at least once during selected dashboard interval: - and - increase( - node_network_transmit_bytes_total{%(queriesSelector)s}[$__range] - ) > 0 - ||| % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} transmitted'), - networkInBitPerSecFiltered: - prometheusQuery.new( - prometheusDatasource, - ||| - irate(node_network_receive_bytes_total{%(queriesSelector)s}[$__rate_interval])*8 - # only show interfaces that had traffic change at least once during selected dashboard interval: - and - increase( - node_network_receive_bytes_total{%(queriesSelector)s}[$__range] - ) > 0 - ||| % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} received'), - - - networkOutErrorsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_transmit_errs_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} errors transmitted'), - networkInErrorsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_receive_errs_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} errors received'), - networkOutDroppedPerSec: - prometheusQuery.new( - prometheusDatasource, - 
'irate(node_network_transmit_drop_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} transmitted dropped'), - networkInDroppedPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_receive_drop_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} received dropped'), - - networkInPacketsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_receive_packets_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} received'), - networkOutPacketsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_transmit_packets_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} transmitted'), - - networkInMulticastPacketsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_receive_multicast_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} received'), - networkOutMulticastPacketsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_transmit_multicast_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} transmitted'), - networkFifoInPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_receive_fifo_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} received'), - networkFifoOutPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_transmit_fifo_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} transmitted'), - - networkCompressedInPerSec: - prometheusQuery.new( - prometheusDatasource, - 
'irate(node_network_receive_compressed_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} received'), - networkCompressedOutPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_network_transmit_compressed_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('{{ device }} transmitted'), - - networkNFConntrackEntries: - prometheusQuery.new( - prometheusDatasource, - 'node_nf_conntrack_entries{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('NF conntrack entries'), - networkNFConntrackLimits: - prometheusQuery.new( - prometheusDatasource, - 'node_nf_conntrack_entries_limit{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('NF conntrack limits'), - - networkSoftnetProcessedPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_softnet_processed_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('CPU {{ cpu }} processed'), - networkSoftnetDroppedPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_softnet_dropped_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('CPU {{ cpu }} dropped'), - networkSoftnetSqueezedPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_softnet_times_squeezed_total{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('CPU {{ cpu }} out of quota'), - - networkSocketsUsed: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_sockets_used{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('IPv4 sockets in use'), - networkSocketsTCPAllocated: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_TCP_alloc{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Allocated'), - networkSocketsTCPIPv6: - prometheusQuery.new( - 
prometheusDatasource, - 'node_sockstat_TCP6_inuse{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('IPv6 in use'), - networkSocketsTCPIPv4: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_TCP_inuse{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('IPv4 in use'), - networkSocketsTCPOrphans: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_TCP_orphan{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Orphan sockets'), - networkSocketsTCPTimeWait: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_TCP_tw{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Time wait'), - - networkSocketsUDPLiteInUse: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_UDPLITE_inuse{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('IPv4 UDPLITE in use'), - networkSocketsUDPInUse: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_UDP_inuse{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('IPv4 UDP in use'), - networkSocketsUDPLiteIPv6InUse: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_UDPLITE6_inuse{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('IPv6 UDPLITE in use'), - networkSocketsUDPIPv6InUse: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_UDP6_inuse{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('IPv6 UDP in use'), - - networkSocketsFragInUse: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_FRAG_inuse{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('IPv4 Frag sockets in use'), - networkSocketsFragIPv6InUse: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_FRAG6_inuse{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('IPv6 Frag sockets in use'), - networkSocketsRawInUse: - 
prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_RAW_inuse{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('IPv4 Raw sockets in use'), - networkSocketsIPv6RawInUse: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_RAW6_inuse{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('IPv6 Raw sockets in use'), - - networkSocketsTCPMemoryPages: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_TCP_mem{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Memory pages allocated for TCP sockets'), - networkSocketsUDPMemoryPages: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_UDP_mem{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Memory pages allocated for UDP sockets'), - - networkSocketsTCPMemoryBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_TCP_mem_bytes{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Memory bytes allocated for TCP sockets'), - networkSocketsUDPMemoryBytes: - prometheusQuery.new( - prometheusDatasource, - 'node_sockstat_UDP_mem_bytes{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('Memory bytes allocated for UDP sockets'), - - networkNetstatIPInOctetsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_IpExt_InOctets{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('Octets received'), - networkNetstatIPOutOctetsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_IpExt_OutOctets{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('Octets transmitted'), - - networkNetstatTCPInSegmentsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Tcp_InSegs{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('TCP received'), - 
networkNetstatTCPOutSegmentsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Tcp_OutSegs{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('TCP transmitted'), - - networkNetstatTCPOverflowPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_TcpExt_ListenOverflows{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('TCP overflow'), - - networkNetstatTCPListenDropsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_TcpExt_ListenDrops{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('TCP ListenDrops - SYNs to LISTEN sockets ignored'), - - networkNetstatTCPRetransPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_TcpExt_TCPSynRetrans{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('TCP SYN rentransmits'), - - networkNetstatTCPRetransSegPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Tcp_RetransSegs{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('TCP retransmitted segments, containing one or more previously transmitted octets'), - networkNetstatTCPInWithErrorsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Tcp_InErrs{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('TCP received with errors'), - - networkNetstatTCPOutWithRstPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Tcp_OutRsts{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('TCP segments sent with RST flag'), - - networkNetstatIPInUDPPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Udp_InDatagrams{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + 
prometheusQuery.withLegendFormat('UDP received'), - - networkNetstatIPOutUDPPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Udp_OutDatagrams{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDP transmitted'), - - networkNetstatIPInUDP6PerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Udp6_InDatagrams{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDP6 received'), - - networkNetstatIPOutUDP6PerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Udp6_OutDatagrams{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDP6 transmitted'), - - //UDP errors - networkNetstatUDPLiteInErrorsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_UdpLite_InErrors{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDPLite InErrors'), - - networkNetstatUDPInErrorsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Udp_InErrors{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDP InErrors'), - networkNetstatUDP6InErrorsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Udp6_InErrors{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDP6 InErrors'), - networkNetstatUDPNoPortsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Udp_NoPorts{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDP NoPorts'), - networkNetstatUDP6NoPortsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Udp6_NoPorts{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDP6 NoPorts'), - networkNetstatUDPRcvBufErrsPerSec: - prometheusQuery.new( - 
prometheusDatasource, - 'irate(node_netstat_Udp_RcvbufErrors{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDP receive buffer errors'), - networkNetstatUDP6RcvBufErrsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Udp6_RcvbufErrors{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDP6 receive buffer errors'), - networkNetstatUDPSndBufErrsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Udp_SndbufErrors{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDP transmit buffer errors'), - networkNetstatUDP6SndBufErrsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Udp6_SndbufErrors{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('UDP6 transmit buffer errors'), - - //ICMP - networkNetstatICMPInPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Icmp_InMsgs{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('ICMP received'), - networkNetstatICMPOutPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Icmp_OutMsgs{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('ICMP transmitted'), - networkNetstatICMP6InPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Icmp6_InMsgs{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('ICMP6 received'), - networkNetstatICMP6OutPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Icmp6_OutMsgs{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('ICMP6 transmitted'), - - networkNetstatICMPInErrorsPerSec: - prometheusQuery.new( - prometheusDatasource, - 
'irate(node_netstat_Icmp_InErrors{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('ICMP6 errors'), - networkNetstatICM6PInErrorsPerSec: - prometheusQuery.new( - prometheusDatasource, - 'irate(node_netstat_Icmp6_InErrors{%(queriesSelector)s}[$__rate_interval])' % variables - ) - + prometheusQuery.withLegendFormat('ICMP6 errors'), - - hardwareTemperature: - prometheusQuery.new( - prometheusDatasource, - 'node_hwmon_temp_celsius{%(queriesSelector)s}' % variables - ) - + prometheusQuery.withLegendFormat('{{chip}}/{{sensor}}'), - - }, -} diff --git a/docs/node-observ-lib/macos/config.libsonnet b/docs/node-observ-lib/macos/config.libsonnet deleted file mode 100644 index 49ea6ecc4a..0000000000 --- a/docs/node-observ-lib/macos/config.libsonnet +++ /dev/null @@ -1,59 +0,0 @@ -{ - - // any modular observability library should inlcude as inputs: - // 'dashboardNamePrefix' - Use as prefix for all Dashboards and (optional) rule groups - // 'filteringSelector' - Static selector to apply to ALL dashboard variables of type query, panel queries, alerts and recording rules. - // 'groupLabels' - one or more labels that can be used to identify 'group' of instances. In simple cases, can be 'job' or 'cluster'. - // 'instanceLabels' - one or more labels that can be used to identify single entity of instances. In simple cases, can be 'instance' or 'pod'. - // 'uid' - UID to prefix all dashboards original uids - - filteringSelector: 'job="integrations/macos-node"', - groupLabels: ['job'], - instanceLabels: ['instance'], - dashboardNamePrefix: 'MacOS / ', - uid: 'darwin', - - dashboardTags: [self.uid], - - // Select the fstype for filesystem-related queries. If left - // empty, all filesystems are selected. If you have unusual - // filesystem you don't want to include in dashboards and - // alerting, you can exclude them here, e.g. 'fstype!="tmpfs"'. - fsSelector: 'fstype!=""', - - // Select the mountpoint for filesystem-related queries. 
If left - // empty, all mountpoints are selected. For example if you have a - // special purpose tmpfs instance that has a fixed size and will - // always be 100% full, but you still want alerts and dashboards for - // other tmpfs instances, you can exclude those by mountpoint prefix - // like so: 'mountpoint!~"/var/lib/foo.*"'. - fsMountpointSelector: 'mountpoint!=""', - - // Select the device for disk-related queries. If left empty, all - // devices are selected. If you have unusual devices you don't - // want to include in dashboards and alerting, you can exclude - // them here, e.g. 'device!="tmpfs"'. - diskDeviceSelector: 'device!=""', - dashboardPeriod: 'now-1h', - dashboardTimezone: 'default', - dashboardRefresh: '1m', - - // Alerts to keep from node-observ-lib: - alertsMacKeep: [ - 'NodeFilesystemAlmostOutOfSpace', - 'NodeNetworkReceiveErrs', - 'NodeNetworkTransmitErrs', - 'NodeTextFileCollectorScrapeError', - 'NodeFilesystemFilesFillingUp', - 'NodeFilesystemAlmostOutOfFiles', - ], - // logs lib related - enableLokiLogs: true, - extraLogLabels: ['filename', 'sender'], - logsVolumeGroupBy: 'sender', - showLogsVolume: true, - logsFilteringSelector: self.filteringSelector, - logsExtraFilters: '', - - -} diff --git a/docs/node-observ-lib/mixin-mac.libsonnet b/docs/node-observ-lib/mixin-mac.libsonnet deleted file mode 100644 index d0b56adf12..0000000000 --- a/docs/node-observ-lib/mixin-mac.libsonnet +++ /dev/null @@ -1,8 +0,0 @@ -local macoslib = import './macos/main.libsonnet'; -local macos = macoslib.new(); - -{ - grafanaDashboards+:: macos.grafana.dashboards, - prometheusAlerts+:: macos.prometheus.alerts, - prometheusRules+:: macos.prometheus.recordingRules, -} diff --git a/docs/node-observ-lib/mixin.libsonnet b/docs/node-observ-lib/mixin.libsonnet deleted file mode 100644 index 284f307dd4..0000000000 --- a/docs/node-observ-lib/mixin.libsonnet +++ /dev/null @@ -1,16 +0,0 @@ -local nodelib = import './linux/main.libsonnet'; -local linux = - nodelib.new() - + 
nodelib.withConfigMixin({ - filteringSelector: 'job=~".*node.*"', - groupLabels: ['job'], - instanceLabels: ['instance'], - dashboardNamePrefix: 'Node exporter / ', - dashboardTags: ['node-exporter-mixin'], - uid: 'node', - }); -{ - grafanaDashboards+:: linux.grafana.dashboards, - prometheusAlerts+:: linux.prometheus.alerts, - prometheusRules+:: linux.prometheus.recordingRules, -}