diff --git a/cluster-manifests/kube-system/container-azm-ms-agentconfig.yaml b/cluster-manifests/kube-system/container-azm-ms-agentconfig.yaml index 12ba960e..7fcd912a 100644 --- a/cluster-manifests/kube-system/container-azm-ms-agentconfig.yaml +++ b/cluster-manifests/kube-system/container-azm-ms-agentconfig.yaml @@ -124,26 +124,6 @@ data: # When this is enabled (enabled = true), persistent volume metrics including those in the kube-system namespace will be collected enabled = true - alertable-metrics-configuration-settings: |- - # Alertable metrics configuration settings for container resource utilization - [alertable_metrics_configuration_settings.container_resource_utilization_thresholds] - # The threshold(Type Float) will be rounded off to 2 decimal points - # Threshold for container cpu, metric will be sent only when cpu utilization exceeds or becomes equal to the following percentage - container_cpu_threshold_percentage = 90.0 - # Threshold for container memoryRss, metric will be sent only when memory rss exceeds or becomes equal to the following percentage - container_memory_rss_threshold_percentage = 90.0 - # Threshold for container memoryWorkingSet, metric will be sent only when memory working set exceeds or becomes equal to the following percentage - container_memory_working_set_threshold_percentage = 90.0 - - # Alertable metrics configuration settings for persistent volume utilization - [alertable_metrics_configuration_settings.pv_utilization_thresholds] - # Threshold for persistent volume usage bytes, metric will be sent only when persistent volume utilization exceeds or becomes equal to the following percentage - pv_usage_threshold_percentage = 75.0 - - # Alertable metrics configuration settings for completed jobs count - [alertable_metrics_configuration_settings.job_completion_threshold] - # Threshold for completed job count , metric will be sent only for those jobs which were completed earlier than the following threshold - job_completion_threshold_time_minutes = 360 integrations: |- [integrations.azure_network_policy_manager] collect_basic_metrics = false diff --git a/workload-team/cluster-stamp.bicep b/workload-team/cluster-stamp.bicep index aef7a29a..3c65f635 100644 --- a/workload-team/cluster-stamp.bicep +++ b/workload-team/cluster-stamp.bicep @@ -182,7 +182,7 @@ resource dce 'Microsoft.Insights/dataCollectionEndpoints@2023-03-11' = { } } -// A data collection rule that collects PrometheusMetrics from pods, nodes and cluster and configure Azure monitor workspace as destination +// A data collection rule that collects Prometheus metrics from pods, nodes, and the cluster, and configures the Azure Monitor workspace as the destination resource dcr 'Microsoft.Insights/dataCollectionRules@2023-03-11' = { name: 'MSProm-${location}-${clusterName}' kind: 'Linux' @@ -294,6 +294,7 @@ module alerts 'modules/alerts.bicep' = { } dependsOn: [ sci + amw ] } @@ -727,7 +728,7 @@ resource mc 'Microsoft.ContainerService/managedClusters@2024-03-02-preview' = { // A single * can be provided for each resource to allow any annotations, but this has severe performance implications // https://github.com/prometheus-community/helm-charts/blob/e68c764aa6c764ec5934c6812ff0eaa0877ba275/charts/kube-state-metrics/values.yaml#L342 metricAnnotationsAllowList: '' - + // Comma-separated list of additional Kubernetes label keys that are used in the resource's kube_resource_labels metric. // For example, kube_pod_labels is the labels metric for the pods resource.
By default this metric contains only name and namespace labels. // To include more labels, provide a list of resource names in their plural form and Kubernetes label keys that you want to allow for them. diff --git a/workload-team/modules/alerts.bicep b/workload-team/modules/alerts.bicep index a5af4414..a119336d 100644 --- a/workload-team/modules/alerts.bicep +++ b/workload-team/modules/alerts.bicep @@ -1,3 +1,7 @@ +targetScope = 'resourceGroup' + +/*** PARAMETERS ***/ + @description('Location of the regional resources.') param location string @@ -7,10 +11,23 @@ param clusterName string @description('Resource ID of the Log Analytics workspace.') param logAnalyticsWorkspaceResourceId string +/*** VARIABLES ***/ + +var kubernetesAlertRuleGroupName = 'KubernetesAlert-RecommendedMetricAlerts${clusterName}' +var kubernetesAlertRuleGroupDescription = 'Kubernetes Alert RuleGroup-RecommendedMetricAlerts - 0.1' + +/*** EXISTING RESOURCES ***/ + resource mc 'Microsoft.ContainerService/managedClusters@2024-03-02-preview' existing = { name: clusterName } +resource amw 'Microsoft.Monitor/accounts@2023-04-03' existing = { + name: 'amw-${mc.name}' +} + +/*** RESOURCES ***/ + resource alaRgRecommendations 'Microsoft.Insights/activityLogAlerts@2020-10-01' = { name: 'AllAzureAdvisorAlert' location: 'Global' @@ -38,594 +55,623 @@ resource alaRgRecommendations 'Microsoft.Insights/activityLogAlerts@2020-10-01' } } -resource sqrPodFailed 'Microsoft.Insights/scheduledQueryRules@2022-06-15' = { - name: 'PodFailedScheduledQuery' +resource kubernetesAlertRuleGroupName_Pod_level 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = { + name: '${kubernetesAlertRuleGroupName}-Pod-level' location: location properties: { - autoMitigate: true - displayName: '[${clusterName}] Scheduled Query for Pod Failed Alert' - description: 'Alert on pod Failed phase.' 
- severity: 3 - enabled: true - scopes: [ - logAnalyticsWorkspaceResourceId - ] - evaluationFrequency: 'PT5M' - windowSize: 'PT10M' - criteria: { - allOf: [ - { - query: '//https://learn.microsoft.com/azure/azure-monitor/containers/container-insights-log-alerts \r\n let endDateTime = now(); let startDateTime = ago(1h); let trendBinSize = 1m; let clusterName = "${clusterName}"; KubePodInventory | where TimeGenerated < endDateTime | where TimeGenerated >= startDateTime | where ClusterName == clusterName | distinct ClusterName, TimeGenerated | summarize ClusterSnapshotCount = count() by bin(TimeGenerated, trendBinSize), ClusterName | join hint.strategy=broadcast ( KubePodInventory | where TimeGenerated < endDateTime | where TimeGenerated >= startDateTime | distinct ClusterName, Computer, PodUid, TimeGenerated, PodStatus | summarize TotalCount = count(), PendingCount = sumif(1, PodStatus =~ "Pending"), RunningCount = sumif(1, PodStatus =~ "Running"), SucceededCount = sumif(1, PodStatus =~ "Succeeded"), FailedCount = sumif(1, PodStatus =~ "Failed") by ClusterName, bin(TimeGenerated, trendBinSize) ) on ClusterName, TimeGenerated | extend UnknownCount = TotalCount - PendingCount - RunningCount - SucceededCount - FailedCount | project TimeGenerated, TotalCount = todouble(TotalCount) / ClusterSnapshotCount, PendingCount = todouble(PendingCount) / ClusterSnapshotCount, RunningCount = todouble(RunningCount) / ClusterSnapshotCount, SucceededCount = todouble(SucceededCount) / ClusterSnapshotCount, FailedCount = todouble(FailedCount) / ClusterSnapshotCount, UnknownCount = todouble(UnknownCount) / ClusterSnapshotCount| summarize AggregatedValue = avg(FailedCount) by bin(TimeGenerated, trendBinSize)' - metricMeasureColumn: 'AggregatedValue' - operator: 'GreaterThan' - threshold: 3 - timeAggregation: 'Average' - failingPeriods: { - minFailingPeriodsToAlert: 2 - numberOfEvaluationPeriods: 2 - } - } - ] - } - } -} - -resource maHighNodeCPUUtilization 'Microsoft.Insights/metricAlerts@2018-03-01' = { - name: 'Node CPU utilization high for ${clusterName} CI-1' - location: 'global' - properties: { - autoMitigate: true + description: kubernetesAlertRuleGroupDescription scopes: [ + amw.id mc.id ] - actions: [] - criteria: { - allOf: [ - { - criterionType: 'StaticThresholdCriterion' - dimensions: [ - { - name: 'host' - operator: 'Include' - values: [ - '*' - ] - } - ] - metricName: 'cpuUsagePercentage' - metricNamespace: 'Insights.Container/nodes' - name: 'Metric1' - operator: 'GreaterThan' - threshold: 80 - timeAggregation: 'Average' - skipMetricValidation: true + clusterName: mc.name + interval: 'PT1M' + rules: [ + { + alert: 'KubeJobStale' + expression: 'sum by(namespace,cluster)(kube_job_spec_completions{job="kube-state-metrics"}) - sum by(namespace,cluster)(kube_job_status_succeeded{job="kube-state-metrics"}) > 0 ' + for: 'PT360M' + annotations: { + description: 'Number of stale jobs older than six hours is greater than 0. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' } - ] - 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' - } - description: 'Node CPU utilization across the cluster.' 
- enabled: true - evaluationFrequency: 'PT1M' - severity: 3 - targetResourceType: 'microsoft.containerservice/managedclusters' - windowSize: 'PT5M' - } -} - -resource maHighNodeWorkingSetMemoryUtilization 'Microsoft.Insights/metricAlerts@2018-03-01' = { - name: 'Node working set memory utilization high for ${clusterName} CI-2' - location: 'global' - properties: { - autoMitigate: true - actions: [] - criteria: { - allOf: [ - { - criterionType: 'StaticThresholdCriterion' - dimensions: [ - { - name: 'host' - operator: 'Include' - values: [ - '*' - ] - } - ] - metricName: 'memoryWorkingSetPercentage' - metricNamespace: 'Insights.Container/nodes' - name: 'Metric1' - operator: 'GreaterThan' - threshold: 80 - timeAggregation: 'Average' - skipMetricValidation: true + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT15M' } - ] - 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' - } - description: 'Node working set memory utilization across the cluster.' - enabled: true - evaluationFrequency: 'PT1M' - scopes: [ - mc.id - ] - severity: 3 - targetResourceType: 'microsoft.containerservice/managedclusters' - windowSize: 'PT5M' - } -} - -resource maJobsCompletedMoreThan6HoursAgo 'Microsoft.Insights/metricAlerts@2018-03-01' = { - name: 'Jobs completed more than 6 hours ago for ${clusterName} CI-11' - location: 'global' - properties: { - autoMitigate: true - actions: [] - criteria: { - allOf: [ - { - criterionType: 'StaticThresholdCriterion' - dimensions: [ - { - name: 'controllerName' - operator: 'Include' - values: [ - '*' - ] - } - { - name: 'kubernetes namespace' - operator: 'Include' - values: [ - '*' - ] - } - ] - metricName: 'completedJobsCount' - metricNamespace: 'Insights.Container/pods' - name: 'Metric1' - operator: 'GreaterThan' - threshold: 0 - timeAggregation: 'Average' - skipMetricValidation: true + labels: { + severity: 'warning' } - ] - 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' - } - description: 'This alert monitors completed jobs (more than 6 hours ago).' - enabled: true - evaluationFrequency: 'PT1M' - scopes: [ - mc.id - ] - severity: 3 - targetResourceType: 'microsoft.containerservice/managedclusters' - windowSize: 'PT1M' - } -} - -resource maHighContainerCPUUsage 'Microsoft.Insights/metricAlerts@2018-03-01' = { - name: 'Container CPU usage violates the configured threshold for ${clusterName} CI-19' - location: 'global' - properties: { - autoMitigate: true - actions: [] - criteria: { - allOf: [ - { - criterionType: 'StaticThresholdCriterion' - dimensions: [ - { - name: 'controllerName' - operator: 'Include' - values: [ - '*' - ] - } - { - name: 'kubernetes namespace' - operator: 'Include' - values: [ - '*' - ] - } - ] - metricName: 'cpuThresholdViolated' - metricNamespace: 'Insights.Container/containers' - name: 'Metric1' - operator: 'GreaterThan' - threshold: 0 // This threshold is defined in the container-azm-ms-agentconfig.yaml file. - timeAggregation: 'Average' - skipMetricValidation: true + actions: [] + } + { + alert: 'KubeContainerAverageCPUHigh' + expression: 'sum (rate(container_cpu_usage_seconds_total{image!="", container!="POD"}[5m])) by (pod,cluster,container,namespace) / sum(container_spec_cpu_quota{image!="", container!="POD"}/container_spec_cpu_period{image!="", container!="POD"}) by (pod,cluster,container,namespace) > .95' + for: 'PT5M' + annotations: { + description: 'Average CPU usage per container is greater than 95%. 
For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' } - ] - 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' - } - description: 'This alert monitors container CPU usage. It uses the threshold defined in the config map.' - enabled: true - evaluationFrequency: 'PT1M' - scopes: [ - mc.id - ] - severity: 3 - targetResourceType: 'microsoft.containerservice/managedclusters' - windowSize: 'PT5M' - } -} - -resource maHighContainerWorkingSetMemoryUsage 'Microsoft.Insights/metricAlerts@2018-03-01' = { - name: 'Container working set memory usage violates the configured threshold for ${clusterName} CI-20' - location: 'global' - properties: { - autoMitigate: true - actions: [] - criteria: { - allOf: [ - { - criterionType: 'StaticThresholdCriterion' - dimensions: [ - { - name: 'controllerName' - operator: 'Include' - values: [ - '*' - ] - } - { - name: 'kubernetes namespace' - operator: 'Include' - values: [ - '*' - ] - } - ] - metricName: 'memoryWorkingSetThresholdViolated' - metricNamespace: 'Insights.Container/containers' - name: 'Metric1' - operator: 'GreaterThan' - threshold: 0 // This threshold is defined in the container-azm-ms-agentconfig.yaml file. - timeAggregation: 'Average' - skipMetricValidation: true + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT15M' } - ] - 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' - } - description: 'This alert monitors container working set memory usage. It uses the threshold defined in the config map.' - enabled: true - evaluationFrequency: 'PT1M' - scopes: [ - mc.id - ] - severity: 3 - targetResourceType: 'microsoft.containerservice/managedclusters' - windowSize: 'PT5M' - } -} - -resource maPodsInFailedState 'Microsoft.Insights/metricAlerts@2018-03-01' = { - name: 'Pods in failed state for ${clusterName} CI-4' - location: 'global' - properties: { - autoMitigate: true - actions: [] - criteria: { - allOf: [ - { - criterionType: 'StaticThresholdCriterion' - dimensions: [ - { - name: 'phase' - operator: 'Include' - values: [ - 'Failed' - ] - } - ] - metricName: 'podCount' - metricNamespace: 'Insights.Container/pods' - name: 'Metric1' - operator: 'GreaterThan' - threshold: 0 - timeAggregation: 'Average' - skipMetricValidation: true + labels: { + severity: 'warning' } - ] - 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' - } - description: 'Pod status monitoring.' - enabled: true - evaluationFrequency: 'PT1M' - scopes: [ - mc.id + actions: [] + } + { + alert: 'KubeContainerAverageMemoryHigh' + expression: 'avg by (namespace, controller, container, cluster)(((container_memory_working_set_bytes{container!="", image!="", container!="POD"} / on(namespace,cluster,pod,container) group_left kube_pod_container_resource_limits{resource="memory", node!=""})*on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, "controller", "$1", "owner_name", "(.*)")) > .95)' + for: 'PT10M' + annotations: { + description: 'Average Memory usage per container is greater than 95%. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' 
+ } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [] + } + { + alert: 'KubePodFailedState' + expression: 'sum by (cluster, namespace, controller) (kube_pod_status_phase{phase="failed"} * on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, "controller", "$1", "owner_name", "(.*)")) > 0' + for: 'PT5M' + annotations: { + description: 'Number of pods in failed state are greater than 0. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' + } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT15M' + } + labels: { + severity: 'warning' + } + actions: [] + } + { + alert: 'KubePVUsageHigh' + expression: 'avg by (namespace, controller, container, cluster)(((kubelet_volume_stats_used_bytes{job="kubelet"} / on(namespace,cluster,pod,container) group_left kubelet_volume_stats_capacity_bytes{job="kubelet"}) * on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, "controller", "$1", "owner_name", "(.*)"))) > .8' + for: 'PT15M' + annotations: { + description: 'Average PV usage on pod {{ $labels.pod }} in container {{ $labels.container }} is greater than 80%. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' + } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [] + } + { + alert: 'KubePodReadyStateLow' + expression: 'sum by (cluster,namespace,deployment)(kube_deployment_status_replicas_ready) / sum by (cluster,namespace,deployment)(kube_deployment_spec_replicas) <.8 or sum by (cluster,namespace,deployment)(kube_daemonset_status_number_ready) / sum by (cluster,namespace,deployment)(kube_daemonset_status_desired_number_scheduled) <.8 ' + for: 'PT5M' + annotations: { + description: 'Ready state of pods is less than 80%. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' + } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT15M' + } + labels: { + severity: 'warning' + } + actions: [] + } + { + alert: 'KubePodContainerRestart' + expression: 'sum by (namespace, controller, container, cluster)(increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[1h])* on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, "controller", "$1", "owner_name", "(.*)")) > 0' + for: 'PT15M' + annotations: { + description: 'Pod container restarted in the last 1 hour. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' + } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [] + } + { + alert: 'KubePodCrashLooping' + expression: 'max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics"}[5m]) >= 1' + for: 'PT15M' + annotations: { + description: '{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) in {{ $labels.cluster}} is restarting {{ printf "%.2f" $value }} / second. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' 
+ } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [] + } + { + alert: 'KubePodNotReadyByController' + expression: 'sum by (namespace, controller, cluster) (max by(namespace, pod, cluster) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"} ) * on(namespace, pod, cluster) group_left(controller)label_replace(kube_pod_owner,"controller","$1","owner_name","(.*)")) > 0' + for: 'PT15M' + annotations: { + description: '{{ $labels.namespace }}/{{ $labels.pod }} in {{ $labels.cluster}} by controller is not ready. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' + } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [] + } + { + alert: 'KubeStatefulSetGenerationMismatch' + expression: 'kube_statefulset_status_observed_generation{job="kube-state-metrics"} != kube_statefulset_metadata_generation{job="kube-state-metrics"}' + for: 'PT15M' + annotations: { + description: 'StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' + } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [] + } + { + alert: 'KubeJobFailed' + expression: 'kube_job_failed{job="kube-state-metrics"} > 0' + for: 'PT15M' + annotations: { + description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} in {{ $labels.cluster}} failed to complete. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' + } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [] + } + { + alert: 'KubeletPodStartUpLatencyHigh' + expression: 'histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet"} > 60' + for: 'PT10M' + annotations: { + description: 'Kubelet Pod startup latency is too high. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' + } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [] + } + { + alert: 'KubeDeploymentReplicasMismatch' + expression: '( kube_deployment_spec_replicas{job="kube-state-metrics"} > kube_deployment_status_replicas_available{job="kube-state-metrics"}) and ( changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[10m]) == 0)' + for: 'PT15M' + annotations: { + description: 'Deployment {{ $labels.namespace }}/{{ $labels.deployment }} in {{ $labels.cluster}} replica mismatch. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' 
+ } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT15M' + } + labels: { + severity: 'warning' + } + actions: [] + } + { + alert: 'KubeStatefulSetReplicasMismatch' + expression: '( kube_statefulset_status_replicas_ready{job="kube-state-metrics"} != kube_statefulset_status_replicas{job="kube-state-metrics"}) and ( changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[10m]) == 0)' + for: 'PT15M' + annotations: { + description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} in {{ $labels.cluster}} replica mismatch. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' + } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [] + } + { + alert: 'KubeHpaReplicasMismatch' + expression: '(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics"} !=kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}) and(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} >kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics"}) and(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} 2' + for: 'PT15M' + annotations: { + description: 'The readiness status of node {{ $labels.node }} in {{ $labels.cluster}} has changed more than 2 times in the last 15 minutes. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/node-level-recommended-alerts).' + } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [] + } + { + alert: 'KubeNodeDiskUsageHigh' + expression: '100 - ((node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100) > 80' + for: 'PT15M' + annotations: { + description: 'The disk usage on node {{ $labels.instance }} in {{ $labels.job }} has exceeded 80% and current usage is {{ $value | humanizePercentage }}%). For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/node-level-recommended-alerts).' 
+ } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [] + } ] - severity: 3 - targetResourceType: 'microsoft.containerservice/managedclusters' - windowSize: 'PT5M' } } -resource maContainersGettingKilledOOM 'Microsoft.Insights/metricAlerts@2018-03-01' = { - name: 'Containers getting OOM killed for ${clusterName} CI-6' - location: 'global' +resource kubernetesAlertRuleGroupName_Cluster_level 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = { + name: '${kubernetesAlertRuleGroupName}-Cluster-level' + location: location properties: { - autoMitigate: true - actions: [] - criteria: { - allOf: [ - { - criterionType: 'StaticThresholdCriterion' - dimensions: [ - { - name: 'kubernetes namespace' - operator: 'Include' - values: [ - '*' - ] - } - { - name: 'controllerName' - operator: 'Include' - values: [ - '*' - ] - } - ] - metricName: 'oomKilledContainerCount' - metricNamespace: 'Insights.Container/pods' - name: 'Metric1' - operator: 'GreaterThan' - threshold: 0 - timeAggregation: 'Average' - skipMetricValidation: true - } - ] - 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' - } - description: 'This alert monitors number of containers killed due to out of memory (OOM) error.' - enabled: true - evaluationFrequency: 'PT1M' + description: kubernetesAlertRuleGroupDescription scopes: [ + amw.id mc.id ] - severity: 3 - targetResourceType: 'microsoft.containerservice/managedclusters' - windowSize: 'PT1M' - } -} - -resource maHighPersistentVolumeUsage 'Microsoft.Insights/metricAlerts@2018-03-01' = { - name: 'Persistent volume usage high for ${clusterName} CI-18' - location: 'global' - properties: { - autoMitigate: true - actions: [] - criteria: { - allOf: [ - { - criterionType: 'StaticThresholdCriterion' - dimensions: [ - { - name: 'podName' - operator: 'Include' - values: [ - '*' - ] - } - { - name: 'kubernetesNamespace' - operator: 'Include' - values: [ - '*' - ] - } - ] - metricName: 'pvUsageExceededPercentage' - metricNamespace: 'Insights.Container/persistentvolumes' - name: 'Metric1' - operator: 'GreaterThan' - threshold: 80 - timeAggregation: 'Average' - skipMetricValidation: true + clusterName: mc.name + interval: 'PT1M' + rules: [ + { + alert: 'KubeContainerOOMKilledCount' + expression: 'sum by (cluster,container,controller,namespace)(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} * on(cluster,namespace,pod) group_left(controller) label_replace(kube_pod_owner, "controller", "$1", "owner_name", "(.*)")) > 0' + for: 'PT5M' + annotations: { + description: 'Number of OOM killed containers is greater than 0. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts).' } - ] - 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' - } - description: 'This alert monitors persistent volume utilization.' 
- enabled: false - evaluationFrequency: 'PT1M' - scopes: [ - mc.id + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [] + } + { + alert: 'KubeCPUQuotaOvercommit' + expression: 'sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) /sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) > 1.5' + for: 'PT5M' + annotations: { + description: 'Cluster {{ $labels.cluster}} has overcommitted CPU resource requests for Namespaces. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts).' + } + labels: { + severity: 'warning' + } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + actions: [] + } + { + alert: 'KubeMemoryQuotaOvercommit' + expression: 'sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) /sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) > 1.5' + for: 'PT5M' + annotations: { + description: 'Cluster {{ $labels.cluster}} has overcommitted memory resource requests for Namespaces. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts).' + } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [] + } + { + alert: 'KubeClientErrors' + expression: '(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (cluster, instance, job, namespace) / sum(rate(rest_client_requests_total[5m])) by (cluster, instance, job, namespace)) > 0.01' + for: 'PT15M' + annotations: { + description: 'Kubernetes API server client \'{{ $labels.job }}/{{ $labels.instance }}\' is experiencing {{ $value | humanizePercentage }} errors. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts).' + } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [] + } + { + alert: 'KubePersistentVolumeFillingUp' + expression: 'kubelet_volume_stats_available_bytes{job="kubelet"}/kubelet_volume_stats_capacity_bytes{job="kubelet"} < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1' + for: 'PT60M' + annotations: { + description: 'Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts).' 
+ } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT15M' + } + labels: { + severity: 'warning' + } + actions: [] + } + { + alert: 'KubePersistentVolumeInodesFillingUp' + expression: 'kubelet_volume_stats_inodes_free{job="kubelet"} / kubelet_volume_stats_inodes{job="kubelet"} < 0.03' + for: 'PT15M' + annotations: { + description: 'The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} only has {{ $value | humanizePercentage }} free inodes. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts).' + } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [] + } + { + alert: 'KubePersistentVolumeErrors' + expression: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0' + for: 'PT05M' + annotations: { + description: 'The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts).' + } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [] + } + { + alert: 'KubeContainerWaiting' + expression: 'sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0' + for: 'PT60M' + annotations: { + description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts).' + } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [] + } + { + alert: 'KubeDaemonSetNotScheduled' + expression: 'kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0' + for: 'PT15M' + annotations: { + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts).' + } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [] + } + { + alert: 'KubeDaemonSetMisScheduled' + expression: 'kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0' + for: 'PT15M' + annotations: { + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts).' 
+ } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [] + } + { + alert: 'KubeQuotaAlmostFull' + expression: 'kube_resourcequota{job="kube-state-metrics", type="used"} / ignoring(instance, job, type)(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) > 0.9 < 1' + for: 'PT15M' + annotations: { + description: '{{ $value | humanizePercentage }} usage of {{ $labels.resource }} in namespace {{ $labels.namespace }} in {{ $labels.cluster}}. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts).' + } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [] + } + ] - severity: 3 - targetResourceType: 'microsoft.containerservice/managedclusters' - windowSize: 'PT5M' } } -resource maPodsNotInReadyState 'Microsoft.Insights/metricAlerts@2018-03-01' = { - name: 'Pods not in ready state for ${clusterName} CI-8' - location: 'global' +resource sqrPodFailed 'Microsoft.Insights/scheduledQueryRules@2022-06-15' = { + name: 'PodFailedScheduledQuery' + location: location properties: { autoMitigate: true - actions: [] - criteria: { - allOf: [ - { - criterionType: 'StaticThresholdCriterion' - dimensions: [ - { - name: 'controllerName' - operator: 'Include' - values: [ - '*' - ] - } - { - name: 'kubernetes namespace' - operator: 'Include' - values: [ - '*' - ] - } - ] - metricName: 'PodReadyPercentage' - metricNamespace: 'Insights.Container/pods' - name: 'Metric1' - operator: 'LessThan' - threshold: 80 - timeAggregation: 'Average' - skipMetricValidation: true - } - ] - 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' - } - description: 'This alert monitors for excessive pods not in the ready state.' + displayName: '[${clusterName}] Scheduled Query for Pod Failed Alert' + description: 'Alert on pod Failed phase.' 
+ severity: 3 enabled: true - evaluationFrequency: 'PT1M' scopes: [ - mc.id + logAnalyticsWorkspaceResourceId ] - severity: 3 - targetResourceType: 'microsoft.containerservice/managedclusters' - windowSize: 'PT5M' - } -} - -resource maRestartingContainerCount 'Microsoft.Insights/metricAlerts@2018-03-01' = { - name: 'Restarting container count for ${clusterName} CI-7' - location: 'global' - properties: { - autoMitigate: true - actions: [] + evaluationFrequency: 'PT5M' + windowSize: 'PT10M' criteria: { allOf: [ { - criterionType: 'StaticThresholdCriterion' - dimensions: [ - { - name: 'kubernetes namespace' - operator: 'Include' - values: [ - '*' - ] - } - { - name: 'controllerName' - operator: 'Include' - values: [ - '*' - ] - } - ] - metricName: 'restartingContainerCount' - metricNamespace: 'Insights.Container/pods' - name: 'Metric1' + query: '//https://learn.microsoft.com/azure/azure-monitor/containers/container-insights-log-alerts \r\n let endDateTime = now(); let startDateTime = ago(1h); let trendBinSize = 1m; let clusterName = "${clusterName}"; KubePodInventory | where TimeGenerated < endDateTime | where TimeGenerated >= startDateTime | where ClusterName == clusterName | distinct ClusterName, TimeGenerated | summarize ClusterSnapshotCount = count() by bin(TimeGenerated, trendBinSize), ClusterName | join hint.strategy=broadcast ( KubePodInventory | where TimeGenerated < endDateTime | where TimeGenerated >= startDateTime | distinct ClusterName, Computer, PodUid, TimeGenerated, PodStatus | summarize TotalCount = count(), PendingCount = sumif(1, PodStatus =~ "Pending"), RunningCount = sumif(1, PodStatus =~ "Running"), SucceededCount = sumif(1, PodStatus =~ "Succeeded"), FailedCount = sumif(1, PodStatus =~ "Failed") by ClusterName, bin(TimeGenerated, trendBinSize) ) on ClusterName, TimeGenerated | extend UnknownCount = TotalCount - PendingCount - RunningCount - SucceededCount - FailedCount | project TimeGenerated, TotalCount = todouble(TotalCount) / ClusterSnapshotCount, PendingCount = todouble(PendingCount) / ClusterSnapshotCount, RunningCount = todouble(RunningCount) / ClusterSnapshotCount, SucceededCount = todouble(SucceededCount) / ClusterSnapshotCount, FailedCount = todouble(FailedCount) / ClusterSnapshotCount, UnknownCount = todouble(UnknownCount) / ClusterSnapshotCount| summarize AggregatedValue = avg(FailedCount) by bin(TimeGenerated, trendBinSize)' + metricMeasureColumn: 'AggregatedValue' operator: 'GreaterThan' - threshold: 0 + threshold: 3 timeAggregation: 'Average' - skipMetricValidation: true + failingPeriods: { + minFailingPeriodsToAlert: 2 + numberOfEvaluationPeriods: 2 + } } ] - 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' } - description: 'This alert monitors number of containers restarting across the cluster.' - enabled: true - evaluationFrequency: 'PT1M' - scopes: [ - mc.id - ] - severity: 3 - targetResourceType: 'Microsoft.ContainerService/managedClusters' - windowSize: 'PT1M' } }
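Note on the cluster-stamp.bicep hunk above: only the added amw entry in dependsOn is part of this diff. For readers without the full template open, the sketch below shows roughly how modules/alerts.bicep is consumed from the cluster stamp. The deployment name, the params block, and the la symbol for the Log Analytics workspace are illustrative assumptions (alerts.bicep declares location, clusterName, and logAnalyticsWorkspaceResourceId, but the values the stamp passes are not shown in this diff); sci and amw are the symbols visible in the hunk.

// Sketch only, not part of this change: consuming modules/alerts.bicep from cluster-stamp.bicep.
// The deployment name and params block are assumed; parameter names match those declared in alerts.bicep.
module alerts 'modules/alerts.bicep' = {
  name: 'apply-prometheus-alert-rule-groups' // illustrative deployment name
  params: {
    location: location
    clusterName: clusterName
    logAnalyticsWorkspaceResourceId: la.id // assumed symbol for the Log Analytics workspace in the stamp
  }
  dependsOn: [
    sci // existing dependency already present in cluster-stamp.bicep
    amw // added in this change: alerts.bicep now resolves the existing Azure Monitor workspace
  ]
}

The explicit dependsOn on amw matters because alerts.bicep now references the existing Microsoft.Monitor/accounts resource named 'amw-${mc.name}' and scopes each prometheusRuleGroups resource to both amw.id and mc.id, so the rule groups must not be deployed before that workspace exists.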