feat (monitoring): [alerts] enable new recommended experience for aks…

… clusters (#435) * enable new recommended alert rules * replace legacy Completed job count CI alert w/ KubeJobStale Pod level alert: at least one Job instance did not complete successfully for the last 6 hours. * replace legacy Container CPU % CI alert w/ KubeContainerAverageCPUHigh Pod level alert: The average CPU usage per container exceeds 95% for the last 5 minutes. * replace legacy Container working set memory % CI alert w/ KubeContainerAverageMemoryHigh Pod level alert: The average memory usage per container exceeds 95% for the last 5 minutes * replace legacy Failed Pod counts CI alert w/ KubePodFailedState Pod level alert: One or more pods is in a failed state for the last 5 minutes * disabling legacy Node CPU % CI alert Platform level alert Node cpu percentage is replacing this * disabling legacy Node Disk Usage % CI alert no replacement available * replace legacy Node NotReady status CI alert w/ KubeNodeUnreachable Node level alert: A node has been unreachable for the last 15 minutes * disabling legacy Node working set memory % CI alert Platform level alert Node memory working set percentage is greater than 100% is replacing this * replace legacy OOM Killed Containers CI alert w/ KubeContainerOOMKilledCount Cluster level alert: One or more containers within pods have been killed due to out-of-memory (OOM) events for the last 5 minutes * replace legacy Persistent Volume Usage % CI alert w/ KubePVUsageHigh Pod level alert: The average usage of Persistent Volumes (PVs) on pod exceeds 80% for the last 15 minutes * replace legacy Pods ready % CI alert w/ KubePodReadyStateLow Pod level alert: The percentage of pods in a ready state falls below 80% for any deployment or daemonset in the Kubernetes cluster for the last 5 minutes * replace legacy Restarting container count CI alert w/ KubePodContainerRestart Pod level alert: One or more containers within pods in the Kubernetes cluster have been restarted at least once within the last hour * add extra recommended 
Prometheus Pod level metric alert rules * add extra recommended Prometheus Node level metric alert rules * add extra recommended Prometheus Cluster level metric alert rules * remove unused module * comment out configmap CI alert configuration * Address PR Feedback: remove legacy alerting configuration instead of commenting out
mspnp · Jan 9, 2025 · c200225 · c200225
1 parent 17de3d7
commit c200225
Show file tree

Hide file tree

Showing 3 changed files with 592 additions and 565 deletions.
diff --git a/cluster-manifests/kube-system/container-azm-ms-agentconfig.yaml b/cluster-manifests/kube-system/container-azm-ms-agentconfig.yaml
@@ -124,26 +124,6 @@ data:
       # When this is enabled (enabled = true), persistent volume metrics including those in the kube-system namespace will be collected
       enabled = true
 
-  alertable-metrics-configuration-settings: |-
-    # Alertable metrics configuration settings for container resource utilization
-    [alertable_metrics_configuration_settings.container_resource_utilization_thresholds]
-        # The threshold(Type Float) will be rounded off to 2 decimal points
-        # Threshold for container cpu, metric will be sent only when cpu utilization exceeds or becomes equal to the following percentage
-        container_cpu_threshold_percentage = 90.0
-        # Threshold for container memoryRss, metric will be sent only when memory rss exceeds or becomes equal to the following percentage
-        container_memory_rss_threshold_percentage = 90.0
-        # Threshold for container memoryWorkingSet, metric will be sent only when memory working set exceeds or becomes equal to the following percentage
-        container_memory_working_set_threshold_percentage = 90.0
-
-    # Alertable metrics configuration settings for persistent volume utilization
-    [alertable_metrics_configuration_settings.pv_utilization_thresholds]
-        # Threshold for persistent volume usage bytes, metric will be sent only when persistent volume utilization exceeds or becomes equal to the following percentage
-        pv_usage_threshold_percentage = 75.0
-
-    # Alertable metrics configuration settings for completed jobs count
-    [alertable_metrics_configuration_settings.job_completion_threshold]
-        # Threshold for completed job count , metric will be sent only for those jobs which were completed earlier than the following threshold
-        job_completion_threshold_time_minutes = 360
   integrations: |-
     [integrations.azure_network_policy_manager]
         collect_basic_metrics = false

diff --git a/workload-team/cluster-stamp.bicep b/workload-team/cluster-stamp.bicep
@@ -182,7 +182,7 @@ resource dce 'Microsoft.Insights/dataCollectionEndpoints@2023-03-11' = {
   }
 }
 
-// A data collection rule that collects PrometheusMetrics from pods, nodes and cluster and configure Azure monitor workspace as destination  
+// A data collection rule that collects PrometheusMetrics from pods, nodes and cluster and configure Azure monitor workspace as destination
 resource dcr 'Microsoft.Insights/dataCollectionRules@2023-03-11' = {
   name: 'MSProm-${location}-${clusterName}'
   kind: 'Linux'
@@ -294,6 +294,7 @@ module alerts 'modules/alerts.bicep' = {
   }
   dependsOn: [
     sci
+    amw
   ]
 }
 
@@ -727,7 +728,7 @@ resource mc 'Microsoft.ContainerService/managedClusters@2024-03-02-preview' = {
           // A single * can be provided for each resource to allow any annotations, but this has severe performance implications
           // https://github.com/prometheus-community/helm-charts/blob/e68c764aa6c764ec5934c6812ff0eaa0877ba275/charts/kube-state-metrics/values.yaml#L342
           metricAnnotationsAllowList: ''
-          
+
           // Comma-separated list of more Kubernetes label keys that is used in the resource's kube_resource_labels metric kube_resource_labels metric.
           // For example, kube_pod_labels is the labels metric for the pods resource. By default this metric contains only name and namespace labels.
           // To include more labels, provide a list of resource names in their plural form and Kubernetes label keys that you want to allow for them.