From 53f302d12b03da1e5ac9929bf761cb197ab7a1f7 Mon Sep 17 00:00:00 2001
From: jon4hz
Date: Fri, 25 Oct 2024 20:56:25 +0200
Subject: [PATCH] feat: add support for kubernetes workload maintenance

---
 README.md                      |  25 +++
 galaxy.yml                     |   1 +
 plugins/filter/k8s_workload.py |  40 ++++
 .../README.md                  |   1 +
 .../defaults/main.yml          |  22 +++
 .../tasks/main.yml             | 173 ++++++++++++++++++
 6 files changed, 262 insertions(+)
 create mode 100644 plugins/filter/k8s_workload.py
 create mode 100644 roles/maintenance_63_kubernetes_workload/README.md
 create mode 100644 roles/maintenance_63_kubernetes_workload/defaults/main.yml
 create mode 100644 roles/maintenance_63_kubernetes_workload/tasks/main.yml

diff --git a/README.md b/README.md
index 408a370..40240ca 100644
--- a/README.md
+++ b/README.md
@@ -163,6 +163,31 @@
 changed: [debian01.example.org]
 
 There is also a checklist summarising all tasks that were run but finished with either `ok` or `skipped`.
 
+## Kubernetes Workload Maintenance
+
+### Requirements
+
+Make sure to install the Python `kubernetes` package.
+
+### Authentication
+
+Ansible uses your default Kubernetes config (kubeconfig) for authentication, so make sure it is set up.
+
+### Inventory
+
+Add each namespace you want to run maintenance against as a host in your inventory.
+Make sure to set `ansible_connection: local` whenever you do Kubernetes maintenance.
+
+```ini
+[maintenance_63_kubernetes_workload]
+kube-public
+kube-system
+argocd
+
+[maintenance_63_kubernetes_workload:vars]
+ansible_connection=local
+```
+
 ## Development Setup
 
diff --git a/galaxy.yml b/galaxy.yml
index 832728c..31e1cca 100644
--- a/galaxy.yml
+++ b/galaxy.yml
@@ -43,6 +43,7 @@ dependencies:
   community.general: ">=1.3.6"
   adfinis.facts: ">=1.0.2"
   community.crypto: ">=2.15.0"
+  kubernetes.core: ">=3.2.0"
 
 # The URL of the originating SCM repository
 repository: https://github.com/adfinis/ansible-collection-maintenance
diff --git a/plugins/filter/k8s_workload.py b/plugins/filter/k8s_workload.py
new file mode 100644
index 0000000..e8d825a
--- /dev/null
+++ b/plugins/filter/k8s_workload.py
@@ -0,0 +1,40 @@
+from datetime import datetime
+
+class FilterModule(object):
+    """Ansible custom filters"""
+
+    def filters(self):
+        """Return the custom filters"""
+        return {
+            "k8s_workload_pods_restart_last_days": self._k8s_workload_pods_restart_last_days,
+            "k8s_workload_check_service_type": self._k8s_workload_check_service_type
+        }
+
+    def _k8s_workload_pods_restart_last_days(self, pods, x_days):
+        if not pods:
+            return []
+        restarted_pods = []
+        for pod in pods:
+            for status in pod.get('containerStatuses', []):
+                started_at = datetime.fromisoformat(status.get('startedAt').replace('Z', '+00:00'))
+                if (datetime.now(started_at.tzinfo) - started_at).days < x_days:
+                    restarted_pods.append({
+                        "name": pod.get('name'),
+                        "started_at": started_at.strftime("%Y-%m-%d %H:%M:%S"),
+                        "restarts": status.get('restartCount')
+                    })
+        return restarted_pods
+
+    def _k8s_workload_check_service_type(self, services, allowed_types):
+        if not services:
+            return []
+        faulty_services = []
+        for service in services:
+            allowed_type = allowed_types.get(service.get('name'))
+            if service.get('type') != allowed_type:
+                faulty_services.append({
+                    "name": service.get('name'),
+                    "type": service.get('type'),
+                    "allowed_type": allowed_type
+                })
+        return faulty_services
diff --git a/roles/maintenance_63_kubernetes_workload/README.md b/roles/maintenance_63_kubernetes_workload/README.md
new file mode 100644
index 0000000..d14b66e
--- /dev/null
+++ b/roles/maintenance_63_kubernetes_workload/README.md
@@ -0,0 +1 @@
+# Ansible Role adfinis.maintenance.maintenance_63_kubernetes_workload
diff --git a/roles/maintenance_63_kubernetes_workload/defaults/main.yml b/roles/maintenance_63_kubernetes_workload/defaults/main.yml
new file mode 100644
index 0000000..8961416
--- /dev/null
+++ b/roles/maintenance_63_kubernetes_workload/defaults/main.yml
@@ -0,0 +1,22 @@
+---
+
+maintenance_global_exclude_tasks: []
+maintenance_host_exclude_tasks: []
+maintenance_exclude_tasks: "{{ maintenance_global_exclude_tasks + maintenance_host_exclude_tasks }}"
+
+# Define defaults which can be overridden on a host-by-host basis
+
+# the namespace in which the workload is running
+k8s_workload_namespace: "{{ inventory_hostname }}"
+
+# the service types which are allowed (if not ClusterIP)
+k8s_workload_allowed_service_types: {}
+# netbox: LoadBalancer
+
+# the deployments which should be highly available
+k8s_workload_ha_deployments: []
+# - netbox
+
+# the statefulsets which should be highly available
+k8s_workload_ha_statefulsets: []
+# - netbox-redis-replicas
diff --git a/roles/maintenance_63_kubernetes_workload/tasks/main.yml b/roles/maintenance_63_kubernetes_workload/tasks/main.yml
new file mode 100644
index 0000000..9d1756c
--- /dev/null
+++ b/roles/maintenance_63_kubernetes_workload/tasks/main.yml
@@ -0,0 +1,173 @@
+---
+
+- name: This task only serves as a template for the tasks below
+  ansible.builtin.set_fact:
+    ignoreme: &task
+      name: "{{ vars.taskid }}: {{ vars.name }}"
+      register: task
+      when:
+        - "vars.taskid not in maintenance_exclude_tasks"
+        - "maintenance_only is not defined or maintenance_only == vars.taskid"
+  vars:
+    taskid: ignoreme
+    name: bar
+
+- <<: *task
+  vars:
+    taskid: 63-004
+    name: "Health: Check that all service related Pods are in a healthy (Running, Completed) state | Gather all infos"
+  kubernetes.core.k8s_info:
+    kind: Pod
+    namespace: "{{ k8s_workload_namespace }}"
+  register: k8s_pods
+  changed_when: false
+
+- <<: *task
+  vars:
+    taskid: 63-004
+    name: "Health: Check that all service related Pods are in a healthy (Running, Completed) state | Report unhealthy Pods"
+    unhealthy_pods: "{{ k8s_pods | json_query(\"resources[?status.phase!='Running' && status.phase!='Succeeded']\") }}"
+  ansible.builtin.debug:
+    msg: |
+      Unhealthy Pods:
+      {{ unhealthy_pods | json_query("[].metadata.name") | to_nice_json }}
+  changed_when: unhealthy_pods | length > 0
+
+- <<: *task
+  vars:
+    taskid: 63-006
+    name: "Health: Check if a container of a service related Pod got recently restarted, e.g. OOMKilled | Gather all infos"
+  kubernetes.core.k8s_info:
+    kind: Pod
+    namespace: "{{ k8s_workload_namespace }}"
+  register: k8s_pods
+  changed_when: false
+
+- <<: *task
+  vars:
+    taskid: 63-006
+    name: "Health: Check if a container of a service related Pod got recently restarted, e.g. OOMKilled | Report restarted Pods"
+    pods_restarted: "{{ k8s_pods | json_query('resources[?status.containerStatuses[?restartCount > `0`]].{name: metadata.name, containerStatuses: status.containerStatuses[].{restartCount: restartCount, startedAt: (state.*.startedAt)[0]}}') }}"  # noqa: yaml[line-length]
+    pods_restarted_last_1d: "{{ pods_restarted | adfinis.maintenance.k8s_workload_pods_restart_last_days(1) }}"
+  ansible.builtin.debug:
+    var: pods_restarted_last_1d
+  changed_when: pods_restarted_last_1d | length > 0
+
+- <<: *task
+  vars:
+    taskid: 63-012
+    name: "Config: Check if all Services are configured as ClusterIP unless necessary | Gather all infos"
+  kubernetes.core.k8s_info:
+    kind: Service
+    namespace: "{{ k8s_workload_namespace }}"
+  register: k8s_services
+  changed_when: false
+
+- <<: *task
+  vars:
+    taskid: 63-012
+    name: "Config: Check if all Services are configured as ClusterIP unless necessary | Check for Services with wrong type"
+    non_cluster_ip_services: "{{ k8s_services | json_query('resources[?spec.type!=`ClusterIP`].{name: metadata.name, type: spec.type}') }}"
+    wrong_type_services: "{{ non_cluster_ip_services | adfinis.maintenance.k8s_workload_check_service_type(k8s_workload_allowed_service_types) }}"
+  ansible.builtin.debug:
+    var: wrong_type_services
+  changed_when: wrong_type_services | length > 0
+
+- <<: *task
+  vars:
+    taskid: 63-013
+    name: "Config: Check that the PullPolicy is not configured to Always | Gather all infos"
+  kubernetes.core.k8s_info:
+    kind: Pod
+    namespace: "{{ k8s_workload_namespace }}"
+  register: k8s_pods
+  changed_when: false
+
+- <<: *task
+  vars:
+    taskid: 63-013
+    name: "Config: Check that the PullPolicy is not configured to Always | Report all faulty PullPolicies"
+    pull_policy_always: "{{ k8s_pods | json_query('resources[?spec.containers[?imagePullPolicy == `Always`] || spec.initContainers[?imagePullPolicy == `Always`]].{name: metadata.name}') }}"  # noqa: yaml[line-length]
+  ansible.builtin.debug:
+    var: pull_policy_always
+  changed_when: pull_policy_always | length > 0
+
+- <<: *task
+  vars:
+    taskid: 63-014
+    name: "Config: Check that the Ingress class is configured with the IngressClass attribute and not as annotation | Gather all infos"
+  kubernetes.core.k8s_info:
+    kind: Ingress
+    namespace: "{{ k8s_workload_namespace }}"
+  register: k8s_ingresses
+  changed_when: false
+
+- <<: *task
+  vars:
+    taskid: 63-014
+    name: "Config: Check that the Ingress class is configured with the IngressClass attribute and not as annotation | Report all faulty Ingresses"
+    ingresses_with_annotation: "{{ k8s_ingresses | json_query('resources[?metadata.annotations.\"kubernetes.io/ingress.class\"].metadata.name') }}"  # noqa: yaml[line-length]
+  ansible.builtin.debug:
+    var: ingresses_with_annotation
+  changed_when: ingresses_with_annotation | length > 0
+
+- <<: *task
+  vars:
+    taskid: 63-015
+    name: "Config: For HA deployments, check if replicas >= 2 for all relevant Deployments/StatefulSets | Gather all infos (Deployments)"
+  kubernetes.core.k8s_info:
+    kind: Deployment
+    namespace: "{{ k8s_workload_namespace }}"
+  register: k8s_deployments
+  changed_when: false
+
+- <<: *task
+  vars:
+    taskid: 63-015
+    name: "Config: For HA deployments, check if replicas >= 2 for all relevant Deployments/StatefulSets | Gather all infos (StatefulSets)"
+  kubernetes.core.k8s_info:
+    kind: StatefulSet
+    namespace: "{{ k8s_workload_namespace }}"
+  register: k8s_statefulsets
+  changed_when: false
+
+- <<: *task
+  vars:
+    taskid: 63-015
+    name: "Config: For HA deployments, check if replicas >= 2 for all relevant Deployments/StatefulSets | Report Deployments"
+    low_replica_deployments: "{{ k8s_deployments | json_query('resources[?spec.replicas < `2`].metadata.name') }}"
+  ansible.builtin.debug:
+    var: item
+  changed_when: true
+  loop: "{{ k8s_workload_ha_deployments }}"
+  when: "item in low_replica_deployments"
+
+- <<: *task
+  vars:
+    taskid: 63-015
+    name: "Config: For HA deployments, check if replicas >= 2 for all relevant Deployments/StatefulSets | Report StatefulSets"
+    low_replica_statefulsets: "{{ k8s_statefulsets | json_query('resources[?spec.replicas < `2`].metadata.name') }}"
+  ansible.builtin.debug:
+    var: item
+  changed_when: true
+  loop: "{{ k8s_workload_ha_statefulsets }}"
+  when: "item in low_replica_statefulsets"
+
+- <<: *task
+  vars:
+    taskid: 63-018
+    name: "Config: For HA deployments, check if the HorizontalPodAutoscaler cannot scale below 2 replicas | Gather all infos"
+  kubernetes.core.k8s_info:
+    kind: HorizontalPodAutoscaler
+    namespace: "{{ k8s_workload_namespace }}"
+  register: k8s_hpas
+  changed_when: false
+
+- <<: *task
+  vars:
+    taskid: 63-018
+    name: "Config: For HA deployments, check if the HorizontalPodAutoscaler cannot scale below 2 replicas | Report HPAs"
+    low_replica_hpas: "{{ k8s_hpas | json_query('resources[?spec.minReplicas < `2`].metadata.name') }}"
+  ansible.builtin.debug:
+    var: low_replica_hpas
+  changed_when: low_replica_hpas | length > 0
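
For a quick local sanity check of the new filter plugin outside of Ansible, the two filters can be exercised directly with sample data shaped like the output of the `json_query` expressions in `roles/maintenance_63_kubernetes_workload/tasks/main.yml`. This is only a sketch: it assumes it is run from the collection root, and the pod and service names below are made up for illustration.

```python
# Sketch: exercise the custom filters from plugins/filter/k8s_workload.py
# with hand-crafted data. Assumes the script is run from the collection root;
# all names below are made up.
import importlib.util
from datetime import datetime, timedelta, timezone

spec = importlib.util.spec_from_file_location("k8s_workload", "plugins/filter/k8s_workload.py")
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
filters = module.FilterModule().filters()

# Shape produced by the 63-006 json_query: pod name plus containerStatuses
# with restartCount and startedAt.
pods = [
    {
        "name": "netbox-abc123",
        "containerStatuses": [
            {
                "restartCount": 3,
                "startedAt": (datetime.now(timezone.utc) - timedelta(hours=6)).isoformat(),
            }
        ],
    }
]
# Restarted within the last day, so the pod is reported.
print(filters["k8s_workload_pods_restart_last_days"](pods, 1))

# Shape produced by the 63-012 json_query: non-ClusterIP services with name and type.
services = [{"name": "netbox", "type": "LoadBalancer"}]
# LoadBalancer is explicitly allowed for "netbox", so nothing is reported.
print(filters["k8s_workload_check_service_type"](services, {"netbox": "LoadBalancer"}))
# With no exception configured, the service is reported as having a wrong type.
print(filters["k8s_workload_check_service_type"](services, {}))
```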