feat: add support for kubernetes workload maintenance
jon4hz committed Oct 28, 2024
1 parent da2355a commit 53f302d
Showing 6 changed files with 262 additions and 0 deletions.
25 changes: 25 additions & 0 deletions README.md
@@ -163,6 +163,31 @@ changed: [debian01.example.org]

There is also a checklist summarising all tasks that were run but finished with either `ok` or `skipped`.

## Kubernetes Workload Maintenance

### Requirements

Make sure the Python `kubernetes` package is installed on the Ansible control node.
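
On the control node this is typically a `pip install kubernetes`; if you manage the control node with Ansible itself, a minimal sketch could look like this:

```yaml
# Sketch: install the Python client library used by the
# kubernetes.core modules on the control node.
- hosts: localhost
  tasks:
    - name: Install the Python kubernetes package
      ansible.builtin.pip:
        name: kubernetes
```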

### Authentication

Ansible uses your default Kubernetes config (kubeconfig, usually `~/.kube/config`) for authentication, so make sure it is set up.
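
If your kubeconfig is not in the default location, one option (a sketch, not the only way) is to point the `kubernetes.core` modules at it via the `K8S_AUTH_KUBECONFIG` environment variable at play level:

```yaml
# Sketch: smoke-test cluster access with a non-default kubeconfig.
# The path below is a placeholder; K8S_AUTH_KUBECONFIG is honored
# by the kubernetes.core modules.
- hosts: localhost
  environment:
    K8S_AUTH_KUBECONFIG: /path/to/kubeconfig
  tasks:
    - name: List namespaces to verify authentication
      kubernetes.core.k8s_info:
        kind: Namespace
```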

### Inventory

Add each namespace you want to run maintenance against as a host in your inventory.
Make sure to set `ansible_connection: local` whenever you run Kubernetes maintenance.

```ini
[maintenance_63_kubernetes_workload]
kube-public
kube-system
argocd
[maintenance_63_kubernetes_workload:vars]
ansible_connection=local
```
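
A minimal playbook sketch (the play name is illustrative) that applies the role to every namespace in this group:

```yaml
# Sketch: run the Kubernetes workload maintenance checks against
# all namespaces listed in the inventory group above.
- name: Kubernetes workload maintenance
  hosts: maintenance_63_kubernetes_workload
  roles:
    - adfinis.maintenance.maintenance_63_kubernetes_workload
```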
## Development Setup

1 change: 1 addition & 0 deletions galaxy.yml
@@ -43,6 +43,7 @@ dependencies:
  community.general: ">=1.3.6"
  adfinis.facts: ">=1.0.2"
  community.crypto: ">=2.15.0"
  kubernetes.core: ">=3.2.0"

# The URL of the originating SCM repository
repository: https://github.com/adfinis/ansible-collection-maintenance
40 changes: 40 additions & 0 deletions plugins/filter/k8s_workload.py
@@ -0,0 +1,40 @@
from datetime import datetime


class FilterModule(object):
    """Ansible custom filters for the Kubernetes workload maintenance role"""

    def filters(self):
        """Return the custom filters"""
        return {
            "k8s_workload_pods_restart_last_days": self._k8s_workload_pods_restart_last_days,
            "k8s_workload_check_service_type": self._k8s_workload_check_service_type
        }

    def _k8s_workload_pods_restart_last_days(self, pods, x_days):
        """Return pods with a container that (re)started within the last x_days days."""
        if not pods:
            return []
        restarted_pods = []
        for pod in pods:
            for status in pod.get('containerStatuses', []):
                raw_started_at = status.get('startedAt')
                if not raw_started_at:
                    # Container has no start timestamp yet (e.g. still waiting).
                    continue
                # Kubernetes returns RFC 3339 timestamps with a trailing "Z",
                # which datetime.fromisoformat() only accepts from Python 3.11.
                started_at = datetime.fromisoformat(raw_started_at.replace('Z', '+00:00'))
                if (datetime.now(started_at.tzinfo) - started_at).days < x_days:
                    restarted_pods.append({
                        "name": pod.get('name'),
                        "started_at": started_at.strftime("%Y-%m-%d %H:%M:%S"),
                        "restarts": status.get('restartCount')
                    })
        return restarted_pods

    def _k8s_workload_check_service_type(self, services, allowed_types):
        """Return services whose type does not match the allowed type for their name."""
        if not services:
            return []
        faulty_services = []
        for service in services:
            allowed_type = allowed_types.get(service.get('name'))
            if service.get('type') != allowed_type:
                faulty_services.append({
                    "name": service.get('name'),
                    "type": service.get('type'),
                    "allowed_type": allowed_type
                })
        return faulty_services
1 change: 1 addition & 0 deletions roles/maintenance_63_kubernetes_workload/README.md
@@ -0,0 +1 @@
# Ansible Role adfinis.maintenance.maintenance_63_kubernetes_workload
22 changes: 22 additions & 0 deletions roles/maintenance_63_kubernetes_workload/defaults/main.yml
@@ -0,0 +1,22 @@
---

maintenance_global_exclude_tasks: []
maintenance_host_exclude_tasks: []
maintenance_exclude_tasks: "{{ maintenance_global_exclude_tasks + maintenance_host_exclude_tasks }}"
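# Example (hypothetical host_vars entry): skip a single check for one
# namespace, e.g. the PullPolicy check:
# maintenance_host_exclude_tasks:
#   - 63-013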

# Define defaults which can be overridden on a host-by-host basis

# the namespace in which the workload is running
k8s_workload_namespace: "{{ inventory_hostname }}"

# the service types which are allowed (if not ClusterIP)
k8s_workload_allowed_service_types: {}
# netbox: LoadBalancer

# the deployments which should be highly available
k8s_workload_ha_deployments: []
# - netbox

# the statefulsets which should be highly available
k8s_workload_ha_statefulsets: []
# - netbox-redis-replicas
173 changes: 173 additions & 0 deletions roles/maintenance_63_kubernetes_workload/tasks/main.yml
@@ -0,0 +1,173 @@
---

# Each task below merges this template via the YAML anchor ("<<: *task")
# and inherits its name, register and when attributes.
- name: This task only serves as a template for the tasks below
  ansible.builtin.set_fact:
    ignoreme: &task
      name: "{{ vars.taskid }}: {{ vars.name }}"
      register: task
      when:
        - "vars.taskid not in maintenance_exclude_tasks"
        - "maintenance_only is not defined or maintenance_only == vars.taskid"
  vars:
    taskid: ignoreme
    name: bar

- <<: *task
  vars:
    taskid: 63-004
    name: "Health: Check that all service-related Pods are in a healthy (Running, Completed) state | Gather all infos"
  kubernetes.core.k8s_info:
    kind: Pod
    namespace: "{{ k8s_workload_namespace }}"
  register: k8s_pods
  changed_when: false

- <<: *task
  vars:
    taskid: 63-004
    name: "Health: Check that all service-related Pods are in a healthy (Running, Completed) state | Report unhealthy Pods"
    unhealthy_pods: "{{ k8s_pods | json_query(\"resources[?status.phase!='Running' && status.phase!='Succeeded']\") }}"
  ansible.builtin.debug:
    msg: |
      Unhealthy Pods:
      {{ unhealthy_pods | json_query("[].metadata.name") | to_nice_json }}
  changed_when: unhealthy_pods | length > 0

- <<: *task
  vars:
    taskid: 63-006
    name: "Health: Check if a container of a service-related Pod was recently restarted, e.g. OOMKilled | Gather all infos"
  kubernetes.core.k8s_info:
    kind: Pod
    namespace: "{{ k8s_workload_namespace }}"
  register: k8s_pods
  changed_when: false

- <<: *task
  vars:
    taskid: 63-006
    name: "Health: Check if a container of a service-related Pod was recently restarted, e.g. OOMKilled | Report restarted Pods"
    pods_restarted: "{{ k8s_pods | json_query('resources[?status.containerStatuses[?restartCount > `0`]].{name: metadata.name, containerStatuses: status.containerStatuses[].{restartCount: restartCount, startedAt: (state.*.startedAt)[0]}}') }}" # noqa: yaml[line-length]
    pods_restarted_last_1d: "{{ pods_restarted | adfinis.maintenance.k8s_workload_pods_restart_last_days(1) }}"
  ansible.builtin.debug:
    var: pods_restarted_last_1d
  changed_when: pods_restarted_last_1d | length > 0

- <<: *task
  vars:
    taskid: 63-012
    name: "Config: Check if all Services are configured as ClusterIP unless necessary | Gather all infos"
  kubernetes.core.k8s_info:
    kind: Service
    namespace: "{{ k8s_workload_namespace }}"
  register: k8s_services
  changed_when: false

- <<: *task
  vars:
    taskid: 63-012
    name: "Config: Check if all Services are configured as ClusterIP unless necessary | Check for Services with wrong type"
    non_cluster_ip_services: "{{ k8s_services | json_query('resources[?spec.type!=`ClusterIP`].{name: metadata.name, type: spec.type}') }}"
    wrong_type_services: "{{ non_cluster_ip_services | adfinis.maintenance.k8s_workload_check_service_type(k8s_workload_allowed_service_types) }}"
  ansible.builtin.debug:
    var: wrong_type_services
  changed_when: wrong_type_services | length > 0

- <<: *task
  vars:
    taskid: 63-013
    name: "Config: Check that the PullPolicy is not configured to Always | Gather all infos"
  kubernetes.core.k8s_info:
    kind: Pod
    namespace: "{{ k8s_workload_namespace }}"
  register: k8s_pods
  changed_when: false

- <<: *task
  vars:
    taskid: 63-013
    name: "Config: Check that the PullPolicy is not configured to Always | Report all faulty PullPolicies"
    pull_policy_always: "{{ k8s_pods | json_query('resources[?spec.containers[?imagePullPolicy == `Always`] || spec.initContainers[?imagePullPolicy == `Always`]].{name: metadata.name}') }}" # noqa: yaml[line-length]
  ansible.builtin.debug:
    var: pull_policy_always
  changed_when: pull_policy_always | length > 0

- <<: *task
  vars:
    taskid: 63-014
    name: "Config: Check that the Ingress class is configured with the IngressClass attribute and not as annotation | Gather all infos"
  kubernetes.core.k8s_info:
    kind: Ingress
    namespace: "{{ k8s_workload_namespace }}"
  register: k8s_ingresses
  changed_when: false

- <<: *task
  vars:
    taskid: 63-014
    name: "Config: Check that the Ingress class is configured with the IngressClass attribute and not as annotation | Report all faulty Ingresses"
    ingresses_with_annotation: "{{ k8s_ingresses | json_query('resources[?metadata.annotations.\"kubernetes.io/ingress.class\"].metadata.name') }}" # noqa: yaml[line-length]
  ansible.builtin.debug:
    var: ingresses_with_annotation
  changed_when: ingresses_with_annotation | length > 0

- <<: *task
  vars:
    taskid: 63-015
    name: "Config: For HA deployments, check if replicas >= 2 for all relevant Deployments/StatefulSets | Gather all infos"
  kubernetes.core.k8s_info:
    kind: Deployment
    namespace: "{{ k8s_workload_namespace }}"
  register: k8s_deployments
  changed_when: false

- <<: *task
  vars:
    taskid: 63-015
    name: "Config: For HA deployments, check if replicas >= 2 for all relevant Deployments/StatefulSets | Gather all infos"
  kubernetes.core.k8s_info:
    kind: StatefulSet
    namespace: "{{ k8s_workload_namespace }}"
  register: k8s_statefulsets
  changed_when: false

- <<: *task
  vars:
    taskid: 63-015
    name: "Config: For HA deployments, check if replicas >= 2 for all relevant Deployments/StatefulSets | Report Deployments"
    low_replica_deployments: "{{ k8s_deployments | json_query('resources[?spec.replicas < `2`].metadata.name') }}"
  ansible.builtin.debug:
    var: item
  changed_when: true
  loop: "{{ k8s_workload_ha_deployments }}"
  when: "item in low_replica_deployments"

- <<: *task
  vars:
    taskid: 63-015
    name: "Config: For HA deployments, check if replicas >= 2 for all relevant Deployments/StatefulSets | Report StatefulSets"
    low_replica_statefulsets: "{{ k8s_statefulsets | json_query('resources[?spec.replicas < `2`].metadata.name') }}"
  ansible.builtin.debug:
    var: item
  changed_when: true
  loop: "{{ k8s_workload_ha_statefulsets }}"
  when: "item in low_replica_statefulsets"

- <<: *task
  vars:
    taskid: 63-018
    name: "Config: For HA deployments, check that the HorizontalPodAutoscaler cannot scale below 2 replicas | Gather all infos"
  kubernetes.core.k8s_info:
    kind: HorizontalPodAutoscaler
    namespace: "{{ k8s_workload_namespace }}"
  register: k8s_hpas
  changed_when: false

- <<: *task
  vars:
    taskid: 63-018
    name: "Config: For HA deployments, check that the HorizontalPodAutoscaler cannot scale below 2 replicas | Report HPAs"
    low_replica_hpas: "{{ k8s_hpas | json_query('resources[?spec.minReplicas < `2`].metadata.name') }}"
  ansible.builtin.debug:
    var: low_replica_hpas
  changed_when: low_replica_hpas | length > 0
