feat: add support for kubernetes workload maintenance #77
base: main
Changes from all commits: 53f302d, a382b36, ffceea7, 4bf5188, ad4cc2c
@@ -0,0 +1,74 @@
from datetime import datetime


class FilterModule(object):
    """Ansible custom filters"""

    def filters(self):
        """Return the custom filters"""
        return {
            "k8s_workload_pods_unhealthy": self._k8s_workload_pods_unhealthy,
            "k8s_workload_pods_restart_last_days": self._k8s_workload_pods_restart_last_days,
            "k8s_workload_check_service_type": self._k8s_workload_check_service_type,
            "k8s_workload_check_low_replicas": self._k8s_workload_check_low_replicas
        }

    def _k8s_workload_pods_unhealthy(self, pods):
        """Return the names of pods with containers that are neither Running nor terminated as Completed"""
        if not pods:
            return []
        unhealthy_pods = []
        for pod in pods:
            for status in pod.get('status').get('containerStatuses', []):
                if status.get('state') and status.get('state').get('running'):
                    continue
                if status.get('state') and status.get('state').get('terminated'):
                    if status['state']['terminated'].get('reason') == 'Completed':
                        continue
                unhealthy_pods.append(pod.get('metadata').get('name'))
        return unhealthy_pods

    def _k8s_workload_pods_restart_last_days(self, pods, x_days):
        """Return pods whose containers were (re)started within the last x_days days"""
        if not pods:
            return []
        restarted_pods = []
        for pod in pods:
            for status in pod.get('containerStatuses', []):
                if not status.get('startedAt'):
                    # Containers without a startedAt timestamp cannot be dated;
                    # only report them if they have restarted at least once.
                    if status.get('restartCount', 0) > 0:
                        restarted_pods.append({
                            "name": pod.get('name'),
                            "started_at": "unknown",
                            "restarts": status.get('restartCount')
                        })
                    continue
                # Kubernetes timestamps are RFC 3339 with a trailing "Z", which
                # datetime.fromisoformat() only accepts natively from Python 3.11 onwards.
                started_at = datetime.fromisoformat(status.get('startedAt').replace('Z', '+00:00'))
                if (datetime.now(started_at.tzinfo) - started_at).days < x_days:
                    restarted_pods.append({
                        "name": pod.get('name'),
                        "started_at": started_at.strftime("%Y-%m-%d %H:%M:%S"),
                        "restarts": status.get('restartCount')
                    })
        return restarted_pods

    def _k8s_workload_check_service_type(self, services, allowed_types):
        """Return services whose type does not match the type explicitly allowed for them"""
        if not services:
            return []
        faulty_service = []
        for service in services:
            allowed_type = allowed_types.get(service.get('name'))
            if service.get('type') != allowed_type:
                faulty_service.append({
                    "name": service.get('name'),
                    "type": service.get('type'),
                    "allowed_type": allowed_type
                })
        return faulty_service

    def _k8s_workload_check_low_replicas(self, resources, ha_resources, min_replicas):
        """Return the names of HA resources configured with fewer than min_replicas replicas"""
        if not resources:
            return []
        low_replicas = []
        for resource in resources:
            if resource.get('metadata').get('name') not in ha_resources:
                continue
            if resource.get('spec').get('replicas') < min_replicas:
                low_replicas.append(resource.get('metadata').get('name'))
        return low_replicas
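For context, a minimal sketch of how these filters are intended to be invoked from a task; it mirrors the usage in tasks/main.yml further below, with the pod list coming from a prior kubernetes.core.k8s_info call registered as k8s_pods:

- name: Report unhealthy Pods (illustrative filter usage)
  vars:
    unhealthy_pods: "{{ k8s_pods.resources | adfinis.maintenance.k8s_workload_pods_unhealthy }}"
  ansible.builtin.debug:
    var: unhealthy_pods
  changed_when: unhealthy_pods | length > 0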
@@ -0,0 +1 @@
# Ansible Role adfinis.maintenance.maintenance_63_kubernetes_workload

Review comment: no docs?

Reply: well... none of the other roles have any docs in their README, so I didn't really want to break the pattern :)
@@ -0,0 +1,22 @@ | ||
--- | ||
|
||
maintenance_global_exclude_tasks: [] | ||
maintenance_host_exclude_tasks: [] | ||
maintenance_exclude_tasks: "{{ maintenance_global_exclude_tasks + maintenance_host_exclude_tasks }}" | ||
|
||
# Define defaults which can be overriden on a host-by-host basis | ||
|
||
# the namespace in which the workload is running | ||
k8s_workload_namespace: "{{ inventory_hostname }}" | ||
|
||
# the service types which are allowed (if not ClusterIP) | ||
k8s_workload_allowed_service_types: {} | ||
# netbox: LoadBalancer | ||
|
||
# the deployments which should be highly available | ||
k8s_workload_ha_deployments: [] | ||
# - netbox | ||
|
||
# the statefulsets which should be highly available | ||
k8s_workload_ha_statefulsets: [] | ||
# - netbox-redis-replicas |
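A hypothetical host_vars sketch of how these defaults might be overridden for a single workload; the netbox names and values are illustrative (taken from the commented examples above), and 63-013 is just one of the task IDs defined by this role:

# host_vars/netbox.yml (hypothetical example)
k8s_workload_namespace: netbox
k8s_workload_allowed_service_types:
  netbox: LoadBalancer
k8s_workload_ha_deployments:
  - netbox
k8s_workload_ha_statefulsets:
  - netbox-redis-replicas
# skip the PullPolicy check for this host
maintenance_host_exclude_tasks:
  - "63-013"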
@@ -0,0 +1,181 @@
---

- name: This task only serves as a template for the tasks below
  ansible.builtin.set_fact:
    ignoreme: &task
      name: "{{ vars.taskid }}: {{ vars.name }}"
      register: task
      when:
        - "vars.taskid not in maintenance_exclude_tasks"
        - "maintenance_only is not defined or maintenance_only == vars.taskid"
  vars:
    taskid: ignoreme
    name: bar

Review comment: what the ansible? are we sure there isn't an idiomatic way to do this? e.g. if it's just about the ...

Reply: Nothing in this collection is idiomatic, because it's using Ansible for something that Ansible shouldn't be used for. See https://github.com/adfinis/ansible-collection-maintenance/blob/main/README.md?plain=1#L10-L18

Reply: oh my

Reply: can you point me towards some documentation on the reasoning for using anchors and aliases?

Reply: I don't think there is any documentation regarding this. From what I saw, it's important that every task has an ID and name as variable, because the callback plugin expects this name and a simple when condition so you can skip each task based on its ID. And generally speaking, I just followed the pattern from the other roles. It probably isn't very idiomatic, but as @s3lph already said, nothing in this collection is really idiomatic...
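For illustration only (not part of the diff): a minimal sketch of what one of the aliased tasks below effectively looks like once YAML merges the &task anchor, using the 63-004 gather task as the concrete example:

# A task written as "- <<: *task" with taskid 63-004 effectively expands to:
- name: "{{ vars.taskid }}: {{ vars.name }}"   # rendered as "63-004: Health: ..."
  kubernetes.core.k8s_info:
    kind: Pod
    namespace: "{{ k8s_workload_namespace }}"
  register: k8s_pods                           # the explicit register overrides the anchor's "register: task"
  when:
    - "vars.taskid not in maintenance_exclude_tasks"
    - "maintenance_only is not defined or maintenance_only == vars.taskid"
  vars:
    taskid: 63-004
    name: "Health: Check that all service related Pods are in a healthy (Running, Completed) state | Gather all infos"

Given that when condition, running only a single check should be possible with something like `-e maintenance_only=63-004`, and a check can be skipped host-wide by listing its ID in maintenance_host_exclude_tasks.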
- name: Fetch namespace information
  kubernetes.core.k8s_info:
    kind: Namespace
    name: "{{ k8s_workload_namespace }}"
  register: k8s_namespace
  changed_when: false

- name: Make sure namespace exists
  ansible.builtin.fail:
    msg: "Namespace '{{ k8s_workload_namespace }}' does not exist"
  when: k8s_namespace.resources | length == 0

- <<: *task
  vars:
    taskid: 63-004
    name: "Health: Check that all service related Pods are in a healthy (Running, Completed) state | Gather all infos"
  kubernetes.core.k8s_info:
    kind: Pod
    namespace: "{{ k8s_workload_namespace }}"
  register: k8s_pods
  changed_when: false

- <<: *task
  vars:
    taskid: 63-004
    name: "Health: Check that all service related Pods are in a healthy (Running, Completed) state | Report unhealthy Pods"
    unhealthy_pods: "{{ k8s_pods.resources | adfinis.maintenance.k8s_workload_pods_unhealthy }}"
  ansible.builtin.debug:
    msg: |
      Unhealthy Pods:
      {{ unhealthy_pods | to_nice_json }}
  changed_when: unhealthy_pods | length > 0
- <<: *task
  vars:
    taskid: 63-006
    name: "Health: Check if a container of a service related Pod got recently restarted, e.g. OOMKilled | Gather all infos"
  kubernetes.core.k8s_info:
    kind: Pod
    namespace: "{{ k8s_workload_namespace }}"
  register: k8s_pods
  changed_when: false

- <<: *task
  vars:
    taskid: 63-006
    name: "Health: Check if a container of a service related Pod got recently restarted, e.g. OOMKilled | Report restarted Pods"
    pods_restarted: "{{ k8s_pods | json_query('resources[?status.containerStatuses[?restartCount > `0`]].{name: metadata.name, containerStatuses: status.containerStatuses[].{restartCount: restartCount, startedAt: (state.*.startedAt)[0]}}') }}"  # noqa: yaml[line-length]
    pods_restarted_last_1d: "{{ pods_restarted | adfinis.maintenance.k8s_workload_pods_restart_last_days(1) }}"
  ansible.builtin.debug:
    var: pods_restarted_last_1d
  changed_when: pods_restarted_last_1d | length > 0
- <<: *task
  vars:
    taskid: 63-012
    name: "Config: Check if all Services are configured as ClusterIP unless necessary | Gather all infos"
  kubernetes.core.k8s_info:
    kind: Service
    namespace: "{{ k8s_workload_namespace }}"
  register: k8s_services
  changed_when: false

- <<: *task
  vars:
    taskid: 63-012
    name: "Config: Check if all Services are configured as ClusterIP unless necessary | Check for Services with wrong type"
    none_cluster_ip_services: "{{ k8s_services | json_query('resources[?spec.type!=`ClusterIP`].{name: metadata.name, type: spec.type}') }}"
    wrong_type_services: "{{ none_cluster_ip_services | adfinis.maintenance.k8s_workload_check_service_type(k8s_workload_allowed_service_types) }}"
  ansible.builtin.debug:
    var: wrong_type_services
  changed_when: wrong_type_services | length > 0

- <<: *task
  vars:
    taskid: 63-013
    name: "Config: Check that the PullPolicy is not configured to Always | Gather all infos"
  kubernetes.core.k8s_info:
    kind: Pod
    namespace: "{{ k8s_workload_namespace }}"
  register: k8s_pods
  changed_when: false

- <<: *task
  vars:
    taskid: 63-013
    name: "Config: Check that the PullPolicy is not configured to Always | Report all faulty PullPolicies"
    pull_policy_always: "{{ k8s_pods | json_query('resources[?spec.containers[?imagePullPolicy == `Always`] || spec.initContainers[?imagePullPolicy == `Always`]].{name: metadata.name}') }}"  # noqa: yaml[line-length]
  ansible.builtin.debug:
    var: pull_policy_always
  changed_when: pull_policy_always | length > 0

- <<: *task
  vars:
    taskid: 63-014
    name: "Config: Check that the Ingress class is configured with the IngressClass attribute and not as annotation | Gather all infos"
  kubernetes.core.k8s_info:
    kind: Ingress
    namespace: "{{ k8s_workload_namespace }}"
  register: k8s_ingresses
  changed_when: false

- <<: *task
  vars:
    taskid: 63-014
    name: "Config: Check that the Ingress class is configured with the IngressClass attribute and not as annotation | Report all faulty Ingresses"
    ingresses_with_annotation: "{{ k8s_ingresses | json_query('resources[?metadata.annotations.\"kubernetes.io/ingress.class\"].metadata.name') }}"  # noqa: yaml[line-length]
  ansible.builtin.debug:
    var: ingresses_with_annotation
  changed_when: ingresses_with_annotation | length > 0
- <<: *task
  vars:
    taskid: 63-015
    name: "Config: For HA deployments, check if replicas >= 2 for all relevant Deployments/StatefulSets | Gather all infos (Deployments)"
  kubernetes.core.k8s_info:
    kind: Deployment
    namespace: "{{ k8s_workload_namespace }}"
  register: k8s_deployments
  changed_when: false

- <<: *task
  vars:
    taskid: 63-015
    name: "Config: For HA deployments, check if replicas >= 2 for all relevant Deployments/StatefulSets | Gather all infos (StatefulSets)"
  kubernetes.core.k8s_info:
    kind: StatefulSet
    namespace: "{{ k8s_workload_namespace }}"
  register: k8s_statefullsets
  changed_when: false

- <<: *task
  vars:
    taskid: 63-015
    name: "Config: For HA deployments, check if replicas >= 2 for all relevant Deployments/StatefulSets | Report Deployments"
    low_replica_deployments: "{{ k8s_deployments.resources | adfinis.maintenance.k8s_workload_check_low_replicas(k8s_workload_ha_deployments, 2) }}"
  ansible.builtin.debug:
    var: low_replica_deployments
  changed_when: low_replica_deployments | length > 0

- <<: *task
  vars:
    taskid: 63-015
    name: "Config: For HA deployments, check if replicas >= 2 for all relevant Deployments/StatefulSets | Report StatefulSets"
    low_replica_statefullsets: "{{ k8s_statefullsets.resources | adfinis.maintenance.k8s_workload_check_low_replicas(k8s_workload_ha_statefulsets, 2) }}"
  ansible.builtin.debug:
    var: low_replica_statefullsets
  changed_when: low_replica_statefullsets | length > 0
- <<: *task
  vars:
    taskid: 63-018
    name: "Config: For HA deployments, check if the HorizontalPodAutoscaler cannot scale below 2 replicas | Gather all infos"
  kubernetes.core.k8s_info:
    kind: HorizontalPodAutoscaler
    namespace: "{{ k8s_workload_namespace }}"
  register: k8s_hpas
  changed_when: false

- <<: *task
  vars:
    taskid: 63-018
    name: "Config: For HA deployments, check if the HorizontalPodAutoscaler cannot scale below 2 replicas | Report HPAs"
    low_replica_hpas: "{{ k8s_hpas | json_query('resources[?spec.minReplicas < `2`].metadata.name') }}"
  ansible.builtin.debug:
    var: low_replica_hpas
  changed_when: low_replica_hpas | length > 0
Review comment:
did you consider testing this with pytest-ansible?