From 0f04b124ab69c2d28b9c28465576e1ee0a202005 Mon Sep 17 00:00:00 2001 From: Matt Yaraskavitch <62650344+yaraskm@users.noreply.github.com> Date: Tue, 3 Sep 2024 08:53:22 -0400 Subject: [PATCH 1/2] 970: Taint nodes even if reboot is currently blocked - If using the --prefer-no-schedule-taint flag, apply the taint even if the reboot is currently blocked. This will make it less likely for Pods to get scheduled to nodes that are pending a reboot, once the blocking Pods / alerts are cleared. Signed-off-by: Matt Yaraskavitch <62650344+yaraskm@users.noreply.github.com> --- cmd/kured/main.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 3ad1cbee2..63c217ba9 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -761,11 +761,13 @@ func rebootAsRequired(nodeID string, booter reboot.Reboot, sentinelCommand []str blockCheckers = append(blockCheckers, KubernetesBlockingChecker{client: client, nodename: nodeID, filter: podSelectors}) } + rebootBlocked := rebootBlocked(blockCheckers...) + var rebootRequiredBlockCondition string - if rebootBlocked(blockCheckers...) { + if rebootBlocked { rebootRequiredBlockCondition = ", but blocked at this time" - continue } + log.Infof("Reboot required%s", rebootRequiredBlockCondition) if !holding(lock, &nodeMeta, concurrency > 1) && !acquire(lock, &nodeMeta, TTL, concurrency) { @@ -774,6 +776,11 @@ func rebootAsRequired(nodeID string, booter reboot.Reboot, sentinelCommand []str continue } + if rebootBlocked { + // We've logged that the reboot is needed, but curently blocked, and have tainted the node. 
+ continue + } + err = drain(client, node) if err != nil { if !forceReboot { From 3fca49847254e31e555ae0b4a7d3e8da63033403 Mon Sep 17 00:00:00 2001 From: Matt Yaraskavitch <62650344+yaraskm@users.noreply.github.com> Date: Wed, 4 Sep 2024 13:56:15 -0400 Subject: [PATCH 2/2] fix issue where lock was acquired while waiting for blockers to clear Signed-off-by: Matt Yaraskavitch <62650344+yaraskm@users.noreply.github.com> --- cmd/kured/main.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 63c217ba9..0c31ac221 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -770,14 +770,15 @@ func rebootAsRequired(nodeID string, booter reboot.Reboot, sentinelCommand []str log.Infof("Reboot required%s", rebootRequiredBlockCondition) - if !holding(lock, &nodeMeta, concurrency > 1) && !acquire(lock, &nodeMeta, TTL, concurrency) { - // Prefer to not schedule pods onto this node to avoid draing the same pod multiple times. + if rebootBlocked { + // Prefer to not schedule pods onto this node to avoid draining the same pod multiple times. preferNoScheduleTaint.Enable() + // We've logged that the reboot is needed, but currently blocked, and have tainted the node. continue } - if rebootBlocked { - // We've logged that the reboot is needed, but curently blocked, and have tainted the node. + if !holding(lock, &nodeMeta, concurrency > 1) && !acquire(lock, &nodeMeta, TTL, concurrency) { + // If we can't acquire the lock, poll again continue }