From f34864758ebc62552511e447fe2c5541525e9361 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Sun, 29 Sep 2024 18:16:18 +0200 Subject: [PATCH 01/18] Cleanup rebooter interface Without this, the interface and the code to reboot is a bit more complex than it should be. We do not need setters and getters, as we are just instanciating a single instance of a rebooter interface. We create it based on user input, then pass the object around. This should cleanup the code. Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 57 ++++++++++++++++-------------------------- cmd/kured/main_test.go | 28 +-------------------- pkg/reboot/command.go | 17 ++++++------- pkg/reboot/reboot.go | 7 ++++-- pkg/reboot/signal.go | 21 ++++++---------- 5 files changed, 43 insertions(+), 87 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 3ad1cbee2..77217e243 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -108,11 +108,6 @@ const ( // EnvPrefix The environment variable prefix of all environment variables bound to our command line flags. EnvPrefix = "KURED" - // MethodCommand is used as "--reboot-method" value when rebooting with the configured "--reboot-command" - MethodCommand = "command" - // MethodSignal is used as "--reboot-method" value when rebooting with a SIGRTMIN+5 signal. - MethodSignal = "signal" - sigTrminPlus5 = 34 + 5 ) @@ -646,7 +641,7 @@ func updateNodeLabels(client *kubernetes.Clientset, node *v1.Node, labels []stri } } -func rebootAsRequired(nodeID string, booter reboot.Reboot, sentinelCommand []string, window *timewindow.TimeWindow, TTL time.Duration, releaseDelay time.Duration) { +func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, sentinelCommand []string, window *timewindow.TimeWindow, TTL time.Duration, releaseDelay time.Duration) { config, err := rest.InClusterConfig() if err != nil { log.Fatal(err) @@ -796,7 +791,7 @@ func rebootAsRequired(nodeID string, booter reboot.Reboot, sentinelCommand []str } } - booter.Reboot() + rebooter.Reboot() for { log.Infof("Waiting for reboot") time.Sleep(time.Minute) @@ -817,16 +812,6 @@ func buildSentinelCommand(rebootSentinelFile string, rebootSentinelCommand strin return []string{"test", "-f", rebootSentinelFile} } -// parseRebootCommand creates the shell command line which will need wrapping to escape -// the container boundaries -func parseRebootCommand(rebootCommand string) []string { - command, err := shlex.Split(rebootCommand) - if err != nil { - log.Fatalf("Error parsing provided reboot command: %v", err) - } - return command -} - func root(cmd *cobra.Command, args []string) { if logFormat == "json" { log.SetFormatter(&log.JSONFormatter{}) @@ -844,7 +829,6 @@ func root(cmd *cobra.Command, args []string) { } sentinelCommand := buildSentinelCommand(rebootSentinelFile, rebootSentinelCommand) - restartCommand := parseRebootCommand(rebootCommand) log.Infof("Node ID: %s", nodeID) log.Infof("Lock Annotation: %s/%s:%s", dsNamespace, dsName, lockAnnotation) @@ -864,37 +848,40 @@ func root(cmd *cobra.Command, args []string) { log.Infof("Reboot check command: %s every %v", sentinelCommand, period) log.Infof("Concurrency: %v", concurrency) log.Infof("Reboot method: %s", rebootMethod) - if rebootCommand == MethodCommand { + + restartCommand, err := shlex.Split(rebootCommand) + if err != nil { + log.Fatalf("Error parsing provided reboot command: %v", err) + } + + // To run those commands as it was the host, we'll use nsenter + // Relies on hostPID:true and privileged:true to enter host mount space + // PID set to 1, until we have a better discovery mechanism. + privilegedRestartCommand := buildHostCommand(1, restartCommand) + + var rebooter reboot.Rebooter + switch { + case rebootMethod == "command": log.Infof("Reboot command: %s", restartCommand) - } else { + rebooter = reboot.CommandRebooter{NodeID: nodeID, RebootCommand: privilegedRestartCommand} + case rebootMethod == "signal": log.Infof("Reboot signal: %v", rebootSignal) + rebooter = reboot.SignalRebooter{NodeID: nodeID, Signal: rebootSignal} + default: + log.Fatalf("Invalid reboot-method configured: %s", rebootMethod) } if annotateNodes { log.Infof("Will annotate nodes during kured reboot operations") } - // To run those commands as it was the host, we'll use nsenter - // Relies on hostPID:true and privileged:true to enter host mount space - // PID set to 1, until we have a better discovery mechanism. - hostRestartCommand := buildHostCommand(1, restartCommand) - // Only wrap sentinel-command with nsenter, if a custom-command was configured, otherwise use the host-path mount hostSentinelCommand := sentinelCommand if rebootSentinelCommand != "" { hostSentinelCommand = buildHostCommand(1, sentinelCommand) } - var booter reboot.Reboot - if rebootMethod == MethodCommand { - booter = reboot.NewCommandReboot(nodeID, hostRestartCommand) - } else if rebootMethod == MethodSignal { - booter = reboot.NewSignalReboot(nodeID, rebootSignal) - } else { - log.Fatalf("Invalid reboot-method configured: %s", rebootMethod) - } - - go rebootAsRequired(nodeID, booter, hostSentinelCommand, window, lockTTL, lockReleaseDelay) + go rebootAsRequired(nodeID, rebooter, hostSentinelCommand, window, lockTTL, lockReleaseDelay) go maintainRebootRequiredMetric(nodeID, hostSentinelCommand) http.Handle("/metrics", promhttp.Handler()) diff --git a/cmd/kured/main_test.go b/cmd/kured/main_test.go index a7b11b15a..04d4cd425 100644 --- a/cmd/kured/main_test.go +++ b/cmd/kured/main_test.go @@ -4,9 +4,9 @@ import ( "reflect" "testing" + "github.com/kubereboot/kured/pkg/alerts" log "github.com/sirupsen/logrus" "github.com/spf13/cobra" - "github.com/kubereboot/kured/pkg/alerts" assert "gotest.tools/v3/assert" papi "github.com/prometheus/client_golang/api" @@ -223,32 +223,6 @@ func Test_buildSentinelCommand(t *testing.T) { } } -func Test_parseRebootCommand(t *testing.T) { - type args struct { - rebootCommand string - } - tests := []struct { - name string - args args - want []string - }{ - { - name: "Ensure a reboot command is properly parsed", - args: args{ - rebootCommand: "/sbin/systemctl reboot", - }, - want: []string{"/sbin/systemctl", "reboot"}, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := parseRebootCommand(tt.args.rebootCommand); !reflect.DeepEqual(got, tt.want) { - t.Errorf("parseRebootCommand() = %v, want %v", got, tt.want) - } - }) - } -} - func Test_rebootRequired(t *testing.T) { type args struct { sentinelCommand []string diff --git a/pkg/reboot/command.go b/pkg/reboot/command.go index 1522d2a86..f353fe5a4 100644 --- a/pkg/reboot/command.go +++ b/pkg/reboot/command.go @@ -5,21 +5,18 @@ import ( log "github.com/sirupsen/logrus" ) -// CommandRebootMethod holds context-information for a command reboot. -type CommandRebootMethod struct { - nodeID string - rebootCommand []string +// CommandRebooter holds context-information for a command reboot. +type CommandRebooter struct { + NodeID string + RebootCommand []string } // NewCommandReboot creates a new command-rebooter which needs full privileges on the host. -func NewCommandReboot(nodeID string, rebootCommand []string) *CommandRebootMethod { - return &CommandRebootMethod{nodeID: nodeID, rebootCommand: rebootCommand} -} // Reboot triggers the command-reboot. -func (c *CommandRebootMethod) Reboot() { - log.Infof("Running command: %s for node: %s", c.rebootCommand, c.nodeID) - if err := util.NewCommand(c.rebootCommand[0], c.rebootCommand[1:]...).Run(); err != nil { +func (c CommandRebooter) Reboot() { + log.Infof("Running command: %s for node: %s", c.RebootCommand, c.NodeID) + if err := util.NewCommand(c.RebootCommand[0], c.RebootCommand[1:]...).Run(); err != nil { log.Fatalf("Error invoking reboot command: %v", err) } } diff --git a/pkg/reboot/reboot.go b/pkg/reboot/reboot.go index 83d788ecf..84ea93a89 100644 --- a/pkg/reboot/reboot.go +++ b/pkg/reboot/reboot.go @@ -1,6 +1,9 @@ package reboot -// Reboot interface defines the Reboot function to be implemented. -type Reboot interface { +// Rebooter is the standard interface to use to execute +// the reboot, after it has been considered as necessary. +// The Reboot method does not expect any return, yet should +// most likely be refactored in the future to return an error +type Rebooter interface { Reboot() } diff --git a/pkg/reboot/signal.go b/pkg/reboot/signal.go index 245018c2e..7f5a2c314 100644 --- a/pkg/reboot/signal.go +++ b/pkg/reboot/signal.go @@ -7,27 +7,22 @@ import ( log "github.com/sirupsen/logrus" ) -// SignalRebootMethod holds context-information for a signal reboot. -type SignalRebootMethod struct { - nodeID string - signal int +// SignalRebooter holds context-information for a signal reboot. +type SignalRebooter struct { + NodeID string + Signal int } -// NewSignalReboot creates a new signal-rebooter which can run unprivileged. -func NewSignalReboot(nodeID string, signal int) *SignalRebootMethod { - return &SignalRebootMethod{nodeID: nodeID, signal: signal} -} - -// Reboot triggers the signal-reboot. -func (c *SignalRebootMethod) Reboot() { - log.Infof("Emit reboot-signal for node: %s", c.nodeID) +// Reboot triggers the reboot signal using SIGTERMIN+5 +func (c SignalRebooter) Reboot() { + log.Infof("Emit reboot-signal for node: %s", c.NodeID) process, err := os.FindProcess(1) if err != nil { log.Fatalf("There was no systemd process found: %v", err) } - err = process.Signal(syscall.Signal(c.signal)) + err = process.Signal(syscall.Signal(c.Signal)) if err != nil { log.Fatalf("Signal of SIGRTMIN+5 failed: %v", err) } From 3bfdd76f294ac47b90257d8bcfd760ea17cce25f Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Sun, 29 Sep 2024 18:35:08 +0200 Subject: [PATCH 02/18] Extract privileged command wrapper into util Without this, it makes the code a bit harder to read. This fixes it by extracting the method. Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 20 +------ cmd/kured/main_test.go | 127 +++++++++++++++++------------------------ pkg/util/util.go | 12 ++++ pkg/util/util_test.go | 31 ++++++++++ 4 files changed, 98 insertions(+), 92 deletions(-) create mode 100644 pkg/util/util_test.go diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 77217e243..18b7d7366 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -309,17 +309,6 @@ func flagToEnvVar(flag string) string { return fmt.Sprintf("%s_%s", EnvPrefix, envVarSuffix) } -// buildHostCommand writes a new command to run in the host namespace -// Rancher based need different pid -func buildHostCommand(pid int, command []string) []string { - - // From the container, we nsenter into the proper PID to run the hostCommand. - // For this, kured daemonset need to be configured with hostPID:true and privileged:true - cmd := []string{"/usr/bin/nsenter", fmt.Sprintf("-m/proc/%d/ns/mnt", pid), "--"} - cmd = append(cmd, command...) - return cmd -} - func rebootRequired(sentinelCommand []string) bool { cmd := util.NewCommand(sentinelCommand[0], sentinelCommand[1:]...) if err := cmd.Run(); err != nil { @@ -854,16 +843,11 @@ func root(cmd *cobra.Command, args []string) { log.Fatalf("Error parsing provided reboot command: %v", err) } - // To run those commands as it was the host, we'll use nsenter - // Relies on hostPID:true and privileged:true to enter host mount space - // PID set to 1, until we have a better discovery mechanism. - privilegedRestartCommand := buildHostCommand(1, restartCommand) - var rebooter reboot.Rebooter switch { case rebootMethod == "command": log.Infof("Reboot command: %s", restartCommand) - rebooter = reboot.CommandRebooter{NodeID: nodeID, RebootCommand: privilegedRestartCommand} + rebooter = reboot.CommandRebooter{NodeID: nodeID, RebootCommand: util.PrivilegedHostCommand(1, restartCommand)} case rebootMethod == "signal": log.Infof("Reboot signal: %v", rebootSignal) rebooter = reboot.SignalRebooter{NodeID: nodeID, Signal: rebootSignal} @@ -878,7 +862,7 @@ func root(cmd *cobra.Command, args []string) { // Only wrap sentinel-command with nsenter, if a custom-command was configured, otherwise use the host-path mount hostSentinelCommand := sentinelCommand if rebootSentinelCommand != "" { - hostSentinelCommand = buildHostCommand(1, sentinelCommand) + hostSentinelCommand = util.PrivilegedHostCommand(1, sentinelCommand) } go rebootAsRequired(nodeID, rebooter, hostSentinelCommand, window, lockTTL, lockReleaseDelay) diff --git a/cmd/kured/main_test.go b/cmd/kured/main_test.go index 04d4cd425..2659ceb47 100644 --- a/cmd/kured/main_test.go +++ b/cmd/kured/main_test.go @@ -1,6 +1,7 @@ package main import ( + "github.com/kubereboot/kured/pkg/util" "reflect" "testing" @@ -23,6 +24,7 @@ func (fbc BlockingChecker) isBlocked() bool { var _ RebootBlocker = BlockingChecker{} // Verify that Type implements Interface. var _ RebootBlocker = (*BlockingChecker)(nil) // Verify that *Type implements Interface. + func Test_flagCheck(t *testing.T) { var cmd *cobra.Command var args []string @@ -107,85 +109,62 @@ func Test_stripQuotes(t *testing.T) { } } + func Test_rebootBlocked(t *testing.T) { - noCheckers := []RebootBlocker{} - nonblockingChecker := BlockingChecker{blocking: false} - blockingChecker := BlockingChecker{blocking: true} + noCheckers := []RebootBlocker{} + nonblockingChecker := BlockingChecker{blocking: false} + blockingChecker := BlockingChecker{blocking: true} - // Instantiate a prometheusClient with a broken_url - promClient, err := alerts.NewPromClient(papi.Config{Address: "broken_url"}) - if err != nil { - log.Fatal("Can't create prometheusClient: ", err) - } - brokenPrometheusClient := PrometheusBlockingChecker{promClient: promClient, filter: nil, firingOnly: false} + // Instantiate a prometheusClient with a broken_url + promClient, err := alerts.NewPromClient(papi.Config{Address: "broken_url"}) + if err != nil { + log.Fatal("Can't create prometheusClient: ", err) + } + brokenPrometheusClient := PrometheusBlockingChecker{promClient: promClient, filter: nil, firingOnly: false} - type args struct { - blockers []RebootBlocker - } - tests := []struct { - name string - args args - want bool - }{ - { - name: "Do not block on no blocker defined", - args: args{blockers: noCheckers}, - want: false, - }, - { - name: "Ensure a blocker blocks", - args: args{blockers: []RebootBlocker{blockingChecker}}, - want: true, - }, - { - name: "Ensure a non-blocker doesn't block", - args: args{blockers: []RebootBlocker{nonblockingChecker}}, - want: false, - }, - { - name: "Ensure one blocker is enough to block", - args: args{blockers: []RebootBlocker{nonblockingChecker, blockingChecker}}, - want: true, - }, - { - name: "Do block on error contacting prometheus API", - args: args{blockers: []RebootBlocker{brokenPrometheusClient}}, - want: true, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := rebootBlocked(tt.args.blockers...); got != tt.want { - t.Errorf("rebootBlocked() = %v, want %v", got, tt.want) - } - }) - } + type args struct { + blockers []RebootBlocker + } + tests := []struct { + name string + args args + want bool + }{ + { + name: "Do not block on no blocker defined", + args: args{blockers: noCheckers}, + want: false, + }, + { + name: "Ensure a blocker blocks", + args: args{blockers: []RebootBlocker{blockingChecker}}, + want: true, + }, + { + name: "Ensure a non-blocker doesn't block", + args: args{blockers: []RebootBlocker{nonblockingChecker}}, + want: false, + }, + { + name: "Ensure one blocker is enough to block", + args: args{blockers: []RebootBlocker{nonblockingChecker, blockingChecker}}, + want: true, + }, + { + name: "Do block on error contacting prometheus API", + args: args{blockers: []RebootBlocker{brokenPrometheusClient}}, + want: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := rebootBlocked(tt.args.blockers...); got != tt.want { + t.Errorf("rebootBlocked() = %v, want %v", got, tt.want) + } + }) + } } -func Test_buildHostCommand(t *testing.T) { - type args struct { - pid int - command []string - } - tests := []struct { - name string - args args - want []string - }{ - { - name: "Ensure command will run with nsenter", - args: args{pid: 1, command: []string{"ls", "-Fal"}}, - want: []string{"/usr/bin/nsenter", "-m/proc/1/ns/mnt", "--", "ls", "-Fal"}, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := buildHostCommand(tt.args.pid, tt.args.command); !reflect.DeepEqual(got, tt.want) { - t.Errorf("buildHostCommand() = %v, want %v", got, tt.want) - } - }) - } -} func Test_buildSentinelCommand(t *testing.T) { type args struct { diff --git a/pkg/util/util.go b/pkg/util/util.go index d32f9d1c2..ffe748063 100644 --- a/pkg/util/util.go +++ b/pkg/util/util.go @@ -1,6 +1,7 @@ package util import ( + "fmt" "os/exec" log "github.com/sirupsen/logrus" @@ -21,3 +22,14 @@ func NewCommand(name string, arg ...string) *exec.Cmd { return cmd } + +// PrivilegedHostCommand wraps the command with nsenter. +// It allows to run a command from systemd's namespace for example (pid 1) +// This relies on hostPID:true and privileged:true to enter host mount space +// For info, rancher based need different pid, which should be user given. +// until we have a better discovery mechanism. +func PrivilegedHostCommand(pid int, command []string) []string { + cmd := []string{"/usr/bin/nsenter", fmt.Sprintf("-m/proc/%d/ns/mnt", pid), "--"} + cmd = append(cmd, command...) + return cmd +} diff --git a/pkg/util/util_test.go b/pkg/util/util_test.go new file mode 100644 index 000000000..650fee142 --- /dev/null +++ b/pkg/util/util_test.go @@ -0,0 +1,31 @@ +package util + +import ( + "reflect" + "testing" +) + +func Test_buildHostCommand(t *testing.T) { + type args struct { + pid int + command []string + } + tests := []struct { + name string + args args + want []string + }{ + { + name: "Ensure command will run with nsenter", + args: args{pid: 1, command: []string{"ls", "-Fal"}}, + want: []string{"/usr/bin/nsenter", "-m/proc/1/ns/mnt", "--", "ls", "-Fal"}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := PrivilegedHostCommand(tt.args.pid, tt.args.command); !reflect.DeepEqual(got, tt.want) { + t.Errorf("buildHostCommand() = %v, want %v", got, tt.want) + } + }) + } +} From 574065ff8a8877fdd824f8b157ea764b3feff173 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Sun, 29 Sep 2024 21:17:04 +0200 Subject: [PATCH 03/18] Add checker interface This will be useful to refactor the checkers loop. Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 82 ++++++----------- cmd/kured/main_test.go | 174 +---------------------------------- pkg/checkers/checker.go | 74 +++++++++++++++ pkg/checkers/checker_test.go | 69 ++++++++++++++ 4 files changed, 173 insertions(+), 226 deletions(-) create mode 100644 pkg/checkers/checker.go create mode 100644 pkg/checkers/checker_test.go diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 18b7d7366..fe259282b 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -4,11 +4,11 @@ import ( "context" "encoding/json" "fmt" + "github.com/kubereboot/kured/pkg/checkers" "math/rand" "net/http" "net/url" "os" - "os/exec" "reflect" "regexp" "sort" @@ -309,28 +309,6 @@ func flagToEnvVar(flag string) string { return fmt.Sprintf("%s_%s", EnvPrefix, envVarSuffix) } -func rebootRequired(sentinelCommand []string) bool { - cmd := util.NewCommand(sentinelCommand[0], sentinelCommand[1:]...) - if err := cmd.Run(); err != nil { - switch err := err.(type) { - case *exec.ExitError: - // We assume a non-zero exit code means 'reboot not required', but of course - // the user could have misconfigured the sentinel command or something else - // went wrong during its execution. In that case, not entering a reboot loop - // is the right thing to do, and we are logging stdout/stderr of the command - // so it should be obvious what is wrong. - if cmd.ProcessState.ExitCode() != 1 { - log.Warnf("sentinel command ended with unexpected exit code: %v", cmd.ProcessState.ExitCode()) - } - return false - default: - // Something was grossly misconfigured, such as the command path being wrong. - log.Fatalf("Error invoking sentinel command: %v", err) - } - } - return true -} - // RebootBlocker interface should be implemented by types // to know if their instantiations should block a reboot type RebootBlocker interface { @@ -540,9 +518,9 @@ func uncordon(client *kubernetes.Clientset, node *v1.Node) error { return nil } -func maintainRebootRequiredMetric(nodeID string, sentinelCommand []string) { +func maintainRebootRequiredMetric(nodeID string, checker checkers.Checker) { for { - if rebootRequired(sentinelCommand) { + if checker.CheckRebootRequired() { rebootRequiredGauge.WithLabelValues(nodeID).Set(1) } else { rebootRequiredGauge.WithLabelValues(nodeID).Set(0) @@ -630,7 +608,7 @@ func updateNodeLabels(client *kubernetes.Clientset, node *v1.Node, labels []stri } } -func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, sentinelCommand []string, window *timewindow.TimeWindow, TTL time.Duration, releaseDelay time.Duration) { +func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers.Checker, window *timewindow.TimeWindow, TTL time.Duration, releaseDelay time.Duration) { config, err := rest.InClusterConfig() if err != nil { log.Fatal(err) @@ -671,7 +649,7 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, sentinelCommand [ // And (2) check if we previously annotated the node that it was in the process of being rebooted, // And finally (3) if it has that annotation, to delete it. // This indicates to other node tools running on the cluster that this node may be a candidate for maintenance - if annotateNodes && !rebootRequired(sentinelCommand) { + if annotateNodes && !checker.CheckRebootRequired() { if _, ok := node.Annotations[KuredRebootInProgressAnnotation]; ok { err := deleteNodeAnnotation(client, nodeID, KuredRebootInProgressAnnotation) if err != nil { @@ -690,7 +668,7 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, sentinelCommand [ preferNoScheduleTaint := taints.New(client, nodeID, preferNoScheduleTaintName, v1.TaintEffectPreferNoSchedule) // Remove taint immediately during startup to quickly allow scheduling again. - if !rebootRequired(sentinelCommand) { + if !checker.CheckRebootRequired() { preferNoScheduleTaint.Disable() } @@ -709,7 +687,7 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, sentinelCommand [ continue } - if !rebootRequired(sentinelCommand) { + if !checker.CheckRebootRequired() { log.Infof("Reboot not required") preferNoScheduleTaint.Disable() continue @@ -788,19 +766,6 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, sentinelCommand [ } } -// buildSentinelCommand creates the shell command line which will need wrapping to escape -// the container boundaries -func buildSentinelCommand(rebootSentinelFile string, rebootSentinelCommand string) []string { - if rebootSentinelCommand != "" { - cmd, err := shlex.Split(rebootSentinelCommand) - if err != nil { - log.Fatalf("Error parsing provided sentinel command: %v", err) - } - return cmd - } - return []string{"test", "-f", rebootSentinelFile} -} - func root(cmd *cobra.Command, args []string) { if logFormat == "json" { log.SetFormatter(&log.JSONFormatter{}) @@ -817,8 +782,6 @@ func root(cmd *cobra.Command, args []string) { log.Fatalf("Failed to build time window: %v", err) } - sentinelCommand := buildSentinelCommand(rebootSentinelFile, rebootSentinelCommand) - log.Infof("Node ID: %s", nodeID) log.Infof("Lock Annotation: %s/%s:%s", dsNamespace, dsName, lockAnnotation) if lockTTL > 0 { @@ -834,7 +797,7 @@ func root(cmd *cobra.Command, args []string) { log.Infof("PreferNoSchedule taint: %s", preferNoScheduleTaintName) log.Infof("Blocking Pod Selectors: %v", podSelectors) log.Infof("Reboot schedule: %v", window) - log.Infof("Reboot check command: %s every %v", sentinelCommand, period) + log.Infof("Reboot period %v", period) log.Infof("Concurrency: %v", concurrency) log.Infof("Reboot method: %s", rebootMethod) @@ -855,18 +818,31 @@ func root(cmd *cobra.Command, args []string) { log.Fatalf("Invalid reboot-method configured: %s", rebootMethod) } - if annotateNodes { - log.Infof("Will annotate nodes during kured reboot operations") + var checker checkers.Checker + // An override of rebootsentinelcommand means a privileged command + if rebootSentinelCommand != "" { + log.Infof("Sentinel checker is user provided command: %s", rebootSentinelCommand) + cmd, err := shlex.Split(rebootSentinelCommand) + if err != nil { + log.Fatalf("Error parsing provided sentinel command: %v", err) + } + checker = checkers.NsEnterRebootChecker{ + CustomCheckCommand: cmd, + NamespacePid: 1, + } + } else { + log.Infof("Sentinel checker is (unprivileged) testing for the presence of: %s", rebootSentinelFile) + checker = checkers.UnprivilegedRebootChecker{ + CheckCommand: []string{"test", "-f", rebootSentinelFile}, + } } - // Only wrap sentinel-command with nsenter, if a custom-command was configured, otherwise use the host-path mount - hostSentinelCommand := sentinelCommand - if rebootSentinelCommand != "" { - hostSentinelCommand = util.PrivilegedHostCommand(1, sentinelCommand) + if annotateNodes { + log.Infof("Will annotate nodes during kured reboot operations") } - go rebootAsRequired(nodeID, rebooter, hostSentinelCommand, window, lockTTL, lockReleaseDelay) - go maintainRebootRequiredMetric(nodeID, hostSentinelCommand) + go rebootAsRequired(nodeID, rebooter, checker, window, lockTTL, lockReleaseDelay) + go maintainRebootRequiredMetric(nodeID, checker) http.Handle("/metrics", promhttp.Handler()) log.Fatal(http.ListenAndServe(fmt.Sprintf("%s:%d", metricsHost, metricsPort), nil)) diff --git a/cmd/kured/main_test.go b/cmd/kured/main_test.go index 2659ceb47..542698c63 100644 --- a/cmd/kured/main_test.go +++ b/cmd/kured/main_test.go @@ -1,30 +1,11 @@ package main import ( - "github.com/kubereboot/kured/pkg/util" + "github.com/spf13/cobra" "reflect" "testing" - - "github.com/kubereboot/kured/pkg/alerts" - log "github.com/sirupsen/logrus" - "github.com/spf13/cobra" - assert "gotest.tools/v3/assert" - - papi "github.com/prometheus/client_golang/api" ) -type BlockingChecker struct { - blocking bool -} - -func (fbc BlockingChecker) isBlocked() bool { - return fbc.blocking -} - -var _ RebootBlocker = BlockingChecker{} // Verify that Type implements Interface. -var _ RebootBlocker = (*BlockingChecker)(nil) // Verify that *Type implements Interface. - - func Test_flagCheck(t *testing.T) { var cmd *cobra.Command var args []string @@ -108,156 +89,3 @@ func Test_stripQuotes(t *testing.T) { }) } } - - -func Test_rebootBlocked(t *testing.T) { - noCheckers := []RebootBlocker{} - nonblockingChecker := BlockingChecker{blocking: false} - blockingChecker := BlockingChecker{blocking: true} - - // Instantiate a prometheusClient with a broken_url - promClient, err := alerts.NewPromClient(papi.Config{Address: "broken_url"}) - if err != nil { - log.Fatal("Can't create prometheusClient: ", err) - } - brokenPrometheusClient := PrometheusBlockingChecker{promClient: promClient, filter: nil, firingOnly: false} - - type args struct { - blockers []RebootBlocker - } - tests := []struct { - name string - args args - want bool - }{ - { - name: "Do not block on no blocker defined", - args: args{blockers: noCheckers}, - want: false, - }, - { - name: "Ensure a blocker blocks", - args: args{blockers: []RebootBlocker{blockingChecker}}, - want: true, - }, - { - name: "Ensure a non-blocker doesn't block", - args: args{blockers: []RebootBlocker{nonblockingChecker}}, - want: false, - }, - { - name: "Ensure one blocker is enough to block", - args: args{blockers: []RebootBlocker{nonblockingChecker, blockingChecker}}, - want: true, - }, - { - name: "Do block on error contacting prometheus API", - args: args{blockers: []RebootBlocker{brokenPrometheusClient}}, - want: true, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := rebootBlocked(tt.args.blockers...); got != tt.want { - t.Errorf("rebootBlocked() = %v, want %v", got, tt.want) - } - }) - } -} - - -func Test_buildSentinelCommand(t *testing.T) { - type args struct { - rebootSentinelFile string - rebootSentinelCommand string - } - tests := []struct { - name string - args args - want []string - }{ - { - name: "Ensure a sentinelFile generates a shell 'test' command with the right file", - args: args{ - rebootSentinelFile: "/test1", - rebootSentinelCommand: "", - }, - want: []string{"test", "-f", "/test1"}, - }, - { - name: "Ensure a sentinelCommand has priority over a sentinelFile if both are provided (because sentinelFile is always provided)", - args: args{ - rebootSentinelFile: "/test1", - rebootSentinelCommand: "/sbin/reboot-required -r", - }, - want: []string{"/sbin/reboot-required", "-r"}, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := buildSentinelCommand(tt.args.rebootSentinelFile, tt.args.rebootSentinelCommand); !reflect.DeepEqual(got, tt.want) { - t.Errorf("buildSentinelCommand() = %v, want %v", got, tt.want) - } - }) - } -} - -func Test_rebootRequired(t *testing.T) { - type args struct { - sentinelCommand []string - } - tests := []struct { - name string - args args - want bool - }{ - { - name: "Ensure rc = 0 means reboot required", - args: args{ - sentinelCommand: []string{"true"}, - }, - want: true, - }, - { - name: "Ensure rc != 0 means reboot NOT required", - args: args{ - sentinelCommand: []string{"false"}, - }, - want: false, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := rebootRequired(tt.args.sentinelCommand); got != tt.want { - t.Errorf("rebootRequired() = %v, want %v", got, tt.want) - } - }) - } -} - -func Test_rebootRequired_fatals(t *testing.T) { - cases := []struct { - param []string - expectFatal bool - }{ - { - param: []string{"true"}, - expectFatal: false, - }, - { - param: []string{"./babar"}, - expectFatal: true, - }, - } - - defer func() { log.StandardLogger().ExitFunc = nil }() - var fatal bool - log.StandardLogger().ExitFunc = func(int) { fatal = true } - - for _, c := range cases { - fatal = false - rebootRequired(c.param) - assert.Equal(t, c.expectFatal, fatal) - } - -} diff --git a/pkg/checkers/checker.go b/pkg/checkers/checker.go new file mode 100644 index 000000000..3a0fc0123 --- /dev/null +++ b/pkg/checkers/checker.go @@ -0,0 +1,74 @@ +package checkers + +import ( + "github.com/kubereboot/kured/pkg/util" + log "github.com/sirupsen/logrus" + "os/exec" +) + +type Checker interface { + CheckRebootRequired() bool +} + +// UnprivilegedRebootChecker is the default reboot checker. +// It is unprivileged, and tests the presence of a files +type UnprivilegedRebootChecker struct { + CheckCommand []string +} + +// CheckRebootRequired runs the test command of the file +// needs refactoring to also return an error, instead of leaking it inside the code. +// This needs refactoring to get rid of NewCommand +// This needs refactoring to only contain file location, instead of CheckCommand +func (rc UnprivilegedRebootChecker) CheckRebootRequired() bool { + cmd := util.NewCommand(rc.CheckCommand[0], rc.CheckCommand[1:]...) + if err := cmd.Run(); err != nil { + switch err := err.(type) { + case *exec.ExitError: + // We assume a non-zero exit code means 'reboot not required', but of course + // the user could have misconfigured the sentinel command or something else + // went wrong during its execution. In that case, not entering a reboot loop + // is the right thing to do, and we are logging stdout/stderr of the command + // so it should be obvious what is wrong. + if cmd.ProcessState.ExitCode() != 1 { + log.Warnf("sentinel command ended with unexpected exit code: %v", cmd.ProcessState.ExitCode()) + } + return false + default: + // Something was grossly misconfigured, such as the command path being wrong. + log.Fatalf("Error invoking sentinel command: %v", err) + } + } + return true +} + +// NsEnterRebootChecker is using a custom command to check +// if a reboot is required, but therefore needs a pid for entering the namespace, +// on top of the required command. This requires elevation. +type NsEnterRebootChecker struct { + CustomCheckCommand []string + NamespacePid int +} + +func (rc NsEnterRebootChecker) CheckRebootRequired() bool { + privCommand := util.PrivilegedHostCommand(rc.NamespacePid, rc.CustomCheckCommand) + cmd := util.NewCommand(privCommand[0], privCommand[1:]...) + if err := cmd.Run(); err != nil { + switch err := err.(type) { + case *exec.ExitError: + // We assume a non-zero exit code means 'reboot not required', but of course + // the user could have misconfigured the sentinel command or something else + // went wrong during its execution. In that case, not entering a reboot loop + // is the right thing to do, and we are logging stdout/stderr of the command + // so it should be obvious what is wrong. + if cmd.ProcessState.ExitCode() != 1 { + log.Warnf("sentinel command ended with unexpected exit code: %v", cmd.ProcessState.ExitCode()) + } + return false + default: + // Something was grossly misconfigured, such as the command path being wrong. + log.Fatalf("Error invoking sentinel command: %v", err) + } + } + return true +} diff --git a/pkg/checkers/checker_test.go b/pkg/checkers/checker_test.go new file mode 100644 index 000000000..f3a689434 --- /dev/null +++ b/pkg/checkers/checker_test.go @@ -0,0 +1,69 @@ +package checkers + +import ( + log "github.com/sirupsen/logrus" + assert "gotest.tools/v3/assert" + "testing" +) + +func Test_rebootRequired(t *testing.T) { + type args struct { + sentinelCommand []string + } + tests := []struct { + name string + args args + want bool + }{ + { + name: "Ensure rc = 0 means reboot required", + args: args{ + sentinelCommand: []string{"true"}, + }, + want: true, + }, + { + name: "Ensure rc != 0 means reboot NOT required", + args: args{ + sentinelCommand: []string{"false"}, + }, + want: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + a := UnprivilegedRebootChecker{CheckCommand: tt.args.sentinelCommand} + if got := a.CheckRebootRequired(); got != tt.want { + t.Errorf("rebootRequired() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_rebootRequired_fatals(t *testing.T) { + cases := []struct { + param []string + expectFatal bool + }{ + { + param: []string{"true"}, + expectFatal: false, + }, + { + param: []string{"./babar"}, + expectFatal: true, + }, + } + + defer func() { log.StandardLogger().ExitFunc = nil }() + var fatal bool + log.StandardLogger().ExitFunc = func(int) { fatal = true } + + for _, c := range cases { + fatal = false + a := UnprivilegedRebootChecker{CheckCommand: c.param} + a.CheckRebootRequired() + assert.Equal(t, c.expectFatal, fatal) + } + +} From eeedf203c37461c27b15d899c3befe4ae886f831 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Sun, 29 Sep 2024 21:45:48 +0200 Subject: [PATCH 04/18] Extract blockers This will make it easier to manipulate main in the future. Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 92 ++---------------------------- pkg/blockers/blockers.go | 102 ++++++++++++++++++++++++++++++++++ pkg/blockers/blockers_test.go | 67 ++++++++++++++++++++++ 3 files changed, 174 insertions(+), 87 deletions(-) create mode 100644 pkg/blockers/blockers.go create mode 100644 pkg/blockers/blockers_test.go diff --git a/cmd/kured/main.go b/cmd/kured/main.go index fe259282b..981c22248 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "fmt" + "github.com/kubereboot/kured/pkg/blockers" "github.com/kubereboot/kured/pkg/checkers" "math/rand" "net/http" @@ -309,89 +310,6 @@ func flagToEnvVar(flag string) string { return fmt.Sprintf("%s_%s", EnvPrefix, envVarSuffix) } -// RebootBlocker interface should be implemented by types -// to know if their instantiations should block a reboot -type RebootBlocker interface { - isBlocked() bool -} - -// PrometheusBlockingChecker contains info for connecting -// to prometheus, and can give info about whether a reboot should be blocked -type PrometheusBlockingChecker struct { - // prometheusClient to make prometheus-go-client and api config available - // into the PrometheusBlockingChecker struct - promClient *alerts.PromClient - // regexp used to get alerts - filter *regexp.Regexp - // bool to indicate if only firing alerts should be considered - firingOnly bool - // bool to indicate that we're only blocking on alerts which match the filter - filterMatchOnly bool -} - -// KubernetesBlockingChecker contains info for connecting -// to k8s, and can give info about whether a reboot should be blocked -type KubernetesBlockingChecker struct { - // client used to contact kubernetes API - client *kubernetes.Clientset - nodename string - // lised used to filter pods (podSelector) - filter []string -} - -func (pb PrometheusBlockingChecker) isBlocked() bool { - alertNames, err := pb.promClient.ActiveAlerts(pb.filter, pb.firingOnly, pb.filterMatchOnly) - if err != nil { - log.Warnf("Reboot blocked: prometheus query error: %v", err) - return true - } - count := len(alertNames) - if count > 10 { - alertNames = append(alertNames[:10], "...") - } - if count > 0 { - log.Warnf("Reboot blocked: %d active alerts: %v", count, alertNames) - return true - } - return false -} - -func (kb KubernetesBlockingChecker) isBlocked() bool { - fieldSelector := fmt.Sprintf("spec.nodeName=%s,status.phase!=Succeeded,status.phase!=Failed,status.phase!=Unknown", kb.nodename) - for _, labelSelector := range kb.filter { - podList, err := kb.client.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{ - LabelSelector: labelSelector, - FieldSelector: fieldSelector, - Limit: 10}) - if err != nil { - log.Warnf("Reboot blocked: pod query error: %v", err) - return true - } - - if len(podList.Items) > 0 { - podNames := make([]string, 0, len(podList.Items)) - for _, pod := range podList.Items { - podNames = append(podNames, pod.Name) - } - if len(podList.Continue) > 0 { - podNames = append(podNames, "...") - } - log.Warnf("Reboot blocked: matching pods: %v", podNames) - return true - } - } - return false -} - -func rebootBlocked(blockers ...RebootBlocker) bool { - for _, blocker := range blockers { - if blocker.isBlocked() { - return true - } - } - return false -} - func holding(lock *daemonsetlock.DaemonSetLock, metadata interface{}, isMultiLock bool) bool { var holding bool var err error @@ -715,16 +633,16 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. } } - var blockCheckers []RebootBlocker + var blockCheckers []blockers.RebootBlocker if prometheusURL != "" { - blockCheckers = append(blockCheckers, PrometheusBlockingChecker{promClient: promClient, filter: alertFilter, firingOnly: alertFiringOnly, filterMatchOnly: alertFilterMatchOnly}) + blockCheckers = append(blockCheckers, blockers.PrometheusBlockingChecker{PromClient: promClient, Filter: alertFilter, FiringOnly: alertFiringOnly, FilterMatchOnly: alertFilterMatchOnly}) } if podSelectors != nil { - blockCheckers = append(blockCheckers, KubernetesBlockingChecker{client: client, nodename: nodeID, filter: podSelectors}) + blockCheckers = append(blockCheckers, blockers.KubernetesBlockingChecker{Client: client, Nodename: nodeID, Filter: podSelectors}) } var rebootRequiredBlockCondition string - if rebootBlocked(blockCheckers...) { + if blockers.RebootBlocked(blockCheckers...) { rebootRequiredBlockCondition = ", but blocked at this time" continue } diff --git a/pkg/blockers/blockers.go b/pkg/blockers/blockers.go new file mode 100644 index 000000000..e2dc15007 --- /dev/null +++ b/pkg/blockers/blockers.go @@ -0,0 +1,102 @@ +package blockers + +import ( + "context" + "fmt" + "github.com/kubereboot/kured/pkg/alerts" + log "github.com/sirupsen/logrus" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "regexp" +) + +// RebootBlocked checks that a single block Checker +// will block the reboot or not. +func RebootBlocked(blockers ...RebootBlocker) bool { + for _, blocker := range blockers { + if blocker.IsBlocked() { + return true + } + } + return false +} + +// RebootBlocker interface should be implemented by types +// to know if their instantiations should block a reboot +type RebootBlocker interface { + IsBlocked() bool +} + +// PrometheusBlockingChecker contains info for connecting +// to prometheus, and can give info about whether a reboot should be blocked +type PrometheusBlockingChecker struct { + // prometheusClient to make prometheus-go-client and api config available + // into the PrometheusBlockingChecker struct + PromClient *alerts.PromClient + // regexp used to get alerts + Filter *regexp.Regexp + // bool to indicate if only firing alerts should be considered + FiringOnly bool + // bool to indicate that we're only blocking on alerts which match the filter + FilterMatchOnly bool +} + +// KubernetesBlockingChecker contains info for connecting +// to k8s, and can give info about whether a reboot should be blocked +type KubernetesBlockingChecker struct { + // client used to contact kubernetes API + Client *kubernetes.Clientset + Nodename string + // lised used to filter pods (podSelector) + Filter []string +} + +// IsBlocked for the prometheus will check if there are active alerts matching +// the arguments given into promclient which would actively block the reboot. +// As of today, no blocker information is shared as a return of the method, +// and the information is simply logged. +func (pb PrometheusBlockingChecker) IsBlocked() bool { + alertNames, err := pb.PromClient.ActiveAlerts(pb.Filter, pb.FiringOnly, pb.FilterMatchOnly) + if err != nil { + log.Warnf("Reboot blocked: prometheus query error: %v", err) + return true + } + count := len(alertNames) + if count > 10 { + alertNames = append(alertNames[:10], "...") + } + if count > 0 { + log.Warnf("Reboot blocked: %d active alerts: %v", count, alertNames) + return true + } + return false +} + +// IsBlocked for the KubernetesBlockingChecker will check if a pod, for the node, is preventing +// the reboot. It will warn in the logs about blocking, but does not return an error. +func (kb KubernetesBlockingChecker) IsBlocked() bool { + fieldSelector := fmt.Sprintf("spec.nodeName=%s,status.phase!=Succeeded,status.phase!=Failed,status.phase!=Unknown", kb.Nodename) + for _, labelSelector := range kb.Filter { + podList, err := kb.Client.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{ + LabelSelector: labelSelector, + FieldSelector: fieldSelector, + Limit: 10}) + if err != nil { + log.Warnf("Reboot blocked: pod query error: %v", err) + return true + } + + if len(podList.Items) > 0 { + podNames := make([]string, 0, len(podList.Items)) + for _, pod := range podList.Items { + podNames = append(podNames, pod.Name) + } + if len(podList.Continue) > 0 { + podNames = append(podNames, "...") + } + log.Warnf("Reboot blocked: matching pods: %v", podNames) + return true + } + } + return false +} diff --git a/pkg/blockers/blockers_test.go b/pkg/blockers/blockers_test.go new file mode 100644 index 000000000..1eddf74a1 --- /dev/null +++ b/pkg/blockers/blockers_test.go @@ -0,0 +1,67 @@ +package blockers + +import ( + "github.com/kubereboot/kured/pkg/alerts" + papi "github.com/prometheus/client_golang/api" + "testing" +) + +type BlockingChecker struct { + blocking bool +} + +func (fbc BlockingChecker) IsBlocked() bool { + return fbc.blocking +} + +func Test_rebootBlocked(t *testing.T) { + noCheckers := []RebootBlocker{} + nonblockingChecker := BlockingChecker{blocking: false} + blockingChecker := BlockingChecker{blocking: true} + + // Instantiate a prometheusClient with a broken_url + promClient, _ := alerts.NewPromClient(papi.Config{Address: "broken_url"}) + brokenPrometheusClient := PrometheusBlockingChecker{PromClient: promClient, Filter: nil, FiringOnly: false} + + type args struct { + blockers []RebootBlocker + } + tests := []struct { + name string + args args + want bool + }{ + { + name: "Do not block on no blocker defined", + args: args{blockers: noCheckers}, + want: false, + }, + { + name: "Ensure a blocker blocks", + args: args{blockers: []RebootBlocker{blockingChecker}}, + want: true, + }, + { + name: "Ensure a non-blocker doesn't block", + args: args{blockers: []RebootBlocker{nonblockingChecker}}, + want: false, + }, + { + name: "Ensure one blocker is enough to block", + args: args{blockers: []RebootBlocker{nonblockingChecker, blockingChecker}}, + want: true, + }, + { + name: "Do block on error contacting prometheus API", + args: args{blockers: []RebootBlocker{brokenPrometheusClient}}, + want: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := RebootBlocked(tt.args.blockers...); got != tt.want { + t.Errorf("rebootBlocked() = %v, want %v", got, tt.want) + } + }) + } +} From 00d8a524ab48cf57a890ef8c2e080fcf70322a3a Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Mon, 30 Sep 2024 20:51:12 +0200 Subject: [PATCH 05/18] Move command line validations in pre function Without this, validations are all over the place. This moves some validations directly into the function, to make the code simpler to read. Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 77 +++++++++++++++++++++--------------------- cmd/kured/main_test.go | 1 + 2 files changed, 40 insertions(+), 38 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 981c22248..a0812c172 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -229,17 +229,22 @@ func NewRootCommand() *cobra.Command { return rootCmd } -// func that checks for deprecated slack-notification-related flags and node labels that do not match +// func that checks for cmd line flags validity func flagCheck(cmd *cobra.Command, args []string) { - if slackHookURL != "" && notifyURL != "" { - log.Warnf("Cannot use both --notify-url and --slack-hook-url flags. Kured will use --notify-url flag only...") + if logFormat == "json" { + log.SetFormatter(&log.JSONFormatter{}) } - if notifyURL != "" { + if nodeID == "" { + log.Fatal("KURED_NODE_ID environment variable required") + } + switch { + case slackHookURL != "" && notifyURL != "": + log.Warnf("Cannot use both --notify-url and --slack-hook-url flags. Kured will use --notify-url flag only...") + case notifyURL != "": notifyURL = stripQuotes(notifyURL) - } else if slackHookURL != "" { - slackHookURL = stripQuotes(slackHookURL) + case slackHookURL != "": log.Warnf("Deprecated flag(s). Please use --notify-url flag instead.") - trataURL, err := url.Parse(slackHookURL) + trataURL, err := url.Parse(stripQuotes(slackHookURL)) if err != nil { log.Warnf("slack-hook-url is not properly formatted... no notification will be sent: %v\n", err) } @@ -249,6 +254,7 @@ func flagCheck(cmd *cobra.Command, args []string) { notifyURL = fmt.Sprintf("slack://%s", strings.Trim(trataURL.Path, "/services/")) } } + var preRebootNodeLabelKeys, postRebootNodeLabelKeys []string for _, label := range preRebootNodeLabels { preRebootNodeLabelKeys = append(preRebootNodeLabelKeys, strings.Split(label, "=")[0]) @@ -261,6 +267,32 @@ func flagCheck(cmd *cobra.Command, args []string) { if !reflect.DeepEqual(preRebootNodeLabelKeys, postRebootNodeLabelKeys) { log.Warnf("pre-reboot-node-labels keys and post-reboot-node-labels keys do not match. This may result in unexpected behaviour.") } + + // Not really validation, could stay in root command if necessary. + // However, using the information here makes it very explicit that + // root is only for the main business logic. + log.Infof("Kubernetes Reboot Daemon: %s", version) + log.Infof("Node ID: %s", nodeID) + log.Infof("Lock Annotation: %s/%s:%s", dsNamespace, dsName, lockAnnotation) + if lockTTL > 0 { + log.Infof("Lock TTL set, lock will expire after: %v", lockTTL) + } else { + log.Info("Lock TTL not set, lock will remain until being released") + } + if lockReleaseDelay > 0 { + log.Infof("Lock release delay set, lock release will be delayed by: %v", lockReleaseDelay) + } else { + log.Info("Lock release delay not set, lock will be released immediately after rebooting") + } + log.Infof("PreferNoSchedule taint: %s", preferNoScheduleTaintName) + log.Infof("Blocking Pod Selectors: %v", podSelectors) + log.Infof("Reboot period %v", period) + log.Infof("Concurrency: %v", concurrency) + log.Infof("Reboot method: %s", rebootMethod) + + if annotateNodes { + log.Infof("Will annotate nodes during kured reboot operations") + } } // stripQuotes removes any literal single or double quote chars that surround a string @@ -685,39 +717,12 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. } func root(cmd *cobra.Command, args []string) { - if logFormat == "json" { - log.SetFormatter(&log.JSONFormatter{}) - } - - log.Infof("Kubernetes Reboot Daemon: %s", version) - - if nodeID == "" { - log.Fatal("KURED_NODE_ID environment variable required") - } window, err := timewindow.New(rebootDays, rebootStart, rebootEnd, timezone) if err != nil { log.Fatalf("Failed to build time window: %v", err) } - - log.Infof("Node ID: %s", nodeID) - log.Infof("Lock Annotation: %s/%s:%s", dsNamespace, dsName, lockAnnotation) - if lockTTL > 0 { - log.Infof("Lock TTL set, lock will expire after: %v", lockTTL) - } else { - log.Info("Lock TTL not set, lock will remain until being released") - } - if lockReleaseDelay > 0 { - log.Infof("Lock release delay set, lock release will be delayed by: %v", lockReleaseDelay) - } else { - log.Info("Lock release delay not set, lock will be released immediately after rebooting") - } - log.Infof("PreferNoSchedule taint: %s", preferNoScheduleTaintName) - log.Infof("Blocking Pod Selectors: %v", podSelectors) log.Infof("Reboot schedule: %v", window) - log.Infof("Reboot period %v", period) - log.Infof("Concurrency: %v", concurrency) - log.Infof("Reboot method: %s", rebootMethod) restartCommand, err := shlex.Split(rebootCommand) if err != nil { @@ -755,10 +760,6 @@ func root(cmd *cobra.Command, args []string) { } } - if annotateNodes { - log.Infof("Will annotate nodes during kured reboot operations") - } - go rebootAsRequired(nodeID, rebooter, checker, window, lockTTL, lockReleaseDelay) go maintainRebootRequiredMetric(nodeID, checker) diff --git a/cmd/kured/main_test.go b/cmd/kured/main_test.go index 542698c63..9d60109d0 100644 --- a/cmd/kured/main_test.go +++ b/cmd/kured/main_test.go @@ -9,6 +9,7 @@ import ( func Test_flagCheck(t *testing.T) { var cmd *cobra.Command var args []string + nodeID = "babar" slackHookURL = "https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET" expected := "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET" flagCheck(cmd, args) From 36e6c8b4d8516676128760434db97487b732a244 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Mon, 30 Sep 2024 21:03:37 +0200 Subject: [PATCH 06/18] Rename variable Without this, the variable name is hard to follow. This fixes it by cleaning up the var name. Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index a0812c172..957bc4f63 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -244,14 +244,14 @@ func flagCheck(cmd *cobra.Command, args []string) { notifyURL = stripQuotes(notifyURL) case slackHookURL != "": log.Warnf("Deprecated flag(s). Please use --notify-url flag instead.") - trataURL, err := url.Parse(stripQuotes(slackHookURL)) + parsedURL, err := url.Parse(stripQuotes(slackHookURL)) if err != nil { log.Warnf("slack-hook-url is not properly formatted... no notification will be sent: %v\n", err) } - if len(strings.Split(strings.Trim(trataURL.Path, "/services/"), "/")) != 3 { + if len(strings.Split(strings.Trim(parsedURL.Path, "/services/"), "/")) != 3 { log.Warnf("slack-hook-url is not properly formatted... no notification will be sent: unexpected number of / in URL\n") } else { - notifyURL = fmt.Sprintf("slack://%s", strings.Trim(trataURL.Path, "/services/")) + notifyURL = fmt.Sprintf("slack://%s", strings.Trim(parsedURL.Path, "/services/")) } } From f43ed1484e2e2fc6f0129307daa30856a373df7c Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Mon, 30 Sep 2024 21:56:52 +0200 Subject: [PATCH 07/18] Cleanup checkers Without this, the checkers are only shell calls: test -f sentinelFile, or sentinelCommand. This changes the behaviour of existing code to test file for sentinelFile checker, and to keep the sentinel command as a command. However, to avoid having validation in the root loop, it moves to use a constructor to cleanup the code. Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 17 ++----- pkg/checkers/checker.go | 87 +++++++++++++++++++++++------------- pkg/checkers/checker_test.go | 4 +- 3 files changed, 60 insertions(+), 48 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 957bc4f63..8524f0eeb 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -28,8 +28,6 @@ import ( "k8s.io/client-go/rest" kubectldrain "k8s.io/kubectl/pkg/drain" - "github.com/google/shlex" - shoutrrr "github.com/containrrr/shoutrrr" "github.com/kubereboot/kured/pkg/alerts" "github.com/kubereboot/kured/pkg/daemonsetlock" @@ -744,20 +742,11 @@ func root(cmd *cobra.Command, args []string) { var checker checkers.Checker // An override of rebootsentinelcommand means a privileged command if rebootSentinelCommand != "" { - log.Infof("Sentinel checker is user provided command: %s", rebootSentinelCommand) - cmd, err := shlex.Split(rebootSentinelCommand) - if err != nil { - log.Fatalf("Error parsing provided sentinel command: %v", err) - } - checker = checkers.NsEnterRebootChecker{ - CustomCheckCommand: cmd, - NamespacePid: 1, - } + log.Infof("Sentinel checker is (privileged) user provided command: %s", rebootSentinelCommand) + checker = checkers.NewCommandChecker(rebootSentinelCommand) } else { log.Infof("Sentinel checker is (unprivileged) testing for the presence of: %s", rebootSentinelFile) - checker = checkers.UnprivilegedRebootChecker{ - CheckCommand: []string{"test", "-f", rebootSentinelFile}, - } + checker = checkers.NewFileRebootChecker(rebootSentinelFile) } go rebootAsRequired(nodeID, rebooter, checker, window, lockTTL, lockReleaseDelay) diff --git a/pkg/checkers/checker.go b/pkg/checkers/checker.go index 3a0fc0123..6a75b959d 100644 --- a/pkg/checkers/checker.go +++ b/pkg/checkers/checker.go @@ -1,58 +1,67 @@ package checkers import ( + "github.com/google/shlex" "github.com/kubereboot/kured/pkg/util" log "github.com/sirupsen/logrus" + "os" "os/exec" ) +// Checker is the standard interface to use to check +// if a reboot is required. Its types must implement a +// CheckRebootRequired method which returns a single boolean +// clarifying whether a reboot is expected or not. type Checker interface { CheckRebootRequired() bool } -// UnprivilegedRebootChecker is the default reboot checker. +// FileRebootChecker is the default reboot checker. // It is unprivileged, and tests the presence of a files -type UnprivilegedRebootChecker struct { - CheckCommand []string +type FileRebootChecker struct { + FilePath string } -// CheckRebootRequired runs the test command of the file +// CheckRebootRequired checks the file presence // needs refactoring to also return an error, instead of leaking it inside the code. // This needs refactoring to get rid of NewCommand // This needs refactoring to only contain file location, instead of CheckCommand -func (rc UnprivilegedRebootChecker) CheckRebootRequired() bool { - cmd := util.NewCommand(rc.CheckCommand[0], rc.CheckCommand[1:]...) - if err := cmd.Run(); err != nil { - switch err := err.(type) { - case *exec.ExitError: - // We assume a non-zero exit code means 'reboot not required', but of course - // the user could have misconfigured the sentinel command or something else - // went wrong during its execution. In that case, not entering a reboot loop - // is the right thing to do, and we are logging stdout/stderr of the command - // so it should be obvious what is wrong. - if cmd.ProcessState.ExitCode() != 1 { - log.Warnf("sentinel command ended with unexpected exit code: %v", cmd.ProcessState.ExitCode()) - } - return false - default: - // Something was grossly misconfigured, such as the command path being wrong. - log.Fatalf("Error invoking sentinel command: %v", err) - } +func (rc FileRebootChecker) CheckRebootRequired() bool { + if _, err := os.Stat(rc.FilePath); err == nil { + return true } - return true + return false } -// NsEnterRebootChecker is using a custom command to check -// if a reboot is required, but therefore needs a pid for entering the namespace, -// on top of the required command. This requires elevation. -type NsEnterRebootChecker struct { - CustomCheckCommand []string - NamespacePid int +// NewFileRebootChecker is the constructor for the file based reboot checker +// TODO: Add extra input validation on filePath string here +func NewFileRebootChecker(filePath string) *FileRebootChecker { + return &FileRebootChecker{ + FilePath: filePath, + } } -func (rc NsEnterRebootChecker) CheckRebootRequired() bool { - privCommand := util.PrivilegedHostCommand(rc.NamespacePid, rc.CustomCheckCommand) - cmd := util.NewCommand(privCommand[0], privCommand[1:]...) +// CommandChecker is using a custom command to check +// if a reboot is required. There are two modes of behaviour, +// if Privileged is granted, the NamespacePid is used to enter +// the given PID's namespace. +type CommandChecker struct { + CheckCommand []string + NamespacePid int + Privileged bool +} + +// CheckRebootRequired for CommandChecker runs a command without returning +// any eventual error. THis should be later refactored to remove the util wrapper +// and return the errors, instead of logging them here. +func (rc CommandChecker) CheckRebootRequired() bool { + var cmdline []string + if rc.Privileged { + cmdline = util.PrivilegedHostCommand(rc.NamespacePid, rc.CheckCommand) + } else { + cmdline = rc.CheckCommand + } + cmd := util.NewCommand(cmdline[0], cmdline[1:]...) if err := cmd.Run(); err != nil { switch err := err.(type) { case *exec.ExitError: @@ -72,3 +81,17 @@ func (rc NsEnterRebootChecker) CheckRebootRequired() bool { } return true } + +// NewCommandChecker is the constructor for the commandChecker, and by default +// runs new commands in a privileged fashion. +func NewCommandChecker(sentinelCommand string) *CommandChecker { + cmd, err := shlex.Split(sentinelCommand) + if err != nil { + log.Fatalf("Error parsing provided sentinel command: %v", err) + } + return &CommandChecker{ + CheckCommand: cmd, + NamespacePid: 1, + Privileged: true, + } +} diff --git a/pkg/checkers/checker_test.go b/pkg/checkers/checker_test.go index f3a689434..4df9870c1 100644 --- a/pkg/checkers/checker_test.go +++ b/pkg/checkers/checker_test.go @@ -32,7 +32,7 @@ func Test_rebootRequired(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - a := UnprivilegedRebootChecker{CheckCommand: tt.args.sentinelCommand} + a := CommandChecker{CheckCommand: tt.args.sentinelCommand, NamespacePid: 1, Privileged: false} if got := a.CheckRebootRequired(); got != tt.want { t.Errorf("rebootRequired() = %v, want %v", got, tt.want) } @@ -61,7 +61,7 @@ func Test_rebootRequired_fatals(t *testing.T) { for _, c := range cases { fatal = false - a := UnprivilegedRebootChecker{CheckCommand: c.param} + a := CommandChecker{CheckCommand: c.param, NamespacePid: 1, Privileged: false} a.CheckRebootRequired() assert.Equal(t, c.expectFatal, fatal) } From 3895a2f6d36b0ec9a592979ed3c45c3bbb8bde0c Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Mon, 30 Sep 2024 20:53:01 +0200 Subject: [PATCH 08/18] Remove nodeID from rebooter interface Without this patch, the rebooter interface has data which is not related to the rebooter interface. This should get removed to make it easier to maintain. The loss comes from the logging, which mentioned the node. In order to not have a regression compared to [1], this ensures that at least the node to be rebooted appears in the main. [1]: https://github.com/kubereboot/kured/pull/134 Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 6 +++--- pkg/reboot/command.go | 7 ++----- pkg/reboot/signal.go | 9 +++++---- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 8524f0eeb..c28a85fcb 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -705,7 +705,7 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. log.Warnf("Error notifying: %v", err) } } - + log.Infof("Triggering reboot for node %v", nodeID) rebooter.Reboot() for { log.Infof("Waiting for reboot") @@ -731,10 +731,10 @@ func root(cmd *cobra.Command, args []string) { switch { case rebootMethod == "command": log.Infof("Reboot command: %s", restartCommand) - rebooter = reboot.CommandRebooter{NodeID: nodeID, RebootCommand: util.PrivilegedHostCommand(1, restartCommand)} + rebooter = reboot.CommandRebooter{RebootCommand: util.PrivilegedHostCommand(1, restartCommand)} case rebootMethod == "signal": log.Infof("Reboot signal: %v", rebootSignal) - rebooter = reboot.SignalRebooter{NodeID: nodeID, Signal: rebootSignal} + rebooter = reboot.SignalRebooter{Signal: rebootSignal} default: log.Fatalf("Invalid reboot-method configured: %s", rebootMethod) } diff --git a/pkg/reboot/command.go b/pkg/reboot/command.go index f353fe5a4..f252bd821 100644 --- a/pkg/reboot/command.go +++ b/pkg/reboot/command.go @@ -7,15 +7,12 @@ import ( // CommandRebooter holds context-information for a command reboot. type CommandRebooter struct { - NodeID string RebootCommand []string } -// NewCommandReboot creates a new command-rebooter which needs full privileges on the host. - -// Reboot triggers the command-reboot. +// Reboot triggers the reboot command func (c CommandRebooter) Reboot() { - log.Infof("Running command: %s for node: %s", c.RebootCommand, c.NodeID) + log.Infof("Invoking command: %s", c.RebootCommand) if err := util.NewCommand(c.RebootCommand[0], c.RebootCommand[1:]...).Run(); err != nil { log.Fatalf("Error invoking reboot command: %v", err) } diff --git a/pkg/reboot/signal.go b/pkg/reboot/signal.go index 7f5a2c314..2ec40f50e 100644 --- a/pkg/reboot/signal.go +++ b/pkg/reboot/signal.go @@ -9,20 +9,21 @@ import ( // SignalRebooter holds context-information for a signal reboot. type SignalRebooter struct { - NodeID string Signal int } -// Reboot triggers the reboot signal using SIGTERMIN+5 +// Reboot triggers the reboot signal func (c SignalRebooter) Reboot() { - log.Infof("Emit reboot-signal for node: %s", c.NodeID) + log.Infof("Invoking signal: %v", c.Signal) process, err := os.FindProcess(1) if err != nil { - log.Fatalf("There was no systemd process found: %v", err) + log.Fatalf("Not running on Unix: %v", err) } err = process.Signal(syscall.Signal(c.Signal)) + // Either PID does not exist, or the signal does not work. Hoping for + // a decent enough error. if err != nil { log.Fatalf("Signal of SIGRTMIN+5 failed: %v", err) } From 42c4b8bc53c5e664ee45d1f8ea8d18d5c585b24d Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Mon, 30 Sep 2024 21:36:29 +0200 Subject: [PATCH 09/18] Revert to use a constructor again Without this, we have no validation of the data in command/signal reboot. This was not a problem in the first refactor, as the constructor was a dummy one, without validation. However, as we refactoed, we now have code in the root method that is validation for the reboot command. This can now be encompassed in the constructor. Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 12 +++--------- pkg/reboot/command.go | 15 ++++++++++++++- pkg/reboot/signal.go | 6 ++++++ 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index c28a85fcb..abe2f53eb 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -35,7 +35,6 @@ import ( "github.com/kubereboot/kured/pkg/reboot" "github.com/kubereboot/kured/pkg/taints" "github.com/kubereboot/kured/pkg/timewindow" - "github.com/kubereboot/kured/pkg/util" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" ) @@ -722,19 +721,14 @@ func root(cmd *cobra.Command, args []string) { } log.Infof("Reboot schedule: %v", window) - restartCommand, err := shlex.Split(rebootCommand) - if err != nil { - log.Fatalf("Error parsing provided reboot command: %v", err) - } - var rebooter reboot.Rebooter switch { case rebootMethod == "command": - log.Infof("Reboot command: %s", restartCommand) - rebooter = reboot.CommandRebooter{RebootCommand: util.PrivilegedHostCommand(1, restartCommand)} + log.Infof("Reboot command: %s", rebootCommand) + rebooter = reboot.NewCommandRebooter(rebootCommand) case rebootMethod == "signal": log.Infof("Reboot signal: %v", rebootSignal) - rebooter = reboot.SignalRebooter{Signal: rebootSignal} + rebooter = reboot.NewSignalRebooter(rebootSignal) default: log.Fatalf("Invalid reboot-method configured: %s", rebootMethod) } diff --git a/pkg/reboot/command.go b/pkg/reboot/command.go index f252bd821..07c17f238 100644 --- a/pkg/reboot/command.go +++ b/pkg/reboot/command.go @@ -1,11 +1,12 @@ package reboot import ( + "github.com/google/shlex" "github.com/kubereboot/kured/pkg/util" log "github.com/sirupsen/logrus" ) -// CommandRebooter holds context-information for a command reboot. +// CommandRebooter holds context-information for a reboot with command type CommandRebooter struct { RebootCommand []string } @@ -17,3 +18,15 @@ func (c CommandRebooter) Reboot() { log.Fatalf("Error invoking reboot command: %v", err) } } + +// NewCommandRebooter is the constructor to create a CommandRebooter from a string not +// yet shell lexed. You can skip this constructor if you parse the data correctly first +// when instantiating a CommandRebooter instance. +func NewCommandRebooter(rebootCommand string) *CommandRebooter { + cmd, err := shlex.Split(rebootCommand) + if err != nil { + log.Fatalf("Error parsing provided reboot command: %v", err) + } + + return &CommandRebooter{RebootCommand: util.PrivilegedHostCommand(1, cmd)} +} diff --git a/pkg/reboot/signal.go b/pkg/reboot/signal.go index 2ec40f50e..ab9adbdbb 100644 --- a/pkg/reboot/signal.go +++ b/pkg/reboot/signal.go @@ -28,3 +28,9 @@ func (c SignalRebooter) Reboot() { log.Fatalf("Signal of SIGRTMIN+5 failed: %v", err) } } + +// NewSignalRebooter is the constructor which sets the signal number. +// The constructor does not yet validate any input. It should be done in a later commit. +func NewSignalRebooter(sig int) *SignalRebooter { + return &SignalRebooter{Signal: sig} +} From a8132a2286c97f94c2532a39400d83039f9a9c18 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Mon, 7 Oct 2024 22:55:40 +0200 Subject: [PATCH 10/18] Remove viper/cobra deps Without this, the main loop is in need of 3 functions to simply parse flags and env variables (excluding input validation). This is a bit more complex than it should, especially since we only need to parse command line flags and env vars. This fixes it by simply using pflags (which we were already using) instead of pflags + viper + cobra (for which we do not have any benefit), and removing all the methods outside the mapping of env var with cli flag. The main code is now far simpler: It handles the reading, parsing, and returning in case of error. As we do not bubble up errors from rebootasRequired yet, this is good enough at this moment. Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 376 ++++++++++++++++++++++------------------- cmd/kured/main_test.go | 54 +++--- cmd/kured/pflags.go | 21 ++- 3 files changed, 231 insertions(+), 220 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index abe2f53eb..56ed3bd65 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -6,6 +6,7 @@ import ( "fmt" "github.com/kubereboot/kured/pkg/blockers" "github.com/kubereboot/kured/pkg/checkers" + "github.com/prometheus/client_golang/prometheus/promhttp" "math/rand" "net/http" "net/url" @@ -13,14 +14,13 @@ import ( "reflect" "regexp" "sort" + "strconv" "strings" "time" papi "github.com/prometheus/client_golang/api" log "github.com/sirupsen/logrus" - "github.com/spf13/cobra" - "github.com/spf13/pflag" - "github.com/spf13/viper" + flag "github.com/spf13/pflag" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -36,7 +36,6 @@ import ( "github.com/kubereboot/kured/pkg/taints" "github.com/kubereboot/kured/pkg/timewindow" "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promhttp" ) var ( @@ -114,162 +113,117 @@ func init() { } func main() { - cmd := NewRootCommand() - - if err := cmd.Execute(); err != nil { - log.Fatal(err) - } -} -// NewRootCommand construct the Cobra root command -func NewRootCommand() *cobra.Command { - rootCmd := &cobra.Command{ - Use: "kured", - Short: "Kubernetes Reboot Daemon", - PersistentPreRunE: bindViper, - PreRun: flagCheck, - Run: root} - - rootCmd.PersistentFlags().StringVar(&nodeID, "node-id", "", + flag.StringVar(&nodeID, "node-id", "", "node name kured runs on, should be passed down from spec.nodeName via KURED_NODE_ID environment variable") - rootCmd.PersistentFlags().BoolVar(&forceReboot, "force-reboot", false, + flag.BoolVar(&forceReboot, "force-reboot", false, "force a reboot even if the drain fails or times out") - rootCmd.PersistentFlags().StringVar(&metricsHost, "metrics-host", "", + flag.StringVar(&metricsHost, "metrics-host", "", "host where metrics will listen") - rootCmd.PersistentFlags().IntVar(&metricsPort, "metrics-port", 8080, + flag.IntVar(&metricsPort, "metrics-port", 8080, "port number where metrics will listen") - rootCmd.PersistentFlags().IntVar(&drainGracePeriod, "drain-grace-period", -1, + flag.IntVar(&drainGracePeriod, "drain-grace-period", -1, "time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used") - rootCmd.PersistentFlags().StringVar(&drainPodSelector, "drain-pod-selector", "", + flag.StringVar(&drainPodSelector, "drain-pod-selector", "", "only drain pods with labels matching the selector (default: '', all pods)") - rootCmd.PersistentFlags().IntVar(&skipWaitForDeleteTimeoutSeconds, "skip-wait-for-delete-timeout", 0, + flag.IntVar(&skipWaitForDeleteTimeoutSeconds, "skip-wait-for-delete-timeout", 0, "when seconds is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node") - rootCmd.PersistentFlags().DurationVar(&drainDelay, "drain-delay", 0, + flag.DurationVar(&drainDelay, "drain-delay", 0, "delay drain for this duration (default: 0, disabled)") - rootCmd.PersistentFlags().DurationVar(&drainTimeout, "drain-timeout", 0, + flag.DurationVar(&drainTimeout, "drain-timeout", 0, "timeout after which the drain is aborted (default: 0, infinite time)") - rootCmd.PersistentFlags().DurationVar(&rebootDelay, "reboot-delay", 0, + flag.DurationVar(&rebootDelay, "reboot-delay", 0, "delay reboot for this duration (default: 0, disabled)") - rootCmd.PersistentFlags().StringVar(&rebootMethod, "reboot-method", "command", + flag.StringVar(&rebootMethod, "reboot-method", "command", "method to use for reboots. Available: command") - rootCmd.PersistentFlags().DurationVar(&period, "period", time.Minute*60, + flag.DurationVar(&period, "period", time.Minute*60, "sentinel check period") - rootCmd.PersistentFlags().StringVar(&dsNamespace, "ds-namespace", "kube-system", + flag.StringVar(&dsNamespace, "ds-namespace", "kube-system", "namespace containing daemonset on which to place lock") - rootCmd.PersistentFlags().StringVar(&dsName, "ds-name", "kured", + flag.StringVar(&dsName, "ds-name", "kured", "name of daemonset on which to place lock") - rootCmd.PersistentFlags().StringVar(&lockAnnotation, "lock-annotation", KuredNodeLockAnnotation, + flag.StringVar(&lockAnnotation, "lock-annotation", KuredNodeLockAnnotation, "annotation in which to record locking node") - rootCmd.PersistentFlags().DurationVar(&lockTTL, "lock-ttl", 0, + flag.DurationVar(&lockTTL, "lock-ttl", 0, "expire lock annotation after this duration (default: 0, disabled)") - rootCmd.PersistentFlags().DurationVar(&lockReleaseDelay, "lock-release-delay", 0, + flag.DurationVar(&lockReleaseDelay, "lock-release-delay", 0, "delay lock release for this duration (default: 0, disabled)") - rootCmd.PersistentFlags().StringVar(&prometheusURL, "prometheus-url", "", + flag.StringVar(&prometheusURL, "prometheus-url", "", "Prometheus instance to probe for active alerts") - rootCmd.PersistentFlags().Var(®expValue{&alertFilter}, "alert-filter-regexp", + flag.Var(®expValue{&alertFilter}, "alert-filter-regexp", "alert names to ignore when checking for active alerts") - rootCmd.PersistentFlags().BoolVar(&alertFilterMatchOnly, "alert-filter-match-only", false, + flag.BoolVar(&alertFilterMatchOnly, "alert-filter-match-only", false, "Only block if the alert-filter-regexp matches active alerts") - rootCmd.PersistentFlags().BoolVar(&alertFiringOnly, "alert-firing-only", false, + flag.BoolVar(&alertFiringOnly, "alert-firing-only", false, "only consider firing alerts when checking for active alerts") - rootCmd.PersistentFlags().StringVar(&rebootSentinelFile, "reboot-sentinel", "/var/run/reboot-required", + flag.StringVar(&rebootSentinelFile, "reboot-sentinel", "/var/run/reboot-required", "path to file whose existence triggers the reboot command") - rootCmd.PersistentFlags().StringVar(&preferNoScheduleTaintName, "prefer-no-schedule-taint", "", + flag.StringVar(&preferNoScheduleTaintName, "prefer-no-schedule-taint", "", "Taint name applied during pending node reboot (to prevent receiving additional pods from other rebooting nodes). Disabled by default. Set e.g. to \"weave.works/kured-node-reboot\" to enable tainting.") - rootCmd.PersistentFlags().StringVar(&rebootSentinelCommand, "reboot-sentinel-command", "", + flag.StringVar(&rebootSentinelCommand, "reboot-sentinel-command", "", "command for which a zero return code will trigger a reboot command") - rootCmd.PersistentFlags().StringVar(&rebootCommand, "reboot-command", "/bin/systemctl reboot", + flag.StringVar(&rebootCommand, "reboot-command", "/bin/systemctl reboot", "command to run when a reboot is required") - rootCmd.PersistentFlags().IntVar(&concurrency, "concurrency", 1, + flag.IntVar(&concurrency, "concurrency", 1, "amount of nodes to concurrently reboot. Defaults to 1") - rootCmd.PersistentFlags().IntVar(&rebootSignal, "reboot-signal", sigTrminPlus5, + flag.IntVar(&rebootSignal, "reboot-signal", sigTrminPlus5, "signal to use for reboot, SIGRTMIN+5 by default.") - - rootCmd.PersistentFlags().StringVar(&slackHookURL, "slack-hook-url", "", + flag.StringVar(&slackHookURL, "slack-hook-url", "", "slack hook URL for reboot notifications [deprecated in favor of --notify-url]") - rootCmd.PersistentFlags().StringVar(&slackUsername, "slack-username", "kured", + flag.StringVar(&slackUsername, "slack-username", "kured", "slack username for reboot notifications") - rootCmd.PersistentFlags().StringVar(&slackChannel, "slack-channel", "", + flag.StringVar(&slackChannel, "slack-channel", "", "slack channel for reboot notifications") - rootCmd.PersistentFlags().StringVar(¬ifyURL, "notify-url", "", + flag.StringVar(¬ifyURL, "notify-url", "", "notify URL for reboot notifications (cannot use with --slack-hook-url flags)") - rootCmd.PersistentFlags().StringVar(&messageTemplateUncordon, "message-template-uncordon", "Node %s rebooted & uncordoned successfully!", + flag.StringVar(&messageTemplateUncordon, "message-template-uncordon", "Node %s rebooted & uncordoned successfully!", "message template used to notify about a node being successfully uncordoned") - rootCmd.PersistentFlags().StringVar(&messageTemplateDrain, "message-template-drain", "Draining node %s", + flag.StringVar(&messageTemplateDrain, "message-template-drain", "Draining node %s", "message template used to notify about a node being drained") - rootCmd.PersistentFlags().StringVar(&messageTemplateReboot, "message-template-reboot", "Rebooting node %s", + flag.StringVar(&messageTemplateReboot, "message-template-reboot", "Rebooting node %s", "message template used to notify about a node being rebooted") - - rootCmd.PersistentFlags().StringArrayVar(&podSelectors, "blocking-pod-selector", nil, + flag.StringArrayVar(&podSelectors, "blocking-pod-selector", nil, "label selector identifying pods whose presence should prevent reboots") - - rootCmd.PersistentFlags().StringSliceVar(&rebootDays, "reboot-days", timewindow.EveryDay, + flag.StringSliceVar(&rebootDays, "reboot-days", timewindow.EveryDay, "schedule reboot on these days") - rootCmd.PersistentFlags().StringVar(&rebootStart, "start-time", "0:00", + flag.StringVar(&rebootStart, "start-time", "0:00", "schedule reboot only after this time of day") - rootCmd.PersistentFlags().StringVar(&rebootEnd, "end-time", "23:59:59", + flag.StringVar(&rebootEnd, "end-time", "23:59:59", "schedule reboot only before this time of day") - rootCmd.PersistentFlags().StringVar(&timezone, "time-zone", "UTC", + flag.StringVar(&timezone, "time-zone", "UTC", "use this timezone for schedule inputs") - - rootCmd.PersistentFlags().BoolVar(&annotateNodes, "annotate-nodes", false, + flag.BoolVar(&annotateNodes, "annotate-nodes", false, "if set, the annotations 'weave.works/kured-reboot-in-progress' and 'weave.works/kured-most-recent-reboot-needed' will be given to nodes undergoing kured reboots") - - rootCmd.PersistentFlags().StringVar(&logFormat, "log-format", "text", + flag.StringVar(&logFormat, "log-format", "text", "use text or json log format") - - rootCmd.PersistentFlags().StringSliceVar(&preRebootNodeLabels, "pre-reboot-node-labels", nil, + flag.StringSliceVar(&preRebootNodeLabels, "pre-reboot-node-labels", nil, "labels to add to nodes before cordoning") - rootCmd.PersistentFlags().StringSliceVar(&postRebootNodeLabels, "post-reboot-node-labels", nil, + flag.StringSliceVar(&postRebootNodeLabels, "post-reboot-node-labels", nil, "labels to add to nodes after uncordoning") - return rootCmd -} + flag.Parse() + + // Load flags from environment variables + LoadFromEnv() + + log.Infof("Kubernetes Reboot Daemon: %s", version) -// func that checks for cmd line flags validity -func flagCheck(cmd *cobra.Command, args []string) { if logFormat == "json" { log.SetFormatter(&log.JSONFormatter{}) } + if nodeID == "" { log.Fatal("KURED_NODE_ID environment variable required") } - switch { - case slackHookURL != "" && notifyURL != "": - log.Warnf("Cannot use both --notify-url and --slack-hook-url flags. Kured will use --notify-url flag only...") - case notifyURL != "": - notifyURL = stripQuotes(notifyURL) - case slackHookURL != "": - log.Warnf("Deprecated flag(s). Please use --notify-url flag instead.") - parsedURL, err := url.Parse(stripQuotes(slackHookURL)) - if err != nil { - log.Warnf("slack-hook-url is not properly formatted... no notification will be sent: %v\n", err) - } - if len(strings.Split(strings.Trim(parsedURL.Path, "/services/"), "/")) != 3 { - log.Warnf("slack-hook-url is not properly formatted... no notification will be sent: unexpected number of / in URL\n") - } else { - notifyURL = fmt.Sprintf("slack://%s", strings.Trim(parsedURL.Path, "/services/")) - } - } + log.Infof("Node ID: %s", nodeID) - var preRebootNodeLabelKeys, postRebootNodeLabelKeys []string - for _, label := range preRebootNodeLabels { - preRebootNodeLabelKeys = append(preRebootNodeLabelKeys, strings.Split(label, "=")[0]) - } - for _, label := range postRebootNodeLabels { - postRebootNodeLabelKeys = append(postRebootNodeLabelKeys, strings.Split(label, "=")[0]) - } - sort.Strings(preRebootNodeLabelKeys) - sort.Strings(postRebootNodeLabelKeys) - if !reflect.DeepEqual(preRebootNodeLabelKeys, postRebootNodeLabelKeys) { - log.Warnf("pre-reboot-node-labels keys and post-reboot-node-labels keys do not match. This may result in unexpected behaviour.") + notifyURL = validateNotificationURL(notifyURL, slackHookURL) + + err := validateNodeLabels(preRebootNodeLabels, postRebootNodeLabels) + if err != nil { + log.Warnf(err.Error()) } - // Not really validation, could stay in root command if necessary. - // However, using the information here makes it very explicit that - // root is only for the main business logic. - log.Infof("Kubernetes Reboot Daemon: %s", version) - log.Infof("Node ID: %s", nodeID) log.Infof("Lock Annotation: %s/%s:%s", dsNamespace, dsName, lockAnnotation) if lockTTL > 0 { log.Infof("Lock TTL set, lock will expire after: %v", lockTTL) @@ -281,62 +235,169 @@ func flagCheck(cmd *cobra.Command, args []string) { } else { log.Info("Lock release delay not set, lock will be released immediately after rebooting") } + log.Infof("PreferNoSchedule taint: %s", preferNoScheduleTaintName) + + // This should be printed from blocker list instead of only blocking pod selectors log.Infof("Blocking Pod Selectors: %v", podSelectors) + log.Infof("Reboot period %v", period) log.Infof("Concurrency: %v", concurrency) - log.Infof("Reboot method: %s", rebootMethod) if annotateNodes { log.Infof("Will annotate nodes during kured reboot operations") } -} -// stripQuotes removes any literal single or double quote chars that surround a string -func stripQuotes(str string) string { - if len(str) > 2 { - firstChar := str[0] - lastChar := str[len(str)-1] - if firstChar == lastChar && (firstChar == '"' || firstChar == '\'') { - return str[1 : len(str)-1] - } + // Now call the rest of the main loop. + window, err := timewindow.New(rebootDays, rebootStart, rebootEnd, timezone) + if err != nil { + log.Fatalf("Failed to build time window: %v", err) + } + log.Infof("Reboot schedule: %v", window) + + log.Infof("Reboot method: %s", rebootMethod) + var rebooter reboot.Rebooter + switch { + case rebootMethod == "command": + log.Infof("Reboot command: %s", rebootCommand) + rebooter = reboot.NewCommandRebooter(rebootCommand) + case rebootMethod == "signal": + log.Infof("Reboot signal: %v", rebootSignal) + rebooter = reboot.NewSignalRebooter(rebootSignal) + default: + log.Fatalf("Invalid reboot-method configured: %s", rebootMethod) + } + + var checker checkers.Checker + // An override of rebootSentinelCommand means a privileged command + if rebootSentinelCommand != "" { + log.Infof("Sentinel checker is (privileged) user provided command: %s", rebootSentinelCommand) + checker = checkers.NewCommandChecker(rebootSentinelCommand) + } else { + log.Infof("Sentinel checker is (unprivileged) testing for the presence of: %s", rebootSentinelFile) + checker = checkers.NewFileRebootChecker(rebootSentinelFile) } - // return the original string if it has a length of zero or one - return str -} -// bindViper initializes viper and binds command flags with environment variables -func bindViper(cmd *cobra.Command, args []string) error { - v := viper.New() + go rebootAsRequired(nodeID, rebooter, checker, window, lockTTL, lockReleaseDelay) + go maintainRebootRequiredMetric(nodeID, checker) - v.SetEnvPrefix(EnvPrefix) - v.AutomaticEnv() - bindFlags(cmd, v) + http.Handle("/metrics", promhttp.Handler()) + log.Fatal(http.ListenAndServe(fmt.Sprintf("%s:%d", metricsHost, metricsPort), nil)) +} + +func validateNodeLabels(preRebootNodeLabels []string, postRebootNodeLabels []string) error { + var preRebootNodeLabelKeys, postRebootNodeLabelKeys []string + for _, label := range preRebootNodeLabels { + preRebootNodeLabelKeys = append(preRebootNodeLabelKeys, strings.Split(label, "=")[0]) + } + for _, label := range postRebootNodeLabels { + postRebootNodeLabelKeys = append(postRebootNodeLabelKeys, strings.Split(label, "=")[0]) + } + sort.Strings(preRebootNodeLabelKeys) + sort.Strings(postRebootNodeLabelKeys) + if !reflect.DeepEqual(preRebootNodeLabelKeys, postRebootNodeLabelKeys) { + return fmt.Errorf("pre-reboot-node-labels keys and post-reboot-node-labels keys do not match, resulting in unexpected behaviour") + } return nil } -// bindFlags binds each cobra flag to its associated viper configuration (environment variable) -func bindFlags(cmd *cobra.Command, v *viper.Viper) { - cmd.Flags().VisitAll(func(f *pflag.Flag) { - // Environment variables can't have dashes in them, so bind them to their equivalent keys with underscores - if strings.Contains(f.Name, "-") { - v.BindEnv(f.Name, flagToEnvVar(f.Name)) +func validateNotificationURL(notifyURL string, slackHookURL string) string { + switch { + case slackHookURL != "" && notifyURL != "": + log.Warnf("Cannot use both --notify-url (given: %v) and --slack-hook-url (given: %v) flags. Kured will only use --notify-url flag", slackHookURL, notifyURL) + return validateNotificationURL(notifyURL, "") + case notifyURL != "": + return stripQuotes(notifyURL) + case slackHookURL != "": + log.Warnf("Deprecated flag(s). Please use --notify-url flag instead.") + parsedURL, err := url.Parse(stripQuotes(slackHookURL)) + if err != nil { + log.Warnf("slack-hook-url is not properly formatted... no notification will be sent: %v\n", err) + return "" + } + if len(strings.Split(strings.Trim(parsedURL.Path, "/services/"), "/")) != 3 { + log.Warnf("slack-hook-url is not properly formatted... no notification will be sent: unexpected number of / in URL\n") + return "" } + return fmt.Sprintf("slack://%s", strings.Trim(parsedURL.Path, "/services/")) + } + return "" +} - // Apply the viper config value to the flag when the flag is not set and viper has a value - if !f.Changed && v.IsSet(f.Name) { - val := v.Get(f.Name) - log.Infof("Binding %s command flag to environment variable: %s", f.Name, flagToEnvVar(f.Name)) - cmd.Flags().Set(f.Name, fmt.Sprintf("%v", val)) +// LoadFromEnv attempts to load environment variables corresponding to flags. +// It looks for an environment variable with the uppercase version of the flag name (prefixed by EnvPrefix). +func LoadFromEnv() { + flag.VisitAll(func(f *flag.Flag) { + envVarName := fmt.Sprintf("%s_%s", EnvPrefix, strings.ToUpper(strings.ReplaceAll(f.Name, "-", "_"))) + + if envValue, exists := os.LookupEnv(envVarName); exists { + switch f.Value.Type() { + case "int": + if parsedVal, err := strconv.Atoi(envValue); err == nil { + err := flag.Set(f.Name, strconv.Itoa(parsedVal)) + if err != nil { + fmt.Printf("cannot set flag %s from env var named %s", f.Name, envVarName) + os.Exit(1) + } // Set int flag + } else { + fmt.Printf("Invalid value for env var named %s", envVarName) + os.Exit(1) + } + case "string": + err := flag.Set(f.Name, envValue) + if err != nil { + fmt.Printf("cannot set flag %s from env{%s}: %s\n", f.Name, envVarName, envValue) + os.Exit(1) + } // Set string flag + case "bool": + if parsedVal, err := strconv.ParseBool(envValue); err == nil { + err := flag.Set(f.Name, strconv.FormatBool(parsedVal)) + if err != nil { + fmt.Printf("cannot set flag %s from env{%s}: %s\n", f.Name, envVarName, envValue) + os.Exit(1) + } // Set boolean flag + } else { + fmt.Printf("Invalid value for %s: %s\n", envVarName, envValue) + os.Exit(1) + } + case "duration": + // Set duration from the environment variable (e.g., "1h30m") + if _, err := time.ParseDuration(envValue); err == nil { + flag.Set(f.Name, envValue) + } else { + fmt.Printf("Invalid duration for %s: %s\n", envVarName, envValue) + os.Exit(1) + } + case "regexp": + // For regexp, set it from the environment variable + flag.Set(f.Name, envValue) + case "stringSlice": + // For stringSlice, split the environment variable by commas and set it + err := flag.Set(f.Name, envValue) + if err != nil { + fmt.Printf("cannot set flag %s from env{%s}: %s\n", f.Name, envVarName, envValue) + os.Exit(1) + } + default: + fmt.Printf("Unsupported flag type for %s\n", f.Name) + } } }) + } -// flagToEnvVar converts command flag name to equivalent environment variable name -func flagToEnvVar(flag string) string { - envVarSuffix := strings.ToUpper(strings.ReplaceAll(flag, "-", "_")) - return fmt.Sprintf("%s_%s", EnvPrefix, envVarSuffix) +// stripQuotes removes any literal single or double quote chars that surround a string +func stripQuotes(str string) string { + if len(str) > 2 { + firstChar := str[0] + lastChar := str[len(str)-1] + if firstChar == lastChar && (firstChar == '"' || firstChar == '\'') { + return str[1 : len(str)-1] + } + } + // return the original string if it has a length of zero or one + return str } func holding(lock *daemonsetlock.DaemonSetLock, metadata interface{}, isMultiLock bool) bool { @@ -712,40 +773,3 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. } } } - -func root(cmd *cobra.Command, args []string) { - - window, err := timewindow.New(rebootDays, rebootStart, rebootEnd, timezone) - if err != nil { - log.Fatalf("Failed to build time window: %v", err) - } - log.Infof("Reboot schedule: %v", window) - - var rebooter reboot.Rebooter - switch { - case rebootMethod == "command": - log.Infof("Reboot command: %s", rebootCommand) - rebooter = reboot.NewCommandRebooter(rebootCommand) - case rebootMethod == "signal": - log.Infof("Reboot signal: %v", rebootSignal) - rebooter = reboot.NewSignalRebooter(rebootSignal) - default: - log.Fatalf("Invalid reboot-method configured: %s", rebootMethod) - } - - var checker checkers.Checker - // An override of rebootsentinelcommand means a privileged command - if rebootSentinelCommand != "" { - log.Infof("Sentinel checker is (privileged) user provided command: %s", rebootSentinelCommand) - checker = checkers.NewCommandChecker(rebootSentinelCommand) - } else { - log.Infof("Sentinel checker is (unprivileged) testing for the presence of: %s", rebootSentinelFile) - checker = checkers.NewFileRebootChecker(rebootSentinelFile) - } - - go rebootAsRequired(nodeID, rebooter, checker, window, lockTTL, lockReleaseDelay) - go maintainRebootRequiredMetric(nodeID, checker) - - http.Handle("/metrics", promhttp.Handler()) - log.Fatal(http.ListenAndServe(fmt.Sprintf("%s:%d", metricsHost, metricsPort), nil)) -} diff --git a/cmd/kured/main_test.go b/cmd/kured/main_test.go index 9d60109d0..4687d0e77 100644 --- a/cmd/kured/main_test.go +++ b/cmd/kured/main_test.go @@ -1,47 +1,31 @@ package main import ( - "github.com/spf13/cobra" "reflect" "testing" ) -func Test_flagCheck(t *testing.T) { - var cmd *cobra.Command - var args []string - nodeID = "babar" - slackHookURL = "https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET" - expected := "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET" - flagCheck(cmd, args) - if notifyURL != expected { - t.Errorf("Slack URL Parsing is wrong: expecting %s but got %s\n", expected, notifyURL) - } +func TestValidateNotificationURL(t *testing.T) { - // validate that surrounding quotes are stripped - slackHookURL = "\"https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET\"" - expected = "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET" - flagCheck(cmd, args) - if notifyURL != expected { - t.Errorf("Slack URL Parsing is wrong: expecting %s but got %s\n", expected, notifyURL) - } - slackHookURL = "'https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET'" - expected = "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET" - flagCheck(cmd, args) - if notifyURL != expected { - t.Errorf("Slack URL Parsing is wrong: expecting %s but got %s\n", expected, notifyURL) - } - slackHookURL = "" - notifyURL = "\"teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com\"" - expected = "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com" - flagCheck(cmd, args) - if notifyURL != expected { - t.Errorf("notifyURL Parsing is wrong: expecting %s but got %s\n", expected, notifyURL) + tests := []struct { + name string + slackHookURL string + notifyURL string + expected string + }{ + {"slackHookURL only works fine", "https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET", "", "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"}, + {"slackHookURL and notify URL together only keeps notifyURL", "\"https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET\"", "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com", "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com"}, + {"slackHookURL removes extraneous double quotes", "\"https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET\"", "", "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"}, + {"slackHookURL removes extraneous single quotes", "'https://hooks.slack.com/services/BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET'", "", "slack://BLABLABA12345/IAM931A0VERY/COMPLICATED711854TOKEN1SET"}, + {"notifyURL removes extraneous double quotes", "", "\"teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com\"", "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com"}, + {"notifyURL removes extraneous single quotes", "", "'teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com'", "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com"}, } - notifyURL = "'teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com'" - expected = "teams://79b4XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@acd8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/204cXXXXXXXXXXXXXXXXXXXXXXXXXXXX/a1f8XXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX?host=XXXX.webhook.office.com" - flagCheck(cmd, args) - if notifyURL != expected { - t.Errorf("notifyURL Parsing is wrong: expecting %s but got %s\n", expected, notifyURL) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := validateNotificationURL(tt.notifyURL, tt.slackHookURL); !reflect.DeepEqual(got, tt.expected) { + t.Errorf("validateNotificationURL() = %v, expected %v", got, tt.expected) + } + }) } } diff --git a/cmd/kured/pflags.go b/cmd/kured/pflags.go index b6eb5abe0..ffec3a704 100644 --- a/cmd/kured/pflags.go +++ b/cmd/kured/pflags.go @@ -1,31 +1,34 @@ package main import ( + "fmt" "regexp" ) +// Custom flag type to handle *regexp.Regexp with pflag type regexpValue struct { value **regexp.Regexp } +// String method to return the string representation of the value func (rev *regexpValue) String() string { - if *rev.value == nil { - return "" + if *rev.value != nil { + return (*rev.value).String() } - return (*rev.value).String() + return "" } +// Set method to parse the input string and set the regexp value func (rev *regexpValue) Set(s string) error { - value, err := regexp.Compile(s) + compiledRegexp, err := regexp.Compile(s) if err != nil { - return err + return fmt.Errorf("invalid regular expression: %w", err) } - - *rev.value = value - + *rev.value = compiledRegexp return nil } +// Type method returns the type of the flag as a string func (rev *regexpValue) Type() string { - return "regexp.Regexp" + return "regexp" } From aae5bb6ebb17939797ff09a147fb512022f3826b Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Wed, 9 Oct 2024 20:32:28 +0200 Subject: [PATCH 11/18] Raise the error levels for wrong flag If the notification url configuration is known to be not working, this should be raised as an error, not a warning. Without this, it would be easy to miss a misconfiguration. Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 56ed3bd65..1ab4b8424 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -313,11 +313,11 @@ func validateNotificationURL(notifyURL string, slackHookURL string) string { log.Warnf("Deprecated flag(s). Please use --notify-url flag instead.") parsedURL, err := url.Parse(stripQuotes(slackHookURL)) if err != nil { - log.Warnf("slack-hook-url is not properly formatted... no notification will be sent: %v\n", err) + log.Errorf("slack-hook-url is not properly formatted... no notification will be sent: %v\n", err) return "" } if len(strings.Split(strings.Trim(parsedURL.Path, "/services/"), "/")) != 3 { - log.Warnf("slack-hook-url is not properly formatted... no notification will be sent: unexpected number of / in URL\n") + log.Errorf("slack-hook-url is not properly formatted... no notification will be sent: unexpected number of / in URL\n") return "" } return fmt.Sprintf("slack://%s", strings.Trim(parsedURL.Path, "/services/")) From 104a745305e02d85ce1555b7d2c7d1f44cab1ccf Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Thu, 10 Oct 2024 22:56:33 +0200 Subject: [PATCH 12/18] Make locks more generic Implementation details of lock should not leak into the calling methods. Without this path, calls are a bit more complex and error handling is harder to find. This is a problem for long term maintenance, as it is tougher to refactor the locks without impacting the main. Decoupling the two (main usage of the lock, and the lock themselves) will allow us to introduce other kinds of locks easily. I solve this by inlining into the daemonsetlock package: - including all the methods for managing locks from the main.go functions. Those were mostly doing error handling where code became no-op by introducing multiple daemonsetlock types - adding the lock release delay part of lock info I also did not like the pattern include in Test method, which added a reference to nodeMeta: It was not very clear that Test was storing the current metadata of the node, or was returning the current state. (Metadata here only means unschedulable). The problem I saw was that the metadata was silently mutated from a lock Test method, which was very not obvious. Instead, I picked to explicitly return the lock data instead. I also made it explicit that the Acquire lock method is passing the node metadata as structured information, rather than an interface{}. This is a bit more fragile at runtime, but I prefer having very explicit errors if the locks are incorrect, rather than having to deal with unvalidated data. For the lock release delay, it was part of the rebootasrequired loop, where I believe it makes more sense to be part of the Release method itself, for readability. Yet, it hides the delay into the implementation detail, but it keeps the reboot as required goroutine more readable. Instead of passing the argument rebootDelay as parameter of the rebootasrequired method, this refactor took creation of the lock object in the main loop, close to all the variables, and then pass the lock object to the rebootasrequired. This makes the call for rebootasrequired more clear, and lock is now encompassing everything needed to acquire, release, or get info about the lock. Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 163 +++++-------- pkg/daemonsetlock/daemonsetlock.go | 306 +++++++++++++++--------- pkg/daemonsetlock/daemonsetlock_test.go | 22 +- 3 files changed, 262 insertions(+), 229 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 1ab4b8424..96e229f7d 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -224,18 +224,6 @@ func main() { log.Warnf(err.Error()) } - log.Infof("Lock Annotation: %s/%s:%s", dsNamespace, dsName, lockAnnotation) - if lockTTL > 0 { - log.Infof("Lock TTL set, lock will expire after: %v", lockTTL) - } else { - log.Info("Lock TTL not set, lock will remain until being released") - } - if lockReleaseDelay > 0 { - log.Infof("Lock release delay set, lock release will be delayed by: %v", lockReleaseDelay) - } else { - log.Info("Lock release delay not set, lock will be released immediately after rebooting") - } - log.Infof("PreferNoSchedule taint: %s", preferNoScheduleTaintName) // This should be printed from blocker list instead of only blocking pod selectors @@ -278,7 +266,30 @@ func main() { checker = checkers.NewFileRebootChecker(rebootSentinelFile) } - go rebootAsRequired(nodeID, rebooter, checker, window, lockTTL, lockReleaseDelay) + config, err := rest.InClusterConfig() + if err != nil { + log.Fatal(err) + } + + client, err := kubernetes.NewForConfig(config) + if err != nil { + log.Fatal(err) + } + + log.Infof("Lock Annotation: %s/%s:%s", dsNamespace, dsName, lockAnnotation) + if lockTTL > 0 { + log.Infof("Lock TTL set, lock will expire after: %v", lockTTL) + } else { + log.Info("Lock TTL not set, lock will remain until being released") + } + if lockReleaseDelay > 0 { + log.Infof("Lock release delay set, lock release will be delayed by: %v", lockReleaseDelay) + } else { + log.Info("Lock release delay not set, lock will be released immediately after rebooting") + } + lock := daemonsetlock.New(client, nodeID, dsNamespace, dsName, lockAnnotation, lockTTL, concurrency, lockReleaseDelay) + + go rebootAsRequired(nodeID, rebooter, checker, window, lock, client) go maintainRebootRequiredMetric(nodeID, checker) http.Handle("/metrics", promhttp.Handler()) @@ -400,68 +411,6 @@ func stripQuotes(str string) string { return str } -func holding(lock *daemonsetlock.DaemonSetLock, metadata interface{}, isMultiLock bool) bool { - var holding bool - var err error - if isMultiLock { - holding, err = lock.TestMultiple() - } else { - holding, err = lock.Test(metadata) - } - if err != nil { - log.Fatalf("Error testing lock: %v", err) - } - if holding { - log.Infof("Holding lock") - } - return holding -} - -func acquire(lock *daemonsetlock.DaemonSetLock, metadata interface{}, TTL time.Duration, maxOwners int) bool { - var holding bool - var holder string - var err error - if maxOwners > 1 { - var holders []string - holding, holders, err = lock.AcquireMultiple(metadata, TTL, maxOwners) - holder = strings.Join(holders, ",") - } else { - holding, holder, err = lock.Acquire(metadata, TTL) - } - switch { - case err != nil: - log.Fatalf("Error acquiring lock: %v", err) - return false - case !holding: - log.Warnf("Lock already held: %v", holder) - return false - default: - log.Infof("Acquired reboot lock") - return true - } -} - -func throttle(releaseDelay time.Duration) { - if releaseDelay > 0 { - log.Infof("Delaying lock release by %v", releaseDelay) - time.Sleep(releaseDelay) - } -} - -func release(lock *daemonsetlock.DaemonSetLock, isMultiLock bool) { - log.Infof("Releasing lock") - - var err error - if isMultiLock { - err = lock.ReleaseMultiple() - } else { - err = lock.Release() - } - if err != nil { - log.Fatalf("Error releasing lock: %v", err) - } -} - func drain(client *kubernetes.Clientset, node *v1.Node) error { nodename := node.GetName() @@ -537,11 +486,6 @@ func maintainRebootRequiredMetric(nodeID string, checker checkers.Checker) { } } -// nodeMeta is used to remember information across reboots -type nodeMeta struct { - Unschedulable bool `json:"unschedulable"` -} - func addNodeAnnotations(client *kubernetes.Clientset, nodeID string, annotations map[string]string) error { node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{}) if err != nil { @@ -616,30 +560,23 @@ func updateNodeLabels(client *kubernetes.Clientset, node *v1.Node, labels []stri } } -func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers.Checker, window *timewindow.TimeWindow, TTL time.Duration, releaseDelay time.Duration) { - config, err := rest.InClusterConfig() - if err != nil { - log.Fatal(err) - } - - client, err := kubernetes.NewForConfig(config) - if err != nil { - log.Fatal(err) - } - - lock := daemonsetlock.New(client, nodeID, dsNamespace, dsName, lockAnnotation) +func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers.Checker, window *timewindow.TimeWindow, lock daemonsetlock.Lock, client *kubernetes.Clientset) { - nodeMeta := nodeMeta{} source := rand.NewSource(time.Now().UnixNano()) tick := delaytick.New(source, 1*time.Minute) for range tick { - if holding(lock, &nodeMeta, concurrency > 1) { + holding, lockData, err := lock.Holding() + if err != nil { + log.Errorf("Error testing lock: %v", err) + } + if holding { node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{}) if err != nil { log.Errorf("Error retrieving node object via k8s API: %v", err) continue } - if !nodeMeta.Unschedulable { + + if !lockData.Metadata.Unschedulable { err = uncordon(client, node) if err != nil { log.Errorf("Unable to uncordon %s: %v, will continue to hold lock and retry uncordon", node.GetName(), err) @@ -665,8 +602,12 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. } } } - throttle(releaseDelay) - release(lock, concurrency > 1) + + err = lock.Release() + if err != nil { + log.Errorf("Error releasing lock, will retry: %v", err) + continue + } break } else { break @@ -705,7 +646,8 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. if err != nil { log.Fatalf("Error retrieving node object via k8s API: %v", err) } - nodeMeta.Unschedulable = node.Spec.Unschedulable + + nodeMeta := daemonsetlock.NodeMeta{Unschedulable: node.Spec.Unschedulable} var timeNowString string if annotateNodes { @@ -738,17 +680,32 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. } log.Infof("Reboot required%s", rebootRequiredBlockCondition) - if !holding(lock, &nodeMeta, concurrency > 1) && !acquire(lock, &nodeMeta, TTL, concurrency) { - // Prefer to not schedule pods onto this node to avoid draing the same pod multiple times. - preferNoScheduleTaint.Enable() - continue + holding, _, err := lock.Holding() + if err != nil { + log.Errorf("Error testing lock: %v", err) + } + + if !holding { + acquired, holder, err := lock.Acquire(nodeMeta) + if err != nil { + log.Errorf("Error acquiring lock: %v", err) + } + if !acquired { + log.Warnf("Lock already held: %v", holder) + // Prefer to not schedule pods onto this node to avoid draing the same pod multiple times. + preferNoScheduleTaint.Enable() + continue + } } err = drain(client, node) if err != nil { if !forceReboot { log.Errorf("Unable to cordon or drain %s: %v, will release lock and retry cordon and drain before rebooting when lock is next acquired", node.GetName(), err) - release(lock, concurrency > 1) + err = lock.Release() + if err != nil { + log.Errorf("Error releasing lock: %v", err) + } log.Infof("Performing a best-effort uncordon after failed cordon and drain") uncordon(client, node) continue diff --git a/pkg/daemonsetlock/daemonsetlock.go b/pkg/daemonsetlock/daemonsetlock.go index 2d8949b5a..f9386bb06 100644 --- a/pkg/daemonsetlock/daemonsetlock.go +++ b/pkg/daemonsetlock/daemonsetlock.go @@ -4,6 +4,8 @@ import ( "context" "encoding/json" "fmt" + log "github.com/sirupsen/logrus" + "strings" "time" v1 "k8s.io/api/apps/v1" @@ -18,6 +20,21 @@ const ( k8sAPICallRetryTimeout = 5 * time.Minute // How long to wait until we determine that the k8s API is definitively unavailable ) +type Lock interface { + Acquire(NodeMeta) (bool, string, error) + Release() error + Holding() (bool, LockAnnotationValue, error) +} + +type GenericLock struct { + TTL time.Duration + releaseDelay time.Duration +} + +type NodeMeta struct { + Unschedulable bool `json:"unschedulable"` +} + // DaemonSetLock holds all necessary information to do actions // on the kured ds which holds lock info through annotations. type DaemonSetLock struct { @@ -28,25 +45,92 @@ type DaemonSetLock struct { annotation string } -type lockAnnotationValue struct { +// DaemonSetSingleLock holds all necessary information to do actions +// on the kured ds which holds lock info through annotations. +type DaemonSetSingleLock struct { + GenericLock + DaemonSetLock +} + +// DaemonSetMultiLock holds all necessary information to do actions +// on the kured ds which holds lock info through annotations, valid +// for multiple nodes +type DaemonSetMultiLock struct { + GenericLock + DaemonSetLock + maxOwners int +} + +// LockAnnotationValue contains the lock data, +// which allows persistence across reboots, particularily recording if the +// node was already unschedulable before kured reboot. +// To be modified when using another type of lock storage. +type LockAnnotationValue struct { NodeID string `json:"nodeID"` - Metadata interface{} `json:"metadata,omitempty"` + Metadata NodeMeta `json:"metadata,omitempty"` Created time.Time `json:"created"` TTL time.Duration `json:"TTL"` } type multiLockAnnotationValue struct { MaxOwners int `json:"maxOwners"` - LockAnnotations []lockAnnotationValue `json:"locks"` + LockAnnotations []LockAnnotationValue `json:"locks"` } // New creates a daemonsetLock object containing the necessary data for follow up k8s requests -func New(client *kubernetes.Clientset, nodeID, namespace, name, annotation string) *DaemonSetLock { - return &DaemonSetLock{client, nodeID, namespace, name, annotation} +func New(client *kubernetes.Clientset, nodeID, namespace, name, annotation string, TTL time.Duration, concurrency int, lockReleaseDelay time.Duration) Lock { + if concurrency > 1 { + return &DaemonSetMultiLock{ + GenericLock: GenericLock{ + TTL: TTL, + releaseDelay: lockReleaseDelay, + }, + DaemonSetLock: DaemonSetLock{ + client: client, + nodeID: nodeID, + namespace: namespace, + name: name, + annotation: annotation, + }, + maxOwners: concurrency, + } + } else { + return &DaemonSetSingleLock{ + GenericLock: GenericLock{ + TTL: TTL, + releaseDelay: lockReleaseDelay, + }, + DaemonSetLock: DaemonSetLock{ + client: client, + nodeID: nodeID, + namespace: namespace, + name: name, + annotation: annotation, + }, + } + } +} + +// GetDaemonSet returns the named DaemonSet resource from the DaemonSetLock's configured client +func (dsl *DaemonSetLock) GetDaemonSet(sleep, timeout time.Duration) (*v1.DaemonSet, error) { + var ds *v1.DaemonSet + var lastError error + err := wait.PollImmediate(sleep, timeout, func() (bool, error) { + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + if ds, lastError = dsl.client.AppsV1().DaemonSets(dsl.namespace).Get(ctx, dsl.name, metav1.GetOptions{}); lastError != nil { + return false, nil + } + return true, nil + }) + if err != nil { + return nil, fmt.Errorf("Timed out trying to get daemonset %s in namespace %s: %v", dsl.name, dsl.namespace, lastError) + } + return ds, nil } // Acquire attempts to annotate the kured daemonset with lock info from instantiated DaemonSetLock using client-go -func (dsl *DaemonSetLock) Acquire(metadata interface{}, TTL time.Duration) (bool, string, error) { +func (dsl *DaemonSetSingleLock) Acquire(nodeMetadata NodeMeta) (bool, string, error) { for { ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout) if err != nil { @@ -55,7 +139,7 @@ func (dsl *DaemonSetLock) Acquire(metadata interface{}, TTL time.Duration) (bool valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation] if exists { - value := lockAnnotationValue{} + value := LockAnnotationValue{} if err := json.Unmarshal([]byte(valueString), &value); err != nil { return false, "", err } @@ -68,7 +152,7 @@ func (dsl *DaemonSetLock) Acquire(metadata interface{}, TTL time.Duration) (bool if ds.ObjectMeta.Annotations == nil { ds.ObjectMeta.Annotations = make(map[string]string) } - value := lockAnnotationValue{NodeID: dsl.nodeID, Metadata: metadata, Created: time.Now().UTC(), TTL: TTL} + value := LockAnnotationValue{NodeID: dsl.nodeID, Metadata: nodeMetadata, Created: time.Now().UTC(), TTL: dsl.TTL} valueBytes, err := json.Marshal(&value) if err != nil { return false, "", err @@ -89,49 +173,78 @@ func (dsl *DaemonSetLock) Acquire(metadata interface{}, TTL time.Duration) (bool } } -// AcquireMultiple creates and annotates the daemonset with a multiple owner lock -func (dsl *DaemonSetLock) AcquireMultiple(metadata interface{}, TTL time.Duration, maxOwners int) (bool, []string, error) { +// Test attempts to check the kured daemonset lock status (existence, expiry) from instantiated DaemonSetLock using client-go +func (dsl *DaemonSetSingleLock) Holding() (bool, LockAnnotationValue, error) { + var lockData LockAnnotationValue + ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout) + if err != nil { + return false, lockData, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err) + } + + valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation] + if exists { + value := LockAnnotationValue{} + if err := json.Unmarshal([]byte(valueString), &value); err != nil { + return false, lockData, err + } + + if !ttlExpired(value.Created, value.TTL) { + return value.NodeID == dsl.nodeID, value, nil + } + } + + return false, lockData, nil +} + +// Release attempts to remove the lock data from the kured ds annotations using client-go +func (dsl *DaemonSetSingleLock) Release() error { + if dsl.releaseDelay > 0 { + log.Infof("Waiting %v before releasing lock", dsl.releaseDelay) + time.Sleep(dsl.releaseDelay) + } for { ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout) if err != nil { - return false, []string{}, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err) + return fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err) } - annotation := multiLockAnnotationValue{} valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation] if exists { - if err := json.Unmarshal([]byte(valueString), &annotation); err != nil { - return false, []string{}, fmt.Errorf("error getting multi lock: %w", err) + value := LockAnnotationValue{} + if err := json.Unmarshal([]byte(valueString), &value); err != nil { + return err } - } - lockPossible, newAnnotation := dsl.canAcquireMultiple(annotation, metadata, TTL, maxOwners) - if !lockPossible { - return false, nodeIDsFromMultiLock(newAnnotation), nil + if value.NodeID != dsl.nodeID { + return fmt.Errorf("Not lock holder: %v", value.NodeID) + } + } else { + return fmt.Errorf("Lock not held") } - if ds.ObjectMeta.Annotations == nil { - ds.ObjectMeta.Annotations = make(map[string]string) - } - newAnnotationBytes, err := json.Marshal(&newAnnotation) - if err != nil { - return false, []string{}, fmt.Errorf("error marshalling new annotation lock: %w", err) - } - ds.ObjectMeta.Annotations[dsl.annotation] = string(newAnnotationBytes) + delete(ds.ObjectMeta.Annotations, dsl.annotation) - _, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.Background(), ds, metav1.UpdateOptions{}) + _, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.TODO(), ds, metav1.UpdateOptions{}) if err != nil { if se, ok := err.(*errors.StatusError); ok && se.ErrStatus.Reason == metav1.StatusReasonConflict { + // Something else updated the resource between us reading and writing - try again soon time.Sleep(time.Second) continue } else { - return false, []string{}, fmt.Errorf("error updating daemonset with multi lock: %w", err) + return err } } - return true, nodeIDsFromMultiLock(newAnnotation), nil + return nil } } +func ttlExpired(created time.Time, ttl time.Duration) bool { + if ttl > 0 && time.Since(created) >= ttl { + return true + } + return false +} + func nodeIDsFromMultiLock(annotation multiLockAnnotationValue) []string { nodeIDs := make([]string, 0, len(annotation.LockAnnotations)) for _, nodeLock := range annotation.LockAnnotations { @@ -140,7 +253,7 @@ func nodeIDsFromMultiLock(annotation multiLockAnnotationValue) []string { return nodeIDs } -func (dsl *DaemonSetLock) canAcquireMultiple(annotation multiLockAnnotationValue, metadata interface{}, TTL time.Duration, maxOwners int) (bool, multiLockAnnotationValue) { +func (dsl *DaemonSetLock) canAcquireMultiple(annotation multiLockAnnotationValue, metadata NodeMeta, TTL time.Duration, maxOwners int) (bool, multiLockAnnotationValue) { newAnnotation := multiLockAnnotationValue{MaxOwners: maxOwners} freeSpace := false if annotation.LockAnnotations == nil || len(annotation.LockAnnotations) < maxOwners { @@ -162,7 +275,7 @@ func (dsl *DaemonSetLock) canAcquireMultiple(annotation multiLockAnnotationValue if freeSpace { newAnnotation.LockAnnotations = append( newAnnotation.LockAnnotations, - lockAnnotationValue{ + LockAnnotationValue{ NodeID: dsl.nodeID, Metadata: metadata, Created: time.Now().UTC(), @@ -175,92 +288,80 @@ func (dsl *DaemonSetLock) canAcquireMultiple(annotation multiLockAnnotationValue return false, multiLockAnnotationValue{} } -// Test attempts to check the kured daemonset lock status (existence, expiry) from instantiated DaemonSetLock using client-go -func (dsl *DaemonSetLock) Test(metadata interface{}) (bool, error) { - ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout) - if err != nil { - return false, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err) - } +// Acquire creates and annotates the daemonset with a multiple owner lock +func (dsl *DaemonSetMultiLock) Acquire(nodeMetaData NodeMeta) (bool, string, error) { + for { + ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout) + if err != nil { + return false, "", fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err) + } - valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation] - if exists { - value := lockAnnotationValue{Metadata: metadata} - if err := json.Unmarshal([]byte(valueString), &value); err != nil { - return false, err + annotation := multiLockAnnotationValue{} + valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation] + if exists { + if err := json.Unmarshal([]byte(valueString), &annotation); err != nil { + return false, "", fmt.Errorf("error getting multi lock: %w", err) + } } - if !ttlExpired(value.Created, value.TTL) { - return value.NodeID == dsl.nodeID, nil + lockPossible, newAnnotation := dsl.canAcquireMultiple(annotation, nodeMetaData, dsl.TTL, dsl.maxOwners) + if !lockPossible { + return false, strings.Join(nodeIDsFromMultiLock(newAnnotation), ","), nil } - } - return false, nil + if ds.ObjectMeta.Annotations == nil { + ds.ObjectMeta.Annotations = make(map[string]string) + } + newAnnotationBytes, err := json.Marshal(&newAnnotation) + if err != nil { + return false, "", fmt.Errorf("error marshalling new annotation lock: %w", err) + } + ds.ObjectMeta.Annotations[dsl.annotation] = string(newAnnotationBytes) + + _, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.Background(), ds, metav1.UpdateOptions{}) + if err != nil { + if se, ok := err.(*errors.StatusError); ok && se.ErrStatus.Reason == metav1.StatusReasonConflict { + time.Sleep(time.Second) + continue + } else { + return false, "", fmt.Errorf("error updating daemonset with multi lock: %w", err) + } + } + return true, strings.Join(nodeIDsFromMultiLock(newAnnotation), ","), nil + } } // TestMultiple attempts to check the kured daemonset lock status for multi locks -func (dsl *DaemonSetLock) TestMultiple() (bool, error) { +func (dsl *DaemonSetMultiLock) Holding() (bool, LockAnnotationValue, error) { + var lockdata LockAnnotationValue ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout) if err != nil { - return false, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err) + return false, lockdata, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err) } valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation] if exists { value := multiLockAnnotationValue{} if err := json.Unmarshal([]byte(valueString), &value); err != nil { - return false, err + return false, lockdata, err } for _, nodeLock := range value.LockAnnotations { if nodeLock.NodeID == dsl.nodeID && !ttlExpired(nodeLock.Created, nodeLock.TTL) { - return true, nil + return true, nodeLock, nil } } } - return false, nil + return false, lockdata, nil } -// Release attempts to remove the lock data from the kured ds annotations using client-go -func (dsl *DaemonSetLock) Release() error { - for { - ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout) - if err != nil { - return fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %w", dsl.name, dsl.namespace, err) - } - - valueString, exists := ds.ObjectMeta.Annotations[dsl.annotation] - if exists { - value := lockAnnotationValue{} - if err := json.Unmarshal([]byte(valueString), &value); err != nil { - return err - } - - if value.NodeID != dsl.nodeID { - return fmt.Errorf("Not lock holder: %v", value.NodeID) - } - } else { - return fmt.Errorf("Lock not held") - } - - delete(ds.ObjectMeta.Annotations, dsl.annotation) - - _, err = dsl.client.AppsV1().DaemonSets(dsl.namespace).Update(context.TODO(), ds, metav1.UpdateOptions{}) - if err != nil { - if se, ok := err.(*errors.StatusError); ok && se.ErrStatus.Reason == metav1.StatusReasonConflict { - // Something else updated the resource between us reading and writing - try again soon - time.Sleep(time.Second) - continue - } else { - return err - } - } - return nil +// Release attempts to remove the lock data for a single node from the multi node annotation +func (dsl *DaemonSetMultiLock) Release() error { + if dsl.releaseDelay > 0 { + log.Infof("Waiting %v before releasing lock", dsl.releaseDelay) + time.Sleep(dsl.releaseDelay) } -} - -// ReleaseMultiple attempts to remove the lock data from the kured ds annotations using client-go -func (dsl *DaemonSetLock) ReleaseMultiple() error { for { ds, err := dsl.GetDaemonSet(k8sAPICallRetrySleep, k8sAPICallRetryTimeout) if err != nil { @@ -307,28 +408,3 @@ func (dsl *DaemonSetLock) ReleaseMultiple() error { return nil } } - -// GetDaemonSet returns the named DaemonSet resource from the DaemonSetLock's configured client -func (dsl *DaemonSetLock) GetDaemonSet(sleep, timeout time.Duration) (*v1.DaemonSet, error) { - var ds *v1.DaemonSet - var lastError error - err := wait.PollImmediate(sleep, timeout, func() (bool, error) { - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() - if ds, lastError = dsl.client.AppsV1().DaemonSets(dsl.namespace).Get(ctx, dsl.name, metav1.GetOptions{}); lastError != nil { - return false, nil - } - return true, nil - }) - if err != nil { - return nil, fmt.Errorf("Timed out trying to get daemonset %s in namespace %s: %v", dsl.name, dsl.namespace, lastError) - } - return ds, nil -} - -func ttlExpired(created time.Time, ttl time.Duration) bool { - if ttl > 0 && time.Since(created) >= ttl { - return true - } - return false -} diff --git a/pkg/daemonsetlock/daemonsetlock_test.go b/pkg/daemonsetlock/daemonsetlock_test.go index b21bcc23c..f68a81e28 100644 --- a/pkg/daemonsetlock/daemonsetlock_test.go +++ b/pkg/daemonsetlock/daemonsetlock_test.go @@ -66,7 +66,7 @@ func TestCanAcquireMultiple(t *testing.T) { current: multiLockAnnotationValue{}, desired: multiLockAnnotationValue{ MaxOwners: 2, - LockAnnotations: []lockAnnotationValue{ + LockAnnotations: []LockAnnotationValue{ {NodeID: node1Name}, }, }, @@ -80,13 +80,13 @@ func TestCanAcquireMultiple(t *testing.T) { maxOwners: 2, current: multiLockAnnotationValue{ MaxOwners: 2, - LockAnnotations: []lockAnnotationValue{ + LockAnnotations: []LockAnnotationValue{ {NodeID: node2Name}, }, }, desired: multiLockAnnotationValue{ MaxOwners: 2, - LockAnnotations: []lockAnnotationValue{ + LockAnnotations: []LockAnnotationValue{ {NodeID: node1Name}, {NodeID: node2Name}, }, @@ -101,7 +101,7 @@ func TestCanAcquireMultiple(t *testing.T) { maxOwners: 2, current: multiLockAnnotationValue{ MaxOwners: 2, - LockAnnotations: []lockAnnotationValue{ + LockAnnotations: []LockAnnotationValue{ { NodeID: node2Name, Created: time.Now().UTC().Add(-1 * time.Minute), @@ -116,7 +116,7 @@ func TestCanAcquireMultiple(t *testing.T) { }, desired: multiLockAnnotationValue{ MaxOwners: 2, - LockAnnotations: []lockAnnotationValue{ + LockAnnotations: []LockAnnotationValue{ {NodeID: node2Name}, {NodeID: node3Name}, }, @@ -131,7 +131,7 @@ func TestCanAcquireMultiple(t *testing.T) { maxOwners: 2, current: multiLockAnnotationValue{ MaxOwners: 2, - LockAnnotations: []lockAnnotationValue{ + LockAnnotations: []LockAnnotationValue{ { NodeID: node2Name, Created: time.Now().UTC().Add(-1 * time.Hour), @@ -146,7 +146,7 @@ func TestCanAcquireMultiple(t *testing.T) { }, desired: multiLockAnnotationValue{ MaxOwners: 2, - LockAnnotations: []lockAnnotationValue{ + LockAnnotations: []LockAnnotationValue{ {NodeID: node1Name}, {NodeID: node3Name}, }, @@ -161,7 +161,7 @@ func TestCanAcquireMultiple(t *testing.T) { maxOwners: 2, current: multiLockAnnotationValue{ MaxOwners: 2, - LockAnnotations: []lockAnnotationValue{ + LockAnnotations: []LockAnnotationValue{ { NodeID: node2Name, Created: time.Now().UTC().Add(-1 * time.Hour), @@ -176,17 +176,17 @@ func TestCanAcquireMultiple(t *testing.T) { }, desired: multiLockAnnotationValue{ MaxOwners: 2, - LockAnnotations: []lockAnnotationValue{ + LockAnnotations: []LockAnnotationValue{ {NodeID: node1Name}, }, }, lockPossible: true, }, } - + nm := NodeMeta{Unschedulable: false} for _, testCase := range testCases { t.Run(testCase.name, func(t *testing.T) { - lockPossible, actual := testCase.daemonSetLock.canAcquireMultiple(testCase.current, struct{}{}, time.Minute, testCase.maxOwners) + lockPossible, actual := testCase.daemonSetLock.canAcquireMultiple(testCase.current, nm, time.Minute, testCase.maxOwners) if lockPossible != testCase.lockPossible { t.Fatalf( "unexpected result for lock possible (got %t expected %t new annotation %v", From d8b9e31ac96a73065cb84819c0c05ff73d7c05b2 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Sat, 19 Oct 2024 10:52:41 +0200 Subject: [PATCH 13/18] Refactor constructors Without this, a bit of the validation is done in main, while the rest is done in each constructor. This fixes it by create a new global constructor in checkers/reboot to solve all the cases and bubble up the errors. I prefered keeping the old constructors, and calling them, this way someone wanting to have a fork of the code could still create directly the good checker/rebooter, without the arbitrary decisions taken by the generic constructor. However, kured is not a library, and was never intended to be usable in forks, so we might want to reconsider is part 2 of the refactor. Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 40 +++++++++++++----------------------- pkg/checkers/checker.go | 31 +++++++++++++++++++--------- pkg/checkers/checker_test.go | 4 ++-- pkg/reboot/command.go | 10 ++++++--- pkg/reboot/reboot.go | 21 +++++++++++++++++++ pkg/reboot/signal.go | 8 ++++++-- 6 files changed, 71 insertions(+), 43 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 96e229f7d..96b754130 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -244,26 +244,14 @@ func main() { log.Infof("Reboot schedule: %v", window) log.Infof("Reboot method: %s", rebootMethod) - var rebooter reboot.Rebooter - switch { - case rebootMethod == "command": - log.Infof("Reboot command: %s", rebootCommand) - rebooter = reboot.NewCommandRebooter(rebootCommand) - case rebootMethod == "signal": - log.Infof("Reboot signal: %v", rebootSignal) - rebooter = reboot.NewSignalRebooter(rebootSignal) - default: - log.Fatalf("Invalid reboot-method configured: %s", rebootMethod) - } - - var checker checkers.Checker - // An override of rebootSentinelCommand means a privileged command - if rebootSentinelCommand != "" { - log.Infof("Sentinel checker is (privileged) user provided command: %s", rebootSentinelCommand) - checker = checkers.NewCommandChecker(rebootSentinelCommand) - } else { - log.Infof("Sentinel checker is (unprivileged) testing for the presence of: %s", rebootSentinelFile) - checker = checkers.NewFileRebootChecker(rebootSentinelFile) + rebooter, err := reboot.NewRebooter(rebootMethod, rebootCommand, rebootSignal) + if err != nil { + log.Fatalf("Failed to build rebooter: %v", err) + } + + rebootChecker, err := checkers.NewRebootChecker(rebootSentinelCommand, rebootSentinelFile) + if err != nil { + log.Fatalf("Failed to build reboot checker: %v", err) } config, err := rest.InClusterConfig() @@ -289,8 +277,8 @@ func main() { } lock := daemonsetlock.New(client, nodeID, dsNamespace, dsName, lockAnnotation, lockTTL, concurrency, lockReleaseDelay) - go rebootAsRequired(nodeID, rebooter, checker, window, lock, client) - go maintainRebootRequiredMetric(nodeID, checker) + go rebootAsRequired(nodeID, rebooter, rebootChecker, window, lock, client) + go maintainRebootRequiredMetric(nodeID, rebootChecker) http.Handle("/metrics", promhttp.Handler()) log.Fatal(http.ListenAndServe(fmt.Sprintf("%s:%d", metricsHost, metricsPort), nil)) @@ -477,7 +465,7 @@ func uncordon(client *kubernetes.Clientset, node *v1.Node) error { func maintainRebootRequiredMetric(nodeID string, checker checkers.Checker) { for { - if checker.CheckRebootRequired() { + if checker.RebootRequired() { rebootRequiredGauge.WithLabelValues(nodeID).Set(1) } else { rebootRequiredGauge.WithLabelValues(nodeID).Set(0) @@ -594,7 +582,7 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. // And (2) check if we previously annotated the node that it was in the process of being rebooted, // And finally (3) if it has that annotation, to delete it. // This indicates to other node tools running on the cluster that this node may be a candidate for maintenance - if annotateNodes && !checker.CheckRebootRequired() { + if annotateNodes && !checker.RebootRequired() { if _, ok := node.Annotations[KuredRebootInProgressAnnotation]; ok { err := deleteNodeAnnotation(client, nodeID, KuredRebootInProgressAnnotation) if err != nil { @@ -617,7 +605,7 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. preferNoScheduleTaint := taints.New(client, nodeID, preferNoScheduleTaintName, v1.TaintEffectPreferNoSchedule) // Remove taint immediately during startup to quickly allow scheduling again. - if !checker.CheckRebootRequired() { + if !checker.RebootRequired() { preferNoScheduleTaint.Disable() } @@ -636,7 +624,7 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. continue } - if !checker.CheckRebootRequired() { + if !checker.RebootRequired() { log.Infof("Reboot not required") preferNoScheduleTaint.Disable() continue diff --git a/pkg/checkers/checker.go b/pkg/checkers/checker.go index 6a75b959d..f90c164d3 100644 --- a/pkg/checkers/checker.go +++ b/pkg/checkers/checker.go @@ -1,6 +1,7 @@ package checkers import ( + "fmt" "github.com/google/shlex" "github.com/kubereboot/kured/pkg/util" log "github.com/sirupsen/logrus" @@ -13,7 +14,7 @@ import ( // CheckRebootRequired method which returns a single boolean // clarifying whether a reboot is expected or not. type Checker interface { - CheckRebootRequired() bool + RebootRequired() bool } // FileRebootChecker is the default reboot checker. @@ -22,11 +23,21 @@ type FileRebootChecker struct { FilePath string } -// CheckRebootRequired checks the file presence +func NewRebootChecker(rebootSentinelCommand string, rebootSentinelFile string) (Checker, error) { + // An override of rebootSentinelCommand means a privileged command + if rebootSentinelCommand != "" { + log.Infof("Sentinel checker is (privileged) user provided command: %s", rebootSentinelCommand) + return NewCommandChecker(rebootSentinelCommand) + } + log.Infof("Sentinel checker is (unprivileged) testing for the presence of: %s", rebootSentinelFile) + return NewFileRebootChecker(rebootSentinelFile) +} + +// RebootRequired checks the file presence // needs refactoring to also return an error, instead of leaking it inside the code. // This needs refactoring to get rid of NewCommand // This needs refactoring to only contain file location, instead of CheckCommand -func (rc FileRebootChecker) CheckRebootRequired() bool { +func (rc FileRebootChecker) RebootRequired() bool { if _, err := os.Stat(rc.FilePath); err == nil { return true } @@ -35,10 +46,10 @@ func (rc FileRebootChecker) CheckRebootRequired() bool { // NewFileRebootChecker is the constructor for the file based reboot checker // TODO: Add extra input validation on filePath string here -func NewFileRebootChecker(filePath string) *FileRebootChecker { +func NewFileRebootChecker(filePath string) (*FileRebootChecker, error) { return &FileRebootChecker{ FilePath: filePath, - } + }, nil } // CommandChecker is using a custom command to check @@ -51,10 +62,10 @@ type CommandChecker struct { Privileged bool } -// CheckRebootRequired for CommandChecker runs a command without returning +// RebootRequired for CommandChecker runs a command without returning // any eventual error. THis should be later refactored to remove the util wrapper // and return the errors, instead of logging them here. -func (rc CommandChecker) CheckRebootRequired() bool { +func (rc CommandChecker) RebootRequired() bool { var cmdline []string if rc.Privileged { cmdline = util.PrivilegedHostCommand(rc.NamespacePid, rc.CheckCommand) @@ -84,14 +95,14 @@ func (rc CommandChecker) CheckRebootRequired() bool { // NewCommandChecker is the constructor for the commandChecker, and by default // runs new commands in a privileged fashion. -func NewCommandChecker(sentinelCommand string) *CommandChecker { +func NewCommandChecker(sentinelCommand string) (*CommandChecker, error) { cmd, err := shlex.Split(sentinelCommand) if err != nil { - log.Fatalf("Error parsing provided sentinel command: %v", err) + return nil, fmt.Errorf("error parsing provided sentinel command: %v", err) } return &CommandChecker{ CheckCommand: cmd, NamespacePid: 1, Privileged: true, - } + }, nil } diff --git a/pkg/checkers/checker_test.go b/pkg/checkers/checker_test.go index 4df9870c1..683bddcd0 100644 --- a/pkg/checkers/checker_test.go +++ b/pkg/checkers/checker_test.go @@ -33,7 +33,7 @@ func Test_rebootRequired(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { a := CommandChecker{CheckCommand: tt.args.sentinelCommand, NamespacePid: 1, Privileged: false} - if got := a.CheckRebootRequired(); got != tt.want { + if got := a.RebootRequired(); got != tt.want { t.Errorf("rebootRequired() = %v, want %v", got, tt.want) } }) @@ -62,7 +62,7 @@ func Test_rebootRequired_fatals(t *testing.T) { for _, c := range cases { fatal = false a := CommandChecker{CheckCommand: c.param, NamespacePid: 1, Privileged: false} - a.CheckRebootRequired() + a.RebootRequired() assert.Equal(t, c.expectFatal, fatal) } diff --git a/pkg/reboot/command.go b/pkg/reboot/command.go index 07c17f238..74143028c 100644 --- a/pkg/reboot/command.go +++ b/pkg/reboot/command.go @@ -1,6 +1,7 @@ package reboot import ( + "fmt" "github.com/google/shlex" "github.com/kubereboot/kured/pkg/util" log "github.com/sirupsen/logrus" @@ -22,11 +23,14 @@ func (c CommandRebooter) Reboot() { // NewCommandRebooter is the constructor to create a CommandRebooter from a string not // yet shell lexed. You can skip this constructor if you parse the data correctly first // when instantiating a CommandRebooter instance. -func NewCommandRebooter(rebootCommand string) *CommandRebooter { +func NewCommandRebooter(rebootCommand string) (*CommandRebooter, error) { + if rebootCommand == "" { + return nil, fmt.Errorf("no reboot command specified") + } cmd, err := shlex.Split(rebootCommand) if err != nil { - log.Fatalf("Error parsing provided reboot command: %v", err) + return nil, fmt.Errorf("error %v when parsing reboot command %s", err, rebootCommand) } - return &CommandRebooter{RebootCommand: util.PrivilegedHostCommand(1, cmd)} + return &CommandRebooter{RebootCommand: util.PrivilegedHostCommand(1, cmd)}, nil } diff --git a/pkg/reboot/reboot.go b/pkg/reboot/reboot.go index 84ea93a89..171004d85 100644 --- a/pkg/reboot/reboot.go +++ b/pkg/reboot/reboot.go @@ -1,5 +1,10 @@ package reboot +import ( + "fmt" + log "github.com/sirupsen/logrus" +) + // Rebooter is the standard interface to use to execute // the reboot, after it has been considered as necessary. // The Reboot method does not expect any return, yet should @@ -7,3 +12,19 @@ package reboot type Rebooter interface { Reboot() } + +// NewRebooter validates the rebootMethod, rebootCommand, and rebootSignal input, +// then chains to the right constructor. This can be made internal later, as +// only the real rebooters constructors should be public (by opposition to this one) +func NewRebooter(rebootMethod string, rebootCommand string, rebootSignal int) (Rebooter, error) { + switch { + case rebootMethod == "command": + log.Infof("Reboot command: %s", rebootCommand) + return NewCommandRebooter(rebootCommand) + case rebootMethod == "signal": + log.Infof("Reboot signal: %d", rebootSignal) + return NewSignalRebooter(rebootSignal) + default: + return nil, fmt.Errorf("invalid reboot-method configured %s, expected signal or command", rebootMethod) + } +} diff --git a/pkg/reboot/signal.go b/pkg/reboot/signal.go index ab9adbdbb..ef2595f71 100644 --- a/pkg/reboot/signal.go +++ b/pkg/reboot/signal.go @@ -1,6 +1,7 @@ package reboot import ( + "fmt" "os" "syscall" @@ -31,6 +32,9 @@ func (c SignalRebooter) Reboot() { // NewSignalRebooter is the constructor which sets the signal number. // The constructor does not yet validate any input. It should be done in a later commit. -func NewSignalRebooter(sig int) *SignalRebooter { - return &SignalRebooter{Signal: sig} +func NewSignalRebooter(sig int) (*SignalRebooter, error) { + if sig < 1 { + return nil, fmt.Errorf("invalid signal: %v", sig) + } + return &SignalRebooter{Signal: sig}, nil } From 231888e58a94bbac4dd1aa122704c9a2db657dcc Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Sat, 19 Oct 2024 11:30:28 +0200 Subject: [PATCH 14/18] Use RegexpValue in plags This will remove double pointers, and be explicit about the type we are using. Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 7 +++---- cmd/kured/pflags.go | 18 +++++++----------- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 96b754130..1ff59d776 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -12,7 +12,6 @@ import ( "net/url" "os" "reflect" - "regexp" "sort" "strconv" "strings" @@ -60,7 +59,7 @@ var ( lockReleaseDelay time.Duration prometheusURL string preferNoScheduleTaintName string - alertFilter *regexp.Regexp + alertFilter regexpValue alertFilterMatchOnly bool alertFiringOnly bool rebootSentinelFile string @@ -150,7 +149,7 @@ func main() { "delay lock release for this duration (default: 0, disabled)") flag.StringVar(&prometheusURL, "prometheus-url", "", "Prometheus instance to probe for active alerts") - flag.Var(®expValue{&alertFilter}, "alert-filter-regexp", + flag.Var(&alertFilter, "alert-filter-regexp", "alert names to ignore when checking for active alerts") flag.BoolVar(&alertFilterMatchOnly, "alert-filter-match-only", false, "Only block if the alert-filter-regexp matches active alerts") @@ -655,7 +654,7 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. var blockCheckers []blockers.RebootBlocker if prometheusURL != "" { - blockCheckers = append(blockCheckers, blockers.PrometheusBlockingChecker{PromClient: promClient, Filter: alertFilter, FiringOnly: alertFiringOnly, FilterMatchOnly: alertFilterMatchOnly}) + blockCheckers = append(blockCheckers, blockers.PrometheusBlockingChecker{PromClient: promClient, Filter: alertFilter.Regexp, FiringOnly: alertFiringOnly, FilterMatchOnly: alertFilterMatchOnly}) } if podSelectors != nil { blockCheckers = append(blockCheckers, blockers.KubernetesBlockingChecker{Client: client, Nodename: nodeID, Filter: podSelectors}) diff --git a/cmd/kured/pflags.go b/cmd/kured/pflags.go index ffec3a704..a79d694b1 100644 --- a/cmd/kured/pflags.go +++ b/cmd/kured/pflags.go @@ -1,30 +1,26 @@ package main import ( - "fmt" "regexp" ) -// Custom flag type to handle *regexp.Regexp with pflag type regexpValue struct { - value **regexp.Regexp + *regexp.Regexp } -// String method to return the string representation of the value func (rev *regexpValue) String() string { - if *rev.value != nil { - return (*rev.value).String() + if rev.Regexp == nil { + return "" } - return "" + return rev.Regexp.String() } -// Set method to parse the input string and set the regexp value func (rev *regexpValue) Set(s string) error { - compiledRegexp, err := regexp.Compile(s) + value, err := regexp.Compile(s) if err != nil { - return fmt.Errorf("invalid regular expression: %w", err) + return err } - *rev.value = compiledRegexp + rev.Regexp = value return nil } From 67df0e935abe6eb1110576326b818ef33e919038 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Sat, 19 Oct 2024 13:40:17 +0200 Subject: [PATCH 15/18] Remove deprecated PollWithContext Replaced with PollUntilContextTimeout. Signed-off-by: Jean-Philippe Evrard --- pkg/daemonsetlock/daemonsetlock.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pkg/daemonsetlock/daemonsetlock.go b/pkg/daemonsetlock/daemonsetlock.go index f9386bb06..c0da3a59c 100644 --- a/pkg/daemonsetlock/daemonsetlock.go +++ b/pkg/daemonsetlock/daemonsetlock.go @@ -115,16 +115,14 @@ func New(client *kubernetes.Clientset, nodeID, namespace, name, annotation strin func (dsl *DaemonSetLock) GetDaemonSet(sleep, timeout time.Duration) (*v1.DaemonSet, error) { var ds *v1.DaemonSet var lastError error - err := wait.PollImmediate(sleep, timeout, func() (bool, error) { - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() + err := wait.PollUntilContextTimeout(context.Background(), sleep, timeout, true, func(ctx context.Context) (bool, error) { if ds, lastError = dsl.client.AppsV1().DaemonSets(dsl.namespace).Get(ctx, dsl.name, metav1.GetOptions{}); lastError != nil { return false, nil } return true, nil }) if err != nil { - return nil, fmt.Errorf("Timed out trying to get daemonset %s in namespace %s: %v", dsl.name, dsl.namespace, lastError) + return nil, fmt.Errorf("timed out trying to get daemonset %s in namespace %s: %v", dsl.name, dsl.namespace, lastError) } return ds, nil } From 626db871581aee0f7d5346d75e96b6d349ea08e6 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Sat, 12 Oct 2024 17:13:25 +0200 Subject: [PATCH 16/18] Add error to reboot interface Without this, impossible to bubble up errors to main Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 6 +++++- pkg/reboot/command.go | 5 +++-- pkg/reboot/reboot.go | 2 +- pkg/reboot/signal.go | 11 ++++------- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 1ff59d776..3c77c8bbb 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -710,7 +710,11 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. } } log.Infof("Triggering reboot for node %v", nodeID) - rebooter.Reboot() + + err = rebooter.Reboot() + if err != nil { + log.Fatalf("Unable to reboot node: %v", err) + } for { log.Infof("Waiting for reboot") time.Sleep(time.Minute) diff --git a/pkg/reboot/command.go b/pkg/reboot/command.go index 74143028c..6cfb61d75 100644 --- a/pkg/reboot/command.go +++ b/pkg/reboot/command.go @@ -13,11 +13,12 @@ type CommandRebooter struct { } // Reboot triggers the reboot command -func (c CommandRebooter) Reboot() { +func (c CommandRebooter) Reboot() error { log.Infof("Invoking command: %s", c.RebootCommand) if err := util.NewCommand(c.RebootCommand[0], c.RebootCommand[1:]...).Run(); err != nil { - log.Fatalf("Error invoking reboot command: %v", err) + return fmt.Errorf("error invoking reboot command %s: %v", c.RebootCommand, err) } + return nil } // NewCommandRebooter is the constructor to create a CommandRebooter from a string not diff --git a/pkg/reboot/reboot.go b/pkg/reboot/reboot.go index 171004d85..9ce65d2de 100644 --- a/pkg/reboot/reboot.go +++ b/pkg/reboot/reboot.go @@ -10,7 +10,7 @@ import ( // The Reboot method does not expect any return, yet should // most likely be refactored in the future to return an error type Rebooter interface { - Reboot() + Reboot() error } // NewRebooter validates the rebootMethod, rebootCommand, and rebootSignal input, diff --git a/pkg/reboot/signal.go b/pkg/reboot/signal.go index ef2595f71..92b7e0418 100644 --- a/pkg/reboot/signal.go +++ b/pkg/reboot/signal.go @@ -4,8 +4,6 @@ import ( "fmt" "os" "syscall" - - log "github.com/sirupsen/logrus" ) // SignalRebooter holds context-information for a signal reboot. @@ -14,20 +12,19 @@ type SignalRebooter struct { } // Reboot triggers the reboot signal -func (c SignalRebooter) Reboot() { - log.Infof("Invoking signal: %v", c.Signal) - +func (c SignalRebooter) Reboot() error { process, err := os.FindProcess(1) if err != nil { - log.Fatalf("Not running on Unix: %v", err) + return fmt.Errorf("not running on Unix: %v", err) } err = process.Signal(syscall.Signal(c.Signal)) // Either PID does not exist, or the signal does not work. Hoping for // a decent enough error. if err != nil { - log.Fatalf("Signal of SIGRTMIN+5 failed: %v", err) + return fmt.Errorf("signal of SIGRTMIN+5 failed: %v", err) } + return nil } // NewSignalRebooter is the constructor which sets the signal number. From 73f00ce44585687ed518afb3278caae5e2782b97 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Sat, 19 Oct 2024 14:19:20 +0200 Subject: [PATCH 17/18] Make all the internal validations ... internal The main is doing flag validation through pflags, then did further validation by involving the constructors. With the recent refactor on the commit "Refactor constructors" in this branch, we moved away from that pattern. However, it means we reintroduced a log dependency into our external API, and the external API now had extra validations regardless of the type. This is unnecessary, so I moved away from that pattern, and moved back all the validation into a central place, internal, which is only doing what kured would desire, without exposing it to users. The users could still theoretically use the proper constructors for each type, as they would validate just fine. The only thing they would lose is the kured internal decision of validation/precedence. Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 9 +++++---- internal/validators.go | 35 +++++++++++++++++++++++++++++++++++ pkg/checkers/checker.go | 10 ---------- pkg/reboot/reboot.go | 21 --------------------- 4 files changed, 40 insertions(+), 35 deletions(-) create mode 100644 internal/validators.go diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 3c77c8bbb..874883047 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "fmt" + "github.com/kubereboot/kured/internal" "github.com/kubereboot/kured/pkg/blockers" "github.com/kubereboot/kured/pkg/checkers" "github.com/prometheus/client_golang/prometheus/promhttp" @@ -17,7 +18,9 @@ import ( "strings" "time" + "github.com/containrrr/shoutrrr" papi "github.com/prometheus/client_golang/api" + "github.com/prometheus/client_golang/prometheus" log "github.com/sirupsen/logrus" flag "github.com/spf13/pflag" v1 "k8s.io/api/core/v1" @@ -27,14 +30,12 @@ import ( "k8s.io/client-go/rest" kubectldrain "k8s.io/kubectl/pkg/drain" - shoutrrr "github.com/containrrr/shoutrrr" "github.com/kubereboot/kured/pkg/alerts" "github.com/kubereboot/kured/pkg/daemonsetlock" "github.com/kubereboot/kured/pkg/delaytick" "github.com/kubereboot/kured/pkg/reboot" "github.com/kubereboot/kured/pkg/taints" "github.com/kubereboot/kured/pkg/timewindow" - "github.com/prometheus/client_golang/prometheus" ) var ( @@ -243,12 +244,12 @@ func main() { log.Infof("Reboot schedule: %v", window) log.Infof("Reboot method: %s", rebootMethod) - rebooter, err := reboot.NewRebooter(rebootMethod, rebootCommand, rebootSignal) + rebooter, err := internal.NewRebooter(rebootMethod, rebootCommand, rebootSignal) if err != nil { log.Fatalf("Failed to build rebooter: %v", err) } - rebootChecker, err := checkers.NewRebootChecker(rebootSentinelCommand, rebootSentinelFile) + rebootChecker, err := internal.NewRebootChecker(rebootSentinelCommand, rebootSentinelFile) if err != nil { log.Fatalf("Failed to build reboot checker: %v", err) } diff --git a/internal/validators.go b/internal/validators.go new file mode 100644 index 000000000..59251e124 --- /dev/null +++ b/internal/validators.go @@ -0,0 +1,35 @@ +package internal + +import ( + "fmt" + "github.com/kubereboot/kured/pkg/checkers" + "github.com/kubereboot/kured/pkg/reboot" + log "github.com/sirupsen/logrus" +) + +// NewRebooter validates the rebootMethod, rebootCommand, and rebootSignal input, +// then chains to the right constructor. +func NewRebooter(rebootMethod string, rebootCommand string, rebootSignal int) (reboot.Rebooter, error) { + switch { + case rebootMethod == "command": + log.Infof("Reboot command: %s", rebootCommand) + return reboot.NewCommandRebooter(rebootCommand) + case rebootMethod == "signal": + log.Infof("Reboot signal: %d", rebootSignal) + return reboot.NewSignalRebooter(rebootSignal) + default: + return nil, fmt.Errorf("invalid reboot-method configured %s, expected signal or command", rebootMethod) + } +} + +// NewRebootChecker validates the rebootSentinelCommand, rebootSentinelFile input, +// then chains to the right constructor. +func NewRebootChecker(rebootSentinelCommand string, rebootSentinelFile string) (checkers.Checker, error) { + // An override of rebootSentinelCommand means a privileged command + if rebootSentinelCommand != "" { + log.Infof("Sentinel checker is (privileged) user provided command: %s", rebootSentinelCommand) + return checkers.NewCommandChecker(rebootSentinelCommand) + } + log.Infof("Sentinel checker is (unprivileged) testing for the presence of: %s", rebootSentinelFile) + return checkers.NewFileRebootChecker(rebootSentinelFile) +} diff --git a/pkg/checkers/checker.go b/pkg/checkers/checker.go index f90c164d3..d76fb0088 100644 --- a/pkg/checkers/checker.go +++ b/pkg/checkers/checker.go @@ -23,16 +23,6 @@ type FileRebootChecker struct { FilePath string } -func NewRebootChecker(rebootSentinelCommand string, rebootSentinelFile string) (Checker, error) { - // An override of rebootSentinelCommand means a privileged command - if rebootSentinelCommand != "" { - log.Infof("Sentinel checker is (privileged) user provided command: %s", rebootSentinelCommand) - return NewCommandChecker(rebootSentinelCommand) - } - log.Infof("Sentinel checker is (unprivileged) testing for the presence of: %s", rebootSentinelFile) - return NewFileRebootChecker(rebootSentinelFile) -} - // RebootRequired checks the file presence // needs refactoring to also return an error, instead of leaking it inside the code. // This needs refactoring to get rid of NewCommand diff --git a/pkg/reboot/reboot.go b/pkg/reboot/reboot.go index 9ce65d2de..bfcf2036a 100644 --- a/pkg/reboot/reboot.go +++ b/pkg/reboot/reboot.go @@ -1,10 +1,5 @@ package reboot -import ( - "fmt" - log "github.com/sirupsen/logrus" -) - // Rebooter is the standard interface to use to execute // the reboot, after it has been considered as necessary. // The Reboot method does not expect any return, yet should @@ -12,19 +7,3 @@ import ( type Rebooter interface { Reboot() error } - -// NewRebooter validates the rebootMethod, rebootCommand, and rebootSignal input, -// then chains to the right constructor. This can be made internal later, as -// only the real rebooters constructors should be public (by opposition to this one) -func NewRebooter(rebootMethod string, rebootCommand string, rebootSignal int) (Rebooter, error) { - switch { - case rebootMethod == "command": - log.Infof("Reboot command: %s", rebootCommand) - return NewCommandRebooter(rebootCommand) - case rebootMethod == "signal": - log.Infof("Reboot signal: %d", rebootSignal) - return NewSignalRebooter(rebootSignal) - default: - return nil, fmt.Errorf("invalid reboot-method configured %s, expected signal or command", rebootMethod) - } -} From f559a953044e4e1e624892049af1b301225ef857 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Evrard Date: Fri, 18 Oct 2024 22:11:35 +0200 Subject: [PATCH 18/18] Make the internal blockers implementation internal This at the same time, removes the alert public package. Alert was only used inside prometheus blocker, so it allows to simplify the code. Signed-off-by: Jean-Philippe Evrard --- cmd/kured/main.go | 30 ++--- pkg/alerts/prometheus.go | 77 ------------- pkg/blockers/blockers.go | 84 -------------- pkg/blockers/blockers_test.go | 4 +- pkg/blockers/kubernetespod.go | 61 ++++++++++ pkg/blockers/prometheus.go | 118 ++++++++++++++++++++ pkg/{alerts => blockers}/prometheus_test.go | 9 +- 7 files changed, 194 insertions(+), 189 deletions(-) delete mode 100644 pkg/alerts/prometheus.go create mode 100644 pkg/blockers/kubernetespod.go create mode 100644 pkg/blockers/prometheus.go rename pkg/{alerts => blockers}/prometheus_test.go (96%) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 874883047..c7bf94985 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -4,10 +4,6 @@ import ( "context" "encoding/json" "fmt" - "github.com/kubereboot/kured/internal" - "github.com/kubereboot/kured/pkg/blockers" - "github.com/kubereboot/kured/pkg/checkers" - "github.com/prometheus/client_golang/prometheus/promhttp" "math/rand" "net/http" "net/url" @@ -19,8 +15,17 @@ import ( "time" "github.com/containrrr/shoutrrr" + "github.com/kubereboot/kured/internal" + "github.com/kubereboot/kured/pkg/blockers" + "github.com/kubereboot/kured/pkg/checkers" + "github.com/kubereboot/kured/pkg/daemonsetlock" + "github.com/kubereboot/kured/pkg/delaytick" + "github.com/kubereboot/kured/pkg/reboot" + "github.com/kubereboot/kured/pkg/taints" + "github.com/kubereboot/kured/pkg/timewindow" papi "github.com/prometheus/client_golang/api" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" log "github.com/sirupsen/logrus" flag "github.com/spf13/pflag" v1 "k8s.io/api/core/v1" @@ -29,13 +34,6 @@ import ( "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" kubectldrain "k8s.io/kubectl/pkg/drain" - - "github.com/kubereboot/kured/pkg/alerts" - "github.com/kubereboot/kured/pkg/daemonsetlock" - "github.com/kubereboot/kured/pkg/delaytick" - "github.com/kubereboot/kured/pkg/reboot" - "github.com/kubereboot/kured/pkg/taints" - "github.com/kubereboot/kured/pkg/timewindow" ) var ( @@ -609,12 +607,6 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. preferNoScheduleTaint.Disable() } - // instantiate prometheus client - promClient, err := alerts.NewPromClient(papi.Config{Address: prometheusURL}) - if err != nil { - log.Fatal("Unable to create prometheus client: ", err) - } - source = rand.NewSource(time.Now().UnixNano()) tick = delaytick.New(source, period) for range tick { @@ -655,10 +647,10 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers. var blockCheckers []blockers.RebootBlocker if prometheusURL != "" { - blockCheckers = append(blockCheckers, blockers.PrometheusBlockingChecker{PromClient: promClient, Filter: alertFilter.Regexp, FiringOnly: alertFiringOnly, FilterMatchOnly: alertFilterMatchOnly}) + blockCheckers = append(blockCheckers, blockers.NewPrometheusBlockingChecker(papi.Config{Address: prometheusURL}, alertFilter.Regexp, alertFiringOnly, alertFilterMatchOnly)) } if podSelectors != nil { - blockCheckers = append(blockCheckers, blockers.KubernetesBlockingChecker{Client: client, Nodename: nodeID, Filter: podSelectors}) + blockCheckers = append(blockCheckers, blockers.NewKubernetesBlockingChecker(client, nodeID, podSelectors)) } var rebootRequiredBlockCondition string diff --git a/pkg/alerts/prometheus.go b/pkg/alerts/prometheus.go deleted file mode 100644 index ea1457456..000000000 --- a/pkg/alerts/prometheus.go +++ /dev/null @@ -1,77 +0,0 @@ -package alerts - -import ( - "context" - "fmt" - "regexp" - "sort" - "time" - - papi "github.com/prometheus/client_golang/api" - v1 "github.com/prometheus/client_golang/api/prometheus/v1" - "github.com/prometheus/common/model" -) - -// PromClient is a wrapper around the Prometheus Client interface and implements the api -// This way, the PromClient can be instantiated with the configuration the Client needs, and -// the ability to use the methods the api has, like Query and so on. -type PromClient struct { - papi papi.Client - api v1.API -} - -// NewPromClient creates a new client to the Prometheus API. -// It returns an error on any problem. -func NewPromClient(conf papi.Config) (*PromClient, error) { - promClient, err := papi.NewClient(conf) - if err != nil { - return nil, err - } - client := PromClient{papi: promClient, api: v1.NewAPI(promClient)} - return &client, nil -} - -// ActiveAlerts is a method of type PromClient, it returns a list of names of active alerts -// (e.g. pending or firing), filtered by the supplied regexp or by the includeLabels query. -// filter by regexp means when the regex finds the alert-name; the alert is exluded from the -// block-list and will NOT block rebooting. query by includeLabel means, -// if the query finds an alert, it will include it to the block-list and it WILL block rebooting. -func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly, filterMatchOnly bool) ([]string, error) { - - // get all alerts from prometheus - value, _, err := p.api.Query(context.Background(), "ALERTS", time.Now()) - if err != nil { - return nil, err - } - - if value.Type() == model.ValVector { - if vector, ok := value.(model.Vector); ok { - activeAlertSet := make(map[string]bool) - for _, sample := range vector { - if alertName, isAlert := sample.Metric[model.AlertNameLabel]; isAlert && sample.Value != 0 { - if matchesRegex(filter, string(alertName), filterMatchOnly) && (!firingOnly || sample.Metric["alertstate"] == "firing") { - activeAlertSet[string(alertName)] = true - } - } - } - - var activeAlerts []string - for activeAlert := range activeAlertSet { - activeAlerts = append(activeAlerts, activeAlert) - } - sort.Strings(activeAlerts) - - return activeAlerts, nil - } - } - - return nil, fmt.Errorf("Unexpected value type: %v", value) -} - -func matchesRegex(filter *regexp.Regexp, alertName string, filterMatchOnly bool) bool { - if filter == nil { - return true - } - - return filter.MatchString(string(alertName)) == filterMatchOnly -} diff --git a/pkg/blockers/blockers.go b/pkg/blockers/blockers.go index e2dc15007..04383103b 100644 --- a/pkg/blockers/blockers.go +++ b/pkg/blockers/blockers.go @@ -1,15 +1,5 @@ package blockers -import ( - "context" - "fmt" - "github.com/kubereboot/kured/pkg/alerts" - log "github.com/sirupsen/logrus" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes" - "regexp" -) - // RebootBlocked checks that a single block Checker // will block the reboot or not. func RebootBlocked(blockers ...RebootBlocker) bool { @@ -26,77 +16,3 @@ func RebootBlocked(blockers ...RebootBlocker) bool { type RebootBlocker interface { IsBlocked() bool } - -// PrometheusBlockingChecker contains info for connecting -// to prometheus, and can give info about whether a reboot should be blocked -type PrometheusBlockingChecker struct { - // prometheusClient to make prometheus-go-client and api config available - // into the PrometheusBlockingChecker struct - PromClient *alerts.PromClient - // regexp used to get alerts - Filter *regexp.Regexp - // bool to indicate if only firing alerts should be considered - FiringOnly bool - // bool to indicate that we're only blocking on alerts which match the filter - FilterMatchOnly bool -} - -// KubernetesBlockingChecker contains info for connecting -// to k8s, and can give info about whether a reboot should be blocked -type KubernetesBlockingChecker struct { - // client used to contact kubernetes API - Client *kubernetes.Clientset - Nodename string - // lised used to filter pods (podSelector) - Filter []string -} - -// IsBlocked for the prometheus will check if there are active alerts matching -// the arguments given into promclient which would actively block the reboot. -// As of today, no blocker information is shared as a return of the method, -// and the information is simply logged. -func (pb PrometheusBlockingChecker) IsBlocked() bool { - alertNames, err := pb.PromClient.ActiveAlerts(pb.Filter, pb.FiringOnly, pb.FilterMatchOnly) - if err != nil { - log.Warnf("Reboot blocked: prometheus query error: %v", err) - return true - } - count := len(alertNames) - if count > 10 { - alertNames = append(alertNames[:10], "...") - } - if count > 0 { - log.Warnf("Reboot blocked: %d active alerts: %v", count, alertNames) - return true - } - return false -} - -// IsBlocked for the KubernetesBlockingChecker will check if a pod, for the node, is preventing -// the reboot. It will warn in the logs about blocking, but does not return an error. -func (kb KubernetesBlockingChecker) IsBlocked() bool { - fieldSelector := fmt.Sprintf("spec.nodeName=%s,status.phase!=Succeeded,status.phase!=Failed,status.phase!=Unknown", kb.Nodename) - for _, labelSelector := range kb.Filter { - podList, err := kb.Client.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{ - LabelSelector: labelSelector, - FieldSelector: fieldSelector, - Limit: 10}) - if err != nil { - log.Warnf("Reboot blocked: pod query error: %v", err) - return true - } - - if len(podList.Items) > 0 { - podNames := make([]string, 0, len(podList.Items)) - for _, pod := range podList.Items { - podNames = append(podNames, pod.Name) - } - if len(podList.Continue) > 0 { - podNames = append(podNames, "...") - } - log.Warnf("Reboot blocked: matching pods: %v", podNames) - return true - } - } - return false -} diff --git a/pkg/blockers/blockers_test.go b/pkg/blockers/blockers_test.go index 1eddf74a1..653e2281e 100644 --- a/pkg/blockers/blockers_test.go +++ b/pkg/blockers/blockers_test.go @@ -1,7 +1,6 @@ package blockers import ( - "github.com/kubereboot/kured/pkg/alerts" papi "github.com/prometheus/client_golang/api" "testing" ) @@ -20,8 +19,7 @@ func Test_rebootBlocked(t *testing.T) { blockingChecker := BlockingChecker{blocking: true} // Instantiate a prometheusClient with a broken_url - promClient, _ := alerts.NewPromClient(papi.Config{Address: "broken_url"}) - brokenPrometheusClient := PrometheusBlockingChecker{PromClient: promClient, Filter: nil, FiringOnly: false} + brokenPrometheusClient := NewPrometheusBlockingChecker(papi.Config{Address: "broken_url"}, nil, false, false) type args struct { blockers []RebootBlocker diff --git a/pkg/blockers/kubernetespod.go b/pkg/blockers/kubernetespod.go new file mode 100644 index 000000000..ebd034097 --- /dev/null +++ b/pkg/blockers/kubernetespod.go @@ -0,0 +1,61 @@ +package blockers + +import ( + "context" + "fmt" + log "github.com/sirupsen/logrus" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" +) + +// Compile-time checks to ensure the type implements the interface +var ( + _ RebootBlocker = (*KubernetesBlockingChecker)(nil) +) + +// KubernetesBlockingChecker contains info for connecting +// to k8s, and can give info about whether a reboot should be blocked +type KubernetesBlockingChecker struct { + // client used to contact kubernetes API + client *kubernetes.Clientset + nodeName string + // lised used to filter pods (podSelector) + filter []string +} + +func NewKubernetesBlockingChecker(client *kubernetes.Clientset, nodename string, podSelectors []string) *KubernetesBlockingChecker { + return &KubernetesBlockingChecker{ + client: client, + nodeName: nodename, + filter: podSelectors, + } +} + +// IsBlocked for the KubernetesBlockingChecker will check if a pod, for the node, is preventing +// the reboot. It will warn in the logs about blocking, but does not return an error. +func (kb KubernetesBlockingChecker) IsBlocked() bool { + fieldSelector := fmt.Sprintf("spec.nodeName=%s,status.phase!=Succeeded,status.phase!=Failed,status.phase!=Unknown", kb.nodeName) + for _, labelSelector := range kb.filter { + podList, err := kb.client.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{ + LabelSelector: labelSelector, + FieldSelector: fieldSelector, + Limit: 10}) + if err != nil { + log.Warnf("Reboot blocked: pod query error: %v", err) + return true + } + + if len(podList.Items) > 0 { + podNames := make([]string, 0, len(podList.Items)) + for _, pod := range podList.Items { + podNames = append(podNames, pod.Name) + } + if len(podList.Continue) > 0 { + podNames = append(podNames, "...") + } + log.Warnf("Reboot blocked: matching pods: %v", podNames) + return true + } + } + return false +} diff --git a/pkg/blockers/prometheus.go b/pkg/blockers/prometheus.go new file mode 100644 index 000000000..05c1bf136 --- /dev/null +++ b/pkg/blockers/prometheus.go @@ -0,0 +1,118 @@ +package blockers + +import ( + "context" + "fmt" + papi "github.com/prometheus/client_golang/api" + v1 "github.com/prometheus/client_golang/api/prometheus/v1" + "github.com/prometheus/common/model" + log "github.com/sirupsen/logrus" + "regexp" + "sort" + "time" +) + +// Compile-time checks to ensure the type implements the interface +var ( + _ RebootBlocker = (*PrometheusBlockingChecker)(nil) +) + +// PrometheusBlockingChecker contains info for connecting +// to prometheus, and can give info about whether a reboot should be blocked +type PrometheusBlockingChecker struct { + promConfig papi.Config + // regexp used to get alerts + filter *regexp.Regexp + // bool to indicate if only firing alerts should be considered + firingOnly bool + // bool to indicate that we're only blocking on alerts which match the filter + filterMatchOnly bool + // storing the promClient + promClient papi.Client +} + +func NewPrometheusBlockingChecker(config papi.Config, alertFilter *regexp.Regexp, firingOnly bool, filterMatchOnly bool) PrometheusBlockingChecker { + promClient, _ := papi.NewClient(config) + + return PrometheusBlockingChecker{ + promConfig: config, + filter: alertFilter, + firingOnly: firingOnly, + filterMatchOnly: filterMatchOnly, + promClient: promClient, + } +} + +// IsBlocked for the prometheus will check if there are active alerts matching +// the arguments given into the PrometheusBlockingChecker which would actively +// block the reboot. +// As of today, no blocker information is shared as a return of the method, +// and the information is simply logged. +func (pb PrometheusBlockingChecker) IsBlocked() bool { + alertNames, err := pb.ActiveAlerts() + if err != nil { + log.Warnf("Reboot blocked: prometheus query error: %v", err) + return true + } + count := len(alertNames) + if count > 10 { + alertNames = append(alertNames[:10], "...") + } + if count > 0 { + log.Warnf("Reboot blocked: %d active alerts: %v", count, alertNames) + return true + } + return false +} + +// MetricLabel is used to give a fancier name +// than the type to the label for rebootBlockedCounter +func (pb PrometheusBlockingChecker) MetricLabel() string { + return "prometheus" +} + +// ActiveAlerts is a method of type promClient, it returns a list of names of active alerts +// (e.g. pending or firing), filtered by the supplied regexp or by the includeLabels query. +// filter by regexp means when the regexp finds the alert-name; the alert is excluded from the +// block-list and will NOT block rebooting. query by includeLabel means, +// if the query finds an alert, it will include it to the block-list, and it WILL block rebooting. +func (pb PrometheusBlockingChecker) ActiveAlerts() ([]string, error) { + api := v1.NewAPI(pb.promClient) + + // get all alerts from prometheus + value, _, err := api.Query(context.Background(), "ALERTS", time.Now()) + if err != nil { + return nil, err + } + + if value.Type() == model.ValVector { + if vector, ok := value.(model.Vector); ok { + activeAlertSet := make(map[string]bool) + for _, sample := range vector { + if alertName, isAlert := sample.Metric[model.AlertNameLabel]; isAlert && sample.Value != 0 { + if matchesRegex(pb.filter, string(alertName), pb.filterMatchOnly) && (!pb.firingOnly || sample.Metric["alertstate"] == "firing") { + activeAlertSet[string(alertName)] = true + } + } + } + + var activeAlerts []string + for activeAlert := range activeAlertSet { + activeAlerts = append(activeAlerts, activeAlert) + } + sort.Strings(activeAlerts) + + return activeAlerts, nil + } + } + + return nil, fmt.Errorf("unexpected value type %v", value) +} + +func matchesRegex(filter *regexp.Regexp, alertName string, filterMatchOnly bool) bool { + if filter == nil { + return true + } + + return filter.MatchString(alertName) == filterMatchOnly +} diff --git a/pkg/alerts/prometheus_test.go b/pkg/blockers/prometheus_test.go similarity index 96% rename from pkg/alerts/prometheus_test.go rename to pkg/blockers/prometheus_test.go index d2a224f34..f361aafe5 100644 --- a/pkg/alerts/prometheus_test.go +++ b/pkg/blockers/prometheus_test.go @@ -1,4 +1,4 @@ -package alerts +package blockers import ( "log" @@ -145,12 +145,9 @@ func TestActiveAlerts(t *testing.T) { regex, _ := regexp.Compile(tc.rFilter) // instantiate the prometheus client with the mockserver-address - p, err := NewPromClient(api.Config{Address: mockServer.URL}) - if err != nil { - log.Fatal(err) - } + p := NewPrometheusBlockingChecker(api.Config{Address: mockServer.URL}, regex, tc.firingOnly, tc.filterMatchOnly) - result, err := p.ActiveAlerts(regex, tc.firingOnly, tc.filterMatchOnly) + result, err := p.ActiveAlerts() if err != nil { log.Fatal(err) }