diff --git a/.github/workflows/on-pr.yaml b/.github/workflows/on-pr.yaml index 8416fc836..7bc22b669 100644 --- a/.github/workflows/on-pr.yaml +++ b/.github/workflows/on-pr.yaml @@ -103,7 +103,8 @@ jobs: # - Ensure manifests work with the latest versions even with no manifest change # (compared to helm charts, manifests cannot easily template changes based on versions) # Helm charts are _trailing_ releases, while manifests are done during development. - e2e-manifests: + # This test uses the "command" reboot-method. + e2e-manifests-command: name: End-to-End test with kured with code and manifests from HEAD runs-on: ubuntu-latest strategy: @@ -179,3 +180,90 @@ jobs: DEBUG: true run: | ./tests/kind/follow-coordinated-reboot.sh + + + + # This ensures the latest code works with the manifests built from tree. + # It is useful for two things: + # - Test manifests changes (obviously), ensuring they don't break existing clusters + # - Ensure manifests work with the latest versions even with no manifest change + # (compared to helm charts, manifests cannot easily template changes based on versions) + # Helm charts are _trailing_ releases, while manifests are done during development. + # This test uses the "signal" reboot-method. 
+ e2e-manifests-signal: + name: End-to-End test with kured with code and manifests from HEAD (signal reboot-method) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + kubernetes: + - "1.25" + - "1.26" + - "1.27" + steps: + - uses: actions/checkout@v3 + - name: Ensure go version + uses: actions/setup-go@v4 + with: + go-version-file: 'go.mod' + check-latest: true + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + - name: Setup GoReleaser + run: make bootstrap-tools + - name: Find current tag version + run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT + id: tags + - name: Build artifacts + run: | + VERSION="${{ steps.tags.outputs.sha_short }}" make image + VERSION="${{ steps.tags.outputs.sha_short }}" make manifest + + - name: Workaround "Failed to attach 1 to compat systemd cgroup /actions_job/..." on gh actions + run: | + sudo bash << EOF + cp /etc/docker/daemon.json /etc/docker/daemon.json.old + echo '{}' > /etc/docker/daemon.json + systemctl restart docker || journalctl --no-pager -n 500 + systemctl status docker + EOF + + # Default name for helm/kind-action kind clusters is "chart-testing" + - name: Create kind cluster with 5 nodes + uses: helm/kind-action@v1.8.0 + with: + config: .github/kind-cluster-${{ matrix.kubernetes }}.yaml + version: v0.14.0 + + - name: Preload previously built images onto kind cluster + run: kind load docker-image ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }} --name chart-testing + + - name: Do not wait for an hour before detecting the rebootSentinel + run: | + sed -i 's/#\(.*\)--period=1h/\1--period=30s/g' kured-ds-signal.yaml + + - name: Install kured with kubectl + run: | + kubectl apply -f kured-rbac.yaml && kubectl apply -f kured-ds-signal.yaml + + - name: Ensure kured is ready + uses: nick-invision/retry@v2.8.3 + with: + timeout_minutes: 10 + max_attempts: 10 + retry_wait_seconds: 60 + # DESIRED CURRENT READY 
UP-TO-DATE AVAILABLE should all be = to cluster_size + command: "kubectl get ds -n kube-system kured | grep -E 'kured.*5.*5.*5.*5.*5'" + + - name: Create reboot sentinel files + run: | + ./tests/kind/create-reboot-sentinels.sh + + - name: Follow reboot until success + env: + DEBUG: true + run: | + ./tests/kind/follow-coordinated-reboot.sh + diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 42d941867..d01c95edf 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -74,6 +74,7 @@ var ( messageTemplateUncordon string podSelectors []string rebootCommand string + rebootSignal int logFormat string preRebootNodeLabels []string postRebootNodeLabels []string @@ -174,6 +175,8 @@ func NewRootCommand() *cobra.Command { "command for which a zero return code will trigger a reboot command") rootCmd.PersistentFlags().StringVar(&rebootCommand, "reboot-command", "/bin/systemctl reboot", "command to run when a reboot is required") + rootCmd.PersistentFlags().IntVar(&rebootSignal, "reboot-signal", 34+5, + "signal to use for reboot, SIGRTMIN+5 by default.") rootCmd.PersistentFlags().StringVar(&slackHookURL, "slack-hook-url", "", "slack hook URL for reboot notifications [deprecated in favor of --notify-url]") @@ -522,7 +525,7 @@ func invokeReboot(nodeID string, rebootCommand []string) { if rebootMethod == MethodCommand { booter = reboot.NewCommandReboot(nodeID, rebootCommand) } else if rebootMethod == MethodSignal { - booter = reboot.NewSignalReboot(nodeID) + booter = reboot.NewSignalReboot(nodeID, rebootSignal) } else { log.Fatalf("Invalid reboot-method configured: %s", rebootMethod) } @@ -829,8 +832,10 @@ func root(cmd *cobra.Command, args []string) { log.Infof("Reboot schedule: %v", window) log.Infof("Reboot check command: %s every %v", sentinelCommand, period) log.Infof("Reboot method: %s", rebootMethod) - if rebootCommand == MethodSignal { + if rebootMethod == MethodCommand { log.Infof("Reboot command: %s", restartCommand) + } else { + log.Infof("Reboot signal: %v", 
rebootSignal) } if annotateNodes { diff --git a/kured-ds-signal.yaml b/kured-ds-signal.yaml new file mode 100644 index 000000000..9fe000cad --- /dev/null +++ b/kured-ds-signal.yaml @@ -0,0 +1,100 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kured + namespace: kube-system +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: kured # Must match `--ds-name` + namespace: kube-system # Must match `--ds-namespace` +spec: + selector: + matchLabels: + name: kured + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: kured + spec: + serviceAccountName: kured + tolerations: + - key: node-role.kubernetes.io/control-plane + effect: NoSchedule + - key: node-role.kubernetes.io/master + effect: NoSchedule + hostPID: true # Share the host PID namespace so the reboot signal reaches the host's PID 1 + restartPolicy: Always + volumes: + - name: sentinel + hostPath: + path: /var/run + type: Directory + containers: + - name: kured + # If you find yourself here wondering why there is no + # :latest tag on Docker Hub, see the FAQ in the README + image: ghcr.io/kubereboot/kured:1.13.2 + imagePullPolicy: IfNotPresent + securityContext: + privileged: false # Signal reboot needs only CAP_KILL (added below), not a privileged container + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + capabilities: + drop: ["*"] + add: ["CAP_KILL"] + ports: + - containerPort: 8080 + name: metrics + env: + # Pass in the name of the node on which this pod is scheduled + # for use with drain/uncordon operations and lock acquisition + - name: KURED_NODE_ID + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - mountPath: /sentinel + name: sentinel + readOnly: true + command: + - /usr/bin/kured + - --reboot-sentinel=/sentinel/reboot-required + - --reboot-method=signal +# - --reboot-signal=39 +# - --force-reboot=false +# - --drain-grace-period=-1 +# - --skip-wait-for-delete-timeout=0 +# - --drain-timeout=0 +# - --period=1h +# - --ds-namespace=kube-system +# - --ds-name=kured +# - 
--lock-annotation=weave.works/kured-node-lock +# - --lock-ttl=0 +# - --prometheus-url=http://prometheus.monitoring.svc.cluster.local +# - --alert-filter-regexp=^RebootRequired$ +# - --alert-firing-only=false +# - --prefer-no-schedule-taint="" +# - --reboot-sentinel-command="" +# - --slack-hook-url=https://hooks.slack.com/... +# - --slack-username=prod +# - --slack-channel=alerting +# - --notify-url="" # See also shoutrrr url format +# - --message-template-drain=Draining node %s +# - --message-template-reboot=Rebooting node %s +# - --message-template-uncordon=Node %s rebooted & uncordoned successfully! +# - --blocking-pod-selector=runtime=long,cost=expensive +# - --blocking-pod-selector=name=temperamental +# - --blocking-pod-selector=... +# - --reboot-days=sun,mon,tue,wed,thu,fri,sat +# - --reboot-delay=90s +# - --start-time=0:00 +# - --end-time=23:59:59 +# - --time-zone=UTC +# - --annotate-nodes=false +# - --lock-release-delay=30m +# - --log-format=text diff --git a/kured-ds.yaml b/kured-ds.yaml index d0ee4acdb..e4362fe55 100644 --- a/kured-ds.yaml +++ b/kured-ds.yaml @@ -33,6 +33,7 @@ spec: - name: sentinel hostPath: path: /var/run + type: Directory containers: - name: kured # If you find yourself here wondering why there is no @@ -73,6 +74,8 @@ spec: # - --alert-firing-only=false # - --prefer-no-schedule-taint="" # - --reboot-sentinel-command="" +# - --reboot-method=command +# - --reboot-signal=39 # - --slack-hook-url=https://hooks.slack.com/... # - --slack-username=prod # - --slack-channel=alerting diff --git a/pkg/reboot/command.go b/pkg/reboot/command.go index 9d38453a7..8589be49d 100644 --- a/pkg/reboot/command.go +++ b/pkg/reboot/command.go @@ -5,16 +5,18 @@ import ( log "github.com/sirupsen/logrus" ) -type commandRebootMethod struct { +// CommandRebootMethod holds context-information for a command reboot. 
+type CommandRebootMethod struct { nodeID string rebootCommand []string } -func NewCommandReboot(nodeID string, rebootCommand []string) *commandRebootMethod { - return &commandRebootMethod{nodeID: nodeID, rebootCommand: rebootCommand} +// NewCommandReboot creates a new command-rebooter which needs full privileges on the host. +func NewCommandReboot(nodeID string, rebootCommand []string) *CommandRebootMethod { + return &CommandRebootMethod{nodeID: nodeID, rebootCommand: rebootCommand} } -func (c *commandRebootMethod) Reboot() { +func (c *CommandRebootMethod) Reboot() { log.Infof("Running command: %s for node: %s", c.rebootCommand, c.nodeID) if err := util.NewCommand(c.rebootCommand[0], c.rebootCommand[1:]...).Run(); err != nil { log.Fatalf("Error invoking reboot command: %v", err) diff --git a/pkg/reboot/reboot.go b/pkg/reboot/reboot.go index a7466cde6..83d788ecf 100644 --- a/pkg/reboot/reboot.go +++ b/pkg/reboot/reboot.go @@ -1,5 +1,6 @@ package reboot +// Reboot interface defines the Reboot function to be implemented. type Reboot interface { Reboot() } diff --git a/pkg/reboot/signal.go b/pkg/reboot/signal.go index 543bc7227..4f12fbc6e 100644 --- a/pkg/reboot/signal.go +++ b/pkg/reboot/signal.go @@ -7,15 +7,18 @@ import ( log "github.com/sirupsen/logrus" ) -type signalRebootMethod struct { +// SignalRebootMethod holds context-information for a signal reboot. +type SignalRebootMethod struct { nodeID string + signal int } -func NewSignalReboot(nodeID string) *signalRebootMethod { - return &signalRebootMethod{nodeID: nodeID} +// NewSignalReboot creates a new signal-rebooter which can run unprivileged. 
+func NewSignalReboot(nodeID string, signal int) *SignalRebootMethod { + return &SignalRebootMethod{nodeID: nodeID, signal: signal} } -func (c *signalRebootMethod) Reboot() { +func (c *SignalRebootMethod) Reboot() { log.Infof("Emit reboot-signal for node: %s", c.nodeID) process, err := os.FindProcess(1) @@ -23,7 +26,7 @@ func (c *signalRebootMethod) Reboot() { log.Fatalf("There was no systemd process found: %v", err) } - err = process.Signal(syscall.Signal(34 + 5)) // SIGRTMIN+5 + err = process.Signal(syscall.Signal(c.signal)) if err != nil { - log.Fatalf("Signal of SIGRTMIN+5 failed: %v", err) + log.Fatalf("Signal %d failed: %v", c.signal, err) } diff --git a/pkg/util/util.go b/pkg/util/util.go index 449c6f204..d32f9d1c2 100644 --- a/pkg/util/util.go +++ b/pkg/util/util.go @@ -6,7 +6,7 @@ import ( log "github.com/sirupsen/logrus" ) -// newCommand creates a new Command with stdout/stderr wired to our standard logger +// NewCommand creates a new Command with stdout/stderr wired to our standard logger func NewCommand(name string, arg ...string) *exec.Cmd { cmd := exec.Command(name, arg...) cmd.Stdout = log.NewEntry(log.StandardLogger()).