Skip to content

Commit

Permalink
fail fs-backup for windows nodes
Browse files Browse the repository at this point in the history
Signed-off-by: Lyndon-Li <[email protected]>
  • Loading branch information
Lyndon-Li committed Dec 18, 2024
1 parent 010fd1c commit 33efb69
Show file tree
Hide file tree
Showing 12 changed files with 326 additions and 31 deletions.
1 change: 1 addition & 0 deletions changelogs/unreleased/8518-Lyndon-Li
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
For issue #8424, disable fs-backup if the source/target pod is running in non-linux node
19 changes: 15 additions & 4 deletions pkg/cmd/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ import (
"github.com/vmware-tanzu/velero/pkg/restore"
"github.com/vmware-tanzu/velero/pkg/uploader"
"github.com/vmware-tanzu/velero/pkg/util/filesystem"
"github.com/vmware-tanzu/velero/pkg/util/kube"
"github.com/vmware-tanzu/velero/pkg/util/logging"
)

Expand Down Expand Up @@ -454,10 +455,20 @@ func (s *server) veleroResourcesExist() error {

func (s *server) checkNodeAgent() {
// warn if node agent does not exist
if err := nodeagent.IsRunning(s.ctx, s.kubeClient, s.namespace); err == nodeagent.ErrDaemonSetNotFound {
s.logger.Warn("Velero node agent not found; pod volume backups/restores will not work until it's created")
} else if err != nil {
s.logger.WithError(errors.WithStack(err)).Warn("Error checking for existence of velero node agent")
if kube.WithLinuxNode(s.ctx, s.crClient, s.logger) {
if err := nodeagent.IsRunningOnLinux(s.ctx, s.kubeClient, s.namespace); err == nodeagent.ErrDaemonSetNotFound {
s.logger.Warn("Velero node agent not found for linux nodes; pod volume backups/restores and data mover backups/restores will not work until it's created")
} else if err != nil {
s.logger.WithError(errors.WithStack(err)).Warn("Error checking for existence of velero node agent for linux nodes")
}

Check warning on line 463 in pkg/cmd/server/server.go

View check run for this annotation

Codecov / codecov/patch

pkg/cmd/server/server.go#L458-L463

Added lines #L458 - L463 were not covered by tests
}

if kube.WithWindowsNode(s.ctx, s.crClient, s.logger) {
if err := nodeagent.IsRunningOnWindows(s.ctx, s.kubeClient, s.namespace); err == nodeagent.ErrDaemonSetNotFound {
s.logger.Warn("Velero node agent not found for Windows nodes; pod volume backups/restores and data mover backups/restores will not work until it's created")
} else if err != nil {
s.logger.WithError(errors.WithStack(err)).Warn("Error checking for existence of velero node agent for Windows nodes")
}

Check warning on line 471 in pkg/cmd/server/server.go

View check run for this annotation

Codecov / codecov/patch

pkg/cmd/server/server.go#L466-L471

Added lines #L466 - L471 were not covered by tests
}
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/exposer/csi_snapshot_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1146,7 +1146,7 @@ func Test_csiSnapshotExposer_DiagnoseExpose(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Namespace: velerov1.DefaultNamespace,
Name: "node-agent-pod-1",
Labels: map[string]string{"name": "node-agent"},
Labels: map[string]string{"role": "node-agent"},
},
Spec: corev1.PodSpec{
NodeName: "fake-node",
Expand Down
2 changes: 1 addition & 1 deletion pkg/exposer/generic_restore_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,7 @@ func Test_ReastoreDiagnoseExpose(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Namespace: velerov1.DefaultNamespace,
Name: "node-agent-pod-1",
Labels: map[string]string{"name": "node-agent"},
Labels: map[string]string{"role": "node-agent"},
},
Spec: corev1.PodSpec{
NodeName: "fake-node",
Expand Down
25 changes: 19 additions & 6 deletions pkg/nodeagent/node_agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,14 @@ import (
)

const (
// daemonSet is the name of the Velero node agent daemonset.
// daemonSet is the name of the Velero node agent daemonset on linux nodes.
daemonSet = "node-agent"

// daemonsetWindows is the name of the Velero node agent daemonset on Windows nodes.
daemonsetWindows = "node-agent-windows"

// nodeAgentRole marks pods with node-agent role on all nodes.
nodeAgentRole = "node-agent"
)

var (
Expand Down Expand Up @@ -89,9 +95,16 @@ type Configs struct {
PodResources *kube.PodResources `json:"podResources,omitempty"`
}

// IsRunning checks if the node agent daemonset is running properly. If not, return the error found
func IsRunning(ctx context.Context, kubeClient kubernetes.Interface, namespace string) error {
if _, err := kubeClient.AppsV1().DaemonSets(namespace).Get(ctx, daemonSet, metav1.GetOptions{}); apierrors.IsNotFound(err) {
func IsRunningOnLinux(ctx context.Context, kubeClient kubernetes.Interface, namespace string) error {
return isRunning(ctx, kubeClient, namespace, daemonSet)

Check warning on line 99 in pkg/nodeagent/node_agent.go

View check run for this annotation

Codecov / codecov/patch

pkg/nodeagent/node_agent.go#L98-L99

Added lines #L98 - L99 were not covered by tests
}

func IsRunningOnWindows(ctx context.Context, kubeClient kubernetes.Interface, namespace string) error {
return isRunning(ctx, kubeClient, namespace, daemonsetWindows)

Check warning on line 103 in pkg/nodeagent/node_agent.go

View check run for this annotation

Codecov / codecov/patch

pkg/nodeagent/node_agent.go#L102-L103

Added lines #L102 - L103 were not covered by tests
}

func isRunning(ctx context.Context, kubeClient kubernetes.Interface, namespace string, daemonset string) error {
if _, err := kubeClient.AppsV1().DaemonSets(namespace).Get(ctx, daemonset, metav1.GetOptions{}); apierrors.IsNotFound(err) {
return ErrDaemonSetNotFound
} else if err != nil {
return err
Expand All @@ -116,7 +129,7 @@ func isRunningInNode(ctx context.Context, namespace string, nodeName string, crC
}

pods := new(v1.PodList)
parsedSelector, err := labels.Parse(fmt.Sprintf("name=%s", daemonSet))
parsedSelector, err := labels.Parse(fmt.Sprintf("role=%s", nodeAgentRole))
if err != nil {
return errors.Wrap(err, "fail to parse selector")
}
Expand All @@ -128,7 +141,7 @@ func isRunningInNode(ctx context.Context, namespace string, nodeName string, crC
}

if err != nil {
return errors.Wrap(err, "failed to list daemonset pods")
return errors.Wrap(err, "failed to list node-agent pods")

Check warning on line 144 in pkg/nodeagent/node_agent.go

View check run for this annotation

Codecov / codecov/patch

pkg/nodeagent/node_agent.go#L144

Added line #L144 was not covered by tests
}

for i := range pods.Items {
Expand Down
14 changes: 7 additions & 7 deletions pkg/nodeagent/node_agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ type reactor struct {
}

func TestIsRunning(t *testing.T) {
daemonSet := &appsv1.DaemonSet{
ds := &appsv1.DaemonSet{
ObjectMeta: metav1.ObjectMeta{
Namespace: "fake-ns",
Name: "node-agent",
Expand Down Expand Up @@ -80,7 +80,7 @@ func TestIsRunning(t *testing.T) {
name: "succeed",
namespace: "fake-ns",
kubeClientObj: []runtime.Object{
daemonSet,
ds,
},
},
}
Expand All @@ -93,7 +93,7 @@ func TestIsRunning(t *testing.T) {
fakeKubeClient.Fake.PrependReactor(reactor.verb, reactor.resource, reactor.reactorFunc)
}

err := IsRunning(context.TODO(), fakeKubeClient, test.namespace)
err := isRunning(context.TODO(), fakeKubeClient, test.namespace, daemonSet)
if test.expectErr == "" {
assert.NoError(t, err)
} else {
Expand All @@ -108,11 +108,11 @@ func TestIsRunningInNode(t *testing.T) {
corev1.AddToScheme(scheme)

nonNodeAgentPod := builder.ForPod("fake-ns", "fake-pod").Result()
nodeAgentPodNotRunning := builder.ForPod("fake-ns", "fake-pod").Labels(map[string]string{"name": "node-agent"}).Result()
nodeAgentPodRunning1 := builder.ForPod("fake-ns", "fake-pod-1").Labels(map[string]string{"name": "node-agent"}).Phase(corev1.PodRunning).Result()
nodeAgentPodRunning2 := builder.ForPod("fake-ns", "fake-pod-2").Labels(map[string]string{"name": "node-agent"}).Phase(corev1.PodRunning).Result()
nodeAgentPodNotRunning := builder.ForPod("fake-ns", "fake-pod").Labels(map[string]string{"role": "node-agent"}).Result()
nodeAgentPodRunning1 := builder.ForPod("fake-ns", "fake-pod-1").Labels(map[string]string{"role": "node-agent"}).Phase(corev1.PodRunning).Result()
nodeAgentPodRunning2 := builder.ForPod("fake-ns", "fake-pod-2").Labels(map[string]string{"role": "node-agent"}).Phase(corev1.PodRunning).Result()
nodeAgentPodRunning3 := builder.ForPod("fake-ns", "fake-pod-3").
Labels(map[string]string{"name": "node-agent"}).
Labels(map[string]string{"role": "node-agent"}).
Phase(corev1.PodRunning).
NodeName("fake-node").
Result()
Expand Down
6 changes: 6 additions & 0 deletions pkg/podvolume/backupper.go
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,12 @@ func (b *backupper) BackupPodVolumes(backup *velerov1api.Backup, pod *corev1api.
return nil, pvcSummary, nil
}

if err := kube.IsLinuxNode(b.ctx, pod.Spec.NodeName, b.crClient); err != nil {
err := errors.Wrapf(err, "Pod %s/%s is not running in linux node(%s), skip", pod.Namespace, pod.Name, pod.Spec.NodeName)
skipAllPodVolumes(pod, volumesToBackup, err, pvcSummary, log)
return nil, pvcSummary, []error{err}
}

Check warning on line 213 in pkg/podvolume/backupper.go

View check run for this annotation

Codecov / codecov/patch

pkg/podvolume/backupper.go#L210-L213

Added lines #L210 - L213 were not covered by tests

err := nodeagent.IsRunningInNode(b.ctx, backup.Namespace, pod.Spec.NodeName, b.crClient)
if err != nil {
skipAllPodVolumes(pod, volumesToBackup, err, pvcSummary, log)
Expand Down
46 changes: 42 additions & 4 deletions pkg/podvolume/backupper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ func createPodObj(running bool, withVolume bool, withVolumeMounted bool, volumeN

func createNodeAgentPodObj(running bool) *corev1api.Pod {
podObj := builder.ForPod(velerov1api.DefaultNamespace, "fake-node-agent").Result()
podObj.Labels = map[string]string{"name": "node-agent"}
podObj.Labels = map[string]string{"role": "node-agent"}

if running {
podObj.Status.Phase = corev1api.PodRunning
Expand Down Expand Up @@ -303,6 +303,14 @@ func createPVBObj(fail bool, withSnapshot bool, index int, uploaderType string)
return pvbObj
}

func createNodeObj() *corev1api.Node {
return builder.ForNode("fake-node-name").Labels(map[string]string{"kubernetes.io/os": "linux"}).Result()
}

func createWindowsNodeObj() *corev1api.Node {
return builder.ForNode("fake-node-name").Labels(map[string]string{"kubernetes.io/os": "windows"}).Result()
}

func TestBackupPodVolumes(t *testing.T) {
scheme := runtime.NewScheme()
velerov1api.AddToScheme(scheme)
Expand Down Expand Up @@ -358,13 +366,32 @@ func TestBackupPodVolumes(t *testing.T) {
uploaderType: "kopia",
bsl: "fake-bsl",
},
{
name: "pod is not running on Linux node",
volumes: []string{
"fake-volume-1",
"fake-volume-2",
},
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createWindowsNodeObj(),
},
sourcePod: createPodObj(false, false, false, 2),
uploaderType: "kopia",
errs: []string{
"Pod fake-ns/fake-pod is not running in linux node(fake-node-name), skip",
},
},
{
name: "node-agent pod is not running in node",
volumes: []string{
"fake-volume-1",
"fake-volume-2",
},
sourcePod: createPodObj(true, false, false, 2),
sourcePod: createPodObj(true, false, false, 2),
kubeClientObj: []runtime.Object{
createNodeObj(),
},
uploaderType: "kopia",
errs: []string{
"daemonset pod not found in running state in node fake-node-name",
Expand All @@ -379,6 +406,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, false, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
},
uploaderType: "kopia",
mockGetRepositoryType: true,
Expand All @@ -395,6 +423,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, false, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
},
uploaderType: "kopia",
errs: []string{
Expand All @@ -410,6 +439,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, false, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
},
ctlClientObj: []runtime.Object{
createBackupRepoObj(),
Expand All @@ -427,6 +457,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, true, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
},
ctlClientObj: []runtime.Object{
createBackupRepoObj(),
Expand All @@ -448,6 +479,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, true, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
createPVCObj(1),
createPVCObj(2),
},
Expand All @@ -471,6 +503,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, true, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
createPVCObj(1),
createPVCObj(2),
createPVObj(1, true),
Expand All @@ -482,6 +515,7 @@ func TestBackupPodVolumes(t *testing.T) {
runtimeScheme: scheme,
uploaderType: "kopia",
bsl: "fake-bsl",
errs: []string{},
},
{
name: "volume not mounted by pod should be skipped",
Expand All @@ -492,6 +526,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, true, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
createPVCObj(1),
createPVCObj(2),
createPVObj(1, false),
Expand All @@ -503,6 +538,7 @@ func TestBackupPodVolumes(t *testing.T) {
runtimeScheme: scheme,
uploaderType: "kopia",
bsl: "fake-bsl",
errs: []string{},
},
{
name: "return completed pvbs",
Expand All @@ -512,6 +548,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, true, true, 1),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
createPVCObj(1),
createPVObj(1, false),
},
Expand All @@ -522,6 +559,7 @@ func TestBackupPodVolumes(t *testing.T) {
uploaderType: "kopia",
bsl: "fake-bsl",
pvbs: 1,
errs: []string{},
},
}
// TODO add more verification around PVCBackupSummary returned by "BackupPodVolumes"
Expand Down Expand Up @@ -568,8 +606,8 @@ func TestBackupPodVolumes(t *testing.T) {

pvbs, _, errs := bp.BackupPodVolumes(backupObj, test.sourcePod, test.volumes, nil, velerotest.NewLogger())

if errs == nil {
assert.Nil(t, test.errs)
if test.errs == nil {
assert.NoError(t, err)
} else {
for i := 0; i < len(errs); i++ {
assert.EqualError(t, errs[i], test.errs[i])
Expand Down
8 changes: 7 additions & 1 deletion pkg/podvolume/restorer.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ func (r *restorer) RestorePodVolumes(data RestoreData, tracker *volume.RestoreVo
return nil
}

if err := nodeagent.IsRunning(r.ctx, r.kubeClient, data.Restore.Namespace); err != nil {
if err := nodeagent.IsRunningOnLinux(r.ctx, r.kubeClient, data.Restore.Namespace); err != nil {
return []error{errors.Wrapf(err, "error to check node agent status")}
}

Expand Down Expand Up @@ -213,6 +213,12 @@ func (r *restorer) RestorePodVolumes(data RestoreData, tracker *volume.RestoreVo
} else if err != nil {
r.log.WithError(err).Error("Failed to check node-agent pod status, disengage")
} else {
if err := kube.IsLinuxNode(checkCtx, nodeName, r.crClient); err != nil {
r.log.WithField("node", nodeName).WithError(err).Error("Restored pod is not running in linux node")
r.nodeAgentCheck <- errors.Wrapf(err, "restored pod %s/%s is not running in linux node(%s)", data.Pod.Namespace, data.Pod.Name, nodeName)
return
}

err = nodeagent.IsRunningInNode(checkCtx, data.Restore.Namespace, nodeName, r.crClient)
if err != nil {
r.log.WithField("node", nodeName).WithError(err).Error("node-agent pod is not running in node, abort the restore")
Expand Down
Loading

0 comments on commit 33efb69

Please sign in to comment.