Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fs-backup for clusters with windows nodes #8518

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelogs/unreleased/8518-Lyndon-Li
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Make fs-backup work on linux nodes with the new Velero deployment and disable fs-backup if the source/target pod is running in non-linux node (#8424)
2 changes: 2 additions & 0 deletions pkg/cmd/cli/install/install.go
Original file line number Diff line number Diff line change
Expand Up @@ -398,7 +398,9 @@
if _, err = install.NodeAgentIsReady(dynamicFactory, o.Namespace); err != nil {
return errors.Wrap(err, errorMsg)
}
}

if o.UseNodeAgentWindows {

Check warning on line 403 in pkg/cmd/cli/install/install.go

View check run for this annotation

Codecov / codecov/patch

pkg/cmd/cli/install/install.go#L403

Added line #L403 was not covered by tests
fmt.Println("Waiting for node-agent-windows daemonset to be ready.")
if _, err = install.NodeAgentWindowsIsReady(dynamicFactory, o.Namespace); err != nil {
return errors.Wrap(err, errorMsg)
Expand Down
19 changes: 15 additions & 4 deletions pkg/cmd/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
"github.com/vmware-tanzu/velero/pkg/restore"
"github.com/vmware-tanzu/velero/pkg/uploader"
"github.com/vmware-tanzu/velero/pkg/util/filesystem"
"github.com/vmware-tanzu/velero/pkg/util/kube"
"github.com/vmware-tanzu/velero/pkg/util/logging"
)

Expand Down Expand Up @@ -471,10 +472,20 @@

func (s *server) checkNodeAgent() {
// warn if node agent does not exist
if err := nodeagent.IsRunning(s.ctx, s.kubeClient, s.namespace); err == nodeagent.ErrDaemonSetNotFound {
s.logger.Warn("Velero node agent not found; pod volume backups/restores will not work until it's created")
} else if err != nil {
s.logger.WithError(errors.WithStack(err)).Warn("Error checking for existence of velero node agent")
if kube.WithLinuxNode(s.ctx, s.crClient, s.logger) {
if err := nodeagent.IsRunningOnLinux(s.ctx, s.kubeClient, s.namespace); err == nodeagent.ErrDaemonSetNotFound {
s.logger.Warn("Velero node agent not found for linux nodes; pod volume backups/restores and data mover backups/restores will not work until it's created")
} else if err != nil {
s.logger.WithError(errors.WithStack(err)).Warn("Error checking for existence of velero node agent for linux nodes")
}

Check warning on line 480 in pkg/cmd/server/server.go

View check run for this annotation

Codecov / codecov/patch

pkg/cmd/server/server.go#L475-L480

Added lines #L475 - L480 were not covered by tests
}

if kube.WithWindowsNode(s.ctx, s.crClient, s.logger) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to check this if the Windows node agent isn't enabled?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't need to do this.
This is a sanity check and needs to state the fact in all cases when Windows/linux nodes exist but fs-backup/data mover won't work in the nodes.

if err := nodeagent.IsRunningOnWindows(s.ctx, s.kubeClient, s.namespace); err == nodeagent.ErrDaemonSetNotFound {
s.logger.Warn("Velero node agent not found for Windows nodes; pod volume backups/restores and data mover backups/restores will not work until it's created")
} else if err != nil {
s.logger.WithError(errors.WithStack(err)).Warn("Error checking for existence of velero node agent for Windows nodes")
}

Check warning on line 488 in pkg/cmd/server/server.go

View check run for this annotation

Codecov / codecov/patch

pkg/cmd/server/server.go#L483-L488

Added lines #L483 - L488 were not covered by tests
}
}

Expand Down
18 changes: 14 additions & 4 deletions pkg/nodeagent/node_agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,12 @@
)

const (
	// daemonSet is the name of the Velero node agent daemonset on Linux nodes.
	daemonSet = "node-agent"

	// daemonsetWindows is the name of the Velero node agent daemonset on Windows nodes.
	daemonsetWindows = "node-agent-windows"

	// nodeAgentRole marks pods with node-agent role on all nodes.
	nodeAgentRole = "node-agent"
)
Expand Down Expand Up @@ -92,9 +95,16 @@
PodResources *kube.PodResources `json:"podResources,omitempty"`
}

// IsRunning checks if the node agent daemonset is running properly. If not, return the error found
func IsRunning(ctx context.Context, kubeClient kubernetes.Interface, namespace string) error {
if _, err := kubeClient.AppsV1().DaemonSets(namespace).Get(ctx, daemonSet, metav1.GetOptions{}); apierrors.IsNotFound(err) {
func IsRunningOnLinux(ctx context.Context, kubeClient kubernetes.Interface, namespace string) error {
return isRunning(ctx, kubeClient, namespace, daemonSet)

Check warning on line 99 in pkg/nodeagent/node_agent.go

View check run for this annotation

Codecov / codecov/patch

pkg/nodeagent/node_agent.go#L98-L99

Added lines #L98 - L99 were not covered by tests
}

func IsRunningOnWindows(ctx context.Context, kubeClient kubernetes.Interface, namespace string) error {
return isRunning(ctx, kubeClient, namespace, daemonsetWindows)

Check warning on line 103 in pkg/nodeagent/node_agent.go

View check run for this annotation

Codecov / codecov/patch

pkg/nodeagent/node_agent.go#L102-L103

Added lines #L102 - L103 were not covered by tests
}

func isRunning(ctx context.Context, kubeClient kubernetes.Interface, namespace string, daemonset string) error {
if _, err := kubeClient.AppsV1().DaemonSets(namespace).Get(ctx, daemonset, metav1.GetOptions{}); apierrors.IsNotFound(err) {
return ErrDaemonSetNotFound
} else if err != nil {
return err
Expand Down
6 changes: 3 additions & 3 deletions pkg/nodeagent/node_agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ type reactor struct {
}

func TestIsRunning(t *testing.T) {
daemonSet := &appsv1.DaemonSet{
ds := &appsv1.DaemonSet{
ObjectMeta: metav1.ObjectMeta{
Namespace: "fake-ns",
Name: "node-agent",
Expand Down Expand Up @@ -80,7 +80,7 @@ func TestIsRunning(t *testing.T) {
name: "succeed",
namespace: "fake-ns",
kubeClientObj: []runtime.Object{
daemonSet,
ds,
},
},
}
Expand All @@ -93,7 +93,7 @@ func TestIsRunning(t *testing.T) {
fakeKubeClient.Fake.PrependReactor(reactor.verb, reactor.resource, reactor.reactorFunc)
}

err := IsRunning(context.TODO(), fakeKubeClient, test.namespace)
err := isRunning(context.TODO(), fakeKubeClient, test.namespace, daemonSet)
if test.expectErr == "" {
assert.NoError(t, err)
} else {
Expand Down
6 changes: 6 additions & 0 deletions pkg/podvolume/backupper.go
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,12 @@
return nil, pvcSummary, nil
}

if err := kube.IsLinuxNode(b.ctx, pod.Spec.NodeName, b.crClient); err != nil {
err := errors.Wrapf(err, "Pod %s/%s is not running in linux node(%s), skip", pod.Namespace, pod.Name, pod.Spec.NodeName)
skipAllPodVolumes(pod, volumesToBackup, err, pvcSummary, log)
return nil, pvcSummary, []error{err}
}

Check warning on line 213 in pkg/podvolume/backupper.go

View check run for this annotation

Codecov / codecov/patch

pkg/podvolume/backupper.go#L210-L213

Added lines #L210 - L213 were not covered by tests

err := nodeagent.IsRunningInNode(b.ctx, backup.Namespace, pod.Spec.NodeName, b.crClient)
if err != nil {
skipAllPodVolumes(pod, volumesToBackup, err, pvcSummary, log)
Expand Down
44 changes: 41 additions & 3 deletions pkg/podvolume/backupper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,14 @@ func createPVBObj(fail bool, withSnapshot bool, index int, uploaderType string)
return pvbObj
}

// createNodeObj returns a fake node labeled with the linux OS label for tests.
func createNodeObj() *corev1api.Node {
	osLabels := map[string]string{"kubernetes.io/os": "linux"}
	return builder.ForNode("fake-node-name").Labels(osLabels).Result()
}

// createWindowsNodeObj returns a fake node labeled with the windows OS label for tests.
func createWindowsNodeObj() *corev1api.Node {
	osLabels := map[string]string{"kubernetes.io/os": "windows"}
	return builder.ForNode("fake-node-name").Labels(osLabels).Result()
}

func TestBackupPodVolumes(t *testing.T) {
scheme := runtime.NewScheme()
velerov1api.AddToScheme(scheme)
Expand Down Expand Up @@ -358,13 +366,32 @@ func TestBackupPodVolumes(t *testing.T) {
uploaderType: "kopia",
bsl: "fake-bsl",
},
{
name: "pod is not running on Linux node",
volumes: []string{
"fake-volume-1",
"fake-volume-2",
},
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createWindowsNodeObj(),
},
sourcePod: createPodObj(false, false, false, 2),
uploaderType: "kopia",
errs: []string{
"Pod fake-ns/fake-pod is not running in linux node(fake-node-name), skip",
},
},
{
name: "node-agent pod is not running in node",
volumes: []string{
"fake-volume-1",
"fake-volume-2",
},
sourcePod: createPodObj(true, false, false, 2),
sourcePod: createPodObj(true, false, false, 2),
kubeClientObj: []runtime.Object{
createNodeObj(),
},
uploaderType: "kopia",
errs: []string{
"daemonset pod not found in running state in node fake-node-name",
Expand All @@ -379,6 +406,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, false, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
},
uploaderType: "kopia",
mockGetRepositoryType: true,
Expand All @@ -395,6 +423,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, false, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
},
uploaderType: "kopia",
errs: []string{
Expand All @@ -410,6 +439,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, false, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
},
ctlClientObj: []runtime.Object{
createBackupRepoObj(),
Expand All @@ -427,6 +457,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, true, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
},
ctlClientObj: []runtime.Object{
createBackupRepoObj(),
Expand All @@ -448,6 +479,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, true, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
createPVCObj(1),
createPVCObj(2),
},
Expand All @@ -471,6 +503,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, true, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
createPVCObj(1),
createPVCObj(2),
createPVObj(1, true),
Expand All @@ -482,6 +515,7 @@ func TestBackupPodVolumes(t *testing.T) {
runtimeScheme: scheme,
uploaderType: "kopia",
bsl: "fake-bsl",
errs: []string{},
},
{
name: "volume not mounted by pod should be skipped",
Expand All @@ -492,6 +526,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, true, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
createPVCObj(1),
createPVCObj(2),
createPVObj(1, false),
Expand All @@ -503,6 +538,7 @@ func TestBackupPodVolumes(t *testing.T) {
runtimeScheme: scheme,
uploaderType: "kopia",
bsl: "fake-bsl",
errs: []string{},
},
{
name: "return completed pvbs",
Expand All @@ -512,6 +548,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, true, true, 1),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
createPVCObj(1),
createPVObj(1, false),
},
Expand All @@ -522,6 +559,7 @@ func TestBackupPodVolumes(t *testing.T) {
uploaderType: "kopia",
bsl: "fake-bsl",
pvbs: 1,
errs: []string{},
},
}
// TODO add more verification around PVCBackupSummary returned by "BackupPodVolumes"
Expand Down Expand Up @@ -568,8 +606,8 @@ func TestBackupPodVolumes(t *testing.T) {

pvbs, _, errs := bp.BackupPodVolumes(backupObj, test.sourcePod, test.volumes, nil, velerotest.NewLogger())

if errs == nil {
assert.Nil(t, test.errs)
if test.errs == nil {
assert.NoError(t, err)
} else {
for i := 0; i < len(errs); i++ {
assert.EqualError(t, errs[i], test.errs[i])
Expand Down
8 changes: 7 additions & 1 deletion pkg/podvolume/restorer.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ func (r *restorer) RestorePodVolumes(data RestoreData, tracker *volume.RestoreVo
return nil
}

if err := nodeagent.IsRunning(r.ctx, r.kubeClient, data.Restore.Namespace); err != nil {
if err := nodeagent.IsRunningOnLinux(r.ctx, r.kubeClient, data.Restore.Namespace); err != nil {
return []error{errors.Wrapf(err, "error to check node agent status")}
}

Expand Down Expand Up @@ -213,6 +213,12 @@ func (r *restorer) RestorePodVolumes(data RestoreData, tracker *volume.RestoreVo
} else if err != nil {
r.log.WithError(err).Error("Failed to check node-agent pod status, disengage")
} else {
if err := kube.IsLinuxNode(checkCtx, nodeName, r.crClient); err != nil {
r.log.WithField("node", nodeName).WithError(err).Error("Restored pod is not running in linux node")
r.nodeAgentCheck <- errors.Wrapf(err, "restored pod %s/%s is not running in linux node(%s)", data.Pod.Namespace, data.Pod.Name, nodeName)
return
}

err = nodeagent.IsRunningInNode(checkCtx, data.Restore.Namespace, nodeName, r.crClient)
if err != nil {
r.log.WithField("node", nodeName).WithError(err).Error("node-agent pod is not running in node, abort the restore")
Expand Down
35 changes: 28 additions & 7 deletions pkg/podvolume/restorer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ import (
"k8s.io/client-go/kubernetes"
kubefake "k8s.io/client-go/kubernetes/fake"
"k8s.io/client-go/tools/cache"
ctrlfake "sigs.k8s.io/controller-runtime/pkg/client/fake"

"github.com/vmware-tanzu/velero/internal/volume"
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
Expand Down Expand Up @@ -314,13 +313,38 @@ func TestRestorePodVolumes(t *testing.T) {
},
},
},
{
name: "pod is not running on linux nodes",
pvbs: []*velerov1api.PodVolumeBackup{
createPVBObj(true, true, 1, "kopia"),
},
kubeClientObj: []runtime.Object{
createNodeAgentDaemonset(),
createWindowsNodeObj(),
createPVCObj(1),
createPodObj(true, true, true, 1),
},
ctlClientObj: []runtime.Object{
createBackupRepoObj(),
},
restoredPod: createPodObj(true, true, true, 1),
sourceNamespace: "fake-ns",
bsl: "fake-bsl",
runtimeScheme: scheme,
errs: []expectError{
{
err: "restored pod fake-ns/fake-pod is not running in linux node(fake-node-name): os type windows for node fake-node-name is not linux",
},
},
},
{
name: "node-agent pod is not running",
pvbs: []*velerov1api.PodVolumeBackup{
createPVBObj(true, true, 1, "kopia"),
},
kubeClientObj: []runtime.Object{
createNodeAgentDaemonset(),
createNodeObj(),
createPVCObj(1),
createPodObj(true, true, true, 1),
},
Expand All @@ -344,6 +368,7 @@ func TestRestorePodVolumes(t *testing.T) {
},
kubeClientObj: []runtime.Object{
createNodeAgentDaemonset(),
createNodeObj(),
createPVCObj(1),
createPodObj(true, true, true, 1),
createNodeAgentPodObj(true),
Expand All @@ -368,11 +393,6 @@ func TestRestorePodVolumes(t *testing.T) {
ctx = test.ctx
}

fakeClientBuilder := ctrlfake.NewClientBuilder()
if test.runtimeScheme != nil {
fakeClientBuilder = fakeClientBuilder.WithScheme(test.runtimeScheme)
}

objClient := append(test.ctlClientObj, test.kubeClientObj...)
objClient = append(objClient, test.veleroClientObj...)

Expand Down Expand Up @@ -438,7 +458,8 @@ func TestRestorePodVolumes(t *testing.T) {
for i := 0; i < len(errs); i++ {
j := 0
for ; j < len(test.errs); j++ {
if errs[i].Error() == test.errs[j].err {
err := errs[i].Error()
if err == test.errs[j].err {
break
}
}
Expand Down
Loading
Loading