From a536ff0adf5f1e5d25946756c46289174c2c9a2a Mon Sep 17 00:00:00 2001 From: Radovan Zvoncek Date: Mon, 21 Oct 2024 16:20:33 +0300 Subject: [PATCH] Prevent sync tasks from creating backups from unsuccessful jobs --- .../medusa/medusabackupjob_controller_test.go | 30 +++++++-- controllers/medusa/medusatask_controller.go | 4 ++ .../medusa/medusatask_controller_test.go | 61 +++++++++++++++++-- 3 files changed, 84 insertions(+), 11 deletions(-) diff --git a/controllers/medusa/medusabackupjob_controller_test.go b/controllers/medusa/medusabackupjob_controller_test.go index 1f6b2bd81..1c3091387 100644 --- a/controllers/medusa/medusabackupjob_controller_test.go +++ b/controllers/medusa/medusabackupjob_controller_test.go @@ -249,7 +249,7 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.Cassa t.Logf("backup %s failed: %v", backupName, updated.Status.Failed) t.Logf("backup %s finished: %v", backupName, updated.Status.Finished) t.Logf("backup %s in progress: %v", backupName, updated.Status.InProgress) - return !updated.Status.FinishTime.IsZero() // && len(updated.Status.Finished) == 3 && len(updated.Status.InProgress) == 0 + return !updated.Status.FinishTime.IsZero() }, timeout, interval) t.Log("check for the MedusaBackup being created") @@ -370,8 +370,26 @@ func (c *fakeMedusaClient) CreateBackup(ctx context.Context, name string, backup } func (c *fakeMedusaClient) GetBackups(ctx context.Context) ([]*medusa.BackupSummary, error) { + backups := make([]*medusa.BackupSummary, 0) + for _, name := range c.RequestedBackups { + + // return status based on the backup name + // since we're implementing an altogether different method of the Medusa client, we cannot reuse the BackupStatus logic + // but we still want to "mock" failing backups + // this does not get called per node/pod, so we don't need to track counts of what we returned + var status medusa.StatusType + if strings.HasPrefix(name, "good") { + status = *medusa.StatusType_SUCCESS.Enum() + } else 
if strings.HasPrefix(name, "bad") { + status = *medusa.StatusType_FAILED.Enum() + } else if strings.HasPrefix(name, "missing") { + status = *medusa.StatusType_UNKNOWN.Enum() + } else { + status = *medusa.StatusType_IN_PROGRESS.Enum() + } + backup := &medusa.BackupSummary{ BackupName: name, StartTime: 0, @@ -380,7 +398,7 @@ func (c *fakeMedusaClient) GetBackups(ctx context.Context) ([]*medusa.BackupSumm FinishedNodes: 3, TotalObjects: fakeBackupFileCount, TotalSize: fakeBackupByteSize, - Status: *medusa.StatusType_SUCCESS.Enum(), + Status: status, Nodes: []*medusa.BackupNode{ { Host: "host1", @@ -408,17 +426,19 @@ func (c *fakeMedusaClient) GetBackups(ctx context.Context) ([]*medusa.BackupSumm } func (c *fakeMedusaClient) BackupStatus(ctx context.Context, name string) (*medusa.BackupStatusResponse, error) { + // return different status for differently named backups + // but for each not-successful backup, return not-a-success only once var status medusa.StatusType - if name == successfulBackupName || strings.HasPrefix(name, successfulBackupName) { + if strings.HasPrefix(name, successfulBackupName) { status = medusa.StatusType_SUCCESS - } else if name == failingBackupName { + } else if strings.HasPrefix(name, failingBackupName) { if !alreadyReportedFailingBackup { status = medusa.StatusType_FAILED alreadyReportedFailingBackup = true } else { status = medusa.StatusType_SUCCESS } - } else if name == missingBackupName { + } else if strings.HasPrefix(name, missingBackupName) { if !alreadyReportedMissingBackup { alreadyReportedMissingBackup = true // reproducing what the gRPC client would send. 
sadly, it's not a proper NotFound error diff --git a/controllers/medusa/medusatask_controller.go b/controllers/medusa/medusatask_controller.go index 5f3303cfb..9648415ff 100644 --- a/controllers/medusa/medusatask_controller.go +++ b/controllers/medusa/medusatask_controller.go @@ -269,6 +269,10 @@ func (r *MedusaTaskReconciler) syncOperation(ctx context.Context, task *medusav1 logger.Error(err, "failed to list backups", "CassandraPod", pod.Name) } else { for _, backup := range remoteBackups { + if backup.Status != medusa.StatusType_SUCCESS { + logger.Info(fmt.Sprintf("Skipping sync of backup %s because it wasn't a success", backup.BackupName)) + continue + } logger.Info("Syncing Backup", "Backup", backup.BackupName) // Create backups that should exist but are missing backupKey := types.NamespacedName{Namespace: task.Namespace, Name: backup.BackupName} diff --git a/controllers/medusa/medusatask_controller_test.go b/controllers/medusa/medusatask_controller_test.go index 3dea1f07e..fb87fc5d5 100644 --- a/controllers/medusa/medusatask_controller_test.go +++ b/controllers/medusa/medusatask_controller_test.go @@ -20,6 +20,8 @@ const ( backup2 = "good-backup2" backup3 = "good-backup3" backup4 = "good-backup4" + backup5 = "bad-backup5" + backup6 = "missing-backup6" ) func testMedusaTasks(t *testing.T, ctx context.Context, f *framework.Framework, namespace string) { @@ -152,9 +154,18 @@ func testMedusaTasks(t *testing.T, ctx context.Context, f *framework.Framework, require.True(backup3Created, "failed to create backup3") backup4Created := createAndVerifyMedusaBackup(dc2Key, dc2, f, ctx, require, t, namespace, backup4) require.True(backup4Created, "failed to create backup4") + backup5Created := createAndVerifyMedusaBackup(dc2Key, dc2, f, ctx, require, t, namespace, backup5) + require.False(backup5Created, "backup5 should not have been created") + backup6Created := createAndVerifyMedusaBackup(dc2Key, dc2, f, ctx, require, t, namespace, backup6) + require.False(backup6Created, "backup6 should not 
have been created") - // Ensure that 4 backups and backup jobs were created - checkBackupsAndJobs(require, ctx, 4, namespace, f, []string{}) + // Ensure that 6 backup jobs, but only 4 backups were created (two jobs did not succeed on some pods) + checkBackupsAndJobs(require, ctx, 6, 4, namespace, f, []string{}) + + checkSyncTask(require, ctx, namespace, "dc2", f) + + // Ensure the sync task did not create backups for the failed jobs + checkBackupsAndJobs(require, ctx, 6, 4, namespace, f, []string{}) // Purge backups and verify that only one out of three remains t.Log("purge backups") @@ -209,7 +220,7 @@ func testMedusaTasks(t *testing.T, ctx context.Context, f *framework.Framework, // Ensure that 2 backups and backup jobs were deleted deletedBackups := []string{backup1, backup2} - checkBackupsAndJobs(require, ctx, 2, namespace, f, deletedBackups) + checkBackupsAndJobs(require, ctx, 4, 2, namespace, f, deletedBackups) medusaBackup4Key := framework.NewClusterKey(f.DataPlaneContexts[0], namespace, backup4) medusaBackup4 := &api.MedusaBackup{} @@ -222,19 +233,57 @@ func testMedusaTasks(t *testing.T, ctx context.Context, f *framework.Framework, verifyObjectDoesNotExist(ctx, t, f, dc2Key, &cassdcapi.CassandraDatacenter{}) } -func checkBackupsAndJobs(require *require.Assertions, ctx context.Context, expectedLen int, namespace string, f *framework.Framework, deleted []string) { +func checkBackupsAndJobs(require *require.Assertions, ctx context.Context, expectedJobsLen, expectedBackupsLen int, namespace string, f *framework.Framework, deleted []string) { var backups api.MedusaBackupList err := f.List(ctx, framework.NewClusterKey(f.DataPlaneContexts[0], namespace, "list-backups"), &backups) require.NoError(err, "failed to list medusabackup") - require.Len(backups.Items, expectedLen, "expected %d backups, got %d", expectedLen, len(backups.Items)) + require.Len(backups.Items, expectedBackupsLen, "expected %d backups, got %d", expectedBackupsLen, len(backups.Items)) var jobs 
api.MedusaBackupJobList err = f.List(ctx, framework.NewClusterKey(f.DataPlaneContexts[0], namespace, "list-backup-jobs"), &jobs) require.NoError(err, "failed to list medusabackupjobs") - require.Len(jobs.Items, expectedLen, "expected %d jobs, got %d", expectedLen, len(jobs.Items)) + require.Len(jobs.Items, expectedJobsLen, "expected %d jobs, got %d", expectedJobsLen, len(jobs.Items)) for _, d := range deleted { require.NotContains(backups.Items, d, "MedusaBackup %s to have been deleted", d) require.NotContains(jobs.Items, d, "MedusaBackupJob %s to have been deleted", d) } } + +func checkSyncTask(require *require.Assertions, ctx context.Context, namespace, dc string, f *framework.Framework) { + syncTaskName := "sync-backups" + + // create a sync task + syncTask := &api.MedusaTask{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: syncTaskName, + Labels: map[string]string{ + "app": "medusa", + }, + }, + Spec: api.MedusaTaskSpec{ + CassandraDatacenter: dc, + Operation: "sync", + }, + } + syncTaskKey := framework.NewClusterKey(f.DataPlaneContexts[0], namespace, syncTaskName) + err := f.Create(ctx, syncTaskKey, syncTask) + require.NoError(err, "failed to create sync task") + + // wait for sync task to finish + require.Eventually(func() bool { + updated := &api.MedusaTask{} + err := f.Get(ctx, syncTaskKey, updated) + if err != nil { + return false + } + + v, ok := updated.Labels["app"] + if !ok || v != "medusa" { + return false + } + + return !updated.Status.FinishTime.IsZero() + }, timeout, interval) +}