Skip to content

Commit

Permalink
Prevent sync tasks from creating backups from unsuccessful jobs
Browse files Browse the repository at this point in the history
  • Loading branch information
rzvoncek committed Oct 21, 2024
1 parent 3b93c7d commit a536ff0
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 11 deletions.
30 changes: 25 additions & 5 deletions controllers/medusa/medusabackupjob_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.Cassa
t.Logf("backup %s failed: %v", backupName, updated.Status.Failed)
t.Logf("backup %s finished: %v", backupName, updated.Status.Finished)
t.Logf("backup %s in progress: %v", backupName, updated.Status.InProgress)
return !updated.Status.FinishTime.IsZero() // && len(updated.Status.Finished) == 3 && len(updated.Status.InProgress) == 0
return !updated.Status.FinishTime.IsZero()
}, timeout, interval)

t.Log("check for the MedusaBackup being created")
Expand Down Expand Up @@ -370,8 +370,26 @@ func (c *fakeMedusaClient) CreateBackup(ctx context.Context, name string, backup
}

func (c *fakeMedusaClient) GetBackups(ctx context.Context) ([]*medusa.BackupSummary, error) {

backups := make([]*medusa.BackupSummary, 0)

for _, name := range c.RequestedBackups {

// return status based on the backup name
// since we're implementing an altogether different method of the Medusa client, we cannot reuse the BackupStatus logic
// but we still want to "mock" failing backups
// this does not get called per node/pod, so we don't need to track counts of what we returned
var status medusa.StatusType
if strings.HasPrefix(name, "good") {
status = *medusa.StatusType_SUCCESS.Enum()
} else if strings.HasPrefix(name, "bad") {
status = *medusa.StatusType_FAILED.Enum()
} else if strings.HasPrefix(name, "missing") {
status = *medusa.StatusType_UNKNOWN.Enum()
} else {
status = *medusa.StatusType_IN_PROGRESS.Enum()
}

backup := &medusa.BackupSummary{
BackupName: name,
StartTime: 0,
Expand All @@ -380,7 +398,7 @@ func (c *fakeMedusaClient) GetBackups(ctx context.Context) ([]*medusa.BackupSumm
FinishedNodes: 3,
TotalObjects: fakeBackupFileCount,
TotalSize: fakeBackupByteSize,
Status: *medusa.StatusType_SUCCESS.Enum(),
Status: status,
Nodes: []*medusa.BackupNode{
{
Host: "host1",
Expand Down Expand Up @@ -408,17 +426,19 @@ func (c *fakeMedusaClient) GetBackups(ctx context.Context) ([]*medusa.BackupSumm
}

func (c *fakeMedusaClient) BackupStatus(ctx context.Context, name string) (*medusa.BackupStatusResponse, error) {
// return different status for differently named backups
// but for each not-successful backup, return not-a-success only once
var status medusa.StatusType
if name == successfulBackupName || strings.HasPrefix(name, successfulBackupName) {
if strings.HasPrefix(name, successfulBackupName) {
status = medusa.StatusType_SUCCESS
} else if name == failingBackupName {
} else if strings.HasPrefix(name, failingBackupName) {
if !alreadyReportedFailingBackup {
status = medusa.StatusType_FAILED
alreadyReportedFailingBackup = true
} else {
status = medusa.StatusType_SUCCESS
}
} else if name == missingBackupName {
} else if strings.HasPrefix(name, missingBackupName) {
if !alreadyReportedMissingBackup {
alreadyReportedMissingBackup = true
// reproducing what the gRPC client would send. sadly, it's not a proper NotFound error
Expand Down
4 changes: 4 additions & 0 deletions controllers/medusa/medusatask_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,10 @@ func (r *MedusaTaskReconciler) syncOperation(ctx context.Context, task *medusav1
logger.Error(err, "failed to list backups", "CassandraPod", pod.Name)
} else {
for _, backup := range remoteBackups {
if backup.Status != medusa.StatusType_SUCCESS {
logger.Info(fmt.Sprintf("Skipping sync of backup %s because it wasn't a success", backup.BackupName))
continue
}
logger.Info("Syncing Backup", "Backup", backup.BackupName)
// Create backups that should exist but are missing
backupKey := types.NamespacedName{Namespace: task.Namespace, Name: backup.BackupName}
Expand Down
61 changes: 55 additions & 6 deletions controllers/medusa/medusatask_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ const (
backup2 = "good-backup2"
backup3 = "good-backup3"
backup4 = "good-backup4"
backup5 = "bad-backup5"
backup6 = "missing-backup6"
)

func testMedusaTasks(t *testing.T, ctx context.Context, f *framework.Framework, namespace string) {
Expand Down Expand Up @@ -152,9 +154,18 @@ func testMedusaTasks(t *testing.T, ctx context.Context, f *framework.Framework,
require.True(backup3Created, "failed to create backup3")
backup4Created := createAndVerifyMedusaBackup(dc2Key, dc2, f, ctx, require, t, namespace, backup4)
require.True(backup4Created, "failed to create backup4")
backup5Created := createAndVerifyMedusaBackup(dc2Key, dc2, f, ctx, require, t, namespace, backup5)
require.False(backup5Created, "failed to create backup4")
backup6Created := createAndVerifyMedusaBackup(dc2Key, dc2, f, ctx, require, t, namespace, backup6)
require.False(backup6Created, "failed to create backup4")

// Ensure that 4 backups and backup jobs were created
checkBackupsAndJobs(require, ctx, 4, namespace, f, []string{})
// Ensure that 6 backups jobs, but only 4 backups were created (two jobs did not succeed on some pods)
checkBackupsAndJobs(require, ctx, 6, 4, namespace, f, []string{})

checkSyncTask(require, ctx, namespace, "dc2", f)

// Ensure the sync task did not create backups for the failed jobs
checkBackupsAndJobs(require, ctx, 6, 4, namespace, f, []string{})

// Purge backups and verify that only one out of three remains
t.Log("purge backups")
Expand Down Expand Up @@ -209,7 +220,7 @@ func testMedusaTasks(t *testing.T, ctx context.Context, f *framework.Framework,

// Ensure that 2 backups and backup jobs were deleted
deletedBackups := []string{backup1, backup2}
checkBackupsAndJobs(require, ctx, 2, namespace, f, deletedBackups)
checkBackupsAndJobs(require, ctx, 4, 2, namespace, f, deletedBackups)

medusaBackup4Key := framework.NewClusterKey(f.DataPlaneContexts[0], namespace, backup4)
medusaBackup4 := &api.MedusaBackup{}
Expand All @@ -222,19 +233,57 @@ func testMedusaTasks(t *testing.T, ctx context.Context, f *framework.Framework,
verifyObjectDoesNotExist(ctx, t, f, dc2Key, &cassdcapi.CassandraDatacenter{})
}

func checkBackupsAndJobs(require *require.Assertions, ctx context.Context, expectedLen int, namespace string, f *framework.Framework, deleted []string) {
func checkBackupsAndJobs(require *require.Assertions, ctx context.Context, expectedJobsLen, expectedBackupsLen int, namespace string, f *framework.Framework, deleted []string) {
var backups api.MedusaBackupList
err := f.List(ctx, framework.NewClusterKey(f.DataPlaneContexts[0], namespace, "list-backups"), &backups)
require.NoError(err, "failed to list medusabackup")
require.Len(backups.Items, expectedLen, "expected %d backups, got %d", expectedLen, len(backups.Items))
require.Len(backups.Items, expectedBackupsLen, "expected %d backups, got %d", expectedBackupsLen, len(backups.Items))

var jobs api.MedusaBackupJobList
err = f.List(ctx, framework.NewClusterKey(f.DataPlaneContexts[0], namespace, "list-backup-jobs"), &jobs)
require.NoError(err, "failed to list medusabackupjobs")
require.Len(jobs.Items, expectedLen, "expected %d jobs, got %d", expectedLen, len(jobs.Items))
require.Len(jobs.Items, expectedJobsLen, "expected %d jobs, got %d", expectedJobsLen, len(jobs.Items))

for _, d := range deleted {
require.NotContains(backups.Items, d, "MedusaBackup %s to have been deleted", d)
require.NotContains(jobs.Items, d, "MedusaBackupJob %s to have been deleted", d)
}
}

// checkSyncTask submits a MedusaTask with the "sync" operation against the
// given datacenter and blocks until the task reports a non-zero finish time.
func checkSyncTask(require *require.Assertions, ctx context.Context, namespace, dc string, f *framework.Framework) {
	taskName := "sync-backups"

	// Build and submit the sync task.
	task := &api.MedusaTask{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: namespace,
			Name:      taskName,
			Labels:    map[string]string{"app": "medusa"},
		},
		Spec: api.MedusaTaskSpec{
			CassandraDatacenter: dc,
			Operation:           "sync",
		},
	}
	taskKey := framework.NewClusterKey(f.DataPlaneContexts[0], namespace, taskName)
	require.NoError(f.Create(ctx, taskKey, task), "failed to create sync task")

	// Poll until the task finishes; sanity-check the label to be sure we
	// fetched the object we just created.
	require.Eventually(func() bool {
		fetched := &api.MedusaTask{}
		if err := f.Get(ctx, taskKey, fetched); err != nil {
			return false
		}
		// A missing key yields "" and fails the comparison, matching the
		// original ok-check.
		if fetched.Labels["app"] != "medusa" {
			return false
		}
		return !fetched.Status.FinishTime.IsZero()
	}, timeout, interval)
}

0 comments on commit a536ff0

Please sign in to comment.