Skip to content

Commit

Permalink
Enhance backup ready conditions
Browse files Browse the repository at this point in the history
  • Loading branch information
shreyas-s-rao committed Jun 19, 2023
1 parent c409d9c commit 088a2a8
Show file tree
Hide file tree
Showing 5 changed files with 187 additions and 41 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ install: manifests
kubectl apply -f config/crd/bases

# Deploy controller in the configured Kubernetes cluster in ~/.kube/config
.PHONY: deploy
.PHONY: deploy-via-kustomize
deploy-via-kustomize: manifests $(KUSTOMIZE)
kubectl apply -f config/crd/bases
kustomize build config/default | kubectl apply -f -
Expand Down
2 changes: 1 addition & 1 deletion pkg/health/condition/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ func (b *defaultBuilder) Build(replicas int32) []druidv1alpha1.Condition {
if condition.Status == "" {
condition.Status = druidv1alpha1.ConditionUnknown
}
condition.Reason = ConditionNotChecked
condition.Reason = NotChecked
condition.Message = "etcd cluster has been scaled down"
} else {
condition.Status = res.Status()
Expand Down
102 changes: 71 additions & 31 deletions pkg/health/condition/check_backup_ready.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"time"

druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1"
"github.com/gardener/etcd-druid/pkg/utils"
coordinationv1 "k8s.io/api/coordination/v1"

"k8s.io/apimachinery/pkg/types"
Expand All @@ -37,12 +38,12 @@ const (
BackupFailed string = "BackupFailed"
// Unknown is a constant that means that the etcd backup status is currently not known
Unknown string = "Unknown"
// ConditionNotChecked is a constant that means that the etcd backup status has not been updated or rechecked
ConditionNotChecked string = "ConditionNotChecked"
// NotChecked is a constant that means that the etcd backup status has not been updated or rechecked
NotChecked string = "ConditionNotChecked"
)

func (a *backupReadyCheck) Check(ctx context.Context, etcd druidv1alpha1.Etcd) Result {
//Default case
// Default case
result := &result{
conType: druidv1alpha1.ConditionTypeBackupReady,
status: druidv1alpha1.ConditionUnknown,
Expand All @@ -56,53 +57,92 @@ func (a *backupReadyCheck) Check(ctx context.Context, etcd druidv1alpha1.Etcd) R
return nil
}

//Fetch snapshot leases
// Fetch snapshot leases
var (
fullSnapErr, incrSnapErr error
fullSnapLease = &coordinationv1.Lease{}
deltaSnapLease = &coordinationv1.Lease{}
err error
fullSnapLease = &coordinationv1.Lease{}
deltaSnapLease = &coordinationv1.Lease{}
)
fullSnapErr = a.cl.Get(ctx, types.NamespacedName{Name: getFullSnapLeaseName(&etcd), Namespace: etcd.ObjectMeta.Namespace}, fullSnapLease)
incrSnapErr = a.cl.Get(ctx, types.NamespacedName{Name: getDeltaSnapLeaseName(&etcd), Namespace: etcd.ObjectMeta.Namespace}, deltaSnapLease)

//Set status to Unknown if errors in fetching snapshot leases or lease never renewed
if fullSnapErr != nil || incrSnapErr != nil || (fullSnapLease.Spec.RenewTime == nil && deltaSnapLease.Spec.RenewTime == nil) {
err = a.cl.Get(ctx, types.NamespacedName{Name: getFullSnapLeaseName(&etcd), Namespace: etcd.ObjectMeta.Namespace}, fullSnapLease)
if err != nil {
result.message = fmt.Sprintf("Unable to fetch full snap lease. %s", err.Error())
return result
}

err = a.cl.Get(ctx, types.NamespacedName{Name: getDeltaSnapLeaseName(&etcd), Namespace: etcd.ObjectMeta.Namespace}, deltaSnapLease)
if err != nil {
result.message = fmt.Sprintf("Unable to fetch delta snap lease. %s", err.Error())
return result
}

deltaLeaseRenewTime := deltaSnapLease.Spec.RenewTime
fullLeaseRenewTime := fullSnapLease.Spec.RenewTime
fullLeaseCreateTime := &fullSnapLease.ObjectMeta.CreationTimestamp

if fullLeaseRenewTime == nil && deltaLeaseRenewTime != nil {
if fullLeaseRenewTime == nil && deltaLeaseRenewTime == nil {
// Both snapshot leases are not yet renewed
result.message = "Snapshotter has not started yet"
return result

} else if fullLeaseRenewTime == nil && deltaLeaseRenewTime != nil {
// Most probable during reconcile of existing clusters if fresh leases are created
// Treat backup as succeeded if delta snap lease renewal happens in the required time window and full snap lease is not older than 24h.
if time.Since(deltaLeaseRenewTime.Time) < 2*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration && time.Since(fullLeaseCreateTime.Time) < 24*time.Hour {
result.reason = BackupSucceeded
result.message = "Delta snapshot backup succeeded"
result.status = druidv1alpha1.ConditionTrue

fullSnapshotDuration, err := utils.ComputeScheduleDuration(*etcd.Spec.Backup.FullSnapshotSchedule)
if err != nil {
return result
}

// Treat backup as succeeded if delta snap lease renewal happens in the required time window
// and full snap lease is not older than the computed full snapshot duration.
if time.Since(deltaLeaseRenewTime.Time) < 2*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration {
if time.Since(fullLeaseCreateTime.Time) < fullSnapshotDuration {
result.reason = BackupSucceeded
result.message = "Delta snapshot backup succeeded"
result.status = druidv1alpha1.ConditionTrue
return result
} else {
result.status = druidv1alpha1.ConditionFalse
result.reason = BackupFailed
result.message = "Full snapshot backup failed. Full snapshot lease created long ago, but not renewed"
return result
}
}

} else if fullLeaseRenewTime != nil && deltaLeaseRenewTime == nil {
// Most probable during a startup scenario for new clusters
// Special case: return Unknown condition for some time to allow delta backups to start up
// Even though full snapshot may have succeeded by the required time, we must still wait
// for delta snapshotting to begin to consider the backups as healthy, to maintain the given RPO
if time.Since(fullLeaseRenewTime.Time) < 5*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration {
result.message = "Waiting for periodic delta snapshotting to begin"
return result
} else {
result.status = druidv1alpha1.ConditionFalse
result.reason = BackupFailed
result.message = "Delta snapshot backup failed. Delta snapshot lease created long ago, but not renewed"
return result
}
} else if deltaLeaseRenewTime == nil && fullLeaseRenewTime != nil {
//Most probable during a startup scenario for new clusters
//Special case. Return Unknown condition for some time to allow delta backups to start up
if time.Since(fullLeaseRenewTime.Time) > 5*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration {
result.message = "Periodic delta snapshots not started yet"

} else {
// Both snap leases are renewed, ie, maintained. Both are expected to be renewed periodically
fullSnapshotDuration, err := utils.ComputeScheduleDuration(*etcd.Spec.Backup.FullSnapshotSchedule)
if err != nil {
return result
}
} else if deltaLeaseRenewTime != nil && fullLeaseRenewTime != nil {
//Both snap leases are maintained. Both are expected to be renewed periodically
if time.Since(deltaLeaseRenewTime.Time) < 2*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration && time.Since(fullLeaseRenewTime.Time) < 24*time.Hour {

// Mark backup as succeeded only if at least one of the last two delta snapshots has been taken successfully
// and if the last full snapshot has been taken according to the given full snapshot schedule
if time.Since(deltaLeaseRenewTime.Time) < 2*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration && time.Since(fullLeaseRenewTime.Time) < fullSnapshotDuration {
result.reason = BackupSucceeded
result.message = "Snapshot backup succeeded"
result.status = druidv1alpha1.ConditionTrue
return result
}
}

//Cases where snapshot leases are not updated for a long time
//If snapshot leases are present and leases aren't updated, it is safe to assume that backup is not healthy

// Cases where snapshot leases are not updated for a long time
// If snapshot leases are present and leases aren't updated, it is safe to assume that backup is not healthy
if etcd.Status.Conditions != nil {
var prevBackupReadyStatus druidv1alpha1.Condition
for _, prevBackupReadyStatus = range etcd.Status.Conditions {
Expand All @@ -122,16 +162,16 @@ func (a *backupReadyCheck) Check(ctx context.Context, etcd druidv1alpha1.Etcd) R
}
}

//Transition to "Unknown" state is we cannot prove a "True" state
// Transition to "Unknown" state is we cannot prove a "True" state
return result
}

func getDeltaSnapLeaseName(etcd *druidv1alpha1.Etcd) string {
return fmt.Sprintf("%s-delta-snap", string(etcd.ObjectMeta.Name))
return fmt.Sprintf("%s-delta-snap", etcd.ObjectMeta.Name)
}

func getFullSnapLeaseName(etcd *druidv1alpha1.Etcd) string {
return fmt.Sprintf("%s-full-snap", string(etcd.ObjectMeta.Name))
return fmt.Sprintf("%s-full-snap", etcd.ObjectMeta.Name)
}

// BackupReadyCheck returns a check for the "BackupReady" condition.
Expand Down
104 changes: 96 additions & 8 deletions pkg/health/condition/check_backup_ready_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"k8s.io/apimachinery/pkg/types"
"k8s.io/utils/pointer"
"sigs.k8s.io/controller-runtime/pkg/client"

coordinationv1 "k8s.io/api/coordination/v1"
Expand Down Expand Up @@ -56,6 +57,7 @@ var _ = Describe("BackupReadyCheck", func() {
Spec: druidv1alpha1.EtcdSpec{
Replicas: 1,
Backup: druidv1alpha1.BackupSpec{
FullSnapshotSchedule: pointer.String("0 0 * * *"), // at 00:00 every day
DeltaSnapshotPeriod: &v1.Duration{
Duration: deltaSnapshotDuration,
},
Expand Down Expand Up @@ -90,7 +92,7 @@ var _ = Describe("BackupReadyCheck", func() {
})

Context("With no snapshot leases present", func() {
It("Should return Unknown rediness", func() {
It("Should return Unknown readiness", func() {
cl.EXPECT().Get(context.TODO(), gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn(
func(_ context.Context, _ client.ObjectKey, er *coordinationv1.Lease, _ ...client.GetOption) error {
return &noLeaseError
Expand All @@ -108,24 +110,35 @@ var _ = Describe("BackupReadyCheck", func() {
})

Context("With both snapshot leases present", func() {
It("Should set status to BackupSucceeded if both leases are recently renewed", func() {
It("Should set status to Unknown if both leases are recently created but not renewed", func() {
cl.EXPECT().Get(context.TODO(), gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn(
func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error {
*le = lease
le.Spec.RenewTime = nil
le.Spec.HolderIdentity = nil
return nil
},
).AnyTimes()

etcd.Status.Conditions = []druidv1alpha1.Condition{
{
Type: druidv1alpha1.ConditionTypeBackupReady,
Status: druidv1alpha1.ConditionTrue,
Message: "True",
},
}

check := BackupReadyCheck(cl)
result := check.Check(context.TODO(), etcd)

Expect(result).ToNot(BeNil())
Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionTrue))
Expect(result.Reason()).To(Equal(BackupSucceeded))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionUnknown))
Expect(result.Reason()).To(Equal(Unknown))
Expect(result.Message()).To(Equal("Snapshotter has not started yet"))
})

It("Should set status to BackupSucceeded if delta snap lease is recently created and empty full snap lease has been created in the last 24h", func() {
It("Should set status to BackupSucceeded if delta snap lease is renewed recently, and empty full snap lease has been created in the last 24h", func() {
cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-full-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn(
func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error {
*le = lease
Expand All @@ -149,13 +162,41 @@ var _ = Describe("BackupReadyCheck", func() {
Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionTrue))
Expect(result.Reason()).To(Equal(BackupSucceeded))
Expect(result.Message()).To(Equal("Delta snapshot backup succeeded"))
})

It("Should set status to Unknown if delta snap lease is renewed recently, and no full snap lease has been created in the last 24h", func() {
cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-full-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn(
func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error {
*le = lease
le.Spec.RenewTime = nil
le.Spec.HolderIdentity = nil
le.ObjectMeta.CreationTimestamp = v1.NewTime(time.Now().Add(-25 * time.Hour))
return nil
},
).AnyTimes()
cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-delta-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn(
func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error {
*le = lease
return nil
},
).AnyTimes()

check := BackupReadyCheck(cl)
result := check.Check(context.TODO(), etcd)

Expect(result).ToNot(BeNil())
Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionFalse))
Expect(result.Reason()).To(Equal(BackupFailed))
Expect(result.Message()).To(Equal("Full snapshot backup failed. Full snapshot lease created long ago, but not renewed"))
})

It("Should set status to Unknown if empty delta snap lease is present but full snap lease is renewed recently", func() {
It("Should set status to Unknown if empty delta snap lease is present but full snap lease has been renewed recently", func() {
cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-full-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn(
func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error {
*le = lease
le.Spec.RenewTime = &v1.MicroTime{Time: lease.Spec.RenewTime.Time.Add(-5 * deltaSnapshotDuration)}
le.Spec.RenewTime = &v1.MicroTime{Time: lease.Spec.RenewTime.Time.Add(-4 * deltaSnapshotDuration)}
return nil
},
).AnyTimes()
Expand All @@ -175,7 +216,52 @@ var _ = Describe("BackupReadyCheck", func() {
Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionUnknown))
Expect(result.Reason()).To(Equal(Unknown))
Expect(result.Message()).To(Equal("Periodic delta snapshots not started yet"))
Expect(result.Message()).To(Equal("Waiting for periodic delta snapshotting to begin"))
})

It("Should set status to Unknown if empty delta snap lease is present but full snap lease has not been renewed recently", func() {
cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-full-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn(
func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error {
*le = lease
le.Spec.RenewTime = &v1.MicroTime{Time: lease.Spec.RenewTime.Time.Add(-6 * deltaSnapshotDuration)}
return nil
},
).AnyTimes()
cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-delta-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn(
func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error {
*le = lease
le.Spec.RenewTime = nil
le.Spec.HolderIdentity = nil
return nil
},
).AnyTimes()

check := BackupReadyCheck(cl)
result := check.Check(context.TODO(), etcd)

Expect(result).ToNot(BeNil())
Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionFalse))
Expect(result.Reason()).To(Equal(BackupFailed))
Expect(result.Message()).To(Equal("Delta snapshot backup failed. Delta snapshot lease created long ago, but not renewed"))
})

It("Should set status to BackupSucceeded if both leases are recently renewed", func() {
cl.EXPECT().Get(context.TODO(), gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn(
func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error {
*le = lease
return nil
},
).AnyTimes()

check := BackupReadyCheck(cl)
result := check.Check(context.TODO(), etcd)

Expect(result).ToNot(BeNil())
Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionTrue))
Expect(result.Reason()).To(Equal(BackupSucceeded))
Expect(result.Message()).To(Equal("Snapshot backup succeeded"))
})

It("Should set status to Unknown if both leases are stale", func() {
Expand Down Expand Up @@ -204,6 +290,7 @@ var _ = Describe("BackupReadyCheck", func() {
Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionUnknown))
Expect(result.Reason()).To(Equal(Unknown))
Expect(result.Message()).To(Equal("Cannot determine etcd backup status"))
})

It("Should set status to BackupFailed if both leases are stale and current condition is Unknown", func() {
Expand Down Expand Up @@ -232,6 +319,7 @@ var _ = Describe("BackupReadyCheck", func() {
Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionFalse))
Expect(result.Reason()).To(Equal(BackupFailed))
Expect(result.Message()).To(Equal("Stale snapshot leases. Not renewed in a long time"))
})
})

Expand Down
18 changes: 18 additions & 0 deletions pkg/utils/miscellaneous.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ package utils

import (
"fmt"
"time"

"github.com/robfig/cron/v3"
"sigs.k8s.io/controller-runtime/pkg/client"
)

Expand Down Expand Up @@ -77,3 +79,19 @@ func Max(x, y int) int {
}
return x
}

// ComputeScheduleDuration computes the duration between two activations for the given cron schedule.
// Assumes that every cron activation is at equal durations apart, based on cron schedules such as
// "once every X hours", "once every Y days", "at 1:00pm on every Tuesday", etc.
// TODO: write a new function to accurately compute the previous activation time from the cron schedule
// in order to compute when the previous activation of the cron schedule was supposed to have occurred.
func ComputeScheduleDuration(cronSchedule string) (time.Duration, error) {
schedule, err := cron.ParseStandard(cronSchedule)
if err != nil {
return 0, err
}

nextScheduledTime := schedule.Next(time.Now())
nextNextScheduledTime := schedule.Next(nextScheduledTime.Add(time.Second))
return nextNextScheduledTime.Sub(nextScheduledTime), nil
}

0 comments on commit 088a2a8

Please sign in to comment.