Skip to content

Commit

Permalink
Add operational-mode annotation to resolve VRG state ambiguity during…
Browse files Browse the repository at this point in the history
… failover

- Problem: During failover, if the primary cluster is inaccessible, the DRPC cannot
  distinguish between a VRG in its final secondary state and one transitioning to
  secondary. This ambiguity prevents the user from failing over the application.
- Solution: Introduced  annotation to indicate the current operational mode of the VRG.

This fix allows the DRPC to correctly identify the VRG's state and maintaining the
PeerReady condition.

Fixes: bz-2264765

Signed-off-by: Benamar Mekhissi <[email protected]>
  • Loading branch information
Benamar Mekhissi committed Dec 16, 2024
1 parent 4563fb3 commit 6c0a8ce
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 16 deletions.
83 changes: 67 additions & 16 deletions internal/controller/drplacementcontrol.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ const (

// Annotation for application namespace on the managed cluster
DRPCAppNamespace = "drplacementcontrol.ramendr.openshift.io/app-namespace"

// vrg-operational-mode annotation will be used to indicate the current operational mode of the VRG resource.
// This annotation helps track whether the resource is currently functioning as Primary or Secondary, even when
// it is in the middle of transitioning between these states.
VRGOpModeAnnoKey = "drplacementcontrol.ramendr.openshift.io/vrg-operational-mode"
)

var (
Expand Down Expand Up @@ -800,6 +805,46 @@ func checkActivationForStorageIdentifier(
return false
}

func (d *DRPCInstance) isCurrentPrimaryValidForRelocation(preferredCluster string) (string, bool, error) {
currentPrimary, secondaries, err := d.validateAndSelectCurrentPrimary(preferredCluster)
d.log.Info("isCurrentPrimaryValidForRelocation: ", "cp", currentPrimary, "secondaries", secondaries, "err", err)

if err != nil {
return "", false, err
}

if currentPrimary == "" {
if d.isPrimaryTransitioning(secondaries) {
return "", true, nil
}
}

return currentPrimary, false, nil
}

// isPrimaryTransitioning checks if any of the secondary clusters are in the process of transitioning.
// It iterates through the list of secondary clusters, comparing the current replication state
// (from annotations) to the desired replication state (from the spec). If any of the secondary
// clusters have a current state (from annotation) that does not match the desired state, it returns true,
// indicating that a transition is in progress. If all secondary clusters have matching states, it returns false.
func (d *DRPCInstance) isPrimaryTransitioning(secondaries []string) bool {
for _, secondaryCluster := range secondaries {
vrg := d.vrgs[secondaryCluster]
if vrg.Status.State == rmn.SecondaryState {
continue
}

currentState := rmn.ReplicationState(vrg.Annotations[VRGOpModeAnnoKey])
desiredState := vrg.Spec.ReplicationState

if currentState != desiredState {
return true
}
}

return false
}

// runRelocate checks if pre-conditions for relocation are met, and if so performs the relocation
// Pre-requisites for relocation are checked as follows:
// - The exists at least one VRG across clusters (there is no state where we do not have a VRG as
Expand All @@ -819,7 +864,7 @@ func checkActivationForStorageIdentifier(
// - Check if current primary (that is not the preferred cluster), is ready to switch over
// - Relocate!
//
//nolint:gocognit,cyclop,funlen
//nolint:gocognit,cyclop,funlen,gocyclo
func (d *DRPCInstance) RunRelocate() (bool, error) {
d.log.Info("Entering RunRelocate", "state", d.getLastDRState(), "progression", d.getProgression())

Expand All @@ -837,16 +882,21 @@ func (d *DRPCInstance) RunRelocate() (bool, error) {
preferredClusterNamespace := d.instance.Spec.PreferredCluster

// Before relocating to the preferredCluster, do a quick validation and select the current preferred cluster.
curHomeCluster, err := d.validateAndSelectCurrentPrimary(preferredCluster)
if err != nil {
currentPrimary, transitioning, err := d.isCurrentPrimaryValidForRelocation(preferredCluster)
if (currentPrimary == "" && transitioning) || err != nil {
errMsg := "The DRPC is currently in the process of transitioning to secondary"
if err != nil {
errMsg = err.Error()
}

addOrUpdateCondition(&d.instance.Status.Conditions, rmn.ConditionAvailable, d.instance.Generation,
d.getConditionStatusForTypeAvailable(), string(d.instance.Status.Phase), err.Error())
d.getConditionStatusForTypeAvailable(), string(d.instance.Status.Phase), errMsg)

return !done, err
}

// If already relocated to preferredCluster; ensure required setup is complete
if curHomeCluster != "" && d.vrgExistsAndPrimary(preferredCluster) {
if currentPrimary != "" && d.vrgExistsAndPrimary(preferredCluster) {
d.setDRState(rmn.Relocating)
d.updatePreferredDecision()

Expand All @@ -868,9 +918,9 @@ func (d *DRPCInstance) RunRelocate() (bool, error) {
d.setStatusInitiating()

// Check if current primary (that is not the preferred cluster), is ready to switch over
if curHomeCluster != "" && curHomeCluster != preferredCluster &&
!d.readyToSwitchOver(curHomeCluster, preferredCluster) {
errMsg := fmt.Sprintf("current cluster (%s) has not completed protection actions", curHomeCluster)
if currentPrimary != "" && currentPrimary != preferredCluster &&
!d.readyToSwitchOver(currentPrimary, preferredCluster) {
errMsg := fmt.Sprintf("current cluster (%s) has not completed protection actions", currentPrimary)
addOrUpdateCondition(&d.instance.Status.Conditions, rmn.ConditionAvailable, d.instance.Generation,
d.getConditionStatusForTypeAvailable(), string(d.instance.Status.Phase), errMsg)

Expand All @@ -881,8 +931,8 @@ func (d *DRPCInstance) RunRelocate() (bool, error) {
return !done, fmt.Errorf("clean up secondaries is pending, peer is not ready")
}

if curHomeCluster != "" && curHomeCluster != preferredCluster {
result, err := d.quiesceAndRunFinalSync(curHomeCluster)
if currentPrimary != "" && currentPrimary != preferredCluster {
result, err := d.quiesceAndRunFinalSync(currentPrimary)
if err != nil {
return !done, err
}
Expand Down Expand Up @@ -1139,25 +1189,25 @@ func (d *DRPCInstance) selectCurrentPrimaryAndSecondaries() (string, []string) {
return primaryVRG, secondaryVRGs
}

func (d *DRPCInstance) validateAndSelectCurrentPrimary(preferredCluster string) (string, error) {
func (d *DRPCInstance) validateAndSelectCurrentPrimary(preferredCluster string) (string, []string, error) {
// Relocation requires preferredCluster to be configured
if preferredCluster == "" {
return "", fmt.Errorf("preferred cluster not valid")
return "", []string{}, fmt.Errorf("preferred cluster not valid")
}

// No VRGs found, invalid state, possibly deployment was not started
if len(d.vrgs) == 0 {
return "", fmt.Errorf("no VRGs exists. Can't relocate")
return "", []string{}, fmt.Errorf("no VRGs exists. Can't relocate")
}

// Check for at most a single cluster in primary state
if d.areMultipleVRGsPrimary() {
return "", fmt.Errorf("multiple primaries in transition detected")
return "", []string{}, fmt.Errorf("multiple primaries in transition detected")
}
// Pre-relocate cleanup
homeCluster, _ := d.selectCurrentPrimaryAndSecondaries()
homeCluster, secondaries := d.selectCurrentPrimaryAndSecondaries()

return homeCluster, nil
return homeCluster, secondaries, nil
}

// readyToSwitchOver checks App resources are ready and the cluster data has been protected.
Expand Down Expand Up @@ -1711,6 +1761,7 @@ func (d *DRPCInstance) updateVRGOptionalFields(vrg, vrgFromView *rmn.VolumeRepli
DoNotDeletePVCAnnotation: d.instance.GetAnnotations()[DoNotDeletePVCAnnotation],
DRPCUIDAnnotation: string(d.instance.UID),
rmnutil.IsCGEnabledAnnotation: d.instance.GetAnnotations()[rmnutil.IsCGEnabledAnnotation],
VRGOpModeAnnoKey: string(vrg.Spec.ReplicationState),
}

vrg.Spec.ProtectedNamespaces = d.instance.Spec.ProtectedNamespaces
Expand Down
5 changes: 5 additions & 0 deletions internal/controller/drplacementcontrol_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1255,6 +1255,7 @@ func verifyVRGManifestWorkCreatedAsPrimary(namespace, managedCluster string) {
Expect(vrg.Name).Should(Equal(DRPCCommonName))
Expect(vrg.Spec.PVCSelector.MatchLabels["appclass"]).Should(Equal("gold"))
Expect(vrg.Spec.ReplicationState).Should(Equal(rmn.Primary))
Expect(vrg.Annotations[controllers.VRGOpModeAnnoKey]).Should(Equal(string(rmn.Primary)))

// ensure DRPC copied KubeObjectProtection contents to VRG
drpc := getLatestDRPC(namespace)
Expand Down Expand Up @@ -1794,6 +1795,10 @@ func verifyFailoverToSecondary(placementObj client.Object, toCluster string,
decision := getLatestUserPlacementDecision(placementObj.GetName(), placementObj.GetNamespace())
Expect(decision.ClusterName).To(Equal(toCluster))
Expect(drpc.GetAnnotations()[controllers.LastAppDeploymentCluster]).To(Equal(toCluster))
vrg, err := getVRGFromManifestWork(toCluster)
Expect(err).NotTo(HaveOccurred())
Expect(vrg.Spec.ReplicationState).Should(Equal(rmn.Primary))
Expect(vrg.Annotations[controllers.VRGOpModeAnnoKey]).Should(Equal(string(rmn.Primary)))
}

func verifyActionResultForPlacement(placement *clrapiv1beta1.Placement, homeCluster string, plType PlacementType) {
Expand Down

0 comments on commit 6c0a8ce

Please sign in to comment.