
Commit

Implement recreate ExistingResourcePolicy to restore API
Signed-off-by: Tiger Kaovilai <[email protected]>
kaovilai committed Jul 19, 2023
1 parent c4286d7 commit da6fdac
Showing 8 changed files with 78 additions and 14 deletions.
1 change: 1 addition & 0 deletions changelogs/unreleased/6354-kaovilai
@@ -0,0 +1 @@
Implement recreate ExistingResourcePolicy to restore API
4 changes: 4 additions & 0 deletions config/crd/v1/bases/velero.io_restores.yaml
@@ -56,6 +56,10 @@ spec:
existingResourcePolicy:
description: ExistingResourcePolicy specifies the restore behavior
for the kubernetes resource to be restored
enum:
- none
- update
- recreate
nullable: true
type: string
hooks:
2 changes: 1 addition & 1 deletion config/crd/v1/crds/crds.go

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions design/Implemented/existing-resource-policy_design.md
@@ -55,10 +55,11 @@ skip restoration.
- Changed resources: Velero will first try to patch the changed resource, Now if the patch:
- succeeds: Then the in-cluster resource gets updated with the labels as well as the resource diff
- fails: Velero adds a restore warning and tries to just update the backup/restore labels on the resource, if the labels patch also fails then we add restore error.
3. `recreate`: If resource already exists, then Velero will delete it and recreate the resource.
3. `recreate`: Similar to `update`, but if the resource already exists and is immutable (such as Pods), then Velero will attempt to recreate the resource by deleting it first.

*Note:* The `recreate` option is a non-goal for this enhancement proposal, but it is considered as a future scope.
Another thing to highlight is that Velero will not be deleting any resources in any of the policy options proposed in
*Note:* The `recreate` option is being implemented in [#6354](https://github.com/vmware-tanzu/velero/pull/6354) separately from the original proposal.

Velero will not be deleting any resources for the `update` and `none` policies proposed in
this design, but it will patch the resources when the `update` policy option is used.

Example:
4 changes: 4 additions & 0 deletions pkg/apis/velero/v1/restore_types.go
@@ -109,6 +109,7 @@ type RestoreSpec struct {
Hooks RestoreHooks `json:"hooks,omitempty"`

// ExistingResourcePolicy specifies the restore behavior for the kubernetes resource to be restored
// +kubebuilder:validation:Enum=none;update;recreate
// +optional
// +nullable
ExistingResourcePolicy PolicyType `json:"existingResourcePolicy,omitempty"`
@@ -272,6 +273,9 @@ const (
// PolicyTypeUpdate means velero will try to attempt a patch on
// the changed resources.
PolicyTypeUpdate PolicyType = "update"

// PolicyTypeRecreate means velero will attempt a patch on the changed resource and fall back to recreating the resource when the patch fails.
PolicyTypeRecreate PolicyType = "recreate"
)

// RestoreStatus captures the current status of a Velero restore
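
For context, a minimal sketch (not part of this diff) of how a caller could set the new policy on a Restore object using these API types; the function name, namespace value, and import aliases are illustrative assumptions:

import (
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

    velerov1 "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
)

// buildRecreateRestore returns a Restore that asks Velero to patch existing
// resources and, if patching fails, delete and recreate them from the backup.
func buildRecreateRestore(name, backupName string) *velerov1.Restore {
    return &velerov1.Restore{
        ObjectMeta: metav1.ObjectMeta{Namespace: "velero", Name: name},
        Spec: velerov1.RestoreSpec{
            BackupName:             backupName,
            ExistingResourcePolicy: velerov1.PolicyTypeRecreate,
        },
    }
}
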
6 changes: 3 additions & 3 deletions pkg/cmd/cli/restore/create.go
@@ -117,7 +117,7 @@ func (o *CreateOptions) BindFlags(flags *pflag.FlagSet) {
flags.Var(&o.Labels, "labels", "Labels to apply to the restore.")
flags.Var(&o.IncludeResources, "include-resources", "Resources to include in the restore, formatted as resource.group, such as storageclasses.storage.k8s.io (use '*' for all resources).")
flags.Var(&o.ExcludeResources, "exclude-resources", "Resources to exclude from the restore, formatted as resource.group, such as storageclasses.storage.k8s.io.")
flags.StringVar(&o.ExistingResourcePolicy, "existing-resource-policy", "", "Restore Policy to be used during the restore workflow, can be - none or update")
flags.StringVar(&o.ExistingResourcePolicy, "existing-resource-policy", "", "Restore Policy to be used during the restore workflow, can be - none, update, or recreate")
flags.Var(&o.StatusIncludeResources, "status-include-resources", "Resources to include in the restore status, formatted as resource.group, such as storageclasses.storage.k8s.io.")
flags.Var(&o.StatusExcludeResources, "status-exclude-resources", "Resources to exclude from the restore status, formatted as resource.group, such as storageclasses.storage.k8s.io.")
flags.VarP(&o.Selector, "selector", "l", "Only restore resources matching this label selector.")
@@ -181,7 +181,7 @@ func (o *CreateOptions) Validate(c *cobra.Command, args []string, f client.Facto
}

if len(o.ExistingResourcePolicy) > 0 && !isResourcePolicyValid(o.ExistingResourcePolicy) {
return errors.New("existing-resource-policy has invalid value, it accepts only none, update as value")
return errors.New("existing-resource-policy has invalid value, it accepts only none, update, or recreate as value")
}

switch {
@@ -390,7 +390,7 @@ func (o *CreateOptions) Run(c *cobra.Command, f client.Factory) error {
}

func isResourcePolicyValid(resourcePolicy string) bool {
if resourcePolicy == string(api.PolicyTypeNone) || resourcePolicy == string(api.PolicyTypeUpdate) {
if resourcePolicy == string(api.PolicyTypeNone) || resourcePolicy == string(api.PolicyTypeUpdate) || resourcePolicy == string(api.PolicyTypeRecreate) {
return true
}
return false
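
A small in-package sketch (a hypothetical test, not part of this change, assuming the standard testing import) of what the updated validator now accepts; the test name and table are assumptions:

func TestIsResourcePolicyValid(t *testing.T) {
    cases := map[string]bool{
        "none":     true,
        "update":   true,
        "recreate": true,
        "skip":     false,
        "":         false,
    }
    for input, want := range cases {
        if got := isResourcePolicyValid(input); got != want {
            t.Errorf("isResourcePolicyValid(%q) = %v, want %v", input, got, want)
        }
    }
}
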
60 changes: 55 additions & 5 deletions pkg/restore/restore.go
@@ -1450,20 +1450,33 @@ func (ctx *restoreContext) restoreItem(obj *unstructured.Unstructured, groupReso
ctx.log.Infof("restore API has resource policy defined %s , executing restore workflow accordingly for changed resource %s %s", resourcePolicy, fromCluster.GroupVersionKind().Kind, kube.NamespaceAndName(fromCluster))

// existingResourcePolicy is set as none, add warning
if resourcePolicy == velerov1api.PolicyTypeNone {
switch resourcePolicy {
case velerov1api.PolicyTypeNone:
e := errors.Errorf("could not restore, %s %q already exists. Warning: the in-cluster version is different than the backed-up version",
obj.GetKind(), obj.GetName())
warnings.Add(namespace, e)
// existingResourcePolicy is set as update, attempt patch on the resource and add warning if it fails
} else if resourcePolicy == velerov1api.PolicyTypeUpdate {
case velerov1api.PolicyTypeUpdate, velerov1api.PolicyTypeRecreate:
// existingResourcePolicy is set as update or recreate, attempt patch on the resource and add warning if it fails
// processing update as existingResourcePolicy
warningsFromUpdateRP, errsFromUpdateRP := ctx.processUpdateResourcePolicy(fromCluster, fromClusterWithLabels, obj, namespace, resourceClient)
if warningsFromUpdateRP.IsEmpty() && errsFromUpdateRP.IsEmpty() {
itemStatus.action = itemRestoreResultUpdated
ctx.restoredItems[itemKey] = itemStatus
}
warnings.Merge(&warningsFromUpdateRP)
errs.Merge(&errsFromUpdateRP)
// if resourcePolicy is recreate, attempt to recreate if patch had errors
if !errsFromUpdateRP.IsEmpty() && resourcePolicy == velerov1api.PolicyTypeRecreate {
ctx.log.Infof("patch attempt had errors, falling back to recreate due to recreate existingResourcePolicy for %s %s", fromCluster.GroupVersionKind().Kind, kube.NamespaceAndName(fromCluster))
warningsFromRecreateRP, errsFromRecreateRP := ctx.processRecreateResourcePolicy(fromCluster, fromClusterWithLabels, obj, namespace, resourceClient)
if warningsFromRecreateRP.IsEmpty() && errsFromRecreateRP.IsEmpty() {
itemStatus.action = itemRestoreResultUpdated
ctx.restoredItems[itemKey] = itemStatus
}
warnings.Merge(&warningsFromRecreateRP)
errs.Merge(&errsFromRecreateRP)
} else {
warnings.Merge(&warningsFromUpdateRP)
errs.Merge(&errsFromUpdateRP)
}
}
} else {
// Preserved Velero behavior when existingResourcePolicy is not specified by the user
@@ -2195,3 +2208,40 @@ func (ctx *restoreContext) processUpdateResourcePolicy(fromCluster, fromClusterW
}
return warnings, errs
}

// processRecreateResourcePolicy handles existingResourcePolicy set to recreate: it tries to delete the in-cluster object and then restore obj from the backup
func (ctx *restoreContext) processRecreateResourcePolicy(fromCluster, fromClusterWithLabels, obj *unstructured.Unstructured, namespace string, resourceClient client.Dynamic) (warnings, errs results.Result) {
ctx.log.Infof("restore API has existingResourcePolicy defined as recreate, executing restore workflow accordingly for changed resource %s %s", obj.GroupVersionKind().Kind, kube.NamespaceAndName(fromCluster))
ctx.log.Infof("attempting recreate on %s %q", fromCluster.GetKind(), fromCluster.GetName())
// try to delete the object in cluster
err := resourceClient.Delete(obj.GetName(), metav1.DeleteOptions{})
if err != nil {
if !apierrors.IsNotFound(err) {
ctx.log.Errorf("delete attempt failed for %s %s: %v", fromCluster.GroupVersionKind(), kube.NamespaceAndName(fromCluster), err)
}
}
// wait up to 2 minutes until the object no longer exists in the cluster
for timeStarted := time.Now(); !apierrors.IsNotFound(err) && time.Since(timeStarted) < 2*time.Minute; {
_, err = resourceClient.Get(obj.GetName(), metav1.GetOptions{})
if !apierrors.IsNotFound(err) {
ctx.log.Warnf("get attempt to check object is gone failed for %s %s: %v", fromCluster.GroupVersionKind(), kube.NamespaceAndName(fromCluster), err)
time.Sleep(10 * time.Second)
continue
}
break
}
// create the object from the backup
obj.SetNamespace(namespace)
_, err = resourceClient.Create(obj)
if err != nil {
ctx.log.Warnf("create attempt failed for %s %s: %v", fromCluster.GroupVersionKind(), kube.NamespaceAndName(fromCluster), err)
warnings.Add(namespace, err)
// try just patching the labels
warningsFromUpdate, errsFromUpdate := ctx.updateBackupRestoreLabels(fromCluster, fromClusterWithLabels, namespace, resourceClient)
warnings.Merge(&warningsFromUpdate)
errs.Merge(&errsFromUpdate)
} else {
ctx.log.Infof("%s %s successfully recreated", obj.GroupVersionKind().Kind, kube.NamespaceAndName(obj))
}
return warnings, errs
}
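
As an aside, the delete-then-wait step above could also be expressed with apimachinery's polling helper. This is only a sketch under the assumption that k8s.io/apimachinery's wait package could be imported here and that client.Dynamic lives at the import path shown (the Get signature matches the calls above); it is not what the commit implements:

import (
    "time"

    apierrors "k8s.io/apimachinery/pkg/api/errors"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/util/wait"

    "github.com/vmware-tanzu/velero/pkg/client"
)

// waitForDeletion polls every 10 seconds, for at most the given timeout,
// until the named object is no longer found in the cluster.
func waitForDeletion(resourceClient client.Dynamic, name string, timeout time.Duration) error {
    return wait.PollImmediate(10*time.Second, timeout, func() (bool, error) {
        _, err := resourceClient.Get(name, metav1.GetOptions{})
        if apierrors.IsNotFound(err) {
            return true, nil // object is gone; stop polling
        }
        // object still exists or Get hit a transient error; keep polling
        return false, nil
    })
}
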
8 changes: 6 additions & 2 deletions site/content/docs/main/restore-reference.md
@@ -58,7 +58,7 @@ The following is an overview of Velero's restore process that starts after you r

1. The `RestoreController` creates the resource object on the target cluster. If the resource is a PV then the `RestoreController` will restore the PV data from the [durable snapshot](#durable-snapshot-pv-restore), [File System Backup](#file-system-backup-pv-restore), or [CSI snapshot](#csi-pv-restore) depending on how the PV was backed up.

If the resource already exists in the target cluster, which is determined by the Kubernetes API during resource creation, the `RestoreController` will skip the resource. The only [exception](#restore-existing-resource-policy) are Service Accounts, which Velero will attempt to merge differences between the backed up ServiceAccount into the ServiceAccount on the target cluster. You can [change the default existing resource restore policy](#restore-existing-resource-policy) to update resources instead of skipping them using the `--existing-resource-policy`.
If the resource already exists in the target cluster, which is determined by the Kubernetes API during resource creation, the `RestoreController` will skip the resource. The only [exception](#restore-existing-resource-policy) is Service Accounts: Velero will attempt to merge the differences between the backed-up ServiceAccount and the ServiceAccount on the target cluster. You can [change the default existing resource restore policy](#restore-existing-resource-policy) to update or recreate resources instead of skipping them using the `--existing-resource-policy` flag.

1. Once the resource is created on the target cluster, Velero may take some additional steps or wait for additional processes to complete before moving onto the next resource to restore.

@@ -260,13 +260,17 @@ By default, Velero is configured to be non-destructive during a restore. This me
An exception to the default restore policy is ServiceAccounts. When restoring a ServiceAccount that already exists on the target cluster, Velero will attempt to merge the fields of the ServiceAccount from the backup into the existing ServiceAccount. Secrets and ImagePullSecrets are appended from the backed-up ServiceAccount. Velero adds any non-existing labels and annotations from the backed-up ServiceAccount to the existing resource, leaving the existing labels and annotations in place.

You can change this policy for a restore by using the `--existing-resource-policy` restore flag. The available options
are `none` (default) and `update`. If you choose to update existing resources during a restore
are `none` (default), `update`, and `recreate`.

If you choose to update existing resources during a restore
(`--existing-resource-policy=update`), Velero will attempt to update an existing resource to match the resource from the backup:

* If the existing resource in the target cluster is the same as the resource Velero is attempting to restore, Velero will add a `velero.io/backup-name` label with the backup name and a `velero.io/restore-name` label with the restore name to the existing resource. If patching the labels fails, Velero adds a restore error and continues restoring the next resource.

* If the existing resource in the target cluster is different from the backup, Velero will first try to patch the existing resource to match the backup resource. If the patch is successful, Velero will add a `velero.io/backup-name` label with the backup name and a `velero.io/restore-name` label with the restore name to the existing resource. If the patch fails, Velero adds a restore warning and tries to add the `velero.io/backup-name` and `velero.io/restore-name` labels on the resource. If the labels patch also fails, then Velero logs a restore error and continues restoring the next resource.

If you choose to recreate existing resources during a restore (`--existing-resource-policy=recreate`), Velero will first try to patch the resource. If the patch fails, Velero will try to delete the existing resource and then create it from the backup. If the recreate fails, Velero will log a restore error and continue restoring the next resource.

You can also configure the existing resource policy in a [Restore](api-types/restore.md) object.

**NOTE:** Update of a resource only applies to the Kubernetes resource data such as its spec. It may not work as expected for certain resource types such as PVCs and Pods. In case of PVCs for example, data in the PV is not restored or overwritten in any way.
