From b5ff9fe07cc4c7cad2894ea071dc5e3514119fde Mon Sep 17 00:00:00 2001
From: Alex Leites <18728999+tallaxes@users.noreply.github.com>
Date: Tue, 14 Jan 2025 18:09:43 +0000
Subject: [PATCH] test(e2e): update drift suite

---
 test/suites/drift/suite_test.go | 686 +++++++++++++++++++++++++++++---
 1 file changed, 640 insertions(+), 46 deletions(-)

diff --git a/test/suites/drift/suite_test.go b/test/suites/drift/suite_test.go
index 4d7fce780..b008dea07 100644
--- a/test/suites/drift/suite_test.go
+++ b/test/suites/drift/suite_test.go
@@ -14,26 +14,35 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package drift
+package drift_test
 
 import (
 	"fmt"
+	"sort"
 	"testing"
 	"time"
 
+	"github.com/awslabs/operatorpkg/object"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 	"github.com/samber/lo"
+	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/util/intstr"
+	"k8s.io/apimachinery/pkg/util/sets"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1"
+	coretest "sigs.k8s.io/karpenter/pkg/test"
 
+	"github.com/Azure/go-autorest/autorest/to"
 	"github.com/Azure/karpenter-provider-azure/pkg/apis/v1alpha2"
+	"github.com/Azure/karpenter-provider-azure/pkg/test"
 	"github.com/Azure/karpenter-provider-azure/test/pkg/environment/azure"
-
-	karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1"
-
-	coretest "sigs.k8s.io/karpenter/pkg/test"
+	"github.com/Azure/karpenter-provider-azure/test/pkg/environment/common"
 )
 
 var env *azure.Environment
@@ -45,6 +54,9 @@ func TestDrift(t *testing.T) {
 	BeforeSuite(func() {
 		env = azure.NewEnvironment(t)
 	})
+	AfterSuite(func() {
+		env.Stop()
+	})
 	RunSpecs(t, "Drift")
 }
 
@@ -57,71 +69,653 @@ var _ = AfterEach(func() { env.Cleanup() })
 var _ = AfterEach(func() { env.AfterEach() })
 
 var _ = Describe("Drift", func() {
-
-	var pod *corev1.Pod
-
+	var dep *appsv1.Deployment
+	var selector labels.Selector
+	var numPods int
 	BeforeEach(func() {
-		coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{
-			NodeSelectorRequirement: corev1.NodeSelectorRequirement{
-				Key:      corev1.LabelInstanceTypeStable,
-				Operator: corev1.NodeSelectorOpIn,
-				Values:   []string{"Standard_DS2_v2"},
-			}})
-
-		// Add a do-not-disrupt pod so that we can check node metadata before we disrupt
-		pod = coretest.Pod(coretest.PodOptions{
-			ObjectMeta: metav1.ObjectMeta{
-				Annotations: map[string]string{
-					karpv1.DoNotDisruptAnnotationKey: "true",
+		numPods = 1
+		// Add pods with a do-not-disrupt annotation so that we can check node metadata before we disrupt
+		dep = coretest.Deployment(coretest.DeploymentOptions{
+			Replicas: int32(numPods),
+			PodOptions: coretest.PodOptions{
+				ObjectMeta: metav1.ObjectMeta{
+					Labels: map[string]string{
+						"app": "my-app",
+					},
+					Annotations: map[string]string{
+						karpv1.DoNotDisruptAnnotationKey: "true",
+					},
 				},
+				TerminationGracePeriodSeconds: lo.ToPtr[int64](0),
 			},
-			ResourceRequirements: corev1.ResourceRequirements{Requests: corev1.ResourceList{corev1.ResourceCPU: resource.MustParse("0.5")}},
-			Image:                "mcr.microsoft.com/oss/kubernetes/pause:3.6",
 		})
+		selector = labels.SelectorFromSet(dep.Spec.Selector.MatchLabels)
 	})
+	Context("Budgets", func() {
+		It("should respect budgets for empty drift", func() {
+			nodePool = coretest.ReplaceRequirements(nodePool,
+				karpv1.NodeSelectorRequirementWithMinValues{
+					NodeSelectorRequirement: corev1.NodeSelectorRequirement{
+						Key:      corev1.LabelInstanceTypeStable,
+						Operator: corev1.NodeSelectorOpIn,
+						Values:   []string{"Standard_DS4_v2"},
+					},
+				},
+			)
+			// We're expecting to create 3 nodes, so we'll expect to see at most 2 nodes deleting at one time.
+			nodePool.Spec.Disruption.Budgets = []karpv1.Budget{{
+				Nodes: "50%",
+			}}
+			var numPods int32 = 6
+			dep = coretest.Deployment(coretest.DeploymentOptions{
+				Replicas: numPods,
+				PodOptions: coretest.PodOptions{
+					ObjectMeta: metav1.ObjectMeta{
+						Annotations: map[string]string{
+							karpv1.DoNotDisruptAnnotationKey: "true",
+						},
+						Labels: map[string]string{"app": "large-app"},
+					},
+					// Each Standard_DS4_v2 has 8 cpu, so each node should fit 2 pods.
+					ResourceRequirements: corev1.ResourceRequirements{
+						Requests: corev1.ResourceList{
+							corev1.ResourceCPU: resource.MustParse("3"),
+						},
+					},
+				},
+			})
+			selector = labels.SelectorFromSet(dep.Spec.Selector.MatchLabels)
+			env.ExpectCreated(nodeClass, nodePool, dep)
 
-	// TODO: Add budget tests
+			nodeClaims := env.EventuallyExpectCreatedNodeClaimCount("==", 3)
+			nodes := env.EventuallyExpectCreatedNodeCount("==", 3)
+			env.EventuallyExpectHealthyPodCount(selector, int(numPods))
 
-	It("should deprovision nodes that have drifted due to labels", func() {
+			// List the nodes so that we have their latest state; otherwise we risk
+			// overwriting any changes Karpenter has made to them.
+			// Add a finalizer to each node so that we can stop termination disruptions
+			By("adding finalizers to the nodes to prevent termination")
+			for _, node := range nodes {
+				Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(node), node)).To(Succeed())
+				node.Finalizers = append(node.Finalizers, common.TestingFinalizer)
+				env.ExpectUpdated(node)
+			}
 
-		By(fmt.Sprintf("creating pod %s, nodepool %s, and nodeclass %s", pod.Name, nodePool.Name, nodeClass.Name))
-		env.ExpectCreated(pod, nodeClass, nodePool)
+			By("making the nodes empty")
+			// Delete the deployment to make all nodes empty.
+			env.ExpectDeleted(dep)
 
-		By(fmt.Sprintf("expect pod %s to be healthy", pod.Name))
-		env.EventuallyExpectHealthy(pod)
+			// Drift the nodeclaims
+			By("drift the nodeclaims")
+			nodePool.Spec.Template.Annotations = map[string]string{"test": "annotation"}
+			env.ExpectUpdated(nodePool)
 
-		By("expect created node count to be 1")
-		env.ExpectCreatedNodeCount("==", 1)
+			env.EventuallyExpectDrifted(nodeClaims...)
+
+			env.ConsistentlyExpectDisruptionsUntilNoneLeft(3, 2, 5*time.Minute)
+		})
+		It("should respect budgets for non-empty delete drift", func() {
+			nodePool = coretest.ReplaceRequirements(nodePool,
+				karpv1.NodeSelectorRequirementWithMinValues{
+					NodeSelectorRequirement: corev1.NodeSelectorRequirement{
+						Key:      corev1.LabelInstanceTypeStable,
+						Operator: corev1.NodeSelectorOpIn,
+						Values:   []string{"Standard_DS4_v2"},
+					},
+				},
+			)
+			// We're expecting to create 3 nodes, so we'll expect to see at most 2 nodes deleting at one time.
+			nodePool.Spec.Disruption.Budgets = []karpv1.Budget{{
+				Nodes: "50%",
+			}}
+			var numPods int32 = 9
+			dep = coretest.Deployment(coretest.DeploymentOptions{
+				Replicas: numPods,
+				PodOptions: coretest.PodOptions{
+					ObjectMeta: metav1.ObjectMeta{
+						Annotations: map[string]string{
+							karpv1.DoNotDisruptAnnotationKey: "true",
+						},
+						Labels: map[string]string{"app": "large-app"},
+					},
+					// Each Standard_DS4_v2 has 8 cpu, so each node should fit no more than 3 pods.
+					ResourceRequirements: corev1.ResourceRequirements{
+						Requests: corev1.ResourceList{
+							corev1.ResourceCPU: resource.MustParse("2100m"),
+						},
+					},
+				},
+			})
+			selector = labels.SelectorFromSet(dep.Spec.Selector.MatchLabels)
+			env.ExpectCreated(nodeClass, nodePool, dep)
+
+			nodeClaims := env.EventuallyExpectCreatedNodeClaimCount("==", 3)
+			nodes := env.EventuallyExpectCreatedNodeCount("==", 3)
+			env.EventuallyExpectHealthyPodCount(selector, int(numPods))
+
+			By("scaling down the deployment")
+			// Update the deployment to a third of the replicas.
+			dep.Spec.Replicas = lo.ToPtr[int32](3)
+			env.ExpectUpdated(dep)
+
+			// First expect there to be 3 pods, then try to spread the pods.
+			env.EventuallyExpectHealthyPodCount(selector, 3)
+			env.ForcePodsToSpread(nodes...)
+			env.EventuallyExpectHealthyPodCount(selector, 3)
+
+			By("cordoning and adding finalizer to the nodes")
+			// Add a finalizer to each node so that we can stop termination disruptions
+			for _, node := range nodes {
+				Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(node), node)).To(Succeed())
+				node.Finalizers = append(node.Finalizers, common.TestingFinalizer)
+				env.ExpectUpdated(node)
+			}
+
+			By("drifting the nodes")
+			// Drift the nodeclaims
+			nodePool.Spec.Template.Annotations = map[string]string{"test": "annotation"}
+			env.ExpectUpdated(nodePool)
+
+			env.EventuallyExpectDrifted(nodeClaims...)
+
+			By("enabling disruption by removing the do not disrupt annotation")
+			pods := env.EventuallyExpectHealthyPodCount(selector, 3)
+			// Remove the do-not-disrupt annotation so that the nodes are now disruptable
+			for _, pod := range pods {
+				delete(pod.Annotations, karpv1.DoNotDisruptAnnotationKey)
+				env.ExpectUpdated(pod)
+			}
+
+			env.ConsistentlyExpectDisruptionsUntilNoneLeft(3, 2, 5*time.Minute)
+		})
+		It("should respect budgets for non-empty replace drift", func() {
+			appLabels := map[string]string{"app": "large-app"}
+			nodePool.Labels = appLabels
+			// We're expecting to create 5 nodes, so we'll expect to see at most 3 nodes deleting at one time.
+			nodePool.Spec.Disruption.Budgets = []karpv1.Budget{{
+				Nodes: "3",
+			}}
+
+			// Create a 5 pod deployment with hostname inter-pod anti-affinity to ensure each pod is placed on a unique node
+			numPods = 5
+			selector = labels.SelectorFromSet(appLabels)
+			deployment := coretest.Deployment(coretest.DeploymentOptions{
+				Replicas: int32(numPods),
+				PodOptions: coretest.PodOptions{
+					ObjectMeta: metav1.ObjectMeta{
+						Labels: appLabels,
+					},
+					PodAntiRequirements: []corev1.PodAffinityTerm{{
+						TopologyKey: corev1.LabelHostname,
+						LabelSelector: &metav1.LabelSelector{
+							MatchLabels: appLabels,
+						},
+					}},
+				},
+			})
+
+			env.ExpectCreated(nodeClass, nodePool, deployment)
+
+			originalNodeClaims := env.EventuallyExpectCreatedNodeClaimCount("==", numPods)
+			originalNodes := env.EventuallyExpectCreatedNodeCount("==", numPods)
+
+			// Check that all deployment pods are online
+			env.EventuallyExpectHealthyPodCount(selector, numPods)
 
+			By("cordoning and adding finalizer to the nodes")
+			// Add a finalizer to each node so that we can stop termination disruptions
+			for _, node := range originalNodes {
+				Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(node), node)).To(Succeed())
+				node.Finalizers = append(node.Finalizers, common.TestingFinalizer)
+				env.ExpectUpdated(node)
+			}
+
+			By("drifting the nodepool")
+			nodePool.Spec.Template.Annotations = lo.Assign(nodePool.Spec.Template.Annotations, map[string]string{"test-annotation": "drift"})
+			env.ExpectUpdated(nodePool)
+			env.ConsistentlyExpectDisruptionsUntilNoneLeft(5, 3, 10*time.Minute)
+
+			for _, node := range originalNodes {
+				Expect(env.ExpectTestingFinalizerRemoved(node)).To(Succeed())
+			}
+			for _, nodeClaim := range originalNodeClaims {
+				Expect(env.ExpectTestingFinalizerRemoved(nodeClaim)).To(Succeed())
+			}
+
+			// Eventually expect all the nodes to be rolled and completely removed
+			// Since this completes the disruption operation, it also ensures that we aren't leaking nodes into subsequent
+			// tests, since nodeclaims that are actively replacing but haven't brought up nodes yet can register them later
+			env.EventuallyExpectNotFound(lo.Map(originalNodes, func(n *corev1.Node, _ int) client.Object { return n })...)
+			env.EventuallyExpectNotFound(lo.Map(originalNodeClaims, func(n *karpv1.NodeClaim, _ int) client.Object { return n })...)
+			env.ExpectNodeClaimCount("==", 5)
+			env.ExpectNodeCount("==", 5)
+		})
+		It("should not allow drift if the budget is fully blocking", func() {
+			// We're going to define a budget that doesn't allow any drift to happen
+			nodePool.Spec.Disruption.Budgets = []karpv1.Budget{{
+				Nodes: "0",
+			}}
+
+			dep.Spec.Template.Annotations = nil
+			env.ExpectCreated(nodeClass, nodePool, dep)
+
+			nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0]
+			env.EventuallyExpectCreatedNodeCount("==", 1)
+			env.EventuallyExpectHealthyPodCount(selector, numPods)
+
+			By("drifting the nodes")
+			// Drift the nodeclaims
+			nodePool.Spec.Template.Annotations = map[string]string{"test": "annotation"}
+			env.ExpectUpdated(nodePool)
+
+			env.EventuallyExpectDrifted(nodeClaim)
+			env.ConsistentlyExpectNoDisruptions(1, time.Minute)
+		})
+		It("should not allow drift if the budget is fully blocking during a scheduled time", func() {
+			// We're going to define a budget that doesn't allow any drift to happen
+			// This is going to be on a schedule that only lasts 30 minutes, whose window starts 15 minutes before
+			// the current time and extends 15 minutes past the current time
+			// Times need to be in UTC since Karpenter evaluates budget schedules in UTC
+			windowStart := time.Now().Add(-time.Minute * 15).UTC()
+			nodePool.Spec.Disruption.Budgets = []karpv1.Budget{{
+				Nodes:    "0",
+				Schedule: lo.ToPtr(fmt.Sprintf("%d %d * * *", windowStart.Minute(), windowStart.Hour())),
+				Duration: &metav1.Duration{Duration: time.Minute * 30},
+			}}
+
+			dep.Spec.Template.Annotations = nil
+			env.ExpectCreated(nodeClass, nodePool, dep)
+
+			nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0]
+			env.EventuallyExpectCreatedNodeCount("==", 1)
+			env.EventuallyExpectHealthyPodCount(selector, numPods)
+
+			By("drifting the nodes")
+			// Drift the nodeclaims
+			nodePool.Spec.Template.Annotations = map[string]string{"test": "annotation"}
+			env.ExpectUpdated(nodePool)
+
+			env.EventuallyExpectDrifted(nodeClaim)
+			env.ConsistentlyExpectNoDisruptions(1, time.Minute)
+		})
+	})
+
+	It("should disrupt nodes that have drifted due to images", func() {
+		// TODO
+	})
+
+	DescribeTable("NodePool Drift", func(nodeClaimTemplate karpv1.NodeClaimTemplate) {
+		updatedNodePool := coretest.NodePool(
+			karpv1.NodePool{
+				Spec: karpv1.NodePoolSpec{
+					Template: karpv1.NodeClaimTemplate{
+						Spec: karpv1.NodeClaimTemplateSpec{
+							NodeClassRef: &karpv1.NodeClassReference{
+								Group: object.GVK(nodeClass).Group,
+								Kind:  object.GVK(nodeClass).Kind,
+								Name:  nodeClass.Name,
+							},
+							// keep the same instance type requirements to prevent considering instance types that require swap
+							Requirements: nodePool.Spec.Template.Spec.Requirements,
+						},
+					},
+				},
+			},
+			karpv1.NodePool{
+				Spec: karpv1.NodePoolSpec{
+					Template: nodeClaimTemplate,
+				},
+			},
+		)
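+		// Reuse the original ObjectMeta so the update below applies to the existing nodepool rather than creating a new one.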
+		updatedNodePool.ObjectMeta = nodePool.ObjectMeta
+
+		env.ExpectCreated(dep, nodeClass, nodePool)
+		pod := env.EventuallyExpectHealthyPodCount(selector, numPods)[0]
 		nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0]
-		node := env.EventuallyExpectNodeCount("==", 1)[0]
+		node := env.ExpectCreatedNodeCount("==", 1)[0]
 
-		By(fmt.Sprintf("waiting for nodepool %s update", nodePool.Name))
-		nodePool.Spec.Template.Labels["triggerdrift"] = "value"
-		env.ExpectCreatedOrUpdated(nodePool)
+		env.ExpectCreatedOrUpdated(updatedNodePool)
 
-		By(fmt.Sprintf("waiting for nodeclaim %s to be marked as drifted", nodeClaim.Name))
 		env.EventuallyExpectDrifted(nodeClaim)
 
-		By(fmt.Sprintf("waiting for pod %s to to update", pod.Name))
 		delete(pod.Annotations, karpv1.DoNotDisruptAnnotationKey)
 		env.ExpectUpdated(pod)
 
-		By(fmt.Sprintf("expect pod %s, nodeclaim %s, and node %s to eventually not exist", pod.Name, nodeClaim.Name, node.Name))
-		SetDefaultEventuallyTimeout(10 * time.Minute)
-		env.EventuallyExpectNotFound(pod, nodeClaim, node)
-		SetDefaultEventuallyTimeout(5 * time.Minute)
+		// Nodes will need to have the startup taint removed before they can be considered initialized
+		if CurrentSpecReport().LeafNodeText == "Start-up Taints" {
+			nodes := env.EventuallyExpectCreatedNodeCount("==", 2)
+			sort.Slice(nodes, func(i int, j int) bool {
+				return nodes[i].CreationTimestamp.Before(&nodes[j].CreationTimestamp)
+			})
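+			// The newer of the two nodes is the drift replacement and still carries the startup taint.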
+			nodeTwo := nodes[1]
+			// Remove the startup taint from the new node so that it can become initialized
+			Eventually(func(g Gomega) {
+				g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeTwo), nodeTwo)).To(Succeed())
+				g.Expect(len(nodeTwo.Spec.Taints)).To(BeNumerically("==", 1))
+				_, found := lo.Find(nodeTwo.Spec.Taints, func(t corev1.Taint) bool {
+					return t.MatchTaint(&corev1.Taint{Key: "example.com/another-taint-2", Effect: corev1.TaintEffectPreferNoSchedule})
+				})
+				g.Expect(found).To(BeTrue())
+				stored := nodeTwo.DeepCopy()
+				nodeTwo.Spec.Taints = lo.Reject(nodeTwo.Spec.Taints, func(t corev1.Taint, _ int) bool { return t.Key == "example.com/another-taint-2" })
+				g.Expect(env.Client.Patch(env.Context, nodeTwo, client.StrategicMergeFrom(stored))).To(Succeed())
+			}).Should(Succeed())
+		}
+		env.EventuallyExpectNotFound(pod, node)
+		env.EventuallyExpectHealthyPodCount(selector, numPods)
+	},
+		Entry("Annotations", karpv1.NodeClaimTemplate{
+			ObjectMeta: karpv1.ObjectMeta{
+				Annotations: map[string]string{"keyAnnotationTest": "valueAnnotationTest"},
+			},
+		}),
+		Entry("Labels", karpv1.NodeClaimTemplate{
+			ObjectMeta: karpv1.ObjectMeta{
+				Labels: map[string]string{"keyLabelTest": "valueLabelTest"},
+			},
+		}),
+		Entry("Taints", karpv1.NodeClaimTemplate{
+			Spec: karpv1.NodeClaimTemplateSpec{
+				Taints: []corev1.Taint{{Key: "example.com/another-taint-2", Effect: corev1.TaintEffectPreferNoSchedule}},
+			},
+		}),
+		Entry("Start-up Taints", karpv1.NodeClaimTemplate{
+			Spec: karpv1.NodeClaimTemplateSpec{
+				StartupTaints: []corev1.Taint{{Key: "example.com/another-taint-2", Effect: corev1.TaintEffectPreferNoSchedule}},
+			},
+		}),
+		Entry("NodeRequirements", karpv1.NodeClaimTemplate{
+			Spec: karpv1.NodeClaimTemplateSpec{
+				// since this will overwrite the default requirements, specify the capacity type and instance type explicitly
+				Requirements: []karpv1.NodeSelectorRequirementWithMinValues{
+					{NodeSelectorRequirement: corev1.NodeSelectorRequirement{Key: karpv1.CapacityTypeLabelKey, Operator: corev1.NodeSelectorOpIn, Values: []string{karpv1.CapacityTypeSpot}}},
+					{NodeSelectorRequirement: corev1.NodeSelectorRequirement{Key: corev1.LabelInstanceTypeStable, Operator: corev1.NodeSelectorOpIn, Values: []string{"Standard_DS4_v2"}}},
+				},
+			},
+		}),
+	)
+	FDescribeTable("AKSNodeClass", func(nodeClassSpec v1alpha2.AKSNodeClassSpec) {
+		updatedNodeClass := test.AKSNodeClass(v1alpha2.AKSNodeClass{Spec: *nodeClass.Spec.DeepCopy()}, v1alpha2.AKSNodeClass{Spec: nodeClassSpec})
+		updatedNodeClass.ObjectMeta = nodeClass.ObjectMeta
+
+		env.ExpectCreated(dep, nodeClass, nodePool)
+		pod := env.EventuallyExpectHealthyPodCount(selector, numPods)[0]
+		nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0]
+		node := env.ExpectCreatedNodeCount("==", 1)[0]
+
+		env.ExpectCreatedOrUpdated(updatedNodeClass)
+
+		env.EventuallyExpectDrifted(nodeClaim)
+
+		delete(pod.Annotations, karpv1.DoNotDisruptAnnotationKey)
+		env.ExpectUpdated(pod)
+		env.EventuallyExpectNotFound(pod, node)
+		env.EventuallyExpectHealthyPodCount(selector, numPods)
+	},
+		// VNETSubnetID tested separately
+		Entry("OSDiskSizeGB", v1alpha2.AKSNodeClassSpec{OSDiskSizeGB: to.Int32Ptr(100)}),
+		// ImageID TBD
+		Entry("ImageFamily", v1alpha2.AKSNodeClassSpec{ImageFamily: to.StringPtr("AzureLinux")}),
+		Entry("Tags", v1alpha2.AKSNodeClassSpec{Tags: map[string]string{"keyTag-test-3": "valueTag-test-3"}}),
+		Entry("KubeletConfiguration", v1alpha2.AKSNodeClassSpec{
+			Kubelet: &v1alpha2.KubeletConfiguration{
+				ImageGCLowThresholdPercent:  to.Int32Ptr(10),
+				ImageGCHighThresholdPercent: to.Int32Ptr(90),
+			},
+		}),
+		Entry("MaxPods", v1alpha2.AKSNodeClassSpec{MaxPods: to.Int32Ptr(10)}),
+	)
+
+	It("should update the nodepool-hash annotation on the nodepool and nodeclaim when the nodepool's nodepool-hash-version annotation does not match the controller hash version", func() {
+		env.ExpectCreated(dep, nodeClass, nodePool)
+		env.EventuallyExpectHealthyPodCount(selector, numPods)
+		nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0]
+		nodePool = env.ExpectExists(nodePool).(*karpv1.NodePool)
+		expectedHash := nodePool.Hash()
+
+		By(fmt.Sprintf("expect nodepool %s and nodeclaim %s to contain %s and %s annotations", nodePool.Name, nodeClaim.Name, karpv1.NodePoolHashAnnotationKey, karpv1.NodePoolHashVersionAnnotationKey))
+		Eventually(func(g Gomega) {
+			g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodePool), nodePool)).To(Succeed())
+			g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed())
+
+			g.Expect(nodePool.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashAnnotationKey, expectedHash))
+			g.Expect(nodePool.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashVersionAnnotationKey, karpv1.NodePoolHashVersion))
+			g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashAnnotationKey, expectedHash))
+			g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashVersionAnnotationKey, karpv1.NodePoolHashVersion))
+		}).WithTimeout(30 * time.Second).Should(Succeed())
+
+		nodePool.Annotations = lo.Assign(nodePool.Annotations, map[string]string{
+			karpv1.NodePoolHashAnnotationKey:        "test-hash-1",
+			karpv1.NodePoolHashVersionAnnotationKey: "test-hash-version-1",
+		})
+		// Updating `nodePool.Spec.Template.Annotations` would normally trigger drift on all nodeclaims owned by the
+		// nodepool. However, the nodepool-hash-version does not match the controller hash version, so we will see that
+		// none of the nodeclaims will be drifted and all nodeclaims will have an updated `nodepool-hash` and `nodepool-hash-version` annotation
+		nodePool.Spec.Template.Annotations = lo.Assign(nodePool.Spec.Template.Annotations, map[string]string{
+			"test-key": "test-value",
+		})
+		nodeClaim.Annotations = lo.Assign(nodePool.Annotations, map[string]string{
+			karpv1.NodePoolHashAnnotationKey:        "test-hash-2",
+			karpv1.NodePoolHashVersionAnnotationKey: "test-hash-version-2",
+		})
+
+		// The nodeclaim will need to be updated first, as the hash controller will only be triggered on changes to the nodepool
+		env.ExpectUpdated(nodeClaim, nodePool)
+		expectedHash = nodePool.Hash()
+
+		// Expect all nodeclaims not to be drifted and contain an updated `nodepool-hash` and `nodepool-hash-version` annotation
+		Eventually(func(g Gomega) {
+			g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodePool), nodePool)).To(Succeed())
+			g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed())
+
+			g.Expect(nodePool.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashAnnotationKey, expectedHash))
+			g.Expect(nodePool.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashVersionAnnotationKey, karpv1.NodePoolHashVersion))
+			g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashAnnotationKey, expectedHash))
+			g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashVersionAnnotationKey, karpv1.NodePoolHashVersion))
+		}).WithTimeout(30 * time.Second).Should(Succeed())
 	})
-	It("should mark the nodeclaim as drifted for SubnetDrift if AKSNodeClass subnet id changes", func() {
-		env.ExpectCreated(pod, nodeClass, nodePool)
+	It("should update the aksnodeclass-hash annotation on the aksnodeclass and nodeclaim when the aksnodeclass's aksnodeclass-hash-version annotation does not match the controller hash version", func() {
+		env.ExpectCreated(dep, nodeClass, nodePool)
+		env.EventuallyExpectHealthyPodCount(selector, numPods)
+		nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0]
+		nodeClass = env.ExpectExists(nodeClass).(*v1alpha2.AKSNodeClass)
+		expectedHash := nodeClass.Hash()
 
-		By(fmt.Sprintf("expect pod %s to be healthy", pod.Name))
-		env.EventuallyExpectHealthy(pod)
+		By(fmt.Sprintf("expect nodeclass %s and nodeclaim %s to contain %s and %s annotations", nodeClass.Name, nodeClaim.Name, v1alpha2.AnnotationAKSNodeClassHash, v1alpha2.AnnotationAKSNodeClassHashVersion))
+		Eventually(func(g Gomega) {
+			g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClass), nodeClass)).To(Succeed())
+			g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed())
 
-		By("expect created node count to be 1")
-		env.ExpectCreatedNodeCount("==", 1)
+			g.Expect(nodeClass.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHash, expectedHash))
+			g.Expect(nodeClass.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHashVersion, v1alpha2.AKSNodeClassHashVersion))
+			g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHash, expectedHash))
+			g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHashVersion, v1alpha2.AKSNodeClassHashVersion))
+		}).WithTimeout(30 * time.Second).Should(Succeed())
+
+		nodeClass.Annotations = lo.Assign(nodeClass.Annotations, map[string]string{
+			v1alpha2.AnnotationAKSNodeClassHash:        "test-hash-1",
+			v1alpha2.AnnotationAKSNodeClassHashVersion: "test-hash-version-1",
+		})
+		// Updating `nodeClass.Spec.Tags` would normally trigger drift on all nodeclaims using the
+		// nodeclass. However, the aksnodeclass-hash-version does not match the controller hash version, so we will see that
+		// none of the nodeclaims will be drifted and all nodeclaims will have an updated `aksnodeclass-hash` and `aksnodeclass-hash-version` annotation
+		nodeClass.Spec.Tags = lo.Assign(nodeClass.Spec.Tags, map[string]string{
+			"test-key": "test-value",
+		})
+		nodeClaim.Annotations = lo.Assign(nodePool.Annotations, map[string]string{
+			v1alpha2.AnnotationAKSNodeClassHash:        "test-hash-2",
+			v1alpha2.AnnotationAKSNodeClassHashVersion: "test-hash-version-2",
+		})
+
+		// The nodeclaim will need to be updated first, as the hash controller will only be triggered on changes to the nodeclass
+		env.ExpectUpdated(nodeClaim, nodeClass)
+		expectedHash = nodeClass.Hash()
+
+		// Expect all nodeclaims not to be drifted and contain an updated `aksnodeclass-hash` and `aksnodeclass-hash-version` annotation
+		Eventually(func(g Gomega) {
+			g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClass), nodeClass)).To(Succeed())
+			g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed())
 
+			g.Expect(nodeClass.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHash, expectedHash))
+			g.Expect(nodeClass.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHashVersion, v1alpha2.AKSNodeClassHashVersion))
+			g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHash, expectedHash))
+			g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHashVersion, v1alpha2.AKSNodeClassHashVersion))
+		}).WithTimeout(30 * time.Second).Should(Succeed())
+		env.ConsistentlyExpectNodeClaimsNotDrifted(time.Minute, nodeClaim)
+	})
+	Context("Failure", func() {
+		It("should not disrupt a drifted node if the replacement node never registers", func() {
+			// launch a new nodeClaim
+			var numPods int32 = 2
+			dep = coretest.Deployment(coretest.DeploymentOptions{
+				Replicas: 2,
+				PodOptions: coretest.PodOptions{
+					ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "inflate"}},
+					PodAntiRequirements: []corev1.PodAffinityTerm{{
+						TopologyKey: corev1.LabelHostname,
+						LabelSelector: &metav1.LabelSelector{
+							MatchLabels: map[string]string{"app": "inflate"},
+						}},
+					},
+				},
+			})
+			env.ExpectCreated(dep, nodeClass, nodePool)
+
+			startingNodeClaimState := env.EventuallyExpectCreatedNodeClaimCount("==", int(numPods))
+			env.EventuallyExpectCreatedNodeCount("==", int(numPods))
+
+			// Drift the nodeClaim with bad configuration that will not register a NodeClaim
+			nodeClass.Spec.VNETSubnetID = lo.ToPtr("/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/sillygeese/providers/Microsoft.Network/virtualNetworks/karpenter/subnets/nodeclassSubnet2")
+			env.ExpectCreatedOrUpdated(nodeClass)
+
+			env.EventuallyExpectDrifted(startingNodeClaimState...)
+
+			// Expect only a single node to be tainted due to default disruption budgets
+			taintedNodes := env.EventuallyExpectTaintedNodeCount("==", 1)
+
+			// Drift should fail and the original node should be untainted
+			// TODO: reduce timeouts when disruption waits are factored out
+			env.EventuallyExpectNodesUntaintedWithTimeout(11*time.Minute, taintedNodes...)
+
+			// Expect that all the NodeClaims that existed on the initial provisioning loop are not removed.
+			// Assert this over several minutes to ensure a subsequent disruption controller pass doesn't
+			// successfully schedule the evicted pods to the in-flight nodeclaim and disrupt the original node
+			Consistently(func(g Gomega) {
+				nodeClaims := &karpv1.NodeClaimList{}
+				g.Expect(env.Client.List(env, nodeClaims, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed())
+				startingNodeClaimUIDs := sets.New(lo.Map(startingNodeClaimState, func(nc *karpv1.NodeClaim, _ int) types.UID { return nc.UID })...)
+				nodeClaimUIDs := sets.New(lo.Map(nodeClaims.Items, func(nc karpv1.NodeClaim, _ int) types.UID { return nc.UID })...)
+				g.Expect(nodeClaimUIDs.IsSuperset(startingNodeClaimUIDs)).To(BeTrue())
+			}, "2m").Should(Succeed())
+
+		})
+		It("should not disrupt a drifted node if the replacement node registers but never initialized", func() {
+			// launch a new nodeClaim
+			var numPods int32 = 2
+			dep = coretest.Deployment(coretest.DeploymentOptions{
+				Replicas: 2,
+				PodOptions: coretest.PodOptions{
+					ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "inflate"}},
+					PodAntiRequirements: []corev1.PodAffinityTerm{{
+						TopologyKey: corev1.LabelHostname,
+						LabelSelector: &metav1.LabelSelector{
+							MatchLabels: map[string]string{"app": "inflate"},
+						}},
+					},
+				},
+			})
+			env.ExpectCreated(dep, nodeClass, nodePool)
+
+			startingNodeClaimState := env.EventuallyExpectCreatedNodeClaimCount("==", int(numPods))
+			env.EventuallyExpectCreatedNodeCount("==", int(numPods))
+
+			// Drift the nodeClaim with bad configuration that never initializes
+			nodePool.Spec.Template.Spec.StartupTaints = []corev1.Taint{{Key: "example.com/taint", Effect: corev1.TaintEffectPreferNoSchedule}}
+			env.ExpectCreatedOrUpdated(nodePool)
+
+			env.EventuallyExpectDrifted(startingNodeClaimState...)
+
+			// Expect only a single node to get tainted due to default disruption budgets
+			taintedNodes := env.EventuallyExpectTaintedNodeCount("==", 1)
+
+			// Drift should fail and the original node should be untainted
+			// TODO: reduce timeouts when disruption waits are factored out
+			env.EventuallyExpectNodesUntaintedWithTimeout(11*time.Minute, taintedNodes...)
+
+			// Expect that the new nodeClaim/node is kept around after the un-cordon
+			nodeList := &corev1.NodeList{}
+			Expect(env.Client.List(env, nodeList, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed())
+			Expect(nodeList.Items).To(HaveLen(int(numPods) + 1))
+
+			nodeClaimList := &karpv1.NodeClaimList{}
+			Expect(env.Client.List(env, nodeClaimList, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed())
+			Expect(nodeClaimList.Items).To(HaveLen(int(numPods) + 1))
+
+			// Expect that all the NodeClaims that existed on the initial provisioning loop are not removed
+			// Assert this over several minutes to ensure a subsequent disruption controller pass doesn't
+			// successfully schedule the evicted pods to the in-flight nodeclaim and disrupt the original node
+			Consistently(func(g Gomega) {
+				nodeClaims := &karpv1.NodeClaimList{}
+				g.Expect(env.Client.List(env, nodeClaims, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed())
+				startingNodeClaimUIDs := sets.New(lo.Map(startingNodeClaimState, func(m *karpv1.NodeClaim, _ int) types.UID { return m.UID })...)
+				nodeClaimUIDs := sets.New(lo.Map(nodeClaims.Items, func(m karpv1.NodeClaim, _ int) types.UID { return m.UID })...)
+				g.Expect(nodeClaimUIDs.IsSuperset(startingNodeClaimUIDs)).To(BeTrue())
+			}, "2m").Should(Succeed())
+		})
+		It("should not drift any nodes if their PodDisruptionBudgets are unhealthy", func() {
+			// Create a deployment that contains a readiness probe that will never succeed
+			// This way, the pod will bind to the node, but the PodDisruptionBudget will never go healthy
+			var numPods int32 = 2
+			dep = coretest.Deployment(coretest.DeploymentOptions{
+				Replicas: 2,
+				PodOptions: coretest.PodOptions{
+					ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "inflate"}},
+					PodAntiRequirements: []corev1.PodAffinityTerm{{
+						TopologyKey: corev1.LabelHostname,
+						LabelSelector: &metav1.LabelSelector{
+							MatchLabels: map[string]string{"app": "inflate"},
+						}},
+					},
+					ReadinessProbe: &corev1.Probe{
+						ProbeHandler: corev1.ProbeHandler{
+							HTTPGet: &corev1.HTTPGetAction{
+								Port: intstr.FromInt32(80),
+							},
+						},
+					},
+				},
+			})
+			selector = labels.SelectorFromSet(dep.Spec.Selector.MatchLabels)
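+			// With minAvailable set to numPods-1, the PDB tolerates only one unavailable pod; since the pods
+			// never pass their readiness probe, the PDB never becomes healthy and blocks any voluntary disruption.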
+			minAvailable := intstr.FromInt32(numPods - 1)
+			pdb := coretest.PodDisruptionBudget(coretest.PDBOptions{
+				Labels:       dep.Spec.Template.Labels,
+				MinAvailable: &minAvailable,
+			})
+			env.ExpectCreated(dep, nodeClass, nodePool, pdb)
+
+			nodeClaims := env.EventuallyExpectCreatedNodeClaimCount("==", int(numPods))
+			env.EventuallyExpectCreatedNodeCount("==", int(numPods))
+
+			// Expect pods to be bound but not to be ready since we are intentionally failing the readiness check
+			env.EventuallyExpectBoundPodCount(selector, int(numPods))
+
+			// Drift the nodeclaims
+			nodePool.Spec.Template.Annotations = map[string]string{"test": "annotation"}
+			env.ExpectUpdated(nodePool)
+
+			env.EventuallyExpectDrifted(nodeClaims...)
+			env.ConsistentlyExpectNoDisruptions(int(numPods), time.Minute)
+		})
+	})
+
+	It("should disrupt nodes that have drifted due to VNETSubnetID", func() {
+		env.ExpectCreated(dep, nodeClass, nodePool)
+		env.EventuallyExpectHealthyPodCount(selector, numPods)
 		nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0]
+		By("expect created node count to be 1")
+		env.ExpectCreatedNodeCount("==", 1)
 		By("triggering subnet drift")
 		// TODO: Introduce azure clients to the tests to get values dynamically and be able to create azure resources inside of tests rather than using a fake id.
 		// this will fail to actually create a new nodeclaim for the drift replacement but should still test that we are marking the nodeclaim as drifted.