From b5ff9fe07cc4c7cad2894ea071dc5e3514119fde Mon Sep 17 00:00:00 2001 From: Alex Leites <18728999+tallaxes@users.noreply.github.com> Date: Tue, 14 Jan 2025 18:09:43 +0000 Subject: [PATCH] test(e2e): update drift suite --- test/suites/drift/suite_test.go | 686 +++++++++++++++++++++++++++++--- 1 file changed, 640 insertions(+), 46 deletions(-) diff --git a/test/suites/drift/suite_test.go b/test/suites/drift/suite_test.go index 4d7fce780..b008dea07 100644 --- a/test/suites/drift/suite_test.go +++ b/test/suites/drift/suite_test.go @@ -14,26 +14,35 @@ See the License for the specific language governing permissions and limitations under the License. */ -package drift +package drift_test import ( "fmt" + "sort" "testing" "time" + "github.com/awslabs/operatorpkg/object" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" "github.com/samber/lo" + appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/apimachinery/pkg/util/sets" + "sigs.k8s.io/controller-runtime/pkg/client" + karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" + coretest "sigs.k8s.io/karpenter/pkg/test" + "github.com/Azure/go-autorest/autorest/to" "github.com/Azure/karpenter-provider-azure/pkg/apis/v1alpha2" + "github.com/Azure/karpenter-provider-azure/pkg/test" "github.com/Azure/karpenter-provider-azure/test/pkg/environment/azure" - - karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" - - coretest "sigs.k8s.io/karpenter/pkg/test" + "github.com/Azure/karpenter-provider-azure/test/pkg/environment/common" ) var env *azure.Environment @@ -45,6 +54,9 @@ func TestDrift(t *testing.T) { BeforeSuite(func() { env = azure.NewEnvironment(t) }) + AfterSuite(func() { + env.Stop() + }) RunSpecs(t, "Drift") } @@ -57,71 +69,653 @@ var _ = AfterEach(func() { env.Cleanup() }) var _ = AfterEach(func() { env.AfterEach() }) var _ = Describe("Drift", func() { - - var pod *corev1.Pod - + var dep *appsv1.Deployment + var selector labels.Selector + var numPods int BeforeEach(func() { - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - NodeSelectorRequirement: corev1.NodeSelectorRequirement{ - Key: corev1.LabelInstanceTypeStable, - Operator: corev1.NodeSelectorOpIn, - Values: []string{"Standard_DS2_v2"}, - }}) - - // Add a do-not-disrupt pod so that we can check node metadata before we disrupt - pod = coretest.Pod(coretest.PodOptions{ - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{ - karpv1.DoNotDisruptAnnotationKey: "true", + numPods = 1 + // Add pods with a do-not-disrupt annotation so that we can check node metadata before we disrupt + dep = coretest.Deployment(coretest.DeploymentOptions{ + Replicas: int32(numPods), + PodOptions: coretest.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "app": "my-app", + }, + Annotations: map[string]string{ + karpv1.DoNotDisruptAnnotationKey: "true", + }, }, + TerminationGracePeriodSeconds: lo.ToPtr[int64](0), }, - ResourceRequirements: corev1.ResourceRequirements{Requests: corev1.ResourceList{corev1.ResourceCPU: resource.MustParse("0.5")}}, - Image: "mcr.microsoft.com/oss/kubernetes/pause:3.6", }) + selector = labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) }) + Context("Budgets", func() { + It("should respect budgets for empty drift", func() { + nodePool = coretest.ReplaceRequirements(nodePool, + karpv1.NodeSelectorRequirementWithMinValues{ + 
NodeSelectorRequirement: corev1.NodeSelectorRequirement{ + Key: corev1.LabelInstanceTypeStable, + Operator: corev1.NodeSelectorOpIn, + Values: []string{"Standard_DS4_v2"}, + }, + }, + ) + // We're expecting to create 3 nodes, so we'll expect to see 2 nodes deleting at one time. + nodePool.Spec.Disruption.Budgets = []karpv1.Budget{{ + Nodes: "50%", + }} + var numPods int32 = 6 + dep = coretest.Deployment(coretest.DeploymentOptions{ + Replicas: numPods, + PodOptions: coretest.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + karpv1.DoNotDisruptAnnotationKey: "true", + }, + Labels: map[string]string{"app": "large-app"}, + }, + // Each Standard_DS4_v2 has 8 cpu, so each node should fit 2 pods. + ResourceRequirements: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("3"), + }, + }, + }, + }) + selector = labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + env.ExpectCreated(nodeClass, nodePool, dep) - // TODO: Add budget tests + nodeClaims := env.EventuallyExpectCreatedNodeClaimCount("==", 3) + nodes := env.EventuallyExpectCreatedNodeCount("==", 3) + env.EventuallyExpectHealthyPodCount(selector, int(numPods)) - It("should deprovision nodes that have drifted due to labels", func() { + // List nodes so that we get any updated information on the nodes. If we don't + // we have the potential to over-write any changes Karpenter makes to the nodes. + // Add a finalizer to each node so that we can stop termination disruptions + By("adding finalizers to the nodes to prevent termination") + for _, node := range nodes { + Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(node), node)).To(Succeed()) + node.Finalizers = append(node.Finalizers, common.TestingFinalizer) + env.ExpectUpdated(node) + } - By(fmt.Sprintf("creating pod %s, nodepool %s, and nodeclass %s", pod.Name, nodePool.Name, nodeClass.Name)) - env.ExpectCreated(pod, nodeClass, nodePool) + By("making the nodes empty") + // Delete the deployment to make all nodes empty. + env.ExpectDeleted(dep) - By(fmt.Sprintf("expect pod %s to be healthy", pod.Name)) - env.EventuallyExpectHealthy(pod) + // Drift the nodeclaims + By("drift the nodeclaims") + nodePool.Spec.Template.Annotations = map[string]string{"test": "annotation"} + env.ExpectUpdated(nodePool) - By("expect created node count to be 1") - env.ExpectCreatedNodeCount("==", 1) + env.EventuallyExpectDrifted(nodeClaims...) + + env.ConsistentlyExpectDisruptionsUntilNoneLeft(3, 2, 5*time.Minute) + }) + It("should respect budgets for non-empty delete drift", func() { + nodePool = coretest.ReplaceRequirements(nodePool, + karpv1.NodeSelectorRequirementWithMinValues{ + NodeSelectorRequirement: corev1.NodeSelectorRequirement{ + Key: corev1.LabelInstanceTypeStable, + Operator: corev1.NodeSelectorOpIn, + Values: []string{"Standard_DS4_v2"}, + }, + }, + ) + // We're expecting to create 3 nodes, so we'll expect to see at most 2 nodes deleting at one time. + nodePool.Spec.Disruption.Budgets = []karpv1.Budget{{ + Nodes: "50%", + }} + var numPods int32 = 9 + dep = coretest.Deployment(coretest.DeploymentOptions{ + Replicas: numPods, + PodOptions: coretest.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + karpv1.DoNotDisruptAnnotationKey: "true", + }, + Labels: map[string]string{"app": "large-app"}, + }, + // TODO + // Each Standard_DS4_v2 has 8 cpu, so each node should fit no more than 3 pods. 
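+					// With 9 replicas requesting 2100m each, three pods fill a node, so we expect 3 nodes in total.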
+ ResourceRequirements: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("2100m"), + }, + }, + }, + }) + selector = labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + env.ExpectCreated(nodeClass, nodePool, dep) + + nodeClaims := env.EventuallyExpectCreatedNodeClaimCount("==", 3) + nodes := env.EventuallyExpectCreatedNodeCount("==", 3) + env.EventuallyExpectHealthyPodCount(selector, int(numPods)) + + By("scaling down the deployment") + // Update the deployment to a third of the replicas. + dep.Spec.Replicas = lo.ToPtr[int32](3) + env.ExpectUpdated(dep) + + // First expect there to be 3 pods, then try to spread the pods. + env.EventuallyExpectHealthyPodCount(selector, 3) + env.ForcePodsToSpread(nodes...) + env.EventuallyExpectHealthyPodCount(selector, 3) + + By("cordoning and adding finalizer to the nodes") + // Add a finalizer to each node so that we can stop termination disruptions + for _, node := range nodes { + Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(node), node)).To(Succeed()) + node.Finalizers = append(node.Finalizers, common.TestingFinalizer) + env.ExpectUpdated(node) + } + + By("drifting the nodes") + // Drift the nodeclaims + nodePool.Spec.Template.Annotations = map[string]string{"test": "annotation"} + env.ExpectUpdated(nodePool) + + env.EventuallyExpectDrifted(nodeClaims...) + + By("enabling disruption by removing the do not disrupt annotation") + pods := env.EventuallyExpectHealthyPodCount(selector, 3) + // Remove the do-not-disrupt annotation so that the nodes are now disruptable + for _, pod := range pods { + delete(pod.Annotations, karpv1.DoNotDisruptAnnotationKey) + env.ExpectUpdated(pod) + } + + env.ConsistentlyExpectDisruptionsUntilNoneLeft(3, 2, 5*time.Minute) + }) + It("should respect budgets for non-empty replace drift", func() { + appLabels := map[string]string{"app": "large-app"} + nodePool.Labels = appLabels + // We're expecting to create 5 nodes, so we'll expect to see at most 3 nodes deleting at one time. 
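+			// Unlike the percentage budgets above, this budget caps disruption at an absolute count of 3 nodes at a time.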
+ nodePool.Spec.Disruption.Budgets = []karpv1.Budget{{ + Nodes: "3", + }} + + // Create a 5 pod deployment with hostname inter-pod anti-affinity to ensure each pod is placed on a unique node + numPods = 5 + selector = labels.SelectorFromSet(appLabels) + deployment := coretest.Deployment(coretest.DeploymentOptions{ + Replicas: int32(numPods), + PodOptions: coretest.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: appLabels, + }, + PodAntiRequirements: []corev1.PodAffinityTerm{{ + TopologyKey: corev1.LabelHostname, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: appLabels, + }, + }}, + }, + }) + + env.ExpectCreated(nodeClass, nodePool, deployment) + + originalNodeClaims := env.EventuallyExpectCreatedNodeClaimCount("==", numPods) + originalNodes := env.EventuallyExpectCreatedNodeCount("==", numPods) + + // Check that all deployment pods are online + env.EventuallyExpectHealthyPodCount(selector, numPods) + By("cordoning and adding finalizer to the nodes") + // Add a finalizer to each node so that we can stop termination disruptions + for _, node := range originalNodes { + Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(node), node)).To(Succeed()) + node.Finalizers = append(node.Finalizers, common.TestingFinalizer) + env.ExpectUpdated(node) + } + + By("drifting the nodepool") + nodePool.Spec.Template.Annotations = lo.Assign(nodePool.Spec.Template.Annotations, map[string]string{"test-annotation": "drift"}) + env.ExpectUpdated(nodePool) + env.ConsistentlyExpectDisruptionsUntilNoneLeft(5, 3, 10*time.Minute) + + for _, node := range originalNodes { + Expect(env.ExpectTestingFinalizerRemoved(node)).To(Succeed()) + } + for _, nodeClaim := range originalNodeClaims { + Expect(env.ExpectTestingFinalizerRemoved(nodeClaim)).To(Succeed()) + } + + // Eventually expect all the nodes to be rolled and completely removed + // Since this completes the disruption operation, this also ensures that we aren't leaking nodes into subsequent + // tests since nodeclaims that are actively replacing but haven't brought-up nodes yet can register nodes later + env.EventuallyExpectNotFound(lo.Map(originalNodes, func(n *corev1.Node, _ int) client.Object { return n })...) + env.EventuallyExpectNotFound(lo.Map(originalNodeClaims, func(n *karpv1.NodeClaim, _ int) client.Object { return n })...) 
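+			// Once the roll completes, the fleet should return to its original size, now backed entirely by new nodes and nodeclaims.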
+ env.ExpectNodeClaimCount("==", 5) + env.ExpectNodeCount("==", 5) + }) + It("should not allow drift if the budget is fully blocking", func() { + // We're going to define a budget that doesn't allow any drift to happen + nodePool.Spec.Disruption.Budgets = []karpv1.Budget{{ + Nodes: "0", + }} + + dep.Spec.Template.Annotations = nil + env.ExpectCreated(nodeClass, nodePool, dep) + + nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + env.EventuallyExpectCreatedNodeCount("==", 1) + env.EventuallyExpectHealthyPodCount(selector, numPods) + + By("drifting the nodes") + // Drift the nodeclaims + nodePool.Spec.Template.Annotations = map[string]string{"test": "annotation"} + env.ExpectUpdated(nodePool) + + env.EventuallyExpectDrifted(nodeClaim) + env.ConsistentlyExpectNoDisruptions(1, time.Minute) + }) + It("should not allow drift if the budget is fully blocking during a scheduled time", func() { + // We're going to define a budget that doesn't allow any drift to happen + // This is going to be on a schedule that only lasts 30 minutes, whose window starts 15 minutes before + // the current time and extends 15 minutes past the current time + // Times need to be in UTC since the karpenter containers were built in UTC time + windowStart := time.Now().Add(-time.Minute * 15).UTC() + nodePool.Spec.Disruption.Budgets = []karpv1.Budget{{ + Nodes: "0", + Schedule: lo.ToPtr(fmt.Sprintf("%d %d * * *", windowStart.Minute(), windowStart.Hour())), + Duration: &metav1.Duration{Duration: time.Minute * 30}, + }} + + dep.Spec.Template.Annotations = nil + env.ExpectCreated(nodeClass, nodePool, dep) + + nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + env.EventuallyExpectCreatedNodeCount("==", 1) + env.EventuallyExpectHealthyPodCount(selector, numPods) + + By("drifting the nodes") + // Drift the nodeclaims + nodePool.Spec.Template.Annotations = map[string]string{"test": "annotation"} + env.ExpectUpdated(nodePool) + + env.EventuallyExpectDrifted(nodeClaim) + env.ConsistentlyExpectNoDisruptions(1, time.Minute) + }) + }) + + It("should disrupt nodes that have drifted due to images", func() { + // TODO + }) + + DescribeTable("NodePool Drift", func(nodeClaimTemplate karpv1.NodeClaimTemplate) { + updatedNodePool := coretest.NodePool( + karpv1.NodePool{ + Spec: karpv1.NodePoolSpec{ + Template: karpv1.NodeClaimTemplate{ + Spec: karpv1.NodeClaimTemplateSpec{ + NodeClassRef: &karpv1.NodeClassReference{ + Group: object.GVK(nodeClass).Group, + Kind: object.GVK(nodeClass).Kind, + Name: nodeClass.Name, + }, + // keep the same instance type requirements to prevent considering instance types that require swap + Requirements: nodePool.Spec.Template.Spec.Requirements, + }, + }, + }, + }, + karpv1.NodePool{ + Spec: karpv1.NodePoolSpec{ + Template: nodeClaimTemplate, + }, + }, + ) + updatedNodePool.ObjectMeta = nodePool.ObjectMeta + + env.ExpectCreated(dep, nodeClass, nodePool) + pod := env.EventuallyExpectHealthyPodCount(selector, numPods)[0] nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] - node := env.EventuallyExpectNodeCount("==", 1)[0] + node := env.ExpectCreatedNodeCount("==", 1)[0] - By(fmt.Sprintf("waiting for nodepool %s update", nodePool.Name)) - nodePool.Spec.Template.Labels["triggerdrift"] = "value" - env.ExpectCreatedOrUpdated(nodePool) + env.ExpectCreatedOrUpdated(updatedNodePool) - By(fmt.Sprintf("waiting for nodeclaim %s to be marked as drifted", nodeClaim.Name)) env.EventuallyExpectDrifted(nodeClaim) - By(fmt.Sprintf("waiting for pod %s to to update", pod.Name)) 
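+		// Remove the do-not-disrupt annotation so the pod can be evicted and the drifted node deprovisioned.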
delete(pod.Annotations, karpv1.DoNotDisruptAnnotationKey) env.ExpectUpdated(pod) - By(fmt.Sprintf("expect pod %s, nodeclaim %s, and node %s to eventually not exist", pod.Name, nodeClaim.Name, node.Name)) - SetDefaultEventuallyTimeout(10 * time.Minute) - env.EventuallyExpectNotFound(pod, nodeClaim, node) - SetDefaultEventuallyTimeout(5 * time.Minute) + // Nodes will need to have the start-up taint removed before the node can be considered as initialized + fmt.Println(CurrentSpecReport().LeafNodeText) + if CurrentSpecReport().LeafNodeText == "Start-up Taints" { + nodes := env.EventuallyExpectCreatedNodeCount("==", 2) + sort.Slice(nodes, func(i int, j int) bool { + return nodes[i].CreationTimestamp.Before(&nodes[j].CreationTimestamp) + }) + nodeTwo := nodes[1] + // Remove the startup taints from the new nodes to initialize them + Eventually(func(g Gomega) { + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeTwo), nodeTwo)).To(Succeed()) + g.Expect(len(nodeTwo.Spec.Taints)).To(BeNumerically("==", 1)) + _, found := lo.Find(nodeTwo.Spec.Taints, func(t corev1.Taint) bool { + return t.MatchTaint(&corev1.Taint{Key: "example.com/another-taint-2", Effect: corev1.TaintEffectPreferNoSchedule}) + }) + g.Expect(found).To(BeTrue()) + stored := nodeTwo.DeepCopy() + nodeTwo.Spec.Taints = lo.Reject(nodeTwo.Spec.Taints, func(t corev1.Taint, _ int) bool { return t.Key == "example.com/another-taint-2" }) + g.Expect(env.Client.Patch(env.Context, nodeTwo, client.StrategicMergeFrom(stored))).To(Succeed()) + }).Should(Succeed()) + } + env.EventuallyExpectNotFound(pod, node) + env.EventuallyExpectHealthyPodCount(selector, numPods) + }, + Entry("Annotations", karpv1.NodeClaimTemplate{ + ObjectMeta: karpv1.ObjectMeta{ + Annotations: map[string]string{"keyAnnotationTest": "valueAnnotationTest"}, + }, + }), + Entry("Labels", karpv1.NodeClaimTemplate{ + ObjectMeta: karpv1.ObjectMeta{ + Labels: map[string]string{"keyLabelTest": "valueLabelTest"}, + }, + }), + Entry("Taints", karpv1.NodeClaimTemplate{ + Spec: karpv1.NodeClaimTemplateSpec{ + Taints: []corev1.Taint{{Key: "example.com/another-taint-2", Effect: corev1.TaintEffectPreferNoSchedule}}, + }, + }), + Entry("Start-up Taints", karpv1.NodeClaimTemplate{ + Spec: karpv1.NodeClaimTemplateSpec{ + StartupTaints: []corev1.Taint{{Key: "example.com/another-taint-2", Effect: corev1.TaintEffectPreferNoSchedule}}, + }, + }), + Entry("NodeRequirements", karpv1.NodeClaimTemplate{ + Spec: karpv1.NodeClaimTemplateSpec{ + // since this will overwrite the default requirements, add instance category and family selectors back into requirements + Requirements: []karpv1.NodeSelectorRequirementWithMinValues{ + {NodeSelectorRequirement: corev1.NodeSelectorRequirement{Key: karpv1.CapacityTypeLabelKey, Operator: corev1.NodeSelectorOpIn, Values: []string{karpv1.CapacityTypeSpot}}}, + {NodeSelectorRequirement: corev1.NodeSelectorRequirement{Key: corev1.LabelInstanceTypeStable, Operator: corev1.NodeSelectorOpIn, Values: []string{"Standard_DS4_v2"}}}, + }, + }, + }), + ) + FDescribeTable("AKSNodeClass", func(nodeClassSpec v1alpha2.AKSNodeClassSpec) { + updatedNodeClass := test.AKSNodeClass(v1alpha2.AKSNodeClass{Spec: *nodeClass.Spec.DeepCopy()}, v1alpha2.AKSNodeClass{Spec: nodeClassSpec}) + updatedNodeClass.ObjectMeta = nodeClass.ObjectMeta + + env.ExpectCreated(dep, nodeClass, nodePool) + pod := env.EventuallyExpectHealthyPodCount(selector, numPods)[0] + nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + node := env.ExpectCreatedNodeCount("==", 1)[0] + + 
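+		// Applying the updated AKSNodeClass should change its hash and mark the existing nodeclaim as drifted.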
env.ExpectCreatedOrUpdated(updatedNodeClass) + + env.EventuallyExpectDrifted(nodeClaim) + + delete(pod.Annotations, karpv1.DoNotDisruptAnnotationKey) + env.ExpectUpdated(pod) + env.EventuallyExpectNotFound(pod, node) + env.EventuallyExpectHealthyPodCount(selector, numPods) + }, + // VNETSubnetID tested separately + Entry("OSDiskSizeGB", v1alpha2.AKSNodeClassSpec{OSDiskSizeGB: to.Int32Ptr(100)}), + // ImageID TBD + Entry("ImageFamily", v1alpha2.AKSNodeClassSpec{ImageFamily: to.StringPtr("AzureLinux")}), + Entry("Tags", v1alpha2.AKSNodeClassSpec{Tags: map[string]string{"keyTag-test-3": "valueTag-test-3"}}), + Entry("KubeletConfiguration", v1alpha2.AKSNodeClassSpec{ + Kubelet: &v1alpha2.KubeletConfiguration{ + ImageGCLowThresholdPercent: to.Int32Ptr(10), + ImageGCHighThresholdPercent: to.Int32Ptr(90), + }, + }), + Entry("MaxPods", v1alpha2.AKSNodeClassSpec{MaxPods: to.Int32Ptr(10)}), + ) + + It("should update the nodepool-hash annotation on the nodepool and nodeclaim when the nodepool's nodepool-hash-version annotation does not match the controller hash version", func() { + env.ExpectCreated(dep, nodeClass, nodePool) + env.EventuallyExpectHealthyPodCount(selector, numPods) + nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + nodePool = env.ExpectExists(nodePool).(*karpv1.NodePool) + expectedHash := nodePool.Hash() + + By(fmt.Sprintf("expect nodepool %s and nodeclaim %s to contain %s and %s annotations", nodePool.Name, nodeClaim.Name, karpv1.NodePoolHashAnnotationKey, karpv1.NodePoolHashVersionAnnotationKey)) + Eventually(func(g Gomega) { + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodePool), nodePool)).To(Succeed()) + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed()) + + g.Expect(nodePool.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashAnnotationKey, expectedHash)) + g.Expect(nodePool.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashVersionAnnotationKey, karpv1.NodePoolHashVersion)) + g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashAnnotationKey, expectedHash)) + g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashVersionAnnotationKey, karpv1.NodePoolHashVersion)) + }).WithTimeout(30 * time.Second).Should(Succeed()) + + nodePool.Annotations = lo.Assign(nodePool.Annotations, map[string]string{ + karpv1.NodePoolHashAnnotationKey: "test-hash-1", + karpv1.NodePoolHashVersionAnnotationKey: "test-hash-version-1", + }) + // Updating `nodePool.Spec.Template.Annotations` would normally trigger drift on all nodeclaims owned by the + // nodepool. 
However, the nodepool-hash-version does not match the controller hash version, so we will see that + // none of the nodeclaims will be drifted and all nodeclaims will have an updated `nodepool-hash` and `nodepool-hash-version` annotation + nodePool.Spec.Template.Annotations = lo.Assign(nodePool.Spec.Template.Annotations, map[string]string{ + "test-key": "test-value", + }) + nodeClaim.Annotations = lo.Assign(nodePool.Annotations, map[string]string{ + karpv1.NodePoolHashAnnotationKey: "test-hash-2", + karpv1.NodePoolHashVersionAnnotationKey: "test-hash-version-2", + }) + + // The nodeclaim will need to be updated first, as the hash controller will only be triggered on changes to the nodepool + env.ExpectUpdated(nodeClaim, nodePool) + expectedHash = nodePool.Hash() + + // Expect all nodeclaims not to be drifted and contain an updated `nodepool-hash` and `nodepool-hash-version` annotation + Eventually(func(g Gomega) { + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodePool), nodePool)).To(Succeed()) + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed()) + + g.Expect(nodePool.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashAnnotationKey, expectedHash)) + g.Expect(nodePool.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashVersionAnnotationKey, karpv1.NodePoolHashVersion)) + g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashAnnotationKey, expectedHash)) + g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashVersionAnnotationKey, karpv1.NodePoolHashVersion)) + }) }) - It("should mark the nodeclaim as drifted for SubnetDrift if AKSNodeClass subnet id changes", func() { - env.ExpectCreated(pod, nodeClass, nodePool) + It("should update the aksnodeclass-hash annotation on the aksnodeclass and nodeclaim when the aksnodeclass's aksnodeclass-hash-version annotation does not match the controller hash version", func() { + env.ExpectCreated(dep, nodeClass, nodePool) + env.EventuallyExpectHealthyPodCount(selector, numPods) + nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + nodeClass = env.ExpectExists(nodeClass).(*v1alpha2.AKSNodeClass) + expectedHash := nodeClass.Hash() - By(fmt.Sprintf("expect pod %s to be healthy", pod.Name)) - env.EventuallyExpectHealthy(pod) + By(fmt.Sprintf("expect nodeclass %s and nodeclaim %s to contain %s and %s annotations", nodeClass.Name, nodeClaim.Name, v1alpha2.AnnotationAKSNodeClassHash, v1alpha2.AnnotationAKSNodeClassHashVersion)) + Eventually(func(g Gomega) { + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClass), nodeClass)).To(Succeed()) + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed()) - By("expect created node count to be 1") - env.ExpectCreatedNodeCount("==", 1) + g.Expect(nodeClass.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHash, expectedHash)) + g.Expect(nodeClass.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHashVersion, v1alpha2.AKSNodeClassHashVersion)) + g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHash, expectedHash)) + g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHashVersion, v1alpha2.AKSNodeClassHashVersion)) + }).WithTimeout(30 * time.Second).Should(Succeed()) + + nodeClass.Annotations = lo.Assign(nodeClass.Annotations, map[string]string{ + v1alpha2.AnnotationAKSNodeClassHash: "test-hash-1", + v1alpha2.AnnotationAKSNodeClassHashVersion: 
"test-hash-version-1", + }) + // Updating `nodeClass.Spec.Tags` would normally trigger drift on all nodeclaims using the + // nodeclass. However, the aksnodeclass-hash-version does not match the controller hash version, so we will see that + // none of the nodeclaims will be drifted and all nodeclaims will have an updated `aksnodeclass-hash` and `aksnodeclass-hash-version` annotation + nodeClass.Spec.Tags = lo.Assign(nodeClass.Spec.Tags, map[string]string{ + "test-key": "test-value", + }) + nodeClaim.Annotations = lo.Assign(nodePool.Annotations, map[string]string{ + v1alpha2.AnnotationAKSNodeClassHash: "test-hash-2", + v1alpha2.AnnotationAKSNodeClassHashVersion: "test-hash-version-2", + }) + + // The nodeclaim will need to be updated first, as the hash controller will only be triggered on changes to the nodeclass + env.ExpectUpdated(nodeClaim, nodeClass) + expectedHash = nodeClass.Hash() + + // Expect all nodeclaims not to be drifted and contain an updated `nodepool-hash` and `nodepool-hash-version` annotation + Eventually(func(g Gomega) { + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClass), nodeClass)).To(Succeed()) + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed()) + g.Expect(nodeClass.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHash, expectedHash)) + g.Expect(nodeClass.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHashVersion, v1alpha2.AKSNodeClassHashVersion)) + g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHash, expectedHash)) + g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHashVersion, v1alpha2.AKSNodeClassHashVersion)) + }).WithTimeout(30 * time.Second).Should(Succeed()) + env.ConsistentlyExpectNodeClaimsNotDrifted(time.Minute, nodeClaim) + }) + Context("Failure", func() { + It("should not disrupt a drifted node if the replacement node never registers", func() { + // launch a new nodeClaim + var numPods int32 = 2 + dep = coretest.Deployment(coretest.DeploymentOptions{ + Replicas: 2, + PodOptions: coretest.PodOptions{ + ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "inflate"}}, + PodAntiRequirements: []corev1.PodAffinityTerm{{ + TopologyKey: corev1.LabelHostname, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "inflate"}, + }}, + }, + }, + }) + env.ExpectCreated(dep, nodeClass, nodePool) + + startingNodeClaimState := env.EventuallyExpectCreatedNodeClaimCount("==", int(numPods)) + env.EventuallyExpectCreatedNodeCount("==", int(numPods)) + + // Drift the nodeClaim with bad configuration that will not register a NodeClaim + nodeClass.Spec.VNETSubnetID = lo.ToPtr("/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/sillygeese/providers/Microsoft.Network/virtualNetworks/karpenter/subnets/nodeclassSubnet2") + env.ExpectCreatedOrUpdated(nodeClass) + + env.EventuallyExpectDrifted(startingNodeClaimState...) + + // Expect only a single node to be tainted due to default disruption budgets + taintedNodes := env.EventuallyExpectTaintedNodeCount("==", 1) + + // Drift should fail and the original node should be untainted + // TODO: reduce timeouts when disruption waits are factored out + env.EventuallyExpectNodesUntaintedWithTimeout(11*time.Minute, taintedNodes...) + + // Expect all the NodeClaims that existed on the initial provisioning loop are not removed. 
+ // Assert this over several minutes to ensure a subsequent disruption controller pass doesn't + // successfully schedule the evicted pods to the in-flight nodeclaim and disrupt the original node + Consistently(func(g Gomega) { + nodeClaims := &karpv1.NodeClaimList{} + g.Expect(env.Client.List(env, nodeClaims, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed()) + startingNodeClaimUIDs := sets.New(lo.Map(startingNodeClaimState, func(nc *karpv1.NodeClaim, _ int) types.UID { return nc.UID })...) + nodeClaimUIDs := sets.New(lo.Map(nodeClaims.Items, func(nc karpv1.NodeClaim, _ int) types.UID { return nc.UID })...) + g.Expect(nodeClaimUIDs.IsSuperset(startingNodeClaimUIDs)).To(BeTrue()) + }, "2m").Should(Succeed()) + + }) + It("should not disrupt a drifted node if the replacement node registers but never initialized", func() { + // launch a new nodeClaim + var numPods int32 = 2 + dep = coretest.Deployment(coretest.DeploymentOptions{ + Replicas: 2, + PodOptions: coretest.PodOptions{ + ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "inflate"}}, + PodAntiRequirements: []corev1.PodAffinityTerm{{ + TopologyKey: corev1.LabelHostname, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "inflate"}, + }}, + }, + }, + }) + env.ExpectCreated(dep, nodeClass, nodePool) + + startingNodeClaimState := env.EventuallyExpectCreatedNodeClaimCount("==", int(numPods)) + env.EventuallyExpectCreatedNodeCount("==", int(numPods)) + + // Drift the nodeClaim with bad configuration that never initializes + nodePool.Spec.Template.Spec.StartupTaints = []corev1.Taint{{Key: "example.com/taint", Effect: corev1.TaintEffectPreferNoSchedule}} + env.ExpectCreatedOrUpdated(nodePool) + + env.EventuallyExpectDrifted(startingNodeClaimState...) + + // Expect only a single node to get tainted due to default disruption budgets + taintedNodes := env.EventuallyExpectTaintedNodeCount("==", 1) + + // Drift should fail and original node should be untainted + // TODO: reduce timeouts when disruption waits are factored out + env.EventuallyExpectNodesUntaintedWithTimeout(11*time.Minute, taintedNodes...) + + // Expect that the new nodeClaim/node is kept around after the un-cordon + nodeList := &corev1.NodeList{} + Expect(env.Client.List(env, nodeList, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed()) + Expect(nodeList.Items).To(HaveLen(int(numPods) + 1)) + + nodeClaimList := &karpv1.NodeClaimList{} + Expect(env.Client.List(env, nodeClaimList, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed()) + Expect(nodeClaimList.Items).To(HaveLen(int(numPods) + 1)) + + // Expect all the NodeClaims that existed on the initial provisioning loop are not removed + // Assert this over several minutes to ensure a subsequent disruption controller pass doesn't + // successfully schedule the evicted pods to the in-flight nodeclaim and disrupt the original node + Consistently(func(g Gomega) { + nodeClaims := &karpv1.NodeClaimList{} + g.Expect(env.Client.List(env, nodeClaims, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed()) + startingNodeClaimUIDs := sets.New(lo.Map(startingNodeClaimState, func(m *karpv1.NodeClaim, _ int) types.UID { return m.UID })...) + nodeClaimUIDs := sets.New(lo.Map(nodeClaims.Items, func(m karpv1.NodeClaim, _ int) types.UID { return m.UID })...) 
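+				// Every nodeclaim from the initial provisioning loop must still be present; a missing UID would mean the original node was replaced.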
+ g.Expect(nodeClaimUIDs.IsSuperset(startingNodeClaimUIDs)).To(BeTrue()) + }, "2m").Should(Succeed()) + }) + It("should not drift any nodes if their PodDisruptionBudgets are unhealthy", func() { + // Create a deployment that contains a readiness probe that will never succeed + // This way, the pod will bind to the node, but the PodDisruptionBudget will never go healthy + var numPods int32 = 2 + dep = coretest.Deployment(coretest.DeploymentOptions{ + Replicas: 2, + PodOptions: coretest.PodOptions{ + ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "inflate"}}, + PodAntiRequirements: []corev1.PodAffinityTerm{{ + TopologyKey: corev1.LabelHostname, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "inflate"}, + }}, + }, + ReadinessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Port: intstr.FromInt32(80), + }, + }, + }, + }, + }) + selector = labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + minAvailable := intstr.FromInt32(numPods - 1) + pdb := coretest.PodDisruptionBudget(coretest.PDBOptions{ + Labels: dep.Spec.Template.Labels, + MinAvailable: &minAvailable, + }) + env.ExpectCreated(dep, nodeClass, nodePool, pdb) + + nodeClaims := env.EventuallyExpectCreatedNodeClaimCount("==", int(numPods)) + env.EventuallyExpectCreatedNodeCount("==", int(numPods)) + + // Expect pods to be bound but not to be ready since we are intentionally failing the readiness check + env.EventuallyExpectBoundPodCount(selector, int(numPods)) + + // Drift the nodeclaims + nodePool.Spec.Template.Annotations = map[string]string{"test": "annotation"} + env.ExpectUpdated(nodePool) + + env.EventuallyExpectDrifted(nodeClaims...) + env.ConsistentlyExpectNoDisruptions(int(numPods), time.Minute) + }) + }) + + It("should disrupt nodes that have drifted due to VNETSubnetID", func() { + env.ExpectCreated(dep, nodeClass, nodePool) + env.EventuallyExpectHealthyPodCount(selector, numPods) nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + By("expect created node count to be 1") + env.ExpectCreatedNodeCount("==", 1) By("triggering subnet drift") // TODO: Introduce azure clients to the tests to get values dynamically and be able to create azure resources inside of tests rather than using a fake id. // this will fail to actually create a new nodeclaim for the drift replacement but should still test that we are marking the nodeclaim as drifted.