diff --git a/pkg/apis/v1alpha2/aksnodeclass.go b/pkg/apis/v1alpha2/aksnodeclass.go index 49f3764df..c4fdb66eb 100644 --- a/pkg/apis/v1alpha2/aksnodeclass.go +++ b/pkg/apis/v1alpha2/aksnodeclass.go @@ -51,7 +51,7 @@ type AKSNodeClassSpec struct { // Wherever possible, the types and names should reflect the upstream kubelet types. // +kubebuilder:validation:XValidation:message="imageGCHighThresholdPercent must be greater than imageGCLowThresholdPercent",rule="has(self.imageGCHighThresholdPercent) && has(self.imageGCLowThresholdPercent) ? self.imageGCHighThresholdPercent > self.imageGCLowThresholdPercent : true" // +optional - Kubelet *KubeletConfiguration `json:"kubelet,omitempty" hash:"ignore"` + Kubelet *KubeletConfiguration `json:"kubelet,omitempty"` // MaxPods is an override for the maximum number of pods that can run on a worker node instance. // +kubebuilder:validation:Minimum:=0 // +optional diff --git a/pkg/apis/v1alpha2/aksnodeclass_hash_test.go b/pkg/apis/v1alpha2/aksnodeclass_hash_test.go new file mode 100644 index 000000000..567159c2c --- /dev/null +++ b/pkg/apis/v1alpha2/aksnodeclass_hash_test.go @@ -0,0 +1,106 @@ +/* +Portions Copyright (c) Microsoft Corporation. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha2_test + +import ( + "time" + + "github.com/Azure/karpenter-provider-azure/pkg/apis/v1alpha2" + "github.com/imdario/mergo" + "github.com/samber/lo" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/karpenter/pkg/test" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("Hash", func() { + const staticHash = "3322684356700793451" + var nodeClass *v1alpha2.AKSNodeClass + BeforeEach(func() { + nodeClass = &v1alpha2.AKSNodeClass{ + ObjectMeta: test.ObjectMeta(metav1.ObjectMeta{}), + Spec: v1alpha2.AKSNodeClassSpec{ + VNETSubnetID: lo.ToPtr("subnet-id"), + OSDiskSizeGB: lo.ToPtr(int32(30)), + ImageFamily: lo.ToPtr("Ubuntu2204"), + Tags: map[string]string{ + "keyTag-1": "valueTag-1", + "keyTag-2": "valueTag-2", + }, + Kubelet: &v1alpha2.KubeletConfiguration{ + CPUManagerPolicy: "static", + CPUCFSQuota: lo.ToPtr(true), + CPUCFSQuotaPeriod: metav1.Duration{Duration: lo.Must(time.ParseDuration("100ms"))}, + ImageGCHighThresholdPercent: lo.ToPtr(int32(85)), + ImageGCLowThresholdPercent: lo.ToPtr(int32(80)), + TopologyManagerPolicy: "none", + AllowedUnsafeSysctls: []string{"net.core.somaxconn"}, + ContainerLogMaxSize: "10Mi", + ContainerLogMaxFiles: lo.ToPtr(int32(10)), + }, + MaxPods: lo.ToPtr(int32(100)), + }, + } + }) + DescribeTable( + "should match static hash on field value change", + func(hash string, changes v1alpha2.AKSNodeClass) { + Expect(mergo.Merge(nodeClass, changes, mergo.WithOverride, mergo.WithSliceDeepCopy)).To(Succeed()) + Expect(nodeClass.Hash()).To(Equal(hash)) + }, + Entry("Base AKSNodeClass", staticHash, v1alpha2.AKSNodeClass{}), + + // Static fields, expect changed hash from base + Entry("VNETSubnetID", "1592960122675270014", v1alpha2.AKSNodeClass{Spec: v1alpha2.AKSNodeClassSpec{VNETSubnetID: lo.ToPtr("subnet-id-2")}}), + Entry("OSDiskSizeGB", "8480605960768312946", v1alpha2.AKSNodeClass{Spec: v1alpha2.AKSNodeClassSpec{OSDiskSizeGB: lo.ToPtr(int32(40))}}), + Entry("ImageFamily", "1518600367124538723", v1alpha2.AKSNodeClass{Spec: v1alpha2.AKSNodeClassSpec{ImageFamily: lo.ToPtr("AzureLinux")}}), + Entry("Tags", "13342466710512152270", v1alpha2.AKSNodeClass{Spec: v1alpha2.AKSNodeClassSpec{Tags: map[string]string{"keyTag-test-3": "valueTag-test-3"}}}), + Entry("Kubelet", "13352743309220827045", v1alpha2.AKSNodeClass{Spec: v1alpha2.AKSNodeClassSpec{Kubelet: &v1alpha2.KubeletConfiguration{CPUManagerPolicy: "none"}}}), + Entry("MaxPods", "17237389697604178241", v1alpha2.AKSNodeClass{Spec: v1alpha2.AKSNodeClassSpec{MaxPods: lo.ToPtr(int32(200))}}), + ) + It("should match static hash when reordering tags", func() { + nodeClass.Spec.Tags = map[string]string{"keyTag-2": "valueTag-2", "keyTag-1": "valueTag-1"} + Expect(nodeClass.Hash()).To(Equal(staticHash)) + }) + DescribeTable("should change hash when static fields are updated", func(changes v1alpha2.AKSNodeClass) { + hash := nodeClass.Hash() + Expect(mergo.Merge(nodeClass, changes, mergo.WithOverride, mergo.WithSliceDeepCopy)).To(Succeed()) + updatedHash := nodeClass.Hash() + Expect(hash).ToNot(Equal(updatedHash)) + }, + Entry("VNETSubnetID", v1alpha2.AKSNodeClass{Spec: v1alpha2.AKSNodeClassSpec{VNETSubnetID: lo.ToPtr("subnet-id-2")}}), + Entry("OSDiskSizeGB", v1alpha2.AKSNodeClass{Spec: v1alpha2.AKSNodeClassSpec{OSDiskSizeGB: lo.ToPtr(int32(40))}}), + Entry("ImageFamily", v1alpha2.AKSNodeClass{Spec: v1alpha2.AKSNodeClassSpec{ImageFamily: lo.ToPtr("AzureLinux")}}), + Entry("Tags", v1alpha2.AKSNodeClass{Spec: v1alpha2.AKSNodeClassSpec{Tags: map[string]string{"keyTag-test-3": "valueTag-test-3"}}}), + Entry("Kubelet", v1alpha2.AKSNodeClass{Spec: v1alpha2.AKSNodeClassSpec{Kubelet: &v1alpha2.KubeletConfiguration{CPUManagerPolicy: "none"}}}), + Entry("MaxPods", v1alpha2.AKSNodeClass{Spec: v1alpha2.AKSNodeClassSpec{MaxPods: lo.ToPtr(int32(200))}}), + ) + 
It("should not change hash when tags are re-ordered", func() { + hash := nodeClass.Hash() + nodeClass.Spec.Tags = map[string]string{"keyTag-2": "valueTag-2", "keyTag-1": "valueTag-1"} + updatedHash := nodeClass.Hash() + Expect(hash).To(Equal(updatedHash)) + }) + It("should expect two AKSNodeClasses with the same spec to have the same hash", func() { + otherNodeClass := &v1alpha2.AKSNodeClass{ + Spec: nodeClass.Spec, + } + Expect(nodeClass.Hash()).To(Equal(otherNodeClass.Hash())) + }) +}) diff --git a/pkg/controllers/nodeclass/hash/controller.go b/pkg/controllers/nodeclass/hash/controller.go index c9bb64d0b..a82ed7a3b 100644 --- a/pkg/controllers/nodeclass/hash/controller.go +++ b/pkg/controllers/nodeclass/hash/controller.go @@ -80,9 +80,9 @@ func (c *Controller) Register(_ context.Context, m manager.Manager) error { Complete(reconcile.AsReconciler(m.GetClient(), c)) } -// Updating `AKSNodeClass-hash-version` annotation inside the karpenter controller means a breaking change has been made to the hash calculation. -// `AKSNodeClass-hash` annotation on the AKSNodeClass will be updated, due to the breaking change, making the `AKSNodeClass-hash` on the NodeClaim different from -// AKSNodeClass. Since, we cannot rely on the `AKSNodeClass-hash` on the NodeClaims, due to the breaking change, we will need to re-calculate the hash and update the annotation. +// Updating `aksnodeclass-hash-version` annotation inside the karpenter controller means a breaking change has been made to the hash calculation. +// `aksnodeclass-hash` annotation on the AKSNodeClass will be updated, due to the breaking change, making the `aksnodeclass-hash` on the NodeClaim different from +// AKSNodeClass. Since, we cannot rely on the `aksnodeclass-hash` on the NodeClaims, due to the breaking change, we will need to re-calculate the hash and update the annotation. 
// For more information on the Drift Hash Versioning: https://github.com/kubernetes-sigs/karpenter/blob/main/designs/drift-hash-versioning.md func (c *Controller) updateNodeClaimHash(ctx context.Context, nodeClass *v1alpha2.AKSNodeClass) error { ncList := &karpv1.NodeClaimList{} diff --git a/pkg/controllers/nodeclass/hash/suite_test.go b/pkg/controllers/nodeclass/hash/suite_test.go index 0fac30d3e..d95f6c916 100644 --- a/pkg/controllers/nodeclass/hash/suite_test.go +++ b/pkg/controllers/nodeclass/hash/suite_test.go @@ -117,7 +117,7 @@ var _ = Describe("NodeClass Hash Controller", func() { Entry("OSDiskSizeGB Drift", &v1alpha2.AKSNodeClass{Spec: v1alpha2.AKSNodeClassSpec{OSDiskSizeGB: lo.ToPtr(int32(100))}}), Entry("Tags Drift", &v1alpha2.AKSNodeClass{Spec: v1alpha2.AKSNodeClassSpec{Tags: map[string]string{"keyTag-test-3": "valueTag-test-3"}}}), ) - It("should update AKSNodeClass-hash-version annotation when the AKSNodeClass-hash-version on the NodeClass does not match with the controller hash version", func() { + It("should update aksnodeclass-hash-version annotation when the aksnodeclass-hash-version on the NodeClass does not match with the controller hash version", func() { nodeClass.Annotations = map[string]string{ v1alpha2.AnnotationAKSNodeClassHash: "abceduefed", v1alpha2.AnnotationAKSNodeClassHashVersion: "test", @@ -128,11 +128,11 @@ var _ = Describe("NodeClass Hash Controller", func() { nodeClass = ExpectExists(ctx, env.Client, nodeClass) expectedHash := nodeClass.Hash() - // Expect AKSNodeClass-hash on the NodeClass to be updated + // Expect aksnodeclass-hash on the NodeClass to be updated Expect(nodeClass.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHash, expectedHash)) Expect(nodeClass.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHashVersion, v1alpha2.AKSNodeClassHashVersion)) }) - It("should update AKSNodeClass-hash-versions on all NodeClaims when the AKSNodeClass-hash-version does not match with the controller hash version", func() { + It("should update aksnodeclass-hash-versions on all NodeClaims when the aksnodeclass-hash-version does not match with the controller hash version", func() { nodeClass.Annotations = map[string]string{ v1alpha2.AnnotationAKSNodeClassHash: "abceduefed", v1alpha2.AnnotationAKSNodeClassHashVersion: "test", @@ -178,13 +178,13 @@ var _ = Describe("NodeClass Hash Controller", func() { nodeClaimTwo = ExpectExists(ctx, env.Client, nodeClaimTwo) expectedHash := nodeClass.Hash() - // Expect AKSNodeClass-hash on the NodeClaims to be updated + // Expect aksnodeclass-hash on the NodeClaims to be updated Expect(nodeClaimOne.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHash, expectedHash)) Expect(nodeClaimOne.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHashVersion, v1alpha2.AKSNodeClassHashVersion)) Expect(nodeClaimTwo.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHash, expectedHash)) Expect(nodeClaimTwo.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHashVersion, v1alpha2.AKSNodeClassHashVersion)) }) - It("should not update AKSNodeClass-hash on all NodeClaims when the AKSNodeClass-hash-version matches the controller hash version", func() { + It("should not update aksnodeclass-hash on all NodeClaims when the aksnodeclass-hash-version matches the controller hash version", func() { nodeClass.Annotations = map[string]string{ v1alpha2.AnnotationAKSNodeClassHash: "abceduefed", v1alpha2.AnnotationAKSNodeClassHashVersion: "test-version", @@ -213,14 +213,14 @@ 
var _ = Describe("NodeClass Hash Controller", func() { expectedHash := nodeClass.Hash() - // Expect AKSNodeClass-hash on the NodeClass to be updated + // Expect aksnodeclass-hash on the NodeClass to be updated Expect(nodeClass.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHash, expectedHash)) Expect(nodeClass.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHashVersion, v1alpha2.AKSNodeClassHashVersion)) - // Expect AKSNodeClass-hash on the NodeClaims to stay the same + // Expect aksnodeclass-hash on the NodeClaims to stay the same Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHash, "1234564654")) Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHashVersion, v1alpha2.AKSNodeClassHashVersion)) }) - It("should not update AKSNodeClass-hash on the NodeClaim if it's drifted and the AKSNodeClass-hash-version does not match the controller hash version", func() { + It("should not update aksnodeclass-hash on the NodeClaim if it's drifted and the aksnodeclass-hash-version does not match the controller hash version", func() { nodeClass.Annotations = map[string]string{ v1alpha2.AnnotationAKSNodeClassHash: "abceduefed", v1alpha2.AnnotationAKSNodeClassHashVersion: "test", @@ -247,7 +247,7 @@ var _ = Describe("NodeClass Hash Controller", func() { ExpectObjectReconciled(ctx, env.Client, hashController, nodeClass) nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - // Expect AKSNodeClass-hash on the NodeClaims to stay the same + // Expect aksnodeclass-hash on the NodeClaims to stay the same Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHash, "123456")) Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHashVersion, v1alpha2.AKSNodeClassHashVersion)) }) diff --git a/pkg/test/aksnodeclass.go b/pkg/test/nodeclass.go similarity index 89% rename from pkg/test/aksnodeclass.go rename to pkg/test/nodeclass.go index 32ce145e0..400048572 100644 --- a/pkg/test/aksnodeclass.go +++ b/pkg/test/nodeclass.go @@ -36,17 +36,19 @@ func AKSNodeClass(overrides ...v1alpha2.AKSNodeClass) *v1alpha2.AKSNodeClass { panic(fmt.Sprintf("Failed to merge settings: %s", err)) } } - - nc := &v1alpha2.AKSNodeClass{ + // In reality, these default values will be set via the defaulting done by the API server. The reason we provide them here is + // we sometimes reference a test.AKSNodeClass without applying it, and in that case we need to set the default values ourselves + if options.Spec.OSDiskSizeGB == nil { + options.Spec.OSDiskSizeGB = lo.ToPtr[int32](128) + } + if options.Spec.ImageFamily == nil { + options.Spec.ImageFamily = lo.ToPtr(v1alpha2.Ubuntu2204ImageFamily) + } + return &v1alpha2.AKSNodeClass{ ObjectMeta: test.ObjectMeta(options.ObjectMeta), Spec: options.Spec, Status: options.Status, } - // In reality, these default values will be set via the defaulting done by the API server. 
The reason we provide them here is - // we sometimes reference a test.AKSNodeClass without applying it, and in that case we need to set the default values ourselves - nc.Spec.OSDiskSizeGB = lo.ToPtr[int32](128) - nc.Spec.ImageFamily = lo.ToPtr(v1alpha2.Ubuntu2204ImageFamily) - return nc } func AKSNodeClassFieldIndexer(ctx context.Context) func(cache.Cache) error { diff --git a/test/pkg/environment/azure/environment.go b/test/pkg/environment/azure/environment.go index b8f72f550..aa1ac839d 100644 --- a/test/pkg/environment/azure/environment.go +++ b/test/pkg/environment/azure/environment.go @@ -22,6 +22,7 @@ import ( "github.com/samber/lo" v1 "k8s.io/api/core/v1" karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" + coretest "sigs.k8s.io/karpenter/pkg/test" "github.com/Azure/karpenter-provider-azure/pkg/apis/v1alpha2" "github.com/Azure/karpenter-provider-azure/pkg/test" @@ -29,12 +30,11 @@ import ( ) func init() { - // TODO: should have core1beta1.NormalizedLabels too? karpv1.NormalizedLabels = lo.Assign(karpv1.NormalizedLabels, map[string]string{"topology.disk.csi.azure.com/zone": v1.LabelTopologyZone}) + coretest.DefaultImage = "mcr.microsoft.com/oss/kubernetes/pause:3.6" } const ( - WindowsDefaultImage = "mcr.microsoft.com/oss/kubernetes/pause:3.9" CiliumAgentNotReadyTaint = "node.cilium.io/agent-not-ready" ) diff --git a/test/pkg/environment/common/setup.go b/test/pkg/environment/common/setup.go index b56f71117..0787f5b6a 100644 --- a/test/pkg/environment/common/setup.go +++ b/test/pkg/environment/common/setup.go @@ -21,10 +21,6 @@ import ( "sync" "time" - "github.com/Azure/karpenter-provider-azure/pkg/apis/v1alpha2" - "github.com/Azure/karpenter-provider-azure/test/pkg/debug" - . "github.com/onsi/ginkgo/v2" //nolint:revive,stylecheck - . "github.com/onsi/gomega" //nolint:revive,stylecheck "github.com/samber/lo" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -40,6 +36,12 @@ import ( karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/test" "sigs.k8s.io/karpenter/pkg/utils/pod" + + "github.com/Azure/karpenter-provider-azure/pkg/apis/v1alpha2" + "github.com/Azure/karpenter-provider-azure/test/pkg/debug" + + . "github.com/onsi/ginkgo/v2" //nolint:revive,stylecheck + . "github.com/onsi/gomega" //nolint:revive,stylecheck ) const TestingFinalizer = "testing/finalizer" @@ -107,6 +109,7 @@ func (env *Environment) ExpectCleanCluster() { func (env *Environment) Cleanup() { env.CleanupObjects(CleanableObjects...) + env.EventuallyExpectNoLeakedKubeNodeLease() env.eventuallyExpectScaleDown() env.ExpectNoCrashes() } diff --git a/test/suites/chaos/suite_test.go b/test/suites/chaos/suite_test.go index fc00162b3..12bd3693a 100644 --- a/test/suites/chaos/suite_test.go +++ b/test/suites/chaos/suite_test.go @@ -87,7 +87,6 @@ var _ = Describe("Chaos", func() { dep := coretest.Deployment(coretest.DeploymentOptions{ Replicas: int32(numPods), PodOptions: coretest.PodOptions{ - Image: "mcr.microsoft.com/oss/kubernetes/pause:3.6", ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{"app": "my-app"}, }, diff --git a/test/suites/drift/suite_test.go b/test/suites/drift/suite_test.go index 4d7fce780..7d4882f15 100644 --- a/test/suites/drift/suite_test.go +++ b/test/suites/drift/suite_test.go @@ -14,26 +14,35 @@ See the License for the specific language governing permissions and limitations under the License. */ -package drift +package drift_test import ( "fmt" + "sort" "testing" "time" + "github.com/awslabs/operatorpkg/object" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" "github.com/samber/lo" + appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/apimachinery/pkg/util/sets" + "sigs.k8s.io/controller-runtime/pkg/client" + karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" + coretest "sigs.k8s.io/karpenter/pkg/test" + "github.com/Azure/go-autorest/autorest/to" "github.com/Azure/karpenter-provider-azure/pkg/apis/v1alpha2" + "github.com/Azure/karpenter-provider-azure/pkg/test" "github.com/Azure/karpenter-provider-azure/test/pkg/environment/azure" - - karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" - - coretest "sigs.k8s.io/karpenter/pkg/test" + "github.com/Azure/karpenter-provider-azure/test/pkg/environment/common" ) var env *azure.Environment @@ -45,6 +54,9 @@ func TestDrift(t *testing.T) { BeforeSuite(func() { env = azure.NewEnvironment(t) }) + AfterSuite(func() { + env.Stop() + }) RunSpecs(t, "Drift") } @@ -57,71 +69,652 @@ var _ = AfterEach(func() { env.Cleanup() }) var _ = AfterEach(func() { env.AfterEach() }) var _ = Describe("Drift", func() { - - var pod *corev1.Pod - + var dep *appsv1.Deployment + var selector labels.Selector + var numPods int BeforeEach(func() { - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ - NodeSelectorRequirement: corev1.NodeSelectorRequirement{ - Key: corev1.LabelInstanceTypeStable, - Operator: corev1.NodeSelectorOpIn, - Values: []string{"Standard_DS2_v2"}, - }}) - - // Add a do-not-disrupt pod so that we can check node metadata before we disrupt - pod = coretest.Pod(coretest.PodOptions{ - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{ - karpv1.DoNotDisruptAnnotationKey: "true", + numPods = 1 + // Add pods with a do-not-disrupt annotation so that we can check node metadata before we disrupt + dep = coretest.Deployment(coretest.DeploymentOptions{ + Replicas: int32(numPods), + PodOptions: coretest.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "app": "my-app", + }, + Annotations: map[string]string{ + karpv1.DoNotDisruptAnnotationKey: "true", + }, }, + TerminationGracePeriodSeconds: lo.ToPtr[int64](0), }, - ResourceRequirements: corev1.ResourceRequirements{Requests: corev1.ResourceList{corev1.ResourceCPU: resource.MustParse("0.5")}}, - Image: "mcr.microsoft.com/oss/kubernetes/pause:3.6", }) + selector = labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) }) + Context("Budgets", func() { + It("should respect budgets for empty drift", func() { + nodePool = coretest.ReplaceRequirements(nodePool, + karpv1.NodeSelectorRequirementWithMinValues{ + NodeSelectorRequirement: corev1.NodeSelectorRequirement{ + Key: corev1.LabelInstanceTypeStable, + Operator: corev1.NodeSelectorOpIn, + Values: []string{"Standard_DS4_v2"}, + }, + }, + ) + // We're expecting to create 3 nodes, so we'll expect to see 2 nodes deleting at one time. + nodePool.Spec.Disruption.Budgets = []karpv1.Budget{{ + Nodes: "50%", + }} + var numPods int32 = 6 + dep = coretest.Deployment(coretest.DeploymentOptions{ + Replicas: numPods, + PodOptions: coretest.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + karpv1.DoNotDisruptAnnotationKey: "true", + }, + Labels: map[string]string{"app": "large-app"}, + }, + // Each Standard_DS4_v2 has 8 cpu, so each node should fit 2 pods. 
+ ResourceRequirements: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("3"), + }, + }, + }, + }) + selector = labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + env.ExpectCreated(nodeClass, nodePool, dep) - // TODO: Add budget tests + nodeClaims := env.EventuallyExpectCreatedNodeClaimCount("==", 3) + nodes := env.EventuallyExpectCreatedNodeCount("==", 3) + env.EventuallyExpectHealthyPodCount(selector, int(numPods)) - It("should deprovision nodes that have drifted due to labels", func() { + // List nodes so that we get any updated information on the nodes. If we don't + // we have the potential to over-write any changes Karpenter makes to the nodes. + // Add a finalizer to each node so that we can stop termination disruptions + By("adding finalizers to the nodes to prevent termination") + for _, node := range nodes { + Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(node), node)).To(Succeed()) + node.Finalizers = append(node.Finalizers, common.TestingFinalizer) + env.ExpectUpdated(node) + } - By(fmt.Sprintf("creating pod %s, nodepool %s, and nodeclass %s", pod.Name, nodePool.Name, nodeClass.Name)) - env.ExpectCreated(pod, nodeClass, nodePool) + By("making the nodes empty") + // Delete the deployment to make all nodes empty. + env.ExpectDeleted(dep) - By(fmt.Sprintf("expect pod %s to be healthy", pod.Name)) - env.EventuallyExpectHealthy(pod) + // Drift the nodeclaims + By("drift the nodeclaims") + nodePool.Spec.Template.Annotations = map[string]string{"test": "annotation"} + env.ExpectUpdated(nodePool) - By("expect created node count to be 1") - env.ExpectCreatedNodeCount("==", 1) + env.EventuallyExpectDrifted(nodeClaims...) + + env.ConsistentlyExpectDisruptionsUntilNoneLeft(3, 2, 5*time.Minute) + }) + It("should respect budgets for non-empty delete drift", func() { + nodePool = coretest.ReplaceRequirements(nodePool, + karpv1.NodeSelectorRequirementWithMinValues{ + NodeSelectorRequirement: corev1.NodeSelectorRequirement{ + Key: corev1.LabelInstanceTypeStable, + Operator: corev1.NodeSelectorOpIn, + Values: []string{"Standard_DS4_v2"}, + }, + }, + ) + // We're expecting to create 3 nodes, so we'll expect to see at most 2 nodes deleting at one time. + nodePool.Spec.Disruption.Budgets = []karpv1.Budget{{ + Nodes: "50%", + }} + var numPods int32 = 9 + dep = coretest.Deployment(coretest.DeploymentOptions{ + Replicas: numPods, + PodOptions: coretest.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + karpv1.DoNotDisruptAnnotationKey: "true", + }, + Labels: map[string]string{"app": "large-app"}, + }, + // Each Standard_DS4_v2 has 8 cpu, so each node should fit no more than 3 pods. + ResourceRequirements: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("2100m"), + }, + }, + }, + }) + selector = labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + env.ExpectCreated(nodeClass, nodePool, dep) + + nodeClaims := env.EventuallyExpectCreatedNodeClaimCount("==", 3) + nodes := env.EventuallyExpectCreatedNodeCount("==", 3) + env.EventuallyExpectHealthyPodCount(selector, int(numPods)) + + By("scaling down the deployment") + // Update the deployment to a third of the replicas. + dep.Spec.Replicas = lo.ToPtr[int32](3) + env.ExpectUpdated(dep) + + // First expect there to be 3 pods, then try to spread the pods. + env.EventuallyExpectHealthyPodCount(selector, 3) + env.ForcePodsToSpread(nodes...) 
+ env.EventuallyExpectHealthyPodCount(selector, 3) + + By("cordoning and adding finalizer to the nodes") + // Add a finalizer to each node so that we can stop termination disruptions + for _, node := range nodes { + Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(node), node)).To(Succeed()) + node.Finalizers = append(node.Finalizers, common.TestingFinalizer) + env.ExpectUpdated(node) + } + + By("drifting the nodes") + // Drift the nodeclaims + nodePool.Spec.Template.Annotations = map[string]string{"test": "annotation"} + env.ExpectUpdated(nodePool) + + env.EventuallyExpectDrifted(nodeClaims...) + + By("enabling disruption by removing the do not disrupt annotation") + pods := env.EventuallyExpectHealthyPodCount(selector, 3) + // Remove the do-not-disrupt annotation so that the nodes are now disruptable + for _, pod := range pods { + delete(pod.Annotations, karpv1.DoNotDisruptAnnotationKey) + env.ExpectUpdated(pod) + } + + env.ConsistentlyExpectDisruptionsUntilNoneLeft(3, 2, 5*time.Minute) + }) + It("should respect budgets for non-empty replace drift", func() { + appLabels := map[string]string{"app": "large-app"} + nodePool.Labels = appLabels + // We're expecting to create 5 nodes, so we'll expect to see at most 3 nodes deleting at one time. + nodePool.Spec.Disruption.Budgets = []karpv1.Budget{{ + Nodes: "3", + }} + + // Create a 5 pod deployment with hostname inter-pod anti-affinity to ensure each pod is placed on a unique node + numPods = 5 + selector = labels.SelectorFromSet(appLabels) + deployment := coretest.Deployment(coretest.DeploymentOptions{ + Replicas: int32(numPods), + PodOptions: coretest.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: appLabels, + }, + PodAntiRequirements: []corev1.PodAffinityTerm{{ + TopologyKey: corev1.LabelHostname, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: appLabels, + }, + }}, + }, + }) + + env.ExpectCreated(nodeClass, nodePool, deployment) + + originalNodeClaims := env.EventuallyExpectCreatedNodeClaimCount("==", numPods) + originalNodes := env.EventuallyExpectCreatedNodeCount("==", numPods) + + // Check that all deployment pods are online + env.EventuallyExpectHealthyPodCount(selector, numPods) + By("cordoning and adding finalizer to the nodes") + // Add a finalizer to each node so that we can stop termination disruptions + for _, node := range originalNodes { + Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(node), node)).To(Succeed()) + node.Finalizers = append(node.Finalizers, common.TestingFinalizer) + env.ExpectUpdated(node) + } + + By("drifting the nodepool") + nodePool.Spec.Template.Annotations = lo.Assign(nodePool.Spec.Template.Annotations, map[string]string{"test-annotation": "drift"}) + env.ExpectUpdated(nodePool) + env.ConsistentlyExpectDisruptionsUntilNoneLeft(5, 3, 10*time.Minute) + + for _, node := range originalNodes { + Expect(env.ExpectTestingFinalizerRemoved(node)).To(Succeed()) + } + for _, nodeClaim := range originalNodeClaims { + Expect(env.ExpectTestingFinalizerRemoved(nodeClaim)).To(Succeed()) + } + + // Eventually expect all the nodes to be rolled and completely removed + // Since this completes the disruption operation, this also ensures that we aren't leaking nodes into subsequent + // tests since nodeclaims that are actively replacing but haven't brought-up nodes yet can register nodes later + env.EventuallyExpectNotFound(lo.Map(originalNodes, func(n *corev1.Node, _ int) client.Object { return n })...) 
+ env.EventuallyExpectNotFound(lo.Map(originalNodeClaims, func(n *karpv1.NodeClaim, _ int) client.Object { return n })...) + env.ExpectNodeClaimCount("==", 5) + env.ExpectNodeCount("==", 5) + }) + It("should not allow drift if the budget is fully blocking", func() { + // We're going to define a budget that doesn't allow any drift to happen + nodePool.Spec.Disruption.Budgets = []karpv1.Budget{{ + Nodes: "0", + }} + + dep.Spec.Template.Annotations = nil + env.ExpectCreated(nodeClass, nodePool, dep) + + nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + env.EventuallyExpectCreatedNodeCount("==", 1) + env.EventuallyExpectHealthyPodCount(selector, numPods) + + By("drifting the nodes") + // Drift the nodeclaims + nodePool.Spec.Template.Annotations = map[string]string{"test": "annotation"} + env.ExpectUpdated(nodePool) + + env.EventuallyExpectDrifted(nodeClaim) + env.ConsistentlyExpectNoDisruptions(1, time.Minute) + }) + It("should not allow drift if the budget is fully blocking during a scheduled time", func() { + // We're going to define a budget that doesn't allow any drift to happen + // This is going to be on a schedule that only lasts 30 minutes, whose window starts 15 minutes before + // the current time and extends 15 minutes past the current time + // Times need to be in UTC since the karpenter containers were built in UTC time + windowStart := time.Now().Add(-time.Minute * 15).UTC() + nodePool.Spec.Disruption.Budgets = []karpv1.Budget{{ + Nodes: "0", + Schedule: lo.ToPtr(fmt.Sprintf("%d %d * * *", windowStart.Minute(), windowStart.Hour())), + Duration: &metav1.Duration{Duration: time.Minute * 30}, + }} + + dep.Spec.Template.Annotations = nil + env.ExpectCreated(nodeClass, nodePool, dep) + + nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + env.EventuallyExpectCreatedNodeCount("==", 1) + env.EventuallyExpectHealthyPodCount(selector, numPods) + + By("drifting the nodes") + // Drift the nodeclaims + nodePool.Spec.Template.Annotations = map[string]string{"test": "annotation"} + env.ExpectUpdated(nodePool) + + env.EventuallyExpectDrifted(nodeClaim) + env.ConsistentlyExpectNoDisruptions(1, time.Minute) + }) + }) + + It("should disrupt nodes that have drifted due to images", func() { + // TODO + }) + + DescribeTable("NodePool Drift", func(nodeClaimTemplate karpv1.NodeClaimTemplate) { + updatedNodePool := coretest.NodePool( + karpv1.NodePool{ + Spec: karpv1.NodePoolSpec{ + Template: karpv1.NodeClaimTemplate{ + Spec: karpv1.NodeClaimTemplateSpec{ + NodeClassRef: &karpv1.NodeClassReference{ + Group: object.GVK(nodeClass).Group, + Kind: object.GVK(nodeClass).Kind, + Name: nodeClass.Name, + }, + // keep the same instance type requirements to prevent considering instance types that require swap + Requirements: nodePool.Spec.Template.Spec.Requirements, + }, + }, + }, + }, + karpv1.NodePool{ + Spec: karpv1.NodePoolSpec{ + Template: nodeClaimTemplate, + }, + }, + ) + updatedNodePool.ObjectMeta = nodePool.ObjectMeta + + env.ExpectCreated(dep, nodeClass, nodePool) + pod := env.EventuallyExpectHealthyPodCount(selector, numPods)[0] nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] - node := env.EventuallyExpectNodeCount("==", 1)[0] + node := env.ExpectCreatedNodeCount("==", 1)[0] - By(fmt.Sprintf("waiting for nodepool %s update", nodePool.Name)) - nodePool.Spec.Template.Labels["triggerdrift"] = "value" - env.ExpectCreatedOrUpdated(nodePool) + env.ExpectCreatedOrUpdated(updatedNodePool) - By(fmt.Sprintf("waiting for nodeclaim %s to be marked as drifted", 
nodeClaim.Name)) env.EventuallyExpectDrifted(nodeClaim) - By(fmt.Sprintf("waiting for pod %s to to update", pod.Name)) delete(pod.Annotations, karpv1.DoNotDisruptAnnotationKey) env.ExpectUpdated(pod) - By(fmt.Sprintf("expect pod %s, nodeclaim %s, and node %s to eventually not exist", pod.Name, nodeClaim.Name, node.Name)) - SetDefaultEventuallyTimeout(10 * time.Minute) - env.EventuallyExpectNotFound(pod, nodeClaim, node) - SetDefaultEventuallyTimeout(5 * time.Minute) + // Nodes will need to have the start-up taint removed before the node can be considered as initialized + fmt.Println(CurrentSpecReport().LeafNodeText) + if CurrentSpecReport().LeafNodeText == "Start-up Taints" { + nodes := env.EventuallyExpectCreatedNodeCount("==", 2) + sort.Slice(nodes, func(i int, j int) bool { + return nodes[i].CreationTimestamp.Before(&nodes[j].CreationTimestamp) + }) + nodeTwo := nodes[1] + // Remove the startup taints from the new nodes to initialize them + Eventually(func(g Gomega) { + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeTwo), nodeTwo)).To(Succeed()) + g.Expect(len(nodeTwo.Spec.Taints)).To(BeNumerically("==", 1)) + _, found := lo.Find(nodeTwo.Spec.Taints, func(t corev1.Taint) bool { + return t.MatchTaint(&corev1.Taint{Key: "example.com/another-taint-2", Effect: corev1.TaintEffectPreferNoSchedule}) + }) + g.Expect(found).To(BeTrue()) + stored := nodeTwo.DeepCopy() + nodeTwo.Spec.Taints = lo.Reject(nodeTwo.Spec.Taints, func(t corev1.Taint, _ int) bool { return t.Key == "example.com/another-taint-2" }) + g.Expect(env.Client.Patch(env.Context, nodeTwo, client.StrategicMergeFrom(stored))).To(Succeed()) + }).Should(Succeed()) + } + env.EventuallyExpectNotFound(pod, node) + env.EventuallyExpectHealthyPodCount(selector, numPods) + }, + Entry("Annotations", karpv1.NodeClaimTemplate{ + ObjectMeta: karpv1.ObjectMeta{ + Annotations: map[string]string{"keyAnnotationTest": "valueAnnotationTest"}, + }, + }), + Entry("Labels", karpv1.NodeClaimTemplate{ + ObjectMeta: karpv1.ObjectMeta{ + Labels: map[string]string{"keyLabelTest": "valueLabelTest"}, + }, + }), + Entry("Taints", karpv1.NodeClaimTemplate{ + Spec: karpv1.NodeClaimTemplateSpec{ + Taints: []corev1.Taint{{Key: "example.com/another-taint-2", Effect: corev1.TaintEffectPreferNoSchedule}}, + }, + }), + Entry("Start-up Taints", karpv1.NodeClaimTemplate{ + Spec: karpv1.NodeClaimTemplateSpec{ + StartupTaints: []corev1.Taint{{Key: "example.com/another-taint-2", Effect: corev1.TaintEffectPreferNoSchedule}}, + }, + }), + Entry("NodeRequirements", karpv1.NodeClaimTemplate{ + Spec: karpv1.NodeClaimTemplateSpec{ + // since this will overwrite the default requirements, add instance category and family selectors back into requirements + Requirements: []karpv1.NodeSelectorRequirementWithMinValues{ + {NodeSelectorRequirement: corev1.NodeSelectorRequirement{Key: karpv1.CapacityTypeLabelKey, Operator: corev1.NodeSelectorOpIn, Values: []string{karpv1.CapacityTypeSpot}}}, + {NodeSelectorRequirement: corev1.NodeSelectorRequirement{Key: corev1.LabelInstanceTypeStable, Operator: corev1.NodeSelectorOpIn, Values: []string{"Standard_DS4_v2"}}}, + }, + }, + }), + ) + DescribeTable("AKSNodeClass", func(nodeClassSpec v1alpha2.AKSNodeClassSpec) { + updatedNodeClass := test.AKSNodeClass(v1alpha2.AKSNodeClass{Spec: *nodeClass.Spec.DeepCopy()}, v1alpha2.AKSNodeClass{Spec: nodeClassSpec}) + updatedNodeClass.ObjectMeta = nodeClass.ObjectMeta + + env.ExpectCreated(dep, nodeClass, nodePool) + pod := env.EventuallyExpectHealthyPodCount(selector, numPods)[0] + 
nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + node := env.ExpectCreatedNodeCount("==", 1)[0] + + env.ExpectCreatedOrUpdated(updatedNodeClass) + + env.EventuallyExpectDrifted(nodeClaim) + + delete(pod.Annotations, karpv1.DoNotDisruptAnnotationKey) + env.ExpectUpdated(pod) + env.EventuallyExpectNotFound(pod, node) + env.EventuallyExpectHealthyPodCount(selector, numPods) + }, + // VNETSubnetID tested separately + Entry("OSDiskSizeGB", v1alpha2.AKSNodeClassSpec{OSDiskSizeGB: to.Int32Ptr(100)}), + // ImageID TBD + Entry("ImageFamily", v1alpha2.AKSNodeClassSpec{ImageFamily: to.StringPtr("AzureLinux")}), + Entry("Tags", v1alpha2.AKSNodeClassSpec{Tags: map[string]string{"keyTag-test-3": "valueTag-test-3"}}), + Entry("KubeletConfiguration", v1alpha2.AKSNodeClassSpec{ + Kubelet: &v1alpha2.KubeletConfiguration{ + ImageGCLowThresholdPercent: to.Int32Ptr(10), + ImageGCHighThresholdPercent: to.Int32Ptr(90), + }, + }), + Entry("MaxPods", v1alpha2.AKSNodeClassSpec{MaxPods: to.Int32Ptr(10)}), + ) + + It("should update the nodepool-hash annotation on the nodepool and nodeclaim when the nodepool's nodepool-hash-version annotation does not match the controller hash version", func() { + env.ExpectCreated(dep, nodeClass, nodePool) + env.EventuallyExpectHealthyPodCount(selector, numPods) + nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + nodePool = env.ExpectExists(nodePool).(*karpv1.NodePool) + expectedHash := nodePool.Hash() + + By(fmt.Sprintf("expect nodepool %s and nodeclaim %s to contain %s and %s annotations", nodePool.Name, nodeClaim.Name, karpv1.NodePoolHashAnnotationKey, karpv1.NodePoolHashVersionAnnotationKey)) + Eventually(func(g Gomega) { + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodePool), nodePool)).To(Succeed()) + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed()) + + g.Expect(nodePool.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashAnnotationKey, expectedHash)) + g.Expect(nodePool.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashVersionAnnotationKey, karpv1.NodePoolHashVersion)) + g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashAnnotationKey, expectedHash)) + g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashVersionAnnotationKey, karpv1.NodePoolHashVersion)) + }).WithTimeout(30 * time.Second).Should(Succeed()) + + nodePool.Annotations = lo.Assign(nodePool.Annotations, map[string]string{ + karpv1.NodePoolHashAnnotationKey: "test-hash-1", + karpv1.NodePoolHashVersionAnnotationKey: "test-hash-version-1", + }) + // Updating `nodePool.Spec.Template.Annotations` would normally trigger drift on all nodeclaims owned by the + // nodepool. 
However, the nodepool-hash-version does not match the controller hash version, so we will see that + // none of the nodeclaims will be drifted and all nodeclaims will have an updated `nodepool-hash` and `nodepool-hash-version` annotation + nodePool.Spec.Template.Annotations = lo.Assign(nodePool.Spec.Template.Annotations, map[string]string{ + "test-key": "test-value", + }) + nodeClaim.Annotations = lo.Assign(nodePool.Annotations, map[string]string{ + karpv1.NodePoolHashAnnotationKey: "test-hash-2", + karpv1.NodePoolHashVersionAnnotationKey: "test-hash-version-2", + }) + + // The nodeclaim will need to be updated first, as the hash controller will only be triggered on changes to the nodepool + env.ExpectUpdated(nodeClaim, nodePool) + expectedHash = nodePool.Hash() + + // Expect all nodeclaims not to be drifted and contain an updated `nodepool-hash` and `nodepool-hash-version` annotation + Eventually(func(g Gomega) { + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodePool), nodePool)).To(Succeed()) + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed()) + + g.Expect(nodePool.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashAnnotationKey, expectedHash)) + g.Expect(nodePool.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashVersionAnnotationKey, karpv1.NodePoolHashVersion)) + g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashAnnotationKey, expectedHash)) + g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(karpv1.NodePoolHashVersionAnnotationKey, karpv1.NodePoolHashVersion)) + }) }) - It("should mark the nodeclaim as drifted for SubnetDrift if AKSNodeClass subnet id changes", func() { - env.ExpectCreated(pod, nodeClass, nodePool) + It("should update the aksnodeclass-hash annotation on the aksnodeclass and nodeclaim when the aksnodeclass's aksnodeclass-hash-version annotation does not match the controller hash version", func() { + env.ExpectCreated(dep, nodeClass, nodePool) + env.EventuallyExpectHealthyPodCount(selector, numPods) + nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + nodeClass = env.ExpectExists(nodeClass).(*v1alpha2.AKSNodeClass) + expectedHash := nodeClass.Hash() - By(fmt.Sprintf("expect pod %s to be healthy", pod.Name)) - env.EventuallyExpectHealthy(pod) + By(fmt.Sprintf("expect nodeclass %s and nodeclaim %s to contain %s and %s annotations", nodeClass.Name, nodeClaim.Name, v1alpha2.AnnotationAKSNodeClassHash, v1alpha2.AnnotationAKSNodeClassHashVersion)) + Eventually(func(g Gomega) { + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClass), nodeClass)).To(Succeed()) + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed()) - By("expect created node count to be 1") - env.ExpectCreatedNodeCount("==", 1) + g.Expect(nodeClass.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHash, expectedHash)) + g.Expect(nodeClass.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHashVersion, v1alpha2.AKSNodeClassHashVersion)) + g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHash, expectedHash)) + g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHashVersion, v1alpha2.AKSNodeClassHashVersion)) + }).WithTimeout(30 * time.Second).Should(Succeed()) + + nodeClass.Annotations = lo.Assign(nodeClass.Annotations, map[string]string{ + v1alpha2.AnnotationAKSNodeClassHash: "test-hash-1", + v1alpha2.AnnotationAKSNodeClassHashVersion: 
"test-hash-version-1", + }) + // Updating `nodeClass.Spec.Tags` would normally trigger drift on all nodeclaims using the + // nodeclass. However, the aksnodeclass-hash-version does not match the controller hash version, so we will see that + // none of the nodeclaims will be drifted and all nodeclaims will have an updated `aksnodeclass-hash` and `aksnodeclass-hash-version` annotation + nodeClass.Spec.Tags = lo.Assign(nodeClass.Spec.Tags, map[string]string{ + "test-key": "test-value", + }) + nodeClaim.Annotations = lo.Assign(nodePool.Annotations, map[string]string{ + v1alpha2.AnnotationAKSNodeClassHash: "test-hash-2", + v1alpha2.AnnotationAKSNodeClassHashVersion: "test-hash-version-2", + }) + + // The nodeclaim will need to be updated first, as the hash controller will only be triggered on changes to the nodeclass + env.ExpectUpdated(nodeClaim, nodeClass) + expectedHash = nodeClass.Hash() + + // Expect all nodeclaims not to be drifted and contain an updated `nodepool-hash` and `nodepool-hash-version` annotation + Eventually(func(g Gomega) { + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClass), nodeClass)).To(Succeed()) + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed()) + g.Expect(nodeClass.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHash, expectedHash)) + g.Expect(nodeClass.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHashVersion, v1alpha2.AKSNodeClassHashVersion)) + g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHash, expectedHash)) + g.Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1alpha2.AnnotationAKSNodeClassHashVersion, v1alpha2.AKSNodeClassHashVersion)) + }).WithTimeout(30 * time.Second).Should(Succeed()) + env.ConsistentlyExpectNodeClaimsNotDrifted(time.Minute, nodeClaim) + }) + Context("Failure", func() { + It("should not disrupt a drifted node if the replacement node never registers", func() { + // launch a new nodeClaim + var numPods int32 = 2 + dep = coretest.Deployment(coretest.DeploymentOptions{ + Replicas: 2, + PodOptions: coretest.PodOptions{ + ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "inflate"}}, + PodAntiRequirements: []corev1.PodAffinityTerm{{ + TopologyKey: corev1.LabelHostname, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "inflate"}, + }}, + }, + }, + }) + env.ExpectCreated(dep, nodeClass, nodePool) + + startingNodeClaimState := env.EventuallyExpectCreatedNodeClaimCount("==", int(numPods)) + env.EventuallyExpectCreatedNodeCount("==", int(numPods)) + + // Drift the nodeClaim with bad configuration that will not register a NodeClaim + nodeClass.Spec.VNETSubnetID = lo.ToPtr("/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/sillygeese/providers/Microsoft.Network/virtualNetworks/karpenter/subnets/nodeclassSubnet2") + env.ExpectCreatedOrUpdated(nodeClass) + + env.EventuallyExpectDrifted(startingNodeClaimState...) + + // Expect only a single node to be tainted due to default disruption budgets + taintedNodes := env.EventuallyExpectTaintedNodeCount("==", 1) + + // Drift should fail and the original node should be untainted + // TODO: reduce timeouts when disruption waits are factored out + env.EventuallyExpectNodesUntaintedWithTimeout(11*time.Minute, taintedNodes...) + + // Expect all the NodeClaims that existed on the initial provisioning loop are not removed. 
+ // Assert this over several minutes to ensure a subsequent disruption controller pass doesn't + // successfully schedule the evicted pods to the in-flight nodeclaim and disrupt the original node + Consistently(func(g Gomega) { + nodeClaims := &karpv1.NodeClaimList{} + g.Expect(env.Client.List(env, nodeClaims, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed()) + startingNodeClaimUIDs := sets.New(lo.Map(startingNodeClaimState, func(nc *karpv1.NodeClaim, _ int) types.UID { return nc.UID })...) + nodeClaimUIDs := sets.New(lo.Map(nodeClaims.Items, func(nc karpv1.NodeClaim, _ int) types.UID { return nc.UID })...) + g.Expect(nodeClaimUIDs.IsSuperset(startingNodeClaimUIDs)).To(BeTrue()) + }, "2m").Should(Succeed()) + + }) + It("should not disrupt a drifted node if the replacement node registers but never initialized", func() { + // launch a new nodeClaim + var numPods int32 = 2 + dep = coretest.Deployment(coretest.DeploymentOptions{ + Replicas: 2, + PodOptions: coretest.PodOptions{ + ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "inflate"}}, + PodAntiRequirements: []corev1.PodAffinityTerm{{ + TopologyKey: corev1.LabelHostname, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "inflate"}, + }}, + }, + }, + }) + env.ExpectCreated(dep, nodeClass, nodePool) + + startingNodeClaimState := env.EventuallyExpectCreatedNodeClaimCount("==", int(numPods)) + env.EventuallyExpectCreatedNodeCount("==", int(numPods)) + + // Drift the nodeClaim with bad configuration that never initializes + nodePool.Spec.Template.Spec.StartupTaints = []corev1.Taint{{Key: "example.com/taint", Effect: corev1.TaintEffectPreferNoSchedule}} + env.ExpectCreatedOrUpdated(nodePool) + + env.EventuallyExpectDrifted(startingNodeClaimState...) + + // Expect only a single node to get tainted due to default disruption budgets + taintedNodes := env.EventuallyExpectTaintedNodeCount("==", 1) + + // Drift should fail and original node should be untainted + // TODO: reduce timeouts when disruption waits are factored out + env.EventuallyExpectNodesUntaintedWithTimeout(11*time.Minute, taintedNodes...) + + // Expect that the new nodeClaim/node is kept around after the un-cordon + nodeList := &corev1.NodeList{} + Expect(env.Client.List(env, nodeList, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed()) + Expect(nodeList.Items).To(HaveLen(int(numPods) + 1)) + + nodeClaimList := &karpv1.NodeClaimList{} + Expect(env.Client.List(env, nodeClaimList, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed()) + Expect(nodeClaimList.Items).To(HaveLen(int(numPods) + 1)) + + // Expect all the NodeClaims that existed on the initial provisioning loop are not removed + // Assert this over several minutes to ensure a subsequent disruption controller pass doesn't + // successfully schedule the evicted pods to the in-flight nodeclaim and disrupt the original node + Consistently(func(g Gomega) { + nodeClaims := &karpv1.NodeClaimList{} + g.Expect(env.Client.List(env, nodeClaims, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed()) + startingNodeClaimUIDs := sets.New(lo.Map(startingNodeClaimState, func(m *karpv1.NodeClaim, _ int) types.UID { return m.UID })...) + nodeClaimUIDs := sets.New(lo.Map(nodeClaims.Items, func(m karpv1.NodeClaim, _ int) types.UID { return m.UID })...) 
+ g.Expect(nodeClaimUIDs.IsSuperset(startingNodeClaimUIDs)).To(BeTrue()) + }, "2m").Should(Succeed()) + }) + It("should not drift any nodes if their PodDisruptionBudgets are unhealthy", func() { + // Create a deployment that contains a readiness probe that will never succeed + // This way, the pod will bind to the node, but the PodDisruptionBudget will never go healthy + var numPods int32 = 2 + dep = coretest.Deployment(coretest.DeploymentOptions{ + Replicas: 2, + PodOptions: coretest.PodOptions{ + ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "inflate"}}, + PodAntiRequirements: []corev1.PodAffinityTerm{{ + TopologyKey: corev1.LabelHostname, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "inflate"}, + }}, + }, + ReadinessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Port: intstr.FromInt32(80), + }, + }, + }, + }, + }) + selector = labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + minAvailable := intstr.FromInt32(numPods - 1) + pdb := coretest.PodDisruptionBudget(coretest.PDBOptions{ + Labels: dep.Spec.Template.Labels, + MinAvailable: &minAvailable, + }) + env.ExpectCreated(dep, nodeClass, nodePool, pdb) + + nodeClaims := env.EventuallyExpectCreatedNodeClaimCount("==", int(numPods)) + env.EventuallyExpectCreatedNodeCount("==", int(numPods)) + + // Expect pods to be bound but not to be ready since we are intentionally failing the readiness check + env.EventuallyExpectBoundPodCount(selector, int(numPods)) + + // Drift the nodeclaims + nodePool.Spec.Template.Annotations = map[string]string{"test": "annotation"} + env.ExpectUpdated(nodePool) + + env.EventuallyExpectDrifted(nodeClaims...) + env.ConsistentlyExpectNoDisruptions(int(numPods), time.Minute) + }) + }) + + It("should disrupt nodes that have drifted due to VNETSubnetID", func() { + env.ExpectCreated(dep, nodeClass, nodePool) + env.EventuallyExpectHealthyPodCount(selector, numPods) nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + By("expect created node count to be 1") + env.ExpectCreatedNodeCount("==", 1) By("triggering subnet drift") // TODO: Introduce azure clients to the tests to get values dynamically and be able to create azure resources inside of tests rather than using a fake id. // this will fail to actually create a new nodeclaim for the drift replacement but should still test that we are marking the nodeclaim as drifted. diff --git a/test/suites/gpu/suite_test.go b/test/suites/gpu/suite_test.go index 704c45fac..5ef4d5fb5 100644 --- a/test/suites/gpu/suite_test.go +++ b/test/suites/gpu/suite_test.go @@ -75,7 +75,6 @@ var _ = Describe("GPU", func() { "app": "samples-tf-mnist-demo", }, }, - Image: "mcr.microsoft.com/oss/kubernetes/pause:3.6", ResourceRequirements: v1.ResourceRequirements{ Limits: v1.ResourceList{ "nvidia.com/gpu": resource.MustParse("1"), diff --git a/test/suites/utilization/suite_test.go b/test/suites/utilization/suite_test.go index dc50b4357..6768d6bb7 100644 --- a/test/suites/utilization/suite_test.go +++ b/test/suites/utilization/suite_test.go @@ -80,7 +80,6 @@ func ExpectProvisionPodPerNode(getNodeClass func() *v1alpha2.AKSNodeClass, getNo v1.ResourceCPU: resource.MustParse("1.1"), }, }, - Image: "mcr.microsoft.com/oss/kubernetes/pause:3.6", }, })
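Note on the core API change at the top of this diff: dropping the `hash:"ignore"` struct tag from `AKSNodeClassSpec.Kubelet` means kubelet configuration now feeds into `AKSNodeClass.Hash()`, which is what lets kubelet edits surface as drift and why the new `aksnodeclass_hash_test.go` pins a static hash for the `Kubelet` entry. A minimal sketch of the effect, assuming the provider hashes its spec with `github.com/mitchellh/hashstructure/v2` the way the upstream Karpenter providers do; the trimmed-down types below are hypothetical stand-ins, not the real `v1alpha2` API types:

```go
package main

import (
	"fmt"

	"github.com/mitchellh/hashstructure/v2"
)

// Hypothetical, trimmed-down stand-ins for KubeletConfiguration and AKSNodeClassSpec,
// used only to illustrate what the `hash:"ignore"` struct tag does.
type kubeletConfig struct {
	CPUManagerPolicy string
}

type specIgnoringKubelet struct {
	OSDiskSizeGB int32
	Kubelet      *kubeletConfig `hash:"ignore"` // old behavior: excluded from the hash
}

type specHashingKubelet struct {
	OSDiskSizeGB int32
	Kubelet      *kubeletConfig // new behavior: included in the hash
}

func mustHash(v any) uint64 {
	// Options chosen to resemble a typical Karpenter Hash() implementation (assumption).
	h, err := hashstructure.Hash(v, hashstructure.FormatV2, &hashstructure.HashOptions{
		SlicesAsSets:    true,
		IgnoreZeroValue: true,
		ZeroNil:         true,
	})
	if err != nil {
		panic(err)
	}
	return h
}

func main() {
	// With the tag, changing only the kubelet configuration leaves the hash unchanged,
	// so the hash controller never sees a difference and no drift is detected.
	before := specIgnoringKubelet{OSDiskSizeGB: 128, Kubelet: &kubeletConfig{CPUManagerPolicy: "none"}}
	after := specIgnoringKubelet{OSDiskSizeGB: 128, Kubelet: &kubeletConfig{CPUManagerPolicy: "static"}}
	fmt.Println(mustHash(before) == mustHash(after)) // true

	// Without the tag, the same kubelet edit changes the hash, which can mark NodeClaims as drifted.
	before2 := specHashingKubelet{OSDiskSizeGB: 128, Kubelet: &kubeletConfig{CPUManagerPolicy: "none"}}
	after2 := specHashingKubelet{OSDiskSizeGB: 128, Kubelet: &kubeletConfig{CPUManagerPolicy: "static"}}
	fmt.Println(mustHash(before2) == mustHash(after2)) // false
}
```

The exact hash options and any hash-version handling in the real `Hash()` may differ from this sketch; it only demonstrates why removing the tag changes the computed hash when `spec.kubelet` changes, which is the behavior the static-hash entries in the new test lock in.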