diff --git a/test/pkg/environment/common/expectations.go b/test/pkg/environment/common/expectations.go
index 754d1eb5f..af3876eee 100644
--- a/test/pkg/environment/common/expectations.go
+++ b/test/pkg/environment/common/expectations.go
@@ -91,6 +91,36 @@ func (env *Environment) ExpectUpdated(objects ...client.Object) {
 	}
 }
 
+// ExpectStatusUpdated will update objects in the cluster to match the inputs.
+// WARNING: This ignores the resource version check, which can result in
+// overwriting changes made by other controllers in the cluster.
+// This is useful in ensuring that we can clean up resources by patching
+// out finalizers.
+// Grab the object before making the updates to reduce the chance of this race.
+func (env *Environment) ExpectStatusUpdated(objects ...client.Object) {
+	GinkgoHelper()
+	for _, o := range objects {
+		Eventually(func(g Gomega) {
+			current := o.DeepCopyObject().(client.Object)
+			g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(current), current)).To(Succeed())
+			if current.GetResourceVersion() != o.GetResourceVersion() {
+				log.FromContext(env).Info(fmt.Sprintf("detected an update to an object (%s) with an outdated resource version, did you get the latest version of the object before patching?", lo.Must(apiutil.GVKForObject(o, env.Client.Scheme()))))
+			}
+			o.SetResourceVersion(current.GetResourceVersion())
+			g.Expect(env.Client.Status().Update(env.Context, o)).To(Succeed())
+		}).WithTimeout(time.Second * 10).Should(Succeed())
+	}
+}
+
+func ReplaceNodeConditions(node *corev1.Node, conds ...corev1.NodeCondition) *corev1.Node {
+	keys := sets.New[string](lo.Map(conds, func(c corev1.NodeCondition, _ int) string { return string(c.Type) })...)
+	node.Status.Conditions = lo.Reject(node.Status.Conditions, func(c corev1.NodeCondition, _ int) bool {
+		return keys.Has(string(c.Type))
+	})
+	node.Status.Conditions = append(node.Status.Conditions, conds...)
+	return node
+}
+
 // ExpectCreatedOrUpdated can update objects in the cluster to match the inputs.
 // WARNING: ExpectUpdated ignores the resource version check, which can result in
 // overwriting changes made by other controllers in the cluster.
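A minimal sketch of how these two new helpers compose in a test, assuming the `env` and `node` fixtures of the existing suite; the condition values are illustrative only, not taken from this diff:

	node = ReplaceNodeConditions(node, corev1.NodeCondition{
		Type:   corev1.NodeReady,
		Status: corev1.ConditionUnknown,
		Reason: "NodeStatusUnknown", // hypothetical reason for illustration
	})
	env.ExpectStatusUpdated(node)

ReplaceNodeConditions drops any existing conditions of the same type before appending the replacements, so repeating the call with the same condition types leaves exactly one entry per type in the status.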
@@ -272,6 +302,17 @@ func (env *Environment) EventuallyExpectTerminatingWithTimeout(timeout time.Dura
 	}).WithTimeout(timeout).Should(Succeed())
 }
 
+func (env *Environment) EventuallyExpectNoLeakedKubeNodeLease() {
+	GinkgoHelper()
+	// expect no kube node lease to be leaked
+	leases := &coordinationv1.LeaseList{}
+	Expect(env.Client.List(env.Context, leases, client.InNamespace("kube-node-lease"))).To(Succeed())
+	leakedLeases := lo.Filter(leases.Items, func(l coordinationv1.Lease, _ int) bool {
+		return l.OwnerReferences == nil
+	})
+	Expect(leakedLeases).To(HaveLen(0))
+}
+
 func (env *Environment) EventuallyExpectHealthyWithTimeout(timeout time.Duration, pods ...*corev1.Pod) {
 	GinkgoHelper()
 	Eventually(func(g Gomega) {
@@ -296,6 +337,17 @@ func (env *Environment) ConsistentlyExpectTerminatingPods(duration time.Duration
 	}, duration.String()).Should(Succeed())
 }
 
+func (env *Environment) ConsistentlyExpectActivePods(duration time.Duration, pods ...*corev1.Pod) {
+	GinkgoHelper()
+	By(fmt.Sprintf("expecting %d pods to be live for %s", len(pods), duration))
+	Consistently(func(g Gomega) {
+		for _, pod := range pods {
+			g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(pod), pod)).To(Succeed())
+			g.Expect(pod.DeletionTimestamp.IsZero()).To(BeTrue())
+		}
+	}, duration.String()).Should(Succeed())
+}
+
 func (env *Environment) ConsistentlyExpectHealthyPods(duration time.Duration, pods ...*corev1.Pod) {
 	GinkgoHelper()
 	By(fmt.Sprintf("expecting %d pods to be ready for %s", len(pods), duration))
@@ -462,16 +514,13 @@ func (env *Environment) eventuallyExpectScaleDown() {
 
 func (env *Environment) EventuallyExpectNotFound(objects ...client.Object) {
 	GinkgoHelper()
-	env.EventuallyExpectNotFoundAssertion(objects...).Should(Succeed())
-}
 
-func (env *Environment) EventuallyExpectNotFoundAssertion(objects ...client.Object) AsyncAssertion {
-	return Eventually(func(g Gomega) {
+	Eventually(func(g Gomega) {
 		for _, object := range objects {
 			err := env.Client.Get(env, client.ObjectKeyFromObject(object), object)
 			g.Expect(errors.IsNotFound(err)).To(BeTrue())
 		}
-	})
+	}).Should(Succeed())
 }
 
 func (env *Environment) ExpectCreatedNodeCount(comparator string, count int) []*corev1.Node {
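A sketch of how the new liveness and cleanup helpers might be chained in a node-termination test; `env`, `pods`, `node`, and `nodeClaim` are assumed fixtures from the surrounding suite, not part of this diff:

	env.ConsistentlyExpectActivePods(time.Minute, pods...)
	env.EventuallyExpectNotFound(nodeClaim, node)
	env.EventuallyExpectNoLeakedKubeNodeLease()

The final call guards against leaked kube-node-lease objects: once the Node is gone, its Lease should have been garbage-collected through the owner reference, so any Lease without an owner reference is treated as a leak.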
@@ -524,34 +573,77 @@ func (env *Environment) ConsistentlyExpectNodeCount(comparator string, count int
 	return lo.ToSlicePtr(nodeList.Items)
 }
 
-func (env *Environment) ConsistentlyExpectNoDisruptions(nodeCount int, duration time.Duration) (taintedNodes []*corev1.Node) {
+// ConsistentlyExpectNoDisruptions asserts that the number of tainted nodes remains the same.
+// And that the number of nodeclaims remains the same.
+func (env *Environment) ConsistentlyExpectNoDisruptions(nodeCount int, duration time.Duration) {
 	GinkgoHelper()
-	return env.ConsistentlyExpectDisruptionsWithNodeCount(0, nodeCount, duration)
+	Consistently(func(g Gomega) {
+		nodeClaimList := &karpv1.NodeClaimList{}
+		g.Expect(env.Client.List(env, nodeClaimList, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
+		g.Expect(nodeClaimList.Items).To(HaveLen(nodeCount))
+		nodeList := &corev1.NodeList{}
+		g.Expect(env.Client.List(env, nodeList, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
+		g.Expect(nodeList.Items).To(HaveLen(nodeCount))
+		nodeList.Items = lo.Filter(nodeList.Items, func(n corev1.Node, _ int) bool {
+			_, ok := lo.Find(n.Spec.Taints, func(t corev1.Taint) bool {
+				return t.MatchTaint(&karpv1.DisruptedNoScheduleTaint)
+			})
+			return ok
+		})
+		g.Expect(nodeList.Items).To(HaveLen(0))
+	}, duration).Should(Succeed())
 }
 
-// ConsistentlyExpectDisruptionsWithNodeCount will continually ensure that there are exactly disruptingNodes with totalNodes (including replacements and existing nodes)
-func (env *Environment) ConsistentlyExpectDisruptionsWithNodeCount(disruptingNodes, totalNodes int, duration time.Duration) (taintedNodes []*corev1.Node) {
+// ConsistentlyExpectDisruptionsUntilNoneLeft consistently ensures a max on number of concurrently disrupting and non-terminating nodes.
+// This actually uses an Eventually() under the hood so that when we reach 0 tainted nodes we exit early.
+// We use the StopTrying() so that we can exit the Eventually() if we've breached an assertion on total concurrency of disruptions.
+// For example: if we have 5 nodes, with a budget of 2 nodes, we ensure that `disruptingNodes <= maxNodesDisrupting=2`
+// We use nodesAtStart+maxNodesDisrupting to assert that we're not creating too many instances in replacement.
+func (env *Environment) ConsistentlyExpectDisruptionsUntilNoneLeft(nodesAtStart, maxNodesDisrupting int, timeout time.Duration) {
 	GinkgoHelper()
 	nodes := []corev1.Node{}
-	Consistently(func(g Gomega) {
-		// Ensure we don't change our NodeClaims
+	// We use an eventually to exit when we detect the number of tainted/disrupted nodes matches our target.
+	Eventually(func(g Gomega) {
+		// Grab Nodes and NodeClaims
 		nodeClaimList := &karpv1.NodeClaimList{}
-		g.Expect(env.Client.List(env, nodeClaimList, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
-		g.Expect(nodeClaimList.Items).To(HaveLen(totalNodes))
-
 		nodeList := &corev1.NodeList{}
+		g.Expect(env.Client.List(env, nodeClaimList, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
 		g.Expect(env.Client.List(env, nodeList, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
-		g.Expect(nodeList.Items).To(HaveLen(totalNodes))
 
+		// Don't include NodeClaims with the `Terminating` status condition, as they're not included in budgets
+		removedProviderIDs := sets.Set[string]{}
+		nodeClaimList.Items = lo.Filter(nodeClaimList.Items, func(nc karpv1.NodeClaim, _ int) bool {
+			if !nc.StatusConditions().IsTrue(karpv1.ConditionTypeInstanceTerminating) {
+				return true
+			}
+			removedProviderIDs.Insert(nc.Status.ProviderID)
+			return false
+		})
+		if len(nodeClaimList.Items) > nodesAtStart+maxNodesDisrupting {
+			StopTrying(fmt.Sprintf("Too many nodeclaims created. Expected no more than %d, got %d", nodesAtStart+maxNodesDisrupting, len(nodeClaimList.Items))).Now()
+		}
+
+		// Don't include Nodes whose NodeClaims have been ignored
+		nodeList.Items = lo.Filter(nodeList.Items, func(n corev1.Node, _ int) bool {
+			return !removedProviderIDs.Has(n.Spec.ProviderID)
+		})
+		if len(nodeList.Items) > nodesAtStart+maxNodesDisrupting {
+			StopTrying(fmt.Sprintf("Too many nodes created. Expected no more than %d, got %d", nodesAtStart+maxNodesDisrupting, len(nodeList.Items))).Now()
+		}
+
+		// Filter further by the number of tainted nodes to get the number of nodes that are disrupting
 		nodes = lo.Filter(nodeList.Items, func(n corev1.Node, _ int) bool {
 			_, ok := lo.Find(n.Spec.Taints, func(t corev1.Taint) bool {
-				return karpv1.IsDisruptingTaint(t)
+				return t.MatchTaint(&karpv1.DisruptedNoScheduleTaint)
 			})
 			return ok
 		})
-		g.Expect(nodes).To(HaveLen(disruptingNodes))
-	}, duration).Should(Succeed())
-	return lo.ToSlicePtr(nodes)
+		if len(nodes) > maxNodesDisrupting {
+			StopTrying(fmt.Sprintf("Too many disruptions detected. Expected no more than %d, got %d", maxNodesDisrupting, len(nodeList.Items))).Now()
+		}
+
+		g.Expect(nodes).To(HaveLen(0))
+	}).WithTimeout(timeout).WithPolling(5 * time.Second).Should(Succeed())
 }
 
 func (env *Environment) EventuallyExpectTaintedNodeCount(comparator string, count int) []*corev1.Node {
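A minimal sketch of the new call shape for a drift or expiration test, mirroring the 5-node, budget-of-2 example in the comment above; the concrete counts and timeout are illustrative assumptions, not values taken from this diff:

	// 5 nodes at the start of the test, at most 2 disrupting at once,
	// and a generous overall deadline for the rolling replacement to finish.
	env.ConsistentlyExpectDisruptionsUntilNoneLeft(5, 2, 10*time.Minute)

Unlike the removed ConsistentlyExpectDisruptionsWithNodeCount, the parameters are (nodesAtStart, maxNodesDisrupting, timeout) rather than (disruptingNodes, totalNodes, duration), and the helper no longer returns the tainted nodes; it exits early once no tainted nodes remain and fails fast via StopTrying() if any of the concurrency bounds are breached.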