diff --git a/pkg/controllers/nodeclaim/garbagecollection/suite_test.go b/pkg/controllers/nodeclaim/garbagecollection/suite_test.go index b98411895..dd8e0066b 100644 --- a/pkg/controllers/nodeclaim/garbagecollection/suite_test.go +++ b/pkg/controllers/nodeclaim/garbagecollection/suite_test.go @@ -18,7 +18,6 @@ package garbagecollection_test import ( "context" - "fmt" "sync" "testing" "time" @@ -43,6 +42,7 @@ import ( "github.com/Azure/karpenter/pkg/test" corecloudprovider "github.com/aws/karpenter-core/pkg/cloudprovider" + "github.com/aws/karpenter-core/pkg/controllers/provisioning" "github.com/aws/karpenter-core/pkg/controllers/state" "github.com/aws/karpenter-core/pkg/events" "github.com/aws/karpenter-core/pkg/operator/controller" @@ -64,6 +64,7 @@ var nodeClass *v1alpha2.AKSNodeClass var cluster *state.Cluster var cloudProvider *cloudprovider.CloudProvider var garbageCollectionController controller.Controller +var coreProvisioner *provisioning.Provisioner func TestAPIs(t *testing.T) { ctx = TestContextWithLogger(t) @@ -84,6 +85,7 @@ var _ = BeforeSuite(func() { garbageCollectionController = garbagecollection.NewController(env.Client, cloudProvider) fakeClock = &clock.FakeClock{} cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) + coreProvisioner = provisioning.NewProvisioner(env.Client, env.KubernetesInterface.CoreV1(), events.NewRecorder(&record.FakeRecorder{}), cloudProvider, cluster) }) var _ = AfterSuite(func() { @@ -114,188 +116,214 @@ var _ = AfterEach(func() { }) var _ = Describe("NodeClaimGarbageCollection", func() { - var vm armcompute.VirtualMachine + var vm *armcompute.VirtualMachine var providerID string - - BeforeEach(func() { - id := utils.MkVMID(azureEnv.AzureResourceGraphAPI.ResourceGroup, "vm-a") - vm = armcompute.VirtualMachine{ - ID: lo.ToPtr(id), - Name: lo.ToPtr("vm-a"), - Location: lo.ToPtr(fake.Region), - Tags: map[string]*string{ - instance.NodePoolTagKey: lo.ToPtr("default"), - }, - } - providerID = utils.ResourceIDToProviderID(ctx, id) - }) - - It("should delete an instance if there is no NodeClaim owner", func() { - // Launch happened 10m ago - vm.Properties = &armcompute.VirtualMachineProperties{ - TimeCreated: lo.ToPtr(time.Now().Add(-time.Minute * 10)), - } - azureEnv.VirtualMachinesAPI.Instances.Store(lo.FromPtr(vm.ID), vm) - - ExpectReconcileSucceeded(ctx, garbageCollectionController, client.ObjectKey{}) - _, err := cloudProvider.Get(ctx, providerID) - Expect(err).To(HaveOccurred()) - Expect(corecloudprovider.IsNodeClaimNotFoundError(err)).To(BeTrue()) - }) - It("should not delete an instance if it was not launched by a NodeClaim", func() { - // Remove the "karpenter.sh/managed-by" tag (this isn't launched by a NodeClaim) - vm.Properties = &armcompute.VirtualMachineProperties{ - TimeCreated: lo.ToPtr(time.Now().Add(-time.Minute * 10)), - } - vm.Tags = lo.OmitBy(vm.Tags, func(key string, value *string) bool { - return key == instance.NodePoolTagKey + var err error + + var _ = Context("Pod pressure", func() { + BeforeEach(func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) + ExpectScheduled(ctx, env.Client, pod) + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + vmName := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VMName + vm, err = azureEnv.InstanceProvider.Get(ctx, vmName) + Expect(err).To(BeNil()) + providerID = utils.ResourceIDToProviderID(ctx, *vm.ID) }) - azureEnv.VirtualMachinesAPI.Instances.Store(lo.FromPtr(vm.ID), vm) + It("should not delete an instance if it was not launched by a NodeClaim", func() { + // Remove the "karpenter.sh/managed-by" tag (this isn't launched by a NodeClaim) + vm.Properties = &armcompute.VirtualMachineProperties{ + TimeCreated: lo.ToPtr(time.Now().Add(-time.Minute * 10)), + } + vm.Tags = lo.OmitBy(vm.Tags, func(key string, value *string) bool { + return key == instance.NodePoolTagKey + }) + azureEnv.VirtualMachinesAPI.Instances.Store(lo.FromPtr(vm.ID), *vm) - ExpectReconcileSucceeded(ctx, garbageCollectionController, client.ObjectKey{}) - _, err := cloudProvider.Get(ctx, providerID) - Expect(err).NotTo(HaveOccurred()) - }) - It("should delete an instance along with the node if there is no NodeClaim owner (to quicken scheduling)", func() { - // Launch happened 10m ago - vm.Properties = &armcompute.VirtualMachineProperties{ - TimeCreated: lo.ToPtr(time.Now().Add(-time.Minute * 10)), - } - azureEnv.VirtualMachinesAPI.Instances.Store(lo.FromPtr(vm.ID), vm) - - node := coretest.Node(coretest.NodeOptions{ - ProviderID: providerID, + ExpectReconcileSucceeded(ctx, garbageCollectionController, client.ObjectKey{}) + _, err := cloudProvider.Get(ctx, providerID) + Expect(err).NotTo(HaveOccurred()) }) - ExpectApplied(ctx, env.Client, node) - - ExpectReconcileSucceeded(ctx, garbageCollectionController, client.ObjectKey{}) - _, err := cloudProvider.Get(ctx, providerID) - Expect(err).To(HaveOccurred()) - Expect(corecloudprovider.IsNodeClaimNotFoundError(err)).To(BeTrue()) + It("should delete many instances if they all don't have NodeClaim owners", func() { + // Generate 100 instances that have different vmIDs + var ids []string + var vmName string + for i := 0; i < 100; i++ { + pod := coretest.UnschedulablePod() + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) + ExpectScheduled(ctx, env.Client, pod) + if azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len() == 1 { + vmName = azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VMName + vm, err = azureEnv.InstanceProvider.Get(ctx, vmName) + Expect(err).To(BeNil()) + providerID = utils.ResourceIDToProviderID(ctx, *vm.ID) + azureEnv.VirtualMachinesAPI.Instances.Store( + *vm.ID, + armcompute.VirtualMachine{ + ID: vm.ID, + Name: vm.Name, + Location: lo.ToPtr(fake.Region), + Properties: &armcompute.VirtualMachineProperties{ + TimeCreated: lo.ToPtr(time.Now().Add(-time.Minute * 10)), + }, + Tags: map[string]*string{ + instance.NodePoolTagKey: lo.ToPtr("default"), + }, + }) + ids = append(ids, *vm.ID) + } + } + ExpectReconcileSucceeded(ctx, garbageCollectionController, client.ObjectKey{}) + + wg := sync.WaitGroup{} + for _, id := range ids { + wg.Add(1) + go func(id string) { + defer GinkgoRecover() + defer wg.Done() + + _, err := cloudProvider.Get(ctx, utils.ResourceIDToProviderID(ctx, id)) + Expect(err).To(HaveOccurred()) + Expect(corecloudprovider.IsNodeClaimNotFoundError(err)).To(BeTrue()) + }(id) + } + wg.Wait() + }) + It("should not delete all instances if they all have NodeClaim owners", func() { + // Generate 100 instances that have different instanceIDs + var ids []string + var nodeClaims []*corev1beta1.NodeClaim + var vmName string + for i := 0; i < 100; i++ { + pod := coretest.UnschedulablePod() + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) + ExpectScheduled(ctx, env.Client, pod) + if azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len() == 1 { + vmName = azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VMName + vm, err = azureEnv.InstanceProvider.Get(ctx, vmName) + Expect(err).To(BeNil()) + providerID = utils.ResourceIDToProviderID(ctx, *vm.ID) + azureEnv.VirtualMachinesAPI.Instances.Store( + *vm.ID, + armcompute.VirtualMachine{ + ID: vm.ID, + Name: vm.Name, + Location: lo.ToPtr(fake.Region), + Properties: &armcompute.VirtualMachineProperties{ + TimeCreated: lo.ToPtr(time.Now().Add(-time.Minute * 10)), + }, + Tags: map[string]*string{ + instance.NodePoolTagKey: lo.ToPtr("default"), + }, + }) + nodeClaim := coretest.NodeClaim(corev1beta1.NodeClaim{ + Status: corev1beta1.NodeClaimStatus{ + ProviderID: utils.ResourceIDToProviderID(ctx, *vm.ID), + }, + }) + ids = append(ids, *vm.ID) + ExpectApplied(ctx, env.Client, nodeClaim) + nodeClaims = append(nodeClaims, nodeClaim) + } + } + ExpectReconcileSucceeded(ctx, garbageCollectionController, client.ObjectKey{}) + + wg := sync.WaitGroup{} + for _, id := range ids { + wg.Add(1) + go func(id string) { + defer GinkgoRecover() + defer wg.Done() + + _, err := cloudProvider.Get(ctx, utils.ResourceIDToProviderID(ctx, id)) + Expect(err).ToNot(HaveOccurred()) + }(id) + } + wg.Wait() + + for _, nodeClaim := range nodeClaims { + ExpectExists(ctx, env.Client, nodeClaim) + } + }) + It("should not delete an instance if it is within the nodeClaim resolution window (5m)", func() { + // Launch time just happened + vm.Properties = &armcompute.VirtualMachineProperties{ + TimeCreated: lo.ToPtr(time.Now()), + } + azureEnv.VirtualMachinesAPI.Instances.Store(lo.FromPtr(vm.ID), *vm) + + ExpectReconcileSucceeded(ctx, garbageCollectionController, client.ObjectKey{}) + _, err := cloudProvider.Get(ctx, providerID) + Expect(err).NotTo(HaveOccurred()) + }) + It("should not delete the instance or node if it already has a nodeClaim that matches it", func() { + // Launch time was 10m ago + vm.Properties = &armcompute.VirtualMachineProperties{ + TimeCreated: lo.ToPtr(time.Now().Add(-time.Minute * 10)), + } + azureEnv.VirtualMachinesAPI.Instances.Store(lo.FromPtr(vm.ID), *vm) - ExpectNotFound(ctx, env.Client, node) - }) - It("should delete many instances if they all don't have NodeClaim owners", func() { - // Generate 100 instances that have different vmIDs - var ids []string - var vmName string - var vmID string - for i := 0; i < 100; i++ { - vmName = fmt.Sprintf("vm-%d", i) - vmID = utils.MkVMID(azureEnv.AzureResourceGraphAPI.ResourceGroup, vmName) - azureEnv.VirtualMachinesAPI.Instances.Store( - vmID, - armcompute.VirtualMachine{ - ID: lo.ToPtr(utils.MkVMID(azureEnv.AzureResourceGraphAPI.ResourceGroup, vmName)), - Name: lo.ToPtr(vmName), - Location: lo.ToPtr(fake.Region), - Properties: &armcompute.VirtualMachineProperties{ - TimeCreated: lo.ToPtr(time.Now().Add(-time.Minute * 10)), - }, - Tags: map[string]*string{ - instance.NodePoolTagKey: lo.ToPtr("default"), - }, - }) - ids = append(ids, vmID) - } - ExpectReconcileSucceeded(ctx, garbageCollectionController, client.ObjectKey{}) - - wg := sync.WaitGroup{} - for _, id := range ids { - wg.Add(1) - go func(id string) { - defer GinkgoRecover() - defer wg.Done() - - _, err := cloudProvider.Get(ctx, utils.ResourceIDToProviderID(ctx, id)) - Expect(err).To(HaveOccurred()) - Expect(corecloudprovider.IsNodeClaimNotFoundError(err)).To(BeTrue()) - }(id) - } - wg.Wait() - }) - It("should not delete all instances if they all have NodeClaim owners", func() { - // Generate 100 instances that have different instanceIDs - var ids []string - var nodeClaims []*corev1beta1.NodeClaim - var vmName string - var vmID string - for i := 0; i < 100; i++ { - vmName = fmt.Sprintf("vm-%d", i) - vmID = utils.MkVMID(azureEnv.AzureResourceGraphAPI.ResourceGroup, vmName) - azureEnv.VirtualMachinesAPI.Instances.Store( - vmID, - armcompute.VirtualMachine{ - ID: lo.ToPtr(utils.MkVMID(azureEnv.AzureResourceGraphAPI.ResourceGroup, vmName)), - Name: lo.ToPtr(vmName), - Location: lo.ToPtr(fake.Region), - Properties: &armcompute.VirtualMachineProperties{ - TimeCreated: lo.ToPtr(time.Now().Add(-time.Minute * 10)), - }, - Tags: map[string]*string{ - instance.NodePoolTagKey: lo.ToPtr("default"), - }, - }, - ) nodeClaim := coretest.NodeClaim(corev1beta1.NodeClaim{ Status: corev1beta1.NodeClaimStatus{ - ProviderID: utils.ResourceIDToProviderID(ctx, vmID), + ProviderID: providerID, }, }) - ExpectApplied(ctx, env.Client, nodeClaim) - nodeClaims = append(nodeClaims, nodeClaim) - ids = append(ids, vmID) - } - ExpectReconcileSucceeded(ctx, garbageCollectionController, client.ObjectKey{}) - - wg := sync.WaitGroup{} - for _, id := range ids { - wg.Add(1) - go func(id string) { - defer GinkgoRecover() - defer wg.Done() - - _, err := cloudProvider.Get(ctx, utils.ResourceIDToProviderID(ctx, id)) - Expect(err).ToNot(HaveOccurred()) - }(id) - } - wg.Wait() - - for _, nodeClaim := range nodeClaims { - ExpectExists(ctx, env.Client, nodeClaim) - } - }) - It("should not delete an instance if it is within the nodeClaim resolution window (5m)", func() { - // Launch time just happened - vm.Properties = &armcompute.VirtualMachineProperties{ - TimeCreated: lo.ToPtr(time.Now()), - } - azureEnv.VirtualMachinesAPI.Instances.Store(lo.FromPtr(vm.ID), vm) - - ExpectReconcileSucceeded(ctx, garbageCollectionController, client.ObjectKey{}) - _, err := cloudProvider.Get(ctx, providerID) - Expect(err).NotTo(HaveOccurred()) - }) - It("should not delete the instance or node if it already has a nodeClaim that matches it", func() { - // Launch time was 10m ago - vm.Properties = &armcompute.VirtualMachineProperties{ - TimeCreated: lo.ToPtr(time.Now().Add(-time.Minute * 10)), - } - azureEnv.VirtualMachinesAPI.Instances.Store(lo.FromPtr(vm.ID), vm) - - nodeClaim := coretest.NodeClaim(corev1beta1.NodeClaim{ - Status: corev1beta1.NodeClaimStatus{ + node := coretest.Node(coretest.NodeOptions{ ProviderID: providerID, - }, + }) + ExpectApplied(ctx, env.Client, nodeClaim, node) + + ExpectReconcileSucceeded(ctx, garbageCollectionController, client.ObjectKey{}) + _, err := cloudProvider.Get(ctx, providerID) + Expect(err).ToNot(HaveOccurred()) + ExpectExists(ctx, env.Client, node) }) - node := coretest.Node(coretest.NodeOptions{ - ProviderID: providerID, + }) + + var _ = Context("Basic", func() { + BeforeEach(func() { + id := utils.MkVMID(azureEnv.AzureResourceGraphAPI.ResourceGroup, "vm-a") + vm = &armcompute.VirtualMachine{ + ID: lo.ToPtr(id), + Name: lo.ToPtr("vm-a"), + Location: lo.ToPtr(fake.Region), + Tags: map[string]*string{ + instance.NodePoolTagKey: lo.ToPtr("default"), + }, + } + providerID = utils.ResourceIDToProviderID(ctx, id) }) - ExpectApplied(ctx, env.Client, nodeClaim, node) + It("should delete an instance if there is no NodeClaim owner", func() { + // Launch happened 10m ago + vm.Properties = &armcompute.VirtualMachineProperties{ + TimeCreated: lo.ToPtr(time.Now().Add(-time.Minute * 10)), + } + azureEnv.VirtualMachinesAPI.Instances.Store(lo.FromPtr(vm.ID), *vm) + + ExpectReconcileSucceeded(ctx, garbageCollectionController, client.ObjectKey{}) + _, err = cloudProvider.Get(ctx, providerID) + Expect(err).To(HaveOccurred()) + Expect(corecloudprovider.IsNodeClaimNotFoundError(err)).To(BeTrue()) + }) + It("should delete an instance along with the node if there is no NodeClaim owner (to quicken scheduling)", func() { + // Launch happened 10m ago + vm.Properties = &armcompute.VirtualMachineProperties{ + TimeCreated: lo.ToPtr(time.Now().Add(-time.Minute * 10)), + } + azureEnv.VirtualMachinesAPI.Instances.Store(lo.FromPtr(vm.ID), *vm) + node := coretest.Node(coretest.NodeOptions{ + ProviderID: providerID, + }) + ExpectApplied(ctx, env.Client, node) + + ExpectReconcileSucceeded(ctx, garbageCollectionController, client.ObjectKey{}) + _, err = cloudProvider.Get(ctx, providerID) + Expect(err).To(HaveOccurred()) + Expect(corecloudprovider.IsNodeClaimNotFoundError(err)).To(BeTrue()) - ExpectReconcileSucceeded(ctx, garbageCollectionController, client.ObjectKey{}) - _, err := cloudProvider.Get(ctx, providerID) - Expect(err).ToNot(HaveOccurred()) - ExpectExists(ctx, env.Client, node) + ExpectNotFound(ctx, env.Client, node) + }) }) }) diff --git a/pkg/providers/instance/suite_test.go b/pkg/providers/instance/suite_test.go index d04e958c6..258b51bc8 100644 --- a/pkg/providers/instance/suite_test.go +++ b/pkg/providers/instance/suite_test.go @@ -18,20 +18,26 @@ package instance_test import ( "context" + "strings" "testing" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" "github.com/samber/lo" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + clock "k8s.io/utils/clock/testing" + "k8s.io/client-go/tools/record" "github.com/Azure/karpenter/pkg/apis" "github.com/Azure/karpenter/pkg/apis/settings" "github.com/Azure/karpenter/pkg/apis/v1alpha2" "github.com/Azure/karpenter/pkg/cloudprovider" + "github.com/Azure/karpenter/pkg/providers/instance" "github.com/Azure/karpenter/pkg/test" "github.com/aws/karpenter-core/pkg/apis/v1alpha5" + "github.com/aws/karpenter-core/pkg/controllers/provisioning" + "github.com/aws/karpenter-core/pkg/controllers/state" "github.com/aws/karpenter-core/pkg/events" corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1" @@ -52,15 +58,18 @@ var azureEnv *test.Environment var azureEnvNonZonal *test.Environment var cloudProvider *cloudprovider.CloudProvider var cloudProviderNonZonal *cloudprovider.CloudProvider +var fakeClock *clock.FakeClock +var cluster *state.Cluster +var coreProvisioner *provisioning.Provisioner func TestAzure(t *testing.T) { ctx = TestContextWithLogger(t) RegisterFailHandler(Fail) ctx = coreoptions.ToContext(ctx, coretest.Options()) + // TODO v1beta1 options // ctx = options.ToContext(ctx, test.Options()) ctx = settings.ToContext(ctx, test.Settings()) - env = coretest.NewEnvironment(scheme.Scheme, coretest.WithCRDs(apis.CRDs...)) ctx, stop = context.WithCancel(ctx) @@ -68,7 +77,9 @@ func TestAzure(t *testing.T) { azureEnvNonZonal = test.NewEnvironmentNonZonal(ctx, env) cloudProvider = cloudprovider.New(azureEnv.InstanceTypesProvider, azureEnv.InstanceProvider, events.NewRecorder(&record.FakeRecorder{}), env.Client, azureEnv.ImageProvider) cloudProviderNonZonal = cloudprovider.New(azureEnvNonZonal.InstanceTypesProvider, azureEnvNonZonal.InstanceProvider, events.NewRecorder(&record.FakeRecorder{}), env.Client, azureEnvNonZonal.ImageProvider) - + fakeClock = &clock.FakeClock{} + cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) + coreProvisioner = provisioning.NewProvisioner(env.Client, env.KubernetesInterface.CoreV1(), events.NewRecorder(&record.FakeRecorder{}), cloudProvider, cluster) RunSpecs(t, "Provider/Azure") } @@ -108,11 +119,14 @@ var _ = Describe("InstanceProvider", func() { }, }, }) - azureEnv.Reset() azureEnvNonZonal.Reset() }) + var _ = AfterEach(func() { + ExpectCleanedUp(ctx, env.Client) + }) + var ZonalAndNonZonalRegions = []TableEntry{ Entry("zonal", azureEnv, cloudProvider), Entry("non-zonal", azureEnvNonZonal, cloudProviderNonZonal), @@ -138,4 +152,20 @@ var _ = Describe("InstanceProvider", func() { }, ZonalAndNonZonalRegions, ) + + It("should create VM with valid ARM tags", func() { + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod(coretest.PodOptions{}) + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) + ExpectScheduled(ctx, env.Client, pod) + Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) + vmName := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VMName + vm, err := azureEnv.InstanceProvider.Get(ctx, vmName) + Expect(err).To(BeNil()) + tags := vm.Tags + Expect(lo.FromPtr(tags[instance.NodePoolTagKey])).To(Equal(nodePool.Name)) + Expect(lo.PickBy(tags, func(key string, value *string) bool { + return strings.Contains(key, "/") // ARM tags can't contain '/' + })).To(HaveLen(0)) + }) })