From 4abea32ab28ef0b29985b987f2de18fab546d1d6 Mon Sep 17 00:00:00 2001 From: Bryce Soghigian Date: Wed, 22 Nov 2023 19:51:55 -0800 Subject: [PATCH 01/15] feat: supporting additional handing for TotalRegionalCores quota error, alongside adding better customer facing error messages for the controller errors that end up in the nodeclaim --- go.mod | 2 +- go.sum | 4 ++-- pkg/providers/instance/instance.go | 25 ++++++++++++++++++++----- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/go.mod b/go.mod index a1aa0304b..498bdbacd 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.21 require ( github.com/Azure/azure-kusto-go v0.14.0 github.com/Azure/azure-sdk-for-go v68.0.0+incompatible - github.com/Azure/azure-sdk-for-go-extensions v0.1.3 + github.com/Azure/azure-sdk-for-go-extensions v0.1.4 github.com/Azure/azure-sdk-for-go/sdk/azcore v1.9.0 github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.3.1 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute v1.0.0 diff --git a/go.sum b/go.sum index e26371db8..5d7de1b02 100644 --- a/go.sum +++ b/go.sum @@ -39,8 +39,8 @@ github.com/Azure/azure-kusto-go v0.14.0 h1:5XVmjh5kVgsm2scpsWisJ6Q1ZgWHJcIOPCZC1 github.com/Azure/azure-kusto-go v0.14.0/go.mod h1:wSmXIsQwBVPHDNsSQsX98nuc12VyvxoNHQa2q9t1Ce0= github.com/Azure/azure-sdk-for-go v68.0.0+incompatible h1:fcYLmCpyNYRnvJbPerq7U0hS+6+I79yEDJBqVNcqUzU= github.com/Azure/azure-sdk-for-go v68.0.0+incompatible/go.mod h1:9XXNKU+eRnpl9moKnB4QOLf1HestfXbmab5FXxiDBjc= -github.com/Azure/azure-sdk-for-go-extensions v0.1.3 h1:hDHVEvhqVufFPmD5b+xRjtGXSQTMEia7S/xntltnG44= -github.com/Azure/azure-sdk-for-go-extensions v0.1.3/go.mod h1:dJfn8QUzuvyO4hGZ8pkROwd7/VQzDG8ER2SRk+V0afY= +github.com/Azure/azure-sdk-for-go-extensions v0.1.4 h1:XNT7IWmj4u3AfSag3t2mFupHT59J58pknX+daqprjm8= +github.com/Azure/azure-sdk-for-go-extensions v0.1.4/go.mod h1:dJfn8QUzuvyO4hGZ8pkROwd7/VQzDG8ER2SRk+V0afY= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.9.0 h1:fb8kj/Dh4CSwgsOzHeZY4Xh68cFVbzXx+ONXGMY//4w= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.9.0/go.mod h1:uReU2sSxZExRPBAg3qKzmAucSi51+SP1OhohieR821Q= github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.3.1 h1:LNHhpdK7hzUcx/k1LIcuh5k7k1LGIWLQfCjaneSj7Fc= diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index 54ade243f..af5e9b35e 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -19,6 +19,7 @@ package instance import ( "context" "encoding/json" + "errors" "fmt" "math" "sort" @@ -452,9 +453,11 @@ func isSKUNotAvailable(err error) bool { } func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corecloudprovider.InstanceType, zone, capacityType string, err error) error { - if sdkerrors.SubscriptionQuotaHasBeenReached(err) { - // Subscription quota reached, mark the instance type as unavailable in all zones available to the offering + if sdkerrors.SKUFamilyQuotaHasBeenReached(err) { + // Subscription quota has reached for this vm sku, mark the instance type as unavailable in all zones available to the offering // This will also update the TTL for an existing offering in the cache that is already unavailable + + logging.FromContext(ctx).Error(err) for _, offering := range instanceType.Offerings { if offering.CapacityType != capacityType { continue @@ -468,9 +471,7 @@ func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corec p.unavailableOfferings.MarkUnavailable(ctx, SubscriptionQuotaReachedReason, instanceType.Name, offering.Zone, capacityType) } } - } else if sdkerrors.ZonalAllocationFailureOccurred(err) { - p.unavailableOfferings.MarkUnavailable(ctx, ZonalAllocationFailureReason, instanceType.Name, zone, corev1beta1.CapacityTypeOnDemand) - p.unavailableOfferings.MarkUnavailable(ctx, ZonalAllocationFailureReason, instanceType.Name, zone, corev1beta1.CapacityTypeSpot) + return fmt.Errorf("subscription level quota for %s has been reached (may try provision an alternative instance type)", instanceType.Name) } else if isSKUNotAvailable(err) { // https://aka.ms/azureskunotavailable: either not available for a location or zone, or out of capacity for Spot. // We only expect to observe the Spot case, not location or zone restrictions, because: @@ -489,6 +490,20 @@ func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corec } p.unavailableOfferings.MarkUnavailableWithTTL(ctx, SKUNotAvailableReason, instanceType.Name, offering.Zone, capacityType, skuNotAvailableTTL) } + return err + } + if sdkerrors.ZonalAllocationFailureOccurred(err) { + logging.FromContext(ctx).With("zone", zone).Error(err) + p.unavailableOfferings.MarkUnavailable(ctx, ZonalAllocationFailureReason, instanceType.Name, zone, corev1beta1.CapacityTypeOnDemand) + p.unavailableOfferings.MarkUnavailable(ctx, ZonalAllocationFailureReason, instanceType.Name, zone, corev1beta1.CapacityTypeSpot) + + //nolint:stylecheck // Ignore ST1005: error strings should not be capitalized. This error message will pop up in the machine CRD and is intended to be read directly by the customer + return fmt.Errorf("We're currently unable to allocate resources in the selected zone (%s). Karpenter will try a different zone to fulfill your request.", zone) + } + if sdkerrors.RegionalQuotaHasBeenReached(err) { + logging.FromContext(ctx).Error(err) + //nolint:stylecheck // Ignore ST1005: error strings should not be capitalized. This error message will pop up in the machine CRD and is intended to be read directly by the customer + return corecloudprovider.NewInsufficientCapacityError(errors.New("The regional capacity limit for your subscription has been reached. To scale beyond this limit, please review the quota increase process here: https://learn.microsoft.com/en-us/azure/quotas/regional-quota-requests")) } return err } From 17556b109f8ce273ef3642464480e75d55fabb82 Mon Sep 17 00:00:00 2001 From: Bryce Soghigian <49734722+Bryce-Soghigian@users.noreply.github.com> Date: Mon, 27 Nov 2023 22:21:32 -0800 Subject: [PATCH 02/15] style: changing capacity to be quota Co-authored-by: Alex Leites <18728999+tallaxes@users.noreply.github.com> --- pkg/providers/instance/instance.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index af5e9b35e..504016d29 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -503,7 +503,7 @@ func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corec if sdkerrors.RegionalQuotaHasBeenReached(err) { logging.FromContext(ctx).Error(err) //nolint:stylecheck // Ignore ST1005: error strings should not be capitalized. This error message will pop up in the machine CRD and is intended to be read directly by the customer - return corecloudprovider.NewInsufficientCapacityError(errors.New("The regional capacity limit for your subscription has been reached. To scale beyond this limit, please review the quota increase process here: https://learn.microsoft.com/en-us/azure/quotas/regional-quota-requests")) + return corecloudprovider.NewInsufficientCapacityError(errors.New("regional quota limit for subscription has been reached. To scale beyond this limit, please review the quota increase process here: https://learn.microsoft.com/en-us/azure/quotas/regional-quota-requests")) } return err } From ea2aba63be7ae14b99e4d25caa90ea178e0108a6 Mon Sep 17 00:00:00 2001 From: Bryce Soghigian Date: Mon, 27 Nov 2023 22:26:27 -0800 Subject: [PATCH 03/15] style: changing error messages to not reference karpenter directly --- pkg/providers/instance/instance.go | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index 504016d29..ecca3f037 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -461,9 +461,7 @@ func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corec for _, offering := range instanceType.Offerings { if offering.CapacityType != capacityType { continue - } - - // If we have a quota limit of 0 vcpus, we mark the offerings unavailable for an hour. + } // If we have a quota limit of 0 vcpus, we mark the offerings unavailable for an hour. // CPU limits of 0 are usually due to a subscription having no allocated quota for that instance type at all on the subscription. if cpuLimitIsZero(err) { p.unavailableOfferings.MarkUnavailableWithTTL(ctx, SubscriptionQuotaReachedReason, instanceType.Name, offering.Zone, capacityType, SubscriptionQuotaReachedTTL) @@ -472,7 +470,8 @@ func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corec } } return fmt.Errorf("subscription level quota for %s has been reached (may try provision an alternative instance type)", instanceType.Name) - } else if isSKUNotAvailable(err) { + } + if isSKUNotAvailable(err) { // https://aka.ms/azureskunotavailable: either not available for a location or zone, or out of capacity for Spot. // We only expect to observe the Spot case, not location or zone restrictions, because: // - SKUs with location restriction are already filtered out via sku.HasLocationRestriction @@ -497,8 +496,7 @@ func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corec p.unavailableOfferings.MarkUnavailable(ctx, ZonalAllocationFailureReason, instanceType.Name, zone, corev1beta1.CapacityTypeOnDemand) p.unavailableOfferings.MarkUnavailable(ctx, ZonalAllocationFailureReason, instanceType.Name, zone, corev1beta1.CapacityTypeSpot) - //nolint:stylecheck // Ignore ST1005: error strings should not be capitalized. This error message will pop up in the machine CRD and is intended to be read directly by the customer - return fmt.Errorf("We're currently unable to allocate resources in the selected zone (%s). Karpenter will try a different zone to fulfill your request.", zone) + return fmt.Errorf("unable to allocate resources in the selected zone (%s). (will try a different zone to fulfill your request)", zone) } if sdkerrors.RegionalQuotaHasBeenReached(err) { logging.FromContext(ctx).Error(err) From 3c900066303c6163c794d77356d0f4564e711f6f Mon Sep 17 00:00:00 2001 From: Bryce Soghigian Date: Fri, 1 Dec 2023 01:28:27 -0800 Subject: [PATCH 04/15] fix: removing wrapped error type --- pkg/cloudprovider/cloudprovider.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/cloudprovider/cloudprovider.go b/pkg/cloudprovider/cloudprovider.go index a1be0a5ff..4783a88bf 100644 --- a/pkg/cloudprovider/cloudprovider.go +++ b/pkg/cloudprovider/cloudprovider.go @@ -103,7 +103,7 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *corev1beta1.NodeC } instance, err := c.instanceProvider.Create(ctx, nodeClass, nodeClaim, instanceTypes) if err != nil { - return nil, fmt.Errorf("creating instance, %w", err) + return nil, err } instanceType, _ := lo.Find(instanceTypes, func(i *cloudprovider.InstanceType) bool { return i.Name == string(*instance.Properties.HardwareProfile.VMSize) From 5cea5ae575bb629bb56fac13e5bea1caa0d8b1e1 Mon Sep 17 00:00:00 2001 From: Bryce Soghigian Date: Fri, 1 Dec 2023 01:29:19 -0800 Subject: [PATCH 05/15] test(sdkerrors): testing SKUFamily errors and TotalRegional cores --- pkg/providers/instancetype/suite_test.go | 83 +++++++++++++++++++++++- 1 file changed, 81 insertions(+), 2 deletions(-) diff --git a/pkg/providers/instancetype/suite_test.go b/pkg/providers/instancetype/suite_test.go index e5b3416c6..63594edf3 100644 --- a/pkg/providers/instancetype/suite_test.go +++ b/pkg/providers/instancetype/suite_test.go @@ -17,9 +17,12 @@ limitations under the License. package instancetype_test import ( + "bytes" "context" "encoding/base64" "fmt" + "io" + "net/http" "os" "strings" "testing" @@ -48,7 +51,9 @@ import ( coretest "github.com/aws/karpenter-core/pkg/test" . "github.com/aws/karpenter-core/pkg/test/expectations" + sdkerrors "github.com/Azure/azure-sdk-for-go-extensions/pkg/errors" "github.com/Azure/azure-sdk-for-go/sdk/azcore" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute" "github.com/Azure/karpenter/pkg/apis" "github.com/Azure/karpenter/pkg/apis/settings" @@ -80,7 +85,6 @@ func TestAzure(t *testing.T) { var _ = BeforeSuite(func() { ctx = coreoptions.ToContext(ctx, coretest.Options()) - // ctx = options.ToContext(ctx, test.Options()) ctx = settings.ToContext(ctx, test.Settings()) env = coretest.NewEnvironment(scheme.Scheme, coretest.WithCRDs(apis.CRDs...)) @@ -133,6 +137,77 @@ var _ = Describe("InstanceType Provider", func() { ExpectCleanedUp(ctx, env.Client) }) + Context("subscription level quota error responses", func() { + It("should fail to provsion when SKUFamily error is returned, and succeed when its gone", func() { + errorMessageHittingLimit := "Operation could not be completed as it results in exceeding approved standardDLSv5Family Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 100, Current Usage: 96, Additional Required: 32, (Minimum) New Limit Required: 128. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22standardDLSv5Family%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:128,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22standardDLSv5Family%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests" + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + azureEnv.VirtualMachinesAPI.VirtualMachinesBehavior.VirtualMachineCreateOrUpdateBehavior.Error.Set( + &azcore.ResponseError{ + ErrorCode: sdkerrors.OperationNotAllowed, + RawResponse: &http.Response{ + Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, errorMessageHittingLimit), + }, + }, + ) + pod := coretest.UnschedulablePod() + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) + ExpectNotScheduled(ctx, env.Client, pod) + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) + pod = coretest.UnschedulablePod() + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) + ExpectScheduled(ctx, env.Client, pod) + }) + It("should fail to provsion when SKUFamily quota limit is zero, and succeed when its gone", func() { + errorMessageHittingLimit := "Operation could not be completed as it results in exceeding approved standardDLSv5Family Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 0, Current Usage: 0, Additional Required: 32, (Minimum) New Limit Required: 32. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22standardDLSv5Family%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:128,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22standardDLSv5Family%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests" + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + azureEnv.VirtualMachinesAPI.VirtualMachinesBehavior.VirtualMachineCreateOrUpdateBehavior.Error.Set( + &azcore.ResponseError{ + ErrorCode: sdkerrors.OperationNotAllowed, + RawResponse: &http.Response{ + Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, errorMessageHittingLimit), + }, + }, + ) + pod := coretest.UnschedulablePod() + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) + ExpectNotScheduled(ctx, env.Client, pod) + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) + pod = coretest.UnschedulablePod() + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) + ExpectScheduled(ctx, env.Client, pod) + }) + + It("should return ICE if Total Regional Cores Quota errors are hit", func() { + regionalCoresErrorMessage := "Operation could not be completed as it results in exceeding approved Total Regional Cores quota. Additional details - Deployment Model: Resource Manager, Location: uksouth, Current Limit: 100, Current Usage: 100, Additional Required: 64, (Minimum) New Limit Required: 164. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22uksouth%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22cores%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:164,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22cores%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/regional-quota-requests" + azureEnv.VirtualMachinesAPI.VirtualMachinesBehavior.VirtualMachineCreateOrUpdateBehavior.Error.Set( + &azcore.ResponseError{ + ErrorCode: sdkerrors.OperationNotAllowed, + RawResponse: &http.Response{ + Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, regionalCoresErrorMessage), + }, + }, + ) + + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + nodeClaim := coretest.NodeClaim(corev1beta1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + corev1beta1.NodePoolLabelKey: nodePool.Name, + }, + }, + Spec: corev1beta1.NodeClaimSpec{ + NodeClassRef: &corev1beta1.NodeClassReference{ + Name: nodeClass.Name, + }, + }, + }) + claim, err := cloudProvider.Create(ctx, nodeClaim) + Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue()) + Expect(claim).To(BeNil()) + + }) + }) + Context("Filtering in InstanceType Provider List", func() { var instanceTypes corecloudprovider.InstanceTypes var err error @@ -230,7 +305,7 @@ var _ = Describe("InstanceType Provider", func() { Name: nodeClass.Name, } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectApplied(ctx, env.Client, np, nodeClass) pod := coretest.UnschedulablePod() ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) ExpectScheduled(ctx, env.Client, pod) @@ -946,3 +1021,7 @@ var _ = Describe("Tax Calculator", func() { }) }) }) + +func createSDKErrorBody(code, message string) io.ReadCloser { + return io.NopCloser(bytes.NewReader([]byte(fmt.Sprintf(`{"error":{"code": "%s", "message": "%s"}}`, code, message)))) +} From 089d0a77feba074e22dfc4457289e790c3499322 Mon Sep 17 00:00:00 2001 From: Bryce Soghigian Date: Fri, 1 Dec 2023 01:51:09 -0800 Subject: [PATCH 06/15] style: gofmt style: removing lint exception rule --- pkg/providers/instance/instance.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index ecca3f037..5e0d188b8 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -470,7 +470,7 @@ func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corec } } return fmt.Errorf("subscription level quota for %s has been reached (may try provision an alternative instance type)", instanceType.Name) - } + } if isSKUNotAvailable(err) { // https://aka.ms/azureskunotavailable: either not available for a location or zone, or out of capacity for Spot. // We only expect to observe the Spot case, not location or zone restrictions, because: @@ -500,7 +500,6 @@ func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corec } if sdkerrors.RegionalQuotaHasBeenReached(err) { logging.FromContext(ctx).Error(err) - //nolint:stylecheck // Ignore ST1005: error strings should not be capitalized. This error message will pop up in the machine CRD and is intended to be read directly by the customer return corecloudprovider.NewInsufficientCapacityError(errors.New("regional quota limit for subscription has been reached. To scale beyond this limit, please review the quota increase process here: https://learn.microsoft.com/en-us/azure/quotas/regional-quota-requests")) } return err From 2a125582319b0c79f46be06d7d481757e90c43a3 Mon Sep 17 00:00:00 2001 From: Bryce Soghigian <49734722+Bryce-Soghigian@users.noreply.github.com> Date: Fri, 1 Dec 2023 13:47:45 -0800 Subject: [PATCH 07/15] fix: pkg/providers/instancetype/suite_test.go Co-authored-by: Alex Leites <18728999+tallaxes@users.noreply.github.com> --- pkg/providers/instancetype/suite_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/providers/instancetype/suite_test.go b/pkg/providers/instancetype/suite_test.go index 63594edf3..4d4a2eab3 100644 --- a/pkg/providers/instancetype/suite_test.go +++ b/pkg/providers/instancetype/suite_test.go @@ -138,7 +138,7 @@ var _ = Describe("InstanceType Provider", func() { }) Context("subscription level quota error responses", func() { - It("should fail to provsion when SKUFamily error is returned, and succeed when its gone", func() { + It("should fail to provision when VM SKU family vCPU quota exceeded error is returned, and succeed when it is gone", func() { errorMessageHittingLimit := "Operation could not be completed as it results in exceeding approved standardDLSv5Family Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 100, Current Usage: 96, Additional Required: 32, (Minimum) New Limit Required: 128. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22standardDLSv5Family%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:128,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22standardDLSv5Family%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests" ExpectApplied(ctx, env.Client, nodePool, nodeClass) azureEnv.VirtualMachinesAPI.VirtualMachinesBehavior.VirtualMachineCreateOrUpdateBehavior.Error.Set( From a5a3f520557d47fc7631e9de1e4eb7d61dd6ce3e Mon Sep 17 00:00:00 2001 From: Bryce Soghigian <49734722+Bryce-Soghigian@users.noreply.github.com> Date: Fri, 1 Dec 2023 13:52:03 -0800 Subject: [PATCH 08/15] Update pkg/providers/instance/instance.go Co-authored-by: Alex Leites <18728999+tallaxes@users.noreply.github.com> --- pkg/providers/instance/instance.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index 5e0d188b8..36f818ffe 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -454,7 +454,7 @@ func isSKUNotAvailable(err error) bool { func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corecloudprovider.InstanceType, zone, capacityType string, err error) error { if sdkerrors.SKUFamilyQuotaHasBeenReached(err) { - // Subscription quota has reached for this vm sku, mark the instance type as unavailable in all zones available to the offering + // Subscription quota has been reached for this VM SKU, mark the instance type as unavailable in all zones available to the offering // This will also update the TTL for an existing offering in the cache that is already unavailable logging.FromContext(ctx).Error(err) From 4af9061994775e90558ad13d8b435b2a86a32a1d Mon Sep 17 00:00:00 2001 From: Bryce Soghigian <49734722+Bryce-Soghigian@users.noreply.github.com> Date: Fri, 1 Dec 2023 13:55:14 -0800 Subject: [PATCH 09/15] Update pkg/providers/instance/instance.go Co-authored-by: Alex Leites <18728999+tallaxes@users.noreply.github.com> --- pkg/providers/instance/instance.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index 36f818ffe..70d1cbddf 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -469,7 +469,7 @@ func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corec p.unavailableOfferings.MarkUnavailable(ctx, SubscriptionQuotaReachedReason, instanceType.Name, offering.Zone, capacityType) } } - return fmt.Errorf("subscription level quota for %s has been reached (may try provision an alternative instance type)", instanceType.Name) + return fmt.Errorf("subscription level %s vCPU quota for %s has been reached (may try provision an alternative instance type)", capacityType, instanceType.Name) } if isSKUNotAvailable(err) { // https://aka.ms/azureskunotavailable: either not available for a location or zone, or out of capacity for Spot. From 10e560a0653a3c5ab9dfdf5b0220eaab1d8e5510 Mon Sep 17 00:00:00 2001 From: Bryce Soghigian <49734722+Bryce-Soghigian@users.noreply.github.com> Date: Fri, 1 Dec 2023 13:55:42 -0800 Subject: [PATCH 10/15] Update pkg/providers/instance/instance.go Co-authored-by: Alex Leites <18728999+tallaxes@users.noreply.github.com> --- pkg/providers/instance/instance.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index 70d1cbddf..b14b44f5d 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -500,7 +500,8 @@ func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corec } if sdkerrors.RegionalQuotaHasBeenReached(err) { logging.FromContext(ctx).Error(err) - return corecloudprovider.NewInsufficientCapacityError(errors.New("regional quota limit for subscription has been reached. To scale beyond this limit, please review the quota increase process here: https://learn.microsoft.com/en-us/azure/quotas/regional-quota-requests")) + // InsufficientCapacityError is appropriate here because trying any other instance type will not help + return corecloudprovider.NewInsufficientCapacityError(fmt.Errorf("regional %s vCPU quota limit for subscription has been reached. To scale beyond this limit, please review the quota increase process here: https://learn.microsoft.com/en-us/azure/quotas/regional-quota-requests", capacityType)) } return err } From 45a2ed3215dd75ec527be37871dc3bd87dff18cd Mon Sep 17 00:00:00 2001 From: Bryce Soghigian Date: Fri, 1 Dec 2023 14:00:28 -0800 Subject: [PATCH 11/15] fix: nit pr feedback --- pkg/providers/instance/instance.go | 3 ++- pkg/providers/instancetype/suite_test.go | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index b14b44f5d..a5985e9b3 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -461,7 +461,8 @@ func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corec for _, offering := range instanceType.Offerings { if offering.CapacityType != capacityType { continue - } // If we have a quota limit of 0 vcpus, we mark the offerings unavailable for an hour. + } + // If we have a quota limit of 0 vcpus, we mark the offerings unavailable for an hour. // CPU limits of 0 are usually due to a subscription having no allocated quota for that instance type at all on the subscription. if cpuLimitIsZero(err) { p.unavailableOfferings.MarkUnavailableWithTTL(ctx, SubscriptionQuotaReachedReason, instanceType.Name, offering.Zone, capacityType, SubscriptionQuotaReachedTTL) diff --git a/pkg/providers/instancetype/suite_test.go b/pkg/providers/instancetype/suite_test.go index 4d4a2eab3..d28879704 100644 --- a/pkg/providers/instancetype/suite_test.go +++ b/pkg/providers/instancetype/suite_test.go @@ -139,13 +139,13 @@ var _ = Describe("InstanceType Provider", func() { Context("subscription level quota error responses", func() { It("should fail to provision when VM SKU family vCPU quota exceeded error is returned, and succeed when it is gone", func() { - errorMessageHittingLimit := "Operation could not be completed as it results in exceeding approved standardDLSv5Family Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 100, Current Usage: 96, Additional Required: 32, (Minimum) New Limit Required: 128. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22standardDLSv5Family%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:128,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22standardDLSv5Family%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests" + familyVCPUQuotaExceededErrorMessage := "Operation could not be completed as it results in exceeding approved standardDLSv5Family Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 100, Current Usage: 96, Additional Required: 32, (Minimum) New Limit Required: 128. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22standardDLSv5Family%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:128,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22standardDLSv5Family%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests" ExpectApplied(ctx, env.Client, nodePool, nodeClass) azureEnv.VirtualMachinesAPI.VirtualMachinesBehavior.VirtualMachineCreateOrUpdateBehavior.Error.Set( &azcore.ResponseError{ ErrorCode: sdkerrors.OperationNotAllowed, RawResponse: &http.Response{ - Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, errorMessageHittingLimit), + Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, familyVCPUQuotaExceededErrorMessage), }, }, ) @@ -158,13 +158,13 @@ var _ = Describe("InstanceType Provider", func() { ExpectScheduled(ctx, env.Client, pod) }) It("should fail to provsion when SKUFamily quota limit is zero, and succeed when its gone", func() { - errorMessageHittingLimit := "Operation could not be completed as it results in exceeding approved standardDLSv5Family Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 0, Current Usage: 0, Additional Required: 32, (Minimum) New Limit Required: 32. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22standardDLSv5Family%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:128,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22standardDLSv5Family%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests" + familyVCPUQuotaIsZeroErrorMessage := "Operation could not be completed as it results in exceeding approved standardDLSv5Family Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 0, Current Usage: 0, Additional Required: 32, (Minimum) New Limit Required: 32. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22standardDLSv5Family%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:128,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22standardDLSv5Family%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests" ExpectApplied(ctx, env.Client, nodePool, nodeClass) azureEnv.VirtualMachinesAPI.VirtualMachinesBehavior.VirtualMachineCreateOrUpdateBehavior.Error.Set( &azcore.ResponseError{ ErrorCode: sdkerrors.OperationNotAllowed, RawResponse: &http.Response{ - Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, errorMessageHittingLimit), + Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, familyVCPUQuotaIsZeroErrorMessage), }, }, ) @@ -178,12 +178,12 @@ var _ = Describe("InstanceType Provider", func() { }) It("should return ICE if Total Regional Cores Quota errors are hit", func() { - regionalCoresErrorMessage := "Operation could not be completed as it results in exceeding approved Total Regional Cores quota. Additional details - Deployment Model: Resource Manager, Location: uksouth, Current Limit: 100, Current Usage: 100, Additional Required: 64, (Minimum) New Limit Required: 164. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22uksouth%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22cores%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:164,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22cores%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/regional-quota-requests" + regionalVCPUQuotaExceededErrorMessage := "Operation could not be completed as it results in exceeding approved Total Regional Cores quota. Additional details - Deployment Model: Resource Manager, Location: uksouth, Current Limit: 100, Current Usage: 100, Additional Required: 64, (Minimum) New Limit Required: 164. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22uksouth%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22cores%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:164,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22cores%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/regional-quota-requests" azureEnv.VirtualMachinesAPI.VirtualMachinesBehavior.VirtualMachineCreateOrUpdateBehavior.Error.Set( &azcore.ResponseError{ ErrorCode: sdkerrors.OperationNotAllowed, RawResponse: &http.Response{ - Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, regionalCoresErrorMessage), + Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, regionalVCPUQuotaExceededErrorMessage), }, }, ) From 82727f7b14e5873e114aefa0c4728b4cc8671bc5 Mon Sep 17 00:00:00 2001 From: Bryce Soghigian Date: Fri, 1 Dec 2023 14:00:55 -0800 Subject: [PATCH 12/15] fix: adding back error wrap --- pkg/cloudprovider/cloudprovider.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/cloudprovider/cloudprovider.go b/pkg/cloudprovider/cloudprovider.go index 4783a88bf..a1be0a5ff 100644 --- a/pkg/cloudprovider/cloudprovider.go +++ b/pkg/cloudprovider/cloudprovider.go @@ -103,7 +103,7 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *corev1beta1.NodeC } instance, err := c.instanceProvider.Create(ctx, nodeClass, nodeClaim, instanceTypes) if err != nil { - return nil, err + return nil, fmt.Errorf("creating instance, %w", err) } instanceType, _ := lo.Find(instanceTypes, func(i *cloudprovider.InstanceType) bool { return i.Name == string(*instance.Properties.HardwareProfile.VMSize) From a6b1080044d187dfdd8b4d573f089dd4921687e5 Mon Sep 17 00:00:00 2001 From: Bryce Soghigian Date: Fri, 1 Dec 2023 14:19:12 -0800 Subject: [PATCH 13/15] feat: updating SKUUnavailable to follow the new pattern --- pkg/providers/instance/instance.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index a5985e9b3..7234aab1a 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -19,7 +19,6 @@ package instance import ( "context" "encoding/json" - "errors" "fmt" "math" "sort" @@ -490,7 +489,9 @@ func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corec } p.unavailableOfferings.MarkUnavailableWithTTL(ctx, SKUNotAvailableReason, instanceType.Name, offering.Zone, capacityType, skuNotAvailableTTL) } - return err + + logging.FromContext(ctx).Error(err) + return fmt.Errorf("the requested SKU is unavailable for instance type %s in zone %s with capacity type %s, for more details please visit: https://aka.ms/azureskunotavailable", instanceType.Name, zone, capacityType) } if sdkerrors.ZonalAllocationFailureOccurred(err) { logging.FromContext(ctx).With("zone", zone).Error(err) From 34fde1ff4e8eacc173d22780ae2b36097133712d Mon Sep 17 00:00:00 2001 From: Bryce Soghigian Date: Fri, 1 Dec 2023 14:22:48 -0800 Subject: [PATCH 14/15] style: gofmt --- pkg/providers/instance/instance.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index 7234aab1a..dee3a778f 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -460,7 +460,7 @@ func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corec for _, offering := range instanceType.Offerings { if offering.CapacityType != capacityType { continue - } + } // If we have a quota limit of 0 vcpus, we mark the offerings unavailable for an hour. // CPU limits of 0 are usually due to a subscription having no allocated quota for that instance type at all on the subscription. if cpuLimitIsZero(err) { From f69403682c980e6abd6a5946881007fb3e377c6e Mon Sep 17 00:00:00 2001 From: Bryce Soghigian <49734722+Bryce-Soghigian@users.noreply.github.com> Date: Fri, 1 Dec 2023 14:25:06 -0800 Subject: [PATCH 15/15] Update pkg/providers/instancetype/suite_test.go Co-authored-by: Alex Leites <18728999+tallaxes@users.noreply.github.com> --- pkg/providers/instancetype/suite_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/providers/instancetype/suite_test.go b/pkg/providers/instancetype/suite_test.go index d28879704..49eb0bb45 100644 --- a/pkg/providers/instancetype/suite_test.go +++ b/pkg/providers/instancetype/suite_test.go @@ -157,7 +157,7 @@ var _ = Describe("InstanceType Provider", func() { ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) ExpectScheduled(ctx, env.Client, pod) }) - It("should fail to provsion when SKUFamily quota limit is zero, and succeed when its gone", func() { + It("should fail to provision when VM SKU family vCPU quota limit is zero, and succeed when its gone", func() { familyVCPUQuotaIsZeroErrorMessage := "Operation could not be completed as it results in exceeding approved standardDLSv5Family Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 0, Current Usage: 0, Additional Required: 32, (Minimum) New Limit Required: 32. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22standardDLSv5Family%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:128,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22standardDLSv5Family%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests" ExpectApplied(ctx, env.Client, nodePool, nodeClass) azureEnv.VirtualMachinesAPI.VirtualMachinesBehavior.VirtualMachineCreateOrUpdateBehavior.Error.Set(