diff --git a/go.mod b/go.mod index a1aa0304b..498bdbacd 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.21 require ( github.com/Azure/azure-kusto-go v0.14.0 github.com/Azure/azure-sdk-for-go v68.0.0+incompatible - github.com/Azure/azure-sdk-for-go-extensions v0.1.3 + github.com/Azure/azure-sdk-for-go-extensions v0.1.4 github.com/Azure/azure-sdk-for-go/sdk/azcore v1.9.0 github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.3.1 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute v1.0.0 diff --git a/go.sum b/go.sum index e26371db8..5d7de1b02 100644 --- a/go.sum +++ b/go.sum @@ -39,8 +39,8 @@ github.com/Azure/azure-kusto-go v0.14.0 h1:5XVmjh5kVgsm2scpsWisJ6Q1ZgWHJcIOPCZC1 github.com/Azure/azure-kusto-go v0.14.0/go.mod h1:wSmXIsQwBVPHDNsSQsX98nuc12VyvxoNHQa2q9t1Ce0= github.com/Azure/azure-sdk-for-go v68.0.0+incompatible h1:fcYLmCpyNYRnvJbPerq7U0hS+6+I79yEDJBqVNcqUzU= github.com/Azure/azure-sdk-for-go v68.0.0+incompatible/go.mod h1:9XXNKU+eRnpl9moKnB4QOLf1HestfXbmab5FXxiDBjc= -github.com/Azure/azure-sdk-for-go-extensions v0.1.3 h1:hDHVEvhqVufFPmD5b+xRjtGXSQTMEia7S/xntltnG44= -github.com/Azure/azure-sdk-for-go-extensions v0.1.3/go.mod h1:dJfn8QUzuvyO4hGZ8pkROwd7/VQzDG8ER2SRk+V0afY= +github.com/Azure/azure-sdk-for-go-extensions v0.1.4 h1:XNT7IWmj4u3AfSag3t2mFupHT59J58pknX+daqprjm8= +github.com/Azure/azure-sdk-for-go-extensions v0.1.4/go.mod h1:dJfn8QUzuvyO4hGZ8pkROwd7/VQzDG8ER2SRk+V0afY= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.9.0 h1:fb8kj/Dh4CSwgsOzHeZY4Xh68cFVbzXx+ONXGMY//4w= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.9.0/go.mod h1:uReU2sSxZExRPBAg3qKzmAucSi51+SP1OhohieR821Q= github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.3.1 h1:LNHhpdK7hzUcx/k1LIcuh5k7k1LGIWLQfCjaneSj7Fc= diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index 31b07f755..84ef1fa72 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -452,14 +452,15 @@ func isSKUNotAvailable(err error) 
bool { } func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corecloudprovider.InstanceType, zone, capacityType string, err error) error { - if sdkerrors.SubscriptionQuotaHasBeenReached(err) { - // Subscription quota reached, mark the instance type as unavailable in all zones available to the offering + if sdkerrors.SKUFamilyQuotaHasBeenReached(err) { + // Subscription quota has been reached for this VM SKU, mark the instance type as unavailable in all zones available to the offering // This will also update the TTL for an existing offering in the cache that is already unavailable + + logging.FromContext(ctx).Error(err) for _, offering := range instanceType.Offerings { if offering.CapacityType != capacityType { continue } - // If we have a quota limit of 0 vcpus, we mark the offerings unavailable for an hour. // CPU limits of 0 are usually due to a subscription having no allocated quota for that instance type at all on the subscription. if cpuLimitIsZero(err) { @@ -468,10 +469,9 @@ func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corec p.unavailableOfferings.MarkUnavailable(ctx, SubscriptionQuotaReachedReason, instanceType.Name, offering.Zone, capacityType) } } - } else if sdkerrors.ZonalAllocationFailureOccurred(err) { - p.unavailableOfferings.MarkUnavailable(ctx, ZonalAllocationFailureReason, instanceType.Name, zone, corev1beta1.CapacityTypeOnDemand) - p.unavailableOfferings.MarkUnavailable(ctx, ZonalAllocationFailureReason, instanceType.Name, zone, corev1beta1.CapacityTypeSpot) - } else if isSKUNotAvailable(err) { + return fmt.Errorf("subscription level %s vCPU quota for %s has been reached (may try provision an alternative instance type)", capacityType, instanceType.Name) + } + if isSKUNotAvailable(err) { // https://aka.ms/azureskunotavailable: either not available for a location or zone, or out of capacity for Spot. 
// We only expect to observe the Spot case, not location or zone restrictions, because: // - SKUs with location restriction are already filtered out via sku.HasLocationRestriction @@ -489,6 +489,21 @@ func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corec } p.unavailableOfferings.MarkUnavailableWithTTL(ctx, SKUNotAvailableReason, instanceType.Name, offering.Zone, capacityType, skuNotAvailableTTL) } + + logging.FromContext(ctx).Error(err) + return fmt.Errorf("the requested SKU is unavailable for instance type %s in zone %s with capacity type %s, for more details please visit: https://aka.ms/azureskunotavailable", instanceType.Name, zone, capacityType) + } + if sdkerrors.ZonalAllocationFailureOccurred(err) { + logging.FromContext(ctx).With("zone", zone).Error(err) + p.unavailableOfferings.MarkUnavailable(ctx, ZonalAllocationFailureReason, instanceType.Name, zone, corev1beta1.CapacityTypeOnDemand) + p.unavailableOfferings.MarkUnavailable(ctx, ZonalAllocationFailureReason, instanceType.Name, zone, corev1beta1.CapacityTypeSpot) + + return fmt.Errorf("unable to allocate resources in the selected zone (%s). (will try a different zone to fulfill your request)", zone) + } + if sdkerrors.RegionalQuotaHasBeenReached(err) { + logging.FromContext(ctx).Error(err) + // InsufficientCapacityError is appropriate here because trying any other instance type will not help + return corecloudprovider.NewInsufficientCapacityError(fmt.Errorf("regional %s vCPU quota limit for subscription has been reached. To scale beyond this limit, please review the quota increase process here: https://learn.microsoft.com/en-us/azure/quotas/regional-quota-requests", capacityType)) } return err } diff --git a/pkg/providers/instancetype/suite_test.go b/pkg/providers/instancetype/suite_test.go index e5b3416c6..49eb0bb45 100644 --- a/pkg/providers/instancetype/suite_test.go +++ b/pkg/providers/instancetype/suite_test.go @@ -17,9 +17,12 @@ limitations under the License. 
package instancetype_test import ( + "bytes" "context" "encoding/base64" "fmt" + "io" + "net/http" "os" "strings" "testing" @@ -48,7 +51,9 @@ import ( coretest "github.com/aws/karpenter-core/pkg/test" . "github.com/aws/karpenter-core/pkg/test/expectations" + sdkerrors "github.com/Azure/azure-sdk-for-go-extensions/pkg/errors" "github.com/Azure/azure-sdk-for-go/sdk/azcore" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute" "github.com/Azure/karpenter/pkg/apis" "github.com/Azure/karpenter/pkg/apis/settings" @@ -80,7 +85,6 @@ func TestAzure(t *testing.T) { var _ = BeforeSuite(func() { ctx = coreoptions.ToContext(ctx, coretest.Options()) - // ctx = options.ToContext(ctx, test.Options()) ctx = settings.ToContext(ctx, test.Settings()) env = coretest.NewEnvironment(scheme.Scheme, coretest.WithCRDs(apis.CRDs...)) @@ -133,6 +137,77 @@ var _ = Describe("InstanceType Provider", func() { ExpectCleanedUp(ctx, env.Client) }) + Context("subscription level quota error responses", func() { + It("should fail to provision when VM SKU family vCPU quota exceeded error is returned, and succeed when it is gone", func() { + familyVCPUQuotaExceededErrorMessage := "Operation could not be completed as it results in exceeding approved standardDLSv5Family Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 100, Current Usage: 96, Additional Required: 32, (Minimum) New Limit Required: 128. 
Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22standardDLSv5Family%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:128,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22standardDLSv5Family%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests" + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + azureEnv.VirtualMachinesAPI.VirtualMachinesBehavior.VirtualMachineCreateOrUpdateBehavior.Error.Set( + &azcore.ResponseError{ + ErrorCode: sdkerrors.OperationNotAllowed, + RawResponse: &http.Response{ + Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, familyVCPUQuotaExceededErrorMessage), + }, + }, + ) + pod := coretest.UnschedulablePod() + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) + ExpectNotScheduled(ctx, env.Client, pod) + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) + pod = coretest.UnschedulablePod() + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) + ExpectScheduled(ctx, env.Client, pod) + }) + It("should fail to provision when VM SKU family vCPU quota limit is zero, and succeed when its gone", func() { + familyVCPUQuotaIsZeroErrorMessage := "Operation could not be completed as it results in exceeding approved standardDLSv5Family Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 0, Current Usage: 0, Additional Required: 32, (Minimum) New Limit Required: 32. 
Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22standardDLSv5Family%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:128,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22standardDLSv5Family%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests" + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + azureEnv.VirtualMachinesAPI.VirtualMachinesBehavior.VirtualMachineCreateOrUpdateBehavior.Error.Set( + &azcore.ResponseError{ + ErrorCode: sdkerrors.OperationNotAllowed, + RawResponse: &http.Response{ + Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, familyVCPUQuotaIsZeroErrorMessage), + }, + }, + ) + pod := coretest.UnschedulablePod() + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) + ExpectNotScheduled(ctx, env.Client, pod) + azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil) + pod = coretest.UnschedulablePod() + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) + ExpectScheduled(ctx, env.Client, pod) + }) + + It("should return ICE if Total Regional Cores Quota errors are hit", func() { + regionalVCPUQuotaExceededErrorMessage := "Operation could not be completed as it results in exceeding approved Total Regional Cores quota. Additional details - Deployment Model: Resource Manager, Location: uksouth, Current Limit: 100, Current Usage: 100, Additional Required: 64, (Minimum) New Limit Required: 164. 
Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22uksouth%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22cores%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:164,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22cores%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/regional-quota-requests" + azureEnv.VirtualMachinesAPI.VirtualMachinesBehavior.VirtualMachineCreateOrUpdateBehavior.Error.Set( + &azcore.ResponseError{ + ErrorCode: sdkerrors.OperationNotAllowed, + RawResponse: &http.Response{ + Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, regionalVCPUQuotaExceededErrorMessage), + }, + }, + ) + + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + nodeClaim := coretest.NodeClaim(corev1beta1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + corev1beta1.NodePoolLabelKey: nodePool.Name, + }, + }, + Spec: corev1beta1.NodeClaimSpec{ + NodeClassRef: &corev1beta1.NodeClassReference{ + Name: nodeClass.Name, + }, + }, + }) + claim, err := cloudProvider.Create(ctx, nodeClaim) + Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue()) + Expect(claim).To(BeNil()) + + }) + }) + Context("Filtering in InstanceType Provider List", func() { var instanceTypes corecloudprovider.InstanceTypes var err error @@ -230,7 +305,7 @@ var _ = Describe("InstanceType Provider", func() { Name: nodeClass.Name, } - ExpectApplied(ctx, env.Client, nodePool, nodeClass) + ExpectApplied(ctx, env.Client, np, nodeClass) pod := coretest.UnschedulablePod() ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) ExpectScheduled(ctx, env.Client, pod) @@ 
// sdkErrorFieldEscaper escapes the characters that would otherwise terminate
// or corrupt a JSON string literal when a value is spliced into a template:
// backslash, double quote, newline, carriage return and tab. Other control
// characters are passed through unchanged.
var sdkErrorFieldEscaper = strings.NewReplacer(
	`\`, `\\`,
	`"`, `\"`,
	"\n", `\n`,
	"\r", `\r`,
	"\t", `\t`,
)

// createSDKErrorBody builds an in-memory response body of the form
//
//	{"error":{"code": "<code>", "message": "<message>"}}
//
// and wraps it in a no-op closer so it can be assigned to http.Response.Body.
//
// code and message are escaped before being inserted, so quotes, backslashes
// or line breaks in either value still yield a well-formed JSON document
// instead of a truncated or unparsable one. Values without such characters
// produce exactly the same bytes as plain interpolation would.
func createSDKErrorBody(code, message string) io.ReadCloser {
	payload := fmt.Sprintf(
		`{"error":{"code": "%s", "message": "%s"}}`,
		sdkErrorFieldEscaper.Replace(code),
		sdkErrorFieldEscaper.Replace(message),
	)
	return io.NopCloser(bytes.NewReader([]byte(payload)))
}