Skip to content

Commit

Permalink
feat: supporting additional handing for TotalRegionalCores quota erro…
Browse files Browse the repository at this point in the history
…r, and adding better customer facing error messages on nodeclaim (#45)
  • Loading branch information
Bryce-Soghigian authored Dec 2, 2023
1 parent b58470a commit 20da22d
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 12 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ go 1.21
require (
github.com/Azure/azure-kusto-go v0.14.0
github.com/Azure/azure-sdk-for-go v68.0.0+incompatible
github.com/Azure/azure-sdk-for-go-extensions v0.1.3
github.com/Azure/azure-sdk-for-go-extensions v0.1.4
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.9.0
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.3.1
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute v1.0.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ github.com/Azure/azure-kusto-go v0.14.0 h1:5XVmjh5kVgsm2scpsWisJ6Q1ZgWHJcIOPCZC1
github.com/Azure/azure-kusto-go v0.14.0/go.mod h1:wSmXIsQwBVPHDNsSQsX98nuc12VyvxoNHQa2q9t1Ce0=
github.com/Azure/azure-sdk-for-go v68.0.0+incompatible h1:fcYLmCpyNYRnvJbPerq7U0hS+6+I79yEDJBqVNcqUzU=
github.com/Azure/azure-sdk-for-go v68.0.0+incompatible/go.mod h1:9XXNKU+eRnpl9moKnB4QOLf1HestfXbmab5FXxiDBjc=
github.com/Azure/azure-sdk-for-go-extensions v0.1.3 h1:hDHVEvhqVufFPmD5b+xRjtGXSQTMEia7S/xntltnG44=
github.com/Azure/azure-sdk-for-go-extensions v0.1.3/go.mod h1:dJfn8QUzuvyO4hGZ8pkROwd7/VQzDG8ER2SRk+V0afY=
github.com/Azure/azure-sdk-for-go-extensions v0.1.4 h1:XNT7IWmj4u3AfSag3t2mFupHT59J58pknX+daqprjm8=
github.com/Azure/azure-sdk-for-go-extensions v0.1.4/go.mod h1:dJfn8QUzuvyO4hGZ8pkROwd7/VQzDG8ER2SRk+V0afY=
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.9.0 h1:fb8kj/Dh4CSwgsOzHeZY4Xh68cFVbzXx+ONXGMY//4w=
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.9.0/go.mod h1:uReU2sSxZExRPBAg3qKzmAucSi51+SP1OhohieR821Q=
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.3.1 h1:LNHhpdK7hzUcx/k1LIcuh5k7k1LGIWLQfCjaneSj7Fc=
Expand Down
29 changes: 22 additions & 7 deletions pkg/providers/instance/instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -452,14 +452,15 @@ func isSKUNotAvailable(err error) bool {
}

func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corecloudprovider.InstanceType, zone, capacityType string, err error) error {
if sdkerrors.SubscriptionQuotaHasBeenReached(err) {
// Subscription quota reached, mark the instance type as unavailable in all zones available to the offering
if sdkerrors.SKUFamilyQuotaHasBeenReached(err) {
// Subscription quota has been reached for this VM SKU, mark the instance type as unavailable in all zones available to the offering
// This will also update the TTL for an existing offering in the cache that is already unavailable

logging.FromContext(ctx).Error(err)
for _, offering := range instanceType.Offerings {
if offering.CapacityType != capacityType {
continue
}

// If we have a quota limit of 0 vcpus, we mark the offerings unavailable for an hour.
// CPU limits of 0 are usually due to a subscription having no allocated quota for that instance type at all on the subscription.
if cpuLimitIsZero(err) {
Expand All @@ -468,10 +469,9 @@ func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corec
p.unavailableOfferings.MarkUnavailable(ctx, SubscriptionQuotaReachedReason, instanceType.Name, offering.Zone, capacityType)
}
}
} else if sdkerrors.ZonalAllocationFailureOccurred(err) {
p.unavailableOfferings.MarkUnavailable(ctx, ZonalAllocationFailureReason, instanceType.Name, zone, corev1beta1.CapacityTypeOnDemand)
p.unavailableOfferings.MarkUnavailable(ctx, ZonalAllocationFailureReason, instanceType.Name, zone, corev1beta1.CapacityTypeSpot)
} else if isSKUNotAvailable(err) {
return fmt.Errorf("subscription level %s vCPU quota for %s has been reached (may try provision an alternative instance type)", capacityType, instanceType.Name)
}
if isSKUNotAvailable(err) {
// https://aka.ms/azureskunotavailable: either not available for a location or zone, or out of capacity for Spot.
// We only expect to observe the Spot case, not location or zone restrictions, because:
// - SKUs with location restriction are already filtered out via sku.HasLocationRestriction
Expand All @@ -489,6 +489,21 @@ func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corec
}
p.unavailableOfferings.MarkUnavailableWithTTL(ctx, SKUNotAvailableReason, instanceType.Name, offering.Zone, capacityType, skuNotAvailableTTL)
}

logging.FromContext(ctx).Error(err)
return fmt.Errorf("the requested SKU is unavailable for instance type %s in zone %s with capacity type %s, for more details please visit: https://aka.ms/azureskunotavailable", instanceType.Name, zone, capacityType)
}
if sdkerrors.ZonalAllocationFailureOccurred(err) {
logging.FromContext(ctx).With("zone", zone).Error(err)
p.unavailableOfferings.MarkUnavailable(ctx, ZonalAllocationFailureReason, instanceType.Name, zone, corev1beta1.CapacityTypeOnDemand)
p.unavailableOfferings.MarkUnavailable(ctx, ZonalAllocationFailureReason, instanceType.Name, zone, corev1beta1.CapacityTypeSpot)

return fmt.Errorf("unable to allocate resources in the selected zone (%s). (will try a different zone to fulfill your request)", zone)
}
if sdkerrors.RegionalQuotaHasBeenReached(err) {
logging.FromContext(ctx).Error(err)
// InsufficientCapacityError is appropriate here because trying any other instance type will not help
return corecloudprovider.NewInsufficientCapacityError(fmt.Errorf("regional %s vCPU quota limit for subscription has been reached. To scale beyond this limit, please review the quota increase process here: https://learn.microsoft.com/en-us/azure/quotas/regional-quota-requests", capacityType))
}
return err
}
Expand Down
83 changes: 81 additions & 2 deletions pkg/providers/instancetype/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,12 @@ limitations under the License.
package instancetype_test

import (
"bytes"
"context"
"encoding/base64"
"fmt"
"io"
"net/http"
"os"
"strings"
"testing"
Expand Down Expand Up @@ -48,7 +51,9 @@ import (
coretest "github.com/aws/karpenter-core/pkg/test"
. "github.com/aws/karpenter-core/pkg/test/expectations"

sdkerrors "github.com/Azure/azure-sdk-for-go-extensions/pkg/errors"
"github.com/Azure/azure-sdk-for-go/sdk/azcore"

"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute"
"github.com/Azure/karpenter/pkg/apis"
"github.com/Azure/karpenter/pkg/apis/settings"
Expand Down Expand Up @@ -80,7 +85,6 @@ func TestAzure(t *testing.T) {

var _ = BeforeSuite(func() {
ctx = coreoptions.ToContext(ctx, coretest.Options())
// ctx = options.ToContext(ctx, test.Options())
ctx = settings.ToContext(ctx, test.Settings())

env = coretest.NewEnvironment(scheme.Scheme, coretest.WithCRDs(apis.CRDs...))
Expand Down Expand Up @@ -133,6 +137,77 @@ var _ = Describe("InstanceType Provider", func() {
ExpectCleanedUp(ctx, env.Client)
})

Context("subscription level quota error responses", func() {
// Scenario: a VM create-or-update call fails with OperationNotAllowed whose message reports that the
// SKU family ("standardDLSv5Family") cores quota would be exceeded (Current Limit: 100, Current Usage: 96).
// The first pod is expected to stay unscheduled; a second pod provisioned afterwards is expected to schedule.
It("should fail to provision when VM SKU family vCPU quota exceeded error is returned, and succeed when it is gone", func() {
// Error message placed in the response body; it is passed verbatim to createSDKErrorBody below.
familyVCPUQuotaExceededErrorMessage := "Operation could not be completed as it results in exceeding approved standardDLSv5Family Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 100, Current Usage: 96, Additional Required: 32, (Minimum) New Limit Required: 128. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22standardDLSv5Family%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:128,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22standardDLSv5Family%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests"
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
// Register the error on the VirtualMachineCreateOrUpdateBehavior of azureEnv.VirtualMachinesAPI.
azureEnv.VirtualMachinesAPI.VirtualMachinesBehavior.VirtualMachineCreateOrUpdateBehavior.Error.Set(
&azcore.ResponseError{
ErrorCode: sdkerrors.OperationNotAllowed,
RawResponse: &http.Response{
Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, familyVCPUQuotaExceededErrorMessage),
},
},
)
// First attempt: the pod must remain unscheduled.
pod := coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectNotScheduled(ctx, env.Client, pod)
// NOTE(review): the error above was set through ...VirtualMachineCreateOrUpdateBehavior.Error, while this
// line clears ...VirtualMachineCreateOrUpdateBehavior.BeginError — confirm that this is the intended field.
azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil)
// Second attempt with a fresh pod: it must be scheduled.
pod = coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectScheduled(ctx, env.Client, pod)
})
// Scenario: same flow as the quota-exceeded case, but the error message reports a SKU family cores quota
// of zero (Current Limit: 0, Current Usage: 0). The first pod is expected to stay unscheduled; a second
// pod provisioned afterwards is expected to schedule.
It("should fail to provision when VM SKU family vCPU quota limit is zero, and succeed when its gone", func() {
// Error message placed in the response body; it is passed verbatim to createSDKErrorBody below.
familyVCPUQuotaIsZeroErrorMessage := "Operation could not be completed as it results in exceeding approved standardDLSv5Family Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 0, Current Usage: 0, Additional Required: 32, (Minimum) New Limit Required: 32. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22standardDLSv5Family%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:128,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22standardDLSv5Family%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests"
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
// Register the error on the VirtualMachineCreateOrUpdateBehavior of azureEnv.VirtualMachinesAPI.
azureEnv.VirtualMachinesAPI.VirtualMachinesBehavior.VirtualMachineCreateOrUpdateBehavior.Error.Set(
&azcore.ResponseError{
ErrorCode: sdkerrors.OperationNotAllowed,
RawResponse: &http.Response{
Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, familyVCPUQuotaIsZeroErrorMessage),
},
},
)
// First attempt: the pod must remain unscheduled.
pod := coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectNotScheduled(ctx, env.Client, pod)
// NOTE(review): the error above was set through ...VirtualMachineCreateOrUpdateBehavior.Error, while this
// line clears ...VirtualMachineCreateOrUpdateBehavior.BeginError — confirm that this is the intended field.
azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil)
// Second attempt with a fresh pod: it must be scheduled.
pod = coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectScheduled(ctx, env.Client, pod)
})

// Scenario: a VM create-or-update call fails with OperationNotAllowed whose message reports that the
// "Total Regional Cores" quota would be exceeded. cloudProvider.Create is called directly with a NodeClaim
// and must return an insufficient-capacity error together with a nil claim.
It("should return ICE if Total Regional Cores Quota errors are hit", func() {
// Error message placed in the response body; it is passed verbatim to createSDKErrorBody below.
regionalVCPUQuotaExceededErrorMessage := "Operation could not be completed as it results in exceeding approved Total Regional Cores quota. Additional details - Deployment Model: Resource Manager, Location: uksouth, Current Limit: 100, Current Usage: 100, Additional Required: 64, (Minimum) New Limit Required: 164. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22uksouth%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22cores%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:164,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22cores%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/regional-quota-requests"
// Register the error on the VirtualMachineCreateOrUpdateBehavior of azureEnv.VirtualMachinesAPI.
azureEnv.VirtualMachinesAPI.VirtualMachinesBehavior.VirtualMachineCreateOrUpdateBehavior.Error.Set(
&azcore.ResponseError{
ErrorCode: sdkerrors.OperationNotAllowed,
RawResponse: &http.Response{
Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, regionalVCPUQuotaExceededErrorMessage),
},
},
)

ExpectApplied(ctx, env.Client, nodePool, nodeClass)
// Build a NodeClaim labelled with the node pool name and referencing the node class by name.
nodeClaim := coretest.NodeClaim(corev1beta1.NodeClaim{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
corev1beta1.NodePoolLabelKey: nodePool.Name,
},
},
Spec: corev1beta1.NodeClaimSpec{
NodeClassRef: &corev1beta1.NodeClassReference{
Name: nodeClass.Name,
},
},
})
// Create must fail with an insufficient-capacity error and must not return a claim.
claim, err := cloudProvider.Create(ctx, nodeClaim)
Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue())
Expect(claim).To(BeNil())

})
})

Context("Filtering in InstanceType Provider List", func() {
var instanceTypes corecloudprovider.InstanceTypes
var err error
Expand Down Expand Up @@ -230,7 +305,7 @@ var _ = Describe("InstanceType Provider", func() {
Name: nodeClass.Name,
}

ExpectApplied(ctx, env.Client, nodePool, nodeClass)
ExpectApplied(ctx, env.Client, np, nodeClass)
pod := coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectScheduled(ctx, env.Client, pod)
Expand Down Expand Up @@ -946,3 +1021,7 @@ var _ = Describe("Tax Calculator", func() {
})
})
})

// createSDKErrorBody returns a readable, no-op-closing body containing a JSON error
// envelope of the form {"error":{"code": "<code>", "message": "<message>"}}.
//
// code and message are interpolated verbatim (no JSON escaping is applied), so callers
// must pass values that are already safe to embed inside a JSON string literal.
func createSDKErrorBody(code, message string) io.ReadCloser {
	const envelope = `{"error":{"code": "%s", "message": "%s"}}`
	payload := fmt.Appendf(nil, envelope, code, message)
	return io.NopCloser(bytes.NewReader(payload))
}

0 comments on commit 20da22d

Please sign in to comment.