Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: supporting additional handing for TotalRegionalCores quota error, and adding better customer facing error messages on nodeclaim #45

Merged
merged 16 commits into from
Dec 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ go 1.21
require (
github.com/Azure/azure-kusto-go v0.14.0
github.com/Azure/azure-sdk-for-go v68.0.0+incompatible
github.com/Azure/azure-sdk-for-go-extensions v0.1.3
github.com/Azure/azure-sdk-for-go-extensions v0.1.4
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.9.0
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.3.1
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute v1.0.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ github.com/Azure/azure-kusto-go v0.14.0 h1:5XVmjh5kVgsm2scpsWisJ6Q1ZgWHJcIOPCZC1
github.com/Azure/azure-kusto-go v0.14.0/go.mod h1:wSmXIsQwBVPHDNsSQsX98nuc12VyvxoNHQa2q9t1Ce0=
github.com/Azure/azure-sdk-for-go v68.0.0+incompatible h1:fcYLmCpyNYRnvJbPerq7U0hS+6+I79yEDJBqVNcqUzU=
github.com/Azure/azure-sdk-for-go v68.0.0+incompatible/go.mod h1:9XXNKU+eRnpl9moKnB4QOLf1HestfXbmab5FXxiDBjc=
github.com/Azure/azure-sdk-for-go-extensions v0.1.3 h1:hDHVEvhqVufFPmD5b+xRjtGXSQTMEia7S/xntltnG44=
github.com/Azure/azure-sdk-for-go-extensions v0.1.3/go.mod h1:dJfn8QUzuvyO4hGZ8pkROwd7/VQzDG8ER2SRk+V0afY=
github.com/Azure/azure-sdk-for-go-extensions v0.1.4 h1:XNT7IWmj4u3AfSag3t2mFupHT59J58pknX+daqprjm8=
github.com/Azure/azure-sdk-for-go-extensions v0.1.4/go.mod h1:dJfn8QUzuvyO4hGZ8pkROwd7/VQzDG8ER2SRk+V0afY=
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.9.0 h1:fb8kj/Dh4CSwgsOzHeZY4Xh68cFVbzXx+ONXGMY//4w=
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.9.0/go.mod h1:uReU2sSxZExRPBAg3qKzmAucSi51+SP1OhohieR821Q=
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.3.1 h1:LNHhpdK7hzUcx/k1LIcuh5k7k1LGIWLQfCjaneSj7Fc=
Expand Down
29 changes: 22 additions & 7 deletions pkg/providers/instance/instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -452,14 +452,15 @@ func isSKUNotAvailable(err error) bool {
}

func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corecloudprovider.InstanceType, zone, capacityType string, err error) error {
if sdkerrors.SubscriptionQuotaHasBeenReached(err) {
// Subscription quota reached, mark the instance type as unavailable in all zones available to the offering
if sdkerrors.SKUFamilyQuotaHasBeenReached(err) {
// Subscription quota has been reached for this VM SKU, mark the instance type as unavailable in all zones available to the offering
// This will also update the TTL for an existing offering in the cache that is already unavailable

logging.FromContext(ctx).Error(err)
for _, offering := range instanceType.Offerings {
if offering.CapacityType != capacityType {
continue
}

// If we have a quota limit of 0 vcpus, we mark the offerings unavailable for an hour.
// CPU limits of 0 are usually due to a subscription having no allocated quota for that instance type at all on the subscription.
if cpuLimitIsZero(err) {
Expand All @@ -468,10 +469,9 @@ func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corec
p.unavailableOfferings.MarkUnavailable(ctx, SubscriptionQuotaReachedReason, instanceType.Name, offering.Zone, capacityType)
}
}
} else if sdkerrors.ZonalAllocationFailureOccurred(err) {
p.unavailableOfferings.MarkUnavailable(ctx, ZonalAllocationFailureReason, instanceType.Name, zone, corev1beta1.CapacityTypeOnDemand)
p.unavailableOfferings.MarkUnavailable(ctx, ZonalAllocationFailureReason, instanceType.Name, zone, corev1beta1.CapacityTypeSpot)
} else if isSKUNotAvailable(err) {
return fmt.Errorf("subscription level %s vCPU quota for %s has been reached (may try provision an alternative instance type)", capacityType, instanceType.Name)
}
if isSKUNotAvailable(err) {
Bryce-Soghigian marked this conversation as resolved.
Show resolved Hide resolved
// https://aka.ms/azureskunotavailable: either not available for a location or zone, or out of capacity for Spot.
// We only expect to observe the Spot case, not location or zone restrictions, because:
// - SKUs with location restriction are already filtered out via sku.HasLocationRestriction
Expand All @@ -489,6 +489,21 @@ func (p *Provider) handleResponseErrors(ctx context.Context, instanceType *corec
}
p.unavailableOfferings.MarkUnavailableWithTTL(ctx, SKUNotAvailableReason, instanceType.Name, offering.Zone, capacityType, skuNotAvailableTTL)
}

logging.FromContext(ctx).Error(err)
return fmt.Errorf("the requested SKU is unavailable for instance type %s in zone %s with capacity type %s, for more details please visit: https://aka.ms/azureskunotavailable", instanceType.Name, zone, capacityType)
}
if sdkerrors.ZonalAllocationFailureOccurred(err) {
logging.FromContext(ctx).With("zone", zone).Error(err)
p.unavailableOfferings.MarkUnavailable(ctx, ZonalAllocationFailureReason, instanceType.Name, zone, corev1beta1.CapacityTypeOnDemand)
p.unavailableOfferings.MarkUnavailable(ctx, ZonalAllocationFailureReason, instanceType.Name, zone, corev1beta1.CapacityTypeSpot)

return fmt.Errorf("unable to allocate resources in the selected zone (%s). (will try a different zone to fulfill your request)", zone)
}
if sdkerrors.RegionalQuotaHasBeenReached(err) {
logging.FromContext(ctx).Error(err)
// InsufficientCapacityError is appropriate here because trying any other instance type will not help
return corecloudprovider.NewInsufficientCapacityError(fmt.Errorf("regional %s vCPU quota limit for subscription has been reached. To scale beyond this limit, please review the quota increase process here: https://learn.microsoft.com/en-us/azure/quotas/regional-quota-requests", capacityType))
}
return err
}
Expand Down
83 changes: 81 additions & 2 deletions pkg/providers/instancetype/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,12 @@ limitations under the License.
package instancetype_test

import (
"bytes"
"context"
"encoding/base64"
"fmt"
"io"
"net/http"
"os"
"strings"
"testing"
Expand Down Expand Up @@ -48,7 +51,9 @@ import (
coretest "github.com/aws/karpenter-core/pkg/test"
. "github.com/aws/karpenter-core/pkg/test/expectations"

sdkerrors "github.com/Azure/azure-sdk-for-go-extensions/pkg/errors"
"github.com/Azure/azure-sdk-for-go/sdk/azcore"

"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute"
"github.com/Azure/karpenter/pkg/apis"
"github.com/Azure/karpenter/pkg/apis/settings"
Expand Down Expand Up @@ -80,7 +85,6 @@ func TestAzure(t *testing.T) {

var _ = BeforeSuite(func() {
ctx = coreoptions.ToContext(ctx, coretest.Options())
// ctx = options.ToContext(ctx, test.Options())
ctx = settings.ToContext(ctx, test.Settings())

env = coretest.NewEnvironment(scheme.Scheme, coretest.WithCRDs(apis.CRDs...))
Expand Down Expand Up @@ -133,6 +137,77 @@ var _ = Describe("InstanceType Provider", func() {
ExpectCleanedUp(ctx, env.Client)
})

Context("subscription level quota error responses", func() {
It("should fail to provision when VM SKU family vCPU quota exceeded error is returned, and succeed when it is gone", func() {
familyVCPUQuotaExceededErrorMessage := "Operation could not be completed as it results in exceeding approved standardDLSv5Family Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 100, Current Usage: 96, Additional Required: 32, (Minimum) New Limit Required: 128. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22standardDLSv5Family%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:128,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22standardDLSv5Family%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests"
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
azureEnv.VirtualMachinesAPI.VirtualMachinesBehavior.VirtualMachineCreateOrUpdateBehavior.Error.Set(
&azcore.ResponseError{
ErrorCode: sdkerrors.OperationNotAllowed,
RawResponse: &http.Response{
Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, familyVCPUQuotaExceededErrorMessage),
},
},
)
pod := coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectNotScheduled(ctx, env.Client, pod)
azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil)
pod = coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectScheduled(ctx, env.Client, pod)
})
It("should fail to provision when VM SKU family vCPU quota limit is zero, and succeed when its gone", func() {
familyVCPUQuotaIsZeroErrorMessage := "Operation could not be completed as it results in exceeding approved standardDLSv5Family Cores quota. Additional details - Deployment Model: Resource Manager, Location: westus2, Current Limit: 0, Current Usage: 0, Additional Required: 32, (Minimum) New Limit Required: 32. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22westus2%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22standardDLSv5Family%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:128,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22standardDLSv5Family%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/per-vm-quota-requests"
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
azureEnv.VirtualMachinesAPI.VirtualMachinesBehavior.VirtualMachineCreateOrUpdateBehavior.Error.Set(
&azcore.ResponseError{
ErrorCode: sdkerrors.OperationNotAllowed,
RawResponse: &http.Response{
Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, familyVCPUQuotaIsZeroErrorMessage),
},
},
)
pod := coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectNotScheduled(ctx, env.Client, pod)
azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.BeginError.Set(nil)
pod = coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectScheduled(ctx, env.Client, pod)
})

It("should return ICE if Total Regional Cores Quota errors are hit", func() {
regionalVCPUQuotaExceededErrorMessage := "Operation could not be completed as it results in exceeding approved Total Regional Cores quota. Additional details - Deployment Model: Resource Manager, Location: uksouth, Current Limit: 100, Current Usage: 100, Additional Required: 64, (Minimum) New Limit Required: 164. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/#blade/Microsoft_Azure_Capacity/UsageAndQuota.ReactView/Parameters/%7B%22subscriptionId%22:%(redacted)%22,%22command%22:%22openQuotaApprovalBlade%22,%22quotas%22:[%7B%22location%22:%22uksouth%22,%22providerId%22:%22Microsoft.Compute%22,%22resourceName%22:%22cores%22,%22quotaRequest%22:%7B%22properties%22:%7B%22limit%22:164,%22unit%22:%22Count%22,%22name%22:%7B%22value%22:%22cores%22%7D%7D%7D%7D]%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/regional-quota-requests"
azureEnv.VirtualMachinesAPI.VirtualMachinesBehavior.VirtualMachineCreateOrUpdateBehavior.Error.Set(
&azcore.ResponseError{
ErrorCode: sdkerrors.OperationNotAllowed,
RawResponse: &http.Response{
Body: createSDKErrorBody(sdkerrors.OperationNotAllowed, regionalVCPUQuotaExceededErrorMessage),
},
},
)

ExpectApplied(ctx, env.Client, nodePool, nodeClass)
nodeClaim := coretest.NodeClaim(corev1beta1.NodeClaim{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
corev1beta1.NodePoolLabelKey: nodePool.Name,
},
},
Spec: corev1beta1.NodeClaimSpec{
NodeClassRef: &corev1beta1.NodeClassReference{
Name: nodeClass.Name,
},
},
})
claim, err := cloudProvider.Create(ctx, nodeClaim)
Expect(corecloudprovider.IsInsufficientCapacityError(err)).To(BeTrue())
Expect(claim).To(BeNil())

})
})

Context("Filtering in InstanceType Provider List", func() {
var instanceTypes corecloudprovider.InstanceTypes
var err error
Expand Down Expand Up @@ -230,7 +305,7 @@ var _ = Describe("InstanceType Provider", func() {
Name: nodeClass.Name,
}

ExpectApplied(ctx, env.Client, nodePool, nodeClass)
ExpectApplied(ctx, env.Client, np, nodeClass)
pod := coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectScheduled(ctx, env.Client, pod)
Expand Down Expand Up @@ -946,3 +1021,7 @@ var _ = Describe("Tax Calculator", func() {
})
})
})

func createSDKErrorBody(code, message string) io.ReadCloser {
return io.NopCloser(bytes.NewReader([]byte(fmt.Sprintf(`{"error":{"code": "%s", "message": "%s"}}`, code, message))))
}