Skip to content

Commit

Permalink
Merge branch 'main' into tallaxes/devcontainer-go
Browse files Browse the repository at this point in the history
  • Loading branch information
tallaxes authored Dec 1, 2024
2 parents a5b6390 + 17656a0 commit 25a10e3
Show file tree
Hide file tree
Showing 16 changed files with 232 additions and 146 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ repos:
hooks:
- id: shellcheck
- repo: https://github.com/crate-ci/typos
rev: v1.26.0
rev: v1.28.1
hooks:
- id: typos
args: [--write-changes, --force-exclude, --exclude, go.mod]
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ require (
go.uber.org/multierr v1.11.0
go.uber.org/zap v1.27.0
golang.org/x/sync v0.8.0
gopkg.in/yaml.v2 v2.4.0
k8s.io/api v0.30.3
k8s.io/apiextensions-apiserver v0.30.3
k8s.io/apimachinery v0.30.3
Expand Down Expand Up @@ -158,7 +159,6 @@ require (
google.golang.org/protobuf v1.34.2 // indirect
gopkg.in/dnaeon/go-vcr.v3 v3.2.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/cloud-provider v0.30.3 // indirect
k8s.io/component-base v0.30.3 // indirect
Expand Down
8 changes: 4 additions & 4 deletions hack/toolchain.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,17 @@ main() {

tools() {
go install github.com/google/[email protected]
go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.61.0
go install github.com/google/ko@v0.16.0
go install github.com/mikefarah/yq/[email protected].3
go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.62.2
go install github.com/google/ko@v0.17.1
go install github.com/mikefarah/yq/[email protected].5
go install github.com/norwoodj/helm-docs/cmd/[email protected]
go install sigs.k8s.io/controller-runtime/tools/setup-envtest@latest
go install sigs.k8s.io/controller-tools/cmd/controller-gen@latest
go install github.com/sigstore/cosign/v2/cmd/[email protected]
# go install -tags extended github.com/gohugoio/[email protected]
go install golang.org/x/vuln/cmd/[email protected]
go install github.com/onsi/ginkgo/v2/ginkgo@latest
go install github.com/rhysd/actionlint/cmd/[email protected].3
go install github.com/rhysd/actionlint/cmd/[email protected].4
go install github.com/mattn/[email protected]
go install github.com/google/go-containerregistry/cmd/[email protected]
go install github.com/go-swagger/go-swagger/cmd/[email protected]
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/nodeclass/hash/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ func (c *Controller) updateNodeClaimHash(ctx context.Context, nodeClass *v1alpha
v1alpha2.AnnotationAKSNodeClassHashVersion: v1alpha2.AKSNodeClassHashVersion,
})

// Any NodeClaim that is already drifted will remain drifted if the karpenter.k8s.aws/nodepool-hash-version doesn't match
// Any NodeClaim that is already drifted will remain drifted if the karpenter.azure.com/nodepool-hash-version doesn't match
// Since the hashing mechanism has changed we will not be able to determine if the drifted status of the NodeClaim has changed
if nc.StatusConditions().Get(karpv1.ConditionTypeDrifted) == nil {
nc.Annotations = lo.Assign(nc.Annotations, map[string]string{
Expand Down
6 changes: 3 additions & 3 deletions pkg/providers/imagefamily/azlinux.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,9 @@ func (u AzureLinux) ScriptlessCustomData(kubeletConfig *bootstrap.KubeletConfigu
CABundle: caBundle,
GPUNode: u.Options.GPUNode,
GPUDriverVersion: u.Options.GPUDriverVersion,
// GPUImageSHA: u.Options.GPUImageSHA - GPU image SHA only applies to Ubuntu
// See: https://github.com/Azure/AgentBaker/blob/f393d6e4d689d9204d6000c85623ad9b764e2a29/vhdbuilder/packer/install-dependencies.sh#L201
SubnetID: u.Options.SubnetID,
GPUDriverType: u.Options.GPUDriverType,
GPUImageSHA: u.Options.GPUImageSHA,
SubnetID: u.Options.SubnetID,
},
Arch: u.Options.Arch,
TenantID: u.Options.TenantID,
Expand Down
2 changes: 2 additions & 0 deletions pkg/providers/imagefamily/bootstrap/aksbootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ type NodeBootstrapVariables struct {
SwapFileSizeMB int // t user input
GPUImageSHA string // s static sha rarely updated
GPUDriverVersion string // k determine by OS + GPU hardware requirements; can be determined automatically, but hard. suggest using GPU operator.
GPUDriverType string // k
GPUInstanceProfile string // t user-specified
CustomSearchDomainName string // c user-specified [presumably cluster-level]
CustomSearchRealmUser string // c user-specified [presumably cluster-level]
Expand Down Expand Up @@ -467,6 +468,7 @@ func (a AKS) applyOptions(nbv *NodeBootstrapVariables) {
nbv.GPUNode = true
nbv.ConfigGPUDriverIfNeeded = true
nbv.GPUDriverVersion = a.GPUDriverVersion
nbv.GPUDriverType = a.GPUDriverType
nbv.GPUImageSHA = a.GPUImageSHA
}

Expand Down
1 change: 1 addition & 0 deletions pkg/providers/imagefamily/bootstrap/bootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ type Options struct {
CABundle *string
GPUNode bool
GPUDriverVersion string
GPUDriverType string
GPUImageSHA string
SubnetID string
}
Expand Down
1 change: 1 addition & 0 deletions pkg/providers/imagefamily/bootstrap/cse_cmd.sh.gtpl
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ KUBELET_CONFIG_FILE_CONTENT="{{.KubeletConfigFileContent}}"
SWAP_FILE_SIZE_MB="{{.SwapFileSizeMB}}"
GPU_IMAGE_SHA="{{.GPUImageSHA}}"
GPU_DRIVER_VERSION="{{.GPUDriverVersion}}"
GPU_DRIVER_TYPE="{{.GPUDriverType}}"
GPU_INSTANCE_PROFILE="{{.GPUInstanceProfile}}"
CUSTOM_SEARCH_DOMAIN_NAME="{{.CustomSearchDomainName}}"
CUSTOM_SEARCH_REALM_USER="{{.CustomSearchRealmUser}}"
Expand Down
1 change: 1 addition & 0 deletions pkg/providers/imagefamily/ubuntu_2204.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ func (u Ubuntu2204) ScriptlessCustomData(kubeletConfig *bootstrap.KubeletConfigu
GPUNode: u.Options.GPUNode,
GPUDriverVersion: u.Options.GPUDriverVersion,
GPUImageSHA: u.Options.GPUImageSHA,
GPUDriverType: u.Options.GPUDriverType,
SubnetID: u.Options.SubnetID,
},
Arch: u.Options.Arch,
Expand Down
51 changes: 26 additions & 25 deletions pkg/providers/instancetype/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -604,12 +604,10 @@ var _ = Describe("InstanceType Provider", func() {
nodes := &v1.NodeList{}
Expect(env.Client.List(ctx, nodes)).To(Succeed())
for _, node := range nodes.Items {
Expect(node.Labels["karpenter.k8s.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region)))
Expect(node.Labels["karpenter.kubernetes.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region)))
Expect(node.Labels["node.kubernetes.io/instance-type"]).To(Equal("Standard_D2_v2"))

}
}

})

DescribeTable("Should not return unavailable offerings", func(azEnv *test.Environment) {
Expand Down Expand Up @@ -655,7 +653,7 @@ var _ = Describe("InstanceType Provider", func() {
}}}
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
node := ExpectScheduled(ctx, env.Client, pod)
Expect(node.Labels["karpenter.k8s.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region)))
Expect(node.Labels["karpenter.kubernetes.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region)))
Expect(node.Labels["node.kubernetes.io/instance-type"]).To(Equal("Standard_D2_v2"))
})
It("should launch smaller instances than optimal if larger instance launch results in Insufficient Capacity Error", func() {
Expand Down Expand Up @@ -1011,20 +1009,15 @@ var _ = Describe("InstanceType Provider", func() {
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pod := coretest.UnschedulablePod(coretest.PodOptions{})
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectScheduled(ctx, env.Client, pod)
node := ExpectScheduled(ctx, env.Client, pod)

Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1))
vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM
Expect(vm.Properties).ToNot(BeNil())
Expect(vm.Properties.HardwareProfile).ToNot(BeNil())
Expect(utils.IsNvidiaEnabledSKU(string(*vm.Properties.HardwareProfile.VMSize))).To(BeFalse())

clusterNodes := cluster.Nodes()
node := clusterNodes[0]
if node.Name() == pod.Spec.NodeName {
nodeLabels := node.Labels()
Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-count", "0"))
}
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-count", "0"))
})

It("should schedule GPU pod on GPU capable node", func() {
Expand Down Expand Up @@ -1054,23 +1047,31 @@ var _ = Describe("InstanceType Provider", func() {
})

ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectScheduled(ctx, env.Client, pod)

// Verify that the node has the GPU label set that the pod was scheduled on
clusterNodes := cluster.Nodes()
Expect(clusterNodes).ToNot(BeEmpty())
Expect(len(clusterNodes)).To(Equal(1))
node := clusterNodes[0]
Expect(node.Node.Status.Allocatable).To(HaveKeyWithValue(v1.ResourceName("nvidia.com/gpu"), resource.MustParse("1")))
node := ExpectScheduled(ctx, env.Client, pod)

if node.Name() == pod.Spec.NodeName {
nodeLabels := node.Labels()
// the following checks assume Standard_NC16as_T4_v3 (surprisingly the cheapest GPU in the test set), so test the assumption
Expect(node.Labels).To(HaveKeyWithValue("node.kubernetes.io/instance-type", "Standard_NC16as_T4_v3"))

Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-name", "A100"))
Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-manufacturer", v1alpha2.ManufacturerNvidia))
Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-count", "1"))
// Verify GPU related settings in bootstrap (assuming one Standard_NC16as_T4_v3)
customData := ExpectDecodedCustomData(azureEnv)
Expect(customData).To(SatisfyAll(
ContainSubstring("GPU_NODE=true"),
ContainSubstring("SGX_NODE=false"),
ContainSubstring("MIG_NODE=false"),
ContainSubstring("CONFIG_GPU_DRIVER_IF_NEEDED=true"),
ContainSubstring("ENABLE_GPU_DEVICE_PLUGIN_IF_NEEDED=false"),
ContainSubstring("GPU_DRIVER_TYPE=\"cuda\""),
ContainSubstring(fmt.Sprintf("GPU_DRIVER_VERSION=\"%s\"", utils.NvidiaCudaDriverVersion)),
ContainSubstring(fmt.Sprintf("GPU_IMAGE_SHA=\"%s\"", utils.AKSGPUCudaVersionSuffix)),
ContainSubstring("GPU_NEEDS_FABRIC_MANAGER=\"false\""),
ContainSubstring("GPU_INSTANCE_PROFILE=\"\""),
))

}
// Verify that the node the pod was scheduled on has GPU resource and labels set
Expect(node.Status.Allocatable).To(HaveKeyWithValue(v1.ResourceName("nvidia.com/gpu"), resource.MustParse("1")))
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-name", "T4"))
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-manufacturer", v1alpha2.ManufacturerNvidia))
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-count", "1"))
})
})

Expand Down
1 change: 1 addition & 0 deletions pkg/providers/launchtemplate/launchtemplate.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ func (p *Provider) getStaticParameters(ctx context.Context, instanceType *cloudp
Arch: arch,
GPUNode: utils.IsNvidiaEnabledSKU(instanceType.Name),
GPUDriverVersion: utils.GetGPUDriverVersion(instanceType.Name),
GPUDriverType: utils.GetGPUDriverType(instanceType.Name),
GPUImageSHA: utils.GetAKSGPUImageSHA(instanceType.Name),
TenantID: p.tenantID,
SubscriptionID: p.subscriptionID,
Expand Down
1 change: 1 addition & 0 deletions pkg/providers/launchtemplate/parameters/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ type StaticParameters struct {
Arch string
GPUNode bool
GPUDriverVersion string
GPUDriverType string
GPUImageSHA string
TenantID string
SubscriptionID string
Expand Down
Loading

0 comments on commit 25a10e3

Please sign in to comment.