diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b35a6dc26..90b22c55d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,7 +12,7 @@ repos: hooks: - id: shellcheck - repo: https://github.com/crate-ci/typos - rev: v1.26.0 + rev: v1.28.1 hooks: - id: typos args: [--write-changes, --force-exclude, --exclude, go.mod] diff --git a/go.mod b/go.mod index d7bfc10fa..ff02c6da7 100644 --- a/go.mod +++ b/go.mod @@ -40,6 +40,7 @@ require ( go.uber.org/multierr v1.11.0 go.uber.org/zap v1.27.0 golang.org/x/sync v0.8.0 + gopkg.in/yaml.v2 v2.4.0 k8s.io/api v0.30.3 k8s.io/apiextensions-apiserver v0.30.3 k8s.io/apimachinery v0.30.3 @@ -158,7 +159,6 @@ require ( google.golang.org/protobuf v1.34.2 // indirect gopkg.in/dnaeon/go-vcr.v3 v3.2.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect - gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/cloud-provider v0.30.3 // indirect k8s.io/component-base v0.30.3 // indirect diff --git a/pkg/controllers/nodeclass/hash/controller.go b/pkg/controllers/nodeclass/hash/controller.go index 8948e0a29..c9bb64d0b 100644 --- a/pkg/controllers/nodeclass/hash/controller.go +++ b/pkg/controllers/nodeclass/hash/controller.go @@ -100,7 +100,7 @@ func (c *Controller) updateNodeClaimHash(ctx context.Context, nodeClass *v1alpha v1alpha2.AnnotationAKSNodeClassHashVersion: v1alpha2.AKSNodeClassHashVersion, }) - // Any NodeClaim that is already drifted will remain drifted if the karpenter.k8s.aws/nodepool-hash-version doesn't match + // Any NodeClaim that is already drifted will remain drifted if the karpenter.azure.com/nodepool-hash-version doesn't match // Since the hashing mechanism has changed we will not be able to determine if the drifted status of the NodeClaim has changed if nc.StatusConditions().Get(karpv1.ConditionTypeDrifted) == nil { nc.Annotations = lo.Assign(nc.Annotations, map[string]string{ diff --git a/pkg/providers/imagefamily/azlinux.go b/pkg/providers/imagefamily/azlinux.go index 13be8fe07..6f4979f8d 100644 --- a/pkg/providers/imagefamily/azlinux.go +++ b/pkg/providers/imagefamily/azlinux.go @@ -88,9 +88,9 @@ func (u AzureLinux) ScriptlessCustomData(kubeletConfig *bootstrap.KubeletConfigu CABundle: caBundle, GPUNode: u.Options.GPUNode, GPUDriverVersion: u.Options.GPUDriverVersion, - // GPUImageSHA: u.Options.GPUImageSHA - GPU image SHA only applies to Ubuntu - // See: https://github.com/Azure/AgentBaker/blob/f393d6e4d689d9204d6000c85623ad9b764e2a29/vhdbuilder/packer/install-dependencies.sh#L201 - SubnetID: u.Options.SubnetID, + GPUDriverType: u.Options.GPUDriverType, + GPUImageSHA: u.Options.GPUImageSHA, + SubnetID: u.Options.SubnetID, }, Arch: u.Options.Arch, TenantID: u.Options.TenantID, diff --git a/pkg/providers/imagefamily/bootstrap/aksbootstrap.go b/pkg/providers/imagefamily/bootstrap/aksbootstrap.go index e72f69ac7..93697e7b9 100644 --- a/pkg/providers/imagefamily/bootstrap/aksbootstrap.go +++ b/pkg/providers/imagefamily/bootstrap/aksbootstrap.go @@ -204,6 +204,7 @@ type NodeBootstrapVariables struct { SwapFileSizeMB int // t user input GPUImageSHA string // s static sha rarely updated GPUDriverVersion string // k determine by OS + GPU hardware requirements; can be determined automatically, but hard. suggest using GPU operator. + GPUDriverType string // k GPUInstanceProfile string // t user-specified CustomSearchDomainName string // c user-specified [presumably cluster-level] CustomSearchRealmUser string // c user-specified [presumably cluster-level] @@ -467,6 +468,7 @@ func (a AKS) applyOptions(nbv *NodeBootstrapVariables) { nbv.GPUNode = true nbv.ConfigGPUDriverIfNeeded = true nbv.GPUDriverVersion = a.GPUDriverVersion + nbv.GPUDriverType = a.GPUDriverType nbv.GPUImageSHA = a.GPUImageSHA } diff --git a/pkg/providers/imagefamily/bootstrap/bootstrap.go b/pkg/providers/imagefamily/bootstrap/bootstrap.go index 678475fcd..850549851 100644 --- a/pkg/providers/imagefamily/bootstrap/bootstrap.go +++ b/pkg/providers/imagefamily/bootstrap/bootstrap.go @@ -52,6 +52,7 @@ type Options struct { CABundle *string GPUNode bool GPUDriverVersion string + GPUDriverType string GPUImageSHA string SubnetID string } diff --git a/pkg/providers/imagefamily/bootstrap/cse_cmd.sh.gtpl b/pkg/providers/imagefamily/bootstrap/cse_cmd.sh.gtpl index 00e032f24..a480c88fc 100644 --- a/pkg/providers/imagefamily/bootstrap/cse_cmd.sh.gtpl +++ b/pkg/providers/imagefamily/bootstrap/cse_cmd.sh.gtpl @@ -128,6 +128,7 @@ KUBELET_CONFIG_FILE_CONTENT="{{.KubeletConfigFileContent}}" SWAP_FILE_SIZE_MB="{{.SwapFileSizeMB}}" GPU_IMAGE_SHA="{{.GPUImageSHA}}" GPU_DRIVER_VERSION="{{.GPUDriverVersion}}" +GPU_DRIVER_TYPE="{{.GPUDriverType}}" GPU_INSTANCE_PROFILE="{{.GPUInstanceProfile}}" CUSTOM_SEARCH_DOMAIN_NAME="{{.CustomSearchDomainName}}" CUSTOM_SEARCH_REALM_USER="{{.CustomSearchRealmUser}}" diff --git a/pkg/providers/imagefamily/ubuntu_2204.go b/pkg/providers/imagefamily/ubuntu_2204.go index 21d3b8c03..54806cba2 100644 --- a/pkg/providers/imagefamily/ubuntu_2204.go +++ b/pkg/providers/imagefamily/ubuntu_2204.go @@ -89,6 +89,7 @@ func (u Ubuntu2204) ScriptlessCustomData(kubeletConfig *bootstrap.KubeletConfigu GPUNode: u.Options.GPUNode, GPUDriverVersion: u.Options.GPUDriverVersion, GPUImageSHA: u.Options.GPUImageSHA, + GPUDriverType: u.Options.GPUDriverType, SubnetID: u.Options.SubnetID, }, Arch: u.Options.Arch, diff --git a/pkg/providers/instancetype/suite_test.go b/pkg/providers/instancetype/suite_test.go index c0e30f560..ccb5a66d3 100644 --- a/pkg/providers/instancetype/suite_test.go +++ b/pkg/providers/instancetype/suite_test.go @@ -604,12 +604,10 @@ var _ = Describe("InstanceType Provider", func() { nodes := &v1.NodeList{} Expect(env.Client.List(ctx, nodes)).To(Succeed()) for _, node := range nodes.Items { - Expect(node.Labels["karpenter.k8s.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region))) + Expect(node.Labels["karpenter.kubernetes.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region))) Expect(node.Labels["node.kubernetes.io/instance-type"]).To(Equal("Standard_D2_v2")) - } } - }) DescribeTable("Should not return unavailable offerings", func(azEnv *test.Environment) { @@ -655,7 +653,7 @@ var _ = Describe("InstanceType Provider", func() { }}} ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels["karpenter.k8s.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region))) + Expect(node.Labels["karpenter.kubernetes.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region))) Expect(node.Labels["node.kubernetes.io/instance-type"]).To(Equal("Standard_D2_v2")) }) It("should launch smaller instances than optimal if larger instance launch results in Insufficient Capacity Error", func() { @@ -1011,7 +1009,7 @@ var _ = Describe("InstanceType Provider", func() { ExpectApplied(ctx, env.Client, nodePool, nodeClass) pod := coretest.UnschedulablePod(coretest.PodOptions{}) ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) - ExpectScheduled(ctx, env.Client, pod) + node := ExpectScheduled(ctx, env.Client, pod) Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1)) vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM @@ -1019,12 +1017,7 @@ var _ = Describe("InstanceType Provider", func() { Expect(vm.Properties.HardwareProfile).ToNot(BeNil()) Expect(utils.IsNvidiaEnabledSKU(string(*vm.Properties.HardwareProfile.VMSize))).To(BeFalse()) - clusterNodes := cluster.Nodes() - node := clusterNodes[0] - if node.Name() == pod.Spec.NodeName { - nodeLabels := node.Labels() - Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-count", "0")) - } + Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-count", "0")) }) It("should schedule GPU pod on GPU capable node", func() { @@ -1054,23 +1047,31 @@ var _ = Describe("InstanceType Provider", func() { }) ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) - ExpectScheduled(ctx, env.Client, pod) - - // Verify that the node has the GPU label set that the pod was scheduled on - clusterNodes := cluster.Nodes() - Expect(clusterNodes).ToNot(BeEmpty()) - Expect(len(clusterNodes)).To(Equal(1)) - node := clusterNodes[0] - Expect(node.Node.Status.Allocatable).To(HaveKeyWithValue(v1.ResourceName("nvidia.com/gpu"), resource.MustParse("1"))) + node := ExpectScheduled(ctx, env.Client, pod) - if node.Name() == pod.Spec.NodeName { - nodeLabels := node.Labels() + // the following checks assume Standard_NC16as_T4_v3 (surprisingly the cheapest GPU in the test set), so test the assumption + Expect(node.Labels).To(HaveKeyWithValue("node.kubernetes.io/instance-type", "Standard_NC16as_T4_v3")) - Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-name", "A100")) - Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-manufacturer", v1alpha2.ManufacturerNvidia)) - Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-count", "1")) + // Verify GPU related settings in bootstrap (assuming one Standard_NC16as_T4_v3) + customData := ExpectDecodedCustomData(azureEnv) + Expect(customData).To(SatisfyAll( + ContainSubstring("GPU_NODE=true"), + ContainSubstring("SGX_NODE=false"), + ContainSubstring("MIG_NODE=false"), + ContainSubstring("CONFIG_GPU_DRIVER_IF_NEEDED=true"), + ContainSubstring("ENABLE_GPU_DEVICE_PLUGIN_IF_NEEDED=false"), + ContainSubstring("GPU_DRIVER_TYPE=\"cuda\""), + ContainSubstring(fmt.Sprintf("GPU_DRIVER_VERSION=\"%s\"", utils.NvidiaCudaDriverVersion)), + ContainSubstring(fmt.Sprintf("GPU_IMAGE_SHA=\"%s\"", utils.AKSGPUCudaVersionSuffix)), + ContainSubstring("GPU_NEEDS_FABRIC_MANAGER=\"false\""), + ContainSubstring("GPU_INSTANCE_PROFILE=\"\""), + )) - } + // Verify that the node the pod was scheduled on has GPU resource and labels set + Expect(node.Status.Allocatable).To(HaveKeyWithValue(v1.ResourceName("nvidia.com/gpu"), resource.MustParse("1"))) + Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-name", "T4")) + Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-manufacturer", v1alpha2.ManufacturerNvidia)) + Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-count", "1")) }) }) diff --git a/pkg/providers/launchtemplate/launchtemplate.go b/pkg/providers/launchtemplate/launchtemplate.go index 9fed3b89d..3f1a1a853 100644 --- a/pkg/providers/launchtemplate/launchtemplate.go +++ b/pkg/providers/launchtemplate/launchtemplate.go @@ -152,6 +152,7 @@ func (p *Provider) getStaticParameters(ctx context.Context, instanceType *cloudp Arch: arch, GPUNode: utils.IsNvidiaEnabledSKU(instanceType.Name), GPUDriverVersion: utils.GetGPUDriverVersion(instanceType.Name), + GPUDriverType: utils.GetGPUDriverType(instanceType.Name), GPUImageSHA: utils.GetAKSGPUImageSHA(instanceType.Name), TenantID: p.tenantID, SubscriptionID: p.subscriptionID, diff --git a/pkg/providers/launchtemplate/parameters/types.go b/pkg/providers/launchtemplate/parameters/types.go index 8f2bda9f4..743e54be7 100644 --- a/pkg/providers/launchtemplate/parameters/types.go +++ b/pkg/providers/launchtemplate/parameters/types.go @@ -29,6 +29,7 @@ type StaticParameters struct { Arch string GPUNode bool GPUDriverVersion string + GPUDriverType string GPUImageSHA string TenantID string SubscriptionID string diff --git a/pkg/utils/gpu.go b/pkg/utils/gpu.go index 75dc5bcc7..cb026d5d6 100644 --- a/pkg/utils/gpu.go +++ b/pkg/utils/gpu.go @@ -17,116 +17,73 @@ limitations under the License. package utils import ( + _ "embed" "strings" + + "gopkg.in/yaml.v2" ) // TODO: Get these from agentbaker const ( - Nvidia470CudaDriverVersion = "cuda-470.82.01" - Nvidia550CudaDriverVersion = "cuda-550.54.15" - Nvidia535GridDriverVersion = "grid-535.161.08" - - // These SHAs will change once we update aks-gpu images in aks-gpu repository. We do that fairly rarely at this time. - // So for now these will be kept here like this and periodically bump them - AKSGPUGridSHA = "sha-d1f0ca" - AKSGPUCudaSHA = "sha-2d4c96" + Nvidia470CudaDriverVersion = "470.82.01" + + // https://github.com/Azure/AgentBaker/blob/ddf36a24eafd02ce0589657ff2dc799125f4ad37/parts/linux/cloud-init/artifacts/components.json#L562 + NvidiaCudaDriverVersion = "550.90.12" + AKSGPUCudaVersionSuffix = "20241021235610" + + NvidiaGridDriverVersion = "535.161.08" + AKSGPUGridVersionSuffix = "20241021235607" ) -func GetAKSGPUImageSHA(size string) string { - if UseGridDrivers(size) { - return AKSGPUGridSHA - } - return AKSGPUCudaSHA +type NvidiaSKUConfig struct { + NvidiaEnabledSKUFamilies map[string][]string `yaml:"nvidiaEnabledSKUs"` + MarinerNvidiaEnabledSKUFamilies map[string][]string `yaml:"marinerNvidiaEnabledSKUs"` } var ( - /* If a new GPU sku becomes available, add a key to this map, but only if you have a confirmation - that we have an agreement with NVIDIA for this specific gpu. - */ - NvidiaEnabledSKUs = map[string]bool{ - // M60 - "standard_nv6": true, - "standard_nv12": true, - "standard_nv12s_v3": true, - "standard_nv24": true, - "standard_nv24s_v3": true, - "standard_nv24r": true, - "standard_nv48s_v3": true, - // P40 - "standard_nd6s": true, - "standard_nd12s": true, - "standard_nd24s": true, - "standard_nd24rs": true, - // P100 - "standard_nc6s_v2": true, - "standard_nc12s_v2": true, - "standard_nc24s_v2": true, - "standard_nc24rs_v2": true, - // V100 - "standard_nc6s_v3": true, - "standard_nc12s_v3": true, - "standard_nc24s_v3": true, - "standard_nc24rs_v3": true, - "standard_nd40s_v3": true, - "standard_nd40rs_v2": true, - // T4 - "standard_nc4as_t4_v3": true, - "standard_nc8as_t4_v3": true, - "standard_nc16as_t4_v3": true, - "standard_nc64as_t4_v3": true, - // A100 40GB - "standard_nd96asr_v4": true, - "standard_nd112asr_a100_v4": true, - "standard_nd120asr_a100_v4": true, - // A100 80GB - "standard_nd96amsr_a100_v4": true, - "standard_nd112amsr_a100_v4": true, - "standard_nd120amsr_a100_v4": true, - // A100 PCIE 80GB - "standard_nc24ads_a100_v4": true, - "standard_nc48ads_a100_v4": true, - "standard_nc96ads_a100_v4": true, - "standard_ncads_a100_v4": true, - // A10 - "standard_nc8ads_a10_v4": true, - "standard_nc16ads_a10_v4": true, - "standard_nc32ads_a10_v4": true, - // A10, GRID only - "standard_nv6ads_a10_v5": true, - "standard_nv12ads_a10_v5": true, - "standard_nv18ads_a10_v5": true, - "standard_nv36ads_a10_v5": true, - "standard_nv36adms_a10_v5": true, - "standard_nv72ads_a10_v5": true, - // A100 - "standard_nd96ams_v4": true, - "standard_nd96ams_a100_v4": true, + nvidiaEnabledSKUs = make(map[string]bool) + marinerNvidiaEnabledSKUs = make(map[string]bool) +) + +//go:embed supported-gpus.yaml +var configFile []byte + +func init() { + readNvidiaSKUConfig() +} + +func readNvidiaSKUConfig() { + var nvidiaSKUConfig NvidiaSKUConfig + + err := yaml.Unmarshal(configFile, &nvidiaSKUConfig) + if err != nil { + panic(err) } + for _, skus := range nvidiaSKUConfig.NvidiaEnabledSKUFamilies { + for _, sku := range skus { + nvidiaEnabledSKUs[sku] = true + } + } + for _, skus := range nvidiaSKUConfig.MarinerNvidiaEnabledSKUFamilies { + for _, sku := range skus { + marinerNvidiaEnabledSKUs[sku] = true + } + } +} - // List of GPU SKUs currently enabled and validated for Mariner. Will expand the support - // to cover other SKUs available in Azure - MarinerNvidiaEnabledSKUs = map[string]bool{ - // V100 - "standard_nc6s_v3": true, - "standard_nc12s_v3": true, - "standard_nc24s_v3": true, - "standard_nc24rs_v3": true, - "standard_nd40s_v3": true, - "standard_nd40rs_v2": true, - // T4 - "standard_nc4as_t4_v3": true, - "standard_nc8as_t4_v3": true, - "standard_nc16as_t4_v3": true, - "standard_nc64as_t4_v3": true, +func GetAKSGPUImageSHA(size string) string { + if UseGridDrivers(size) { + return AKSGPUGridVersionSuffix } -) + return AKSGPUCudaVersionSuffix +} // IsNvidiaEnabledSKU determines if an VM SKU has nvidia driver support func IsNvidiaEnabledSKU(vmSize string) bool { // Trim the optional _Promo suffix. vmSize = strings.ToLower(vmSize) vmSize = strings.TrimSuffix(vmSize, "_promo") - return NvidiaEnabledSKUs[vmSize] + return nvidiaEnabledSKUs[vmSize] } // IsNvidiaEnabledSKU determines if an VM SKU has nvidia driver support @@ -134,7 +91,7 @@ func IsMarinerEnabledGPUSKU(vmSize string) bool { // Trim the optional _Promo suffix. vmSize = strings.ToLower(vmSize) vmSize = strings.TrimSuffix(vmSize, "_promo") - return MarinerNvidiaEnabledSKUs[vmSize] + return marinerNvidiaEnabledSKUs[vmSize] } // NV series GPUs target graphics workloads vs NC which targets compute. @@ -143,12 +100,20 @@ func IsMarinerEnabledGPUSKU(vmSize string) bool { // NVv3 is untested on AKS, NVv4 is AMD so n/a, and NVv2 no longer seems to exist (?). func GetGPUDriverVersion(size string) string { if UseGridDrivers(size) { - return Nvidia535GridDriverVersion + return NvidiaGridDriverVersion } if isStandardNCv1(size) { return Nvidia470CudaDriverVersion } - return Nvidia550CudaDriverVersion + return NvidiaCudaDriverVersion +} + +// GetGPUDriverType returns the type of GPU driver for given VM SKU ("grid" or "cuda") +func GetGPUDriverType(size string) string { + if UseGridDrivers(size) { + return "grid" + } + return "cuda" } func isStandardNCv1(size string) bool { diff --git a/pkg/utils/gpu_test.go b/pkg/utils/gpu_test.go index ed6529bcc..7e95b5403 100644 --- a/pkg/utils/gpu_test.go +++ b/pkg/utils/gpu_test.go @@ -25,23 +25,24 @@ import ( func TestGetAKSGPUImageSHA(t *testing.T) { assert := assert.New(t) tests := []struct { - name string - size string - output string + name string + size string + gpuDriverSha string + gpuDriverType string }{ - {"GRID Driver - NC Series v4", "standard_nc8ads_a10_v4", AKSGPUGridSHA}, - {"Cuda Driver - NV Series", "standard_nv6", AKSGPUCudaSHA}, - {"CUDA Driver - NC Series", "standard_nc6s_v3", AKSGPUCudaSHA}, - {"GRID Driver - NV Series v5", "standard_nv6ads_a10_v5", AKSGPUGridSHA}, - {"Unknown SKU", "unknown_sku", AKSGPUCudaSHA}, - {"CUDA Driver - NC Series v2", "standard_nc6s_v2", AKSGPUCudaSHA}, - {"CUDA Driver - NV Series v3", "standard_nv12s_v3", AKSGPUCudaSHA}, + {"GRID Driver - NC Series v4", "standard_nc8ads_a10_v4", AKSGPUGridVersionSuffix, "grid"}, + {"Cuda Driver - NV Series", "standard_nv6", AKSGPUCudaVersionSuffix, "cuda"}, + {"CUDA Driver - NC Series", "standard_nc6s_v3", AKSGPUCudaVersionSuffix, "cuda"}, + {"GRID Driver - NV Series v5", "standard_nv6ads_a10_v5", AKSGPUGridVersionSuffix, "grid"}, + {"Unknown SKU", "unknown_sku", AKSGPUCudaVersionSuffix, "cuda"}, + {"CUDA Driver - NC Series v2", "standard_nc6s_v2", AKSGPUCudaVersionSuffix, "cuda"}, + {"CUDA Driver - NV Series v3", "standard_nv12s_v3", AKSGPUCudaVersionSuffix, "cuda"}, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - result := GetAKSGPUImageSHA(test.size) - assert.Equal(test.output, result, "Failed for size: %s", test.size) + assert.Equal(test.gpuDriverSha, GetAKSGPUImageSHA(test.size), "Failed for size: %s", test.size) + assert.Equal(test.gpuDriverType, GetGPUDriverType(test.size), "Failed for size: %s", test.size) }) } } @@ -53,12 +54,12 @@ func TestGetGPUDriverVersion(t *testing.T) { size string output string }{ - {"GRID Driver - NV Series v5", "standard_nv6ads_a10_v5", Nvidia535GridDriverVersion}, + {"GRID Driver - NV Series v5", "standard_nv6ads_a10_v5", NvidiaGridDriverVersion}, {"CUDA Driver - NC Series v1", "standard_nc6s", Nvidia470CudaDriverVersion}, - {"CUDA Driver - NC Series v2", "standard_nc6s_v2", Nvidia550CudaDriverVersion}, - {"Unknown SKU", "unknown_sku", Nvidia550CudaDriverVersion}, - {"CUDA Driver - NC Series v3", "standard_nc6s_v3", Nvidia550CudaDriverVersion}, - {"GRID Driver - A10", "standard_nc8ads_a10_v4", Nvidia535GridDriverVersion}, + {"CUDA Driver - NC Series v2", "standard_nc6s_v2", NvidiaCudaDriverVersion}, + {"Unknown SKU", "unknown_sku", NvidiaCudaDriverVersion}, + {"CUDA Driver - NC Series v3", "standard_nc6s_v3", NvidiaCudaDriverVersion}, + {"GRID Driver - A10", "standard_nc8ads_a10_v4", NvidiaGridDriverVersion}, } for _, test := range tests { diff --git a/pkg/utils/supported-gpus.yaml b/pkg/utils/supported-gpus.yaml new file mode 100644 index 000000000..1b5c73b57 --- /dev/null +++ b/pkg/utils/supported-gpus.yaml @@ -0,0 +1,110 @@ +nvidiaEnabledSKUs: + StandardNCADSA10v4Family: + - standard_nc16ads_a10_v4 + - standard_nc32ads_a10_v4 + - standard_nc8ads_a10_v4 + StandardNCADSA100v4Family: + - standard_nc24ads_a100_v4 + - standard_nc48ads_a100_v4 + - standard_nc96ads_a100_v4 + StandardNCASv3_T4Family: + - standard_nc16as_t4_v3 + - standard_nc4as_t4_v3 + - standard_nc64as_t4_v3 + - standard_nc8as_t4_v3 + StandardNCadsH100v5Family: + - standard_nc40ads_h100_v5 + - standard_nc80adis_h100_v5 + StandardNDASv4_A100Family: + - standard_nd96asr_v4 + - standard_nd112asr_a100_v4 + - standard_nd120asr_a100_v4 + StandardNVADSA10v5Family: + - standard_nv12ads_a10_v5 + - standard_nv18ads_a10_v5 + - standard_nv36adms_a10_v5 + - standard_nv36ads_a10_v5 + - standard_nv6ads_a10_v5 + - standard_nv72ads_a10_v5 + standardNCFamily: + - standard_nc6 + - standard_nc12 + - standard_nc24 + - standard_nc24r + standardNCSv3Family: + - standard_nc12s_v3 + - standard_nc24rs_v3 + - standard_nc24s_v3 + - standard_nc6s_v3 + standardNCSv2Family: + - standard_nc12s_v2 + - standard_nc24rs_v2 + - standard_nc24s_v2 + - standard_nc6s_v2 + standardNDAMSv4_A100Family: + - standard_nd96amsr_a100_v4 + - standard_nd112amsr_a100_v4 + - standard_nd120amsr_a100_v4 + standardNDSH100v5Family: + - standard_nd96isr_h100_v5 + - standard_nd96is_h100_v5 + - standard_nd96is_noib_h100_v5 + - standard_nd96is_flex_h100_v5 + standardNDSFamily: + - standard_nd6s + - standard_nd12s + - standard_nd24s + - standard_nd24rs + standardNDSv2Family: + - standard_nd40rs_v2 + standardNDSv3Family: + - standard_nd40s_v3 + standardNVFamily: + - standard_nv12 + - standard_nv24 + - standard_nv6 + - standard_nv24r + standardNVSv3Family: + - standard_nv12s_v3 + - standard_nv24s_v3 + - standard_nv48s_v3 + standardNVPromoFamily: + - standard_nv12_promo + - standard_nv24_promo + - standard_nv6_promo + standardNDSFH100v5Family: + - standard_nd96isf_h100_v5 + - standard_nd96isrf_h100_v5 + standardNDAMSv4A100noRDMAFamily: + - standard_nd96ams_v4 + - standard_nd96ams_a100_flex_v4 + - standard_nd96ams_a100_v4 + - standard_nd96amsf_a100_v4 + standardNDAMSFv4_A100Family: + - standard_nd96amsrf_a100_v4 + standardNCPromoFamily: + - standard_nc12_promo + - standard_nc24r_promo + - standard_nc24_promo + - standard_nc6_promo + standardNDSH200v5Family: + - standard_nd96isr_h200_v5 + - standard_nd96is_h200_v5 + standardNDSFH200v5Family: + - standard_nd96isf_h200_v5 + - standard_nd96isrf_h200_v5 +marinerNvidiaEnabledSKUs: + StandardNCASv3_T4Family: + - standard_nc16as_t4_v3 + - standard_nc4as_t4_v3 + - standard_nc64as_t4_v3 + - standard_nc8as_t4_v3 + standardNCSv3Family: + - standard_nc12s_v3 + - standard_nc24rs_v3 + - standard_nc24s_v3 + - standard_nc6s_v3 + standardNDSv2Family: + - standard_nd40rs_v2 + standardNDSv3Family: + - standard_nd40s_v3 diff --git a/typos.toml b/typos.toml index 6f1a81dc6..8ad784397 100644 --- a/typos.toml +++ b/typos.toml @@ -1,3 +1,5 @@ [default.extend-words] aks = "aks" ERRO = "ERRO" +nd = "nd" +ND = "ND"