
Upgrade to latest NVIDIA NVML package
Harish Senthilkumar committed Jan 8, 2025
1 parent c46c0f4 commit 3bf8ff8
Showing 45 changed files with 32,862 additions and 7,384 deletions.
4 changes: 2 additions & 2 deletions ecs-init/go.mod
@@ -3,7 +3,7 @@ module github.com/aws/amazon-ecs-agent/ecs-init
go 1.22

require (
-	github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5
+	github.com/NVIDIA/go-nvml v0.12.4-0
	github.com/aws/aws-sdk-go-v2 v1.31.0
	github.com/aws/aws-sdk-go-v2/config v1.27.37
	github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.14
@@ -16,7 +16,7 @@ require (
	github.com/fsouza/go-dockerclient v1.10.1
	github.com/golang/mock v1.6.0
	github.com/pkg/errors v0.9.1
-	github.com/stretchr/testify v1.8.4
+	github.com/stretchr/testify v1.9.0
)

require (
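For reference, a dependency swap like the one above is what a standard module update would produce; a sketch of the toolchain commands that could yield this go.mod/go.sum change (an assumption, not taken from this commit):

```sh
# Pull in the new NVML binding at the version pinned by this commit,
# then prune the now-unused gpu-monitoring-tools entry.
go get github.com/NVIDIA/go-nvml@v0.12.4-0
go mod tidy
```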
8 changes: 4 additions & 4 deletions ecs-init/go.sum
@@ -6,8 +6,8 @@ github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migc
github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM=
github.com/Microsoft/hcsshim v0.9.10 h1:TxXGNmcbQxBKVWvjvTocNb6jrPyeHlk5EiDhhgHgggs=
github.com/Microsoft/hcsshim v0.9.10/go.mod h1:7pLA8lDk46WKDWlVsENo92gC0XFa8rbKfyFRBqxEbCc=
-github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5 h1:WLyvLAM0QfjAarRzRTG9EgT5McqGWNZMvqqSUSoyUUY=
-github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5/go.mod h1:nMOvShGpWaf0bXwXmeu4k+O4uziuaEI8pWzIj3BUrOA=
+github.com/NVIDIA/go-nvml v0.12.4-0 h1:4tkbB3pT1O77JGr0gQ6uD8FrsUPqP1A/EOEm2wI1TUg=
+github.com/NVIDIA/go-nvml v0.12.4-0/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ=
github.com/aws/aws-sdk-go-v2 v1.31.0 h1:3V05LbxTSItI5kUqNwhJrrrY1BAXxXt0sN0l72QmG5U=
github.com/aws/aws-sdk-go-v2 v1.31.0/go.mod h1:ztolYtaEUtdpf9Wftr31CJfLVjOnD/CVRkKOOYgF8hA=
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.5 h1:xDAuZTn4IMm8o1LnBZvmrL8JA1io4o3YWNXgohbf20g=
@@ -113,8 +113,8 @@ github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVs
github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
-github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
-github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
+github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
+github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
9,299 changes: 9,299 additions & 0 deletions ecs-init/gpu/mocks/mock_nvml_device.go

Large diffs are not rendered by default.
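The new mocks/mock_nvml_device.go appears to be a gomock-generated mock of go-nvml's Device interface (the module already depends on github.com/golang/mock v1.6.0). A hedged sketch of how such a mock is typically driven in a test; the constructor name NewMockDevice and the import path are inferred from the file location, not verified against the generated code:

```go
package gpu_test

import (
	"testing"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
	"github.com/golang/mock/gomock"

	// Import path inferred from ecs-init/gpu/mocks/mock_nvml_device.go
	// and the module path in go.mod.
	"github.com/aws/amazon-ecs-agent/ecs-init/gpu/mocks"
)

func TestMockDeviceUUID(t *testing.T) {
	ctrl := gomock.NewController(t)
	defer ctrl.Finish()

	// NewMockDevice is the conventional gomock constructor name; the
	// generated file may expose a different identifier.
	device := mocks.NewMockDevice(ctrl)
	device.EXPECT().GetUUID().Return("GPU-00000000-0000-0000-0000-000000000000", nvml.SUCCESS)

	uuid, ret := device.GetUUID()
	if ret != nvml.SUCCESS {
		t.Fatalf("GetUUID failed: %s", nvml.ErrorString(ret))
	}
	if uuid == "" {
		t.Fatal("expected a non-empty UUID")
	}
}
```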

50 changes: 37 additions & 13 deletions ecs-init/gpu/nvidia_gpu_manager.go
@@ -18,7 +18,7 @@ import (
"os"
"path/filepath"

"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
"github.com/NVIDIA/go-nvml/pkg/nvml"
"github.com/cihub/seelog"
"github.com/pkg/errors"
)
@@ -129,7 +129,11 @@ func (n *NvidiaGPUManager) Initialize() error {
var InitializeNVML = InitNVML

func InitNVML() error {
-	return nvml.Init()
+	ret := nvml.Init()
+	if ret != nvml.SUCCESS {
+		return errors.New(nvml.ErrorString(ret))
+	}
+	return nil
}

// Shutdown is for shutting down nvidia's nvml library
@@ -144,7 +148,11 @@ func (n *NvidiaGPUManager) Shutdown() error {
var ShutdownNVML = ShutdownNVMLib

func ShutdownNVMLib() error {
-	return nvml.Shutdown()
+	ret := nvml.Shutdown()
+	if ret != nvml.SUCCESS {
+		return errors.New(nvml.ErrorString(ret))
+	}
+	return nil
}

// GetDriverVersion is for getting Nvidia driver version on the instance
@@ -159,7 +167,11 @@ func (n *NvidiaGPUManager) GetDriverVersion() (string, error) {
var NvmlGetDriverVersion = GetNvidiaDriverVersion

func GetNvidiaDriverVersion() (string, error) {
-	return nvml.GetDriverVersion()
+	version, ret := nvml.SystemGetDriverVersion()
+	if ret != nvml.SUCCESS {
+		return "", errors.New(nvml.ErrorString(ret))
+	}
+	return version, nil
}

// GetGPUDeviceIDs is for getting the GPU device UUIDs
@@ -169,14 +181,18 @@ func (n *NvidiaGPUManager) GetGPUDeviceIDs() ([]string, error) {
return nil, errors.Wrapf(err, "error getting GPU device count for UUID detection")
}
var gpuIDs []string
-	var i uint
-	for i = 0; i < count; i++ {
-		device, err := NvmlNewDeviceLite(i)
-		if err != nil {
-			seelog.Errorf("error initializing device of index %d: %v", i, err)
+	for i := 0; i < int(count); i++ {
+		device, ret := nvml.DeviceGetHandleByIndex(i)
+		if ret != nvml.SUCCESS {
+			seelog.Errorf("error initializing device of index %d: %v", i, nvml.ErrorString(ret))
			continue
		}
-		gpuIDs = append(gpuIDs, device.UUID)
+		uuid, ret := nvml.DeviceGetUUID(device)
+		if ret != nvml.SUCCESS {
+			seelog.Errorf("failed to get UUID for device at index %d: %v", i, nvml.ErrorString(ret))
+			continue
+		}
+		gpuIDs = append(gpuIDs, uuid)
	}
if len(gpuIDs) == 0 {
return gpuIDs, errors.New("error initializing GPU devices")
@@ -188,14 +204,22 @@ var NvmlGetDeviceCount = GetDeviceCount

// GetDeviceCount is for getting the number of GPU devices in the instance
func GetDeviceCount() (uint, error) {
-	return nvml.GetDeviceCount()
+	count, ret := nvml.DeviceGetCount()
+	if ret != nvml.SUCCESS {
+		return 0, errors.New(nvml.ErrorString(ret))
+	}
+	return uint(count), nil
}

var NvmlNewDeviceLite = NewDeviceLite

// NewDeviceLite is for initializing a new GPU device
-func NewDeviceLite(idx uint) (*nvml.Device, error) {
-	return nvml.NewDeviceLite(idx)
+func NewDeviceLite(idx int) (nvml.Device, error) {
+	device, ret := nvml.DeviceGetHandleByIndex(idx)
+	if ret != nvml.SUCCESS {
+		return device, errors.New(nvml.ErrorString(ret))
+	}
+	return device, nil
}

// SaveGPUState saves gpu state info on the disk
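Taken together, the hunks above move every NVML call from the old error-returning gpu-monitoring-tools bindings to go-nvml's status-code convention: each call returns an nvml.Return that is checked against nvml.SUCCESS and rendered with nvml.ErrorString. A minimal, self-contained sketch of that convention (not code from this commit; it assumes only the public go-nvml API exercised above):

```go
package main

import (
	"fmt"
	"log"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

func main() {
	// Initialize the NVML library; every go-nvml call reports status via
	// an nvml.Return value rather than a Go error.
	if ret := nvml.Init(); ret != nvml.SUCCESS {
		log.Fatalf("failed to initialize NVML: %s", nvml.ErrorString(ret))
	}
	defer nvml.Shutdown()

	version, ret := nvml.SystemGetDriverVersion()
	if ret != nvml.SUCCESS {
		log.Fatalf("failed to get driver version: %s", nvml.ErrorString(ret))
	}
	fmt.Println("driver:", version)

	count, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		log.Fatalf("failed to count devices: %s", nvml.ErrorString(ret))
	}
	for i := 0; i < count; i++ {
		device, ret := nvml.DeviceGetHandleByIndex(i)
		if ret != nvml.SUCCESS {
			continue // skip devices that fail to initialize, as the manager does
		}
		uuid, ret := nvml.DeviceGetUUID(device)
		if ret != nvml.SUCCESS {
			continue
		}
		fmt.Printf("GPU %d: %s\n", i, uuid)
	}
}
```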