Skip to content

Commit

Permalink
Expose device UUIDs to node label
Browse files Browse the repository at this point in the history
Signed-off-by: Zubiao Xiong <[email protected]>
  • Loading branch information
xiongzubiao committed Jan 10, 2025
1 parent f7dc5f1 commit 2ad1041
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 0 deletions.
19 changes: 19 additions & 0 deletions internal/lm/nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,11 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
return nil, fmt.Errorf("error creating IMEX labeler: %v", err)
}

uuidLabler, err := newGPUUUIDLabeler(devices)
if err != nil {
return nil, fmt.Errorf("error creating UUID labeler: %v", err)
}

l := Merge(
machineTypeLabeler,
versionLabeler,
Expand All @@ -93,6 +98,7 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
resourceLabeler,
gpuModeLabeler,
imexLabeler,
uuidLabler,
)

return l, nil
Expand Down Expand Up @@ -261,3 +267,16 @@ func getDeviceClasses(devices []resource.Device) ([]uint32, error) {
}
return classes, nil
}

// newGPUUUIDLabeler creates a new labeler that reports the UUIDs of GPUs on the node.
//
// One label is produced per entry in devices, keyed as
// "nvidia.com/gpu-<idx>.uuid", where <idx> is the position of the device in
// the supplied slice and the value is whatever the device's GetUUID returns.
//
// If any device fails to report its UUID, no labeler is returned and the
// underlying error is wrapped with the index of the offending device so that
// the failure can be attributed to a specific GPU.
func newGPUUUIDLabeler(devices []resource.Device) (Labeler, error) {
	labels := make(Labels, len(devices))
	for idx, d := range devices {
		uuid, err := d.GetUUID()
		if err != nil {
			// Wrap with %w so callers can still inspect the original error.
			return nil, fmt.Errorf("error getting UUID for device %d: %w", idx, err)
		}
		labels[fmt.Sprintf("nvidia.com/gpu-%d.uuid", idx)] = uuid
	}
	return labels, nil
}
37 changes: 37 additions & 0 deletions internal/resource/device_mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions internal/resource/nvml-device.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,15 @@ func (d nvmlDevice) GetName() (string, error) {
return name, nil
}

// GetUUID asks the wrapped NVML device for its UUID.
// Any NVML status other than nvml.SUCCESS is handed back as the error,
// together with an empty string.
func (d nvmlDevice) GetUUID() (string, error) {
	id, status := d.Device.GetUUID()
	if status == nvml.SUCCESS {
		return id, nil
	}
	return "", status
}

// GetTotalMemoryMB returns the total memory on a device in MB
func (d nvmlDevice) GetTotalMemoryMB() (uint64, error) {
info, ret := d.Device.GetMemoryInfo()
Expand Down
9 changes: 9 additions & 0 deletions internal/resource/nvml-mig-device.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,15 @@ func (d nvmlMigDevice) GetName() (string, error) {
return resourceName, nil
}

// GetUUID asks the wrapped MIG device for its UUID.
// Any NVML status other than nvml.SUCCESS is handed back as the error,
// together with an empty string.
func (d nvmlMigDevice) GetUUID() (string, error) {
	id, status := d.MigDevice.GetUUID()
	if status == nvml.SUCCESS {
		return id, nil
	}
	return "", status
}

// GetTotalMemoryMB returns the total memory on a device in MB
func (d nvmlMigDevice) GetTotalMemoryMB() (uint64, error) {
attr, err := d.GetAttributes()
Expand Down
6 changes: 6 additions & 0 deletions internal/resource/sysfs-device.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ func (d vfioDevice) GetName() (string, error) {
return d.nvidiaPCIDevice.DeviceName, nil
}

// GetUUID always fails for vfio devices: it returns an empty string and an
// error stating that the operation is not supported.
func (d vfioDevice) GetUUID() (string, error) {
	errUnsupported := fmt.Errorf("GetUUID is not supported for vfio devices")
	return "", errUnsupported
}

// GetTotalMemoryMB returns the total memory on a device in MB
func (d vfioDevice) GetTotalMemoryMB() (uint64, error) {
_, val := d.nvidiaPCIDevice.Resources.GetTotalAddressableMemory(true)
Expand All @@ -72,6 +77,7 @@ func (d vfioDevice) GetPCIClass() (uint32, error) {
// IsFabricAttached always reports false with a nil error for vfio devices.
func (d vfioDevice) IsFabricAttached() (bool, error) {
	const attached = false
	return attached, nil
}

// GetFabricIDs always fails for vfio devices: it returns two empty strings
// and an error stating that the operation is not supported.
func (d vfioDevice) GetFabricIDs() (string, string, error) {
	errUnsupported := fmt.Errorf("GetFabricIDs is not supported for vfio devices")
	return "", "", errUnsupported
}
1 change: 1 addition & 0 deletions internal/resource/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ type Device interface {
GetMigDevices() ([]Device, error)
GetAttributes() (map[string]interface{}, error)
GetName() (string, error)
GetUUID() (string, error)
GetTotalMemoryMB() (uint64, error)
GetDeviceHandleFromMigDeviceHandle() (Device, error)
GetCudaComputeCapability() (int, int, error)
Expand Down

0 comments on commit 2ad1041

Please sign in to comment.