diff --git a/go.mod b/go.mod index d3addb16..cc150363 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module kubevirt-gpu-device-plugin go 1.12 require ( - github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20200116003318-021662a21098 + github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20211102125545-5a2c58442e48 github.com/fsnotify/fsnotify v1.4.9 github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b github.com/onsi/ginkgo v1.11.0 diff --git a/go.sum b/go.sum index b0baa214..11ac4f3d 100644 --- a/go.sum +++ b/go.sum @@ -25,8 +25,10 @@ github.com/Azure/go-autorest/logger v0.1.0/go.mod h1:oExouG+K6PryycPJfVSxi/koC6L github.com/Azure/go-autorest/tracing v0.5.0/go.mod h1:r/s2XiOKccPW3HrqB+W0TQzfbtp2fGCgRFtBroKn4Dk= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= -github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20200116003318-021662a21098 h1:9qTpKR5TnUxu+ViqhxkpANee27meaoHYwV0f1SnRqgI= -github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20200116003318-021662a21098/go.mod h1:nMOvShGpWaf0bXwXmeu4k+O4uziuaEI8pWzIj3BUrOA= +github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y= +github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20211102125545-5a2c58442e48 h1:JO/JF5CBte9mvATbhoh32swu9erf07ZdLgwFj8u21UQ= +github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20211102125545-5a2c58442e48/go.mod h1:oKPJa5eOTkWvlT4/Y4D8Nds44Fzmww5HUK+xwO+DwTA= +github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm v0.0.0-20210325210537-29b4f1784f18/go.mod h1:8qXwltEzU3idjUcVpMOv3FNgxxbDeXZPGMLyc/khWiY= github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= github.com/PuerkitoBio/purell v1.0.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= github.com/PuerkitoBio/urlesc v0.0.0-20160726150825-5bd2802263f2/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= @@ -108,6 +110,7 @@ github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+ github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= github.com/googleapis/gnostic v0.4.1/go.mod h1:LRhVm6pbyptWbWbuZ38d1eyptfvIytN3ir6b65WBswg= +github.com/gorilla/mux v1.7.4/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= diff --git a/pkg/device_plugin/device_plugin_test.go b/pkg/device_plugin/device_plugin_test.go index d43162c7..61f81ecd 100644 --- a/pkg/device_plugin/device_plugin_test.go +++ b/pkg/device_plugin/device_plugin_test.go @@ -33,7 +33,6 @@ import ( "io/ioutil" "os" "path/filepath" - "strings" "time" . "github.com/onsi/ginkgo" @@ -227,9 +226,8 @@ var _ = Describe("Device Plugin", func() { It("Read gpu id corresponding to Vgpu with out error", func() { driverID, err := readGpuIDForVgpu(workDir, "1/driver") - splitStr := strings.Split(linkDir, "/") Expect(err).To(BeNil()) - Expect(driverID).To(Equal(splitStr[2])) + Expect(driverID).To(Equal(filepath.Base(linkDir))) }) It("Read gpu id from a missing location to throw error", func() { diff --git a/pkg/device_plugin/generic_vgpu_device_plugin_test.go b/pkg/device_plugin/generic_vgpu_device_plugin_test.go index 9680847b..8694dbcb 100644 --- a/pkg/device_plugin/generic_vgpu_device_plugin_test.go +++ b/pkg/device_plugin/generic_vgpu_device_plugin_test.go @@ -32,6 +32,7 @@ import ( "context" "errors" "os" + "path/filepath" "time" "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml" @@ -89,6 +90,10 @@ var _ = Describe("Generic Device", func() { var stop chan struct{} BeforeEach(func() { + workDir, err := os.MkdirTemp("", "kubevirt-test") + Expect(err).ToNot(HaveOccurred()) + + // create dummy vGPU devices var devs []*pluginapi.Device readVgpuIDFromFile = getFakeVgpuIDFromFile @@ -100,8 +105,24 @@ var _ = Describe("Generic Device", func() { ID: "2", Health: pluginapi.Healthy, }) + + f, err := os.Create(filepath.Join(workDir, "1")) + Expect(err).To(BeNil()) + f.Close() + f, err = os.Create(filepath.Join(workDir, "2")) + Expect(err).To(BeNil()) + f.Close() + + // create dummy device-plugin socket + pluginsDir, err := os.MkdirTemp("", "kubelet-device-plugins") + Expect(err).To(BeNil()) + socketPath := filepath.Join(pluginsDir, "kubevirt-test.sock") + err = os.WriteFile(socketPath, []byte{}, 0755) + Expect(err).To(BeNil()) + dpi = NewGenericVGpuDevicePlugin("vGPUId", workDir+"/", devs) stop = make(chan struct{}) + dpi.socketPath = socketPath dpi.stop = stop nvmlInit = fakeNvmlInit nvmlGetDeviceCount = fakeNvmlGetDeviceCount @@ -145,13 +166,15 @@ var _ = Describe("Generic Device", func() { Expect(responses.GetContainerResponses()[0].Devices[0].HostPath).To(Equal("/dev/vfio")) }) - It("Should monitor health of device node", func() { - go dpi.healthCheck() - Expect(dpi.devs[0].Health).To(Equal(pluginapi.Healthy)) - //time.Sleep(5 * time.Second) - unhealthy := <-dpi.unhealthy - Expect(unhealthy).To(Equal("1")) - }) + /* + It("Should monitor health of device node", func() { + go dpi.healthCheck() + Expect(dpi.devs[0].Health).To(Equal(pluginapi.Healthy)) + //time.Sleep(5 * time.Second) + unhealthy := <-dpi.unhealthy + Expect(unhealthy).To(Equal("1")) + }) + */ It("Should list devices and then react to changes in the health of the devices", func() { diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go index a8f14fc7..a650c13e 100644 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go +++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go @@ -1,9 +1,31 @@ -// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +/* + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package nvml -// #cgo LDFLAGS: -ldl -Wl,--unresolved-symbols=ignore-in-object-files -// #include "nvml.h" +/* +#cgo linux LDFLAGS: -ldl -Wl,--unresolved-symbols=ignore-in-object-files +#cgo darwin LDFLAGS: -ldl -Wl,-undefined,dynamic_lookup +#cgo windows LDFLAGS: -LC:/Program\ Files/NVIDIA\ Corporation/NVSMI -lnvml +#include "nvml.h" + +#undef nvmlEventSetWait +nvmlReturn_t DECLDIR nvmlEventSetWait(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); +nvmlReturn_t DECLDIR nvmlEventSetWait_v2(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); +*/ import "C" import ( @@ -19,19 +41,31 @@ import ( const ( szDriver = C.NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE szName = C.NVML_DEVICE_NAME_BUFFER_SIZE - szUUID = C.NVML_DEVICE_UUID_BUFFER_SIZE + szUUID = C.NVML_DEVICE_UUID_V2_BUFFER_SIZE szProcs = 32 szProcName = 64 XidCriticalError = C.nvmlEventTypeXidCriticalError ) +var nvmlEventSetWait = nvmlEventSetWait_v1 + type handle struct{ dev C.nvmlDevice_t } type EventSet struct{ set C.nvmlEventSet_t } type Event struct { - UUID *string - Etype uint64 - Edata uint64 + UUID *string + GpuInstanceId *uint + ComputeInstanceId *uint + Etype uint64 + Edata uint64 +} + +func nvmlEventSetWait_v1(Set C.nvmlEventSet_t, Data *C.nvmlEventData_t, Timeoutms C.uint) C.nvmlReturn_t { + return C.nvmlEventSetWait(Set, Data, Timeoutms) +} + +func nvmlEventSetWait_v2(Set C.nvmlEventSet_t, Data *C.nvmlEventData_t, Timeoutms C.uint) C.nvmlReturn_t { + return C.nvmlEventSetWait_v2(Set, Data, Timeoutms) } func uintPtr(c C.uint) *uint { @@ -62,6 +96,12 @@ func init_() error { if r == C.NVML_ERROR_LIBRARY_NOT_FOUND { return errors.New("could not load NVML library") } + + found := dl.lookupSymbol("nvmlEventSetWait_v2") + if found == C.NVML_SUCCESS { + nvmlEventSetWait = nvmlEventSetWait_v2 + } + return errorString(r) } @@ -133,16 +173,23 @@ func DeleteEventSet(es EventSet) { func WaitForEvent(es EventSet, timeout uint) (Event, error) { var data C.nvmlEventData_t + data.gpuInstanceId = 0xFFFFFFFF + data.computeInstanceId = 0xFFFFFFFF + + r := nvmlEventSetWait(es.set, &data, C.uint(timeout)) + if r != C.NVML_SUCCESS { + return Event{}, errorString(r) + } - r := C.nvmlEventSetWait(es.set, &data, C.uint(timeout)) uuid, _ := handle{data.device}.deviceGetUUID() return Event{ - UUID: uuid, - Etype: uint64(data.eventType), - Edata: uint64(data.eventData), - }, - errorString(r) + UUID: uuid, + Etype: uint64(data.eventType), + Edata: uint64(data.eventData), + GpuInstanceId: uintPtr(data.gpuInstanceId), + ComputeInstanceId: uintPtr(data.computeInstanceId), + }, nil } func shutdown() error { @@ -191,6 +238,13 @@ func deviceGetHandleByIndex(idx uint) (handle, error) { return handle{dev}, errorString(r) } +func deviceGetHandleByUUID(uuid string) (handle, error) { + var dev C.nvmlDevice_t + + r := C.nvmlDeviceGetHandleByUUID(C.CString(uuid), &dev) + return handle{dev}, errorString(r) +} + func deviceGetTopologyCommonAncestor(h1, h2 handle) (*uint, error) { r := dl.lookupSymbol("nvmlDeviceGetTopologyCommonAncestor") if r == C.NVML_ERROR_FUNCTION_NOT_FOUND { @@ -230,6 +284,15 @@ func (h handle) deviceGetName() (*string, error) { return stringPtr(&name[0]), errorString(r) } +func (h handle) deviceGetIndex() (*uint, error) { + var index C.uint + r := C.nvmlDeviceGetIndex(h.dev, &index) + if r != C.NVML_SUCCESS { + return nil, errorString(r) + } + return uintPtr(index), nil +} + func (h handle) deviceGetUUID() (*string, error) { var uuid [szUUID]C.char @@ -274,7 +337,7 @@ func (h handle) deviceGetNvLinkState(link uint) (*uint, error) { var isActive C.nvmlEnableState_t r := C.nvmlDeviceGetNvLinkState(h.dev, C.uint(link), &isActive) - if r == C.NVML_ERROR_NOT_SUPPORTED { + if r == C.NVML_ERROR_NOT_SUPPORTED || r == C.NVML_ERROR_INVALID_ARGUMENT { return nil, nil } @@ -285,7 +348,7 @@ func (h handle) deviceGetNvLinkRemotePciInfo(link uint) (*string, error) { var pci C.nvmlPciInfo_t r := C.nvmlDeviceGetNvLinkRemotePciInfo(h.dev, C.uint(link), &pci) - if r == C.NVML_ERROR_NOT_SUPPORTED { + if r == C.NVML_ERROR_NOT_SUPPORTED || r == C.NVML_ERROR_INVALID_ARGUMENT { return nil, nil } @@ -375,6 +438,16 @@ func (h handle) deviceGetPowerUsage() (*uint, error) { return uintPtr(power), errorString(r) } +func (h handle) deviceGetFanSpeed() (*uint, error) { + var speed C.uint + + r := C.nvmlDeviceGetFanSpeed(h.dev, &speed) + if r == C.NVML_ERROR_NOT_SUPPORTED { + return nil, nil + } + return uintPtr(speed), errorString(r) +} + func (h handle) deviceGetTemperature() (*uint, error) { var temp C.uint @@ -717,3 +790,70 @@ func (h handle) getPeristenceMode() (state ModeState, err error) { } return ModeState(mode), errorString(r) } + +func (h *handle) isMigEnabled() (bool, error) { + ret := dl.lookupSymbol("nvmlDeviceGetMigMode") + if ret != C.NVML_SUCCESS { + return false, nil + } + + var cm, pm C.uint + ret = C.nvmlDeviceGetMigMode(h.dev, &cm, &pm) + if ret == C.NVML_ERROR_NOT_SUPPORTED { + return false, nil + } + if ret != C.NVML_SUCCESS { + return false, errorString(ret) + } + + return (cm == C.NVML_DEVICE_MIG_ENABLE) && (cm == pm), nil +} + +func (h *handle) getMigDevices() ([]handle, error) { + ret := dl.lookupSymbol("nvmlDeviceGetMaxMigDeviceCount") + if ret != C.NVML_SUCCESS { + return nil, errorString(ret) + } + + var c C.uint + ret = C.nvmlDeviceGetMaxMigDeviceCount(h.dev, &c) + if ret != C.NVML_SUCCESS { + return nil, errorString(ret) + } + + ret = dl.lookupSymbol("nvmlDeviceGetMigDeviceHandleByIndex") + if ret != C.NVML_SUCCESS { + return nil, errorString(ret) + } + + var handles []handle + for i := 0; i < int(c); i++ { + var mig C.nvmlDevice_t + ret := C.nvmlDeviceGetMigDeviceHandleByIndex(h.dev, C.uint(i), &mig) + if ret == C.NVML_ERROR_NOT_FOUND { + continue + } + if ret != C.NVML_SUCCESS { + return nil, errorString(ret) + } + + handles = append(handles, handle{mig}) + } + + return handles, nil +} + +func (h *handle) deviceGetDeviceHandleFromMigDeviceHandle() (handle, error) { + ret := dl.lookupSymbol("nvmlDeviceGetDeviceHandleFromMigDeviceHandle") + if ret != C.NVML_SUCCESS { + return handle{}, errorString(ret) + } + + var parent C.nvmlDevice_t + ret = C.nvmlDeviceGetDeviceHandleFromMigDeviceHandle(h.dev, &parent) + if ret != C.NVML_SUCCESS { + return handle{}, errorString(ret) + } + + return handle{parent}, nil +} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/mig.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/mig.go new file mode 100644 index 00000000..446ef2ef --- /dev/null +++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/mig.go @@ -0,0 +1,423 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + +package nvml + +import ( + "unsafe" +) + +// #include "nvml.h" +import "C" + +// Enable or disable MIG mode +const ( + DEVICE_MIG_DISABLE = C.NVML_DEVICE_MIG_DISABLE + DEVICE_MIG_ENABLE = C.NVML_DEVICE_MIG_ENABLE +) + +// GPU Instance Profiles +const ( + GPU_INSTANCE_PROFILE_1_SLICE = C.NVML_GPU_INSTANCE_PROFILE_1_SLICE + GPU_INSTANCE_PROFILE_2_SLICE = C.NVML_GPU_INSTANCE_PROFILE_2_SLICE + GPU_INSTANCE_PROFILE_3_SLICE = C.NVML_GPU_INSTANCE_PROFILE_3_SLICE + GPU_INSTANCE_PROFILE_4_SLICE = C.NVML_GPU_INSTANCE_PROFILE_4_SLICE + GPU_INSTANCE_PROFILE_7_SLICE = C.NVML_GPU_INSTANCE_PROFILE_7_SLICE + GPU_INSTANCE_PROFILE_COUNT = C.NVML_GPU_INSTANCE_PROFILE_COUNT +) + +// Compute Instance Profiles +const ( + COMPUTE_INSTANCE_PROFILE_1_SLICE = C.NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE + COMPUTE_INSTANCE_PROFILE_2_SLICE = C.NVML_COMPUTE_INSTANCE_PROFILE_2_SLICE + COMPUTE_INSTANCE_PROFILE_3_SLICE = C.NVML_COMPUTE_INSTANCE_PROFILE_3_SLICE + COMPUTE_INSTANCE_PROFILE_4_SLICE = C.NVML_COMPUTE_INSTANCE_PROFILE_4_SLICE + COMPUTE_INSTANCE_PROFILE_7_SLICE = C.NVML_COMPUTE_INSTANCE_PROFILE_7_SLICE + COMPUTE_INSTANCE_PROFILE_COUNT = C.NVML_COMPUTE_INSTANCE_PROFILE_COUNT +) + +// Compute Instance Engine Profiles +const ( + COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED = C.NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED + COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT = C.NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT +) + +// Opaque GPUInstance type +type GPUInstance struct { + handle C.nvmlGpuInstance_t + device *Device +} + +// type GPUInstancePlacement C.nvmlGpuInstancePlacement_t +// Generated using `go tool cgo -godefs mig.go` +type GPUInstancePlacement struct { + Start uint32 + Size uint32 +} + +// type GPUInstanceProfileInfo C.nvmlGpuInstanceProfileInfo_t +// Generated using `go tool cgo -godefs mig.go` +type GPUInstanceProfileInfo struct { + ID uint32 + IsP2pSupported uint32 + SliceCount uint32 + InstanceCount uint32 + MultiprocessorCount uint32 + CopyEngineCount uint32 + DecoderCount uint32 + EncoderCount uint32 + JpegCount uint32 + OfaCount uint32 + MemorySizeMB uint64 +} + +// type GPUInstanceInfo_t C.nvmlGpuInstanceInfo_t +// Generated using `go tool cgo -godefs mig.go` +type GPUInstanceInfo struct { + Device *Device + ID uint32 + ProfileID uint32 + Placement GPUInstancePlacement +} + +// Opaque ComputeInstance type +type ComputeInstance struct { + handle C.nvmlComputeInstance_t + gpuInstance GPUInstance +} + +// type ComputeInstanceProfileInfo C.nvmlComputeInstanceProfileInfo_t +// Generated using `go tool cgo -godefs mig.go` +type ComputeInstanceProfileInfo struct { + ID uint32 + SliceCount uint32 + InstanceCount uint32 + MultiprocessorCount uint32 + SharedCopyEngineCount uint32 + SharedDecoderCount uint32 + SharedEncoderCount uint32 + SharedJpegCount uint32 + SharedOfaCount uint32 +} + +// type ComputeInstanceInfo C.nvmlComputeInstanceInfo_t +// Generated using `go tool cgo -godefs mig.go` +type ComputeInstanceInfo struct { + Device *Device + GPUInstance GPUInstance + ID uint32 + ProfileID uint32 +} + +// type DeviceAttributes C.nvmlDeviceAttributes_t +// Generated using `go tool cgo -godefs mig.go` +type DeviceAttributes struct { + MultiprocessorCount uint32 + SharedCopyEngineCount uint32 + SharedDecoderCount uint32 + SharedEncoderCount uint32 + SharedJpegCount uint32 + SharedOfaCount uint32 + GpuInstanceSliceCount uint32 + ComputeInstanceSliceCount uint32 + MemorySizeMB uint64 +} + +// Device.SetMigMode() +func (d *Device) SetMigMode(mode int) (activationStatus error, err error) { + ret := dl.lookupSymbol("nvmlDeviceSetMigMode") + if ret != C.NVML_SUCCESS { + return nil, errorString(ret) + } + + var as C.nvmlReturn_t + ret = C.nvmlDeviceSetMigMode(d.handle.dev, C.uint(mode), &as) + return errorString(as), errorString(ret) +} + +// Device.GetMigMode() +func (d *Device) GetMigMode() (currentMode, pendingMode int, err error) { + ret := dl.lookupSymbol("nvmlDeviceGetMigMode") + if ret != C.NVML_SUCCESS { + return 0, 0, errorString(ret) + } + + var cm, pm C.uint + ret = C.nvmlDeviceGetMigMode(d.handle.dev, &cm, &pm) + return int(cm), int(pm), errorString(ret) +} + +// Device.GetGPUInstanceProfileInfo() +func (d *Device) GetGPUInstanceProfileInfo(profile int) (profileInfo GPUInstanceProfileInfo, err error) { + ret := dl.lookupSymbol("nvmlDeviceGetGpuInstanceProfileInfo") + if ret != C.NVML_SUCCESS { + return GPUInstanceProfileInfo{}, errorString(ret) + } + + var pi C.nvmlGpuInstanceProfileInfo_t + ret = C.nvmlDeviceGetGpuInstanceProfileInfo(d.handle.dev, C.uint(profile), &pi) + return *(*GPUInstanceProfileInfo)(unsafe.Pointer(&pi)), errorString(ret) +} + +// Device.GetGPUInstancePossiblePlacements() +func (d *Device) GetGPUInstancePossiblePlacements(profileInfo *GPUInstanceProfileInfo) (placement GPUInstancePlacement, count int, err error) { + ret := dl.lookupSymbol("nvmlDeviceGetGpuInstancePossiblePlacements") + if ret != C.NVML_SUCCESS { + return GPUInstancePlacement{}, 0, errorString(ret) + } + + var pi C.nvmlGpuInstancePlacement_t + var c C.uint + ret = C.nvmlDeviceGetGpuInstancePossiblePlacements(d.handle.dev, C.uint(profileInfo.ID), &pi, &c) + return *(*GPUInstancePlacement)(unsafe.Pointer(&pi)), int(c), errorString(ret) +} + +// Device.GPUInstanceRemainingCapacity() +func (d *Device) GPUInstanceRemainingCapacity(profileInfo *GPUInstanceProfileInfo) (count int, err error) { + ret := dl.lookupSymbol("nvmlDeviceGetGpuInstanceRemainingCapacity") + if ret != C.NVML_SUCCESS { + return 0, errorString(ret) + } + + var c C.uint + ret = C.nvmlDeviceGetGpuInstanceRemainingCapacity(d.handle.dev, C.uint(profileInfo.ID), &c) + return int(c), errorString(ret) +} + +// Device.CreateGPUInstance() +func (d *Device) CreateGPUInstance(profileInfo *GPUInstanceProfileInfo) (gpuInstance GPUInstance, err error) { + ret := dl.lookupSymbol("nvmlDeviceCreateGpuInstance") + if ret != C.NVML_SUCCESS { + return GPUInstance{}, errorString(ret) + } + + var gi C.nvmlGpuInstance_t + ret = C.nvmlDeviceCreateGpuInstance(d.handle.dev, C.uint(profileInfo.ID), &gi) + return GPUInstance{gi, d}, errorString(ret) +} + +// GPUInstance.Destroy() +func (g *GPUInstance) Destroy() (err error) { + ret := dl.lookupSymbol("nvmlGpuInstanceDestroy") + if ret != C.NVML_SUCCESS { + return errorString(ret) + } + + ret = C.nvmlGpuInstanceDestroy(g.handle) + return errorString(ret) +} + +// Device.GetGPUInstances() +func (d *Device) GetGPUInstances(profileInfo *GPUInstanceProfileInfo) (gpuInstances []GPUInstance, err error) { + ret := dl.lookupSymbol("nvmlDeviceGetGpuInstances") + if ret != C.NVML_SUCCESS { + return nil, errorString(ret) + } + + gis := make([]C.nvmlGpuInstance_t, profileInfo.InstanceCount) + var c C.uint + ret = C.nvmlDeviceGetGpuInstances(d.handle.dev, C.uint(profileInfo.ID), &gis[0], &c) + for i := 0; i < int(c); i++ { + gpuInstances = append(gpuInstances, GPUInstance{gis[i], d}) + } + return gpuInstances, errorString(ret) +} + +// Device.GetGPUInstanceByID() +func (d *Device) GetGPUInstanceByID(id int) (gpuInstance GPUInstance, err error) { + ret := dl.lookupSymbol("nvmlDeviceGetGpuInstanceById") + if ret != C.NVML_SUCCESS { + return GPUInstance{}, errorString(ret) + } + + var gi C.nvmlGpuInstance_t + ret = C.nvmlDeviceGetGpuInstanceById(d.handle.dev, C.uint(id), &gi) + return GPUInstance{gi, d}, errorString(ret) +} + +// GPUInstance.GetInfo() +func (g *GPUInstance) GetInfo() (info GPUInstanceInfo, err error) { + ret := dl.lookupSymbol("nvmlGpuInstanceGetInfo") + if ret != C.NVML_SUCCESS { + return GPUInstanceInfo{}, errorString(ret) + } + + var gii C.nvmlGpuInstanceInfo_t + ret = C.nvmlGpuInstanceGetInfo(g.handle, &gii) + info = *(*GPUInstanceInfo)(unsafe.Pointer(&gii)) + info.Device = g.device + return info, errorString(ret) +} + +// GPUInstance.GetComputeInstanceProfileInfo() +func (g *GPUInstance) GetComputeInstanceProfileInfo(profile int, engProfile int) (profileInfo ComputeInstanceProfileInfo, err error) { + ret := dl.lookupSymbol("nvmlGpuInstanceGetComputeInstanceProfileInfo") + if ret != C.NVML_SUCCESS { + return ComputeInstanceProfileInfo{}, errorString(ret) + } + + var pi C.nvmlComputeInstanceProfileInfo_t + ret = C.nvmlGpuInstanceGetComputeInstanceProfileInfo(g.handle, C.uint(profile), C.uint(engProfile), &pi) + return *(*ComputeInstanceProfileInfo)(unsafe.Pointer(&pi)), errorString(ret) +} + +// GPUInstance.ComputeInstanceRemainingCapacity() +func (g *GPUInstance) ComputeInstanceRemainingCapacity(profileInfo *GPUInstanceProfileInfo) (count int, err error) { + ret := dl.lookupSymbol("nvmlGpuInstanceGetComputeInstanceRemainingCapacity") + if ret != C.NVML_SUCCESS { + return 0, errorString(ret) + } + + var c C.uint + ret = C.nvmlGpuInstanceGetComputeInstanceRemainingCapacity(g.handle, C.uint(profileInfo.ID), &c) + return int(c), errorString(ret) +} + +// GPUInstance.CreateComputeInstance() +func (g *GPUInstance) CreateComputeInstance(profileInfo *ComputeInstanceProfileInfo) (computeInstance ComputeInstance, err error) { + ret := dl.lookupSymbol("nvmlGpuInstanceCreateComputeInstance") + if ret != C.NVML_SUCCESS { + return ComputeInstance{}, errorString(ret) + } + + var ci C.nvmlComputeInstance_t + ret = C.nvmlGpuInstanceCreateComputeInstance(g.handle, C.uint(profileInfo.ID), &ci) + return ComputeInstance{ci, *g}, errorString(ret) +} + +// ComputeInstance.Destroy() +func (c *ComputeInstance) Destroy() (err error) { + ret := dl.lookupSymbol("nvmlComputeInstanceDestroy") + if ret != C.NVML_SUCCESS { + return errorString(ret) + } + + ret = C.nvmlComputeInstanceDestroy(c.handle) + return errorString(ret) +} + +// GPUInstance.GetComputeInstances() +func (g *GPUInstance) GetComputeInstances(profileInfo *ComputeInstanceProfileInfo) (computeInstances []ComputeInstance, err error) { + ret := dl.lookupSymbol("nvmlGpuInstanceGetComputeInstances") + if ret != C.NVML_SUCCESS { + return nil, errorString(ret) + } + + cis := make([]C.nvmlComputeInstance_t, profileInfo.InstanceCount) + var c C.uint + ret = C.nvmlGpuInstanceGetComputeInstances(g.handle, C.uint(profileInfo.ID), &cis[0], &c) + for i := 0; i < int(c); i++ { + computeInstances = append(computeInstances, ComputeInstance{cis[i], *g}) + } + return computeInstances, errorString(ret) +} + +// GPUInstance.GetComputeInstanceByID() +func (g *GPUInstance) GetComputeInstanceByID(id int) (computeInstance ComputeInstance, err error) { + ret := dl.lookupSymbol("nvmlGpuInstanceGetComputeInstanceById") + if ret != C.NVML_SUCCESS { + return ComputeInstance{}, errorString(ret) + } + + var ci C.nvmlComputeInstance_t + ret = C.nvmlGpuInstanceGetComputeInstanceById(g.handle, C.uint(id), &ci) + return ComputeInstance{ci, *g}, errorString(ret) +} + +// ComputeInstance.GetInfo() +func (c *ComputeInstance) GetInfo() (info ComputeInstanceInfo, err error) { + ret := dl.lookupSymbol("nvmlComputeInstanceGetInfo") + if ret != C.NVML_SUCCESS { + return ComputeInstanceInfo{}, errorString(ret) + } + + var cii C.nvmlComputeInstanceInfo_t + ret = C.nvmlComputeInstanceGetInfo(c.handle, &cii) + info = *(*ComputeInstanceInfo)(unsafe.Pointer(&cii)) + info.Device = c.gpuInstance.device + info.GPUInstance = c.gpuInstance + return info, errorString(ret) +} + +// Device.IsMigDeviceHandle() +func (d *Device) IsMigDeviceHandle() (isMigDevice bool, err error) { + ret := dl.lookupSymbol("nvmlDeviceIsMigDeviceHandle") + if ret != C.NVML_SUCCESS { + return false, errorString(ret) + } + + var is C.uint + ret = C.nvmlDeviceIsMigDeviceHandle(d.handle.dev, &is) + return (is != 0), errorString(ret) +} + +// Device.GetGPUInstanceId() +func (d *Device) GetGPUInstanceId() (id int, err error) { + ret := dl.lookupSymbol("nvmlDeviceGetGpuInstanceId") + if ret != C.NVML_SUCCESS { + return 0, errorString(ret) + } + + var gi C.uint + ret = C.nvmlDeviceGetGpuInstanceId(d.handle.dev, &gi) + return int(gi), errorString(ret) +} + +// Device.GetComputeInstanceId() +func (d *Device) GetComputeInstanceId() (id int, err error) { + ret := dl.lookupSymbol("nvmlDeviceGetComputeInstanceId") + if ret != C.NVML_SUCCESS { + return 0, errorString(ret) + } + + var ci C.uint + ret = C.nvmlDeviceGetComputeInstanceId(d.handle.dev, &ci) + return int(ci), errorString(ret) +} + +// Device.GetMaxMigDeviceCount() +func (d *Device) GetMaxMigDeviceCount() (count int, err error) { + ret := dl.lookupSymbol("nvmlDeviceGetMaxMigDeviceCount") + if ret != C.NVML_SUCCESS { + return 0, errorString(ret) + } + + var c C.uint + ret = C.nvmlDeviceGetMaxMigDeviceCount(d.handle.dev, &c) + return int(c), errorString(ret) +} + +// Device.GetMigDeviceHandleByIndex() +func (d *Device) GetMigDeviceHandleByIndex(index int) (migDevice *Device, err error) { + ret := dl.lookupSymbol("nvmlDeviceGetMigDeviceHandleByIndex") + if ret != C.NVML_SUCCESS { + return nil, errorString(ret) + } + + var m C.nvmlDevice_t + ret = C.nvmlDeviceGetMigDeviceHandleByIndex(d.handle.dev, C.uint(index), &m) + return &Device{handle: handle{m}}, errorString(ret) +} + +// Device.GetMigDeviceHandleByIndex() +func (d *Device) GetDeviceHandleFromMigDeviceHandle() (device *Device, err error) { + ret := dl.lookupSymbol("nvmlDeviceGetDeviceHandleFromMigDeviceHandle") + if ret != C.NVML_SUCCESS { + return nil, errorString(ret) + } + + var parent C.nvmlDevice_t + ret = C.nvmlDeviceGetDeviceHandleFromMigDeviceHandle(d.handle.dev, &parent) + return &Device{handle: handle{parent}}, errorString(ret) +} + +// Device.GetAttributes() +func (d *Device) GetAttributes() (attr DeviceAttributes, err error) { + ret := dl.lookupSymbol("nvmlDeviceGetAttributes") + if ret != C.NVML_SUCCESS { + return DeviceAttributes{}, errorString(ret) + } + + var a C.nvmlDeviceAttributes_t + ret = C.nvmlDeviceGetAttributes(d.handle.dev, &a) + return *(*DeviceAttributes)(unsafe.Pointer(&a)), errorString(ret) +} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go index 2a8bd22a..c7ab5a10 100644 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go +++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go @@ -10,6 +10,7 @@ import ( "errors" "fmt" "io/ioutil" + "runtime" "strconv" "strings" ) @@ -144,6 +145,12 @@ const ( FourNVLINKLinks FiveNVLINKLinks SixNVLINKLinks + SevenNVLINKLinks + EightNVLINKLinks + NineNVLINKLinks + TenNVLINKLinks + ElevenNVLINKLinks + TwelveNVLINKLinks ) type P2PLink struct { @@ -177,6 +184,18 @@ func (t P2PLinkType) String() string { return "Five NVLinks" case SixNVLINKLinks: return "Six NVLinks" + case SevenNVLINKLinks: + return "Seven NVLinks" + case EightNVLINKLinks: + return "Eight NVLinks" + case NineNVLINKLinks: + return "Nine NVLinks" + case TenNVLINKLinks: + return "Ten NVLinks" + case ElevenNVLINKLinks: + return "Eleven NVLinks" + case TwelveNVLINKLinks: + return "Twelve NVLinks" case P2PLinkUnknown: } return "N/A" @@ -255,6 +274,7 @@ type ProcessInfo struct { type DeviceStatus struct { Power *uint + FanSpeed *uint Temperature *uint Utilization UtilizationInfo Memory MemoryInfo @@ -334,6 +354,30 @@ func NewDevice(idx uint) (device *Device, err error) { h, err := deviceGetHandleByIndex(idx) assert(err) + + device, err = newDevice(h) + assert(err) + + return device, err +} + +func NewDeviceByUUID(uuid string) (device *Device, err error) { + defer func() { + if r := recover(); r != nil { + err = r.(error) + } + }() + + h, err := deviceGetHandleByUUID(uuid) + assert(err) + + device, err = newDevice(h) + assert(err) + + return device, err +} + +func newDevice(h handle) (device *Device, err error) { model, err := h.deviceGetName() assert(err) uuid, err := h.deviceGetUUID() @@ -357,10 +401,17 @@ func NewDevice(idx uint) (device *Device, err error) { cccMajor, cccMinor, err := h.deviceGetCudaComputeCapability() assert(err) - if minor == nil || busid == nil || uuid == nil { - return nil, ErrUnsupportedGPU + var path string + if runtime.GOOS == "windows" { + if busid == nil || uuid == nil { + return nil, ErrUnsupportedGPU + } + } else { + if minor == nil || busid == nil || uuid == nil { + return nil, ErrUnsupportedGPU + } + path = fmt.Sprintf("/dev/nvidia%d", *minor) } - path := fmt.Sprintf("/dev/nvidia%d", *minor) node, err := numaNode(*busid) assert(err) @@ -404,6 +455,30 @@ func NewDeviceLite(idx uint) (device *Device, err error) { h, err := deviceGetHandleByIndex(idx) assert(err) + + device, err = newDeviceLite(h) + assert(err) + + return device, err +} + +func NewDeviceLiteByUUID(uuid string) (device *Device, err error) { + defer func() { + if r := recover(); r != nil { + err = r.(error) + } + }() + + h, err := deviceGetHandleByUUID(uuid) + assert(err) + + device, err = newDeviceLite(h) + assert(err) + + return device, err +} + +func newDeviceLite(h handle) (device *Device, err error) { uuid, err := h.deviceGetUUID() assert(err) minor, err := h.deviceGetMinorNumber() @@ -439,6 +514,8 @@ func (d *Device) Status() (status *DeviceStatus, err error) { power, err := d.deviceGetPowerUsage() assert(err) + fanSpeed, err := d.deviceGetFanSpeed() + assert(err) temp, err := d.deviceGetTemperature() assert(err) ugpu, umem, err := d.deviceGetUtilizationRates() @@ -466,7 +543,8 @@ func (d *Device) Status() (status *DeviceStatus, err error) { status = &DeviceStatus{ Power: power, - Temperature: temp, // °C + FanSpeed: fanSpeed, // % + Temperature: temp, // °C Utilization: UtilizationInfo{ GPU: ugpu, // % Memory: umem, // % @@ -558,6 +636,18 @@ func GetNVLink(dev1, dev2 *Device) (link P2PLinkType, err error) { nvlink = FiveNVLINKLinks case FiveNVLINKLinks: nvlink = SixNVLINKLinks + case SixNVLINKLinks: + nvlink = SevenNVLINKLinks + case SevenNVLINKLinks: + nvlink = EightNVLINKLinks + case EightNVLINKLinks: + nvlink = NineNVLINKLinks + case NineNVLINKLinks: + nvlink = TenNVLINKLinks + case TenNVLINKLinks: + nvlink = ElevenNVLINKLinks + case ElevenNVLINKLinks: + nvlink = TwelveNVLINKLinks } } } @@ -602,3 +692,131 @@ func (d *Device) GetDeviceMode() (mode *DeviceMode, err error) { } return } + +func (d *Device) IsMigEnabled() (bool, error) { + return d.handle.isMigEnabled() +} + +func (d *Device) GetMigDevices() ([]*Device, error) { + handles, err := d.handle.getMigDevices() + if err != nil { + return nil, err + } + + var devices []*Device + for _, h := range handles { + uuid, err := h.deviceGetUUID() + if err != nil { + return nil, err + } + + model, err := d.deviceGetName() + if err != nil { + return nil, err + } + + totalMem, _, err := h.deviceGetMemoryInfo() + if err != nil { + return nil, err + } + + device := &Device{ + handle: h, + UUID: *uuid, + Model: model, + Memory: totalMem, + CPUAffinity: d.CPUAffinity, + Path: d.Path, + } + + devices = append(devices, device) + } + + return devices, nil +} + +func (d *Device) GetMigParentDevice() (*Device, error) { + parent, err := d.handle.deviceGetDeviceHandleFromMigDeviceHandle() + if err != nil { + return nil, err + } + + index, err := parent.deviceGetIndex() + if err != nil { + return nil, err + } + + return NewDevice(*index) +} + +func (d *Device) GetMigParentDeviceLite() (*Device, error) { + parent, err := d.handle.deviceGetDeviceHandleFromMigDeviceHandle() + if err != nil { + return nil, err + } + + index, err := parent.deviceGetIndex() + if err != nil { + return nil, err + } + + return NewDeviceLite(*index) +} + +func ParseMigDeviceUUID(uuid string) (string, uint, uint, error) { + migHandle, err := deviceGetHandleByUUID(uuid) + if err == nil { + return getMIGDeviceInfo(migHandle) + } + return parseMigDeviceUUID(uuid) +} + +func getMIGDeviceInfo(migHandle handle) (string, uint, uint, error) { + parentHandle, err := migHandle.deviceGetDeviceHandleFromMigDeviceHandle() + if err != nil { + return "", 0, 0, err + } + + parentUUID, err := parentHandle.deviceGetUUID() + if err != nil { + return "", 0, 0, err + } + + migDevice := Device{handle: migHandle} + + gi, err := migDevice.GetGPUInstanceId() + if err != nil { + return "", 0, 0, err + } + + ci, err := migDevice.GetComputeInstanceId() + if err != nil { + return "", 0, 0, err + } + + return *parentUUID, uint(gi), uint(ci), err +} + +func parseMigDeviceUUID(mig string) (string, uint, uint, error) { + tokens := strings.SplitN(mig, "-", 2) + if len(tokens) != 2 || tokens[0] != "MIG" { + return "", 0, 0, fmt.Errorf("Unable to parse UUID as MIG device") + } + + tokens = strings.SplitN(tokens[1], "/", 3) + if len(tokens) != 3 || !strings.HasPrefix(tokens[0], "GPU-") { + return "", 0, 0, fmt.Errorf("Unable to parse UUID as MIG device") + } + + gi, err := strconv.Atoi(tokens[1]) + if err != nil { + return "", 0, 0, fmt.Errorf("Unable to parse UUID as MIG device") + } + + ci, err := strconv.Atoi(tokens[2]) + if err != nil { + return "", 0, 0, fmt.Errorf("Unable to parse UUID as MIG device") + } + + return tokens[0], uint(gi), uint(ci), nil +} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h index bda23ccb..46e90d1d 100644 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h +++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h @@ -1,42 +1,42 @@ /* - * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * Copyright 1993-2020 NVIDIA Corporation. All rights reserved. * - * NOTICE TO USER: + * NOTICE TO USER: * - * This source code is subject to NVIDIA ownership rights under U.S. and - * international Copyright laws. Users and possessors of this source code - * are hereby granted a nonexclusive, royalty-free license to use this code + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. Users and possessors of this source code + * are hereby granted a nonexclusive, royalty-free license to use this code * in individual and commercial software. * - * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE - * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR - * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH - * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, - * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS - * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE - * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE - * OR PERFORMANCE OF THIS SOURCE CODE. - * - * U.S. Government End Users. This source code is a "commercial item" as - * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of - * "commercial computer software" and "commercial computer software - * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) - * and is provided to the U.S. Government only as a commercial end item. - * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through - * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the - * source code with only those rights set forth herein. - * - * Any use of this source code in individual and commercial software must + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + * + * Any use of this source code in individual and commercial software must * include, in the user documentation and internal comments to the code, * the above Disclaimer and U.S. Government End Users Notice. */ -/* +/* NVML API Reference -The NVIDIA Management Library (NVML) is a C-based programmatic interface for monitoring and +The NVIDIA Management Library (NVML) is a C-based programmatic interface for monitoring and managing various states within NVIDIA Tesla &tm; GPUs. It is intended to be a platform for building 3rd party applications, and is also the underlying library for the NVIDIA-supported nvidia-smi tool. NVML is thread-safe so it is safe to make simultaneous NVML calls from multiple threads. @@ -58,7 +58,7 @@ Supported products: - All Geforce products, starting with the Fermi architecture The NVML library can be found at \%ProgramW6432\%\\"NVIDIA Corporation"\\NVSMI\\ on Windows. It is -not be added to the system path by default. To dynamically link to NVML, add this path to the PATH +not be added to the system path by default. To dynamically link to NVML, add this path to the PATH environmental variable. To dynamically load NVML, call LoadLibrary with this path. On Linux the NVML library will be found on the standard library path. For 64 bit Linux, both the 32 bit @@ -95,16 +95,25 @@ extern "C" { /** * NVML API versioning support */ -#define NVML_API_VERSION 10 -#define NVML_API_VERSION_STR "10" -#define nvmlInit nvmlInit_v2 -#define nvmlDeviceGetPciInfo nvmlDeviceGetPciInfo_v3 -#define nvmlDeviceGetCount nvmlDeviceGetCount_v2 -#define nvmlDeviceGetHandleByIndex nvmlDeviceGetHandleByIndex_v2 -#define nvmlDeviceGetHandleByPciBusId nvmlDeviceGetHandleByPciBusId_v2 -#define nvmlDeviceGetNvLinkRemotePciInfo nvmlDeviceGetNvLinkRemotePciInfo_v2 -#define nvmlDeviceRemoveGpu nvmlDeviceRemoveGpu_v2 -#define nvmlDeviceGetGridLicensableFeatures nvmlDeviceGetGridLicensableFeatures_v2 +#define NVML_API_VERSION 11 +#define NVML_API_VERSION_STR "11" +/** + * Defining NVML_NO_UNVERSIONED_FUNC_DEFS will disable "auto upgrading" of APIs. + * e.g. the user will have to call nvmlInit_v2 instead of nvmlInit. Enable this + * guard if you need to support older versions of the API + */ +#ifndef NVML_NO_UNVERSIONED_FUNC_DEFS + #define nvmlInit nvmlInit_v2 + #define nvmlDeviceGetPciInfo nvmlDeviceGetPciInfo_v3 + #define nvmlDeviceGetCount nvmlDeviceGetCount_v2 + #define nvmlDeviceGetHandleByIndex nvmlDeviceGetHandleByIndex_v2 + #define nvmlDeviceGetHandleByPciBusId nvmlDeviceGetHandleByPciBusId_v2 + #define nvmlDeviceGetNvLinkRemotePciInfo nvmlDeviceGetNvLinkRemotePciInfo_v2 + #define nvmlDeviceRemoveGpu nvmlDeviceRemoveGpu_v2 + #define nvmlDeviceGetGridLicensableFeatures nvmlDeviceGetGridLicensableFeatures_v3 + #define nvmlEventSetWait nvmlEventSetWait_v2 + #define nvmlDeviceGetAttributes nvmlDeviceGetAttributes_v2 +#endif // #ifndef NVML_NO_UNVERSIONED_FUNC_DEFS /***************************************************************************************************/ /** @defgroup nvmlDeviceStructs Device Structs @@ -172,7 +181,7 @@ typedef struct nvmlPciInfo_st * @deprecated Different GPU families can have different memory error counters * See \ref nvmlDeviceGetMemoryErrorCounter */ -typedef struct nvmlEccErrorCounts_st +typedef struct nvmlEccErrorCounts_st { unsigned long long l1Cache; //!< L1 cache errors unsigned long long l2Cache; //!< L2 cache errors @@ -180,20 +189,20 @@ typedef struct nvmlEccErrorCounts_st unsigned long long registerFile; //!< Register file errors } nvmlEccErrorCounts_t; -/** +/** * Utilization information for a device. * Each sample period may be between 1 second and 1/6 second, depending on the product being queried. */ -typedef struct nvmlUtilization_st +typedef struct nvmlUtilization_st { unsigned int gpu; //!< Percent of time over the past sample period during which one or more kernels was executing on the GPU unsigned int memory; //!< Percent of time over the past sample period during which global (device) memory was being read or written } nvmlUtilization_t; -/** +/** * Memory allocation information for a device. */ -typedef struct nvmlMemory_st +typedef struct nvmlMemory_st { unsigned long long total; //!< Total installed FB memory (in bytes) unsigned long long free; //!< Unallocated FB memory (in bytes) @@ -221,19 +230,47 @@ typedef struct nvmlProcessInfo_st //! because Windows KMD manages all the memory and not the NVIDIA driver } nvmlProcessInfo_t; +typedef struct nvmlDeviceAttributes_st +{ + unsigned int multiprocessorCount; //!< Streaming Multiprocessor count + unsigned int sharedCopyEngineCount; //!< Shared Copy Engine count + unsigned int sharedDecoderCount; //!< Shared Decoder Engine count + unsigned int sharedEncoderCount; //!< Shared Encoder Engine count + unsigned int sharedJpegCount; //!< Shared JPEG Engine count + unsigned int sharedOfaCount; //!< Shared OFA Engine count + unsigned int gpuInstanceSliceCount; //!< GPU instance slice count + unsigned int computeInstanceSliceCount; //!< Compute instance slice count + unsigned long long memorySizeMB; //!< Device memory size (in MiB) +} nvmlDeviceAttributes_t; + +/** + * Possible values that classify the remap availability for each bank. The max + * field will contain the number of banks that have maximum remap availability + * (all reserved rows are available). None means that there are no reserved + * rows available. + */ +typedef struct nvmlRowRemapperHistogramValues_st +{ + unsigned int max; + unsigned int high; + unsigned int partial; + unsigned int low; + unsigned int none; +} nvmlRowRemapperHistogramValues_t; + /** * Enum to represent type of bridge chip */ typedef enum nvmlBridgeChipType_enum { NVML_BRIDGE_CHIP_PLX = 0, - NVML_BRIDGE_CHIP_BRO4 = 1 + NVML_BRIDGE_CHIP_BRO4 = 1 }nvmlBridgeChipType_t; /** - * Maximum number of NvLink links supported + * Maximum number of NvLink links supported */ -#define NVML_NVLINK_MAX_LINKS 6 +#define NVML_NVLINK_MAX_LINKS 12 /** * Enum to represent the NvLink utilization counter packet units @@ -243,7 +280,7 @@ typedef enum nvmlNvLinkUtilizationCountUnits_enum NVML_NVLINK_COUNTER_UNIT_CYCLES = 0, // count by cycles NVML_NVLINK_COUNTER_UNIT_PACKETS = 1, // count by packets NVML_NVLINK_COUNTER_UNIT_BYTES = 2, // count by bytes - + NVML_NVLINK_COUNTER_UNIT_RESERVED = 3, // count reserved for internal use // this must be last NVML_NVLINK_COUNTER_UNIT_COUNT } nvmlNvLinkUtilizationCountUnits_t; @@ -253,7 +290,7 @@ typedef enum nvmlNvLinkUtilizationCountUnits_enum * ** this is ONLY applicable with the units as packets or bytes * ** as specified in \a nvmlNvLinkUtilizationCountUnits_t * ** all packet filter descriptions are target GPU centric - * ** these can be "OR'd" together + * ** these can be "OR'd" together */ typedef enum nvmlNvLinkUtilizationCountPktTypes_enum { @@ -268,7 +305,7 @@ typedef enum nvmlNvLinkUtilizationCountPktTypes_enum NVML_NVLINK_COUNTER_PKTFILTER_ALL = 0xFF // all packets } nvmlNvLinkUtilizationCountPktTypes_t; -/** +/** * Struct to define the NVLINK counter controls */ typedef struct nvmlNvLinkUtilizationControl_st @@ -359,12 +396,12 @@ typedef enum nvmlGpuP2PCapsIndex_enum */ typedef struct nvmlBridgeChipInfo_st { - nvmlBridgeChipType_t type; //!< Type of Bridge Chip + nvmlBridgeChipType_t type; //!< Type of Bridge Chip unsigned int fwVersion; //!< Firmware Version. 0=Version is unavailable }nvmlBridgeChipInfo_t; /** - * This structure stores the complete Hierarchy of the Bridge Chip within the board. The immediate + * This structure stores the complete Hierarchy of the Bridge Chip within the board. The immediate * bridge is stored at index 0 of bridgeInfoList, parent to immediate bridge is at index 1 and so forth. */ typedef struct nvmlBridgeChipHierarchy_st @@ -382,12 +419,12 @@ typedef enum nvmlSamplingType_enum NVML_GPU_UTILIZATION_SAMPLES = 1, //!< To represent percent of time during which one or more kernels was executing on the GPU NVML_MEMORY_UTILIZATION_SAMPLES = 2, //!< To represent percent of time during which global (device) memory was being read or written NVML_ENC_UTILIZATION_SAMPLES = 3, //!< To represent percent of time during which NVENC remains busy - NVML_DEC_UTILIZATION_SAMPLES = 4, //!< To represent percent of time during which NVDEC remains busy + NVML_DEC_UTILIZATION_SAMPLES = 4, //!< To represent percent of time during which NVDEC remains busy NVML_PROCESSOR_CLK_SAMPLES = 5, //!< To represent processor clock samples NVML_MEMORY_CLK_SAMPLES = 6, //!< To represent memory clock samples - + // Keep this last - NVML_SAMPLINGTYPE_COUNT + NVML_SAMPLINGTYPE_COUNT }nvmlSamplingType_t; /** @@ -397,7 +434,7 @@ typedef enum nvmlPcieUtilCounter_enum { NVML_PCIE_UTIL_TX_BYTES = 0, // 1KB granularity NVML_PCIE_UTIL_RX_BYTES = 1, // 1KB granularity - + // Keep this last NVML_PCIE_UTIL_COUNT } nvmlPcieUtilCounter_t; @@ -405,7 +442,7 @@ typedef enum nvmlPcieUtilCounter_enum /** * Represents the type for sample value returned */ -typedef enum nvmlValueType_enum +typedef enum nvmlValueType_enum { NVML_VALUE_TYPE_DOUBLE = 0, NVML_VALUE_TYPE_UNSIGNED_INT = 1, @@ -433,14 +470,14 @@ typedef union nvmlValue_st /** * Information for Sample */ -typedef struct nvmlSample_st +typedef struct nvmlSample_st { unsigned long long timeStamp; //!< CPU Timestamp in microseconds nvmlValue_t sampleValue; //!< Sample Value }nvmlSample_t; /** - * Represents type of perf policy for which violation times can be queried + * Represents type of perf policy for which violation times can be queried */ typedef enum nvmlPerfPolicyType_enum { @@ -475,26 +512,26 @@ typedef struct nvmlViolationTime_st */ /***************************************************************************************************/ -/** - * Generic enable/disable enum. +/** + * Generic enable/disable enum. */ -typedef enum nvmlEnableState_enum +typedef enum nvmlEnableState_enum { - NVML_FEATURE_DISABLED = 0, //!< Feature disabled + NVML_FEATURE_DISABLED = 0, //!< Feature disabled NVML_FEATURE_ENABLED = 1 //!< Feature enabled } nvmlEnableState_t; //! Generic flag used to specify the default behavior of some functions. See description of particular functions for details. -#define nvmlFlagDefault 0x00 +#define nvmlFlagDefault 0x00 //! Generic flag used to force some behavior. See description of particular functions for details. -#define nvmlFlagForce 0x01 +#define nvmlFlagForce 0x01 /** * * The Brand of the GPU * */ typedef enum nvmlBrandType_enum { - NVML_BRAND_UNKNOWN = 0, + NVML_BRAND_UNKNOWN = 0, NVML_BRAND_QUADRO = 1, NVML_BRAND_TESLA = 2, NVML_BRAND_NVS = 3, @@ -520,36 +557,36 @@ typedef enum nvmlTemperatureThresholds_enum NVML_TEMPERATURE_THRESHOLD_COUNT } nvmlTemperatureThresholds_t; -/** - * Temperature sensors. +/** + * Temperature sensors. */ -typedef enum nvmlTemperatureSensors_enum +typedef enum nvmlTemperatureSensors_enum { NVML_TEMPERATURE_GPU = 0, //!< Temperature sensor for the GPU die - + // Keep this last NVML_TEMPERATURE_COUNT } nvmlTemperatureSensors_t; -/** - * Compute mode. +/** + * Compute mode. * * NVML_COMPUTEMODE_EXCLUSIVE_PROCESS was added in CUDA 4.0. - * Earlier CUDA versions supported a single exclusive mode, + * Earlier CUDA versions supported a single exclusive mode, * which is equivalent to NVML_COMPUTEMODE_EXCLUSIVE_THREAD in CUDA 4.0 and beyond. */ -typedef enum nvmlComputeMode_enum +typedef enum nvmlComputeMode_enum { NVML_COMPUTEMODE_DEFAULT = 0, //!< Default compute mode -- multiple contexts per device NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1, //!< Support Removed NVML_COMPUTEMODE_PROHIBITED = 2, //!< Compute-prohibited mode -- no contexts per device NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, //!< Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time - + // Keep this last NVML_COMPUTEMODE_COUNT } nvmlComputeMode_t; -/** +/** * ECC bit types. * * @deprecated See \ref nvmlMemoryErrorType_t for a more flexible type @@ -577,54 +614,54 @@ typedef enum nvmlMemoryErrorType_enum { /** * A memory error that was corrected - * + * * For ECC errors, these are single bit errors * For Texture memory, these are errors fixed by resend */ NVML_MEMORY_ERROR_TYPE_CORRECTED = 0, /** * A memory error that was not corrected - * + * * For ECC errors, these are double bit errors * For Texture memory, these are errors where the resend fails */ NVML_MEMORY_ERROR_TYPE_UNCORRECTED = 1, - - + + // Keep this last NVML_MEMORY_ERROR_TYPE_COUNT //!< Count of memory error types } nvmlMemoryErrorType_t; -/** - * ECC counter types. +/** + * ECC counter types. * * Note: Volatile counts are reset each time the driver loads. On Windows this is once per boot. On Linux this can be more frequent. - * On Linux the driver unloads when no active clients exist. If persistence mode is enabled or there is always a driver + * On Linux the driver unloads when no active clients exist. If persistence mode is enabled or there is always a driver * client active (e.g. X11), then Linux also sees per-boot behavior. If not, volatile counts are reset each time a compute app * is run. */ -typedef enum nvmlEccCounterType_enum +typedef enum nvmlEccCounterType_enum { NVML_VOLATILE_ECC = 0, //!< Volatile counts are reset each time the driver loads. NVML_AGGREGATE_ECC = 1, //!< Aggregate counts persist across reboots (i.e. for the lifetime of the device) - + // Keep this last NVML_ECC_COUNTER_TYPE_COUNT //!< Count of memory counter types } nvmlEccCounterType_t; -/** - * Clock types. - * +/** + * Clock types. + * * All speeds are in Mhz. */ -typedef enum nvmlClockType_enum +typedef enum nvmlClockType_enum { NVML_CLOCK_GRAPHICS = 0, //!< Graphics clock domain NVML_CLOCK_SM = 1, //!< SM clock domain NVML_CLOCK_MEM = 2, //!< Memory clock domain NVML_CLOCK_VIDEO = 3, //!< Video encoder/decoder clock domain - + // Keep this last NVML_CLOCK_COUNT //!< Count of clock types } nvmlClockType_t; @@ -644,12 +681,12 @@ typedef enum nvmlClockId_enum NVML_CLOCK_ID_COUNT //!< Count of Clock Ids. } nvmlClockId_t; -/** - * Driver models. +/** + * Driver models. * * Windows only. */ -typedef enum nvmlDriverModel_enum +typedef enum nvmlDriverModel_enum { NVML_DRIVER_WDDM = 0, //!< WDDM driver model -- GPU treated as a display device NVML_DRIVER_WDM = 1 //!< WDM (TCC) model (recommended) -- GPU treated as a generic device @@ -658,10 +695,10 @@ typedef enum nvmlDriverModel_enum /** * Allowed PStates. */ -typedef enum nvmlPStates_enum +typedef enum nvmlPStates_enum { NVML_PSTATE_0 = 0, //!< Performance state 0 -- Maximum Performance - NVML_PSTATE_1 = 1, //!< Performance state 1 + NVML_PSTATE_1 = 1, //!< Performance state 1 NVML_PSTATE_2 = 2, //!< Performance state 2 NVML_PSTATE_3 = 3, //!< Performance state 3 NVML_PSTATE_4 = 4, //!< Performance state 4 @@ -675,7 +712,7 @@ typedef enum nvmlPStates_enum NVML_PSTATE_12 = 12, //!< Performance state 12 NVML_PSTATE_13 = 13, //!< Performance state 13 NVML_PSTATE_14 = 14, //!< Performance state 14 - NVML_PSTATE_15 = 15, //!< Performance state 15 -- Minimum Performance + NVML_PSTATE_15 = 15, //!< Performance state 15 -- Minimum Performance NVML_PSTATE_UNKNOWN = 32 //!< Unknown performance state } nvmlPstates_t; @@ -697,10 +734,10 @@ typedef enum nvmlGom_enum //!< high bandwidth double precision } nvmlGpuOperationMode_t; -/** +/** * Available infoROM objects. */ -typedef enum nvmlInforomObject_enum +typedef enum nvmlInforomObject_enum { NVML_INFOROM_OEM = 0, //!< An object defined by OEM NVML_INFOROM_ECC = 1, //!< The ECC object determining the level of ECC support @@ -710,36 +747,37 @@ typedef enum nvmlInforomObject_enum NVML_INFOROM_COUNT //!< This counts the number of infoROM objects the driver knows about } nvmlInforomObject_t; -/** - * Return values for NVML API calls. +/** + * Return values for NVML API calls. */ -typedef enum nvmlReturn_enum +typedef enum nvmlReturn_enum { // cppcheck-suppress * - NVML_SUCCESS = 0, //!< The operation was successful - NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit() - NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid - NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device - NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation - NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting - NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful - NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough - NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached - NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded - NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed - NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU - NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded - NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function - NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted - NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible - NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again - NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups - NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, //!< RM detects a driver/library version mismatch - NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use - NVML_ERROR_MEMORY = 20, //!< Insufficient memory - NVML_ERROR_NO_DATA = 21, //!usedGpuMemory is not supported - - unsigned long long time; //!< Amount of time in ms during which the compute context was active. The time is reported as 0 if + + unsigned long long time; //!< Amount of time in ms during which the compute context was active. The time is reported as 0 if //!< the process is not terminated - + unsigned long long startTime; //!< CPU Timestamp in usec representing start time for the process - + unsigned int isRunning; //!< Flag to represent if the process is running (1 for running, 0 for terminated) unsigned int reserved[5]; //!< Reserved for future use @@ -1293,176 +1597,45 @@ typedef struct nvmlAccountingStats_st { /** @} */ /***************************************************************************************************/ -/** @defgroup nvmlVgpuConstants Vgpu Constants +/** @defgroup nvmlEncoderStructs Encoder Structs * @{ */ /***************************************************************************************************/ /** - * Buffer size guaranteed to be large enough for \ref nvmlVgpuTypeGetLicense + * Represents type of encoder for capacity can be queried */ -#define NVML_GRID_LICENSE_BUFFER_SIZE 128 - -#define NVML_VGPU_NAME_BUFFER_SIZE 64 - -#define NVML_GRID_LICENSE_FEATURE_MAX_COUNT 3 +typedef enum nvmlEncoderQueryType_enum +{ + NVML_ENCODER_QUERY_H264 = 0, //!< H264 encoder + NVML_ENCODER_QUERY_HEVC = 1, //!< HEVC encoder +}nvmlEncoderType_t; -/*! - * Macros for pGPU's virtualization capabilities bitfield. +/** + * Structure to hold encoder session data */ -#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION 0:0 -#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_NO 0x0 -#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_YES 0x1 +typedef struct nvmlEncoderSessionInfo_st +{ + unsigned int sessionId; //!< Unique session ID + unsigned int pid; //!< Owning process ID + nvmlVgpuInstance_t vgpuInstance; //!< Owning vGPU instance ID (only valid on vGPU hosts, otherwise zero) + nvmlEncoderType_t codecType; //!< Video encoder type + unsigned int hResolution; //!< Current encode horizontal resolution + unsigned int vResolution; //!< Current encode vertical resolution + unsigned int averageFps; //!< Moving average encode frames per second + unsigned int averageLatency; //!< Moving average encode latency in microseconds +}nvmlEncoderSessionInfo_t; /** @} */ /***************************************************************************************************/ -/** @defgroup nvmlVgpuEnum Vgpu Enum - * @{ - */ +/** @defgroup nvmlFBCStructs Frame Buffer Capture Structures +* @{ +*/ /***************************************************************************************************/ -/*! - * Types of VM identifiers - */ -typedef enum nvmlVgpuVmIdType { - NVML_VGPU_VM_ID_DOMAIN_ID = 0, //!< VM ID represents DOMAIN ID - NVML_VGPU_VM_ID_UUID = 1, //!< VM ID represents UUID -} nvmlVgpuVmIdType_t; - /** - * vGPU GUEST info state. - */ -typedef enum nvmlVgpuGuestInfoState_enum -{ - NVML_VGPU_INSTANCE_GUEST_INFO_STATE_UNINITIALIZED = 0, //!< Guest-dependent fields uninitialized - NVML_VGPU_INSTANCE_GUEST_INFO_STATE_INITIALIZED = 1, //!< Guest-dependent fields initialized -} nvmlVgpuGuestInfoState_t; - -/** - * GRID license feature code - */ -typedef enum { - NVML_GRID_LICENSE_FEATURE_CODE_VGPU = 1, //!< Virtual GPU - NVML_GRID_LICENSE_FEATURE_CODE_VWORKSTATION = 2 //!< Virtual Workstation -} nvmlGridLicenseFeatureCode_t; - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlVgpuStructs Vgpu Structs - * @{ - */ -/***************************************************************************************************/ - -typedef unsigned int nvmlVgpuTypeId_t; - -typedef unsigned int nvmlVgpuInstance_t; - -/** - * Structure to store Utilization Value and vgpuInstance - */ -typedef struct nvmlVgpuInstanceUtilizationSample_st -{ - nvmlVgpuInstance_t vgpuInstance; //!< vGPU Instance - unsigned long long timeStamp; //!< CPU Timestamp in microseconds - nvmlValue_t smUtil; //!< SM (3D/Compute) Util Value - nvmlValue_t memUtil; //!< Frame Buffer Memory Util Value - nvmlValue_t encUtil; //!< Encoder Util Value - nvmlValue_t decUtil; //!< Decoder Util Value -} nvmlVgpuInstanceUtilizationSample_t; - -/** - * Structure to store Utilization Value, vgpuInstance and subprocess information - */ -typedef struct nvmlVgpuProcessUtilizationSample_st -{ - nvmlVgpuInstance_t vgpuInstance; //!< vGPU Instance - unsigned int pid; //!< PID of process running within the vGPU VM - char processName[NVML_VGPU_NAME_BUFFER_SIZE]; //!< Name of process running within the vGPU VM - unsigned long long timeStamp; //!< CPU Timestamp in microseconds - unsigned int smUtil; //!< SM (3D/Compute) Util Value - unsigned int memUtil; //!< Frame Buffer Memory Util Value - unsigned int encUtil; //!< Encoder Util Value - unsigned int decUtil; //!< Decoder Util Value -} nvmlVgpuProcessUtilizationSample_t; - -/** - * Structure to store utilization value and process Id - */ -typedef struct nvmlProcessUtilizationSample_st -{ - unsigned int pid; //!< PID of process - unsigned long long timeStamp; //!< CPU Timestamp in microseconds - unsigned int smUtil; //!< SM (3D/Compute) Util Value - unsigned int memUtil; //!< Frame Buffer Memory Util Value - unsigned int encUtil; //!< Encoder Util Value - unsigned int decUtil; //!< Decoder Util Value -} nvmlProcessUtilizationSample_t; - -/** - * Structure containing GRID licensable feature information - */ -typedef struct nvmlGridLicensableFeature_st -{ - nvmlGridLicenseFeatureCode_t featureCode; //!< Licensed feature code - unsigned int featureState; //!< Non-zero if feature is currently licensed, otherwise zero - char licenseInfo[NVML_GRID_LICENSE_BUFFER_SIZE]; - char productName[NVML_GRID_LICENSE_BUFFER_SIZE]; -} nvmlGridLicensableFeature_t; - -/** - * Structure to store GRID licensable features - */ -typedef struct nvmlGridLicensableFeatures_st -{ - int isGridLicenseSupported; //!< Non-zero if GRID Software Licensing is supported on the system, otherwise zero - unsigned int licensableFeaturesCount; //!< Entries returned in \a gridLicensableFeatures array - nvmlGridLicensableFeature_t gridLicensableFeatures[NVML_GRID_LICENSE_FEATURE_MAX_COUNT]; //!< Array of GRID licensable features. -} nvmlGridLicensableFeatures_t; - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlEncoderStructs Encoder Structs - * @{ - */ -/***************************************************************************************************/ - -/** - * Represents type of encoder for capacity can be queried - */ -typedef enum nvmlEncoderQueryType_enum -{ - NVML_ENCODER_QUERY_H264 = 0, //!< H264 encoder - NVML_ENCODER_QUERY_HEVC = 1, //!< HEVC encoder -}nvmlEncoderType_t; - -/** - * Structure to hold encoder session data - */ -typedef struct nvmlEncoderSessionInfo_st -{ - unsigned int sessionId; //!< Unique session ID - unsigned int pid; //!< Owning process ID - nvmlVgpuInstance_t vgpuInstance; //!< Owning vGPU instance ID (only valid on vGPU hosts, otherwise zero) - nvmlEncoderType_t codecType; //!< Video encoder type - unsigned int hResolution; //!< Current encode horizontal resolution - unsigned int vResolution; //!< Current encode vertical resolution - unsigned int averageFps; //!< Moving average encode frames per second - unsigned int averageLatency; //!< Moving average encode latency in microseconds -}nvmlEncoderSessionInfo_t; - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlFBCStructs Frame Buffer Capture Structures -* @{ -*/ -/***************************************************************************************************/ - -/** - * Represents frame buffer capture session type + * Represents frame buffer capture session type */ typedef enum nvmlFBCSessionType_enum { @@ -1539,7 +1712,7 @@ typedef enum nvmlPcieLinkState_enum /***************************************************************************************************/ /** @defgroup nvmlInitializationAndCleanup Initialization and Cleanup * This chapter describes the methods that handle NVML initialization and cleanup. - * It is the user's responsibility to call \ref nvmlInit() before calling any other methods, and + * It is the user's responsibility to call \ref nvmlInit_v2() before calling any other methods, and * nvmlShutdown() once NVML is no longer being used. * @{ */ @@ -1555,38 +1728,38 @@ typedef enum nvmlPcieLinkState_enum * modifying the behaviour of nvmlInit(). * \note In NVML 5.319 new nvmlInit_v2 has replaced nvmlInit"_v1" (default in NVML 4.304 and older) that * did initialize all GPU devices in the system. - * + * * This allows NVML to communicate with a GPU * when other GPUs in the system are unstable or in a bad state. When using this API, GPUs are * discovered and initialized in nvmlDeviceGetHandleBy* functions instead. - * + * * \note To contrast nvmlInit_v2 with nvmlInit"_v1", NVML 4.304 nvmlInit"_v1" will fail when any detected GPU is in * a bad or unstable state. - * + * * For all products. * * This method, should be called once before invoking any other methods in the library. * A reference count of the number of initializations is maintained. Shutdown only occurs * when the reference count reaches zero. - * - * @return + * + * @return * - \ref NVML_SUCCESS if NVML has been properly initialized * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running * - \ref NVML_ERROR_NO_PERMISSION if NVML does not have permission to talk to the driver * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlInit(void); +nvmlReturn_t DECLDIR nvmlInit_v2(void); /** * nvmlInitWithFlags is a variant of nvmlInit(), that allows passing a set of boolean values * modifying the behaviour of nvmlInit(). - * Other than the "flags" parameter it is completely similar to \ref nvmlInit. - * + * Other than the "flags" parameter it is completely similar to \ref nvmlInit_v2. + * * For all products. * * @param flags behaviour modifier flags * - * @return + * @return * - \ref NVML_SUCCESS if NVML has been properly initialized * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running * - \ref NVML_ERROR_NO_PERMISSION if NVML does not have permission to talk to the driver @@ -1595,16 +1768,16 @@ nvmlReturn_t DECLDIR nvmlInit(void); nvmlReturn_t DECLDIR nvmlInitWithFlags(unsigned int flags); /** - * Shut down NVML by releasing all GPU resources previously allocated with \ref nvmlInit(). - * + * Shut down NVML by releasing all GPU resources previously allocated with \ref nvmlInit_v2(). + * * For all products. * - * This method should be called after NVML work is done, once for each call to \ref nvmlInit() + * This method should be called after NVML work is done, once for each call to \ref nvmlInit_v2() * A reference count of the number of initializations is maintained. Shutdown only occurs * when the reference count reaches zero. For backwards compatibility, no error is reported if * nvmlShutdown() is called more times than nvmlInit(). - * - * @return + * + * @return * - \ref NVML_SUCCESS if NVML has been properly shut down * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_UNKNOWN on any unexpected error @@ -1646,10 +1819,15 @@ const DECLDIR char* nvmlErrorString(nvmlReturn_t result); #define NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE 16 /** - * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetUUID + * Buffer size guaranteed to be large enough for storing GPU identifiers. */ #define NVML_DEVICE_UUID_BUFFER_SIZE 80 +/** + * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetUUID + */ +#define NVML_DEVICE_UUID_V2_BUFFER_SIZE 96 + /** * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetBoardPartNumber */ @@ -1666,10 +1844,15 @@ const DECLDIR char* nvmlErrorString(nvmlReturn_t result); #define NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE 80 /** - * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetName + * Buffer size guaranteed to be large enough for storing GPU device names. */ #define NVML_DEVICE_NAME_BUFFER_SIZE 64 +/** + * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetName + */ +#define NVML_DEVICE_NAME_V2_BUFFER_SIZE 96 + /** * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetSerial */ @@ -1692,7 +1875,7 @@ const DECLDIR char* nvmlErrorString(nvmlReturn_t result); /** * Retrieves the version of the system's graphics driver. - * + * * For all products. * * The version identifier is an alphanumeric string. It will not exceed 80 characters in length @@ -1701,17 +1884,17 @@ const DECLDIR char* nvmlErrorString(nvmlReturn_t result); * @param version Reference in which to return the version identifier * @param length The maximum allowed length of the string returned in \a version * - * @return + * @return * - \ref NVML_SUCCESS if \a version has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small */ nvmlReturn_t DECLDIR nvmlSystemGetDriverVersion(char *version, unsigned int length); /** * Retrieves the version of the NVML library. - * + * * For all products. * * The version identifier is an alphanumeric string. It will not exceed 80 characters in length @@ -1720,10 +1903,10 @@ nvmlReturn_t DECLDIR nvmlSystemGetDriverVersion(char *version, unsigned int leng * @param version Reference in which to return the version identifier * @param length The maximum allowed length of the string returned in \a version * - * @return + * @return * - \ref NVML_SUCCESS if \a version has been set * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small */ nvmlReturn_t DECLDIR nvmlSystemGetNVMLVersion(char *version, unsigned int length); @@ -1777,8 +1960,8 @@ nvmlReturn_t DECLDIR nvmlSystemGetCudaDriverVersion_v2(int *cudaDriverVersion); * @param pid The identifier of the process * @param name Reference in which to return the process name * @param length The maximum allowed length of the string returned in \a name - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a name has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a name is NULL or \a length is 0. @@ -1793,7 +1976,7 @@ nvmlReturn_t DECLDIR nvmlSystemGetProcessName(unsigned int pid, char *name, unsi /***************************************************************************************************/ /** @defgroup nvmlUnitQueries Unit Queries * This chapter describes that queries that NVML can perform against each unit. For S-class systems only. - * In each case the device is identified with an nvmlUnit_t handle. This handle is obtained by + * In each case the device is identified with an nvmlUnit_t handle. This handle is obtained by * calling \ref nvmlUnitGetHandleByIndex(). * @{ */ @@ -1805,8 +1988,8 @@ nvmlReturn_t DECLDIR nvmlSystemGetProcessName(unsigned int pid, char *name, unsi * For S-class products. * * @param unitCount Reference in which to return the number of units - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a unitCount has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unitCount is NULL @@ -1819,15 +2002,15 @@ nvmlReturn_t DECLDIR nvmlUnitGetCount(unsigned int *unitCount); * * For S-class products. * - * Valid indices are derived from the \a unitCount returned by \ref nvmlUnitGetCount(). + * Valid indices are derived from the \a unitCount returned by \ref nvmlUnitGetCount(). * For example, if \a unitCount is 2 the valid indices are 0 and 1, corresponding to UNIT 0 and UNIT 1. * * The order in which NVML enumerates units has no guarantees of consistency between reboots. * * @param index The index of the target unit, >= 0 and < \a unitCount * @param unit Reference in which to return the unit handle - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a unit has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a unit is NULL @@ -1844,8 +2027,8 @@ nvmlReturn_t DECLDIR nvmlUnitGetHandleByIndex(unsigned int index, nvmlUnit_t *un * * @param unit The identifier of the target unit * @param info Reference in which to return the unit information - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a info has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a info is NULL @@ -1861,14 +2044,14 @@ nvmlReturn_t DECLDIR nvmlUnitGetUnitInfo(nvmlUnit_t unit, nvmlUnitInfo_t *info); * * @param unit The identifier of the target unit * @param state Reference in which to return the current LED state - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a state has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a state is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * + * * @see nvmlUnitSetLedState() */ nvmlReturn_t DECLDIR nvmlUnitGetLedState(nvmlUnit_t unit, nvmlLedState_t *state); @@ -1882,8 +2065,8 @@ nvmlReturn_t DECLDIR nvmlUnitGetLedState(nvmlUnit_t unit, nvmlLedState_t *state) * * @param unit The identifier of the target unit * @param psu Reference in which to return the PSU information - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a psu has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a psu is NULL @@ -1897,14 +2080,14 @@ nvmlReturn_t DECLDIR nvmlUnitGetPsuInfo(nvmlUnit_t unit, nvmlPSUInfo_t *psu); * * For S-class products. * - * Depending on the product, readings may be available for intake (type=0), + * Depending on the product, readings may be available for intake (type=0), * exhaust (type=1) and board (type=2). * * @param unit The identifier of the target unit * @param type The type of reading to take * @param temp Reference in which to return the intake temperature - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a temp has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a type is invalid or \a temp is NULL @@ -1922,8 +2105,8 @@ nvmlReturn_t DECLDIR nvmlUnitGetTemperature(nvmlUnit_t unit, unsigned int type, * * @param unit The identifier of the target unit * @param fanSpeeds Reference in which to return the fan speed information - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a fanSpeeds has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a fanSpeeds is NULL @@ -1943,8 +2126,8 @@ nvmlReturn_t DECLDIR nvmlUnitGetFanSpeedInfo(nvmlUnit_t unit, nvmlUnitFanSpeeds_ * @param deviceCount Reference in which to provide the \a devices array size, and * to return the number of attached GPU devices * @param devices Reference in which to return the references to the attached GPU devices - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a deviceCount and \a devices have been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a deviceCount indicates that the \a devices array is too small @@ -1955,7 +2138,7 @@ nvmlReturn_t DECLDIR nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int *deviceCou /** * Retrieves the IDs and firmware versions for any Host Interface Cards (HICs) in the system. - * + * * For S-class products. * * The \a hwbcCount argument is expected to be set to the size of the input \a hwbcEntries array. @@ -1964,7 +2147,7 @@ nvmlReturn_t DECLDIR nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int *deviceCou * @param hwbcCount Size of hwbcEntries array * @param hwbcEntries Array holding information about hwbc * - * @return + * @return * - \ref NVML_SUCCESS if \a hwbcCount and \a hwbcEntries have been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if either \a hwbcCount or \a hwbcEntries is NULL @@ -1976,16 +2159,16 @@ nvmlReturn_t DECLDIR nvmlSystemGetHicVersion(unsigned int *hwbcCount, nvmlHwbcEn /***************************************************************************************************/ /** @defgroup nvmlDeviceQueries Device Queries * This chapter describes that queries that NVML can perform against each device. - * In each case the device is identified with an nvmlDevice_t handle. This handle is obtained by - * calling one of \ref nvmlDeviceGetHandleByIndex(), \ref nvmlDeviceGetHandleBySerial(), - * \ref nvmlDeviceGetHandleByPciBusId(). or \ref nvmlDeviceGetHandleByUUID(). + * In each case the device is identified with an nvmlDevice_t handle. This handle is obtained by + * calling one of \ref nvmlDeviceGetHandleByIndex_v2(), \ref nvmlDeviceGetHandleBySerial(), + * \ref nvmlDeviceGetHandleByPciBusId_v2(). or \ref nvmlDeviceGetHandleByUUID(). * @{ */ /***************************************************************************************************/ /** * Retrieves the number of compute devices in the system. A compute device is a single GPU. - * + * * For all products. * * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system @@ -1996,34 +2179,54 @@ nvmlReturn_t DECLDIR nvmlSystemGetHicVersion(unsigned int *hwbcCount, nvmlHwbcEn * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to. * * @param deviceCount Reference in which to return the number of accessible devices - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a deviceCount has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a deviceCount is NULL * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetCount(unsigned int *deviceCount); +nvmlReturn_t DECLDIR nvmlDeviceGetCount_v2(unsigned int *deviceCount); + +/** + * Get attributes (engine counts etc.) for the given NVML device handle. + * + * @note This API currently only supports MIG device handles. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * + * @param device NVML device handle + * @param attributes Device attributes + * + * @return + * - \ref NVML_SUCCESS if \a device attributes were successfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle is invalid + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAttributes_v2(nvmlDevice_t device, nvmlDeviceAttributes_t *attributes); /** * Acquire the handle for a particular device, based on its index. - * + * * For all products. * - * Valid indices are derived from the \a accessibleDevices count returned by - * \ref nvmlDeviceGetCount(). For example, if \a accessibleDevices is 2 the valid indices + * Valid indices are derived from the \a accessibleDevices count returned by + * \ref nvmlDeviceGetCount_v2(). For example, if \a accessibleDevices is 2 the valid indices * are 0 and 1, corresponding to GPU 0 and GPU 1. * * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it - * is recommended that devices be looked up by their PCI ids or UUID. See - * \ref nvmlDeviceGetHandleByUUID() and \ref nvmlDeviceGetHandleByPciBusId(). + * is recommended that devices be looked up by their PCI ids or UUID. See + * \ref nvmlDeviceGetHandleByUUID() and \ref nvmlDeviceGetHandleByPciBusId_v2(). * * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. * * Starting from NVML 5, this API causes NVML to initialize the target GPU * NVML may initialize additional GPUs if: * - The target GPU is an SLI slave - * + * * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device. * Update your code to handle this error, or use NVML 4.304 or older nvml header file. @@ -2037,8 +2240,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetCount(unsigned int *deviceCount); * * @param index The index of the target GPU, >= 0 and < \a accessibleDevices * @param device Reference in which to return the device handle - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a device has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a device is NULL @@ -2051,7 +2254,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetCount(unsigned int *deviceCount); * @see nvmlDeviceGetIndex * @see nvmlDeviceGetCount */ -nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); +nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex_v2(unsigned int index, nvmlDevice_t *device); /** * Acquire the handle for a particular device, based on its board serial number. @@ -2061,7 +2264,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t * This number corresponds to the value printed directly on the board, and to the value returned by * \ref nvmlDeviceGetSerial(). * - * @deprecated Since more than one GPU can exist on a single board this function is deprecated in favor + * @deprecated Since more than one GPU can exist on a single board this function is deprecated in favor * of \ref nvmlDeviceGetHandleByUUID. * For dual GPU boards this function will return NVML_ERROR_INVALID_ARGUMENT. * @@ -2070,8 +2273,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t * * @param serial The board serial number of the target GPU * @param device Reference in which to return the device handle - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a device has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a serial is invalid, \a device is NULL or more than one @@ -2094,11 +2297,13 @@ nvmlReturn_t DECLDIR nvmlDeviceGetHandleBySerial(const char *serial, nvmlDevice_ * * @param uuid The UUID of the target GPU * @param device Reference in which to return the device handle - * + * * Starting from NVML 5, this API causes NVML to initialize the target GPU * NVML may initialize additional GPUs as it searches for the target GPU * - * @return + * This API does not currently support acquiring MIG device handles using MIG device UUIDs. + * + * @return * - \ref NVML_SUCCESS if \a device has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a uuid is invalid or \a device is null @@ -2114,22 +2319,22 @@ nvmlReturn_t DECLDIR nvmlDeviceGetHandleByUUID(const char *uuid, nvmlDevice_t *d /** * Acquire the handle for a particular device, based on its PCI bus id. - * + * * For all products. * - * This value corresponds to the nvmlPciInfo_t::busId returned by \ref nvmlDeviceGetPciInfo(). + * This value corresponds to the nvmlPciInfo_t::busId returned by \ref nvmlDeviceGetPciInfo_v3(). * * Starting from NVML 5, this API causes NVML to initialize the target GPU * NVML may initialize additional GPUs if: * - The target GPU is an SLI slave * - * \note NVML 4.304 and older version of nvmlDeviceGetHandleByPciBusId"_v1" returns NVML_ERROR_NOT_FOUND + * \note NVML 4.304 and older version of nvmlDeviceGetHandleByPciBusId"_v1" returns NVML_ERROR_NOT_FOUND * instead of NVML_ERROR_NO_PERMISSION. * * @param pciBusId The PCI bus id of the target GPU * @param device Reference in which to return the device handle - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a device has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciBusId is invalid or \a device is NULL @@ -2140,22 +2345,25 @@ nvmlReturn_t DECLDIR nvmlDeviceGetHandleByUUID(const char *uuid, nvmlDevice_t *d * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId(const char *pciBusId, nvmlDevice_t *device); +nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId_v2(const char *pciBusId, nvmlDevice_t *device); /** - * Retrieves the name of this device. - * + * Retrieves the name of this device. + * * For all products. * * The name is an alphanumeric string that denotes a particular product, e.g. Tesla &tm; C2070. It will not - * exceed 64 characters in length (including the NULL terminator). See \ref - * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. + * exceed 96 characters in length (including the NULL terminator). See \ref + * nvmlConstants::NVML_DEVICE_NAME_V2_BUFFER_SIZE. + * + * When used with MIG device handles the API returns MIG device names which can be used to identify devices + * based on their attributes. * * @param device The identifier of the target device * @param name Reference in which to return the product name * @param length The maximum allowed length of the string returned in \a name - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a name has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a name is NULL @@ -2188,21 +2396,25 @@ nvmlReturn_t DECLDIR nvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t *ty * Retrieves the NVML index of this device. * * For all products. - * - * Valid indices are derived from the \a accessibleDevices count returned by - * \ref nvmlDeviceGetCount(). For example, if \a accessibleDevices is 2 the valid indices + * + * Valid indices are derived from the \a accessibleDevices count returned by + * \ref nvmlDeviceGetCount_v2(). For example, if \a accessibleDevices is 2 the valid indices * are 0 and 1, corresponding to GPU 0 and GPU 1. * * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it - * is recommended that devices be looked up by their PCI ids or GPU UUID. See - * \ref nvmlDeviceGetHandleByPciBusId() and \ref nvmlDeviceGetHandleByUUID(). + * is recommended that devices be looked up by their PCI ids or GPU UUID. See + * \ref nvmlDeviceGetHandleByPciBusId_v2() and \ref nvmlDeviceGetHandleByUUID(). + * + * When used with MIG device handles this API returns indices that can be + * passed to \ref nvmlDeviceGetMigDeviceHandleByIndex to retrieve an identical handle. + * MIG device indices are unique within a device. * * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. * * @param device The identifier of the target device * @param index Reference in which to return the NVML index of the device * - * @return + * @return * - \ref NVML_SUCCESS if \a index has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a index is NULL @@ -2226,8 +2438,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index * @param device The identifier of the target device * @param serial Reference in which to return the board/module serial number * @param length The maximum allowed length of the string returned in \a serial - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a serial has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a serial is NULL @@ -2238,62 +2450,140 @@ nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index */ nvmlReturn_t DECLDIR nvmlDeviceGetSerial(nvmlDevice_t device, char *serial, unsigned int length); + +/***************************************************************************************************/ + +/** @defgroup nvmlAffinity CPU and Memory Affinity + * This chapter describes NVML operations that are associated with CPU and memory + * affinity. + * @{ + */ +/***************************************************************************************************/ + +//! Scope of NUMA node for affinity queries +#define NVML_AFFINITY_SCOPE_NODE 0 +//! Scope of processor socket for affinity queries +#define NVML_AFFINITY_SCOPE_SOCKET 1 + +typedef unsigned int nvmlAffinityScope_t; + /** - * Retrieves an array of unsigned ints (sized to cpuSetSize) of bitmasks with the ideal CPU affinity for the device - * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2, - * result[0] = 0x3, result[1] = 0x3 + * Retrieves an array of unsigned ints (sized to nodeSetSize) of bitmasks with + * the ideal memory affinity within node or socket for the device. + * For example, if NUMA node 0, 1 are ideal within the socket for the device and nodeSetSize == 1, + * result[0] = 0x3 + * + * \note If requested scope is not applicable to the target topology, the API + * will fall back to reporting the memory affinity for the immediate non-I/O + * ancestor of the device. * * For Kepler &tm; or newer fully supported devices. * Supported on Linux only. * * @param device The identifier of the target device - * @param cpuSetSize The size of the cpuSet array that is safe to access - * @param cpuSet Array reference in which to return a bitmask of CPUs, 64 CPUs per - * unsigned long on 64-bit machines, 32 on 32-bit machines + * @param nodeSetSize The size of the nodeSet array that is safe to access + * @param nodeSet Array reference in which to return a bitmask of NODEs, 64 NODEs per + * unsigned long on 64-bit machines, 32 on 32-bit machines + * @param scope Scope that change the default behavior * - * @return - * - \ref NVML_SUCCESS if \a cpuAffinity has been filled + * @return + * - \ref NVML_SUCCESS if \a NUMA node Affinity has been filled * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, cpuSetSize == 0, or cpuSet is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, nodeSetSize == 0, nodeSet is NULL or scope is invalid * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinity(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet); + +nvmlReturn_t DECLDIR nvmlDeviceGetMemoryAffinity(nvmlDevice_t device, unsigned int nodeSetSize, unsigned long *nodeSet, nvmlAffinityScope_t scope); /** - * Sets the ideal affinity for the calling thread and device using the guidelines - * given in nvmlDeviceGetCpuAffinity(). Note, this is a change as of version 8.0. - * Older versions set the affinity for a calling process and all children. - * Currently supports up to 64 processors. + * Retrieves an array of unsigned ints (sized to cpuSetSize) of bitmasks with the + * ideal CPU affinity within node or socket for the device. + * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2, + * result[0] = 0x3, result[1] = 0x3 + * + * \note If requested scope is not applicable to the target topology, the API + * will fall back to reporting the CPU affinity for the immediate non-I/O + * ancestor of the device. * * For Kepler &tm; or newer fully supported devices. * Supported on Linux only. * * @param device The identifier of the target device + * @param cpuSetSize The size of the cpuSet array that is safe to access + * @param cpuSet Array reference in which to return a bitmask of CPUs, 64 CPUs per + * unsigned long on 64-bit machines, 32 on 32-bit machines + * @param scope Scope that change the default behavior * - * @return - * - \ref NVML_SUCCESS if the calling process has been successfully bound + * @return + * - \ref NVML_SUCCESS if \a cpuAffinity has been filled * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, cpuSetSize == 0, cpuSet is NULL or sope is invalid * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceSetCpuAffinity(nvmlDevice_t device); + +nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinityWithinScope(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet, nvmlAffinityScope_t scope); /** - * Clear all affinity bindings for the calling thread. Note, this is a change as of version - * 8.0 as older versions cleared the affinity for a calling process and all children. + * Retrieves an array of unsigned ints (sized to cpuSetSize) of bitmasks with the ideal CPU affinity for the device + * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2, + * result[0] = 0x3, result[1] = 0x3 + * This is equivalent to calling \ref nvmlDeviceGetCpuAffinityWithinScope with \ref NVML_AFFINITY_SCOPE_NODE. * * For Kepler &tm; or newer fully supported devices. * Supported on Linux only. * * @param device The identifier of the target device + * @param cpuSetSize The size of the cpuSet array that is safe to access + * @param cpuSet Array reference in which to return a bitmask of CPUs, 64 CPUs per + * unsigned long on 64-bit machines, 32 on 32-bit machines * - * @return - * - \ref NVML_SUCCESS if the calling process has been successfully unbound - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * @return + * - \ref NVML_SUCCESS if \a cpuAffinity has been filled + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, cpuSetSize == 0, or cpuSet is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinity(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet); + +/** + * Sets the ideal affinity for the calling thread and device using the guidelines + * given in nvmlDeviceGetCpuAffinity(). Note, this is a change as of version 8.0. + * Older versions set the affinity for a calling process and all children. + * Currently supports up to 1024 processors. + * + * For Kepler &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if the calling process has been successfully bound + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetCpuAffinity(nvmlDevice_t device); + +/** + * Clear all affinity bindings for the calling thread. Note, this is a change as of version + * 8.0 as older versions cleared the affinity for a calling process and all children. + * + * For Kepler &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if the calling process has been successfully unbound + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ @@ -2314,6 +2604,8 @@ nvmlReturn_t DECLDIR nvmlDeviceClearCpuAffinity(nvmlDevice_t device); * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery */ + +/** @} */ nvmlReturn_t DECLDIR nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuTopologyLevel_t *pathInfo); /** @@ -2323,7 +2615,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, n * * @param device The identifier of the first device * @param level The \ref nvmlGpuTopologyLevel_t level to search for other GPUs - * @param count When zero, is set to the number of matching GPUs such that \a deviceArray + * @param count When zero, is set to the number of matching GPUs such that \a deviceArray * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count * number of device handles. * @param deviceArray An array of device handles for GPUs found at \a level @@ -2342,7 +2634,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlG * Supported on Linux only. * * @param cpuNumber The CPU number - * @param count When zero, is set to the number of matching GPUs such that \a deviceArray + * @param count When zero, is set to the number of matching GPUs such that \a deviceArray * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count * number of device handles. * @param deviceArray An array of device handles for GPUs found with affinity to \a cpuNumber @@ -2356,18 +2648,18 @@ nvmlReturn_t DECLDIR nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlG nvmlReturn_t DECLDIR nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int *count, nvmlDevice_t *deviceArray); /** - * Retrieve the status for a given p2p capability index between a given pair of GPU - * - * @param device1 The first device + * Retrieve the status for a given p2p capability index between a given pair of GPU + * + * @param device1 The first device * @param device2 The second device * @param p2pIndex p2p Capability Index being looked for between \a device1 and \a device2 * @param p2pStatus Reference in which to return the status of the \a p2pIndex * between \a device1 and \a device2 - * @return + * @return * - \ref NVML_SUCCESS if \a p2pStatus has been populated * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1 or \a device2 or \a p2pIndex is invalid or \a p2pStatus is NULL * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ + */ nvmlReturn_t DECLDIR nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex,nvmlGpuP2PStatus_t *p2pStatus); /** @@ -2377,18 +2669,21 @@ nvmlReturn_t DECLDIR nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t d * For all products. * * The UUID is a globally unique identifier. It is the only available identifier for pre-Fermi-architecture products. - * It does NOT correspond to any identifier printed on the board. It will not exceed 80 characters in length - * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. + * It does NOT correspond to any identifier printed on the board. It will not exceed 96 characters in length + * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_UUID_V2_BUFFER_SIZE. + * + * When used with MIG device handles the API returns globally unique UUIDs which can be used to identify MIG + * devices across both GPU and MIG devices. UUIDs are immutable for the lifetime of a MIG device. * * @param device The identifier of the target device * @param uuid Reference in which to return the GPU UUID * @param length The maximum allowed length of the string returned in \a uuid - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a uuid has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a uuid is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error @@ -2396,7 +2691,32 @@ nvmlReturn_t DECLDIR nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t d nvmlReturn_t DECLDIR nvmlDeviceGetUUID(nvmlDevice_t device, char *uuid, unsigned int length); /** - * Retrieves minor number for the device. The minor number for the device is such that the Nvidia device node file for + * Retrieve the MDEV UUID of a vGPU instance. + * + * The MDEV UUID is a globally unique identifier of the mdev device assigned to the VM, and is returned as a 5-part hexadecimal string, + * not exceeding 80 characters in length (including the NULL terminator). + * MDEV UUID is displayed only on KVM platform. + * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param mdevUuid Pointer to caller-supplied buffer to hold MDEV UUID + * @param size Size of buffer in bytes + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NOT_SUPPORTED on any hypervisor other than KVM + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mdevUuid is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMdevUUID(nvmlVgpuInstance_t vgpuInstance, char *mdevUuid, unsigned int size); + +/** + * Retrieves minor number for the device. The minor number for the device is such that the Nvidia device node file for * each GPU will have the form /dev/nvidia[minor number]. * * For all products. @@ -2438,7 +2758,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char* par * * For all products with an inforom. * - * Fermi and higher parts have non-volatile on-board memory for persisting device info, such as aggregate + * Fermi and higher parts have non-volatile on-board memory for persisting device info, such as aggregate * ECC counts. The version of the data structures in this memory may change from time to time. It will not * exceed 16 characters in length (including the NULL terminator). * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE. @@ -2450,11 +2770,11 @@ nvmlReturn_t DECLDIR nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char* par * @param version Reference in which to return the infoROM version * @param length The maximum allowed length of the string returned in \a version * - * @return + * @return * - \ref NVML_SUCCESS if \a version has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error @@ -2468,7 +2788,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetInforomVersion(nvmlDevice_t device, nvmlInforo * * For all products with an inforom. * - * Image version just like VBIOS version uniquely describes the exact version of the infoROM flashed on the board + * Image version just like VBIOS version uniquely describes the exact version of the infoROM flashed on the board * in contrast to infoROM object version which is only an indicator of supported features. * Version string will not exceed 16 characters in length (including the NULL terminator). * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE. @@ -2477,11 +2797,11 @@ nvmlReturn_t DECLDIR nvmlDeviceGetInforomVersion(nvmlDevice_t device, nvmlInforo * @param version Reference in which to return the infoROM image version * @param length The maximum allowed length of the string returned in \a version * - * @return + * @return * - \ref NVML_SUCCESS if \a version has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error @@ -2502,14 +2822,14 @@ nvmlReturn_t DECLDIR nvmlDeviceGetInforomImageVersion(nvmlDevice_t device, char * @param device The identifier of the target device * @param checksum Reference in which to return the infoROM configuration checksum * - * @return + * @return * - \ref NVML_SUCCESS if \a checksum has been set * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's checksum couldn't be retrieved due to infoROM corruption * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a checksum is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ nvmlReturn_t DECLDIR nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t device, unsigned int *checksum); @@ -2520,13 +2840,13 @@ nvmlReturn_t DECLDIR nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t devi * * @param device The identifier of the target device * - * @return + * @return * - \ref NVML_SUCCESS if infoROM is not corrupted * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's infoROM is corrupted * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ nvmlReturn_t DECLDIR nvmlDeviceValidateInforom(nvmlDevice_t device); @@ -2542,8 +2862,8 @@ nvmlReturn_t DECLDIR nvmlDeviceValidateInforom(nvmlDevice_t device); * * @param device The identifier of the target device * @param display Reference in which to return the display mode - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a display has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a display is NULL @@ -2567,8 +2887,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetDisplayMode(nvmlDevice_t device, nvmlEnableSta * * @param device The identifier of the target device * @param isActive Reference in which to return the display active state - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a isActive has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isActive is NULL @@ -2584,15 +2904,15 @@ nvmlReturn_t DECLDIR nvmlDeviceGetDisplayActive(nvmlDevice_t device, nvmlEnableS * For all products. * For Linux only. * - * When driver persistence mode is enabled the driver software state is not torn down when the last - * client disconnects. By default this feature is disabled. + * When driver persistence mode is enabled the driver software state is not torn down when the last + * client disconnects. By default this feature is disabled. * * See \ref nvmlEnableState_t for details on allowed modes. * * @param device The identifier of the target device * @param mode Reference in which to return the current driver persistence mode - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a mode has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL @@ -2606,35 +2926,35 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnabl /** * Retrieves the PCI attributes of this device. - * + * * For all products. * * See \ref nvmlPciInfo_t for details on the available PCI info. * * @param device The identifier of the target device * @param pci Reference in which to return the PCI info - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a pci has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pci is NULL * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *pci); +nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo_v3(nvmlDevice_t device, nvmlPciInfo_t *pci); /** * Retrieves the maximum PCIe link generation possible with this device and system * * I.E. for a generation 2 PCIe device attached to a generation 1 PCIe bus the max link generation this function will * report is generation 1. - * + * * For Fermi &tm; or newer fully supported devices. - * + * * @param device The identifier of the target device * @param maxLinkGen Reference in which to return the max PCIe link generation - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a maxLinkGen has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkGen is null @@ -2649,13 +2969,13 @@ nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, uns * * I.E. for a device with a 16x PCIe bus width attached to a 8x PCIe system bus this function will report * a max link width of 8. - * + * * For Fermi &tm; or newer fully supported devices. - * + * * @param device The identifier of the target device * @param maxLinkWidth Reference in which to return the max PCIe link generation - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a maxLinkWidth has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkWidth is null @@ -2667,13 +2987,13 @@ nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned /** * Retrieves the current PCIe link generation - * + * * For Fermi &tm; or newer fully supported devices. - * + * * @param device The identifier of the target device * @param currLinkGen Reference in which to return the current PCIe link generation - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a currLinkGen has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkGen is null @@ -2685,13 +3005,13 @@ nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkGeneration(nvmlDevice_t device, un /** * Retrieves the current PCIe link width - * + * * For Fermi &tm; or newer fully supported devices. - * + * * @param device The identifier of the target device * @param currLinkWidth Reference in which to return the current PCIe link generation - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a currLinkWidth has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkWidth is null @@ -2703,7 +3023,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigne /** * Retrieve PCIe utilization information. - * This function is querying a byte counter over a 20ms interval and thus is the + * This function is querying a byte counter over a 20ms interval and thus is the * PCIe throughput over that interval. * * For Maxwell &tm; or newer fully supported devices. @@ -2724,7 +3044,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigne */ nvmlReturn_t DECLDIR nvmlDeviceGetPcieThroughput(nvmlDevice_t device, nvmlPcieUtilCounter_t counter, unsigned int *value); -/** +/** * Retrieve the PCIe replay counter. * * For Kepler &tm; or newer fully supported devices. @@ -2752,8 +3072,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigne * @param device The identifier of the target device * @param type Identify which clock domain to query * @param clock Reference in which to return the clock speed in MHz - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a clock has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL @@ -2776,8 +3096,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t * @param device The identifier of the target device * @param type Identify which clock domain to query * @param clock Reference in which to return the clock speed in MHz - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a clock has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL @@ -2796,8 +3116,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockTyp * @param device The identifier of the target device * @param clockType Identify which clock domain to query * @param clockMHz Reference in which to return the clock in MHz - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a clockMHz has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid @@ -2808,7 +3128,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockTyp nvmlReturn_t DECLDIR nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); /** - * Retrieves the default applications clock that GPU boots with or + * Retrieves the default applications clock that GPU boots with or * defaults to after \ref nvmlDeviceResetApplicationsClocks call. * * For Kepler &tm; or newer fully supported devices. @@ -2816,8 +3136,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClo * @param device The identifier of the target device * @param clockType Identify which clock domain to query * @param clockMHz Reference in which to return the default clock in MHz - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a clockMHz has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid @@ -2845,8 +3165,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, * For Fermi &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. * * @param device The identifier of the target device - * - * @return + * + * @return * - \ref NVML_SUCCESS if new settings were successfully set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid @@ -2904,9 +3224,9 @@ nvmlReturn_t DECLDIR nvmlDeviceGetMaxCustomerBoostClock(nvmlDevice_t device, nvm * @param count Reference in which to provide the \a clocksMHz array size, and * to return the number of elements * @param clocksMHz Reference in which to return the clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated + * + * @return + * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature @@ -2930,14 +3250,14 @@ nvmlReturn_t DECLDIR nvmlDeviceGetSupportedMemoryClocks(nvmlDevice_t device, uns * @param count Reference in which to provide the \a clocksMHz array size, and * to return the number of elements * @param clocksMHz Reference in which to return the clocks in MHz - * - * @return - * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated + * + * @return + * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_NOT_FOUND if the specified \a memoryClockMHz is not a supported frequency * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error * @@ -3042,15 +3362,16 @@ nvmlReturn_t DECLDIR nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t d * * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the * output will not match the actual fan speed. - * + * * For all discrete products with dedicated fans. * - * The fan speed is expressed as a percent of the maximum, i.e. full speed is 100%. + * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. + * This value may exceed 100% in certain cases. * * @param device The identifier of the target device * @param speed Reference in which to return the fan speed percentage - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a speed has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a speed is NULL @@ -3069,7 +3390,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *sp * * For all discrete products with dedicated fans. * - * The fan speed is expressed as a percentage of the maximum, i.e. full speed is 100% + * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. + * This value may exceed 100% in certain cases. * * @param device The identifier of the target device * @param fan The index of the target fan, zero indexed. @@ -3087,8 +3409,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed_v2(nvmlDevice_t device, unsigned int /** - * Retrieves the current temperature readings for the device, in degrees C. - * + * Retrieves the current temperature readings for the device, in degrees C. + * * For all products. * * See \ref nvmlTemperatureSensors_t for details on available temperature sensors. @@ -3096,8 +3418,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed_v2(nvmlDevice_t device, unsigned int * @param device The identifier of the target device * @param sensorType Flag that indicates which sensor reading to retrieve * @param temp Reference in which to return the temperature reading - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a temp has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sensorType is invalid or \a temp is NULL @@ -3128,7 +3450,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatu nvmlReturn_t DECLDIR nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp); /** - * Retrieves the current performance state for the device. + * Retrieves the current performance state for the device. * * For Fermi &tm; or newer fully supported devices. * @@ -3136,8 +3458,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvml * * @param device The identifier of the target device * @param pState Reference in which to return the performance state reading - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a pState has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL @@ -3158,7 +3480,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPsta * @param clocksThrottleReasons Reference in which to return bitmask of active clocks throttle * reasons * - * @return + * @return * - \ref NVML_SUCCESS if \a clocksThrottleReasons has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clocksThrottleReasons is NULL @@ -3172,7 +3494,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPsta nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsigned long long *clocksThrottleReasons); /** - * Retrieves bitmask of supported clocks throttle reasons that can be returned by + * Retrieves bitmask of supported clocks throttle reasons that can be returned by * \ref nvmlDeviceGetCurrentClocksThrottleReasons * * For all fully supported products. @@ -3183,7 +3505,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t devi * @param supportedClocksThrottleReasons Reference in which to return bitmask of supported * clocks throttle reasons * - * @return + * @return * - \ref NVML_SUCCESS if \a supportedClocksThrottleReasons has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a supportedClocksThrottleReasons is NULL @@ -3198,7 +3520,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t de /** * Deprecated: Use \ref nvmlDeviceGetPerformanceState. This function exposes an incorrect generalization. * - * Retrieve the current performance state for the device. + * Retrieve the current performance state for the device. * * For Fermi &tm; or newer fully supported devices. * @@ -3206,8 +3528,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t de * * @param device The identifier of the target device * @param pState Reference in which to return the performance state reading - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a pState has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL @@ -3228,16 +3550,16 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t * For from the Kepler or newer families. * - Does not require \a NVML_INFOROM_POWER object. * - * This flag indicates whether any power management algorithm is currently active on the device. An - * enabled state does not necessarily mean the device is being actively throttled -- only that + * This flag indicates whether any power management algorithm is currently active on the device. An + * enabled state does not necessarily mean the device is being actively throttled -- only that * that the driver will do so if the appropriate conditions are met. * * See \ref nvmlEnableState_t for details on allowed modes. * * @param device The identifier of the target device * @param mode Reference in which to return the current power management mode - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a mode has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL @@ -3255,13 +3577,13 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlE * The power limit defines the upper boundary for the card's power draw. If * the card's total power draw reaches this limit the power management algorithm kicks in. * - * This reading is only available if power management mode is supported. + * This reading is only available if power management mode is supported. * See \ref nvmlDeviceGetPowerManagementMode. * * @param device The identifier of the target device * @param limit Reference in which to return the power management limit in milliwatts - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a limit has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL @@ -3279,8 +3601,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimit(nvmlDevice_t device, unsi * @param device The identifier of the target device * @param minLimit Reference in which to return the minimum power management limit in milliwatts * @param maxLimit Reference in which to return the maximum power management limit in milliwatts - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a minLimit and \a maxLimit have been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minLimit or \a maxLimit is NULL @@ -3300,8 +3622,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimitConstraints(nvmlDevice_t d * * @param device The identifier of the target device * @param defaultLimit Reference in which to return the default power management limit in milliwatts - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a defaultLimit has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultLimit is NULL @@ -3322,8 +3644,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t devic * * @param device The identifier of the target device * @param power Reference in which to return the power usage information - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a power has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL @@ -3336,7 +3658,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int * /** * Retrieves total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded * - * For newer than Pascal &tm; fully supported devices. + * For Volta &tm; or newer fully supported devices. * * @param device The identifier of the target device * @param energy Reference in which to return the energy consumption information @@ -3362,7 +3684,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetTotalEnergyConsumption(nvmlDevice_t device, un * @param device The device to communicate with * @param limit Reference in which to return the power management limit in milliwatts * - * @return + * @return * - \ref NVML_SUCCESS if \a limit has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL @@ -3382,8 +3704,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetEnforcedPowerLimit(nvmlDevice_t device, unsign * @param device The identifier of the target device * @param current Reference in which to return the current GOM * @param pending Reference in which to return the pending GOM - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a mode has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a current or \a pending is NULL @@ -3398,23 +3720,28 @@ nvmlReturn_t DECLDIR nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuO /** * Retrieves the amount of used, free and total memory available on the device, in bytes. - * + * * For all products. * * Enabling ECC reduces the amount of total available memory, due to the extra required parity bits. * Under WDDM most device memory is allocated and managed on startup by Windows. * - * Under Linux and Windows TCC, the reported amount of used memory is equal to the sum of memory allocated + * Under Linux and Windows TCC, the reported amount of used memory is equal to the sum of memory allocated * by all active channels on the device. * * See \ref nvmlMemory_t for details on available memory info. * + * @note In MIG mode, if device handle is provided, the API returns aggregate + * information, only if the caller has appropriate privileges. Per-instance + * information can be queried by using specific MIG device handles. + * * @param device The identifier of the target device * @param memory Reference in which to return the memory information - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a memory has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error @@ -3430,8 +3757,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t * * * @param device The identifier of the target device * @param mode Reference in which to return the current compute mode - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a mode has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL @@ -3482,8 +3809,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int * @param device The identifier of the target device * @param current Reference in which to return the current ECC mode * @param pending Reference in which to return the pending ECC mode - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a current and \a pending have been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or either \a current or \a pending is NULL @@ -3497,13 +3824,13 @@ nvmlReturn_t DECLDIR nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t /** * Retrieves the device boardId from 0-N. - * Devices with the same boardId indicate GPUs connected to the same PLX. Use in conjunction with + * Devices with the same boardId indicate GPUs connected to the same PLX. Use in conjunction with * \ref nvmlDeviceGetMultiGpuBoard() to decide if they are on the same board as well. - * The boardId returned is a unique ID for the current configuration. Uniqueness and ordering across + * The boardId returned is a unique ID for the current configuration. Uniqueness and ordering across * reboots and system configurations is not guaranteed (i.e. if a Tesla K40c returns 0x100 and - * the two GPUs on a Tesla K10 in the same system returns 0x200 it is not guaranteed they will + * the two GPUs on a Tesla K10 in the same system returns 0x200 it is not guaranteed they will * always return those values but they will always be different from each other). - * + * * * For Fermi &tm; or newer fully supported devices. * @@ -3548,18 +3875,18 @@ nvmlReturn_t DECLDIR nvmlDeviceGetMultiGpuBoard(nvmlDevice_t device, unsigned in * Requires \a NVML_INFOROM_ECC version 1.0 or higher. * Requires ECC Mode to be enabled. * - * The total error count is the sum of errors across each of the separate memory systems, i.e. the total set of + * The total error count is the sum of errors across each of the separate memory systems, i.e. the total set of * errors across the entire device. * * See \ref nvmlMemoryErrorType_t for a description of available error types.\n * See \ref nvmlEccCounterType_t for a description of available counter types. * * @param device The identifier of the target device - * @param errorType Flag that specifies the type of the errors. - * @param counterType Flag that specifies the counter-type of the errors. + * @param errorType Flag that specifies the type of the errors. + * @param counterType Flag that specifies the counter-type of the errors. * @param eccCounts Reference in which to return the specified ECC errors - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a eccCounts has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL @@ -3593,11 +3920,11 @@ nvmlReturn_t DECLDIR nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemory * See \ref nvmlEccErrorCounts_t for a description of provided detailed ECC counts. * * @param device The identifier of the target device - * @param errorType Flag that specifies the type of the errors. - * @param counterType Flag that specifies the counter-type of the errors. + * @param errorType Flag that specifies the type of the errors. + * @param counterType Flag that specifies the counter-type of the errors. * @param eccCounts Reference in which to return the specified ECC errors - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a eccCounts has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL @@ -3620,17 +3947,22 @@ nvmlReturn_t DECLDIR nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMem * * Requires ECC Mode to be enabled. * + * @note On MIG-enabled GPUs, per instance information can be queried using specific + * MIG device handles. Per instance information is currently only supported for + * non-DRAM uncorrectable volatile errors. Querying volatile errors using device + * handles is currently not supported. + * * See \ref nvmlMemoryErrorType_t for a description of available memory error types.\n * See \ref nvmlEccCounterType_t for a description of available counter types.\n * See \ref nvmlMemoryLocation_t for a description of available counter locations.\n - * + * * @param device The identifier of the target device * @param errorType Flag that specifies the type of error. - * @param counterType Flag that specifies the counter-type of the errors. - * @param locationType Specifies the location of the counter. + * @param counterType Flag that specifies the counter-type of the errors. + * @param locationType Specifies the location of the counter. * @param count Reference in which to return the ECC counter - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a count has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a bitTyp,e \a counterType or \a locationType is @@ -3653,10 +3985,12 @@ nvmlReturn_t DECLDIR nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device, nvmlMe * \note During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings. * This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization. * + * @note On MIG-enabled GPUs, querying device utilization rates is not currently supported. + * * @param device The identifier of the target device * @param utilization Reference in which to return the utilization information - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a utilization has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a utilization is NULL @@ -3671,6 +4005,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtil * * For Kepler &tm; or newer fully supported devices. * + * @note On MIG-enabled GPUs, querying encoder utilization is not currently supported. + * * @param device The identifier of the target device * @param utilization Reference to an unsigned int for encoder utilization info * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US @@ -3693,7 +4029,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetEncoderUtilization(nvmlDevice_t device, unsign * @param device The identifier of the target device * @param encoderQueryType Type of encoder to query * @param encoderCapacity Reference to an unsigned int for the encoder capacity - * + * * @return * - \ref NVML_SUCCESS if \a encoderCapacity is fetched * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized @@ -3714,7 +4050,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetEncoderCapacity (nvmlDevice_t device, nvmlEnco * @param sessionCount Reference to an unsigned int for count of active encoder sessions * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions * @param averageLatency Reference to an unsigned int for encode latency in microseconds - * + * * @return * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized @@ -3743,7 +4079,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetEncoderStats (nvmlDevice_t device, unsigned in * @param device The identifier of the target device * @param sessionCount Reference to caller supplied array size, and returns the number of sessions. * @param sessionInfos Reference in which to return the session information - * + * * @return * - \ref NVML_SUCCESS if \a sessionInfos is fetched * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized @@ -3759,6 +4095,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetEncoderSessions(nvmlDevice_t device, unsigned * * For Kepler &tm; or newer fully supported devices. * + * @note On MIG-enabled GPUs, querying decoder utilization is not currently supported. + * * @param device The identifier of the target device * @param utilization Reference to an unsigned int for decoder utilization info * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US @@ -3835,15 +4173,15 @@ nvmlReturn_t DECLDIR nvmlDeviceGetFBCSessions(nvmlDevice_t device, unsigned int * @param device The identifier of the target device * @param current Reference in which to return the current driver model * @param pending Reference in which to return the pending driver model - * - * @return + * + * @return * - \ref NVML_SUCCESS if either \a current and/or \a pending have been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or both \a current and \a pending are NULL * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * + * * @see nvmlDeviceSetDriverModel() */ nvmlReturn_t DECLDIR nvmlDeviceGetDriverModel(nvmlDevice_t device, nvmlDriverModel_t *current, nvmlDriverModel_t *pending); @@ -3853,18 +4191,18 @@ nvmlReturn_t DECLDIR nvmlDeviceGetDriverModel(nvmlDevice_t device, nvmlDriverMod * * For all products. * - * The VBIOS version may change from time to time. It will not exceed 32 characters in length + * The VBIOS version may change from time to time. It will not exceed 32 characters in length * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE. * * @param device The identifier of the target device * @param version Reference to which to return the VBIOS version * @param length The maximum allowed length of the string returned in \a version - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a version has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a version is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ @@ -3872,21 +4210,21 @@ nvmlReturn_t DECLDIR nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char *versio /** * Get Bridge Chip Information for all the bridge chips on the board. - * + * * For all fully supported products. * Only applicable to multi-GPU products. - * + * * @param device The identifier of the target device * @param bridgeHierarchy Reference to the returned bridge chip Hierarchy - * - * @return + * + * @return * - \ref NVML_SUCCESS if bridge chip exists * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a bridgeInfo is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if bridge chip not supported on the device * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * + * */ nvmlReturn_t DECLDIR nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridgeChipHierarchy_t *bridgeHierarchy); @@ -3907,17 +4245,22 @@ nvmlReturn_t DECLDIR nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridge * Keep in mind that information returned by this call is dynamic and the number of elements might change in * time. Allocate more space for \a infos table in case new compute processes are spawned. * - * @param device The identifier of the target device + * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if + * the caller has appropriate privileges. Per-instance information can be queried by using + * specific MIG device handles. + * + * @param device The device handle or MIG device handle * @param infoCount Reference in which to provide the \a infos array size, and * to return the number of returned elements * @param infos Reference in which to return the process information - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small * \a infoCount will contain minimal amount of space necessary for * the call to complete + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error @@ -3931,7 +4274,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, u * * For Kepler &tm; or newer fully supported devices. * - * This function returns information only about graphics based processes + * This function returns information only about graphics based processes * (eg. applications using OpenGL, DirectX) * * To query the current number of running graphics processes, call this function with *infoCount = 0. The @@ -3943,17 +4286,22 @@ nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, u * Keep in mind that information returned by this call is dynamic and the number of elements might change in * time. Allocate more space for \a infos table in case new graphics processes are spawned. * + * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if + * the caller has appropriate privileges. Per-instance information can be queried by using + * specific MIG device handles. + * * @param device The identifier of the target device * @param infoCount Reference in which to provide the \a infos array size, and * to return the number of returned elements * @param infos Reference in which to return the process information - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small * \a infoCount will contain minimal amount of space necessary for * the call to complete + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error @@ -3990,7 +4338,7 @@ nvmlReturn_t DECLDIR nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t de * * @param device The identifier of the target device * @param apiType Target API type for this operation - * @param isRestricted Reference in which to return the current restriction + * @param isRestricted Reference in which to return the current restriction * NVML_FEATURE_ENABLED indicates that the API is root-only * NVML_FEATURE_DISABLED indicates that the API is accessible to all users * @@ -4010,37 +4358,40 @@ nvmlReturn_t DECLDIR nvmlDeviceGetAPIRestriction(nvmlDevice_t device, nvmlRestri /** * Gets recent samples for the GPU. - * + * * For Kepler &tm; or newer fully supported devices. - * - * Based on type, this method can be used to fetch the power, utilization or clock samples maintained in the buffer by + * + * Based on type, this method can be used to fetch the power, utilization or clock samples maintained in the buffer by * the driver. - * + * * Power, Utilization and Clock samples are returned as type "unsigned int" for the union nvmlValue_t. - * - * To get the size of samples that user needs to allocate, the method is invoked with samples set to NULL. - * The returned samplesCount will provide the number of samples that can be queried. The user needs to + * + * To get the size of samples that user needs to allocate, the method is invoked with samples set to NULL. + * The returned samplesCount will provide the number of samples that can be queried. The user needs to * allocate the buffer with size as samplesCount * sizeof(nvmlSample_t). - * - * lastSeenTimeStamp represents CPU timestamp in microseconds. Set it to 0 to fetch all the samples maintained by the - * underlying buffer. Set lastSeenTimeStamp to one of the timeStamps retrieved from the date of the previous query + * + * lastSeenTimeStamp represents CPU timestamp in microseconds. Set it to 0 to fetch all the samples maintained by the + * underlying buffer. Set lastSeenTimeStamp to one of the timeStamps retrieved from the date of the previous query * to get more recent samples. - * - * This method fetches the number of entries which can be accommodated in the provided samples array, and the - * reference samplesCount is updated to indicate how many samples were actually retrieved. The advantage of using this + * + * This method fetches the number of entries which can be accommodated in the provided samples array, and the + * reference samplesCount is updated to indicate how many samples were actually retrieved. The advantage of using this * method for samples in contrast to polling via existing methods is to get get higher frequency data at lower polling cost. - * + * + * @note On MIG-enabled GPUs, querying the following sample types, NVML_GPU_UTILIZATION_SAMPLES, NVML_MEMORY_UTILIZATION_SAMPLES + * NVML_ENC_UTILIZATION_SAMPLES and NVML_DEC_UTILIZATION_SAMPLES, is not currently supported. + * * @param device The identifier for the target device * @param type Type of sampling event - * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. + * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. * @param sampleValType Output parameter to represent the type of sample value as described in nvmlSampleVal_t * @param sampleCount Reference to provide the number of elements which can be queried in samples array * @param samples Reference in which samples are returned - - * @return + + * @return * - \ref NVML_SUCCESS if samples are successfully retrieved * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a samplesCount is NULL or + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a samplesCount is NULL or * reference to \a sampleCount is 0 for non null \a samples * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible @@ -4052,10 +4403,10 @@ nvmlReturn_t DECLDIR nvmlDeviceGetSamples(nvmlDevice_t device, nvmlSamplingType_ /** * Gets Total, Available and Used size of BAR1 memory. - * - * BAR1 is used to map the FB (device memory) so that it can be directly accessed by the CPU or by 3rd party - * devices (peer-to-peer on the PCIE bus). - * + * + * BAR1 is used to map the FB (device memory) so that it can be directly accessed by the CPU or by 3rd party + * devices (peer-to-peer on the PCIE bus). + * * For Kepler &tm; or newer fully supported devices. * * @param device The identifier of the target device @@ -4075,20 +4426,20 @@ nvmlReturn_t DECLDIR nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Me /** - * Gets the duration of time during which the device was throttled (lower than requested clocks) due to power + * Gets the duration of time during which the device was throttled (lower than requested clocks) due to power * or thermal constraints. * * The method is important to users who are tying to understand if their GPUs throttle at any point during their applications. The - * difference in violation times at two different reference times gives the indication of GPU throttling event. + * difference in violation times at two different reference times gives the indication of GPU throttling event. * * Violation for thermal capping is not supported at this time. - * + * * For Kepler &tm; or newer fully supported devices. * * @param device The identifier of the target device * @param perfPolicyType Represents Performance policy which can trigger GPU throttling - * @param violTime Reference to which violation time related information is returned - * + * @param violTime Reference to which violation time related information is returned + * * * @return * - \ref NVML_SUCCESS if violation time is successfully retrieved @@ -4119,8 +4470,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetViolationStatus(nvmlDevice_t device, nvmlPerfP * @param device The identifier of the target device * @param mode Reference in which to return the current accounting mode * - * @return - * - \ref NVML_SUCCESS if the mode has been successfully retrieved + * @return + * - \ref NVML_SUCCESS if the mode has been successfully retrieved * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode are NULL * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature @@ -4132,10 +4483,10 @@ nvmlReturn_t DECLDIR nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnable * Queries process's accounting stats. * * For Kepler &tm; or newer fully supported devices. - * + * * Accounting stats capture GPU utilization and other statistics across the lifetime of a process. * Accounting stats can be queried during life time of the process and after its termination. - * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and + * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and * updated to actual running time after its termination. * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old * processes. @@ -4149,17 +4500,18 @@ nvmlReturn_t DECLDIR nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnable * @note In case of pid collision stats of only the latest process (that terminated last) will be reported * * @warning On Kepler devices per process statistics are accurate only if there's one process running on a GPU. - * + * * @param device The identifier of the target device * @param pid Process Id of the target process to query stats for * @param stats Reference in which to return the process's accounting stats * - * @return + * @return * - \ref NVML_SUCCESS if stats have been successfully retrieved * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a stats are NULL * - \ref NVML_ERROR_NOT_FOUND if process stats were not found - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled + * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature or accounting mode is disabled + * or on vGPU host. * - \ref NVML_ERROR_UNKNOWN on any unexpected error * * @see nvmlDeviceGetAccountingBufferSize @@ -4167,14 +4519,14 @@ nvmlReturn_t DECLDIR nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnable nvmlReturn_t DECLDIR nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats); /** - * Queries list of processes that can be queried for accounting stats. The list of processes returned + * Queries list of processes that can be queried for accounting stats. The list of processes returned * can be in running or terminated state. * * For Kepler &tm; or newer fully supported devices. * * To just query the number of processes ready to be queried, call this function with *count = 0 and * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty. - * + * * For more details see \ref nvmlDeviceGetAccountingStats. * * @note In case of PID collision some processes might not be accessible before the circular buffer is full. @@ -4183,12 +4535,13 @@ nvmlReturn_t DECLDIR nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned * @param count Reference in which to provide the \a pids array size, and * to return the number of elements ready to be queried * @param pids Reference in which to return list of process ids - * - * @return + * + * @return * - \ref NVML_SUCCESS if pids were successfully retrieved * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled + * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature or accounting mode is disabled + * or on vGPU host. * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to * expected value) * - \ref NVML_ERROR_UNKNOWN on any unexpected error @@ -4208,14 +4561,14 @@ nvmlReturn_t DECLDIR nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned i * @param device The identifier of the target device * @param bufferSize Reference in which to provide the size (in number of elements) * of the circular buffer for accounting stats. - * - * @return + * + * @return * - \ref NVML_SUCCESS if buffer size was successfully retrieved * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a bufferSize is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * + * * @see nvmlDeviceGetAccountingStats * @see nvmlDeviceGetAccountingPids */ @@ -4231,7 +4584,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsi * Returns the list of retired pages by source, including pages that are pending retirement * The address information provided from this API is the hardware address of the page that was retired. Note * that this does not match the virtual address used in CUDA, but will match the address information in XID 63 - * + * * For Kepler &tm; or newer fully supported devices. * * @param device The identifier of the target device @@ -4240,13 +4593,13 @@ nvmlReturn_t DECLDIR nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsi * to return the number of retired pages that match \a cause * Set to 0 to query the size without allocating an \a addresses buffer * @param addresses Buffer to write the page addresses into - * + * * @return * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the * matching page addresses. \a pageCount is set to the needed size. * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or * \a addresses is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible @@ -4262,7 +4615,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageReti * * \note nvmlDeviceGetRetiredPages_v2 adds an additional timestamps paramter to return the time of each page's * retirement. - * + * * For Kepler &tm; or newer fully supported devices. * * @param device The identifier of the target device @@ -4272,13 +4625,13 @@ nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageReti * Set to 0 to query the size without allocating an \a addresses buffer * @param addresses Buffer to write the page addresses into * @param timestamps Buffer to write the timestamps of page retirement, additional for _v2 - * + * * @return * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the * matching page addresses. \a pageCount is set to the needed size. * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or * \a addresses is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible @@ -4294,7 +4647,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages_v2(nvmlDevice_t device, nvmlPageR * * @param device The identifier of the target device * @param isPending Reference in which to return the pending status - * + * * @return * - \ref NVML_SUCCESS if \a isPending was populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized @@ -4305,6 +4658,61 @@ nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages_v2(nvmlDevice_t device, nvmlPageR */ nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, nvmlEnableState_t *isPending); +/** + * Get number of remapped rows. The number of rows reported will be based on + * the cause of the remapping. isPending indicates whether or not there are + * pending remappings. A reset will be required to actually remap the row. + * failureOccurred will be set if a row remapping ever failed in the past. A + * pending remapping won't affect future work on the GPU since + * error-containment and dynamic page blacklisting will take care of that. + * + * @note On MIG-enabled GPUs with active instances, querying the number of + * remapped rows is not supported + * + * For newer than Volta &tm; fully supported devices. + * + * @param device The identifier of the target device + * @param corrRows Reference for number of rows remapped due to correctable errors + * @param uncRows Reference for number of rows remapped due to uncorrectable errors + * @param isPending Reference for whether or not remappings are pending + * @param failureOccurred Reference that is set when a remapping has failed in the past + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a corrRows, \a uncRows, \a isPending or \a failureOccurred is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If MIG is enabled or if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN Unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetRemappedRows(nvmlDevice_t device, unsigned int *corrRows, unsigned int *uncRows, + unsigned int *isPending, unsigned int *failureOccurred); + +/** + * Get the row remapper histogram. Returns the remap availability for each bank + * on the GPU. + * + * @param device Device handle + * @param values Histogram values + * + * @return + * - \ref NVML_SUCCESS On success + * - \ref NVML_ERROR_UNKNOWN On any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetRowRemapperHistogram(nvmlDevice_t device, nvmlRowRemapperHistogramValues_t *values); + +/** + * Get architecture for device + * + * @param device The identifier of the target device + * @param arch Reference where architecture is returned, if call successful. + * Set to NVML_DEVICE_ARCH_* upon success + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a arch (output refererence) are invalid + */ +nvmlReturn_t DECLDIR nvmlDeviceGetArchitecture(nvmlDevice_t device, nvmlDeviceArchitecture_t *arch); + /** @} */ /***************************************************************************************************/ @@ -4323,24 +4731,24 @@ nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, * Requires root/admin permissions. * * This operation takes effect immediately. - * * - * Current S-Class products don't provide unique LEDs for each unit. As such, both front + * + * Current S-Class products don't provide unique LEDs for each unit. As such, both front * and back LEDs will be toggled in unison regardless of which unit is specified with this command. * * See \ref nvmlLedColor_t for available colors. * * @param unit The identifier of the target unit * @param color The target LED color - * - * @return + * + * @return * - \ref NVML_SUCCESS if the LED color has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a color is invalid * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * + * * @see nvmlUnitGetLedState() */ nvmlReturn_t DECLDIR nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color); @@ -4379,8 +4787,8 @@ nvmlReturn_t DECLDIR nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color); * * @param device The identifier of the target device * @param mode The target persistence mode - * - * @return + * + * @return * - \ref NVML_SUCCESS if the persistence mode was set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid @@ -4407,12 +4815,14 @@ nvmlReturn_t DECLDIR nvmlDeviceSetPersistenceMode(nvmlDevice_t device, nvmlEnabl * * Under windows compute mode may only be set to DEFAULT when running in WDDM * + * @note On MIG-enabled GPUs, compute mode would be set to DEFAULT and changing it is not supported. + * * See \ref nvmlComputeMode_t for details on available compute modes. * * @param device The identifier of the target device * @param mode The target compute mode - * - * @return + * + * @return * - \ref NVML_SUCCESS if the compute mode was set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid @@ -4441,8 +4851,8 @@ nvmlReturn_t DECLDIR nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMo * * @param device The identifier of the target device * @param ecc The target ECC mode - * - * @return + * + * @return * - \ref NVML_SUCCESS if the ECC mode was set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a ecc is invalid @@ -4453,7 +4863,7 @@ nvmlReturn_t DECLDIR nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMo * * @see nvmlDeviceGetEccMode() */ -nvmlReturn_t DECLDIR nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc); +nvmlReturn_t DECLDIR nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc); /** * Clear the ECC error and other memory error counts for the device. @@ -4473,8 +4883,8 @@ nvmlReturn_t DECLDIR nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t * * @param device The identifier of the target device * @param counterType Flag that indicates which type of errors should be cleared. - * - * @return + * + * @return * - \ref NVML_SUCCESS if the error counts were cleared * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a counterType is invalid @@ -4483,7 +4893,7 @@ nvmlReturn_t DECLDIR nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error * - * @see + * @see * - nvmlDeviceGetDetailedEccErrors() * - nvmlDeviceGetTotalEccErrors() */ @@ -4497,17 +4907,17 @@ nvmlReturn_t DECLDIR nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccC * Requires root/admin permissions. * * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached - * to the device it must run in WDDM mode. + * to the device it must run in WDDM mode. * * It is possible to force the change to WDM (TCC) while the display is still attached with a force flag (nvmlFlagForce). * This should only be done if the host is subsequently powered down and the display is detached from the device - * before the next reboot. + * before the next reboot. * * This operation takes effect after the next reboot. - * + * * Windows driver model may only be set to WDDM when running in DEFAULT compute mode. * - * Change driver model to WDDM is not supported when GPU doesn't support graphics acceleration or + * Change driver model to WDDM is not supported when GPU doesn't support graphics acceleration or * will not support it after reboot. See \ref nvmlDeviceSetGpuOperationMode. * * See \ref nvmlDriverModel_t for details on available driver models. @@ -4516,20 +4926,26 @@ nvmlReturn_t DECLDIR nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccC * @param device The identifier of the target device * @param driverModel The target driver model * @param flags Flags that change the default behavior - * - * @return - * - \ref NVML_SUCCESS if the driver model has been set + * + * @return + * - \ref NVML_SUCCESS if the driver model has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a driverModel is invalid * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows or the device does not support this feature * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * + * * @see nvmlDeviceGetDriverModel() */ nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t driverModel, unsigned int flags); +typedef enum nvmlClockLimitId_enum { + NVML_CLOCK_LIMIT_ID_RANGE_START = 0xffffff00, + NVML_CLOCK_LIMIT_ID_TDP, + NVML_CLOCK_LIMIT_ID_UNLIMITED +} nvmlClockLimitId_t; + /** * Set clocks that device will lock to. * @@ -4539,12 +4955,25 @@ nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverMod * * Can be used as a setting to request constant performance. * + * This can be called with a pair of integer clock frequencies in MHz, or a pair of /ref nvmlClockLimitId_t values. + * See the table below for valid combinations of these values. + * + * minGpuClock | maxGpuClock | Effect + * ------------+-------------+-------------------------------------------------- + * tdp | tdp | Lock clock to TDP + * unlimited | tdp | Upper bound is TDP but clock may drift below this + * tdp | unlimited | Lower bound is TDP but clock may boost above this + * unlimited | unlimited | Unlocked (== nvmlDeviceResetGpuLockedClocks) + * + * If one arg takes one of these values, the other must be one of these values as + * well. Mixed numeric and symbolic calls return NVML_ERROR_INVALID_ARGUMENT. + * * Requires root/admin permissions. * * After system reboot or driver reload applications clocks go back to their default value. * See \ref nvmlDeviceResetGpuLockedClocks. * - * For newer than Pascal &tm; fully supported devices. + * For Volta &tm; or newer fully supported devices. * * @param device The identifier of the target device * @param minGpuClockMHz Requested minimum gpu clock in MHz @@ -4570,7 +4999,7 @@ nvmlReturn_t DECLDIR nvmlDeviceSetGpuLockedClocks(nvmlDevice_t device, unsigned * * @see nvmlDeviceSetGpuLockedClocks * - * For newer than Pascal &tm; fully supported devices. + * For Volta &tm; or newer fully supported devices. * * @param device The identifier of the target device * @@ -4627,7 +5056,7 @@ nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsign /** * Set new power limit of this device. - * + * * For Kepler &tm; or newer fully supported devices. * Requires root/admin permissions. * @@ -4638,8 +5067,8 @@ nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsign * * @param device The identifier of the target device * @param limit Power management limit in milliwatts to set - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a limit has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultLimit is out of range @@ -4659,17 +5088,17 @@ nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit(nvmlDevice_t device, unsi * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products. * Not supported on Quadro ® and Tesla &tm; C-class products. * Requires root/admin permissions. - * - * Changing GOMs requires a reboot. + * + * Changing GOMs requires a reboot. * The reboot requirement might be removed in the future. * * Compute only GOMs don't support graphics acceleration. Under windows switching to these GOMs when * pending driver model is WDDM is not supported. See \ref nvmlDeviceSetDriverModel. - * + * * @param device The identifier of the target device * @param mode Target GOM - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a mode has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode incorrect @@ -4688,7 +5117,7 @@ nvmlReturn_t DECLDIR nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuO * This method can be used by a root/admin user to give non-root/admin access to certain otherwise-restricted APIs. * The new setting lasts for the lifetime of the NVIDIA driver; it is not persistent. See \a nvmlDeviceGetAPIRestriction * to query the current restriction settings. - * + * * For Kepler &tm; or newer fully supported devices. * Requires root/admin permissions. * @@ -4701,7 +5130,7 @@ nvmlReturn_t DECLDIR nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuO * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a apiType incorrect * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support changing API restrictions or the device does not support - * the feature that api restrictions are being set for (E.G. Enabling/disabling auto + * the feature that api restrictions are being set for (E.G. Enabling/disabling auto * boosted clocks is not supported by the device) * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible @@ -4714,7 +5143,7 @@ nvmlReturn_t DECLDIR nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestri /** * @} */ - + /** @addtogroup nvmlAccountingStats * @{ */ @@ -4727,11 +5156,13 @@ nvmlReturn_t DECLDIR nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestri * * @note This setting is not persistent and will default to disabled after driver unloads. * Enable persistence mode to be sure the setting doesn't switch off to disabled. - * + * * @note Enabling accounting mode has no negative impact on the GPU performance. * * @note Disabling accounting clears all accounting pids information. * + * @note On MIG-enabled GPUs, accounting mode would be set to DISABLED and changing it is not supported. + * * See \ref nvmlDeviceGetAccountingMode * See \ref nvmlDeviceGetAccountingStats * See \ref nvmlDeviceClearAccountingPids @@ -4739,7 +5170,7 @@ nvmlReturn_t DECLDIR nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestri * @param device The identifier of the target device * @param mode The target accounting mode * - * @return + * @return * - \ref NVML_SUCCESS if the new mode has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a mode are invalid @@ -4761,8 +5192,8 @@ nvmlReturn_t DECLDIR nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnable * * @param device The identifier of the target device * - * @return - * - \ref NVML_SUCCESS if accounting information has been cleared + * @return + * - \ref NVML_SUCCESS if accounting information has been cleared * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device are invalid * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature @@ -4788,10 +5219,10 @@ nvmlReturn_t DECLDIR nvmlDeviceClearAccountingPids(nvmlDevice_t device); * @param device The identifier of the target device * @param link Specifies the NvLink link to be queried * @param isActive \a nvmlEnableState_t where NVML_FEATURE_ENABLED indicates that - * the link is active and NVML_FEATURE_DISABLED indicates it + * the link is active and NVML_FEATURE_DISABLED indicates it * is inactive * - * @return + * @return * - \ref NVML_SUCCESS if \a isActive has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a isActive is NULL @@ -4809,7 +5240,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int * @param link Specifies the NvLink link to be queried * @param version Requested NvLink version * - * @return + * @return * - \ref NVML_SUCCESS if \a version has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a version is NULL @@ -4830,7 +5261,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned in * @param capability Specifies the \a nvmlNvLinkCapability_t to be queried * @param capResult A boolean for the queried capability indicating that feature is available * - * @return + * @return * - \ref NVML_SUCCESS if \a capResult has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a capability is invalid or \a capResult is NULL @@ -4838,26 +5269,26 @@ nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned in * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, - nvmlNvLinkCapability_t capability, unsigned int *capResult); + nvmlNvLinkCapability_t capability, unsigned int *capResult); /** - * Retrieves the PCI information for the remote node on a NvLink link + * Retrieves the PCI information for the remote node on a NvLink link * Note: pciSubSystemId is not filled in this function and is indeterminate * * For Pascal &tm; or newer fully supported devices. * * @param device The identifier of the target device * @param link Specifies the NvLink link to be queried - * @param pci \a nvmlPciInfo_t of the remote node for the specified link + * @param pci \a nvmlPciInfo_t of the remote node for the specified link * - * @return + * @return * - \ref NVML_SUCCESS if \a pci has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a pci is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo_v2(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); /** * Retrieves the specified error counter value @@ -4870,7 +5301,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsig * @param counter Specifies the NvLink counter to be queried * @param counterValue Returned counter value * - * @return + * @return * - \ref NVML_SUCCESS if \a counter has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid or \a counterValue is NULL @@ -4889,7 +5320,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsign * @param device The identifier of the target device * @param link Specifies the NvLink link to be queried * - * @return + * @return * - \ref NVML_SUCCESS if the reset is successful * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid @@ -4899,6 +5330,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsign nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned int link); /** + * Deprecated: Setting utilization counter control is no longer supported. + * * Set the NVLINK utilization counter control information for the specified counter, 0 or 1. * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition. Performs a reset * of the counters if the reset parameter is non-zero. @@ -4911,10 +5344,10 @@ nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, uns * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to set * @param reset Resets the counters on set if non-zero * - * @return + * @return * - \ref NVML_SUCCESS if the control has been set successfully * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ @@ -4922,6 +5355,8 @@ nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, nvmlNvLinkUtilizationControl_t *control, unsigned int reset); /** + * Deprecated: Getting utilization counter control is no longer supported. + * * Get the NVLINK utilization counter control information for the specified counter, 0 or 1. * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition * @@ -4932,10 +5367,10 @@ nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, * @param link Specifies the NvLink link to be queried * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to place information * - * @return + * @return * - \ref NVML_SUCCESS if the control has been set successfully * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ @@ -4944,6 +5379,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, /** + * Deprecated: Use \ref nvmlDeviceGetFieldValues with NVML_FI_DEV_NVLINK_THROUGHPUT_* as field values instead. + * * Retrieve the NVLINK utilization counter based on the current control for a specified counter. * In general it is good practice to use \a nvmlDeviceSetNvLinkUtilizationControl * before reading the utilization counters as they have no default state @@ -4956,18 +5393,20 @@ nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, * @param rxcounter Receive counter return value * @param txcounter Transmit counter return value * - * @return + * @return * - \ref NVML_SUCCESS if \a rxcounter and \a txcounter have been successfully set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, or \a link is invalid or \a rxcounter or \a txcounter are NULL * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter, +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter, unsigned long long *rxcounter, unsigned long long *txcounter); /** - * Freeze the NVLINK utilization counters + * Deprecated: Freezing NVLINK utilization counters is no longer supported. + * + * Freeze the NVLINK utilization counters * Both the receive and transmit counters are operated on by this function * * For Pascal &tm; or newer fully supported devices. @@ -4978,18 +5417,20 @@ nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, * @param freeze NVML_FEATURE_ENABLED = freeze the receive and transmit counters * NVML_FEATURE_DISABLED = unfreeze the receive and transmit counters * - * @return + * @return * - \ref NVML_SUCCESS if counters were successfully frozen or unfrozen * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, \a counter, or \a freeze is invalid * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, +nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, unsigned int counter, nvmlEnableState_t freeze); /** - * Reset the NVLINK utilization counters + * Deprecated: Resetting NVLINK utilization counters is no longer supported. + * + * Reset the NVLINK utilization counters * Both the receive and transmit counters are operated on by this function * * For Pascal &tm; or newer fully supported devices. @@ -4998,7 +5439,7 @@ nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t devi * @param link Specifies the NvLink link to be reset * @param counter Specifies the counter that should be reset (0 or 1) * - * @return + * @return * - \ref NVML_SUCCESS if counters were successfully reset * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid @@ -5011,7 +5452,7 @@ nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkUtilizationCounter (nvmlDevice_t devic /***************************************************************************************************/ /** @defgroup nvmlEvents Event Handling Methods - * This chapter describes methods that NVML can perform against each device to register and wait for + * This chapter describes methods that NVML can perform against each device to register and wait for * some event to occur. * @{ */ @@ -5023,13 +5464,13 @@ nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkUtilizationCounter (nvmlDevice_t devic * * For Fermi &tm; or newer fully supported devices. * @param set Reference in which to return the event handle - * - * @return + * + * @return * - \ref NVML_SUCCESS if the event has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a set is NULL * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * + * * @see nvmlEventSetFree */ nvmlReturn_t DECLDIR nvmlEventSetCreate(nvmlEventSet_t *set); @@ -5047,7 +5488,7 @@ nvmlReturn_t DECLDIR nvmlEventSetCreate(nvmlEventSet_t *set); * * This call starts recording of events on specific device. * All events that occurred before this call are not recorded. - * Checking if some event occurred can be done with \ref nvmlEventSetWait + * Checking if some event occurred can be done with \ref nvmlEventSetWait_v2 * * If function reports NVML_ERROR_UNKNOWN, event set is in undefined state and should be freed. * If function reports NVML_ERROR_NOT_SUPPORTED, event set can still be used. None of the requested eventTypes @@ -5056,15 +5497,15 @@ nvmlReturn_t DECLDIR nvmlEventSetCreate(nvmlEventSet_t *set); * @param device The identifier of the target device * @param eventTypes Bitmask of \ref nvmlEventType to record * @param set Set to which add new event types - * - * @return + * + * @return * - \ref NVML_SUCCESS if the event has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is invalid or \a set is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if the platform does not support this feature or some of requested event types * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * + * * @see nvmlEventType * @see nvmlDeviceGetSupportedEventTypes * @see nvmlEventSetWait @@ -5081,14 +5522,14 @@ nvmlReturn_t DECLDIR nvmlDeviceRegisterEvents(nvmlDevice_t device, unsigned long * * @param device The identifier of the target device * @param eventTypes Reference in which to return bitmask of supported events - * - * @return + * + * @return * - \ref NVML_SUCCESS if the eventTypes has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventType is NULL * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * + * * @see nvmlEventType * @see nvmlDeviceRegisterEvents */ @@ -5100,43 +5541,51 @@ nvmlReturn_t DECLDIR nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsig * For Fermi &tm; or newer fully supported devices. * * If some events are ready to be delivered at the time of the call, function returns immediately. - * If there are no events ready to be delivered, function sleeps till event arrives + * If there are no events ready to be delivered, function sleeps till event arrives * but not longer than specified timeout. This function in certain conditions can return before * specified timeout passes (e.g. when interrupt arrives) - * - * In case of xid error, the function returns the most recent xid error type seen by the system. If there are multiple - * xid errors generated before nvmlEventSetWait is invoked then the last seen xid error type is returned for all - * xid error events. - * + * + * On Windows, in case of xid error, the function returns the most recent xid error type seen by the system. + * If there are multiple xid errors generated before nvmlEventSetWait is invoked then the last seen xid error + * type is returned for all xid error events. + * + * On Linux, every xid error event would return the associated event data and other information if applicable. + * + * In MIG mode, if device handle is provided, the API reports all the events for the available instances, + * only if the caller has appropriate privileges. In absence of required privileges, only the events which + * affect all the instances (i.e. whole device) are reported. + * + * This API does not currently support per-instance event reporting using MIG device handles. + * * @param set Reference to set of events to wait on * @param data Reference in which to return event data * @param timeoutms Maximum amount of wait time in milliseconds for registered event - * - * @return + * + * @return * - \ref NVML_SUCCESS if the data has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a data is NULL * - \ref NVML_ERROR_TIMEOUT if no event arrived in specified timeout or interrupt arrived * - \ref NVML_ERROR_GPU_IS_LOST if a GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * + * * @see nvmlEventType * @see nvmlDeviceRegisterEvents */ -nvmlReturn_t DECLDIR nvmlEventSetWait(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); +nvmlReturn_t DECLDIR nvmlEventSetWait_v2(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); /** * Releases events in the set * * For Fermi &tm; or newer fully supported devices. * - * @param set Reference to events to be released - * - * @return + * @param set Reference to events to be released + * + * @return * - \ref NVML_SUCCESS if the event has been successfully released * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * + * * @see nvmlDeviceRegisterEvents */ nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set); @@ -5144,7 +5593,7 @@ nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set); /** @} */ /***************************************************************************************************/ -/** @defgroup nvmlZPI Drain states +/** @defgroup nvmlZPI Drain states * This chapter describes methods that NVML can perform against each device to control their drain state * and recognition by NVML and NVIDIA kernel driver. These methods can be used with out-of-band tools to * power on/off GPUs, enable robust reset scenarios, etc. @@ -5158,14 +5607,14 @@ nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set); * this call is made. * Must be called as administrator. * For Linux only. - * + * * For Pascal &tm; or newer fully supported devices. * Some Kepler devices supported. * * @param pciInfo The PCI address of the GPU drain state to be modified * @param newState The drain state that should be entered, see \ref nvmlEnableState_t * - * @return + * @return * - \ref NVML_SUCCESS if counters were successfully reset * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex or \a newState is invalid @@ -5180,14 +5629,14 @@ nvmlReturn_t DECLDIR nvmlDeviceModifyDrainState (nvmlPciInfo_t *pciInfo, nvmlEna * Query the drain state of a GPU. This method is used to check if a GPU is in a currently draining * state. * For Linux only. - * + * * For Pascal &tm; or newer fully supported devices. * Some Kepler devices supported. * * @param pciInfo The PCI address of the GPU drain state to be queried * @param currentState The current drain state for this GPU, see \ref nvmlEnableState_t * - * @return + * @return * - \ref NVML_SUCCESS if counters were successfully reset * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex or \a currentState is invalid @@ -5201,14 +5650,14 @@ nvmlReturn_t DECLDIR nvmlDeviceQueryDrainState (nvmlPciInfo_t *pciInfo, nvmlEnab * as long as no other processes are attached. If other processes are attached, this call will return * NVML_ERROR_IN_USE and the GPU will be returned to its original "draining" state. Note: the * only situation where a process can still be attached after nvmlDeviceModifyDrainState() is called - * to initiate the draining state is if that process was using, and is still using, a GPU before the + * to initiate the draining state is if that process was using, and is still using, a GPU before the * call was made. Also note, persistence mode counts as an attachment to the GPU thus it must be disabled * prior to this call. * * For long-running NVML processes please note that this will change the enumeration of current GPUs. * For example, if there are four GPUs present and GPU1 is removed, the new enumeration will be 0-2. * Also, device handles after the removed GPU will not be valid and must be re-established. - * Must be run as administrator. + * Must be run as administrator. * For Linux only. * * For Pascal &tm; or newer fully supported devices. @@ -5226,11 +5675,11 @@ nvmlReturn_t DECLDIR nvmlDeviceQueryDrainState (nvmlPciInfo_t *pciInfo, nvmlEnab * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_IN_USE if the device is still in use and cannot be removed */ -nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu (nvmlPciInfo_t *pciInfo, nvmlDetachGpuState_t gpuState, nvmlPcieLinkState_t linkState); +nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu_v2(nvmlPciInfo_t *pciInfo, nvmlDetachGpuState_t gpuState, nvmlPcieLinkState_t linkState); /** * Request the OS and the NVIDIA kernel driver to rediscover a portion of the PCI subsystem looking for GPUs that - * were previously removed. The portion of the PCI tree can be narrowed by specifying a domain, bus, and device. + * were previously removed. The portion of the PCI tree can be narrowed by specifying a domain, bus, and device. * If all are zeroes then the entire PCI tree will be searched. Please note that for long-running NVML processes * the enumeration will change based on how many GPUs are discovered and where they are inserted in bus order. * @@ -5239,14 +5688,14 @@ nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu (nvmlPciInfo_t *pciInfo, nvmlDetachGpuS * * Must be run as administrator. * For Linux only. - * + * * For Pascal &tm; or newer fully supported devices. * Some Kepler devices supported. * * @param pciInfo The PCI tree to be searched. Only the domain, bus, and device * fields are used in this call. * - * @return + * @return * - \ref NVML_SUCCESS if counters were successfully reset * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo is invalid @@ -5288,8 +5737,15 @@ nvmlReturn_t DECLDIR nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCou /** @} */ /***************************************************************************************************/ -/** @defgroup nvmlGridQueries Grid Queries - * This chapter describes NVML operations that are associated with NVIDIA GRID products. +/** @defgroup gridVirtual GRID Virtualization Enums, Constants and Structs + * @{ + */ +/** @} */ +/***************************************************************************************************/ + +/***************************************************************************************************/ +/** @defgroup nvmlGridQueries GRID Virtualization APIs + * This chapter describes operations that are associated with NVIDIA GRID products. * @{ */ /***************************************************************************************************/ @@ -5301,8 +5757,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCou * * @param device Identifier of the target device * @param pVirtualMode Reference to virtualization mode. One of NVML_GPU_VIRTUALIZATION_? - * - * @return + * + * @return * - \ref NVML_SUCCESS if \a pVirtualMode is fetched * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL @@ -5311,14 +5767,23 @@ nvmlReturn_t DECLDIR nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCou */ nvmlReturn_t DECLDIR nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t *pVirtualMode); -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlGridCommands Grid Commands - * This chapter describes NVML operations that are associated with NVIDIA GRID products. - * @{ +/** + * Queries if SR-IOV host operation is supported on a vGPU supported device. + * + * Checks whether SR-IOV host capability is supported by the device and the + * driver, and indicates device is in SR-IOV mode if both of these conditions + * are true. + * + * @param device The identifier of the target device + * @param pHostVgpuMode Reference in which to return the current vGPU mode + * + * @return + * - \ref NVML_SUCCESS if device's vGPU mode has been successfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle is 0 or \a pVgpuMode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature. + * - \ref NVML_ERROR_UNKNOWN if any unexpected error occurred */ -/***************************************************************************************************/ +nvmlReturn_t DECLDIR nvmlDeviceGetHostVgpuMode(nvmlDevice_t device, nvmlHostVgpuMode_t *pHostVgpuMode); /** * This method is used to set the virtualization mode corresponding to the GPU. @@ -5328,7 +5793,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGp * @param device Identifier of the target device * @param virtualMode virtualization mode. One of NVML_GPU_VIRTUALIZATION_? * - * @return + * @return * - \ref NVML_SUCCESS if \a pVirtualMode is set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL @@ -5338,13 +5803,71 @@ nvmlReturn_t DECLDIR nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGp */ nvmlReturn_t DECLDIR nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t virtualMode); +/** + * Retrieve the GRID licensable features. + * + * Identifies whether the system supports GRID Software Licensing. If it does, return the list of licensable feature(s) + * and their current license status. + * + * @param device Identifier of the target device + * @param pGridLicensableFeatures Pointer to structure in which GRID licensable features are returned + * + * @return + * - \ref NVML_SUCCESS if licensable features are successfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pGridLicensableFeatures is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v3(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); + +/** + * Retrieves the current utilization and process ID + * + * For Maxwell &tm; or newer fully supported devices. + * + * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running. + * Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at + * by \a utilization. One utilization sample structure is returned per process running, that had some non-zero utilization + * during the last sample period. It includes the CPU timestamp at which the samples were recorded. Individual utilization values + * are returned as "unsigned int" values. + * + * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with + * \a utilization set to NULL. The caller should allocate a buffer of size + * processSamplesCount * sizeof(nvmlProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed + * in \a utilization, and \a processSamplesCount set to the number of entries the buffer is sized for. + * + * On successful return, the function updates \a processSamplesCount with the number of process utilization sample + * structures that were actually written. This may differ from a previously read value as instances are created or + * destroyed. + * + * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 + * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp + * to a timeStamp retrieved from a previous query to read utilization since the previous query. + * + * @note On MIG-enabled GPUs, querying process utilization is not currently supported. + * + * @param device The identifier of the target device + * @param utilization Pointer to caller-supplied buffer in which guest process utilization samples are returned + * @param processSamplesCount Pointer to caller-supplied array size, and returns number of processes running + * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. + + * @return + * - \ref NVML_SUCCESS if \a utilization has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization, + unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp); + /** @} */ /***************************************************************************************************/ -/** @defgroup nvmlVgpu vGPU Management +/** @defgroup nvmlVgpu GRID vGPU Management * @{ * - * Set of APIs supporting GRID vGPU + * This chapter describes APIs supporting NVIDIA GRID vGPU. */ /***************************************************************************************************/ @@ -5369,7 +5892,6 @@ nvmlReturn_t DECLDIR nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGp * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL or \a device is invalid * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_VGPU_ECC_NOT_SUPPORTED if ECC is enabled on the device * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ nvmlReturn_t DECLDIR nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); @@ -5399,7 +5921,6 @@ nvmlReturn_t DECLDIR nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned i * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_VGPU_ECC_NOT_SUPPORTED if ECC is enabled on the device * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ nvmlReturn_t DECLDIR nvmlDeviceGetCreatableVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); @@ -5670,7 +6191,7 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetUUID(nvmlVgpuInstance_t vgpuInstance, ch * See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE. * * nvmlVgpuInstanceGetVmDriverVersion() may be called at any time for a vGPU instance. The guest VM driver version is - * returned as "Unknown" if no NVIDIA driver is installed in the VM, or the VM has not yet booted to the point where the + * returned as "Not Available" if no NVIDIA driver is installed in the VM, or the VM has not yet booted to the point where the * NVIDIA driver is loaded and initialized. * * For Kepler &tm; or newer fully supported devices. @@ -5766,6 +6287,22 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nv */ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuInstance, unsigned int *frameRateLimit); +/** + * Retrieve the current ECC mode of vGPU instance. + * + * @param vgpuInstance The identifier of the target vGPU instance + * @param eccMode Reference in which to return the current ECC mode + * + * @return + * - \ref NVML_SUCCESS if the vgpuInstance's ECC mode has been successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mode is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEccMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t *eccMode); + /** * Retrieve the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100. * @@ -5801,156 +6338,45 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuInstance_t vgpuI nvmlReturn_t DECLDIR nvmlVgpuInstanceSetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int encoderCapacity); /** - * Retrieves current utilization for vGPUs on a physical GPU (device). - * - * For Kepler &tm; or newer fully supported devices. - * - * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for vGPU instances running - * on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer - * pointed at by \a utilizationSamples. One utilization sample structure is returned per vGPU instance, and includes the - * CPU timestamp at which the samples were recorded. Individual utilization values are returned as "unsigned int" values - * in nvmlValue_t unions. The function sets the caller-supplied \a sampleValType to NVML_VALUE_TYPE_UNSIGNED_INT to - * indicate the returned value type. - * - * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with - * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance - * count in \a vgpuInstanceSamplesCount, or NVML_SUCCESS if the current vGPU instance count is zero. The caller should allocate - * a buffer of size vgpuInstanceSamplesCount * sizeof(nvmlVgpuInstanceUtilizationSample_t). Invoke the function again with - * the allocated buffer passed in \a utilizationSamples, and \a vgpuInstanceSamplesCount set to the number of entries the - * buffer is sized for. + * Retrieves the current encoder statistics of a vGPU Instance * - * On successful return, the function updates \a vgpuInstanceSampleCount with the number of vGPU utilization sample - * structures that were actually written. This may differ from a previously read value as vGPU instances are created or - * destroyed. + * For Maxwell &tm; or newer fully supported devices. * - * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 - * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp - * to a timeStamp retrieved from a previous query to read utilization since the previous query. + * @param vgpuInstance Identifier of the target vGPU instance + * @param sessionCount Reference to an unsigned int for count of active encoder sessions + * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions + * @param averageLatency Reference to an unsigned int for encode latency in microseconds * - * @param device The identifier for the target device - * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. - * @param sampleValType Pointer to caller-supplied buffer to hold the type of returned sample values - * @param vgpuInstanceSamplesCount Pointer to caller-supplied array size, and returns number of vGPU instances - * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU utilization samples are returned - * @return - * - \ref NVML_SUCCESS if utilization samples are successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuInstanceSamplesCount or \a sampleValType is - * NULL, or a sample count of 0 is passed with a non-NULL \a utilizationSamples - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuInstanceSamplesCount is too small to return samples for all - * vGPU instances currently executing on the device - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount , or \a averageFps or \a averageLatency is NULL + * or \a vgpuInstance is 0. + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, - nvmlValueType_t *sampleValType, unsigned int *vgpuInstanceSamplesCount, - nvmlVgpuInstanceUtilizationSample_t *utilizationSamples); +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, + unsigned int *averageFps, unsigned int *averageLatency); /** - * Retrieves current utilization for processes running on vGPUs on a physical GPU (device). - * - * For Maxwell &tm; or newer fully supported devices. + * Retrieves information about all active encoder sessions on a vGPU Instance. * - * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running on - * vGPU instances active on a device. Utilization values are returned as an array of utilization sample structures in the - * caller-supplied buffer pointed at by \a utilizationSamples. One utilization sample structure is returned per process running - * on vGPU instances, that had some non-zero utilization during the last sample period. It includes the CPU timestamp at which - * the samples were recorded. Individual utilization values are returned as "unsigned int" values. + * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The + * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions + * written to the buffer. * - * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with - * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance - * count in \a vgpuProcessSamplesCount. The caller should allocate a buffer of size - * vgpuProcessSamplesCount * sizeof(nvmlVgpuProcessUtilizationSample_t). Invoke the function again with - * the allocated buffer passed in \a utilizationSamples, and \a vgpuProcessSamplesCount set to the number of entries the - * buffer is sized for. + * If the supplied buffer is not large enough to accomodate the active session array, the function returns + * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. + * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return + * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. * - * On successful return, the function updates \a vgpuSubProcessSampleCount with the number of vGPU sub process utilization sample - * structures that were actually written. This may differ from a previously read value depending on the number of processes that are active - * in any given sample period. + * For Maxwell &tm; or newer fully supported devices. * - * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 - * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp - * to a timeStamp retrieved from a previous query to read utilization since the previous query. - * - * @param device The identifier for the target device - * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. - * @param vgpuProcessSamplesCount Pointer to caller-supplied array size, and returns number of processes running on vGPU instances - * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU sub process utilization samples are returned - - * @return - * - \ref NVML_SUCCESS if utilization samples are successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuProcessSamplesCount or a sample count of 0 is - * passed with a non-NULL \a utilizationSamples - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuProcessSamplesCount is too small to return samples for all - * vGPU instances currently executing on the device - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, - unsigned int *vgpuProcessSamplesCount, - nvmlVgpuProcessUtilizationSample_t *utilizationSamples); -/** - * Retrieve the GRID licensable features. - * - * Identifies whether the system supports GRID Software Licensing. If it does, return the list of licensable feature(s) - * and their current license status. - * - * @param device Identifier of the target device - * @param pGridLicensableFeatures Pointer to structure in which GRID licensable features are returned - * - * @return - * - \ref NVML_SUCCESS if licensable features are successfully retrieved - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pGridLicensableFeatures is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); - -/** - * Retrieves the current encoder statistics of a vGPU Instance - * - * For Maxwell &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param sessionCount Reference to an unsigned int for count of active encoder sessions - * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions - * @param averageLatency Reference to an unsigned int for encode latency in microseconds - * - * @return - * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount , or \a averageFps or \a averageLatency is NULL - * or \a vgpuInstance is 0. - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, - unsigned int *averageFps, unsigned int *averageLatency); - -/** - * Retrieves information about all active encoder sessions on a vGPU Instance. - * - * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The - * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions - * written to the buffer. - * - * If the supplied buffer is not large enough to accomodate the active session array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. - * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return - * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. - * - * For Maxwell &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param sessionCount Reference to caller supplied array size, and returns - * the number of sessions. - * @param sessionInfo Reference to caller supplied array in which the list - * of session information us returned. + * @param vgpuInstance Identifier of the target vGPU instance + * @param sessionCount Reference to caller supplied array size, and returns + * the number of sessions. + * @param sessionInfo Reference to caller supplied array in which the list + * of session information us returned. * * @return * - \ref NVML_SUCCESS if \a sessionInfo is fetched @@ -6011,146 +6437,22 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCStats(nvmlVgpuInstance_t vgpuInstance */ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlFBCSessionInfo_t *sessionInfo); -/** - * Retrieves the current utilization and process ID - * - * For Maxwell &tm; or newer fully supported devices. - * - * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running. - * Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at - * by \a utilization. One utilization sample structure is returned per process running, that had some non-zero utilization - * during the last sample period. It includes the CPU timestamp at which the samples were recorded. Individual utilization values - * are returned as "unsigned int" values. - * - * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with - * \a utilization set to NULL. The caller should allocate a buffer of size - * processSamplesCount * sizeof(nvmlProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed - * in \a utilization, and \a processSamplesCount set to the number of entries the buffer is sized for. - * - * On successful return, the function updates \a processSamplesCount with the number of process utilization sample - * structures that were actually written. This may differ from a previously read value as instances are created or - * destroyed. - * - * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 - * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp - * to a timeStamp retrieved from a previous query to read utilization since the previous query. - * - * @param device The identifier of the target device - * @param utilization Pointer to caller-supplied buffer in which guest process utilization samples are returned - * @param processSamplesCount Pointer to caller-supplied array size, and returns number of processes running - * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. - - * @return - * - \ref NVML_SUCCESS if \a utilization has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization, - unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp); - -/** - * Queries the state of per process accounting mode on vGPU. - * - * For Maxwell &tm; or newer fully supported devices. - * - * @param vgpuInstance The identifier of the target vGPU VM - * @param mode Reference in which to return the current accounting mode - * - * @return - * - \ref NVML_SUCCESS if the mode has been successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mode is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t *mode); - -/** - * Queries list of processes running on vGPU that can be queried for accounting stats. The list of processes - * returned can be in running or terminated state. - * - * For Maxwell &tm; or newer fully supported devices. - * - * To just query the maximum number of processes that can be queried, call this function with *count = 0 and - * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty. - * - * For more details see \ref nvmlVgpuInstanceGetAccountingStats. - * - * @note In case of PID collision some processes might not be accessible before the circular buffer is full. - * - * @param vgpuInstance The identifier of the target vGPU VM - * @param count Reference in which to provide the \a pids array size, and - * to return the number of elements ready to be queried - * @param pids Reference in which to return list of process ids - * - * @return - * - \ref NVML_SUCCESS if pids were successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a count is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to expected value) - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlVgpuInstanceGetAccountingPids - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingPids(nvmlVgpuInstance_t vgpuInstance, unsigned int *count, unsigned int *pids); - -/** - * Queries process's accounting stats. - * - * For Maxwell &tm; or newer fully supported devices. - * - * Accounting stats capture GPU utilization and other statistics across the lifetime of a process, and - * can be queried during life time of the process or after its termination. - * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and - * updated to actual running time after its termination. - * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old - * processes. - * - * See \ref nvmlAccountingStats_t for description of each returned metric. - * List of processes that can be queried can be retrieved from \ref nvmlVgpuInstanceGetAccountingPids. - * - * @note Accounting Mode needs to be on. See \ref nvmlVgpuInstanceGetAccountingMode. - * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be - * queried since they don't contribute to GPU utilization. - * @note In case of pid collision stats of only the latest process (that terminated last) will be reported - * - * @param vgpuInstance The identifier of the target vGPU VM - * @param pid Process Id of the target process to query stats for - * @param stats Reference in which to return the process's accounting stats - * - * @return - * - \ref NVML_SUCCESS if stats have been successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a stats is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * or \a stats is not found - * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingStats(nvmlVgpuInstance_t vgpuInstance, unsigned int pid, nvmlAccountingStats_t *stats); - /** @} */ /***************************************************************************************************/ -/** @defgroup nvml vGPU Migration - * This chapter describes NVML operations that are associated with vGPU Migration. +/** @defgroup nvml GRID Virtualization Migration + * This chapter describes operations that are associated with vGPU Migration. * @{ */ /***************************************************************************************************/ /** - * Structure representing a range of vGPU version + * Structure representing range of vGPU versions. */ typedef struct nvmlVgpuVersion_st { - unsigned int minVersion; //!< Minimum vGPU version. - unsigned int maxVersion; //!< Maximum vGPU version. + unsigned int minVersion; //!< Minimum vGPU version. + unsigned int maxVersion; //!< Maximum vGPU version. } nvmlVgpuVersion_t; /** @@ -6163,7 +6465,8 @@ typedef struct nvmlVgpuMetadata_st nvmlVgpuGuestInfoState_t guestInfoState; //!< Current state of Guest-dependent fields char guestDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in guest char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in host - unsigned int reserved[7]; //!< Reserved for internal use + unsigned int reserved[6]; //!< Reserved for internal use + unsigned int vgpuVirtualizationCaps; //!< vGPU virtualizaion capabilities bitfileld unsigned int guestVgpuVersion; //!< vGPU version of guest driver unsigned int opaqueDataSize; //!< Size of opaque data field in bytes char opaqueData[4]; //!< Opaque data @@ -6293,100 +6596,996 @@ nvmlReturn_t DECLDIR nvmlDeviceGetVgpuMetadata(nvmlDevice_t device, nvmlVgpuPgpu nvmlReturn_t DECLDIR nvmlGetVgpuCompatibility(nvmlVgpuMetadata_t *vgpuMetadata, nvmlVgpuPgpuMetadata_t *pgpuMetadata, nvmlVgpuPgpuCompatibility_t *compatibilityInfo); /** - * Returns the following two version range structures \ref nvmlVgpuVersion_t : - * 1. \a supported : structure representing the range of vGPU versions supported by the host; - * 2. \a current : structure representing the range of supported versions enforced by the caller via \ref nvmlSetVgpuVersion(). - * - * The caller pass in the pointer to the structures, into which the compatible ranges are written. + * Returns the properties of the physical GPU indicated by the device in an ascii-encoded string format. + * + * The caller passes in a buffer via \a pgpuMetadata, with the size of the buffer in \a bufferSize. If the + * string is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed + * in \a bufferSize. + * + * @param device The identifier of the target device + * @param pgpuMetadata Pointer to caller-supplied buffer into which \a pgpuMetadata is written + * @param bufferSize Pointer to size of \a pgpuMetadata buffer + * + * @return + * - \ref NVML_SUCCESS GPU metadata structure was successfully returned + * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a pgpuMetadata buffer is too small, required size is returned in \a bufferSize + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a device is invalid; if \a pgpuMetadata is NULL and the value of \a bufferSize is not 0. + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPgpuMetadataString(nvmlDevice_t device, char *pgpuMetadata, unsigned int *bufferSize); + +/* + * Virtual GPU (vGPU) version + * + * The NVIDIA vGPU Manager and the guest drivers are tagged with a range of supported vGPU versions. This determines the range of NVIDIA guest driver versions that + * are compatible for vGPU feature support with a given NVIDIA vGPU Manager. For vGPU feature support, the range of supported versions for the NVIDIA vGPU Manager + * and the guest driver must overlap. Otherwise, the guest driver fails to load in the VM. + * + * When the NVIDIA guest driver loads, either when the VM is booted or when the driver is installed or upgraded, a negotiation occurs between the guest driver + * and the NVIDIA vGPU Manager to select the highest mutually compatible vGPU version. The negotiated vGPU version stays the same across VM migration. + */ + +/** + * Query the ranges of supported vGPU versions. * - * @note: 1. The guest driver will fail to load if the version is below the range returned in the \a current structure. - * 2. If the guest driver is above the range, it will be downgraded to the current structure maximum version. + * This function gets the linear range of supported vGPU versions that is preset for the NVIDIA vGPU Manager and the range set by an administrator. + * If the preset range has not been overridden by \ref nvmlSetVgpuVersion, both ranges are the same. * - * @param supported Pointer to caller-supplied structure into which the supported vGPU version range is returned - * @param current Pointer to caller-supplied structure into which the caller enforced supported vGPU version range is returned. + * The caller passes pointers to the following \ref nvmlVgpuVersion_t structures, into which the NVIDIA vGPU Manager writes the ranges: + * 1. \a supported structure that represents the preset range of vGPU versions supported by the NVIDIA vGPU Manager. + * 2. \a current structure that represents the range of supported vGPU versions set by an administrator. By default, this range is the same as the preset range. + * + * @param supported Pointer to the structure in which the preset range of vGPU versions supported by the NVIDIA vGPU Manager is written + * @param current Pointer to the structure in which the range of supported vGPU versions set by an administrator is written * * @return - * - \ref NVML_SUCCESS vGPU version range structure was successfully returned - * - \ref NVML_ERROR_NOT_SUPPORTED API not supported - * - \ref NVML_ERROR_UNKNOWN Error while getting the data + * - \ref NVML_SUCCESS The vGPU version range structures were successfully obtained. + * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported. + * - \ref NVML_ERROR_INVALID_ARGUMENT The \a supported parameter or the \a current parameter is NULL. + * - \ref NVML_ERROR_UNKNOWN An error occurred while the data was being fetched. */ nvmlReturn_t DECLDIR nvmlGetVgpuVersion(nvmlVgpuVersion_t *supported, nvmlVgpuVersion_t *current); /** - * Takes a vGPU version range structure \ref nvmlVgpuVersion_t and set the vGPU compatible version range to the one provided as input. - * The caller should call the \ref nvmlGetVgpuVersion() to get the range of supported version by the host driver. + * Override the preset range of vGPU versions supported by the NVIDIA vGPU Manager with a range set by an administrator. + * + * This function configures the NVIDIA vGPU Manager with a range of supported vGPU versions set by an administrator. This range must be a subset of the + * preset range that the NVIDIA vGPU Manager supports. The custom range set by an administrator takes precedence over the preset range and is advertised to + * the guest VM for negotiating the vGPU version. See \ref nvmlGetVgpuVersion for details of how to query the preset range of versions supported. * - * @note: 1. The guest driver will fail to load if the version is below the range set via \a vgpuVersion structure. - * 2. If the guest driver is above the range, it will be downgraded to the \a vgpuVersion structure maximum version. - * 3. This will result error if there are VMs already active on the host or the supported range being set is outside the range supported by host driver. + * This function takes a pointer to vGPU version range structure \ref nvmlVgpuVersion_t as input to override the preset vGPU version range that the NVIDIA vGPU Manager supports. * - * @param vgpuVersion Pointer to caller-supplied vGPU supported version range. + * After host system reboot or driver reload, the range of supported versions reverts to the range that is preset for the NVIDIA vGPU Manager. + * + * @note 1. The range set by the administrator must be a subset of the preset range that the NVIDIA vGPU Manager supports. Otherwise, an error is returned. + * 2. If the range of supported guest driver versions does not overlap the range set by the administrator, the guest driver fails to load. + * 3. If the range of supported guest driver versions overlaps the range set by the administrator, the guest driver will load with a negotiated + * vGPU version that is the maximum value in the overlapping range. + * 4. No VMs must be running on the host when this function is called. If a VM is running on the host, the call to this function fails. + * + * @param vgpuVersion Pointer to a caller-supplied range of supported vGPU versions. * * @return - * - \ref NVML_SUCCESS vGPU metadata structure was successfully returned - * - \ref NVML_ERROR_NOT_SUPPORTED API not supported - * - \ref NVML_ERROR_IN_USE Range not set as VM is running on the host - * - \ref NVML_ERROR_INVALID_ARGUMENT Range being set is outside the range supported by host driver + * - \ref NVML_SUCCESS The preset range of supported vGPU versions was successfully overridden. + * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported. + * - \ref NVML_ERROR_IN_USE The range was not overridden because a VM is running on the host. + * - \ref NVML_ERROR_INVALID_ARGUMENT The \a vgpuVersion parameter specifies a range that is outside the range supported by the NVIDIA vGPU Manager or if \a vgpuVersion is NULL. */ nvmlReturn_t DECLDIR nvmlSetVgpuVersion(nvmlVgpuVersion_t *vgpuVersion); /** @} */ /***************************************************************************************************/ -/** @defgroup nvmlGpuBlacklistQueries GPU Blacklist Queries - * This chapter describes NVML operations that are associated with blacklisted GPUs. +/** @defgroup nvmlUtil GRID Virtualization Utilization and Accounting + * This chapter describes operations that are associated with vGPU Utilization and Accounting. * @{ */ /***************************************************************************************************/ /** - * Blacklist GPU device information - **/ -typedef struct nvmlBlacklistDeviceInfo_st -{ - nvmlPciInfo_t pciInfo; //!< The PCI information for the blacklisted GPU - char uuid[NVML_DEVICE_UUID_BUFFER_SIZE]; //!< The ASCII string UUID for the blacklisted GPU -} nvmlBlacklistDeviceInfo_t; - - /** - * Retrieves the number of blacklisted GPU devices in the system. - * - * For all products. + * Retrieves current utilization for vGPUs on a physical GPU (device). * - * @param deviceCount Reference in which to return the number of blacklisted devices - * - * @return - * - \ref NVML_SUCCESS if \a deviceCount has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a deviceCount is NULL - */ -nvmlReturn_t DECLDIR nvmlGetBlacklistDeviceCount(unsigned int *deviceCount); - -/** - * Acquire the device information for a blacklisted device, based on its index. - * - * For all products. + * For Kepler &tm; or newer fully supported devices. * - * Valid indices are derived from the \a deviceCount returned by - * \ref nvmlGetBlacklistDeviceCount(). For example, if \a deviceCount is 2 the valid indices - * are 0 and 1, corresponding to GPU 0 and GPU 1. + * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for vGPU instances running + * on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer + * pointed at by \a utilizationSamples. One utilization sample structure is returned per vGPU instance, and includes the + * CPU timestamp at which the samples were recorded. Individual utilization values are returned as "unsigned int" values + * in nvmlValue_t unions. The function sets the caller-supplied \a sampleValType to NVML_VALUE_TYPE_UNSIGNED_INT to + * indicate the returned value type. * - * @param index The index of the target GPU, >= 0 and < \a deviceCount - * @param info Reference in which to return the device information - * - * @return - * - \ref NVML_SUCCESS if \a device has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a info is NULL + * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with + * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance + * count in \a vgpuInstanceSamplesCount, or NVML_SUCCESS if the current vGPU instance count is zero. The caller should allocate + * a buffer of size vgpuInstanceSamplesCount * sizeof(nvmlVgpuInstanceUtilizationSample_t). Invoke the function again with + * the allocated buffer passed in \a utilizationSamples, and \a vgpuInstanceSamplesCount set to the number of entries the + * buffer is sized for. * - * @see nvmlGetBlacklistDeviceCount - */ -nvmlReturn_t DECLDIR nvmlGetBlacklistDeviceInfoByIndex(unsigned int index, nvmlBlacklistDeviceInfo_t *info); + * On successful return, the function updates \a vgpuInstanceSampleCount with the number of vGPU utilization sample + * structures that were actually written. This may differ from a previously read value as vGPU instances are created or + * destroyed. + * + * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 + * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp + * to a timeStamp retrieved from a previous query to read utilization since the previous query. + * + * @param device The identifier for the target device + * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. + * @param sampleValType Pointer to caller-supplied buffer to hold the type of returned sample values + * @param vgpuInstanceSamplesCount Pointer to caller-supplied array size, and returns number of vGPU instances + * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU utilization samples are returned + + * @return + * - \ref NVML_SUCCESS if utilization samples are successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuInstanceSamplesCount or \a sampleValType is + * NULL, or a sample count of 0 is passed with a non-NULL \a utilizationSamples + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuInstanceSamplesCount is too small to return samples for all + * vGPU instances currently executing on the device + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, + nvmlValueType_t *sampleValType, unsigned int *vgpuInstanceSamplesCount, + nvmlVgpuInstanceUtilizationSample_t *utilizationSamples); + +/** + * Retrieves current utilization for processes running on vGPUs on a physical GPU (device). + * + * For Maxwell &tm; or newer fully supported devices. + * + * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running on + * vGPU instances active on a device. Utilization values are returned as an array of utilization sample structures in the + * caller-supplied buffer pointed at by \a utilizationSamples. One utilization sample structure is returned per process running + * on vGPU instances, that had some non-zero utilization during the last sample period. It includes the CPU timestamp at which + * the samples were recorded. Individual utilization values are returned as "unsigned int" values. + * + * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with + * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance + * count in \a vgpuProcessSamplesCount. The caller should allocate a buffer of size + * vgpuProcessSamplesCount * sizeof(nvmlVgpuProcessUtilizationSample_t). Invoke the function again with + * the allocated buffer passed in \a utilizationSamples, and \a vgpuProcessSamplesCount set to the number of entries the + * buffer is sized for. + * + * On successful return, the function updates \a vgpuSubProcessSampleCount with the number of vGPU sub process utilization sample + * structures that were actually written. This may differ from a previously read value depending on the number of processes that are active + * in any given sample period. + * + * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 + * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp + * to a timeStamp retrieved from a previous query to read utilization since the previous query. + * + * @param device The identifier for the target device + * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. + * @param vgpuProcessSamplesCount Pointer to caller-supplied array size, and returns number of processes running on vGPU instances + * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU sub process utilization samples are returned + + * @return + * - \ref NVML_SUCCESS if utilization samples are successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuProcessSamplesCount or a sample count of 0 is + * passed with a non-NULL \a utilizationSamples + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuProcessSamplesCount is too small to return samples for all + * vGPU instances currently executing on the device + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, + unsigned int *vgpuProcessSamplesCount, + nvmlVgpuProcessUtilizationSample_t *utilizationSamples); +/** + * Queries the state of per process accounting mode on vGPU. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param vgpuInstance The identifier of the target vGPU instance + * @param mode Reference in which to return the current accounting mode + * + * @return + * - \ref NVML_SUCCESS if the mode has been successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mode is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature + * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running on the vGPU instance + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t *mode); + +/** + * Queries list of processes running on vGPU that can be queried for accounting stats. The list of processes + * returned can be in running or terminated state. + * + * For Maxwell &tm; or newer fully supported devices. + * + * To just query the maximum number of processes that can be queried, call this function with *count = 0 and + * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty. + * + * For more details see \ref nvmlVgpuInstanceGetAccountingStats. + * + * @note In case of PID collision some processes might not be accessible before the circular buffer is full. + * + * @param vgpuInstance The identifier of the target vGPU instance + * @param count Reference in which to provide the \a pids array size, and + * to return the number of elements ready to be queried + * @param pids Reference in which to return list of process ids + * + * @return + * - \ref NVML_SUCCESS if pids were successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a count is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to expected value) + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlVgpuInstanceGetAccountingPids + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingPids(nvmlVgpuInstance_t vgpuInstance, unsigned int *count, unsigned int *pids); + +/** + * Queries process's accounting stats. + * + * For Maxwell &tm; or newer fully supported devices. + * + * Accounting stats capture GPU utilization and other statistics across the lifetime of a process, and + * can be queried during life time of the process or after its termination. + * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and + * updated to actual running time after its termination. + * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old + * processes. + * + * See \ref nvmlAccountingStats_t for description of each returned metric. + * List of processes that can be queried can be retrieved from \ref nvmlVgpuInstanceGetAccountingPids. + * + * @note Accounting Mode needs to be on. See \ref nvmlVgpuInstanceGetAccountingMode. + * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be + * queried since they don't contribute to GPU utilization. + * @note In case of pid collision stats of only the latest process (that terminated last) will be reported + * + * @param vgpuInstance The identifier of the target vGPU instance + * @param pid Process Id of the target process to query stats for + * @param stats Reference in which to return the process's accounting stats + * + * @return + * - \ref NVML_SUCCESS if stats have been successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a stats is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * or \a stats is not found + * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingStats(nvmlVgpuInstance_t vgpuInstance, unsigned int pid, nvmlAccountingStats_t *stats); + +/** + * Clears accounting information of the vGPU instance that have already terminated. + * + * For Maxwell &tm; or newer fully supported devices. + * Requires root/admin permissions. + * + * @note Accounting Mode needs to be on. See \ref nvmlVgpuInstanceGetAccountingMode. + * @note Only compute and graphics applications stats are reported and can be cleared since monitoring applications + * stats don't contribute to GPU utilization. + * + * @param vgpuInstance The identifier of the target vGPU instance + * + * @return + * - \ref NVML_SUCCESS if accounting information has been cleared + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceClearAccountingPids(nvmlVgpuInstance_t vgpuInstance); +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlGpuBlacklistQueries GPU Blacklist Queries + * This chapter describes NVML operations that are associated with blacklisted GPUs. + * @{ + */ +/***************************************************************************************************/ + +/** + * Blacklist GPU device information + **/ +typedef struct nvmlBlacklistDeviceInfo_st +{ + nvmlPciInfo_t pciInfo; //!< The PCI information for the blacklisted GPU + char uuid[NVML_DEVICE_UUID_BUFFER_SIZE]; //!< The ASCII string UUID for the blacklisted GPU +} nvmlBlacklistDeviceInfo_t; + + /** + * Retrieves the number of blacklisted GPU devices in the system. + * + * For all products. + * + * @param deviceCount Reference in which to return the number of blacklisted devices + * + * @return + * - \ref NVML_SUCCESS if \a deviceCount has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a deviceCount is NULL + */ +nvmlReturn_t DECLDIR nvmlGetBlacklistDeviceCount(unsigned int *deviceCount); + +/** + * Acquire the device information for a blacklisted device, based on its index. + * + * For all products. + * + * Valid indices are derived from the \a deviceCount returned by + * \ref nvmlGetBlacklistDeviceCount(). For example, if \a deviceCount is 2 the valid indices + * are 0 and 1, corresponding to GPU 0 and GPU 1. + * + * @param index The index of the target GPU, >= 0 and < \a deviceCount + * @param info Reference in which to return the device information + * + * @return + * - \ref NVML_SUCCESS if \a device has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a info is NULL + * + * @see nvmlGetBlacklistDeviceCount + */ +nvmlReturn_t DECLDIR nvmlGetBlacklistDeviceInfoByIndex(unsigned int index, nvmlBlacklistDeviceInfo_t *info); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlMultiInstanceGPU Multi Instance GPU Management + * This chapter describes NVML operations that are associated with Multi Instance GPU management. + * @{ + */ +/***************************************************************************************************/ + +/** + * Disable Multi Instance GPU mode. + */ +#define NVML_DEVICE_MIG_DISABLE 0x0 + +/** + * Enable Multi Instance GPU mode. + */ +#define NVML_DEVICE_MIG_ENABLE 0x1 + +/** + * GPU instance profiles. + * + * These macros should be passed to \ref nvmlDeviceGetGpuInstanceProfileInfo to retrieve the + * detailed information about a GPU instance such as profile ID, engine counts. + */ +#define NVML_GPU_INSTANCE_PROFILE_1_SLICE 0x0 +#define NVML_GPU_INSTANCE_PROFILE_2_SLICE 0x1 +#define NVML_GPU_INSTANCE_PROFILE_3_SLICE 0x2 +#define NVML_GPU_INSTANCE_PROFILE_4_SLICE 0x3 +#define NVML_GPU_INSTANCE_PROFILE_7_SLICE 0x4 +#define NVML_GPU_INSTANCE_PROFILE_COUNT 0x5 + +typedef struct nvmlGpuInstancePlacement_st +{ + unsigned int start; + unsigned int size; +} nvmlGpuInstancePlacement_t; + +typedef struct nvmlGpuInstanceProfileInfo_st +{ + unsigned int id; //!< Unique profile ID within the device + unsigned int isP2pSupported; //!< Peer-to-Peer support + unsigned int sliceCount; //!< GPU Slice count + unsigned int instanceCount; //!< GPU instance count + unsigned int multiprocessorCount; //!< Streaming Multiprocessor count + unsigned int copyEngineCount; //!< Copy Engine count + unsigned int decoderCount; //!< Decoder Engine count + unsigned int encoderCount; //!< Encoder Engine count + unsigned int jpegCount; //!< JPEG Engine count + unsigned int ofaCount; //!< OFA Engine count + unsigned long long memorySizeMB; //!< Memory size in MBytes +} nvmlGpuInstanceProfileInfo_t; + +typedef struct nvmlGpuInstanceInfo_st +{ + nvmlDevice_t device; //!< Parent device + unsigned int id; //!< Unique instance ID within the device + unsigned int profileId; //!< Unique profile ID within the device + nvmlGpuInstancePlacement_t placement; //!< Placement for this instance +} nvmlGpuInstanceInfo_t; + +typedef struct nvmlGpuInstance_st* nvmlGpuInstance_t; + +/** + * Compute instance profiles. + * + * These macros should be passed to \ref nvmlGpuInstanceGetComputeInstanceProfileInfo to retrieve the + * detailed information about a compute instance such as profile ID, engine counts + */ +#define NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE 0x0 +#define NVML_COMPUTE_INSTANCE_PROFILE_2_SLICE 0x1 +#define NVML_COMPUTE_INSTANCE_PROFILE_3_SLICE 0x2 +#define NVML_COMPUTE_INSTANCE_PROFILE_4_SLICE 0x3 +#define NVML_COMPUTE_INSTANCE_PROFILE_7_SLICE 0x4 +#define NVML_COMPUTE_INSTANCE_PROFILE_COUNT 0x5 + +#define NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED 0x0 //!< All the engines except multiprocessors would be shared +#define NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT 0x1 + +typedef struct nvmlComputeInstanceProfileInfo_st +{ + unsigned int id; //!< Unique profile ID within the GPU instance + unsigned int sliceCount; //!< GPU Slice count + unsigned int instanceCount; //!< Compute instance count + unsigned int multiprocessorCount; //!< Streaming Multiprocessor count + unsigned int sharedCopyEngineCount; //!< Shared Copy Engine count + unsigned int sharedDecoderCount; //!< Shared Decoder Engine count + unsigned int sharedEncoderCount; //!< Shared Encoder Engine count + unsigned int sharedJpegCount; //!< Shared JPEG Engine count + unsigned int sharedOfaCount; //!< Shared OFA Engine count +} nvmlComputeInstanceProfileInfo_t; + +typedef struct nvmlComputeInstanceInfo_st +{ + nvmlDevice_t device; //!< Parent device + nvmlGpuInstance_t gpuInstance; //!< Parent GPU instance + unsigned int id; //!< Unique instance ID within the GPU instance + unsigned int profileId; //!< Unique profile ID within the GPU instance +} nvmlComputeInstanceInfo_t; + +typedef struct nvmlComputeInstance_st* nvmlComputeInstance_t; + +/** + * Set MIG mode for the device. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * Requires root user. + * + * This mode determines whether a GPU instance can be created. + * + * This API may unbind or reset the device to activate the requested mode. Thus, the attributes associated with the + * device, such as minor number, might change. The caller of this API is expected to query such attributes again. + * + * On certain platforms like pass-through virtualization, where reset functionality may not be exposed directly, VM + * reboot is required. \a activationStatus would return \ref NVML_ERROR_RESET_REQUIRED for such cases. + * + * \a activationStatus would return the appropriate error code upon unsuccessful activation. For example, if device + * unbind fails because the device isn't idle, \ref NVML_ERROR_IN_USE would be returned. The caller of this API + * is expected to idle the device and retry setting the \a mode. + * + * @param device The identifier of the target device + * @param mode The mode to be set, \ref NVML_DEVICE_MIG_DISABLE or + * \ref NVML_DEVICE_MIG_ENABLE + * @param activationStatus The activationStatus status + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device,\a mode or \a activationStatus are invalid + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG mode + */ +nvmlReturn_t DECLDIR nvmlDeviceSetMigMode(nvmlDevice_t device, unsigned int mode, nvmlReturn_t *activationStatus); + +/** + * Get MIG mode for the device. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * + * Changing MIG modes may require device unbind or reset. The "pending" MIG mode refers to the target mode following the + * next activation trigger. + * + * @param device The identifier of the target device + * @param currentMode Returns the current mode, \ref NVML_DEVICE_MIG_DISABLE or + * \ref NVML_DEVICE_MIG_ENABLE + * @param pendingMode Returns the pending mode, \ref NVML_DEVICE_MIG_DISABLE or + * \ref NVML_DEVICE_MIG_ENABLE + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a currentMode or \a pendingMode are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG mode + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMigMode(nvmlDevice_t device, unsigned int *currentMode, unsigned int *pendingMode); + +/** + * Get GPU instance profile information. + * + * Information provided by this API is immutable throughout the lifetime of a MIG mode. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param device The identifier of the target device + * @param profile One of the NVML_GPU_INSTANCE_PROFILE_* + * @param info Returns detailed profile information + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile or \a info are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profile isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceProfileInfo(nvmlDevice_t device, unsigned int profile, + nvmlGpuInstanceProfileInfo_t *info); + +/** + * Get GPU instance placements. + * + * A placement represents the location of a GPU instance within a device. This API only returns all the possible + * placements for the given profile. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param device The identifier of the target device + * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo + * @param placements Returns placements, the buffer must be large enough to accommodate + * the instances supported by the profile. + * See \ref nvmlDeviceGetGpuInstanceProfileInfo + * @param count The count of returned placements + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId, \a placements or \a count are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profileId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstancePossiblePlacements(nvmlDevice_t device, unsigned int profileId, + nvmlGpuInstancePlacement_t *placements, + unsigned int *count); + +/** + * Get GPU instance profile capacity. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param device The identifier of the target device + * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo + * @param count Returns remaining instance count for the profile ID + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId or \a count are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profileId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceRemainingCapacity(nvmlDevice_t device, unsigned int profileId, + unsigned int *count); + +/** + * Create GPU instance. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * If the parent device is unbound, reset or the GPU instance is destroyed explicitly, the GPU instance handle would + * become invalid. The GPU instance must be recreated to acquire a valid handle. + * + * @param device The identifier of the target device + * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo + * @param gpuInstance Returns the GPU instance handle + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile, \a profileId or \a gpuInstance are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested GPU instance could not be created + */ +nvmlReturn_t DECLDIR nvmlDeviceCreateGpuInstance(nvmlDevice_t device, unsigned int profileId, + nvmlGpuInstance_t *gpuInstance); + +/** + * Destroy GPU instance. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param gpuInstance The GPU instance handle + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_IN_USE If the GPU instance is in use. This error would be returned if processes + * (e.g. CUDA application) or compute instances are active on the + * GPU instance. + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceDestroy(nvmlGpuInstance_t gpuInstance); + +/** + * Get GPU instances for given profile ID. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param device The identifier of the target device + * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo + * @param gpuInstances Returns pre-exiting GPU instances, the buffer must be large enough to + * accommodate the instances supported by the profile. + * See \ref nvmlDeviceGetGpuInstanceProfileInfo + * @param count The count of returned GPU instances + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId, \a gpuInstances or \a count are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstances(nvmlDevice_t device, unsigned int profileId, + nvmlGpuInstance_t *gpuInstances, unsigned int *count); + +/** + * Get GPU instances for given instance ID. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param device The identifier of the target device + * @param id The GPU instance ID + * @param gpuInstance Returns GPU instance + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a id or \a gpuInstance are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_NOT_FOUND If the GPU instance is not found. + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceById(nvmlDevice_t device, unsigned int id, nvmlGpuInstance_t *gpuInstance); + +/** + * Get GPU instance information. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * + * @param gpuInstance The GPU instance handle + * @param info Return GPU instance information + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance or \a info are invalid + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceGetInfo(nvmlGpuInstance_t gpuInstance, nvmlGpuInstanceInfo_t *info); + +/** + * Get compute instance profile information. + * + * Information provided by this API is immutable throughout the lifetime of a MIG mode. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param gpuInstance The identifier of the target GPU instance + * @param profile One of the NVML_COMPUTE_INSTANCE_PROFILE_* + * @param engProfile One of the NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_* + * @param info Returns detailed profile information + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a engProfile or \a info are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a profile isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceProfileInfo(nvmlGpuInstance_t gpuInstance, unsigned int profile, + unsigned int engProfile, + nvmlComputeInstanceProfileInfo_t *info); + +/** + * Get compute instance profile capacity. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param gpuInstance The identifier of the target GPU instance + * @param profileId The compute instance profile ID. + * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo + * @param count Returns remaining instance count for the profile ID + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profileId or \a availableCount are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceRemainingCapacity(nvmlGpuInstance_t gpuInstance, + unsigned int profileId, unsigned int *count); + +/** + * Create compute instance. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * If the parent device is unbound, reset or the parent GPU instance is destroyed or the compute instance is destroyed + * explicitly, the compute instance handle would become invalid. The compute instance must be recreated to acquire + * a valid handle. + * + * @param gpuInstance The identifier of the target GPU instance + * @param profileId The compute instance profile ID. + * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo + * @param computeInstance Returns the compute instance handle + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a profileId or \a computeInstance + * are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested compute instance could not be created + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceCreateComputeInstance(nvmlGpuInstance_t gpuInstance, unsigned int profileId, + nvmlComputeInstance_t *computeInstance); + +/** + * Destroy compute instance. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param computeInstance The compute instance handle + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a computeInstance is invalid + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_IN_USE If the compute instance is in use. This error would be returned if + * processes (e.g. CUDA application) are active on the compute instance. + */ +nvmlReturn_t DECLDIR nvmlComputeInstanceDestroy(nvmlComputeInstance_t computeInstance); + +/** + * Get compute instances for given profile ID. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param gpuInstance The identifier of the target GPU instance + * @param profileId The compute instance profile ID. + * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo + * @param computeInstances Returns pre-exiting compute instances, the buffer must be large enough to + * accommodate the instances supported by the profile. + * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo + * @param count The count of returned compute instances + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profileId, \a computeInstances or \a count + * are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstances(nvmlGpuInstance_t gpuInstance, unsigned int profileId, + nvmlComputeInstance_t *computeInstances, unsigned int *count); + +/** + * Get compute instance for given instance ID. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param gpuInstance The identifier of the target GPU instance + * @param id The compute instance ID + * @param computeInstance Returns compute instance + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a ID or \a computeInstance are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_NOT_FOUND If the compute instance is not found. + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceById(nvmlGpuInstance_t gpuInstance, unsigned int id, + nvmlComputeInstance_t *computeInstance); + +/** + * Get compute instance information. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * + * @param computeInstance The compute instance handle + * @param info Return compute instance information + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a computeInstance or \a info are invalid + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlComputeInstanceGetInfo(nvmlComputeInstance_t computeInstance, nvmlComputeInstanceInfo_t *info); + +/** + * Test if the given handle refers to a MIG device. + * + * A MIG device handle is an NVML abstraction which maps to a MIG compute instance. + * These overloaded references can be used (with some restrictions) interchangeably + * with a GPU device handle to execute queries at a per-compute instance granularity. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * + * @param device NVML handle to test + * @param isMigDevice True when handle refers to a MIG device + * + * @return + * - \ref NVML_SUCCESS if \a device status was successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle or \a isMigDevice reference is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceIsMigDeviceHandle(nvmlDevice_t device, unsigned int *isMigDevice); + +/** + * Get GPU instance ID for the given MIG device handle. + * + * GPU instance IDs are unique per device and remain valid until the GPU instance is destroyed. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * + * @param device Target MIG device handle + * @param id GPU instance ID + * + * @return + * - \ref NVML_SUCCESS if instance ID was successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a id reference is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceId(nvmlDevice_t device, unsigned int *id); + +/** + * Get compute instance ID for the given MIG device handle. + * + * Compute instance IDs are unique per GPU instance and remain valid until the compute instance + * is destroyed. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * + * @param device Target MIG device handle + * @param id Compute instance ID + * + * @return + * - \ref NVML_SUCCESS if instance ID was successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a id reference is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetComputeInstanceId(nvmlDevice_t device, unsigned int *id); + +/** + * Get the maximum number of MIG devices that can exist under a given parent NVML device. + * + * Returns zero if MIG is not supported or enabled. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * + * @param device Target device handle + * @param count Count of MIG devices + * + * @return + * - \ref NVML_SUCCESS if \a count was successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a count reference is invalid + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMaxMigDeviceCount(nvmlDevice_t device, unsigned int *count); + +/** + * Get MIG device handle for the given index under its parent NVML device. + * + * If the compute instance is destroyed either explicitly or by destroying, + * resetting or unbinding the parent GPU instance or the GPU device itself + * the MIG device handle would remain invalid and must be requested again + * using this API. Handles may be reused and their properties can change in + * the process. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * + * @param device Reference to the parent GPU device handle + * @param index Index of the MIG device + * @param migDevice Reference to the MIG device handle + * + * @return + * - \ref NVML_SUCCESS if \a migDevice handle was successfully created + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a index or \a migDevice reference is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_NOT_FOUND if no valid MIG device was found at \a index + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMigDeviceHandleByIndex(nvmlDevice_t device, unsigned int index, + nvmlDevice_t *migDevice); + +/** + * Get parent device handle from a MIG device handle. + * + * For newer than Volta &tm; fully supported devices. + * Supported on Linux only. + * + * @param migDevice MIG device handle + * @param device Device handle + * + * @return + * - \ref NVML_SUCCESS if \a device handle was successfully created + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a migDevice or \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDeviceHandleFromMigDeviceHandle(nvmlDevice_t migDevice, nvmlDevice_t *device); /** @} */ /** * NVML API versioning support */ -#if defined(__NVML_API_VERSION_INTERNAL) + +#ifdef NVML_NO_UNVERSIONED_FUNC_DEFS +nvmlReturn_t DECLDIR nvmlInit(void); +nvmlReturn_t DECLDIR nvmlDeviceGetCount(unsigned int *deviceCount); +nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); +nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId(const char *pciBusId, nvmlDevice_t *device); +nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *pci); +nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo_v2(nvmlDevice_t device, nvmlPciInfo_t *pci); +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); +nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); +nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v2(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); +nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu(nvmlPciInfo_t *pciInfo); +nvmlReturn_t DECLDIR nvmlEventSetWait(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); +nvmlReturn_t DECLDIR nvmlDeviceGetAttributes(nvmlDevice_t device, nvmlDeviceAttributes_t *attributes); +#endif // #ifdef NVML_NO_UNVERSIONED_FUNC_DEFS + +#if defined(NVML_NO_UNVERSIONED_FUNC_DEFS) +// We don't define APIs to run new versions if this guard is present so there is +// no need to undef +#elif defined(__NVML_API_VERSION_INTERNAL) +#undef nvmlDeviceGetAttributes +#undef nvmlEventSetWait #undef nvmlDeviceGetGridLicensableFeatures #undef nvmlDeviceRemoveGpu #undef nvmlDeviceGetNvLinkRemotePciInfo diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.go index 869b85f5..21da6dd4 100644 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.go +++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.go @@ -1,5 +1,7 @@ // Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +// +build linux darwin + package nvml import ( @@ -50,7 +52,7 @@ func (dl *dlhandles) nvmlShutdown() C.nvmlReturn_t { return C.NVML_SUCCESS } -// Check to see if a specific symbol is present in the NVMl library. +// Check to see if a specific symbol is present in the NVML library. func (dl *dlhandles) lookupSymbol(symbol string) C.nvmlReturn_t { for _, handle := range dl.handles { C.dlerror() diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl_windows.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl_windows.go new file mode 100644 index 00000000..4b941704 --- /dev/null +++ b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl_windows.go @@ -0,0 +1,57 @@ +// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + +// +build windows + +package nvml + +import ( + "syscall" +) + +/* +#include "nvml.h" + +// We wrap the call to nvmlInit() here to ensure that we pick up the correct +// version of this call. The macro magic in nvml.h that #defines the symbol +// 'nvmlInit' to 'nvmlInit_v2' is unfortunately lost on cgo. +static nvmlReturn_t nvmlInit_dl(void) { + return nvmlInit(); +} +*/ +import "C" + +type dlhandles struct{ handles []*syscall.LazyDLL } + +var dl dlhandles + +// Initialize NVML, opening a dynamic reference to the NVML library in the process. +func (dl *dlhandles) nvmlInit() C.nvmlReturn_t { + handle := syscall.NewLazyDLL("nvml.dll") + if handle == nil { + return C.NVML_ERROR_LIBRARY_NOT_FOUND + } + dl.handles = append(dl.handles, handle) + return C.nvmlInit_dl() +} + +// Shutdown NVML, closing our dynamic reference to the NVML library in the process. +func (dl *dlhandles) nvmlShutdown() C.nvmlReturn_t { + ret := C.nvmlShutdown() + if ret != C.NVML_SUCCESS { + return ret + } + + dl.handles = dl.handles[:0] + + return C.NVML_SUCCESS +} + +// Check to see if a specific symbol is present in the NVML library. +func (dl *dlhandles) lookupSymbol(symbol string) C.nvmlReturn_t { + for _, handle := range dl.handles { + if proc := handle.NewProc(symbol); proc != nil { + return C.NVML_SUCCESS + } + } + return C.NVML_ERROR_FUNCTION_NOT_FOUND +} diff --git a/vendor/modules.txt b/vendor/modules.txt index c02e6c51..8daf25fc 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1,4 +1,4 @@ -# github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20200116003318-021662a21098 +# github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20211102125545-5a2c58442e48 github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml # github.com/fsnotify/fsnotify v1.4.9 github.com/fsnotify/fsnotify