From 231495eeba32b3d963c8cbfde41e96720066d50b Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Fri, 18 Oct 2024 15:19:35 +0800 Subject: [PATCH 1/6] fix(nvidia): use NVML + lspci to detect NVIDIA GPUs Signed-off-by: Gyuho Lee --- cmd/gpud/command/command.go | 6 + cmd/gpud/command/is-nvidia.go | 24 +++ components/accelerator/detect.go | 43 +---- components/accelerator/nvidia/query/detect.go | 147 ++++++++++++++++++ components/diagnose/scan.go | 8 +- components/dmesg/filters_nvidia.go | 13 +- config/default.go | 7 +- internal/server/handlers_root.go | 13 +- 8 files changed, 215 insertions(+), 46 deletions(-) create mode 100644 cmd/gpud/command/is-nvidia.go create mode 100644 components/accelerator/nvidia/query/detect.go diff --git a/cmd/gpud/command/command.go b/cmd/gpud/command/command.go index 59aa6988..1fc1d834 100644 --- a/cmd/gpud/command/command.go +++ b/cmd/gpud/command/command.go @@ -374,6 +374,12 @@ sudo rm /etc/systemd/system/gpud.service }, }, + { + Name: "is-nvidia", + + Usage: "quick check if the host has NVIDIA GPUs installed", + Action: cmdIsNvidia, + }, { Name: "accelerator", Aliases: []string{"a"}, diff --git a/cmd/gpud/command/is-nvidia.go b/cmd/gpud/command/is-nvidia.go new file mode 100644 index 00000000..cb6d16de --- /dev/null +++ b/cmd/gpud/command/is-nvidia.go @@ -0,0 +1,24 @@ +package command + +import ( + "context" + "fmt" + "time" + + nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" + + "github.com/urfave/cli" +) + +func cmdIsNvidia(cliContext *cli.Context) error { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + + nvidiaInstalled, err := nvidia_query.GPUsInstalled(ctx) + if err != nil { + return err + } + + fmt.Printf("NVIDIA installed: %v", nvidiaInstalled) + return nil +} diff --git a/components/accelerator/detect.go b/components/accelerator/detect.go index dd85390e..735a0370 100644 --- a/components/accelerator/detect.go +++ b/components/accelerator/detect.go @@ -2,13 +2,9 @@ package accelerator import ( "context" - "fmt" + nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" "github.com/leptonai/gpud/pkg/file" - - "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" - nvinfo "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" - "github.com/NVIDIA/go-nvml/pkg/nvml" ) type Type string @@ -21,7 +17,7 @@ const ( // Returns the GPU type (e.g., "NVIDIA") and product name (e.g., "A100") func DetectTypeAndProductName(ctx context.Context) (Type, string, error) { if p, err := file.LocateExecutable("nvidia-smi"); p != "" && err == nil { - productName, err := LoadNVIDIAProductName(ctx) + productName, err := nvidia_query.LoadProductName(ctx) if err != nil { return TypeNVIDIA, "unknown", err } @@ -30,38 +26,3 @@ func DetectTypeAndProductName(ctx context.Context) (Type, string, error) { return TypeUnknown, "unknown", nil } - -func LoadNVIDIAProductName(ctx context.Context) (string, error) { - nvmlLib := nvml.New() - if ret := nvmlLib.Init(); ret != nvml.SUCCESS { - return "", fmt.Errorf("failed to initialize NVML: %v", nvml.ErrorString(ret)) - } - - deviceLib := device.New(nvmlLib) - infoLib := nvinfo.New( - nvinfo.WithNvmlLib(nvmlLib), - nvinfo.WithDeviceLib(deviceLib), - ) - - nvmlExists, nvmlExistsMsg := infoLib.HasNvml() - if !nvmlExists { - return "", fmt.Errorf("NVML not found: %s", nvmlExistsMsg) - } - - devices, err := deviceLib.GetDevices() - if err != nil { - return "", err - } - - for _, d := range devices { - name, ret := d.GetName() - if ret != nvml.SUCCESS { - return "", fmt.Errorf("failed to get device name: %v", nvml.ErrorString(ret)) - } - if name != "" { - return name, nil - } - } - - return "", nil -} diff --git a/components/accelerator/nvidia/query/detect.go b/components/accelerator/nvidia/query/detect.go new file mode 100644 index 00000000..89e8e517 --- /dev/null +++ b/components/accelerator/nvidia/query/detect.go @@ -0,0 +1,147 @@ +package query + +import ( + "bufio" + "context" + "fmt" + "strings" + "time" + + "github.com/leptonai/gpud/log" + "github.com/leptonai/gpud/pkg/file" + "github.com/leptonai/gpud/pkg/process" + + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + nvinfo "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +// Returns true if the local machine has NVIDIA GPUs installed. +func GPUsInstalled(ctx context.Context) (bool, error) { + smiInstalled := SMIExists() + if !smiInstalled { + return false, nil + } + + // now that nvidia-smi installed, + // check the NVIDIA GPU presence via PCI bus + pciDevices, err := ListPCIs(ctx) + if err != nil { + return false, err + } + if len(pciDevices) == 0 { + return false, nil + } + + // now that we have the NVIDIA PCI devices, + // call NVML C-based API for NVML API + productName, err := LoadProductName(ctx) + if err != nil { + return false, err + } + + log.Logger.Infow("detected nvidia gpu", "product", productName) + return true, nil +} + +// Loads the product name of the NVIDIA GPU. +func LoadProductName(ctx context.Context) (string, error) { + nvmlLib := nvml.New() + if ret := nvmlLib.Init(); ret != nvml.SUCCESS { + return "", fmt.Errorf("failed to initialize NVML: %v", nvml.ErrorString(ret)) + } + + deviceLib := device.New(nvmlLib) + infoLib := nvinfo.New( + nvinfo.WithNvmlLib(nvmlLib), + nvinfo.WithDeviceLib(deviceLib), + ) + + nvmlExists, nvmlExistsMsg := infoLib.HasNvml() + if !nvmlExists { + return "", fmt.Errorf("NVML not found: %s", nvmlExistsMsg) + } + + devices, err := deviceLib.GetDevices() + if err != nil { + return "", err + } + + for _, d := range devices { + name, ret := d.GetName() + if ret != nvml.SUCCESS { + return "", fmt.Errorf("failed to get device name: %v", nvml.ErrorString(ret)) + } + if name != "" { + return name, nil + } + } + + return "", nil +} + +// Lists all PCI devices that are compatible with NVIDIA. +func ListPCIs(ctx context.Context) ([]string, error) { + lspciPath, err := file.LocateExecutable("lspci") + if err != nil { + return nil, nil + } + if lspciPath == "" { + return nil, nil + } + + p, err := process.New( + process.WithCommand(lspciPath), + process.WithRunAsBashScript(), + ) + if err != nil { + return nil, err + } + + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + + if err := p.Start(ctx); err != nil { + return nil, err + } + + lines := make([]string, 0) + + scanner := bufio.NewScanner(p.StdoutReader()) + for scanner.Scan() { // returns false at the end of the output + line := scanner.Text() + + // e.g., + // 01:00.0 VGA compatible controller: NVIDIA Corporation Device 2684 (rev a1) + // 01:00.1 Audio device: NVIDIA Corporation Device 22ba (rev a1) + if strings.Contains(line, "NVIDIA") { + lines = append(lines, line) + } + + select { + case err := <-p.Wait(): + if err != nil { + return nil, err + } + default: + } + } + if serr := scanner.Err(); serr != nil { + // process already dead, thus ignore + // e.g., "read |0: file already closed" + if !strings.Contains(serr.Error(), "file already closed") { + return nil, serr + } + } + + select { + case err := <-p.Wait(): + if err != nil { + return nil, err + } + case <-ctx.Done(): + return nil, ctx.Err() + } + + return lines, nil +} diff --git a/components/diagnose/scan.go b/components/diagnose/scan.go index 428adfbc..5abce2ad 100644 --- a/components/diagnose/scan.go +++ b/components/diagnose/scan.go @@ -43,7 +43,13 @@ func Scan(ctx context.Context, opts ...OpOption) error { fmt.Printf("\n\n%s scanning the host\n\n", inProgress) - if nvidia_query.SMIExists() { + nvidiaInstalled, err := nvidia_query.GPUsInstalled(ctx) + if err != nil { + log.Logger.Warnw("error checking nvidia gpu installation", "error", err) + return err + } + + if nvidiaInstalled { fmt.Printf("%s scanning nvidia accelerators\n", inProgress) for _, lib := range defaultNVIDIALibraries { diff --git a/components/dmesg/filters_nvidia.go b/components/dmesg/filters_nvidia.go index 30def526..9bfac16a 100644 --- a/components/dmesg/filters_nvidia.go +++ b/components/dmesg/filters_nvidia.go @@ -1,6 +1,9 @@ package dmesg import ( + "context" + "time" + nvidia_error "github.com/leptonai/gpud/components/accelerator/nvidia/error" nvidia_nccl_id "github.com/leptonai/gpud/components/accelerator/nvidia/nccl/id" nvidia_peermem_id "github.com/leptonai/gpud/components/accelerator/nvidia/peermem/id" @@ -15,7 +18,15 @@ import ( ) func init() { - if nvidia_query.SMIExists() { + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + + nvidiaInstalled, err := nvidia_query.GPUsInstalled(ctx) + if err != nil { + return + } + + if nvidiaInstalled { defaultFilters = append(defaultFilters, DefaultDmesgFiltersForNvidia()...) } for i := range defaultFilters { diff --git a/config/default.go b/config/default.go index 6da5d6e8..93c2dfaf 100644 --- a/config/default.go +++ b/config/default.go @@ -195,7 +195,12 @@ func DefaultConfig(ctx context.Context, opts ...OpOption) (*Config, error) { log.Logger.Debugw("auto-detect tailscale not supported -- skipping", "os", runtime.GOOS) } - if runtime.GOOS == "linux" && nvidia_query.SMIExists() { + nvidiaInstalled, err := nvidia_query.GPUsInstalled(ctx) + if err != nil { + return nil, err + } + + if runtime.GOOS == "linux" && nvidiaInstalled { driverVersion, err := nvidia_query_nvml.GetDriverVersion() if err != nil { return nil, err diff --git a/internal/server/handlers_root.go b/internal/server/handlers_root.go index 4af7ea42..f49e96f7 100644 --- a/internal/server/handlers_root.go +++ b/internal/server/handlers_root.go @@ -1,14 +1,15 @@ package server import ( + "context" "embed" "fmt" "html/template" stdos "os" "runtime" "strings" + "time" - "github.com/dustin/go-humanize" "github.com/leptonai/gpud/components" nvidia_clock "github.com/leptonai/gpud/components/accelerator/nvidia/clock" nvidia_clockspeed "github.com/leptonai/gpud/components/accelerator/nvidia/clock-speed" @@ -28,6 +29,7 @@ import ( "github.com/leptonai/gpud/log" "github.com/leptonai/gpud/version" + "github.com/dustin/go-humanize" "github.com/gin-gonic/gin" "github.com/shirou/gopsutil/v4/process" ) @@ -80,8 +82,15 @@ func createRootHandler(handlerDescs []componentHandlerDescription, webConfig con nvidiaClockSpeedChart := false nvidiaErrsChart := false + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + nvidiaInstalled, err := nvidia_query.GPUsInstalled(ctx) + cancel() + if err != nil { + log.Logger.Fatalw("failed to check if nvidia is installed", "error", err) + } + var nvidiaInfoOutputProvider components.OutputProvider - if nvidia_query.SMIExists() { + if nvidiaInstalled { nvidiaInfoComponent, err := components.GetComponent(nvidia_info.Name) if err != nil { panic(fmt.Sprintf("component %q required but not set", nvidia_info.Name)) From f7e297d96d8858cf21fe6b60294b945ad3e648fd Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Fri, 18 Oct 2024 15:22:54 +0800 Subject: [PATCH 2/6] remove ctx Signed-off-by: Gyuho Lee --- components/accelerator/nvidia/query/detect.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/components/accelerator/nvidia/query/detect.go b/components/accelerator/nvidia/query/detect.go index 89e8e517..89fad648 100644 --- a/components/accelerator/nvidia/query/detect.go +++ b/components/accelerator/nvidia/query/detect.go @@ -5,7 +5,6 @@ import ( "context" "fmt" "strings" - "time" "github.com/leptonai/gpud/log" "github.com/leptonai/gpud/pkg/file" @@ -98,9 +97,6 @@ func ListPCIs(ctx context.Context) ([]string, error) { return nil, err } - ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) - defer cancel() - if err := p.Start(ctx); err != nil { return nil, err } From 549a291e77d3d7f217717714ae83b3c72bd8cc16 Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Fri, 18 Oct 2024 15:24:05 +0800 Subject: [PATCH 3/6] log Signed-off-by: Gyuho Lee --- components/accelerator/nvidia/query/detect.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/components/accelerator/nvidia/query/detect.go b/components/accelerator/nvidia/query/detect.go index 89fad648..6c9a7003 100644 --- a/components/accelerator/nvidia/query/detect.go +++ b/components/accelerator/nvidia/query/detect.go @@ -21,6 +21,7 @@ func GPUsInstalled(ctx context.Context) (bool, error) { if !smiInstalled { return false, nil } + log.Logger.Info("nvidia-smi installed") // now that nvidia-smi installed, // check the NVIDIA GPU presence via PCI bus @@ -31,6 +32,7 @@ func GPUsInstalled(ctx context.Context) (bool, error) { if len(pciDevices) == 0 { return false, nil } + log.Logger.Info("nvidia PCI devices found", "devices", len(pciDevices)) // now that we have the NVIDIA PCI devices, // call NVML C-based API for NVML API @@ -38,8 +40,8 @@ func GPUsInstalled(ctx context.Context) (bool, error) { if err != nil { return false, err } - log.Logger.Infow("detected nvidia gpu", "product", productName) + return true, nil } From c19b3f84be6b94fc4ee02e2997f50aadb25fe295 Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Fri, 18 Oct 2024 15:24:37 +0800 Subject: [PATCH 4/6] fix Signed-off-by: Gyuho Lee --- components/accelerator/nvidia/query/detect.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/accelerator/nvidia/query/detect.go b/components/accelerator/nvidia/query/detect.go index 6c9a7003..d210f905 100644 --- a/components/accelerator/nvidia/query/detect.go +++ b/components/accelerator/nvidia/query/detect.go @@ -32,7 +32,7 @@ func GPUsInstalled(ctx context.Context) (bool, error) { if len(pciDevices) == 0 { return false, nil } - log.Logger.Info("nvidia PCI devices found", "devices", len(pciDevices)) + log.Logger.Infow("nvidia PCI devices found", "devices", len(pciDevices)) // now that we have the NVIDIA PCI devices, // call NVML C-based API for NVML API From 67d24c4f5caa3c59ca5e97d776a38f4439f7be26 Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Tue, 22 Oct 2024 00:46:41 +0800 Subject: [PATCH 5/6] rename Signed-off-by: Gyuho Lee --- components/accelerator/detect.go | 2 +- components/accelerator/nvidia/query/detect.go | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/components/accelerator/detect.go b/components/accelerator/detect.go index 735a0370..34f5b449 100644 --- a/components/accelerator/detect.go +++ b/components/accelerator/detect.go @@ -17,7 +17,7 @@ const ( // Returns the GPU type (e.g., "NVIDIA") and product name (e.g., "A100") func DetectTypeAndProductName(ctx context.Context) (Type, string, error) { if p, err := file.LocateExecutable("nvidia-smi"); p != "" && err == nil { - productName, err := nvidia_query.LoadProductName(ctx) + productName, err := nvidia_query.LoadGPUDeviceName(ctx) if err != nil { return TypeNVIDIA, "unknown", err } diff --git a/components/accelerator/nvidia/query/detect.go b/components/accelerator/nvidia/query/detect.go index d210f905..3756ab04 100644 --- a/components/accelerator/nvidia/query/detect.go +++ b/components/accelerator/nvidia/query/detect.go @@ -21,7 +21,7 @@ func GPUsInstalled(ctx context.Context) (bool, error) { if !smiInstalled { return false, nil } - log.Logger.Info("nvidia-smi installed") + log.Logger.Debugw("nvidia-smi installed") // now that nvidia-smi installed, // check the NVIDIA GPU presence via PCI bus @@ -36,17 +36,17 @@ func GPUsInstalled(ctx context.Context) (bool, error) { // now that we have the NVIDIA PCI devices, // call NVML C-based API for NVML API - productName, err := LoadProductName(ctx) + gpuDeviceName, err := LoadGPUDeviceName(ctx) if err != nil { return false, err } - log.Logger.Infow("detected nvidia gpu", "product", productName) + log.Logger.Infow("detected nvidia gpu", "gpuDeviceName", gpuDeviceName) return true, nil } -// Loads the product name of the NVIDIA GPU. -func LoadProductName(ctx context.Context) (string, error) { +// Loads the product name of the NVIDIA GPU device. +func LoadGPUDeviceName(ctx context.Context) (string, error) { nvmlLib := nvml.New() if ret := nvmlLib.Init(); ret != nvml.SUCCESS { return "", fmt.Errorf("failed to initialize NVML: %v", nvml.ErrorString(ret)) From 26d5bcf03179e839f4ed984c1e998a00ec89c5dd Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Tue, 22 Oct 2024 12:19:41 +0800 Subject: [PATCH 6/6] rename to ListNVIDIAPCIs Signed-off-by: Gyuho Lee --- components/accelerator/nvidia/query/detect.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/accelerator/nvidia/query/detect.go b/components/accelerator/nvidia/query/detect.go index 3756ab04..2f400746 100644 --- a/components/accelerator/nvidia/query/detect.go +++ b/components/accelerator/nvidia/query/detect.go @@ -25,7 +25,7 @@ func GPUsInstalled(ctx context.Context) (bool, error) { // now that nvidia-smi installed, // check the NVIDIA GPU presence via PCI bus - pciDevices, err := ListPCIs(ctx) + pciDevices, err := ListNVIDIAPCIs(ctx) if err != nil { return false, err } @@ -82,7 +82,7 @@ func LoadGPUDeviceName(ctx context.Context) (string, error) { } // Lists all PCI devices that are compatible with NVIDIA. -func ListPCIs(ctx context.Context) ([]string, error) { +func ListNVIDIAPCIs(ctx context.Context) ([]string, error) { lspciPath, err := file.LocateExecutable("lspci") if err != nil { return nil, nil