diff --git a/pkg/hostman/guestman/pod.go b/pkg/hostman/guestman/pod.go
index ebd8dce4722..7a857f7c4e5 100644
--- a/pkg/hostman/guestman/pod.go
+++ b/pkg/hostman/guestman/pod.go
@@ -139,6 +139,8 @@ type PodInstance interface {
 	IsInternalStopped(ctrCriId string) (*ContainerExpectedStatus, bool)
 	IsInternalRemoved(ctrCriId string) bool
+
+	GetPodContainerCriIds() []string
 }
 
 type sContainer struct {
@@ -593,6 +595,24 @@ func (s *sPodGuestInstance) GetContainers() []*hostapi.ContainerDesc {
 	return s.GetDesc().Containers
 }
 
+func (s *sPodGuestInstance) GetPodContainerCriIds() []string {
+	criids := make([]string, 0)
+	for i := range s.containers {
+		criids = append(criids, s.containers[i].CRIId)
+	}
+	return criids
+}
+
+func (s *sPodGuestInstance) HasContainerNvidiaGpu() bool {
+	for i := range s.Desc.IsolatedDevices {
+		if s.Desc.IsolatedDevices[i].DevType == computeapi.CONTAINER_DEV_NVIDIA_MPS ||
+			s.Desc.IsolatedDevices[i].DevType == computeapi.CONTAINER_DEV_NVIDIA_GPU {
+			return true
+		}
+	}
+	return false
+}
+
 func (s *sPodGuestInstance) GetContainerById(ctrId string) *hostapi.ContainerDesc {
 	ctrs := s.GetContainers()
 	for i := range ctrs {
diff --git a/pkg/hostman/hostinfo/container.go b/pkg/hostman/hostinfo/container.go
index bced7f6b854..9cf1d4f07b8 100644
--- a/pkg/hostman/hostinfo/container.go
+++ b/pkg/hostman/hostinfo/container.go
@@ -22,7 +22,9 @@ import (
 	"yunion.io/x/log"
 	"yunion.io/x/pkg/errors"
 
+	apis "yunion.io/x/onecloud/pkg/apis/compute"
 	hostapi "yunion.io/x/onecloud/pkg/apis/host"
+	"yunion.io/x/onecloud/pkg/hostman/isolated_device"
 	"yunion.io/x/onecloud/pkg/hostman/options"
 	"yunion.io/x/onecloud/pkg/util/pod"
 	"yunion.io/x/onecloud/pkg/util/pod/cadvisor"
@@ -76,3 +78,40 @@ func (h *SHostInfo) GetContainerCPUMap() *pod.HostContainerCPUMap {
 func (h *SHostInfo) GetContainerStatsProvider() stats.ContainerStatsProvider {
 	return h.containerStatsProvider
 }
+
+type INvidiaGpuIndexMemoryInterface interface {
+	GetNvidiaDevMemSize() int
+	GetNvidiaDevIndex() string
+}
+
+func (h *SHostInfo) GetNvidiaGpuIndexMemoryMap() map[string]int {
+	res := map[string]int{}
+	for i := range h.containerNvidiaGpus {
+		iDev, ok := h.containerNvidiaGpus[i].(INvidiaGpuIndexMemoryInterface)
+		if !ok {
+			continue
+		}
+		index := iDev.GetNvidiaDevIndex()
+		memSize := iDev.GetNvidiaDevMemSize()
+		res[index] = memSize
+	}
+	return res
+}
+
+func (h *SHostInfo) HasContainerNvidiaGpu() bool {
+	if h.hasNvidiaGpus != nil {
+		return *h.hasNvidiaGpus
+	}
+	hasNvidiaGpus := false
+	nvDevs := make([]isolated_device.IDevice, 0)
+	devs := h.IsolatedDeviceMan.GetDevices()
+	for i := range devs {
+		if devs[i].GetDeviceType() == apis.CONTAINER_DEV_NVIDIA_GPU || devs[i].GetDeviceType() == apis.CONTAINER_DEV_NVIDIA_MPS {
+			hasNvidiaGpus = true
+			nvDevs = append(nvDevs, devs[i])
+		}
+	}
+	h.hasNvidiaGpus = &hasNvidiaGpus
+	h.containerNvidiaGpus = nvDevs
+	return *h.hasNvidiaGpus
+}
diff --git a/pkg/hostman/hostinfo/hostinfo.go b/pkg/hostman/hostinfo/hostinfo.go
index 7e9a4197ebd..d8807365672 100644
--- a/pkg/hostman/hostinfo/hostinfo.go
+++ b/pkg/hostman/hostinfo/hostinfo.go
@@ -128,6 +128,8 @@ type SHostInfo struct {
 	containerCPUMap                *pod.HostContainerCPUMap
 	containerStatsProvider         stats.ContainerStatsProvider
 	containerCpufreqSimulateConfig *jsonutils.JSONDict
+	containerNvidiaGpus            []isolated_device.IDevice
+	hasNvidiaGpus                  *bool
 }
 
 func (h *SHostInfo) GetContainerDeviceConfigurationFilePath() string {
diff --git a/pkg/hostman/hostmetrics/container_metrics.go b/pkg/hostman/hostmetrics/container_metrics.go
index a8c8f1328c7..ca1480fff1d 100644
--- a/pkg/hostman/hostmetrics/container_metrics.go
+++ b/pkg/hostman/hostmetrics/container_metrics.go
@@ -16,6 +16,8 @@ package hostmetrics
 
 import (
 	"fmt"
+	"sort"
+	"strconv"
 	"strings"
 	"time"
 
@@ -49,6 +51,18 @@ const (
 	SOCKET_COUNT    = "socket_count"
 	THREADS_CURRENT = "threads_current"
 	THREADS_MAX     = "threads_max"
+
+	NVIDIA_GPU_MEMORY_TOTAL   = "nvidia_gpu_memory_total"
+	NVIDIA_GPU_INDEX          = "nvidia_gpu_index"
+	NVIDIA_GPU_PHYSICAL_INDEX = "nvidia_gpu_physical_index"
+	NVIDIA_GPU_FRAME_BUFFER   = "nvidia_gpu_frame_buffer"
+	NVIDIA_GPU_CCPM           = "nvidia_gpu_ccpm"
+	NVIDIA_GPU_SM             = "nvidia_gpu_sm"
+	NVIDIA_GPU_MEM_UTIL       = "nvidia_gpu_mem_util"
+	NVIDIA_GPU_ENC            = "nvidia_gpu_enc"
+	NVIDIA_GPU_DEC            = "nvidia_gpu_dec"
+	NVIDIA_GPU_JPG            = "nvidia_gpu_jpg"
+	NVIDIA_GPU_OFA            = "nvidia_gpu_ofa"
 )
 
 type CadvisorProcessMetric struct {
@@ -75,12 +89,13 @@ func (m CadvisorProcessMetric) ToMap() map[string]interface{} {
 }
 
 type PodMetrics struct {
-	PodCpu     *PodCpuMetric       `json:"pod_cpu"`
-	PodMemory  *PodMemoryMetric    `json:"pod_memory"`
-	PodProcess *PodProcessMetric   `json:"pod_process"`
-	PodVolumes []*PodVolumeMetric  `json:"pod_volume"`
-	PodDiskIos PodDiskIoMetrics    `json:"pod_disk_ios"`
-	Containers []*ContainerMetrics `json:"containers"`
+	PodCpu       *PodCpuMetric          `json:"pod_cpu"`
+	PodMemory    *PodMemoryMetric       `json:"pod_memory"`
+	PodProcess   *PodProcessMetric      `json:"pod_process"`
+	PodVolumes   []*PodVolumeMetric     `json:"pod_volume"`
+	PodDiskIos   PodDiskIoMetrics       `json:"pod_disk_ios"`
+	PodNvidiaGpu []*PodNvidiaGpuMetrics `json:"pod_nvidia_gpu"`
+	Containers   []*ContainerMetrics    `json:"containers"`
 }
 
 type PodMetricMeta struct {
@@ -95,6 +110,45 @@ func (m PodMetricMeta) GetTag() map[string]string {
 	return nil
 }
 
+type PodNvidiaGpuMetrics struct {
+	PodMetricMeta
+
+	Index         int
+	PhysicalIndex int
+	MemTotal      int
+
+	Framebuffer int     // Framebuffer Memory Usage
+	Ccpm        int     // Current CUDA Contexts Per Measurement
+	SmUtil      float64 // Streaming Multiprocessor Utilization
+	MemUtil     float64 // Memory Utilization
+	EncUtil     float64 // Encoder Utilization
+	DecUtil     float64 // Decoder Utilization
+	JpgUtil     float64 // JPEG Decoder Utilization
+	OfaUtil     float64 // Optical Flow Accelerator Utilization
+}
+
+func (m PodNvidiaGpuMetrics) GetName() string {
+	return "pod_nvidia_gpu_metrics"
+}
+
+func (m PodNvidiaGpuMetrics) ToMap() map[string]interface{} {
+	ret := map[string]interface{}{
+		NVIDIA_GPU_MEMORY_TOTAL:   m.MemTotal,
+		NVIDIA_GPU_INDEX:          m.Index,
+		NVIDIA_GPU_PHYSICAL_INDEX: m.PhysicalIndex,
+		NVIDIA_GPU_FRAME_BUFFER:   m.Framebuffer,
+		NVIDIA_GPU_CCPM:           m.Ccpm,
+		NVIDIA_GPU_SM:             m.SmUtil,
+		NVIDIA_GPU_MEM_UTIL:       m.MemUtil,
+		NVIDIA_GPU_ENC:            m.EncUtil,
+		NVIDIA_GPU_DEC:            m.DecUtil,
+		NVIDIA_GPU_JPG:            m.JpgUtil,
+		NVIDIA_GPU_OFA:            m.OfaUtil,
+	}
+
+	return ret
+}
+
 type PodCpuMetric struct {
 	PodMetricMeta
 	CpuUsageSecondsTotal float64 `json:"cpu_usage_seconds_total"`
@@ -382,18 +436,32 @@ func (m *ContainerDiskIoMetric) GetTag() map[string]string {
 	return baseTags
 }
 
-func GetPodStatsById(stats []stats.PodStats, podId string) *stats.PodStats {
-	for _, stat := range stats {
-		if stat.PodRef.UID == podId {
-			tmp := stat
-			return &tmp
+func GetPodStatsById(ss []stats.PodStats, nvPodProcs map[string]map[string]struct{}, podId string) (*stats.PodStats, map[string]struct{}) {
+	var podStat *stats.PodStats
+	for i := range ss {
+		if ss[i].PodRef.UID == podId {
+			podStat = &ss[i]
+			break
 		}
 	}
-	return nil
+	podProcs := nvPodProcs[podId]
+	return podStat, podProcs
+}
+
+func GetPodNvidiaGpuMetrics(metrics []NvidiaGpuProcessMetrics, podProcs map[string]struct{}) []NvidiaGpuProcessMetrics {
+	podMetrics := make([]NvidiaGpuProcessMetrics, 0)
+	for i := range metrics {
+		pid := metrics[i].Pid
+		if _, ok := podProcs[pid]; ok {
+			podMetrics = append(podMetrics, metrics[i])
+		}
+	}
+	return podMetrics
 }
 
 func (s *SGuestMonitorCollector) collectPodMetrics(gm *SGuestMonitor, prevUsage *GuestMetrics) *GuestMetrics {
 	gmData := new(GuestMetrics)
 	gmData.PodMetrics = gm.PodMetrics(prevUsage)
 
 	// netio
@@ -564,11 +632,12 @@ func (m *SGuestMonitor) PodMetrics(prevUsage *GuestMetrics) *PodMetrics {
 	}
 
 	pm := &PodMetrics{
-		PodCpu:     podCpu,
-		PodMemory:  podMemory,
-		PodProcess: podProcess,
-		PodVolumes: m.getVolumeMetrics(),
-		Containers: containers,
+		PodCpu:       podCpu,
+		PodMemory:    podMemory,
+		PodProcess:   podProcess,
+		PodVolumes:   m.getVolumeMetrics(),
+		PodNvidiaGpu: m.getPodNvidiaGpuMetrics(),
+		Containers:   containers,
 	}
 
 	if stat.DiskIo != nil {
@@ -587,6 +656,49 @@ func (m *SGuestMonitor) PodMetrics(prevUsage *GuestMetrics) *PodMetrics {
 	return pm
 }
 
+func (m *SGuestMonitor) getPodNvidiaGpuMetrics() []*PodNvidiaGpuMetrics {
+	if len(m.nvidiaGpuMetrics) == 0 {
+		return nil
+	}
+	// aggregate per-process metrics by physical GPU index
+	indexGpuMap := map[int]*PodNvidiaGpuMetrics{}
+	for i := range m.nvidiaGpuMetrics {
+		index := m.nvidiaGpuMetrics[i].Index
+		gms, ok := indexGpuMap[index]
+		if !ok {
+			gms = new(PodNvidiaGpuMetrics)
+		}
+		gms.Framebuffer += m.nvidiaGpuMetrics[i].FB
+		gms.Ccpm += m.nvidiaGpuMetrics[i].Ccpm
+		gms.SmUtil += m.nvidiaGpuMetrics[i].Sm
+		gms.EncUtil += m.nvidiaGpuMetrics[i].Enc
+		gms.DecUtil += m.nvidiaGpuMetrics[i].Dec
+		gms.JpgUtil += m.nvidiaGpuMetrics[i].Jpg
+		gms.OfaUtil += m.nvidiaGpuMetrics[i].Ofa
+		indexGpuMap[index] = gms
+	}
+
+	indexs := make([]int, 0)
+	for index, gms := range indexGpuMap {
+		indexs = append(indexs, index)
+		indexStr := strconv.Itoa(index)
+		memSizeTotal, ok := m.nvidiaGpuIndexMemoryMap[indexStr]
+		if !ok {
+			continue
+		}
+		gms.MemTotal = memSizeTotal
+		gms.MemUtil = float64(gms.Framebuffer) / float64(gms.MemTotal)
+	}
+	sort.Ints(indexs)
+	res := make([]*PodNvidiaGpuMetrics, 0, len(indexs))
+	for i := range indexs {
+		gms := indexGpuMap[indexs[i]]
+		gms.PhysicalIndex = indexs[i]
+		gms.Index = i
+		res = append(res, gms)
+	}
+	return res
+}
+
 type iPodMetric interface {
 	GetName() string
 	GetTag() map[string]string
diff --git a/pkg/hostman/hostmetrics/container_nvidia_gpu_metrics.go b/pkg/hostman/hostmetrics/container_nvidia_gpu_metrics.go
new file mode 100644
index 00000000000..80115cc035c
--- /dev/null
+++ b/pkg/hostman/hostmetrics/container_nvidia_gpu_metrics.go
@@ -0,0 +1,187 @@
+// Copyright 2019 Yunion
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hostmetrics
+
+import (
+	"fmt"
+	"os"
+	"path"
+	"strconv"
+	"strings"
+
+	"yunion.io/x/log"
+	"yunion.io/x/pkg/errors"
+
+	"yunion.io/x/onecloud/pkg/hostman/guestman"
+	"yunion.io/x/onecloud/pkg/util/cgrouputils"
+	"yunion.io/x/onecloud/pkg/util/procutils"
+)
+
+type NvidiaGpuProcessMetrics struct {
+	Index   int     // GPU Index
+	Pid     string  // Process ID
+	Type    string  // Process Type C/G, Compute or Graphics
+	FB      int     // Framebuffer Memory Usage
+	Ccpm    int     // Current CUDA Contexts Per Measurement
+	Sm      float64 // Streaming Multiprocessor Utilization
+	Mem     float64 // Memory Utilization
+	Enc     float64 // Encoder Utilization
+	Dec     float64 // Decoder Utilization
+	Jpg     float64 // JPEG Decoder Utilization
+	Ofa     float64 // Optical Flow Accelerator Utilization
+	Command string  // Process Command Name
+}
+
+func GetNvidiaGpuProcessMetrics() ([]NvidiaGpuProcessMetrics, error) {
+	cmd := "nvidia-smi pmon -s mu -c 1"
+	output, err := procutils.NewRemoteCommandAsFarAsPossible("bash", "-c", cmd).Output()
+	if err != nil {
+		return nil, errors.Wrapf(err, "Execute %s failed", cmd)
+	}
+	return parseGpuProcessMetrics(string(output)), nil
+}
+
+/*
+# gpu         pid  type    fb   ccpm    sm   mem   enc   dec   jpg   ofa   command
+# Idx           #   C/G    MB     MB     %     %     %     %     %     %   name
+*/
+func parseGpuProcessMetrics(gpuMetricsStr string) []NvidiaGpuProcessMetrics {
+	gpuProcessMetrics := make([]NvidiaGpuProcessMetrics, 0)
+
+	lines := strings.Split(gpuMetricsStr, "\n")
+	for _, line := range lines {
+		// Skip comments and blank lines
+		if strings.HasPrefix(line, "#") || len(strings.TrimSpace(line)) == 0 {
+			continue
+		}
+
+		var processMetrics NvidiaGpuProcessMetrics
+		var fb, ccpm, sm, mem, enc, dec, jpg, ofa string
+		_, err := fmt.Sscanf(line, "%d %s %s %s %s %s %s %s %s %s %s %s",
+			&processMetrics.Index, &processMetrics.Pid, &processMetrics.Type, &fb, &ccpm,
+			&sm, &mem, &enc, &dec, &jpg, &ofa, &processMetrics.Command)
+		if err != nil {
+			log.Errorf("failed parse nvidia gpu metrics %s: %s", line, err)
+			continue
+		}
+		if processMetrics.Command == "nvidia-cuda-mps" || processMetrics.Command == "-" {
+			continue
+		}
+		if fb != "-" {
+			val, err := strconv.Atoi(fb)
+			if err != nil {
+				log.Errorf("failed parse fb %s: %s", fb, err)
+			}
+			processMetrics.FB = val
+		}
+		if ccpm != "-" {
+			val, err := strconv.Atoi(ccpm)
+			if err != nil {
+				log.Errorf("failed parse ccpm %s: %s", ccpm, err)
+			}
+			processMetrics.Ccpm = val
+		}
+		if sm != "-" {
+			val, err := strconv.ParseFloat(sm, 64)
+			if err != nil {
+				log.Errorf("failed parse sm %s: %s", sm, err)
+			}
+			processMetrics.Sm = val
+		}
+		if mem != "-" {
+			val, err := strconv.ParseFloat(mem, 64)
+			if err != nil {
+				log.Errorf("failed parse mem %s: %s", mem, err)
+			}
+			processMetrics.Mem = val
+		}
+		if enc != "-" {
+			val, err := strconv.ParseFloat(enc, 64)
+			if err != nil {
+				log.Errorf("failed parse enc %s: %s", enc, err)
+			}
+			processMetrics.Enc = val
+		}
+		if dec != "-" {
+			val, err := strconv.ParseFloat(dec, 64)
+			if err != nil {
+				log.Errorf("failed parse dec %s: %s", dec, err)
+			}
+			processMetrics.Dec = val
+		}
+		if jpg != "-" {
+			val, err := strconv.ParseFloat(jpg, 64)
+			if err != nil {
+				log.Errorf("failed parse jpg %s: %s", jpg, err)
+			}
+			processMetrics.Jpg = val
+		}
+		if ofa != "-" {
+			val, err := strconv.ParseFloat(ofa, 64)
+			if err != nil {
+				log.Errorf("failed parse ofa %s: %s", ofa, err)
+			}
+			processMetrics.Ofa = val
+		}
+
+		gpuProcessMetrics = append(gpuProcessMetrics, processMetrics)
+	}
+	return gpuProcessMetrics
+}
+
+func (s *SGuestMonitorCollector) collectNvidiaGpuPodsProcesses() map[string]map[string]struct{} {
+	podProcIds := map[string]map[string]struct{}{}
+	guestmanager := guestman.GetGuestManager()
+	cgroupRoot := path.Join(cgrouputils.RootTaskPath("cpuset"), "cloudpods")
+	guestmanager.Servers.Range(func(k, v interface{}) bool {
+		pod, ok := v.(guestman.PodInstance)
+		if !ok {
+			return true
+		}
+		if !pod.IsRunning() {
+			return true
+		}
+
+		criIds := pod.GetPodContainerCriIds()
+		procs := map[string]struct{}{}
+		for i := range criIds {
+			cgroupPath := path.Join(cgroupRoot, criIds[i], "cgroup.procs")
+			pids, err := ReadProcessFromCgroupProcs(cgroupPath)
+			if err != nil {
+				log.Errorf("collectNvidiaGpuPodsProcesses: %s", err)
+				continue
+			}
+			for _, pid := range pids {
+				procs[pid] = struct{}{}
+			}
+		}
+		if len(procs) > 0 {
+			podProcIds[pod.GetId()] = procs
+		}
+		return true
+	})
+	return podProcIds
+}
+
+func ReadProcessFromCgroupProcs(procFilePath string) ([]string, error) {
+	out, err := os.ReadFile(procFilePath)
+	if err != nil {
+		return nil, errors.Wrap(err, "os.ReadFile")
+	}
+
+	pids := make([]string, 0)
+	for _, pid := range strings.Split(string(out), "\n") {
+		pid = strings.TrimSpace(pid)
+		if pid != "" {
+			pids = append(pids, pid)
+		}
+	}
+	return pids, nil
+}
diff --git a/pkg/hostman/hostmetrics/hostmetrics.go b/pkg/hostman/hostmetrics/hostmetrics.go
index 97b0c04064a..193a138b763 100644
--- a/pkg/hostman/hostmetrics/hostmetrics.go
+++ b/pkg/hostman/hostmetrics/hostmetrics.go
@@ -58,6 +58,8 @@ var hostMetricsCollector *SHostMetricsCollector
 
 type IHostInfo interface {
 	GetContainerStatsProvider() stats.ContainerStatsProvider
+	HasContainerNvidiaGpu() bool
+	GetNvidiaGpuIndexMemoryMap() map[string]int
 }
 
 func Init(hostInfo IHostInfo) {
@@ -178,6 +180,8 @@ func (s *SGuestMonitorCollector) GetGuests() map[string]*SGuestMonitor {
 	guestmanager := guestman.GetGuestManager()
 
 	var podStats []stats.PodStats = nil
+	var nvidiaGpuMetrics []NvidiaGpuProcessMetrics = nil
+	nvPodProcs := s.collectNvidiaGpuPodsProcesses()
 
 	guestmanager.Servers.Range(func(k, v interface{}) bool {
 		instance, ok := v.(guestman.GuestRuntimeInstance)
@@ -234,13 +238,28 @@ func (s *SGuestMonitorCollector) GetGuests() map[string]*SGuestMonitor {
 				log.Errorf("ListPodCPUAndMemoryStats: %s", err)
 				return true
 			}
+			if s.hostInfo.HasContainerNvidiaGpu() {
+				nvidiaGpuMetrics, err = GetNvidiaGpuProcessMetrics()
+				if err != nil {
+					log.Errorf("GetNvidiaGpuProcessMetrics: %s", err)
+				}
+			}
 		}
-		podStat := GetPodStatsById(podStats, guestId)
+		podStat, nvProcs := GetPodStatsById(podStats, nvPodProcs, guestId)
 		if podStat != nil {
-			gm, err := NewGuestPodMonitor(instance, guestName, guestId, podStat, nicsDesc, int(vcpuCount))
+			var nvGpuMetrics []NvidiaGpuProcessMetrics
+			if len(nvProcs) > 0 {
+				nvGpuMetrics = GetPodNvidiaGpuMetrics(nvidiaGpuMetrics, nvProcs)
+			}
+			gm, err := NewGuestPodMonitor(instance, guestName, guestId, podStat, nvGpuMetrics, nicsDesc, int(vcpuCount))
 			if err != nil {
 				return true
 			}
+			if len(nvProcs) > 0 {
+				gm.nvidiaGpuIndexMemoryMap = s.hostInfo.GetNvidiaGpuIndexMemoryMap()
+			}
 			gm.UpdateByInstance(instance)
 			gms[guestId] = gm
 			return true
@@ -548,22 +567,24 @@ func (s *SGuestMonitorCollector) reportNetIo(cur, prev *NetIOMetric) {
 }
 
 type SGuestMonitor struct {
-	Name           string
-	Id             string
-	Pid            int
-	Nics           []*desc.SGuestNetwork
-	CpuCnt         int
-	MemMB          int64
-	Ip             string
-	Process        *process.Process
-	ScalingGroupId string
-	Tenant         string
-	TenantId       string
-	DomainId       string
-	ProjectDomain  string
-	podStat        *stats.PodStats
-	instance       guestman.GuestRuntimeInstance
-	sysFs          sysfs.SysFs
+	Name                    string
+	Id                      string
+	Pid                     int
+	Nics                    []*desc.SGuestNetwork
+	CpuCnt                  int
+	MemMB                   int64
+	Ip                      string
+	Process                 *process.Process
+	ScalingGroupId          string
+	Tenant                  string
+	TenantId                string
+	DomainId                string
+	ProjectDomain           string
+	podStat                 *stats.PodStats
+	nvidiaGpuMetrics        []NvidiaGpuProcessMetrics
+	nvidiaGpuIndexMemoryMap map[string]int
+	instance                guestman.GuestRuntimeInstance
+	sysFs                   sysfs.SysFs
 }
 
 func NewGuestMonitor(instance guestman.GuestRuntimeInstance, name, id string, pid int, nics []*desc.SGuestNetwork, cpuCount int) (*SGuestMonitor, error) {
@@ -574,12 +595,17 @@ func NewGuestMonitor(instance guestman.GuestRuntimeInstance, name, id string, pid int, nics []*desc.SGuestNetwork, cpuCount int) (*SGuestMonitor, error) {
 	return newGuestMonitor(instance, name, id, proc, nics, cpuCount)
 }
 
-func NewGuestPodMonitor(instance guestman.GuestRuntimeInstance, name, id string, stat *stats.PodStats, nics []*desc.SGuestNetwork, cpuCount int) (*SGuestMonitor, error) {
+func NewGuestPodMonitor(
+	instance guestman.GuestRuntimeInstance, name, id string,
+	stat *stats.PodStats, nvGpuMetrics []NvidiaGpuProcessMetrics,
+	nics []*desc.SGuestNetwork, cpuCount int,
+) (*SGuestMonitor, error) {
 	m, err := newGuestMonitor(instance, name, id, nil, nics, cpuCount)
 	if err != nil {
 		return nil, errors.Wrap(err, "new pod GuestMonitor")
 	}
 	m.podStat = stat
+	m.nvidiaGpuMetrics = nvGpuMetrics
 	return m, nil
 }
 
diff --git a/pkg/hostman/isolated_device/container_device/nvidia_gpu.go b/pkg/hostman/isolated_device/container_device/nvidia_gpu.go
index e5395a481eb..d886698a0e7 100644
--- a/pkg/hostman/isolated_device/container_device/nvidia_gpu.go
+++ b/pkg/hostman/isolated_device/container_device/nvidia_gpu.go
@@ -82,6 +82,17 @@ func (m *nvidiaGPUManager) GetContainerExtraConfigures(devs []*hostapi.Container
 
 type nvidiaGPU struct {
 	*BaseDevice
+
+	memSize  int
+	gpuIndex string
+}
+
+func (dev *nvidiaGPU) GetNvidiaDevMemSize() int {
+	return dev.memSize
+}
+
+func (dev *nvidiaGPU) GetNvidiaDevIndex() string {
+	return dev.gpuIndex
 }
 
 func getNvidiaGPUs() ([]isolated_device.IDevice, error) {
@@ -91,7 +102,7 @@ func getNvidiaGPUs() ([]isolated_device.IDevice, error) {
 	// GPU-bc1a3bb9-55cb-8c52-c374-4f8b4f388a20, NVIDIA A800-SXM4-80GB, 00000000:10:00.0
 	// nvidia-smi --query-gpu=gpu_uuid,gpu_name,gpu_bus_id,memory.total,compute_mode --format=csv
 
-	out, err := procutils.NewRemoteCommandAsFarAsPossible("nvidia-smi", "--query-gpu=gpu_uuid,gpu_name,gpu_bus_id,compute_mode", "--format=csv").Output()
+	out, err := procutils.NewRemoteCommandAsFarAsPossible("nvidia-smi", "--query-gpu=gpu_uuid,gpu_name,gpu_bus_id,compute_mode,memory.total,index", "--format=csv").Output()
 	if err != nil {
 		return nil, errors.Wrap(err, "nvidia-smi")
 	}
@@ -101,15 +112,19 @@ func getNvidiaGPUs() ([]isolated_device.IDevice, error) {
 			continue
 		}
 		segs := strings.Split(line, ",")
-		if len(segs) != 4 {
+		if len(segs) != 6 {
 			log.Errorf("unknown nvidia-smi out line %s", line)
 			continue
 		}
-		gpuId, gpuName, gpuPciAddr, computeMode := strings.TrimSpace(segs[0]), strings.TrimSpace(segs[1]), strings.TrimSpace(segs[2]), strings.TrimSpace(segs[3])
+		gpuId, gpuName, gpuPciAddr, computeMode, memTotal, index := strings.TrimSpace(segs[0]), strings.TrimSpace(segs[1]), strings.TrimSpace(segs[2]), strings.TrimSpace(segs[3]), strings.TrimSpace(segs[4]), strings.TrimSpace(segs[5])
 		if computeMode != "Default" {
 			log.Warningf("gpu device %s compute mode %s, skip.", gpuId, computeMode)
 			continue
 		}
+		memSize, err := parseMemSize(memTotal)
+		if err != nil {
+			return nil, errors.Wrapf(err, "failed parse memSize %s", memTotal)
+		}
 		pciOutput, err := isolated_device.GetPCIStrByAddr(gpuPciAddr)
 		if err != nil {
@@ -118,6 +133,8 @@ func getNvidiaGPUs() ([]isolated_device.IDevice, error) {
 		dev := isolated_device.NewPCIDevice2(pciOutput[0])
 		gpuDev := &nvidiaGPU{
 			BaseDevice: NewBaseDevice(dev, isolated_device.ContainerDeviceTypeNvidiaGpu, gpuId),
+			memSize:    memSize,
+			gpuIndex:   index,
 		}
 		gpuDev.SetModelName(gpuName)
 
diff --git a/pkg/hostman/isolated_device/container_device/nvidia_mps.go b/pkg/hostman/isolated_device/container_device/nvidia_mps.go
index 902675aa268..adfb93b6d9d 100644
--- a/pkg/hostman/isolated_device/container_device/nvidia_mps.go
+++ b/pkg/hostman/isolated_device/container_device/nvidia_mps.go
@@ -113,6 +113,16 @@ type nvidiaMPS struct {
 	MemSizeMB        int
 	MemTotalMB       int
 	ThreadPercentage int
+
+	gpuIndex string
+}
+
+func (dev *nvidiaMPS) GetNvidiaDevMemSize() int {
+	return dev.MemSizeMB
+}
+
+func (dev *nvidiaMPS) GetNvidiaDevIndex() string {
+	return dev.gpuIndex
 }
 
 func (c *nvidiaMPS) GetNvidiaMpsMemoryLimit() int {
@@ -139,7 +149,7 @@ func getNvidiaMPSGpus() ([]isolated_device.IDevice, error) {
 	devs := make([]isolated_device.IDevice, 0)
 	// nvidia-smi --query-gpu=gpu_uuid,gpu_name,gpu_bus_id,memory.total,compute_mode --format=csv
 	// GPU-76aef7ff-372d-2432-b4b4-beca4d8d3400, Tesla P40, 00000000:00:08.0, 23040 MiB, Exclusive_Process
-	out, err := procutils.NewRemoteCommandAsFarAsPossible("nvidia-smi", "--query-gpu=gpu_uuid,gpu_name,gpu_bus_id,memory.total,compute_mode", "--format=csv").Output()
+	out, err := procutils.NewRemoteCommandAsFarAsPossible("nvidia-smi", "--query-gpu=gpu_uuid,gpu_name,gpu_bus_id,memory.total,compute_mode,index", "--format=csv").Output()
 	if err != nil {
 		return nil, errors.Wrap(err, "nvidia-smi")
 	}
@@ -149,11 +159,11 @@ func getNvidiaMPSGpus() ([]isolated_device.IDevice, error) {
 			continue
 		}
 		segs := strings.Split(line, ",")
-		if len(segs) != 5 {
+		if len(segs) != 6 {
 			log.Errorf("unknown nvidia-smi out line %s", line)
 			continue
 		}
-		gpuId, gpuName, gpuPciAddr, memTotal, computeMode := strings.TrimSpace(segs[0]), strings.TrimSpace(segs[1]), strings.TrimSpace(segs[2]), strings.TrimSpace(segs[3]), strings.TrimSpace(segs[4])
+		gpuId, gpuName, gpuPciAddr, memTotal, computeMode, index := strings.TrimSpace(segs[0]), strings.TrimSpace(segs[1]), strings.TrimSpace(segs[2]), strings.TrimSpace(segs[3]), strings.TrimSpace(segs[4]), strings.TrimSpace(segs[5])
 		if computeMode != "Exclusive_Process" {
 			log.Warningf("gpu device %s compute mode %s, skip.", gpuId, computeMode)
 			continue
@@ -174,6 +184,7 @@ func getNvidiaMPSGpus() ([]isolated_device.IDevice, error) {
 			MemSizeMB:        memSize / options.HostOptions.CudaMPSReplicas,
 			MemTotalMB:       memSize,
 			ThreadPercentage: 100 / options.HostOptions.CudaMPSReplicas,
+			gpuIndex:         index,
 		}
 		gpuDev.SetModelName(gpuName)
 		devAddr := gpuDev.GetAddr()
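
Note: below is a minimal sketch of a unit test for the new `nvidia-smi pmon` parser, not part of the diff above. The file name (container_nvidia_gpu_metrics_test.go) and the sample pmon line are assumptions for illustration; only parseGpuProcessMetrics and NvidiaGpuProcessMetrics come from the patch.

// container_nvidia_gpu_metrics_test.go (hypothetical), package hostmetrics
package hostmetrics

import "testing"

func TestParseGpuProcessMetrics(t *testing.T) {
	// Sample output shaped like `nvidia-smi pmon -s mu -c 1`; values are illustrative only.
	sample := `# gpu         pid  type    fb   ccpm    sm   mem   enc   dec   jpg   ofa   command
# Idx           #   C/G    MB     MB     %     %     %     %     %     %   name
    0       1217     C   123      -     45     5     -     -     -     -   python3
`
	metrics := parseGpuProcessMetrics(sample)
	if len(metrics) != 1 {
		t.Fatalf("expected 1 process metric, got %d", len(metrics))
	}
	m := metrics[0]
	// Comment lines are skipped; "-" fields stay at their zero value.
	if m.Index != 0 || m.Pid != "1217" || m.FB != 123 || m.Sm != 45 || m.Command != "python3" {
		t.Fatalf("unexpected metric: %+v", m)
	}
}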