Skip to content

Commit

Permalink
feat(host): container nvidia gpu metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
wanyaoqi committed Jan 10, 2025
1 parent 8c51743 commit 56a4f2d
Show file tree
Hide file tree
Showing 8 changed files with 456 additions and 42 deletions.
20 changes: 20 additions & 0 deletions pkg/hostman/guestman/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ type PodInstance interface {

IsInternalStopped(ctrCriId string) (*ContainerExpectedStatus, bool)
IsInternalRemoved(ctrCriId string) bool

GetPodContainerCriIds() []string
}

type sContainer struct {
Expand Down Expand Up @@ -593,6 +595,24 @@ func (s *sPodGuestInstance) GetContainers() []*hostapi.ContainerDesc {
return s.GetDesc().Containers
}

// GetPodContainerCriIds collects the CRIId of every entry in s.containers.
// The result is always non-nil, even when the pod tracks no containers.
func (s *sPodGuestInstance) GetPodContainerCriIds() []string {
	ids := make([]string, 0)
	for _, ctr := range s.containers {
		ids = append(ids, ctr.CRIId)
	}
	return ids
}

// HasContainerNvidiaGpu reports whether any isolated device in the pod's
// description has the container NVIDIA GPU or NVIDIA MPS device type.
func (s *sPodGuestInstance) HasContainerNvidiaGpu() bool {
	devs := s.Desc.IsolatedDevices
	for idx := range devs {
		devType := devs[idx].DevType
		isMps := devType == computeapi.CONTAINER_DEV_NVIDIA_MPS
		isGpu := devType == computeapi.CONTAINER_DEV_NVIDIA_GPU
		if isMps || isGpu {
			return true
		}
	}
	return false
}

func (s *sPodGuestInstance) GetContainerById(ctrId string) *hostapi.ContainerDesc {
ctrs := s.GetContainers()
for i := range ctrs {
Expand Down
39 changes: 39 additions & 0 deletions pkg/hostman/hostinfo/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@ import (
"yunion.io/x/log"
"yunion.io/x/pkg/errors"

apis "yunion.io/x/onecloud/pkg/apis/compute"
hostapi "yunion.io/x/onecloud/pkg/apis/host"
"yunion.io/x/onecloud/pkg/hostman/isolated_device"
"yunion.io/x/onecloud/pkg/hostman/options"
"yunion.io/x/onecloud/pkg/util/pod"
"yunion.io/x/onecloud/pkg/util/pod/cadvisor"
Expand Down Expand Up @@ -76,3 +78,40 @@ func (h *SHostInfo) GetContainerCPUMap() *pod.HostContainerCPUMap {
// GetContainerStatsProvider returns the container stats provider stored on the
// host info; no initialisation is performed here.
func (h *SHostInfo) GetContainerStatsProvider() stats.ContainerStatsProvider {
	provider := h.containerStatsProvider
	return provider
}

// INvidiaGpuIndexMemoryInterface is the optional capability a device exposes
// when it can report its NVIDIA device index and memory size. Devices are
// probed for it with a type assertion.
type INvidiaGpuIndexMemoryInterface interface {
	// GetNvidiaDevIndex returns the device index as a string.
	GetNvidiaDevIndex() string
	// GetNvidiaDevMemSize returns the device memory size
	// (unit not visible here — TODO confirm against implementers).
	GetNvidiaDevMemSize() int
}

// GetNvidiaGpuIndexMemoryMap builds a map from NVIDIA device index to device
// memory size for every cached container NVIDIA GPU that implements
// INvidiaGpuIndexMemoryInterface. Devices that do not implement it are
// skipped. The map is rebuilt on each call and is never nil.
func (h *SHostInfo) GetNvidiaGpuIndexMemoryMap() map[string]int {
	memByIndex := make(map[string]int)
	for _, dev := range h.containerNvidiaGpus {
		nvDev, implemented := dev.(INvidiaGpuIndexMemoryInterface)
		if implemented {
			devIndex := nvDev.GetNvidiaDevIndex()
			devMem := nvDev.GetNvidiaDevMemSize()
			memByIndex[devIndex] = devMem
		}
	}
	return memByIndex
}

// HasContainerNvidiaGpu reports whether the host's isolated device manager
// holds at least one container NVIDIA GPU or NVIDIA MPS device.
//
// The answer is computed on the first call and cached in h.hasNvidiaGpus; the
// matching devices are cached in h.containerNvidiaGpus at the same time.
// Later calls return the cached answer without rescanning.
func (h *SHostInfo) HasContainerNvidiaGpu() bool {
	if cached := h.hasNvidiaGpus; cached != nil {
		return *cached
	}

	matched := make([]isolated_device.IDevice, 0)
	for _, dev := range h.IsolatedDeviceMan.GetDevices() {
		devType := dev.GetDeviceType()
		if devType != apis.CONTAINER_DEV_NVIDIA_GPU && devType != apis.CONTAINER_DEV_NVIDIA_MPS {
			continue
		}
		matched = append(matched, dev)
	}

	found := len(matched) > 0
	h.hasNvidiaGpus = &found
	h.containerNvidiaGpus = matched
	return found
}
2 changes: 2 additions & 0 deletions pkg/hostman/hostinfo/hostinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ type SHostInfo struct {
containerCPUMap *pod.HostContainerCPUMap
containerStatsProvider stats.ContainerStatsProvider
containerCpufreqSimulateConfig *jsonutils.JSONDict
containerNvidiaGpus []isolated_device.IDevice
hasNvidiaGpus *bool
}

func (h *SHostInfo) GetContainerDeviceConfigurationFilePath() string {
Expand Down
146 changes: 129 additions & 17 deletions pkg/hostman/hostmetrics/container_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ package hostmetrics

import (
"fmt"
"sort"
"strconv"
"strings"
"time"

Expand Down Expand Up @@ -49,6 +51,18 @@ const (
SOCKET_COUNT = "socket_count"
THREADS_CURRENT = "threads_current"
THREADS_MAX = "threads_max"

NVIDIA_GPU_MEMORY_TOTAL = "nvidia_gpu_memory_total"
NVIDIA_GPU_INDEX = "nvidia_gpu_index"
NVIDIA_GPU_PHYSICAL_INDEX = "nvidia_gpu_physical_index"
NVIDIA_GPU_FRAME_BUFFER = "nvidia_gpu_frame_buffer"
NVIDIA_GPU_CCPM = "nvidia_gpu_ccpm"
NVIDIA_GPU_SM = "nvidia_gpu_sm"
NVIDIA_GPU_MEM_UTIL = "nvidia_gpu_mem_util"
NVIDIA_GPU_ENC = "nvidia_gpu_enc"
NVIDIA_GPU_DEC = "nvidia_gpu_dec"
NVIDIA_GPU_JPG = "nvidia_gpu_jpg"
NVIDIA_GPU_OFA = "nvidia_gpu_ofa"
)

type CadvisorProcessMetric struct {
Expand All @@ -75,12 +89,13 @@ func (m CadvisorProcessMetric) ToMap() map[string]interface{} {
}

type PodMetrics struct {
PodCpu *PodCpuMetric `json:"pod_cpu"`
PodMemory *PodMemoryMetric `json:"pod_memory"`
PodProcess *PodProcessMetric `json:"pod_process"`
PodVolumes []*PodVolumeMetric `json:"pod_volume"`
PodDiskIos PodDiskIoMetrics `json:"pod_disk_ios"`
Containers []*ContainerMetrics `json:"containers"`
PodCpu *PodCpuMetric `json:"pod_cpu"`
PodMemory *PodMemoryMetric `json:"pod_memory"`
PodProcess *PodProcessMetric `json:"pod_process"`
PodVolumes []*PodVolumeMetric `json:"pod_volume"`
PodDiskIos PodDiskIoMetrics `json:"pod_disk_ios"`
PodNvidiaGpu []*PodNvidiaGpuMetrics `json:"pod_nvidia_gpu"`
Containers []*ContainerMetrics `json:"containers"`
}

type PodMetricMeta struct {
Expand All @@ -95,6 +110,45 @@ func (m PodMetricMeta) GetTag() map[string]string {
return nil
}

// PodNvidiaGpuMetrics holds NVIDIA GPU usage for one GPU, aggregated over the
// GPU process samples attributed to a pod (see getPodNvidiaGpuMetrics).
// It is exported as the "pod_nvidia_gpu_metrics" metric via GetName/ToMap.
type PodNvidiaGpuMetrics struct {
PodMetricMeta

// Index is the position of this GPU in the pod's GPU list after sorting by
// the source GPU index.
Index int
// PhysicalIndex is presumably the host-side GPU index — NOTE(review): confirm
// it is populated as intended by getPodNvidiaGpuMetrics.
PhysicalIndex int
// MemTotal is the GPU memory size looked up by GPU index
// (unit not visible here — TODO confirm).
MemTotal int

// The fields below, except MemUtil, are sums over all samples for this GPU.
Framebuffer int // Framebuffer memory usage
Ccpm int // nvidia "ccpm" value; original gloss: Current CUDA Contexts Per Measurement — TODO confirm expansion
SmUtil float64 // Streaming Multiprocessor utilization
MemUtil float64 // Memory utilization, computed as Framebuffer / MemTotal
EncUtil float64 // Encoder utilization
DecUtil float64 // Decoder utilization
JpgUtil float64 // JPEG decoder utilization
OfaUtil float64 // "ofa" utilization; presumably Optical Flow Accelerator — TODO confirm
}

// GetName returns the fixed metric name under which pod NVIDIA GPU metrics
// are reported.
func (m PodNvidiaGpuMetrics) GetName() string {
	const name = "pod_nvidia_gpu_metrics"
	return name
}

// ToMap flattens the GPU metric into field-name/value pairs keyed by the
// NVIDIA_GPU_* constants. The embedded PodMetricMeta is not included.
func (m PodNvidiaGpuMetrics) ToMap() map[string]interface{} {
	fields := make(map[string]interface{}, 11)

	// Identity and capacity.
	fields[NVIDIA_GPU_INDEX] = m.Index
	fields[NVIDIA_GPU_PHYSICAL_INDEX] = m.PhysicalIndex
	fields[NVIDIA_GPU_MEMORY_TOTAL] = m.MemTotal

	// Memory usage.
	fields[NVIDIA_GPU_FRAME_BUFFER] = m.Framebuffer
	fields[NVIDIA_GPU_CCPM] = m.Ccpm
	fields[NVIDIA_GPU_MEM_UTIL] = m.MemUtil

	// Engine utilization.
	fields[NVIDIA_GPU_SM] = m.SmUtil
	fields[NVIDIA_GPU_ENC] = m.EncUtil
	fields[NVIDIA_GPU_DEC] = m.DecUtil
	fields[NVIDIA_GPU_JPG] = m.JpgUtil
	fields[NVIDIA_GPU_OFA] = m.OfaUtil

	return fields
}

type PodCpuMetric struct {
PodMetricMeta
CpuUsageSecondsTotal float64 `json:"cpu_usage_seconds_total"`
Expand Down Expand Up @@ -382,18 +436,32 @@ func (m *ContainerDiskIoMetric) GetTag() map[string]string {
return baseTags
}

func GetPodStatsById(stats []stats.PodStats, podId string) *stats.PodStats {
for _, stat := range stats {
if stat.PodRef.UID == podId {
tmp := stat
return &tmp
func GetPodStatsById(ss []stats.PodStats, nvPodProcs map[string]map[string]struct{}, podId string) (*stats.PodStats, map[string]struct{}) {
var podStat *stats.PodStats
for i := range ss {
if ss[i].PodRef.UID == podId {
podStat = &ss[i]
break
}
}
return nil
podProcs, _ := nvPodProcs[podId]
return podStat, podProcs
}

// GetPodNvidiaGpuMetrics filters metrics down to the entries whose Pid is a
// key of podProcs, preserving input order. A nil or empty podProcs yields an
// empty result. The returned slice is always non-nil.
func GetPodNvidiaGpuMetrics(metrics []NvidiaGpuProcessMetrics, podProcs map[string]struct{}) []NvidiaGpuProcessMetrics {
	owned := make([]NvidiaGpuProcessMetrics, 0)
	for idx := range metrics {
		if _, inPod := podProcs[metrics[idx].Pid]; !inPod {
			continue
		}
		owned = append(owned, metrics[idx])
	}
	return owned
}

func (s *SGuestMonitorCollector) collectPodMetrics(gm *SGuestMonitor, prevUsage *GuestMetrics) *GuestMetrics {
gmData := new(GuestMetrics)
s.hostInfo.GetContainerStatsProvider()
gmData.PodMetrics = gm.PodMetrics(prevUsage)

// netio
Expand Down Expand Up @@ -564,11 +632,12 @@ func (m *SGuestMonitor) PodMetrics(prevUsage *GuestMetrics) *PodMetrics {
}

pm := &PodMetrics{
PodCpu: podCpu,
PodMemory: podMemory,
PodProcess: podProcess,
PodVolumes: m.getVolumeMetrics(),
Containers: containers,
PodCpu: podCpu,
PodMemory: podMemory,
PodProcess: podProcess,
PodVolumes: m.getVolumeMetrics(),
PodNvidiaGpu: m.getPodNvidiaGpuMetrics(),
Containers: containers,
}

if stat.DiskIo != nil {
Expand All @@ -587,6 +656,49 @@ func (m *SGuestMonitor) PodMetrics(prevUsage *GuestMetrics) *PodMetrics {
return pm
}

// getPodNvidiaGpuMetrics aggregates the per-process NVIDIA GPU samples in
// m.nvidiaGpuMetrics into one PodNvidiaGpuMetrics per GPU.
//
// Samples are grouped by their GPU index. Framebuffer, Ccpm and the SM /
// encoder / decoder / JPEG / OFA utilizations are summed within each group.
// MemTotal is looked up in m.nvidiaGpuIndexMemoryMap by the decimal GPU index;
// when it is present and positive, MemUtil is Framebuffer / MemTotal. When it
// is missing or zero, MemTotal and MemUtil stay 0.
//
// The result is ordered by ascending GPU index. In each entry PhysicalIndex is
// that GPU index and Index is the entry's position in the result. Returns nil
// when there are no samples.
func (m *SGuestMonitor) getPodNvidiaGpuMetrics() []*PodNvidiaGpuMetrics {
	if len(m.nvidiaGpuMetrics) == 0 {
		return nil
	}

	// Sum per-process samples per GPU index.
	gpuByIndex := make(map[int]*PodNvidiaGpuMetrics)
	for i := range m.nvidiaGpuMetrics {
		proc := m.nvidiaGpuMetrics[i]
		gpu, ok := gpuByIndex[proc.Index]
		if !ok {
			gpu = new(PodNvidiaGpuMetrics)
			gpuByIndex[proc.Index] = gpu
		}
		gpu.Framebuffer += proc.FB
		gpu.Ccpm += proc.Ccpm
		gpu.SmUtil += proc.Sm
		gpu.EncUtil += proc.Enc
		gpu.DecUtil += proc.Dec
		gpu.JpgUtil += proc.Jpg
		gpu.OfaUtil += proc.Ofa
	}

	gpuIndexes := make([]int, 0, len(gpuByIndex))
	for gpuIndex, gpu := range gpuByIndex {
		gpuIndexes = append(gpuIndexes, gpuIndex)
		memTotal, ok := m.nvidiaGpuIndexMemoryMap[strconv.Itoa(gpuIndex)]
		if !ok {
			continue
		}
		gpu.MemTotal = memTotal
		// Guard the division: a zero total would yield +Inf or NaN.
		if memTotal > 0 {
			gpu.MemUtil = float64(gpu.Framebuffer) / float64(memTotal)
		}
	}
	// Map iteration order is random; sort for a deterministic result.
	sort.Ints(gpuIndexes)

	// Start at length 0 so append does not leave leading nil entries.
	res := make([]*PodNvidiaGpuMetrics, 0, len(gpuIndexes))
	for i, gpuIndex := range gpuIndexes {
		gpu := gpuByIndex[gpuIndex]
		// The map key is the only place the GPU's own index is retained.
		gpu.PhysicalIndex = gpuIndex
		gpu.Index = i
		res = append(res, gpu)
	}
	return res
}

type iPodMetric interface {
GetName() string
GetTag() map[string]string
Expand Down
Loading

0 comments on commit 56a4f2d

Please sign in to comment.