Skip to content

Commit

Permalink
feat(host): add vastaitech gpu metrics support
Browse files Browse the repository at this point in the history
  • Loading branch information
wanyaoqi committed Jan 13, 2025
1 parent 637e314 commit 608e869
Show file tree
Hide file tree
Showing 6 changed files with 322 additions and 31 deletions.
15 changes: 15 additions & 0 deletions pkg/hostman/hostinfo/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,21 @@ func (h *SHostInfo) GetNvidiaGpuIndexMemoryMap() map[string]int {
return res
}

// HasContainerVastaitechGpu reports whether any device returned by the
// isolated device manager has the container Vastaitech GPU device type.
//
// The answer is computed on the first call and cached in h.hasVastaitechGpus;
// later calls return the cached value without consulting the device manager
// again, so devices that appear or disappear afterwards are not reflected.
func (h *SHostInfo) HasContainerVastaitechGpu() bool {
	if h.hasVastaitechGpus != nil {
		return *h.hasVastaitechGpus
	}
	hasVastaitechGpus := false
	devs := h.IsolatedDeviceMan.GetDevices()
	for i := range devs {
		if devs[i].GetDeviceType() == apis.CONTAINER_DEV_VASTAITECH_GPU {
			hasVastaitechGpus = true
			// One match is enough to answer the question; stop scanning.
			break
		}
	}
	h.hasVastaitechGpus = &hasVastaitechGpus
	return hasVastaitechGpus
}

func (h *SHostInfo) HasContainerNvidiaGpu() bool {
if h.hasNvidiaGpus != nil {
return *h.hasNvidiaGpus
Expand Down
1 change: 1 addition & 0 deletions pkg/hostman/hostinfo/hostinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ type SHostInfo struct {
containerCpufreqSimulateConfig *jsonutils.JSONDict
containerNvidiaGpus []isolated_device.IDevice
hasNvidiaGpus *bool
hasVastaitechGpus *bool
}

func (h *SHostInfo) GetContainerDeviceConfigurationFilePath() string {
Expand Down
116 changes: 101 additions & 15 deletions pkg/hostman/hostmetrics/container_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,14 @@ const (
NVIDIA_GPU_DEC = "dec"
NVIDIA_GPU_JPG = "jpg"
NVIDIA_GPU_OFA = "ofa"

VASTAITECH_GPU_DEV_ID = "dev_id"
VASTAITECH_GPU_PCI_ADDR = "pci_addr"
VASTAITECH_GPU_ENC = "enc"
VASTAITECH_GPU_DEC = "dec"
VASTAITECH_GPU_GFX = "gfx"
VASTAITECH_GPU_MEM = "mem"
VASTAITECH_GPU_MEM_UTIL = "mem_util"
)

type CadvisorProcessMetric struct {
Expand All @@ -89,13 +97,14 @@ func (m CadvisorProcessMetric) ToMap() map[string]interface{} {
}

// PodMetrics groups every metric collected for a single pod: pod-level
// cpu/memory/process/volume/disk-io figures, per-GPU metrics for NVIDIA and
// Vastaitech devices, and the metrics of each container in the pod.
//
// Each field is declared exactly once; the pre-change copies of the field
// list that preceded the Vastaitech addition are dropped because a struct
// cannot declare the same field name twice.
type PodMetrics struct {
	PodCpu     *PodCpuMetric     `json:"pod_cpu"`
	PodMemory  *PodMemoryMetric  `json:"pod_memory"`
	PodProcess *PodProcessMetric `json:"pod_process"`
	PodVolumes []*PodVolumeMetric `json:"pod_volume"`
	PodDiskIos PodDiskIoMetrics  `json:"pod_disk_ios"`
	// PodNvidiaGpu holds one entry per NVIDIA GPU metric set for the pod.
	PodNvidiaGpu []*PodNvidiaGpuMetrics `json:"pod_nvidia_gpu"`
	// PodVastaitechGpu holds one entry per Vastaitech GPU metric set for the pod.
	PodVastaitechGpu []*PodVastaitechGpuMetrics `json:"pod_vastaitech_gpu"`
	Containers       []*ContainerMetrics        `json:"containers"`
}

type PodMetricMeta struct {
Expand All @@ -110,6 +119,43 @@ func (m PodMetricMeta) GetTag() map[string]string {
return nil
}

// PodVastaitechGpuMetrics carries the Vastaitech GPU usage attributed to one
// pod. PciAddr and DevId are emitted both as tags (GetTag) and as fields
// (ToMap); the numeric members are emitted as fields only.
type PodVastaitechGpuMetrics struct {
// PodMetricMeta is embedded; its GetTag is shadowed by the GetTag defined
// on this type.
PodMetricMeta

// PciAddr is the PCI address string reported for the device.
PciAddr string
// DevId is the device identifier string reported for the device.
DevId string

// Mem is the GPU memory in use, in MB.
Mem float64 // MB
// MemUtil, Gfx, DecUtil and EncUtil are utilisation figures for memory,
// graphics, decoder and encoder respectively.
// NOTE(review): presumably percentages — confirm against the collector.
MemUtil float64
Gfx float64
DecUtil float64
EncUtil float64
}

// GetName returns the fixed metric name used for Vastaitech GPU pod metrics.
func (m PodVastaitechGpuMetrics) GetName() string {
	const name = "pod_vastaitech_gpu"
	return name
}

// GetTag returns the tag set identifying the device this metric belongs to:
// its PCI address under "pci_addr" and its device id under "dev_id".
// A fresh map is built on every call.
func (m PodVastaitechGpuMetrics) GetTag() map[string]string {
	tags := make(map[string]string, 2)
	tags["pci_addr"] = m.PciAddr
	tags["dev_id"] = m.DevId
	return tags
}

// ToMap flattens the metric into a key/value map keyed by the
// VASTAITECH_GPU_* constants. Both the identifying strings (device id, PCI
// address) and the numeric readings are included. A fresh map is built on
// every call.
func (m PodVastaitechGpuMetrics) ToMap() map[string]interface{} {
	fields := make(map[string]interface{}, 7)

	// Device identity.
	fields[VASTAITECH_GPU_DEV_ID] = m.DevId
	fields[VASTAITECH_GPU_PCI_ADDR] = m.PciAddr

	// Numeric readings.
	fields[VASTAITECH_GPU_MEM] = m.Mem
	fields[VASTAITECH_GPU_MEM_UTIL] = m.MemUtil
	fields[VASTAITECH_GPU_GFX] = m.Gfx
	fields[VASTAITECH_GPU_DEC] = m.DecUtil
	fields[VASTAITECH_GPU_ENC] = m.EncUtil

	return fields
}

type PodNvidiaGpuMetrics struct {
PodMetricMeta

Expand Down Expand Up @@ -443,15 +489,15 @@ func (m *ContainerDiskIoMetric) GetTag() map[string]string {
return baseTags
}

func GetPodStatsById(ss []stats.PodStats, nvPodProcs map[string]map[string]struct{}, podId string) (*stats.PodStats, map[string]struct{}) {
func GetPodStatsById(ss []stats.PodStats, gpuPodProcs map[string]map[string]struct{}, podId string) (*stats.PodStats, map[string]struct{}) {
var podStat *stats.PodStats
for i := range ss {
if ss[i].PodRef.UID == podId {
podStat = &ss[i]
break
}
}
podProcs, _ := nvPodProcs[podId]
podProcs, _ := gpuPodProcs[podId]
return podStat, podProcs
}

Expand All @@ -466,6 +512,17 @@ func GetPodNvidiaGpuMetrics(metrics []NvidiaGpuProcessMetrics, podProcs map[stri
return podMetrics
}

// GetPodVastaitechGpuMetrics keeps only the per-process metrics whose Pid is a
// key of podProcs, preserving their order in metrics. The result is always a
// non-nil slice, empty when nothing matches.
func GetPodVastaitechGpuMetrics(metrics []VastaitechGpuProcessMetrics, podProcs map[string]struct{}) []VastaitechGpuProcessMetrics {
	matched := []VastaitechGpuProcessMetrics{}
	for _, metric := range metrics {
		if _, found := podProcs[metric.Pid]; !found {
			continue
		}
		matched = append(matched, metric)
	}
	return matched
}

func (s *SGuestMonitorCollector) collectPodMetrics(gm *SGuestMonitor, prevUsage *GuestMetrics) *GuestMetrics {
gmData := new(GuestMetrics)
s.hostInfo.GetContainerStatsProvider()
Expand Down Expand Up @@ -639,12 +696,13 @@ func (m *SGuestMonitor) PodMetrics(prevUsage *GuestMetrics) *PodMetrics {
}

pm := &PodMetrics{
PodCpu: podCpu,
PodMemory: podMemory,
PodProcess: podProcess,
PodVolumes: m.getVolumeMetrics(),
PodNvidiaGpu: m.getPodNvidiaGpuMetrics(),
Containers: containers,
PodCpu: podCpu,
PodMemory: podMemory,
PodProcess: podProcess,
PodVolumes: m.getVolumeMetrics(),
PodNvidiaGpu: m.getPodNvidiaGpuMetrics(),
PodVastaitechGpu: m.getPodVastaitechGpuMetrics(),
Containers: containers,
}

if stat.DiskIo != nil {
Expand All @@ -663,6 +721,30 @@ func (m *SGuestMonitor) PodMetrics(prevUsage *GuestMetrics) *PodMetrics {
return pm
}

// getPodVastaitechGpuMetrics folds the per-process Vastaitech GPU metrics held
// in m.vastaitechGpuMetrics into one PodVastaitechGpuMetrics per PCI address,
// summing memory, memory usage, graphics, decoder and encoder figures of all
// processes that share a device.
//
// It returns nil when there are no per-process metrics. Otherwise the result
// has one entry per distinct PCI address, in the order each address is first
// seen, so the output is stable across calls for the same input.
func (m *SGuestMonitor) getPodVastaitechGpuMetrics() []*PodVastaitechGpuMetrics {
	if len(m.vastaitechGpuMetrics) == 0 {
		return nil
	}
	addrGpuMap := map[string]*PodVastaitechGpuMetrics{}
	res := make([]*PodVastaitechGpuMetrics, 0)
	for i := range m.vastaitechGpuMetrics {
		proc := m.vastaitechGpuMetrics[i]
		gms, ok := addrGpuMap[proc.PciAddr]
		if !ok {
			gms = new(PodVastaitechGpuMetrics)
			// Record which device the aggregate belongs to; GetTag and ToMap
			// both emit PciAddr.
			// NOTE(review): DevId is not populated here — fill it in if the
			// per-process metric carries a device id.
			gms.PciAddr = proc.PciAddr
			// The aggregate must be registered, otherwise every process
			// allocates a throw-away value and the result stays empty.
			addrGpuMap[proc.PciAddr] = gms
			res = append(res, gms)
		}
		gms.Mem += proc.GfxMem
		gms.MemUtil += proc.GfxMemUsage
		gms.Gfx += proc.Gfx
		gms.DecUtil += proc.Dec
		gms.EncUtil += proc.Enc
	}
	return res
}

func (m *SGuestMonitor) getPodNvidiaGpuMetrics() []*PodNvidiaGpuMetrics {
if len(m.nvidiaGpuMetrics) == 0 {
return nil
Expand Down Expand Up @@ -729,6 +811,10 @@ func (d *GuestMetrics) toPodTelegrafData(tagStr string) []string {
for i := range m.PodNvidiaGpu {
ims = append(ims, m.PodNvidiaGpu[i])
}
for i := range m.PodVastaitechGpu {
ims = append(ims, m.PodVastaitechGpu[i])
}

for _, c := range m.Containers {
ims = append(ims, c.ContainerCpu)
ims = append(ims, c.ContainerMemory)
Expand Down
21 changes: 17 additions & 4 deletions pkg/hostman/hostmetrics/container_nvidia_gpu_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ import (

"yunion.io/x/log"
"yunion.io/x/pkg/errors"
"yunion.io/x/pkg/utils"

"yunion.io/x/onecloud/pkg/apis/compute"
"yunion.io/x/onecloud/pkg/hostman/guestman"
"yunion.io/x/onecloud/pkg/util/cgrouputils"
"yunion.io/x/onecloud/pkg/util/procutils"
Expand All @@ -48,16 +50,16 @@ func GetNvidiaGpuProcessMetrics() ([]NvidiaGpuProcessMetrics, error) {
cmd := "nvidia-smi pmon -s mu -c 1"
output, err := procutils.NewRemoteCommandAsFarAsPossible("bash", "-c", cmd).Output()
if err != nil {
return nil, errors.Wrapf(err, "Execute %s failed", cmd)
return nil, errors.Wrapf(err, "Execute %s failed: %s", cmd, output)
}
return parseGpuProcessMetrics(string(output)), nil
return parseNvidiaGpuProcessMetrics(string(output)), nil
}

/*
# gpu pid type fb ccpm sm mem enc dec jpg ofa command
# Idx # C/G MB MB % % % % % % name
*/
func parseGpuProcessMetrics(gpuMetricsStr string) []NvidiaGpuProcessMetrics {
func parseNvidiaGpuProcessMetrics(gpuMetricsStr string) []NvidiaGpuProcessMetrics {
gpuProcessMetrics := make([]NvidiaGpuProcessMetrics, 0)

lines := strings.Split(gpuMetricsStr, "\n")
Expand Down Expand Up @@ -142,7 +144,7 @@ func parseGpuProcessMetrics(gpuMetricsStr string) []NvidiaGpuProcessMetrics {
return gpuProcessMetrics
}

func (s *SGuestMonitorCollector) collectNvidiaGpuPodsProcesses() map[string]map[string]struct{} {
func (s *SGuestMonitorCollector) collectGpuPodsProcesses() map[string]map[string]struct{} {
podProcIds := map[string]map[string]struct{}{}
guestmanager := guestman.GetGuestManager()
cgroupRoot := path.Join(cgrouputils.RootTaskPath("cpuset"), "cloudpods")
Expand All @@ -154,6 +156,17 @@ func (s *SGuestMonitorCollector) collectNvidiaGpuPodsProcesses() map[string]map[
if !pod.IsRunning() {
return true
}
podDesc := pod.GetDesc()
hasGpu := false
for i := range podDesc.IsolatedDevices {
if utils.IsInStringArray(podDesc.IsolatedDevices[i].DevType, []string{compute.CONTAINER_DEV_NVIDIA_GPU, compute.CONTAINER_DEV_NVIDIA_MPS, compute.CONTAINER_DEV_VASTAITECH_GPU}) {
hasGpu = true
break
}
}
if !hasGpu {
return true
}

criIds := pod.GetPodContainerCriIds()
procs := map[string]struct{}{}
Expand Down
Loading

0 comments on commit 608e869

Please sign in to comment.