Skip to content

Commit

Permalink
topology-aware: implement metrics collection.
Browse files Browse the repository at this point in the history
Implement collection of per zone prometheus metrics.
Currently we collect for each pool/zone the following
  - name, cpuset and memset
  - shared pool capacity, allocation, available amount
  - memory capacity, allocation, available amount
  - number of containers
  - number of containers in the shared pool

Signed-off-by: Krisztian Litkey <[email protected]>
  • Loading branch information
klihub authored and askervin committed Nov 20, 2024
1 parent ca3ba3d commit 389d012
Show file tree
Hide file tree
Showing 3 changed files with 354 additions and 19 deletions.
324 changes: 324 additions & 0 deletions cmd/plugins/topology-aware/policy/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,324 @@
// Copyright The NRI Plugins Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package topologyaware

import (
"slices"
"strings"

libmem "github.com/containers/nri-plugins/pkg/resmgr/lib/memory"
policyapi "github.com/containers/nri-plugins/pkg/resmgr/policy"
"github.com/containers/nri-plugins/pkg/utils/cpuset"
"github.com/prometheus/client_golang/prometheus"
)

type TopologyAwareMetrics struct {
p *policy
ZoneNames []string
Zones map[string]*Zone
Metrics Metrics
}

type Zone struct {
Name string
Cpus cpuset.CPUSet
Mems libmem.NodeMask
SharedPool cpuset.CPUSet
SharedAssigned int
SharedAvailable int
MemCapacity int64
MemAssigned int64
MemAvailable int64
ContainerCount int
SharedContainerCount int
}

type Metrics struct {
zone *prometheus.GaugeVec
cpuSharedCapacity *prometheus.GaugeVec
cpuSharedAssigned *prometheus.GaugeVec
cpuSharedAvailable *prometheus.GaugeVec
memCapacity *prometheus.GaugeVec
memAssigned *prometheus.GaugeVec
memAvailable *prometheus.GaugeVec
containerCount *prometheus.GaugeVec
sharedContainerCount *prometheus.GaugeVec
}

const (
metricsSubsystem = "topologyaware"
)

func (p *policy) GetMetrics() policyapi.Metrics {
return p.metrics
}

func (p *policy) NewTopologyAwareMetrics() *TopologyAwareMetrics {
m := &TopologyAwareMetrics{
p: p,
Zones: make(map[string]*Zone),
Metrics: Metrics{
zone: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: metricsSubsystem,
Name: "zone_cpu_capacity",
Help: "A topology zone of CPUs.",
},
[]string{
"zone",
"cpus",
"mems",
},
),
cpuSharedCapacity: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: metricsSubsystem,
Name: "zone_cpu_shared_capacity",
Help: "Capacity of shared CPU pool of a topology zone.",
},
[]string{
"zone",
"cpus",
},
),
cpuSharedAssigned: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: metricsSubsystem,
Name: "zone_cpu_shared_assigned",
Help: "Assigned amount of shared CPU pool of a topology zone.",
},
[]string{
"zone",
"cpus",
},
),
cpuSharedAvailable: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: metricsSubsystem,
Name: "zone_cpu_shared_available",
Help: "Available amount of shared CPU pool of a topology zone.",
},
[]string{
"zone",
"cpus",
},
),
memCapacity: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: metricsSubsystem,
Name: "zone_mem_capacity",
Help: "Memory capacity of a topology zone.",
},
[]string{
"zone",
"mems",
},
),
memAssigned: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: metricsSubsystem,
Name: "zone_mem_assigned",
Help: "Amount of assigned memory of a topology zone.",
},
[]string{
"zone",
"mems",
},
),
memAvailable: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: metricsSubsystem,
Name: "zone_mem_available",
Help: "Amount of available memory of a topology zone.",
},
[]string{
"zone",
"mems",
},
),
containerCount: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: metricsSubsystem,
Name: "zone_container_count",
Help: "Number of containers assigned to a topology zone.",
},
[]string{
"zone",
},
),
sharedContainerCount: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: metricsSubsystem,
Name: "zone_shared_container_count",
Help: "Number of containers in the shared CPU pool of a topology zone.",
},
[]string{
"zone",
},
),
},
}

for _, pool := range p.pools {
var (
name = pool.Name()
mems = libmem.NewNodeMask(pool.GetMemset(memoryAll).Members()...)
capa = pool.GetSupply().(*supply)
cpus = capa.ReservedCPUs().Union(capa.IsolatedCPUs()).Union(capa.SharableCPUs())
zone = &Zone{
Name: name,
Cpus: cpus,
Mems: mems,
MemCapacity: p.memAllocator.ZoneCapacity(mems),
}
)

m.Zones[name] = zone
m.ZoneNames = append(m.ZoneNames, name)

m.Metrics.zone.WithLabelValues(
zone.Name,
zone.Cpus.String(),
zone.Mems.String(),
).Set(float64(zone.Cpus.Size()))

m.Metrics.memCapacity.WithLabelValues(
zone.Name,
zone.Mems.String(),
).Set(float64(zone.MemCapacity))
}

slices.SortFunc(m.ZoneNames, func(a, b string) int {
poolA, poolB := p.nodes[a], p.nodes[b]
if diff := poolA.RootDistance() - poolB.RootDistance(); diff != 0 {
return diff
}
return strings.Compare(a, b)
})

m.Update()

return m
}

func (m *TopologyAwareMetrics) Describe(ch chan<- *prometheus.Desc) {
if m == nil {
return
}

m.Metrics.zone.Describe(ch)
m.Metrics.cpuSharedCapacity.Describe(ch)
m.Metrics.cpuSharedAssigned.Describe(ch)
m.Metrics.cpuSharedAvailable.Describe(ch)
m.Metrics.memCapacity.Describe(ch)
m.Metrics.memAssigned.Describe(ch)
m.Metrics.memAvailable.Describe(ch)
m.Metrics.containerCount.Describe(ch)
m.Metrics.sharedContainerCount.Describe(ch)
}

func (m *TopologyAwareMetrics) Collect(ch chan<- prometheus.Metric) {
if m == nil {
return
}

m.Update()

m.Metrics.zone.Collect(ch)
m.Metrics.cpuSharedCapacity.Collect(ch)
m.Metrics.cpuSharedAssigned.Collect(ch)
m.Metrics.cpuSharedAvailable.Collect(ch)
m.Metrics.memCapacity.Collect(ch)
m.Metrics.memAssigned.Collect(ch)
m.Metrics.memAvailable.Collect(ch)
m.Metrics.containerCount.Collect(ch)
m.Metrics.sharedContainerCount.Collect(ch)
}

// Update updates our metrics.
func (m *TopologyAwareMetrics) Update() {
if m == nil {
return
}

p := m.p
for _, pool := range p.pools {
log.Debug("updating metrics for pool %s...", pool.Name())

var (
zone = m.Zones[pool.Name()]
free = pool.FreeSupply().(*supply)
mems = libmem.NewNodeMask(pool.GetMemset(memoryAll).Members()...)
sharedPool = free.SharableCPUs().Union(free.ReservedCPUs())
containers = 0
sharedctrs = 0
)

if zone == nil {
log.Error("metrics zone not found for pool %s", pool.Name())
continue
}

for _, g := range p.allocations.grants {
if g.GetCPUNode().Name() == pool.Name() {
containers++
if g.ReservedPortion() != 0 || g.CPUPortion() != 0 {
sharedctrs++
}
}
}

zone.SharedPool = sharedPool
zone.SharedAssigned = free.GrantedReserved() + free.GrantedShared()
zone.SharedAvailable = free.AllocatableSharedCPU()
zone.MemAssigned = p.memAllocator.ZoneUsage(mems)
zone.MemAvailable = p.memAllocator.ZoneAvailable(mems)
zone.ContainerCount = containers
zone.SharedContainerCount = sharedctrs

m.Metrics.cpuSharedCapacity.WithLabelValues(
zone.Name,
zone.SharedPool.String(),
).Set(float64(zone.SharedPool.Size()))

m.Metrics.cpuSharedAssigned.WithLabelValues(
zone.Name,
zone.SharedPool.String(),
).Set(float64(zone.SharedAssigned) / 1000.0)

m.Metrics.cpuSharedAvailable.WithLabelValues(
zone.Name,
zone.SharedPool.String(),
).Set(float64(zone.SharedAvailable) / 1000.0)

m.Metrics.memAssigned.WithLabelValues(
zone.Name,
zone.Mems.MemsetString(),
).Set(float64(zone.MemAssigned))

m.Metrics.memAvailable.WithLabelValues(
zone.Name,
zone.Mems.MemsetString(),
).Set(float64(zone.MemAvailable))

m.Metrics.containerCount.WithLabelValues(
zone.Name,
).Set(float64(zone.ContainerCount))

m.Metrics.sharedContainerCount.WithLabelValues(
zone.Name,
).Set(float64(zone.SharedContainerCount))
}
}
19 changes: 0 additions & 19 deletions cmd/plugins/topology-aware/policy/topology-aware-policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ import (
"fmt"

"github.com/containers/nri-plugins/pkg/utils/cpuset"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/apimachinery/pkg/api/resource"

cfgapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy/topologyaware"
Expand Down Expand Up @@ -422,24 +421,6 @@ func (p *policy) ExportResourceData(c cache.Container) map[string]string {
return data
}

func (p *policy) GetMetrics() policyapi.Metrics {
return p.metrics
}

func (p *policy) NewTopologyAwareMetrics() *TopologyAwareMetrics {
return &TopologyAwareMetrics{}
}

type TopologyAwareMetrics struct{}

func (*TopologyAwareMetrics) Describe(ch chan<- *prometheus.Desc) {
return
}

func (*TopologyAwareMetrics) Collect(ch chan<- prometheus.Metric) {
return
}

// reallocateResources reallocates the given containers using the given pool hints
func (p *policy) reallocateResources(containers []cache.Container, pools map[string]string) error {
errs := []error{}
Expand Down
30 changes: 30 additions & 0 deletions docs/resource-policy/policy/topology-aware.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,16 @@ behavior. These options can be supplied as part of the effective
CPU allocations. For a more detailed discussion of CPU prioritization see
the [cpu allocator](../developers-guide/cpu-allocator.md) documentation.

Additionally, the following sub-configuration is available for instrumentation:

- `instrumentation`: configures runtime instrumentation.
- `httpEndpoint`: the address the HTTP server listens on. Example:
`:8891`.
- `prometheusExport`: if set to True, metrics about system and topology zone
resource assignment are readable through `/metrics` from the configured
`httpEndpoint`.
- `reportPeriod`: `/metrics` aggregation interval for polled metrics.

## Policy CPU Allocation Preferences

There are a number of workload properties this policy actively checks to decide
Expand Down Expand Up @@ -760,3 +770,23 @@ metadata:

<!-- Links -->
[configuration]: ../configuration.md

## Metrics and Debugging

In order to enable more verbose logging and metrics exporting from the
topology-aware policy, enable instrumentation and policy debugging from
the nri-resource-policy global config:

```yaml
instrumentation:
# The topology-aware policy can exports various system and topology
# zone utilisation metrics. Accessible in command line with
# curl --silent http://$localhost_or_pod_IP:8891/metrics
HTTPEndpoint: :8891
PrometheusExport: true
metrics:
enabled: # use '*' instead for all available metrics
- policy
logger:
Debug: policy
```

0 comments on commit 389d012

Please sign in to comment.