From f74738048a2db2f00a79a7f2e8518e4605f0d1f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geyslan=20Greg=C3=B3rio?= Date: Fri, 27 Sep 2024 16:17:00 -0300 Subject: [PATCH] chore: add BPFPerfEventWrites metric (DEBUG) Enabled only when built with DEBUG=1. BPFPerfEventWrites counts the number of events processed by the eBPF programs and written to the perf event buffer. It is incremented right before the event is written to the perf buffer, making it possible to measure even if the event is lost. This metric can be used to monitor the performance of individual eBPF events and to detect potential bottlenecks. --- Makefile | 4 ++ pkg/ebpf/c/common/buffer.h | 16 ++++++++ pkg/ebpf/perf_count.go | 77 +++++++++++++++++++++++++++++++++++ pkg/ebpf/tracee.go | 7 ++++ pkg/metrics/stats.go | 59 ++++++++++++++++----------- pkg/server/grpc/diagnostic.go | 19 +++++---- pkg/version/version.go | 9 +++- 7 files changed, 157 insertions(+), 34 deletions(-) create mode 100644 pkg/ebpf/perf_count.go diff --git a/Makefile b/Makefile index 66c47ba22ca4..87cad2f1f8a0 100644 --- a/Makefile +++ b/Makefile @@ -162,8 +162,10 @@ UNAME_M := $(shell uname -m) UNAME_R := $(shell uname -r) ifeq ($(DEBUG),1) + BPF_DEBUG_FLAG = -DDEBUG GO_DEBUG_FLAG = else + BPF_DEBUG_FLAG = GO_DEBUG_FLAG = -w endif @@ -423,6 +425,7 @@ $(OUTPUT_DIR)/tracee.bpf.o: \ $(TRACEE_EBPF_OBJ_HEADERS) # $(CMD_CLANG) \ + $(BPF_DEBUG_FLAG) \ -D__TARGET_ARCH_$(LINUX_ARCH) \ -D__BPF_TRACING__ \ -DCORE \ @@ -501,6 +504,7 @@ $(OUTPUT_DIR)/tracee: \ -ldflags="$(GO_DEBUG_FLAG) \ -extldflags \"$(CGO_EXT_LDFLAGS_EBPF)\" \ -X github.com/aquasecurity/tracee/pkg/version.version=$(VERSION) \ + -X github.com/aquasecurity/tracee/pkg/version.debug=$(DEBUG) \ " \ -v -o $@ \ ./cmd/tracee diff --git a/pkg/ebpf/c/common/buffer.h b/pkg/ebpf/c/common/buffer.h index 742a277082e5..9bcef571e9a3 100644 --- a/pkg/ebpf/c/common/buffer.h +++ b/pkg/ebpf/c/common/buffer.h @@ -458,6 +458,15 @@ statfunc int save_args_to_submit_buf(event_data_t *event, args_t 
*args) return arg_num; } +#ifdef DEBUG +struct event_counts { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_EVENT_ID); + __type(key, u32); // eventid + __type(value, u64); // count +} event_counts SEC(".maps"); +#endif + statfunc int events_perf_submit(program_data_t *p, long ret) { p->event->context.retval = ret; @@ -484,6 +493,13 @@ statfunc int events_perf_submit(program_data_t *p, long ret) : : [size] "r"(size), [max_size] "i"(MAX_EVENT_SIZE)); +#ifdef DEBUG + // increment event count before event submission attempt + u64 *event_count = bpf_map_lookup_elem(&event_counts, &p->event->context.eventid); + if (event_count) + __sync_fetch_and_add(event_count, 1); +#endif + return bpf_perf_event_output(p->ctx, &events, BPF_F_CURRENT_CPU, p->event, size); } diff --git a/pkg/ebpf/perf_count.go b/pkg/ebpf/perf_count.go new file mode 100644 index 000000000000..8cc9f1326e79 --- /dev/null +++ b/pkg/ebpf/perf_count.go @@ -0,0 +1,77 @@ +package ebpf + +import ( + "context" + "encoding/binary" + "time" + "unsafe" + + "github.com/aquasecurity/tracee/pkg/counter" + "github.com/aquasecurity/tracee/pkg/events" + "github.com/aquasecurity/tracee/pkg/logger" +) + +// countPerfEventWrites counts the number of times each event is attempted +// to be written to the perf buffer. 
+func (t *Tracee) countPerfEventWrites(ctx context.Context) { + logger.Debugw("Starting countPerfEventWrites goroutine") + defer logger.Debugw("Stopped countPerfEventWrites goroutine") + + evtsCountsBPFMap, err := t.bpfModule.GetMap("event_counts") + if err != nil { + logger.Errorw("Failed to get event_counts map", "error", err) + return + } + + for _, id := range t.policyManager.EventsSelected() { + key := uint32(id) + value := uint64(0) + err := evtsCountsBPFMap.Update(unsafe.Pointer(&key), unsafe.Pointer(&value)) + if err != nil { + logger.Errorw("Failed to update event_counts map", "error", err) + } + } + + total := counter.NewCounter(0) + evtsCounts := make(map[uint32]uint64) + ticker := time.NewTicker(10 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + iter := evtsCountsBPFMap.Iterator() + for iter.Next() { + key := binary.LittleEndian.Uint32(iter.Key()) + value, err := evtsCountsBPFMap.GetValue(unsafe.Pointer(&key)) + if err != nil { + logger.Errorw("Failed to get value from event_counts map", "error", err) + continue + } + + evtsCounts[key] = binary.LittleEndian.Uint64(value) + } + + total.Set(0) + for k, v := range evtsCounts { + if v == 0 { + continue + } + err := total.Increment(v) + if err != nil { + logger.Errorw("Failed to increment total counter", "error", err) + } + + logger.Debugw("Event sending attempts", + "event", events.Core.GetDefinitionByID(events.ID(k)).GetName(), + "count", v, + ) + } + + logger.Debugw("Event sending attempts", "total", total.Get()) + t.stats.BPFPerfEventWrites.Set(total.Get()) + } + } +} diff --git a/pkg/ebpf/tracee.go b/pkg/ebpf/tracee.go index 0945566e034e..f5f1a215eec3 100644 --- a/pkg/ebpf/tracee.go +++ b/pkg/ebpf/tracee.go @@ -47,6 +47,7 @@ import ( "github.com/aquasecurity/tracee/pkg/utils/environment" "github.com/aquasecurity/tracee/pkg/utils/proc" "github.com/aquasecurity/tracee/pkg/utils/sharedobjs" + "github.com/aquasecurity/tracee/pkg/version" 
"github.com/aquasecurity/tracee/types/trace" ) @@ -1370,6 +1371,12 @@ func (t *Tracee) Run(ctx gocontext.Context) error { t.controlPlane.Start() go t.controlPlane.Run(ctx) + // Measure event perf buffer write attempts (debug build only) + + if version.DebugBuild() { + go t.countPerfEventWrites(ctx) + } + // Main event loop (polling events perf buffer) t.eventsPerfMap.Poll(pollTimeout) diff --git a/pkg/metrics/stats.go b/pkg/metrics/stats.go index 08eccfc5e8eb..45f86cd09ef6 100644 --- a/pkg/metrics/stats.go +++ b/pkg/metrics/stats.go @@ -9,15 +9,16 @@ import ( // When updating this struct, please make sure to update the relevant exporting functions type Stats struct { - EventCount counter.Counter - EventsFiltered counter.Counter - NetCapCount counter.Counter // network capture events - BPFLogsCount counter.Counter - ErrorCount counter.Counter - LostEvCount counter.Counter - LostWrCount counter.Counter - LostNtCapCount counter.Counter // lost network capture events - LostBPFLogsCount counter.Counter + EventCount counter.Counter + EventsFiltered counter.Counter + NetCapCount counter.Counter // network capture events + BPFLogsCount counter.Counter + BPFPerfEventWrites counter.Counter // calls to write to the event perf buffer + ErrorCount counter.Counter + LostEvCount counter.Counter + LostWrCount counter.Counter + LostNtCapCount counter.Counter // lost network capture events + LostBPFLogsCount counter.Counter } // Register Stats to prometheus metrics exporter @@ -54,9 +55,9 @@ func (stats *Stats) RegisterPrometheus() error { err = prometheus.Register(prometheus.NewCounterFunc(prometheus.CounterOpts{ Namespace: "tracee_ebpf", - Name: "lostevents_total", - Help: "events lost in the submission buffer", - }, func() float64 { return float64(stats.LostEvCount.Get()) })) + Name: "bpf_logs_total", + Help: "logs collected by tracee-ebpf during ebpf execution", + }, func() float64 { return float64(stats.BPFLogsCount.Get()) })) if err != nil { return errfmt.WrapError(err) @@ 
-64,9 +65,9 @@ func (stats *Stats) RegisterPrometheus() error { err = prometheus.Register(prometheus.NewCounterFunc(prometheus.CounterOpts{ Namespace: "tracee_ebpf", - Name: "write_lostevents_total", - Help: "events lost in the write buffer", - }, func() float64 { return float64(stats.LostWrCount.Get()) })) + Name: "bpf_perf_event_writes_total", + Help: "total number of calls to write to the event perf buffer", + }, func() float64 { return float64(stats.BPFPerfEventWrites.Get()) })) if err != nil { return errfmt.WrapError(err) @@ -74,9 +75,9 @@ func (stats *Stats) RegisterPrometheus() error { err = prometheus.Register(prometheus.NewCounterFunc(prometheus.CounterOpts{ Namespace: "tracee_ebpf", - Name: "network_capture_lostevents_total", - Help: "network capture lost events in network capture buffer", - }, func() float64 { return float64(stats.LostNtCapCount.Get()) })) + Name: "errors_total", + Help: "errors accumulated by tracee-ebpf", + }, func() float64 { return float64(stats.ErrorCount.Get()) })) if err != nil { return errfmt.WrapError(err) @@ -84,9 +85,9 @@ func (stats *Stats) RegisterPrometheus() error { err = prometheus.Register(prometheus.NewCounterFunc(prometheus.CounterOpts{ Namespace: "tracee_ebpf", - Name: "bpf_logs_total", - Help: "logs collected by tracee-ebpf during ebpf execution", - }, func() float64 { return float64(stats.BPFLogsCount.Get()) })) + Name: "lostevents_total", + Help: "events lost in the submission buffer", + }, func() float64 { return float64(stats.LostEvCount.Get()) })) if err != nil { return errfmt.WrapError(err) @@ -94,9 +95,19 @@ func (stats *Stats) RegisterPrometheus() error { err = prometheus.Register(prometheus.NewCounterFunc(prometheus.CounterOpts{ Namespace: "tracee_ebpf", - Name: "errors_total", - Help: "errors accumulated by tracee-ebpf", - }, func() float64 { return float64(stats.ErrorCount.Get()) })) + Name: "write_lostevents_total", + Help: "events lost in the write buffer", + }, func() float64 { return 
float64(stats.LostWrCount.Get()) })) + + if err != nil { + return errfmt.WrapError(err) + } + + err = prometheus.Register(prometheus.NewCounterFunc(prometheus.CounterOpts{ + Namespace: "tracee_ebpf", + Name: "network_capture_lostevents_total", + Help: "network capture lost events in network capture buffer", + }, func() float64 { return float64(stats.LostNtCapCount.Get()) })) return errfmt.WrapError(err) } diff --git a/pkg/server/grpc/diagnostic.go b/pkg/server/grpc/diagnostic.go index dbd01bd77b5e..4f703cb81bca 100644 --- a/pkg/server/grpc/diagnostic.go +++ b/pkg/server/grpc/diagnostic.go @@ -17,15 +17,16 @@ type DiagnosticService struct { func (s *DiagnosticService) GetMetrics(ctx context.Context, in *pb.GetMetricsRequest) (*pb.GetMetricsResponse, error) { stats := s.tracee.Stats() metrics := &pb.GetMetricsResponse{ - EventCount: stats.EventCount.Get(), - EventsFiltered: stats.EventsFiltered.Get(), - NetCapCount: stats.NetCapCount.Get(), - BPFLogsCount: stats.BPFLogsCount.Get(), - ErrorCount: stats.ErrorCount.Get(), - LostEvCount: stats.LostEvCount.Get(), - LostWrCount: stats.LostWrCount.Get(), - LostNtCapCount: stats.LostNtCapCount.Get(), - LostBPFLogsCount: stats.LostBPFLogsCount.Get(), + EventCount: stats.EventCount.Get(), + EventsFiltered: stats.EventsFiltered.Get(), + NetCapCount: stats.NetCapCount.Get(), + BPFLogsCount: stats.BPFLogsCount.Get(), + BPFPerfEventWrites: stats.BPFPerfEventWrites.Get(), // only available in debug build + ErrorCount: stats.ErrorCount.Get(), + LostEvCount: stats.LostEvCount.Get(), + LostWrCount: stats.LostWrCount.Get(), + LostNtCapCount: stats.LostNtCapCount.Get(), + LostBPFLogsCount: stats.LostBPFLogsCount.Get(), } return metrics, nil diff --git a/pkg/version/version.go b/pkg/version/version.go index dd72e954731b..323f1562ace0 100644 --- a/pkg/version/version.go +++ b/pkg/version/version.go @@ -1,7 +1,14 @@ package version -var version string +var ( + version string + debug string +) func GetVersion() string { return version } + 
+func DebugBuild() bool { + return debug == "1" +}