diff --git a/Makefile b/Makefile index 66c47ba22ca4..87cad2f1f8a0 100644 --- a/Makefile +++ b/Makefile @@ -162,8 +162,10 @@ UNAME_M := $(shell uname -m) UNAME_R := $(shell uname -r) ifeq ($(DEBUG),1) + BPF_DEBUG_FLAG = -DDEBUG GO_DEBUG_FLAG = else + BPF_DEBUG_FLAG = GO_DEBUG_FLAG = -w endif @@ -423,6 +425,7 @@ $(OUTPUT_DIR)/tracee.bpf.o: \ $(TRACEE_EBPF_OBJ_HEADERS) # $(CMD_CLANG) \ + $(BPF_DEBUG_FLAG) \ -D__TARGET_ARCH_$(LINUX_ARCH) \ -D__BPF_TRACING__ \ -DCORE \ @@ -501,6 +504,7 @@ $(OUTPUT_DIR)/tracee: \ -ldflags="$(GO_DEBUG_FLAG) \ -extldflags \"$(CGO_EXT_LDFLAGS_EBPF)\" \ -X github.com/aquasecurity/tracee/pkg/version.version=$(VERSION) \ + -X github.com/aquasecurity/tracee/pkg/version.debug=$(DEBUG) \ " \ -v -o $@ \ ./cmd/tracee diff --git a/pkg/ebpf/c/common/buffer.h b/pkg/ebpf/c/common/buffer.h index 742a277082e5..9bcef571e9a3 100644 --- a/pkg/ebpf/c/common/buffer.h +++ b/pkg/ebpf/c/common/buffer.h @@ -458,6 +458,15 @@ statfunc int save_args_to_submit_buf(event_data_t *event, args_t *args) return arg_num; } +#ifdef DEBUG +struct event_counts { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_EVENT_ID); + __type(key, u32); // eventid + __type(value, u64); // count +} event_counts SEC(".maps"); +#endif + statfunc int events_perf_submit(program_data_t *p, long ret) { p->event->context.retval = ret; @@ -484,6 +493,13 @@ statfunc int events_perf_submit(program_data_t *p, long ret) : : [size] "r"(size), [max_size] "i"(MAX_EVENT_SIZE)); +#ifdef DEBUG + // increment event count before event submission attempt + u64 *event_count = bpf_map_lookup_elem(&event_counts, &p->event->context.eventid); + if (event_count) + __sync_fetch_and_add(event_count, 1); +#endif + return bpf_perf_event_output(p->ctx, &events, BPF_F_CURRENT_CPU, p->event, size); } diff --git a/pkg/ebpf/perf_count.go b/pkg/ebpf/perf_count.go new file mode 100644 index 000000000000..8cc9f1326e79 --- /dev/null +++ b/pkg/ebpf/perf_count.go @@ -0,0 +1,77 @@ +package ebpf + +import ( + "context" + 
"encoding/binary" + "time" + "unsafe" + + "github.com/aquasecurity/tracee/pkg/counter" + "github.com/aquasecurity/tracee/pkg/events" + "github.com/aquasecurity/tracee/pkg/logger" +) + +// countPerfEventWrites counts the number of times each event is attempted +// to be written to the perf buffer. +func (t *Tracee) countPerfEventWrites(ctx context.Context) { + logger.Debugw("Starting countPerfEventWrites goroutine") + defer logger.Debugw("Stopped countPerfEventWrites goroutine") + + evtsCountsBPFMap, err := t.bpfModule.GetMap("event_counts") + if err != nil { + logger.Errorw("Failed to get event_counts map", "error", err) + return + } + + for _, id := range t.policyManager.EventsSelected() { + key := uint32(id) + value := uint64(0) + err := evtsCountsBPFMap.Update(unsafe.Pointer(&key), unsafe.Pointer(&value)) + if err != nil { + logger.Errorw("Failed to update event_counts map", "error", err) + } + } + + total := counter.NewCounter(0) + evtsCounts := make(map[uint32]uint64) + ticker := time.NewTicker(10 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + iter := evtsCountsBPFMap.Iterator() + for iter.Next() { + key := binary.LittleEndian.Uint32(iter.Key()) + value, err := evtsCountsBPFMap.GetValue(unsafe.Pointer(&key)) + if err != nil { + logger.Errorw("Failed to get value from event_counts map", "error", err) + continue + } + + evtsCounts[key] = binary.LittleEndian.Uint64(value) + } + + total.Set(0) + for k, v := range evtsCounts { + if v == 0 { + continue + } + err := total.Increment(v) + if err != nil { + logger.Errorw("Failed to increment total counter", "error", err) + } + + logger.Debugw("Event sending attempts", + "event", events.Core.GetDefinitionByID(events.ID(k)).GetName(), + "count", v, + ) + } + + logger.Debugw("Event sending attempts", "total", total.Get()) + t.stats.BPFPerfEventWrites.Set(total.Get()) + } + } +} diff --git a/pkg/ebpf/tracee.go b/pkg/ebpf/tracee.go index 0945566e034e..f5f1a215eec3 
100644 --- a/pkg/ebpf/tracee.go +++ b/pkg/ebpf/tracee.go @@ -47,6 +47,7 @@ import ( "github.com/aquasecurity/tracee/pkg/utils/environment" "github.com/aquasecurity/tracee/pkg/utils/proc" "github.com/aquasecurity/tracee/pkg/utils/sharedobjs" + "github.com/aquasecurity/tracee/pkg/version" "github.com/aquasecurity/tracee/types/trace" ) @@ -1370,6 +1371,12 @@ func (t *Tracee) Run(ctx gocontext.Context) error { t.controlPlane.Start() go t.controlPlane.Run(ctx) + // Measure event perf buffer write attempts (debug build only) + + if version.DebugBuild() { + go t.countPerfEventWrites(ctx) + } + // Main event loop (polling events perf buffer) t.eventsPerfMap.Poll(pollTimeout) diff --git a/pkg/metrics/stats.go b/pkg/metrics/stats.go index 08eccfc5e8eb..45f86cd09ef6 100644 --- a/pkg/metrics/stats.go +++ b/pkg/metrics/stats.go @@ -9,15 +9,16 @@ import ( // When updating this struct, please make sure to update the relevant exporting functions type Stats struct { - EventCount counter.Counter - EventsFiltered counter.Counter - NetCapCount counter.Counter // network capture events - BPFLogsCount counter.Counter - ErrorCount counter.Counter - LostEvCount counter.Counter - LostWrCount counter.Counter - LostNtCapCount counter.Counter // lost network capture events - LostBPFLogsCount counter.Counter + EventCount counter.Counter + EventsFiltered counter.Counter + NetCapCount counter.Counter // network capture events + BPFLogsCount counter.Counter + BPFPerfEventWrites counter.Counter // calls to write to the event perf buffer + ErrorCount counter.Counter + LostEvCount counter.Counter + LostWrCount counter.Counter + LostNtCapCount counter.Counter // lost network capture events + LostBPFLogsCount counter.Counter } // Register Stats to prometheus metrics exporter @@ -54,9 +55,9 @@ func (stats *Stats) RegisterPrometheus() error { err = prometheus.Register(prometheus.NewCounterFunc(prometheus.CounterOpts{ Namespace: "tracee_ebpf", - Name: "lostevents_total", - Help: "events lost in the 
submission buffer", - }, func() float64 { return float64(stats.LostEvCount.Get()) })) + Name: "bpf_logs_total", + Help: "logs collected by tracee-ebpf during ebpf execution", + }, func() float64 { return float64(stats.BPFLogsCount.Get()) })) if err != nil { return errfmt.WrapError(err) @@ -64,9 +65,9 @@ func (stats *Stats) RegisterPrometheus() error { err = prometheus.Register(prometheus.NewCounterFunc(prometheus.CounterOpts{ Namespace: "tracee_ebpf", - Name: "write_lostevents_total", - Help: "events lost in the write buffer", - }, func() float64 { return float64(stats.LostWrCount.Get()) })) + Name: "bpf_perf_event_writes_total", + Help: "total number of calls to write to the event perf buffer", + }, func() float64 { return float64(stats.BPFPerfEventWrites.Get()) })) if err != nil { return errfmt.WrapError(err) @@ -74,9 +75,9 @@ func (stats *Stats) RegisterPrometheus() error { err = prometheus.Register(prometheus.NewCounterFunc(prometheus.CounterOpts{ Namespace: "tracee_ebpf", - Name: "network_capture_lostevents_total", - Help: "network capture lost events in network capture buffer", - }, func() float64 { return float64(stats.LostNtCapCount.Get()) })) + Name: "errors_total", + Help: "errors accumulated by tracee-ebpf", + }, func() float64 { return float64(stats.ErrorCount.Get()) })) if err != nil { return errfmt.WrapError(err) @@ -84,9 +85,9 @@ func (stats *Stats) RegisterPrometheus() error { err = prometheus.Register(prometheus.NewCounterFunc(prometheus.CounterOpts{ Namespace: "tracee_ebpf", - Name: "bpf_logs_total", - Help: "logs collected by tracee-ebpf during ebpf execution", - }, func() float64 { return float64(stats.BPFLogsCount.Get()) })) + Name: "lostevents_total", + Help: "events lost in the submission buffer", + }, func() float64 { return float64(stats.LostEvCount.Get()) })) if err != nil { return errfmt.WrapError(err) @@ -94,9 +95,19 @@ func (stats *Stats) RegisterPrometheus() error { err = 
prometheus.Register(prometheus.NewCounterFunc(prometheus.CounterOpts{ Namespace: "tracee_ebpf", - Name: "errors_total", - Help: "errors accumulated by tracee-ebpf", - }, func() float64 { return float64(stats.ErrorCount.Get()) })) + Name: "write_lostevents_total", + Help: "events lost in the write buffer", + }, func() float64 { return float64(stats.LostWrCount.Get()) })) + + if err != nil { + return errfmt.WrapError(err) + } + + err = prometheus.Register(prometheus.NewCounterFunc(prometheus.CounterOpts{ + Namespace: "tracee_ebpf", + Name: "network_capture_lostevents_total", + Help: "network capture lost events in network capture buffer", + }, func() float64 { return float64(stats.LostNtCapCount.Get()) })) return errfmt.WrapError(err) } diff --git a/pkg/server/grpc/diagnostic.go b/pkg/server/grpc/diagnostic.go index dbd01bd77b5e..4f703cb81bca 100644 --- a/pkg/server/grpc/diagnostic.go +++ b/pkg/server/grpc/diagnostic.go @@ -17,15 +17,16 @@ type DiagnosticService struct { func (s *DiagnosticService) GetMetrics(ctx context.Context, in *pb.GetMetricsRequest) (*pb.GetMetricsResponse, error) { stats := s.tracee.Stats() metrics := &pb.GetMetricsResponse{ - EventCount: stats.EventCount.Get(), - EventsFiltered: stats.EventsFiltered.Get(), - NetCapCount: stats.NetCapCount.Get(), - BPFLogsCount: stats.BPFLogsCount.Get(), - ErrorCount: stats.ErrorCount.Get(), - LostEvCount: stats.LostEvCount.Get(), - LostWrCount: stats.LostWrCount.Get(), - LostNtCapCount: stats.LostNtCapCount.Get(), - LostBPFLogsCount: stats.LostBPFLogsCount.Get(), + EventCount: stats.EventCount.Get(), + EventsFiltered: stats.EventsFiltered.Get(), + NetCapCount: stats.NetCapCount.Get(), + BPFLogsCount: stats.BPFLogsCount.Get(), + BPFPerfEventWrites: stats.BPFPerfEventWrites.Get(), // only available in debug build + ErrorCount: stats.ErrorCount.Get(), + LostEvCount: stats.LostEvCount.Get(), + LostWrCount: stats.LostWrCount.Get(), + LostNtCapCount: stats.LostNtCapCount.Get(), + LostBPFLogsCount: 
// debugMarker is the value of debug that identifies a debug build.
const debugMarker = "1"

// version is the release string reported by GetVersion. It is empty unless
// it is assigned from outside this file (e.g. by the linker at build time).
var version string

// debug is the build-mode flag inspected by DebugBuild. It is empty unless
// it is assigned from outside this file (e.g. by the linker at build time).
var debug string

// GetVersion returns the version string of this build.
func GetVersion() string {
	return version
}

// DebugBuild reports whether this binary was built as a debug build, i.e.
// whether the debug flag holds exactly "1".
func DebugBuild() bool {
	return debug == debugMarker
}