From 439159c7e00556c0df6ae1b3c6daf7c92491884c Mon Sep 17 00:00:00 2001 From: shuaibo_nan <117452805+nanshuaibo@users.noreply.github.com> Date: Fri, 10 May 2024 19:51:49 +0800 Subject: [PATCH 1/5] =?UTF-8?q?kvm=5Fwatcher:=E4=BC=98=E5=8C=96=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=EF=BC=8C=E6=9B=B4=E6=96=B0=E7=9B=AE=E5=BD=95=E7=BB=93?= =?UTF-8?q?=E6=9E=84=20(#790)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 修改makefile * 更新目录结构 * 添加用户态帮助函数 * 更新目录结构,添加用户态帮助函数 * update * 更新目录结构,添加用户态帮助函数 * update --- .gitignore | 2 +- eBPF_Supermarket/kvm_watcher/Makefile | 71 +- eBPF_Supermarket/kvm_watcher/README.md | 33 +- .../kvm_watcher/include/{ => bpf}/kvm_exits.h | 2 +- .../include/{ => bpf}/kvm_hypercall.h | 2 +- .../kvm_watcher/include/{ => bpf}/kvm_ioctl.h | 2 +- .../kvm_watcher/include/{ => bpf}/kvm_irq.h | 2 +- .../kvm_watcher/include/{ => bpf}/kvm_mmu.h | 14 +- .../kvm_watcher/include/{ => bpf}/kvm_vcpu.h | 2 +- .../include/{kvm_watcher.h => common.h} | 2 +- .../include/helpers/trace_helpers.h | 105 ++ .../include/helpers/uprobe_helpers.h | 19 + .../kvm_watcher/src/helpers/trace_helpers.c | 1182 +++++++++++++++++ .../kvm_watcher/src/helpers/uprobe_helpers.c | 287 ++++ .../kvm_watcher/src/kvm_watcher.bpf.c | 26 +- .../kvm_watcher/src/kvm_watcher.c | 12 +- 16 files changed, 1693 insertions(+), 70 deletions(-) rename eBPF_Supermarket/kvm_watcher/include/{ => bpf}/kvm_exits.h (99%) rename eBPF_Supermarket/kvm_watcher/include/{ => bpf}/kvm_hypercall.h (99%) rename eBPF_Supermarket/kvm_watcher/include/{ => bpf}/kvm_ioctl.h (99%) rename eBPF_Supermarket/kvm_watcher/include/{ => bpf}/kvm_irq.h (99%) rename eBPF_Supermarket/kvm_watcher/include/{ => bpf}/kvm_mmu.h (97%) rename eBPF_Supermarket/kvm_watcher/include/{ => bpf}/kvm_vcpu.h (99%) rename eBPF_Supermarket/kvm_watcher/include/{kvm_watcher.h => common.h} (99%) create mode 100644 eBPF_Supermarket/kvm_watcher/include/helpers/trace_helpers.h create mode 100644 
eBPF_Supermarket/kvm_watcher/include/helpers/uprobe_helpers.h create mode 100644 eBPF_Supermarket/kvm_watcher/src/helpers/trace_helpers.c create mode 100644 eBPF_Supermarket/kvm_watcher/src/helpers/uprobe_helpers.c diff --git a/.gitignore b/.gitignore index 681a24621..40e63914d 100644 --- a/.gitignore +++ b/.gitignore @@ -80,4 +80,4 @@ eBPF_Supermarket/CPU_Subsystem/eBPF_proc_image/controller # Stack_Analyser eBPF_Supermarket/Stack_Analyser/stack_analyzer eBPF_Supermarket/Stack_Analyser/exporter/exporter -eBPF_Supermarket/Stack_Analyser/bpf_skel \ No newline at end of file +eBPF_Supermarket/Stack_Analyser/bpf_skel diff --git a/eBPF_Supermarket/kvm_watcher/Makefile b/eBPF_Supermarket/kvm_watcher/Makefile index e292f83ba..cc5a05466 100644 --- a/eBPF_Supermarket/kvm_watcher/Makefile +++ b/eBPF_Supermarket/kvm_watcher/Makefile @@ -1,4 +1,3 @@ -.SILENT: ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \ | sed 's/arm.*/arm/' \ | sed 's/aarch64/arm64/' \ @@ -18,19 +17,12 @@ BPF_CFLAGS=-g -O2 -target bpf LIBS=-lbpf -lelf -lz -lzstd # 头文件目录 -INCLUDE_DIRS=-I/usr/include/x86_64-linux-gnu -I. +INCLUDE_DIRS=-I/usr/include/x86_64-linux-gnu -I. 
-I./include -I./include/bpf -I./include/helpers -# qemu 命令行参数变量化 -QEMU_CMD=sudo qemu-system-x86_64 -enable-kvm -cpu host -m 2048 -smp 4 -drive file=cirros-0.5.2-x86_64-disk.img,format=qcow2 -boot c -nographic - -CIRROS_IMG_URL=https://gitee.com/nan-shuaibo/cirros/releases/download/0.5.2/cirros-0.5.2-x86_64-disk.img -CIRROS_IMG_FILE=cirros-0.5.2-x86_64-disk.img - -# 定义检查虚拟化支持的命令 -CHECK_VIRT_SUPPORT = [ $$(grep -Eoc '(vmx|svm)' /proc/cpuinfo) -eq 0 ] - -# 定义检查 qemu-system-x86_64 进程是否存在的命令 -CHECK_QEMU_RUNNING = [ "$$(pgrep -f qemu-system-x86_64)" ] +# 帮助函数 +HELPERS_OBJ_DIR := src/helpers +HELPERS_FILES := $(wildcard $(HELPERS_OBJ_DIR)/*.c) +HELPERS_OBJ_FILES := $(patsubst $(HELPERS_OBJ_DIR)/%.c,$(HELPERS_OBJ_DIR)/%.o,$(HELPERS_FILES)) # 默认目标 .PHONY: default @@ -39,28 +31,53 @@ default: bpf # 安装必要的依赖 .PHONY: deps deps: - sudo apt-get update + sudo apt-get update && \ sudo apt-get install -y clang libelf1 libelf-dev zlib1g-dev libbpf-dev \ linux-tools-$$(uname -r) linux-cloud-tools-$$(uname -r) \ - libpcap-dev gcc-multilib build-essential - sudo apt-get install -y lolcat qemu-kvm wget + libpcap-dev gcc-multilib build-essential lolcat qemu-kvm wget + # 生成 vmlinux.h .PHONY: vmlinux vmlinux: bpftool btf dump file /sys/kernel/btf/kvm format c > ./include/vmlinux.h +# 编译helpers目录下的所有.c文件 +$(HELPERS_OBJ_DIR)/%.o: $(HELPERS_OBJ_DIR)/%.c + clang $(CFLAGS) $(INCLUDE_DIRS) -c $< -o $@ + +# 编译BPF程序 +$(APP).bpf.o: $(APP).bpf.c vmlinux + clang $(BPF_CFLAGS) -D__TARGET_ARCH_$(ARCH) $(INCLUDE_DIRS) -c $< -o $@ + +# 生成BPF骨架文件 +$(APP).skel.h: $(APP).bpf.o + bpftool gen skeleton $< > $@ + +# 编译用户空间应用程序 +${APP}.o: ${APP}.c + clang $(CFLAGS) $(INCLUDE_DIRS) -c $< -o $@ + +# 链接用户空间应用程序与库 +$(notdir $(APP)): ${APP}.o $(HELPERS_OBJ_FILES) + clang -Wall $(CFLAGS) ${APP}.o $(HELPERS_OBJ_FILES) $(LIBS) -o $@ + @echo "BPF program compiled successfully." 
+ # bpf 目标 .PHONY: bpf -bpf: vmlinux - # 编译BPF程序 - clang $(BPF_CFLAGS) -D__TARGET_ARCH_$(ARCH) $(INCLUDE_DIRS) -c $(APP).bpf.c -o $(APP).bpf.o - # 生成BPF骨架文件 - bpftool gen skeleton ${APP}.bpf.o > $(APP).skel.h - # 编译用户空间应用程序 - clang $(CFLAGS) $(INCLUDE_DIRS) -c $(APP).c -o ${APP}.o - # 将用户空间应用程序与库链接 - clang -Wall $(CFLAGS) ${APP}.o $(LIBS) -o $(notdir $(APP)) - echo "BPF program compiled successfully." +bpf: $(APP).skel.h $(APP).bpf.o ${APP}.o $(HELPERS_OBJ_FILES) $(notdir $(APP)) + + +# qemu 命令行参数变量化 +QEMU_CMD=sudo qemu-system-x86_64 -enable-kvm -cpu host -m 2048 -smp 4 -drive file=cirros-0.5.2-x86_64-disk.img,format=qcow2 -boot c -nographic + +CIRROS_IMG_URL=https://gitee.com/nan-shuaibo/cirros/releases/download/0.5.2/cirros-0.5.2-x86_64-disk.img +CIRROS_IMG_FILE=cirros-0.5.2-x86_64-disk.img + +# 定义检查虚拟化支持的命令 +CHECK_VIRT_SUPPORT = [ $$(grep -Eoc '(vmx|svm)' /proc/cpuinfo) -eq 0 ] + +# 定义检查 qemu-system-x86_64 进程是否存在的命令 +CHECK_QEMU_RUNNING = [ "$$(pgrep -f qemu-system-x86_64)" ] .PHONY: test test: bpf @@ -96,7 +113,7 @@ test: bpf clean: - cd src && rm -f *.o *.skel.h *.bpf.o + rm -f src/*.o src/*.skel.h src/helpers/*.o sudo rm -rf $(notdir $(APP)) include/vmlinux.h temp diff --git a/eBPF_Supermarket/kvm_watcher/README.md b/eBPF_Supermarket/kvm_watcher/README.md index 73218670b..c35a4647d 100755 --- a/eBPF_Supermarket/kvm_watcher/README.md +++ b/eBPF_Supermarket/kvm_watcher/README.md @@ -118,26 +118,33 @@ Report bugs to . 
## 四、代码结构 ``` -├── docs //功能模块说明文档 +├── docs //功能模块说明文档 │ ├── kvm_exit.md │ ├── kvm_hypercall.md │ ├── kvm_irq.md │ ├── kvm_mmu.md │ └── kvm_vcpu.md -├── include //内核态bpf程序 -│ ├── kvm_exits.h -│ ├── kvm_hypercall.h -│ ├── kvm_ioctl.h -│ ├── kvm_irq.h -│ ├── kvm_mmu.h -│ ├── kvm_vcpu.h -│ └── kvm_watcher.h //公共头文件 +├── include +│ ├── bpf //内核态bpf程序 +│ │ ├── kvm_exits.h +│ │ ├── kvm_hypercall.h +│ │ ├── kvm_ioctl.h +│ │ ├── kvm_irq.h +│ │ ├── kvm_mmu.h +│ │ └── kvm_vcpu.h +│ ├── common.h //内核态和用户态公共头文件 +│ └── helpers //用户态帮助函数 +│ ├── trace_helpers.h +│ └── uprobe_helpers.h ├── Makefile //编译脚本 ├── README.md -├── src -│ ├── kvm_watcher.bpf.c //内核态bpf程序入口 -│ └── kvm_watcher.c //用户态bpf程序 -└── temp //临时文件目录 +├── src +│ ├── helpers //用户态帮助函数 +│ │ ├── trace_helpers.c +│ │ └── uprobe_helpers.c +│ ├── kvm_watcher.bpf.c //内核态bpf程序入口 +│ └── kvm_watcher.c //用户态bpf程序 +└── temp //临时文件目录 ``` ## 五、测试 diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_exits.h b/eBPF_Supermarket/kvm_watcher/include/bpf/kvm_exits.h similarity index 99% rename from eBPF_Supermarket/kvm_watcher/include/kvm_exits.h rename to eBPF_Supermarket/kvm_watcher/include/bpf/kvm_exits.h index 1e381d007..12faa0405 100644 --- a/eBPF_Supermarket/kvm_watcher/include/kvm_exits.h +++ b/eBPF_Supermarket/kvm_watcher/include/bpf/kvm_exits.h @@ -19,7 +19,7 @@ #ifndef __KVM_EXITS_H #define __KVM_EXITS_H -#include "kvm_watcher.h" +#include "common.h" #include "vmlinux.h" #include #include diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_hypercall.h b/eBPF_Supermarket/kvm_watcher/include/bpf/kvm_hypercall.h similarity index 99% rename from eBPF_Supermarket/kvm_watcher/include/kvm_hypercall.h rename to eBPF_Supermarket/kvm_watcher/include/bpf/kvm_hypercall.h index ab9b26a52..b4768d720 100644 --- a/eBPF_Supermarket/kvm_watcher/include/kvm_hypercall.h +++ b/eBPF_Supermarket/kvm_watcher/include/bpf/kvm_hypercall.h @@ -19,7 +19,7 @@ #ifndef __KVM_HYPERCALL_H #define __KVM_HYPERCALL_H -#include "kvm_watcher.h" +#include 
"common.h" #include "vmlinux.h" #include #include diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h b/eBPF_Supermarket/kvm_watcher/include/bpf/kvm_ioctl.h similarity index 99% rename from eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h rename to eBPF_Supermarket/kvm_watcher/include/bpf/kvm_ioctl.h index dd79a7702..6b619e437 100644 --- a/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h +++ b/eBPF_Supermarket/kvm_watcher/include/bpf/kvm_ioctl.h @@ -19,7 +19,7 @@ #ifndef __KVM_IOCTL_H #define __KVM_IOCTL_H -#include "kvm_watcher.h" +#include "common.h" #include "vmlinux.h" #include #include diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_irq.h b/eBPF_Supermarket/kvm_watcher/include/bpf/kvm_irq.h similarity index 99% rename from eBPF_Supermarket/kvm_watcher/include/kvm_irq.h rename to eBPF_Supermarket/kvm_watcher/include/bpf/kvm_irq.h index fa1fda75c..2325f2467 100644 --- a/eBPF_Supermarket/kvm_watcher/include/kvm_irq.h +++ b/eBPF_Supermarket/kvm_watcher/include/bpf/kvm_irq.h @@ -18,7 +18,7 @@ #ifndef __KVM_IRQ_H #define __KVM_IRQ_H -#include "kvm_watcher.h" +#include "common.h" #include "vmlinux.h" #include #include diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_mmu.h b/eBPF_Supermarket/kvm_watcher/include/bpf/kvm_mmu.h similarity index 97% rename from eBPF_Supermarket/kvm_watcher/include/kvm_mmu.h rename to eBPF_Supermarket/kvm_watcher/include/bpf/kvm_mmu.h index ceea6d10d..45d5c1c52 100644 --- a/eBPF_Supermarket/kvm_watcher/include/kvm_mmu.h +++ b/eBPF_Supermarket/kvm_watcher/include/bpf/kvm_mmu.h @@ -19,7 +19,7 @@ #ifndef __KVM_MMU_H #define __KVM_MMU_H -#include "kvm_watcher.h" +#include "common.h" #include "vmlinux.h" #include #include @@ -60,16 +60,18 @@ static int trace_tdp_page_fault(struct kvm_vcpu *vcpu, struct common_event *e) { u64 addr; bpf_probe_read_kernel(&addr, sizeof(u64), &fault->addr); - u64 *ts; - ts = bpf_map_lookup_elem(&pf_delay, &addr); - if (!ts) { - return 0; - } u32 *count; u32 new_count = 1; u32 error_code; u64 
hva, pfn; bpf_probe_read_kernel(&error_code, sizeof(u32), &fault->error_code); + u64 *ts; + ts = bpf_map_lookup_elem(&pf_delay, &addr); + if (!ts) { + int a = *ts; + bpf_printk("trace_tdp_page_fault:ts = %d", a); + return 0; + } bpf_probe_read_kernel(&hva, sizeof(u64), &fault->hva); bpf_probe_read_kernel(&pfn, sizeof(u64), &fault->pfn); short memslot_id = BPF_CORE_READ(fault, slot, id); diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_vcpu.h b/eBPF_Supermarket/kvm_watcher/include/bpf/kvm_vcpu.h similarity index 99% rename from eBPF_Supermarket/kvm_watcher/include/kvm_vcpu.h rename to eBPF_Supermarket/kvm_watcher/include/bpf/kvm_vcpu.h index abf6f3f5a..65ef18a54 100644 --- a/eBPF_Supermarket/kvm_watcher/include/kvm_vcpu.h +++ b/eBPF_Supermarket/kvm_watcher/include/bpf/kvm_vcpu.h @@ -19,7 +19,7 @@ #ifndef __KVM_VCPU_H #define __KVM_VCPU_H -#include "kvm_watcher.h" +#include "common.h" #include "vmlinux.h" #include #include diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_watcher.h b/eBPF_Supermarket/kvm_watcher/include/common.h similarity index 99% rename from eBPF_Supermarket/kvm_watcher/include/kvm_watcher.h rename to eBPF_Supermarket/kvm_watcher/include/common.h index 5fe4fc30f..d9435e584 100644 --- a/eBPF_Supermarket/kvm_watcher/include/kvm_watcher.h +++ b/eBPF_Supermarket/kvm_watcher/include/common.h @@ -92,7 +92,7 @@ static const char binary_path[] = "/bin/qemu-system-x86_64"; // IOCTL #include #define KVMIO 0xAE -#define KVM_CREATE_VM _IO(KVMIO, 0x01) +#define KVM_CREATE_VM _IO(KVMIO, 0x01) #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) #define KVM_GET_VCPU_EVENTS _IOR(KVMIO, 0x9f, struct kvm_vcpu_events) #define KVM_SET_VCPU_EVENTS _IOW(KVMIO, 0xa0, struct kvm_vcpu_events) diff --git a/eBPF_Supermarket/kvm_watcher/include/helpers/trace_helpers.h b/eBPF_Supermarket/kvm_watcher/include/helpers/trace_helpers.h new file mode 100644 index 000000000..052e278ac --- /dev/null +++ b/eBPF_Supermarket/kvm_watcher/include/helpers/trace_helpers.h @@ -0,0 +1,105 @@ +/* 
SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __TRACE_HELPERS_H +#define __TRACE_HELPERS_H + +#include + +#define NSEC_PER_SEC 1000000000ULL + +struct ksym { + const char *name; + unsigned long addr; +}; + +struct ksyms; + +struct ksyms *ksyms__load(void); +void ksyms__free(struct ksyms *ksyms); +const struct ksym *ksyms__map_addr(const struct ksyms *ksyms, + unsigned long addr); +const struct ksym *ksyms__get_symbol(const struct ksyms *ksyms, + const char *name); + +struct sym { + const char *name; + unsigned long start; + unsigned long size; + unsigned long offset; +}; + +struct syms; + +struct syms *syms__load_pid(int tgid); +struct syms *syms__load_file(const char *fname); +void syms__free(struct syms *syms); +const struct sym *syms__map_addr(const struct syms *syms, unsigned long addr); +const struct sym *syms__map_addr_dso(const struct syms *syms, + unsigned long addr, char **dso_name, + unsigned long *dso_offset); + +struct syms_cache; + +struct syms_cache *syms_cache__new(int nr); +struct syms *syms_cache__get_syms(struct syms_cache *syms_cache, int tgid); +void syms_cache__free(struct syms_cache *syms_cache); + +struct partition { + char *name; + unsigned int dev; +}; + +struct partitions; + +struct partitions *partitions__load(void); +void partitions__free(struct partitions *partitions); +const struct partition *partitions__get_by_dev( + const struct partitions *partitions, unsigned int dev); +const struct partition *partitions__get_by_name( + const struct partitions *partitions, const char *name); + +void print_log2_hist(unsigned int *vals, int vals_size, const char *val_type); +void print_linear_hist(unsigned int *vals, int vals_size, unsigned int base, + unsigned int step, const char *val_type); + +unsigned long long get_ktime_ns(void); + +bool is_kernel_module(const char *name); + +/* + * When attempting to use kprobe/kretprobe, please check out new fentry/fexit + * probes, as they provide better performance and usability. 
But in some + * situations we have to fall back to kprobe/kretprobe probes. This helper + * is used to detect fentry/fexit support for the specified kernel function. + * + * 1. A gap between kernel versions, kernel BTF is exposed + * starting from 5.4 kernel, but fentry/fexit is actually + * supported starting from 5.5. + * 2. Whether kernel supports module BTF or not + * + * *name* is the name of a kernel function to be attached to, which can be + * from vmlinux or a kernel module. + * *mod* is a hint that indicates the *name* may reside in module BTF, + * if NULL, it means *name* belongs to vmlinux. + */ +bool fentry_can_attach(const char *name, const char *mod); + +/* + * The name of a kernel function to be attached to may be changed between + * kernel releases. This helper is used to confirm whether the target kernel + * uses a certain function name before attaching. + * + * It is achieved by scanning + * /sys/kernel/debug/tracing/available_filter_functions + * If this file does not exist, it falls back to parsing /proc/kallsyms, + * which is slower. + */ +bool kprobe_exists(const char *name); +bool tracepoint_exists(const char *category, const char *event); + +bool vmlinux_btf_exists(void); +bool module_btf_exists(const char *mod); + +bool probe_tp_btf(const char *name); +bool probe_ringbuf(); + +#endif /* __TRACE_HELPERS_H */ diff --git a/eBPF_Supermarket/kvm_watcher/include/helpers/uprobe_helpers.h b/eBPF_Supermarket/kvm_watcher/include/helpers/uprobe_helpers.h new file mode 100644 index 000000000..d33f7c1cf --- /dev/null +++ b/eBPF_Supermarket/kvm_watcher/include/helpers/uprobe_helpers.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2021 Google LLC.
*/ +#ifndef __UPROBE_HELPERS_H +#define __UPROBE_HELPERS_H + +#include +#include +#include + +int get_pid_binary_path(pid_t pid, char *path, size_t path_sz); +int get_pid_lib_path(pid_t pid, const char *lib, char *path, size_t path_sz); +int resolve_binary_path(const char *binary, pid_t pid, char *path, + size_t path_sz); +off_t get_elf_func_offset(const char *path, const char *func); +Elf *open_elf(const char *path, int *fd_close); +Elf *open_elf_by_fd(int fd); +void close_elf(Elf *e, int fd_close); + +#endif /* __UPROBE_HELPERS_H */ diff --git a/eBPF_Supermarket/kvm_watcher/src/helpers/trace_helpers.c b/eBPF_Supermarket/kvm_watcher/src/helpers/trace_helpers.c new file mode 100644 index 000000000..c84ed19a0 --- /dev/null +++ b/eBPF_Supermarket/kvm_watcher/src/helpers/trace_helpers.c @@ -0,0 +1,1182 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +// Copyright (c) 2020 Wenbo Zhang +// +// Based on ksyms improvements from Andrii Nakryiko, add more helpers. +// 28-Feb-2020 Wenbo Zhang Created this. +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "trace_helpers.h" +#include "uprobe_helpers.h" + +#define min(x, y) \ + ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void)(&_min1 == &_min2); \ + _min1 < _min2 ? 
_min1 : _min2; \ + }) + +#define DISK_NAME_LEN 32 + +#define MINORBITS 20 +#define MINORMASK ((1U << MINORBITS) - 1) + +#define MKDEV(ma, mi) (((ma) << MINORBITS) | (mi)) + +struct ksyms { + struct ksym *syms; + int syms_sz; + int syms_cap; + char *strs; + int strs_sz; + int strs_cap; +}; + +static int ksyms__add_symbol(struct ksyms *ksyms, const char *name, + unsigned long addr) { + size_t new_cap, name_len = strlen(name) + 1; + struct ksym *ksym; + void *tmp; + + if (ksyms->strs_sz + name_len > ksyms->strs_cap) { + new_cap = ksyms->strs_cap * 4 / 3; + if (new_cap < ksyms->strs_sz + name_len) + new_cap = ksyms->strs_sz + name_len; + if (new_cap < 1024) + new_cap = 1024; + tmp = realloc(ksyms->strs, new_cap); + if (!tmp) + return -1; + ksyms->strs = tmp; + ksyms->strs_cap = new_cap; + } + if (ksyms->syms_sz + 1 > ksyms->syms_cap) { + new_cap = ksyms->syms_cap * 4 / 3; + if (new_cap < 1024) + new_cap = 1024; + tmp = realloc(ksyms->syms, sizeof(*ksyms->syms) * new_cap); + if (!tmp) + return -1; + ksyms->syms = tmp; + ksyms->syms_cap = new_cap; + } + + ksym = &ksyms->syms[ksyms->syms_sz]; + /* while constructing, re-use pointer as just a plain offset */ + ksym->name = (void *)(unsigned long)ksyms->strs_sz; + ksym->addr = addr; + + memcpy(ksyms->strs + ksyms->strs_sz, name, name_len); + ksyms->strs_sz += name_len; + ksyms->syms_sz++; + + return 0; +} + +static int ksym_cmp(const void *p1, const void *p2) { + const struct ksym *s1 = p1, *s2 = p2; + + if (s1->addr == s2->addr) + return strcmp(s1->name, s2->name); + return s1->addr < s2->addr ? 
-1 : 1; +} + +struct ksyms *ksyms__load(void) { + char sym_type, sym_name[256]; + struct ksyms *ksyms; + unsigned long sym_addr; + int i, ret; + FILE *f; + + f = fopen("/proc/kallsyms", "r"); + if (!f) + return NULL; + + ksyms = calloc(1, sizeof(*ksyms)); + if (!ksyms) + goto err_out; + + while (true) { + ret = fscanf(f, "%lx %c %s%*[^\n]\n", &sym_addr, &sym_type, sym_name); + if (ret == EOF && feof(f)) + break; + if (ret != 3) + goto err_out; + if (ksyms__add_symbol(ksyms, sym_name, sym_addr)) + goto err_out; + } + + /* now when strings are finalized, adjust pointers properly */ + for (i = 0; i < ksyms->syms_sz; i++) + ksyms->syms[i].name += (unsigned long)ksyms->strs; + + qsort(ksyms->syms, ksyms->syms_sz, sizeof(*ksyms->syms), ksym_cmp); + + fclose(f); + return ksyms; + +err_out: + ksyms__free(ksyms); + fclose(f); + return NULL; +} + +void ksyms__free(struct ksyms *ksyms) { + if (!ksyms) + return; + + free(ksyms->syms); + free(ksyms->strs); + free(ksyms); +} + +const struct ksym *ksyms__map_addr(const struct ksyms *ksyms, + unsigned long addr) { + int start = 0, end = ksyms->syms_sz - 1, mid; + unsigned long sym_addr; + + /* find largest sym_addr <= addr using binary search */ + while (start < end) { + mid = start + (end - start + 1) / 2; + sym_addr = ksyms->syms[mid].addr; + + if (sym_addr <= addr) + start = mid; + else + end = mid - 1; + } + + if (start == end && ksyms->syms[start].addr <= addr) + return &ksyms->syms[start]; + return NULL; +} + +const struct ksym *ksyms__get_symbol(const struct ksyms *ksyms, + const char *name) { + int i; + + for (i = 0; i < ksyms->syms_sz; i++) { + if (strcmp(ksyms->syms[i].name, name) == 0) + return &ksyms->syms[i]; + } + + return NULL; +} + +struct load_range { + uint64_t start; + uint64_t end; + uint64_t file_off; +}; + +enum elf_type { + EXEC, + DYN, + PERF_MAP, + VDSO, + UNKNOWN, +}; + +struct dso { + char *name; + struct load_range *ranges; + int range_sz; + /* Dyn's first text section virtual addr at execution */ + 
uint64_t sh_addr; + /* Dyn's first text section file offset */ + uint64_t sh_offset; + enum elf_type type; + + struct sym *syms; + int syms_sz; + int syms_cap; + + /* + * libbpf's struct btf is actually a pretty efficient + * "set of strings" data structure, so we create an + * empty one and use it to store symbol names. + */ + struct btf *btf; +}; + +struct map { + uint64_t start_addr; + uint64_t end_addr; + uint64_t file_off; + uint64_t dev_major; + uint64_t dev_minor; + uint64_t inode; +}; + +struct syms { + struct dso *dsos; + int dso_sz; +}; + +static bool is_file_backed(const char *mapname) { +#define STARTS_WITH(mapname, prefix) \ + (!strncmp(mapname, prefix, sizeof(prefix) - 1)) + + return mapname[0] && + !(STARTS_WITH(mapname, "//anon") || + STARTS_WITH(mapname, "/dev/zero") || + STARTS_WITH(mapname, "/anon_hugepage") || + STARTS_WITH(mapname, "[stack") || STARTS_WITH(mapname, "/SYSV") || + STARTS_WITH(mapname, "[heap]") || + STARTS_WITH(mapname, "[uprobes]") || + STARTS_WITH(mapname, "[vsyscall]")); +} + +static bool is_perf_map(const char *path) { + return false; +} + +static bool is_vdso(const char *path) { + return !strcmp(path, "[vdso]"); +} + +static bool is_uprobes(const char *path) { + return !strcmp(path, "[uprobes]"); +} + +static int get_elf_type(const char *path) { + GElf_Ehdr hdr; + void *res; + Elf *e; + int fd; + + if (is_vdso(path)) + return -1; + if (is_uprobes(path)) + return -1; + e = open_elf(path, &fd); + if (!e) + return -1; + res = gelf_getehdr(e, &hdr); + close_elf(e, fd); + if (!res) + return -1; + return hdr.e_type; +} + +static int get_elf_text_scn_info(const char *path, uint64_t *addr, + uint64_t *offset) { + Elf_Scn *section = NULL; + int fd = -1, err = -1; + GElf_Shdr header; + size_t stridx; + Elf *e = NULL; + char *name; + + e = open_elf(path, &fd); + if (!e) + goto err_out; + err = elf_getshdrstrndx(e, &stridx); + if (err < 0) + goto err_out; + + err = -1; + while ((section = elf_nextscn(e, section)) != 0) { + if 
(!gelf_getshdr(section, &header)) + continue; + + name = elf_strptr(e, stridx, header.sh_name); + if (name && !strcmp(name, ".text")) { + *addr = (uint64_t)header.sh_addr; + *offset = (uint64_t)header.sh_offset; + err = 0; + break; + } + } + +err_out: + close_elf(e, fd); + return err; +} + +static int syms__add_dso(struct syms *syms, struct map *map, const char *name) { + struct dso *dso = NULL; + int i, type; + void *tmp; + + for (i = 0; i < syms->dso_sz; i++) { + if (!strcmp(syms->dsos[i].name, name)) { + dso = &syms->dsos[i]; + break; + } + } + + if (!dso) { + tmp = realloc(syms->dsos, (syms->dso_sz + 1) * sizeof(*syms->dsos)); + if (!tmp) + return -1; + syms->dsos = tmp; + dso = &syms->dsos[syms->dso_sz++]; + memset(dso, 0, sizeof(*dso)); + dso->name = strdup(name); + dso->btf = btf__new_empty(); + } + + tmp = realloc(dso->ranges, (dso->range_sz + 1) * sizeof(*dso->ranges)); + if (!tmp) + return -1; + dso->ranges = tmp; + dso->ranges[dso->range_sz].start = map->start_addr; + dso->ranges[dso->range_sz].end = map->end_addr; + dso->ranges[dso->range_sz].file_off = map->file_off; + dso->range_sz++; + type = get_elf_type(name); + if (type == ET_EXEC) { + dso->type = EXEC; + } else if (type == ET_DYN) { + dso->type = DYN; + if (get_elf_text_scn_info(name, &dso->sh_addr, &dso->sh_offset) < 0) + return -1; + } else if (is_perf_map(name)) { + dso->type = PERF_MAP; + } else if (is_vdso(name)) { + dso->type = VDSO; + } else { + dso->type = UNKNOWN; + } + return 0; +} + +static struct dso *syms__find_dso(const struct syms *syms, unsigned long addr, + uint64_t *offset) { + struct load_range *range; + struct dso *dso; + int i, j; + + for (i = 0; i < syms->dso_sz; i++) { + dso = &syms->dsos[i]; + for (j = 0; j < dso->range_sz; j++) { + range = &dso->ranges[j]; + if (addr <= range->start || addr >= range->end) + continue; + if (dso->type == DYN || dso->type == VDSO) { + /* Offset within the mmap */ + *offset = addr - range->start + range->file_off; + /* Offset within the ELF 
for dyn symbol lookup */ + *offset += dso->sh_addr - dso->sh_offset; + } else { + *offset = addr; + } + + return dso; + } + } + + return NULL; +} + +static int dso__load_sym_table_from_perf_map(struct dso *dso) { + return -1; +} + +static int dso__add_sym(struct dso *dso, const char *name, uint64_t start, + uint64_t size) { + struct sym *sym; + size_t new_cap; + void *tmp; + int off; + + off = btf__add_str(dso->btf, name); + if (off < 0) + return off; + + if (dso->syms_sz + 1 > dso->syms_cap) { + new_cap = dso->syms_cap * 4 / 3; + if (new_cap < 1024) + new_cap = 1024; + tmp = realloc(dso->syms, sizeof(*dso->syms) * new_cap); + if (!tmp) + return -1; + dso->syms = tmp; + dso->syms_cap = new_cap; + } + + sym = &dso->syms[dso->syms_sz++]; + /* while constructing, re-use pointer as just a plain offset */ + sym->name = (void *)(unsigned long)off; + sym->start = start; + sym->size = size; + sym->offset = 0; + + return 0; +} + +static int sym_cmp(const void *p1, const void *p2) { + const struct sym *s1 = p1, *s2 = p2; + + if (s1->start == s2->start) + return strcmp(s1->name, s2->name); + return s1->start < s2->start ? 
-1 : 1; +} + +static int dso__add_syms(struct dso *dso, Elf *e, Elf_Scn *section, + size_t stridx, size_t symsize) { + Elf_Data *data = NULL; + + while ((data = elf_getdata(section, data)) != 0) { + size_t i, symcount = data->d_size / symsize; + + if (data->d_size % symsize) + return -1; + + for (i = 0; i < symcount; ++i) { + const char *name; + GElf_Sym sym; + + if (!gelf_getsym(data, (int)i, &sym)) + continue; + if (!(name = elf_strptr(e, stridx, sym.st_name))) + continue; + if (name[0] == '\0') + continue; + + if (sym.st_value == 0) + continue; + + if (dso__add_sym(dso, name, sym.st_value, sym.st_size)) + goto err_out; + } + } + + return 0; + +err_out: + return -1; +} + +static void dso__free_fields(struct dso *dso) { + if (!dso) + return; + + free(dso->name); + free(dso->ranges); + free(dso->syms); + btf__free(dso->btf); +} + +static int dso__load_sym_table_from_elf(struct dso *dso, int fd) { + Elf_Scn *section = NULL; + Elf *e; + int i; + + e = fd > 0 ? open_elf_by_fd(fd) : open_elf(dso->name, &fd); + if (!e) + return -1; + + while ((section = elf_nextscn(e, section)) != 0) { + GElf_Shdr header; + + if (!gelf_getshdr(section, &header)) + continue; + + if (header.sh_type != SHT_SYMTAB && header.sh_type != SHT_DYNSYM) + continue; + + if (dso__add_syms(dso, e, section, header.sh_link, header.sh_entsize)) + goto err_out; + } + + /* now when strings are finalized, adjust pointers properly */ + for (i = 0; i < dso->syms_sz; i++) + dso->syms[i].name = + btf__name_by_offset(dso->btf, (unsigned long)dso->syms[i].name); + + qsort(dso->syms, dso->syms_sz, sizeof(*dso->syms), sym_cmp); + + close_elf(e, fd); + return 0; + +err_out: + dso__free_fields(dso); + close_elf(e, fd); + return -1; +} + +static int create_tmp_vdso_image(struct dso *dso) { + uint64_t start_addr, end_addr; + long pid = getpid(); + char buf[PATH_MAX]; + void *image = NULL; + char tmpfile[128]; + int ret, fd = -1; + uint64_t sz; + char *name; + FILE *f; + + snprintf(tmpfile, sizeof(tmpfile), 
"/proc/%ld/maps", pid); + f = fopen(tmpfile, "r"); + if (!f) + return -1; + + while (true) { + ret = fscanf(f, "%llx-%llx %*s %*x %*x:%*x %*u%[^\n]", + (long long *)&start_addr, (long long *)&end_addr, buf); + if (ret == EOF && feof(f)) + break; + if (ret != 3) + goto err_out; + + name = buf; + while (isspace(*name)) + name++; + if (!is_file_backed(name)) + continue; + if (is_vdso(name)) + break; + } + + sz = end_addr - start_addr; + image = malloc(sz); + if (!image) + goto err_out; + memcpy(image, (void *)start_addr, sz); + + snprintf(tmpfile, sizeof(tmpfile), "/tmp/libbpf_%ld_vdso_image_XXXXXX", + pid); + fd = mkostemp(tmpfile, O_CLOEXEC); + if (fd < 0) { + fprintf(stderr, "failed to create temp file: %s\n", strerror(errno)); + goto err_out; + } + /* Unlink the file to avoid leaking */ + if (unlink(tmpfile) == -1) + fprintf(stderr, "failed to unlink %s: %s\n", tmpfile, strerror(errno)); + if (write(fd, image, sz) == -1) { + fprintf(stderr, "failed to write to vDSO image: %s\n", strerror(errno)); + close(fd); + fd = -1; + goto err_out; + } + +err_out: + fclose(f); + free(image); + return fd; +} + +static int dso__load_sym_table_from_vdso_image(struct dso *dso) { + int fd = create_tmp_vdso_image(dso); + + if (fd < 0) + return -1; + return dso__load_sym_table_from_elf(dso, fd); +} + +static int dso__load_sym_table(struct dso *dso) { + if (dso->type == UNKNOWN) + return -1; + if (dso->type == PERF_MAP) + return dso__load_sym_table_from_perf_map(dso); + if (dso->type == EXEC || dso->type == DYN) + return dso__load_sym_table_from_elf(dso, 0); + if (dso->type == VDSO) + return dso__load_sym_table_from_vdso_image(dso); + return -1; +} + +static struct sym *dso__find_sym(struct dso *dso, uint64_t offset) { + unsigned long sym_addr; + int start, end, mid; + + if (!dso->syms && dso__load_sym_table(dso)) + return NULL; + + start = 0; + end = dso->syms_sz - 1; + + /* find largest sym_addr <= addr using binary search */ + while (start < end) { + mid = start + (end - start + 1) 
/ 2; + sym_addr = dso->syms[mid].start; + + if (sym_addr <= offset) + start = mid; + else + end = mid - 1; + } + + if (start == end && dso->syms[start].start <= offset && + offset < dso->syms[start].start + dso->syms[start].size) { + (dso->syms[start]).offset = offset - dso->syms[start].start; + return &dso->syms[start]; + } + return NULL; +} + +struct syms *syms__load_file(const char *fname) { + char buf[PATH_MAX], perm[5]; + struct syms *syms; + struct map map; + char *name; + FILE *f; + int ret; + + f = fopen(fname, "r"); + if (!f) + return NULL; + + syms = calloc(1, sizeof(*syms)); + if (!syms) + goto err_out; + + while (true) { + ret = fscanf(f, "%llx-%llx %4s %llx %llx:%llx %llu%[^\n]", + (long long *)&map.start_addr, (long long *)&map.end_addr, + perm, (long long *)&map.file_off, + (long long *)&map.dev_major, (long long *)&map.dev_minor, + (long long *)&map.inode, buf); + if (ret == EOF && feof(f)) + break; + if (ret != 8) /* perf-.map */ + goto err_out; + + if (perm[2] != 'x') + continue; + + name = buf; + while (isspace(*name)) + name++; + if (!is_file_backed(name)) + continue; + + if (syms__add_dso(syms, &map, name)) + goto err_out; + } + + fclose(f); + return syms; + +err_out: + syms__free(syms); + fclose(f); + return NULL; +} + +struct syms *syms__load_pid(pid_t tgid) { + char fname[128]; + + snprintf(fname, sizeof(fname), "/proc/%ld/maps", (long)tgid); + return syms__load_file(fname); +} + +void syms__free(struct syms *syms) { + int i; + + if (!syms) + return; + + for (i = 0; i < syms->dso_sz; i++) + dso__free_fields(&syms->dsos[i]); + free(syms->dsos); + free(syms); +} + +const struct sym *syms__map_addr(const struct syms *syms, unsigned long addr) { + struct dso *dso; + uint64_t offset; + + dso = syms__find_dso(syms, addr, &offset); + if (!dso) + return NULL; + return dso__find_sym(dso, offset); +} + +const struct sym *syms__map_addr_dso(const struct syms *syms, + unsigned long addr, char **dso_name, + unsigned long *dso_offset) { + struct dso *dso; 
+ uint64_t offset; + + dso = syms__find_dso(syms, addr, &offset); + if (!dso) + return NULL; + + *dso_name = dso->name; + *dso_offset = offset; + + return dso__find_sym(dso, offset); +} + +struct syms_cache { + struct { + struct syms *syms; + int tgid; + } *data; + int nr; +}; + +struct syms_cache *syms_cache__new(int nr) { + struct syms_cache *syms_cache; + + syms_cache = calloc(1, sizeof(*syms_cache)); + if (!syms_cache) + return NULL; + if (nr > 0) + syms_cache->data = calloc(nr, sizeof(*syms_cache->data)); + return syms_cache; +} + +void syms_cache__free(struct syms_cache *syms_cache) { + int i; + + if (!syms_cache) + return; + + for (i = 0; i < syms_cache->nr; i++) + syms__free(syms_cache->data[i].syms); + free(syms_cache->data); + free(syms_cache); +} + +struct syms *syms_cache__get_syms(struct syms_cache *syms_cache, int tgid) { + void *tmp; + int i; + + for (i = 0; i < syms_cache->nr; i++) { + if (syms_cache->data[i].tgid == tgid) + return syms_cache->data[i].syms; + } + + tmp = realloc(syms_cache->data, + (syms_cache->nr + 1) * sizeof(*syms_cache->data)); + if (!tmp) + return NULL; + syms_cache->data = tmp; + syms_cache->data[syms_cache->nr].syms = syms__load_pid(tgid); + syms_cache->data[syms_cache->nr].tgid = tgid; + return syms_cache->data[syms_cache->nr++].syms; +} + +struct partitions { + struct partition *items; + int sz; +}; + +static int partitions__add_partition(struct partitions *partitions, + const char *name, unsigned int dev) { + struct partition *partition; + void *tmp; + + tmp = realloc(partitions->items, + (partitions->sz + 1) * sizeof(*partitions->items)); + if (!tmp) + return -1; + partitions->items = tmp; + partition = &partitions->items[partitions->sz]; + partition->name = strdup(name); + partition->dev = dev; + partitions->sz++; + + return 0; +} + +struct partitions *partitions__load(void) { + char part_name[DISK_NAME_LEN]; + unsigned int devmaj, devmin; + unsigned long long nop; + struct partitions *partitions; + char buf[64]; + FILE 
*f; + + f = fopen("/proc/partitions", "r"); + if (!f) + return NULL; + + partitions = calloc(1, sizeof(*partitions)); + if (!partitions) + goto err_out; + + while (fgets(buf, sizeof(buf), f) != NULL) { + /* skip heading */ + if (buf[0] != ' ' || buf[0] == '\n') + continue; + if (sscanf(buf, "%u %u %llu %s", &devmaj, &devmin, &nop, part_name) != + 4) + goto err_out; + if (partitions__add_partition(partitions, part_name, + MKDEV(devmaj, devmin))) + goto err_out; + } + + fclose(f); + return partitions; + +err_out: + partitions__free(partitions); + fclose(f); + return NULL; +} + +void partitions__free(struct partitions *partitions) { + int i; + + if (!partitions) + return; + + for (i = 0; i < partitions->sz; i++) + free(partitions->items[i].name); + free(partitions->items); + free(partitions); +} + +const struct partition *partitions__get_by_dev( + const struct partitions *partitions, unsigned int dev) { + int i; + + for (i = 0; i < partitions->sz; i++) { + if (partitions->items[i].dev == dev) + return &partitions->items[i]; + } + + return NULL; +} + +const struct partition *partitions__get_by_name( + const struct partitions *partitions, const char *name) { + int i; + + for (i = 0; i < partitions->sz; i++) { + if (strcmp(partitions->items[i].name, name) == 0) + return &partitions->items[i]; + } + + return NULL; +} + +static void print_stars(unsigned int val, unsigned int val_max, int width) { + int num_stars, num_spaces, i; + bool need_plus; + + num_stars = min(val, val_max) * width / val_max; + num_spaces = width - num_stars; + need_plus = val > val_max; + + for (i = 0; i < num_stars; i++) + printf("*"); + for (i = 0; i < num_spaces; i++) + printf(" "); + if (need_plus) + printf("+"); +} + +void print_log2_hist(unsigned int *vals, int vals_size, const char *val_type) { + int stars_max = 40, idx_max = -1; + unsigned int val, val_max = 0; + unsigned long long low, high; + int stars, width, i; + + for (i = 0; i < vals_size; i++) { + val = vals[i]; + if (val > 0) + 
idx_max = i; + if (val > val_max) + val_max = val; + } + + if (idx_max < 0) + return; + + printf("%*s%-*s : count distribution\n", idx_max <= 32 ? 5 : 15, "", + idx_max <= 32 ? 19 : 29, val_type); + + if (idx_max <= 32) + stars = stars_max; + else + stars = stars_max / 2; + + for (i = 0; i <= idx_max; i++) { + low = (1ULL << (i + 1)) >> 1; + high = (1ULL << (i + 1)) - 1; + if (low == high) + low -= 1; + val = vals[i]; + width = idx_max <= 32 ? 10 : 20; + printf("%*lld -> %-*lld : %-8d |", width, low, width, high, val); + print_stars(val, val_max, stars); + printf("|\n"); + } +} + +void print_linear_hist(unsigned int *vals, int vals_size, unsigned int base, + unsigned int step, const char *val_type) { + int i, stars_max = 40, idx_min = -1, idx_max = -1; + unsigned int val, val_max = 0; + + for (i = 0; i < vals_size; i++) { + val = vals[i]; + if (val > 0) { + idx_max = i; + if (idx_min < 0) + idx_min = i; + } + if (val > val_max) + val_max = val; + } + + if (idx_max < 0) + return; + + printf(" %-13s : count distribution\n", val_type); + for (i = idx_min; i <= idx_max; i++) { + val = vals[i]; + if (!val) + continue; + printf(" %-10d : %-8d |", base + i * step, val); + print_stars(val, val_max, stars_max); + printf("|\n"); + } +} + +unsigned long long get_ktime_ns(void) { + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec; +} + +/* + * Returns true if the first field of some line in /proc/modules equals `name`; + * false otherwise, including when /proc/modules cannot be opened. + */ +bool is_kernel_module(const char *name) { + bool found = false; + char mod_name[64]; + FILE *f; + + f = fopen("/proc/modules", "r"); + if (!f) + return false; + + /* + * Read the first token of each line into its own buffer and discard the + * rest of the line. Scanning a buffer into itself is undefined behaviour, + * and a fixed-size fgets() would split long lines into bogus "names". + */ + while (fscanf(f, "%63s%*[^\n]", mod_name) == 1) { + if (!strcmp(mod_name, name)) { + found = true; + break; + } + } + + fclose(f); + return found; +} + +static bool fentry_try_attach(int id) { + int prog_fd, attach_fd; + char error[4096]; + struct bpf_insn insns[] = { + {.code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 0}, + {.code = BPF_JMP | BPF_EXIT}, + }; +
LIBBPF_OPTS(bpf_prog_load_opts, opts, + .expected_attach_type = BPF_TRACE_FENTRY, .attach_btf_id = id, + .log_buf = error, .log_size = sizeof(error), ); + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACING, "test", "GPL", insns, + sizeof(insns) / sizeof(struct bpf_insn), &opts); + if (prog_fd < 0) + return false; + + attach_fd = bpf_raw_tracepoint_open(NULL, prog_fd); + if (attach_fd >= 0) + close(attach_fd); + + close(prog_fd); + return attach_fd >= 0; +} + +bool fentry_can_attach(const char *name, const char *mod) { + struct btf *btf, *vmlinux_btf, *module_btf = NULL; + int err, id; + + vmlinux_btf = btf__load_vmlinux_btf(); + err = libbpf_get_error(vmlinux_btf); + if (err) + return false; + + btf = vmlinux_btf; + + if (mod) { + module_btf = btf__load_module_btf(mod, vmlinux_btf); + err = libbpf_get_error(module_btf); + if (!err) { + btf = module_btf; + } + } + + id = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC); + + btf__free(module_btf); + btf__free(vmlinux_btf); + return id > 0 && fentry_try_attach(id); +} + +#define DEBUGFS "/sys/kernel/debug/tracing" +#define TRACEFS "/sys/kernel/tracing" + +static bool use_debugfs(void) { + static int has_debugfs = -1; + + if (has_debugfs < 0) + has_debugfs = faccessat(AT_FDCWD, DEBUGFS, F_OK, AT_EACCESS) == 0; + + return has_debugfs == 1; +} + +static const char *tracefs_path(void) { + return use_debugfs() ? DEBUGFS : TRACEFS; +} + +static const char *tracefs_available_filter_functions(void) { + return use_debugfs() ? 
DEBUGFS "/available_filter_functions" + : TRACEFS "/available_filter_functions"; +} + +bool kprobe_exists(const char *name) { + char addr_range[256]; + char sym_name[256]; + FILE *f; + int ret; + + f = fopen("/sys/kernel/debug/kprobes/blacklist", "r"); + if (!f) + goto avail_filter; + + while (true) { + ret = fscanf(f, "%s %s%*[^\n]\n", addr_range, sym_name); + if (ret == EOF && feof(f)) + break; + if (ret != 2) { + fprintf(stderr, "failed to read symbol from kprobe blacklist\n"); + break; + } + if (!strcmp(name, sym_name)) { + fclose(f); + return false; + } + } + fclose(f); + +avail_filter: + f = fopen(tracefs_available_filter_functions(), "r"); + if (!f) + goto slow_path; + + while (true) { + ret = fscanf(f, "%s%*[^\n]\n", sym_name); + if (ret == EOF && feof(f)) + break; + if (ret != 1) { + fprintf(stderr, + "failed to read symbol from available_filter_functions\n"); + break; + } + if (!strcmp(name, sym_name)) { + fclose(f); + return true; + } + } + + fclose(f); + return false; + +slow_path: + f = fopen("/proc/kallsyms", "r"); + if (!f) + return false; + + while (true) { + ret = fscanf(f, "%*x %*c %s%*[^\n]\n", sym_name); + if (ret == EOF && feof(f)) + break; + if (ret != 1) { + fprintf(stderr, "failed to read symbol from kallsyms\n"); + break; + } + if (!strcmp(name, sym_name)) { + fclose(f); + return true; + } + } + + fclose(f); + return false; +} + +bool tracepoint_exists(const char *category, const char *event) { + char path[PATH_MAX]; + + snprintf(path, sizeof(path), "%s/events/%s/%s/format", tracefs_path(), + category, event); + if (!access(path, F_OK)) + return true; + return false; +} + +bool vmlinux_btf_exists(void) { + struct btf *btf; + int err; + + btf = btf__load_vmlinux_btf(); + err = libbpf_get_error(btf); + if (err) + return false; + + btf__free(btf); + return true; +} + +bool module_btf_exists(const char *mod) { + char sysfs_mod[80]; + + if (mod) { + snprintf(sysfs_mod, sizeof(sysfs_mod), "/sys/kernel/btf/%s", mod); + if (!access(sysfs_mod, 
R_OK)) + return true; + } + return false; +} + +bool probe_tp_btf(const char *name) { + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .expected_attach_type = BPF_TRACE_RAW_TP); + struct bpf_insn insns[] = { + {.code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 0}, + {.code = BPF_JMP | BPF_EXIT}, + }; + int fd, insn_cnt = sizeof(insns) / sizeof(struct bpf_insn); + + opts.attach_btf_id = libbpf_find_vmlinux_btf_id(name, BPF_TRACE_RAW_TP); + fd = bpf_prog_load(BPF_PROG_TYPE_TRACING, NULL, "GPL", insns, insn_cnt, + &opts); + if (fd >= 0) + close(fd); + return fd >= 0; +} + +bool probe_ringbuf() { + int map_fd; + + map_fd = + bpf_map_create(BPF_MAP_TYPE_RINGBUF, NULL, 0, 0, getpagesize(), NULL); + if (map_fd < 0) + return false; + + close(map_fd); + return true; +} diff --git a/eBPF_Supermarket/kvm_watcher/src/helpers/uprobe_helpers.c b/eBPF_Supermarket/kvm_watcher/src/helpers/uprobe_helpers.c new file mode 100644 index 000000000..5cf6d1743 --- /dev/null +++ b/eBPF_Supermarket/kvm_watcher/src/helpers/uprobe_helpers.c @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2021 Google LLC. */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define warn(...) fprintf(stderr, __VA_ARGS__) + +/* + * Returns 0 on success; -1 on failure. On success, returns via `path` the full + * path to the program for pid.
+ */
+int get_pid_binary_path(pid_t pid, char *path, size_t path_sz) {
+    ssize_t ret;
+    char proc_pid_exe[32];
+
+    if (snprintf(proc_pid_exe, sizeof(proc_pid_exe), "/proc/%d/exe", pid) >=
+        sizeof(proc_pid_exe)) {
+        warn("snprintf /proc/PID/exe failed");
+        return -1;
+    }
+    ret = readlink(proc_pid_exe, path, path_sz);
+    if (ret < 0) {
+        warn("No such pid %d\n", pid);
+        return -1;
+    }
+    /* readlink() does not NUL-terminate; keep one byte free for the '\0' */
+    if (ret >= path_sz) {
+        warn("readlink truncation");
+        return -1;
+    }
+    path[ret] = '\0';
+
+    return 0;
+}
+
+/*
+ * Returns 0 on success; -1 on failure. On success, returns via `path` the full
+ * path to a library matching the name `lib` that is loaded into pid's address
+ * space.
+ *
+ * A mapping matches when its file name is "lib" + `lib` followed by '.' or
+ * '-'. `path_sz` is the capacity of `path`; a match that does not fit is
+ * reported as a failure.
+ */
+int get_pid_lib_path(pid_t pid, const char *lib, char *path, size_t path_sz) {
+    FILE *maps;
+    char *p;
+    char proc_pid_maps[32];
+    char line_buf[1024];
+    char path_buf[1024];
+
+    if (snprintf(proc_pid_maps, sizeof(proc_pid_maps), "/proc/%d/maps", pid) >=
+        sizeof(proc_pid_maps)) {
+        warn("snprintf /proc/PID/maps failed");
+        return -1;
+    }
+    maps = fopen(proc_pid_maps, "r");
+    if (!maps) {
+        warn("No such pid %d\n", pid);
+        return -1;
+    }
+    while (fgets(line_buf, sizeof(line_buf), maps)) {
+        if (sscanf(line_buf, "%*x-%*x %*s %*x %*s %*u %s", path_buf) != 1)
+            continue;
+        /* e.g. /usr/lib/x86_64-linux-gnu/libc-2.31.so */
+        p = strrchr(path_buf, '/');
+        if (!p)
+            continue;
+        if (strncmp(p, "/lib", 4))
+            continue;
+        p += 4;
+        if (strncmp(lib, p, strlen(lib)))
+            continue;
+        p += strlen(lib);
+        /* libraries can have - or . after the name */
+        if (*p != '.' && *p != '-')
+            continue;
+        if (strnlen(path_buf, 1024) >= path_sz) {
+            warn("path size too small\n");
+            /* every exit after fopen() must release the stream */
+            fclose(maps);
+            return -1;
+        }
+        strcpy(path, path_buf);
+        fclose(maps);
+        return 0;
+    }
+
+    warn("Cannot find library %s\n", lib);
+    fclose(maps);
+    return -1;
+}
+
+/*
+ * Returns 0 on success; -1 on failure. On success, returns via `path` the full
+ * path to the program.
+ */
+static int which_program(const char *prog, char *path, size_t path_sz) {
+    FILE *which;
+    char cmd[100];
+
+    /*
+     * NOTE(review): `prog` is interpolated into a shell command unescaped, so
+     * callers must only pass trusted program names.
+     */
+    if (snprintf(cmd, sizeof(cmd), "which %s", prog) >= sizeof(cmd)) {
+        warn("snprintf which prog failed");
+        return -1;
+    }
+    which = popen(cmd, "r");
+    if (!which) {
+        warn("which failed");
+        return -1;
+    }
+    if (!fgets(path, path_sz, which)) {
+        warn("fgets which failed");
+        pclose(which);
+        return -1;
+    }
+    /*
+     * which normally ends its output with a \n; cut the string there. If the
+     * output was truncated to path_sz there is no \n and nothing is cut.
+     */
+    path[strcspn(path, "\n")] = '\0';
+    pclose(which);
+    return 0;
+}
+
+/*
+ * Returns 0 on success; -1 on failure. On success, returns via `path` the full
+ * path to the binary for the given pid.
+ * 1) pid == x, binary == "" : returns the path to x's program
+ * 2) pid == x, binary == "foo" : returns the path to libfoo linked in x
+ * 3) pid == 0, binary == "" : failure: need a pid or a binary
+ * 4) pid == 0, binary == "bar" : returns the path to `which bar`
+ *
+ * For case 4), ideally we'd like to search for libbar too, but we don't support
+ * that yet.
+ */
+int resolve_binary_path(const char *binary, pid_t pid, char *path,
+                        size_t path_sz) {
+    if (!strcmp(binary, "")) {
+        if (!pid) {
+            warn("Uprobes need a pid or a binary\n");
+            return -1;
+        }
+        return get_pid_binary_path(pid, path, path_sz);
+    }
+    if (pid)
+        return get_pid_lib_path(pid, binary, path, path_sz);
+
+    if (which_program(binary, path, path_sz)) {
+        /*
+         * If the user is tracing a program by name, we can find it.
+         * But we can't find a library by name yet. We'd need to parse
+         * ld.so.cache or something similar.
+         */
+        warn("Can't find %s (Need a PID if this is a library)\n", binary);
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Opens an elf at `path` of kind ELF_K_ELF. Returns NULL on failure. On
+ * success, close with close_elf(e, fd_close).
+ */
+Elf *open_elf(const char *path, int *fd_close) {
+    int fd;
+    Elf *e;
+
+    if (elf_version(EV_CURRENT) == EV_NONE) {
+        warn("elf init failed\n");
+        return NULL;
+    }
+    fd = open(path, O_RDONLY);
+    if (fd < 0) {
+        warn("Could not open %s\n", path);
+        return NULL;
+    }
+    e = elf_begin(fd, ELF_C_READ, NULL);
+    if (!e) {
+        warn("elf_begin failed: %s\n", elf_errmsg(-1));
+        close(fd);
+        return NULL;
+    }
+    if (elf_kind(e) != ELF_K_ELF) {
+        warn("elf kind %d is not ELF_K_ELF\n", elf_kind(e));
+        elf_end(e);
+        close(fd);
+        return NULL;
+    }
+    /* *fd_close is written only on success */
+    *fd_close = fd;
+    return e;
+}
+
+/*
+ * Same as open_elf() but for an already-open descriptor. Returns NULL on
+ * failure. Note the asymmetry: `fd` is closed here when elf_begin() or the
+ * kind check fails, but is left open when elf_version() fails.
+ */
+Elf *open_elf_by_fd(int fd) {
+    Elf *e;
+
+    if (elf_version(EV_CURRENT) == EV_NONE) {
+        warn("elf init failed\n");
+        return NULL;
+    }
+    e = elf_begin(fd, ELF_C_READ, NULL);
+    if (!e) {
+        warn("elf_begin failed: %s\n", elf_errmsg(-1));
+        close(fd);
+        return NULL;
+    }
+    if (elf_kind(e) != ELF_K_ELF) {
+        warn("elf kind %d is not ELF_K_ELF\n", elf_kind(e));
+        elf_end(e);
+        close(fd);
+        return NULL;
+    }
+    return e;
+}
+
+/* Ends the Elf handle `e` and closes the descriptor `fd_close`. */
+void close_elf(Elf *e, int fd_close) {
+    elf_end(e);
+    close(fd_close);
+}
+
+/*
+ * Returns the offset of a function in the elf file `path`, or -1 on failure.
+ *
+ * The symbol is looked up among the STT_FUNC entries of SHT_SYMTAB and
+ * SHT_DYNSYM. For ET_EXEC and ET_DYN files its value is then translated to a
+ * file offset through the executable PT_LOAD segment that contains it.
+ */
+off_t get_elf_func_offset(const char *path, const char *func) {
+    off_t ret = -1;
+    int i, fd = -1;
+    Elf *e;
+    Elf_Scn *scn;
+    Elf_Data *data;
+    GElf_Ehdr ehdr;
+    GElf_Shdr shdr[1];
+    GElf_Phdr phdr;
+    GElf_Sym sym[1];
+    size_t shstrndx, nhdrs;
+    char *n;
+
+    e = open_elf(path, &fd);
+    /* open_elf() already warned; there is no handle or descriptor to release */
+    if (!e)
+        return -1;
+
+    if (!gelf_getehdr(e, &ehdr))
+        goto out;
+
+    if (elf_getshdrstrndx(e, &shstrndx) != 0)
+        goto out;
+
+    scn = NULL;
+    while ((scn = elf_nextscn(e, scn))) {
+        if (!gelf_getshdr(scn, shdr))
+            continue;
+        if (!(shdr->sh_type == SHT_SYMTAB || shdr->sh_type == SHT_DYNSYM))
+            continue;
+        data = NULL;
+        while ((data = elf_getdata(scn, data))) {
+            for (i = 0; gelf_getsym(data, i, sym); i++) {
+                n = elf_strptr(e, shdr->sh_link, sym->st_name);
+                if (!n)
+                    continue;
+                if (GELF_ST_TYPE(sym->st_info) != STT_FUNC)
+                    continue;
+                if (!strcmp(n, func)) {
+                    ret = sym->st_value;
+                    goto check;
+                }
+            }
+        }
+    }
+
+check:
+    /* also reached with ret == -1 when the symbol was not found */
+    if (ehdr.e_type == ET_EXEC || ehdr.e_type == ET_DYN) {
+        if (elf_getphdrnum(e, &nhdrs) != 0) {
+            ret = -1;
+            goto out;
+        }
+        for (i = 0; i < (int)nhdrs; i++) {
+            if (!gelf_getphdr(e, i, &phdr))
+                continue;
+            if (phdr.p_type != PT_LOAD || !(phdr.p_flags & PF_X))
+                continue;
+            if (phdr.p_vaddr <= ret && ret < (phdr.p_vaddr + phdr.p_memsz)) {
+                ret = ret - phdr.p_vaddr + phdr.p_offset;
+                goto out;
+            }
+        }
+        ret = -1;
+    }
+out:
+    close_elf(e, fd);
+    return ret;
+}
diff --git a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.bpf.c b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.bpf.c index 23b6d87f3..54352b8de 100644 --- a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.bpf.c +++ b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.bpf.c @@ -16,17 +16,17 @@ // // Kernel space BPF program used for monitoring data for KVM event.
-#include "../include/vmlinux.h" +#include "vmlinux.h" #include #include #include -#include "../include/kvm_watcher.h" -#include "../include/kvm_exits.h" -#include "../include/kvm_ioctl.h" -#include "../include/kvm_vcpu.h" -#include "../include/kvm_mmu.h" -#include "../include/kvm_irq.h" -#include "../include/kvm_hypercall.h" +#include "common.h" +#include "kvm_exits.h" +#include "kvm_ioctl.h" +#include "kvm_vcpu.h" +#include "kvm_mmu.h" +#include "kvm_irq.h" +#include "kvm_hypercall.h" char LICENSE[] SEC("license") = "Dual BSD/GPL"; @@ -69,18 +69,20 @@ int tp_entry(struct exit *ctx) { } // 记录VCPU调度的信息--进入 SEC("fentry/vmx_vcpu_load") -int BPF_PROG(kp_vmx_vcpu_load, struct kvm_vcpu *vcpu, int cpu) { +int BPF_PROG(fentry_vmx_vcpu_load, struct kvm_vcpu *vcpu, int cpu) { CHECK_PID(vm_pid); return trace_vmx_vcpu_load(vcpu, cpu); } + // 记录VCPU调度的信息--退出 SEC("fentry/vmx_vcpu_put") -int BPF_PROG(kp_vmx_vcpu_put, struct kvm_vcpu *vcpu) { +int BPF_PROG(fentry_vmx_vcpu_put) { return trace_vmx_vcpu_put(); } + SEC("fentry/mark_page_dirty_in_slot") -int BPF_PROG(kp_mark_page_dirty_in_slot, struct kvm *kvm, - const struct kvm_memory_slot *memslot, gfn_t gfn) { +int BPF_PROG(fentry_mark_page_dirty_in_slot, struct kvm *kvm, + const struct kvm_memory_slot *memslot, gfn_t gfn) { CHECK_PID(vm_pid); return trace_mark_page_dirty_in_slot(kvm, memslot, gfn, &rb, e); } diff --git a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c index 142a2d7fc..ab9260223 100644 --- a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c +++ b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c @@ -27,7 +27,9 @@ #include #include #include -#include "../include/kvm_watcher.h" +#include "common.h" +#include "trace_helpers.h" +#include "uprobe_helpers.h" #include "kvm_watcher.skel.h" // 创建并打开临时文件 @@ -795,9 +797,9 @@ static int print_event_head(struct env *env) { static void set_disable_load(struct kvm_watcher_bpf *skel) { bpf_program__set_autoload(skel->progs.tp_vcpu_wakeup, 
env.execute_vcpu_wakeup ? true : false); - bpf_program__set_autoload(skel->progs.kp_vmx_vcpu_load, + bpf_program__set_autoload(skel->progs.fentry_vmx_vcpu_load, env.execute_vcpu_load ? true : false); - bpf_program__set_autoload(skel->progs.kp_vmx_vcpu_put, + bpf_program__set_autoload(skel->progs.fentry_vmx_vcpu_put, env.execute_vcpu_load ? true : false); bpf_program__set_autoload(skel->progs.fentry_kvm_vcpu_halt, env.execute_vcpu_wakeup ? true : false); @@ -811,7 +813,7 @@ static void set_disable_load(struct kvm_watcher_bpf *skel) { env.execute_exit ? true : false); bpf_program__set_autoload(skel->progs.tp_kvm_halt_poll_ns, env.execute_halt_poll_ns ? true : false); - bpf_program__set_autoload(skel->progs.kp_mark_page_dirty_in_slot, + bpf_program__set_autoload(skel->progs.fentry_mark_page_dirty_in_slot, env.execute_mark_page_dirty ? true : false); bpf_program__set_autoload(skel->progs.tp_page_fault, env.execute_page_fault ? true : false); @@ -1131,7 +1133,7 @@ void print_map_and_check_error(int (*print_func)(struct kvm_watcher_bpf *), const char *map_name, int err) { OUTPUT_INTERVAL(2); print_func(skel); - if (err < 0) { + if (err < 0 && err != -4) { printf("Error printing %s map: %d\n", map_name, err); } } From ca9488a3fc3697e4cf2363ea254b045cec75ee3b Mon Sep 17 00:00:00 2001 From: Y_y_s <78297703+Monkey857@users.noreply.github.com> Date: Fri, 10 May 2024 19:54:50 +0800 Subject: [PATCH 2/5] =?UTF-8?q?kvm=5Fwatcher:=E4=BC=98=E5=8C=96=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=E5=B9=B6=E5=AE=8C=E5=96=84=E8=AF=B4=E6=98=8E=E6=96=87?= =?UTF-8?q?=E6=A1=A3=20=20(#787)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add kvmexit watcher * Update kvmexit.py * VMexit * 修改action * 修改action * 修改makefile * update makefile * update yml * 调整代码格式 * vm exit * 优化代码 * modify sort_bug * modify sort_bug * add fun * vcpu_load * optimize code * modify sortFun * Revert "modify sortFun" This reverts commit 3bb005825a3f35f88e0d8d2b5d798ca094be6076. 
* fix bug --- eBPF_Supermarket/kvm_watcher/docs/kvm_vcpu.md | 39 +++- .../kvm_watcher/src/kvm_watcher.c | 172 ++++++++---------- 2 files changed, 116 insertions(+), 95 deletions(-) diff --git a/eBPF_Supermarket/kvm_watcher/docs/kvm_vcpu.md b/eBPF_Supermarket/kvm_watcher/docs/kvm_vcpu.md index e8980e773..59eec0e20 100644 --- a/eBPF_Supermarket/kvm_watcher/docs/kvm_vcpu.md +++ b/eBPF_Supermarket/kvm_watcher/docs/kvm_vcpu.md @@ -2,7 +2,7 @@ ## 概述 -kvm watcher中的kvm vcpu子功能模块是设计用于监控和分析虚拟化环境中虚拟 CPU (VCPU) 活动的工具,它通过精确记录 VCPU 的唤醒/挂起事件、halt poll 时间的变化,以及 KVM 虚拟机在热迁移过程中产生的脏页信息,为优化虚拟机性能、提高系统响应速度、提供数据支持。 +kvm watcher中的kvm vcpu子功能模块是设计用于监控和分析虚拟化环境中虚拟 CPU (VCPU) 活动的工具,它通过精确记录 VCPU 的唤醒/挂起事件、halt poll 时间的变化,虚拟cpu进调度和出调度的时间记录,以及 KVM 虚拟机在热迁移过程中产生的脏页信息,为优化虚拟机性能、提高系统响应速度、提供数据支持。 ## 原理介绍 @@ -22,6 +22,9 @@ KVM 暂停轮询系统是 KVM 内的一项功能,其在某些情况下可以 在虚拟化环境中,脏页指的是自上次同步以来已经被修改的内存页。特别是在虚拟机热迁移过程中,源虚拟机上的内存页在复制到目标虚拟机的同时仍然处于活动状态,任何在此过程中对这些页的修改都会导致脏页的产生。监控这些脏页对于优化热迁移过程至关重要,因为过多的脏页生成可能会导致迁移延迟,甚至影响到虚拟机的运行性能。此监控功能特别适用于虚拟机热迁移的场景,其中脏页的精确监控和管理可以显著优化迁移过程。 +### load_vcpu + +加载虚拟 CPU(vCPU)是虚拟化环境中的一个重要步骤,它涉及为虚拟机创建和配置虚拟 CPU 实例,并将其加载到虚拟化平台中的运行环境中。加载 vCPU 将为客户机提供计算资源,允许其执行操作系统和应用程序代码。每个 vCPU 实例代表了一个独立的处理器核心,可以并行执行客户机指令。虚拟化环境中的多个 vCPU 可以并发执行客户机代码,从而支持虚拟机中的多任务和并发执行。加载多个 vCPU 允许虚拟机同时运行多个线程或进程。通过合理地分配物理资源并加载 vCPU,KVM可以优化虚拟机的性能和资源利用率。 ## 挂载点 ### wakeup @@ -43,6 +46,12 @@ KVM 暂停轮询系统是 KVM 内的一项功能,其在某些情况下可以 | :----- | :---------------------- | | kprobe | mark_page_dirty_in_slot | +### load_vcpu + +| 类型 | 名称 | +| :----- | :----------------------| +| kprobe | vmx_vcpu_load | +| kprobe | vmx_vcpu_put | ## 示例输出 ### wakeup @@ -128,7 +137,22 @@ PID GFN REL_GFN SLOT_ID COUNTS 630632 f0122 122 3 61 .... 
``` +### load_vcpu +``` +sudo ./kvm_watcher -o +TIME:15:23:05 +pid tid total_time max_time min_time counts vcpuid pcpuid +------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +49214 49224 1.2344 0.3670 0.0346 9 1 2 +49214 49225 0.5978 0.2171 0.1820 3 2 0 +49062 49072 2.4008 0.2656 0.0806 14 0 4 +49214 49223 4.5310 0.2794 0.0217 66 0 7 +54504 54514 6.9243 0.8872 0.0235 34 1 10 +49145 49166 2.8076 1.1653 0.0689 10 1 9 +54504 54513 3.8860 0.3099 0.0210 30 0 0 +49145 49165 1.4737 0.4106 0.0690 8 0 12 +``` ## 参数介绍 ### dirty page @@ -140,4 +164,15 @@ PID GFN REL_GFN SLOT_ID COUNTS - **REL_GFN**: 相对于 GFN 的偏移量。 - **NPAGES**: 内存槽的页面数量。 - **USERSPACE_ADDR**: 触发脏页的用户空间地址。 -- **SLOT_ID**: 内存插槽标识,指示哪个内存区域包含了脏页。 \ No newline at end of file +- **SLOT_ID**: 内存插槽标识,指示哪个内存区域包含了脏页。 + +### load_vcpu + +- **TIME(ms)**: 事件发生的时间,以毫秒为单位。 +- **PID/TID**: 进程或线程的标识符。 +- **total_time**: 2秒内调度vcpu的总时间。 +- **max_time**: 2秒内调度vcpu的最大时间。 +- **min_time**: 2秒内调度vcpu的最小时间。 +- **counts**: 2秒内调度vcpu的次数。 +- **vcpuid**: 虚拟CPU号。 +- **pcpuid**: vCPU对应绑定的物理CPU号。 \ No newline at end of file diff --git a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c index ab9260223..cc931f9c3 100644 --- a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c +++ b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c @@ -864,6 +864,80 @@ const char *getCurrentTimeFormatted() { return ts; // 返回指向静态字符串的指针 } +// In order to sort vm_exit maps +int sort_by_key(int fd, struct exit_key *keys, struct exit_value *values) { + int err = 0; + struct exit_key lookup_key = {}; + struct exit_key next_key = {}; + struct exit_value exit_value; + int first = 1; + int i = 0, j; + int count = 0; + while (!bpf_map_get_next_key(fd, &lookup_key, &next_key)) { + count++; + if (first) { + first = 0; + bpf_map_lookup_elem(fd, &next_key, &exit_value); + keys[0] = next_key; + values[0] = exit_value; + i++; + lookup_key = next_key; + continue; + } + err = 
bpf_map_lookup_elem(fd, &next_key, &exit_value); + if (err < 0) { + fprintf(stderr, "failed to lookup exit_value: %d\n", err); + return -1; + } + // insert sort + j = i - 1; + struct exit_key temp_key = next_key; + struct exit_value temp_value = exit_value; + while (j >= 0 && + (keys[j].pid > temp_key.pid || (keys[j].tid > temp_key.tid))) { + keys[j + 1] = keys[j]; + values[j + 1] = values[j]; + j--; + } + i++; + keys[j + 1] = next_key; + values[j + 1] = temp_value; + // Move to the next key + lookup_key = next_key; + } + return count; +} + +// clear the specific map +int clear_map(void *lookup_key, void *next_key, enum EventType type, int fd) { + int err; + switch (type) { + case HYPERCALL: + memset(lookup_key, 0, sizeof(struct hc_key)); + break; + case TIMER: + memset(lookup_key, 0, sizeof(struct timer_key)); + break; + case VCPU_LOAD: + memset(lookup_key, 0, sizeof(struct load_key)); + break; + case EXIT: + memset(lookup_key, 0, sizeof(struct exit_key)); + break; + default: + return -1; + } + while (!bpf_map_get_next_key(fd, lookup_key, next_key)) { + err = bpf_map_delete_elem(fd, next_key); + if (err < 0) { + fprintf(stderr, "failed to cleanup map: %d\n", err); + return -1; + } + lookup_key = next_key; + } + return 1; +} + int print_hc_map(struct kvm_watcher_bpf *skel) { int fd = bpf_map__fd(skel->maps.hc_map); int count_fd = bpf_map__fd(skel->maps.hc_count); @@ -896,24 +970,8 @@ int print_hc_map(struct kvm_watcher_bpf *skel) { // // Move to the next key lookup_key = next_key; } - memset(&lookup_key, 0, sizeof(struct hc_key)); - while (!bpf_map_get_next_key(fd, &lookup_key, &next_key)) { - err = bpf_map_delete_elem(fd, &next_key); - if (err < 0) { - fprintf(stderr, "failed to cleanup hc_map: %d\n", err); - return -1; - } - lookup_key = next_key; - } - memset(&lookup_key, 0, sizeof(struct hc_key)); - while (!bpf_map_get_next_key(count_fd, &lookup_key, &next_key)) { - err = bpf_map_delete_elem(count_fd, &next_key); - if (err < 0) { - fprintf(stderr, "failed to 
cleanup hc_count: %d\n", err); - return -1; - } - lookup_key = next_key; - } + clear_map(&lookup_key, &next_key, HYPERCALL, fd); + clear_map(&lookup_key, &next_key, HYPERCALL, count_fd); return 0; } @@ -948,64 +1006,10 @@ int print_timer_map(struct kvm_watcher_bpf *skel) { // Move to the next key lookup_key = next_key; } - - // Clear the timer_map - memset(&lookup_key, 0, sizeof(struct timer_key)); - while (!bpf_map_get_next_key(fd, &lookup_key, &next_key)) { - err = bpf_map_delete_elem(fd, &next_key); - if (err < 0) { - fprintf(stderr, "failed to cleanup timer_map: %d\n", err); - return -1; - } - lookup_key = next_key; - } + clear_map(&lookup_key, &next_key, TIMER, fd); return 0; } -// In order to sort vm_exit maps -int sort_by_key(int fd, struct exit_key *keys, struct exit_value *values) { - int err = 0; - struct exit_key lookup_key = {}; - struct exit_key next_key = {}; - struct exit_value exit_value; - int first = 1; - int i = 0, j; - int count = 0; - while (!bpf_map_get_next_key(fd, &lookup_key, &next_key)) { - count++; - if (first) { - first = 0; - bpf_map_lookup_elem(fd, &next_key, &exit_value); - keys[0] = next_key; - values[0] = exit_value; - i++; - lookup_key = next_key; - continue; - } - err = bpf_map_lookup_elem(fd, &next_key, &exit_value); - if (err < 0) { - fprintf(stderr, "failed to lookup exit_value: %d\n", err); - return -1; - } - // insert sort - j = i - 1; - struct exit_key temp_key = next_key; - struct exit_value temp_value = exit_value; - while (j >= 0 && - (keys[j].pid > temp_key.pid || (keys[j].tid > temp_key.tid))) { - keys[j + 1] = keys[j]; - values[j + 1] = values[j]; - j--; - } - i++; - keys[j + 1] = next_key; - values[j + 1] = temp_value; - // Move to the next key - lookup_key = next_key; - } - return count; -} - int print_vcpu_load_map(struct kvm_watcher_bpf *skel) { int fd = bpf_map__fd(skel->maps.load_map); int err; @@ -1040,16 +1044,7 @@ int print_vcpu_load_map(struct kvm_watcher_bpf *skel) { load_value.vcpu_id, load_value.pcpu_id); 
lookup_key = next_key; } - // clear the maps - memset(&lookup_key, 0, sizeof(struct load_key)); - while (!bpf_map_get_next_key(fd, &lookup_key, &next_key)) { - err = bpf_map_delete_elem(fd, &next_key); - if (err < 0) { - fprintf(stderr, "failed to cleanup counters: %d\n", err); - return -1; - } - lookup_key = next_key; - } + clear_map(&lookup_key, &next_key, VCPU_LOAD, fd); return 0; } @@ -1109,16 +1104,7 @@ void __print_exit_map(int fd, enum NameType name_type) { getName(keys[i].reason, name_type)); } } - // clear the maps - memset(&lookup_key, 0, sizeof(struct exit_key)); - while (!bpf_map_get_next_key(fd, &lookup_key, &next_key)) { - int err = bpf_map_delete_elem(fd, &next_key); - if (err < 0) { - fprintf(stderr, "failed to cleanup counters: %d\n", err); - return; - } - lookup_key = next_key; - } + clear_map(&lookup_key, &next_key, EXIT, fd); } int print_exit_map(struct kvm_watcher_bpf *skel) { int exit_fd = bpf_map__fd(skel->maps.exit_map); From 214cecfaa75fb607a213b5cccca4f7b836166287 Mon Sep 17 00:00:00 2001 From: vvzxy <145555693+vvzxy@users.noreply.github.com> Date: Fri, 10 May 2024 20:08:57 +0800 Subject: [PATCH 3/5] =?UTF-8?q?cpu=5Fwatcher=EF=BC=9A=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E9=83=A8=E5=88=86warning&schedule=5Fdelay=E5=8A=9F=E8=83=BD?= =?UTF-8?q?=E6=89=A9=E5=85=85=20(#784)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 添加第三方库 * 添加工具说明文档 * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * cpu_watcher:修改部分warning&schedule_delay功能扩充 * 输出净化 --- .../cpu_watcher/bpf/schedule_delay.bpf.c | 20 ++- .../CPU_Subsystem/cpu_watcher/cpu_watcher.c | 77 +++++------ .../cpu_watcher/include/cpu_watcher.h | 127 +++++++++--------- 3 files changed, 110 insertions(+), 114 deletions(-) diff --git a/eBPF_Supermarket/CPU_Subsystem/cpu_watcher/bpf/schedule_delay.bpf.c b/eBPF_Supermarket/CPU_Subsystem/cpu_watcher/bpf/schedule_delay.bpf.c index 036d1178c..a51e52459 100644 --- 
a/eBPF_Supermarket/CPU_Subsystem/cpu_watcher/bpf/schedule_delay.bpf.c +++ b/eBPF_Supermarket/CPU_Subsystem/cpu_watcher/bpf/schedule_delay.bpf.c @@ -127,18 +127,30 @@ int BPF_PROG(sched_switch, bool preempt, struct task_struct *prev, struct task_s struct sum_schedule sum_schedule= {}; sum_schedule.sum_count++; sum_schedule.sum_delay += delay; - if (delay > sum_schedule.max_delay) + if (delay > sum_schedule.max_delay){ sum_schedule.max_delay = delay; - if (sum_schedule.min_delay == 0 || delay < sum_schedule.min_delay) + if(next->pid!=0){ + sum_schedule.pid_max = next->pid; + } + }else if (sum_schedule.min_delay == 0 || delay < sum_schedule.min_delay) sum_schedule.min_delay = delay; + if(next->pid!=0){ + sum_schedule.pid_min = next->pid; + } bpf_map_update_elem(&sys_schedule, &key, &sum_schedule, BPF_ANY); } else { sum_schedule->sum_count++; sum_schedule->sum_delay += delay; - if (delay > sum_schedule->max_delay) + if (delay > sum_schedule->max_delay){ sum_schedule->max_delay = delay; - if (sum_schedule->min_delay == 0 || delay < sum_schedule->min_delay) + if(next->pid!=0){ + sum_schedule->pid_max = next->pid; + } + }else if (sum_schedule->min_delay == 0 || delay < sum_schedule->min_delay) sum_schedule->min_delay = delay; + if(next->pid!=0){ + sum_schedule->pid_min = next->pid; + } } return 0; } diff --git a/eBPF_Supermarket/CPU_Subsystem/cpu_watcher/cpu_watcher.c b/eBPF_Supermarket/CPU_Subsystem/cpu_watcher/cpu_watcher.c index 8d768fc9c..fa735c39e 100644 --- a/eBPF_Supermarket/CPU_Subsystem/cpu_watcher/cpu_watcher.c +++ b/eBPF_Supermarket/CPU_Subsystem/cpu_watcher/cpu_watcher.c @@ -78,19 +78,20 @@ struct preempt_bpf *preempt_skel; struct schedule_delay_bpf *sd_skel; struct mq_delay_bpf *mq_skel; -u64 softirq = 0;//初始化softirq; -u64 irqtime = 0;//初始化irq; -u64 idle = 0;//初始化idle;s +u64 softirq = 0; +u64 irqtime = 0; +u64 idle = 0; u64 sched = 0; u64 proc = 0; unsigned long ktTime = 0; unsigned long utTime = 0; -u64 tick_user = 0;//初始化sys; +u64 tick_user = 0; int 
sc_sum_time = 0 ; int sc_max_time = 0 ; int sc_min_time = SYSCALL_MIN_TIME ; int sys_call_count = 0; +bool ifprint = 0; int preempt_count = 0 ; @@ -176,7 +177,6 @@ static int open_and_attach_perf_event(int freq, struct bpf_program *prog, .config = PERF_COUNT_SW_CPU_CLOCK, }; int i, fd; - for (i = 0; i < nr_cpus; i++) { fd = syscall(__NR_perf_event_open, &attr, -1, i, -1, 0); if (fd < 0) { @@ -196,7 +196,6 @@ static int open_and_attach_perf_event(int freq, struct bpf_program *prog, return -1; } } - return 0; } @@ -207,18 +206,14 @@ u64 find_ksym(const char* target_symbol) { perror("Failed to open /proc/kallsyms"); return 1; } - char symbol_name[99]; u64 symbol_address = 0; - while (fscanf(file, "%llx %*c %s\n", &symbol_address, symbol_name) != EOF) { if (strcmp(symbol_name, target_symbol) == 0) { break; } } - fclose(file); - return symbol_address; } @@ -236,6 +231,7 @@ static int print_all() u64 __proc; __proc = total_forks - proc; proc = total_forks; + /*cswch:*/ int key_cswch = 0; int err_cswch, fd_cswch = bpf_map__fd(sar_skel->maps.countMap); @@ -320,7 +316,7 @@ static int print_all() unsigned long dtaUT = utTime -_utTime; /*sys*/ - int key_sys = 0,next_key; + int key_sys = 0; int err_sys, fd_sys = bpf_map__fd(sar_skel->maps.tick_user); u64 __tick_user =0 ;// 用于存储从映射中查找到的值 __tick_user = tick_user; @@ -338,7 +334,7 @@ static int print_all() if(env.enable_proc){ time_t now = time(NULL); struct tm *localTime = localtime(&now); - printf("%02d:%02d:%02d %8llu %8llu %6d %8llu %10llu %8llu %10llu %8llu %8lu %8lu\n", + printf("%02d:%02d:%02d %8llu %8llu %6d %8llu %10llu %8llu %10lu %8llu %8llu %8llu\n", localTime->tm_hour, localTime->tm_min, localTime->tm_sec, __proc,__sched,runqlen,dtairqtime/1000,dtasoftirq/1000,dtaidle/1000000, dtaKT/1000,dtaSysc / 1000000,dtaUTRaw/1000000,dtaSys / 1000000); @@ -346,7 +342,6 @@ static int print_all() else{ env.enable_proc = true; } - return 0; } @@ -354,49 +349,48 @@ int count[25]={0};//定义一个count数组,用于汇总schedul()调度时间 static int 
handle_event(void *ctx, void *data,unsigned long data_sz) { const struct event *e = data; - printf("t1:%lu t2:%lu delay:%lu\n",e->t1,e->t2,e->delay); - + printf("t1:%llu t2:%llu delay:%llu\n",e->t1,e->t2,e->delay); int dly=(int)(e->delay),i=0; while (dly > 1){ dly /= 2; i ++; } - count[i]++;//记录时间间隔次数; + count[i]++; return 0; } static int print_hstgram(int i,int max,int per_len) { int cnt=count[i]; if(per_len==1){ - while(cnt>0){//打印 + while(cnt>0){ printf("*"); cnt--; } } - while(cnt-per_len>=0){//打印 + while(cnt-per_len>=0){ printf("*"); cnt-=per_len; } printf("\n"); return per_len; } -double pow(int n,int k)//实现pow函数 +double my_pow(int n,int k)//实现pow函数 { if (k > 0) - return n * pow(n, k - 1); + return n * my_pow(n, k - 1); else if (k == 0) return 1; else - return 1.0 / pow(n, -k); + return 1.0 / my_pow(n, -k); } static void histogram() { int log10[15]={0},max=0,per_len=1; - for(int i=0;i<10;i++){//log10(count[i]); + for(int i=0;i<10;i++){ int tmp=count[i],cnt=0; while (tmp >= 10){ tmp /= 10; - cnt ++;//幂次 + cnt ++; } log10[cnt]++; } @@ -406,13 +400,13 @@ static void histogram() max=i; } - while(max>0){//pow(10,max); + while(max>0){ per_len *=10 ; max--; } - time_t now = time(NULL);// 获取当前时间 - struct tm *localTime = localtime(&now);// 将时间转换为本地时间结构 + time_t now = time(NULL); + struct tm *localTime = localtime(&now); printf("\nTime : %02d:%02d:%02d \n",localTime->tm_hour, localTime->tm_min, localTime->tm_sec); printf("%-24s \t%-12s \t%-12s \n","cs_delay","Count","Distribution"); printf("%d\t=>\t%-8d \t%-12d \t|",0,1,count[0]); @@ -420,28 +414,22 @@ static void histogram() printf("%d\t=>\t%-8d \t%-12d \t|",2,3,count[1]); print_hstgram(1,max,per_len); for(int i=2;i<20;i++){ - printf("%d\t=>\t%-8d \t%-12d \t|",(int)pow(2,i),(int)pow(2,(i+1))-1,count[i]); + printf("%d\t=>\t%-8d \t%-12d \t|",(int)my_pow(2,i),(int)my_pow(2,(i+1))-1,count[i]); print_hstgram(i,max,per_len); } printf("per_len = %d\n",per_len); } -// static void max_print(){ - -// int sc_average_time = 
sc_sum_time/sys_call_count; -// printf("Average_Syscall_Time: %8d ms\n",sc_average_time); -// printf("MAX_Syscall_Time: %8d ms\n",sc_max_time); -// printf("MIN_Syscall_Time: %8d ms\n",sc_min_time); -// } static int syscall_delay_print(void *ctx, void *data,unsigned long data_sz) { const struct syscall_events *e = data; - printf("pid: %-8llu comm: %-10s syscall_id: %-8ld delay: %-8llu\n", + printf("pid: %-8u comm: %-10s syscall_id: %-8lld delay: %-8lld\n", e->pid,e->comm,e->syscall_id,e->delay); return 0; } + //抢占时间输出 static int preempt_print(void *ctx, void *data, unsigned long data_sz) { @@ -462,16 +450,19 @@ static int schedule_print(struct bpf_map *sys_fd) int hour = localTime->tm_hour; int min = localTime->tm_min; int sec = localTime->tm_sec; - unsigned long long avg_delay; - + unsigned long long avg_delay; err = bpf_map_lookup_elem(fd, &key, &info); if (err < 0) { fprintf(stderr, "failed to lookup infos: %d\n", err); return -1; } avg_delay = info.sum_delay / info.sum_count; - printf("%02d:%02d:%02d | %-15lf %-15lf %-15lf |\n", - hour, min, sec, avg_delay / 1000.0, info.max_delay / 1000.0, info.min_delay / 1000.0); + if(!ifprint){ + ifprint=1; + }else{ + printf("%02d:%02d:%02d %-15lf %-15lf %5d %15lf %10d\n", + hour, min, sec, avg_delay / 1000.0, info.max_delay / 1000.0,info.pid_max, info.min_delay / 1000.0,info.pid_min); + } return 0; } @@ -483,10 +474,10 @@ static int mq_event(void *ctx, void *data,unsigned long data_sz) printf("-----------------------------------------------------------------------------------------------------------------------\n"); const struct mq_events *e = data; - printf("Mqdes: %-8llu msg_len: %-8llu msg_prio: %-8llu\n",e->mqdes,e->msg_len,e->msg_prio); - printf("SND_PID: %-8lu SND_enter_time: %-16llu SND_exit_time: %-16llu\n", + printf("Mqdes: %-8d msg_len: %-8lu msg_prio: %-8u\n",e->mqdes,e->msg_len,e->msg_prio); + printf("SND_PID: %-8d SND_enter_time: %-16llu SND_exit_time: %-16llu\n", 
e->send_pid,e->send_enter_time,e->send_exit_time); - printf("RCV_PID: %-8lu RCV_enter_time: %-16llu RCV_exit_time: %-16llu\n", + printf("RCV_PID: %-8d RCV_enter_time: %-16llu RCV_exit_time: %-16llu\n", e->rcv_pid,e->rcv_enter_time,e->rcv_exit_time); printf("-------------------------------------------------------------------------------\n"); @@ -626,7 +617,7 @@ int main(int argc, char **argv) fprintf(stderr, "Failed to attach BPF skeleton\n"); goto schedule_cleanup; } - printf("%-8s %s\n", " TIME ", "avg_delay/μs max_delay/μs min_delay/μs"); + printf("%-8s %s\n", " TIME ", "avg_delay/μs max_delay/μs max_pid min_delay/μs min_pid"); }else if (env.SAR){ /* Load and verify BPF application */ sar_skel = sar_bpf__open(); diff --git a/eBPF_Supermarket/CPU_Subsystem/cpu_watcher/include/cpu_watcher.h b/eBPF_Supermarket/CPU_Subsystem/cpu_watcher/include/cpu_watcher.h index aaf5dfb1d..bc67e3e5b 100644 --- a/eBPF_Supermarket/CPU_Subsystem/cpu_watcher/include/cpu_watcher.h +++ b/eBPF_Supermarket/CPU_Subsystem/cpu_watcher/include/cpu_watcher.h @@ -17,11 +17,11 @@ #include #include -typedef long long unsigned int u64; +typedef unsigned long long u64; typedef unsigned int u32; -typedef __kernel_mqd_t mqd_t; +typedef __kernel_mqd_t mqd_t; #define __user -#define MAX_CPU_NR 128 +#define MAX_CPU_NR 128 #define TASK_COMM_LEN 20 #define SYSCALL_MIN_TIME 1E7 #define MAX_SYSCALL_COUNT 100 @@ -35,56 +35,49 @@ typedef __kernel_mqd_t mqd_t; /// @param type1 键的类型 /// @param type2 值的类型 /// @param MAX_ENTRIES map容量 -#define BPF_ARRAY(name, type1,type2,MAX_ENTRIES ) \ - struct \ - { \ - __uint(type, BPF_MAP_TYPE_ARRAY); \ - __uint(key_size, sizeof(type1)); \ - __uint(value_size, sizeof(type2)); \ - __uint(max_entries, MAX_ENTRIES); \ +#define BPF_ARRAY(name, type1, type2, MAX_ENTRIES) \ + struct { \ + __uint(type, BPF_MAP_TYPE_ARRAY); \ + __uint(key_size, sizeof(type1)); \ + __uint(value_size, sizeof(type2)); \ + __uint(max_entries, MAX_ENTRIES); \ } name SEC(".maps") - /// @brief 
创建一个指定名字和键值类型的ebpf散列表 /// @param name 新散列表的名字 /// @param type1 键的类型 /// @param type2 值的类型 /// @param MAX_ENTRIES 哈希map容量 -#define BPF_HASH(name, type1,type2,MAX_ENTRIES ) \ - struct \ - { \ - __uint(type, BPF_MAP_TYPE_HASH); \ - __uint(key_size, sizeof(type1)); \ - __uint(value_size, sizeof(type2)); \ - __uint(max_entries, MAX_ENTRIES); \ - } name SEC(".maps") - +#define BPF_HASH(name, type1, type2, MAX_ENTRIES) \ + struct { \ + __uint(type, BPF_MAP_TYPE_HASH); \ + __uint(key_size, sizeof(type1)); \ + __uint(value_size, sizeof(type2)); \ + __uint(max_entries, MAX_ENTRIES); \ + } name SEC(".maps") /// @brief 创建一个指定名字和键值类型的ebpf每CPU数组 /// @param name 新散列表的名字 /// @param type1 键的类型 /// @param type2 值的类型 /// @param MAX_ENTRIES map容量 -#define BPF_PERCPU_ARRAY(name, type1,type2,MAX_ENTRIES ) \ - struct \ - { \ - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); \ - __uint(key_size, sizeof(type1)); \ - __uint(value_size, sizeof(type2)); \ - __uint(max_entries, MAX_ENTRIES); \ - } name SEC(".maps") - +#define BPF_PERCPU_ARRAY(name, type1, type2, MAX_ENTRIES) \ + struct { \ + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); \ + __uint(key_size, sizeof(type1)); \ + __uint(value_size, sizeof(type2)); \ + __uint(max_entries, MAX_ENTRIES); \ + } name SEC(".maps") /// @brief 创建一个指定名字和键值类型的ebpf每CPU散列表 /// @param name 新散列表的名字 /// @param type1 键的类型 /// @param type2 值的类型 /// @param MAX_ENTRIES map容量 -#define BPF_PERCPU_HASH(name, type1,type2,MAX_ENTRIES ) \ - struct \ - { \ - __uint(type, BPF_MAP_TYPE_PERCPU_HASH); \ - __uint(key_size, sizeof(type1)); \ - __uint(value_size, sizeof(type2)); \ - __uint(max_entries, MAX_ENTRIES); \ - } name SEC(".maps") +#define BPF_PERCPU_HASH(name, type1, type2, MAX_ENTRIES) \ + struct { \ + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); \ + __uint(key_size, sizeof(type1)); \ + __uint(value_size, sizeof(type2)); \ + __uint(max_entries, MAX_ENTRIES); \ + } name SEC(".maps") /*----------------------------------------------*/ /* cs_delay结构体 */ @@ -92,22 +85,22 @@ typedef 
__kernel_mqd_t mqd_t; #ifndef __CS_DELAY_H #define __CS_DELAY_H struct event { - long unsigned int t1; - long unsigned int t2; - long unsigned int delay; + u64 t1; + u64 t2; + u64 delay; }; #endif /* __CS_DELAY_H */ /*----------------------------------------------*/ /* syscall_delay结构体 */ /*----------------------------------------------*/ -struct syscall_flags{ - long unsigned int start_time; +struct syscall_flags { + u64 start_time; int syscall_id; }; struct syscall_events {//每个进程一个 - int pid,count; + int pid, count; char comm[TASK_COMM_LEN]; u64 delay; u64 syscall_id; @@ -115,7 +108,7 @@ struct syscall_events {//每个进程一个 /*----------------------------------------------*/ /* preempt_event结构体 */ /*----------------------------------------------*/ -struct preempt_event{ +struct preempt_event { pid_t prev_pid; pid_t next_pid; unsigned long long duration; @@ -125,22 +118,24 @@ struct preempt_event{ /* schedule_delay相关结构体 */ /*----------------------------------------------*/ //标识不同进程 -struct proc_id{ +struct proc_id { int pid; int cpu_id; }; //标识该进程的调度信息 -struct schedule_event{ +struct schedule_event { int pid; int count;//调度次数 unsigned long long enter_time; }; //整个系统所有调度信息 -struct sum_schedule{ +struct sum_schedule { unsigned long long sum_count; unsigned long long sum_delay; unsigned long long max_delay; unsigned long long min_delay; + int pid_max; + int pid_min; }; /*----------------------------------------------*/ @@ -150,36 +145,34 @@ struct mq_events { int send_pid; int rcv_pid; mqd_t mqdes; - size_t msg_len; - unsigned int msg_prio; - - u64 send_enter_time; - u64 send_exit_time; - u64 rcv_enter_time; - u64 rcv_exit_time; + size_t msg_len; + unsigned int msg_prio; + u64 send_enter_time; + u64 send_exit_time; + u64 rcv_enter_time; + u64 rcv_exit_time; }; struct send_events { int send_pid; u64 Key_msg_ptr; - mqd_t mqdes; - size_t msg_len; - unsigned int msg_prio; - const char *u_msg_ptr; - const void *src; - u64 send_enter_time; - u64 send_exit_time; + size_t msg_len; 
+ unsigned int msg_prio; + const char *u_msg_ptr; + const void *src; + u64 send_enter_time; + u64 send_exit_time; }; struct rcv_events { int rcv_pid; u64 Key_msg_ptr; mqd_t mqdes; - size_t msg_len; - unsigned int msg_prio; - const char *u_msg_ptr; - const void *dest; - u64 rcv_enter_time; - u64 rcv_exit_time; + size_t msg_len; + unsigned int msg_prio; + const char *u_msg_ptr; + const void *dest; + u64 rcv_enter_time; + u64 rcv_exit_time; }; /*----------------------------------------------*/ /* cswch_args结构体 */ From 2b0c89b55d6a55f821a0c20521c41053beb33458 Mon Sep 17 00:00:00 2001 From: wynyibo <147615158+wynyibo@users.noreply.github.com> Date: Fri, 10 May 2024 20:12:23 +0800 Subject: [PATCH 4/5] =?UTF-8?q?net=5Fwatcher=EF=BC=9Aupdate=20README=20(#7?= =?UTF-8?q?86)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add README * add README * add README * add README * add README * add README --- .../Network_Subsystem/net_watcher/README.md | 204 ++++++++++++++---- 1 file changed, 164 insertions(+), 40 deletions(-) diff --git a/eBPF_Supermarket/Network_Subsystem/net_watcher/README.md b/eBPF_Supermarket/Network_Subsystem/net_watcher/README.md index 7cf63cb0f..161b1d5dc 100644 --- a/eBPF_Supermarket/Network_Subsystem/net_watcher/README.md +++ b/eBPF_Supermarket/Network_Subsystem/net_watcher/README.md @@ -18,18 +18,18 @@ lmp现有许多用于监控linux网络协议栈相关信息的小工具,但这 ### 1.2 功能介绍 -`netwatcher`是一款基于eBPF的网络检测工具,其旨在使用户方便快捷的获取主机环境下linux网络协议栈的各种信息。 +`netwatcher`是一款基于eBPF的高效网络检测工具,其目的是为了让用户能够轻松快捷地获取到网络协议栈的详细信息,通过高效的数据采集和精准的监控能力,帮助用户深入了解网络行为,确保网络安全和性能的优化。其在车辆智能导航、自动驾驶等关键领扮演着重要角色,面对网络异常或延迟时,`netwatcher`能够提供强大的网络监测和优化支持,帮助企业及时诊断并解决网络障碍,提升系统稳定性和服务可靠性。 目前,其实现的功能包括: + - TCP相关的信息监测:主机环境下对tcp/ip协议的分析,可以统计流量,延时,错误,链接信息等主要信息 - HTTP1/1.1相关信息检测:通过截取相应TCP包的HTTP头实现主机环境下对用户态http1的分析 +- TCP、UDP、ICMP相关信息监测:追踪TCP、UDP、ICMP协议数据包,并实现对主机接收和发送的所有相关数据包的时延数据和流量信息 +- 监测TCP连接状态信息:包括三次握手以及四次挥手的状态转变和时延数据 +- 丢包事件的监控:分析导致丢包的地址以及丢包原因 #### TODO -- [ ] ICMP数据包信息的监控 - - 
实现对ICMP协议报文的监控,输出到达的ICMP数据包的消息类型 -- [ ] UDP数据包信息的监控 - - 实现对主机接收和发送的所有UDP数据包的监控,输出各个UDP数据包的地址-端口4元组 -- [ ] 应用层协议的支持 +- [ ] 应用层协议的支持 - 在各个底层协议之上,提供对以下应用层信息的监控 - [ ] HTTP协议相关 - [ ] HTTP1 @@ -47,19 +47,31 @@ lmp现有许多用于监控linux网络协议栈相关信息的小工具,但这 ### 1.3 组织结构 -- netwatcher.bpf.c:在各个内核探针点对TCP包信息、TCP连接信息以及各个包的HTTP1/1.1信息进行记录 -- netwatcher.c: 对bpf.c文件中记录的信息进行输出 -- netwatcher.h: 定义内核态与用户态程序共用的结构体 - doc/: - implement.md:详细描述本项目的实现细节 - data/: - connects.log:符合Prometheus格式的连接信息 - err.log:符合Prometheus格式的错误包信息 - packets.log:符合Prometheus格式的包信息 + - udp.log:符合Prometheus格式的udp包信息 - visual.py:暴露metrics接口给Prometheus,输出data文件夹下的所有信息 +- netwatcher.c :对bpf.c文件中记录的信息进行输出 +- netwatcher.bpf.c:封装内核探针点。 +- tcp.bpf.h:网络数据包处理以及tcp连接状态等信息具体实现细节。 +- udp.bpf.h :udp数据包时延、流量的具体处理逻辑。 +- packet.bpf.h :网络数据包的处理、时间戳的记录、数据包信息的提取等指标具体处理逻辑。 +- netfilter.bpf.h:处理netfilter时延的具体逻辑,`submit_nf_time`函数将时延信息提交到用户态,`store_nf_time`函数存储经过每个`HOOK`点的时延。 +- drop.bpf.h :数据包丢弃原因的具体处理逻辑。 +- dropreason.bpf.h :skb_drop_reason定义77种丢包原因。 +- icmp.bpf.h: icmp时延具体实现细节。 +- comm.bpf.h :辅助函数、宏、BPF映射、以及内核中使用到的结构体。 + ## 二、快速开始 ### 2.1 安装依赖 OS: Ubuntu 22.04LTS + +Kernel:Linux 6.2 + ```bash sudo apt update sudo apt install libbpf-dev clang llvm libelf-dev libpcap-dev gcc-multilib build-essential @@ -77,23 +89,33 @@ sudo make test # 测试 `netwatcher`通过一系列命令参数来控制其具体行为: ```bash -Usage: netwatcher [OPTION...] -Watch tcp/ip in network subsystem +Usage: [OPTION...] 
+Watch tcp/ip in network subsystem -a, --all set to trace CLOSED connection -d, --dport=DPORT trace this destination port only -e, --err set to trace TCP error packets -i, --http set to trace http info + -I, --icmptime set to trace layer time of icmp + -k, --drop_reason trace kfree + -L, --timeload analysis time load + -n, --net_filter trace ipv4 packget filter -r, --retrans set to trace extra retrans info -s, --sport=SPORT trace this source port only + -S, --tcpstate set to trace tcpstate -t, --time set to trace layer time of each packet + -T, --addr_to_func translation addr to func and offset + -u, --udp trace the udp message -x, --extra set to trace extra conn info -?, --help Give this help list - --usage Give a short usage messages. + --usage Give a short usage message + +Mandatory or optional arguments to long options are also mandatory or optional +for any corresponding short options. ``` - 参数`-d`,`-s`用于指定监控某个源端口/目的端口 - 指定参数`-a`会保留已CLOSED的TCP连接信息 -- 指定`-e`参数会记录SEQ错误或Checksum错误的TCP包 +- 指定`-e`参数会记录SEQ错误或Checksum错误的TCP包并输出到标准输出以及data/err.log中。 - 默认情况下,监控以下连接信息与包信息: - TCP连接pid - TCP连接\[源地址:端口,目的地址:端口\] @@ -115,12 +137,21 @@ Watch tcp/ip in network subsystem - 已使用的发送缓冲区 - 平滑往返时间 - 连接已建立时长 - - 连接总重传次数 + - 连接总重传次数。 +- 指定 `-i` 参数监控HTTP信息。从携带HTTP请求头的TCP包中将HTTP请求头提取并输出,记录HTTP的状态码等信息。 +- 指定 `-u` 参数监控UDP数据包信息。统计UDP协议的数据包流量、以us为单位的时延等信息。 +- 指定 `-n` 参数监控ipv4网络层的Netfilter延迟。监测ipv4网络层数据包经过Netfilter各个HOOK点的处理时延。 +- 指定 `-k` 参数监测系统中的各类丢包并分析丢包原因,详细定位协议丢包的指令地址。 +- 指定 `-S` 参数监控TCP的连接状态以及状态转换的时延。 +- 指定 `-I` 参数监控ICMP协议数据包收发过程中的时延。 +- 指定 `-T` 参数将捕获到的丢包事件虚拟地址转换成函数名+偏移量形式。 +- 指定 `-L` 参数监测网络协议栈数据包经过各层的时延,采用指数加权移动法对异常的时延数据进行监控并发出警告信息。 + ### 3.1 监控连接信息 `netwatcher`会将保存在内存中的连接相关信息实时地在`data/connects.log`中更新。默认情况下,为节省资源消耗,`netwatcher`会实时删除已CLOSED的TCP连接相关信息,并只会保存每个TCP连接的基本信息。 -``` +```c // data/connects.log 
-connection{pid="44793",sock="0xffff9d1ecb3ba300",src="10.0.2.15:46348",dst="103.235.46.40:80",is_server="0",backlog="-",maxbacklog="-",cwnd="-",ssthresh="-",sndbuf="-",wmem_queued="-",rx_bytes="-",tx_bytes="-",srtt="-",duration="-",total_retrans="-",fast_retrans="-",timeout_retrans="-"} 0 +connection{pid="44793",sock="0xffff9d1ecb3ba300",src="10.0.2.15:46348",dst="103.235.46.40:80",is_server="0",backlog="-",maxbacklog="-",cwnd="-",ssthresh="-",sndbuf="-",wmem_queued="-",rx_bytes="-",tx_bytes="-",srtt="-",duration="-",total_retrans="-",fast_retrans="-",timeout_retrans="-"} ``` #### 3.1.1 保留所有连接信息 对于想要保留所有连接信息的用户,需要指定`-a`参数。 @@ -128,58 +159,151 @@ connection{pid="44793",sock="0xffff9d1ecb3ba300",src="10.0.2.15:46348",dst="103. sudo ./netwatcher -a ``` #### 3.1.2 监控额外连接信息 -`netwatcher`默认只监控基本连接信息,额外连接信息输出为`-`;对于想要监控额外连接信息的用户,需要指定`-x`参数。 -``` -sudo ./netwatcher -x +`netwatcher`指定`-x`参数输出额外信息 ,额外信息实时更新于data/connects.log日志中。记录接收窗口大小rwnd、拥塞窗口大小cwnd、慢启动阈值ssthresh、发送缓冲区大小sndbuf、已使用的发送缓冲区wmem_queued、已接收字节数rx_bytes、已确认字节数tx_bytes、平滑往返时间srtt、连接建立时延duration等关键信息。 + +```c +sudo ./netwatcher -x +connection{pid="45395",sock="0xffff9780c13f7500",src="192.168.60.136:42938",dst="171.214.23.48:443",is_server="0",backlog="0",maxbacklog="0",rwnd="63986",cwnd="10",ssthresh="2147483647",sndbuf="87040",wmem_queued="0",rx_bytes="603",tx_bytes="1.563K",srtt="68166",duration="63879",total_retrans="0",fast_retrans="-",timeout_retrans="-"} ``` #### 3.1.3 监控重传信息 -`netwatcher`监控的额外连接信息并不提供对超时重传次数与快速重传次数的细化;对于想要获取此信息的用户,需要指定`-r`参数。 -``` +`netwatcher`监控超时重传次数、快速重传次数和连接总重传次数,指定`-r`参数。 + +```c sudo ./netwatcher -r +connection{pid="45395",sock="0xffff9780c13f6c00",src="192.168.60.136:60210",dst="124.237.208.55:443",is_server="0",backlog="0",maxbacklog="0",rwnd="63784",cwnd="10",ssthresh="2147483647",sndbuf="87040",wmem_queued="0",rx_bytes="568",tx_bytes="8.489K",srtt="65211",duration="319696",total_retrans="0",fast_retrans="-",timeout_retrans="2"} ``` ### 3.2 监控包信息 
`netwatcher`会将各个TCP包的相关信息实时地输出在标准输出以及`data/packets.log`中。默认情况下,为节省资源消耗,`netwatcher`只会监控各个包的基本信息并输出。 -``` + +```c sudo ./netwatcher -SOCK SEQ ACK MAC_TIME IP_TIME TCP_TIME RX HTTP +SOCK SEQ ACK MAC_TIME IP_TIME TCP_TIME RX HTTP 0xffff9d1ecb3ba300 629372796 279168002 - - - 0 - 0xffff9d1ecb3ba300 279168002 629372873 - - - 1 - ``` -``` +```c // data/packets.log -packet{sock="0xffff9d1ecb3ba300",seq="629372796",ack="279168002",mac_time="-",ip_time="-",tcp_time="-",http_info="-",rx="0"} 0 -packet{sock="0xffff9d1ecb3ba300",seq="279168002",ack="629372873",mac_time="-",ip_time="-",tcp_time="-",http_info="-",rx="1"} 0 +packet{sock="0xffff9d1ecb3ba300",seq="629372796",ack="279168002",mac_time="-",ip_time="-",tcp_time="-",http_info="-",rx="0"} +packet{sock="0xffff9d1ecb3ba300",seq="279168002",ack="629372873",mac_time="-",ip_time="-",tcp_time="-",http_info="-",rx="1"} +``` +#### 3.2.1 监控数据包在各层的处理时延 +`netwatcher`监控各个数据包在各层的处理时间,指定`-t`参数,`netwatcher`会输出以`us`为单位的各层处理时间,支持ipv4、ipv6协议。 +```c +sudo ./netwatcher -t +SOCK SEQ ACK MAC_TIME IP_TIME TRAN_TIME RX HTTP +0xffffa069b1271200 3401944293 1411917682 4 15 15 0 - +0xffffa06982943600 537486036 4194537197 16 14 123 1 - +0xffffa06982943600 537486036 4194537197 16 14 130 1 - +0xffffa06982946c00 3341452281 1920160519 3 13 12 0 - ``` -#### 3.2.1 各层处理时间 -`netwatcher`提供监控各个数据包在各层的处理时间的支持。为了获得这部分信息,用户需要指定`-t`参数,`netwatcher`会输出以`us`为单位的各层处理时间。 + +指定参数`-u`,查看UDP数据包处理时延并记录于`data/udp.log`中,单位为微秒。 + +```c +sudo ./netwatcher -u +saddr daddr sprot dprot udp_time rx len +192.168.60.2 192.168.60.136 38151 53 7 1 119 +192.168.60.2 192.168.60.136 36524 53 5 1 89 ``` -sudo ./netwatcher -t -SOCK SEQ ACK MAC_TIME IP_TIME TCP_TIME RX HTTP -0xffff9d1ec43fd780 2018346083 420544002 1 3 9 0 - -0xffff9d1ec43fd780 420544002 2018346160 67 12 494 1 - -0xffff9d1ec43fd780 420545414 2018346160 40 5 258 1 - + +指定参数`-I`,查看ICMP数据包处理时延,单位为微秒。 + +```c +sudo ./netwatcher -I +saddr daddr time flag +192.168.60.136 192.168.60.136 11 1 +192.168.60.136 192.168.60.136 5 0 
+192.168.60.136 192.168.60.136 80 1 +``` + +指定参数`-n`,查看数据包在Netfilter框架(包括PRE_ROUTING、LOCAL_IN、FORWARD、LOCAL_OUT、POST_ROUTING链)中处理的时间,单位为微秒,其网络数据包路径为: + +- 发往本地:**NF_INET_PRE_ROUTING**-->**NF_INET_LOCAL_IN** +- 转发:**NF_INET_PRE_ROUTING**-->**NF_INET_FORWARD**-->**NF_INET_POST_ROUTING** +- 本地发出:**NF_INET_LOCAL_OUT**-->**NF_INET_POST_ROUTING** + +```c +sudo ./netwatcher -n +saddr daddr dprot sprot PreRT L_IN FW PostRT L_OUT rx +192.168.60.136 192.168.60.1 22 51729 0 0 0 2 3 0 +192.168.60.136 192.168.60.1 22 51729 0 0 0 1 3 0 +127.0.0.1 127.0.0.1 771 10787 2 2 0 0 0 1 +127.0.0.1 127.0.0.1 771 15685 2 2 0 0 0 1 ``` #### 3.2.2 监控错误数据包 + `netwatcher`提供对TCP错误的监控支持,用户只需指定`-e`参数,`netwatcher`会记录SEQ错误或Checksum错误的TCP包并输出到标准输出以及`data/err.log`中。 -``` -sudo ./netwatcher -e + +```c +sudo ./netwatcher -e +packet{sock="0xffff13235ac8ac8e",seq="1318124482",ack="2468218244",reason="Invalid SEQ"} ``` #### 3.2.3 HTTP1/1.1 -`netwatcher`提供对HTTP1/1.1信息的监控,在指定`-i`参数后,`netwatcher`会从携带HTTP请求头的TCP包中将HTTP请求头提取并输出。 -``` +`netwatcher`提供对HTTP1/1.1信息的监控,在指定`-i`参数后,`netwatcher`会从携带HTTP请求头的TCP包中将HTTP请求头提取并输出,提取信息包括请求资源方式、状态码、状态文本等。 + +```c sudo ./netwatcher -i SOCK SEQ ACK MAC_TIME IP_TIME TCP_TIME RX HTTP 0xffff9d1ecb3b9180 3705894662 522176002 - - - 0 GET / HTTP/1.1 0xffff9d1ecb3b9180 522176002 3705894739 - - - 1 HTTP/1.1 200 OK ``` -### 3.3 与Prometheus连接进行可视化 -可以注意到,`data`中的所有文件都满足Prometheus要求的时序数据库格式。`netwatcher`使用`visual.py`在端口41420暴露`metrics`API为Prometheus提供可视化支持,当Prometheus请求此API时,会获得当前时刻下三个log文件的所有内容。由于三个log文件被eBPF程序实时更新,因此满足时序性。 +#### 3.2.4 过滤指定目的端口、源端口 + +```c +sudo ./netwatcher -d 80 或者 sudo ./netwatcher -s 80 ``` + +#### 3.2.5 TCP协议数据包连接状态 + +指定参数`-S`,对TCP数据包连接状态的监控。netwatcher会跟踪TCP连接状态的转换,其中可以对各个状态所持续的时间进行监测。 + +```C +sudo ./netwatcher -S +saddr daddr sport dport oldstate newstate time +192.168.60.136 1.1.1.1 41586 80 FIN_WAIT1 FIN_WAIT2 386 +192.168.60.136 1.1.1.1 41586 80 FIN_WAIT2 CLOSE 19 +``` + +#### 3.2.6 捕捉丢包及原因 + +指定参数`-k`,捕捉丢包信息并获取其导致丢包的函数地址、丢包原因。 + +```c +sudo ./netwatcher -k 
+time saddr daddr sprot dprot prot addr +reason +14:45:11 127.0.0.1 127.0.0.1 51162 36623 ipv4 ffffffff920fbf89 SKB_DROP_REASON_TCP_OLD_DATA +``` + +指定参数`-T`,可以将虚拟地址转换成函数名+偏移的形式,在此处可以捕捉发生丢包的内核函数名称。 + +```C +sudo ./netwatcher -k -T +time saddr daddr sprot dprot prot addr reason +14:48:54 192.168.60.136 192.168.60.136 45828 8090 ipv4 tcp_v4_rcv+0x84 SKB_DROP_REASON_NO_SOCKET +14:49:04 118.105.115.105 110.103.32.49 26210 24435 other unix_dgram_sendmsg+0x521 SKB_DROP_REASON_NOT_SPECIFIED +``` + +#### 3.2.7 异常时延监控 + +对获取到的各层时延加上-L参数进行监控,捕获监测到的异常数据并给予警告信息。 + +```C +sudo ./netwatcher -t -L +saddr daddr sprot dprot udp_time rx len +127.0.0.1 127.0.0.53 44530 53 1593 0 51 abnoermal data +``` + +### 3.3 与Prometheus连接进行可视化 + +`data`目录下的所有文件都满足Prometheus要求的时序数据库格式。`netwatcher`使用`visual.py`在端口41420暴露`metrics`API为Prometheus提供可视化支持,当Prometheus请求此API时,会获得当前时刻所有log文件的全部内容。由于log文件被eBPF程序实时更新,因此满足时序性。 +```c python visual.py ``` ## 四、代码实现细节 -- 见`doc/implement.md` + +- 详细实现细节见`doc/implement.md` From ad2177e7ab9a62a105d3adf5b3487bcad30b51e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=A9=AC=E5=AE=9C=E8=90=B1?= <85030740+syxl-time@users.noreply.github.com> Date: Fri, 10 May 2024 20:21:35 +0800 Subject: [PATCH 5/5] =?UTF-8?q?mem=5Fwatcher:=E6=B7=BB=E5=8A=A0=E6=A3=80?= =?UTF-8?q?=E6=B5=8B=E5=86=85=E6=A0=B8=E6=80=81=E5=86=85=E5=AD=98=E6=B3=84?= =?UTF-8?q?=E6=BC=8F=E7=9A=84=E5=8A=9F=E8=83=BD=20(#789)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * mem_watcher:添加检测内核态内存泄漏的功能 * mem_watcher:修改makefile --- .../Memory_Subsystem/mem_watcher/Makefile | 10 +- .../mem_watcher/mem_watcher.c | 66 ++++-- .../mem_watcher/memleak.bpf.c | 195 +++++++++++++++++- 3 files changed, 245 insertions(+), 26 deletions(-) diff --git a/eBPF_Supermarket/Memory_Subsystem/mem_watcher/Makefile b/eBPF_Supermarket/Memory_Subsystem/mem_watcher/Makefile index 5847750c7..60cdd95ae 100644 --- a/eBPF_Supermarket/Memory_Subsystem/mem_watcher/Makefile +++ 
b/eBPF_Supermarket/Memory_Subsystem/mem_watcher/Makefile @@ -1,12 +1,12 @@ # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) OUTPUT := .output CLANG ?= clang -LIBBPF_SRC := $(abspath ../../libbpf/src) -BPFTOOL_SRC := $(abspath ../../bpftool/src) +LIBBPF_SRC := $(abspath ../libbpf/src) +BPFTOOL_SRC := $(abspath ../bpftool/src) LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a) BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool) BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool -LIBBLAZESYM_SRC := $(abspath ../../blazesym/) +LIBBLAZESYM_SRC := $(abspath ../blazesym/) LIBBLAZESYM_INC := $(abspath $(LIBBLAZESYM_SRC)/capi/include) LIBBLAZESYM_OBJ := $(abspath $(OUTPUT)/libblazesym_c.a) ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \ @@ -16,11 +16,11 @@ ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \ | sed 's/mips.*/mips/' \ | sed 's/riscv64/riscv/' \ | sed 's/loongarch64/loongarch/') -VMLINUX := ../../vmlinux/$(ARCH)/vmlinux.h +VMLINUX := ../vmlinux/$(ARCH)/vmlinux.h # Use our own libbpf API headers and Linux UAPI headers distributed with # libbpf to avoid dependency on system-wide headers, which could be missing or # outdated -INCLUDES := -I$(OUTPUT) -I../../libbpf/include/uapi -I$(dir $(VMLINUX)) -I$(LIBBLAZESYM_INC) +INCLUDES := -I$(OUTPUT) -I../libbpf/include/uapi -I$(dir $(VMLINUX)) -I$(LIBBLAZESYM_INC) CFLAGS := -g -Wall ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS) diff --git a/eBPF_Supermarket/Memory_Subsystem/mem_watcher/mem_watcher.c b/eBPF_Supermarket/Memory_Subsystem/mem_watcher/mem_watcher.c index 8c5eb3dc6..c1301ee2c 100644 --- a/eBPF_Supermarket/Memory_Subsystem/mem_watcher/mem_watcher.c +++ b/eBPF_Supermarket/Memory_Subsystem/mem_watcher/mem_watcher.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include "paf.skel.h" @@ -40,6 +41,9 @@ static const int stack_map_max_entries = 10240; //最大允许存储多少个sta static __u64 *g_stacks = NULL; static size_t g_stacks_size = 0; +#define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP) +#define 
USER_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK) + static struct blaze_symbolizer *symbolizer; static int attach_pid; @@ -86,6 +90,7 @@ static struct env { bool procstat; bool sysstat; bool memleak; + bool kernel_trace; bool part2; @@ -98,8 +103,10 @@ static struct env { .procstat = false, .sysstat = false, .memleak = false, + .kernel_trace = true, .rss = false, .part2 = false, + .choose_pid = 0, }; const char argp_program_doc[] = "mem_watcher is in use ....\n"; @@ -107,22 +114,27 @@ const char argp_program_doc[] = "mem_watcher is in use ....\n"; static const struct argp_option opts[] = { {0, 0, 0, 0, "select function:", 1}, - {"paf", 'a', 0, 0, "print paf (内存页面状态报告)", 2}, + {0, 0, 0, 0, "par:", 2}, + {"paf", 'a', 0, 0, "print paf (内存页面状态报告)"}, - {"pr", 'p', 0, 0, "print pr (页面回收状态报告)", 3}, + {0, 0, 0, 0, "pr:", 3}, + {"pr", 'p', 0, 0, "print pr (页面回收状态报告)"}, - {"procstat", 'r', 0, 0, "print procstat (进程内存状态报告)", 4}, - {0, 0, 0, 0, "procstat additional function:"}, + {0, 0, 0, 0, "procstat:", 4}, + {"procstat", 'r', 0, 0, "print procstat (进程内存状态报告)"}, {"choose_pid", 'P', "PID", 0, "选择进程号打印"}, {"Rss", 'R', NULL, 0, "打印进程页面", 5}, - {"sysstat", 's', 0, 0, "print sysstat (系统内存状态报告)", 6}, - {0, 0, 0, 0, "sysstat additional function:"}, + {0, 0, 0, 0, "sysstat:", 6}, + {"sysstat", 's', 0, 0, "print sysstat (系统内存状态报告)"}, + {"part2", 'n', NULL, 0, "系统内存状态报告2", 7}, - {"memleak", 'l', "PID", 0, "print memleak (内存泄漏检测)", 8}, + {0, 0, 0, 0, "memleak:", 8}, + {"memleak", 'l', 0, 0, "print memleak (内核态内存泄漏检测)", 8}, + {"choose_pid", 'P', "PID", 0, "选择进程号打印, print memleak (用户态内存泄漏检测)", 9}, - {"time", 't', "TIME-SEC", 0, "Max Running Time(0 for infinite)", 9}, + {"time", 't', "TIME-SEC", 0, "Max Running Time(0 for infinite)", 10}, {NULL, 'h', NULL, OPTION_HIDDEN, "show the full help"}, {0}, }; @@ -296,6 +308,17 @@ int print_outstanding_combined_allocs(struct memleak_bpf *skel, pid_t pid) { return 0; } +void disable_kernel_tracepoints(struct memleak_bpf *skel) { + 
bpf_program__set_autoload(skel->progs.memleak__kmalloc, false); + bpf_program__set_autoload(skel->progs.memleak__kmalloc_node, false); + bpf_program__set_autoload(skel->progs.memleak__kfree, false); + bpf_program__set_autoload(skel->progs.memleak__kmem_cache_alloc, false); + bpf_program__set_autoload(skel->progs.memleak__kmem_cache_alloc_node, false); + bpf_program__set_autoload(skel->progs.memleak__kmem_cache_free, false); + bpf_program__set_autoload(skel->progs.memleak__mm_page_alloc, false); + bpf_program__set_autoload(skel->progs.memleak__mm_page_free, false); +} + static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) { return vfprintf(stderr, format, args); } @@ -635,12 +658,13 @@ int main(int argc, char **argv) { } else if (env.memleak) { - if (argc != 3) { - printf("usage:%s attach_pid\n", argv[0]); - return -1; + /* A non-zero env.choose_pid selects user-space tracing: kernel_trace is cleared and that PID becomes attach_pid; otherwise attach_pid is set to 0. */ if (env.choose_pid != 0) { + printf("用户态内存泄漏\n"); + env.kernel_trace = false; + attach_pid = env.choose_pid; } - - attach_pid = atoi(argv[2]); + else + attach_pid = 0; strcpy(binary_path, "/lib/x86_64-linux-gnu/libc.so.6"); @@ -653,10 +677,15 @@ int main(int argc, char **argv) { fprintf(stderr, "Failed to open BPF skeleton\n"); return 1; } + //skel->rodata->stack_flags = KERN_STACKID_FLAGS; + skel->rodata->stack_flags = env.kernel_trace ? 
KERN_STACKID_FLAGS : USER_STACKID_FLAGS; bpf_map__set_value_size(skel->maps.stack_traces, perf_max_stack_depth * sizeof(__u64)); bpf_map__set_max_entries(skel->maps.stack_traces, stack_map_max_entries); + /* Kernel tracing is off: call disable_kernel_tracepoints() before the skeleton is loaded below. */ if (!env.kernel_trace) + disable_kernel_tracepoints(skel); + /* Load & verify BPF programs */ err = memleak_bpf__load(skel); if (err) { @@ -664,10 +693,13 @@ int main(int argc, char **argv) { goto memleak_cleanup; } - err = attach_uprobes(skel); - if (err) { - fprintf(stderr, "failed to attach uprobes\n"); - goto memleak_cleanup; + /* Uprobes are attached only when kernel tracing is off; a failure jumps to memleak_cleanup. */ if (!env.kernel_trace) { + err = attach_uprobes(skel); + if (err) { + fprintf(stderr, "failed to attach uprobes\n"); + + goto memleak_cleanup; + } } /* Let libbpf perform auto-attach for uprobe_sub/uretprobe_sub diff --git a/eBPF_Supermarket/Memory_Subsystem/mem_watcher/memleak.bpf.c b/eBPF_Supermarket/Memory_Subsystem/mem_watcher/memleak.bpf.c index d8b5d0d31..4ae642682 100644 --- a/eBPF_Supermarket/Memory_Subsystem/mem_watcher/memleak.bpf.c +++ b/eBPF_Supermarket/Memory_Subsystem/mem_watcher/memleak.bpf.c @@ -19,10 +19,10 @@ #include "vmlinux.h" #include #include +#include #include "mem_watcher.h" - -#define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP) -#define USER_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK) + +const volatile __u64 stack_flags = 0; struct { __uint(type, BPF_MAP_TYPE_HASH); @@ -88,7 +88,7 @@ static int gen_alloc_exit2(void *ctx, u64 address) { bpf_map_delete_elem(&sizes, &pid); if (0 != address) { - info.stack_id = bpf_get_stackid(ctx, &stack_traces, USER_STACKID_FLAGS); + info.stack_id = bpf_get_stackid(ctx, &stack_traces, stack_flags); bpf_map_update_elem(&allocs, &addr, &info, BPF_ANY); @@ -258,4 +258,191 @@ int BPF_KPROBE(pvalloc_enter, size_t size) { SEC("uretprobe") int BPF_KRETPROBE(pvalloc_exit) { return gen_alloc_exit(ctx); +} + +struct trace_event_raw_kmem_alloc_node___x { + const void *ptr; + size_t bytes_alloc; +} __attribute__((preserve_access_index)); + +static __always_inline bool 
has_kmem_alloc_node(void) { + if (bpf_core_type_exists(struct trace_event_raw_kmem_alloc_node___x)) + return true; + return false; +} + +struct trace_event_raw_kmem_alloc___x { + const void *ptr; + size_t bytes_alloc; +} __attribute__((preserve_access_index)); + +struct trace_event_raw_kmalloc___x { + const void *ptr; + size_t bytes_alloc; +} __attribute__((preserve_access_index)); + +struct trace_event_raw_kmem_cache_alloc___x { + const void *ptr; + size_t bytes_alloc; +} __attribute__((preserve_access_index)); + +static __always_inline bool has_kmem_alloc(void) +{ + if (bpf_core_type_exists(struct trace_event_raw_kmem_alloc___x)) + return true; + return false; +} + +SEC("tracepoint/kmem/kmalloc") +int memleak__kmalloc(void *ctx) +{ + const void *ptr; + size_t bytes_alloc; + + if (has_kmem_alloc()) { + struct trace_event_raw_kmem_alloc___x *args = ctx; + ptr = BPF_CORE_READ(args, ptr); + bytes_alloc = BPF_CORE_READ(args, bytes_alloc); + } else { + struct trace_event_raw_kmalloc___x *args = ctx; + ptr = BPF_CORE_READ(args, ptr); + bytes_alloc = BPF_CORE_READ(args, bytes_alloc); + } + + gen_alloc_enter(bytes_alloc); + + return gen_alloc_exit2(ctx, (u64)ptr); +} + +SEC("tracepoint/kmem/kmalloc_node") +int memleak__kmalloc_node(void *ctx) +{ + const void *ptr; + size_t bytes_alloc; + + if (has_kmem_alloc_node()) { + struct trace_event_raw_kmem_alloc_node___x *args = ctx; + ptr = BPF_CORE_READ(args, ptr); + bytes_alloc = BPF_CORE_READ(args, bytes_alloc); + + gen_alloc_enter( bytes_alloc); + + return gen_alloc_exit2(ctx, (u64)ptr); + } else { + /* tracepoint is disabled if not exist, avoid compile warning */ + return 0; + } +} + +/* Free-side tracepoint record declarations (preserve_access_index); only the ptr field is declared, and it is read with BPF_CORE_READ by the free handlers. */ struct trace_event_raw_kmem_free___x { + const void *ptr; +} __attribute__((preserve_access_index)); + +struct trace_event_raw_kfree___x { + const void *ptr; +} __attribute__((preserve_access_index)); + +struct trace_event_raw_kmem_cache_free___x { + const void *ptr; +} __attribute__((preserve_access_index)); + +/* Returns true when bpf_core_type_exists() reports struct trace_event_raw_kfree___x. */ static __always_inline bool 
has_kfree() +{ + if (bpf_core_type_exists(struct trace_event_raw_kfree___x)) + return true; + return false; +} + +/* Returns true when bpf_core_type_exists() reports struct trace_event_raw_kmem_cache_free___x. */ static __always_inline bool has_kmem_cache_free() +{ + if (bpf_core_type_exists(struct trace_event_raw_kmem_cache_free___x)) + return true; + return false; +} + +/* kmem/kfree tracepoint: reads ptr through the kfree record when has_kfree() is true, else through the kmem_free record, and returns gen_free_enter(ptr). */ SEC("tracepoint/kmem/kfree") +int memleak__kfree(void *ctx) +{ + const void *ptr; + + if (has_kfree()) { + struct trace_event_raw_kfree___x *args = ctx; + ptr = BPF_CORE_READ(args, ptr); + } else { + struct trace_event_raw_kmem_free___x *args = ctx; + ptr = BPF_CORE_READ(args, ptr); + } + + return gen_free_enter(ptr); +} + +/* kmem/kmem_cache_alloc tracepoint: reads ptr and bytes_alloc through the record chosen by has_kmem_alloc(), calls gen_alloc_enter(bytes_alloc), then returns gen_alloc_exit2(ctx, ptr). */ SEC("tracepoint/kmem/kmem_cache_alloc") +int memleak__kmem_cache_alloc(void *ctx) +{ + const void *ptr; + size_t bytes_alloc; + + if (has_kmem_alloc()) { + struct trace_event_raw_kmem_alloc___x *args = ctx; + ptr = BPF_CORE_READ(args, ptr); + bytes_alloc = BPF_CORE_READ(args, bytes_alloc); + } else { + struct trace_event_raw_kmem_cache_alloc___x *args = ctx; + ptr = BPF_CORE_READ(args, ptr); + bytes_alloc = BPF_CORE_READ(args, bytes_alloc); + } + + gen_alloc_enter(bytes_alloc); + + return gen_alloc_exit2(ctx, (u64)ptr); +} + +/* kmem/kmem_cache_alloc_node tracepoint: records the allocation only when has_kmem_alloc_node() is true; otherwise returns 0 without recording anything. */ SEC("tracepoint/kmem/kmem_cache_alloc_node") +int memleak__kmem_cache_alloc_node(void *ctx) +{ + const void *ptr; + size_t bytes_alloc; + + if (has_kmem_alloc_node()) { + struct trace_event_raw_kmem_alloc_node___x *args = ctx; + ptr = BPF_CORE_READ(args, ptr); + bytes_alloc = BPF_CORE_READ(args, bytes_alloc); + + gen_alloc_enter(bytes_alloc); + + return gen_alloc_exit2(ctx, (u64)ptr); + } else { + /* tracepoint is disabled if not exist, avoid compile warning */ + return 0; + } +} + +/* kmem/kmem_cache_free tracepoint: reads ptr through the kmem_cache_free record when has_kmem_cache_free() is true, else through the kmem_free record, and returns gen_free_enter(ptr). */ SEC("tracepoint/kmem/kmem_cache_free") +int memleak__kmem_cache_free(void *ctx) +{ + const void *ptr; + + if (has_kmem_cache_free()) { + struct trace_event_raw_kmem_cache_free___x *args = ctx; + ptr = BPF_CORE_READ(args, ptr); + } else { + struct trace_event_raw_kmem_free___x *args = ctx; + ptr = BPF_CORE_READ(args, ptr); + } + + return gen_free_enter(ptr); +} + 
+/* kmem/mm_page_alloc tracepoint: calls gen_alloc_enter() with 4096 << ctx->order bytes and returns gen_alloc_exit2() with ctx->pfn as the address argument. NOTE(review): 4096 looks like a hard-coded page size; confirm for targets with a different page size. */ SEC("tracepoint/kmem/mm_page_alloc") +int memleak__mm_page_alloc(struct trace_event_raw_mm_page_alloc *ctx) +{ + gen_alloc_enter(4096 << ctx->order); + + return gen_alloc_exit2(ctx, ctx->pfn); +} + +/* kmem/mm_page_free tracepoint: returns gen_free_enter() called with ctx->pfn cast to a pointer, the same value the page-alloc handler passes as its address. */ SEC("tracepoint/kmem/mm_page_free") +int memleak__mm_page_free(struct trace_event_raw_mm_page_free *ctx) +{ + return gen_free_enter((void *)ctx->pfn); } \ No newline at end of file