diff --git a/docs/core_plugins.md b/docs/core_plugins.md index 7bc6c4c0..6c5adfea 100644 --- a/docs/core_plugins.md +++ b/docs/core_plugins.md @@ -189,6 +189,7 @@ Always returns CONTINUE. post_action_delay=15 (optional) dry=false (optional) always_continue=false (optional) + reap_memory=false (optional) ### Description @@ -242,6 +243,8 @@ they will have no effect if set on ancestors of the targeted cgroups. STOP if killed something (even if dry=true), unless `always_continue`. CONTINUE otherwise. +If `reap_memory` is set to true, oomd attempts to speed up process memory cleanup via the `process_mrelease` syscall. See https://lwn.net/Articles/864184/ for details. + ## kill_by_swap_usage ### Arguments @@ -252,6 +255,7 @@ otherwise. post_action_delay=15 (optional) dry=false (optional) always_continue=false (optional) + reap_memory=false (optional) ### Description @@ -272,6 +276,8 @@ of SIGKILLs sent to resident processes. STOP if killed something (even if dry=true), unless `always_continue`. CONTINUE otherwise. +If `reap_memory` is set to true, oomd attempts to speed up process memory cleanup via the `process_mrelease` syscall. See https://lwn.net/Articles/864184/ for details. + ## kill_by_pressure ### Arguments @@ -282,6 +288,7 @@ otherwise. post_action_delay=15 (optional) dry=false (optional) always_continue=false (optional) + reap_memory=false (optional) ### Description @@ -302,6 +309,8 @@ of SIGKILLs sent to resident processes. STOP if killed something (even if dry=true), unless `always_continue`. CONTINUE otherwise. +If `reap_memory` is set to true, oomd attempts to speed up process memory cleanup via the `process_mrelease` syscall. See https://lwn.net/Articles/864184/ for details. + ## kill_by_io_cost ### Arguments @@ -311,6 +320,7 @@ otherwise. post_action_delay=15 (optional) dry=false (optional) always_continue=false (optional) + reap_memory=false (optional) ### Description @@ -329,6 +339,8 @@ of SIGKILLs sent to resident processes. 
STOP if killed something (even if dry=true), unless `always_continue`. CONTINUE otherwise. +If `reap_memory` is set to true, attempt to speed up process memory cleanup via process_mrelease syscall. See https://lwn.net/Articles/864184/ for details. + ## kill_by_pg_scan ### Arguments @@ -338,6 +350,7 @@ otherwise. post_action_delay=15 (optional) dry=false (optional) always_continue=false (optional) + reap_memory=false (optional) ### Description @@ -355,3 +368,5 @@ of SIGKILLs sent to resident processes. STOP if killed something (even if dry=true), unless `always_continue`. CONTINUE otherwise. + +If `reap_memory` is set to true, attempt to speed up process memory cleanup via process_mrelease syscall. See https://lwn.net/Articles/864184/ for details. diff --git a/src/oomd/plugins/BaseKillPlugin.cpp b/src/oomd/plugins/BaseKillPlugin.cpp index 8356c5fe..e02d268f 100644 --- a/src/oomd/plugins/BaseKillPlugin.cpp +++ b/src/oomd/plugins/BaseKillPlugin.cpp @@ -18,6 +18,8 @@ #include "oomd/plugins/BaseKillPlugin.h" #include +#include +#include #include #include #include @@ -42,8 +44,24 @@ #include "oomd/include/CoreStats.h" #include "oomd/include/Types.h" #include "oomd/util/Fs.h" +#include "oomd/util/ScopeGuard.h" #include "oomd/util/Util.h" +#ifndef __NR_process_mrelease +#define __NR_process_mrelease 448 +#endif + +namespace { +static int pidfd_open(pid_t pid, unsigned int flags) noexcept { + return ::syscall(SYS_pidfd_open, pid, flags); +} + +static int process_mrelease(int pidfd, unsigned int flags) noexcept { + return ::syscall(__NR_process_mrelease, pidfd, flags); +} + +} // namespace + static auto constexpr kOomdKillInitiationTrustedXattr = "trusted.oomd_ooms"; static auto constexpr kOomdKillInitiationUserXattr = "user.oomd_ooms"; static auto constexpr kOomdKillCompletionTrustedXattr = "trusted.oomd_kill"; @@ -71,6 +89,7 @@ int BaseKillPlugin::init( argParser_.addArgument("always_continue", alwaysContinue_); argParser_.addArgument("debug", debug_); 
argParser_.addArgument("kernelkill", kernelKill_); + argParser_.addArgument("reap_memory", reapMemory_); if (!argParser_.parse(args)) { return 1; @@ -466,6 +485,47 @@ int BaseKillPlugin::dumpMemoryStat(const CgroupContext& target) { return 0; } +bool BaseKillPlugin::reapProcess(pid_t pid) { + const int pidfd = ::pidfd_open(pid, 0); + if (pidfd < 0) { + if (errno != ESRCH) { + OLOG << "pidfd_open " << pid << " failed: " << Util::strerror_r(); + } + return false; + } + OOMD_SCOPE_EXIT { + ::close(pidfd); + }; + + if (::process_mrelease(pidfd, 0) < 0) { + if (errno != ESRCH) { + OLOG << "process_mrelease " << pid << " failed: " << Util::strerror_r(); + } + return false; + } + return true; +} + +int BaseKillPlugin::reapCgroupRecursively(const CgroupContext& target) { + int reaped = 0; + if (const auto& children = target.children()) { + for (const auto& childName : *children) { + if (auto childCtx = + target.oomd_ctx().addChildToCacheAndGet(target, childName)) { + reaped += reapCgroupRecursively(*childCtx); + } + } + } + + if (const auto& pids = Fs::getPidsAt(target.fd())) { + for (int pid : *pids) { + reaped += reapProcess(pid); + } + } + + return reaped; +} + int BaseKillPlugin::tryToKillCgroup( const CgroupContext& target, const KillUuid& killUuid, @@ -538,6 +598,16 @@ int BaseKillPlugin::tryToKillCgroup( } } + if (reapMemory_ && nrKilled > 0) { + OLOG << "Reaping processes in " << target.cgroup().absolutePath(); + const auto start = std::chrono::steady_clock::now(); + const int reaped = reapCgroupRecursively(target); + const auto end = std::chrono::steady_clock::now(); + const auto dur = + std::chrono::duration_cast(end - start); + OLOG << "Reaped " << reaped << " processes in " << dur.count() << "ms"; + } + reportKillCompletionToXattr(cgroupPath, nrKilled); return nrKilled; } diff --git a/src/oomd/plugins/BaseKillPlugin.h b/src/oomd/plugins/BaseKillPlugin.h index 5bd8003c..b40a3d1c 100644 --- a/src/oomd/plugins/BaseKillPlugin.h +++ 
b/src/oomd/plugins/BaseKillPlugin.h @@ -197,6 +197,14 @@ class BaseKillPlugin : public Engine::BasePlugin { virtual int freezeCgroup(const CgroupContext& target); virtual int kernelKillCgroup(const CgroupContext& target); + // Calls process_mrelease for the given pid. + // Returns true if the process was successfully reaped. + virtual bool reapProcess(pid_t pid); + + // Calls process_mrelease for all pids in the cgroup and its descendants. + // Returns the number of processes reaped. + virtual int reapCgroupRecursively(const CgroupContext& target); + private: enum class KillResult { SUCCESS, @@ -253,6 +261,7 @@ class BaseKillPlugin : public Engine::BasePlugin { bool alwaysContinue_{false}; bool debug_{false}; bool kernelKill_{false}; + bool reapMemory_{false}; struct ActivePrekillHook { public: