Skip to content

Commit

Permalink
Add support for process memory reaping via process_mrelease syscall
Browse files Browse the repository at this point in the history
Summary:
- Add support for faster process memory reaping by using `process_mrelease` syscall.
- To enable, use `reap_memory=true` in kill plugin arguments. Disabled by default

Reviewed By: lnyng

Differential Revision: D58204970

fbshipit-source-id: e54b6aef90c26243a300e0c98a3c0ff22369a348
  • Loading branch information
Andrew Onyshchuk authored and facebook-github-bot committed Jan 17, 2025
1 parent 1651104 commit aca7c94
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 0 deletions.
15 changes: 15 additions & 0 deletions docs/core_plugins.md
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ Always returns CONTINUE.
post_action_delay=15 (optional)
dry=false (optional)
always_continue=false (optional)
reap_memory=false (optional)

### Description

Expand Down Expand Up @@ -242,6 +243,8 @@ they will have no effect if set on ancestors of the targeted cgroups.
STOP if killed something (even if dry=true), unless `always_continue`. CONTINUE
otherwise.

If `reap_memory` is set to true, the plugin attempts to speed up memory cleanup of the killed processes via the `process_mrelease` syscall. See https://lwn.net/Articles/864184/ for details.

## kill_by_swap_usage

### Arguments
Expand All @@ -252,6 +255,7 @@ otherwise.
post_action_delay=15 (optional)
dry=false (optional)
always_continue=false (optional)
reap_memory=false (optional)

### Description

Expand All @@ -272,6 +276,8 @@ of SIGKILLs sent to resident processes.
STOP if killed something (even if dry=true), unless `always_continue`. CONTINUE
otherwise.

If `reap_memory` is set to true, the plugin attempts to speed up memory cleanup of the killed processes via the `process_mrelease` syscall. See https://lwn.net/Articles/864184/ for details.

## kill_by_pressure

### Arguments
Expand All @@ -282,6 +288,7 @@ otherwise.
post_action_delay=15 (optional)
dry=false (optional)
always_continue=false (optional)
reap_memory=false (optional)

### Description

Expand All @@ -302,6 +309,8 @@ of SIGKILLs sent to resident processes.
STOP if killed something (even if dry=true), unless `always_continue`. CONTINUE
otherwise.

If `reap_memory` is set to true, the plugin attempts to speed up memory cleanup of the killed processes via the `process_mrelease` syscall. See https://lwn.net/Articles/864184/ for details.

## kill_by_io_cost

### Arguments
Expand All @@ -311,6 +320,7 @@ otherwise.
post_action_delay=15 (optional)
dry=false (optional)
always_continue=false (optional)
reap_memory=false (optional)

### Description

Expand All @@ -329,6 +339,8 @@ of SIGKILLs sent to resident processes.
STOP if killed something (even if dry=true), unless `always_continue`. CONTINUE
otherwise.

If `reap_memory` is set to true, the plugin attempts to speed up memory cleanup of the killed processes via the `process_mrelease` syscall. See https://lwn.net/Articles/864184/ for details.

## kill_by_pg_scan

### Arguments
Expand All @@ -338,6 +350,7 @@ otherwise.
post_action_delay=15 (optional)
dry=false (optional)
always_continue=false (optional)
reap_memory=false (optional)

### Description

Expand All @@ -355,3 +368,5 @@ of SIGKILLs sent to resident processes.

STOP if killed something (even if dry=true), unless `always_continue`. CONTINUE
otherwise.

If `reap_memory` is set to true, the plugin attempts to speed up memory cleanup of the killed processes via the `process_mrelease` syscall. See https://lwn.net/Articles/864184/ for details.
70 changes: 70 additions & 0 deletions src/oomd/plugins/BaseKillPlugin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#include "oomd/plugins/BaseKillPlugin.h"

#include <fcntl.h>
#include <signal.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <algorithm>
#include <chrono>
Expand All @@ -42,8 +44,24 @@
#include "oomd/include/CoreStats.h"
#include "oomd/include/Types.h"
#include "oomd/util/Fs.h"
#include "oomd/util/ScopeGuard.h"
#include "oomd/util/Util.h"

// Fallback syscall numbers for toolchains whose headers predate these
// syscalls. The values are the ones used by the common Linux syscall table.
// NOTE(review): confirm the numbers if an architecture with a divergent
// syscall table ever needs to be supported.
#ifndef SYS_pidfd_open
#define SYS_pidfd_open 434
#endif

#ifndef __NR_process_mrelease
#define __NR_process_mrelease 448
#endif

namespace {
// Raw wrapper around the pidfd_open syscall.
// Returns the value produced by syscall(): a file descriptor on success,
// or -1 with errno set on failure.
int pidfd_open(pid_t pid, unsigned int flags) noexcept {
  // syscall() returns long; the result is an fd or -1, so narrowing to int
  // is intentional and made explicit here.
  return static_cast<int>(::syscall(SYS_pidfd_open, pid, flags));
}

// Raw wrapper around the process_mrelease syscall.
// Returns the value produced by syscall(): 0 on success, or -1 with errno
// set on failure.
int process_mrelease(int pidfd, unsigned int flags) noexcept {
  return static_cast<int>(::syscall(__NR_process_mrelease, pidfd, flags));
}

} // namespace

static auto constexpr kOomdKillInitiationTrustedXattr = "trusted.oomd_ooms";
static auto constexpr kOomdKillInitiationUserXattr = "user.oomd_ooms";
static auto constexpr kOomdKillCompletionTrustedXattr = "trusted.oomd_kill";
Expand Down Expand Up @@ -71,6 +89,7 @@ int BaseKillPlugin::init(
argParser_.addArgument("always_continue", alwaysContinue_);
argParser_.addArgument("debug", debug_);
argParser_.addArgument("kernelkill", kernelKill_);
argParser_.addArgument("reap_memory", reapMemory_);

if (!argParser_.parse(args)) {
return 1;
Expand Down Expand Up @@ -466,6 +485,47 @@ int BaseKillPlugin::dumpMemoryStat(const CgroupContext& target) {
return 0;
}

// Ask the kernel to release the memory of a single process.
//
// Opens a pidfd for `pid`, calls process_mrelease on it, and closes the
// pidfd on every exit path once it has been opened.
//
// Returns true only when both pidfd_open and process_mrelease succeed.
// A failure with errno == ESRCH is not logged; any other failure is logged
// before returning false.
bool BaseKillPlugin::reapProcess(pid_t pid) {
  const int pidfd = ::pidfd_open(pid, 0);
  if (pidfd < 0) {
    if (errno != ESRCH) {
      // Render the error text before touching the log stream: the earlier
      // insertions in an `OLOG << ...` chain run first and could overwrite
      // errno. (Util::strerror_r() presumably formats the current errno.)
      const auto reason = Util::strerror_r();
      OLOG << "pidfd_open " << pid << " failed: " << reason;
    }
    return false;
  }
  OOMD_SCOPE_EXIT {
    ::close(pidfd);
  };

  if (::process_mrelease(pidfd, 0) < 0) {
    if (errno != ESRCH) {
      // Same reasoning as above: capture the message while errno still
      // belongs to the failed process_mrelease call.
      const auto reason = Util::strerror_r();
      OLOG << "process_mrelease " << pid << " failed: " << reason;
    }
    return false;
  }
  return true;
}

// Reap every process in `target` and in all of its descendant cgroups.
//
// Child cgroups are visited first (depth-first), then the pids listed
// directly in `target`. Children or pid lists that cannot be obtained are
// skipped. Returns how many reapProcess() calls reported success.
int BaseKillPlugin::reapCgroupRecursively(const CgroupContext& target) {
  int total = 0;

  // Descendants first.
  if (const auto& kids = target.children()) {
    for (const auto& name : *kids) {
      auto child = target.oomd_ctx().addChildToCacheAndGet(target, name);
      if (!child) {
        continue;
      }
      total += reapCgroupRecursively(*child);
    }
  }

  // Then the processes that live directly in this cgroup.
  if (const auto& procs = Fs::getPidsAt(target.fd())) {
    for (int pid : *procs) {
      if (reapProcess(pid)) {
        ++total;
      }
    }
  }

  return total;
}

int BaseKillPlugin::tryToKillCgroup(
const CgroupContext& target,
const KillUuid& killUuid,
Expand Down Expand Up @@ -538,6 +598,16 @@ int BaseKillPlugin::tryToKillCgroup(
}
}

if (reapMemory_ && nrKilled > 0) {
OLOG << "Reaping processes in " << target.cgroup().absolutePath();
const auto start = std::chrono::steady_clock::now();
const int reaped = reapCgroupRecursively(target);
const auto end = std::chrono::steady_clock::now();
const auto dur =
std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
OLOG << "Reaped " << reaped << " processes in " << dur.count() << "ms";
}

reportKillCompletionToXattr(cgroupPath, nrKilled);
return nrKilled;
}
Expand Down
9 changes: 9 additions & 0 deletions src/oomd/plugins/BaseKillPlugin.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,14 @@ class BaseKillPlugin : public Engine::BasePlugin {
virtual int freezeCgroup(const CgroupContext& target);
virtual int kernelKillCgroup(const CgroupContext& target);

// Call process_mrelease for a given pid.
// Returns true if the process was successfully reaped.
virtual bool reapProcess(pid_t pid);

// Call process_mrelease for all pids in cgroup and its descendants.
// Returns the number of processes reaped
virtual int reapCgroupRecursively(const CgroupContext& target);

private:
enum class KillResult {
SUCCESS,
Expand Down Expand Up @@ -253,6 +261,7 @@ class BaseKillPlugin : public Engine::BasePlugin {
bool alwaysContinue_{false};
bool debug_{false};
bool kernelKill_{false};
bool reapMemory_{false};

struct ActivePrekillHook {
public:
Expand Down

0 comments on commit aca7c94

Please sign in to comment.