Skip to content

Commit

Permalink
ROX-26763: Enhance the handling of failures in procfsscraper (#1987)
Browse files Browse the repository at this point in the history
We extract the process state from /proc/<pid>/stat
- Skip parsing of zombie processes.
- Add stats for zombie processes encountered
- Turn parse errors into Traces with process state details.
  • Loading branch information
ovalenti authored Jan 21, 2025
1 parent 0b3c7a1 commit fb2d253
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 3 deletions.
1 change: 1 addition & 0 deletions collector/lib/CollectorStats.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
X(procfs_could_not_get_socket_inodes) \
X(procfs_could_not_read_exe) \
X(procfs_could_not_read_cmdline) \
X(procfs_zombie_process) \
X(event_timestamp_distant_past) \
X(event_timestamp_future)

Expand Down
51 changes: 48 additions & 3 deletions collector/lib/ProcfsScraper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include <cctype>
#include <cinttypes>
#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <string_view>
Expand Down Expand Up @@ -143,6 +144,23 @@ bool GetSocketINodes(int dirfd, uint64_t pid, UnorderedSet<SocketInfo>* sock_ino
return true;
}

// Fetches the current state of the process pointed to by dirfd
// returns nulopt in case of error
std::optional<char> ReadProcessState(int dirfd) {
FileHandle stat_file(FDHandle(openat(dirfd, "stat", O_RDONLY)), "r");
if (!stat_file.valid()) {
return false;
}

char linebuf[512];

if (fgets(linebuf, sizeof(linebuf), stat_file.get()) == nullptr) {
return false;
}

return ExtractProcessState(linebuf);
}

// GetContainerID retrieves the container ID of the process represented by dirfd. The container ID is extracted from
// the cgroup.
std::optional<std::string> GetContainerID(int dirfd) {
Expand Down Expand Up @@ -491,16 +509,24 @@ bool ReadContainerConnections(const char* proc_path, std::shared_ptr<ProcessStor
continue;
}

auto process_state = ReadProcessState(dirfd);
if (process_state && *process_state == 'Z') {
COUNTER_INC(CollectorStats::procfs_zombie_process);
continue;
}

auto container_id = GetContainerID(dirfd);
if (!container_id) {
continue;
}

uint64_t netns_inode;
if (!GetNetworkNamespace(dirfd, &netns_inode)) {
// TODO ROX-13962: Improve logging to indicate when a process is defunct.
COUNTER_INC(CollectorStats::procfs_could_not_get_network_namespace);
CLOG_THROTTLED(ERROR, std::chrono::seconds(10)) << "Could not determine network namespace: " << StrError();
CLOG(TRACE) << "Could not determine network namespace: " << StrError();
if (process_state) {
CLOG(TRACE) << "Process state: " << *process_state;
}
continue;
}

Expand All @@ -509,7 +535,10 @@ bool ReadContainerConnections(const char* proc_path, std::shared_ptr<ProcessStor

if (!GetSocketINodes(dirfd, pid, &container_ns_sockets)) {
COUNTER_INC(CollectorStats::procfs_could_not_get_socket_inodes);
CLOG_THROTTLED(ERROR, std::chrono::seconds(10)) << "Could not obtain socket inodes: " << StrError();
CLOG(TRACE) << "Could not obtain socket inodes: " << StrError();
if (process_state) {
CLOG(TRACE) << "Process state: " << *process_state;
}
continue;
}

Expand Down Expand Up @@ -609,6 +638,22 @@ std::optional<std::string_view> ExtractContainerID(std::string_view cgroup_line)
return ExtractContainerIDFromCgroup(cgroup_path);
}

std::optional<char> ExtractProcessState(std::string_view line) {
size_t last_parenthese;

if ((last_parenthese = line.rfind(") ")) == line.npos) {
return {};
}

line.remove_prefix(last_parenthese + 2);

if (line.empty()) {
return {};
}

return line[0];
}

bool ConnScraper::Scrape(std::vector<Connection>* connections, std::vector<ContainerEndpoint>* listen_endpoints) {
return ReadContainerConnections(proc_path_.c_str(), process_store_, connections, listen_endpoints);
}
Expand Down
5 changes: 5 additions & 0 deletions collector/lib/ProcfsScraper_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ namespace collector {
// ExtractContainerID tries to extract a container ID from a cgroup line.
std::optional<std::string_view> ExtractContainerID(std::string_view cgroup_line);

// ExtractProcessState retrieves the state of the process (3rd element).
// as found in /proc/<pid>/stat.
// Returns: the state character or nullopt in case of error.
std::optional<char> ExtractProcessState(std::string_view proc_pid_stat_line);

} // namespace collector

#endif
21 changes: 21 additions & 0 deletions collector/test/ConnScraperTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,27 @@ TEST(ConnScraperTest, TestExtractContainerID) {
}
}

TEST(ConnScraperTest, TestProcStateExtract) {
std::optional<char> state;

// valid line
state = ExtractProcessState("13934 (prog) Z 2312 13934 2312 34819 13934 4194304 94 0 0 0 0 0 0 0 20 0 1 0 608787 5758976 409 18446744073709551615 94201870180352 94201870200233 140728860702192 0 0 0 0 0 0 0 0 0 17 4 0 0 0 0 0 94201870216240 94201870217856 94202687545344 140728860710184 140728860710204 140728860710204 140728860712939 0\n");

EXPECT_TRUE(state);
EXPECT_EQ(*state, 'Z');

// invalid
state = ExtractProcessState("13934 (prog) ");

EXPECT_FALSE(state);

// program name containing ')'
state = ExtractProcessState("13934 (prog ) Z) R 2312 13934 2312 34819 13934 4194304 94 0 0 0 0 0 0 0 20 0 1 0 608787 5758976 409 18446744073709551615 94201870180352 94201870200233 140728860702192 0 0 0 0 0 0 0 0 0 17 4 0 0 0 0 0 94201870216240 94201870217856 94202687545344 140728860710184 140728860710204 140728860710204 140728860712939 0\n");

EXPECT_TRUE(state);
EXPECT_EQ(*state, 'R');
}

} // namespace

} // namespace collector

0 comments on commit fb2d253

Please sign in to comment.