Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ROX-26025: Sanitize UTF-8 strings in process signals #1857

Merged
merged 9 commits into from
Sep 23, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 31 additions & 21 deletions collector/lib/ProcessSignalFormatter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ std::string extract_proc_args(sinsp_threadinfo* tinfo) {
}
std::ostringstream args;
for (auto it = tinfo->m_args.begin(); it != tinfo->m_args.end();) {
args << *it++;
Holder<const std::string> arg = SanitizeUTF8(*it++);
args << *arg;
if (it != tinfo->m_args.end()) {
args << " ";
}
Expand Down Expand Up @@ -111,26 +112,34 @@ ProcessSignal* ProcessSignalFormatter::CreateProcessSignal(sinsp_evt* event) {
// set id
signal->set_id(UUIDStr());

const std::string* name = event_extractor_->get_comm(event);
const std::string* exepath = event_extractor_->get_exepath(event);
Holder<const std::string> name;
Holder<const std::string> exepath;

// set name (if name is missing or empty, try to use exec_file_path)
if (name && !name->empty() && *name != "<NA>") {
if (const std::string* nameptr = event_extractor_->get_comm(event)) {
name = SanitizeUTF8(*nameptr);
}
if (const std::string* exepathptr = event_extractor_->get_exepath(event)) {
exepath = SanitizeUTF8(*exepathptr);
}

// set name (if name is empty, try to use exec_file_path)
if (!(*name).empty() && *name != "<NA>") {
signal->set_name(*name);
} else if (exepath && !exepath->empty() && *exepath != "<NA>") {
} else if (!(*exepath).empty() && *exepath != "<NA>") {
signal->set_name(*exepath);
}

// set exec_file_path (if exec_file_path is missing or empty, try to use name)
if (exepath && !exepath->empty() && *exepath != "<NA>") {
// set exec_file_path (if exec_file_path is empty, try to use name)
if (!(*exepath).empty() && *exepath != "<NA>") {
signal->set_exec_file_path(*exepath);
} else if (name && !name->empty() && *name != "<NA>") {
} else if (!(*name).empty() && *name != "<NA>") {
signal->set_exec_file_path(*name);
}

// set process arguments
if (const char* args = event_extractor_->get_proc_args(event)) {
signal->set_args(args);
std::string args_str(args);
signal->set_args(*SanitizeUTF8(args_str));
}

// set pid
Expand Down Expand Up @@ -161,7 +170,8 @@ ProcessSignal* ProcessSignalFormatter::CreateProcessSignal(sinsp_evt* event) {
this->GetProcessLineage(event->get_thread_info(), lineage);
for (const auto& p : lineage) {
auto signal_lineage = signal->add_lineage_info();
signal_lineage->set_parent_exec_file_path(p.parent_exec_file_path());
auto parent_exec_file_path = SanitizeUTF8(p.parent_exec_file_path());
signal_lineage->set_parent_exec_file_path(*parent_exec_file_path);
signal_lineage->set_parent_uid(p.parent_uid());
}

Expand All @@ -179,21 +189,21 @@ ProcessSignal* ProcessSignalFormatter::CreateProcessSignal(sinsp_threadinfo* tin
// set id
signal->set_id(UUIDStr());

const auto& name = tinfo->m_comm;
const auto& exepath = tinfo->m_exepath;
Holder<const std::string> name = SanitizeUTF8(tinfo->m_comm);
Holder<const std::string> exepath = SanitizeUTF8(tinfo->m_exepath);

// set name (if name is missing or empty, try to use exec_file_path)
if (!name.empty() && name != "<NA>") {
signal->set_name(name);
} else if (!exepath.empty() && exepath != "<NA>") {
signal->set_name(exepath);
if (!(*name).empty() && *name != "<NA>") {
signal->set_name(*name);
} else if (!(*exepath).empty() && *exepath != "<NA>") {
signal->set_name(*exepath);
}

// set exec_file_path (if exec_file_path is missing or empty, try to use name)
if (!exepath.empty() && exepath != "<NA>") {
signal->set_exec_file_path(exepath);
} else if (!name.empty() && name != "<NA>") {
signal->set_exec_file_path(name);
if (!(*exepath).empty() && *exepath != "<NA>") {
signal->set_exec_file_path(*exepath);
} else if (!(*name).empty() && *name != "<NA>") {
signal->set_exec_file_path(*name);
}

// set the process as coming from a scrape as opposed to an exec
Expand Down
16 changes: 16 additions & 0 deletions collector/lib/Utility.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ extern "C" {

#include <libsinsp/sinsp.h>

#include <google/protobuf/stubs/common.h>

#include "HostInfo.h"
#include "Logging.h"
#include "Utility.h"
Expand Down Expand Up @@ -230,4 +232,18 @@ std::optional<std::string_view> ExtractContainerIDFromCgroup(std::string_view cg
}
return std::make_optional(container_id_part.substr(0, SHORT_CONTAINER_ID_LENGTH));
}

Holder<const std::string> SanitizeUTF8(const std::string& str) {
std::unique_ptr<char[]> work_buffer(new char[str.size()]);
char* sanitized;

sanitized = google::protobuf::internal::UTF8CoerceToStructurallyValid(str, work_buffer.get(), '?');

// we do this to avoid copying if unnecessary
if (sanitized == work_buffer.get()) {
return Holder<const std::string>::copy(sanitized);
}
return Holder<const std::string>(&str);
}

} // namespace collector
30 changes: 30 additions & 0 deletions collector/lib/Utility.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,36 @@ ScopedLock<Mutex> Lock(Mutex& mutex) {
#define ssizeof(x) static_cast<ssize_t>(sizeof(x))

std::optional<std::string_view> ExtractContainerIDFromCgroup(std::string_view cgroup);

// A Holder refers to an object that it can own, or not.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This sounds like a lifetime nightmare. Can't we do something similar to what protobuf does and simply use pointers? Or maybe make SanitizeUTF8 return a std::optional<std::string>? That way if the return value is empty it would mean the original string is valid and, if it is not empty, it will hold the sanitized string.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see your point, I wasn't convinced myself.

The problem is with the immutability of std::string, which makes you copy and copy again. I will figure out a more maintainable way.

template <class T>
class Holder {
public:
// Make a copy of the provided object.
// The copy will be destroyed when this Holder is destroyed.
static Holder<T> copy(T& v) {
Holder h = Holder();
h.m_owned.emplace(v);
return h;
}

// Make an external reference to the provided object.
// This Holder does not own this object.
Holder(T* external) : m_external(external) {}

T& operator*() {
return m_owned ? *m_owned : *m_external;
}

Holder() : m_external(0), m_owned(T()) {}

private:
T* m_external;
std::optional<std::remove_const_t<T>> m_owned;
};

Holder<const std::string> SanitizeUTF8(const std::string& str);

} // namespace collector

#endif // _UTILITY_H_
Loading