From bf322695b20b678ddd22252be69564de7a0a5e85 Mon Sep 17 00:00:00 2001 From: Chris Lalancette Date: Tue, 29 Oct 2024 14:52:01 +0000 Subject: [PATCH 1/3] Additional fixes for yadu/raii-client. Signed-off-by: Chris Lalancette --- rmw_zenoh_cpp/src/detail/rmw_client_data.cpp | 104 ++++++++++++------- 1 file changed, 64 insertions(+), 40 deletions(-) diff --git a/rmw_zenoh_cpp/src/detail/rmw_client_data.cpp b/rmw_zenoh_cpp/src/detail/rmw_client_data.cpp index a271b310..b009e9c3 100644 --- a/rmw_zenoh_cpp/src/detail/rmw_client_data.cpp +++ b/rmw_zenoh_cpp/src/detail/rmw_client_data.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include "attachment_helpers.hpp" #include "cdr.hpp" @@ -45,14 +46,15 @@ namespace rmw_zenoh_cpp // way to cancel a query once it is in-flight via the z_get() zenoh-c API. Thus, if an // rmw_zenoh_cpp user does rmw_create_client(), rmw_send_request(), rmw_destroy_client(), but the // query comes in after the rmw_destroy_client(), rmw_zenoh_cpp could access already-freed memory. -// The next 2 variables are used to avoid that situation. Any time a query is initiated via -// rmw_send_request(), num_in_flight_ is incremented. When the Zenoh calls the callback for the -// query reply, num_in_flight_ is decremented. -// When shutdown() is called, is_shutdown_ is set to true. -// If num_in_flight_ is 0, the data associated with this structure is freed. -// If num_in_flight_ is *not* 0, then the data associated with this structure is maintained. -// In the situation where is_shutdown_ is true, and num_in_flight_ drops to 0 in the query -// callback, the query callback will free up the structure. +// The next 3 global variables are used to avoid that situation. Any time a query is initiated via +// rmw_send_request(), num_in_flight_ is incremented. When Zenoh calls the callback for the +// query drop, num_in_flight_map->second is decremented. +// When ClientData is destroyed, it checks to see if there are things in flight. If there are, +// it leaves this ClientData pointer both in the num_in_flight_map and the deleted_clients map. +// When the client_data_handler() is called on these destroyed objects, it knows that it cannot +// dereference the data anymore, and it gets out early. When client_data_drop() is called, it +// decrements num_in_flight_map->second, and if that drops to zero, drops the pointer address +// completely from deleted_clients. // // There is one case which is not handled by this, which has to do with timeouts. The query // timeout is currently set to essentially infinite. Thus, if a query is in-flight but never @@ -65,6 +67,7 @@ namespace rmw_zenoh_cpp static std::mutex num_in_flight_mutex; static std::unordered_map num_in_flight_map = {}; static std::unordered_set deleted_clients = {}; + ///============================================================================= void client_data_handler(z_owned_reply_t * reply, void * data) { @@ -77,8 +80,6 @@ void client_data_handler(z_owned_reply_t * reply, void * data) return; } - // client_data could be a dangling pointer if this callback was triggered after - // the last reference to ClientData::SharedPtr was dropped from NodeData. std::lock_guard lock(num_in_flight_mutex); if (deleted_clients.count(client_data) > 0) { RMW_ZENOH_LOG_INFO_NAMED( @@ -90,12 +91,6 @@ void client_data_handler(z_owned_reply_t * reply, void * data) // See the comment about the "num_in_flight" class variable in the ClientData class for // why we need to do this. - // Note: This called could lead to UB since the ClientData * could have been deallocated - // if the query reply is received after NodeData was destructed. ie even though we do not - // erase the ClientData from NodeData's clients_ map when rmw_destroy_client is called, - // the clients_ maps would get erased when the NodeData's destructor is invoked. - // This is an edge case that should be resolved once we switch to zenoh-cpp and capture - // weaK_ptr in this callback instead. if (client_data->is_shutdown()) { return; } @@ -138,15 +133,20 @@ void client_data_drop(void * data) // See the comment about the "num_in_flight" class variable in the ClientData class for // why we need to do this. - // Note: This called could lead to UB since the ClientData * could have been deallocated - // if the query reply is received after NodeData was destructed. ie even though we do not - // erase the ClientData from NodeData's clients_ map when rmw_destroy_client is called, - // the clients_ maps would get erased when the NodeData's destructor is invoked. - // This is an edge case that should be resolved once we switch to zenoh-cpp and capture - // weaK_ptr in this callback instead. + std::lock_guard lock(num_in_flight_mutex); auto num_in_flight_it = num_in_flight_map.find(client_data); - if (num_in_flight_it != num_in_flight_map.end()) { - --num_in_flight_it->second; + if (num_in_flight_it == num_in_flight_map.end()) { + // This should never happen + RMW_ZENOH_LOG_ERROR_NAMED( + "rmw_zenoh_cpp", + "Unable to find object in num_in_flight_map." + ); + return; + } + + --num_in_flight_it->second; + if (num_in_flight_it->second == 0) { + deleted_clients.erase(client_data); } } @@ -233,15 +233,21 @@ std::shared_ptr ClientData::make( return nullptr; } - auto client_data = std::shared_ptr( - new ClientData{ - node, - std::move(entity), - request_members, - response_members, - std::move(request_type_support), - std::move(response_type_support) - }); + std::lock_guard lock(num_in_flight_mutex); + std::vector> duplicate_pointers; + std::shared_ptr client_data; + do { + client_data = std::shared_ptr( + new ClientData{ + node, + std::move(entity), + request_members, + response_members, + std::move(request_type_support), + std::move(response_type_support) + }); + duplicate_pointers.push_back(client_data); + } while (deleted_clients.count(client_data.get()) > 0); client_data->keyexpr_ = z_keyexpr_new(client_data->entity_->topic_info().value().topic_keyexpr_.c_str()); @@ -273,9 +279,7 @@ std::shared_ptr ClientData::make( free_ros_keyexpr.cancel(); free_token.cancel(); - // Erase from deleted_clients set if the memory address for the client data is reused. - num_in_flight_map.erase(client_data.get()); - deleted_clients.erase(client_data.get()); + num_in_flight_map[client_data.get()] = 0; return client_data; } @@ -495,8 +499,9 @@ rmw_ret_t ClientData::send_request( std::lock_guard lock(num_in_flight_mutex); auto num_in_flight_it = num_in_flight_map.find(this); if (num_in_flight_it == num_in_flight_map.end()) { - num_in_flight_map[this] = 0; - num_in_flight_it = num_in_flight_map.find(this); + // This should never happen + RMW_SET_ERROR_MSG("failed to find object in num_in_flight_map"); + return RMW_RET_ERROR; } num_in_flight_it->second++; } @@ -536,9 +541,28 @@ ClientData::~ClientData() entity_->topic_info().value().name_.c_str() ); } + std::lock_guard lock(num_in_flight_mutex); - num_in_flight_map.erase(this); - deleted_clients.insert(this); + auto num_in_flight_it = num_in_flight_map.find(this); + if (num_in_flight_it != num_in_flight_map.end()) { + if (num_in_flight_it->second == 0) { + // If there is nothing in flight, we can remove this from the map + // with no further considerations. + num_in_flight_map.erase(this); + } else { + // Since there is still something in flight, we need to just add + // it to the deleted_clients; it will be deleted when the last + // outstanding query finishes. + deleted_clients.insert(this); + } + } else { + // This should never happen + RMW_ZENOH_LOG_ERROR_NAMED( + "rmw_zenoh_cpp", + "Error finding client /%s in num_in_flight_map.", + entity_->topic_info().value().name_.c_str() + ); + } } //============================================================================== From b276dd2eda1344c2b1f277e2f8df031912df89ec Mon Sep 17 00:00:00 2001 From: Chris Lalancette Date: Thu, 31 Oct 2024 09:00:10 -0400 Subject: [PATCH 2/3] Apply suggestions from code review Co-authored-by: yadunund Signed-off-by: Chris Lalancette --- rmw_zenoh_cpp/src/detail/rmw_client_data.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/rmw_zenoh_cpp/src/detail/rmw_client_data.cpp b/rmw_zenoh_cpp/src/detail/rmw_client_data.cpp index b009e9c3..ccd9252d 100644 --- a/rmw_zenoh_cpp/src/detail/rmw_client_data.cpp +++ b/rmw_zenoh_cpp/src/detail/rmw_client_data.cpp @@ -139,13 +139,13 @@ void client_data_drop(void * data) // This should never happen RMW_ZENOH_LOG_ERROR_NAMED( "rmw_zenoh_cpp", - "Unable to find object in num_in_flight_map." + "Unable to find object in num_in_flight_map. Report this bug." ); return; } --num_in_flight_it->second; - if (num_in_flight_it->second == 0) { + if (num_in_flight_it->second == 0 && deleted_clients.count(client) > 0) { deleted_clients.erase(client_data); } } @@ -240,11 +240,11 @@ std::shared_ptr ClientData::make( client_data = std::shared_ptr( new ClientData{ node, - std::move(entity), + entity, request_members, response_members, - std::move(request_type_support), - std::move(response_type_support) + request_type_support, + response_type_support }); duplicate_pointers.push_back(client_data); } while (deleted_clients.count(client_data.get()) > 0); @@ -559,7 +559,7 @@ ClientData::~ClientData() // This should never happen RMW_ZENOH_LOG_ERROR_NAMED( "rmw_zenoh_cpp", - "Error finding client /%s in num_in_flight_map.", + "Error finding client /%s in num_in_flight_map. Report this bug.", entity_->topic_info().value().name_.c_str() ); } From 0de5712bb285b51a3116975d0f047e8b11866bed Mon Sep 17 00:00:00 2001 From: Chris Lalancette Date: Thu, 31 Oct 2024 13:14:02 +0000 Subject: [PATCH 3/3] Fixups. Signed-off-by: Chris Lalancette --- rmw_zenoh_cpp/src/detail/rmw_client_data.cpp | 14 +++++++------- rmw_zenoh_cpp/src/detail/rmw_client_data.hpp | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/rmw_zenoh_cpp/src/detail/rmw_client_data.cpp b/rmw_zenoh_cpp/src/detail/rmw_client_data.cpp index ccd9252d..f59320c5 100644 --- a/rmw_zenoh_cpp/src/detail/rmw_client_data.cpp +++ b/rmw_zenoh_cpp/src/detail/rmw_client_data.cpp @@ -145,7 +145,7 @@ void client_data_drop(void * data) } --num_in_flight_it->second; - if (num_in_flight_it->second == 0 && deleted_clients.count(client) > 0) { + if (num_in_flight_it->second == 0 && deleted_clients.count(client_data) > 0) { deleted_clients.erase(client_data); } } @@ -178,8 +178,8 @@ std::shared_ptr ClientData::make( service_members->request_members_->data); auto response_members = static_cast( service_members->response_members_->data); - auto request_type_support = std::make_unique(service_members); - auto response_type_support = std::make_unique(service_members); + auto request_type_support = std::make_shared(service_members); + auto response_type_support = std::make_shared(service_members); // Note: Service request/response types will contain a suffix Request_ or Response_. // We remove the suffix when appending the type to the liveliness tokens for @@ -290,14 +290,14 @@ ClientData::ClientData( std::shared_ptr entity, const void * request_type_support_impl, const void * response_type_support_impl, - std::unique_ptr request_type_support, - std::unique_ptr response_type_support) + std::shared_ptr request_type_support, + std::shared_ptr response_type_support) : rmw_node_(rmw_node), entity_(std::move(entity)), request_type_support_impl_(request_type_support_impl), response_type_support_impl_(response_type_support_impl), - request_type_support_(std::move(request_type_support)), - response_type_support_(std::move(response_type_support)), + request_type_support_(request_type_support), + response_type_support_(response_type_support), wait_set_data_(nullptr), sequence_number_(1), is_shutdown_(false) diff --git a/rmw_zenoh_cpp/src/detail/rmw_client_data.hpp b/rmw_zenoh_cpp/src/detail/rmw_client_data.hpp index f4fb0efa..0eb95a61 100644 --- a/rmw_zenoh_cpp/src/detail/rmw_client_data.hpp +++ b/rmw_zenoh_cpp/src/detail/rmw_client_data.hpp @@ -113,8 +113,8 @@ class ClientData final std::shared_ptr entity, const void * request_type_support_impl, const void * response_type_support_impl, - std::unique_ptr request_type_support, - std::unique_ptr response_type_support); + std::shared_ptr request_type_support, + std::shared_ptr response_type_support); // Internal mutex. mutable std::mutex mutex_; @@ -129,8 +129,8 @@ class ClientData final // Type support fields. const void * request_type_support_impl_; const void * response_type_support_impl_; - std::unique_ptr request_type_support_; - std::unique_ptr response_type_support_; + std::shared_ptr request_type_support_; + std::shared_ptr response_type_support_; // Deque to store the replies in the order they arrive. std::deque> reply_queue_; // Wait set data.