From b0b4cb13932ec4d8142f7894a8e9edf4973434ac Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin <5773434+shekhirin@users.noreply.github.com> Date: Tue, 7 Jan 2025 13:34:53 +0000 Subject: [PATCH] feat(mdbx): record backtrace of read transactions and log on timeout --- crates/storage/libmdbx-rs/src/txn_manager.rs | 52 ++++++++++++-------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/crates/storage/libmdbx-rs/src/txn_manager.rs b/crates/storage/libmdbx-rs/src/txn_manager.rs index 817f178cda4c..4fdaddc82081 100644 --- a/crates/storage/libmdbx-rs/src/txn_manager.rs +++ b/crates/storage/libmdbx-rs/src/txn_manager.rs @@ -110,6 +110,7 @@ mod read_transactions { }; use dashmap::{DashMap, DashSet}; use std::{ + backtrace::Backtrace, sync::{mpsc::sync_channel, Arc}, time::{Duration, Instant}, }; @@ -148,11 +149,10 @@ mod read_transactions { } /// Removes a transaction from the list of active read transactions. - pub(crate) fn remove_active_read_transaction( - &self, - ptr: *mut ffi::MDBX_txn, - ) -> Option<(usize, (TransactionPtr, Instant))> { - self.read_transactions.as_ref()?.remove_active(ptr) + /// + /// Returns `true` if the transaction was found and removed. + pub(crate) fn remove_active_read_transaction(&self, ptr: *mut ffi::MDBX_txn) -> bool { + self.read_transactions.as_ref().is_some_and(|txs| txs.remove_active(ptr)) } /// Returns the number of timed out transactions that were not aborted by the user yet. @@ -172,7 +172,10 @@ mod read_transactions { /// /// We store `usize` instead of a raw pointer as a key, because pointers are not /// comparable. The time of transaction opening is stored as a value. - active: DashMap, + /// + /// The backtrace of the transaction opening is recorded only when debug assertions are + /// enabled. + active: DashMap>)>, /// List of timed out transactions that were not aborted by the user yet, hence have a /// dangling read transaction pointer. timed_out_not_aborted: DashSet, @@ -185,16 +188,20 @@ mod read_transactions { /// Adds a new transaction to the list of active read transactions. pub(super) fn add_active(&self, ptr: *mut ffi::MDBX_txn, tx: TransactionPtr) { - let _ = self.active.insert(ptr as usize, (tx, Instant::now())); + let _ = self.active.insert( + ptr as usize, + ( + tx, + Instant::now(), + cfg!(debug_assertions).then(|| Arc::new(Backtrace::force_capture())), + ), + ); } /// Removes a transaction from the list of active read transactions. - pub(super) fn remove_active( - &self, - ptr: *mut ffi::MDBX_txn, - ) -> Option<(usize, (TransactionPtr, Instant))> { + pub(super) fn remove_active(&self, ptr: *mut ffi::MDBX_txn) -> bool { self.timed_out_not_aborted.remove(&(ptr as usize)); - self.active.remove(&(ptr as usize)) + self.active.remove(&(ptr as usize)).is_some() } /// Returns the number of timed out transactions that were not aborted by the user yet. @@ -215,7 +222,7 @@ mod read_transactions { // Iterate through active read transactions and time out those that's open for // longer than `self.max_duration`. for entry in &self.active { - let (tx, start) = entry.value(); + let (tx, start, backtrace) = entry.value(); let duration = now - *start; if duration > self.max_duration { @@ -241,10 +248,15 @@ mod read_transactions { // Add the transaction to `timed_out_active`. We can't remove it // instantly from the list of active transactions, because we // iterate through it. - timed_out_active.push((txn_ptr, duration, error)); + timed_out_active.push(( + txn_ptr, + duration, + backtrace.clone(), + error, + )); } Err(err) => { - error!(target: "libmdbx", %err, "Failed to abort the long-lived read transaction") + error!(target: "libmdbx", %err, ?backtrace, "Failed to abort the long-lived read transaction") } } } else { @@ -256,18 +268,18 @@ mod read_transactions { // Walk through timed out transactions, and delete them from the list of active // transactions. - for (ptr, open_duration, err) in timed_out_active.iter().copied() { + for (ptr, open_duration, backtrace, err) in timed_out_active.iter().cloned() { // Try deleting the transaction from the list of active transactions. - let was_in_active = self.remove_active(ptr).is_some(); + let was_in_active = self.remove_active(ptr); if let Err(err) = err { if was_in_active { // If the transaction was in the list of active transactions, // then user didn't abort it and we failed to do so. - error!(target: "libmdbx", %err, ?open_duration, "Failed to time out the long-lived read transaction"); + error!(target: "libmdbx", %err, ?open_duration, ?backtrace, "Failed to time out the long-lived read transaction"); } } else { // Happy path, the transaction has been timed out by us with no errors. - warn!(target: "libmdbx", ?open_duration, "Long-lived read transaction has been timed out"); + warn!(target: "libmdbx", ?open_duration, ?backtrace, "Long-lived read transaction has been timed out"); // Add transaction to the list of timed out transactions that were not // aborted by the user yet. self.timed_out_not_aborted.insert(ptr as usize); @@ -283,7 +295,7 @@ mod read_transactions { target: "libmdbx", elapsed = ?now.elapsed(), active = ?self.active.iter().map(|entry| { - let (tx, start) = entry.value(); + let (tx, start, _) = entry.value(); (tx.clone(), start.elapsed()) }).collect::>(), "Read transactions"