Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(mdbx): record backtrace of read transactions and log on timeout #13707

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 32 additions & 20 deletions crates/storage/libmdbx-rs/src/txn_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ mod read_transactions {
};
use dashmap::{DashMap, DashSet};
use std::{
backtrace::Backtrace,
sync::{mpsc::sync_channel, Arc},
time::{Duration, Instant},
};
Expand Down Expand Up @@ -148,11 +149,10 @@ mod read_transactions {
}

/// Removes a transaction from the list of active read transactions.
pub(crate) fn remove_active_read_transaction(
&self,
ptr: *mut ffi::MDBX_txn,
) -> Option<(usize, (TransactionPtr, Instant))> {
self.read_transactions.as_ref()?.remove_active(ptr)
///
/// Returns `true` if the transaction was found and removed.
pub(crate) fn remove_active_read_transaction(&self, ptr: *mut ffi::MDBX_txn) -> bool {
self.read_transactions.as_ref().is_some_and(|txs| txs.remove_active(ptr))
}

/// Returns the number of timed out transactions that were not aborted by the user yet.
Expand All @@ -172,7 +172,10 @@ mod read_transactions {
///
/// We store `usize` instead of a raw pointer as a key, because pointers are not
/// comparable. The time of transaction opening is stored as a value.
active: DashMap<usize, (TransactionPtr, Instant)>,
///
/// The backtrace of the transaction opening is recorded only when debug assertions are
/// enabled.
active: DashMap<usize, (TransactionPtr, Instant, Option<Arc<Backtrace>>)>,
/// List of timed out transactions that were not aborted by the user yet, hence have a
/// dangling read transaction pointer.
timed_out_not_aborted: DashSet<usize>,
Expand All @@ -185,16 +188,20 @@ mod read_transactions {

/// Adds a new transaction to the list of active read transactions.
pub(super) fn add_active(&self, ptr: *mut ffi::MDBX_txn, tx: TransactionPtr) {
let _ = self.active.insert(ptr as usize, (tx, Instant::now()));
let _ = self.active.insert(
ptr as usize,
(
tx,
Instant::now(),
cfg!(debug_assertions).then(|| Arc::new(Backtrace::force_capture())),
),
);
}

/// Removes a transaction from the list of active read transactions.
pub(super) fn remove_active(
&self,
ptr: *mut ffi::MDBX_txn,
) -> Option<(usize, (TransactionPtr, Instant))> {
pub(super) fn remove_active(&self, ptr: *mut ffi::MDBX_txn) -> bool {
self.timed_out_not_aborted.remove(&(ptr as usize));
self.active.remove(&(ptr as usize))
self.active.remove(&(ptr as usize)).is_some()
}

/// Returns the number of timed out transactions that were not aborted by the user yet.
Expand All @@ -215,7 +222,7 @@ mod read_transactions {
// Iterate through active read transactions and time out those that's open for
// longer than `self.max_duration`.
for entry in &self.active {
let (tx, start) = entry.value();
let (tx, start, backtrace) = entry.value();
let duration = now - *start;

if duration > self.max_duration {
Expand All @@ -241,10 +248,15 @@ mod read_transactions {
// Add the transaction to `timed_out_active`. We can't remove it
// instantly from the list of active transactions, because we
// iterate through it.
timed_out_active.push((txn_ptr, duration, error));
timed_out_active.push((
txn_ptr,
duration,
backtrace.clone(),
error,
));
}
Err(err) => {
error!(target: "libmdbx", %err, "Failed to abort the long-lived read transaction")
error!(target: "libmdbx", %err, ?backtrace, "Failed to abort the long-lived read transaction")
}
}
} else {
Expand All @@ -256,18 +268,18 @@ mod read_transactions {

// Walk through timed out transactions, and delete them from the list of active
// transactions.
for (ptr, open_duration, err) in timed_out_active.iter().copied() {
for (ptr, open_duration, backtrace, err) in timed_out_active.iter().cloned() {
// Try deleting the transaction from the list of active transactions.
let was_in_active = self.remove_active(ptr).is_some();
let was_in_active = self.remove_active(ptr);
if let Err(err) = err {
if was_in_active {
// If the transaction was in the list of active transactions,
// then user didn't abort it and we failed to do so.
error!(target: "libmdbx", %err, ?open_duration, "Failed to time out the long-lived read transaction");
error!(target: "libmdbx", %err, ?open_duration, ?backtrace, "Failed to time out the long-lived read transaction");
}
} else {
// Happy path, the transaction has been timed out by us with no errors.
warn!(target: "libmdbx", ?open_duration, "Long-lived read transaction has been timed out");
warn!(target: "libmdbx", ?open_duration, ?backtrace, "Long-lived read transaction has been timed out");
// Add transaction to the list of timed out transactions that were not
// aborted by the user yet.
self.timed_out_not_aborted.insert(ptr as usize);
Expand All @@ -283,7 +295,7 @@ mod read_transactions {
target: "libmdbx",
elapsed = ?now.elapsed(),
active = ?self.active.iter().map(|entry| {
let (tx, start) = entry.value();
let (tx, start, _) = entry.value();
(tx.clone(), start.elapsed())
}).collect::<Vec<_>>(),
"Read transactions"
Expand Down
Loading