From 5a0700ad88b6ca9134b4da83a597f7fd3af0fb5c Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Wed, 28 Feb 2024 14:09:00 -0800 Subject: [PATCH 1/3] Force commit disabling even if we can't get the lock --- sqlitecluster/SQLite.cpp | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/sqlitecluster/SQLite.cpp b/sqlitecluster/SQLite.cpp index 3cea1eaa3..7429356e6 100644 --- a/sqlitecluster/SQLite.cpp +++ b/sqlitecluster/SQLite.cpp @@ -701,7 +701,16 @@ int SQLite::commit(const string& description, function* preCheckpointCal _conflictPage = 0; uint64_t before = STimeNow(); uint64_t beforeCommit = STimeNow(); + + // This is a last-second check in case we hit the exceptional case where we were unable to acquire the lock when disabling commits. + // If we get here, things are not going well but this minimizes the chance that we'll commit a transaction that won't get broadcast to peers. + if (!_sharedData._commitEnabled) { + return COMMIT_DISABLED; + } result = SQuery(_db, "committing db transaction", "COMMIT"); + if (!_sharedData._commitEnabled) { + SALERT("This transaction was committed while commits were disabled, it may or may not get sent to peers."); + } _lastConflictPage = _conflictPage; if (_lastConflictPage) { SINFO("part of last conflcit page: " << _lastConflictPage); @@ -1114,8 +1123,28 @@ _commitLockTimer("commit lock timer", { { } void SQLite::SharedData::setCommitEnabled(bool enable) { - lock_guard lock(commitLock); - _commitEnabled = enable; + if (enable) { + lock_guard lock(commitLock); + _commitEnabled = true; + } else { + // If we are disabling commits, we are standing down, and we don't want to be stuck in the back of a giant queue of commands to be able to do this, + // because it blocks the sync thread and prevents the server from standing down at all. + unique_lock lock(commitLock, defer_lock); + bool locked = lock.try_lock_for(3s); + if (!locked) { + // This may result in a commit that happens after we've stood down, and thus a forked node. + // However, waiting indefinitely on this may result in a timeout on the whole cluster, and then a forked node anyway. + SALERT("Could not acquire commit lock to disable commits. Doing it anyway!"); + } + + _commitEnabled = false; + + // This is even hackier, but the idea is that if we set this value in the middle of a commit happening, that commit can finish even though + // commits are off. When we return from this function, we're going to want to send all outstanding commits to followers, so we block here for a + // couple seconds just to allow a final commit to finish. This doesn't guarantee anything, but it's really just trying to reduce the surface area for failure. + sleep(3); + SINFO("Done sleeping after failing to acquire commit lock."); + } } void SQLite::SharedData::incrementCommit(const string& commitHash) { From af8feb3f10d0dee689f55085ff262886bece025a Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Wed, 28 Feb 2024 15:19:51 -0800 Subject: [PATCH 2/3] Simpler solution to this. --- sqlitecluster/SQLite.cpp | 36 ++++++------------------------------ sqlitecluster/SQLiteNode.cpp | 4 ++++ 2 files changed, 10 insertions(+), 30 deletions(-) diff --git a/sqlitecluster/SQLite.cpp b/sqlitecluster/SQLite.cpp index 7429356e6..d8d9d7af8 100644 --- a/sqlitecluster/SQLite.cpp +++ b/sqlitecluster/SQLite.cpp @@ -701,16 +701,7 @@ int SQLite::commit(const string& description, function* preCheckpointCal _conflictPage = 0; uint64_t before = STimeNow(); uint64_t beforeCommit = STimeNow(); - - // This is a last-second check in case we hit the exceptional case where we were unable to acquire the lock when disabling commits. - // If we get here, things are not going well but this minimizes the chance that we'll commit a transaction that won't get broadcast to peers. - if (!_sharedData._commitEnabled) { - return COMMIT_DISABLED; - } result = SQuery(_db, "committing db transaction", "COMMIT"); - if (!_sharedData._commitEnabled) { - SALERT("This transaction was committed while commits were disabled, it may or may not get sent to peers."); - } _lastConflictPage = _conflictPage; if (_lastConflictPage) { SINFO("part of last conflcit page: " << _lastConflictPage); @@ -1123,28 +1114,13 @@ _commitLockTimer("commit lock timer", { { } void SQLite::SharedData::setCommitEnabled(bool enable) { - if (enable) { - lock_guard lock(commitLock); - _commitEnabled = true; - } else { - // If we are disabling commits, we are standing down, and we don't want to be stuck in the back of a giant queue of commands to be able to do this, - // because it blocks the sync thread and prevents the server from standing down at all. - unique_lock lock(commitLock, defer_lock); - bool locked = lock.try_lock_for(3s); - if (!locked) { - // This may result in a commit that happens after we've stood down, and thus a forked node. - // However, waiting indefinitely on this may result in a timeout on the whole cluster, and then a forked node anyway. - SALERT("Could not acquire commit lock to disable commits. Doing it anyway!"); - } - - _commitEnabled = false; - - // This is even hackier, but the idea is that if we set this value in the middle of a commit happening, that commit can finish even though - // commits are off. When we return from this function, we're going to want to send all outstanding commits to followers, so we block here for a - // couple seconds just to allow a final commit to finish. This doesn't guarantee anything, but it's really just trying to reduce the surface area for failure. - sleep(3); - SINFO("Done sleeping after failing to acquire commit lock."); + if (commitEnabled == enable) { + // Exit early without grabbing the lock. It's possible during highly congested times for getting the lock to take long enough to time out the cluster. + return; } + + lock_guard lock(commitLock); + _commitEnabled = enable; } void SQLite::SharedData::incrementCommit(const string& commitHash) { diff --git a/sqlitecluster/SQLiteNode.cpp b/sqlitecluster/SQLiteNode.cpp index 2a0c7e31b..4c3ee027c 100644 --- a/sqlitecluster/SQLiteNode.cpp +++ b/sqlitecluster/SQLiteNode.cpp @@ -1153,6 +1153,7 @@ bool SQLiteNode::update() { // We're no longer waiting on responses from peers, we can re-update immediately and start becoming a // follower node instead. + // If we get here in our `while(update)` loop, what happens? return true; } break; @@ -1975,10 +1976,12 @@ void SQLiteNode::_changeState(SQLiteNodeState newState) { // Turn off commits. This prevents late commits coming in right after we call `_sendOutstandingTransactions` // below, which otherwise could get committed on leader and not replicated to followers. + // This blocks on the commit lock. That could hurt. _db.setCommitEnabled(false); // We send any unsent transactions here before we finish switching states, we need to make sure these are // all sent to the new leader before we complete the transition. + // It seems like we would have done something here, but we didn't log anything. Maybe there weren't any transactions? _sendOutstandingTransactions(); } @@ -1995,6 +1998,7 @@ void SQLiteNode::_changeState(SQLiteNodeState newState) { // Re-enable commits if they were disabled during a previous stand-down. if (newState != SQLiteNodeState::SEARCHING) { + // Could have hit this. _db.setCommitEnabled(true); } From 52998bfd71eda2fdc31fe95db400ade0b91eab4e Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Wed, 28 Feb 2024 15:25:54 -0800 Subject: [PATCH 3/3] oops, copy-paste typo --- sqlitecluster/SQLite.cpp | 2 +- sqlitecluster/SQLiteNode.cpp | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/sqlitecluster/SQLite.cpp b/sqlitecluster/SQLite.cpp index d8d9d7af8..461b14717 100644 --- a/sqlitecluster/SQLite.cpp +++ b/sqlitecluster/SQLite.cpp @@ -1114,7 +1114,7 @@ _commitLockTimer("commit lock timer", { { } void SQLite::SharedData::setCommitEnabled(bool enable) { - if (commitEnabled == enable) { + if (_commitEnabled == enable) { // Exit early without grabbing the lock. It's possible during highly congested times for getting the lock to take long enough to time out the cluster. return; } diff --git a/sqlitecluster/SQLiteNode.cpp b/sqlitecluster/SQLiteNode.cpp index 4c3ee027c..2a0c7e31b 100644 --- a/sqlitecluster/SQLiteNode.cpp +++ b/sqlitecluster/SQLiteNode.cpp @@ -1153,7 +1153,6 @@ bool SQLiteNode::update() { // We're no longer waiting on responses from peers, we can re-update immediately and start becoming a // follower node instead. - // If we get here in our `while(update)` loop, what happens? return true; } break; @@ -1976,12 +1975,10 @@ void SQLiteNode::_changeState(SQLiteNodeState newState) { // Turn off commits. This prevents late commits coming in right after we call `_sendOutstandingTransactions` // below, which otherwise could get committed on leader and not replicated to followers. - // This blocks on the commit lock. That could hurt. _db.setCommitEnabled(false); // We send any unsent transactions here before we finish switching states, we need to make sure these are // all sent to the new leader before we complete the transition. - // It seems like we would have done something here, but we didn't log anything. Maybe there weren't any transactions? _sendOutstandingTransactions(); } @@ -1998,7 +1995,6 @@ void SQLiteNode::_changeState(SQLiteNodeState newState) { // Re-enable commits if they were disabled during a previous stand-down. if (newState != SQLiteNodeState::SEARCHING) { - // Could have hit this. _db.setCommitEnabled(true); }