Skip to content

Commit

Permalink
Merge pull request #1654 from Expensify/tyler-fix-timeouts
Browse files Browse the repository at this point in the history
Notice when leader dies.
  • Loading branch information
coleaeason authored Feb 23, 2024
2 parents 86f4b43 + 627c913 commit 5cb2988
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 4 deletions.
11 changes: 9 additions & 2 deletions libstuff/SSignal.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#include "libstuff.h"

#include <sqlitecluster/SQLiteNode.h>
#include <execinfo.h>
#include <fcntl.h>
#include <signal.h>
Expand Down Expand Up @@ -202,12 +202,19 @@ void _SSignal_StackTrace(int signum, siginfo_t *info, void *ucontext) {
SWARN("Calling DIE function.");
SSignalHandlerDieFunc();
SSignalHandlerDieFunc = [](){};
SWARN("DIE function returned, aborting (if not done).");
SWARN("DIE function returned.");
if (SQLiteNode::KILLABLE_SQLITE_NODE) {
SWARN("Killing peer connections.");
SQLiteNode::KILLABLE_SQLITE_NODE->kill();
}
}

// If we weren't already in ABORT, we'll call that. The second call will skip the above callstack generation.
if (signum != SIGABRT) {
SWARN("Aborting.");
abort();
} else {
SWARN("Already in ABORT.");
}
} else {
SALERT("Non-signal thread got signal " << strsignal(signum) << "(" << signum << "), which wasn't expected");
Expand Down
10 changes: 10 additions & 0 deletions sqlitecluster/SQLiteNode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@
#undef SLOGPREFIX
#define SLOGPREFIX "{" << _name << "/" << SQLiteNode::stateName(_state) << "} "

// Starts out null. The SQLiteNode constructor points this at the node being constructed, and the signal handling code
// checks it for null before calling kill() through it.
SQLiteNode* SQLiteNode::KILLABLE_SQLITE_NODE{nullptr};

// Initializations for static vars.
const uint64_t SQLiteNode::RECV_TIMEOUT{STIME_US_PER_S * 30};

Expand Down Expand Up @@ -145,6 +147,7 @@ SQLiteNode::SQLiteNode(SQLiteServer& server, shared_ptr<SQLitePool> dbPool, cons
_stateTimeout(STimeNow() + firstTimeout),
_syncPeer(nullptr)
{
KILLABLE_SQLITE_NODE = this;
SASSERT(_originalPriority >= 0);
onPrepareHandlerEnabled = false;

Expand Down Expand Up @@ -2716,3 +2719,10 @@ SQLiteNodeState SQLiteNode::stateFromName(const string& name) {
return it->second;
}
}

void SQLiteNode::kill() {
for (SQLitePeer* peer : _peerList) {
SWARN("Killing peer: " << peer->name);
peer->reset();
}
}
7 changes: 7 additions & 0 deletions sqlitecluster/SQLiteNode.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@ class SQLiteNode : public STCPManager {
NUM_CONSISTENCY_LEVELS
};

// This is a globally accessible pointer to some node instance. The intention here is to let signal handling code attempt to kill outstanding
// peer connections on this node before shutting down.
static SQLiteNode* KILLABLE_SQLITE_NODE;

// Receive timeout for cluster messages.
static const uint64_t RECV_TIMEOUT;

Expand Down Expand Up @@ -152,6 +156,9 @@ class SQLiteNode : public STCPManager {
// Call this if you want to shut down the node.
void beginShutdown();

// Kill all peer connections on this node. Intended for signal handling code to call via KILLABLE_SQLITE_NODE.
void kill();

// Handle any read/write events that occurred.
void postPoll(fd_map& fdm, uint64_t& nextActivity);

Expand Down
3 changes: 1 addition & 2 deletions sqlitecluster/SQLitePeer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,7 @@ SQLitePeer::PeerPostPollStatus SQLitePeer::postPoll(fd_map& fdm, uint64_t& nextA
switch (socket->state.load()) {
case STCPManager::Socket::CONNECTED: {
// socket->lastRecvTime is always set, it's initialized to STimeNow() at creation.
auto lastActivityTime = max(socket->lastSendTime, socket->lastRecvTime);
if (lastActivityTime + SQLiteNode::RECV_TIMEOUT < STimeNow()) {
if (socket->lastRecvTime + SQLiteNode::RECV_TIMEOUT < STimeNow()) {
SHMMM("Connection with peer '" << name << "' timed out.");
return PeerPostPollStatus::SOCKET_ERROR;
}
Expand Down

0 comments on commit 5cb2988

Please sign in to comment.