Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
qingyang-hu committed Nov 14, 2023
1 parent 0eab23f commit 74db740
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 27 deletions.
4 changes: 0 additions & 4 deletions mongo/integration/unified/unified_spec_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,6 @@ var (
"Write commands with snapshot session do not affect snapshot reads": "Test fails frequently. See GODRIVER-2843",
// TODO(GODRIVER-2943): Fix and unskip this test case.
"Topology lifecycle": "Test times out. See GODRIVER-2943",

"Connection pool clear uses interruptInUseConnections=true after monitor timeout": "Godriver clears after multiple timeout",
"Error returned from connection pool clear with interruptInUseConnections=true is retryable": "Godriver clears after multiple timeout",
"Error returned from connection pool clear with interruptInUseConnections=true is retryable for write": "Godriver clears after multiple timeout",
}

logMessageValidatorTimeout = 10 * time.Millisecond
Expand Down
56 changes: 33 additions & 23 deletions x/mongo/driver/topology/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ func (s *Server) update() {
}
}

timeoutCnt := 0
// timeoutCnt := 0
for {
// Check if the server is disconnecting. Even if waitForNextCheck has already read from the done channel, we
// can safely read from it again because Disconnect closes the channel.
Expand All @@ -614,16 +614,17 @@ func (s *Server) update() {
continue
}

if isShortcut := func() bool {
// Must hold the processErrorLock while updating the server description and clearing the
// pool. Not holding the lock leads to possible out-of-order processing of pool.clear() and
// pool.ready() calls from concurrent server description updates.
s.processErrorLock.Lock()
defer s.processErrorLock.Unlock()

s.updateDescription(desc)
// Retry after the first timeout before clearing the pool in case of a FAAS pause as
// described in GODRIVER-2577.
// if isShortcut := func() bool {
// Must hold the processErrorLock while updating the server description and clearing the
// pool. Not holding the lock leads to possible out-of-order processing of pool.clear() and
// pool.ready() calls from concurrent server description updates.
s.processErrorLock.Lock()
// defer s.processErrorLock.Unlock()

s.updateDescription(desc)
// Retry after the first timeout before clearing the pool in case of a FAAS pause as
// described in GODRIVER-2577.
/*
if err := unwrapConnectionError(desc.LastError); err != nil && timeoutCnt < 1 {
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
timeoutCnt++
Expand All @@ -636,20 +637,29 @@ func (s *Server) update() {
return true
}
}
if err := desc.LastError; err != nil {
// Clear the pool once the description has been updated to Unknown. Pass in a nil service ID to clear
// because the monitoring routine only runs for non-load balanced deployments in which servers don't return
// IDs.
s.pool.clear(err, nil)
*/
if err := desc.LastError; err != nil {
if err := unwrapConnectionError(desc.LastError); err != nil {
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
s.pool.clearAll(err, nil)
} else if err, ok := err.(net.Error); ok && err.Timeout() {
s.pool.clearAll(err, nil)
}
}
// We're either not handling a timeout error, or we just handled the 2nd consecutive
// timeout error. In either case, reset the timeout count to 0 and return false to
// continue the normal check process.
timeoutCnt = 0
return false
}(); isShortcut {
continue
// Clear the pool once the description has been updated to Unknown. Pass in a nil service ID to clear
// because the monitoring routine only runs for non-load balanced deployments in which servers don't return
// IDs.
s.pool.clear(err, nil)
}
s.processErrorLock.Unlock()
// We're either not handling a timeout error, or we just handled the 2nd consecutive
// timeout error. In either case, reset the timeout count to 0 and return false to
// continue the normal check process.
// timeoutCnt = 0
// return false
// }(); isShortcut {
// continue
// }

// If the server supports streaming or we're already streaming, we want to move to streaming the next response
// without waiting. If the server has transitioned to Unknown from a network error, we want to do another
Expand Down

0 comments on commit 74db740

Please sign in to comment.