Allow retrying remote actions on abrupt failures
This adds logic to treat remote actions that terminate due to a signal as
retryable errors, assuming that such terminations are caused by a worker
crash.

Because this is a hack to paper over a current Remote Build deficiency,
and because this heuristic may be wrong, this feature is hidden behind a
new --experimental_remote_exit_signals_are_transient_errors flag.

Mitigates bazelbuild#18319.
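
An illustrative way to opt into this behavior (the target label and executor endpoint below are placeholders, not part of this change) is to combine the new flag with a bounded retry count, for example:

  bazel build //my:target \
      --remote_executor=grpcs://remote.example.com \
      --experimental_remote_exit_signals_are_transient_errors \
      --remote_retries=3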
jmmv committed May 24, 2023
1 parent e7fd4cf commit c3e8c47
Showing 2 changed files with 41 additions and 0 deletions.
@@ -71,9 +71,11 @@
import com.google.devtools.build.lib.vfs.Path;
import com.google.devtools.build.lib.vfs.PathFragment;
import com.google.longrunning.Operation;
import com.google.protobuf.Any;
import com.google.protobuf.Timestamp;
import com.google.protobuf.util.Durations;
import com.google.protobuf.util.Timestamps;
import com.google.rpc.RetryInfo;
import io.grpc.Status.Code;
import java.io.IOException;
import java.time.Duration;
@@ -282,6 +284,29 @@ public SpawnResult exec(Spawn spawn, SpawnExecutionContext context)
maybeDownloadServerLogs(action, result.getResponse());
}

if (remoteOptions.remoteExitSignalsAreTransientErrors) {
// As described in https://github.com/bazelbuild/bazel/issues/18319, the Remote Build
// protocol does not yet provide a way to distinguish between regular action crashes
// and unexpected worker crashes. This provides a heuristic to retry on likely worker
// crashes. The specific error code we may get depends on the remote executor
// implementation so we need to be quite liberal here: Go funnels all non-regular
// exit codes into -1 and the shell returns 128+SIGNO.
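// For example, a worker killed by SIGKILL (signal 9) surfaces here as exit code 137
// (128 + 9) from a shell-based executor, whereas a Go-based executor reports -1.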
if (result.getExitCode() < 0 || result.getExitCode() > 127) {
// This contraption is required to convince the ExecuteRetrier that the failure can
// be retried.
com.google.rpc.Status synthesizedStatus = com.google.rpc.Status.newBuilder()
.setCode(com.google.rpc.Code.FAILED_PRECONDITION.getNumber())
.setMessage(
"Remote action seems to have terminated due to a signal (exit code "
+ result.getExitCode()
+ "); assuming worker crash to retry")
.addDetails(Any.pack(RetryInfo.newBuilder().build()))
.build();
Exception cause = new ExecutionStatusException(synthesizedStatus, null);
throw new IOException(cause);
}
}

try {
return downloadAndFinalizeSpawnResult(
action,
@@ -362,6 +362,22 @@ public RemoteBuildEventUploadModeConverter() {
+ "If set to 0, retries are disabled.")
public int remoteMaxRetryAttempts;

// TODO(https://github.com/bazelbuild/bazel/issues/18319): Remove once the Remote Build
// protocol can distinguish between abrupt worker termination and action termination.
@Option(
name = "experimental_remote_exit_signals_are_transient_errors",
defaultValue = "false",
documentationCategory = OptionDocumentationCategory.REMOTE,
effectTags = {OptionEffectTag.UNKNOWN},
help =
"Consider abrupt terminations of remote actions as transient failures. "
+ "This can be used to retry actions that fail due to their remote worker "
+ "crashing instead of failing the build. Note that this cannot differentiate "
+ "between those failures and legitimate action crashes so, if you enable this, "
+ "you should also set --remote_retries to non-zero or enable local fallback "
+ "to avoid getting into an infinite retry loop.")
public boolean remoteExitSignalsAreTransientErrors;

@Option(
name = "disk_cache",
defaultValue = "null",
