From 70b0427a3337c7958d31936a801cbaff15519d05 Mon Sep 17 00:00:00 2001 From: Mauricio Galindo Date: Mon, 10 Jun 2024 16:20:38 +0000 Subject: [PATCH] Allow retrying remote actions on abrupt failures This adds logic to treat remote actions that terminate due to a signal as retryable errors, assuming that such terminations are caused by a worker crash. Because this is a hack to paper over a current Remote Build defficiency, and because this heuristic may be wrong, this feature is hidden behind a new --snowflake_remote_exit_signals_are_transient_errors flag. Mitigates #18319. Author: Julio Merino Date: Wed May 24 07:12:43 2023 -0700 --- .../build/lib/remote/RemoteSpawnRunner.java | 40 +++++++++++++++++++ .../lib/remote/options/RemoteOptions.java | 16 ++++++++ 2 files changed, 56 insertions(+) diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java index 934cdff7382f15..43aba3c437d72f 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java @@ -72,9 +72,11 @@ import com.google.devtools.build.lib.vfs.Path; import com.google.devtools.build.lib.vfs.PathFragment; import com.google.longrunning.Operation; +import com.google.protobuf.Any; import com.google.protobuf.Timestamp; import com.google.protobuf.util.Durations; import com.google.protobuf.util.Timestamps; +import com.google.rpc.RetryInfo; import io.grpc.Status.Code; import java.io.IOException; import java.time.Duration; @@ -179,6 +181,29 @@ public void reportExecutingIfNot() { } } + /** + * Guess if an action failed due to a transient remote error or not. + * + *

As described in #18319, the + * Remote Build protocol does not yet provide a way to distinguish between regular action crashes + * and unexpected worker crashes. This provides a heuristic to retry on likely worker crashes. + * The specific error code we may get depends on the remote executor implementation so we need to + * be quite liberal here: Go funnels all non-regular exit codes into {@code -1} and the shell + * returns {@code 128+SIGNO}. + */ + private boolean isTransientError(RemoteAction action, RemoteActionResult result) { + if (!remoteOptions.remoteExitSignalsAreTransientErrors) { + return false; + } + + boolean isTestAction = action.getSpawn().getMnemonic().equals("TestRunner"); + if (isTestAction) { + return false; + } + + return result.getExitCode() < 0 || result.getExitCode() > 127; + } + @Override public SpawnResult exec(Spawn spawn, SpawnExecutionContext context) throws ExecException, InterruptedException, IOException, ForbiddenActionInputException { @@ -306,6 +331,21 @@ public SpawnResult exec(Spawn spawn, SpawnExecutionContext context) maybeDownloadServerLogs(action, result.getResponse()); } + if (isTransientError(action, result)) { + // This contraption is required to convince the ExecuteRetrier that the failure can + // be retried. + com.google.rpc.Status synthesizedStatus = com.google.rpc.Status.newBuilder() + .setCode(com.google.rpc.Code.FAILED_PRECONDITION.getNumber()) + .setMessage( + "Remote action seems to have terminated due to a signal (exit code " + + result.getExitCode() + + "); assuming worker crash to retry") + .addDetails(Any.pack(RetryInfo.newBuilder().build())) + .build(); + Exception cause = new ExecutionStatusException(synthesizedStatus, null); + throw new IOException(cause); + } + try { return downloadAndFinalizeSpawnResult( action, diff --git a/src/main/java/com/google/devtools/build/lib/remote/options/RemoteOptions.java b/src/main/java/com/google/devtools/build/lib/remote/options/RemoteOptions.java index 4161d60a610a8f..98dde29862bc08 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/options/RemoteOptions.java +++ b/src/main/java/com/google/devtools/build/lib/remote/options/RemoteOptions.java @@ -348,6 +348,22 @@ public RemoteBuildEventUploadModeConverter() { + "If set to 0, retries are disabled.") public int remoteMaxRetryAttempts; + // TODO(https://github.com/bazelbuild/bazel/issues/18319): Remove once the Remote Build + // protocol can distinguish between abrupt worker termination and action termination. + @Option( + name = "snowflake_remote_exit_signals_are_transient_errors", + defaultValue = "false", + documentationCategory = OptionDocumentationCategory.REMOTE, + effectTags = {OptionEffectTag.UNKNOWN}, + help = + "Consider abrupt terminations of remote non-test actions as transient failures. " + + "This can be used to retry actions that fail due to their remote worker " + + "crashing instead of failing the build. Note that this cannot differentiate " + + "between those failures and legitimate action crashes so, if you enable this, " + + "you should also set --remote_retries to non-zero or enable local fallback " + + "to avoid getting into an infinite retry loop.") + public boolean remoteExitSignalsAreTransientErrors; + @Option( name = "remote_retry_max_delay", defaultValue = "5s",