From 3b26ef50c8ac2c388315cfa2b5ccbd7578e4fb54 Mon Sep 17 00:00:00 2001 From: Julio Merino Date: Wed, 24 May 2023 07:12:43 -0700 Subject: [PATCH] Allow retrying remote actions on abrupt failures This adds logic to treat remote actions that terminate due to a signal as retryable errors, assuming that such terminations are caused by a worker crash. Because this is a hack to paper over a current Remote Build defficiency, and because this heuristic may be wrong, this feature is hidden behind a new --experimental_remote_exit_signals_are_transient_errors flag. Mitigates #18319. --- .../build/lib/remote/RemoteSpawnRunner.java | 15 +++++++++++++++ .../build/lib/remote/options/RemoteOptions.java | 16 ++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java index 0f7bb077577efa..9e2b9cc2f478f3 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java @@ -282,6 +282,21 @@ public SpawnResult exec(Spawn spawn, SpawnExecutionContext context) maybeDownloadServerLogs(action, result.getResponse()); } + if (remoteOptions.remoteExitSignalsAreTransientErrors) { + // As described in https://github.com/bazelbuild/bazel/issues/18319, the Remote Build + // protocol does not yet provide a way to distinguish between regular action crashes + // and unexpected worker crashes. This provides a heuristic to retry on likely worker + // crashes. The specific error code we may get depends on the remote executor + // implementation so we need to be quite liberal here: Go funnels all non-regular + // exit codes into -1 and the shell returns 128+SIGNO. + if (result.getExitCode() < 0 || result.getExitCode() > 127) { + throw new IOException( + "Remote action seems to have terminated due to a signal (exit code " + + result.getExitCode() + + "); assuming worker crash to retry"); + } + } + try { return downloadAndFinalizeSpawnResult( action, diff --git a/src/main/java/com/google/devtools/build/lib/remote/options/RemoteOptions.java b/src/main/java/com/google/devtools/build/lib/remote/options/RemoteOptions.java index ba8488382cb3b1..ecd10eb4942f49 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/options/RemoteOptions.java +++ b/src/main/java/com/google/devtools/build/lib/remote/options/RemoteOptions.java @@ -362,6 +362,22 @@ public RemoteBuildEventUploadModeConverter() { + "If set to 0, retries are disabled.") public int remoteMaxRetryAttempts; + // TODO(https://github.com/bazelbuild/bazel/issues/18319): Remove once the Remote Build + // protocol can distinguish between abrupt worker termination and action termination. + @Option( + name = "experimental_remote_exit_signals_are_transient_errors", + defaultValue = "false", + documentationCategory = OptionDocumentationCategory.REMOTE, + effectTags = {OptionEffectTag.UNKNOWN}, + help = + "Consider abrupt terminations of remote actions as transient failures. " + + "This can be used to retry actions that fail due to their remote worker " + + "crashing instead of failing the build. Note that this cannot differentiate " + + "between those failures and legitimate action crashes so, if you enable this, " + + "you should also set --remote_retries to non-zero or enable local fallback " + + "to avoid getting into an infinite retry loop.") + public boolean remoteExitSignalsAreTransientErrors; + @Option( name = "disk_cache", defaultValue = "null",