Skip to content

Commit

Permalink
Allow retrying remote actions on abrupt failures
Browse files Browse the repository at this point in the history
    This adds logic to treat remote actions that terminate due to a signal as
    retryable errors, assuming that such terminations are caused by a worker
    crash.

    Because this is a hack to paper over a current Remote Build defficiency,
    and because this heuristic may be wrong, this feature is hidden behind a
    new --snowflake_remote_exit_signals_are_transient_errors flag.

    Mitigates bazelbuild#18319.

Author: Julio Merino <[email protected]>
Date:   Wed May 24 07:12:43 2023 -0700
  • Loading branch information
sfc-gh-mgalindo committed Jun 28, 2024
1 parent e20af2e commit b72c27e
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,11 @@
import com.google.devtools.build.lib.vfs.Path;
import com.google.devtools.build.lib.vfs.PathFragment;
import com.google.longrunning.Operation;
import com.google.protobuf.Any;
import com.google.protobuf.Timestamp;
import com.google.protobuf.util.Durations;
import com.google.protobuf.util.Timestamps;
import com.google.rpc.RetryInfo;
import io.grpc.Status.Code;
import java.io.IOException;
import java.time.Duration;
Expand Down Expand Up @@ -179,6 +181,29 @@ public void reportExecutingIfNot() {
}
}

/**
* Guess if an action failed due to a transient remote error or not.
*
* <p>As described in <a href="https://github.com/bazelbuild/bazel/issues/18319">#18319</a>, the
* Remote Build protocol does not yet provide a way to distinguish between regular action crashes
* and unexpected worker crashes. This provides a heuristic to retry on likely worker crashes.
* The specific error code we may get depends on the remote executor implementation so we need to
* be quite liberal here: Go funnels all non-regular exit codes into {@code -1} and the shell
* returns {@code 128+SIGNO}.
*/
private boolean isTransientError(RemoteAction action, RemoteActionResult result) {
if (!remoteOptions.remoteExitSignalsAreTransientErrors) {
return false;
}

boolean isTestAction = action.getSpawn().getMnemonic().equals("TestRunner");
if (isTestAction) {
return false;
}

return result.getExitCode() < 0 || result.getExitCode() > 127;
}

@Override
public SpawnResult exec(Spawn spawn, SpawnExecutionContext context)
throws ExecException, InterruptedException, IOException, ForbiddenActionInputException {
Expand Down Expand Up @@ -306,6 +331,21 @@ public SpawnResult exec(Spawn spawn, SpawnExecutionContext context)
maybeDownloadServerLogs(action, result.getResponse());
}

if (isTransientError(action, result)) {
// This contraption is required to convince the ExecuteRetrier that the failure can
// be retried.
com.google.rpc.Status synthesizedStatus = com.google.rpc.Status.newBuilder()
.setCode(com.google.rpc.Code.FAILED_PRECONDITION.getNumber())
.setMessage(
"Remote action seems to have terminated due to a signal (exit code "
+ result.getExitCode()
+ "); assuming worker crash to retry")
.addDetails(Any.pack(RetryInfo.newBuilder().build()))
.build();
Exception cause = new ExecutionStatusException(synthesizedStatus, null);
throw new IOException(cause);
}

try {
return downloadAndFinalizeSpawnResult(
action,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,22 @@ public RemoteBuildEventUploadModeConverter() {
+ "If set to 0, retries are disabled.")
public int remoteMaxRetryAttempts;

// TODO(https://github.com/bazelbuild/bazel/issues/18319): Remove once the Remote Build
// protocol can distinguish between abrupt worker termination and action termination.
@Option(
name = "snowflake_remote_exit_signals_are_transient_errors",
defaultValue = "false",
documentationCategory = OptionDocumentationCategory.REMOTE,
effectTags = {OptionEffectTag.UNKNOWN},
help =
"Consider abrupt terminations of remote non-test actions as transient failures. "
+ "This can be used to retry actions that fail due to their remote worker "
+ "crashing instead of failing the build. Note that this cannot differentiate "
+ "between those failures and legitimate action crashes so, if you enable this, "
+ "you should also set --remote_retries to non-zero or enable local fallback "
+ "to avoid getting into an infinite retry loop.")
public boolean remoteExitSignalsAreTransientErrors;

@Option(
name = "remote_retry_max_delay",
defaultValue = "5s",
Expand Down

0 comments on commit b72c27e

Please sign in to comment.