Skip to content

Commit

Permalink
feat(controller): support cap on retryStrategy backoff
Browse files Browse the repository at this point in the history
Signed-off-by: joey <[email protected]>
  • Loading branch information
chengjoey committed Oct 18, 2024
1 parent c9b1477 commit 905f8a1
Show file tree
Hide file tree
Showing 10 changed files with 71 additions and 1 deletion.
1 change: 1 addition & 0 deletions docs/executor_swagger.md
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,7 @@ of a single workflow step, which the executor will use as a default location to

| Name | Type | Go type | Required | Default | Description | Example |
|------|------|---------|:--------:| ------- |-------------|---------|
| cap | string| `string` | | | Cap is a limit on revised values of the duration parameter. If a</br>multiplication by the factor parameter would make the duration</br>exceed the cap then the duration is set to the cap | |
| duration | string| `string` | | | Duration is the amount to back off. Default unit is seconds, but could also be a duration (e.g. "2m", "1h") | |
| factor | [IntOrString](#int-or-string)| `IntOrString` | | | | |
| maxDuration | string| `string` | | | MaxDuration is the maximum amount of time allowed for a workflow in the backoff strategy.</br>It is important to note that if the workflow template includes activeDeadlineSeconds, the pod's deadline is initially set with activeDeadlineSeconds.</br>However, when the workflow fails, the pod's deadline is then overridden by maxDuration.</br>This ensures that the workflow does not exceed the specified maximum duration when retries are involved. | |
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions manifests/base/crds/full/argoproj.io_cronworkflows.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions manifests/base/crds/full/argoproj.io_workflows.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions manifests/base/crds/full/argoproj.io_workflowtasksets.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions manifests/base/crds/full/argoproj.io_workflowtemplates.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions pkg/apis/workflow/v1alpha1/workflow_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -2026,6 +2026,10 @@ type Backoff struct {
// However, when the workflow fails, the pod's deadline is then overridden by maxDuration.
// This ensures that the workflow does not exceed the specified maximum duration when retries are involved.
MaxDuration string `json:"maxDuration,omitempty" protobuf:"varint,3,opt,name=maxDuration"`
// Cap is a limit on revised values of the duration parameter. If a
// multiplication by the factor parameter would make the duration
// exceed the cap then the duration is set to the cap
Cap string `json:"cap,omitempty" protobuf:"varint,5,opt,name=cap"`
}

// RetryNodeAntiAffinity is a placeholder for future expansion, only empty nodeAntiAffinity is allowed.
Expand Down
6 changes: 6 additions & 0 deletions pkg/plugins/executor/swagger.yml
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,12 @@ definitions:
Backoff:
description: Backoff is a backoff strategy to use within retryStrategy
properties:
cap:
description: |-
Cap is a limit on revised values of the duration parameter. If a
multiplication by the factor parameter would make the duration
exceed the cap then the duration is set to the cap
type: string
duration:
description: Duration is the amount to back off. Default unit is seconds, but could also be a duration (e.g. "2m", "1h")
type: string
Expand Down
9 changes: 9 additions & 0 deletions workflow/controller/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -1029,6 +1029,15 @@ func (woc *wfOperationCtx) processNodeRetries(node *wfv1.NodeStatus, retryStrate
// Note that timeToWait should equal to duration for the first retry attempt.
timeToWait = baseDuration * time.Duration(math.Pow(float64(*retryStrategyBackoffFactor), float64(len(childNodeIds)-1)))
}
if retryStrategy.Backoff.Cap != "" {
capDuration, err := wfv1.ParseStringToDuration(retryStrategy.Backoff.Cap)
if err != nil {
return nil, false, err
}
if timeToWait > capDuration {
timeToWait = capDuration
}
}
waitingDeadline := lastChildNode.FinishedAt.Add(timeToWait)

// If the waiting deadline is after the max duration deadline, then it's futile to wait until then. Stop early
Expand Down
18 changes: 17 additions & 1 deletion workflow/controller/operator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -817,11 +817,12 @@ func TestProcessNodeRetriesWithExponentialBackoff(t *testing.T) {
nodeID := woc.wf.NodeID(nodeName)
node := woc.initializeNode(nodeName, wfv1.NodeTypeRetry, "", &wfv1.WorkflowStep{}, "", wfv1.NodeRunning, &wfv1.NodeFlag{})
retries := wfv1.RetryStrategy{}
retries.Limit = intstrutil.ParsePtr("2")
retries.Limit = intstrutil.ParsePtr("3")
retries.RetryPolicy = wfv1.RetryPolicyAlways
retries.Backoff = &wfv1.Backoff{
Duration: "5m",
Factor: intstrutil.ParsePtr("2"),
Cap: "11m",
}
woc.wf.Status.Nodes[nodeID] = *node

Expand Down Expand Up @@ -863,6 +864,21 @@ func TestProcessNodeRetriesWithExponentialBackoff(t *testing.T) {
require.LessOrEqual(backoff, 600)
require.Less(595, backoff)

woc.initializeNode(nodeName+"(2)", wfv1.NodeTypePod, "", &wfv1.WorkflowStep{}, "", wfv1.NodeError, &wfv1.NodeFlag{Retried: true})
woc.addChildNode(nodeName, nodeName+"(2)")
n, err = woc.wf.GetNodeByName(nodeName)
require.NoError(err)

n, _, err = woc.processNodeRetries(n, retries, &executeTemplateOpts{})
require.NoError(err)
require.Equal(wfv1.NodeRunning, n.Phase)

// Third backoff should be limited to 660 seconds by the Cap.
backoff, err = parseRetryMessage(n.Message)
require.NoError(err)
require.LessOrEqual(backoff, 660)
require.Less(655, backoff)

// Mark lastChild as successful.
lastChild = getChildNodeIndex(n, woc.wf.Status.Nodes, -1)
require.NotNil(lastChild)
Expand Down

0 comments on commit 905f8a1

Please sign in to comment.