diff --git a/docs/executor_swagger.md b/docs/executor_swagger.md index 95c7b97c7b30..e952207c714b 100644 --- a/docs/executor_swagger.md +++ b/docs/executor_swagger.md @@ -468,6 +468,7 @@ of a single workflow step, which the executor will use as a default location to | Name | Type | Go type | Required | Default | Description | Example | |------|------|---------|:--------:| ------- |-------------|---------| +| cap | string| `string` | | | Cap is a limit on revised values of the duration parameter. If a
multiplication by the factor parameter would make the duration
exceed the cap then the duration is set to the cap | | | duration | string| `string` | | | Duration is the amount to back off. Default unit is seconds, but could also be a duration (e.g. "2m", "1h") | | | factor | [IntOrString](#int-or-string)| `IntOrString` | | | | | | maxDuration | string| `string` | | | MaxDuration is the maximum amount of time allowed for a workflow in the backoff strategy.
It is important to note that if the workflow template includes activeDeadlineSeconds, the pod's deadline is initially set with activeDeadlineSeconds.
However, when the workflow fails, the pod's deadline is then overridden by maxDuration.
This ensures that the workflow does not exceed the specified maximum duration when retries are involved. | | diff --git a/manifests/base/crds/full/argoproj.io_clusterworkflowtemplates.yaml b/manifests/base/crds/full/argoproj.io_clusterworkflowtemplates.yaml index 23bbf9f64ff5..0a1c0e5ab50c 100644 --- a/manifests/base/crds/full/argoproj.io_clusterworkflowtemplates.yaml +++ b/manifests/base/crds/full/argoproj.io_clusterworkflowtemplates.yaml @@ -2001,6 +2001,8 @@ spec: type: object backoff: properties: + cap: + type: string duration: type: string factor: @@ -9140,6 +9142,8 @@ spec: type: object backoff: properties: + cap: + type: string duration: type: string factor: @@ -18423,6 +18427,8 @@ spec: type: object backoff: properties: + cap: + type: string duration: type: string factor: diff --git a/manifests/base/crds/full/argoproj.io_cronworkflows.yaml b/manifests/base/crds/full/argoproj.io_cronworkflows.yaml index 4d46ecb5a95c..44b34f060b1b 100644 --- a/manifests/base/crds/full/argoproj.io_cronworkflows.yaml +++ b/manifests/base/crds/full/argoproj.io_cronworkflows.yaml @@ -2035,6 +2035,8 @@ spec: type: object backoff: properties: + cap: + type: string duration: type: string factor: @@ -9174,6 +9176,8 @@ spec: type: object backoff: properties: + cap: + type: string duration: type: string factor: @@ -18457,6 +18461,8 @@ spec: type: object backoff: properties: + cap: + type: string duration: type: string factor: diff --git a/manifests/base/crds/full/argoproj.io_workflows.yaml b/manifests/base/crds/full/argoproj.io_workflows.yaml index 9a78c3e5806d..04aad031fb0a 100644 --- a/manifests/base/crds/full/argoproj.io_workflows.yaml +++ b/manifests/base/crds/full/argoproj.io_workflows.yaml @@ -2015,6 +2015,8 @@ spec: type: object backoff: properties: + cap: + type: string duration: type: string factor: @@ -9154,6 +9156,8 @@ spec: type: object backoff: properties: + cap: + type: string duration: type: string factor: @@ -18437,6 +18441,8 @@ spec: type: object backoff: properties: + cap: + type: string duration: type: string factor: @@ -31797,6 +31803,8 @@ spec: type: object backoff: properties: + cap: + type: string duration: type: string factor: @@ -36077,6 +36085,8 @@ spec: type: object backoff: properties: + cap: + type: string duration: type: string factor: @@ -43216,6 +43226,8 @@ spec: type: object backoff: properties: + cap: + type: string duration: type: string factor: @@ -52499,6 +52511,8 @@ spec: type: object backoff: properties: + cap: + type: string duration: type: string factor: diff --git a/manifests/base/crds/full/argoproj.io_workflowtasksets.yaml b/manifests/base/crds/full/argoproj.io_workflowtasksets.yaml index bcac41078465..ffb6aff7b149 100644 --- a/manifests/base/crds/full/argoproj.io_workflowtasksets.yaml +++ b/manifests/base/crds/full/argoproj.io_workflowtasksets.yaml @@ -7006,6 +7006,8 @@ spec: type: object backoff: properties: + cap: + type: string duration: type: string factor: diff --git a/manifests/base/crds/full/argoproj.io_workflowtemplates.yaml b/manifests/base/crds/full/argoproj.io_workflowtemplates.yaml index be40a88a9365..292efbbaf78a 100644 --- a/manifests/base/crds/full/argoproj.io_workflowtemplates.yaml +++ b/manifests/base/crds/full/argoproj.io_workflowtemplates.yaml @@ -2000,6 +2000,8 @@ spec: type: object backoff: properties: + cap: + type: string duration: type: string factor: @@ -9139,6 +9141,8 @@ spec: type: object backoff: properties: + cap: + type: string duration: type: string factor: @@ -18422,6 +18426,8 @@ spec: type: object backoff: properties: + cap: + type: string duration: type: string factor: diff --git a/pkg/apis/workflow/v1alpha1/workflow_types.go b/pkg/apis/workflow/v1alpha1/workflow_types.go index f86e84d80934..ab7b83d4c4f2 100644 --- a/pkg/apis/workflow/v1alpha1/workflow_types.go +++ b/pkg/apis/workflow/v1alpha1/workflow_types.go @@ -2026,6 +2026,10 @@ type Backoff struct { // However, when the workflow fails, the pod's deadline is then overridden by maxDuration. // This ensures that the workflow does not exceed the specified maximum duration when retries are involved. MaxDuration string `json:"maxDuration,omitempty" protobuf:"varint,3,opt,name=maxDuration"` + // Cap is a limit on revised values of the duration parameter. If a + // multiplication by the factor parameter would make the duration + // exceed the cap then the duration is set to the cap + Cap string `json:"cap,omitempty" protobuf:"varint,5,opt,name=cap"` } // RetryNodeAntiAffinity is a placeholder for future expansion, only empty nodeAntiAffinity is allowed. diff --git a/pkg/plugins/executor/swagger.yml b/pkg/plugins/executor/swagger.yml index b0806a1055c2..0b761cda4416 100644 --- a/pkg/plugins/executor/swagger.yml +++ b/pkg/plugins/executor/swagger.yml @@ -358,6 +358,12 @@ definitions: Backoff: description: Backoff is a backoff strategy to use within retryStrategy properties: + cap: + description: |- + Cap is a limit on revised values of the duration parameter. If a + multiplication by the factor parameter would make the duration + exceed the cap then the duration is set to the cap + type: string duration: description: Duration is the amount to back off. Default unit is seconds, but could also be a duration (e.g. "2m", "1h") type: string diff --git a/workflow/controller/operator.go b/workflow/controller/operator.go index 94b177236027..531718e85bed 100644 --- a/workflow/controller/operator.go +++ b/workflow/controller/operator.go @@ -1029,6 +1029,15 @@ func (woc *wfOperationCtx) processNodeRetries(node *wfv1.NodeStatus, retryStrate // Note that timeToWait should equal to duration for the first retry attempt. timeToWait = baseDuration * time.Duration(math.Pow(float64(*retryStrategyBackoffFactor), float64(len(childNodeIds)-1))) } + if retryStrategy.Backoff.Cap != "" { + capDuration, err := wfv1.ParseStringToDuration(retryStrategy.Backoff.Cap) + if err != nil { + return nil, false, err + } + if timeToWait > capDuration { + timeToWait = capDuration + } + } waitingDeadline := lastChildNode.FinishedAt.Add(timeToWait) // If the waiting deadline is after the max duration deadline, then it's futile to wait until then. Stop early diff --git a/workflow/controller/operator_test.go b/workflow/controller/operator_test.go index 4378ef2010ae..fa9cfa6e9db1 100644 --- a/workflow/controller/operator_test.go +++ b/workflow/controller/operator_test.go @@ -817,11 +817,12 @@ func TestProcessNodeRetriesWithExponentialBackoff(t *testing.T) { nodeID := woc.wf.NodeID(nodeName) node := woc.initializeNode(nodeName, wfv1.NodeTypeRetry, "", &wfv1.WorkflowStep{}, "", wfv1.NodeRunning, &wfv1.NodeFlag{}) retries := wfv1.RetryStrategy{} - retries.Limit = intstrutil.ParsePtr("2") + retries.Limit = intstrutil.ParsePtr("3") retries.RetryPolicy = wfv1.RetryPolicyAlways retries.Backoff = &wfv1.Backoff{ Duration: "5m", Factor: intstrutil.ParsePtr("2"), + Cap: "11m", } woc.wf.Status.Nodes[nodeID] = *node @@ -863,6 +864,21 @@ func TestProcessNodeRetriesWithExponentialBackoff(t *testing.T) { require.LessOrEqual(backoff, 600) require.Less(595, backoff) + woc.initializeNode(nodeName+"(2)", wfv1.NodeTypePod, "", &wfv1.WorkflowStep{}, "", wfv1.NodeError, &wfv1.NodeFlag{Retried: true}) + woc.addChildNode(nodeName, nodeName+"(2)") + n, err = woc.wf.GetNodeByName(nodeName) + require.NoError(err) + + n, _, err = woc.processNodeRetries(n, retries, &executeTemplateOpts{}) + require.NoError(err) + require.Equal(wfv1.NodeRunning, n.Phase) + + // Third backoff should be limited to 660 seconds by the Cap. + backoff, err = parseRetryMessage(n.Message) + require.NoError(err) + require.LessOrEqual(backoff, 660) + require.Less(655, backoff) + // Mark lastChild as successful. lastChild = getChildNodeIndex(n, woc.wf.Status.Nodes, -1) require.NotNil(lastChild)