Skip to content

Commit

Permalink
Config to automatically Re-trigger failed periodics
Browse files Browse the repository at this point in the history
Signed-off-by: Jakub Guzik <[email protected]>
  • Loading branch information
jmguzik committed Jan 14, 2025
1 parent 8e8a5cf commit de145fc
Show file tree
Hide file tree
Showing 4 changed files with 164 additions and 8 deletions.
63 changes: 62 additions & 1 deletion cmd/horologium/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"flag"
"fmt"
"os"
"strconv"
"time"

"github.com/sirupsen/logrus"
Expand All @@ -30,12 +31,14 @@ import (
"sigs.k8s.io/controller-runtime/pkg/cluster"

prowapi "sigs.k8s.io/prow/pkg/apis/prowjobs/v1"
v1 "sigs.k8s.io/prow/pkg/apis/prowjobs/v1"
"sigs.k8s.io/prow/pkg/config"
"sigs.k8s.io/prow/pkg/cron"
pkgflagutil "sigs.k8s.io/prow/pkg/flagutil"
prowflagutil "sigs.k8s.io/prow/pkg/flagutil"
configflagutil "sigs.k8s.io/prow/pkg/flagutil/config"
"sigs.k8s.io/prow/pkg/interrupts"
"sigs.k8s.io/prow/pkg/kube"
"sigs.k8s.io/prow/pkg/logrusutil"
"sigs.k8s.io/prow/pkg/metrics"
"sigs.k8s.io/prow/pkg/pjutil"
Expand Down Expand Up @@ -199,14 +202,27 @@ func sync(prowJobClient ctrlruntimeclient.Client, cfg *config.Config, cr cronCli
}
continue
}
if !shouldTrigger {
if !shouldTrigger && p.RetriggerFailedRun == nil {
logger.WithFields(logrus.Fields{
"previous-found": previousFound,
"name": p.Name,
"job": p.JobBase.Name,
}).Debug("Trigger time has not yet been reached.")
}
run := false

// Proceed only if the run deviates from the normal flow.
if !shouldTrigger {
run = shouldTriggerFailedRun(j, p, now, logger, previousFound)
}
if !previousFound || shouldTrigger {
run = true
}
if run {
if p.RetriggerFailedRun != nil && p.Labels == nil {
p.Labels = make(map[string]string)
p.Labels[kube.ReRunLabel] = strconv.Itoa(1)
}
prowJob := pjutil.NewProwJob(pjutil.PeriodicSpec(p), p.Labels, p.Annotations,
pjutil.RequireScheduling(cfg.Scheduler.Enabled))
prowJob.Namespace = cfg.ProwJobNamespace
Expand All @@ -227,3 +243,48 @@ func sync(prowJobClient ctrlruntimeclient.Client, cfg *config.Config, cr cronCli
}
return nil
}

// shouldTriggerFailedRun reports whether periodic p should be started again
// outside of its regular schedule, based on p.RetriggerFailedRun and on the
// most recent ProwJob j (only meaningful when previousFound is true).
//
// It returns false when any of the following holds:
//   - p has no RetriggerFailedRun configuration;
//   - previousFound is true and j has not completed yet;
//   - the attempt number of the new run would exceed Attempts;
//   - no more than Interval has elapsed between j's start time and now;
//   - UntilSuccess is set and j is in the success state.
//
// Otherwise it stores the attempt number under kube.ReRunLabel in p.Labels,
// emits a debug log entry through logger and returns true. Note that the
// job state is only consulted when UntilSuccess is set.
//
// NOTE(review): p is received by value. When p.Labels is nil, the map
// allocated here is attached to the local copy only and is not visible to the
// caller; when p.Labels is non-nil, the write goes into the map the caller's
// value also points to. Confirm that both effects are intended.
func shouldTriggerFailedRun(j v1.ProwJob, p config.Periodic, now time.Time, logger *logrus.Entry, previousFound bool) bool {
	retrigger := p.RetriggerFailedRun
	if retrigger == nil {
		return false
	}

	// The first re-run is attempt 1; later ones continue from the counter
	// stored in the previous job's re-run label.
	attempt := 1
	if previousFound {
		if !j.Complete() {
			return false
		}
		if raw, ok := j.Labels[kube.ReRunLabel]; ok {
			// A label that does not parse as an integer leaves the attempt at 1.
			if prev, err := strconv.Atoi(raw); err == nil {
				attempt = prev + 1
			}
		}
	}

	sinceStart := now.Sub(j.Status.StartTime.Time)
	switch {
	case attempt > retrigger.Attempts:
		return false
	case sinceStart <= retrigger.Interval:
		return false
	case retrigger.UntilSuccess && j.Status.State == v1.SuccessState:
		return false
	}

	if p.Labels == nil {
		p.Labels = map[string]string{}
	}
	attemptLabel := strconv.Itoa(attempt)
	p.Labels[kube.ReRunLabel] = attemptLabel

	fields := logrus.Fields{
		"attempt":       attemptLabel,
		"until_success": retrigger.UntilSuccess,
	}
	logger.WithFields(fields).Debug("Job marked to be re-triggered")

	return true
}
93 changes: 86 additions & 7 deletions cmd/horologium/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"flag"
"reflect"
"strconv"
"testing"
"time"

Expand All @@ -30,9 +31,11 @@ import (
fakectrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client/fake"

prowapi "sigs.k8s.io/prow/pkg/apis/prowjobs/v1"
v1 "sigs.k8s.io/prow/pkg/apis/prowjobs/v1"
"sigs.k8s.io/prow/pkg/config"
"sigs.k8s.io/prow/pkg/flagutil"
configflagutil "sigs.k8s.io/prow/pkg/flagutil/config"
"sigs.k8s.io/prow/pkg/kube"
)

type fakeCron struct {
Expand Down Expand Up @@ -60,11 +63,13 @@ func TestSync(t *testing.T) {
testcases := []struct {
testName string

jobName string
jobComplete bool
jobStartTimeAgo time.Duration

shouldStart bool
jobName string
jobComplete bool
jobStartTimeAgo time.Duration
retriggerFailedRun *config.RetriggerFailedRun
state prowapi.ProwJobState
shouldStart bool
labelNumber int
}{
{
testName: "no job",
Expand Down Expand Up @@ -105,17 +110,84 @@ func TestSync(t *testing.T) {
jobStartTimeAgo: time.Second,
shouldStart: false,
},
{
testName: "old, complete job",
jobName: "j",
jobComplete: true,
jobStartTimeAgo: time.Hour,
shouldStart: true,
},
{
testName: "complete job meant to be re-run",
jobName: "j",
jobComplete: true,
jobStartTimeAgo: time.Minute,
shouldStart: true,
state: v1.FailureState,
retriggerFailedRun: &config.RetriggerFailedRun{Attempts: 3, Interval: time.Minute},
labelNumber: 1,
},
{
testName: "running job not meant to be re-run",
jobName: "j",
jobComplete: false,
jobStartTimeAgo: time.Minute,
shouldStart: false,
state: v1.PendingState,
retriggerFailedRun: &config.RetriggerFailedRun{Attempts: 3, Interval: time.Minute},
labelNumber: 1,
},
{
testName: "complete job meant to be re-run even if success state",
jobName: "j",
jobComplete: true,
jobStartTimeAgo: time.Minute,
shouldStart: true,
state: v1.SuccessState,
retriggerFailedRun: &config.RetriggerFailedRun{Attempts: 3, Interval: time.Minute},
labelNumber: 1,
},
{
testName: "complete job meant to be re-run after 2 attempts",
jobName: "j",
jobComplete: true,
jobStartTimeAgo: time.Minute,
shouldStart: true,
state: v1.SuccessState,
retriggerFailedRun: &config.RetriggerFailedRun{Attempts: 3, Interval: time.Minute},
labelNumber: 2,
},
{
testName: "complete job not meant to be re-run after 3 attempts",
jobName: "j",
jobComplete: true,
jobStartTimeAgo: time.Minute,
shouldStart: false,
state: v1.SuccessState,
retriggerFailedRun: &config.RetriggerFailedRun{Attempts: 3, Interval: time.Minute},
labelNumber: 3,
},
{
testName: "complete job not meant to be re-run with until_success after success state",
jobName: "j",
jobComplete: true,
jobStartTimeAgo: time.Minute,
shouldStart: false,
state: v1.SuccessState,
retriggerFailedRun: &config.RetriggerFailedRun{UntilSuccess: true, Attempts: 3, Interval: time.Minute},
labelNumber: 2,
},
}
for _, tc := range testcases {
cfg := config.Config{
ProwConfig: config.ProwConfig{
ProwJobNamespace: "prowjobs",
},
JobConfig: config.JobConfig{
Periodics: []config.Periodic{{JobBase: config.JobBase{Name: "j"}}},
Periodics: []config.Periodic{{JobBase: config.JobBase{Name: "j"}, RetriggerFailedRun: tc.retriggerFailedRun}},
},
}
cfg.Periodics[0].SetInterval(time.Minute)
cfg.Periodics[0].SetInterval(time.Minute * 30)

var jobs []client.Object
now := time.Now()
Expand All @@ -131,13 +203,20 @@ func TestSync(t *testing.T) {
},
Status: prowapi.ProwJobStatus{
StartTime: metav1.NewTime(now.Add(-tc.jobStartTimeAgo)),
State: tc.state,
},
}
complete := metav1.NewTime(now.Add(-time.Millisecond))
if tc.jobComplete {
job.Status.CompletionTime = &complete
}
jobs = append(jobs, job)
if tc.labelNumber != 0 {
if job.Labels == nil {
job.Labels = make(map[string]string)
}
job.Labels[kube.ReRunLabel] = strconv.Itoa(tc.labelNumber)
}
}
fakeProwJobClient := newCreateTrackingClient(jobs)
fc := &fakeCron{}
Expand Down
13 changes: 13 additions & 0 deletions pkg/config/jobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,17 @@ type Postsubmit struct {
JenkinsSpec *JenkinsSpec `json:"jenkins_spec,omitempty"`
}

// RetriggerFailedRun defines the configuration for retrying failed prowjobs.
// It is referenced from Periodic through the retrigger_failed_run key; a nil
// pointer there means no re-triggering is configured.
//
// NOTE(review): unlike the surrounding config types, the fields below carry
// no json struct tags, so the keys accepted in configuration are derived from
// the Go field names, and Interval is a plain time.Duration rather than a
// duration string. Confirm that this is the intended configuration format.
type RetriggerFailedRun struct {
// UntilSuccess stops retries once a successful run is achieved.
// The Attempts field still limits the total number of retries.
// When false, the state of the previous run is not taken into account.
UntilSuccess bool
// Attempts specifies the maximum number of retry attempts allowed.
// The zero value therefore allows no retries.
Attempts int
// Interval defines the wait duration between consecutive retry attempts.
// A retry is only started once more than Interval has elapsed since the
// start time of the previous run.
Interval time.Duration
}

// Periodic runs on a timer.
type Periodic struct {
JobBase
Expand All @@ -282,6 +293,8 @@ type Periodic struct {
// Tags for config entries
Tags []string `json:"tags,omitempty"`

RetriggerFailedRun *RetriggerFailedRun `json:"retrigger_failed_run,omitempty"`

interval time.Duration
minimum_interval time.Duration
}
Expand Down
3 changes: 3 additions & 0 deletions pkg/kube/prowjob.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ const (
// IsOptionalLabel is added in resources created by prow and
// carries the Optional from a Presubmit job.
IsOptionalLabel = "prow.k8s.io/is-optional"
// ReRunLabel is added to periodics that are configured to be
// re-run several times.
ReRunLabel = "prow.k8s.io/re-run"

// Gerrit related labels that are used by Prow

Expand Down

0 comments on commit de145fc

Please sign in to comment.