Skip to content

Commit

Permalink
Config to automatically Re-trigger failed periodics
Browse files Browse the repository at this point in the history
Signed-off-by: Jakub Guzik <[email protected]>
  • Loading branch information
jmguzik committed Jan 24, 2025
1 parent 8e8a5cf commit 62d9f69
Show file tree
Hide file tree
Showing 6 changed files with 262 additions and 43 deletions.
69 changes: 66 additions & 3 deletions cmd/horologium/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"flag"
"fmt"
"os"
"strconv"
"time"

"github.com/sirupsen/logrus"
Expand All @@ -30,12 +31,14 @@ import (
"sigs.k8s.io/controller-runtime/pkg/cluster"

prowapi "sigs.k8s.io/prow/pkg/apis/prowjobs/v1"
v1 "sigs.k8s.io/prow/pkg/apis/prowjobs/v1"
"sigs.k8s.io/prow/pkg/config"
"sigs.k8s.io/prow/pkg/cron"
pkgflagutil "sigs.k8s.io/prow/pkg/flagutil"
prowflagutil "sigs.k8s.io/prow/pkg/flagutil"
configflagutil "sigs.k8s.io/prow/pkg/flagutil/config"
"sigs.k8s.io/prow/pkg/interrupts"
"sigs.k8s.io/prow/pkg/kube"
"sigs.k8s.io/prow/pkg/logrusutil"
"sigs.k8s.io/prow/pkg/metrics"
"sigs.k8s.io/prow/pkg/pjutil"
Expand All @@ -44,6 +47,7 @@ import (

const (
defaultTickInterval = time.Minute
maxRetryCount = 10
)

type options struct {
Expand Down Expand Up @@ -199,15 +203,23 @@ func sync(prowJobClient ctrlruntimeclient.Client, cfg *config.Config, cr cronCli
}
continue
}
if !shouldTrigger {
if !shouldTrigger && p.RetriggerFailed == nil {
logger.WithFields(logrus.Fields{
"previous-found": previousFound,
"name": p.Name,
"job": p.JobBase.Name,
}).Debug("Trigger time has not yet been reached.")
}
if !previousFound || shouldTrigger {
prowJob := pjutil.NewProwJob(pjutil.PeriodicSpec(p), p.Labels, p.Annotations,

if !previousFound || shouldTrigger || shouldTriggerFailedRun(j, p, now, logger, previousFound) {
labels := p.Labels
if p.RetriggerFailed != nil {
if labels == nil {
labels = make(map[string]string)
}
labels[kube.ReRunLabel] = strconv.Itoa(0)
}
prowJob := pjutil.NewProwJob(pjutil.PeriodicSpec(p), labels, p.Annotations,
pjutil.RequireScheduling(cfg.Scheduler.Enabled))
prowJob.Namespace = cfg.ProwJobNamespace
logger.WithFields(logrus.Fields{
Expand All @@ -227,3 +239,54 @@ func sync(prowJobClient ctrlruntimeclient.Client, cfg *config.Config, cr cronCli
}
return nil
}

func shouldTriggerFailedRun(j v1.ProwJob, p config.Periodic, now time.Time, logger *logrus.Entry, previousFound bool) bool {
if p.RetriggerFailed == nil {
return false
}
runCount := 0
if previousFound {
if !j.Complete() {
return false
}
countLabel, exists := j.Labels[kube.ReRunLabel]
if exists {
if count, err := strconv.Atoi(countLabel); err == nil {
runCount = count + 1
}
}
}
attempts := p.RetriggerFailed.Attempts
if attempts > maxRetryCount {
attempts = maxRetryCount
}
if runCount >= attempts {
return false
}

lastRunTime := j.Status.StartTime.Time

if now.Sub(lastRunTime) <= p.RetriggerFailed.GetInterval() {
return false
}

if !p.RetriggerFailed.RunAll && j.Status.State == v1.SuccessState {
return false
}

if !j.Complete() {
return false
}

if p.Labels == nil {
p.Labels = make(map[string]string)
}
p.Labels[kube.ReRunLabel] = strconv.Itoa(runCount)

logger.WithFields(logrus.Fields{
"attempt": strconv.Itoa(runCount),
"run_all": p.RetriggerFailed.RunAll,
}).Debug("Job marked to be re-triggered")

return true
}
184 changes: 144 additions & 40 deletions cmd/horologium/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"flag"
"reflect"
"strconv"
"testing"
"time"

Expand All @@ -30,9 +31,11 @@ import (
fakectrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client/fake"

prowapi "sigs.k8s.io/prow/pkg/apis/prowjobs/v1"
v1 "sigs.k8s.io/prow/pkg/apis/prowjobs/v1"
"sigs.k8s.io/prow/pkg/config"
"sigs.k8s.io/prow/pkg/flagutil"
configflagutil "sigs.k8s.io/prow/pkg/flagutil/config"
"sigs.k8s.io/prow/pkg/kube"
)

type fakeCron struct {
Expand Down Expand Up @@ -60,9 +63,13 @@ func TestSync(t *testing.T) {
testcases := []struct {
testName string

jobName string
jobComplete bool
jobStartTimeAgo time.Duration
jobName string
jobComplete bool
jobStartTimeAgo time.Duration
retriggerFailed *config.RetriggerFailed
state prowapi.ProwJobState
interval time.Duration
retryLabelNumber int

shouldStart bool
}{
Expand Down Expand Up @@ -105,50 +112,147 @@ func TestSync(t *testing.T) {
jobStartTimeAgo: time.Second,
shouldStart: false,
},
{
testName: "old, complete job",
jobName: "j",
jobComplete: true,
jobStartTimeAgo: time.Hour,
shouldStart: true,
},
{
testName: "complete job meant to be re-run",
jobName: "j",
jobComplete: true,
jobStartTimeAgo: time.Minute,
shouldStart: true,
state: v1.FailureState,
retriggerFailed: &config.RetriggerFailed{Attempts: 3},
interval: time.Minute,
retryLabelNumber: 1,
},
{
testName: "failed job but horologium limit is reached",
jobName: "j",
jobComplete: true,
jobStartTimeAgo: time.Minute,
shouldStart: false,
state: v1.FailureState,
retriggerFailed: &config.RetriggerFailed{Attempts: 13},
interval: time.Minute,
retryLabelNumber: 9,
},
{
testName: "running job not meant to be re-run",
jobName: "j",
jobComplete: false,
jobStartTimeAgo: time.Minute,
shouldStart: false,
state: v1.PendingState,
retriggerFailed: &config.RetriggerFailed{RunAll: true, Attempts: 3},
interval: time.Minute,
retryLabelNumber: 1,
},
{
testName: "complete job meant to be re-run even if success state",
jobName: "j",
jobComplete: true,
jobStartTimeAgo: time.Minute,
shouldStart: true,
state: v1.SuccessState,
retriggerFailed: &config.RetriggerFailed{RunAll: true, Attempts: 3},
interval: time.Minute,
retryLabelNumber: 1,
},
{
testName: "complete job meant to be re-run after 2 attempts",
jobName: "j",
jobComplete: true,
jobStartTimeAgo: time.Minute,
shouldStart: true,
state: v1.SuccessState,
retriggerFailed: &config.RetriggerFailed{RunAll: true, Attempts: 3},
interval: time.Minute,
retryLabelNumber: 1,
},
{
testName: "complete job not meant to be re-run after 3 attempts",
jobName: "j",
jobComplete: true,
jobStartTimeAgo: time.Minute,
shouldStart: false,
state: v1.SuccessState,
retriggerFailed: &config.RetriggerFailed{RunAll: true, Attempts: 3},
interval: time.Minute,
retryLabelNumber: 2,
},
{
testName: "complete job not meant to be re-run with until_success after success state",
jobName: "j",
jobComplete: true,
jobStartTimeAgo: time.Minute,
shouldStart: false,
state: v1.SuccessState,
retriggerFailed: &config.RetriggerFailed{Attempts: 3},
interval: time.Minute,
retryLabelNumber: 2,
},
}
for _, tc := range testcases {
cfg := config.Config{
ProwConfig: config.ProwConfig{
ProwJobNamespace: "prowjobs",
},
JobConfig: config.JobConfig{
Periodics: []config.Periodic{{JobBase: config.JobBase{Name: "j"}}},
},
}
cfg.Periodics[0].SetInterval(time.Minute)

var jobs []client.Object
now := time.Now()
if tc.jobName != "" {
job := &prowapi.ProwJob{
ObjectMeta: metav1.ObjectMeta{
Name: "with-interval",
Namespace: "prowjobs",
t.Run(tc.testName, func(t *testing.T) {
cfg := config.Config{
ProwConfig: config.ProwConfig{
ProwJobNamespace: "prowjobs",
},
Spec: prowapi.ProwJobSpec{
Type: prowapi.PeriodicJob,
Job: tc.jobName,
},
Status: prowapi.ProwJobStatus{
StartTime: metav1.NewTime(now.Add(-tc.jobStartTimeAgo)),
JobConfig: config.JobConfig{
Periodics: []config.Periodic{{JobBase: config.JobBase{Name: "j"}, RetriggerFailed: tc.retriggerFailed}},
},
}
complete := metav1.NewTime(now.Add(-time.Millisecond))
if tc.jobComplete {
job.Status.CompletionTime = &complete
cfg.Periodics[0].SetInterval(time.Minute * 30)
if cfg.JobConfig.Periodics[0].RetriggerFailed != nil {
tc.retriggerFailed.SetInterval(tc.interval)
}
jobs = append(jobs, job)
}
fakeProwJobClient := newCreateTrackingClient(jobs)
fc := &fakeCron{}
if err := sync(fakeProwJobClient, &cfg, fc, now); err != nil {
t.Fatalf("For case %s, didn't expect error: %v", tc.testName, err)
}

sawCreation := fakeProwJobClient.sawCreate
if tc.shouldStart != sawCreation {
t.Errorf("For case %s, did the wrong thing.", tc.testName)
}
var jobs []client.Object
now := time.Now()
if tc.jobName != "" {
job := &prowapi.ProwJob{
ObjectMeta: metav1.ObjectMeta{
Name: "with-interval",
Namespace: "prowjobs",
},
Spec: prowapi.ProwJobSpec{
Type: prowapi.PeriodicJob,
Job: tc.jobName,
},
Status: prowapi.ProwJobStatus{
StartTime: metav1.NewTime(now.Add(-tc.jobStartTimeAgo)),
State: tc.state,
},
}
complete := metav1.NewTime(now.Add(-time.Millisecond))
if tc.jobComplete {
job.Status.CompletionTime = &complete
}
jobs = append(jobs, job)
if tc.retryLabelNumber != 0 {
if job.Labels == nil {
job.Labels = make(map[string]string)
}
job.Labels[kube.ReRunLabel] = strconv.Itoa(tc.retryLabelNumber)
}
}

fakeProwJobClient := newCreateTrackingClient(jobs)
fc := &fakeCron{}
if err := sync(fakeProwJobClient, &cfg, fc, now); err != nil {
t.Fatalf("Didn't expect error: %v", err)
}

sawCreation := fakeProwJobClient.sawCreate
if tc.shouldStart != sawCreation {
t.Errorf("Did the wrong thing. Expected sawCreation: %v, got: %v", tc.shouldStart, sawCreation)
}
})
}
}

Expand Down
8 changes: 8 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -2435,6 +2435,14 @@ func (c Config) validatePeriodics(periodics []Periodic) error {
periodics[j].interval = d
}

if p.RetriggerFailed != nil {
d, err := time.ParseDuration(periodics[j].RetriggerFailed.Interval)
if err != nil {
errs = append(errs, fmt.Errorf("cannot parse RetriggerFailed duration for %s: %w", periodics[j].Name, err))
}
periodics[j].interval = d
}

if p.MinimumInterval != "" {
d, err := time.ParseDuration(periodics[j].MinimumInterval)
if err != nil {
Expand Down
16 changes: 16 additions & 0 deletions pkg/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8205,6 +8205,13 @@ func TestValidatePeriodics(t *testing.T) {
},
expectedError: "cannot parse duration for a: time: invalid duration \"hello\"",
},
{
name: "Invalid RetriggerFailed interval",
periodics: []Periodic{
{JobBase: JobBase{Name: "a"}, RetriggerFailed: &RetriggerFailed{Interval: "hello"}, Interval: "6h"},
},
expectedError: "cannot parse RetriggerFailed duration for a: time: invalid duration \"hello\"",
},
{
name: "Invalid minimum_interval",
periodics: []Periodic{
Expand All @@ -8221,6 +8228,15 @@ func TestValidatePeriodics(t *testing.T) {
{JobBase: JobBase{Name: "a"}, Interval: "10ns", interval: time.Duration(10)},
},
},
{
name: "Sets Retrigger Failed interval",
periodics: []Periodic{
{JobBase: JobBase{Name: "a"}, RetriggerFailed: &RetriggerFailed{Interval: "10ns"}, Interval: "6h"},
},
expected: []Periodic{
{JobBase: JobBase{Name: "a"}, RetriggerFailed: &RetriggerFailed{Interval: "10ns"}, interval: time.Duration(10), Interval: "6h"},
},
},
{
name: "Sets minimum_interval",
periodics: []Periodic{
Expand Down
Loading

0 comments on commit 62d9f69

Please sign in to comment.