add argument to invert the behavior of alert-filter-regexp #786

Merged
2 commits, merged Aug 14, 2023
10 changes: 7 additions & 3 deletions cmd/kured/main.go
@@ -58,6 +58,7 @@ var (
prometheusURL string
preferNoScheduleTaintName string
alertFilter *regexp.Regexp
alertFilterMatchOnly bool
alertFiringOnly bool
rebootSentinelFile string
rebootSentinelCommand string
@@ -151,6 +152,8 @@ func NewRootCommand() *cobra.Command {
"Prometheus instance to probe for active alerts")
rootCmd.PersistentFlags().Var(&regexpValue{&alertFilter}, "alert-filter-regexp",
"alert names to ignore when checking for active alerts")
rootCmd.PersistentFlags().BoolVar(&alertFilterMatchOnly, "alert-filter-match-only", false,
"Only block if the alert-filter-regexp matches active alerts")
rootCmd.PersistentFlags().BoolVar(&alertFiringOnly, "alert-firing-only", false,
"only consider firing alerts when checking for active alerts")
rootCmd.PersistentFlags().StringVar(&rebootSentinelFile, "reboot-sentinel", "/var/run/reboot-required",
@@ -345,6 +348,8 @@ type PrometheusBlockingChecker struct {
filter *regexp.Regexp
// bool to indicate if only firing alerts should be considered
firingOnly bool
// bool to indicate that we're only blocking on alerts which match the filter
filterMatchOnly bool
}

// KubernetesBlockingChecker contains info for connecting
@@ -358,8 +363,7 @@ type KubernetesBlockingChecker struct {
}

func (pb PrometheusBlockingChecker) isBlocked() bool {

alertNames, err := pb.promClient.ActiveAlerts(pb.filter, pb.firingOnly)
alertNames, err := pb.promClient.ActiveAlerts(pb.filter, pb.firingOnly, pb.filterMatchOnly)
if err != nil {
log.Warnf("Reboot blocked: prometheus query error: %v", err)
return true
@@ -729,7 +733,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s

var blockCheckers []RebootBlocker
if prometheusURL != "" {
blockCheckers = append(blockCheckers, PrometheusBlockingChecker{promClient: promClient, filter: alertFilter, firingOnly: alertFiringOnly})
blockCheckers = append(blockCheckers, PrometheusBlockingChecker{promClient: promClient, filter: alertFilter, firingOnly: alertFiringOnly, filterMatchOnly: alertFilterMatchOnly})
}
if podSelectors != nil {
blockCheckers = append(blockCheckers, KubernetesBlockingChecker{client: client, nodename: nodeID, filter: podSelectors})
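The diff touches the blockCheckers slice but not the loop that consumes it. As a rough, illustrative sketch (assuming only the RebootBlocker interface and isBlocked() method visible in this diff; anyBlocked is a made-up name, not kured's actual helper), the checkers combine roughly like this:

// RebootBlocker mirrors the interface implied by the diff above: anything
// that can veto a reboot exposes isBlocked().
type RebootBlocker interface {
	isBlocked() bool
}

// anyBlocked is an illustrative helper: a reboot proceeds only when no
// configured checker (Prometheus alerts, blocking pods, ...) objects.
func anyBlocked(blockers []RebootBlocker) bool {
	for _, b := range blockers {
		if b.isBlocked() {
			return true
		}
	}
	return false
}

With the new filterMatchOnly field, the Prometheus checker can veto a reboot when the filter matches an active alert, instead of only when an alert that does not match the filter is active.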
1 change: 1 addition & 0 deletions kured-ds.yaml
@@ -60,6 +60,7 @@ spec:
# - --lock-ttl=0
# - --prometheus-url=http://prometheus.monitoring.svc.cluster.local
# - --alert-filter-regexp=^RebootRequired$
# - --alert-filter-match-only=false
# - --alert-firing-only=false
# - --reboot-sentinel=/var/run/reboot-required
# - --prefer-no-schedule-taint=""
12 changes: 10 additions & 2 deletions pkg/alerts/prometheus.go
@@ -36,7 +36,7 @@ func NewPromClient(conf papi.Config) (*PromClient, error) {
// Filtering by regexp means that when the regex matches an alert name, the alert is excluded from the
// block list and will NOT block rebooting. Querying by includeLabel means that
// if the query finds an alert, it is added to the block list and WILL block rebooting.
func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly bool) ([]string, error) {
func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly, filterMatchOnly bool) ([]string, error) {

// get all alerts from prometheus
value, _, err := p.api.Query(context.Background(), "ALERTS", time.Now())
@@ -49,7 +49,7 @@ func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly bool) ([]str
activeAlertSet := make(map[string]bool)
for _, sample := range vector {
if alertName, isAlert := sample.Metric[model.AlertNameLabel]; isAlert && sample.Value != 0 {
if (filter == nil || !filter.MatchString(string(alertName))) && (!firingOnly || sample.Metric["alertstate"] == "firing") {
if matchesRegex(filter, string(alertName), filterMatchOnly) && (!firingOnly || sample.Metric["alertstate"] == "firing") {
activeAlertSet[string(alertName)] = true
}
}
@@ -67,3 +67,11 @@

return nil, fmt.Errorf("Unexpected value type: %v", value)
}

func matchesRegex(filter *regexp.Regexp, alertName string, filterMatchOnly bool) bool {
if filter == nil {
return true
}

return filter.MatchString(string(alertName)) == filterMatchOnly
}
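
Since the diff shows matchesRegex only in isolation, here is a minimal standalone sketch of its truth table; the regexp and the alert names below are purely illustrative:

package main

import (
	"fmt"
	"regexp"
)

// matchesRegex reports whether an alert should stay on the block list.
// filterMatchOnly=false keeps the original behaviour: alerts matching the
// filter are excluded and everything else blocks. filterMatchOnly=true
// inverts it: only matching alerts block.
func matchesRegex(filter *regexp.Regexp, alertName string, filterMatchOnly bool) bool {
	if filter == nil {
		return true
	}
	return filter.MatchString(alertName) == filterMatchOnly
}

func main() {
	filter := regexp.MustCompile("^RebootRequired$")

	// Default behaviour (--alert-filter-match-only=false).
	fmt.Println(matchesRegex(filter, "RebootRequired", false))      // false: ignored, does not block
	fmt.Println(matchesRegex(filter, "KubePodCrashLooping", false)) // true: blocks

	// Inverted behaviour (--alert-filter-match-only=true).
	fmt.Println(matchesRegex(filter, "RebootRequired", true))      // true: blocks
	fmt.Println(matchesRegex(filter, "KubePodCrashLooping", true)) // false: ignored
}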
111 changes: 68 additions & 43 deletions pkg/alerts/prometheus_test.go
@@ -45,62 +45,87 @@ func TestActiveAlerts(t *testing.T) {
addr := "http://localhost:10001"

for _, tc := range []struct {
it string
rFilter string
respBody string
aName string
wantN int
firingOnly bool
it string
rFilter string
respBody string
aName string
wantN int
firingOnly bool
filterMatchOnly bool
}{
{
it: "should return no active alerts",
respBody: responsebody,
rFilter: "",
wantN: 0,
firingOnly: false,
it: "should return no active alerts",
respBody: responsebody,
rFilter: "",
wantN: 0,
firingOnly: false,
filterMatchOnly: false,
},
{
it: "should return a subset of all alerts",
respBody: responsebody,
rFilter: "Pod",
wantN: 3,
firingOnly: false,
it: "should return a subset of all alerts",
respBody: responsebody,
rFilter: "Pod",
wantN: 3,
firingOnly: false,
filterMatchOnly: false,
},
{
it: "should return all active alerts by regex",
respBody: responsebody,
rFilter: "*",
wantN: 5,
firingOnly: false,
it: "should return a subset of all alerts",
respBody: responsebody,
rFilter: "Gatekeeper",
wantN: 1,
firingOnly: false,
filterMatchOnly: true,
},
{
it: "should return all active alerts by regex filter",
respBody: responsebody,
rFilter: "*",
wantN: 5,
firingOnly: false,
it: "should return all active alerts by regex",
respBody: responsebody,
rFilter: "*",
wantN: 5,
firingOnly: false,
filterMatchOnly: false,
},
{
it: "should return only firing alerts if firingOnly is true",
respBody: responsebody,
rFilter: "*",
wantN: 4,
firingOnly: true,
it: "should return all active alerts by regex filter",
respBody: responsebody,
rFilter: "*",
wantN: 5,
firingOnly: false,
filterMatchOnly: false,
},
{
it: "should return ScheduledRebootFailing active alerts",
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"ScheduledRebootFailing","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
aName: "ScheduledRebootFailing",
rFilter: "*",
wantN: 1,
firingOnly: false,
it: "should return only firing alerts if firingOnly is true",
respBody: responsebody,
rFilter: "*",
wantN: 4,
firingOnly: true,
filterMatchOnly: false,
},

{
it: "should return ScheduledRebootFailing active alerts",
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"ScheduledRebootFailing","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
aName: "ScheduledRebootFailing",
rFilter: "*",
wantN: 1,
firingOnly: false,
filterMatchOnly: false,
},
{
it: "should not return an active alert if RebootRequired is firing (regex filter)",
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"RebootRequired","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
rFilter: "RebootRequired",
wantN: 0,
firingOnly: false,
filterMatchOnly: false,
},
{
it: "should not return an active alert if RebootRequired is firing (regex filter)",
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"RebootRequired","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
rFilter: "RebootRequired",
wantN: 0,
firingOnly: false,
it: "should not return an active alert if RebootRequired is firing (regex filter)",
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"RebootRequired","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
rFilter: "RebootRequired",
wantN: 1,
firingOnly: false,
filterMatchOnly: true,
},
} {
// Start mockServer
@@ -125,7 +150,7 @@
log.Fatal(err)
}

result, err := p.ActiveAlerts(regex, tc.firingOnly)
result, err := p.ActiveAlerts(regex, tc.firingOnly, tc.filterMatchOnly)
if err != nil {
log.Fatal(err)
}