From a0afbd5e80619468485e5d606c1c6cff83498973 Mon Sep 17 00:00:00 2001 From: Jim Liming Date: Mon, 19 Jun 2023 17:31:01 -0700 Subject: [PATCH 1/2] add argument to invert the behavior of alert-filter-regexp Signed-off-by: Jim Liming --- cmd/kured/main.go | 25 ++++++++++++++++++++++--- kured-ds.yaml | 1 + 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 4960d3773..8c81ff7f9 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -58,6 +58,7 @@ var ( prometheusURL string preferNoScheduleTaintName string alertFilter *regexp.Regexp + alertFilterMatchOnly bool alertFiringOnly bool rebootSentinelFile string rebootSentinelCommand string @@ -151,6 +152,8 @@ func NewRootCommand() *cobra.Command { "Prometheus instance to probe for active alerts") rootCmd.PersistentFlags().Var(®expValue{&alertFilter}, "alert-filter-regexp", "alert names to ignore when checking for active alerts") + rootCmd.PersistentFlags().BoolVar(&alertFilterMatchOnly, "alert-filter-match-only", false, + "Only block if the alert-filter-regexp matches active alerts") rootCmd.PersistentFlags().BoolVar(&alertFiringOnly, "alert-firing-only", false, "only consider firing alerts when checking for active alerts") rootCmd.PersistentFlags().StringVar(&rebootSentinelFile, "reboot-sentinel", "/var/run/reboot-required", @@ -345,6 +348,8 @@ type PrometheusBlockingChecker struct { filter *regexp.Regexp // bool to indicate if only firing alerts should be considered firingOnly bool + // bool to indicate that we're only blocking on alerts which match the filter + filterMatchOnly bool } // KubernetesBlockingChecker contains info for connecting @@ -358,12 +363,26 @@ type KubernetesBlockingChecker struct { } func (pb PrometheusBlockingChecker) isBlocked() bool { - - alertNames, err := pb.promClient.ActiveAlerts(pb.filter, pb.firingOnly) + getFilter := func(useMatchOnlyFilter bool, filter *regexp.Regexp) *regexp.Regexp { + if useMatchOnlyFilter { + return nil + } + return filter + } + alertNames, err := pb.promClient.ActiveAlerts(getFilter(pb.filterMatchOnly, pb.filter), pb.firingOnly) if err != nil { log.Warnf("Reboot blocked: prometheus query error: %v", err) return true } + if pb.filterMatchOnly { + var matchedAlertNames []string + for _, alertName := range alertNames { + if pb.filter.MatchString(alertName) { + matchedAlertNames = append(matchedAlertNames, alertName) + } + } + alertNames = matchedAlertNames + } count := len(alertNames) if count > 10 { alertNames = append(alertNames[:10], "...") @@ -729,7 +748,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s var blockCheckers []RebootBlocker if prometheusURL != "" { - blockCheckers = append(blockCheckers, PrometheusBlockingChecker{promClient: promClient, filter: alertFilter, firingOnly: alertFiringOnly}) + blockCheckers = append(blockCheckers, PrometheusBlockingChecker{promClient: promClient, filter: alertFilter, firingOnly: alertFiringOnly, filterMatchOnly: alertFilterMatchOnly}) } if podSelectors != nil { blockCheckers = append(blockCheckers, KubernetesBlockingChecker{client: client, nodename: nodeID, filter: podSelectors}) diff --git a/kured-ds.yaml b/kured-ds.yaml index 7a6351420..87e7c22c4 100644 --- a/kured-ds.yaml +++ b/kured-ds.yaml @@ -60,6 +60,7 @@ spec: # - --lock-ttl=0 # - --prometheus-url=http://prometheus.monitoring.svc.cluster.local # - --alert-filter-regexp=^RebootRequired$ +# - --alert-filter-match-only=false # - --alert-firing-only=false # - --reboot-sentinel=/var/run/reboot-required # - --prefer-no-schedule-taint="" From f2d091e0f245a80e4105d3dc30663ac7e0e6f3cc Mon Sep 17 00:00:00 2001 From: Christian Kotzbauer Date: Mon, 14 Aug 2023 18:29:10 +0200 Subject: [PATCH 2/2] feat: small code-improvements Signed-off-by: Christian Kotzbauer --- cmd/kured/main.go | 17 +----- pkg/alerts/prometheus.go | 12 +++- pkg/alerts/prometheus_test.go | 111 +++++++++++++++++++++------------- 3 files changed, 79 insertions(+), 61 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 8c81ff7f9..35a89b28f 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -363,26 +363,11 @@ type KubernetesBlockingChecker struct { } func (pb PrometheusBlockingChecker) isBlocked() bool { - getFilter := func(useMatchOnlyFilter bool, filter *regexp.Regexp) *regexp.Regexp { - if useMatchOnlyFilter { - return nil - } - return filter - } - alertNames, err := pb.promClient.ActiveAlerts(getFilter(pb.filterMatchOnly, pb.filter), pb.firingOnly) + alertNames, err := pb.promClient.ActiveAlerts(pb.filter, pb.firingOnly, pb.filterMatchOnly) if err != nil { log.Warnf("Reboot blocked: prometheus query error: %v", err) return true } - if pb.filterMatchOnly { - var matchedAlertNames []string - for _, alertName := range alertNames { - if pb.filter.MatchString(alertName) { - matchedAlertNames = append(matchedAlertNames, alertName) - } - } - alertNames = matchedAlertNames - } count := len(alertNames) if count > 10 { alertNames = append(alertNames[:10], "...") diff --git a/pkg/alerts/prometheus.go b/pkg/alerts/prometheus.go index 08777f773..ea1457456 100644 --- a/pkg/alerts/prometheus.go +++ b/pkg/alerts/prometheus.go @@ -36,7 +36,7 @@ func NewPromClient(conf papi.Config) (*PromClient, error) { // filter by regexp means when the regex finds the alert-name; the alert is exluded from the // block-list and will NOT block rebooting. query by includeLabel means, // if the query finds an alert, it will include it to the block-list and it WILL block rebooting. -func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly bool) ([]string, error) { +func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly, filterMatchOnly bool) ([]string, error) { // get all alerts from prometheus value, _, err := p.api.Query(context.Background(), "ALERTS", time.Now()) @@ -49,7 +49,7 @@ func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly bool) ([]str activeAlertSet := make(map[string]bool) for _, sample := range vector { if alertName, isAlert := sample.Metric[model.AlertNameLabel]; isAlert && sample.Value != 0 { - if (filter == nil || !filter.MatchString(string(alertName))) && (!firingOnly || sample.Metric["alertstate"] == "firing") { + if matchesRegex(filter, string(alertName), filterMatchOnly) && (!firingOnly || sample.Metric["alertstate"] == "firing") { activeAlertSet[string(alertName)] = true } } @@ -67,3 +67,11 @@ func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly bool) ([]str return nil, fmt.Errorf("Unexpected value type: %v", value) } + +func matchesRegex(filter *regexp.Regexp, alertName string, filterMatchOnly bool) bool { + if filter == nil { + return true + } + + return filter.MatchString(string(alertName)) == filterMatchOnly +} diff --git a/pkg/alerts/prometheus_test.go b/pkg/alerts/prometheus_test.go index 7122ab524..d2a224f34 100644 --- a/pkg/alerts/prometheus_test.go +++ b/pkg/alerts/prometheus_test.go @@ -45,62 +45,87 @@ func TestActiveAlerts(t *testing.T) { addr := "http://localhost:10001" for _, tc := range []struct { - it string - rFilter string - respBody string - aName string - wantN int - firingOnly bool + it string + rFilter string + respBody string + aName string + wantN int + firingOnly bool + filterMatchOnly bool }{ { - it: "should return no active alerts", - respBody: responsebody, - rFilter: "", - wantN: 0, - firingOnly: false, + it: "should return no active alerts", + respBody: responsebody, + rFilter: "", + wantN: 0, + firingOnly: false, + filterMatchOnly: false, }, { - it: "should return a subset of all alerts", - respBody: responsebody, - rFilter: "Pod", - wantN: 3, - firingOnly: false, + it: "should return a subset of all alerts", + respBody: responsebody, + rFilter: "Pod", + wantN: 3, + firingOnly: false, + filterMatchOnly: false, }, { - it: "should return all active alerts by regex", - respBody: responsebody, - rFilter: "*", - wantN: 5, - firingOnly: false, + it: "should return a subset of all alerts", + respBody: responsebody, + rFilter: "Gatekeeper", + wantN: 1, + firingOnly: false, + filterMatchOnly: true, }, { - it: "should return all active alerts by regex filter", - respBody: responsebody, - rFilter: "*", - wantN: 5, - firingOnly: false, + it: "should return all active alerts by regex", + respBody: responsebody, + rFilter: "*", + wantN: 5, + firingOnly: false, + filterMatchOnly: false, }, { - it: "should return only firing alerts if firingOnly is true", - respBody: responsebody, - rFilter: "*", - wantN: 4, - firingOnly: true, + it: "should return all active alerts by regex filter", + respBody: responsebody, + rFilter: "*", + wantN: 5, + firingOnly: false, + filterMatchOnly: false, }, { - it: "should return ScheduledRebootFailing active alerts", - respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"ScheduledRebootFailing","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`, - aName: "ScheduledRebootFailing", - rFilter: "*", - wantN: 1, - firingOnly: false, + it: "should return only firing alerts if firingOnly is true", + respBody: responsebody, + rFilter: "*", + wantN: 4, + firingOnly: true, + filterMatchOnly: false, + }, + + { + it: "should return ScheduledRebootFailing active alerts", + respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"ScheduledRebootFailing","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`, + aName: "ScheduledRebootFailing", + rFilter: "*", + wantN: 1, + firingOnly: false, + filterMatchOnly: false, + }, + { + it: "should not return an active alert if RebootRequired is firing (regex filter)", + respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"RebootRequired","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`, + rFilter: "RebootRequired", + wantN: 0, + firingOnly: false, + filterMatchOnly: false, }, { - it: "should not return an active alert if RebootRequired is firing (regex filter)", - respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"RebootRequired","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`, - rFilter: "RebootRequired", - wantN: 0, - firingOnly: false, + it: "should not return an active alert if RebootRequired is firing (regex filter)", + respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"RebootRequired","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`, + rFilter: "RebootRequired", + wantN: 1, + firingOnly: false, + filterMatchOnly: true, }, } { // Start mockServer @@ -125,7 +150,7 @@ func TestActiveAlerts(t *testing.T) { log.Fatal(err) } - result, err := p.ActiveAlerts(regex, tc.firingOnly) + result, err := p.ActiveAlerts(regex, tc.firingOnly, tc.filterMatchOnly) if err != nil { log.Fatal(err) }