Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

QFE: new middleware to force query statistics collection #7854

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re
- [#7560](https://github.com/thanos-io/thanos/pull/7560) Query: Added the possibility of filtering rules by rule_name, rule_group or file to HTTP api.
- [#7652](https://github.com/thanos-io/thanos/pull/7652) Store: Implement metadata API limit in stores.
- [#7659](https://github.com/thanos-io/thanos/pull/7659) Receive: Add support for replication using [Cap'n Proto](https://capnproto.org/). This protocol has a lower CPU and memory footprint, which leads to a reduction in resource usage in Receivers. Before enabling it, make sure that all receivers are updated to a version which supports this replication method.
- [#7854](https://github.com/thanos-io/thanos/pull/7854) Query Frontend: Add `--query-frontend.force-query-stats` flag to force collection of query statistics from upstream queriers.

### Changed

Expand Down
2 changes: 2 additions & 0 deletions cmd/thanos/query_frontend.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@ func registerQueryFrontend(app *extkingpin.App) {
cmd.Flag("query-frontend.log-queries-longer-than", "Log queries that are slower than the specified duration. "+
"Set to 0 to disable. Set to < 0 to enable on all queries.").Default("0").DurationVar(&cfg.CortexHandlerConfig.LogQueriesLongerThan)

cmd.Flag("query-frontend.force-query-stats", "Will always pass \"stats\" param to upstream queriers and collect query statistics reporting them as logs.").Default("false").BoolVar(&cfg.ForceQueryStats)

cmd.Flag("query-frontend.org-id-header", "Deprecation Warning - This flag will be soon deprecated in favor of query-frontend.tenant-header"+
" and both flags cannot be used at the same time. "+
"Request header names used to identify the source of slow queries (repeated flag). "+
Expand Down
4 changes: 4 additions & 0 deletions docs/components/query-frontend.md
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,10 @@ Flags:
functions in query-frontend.
--no-query-frontend.enable-x-functions for
disabling.
--query-frontend.force-query-stats
Will always pass "stats" param to upstream
queriers and collect query statistics reporting
them as logs.
--query-frontend.forward-header=<http-header-name> ...
List of headers forwarded by the query-frontend
to downstream queriers, default is empty
Expand Down
55 changes: 48 additions & 7 deletions internal/cortex/frontend/transport/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,11 +107,9 @@ func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {

// Initialise the stats in the context and make sure it's propagated
// down the request chain.
if f.cfg.QueryStatsEnabled {
var ctx context.Context
stats, ctx = querier_stats.ContextWithEmptyStats(r.Context())
r = r.WithContext(ctx)
}
var ctx context.Context
stats, ctx = querier_stats.ContextWithEmptyStats(r.Context())
r = r.WithContext(ctx)

defer func() {
_ = r.Body.Close()
Expand Down Expand Up @@ -156,7 +154,7 @@ func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
}

if shouldReportSlowQuery {
f.reportSlowQuery(r, hs, queryString, queryResponseTime)
f.reportSlowQuery(r, hs, queryString, queryResponseTime, stats)
}
if f.cfg.QueryStatsEnabled {
f.reportQueryStats(r, queryString, queryResponseTime, stats)
Expand All @@ -171,7 +169,13 @@ func isQueryEndpoint(path string) bool {
}

// reportSlowQuery reports slow queries.
func (f *Handler) reportSlowQuery(r *http.Request, responseHeaders http.Header, queryString url.Values, queryResponseTime time.Duration) {
func (f *Handler) reportSlowQuery(
r *http.Request,
responseHeaders http.Header,
queryString url.Values,
queryResponseTime time.Duration,
stats *querier_stats.Stats,
) {
// NOTE(GiedriusS): see https://github.com/grafana/grafana/pull/60301 for more info.
grafanaDashboardUID := "-"
if dashboardUID := r.Header.Get("X-Dashboard-Uid"); dashboardUID != "" {
Expand Down Expand Up @@ -207,6 +211,9 @@ func (f *Handler) reportSlowQuery(r *http.Request, responseHeaders http.Header,
"trace_id", thanosTraceID,
}, formatQueryString(queryString)...)

logMessage = addQueryRangeToLogMessage(queryString, logMessage)
logMessage = f.addStatsToLogMessage(logMessage, stats)

level.Info(util_log.WithContext(r.Context(), f.log)).Log(logMessage...)
}

Expand Down Expand Up @@ -265,6 +272,40 @@ func formatQueryString(queryString url.Values) (fields []interface{}) {
return fields
}

// addStatsToLogMessage appends per-query sample statistics (peak and total
// loaded samples) to the structured log message. A nil stats pointer leaves
// the message untouched.
func (f *Handler) addStatsToLogMessage(message []interface{}, stats *querier_stats.Stats) []interface{} {
	if stats == nil {
		return message
	}
	return append(message,
		"peak_samples", stats.LoadPeakSamples(),
		"total_samples_loaded", stats.LoadTotalSamples(),
	)
}

// addQueryRangeToLogMessage appends the query's time range (in whole hours and
// as a human-readable duration) to the log message. Nothing is appended when
// the range cannot be determined (extractQueryRange returned zero).
func addQueryRangeToLogMessage(queryString url.Values, logMessage []interface{}) []interface{} {
	// Note: a zero check (not "> 0") keeps behavior identical for a
	// potentially negative range (end before start).
	if queryRange := extractQueryRange(queryString); queryRange != 0 {
		logMessage = append(logMessage,
			"query_range_hours", int(queryRange.Hours()),
			"query_range_human", queryRange.String(),
		)
	}
	return logMessage
}

// extractQueryRange extracts query range from query string.
// If start and end are not provided or are invalid, it returns a duration with zero-value.
func extractQueryRange(queryString url.Values) time.Duration {
	startStr := queryString.Get("start")
	endStr := queryString.Get("end")
	if startStr == "" || endStr == "" {
		return 0
	}
	start, serr := util.ParseTime(startStr)
	end, eerr := util.ParseTime(endStr)
	if serr != nil || eerr != nil {
		return 0
	}
	// ParseTime yields millisecond timestamps, so the difference is in ms.
	return time.Duration(end-start) * time.Millisecond
}

func writeError(w http.ResponseWriter, err error) {
switch err {
case context.Canceled:
Expand Down
17 changes: 14 additions & 3 deletions internal/cortex/querier/queryrange/query_range.go
Original file line number Diff line number Diff line change
Expand Up @@ -454,16 +454,27 @@ func (prometheusCodec) EncodeResponse(ctx context.Context, res Response) (*http.
sp.LogFields(otlog.Int("bytes", len(b)))

resp := http.Response{
Header: http.Header{
"Content-Type": []string{"application/json"},
},
Header: mergeHeaders(a.Headers),
Body: io.NopCloser(bytes.NewBuffer(b)),
StatusCode: http.StatusOK,
ContentLength: int64(len(b)),
}
return &resp, nil
}

// mergeHeaders builds the response headers for the encoded HTTP response,
// preserving headers from the original Prometheus response (coming from the
// Tripperware) while always forcing Content-Type to application/json.
func mergeHeaders(headers []*PrometheusResponseHeader) http.Header {
	h := make(http.Header, len(headers)+1)
	for _, header := range headers {
		// Content-Type is overwritten below; skip any upstream value.
		if strings.EqualFold("Content-Type", header.Name) {
			continue
		}
		// Canonicalize the key: http.Header.Get/Set expect canonical MIME
		// keys, and a raw map assignment with a non-canonical name would be
		// invisible to Header.Get on the merged result.
		h[http.CanonicalHeaderKey(header.Name)] = header.Values
	}
	h["Content-Type"] = []string{"application/json"}
	return h
}

// UnmarshalJSON implements json.Unmarshaler and is used for unmarshalling
// a Prometheus range query response (matrix).
func (s *SampleStream) UnmarshalJSON(data []byte) error {
Expand Down
46 changes: 46 additions & 0 deletions internal/cortex/querier/queryrange/stats_middleware.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// Copyright (c) The Cortex Authors.
// Licensed under the Apache License 2.0.

package queryrange

import (
"context"

"github.com/thanos-io/thanos/internal/cortex/querier/stats"
)

// statsMiddleware is a query-range middleware that can force upstream
// queriers to return query statistics and folds any returned statistics into
// the stats object carried by the request context.
type statsMiddleware struct {
	// next is the wrapped downstream handler.
	next Handler
	// forceStats, when true, sets the "stats" request parameter to "all".
	forceStats bool
}

// NewStatsMiddleware returns a Middleware that, when forceStats is true,
// forces the "stats=all" parameter onto every request so that upstream
// queriers return query statistics; in either case, statistics present in the
// response are recorded into the stats object in the request context.
func NewStatsMiddleware(forceStats bool) Middleware {
	return MiddlewareFunc(func(next Handler) Handler {
		return statsMiddleware{
			next: next,
			forceStats: forceStats,
		}
	})
}

// Do implements Handler. If forceStats is set it rewrites the request to ask
// upstream queriers for statistics ("stats=all"), then merges any statistics
// found in the response into the stats object stored in ctx (when present):
// the peak-samples value keeps the maximum seen, total samples accumulate.
func (s statsMiddleware) Do(ctx context.Context, r Request) (Response, error) {
	if s.forceStats {
		r = r.WithStats("all")
	}
	resp, err := s.next.Do(ctx, r)
	if err != nil {
		return resp, err
	}

	// Hoist the getter calls; also guard Samples, which may be nil when the
	// upstream response carries a stats envelope without sample counts.
	if respStats := resp.GetStats(); respStats != nil && respStats.Samples != nil {
		if sts := stats.FromContext(ctx); sts != nil {
			// NOTE(review): this load-compare-store is not atomic as a unit;
			// concurrent sub-request updates could race — confirm callers
			// serialize updates to the shared stats object.
			if sts.LoadPeakSamples() < respStats.Samples.PeakSamples {
				sts.SetPeakSamples(respStats.Samples.PeakSamples)
			}
			sts.AddTotalSamples(respStats.Samples.TotalQueryableSamples)
		}
	}

	return resp, nil
}
101 changes: 101 additions & 0 deletions internal/cortex/querier/queryrange/stats_middleware_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
// Copyright (c) The Cortex Authors.
// Licensed under the Apache License 2.0.

package queryrange

import (
"context"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/thanos-io/thanos/internal/cortex/querier/stats"
)

// Test_statsMiddleware_AddsHeaderWithStats verifies that statsMiddleware
// copies samples statistics from the downstream response into the stats
// object stored in the request context, and that forceStats makes it pass
// "stats=all" to the downstream handler.
func Test_statsMiddleware_AddsHeaderWithStats(t *testing.T) {
	t.Parallel()
	tests := []struct {
		name         string
		forceStats   bool
		peakSamples  int32
		totalSamples int64
	}{
		{
			name:         "With forceStats true",
			forceStats:   true,
			peakSamples:  100,
			totalSamples: 1000,
		},
		{
			name:         "With forceStats false",
			forceStats:   false,
			peakSamples:  200,
			totalSamples: 2000,
		},
	}

	for _, tt := range tests {
		tt := tt // Capture the range variable for the parallel subtest (pre-Go 1.22 semantics).
		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()
			fakeHandler := &fakeHandler{
				response: &PrometheusResponse{
					Status: "success",
					Data: PrometheusData{
						ResultType: "vector",
						Result:     []SampleStream{},
						Stats: &PrometheusResponseStats{
							Samples: &PrometheusResponseSamplesStats{
								TotalQueryableSamples: tt.totalSamples,
								PeakSamples:           tt.peakSamples,
							},
						},
					},
				},
			}

			middleware := NewStatsMiddleware(tt.forceStats)
			wrappedHandler := middleware.Wrap(fakeHandler)

			origCtx := context.Background()
			qryStats, ctx := stats.ContextWithEmptyStats(origCtx)

			resp, err := wrappedHandler.Do(ctx, &PrometheusRequest{
				Path:  "/api/v1/query_range",
				Start: 1536673680 * 1e3,
				End:   1536716898 * 1e3,
				Step:  120 * 1e3,
				Query: "sum(container_memory_rss) by (namespace)",
				Headers: []*PrometheusRequestHeader{
					{
						Name:   "Accept",
						Values: []string{"application/json"},
					},
				},
			})
			require.NoError(t, err)

			if tt.forceStats {
				// testify's Equal signature is (t, expected, actual); keep the
				// expectation first so failure messages read correctly.
				require.Equal(t, "all", fakeHandler.request.GetStats())
			}

			promResp, ok := resp.(*PrometheusResponse)
			require.True(t, ok)

			assert.Equal(t, tt.peakSamples, qryStats.LoadPeakSamples())
			assert.Equal(t, tt.totalSamples, qryStats.LoadTotalSamples())
			assert.Equal(t, tt.peakSamples, promResp.Data.Stats.Samples.PeakSamples)
			assert.Equal(t, tt.totalSamples, promResp.Data.Stats.Samples.TotalQueryableSamples)
		})
	}
}

// fakeHandler is a test double for Handler: it records the last request it
// received and returns a canned response.
type fakeHandler struct {
	// request is the most recent request passed to Do.
	request Request
	// response is the canned response returned by Do.
	response Response
}

// Do records the request for later inspection and returns the canned
// response with a nil error.
func (f *fakeHandler) Do(ctx context.Context, r Request) (Response, error) {
	f.request = r
	return f.response, nil
}
32 changes: 32 additions & 0 deletions internal/cortex/querier/stats/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,38 @@ func (s *Stats) LoadFetchedChunkBytes() uint64 {
return atomic.LoadUint64(&s.FetchedChunkBytes)
}

// SetPeakSamples atomically overwrites the recorded peak number of loaded
// samples. Calling it on a nil receiver is a safe no-op.
func (s *Stats) SetPeakSamples(peakSamples int32) {
	if s != nil {
		atomic.StoreInt32(&s.PeakLoadedSamples, peakSamples)
	}
}

// LoadPeakSamples atomically reads the recorded peak number of loaded
// samples; it returns 0 on a nil receiver.
func (s *Stats) LoadPeakSamples() int32 {
	var peak int32
	if s != nil {
		peak = atomic.LoadInt32(&s.PeakLoadedSamples)
	}
	return peak
}

// AddTotalSamples atomically adds totalSamples to the running total of
// loaded samples. Calling it on a nil receiver is a safe no-op.
func (s *Stats) AddTotalSamples(totalSamples int64) {
	if s != nil {
		atomic.AddInt64(&s.TotalLoadedSamples, totalSamples)
	}
}

// LoadTotalSamples atomically reads the running total of loaded samples;
// it returns 0 on a nil receiver.
func (s *Stats) LoadTotalSamples() int64 {
	var total int64
	if s != nil {
		total = atomic.LoadInt64(&s.TotalLoadedSamples)
	}
	return total
}

// Merge the provide Stats into this one.
func (s *Stats) Merge(other *Stats) {
if s == nil || other == nil {
Expand Down
Loading
Loading