From 6b5fc560676e2d04f04120fd2f9893c627d67152 Mon Sep 17 00:00:00 2001 From: Pavel Litvyak Date: Thu, 9 Apr 2026 22:06:25 +0300 Subject: [PATCH 1/2] fix(query-frontend): restore trace ID in slow query logs PR #8618 moved slow query logging into a defer and switched from using w.Header() (ResponseWriter headers) to resp.Header (round-trip response headers). The X-Thanos-Trace-Id header is set on w.Header() by the tracing middleware before the handler runs, so it was never present in resp.Header, causing trace IDs to be missing from slow query logs. Pass w.Header() to reportQueryStatsAndSlowQueries instead of the response object so the trace ID is available on both success and error paths. Fixes: #8760 Signed-off-by: Pavel Litvyak --- internal/cortex/frontend/transport/handler.go | 8 ++------ .../cortex/frontend/transport/handler_test.go | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/internal/cortex/frontend/transport/handler.go b/internal/cortex/frontend/transport/handler.go index f58bf01fc03..00fcefb3162 100644 --- a/internal/cortex/frontend/transport/handler.go +++ b/internal/cortex/frontend/transport/handler.go @@ -127,7 +127,7 @@ func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { // Ensure we log slow queries and query statistics regardless of how the request completes. defer func() { - f.reportQueryStatsAndSlowQueries(r, resp, buf, startTime, stats) + f.reportQueryStatsAndSlowQueries(r, w.Header(), buf, startTime, stats) }() var err error @@ -153,7 +153,7 @@ func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { } // reportQueryStatsAndSlowQueries handles logging of slow queries and query statistics. -func (f *Handler) reportQueryStatsAndSlowQueries(r *http.Request, resp *http.Response, buf bytes.Buffer, startTime time.Time, stats *querier_stats.Stats) { +func (f *Handler) reportQueryStatsAndSlowQueries(r *http.Request, responseHeaders http.Header, buf bytes.Buffer, startTime time.Time, stats *querier_stats.Stats) { queryResponseTime := time.Since(startTime) shouldReportSlowQuery := f.cfg.LogQueriesLongerThan != 0 && @@ -164,10 +164,6 @@ func (f *Handler) reportQueryStatsAndSlowQueries(r *http.Request, resp *http.Res queryString := f.parseRequestQueryString(r, buf) if shouldReportSlowQuery { - var responseHeaders http.Header - if resp != nil { - responseHeaders = resp.Header - } f.reportSlowQuery(r, responseHeaders, queryString, queryResponseTime, stats) } if f.cfg.QueryStatsEnabled { diff --git a/internal/cortex/frontend/transport/handler_test.go b/internal/cortex/frontend/transport/handler_test.go index 85ad403e194..f06fd1f0bd6 100644 --- a/internal/cortex/frontend/transport/handler_test.go +++ b/internal/cortex/frontend/transport/handler_test.go @@ -50,6 +50,7 @@ func TestHandler_SlowQueryLog(t *testing.T) { "param_query=absent(up)", "param_start=1714262400", "param_end=1714266000", + "trace_id=test-trace-id-123", }, }, { @@ -114,7 +115,9 @@ func TestHandler_SlowQueryLog(t *testing.T) { handler := NewHandler(cfg, fakeRoundTripper, logger, prometheus.NewRegistry()) - handler.ServeHTTP(httptest.NewRecorder(), httptest.NewRequest("GET", tt.url, nil)) + recorder := httptest.NewRecorder() + recorder.Header().Set("X-Thanos-Trace-Id", "test-trace-id-123") + handler.ServeHTTP(recorder, httptest.NewRequest("GET", tt.url, nil)) for _, part := range tt.logParts { require.Contains(t, logWriter.String(), part) @@ -141,11 +144,19 @@ func TestHandler_SlowQueryLogOnError(t *testing.T) { logger := log.NewLogfmtLogger(log.NewSyncWriter(logWriter)) handler := NewHandler(cfg, fakeRT, logger, prometheus.NewRegistry()) - handler.ServeHTTP(httptest.NewRecorder(), httptest.NewRequest("GET", "/api/v1/query?query=up", nil)) + + // Simulate tracing middleware setting the trace ID on the ResponseWriter + // before the handler runs. The slow query log must include this trace ID + // even when the round-tripper returns an error (resp is nil). + recorder := httptest.NewRecorder() + recorder.Header().Set("X-Thanos-Trace-Id", "test-trace-abc123") + + handler.ServeHTTP(recorder, httptest.NewRequest("GET", "/api/v1/query?query=up", nil)) // Verify slow query is logged even when round-tripper returns an error require.Contains(t, logWriter.String(), "slow query detected") require.Contains(t, logWriter.String(), "time_taken=") require.Contains(t, logWriter.String(), "path=/api/v1/query") require.Contains(t, logWriter.String(), "param_query=up") + require.Contains(t, logWriter.String(), "test-trace-abc123") } From 448fef07672c53c8bf8adc0437bcd4992184ffb2 Mon Sep 17 00:00:00 2001 From: Pavel Litvyak Date: Thu, 9 Apr 2026 22:21:52 +0300 Subject: [PATCH 2/2] docs: add changelog entry for #8762 Signed-off-by: Pavel Litvyak --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 90f7e56e4c6..d0274459e02 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ It is recommend to upgrade the storage components first (Receive, Store, etc.) a - [#8714](https://github.com/thanos-io/thanos/pull/8714): Tracing: Fix `tls_config` fields (`ca_file`, `cert_file`, `key_file`) being silently ignored when using the OTLP gRPC exporter. Previously, deployments using a private CA or mTLS client certificates had to work around this via `OTEL_EXPORTER_OTLP_CERTIFICATE` and related environment variables. - [#8128](https://github.com/thanos-io/thanos/issues/8128): Query-Frontend: Fix panic in `AnalyzesMerge` caused by indexing the wrong slice variable, leading to an out-of-range access when merging more than two query analyses. - [#8720](https://github.com/thanos-io/thanos/issues/8720): Receive: Fix 503 errors during restarts in some cases. +- [#8762](https://github.com/thanos-io/thanos/pull/8762): Query-Frontend: Fix trace ID missing from slow query logs, regression from #8618. ### Added