[release-8.5-keyspace] resource_group: push RU metrics to metering server (tikv#361)

zeminzhou · disksing · web-flow · commit 04affc4157a7 · 2025-05-16T08:22:07.000Z
* resource_group: push RU metrics to metering server (tikv#318)

  Signed-off-by: disksing <i@disksing.com>

* resource_group: fix config load (tikv#319)

  Signed-off-by: disksing <i@disksing.com>

* fix

  Signed-off-by: zeminzhou <zhouzemin@pingcap.com>

* metrics: push sql layer request unit and write request unit (tikv#324)

  Signed-off-by: disksing <i@disksing.com>

---------

Signed-off-by: disksing <i@disksing.com>
Signed-off-by: zeminzhou <zhouzemin@pingcap.com>
Co-authored-by: disksing <i@disksing.com>
diff --git a/Makefile b/Makefile
@@ -243,7 +243,7 @@ SUBMODULES := $(filter $(shell find . -iname "go.mod" -exec dirname {} \;),\
 test: install-tools
 	# testing all pkgs...
 	@$(FAILPOINT_ENABLE)
-	CGO_ENABLED=1 go test -tags tso_function_test,deadlock -timeout 20m -race -cover $(TEST_PKGS) || { $(FAILPOINT_DISABLE); exit 1; }
+	CGO_ENABLED=1 go test -tags tso_function_test -timeout 20m -race -cover $(TEST_PKGS) || { $(FAILPOINT_DISABLE); exit 1; }
 	@$(FAILPOINT_DISABLE)
 
 basic-test: install-tools
@@ -262,19 +262,19 @@ TSO_INTEGRATION_TEST_PKGS := $(PD_PKG)/tests/server/tso
 test-tso: install-tools
 	# testing TSO function & consistency...
 	@$(FAILPOINT_ENABLE)
-	CGO_ENABLED=1 go test -race -tags without_dashboard,tso_full_test,deadlock $(TSO_INTEGRATION_TEST_PKGS) || { $(FAILPOINT_DISABLE); exit 1; }
+	CGO_ENABLED=1 go test -race -tags without_dashboard,tso_full_test $(TSO_INTEGRATION_TEST_PKGS) || { $(FAILPOINT_DISABLE); exit 1; }
 	@$(FAILPOINT_DISABLE)
 
 test-tso-function: install-tools
 	# testing TSO function...
 	@$(FAILPOINT_ENABLE)
-	CGO_ENABLED=1 go test -race -tags without_dashboard,tso_function_test,deadlock $(TSO_INTEGRATION_TEST_PKGS) || { $(FAILPOINT_DISABLE); exit 1; }
+	CGO_ENABLED=1 go test -race -tags without_dashboard,tso_function_test $(TSO_INTEGRATION_TEST_PKGS) || { $(FAILPOINT_DISABLE); exit 1; }
 	@$(FAILPOINT_DISABLE)
 
 test-tso-consistency: install-tools
 	# testing TSO consistency...
 	@$(FAILPOINT_ENABLE)
-	CGO_ENABLED=1 go test -race -tags without_dashboard,tso_consistency_test,deadlock $(TSO_INTEGRATION_TEST_PKGS) || { $(FAILPOINT_DISABLE); exit 1; }
+	CGO_ENABLED=1 go test -race -tags without_dashboard,tso_consistency_test $(TSO_INTEGRATION_TEST_PKGS) || { $(FAILPOINT_DISABLE); exit 1; }
 	@$(FAILPOINT_DISABLE)
 
 REAL_CLUSTER_TEST_PATH := $(ROOT_PATH)/tests/integrations/realcluster
diff --git a/pkg/mcs/resourcemanager/server/config.go b/pkg/mcs/resourcemanager/server/config.go
@@ -110,6 +110,12 @@ type ControllerConfig struct {
 
 	// EnableControllerTraceLog is to control whether resource control client enable trace.
 	EnableControllerTraceLog bool `toml:"enable-controller-trace-log" json:"enable-controller-trace-log,string"`
+
+	// PushMetricsAddress is the address to push metrics.
+	PushMetricsAddress string `toml:"push-metrics-address" json:"push-metrics-address"`
+
+	// PushMetricsInterval is the interval to push metrics.
+	PushMetricsInterval typeutil.Duration `toml:"push-metrics-interval" json:"push-metrics-interval"`
 }
 
 // Adjust adjusts the configuration and initializes it with the default value if necessary.
diff --git a/pkg/mcs/resourcemanager/server/manager.go b/pkg/mcs/resourcemanager/server/manager.go
@@ -18,6 +18,7 @@ import (
 	"context"
 	"encoding/json"
 	"math"
+	"os"
 	"sort"
 	"strings"
 	"time"
@@ -28,6 +29,7 @@ import (
 	rmpb "github.com/pingcap/kvproto/pkg/resource_manager"
 	"github.com/pingcap/log"
 	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/push"
 	bs "github.com/tikv/pd/pkg/basicserver"
 	"github.com/tikv/pd/pkg/errs"
 	"github.com/tikv/pd/pkg/storage/endpoint"
@@ -48,6 +50,8 @@ const (
 
 	reservedDefaultGroupName = "default"
 	middlePriority           = 8
+
+	pushMetricsTimeout = 10 * time.Second
 )
 
 // Manager is the manager of resource group.
@@ -179,7 +183,7 @@ func (m *Manager) Init(ctx context.Context) error {
 	}
 
 	// Start the background metrics flusher.
-	go m.backgroundMetricsFlush(ctx)
+	go m.backgroundMetricsFlush(ctx, m.controllerConfig.PushMetricsAddress, m.controllerConfig.PushMetricsInterval.Duration)
 	go func() {
 		defer logutil.LogPanic()
 		m.persistLoop(ctx)
@@ -357,7 +361,7 @@ func (m *Manager) persistResourceGroupRunningState() {
 }
 
 // Receive the consumption and flush it to the metrics.
-func (m *Manager) backgroundMetricsFlush(ctx context.Context) {
+func (m *Manager) backgroundMetricsFlush(ctx context.Context, pushMetricsAddr string, pushMetricsInterval time.Duration) {
 	defer logutil.LogPanic()
 	cleanUpTicker := time.NewTicker(metricsCleanupInterval)
 	defer cleanUpTicker.Stop()
@@ -366,6 +370,13 @@ func (m *Manager) backgroundMetricsFlush(ctx context.Context) {
 	recordMaxTicker := time.NewTicker(tickPerSecond)
 	defer recordMaxTicker.Stop()
 	maxPerSecTrackers := make(map[string]*maxPerSecCostTracker)
+
+	pushMetricsTickerC := make(<-chan time.Time)
+	if pushMetricsAddr != "" && pushMetricsInterval.Seconds() > 0 {
+		pushMetricsTicker := time.NewTicker(pushMetricsInterval)
+		pushMetricsTickerC = pushMetricsTicker.C
+		defer pushMetricsTicker.Stop()
+	}
 	for {
 		select {
 		case <-ctx.Done():
@@ -495,6 +506,25 @@ func (m *Manager) backgroundMetricsFlush(ctx context.Context) {
 					t.FlushMetrics()
 				}
 			}
+
+		case <-pushMetricsTickerC:
+			podName := os.Getenv("HOSTNAME")
+			if podName == "" {
+				podName = "default"
+			}
+			pushCtx, cancel := context.WithTimeout(ctx, pushMetricsTimeout)
+			start := time.Now()
+			err := push.New(pushMetricsAddr, "resource_group_svc").
+				Grouping("pod", podName).
+				Collector(readRequestUnitCost).
+				Collector(writeRequestUnitCost).
+				Collector(sqlLayerRequestUnitCost).
+				PushContext(pushCtx)
+			cancel()
+			if err != nil {
+				log.Error("push metrics to Prometheus failed", zap.Error(err))
+			}
+			pushRUMetricsDuration.Observe(time.Since(start).Seconds())
 		}
 	}
 }
diff --git a/pkg/mcs/resourcemanager/server/metrics.go b/pkg/mcs/resourcemanager/server/metrics.go
@@ -129,6 +129,15 @@ var (
 			Name:      "group_config",
 			Help:      "Config of the resource group.",
 		}, []string{newResourceGroupNameLabel, typeLabel})
+
+	pushRUMetricsDuration = prometheus.NewHistogram(
+		prometheus.HistogramOpts{
+			Namespace: namespace,
+			Subsystem: ruSubsystem,
+			Name:      "push_ru_metrics_duration_seconds",
+			Help:      "The duration of pushing RU metrics to Prometheus.",
+			Buckets:   prometheus.DefBuckets,
+		})
 )
 
 func init() {
@@ -144,4 +153,5 @@ func init() {
 	prometheus.MustRegister(readRequestUnitMaxPerSecCost)
 	prometheus.MustRegister(writeRequestUnitMaxPerSecCost)
 	prometheus.MustRegister(resourceGroupConfigGauge)
+	prometheus.MustRegister(pushRUMetricsDuration)
 }
diff --git a/tests/integrations/Makefile b/tests/integrations/Makefile
@@ -29,7 +29,7 @@ tidy:
 	git diff --quiet go.mod go.sum
 
 test: failpoint-enable
-	CGO_ENABLED=1 go test ./$(value test_name)/... -v -tags deadlock -race -cover || { $(MAKE) failpoint-disable && exit 1; }
+	CGO_ENABLED=1 go test ./$(value test_name)/... -v -race -cover || { $(MAKE) failpoint-disable && exit 1; }
 	$(MAKE) failpoint-disable
 
 ci-test-job:
diff --git a/tests/integrations/mcs/scheduling/api_test.go b/tests/integrations/mcs/scheduling/api_test.go
@@ -614,6 +614,8 @@ func (suite *apiTestSuite) TestStores() {
 
 func (suite *apiTestSuite) checkStores(cluster *tests.TestCluster) {
 	re := suite.Require()
+	// prevent the offline store from changing to tombstone
+	tests.MustPutRegion(re, cluster, 3, 6, []byte("a"), []byte("b"))
 	stores := []*metapb.Store{
 		{
 			// metapb.StoreState_Up == 0

Original file line number	Diff line number	Diff line change
`@@ -614,6 +614,8 @@ func (suite *apiTestSuite) TestStores() {`
`614`	`614`
`615`	`615`	`func (suite *apiTestSuite) checkStores(cluster *tests.TestCluster) {`
`616`	`616`	`re := suite.Require()`
	`617`	`+ // prevent the offline store from changing to tombstone`
	`618`	`+ tests.MustPutRegion(re, cluster, 3, 6, []byte("a"), []byte("b"))`
`617`	`619`	`stores := []*metapb.Store{`
`618`	`620`	`{`
`619`	`621`	`// metapb.StoreState_Up == 0`