Skip to content

Commit 04affc4

Browse files
zeminzhou and disksing authored
[release-8.5-keyspace] resource_group: push RU metrics to metering server (tikv#361)
* resource_group: push RU metrics to metering server (tikv#318)
  Signed-off-by: disksing <i@disksing.com>
* resource_group: fix config load (tikv#319)
  Signed-off-by: disksing <i@disksing.com>
* fix
  Signed-off-by: zeminzhou <zhouzemin@pingcap.com>
* metrics: push sql layer request unit and write request unit (tikv#324)
  Signed-off-by: disksing <i@disksing.com>
---------
Signed-off-by: disksing <i@disksing.com>
Signed-off-by: zeminzhou <zhouzemin@pingcap.com>
Co-authored-by: disksing <i@disksing.com>
1 parent c93399a commit 04affc4

File tree

6 files changed

+55
-7
lines changed

6 files changed

+55
-7
lines changed

Makefile

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -243,7 +243,7 @@ SUBMODULES := $(filter $(shell find . -iname "go.mod" -exec dirname {} \;),\
243243
test: install-tools
244244
# testing all pkgs...
245245
@$(FAILPOINT_ENABLE)
246-
CGO_ENABLED=1 go test -tags tso_function_test,deadlock -timeout 20m -race -cover $(TEST_PKGS) || { $(FAILPOINT_DISABLE); exit 1; }
246+
CGO_ENABLED=1 go test -tags tso_function_test -timeout 20m -race -cover $(TEST_PKGS) || { $(FAILPOINT_DISABLE); exit 1; }
247247
@$(FAILPOINT_DISABLE)
248248

249249
basic-test: install-tools
@@ -262,19 +262,19 @@ TSO_INTEGRATION_TEST_PKGS := $(PD_PKG)/tests/server/tso
262262
test-tso: install-tools
263263
# testing TSO function & consistency...
264264
@$(FAILPOINT_ENABLE)
265-
CGO_ENABLED=1 go test -race -tags without_dashboard,tso_full_test,deadlock $(TSO_INTEGRATION_TEST_PKGS) || { $(FAILPOINT_DISABLE); exit 1; }
265+
CGO_ENABLED=1 go test -race -tags without_dashboard,tso_full_test $(TSO_INTEGRATION_TEST_PKGS) || { $(FAILPOINT_DISABLE); exit 1; }
266266
@$(FAILPOINT_DISABLE)
267267

268268
test-tso-function: install-tools
269269
# testing TSO function...
270270
@$(FAILPOINT_ENABLE)
271-
CGO_ENABLED=1 go test -race -tags without_dashboard,tso_function_test,deadlock $(TSO_INTEGRATION_TEST_PKGS) || { $(FAILPOINT_DISABLE); exit 1; }
271+
CGO_ENABLED=1 go test -race -tags without_dashboard,tso_function_test $(TSO_INTEGRATION_TEST_PKGS) || { $(FAILPOINT_DISABLE); exit 1; }
272272
@$(FAILPOINT_DISABLE)
273273

274274
test-tso-consistency: install-tools
275275
# testing TSO consistency...
276276
@$(FAILPOINT_ENABLE)
277-
CGO_ENABLED=1 go test -race -tags without_dashboard,tso_consistency_test,deadlock $(TSO_INTEGRATION_TEST_PKGS) || { $(FAILPOINT_DISABLE); exit 1; }
277+
CGO_ENABLED=1 go test -race -tags without_dashboard,tso_consistency_test $(TSO_INTEGRATION_TEST_PKGS) || { $(FAILPOINT_DISABLE); exit 1; }
278278
@$(FAILPOINT_DISABLE)
279279

280280
REAL_CLUSTER_TEST_PATH := $(ROOT_PATH)/tests/integrations/realcluster

pkg/mcs/resourcemanager/server/config.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,12 @@ type ControllerConfig struct {
110110

111111
// EnableControllerTraceLog is to control whether resource control client enable trace.
112112
EnableControllerTraceLog bool `toml:"enable-controller-trace-log" json:"enable-controller-trace-log,string"`
113+
114+
// PushMetricsAddress is the address of the metering server that RU metrics are pushed to. Pushing is disabled when it is empty.
115+
PushMetricsAddress string `toml:"push-metrics-address" json:"push-metrics-address"`
116+
117+
// PushMetricsInterval is the interval between two consecutive pushes of RU metrics. Pushing is disabled when it is not positive.
118+
PushMetricsInterval typeutil.Duration `toml:"push-metrics-interval" json:"push-metrics-interval"`
113119
}
114120

115121
// Adjust adjusts the configuration and initializes it with the default value if necessary.

pkg/mcs/resourcemanager/server/manager.go

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"context"
1919
"encoding/json"
2020
"math"
21+
"os"
2122
"sort"
2223
"strings"
2324
"time"
@@ -28,6 +29,7 @@ import (
2829
rmpb "github.com/pingcap/kvproto/pkg/resource_manager"
2930
"github.com/pingcap/log"
3031
"github.com/prometheus/client_golang/prometheus"
32+
"github.com/prometheus/client_golang/prometheus/push"
3133
bs "github.com/tikv/pd/pkg/basicserver"
3234
"github.com/tikv/pd/pkg/errs"
3335
"github.com/tikv/pd/pkg/storage/endpoint"
@@ -48,6 +50,8 @@ const (
4850

4951
reservedDefaultGroupName = "default"
5052
middlePriority = 8
53+
54+
pushMetricsTimeout = 10 * time.Second
5155
)
5256

5357
// Manager is the manager of resource group.
@@ -179,7 +183,7 @@ func (m *Manager) Init(ctx context.Context) error {
179183
}
180184

181185
// Start the background metrics flusher.
182-
go m.backgroundMetricsFlush(ctx)
186+
go m.backgroundMetricsFlush(ctx, m.controllerConfig.PushMetricsAddress, m.controllerConfig.PushMetricsInterval.Duration)
183187
go func() {
184188
defer logutil.LogPanic()
185189
m.persistLoop(ctx)
@@ -357,7 +361,7 @@ func (m *Manager) persistResourceGroupRunningState() {
357361
}
358362

359363
// Receive the consumption and flush it to the metrics.
360-
func (m *Manager) backgroundMetricsFlush(ctx context.Context) {
364+
func (m *Manager) backgroundMetricsFlush(ctx context.Context, pushMetricsAddr string, pushMetricsInterval time.Duration) {
361365
defer logutil.LogPanic()
362366
cleanUpTicker := time.NewTicker(metricsCleanupInterval)
363367
defer cleanUpTicker.Stop()
@@ -366,6 +370,13 @@ func (m *Manager) backgroundMetricsFlush(ctx context.Context) {
366370
recordMaxTicker := time.NewTicker(tickPerSecond)
367371
defer recordMaxTicker.Stop()
368372
maxPerSecTrackers := make(map[string]*maxPerSecCostTracker)
373+
374+
pushMetricsTickerC := make(<-chan time.Time)
375+
if pushMetricsAddr != "" && pushMetricsInterval.Seconds() > 0 {
376+
pushMetricsTicker := time.NewTicker(pushMetricsInterval)
377+
pushMetricsTickerC = pushMetricsTicker.C
378+
defer pushMetricsTicker.Stop()
379+
}
369380
for {
370381
select {
371382
case <-ctx.Done():
@@ -495,6 +506,25 @@ func (m *Manager) backgroundMetricsFlush(ctx context.Context) {
495506
t.FlushMetrics()
496507
}
497508
}
509+
510+
case <-pushMetricsTickerC:
511+
podName := os.Getenv("HOSTNAME")
512+
if podName == "" {
513+
podName = "default"
514+
}
515+
pushCtx, cancel := context.WithTimeout(ctx, pushMetricsTimeout)
516+
start := time.Now()
517+
err := push.New(pushMetricsAddr, "resource_group_svc").
518+
Grouping("pod", podName).
519+
Collector(readRequestUnitCost).
520+
Collector(writeRequestUnitCost).
521+
Collector(sqlLayerRequestUnitCost).
522+
PushContext(pushCtx)
523+
cancel()
524+
if err != nil {
525+
log.Error("push metrics to Prometheus failed", zap.Error(err))
526+
}
527+
pushRUMetricsDuration.Observe(time.Since(start).Seconds())
498528
}
499529
}
500530
}

pkg/mcs/resourcemanager/server/metrics.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,15 @@ var (
129129
Name: "group_config",
130130
Help: "Config of the resource group.",
131131
}, []string{newResourceGroupNameLabel, typeLabel})
132+
133+
pushRUMetricsDuration = prometheus.NewHistogram(
134+
prometheus.HistogramOpts{
135+
Namespace: namespace,
136+
Subsystem: ruSubsystem,
137+
Name: "push_ru_metrics_duration_seconds",
138+
Help: "The duration of pushing RU metrics to Prometheus.",
139+
Buckets: prometheus.DefBuckets,
140+
})
132141
)
133142

134143
func init() {
@@ -144,4 +153,5 @@ func init() {
144153
prometheus.MustRegister(readRequestUnitMaxPerSecCost)
145154
prometheus.MustRegister(writeRequestUnitMaxPerSecCost)
146155
prometheus.MustRegister(resourceGroupConfigGauge)
156+
prometheus.MustRegister(pushRUMetricsDuration)
147157
}

tests/integrations/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ tidy:
2929
git diff --quiet go.mod go.sum
3030

3131
test: failpoint-enable
32-
CGO_ENABLED=1 go test ./$(value test_name)/... -v -tags deadlock -race -cover || { $(MAKE) failpoint-disable && exit 1; }
32+
CGO_ENABLED=1 go test ./$(value test_name)/... -v -race -cover || { $(MAKE) failpoint-disable && exit 1; }
3333
$(MAKE) failpoint-disable
3434

3535
ci-test-job:

tests/integrations/mcs/scheduling/api_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -614,6 +614,8 @@ func (suite *apiTestSuite) TestStores() {
614614

615615
func (suite *apiTestSuite) checkStores(cluster *tests.TestCluster) {
616616
re := suite.Require()
617+
// prevent the offline store from changing to tombstone
618+
tests.MustPutRegion(re, cluster, 3, 6, []byte("a"), []byte("b"))
617619
stores := []*metapb.Store{
618620
{
619621
// metapb.StoreState_Up == 0

0 commit comments

Comments
 (0)