Skip to content

Commit 3dd3c7b

Browse files
authored
schedule: add metrics for region scatter (#3582) (#3596)
1 parent a811014 commit 3dd3c7b

File tree

4 files changed

+267
-5
lines changed

4 files changed

+267
-5
lines changed

metrics/grafana/pd.json

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6853,6 +6853,220 @@
68536853
"title": "Scheduler",
68546854
"type": "row"
68556855
},
6856+
{
6857+
"collapsed": true,
6858+
"gridPos": {
6859+
"h": 1,
6860+
"w": 24,
6861+
"x": 0,
6862+
"y": 19
6863+
},
6864+
"id": 1437,
6865+
"panels": [
6866+
{
6867+
"aliasColors": {},
6868+
"bars": false,
6869+
"dashLength": 10,
6870+
"dashes": false,
6871+
"datasource": "tidb-cluster",
6872+
"description": "",
6873+
"fill": 0,
6874+
"gridPos": {
6875+
"h": 8,
6876+
"w": 12,
6877+
"x": 0,
6878+
"y": 20
6879+
},
6880+
"id": 1433,
6881+
"legend": {
6882+
"alignAsTable": true,
6883+
"avg": true,
6884+
"current": true,
6885+
"hideEmpty": true,
6886+
"hideZero": true,
6887+
"max": true,
6888+
"min": false,
6889+
"rightSide": true,
6890+
"show": true,
6891+
"total": false,
6892+
"values": true
6893+
},
6894+
"lines": true,
6895+
"linewidth": 2,
6896+
"links": [],
6897+
"nullPointMode": "null",
6898+
"percentage": false,
6899+
"pointradius": 2,
6900+
"points": false,
6901+
"renderer": "flot",
6902+
"seriesOverrides": [],
6903+
"spaceLength": 10,
6904+
"stack": false,
6905+
"steppedLine": false,
6906+
"targets": [
6907+
{
6908+
"expr": "sum(delta(pd_schedule_scatter_operators_count{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"skip\"}[1m])) by (event)",
6909+
"format": "time_series",
6910+
"intervalFactor": 2,
6911+
"legendFormat": "skip-{{event}}",
6912+
"refId": "A"
6913+
},
6914+
{
6915+
"expr": "delta(pd_schedule_scatter_operators_count{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"fail\"}[1m])",
6916+
"format": "time_series",
6917+
"intervalFactor": 2,
6918+
"legendFormat": "fail",
6919+
"refId": "B"
6920+
},
6921+
{
6922+
"expr": "delta(pd_schedule_scatter_operators_count{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"success\"}[1m])",
6923+
"format": "time_series",
6924+
"intervalFactor": 2,
6925+
"legendFormat": "success",
6926+
"refId": "C"
6927+
}
6928+
],
6929+
"thresholds": [],
6930+
"timeFrom": null,
6931+
"timeRegions": [],
6932+
"timeShift": null,
6933+
"title": "scatter operator event",
6934+
"tooltip": {
6935+
"shared": true,
6936+
"sort": 0,
6937+
"value_type": "individual"
6938+
},
6939+
"type": "graph",
6940+
"xaxis": {
6941+
"buckets": null,
6942+
"mode": "time",
6943+
"name": null,
6944+
"show": true,
6945+
"values": []
6946+
},
6947+
"yaxes": [
6948+
{
6949+
"format": "opm",
6950+
"label": null,
6951+
"logBase": 1,
6952+
"max": null,
6953+
"min": null,
6954+
"show": true
6955+
},
6956+
{
6957+
"format": "short",
6958+
"label": null,
6959+
"logBase": 1,
6960+
"max": null,
6961+
"min": null,
6962+
"show": true
6963+
}
6964+
],
6965+
"yaxis": {
6966+
"align": false,
6967+
"alignLevel": null
6968+
}
6969+
},
6970+
{
6971+
"aliasColors": {},
6972+
"bars": false,
6973+
"dashLength": 10,
6974+
"dashes": false,
6975+
"datasource": "tidb-cluster",
6976+
"fill": 0,
6977+
"gridPos": {
6978+
"h": 8,
6979+
"w": 12,
6980+
"x": 12,
6981+
"y": 20
6982+
},
6983+
"id": 1435,
6984+
"legend": {
6985+
"alignAsTable": true,
6986+
"avg": false,
6987+
"current": true,
6988+
"hideEmpty": true,
6989+
"hideZero": true,
6990+
"max": true,
6991+
"min": false,
6992+
"rightSide": true,
6993+
"show": true,
6994+
"total": false,
6995+
"values": true
6996+
},
6997+
"lines": true,
6998+
"linewidth": 2,
6999+
"links": [],
7000+
"nullPointMode": "null",
7001+
"percentage": false,
7002+
"pointradius": 2,
7003+
"points": false,
7004+
"renderer": "flot",
7005+
"seriesOverrides": [],
7006+
"spaceLength": 10,
7007+
"stack": false,
7008+
"steppedLine": false,
7009+
"targets": [
7010+
{
7011+
"expr": "sum(delta(pd_schedule_scatter_distribution{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\",engine=\"tikv\",is_leader=\"false\"}[1m])) by (store)",
7012+
"format": "time_series",
7013+
"intervalFactor": 1,
7014+
"legendFormat": "peer-{{store}}",
7015+
"refId": "A"
7016+
},
7017+
{
7018+
"expr": "sum(delta(pd_schedule_scatter_distribution{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\",engine=\"tikv\",is_leader=\"true\"}[1m])) by (store)",
7019+
"format": "time_series",
7020+
"intervalFactor": 1,
7021+
"legendFormat": "leader-{{store}}",
7022+
"refId": "B"
7023+
}
7024+
],
7025+
"thresholds": [],
7026+
"timeFrom": null,
7027+
"timeRegions": [],
7028+
"timeShift": null,
7029+
"title": "scatter store selection",
7030+
"tooltip": {
7031+
"shared": true,
7032+
"sort": 0,
7033+
"value_type": "individual"
7034+
},
7035+
"type": "graph",
7036+
"xaxis": {
7037+
"buckets": null,
7038+
"mode": "time",
7039+
"name": null,
7040+
"show": true,
7041+
"values": []
7042+
},
7043+
"yaxes": [
7044+
{
7045+
"format": "opm",
7046+
"label": null,
7047+
"logBase": 1,
7048+
"max": null,
7049+
"min": null,
7050+
"show": true
7051+
},
7052+
{
7053+
"format": "short",
7054+
"label": null,
7055+
"logBase": 1,
7056+
"max": null,
7057+
"min": null,
7058+
"show": true
7059+
}
7060+
],
7061+
"yaxis": {
7062+
"align": false,
7063+
"alignLevel": null
7064+
}
7065+
}
7066+
],
7067+
"title": "Scatter and Splitter",
7068+
"type": "row"
7069+
},
68567070
{
68577071
"collapsed": true,
68587072
"gridPos": {

server/schedule/filter/filters.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,7 @@ func (f *StoreStateFilter) anyConditionMatch(typ int, opt *config.PersistOptions
389389
funcs = []conditionFunc{f.isTombstone, f.isOffline, f.isDown, f.isDisconnected, f.isBusy,
390390
f.exceedAddLimit, f.tooManySnapshots, f.tooManyPendingPeers}
391391
case scatterRegionTarget:
392-
funcs = []conditionFunc{f.isTombstone, f.isOffline, f.isDown, f.isDisconnected}
392+
funcs = []conditionFunc{f.isTombstone, f.isOffline, f.isDown, f.isDisconnected, f.isBusy}
393393
}
394394
for _, cf := range funcs {
395395
if cf(opt, store) {
@@ -686,6 +686,8 @@ const (
686686
EngineKey = "engine"
687687
// EngineTiFlash is the tiflash value of the engine label.
688688
EngineTiFlash = "tiflash"
689+
// EngineTiKV indicates the tikv engine in metrics
690+
EngineTiKV = "tikv"
689691
)
690692

691693
var allSpecialUses = []string{SpecialUseHotRegion, SpecialUseReserved}

server/schedule/metrics.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,22 @@ var (
7373
Name: "store_limit_cost",
7474
Help: "limit rate cost of store.",
7575
}, []string{"store", "limit_type"})
76+
77+
scatterCounter = prometheus.NewCounterVec(
78+
prometheus.CounterOpts{
79+
Namespace: "pd",
80+
Subsystem: "schedule",
81+
Name: "scatter_operators_count",
82+
Help: "Counter of region scatter operators.",
83+
}, []string{"type", "event"})
84+
85+
scatterDistributionCounter = prometheus.NewCounterVec(
86+
prometheus.CounterOpts{
87+
Namespace: "pd",
88+
Subsystem: "schedule",
89+
Name: "scatter_distribution",
90+
Help: "Counter of the distribution in scatter.",
91+
}, []string{"store", "is_leader", "engine"})
7692
)
7793

7894
func init() {
@@ -83,4 +99,6 @@ func init() {
8399
prometheus.MustRegister(storeLimitRateGauge)
84100
prometheus.MustRegister(storeLimitCostCounter)
85101
prometheus.MustRegister(operatorWaitCounter)
102+
prometheus.MustRegister(scatterCounter)
103+
prometheus.MustRegister(scatterDistributionCounter)
86104
}

server/schedule/region_scatterer.go

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"context"
1818
"fmt"
1919
"math"
20+
"strconv"
2021
"sync"
2122
"time"
2223

@@ -137,6 +138,7 @@ const maxRetryLimit = 30
137138
func (r *RegionScatterer) ScatterRegionsByRange(startKey, endKey []byte, group string, retryLimit int) ([]*operator.Operator, map[uint64]error, error) {
138139
regions := r.cluster.ScanRegions(startKey, endKey, -1)
139140
if len(regions) < 1 {
141+
scatterCounter.WithLabelValues("skip", "empty-region").Inc()
140142
return nil, nil, errors.New("empty region")
141143
}
142144
failures := make(map[uint64]error, len(regions))
@@ -155,13 +157,16 @@ func (r *RegionScatterer) ScatterRegionsByRange(startKey, endKey []byte, group s
155157
// ScatterRegionsByID directly scatter regions by ScatterRegions
156158
func (r *RegionScatterer) ScatterRegionsByID(regionsID []uint64, group string, retryLimit int) ([]*operator.Operator, map[uint64]error, error) {
157159
if len(regionsID) < 1 {
160+
scatterCounter.WithLabelValues("skip", "empty-region").Inc()
158161
return nil, nil, errors.New("empty region")
159162
}
160163
failures := make(map[uint64]error, len(regionsID))
161164
var regions []*core.RegionInfo
162165
for _, id := range regionsID {
163166
region := r.cluster.GetRegion(id)
164167
if region == nil {
168+
scatterCounter.WithLabelValues("skip", "no-region").Inc()
169+
log.Warn("failed to find region during scatter", zap.Uint64("region-id", id))
165170
failures[id] = errors.New(fmt.Sprintf("failed to find region %v", id))
166171
continue
167172
}
@@ -187,6 +192,7 @@ func (r *RegionScatterer) ScatterRegionsByID(regionsID []uint64, group string, r
187192
// and the value of the failures indicates the failure error.
188193
func (r *RegionScatterer) ScatterRegions(regions map[uint64]*core.RegionInfo, failures map[uint64]error, group string, retryLimit int) ([]*operator.Operator, error) {
189194
if len(regions) < 1 {
195+
scatterCounter.WithLabelValues("skip", "empty-region").Inc()
190196
return nil, errors.New("empty region")
191197
}
192198
if retryLimit > maxRetryLimit {
@@ -226,14 +232,20 @@ func (r *RegionScatterer) ScatterRegions(regions map[uint64]*core.RegionInfo, fa
226232
func (r *RegionScatterer) Scatter(region *core.RegionInfo, group string) (*operator.Operator, error) {
227233
if !opt.IsRegionReplicated(r.cluster, region) {
228234
r.cluster.AddSuspectRegions(region.GetID())
235+
scatterCounter.WithLabelValues("skip", "not-replicated").Inc()
236+
log.Warn("region not replicated during scatter", zap.Uint64("region-id", region.GetID()))
229237
return nil, errors.Errorf("region %d is not fully replicated", region.GetID())
230238
}
231239

232240
if region.GetLeader() == nil {
241+
scatterCounter.WithLabelValues("skip", "no-leader").Inc()
242+
log.Warn("region no leader during scatter", zap.Uint64("region-id", region.GetID()))
233243
return nil, errors.Errorf("region %d has no leader", region.GetID())
234244
}
235245

236246
if r.cluster.IsRegionHot(region) {
247+
scatterCounter.WithLabelValues("skip", "hot").Inc()
248+
log.Warn("region too hot during scatter", zap.Uint64("region-id", region.GetID()))
237249
return nil, errors.Errorf("region %d is hot", region.GetID())
238250
}
239251

@@ -286,15 +298,19 @@ func (r *RegionScatterer) scatterRegion(region *core.RegionInfo, group string) *
286298

287299
op, err := operator.CreateScatterRegionOperator("scatter-region", r.cluster, region, targetPeers, targetLeader)
288300
if err != nil {
301+
scatterCounter.WithLabelValues("fail", "").Inc()
289302
for _, peer := range region.GetPeers() {
290303
targetPeers[peer.GetStoreId()] = peer
291304
}
292305
r.Put(targetPeers, region.GetLeader().GetStoreId(), group)
293306
log.Debug("fail to create scatter region operator", errs.ZapError(err))
294307
return nil
295308
}
296-
r.Put(targetPeers, targetLeader, group)
297-
op.SetPriorityLevel(core.HighPriority)
309+
if op != nil {
310+
scatterCounter.WithLabelValues("success", "").Inc()
311+
r.Put(targetPeers, targetLeader, group)
312+
op.SetPriorityLevel(core.HighPriority)
313+
}
298314
return op
299315
}
300316

@@ -305,15 +321,15 @@ func (r *RegionScatterer) selectCandidates(region *core.RegionInfo, sourceStoreI
305321
return nil
306322
}
307323
filters := []filter.Filter{
308-
filter.NewExcludedFilter("scatter-region", nil, selectedStores),
324+
filter.NewExcludedFilter(r.name, nil, selectedStores),
309325
}
310326
scoreGuard := filter.NewPlacementSafeguard(r.name, r.cluster, region, sourceStore)
311327
filters = append(filters, context.filters...)
312328
filters = append(filters, scoreGuard)
313329
stores := r.cluster.GetStores()
314330
candidates := make([]uint64, 0)
315331
for _, store := range stores {
316-
if filter.Target(r.cluster.GetOpts(), store, filters) && !store.IsBusy() {
332+
if filter.Target(r.cluster.GetOpts(), store, filters) {
317333
candidates = append(candidates, store.GetID())
318334
}
319335
}
@@ -375,10 +391,22 @@ func (r *RegionScatterer) Put(peers map[uint64]*metapb.Peer, leaderStoreID uint6
375391
store := r.cluster.GetStore(storeID)
376392
if ordinaryFilter.Target(r.cluster.GetOpts(), store) {
377393
r.ordinaryEngine.selectedPeer.Put(storeID, group)
394+
scatterDistributionCounter.WithLabelValues(
395+
fmt.Sprintf("%v", storeID),
396+
strconv.FormatBool(false),
397+
filter.EngineTiKV).Inc()
378398
} else {
379399
engine := store.GetLabelValue(filter.EngineKey)
380400
r.specialEngines[engine].selectedPeer.Put(storeID, group)
401+
scatterDistributionCounter.WithLabelValues(
402+
fmt.Sprintf("%v", storeID),
403+
strconv.FormatBool(false),
404+
engine).Inc()
381405
}
382406
}
383407
r.ordinaryEngine.selectedLeader.Put(leaderStoreID, group)
408+
scatterDistributionCounter.WithLabelValues(
409+
fmt.Sprintf("%v", leaderStoreID),
410+
strconv.FormatBool(true),
411+
filter.EngineTiKV).Inc()
384412
}

0 commit comments

Comments
 (0)