Skip to content

Commit b8c3d14

Browse files
authored
schedule: add metrics for region scatter (#3582)
1 parent b6e80b2 commit b8c3d14

File tree

4 files changed

+266
-5
lines changed

4 files changed

+266
-5
lines changed

metrics/grafana/pd.json

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6971,6 +6971,220 @@
69716971
"title": "Scheduler",
69726972
"type": "row"
69736973
},
6974+
{
6975+
"collapsed": true,
6976+
"gridPos": {
6977+
"h": 1,
6978+
"w": 24,
6979+
"x": 0,
6980+
"y": 19
6981+
},
6982+
"id": 1437,
6983+
"panels": [
6984+
{
6985+
"aliasColors": {},
6986+
"bars": false,
6987+
"dashLength": 10,
6988+
"dashes": false,
6989+
"datasource": "tidb-cluster",
6990+
"description": "",
6991+
"fill": 0,
6992+
"gridPos": {
6993+
"h": 8,
6994+
"w": 12,
6995+
"x": 0,
6996+
"y": 20
6997+
},
6998+
"id": 1433,
6999+
"legend": {
7000+
"alignAsTable": true,
7001+
"avg": true,
7002+
"current": true,
7003+
"hideEmpty": true,
7004+
"hideZero": true,
7005+
"max": true,
7006+
"min": false,
7007+
"rightSide": true,
7008+
"show": true,
7009+
"total": false,
7010+
"values": true
7011+
},
7012+
"lines": true,
7013+
"linewidth": 2,
7014+
"links": [],
7015+
"nullPointMode": "null",
7016+
"percentage": false,
7017+
"pointradius": 2,
7018+
"points": false,
7019+
"renderer": "flot",
7020+
"seriesOverrides": [],
7021+
"spaceLength": 10,
7022+
"stack": false,
7023+
"steppedLine": false,
7024+
"targets": [
7025+
{
7026+
"expr": "sum(delta(pd_schedule_scatter_operators_count{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"skip\"}[1m])) by (event)",
7027+
"format": "time_series",
7028+
"intervalFactor": 2,
7029+
"legendFormat": "skip-{{event}}",
7030+
"refId": "A"
7031+
},
7032+
{
7033+
"expr": "delta(pd_schedule_scatter_operators_count{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"fail\"}[1m])",
7034+
"format": "time_series",
7035+
"intervalFactor": 2,
7036+
"legendFormat": "fail",
7037+
"refId": "B"
7038+
},
7039+
{
7040+
"expr": "delta(pd_schedule_scatter_operators_count{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"success\"}[1m])",
7041+
"format": "time_series",
7042+
"intervalFactor": 2,
7043+
"legendFormat": "success",
7044+
"refId": "C"
7045+
}
7046+
],
7047+
"thresholds": [],
7048+
"timeFrom": null,
7049+
"timeRegions": [],
7050+
"timeShift": null,
7051+
"title": "scatter operator event",
7052+
"tooltip": {
7053+
"shared": true,
7054+
"sort": 0,
7055+
"value_type": "individual"
7056+
},
7057+
"type": "graph",
7058+
"xaxis": {
7059+
"buckets": null,
7060+
"mode": "time",
7061+
"name": null,
7062+
"show": true,
7063+
"values": []
7064+
},
7065+
"yaxes": [
7066+
{
7067+
"format": "opm",
7068+
"label": null,
7069+
"logBase": 1,
7070+
"max": null,
7071+
"min": null,
7072+
"show": true
7073+
},
7074+
{
7075+
"format": "short",
7076+
"label": null,
7077+
"logBase": 1,
7078+
"max": null,
7079+
"min": null,
7080+
"show": true
7081+
}
7082+
],
7083+
"yaxis": {
7084+
"align": false,
7085+
"alignLevel": null
7086+
}
7087+
},
7088+
{
7089+
"aliasColors": {},
7090+
"bars": false,
7091+
"dashLength": 10,
7092+
"dashes": false,
7093+
"datasource": "tidb-cluster",
7094+
"fill": 0,
7095+
"gridPos": {
7096+
"h": 8,
7097+
"w": 12,
7098+
"x": 12,
7099+
"y": 20
7100+
},
7101+
"id": 1435,
7102+
"legend": {
7103+
"alignAsTable": true,
7104+
"avg": false,
7105+
"current": true,
7106+
"hideEmpty": true,
7107+
"hideZero": true,
7108+
"max": true,
7109+
"min": false,
7110+
"rightSide": true,
7111+
"show": true,
7112+
"total": false,
7113+
"values": true
7114+
},
7115+
"lines": true,
7116+
"linewidth": 2,
7117+
"links": [],
7118+
"nullPointMode": "null",
7119+
"percentage": false,
7120+
"pointradius": 2,
7121+
"points": false,
7122+
"renderer": "flot",
7123+
"seriesOverrides": [],
7124+
"spaceLength": 10,
7125+
"stack": false,
7126+
"steppedLine": false,
7127+
"targets": [
7128+
{
7129+
"expr": "sum(delta(pd_schedule_scatter_distribution{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\",engine=\"tikv\",is_leader=\"false\"}[1m])) by (store)",
7130+
"format": "time_series",
7131+
"intervalFactor": 1,
7132+
"legendFormat": "peer-{{store}}",
7133+
"refId": "A"
7134+
},
7135+
{
7136+
"expr": "sum(delta(pd_schedule_scatter_distribution{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\",engine=\"tikv\",is_leader=\"true\"}[1m])) by (store)",
7137+
"format": "time_series",
7138+
"intervalFactor": 1,
7139+
"legendFormat": "leader-{{store}}",
7140+
"refId": "B"
7141+
}
7142+
],
7143+
"thresholds": [],
7144+
"timeFrom": null,
7145+
"timeRegions": [],
7146+
"timeShift": null,
7147+
"title": "scatter store selection",
7148+
"tooltip": {
7149+
"shared": true,
7150+
"sort": 0,
7151+
"value_type": "individual"
7152+
},
7153+
"type": "graph",
7154+
"xaxis": {
7155+
"buckets": null,
7156+
"mode": "time",
7157+
"name": null,
7158+
"show": true,
7159+
"values": []
7160+
},
7161+
"yaxes": [
7162+
{
7163+
"format": "opm",
7164+
"label": null,
7165+
"logBase": 1,
7166+
"max": null,
7167+
"min": null,
7168+
"show": true
7169+
},
7170+
{
7171+
"format": "short",
7172+
"label": null,
7173+
"logBase": 1,
7174+
"max": null,
7175+
"min": null,
7176+
"show": true
7177+
}
7178+
],
7179+
"yaxis": {
7180+
"align": false,
7181+
"alignLevel": null
7182+
}
7183+
}
7184+
],
7185+
"title": "Scatter and Splitter",
7186+
"type": "row"
7187+
},
69747188
{
69757189
"collapsed": true,
69767190
"gridPos": {

server/schedule/filter/filters.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,7 @@ func (f *StoreStateFilter) anyConditionMatch(typ int, opt *config.PersistOptions
389389
funcs = []conditionFunc{f.isTombstone, f.isOffline, f.isDown, f.isDisconnected, f.isBusy,
390390
f.exceedAddLimit, f.tooManySnapshots, f.tooManyPendingPeers}
391391
case scatterRegionTarget:
392-
funcs = []conditionFunc{f.isTombstone, f.isOffline, f.isDown, f.isDisconnected}
392+
funcs = []conditionFunc{f.isTombstone, f.isOffline, f.isDown, f.isDisconnected, f.isBusy}
393393
}
394394
for _, cf := range funcs {
395395
if cf(opt, store) {
@@ -686,6 +686,8 @@ const (
686686
EngineKey = "engine"
687687
// EngineTiFlash is the tiflash value of the engine label.
688688
EngineTiFlash = "tiflash"
689+
// EngineTiKV indicates the tikv engine in metrics
690+
EngineTiKV = "tikv"
689691
)
690692

691693
var allSpecialUses = []string{SpecialUseHotRegion, SpecialUseReserved}

server/schedule/metrics.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,22 @@ var (
5757
Name: "store_limit_cost",
5858
Help: "limit rate cost of store.",
5959
}, []string{"store", "limit_type"})
60+
61+
scatterCounter = prometheus.NewCounterVec(
62+
prometheus.CounterOpts{
63+
Namespace: "pd",
64+
Subsystem: "schedule",
65+
Name: "scatter_operators_count",
66+
Help: "Counter of region scatter operators.",
67+
}, []string{"type", "event"})
68+
69+
scatterDistributionCounter = prometheus.NewCounterVec(
70+
prometheus.CounterOpts{
71+
Namespace: "pd",
72+
Subsystem: "schedule",
73+
Name: "scatter_distribution",
74+
Help: "Counter of the distribution in scatter.",
75+
}, []string{"store", "is_leader", "engine"})
6076
)
6177

6278
func init() {
@@ -65,4 +81,6 @@ func init() {
6581
prometheus.MustRegister(operatorWaitDuration)
6682
prometheus.MustRegister(storeLimitCostCounter)
6783
prometheus.MustRegister(operatorWaitCounter)
84+
prometheus.MustRegister(scatterCounter)
85+
prometheus.MustRegister(scatterDistributionCounter)
6886
}

server/schedule/region_scatterer.go

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ const maxRetryLimit = 30
137137
func (r *RegionScatterer) ScatterRegionsByRange(startKey, endKey []byte, group string, retryLimit int) ([]*operator.Operator, map[uint64]error, error) {
138138
regions := r.cluster.ScanRegions(startKey, endKey, -1)
139139
if len(regions) < 1 {
140+
scatterCounter.WithLabelValues("skip", "empty-region").Inc()
140141
return nil, nil, errors.New("empty region")
141142
}
142143
failures := make(map[uint64]error, len(regions))
@@ -155,13 +156,16 @@ func (r *RegionScatterer) ScatterRegionsByRange(startKey, endKey []byte, group s
155156
// ScatterRegionsByID directly scatter regions by ScatterRegions
156157
func (r *RegionScatterer) ScatterRegionsByID(regionsID []uint64, group string, retryLimit int) ([]*operator.Operator, map[uint64]error, error) {
157158
if len(regionsID) < 1 {
159+
scatterCounter.WithLabelValues("skip", "empty-region").Inc()
158160
return nil, nil, errors.New("empty region")
159161
}
160162
failures := make(map[uint64]error, len(regionsID))
161163
var regions []*core.RegionInfo
162164
for _, id := range regionsID {
163165
region := r.cluster.GetRegion(id)
164166
if region == nil {
167+
scatterCounter.WithLabelValues("skip", "no-region").Inc()
168+
log.Warn("failed to find region during scatter", zap.Uint64("region-id", id))
165169
failures[id] = errors.New(fmt.Sprintf("failed to find region %v", id))
166170
continue
167171
}
@@ -187,6 +191,7 @@ func (r *RegionScatterer) ScatterRegionsByID(regionsID []uint64, group string, r
187191
// and the value of the failures indicates the failure error.
188192
func (r *RegionScatterer) ScatterRegions(regions map[uint64]*core.RegionInfo, failures map[uint64]error, group string, retryLimit int) ([]*operator.Operator, error) {
189193
if len(regions) < 1 {
194+
scatterCounter.WithLabelValues("skip", "empty-region").Inc()
190195
return nil, errors.New("empty region")
191196
}
192197
if retryLimit > maxRetryLimit {
@@ -226,14 +231,20 @@ func (r *RegionScatterer) ScatterRegions(regions map[uint64]*core.RegionInfo, fa
226231
func (r *RegionScatterer) Scatter(region *core.RegionInfo, group string) (*operator.Operator, error) {
227232
if !opt.IsRegionReplicated(r.cluster, region) {
228233
r.cluster.AddSuspectRegions(region.GetID())
234+
scatterCounter.WithLabelValues("skip", "not-replicated").Inc()
235+
log.Warn("region not replicated during scatter", zap.Uint64("region-id", region.GetID()))
229236
return nil, errors.Errorf("region %d is not fully replicated", region.GetID())
230237
}
231238

232239
if region.GetLeader() == nil {
240+
scatterCounter.WithLabelValues("skip", "no-leader").Inc()
241+
log.Warn("region no leader during scatter", zap.Uint64("region-id", region.GetID()))
233242
return nil, errors.Errorf("region %d has no leader", region.GetID())
234243
}
235244

236245
if r.cluster.IsRegionHot(region) {
246+
scatterCounter.WithLabelValues("skip", "hot").Inc()
247+
log.Warn("region too hot during scatter", zap.Uint64("region-id", region.GetID()))
237248
return nil, errors.Errorf("region %d is hot", region.GetID())
238249
}
239250

@@ -286,15 +297,19 @@ func (r *RegionScatterer) scatterRegion(region *core.RegionInfo, group string) *
286297

287298
op, err := operator.CreateScatterRegionOperator("scatter-region", r.cluster, region, targetPeers, targetLeader)
288299
if err != nil {
300+
scatterCounter.WithLabelValues("fail", "").Inc()
289301
for _, peer := range region.GetPeers() {
290302
targetPeers[peer.GetStoreId()] = peer
291303
}
292304
r.Put(targetPeers, region.GetLeader().GetStoreId(), group)
293305
log.Debug("fail to create scatter region operator", errs.ZapError(err))
294306
return nil
295307
}
296-
r.Put(targetPeers, targetLeader, group)
297-
op.SetPriorityLevel(core.HighPriority)
308+
if op != nil {
309+
scatterCounter.WithLabelValues("success", "").Inc()
310+
r.Put(targetPeers, targetLeader, group)
311+
op.SetPriorityLevel(core.HighPriority)
312+
}
298313
return op
299314
}
300315

@@ -305,15 +320,15 @@ func (r *RegionScatterer) selectCandidates(region *core.RegionInfo, sourceStoreI
305320
return nil
306321
}
307322
filters := []filter.Filter{
308-
filter.NewExcludedFilter("scatter-region", nil, selectedStores),
323+
filter.NewExcludedFilter(r.name, nil, selectedStores),
309324
}
310325
scoreGuard := filter.NewPlacementSafeguard(r.name, r.cluster, region, sourceStore)
311326
filters = append(filters, context.filters...)
312327
filters = append(filters, scoreGuard)
313328
stores := r.cluster.GetStores()
314329
candidates := make([]uint64, 0)
315330
for _, store := range stores {
316-
if filter.Target(r.cluster.GetOpts(), store, filters) && !store.IsBusy() {
331+
if filter.Target(r.cluster.GetOpts(), store, filters) {
317332
candidates = append(candidates, store.GetID())
318333
}
319334
}
@@ -375,10 +390,22 @@ func (r *RegionScatterer) Put(peers map[uint64]*metapb.Peer, leaderStoreID uint6
375390
store := r.cluster.GetStore(storeID)
376391
if ordinaryFilter.Target(r.cluster.GetOpts(), store) {
377392
r.ordinaryEngine.selectedPeer.Put(storeID, group)
393+
scatterDistributionCounter.WithLabelValues(
394+
fmt.Sprintf("%v", storeID),
395+
fmt.Sprintf("%v", false),
396+
filter.EngineTiKV).Inc()
378397
} else {
379398
engine := store.GetLabelValue(filter.EngineKey)
380399
r.specialEngines[engine].selectedPeer.Put(storeID, group)
400+
scatterDistributionCounter.WithLabelValues(
401+
fmt.Sprintf("%v", storeID),
402+
fmt.Sprintf("%v", false),
403+
engine).Inc()
381404
}
382405
}
383406
r.ordinaryEngine.selectedLeader.Put(leaderStoreID, group)
407+
scatterDistributionCounter.WithLabelValues(
408+
fmt.Sprintf("%v", leaderStoreID),
409+
fmt.Sprintf("%v", true),
410+
filter.EngineTiKV).Inc()
384411
}

0 commit comments

Comments
 (0)