Skip to content

Commit 410b176

Browse files
ti-chi-bot and HuSharp authored
config, cluster: add an option to halt the cluster scheduling (#6498) (#6558)
ref #6493, ref #6498

Add an option to halt the cluster scheduling.

Signed-off-by: husharp <jinhao.hu@pingcap.com>
Co-authored-by: husharp <jinhao.hu@pingcap.com>
1 parent 8e9d0c4 commit 410b176

File tree

11 files changed

+188
-12
lines changed

11 files changed

+188
-12
lines changed

errors.toml

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -91,6 +91,11 @@ error = '''
9191
TiKV cluster not bootstrapped, please start TiKV first
9292
'''
9393

94+
["PD:cluster:ErrSchedulingIsHalted"]
95+
error = '''
96+
scheduling is halted
97+
'''
98+
9499
["PD:cluster:ErrStoreIsUp"]
95100
error = '''
96101
store is still up, please remove store gracefully

metrics/grafana/pd.json

Lines changed: 108 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2332,6 +2332,113 @@
23322332
"alignLevel": null
23332333
}
23342334
},
2335+
{
2336+
"aliasColors": {},
2337+
"bars": true,
2338+
"dashLength": 10,
2339+
"dashes": false,
2340+
"datasource": "${DS_TEST-CLUSTER}",
2341+
"description": "The allowance status of the scheduling.",
2342+
"fieldConfig": {
2343+
"defaults": {},
2344+
"overrides": []
2345+
},
2346+
"fill": 0,
2347+
"fillGradient": 0,
2348+
"gridPos": {
2349+
"h": 8,
2350+
"w": 12,
2351+
"x": 12,
2352+
"y": 41
2353+
},
2354+
"hiddenSeries": false,
2355+
"id": 1464,
2356+
"legend": {
2357+
"alignAsTable": true,
2358+
"avg": false,
2359+
"current": true,
2360+
"hideEmpty": true,
2361+
"hideZero": true,
2362+
"max": false,
2363+
"min": false,
2364+
"rightSide": true,
2365+
"show": true,
2366+
"total": false,
2367+
"values": true
2368+
},
2369+
"lines": false,
2370+
"linewidth": 1,
2371+
"links": [],
2372+
"nullPointMode": "null",
2373+
"options": {
2374+
"alertThreshold": true
2375+
},
2376+
"paceLength": 10,
2377+
"percentage": false,
2378+
"pluginVersion": "7.5.10",
2379+
"pointradius": 1,
2380+
"points": false,
2381+
"renderer": "flot",
2382+
"seriesOverrides": [],
2383+
"spaceLength": 10,
2384+
"stack": true,
2385+
"steppedLine": false,
2386+
"targets": [
2387+
{
2388+
"exemplar": true,
2389+
"expr": "pd_scheduling_allowance_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\"}",
2390+
"format": "time_series",
2391+
"interval": "",
2392+
"intervalFactor": 2,
2393+
"legendFormat": "{{kind}}",
2394+
"metric": "pd_scheduling_allowance_status",
2395+
"refId": "A",
2396+
"step": 2
2397+
}
2398+
],
2399+
"thresholds": [],
2400+
"timeFrom": null,
2401+
"timeRegions": [],
2402+
"timeShift": null,
2403+
"title": "Scheduling Allowance Status",
2404+
"tooltip": {
2405+
"shared": true,
2406+
"sort": 1,
2407+
"value_type": "individual"
2408+
},
2409+
"type": "graph",
2410+
"xaxis": {
2411+
"buckets": null,
2412+
"mode": "time",
2413+
"name": null,
2414+
"show": true,
2415+
"values": []
2416+
},
2417+
"yaxes": [
2418+
{
2419+
"$$hashKey": "object:533",
2420+
"format": "short",
2421+
"label": null,
2422+
"logBase": 1,
2423+
"max": null,
2424+
"min": "0",
2425+
"show": true
2426+
},
2427+
{
2428+
"$$hashKey": "object:534",
2429+
"format": "short",
2430+
"label": null,
2431+
"logBase": 1,
2432+
"max": null,
2433+
"min": null,
2434+
"show": true
2435+
}
2436+
],
2437+
"yaxis": {
2438+
"align": false,
2439+
"alignLevel": null
2440+
}
2441+
},
23352442
{
23362443
"cacheTimeout": null,
23372444
"colorBackground": false,
@@ -2959,7 +3066,7 @@
29593066
"format": "time_series",
29603067
"intervalFactor": 2,
29613068
"legendFormat": "{{event}}",
2962-
"metric": "pd_scheduler_status",
3069+
"metric": "pd_schedule_operators_count",
29633070
"refId": "A",
29643071
"step": 4
29653072
}

pkg/errs/errno.go

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -121,9 +121,10 @@ var (
121121

122122
// cluster errors
123123
var (
124-
ErrNotBootstrapped = errors.Normalize("TiKV cluster not bootstrapped, please start TiKV first", errors.RFCCodeText("PD:cluster:ErrNotBootstrapped"))
125-
ErrStoreIsUp = errors.Normalize("store is still up, please remove store gracefully", errors.RFCCodeText("PD:cluster:ErrStoreIsUp"))
126-
ErrInvalidStoreID = errors.Normalize("invalid store id %d, not found", errors.RFCCodeText("PD:cluster:ErrInvalidStoreID"))
124+
ErrNotBootstrapped = errors.Normalize("TiKV cluster not bootstrapped, please start TiKV first", errors.RFCCodeText("PD:cluster:ErrNotBootstrapped"))
125+
ErrStoreIsUp = errors.Normalize("store is still up, please remove store gracefully", errors.RFCCodeText("PD:cluster:ErrStoreIsUp"))
126+
ErrInvalidStoreID = errors.Normalize("invalid store id %d, not found", errors.RFCCodeText("PD:cluster:ErrInvalidStoreID"))
127+
ErrSchedulingIsHalted = errors.Normalize("scheduling is halted", errors.RFCCodeText("PD:cluster:ErrSchedulingIsHalted"))
127128
)
128129

129130
// versioninfo errors

pkg/mock/mockcluster/mockcluster.go

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -95,6 +95,9 @@ func (mc *Cluster) GetAllocator() id.Allocator {
9595
return mc.IDAllocator
9696
}
9797

98+
// CheckSchedulingAllowance checks if the cluster allows scheduling currently.
99+
func (mc *Cluster) CheckSchedulingAllowance() (bool, error) { return true, nil }
100+
98101
// ScanRegions scans region with start key, until number greater than limit.
99102
func (mc *Cluster) ScanRegions(startKey, endKey []byte, limit int) []*core.RegionInfo {
100103
return mc.ScanRange(startKey, endKey, limit)

server/cluster/cluster.go

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2531,3 +2531,25 @@ func (c *RaftCluster) GetPausedSchedulerDelayAt(name string) (int64, error) {
25312531
func (c *RaftCluster) GetPausedSchedulerDelayUntil(name string) (int64, error) {
25322532
return c.coordinator.getPausedSchedulerDelayUntil(name)
25332533
}
2534+
2535+
var (
2536+
onlineUnsafeRecoveryStatus = schedulingAllowanceStatusGauge.WithLabelValues("online-unsafe-recovery")
2537+
haltSchedulingStatus = schedulingAllowanceStatusGauge.WithLabelValues("halt-scheduling")
2538+
)
2539+
2540+
// CheckSchedulingAllowance checks if the cluster allows scheduling currently.
2541+
func (c *RaftCluster) CheckSchedulingAllowance() (bool, error) {
2542+
// If the cluster is in the process of online unsafe recovery, it should not allow scheduling.
2543+
if c.GetUnsafeRecoveryController().IsRunning() {
2544+
onlineUnsafeRecoveryStatus.Set(1)
2545+
return false, errs.ErrUnsafeRecoveryIsRunning.FastGenByArgs()
2546+
}
2547+
onlineUnsafeRecoveryStatus.Set(0)
2548+
// If the halt-scheduling is set, it should not allow scheduling.
2549+
if c.opt.IsSchedulingHalted() {
2550+
haltSchedulingStatus.Set(1)
2551+
return false, errs.ErrSchedulingIsHalted.FastGenByArgs()
2552+
}
2553+
haltSchedulingStatus.Set(0)
2554+
return true, nil
2555+
}

server/cluster/cluster_worker.go

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -43,8 +43,8 @@ func (c *RaftCluster) HandleRegionHeartbeat(region *core.RegionInfo) error {
4343

4444
// HandleAskSplit handles the split request.
4545
func (c *RaftCluster) HandleAskSplit(request *pdpb.AskSplitRequest) (*pdpb.AskSplitResponse, error) {
46-
if c.GetUnsafeRecoveryController().IsRunning() {
47-
return nil, errs.ErrUnsafeRecoveryIsRunning.FastGenByArgs()
46+
if allowed, err := c.CheckSchedulingAllowance(); !allowed {
47+
return nil, err
4848
}
4949
if !c.opt.IsTikvRegionSplitEnabled() {
5050
return nil, errs.ErrSchedulerTiKVSplitDisabled.FastGenByArgs()
@@ -105,8 +105,8 @@ func (c *RaftCluster) ValidRequestRegion(reqRegion *metapb.Region) error {
105105

106106
// HandleAskBatchSplit handles the batch split request.
107107
func (c *RaftCluster) HandleAskBatchSplit(request *pdpb.AskBatchSplitRequest) (*pdpb.AskBatchSplitResponse, error) {
108-
if c.GetUnsafeRecoveryController().IsRunning() {
109-
return nil, errs.ErrUnsafeRecoveryIsRunning.FastGenByArgs()
108+
if allowed, err := c.CheckSchedulingAllowance(); !allowed {
109+
return nil, err
110110
}
111111
if !c.opt.IsTikvRegionSplitEnabled() {
112112
return nil, errs.ErrSchedulerTiKVSplitDisabled.FastGenByArgs()

server/cluster/coordinator.go

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -129,8 +129,7 @@ func (c *coordinator) patrolRegions() {
129129
log.Info("patrol regions has been stopped")
130130
return
131131
}
132-
if c.cluster.GetUnsafeRecoveryController().IsRunning() {
133-
// Skip patrolling regions during unsafe recovery.
132+
if allowed, _ := c.cluster.CheckSchedulingAllowance(); !allowed {
134133
continue
135134
}
136135

@@ -533,7 +532,7 @@ func (c *coordinator) collectSchedulerMetrics() {
533532
var allowScheduler float64
534533
// If the scheduler is not allowed to schedule, it will disappear in Grafana panel.
535534
// See issue #1341.
536-
if !s.IsPaused() && !s.cluster.GetUnsafeRecoveryController().IsRunning() {
535+
if allowed, _ := s.cluster.CheckSchedulingAllowance(); !s.IsPaused() && allowed {
537536
allowScheduler = 1
538537
}
539538
schedulerStatusGauge.WithLabelValues(s.GetName(), "allow").Set(allowScheduler)
@@ -939,7 +938,14 @@ func (s *scheduleController) AllowSchedule(diagnosable bool) bool {
939938
}
940939
return false
941940
}
942-
if s.IsPaused() || s.cluster.GetUnsafeRecoveryController().IsRunning() {
941+
allowed, _ := s.cluster.CheckSchedulingAllowance()
942+
if !allowed {
943+
if diagnosable {
944+
s.diagnosticRecorder.setResultFromStatus(halted)
945+
}
946+
return false
947+
}
948+
if s.IsPaused() {
943949
if diagnosable {
944950
s.diagnosticRecorder.setResultFromStatus(paused)
945951
}

server/cluster/diagnostic_manager.go

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,8 @@ const (
3333
disabled = "disabled"
3434
// paused means the current scheduler is paused
3535
paused = "paused"
36+
// halted means the current scheduler is halted
37+
halted = "halted"
3638
// scheduling means the current scheduler is generating.
3739
scheduling = "scheduling"
3840
// pending means the current scheduler cannot generate scheduling operator

server/cluster/metrics.go

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -135,6 +135,14 @@ var (
135135
Name: "store_sync",
136136
Help: "The state of store sync config",
137137
}, []string{"address", "state"})
138+
139+
schedulingAllowanceStatusGauge = prometheus.NewGaugeVec(
140+
prometheus.GaugeOpts{
141+
Namespace: "pd",
142+
Subsystem: "scheduling",
143+
Name: "allowance_status",
144+
Help: "Status of the scheduling allowance.",
145+
}, []string{"kind"})
138146
)
139147

140148
func init() {
@@ -143,6 +151,7 @@ func init() {
143151
prometheus.MustRegister(schedulerStatusGauge)
144152
prometheus.MustRegister(hotSpotStatusGauge)
145153
prometheus.MustRegister(patrolCheckRegionsGauge)
154+
prometheus.MustRegister(schedulingAllowanceStatusGauge)
146155
prometheus.MustRegister(clusterStateCPUGauge)
147156
prometheus.MustRegister(clusterStateCurrent)
148157
prometheus.MustRegister(regionListGauge)

server/config/config.go

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -247,6 +247,7 @@ const (
247247
defaultEnableGRPCGateway = true
248248
defaultDisableErrorVerbose = true
249249
defaultEnableWitness = false
250+
defaultHaltScheduling = false
250251

251252
defaultDashboardAddress = "auto"
252253

@@ -772,6 +773,10 @@ type ScheduleConfig struct {
772773

773774
// EnableWitness is the option to enable using witness
774775
EnableWitness bool `toml:"enable-witness" json:"enable-witness,string"`
776+
777+
// HaltScheduling is the option to halt the scheduling. Once it's on, PD will halt the scheduling,
778+
// and any other scheduling configs will be ignored.
779+
HaltScheduling bool `toml:"halt-scheduling" json:"halt-scheduling,string,omitempty"`
775780
}
776781

777782
// Clone returns a cloned scheduling configuration.
@@ -895,6 +900,10 @@ func (c *ScheduleConfig) adjust(meta *configMetaData, reloading bool) error {
895900
adjustString(&c.RegionScoreFormulaVersion, defaultRegionScoreFormulaVersion)
896901
}
897902

903+
if !meta.IsDefined("halt-scheduling") {
904+
c.HaltScheduling = defaultHaltScheduling
905+
}
906+
898907
adjustSchedulers(&c.Schedulers, DefaultSchedulers)
899908

900909
for k, b := range c.migrateConfigurationMap() {

0 commit comments

Comments
 (0)