Skip to content

Commit fbdddff

Browse files
JmPotatorleungx
authored andcommitted
config, cluster: add an option to halt the cluster scheduling (tikv#6498)
ref tikv#6493 Add an option to halt the cluster scheduling. Signed-off-by: JmPotato <ghzpotato@gmail.com>
1 parent 8582fbd commit fbdddff

File tree

14 files changed

+191
-32
lines changed

14 files changed

+191
-32
lines changed

errors.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,11 @@ error = '''
9696
TiKV cluster not bootstrapped, please start TiKV first
9797
'''
9898

99+
["PD:cluster:ErrSchedulingIsHalted"]
100+
error = '''
101+
scheduling is halted
102+
'''
103+
99104
["PD:cluster:ErrStoreIsUp"]
100105
error = '''
101106
store is still up, please remove store gracefully

metrics/grafana/pd.json

Lines changed: 108 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2340,6 +2340,113 @@
23402340
"alignLevel": null
23412341
}
23422342
},
2343+
{
2344+
"aliasColors": {},
2345+
"bars": true,
2346+
"dashLength": 10,
2347+
"dashes": false,
2348+
"datasource": "${DS_TEST-CLUSTER}",
2349+
"description": "The allowance status of the scheduling.",
2350+
"fieldConfig": {
2351+
"defaults": {},
2352+
"overrides": []
2353+
},
2354+
"fill": 0,
2355+
"fillGradient": 0,
2356+
"gridPos": {
2357+
"h": 8,
2358+
"w": 12,
2359+
"x": 12,
2360+
"y": 41
2361+
},
2362+
"hiddenSeries": false,
2363+
"id": 1464,
2364+
"legend": {
2365+
"alignAsTable": true,
2366+
"avg": false,
2367+
"current": true,
2368+
"hideEmpty": true,
2369+
"hideZero": true,
2370+
"max": false,
2371+
"min": false,
2372+
"rightSide": true,
2373+
"show": true,
2374+
"total": false,
2375+
"values": true
2376+
},
2377+
"lines": false,
2378+
"linewidth": 1,
2379+
"links": [],
2380+
"nullPointMode": "null",
2381+
"options": {
2382+
"alertThreshold": true
2383+
},
2384+
"paceLength": 10,
2385+
"percentage": false,
2386+
"pluginVersion": "7.5.10",
2387+
"pointradius": 1,
2388+
"points": false,
2389+
"renderer": "flot",
2390+
"seriesOverrides": [],
2391+
"spaceLength": 10,
2392+
"stack": true,
2393+
"steppedLine": false,
2394+
"targets": [
2395+
{
2396+
"exemplar": true,
2397+
"expr": "pd_scheduling_allowance_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\"}",
2398+
"format": "time_series",
2399+
"interval": "",
2400+
"intervalFactor": 2,
2401+
"legendFormat": "{{kind}}",
2402+
"metric": "pd_scheduling_allowance_status",
2403+
"refId": "A",
2404+
"step": 2
2405+
}
2406+
],
2407+
"thresholds": [],
2408+
"timeFrom": null,
2409+
"timeRegions": [],
2410+
"timeShift": null,
2411+
"title": "Scheduling Allowance Status",
2412+
"tooltip": {
2413+
"shared": true,
2414+
"sort": 1,
2415+
"value_type": "individual"
2416+
},
2417+
"type": "graph",
2418+
"xaxis": {
2419+
"buckets": null,
2420+
"mode": "time",
2421+
"name": null,
2422+
"show": true,
2423+
"values": []
2424+
},
2425+
"yaxes": [
2426+
{
2427+
"$$hashKey": "object:533",
2428+
"format": "short",
2429+
"label": null,
2430+
"logBase": 1,
2431+
"max": null,
2432+
"min": "0",
2433+
"show": true
2434+
},
2435+
{
2436+
"$$hashKey": "object:534",
2437+
"format": "short",
2438+
"label": null,
2439+
"logBase": 1,
2440+
"max": null,
2441+
"min": null,
2442+
"show": true
2443+
}
2444+
],
2445+
"yaxis": {
2446+
"align": false,
2447+
"alignLevel": null
2448+
}
2449+
},
23432450
{
23442451
"cacheTimeout": null,
23452452
"colorBackground": false,
@@ -2967,7 +3074,7 @@
29673074
"format": "time_series",
29683075
"intervalFactor": 2,
29693076
"legendFormat": "{{event}}",
2970-
"metric": "pd_scheduler_status",
3077+
"metric": "pd_schedule_operators_count",
29713078
"refId": "A",
29723079
"step": 4
29733080
}

pkg/errs/errno.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -142,9 +142,10 @@ var (
142142

143143
// cluster errors
144144
var (
145-
ErrNotBootstrapped = errors.Normalize("TiKV cluster not bootstrapped, please start TiKV first", errors.RFCCodeText("PD:cluster:ErrNotBootstrapped"))
146-
ErrStoreIsUp = errors.Normalize("store is still up, please remove store gracefully", errors.RFCCodeText("PD:cluster:ErrStoreIsUp"))
147-
ErrInvalidStoreID = errors.Normalize("invalid store id %d, not found", errors.RFCCodeText("PD:cluster:ErrInvalidStoreID"))
145+
ErrNotBootstrapped = errors.Normalize("TiKV cluster not bootstrapped, please start TiKV first", errors.RFCCodeText("PD:cluster:ErrNotBootstrapped"))
146+
ErrStoreIsUp = errors.Normalize("store is still up, please remove store gracefully", errors.RFCCodeText("PD:cluster:ErrStoreIsUp"))
147+
ErrInvalidStoreID = errors.Normalize("invalid store id %d, not found", errors.RFCCodeText("PD:cluster:ErrInvalidStoreID"))
148+
ErrSchedulingIsHalted = errors.Normalize("scheduling is halted", errors.RFCCodeText("PD:cluster:ErrSchedulingIsHalted"))
148149
)
149150

150151
// versioninfo errors

pkg/mock/mockcluster/mockcluster.go

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -104,11 +104,6 @@ func (mc *Cluster) GetAllocator() id.Allocator {
104104
return mc.IDAllocator
105105
}
106106

107-
// IsUnsafeRecovering returns if the cluster is in unsafe recovering.
108-
func (mc *Cluster) IsUnsafeRecovering() bool {
109-
return false
110-
}
111-
112107
// GetPersistOptions returns the persist options.
113108
func (mc *Cluster) GetPersistOptions() *config.PersistOptions {
114109
return mc.PersistOptions
@@ -123,6 +118,9 @@ func (mc *Cluster) IsSchedulerExisted(name string) (bool, error) { return false,
123118
// IsSchedulerDisabled checks if the scheduler with name is disabled or not.
124119
func (mc *Cluster) IsSchedulerDisabled(name string) (bool, error) { return false, nil }
125120

121+
// CheckSchedulingAllowance checks if the cluster allows scheduling currently.
122+
func (mc *Cluster) CheckSchedulingAllowance() (bool, error) { return true, nil }
123+
126124
// ScanRegions scans region with start key, until number greater than limit.
127125
func (mc *Cluster) ScanRegions(startKey, endKey []byte, limit int) []*core.RegionInfo {
128126
return mc.ScanRange(startKey, endKey, limit)

pkg/schedule/config/config.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ type Config interface {
8787
SetSplitMergeInterval(time.Duration)
8888
SetMaxReplicas(int)
8989
SetPlacementRulesCacheEnabled(bool)
90-
SetWitnessEnabled(bool)
90+
SetEnableWitness(bool)
9191
// only for store configuration
9292
UseRaftV2()
9393
}

pkg/schedule/coordinator.go

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -143,8 +143,7 @@ func (c *Coordinator) PatrolRegions() {
143143
log.Info("patrol regions has been stopped")
144144
return
145145
}
146-
if c.cluster.IsUnsafeRecovering() {
147-
// Skip patrolling regions during unsafe recovery.
146+
if allowed, _ := c.cluster.CheckSchedulingAllowance(); !allowed {
148147
continue
149148
}
150149

@@ -577,7 +576,7 @@ func (c *Coordinator) CollectSchedulerMetrics() {
577576
var allowScheduler float64
578577
// If the scheduler is not allowed to schedule, it will disappear in Grafana panel.
579578
// See issue #1341.
580-
if !s.IsPaused() && !s.cluster.IsUnsafeRecovering() {
579+
if allowed, _ := s.cluster.CheckSchedulingAllowance(); !s.IsPaused() && allowed {
581580
allowScheduler = 1
582581
}
583582
schedulerStatusGauge.WithLabelValues(s.Scheduler.GetName(), "allow").Set(allowScheduler)
@@ -1047,7 +1046,14 @@ func (s *scheduleController) AllowSchedule(diagnosable bool) bool {
10471046
}
10481047
return false
10491048
}
1050-
if s.IsPaused() || s.cluster.IsUnsafeRecovering() {
1049+
allowed, _ := s.cluster.CheckSchedulingAllowance()
1050+
if !allowed {
1051+
if diagnosable {
1052+
s.diagnosticRecorder.setResultFromStatus(halted)
1053+
}
1054+
return false
1055+
}
1056+
if s.IsPaused() {
10511057
if diagnosable {
10521058
s.diagnosticRecorder.setResultFromStatus(paused)
10531059
}

pkg/schedule/core/cluster_informer.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,8 @@ type ClusterInformer interface {
4444
UpdateRegionsLabelLevelStats(regions []*core.RegionInfo)
4545
IsSchedulerExisted(name string) (bool, error)
4646
IsSchedulerDisabled(name string) (bool, error)
47+
CheckSchedulingAllowance() (bool, error)
4748
GetPersistOptions() *config.PersistOptions
48-
IsUnsafeRecovering() bool
4949
}
5050

5151
// RegionHealthCluster is an aggregate interface that wraps multiple interfaces

pkg/schedule/diagnostic_manager.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ const (
3434
disabled = "disabled"
3535
// paused means the current scheduler is paused
3636
paused = "paused"
37+
// halted means the current scheduler is halted
38+
halted = "halted"
3739
// scheduling means the current scheduler is generating.
3840
scheduling = "scheduling"
3941
// pending means the current scheduler cannot generate scheduling operator

pkg/schedule/placement/rule_manager_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ func newTestManager(t *testing.T, enableWitness bool) (endpoint.RuleStorage, *Ru
3333
store := endpoint.NewStorageEndpoint(kv.NewMemoryKV(), nil)
3434
var err error
3535
manager := NewRuleManager(store, nil, mockconfig.NewTestOptions())
36-
manager.conf.SetWitnessEnabled(enableWitness)
36+
manager.conf.SetEnableWitness(enableWitness)
3737
err = manager.Initialize(3, []string{"zone", "rack", "host"}, "")
3838
re.NoError(err)
3939
return store, manager

server/cluster/cluster.go

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -828,11 +828,6 @@ func (c *RaftCluster) GetUnsafeRecoveryController() *unsaferecovery.Controller {
828828
return c.unsafeRecoveryController
829829
}
830830

831-
// IsUnsafeRecovering returns if the cluster is in unsafe recovering.
832-
func (c *RaftCluster) IsUnsafeRecovering() bool {
833-
return c.unsafeRecoveryController.IsRunning()
834-
}
835-
836831
// AddSuspectKeyRange adds the key range with the its ruleID as the key
837832
// The instance of each keyRange is like following format:
838833
// [2][]byte: start key/end key
@@ -2713,3 +2708,25 @@ func (c *RaftCluster) GetPausedSchedulerDelayAt(name string) (int64, error) {
27132708
func (c *RaftCluster) GetPausedSchedulerDelayUntil(name string) (int64, error) {
27142709
return c.coordinator.GetPausedSchedulerDelayUntil(name)
27152710
}
2711+
2712+
var (
2713+
onlineUnsafeRecoveryStatus = schedulingAllowanceStatusGauge.WithLabelValues("online-unsafe-recovery")
2714+
haltSchedulingStatus = schedulingAllowanceStatusGauge.WithLabelValues("halt-scheduling")
2715+
)
2716+
2717+
// CheckSchedulingAllowance checks if the cluster allows scheduling currently.
2718+
func (c *RaftCluster) CheckSchedulingAllowance() (bool, error) {
2719+
// If the cluster is in the process of online unsafe recovery, it should not allow scheduling.
2720+
if c.GetUnsafeRecoveryController().IsRunning() {
2721+
onlineUnsafeRecoveryStatus.Set(1)
2722+
return false, errs.ErrUnsafeRecoveryIsRunning.FastGenByArgs()
2723+
}
2724+
onlineUnsafeRecoveryStatus.Set(0)
2725+
// If the halt-scheduling is set, it should not allow scheduling.
2726+
if c.opt.IsSchedulingHalted() {
2727+
haltSchedulingStatus.Set(1)
2728+
return false, errs.ErrSchedulingIsHalted.FastGenByArgs()
2729+
}
2730+
haltSchedulingStatus.Set(0)
2731+
return true, nil
2732+
}

0 commit comments

Comments
 (0)