errors.toml: 5 additions & 0 deletions
@@ -91,6 +91,11 @@ error = '''
 TiKV cluster not bootstrapped, please start TiKV first
 '''
 
+["PD:cluster:ErrSchedulingIsHalted"]
+error = '''
+scheduling is halted
+'''
+
 ["PD:cluster:ErrStoreIsUp"]
 error = '''
 store is still up, please remove store gracefully
pkg/errs/errno.go: 4 additions & 3 deletions
@@ -139,9 +139,10 @@ var (

 // cluster errors
 var (
-	ErrNotBootstrapped = errors.Normalize("TiKV cluster not bootstrapped, please start TiKV first", errors.RFCCodeText("PD:cluster:ErrNotBootstrapped"))
-	ErrStoreIsUp       = errors.Normalize("store is still up, please remove store gracefully", errors.RFCCodeText("PD:cluster:ErrStoreIsUp"))
-	ErrInvalidStoreID  = errors.Normalize("invalid store id %d, not found", errors.RFCCodeText("PD:cluster:ErrInvalidStoreID"))
+	ErrNotBootstrapped    = errors.Normalize("TiKV cluster not bootstrapped, please start TiKV first", errors.RFCCodeText("PD:cluster:ErrNotBootstrapped"))
+	ErrStoreIsUp          = errors.Normalize("store is still up, please remove store gracefully", errors.RFCCodeText("PD:cluster:ErrStoreIsUp"))
+	ErrInvalidStoreID     = errors.Normalize("invalid store id %d, not found", errors.RFCCodeText("PD:cluster:ErrInvalidStoreID"))
+	ErrSchedulingIsHalted = errors.Normalize("scheduling is halted", errors.RFCCodeText("PD:cluster:ErrSchedulingIsHalted"))
 )
 
 // versioninfo errors
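The normalized error carries the RFC code, so callers can distinguish the new halt error from other cluster errors. Below is a minimal sketch using the pingcap/errors helpers this file already relies on; askSplit is a hypothetical stand-in for a real PD handler:

package main

import (
	"fmt"

	"github.com/tikv/pd/pkg/errs"
)

// askSplit is a hypothetical stand-in for a PD handler that can reject
// requests while scheduling is halted.
func askSplit() error {
	return errs.ErrSchedulingIsHalted.FastGenByArgs()
}

func main() {
	if err := askSplit(); err != nil {
		// Equal matches on the normalized RFC code,
		// here "PD:cluster:ErrSchedulingIsHalted".
		if errs.ErrSchedulingIsHalted.Equal(err) {
			fmt.Println("scheduling is halted, retry later")
		}
	}
}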
pkg/schedule/config/config.go: 1 addition & 1 deletion
@@ -86,7 +86,7 @@ type Config interface {
 	SetSplitMergeInterval(time.Duration)
 	SetMaxReplicas(int)
 	SetPlacementRulesCacheEnabled(bool)
-	SetWitnessEnabled(bool)
+	SetEnableWitness(bool)
 	// only for store configuration
 	UseRaftV2()
 }
pkg/schedule/placement/rule_manager_test.go: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ func newTestManager(t *testing.T, enableWitness bool) (endpoint.RuleStorage, *Ru
 	store := endpoint.NewStorageEndpoint(kv.NewMemoryKV(), nil)
 	var err error
 	manager := NewRuleManager(store, nil, mockconfig.NewTestOptions())
-	manager.conf.SetWitnessEnabled(enableWitness)
+	manager.conf.SetEnableWitness(enableWitness)
 	err = manager.Initialize(3, []string{"zone", "rack", "host"})
 	re.NoError(err)
 	return store, manager
server/cluster/cluster.go: 13 additions & 0 deletions
@@ -2733,3 +2733,16 @@ func (c *RaftCluster) GetPausedSchedulerDelayAt(name string) (int64, error) {
 func (c *RaftCluster) GetPausedSchedulerDelayUntil(name string) (int64, error) {
 	return c.coordinator.getPausedSchedulerDelayUntil(name)
 }
+
+// checkSchedulingAllowance checks whether the cluster currently allows scheduling.
+func (c *RaftCluster) checkSchedulingAllowance() (bool, error) {
+	// Scheduling is not allowed while online unsafe recovery is running.
+	if c.GetUnsafeRecoveryController().IsRunning() {
+		return false, errs.ErrUnsafeRecoveryIsRunning.FastGenByArgs()
+	}
+	// Scheduling is not allowed once halt-scheduling is set.
+	if c.opt.IsSchedulingHalted() {
+		return false, errs.ErrSchedulingIsHalted.FastGenByArgs()
+	}
+	return true, nil
+}
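Returning (bool, error) lets RPC handlers surface the exact reason to the client while background loops branch only on the boolean. A self-contained sketch of the two call-site patterns, with plain flags standing in for the real RaftCluster state:

package main

import (
	"errors"
	"fmt"
)

// checkSchedulingAllowance mirrors the method above; the recovering and
// halted flags are hypothetical stand-ins for the cluster state.
func checkSchedulingAllowance(recovering, halted bool) (bool, error) {
	if recovering {
		return false, errors.New("unsafe recovery is running")
	}
	if halted {
		return false, errors.New("scheduling is halted")
	}
	return true, nil
}

func main() {
	// Handler style: propagate the reason to the caller.
	if allowed, err := checkSchedulingAllowance(false, true); !allowed {
		fmt.Println("reject request:", err)
	}
	// Background-loop style: only the boolean matters.
	if allowed, _ := checkSchedulingAllowance(false, true); !allowed {
		fmt.Println("skip this round")
	}
}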
server/cluster/cluster_worker.go: 4 additions & 4 deletions
@@ -43,8 +43,8 @@ func (c *RaftCluster) HandleRegionHeartbeat(region *core.RegionInfo) error {

 // HandleAskSplit handles the split request.
 func (c *RaftCluster) HandleAskSplit(request *pdpb.AskSplitRequest) (*pdpb.AskSplitResponse, error) {
-	if c.GetUnsafeRecoveryController().IsRunning() {
-		return nil, errs.ErrUnsafeRecoveryIsRunning.FastGenByArgs()
+	if allowed, err := c.checkSchedulingAllowance(); !allowed {
+		return nil, err
 	}
 	if !c.opt.IsTikvRegionSplitEnabled() {
 		return nil, errs.ErrSchedulerTiKVSplitDisabled.FastGenByArgs()
@@ -105,8 +105,8 @@ func (c *RaftCluster) ValidRequestRegion(reqRegion *metapb.Region) error {

 // HandleAskBatchSplit handles the batch split request.
 func (c *RaftCluster) HandleAskBatchSplit(request *pdpb.AskBatchSplitRequest) (*pdpb.AskBatchSplitResponse, error) {
-	if c.GetUnsafeRecoveryController().IsRunning() {
-		return nil, errs.ErrUnsafeRecoveryIsRunning.FastGenByArgs()
+	if allowed, err := c.checkSchedulingAllowance(); !allowed {
+		return nil, err
 	}
 	if !c.opt.IsTikvRegionSplitEnabled() {
 		return nil, errs.ErrSchedulerTiKVSplitDisabled.FastGenByArgs()
server/cluster/coordinator.go: 3 additions & 4 deletions
@@ -135,8 +135,7 @@ func (c *coordinator) patrolRegions() {
 			log.Info("patrol regions has been stopped")
 			return
 		}
-		if c.cluster.GetUnsafeRecoveryController().IsRunning() {
-			// Skip patrolling regions during unsafe recovery.
+		if allowed, _ := c.cluster.checkSchedulingAllowance(); !allowed {
 			continue
 		}

@@ -540,7 +539,7 @@ func (c *coordinator) collectSchedulerMetrics() {
 		var allowScheduler float64
 		// If the scheduler is not allowed to schedule, it will disappear in Grafana panel.
 		// See issue #1341.
-		if !s.IsPaused() && !s.cluster.GetUnsafeRecoveryController().IsRunning() {
+		if allowed, _ := s.cluster.checkSchedulingAllowance(); !s.IsPaused() && allowed {
 			allowScheduler = 1
 		}
 		schedulerStatusGauge.WithLabelValues(s.GetName(), "allow").Set(allowScheduler)
@@ -947,7 +946,7 @@ func (s *scheduleController) AllowSchedule(diagnosable bool) bool {
 		}
 		return false
 	}
-	if s.IsPaused() || s.cluster.GetUnsafeRecoveryController().IsRunning() {
+	if allowed, _ := s.cluster.checkSchedulingAllowance(); s.IsPaused() || !allowed {
 		if diagnosable {
 			s.diagnosticRecorder.setResultFromStatus(paused)
 		}
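Note the polarity at each call site: the metrics path counts a scheduler as allowed only when it is neither paused nor globally disallowed, while AllowSchedule bails out when it is paused or disallowed. A toy sketch of the gauge condition, with hypothetical inputs:

package main

import "fmt"

// allowGauge mirrors the collectSchedulerMetrics condition above: 1 only
// when the scheduler is not paused and scheduling is globally allowed.
func allowGauge(paused, allowed bool) float64 {
	if !paused && allowed {
		return 1
	}
	return 0
}

func main() {
	fmt.Println(allowGauge(false, true))  // 1: schedulable
	fmt.Println(allowGauge(false, false)) // 0: halted or in unsafe recovery
	fmt.Println(allowGauge(true, true))   // 0: paused
}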
server/config/config.go: 9 additions & 0 deletions
@@ -213,6 +213,7 @@ const (
 	defaultEnableGRPCGateway   = true
 	defaultDisableErrorVerbose = true
 	defaultEnableWitness       = false
+	defaultHaltScheduling      = false
 
 	defaultDashboardAddress = "auto"

@@ -684,6 +685,10 @@ type ScheduleConfig struct {
 	// v1: which is based on the region count by rate limit.
 	// v2: which is based on region size by window size.
 	StoreLimitVersion string `toml:"store-limit-version" json:"store-limit-version,omitempty"`
+
+	// HaltScheduling is the option to halt all scheduling. Once it's on, PD stops
+	// scheduling entirely, and any other scheduling configs are ignored.
+	HaltScheduling bool `toml:"halt-scheduling" json:"halt-scheduling,string,omitempty"`
Member:
Previously, I tried to introduce a scheduling mode to cover this case. For me, it's OK to use an individual config to control it; maybe we could name it enable-scheduling or something similar.

Member Author (@JmPotato, May 22, 2023):
I think it's best to control the global scheduling switch with a configuration whose default value is false, to avoid unexpected behavior in compatibility-sensitive scenarios such as upgrades. From that perspective, names like "disable" or "halt" are more appropriate. This global shutdown of scheduling is also not meant to be long-term, and we already have the concept and operation of "pause" for schedulers, so I ultimately chose the word "halt". WDYT?

Member:
While working on #6553, I found it may be better to use one config for both unsafe recovery and halt, so that we can decouple the dependency between cluster and coordinator.

 }
 
 // Clone returns a cloned scheduling configuration.
@@ -820,6 +825,10 @@ func (c *ScheduleConfig) adjust(meta *configutil.ConfigMetaData, reloading bool)
 		configutil.AdjustString(&c.RegionScoreFormulaVersion, defaultRegionScoreFormulaVersion)
 	}
 
+	if !meta.IsDefined("halt-scheduling") {
+		c.HaltScheduling = defaultHaltScheduling
+	}
+
 	adjustSchedulers(&c.Schedulers, DefaultSchedulers)
 
 	for k, b := range c.migrateConfigurationMap() {
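Because the field has a toml tag and a default applied in adjust, it should be settable from the PD config file like any other schedule option. A hypothetical pd.toml fragment (the [schedule] section name follows PD's existing config layout):

# Halt all scheduling at startup; defaults to false when omitted.
[schedule]
halt-scheduling = true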
server/config/persist_options.go: 12 additions & 7 deletions
@@ -184,13 +184,6 @@ func (o *PersistOptions) SetPlacementRulesCacheEnabled(enabled bool) {
 	o.SetReplicationConfig(v)
 }
 
-// SetWitnessEnabled set EanbleWitness
-func (o *PersistOptions) SetWitnessEnabled(enabled bool) {
-	v := o.GetScheduleConfig().Clone()
-	v.EnableWitness = enabled
-	o.SetScheduleConfig(v)
-}
-
 // GetStrictlyMatchLabel returns whether check label strict.
 func (o *PersistOptions) GetStrictlyMatchLabel() bool {
 	return o.GetReplicationConfig().StrictlyMatchLabel
@@ -926,3 +919,15 @@ func (o *PersistOptions) SetAllStoresLimitTTL(ctx context.Context, client *clien
 	}
 	return err
 }
+
+// SetHaltScheduling sets HaltScheduling.
+func (o *PersistOptions) SetHaltScheduling(halt bool) {
+	v := o.GetScheduleConfig().Clone()
+	v.HaltScheduling = halt
+	o.SetScheduleConfig(v)
+}
+
+// IsSchedulingHalted returns whether PD scheduling is halted.
+func (o *PersistOptions) IsSchedulingHalted() bool {
+	return o.GetScheduleConfig().HaltScheduling
+}
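Taken together, the new PersistOptions accessors give a simple round trip. A minimal sketch, assuming server/config's exported NewConfig and NewPersistOptions constructors behave as in this diff:

package main

import (
	"fmt"

	"github.com/tikv/pd/server/config"
)

func main() {
	opt := config.NewPersistOptions(config.NewConfig())
	fmt.Println(opt.IsSchedulingHalted()) // false: defaultHaltScheduling

	opt.SetHaltScheduling(true) // clones the schedule config and stores the updated copy
	fmt.Println(opt.IsSchedulingHalted()) // true
}

On a running cluster the same switch would presumably be flipped through the config API or pd-ctl (schedule-config fields are generally settable by their toml names, e.g. config set halt-scheduling true) rather than by calling these setters directly.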