errors.toml: 5 additions & 0 deletions
@@ -91,6 +91,11 @@ error = '''
 TiKV cluster not bootstrapped, please start TiKV first
 '''
 
+["PD:cluster:ErrSchedulingIsHalted"]
+error = '''
+scheduling is halted
+'''
+
 ["PD:cluster:ErrStoreIsUp"]
 error = '''
 store is still up, please remove store gracefully
pkg/errs/errno.go: 4 additions & 3 deletions
@@ -139,9 +139,10 @@ var (

 // cluster errors
 var (
-	ErrNotBootstrapped = errors.Normalize("TiKV cluster not bootstrapped, please start TiKV first", errors.RFCCodeText("PD:cluster:ErrNotBootstrapped"))
-	ErrStoreIsUp       = errors.Normalize("store is still up, please remove store gracefully", errors.RFCCodeText("PD:cluster:ErrStoreIsUp"))
-	ErrInvalidStoreID  = errors.Normalize("invalid store id %d, not found", errors.RFCCodeText("PD:cluster:ErrInvalidStoreID"))
+	ErrNotBootstrapped    = errors.Normalize("TiKV cluster not bootstrapped, please start TiKV first", errors.RFCCodeText("PD:cluster:ErrNotBootstrapped"))
+	ErrStoreIsUp          = errors.Normalize("store is still up, please remove store gracefully", errors.RFCCodeText("PD:cluster:ErrStoreIsUp"))
+	ErrInvalidStoreID     = errors.Normalize("invalid store id %d, not found", errors.RFCCodeText("PD:cluster:ErrInvalidStoreID"))
+	ErrSchedulingIsHalted = errors.Normalize("scheduling is halted", errors.RFCCodeText("PD:cluster:ErrSchedulingIsHalted"))
 )
 
 // versioninfo errors
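The normalized error carries the RFC code, so callers can distinguish the new halt error from other cluster errors. Below is a minimal sketch using the pingcap/errors helpers this file already relies on; askSplit is a hypothetical stand-in for a real PD handler:

package main

import (
	"fmt"

	"github.com/tikv/pd/pkg/errs"
)

// askSplit is a hypothetical stand-in for a PD handler that can reject
// requests while scheduling is halted.
func askSplit() error {
	return errs.ErrSchedulingIsHalted.FastGenByArgs()
}

func main() {
	if err := askSplit(); err != nil {
		// Equal matches on the normalized RFC code,
		// here "PD:cluster:ErrSchedulingIsHalted".
		if errs.ErrSchedulingIsHalted.Equal(err) {
			fmt.Println("scheduling is halted, retry later")
		}
	}
}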
pkg/schedule/config/config.go: 1 addition & 1 deletion
@@ -86,7 +86,7 @@ type Config interface {
 	SetSplitMergeInterval(time.Duration)
 	SetMaxReplicas(int)
 	SetPlacementRulesCacheEnabled(bool)
-	SetWitnessEnabled(bool)
+	SetEnableWitness(bool)
 	// only for store configuration
 	UseRaftV2()
 }
pkg/schedule/placement/rule_manager_test.go: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ func newTestManager(t *testing.T, enableWitness bool) (endpoint.RuleStorage, *Ru
 	store := endpoint.NewStorageEndpoint(kv.NewMemoryKV(), nil)
 	var err error
 	manager := NewRuleManager(store, nil, mockconfig.NewTestOptions())
-	manager.conf.SetWitnessEnabled(enableWitness)
+	manager.conf.SetEnableWitness(enableWitness)
 	err = manager.Initialize(3, []string{"zone", "rack", "host"})
 	re.NoError(err)
 	return store, manager
server/cluster/cluster.go: 13 additions & 0 deletions
@@ -2733,3 +2733,16 @@ func (c *RaftCluster) GetPausedSchedulerDelayAt(name string) (int64, error) {
 func (c *RaftCluster) GetPausedSchedulerDelayUntil(name string) (int64, error) {
 	return c.coordinator.getPausedSchedulerDelayUntil(name)
 }
+
+// checkSchedulingAllowance checks whether the cluster currently allows scheduling.
+func (c *RaftCluster) checkSchedulingAllowance() (bool, error) {
+	// Scheduling is not allowed while online unsafe recovery is running.
+	if c.GetUnsafeRecoveryController().IsRunning() {
+		return false, errs.ErrUnsafeRecoveryIsRunning.FastGenByArgs()
+	}
+	// Scheduling is not allowed once halt-scheduling is set.
+	if c.opt.IsSchedulingHalted() {
+		return false, errs.ErrSchedulingIsHalted.FastGenByArgs()
+	}
+	return true, nil
+}
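Returning (bool, error) lets RPC handlers surface the exact reason to the client while background loops branch only on the boolean. A self-contained sketch of the two call-site patterns, with plain flags standing in for the real RaftCluster state:

package main

import (
	"errors"
	"fmt"
)

// checkSchedulingAllowance mirrors the method above; the recovering and
// halted flags are hypothetical stand-ins for the cluster state.
func checkSchedulingAllowance(recovering, halted bool) (bool, error) {
	if recovering {
		return false, errors.New("unsafe recovery is running")
	}
	if halted {
		return false, errors.New("scheduling is halted")
	}
	return true, nil
}

func main() {
	// Handler style: propagate the reason to the caller.
	if allowed, err := checkSchedulingAllowance(false, true); !allowed {
		fmt.Println("reject request:", err)
	}
	// Background-loop style: only the boolean matters.
	if allowed, _ := checkSchedulingAllowance(false, true); !allowed {
		fmt.Println("skip this round")
	}
}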
server/cluster/cluster_worker.go: 4 additions & 4 deletions
@@ -43,8 +43,8 @@ func (c *RaftCluster) HandleRegionHeartbeat(region *core.RegionInfo) error {

 // HandleAskSplit handles the split request.
 func (c *RaftCluster) HandleAskSplit(request *pdpb.AskSplitRequest) (*pdpb.AskSplitResponse, error) {
-	if c.GetUnsafeRecoveryController().IsRunning() {
-		return nil, errs.ErrUnsafeRecoveryIsRunning.FastGenByArgs()
+	if allowed, err := c.checkSchedulingAllowance(); !allowed {
+		return nil, err
 	}
 	if !c.opt.IsTikvRegionSplitEnabled() {
 		return nil, errs.ErrSchedulerTiKVSplitDisabled.FastGenByArgs()
@@ -105,8 +105,8 @@ func (c *RaftCluster) ValidRequestRegion(reqRegion *metapb.Region) error {

 // HandleAskBatchSplit handles the batch split request.
 func (c *RaftCluster) HandleAskBatchSplit(request *pdpb.AskBatchSplitRequest) (*pdpb.AskBatchSplitResponse, error) {
-	if c.GetUnsafeRecoveryController().IsRunning() {
-		return nil, errs.ErrUnsafeRecoveryIsRunning.FastGenByArgs()
+	if allowed, err := c.checkSchedulingAllowance(); !allowed {
+		return nil, err
 	}
 	if !c.opt.IsTikvRegionSplitEnabled() {
 		return nil, errs.ErrSchedulerTiKVSplitDisabled.FastGenByArgs()
server/cluster/coordinator.go: 3 additions & 4 deletions
@@ -135,8 +135,7 @@ func (c *coordinator) patrolRegions() {
 			log.Info("patrol regions has been stopped")
 			return
 		}
-		if c.cluster.GetUnsafeRecoveryController().IsRunning() {
-			// Skip patrolling regions during unsafe recovery.
+		if allowed, _ := c.cluster.checkSchedulingAllowance(); !allowed {
 			continue
 		}

@@ -540,7 +539,7 @@ func (c *coordinator) collectSchedulerMetrics() {
 		var allowScheduler float64
 		// If the scheduler is not allowed to schedule, it will disappear in Grafana panel.
 		// See issue #1341.
-		if !s.IsPaused() && !s.cluster.GetUnsafeRecoveryController().IsRunning() {
+		if allowed, _ := s.cluster.checkSchedulingAllowance(); !s.IsPaused() && allowed {
 			allowScheduler = 1
 		}
 		schedulerStatusGauge.WithLabelValues(s.GetName(), "allow").Set(allowScheduler)
@@ -947,7 +946,7 @@ func (s *scheduleController) AllowSchedule(diagnosable bool) bool {
 		}
 		return false
 	}
-	if s.IsPaused() || s.cluster.GetUnsafeRecoveryController().IsRunning() {
+	if allowed, _ := s.cluster.checkSchedulingAllowance(); s.IsPaused() || !allowed {
 		if diagnosable {
 			s.diagnosticRecorder.setResultFromStatus(paused)
 		}
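Note the polarity at each call site: the metrics path counts a scheduler as allowed only when it is neither paused nor globally disallowed, while AllowSchedule bails out when it is paused or disallowed. A toy sketch of the gauge condition, with hypothetical inputs:

package main

import "fmt"

// allowGauge mirrors the collectSchedulerMetrics condition above: 1 only
// when the scheduler is not paused and scheduling is globally allowed.
func allowGauge(paused, allowed bool) float64 {
	if !paused && allowed {
		return 1
	}
	return 0
}

func main() {
	fmt.Println(allowGauge(false, true))  // 1: schedulable
	fmt.Println(allowGauge(false, false)) // 0: halted or in unsafe recovery
	fmt.Println(allowGauge(true, true))   // 0: paused
}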
server/config/config.go: 9 additions & 0 deletions
@@ -213,6 +213,7 @@ const (
 	defaultEnableGRPCGateway   = true
 	defaultDisableErrorVerbose = true
 	defaultEnableWitness       = false
+	defaultHaltScheduling      = false
 
 	defaultDashboardAddress = "auto"

@@ -684,6 +685,10 @@ type ScheduleConfig struct {
 	// v1: which is based on the region count by rate limit.
 	// v2: which is based on region size by window size.
 	StoreLimitVersion string `toml:"store-limit-version" json:"store-limit-version,omitempty"`
+
+	// HaltScheduling is the option to halt all scheduling. Once it's on, PD stops
+	// scheduling entirely, and any other scheduling configs are ignored.
+	HaltScheduling bool `toml:"halt-scheduling" json:"halt-scheduling,string,omitempty"`
Member:
Previously, I tried to introduce a scheduling mode to cover this case. For me, it's OK to use an individual config to control it; maybe we could name it enable-scheduling or something similar.

Member Author (@JmPotato, May 22, 2023):
I think it's best to control the global scheduling switch with a configuration whose default value is false, to avoid unexpected behavior in compatibility-sensitive scenarios such as upgrades. From that perspective, names like "disable" or "halt" are more appropriate. This global shutdown of scheduling is also not meant to be long-term, and we already have the concept and operation of "pause" for schedulers, so I ultimately chose the word "halt". WDYT?

Member:
While working on #6553, I found it may be better to use one config for both unsafe recovery and halt, so that we can decouple the dependency between cluster and coordinator.

 }
 
 // Clone returns a cloned scheduling configuration.
@@ -820,6 +825,10 @@ func (c *ScheduleConfig) adjust(meta *configutil.ConfigMetaData, reloading bool)
 		configutil.AdjustString(&c.RegionScoreFormulaVersion, defaultRegionScoreFormulaVersion)
 	}
 
+	if !meta.IsDefined("halt-scheduling") {
+		c.HaltScheduling = defaultHaltScheduling
+	}
+
 	adjustSchedulers(&c.Schedulers, DefaultSchedulers)
 
 	for k, b := range c.migrateConfigurationMap() {
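Because the field has a toml tag and a default applied in adjust, it should be settable from the PD config file like any other schedule option. A hypothetical pd.toml fragment (the [schedule] section name follows PD's existing config layout):

# Halt all scheduling at startup; defaults to false when omitted.
[schedule]
halt-scheduling = true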
server/config/persist_options.go: 12 additions & 7 deletions
@@ -184,13 +184,6 @@ func (o *PersistOptions) SetPlacementRulesCacheEnabled(enabled bool) {
 	o.SetReplicationConfig(v)
 }
 
-// SetWitnessEnabled set EanbleWitness
-func (o *PersistOptions) SetWitnessEnabled(enabled bool) {
-	v := o.GetScheduleConfig().Clone()
-	v.EnableWitness = enabled
-	o.SetScheduleConfig(v)
-}
-
 // GetStrictlyMatchLabel returns whether check label strict.
 func (o *PersistOptions) GetStrictlyMatchLabel() bool {
 	return o.GetReplicationConfig().StrictlyMatchLabel
@@ -926,3 +919,15 @@ func (o *PersistOptions) SetAllStoresLimitTTL(ctx context.Context, client *clien
 	}
 	return err
 }
+
+// SetHaltScheduling sets HaltScheduling.
+func (o *PersistOptions) SetHaltScheduling(halt bool) {
+	v := o.GetScheduleConfig().Clone()
+	v.HaltScheduling = halt
+	o.SetScheduleConfig(v)
+}
+
+// IsSchedulingHalted returns whether PD scheduling is halted.
+func (o *PersistOptions) IsSchedulingHalted() bool {
+	return o.GetScheduleConfig().HaltScheduling
+}
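Taken together, the new PersistOptions accessors give a simple round trip. A minimal sketch, assuming server/config's exported NewConfig and NewPersistOptions constructors behave as in this diff:

package main

import (
	"fmt"

	"github.com/tikv/pd/server/config"
)

func main() {
	opt := config.NewPersistOptions(config.NewConfig())
	fmt.Println(opt.IsSchedulingHalted()) // false: defaultHaltScheduling

	opt.SetHaltScheduling(true) // clones the schedule config and stores the updated copy
	fmt.Println(opt.IsSchedulingHalted()) // true
}

On a running cluster the same switch would presumably be flipped through the config API or pd-ctl (schedule-config fields are generally settable by their toml names, e.g. config set halt-scheduling true) rather than by calling these setters directly.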