Skip to content

Commit 9450168

Browse files
authored
scheduler: graceful shutdown implement (#9720)
close #9719 Add an is_stopping status to the StoreHeartbeat message. When TiKV receives a SIGTERM, it sets this flag. This change adds a new evict-stopping-store-scheduler to PD, which is analogous to the evict-slow-store-scheduler. It proactively transfers leaders away from nodes by inspecting the is_stopping status from store heartbeats. Signed-off-by: hujiatao0 <hhjjtt110@gmail.com>
1 parent f8ab6e5 commit 9450168

File tree

23 files changed

+739
-77
lines changed

23 files changed

+739
-77
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ require (
3535
github.com/pingcap/errcode v0.3.0
3636
github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c
3737
github.com/pingcap/failpoint v0.0.0-20240528011301-b51a646c7c86
38-
github.com/pingcap/kvproto v0.0.0-20250923064352-8eeada0a8a03
38+
github.com/pingcap/kvproto v0.0.0-20250923091925-d79d11002599
3939
github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3
4040
github.com/pingcap/metering_sdk v0.0.0-20250918015914-468cd6feb1dc
4141
github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -469,8 +469,8 @@ github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c/go.mod h1:X2r9ue
469469
github.com/pingcap/failpoint v0.0.0-20240528011301-b51a646c7c86 h1:tdMsjOqUR7YXHoBitzdebTvOjs/swniBTOLy5XiMtuE=
470470
github.com/pingcap/failpoint v0.0.0-20240528011301-b51a646c7c86/go.mod h1:exzhVYca3WRtd6gclGNErRWb1qEgff3LYta0LvRmON4=
471471
github.com/pingcap/kvproto v0.0.0-20191211054548-3c6b38ea5107/go.mod h1:WWLmULLO7l8IOcQG+t+ItJ3fEcrL5FxF0Wu+HrMy26w=
472-
github.com/pingcap/kvproto v0.0.0-20250923064352-8eeada0a8a03 h1:G6lEpMBW42aNvstzIjNR3NnH6mx+3nIH42Ic1Sb8h/U=
473-
github.com/pingcap/kvproto v0.0.0-20250923064352-8eeada0a8a03/go.mod h1:rXxWk2UnwfUhLXha1jxRWPADw9eMZGWEWCg92Tgmb/8=
472+
github.com/pingcap/kvproto v0.0.0-20250923091925-d79d11002599 h1:57fBeBND/j/dp7nVlw+cWEwYlt4u8CAe4ApsmAEb1ow=
473+
github.com/pingcap/kvproto v0.0.0-20250923091925-d79d11002599/go.mod h1:rXxWk2UnwfUhLXha1jxRWPADw9eMZGWEWCg92Tgmb/8=
474474
github.com/pingcap/log v0.0.0-20210625125904-98ed8e2eb1c7/go.mod h1:8AanEdAHATuRurdGxZXBz0At+9avep+ub7U1AGYLIMM=
475475
github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8IDP+SZrdhV1Kibl9KrHxJ9eciw=
476476
github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4=

pkg/core/basic_cluster.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,9 @@ type StoreSetController interface {
149149
ResumeLeaderTransfer(id uint64, d constant.Direction)
150150

151151
SlowStoreEvicted(id uint64) error
152+
StoppingStoreEvicted(id uint64) error
152153
SlowStoreRecovered(id uint64)
154+
StoppingStoreRecovered(id uint64)
153155
SlowTrendEvicted(id uint64) error
154156
SlowTrendRecovered(id uint64)
155157
}

pkg/core/store.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,11 @@ func (s *StoreInfo) EvictedAsSlowStore() bool {
206206
return s.slowStoreEvicted.Load() > 0
207207
}
208208

209+
// EvictedAsStoppingStore returns if the store should be evicted as a stopping store.
210+
func (s *StoreInfo) EvictedAsStoppingStore() bool {
211+
return s.rawStats.IsStopping
212+
}
213+
209214
// IsEvictedAsSlowTrend returns if the store should be evicted as a slow store by trend.
210215
func (s *StoreInfo) IsEvictedAsSlowTrend() bool {
211216
return s.slowTrendEvicted.Load() > 0
@@ -295,6 +300,13 @@ func (s *StoreInfo) IsSlow() bool {
295300
return s.IsEvictedAsSlowTrend() || s.rawStats.GetSlowScore() >= slowStoreThreshold
296301
}
297302

303+
// IsStopping checks if the store is in stopping state.
304+
func (s *StoreInfo) IsStopping() bool {
305+
s.mu.RLock()
306+
defer s.mu.RUnlock()
307+
return s.rawStats.GetIsStopping()
308+
}
309+
298310
// GetSlowTrend returns the slow trend information of the store.
299311
func (s *StoreInfo) GetSlowTrend() *pdpb.SlowTrend {
300312
s.mu.RLock()
@@ -946,6 +958,19 @@ func (s *StoresInfo) SlowStoreEvicted(storeID uint64) error {
946958
return nil
947959
}
948960

961+
// StoppingStoreEvicted marks a store as a stopping store and prevents transferring
962+
// leader to the store
963+
func (s *StoresInfo) StoppingStoreEvicted(storeID uint64) error {
964+
s.Lock()
965+
defer s.Unlock()
966+
store, ok := s.stores[storeID]
967+
if !ok {
968+
return errs.ErrStoreNotFound.FastGenByArgs(storeID)
969+
}
970+
s.stores[storeID] = store.Clone(StoppingStoreEvicted())
971+
return nil
972+
}
973+
949974
// SlowStoreRecovered cleans the evicted state of a store.
950975
func (s *StoresInfo) SlowStoreRecovered(storeID uint64) {
951976
s.Lock()
@@ -959,6 +984,19 @@ func (s *StoresInfo) SlowStoreRecovered(storeID uint64) {
959984
s.stores[storeID] = store.Clone(SlowStoreRecovered())
960985
}
961986

987+
// StoppingStoreRecovered cleans the evicted state of a store.
988+
func (s *StoresInfo) StoppingStoreRecovered(storeID uint64) {
989+
s.Lock()
990+
defer s.Unlock()
991+
store, ok := s.stores[storeID]
992+
if !ok {
993+
log.Warn("try to clean a store's evicted as a stopping store state, but it is not found. It may be cleanup",
994+
zap.Uint64("store-id", storeID))
995+
return
996+
}
997+
s.stores[storeID] = store.Clone(StoppingStoreRecovered())
998+
}
999+
9621000
// SlowTrendEvicted marks a store as a slow trend and prevents transferring
9631001
// leader to the store
9641002
func (s *StoresInfo) SlowTrendEvicted(storeID uint64) error {

pkg/core/store_option.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,14 @@ func SlowStoreEvicted() StoreCreateOption {
142142
}
143143
}
144144

145+
// StoppingStoreEvicted marks a store as a stopping store and prevents transferring
146+
// leader to the store
147+
func StoppingStoreEvicted() StoreCreateOption {
148+
return func(store *StoreInfo) {
149+
store.rawStats.IsStopping = true
150+
}
151+
}
152+
145153
// SlowTrendEvicted marks a store as a slow store by trend and prevents transferring
146154
// leader to the store
147155
func SlowTrendEvicted() StoreCreateOption {
@@ -164,6 +172,13 @@ func SlowStoreRecovered() StoreCreateOption {
164172
}
165173
}
166174

175+
// StoppingStoreRecovered cleans the evicted state of a store.
176+
func StoppingStoreRecovered() StoreCreateOption {
177+
return func(store *StoreInfo) {
178+
store.rawStats.IsStopping = false
179+
}
180+
}
181+
167182
// SetLeaderCount sets the leader count for the store.
168183
func SetLeaderCount(leaderCount int) StoreCreateOption {
169184
return func(store *StoreInfo) {

pkg/schedule/config/config.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -590,6 +590,7 @@ var defaultSchedulersInit = func() SchedulerConfigs {
590590
}
591591
if !kerneltype.IsNextGen() {
592592
defaultSchedulers = append(defaultSchedulers, SchedulerConfig{Type: types.SchedulerTypeCompatibleMap[types.EvictSlowStoreScheduler]})
593+
defaultSchedulers = append(defaultSchedulers, SchedulerConfig{Type: types.SchedulerTypeCompatibleMap[types.EvictStoppingStoreScheduler]})
593594
}
594595
return defaultSchedulers
595596
}

pkg/schedule/filter/counter.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ const (
5959
storeStateOffline
6060
storeStatePauseLeader
6161
storeStateSlow
62+
storeStateStopping
6263
storeStateDisconnected
6364
storeStateBusy
6465
storeStateExceedRemoveLimit
@@ -88,6 +89,7 @@ var filters = [filtersLen]string{
8889
"store-state-offline-filter",
8990
"store-state-pause-leader-filter",
9091
"store-state-slow-filter",
92+
"store-state-stopping-filter",
9193
"store-state-disconnect-filter",
9294
"store-state-busy-filter",
9395
"store-state-exceed-remove-limit-filter",

pkg/schedule/filter/filters.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,15 @@ func (f *StoreStateFilter) slowStoreEvicted(_ config.SharedConfigProvider, store
390390
return statusOK
391391
}
392392

393+
func (f *StoreStateFilter) stoppingStoreEvicted(_ config.SharedConfigProvider, store *core.StoreInfo) *plan.Status {
394+
if store.EvictedAsStoppingStore() {
395+
f.Reason = storeStateStopping
396+
return statusStoreRejectLeader
397+
}
398+
f.Reason = storeStateOK
399+
return statusOK
400+
}
401+
393402
func (f *StoreStateFilter) slowTrendEvicted(_ config.SharedConfigProvider, store *core.StoreInfo) *plan.Status {
394403
if store.IsEvictedAsSlowTrend() {
395404
f.Reason = storeStateSlowTrend
@@ -500,7 +509,7 @@ func (f *StoreStateFilter) anyConditionMatch(typ int, conf config.SharedConfigPr
500509
funcs = []conditionFunc{f.isBusy}
501510
case leaderTarget:
502511
funcs = []conditionFunc{f.isRemoved, f.isRemoving, f.isDown, f.pauseLeaderTransferIn,
503-
f.slowStoreEvicted, f.slowTrendEvicted, f.isDisconnected, f.isBusy, f.hasRejectLeaderProperty}
512+
f.slowStoreEvicted, f.stoppingStoreEvicted, f.slowTrendEvicted, f.isDisconnected, f.isBusy, f.hasRejectLeaderProperty}
504513
case regionTarget:
505514
funcs = []conditionFunc{f.isRemoved, f.isRemoving, f.isDown, f.isDisconnected, f.isBusy,
506515
f.exceedAddLimit, f.tooManySnapshots, f.tooManyPendingPeers}

0 commit comments

Comments
 (0)