Skip to content

Commit 0adb86f

Browse files
scheduler: allow the evict-slow-trend scheduler's recovery duration to be modified manually. (#7132)
ref #7156, ref tikv/tikv#15271 With this PR, users can manually modify the minimal recovery time when encountering an I/O jitter case. That is, only when the jitter has disappeared and the recovery time has reached this limit can the given slow node be marked as normal again, allowing leaders to be balanced onto it. Signed-off-by: lucasliang <nkcs_lykx@hotmail.com>
1 parent 779b5be commit 0adb86f

File tree

5 files changed

+118
-20
lines changed

5 files changed

+118
-20
lines changed

pkg/schedule/schedulers/evict_slow_trend.go

Lines changed: 81 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,12 @@
1515
package schedulers
1616

1717
import (
18+
"net/http"
1819
"strconv"
20+
"sync/atomic"
1921
"time"
2022

23+
"github.com/gorilla/mux"
2124
"github.com/pingcap/errors"
2225
"github.com/pingcap/failpoint"
2326
"github.com/pingcap/log"
@@ -26,6 +29,8 @@ import (
2629
"github.com/tikv/pd/pkg/schedule/operator"
2730
"github.com/tikv/pd/pkg/schedule/plan"
2831
"github.com/tikv/pd/pkg/storage/endpoint"
32+
"github.com/tikv/pd/pkg/utils/apiutil"
33+
"github.com/unrolled/render"
2934
"go.uber.org/zap"
3035
)
3136

@@ -54,11 +59,28 @@ type evictSlowTrendSchedulerConfig struct {
5459
evictCandidate slowCandidate
5560
// Last chosen candidate for eviction.
5661
lastEvictCandidate slowCandidate
57-
62+
// Duration gap for recovering the candidate, unit: s.
63+
RecoveryDurationGap uint64 `json:"recovery-duration"`
5864
// Only evict one store for now
5965
EvictedStores []uint64 `json:"evict-by-trend-stores"`
6066
}
6167

68+
func initEvictSlowTrendSchedulerConfig(storage endpoint.ConfigStorage) *evictSlowTrendSchedulerConfig {
69+
return &evictSlowTrendSchedulerConfig{
70+
storage: storage,
71+
evictCandidate: slowCandidate{},
72+
lastEvictCandidate: slowCandidate{},
73+
RecoveryDurationGap: defaultRecoveryDurationGap,
74+
EvictedStores: make([]uint64, 0),
75+
}
76+
}
77+
78+
func (conf *evictSlowTrendSchedulerConfig) Clone() *evictSlowTrendSchedulerConfig {
79+
return &evictSlowTrendSchedulerConfig{
80+
RecoveryDurationGap: atomic.LoadUint64(&conf.RecoveryDurationGap),
81+
}
82+
}
83+
6284
func (conf *evictSlowTrendSchedulerConfig) Persist() error {
6385
name := conf.getSchedulerName()
6486
data, err := EncodeConfig(conf)
@@ -116,6 +138,15 @@ func (conf *evictSlowTrendSchedulerConfig) lastCandidateCapturedSecs() uint64 {
116138
return DurationSinceAsSecs(conf.lastEvictCandidate.captureTS)
117139
}
118140

141+
// readyForRecovery checks whether the last cpatured candidate is ready for recovery.
142+
func (conf *evictSlowTrendSchedulerConfig) readyForRecovery() bool {
143+
recoveryDurationGap := atomic.LoadUint64(&conf.RecoveryDurationGap)
144+
failpoint.Inject("transientRecoveryGap", func() {
145+
recoveryDurationGap = 0
146+
})
147+
return conf.lastCandidateCapturedSecs() >= recoveryDurationGap
148+
}
149+
119150
func (conf *evictSlowTrendSchedulerConfig) captureCandidate(id uint64) {
120151
conf.evictCandidate = slowCandidate{
121152
storeID: id,
@@ -162,9 +193,52 @@ func (conf *evictSlowTrendSchedulerConfig) clearAndPersist(cluster sche.Schedule
162193
return oldID, conf.Persist()
163194
}
164195

196+
type evictSlowTrendHandler struct {
197+
rd *render.Render
198+
config *evictSlowTrendSchedulerConfig
199+
}
200+
201+
func newEvictSlowTrendHandler(config *evictSlowTrendSchedulerConfig) http.Handler {
202+
h := &evictSlowTrendHandler{
203+
config: config,
204+
rd: render.New(render.Options{IndentJSON: true}),
205+
}
206+
router := mux.NewRouter()
207+
router.HandleFunc("/config", h.UpdateConfig).Methods(http.MethodPost)
208+
router.HandleFunc("/list", h.ListConfig).Methods(http.MethodGet)
209+
return router
210+
}
211+
212+
func (handler *evictSlowTrendHandler) UpdateConfig(w http.ResponseWriter, r *http.Request) {
213+
var input map[string]interface{}
214+
if err := apiutil.ReadJSONRespondError(handler.rd, w, r.Body, &input); err != nil {
215+
return
216+
}
217+
recoveryDurationGapFloat, ok := input["recovery-duration"].(float64)
218+
if !ok {
219+
handler.rd.JSON(w, http.StatusInternalServerError, errors.New("invalid argument for 'recovery-duration'").Error())
220+
return
221+
}
222+
recoveryDurationGap := (uint64)(recoveryDurationGapFloat)
223+
prevRecoveryDurationGap := atomic.LoadUint64(&handler.config.RecoveryDurationGap)
224+
atomic.StoreUint64(&handler.config.RecoveryDurationGap, recoveryDurationGap)
225+
log.Info("evict-slow-trend-scheduler update 'recovery-duration' - unit: s", zap.Uint64("prev", prevRecoveryDurationGap), zap.Uint64("cur", recoveryDurationGap))
226+
handler.rd.JSON(w, http.StatusOK, nil)
227+
}
228+
229+
func (handler *evictSlowTrendHandler) ListConfig(w http.ResponseWriter, r *http.Request) {
230+
conf := handler.config.Clone()
231+
handler.rd.JSON(w, http.StatusOK, conf)
232+
}
233+
165234
type evictSlowTrendScheduler struct {
166235
*BaseScheduler
167-
conf *evictSlowTrendSchedulerConfig
236+
conf *evictSlowTrendSchedulerConfig
237+
handler http.Handler
238+
}
239+
240+
func (s *evictSlowTrendScheduler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
241+
s.handler.ServeHTTP(w, r)
168242
}
169243

170244
func (s *evictSlowTrendScheduler) GetName() string {
@@ -244,7 +318,7 @@ func (s *evictSlowTrendScheduler) Schedule(cluster sche.SchedulerCluster, dryRun
244318
// slow node next time.
245319
log.Info("store evicted by slow trend has been removed", zap.Uint64("store-id", store.GetID()))
246320
storeSlowTrendActionStatusGauge.WithLabelValues("evict", "stop_removed").Inc()
247-
} else if checkStoreCanRecover(cluster, store, s.conf.lastCandidateCapturedSecs()) {
321+
} else if checkStoreCanRecover(cluster, store) && s.conf.readyForRecovery() {
248322
log.Info("store evicted by slow trend has been recovered", zap.Uint64("store-id", store.GetID()))
249323
storeSlowTrendActionStatusGauge.WithLabelValues("evict", "stop_recovered").Inc()
250324
} else {
@@ -301,9 +375,11 @@ func (s *evictSlowTrendScheduler) Schedule(cluster sche.SchedulerCluster, dryRun
301375
}
302376

303377
func newEvictSlowTrendScheduler(opController *operator.Controller, conf *evictSlowTrendSchedulerConfig) Scheduler {
378+
handler := newEvictSlowTrendHandler(conf)
304379
return &evictSlowTrendScheduler{
305380
BaseScheduler: NewBaseScheduler(opController),
306381
conf: conf,
382+
handler: handler,
307383
}
308384
}
309385

@@ -453,7 +529,7 @@ func checkStoreSlowerThanOthers(cluster sche.SchedulerCluster, target *core.Stor
453529
return slowerThanStoresNum >= expected
454530
}
455531

456-
func checkStoreCanRecover(cluster sche.SchedulerCluster, target *core.StoreInfo, recoveryGap uint64) bool {
532+
func checkStoreCanRecover(cluster sche.SchedulerCluster, target *core.StoreInfo) bool {
457533
/*
458534
//
459535
// This might not be necessary,
@@ -473,7 +549,7 @@ func checkStoreCanRecover(cluster sche.SchedulerCluster, target *core.StoreInfo,
473549
storeSlowTrendActionStatusGauge.WithLabelValues("recover.judging:got-event").Inc()
474550
}
475551
*/
476-
return checkStoreFasterThanOthers(cluster, target) && checkStoreReadyForRecover(target, recoveryGap)
552+
return checkStoreFasterThanOthers(cluster, target)
477553
}
478554

479555
func checkStoreFasterThanOthers(cluster sche.SchedulerCluster, target *core.StoreInfo) bool {
@@ -507,19 +583,6 @@ func checkStoreFasterThanOthers(cluster sche.SchedulerCluster, target *core.Stor
507583
return fasterThanStores >= expected
508584
}
509585

510-
// checkStoreReadyForRecover checks whether the given target store is ready for recover.
511-
func checkStoreReadyForRecover(target *core.StoreInfo, recoveryGap uint64) bool {
512-
durationGap := uint64(defaultRecoveryDurationGap)
513-
failpoint.Inject("transientRecoveryGap", func() {
514-
durationGap = 0
515-
})
516-
if targetSlowTrend := target.GetSlowTrend(); targetSlowTrend != nil {
517-
// TODO: setting the recovery time in SlowTrend
518-
return recoveryGap >= durationGap
519-
}
520-
return true
521-
}
522-
523586
// DurationSinceAsSecs returns the duration gap since the given startTS, unit: s.
524587
func DurationSinceAsSecs(startTS time.Time) uint64 {
525588
return uint64(time.Since(startTS).Seconds())

pkg/schedule/schedulers/evict_slow_trend_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ func (suite *evictSlowTrendTestSuite) TestEvictSlowTrendBasicFuncs() {
9393
suite.Equal(*lastCapturedCandidate, es2.conf.evictCandidate)
9494
suite.Equal(es2.conf.candidateCapturedSecs(), uint64(0))
9595
suite.Equal(es2.conf.lastCandidateCapturedSecs(), uint64(0))
96-
suite.False(checkStoreReadyForRecover(store, es2.conf.lastCandidateCapturedSecs()))
96+
suite.False(es2.conf.readyForRecovery())
9797
recoverTS := lastCapturedCandidate.recoverTS
9898
suite.True(recoverTS.After(lastCapturedCandidate.captureTS))
9999
// Pop captured store 1 and mark it has recovered.

pkg/schedule/schedulers/init.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -466,7 +466,7 @@ func schedulersRegister() {
466466
})
467467

468468
RegisterScheduler(EvictSlowTrendType, func(opController *operator.Controller, storage endpoint.ConfigStorage, decoder ConfigDecoder, removeSchedulerCb ...func(string) error) (Scheduler, error) {
469-
conf := &evictSlowTrendSchedulerConfig{storage: storage, EvictedStores: make([]uint64, 0), evictCandidate: slowCandidate{}, lastEvictCandidate: slowCandidate{}}
469+
conf := initEvictSlowTrendSchedulerConfig(storage)
470470
if err := decoder(conf); err != nil {
471471
return nil, err
472472
}

tests/pdctl/scheduler/scheduler_test.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,21 @@ func (suite *schedulerTestSuite) checkScheduler(cluster *tests.TestCluster) {
407407
echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", "balance-leader-scheduler"}, nil)
408408
re.Contains(echo, "Success!")
409409

410+
// test evict-slow-trend scheduler config
411+
echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", "evict-slow-trend-scheduler"}, nil)
412+
re.Contains(echo, "Success!")
413+
echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "show"}, nil)
414+
re.Contains(echo, "evict-slow-trend-scheduler")
415+
echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "evict-slow-trend-scheduler", "set", "recovery-duration", "100"}, nil)
416+
re.Contains(echo, "Success!")
417+
conf = make(map[string]interface{})
418+
mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "evict-slow-trend-scheduler", "show"}, &conf)
419+
re.Equal(100., conf["recovery-duration"])
420+
echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "remove", "evict-slow-trend-scheduler"}, nil)
421+
re.Contains(echo, "Success!")
422+
echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "show"}, nil)
423+
re.NotContains(echo, "evict-slow-trend-scheduler")
424+
410425
// test show scheduler with paused and disabled status.
411426
checkSchedulerWithStatusCommand := func(status string, expected []string) {
412427
var schedulers []string

tools/pd-ctl/pdctl/command/scheduler.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -499,6 +499,7 @@ func NewConfigSchedulerCommand() *cobra.Command {
499499
newConfigGrantHotRegionCommand(),
500500
newConfigBalanceLeaderCommand(),
501501
newSplitBucketCommand(),
502+
newConfigEvictSlowTrendCommand(),
502503
)
503504
return c
504505
}
@@ -775,6 +776,25 @@ func setShuffleRegionSchedulerRolesCommandFunc(cmd *cobra.Command, args []string
775776
cmd.Println("Success!")
776777
}
777778

779+
func newConfigEvictSlowTrendCommand() *cobra.Command {
780+
c := &cobra.Command{
781+
Use: "evict-slow-trend-scheduler",
782+
Short: "evict-slow-trend-scheduler config",
783+
Run: listSchedulerConfigCommandFunc,
784+
}
785+
786+
c.AddCommand(&cobra.Command{
787+
Use: "show",
788+
Short: "list the config item",
789+
Run: listSchedulerConfigCommandFunc,
790+
}, &cobra.Command{
791+
Use: "set <key> <value>",
792+
Short: "set the config item",
793+
Run: func(cmd *cobra.Command, args []string) { postSchedulerConfigCommandFunc(cmd, c.Name(), args) },
794+
})
795+
return c
796+
}
797+
778798
// NewDescribeSchedulerCommand returns command to describe the scheduler.
779799
func NewDescribeSchedulerCommand() *cobra.Command {
780800
c := &cobra.Command{

0 commit comments

Comments
 (0)