Skip to content

Commit ba782a5

Browse files
Connor1996 and ti-chi-bot
authored and committed
online recovery: fix online recovery timeout mechanism (tikv#6108)
close tikv#6107

fix online recovery timeout mechanism

Signed-off-by: Connor1996 <zbk602423539@gmail.com>
Co-authored-by: Ti Chi Robot <ti-community-prow-bot@tidb.io>
Signed-off-by: Yang Zhang <yang.zhang@pingcap.com>
1 parent d75733f commit ba782a5

File tree

2 files changed

+167
-117
lines changed

2 files changed

+167
-117
lines changed

server/cluster/unsafe_recovery_controller.go

Lines changed: 128 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -160,17 +160,22 @@ func (u *unsafeRecoveryController) reset() {
160160
func (u *unsafeRecoveryController) IsRunning() bool {
161161
u.RLock()
162162
defer u.RUnlock()
163+
return u.isRunningLocked()
164+
}
165+
166+
func (u *unsafeRecoveryController) isRunningLocked() bool {
163167
return u.stage != idle && u.stage != finished && u.stage != failed
164168
}
165169

166170
// RemoveFailedStores removes failed stores from the cluster.
167171
func (u *unsafeRecoveryController) RemoveFailedStores(failedStores map[uint64]struct{}, timeout uint64) error {
168-
if u.IsRunning() {
169-
return errs.ErrUnsafeRecoveryIsRunning.FastGenByArgs()
170-
}
171172
u.Lock()
172173
defer u.Unlock()
173174

175+
if u.isRunningLocked() {
176+
return errs.ErrUnsafeRecoveryIsRunning.FastGenByArgs()
177+
}
178+
174179
if len(failedStores) == 0 {
175180
return errs.ErrUnsafeRecoveryInvalidInput.FastGenByArgs("no store specified")
176181
}
@@ -216,7 +221,9 @@ func (u *unsafeRecoveryController) Show() []StageOutput {
216221
if u.stage == idle {
217222
return []StageOutput{{Info: "No on-going recovery."}}
218223
}
219-
u.checkTimeout()
224+
if err := u.checkTimeout(); err != nil {
225+
u.HandleErr(err)
226+
}
220227
status := u.output
221228
if u.stage != finished && u.stage != failed {
222229
status = append(status, u.getReportStatus())
@@ -251,17 +258,15 @@ func (u *unsafeRecoveryController) getReportStatus() StageOutput {
251258
return status
252259
}
253260

254-
func (u *unsafeRecoveryController) checkTimeout() bool {
261+
func (u *unsafeRecoveryController) checkTimeout() error {
255262
if u.stage == finished || u.stage == failed {
256-
return false
263+
return nil
257264
}
258265

259266
if time.Now().After(u.timeout) {
260-
ret := u.HandleErr(errors.Errorf("Exceeds timeout %v", u.timeout))
261-
u.timeout = time.Now().Add(storeRequestInterval * 2)
262-
return ret
267+
return errors.Errorf("Exceeds timeout %v", u.timeout)
263268
}
264-
return false
269+
return nil
265270
}
266271

267272
func (u *unsafeRecoveryController) HandleErr(err error) bool {
@@ -270,128 +275,139 @@ func (u *unsafeRecoveryController) HandleErr(err error) bool {
270275
u.err = err
271276
}
272277
if u.stage == exitForceLeader {
278+
// We already tried to exit force leader, and it still failed.
279+
// We turn into failed stage directly. TiKV will step down force leader
280+
// automatically after being for a long time.
273281
u.changeStage(failed)
274282
return true
275283
}
284+
// When encountering an error for the first time, we will try to exit force
285+
// leader before turning into failed stage to avoid the leaking force leaders
286+
// blocks reads and writes.
287+
u.storePlanExpires = make(map[uint64]time.Time)
288+
u.storeRecoveryPlans = make(map[uint64]*pdpb.RecoveryPlan)
289+
u.timeout = time.Now().Add(storeRequestInterval * 2)
290+
// empty recovery plan would trigger exit force leader
291+
u.changeStage(exitForceLeader)
276292
return false
277293
}
278294

279295
// HandleStoreHeartbeat handles the store heartbeat requests and checks whether the stores need to
280296
// send detailed report back.
281297
func (u *unsafeRecoveryController) HandleStoreHeartbeat(heartbeat *pdpb.StoreHeartbeatRequest, resp *pdpb.StoreHeartbeatResponse) {
282-
if !u.IsRunning() {
283-
// no recovery in progress, do nothing
284-
return
285-
}
286298
u.Lock()
287299
defer u.Unlock()
288300

289-
if u.checkTimeout() {
301+
if !u.isRunningLocked() {
302+
// no recovery in progress, do nothing
290303
return
291304
}
292305

293-
allCollected := u.collectReport(heartbeat)
306+
done, err := func() (bool, error) {
307+
if err := u.checkTimeout(); err != nil {
308+
return false, err
309+
}
294310

295-
if allCollected {
296-
newestRegionTree, peersMap, buildErr := u.buildUpFromReports()
297-
if buildErr != nil && u.HandleErr(buildErr) {
298-
return
311+
allCollected, err := u.collectReport(heartbeat)
312+
if err != nil {
313+
return false, err
299314
}
300315

301-
// clean up previous plan
302-
u.storePlanExpires = make(map[uint64]time.Time)
303-
u.storeRecoveryPlans = make(map[uint64]*pdpb.RecoveryPlan)
316+
if allCollected {
317+
newestRegionTree, peersMap, err := u.buildUpFromReports()
318+
if err != nil {
319+
return false, err
320+
}
304321

305-
var stage unsafeRecoveryStage
306-
if u.err == nil {
307-
stage = u.stage
308-
} else {
309-
stage = exitForceLeader
322+
return u.generatePlan(newestRegionTree, peersMap)
310323
}
311-
reCheck := false
312-
hasPlan := false
313-
var err error
314-
for {
315-
switch stage {
316-
case collectReport:
317-
fallthrough
318-
case tombstoneTiFlashLearner:
319-
if hasPlan, err = u.generateTombstoneTiFlashLearnerPlan(newestRegionTree, peersMap); hasPlan && err == nil {
320-
u.changeStage(tombstoneTiFlashLearner)
321-
break
322-
}
323-
if err != nil {
324-
break
325-
}
326-
fallthrough
327-
case forceLeaderForCommitMerge:
328-
if hasPlan, err = u.generateForceLeaderPlan(newestRegionTree, peersMap, true); hasPlan && err == nil {
329-
u.changeStage(forceLeaderForCommitMerge)
330-
break
331-
}
332-
if err != nil {
333-
break
334-
}
335-
fallthrough
336-
case forceLeader:
337-
if hasPlan, err = u.generateForceLeaderPlan(newestRegionTree, peersMap, false); hasPlan && err == nil {
338-
u.changeStage(forceLeader)
339-
break
340-
}
341-
if err != nil {
342-
break
343-
}
344-
fallthrough
345-
case demoteFailedVoter:
346-
if hasPlan = u.generateDemoteFailedVoterPlan(newestRegionTree, peersMap); hasPlan {
347-
u.changeStage(demoteFailedVoter)
348-
break
349-
} else if !reCheck {
350-
reCheck = true
351-
stage = tombstoneTiFlashLearner
352-
continue
353-
}
354-
fallthrough
355-
case createEmptyRegion:
356-
if hasPlan, err = u.generateCreateEmptyRegionPlan(newestRegionTree, peersMap); hasPlan && err == nil {
357-
u.changeStage(createEmptyRegion)
358-
break
359-
}
360-
if err != nil {
361-
break
362-
}
363-
fallthrough
364-
case exitForceLeader:
365-
// no need to generate plan, empty recovery plan triggers exit force leader on TiKV side
366-
if hasPlan = u.generateExitForceLeaderPlan(); hasPlan {
367-
u.changeStage(exitForceLeader)
368-
}
369-
default:
370-
panic("unreachable")
371-
}
324+
return false, nil
325+
}()
372326

327+
if done || (err != nil && u.HandleErr(err)) {
328+
return
329+
}
330+
u.dispatchPlan(heartbeat, resp)
331+
}
332+
333+
func (u *unsafeRecoveryController) generatePlan(newestRegionTree *regionTree, peersMap map[uint64][]*regionItem) (bool, error) {
334+
// clean up previous plan
335+
u.storePlanExpires = make(map[uint64]time.Time)
336+
u.storeRecoveryPlans = make(map[uint64]*pdpb.RecoveryPlan)
337+
338+
stage := u.stage
339+
reCheck := false
340+
hasPlan := false
341+
var err error
342+
for {
343+
switch stage {
344+
case collectReport:
345+
fallthrough
346+
case tombstoneTiFlashLearner:
347+
if hasPlan, err = u.generateTombstoneTiFlashLearnerPlan(newestRegionTree, peersMap); hasPlan && err == nil {
348+
u.changeStage(tombstoneTiFlashLearner)
349+
break
350+
}
373351
if err != nil {
374-
if u.HandleErr(err) {
375-
return
376-
}
377-
u.storePlanExpires = make(map[uint64]time.Time)
378-
u.storeRecoveryPlans = make(map[uint64]*pdpb.RecoveryPlan)
379-
// Clear the reports etc.
352+
break
353+
}
354+
fallthrough
355+
case forceLeaderForCommitMerge:
356+
if hasPlan, err = u.generateForceLeaderPlan(newestRegionTree, peersMap, true); hasPlan && err == nil {
357+
u.changeStage(forceLeaderForCommitMerge)
358+
break
359+
}
360+
if err != nil {
361+
break
362+
}
363+
fallthrough
364+
case forceLeader:
365+
if hasPlan, err = u.generateForceLeaderPlan(newestRegionTree, peersMap, false); hasPlan && err == nil {
366+
u.changeStage(forceLeader)
367+
break
368+
}
369+
if err != nil {
370+
break
371+
}
372+
fallthrough
373+
case demoteFailedVoter:
374+
if hasPlan = u.generateDemoteFailedVoterPlan(newestRegionTree, peersMap); hasPlan {
375+
u.changeStage(demoteFailedVoter)
376+
break
377+
} else if !reCheck {
378+
reCheck = true
379+
stage = tombstoneTiFlashLearner
380+
continue
381+
}
382+
fallthrough
383+
case createEmptyRegion:
384+
if hasPlan, err = u.generateCreateEmptyRegionPlan(newestRegionTree, peersMap); hasPlan && err == nil {
385+
u.changeStage(createEmptyRegion)
386+
break
387+
}
388+
if err != nil {
389+
break
390+
}
391+
fallthrough
392+
case exitForceLeader:
393+
if hasPlan = u.generateExitForceLeaderPlan(); hasPlan {
380394
u.changeStage(exitForceLeader)
381-
return
382-
} else if !hasPlan {
383-
if u.err != nil {
384-
u.changeStage(failed)
385-
} else {
386-
u.changeStage(finished)
387-
}
388-
return
389395
}
390-
break
396+
default:
397+
panic("unreachable")
391398
}
399+
break
392400
}
393401

394-
u.dispatchPlan(heartbeat, resp)
402+
if err == nil && !hasPlan {
403+
if u.err != nil {
404+
u.changeStage(failed)
405+
} else {
406+
u.changeStage(finished)
407+
}
408+
return true, nil
409+
}
410+
return false, err
395411
}
396412

397413
// It dispatches recovery plan if any.
@@ -417,22 +433,21 @@ func (u *unsafeRecoveryController) dispatchPlan(heartbeat *pdpb.StoreHeartbeatRe
417433
}
418434

419435
// It collects and checks if store reports have been fully collected.
420-
func (u *unsafeRecoveryController) collectReport(heartbeat *pdpb.StoreHeartbeatRequest) bool {
436+
func (u *unsafeRecoveryController) collectReport(heartbeat *pdpb.StoreHeartbeatRequest) (bool, error) {
421437
storeID := heartbeat.Stats.StoreId
422438
if _, isFailedStore := u.failedStores[storeID]; isFailedStore {
423-
u.HandleErr(errors.Errorf("Receive heartbeat from failed store %d", storeID))
424-
return false
439+
return false, errors.Errorf("Receive heartbeat from failed store %d", storeID)
425440
}
426441

427442
if heartbeat.StoreReport == nil {
428-
return false
443+
return false, nil
429444
}
430445

431446
if heartbeat.StoreReport.GetStep() != u.step {
432447
log.Info("Unsafe recovery receives invalid store report",
433448
zap.Uint64("store-id", storeID), zap.Uint64("expected-step", u.step), zap.Uint64("obtained-step", heartbeat.StoreReport.GetStep()))
434449
// invalid store report, ignore
435-
return false
450+
return false, nil
436451
}
437452

438453
if report, exists := u.storeReports[storeID]; exists {
@@ -441,11 +456,11 @@ func (u *unsafeRecoveryController) collectReport(heartbeat *pdpb.StoreHeartbeatR
441456
if report == nil {
442457
u.numStoresReported++
443458
if u.numStoresReported == len(u.storeReports) {
444-
return true
459+
return true, nil
445460
}
446461
}
447462
}
448-
return false
463+
return false, nil
449464
}
450465

451466
// Gets the stage of the current unsafe recovery.
@@ -1172,6 +1187,7 @@ func (u *unsafeRecoveryController) generateExitForceLeaderPlan() bool {
11721187
for storeID, storeReport := range u.storeReports {
11731188
for _, peerReport := range storeReport.PeerReports {
11741189
if peerReport.IsForceLeader {
1190+
// empty recovery plan triggers exit force leader on TiKV side
11751191
_ = u.getRecoveryPlan(storeID)
11761192
hasPlan = true
11771193
break

0 commit comments

Comments (0)