@@ -160,17 +160,22 @@ func (u *unsafeRecoveryController) reset() {
160160func (u * unsafeRecoveryController ) IsRunning () bool {
161161 u .RLock ()
162162 defer u .RUnlock ()
163+ return u .isRunningLocked ()
164+ }
165+
166+ func (u * unsafeRecoveryController ) isRunningLocked () bool {
163167 return u .stage != idle && u .stage != finished && u .stage != failed
164168}
165169
166170// RemoveFailedStores removes failed stores from the cluster.
167171func (u * unsafeRecoveryController ) RemoveFailedStores (failedStores map [uint64 ]struct {}, timeout uint64 ) error {
168- if u .IsRunning () {
169- return errs .ErrUnsafeRecoveryIsRunning .FastGenByArgs ()
170- }
171172 u .Lock ()
172173 defer u .Unlock ()
173174
175+ if u .isRunningLocked () {
176+ return errs .ErrUnsafeRecoveryIsRunning .FastGenByArgs ()
177+ }
178+
174179 if len (failedStores ) == 0 {
175180 return errs .ErrUnsafeRecoveryInvalidInput .FastGenByArgs ("no store specified" )
176181 }
@@ -216,7 +221,9 @@ func (u *unsafeRecoveryController) Show() []StageOutput {
216221 if u .stage == idle {
217222 return []StageOutput {{Info : "No on-going recovery." }}
218223 }
219- u .checkTimeout ()
224+ if err := u .checkTimeout (); err != nil {
225+ u .HandleErr (err )
226+ }
220227 status := u .output
221228 if u .stage != finished && u .stage != failed {
222229 status = append (status , u .getReportStatus ())
@@ -251,17 +258,15 @@ func (u *unsafeRecoveryController) getReportStatus() StageOutput {
251258 return status
252259}
253260
254- func (u * unsafeRecoveryController ) checkTimeout () bool {
261+ func (u * unsafeRecoveryController ) checkTimeout () error {
255262 if u .stage == finished || u .stage == failed {
256- return false
263+ return nil
257264 }
258265
259266 if time .Now ().After (u .timeout ) {
260- ret := u .HandleErr (errors .Errorf ("Exceeds timeout %v" , u .timeout ))
261- u .timeout = time .Now ().Add (storeRequestInterval * 2 )
262- return ret
267+ return errors .Errorf ("Exceeds timeout %v" , u .timeout )
263268 }
264- return false
269+ return nil
265270}
266271
267272func (u * unsafeRecoveryController ) HandleErr (err error ) bool {
@@ -270,128 +275,139 @@ func (u *unsafeRecoveryController) HandleErr(err error) bool {
270275 u .err = err
271276 }
272277 if u .stage == exitForceLeader {
278+ // We already tried to exit force leader, and it still failed.
279+ // We turn into failed stage directly. TiKV will step down force leader
280+ // automatically after being for a long time.
273281 u .changeStage (failed )
274282 return true
275283 }
284+ // When encountering an error for the first time, we will try to exit force
285+ // leader before turning into failed stage to avoid the leaking force leaders
286+ // blocks reads and writes.
287+ u .storePlanExpires = make (map [uint64 ]time.Time )
288+ u .storeRecoveryPlans = make (map [uint64 ]* pdpb.RecoveryPlan )
289+ u .timeout = time .Now ().Add (storeRequestInterval * 2 )
290+ // empty recovery plan would trigger exit force leader
291+ u .changeStage (exitForceLeader )
276292 return false
277293}
278294
279295// HandleStoreHeartbeat handles the store heartbeat requests and checks whether the stores need to
280296// send detailed report back.
281297func (u * unsafeRecoveryController ) HandleStoreHeartbeat (heartbeat * pdpb.StoreHeartbeatRequest , resp * pdpb.StoreHeartbeatResponse ) {
282- if ! u .IsRunning () {
283- // no recovery in progress, do nothing
284- return
285- }
286298 u .Lock ()
287299 defer u .Unlock ()
288300
289- if u .checkTimeout () {
301+ if ! u .isRunningLocked () {
302+ // no recovery in progress, do nothing
290303 return
291304 }
292305
293- allCollected := u .collectReport (heartbeat )
306+ done , err := func () (bool , error ) {
307+ if err := u .checkTimeout (); err != nil {
308+ return false , err
309+ }
294310
295- if allCollected {
296- newestRegionTree , peersMap , buildErr := u .buildUpFromReports ()
297- if buildErr != nil && u .HandleErr (buildErr ) {
298- return
311+ allCollected , err := u .collectReport (heartbeat )
312+ if err != nil {
313+ return false , err
299314 }
300315
301- // clean up previous plan
302- u .storePlanExpires = make (map [uint64 ]time.Time )
303- u .storeRecoveryPlans = make (map [uint64 ]* pdpb.RecoveryPlan )
316+ if allCollected {
317+ newestRegionTree , peersMap , err := u .buildUpFromReports ()
318+ if err != nil {
319+ return false , err
320+ }
304321
305- var stage unsafeRecoveryStage
306- if u .err == nil {
307- stage = u .stage
308- } else {
309- stage = exitForceLeader
322+ return u .generatePlan (newestRegionTree , peersMap )
310323 }
311- reCheck := false
312- hasPlan := false
313- var err error
314- for {
315- switch stage {
316- case collectReport :
317- fallthrough
318- case tombstoneTiFlashLearner :
319- if hasPlan , err = u .generateTombstoneTiFlashLearnerPlan (newestRegionTree , peersMap ); hasPlan && err == nil {
320- u .changeStage (tombstoneTiFlashLearner )
321- break
322- }
323- if err != nil {
324- break
325- }
326- fallthrough
327- case forceLeaderForCommitMerge :
328- if hasPlan , err = u .generateForceLeaderPlan (newestRegionTree , peersMap , true ); hasPlan && err == nil {
329- u .changeStage (forceLeaderForCommitMerge )
330- break
331- }
332- if err != nil {
333- break
334- }
335- fallthrough
336- case forceLeader :
337- if hasPlan , err = u .generateForceLeaderPlan (newestRegionTree , peersMap , false ); hasPlan && err == nil {
338- u .changeStage (forceLeader )
339- break
340- }
341- if err != nil {
342- break
343- }
344- fallthrough
345- case demoteFailedVoter :
346- if hasPlan = u .generateDemoteFailedVoterPlan (newestRegionTree , peersMap ); hasPlan {
347- u .changeStage (demoteFailedVoter )
348- break
349- } else if ! reCheck {
350- reCheck = true
351- stage = tombstoneTiFlashLearner
352- continue
353- }
354- fallthrough
355- case createEmptyRegion :
356- if hasPlan , err = u .generateCreateEmptyRegionPlan (newestRegionTree , peersMap ); hasPlan && err == nil {
357- u .changeStage (createEmptyRegion )
358- break
359- }
360- if err != nil {
361- break
362- }
363- fallthrough
364- case exitForceLeader :
365- // no need to generate plan, empty recovery plan triggers exit force leader on TiKV side
366- if hasPlan = u .generateExitForceLeaderPlan (); hasPlan {
367- u .changeStage (exitForceLeader )
368- }
369- default :
370- panic ("unreachable" )
371- }
324+ return false , nil
325+ }()
372326
327+ if done || (err != nil && u .HandleErr (err )) {
328+ return
329+ }
330+ u .dispatchPlan (heartbeat , resp )
331+ }
332+
333+ func (u * unsafeRecoveryController ) generatePlan (newestRegionTree * regionTree , peersMap map [uint64 ][]* regionItem ) (bool , error ) {
334+ // clean up previous plan
335+ u .storePlanExpires = make (map [uint64 ]time.Time )
336+ u .storeRecoveryPlans = make (map [uint64 ]* pdpb.RecoveryPlan )
337+
338+ stage := u .stage
339+ reCheck := false
340+ hasPlan := false
341+ var err error
342+ for {
343+ switch stage {
344+ case collectReport :
345+ fallthrough
346+ case tombstoneTiFlashLearner :
347+ if hasPlan , err = u .generateTombstoneTiFlashLearnerPlan (newestRegionTree , peersMap ); hasPlan && err == nil {
348+ u .changeStage (tombstoneTiFlashLearner )
349+ break
350+ }
373351 if err != nil {
374- if u .HandleErr (err ) {
375- return
376- }
377- u .storePlanExpires = make (map [uint64 ]time.Time )
378- u .storeRecoveryPlans = make (map [uint64 ]* pdpb.RecoveryPlan )
379- // Clear the reports etc.
352+ break
353+ }
354+ fallthrough
355+ case forceLeaderForCommitMerge :
356+ if hasPlan , err = u .generateForceLeaderPlan (newestRegionTree , peersMap , true ); hasPlan && err == nil {
357+ u .changeStage (forceLeaderForCommitMerge )
358+ break
359+ }
360+ if err != nil {
361+ break
362+ }
363+ fallthrough
364+ case forceLeader :
365+ if hasPlan , err = u .generateForceLeaderPlan (newestRegionTree , peersMap , false ); hasPlan && err == nil {
366+ u .changeStage (forceLeader )
367+ break
368+ }
369+ if err != nil {
370+ break
371+ }
372+ fallthrough
373+ case demoteFailedVoter :
374+ if hasPlan = u .generateDemoteFailedVoterPlan (newestRegionTree , peersMap ); hasPlan {
375+ u .changeStage (demoteFailedVoter )
376+ break
377+ } else if ! reCheck {
378+ reCheck = true
379+ stage = tombstoneTiFlashLearner
380+ continue
381+ }
382+ fallthrough
383+ case createEmptyRegion :
384+ if hasPlan , err = u .generateCreateEmptyRegionPlan (newestRegionTree , peersMap ); hasPlan && err == nil {
385+ u .changeStage (createEmptyRegion )
386+ break
387+ }
388+ if err != nil {
389+ break
390+ }
391+ fallthrough
392+ case exitForceLeader :
393+ if hasPlan = u .generateExitForceLeaderPlan (); hasPlan {
380394 u .changeStage (exitForceLeader )
381- return
382- } else if ! hasPlan {
383- if u .err != nil {
384- u .changeStage (failed )
385- } else {
386- u .changeStage (finished )
387- }
388- return
389395 }
390- break
396+ default :
397+ panic ("unreachable" )
391398 }
399+ break
392400 }
393401
394- u .dispatchPlan (heartbeat , resp )
402+ if err == nil && ! hasPlan {
403+ if u .err != nil {
404+ u .changeStage (failed )
405+ } else {
406+ u .changeStage (finished )
407+ }
408+ return true , nil
409+ }
410+ return false , err
395411}
396412
397413// It dispatches recovery plan if any.
@@ -417,22 +433,21 @@ func (u *unsafeRecoveryController) dispatchPlan(heartbeat *pdpb.StoreHeartbeatRe
417433}
418434
419435// It collects and checks if store reports have been fully collected.
420- func (u * unsafeRecoveryController ) collectReport (heartbeat * pdpb.StoreHeartbeatRequest ) bool {
436+ func (u * unsafeRecoveryController ) collectReport (heartbeat * pdpb.StoreHeartbeatRequest ) ( bool , error ) {
421437 storeID := heartbeat .Stats .StoreId
422438 if _ , isFailedStore := u .failedStores [storeID ]; isFailedStore {
423- u .HandleErr (errors .Errorf ("Receive heartbeat from failed store %d" , storeID ))
424- return false
439+ return false , errors .Errorf ("Receive heartbeat from failed store %d" , storeID )
425440 }
426441
427442 if heartbeat .StoreReport == nil {
428- return false
443+ return false , nil
429444 }
430445
431446 if heartbeat .StoreReport .GetStep () != u .step {
432447 log .Info ("Unsafe recovery receives invalid store report" ,
433448 zap .Uint64 ("store-id" , storeID ), zap .Uint64 ("expected-step" , u .step ), zap .Uint64 ("obtained-step" , heartbeat .StoreReport .GetStep ()))
434449 // invalid store report, ignore
435- return false
450+ return false , nil
436451 }
437452
438453 if report , exists := u .storeReports [storeID ]; exists {
@@ -441,11 +456,11 @@ func (u *unsafeRecoveryController) collectReport(heartbeat *pdpb.StoreHeartbeatR
441456 if report == nil {
442457 u .numStoresReported ++
443458 if u .numStoresReported == len (u .storeReports ) {
444- return true
459+ return true , nil
445460 }
446461 }
447462 }
448- return false
463+ return false , nil
449464}
450465
451466// Gets the stage of the current unsafe recovery.
@@ -1172,6 +1187,7 @@ func (u *unsafeRecoveryController) generateExitForceLeaderPlan() bool {
11721187 for storeID , storeReport := range u .storeReports {
11731188 for _ , peerReport := range storeReport .PeerReports {
11741189 if peerReport .IsForceLeader {
1190+ // empty recovery plan triggers exit force leader on TiKV side
11751191 _ = u .getRecoveryPlan (storeID )
11761192 hasPlan = true
11771193 break
0 commit comments