@@ -660,16 +660,16 @@ func (suite *keyspaceGroupTestSuite) setupTSONodesAndClient(re *require.Assertio
660660
661661// TestUpdateMemberWhenRecovery verifies that in TSO microservice mode (API_SVC_MODE), when all TSO nodes
662662// become temporarily unavailable and then recover, the client should NOT fallback to
663- // the legacy path (group 0), but should wait and successfully get TSO after nodes restart.
663+ // the legacy path (group 0), but should eventually get a newer TSO once the service recovers.
664664//
665665// Test scenario:
666666// 1. Setup: Start 2 TSO nodes and create keyspace group 1, client gets initial TSO
667667// 2. Close all TSO nodes to simulate total TSO microservice failure
668668// 3. Wait until all TSO nodes are deregistered (getTSOServerURLs returns empty)
669669// 4. Enable failpoints: assertNotReachLegacyPath (panic if fallback) and extend timeout
670- // 5. Start async GetTS call (will wait for TSO service to recover)
671- // 6. Restart one TSO node while GetTS is waiting
672- // 7. Verify GetTS succeeds after node restart (assertNotReachLegacyPath ensures no fallback)
670+ // 5. Start an async GetTS call while the TSO service is down
671+ // 6. Restart one TSO node to recover the TSO service
672+ // 7. Verify eventual recovery: either the in-flight GetTS or a fresh retry gets a newer TS
673673func (suite * keyspaceGroupTestSuite ) TestUpdateMemberWhenRecovery () {
674674 re := suite .Require ()
675675
@@ -712,7 +712,7 @@ func (suite *keyspaceGroupTestSuite) TestUpdateMemberWhenRecovery() {
712712 re .NoError (failpoint .Disable ("github.com/tikv/pd/client/servicediscovery/assertNotReachLegacyPath" ))
713713 }()
714714
715- // Step 5: Start async GetTS call - it will wait for TSO service to recover
715+ // Step 5: Start an async GetTS call while the TSO service is unavailable
716716 // Use an independent context with explicit timeout for this GetTS operation
717717 getTSCtx , getTSCancel := context .WithTimeout (context .Background (), 60 * time .Second )
718718 defer getTSCancel ()
@@ -731,25 +731,39 @@ func (suite *keyspaceGroupTestSuite) TestUpdateMemberWhenRecovery() {
731731
732732 time .Sleep (waitForGetTSStart ) // Give it time to begin execution
733733
734- // Step 6: Restart one TSO node while GetTS is waiting
734+ // Step 6: Restart one TSO node to recover the TSO service
735735 newNode , cleanup := tests .StartSingleTSOTestServer (suite .ctx , re , suite .backendEndpoints , firstNodeAddr )
736736 setup .cleanups = append (setup .cleanups , cleanup )
737737 nodes [newNode .GetAddr ()] = newNode
738738 tests .WaitForPrimaryServing (re , map [string ]bs.Server {newNode .GetAddr (): newNode })
739739
740- // Step 7: Verify GetTS succeeds after node restart.
741- // The restarted node may transiently serve stale keyspace-group metadata
742- // before watch sync catches up, so tolerate one transient GetTS error and
743- // assert eventual recovery instead of requiring immediate success.
744- result := <- resultCh
745- if result .err != nil {
746- testutil .Eventually (re , func () bool {
747- retryCtx , cancel := context .WithTimeout (context .Background (), 10 * time .Second )
748- defer cancel ()
749- _ , _ , err := client .GetTS (retryCtx )
750- return err == nil
751- }, testutil .WithWaitFor (60 * time .Second ), testutil .WithTickInterval (500 * time .Millisecond ))
752- }
740+ // Step 7: Verify eventual recovery after node restart.
741+ // The in-flight GetTS may stay attached to stale discovery/metadata during
742+ // recovery. Allow either the blocked request or a fresh retry to observe the
743+ // recovered TSO service and return a newer timestamp.
744+ var recoveredTS uint64
745+ testutil .Eventually (re , func () bool {
746+ select {
747+ case result := <- resultCh :
748+ if result .err == nil {
749+ recoveredTS = tsoutil .ComposeTS (result .physicalTS , result .logicalTS )
750+ return recoveredTS > setup .initialTS
751+ }
752+ default :
753+ }
754+
755+ retryCtx , cancel := context .WithTimeout (context .Background (), 10 * time .Second )
756+ defer cancel ()
757+
758+ physicalTS , logicalTS , err := client .GetTS (retryCtx )
759+ if err != nil {
760+ return false
761+ }
762+ recoveredTS = tsoutil .ComposeTS (physicalTS , logicalTS )
763+ return recoveredTS > setup .initialTS
764+ }, testutil .WithWaitFor (60 * time .Second ), testutil .WithTickInterval (500 * time .Millisecond ))
765+ getTSCancel ()
766+ re .Greater (recoveredTS , setup .initialTS )
753767
754768 // KEY VERIFICATION: If code incorrectly tried to fallback to legacy path,
755769 // assertNotReachLegacyPath failpoint would have panicked already
0 commit comments