Skip to content

Commit 3095bb6

Browse files
authored
tests: stabilize TestUpdateMemberWhenRecovery (#10442)
Closes #9994. tests: stabilize TestUpdateMemberWhenRecovery. Signed-off-by: okjiang <819421878@qq.com>
1 parent ea48887 commit 3095bb6

File tree

1 file changed

+33
-19
lines changed

1 file changed

+33
-19
lines changed

tests/integrations/mcs/keyspace/tso_keyspace_group_test.go

Lines changed: 33 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -660,16 +660,16 @@ func (suite *keyspaceGroupTestSuite) setupTSONodesAndClient(re *require.Assertio
660660

661661
// TestUpdateMemberWhenRecovery verifies that in TSO microservice mode (API_SVC_MODE), when all TSO nodes
662662
// become temporarily unavailable and then recover, the client should NOT fall back to
663-
// the legacy path (group 0), but should wait and successfully get TSO after nodes restart.
663+
// the legacy path (group 0), but should eventually get a newer TSO once service recovers.
664664
//
665665
// Test scenario:
666666
// 1. Setup: Start 2 TSO nodes and create keyspace group 1, client gets initial TSO
667667
// 2. Close all TSO nodes to simulate total TSO microservice failure
668668
// 3. Wait until all TSO nodes are deregistered (getTSOServerURLs returns empty)
669669
// 4. Enable failpoints: assertNotReachLegacyPath (panic if fallback) and extend timeout
670-
// 5. Start async GetTS call (will wait for TSO service to recover)
671-
// 6. Restart one TSO node while GetTS is waiting
672-
// 7. Verify GetTS succeeds after node restart (assertNotReachLegacyPath ensures no fallback)
670+
// 5. Start an async GetTS call while the TSO service is down
671+
// 6. Restart one TSO node to recover the TSO service
672+
// 7. Verify eventual recovery: either the in-flight GetTS or a fresh retry gets a newer TS
673673
func (suite *keyspaceGroupTestSuite) TestUpdateMemberWhenRecovery() {
674674
re := suite.Require()
675675

@@ -712,7 +712,7 @@ func (suite *keyspaceGroupTestSuite) TestUpdateMemberWhenRecovery() {
712712
re.NoError(failpoint.Disable("github.com/tikv/pd/client/servicediscovery/assertNotReachLegacyPath"))
713713
}()
714714

715-
// Step 5: Start async GetTS call - it will wait for TSO service to recover
715+
// Step 5: Start an async GetTS call while the TSO service is unavailable
716716
// Use an independent context with explicit timeout for this GetTS operation
717717
getTSCtx, getTSCancel := context.WithTimeout(context.Background(), 60*time.Second)
718718
defer getTSCancel()
@@ -731,25 +731,39 @@ func (suite *keyspaceGroupTestSuite) TestUpdateMemberWhenRecovery() {
731731

732732
time.Sleep(waitForGetTSStart) // Give it time to begin execution
733733

734-
// Step 6: Restart one TSO node while GetTS is waiting
734+
// Step 6: Restart one TSO node to recover the TSO service
735735
newNode, cleanup := tests.StartSingleTSOTestServer(suite.ctx, re, suite.backendEndpoints, firstNodeAddr)
736736
setup.cleanups = append(setup.cleanups, cleanup)
737737
nodes[newNode.GetAddr()] = newNode
738738
tests.WaitForPrimaryServing(re, map[string]bs.Server{newNode.GetAddr(): newNode})
739739

740-
// Step 7: Verify GetTS succeeds after node restart.
741-
// The restarted node may transiently serve stale keyspace-group metadata
742-
// before watch sync catches up, so tolerate one transient GetTS error and
743-
// assert eventual recovery instead of requiring immediate success.
744-
result := <-resultCh
745-
if result.err != nil {
746-
testutil.Eventually(re, func() bool {
747-
retryCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
748-
defer cancel()
749-
_, _, err := client.GetTS(retryCtx)
750-
return err == nil
751-
}, testutil.WithWaitFor(60*time.Second), testutil.WithTickInterval(500*time.Millisecond))
752-
}
740+
// Step 7: Verify eventual recovery after node restart.
741+
// The in-flight GetTS may stay attached to stale discovery/metadata during
742+
// recovery. Allow either the blocked request or a fresh retry to observe the
743+
// recovered TSO service and return a newer timestamp.
744+
var recoveredTS uint64
745+
testutil.Eventually(re, func() bool {
746+
select {
747+
case result := <-resultCh:
748+
if result.err == nil {
749+
recoveredTS = tsoutil.ComposeTS(result.physicalTS, result.logicalTS)
750+
return recoveredTS > setup.initialTS
751+
}
752+
default:
753+
}
754+
755+
retryCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
756+
defer cancel()
757+
758+
physicalTS, logicalTS, err := client.GetTS(retryCtx)
759+
if err != nil {
760+
return false
761+
}
762+
recoveredTS = tsoutil.ComposeTS(physicalTS, logicalTS)
763+
return recoveredTS > setup.initialTS
764+
}, testutil.WithWaitFor(60*time.Second), testutil.WithTickInterval(500*time.Millisecond))
765+
getTSCancel()
766+
re.Greater(recoveredTS, setup.initialTS)
753767

754768
// KEY VERIFICATION: If the code incorrectly tried to fall back to the legacy path,
755769
// assertNotReachLegacyPath failpoint would have panicked already

0 commit comments

Comments
 (0)