Skip to content

Commit d190c0e

Browse files
ti-chi-bot and lhy1024 authored
cluster: fix tso fallback due raft cluster did not stop tso service (part2) (#8885) (#8890)
ref #8477, close #8889 Signed-off-by: lhy1024 <admin@liudos.us> Co-authored-by: lhy1024 <admin@liudos.us>
1 parent c0daa90 commit d190c0e

File tree

4 files changed

+140
-12
lines changed

4 files changed

+140
-12
lines changed

server/cluster/cluster.go

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,7 @@ func (c *RaftCluster) InitCluster(
318318
}
319319

320320
// Start starts a cluster.
321-
func (c *RaftCluster) Start(s Server) error {
321+
func (c *RaftCluster) Start(s Server, bootstrap bool) (err error) {
322322
c.Lock()
323323
defer c.Unlock()
324324

@@ -327,11 +327,32 @@ func (c *RaftCluster) Start(s Server) error {
327327
return nil
328328
}
329329
c.isAPIServiceMode = s.IsAPIServiceMode()
330-
err := c.InitCluster(s.GetAllocator(), s.GetPersistOptions(), s.GetHBStreams(), s.GetKeyspaceGroupManager())
330+
err = c.InitCluster(s.GetAllocator(), s.GetPersistOptions(), s.GetHBStreams(), s.GetKeyspaceGroupManager())
331331
if err != nil {
332332
return err
333333
}
334-
c.checkTSOService()
334+
// We should not manage tso service when bootstrap try to start raft cluster.
335+
// It only is controlled by leader election.
336+
// Ref: https://github.com/tikv/pd/issues/8836
337+
if !bootstrap {
338+
c.checkTSOService()
339+
}
340+
defer func() {
341+
if !bootstrap && err != nil {
342+
if err := c.stopTSOJobsIfNeeded(); err != nil {
343+
log.Error("failed to stop TSO jobs", errs.ZapError(err))
344+
return
345+
}
346+
}
347+
}()
348+
failpoint.Inject("raftClusterReturn", func(val failpoint.Value) {
349+
if val, ok := val.(bool); (ok && val) || !ok {
350+
err = errors.New("raftClusterReturn")
351+
} else {
352+
err = nil
353+
}
354+
failpoint.Return(err)
355+
})
335356
cluster, err := c.LoadClusterInfo()
336357
if err != nil {
337358
return err
@@ -422,12 +443,12 @@ func (c *RaftCluster) checkTSOService() {
422443
log.Info("TSO is provided by PD")
423444
c.UnsetServiceIndependent(constant.TSOServiceName)
424445
} else {
425-
if err := c.startTSOJobsIfNeeded(); err != nil {
446+
if err := c.stopTSOJobsIfNeeded(); err != nil {
426447
log.Error("failed to stop TSO jobs", errs.ZapError(err))
427448
return
428449
}
429-
log.Info("TSO is provided by TSO server")
430450
if !c.IsServiceIndependent(constant.TSOServiceName) {
451+
log.Info("TSO is provided by TSO server")
431452
c.SetServiceIndependent(constant.TSOServiceName)
432453
}
433454
}
@@ -2579,3 +2600,13 @@ func (c *RaftCluster) SetServiceIndependent(name string) {
25792600
func (c *RaftCluster) UnsetServiceIndependent(name string) {
25802601
c.independentServices.Delete(name)
25812602
}
2603+
2604+
// GetGlobalTSOAllocator return global tso allocator
2605+
// It only is used for test.
2606+
func (c *RaftCluster) GetGlobalTSOAllocator() tso.Allocator {
2607+
allocator, err := c.tsoAllocator.GetAllocator(tso.GlobalDCLocation)
2608+
if err != nil {
2609+
return nil
2610+
}
2611+
return allocator
2612+
}

server/server.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -777,7 +777,7 @@ func (s *Server) bootstrapCluster(req *pdpb.BootstrapRequest) (*pdpb.BootstrapRe
777777
log.Warn("flush the bootstrap region failed", errs.ZapError(err))
778778
}
779779

780-
if err := s.cluster.Start(s); err != nil {
780+
if err := s.cluster.Start(s, true); err != nil {
781781
return nil, err
782782
}
783783

@@ -795,7 +795,7 @@ func (s *Server) createRaftCluster() error {
795795
return nil
796796
}
797797

798-
return s.cluster.Start(s)
798+
return s.cluster.Start(s, false)
799799
}
800800

801801
func (s *Server) stopRaftCluster() {
@@ -2125,3 +2125,9 @@ func (s *Server) GetMaxResetTSGap() time.Duration {
21252125
func (s *Server) SetClient(client *clientv3.Client) {
21262126
s.client = client
21272127
}
2128+
2129+
// GetGlobalTSOAllocator return global tso allocator
2130+
// It only is used for test.
2131+
func (s *Server) GetGlobalTSOAllocator() tso.Allocator {
2132+
return s.cluster.GetGlobalTSOAllocator()
2133+
}

tests/server/api/api_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -925,7 +925,7 @@ func TestSendApiWhenRestartRaftCluster(t *testing.T) {
925925
output := sendRequest(re, leader.GetAddr()+"/pd/api/v1/min-resolved-ts", http.MethodGet, http.StatusInternalServerError)
926926
re.Contains(string(output), "TiKV cluster not bootstrapped, please start TiKV first")
927927

928-
err = rc.Start(leader.GetServer())
928+
err = rc.Start(leader.GetServer(), false)
929929
re.NoError(err)
930930
rc = leader.GetRaftCluster()
931931
re.NotNil(rc)

tests/server/cluster/cluster_test.go

Lines changed: 95 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -578,7 +578,7 @@ func TestRaftClusterRestart(t *testing.T) {
578578
re.NotNil(rc)
579579
rc.Stop()
580580

581-
err = rc.Start(leaderServer.GetServer())
581+
err = rc.Start(leaderServer.GetServer(), false)
582582
re.NoError(err)
583583

584584
rc = leaderServer.GetRaftCluster()
@@ -619,14 +619,105 @@ func TestRaftClusterMultipleRestart(t *testing.T) {
619619
for range 100 {
620620
// See https://github.com/tikv/pd/issues/8543
621621
rc.Wait()
622-
err = rc.Start(leaderServer.GetServer())
622+
err = rc.Start(leaderServer.GetServer(), false)
623623
re.NoError(err)
624624
time.Sleep(time.Millisecond)
625625
rc.Stop()
626626
}
627627
re.NoError(failpoint.Disable("github.com/tikv/pd/server/cluster/highFrequencyClusterJobs"))
628628
}
629629

630+
// TestRaftClusterStartTSOJob is used to test whether tso job service is normally closed
631+
// when raft cluster is stopped ahead of time.
632+
// Ref: https://github.com/tikv/pd/issues/8836
633+
func TestRaftClusterStartTSOJob(t *testing.T) {
634+
re := require.New(t)
635+
name := "pd1"
636+
// case 1: normal start
637+
ctx, cancel := context.WithCancel(context.Background())
638+
tc, err := tests.NewTestCluster(ctx, 1, func(conf *config.Config, _ string) {
639+
conf.LeaderLease = 300
640+
})
641+
re.NoError(err)
642+
re.NoError(tc.RunInitialServers())
643+
re.NotEmpty(tc.WaitLeader())
644+
leaderServer := tc.GetLeaderServer()
645+
re.NotNil(leaderServer)
646+
leaderServer.BootstrapCluster()
647+
testutil.Eventually(re, func() bool {
648+
allocator := tc.GetServer(name).GetServer().GetGlobalTSOAllocator()
649+
return allocator.IsInitialize()
650+
})
651+
tc.Destroy()
652+
cancel()
653+
// case 2: return ahead of time but no error when start raft cluster
654+
re.NoError(failpoint.Enable("github.com/tikv/pd/server/cluster/raftClusterReturn", `return(false)`))
655+
ctx, cancel = context.WithCancel(context.Background())
656+
tc, err = tests.NewTestCluster(ctx, 1, func(conf *config.Config, _ string) {
657+
conf.LeaderLease = 300
658+
})
659+
re.NoError(err)
660+
err = tc.RunInitialServers()
661+
re.NoError(err)
662+
tc.WaitLeader()
663+
testutil.Eventually(re, func() bool {
664+
allocator := tc.GetServer(name).GetServer().GetGlobalTSOAllocator()
665+
return allocator.IsInitialize()
666+
})
667+
re.NoError(failpoint.Disable("github.com/tikv/pd/server/cluster/raftClusterReturn"))
668+
tc.Destroy()
669+
cancel()
670+
// case 3: meet error when start raft cluster
671+
re.NoError(failpoint.Enable("github.com/tikv/pd/server/cluster/raftClusterReturn", `return(true)`))
672+
ctx, cancel = context.WithCancel(context.Background())
673+
tc, err = tests.NewTestCluster(ctx, 1, func(conf *config.Config, _ string) {
674+
conf.LeaderLease = 300
675+
})
676+
re.NoError(err)
677+
err = tc.RunInitialServers()
678+
re.NoError(err)
679+
tc.WaitLeader()
680+
testutil.Eventually(re, func() bool {
681+
allocator := tc.GetServer(name).GetServer().GetGlobalTSOAllocator()
682+
return !allocator.IsInitialize()
683+
})
684+
re.NoError(failpoint.Disable("github.com/tikv/pd/server/cluster/raftClusterReturn"))
685+
tc.Destroy()
686+
cancel()
687+
// case 4: multiple bootstrap in 3 pd cluster
688+
ctx, cancel = context.WithCancel(context.Background())
689+
tc, err = tests.NewTestCluster(ctx, 3, func(conf *config.Config, _ string) {
690+
conf.LeaderLease = 300
691+
})
692+
re.NoError(err)
693+
re.NoError(tc.RunInitialServers())
694+
re.NotEmpty(tc.WaitLeader())
695+
leaderServer = tc.GetLeaderServer()
696+
re.NotNil(leaderServer)
697+
name = leaderServer.GetLeader().GetName()
698+
wg := sync.WaitGroup{}
699+
for range 3 {
700+
wg.Add(1)
701+
go func() {
702+
leaderServer.BootstrapCluster()
703+
wg.Done()
704+
}()
705+
}
706+
wg.Wait()
707+
testutil.Eventually(re, func() bool {
708+
allocator := leaderServer.GetServer().GetGlobalTSOAllocator()
709+
return allocator.IsInitialize()
710+
})
711+
re.NoError(tc.ResignLeader())
712+
re.NotEmpty(tc.WaitLeader())
713+
testutil.Eventually(re, func() bool {
714+
allocator := tc.GetServer(name).GetServer().GetGlobalTSOAllocator()
715+
return !allocator.IsInitialize()
716+
})
717+
tc.Destroy()
718+
cancel()
719+
}
720+
630721
func newMetaStore(storeID uint64, addr, version string, state metapb.StoreState, deployPath string) *metapb.Store {
631722
return &metapb.Store{Id: storeID, Address: addr, Version: version, State: state, DeployPath: deployPath}
632723
}
@@ -1435,7 +1526,7 @@ func TestTransferLeaderForScheduler(t *testing.T) {
14351526
tc.WaitLeader()
14361527
leaderServer = tc.GetLeaderServer()
14371528
rc1 := leaderServer.GetServer().GetRaftCluster()
1438-
rc1.Start(leaderServer.GetServer())
1529+
rc1.Start(leaderServer.GetServer(), false)
14391530
re.NoError(err)
14401531
re.NotNil(rc1)
14411532
// region heartbeat
@@ -1455,7 +1546,7 @@ func TestTransferLeaderForScheduler(t *testing.T) {
14551546
tc.WaitLeader()
14561547
leaderServer = tc.GetLeaderServer()
14571548
rc = leaderServer.GetServer().GetRaftCluster()
1458-
rc.Start(leaderServer.GetServer())
1549+
rc.Start(leaderServer.GetServer(), false)
14591550
re.NotNil(rc)
14601551
// region heartbeat
14611552
id = leaderServer.GetAllocator()

0 commit comments

Comments
 (0)