Skip to content

Commit cb52d28

Browse files
ti-chi-botnolouchti-chi-bot[bot]
authored
replication_mode: fix the state cannot switch to async while existing learner node (#6452) (#6483)
ref #4399, ref #6452, close tikv/tikv#14704 replication_mode: fix the state cannot switch to async while existing learner node - skip the learner node when check the stores state Signed-off-by: ti-chi-bot <ti-community-prow-bot@tidb.io> Signed-off-by: nolouch <nolouch@gmail.com> Co-authored-by: ShuNing <nolouch@gmail.com> Co-authored-by: nolouch <nolouch@gmail.com> Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com>
1 parent d086bd5 commit cb52d28

File tree

9 files changed

+45
-10
lines changed

9 files changed

+45
-10
lines changed

pkg/mock/mockcluster/mockcluster.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,13 @@ func (mc *Cluster) AddLabelsStore(storeID uint64, regionCount int, labels map[st
337337
mc.PutStore(store)
338338
}
339339

340+
// AddLabersStoreWithLearnerCount adds a store with the specified region count, learner count and labels.
341+
func (mc *Cluster) AddLabersStoreWithLearnerCount(storeID uint64, regionCount int, learnerCount int, labels map[string]string) {
342+
mc.AddLabelsStore(storeID, regionCount, labels)
343+
store := mc.GetStore(storeID).Clone(core.SetLearnerCount(learnerCount))
344+
mc.PutStore(store)
345+
}
346+
340347
// AddLeaderRegion adds region with specified leader and followers.
341348
func (mc *Cluster) AddLeaderRegion(regionID uint64, leaderStoreID uint64, otherPeerStoreIDs ...uint64) *core.RegionInfo {
342349
origin := mc.newMockRegionInfo(regionID, leaderStoreID, otherPeerStoreIDs...)

server/api/store.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,9 @@ type StoreStatus struct {
5555
RegionWeight float64 `json:"region_weight"`
5656
RegionScore float64 `json:"region_score"`
5757
RegionSize int64 `json:"region_size"`
58-
WitnessCount int `json:"witness_count"`
59-
SlowScore uint64 `json:"slow_score"`
58+
LearnerCount int `json:"learner_count,omitempty"`
59+
WitnessCount int `json:"witness_count,omitempty"`
60+
SlowScore uint64 `json:"slow_score,omitempty"`
6061
SendingSnapCount uint32 `json:"sending_snap_count,omitempty"`
6162
ReceivingSnapCount uint32 `json:"receiving_snap_count,omitempty"`
6263
IsBusy bool `json:"is_busy,omitempty"`
@@ -94,6 +95,7 @@ func newStoreInfo(opt *config.ScheduleConfig, store *core.StoreInfo) *StoreInfo
9495
RegionWeight: store.GetRegionWeight(),
9596
RegionScore: store.RegionScore(opt.RegionScoreFormulaVersion, opt.HighSpaceRatio, opt.LowSpaceRatio, 0),
9697
RegionSize: store.GetRegionSize(),
98+
LearnerCount: store.GetLearnerCount(),
9799
WitnessCount: store.GetWitnessCount(),
98100
SlowScore: store.GetSlowScore(),
99101
SendingSnapCount: store.GetSendingSnapCount(),

server/core/basic_cluster.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,10 +150,10 @@ func (bc *BasicCluster) ResetStoreLimit(storeID uint64, limitType storelimit.Typ
150150

151151
// UpdateStoreStatus updates the information of the store.
152152
func (bc *BasicCluster) UpdateStoreStatus(storeID uint64) {
153-
leaderCount, regionCount, witnessCount, pendingPeerCount, leaderRegionSize, regionSize := bc.RegionsInfo.GetStoreStats(storeID)
153+
leaderCount, regionCount, witnessCount, learnerCount, pendingPeerCount, leaderRegionSize, regionSize := bc.RegionsInfo.GetStoreStats(storeID)
154154
bc.Stores.mu.Lock()
155155
defer bc.Stores.mu.Unlock()
156-
bc.Stores.UpdateStoreStatus(storeID, leaderCount, regionCount, pendingPeerCount, leaderRegionSize, regionSize, witnessCount)
156+
bc.Stores.UpdateStoreStatus(storeID, leaderCount, regionCount, witnessCount, learnerCount, pendingPeerCount, leaderRegionSize, regionSize)
157157
}
158158

159159
// PutStore put a store.

server/core/region.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1171,11 +1171,11 @@ func (r *RegionsInfo) GetMetaRegions() []*metapb.Region {
11711171
}
11721172

11731173
// GetStoreStats returns the store stats.
1174-
func (r *RegionsInfo) GetStoreStats(storeID uint64) (leader, region, witness, pending int, leaderSize, regionSize int64) {
1174+
func (r *RegionsInfo) GetStoreStats(storeID uint64) (leader, region, witness, learner, pending int, leaderSize, regionSize int64) {
11751175
r.st.RLock()
11761176
defer r.st.RUnlock()
11771177
return r.leaders[storeID].length(), r.getStoreRegionCountLocked(storeID), r.witnesses[storeID].length(),
1178-
r.pendingPeers[storeID].length(), r.leaders[storeID].TotalSize(), r.getStoreRegionSizeLocked(storeID)
1178+
r.learners[storeID].length(), r.pendingPeers[storeID].length(), r.leaders[storeID].TotalSize(), r.getStoreRegionSizeLocked(storeID)
11791179
}
11801180

11811181
// GetRegionCount gets the total count of RegionInfo of regionMap

server/core/store.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ type StoreInfo struct {
5252
slowStoreEvicted bool // this store has been evicted as a slow store, should not transfer leader to it
5353
leaderCount int
5454
regionCount int
55+
learnerCount int
5556
witnessCount int
5657
leaderSize int64
5758
regionSize int64
@@ -221,6 +222,11 @@ func (s *StoreInfo) GetRegionCount() int {
221222
return s.regionCount
222223
}
223224

225+
// GetLearnerCount returns the learner count of the store.
226+
func (s *StoreInfo) GetLearnerCount() int {
227+
return s.learnerCount
228+
}
229+
224230
// GetWitnessCount returns the witness count of the store.
225231
func (s *StoreInfo) GetWitnessCount() int {
226232
return s.witnessCount
@@ -709,11 +715,12 @@ func (s *StoresInfo) SetRegionSize(storeID uint64, regionSize int64) {
709715
}
710716

711717
// UpdateStoreStatus updates the information of the store.
712-
func (s *StoresInfo) UpdateStoreStatus(storeID uint64, leaderCount int, regionCount int, pendingPeerCount int, leaderSize int64, regionSize int64, witnessCount int) {
718+
func (s *StoresInfo) UpdateStoreStatus(storeID uint64, leaderCount, regionCount, witnessCount, learnerCount, pendingPeerCount int, leaderSize int64, regionSize int64) {
713719
if store, ok := s.stores[storeID]; ok {
714720
newStore := store.ShallowClone(SetLeaderCount(leaderCount),
715721
SetRegionCount(regionCount),
716722
SetWitnessCount(witnessCount),
723+
SetLearnerCount(learnerCount),
717724
SetPendingPeerCount(pendingPeerCount),
718725
SetLeaderSize(leaderSize),
719726
SetRegionSize(regionSize))

server/core/store_option.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,13 @@ func SetRegionCount(regionCount int) StoreCreateOption {
150150
}
151151
}
152152

153+
// SetLearnerCount sets the learner count for the store.
154+
func SetLearnerCount(learnerCount int) StoreCreateOption {
155+
return func(store *StoreInfo) {
156+
store.learnerCount = learnerCount
157+
}
158+
}
159+
153160
// SetWitnessCount sets the witness count for the store.
154161
func SetWitnessCount(witnessCount int) StoreCreateOption {
155162
return func(store *StoreInfo) {

server/replication/replication_mode.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -503,6 +503,10 @@ func (m *ModeManager) checkStoreStatus() [][]uint64 {
503503
if s.IsRemoved() {
504504
continue
505505
}
506+
// learner peers do not participate in majority commit or voting, so a store whose regions are all learners should not be counted as a normal store in primary/dr.
507+
if s.GetRegionCount() == s.GetLearnerCount() {
508+
continue
509+
}
506510
down := s.DownTime() >= m.config.DRAutoSync.WaitStoreTimeout.Duration
507511
labelValue := s.GetLabelValue(m.config.DRAutoSync.LabelKey)
508512
if labelValue == m.config.DRAutoSync.Primary {

server/replication/replication_mode_test.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ func TestStateSwitch(t *testing.T) {
167167
Primary: "zone1",
168168
DR: "zone2",
169169
PrimaryReplicas: 4,
170-
DRReplicas: 1,
170+
DRReplicas: 2,
171171
WaitStoreTimeout: typeutil.Duration{Duration: time.Minute},
172172
}}
173173
cluster := mockcluster.NewCluster(ctx, config.NewTestOptions())
@@ -214,7 +214,7 @@ func TestStateSwitch(t *testing.T) {
214214

215215
// add new store in dr zone.
216216
cluster.AddLabelsStore(5, 1, map[string]string{"zone": "zone2"})
217-
cluster.AddLabelsStore(6, 1, map[string]string{"zone": "zone2"})
217+
cluster.AddLabersStoreWithLearnerCount(6, 1, 1, map[string]string{"zone": "zone2"})
218218
// async -> sync
219219
rep.tickDR()
220220
re.Equal(drStateSyncRecover, rep.drGetState())
@@ -233,10 +233,14 @@ func TestStateSwitch(t *testing.T) {
233233
rep.tickDR()
234234
re.Equal(drStateSync, rep.drGetState()) // cannot guarantee majority, keep sync.
235235

236+
setStoreState(cluster, "up", "up", "up", "up", "up", "down")
237+
rep.tickDR()
238+
re.Equal(drStateSync, rep.drGetState())
239+
240+
// once the voter node is down, switch to the async state even if the learner node is up.
236241
setStoreState(cluster, "up", "up", "up", "up", "down", "up")
237242
rep.tickDR()
238243
re.Equal(drStateAsyncWait, rep.drGetState())
239-
assertStateIDUpdate()
240244

241245
rep.drSwitchToSync()
242246
replicator.errors[2] = errors.New("fail to replicate")

server/statistics/store_collection.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ type storeStatistics struct {
4343
StorageCapacity uint64
4444
RegionCount int
4545
LeaderCount int
46+
LearnerCount int
4647
WitnessCount int
4748
LabelCounter map[string]int
4849
Preparing int
@@ -119,6 +120,7 @@ func (s *storeStatistics) Observe(store *core.StoreInfo, stats *StoresStats) {
119120
storeStatusGauge.WithLabelValues(storeAddress, id, "leader_size").Set(float64(store.GetLeaderSize()))
120121
storeStatusGauge.WithLabelValues(storeAddress, id, "leader_count").Set(float64(store.GetLeaderCount()))
121122
storeStatusGauge.WithLabelValues(storeAddress, id, "witness_count").Set(float64(store.GetWitnessCount()))
123+
storeStatusGauge.WithLabelValues(storeAddress, id, "learner_count").Set(float64(store.GetLearnerCount()))
122124
storeStatusGauge.WithLabelValues(storeAddress, id, "store_available").Set(float64(store.GetAvailable()))
123125
storeStatusGauge.WithLabelValues(storeAddress, id, "store_used").Set(float64(store.GetUsedSize()))
124126
storeStatusGauge.WithLabelValues(storeAddress, id, "store_capacity").Set(float64(store.GetCapacity()))
@@ -170,6 +172,7 @@ func (s *storeStatistics) Collect() {
170172
metrics["region_count"] = float64(s.RegionCount)
171173
metrics["leader_count"] = float64(s.LeaderCount)
172174
metrics["witness_count"] = float64(s.WitnessCount)
175+
metrics["learner_count"] = float64(s.LearnerCount)
173176
metrics["storage_size"] = float64(s.StorageSize)
174177
metrics["storage_capacity"] = float64(s.StorageCapacity)
175178

@@ -241,6 +244,7 @@ func (s *storeStatistics) resetStoreStatistics(storeAddress string, id string) {
241244
"leader_size",
242245
"leader_count",
243246
"witness_count",
247+
"learner_count",
244248
"store_available",
245249
"store_used",
246250
"store_capacity",

0 commit comments

Comments
 (0)