Skip to content

Commit a7c733b

Browse files
authored
PD api: add metrics and log (tikv#455)
* add metrics and log

Signed-off-by: ystaticy <y_static_y@sina.com>

* add metrics and log

Signed-off-by: ystaticy <y_static_y@sina.com>

* add metrics and log

Signed-off-by: ystaticy <y_static_y@sina.com>

* add leader ready log

Signed-off-by: ystaticy <y_static_y@sina.com>

---------

Signed-off-by: ystaticy <y_static_y@sina.com>
1 parent 18200a5 commit a7c733b

File tree

5 files changed

+138
-18
lines changed

5 files changed

+138
-18
lines changed

pkg/schedule/labeler/labeler.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@ type RegionLabeler struct {
4242

4343
// NewRegionLabeler creates a Labeler instance.
4444
func NewRegionLabeler(ctx context.Context, storage endpoint.RuleStorage, gcInterval time.Duration) (*RegionLabeler, error) {
45+
start := time.Now()
46+
defer func() {
47+
newRegionLabelerDuration.Observe(time.Since(start).Seconds())
48+
}()
49+
4550
l := &RegionLabeler{
4651
storage: storage,
4752
labelRules: make(map[string]*LabelRule),
@@ -52,6 +57,7 @@ func NewRegionLabeler(ctx context.Context, storage endpoint.RuleStorage, gcInter
5257
if err := l.loadRules(); err != nil {
5358
return nil, err
5459
}
60+
log.Info("new region labeler created", zap.Int("label-rules-count", len(l.labelRules)))
5561
go l.doGC(gcInterval)
5662
return l, nil
5763
}

pkg/schedule/labeler/metrics.go

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,27 @@ package labeler
1616

1717
import "github.com/prometheus/client_golang/prometheus"
1818

19-
// LabelerEventCounter is a counter of the scheduler labeler system.
20-
var LabelerEventCounter = prometheus.NewCounterVec(
21-
prometheus.CounterOpts{
22-
Namespace: "pd",
23-
Subsystem: "schedule",
24-
Name: "labeler_event_counter",
25-
Help: "Counter of the scheduler label.",
26-
}, []string{"type", "event"})
19+
var (
20+
// LabelerEventCounter is a counter of the scheduler labeler system.
21+
LabelerEventCounter = prometheus.NewCounterVec(
22+
prometheus.CounterOpts{
23+
Namespace: "pd",
24+
Subsystem: "schedule",
25+
Name: "labeler_event_counter",
26+
Help: "Counter of the scheduler label.",
27+
}, []string{"type", "event"})
28+
29+
newRegionLabelerDuration = prometheus.NewHistogram(
30+
prometheus.HistogramOpts{
31+
Namespace: "pd",
32+
Subsystem: "schedule",
33+
Name: "new_region_labeler_duration_seconds",
34+
Help: "Bucketed histogram of processing time (s) of creating new region labeler.",
35+
Buckets: prometheus.ExponentialBuckets(0.001, 2, 13), // 1ms ~ 4s
36+
})
37+
)
2738

2839
func init() {
2940
prometheus.MustRegister(LabelerEventCounter)
41+
prometheus.MustRegister(newRegionLabelerDuration)
3042
}

server/cluster/cluster.go

Lines changed: 60 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,11 @@ func (c *RaftCluster) InitCluster(
320320

321321
// Start starts a cluster.
322322
func (c *RaftCluster) Start(s Server, bootstrap bool) (err error) {
323+
start := time.Now()
324+
defer func() {
325+
raftClusterStartDuration.Observe(time.Since(start).Seconds())
326+
}()
327+
323328
c.Lock()
324329
defer c.Unlock()
325330

@@ -328,15 +333,24 @@ func (c *RaftCluster) Start(s Server, bootstrap bool) (err error) {
328333
return nil
329334
}
330335
c.isAPIServiceMode = s.IsAPIServiceMode()
336+
log.Info("[leader-ready] start to init cluster")
337+
initClusterStart := time.Now()
331338
err = c.InitCluster(s.GetAllocator(), s.GetPersistOptions(), s.GetHBStreams(), s.GetKeyspaceGroupManager())
332339
if err != nil {
340+
log.Error("[leader-ready] failed to init cluster", errs.ZapError(err), zap.Duration("cost", time.Since(initClusterStart)))
333341
return err
334342
}
343+
initClusterDuration := time.Since(initClusterStart)
344+
log.Info("[leader-ready] init cluster completed", zap.Duration("cost", initClusterDuration))
335345
// We should not manage tso service when bootstrap try to start raft cluster.
336346
// It only is controlled by leader election.
337347
// Ref: https://github.com/tikv/pd/issues/8836
338348
if !bootstrap {
349+
log.Info("[leader-ready] start to check TSO service")
350+
checkTSOStart := time.Now()
339351
c.checkTSOService()
352+
checkTSODuration := time.Since(checkTSOStart)
353+
log.Info("[leader-ready] check TSO service completed", zap.Duration("cost", checkTSODuration))
340354
}
341355
defer func() {
342356
if !bootstrap && err != nil {
@@ -354,45 +368,83 @@ func (c *RaftCluster) Start(s Server, bootstrap bool) (err error) {
354368
}
355369
failpoint.Return(err)
356370
})
371+
log.Info("[leader-ready] start to load cluster info")
372+
loadClusterInfoStart := time.Now()
357373
cluster, err := c.LoadClusterInfo()
358374
if err != nil {
375+
log.Error("[leader-ready] failed to load cluster info", errs.ZapError(err), zap.Duration("cost", time.Since(loadClusterInfoStart)))
359376
return err
360377
}
378+
loadClusterInfoDuration := time.Since(loadClusterInfoStart)
361379
if cluster == nil {
362-
log.Warn("cluster is not bootstrapped")
380+
log.Warn("[leader-ready] cluster is not bootstrapped", zap.Duration("cost", loadClusterInfoDuration))
363381
return nil
364382
}
383+
log.Info("[leader-ready] load cluster info completed", zap.Duration("cost", loadClusterInfoDuration))
365384

385+
log.Info("[leader-ready] creating region labeler")
386+
labelerStart := time.Now()
366387
c.regionLabeler, err = labeler.NewRegionLabeler(c.ctx, c.storage, regionLabelGCInterval)
388+
labelerDuration := time.Since(labelerStart)
367389
if err != nil {
390+
log.Error("[leader-ready] region labeler creation failed", zap.Error(err), zap.Duration("cost", labelerDuration))
368391
return err
369392
}
393+
log.Info("[leader-ready] region labeler created", zap.Duration("cost", labelerDuration))
370394

371395
if !c.IsServiceIndependent(constant.SchedulingServiceName) {
396+
log.Info("[leader-ready] start to observe slow store status")
397+
observeSlowStoreStart := time.Now()
372398
for _, store := range c.GetStores() {
373399
storeID := store.GetID()
374400
c.slowStat.ObserveSlowStoreStatus(storeID, store.IsSlow())
375401
}
402+
observeSlowStoreDuration := time.Since(observeSlowStoreStart)
403+
log.Info("[leader-ready] observe slow store status completed", zap.Duration("cost", observeSlowStoreDuration))
376404
}
405+
log.Info("[leader-ready] start to create replication mode manager")
406+
replicationModeStart := time.Now()
377407
c.replicationMode, err = replication.NewReplicationModeManager(s.GetConfig().ReplicationMode, c.storage, cluster, s)
378408
if err != nil {
409+
log.Error("[leader-ready] failed to create replication mode manager", errs.ZapError(err), zap.Duration("cost", time.Since(replicationModeStart)))
379410
return err
380411
}
412+
replicationModeDuration := time.Since(replicationModeStart)
413+
log.Info("[leader-ready] create replication mode manager completed", zap.Duration("cost", replicationModeDuration))
414+
log.Info("[leader-ready] start to create store limiter")
415+
limiterStart := time.Now()
381416
c.limiter = NewStoreLimiter(s.GetPersistOptions())
417+
limiterDuration := time.Since(limiterStart)
418+
log.Info("[leader-ready] create store limiter completed", zap.Duration("cost", limiterDuration))
419+
loadExternalTSStart := time.Now()
382420
c.externalTS, err = c.storage.LoadExternalTS()
383421
if err != nil {
384-
log.Error("load external timestamp meets error", zap.Error(err))
422+
log.Error("[leader-ready] load external timestamp meets error", zap.Error(err), zap.Duration("cost", time.Since(loadExternalTSStart)))
423+
} else {
424+
loadExternalTSDuration := time.Since(loadExternalTSStart)
425+
log.Info("[leader-ready] load external timestamp completed", zap.Duration("cost", loadExternalTSDuration))
385426
}
386427

387428
if c.isAPIServiceMode {
388429
// bootstrap keyspace group manager after starting other parts successfully.
389430
// This order avoids a stuck goroutine in keyspaceGroupManager when it fails to create raftcluster.
431+
log.Info("[leader-ready] start to bootstrap keyspace group manager")
432+
bootstrapKeyspaceStart := time.Now()
390433
err = c.keyspaceGroupManager.Bootstrap(c.ctx)
391434
if err != nil {
435+
log.Error("[leader-ready] failed to bootstrap keyspace group manager", errs.ZapError(err), zap.Duration("cost", time.Since(bootstrapKeyspaceStart)))
392436
return err
393437
}
438+
bootstrapKeyspaceDuration := time.Since(bootstrapKeyspaceStart)
439+
log.Info("[leader-ready] bootstrap keyspace group manager completed", zap.Duration("cost", bootstrapKeyspaceDuration))
394440
}
441+
log.Info("[leader-ready] start to check scheduling service")
442+
checkSchedulingStart := time.Now()
395443
c.checkSchedulingService()
444+
checkSchedulingDuration := time.Since(checkSchedulingStart)
445+
log.Info("[leader-ready] check scheduling service completed", zap.Duration("cost", checkSchedulingDuration))
446+
log.Info("[leader-ready] start to start background jobs")
447+
backgroundJobsStart := time.Now()
396448
c.wg.Add(9)
397449
go c.runServiceCheckJob()
398450
go c.runMetricsCollectionJob()
@@ -403,11 +455,17 @@ func (c *RaftCluster) Start(s Server, bootstrap bool) (err error) {
403455
go c.runStoreConfigSync()
404456
go c.runUpdateStoreStats()
405457
go c.startGCTuner()
458+
backgroundJobsDuration := time.Since(backgroundJobsStart)
459+
log.Info("[leader-ready] start background jobs completed", zap.Duration("cost", backgroundJobsDuration))
406460

461+
log.Info("[leader-ready] start to start runners")
462+
runnersStart := time.Now()
407463
c.running = true
408464
c.heartbeatRunner.Start(c.ctx)
409465
c.miscRunner.Start(c.ctx)
410466
c.logRunner.Start(c.ctx)
467+
runnersDuration := time.Since(runnersStart)
468+
log.Info("[leader-ready] start runners completed", zap.Duration("cost", runnersDuration))
411469
return nil
412470
}
413471

server/cluster/metrics.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,15 @@ var (
9595
Name: "store_sync",
9696
Help: "The state of store sync config",
9797
}, []string{"address", "state"})
98+
99+
raftClusterStartDuration = prometheus.NewHistogram(
100+
prometheus.HistogramOpts{
101+
Namespace: "pd",
102+
Subsystem: "cluster",
103+
Name: "raftcluster_start_duration_seconds",
104+
Help: "Bucketed histogram of processing time (s) of raft cluster start.",
105+
Buckets: prometheus.ExponentialBuckets(0.001, 2, 13), // 1ms ~ 4s
106+
})
98107
)
99108

100109
func init() {
@@ -108,4 +117,5 @@ func init() {
108117
prometheus.MustRegister(storesETAGauge)
109118
prometheus.MustRegister(storeSyncConfigEvent)
110119
prometheus.MustRegister(updateStoreStatsGauge)
120+
prometheus.MustRegister(raftClusterStartDuration)
111121
}

server/server.go

Lines changed: 42 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1733,6 +1733,9 @@ func (s *Server) campaignLeader() {
17331733
}
17341734
return
17351735
}
1736+
// Start timing from when leader is successfully elected
1737+
leaderReadyStart := time.Now()
1738+
log.Info("[leader-ready] leader election succeeded, start leader ready process")
17361739

17371740
// Start keepalive the leadership and enable TSO service.
17381741
// TSO service is strictly enabled/disabled by PD leader lease for 2 reasons:
@@ -1746,48 +1749,74 @@ func (s *Server) campaignLeader() {
17461749
})
17471750

17481751
// maintain the PD leadership, after this, TSO can be service.
1752+
log.Info("[leader-ready] start to keep leader lease")
1753+
keepLeaderStart := time.Now()
17491754
s.member.KeepLeader(ctx)
1755+
keepLeaderDuration := time.Since(keepLeaderStart)
1756+
log.Info("[leader-ready] keep leader lease completed", zap.Duration("cost", keepLeaderDuration))
17501757
log.Info(fmt.Sprintf("campaign %s leader ok", s.mode), zap.String("campaign-leader-name", s.Name()))
17511758

1759+
reloadConfigStart := time.Now()
17521760
if err := s.reloadConfigFromKV(); err != nil {
1753-
log.Error("failed to reload configuration", errs.ZapError(err))
1761+
log.Error("[leader-ready] failed to reload configuration", errs.ZapError(err), zap.Duration("cost", time.Since(reloadConfigStart)))
17541762
return
17551763
}
1764+
reloadConfigDuration := time.Since(reloadConfigStart)
1765+
log.Info("[leader-ready] reload config from KV completed", zap.Duration("cost", reloadConfigDuration))
17561766

1767+
loadTTLStart := time.Now()
17571768
if err := s.persistOptions.LoadTTLFromEtcd(s.ctx, s.client); err != nil {
1758-
log.Error("failed to load persistOptions from etcd", errs.ZapError(err))
1769+
log.Error("[leader-ready] failed to load persistOptions from etcd", errs.ZapError(err), zap.Duration("cost", time.Since(loadTTLStart)))
17591770
return
17601771
}
1772+
loadTTLDuration := time.Since(loadTTLStart)
1773+
log.Info("[leader-ready] load persist options from etcd completed", zap.Duration("cost", loadTTLDuration))
17611774

1775+
encryptionStart := time.Now()
17621776
if err := s.encryptionKeyManager.SetLeadership(s.member.GetLeadership()); err != nil {
1763-
log.Error("failed to initialize encryption", errs.ZapError(err))
1777+
log.Error("[leader-ready] failed to initialize encryption", errs.ZapError(err), zap.Duration("cost", time.Since(encryptionStart)))
17641778
return
17651779
}
1780+
encryptionDuration := time.Since(encryptionStart)
1781+
log.Info("[leader-ready] initialize encryption completed", zap.Duration("cost", encryptionDuration))
17661782

1767-
log.Info("triggering the leader callback functions")
1783+
callbacksStart := time.Now()
1784+
log.Info("[leader-ready] triggering the leader callback functions")
17681785
for _, cb := range s.leaderCallbacks {
17691786
if err := cb(ctx); err != nil {
1770-
log.Error("failed to execute leader callback function", errs.ZapError(err))
1787+
log.Error("[leader-ready] failed to execute leader callback function", errs.ZapError(err), zap.Duration("cost", time.Since(callbacksStart)))
17711788
return
17721789
}
17731790
}
1791+
callbacksDuration := time.Since(callbacksStart)
1792+
log.Info("[leader-ready] trigger leader callback functions completed", zap.Duration("cost", callbacksDuration))
17741793

17751794
// Try to create raft cluster.
1795+
createRaftClusterStart := time.Now()
17761796
if err := s.createRaftCluster(); err != nil {
1777-
log.Error("failed to create raft cluster", errs.ZapError(err))
1797+
log.Error("[leader-ready] failed to create raft cluster", errs.ZapError(err), zap.Duration("cost", time.Since(createRaftClusterStart)))
17781798
return
17791799
}
1800+
createRaftClusterDuration := time.Since(createRaftClusterStart)
1801+
log.Info("[leader-ready] create raft cluster completed", zap.Duration("cost", createRaftClusterDuration))
17801802
defer s.stopRaftCluster()
17811803
failpoint.Inject("rebaseErr", func() {
17821804
failpoint.Return()
17831805
})
1806+
rebaseStart := time.Now()
17841807
if err := s.idAllocator.Rebase(); err != nil {
1785-
log.Error("failed to sync id from etcd", errs.ZapError(err))
1808+
log.Error("[leader-ready] failed to sync id from etcd", errs.ZapError(err), zap.Duration("cost", time.Since(rebaseStart)))
17861809
return
17871810
}
1811+
rebaseDuration := time.Since(rebaseStart)
1812+
log.Info("[leader-ready] sync id from etcd completed", zap.Duration("cost", rebaseDuration))
17881813
// EnableLeader to accept the remaining service, such as GetStore, GetRegion.
1814+
log.Info("[leader-ready] start to enable leader")
1815+
enableLeaderStart := time.Now()
17891816
s.member.EnableLeader()
1817+
enableLeaderDuration := time.Since(enableLeaderStart)
17901818
member.ServiceMemberGauge.WithLabelValues(s.mode).Set(1)
1819+
log.Info("[leader-ready] enable leader completed", zap.Duration("cost", enableLeaderDuration))
17911820
if !s.IsAPIServiceMode() {
17921821
// Check the cluster dc-location after the PD leader is elected.
17931822
go s.tsoAllocatorManager.ClusterDCLocationChecker()
@@ -1800,8 +1829,13 @@ func (s *Server) campaignLeader() {
18001829
member.ServiceMemberGauge.WithLabelValues(s.mode).Set(0)
18011830
})
18021831

1832+
log.Info("[leader-ready] start to check PD version with cluster version")
1833+
versionCheckStart := time.Now()
18031834
CheckPDVersionWithClusterVersion(s.persistOptions)
1804-
log.Info(fmt.Sprintf("%s leader is ready to serve", s.mode), zap.String("leader-name", s.Name()))
1835+
versionCheckDuration := time.Since(versionCheckStart)
1836+
log.Info("[leader-ready] check PD version with cluster version completed", zap.Duration("cost", versionCheckDuration))
1837+
totalDuration := time.Since(leaderReadyStart)
1838+
log.Info(fmt.Sprintf("[leader-ready] %s leader is ready to serve", s.mode), zap.String("leader-name", s.Name()), zap.Duration("total-cost", totalDuration))
18051839

18061840
leaderTicker := time.NewTicker(constant.LeaderTickInterval)
18071841
defer leaderTicker.Stop()

0 commit comments

Comments (0)