Skip to content

Commit 6977c00

Browse files
ystaticy and bufferflies
authored and committed
PD api: add metrics and log (#455)
* add metrics and log Signed-off-by: ystaticy <y_static_y@sina.com> * add metrics and log Signed-off-by: ystaticy <y_static_y@sina.com> * add metrics and log Signed-off-by: ystaticy <y_static_y@sina.com> * add leader ready log Signed-off-by: ystaticy <y_static_y@sina.com> --------- Signed-off-by: ystaticy <y_static_y@sina.com> (cherry picked from commit a7c733b)
1 parent 0a0dcfd commit 6977c00

File tree

5 files changed

+135
-17
lines changed

5 files changed

+135
-17
lines changed

pkg/schedule/labeler/labeler.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,11 @@ type RegionLabeler struct {
4444

4545
// NewRegionLabeler creates a Labeler instance.
4646
func NewRegionLabeler(ctx context.Context, storage endpoint.RuleStorage, gcInterval time.Duration) (*RegionLabeler, error) {
47+
start := time.Now()
48+
defer func() {
49+
newRegionLabelerDuration.Observe(time.Since(start).Seconds())
50+
}()
51+
4752
l := &RegionLabeler{
4853
storage: storage,
4954
labelRules: make(map[string]*LabelRule),
@@ -54,6 +59,7 @@ func NewRegionLabeler(ctx context.Context, storage endpoint.RuleStorage, gcInter
5459
if err := l.loadRules(); err != nil {
5560
return nil, err
5661
}
62+
log.Info("new region labeler created", zap.Int("label-rules-count", len(l.labelRules)))
5763
go l.doGC(gcInterval)
5864
return l, nil
5965
}

pkg/schedule/labeler/metrics.go

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,27 @@ package labeler
1616

1717
import "github.com/prometheus/client_golang/prometheus"
1818

19-
// LabelerEventCounter is a counter of the scheduler labeler system.
20-
var LabelerEventCounter = prometheus.NewCounterVec(
21-
prometheus.CounterOpts{
22-
Namespace: "pd",
23-
Subsystem: "schedule",
24-
Name: "labeler_event_counter",
25-
Help: "Counter of the scheduler label.",
26-
}, []string{"type", "event"})
19+
var (
20+
// LabelerEventCounter is a counter of the scheduler labeler system.
21+
LabelerEventCounter = prometheus.NewCounterVec(
22+
prometheus.CounterOpts{
23+
Namespace: "pd",
24+
Subsystem: "schedule",
25+
Name: "labeler_event_counter",
26+
Help: "Counter of the scheduler label.",
27+
}, []string{"type", "event"})
28+
29+
newRegionLabelerDuration = prometheus.NewHistogram(
30+
prometheus.HistogramOpts{
31+
Namespace: "pd",
32+
Subsystem: "schedule",
33+
Name: "new_region_labeler_duration_seconds",
34+
Help: "Bucketed histogram of processing time (s) of creating new region labeler.",
35+
Buckets: prometheus.ExponentialBuckets(0.001, 2, 13), // 1ms ~ 4s
36+
})
37+
)
2738

2839
func init() {
2940
prometheus.MustRegister(LabelerEventCounter)
41+
prometheus.MustRegister(newRegionLabelerDuration)
3042
}

server/cluster/cluster.go

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,11 @@ func (c *RaftCluster) InitCluster(
327327

328328
// Start starts a cluster.
329329
func (c *RaftCluster) Start(s Server, bootstrap bool) (err error) {
330+
start := time.Now()
331+
defer func() {
332+
raftClusterStartDuration.Observe(time.Since(start).Seconds())
333+
}()
334+
330335
c.Lock()
331336
defer c.Unlock()
332337

@@ -335,15 +340,24 @@ func (c *RaftCluster) Start(s Server, bootstrap bool) (err error) {
335340
return nil
336341
}
337342
c.isKeyspaceGroupEnabled = s.IsKeyspaceGroupEnabled()
343+
log.Info("[leader-ready] start to init cluster")
344+
initClusterStart := time.Now()
338345
err = c.InitCluster(s.GetAllocator(), s.GetPersistOptions(), s.GetHBStreams(), s.GetKeyspaceGroupManager())
339346
if err != nil {
347+
log.Error("[leader-ready] failed to init cluster", errs.ZapError(err), zap.Duration("cost", time.Since(initClusterStart)))
340348
return err
341349
}
350+
initClusterDuration := time.Since(initClusterStart)
351+
log.Info("[leader-ready] init cluster completed", zap.Duration("cost", initClusterDuration))
342352
// We should not manage tso service when bootstrap try to start raft cluster.
343353
// It only is controlled by leader election.
344354
// Ref: https://github.com/tikv/pd/issues/8836
345355
if !bootstrap {
356+
log.Info("[leader-ready] start to check TSO service")
357+
checkTSOStart := time.Now()
346358
c.checkTSOService()
359+
checkTSODuration := time.Since(checkTSOStart)
360+
log.Info("[leader-ready] check TSO service completed", zap.Duration("cost", checkTSODuration))
347361
}
348362
defer func() {
349363
if !bootstrap && err != nil {
@@ -358,12 +372,16 @@ func (c *RaftCluster) Start(s Server, bootstrap bool) (err error) {
358372
}
359373
failpoint.Return(err)
360374
})
375+
log.Info("[leader-ready] start to load cluster info")
376+
loadClusterInfoStart := time.Now()
361377
cluster, err := c.LoadClusterInfo()
362378
if err != nil {
379+
log.Error("[leader-ready] failed to load cluster info", errs.ZapError(err), zap.Duration("cost", time.Since(loadClusterInfoStart)))
363380
return err
364381
}
382+
loadClusterInfoDuration := time.Since(loadClusterInfoStart)
365383
if cluster == nil {
366-
log.Warn("cluster is not bootstrapped")
384+
log.Warn("[leader-ready] cluster is not bootstrapped", zap.Duration("cost", loadClusterInfoDuration))
367385
return nil
368386
}
369387
if c.opt.IsPlacementRulesEnabled() {
@@ -372,11 +390,17 @@ func (c *RaftCluster) Start(s Server, bootstrap bool) (err error) {
372390
return err
373391
}
374392
}
393+
log.Info("[leader-ready] load cluster info completed", zap.Duration("cost", loadClusterInfoDuration))
375394

395+
log.Info("[leader-ready] creating region labeler")
396+
labelerStart := time.Now()
376397
c.regionLabeler, err = labeler.NewRegionLabeler(c.ctx, c.storage, regionLabelGCInterval)
398+
labelerDuration := time.Since(labelerStart)
377399
if err != nil {
400+
log.Error("[leader-ready] region labeler creation failed", zap.Error(err), zap.Duration("cost", labelerDuration))
378401
return err
379402
}
403+
log.Info("[leader-ready] region labeler created", zap.Duration("cost", labelerDuration))
380404

381405
// create affinity manager with region labeler for key range validation and rebuild
382406
c.affinityManager, err = affinity.NewManager(c.ctx, c.storage, c, c.GetOpts(), c.regionLabeler)
@@ -385,27 +409,53 @@ func (c *RaftCluster) Start(s Server, bootstrap bool) (err error) {
385409
}
386410

387411
if !c.IsServiceIndependent(constant.SchedulingServiceName) {
412+
log.Info("[leader-ready] start to observe slow store status")
413+
observeSlowStoreStart := time.Now()
388414
for _, store := range c.GetStores() {
389415
storeID := store.GetID()
390416
c.slowStat.ObserveSlowStoreStatus(storeID, store.IsSlow())
391417
}
418+
observeSlowStoreDuration := time.Since(observeSlowStoreStart)
419+
log.Info("[leader-ready] observe slow store status completed", zap.Duration("cost", observeSlowStoreDuration))
392420
}
421+
log.Info("[leader-ready] start to create replication mode manager")
422+
replicationModeStart := time.Now()
393423
c.replicationMode, err = replication.NewReplicationModeManager(s.GetConfig().ReplicationMode, c.storage, cluster, s)
394424
if err != nil {
425+
log.Error("[leader-ready] failed to create replication mode manager", errs.ZapError(err), zap.Duration("cost", time.Since(replicationModeStart)))
395426
return err
396427
}
428+
replicationModeDuration := time.Since(replicationModeStart)
429+
log.Info("[leader-ready] create replication mode manager completed", zap.Duration("cost", replicationModeDuration))
430+
log.Info("[leader-ready] start to load external timestamp")
431+
loadExternalTSStart := time.Now()
397432
c.loadExternalTS()
433+
log.Info("[leader-ready] load external timestamp completed", zap.Duration("cost", time.Since(loadExternalTSStart)))
434+
log.Info("[leader-ready] start to load min resolved ts")
435+
loadMinResolvedTSStart := time.Now()
398436
c.loadMinResolvedTS()
437+
log.Info("[leader-ready] load min resolved ts completed", zap.Duration("cost", time.Since(loadMinResolvedTSStart)))
399438

400439
if c.isKeyspaceGroupEnabled {
401440
// bootstrap keyspace group manager after starting other parts successfully.
402441
// This order avoids a stuck goroutine in keyspaceGroupManager when it fails to create raftcluster.
442+
log.Info("[leader-ready] start to bootstrap keyspace group manager")
443+
bootstrapKeyspaceStart := time.Now()
403444
err = c.keyspaceGroupManager.Bootstrap(c.ctx)
404445
if err != nil {
446+
log.Error("[leader-ready] failed to bootstrap keyspace group manager", errs.ZapError(err), zap.Duration("cost", time.Since(bootstrapKeyspaceStart)))
405447
return err
406448
}
449+
bootstrapKeyspaceDuration := time.Since(bootstrapKeyspaceStart)
450+
log.Info("[leader-ready] bootstrap keyspace group manager completed", zap.Duration("cost", bootstrapKeyspaceDuration))
407451
}
452+
log.Info("[leader-ready] start to check scheduling service")
453+
checkSchedulingStart := time.Now()
408454
c.checkSchedulingService()
455+
checkSchedulingDuration := time.Since(checkSchedulingStart)
456+
log.Info("[leader-ready] check scheduling service completed", zap.Duration("cost", checkSchedulingDuration))
457+
log.Info("[leader-ready] start to start background jobs")
458+
backgroundJobsStart := time.Now()
409459
c.wg.Add(11)
410460
go c.runServiceCheckJob()
411461
go c.runMetricsCollectionJob()
@@ -418,12 +468,18 @@ func (c *RaftCluster) Start(s Server, bootstrap bool) (err error) {
418468
go c.startGCTuner()
419469
go c.startProgressGC()
420470
go c.runStorageSizeCollector(s.GetMeteringWriter(), c.regionLabeler, s.GetKeyspaceManager())
471+
backgroundJobsDuration := time.Since(backgroundJobsStart)
472+
log.Info("[leader-ready] start background jobs completed", zap.Duration("cost", backgroundJobsDuration))
421473

474+
log.Info("[leader-ready] start to start runners")
475+
runnersStart := time.Now()
422476
c.running = true
423477
c.heartbeatRunner.Start(c.ctx)
424478
c.miscRunner.Start(c.ctx)
425479
c.logRunner.Start(c.ctx)
426480
c.syncRegionRunner.Start(c.ctx)
481+
runnersDuration := time.Since(runnersStart)
482+
log.Info("[leader-ready] start runners completed", zap.Duration("cost", runnersDuration))
427483
return nil
428484
}
429485

server/cluster/metrics.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,15 @@ var (
7171
Name: "store_trigger_network_slow_evict",
7272
Help: "The count of store trigger network slow evict",
7373
}, []string{"store"})
74+
75+
raftClusterStartDuration = prometheus.NewHistogram(
76+
prometheus.HistogramOpts{
77+
Namespace: "pd",
78+
Subsystem: "cluster",
79+
Name: "raftcluster_start_duration_seconds",
80+
Help: "Bucketed histogram of processing time (s) of raft cluster start.",
81+
Buckets: prometheus.ExponentialBuckets(0.001, 2, 13), // 1ms ~ 4s
82+
})
7483
)
7584

7685
func init() {
@@ -81,4 +90,5 @@ func init() {
8190
prometheus.MustRegister(storeSyncConfigEvent)
8291
prometheus.MustRegister(updateStoreStatsGauge)
8392
prometheus.MustRegister(storeTriggerNetworkSlowEvict)
93+
prometheus.MustRegister(raftClusterStartDuration)
8494
}

server/server.go

Lines changed: 42 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1748,6 +1748,9 @@ func (s *Server) campaignLeader() {
17481748
}
17491749
return
17501750
}
1751+
// Start timing from when leader is successfully elected
1752+
leaderReadyStart := time.Now()
1753+
log.Info("[leader-ready] leader election succeeded, start leader ready process")
17511754

17521755
// Start keepalive the leadership and enable TSO service.
17531756
// TSO service is strictly enabled/disabled by PD leader lease for 2 reasons:
@@ -1761,48 +1764,74 @@ func (s *Server) campaignLeader() {
17611764
})
17621765

17631766
// maintain the PD leadership, after this, TSO can be service.
1767+
log.Info("[leader-ready] start to keep leader lease")
1768+
keepLeaderStart := time.Now()
17641769
s.member.GetLeadership().Keep(ctx)
1770+
keepLeaderDuration := time.Since(keepLeaderStart)
1771+
log.Info("[leader-ready] keep leader lease completed", zap.Duration("cost", keepLeaderDuration))
17651772
log.Info("campaign PD leader ok", zap.String("campaign-leader-name", s.Name()))
17661773

1774+
reloadConfigStart := time.Now()
17671775
if err := s.reloadConfigFromKV(); err != nil {
1768-
log.Error("failed to reload configuration", errs.ZapError(err))
1776+
log.Error("[leader-ready] failed to reload configuration", errs.ZapError(err), zap.Duration("cost", time.Since(reloadConfigStart)))
17691777
return
17701778
}
1779+
reloadConfigDuration := time.Since(reloadConfigStart)
1780+
log.Info("[leader-ready] reload config from KV completed", zap.Duration("cost", reloadConfigDuration))
17711781

1782+
loadTTLStart := time.Now()
17721783
if err := s.persistOptions.LoadTTLFromEtcd(s.ctx, s.client); err != nil {
1773-
log.Error("failed to load persistOptions from etcd", errs.ZapError(err))
1784+
log.Error("[leader-ready] failed to load persistOptions from etcd", errs.ZapError(err), zap.Duration("cost", time.Since(loadTTLStart)))
17741785
return
17751786
}
1787+
loadTTLDuration := time.Since(loadTTLStart)
1788+
log.Info("[leader-ready] load persist options from etcd completed", zap.Duration("cost", loadTTLDuration))
17761789

1790+
encryptionStart := time.Now()
17771791
if err := s.encryptionKeyManager.SetLeadership(s.member.GetLeadership()); err != nil {
1778-
log.Error("failed to initialize encryption", errs.ZapError(err))
1792+
log.Error("[leader-ready] failed to initialize encryption", errs.ZapError(err), zap.Duration("cost", time.Since(encryptionStart)))
17791793
return
17801794
}
1795+
encryptionDuration := time.Since(encryptionStart)
1796+
log.Info("[leader-ready] initialize encryption completed", zap.Duration("cost", encryptionDuration))
17811797

1782-
log.Info("triggering the leader callback functions")
1798+
callbacksStart := time.Now()
1799+
log.Info("[leader-ready] triggering the leader callback functions")
17831800
for _, cb := range s.leaderCallbacks {
17841801
if err := cb(ctx); err != nil {
1785-
log.Error("failed to execute leader callback function", errs.ZapError(err))
1802+
log.Error("[leader-ready] failed to execute leader callback function", errs.ZapError(err), zap.Duration("cost", time.Since(callbacksStart)))
17861803
return
17871804
}
17881805
}
1806+
callbacksDuration := time.Since(callbacksStart)
1807+
log.Info("[leader-ready] trigger leader callback functions completed", zap.Duration("cost", callbacksDuration))
17891808

17901809
// Try to create raft cluster.
1810+
createRaftClusterStart := time.Now()
17911811
if err := s.createRaftCluster(); err != nil {
1792-
log.Error("failed to create raft cluster", errs.ZapError(err))
1812+
log.Error("[leader-ready] failed to create raft cluster", errs.ZapError(err), zap.Duration("cost", time.Since(createRaftClusterStart)))
17931813
return
17941814
}
1815+
createRaftClusterDuration := time.Since(createRaftClusterStart)
1816+
log.Info("[leader-ready] create raft cluster completed", zap.Duration("cost", createRaftClusterDuration))
17951817
defer s.stopRaftCluster()
17961818
failpoint.Inject("rebaseErr", func() {
17971819
failpoint.Return()
17981820
})
1821+
rebaseStart := time.Now()
17991822
if err := s.idAllocator.Rebase(); err != nil {
1800-
log.Error("failed to sync id from etcd", errs.ZapError(err))
1823+
log.Error("[leader-ready] failed to sync id from etcd", errs.ZapError(err), zap.Duration("cost", time.Since(rebaseStart)))
18011824
return
18021825
}
1826+
rebaseDuration := time.Since(rebaseStart)
1827+
log.Info("[leader-ready] sync id from etcd completed", zap.Duration("cost", rebaseDuration))
18031828
// PromoteSelf to accept the remaining service, such as GetStore, GetRegion.
1829+
log.Info("[leader-ready] start to promote leader")
1830+
enableLeaderStart := time.Now()
18041831
s.member.PromoteSelf()
1832+
enableLeaderDuration := time.Since(enableLeaderStart)
18051833
member.ServiceMemberGauge.WithLabelValues(PD).Set(1)
1834+
log.Info("[leader-ready] promote leader completed", zap.Duration("cost", enableLeaderDuration))
18061835
defer resetLeaderOnce.Do(func() {
18071836
// as soon as cancel the leadership keepalive, then other member have chance
18081837
// to be new leader.
@@ -1811,8 +1840,13 @@ func (s *Server) campaignLeader() {
18111840
member.ServiceMemberGauge.WithLabelValues(PD).Set(0)
18121841
})
18131842

1843+
log.Info("[leader-ready] start to check PD version with cluster version")
1844+
versionCheckStart := time.Now()
18141845
CheckPDVersionWithClusterVersion(s.persistOptions)
1815-
log.Info("PD leader is ready to serve", zap.String("leader-name", s.Name()))
1846+
versionCheckDuration := time.Since(versionCheckStart)
1847+
log.Info("[leader-ready] check PD version with cluster version completed", zap.Duration("cost", versionCheckDuration))
1848+
totalDuration := time.Since(leaderReadyStart)
1849+
log.Info("[leader-ready] PD leader is ready to serve", zap.String("leader-name", s.Name()), zap.Duration("total-cost", totalDuration))
18161850

18171851
leaderTicker := time.NewTicker(mcs.LeaderTickInterval)
18181852
defer leaderTicker.Stop()

0 commit comments

Comments
 (0)