Skip to content

Commit 471b3da

Browse files
authored
metrics: support showing the store limit in Grafana (#3583)
* add store limit metrics
  Signed-off-by: Ryan Leung <rleungx@gmail.com>
* add panel
  Signed-off-by: Ryan Leung <rleungx@gmail.com>
* remove the old metrics
  Signed-off-by: Ryan Leung <rleungx@gmail.com>
* add a comment
  Signed-off-by: Ryan Leung <rleungx@gmail.com>
1 parent a44c9cc commit 471b3da

File tree

6 files changed

+158
-65
lines changed

6 files changed

+158
-65
lines changed

metrics/grafana/pd.json

Lines changed: 138 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,8 @@
6363
"editable": true,
6464
"gnetId": null,
6565
"graphTooltip": 1,
66-
"id": 11,
67-
"iteration": 1608795246154,
66+
"id": 25,
67+
"iteration": 1618283470402,
6868
"links": [],
6969
"panels": [
7070
{
@@ -957,7 +957,7 @@
957957
"fontSize": "90%",
958958
"gridPos": {
959959
"h": 7,
960-
"w": 6,
960+
"w": 4,
961961
"x": 0,
962962
"y": 14
963963
},
@@ -1008,15 +1008,133 @@
10081008
"transform": "timeseries_aggregations",
10091009
"type": "table"
10101010
},
1011+
{
1012+
"columns": [],
1013+
"datasource": "${DS_TEST-CLUSTER}",
1014+
"fontSize": "100%",
1015+
"gridPos": {
1016+
"h": 7,
1017+
"w": 5,
1018+
"x": 4,
1019+
"y": 14
1020+
},
1021+
"id": 1433,
1022+
"links": [],
1023+
"pageSize": null,
1024+
"scroll": true,
1025+
"showHeader": true,
1026+
"sort": {
1027+
"col": 4,
1028+
"desc": true
1029+
},
1030+
"styles": [
1031+
{
1032+
"alias": "",
1033+
"colorMode": null,
1034+
"colors": [
1035+
"rgba(245, 54, 54, 0.9)",
1036+
"rgba(237, 129, 40, 0.89)",
1037+
"rgba(50, 172, 45, 0.97)"
1038+
],
1039+
"dateFormat": "YYYY-MM-DD HH:mm:ss",
1040+
"decimals": 2,
1041+
"link": false,
1042+
"mappingType": 1,
1043+
"pattern": "Time",
1044+
"thresholds": [],
1045+
"type": "hidden",
1046+
"unit": "short"
1047+
},
1048+
{
1049+
"alias": "",
1050+
"colorMode": null,
1051+
"colors": [
1052+
"rgba(245, 54, 54, 0.9)",
1053+
"rgba(237, 129, 40, 0.89)",
1054+
"rgba(50, 172, 45, 0.97)"
1055+
],
1056+
"dateFormat": "YYYY-MM-DD HH:mm:ss",
1057+
"decimals": 2,
1058+
"mappingType": 1,
1059+
"pattern": "__name__",
1060+
"thresholds": [],
1061+
"type": "hidden",
1062+
"unit": "short"
1063+
},
1064+
{
1065+
"alias": "",
1066+
"colorMode": null,
1067+
"colors": [
1068+
"rgba(245, 54, 54, 0.9)",
1069+
"rgba(237, 129, 40, 0.89)",
1070+
"rgba(50, 172, 45, 0.97)"
1071+
],
1072+
"dateFormat": "YYYY-MM-DD HH:mm:ss",
1073+
"decimals": 2,
1074+
"mappingType": 1,
1075+
"pattern": "instance",
1076+
"thresholds": [],
1077+
"type": "hidden",
1078+
"unit": "short"
1079+
},
1080+
{
1081+
"alias": "",
1082+
"colorMode": null,
1083+
"colors": [
1084+
"rgba(245, 54, 54, 0.9)",
1085+
"rgba(237, 129, 40, 0.89)",
1086+
"rgba(50, 172, 45, 0.97)"
1087+
],
1088+
"dateFormat": "YYYY-MM-DD HH:mm:ss",
1089+
"decimals": 2,
1090+
"mappingType": 1,
1091+
"pattern": "job",
1092+
"thresholds": [],
1093+
"type": "hidden",
1094+
"unit": "short"
1095+
},
1096+
{
1097+
"alias": "limit (opm)",
1098+
"colorMode": null,
1099+
"colors": [
1100+
"rgba(245, 54, 54, 0.9)",
1101+
"rgba(237, 129, 40, 0.89)",
1102+
"rgba(50, 172, 45, 0.97)"
1103+
],
1104+
"dateFormat": "YYYY-MM-DD HH:mm:ss",
1105+
"decimals": 0,
1106+
"mappingType": 1,
1107+
"pattern": "Value",
1108+
"thresholds": [],
1109+
"type": "number",
1110+
"unit": "short"
1111+
}
1112+
],
1113+
"targets": [
1114+
{
1115+
"expr": "pd_cluster_store_limit",
1116+
"format": "table",
1117+
"instant": true,
1118+
"intervalFactor": 1,
1119+
"legendFormat": "",
1120+
"refId": "A"
1121+
}
1122+
],
1123+
"timeFrom": null,
1124+
"timeShift": null,
1125+
"title": "Store limit",
1126+
"transform": "table",
1127+
"type": "table"
1128+
},
10111129
{
10121130
"cacheTimeout": null,
10131131
"columns": [],
10141132
"datasource": "${DS_TEST-CLUSTER}",
10151133
"fontSize": "100%",
10161134
"gridPos": {
10171135
"h": 3,
1018-
"w": 6,
1019-
"x": 6,
1136+
"w": 5,
1137+
"x": 9,
10201138
"y": 14
10211139
},
10221140
"hideTimeOverride": true,
@@ -1083,8 +1201,8 @@
10831201
"fontSize": "100%",
10841202
"gridPos": {
10851203
"h": 7,
1086-
"w": 6,
1087-
"x": 12,
1204+
"w": 5,
1205+
"x": 14,
10881206
"y": 14
10891207
},
10901208
"hideTimeOverride": true,
@@ -1160,8 +1278,8 @@
11601278
"fontSize": "100%",
11611279
"gridPos": {
11621280
"h": 7,
1163-
"w": 6,
1164-
"x": 18,
1281+
"w": 5,
1282+
"x": 19,
11651283
"y": 14
11661284
},
11671285
"hideTimeOverride": true,
@@ -1253,8 +1371,8 @@
12531371
},
12541372
"gridPos": {
12551373
"h": 2,
1256-
"w": 6,
1257-
"x": 6,
1374+
"w": 5,
1375+
"x": 9,
12581376
"y": 17
12591377
},
12601378
"hideTimeOverride": true,
@@ -1338,8 +1456,8 @@
13381456
},
13391457
"gridPos": {
13401458
"h": 2,
1341-
"w": 6,
1342-
"x": 6,
1459+
"w": 5,
1460+
"x": 9,
13431461
"y": 19
13441462
},
13451463
"hideTimeOverride": true,
@@ -9916,24 +10034,25 @@
991610034
{
991710035
"allValue": null,
991810036
"current": {
10037+
"isNone": true,
10038+
"text": "None",
10039+
"value": ""
991910040
},
992010041
"datasource": "${DS_TEST-CLUSTER}",
10042+
"definition": "",
992110043
"hide": 2,
992210044
"includeAll": false,
992310045
"label": "tidb_cluster",
992410046
"multi": false,
992510047
"name": "tidb_cluster",
9926-
"options": [
9927-
9928-
],
10048+
"options": [],
992910049
"query": "label_values(pd_cluster_status, tidb_cluster)",
993010050
"refresh": 2,
993110051
"regex": "",
10052+
"skipUrlSync": false,
993210053
"sort": 1,
993310054
"tagValuesQuery": "",
9934-
"tags": [
9935-
9936-
],
10055+
"tags": [],
993710056
"tagsQuery": "",
993810057
"type": "query",
993910058
"useTags": false

server/cluster/cluster.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"context"
1818
"fmt"
1919
"net/http"
20+
"strconv"
2021
"sync"
2122
"time"
2223

@@ -1257,7 +1258,6 @@ func (c *RaftCluster) collectMetrics() {
12571258

12581259
c.coordinator.collectSchedulerMetrics()
12591260
c.coordinator.collectHotSpotMetrics()
1260-
c.coordinator.opController.CollectStoreLimitMetrics()
12611261
c.collectClusterMetrics()
12621262
c.collectHealthStatus()
12631263
}
@@ -1680,6 +1680,9 @@ func (c *RaftCluster) RemoveStoreLimit(storeID uint64) {
16801680
for i := 0; i < persistLimitRetryTimes; i++ {
16811681
if err = c.opt.Persist(c.storage); err == nil {
16821682
log.Info("store limit removed", zap.Uint64("store-id", storeID))
1683+
id := strconv.FormatUint(storeID, 10)
1684+
statistics.StoreLimitGauge.DeleteLabelValues(id, "add-peer")
1685+
statistics.StoreLimitGauge.DeleteLabelValues(id, "remove-peer")
16831686
return
16841687
}
16851688
time.Sleep(persistLimitWaitTime)

server/schedule/metrics.go

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -50,22 +50,6 @@ var (
5050
Buckets: prometheus.ExponentialBuckets(0.01, 2, 16),
5151
}, []string{"type"})
5252

53-
storeLimitAvailableGauge = prometheus.NewGaugeVec(
54-
prometheus.GaugeOpts{
55-
Namespace: "pd",
56-
Subsystem: "schedule",
57-
Name: "store_limit_available",
58-
Help: "available limit rate of store.",
59-
}, []string{"store", "limit_type"})
60-
61-
storeLimitRateGauge = prometheus.NewGaugeVec(
62-
prometheus.GaugeOpts{
63-
Namespace: "pd",
64-
Subsystem: "schedule",
65-
Name: "store_limit_rate",
66-
Help: "the limit rate of store.",
67-
}, []string{"store", "limit_type"})
68-
6953
storeLimitCostCounter = prometheus.NewCounterVec(
7054
prometheus.CounterOpts{
7155
Namespace: "pd",
@@ -79,8 +63,6 @@ func init() {
7963
prometheus.MustRegister(operatorCounter)
8064
prometheus.MustRegister(operatorDuration)
8165
prometheus.MustRegister(operatorWaitDuration)
82-
prometheus.MustRegister(storeLimitAvailableGauge)
83-
prometheus.MustRegister(storeLimitRateGauge)
8466
prometheus.MustRegister(storeLimitCostCounter)
8567
prometheus.MustRegister(operatorWaitCounter)
8668
}

server/schedule/operator_controller.go

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -968,30 +968,3 @@ func (oc *OperatorController) GetLeaderSchedulePolicy() core.SchedulePolicy {
968968
}
969969
return oc.cluster.GetOpts().GetLeaderSchedulePolicy()
970970
}
971-
972-
// CollectStoreLimitMetrics collects the metrics about store limit
973-
func (oc *OperatorController) CollectStoreLimitMetrics() {
974-
oc.RLock()
975-
defer oc.RUnlock()
976-
if oc.storesLimit == nil {
977-
return
978-
}
979-
stores := oc.cluster.GetStores()
980-
for _, store := range stores {
981-
if store != nil {
982-
storeID := store.GetID()
983-
storeIDStr := strconv.FormatUint(storeID, 10)
984-
for n, v := range storelimit.TypeNameValue {
985-
var storeLimit *storelimit.StoreLimit
986-
if oc.storesLimit[storeID] == nil || oc.storesLimit[storeID][v] == nil {
987-
// Set to 0 to represent the store limit of the specific type is not initialized.
988-
storeLimitRateGauge.WithLabelValues(storeIDStr, n).Set(0)
989-
continue
990-
}
991-
storeLimit = oc.storesLimit[storeID][v]
992-
storeLimitAvailableGauge.WithLabelValues(storeIDStr, n).Set(float64(storeLimit.Available()) / float64(storelimit.RegionInfluence[v]))
993-
storeLimitRateGauge.WithLabelValues(storeIDStr, n).Set(storeLimit.Rate() * StoreBalanceBaseTime)
994-
}
995-
}
996-
}
997-
}

server/statistics/metrics.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,15 @@ var (
7272
Help: "Status of the scheduling configurations.",
7373
}, []string{"type"})
7474

75+
// StoreLimitGauge is used to record the current store limit.
76+
StoreLimitGauge = prometheus.NewGaugeVec(
77+
prometheus.GaugeOpts{
78+
Namespace: "pd",
79+
Subsystem: "cluster",
80+
Name: "store_limit",
81+
Help: "Status of the store limit.",
82+
}, []string{"store", "type"})
83+
7584
regionLabelLevelGauge = prometheus.NewGaugeVec(
7685
prometheus.GaugeOpts{
7786
Namespace: "pd",
@@ -152,6 +161,7 @@ func init() {
152161
prometheus.MustRegister(clusterStatusGauge)
153162
prometheus.MustRegister(placementStatusGauge)
154163
prometheus.MustRegister(configStatusGauge)
164+
prometheus.MustRegister(StoreLimitGauge)
155165
prometheus.MustRegister(regionLabelLevelGauge)
156166
prometheus.MustRegister(readByteHist)
157167
prometheus.MustRegister(readKeyHist)

server/statistics/store_collection.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,12 @@ func (s *storeStatistics) Collect() {
186186
for name, value := range s.LabelCounter {
187187
placementStatusGauge.WithLabelValues(labelType, name).Set(float64(value))
188188
}
189+
190+
for storeID, limit := range s.opt.GetScheduleConfig().StoreLimit {
191+
id := strconv.FormatUint(storeID, 10)
192+
StoreLimitGauge.WithLabelValues(id, "add-peer").Set(limit.AddPeer)
193+
StoreLimitGauge.WithLabelValues(id, "remove-peer").Set(limit.RemovePeer)
194+
}
189195
}
190196

191197
func (s *storeStatistics) resetStoreStatistics(storeAddress string, id string) {

0 commit comments

Comments
 (0)