Skip to content

Commit 35a7c9e

Browse files
authored
statistics: add refresher (#50845)
ref #50132
1 parent 8491da1 commit 35a7c9e

File tree

7 files changed

+453
-16
lines changed

7 files changed

+453
-16
lines changed

pkg/statistics/handle/autoanalyze/priorityqueue/BUILD.bazel

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,10 @@ go_library(
1313
"//pkg/sessionctx",
1414
"//pkg/sessionctx/variable",
1515
"//pkg/statistics/handle/autoanalyze/exec",
16+
"//pkg/statistics/handle/logutil",
1617
"//pkg/statistics/handle/types",
1718
"//pkg/statistics/handle/util",
19+
"@org_uber_go_zap//:zap",
1820
],
1921
)
2022

@@ -29,7 +31,7 @@ go_test(
2931
],
3032
embed = [":priorityqueue"],
3133
flaky = True,
32-
shard_count = 10,
34+
shard_count = 13,
3335
deps = [
3436
"//pkg/parser/model",
3537
"//pkg/session",

pkg/statistics/handle/autoanalyze/priorityqueue/interval.go

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,12 @@ import (
2121
"github.com/pingcap/tidb/pkg/statistics/handle/util"
2222
)
2323

24+
// noRecord is used to indicate that there is no related record in mysql.analyze_jobs.
25+
const noRecord = -1
26+
27+
// justFailed is used to indicate that the last analysis has just failed.
28+
const justFailed = 0
29+
2430
const avgDurationQueryForTable = `
2531
SELECT AVG(TIMESTAMPDIFF(SECOND, start_time, end_time)) AS avg_duration
2632
FROM (
@@ -57,14 +63,21 @@ const lastFailedDurationQueryForTable = `
5763
// We pick the minimum duration of all failed analyses because we want to be conservative.
5864
const lastFailedDurationQueryForPartition = `
5965
SELECT
60-
MIN(TIMESTAMPDIFF(SECOND, start_time, CURRENT_TIMESTAMP)) AS min_duration
61-
FROM
62-
mysql.analyze_jobs
63-
WHERE
64-
table_schema = %? AND
65-
table_name = %? AND
66-
state = 'failed' AND
67-
partition_name IN (%?);
66+
MIN(TIMESTAMPDIFF(SECOND, aj.start_time, CURRENT_TIMESTAMP)) AS min_duration
67+
FROM (
68+
SELECT
69+
MAX(id) AS max_id
70+
FROM
71+
mysql.analyze_jobs
72+
WHERE
73+
table_schema = %?
74+
AND table_name = %?
75+
AND state = 'failed'
76+
AND partition_name IN (%?)
77+
GROUP BY
78+
partition_name
79+
) AS latest_failures
80+
JOIN mysql.analyze_jobs aj ON aj.id = latest_failures.max_id;
6881
`
6982

7083
// getAverageAnalysisDuration returns the average duration of the last 5 successful analyses for each specified partition.
@@ -86,17 +99,17 @@ func getAverageAnalysisDuration(
8699

87100
rows, _, err := util.ExecRows(sctx, query, params...)
88101
if err != nil {
89-
return 0, err
102+
return noRecord, err
90103
}
91104

92105
// NOTE: if there are no successful analyses, we return noRecord (-1).
93106
if len(rows) == 0 || rows[0].IsNull(0) {
94-
return 0, nil
107+
return noRecord, nil
95108
}
96109
avgDuration := rows[0].GetMyDecimal(0)
97110
duration, err := avgDuration.ToFloat64()
98111
if err != nil {
99-
return 0, err
112+
return noRecord, err
100113
}
101114

102115
return time.Duration(duration) * time.Second, nil
@@ -121,14 +134,17 @@ func getLastFailedAnalysisDuration(
121134

122135
rows, _, err := util.ExecRows(sctx, query, params...)
123136
if err != nil {
124-
return 0, err
137+
return noRecord, err
125138
}
126139

127140
// NOTE: if there are no failed analyses, we return noRecord (-1).
128141
if len(rows) == 0 || rows[0].IsNull(0) {
129-
return 0, nil
142+
return noRecord, nil
130143
}
131144
lastFailedDuration := rows[0].GetUint64(0)
145+
if lastFailedDuration == 0 {
146+
return justFailed, nil
147+
}
132148

133149
return time.Duration(lastFailedDuration) * time.Second, nil
134150
}

pkg/statistics/handle/autoanalyze/priorityqueue/interval_test.go

Lines changed: 67 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ func TestGetAverageAnalysisDuration(t *testing.T) {
3737
"example_schema", "example_table", "example_partition",
3838
)
3939
require.NoError(t, err)
40-
require.Equal(t, time.Duration(0), avgDuration)
40+
require.Equal(t, time.Duration(noRecord), avgDuration)
4141

4242
initJobs(tk)
4343

@@ -102,7 +102,7 @@ func TestGetLastFailedAnalysisDuration(t *testing.T) {
102102
"example_schema", "example_table", "example_partition",
103103
)
104104
require.NoError(t, err)
105-
require.Equal(t, time.Duration(0), lastFailedDuration)
105+
require.Equal(t, time.Duration(noRecord), lastFailedDuration)
106106
initJobs(tk)
107107

108108
// Partitioned table.
@@ -320,3 +320,68 @@ func insertFailedJob(
320320
)
321321
}
322322
}
323+
324+
func insertFailedJobWithStartTime(
325+
tk *testkit.TestKit,
326+
dbName string,
327+
tableName string,
328+
partitionName string,
329+
startTime string,
330+
) {
331+
if partitionName == "" {
332+
tk.MustExec(`
333+
INSERT INTO mysql.analyze_jobs (
334+
table_schema,
335+
table_name,
336+
job_info,
337+
start_time,
338+
end_time,
339+
state,
340+
fail_reason,
341+
instance
342+
) VALUES (
343+
?,
344+
?,
345+
'Job information for failed job',
346+
?,
347+
'2024-01-01 10:00:00',
348+
'failed',
349+
'Some reason for failure',
350+
'example_instance'
351+
);
352+
`,
353+
dbName,
354+
tableName,
355+
startTime,
356+
)
357+
} else {
358+
tk.MustExec(`
359+
INSERT INTO mysql.analyze_jobs (
360+
table_schema,
361+
table_name,
362+
partition_name,
363+
job_info,
364+
start_time,
365+
end_time,
366+
state,
367+
fail_reason,
368+
instance
369+
) VALUES (
370+
?,
371+
?,
372+
?,
373+
'Job information for failed job',
374+
?,
375+
'2024-01-01 10:00:00',
376+
'failed',
377+
'Some reason for failure',
378+
'example_instance'
379+
);
380+
`,
381+
dbName,
382+
tableName,
383+
partitionName,
384+
startTime,
385+
)
386+
}
387+
}

pkg/statistics/handle/autoanalyze/priorityqueue/job.go

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,25 @@
1515
package priorityqueue
1616

1717
import (
18+
"fmt"
1819
"strings"
20+
"time"
1921

2022
"github.com/pingcap/tidb/pkg/sessionctx"
2123
"github.com/pingcap/tidb/pkg/sessionctx/variable"
2224
"github.com/pingcap/tidb/pkg/statistics/handle/autoanalyze/exec"
25+
statslogutil "github.com/pingcap/tidb/pkg/statistics/handle/logutil"
2326
statstypes "github.com/pingcap/tidb/pkg/statistics/handle/types"
2427
statsutil "github.com/pingcap/tidb/pkg/statistics/handle/util"
28+
"go.uber.org/zap"
2529
)
2630

31+
// defaultFailedAnalysisWaitTime is the default wait time for the next analysis after a failed analysis.
32+
// NOTE: this is only used when the average analysis duration is not available.(No successful analysis before)
33+
const defaultFailedAnalysisWaitTime = 30 * time.Minute
34+
2735
// TableAnalysisJob defines the structure for table analysis job information.
36+
// TODO: add stringer for TableAnalysisJob.
2837
type TableAnalysisJob struct {
2938
// Only set when partitions's indexes need to be analyzed.
3039
PartitionIndexes map[string][]string
@@ -40,6 +49,128 @@ type TableAnalysisJob struct {
4049
Weight float64
4150
}
4251

52+
// IsValidToAnalyze checks whether the table is valid to analyze.
53+
// It checks the last failed analysis duration and the average analysis duration.
54+
// If the last failed analysis duration is less than 2 times the average analysis duration,
55+
// we skip this table to avoid too much failed analysis.
56+
func (j *TableAnalysisJob) IsValidToAnalyze(
57+
sctx sessionctx.Context,
58+
) (bool, string) {
59+
// No need to analyze this table.
60+
// TODO: Usually, we should not put this kind of table into the queue.
61+
if j.Weight == 0 {
62+
return false, "weight is 0"
63+
}
64+
65+
// Check whether the table or partition is valid to analyze.
66+
if len(j.Partitions) > 0 || len(j.PartitionIndexes) > 0 {
67+
// Any partition is invalid to analyze, the whole table is invalid to analyze.
68+
// Because we need to analyze partitions in batch mode.
69+
partitions := append(j.Partitions, getPartitionNames(j.PartitionIndexes)...)
70+
if valid, failReason := isValidToAnalyze(
71+
sctx,
72+
j.TableSchema,
73+
j.TableName,
74+
partitions...,
75+
); !valid {
76+
return false, failReason
77+
}
78+
} else {
79+
if valid, failReason := isValidToAnalyze(
80+
sctx,
81+
j.TableSchema,
82+
j.TableName,
83+
); !valid {
84+
return false, failReason
85+
}
86+
}
87+
88+
return true, ""
89+
}
90+
91+
func getPartitionNames(partitionIndexes map[string][]string) []string {
92+
names := make([]string, 0, len(partitionIndexes))
93+
for _, partitionNames := range partitionIndexes {
94+
names = append(names, partitionNames...)
95+
}
96+
return names
97+
}
98+
99+
func isValidToAnalyze(
100+
sctx sessionctx.Context,
101+
schema, table string,
102+
partitionNames ...string,
103+
) (bool, string) {
104+
lastFailedAnalysisDuration, err :=
105+
getLastFailedAnalysisDuration(sctx, schema, table, partitionNames...)
106+
if err != nil {
107+
statslogutil.StatsLogger().Warn(
108+
"Fail to get last failed analysis duration",
109+
zap.String("schema", schema),
110+
zap.String("table", table),
111+
zap.Strings("partitions", partitionNames),
112+
zap.Error(err),
113+
)
114+
return false, fmt.Sprintf("fail to get last failed analysis duration: %v", err)
115+
}
116+
117+
averageAnalysisDuration, err :=
118+
getAverageAnalysisDuration(sctx, schema, table, partitionNames...)
119+
if err != nil {
120+
statslogutil.StatsLogger().Warn(
121+
"Fail to get average analysis duration",
122+
zap.String("schema", schema),
123+
zap.String("table", table),
124+
zap.Strings("partitions", partitionNames),
125+
zap.Error(err),
126+
)
127+
return false, fmt.Sprintf("fail to get average analysis duration: %v", err)
128+
}
129+
130+
// Last analysis just failed, we should not analyze it again.
131+
if lastFailedAnalysisDuration == justFailed {
132+
// justFailed means the failed job's start_time is within the last second, so retrying now would almost certainly fail again.
133+
statslogutil.StatsLogger().Info(
134+
"Skip analysis because the last analysis just failed",
135+
zap.String("schema", schema),
136+
zap.String("table", table),
137+
zap.Strings("partitions", partitionNames),
138+
)
139+
return false, "last analysis just failed"
140+
}
141+
142+
// Only failed analyses exist (no successful history to average over), so fall back to a fixed default wait time before retrying.
143+
// Skip this table to avoid too much failed analysis.
144+
onlyFailedAnalysis := lastFailedAnalysisDuration != noRecord && averageAnalysisDuration == noRecord
145+
if onlyFailedAnalysis && lastFailedAnalysisDuration < defaultFailedAnalysisWaitTime {
146+
statslogutil.StatsLogger().Info(
147+
fmt.Sprintf("Skip analysis because the last failed analysis duration is less than %v", defaultFailedAnalysisWaitTime),
148+
zap.String("schema", schema),
149+
zap.String("table", table),
150+
zap.Strings("partitions", partitionNames),
151+
zap.Duration("lastFailedAnalysisDuration", lastFailedAnalysisDuration),
152+
zap.Duration("averageAnalysisDuration", averageAnalysisDuration),
153+
)
154+
return false, fmt.Sprintf("last failed analysis duration is less than %v", defaultFailedAnalysisWaitTime)
155+
}
156+
// Failed analysis duration is less than 2 times the average analysis duration.
157+
meetSkipCondition := lastFailedAnalysisDuration != noRecord &&
158+
lastFailedAnalysisDuration < 2*averageAnalysisDuration
159+
if meetSkipCondition {
160+
statslogutil.StatsLogger().Info(
161+
"Skip analysis because the last failed analysis duration is less than 2 times the average analysis duration",
162+
zap.String("schema", schema),
163+
zap.String("table", table),
164+
zap.Strings("partitions", partitionNames),
165+
zap.Duration("lastFailedAnalysisDuration", lastFailedAnalysisDuration),
166+
zap.Duration("averageAnalysisDuration", averageAnalysisDuration),
167+
)
168+
return false, "last failed analysis duration is less than 2 times the average analysis duration"
169+
}
170+
171+
return true, ""
172+
}
173+
43174
// Execute executes the analyze statement.
44175
func (j *TableAnalysisJob) Execute(
45176
statsHandle statstypes.StatsHandle,

0 commit comments

Comments
 (0)