Skip to content

Commit b5ffc4f

Browse files
mjonss and claude committed
statistics: simplify FMSketch by removing swiss map and pool
Go 1.24+ implements the built-in map with Swiss tables, so the dolthub/swiss dependency is no longer needed. Replace it with a native map[uint64]struct{} and use standard library functions (maps.Clone, maps.DeleteFunc).

Remove the sync.Pool for FMSketch, as its benefit is marginal; set the references to nil instead so the GC can reclaim them promptly.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent b104014 commit b5ffc4f

File tree

10 files changed

+42
-89
lines changed

10 files changed

+42
-89
lines changed

go.mod

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ require (
5151
github.com/dgraph-io/ristretto v0.1.1
5252
github.com/dgryski/go-farm v0.0.0-20240924180020-3414d57e47da
5353
github.com/docker/go-units v0.5.0
54-
github.com/dolthub/swiss v0.2.1
5554
github.com/emirpasic/gods v1.18.1
5655
github.com/fatih/color v1.18.0
5756
github.com/felixge/fgprof v0.9.3
@@ -247,7 +246,6 @@ require (
247246
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
248247
github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0 // indirect
249248
github.com/dennwc/varint v1.0.0 // indirect
250-
github.com/dolthub/maphash v0.1.0 // indirect
251249
github.com/dustin/go-humanize v1.0.1 // indirect
252250
github.com/fatih/structtag v1.2.0
253251
github.com/felixge/httpsnoop v1.0.4 // indirect

go.sum

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -305,10 +305,6 @@ github.com/dgryski/go-farm v0.0.0-20240924180020-3414d57e47da h1:aIftn67I1fkbMa5
305305
github.com/dgryski/go-farm v0.0.0-20240924180020-3414d57e47da/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw=
306306
github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4=
307307
github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
308-
github.com/dolthub/maphash v0.1.0 h1:bsQ7JsF4FkkWyrP3oCnFJgrCUAFbFf3kOl4L/QxPDyQ=
309-
github.com/dolthub/maphash v0.1.0/go.mod h1:gkg4Ch4CdCDu5h6PMriVLawB7koZ+5ijb9puGMV50a4=
310-
github.com/dolthub/swiss v0.2.1 h1:gs2osYs5SJkAaH5/ggVJqXQxRXtWshF6uE0lgR/Y3Gw=
311-
github.com/dolthub/swiss v0.2.1/go.mod h1:8AhKZZ1HK7g18j7v7k6c5cYIGEZJcPn0ARsai8cUrh0=
312308
github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
313309
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
314310
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=

pkg/statistics/analyze.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,9 +74,7 @@ type AnalyzeResult struct {
7474

7575
// DestroyAndPutToPool destroys the result and put it to the pool.
7676
func (a *AnalyzeResult) DestroyAndPutToPool() {
77-
for _, f := range a.Fms {
78-
f.DestroyAndPutToPool()
79-
}
77+
a.Fms = nil
8078
for _, h := range a.Hist {
8179
h.DestroyAndPutToPool()
8280
}

pkg/statistics/fmsketch.go

Lines changed: 29 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,9 @@ package statistics
1616

1717
import (
1818
"hash"
19+
"maps"
1920
"sync"
2021

21-
"github.com/dolthub/swiss"
2222
"github.com/pingcap/errors"
2323
"github.com/pingcap/tidb/pkg/sessionctx/stmtctx"
2424
"github.com/pingcap/tidb/pkg/types"
@@ -33,15 +33,6 @@ var murmur3Pool = sync.Pool{
3333
},
3434
}
3535

36-
var fmSketchPool = sync.Pool{
37-
New: func() any {
38-
return &FMSketch{
39-
hashset: swiss.NewMap[uint64, bool](uint32(128)),
40-
maxSize: 0,
41-
}
42-
},
43-
}
44-
4536
// MaxSketchSize is the maximum size of the hashset in the FM sketch.
4637
// TODO: add this attribute to PB and persist it instead of using a fixed number(executor.maxSketchSize)
4738
const MaxSketchSize = 10000
@@ -64,7 +55,7 @@ const MaxSketchSize = 10000
6455
// 2. https://algo.inria.fr/flajolet/Publications/FlMa85.pdf
6556
type FMSketch struct {
6657
// A set to store unique hashed values.
67-
hashset *swiss.Map[uint64, bool]
58+
hashset map[uint64]struct{}
6859
// A binary mask used to track the maximum number of trailing zeroes in the hashed values.
6960
// Also used to track the level of the sketch.
7061
// Every time the size of the hashset exceeds the maximum size, the mask will be moved to the next level.
@@ -76,24 +67,22 @@ type FMSketch struct {
7667

7768
// NewFMSketch returns a new FM sketch.
7869
func NewFMSketch(maxSize int) *FMSketch {
79-
result := fmSketchPool.Get().(*FMSketch)
80-
result.maxSize = maxSize
81-
return result
70+
return &FMSketch{
71+
hashset: make(map[uint64]struct{}),
72+
maxSize: maxSize,
73+
}
8274
}
8375

8476
// Copy makes a copy for current FMSketch.
8577
func (s *FMSketch) Copy() *FMSketch {
8678
if s == nil {
8779
return nil
8880
}
89-
result := NewFMSketch(s.maxSize)
90-
s.hashset.Iter(func(key uint64, value bool) bool {
91-
result.hashset.Put(key, value)
92-
return false
93-
})
94-
result.mask = s.mask
95-
result.maxSize = s.maxSize
96-
return result
81+
return &FMSketch{
82+
hashset: maps.Clone(s.hashset),
83+
mask: s.mask,
84+
maxSize: s.maxSize,
85+
}
9786
}
9887

9988
// NDV returns the estimated number of distinct values (NDV) in the sketch.
@@ -106,7 +95,7 @@ func (s *FMSketch) NDV() int64 {
10695
// This is achieved by hashing the input value and counting the number of trailing zeroes in the binary representation of the hash value.
10796
// So the count of distinct values with 'r' trailing zeroes is n / 2^r, where 'n' is the number of distinct values.
10897
// Therefore, the estimated count of distinct values is 2^r * count = n.
109-
return int64(s.mask+1) * int64(s.hashset.Count())
98+
return int64(s.mask+1) * int64(len(s.hashset))
11099
}
111100

112101
// insertHashValue inserts a hashed value into the sketch.
@@ -117,18 +106,15 @@ func (s *FMSketch) insertHashValue(hashVal uint64) {
117106
return
118107
}
119108
// Put the hashed value into the hashset.
120-
s.hashset.Put(hashVal, true)
109+
s.hashset[hashVal] = struct{}{}
121110
// We track the unique hashed values level by level to ensure a minimum count of distinct values at each level.
122111
// This way, the final estimation is less likely to be skewed by outliers.
123-
if s.hashset.Count() > s.maxSize {
112+
if len(s.hashset) > s.maxSize {
124113
// If the size of the hashset exceeds the maximum size, move the mask to the next level.
125114
s.mask = s.mask*2 + 1
126115
// Clean up the hashset by removing the hashed values with trailing zeroes less than the new mask.
127-
s.hashset.Iter(func(k uint64, _ bool) (stop bool) {
128-
if (k & s.mask) != 0 {
129-
s.hashset.Delete(k)
130-
}
131-
return false
116+
maps.DeleteFunc(s.hashset, func(k uint64, _ struct{}) bool {
117+
return (k & s.mask) != 0
132118
})
133119
}
134120
}
@@ -182,28 +168,23 @@ func (s *FMSketch) MergeFMSketch(rs *FMSketch) {
182168
}
183169
if s.mask < rs.mask {
184170
s.mask = rs.mask
185-
s.hashset.Iter(func(key uint64, _ bool) bool {
186-
if (key & s.mask) != 0 {
187-
s.hashset.Delete(key)
188-
}
189-
return false
171+
maps.DeleteFunc(s.hashset, func(k uint64, _ struct{}) bool {
172+
return (k & s.mask) != 0
190173
})
191174
}
192-
rs.hashset.Iter(func(key uint64, _ bool) bool {
175+
for key := range rs.hashset {
193176
s.insertHashValue(key)
194-
return false
195-
})
177+
}
196178
}
197179

198180
// FMSketchToProto converts FMSketch to its protobuf representation.
199181
func FMSketchToProto(s *FMSketch) *tipb.FMSketch {
200182
protoSketch := new(tipb.FMSketch)
201183
if s != nil {
202184
protoSketch.Mask = s.mask
203-
s.hashset.Iter(func(val uint64, _ bool) bool {
185+
for val := range s.hashset {
204186
protoSketch.Hashset = append(protoSketch.Hashset, val)
205-
return false
206-
})
187+
}
207188
}
208189
return protoSketch
209190
}
@@ -213,10 +194,12 @@ func FMSketchFromProto(protoSketch *tipb.FMSketch) *FMSketch {
213194
if protoSketch == nil {
214195
return nil
215196
}
216-
sketch := fmSketchPool.Get().(*FMSketch)
217-
sketch.mask = protoSketch.Mask
197+
sketch := &FMSketch{
198+
hashset: make(map[uint64]struct{}, len(protoSketch.Hashset)),
199+
mask: protoSketch.Mask,
200+
}
218201
for _, val := range protoSketch.Hashset {
219-
sketch.hashset.Put(val, true)
202+
sketch.hashset[val] = struct{}{}
220203
}
221204
return sketch
222205
}
@@ -249,22 +232,8 @@ func DecodeFMSketch(data []byte) (*FMSketch, error) {
249232
// MemoryUsage returns the total memory usage of a FMSketch.
250233
func (s *FMSketch) MemoryUsage() (sum int64) {
251234
// As for the variables mask(uint64) and maxSize(int) each will consume 8 bytes. This is the origin of the constant 16.
252-
// And for the variables hashset(map[uint64]bool), each element in map will consume 9 bytes(8[uint64] + 1[bool]).
253-
sum = int64(16 + 9*s.hashset.Count())
235+
// And for the variables hashset(map[uint64]struct{}), each element in map will consume 8 bytes(uint64 key).
236+
sum = int64(16 + 8*len(s.hashset))
254237
return
255238
}
256239

257-
func (s *FMSketch) reset() {
258-
s.hashset.Clear()
259-
s.mask = 0
260-
s.maxSize = 0
261-
}
262-
263-
// DestroyAndPutToPool resets the FMSketch and puts it to the pool.
264-
func (s *FMSketch) DestroyAndPutToPool() {
265-
if s == nil {
266-
return
267-
}
268-
s.reset()
269-
fmSketchPool.Put(s)
270-
}

pkg/statistics/fmsketch_test.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -70,9 +70,9 @@ func SubTestSketch() func(*testing.T) {
7070
sketch := NewFMSketch(maxSize)
7171
sketch.insertHashValue(1)
7272
sketch.insertHashValue(2)
73-
require.Equal(t, maxSize, sketch.hashset.Count())
73+
require.Equal(t, maxSize, len(sketch.hashset))
7474
sketch.insertHashValue(4)
75-
require.LessOrEqual(t, maxSize, sketch.hashset.Count())
75+
require.LessOrEqual(t, maxSize, len(sketch.hashset))
7676
}
7777
}
7878

@@ -87,11 +87,11 @@ func SubTestSketchProtoConversion() func(*testing.T) {
8787
p := FMSketchToProto(sampleSketch)
8888
f := FMSketchFromProto(p)
8989
require.Equal(t, f.mask, sampleSketch.mask)
90-
require.Equal(t, f.hashset.Count(), sampleSketch.hashset.Count())
91-
sampleSketch.hashset.Iter(func(key uint64, _ bool) bool {
92-
require.True(t, f.hashset.Has(key))
93-
return false
94-
})
90+
require.Equal(t, len(f.hashset), len(sampleSketch.hashset))
91+
for key := range sampleSketch.hashset {
92+
_, ok := f.hashset[key]
93+
require.True(t, ok)
94+
}
9595
}
9696
}
9797

pkg/statistics/handle/globalstats/global_stats.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -312,13 +312,13 @@ func blockingMergePartitionStats2GlobalStats(
312312
globalStats.Fms[i] = allFms[i][j]
313313
} else {
314314
globalStats.Fms[i].MergeFMSketch(allFms[i][j])
315-
allFms[i][j].DestroyAndPutToPool()
315+
allFms[i][j] = nil
316316
}
317317
}
318318

319319
// Update the global NDV.
320320
globalStatsNDV := min(globalStats.Fms[i].NDV(), globalStats.Count)
321-
globalStats.Fms[i].DestroyAndPutToPool()
321+
globalStats.Fms[i] = nil
322322

323323
// Merge CMSketch.
324324
globalStats.Cms[i] = allCms[i][0]

pkg/statistics/handle/globalstats/global_stats_async.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ func (a *AsyncMergePartitionStats2GlobalStats) cpuWorker(stmtCtx *stmtctx.Statem
278278
// Update the global NDV.
279279
globalStatsNDV := min(a.globalStats.Fms[i].NDV(), a.globalStats.Count)
280280
a.globalStatsNDV = append(a.globalStatsNDV, globalStatsNDV)
281-
a.globalStats.Fms[i].DestroyAndPutToPool()
281+
a.globalStats.Fms[i] = nil
282282
}
283283
}
284284
err = a.dealCMSketch()

pkg/statistics/handle/storage/json.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ func GenJSONTableFromStats(
122122
return true
123123
}
124124
jsonTbl.Columns[col.Info.Name.L] = proto
125-
col.FMSketch.DestroyAndPutToPool()
125+
col.FMSketch = nil
126126
hist.DestroyAndPutToPool()
127127
return false
128128
})

pkg/statistics/row_sampler.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -231,9 +231,7 @@ func (s *RowSampleBuilder) Collect() (RowSampleCollector, error) {
231231
}
232232

233233
func (s *baseCollector) destroyAndPutToPool() {
234-
for _, sketch := range s.FMSketches {
235-
sketch.DestroyAndPutToPool()
236-
}
234+
s.FMSketches = nil
237235
}
238236

239237
func (s *baseCollector) collectColumns(sc *stmtctx.StatementContext, cols []types.Datum, sizes []int64) error {

pkg/statistics/table.go

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -960,13 +960,7 @@ func (t *Table) IsOutdated() bool {
960960

961961
// ReleaseAndPutToPool releases data structures of Table and put itself back to pool.
962962
func (t *Table) ReleaseAndPutToPool() {
963-
for _, col := range t.columns {
964-
col.FMSketch.DestroyAndPutToPool()
965-
}
966963
clear(t.columns)
967-
for _, idx := range t.indices {
968-
idx.FMSketch.DestroyAndPutToPool()
969-
}
970964
clear(t.indices)
971965
}
972966

0 commit comments

Comments
 (0)