@@ -16,9 +16,9 @@ package statistics
1616
1717import (
1818 "hash"
19+ "maps"
1920 "sync"
2021
21- "github.com/dolthub/swiss"
2222 "github.com/pingcap/errors"
2323 "github.com/pingcap/tidb/pkg/sessionctx/stmtctx"
2424 "github.com/pingcap/tidb/pkg/types"
@@ -33,15 +33,6 @@ var murmur3Pool = sync.Pool{
3333 },
3434}
3535
// fmSketchPool is deleted by this change (every line below carries the '-' marker).
// It was a sync.Pool whose New func built an FMSketch backed by a swiss map
// (created with a size argument of 128) and a maxSize of 0.
36- var fmSketchPool = sync.Pool {
37- New : func () any {
38- return & FMSketch {
39- hashset : swiss.NewMap [uint64 , bool ](uint32 (128 )),
40- maxSize : 0 ,
41- }
42- },
43- }
44-
4536// MaxSketchSize is the maximum size of the hashset in the FM sketch.
// NOTE(review): none of the visible hunks read this constant; NewFMSketch takes
// the limit as its maxSize argument — presumably callers pass MaxSketchSize, confirm.
4637// TODO: add this attribute to PB and persist it instead of using a fixed number(executor.maxSketchSize)
4738const MaxSketchSize = 10000
@@ -64,7 +55,7 @@ const MaxSketchSize = 10000
6455// 2. https://algo.inria.fr/flajolet/Publications/FlMa85.pdf
6556type FMSketch struct {
6657 // A set to store unique hashed values.
67- hashset * swiss. Map [uint64 , bool ]
58+ hashset map [uint64 ] struct {}
6859 // A binary mask used to track the maximum number of trailing zeroes in the hashed values.
6960 // Also used to track the level of the sketch.
7061 // Every time the size of the hashset exceeds the maximum size, the mask will be moved to the next level.
@@ -76,24 +67,22 @@ type FMSketch struct {
7667
7768// NewFMSketch returns a new FM sketch.
// maxSize is stored on the sketch as the hashset size limit; insertHashValue
// compares len(hashset) against it to decide when to move the mask to the next level.
7869func NewFMSketch (maxSize int ) * FMSketch {
// Removed lines: the sketch used to be taken from fmSketchPool, with only maxSize overwritten.
79- result := fmSketchPool .Get ().(* FMSketch )
80- result .maxSize = maxSize
81- return result
// Added lines: a fresh sketch is allocated with an empty built-in map used as a set;
// mask is not set here and therefore keeps its zero value.
70+ return & FMSketch {
71+ hashset : make (map [uint64 ]struct {}),
72+ maxSize : maxSize ,
73+ }
8274}
8375
8476// Copy makes a copy for current FMSketch.
// It returns nil when the receiver is nil; otherwise the result carries the same
// mask and maxSize and its own hashset holding the same keys.
8577func (s * FMSketch ) Copy () * FMSketch {
8678 if s == nil {
8779 return nil
8880 }
// Removed lines: the copy used to be built through NewFMSketch and filled key by key
// via the swiss map's Iter/Put.
89- result := NewFMSketch (s .maxSize )
90- s .hashset .Iter (func (key uint64 , value bool ) bool {
91- result .hashset .Put (key , value )
92- return false
93- })
94- result .mask = s .mask
95- result .maxSize = s .maxSize
96- return result
// Added lines: maps.Clone produces a new map with the same key set, so later inserts
// or deletes on the copy do not touch s.hashset. Clone returns nil for a nil source map.
81+ return & FMSketch {
82+ hashset : maps .Clone (s .hashset ),
83+ mask : s .mask ,
84+ maxSize : s .maxSize ,
85+ }
9786}
9887
9988// NDV returns the estimated number of distinct values (NDV) in the sketch.
@@ -106,7 +95,7 @@ func (s *FMSketch) NDV() int64 {
10695 // This is achieved by hashing the input value and counting the number of trailing zeroes in the binary representation of the hash value.
10796 // So the count of distinct values with 'r' trailing zeroes is n / 2^r, where 'n' is the number of distinct values.
10897 // Therefore, the estimated count of distinct values is 2^r * count = n.
109- return int64 (s .mask + 1 ) * int64 (s .hashset . Count ( ))
98+ return int64 (s .mask + 1 ) * int64 (len ( s .hashset ))
11099}
111100
112101// insertHashValue inserts a hashed value into the sketch.
@@ -117,18 +106,15 @@ func (s *FMSketch) insertHashValue(hashVal uint64) {
117106 return
118107 }
119108 // Put the hashed value into the hashset.
120- s .hashset . Put ( hashVal , true )
109+ s .hashset [ hashVal ] = struct {}{}
121110 // We track the unique hashed values level by level to ensure a minimum count of distinct values at each level.
122111 // This way, the final estimation is less likely to be skewed by outliers.
123- if s .hashset . Count ( ) > s .maxSize {
112+ if len ( s .hashset ) > s .maxSize {
124113 // If the size of the hashset exceeds the maximum size, move the mask to the next level.
125114 s .mask = s .mask * 2 + 1
126115 // Clean up the hashset by removing the hashed values with trailing zeroes less than the new mask.
127- s .hashset .Iter (func (k uint64 , _ bool ) (stop bool ) {
128- if (k & s .mask ) != 0 {
129- s .hashset .Delete (k )
130- }
131- return false
116+ maps .DeleteFunc (s .hashset , func (k uint64 , _ struct {}) bool {
117+ return (k & s .mask ) != 0
132118 })
133119 }
134120}
@@ -182,28 +168,23 @@ func (s *FMSketch) MergeFMSketch(rs *FMSketch) {
182168 }
183169 if s .mask < rs .mask {
184170 s .mask = rs .mask
185- s .hashset .Iter (func (key uint64 , _ bool ) bool {
186- if (key & s .mask ) != 0 {
187- s .hashset .Delete (key )
188- }
189- return false
171+ maps .DeleteFunc (s .hashset , func (k uint64 , _ struct {}) bool {
172+ return (k & s .mask ) != 0
190173 })
191174 }
192- rs . hashset . Iter ( func ( key uint64 , _ bool ) bool {
175+ for key := range rs . hashset {
193176 s .insertHashValue (key )
194- return false
195- })
177+ }
196178}
197179
198180// FMSketchToProto converts FMSketch to its protobuf representation.
// A nil sketch yields a freshly allocated tipb.FMSketch with no fields set.
// Only mask and the hashed values are written; maxSize is not written to the message.
199181func FMSketchToProto (s * FMSketch ) * tipb.FMSketch {
200182 protoSketch := new (tipb.FMSketch )
201183 if s != nil {
202184 protoSketch .Mask = s .mask
// The removed line iterated with the swiss map's Iter callback; the added line ranges
// over the built-in map. Go does not specify map iteration order, so the order of
// values appended to Hashset is not stable between calls.
203- s . hashset . Iter ( func ( val uint64 , _ bool ) bool {
185+ for val := range s . hashset {
204186 protoSketch .Hashset = append (protoSketch .Hashset , val )
205- return false
206- })
187+ }
207188 }
208189 return protoSketch
209190}
@@ -213,10 +194,12 @@ func FMSketchFromProto(protoSketch *tipb.FMSketch) *FMSketch {
213194 if protoSketch == nil {
214195 return nil
215196 }
216- sketch := fmSketchPool .Get ().(* FMSketch )
217- sketch .mask = protoSketch .Mask
197+ sketch := & FMSketch {
198+ hashset : make (map [uint64 ]struct {}, len (protoSketch .Hashset )),
199+ mask : protoSketch .Mask ,
200+ }
218201 for _ , val := range protoSketch .Hashset {
219- sketch .hashset . Put ( val , true )
202+ sketch .hashset [ val ] = struct {}{}
220203 }
221204 return sketch
222205}
@@ -249,22 +232,8 @@ func DecodeFMSketch(data []byte) (*FMSketch, error) {
249232// MemoryUsage returns the total memory usage of a FMSketch.
// The value is computed from a fixed formula (16 bytes for the two scalar fields plus
// a per-entry cost times the number of hashset entries), not measured at runtime.
// NOTE(review): unlike Copy, there is no nil-receiver guard here; s.hashset is read
// unconditionally, so calling this on a nil *FMSketch panics.
250233func (s * FMSketch ) MemoryUsage () (sum int64 ) {
251234 // As for the variables mask(uint64) and maxSize(int) each will consume 8 bytes. This is the origin of the constant 16.
// The per-entry cost drops from 9 to 8 in this change because the set's value type
// goes from bool to the zero-size struct{}.
252- // And for the variables hashset(map[uint64]bool ), each element in map will consume 9 bytes(8[ uint64] + 1[bool] ).
253- sum = int64 (16 + 9 * s .hashset . Count ( ))
235+ // And for the variables hashset(map[uint64]struct{} ), each element in map will consume 8 bytes(uint64 key ).
236+ sum = int64 (16 + 8 * len ( s .hashset ))
254237 return
255238}
256239
// reset is deleted by this change. It cleared the hashset and zeroed mask and maxSize;
// the only call to it in the visible hunks is from DestroyAndPutToPool.
257- func (s * FMSketch ) reset () {
258- s .hashset .Clear ()
259- s .mask = 0
260- s .maxSize = 0
261- }
262-
// This method is deleted by this change, together with fmSketchPool and reset.
// It did nothing for a nil receiver; otherwise it reset the sketch before handing it
// back to fmSketchPool.
263- // DestroyAndPutToPool resets the FMSketch and puts it to the pool.
264- func (s * FMSketch ) DestroyAndPutToPool () {
265- if s == nil {
266- return
267- }
268- s .reset ()
269- fmSketchPool .Put (s )
270- }
0 commit comments