@@ -39,7 +39,6 @@ import (
3939 plannercore "github.com/pingcap/tidb/pkg/planner/core"
4040 "github.com/pingcap/tidb/pkg/table"
4141 "github.com/pingcap/tidb/pkg/table/tables"
42- "github.com/pingcap/tidb/pkg/types"
4342 contextutil "github.com/pingcap/tidb/pkg/util/context"
4443 "github.com/pingcap/tidb/pkg/util/dbterror/exeerrors"
4544 "go.uber.org/zap"
@@ -352,27 +351,30 @@ func (s *kvSizeSampler) sampleOneFile(
352351 }()
353352
354353 var (
355- count int
356- readRowCache []types.Datum
357- readFn = parserEncodeReader (parser , chunk .Chunk .EndOffset , chunk .GetKey ())
358- kvBatch = newEncodedKVGroupBatch (ksCodec , maxRowCount )
354+ count int
355+ kvBatch = newEncodedKVGroupBatch (ksCodec , maxRowCount )
359356 )
360357 for count < maxRowCount {
361- row , closed , readErr := readFn (ctx , readRowCache )
362- if readErr != nil {
363- return 0 , 0 , 0 , readErr
364- }
365- if closed {
358+ startPos , _ := parser .Pos ()
359+ if s .cfg .Format != DataFormatParquet && startPos >= chunk .Chunk .EndOffset {
366360 break
367361 }
368- readRowCache = row .row
369- if rowDelta := row .endOffset - row .startPos ; rowDelta > 0 {
370- sourceSize += rowDelta
362+
363+ readErr := parser .ReadRow ()
364+ if readErr != nil {
365+ if errors .Cause (readErr ) == io .EOF {
366+ break
367+ }
368+ return 0 , 0 , 0 , common .ErrEncodeKV .Wrap (readErr ).GenWithStackByArgs (chunk .GetKey (), startPos )
371369 }
372- kvs , encodeErr := encoder .Encode (row .row , row .rowID )
373- row .resetFn ()
370+
371+ lastRow := parser .LastRow ()
372+ sourceSize += s .sampledRowSourceSize (parser , startPos , lastRow )
373+
374+ kvs , encodeErr := encoder .Encode (lastRow .Row , lastRow .RowID )
375+ parser .RecycleRow (lastRow )
374376 if encodeErr != nil {
375- return 0 , 0 , 0 , common .ErrEncodeKV .Wrap (encodeErr ).GenWithStackByArgs (chunk .GetKey (), row . startPos )
377+ return 0 , 0 , 0 , common .ErrEncodeKV .Wrap (encodeErr ).GenWithStackByArgs (chunk .GetKey (), startPos )
376378 }
377379 if _ , err = kvBatch .add (kvs ); err != nil {
378380 return 0 , 0 , 0 , err
@@ -382,3 +384,17 @@ func (s *kvSizeSampler) sampleOneFile(
382384 dataKVSize , indexKVSize = kvBatch .groupChecksum .DataAndIndexSumSize ()
383385 return sourceSize , dataKVSize , indexKVSize , nil
384386}
387+
388+ func (s * kvSizeSampler ) sampledRowSourceSize (parser mydump.Parser , startPos int64 , row mydump.Row ) int64 {
389+ // Sampling needs per-row source bytes, not buffered reader progress.
390+ // SQL/CSV parsers expose byte offsets through Pos(), while parquet Pos()
391+ // is row-count based and must fall back to the row-size estimate.
392+ if s .cfg .Format == DataFormatParquet {
393+ return int64 (row .Length )
394+ }
395+ endPos , _ := parser .Pos ()
396+ if rowDelta := endPos - startPos ; rowDelta > 0 {
397+ return rowDelta
398+ }
399+ return int64 (row .Length )
400+ }
0 commit comments