Skip to content

Commit 5615d94

Browse files
authored
test(parquet/file): add regression test for mixed-size ByteArray WriteBatch (#757)
### Rationale for this change

Issue #756 reports that small ByteArray values are silently dropped when a large (≥1MB) value appears in the same WriteBatch. This was already fixed on main by #690, but there was no test covering the specific mixed-size scenario.

### What changes are included in this PR?

Adds a regression test that writes a batch of mixed-size ByteArray values (small values flanking a 2MB value) and verifies all values round-trip correctly.

### Are these changes tested?

The test itself is the change. Confirmed it fails at bbf7ab7 (#655, the buggy commit) and passes on main.

### Are there any user-facing changes?

No.
1 parent 6648c1d commit 5615d94

1 file changed

Lines changed: 72 additions & 0 deletions

File tree

parquet/file/large_value_test.go

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,78 @@ func TestLargeByteArrayRoundTripCorrectness(t *testing.T) {
236236
require.Equal(t, numValues, rowIdx, "did not read back all values")
237237
}
238238

239+
// TestMixedSizeByteArrayRoundTrip verifies that small ByteArray values
240+
// are not dropped when a large (≥1MB) value appears in the same WriteBatch.
241+
func TestMixedSizeByteArrayRoundTrip(t *testing.T) {
242+
sc := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
243+
schema.Must(schema.NewPrimitiveNode("data", parquet.Repetitions.Required, parquet.Types.ByteArray, -1, -1)),
244+
}, -1)))
245+
246+
props := parquet.NewWriterProperties(
247+
parquet.WithStats(true),
248+
parquet.WithDictionaryDefault(false),
249+
parquet.WithDataPageSize(1024*1024),
250+
)
251+
252+
// Build values: small, small, small, 2MB, small, small
253+
// Each has a unique pattern so corruption is detectable.
254+
sizes := []int{65, 100, 200, 2 * 1024 * 1024, 50, 80}
255+
values := make([]parquet.ByteArray, len(sizes))
256+
for i, sz := range sizes {
257+
buf := make([]byte, sz)
258+
// Header: index for identification
259+
buf[0] = byte(i)
260+
// Fill with deterministic pattern
261+
for j := 1; j < sz; j++ {
262+
buf[j] = byte(i*31 + j)
263+
}
264+
values[i] = buf
265+
}
266+
267+
// Write
268+
out := &bytes.Buffer{}
269+
writer := file.NewParquetWriter(out, sc.Root(), file.WithWriterProps(props))
270+
271+
rgw := writer.AppendRowGroup()
272+
colWriter, err := rgw.NextColumn()
273+
require.NoError(t, err)
274+
275+
byteArrayWriter := colWriter.(*file.ByteArrayColumnChunkWriter)
276+
_, err = byteArrayWriter.WriteBatch(values, nil, nil)
277+
require.NoError(t, err)
278+
279+
require.NoError(t, colWriter.Close())
280+
require.NoError(t, rgw.Close())
281+
require.NoError(t, writer.Close())
282+
283+
// Read back
284+
rdr, err := file.NewParquetReader(bytes.NewReader(out.Bytes()))
285+
require.NoError(t, err)
286+
defer rdr.Close()
287+
288+
require.EqualValues(t, len(values), rdr.NumRows())
289+
290+
rgr := rdr.RowGroup(0)
291+
colReader, err := rgr.Column(0)
292+
require.NoError(t, err)
293+
294+
result := make([]parquet.ByteArray, len(values))
295+
_, nVals, err := colReader.(*file.ByteArrayColumnChunkReader).ReadBatch(
296+
int64(len(values)), result, nil, nil)
297+
require.NoError(t, err)
298+
require.Equal(t, len(values), nVals)
299+
300+
for i, expected := range values {
301+
got := result[i]
302+
require.Equal(t, len(expected), len(got),
303+
"value %d: length mismatch (expected %d, got %d)", i, len(expected), len(got))
304+
require.Equal(t, expected[0], got[0],
305+
"value %d: header mismatch (data corruption)", i)
306+
require.True(t, bytes.Equal(expected, got),
307+
"value %d: content mismatch", i)
308+
}
309+
}
310+
239311
// TestLargeByteArrayRoundTripWithNulls verifies correctness of the
240312
// WriteBatchSpaced path (nullable column) with moderately-sized values.
241313
// Every 3rd value is null. Uses ~3MB total.

0 commit comments

Comments
 (0)