test(parquet/file): add regression test for mixed-size ByteArray WriteBatch (#757)

kli19 · web-flow · commit 5615d94fb283 · 2026-04-13T11:57:12.000-04:00
### Rationale for this change Issue #756 reports that small ByteArray values are silently dropped when a large (≥1MB) value appears in the same WriteBatch. This was already fixed on main by #690, but there was no test covering the specific mixed-size scenario. ### What changes are included in this PR? Adds a regression test that writes a batch of mixed-size ByteArray values (small values flanking a 2MB value) and verifies all values round-trip correctly. ### Are these changes tested? The test itself is the change. Confirmed it fails at bbf7ab7 (#655, the buggy commit) and passes on main. ### Are there any user-facing changes? No
diff --git a/parquet/file/large_value_test.go b/parquet/file/large_value_test.go
@@ -236,6 +236,78 @@ func TestLargeByteArrayRoundTripCorrectness(t *testing.T) {
 	require.Equal(t, numValues, rowIdx, "did not read back all values")
 }
 
+// TestMixedSizeByteArrayRoundTrip verifies that small ByteArray values
+// are not dropped when a large (≥1MB) value appears in the same WriteBatch.
+func TestMixedSizeByteArrayRoundTrip(t *testing.T) {
+	sc := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
+		schema.Must(schema.NewPrimitiveNode("data", parquet.Repetitions.Required, parquet.Types.ByteArray, -1, -1)),
+	}, -1)))
+
+	props := parquet.NewWriterProperties(
+		parquet.WithStats(true),
+		parquet.WithDictionaryDefault(false),
+		parquet.WithDataPageSize(1024*1024),
+	)
+
+	// Build values: small, small, small, 2MB, small, small
+	// Each has a unique pattern so corruption is detectable.
+	sizes := []int{65, 100, 200, 2 * 1024 * 1024, 50, 80}
+	values := make([]parquet.ByteArray, len(sizes))
+	for i, sz := range sizes {
+		buf := make([]byte, sz)
+		// Header: index for identification
+		buf[0] = byte(i)
+		// Fill with deterministic pattern
+		for j := 1; j < sz; j++ {
+			buf[j] = byte(i*31 + j)
+		}
+		values[i] = buf
+	}
+
+	// Write
+	out := &bytes.Buffer{}
+	writer := file.NewParquetWriter(out, sc.Root(), file.WithWriterProps(props))
+
+	rgw := writer.AppendRowGroup()
+	colWriter, err := rgw.NextColumn()
+	require.NoError(t, err)
+
+	byteArrayWriter := colWriter.(*file.ByteArrayColumnChunkWriter)
+	_, err = byteArrayWriter.WriteBatch(values, nil, nil)
+	require.NoError(t, err)
+
+	require.NoError(t, colWriter.Close())
+	require.NoError(t, rgw.Close())
+	require.NoError(t, writer.Close())
+
+	// Read back
+	rdr, err := file.NewParquetReader(bytes.NewReader(out.Bytes()))
+	require.NoError(t, err)
+	defer rdr.Close()
+
+	require.EqualValues(t, len(values), rdr.NumRows())
+
+	rgr := rdr.RowGroup(0)
+	colReader, err := rgr.Column(0)
+	require.NoError(t, err)
+
+	result := make([]parquet.ByteArray, len(values))
+	_, nVals, err := colReader.(*file.ByteArrayColumnChunkReader).ReadBatch(
+		int64(len(values)), result, nil, nil)
+	require.NoError(t, err)
+	require.Equal(t, len(values), nVals)
+
+	for i, expected := range values {
+		got := result[i]
+		require.Equal(t, len(expected), len(got),
+			"value %d: length mismatch (expected %d, got %d)", i, len(expected), len(got))
+		require.Equal(t, expected[0], got[0],
+			"value %d: header mismatch (data corruption)", i)
+		require.True(t, bytes.Equal(expected, got),
+			"value %d: content mismatch", i)
+	}
+}
+
 // TestLargeByteArrayRoundTripWithNulls verifies correctness of the
 // WriteBatchSpaced path (nullable column) with moderately-sized values.
 // Every 3rd value is null. Uses ~3MB total.