Skip to content

Commit a886a57

Browse files
authored
fix(parquet/pqarrow): selective column reading of complex map column (#668)
### Rationale for this change Upstream fix for the issue identified in apache/iceberg-go#737. When reading maps with nested values using column indices for selective column reading, if the child fields of the map weren't in the list of indices there was a problem: - Maps are represented in Parquet as a list of key-value structs (`list<struct<key, value>>`) - The struct *MUST* have exactly 2 fields (key and value) to be converted into a proper Arrow typed Map column - When applying the column filtering, if only the key *OR* value field (but not both) were in the list of columns, the resulting child struct would only have 1 field - As a result, the `Map.validateData()` method would fail with a panic of `arrow/array: map array child array should have two fields`. ### What changes are included in this PR? In pqarrow/file_reader.go leaf filtering is disabled when reading a map's key-value struct. This will ensure both the key and value columns are always read together, maintaining the required 2-field structure for map arrays. ### Are these changes tested? Yes, a test case is added for the change. ### Are there any user-facing changes? This only affects map type reading with column filter selection, ensuring correctness. The only change is that a failure mode has been eliminated.
1 parent dcb85bf commit a886a57

2 files changed

Lines changed: 140 additions & 1 deletion

File tree

parquet/pqarrow/file_reader.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -606,7 +606,22 @@ func (fr *FileReader) getReader(ctx context.Context, field *SchemaField, arrowFi
606606
out = newStructReader(&rctx, &filtered, field.LevelInfo, childReaders, fr.Props)
607607
case arrow.LIST, arrow.FIXED_SIZE_LIST, arrow.MAP:
608608
child := field.Children[0]
609-
childReader, err := fr.getReader(ctx, &child, *child.Field)
609+
610+
// For maps, we must read BOTH key and value columns, regardless of column selection.
611+
// Map arrays require a complete key-value struct with exactly 2 fields.
612+
// Disable leaf filtering when reading a map's key-value struct.
613+
childCtx := ctx
614+
if _, isMap := arrowField.Type.(*arrow.MapType); isMap {
615+
childCtx = context.WithValue(ctx, rdrCtxKey{}, readerCtx{
616+
rdr: rctx.rdr,
617+
mem: rctx.mem,
618+
colFactory: rctx.colFactory,
619+
filterLeaves: false, // Don't filter leaves for map key-value struct
620+
includedLeaves: rctx.includedLeaves,
621+
})
622+
}
623+
624+
childReader, err := fr.getReader(childCtx, &child, *child.Field)
610625
if err != nil {
611626
return nil, err
612627
}

parquet/pqarrow/file_reader_test.go

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -652,3 +652,127 @@ func TestPartialStructColumnRead(t *testing.T) {
652652
cArr := arr.Field(1).(*array.Float64)
653653
require.Equal(t, 3.0, cArr.Value(0))
654654
}
655+
656+
// TestMapColumnWithFilters tests that map columns can be read correctly when
657+
// using column filtering. This is a regression test for a bug where reading
658+
// a map column with filters would fail because the code tried to filter the
659+
// individual key and value columns of the map's internal key-value struct.
660+
// Maps require both key and value columns to be read together, so leaf filtering
661+
// must be disabled for map types.
662+
func TestMapColumnWithFilters(t *testing.T) {
663+
mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
664+
defer mem.AssertSize(t, 0)
665+
666+
// Create schema with a map column and other columns
667+
schema := arrow.NewSchema([]arrow.Field{
668+
{Name: "id", Type: arrow.PrimitiveTypes.Int64},
669+
{Name: "properties", Type: arrow.MapOf(arrow.BinaryTypes.String, arrow.PrimitiveTypes.Int32)},
670+
{Name: "value", Type: arrow.PrimitiveTypes.Float64},
671+
}, nil)
672+
673+
// Build test data
674+
b := array.NewRecordBuilder(mem, schema)
675+
defer b.Release()
676+
677+
// Build ID column
678+
idBuilder := b.Field(0).(*array.Int64Builder)
679+
idBuilder.AppendValues([]int64{1, 2}, nil)
680+
681+
// Build map column
682+
mapBuilder := b.Field(1).(*array.MapBuilder)
683+
kb := mapBuilder.KeyBuilder().(*array.StringBuilder)
684+
vb := mapBuilder.ItemBuilder().(*array.Int32Builder)
685+
686+
// First map: {"key1": 100, "key2": 200}
687+
mapBuilder.Append(true)
688+
kb.AppendValues([]string{"key1", "key2"}, nil)
689+
vb.AppendValues([]int32{100, 200}, nil)
690+
691+
// Second map: {"key3": 300}
692+
mapBuilder.Append(true)
693+
kb.AppendValues([]string{"key3"}, nil)
694+
vb.AppendValues([]int32{300}, nil)
695+
696+
// Build value column
697+
valueBuilder := b.Field(2).(*array.Float64Builder)
698+
valueBuilder.AppendValues([]float64{1.5, 2.5}, nil)
699+
700+
rec := b.NewRecordBatch()
701+
defer rec.Release()
702+
703+
// Write to parquet
704+
buf := new(bytes.Buffer)
705+
writer, err := pqarrow.NewFileWriter(schema, buf, nil, pqarrow.DefaultWriterProps())
706+
require.NoError(t, err)
707+
require.NoError(t, writer.Write(rec))
708+
require.NoError(t, writer.Close())
709+
710+
// Read back with column filtering (only read id and properties, skip value)
711+
pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
712+
require.NoError(t, err)
713+
defer pf.Close()
714+
715+
fr, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{}, mem)
716+
require.NoError(t, err)
717+
718+
// Read only columns for the first two fields (id and map)
719+
// Column 0 = id
720+
// Column 1 = map.key
721+
// Column 2 = map.value
722+
// Column 3 = value (skipped)
723+
// This exercises the code path where maps need both key and value columns
724+
// even when column filtering is active
725+
ctx := context.Background()
726+
colIndices := []int{0, 1, 2} // id, map.key, map.value
727+
rr, err := fr.GetRecordReader(ctx, colIndices, nil)
728+
require.NoError(t, err)
729+
require.NotNil(t, rr)
730+
defer rr.Release()
731+
732+
// Read the record batch
733+
require.True(t, rr.Next())
734+
result := rr.RecordBatch()
735+
// Note: Don't release result manually - the record reader owns it
736+
737+
// Verify schema - should have only 2 fields (id and properties)
738+
require.Equal(t, 2, int(result.NumCols()))
739+
require.Equal(t, "id", result.Schema().Field(0).Name)
740+
require.Equal(t, "properties", result.Schema().Field(1).Name)
741+
742+
// Verify ID column
743+
idCol := result.Column(0).(*array.Int64)
744+
require.Equal(t, int64(1), idCol.Value(0))
745+
require.Equal(t, int64(2), idCol.Value(1))
746+
747+
// Verify map column - this is the critical test for the fix
748+
// The key test is that reading succeeds without panic
749+
mapCol := result.Column(1).(*array.Map)
750+
require.Equal(t, 2, mapCol.Len())
751+
752+
// Verify the map has the correct structure (keys and items arrays exist)
753+
keys := mapCol.Keys().(*array.String)
754+
vals := mapCol.Items().(*array.Int32)
755+
require.NotNil(t, keys)
756+
require.NotNil(t, vals)
757+
758+
// Verify total number of key-value pairs across all maps
759+
require.Equal(t, 3, keys.Len()) // Total: 2 from first map + 1 from second map
760+
require.Equal(t, 3, vals.Len())
761+
762+
// Verify the map offsets are correct
763+
start0, end0 := mapCol.ValueOffsets(0)
764+
require.Equal(t, int64(0), start0)
765+
require.Equal(t, int64(2), end0) // First map has entries from 0 to 2 (2 entries)
766+
767+
start1, end1 := mapCol.ValueOffsets(1)
768+
require.Equal(t, int64(2), start1)
769+
require.Equal(t, int64(3), end1) // Second map has entries from 2 to 3 (1 entry)
770+
771+
// Verify key-value pairs
772+
require.Equal(t, "key1", keys.Value(0))
773+
require.Equal(t, int32(100), vals.Value(0))
774+
require.Equal(t, "key2", keys.Value(1))
775+
require.Equal(t, int32(200), vals.Value(1))
776+
require.Equal(t, "key3", keys.Value(2))
777+
require.Equal(t, int32(300), vals.Value(2))
778+
}

0 commit comments

Comments
 (0)