Skip to content

Commit 14eb8f3

Browse files
authored
fix(parquet/pqarrow): Fix propagation of field-ids for Lists (#397)
### Rationale for this change

An issue was found in apache/iceberg-go when attempting to retrieve data from a table containing a list column whose element type is a struct. The element-id of the list's element field was not propagated when fetching. I tracked it down to the schema handling here.

### What changes are included in this PR?

Changes the `getNestedFactory` function in pqarrow/schema.go to use `ListOfField` instead of `ListOf` so that it preserves the element field's metadata, i.e. the field id. The fixed-size list and map cases are updated in the same way, using `FixedSizeListOfField` and `MapOfFields` instead of `FixedSizeListOf` and `MapOf`.

### Are these changes tested?

Yes, a test has been added to cover this scenario.

### Are there any user-facing changes?

Previously this situation would result in a field-id of -1; now users will see the field-id propagated correctly.
1 parent 5240503 commit 14eb8f3

3 files changed

Lines changed: 36 additions & 4 deletions

File tree

parquet/pqarrow/encode_dictionary_test.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -688,7 +688,9 @@ func TestArrowWriteNestedSubfieldDictionary(t *testing.T) {
688688
dictValues := array.NewDictionaryArray(dictType, indices, dict)
689689
defer dictValues.Release()
690690

691-
data := array.NewData(arrow.ListOf(dictType), 3, []*memory.Buffer{nil, offsets.Data().Buffers()[1]},
691+
data := array.NewData(arrow.ListOfField(arrow.Field{
692+
Name: "element", Type: dictType, Nullable: true,
693+
Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})}), 3, []*memory.Buffer{nil, offsets.Data().Buffers()[1]},
692694
[]arrow.ArrayData{dictValues.Data()}, 0, 0)
693695
defer data.Release()
694696
values := array.NewListData(data)

parquet/pqarrow/schema.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1015,19 +1015,19 @@ func getNestedFactory(origin, inferred arrow.DataType) func(fieldList []arrow.Fi
10151015
switch origin.ID() {
10161016
case arrow.LIST:
10171017
return func(list []arrow.Field) arrow.DataType {
1018-
return arrow.ListOf(list[0].Type)
1018+
return arrow.ListOfField(list[0])
10191019
}
10201020
case arrow.FIXED_SIZE_LIST:
10211021
sz := origin.(*arrow.FixedSizeListType).Len()
10221022
return func(list []arrow.Field) arrow.DataType {
1023-
return arrow.FixedSizeListOf(sz, list[0].Type)
1023+
return arrow.FixedSizeListOfField(sz, list[0])
10241024
}
10251025
}
10261026
case arrow.MAP:
10271027
if origin.ID() == arrow.MAP {
10281028
return func(list []arrow.Field) arrow.DataType {
10291029
valType := list[0].Type.(*arrow.StructType)
1030-
return arrow.MapOf(valType.Field(0).Type, valType.Field(1).Type)
1030+
return arrow.MapOfFields(valType.Field(0), valType.Field(1))
10311031
}
10321032
}
10331033
}

parquet/pqarrow/schema_test.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -473,6 +473,36 @@ func TestProperListElementNullability(t *testing.T) {
473473
assert.True(t, arrSchema.Equal(outSchema), "expected: %s, got: %s", arrSchema, outSchema)
474474
}
475475

476+
func TestFieldNestedPropagate(t *testing.T) {
477+
arrSchema := arrow.NewSchema([]arrow.Field{
478+
{Name: "transformations", Type: arrow.ListOfField(
479+
arrow.Field{
480+
Name: "element",
481+
Type: arrow.StructOf(
482+
arrow.Field{Name: "destination", Type: arrow.BinaryTypes.String,
483+
Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"6"})},
484+
arrow.Field{Name: "transform_type", Type: arrow.BinaryTypes.String,
485+
Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"7"})},
486+
arrow.Field{Name: "transform_value", Type: arrow.BinaryTypes.String,
487+
Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"8"})},
488+
arrow.Field{Name: "source_cols", Type: arrow.ListOfField(
489+
arrow.Field{Name: "element", Type: arrow.BinaryTypes.String, Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"10"})}),
490+
Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"9"})},
491+
),
492+
Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"5"}),
493+
},
494+
), Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"4"})},
495+
}, nil)
496+
497+
pqSchema, err := pqarrow.ToParquet(arrSchema, nil, pqarrow.DefaultWriterProps())
498+
require.NoError(t, err)
499+
500+
result, err := pqarrow.FromParquet(pqSchema, nil, metadata.KeyValueMetadata{})
501+
require.NoError(t, err)
502+
503+
assert.True(t, arrSchema.Equal(result), "expected: %s, got: %s", arrSchema, result)
504+
}
505+
476506
func TestConvertSchemaParquetVariant(t *testing.T) {
477507
// unshredded variant:
478508
// optional group variant_col {

0 commit comments

Comments
 (0)