@@ -19,6 +19,7 @@ package pqarrow_test
1919import (
2020 "bytes"
2121 "context"
22+ "encoding/base64"
2223 "encoding/binary"
2324 "fmt"
2425 "math"
@@ -1532,9 +1533,9 @@ func makeListArray(values arrow.Array, size, nullcount int) arrow.Array {
15321533 nullBitmap := make ([]byte , int (bitutil .BytesForBits (int64 (size ))))
15331534
15341535 curOffset := 0
1535- for i := 0 ; i < size ; i ++ {
1536+ for i := range size {
15361537 offsetsArr [i ] = int32 (curOffset )
1537- if ! ((( i % 2 ) == 0 ) && (( i / 2 ) < nullcount )) {
1538+ if i % 2 != 0 || i / 2 >= nullcount {
15381539 // non-null list (list with index 1 is always empty)
15391540 bitutil .SetBit (nullBitmap , i )
15401541 if i != 1 {
@@ -2108,6 +2109,105 @@ func (ps *ParquetIOTestSuite) TestStructWithListOfNestedStructs() {
21082109 ps .roundTripTable (mem , expected , false )
21092110}
21102111
2112+ // TestListOfStructWithEmptyListStoreSchema verifies that the ARROW:schema metadata stored
2113+ // in a Parquet file names the list element field "element" (not "item"), so that it
2114+ // matches the actual Parquet column paths. This is required for compatibility with
2115+ // readers like Snowflake that resolve columns by matching ARROW:schema field names
2116+ // to Parquet column path segments. See https://github.com/apache/arrow-go/issues/744.
2117+ func TestListOfStructWithEmptyListStoreSchema (t * testing.T ) {
2118+ mem := memory .NewCheckedAllocator (memory .DefaultAllocator )
2119+ defer mem .AssertSize (t , 0 ) // deferred first, so it runs after every Release/Close below
2120+
2121+ opsStruct := arrow .StructOf (
2122+ arrow.Field {Name : "id" , Type : arrow .BinaryTypes .String , Nullable : false },
2123+ arrow.Field {Name : "token" , Type : arrow .BinaryTypes .String , Nullable : true },
2124+ arrow.Field {Name : "amount" , Type : arrow .BinaryTypes .String , Nullable : true },
2125+ )
2126+ // arrow.ListOf uses "item" as the element field name, which would mismatch
2127+ // the Parquet column path that uses "element". The writer is expected to store an
2128+ // ARROW:schema that uses "element" to stay consistent with the Parquet columns.
2129+ schema := arrow .NewSchema ([]arrow.Field {
2130+ {Name : "block_num" , Type : arrow .PrimitiveTypes .Uint64 , Nullable : false },
2131+ {Name : "tx_id" , Type : arrow .BinaryTypes .String , Nullable : false },
2132+ {Name : "ops" , Type : arrow .ListOf (opsStruct ), Nullable : true },
2133+ }, nil )
2134+
2135+ b := array .NewRecordBuilder (mem , schema )
2136+ defer b .Release ()
2137+
2138+ b .Field (0 ).(* array.Uint64Builder ).AppendValues ([]uint64 {100 , 101 , 102 }, nil )
2139+ b .Field (1 ).(* array.StringBuilder ).AppendValues ([]string {"tx-a" , "tx-b" , "tx-c" }, nil )
2140+
2141+ lb := b .Field (2 ).(* array.ListBuilder )
2142+ sb := lb .ValueBuilder ().(* array.StructBuilder )
2143+ idb := sb .FieldBuilder (0 ).(* array.StringBuilder )
2144+ tokb := sb .FieldBuilder (1 ).(* array.StringBuilder )
2145+ amtb := sb .FieldBuilder (2 ).(* array.StringBuilder )
2146+
2147+ lb .Append (true ) // row 0: list holding two structs (op-1, op-2)
2148+ sb .Append (true )
2149+ idb .Append ("op-1" )
2150+ tokb .Append ("USDC" )
2151+ amtb .Append ("10" )
2152+ sb .Append (true )
2153+ idb .Append ("op-2" )
2154+ tokb .Append ("ETH" )
2155+ amtb .Append ("1.5" )
2156+ lb .Append (true ) // row 1: non-null but empty list (no struct appended before the next lb.Append)
2157+ lb .Append (true ) // row 2: list holding one struct (op-3) whose token is null
2158+ sb .Append (true )
2159+ idb .Append ("op-3" )
2160+ tokb .AppendNull ()
2161+ amtb .Append ("42" )
2162+
2163+ rec := b .NewRecordBatch ()
2164+ defer rec .Release ()
2165+
2166+ var buf bytes.Buffer
2167+ props := parquet .NewWriterProperties (parquet .WithDictionaryDefault (true ), parquet .WithStats (true ))
2168+ arrowProps := pqarrow .NewArrowWriterProperties (pqarrow .WithStoreSchema ())
2169+
2170+ pw , err := pqarrow .NewFileWriter (schema , & buf , props , arrowProps )
2171+ require .NoError (t , err )
2172+ require .NoError (t , pw .Write (rec ))
2173+ require .NoError (t , pw .Close ())
2174+
2175+ // Read the file back as a table; only the row count is asserted here, not the cell values.
2176+ pf , err := file .NewParquetReader (bytes .NewReader (buf .Bytes ()))
2177+ require .NoError (t , err )
2178+ defer pf .Close ()
2179+
2180+ fr , err := pqarrow .NewFileReader (pf , pqarrow.ArrowReadProperties {}, mem )
2181+ require .NoError (t , err )
2182+
2183+ tbl , err := fr .ReadTable (context .Background ())
2184+ require .NoError (t , err )
2185+ defer tbl .Release ()
2186+
2187+ require .EqualValues (t , 3 , tbl .NumRows ())
2188+
2189+ // Verify the stored ARROW:schema uses "element" as the list element field name
2190+ // (consistent with the Parquet column path "ops.list.element.*"), not "item"
2191+ // (the default Arrow field name from arrow.ListOf()).
2192+ arrowSchemaEncoded := pf .MetaData ().KeyValueMetadata ().FindValue ("ARROW:schema" )
2193+ require .NotNil (t , arrowSchemaEncoded , "ARROW:schema metadata key must be present" )
2194+ decoded , err := base64 .StdEncoding .DecodeString (* arrowSchemaEncoded )
2195+ require .NoError (t , err )
2196+ // The base64-decoded bytes are read as an IPC stream (presumably how the writer serialized the schema - confirm).
2197+ ipcRdr , err := ipc .NewReader (bytes .NewReader (decoded ), ipc .WithAllocator (mem ))
2198+ require .NoError (t , err )
2199+ defer ipcRdr .Release ()
2200+ storedSchema := ipcRdr .Schema ()
2201+
2202+ opsField , ok := storedSchema .FieldsByName ("ops" )
2203+ require .True (t , ok )
2204+ opsListType , ok := opsField [0 ].Type .(* arrow.ListType )
2205+ require .True (t , ok )
2206+ // Must be "element" (matching Parquet column path) not "item" (Arrow default).
2207+ assert .Equal (t , "element" , opsListType .ElemField ().Name ,
2208+ "ARROW:schema element name must match the Parquet column path segment" )
2209+ }
2210+
21112211func TestParquetArrowIO (t * testing.T ) {
21122212 suite .Run (t , new (ParquetIOTestSuite )) // hands a fresh ParquetIOTestSuite to the suite runner
21132213}
0 commit comments