Skip to content

Commit f6dfe1d

Browse files
Yicong-Huang and terana
authored and committed
[SPARK-55056][SQL][PYTHON][TEST] Add tests using Arrow to deserialize nested array with empty outer array
### What changes were proposed in this pull request?

Add tests to verify that writing triple-nested arrays (and nested arrays with maps) with an empty outer array no longer triggers a SIGSEGV.

### Why are the changes needed?

SPARK-55056 reported a segmentation fault when deserializing triple-nested arrays with an empty outer array via Arrow IPC. The root cause was in arrow-java: `ListVector.getBufferSizeFor(0)` returned 0, causing the offset buffer to be omitted for empty vectors, which violates the Arrow spec (the offset buffer must have N+1 entries even when N=0). This has been fixed upstream in arrow-java 19.0.0 ([apache/arrow-java#343](apache/arrow-java#343)), which Spark adopted in SPARK-56000 (PR apache#54820). These tests confirm the fix works correctly without any Spark-side workaround.

### Does this PR introduce _any_ user-facing change?

No (test only).

### How was this patch tested?

New unit tests.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes apache#54880 from Yicong-Huang/SPARK-55056-test.

Authored-by: Yicong Huang <17627829+Yicong-Huang@users.noreply.github.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
1 parent 2de1d03 commit f6dfe1d

File tree

2 files changed

+76
-0
lines changed

2 files changed

+76
-0
lines changed

python/pyspark/sql/tests/arrow/test_arrow.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1871,6 +1871,41 @@ def test_toArrow_with_compression_codec_large_dataset(self):
18711871
self.assertEqual(t.num_rows, 10000)
18721872
self.assertEqual(t.column_names, ["id", "str_col", "mod_col"])
18731873

1874+
def test_toPandas_double_nested_array_empty_outer(self):
    # Round-trip an array<array<string>> column whose only row holds an
    # empty outer array, and check the empty list survives toPandas().
    field = StructField("data", ArrayType(ArrayType(StringType())))
    df = self.spark.createDataFrame([Row(data=[])], schema=StructType([field]))
    pdf = df.toPandas()
    self.assertEqual(1, len(pdf))
    self.assertEqual(0, len(pdf["data"][0]))
1880+
1881+
def test_toPandas_array_of_map_empty_outer(self):
    # An array<map<string,string>> column containing only an empty outer
    # array must deserialize via Arrow without error and keep zero length.
    element_type = MapType(StringType(), StringType())
    schema = StructType([StructField("data", ArrayType(element_type))])
    pdf = self.spark.createDataFrame([Row(data=[])], schema=schema).toPandas()
    self.assertEqual(len(pdf), 1)
    self.assertEqual(len(pdf["data"][0]), 0)
1887+
1888+
def test_toPandas_triple_nested_array_empty_outer(self):
    # SPARK-55056: this scenario used to SIGSEGV before the upstream
    # arrow-java fix. With an empty outer array the second-level
    # ArrayWriter is never invoked and its count stays 0; the Arrow
    # format requires a ListArray offset buffer of N+1 entries even when
    # N=0, but getBufferSizeFor(0) returned 0 and the buffer was omitted
    # during IPC serialization.
    triple_nested = ArrayType(ArrayType(ArrayType(StringType())))
    schema = StructType([StructField("data", triple_nested)])
    df = self.spark.createDataFrame([Row(data=[])], schema=schema)
    pdf = df.toPandas()
    self.assertEqual(1, len(pdf))
    self.assertEqual(0, len(pdf["data"][0]))
1899+
1900+
def test_toPandas_nested_array_with_map_empty_outer(self):
    # Same empty-outer-array scenario as above, but with a map as the
    # innermost element type: array<array<map<string,string>>>.
    inner = ArrayType(MapType(StringType(), StringType()))
    schema = StructType([StructField("data", ArrayType(inner))])
    df = self.spark.createDataFrame([Row(data=[])], schema=schema)
    pdf = df.toPandas()
    self.assertEqual(1, len(pdf))
    self.assertEqual(0, len(pdf["data"][0]))
1908+
18741909

18751910
@unittest.skipIf(
18761911
not have_pandas or not have_pyarrow,

sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -875,4 +875,45 @@ class ArrowWriterSuite extends SparkFunSuite {
875875
assert(map2.keyArray().array().mkString(",") == Array(1).mkString(","))
876876
assert(stringRepr(map2) == Array("bob", "40").mkString(","))
877877
}
878+
879+
test("SPARK-55056: triple nested array with empty outer array") {
  // Schema: array<array<array<string>>>.
  // SPARK-55056: this used to SIGSEGV before the upstream arrow-java fix.
  // With an empty outer array the second-level ArrayWriter is never
  // invoked, so its count stays 0. The Arrow format requires a ListArray
  // offset buffer of N+1 entries even when N=0, but getBufferSizeFor(0)
  // returned 0 and the buffer was omitted.
  val schema = new StructType().add("data", ArrayType(ArrayType(ArrayType(StringType))))
  val writer = ArrowWriter.create(schema, null)
  assert(writer.schema === schema)

  // A single row whose outer array is empty.
  writer.write(InternalRow(ArrayData.toArrayData(Array.empty)))
  writer.finish()

  val column = new ArrowColumnVector(writer.root.getFieldVectors().get(0))
  assert(column.getArray(0).numElements() === 0)

  writer.root.close()
}
901+
902+
test("SPARK-55056: nested array with map inside empty outer array") {
  // Schema: array<array<map<string, string>>> — same empty-outer-array
  // scenario, with a map as the innermost element type.
  val schema = new StructType().add("data", ArrayType(ArrayType(MapType(StringType, StringType))))
  val writer = ArrowWriter.create(schema, null)
  assert(writer.schema === schema)

  // A single row whose outer array is empty.
  writer.write(InternalRow(ArrayData.toArrayData(Array.empty)))
  writer.finish()

  val column = new ArrowColumnVector(writer.root.getFieldVectors().get(0))
  assert(column.getArray(0).numElements() === 0)

  writer.root.close()
}
878919
}

0 commit comments

Comments
 (0)