Skip to content

Commit f6dfe1d

Browse files
Yicong-Huang and terana
authored and committed
[SPARK-55056][SQL][PYTHON][TEST] Add tests using Arrow to deserialize nested array with empty outer array
### What changes were proposed in this pull request?

Add tests to verify that writing triple-nested arrays (and nested arrays with maps) with an empty outer array no longer triggers a SIGSEGV.

### Why are the changes needed?

SPARK-55056 reported a segmentation fault when deserializing triple-nested arrays with an empty outer array via Arrow IPC. The root cause was in arrow-java: `ListVector.getBufferSizeFor(0)` returned 0, causing the offset buffer to be omitted for empty vectors, which violates the Arrow spec (the offset buffer must have N+1 entries even when N=0). This has been fixed upstream in arrow-java 19.0.0 ([apache/arrow-java#343](apache/arrow-java#343)), which Spark adopted in SPARK-56000 (PR apache#54820). These tests confirm the fix works correctly without any Spark-side workaround.

### Does this PR introduce _any_ user-facing change?

No (test only).

### How was this patch tested?

New unit tests.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes apache#54880 from Yicong-Huang/SPARK-55056-test.

Authored-by: Yicong Huang <17627829+Yicong-Huang@users.noreply.github.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
1 parent 2de1d03 commit f6dfe1d

File tree

2 files changed

+76
-0
lines changed

2 files changed

+76
-0
lines changed

python/pyspark/sql/tests/arrow/test_arrow.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1871,6 +1871,41 @@ def test_toArrow_with_compression_codec_large_dataset(self):
18711871
self.assertEqual(t.num_rows, 10000)
18721872
self.assertEqual(t.column_names, ["id", "str_col", "mod_col"])
18731873

1874+
def test_toPandas_double_nested_array_empty_outer(self):
    # Round-trip an array<array<string>> column whose only row holds an
    # empty outer array, and check the empty list survives toPandas().
    field = StructField("data", ArrayType(ArrayType(StringType())))
    df = self.spark.createDataFrame([Row(data=[])], schema=StructType([field]))
    pdf = df.toPandas()
    self.assertEqual(1, len(pdf))
    self.assertEqual(0, len(pdf["data"][0]))
1880+
1881+
def test_toPandas_array_of_map_empty_outer(self):
    # An array<map<string,string>> column containing only an empty outer
    # array must deserialize via Arrow without error and keep zero length.
    element_type = MapType(StringType(), StringType())
    schema = StructType([StructField("data", ArrayType(element_type))])
    pdf = self.spark.createDataFrame([Row(data=[])], schema=schema).toPandas()
    self.assertEqual(len(pdf), 1)
    self.assertEqual(len(pdf["data"][0]), 0)
1887+
1888+
def test_toPandas_triple_nested_array_empty_outer(self):
    # SPARK-55056: this scenario used to SIGSEGV before the upstream
    # arrow-java fix. With an empty outer array the second-level
    # ArrayWriter is never invoked and its count stays 0; the Arrow
    # format requires a ListArray offset buffer of N+1 entries even when
    # N=0, but getBufferSizeFor(0) returned 0 and the buffer was omitted
    # during IPC serialization.
    triple_nested = ArrayType(ArrayType(ArrayType(StringType())))
    schema = StructType([StructField("data", triple_nested)])
    df = self.spark.createDataFrame([Row(data=[])], schema=schema)
    pdf = df.toPandas()
    self.assertEqual(1, len(pdf))
    self.assertEqual(0, len(pdf["data"][0]))
1899+
1900+
def test_toPandas_nested_array_with_map_empty_outer(self):
    # Same empty-outer-array scenario as above, but with a map as the
    # innermost element type: array<array<map<string,string>>>.
    inner = ArrayType(MapType(StringType(), StringType()))
    schema = StructType([StructField("data", ArrayType(inner))])
    df = self.spark.createDataFrame([Row(data=[])], schema=schema)
    pdf = df.toPandas()
    self.assertEqual(1, len(pdf))
    self.assertEqual(0, len(pdf["data"][0]))
1908+
18741909

18751910
@unittest.skipIf(
18761911
not have_pandas or not have_pyarrow,

sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -875,4 +875,45 @@ class ArrowWriterSuite extends SparkFunSuite {
875875
assert(map2.keyArray().array().mkString(",") == Array(1).mkString(","))
876876
assert(stringRepr(map2) == Array("bob", "40").mkString(","))
877877
}
878+
879+
test("SPARK-55056: triple nested array with empty outer array") {
  // Schema: array<array<array<string>>>.
  // SPARK-55056: this used to SIGSEGV before the upstream arrow-java fix.
  // With an empty outer array the second-level ArrayWriter is never
  // invoked, so its count stays 0. The Arrow format requires a ListArray
  // offset buffer of N+1 entries even when N=0, but getBufferSizeFor(0)
  // returned 0 and the buffer was omitted.
  val schema = new StructType().add("data", ArrayType(ArrayType(ArrayType(StringType))))
  val writer = ArrowWriter.create(schema, null)
  assert(writer.schema === schema)

  // A single row whose outer array is empty.
  writer.write(InternalRow(ArrayData.toArrayData(Array.empty)))
  writer.finish()

  val column = new ArrowColumnVector(writer.root.getFieldVectors().get(0))
  assert(column.getArray(0).numElements() === 0)

  writer.root.close()
}
901+
902+
test("SPARK-55056: nested array with map inside empty outer array") {
  // Schema: array<array<map<string, string>>> — same empty-outer-array
  // scenario, with a map as the innermost element type.
  val schema = new StructType().add("data", ArrayType(ArrayType(MapType(StringType, StringType))))
  val writer = ArrowWriter.create(schema, null)
  assert(writer.schema === schema)

  // A single row whose outer array is empty.
  writer.write(InternalRow(ArrayData.toArrayData(Array.empty)))
  writer.finish()

  val column = new ArrowColumnVector(writer.root.getFieldVectors().get(0))
  assert(column.getArray(0).numElements() === 0)

  writer.root.close()
}
878919
}

0 commit comments

Comments
 (0)