Skip to content

Commit 4966fe9

Browse files
committed
[SPARK-53967][PYTHON] Avoid intermediate pandas dataframe creation in df.toPandas
### What changes were proposed in this pull request?

Avoid intermediate pandas dataframe creation in `df.toPandas`

before: batches -> table -> intermediate pdf -> result pdf (based on `pa.Table.to_pandas`)
after: batches -> table -> result pdf (based on `pa.ChunkedArray.to_pandas`)

### Why are the changes needed?

the intermediate pandas dataframe can be skipped

simple benchmark in my local

```
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", "false")
spark.conf.set("spark.sql.execution.arrow.pyspark.selfDestruct.enabled", "true")

import time
from pyspark.sql import functions as sf

df = spark.range(1000000).select(
    (sf.col("id") % 2).alias("key"),
    sf.col("id").alias("v")
)
cols = {f"col_{i}": sf.lit(f"c{i}") for i in range(100)}
df = df.withColumns(cols)
df.cache()
df.count()

pdf = df.toPandas()  # warm up

start_arrow = time.perf_counter()
for i in range(100):
    pdf = df.toPandas()
time.perf_counter() - start_arrow
```

master: 304.49954012501985 secs
this PR: 285.2997682078276 secs

### Does this PR introduce _any_ user-facing change?

no

### How was this patch tested?

existing tests

### Was this patch authored or co-authored using generative AI tooling?

no

Closes #52680 from zhengruifeng/avoid_unnecessary_pdf_creation.

Authored-by: Ruifeng Zheng <ruifengz@apache.org>
Signed-off-by: Ruifeng Zheng <ruifengz@apache.org>
1 parent 9de0a27 commit 4966fe9

File tree

1 file changed

+43
-37
lines changed

1 file changed

+43
-37
lines changed

python/pyspark/sql/pandas/conversion.py

Lines changed: 43 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,9 @@ def toPandas(self) -> "PandasDataFrameLike":
8080
from pyspark.sql.pandas.utils import require_minimum_pyarrow_version
8181

8282
require_minimum_pyarrow_version()
83-
to_arrow_schema(self.schema, prefers_large_types=jconf.arrowUseLargeVarTypes())
83+
arrow_schema = to_arrow_schema(
84+
self.schema, prefers_large_types=jconf.arrowUseLargeVarTypes()
85+
)
8486
except Exception as e:
8587
if jconf.arrowPySparkFallbackEnabled():
8688
msg = (
@@ -112,41 +114,40 @@ def toPandas(self) -> "PandasDataFrameLike":
112114

113115
self_destruct = jconf.arrowPySparkSelfDestructEnabled()
114116
batches = self._collect_as_arrow(split_batches=self_destruct)
115-
if len(batches) > 0:
116-
table = pa.Table.from_batches(batches)
117-
# Ensure only the table has a reference to the batches, so that
118-
# self_destruct (if enabled) is effective
119-
del batches
120-
# Pandas DataFrame created from PyArrow uses datetime64[ns] for date type
121-
# values, but we should use datetime.date to match the behavior with when
122-
# Arrow optimization is disabled.
123-
pandas_options = {
124-
"date_as_object": True,
125-
"coerce_temporal_nanoseconds": True,
126-
}
127-
if self_destruct:
128-
# Configure PyArrow to use as little memory as possible:
129-
# self_destruct - free columns as they are converted
130-
# split_blocks - create a separate Pandas block for each column
131-
# use_threads - convert one column at a time
132-
pandas_options.update(
133-
{
134-
"self_destruct": True,
135-
"split_blocks": True,
136-
"use_threads": False,
137-
}
138-
)
139-
# Rename columns to avoid duplicated column names.
140-
pdf = table.rename_columns(
141-
[f"col_{i}" for i in range(table.num_columns)]
142-
).to_pandas(**pandas_options)
143117

144-
# Rename back to the original column names.
145-
pdf.columns = self.columns
118+
# Rename columns to avoid duplicated column names.
119+
temp_col_names = [f"col_{i}" for i in range(len(self.columns))]
120+
if len(batches) > 0:
121+
table = pa.Table.from_batches(batches).rename_columns(temp_col_names)
146122
else:
147-
pdf = pd.DataFrame(columns=self.columns)
123+
# empty dataset
124+
table = arrow_schema.empty_table().rename_columns(temp_col_names)
125+
126+
# Ensure only the table has a reference to the batches, so that
127+
# self_destruct (if enabled) is effective
128+
del batches
129+
130+
# Pandas DataFrame created from PyArrow uses datetime64[ns] for date type
131+
# values, but we should use datetime.date to match the behavior with when
132+
# Arrow optimization is disabled.
133+
pandas_options = {
134+
"date_as_object": True,
135+
"coerce_temporal_nanoseconds": True,
136+
}
137+
if self_destruct:
138+
# Configure PyArrow to use as little memory as possible:
139+
# self_destruct - free columns as they are converted
140+
# split_blocks - create a separate Pandas block for each column
141+
# use_threads - convert one column at a time
142+
pandas_options.update(
143+
{
144+
"self_destruct": True,
145+
"split_blocks": True,
146+
"use_threads": False,
147+
}
148+
)
148149

149-
if len(pdf.columns) > 0:
150+
if len(self.columns) > 0:
150151
timezone = jconf.sessionLocalTimeZone()
151152
struct_in_pandas = jconf.pandasStructHandlingMode()
152153

@@ -155,21 +156,26 @@ def toPandas(self) -> "PandasDataFrameLike":
155156
error_on_duplicated_field_names = True
156157
struct_in_pandas = "dict"
157158

158-
return pd.concat(
159+
pdf = pd.concat(
159160
[
160161
_create_converter_to_pandas(
161162
field.dataType,
162163
field.nullable,
163164
timezone=timezone,
164165
struct_in_pandas=struct_in_pandas,
165166
error_on_duplicated_field_names=error_on_duplicated_field_names,
166-
)(pser)
167-
for (_, pser), field in zip(pdf.items(), self.schema.fields)
167+
)(arrow_col.to_pandas(**pandas_options))
168+
for arrow_col, field in zip(table.columns, self.schema.fields)
168169
],
169170
axis="columns",
170171
)
171172
else:
172-
return pdf
173+
# empty columns
174+
pdf = table.to_pandas(**pandas_options)
175+
176+
pdf.columns = self.columns
177+
return pdf
178+
173179
except Exception as e:
174180
# We might have to allow fallback here as well but multiple Spark jobs can
175181
# be executed. So, simply fail in this case for now.

0 commit comments

Comments (0)