Skip to content

Commit ba0ebf2

Browse files
andygrove and claude committed
fix: fall back to native_iceberg_compat for row index columns in native_datafusion scan
When the native_datafusion scan encounters row index metadata columns (_tmp_metadata_row_index), fall back to native_iceberg_compat instead of falling back to Spark. This fixes the issue where row index columns were not being populated when using the native_datafusion scan mode.

Closes #3317

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 2f64b60 commit ba0ebf2

2 files changed

Lines changed: 5 additions & 45 deletions

File tree

dev/diffs/3.5.8.diff

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -2138,49 +2138,6 @@ index 5e01d3f447c..284d6657d4f 100644
21382138
withTempDir { dir =>
21392139
val readSchema =
21402140
new StructType()
2141-
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileMetadataStructRowIndexSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileMetadataStructRowIndexSuite.scala
2142-
index c10e1799702..ba6629abfd9 100644
2143-
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileMetadataStructRowIndexSuite.scala
2144-
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileMetadataStructRowIndexSuite.scala
2145-
@@ -16,7 +16,7 @@
2146-
*/
2147-
package org.apache.spark.sql.execution.datasources.parquet
2148-
2149-
-import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest}
2150-
+import org.apache.spark.sql.{AnalysisException, DataFrame, IgnoreCometNativeDataFusion, QueryTest}
2151-
import org.apache.spark.sql.execution.datasources.FileFormat
2152-
import org.apache.spark.sql.functions.{col, lit}
2153-
import org.apache.spark.sql.internal.SQLConf
2154-
@@ -154,7 +154,8 @@ class ParquetFileMetadataStructRowIndexSuite extends QueryTest with SharedSparkS
2155-
}
2156-
}
2157-
2158-
- test(s"reading ${ROW_INDEX_TEMPORARY_COLUMN_NAME} - not present in a table") {
2159-
+ test(s"reading ${ROW_INDEX_TEMPORARY_COLUMN_NAME} - not present in a table",
2160-
+ IgnoreCometNativeDataFusion("https://github.com/apache/datafusion-comet/issues/3317")) {
2161-
// File format supporting row index generation populates the column with row indexes.
2162-
withReadDataFrame("parquet", extraSchemaFields =
2163-
Seq(StructField(ROW_INDEX_TEMPORARY_COLUMN_NAME, LongType))) { df =>
2164-
@@ -172,7 +173,8 @@ class ParquetFileMetadataStructRowIndexSuite extends QueryTest with SharedSparkS
2165-
}
2166-
}
2167-
2168-
- test(s"reading ${ROW_INDEX_TEMPORARY_COLUMN_NAME} - present in a table") {
2169-
+ test(s"reading ${ROW_INDEX_TEMPORARY_COLUMN_NAME} - present in a table",
2170-
+ IgnoreCometNativeDataFusion("https://github.com/apache/datafusion-comet/issues/3317")) {
2171-
withReadDataFrame("parquet", extraCol = ROW_INDEX_TEMPORARY_COLUMN_NAME) { df =>
2172-
// Values of ROW_INDEX_TEMPORARY_COLUMN_NAME column are always populated with
2173-
// generated row indexes, rather than read from the file.
2174-
@@ -189,7 +191,8 @@ class ParquetFileMetadataStructRowIndexSuite extends QueryTest with SharedSparkS
2175-
}
2176-
}
2177-
2178-
- test(s"reading ${ROW_INDEX_TEMPORARY_COLUMN_NAME} - as partition col") {
2179-
+ test(s"reading ${ROW_INDEX_TEMPORARY_COLUMN_NAME} - as partition col",
2180-
+ IgnoreCometNativeDataFusion("https://github.com/apache/datafusion-comet/issues/3317")) {
2181-
withReadDataFrame("parquet", partitionCol = ROW_INDEX_TEMPORARY_COLUMN_NAME) { df =>
2182-
// Column values are set for each partition, rather than populated with generated row indexes.
2183-
assert(df
21842141
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala
21852142
index 8e88049f51e..49f2001dc6b 100644
21862143
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala

spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -198,8 +198,11 @@ case class CometScanRule(session: SparkSession) extends Rule[SparkPlan] with Com
198198
return None
199199
}
200200
if (ShimFileFormat.findRowIndexColumnIndexInSchema(scanExec.requiredSchema) >= 0) {
201-
withInfo(scanExec, "Native DataFusion scan does not support row index generation")
202-
return None
201+
withInfo(
202+
scanExec,
203+
"Native DataFusion scan does not support row index generation," +
204+
" falling back to native_iceberg_compat")
205+
return nativeIcebergCompatScan(session, scanExec, r, hadoopConf)
203206
}
204207
if (!isSchemaSupported(scanExec, SCAN_NATIVE_DATAFUSION, r)) {
205208
return None

0 commit comments

Comments (0)