Skip to content

Commit 9ecf53f

Browse files
andygrove and claude authored
fix: preserve partitioning in CometNativeScanExec for bucketed scans (#3392)
This fixes test failures when `native_datafusion` is enabled (issue #3315): 1. CometNativeScanExec now preserves the original outputPartitioning for bucketed scans, matching the pattern used by CometScanExec. Previously it always returned UnknownPartitioning, causing BroadcastJoinSuite tests to fail when they expected PartitioningCollection. 2. Updated diff files to accept CometNativeScanExec in the FileDataSourceV2FallBackSuite "Fallback Parquet V2 to V1" test, which checks for FileSourceScanExec or CometScanExec in the plan. Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 7886a3d commit 9ecf53f

4 files changed

Lines changed: 16 additions & 8 deletions

File tree

dev/diffs/3.4.3.diff

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1157,7 +1157,7 @@ index cfc8b2cc845..c6fcfd7bd08 100644
11571157
import org.apache.spark.SparkConf
11581158
import org.apache.spark.sql.{AnalysisException, QueryTest}
11591159
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
1160-
+import org.apache.spark.sql.comet.CometScanExec
1160+
+import org.apache.spark.sql.comet.{CometNativeScanExec, CometScanExec}
11611161
import org.apache.spark.sql.connector.catalog.{SupportsRead, SupportsWrite, Table, TableCapability}
11621162
import org.apache.spark.sql.connector.read.ScanBuilder
11631163
import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder}
@@ -1167,7 +1167,7 @@ index cfc8b2cc845..c6fcfd7bd08 100644
11671167
assert(
11681168
- df.queryExecution.executedPlan.exists(_.isInstanceOf[FileSourceScanExec]))
11691169
+ df.queryExecution.executedPlan.exists {
1170-
+ case _: FileSourceScanExec | _: CometScanExec => true
1170+
+ case _: FileSourceScanExec | _: CometScanExec | _: CometNativeScanExec => true
11711171
+ case _ => false
11721172
+ }
11731173
+ )

dev/diffs/3.5.8.diff

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1111,7 +1111,7 @@ index cfc8b2cc845..c6fcfd7bd08 100644
11111111
import org.apache.spark.SparkConf
11121112
import org.apache.spark.sql.{AnalysisException, QueryTest}
11131113
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
1114-
+import org.apache.spark.sql.comet.CometScanExec
1114+
+import org.apache.spark.sql.comet.{CometNativeScanExec, CometScanExec}
11151115
import org.apache.spark.sql.connector.catalog.{SupportsRead, SupportsWrite, Table, TableCapability}
11161116
import org.apache.spark.sql.connector.read.ScanBuilder
11171117
import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder}
@@ -1121,7 +1121,7 @@ index cfc8b2cc845..c6fcfd7bd08 100644
11211121
assert(
11221122
- df.queryExecution.executedPlan.exists(_.isInstanceOf[FileSourceScanExec]))
11231123
+ df.queryExecution.executedPlan.exists {
1124-
+ case _: FileSourceScanExec | _: CometScanExec => true
1124+
+ case _: FileSourceScanExec | _: CometScanExec | _: CometNativeScanExec => true
11251125
+ case _ => false
11261126
+ }
11271127
+ )

dev/diffs/4.0.1.diff

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1443,7 +1443,7 @@ index 2a0ab21ddb0..e8a5a891105 100644
14431443
import org.apache.spark.{SparkConf, SparkException}
14441444
import org.apache.spark.sql.QueryTest
14451445
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
1446-
+import org.apache.spark.sql.comet.CometScanExec
1446+
+import org.apache.spark.sql.comet.{CometNativeScanExec, CometScanExec}
14471447
import org.apache.spark.sql.connector.catalog.{SupportsRead, SupportsWrite, Table, TableCapability}
14481448
import org.apache.spark.sql.connector.read.ScanBuilder
14491449
import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder}
@@ -1453,7 +1453,7 @@ index 2a0ab21ddb0..e8a5a891105 100644
14531453
assert(
14541454
- df.queryExecution.executedPlan.exists(_.isInstanceOf[FileSourceScanExec]))
14551455
+ df.queryExecution.executedPlan.exists {
1456-
+ case _: FileSourceScanExec | _: CometScanExec => true
1456+
+ case _: FileSourceScanExec | _: CometScanExec | _: CometNativeScanExec => true
14571457
+ case _ => false
14581458
+ }
14591459
+ )

spark/src/main/scala/org/apache/spark/sql/comet/CometNativeScanExec.scala

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,16 @@ case class CometNativeScanExec(
6565
override val nodeName: String =
6666
s"CometNativeScan $relation ${tableIdentifier.map(_.unquotedString).getOrElse("")}"
6767

68-
override lazy val outputPartitioning: Partitioning =
69-
UnknownPartitioning(originalPlan.inputRDD.getNumPartitions)
68+
// exposed for testing
69+
lazy val bucketedScan: Boolean = originalPlan.bucketedScan && !disableBucketedScan
70+
71+
override lazy val outputPartitioning: Partitioning = {
72+
if (bucketedScan) {
73+
originalPlan.outputPartitioning
74+
} else {
75+
UnknownPartitioning(originalPlan.inputRDD.getNumPartitions)
76+
}
77+
}
7078

7179
override lazy val outputOrdering: Seq[SortOrder] = originalPlan.outputOrdering
7280

0 commit comments

Comments
 (0)