@@ -30,6 +30,7 @@ import org.apache.spark.sql.catalyst.plans.QueryPlan
3030import org .apache .spark .sql .catalyst .plans .physical .{Partitioning , UnknownPartitioning }
3131import org .apache .spark .sql .comet .shims .ShimStreamSourceAwareSparkPlan
3232import org .apache .spark .sql .execution ._
33+ import org .apache .spark .sql .execution .{ScalarSubquery => ExecScalarSubquery }
3334import org .apache .spark .sql .execution .datasources ._
3435import org .apache .spark .sql .execution .metric .SQLMetric
3536import org .apache .spark .sql .types ._
@@ -41,6 +42,7 @@ import com.google.common.base.Objects
4142
4243import org .apache .comet .parquet .{CometParquetFileFormat , CometParquetUtils }
4344import org .apache .comet .serde .OperatorOuterClass .Operator
45+ import org .apache .comet .serde .QueryPlanSerde .exprToProto
4446
4547/**
4648 * Native scan operator for DataSource V1 Parquet files using DataFusion's ParquetExec.
@@ -77,23 +79,30 @@ case class CometNativeScanExec(
7779 override lazy val metadata : Map [String , String ] = originalPlan.metadata
7880
7981 /**
80- * Prepare DPP subquery plans before execution.
82+ * Prepare subquery plans before execution.
8183 *
82- * For non-AQE DPP, partitionFilters contains DynamicPruningExpression(InSubqueryExec(...))
83- * inserted by PlanDynamicPruningFilters (which runs before Comet rules). We call
84- * e.plan.prepare() here so that the subquery plans are set up before execution begins.
84+ * DPP: partitionFilters may contain DynamicPruningExpression(InSubqueryExec(...)) from
85+ * PlanDynamicPruningFilters.
8586 *
86- * Note: doPrepare() alone is NOT sufficient for DPP resolution. serializedPartitionData can be
87- * triggered from findAllPlanData (via commonData) on a BroadcastExchangeExec thread, outside
88- * the normal prepare() -> executeSubqueries() flow. The actual DPP resolution (updateResult)
89- * happens in serializedPartitionData below.
87+ * Scalar subquery pushdown (SPARK-43402, Spark 4.0+): dataFilters may contain ScalarSubquery.
88+ *
89+ * serializedPartitionData can be triggered outside the normal prepare() -> executeSubqueries()
90+ * flow (e.g., from a BroadcastExchangeExec thread), so we prepare subquery plans here and
91+ * resolve them explicitly in serializedPartitionData via updateResult().
9092 */
override protected def doPrepare(): Unit = {
  // DPP: only a DynamicPruningExpression that directly wraps an InSubqueryExec carries a
  // subquery plan to prepare; every other partition filter is ignored.
  partitionFilters
    .collect { case DynamicPruningExpression(inSubquery: InSubqueryExec) => inSubquery.plan }
    .foreach(_.prepare())

  // Scalar subqueries may be nested anywhere inside a data filter, so each filter's
  // expression tree is searched rather than just its root.
  for {
    dataFilter <- dataFilters
    scalarSubquery <- dataFilter.collect { case s: ExecScalarSubquery => s }
  } scalarSubquery.plan.prepare()

  super.doPrepare()
}
99108
@@ -138,7 +147,7 @@ case class CometNativeScanExec(
138147 //
139148 // originalPlan.inputRDD triggers FileSourceScanExec's full scan pipeline including
140149 // codegen on partition filter expressions. With DPP, this calls
141- // InSubqueryExec.doGenCode which requires the subquery to have finished — but
150+ // InSubqueryExec.doGenCode which requires the subquery to have finished - but
142151 // outputPartitioning can be accessed before prepare() runs (e.g., by
143152 // ValidateRequirements during plan validation).
144153 //
@@ -208,8 +217,40 @@ case class CometNativeScanExec(
208217 case _ =>
209218 }
210219 }
211- // Extract common data from nativeOp
212- val commonBytes = nativeOp.getNativeScan.getCommon.toByteArray
220+ // Resolve scalar subqueries in dataFilters and push to the native Parquet reader.
221+ // supportedDataFilters excludes PlanExpression at planning time (unresolved), so these
222+ // aren't in the serialized native plan yet. We resolve them here and append to the
223+ // NativeScanCommon protobuf. Same approach as FileSourceScanLike.pushedDownFilters
224+ // (DataSourceScanExec.scala), which resolves ScalarSubquery -> Literal at execution time.
225+ val commonBytes = {
// Start from the common scan message already stored in nativeOp.
226+ val base = nativeOp.getNativeScan.getCommon
// Keep only the data filters that contain at least one scalar subquery somewhere in
// their expression tree.
227+ val scalarSubqueryFilters = dataFilters
228+ .filter(_.exists(_.isInstanceOf [ExecScalarSubquery ]))
// Call updateResult() on every scalar subquery before any of them is evaluated below.
// NOTE(review): presumably eval() needs updateResult() to have run first - confirm
// against Spark's execution.ScalarSubquery.
229+ scalarSubqueryFilters.foreach { f =>
230+ f.foreach {
231+ case s : ExecScalarSubquery =>
232+ s.updateResult()
233+ case _ =>
234+ }
235+ }
// Rewrite each selected filter, replacing every scalar subquery node with a literal
// built from its evaluated value and its declared data type.
236+ val resolvedFilters = scalarSubqueryFilters
237+ .map(_.transform { case s : ExecScalarSubquery =>
238+ Literal .create(s.eval(null ), s.dataType)
239+ })
240+ if (resolvedFilters.nonEmpty) {
// Append every rewritten filter that serializes to protobuf. A filter that does not
// serialize is logged and skipped, so the remaining filters are still appended.
241+ val commonBuilder = base.toBuilder
242+ for (filter <- resolvedFilters) {
243+ exprToProto(filter, output) match {
244+ case Some (proto) => commonBuilder.addDataFilters(proto)
245+ case _ =>
246+ logWarning(s " Could not serialize resolved scalar subquery filter: $filter" )
247+ }
248+ }
249+ commonBuilder.build().toByteArray
250+ } else {
// No data filter contains a scalar subquery: serialize the common message unchanged.
251+ base.toByteArray
252+ }
253+ }
213254
214255 // Get file partitions from CometScanExec (handles bucketing, etc.)
215256 val filePartitions = scan.getFilePartitions()
@@ -299,13 +340,15 @@ case class CometNativeScanExec(
299340 case other : CometNativeScanExec =>
300341 this .originalPlan == other.originalPlan &&
301342 this .serializedPlanOpt == other.serializedPlanOpt &&
302- this .partitionFilters == other.partitionFilters
343+ this .partitionFilters == other.partitionFilters &&
344+ this .dataFilters == other.dataFilters
303345 case _ =>
304346 false
305347 }
306348 }
307349
308- override def hashCode (): Int = Objects .hashCode(originalPlan, serializedPlanOpt)
override def hashCode(): Int = {
  // Hash the same four fields that equals() compares, so scans that compare equal also
  // produce equal hash codes.
  Objects.hashCode(originalPlan, serializedPlanOpt, partitionFilters, dataFilters)
}
309352
310353 private val driverMetricKeys =
311354 Set (
0 commit comments