@@ -89,6 +89,13 @@ object CometExecRule {
8989
9090 val allExecs : Map [Class [_ <: SparkPlan ], CometOperatorSerde [_]] = nativeExecs ++ sinks
9191
92+ /**
93+ * Tag set on a `ShuffleExchangeExec` that should be left as a plain Spark shuffle rather than
94+ * wrapped in `CometShuffleExchangeExec`. See `tagRedundantColumnarShuffle`.
95+ */
96+ val SKIP_COMET_SHUFFLE_TAG : org.apache.spark.sql.catalyst.trees.TreeNodeTag [Unit ] =
97+ org.apache.spark.sql.catalyst.trees.TreeNodeTag [Unit ]("comet.skipCometShuffle")
98+
9299}
93100
94101/**
@@ -108,9 +115,13 @@ case class CometExecRule(session: SparkSession)
108115 * row->arrow->shuffle->arrow->row conversion overhead with no Comet consumer on either side.
109116 * See https://github.com/apache/datafusion-comet/issues/4004.
110117 *
111- * The match is intentionally narrow (both sides must be row-based aggregates) so this does not
112- * interfere with non-relational plan shapes such as object-mode Dataset plans where the shuffle
113- * sits between encoder/serializer nodes.
118+ * The match is intentionally narrow (both sides must be row-based aggregates that remained JVM
119+ * after the main transform pass). Running the revert post-transform means we only fire when the
120+ * main conversion already decided to keep both aggregates JVM - we never create the dangerous
121+ * mixed mode where a Comet partial feeds a JVM final (see issue #1389).
122+ *
123+ * Also tag the reverted shuffle so AQE stage-isolated re-planning does not convert it back to a
124+ * Comet shuffle when the outer aggregate context is no longer visible.
114125 */
115126 private def revertRedundantColumnarShuffle (plan : SparkPlan ): SparkPlan = {
116127 def isAggregate (p : SparkPlan ): Boolean =
@@ -127,26 +138,35 @@ case class CometExecRule(session: SparkSession)
127138 val newChildren = op.children.map {
128139 case s : CometShuffleExchangeExec
129140 if s.shuffleType == CometColumnarShuffle && isAggregate(s.child) =>
130- s.originalPlan.withNewChildren(Seq (s.child)).asInstanceOf [SparkPlan ]
141+ val reverted =
142+ s.originalPlan.withNewChildren(Seq (s.child)).asInstanceOf [ShuffleExchangeExec ]
143+ reverted.setTagValue(CometExecRule .SKIP_COMET_SHUFFLE_TAG , ())
144+ reverted
131145 case other => other
132146 }
133147 op.withNewChildren(newChildren)
134148 }
135149 }
136150
151+ private def shouldSkipCometShuffle (s : ShuffleExchangeExec ): Boolean =
152+ s.getTagValue(CometExecRule .SKIP_COMET_SHUFFLE_TAG ).isDefined
153+
137154 private def applyCometShuffle (plan : SparkPlan ): SparkPlan = {
138- plan.transformUp { case s : ShuffleExchangeExec =>
139- CometShuffleExchangeExec .shuffleSupported(s) match {
140- case Some (CometNativeShuffle ) =>
141- // Switch to use Decimal128 regardless of precision, since Arrow native execution
142- // doesn't support Decimal32 and Decimal64 yet.
143- conf.setConfString(CometConf .COMET_USE_DECIMAL_128 .key, "true")
144- CometShuffleExchangeExec (s, shuffleType = CometNativeShuffle )
145- case Some (CometColumnarShuffle ) =>
146- CometShuffleExchangeExec (s, shuffleType = CometColumnarShuffle )
147- case None =>
148- s
149- }
155+ plan.transformUp {
156+ case s : ShuffleExchangeExec if shouldSkipCometShuffle(s) =>
157+ s
158+ case s : ShuffleExchangeExec =>
159+ CometShuffleExchangeExec .shuffleSupported(s) match {
160+ case Some (CometNativeShuffle ) =>
161+ // Switch to use Decimal128 regardless of precision, since Arrow native execution
162+ // doesn't support Decimal32 and Decimal64 yet.
163+ conf.setConfString(CometConf .COMET_USE_DECIMAL_128 .key, "true")
164+ CometShuffleExchangeExec (s, shuffleType = CometNativeShuffle )
165+ case Some (CometColumnarShuffle ) =>
166+ CometShuffleExchangeExec (s, shuffleType = CometColumnarShuffle )
167+ case None =>
168+ s
169+ }
150170 }
151171 }
152172
@@ -295,6 +315,9 @@ case class CometExecRule(session: SparkSession)
295315 case s @ ShuffleQueryStageExec (_, ReusedExchangeExec (_, _ : CometShuffleExchangeExec ), _) =>
296316 convertToComet(s, CometExchangeSink ).getOrElse(s)
297317
318+ case s : ShuffleExchangeExec if shouldSkipCometShuffle(s) =>
319+ s
320+
298321 case s : ShuffleExchangeExec =>
299322 convertToComet(s, CometShuffleExchangeExec ).getOrElse(s)
300323
@@ -498,10 +521,9 @@ case class CometExecRule(session: SparkSession)
498521 case CometScanWrapper (_, s) => s
499522 }
500523
501- // Revert CometColumnarShuffle to Spark's ShuffleExchangeExec when both the parent and
502- // the child are non-Comet (JVM) operators. In that case the Comet shuffle only adds
503- // row->arrow->arrow->row conversion overhead with no Comet operator on either side to
504- // benefit from columnar output. See https://github.com/apache/datafusion-comet/issues/4004.
524+ // Revert CometColumnarShuffle to Spark's ShuffleExchangeExec when sandwiched between two
525+ // non-Comet HashAggregate/ObjectHashAggregate operators that remained JVM after the main
526+ // transform pass. See https://github.com/apache/datafusion-comet/issues/4004.
505527 newPlan = revertRedundantColumnarShuffle(newPlan)
506528
507529 // Set up logical links
0 commit comments