Skip to content

Commit 2cb6142

Browse files
authored
test: re-enable sql_hive-1 for Spark 4.0 and fix two small failures (#4047)
1 parent 3be3a34 commit 2cb6142

3 files changed

Lines changed: 36 additions & 15 deletions

File tree

.github/workflows/spark_sql_test.yml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -141,10 +141,6 @@ jobs:
141141
- {spark-short: '3.4', spark-full: '3.4.3', java: 11, scan-impl: 'auto'}
142142
- {spark-short: '3.5', spark-full: '3.5.8', java: 11, scan-impl: 'auto'}
143143
- {spark-short: '4.0', spark-full: '4.0.1', java: 17, scan-impl: 'auto'}
144-
# Skip sql_hive-1 for Spark 4.0 due to https://github.com/apache/datafusion-comet/issues/2946
145-
exclude:
146-
- config: {spark-short: '4.0', spark-full: '4.0.1', java: 17, scan-impl: 'auto'}
147-
module: {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
148144
fail-fast: false
149145
name: spark-sql-${{ matrix.config.scan-impl }}-${{ matrix.module.name }}/spark-${{ matrix.config.spark-full }}
150146
runs-on: ${{ matrix.os }}

dev/diffs/4.0.1.diff

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3830,16 +3830,44 @@ index 52abd248f3a..b4e096cae24 100644
38303830
case d: DynamicPruningExpression => d.child
38313831
}
38323832
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUDFDynamicLoadSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUDFDynamicLoadSuite.scala
3833-
index 4b27082e188..6710c90c789 100644
3833+
index 4b27082e188..057b2430872 100644
38343834
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUDFDynamicLoadSuite.scala
38353835
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUDFDynamicLoadSuite.scala
3836-
@@ -147,7 +147,9 @@ class HiveUDFDynamicLoadSuite extends QueryTest with SQLTestUtils with TestHiveS
3836+
@@ -17,7 +17,7 @@
3837+
3838+
package org.apache.spark.sql.hive
3839+
3840+
-import org.apache.spark.sql.{QueryTest, Row}
3841+
+import org.apache.spark.sql.{IgnoreCometSuite, QueryTest, Row}
3842+
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression}
3843+
import org.apache.spark.sql.hive.HiveShim.HiveFunctionWrapper
3844+
import org.apache.spark.sql.hive.test.TestHiveSingleton
3845+
@@ -26,7 +26,13 @@ import org.apache.spark.sql.types.{IntegerType, StringType}
3846+
import org.apache.spark.util.ArrayImplicits._
3847+
import org.apache.spark.util.Utils
3848+
3849+
-class HiveUDFDynamicLoadSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
3850+
+// Comet: mix in IgnoreCometSuite so these tests are reported as ignored when Comet is enabled
3851+
+// (ENABLE_COMET=true). The jar these tests depend on (`hive-test-udfs.jar`) is stripped from the
3852+
+// Spark 4.0.1 release source tag per the ASF binary-artifact policy, so the tests cannot run in
3853+
+// Comet's CI. Ignoring keeps the suite passing without masking real regressions; the upstream
3854+
+// tests still run in non-Comet Spark builds that ship the jar on branch-4.0.
3855+
+class HiveUDFDynamicLoadSuite extends QueryTest with SQLTestUtils with TestHiveSingleton
3856+
+ with IgnoreCometSuite {
3857+
3858+
case class UDFTestInformation(
3859+
identifier: String,
3860+
@@ -147,7 +153,13 @@ class HiveUDFDynamicLoadSuite extends QueryTest with SQLTestUtils with TestHiveS
38373861

38383862
// This jar file should not be placed to the classpath.
38393863
val jarPath = "src/test/noclasspath/hive-test-udfs.jar"
38403864
- assume(new java.io.File(jarPath).exists)
3841-
+ // Comet: hive-test-udfs.jar files has been removed from Apache Spark repository
3842-
+ // comment out the following line for now
3865+
+ // Comet: the upstream `assume(...)` runs here in the suite constructor (inside this foreach,
3866+
+ // before `test(...)` registers a case). When the jar is missing - as it is on the v4.0.1
3867+
+ // release tag - `assume` throws TestCanceledException out of `<init>`, which ScalaTest
3868+
+ // reports as a suite abort (not a per-test cancel) and fails the whole job. The
3869+
+ // IgnoreCometSuite mixin above already reroutes these tests to `ignore` under Comet, so
3870+
+ // the jar presence check is unnecessary; comment it out to avoid the constructor-time abort.
38433871
+ // assume(new java.io.File(jarPath).exists)
38443872
val jarUrl = s"file://${System.getProperty("user.dir")}/$jarPath"
38453873

spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimSparkErrorConverter.scala

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,12 @@
1919

2020
package org.apache.spark.sql.comet.shims
2121

22+
import java.io.FileNotFoundException
23+
2224
import scala.util.matching.Regex
2325

2426
import org.apache.spark.QueryContext
2527
import org.apache.spark.SparkException
26-
import org.apache.spark.SparkFileNotFoundException
2728
import org.apache.spark.sql.errors.QueryExecutionErrors
2829
import org.apache.spark.sql.types._
2930
import org.apache.spark.unsafe.types.UTF8String
@@ -292,17 +293,13 @@ trait ShimSparkErrorConverter {
292293

293294
case "FileNotFound" =>
294295
val msg = params("message").toString
295-
// Extract file path from native error message and format like Hadoop's
296-
// FileNotFoundException: "File <path> does not exist"
297296
val path = ShimSparkErrorConverter.ObjectLocationPattern
298297
.findFirstMatchIn(msg)
299298
.map(_.group(1))
300299
.getOrElse(msg)
301-
// readCurrentFileNotFoundError was removed in Spark 4.0; construct directly
302300
Some(
303-
new SparkFileNotFoundException(
304-
errorClass = "_LEGACY_ERROR_TEMP_2055",
305-
messageParameters = Map("message" -> s"File $path does not exist")))
301+
QueryExecutionErrors
302+
.fileNotExistError(path, new FileNotFoundException(s"File $path does not exist")))
306303

307304
case _ =>
308305
// Unknown error type - return None to trigger fallback

0 commit comments

Comments
 (0)