Add Spark 3.5.5 support with new compatibility shims

unkloud · unkloud · commit 7b587b499116 · 2025-03-23T21:20:54.000+11:00
diff --git a/common/src/main/spark-3.5.5/org/apache/comet/shims/ShimBatchReader.scala b/common/src/main/spark-3.5.5/org/apache/comet/shims/ShimBatchReader.scala
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet.shims
+
+import org.apache.spark.paths.SparkPath
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.execution.datasources.PartitionedFile
+
+object ShimBatchReader {
+
+  def newPartitionedFile(partitionValues: InternalRow, file: String): PartitionedFile =
+    PartitionedFile(
+      partitionValues,
+      SparkPath.fromPathString(file),
+      -1, // -1 means we read the entire file
+      -1,
+      Array.empty[String],
+      0,
+      0,
+      Map.empty)
+}
diff --git a/common/src/main/spark-3.5.5/org/apache/spark/sql/comet/shims/ShimCastOverflowException.scala b/common/src/main/spark-3.5.5/org/apache/spark/sql/comet/shims/ShimCastOverflowException.scala
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.comet.shims
+
+import org.apache.spark.SparkArithmeticException
+import org.apache.spark.sql.errors.QueryExecutionErrors.toSQLConf
+import org.apache.spark.sql.internal.SQLConf
+
+// TODO: Only the Spark 3.3 version of this class is different from the others.
+//       Remove this class after dropping Spark 3.3 support.
+class ShimCastOverflowException(t: String, from: String, to: String)
+  extends SparkArithmeticException(
+    "CAST_OVERFLOW",
+    Map(
+      "value" -> t,
+      "sourceType" -> s""""$from"""",
+      "targetType" -> s""""$to"""",
+      "ansiConfig" -> toSQLConf(SQLConf.ANSI_ENABLED.key)),
+    Array.empty,
+    "") {}
diff --git a/common/src/main/spark-3.5.5/org/apache/spark/sql/comet/shims/ShimTaskMetrics.scala b/common/src/main/spark-3.5.5/org/apache/spark/sql/comet/shims/ShimTaskMetrics.scala
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.comet.shims
+
+import scala.collection.mutable.ArrayBuffer
+
+import org.apache.spark.executor.TaskMetrics
+import org.apache.spark.util.AccumulatorV2
+
+object ShimTaskMetrics {
+
+  def getTaskAccumulator(taskMetrics: TaskMetrics): Option[AccumulatorV2[_, _]] =
+    taskMetrics.withExternalAccums(identity[ArrayBuffer[AccumulatorV2[_, _]]](_)).lastOption
+}
diff --git a/pom.xml b/pom.xml
@@ -585,6 +585,21 @@ under the License.
       </properties>
     </profile>
 
+    <profile>
+      <id>spark-3.5.5</id>
+      <properties>
+        <scala.version>2.12.18</scala.version>
+        <spark.version>3.5.5</spark.version>
+        <spark.version.short>3.5</spark.version.short>
+        <parquet.version>1.13.1</parquet.version>
+        <slf4j.version>2.0.7</slf4j.version>
+        <shims.minorVerSrc>spark-3.5.5</shims.minorVerSrc>
+        <shims.pre35Src>not-needed</shims.pre35Src>
+        <additional.pre35.test.source>not-needed</additional.pre35.test.source>
+        <additional.3_5.test.source>spark-3.5</additional.3_5.test.source>
+      </properties>
+    </profile>
+
     <profile>
       <!-- FIXME: this is WIP. Tests may fail https://github.com/apache/datafusion-comet/issues/551 -->
       <id>spark-4.0</id>
diff --git a/spark/src/main/spark-3.5.5/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-3.5.5/org/apache/comet/shims/CometExprShim.scala
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.comet.shims
+
+import org.apache.comet.expressions.CometEvalMode
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.types.{DataType, TimestampNTZType}
+
+/**
+ * `CometExprShim` acts as a shim for for parsing expressions from different Spark versions.
+ */
+trait CometExprShim {
+    /**
+     * Returns a tuple of expressions for the `unhex` function.
+     */
+    protected def unhexSerde(unhex: Unhex): (Expression, Expression) = {
+        (unhex.child, Literal(unhex.failOnError))
+    }
+
+    protected def isTimestampNTZType(dt: DataType): Boolean = dt match {
+        case _: TimestampNTZType => true
+        case _ => false
+    }
+
+    protected def evalMode(c: Cast): CometEvalMode.Value =
+        CometEvalModeUtil.fromSparkEvalMode(c.evalMode)
+}
+
+object CometEvalModeUtil {
+    def fromSparkEvalMode(evalMode: EvalMode.Value): CometEvalMode.Value = evalMode match {
+        case EvalMode.LEGACY => CometEvalMode.LEGACY
+        case EvalMode.TRY => CometEvalMode.TRY
+        case EvalMode.ANSI => CometEvalMode.ANSI
+    }
+}
+
diff --git a/spark/src/main/spark-3.5.5/org/apache/comet/shims/ShimSQLConf.scala b/spark/src/main/spark-3.5.5/org/apache/comet/shims/ShimSQLConf.scala
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet.shims
+
+import org.apache.spark.sql.internal.LegacyBehaviorPolicy
+import org.apache.spark.sql.internal.SQLConf
+
+trait ShimSQLConf {
+
+  /**
+   * Spark 3.4 renamed parquetFilterPushDownStringStartWith to
+   * parquetFilterPushDownStringPredicate
+   */
+  protected def getPushDownStringPredicate(sqlConf: SQLConf): Boolean =
+    sqlConf.parquetFilterPushDownStringPredicate
+
+  protected val LEGACY = LegacyBehaviorPolicy.LEGACY
+  protected val CORRECTED = LegacyBehaviorPolicy.CORRECTED
+}
diff --git a/spark/src/main/spark-3.5.5/org/apache/spark/sql/comet/shims/ShimCometScanExec.scala b/spark/src/main/spark-3.5.5/org/apache/spark/sql/comet/shims/ShimCometScanExec.scala
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.comet.shims
+
+
+import org.apache.hadoop.fs.Path
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression}
+import org.apache.spark.sql.errors.QueryExecutionErrors
+import org.apache.spark.sql.execution.{FileSourceScanExec, PartitionedFileUtil}
+import org.apache.spark.sql.execution.datasources._
+import org.apache.spark.sql.execution.datasources.parquet.ParquetOptions
+import org.apache.spark.sql.sources.Filter
+import org.apache.spark.sql.types.StructType
+
+trait ShimCometScanExec {
+  def wrapped: FileSourceScanExec
+
+  lazy val fileConstantMetadataColumns: Seq[AttributeReference] =
+    wrapped.fileConstantMetadataColumns
+
+  protected def newFileScanRDD(
+      fsRelation: HadoopFsRelation,
+      readFunction: PartitionedFile => Iterator[InternalRow],
+      filePartitions: Seq[FilePartition],
+      readSchema: StructType,
+      options: ParquetOptions): FileScanRDD = new FileScanRDD(
+    fsRelation.sparkSession,
+    readFunction,
+    filePartitions,
+    readSchema,
+    fileConstantMetadataColumns,
+    fsRelation.fileFormat.fileConstantMetadataExtractors,
+    options)
+
+  protected def invalidBucketFile(path: String, sparkVersion: String): Throwable =
+    QueryExecutionErrors.invalidBucketFile(path)
+
+  // see SPARK-39634
+  protected def isNeededForSchema(sparkSchema: StructType): Boolean = false
+
+  protected def getPartitionedFile(f: FileStatusWithMetadata, p: PartitionDirectory): PartitionedFile =
+    PartitionedFileUtil.getPartitionedFile(f, f.getPath, p.values)
+
+  protected def splitFiles(sparkSession: SparkSession,
+                           file: FileStatusWithMetadata,
+                           filePath: Path,
+                           isSplitable: Boolean,
+                           maxSplitBytes: Long,
+                           partitionValues: InternalRow): Seq[PartitionedFile] =
+    PartitionedFileUtil.splitFiles(sparkSession, file, filePath, isSplitable, maxSplitBytes, partitionValues)
+
+  protected def getPushedDownFilters(relation: HadoopFsRelation , dataFilters: Seq[Expression]):  Seq[Filter] = {
+    val supportNestedPredicatePushdown = DataSourceUtils.supportNestedPredicatePushdown(relation)
+    dataFilters.flatMap(DataSourceStrategy.translateFilter(_, supportNestedPredicatePushdown))
+  }
+}