
Commit e7db848

andygrove and claude committed
fix: Add Spark-compatible schema validation for native_datafusion scan (#3311)
Add schema validation in the native schema adapter that rejects type coercions and column resolutions that Spark's vectorized Parquet reader would reject, gated behind a new config `spark.comet.parquet.schemaValidation.enabled` (default: true). When enabled, the native scan rejects:

- TimestampLTZ <-> TimestampNTZ conversions
- Integer/float widening (Int32 -> Int64, Float32 -> Float64) unless schema evolution is enabled
- String/binary to timestamp or numeric conversions
- Scalar to complex type conversions
- Duplicate fields in case-insensitive mode

This allows 5 previously-ignored Spark SQL tests to pass with native_datafusion enabled.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 2f64b60 commit e7db848
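
For illustration, here is a minimal sketch of the kind of Spark-compatible rejection rules described above, written over Arrow types. The helper name `is_rejected_coercion` and the exact rule set are assumptions for this sketch, not the schema adapter code added by the commit:

```rust
// Hypothetical helper, not the actual Comet schema adapter: given the type
// found in the Parquet file and the type requested by Spark's read schema,
// decide whether the coercion should be rejected the way Spark's vectorized
// Parquet reader rejects it.
use arrow::datatypes::DataType;

fn is_rejected_coercion(
    file_type: &DataType,
    requested_type: &DataType,
    schema_evolution_enabled: bool,
) -> bool {
    use DataType::*;
    match (file_type, requested_type) {
        // TimestampLTZ <-> TimestampNTZ: timezone-awareness must match
        (Timestamp(_, Some(_)), Timestamp(_, None))
        | (Timestamp(_, None), Timestamp(_, Some(_))) => true,
        // Integer/float widening is only allowed when schema evolution is enabled
        (Int32, Int64) | (Float32, Float64) => !schema_evolution_enabled,
        // String/binary columns cannot be read as timestamp or numeric types
        (Utf8 | Binary, Timestamp(_, _) | Int32 | Int64 | Float32 | Float64) => true,
        // A scalar column cannot be resolved as a complex type
        (Int32 | Int64 | Float32 | Float64 | Utf8 | Binary, Struct(_) | List(_) | Map(_, _)) => {
            true
        }
        _ => false,
    }
}

fn main() {
    // With schema evolution disabled, Int32 -> Int64 widening is rejected,
    // matching Spark's vectorized Parquet reader.
    assert!(is_rejected_coercion(&DataType::Int32, &DataType::Int64, false));
    assert!(!is_rejected_coercion(&DataType::Int32, &DataType::Int64, true));
}
```

The real adapter also has to handle nested fields, decimals, and duplicate-field detection in case-insensitive mode, which this sketch omits.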

12 files changed

Lines changed: 244 additions & 59 deletions


common/src/main/scala/org/apache/comet/CometConf.scala

Lines changed: 11 additions & 0 deletions
@@ -157,6 +157,17 @@ object CometConf extends ShimCometConf {
       .booleanConf
       .createWithDefault(false)
 
+  val COMET_PARQUET_SCHEMA_VALIDATION_ENABLED: ConfigEntry[Boolean] =
+    conf("spark.comet.parquet.schemaValidation.enabled")
+      .category(CATEGORY_PARQUET)
+      .doc(
+        "Whether to enable Spark-compatible schema validation when reading Parquet files " +
+          "with native_datafusion scan. When enabled, type coercions and column resolutions " +
+          "that Spark's vectorized reader would reject will also be rejected by Comet, " +
+          "throwing SparkException with compatible error messages.")
+      .booleanConf
+      .createWithDefault(true)
+
   val COMET_RESPECT_PARQUET_FILTER_PUSHDOWN: ConfigEntry[Boolean] =
     conf("spark.comet.parquet.respectFilterPushdown")
       .category(CATEGORY_PARQUET)

dev/diffs/3.5.8.diff

Lines changed: 4 additions & 54 deletions
@@ -2271,17 +2271,7 @@ index 8e88049f51e..49f2001dc6b 100644
       val schema = StructType(Seq(
         StructField("a", IntegerType, nullable = false)
       ))
-@@ -1933,7 +1949,8 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared
-     }
-   }
- 
--  test("SPARK-25207: exception when duplicate fields in case-insensitive mode") {
-+  test("SPARK-25207: exception when duplicate fields in case-insensitive mode",
-+    IgnoreCometNativeDataFusion("https://github.com/apache/datafusion-comet/issues/3311")) {
-     withTempPath { dir =>
-       val count = 10
-       val tableName = "spark_25207"
-@@ -1984,7 +2001,8 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared
+@@ -1984,7 +2000,8 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared
       }
     }
 
@@ -2331,27 +2321,7 @@ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/
 index 8ed9ef1630e..f312174b182 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
 +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
-@@ -1064,7 +1064,8 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession
-     }
-   }
- 
--  test("SPARK-35640: read binary as timestamp should throw schema incompatible error") {
-+  test("SPARK-35640: read binary as timestamp should throw schema incompatible error",
-+    IgnoreCometNativeDataFusion("https://github.com/apache/datafusion-comet/issues/3311")) {
-     val data = (1 to 4).map(i => Tuple1(i.toString))
-     val readSchema = StructType(Seq(StructField("_1", DataTypes.TimestampType)))
- 
-@@ -1075,7 +1076,8 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession
-     }
-   }
- 
--  test("SPARK-35640: int as long should throw schema incompatible error") {
-+  test("SPARK-35640: int as long should throw schema incompatible error",
-+    IgnoreCometNativeDataFusion("https://github.com/apache/datafusion-comet/issues/3311")) {
-     val data = (1 to 4).map(i => Tuple1(i))
-     val readSchema = StructType(Seq(StructField("_1", DataTypes.LongType)))
- 
-@@ -1345,7 +1347,8 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession
+@@ -1345,7 +1345,8 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession
       }
     }
 
@@ -2365,17 +2335,7 @@ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/
 index f6472ba3d9d..ce39ebb52e6 100644
 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala
 +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala
-@@ -185,7 +185,8 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS
-     }
-   }
- 
--  test("SPARK-36182: can't read TimestampLTZ as TimestampNTZ") {
-+  test("SPARK-36182: can't read TimestampLTZ as TimestampNTZ",
-+    IgnoreCometNativeDataFusion("https://github.com/apache/datafusion-comet/issues/3311")) {
-     val data = (1 to 1000).map { i =>
-       val ts = new java.sql.Timestamp(i)
-       Row(ts)
-@@ -998,7 +999,8 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS
+@@ -998,7 +998,8 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS
       }
     }
 
@@ -2415,17 +2375,7 @@ index f6472ba3d9d..ce39ebb52e6 100644
       checkAnswer(readParquet("a DECIMAL(3, 2)", path), sql("SELECT 1.00"))
       checkAnswer(readParquet("b DECIMAL(3, 2)", path), Row(null))
       checkAnswer(readParquet("b DECIMAL(11, 1)", path), sql("SELECT 123456.0"))
-@@ -1133,7 +1138,8 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS
-     }
-   }
- 
--  test("row group skipping doesn't overflow when reading into larger type") {
-+  test("row group skipping doesn't overflow when reading into larger type",
-+    IgnoreCometNativeDataFusion("https://github.com/apache/datafusion-comet/issues/3311")) {
-     withTempPath { path =>
-       Seq(0).toDF("a").write.parquet(path.toString)
-       // The vectorized and non-vectorized readers will produce different exceptions, we don't need
-@@ -1148,7 +1154,7 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS
+@@ -1148,7 +1152,7 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS
       .where(s"a < ${Long.MaxValue}")
       .collect()
     }

native/core/src/execution/planner.rs

Lines changed: 2 additions & 0 deletions
@@ -1053,6 +1053,8 @@ impl PhysicalPlanner {
                 default_values,
                 scan.session_timezone.as_str(),
                 scan.case_sensitive,
+                scan.schema_validation_enabled,
+                scan.schema_evolution_enabled,
                 self.session_ctx(),
                 scan.encryption_enabled,
             )?;

native/core/src/execution/spark_config.rs

Lines changed: 3 additions & 0 deletions
@@ -21,6 +21,9 @@ pub(crate) const COMET_TRACING_ENABLED: &str = "spark.comet.tracing.enabled";
 pub(crate) const COMET_DEBUG_ENABLED: &str = "spark.comet.debug.enabled";
 pub(crate) const COMET_EXPLAIN_NATIVE_ENABLED: &str = "spark.comet.explain.native.enabled";
 pub(crate) const COMET_MAX_TEMP_DIRECTORY_SIZE: &str = "spark.comet.maxTempDirectorySize";
+#[allow(dead_code)]
+pub(crate) const COMET_PARQUET_SCHEMA_VALIDATION_ENABLED: &str =
+    "spark.comet.parquet.schemaValidation.enabled";
 
 pub(crate) trait SparkConfig {
     fn get_bool(&self, name: &str) -> bool;
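
For context, the `SparkConfig` trait above is how native code looks up Spark session options by key. A toy illustration follows; the map-backed impl is an assumption made only for this sketch, and the commit itself threads the flag through the scan plan instead (which is why the new constant is marked `#[allow(dead_code)]`):

```rust
use std::collections::HashMap;

pub(crate) const COMET_PARQUET_SCHEMA_VALIDATION_ENABLED: &str =
    "spark.comet.parquet.schemaValidation.enabled";

pub(crate) trait SparkConfig {
    fn get_bool(&self, name: &str) -> bool;
}

// Toy implementation backed by a string map, standing in for the real config
// source; this impl exists only for this sketch.
impl SparkConfig for HashMap<String, String> {
    fn get_bool(&self, name: &str) -> bool {
        self.get(name).map(|v| v == "true").unwrap_or(false)
    }
}

fn main() {
    let mut conf = HashMap::new();
    conf.insert(
        COMET_PARQUET_SCHEMA_VALIDATION_ENABLED.to_string(),
        "true".to_string(),
    );
    // Look the flag up by its Spark key, defaulting to false when unset.
    assert!(conf.get_bool(COMET_PARQUET_SCHEMA_VALIDATION_ENABLED));
}
```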

native/core/src/parquet/mod.rs

Lines changed: 2 additions & 0 deletions
@@ -773,6 +773,8 @@ pub unsafe extern "system" fn Java_org_apache_comet_parquet_Native_initRecordBat
         None,
         session_timezone.as_str(),
         case_sensitive != JNI_FALSE,
+        false, // schema_validation_enabled - validation is done on the Java side
+        false, // schema_evolution_enabled
         session_ctx,
         encryption_enabled,
     )?;

native/core/src/parquet/parquet_exec.rs

Lines changed: 8 additions & 0 deletions
@@ -68,12 +68,16 @@ pub(crate) fn init_datasource_exec(
     default_values: Option<HashMap<usize, ScalarValue>>,
     session_timezone: &str,
     case_sensitive: bool,
+    schema_validation_enabled: bool,
+    schema_evolution_enabled: bool,
     session_ctx: &Arc<SessionContext>,
     encryption_enabled: bool,
 ) -> Result<Arc<DataSourceExec>, ExecutionError> {
     let (table_parquet_options, spark_parquet_options) = get_options(
         session_timezone,
         case_sensitive,
+        schema_validation_enabled,
+        schema_evolution_enabled,
         &object_store_url,
         encryption_enabled,
     );
@@ -142,6 +146,8 @@ pub(crate) fn init_datasource_exec(
 fn get_options(
     session_timezone: &str,
     case_sensitive: bool,
+    schema_validation_enabled: bool,
+    schema_evolution_enabled: bool,
     object_store_url: &ObjectStoreUrl,
     encryption_enabled: bool,
 ) -> (TableParquetOptions, SparkParquetOptions) {
@@ -153,6 +159,8 @@ fn get_options(
         SparkParquetOptions::new(EvalMode::Legacy, session_timezone, false);
     spark_parquet_options.allow_cast_unsigned_ints = true;
     spark_parquet_options.case_sensitive = case_sensitive;
+    spark_parquet_options.schema_validation_enabled = schema_validation_enabled;
+    spark_parquet_options.schema_evolution_enabled = schema_evolution_enabled;
 
     if encryption_enabled {
         table_parquet_options.crypto.configure_factory(

native/core/src/parquet/parquet_support.rs

Lines changed: 8 additions & 0 deletions
@@ -76,6 +76,10 @@ pub struct SparkParquetOptions {
     pub use_legacy_date_timestamp_or_ntz: bool,
     // Whether schema field names are case sensitive
     pub case_sensitive: bool,
+    /// Whether to validate schema compatibility (type coercions) in a Spark-compatible way
+    pub schema_validation_enabled: bool,
+    /// Whether schema evolution (type widening) is enabled
+    pub schema_evolution_enabled: bool,
 }
 
 impl SparkParquetOptions {
@@ -88,6 +92,8 @@ impl SparkParquetOptions {
             use_decimal_128: false,
             use_legacy_date_timestamp_or_ntz: false,
             case_sensitive: false,
+            schema_validation_enabled: true,
+            schema_evolution_enabled: false,
         }
     }
@@ -100,6 +106,8 @@ impl SparkParquetOptions {
             use_decimal_128: false,
             use_legacy_date_timestamp_or_ntz: false,
             case_sensitive: false,
+            schema_validation_enabled: true,
+            schema_evolution_enabled: false,
         }
     }
 }
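
A minimal sketch of how these fields get wired together, mirroring the `get_options` change in `parquet_exec.rs` above; the three-argument `SparkParquetOptions::new` and `EvalMode::Legacy` come from the diff, while the import paths and the wrapper function are assumptions:

```rust
use datafusion_comet_spark_expr::EvalMode; // assumed crate path
use crate::parquet::parquet_support::SparkParquetOptions; // assumed module path

fn build_options(
    session_timezone: &str,
    case_sensitive: bool,
    schema_validation_enabled: bool,
    schema_evolution_enabled: bool,
) -> SparkParquetOptions {
    // new() defaults schema_validation_enabled to true and
    // schema_evolution_enabled to false, matching the struct fields above;
    // the flags passed in by the caller override those defaults.
    let mut opts = SparkParquetOptions::new(EvalMode::Legacy, session_timezone, false);
    opts.case_sensitive = case_sensitive;
    opts.schema_validation_enabled = schema_validation_enabled;
    opts.schema_evolution_enabled = schema_evolution_enabled;
    opts
}
```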
