Skip to content

Commit 4a7df41

Browse files
committed
Merge branch 'main' into arrays_zip
2 parents c3e2b2f + 87291f4 commit 4a7df41

173 files changed

Lines changed: 1745 additions & 609 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.claude/skills/review-comet-pr/SKILL.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,8 +149,6 @@ Categories include: `aggregate/`, `array/`, `string/`, `math/`, `struct/`, `map/
149149
**SQL file structure:**
150150

151151
```sql
152-
-- ConfigMatrix: parquet.enable.dictionary=false,true
153-
154152
-- Create test data
155153
statement
156154
CREATE TABLE test_crc32(col string, a int, b float) USING parquet

.github/workflows/codeql.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,11 @@ jobs:
4545
persist-credentials: false
4646

4747
- name: Initialize CodeQL
48-
uses: github/codeql-action/init@c10b8064de6f491fea524254123dbe5e09572f13 # v4
48+
uses: github/codeql-action/init@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4
4949
with:
5050
languages: actions
5151

5252
- name: Perform CodeQL Analysis
53-
uses: github/codeql-action/analyze@c10b8064de6f491fea524254123dbe5e09572f13 # v4
53+
uses: github/codeql-action/analyze@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4
5454
with:
5555
category: "/language:actions"

common/src/main/scala/org/apache/comet/CometConf.scala

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -222,31 +222,28 @@ object CometConf extends ShimCometConf {
222222

223223
val COMET_CONVERT_FROM_PARQUET_ENABLED: ConfigEntry[Boolean] =
224224
conf("spark.comet.convert.parquet.enabled")
225-
.category(CATEGORY_TESTING)
225+
.category(CATEGORY_EXEC)
226226
.doc(
227227
"When enabled, data from Spark (non-native) Parquet v1 and v2 scans will be converted to " +
228-
"Arrow format. This is an experimental feature and has known issues with " +
229-
"non-UTC timezones.")
228+
"Arrow format.")
230229
.booleanConf
231230
.createWithDefault(false)
232231

233232
val COMET_CONVERT_FROM_JSON_ENABLED: ConfigEntry[Boolean] =
234233
conf("spark.comet.convert.json.enabled")
235-
.category(CATEGORY_TESTING)
234+
.category(CATEGORY_EXEC)
236235
.doc(
237236
"When enabled, data from Spark (non-native) JSON v1 and v2 scans will be converted to " +
238-
"Arrow format. This is an experimental feature and has known issues with " +
239-
"non-UTC timezones.")
237+
"Arrow format.")
240238
.booleanConf
241239
.createWithDefault(false)
242240

243241
val COMET_CONVERT_FROM_CSV_ENABLED: ConfigEntry[Boolean] =
244242
conf("spark.comet.convert.csv.enabled")
245-
.category(CATEGORY_TESTING)
243+
.category(CATEGORY_EXEC)
246244
.doc(
247245
"When enabled, data from Spark (non-native) CSV v1 and v2 scans will be converted to " +
248-
"Arrow format. This is an experimental feature and has known issues with " +
249-
"non-UTC timezones.")
246+
"Arrow format.")
250247
.booleanConf
251248
.createWithDefault(false)
252249

@@ -743,17 +740,17 @@ object CometConf extends ShimCometConf {
743740

744741
val COMET_SPARK_TO_ARROW_ENABLED: ConfigEntry[Boolean] =
745742
conf("spark.comet.sparkToColumnar.enabled")
746-
.category(CATEGORY_TESTING)
743+
.category(CATEGORY_EXEC)
747744
.doc("Whether to enable Spark to Arrow columnar conversion. When this is turned on, " +
748745
"Comet will convert operators in " +
749746
"`spark.comet.sparkToColumnar.supportedOperatorList` into Arrow columnar format before " +
750-
"processing. This is an experimental feature and has known issues with non-UTC timezones.")
747+
"processing.")
751748
.booleanConf
752749
.createWithDefault(false)
753750

754751
val COMET_SPARK_TO_ARROW_SUPPORTED_OPERATOR_LIST: ConfigEntry[Seq[String]] =
755752
conf("spark.comet.sparkToColumnar.supportedOperatorList")
756-
.category(CATEGORY_TESTING)
753+
.category(CATEGORY_EXEC)
757754
.doc("A comma-separated list of operators that will be converted to Arrow columnar " +
758755
s"format when `${COMET_SPARK_TO_ARROW_ENABLED.key}` is true.")
759756
.stringConf

dev/changelog/0.14.1.md

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
<!--
2+
Licensed to the Apache Software Foundation (ASF) under one
3+
or more contributor license agreements. See the NOTICE file
4+
distributed with this work for additional information
5+
regarding copyright ownership. The ASF licenses this file
6+
to you under the Apache License, Version 2.0 (the
7+
"License"); you may not use this file except in compliance
8+
with the License. You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing,
13+
software distributed under the License is distributed on an
14+
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
KIND, either express or implied. See the License for the
16+
specific language governing permissions and limitations
17+
under the License.
18+
-->
19+
20+
# DataFusion Comet 0.14.1 Changelog
21+
22+
This release consists of 5 commits from 1 contributor. See credits at the end of this changelog for more information.
23+
24+
**Fixed bugs:**
25+
26+
- fix: [branch-0.14] backport #3802 - cache object stores and bucket regions to reduce DNS query volume [#3935](https://github.com/apache/datafusion-comet/pull/3935) (andygrove)
27+
- fix: [branch-0.14] backport #3924 - share unified memory pools across native execution contexts [#3938](https://github.com/apache/datafusion-comet/pull/3938) (andygrove)
28+
- fix: [branch-0.14] backport #3879 - skip Comet columnar shuffle for stages with DPP scans [#3934](https://github.com/apache/datafusion-comet/pull/3934) (andygrove)
29+
- fix: [branch-0.14] backport #3914 - use min instead of max when capping write buffer size to Int range [#3936](https://github.com/apache/datafusion-comet/pull/3936) (andygrove)
30+
- fix: [branch-0.14] backport #3865 - handle ambiguous and non-existent local times [#3937](https://github.com/apache/datafusion-comet/pull/3937) (andygrove)
31+
32+
## Credits
33+
34+
Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor.
35+
36+
```
37+
5 Andy Grove
38+
```
39+
40+
Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release.

dev/release/generate-changelog.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,18 @@ def generate_changelog(repo, repo_name, tag1, tag2, version):
142142
print("Thank you also to everyone who contributed in other ways such as filing issues, reviewing "
143143
"PRs, and providing feedback on this release.\n")
144144

145+
def resolve_ref(ref):
146+
"""Resolve a git ref (e.g. HEAD, branch name) to a full commit SHA."""
147+
try:
148+
return subprocess.check_output(
149+
["git", "rev-parse", ref], text=True
150+
).strip()
151+
except subprocess.CalledProcessError:
152+
# If it can't be resolved locally, return as-is (e.g. a tag name
153+
# that the GitHub API can resolve)
154+
return ref
155+
156+
145157
def cli(args=None):
146158
"""Process command line arguments."""
147159
if not args:
@@ -153,12 +165,18 @@ def cli(args=None):
153165
parser.add_argument("version", help="The version number to include in the changelog")
154166
args = parser.parse_args()
155167

168+
# Resolve refs to SHAs so the GitHub API compares the same commits
169+
# as the local git log. Without this, refs like HEAD get resolved by
170+
# the GitHub API to the default branch instead of the current branch.
171+
tag1 = resolve_ref(args.tag1)
172+
tag2 = resolve_ref(args.tag2)
173+
156174
token = os.getenv("GITHUB_TOKEN")
157175
project = "apache/datafusion-comet"
158176

159177
g = Github(token)
160178
repo = g.get_repo(project)
161-
generate_changelog(repo, project, args.tag1, args.tag2, args.version)
179+
generate_changelog(repo, project, tag1, tag2, args.version)
162180

163181
if __name__ == "__main__":
164182
cli()

dev/release/rat_exclude_files.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ spark/src/test/resources/tpcds-query-results/*.out
2424
spark/src/test/resources/tpcds-micro-benchmarks/*.sql
2525
spark/src/test/resources/tpcds-plan-stability/approved-plans*/**/explain.txt
2626
spark/src/test/resources/tpcds-plan-stability/approved-plans*/**/simplified.txt
27+
spark/src/test/resources/tpcds-plan-stability/approved-plans*/**/extended.txt
2728
spark/src/test/resources/tpch-query-results/*.out
2829
spark/src/test/resources/tpch-extended/q*.sql
2930
spark/src/test/resources/test-data/*.csv

docs/source/contributor-guide/adding_a_new_expression.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -217,8 +217,6 @@ It is important to verify that the new expression is correctly recognized by the
217217
Create a `.sql` file under the appropriate subdirectory in `spark/src/test/resources/sql-tests/expressions/` (e.g., `string/`, `math/`, `array/`). The file should create a table with test data, then run queries that exercise the expression. Here is an example for the `unhex` expression:
218218

219219
```sql
220-
-- ConfigMatrix: parquet.enable.dictionary=false,true
221-
222220
statement
223221
CREATE TABLE test_unhex(col string) USING parquet
224222

docs/source/contributor-guide/release_process.md

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -302,8 +302,13 @@ Creating Nexus staging repository
302302
In the Nexus repository UI (https://repository.apache.org/) locate and verify the artifacts in
303303
staging (https://central.sonatype.org/publish/release/#locate-and-examine-your-staging-repository).
304304

305-
If the artifacts appear to be correct, then close and release the repository so it is made visible (this should
306-
actually happen automatically when running the script).
305+
The script closes the staging repository but does not release it. Releasing to Maven Central is a manual step
306+
performed only after the vote passes (see [Publishing Maven Artifacts](#publishing-maven-artifacts) below).
307+
308+
Note that the Maven artifacts are always published under the final release version (e.g. `0.13.0`), not the RC
309+
version — the `-rc1` / `-rc2` suffix only appears in the git tag and the source tarball in SVN. Because the script
310+
creates a new staging repository on each run, re-staging the same version for a subsequent RC is supported as long
311+
as no staging repository for that version has been released to Maven Central.
307312

308313
### Create the Release Candidate Tarball
309314

@@ -345,6 +350,13 @@ If the vote does not pass, address the issues raised, increment the release cand
345350
the [Tag the Release Candidate](#tag-the-release-candidate) step. For example, the next attempt would be tagged
346351
`0.13.0-rc2`.
347352

353+
Before staging the next RC, drop the previous RC's staging repository in the
354+
[Nexus UI](https://repository.apache.org/#stagingRepositories) by selecting it and clicking "Drop". This avoids
355+
leaving multiple closed staging repositories for the same version and prevents accidentally releasing the wrong
356+
one when the vote eventually passes. The Maven version (e.g. `0.13.0`) is shared across all RCs, so each run of
357+
`publish-to-maven.sh` creates a new staging repository for the same GAV — only one of them should ever be
358+
released to Maven Central.
359+
348360
## Publishing Binary Releases
349361

350362
Once the vote passes, we can publish the source and binary releases.

docs/source/contributor-guide/roadmap.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ helpful to have a roadmap for some of the major items that require coordination
2626

2727
### Iceberg Integration
2828

29-
Iceberg tables reads are now fully native, powered by a scan operator backed by Iceberg-rust ([#2528]). We anticipate
30-
major improvements expected in the next few releases, including bringing Iceberg table format V3 features (_e.g._,
29+
Reads of Iceberg tables with Parquet data files are fully native and enabled by default, powered by a scan operator
30+
backed by Iceberg-rust ([#2528]). We anticipate major improvements in the next few releases, including bringing Iceberg table format V3 features (_e.g._,
3131
encryption) to the reader.
3232

3333
[#2528]: https://github.com/apache/datafusion-comet/pull/2528

docs/source/contributor-guide/sql-file-tests.md

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,6 @@ A test file consists of SQL comments, directives, statements, and queries separa
7676
lines. Here is a minimal example:
7777

7878
```sql
79-
-- ConfigMatrix: parquet.enable.dictionary=false,true
80-
8179
statement
8280
CREATE TABLE test_abs(v double) USING parquet
8381

@@ -106,16 +104,19 @@ Runs the entire file once per combination of values. Multiple `ConfigMatrix` lin
106104
cross product of all combinations.
107105

108106
```sql
109-
-- ConfigMatrix: parquet.enable.dictionary=false,true
107+
-- ConfigMatrix: spark.sql.optimizer.inSetConversionThreshold=100,0
110108
```
111109

112110
This generates two test cases:
113111

114112
```
115-
sql-file: expressions/cast/cast.sql [parquet.enable.dictionary=false]
116-
sql-file: expressions/cast/cast.sql [parquet.enable.dictionary=true]
113+
sql-file: expressions/conditional/in_set.sql [spark.sql.optimizer.inSetConversionThreshold=100]
114+
sql-file: expressions/conditional/in_set.sql [spark.sql.optimizer.inSetConversionThreshold=0]
117115
```
118116

117+
Only add a `ConfigMatrix` directive when there is a real reason to run the test under
118+
multiple configurations. Do not add `ConfigMatrix` directives speculatively.
119+
119120
#### `MinSparkVersion`
120121

121122
Skips the file when running on a Spark version older than the specified version.
@@ -223,12 +224,9 @@ SELECT array(1, 2, 3)[10]
223224

224225
2. Add the Apache license header as a SQL comment.
225226

226-
3. Add a `ConfigMatrix` directive if the test should run with multiple Parquet configurations.
227-
Most expression tests use:
228-
229-
```sql
230-
-- ConfigMatrix: parquet.enable.dictionary=false,true
231-
```
227+
3. Add a `ConfigMatrix` directive only if the test needs to run under multiple configurations
228+
(e.g., testing behavior that varies with a specific Spark config). Do not add `ConfigMatrix`
229+
directives speculatively.
232230

233231
4. Create tables and insert test data using `statement` blocks. Include edge cases such as
234232
`NULL`, boundary values, and negative numbers.

0 commit comments

Comments
 (0)