Skip to content

Commit b7ae2c6

Browse files
zhuqi-lucas and claude
authored and committed
feat: add sort pushdown benchmark and SLT tests (apache#21213)
## Which issue does this PR close?

- Related to apache#17348
- Precursor to apache#21182

## Rationale for this change

Add benchmark and integration tests for sort pushdown optimization, split out from apache#21182 per [reviewer request](apache#21182 (comment)). This allows comparing benchmark results before and after the optimization lands, and the SLT diff in apache#21182 will clearly show which test expectations changed due to the optimization.

## What changes are included in this PR?

### Benchmark

New `sort-pushdown` benchmark subcommand with 4 queries testing sort elimination:

| Query | Description |
|-------|-------------|
| Q1 | `ORDER BY l_orderkey ASC` (full scan) |
| Q2 | `ORDER BY l_orderkey ASC LIMIT 100` |
| Q3 | `SELECT * ORDER BY l_orderkey ASC` (wide) |
| Q4 | `SELECT * ORDER BY l_orderkey ASC LIMIT 100` (wide) |

Usage:

```bash
./bench.sh data sort_pushdown
./bench.sh run sort_pushdown # baseline
./bench.sh run sort_pushdown_sorted # with sort elimination
```

### SLT Integration Tests (5 new groups)

- **Test A**: Non-overlapping files + WITH ORDER → Sort eliminated (single partition)
- **Test B**: Overlapping files → SortExec retained (baseline, files in original order)
- **Test C**: LIMIT queries (ASC sort elimination + DESC reverse scan)
- **Test D**: `target_partitions=2` → SPM + per-partition sort elimination
- **Test E**: Inferred ordering from Parquet metadata (no WITH ORDER) — single and multi partition

### Files Changed

| File | Change |
|------|--------|
| `benchmarks/src/sort_pushdown.rs` | New benchmark module |
| `benchmarks/src/lib.rs` | Register module |
| `benchmarks/src/bin/dfbench.rs` | Register subcommand |
| `benchmarks/bench.sh` | Add data/run entries |
| `datafusion/sqllogictest/test_files/sort_pushdown.slt` | 5 new SLT test groups |

## Test plan

- [x] `cargo clippy -p datafusion-benchmarks` — 0 warnings
- [x] `cargo test -p datafusion-sqllogictest -- sort_pushdown` — all tests pass

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 738493e commit b7ae2c6

9 files changed

Lines changed: 797 additions & 1 deletion

File tree

benchmarks/bench.sh

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,10 @@ clickbench_partitioned: ClickBench queries against partitioned (100 files) parqu
106106
clickbench_pushdown: ClickBench queries against partitioned (100 files) parquet w/ filter_pushdown enabled
107107
clickbench_extended: ClickBench \"inspired\" queries against a single parquet (DataFusion specific)
108108
109+
# Sort Pushdown Benchmarks
110+
sort_pushdown: Sort pushdown baseline (no WITH ORDER) on TPC-H data (SF=1)
111+
sort_pushdown_sorted: Sort pushdown with WITH ORDER — tests sort elimination on non-overlapping files
112+
109113
# Sorted Data Benchmarks (ORDER BY Optimization)
110114
clickbench_sorted: ClickBench queries on pre-sorted data using prefer_existing_sort (tests sort elimination optimization)
111115
@@ -309,6 +313,10 @@ main() {
309313
# same data as for tpch
310314
data_tpch "1" "parquet"
311315
;;
316+
sort_pushdown|sort_pushdown_sorted)
317+
# same data as for tpch
318+
data_tpch "1" "parquet"
319+
;;
312320
sort_tpch)
313321
# same data as for tpch
314322
data_tpch "1" "parquet"
@@ -509,6 +517,12 @@ main() {
509517
external_aggr)
510518
run_external_aggr
511519
;;
520+
sort_pushdown)
521+
run_sort_pushdown
522+
;;
523+
sort_pushdown_sorted)
524+
run_sort_pushdown_sorted
525+
;;
512526
sort_tpch)
513527
run_sort_tpch "1"
514528
;;
@@ -1070,6 +1084,22 @@ run_external_aggr() {
10701084
debug_run $CARGO_COMMAND --bin external_aggr -- benchmark --partitions 4 --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG}
10711085
}
10721086

1087+
# Runs the sort pushdown benchmark (without WITH ORDER)
1088+
run_sort_pushdown() {
1089+
TPCH_DIR="${DATA_DIR}/tpch_sf1"
1090+
RESULTS_FILE="${RESULTS_DIR}/sort_pushdown.json"
1091+
echo "Running sort pushdown benchmark (no WITH ORDER)..."
1092+
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
1093+
}
1094+
1095+
# Runs the sort pushdown benchmark with WITH ORDER (enables sort elimination)
1096+
run_sort_pushdown_sorted() {
1097+
TPCH_DIR="${DATA_DIR}/tpch_sf1"
1098+
RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_sorted.json"
1099+
echo "Running sort pushdown benchmark (with WITH ORDER)..."
1100+
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
1101+
}
1102+
10731103
# Runs the sort integration benchmark
10741104
run_sort_tpch() {
10751105
SCALE_FACTOR=$1
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
-- Sort elimination: ORDER BY sort key ASC (full scan)
2+
-- With --sorted: SortExec removed, sequential scan in file order
3+
-- Without --sorted: full SortExec required
4+
SELECT l_orderkey, l_partkey, l_suppkey
5+
FROM lineitem
6+
ORDER BY l_orderkey
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
-- Sort elimination + limit pushdown
2+
-- With --sorted: SortExec removed + limit pushed to DataSourceExec
3+
-- Without --sorted: TopK sort over all data
4+
SELECT l_orderkey, l_partkey, l_suppkey
5+
FROM lineitem
6+
ORDER BY l_orderkey
7+
LIMIT 100
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
-- Sort elimination: wide projection (all columns)
2+
-- Tests sort elimination benefit with larger row payload
3+
SELECT *
4+
FROM lineitem
5+
ORDER BY l_orderkey
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
-- Sort elimination + limit: wide projection
2+
SELECT *
3+
FROM lineitem
4+
ORDER BY l_orderkey
5+
LIMIT 100

benchmarks/src/bin/dfbench.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc;
3434
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
3535

3636
use datafusion_benchmarks::{
37-
cancellation, clickbench, h2o, hj, imdb, nlj, smj, sort_tpch, tpcds, tpch,
37+
cancellation, clickbench, h2o, hj, imdb, nlj, smj, sort_pushdown, sort_tpch, tpcds,
38+
tpch,
3839
};
3940

4041
#[derive(Debug, Parser)]
@@ -53,6 +54,7 @@ enum Options {
5354
Imdb(imdb::RunOpt),
5455
Nlj(nlj::RunOpt),
5556
Smj(smj::RunOpt),
57+
SortPushdown(sort_pushdown::RunOpt),
5658
SortTpch(sort_tpch::RunOpt),
5759
Tpch(tpch::RunOpt),
5860
Tpcds(tpcds::RunOpt),
@@ -72,6 +74,7 @@ pub async fn main() -> Result<()> {
7274
Options::Imdb(opt) => Box::pin(opt.run()).await,
7375
Options::Nlj(opt) => opt.run().await,
7476
Options::Smj(opt) => opt.run().await,
77+
Options::SortPushdown(opt) => opt.run().await,
7578
Options::SortTpch(opt) => opt.run().await,
7679
Options::Tpch(opt) => Box::pin(opt.run()).await,
7780
Options::Tpcds(opt) => Box::pin(opt.run()).await,

benchmarks/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ pub mod hj;
2323
pub mod imdb;
2424
pub mod nlj;
2525
pub mod smj;
26+
pub mod sort_pushdown;
2627
pub mod sort_tpch;
2728
pub mod tpcds;
2829
pub mod tpch;

0 commit comments

Comments
 (0)