diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index b8c9ff5c8ec46..badf9ce4352ad 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -314,8 +314,7 @@ main() { data_tpch "1" "parquet" ;; sort_pushdown|sort_pushdown_sorted) - # same data as for tpch - data_tpch "1" "parquet" + data_sort_pushdown ;; sort_tpch) # same data as for tpch @@ -1085,19 +1084,57 @@ run_external_aggr() { } # Runs the sort pushdown benchmark (without WITH ORDER) +# Generates sort pushdown benchmark data: TPC-H lineitem with 3 parts, +# renamed so alphabetical order does NOT match sort key order. +# This forces the sort pushdown optimizer to reorder files by statistics. +# +# tpchgen produces 3 sorted, non-overlapping parquet files: +# lineitem.1.parquet: l_orderkey 1 ~ 2M (lowest keys) +# lineitem.2.parquet: l_orderkey 2M ~ 4M +# lineitem.3.parquet: l_orderkey 4M ~ 6M (highest keys) +# +# We rename them so alphabetical order is reversed: +# a_part3.parquet (highest keys, sorts first alphabetically) +# b_part2.parquet +# c_part1.parquet (lowest keys, sorts last alphabetically) +data_sort_pushdown() { + SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown/lineitem" + if [ -d "${SORT_PUSHDOWN_DIR}" ] && [ "$(ls -A ${SORT_PUSHDOWN_DIR}/*.parquet 2>/dev/null)" ]; then + echo "Sort pushdown data already exists at ${SORT_PUSHDOWN_DIR}" + return + fi + + echo "Generating sort pushdown benchmark data (3 parts with reversed naming)..." + + TEMP_DIR="${DATA_DIR}/sort_pushdown_temp" + mkdir -p "${TEMP_DIR}" "${SORT_PUSHDOWN_DIR}" + + tpchgen-cli --scale-factor 1 --format parquet --parquet-compression='ZSTD(1)' --parts=3 --output-dir "${TEMP_DIR}" + + # Rename: reverse alphabetical order vs key order + mv "${TEMP_DIR}/lineitem/lineitem.3.parquet" "${SORT_PUSHDOWN_DIR}/a_part3.parquet" + mv "${TEMP_DIR}/lineitem/lineitem.2.parquet" "${SORT_PUSHDOWN_DIR}/b_part2.parquet" + mv "${TEMP_DIR}/lineitem/lineitem.1.parquet" "${SORT_PUSHDOWN_DIR}/c_part1.parquet" + + rm -rf "${TEMP_DIR}" + + echo "Sort pushdown data generated at ${SORT_PUSHDOWN_DIR}" + ls -la "${SORT_PUSHDOWN_DIR}" +} + run_sort_pushdown() { - TPCH_DIR="${DATA_DIR}/tpch_sf1" + SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown" RESULTS_FILE="${RESULTS_DIR}/sort_pushdown.json" echo "Running sort pushdown benchmark (no WITH ORDER)..." - debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG} + debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${SORT_PUSHDOWN_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG} } # Runs the sort pushdown benchmark with WITH ORDER (enables sort elimination) run_sort_pushdown_sorted() { - TPCH_DIR="${DATA_DIR}/tpch_sf1" + SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown" RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_sorted.json" echo "Running sort pushdown benchmark (with WITH ORDER)..." - debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG} + debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${SORT_PUSHDOWN_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG} } # Runs the sort integration benchmark