Skip to content

Commit a960247

Browse files
committed
Add regression test for duplicate group keys after hash aggregation spill (#20724)
Add a deterministic test that reproduces the bug where hash aggregation produces duplicate group keys after spilling to disk. Root cause: `update_merged_stream()` sets `GroupOrdering::Full` but does not recreate `group_values`, leaving `GroupValuesColumn<false>` (vectorized_intern) active. Under hash collisions, `vectorized_intern` produces non-monotonic group indices, which causes `GroupOrderingFull` to prematurely emit groups. This commit introduces a new `force_hash_partial_collisions` feature that truncates hashes to 5 bits (32 distinct values) instead of forcing them all to zero. Full collisions paradoxically do not trigger the bug, because every row then takes the same code path and receives monotonic indices; partial collisions create the necessary mix of fast-path and slow-path rows.
1 parent 8e02b8e commit a960247

7 files changed

Lines changed: 174 additions & 9 deletions

File tree

.github/workflows/extended.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,11 +154,16 @@ jobs:
154154
uses: ./.github/actions/setup-builder
155155
with:
156156
rust-version: stable
157-
- name: Run tests
157+
- name: Run tests (force_hash_collisions)
158158
run: |
159159
cd datafusion
160160
cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --exclude datafusion-sqllogictest --exclude datafusion-cli --workspace --lib --tests --features=force_hash_collisions,avro
161161
cargo clean
162+
- name: Run tests (force_hash_partial_collisions, #20724)
163+
run: |
164+
cd datafusion
165+
cargo test --profile ci -p datafusion --test core_integration --features=force_hash_partial_collisions -- memory_limit::test_no_duplicate_groups_after_spill --exact
166+
cargo clean
162167
163168
sqllogictest-sqlite:
164169
name: "Run sqllogictests with the sqlite test suite"

.github/workflows/rust.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,8 @@ jobs:
210210
run: cargo check --profile ci --no-default-features -p datafusion --features=encoding_expressions
211211
- name: Check datafusion (force_hash_collisions)
212212
run: cargo check --profile ci --no-default-features -p datafusion --features=force_hash_collisions
213+
- name: Check datafusion (force_hash_partial_collisions)
214+
run: cargo check --profile ci --no-default-features -p datafusion --features=force_hash_partial_collisions
213215
- name: Check datafusion (math_expressions)
214216
run: cargo check --profile ci --no-default-features -p datafusion --features=math_expressions
215217
- name: Check datafusion (parquet)

datafusion/common/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ parquet_encryption = [
4949
"dep:hex",
5050
]
5151
force_hash_collisions = []
52+
force_hash_partial_collisions = []
5253
recursive_protection = ["dep:recursive"]
5354
parquet = ["dep:parquet"]
5455
sql = ["sqlparser"]

datafusion/common/src/hash_utils.rs

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -935,8 +935,7 @@ fn hash_run_array<R: RunEndIndexType>(
935935

936936
/// Internal helper function that hashes a single array and either initializes or combines
937937
/// the hash values in the buffer.
938-
#[cfg(not(feature = "force_hash_collisions"))]
939-
fn hash_single_array(
938+
fn hash_single_array_impl(
940939
array: &dyn Array,
941940
random_state: &RandomState,
942941
hashes_buffer: &mut [u64],
@@ -1007,17 +1006,47 @@ fn hash_single_array(
10071006
Ok(())
10081007
}
10091008

1010-
/// Test version of `hash_single_array` that forces all hashes to collide to zero.
1011-
#[cfg(feature = "force_hash_collisions")]
1009+
/// Dispatches to the appropriate `hash_single_array` implementation based on
1010+
/// the enabled feature flags.
1011+
#[cfg(not(any(
1012+
feature = "force_hash_collisions",
1013+
feature = "force_hash_partial_collisions"
1014+
)))]
1015+
fn hash_single_array(
1016+
array: &dyn Array,
1017+
random_state: &RandomState,
1018+
hashes_buffer: &mut [u64],
1019+
rehash: bool,
1020+
) -> Result<()> {
1021+
hash_single_array_impl(array, random_state, hashes_buffer, rehash)
1022+
}
1023+
1024+
/// Test version: forces full hash collisions by setting all hashes to 0.
1025+
#[cfg(all(
1026+
feature = "force_hash_collisions",
1027+
not(feature = "force_hash_partial_collisions")
1028+
))]
10121029
fn hash_single_array(
10131030
_array: &dyn Array,
10141031
_random_state: &RandomState,
10151032
hashes_buffer: &mut [u64],
10161033
_rehash: bool,
10171034
) -> Result<()> {
1018-
for hash in hashes_buffer.iter_mut() {
1019-
*hash = 0
1020-
}
1035+
hashes_buffer.iter_mut().for_each(|x| *x = 0);
1036+
Ok(())
1037+
}
1038+
1039+
/// Test version: truncates real hashes to 5 bits (32 distinct values) to create
1040+
/// partial collisions that expose non-monotonic group index bugs (#20724).
1041+
#[cfg(feature = "force_hash_partial_collisions")]
1042+
fn hash_single_array(
1043+
array: &dyn Array,
1044+
random_state: &RandomState,
1045+
hashes_buffer: &mut [u64],
1046+
rehash: bool,
1047+
) -> Result<()> {
1048+
hash_single_array_impl(array, random_state, hashes_buffer, rehash)?;
1049+
hashes_buffer.iter_mut().for_each(|h| *h &= 0x1F);
10211050
Ok(())
10221051
}
10231052

datafusion/core/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,10 @@ default = [
7171
encoding_expressions = ["datafusion-functions/encoding_expressions"]
7272
# Used for testing ONLY: causes all values to hash to the same value (test for collisions)
7373
force_hash_collisions = ["datafusion-physical-plan/force_hash_collisions", "datafusion-common/force_hash_collisions"]
74+
# Used for testing ONLY: truncates hashes to 5 bits (32 distinct values) to create partial collisions.
75+
# Unlike force_hash_collisions (all hashes = 0), this creates a mix of colliding and non-colliding keys,
76+
# which triggers non-monotonic group indices in vectorized_intern (#20724).
77+
force_hash_partial_collisions = ["datafusion-physical-plan/force_hash_partial_collisions", "datafusion-common/force_hash_partial_collisions"]
7478
math_expressions = ["datafusion-functions/math_expressions"]
7579
parquet = ["datafusion-common/parquet", "dep:parquet", "datafusion-datasource-parquet"]
7680
parquet_encryption = [

datafusion/core/tests/memory_limit/mod.rs

Lines changed: 124 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@ use std::sync::{Arc, LazyLock};
2424
#[cfg(feature = "extended_tests")]
2525
mod memory_limit_validation;
2626
mod repartition_mem_limit;
27-
use arrow::array::{ArrayRef, DictionaryArray, Int32Array, RecordBatch, StringViewArray};
27+
use arrow::array::{
28+
ArrayRef, DictionaryArray, Int32Array, Int64Array, RecordBatch, StringViewArray,
29+
};
2830
use arrow::compute::SortOptions;
2931
use arrow::datatypes::{Int32Type, SchemaRef};
3032
use arrow_schema::{DataType, Field, Schema};
@@ -56,6 +58,7 @@ use datafusion_physical_plan::collect as collect_batches;
5658
use datafusion_physical_plan::common::collect;
5759
use datafusion_physical_plan::spill::get_record_batch_memory_size;
5860
use rand::Rng;
61+
use std::collections::HashSet;
5962
use test_utils::AccessLogGenerator;
6063

6164
use async_trait::async_trait;
@@ -1172,3 +1175,123 @@ impl TableProvider for SortedTableProvider {
11721175
Ok(DataSourceExec::from_data_source(mem_conf))
11731176
}
11741177
}
1178+
1179+
// ============================================================================
1180+
// Regression tests for https://github.com/apache/datafusion/issues/20724
1181+
//
1182+
// When hash aggregation spills and switches to streaming merge,
1183+
// `group_values` must be recreated with the streaming variant.
1184+
// Otherwise `vectorized_intern` can produce non-monotonic group indices
1185+
// under hash collisions, causing `GroupOrderingFull` to prematurely
1186+
// emit groups → duplicate keys in output.
1187+
// ============================================================================
1188+
1189+
/// Helper: set up a session that forces spilling during aggregation.
1190+
async fn setup_spill_agg_context(
1191+
memory_limit: usize,
1192+
batch_size: usize,
1193+
) -> Result<SessionContext> {
1194+
let runtime = RuntimeEnvBuilder::new()
1195+
.with_memory_pool(Arc::new(FairSpillPool::new(memory_limit)))
1196+
.with_disk_manager_builder(
1197+
DiskManagerBuilder::default().with_mode(DiskManagerMode::OsTmpDirectory),
1198+
)
1199+
.build_arc()
1200+
.unwrap();
1201+
1202+
let config = SessionConfig::new()
1203+
.with_sort_spill_reservation_bytes(64 * 1024)
1204+
.with_sort_in_place_threshold_bytes(0)
1205+
.with_spill_compression(SpillCompression::Uncompressed)
1206+
.with_batch_size(batch_size)
1207+
.with_target_partitions(1);
1208+
1209+
Ok(SessionContext::new_with_config_rt(config, runtime))
1210+
}
1211+
1212+
/// Regression test for https://github.com/apache/datafusion/issues/20724
1213+
///
1214+
/// When hash aggregation spills and switches to streaming merge,
1215+
/// `group_values` (GroupValuesColumn<false>) is not recreated with the
1216+
/// streaming variant (<true>). This means `vectorized_intern` is used
1217+
/// post-spill, which can produce non-monotonic group indices under hash
1218+
/// collisions, causing `GroupOrderingFull` to prematurely emit groups
1219+
/// and create duplicate keys in the output.
1220+
///
1221+
/// Requirements to trigger:
1222+
/// - Two-column GROUP BY → forces `GroupValuesColumn` (not `GroupValuesPrimitive`)
1223+
/// - `force_hash_partial_collisions` feature → truncated hashes create the mix
1224+
/// of colliding/non-colliding keys needed for non-monotonic indices
1225+
/// - `batch_size=50` → not a multiple of rows-per-group in the merged stream,
1226+
/// so groups span batch boundaries and premature emission causes duplicates
1227+
#[tokio::test]
1228+
async fn test_no_duplicate_groups_after_spill() -> Result<()> {
1229+
let num_keys: i64 = 5000;
1230+
let rows_per_key: i64 = 4;
1231+
let total_rows = (num_keys * rows_per_key) as usize;
1232+
1233+
let schema = Arc::new(Schema::new(vec![
1234+
Field::new("key_a", DataType::Int64, false),
1235+
Field::new("key_b", DataType::Int64, false),
1236+
Field::new("value", DataType::Int64, false),
1237+
]));
1238+
1239+
let mut keys_a = Vec::with_capacity(total_rows);
1240+
let mut keys_b = Vec::with_capacity(total_rows);
1241+
let mut vals = Vec::with_capacity(total_rows);
1242+
for r in 0..rows_per_key {
1243+
for k in 0..num_keys {
1244+
keys_a.push(k / 100);
1245+
keys_b.push(k % 100);
1246+
vals.push(r * num_keys + k);
1247+
}
1248+
}
1249+
1250+
let mut batches = Vec::new();
1251+
for start in (0..total_rows).step_by(500) {
1252+
let end = (start + 500).min(total_rows);
1253+
batches.push(RecordBatch::try_new(
1254+
Arc::clone(&schema),
1255+
vec![
1256+
Arc::new(Int64Array::from(keys_a[start..end].to_vec())),
1257+
Arc::new(Int64Array::from(keys_b[start..end].to_vec())),
1258+
Arc::new(Int64Array::from(vals[start..end].to_vec())),
1259+
],
1260+
)?);
1261+
}
1262+
1263+
let ctx = setup_spill_agg_context(128 * 1024, 50).await?;
1264+
let table = MemTable::try_new(schema, vec![batches])?;
1265+
ctx.register_table("t", Arc::new(table))?;
1266+
1267+
let df = ctx
1268+
.sql("SELECT key_a, key_b, COUNT(*) as cnt FROM t GROUP BY key_a, key_b")
1269+
.await?;
1270+
let results =
1271+
collect_batches(df.create_physical_plan().await?, ctx.task_ctx()).await?;
1272+
1273+
let mut seen = HashSet::new();
1274+
for batch in &results {
1275+
let ka = batch
1276+
.column(0)
1277+
.as_any()
1278+
.downcast_ref::<Int64Array>()
1279+
.unwrap();
1280+
let kb = batch
1281+
.column(1)
1282+
.as_any()
1283+
.downcast_ref::<Int64Array>()
1284+
.unwrap();
1285+
for i in 0..batch.num_rows() {
1286+
assert!(
1287+
seen.insert((ka.value(i), kb.value(i))),
1288+
"DUPLICATE group key ({}, {}). \
1289+
Bug #20724: group_values not recreated for streaming merge.",
1290+
ka.value(i),
1291+
kb.value(i),
1292+
);
1293+
}
1294+
}
1295+
assert_eq!(seen.len(), num_keys as usize);
1296+
Ok(())
1297+
}

datafusion/physical-plan/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ workspace = true
3939

4040
[features]
4141
force_hash_collisions = []
42+
force_hash_partial_collisions = ["datafusion-common/force_hash_partial_collisions"]
4243
test_utils = ["arrow/test_utils"]
4344
tokio_coop = []
4445
tokio_coop_fallback = []

0 commit comments

Comments
 (0)