Skip to content

Commit a960247

Browse files
committed
Add regression test for duplicate group keys after hash aggregation spill (#20724)
Add a deterministic test that reproduces the bug where hash aggregation produces duplicate group keys after spilling to disk. Root cause: `update_merged_stream()` sets `GroupOrdering::Full` but does not recreate `group_values`, leaving `GroupValuesColumn<false>` (vectorized_intern) active. Under hash collisions, `vectorized_intern` produces non-monotonic group indices, which causes `GroupOrderingFull` to prematurely emit groups. This commit introduces a new `force_hash_partial_collisions` feature that truncates hashes to 5 bits (32 distinct values) instead of forcing them all to zero. Full collisions paradoxically do not trigger the bug, because every row then takes the same code path and receives monotonic indices; partial collisions create the necessary mix of fast-path and slow-path rows.
1 parent 8e02b8e commit a960247

7 files changed

Lines changed: 174 additions & 9 deletions

File tree

.github/workflows/extended.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,11 +154,16 @@ jobs:
154154
uses: ./.github/actions/setup-builder
155155
with:
156156
rust-version: stable
157-
- name: Run tests
157+
- name: Run tests (force_hash_collisions)
158158
run: |
159159
cd datafusion
160160
cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --exclude datafusion-sqllogictest --exclude datafusion-cli --workspace --lib --tests --features=force_hash_collisions,avro
161161
cargo clean
162+
- name: Run tests (force_hash_partial_collisions, #20724)
163+
run: |
164+
cd datafusion
165+
cargo test --profile ci -p datafusion --test core_integration --features=force_hash_partial_collisions -- memory_limit::test_no_duplicate_groups_after_spill --exact
166+
cargo clean
162167
163168
sqllogictest-sqlite:
164169
name: "Run sqllogictests with the sqlite test suite"

.github/workflows/rust.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,8 @@ jobs:
210210
run: cargo check --profile ci --no-default-features -p datafusion --features=encoding_expressions
211211
- name: Check datafusion (force_hash_collisions)
212212
run: cargo check --profile ci --no-default-features -p datafusion --features=force_hash_collisions
213+
- name: Check datafusion (force_hash_partial_collisions)
214+
run: cargo check --profile ci --no-default-features -p datafusion --features=force_hash_partial_collisions
213215
- name: Check datafusion (math_expressions)
214216
run: cargo check --profile ci --no-default-features -p datafusion --features=math_expressions
215217
- name: Check datafusion (parquet)

datafusion/common/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ parquet_encryption = [
4949
"dep:hex",
5050
]
5151
force_hash_collisions = []
52+
force_hash_partial_collisions = []
5253
recursive_protection = ["dep:recursive"]
5354
parquet = ["dep:parquet"]
5455
sql = ["sqlparser"]

datafusion/common/src/hash_utils.rs

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -935,8 +935,7 @@ fn hash_run_array<R: RunEndIndexType>(
935935

936936
/// Internal helper function that hashes a single array and either initializes or combines
937937
/// the hash values in the buffer.
938-
#[cfg(not(feature = "force_hash_collisions"))]
939-
fn hash_single_array(
938+
fn hash_single_array_impl(
940939
array: &dyn Array,
941940
random_state: &RandomState,
942941
hashes_buffer: &mut [u64],
@@ -1007,17 +1006,47 @@ fn hash_single_array(
10071006
Ok(())
10081007
}
10091008

1010-
/// Test version of `hash_single_array` that forces all hashes to collide to zero.
1011-
#[cfg(feature = "force_hash_collisions")]
1009+
/// Dispatches to the appropriate `hash_single_array` implementation based on
1010+
/// the enabled feature flags.
1011+
#[cfg(not(any(
1012+
feature = "force_hash_collisions",
1013+
feature = "force_hash_partial_collisions"
1014+
)))]
1015+
fn hash_single_array(
1016+
array: &dyn Array,
1017+
random_state: &RandomState,
1018+
hashes_buffer: &mut [u64],
1019+
rehash: bool,
1020+
) -> Result<()> {
1021+
hash_single_array_impl(array, random_state, hashes_buffer, rehash)
1022+
}
1023+
1024+
/// Test version: forces full hash collisions by setting all hashes to 0.
1025+
#[cfg(all(
1026+
feature = "force_hash_collisions",
1027+
not(feature = "force_hash_partial_collisions")
1028+
))]
10121029
fn hash_single_array(
10131030
_array: &dyn Array,
10141031
_random_state: &RandomState,
10151032
hashes_buffer: &mut [u64],
10161033
_rehash: bool,
10171034
) -> Result<()> {
1018-
for hash in hashes_buffer.iter_mut() {
1019-
*hash = 0
1020-
}
1035+
hashes_buffer.iter_mut().for_each(|x| *x = 0);
1036+
Ok(())
1037+
}
1038+
1039+
/// Test version: truncates real hashes to 5 bits (32 distinct values) to create
1040+
/// partial collisions that expose non-monotonic group index bugs (#20724).
1041+
#[cfg(feature = "force_hash_partial_collisions")]
1042+
fn hash_single_array(
1043+
array: &dyn Array,
1044+
random_state: &RandomState,
1045+
hashes_buffer: &mut [u64],
1046+
rehash: bool,
1047+
) -> Result<()> {
1048+
hash_single_array_impl(array, random_state, hashes_buffer, rehash)?;
1049+
hashes_buffer.iter_mut().for_each(|h| *h &= 0x1F);
10211050
Ok(())
10221051
}
10231052

datafusion/core/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,10 @@ default = [
7171
encoding_expressions = ["datafusion-functions/encoding_expressions"]
7272
# Used for testing ONLY: causes all values to hash to the same value (test for collisions)
7373
force_hash_collisions = ["datafusion-physical-plan/force_hash_collisions", "datafusion-common/force_hash_collisions"]
74+
# Used for testing ONLY: truncates hashes to 5 bits (32 distinct values) to create partial collisions.
75+
# Unlike force_hash_collisions (all hashes = 0), this creates a mix of colliding and non-colliding keys,
76+
# which triggers non-monotonic group indices in vectorized_intern (#20724).
77+
force_hash_partial_collisions = ["datafusion-physical-plan/force_hash_partial_collisions", "datafusion-common/force_hash_partial_collisions"]
7478
math_expressions = ["datafusion-functions/math_expressions"]
7579
parquet = ["datafusion-common/parquet", "dep:parquet", "datafusion-datasource-parquet"]
7680
parquet_encryption = [

datafusion/core/tests/memory_limit/mod.rs

Lines changed: 124 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@ use std::sync::{Arc, LazyLock};
2424
#[cfg(feature = "extended_tests")]
2525
mod memory_limit_validation;
2626
mod repartition_mem_limit;
27-
use arrow::array::{ArrayRef, DictionaryArray, Int32Array, RecordBatch, StringViewArray};
27+
use arrow::array::{
28+
ArrayRef, DictionaryArray, Int32Array, Int64Array, RecordBatch, StringViewArray,
29+
};
2830
use arrow::compute::SortOptions;
2931
use arrow::datatypes::{Int32Type, SchemaRef};
3032
use arrow_schema::{DataType, Field, Schema};
@@ -56,6 +58,7 @@ use datafusion_physical_plan::collect as collect_batches;
5658
use datafusion_physical_plan::common::collect;
5759
use datafusion_physical_plan::spill::get_record_batch_memory_size;
5860
use rand::Rng;
61+
use std::collections::HashSet;
5962
use test_utils::AccessLogGenerator;
6063

6164
use async_trait::async_trait;
@@ -1172,3 +1175,123 @@ impl TableProvider for SortedTableProvider {
11721175
Ok(DataSourceExec::from_data_source(mem_conf))
11731176
}
11741177
}
1178+
1179+
// ============================================================================
1180+
// Regression tests for https://github.com/apache/datafusion/issues/20724
1181+
//
1182+
// When hash aggregation spills and switches to streaming merge,
1183+
// `group_values` must be recreated with the streaming variant.
1184+
// Otherwise `vectorized_intern` can produce non-monotonic group indices
1185+
// under hash collisions, causing `GroupOrderingFull` to prematurely
1186+
// emit groups → duplicate keys in output.
1187+
// ============================================================================
1188+
1189+
/// Helper: set up a session that forces spilling during aggregation.
1190+
async fn setup_spill_agg_context(
1191+
memory_limit: usize,
1192+
batch_size: usize,
1193+
) -> Result<SessionContext> {
1194+
let runtime = RuntimeEnvBuilder::new()
1195+
.with_memory_pool(Arc::new(FairSpillPool::new(memory_limit)))
1196+
.with_disk_manager_builder(
1197+
DiskManagerBuilder::default().with_mode(DiskManagerMode::OsTmpDirectory),
1198+
)
1199+
.build_arc()
1200+
.unwrap();
1201+
1202+
let config = SessionConfig::new()
1203+
.with_sort_spill_reservation_bytes(64 * 1024)
1204+
.with_sort_in_place_threshold_bytes(0)
1205+
.with_spill_compression(SpillCompression::Uncompressed)
1206+
.with_batch_size(batch_size)
1207+
.with_target_partitions(1);
1208+
1209+
Ok(SessionContext::new_with_config_rt(config, runtime))
1210+
}
1211+
1212+
/// Regression test for https://github.com/apache/datafusion/issues/20724
1213+
///
1214+
/// When hash aggregation spills and switches to streaming merge,
1215+
/// `group_values` (GroupValuesColumn<false>) is not recreated with the
1216+
/// streaming variant (<true>). This means `vectorized_intern` is used
1217+
/// post-spill, which can produce non-monotonic group indices under hash
1218+
/// collisions, causing `GroupOrderingFull` to prematurely emit groups
1219+
/// and create duplicate keys in the output.
1220+
///
1221+
/// Requirements to trigger:
1222+
/// - Two-column GROUP BY → forces `GroupValuesColumn` (not `GroupValuesPrimitive`)
1223+
/// - `force_hash_partial_collisions` feature → truncated hashes create the mix
1224+
/// of colliding/non-colliding keys needed for non-monotonic indices
1225+
/// - `batch_size=50` → not a multiple of rows-per-group in the merged stream,
1226+
/// so groups span batch boundaries and premature emission causes duplicates
1227+
#[tokio::test]
1228+
async fn test_no_duplicate_groups_after_spill() -> Result<()> {
1229+
let num_keys: i64 = 5000;
1230+
let rows_per_key: i64 = 4;
1231+
let total_rows = (num_keys * rows_per_key) as usize;
1232+
1233+
let schema = Arc::new(Schema::new(vec![
1234+
Field::new("key_a", DataType::Int64, false),
1235+
Field::new("key_b", DataType::Int64, false),
1236+
Field::new("value", DataType::Int64, false),
1237+
]));
1238+
1239+
let mut keys_a = Vec::with_capacity(total_rows);
1240+
let mut keys_b = Vec::with_capacity(total_rows);
1241+
let mut vals = Vec::with_capacity(total_rows);
1242+
for r in 0..rows_per_key {
1243+
for k in 0..num_keys {
1244+
keys_a.push(k / 100);
1245+
keys_b.push(k % 100);
1246+
vals.push(r * num_keys + k);
1247+
}
1248+
}
1249+
1250+
let mut batches = Vec::new();
1251+
for start in (0..total_rows).step_by(500) {
1252+
let end = (start + 500).min(total_rows);
1253+
batches.push(RecordBatch::try_new(
1254+
Arc::clone(&schema),
1255+
vec![
1256+
Arc::new(Int64Array::from(keys_a[start..end].to_vec())),
1257+
Arc::new(Int64Array::from(keys_b[start..end].to_vec())),
1258+
Arc::new(Int64Array::from(vals[start..end].to_vec())),
1259+
],
1260+
)?);
1261+
}
1262+
1263+
let ctx = setup_spill_agg_context(128 * 1024, 50).await?;
1264+
let table = MemTable::try_new(schema, vec![batches])?;
1265+
ctx.register_table("t", Arc::new(table))?;
1266+
1267+
let df = ctx
1268+
.sql("SELECT key_a, key_b, COUNT(*) as cnt FROM t GROUP BY key_a, key_b")
1269+
.await?;
1270+
let results =
1271+
collect_batches(df.create_physical_plan().await?, ctx.task_ctx()).await?;
1272+
1273+
let mut seen = HashSet::new();
1274+
for batch in &results {
1275+
let ka = batch
1276+
.column(0)
1277+
.as_any()
1278+
.downcast_ref::<Int64Array>()
1279+
.unwrap();
1280+
let kb = batch
1281+
.column(1)
1282+
.as_any()
1283+
.downcast_ref::<Int64Array>()
1284+
.unwrap();
1285+
for i in 0..batch.num_rows() {
1286+
assert!(
1287+
seen.insert((ka.value(i), kb.value(i))),
1288+
"DUPLICATE group key ({}, {}). \
1289+
Bug #20724: group_values not recreated for streaming merge.",
1290+
ka.value(i),
1291+
kb.value(i),
1292+
);
1293+
}
1294+
}
1295+
assert_eq!(seen.len(), num_keys as usize);
1296+
Ok(())
1297+
}

datafusion/physical-plan/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ workspace = true
3939

4040
[features]
4141
force_hash_collisions = []
42+
force_hash_partial_collisions = ["datafusion-common/force_hash_partial_collisions"]
4243
test_utils = ["arrow/test_utils"]
4344
tokio_coop = []
4445
tokio_coop_fallback = []

0 commit comments

Comments
 (0)