Skip to content

Commit 5af6a6e

Browse files
committed
perf: optimize slow path and fix benchmark in ByteViewGroupValueBuilder
Three follow-up changes addressing PR review and CI feedback on #21794: 1. Remove `required-features = ["test_utils"]` from the `aggregate_vectorized` bench target. The gate caused the bench to be silently skipped when CI ran `cargo bench --features=parquet --bench aggregate_vectorized`. The bench compiles fine in a workspace build where `arrow/test_utils` is already enabled via feature unification through `datafusion-physical-expr`. 2. Replace `arrow::util::test_util::seedable_rng()` with `StdRng::seed_from_u64(42)` from the already-present `rand` crate. This was the only import from `test_util` (as opposed to `bench_util`) and removes the last explicit dependency on the `test_utils` feature in the benchmark. 3. Optimize the slow path in `vectorized_append_inner` (non-inline strings, `!data_buffers().is_empty()`), addressing Dandandan's review comment. Instead of calling `do_append_val_inner` which goes through `array.value(row)` (buffer lookup + slice construction) and then `make_view` (re-reads the first 4 bytes to build the prefix), the new path reads `arr.views()[row]` directly and copies `src.prefix` from the source `ByteView`. Benchmarks show 41–53% improvement for 64-byte strings and 6–16% improvement for random-length strings (null_density=0.0).
1 parent 9621004 commit 5af6a6e

3 files changed

Lines changed: 37 additions & 10 deletions

File tree

datafusion/physical-plan/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,4 +107,3 @@ required-features = ["test_utils"]
107107
[[bench]]
108108
harness = false
109109
name = "aggregate_vectorized"
110-
required-features = ["test_utils"]

datafusion/physical-plan/benches/aggregate_vectorized.rs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ use arrow::util::bench_util::{
2121
create_primitive_array, create_string_view_array_with_len,
2222
create_string_view_array_with_max_len,
2323
};
24-
use arrow::util::test_util::seedable_rng;
2524
use arrow_schema::DataType;
2625
use criterion::measurement::WallTime;
2726
use criterion::{
@@ -30,7 +29,9 @@ use criterion::{
3029
use datafusion_physical_plan::aggregates::group_values::multi_group_by::GroupColumn;
3130
use datafusion_physical_plan::aggregates::group_values::multi_group_by::bytes_view::ByteViewGroupValueBuilder;
3231
use datafusion_physical_plan::aggregates::group_values::multi_group_by::primitive::PrimitiveGroupValueBuilder;
32+
use rand::SeedableRng;
3333
use rand::distr::{Bernoulli, Distribution};
34+
use rand::rngs::StdRng;
3435
use std::hint::black_box;
3536
use std::sync::Arc;
3637

@@ -128,7 +129,7 @@ fn bytes_bench(
128129
input,
129130
"0.75 true",
130131
{
131-
let mut rng = seedable_rng();
132+
let mut rng = StdRng::seed_from_u64(42);
132133
let d = Bernoulli::new(0.75).unwrap();
133134
(0..size).map(|_| d.sample(&mut rng)).collect::<Vec<_>>()
134135
},
@@ -141,7 +142,7 @@ fn bytes_bench(
141142
input,
142143
"0.5 true",
143144
{
144-
let mut rng = seedable_rng();
145+
let mut rng = StdRng::seed_from_u64(42);
145146
let d = Bernoulli::new(0.5).unwrap();
146147
(0..size).map(|_| d.sample(&mut rng)).collect::<Vec<_>>()
147148
},
@@ -154,7 +155,7 @@ fn bytes_bench(
154155
input,
155156
"0.25 true",
156157
{
157-
let mut rng = seedable_rng();
158+
let mut rng = StdRng::seed_from_u64(42);
158159
let d = Bernoulli::new(0.25).unwrap();
159160
(0..size).map(|_| d.sample(&mut rng)).collect::<Vec<_>>()
160161
},
@@ -236,7 +237,7 @@ fn bench_single_primitive<const NULLABLE: bool>(
236237
&input,
237238
"0.75 true",
238239
{
239-
let mut rng = seedable_rng();
240+
let mut rng = StdRng::seed_from_u64(42);
240241
let d = Bernoulli::new(0.75).unwrap();
241242
(0..size).map(|_| d.sample(&mut rng)).collect::<Vec<_>>()
242243
},
@@ -249,7 +250,7 @@ fn bench_single_primitive<const NULLABLE: bool>(
249250
&input,
250251
"0.5 true",
251252
{
252-
let mut rng = seedable_rng();
253+
let mut rng = StdRng::seed_from_u64(42);
253254
let d = Bernoulli::new(0.5).unwrap();
254255
(0..size).map(|_| d.sample(&mut rng)).collect::<Vec<_>>()
255256
},
@@ -262,7 +263,7 @@ fn bench_single_primitive<const NULLABLE: bool>(
262263
&input,
263264
"0.25 true",
264265
{
265-
let mut rng = seedable_rng();
266+
let mut rng = StdRng::seed_from_u64(42);
266267
let d = Bernoulli::new(0.25).unwrap();
267268
(0..size).map(|_| d.sample(&mut rng)).collect::<Vec<_>>()
268269
},

datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes_view.rs

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -173,10 +173,37 @@ impl<B: ByteViewType> ByteViewGroupValueBuilder<B> {
173173
self.views.extend(rows.iter().map(|&row| arr.views()[row]));
174174
} else {
175175
// Slow path: some strings are non-inline (>12 bytes).
176-
// Pre-reserve views capacity to avoid repeated reallocation.
176+
// Read views directly to avoid array.value(row) overhead and
177+
// reuse the source view's prefix instead of recomputing it via make_view.
177178
self.views.reserve(rows.len());
178179
for &row in rows {
179-
self.do_append_val_inner(arr, row);
180+
let view = arr.views()[row];
181+
let len = view as u32;
182+
if len <= 12 {
183+
// This row happens to be inline; copy view directly.
184+
self.views.push(view);
185+
} else {
186+
let src = ByteView::from(view);
187+
// ensure_in_progress_big_enough must be called before computing
188+
// new_buffer_index / new_offset — it may flush in_progress to completed.
189+
self.ensure_in_progress_big_enough(len as usize);
190+
let new_buffer_index = self.completed.len() as u32;
191+
let new_offset = self.in_progress.len() as u32;
192+
let src_buf = &arr.data_buffers()[src.buffer_index as usize];
193+
self.in_progress.extend_from_slice(
194+
&src_buf[src.offset as usize
195+
..(src.offset + src.length) as usize],
196+
);
197+
// Reuse prefix from the source view — avoids re-reading first 4 bytes.
198+
let new_view = ByteView {
199+
length: src.length,
200+
prefix: src.prefix,
201+
buffer_index: new_buffer_index,
202+
offset: new_offset,
203+
}
204+
.as_u128();
205+
self.views.push(new_view);
206+
}
180207
}
181208
}
182209
}

0 commit comments

Comments
 (0)