Skip to content

Commit 6c11afe

Browse files
committed
Revert "refactor: use upstream inline_key_fast (apache#17044)"
This reverts commit 71b92bc.
1 parent e00f508 commit 6c11afe

1 file changed

Lines changed: 212 additions & 2 deletions

File tree

datafusion/physical-plan/src/sorts/cursor.rs

Lines changed: 212 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,120 @@ impl CursorArray for StringViewArray {
289289
}
290290
}
291291

292+
/// TODO: use the upstream arrow-rs API once
/// <https://github.com/apache/arrow-rs/pull/7748> and
/// <https://github.com/apache/arrow-rs/pull/7875> are released.
///
/// Builds a 128-bit composite key for an inline value:
///
/// - High 96 bits: the inline data in big-endian byte order (for correct lexicographical sorting).
/// - Low 32 bits: the length in big-endian byte order, acting as a tiebreaker so shorter strings
///   (or those with fewer meaningful bytes) always numerically sort before longer ones.
///
/// This function extracts the length and the 12-byte inline string data from the raw
/// little-endian `u128` representation, converts them to big-endian ordering, and packs them
/// into a single `u128` value suitable for fast, branchless comparisons.
///
/// # Why include length?
///
/// A pure 96-bit content comparison can't distinguish between two values whose inline bytes
/// compare equal—either because one is a true prefix of the other or because zero-padding
/// hides extra bytes. By tucking the 32-bit length into the lower bits, a single `u128` compare
/// handles both content and length in one go.
///
/// Example: comparing "bar" (3 bytes) vs "bar\0" (4 bytes)
///
/// | String     | Bytes 0–4 (length LE) | Bytes 4–16 (data + padding) |
/// |------------|-----------------------|-----------------------------|
/// | `"bar"`    | `03 00 00 00`         | `62 61 72` + 9 × `00`       |
/// | `"bar\0"`  | `04 00 00 00`         | `62 61 72 00` + 8 × `00`    |
///
/// Both inline parts become `62 61 72 00…00`, so they tie on content. The length field
/// (the low 32 bits of the key — the inline data occupies the *high* 96 bits) then
/// differentiates:
///
/// ```text
/// key("bar")   = 0x62617200000000000000000000000003
/// key("bar\0") = 0x62617200000000000000000000000004
/// ⇒ key("bar") < key("bar\0")
/// ```
///
/// # Inlining and Endianness
///
/// The raw `u128` of an inline view stores the length in its low 32 bits (LE bytes 0..4)
/// and the inline data in bytes 4..16. We copy the 12 data bytes — in their original
/// order — into the front of a 16-byte buffer, append the length as big-endian bytes,
/// and decode the whole buffer with `u128::from_be_bytes`, which maps `buf[0]` to the
/// most significant byte and `buf[15]` to the least significant. The resulting integer
/// therefore orders exactly like "inline bytes, then length" under plain `<`/`>`,
/// because big-endian byte order coincides with lexicographical byte order.
#[inline(always)]
pub fn inline_key_fast(raw: u128) -> u128 {
    // 1. Decompose `raw` into little-endian bytes:
    //    - raw_bytes[0..4]  = length in LE
    //    - raw_bytes[4..16] = inline string data
    let raw_bytes = raw.to_le_bytes();

    // 2. Numerically truncate to get the low 32-bit length (endianness-free).
    let length = raw as u32;

    // 3. Build a 16-byte buffer that reads big-endian:
    //    - buf[0..12]  = inline string bytes (original order)
    //    - buf[12..16] = length serialized big-endian, so keys with identical
    //      inline content are ordered by length.
    let mut buf = [0u8; 16];
    buf[0..12].copy_from_slice(&raw_bytes[4..16]); // inline data
    buf[12..16].copy_from_slice(&length.to_be_bytes()); // length in BE

    // 4. Decode as big-endian: buf[0] becomes the most significant byte
    //    (bits 127..120) and buf[15] the least significant (bits 7..0), so a
    //    single integer comparison of two keys is equivalent to comparing the
    //    inline bytes lexicographically, then the lengths.
    u128::from_be_bytes(buf)
}
405+
292406
impl CursorValues for StringViewArray {
293407
fn len(&self) -> usize {
294408
self.views().len()
@@ -346,8 +460,7 @@ impl CursorValues for StringViewArray {
346460
if l.data_buffers().is_empty() && r.data_buffers().is_empty() {
347461
let l_view = unsafe { l.views().get_unchecked(l_idx) };
348462
let r_view = unsafe { r.views().get_unchecked(r_idx) };
349-
return StringViewArray::inline_key_fast(*l_view)
350-
.cmp(&StringViewArray::inline_key_fast(*r_view));
463+
return inline_key_fast(*l_view).cmp(&inline_key_fast(*r_view));
351464
}
352465

353466
unsafe { GenericByteViewArray::compare_unchecked(l, l_idx, r, r_idx) }
@@ -442,6 +555,7 @@ impl<T: CursorValues> CursorValues for ArrayValues<T> {
442555

443556
#[cfg(test)]
444557
mod tests {
558+
use arrow::array::GenericBinaryArray;
445559
use datafusion_execution::memory_pool::{
446560
GreedyMemoryPool, MemoryConsumer, MemoryPool,
447561
};
@@ -606,4 +720,100 @@ mod tests {
606720
b.advance();
607721
assert_eq!(a.cmp(&b), Ordering::Less);
608722
}
723+
724+
/// Integration tests for `inline_key_fast` covering:
///
/// 1. Monotonic ordering across increasing lengths and lexical variations.
/// 2. Cross-check against `GenericBinaryArray` comparison to ensure semantic equivalence.
///
/// Includes the "bar" vs. "bar\0" case (length tiebreaker even when all inline
/// bytes fit in 12 bytes) and inputs that would expose byte-order-reversal bugs.
/// The test asserts that keys produced by `inline_key_fast` sort exactly like the
/// raw byte arrays do lexicographically.
#[test]
fn test_inline_key_fast_various_lengths_and_lexical() {
    /// Builds the raw `u128` of an inline ByteView:
    /// - `length`: number of meaningful bytes (must be ≤ 12)
    /// - `data`: the actual inline data bytes
    ///
    /// Layout: bytes 0..4 hold the length little-endian, bytes 4..16 hold the
    /// (unpadded) inline string data.
    fn make_raw_inline(length: u32, data: &[u8]) -> u128 {
        assert!(length as usize <= 12, "Inline length must be ≤ 12");
        assert!(
            length as usize == data.len(),
            "Data length must match `length`"
        );

        let mut bytes = [0u8; 16];
        bytes[..4].copy_from_slice(&length.to_le_bytes()); // little-endian length
        bytes[4..4 + data.len()].copy_from_slice(data); // inline payload
        u128::from_le_bytes(bytes)
    }

    // Inputs in expected ascending order: assorted lengths and lexical
    // variations, plus targeted cases for byte order and length tie-breaking.
    let test_inputs: Vec<&[u8]> = vec![
        b"a",
        b"aa",
        b"aaa",
        b"aab",
        b"abcd",
        b"abcde",
        b"abcdef",
        b"abcdefg",
        b"abcdefgh",
        b"abcdefghi",
        b"abcdefghij",
        b"abcdefghijk",
        b"abcdefghijkl",
        // Byte-order reversal regression: without the fix, "backend one"
        // would compare as "eno dnekcab" and sort wrongly vs "backend two".
        b"backend one",
        b"backend two",
        // Length tiebreaker: "bar" (3 bytes) and "bar\0" (4 bytes) share
        // identical inline data, so only length separates them.
        b"bar",
        b"bar\0",
        // Same-prefix lexical and length tie-breaking, in correct lex order:
        b"than12Byt",
        b"than12Bytes",
        b"than12Bytes\0",
        b"than12Bytesx",
        b"than12Bytex",
        b"than12Bytez",
        // Additional lexical tests
        b"xyy",
        b"xyz",
        b"xza",
    ];

    // Round-trip the inputs through a GenericBinaryArray so we cross-check
    // against arrow's own value accessors and byte-slice ordering.
    let values: Vec<Option<&[u8]>> = test_inputs.iter().map(|s| Some(*s)).collect();
    let array: GenericBinaryArray<i32> = GenericBinaryArray::from(values);

    for i in 1..array.len() {
        let v1 = array.value(i - 1);
        let v2 = array.value(i);

        // The raw byte slices must already be in strictly ascending lex order.
        assert!(v1 < v2, "Array compare failed: {v1:?} !< {v2:?}");

        // The composite keys must reproduce exactly that ordering.
        let key1 = inline_key_fast(make_raw_inline(v1.len() as u32, v1));
        let key2 = inline_key_fast(make_raw_inline(v2.len() as u32, v2));

        assert!(
            key1 < key2,
            "Key compare failed: key({v1:?})=0x{key1:032x} !< key({v2:?})=0x{key2:032x}",
        );
    }
}
609819
}

0 commit comments

Comments
 (0)