Skip to content

Commit e80d87f

Browse files
authored
Reduce Dictionary Builder Codegen (#3616)
* Reduce dictionary builder codegen
* Clippy
* Format
1 parent f78a9be commit e80d87f

File tree

3 files changed

+51
-36
lines changed

3 files changed

+51
-36
lines changed

arrow-array/src/builder/generic_bytes_dictionary_builder.rs

Lines changed: 21 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,10 @@ where
4040
state: ahash::RandomState,
4141
/// Used to provide a lookup from string value to key type
4242
///
43-
/// Note: K's hash implementation is not used, instead the raw entry
43+
/// Note: usize's hash implementation is not used, instead the raw entry
4444
/// API is used to store keys w.r.t the hash of the strings themselves
4545
///
46-
dedup: HashMap<K::Native, (), ()>,
46+
dedup: HashMap<usize, (), ()>,
4747

4848
keys_builder: PrimitiveBuilder<K>,
4949
values_builder: GenericByteBuilder<T>,
@@ -133,23 +133,22 @@ where
133133
let mut values_builder =
134134
GenericByteBuilder::<T>::with_capacity(dict_len, values_len);
135135

136+
K::Native::from_usize(dictionary_values.len())
137+
.ok_or(ArrowError::DictionaryKeyOverflowError)?;
138+
136139
for (idx, maybe_value) in dictionary_values.iter().enumerate() {
137140
match maybe_value {
138141
Some(value) => {
139142
let value_bytes: &[u8] = value.as_ref();
140143
let hash = state.hash_one(value_bytes);
141144

142-
let key = K::Native::from_usize(idx)
143-
.ok_or(ArrowError::DictionaryKeyOverflowError)?;
144-
145-
let entry =
146-
dedup.raw_entry_mut().from_hash(hash, |key: &K::Native| {
147-
value_bytes == get_bytes(&values_builder, key)
148-
});
145+
let entry = dedup.raw_entry_mut().from_hash(hash, |idx: &usize| {
146+
value_bytes == get_bytes(&values_builder, *idx)
147+
});
149148

150149
if let RawEntryMut::Vacant(v) = entry {
151-
v.insert_with_hasher(hash, key, (), |key| {
152-
state.hash_one(get_bytes(&values_builder, key))
150+
v.insert_with_hasher(hash, idx, (), |idx| {
151+
state.hash_one(get_bytes(&values_builder, *idx))
153152
});
154153
}
155154

@@ -233,21 +232,20 @@ where
233232
let entry = self
234233
.dedup
235234
.raw_entry_mut()
236-
.from_hash(hash, |key| value_bytes == get_bytes(storage, key));
235+
.from_hash(hash, |idx| value_bytes == get_bytes(storage, *idx));
237236

238237
let key = match entry {
239-
RawEntryMut::Occupied(entry) => *entry.into_key(),
238+
RawEntryMut::Occupied(entry) => K::Native::usize_as(*entry.into_key()),
240239
RawEntryMut::Vacant(entry) => {
241-
let index = storage.len();
240+
let idx = storage.len();
242241
storage.append_value(value);
243-
let key = K::Native::from_usize(index)
244-
.ok_or(ArrowError::DictionaryKeyOverflowError)?;
245-
246-
*entry
247-
.insert_with_hasher(hash, key, (), |key| {
248-
state.hash_one(get_bytes(storage, key))
249-
})
250-
.0
242+
243+
entry.insert_with_hasher(hash, idx, (), |idx| {
244+
state.hash_one(get_bytes(storage, *idx))
245+
});
246+
247+
K::Native::from_usize(idx)
248+
.ok_or(ArrowError::DictionaryKeyOverflowError)?
251249
}
252250
};
253251
self.keys_builder.append_value(key);
@@ -330,14 +328,10 @@ impl<K: ArrowDictionaryKeyType, T: ByteArrayType, V: AsRef<T::Native>> Extend<Op
330328
}
331329
}
332330

333-
fn get_bytes<'a, K: ArrowNativeType, T: ByteArrayType>(
334-
values: &'a GenericByteBuilder<T>,
335-
key: &K,
336-
) -> &'a [u8] {
331+
fn get_bytes<T: ByteArrayType>(values: &GenericByteBuilder<T>, idx: usize) -> &[u8] {
337332
let offsets = values.offsets_slice();
338333
let values = values.values_slice();
339334

340-
let idx = key.as_usize();
341335
let end_offset = offsets[idx + 1].as_usize();
342336
let start_offset = offsets[idx].as_usize();
343337

arrow-array/src/builder/primitive_dictionary_builder.rs

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ where
8686
{
8787
keys_builder: PrimitiveBuilder<K>,
8888
values_builder: PrimitiveBuilder<V>,
89-
map: HashMap<Value<V::Native>, K::Native>,
89+
map: HashMap<Value<V::Native>, usize>,
9090
}
9191

9292
impl<K, V> Default for PrimitiveDictionaryBuilder<K, V>
@@ -180,13 +180,13 @@ where
180180
let key = match self.map.entry(Value(value)) {
181181
Entry::Vacant(vacant) => {
182182
// Append new value.
183-
let key = K::Native::from_usize(self.values_builder.len())
184-
.ok_or(ArrowError::DictionaryKeyOverflowError)?;
183+
let key = self.values_builder.len();
185184
self.values_builder.append_value(value);
186185
vacant.insert(key);
187-
key
186+
K::Native::from_usize(key)
187+
.ok_or(ArrowError::DictionaryKeyOverflowError)?
188188
}
189-
Entry::Occupied(o) => *o.get(),
189+
Entry::Occupied(o) => K::Native::usize_as(*o.get()),
190190
};
191191

192192
self.keys_builder.append_value(key);
@@ -198,6 +198,7 @@ where
198198
/// # Panics
199199
///
200200
/// Panics if the resulting length of the dictionary values array would exceed `K::Native::MAX`
201+
#[inline]
201202
pub fn append_value(&mut self, value: V::Native) {
202203
self.append(value).expect("dictionary key overflow");
203204
}

arrow-buffer/src/native.rs

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ pub trait ArrowNativeType:
5858
/// [`as`]: https://doc.rust-lang.org/reference/expressions/operator-expr.html#numeric-cast
5959
fn as_usize(self) -> usize;
6060

61+
/// Convert from usize according to the [`as`] operator
62+
///
63+
/// [`as`]: https://doc.rust-lang.org/reference/expressions/operator-expr.html#numeric-cast
64+
fn usize_as(i: usize) -> Self;
65+
6166
/// Convert native type to usize.
6267
///
6368
/// Returns `None` if [`Self`] is not an integer or conversion would result
@@ -119,6 +124,12 @@ macro_rules! native_integer {
119124
self as _
120125
}
121126

127+
#[inline]
128+
fn usize_as(i: usize) -> Self {
129+
i as _
130+
}
131+
132+
122133
$(
123134
#[inline]
124135
fn $from(v: $t) -> Option<Self> {
@@ -140,7 +151,7 @@ native_integer!(u32);
140151
native_integer!(u64);
141152

142153
macro_rules! native_float {
143-
($t:ty, $s:ident, $as_usize: expr) => {
154+
($t:ty, $s:ident, $as_usize: expr, $i:ident, $usize_as: expr) => {
144155
impl private::Sealed for $t {}
145156
impl ArrowNativeType for $t {
146157
#[inline]
@@ -162,13 +173,18 @@ macro_rules! native_float {
162173
fn as_usize($s) -> usize {
163174
$as_usize
164175
}
176+
177+
#[inline]
178+
fn usize_as($i: usize) -> Self {
179+
$usize_as
180+
}
165181
}
166182
};
167183
}
168184

169-
native_float!(f16, self, self.to_f32() as _);
170-
native_float!(f32, self, self as _);
171-
native_float!(f64, self, self as _);
185+
native_float!(f16, self, self.to_f32() as _, i, f16::from_f32(i as _));
186+
native_float!(f32, self, self as _, i, i as _);
187+
native_float!(f64, self, self as _, i, i as _);
172188

173189
impl private::Sealed for i256 {}
174190
impl ArrowNativeType for i256 {
@@ -180,6 +196,10 @@ impl ArrowNativeType for i256 {
180196
self.to_parts().0 as usize
181197
}
182198

199+
fn usize_as(i: usize) -> Self {
200+
Self::from_parts(i as u128, 0)
201+
}
202+
183203
fn to_usize(self) -> Option<usize> {
184204
let (low, high) = self.to_parts();
185205
if high != 0 {

0 commit comments

Comments
 (0)