Skip to content

Commit 47e4b61

Browse files
authored
Use Typed Buffers in Arrays (#1811) (#1176) (#3743)
* Remove RawPtrBox (#1811) (#1176) * Clippy * Extract get_offsets function
1 parent ebe6f53 commit 47e4b61

File tree

17 files changed

+242
-261
lines changed

17 files changed

+242
-261
lines changed

arrow-array/src/array/boolean_array.rs

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
use crate::array::print_long_array;
1819
use crate::builder::BooleanBuilder;
1920
use crate::iterator::BooleanIter;
20-
use crate::raw_pointer::RawPtrBox;
21-
use crate::{print_long_array, Array, ArrayAccessor};
21+
use crate::{Array, ArrayAccessor};
2222
use arrow_buffer::{bit_util, Buffer, MutableBuffer};
2323
use arrow_data::bit_mask::combine_option_bitmap;
2424
use arrow_data::ArrayData;
@@ -67,9 +67,7 @@ use std::any::Any;
6767
#[derive(Clone)]
6868
pub struct BooleanArray {
6969
data: ArrayData,
70-
/// Pointer to the value array. The lifetime of this must be <= to the value buffer
71-
/// stored in `data`, so it's safe to store.
72-
raw_values: RawPtrBox<u8>,
70+
raw_values: Buffer,
7371
}
7472

7573
impl std::fmt::Debug for BooleanArray {
@@ -102,7 +100,7 @@ impl BooleanArray {
102100
///
103101
/// Note this doesn't take the offset of this array into account.
104102
pub fn values(&self) -> &Buffer {
105-
&self.data.buffers()[0]
103+
&self.raw_values
106104
}
107105

108106
/// Returns the number of non null, true values within this array
@@ -328,13 +326,8 @@ impl From<ArrayData> for BooleanArray {
328326
1,
329327
"BooleanArray data should contain a single buffer only (values buffer)"
330328
);
331-
let ptr = data.buffers()[0].as_ptr();
332-
Self {
333-
data,
334-
// SAFETY:
335-
// ArrayData must be valid, and validated data type above
336-
raw_values: unsafe { RawPtrBox::new(ptr) },
337-
}
329+
let raw_values = data.buffers()[0].clone();
330+
Self { data, raw_values }
338331
}
339332
}
340333

arrow-array/src/array/byte_array.rs

Lines changed: 17 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,14 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use crate::array::{empty_offsets, print_long_array};
18+
use crate::array::{get_offsets, print_long_array};
1919
use crate::builder::GenericByteBuilder;
2020
use crate::iterator::ArrayIter;
21-
use crate::raw_pointer::RawPtrBox;
2221
use crate::types::bytes::ByteArrayNativeType;
2322
use crate::types::ByteArrayType;
2423
use crate::{Array, ArrayAccessor, OffsetSizeTrait};
25-
use arrow_buffer::ArrowNativeType;
24+
use arrow_buffer::buffer::OffsetBuffer;
25+
use arrow_buffer::{ArrowNativeType, Buffer};
2626
use arrow_data::ArrayData;
2727
use arrow_schema::DataType;
2828
use std::any::Any;
@@ -39,16 +39,16 @@ use std::any::Any;
3939
/// [`LargeBinaryArray`]: crate::LargeBinaryArray
4040
pub struct GenericByteArray<T: ByteArrayType> {
4141
data: ArrayData,
42-
value_offsets: RawPtrBox<T::Offset>,
43-
value_data: RawPtrBox<u8>,
42+
value_offsets: OffsetBuffer<T::Offset>,
43+
value_data: Buffer,
4444
}
4545

4646
impl<T: ByteArrayType> Clone for GenericByteArray<T> {
4747
fn clone(&self) -> Self {
4848
Self {
4949
data: self.data.clone(),
50-
value_offsets: self.value_offsets,
51-
value_data: self.value_data,
50+
value_offsets: self.value_offsets.clone(),
51+
value_data: self.value_data.clone(),
5252
}
5353
}
5454
}
@@ -68,7 +68,7 @@ impl<T: ByteArrayType> GenericByteArray<T> {
6868

6969
/// Returns the raw value data
7070
pub fn value_data(&self) -> &[u8] {
71-
self.data.buffers()[1].as_slice()
71+
self.value_data.as_slice()
7272
}
7373

7474
/// Returns true if all data within this array is ASCII
@@ -82,15 +82,7 @@ impl<T: ByteArrayType> GenericByteArray<T> {
8282
/// Returns the offset values in the offsets buffer
8383
#[inline]
8484
pub fn value_offsets(&self) -> &[T::Offset] {
85-
// Soundness
86-
// pointer alignment & location is ensured by RawPtrBox
87-
// buffer bounds/offset is ensured by the ArrayData instance.
88-
unsafe {
89-
std::slice::from_raw_parts(
90-
self.value_offsets.as_ptr().add(self.data.offset()),
91-
self.len() + 1,
92-
)
93-
}
85+
&self.value_offsets
9486
}
9587

9688
/// Returns the element at index `i`
@@ -161,6 +153,8 @@ impl<T: ByteArrayType> GenericByteArray<T> {
161153
.slice_with_length(self.data.offset() * element_len, value_len * element_len);
162154

163155
drop(self.data);
156+
drop(self.value_data);
157+
drop(self.value_offsets);
164158

165159
let try_mutable_null_buffer = match null_bit_buffer {
166160
None => Ok(None),
@@ -280,18 +274,16 @@ impl<T: ByteArrayType> From<ArrayData> for GenericByteArray<T> {
280274
T::Offset::PREFIX,
281275
T::PREFIX,
282276
);
283-
// Handle case of empty offsets
284-
let offsets = match data.is_empty() && data.buffers()[0].is_empty() {
285-
true => empty_offsets::<T::Offset>().as_ptr() as *const _,
286-
false => data.buffers()[0].as_ptr(),
287-
};
288-
let values = data.buffers()[1].as_ptr();
277+
// SAFETY:
278+
// ArrayData is valid, and verified type above
279+
let value_offsets = unsafe { get_offsets(&data) };
280+
let value_data = data.buffers()[1].clone();
289281
Self {
290282
data,
291283
// SAFETY:
292284
// ArrayData must be valid, and validated data type above
293-
value_offsets: unsafe { RawPtrBox::new(offsets) },
294-
value_data: unsafe { RawPtrBox::new(values) },
285+
value_offsets,
286+
value_data,
295287
}
296288
}
297289
}

arrow-array/src/array/fixed_size_binary_array.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,9 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
use crate::array::print_long_array;
1819
use crate::iterator::FixedSizeBinaryIter;
19-
use crate::raw_pointer::RawPtrBox;
20-
use crate::{print_long_array, Array, ArrayAccessor, FixedSizeListArray};
20+
use crate::{Array, ArrayAccessor, FixedSizeListArray};
2121
use arrow_buffer::{bit_util, Buffer, MutableBuffer};
2222
use arrow_data::ArrayData;
2323
use arrow_schema::{ArrowError, DataType};
@@ -50,7 +50,7 @@ use std::any::Any;
5050
#[derive(Clone)]
5151
pub struct FixedSizeBinaryArray {
5252
data: ArrayData,
53-
value_data: RawPtrBox<u8>,
53+
value_data: Buffer,
5454
length: i32,
5555
}
5656

@@ -357,14 +357,14 @@ impl From<ArrayData> for FixedSizeBinaryArray {
357357
1,
358358
"FixedSizeBinaryArray data should contain 1 buffer only (values)"
359359
);
360-
let value_data = data.buffers()[0].as_ptr();
360+
let value_data = data.buffers()[0].clone();
361361
let length = match data.data_type() {
362362
DataType::FixedSizeBinary(len) => *len,
363363
_ => panic!("Expected data type to be FixedSizeBinary"),
364364
};
365365
Self {
366366
data,
367-
value_data: unsafe { RawPtrBox::new(value_data) },
367+
value_data,
368368
length,
369369
}
370370
}

arrow-array/src/array/fixed_size_list_array.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,9 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
use crate::array::print_long_array;
1819
use crate::builder::{FixedSizeListBuilder, PrimitiveBuilder};
19-
use crate::{
20-
make_array, print_long_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType,
21-
};
20+
use crate::{make_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType};
2221
use arrow_data::ArrayData;
2322
use arrow_schema::DataType;
2423
use std::any::Any;

arrow-array/src/array/list_array.rs

Lines changed: 11 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,12 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use crate::array::make_array;
18+
use crate::array::{get_offsets, make_array, print_long_array};
1919
use crate::builder::{GenericListBuilder, PrimitiveBuilder};
2020
use crate::{
21-
iterator::GenericListArrayIter, print_long_array, raw_pointer::RawPtrBox, Array,
22-
ArrayAccessor, ArrayRef, ArrowPrimitiveType,
21+
iterator::GenericListArrayIter, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType,
2322
};
23+
use arrow_buffer::buffer::OffsetBuffer;
2424
use arrow_buffer::ArrowNativeType;
2525
use arrow_data::ArrayData;
2626
use arrow_schema::{ArrowError, DataType, Field};
@@ -45,35 +45,24 @@ impl OffsetSizeTrait for i64 {
4545
const PREFIX: &'static str = "Large";
4646
}
4747

48-
/// Returns a slice of `OffsetSize` consisting of a single zero value
49-
#[inline]
50-
pub(crate) fn empty_offsets<OffsetSize: OffsetSizeTrait>() -> &'static [OffsetSize] {
51-
static OFFSET: &[i64] = &[0];
52-
// SAFETY:
53-
// OffsetSize is ArrowNativeType and is therefore trivially transmutable
54-
let (prefix, val, suffix) = unsafe { OFFSET.align_to::<OffsetSize>() };
55-
assert!(prefix.is_empty() && suffix.is_empty());
56-
val
57-
}
58-
5948
/// Generic struct for a variable-size list array.
6049
///
6150
/// Columnar format in Apache Arrow:
6251
/// <https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout>
6352
///
6453
/// For non generic lists, you may wish to consider using [`ListArray`] or [`LargeListArray`]`
65-
pub struct GenericListArray<OffsetSize> {
54+
pub struct GenericListArray<OffsetSize: OffsetSizeTrait> {
6655
data: ArrayData,
6756
values: ArrayRef,
68-
value_offsets: RawPtrBox<OffsetSize>,
57+
value_offsets: OffsetBuffer<OffsetSize>,
6958
}
7059

71-
impl<OffsetSize> Clone for GenericListArray<OffsetSize> {
60+
impl<OffsetSize: OffsetSizeTrait> Clone for GenericListArray<OffsetSize> {
7261
fn clone(&self) -> Self {
7362
Self {
7463
data: self.data.clone(),
7564
values: self.values.clone(),
76-
value_offsets: self.value_offsets,
65+
value_offsets: self.value_offsets.clone(),
7766
}
7867
}
7968
}
@@ -118,15 +107,7 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
118107
/// Returns the offset values in the offsets buffer
119108
#[inline]
120109
pub fn value_offsets(&self) -> &[OffsetSize] {
121-
// Soundness
122-
// pointer alignment & location is ensured by RawPtrBox
123-
// buffer bounds/offset is ensured by the ArrayData instance.
124-
unsafe {
125-
std::slice::from_raw_parts(
126-
self.value_offsets.as_ptr().add(self.data.offset()),
127-
self.len() + 1,
128-
)
129-
}
110+
&self.value_offsets
130111
}
131112

132113
/// Returns the length for value at index `i`.
@@ -242,15 +223,10 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
242223
}
243224

244225
let values = make_array(values);
245-
// Handle case of empty offsets
246-
let offsets = match data.is_empty() && data.buffers()[0].is_empty() {
247-
true => empty_offsets::<OffsetSize>().as_ptr() as *const _,
248-
false => data.buffers()[0].as_ptr(),
249-
};
250-
251226
// SAFETY:
252-
// Verified list type in call to `Self::get_type`
253-
let value_offsets = unsafe { RawPtrBox::new(offsets) };
227+
// ArrayData is valid, and verified type above
228+
let value_offsets = unsafe { get_offsets(&data) };
229+
254230
Ok(Self {
255231
data,
256232
values,

arrow-array/src/array/map_array.rs

Lines changed: 6 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use crate::raw_pointer::RawPtrBox;
19-
use crate::{make_array, print_long_array, Array, ArrayRef, StringArray, StructArray};
18+
use crate::array::{get_offsets, print_long_array};
19+
use crate::{make_array, Array, ArrayRef, StringArray, StructArray};
20+
use arrow_buffer::buffer::OffsetBuffer;
2021
use arrow_buffer::{ArrowNativeType, Buffer, ToByteSlice};
2122
use arrow_data::ArrayData;
2223
use arrow_schema::{ArrowError, DataType, Field};
@@ -38,7 +39,7 @@ pub struct MapArray {
3839
/// The second child of `entries`, the "values" of this MapArray
3940
values: ArrayRef,
4041
/// The start and end offsets of each entry
41-
value_offsets: RawPtrBox<i32>,
42+
value_offsets: OffsetBuffer<i32>,
4243
}
4344

4445
impl MapArray {
@@ -86,15 +87,7 @@ impl MapArray {
8687
/// Returns the offset values in the offsets buffer
8788
#[inline]
8889
pub fn value_offsets(&self) -> &[i32] {
89-
// Soundness
90-
// pointer alignment & location is ensured by RawPtrBox
91-
// buffer bounds/offset is ensured by the ArrayData instance.
92-
unsafe {
93-
std::slice::from_raw_parts(
94-
self.value_offsets.as_ptr().add(self.data.offset()),
95-
self.len() + 1,
96-
)
97-
}
90+
&self.value_offsets
9891
}
9992

10093
/// Returns the length for value at index `i`.
@@ -159,18 +152,10 @@ impl MapArray {
159152
let keys = make_array(entries.child_data()[0].clone());
160153
let values = make_array(entries.child_data()[1].clone());
161154
let entries = make_array(entries);
162-
let value_offsets = data.buffers()[0].as_ptr();
163155

164156
// SAFETY:
165157
// ArrayData is valid, and verified type above
166-
let value_offsets = unsafe { RawPtrBox::<i32>::new(value_offsets) };
167-
unsafe {
168-
if (*value_offsets.as_ptr().offset(0)) != 0 {
169-
return Err(ArrowError::InvalidArgumentError(String::from(
170-
"offsets do not start at zero",
171-
)));
172-
}
173-
}
158+
let value_offsets = unsafe { get_offsets(&data) };
174159

175160
Ok(Self {
176161
data,

arrow-array/src/array/mod.rs

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
mod binary_array;
2121

2222
use crate::types::*;
23+
use arrow_buffer::buffer::{OffsetBuffer, ScalarBuffer};
24+
use arrow_buffer::ArrowNativeType;
2325
use arrow_data::ArrayData;
2426
use arrow_schema::{DataType, IntervalUnit, TimeUnit};
2527
use std::any::Any;
@@ -636,8 +638,29 @@ pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef {
636638
make_array(ArrayData::new_null(data_type, length))
637639
}
638640

639-
// Helper function for printing potentially long arrays.
640-
pub(crate) fn print_long_array<A, F>(
641+
/// Helper function that gets offset from an [`ArrayData`]
642+
///
643+
/// # Safety
644+
///
645+
/// - ArrayData must contain a valid [`OffsetBuffer`] as its first buffer
646+
unsafe fn get_offsets<O: ArrowNativeType>(data: &ArrayData) -> OffsetBuffer<O> {
647+
match data.is_empty() && data.buffers()[0].is_empty() {
648+
true => OffsetBuffer::new_empty(),
649+
false => {
650+
let buffer = ScalarBuffer::new(
651+
data.buffers()[0].clone(),
652+
data.offset(),
653+
data.len() + 1,
654+
);
655+
// Safety:
656+
// ArrayData is valid
657+
unsafe { OffsetBuffer::new_unchecked(buffer) }
658+
}
659+
}
660+
}
661+
662+
/// Helper function for printing potentially long arrays.
663+
fn print_long_array<A, F>(
641664
array: &A,
642665
f: &mut std::fmt::Formatter,
643666
print_item: F,

0 commit comments

Comments
 (0)