Skip to content

Commit b06996b

Browse files
petern48 and alamb authored
[Variant] [Shredding] Support typed_access for Utf8 and BinaryView (#8364)
# Which issue does this PR close?

We generally require a GitHub issue to be filed for all bug fixes and enhancements, and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax.

- Closes #8333

# Rationale for this change

See issue #8333.

# What changes are included in this PR?

Support typed_access for Utf8 and BinaryView.

# Are these changes tested?

Yes.

# Are there any user-facing changes?

N/A. Variant support is still in development.

---------

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 2ec77b5 commit b06996b

File tree

4 files changed

+143
-3
lines changed

4 files changed

+143
-3
lines changed

.github/workflows/parquet.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ on:
4242
- arrow-json/**
4343
- arrow-avro/**
4444
- parquet/**
45+
- parquet-variant/**
46+
- parquet-variant-compute/**
47+
- parquet-variant-json/**
4548
- .github/**
4649

4750
jobs:

parquet-variant-compute/src/variant_array.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -568,6 +568,16 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, '
568568
let value = array.value(index);
569569
Variant::from(value)
570570
}
571+
DataType::BinaryView => {
572+
let array = typed_value.as_binary_view();
573+
let value = array.value(index);
574+
Variant::from(value)
575+
}
576+
DataType::Utf8 => {
577+
let array = typed_value.as_string::<i32>();
578+
let value = array.value(index);
579+
Variant::from(value)
580+
}
571581
DataType::Int8 => {
572582
primitive_conversion_single_value!(Int8Type, typed_value, index)
573583
}

parquet-variant-compute/src/variant_get.rs

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -508,6 +508,40 @@ mod test {
508508
assert_eq!(result.value(3), Variant::from(&[4u8, 5u8, 6u8][..]));
509509
}
510510

511+
/// Partial shredding: read a Utf8-shredded variant array back as a VariantArray
///
/// The input comes from `partially_shredded_utf8_variant_array`: rows 0 and 3 carry
/// shredded strings in `typed_value`, row 1 is null, and row 2 carries the
/// unshredded string "n/a" in `value`.
#[test]
512+
fn get_variant_partially_shredded_utf8_as_variant() {
513+
let array = partially_shredded_utf8_variant_array();
514+
let options = GetOptions::new();
515+
let result = variant_get(&array, options).unwrap();
516+
517+
// Expect the result to be a VariantArray with one entry per input row
518+
let result: &VariantArray = result.as_any().downcast_ref().unwrap();
519+
assert_eq!(result.len(), 4);
520+
521+
// Expect every row to match the value that was placed in the input array,
// whether it was stored in `typed_value` (rows 0 and 3) or in `value` (row 2)
522+
assert_eq!(result.value(0), Variant::from("hello"));
523+
assert!(!result.is_valid(1));
524+
assert_eq!(result.value(2), Variant::from("n/a"));
525+
assert_eq!(result.value(3), Variant::from("world"));
526+
}
527+
528+
/// Partial shredding: read a BinaryView-shredded variant array back as a VariantArray
///
/// The input comes from `partially_shredded_binary_view_variant_array`: rows 0 and 3
/// carry shredded byte slices in `typed_value`, row 1 is null, and row 2 carries the
/// unshredded string "n/a" in `value`.
#[test]
529+
fn get_variant_partially_shredded_binary_view_as_variant() {
530+
let array = partially_shredded_binary_view_variant_array();
531+
let options = GetOptions::new();
532+
let result = variant_get(&array, options).unwrap();
533+
534+
// Expect the result to be a VariantArray with one entry per input row
535+
let result: &VariantArray = result.as_any().downcast_ref().unwrap();
536+
assert_eq!(result.len(), 4);
537+
538+
// Expect every row to match the value that was placed in the input array:
// binary for the shredded rows (0 and 3), a string for the unshredded row 2
539+
assert_eq!(result.value(0), Variant::from(&[1u8, 2u8, 3u8][..]));
540+
assert!(!result.is_valid(1));
541+
assert_eq!(result.value(2), Variant::from("n/a"));
542+
assert_eq!(result.value(3), Variant::from(&[4u8, 5u8, 6u8][..]));
543+
}
544+
511545
/// Shredding: extract a value as an Int32Array
512546
#[test]
513547
fn get_variant_shredded_int32_as_int32_safe_cast() {
@@ -1018,6 +1052,100 @@ mod test {
10181052
)
10191053
}
10201054

1055+
/// Return a VariantArray (wrapped in an `ArrayRef`) that represents a partially
/// "shredded" variant whose `typed_value` column is a Utf8 `StringArray`
///
/// The four rows are laid out as follows:
/// - row 0: shredded, `typed_value` is "hello" and `value` is null
/// - row 1: null in the struct's null buffer, `value` holds an empty, non-null placeholder
/// - row 2: not shredded, `value` holds the variant built from "n/a" and `typed_value` is null
/// - row 3: shredded, `typed_value` is "world" and `value` is null
1056+
fn partially_shredded_utf8_variant_array() -> ArrayRef {
1057+
let (metadata, string_value) = {
1058+
let mut builder = parquet_variant::VariantBuilder::new();
1059+
builder.append_value("n/a");
1060+
builder.finish()
1061+
};
1062+
1063+
// Create the null buffer for the overall array
1064+
let nulls = NullBuffer::from(vec![
1065+
true, // row 0 non null
1066+
false, // row 1 is null
1067+
true, // row 2 non null
1068+
true, // row 3 non null
1069+
]);
1070+
1071+
// metadata is the same for all rows
1072+
let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4));
1073+
1074+
// See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY
1075+
// about why row 1 is an empty, but non-null, value.
1076+
let values = BinaryViewArray::from(vec![
1077+
None, // row 0 is shredded, so no value
1078+
Some(b"" as &[u8]), // row 1 is null, so empty value
1079+
Some(&string_value), // copy the string value "n/a"
1080+
None, // row 3 is shredded, so no value
1081+
]);
1082+
1083+
let typed_value = StringArray::from(vec![
1084+
Some("hello"), // row 0 is shredded
1085+
None, // row 1 is null
1086+
None, // row 2 is not shredded, its string is stored in `value`
1087+
Some("world"), // row 3 is shredded
1088+
]);
1089+
1090+
let struct_array = StructArrayBuilder::new()
1091+
.with_field("metadata", Arc::new(metadata), true)
1092+
.with_field("typed_value", Arc::new(typed_value), true)
1093+
.with_field("value", Arc::new(values), true)
1094+
.with_nulls(nulls)
1095+
.build();
1096+
1097+
Arc::new(
1098+
VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"),
1099+
)
1100+
}
1101+
1102+
/// Return a VariantArray (wrapped in an `ArrayRef`) that represents a partially
/// "shredded" variant whose `typed_value` column is a `BinaryViewArray`
///
/// The four rows are laid out as follows:
/// - row 0: shredded, `typed_value` is the bytes [1, 2, 3] and `value` is null
/// - row 1: null in the struct's null buffer, `value` holds an empty, non-null placeholder
/// - row 2: not shredded, `value` holds the variant built from "n/a" and `typed_value` is null
/// - row 3: shredded, `typed_value` is the bytes [4, 5, 6] and `value` is null
1103+
fn partially_shredded_binary_view_variant_array() -> ArrayRef {
1104+
let (metadata, string_value) = {
1105+
let mut builder = parquet_variant::VariantBuilder::new();
1106+
builder.append_value("n/a");
1107+
builder.finish()
1108+
};
1109+
1110+
// Create the null buffer for the overall array
1111+
let nulls = NullBuffer::from(vec![
1112+
true, // row 0 non null
1113+
false, // row 1 is null
1114+
true, // row 2 non null
1115+
true, // row 3 non null
1116+
]);
1117+
1118+
// metadata is the same for all rows
1119+
let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4));
1120+
1121+
// See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY
1122+
// about why row 1 is an empty, but non-null, value.
1123+
let values = BinaryViewArray::from(vec![
1124+
None, // row 0 is shredded, so no value
1125+
Some(b"" as &[u8]), // row 1 is null, so empty value
1126+
Some(&string_value), // copy the string value "n/a"
1127+
None, // row 3 is shredded, so no value
1128+
]);
1129+
1130+
let typed_value = BinaryViewArray::from(vec![
1131+
Some(&[1u8, 2u8, 3u8][..]), // row 0 is shredded
1132+
None, // row 1 is null
1133+
None, // row 2 is a string, not binary, so it is stored in `value`
1134+
Some(&[4u8, 5u8, 6u8][..]), // row 3 is shredded
1135+
]);
1136+
1137+
let struct_array = StructArrayBuilder::new()
1138+
.with_field("metadata", Arc::new(metadata), true)
1139+
.with_field("typed_value", Arc::new(typed_value), true)
1140+
.with_field("value", Arc::new(values), true)
1141+
.with_nulls(nulls)
1142+
.build();
1143+
1144+
Arc::new(
1145+
VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"),
1146+
)
1147+
}
1148+
10211149
/// Return a VariantArray that represents an "all null" variant
10221150
/// for the following example (3 null values):
10231151
///

parquet/tests/variant_integration.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -119,9 +119,8 @@ variant_test_case!(26, "Unsupported typed_value type: Decimal128(18, 9)");
119119
variant_test_case!(27, "Unsupported typed_value type: Decimal128(18, 9)");
120120
variant_test_case!(28, "Unsupported typed_value type: Decimal128(38, 9)");
121121
variant_test_case!(29, "Unsupported typed_value type: Decimal128(38, 9)");
122-
// https://github.com/apache/arrow-rs/issues/8333
123-
variant_test_case!(30, "Unsupported typed_value type: BinaryView");
124-
variant_test_case!(31, "Unsupported typed_value type: Utf8");
122+
variant_test_case!(30);
123+
variant_test_case!(31);
125124
// https://github.com/apache/arrow-rs/issues/8334
126125
variant_test_case!(32, "Unsupported typed_value type: Time64(Microsecond)");
127126
// https://github.com/apache/arrow-rs/issues/8331

0 commit comments

Comments
 (0)