Skip to content

Commit 25ba429

Browse files
compheadgabotechs
authored andcommitted
feat: support array_compact builtin function (apache#21522)
## Which issue does this PR close? <!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes apache#123` indicates that this PR will close issue apache#123. --> - Closes #. ## Rationale for this change Create an `array_compact` function which removes NULLs from the input array. There is no direct counterpart in DuckDB; however, the function is available in Spark and Snowflake. <!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. --> ## What changes are included in this PR? <!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. --> ## Are these changes tested? <!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? --> ## Are there any user-facing changes? <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. --> <!-- If there are any breaking changes to public APIs, please add the `api change` label. --> (cherry picked from commit 26c6121)
1 parent 434957e commit 25ba429

4 files changed

Lines changed: 320 additions & 0 deletions

File tree

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! [`ScalarUDFImpl`] definitions for array_compact function.
19+
20+
use crate::utils::make_scalar_function;
21+
use arrow::array::{
22+
Array, ArrayRef, Capacities, GenericListArray, MutableArrayData, OffsetSizeTrait,
23+
make_array,
24+
};
25+
use arrow::buffer::OffsetBuffer;
26+
use arrow::datatypes::DataType;
27+
use arrow::datatypes::DataType::{LargeList, List, Null};
28+
use datafusion_common::cast::{as_large_list_array, as_list_array};
29+
use datafusion_common::{Result, exec_err, utils::take_function_args};
30+
use datafusion_expr::{
31+
ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
32+
Volatility,
33+
};
34+
use datafusion_macros::user_doc;
35+
use std::sync::Arc;
36+
37+
// Invokes the crate's `make_udf_expr_and_func!` macro for `ArrayCompact`.
// The crate root re-exports `array_compact::array_compact` and calls
// `array_compact::array_compact_udf()`, so this invocation is presumably what
// defines both of those items — confirm against the macro definition in `macros`.
make_udf_expr_and_func!(
38+
ArrayCompact,
39+
array_compact,
40+
array,
41+
"removes null values from the array.",
42+
array_compact_udf
43+
);
44+
45+
// Scalar UDF type behind the `array_compact` SQL function.
// The `user_doc` attribute below carries the user-facing description, syntax
// example, SQL example and argument text for the function.
#[user_doc(
46+
doc_section(label = "Array Functions"),
47+
description = "Removes null values from the array.",
48+
syntax_example = "array_compact(array)",
49+
sql_example = r#"```sql
50+
> select array_compact([1, NULL, 2, NULL, 3]) arr;
51+
+-----------+
52+
| arr |
53+
+-----------+
54+
| [1, 2, 3] |
55+
+-----------+
56+
```"#,
57+
argument(
58+
name = "array",
59+
description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
60+
)
61+
)]
62+
#[derive(Debug, PartialEq, Eq, Hash)]
63+
pub struct ArrayCompact {
64+
// Function signature; `new()` builds it with `Signature::array(Volatility::Immutable)`.
signature: Signature,
65+
// Alternative names for the function; `new()` sets this to `["list_compact"]`.
aliases: Vec<String>,
66+
}
67+
68+
impl Default for ArrayCompact {
69+
fn default() -> Self {
70+
Self::new()
71+
}
72+
}
73+
74+
impl ArrayCompact {
75+
pub fn new() -> Self {
76+
Self {
77+
signature: Signature::array(Volatility::Immutable),
78+
aliases: vec!["list_compact".to_string()],
79+
}
80+
}
81+
}
82+
83+
impl ScalarUDFImpl for ArrayCompact {
84+
fn name(&self) -> &str {
85+
"array_compact"
86+
}
87+
88+
fn signature(&self) -> &Signature {
89+
&self.signature
90+
}
91+
92+
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
93+
Ok(arg_types[0].clone())
94+
}
95+
96+
fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
97+
make_scalar_function(array_compact_inner)(&args.args)
98+
}
99+
100+
fn aliases(&self) -> &[String] {
101+
&self.aliases
102+
}
103+
104+
fn documentation(&self) -> Option<&Documentation> {
105+
self.doc()
106+
}
107+
}
108+
109+
/// array_compact SQL function
110+
fn array_compact_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
111+
let [input_array] = take_function_args("array_compact", arg)?;
112+
113+
match &input_array.data_type() {
114+
List(field) => {
115+
let array = as_list_array(input_array)?;
116+
compact_list::<i32>(array, field)
117+
}
118+
LargeList(field) => {
119+
let array = as_large_list_array(input_array)?;
120+
compact_list::<i64>(array, field)
121+
}
122+
Null => Ok(Arc::clone(input_array)),
123+
array_type => exec_err!("array_compact does not support type '{array_type}'."),
124+
}
125+
}
126+
127+
/// Remove null elements from each row of a list array.
128+
fn compact_list<O: OffsetSizeTrait>(
129+
list_array: &GenericListArray<O>,
130+
field: &Arc<arrow::datatypes::Field>,
131+
) -> Result<ArrayRef> {
132+
let values = list_array.values();
133+
134+
// Fast path: no nulls in values, return input unchanged
135+
if values.null_count() == 0 {
136+
return Ok(Arc::new(list_array.clone()));
137+
}
138+
139+
let original_data = values.to_data();
140+
let capacity = original_data.len() - values.null_count();
141+
let mut offsets = Vec::<O>::with_capacity(list_array.len() + 1);
142+
offsets.push(O::zero());
143+
let mut mutable = MutableArrayData::with_capacities(
144+
vec![&original_data],
145+
false,
146+
Capacities::Array(capacity),
147+
);
148+
149+
for row_index in 0..list_array.len() {
150+
if list_array.nulls().is_some_and(|n| n.is_null(row_index)) {
151+
offsets.push(offsets[row_index]);
152+
continue;
153+
}
154+
155+
let start = list_array.offsets()[row_index].as_usize();
156+
let end = list_array.offsets()[row_index + 1].as_usize();
157+
let mut copied = 0usize;
158+
159+
// Batch consecutive non-null elements into single extend() calls
160+
// to reduce per-element overhead. For [1, 2, NULL, 3, 4] this
161+
// produces 2 extend calls (0..2, 3..5) instead of 4 individual ones.
162+
let mut batch_start: Option<usize> = None;
163+
for i in start..end {
164+
if values.is_null(i) {
165+
// Null breaks the current batch — flush it
166+
if let Some(bs) = batch_start {
167+
mutable.extend(0, bs, i);
168+
copied += i - bs;
169+
batch_start = None;
170+
}
171+
} else if batch_start.is_none() {
172+
batch_start = Some(i);
173+
}
174+
}
175+
// Flush any remaining batch after the loop
176+
if let Some(bs) = batch_start {
177+
mutable.extend(0, bs, end);
178+
copied += end - bs;
179+
}
180+
181+
offsets.push(offsets[row_index] + O::usize_as(copied));
182+
}
183+
184+
let new_values = make_array(mutable.freeze());
185+
Ok(Arc::new(GenericListArray::<O>::try_new(
186+
Arc::clone(field),
187+
OffsetBuffer::new(offsets.into()),
188+
new_values,
189+
list_array.nulls().cloned(),
190+
)?))
191+
}

datafusion/functions-nested/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#[macro_use]
3838
pub mod macros;
3939

40+
pub mod array_compact;
4041
pub mod array_has;
4142
pub mod arrays_zip;
4243
pub mod cardinality;
@@ -77,6 +78,7 @@ use std::sync::Arc;
7778

7879
/// Fluent-style API for creating `Expr`s
7980
pub mod expr_fn {
81+
pub use super::array_compact::array_compact;
8082
pub use super::array_has::array_has;
8183
pub use super::array_has::array_has_all;
8284
pub use super::array_has::array_has_any;
@@ -128,6 +130,7 @@ pub mod expr_fn {
128130
/// Return all default nested type functions
129131
pub fn all_default_nested_functions() -> Vec<Arc<ScalarUDF>> {
130132
vec![
133+
array_compact::array_compact_udf(),
131134
string::array_to_string_udf(),
132135
string::string_to_array_udf(),
133136
range::range_udf(),

datafusion/sqllogictest/test_files/array.slt

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7182,6 +7182,99 @@ from array_distinct_table_2D_fixed;
71827182
[[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
71837183
[[5, 6], NULL]
71847184

7185+
## array_compact (aliases: `list_compact`)
7186+
7187+
# basic: remove nulls from integer array
7188+
query ?
7189+
select array_compact([1, NULL, 2, NULL, 3]);
7190+
----
7191+
[1, 2, 3]
7192+
7193+
# no nulls present
7194+
query ?
7195+
select array_compact([1, 2, 3]);
7196+
----
7197+
[1, 2, 3]
7198+
7199+
# all nulls
7200+
query ?
7201+
select array_compact(arrow_cast([NULL, NULL, NULL], 'List(Int64)'));
7202+
----
7203+
[]
7204+
7205+
# empty array
7206+
query ?
7207+
select array_compact([]);
7208+
----
7209+
[]
7210+
7211+
# NULL input returns NULL
7212+
query ?
7213+
select array_compact(NULL::INT[]);
7214+
----
7215+
NULL
7216+
7217+
# string array
7218+
query ?
7219+
select array_compact(['a', NULL, 'b', NULL, 'c']);
7220+
----
7221+
[a, b, c]
7222+
7223+
# float array
7224+
query ?
7225+
select array_compact([1.0, NULL, 2.0, NULL]);
7226+
----
7227+
[1.0, 2.0]
7228+
7229+
# nested array (2D)
7230+
query ?
7231+
select array_compact([make_array(1, 2), NULL, make_array(3, 4)]);
7232+
----
7233+
[[1, 2], [3, 4]]
7234+
7235+
# LargeList
7236+
query ?
7237+
select array_compact(arrow_cast([1, NULL, 2, NULL, 3], 'LargeList(Int64)'));
7238+
----
7239+
[1, 2, 3]
7240+
7241+
# alias list_compact
7242+
query ?
7243+
select list_compact([1, NULL, 2]);
7244+
----
7245+
[1, 2]
7246+
7247+
# table-based test
7248+
statement ok
7249+
CREATE TABLE array_compact_table AS VALUES
7250+
(make_array(1, NULL, 2, NULL, 3)),
7251+
(make_array(NULL, NULL, NULL)),
7252+
(make_array(4, 5, 6)),
7253+
(NULL::INT[])
7254+
;
7255+
7256+
query ?
7257+
select array_compact(column1) from array_compact_table;
7258+
----
7259+
[1, 2, 3]
7260+
[]
7261+
[4, 5, 6]
7262+
NULL
7263+
7264+
statement ok
7265+
DROP TABLE array_compact_table;
7266+
7267+
# FixedSizeList (coerced to List)
7268+
query ?
7269+
select array_compact(arrow_cast(make_array(1, NULL, 2, NULL, 3), 'FixedSizeList(5, Int64)'));
7270+
----
7271+
[1, 2, 3]
7272+
7273+
query ?
7274+
select array_compact(arrow_cast(make_array(NULL, NULL, NULL), 'FixedSizeList(3, Int64)'));
7275+
----
7276+
[]
7277+
71857278
## arrays_zip (aliases: `list_zip`)
71867279

71877280
# Spark example: arrays_zip(array(1, 2, 3), array(2, 3, 4))

docs/source/user-guide/sql/scalar_functions.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3143,6 +3143,7 @@ _Alias of [current_date](#current_date)._
31433143
- [array_any_value](#array_any_value)
31443144
- [array_append](#array_append)
31453145
- [array_cat](#array_cat)
3146+
- [array_compact](#array_compact)
31463147
- [array_concat](#array_concat)
31473148
- [array_contains](#array_contains)
31483149
- [array_dims](#array_dims)
@@ -3191,6 +3192,7 @@ _Alias of [current_date](#current_date)._
31913192
- [list_any_value](#list_any_value)
31923193
- [list_append](#list_append)
31933194
- [list_cat](#list_cat)
3195+
- [list_compact](#list_compact)
31943196
- [list_concat](#list_concat)
31953197
- [list_contains](#list_contains)
31963198
- [list_dims](#list_dims)
@@ -3297,6 +3299,33 @@ array_append(array, element)
32973299

32983300
_Alias of [array_concat](#array_concat)._
32993301

3302+
### `array_compact`
3303+
3304+
Removes null values from the array.
3305+
3306+
```sql
3307+
array_compact(array)
3308+
```
3309+
3310+
#### Arguments
3311+
3312+
- **array**: Array expression. Can be a constant, column, or function, and any combination of array operators.
3313+
3314+
#### Example
3315+
3316+
```sql
3317+
> select array_compact([1, NULL, 2, NULL, 3]) arr;
3318+
+-----------+
3319+
| arr |
3320+
+-----------+
3321+
| [1, 2, 3] |
3322+
+-----------+
3323+
```
3324+
3325+
#### Aliases
3326+
3327+
- list_compact
3328+
33003329
### `array_concat`
33013330

33023331
Concatenates arrays.
@@ -4421,6 +4450,10 @@ _Alias of [array_append](#array_append)._
44214450

44224451
_Alias of [array_concat](#array_concat)._
44234452

4453+
### `list_compact`
4454+
4455+
_Alias of [array_compact](#array_compact)._
4456+
44244457
### `list_concat`
44254458

44264459
_Alias of [array_concat](#array_concat)._

0 commit comments

Comments
 (0)