Skip to content

Commit f7a4bc2

Browse files
authored
fix: preserve kernel column segments (delta-io#4164)
# Description Fix kernel to DataFusion column expression conversion to preserve exact `ColumnName` path segments. Fixes delta-io#4082 **Fix:** Use DataFusion `ident(...)` for the base column segment when converting `Expression::Column`, then `.field(...)` for remaining path segments. Preserves exact segment names, avoids SQL style normalization. # Related Issue(s) - closes delta-io#4082 <!--- For example: - closes #106 ---> # Documentation <!--- Share links to useful documentation ---> --------- Signed-off-by: Ethan Urbanski <ethan@urbanskitech.com>
1 parent 37e9e6d commit f7a4bc2

2 files changed

Lines changed: 65 additions & 5 deletions

File tree

crates/core/src/delta_datafusion/engine/expressions/to_datafusion.rs

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ use datafusion::common::{
77
use datafusion::functions::core::expr_ext::FieldAccessor;
88
use datafusion::functions::expr_fn::named_struct;
99
use datafusion::logical_expr::expr::ScalarFunction;
10-
use datafusion::logical_expr::{BinaryExpr, Expr, Operator, col, lit};
10+
use datafusion::logical_expr::{BinaryExpr, Expr, Operator, expr_fn::ident, lit};
1111
use datafusion::prelude::coalesce;
1212
use delta_kernel::Predicate;
1313
use delta_kernel::arrow::datatypes::{DataType as ArrowDataType, Field as ArrowField};
@@ -34,7 +34,11 @@ pub(crate) fn to_datafusion_expr(expr: &Expression, output_type: &DataType) -> D
3434
let base_name = name_iter
3535
.next()
3636
.ok_or_else(|| internal_datafusion_err!("Expected at least one column name"))?;
37-
Ok(name_iter.fold(col(base_name), |acc, n| acc.field(n)))
37+
// Kernel column paths are already exact segments. Using `ident` avoids DataFusion SQL
38+
// identifier normalization (e.g. lowercasing), and treats `.` as a literal character
39+
// rather than a qualifier separator. This is required for correctness with camelCase
40+
// and other non-normalized column names (see #4082).
41+
Ok(name_iter.fold(ident(base_name), |acc, n| acc.field(n)))
3842
}
3943
Expression::Predicate(expr) => predicate_to_df(expr, output_type),
4044
Expression::Struct(fields) => struct_to_df(fields, output_type),
@@ -222,7 +226,7 @@ fn struct_to_df(fields: &[Arc<Expression>], output_type: &DataType) -> DFResult<
222226
mod tests {
223227
use std::ops::Not;
224228

225-
use datafusion::logical_expr::{col, lit};
229+
use datafusion::logical_expr::{col, expr_fn::ident, lit};
226230
use delta_kernel::expressions::ColumnName;
227231
use delta_kernel::expressions::{ArrayData, BinaryExpression, MapData, Scalar, StructData};
228232
use delta_kernel::schema::{ArrayType, DataType, MapType, StructField, StructType};
@@ -381,11 +385,25 @@ mod tests {
381385
fn test_column_expression() {
382386
let expr = Expression::Column(ColumnName::new(["test_col"]));
383387
let result = to_datafusion_expr(&expr, &DataType::BOOLEAN).unwrap();
384-
assert_eq!(result, col("test_col"));
388+
assert_eq!(result, ident("test_col"));
385389

386390
let expr = Expression::Column(ColumnName::new(["test_col", "field_1", "field_2"]));
387391
let result = to_datafusion_expr(&expr, &DataType::BOOLEAN).unwrap();
388-
assert_eq!(result, col("test_col").field("field_1").field("field_2"));
392+
assert_eq!(result, ident("test_col").field("field_1").field("field_2"));
393+
}
394+
395+
#[test]
396+
fn test_column_expression_preserves_case() {
397+
let expr = Expression::Column(ColumnName::new(["submittedAt"]));
398+
let result = to_datafusion_expr(&expr, &DataType::BOOLEAN).unwrap();
399+
assert_eq!(result, ident("submittedAt"));
400+
}
401+
402+
#[test]
403+
fn test_column_expression_preserves_dots_in_segments() {
404+
let expr = Expression::Column(ColumnName::new(["a.b"]));
405+
let result = to_datafusion_expr(&expr, &DataType::BOOLEAN).unwrap();
406+
assert_eq!(result, ident("a.b"));
389407
}
390408

391409
/// Test various literal values:

python/tests/test_merge.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2117,6 +2117,48 @@ def test_merge_preserves_casing_in_quoted_identifiers(
21172117
assert result == expected
21182118

21192119

2120+
def test_merge_camelcase_non_nullable_column_4082(tmp_path: pathlib.Path):
2121+
# Regression test for https://github.com/delta-io/delta-rs/issues/4082
2122+
schema = Schema(
2123+
[
2124+
Field("submittedAt", PrimitiveType("long"), nullable=False),
2125+
Field("id", PrimitiveType("string"), nullable=True),
2126+
]
2127+
)
2128+
dt = DeltaTable.create(tmp_path, schema=schema, mode="ignore")
2129+
2130+
source_table = Table(
2131+
{
2132+
"submittedAt": Array(
2133+
[123],
2134+
ArrowField("submittedAt", type=DataType.int64(), nullable=True),
2135+
),
2136+
"id": Array(
2137+
["test"],
2138+
ArrowField("id", type=DataType.string_view(), nullable=True),
2139+
),
2140+
}
2141+
)
2142+
2143+
(
2144+
dt.merge(
2145+
source=source_table,
2146+
predicate="source.id = target.id",
2147+
source_alias="source",
2148+
target_alias="target",
2149+
)
2150+
.when_matched_update_all()
2151+
.when_not_matched_insert_all()
2152+
.execute()
2153+
)
2154+
2155+
assert dt.history(1)[0]["operation"] == "MERGE"
2156+
2157+
result = QueryBuilder().register("tbl", dt).execute("select * from tbl").read_all()
2158+
assert result["submittedAt"].to_pylist() == [123]
2159+
assert result["id"].to_pylist() == ["test"]
2160+
2161+
21202162
@pytest.mark.pandas
21212163
@pytest.mark.pyarrow
21222164
def test_struct_casting(tmp_path: pathlib.Path):

0 commit comments

Comments
 (0)