Skip to content

Commit 03475c7

Browse files
committed
Always quote CSVs coming out of lambda-to-csv
Digging deeper into load failures, I discovered that while arrow-csv and other Rust CSV parsers can handle some characters properly, some older RDBMS and C-based CSV parsers cannot handle them without quoting. This change relies on a backport of apache/datafusion#20813 to the 52.4.0 release. It also requires arrow-csv 57.2.0 or later, since that is the first release in which `QuoteStyle` exists.
1 parent 76cb611 commit 03475c7

3 files changed

Lines changed: 13 additions & 3 deletions

File tree

Cargo.toml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ members = [
77
resolver = "3"
88

99
[workspace.package]
10-
version = "1.8.7"
10+
version = "1.8.8"
1111
edition = "2024"
1212
keywords = ["deltalake", "parquet", "lambda", "delta", "sqs"]
1313
homepage = "https://github.com/buoyant-data/oxbow"
@@ -22,6 +22,7 @@ aws_lambda_events = { version = "0.15", default-features = false, features = ["s
2222
# The datafusion feature is required to support invariants which may be in error, but is required as of currently released 0.18.2
2323
deltalake = { version = "0.31.1", features = ["s3", "json", "datafusion"] }
2424
#deltalake = { git = "https://github.com/ethan-tyler/delta-rs", branch = "fix/meta-only-count-stats", features = ["s3", "json", "datafusion"]}
25+
#deltalake = { git = "https://github.com/ethan-tyler/delta-rs", branch = "fix/meta-only-count-stats", features = ["s3", "json", "datafusion"]}
2526
#deltalake = { path = "../../delta-io/delta-rs/crates/deltalake", features = ["s3", "json", "datafusion"]}
2627
futures = { version = "0.3" }
2728
tokio = { version = "=1", features = ["macros"] }
@@ -32,6 +33,14 @@ tracing = { version = "0.1", features = ["log"] }
3233
tracing-subscriber = { version = "0.3", default-features = false, features = ["fmt", "env-filter", "tracing-log"] }
3334
url = { version = "2.3", features = ["serde"] }
3435

36+
[patch.crates-io]
37+
datafusion = { git = "https://github.com/buoyant-data/datafusion", branch = "buoyant/release-52.4.1" }
38+
datafusion-datasource = { git = "https://github.com/buoyant-data/datafusion", branch = "buoyant/release-52.4.1" }
39+
datafusion-physical-expr-adapter = { git = "https://github.com/buoyant-data/datafusion", branch = "buoyant/release-52.4.1" }
40+
datafusion-ffi = { git = "https://github.com/buoyant-data/datafusion", branch = "buoyant/release-52.4.1" }
41+
datafusion-proto = { git = "https://github.com/buoyant-data/datafusion", branch = "buoyant/release-52.4.1" }
42+
43+
3544
[profile.release]
3645
panic = "abort"
3746
lto = true

lambdas/cdf-to-csv/src/main.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
//! produce CSV files to ingest into Aurora for Change Data Feeds
44
55
use aws_lambda_events::event::sqs::SqsEvent;
6+
use deltalake::datafusion::common::parsers::CsvQuoteStyle;
67
use deltalake::datafusion::config::CsvOptions;
78
use deltalake::datafusion::dataframe::DataFrameWriteOptions;
89
use deltalake::datafusion::prelude::*;
@@ -215,6 +216,7 @@ fn escape_dataframe(input: DataFrame) -> DeltaResult<DataFrame> {
215216
fn csv_options() -> CsvOptions {
216217
CsvOptions {
217218
null_value: std::env::var("CSV_NULL_CHARACTER").ok(),
219+
quote_style: CsvQuoteStyle::Always,
218220
timestamp_format: std::env::var("CSV_TIMESTAMP_FORMAT").ok(),
219221
timestamp_tz_format: std::env::var("CSV_TIMESTAMP_TZ_FORMAT").ok(),
220222
..Default::default()
@@ -384,7 +386,6 @@ mod tests {
384386
)
385387
.await?;
386388
let written = df.clone();
387-
let written_schema = written.schema();
388389
df = escape_dataframe(df)?;
389390

390391
df.write_csv(
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
{"id": 209, "other": 149, "rating" : 1.00, "boo" : true, "r" : "TRUST ME BRO\the sources are \"my friend once...\" \"my friend's daughter...\"\nbook full of chicken! as a pilot, break dancing was simply awfull\n\nper quanto" }
1+
{"id": 209, "other": 149, "r" : "TRUST ME BRO\the sources are \"my friend once...\" \"my friend's daughter...\"\nbook full of chicken! as a pilot, break dancing was simply awfull\n\nper quanto :-\\", "rating" : 1.00, "boo" : true }
22
{"id": 209, "other": 149, "rating" : 1.00, "boo" : false, "r" : "TRUST ME BRO\the sources are \"my friend once...\" \"my friend's daughter...\"\nbook full of chicken! as a pilot, break dancing was simply awfull\n\nper quanto" }
33
{"id": 209, "other": 149, "rating" : 1.00, "boo" : null, "r" : "TRUST ME BRO\the sources are \"my friend once...\" \"my friend's daughter...\"\nbook full of chicken! as a pilot, break dancing was simply awfull\n\nper quanto" }

0 commit comments

Comments
 (0)