-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Add quote style to csv writer #9004
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -141,6 +141,66 @@ | |
| //! "name,comment\nAlice ,Great job! \nBob,Well done\nCharlie,Excellent \n" | ||
| //! ); | ||
| //! ``` | ||
| //! | ||
| //! # Quoting Styles | ||
| //! | ||
| //! The writer supports different quoting styles for fields, compatible with Apache Spark's | ||
| //! CSV options like `quoteAll`. You can control when fields are quoted using the | ||
| //! [`QuoteStyle`] enum. | ||
| //! | ||
| //! ## Available Quoting Styles | ||
| //! | ||
| //! - `QuoteStyle::Necessary` (default): Only quotes fields when necessary (e.g., when they | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is redundant with the existing documentation on |
||
| //! contain delimiters, quotes, or newlines) | ||
| //! - `QuoteStyle::Always`: Quotes all fields (equivalent to Spark's `quoteAll=true`) | ||
| //! - `QuoteStyle::NonNumeric`: Quotes only non-numeric fields | ||
| //! - `QuoteStyle::Never`: Never quotes fields (warning: can produce invalid CSV) | ||
| //! | ||
| //! ## Example with quoting styles | ||
| //! | ||
| //! ``` | ||
| //! # use arrow_array::*; | ||
| //! # use arrow_csv::{WriterBuilder, QuoteStyle}; | ||
| //! # use arrow_schema::*; | ||
| //! # use std::sync::Arc; | ||
| //! | ||
| //! let schema = Schema::new(vec![ | ||
| //! Field::new("product", DataType::Utf8, false), | ||
| //! Field::new("price", DataType::Float64, false), | ||
| //! ]); | ||
| //! | ||
| //! let product = StringArray::from(vec!["apple", "banana,organic", "cherry"]); | ||
| //! let price = Float64Array::from(vec![1.50, 2.25, 3.00]); | ||
| //! | ||
| //! let batch = RecordBatch::try_new( | ||
| //! Arc::new(schema), | ||
| //! vec![Arc::new(product), Arc::new(price)], | ||
| //! ) | ||
| //! .unwrap(); | ||
| //! | ||
| //! // Default behavior (QuoteStyle::Necessary) | ||
| //! let mut output = Vec::new(); | ||
| //! WriterBuilder::new() | ||
| //! .build(&mut output) | ||
| //! .write(&batch) | ||
| //! .unwrap(); | ||
| //! assert_eq!( | ||
| //! String::from_utf8(output).unwrap(), | ||
| //! "product,price\napple,1.5\n\"banana,organic\",2.25\ncherry,3.0\n" | ||
| //! ); | ||
| //! | ||
| //! // Quote all fields (Spark's quoteAll=true) | ||
| //! let mut output = Vec::new(); | ||
| //! WriterBuilder::new() | ||
| //! .with_quote_style(QuoteStyle::Always) | ||
| //! .build(&mut output) | ||
| //! .write(&batch) | ||
| //! .unwrap(); | ||
| //! assert_eq!( | ||
| //! String::from_utf8(output).unwrap(), | ||
| //! "\"product\",\"price\"\n\"apple\",\"1.5\"\n\"banana,organic\",\"2.25\"\n\"cherry\",\"3.0\"\n" | ||
| //! ); | ||
| //! ``` | ||
|
|
||
| use arrow_array::*; | ||
| use arrow_cast::display::*; | ||
|
|
@@ -151,6 +211,22 @@ use std::io::Write; | |
| use crate::map_csv_error; | ||
| const DEFAULT_NULL_VALUE: &str = ""; | ||
|
|
||
| /// The quoting style to use when writing CSV files. | ||
| /// | ||
| /// This type is re-exported from the `csv` crate and supports different | ||
| /// strategies for quoting fields. It is compatible with Apache Spark's | ||
| /// CSV options like `quoteAll`. | ||
| /// | ||
| /// # Example | ||
| /// | ||
| /// ``` | ||
| /// use arrow_csv::{WriterBuilder, QuoteStyle}; | ||
| /// | ||
| /// let builder = WriterBuilder::new() | ||
| /// .with_quote_style(QuoteStyle::Always); // Equivalent to Spark's quoteAll=true | ||
| /// ``` | ||
| pub use csv::QuoteStyle; | ||
|
|
||
| /// A CSV writer | ||
| #[derive(Debug)] | ||
| pub struct Writer<W: Write> { | ||
|
|
@@ -324,6 +400,8 @@ pub struct WriterBuilder { | |
| ignore_leading_whitespace: bool, | ||
| /// Whether to ignore trailing whitespace in string values. Defaults to `false` | ||
| ignore_trailing_whitespace: bool, | ||
| /// The quoting style to use. Defaults to `QuoteStyle::Necessary` | ||
| quote_style: QuoteStyle, | ||
| } | ||
|
|
||
| impl Default for WriterBuilder { | ||
|
|
@@ -342,6 +420,7 @@ impl Default for WriterBuilder { | |
| null_value: None, | ||
| ignore_leading_whitespace: false, | ||
| ignore_trailing_whitespace: false, | ||
| quote_style: QuoteStyle::default(), | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -528,12 +607,38 @@ impl WriterBuilder { | |
| self.ignore_trailing_whitespace | ||
| } | ||
|
|
||
| /// Set the quoting style for writing CSV files | ||
| /// | ||
| /// # Example | ||
| /// | ||
| /// ``` | ||
| /// use arrow_csv::{WriterBuilder, QuoteStyle}; | ||
| /// | ||
| /// // Quote all fields (equivalent to Spark's quoteAll=true) | ||
| /// let builder = WriterBuilder::new() | ||
| /// .with_quote_style(QuoteStyle::Always); | ||
| /// | ||
| /// // Only quote when necessary (default) | ||
| /// let builder = WriterBuilder::new() | ||
| /// .with_quote_style(QuoteStyle::Necessary); | ||
| /// ``` | ||
| pub fn with_quote_style(mut self, quote_style: QuoteStyle) -> Self { | ||
| self.quote_style = quote_style; | ||
| self | ||
| } | ||
|
|
||
| /// Get the configured quoting style | ||
| pub fn quote_style(&self) -> QuoteStyle { | ||
| self.quote_style | ||
| } | ||
|
|
||
| /// Create a new `Writer` | ||
| pub fn build<W: Write>(self, writer: W) -> Writer<W> { | ||
| let mut builder = csv::WriterBuilder::new(); | ||
| let writer = builder | ||
| .delimiter(self.delimiter) | ||
| .quote(self.quote) | ||
| .quote_style(self.quote_style) | ||
| .double_quote(self.double_quote) | ||
| .escape(self.escape) | ||
| .from_writer(writer); | ||
|
|
@@ -1181,4 +1286,106 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo | |
| String::from_utf8(buf).unwrap() | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_write_csv_quote_style() { | ||
| let schema = Schema::new(vec![ | ||
| Field::new("text", DataType::Utf8, false), | ||
| Field::new("number", DataType::Int32, false), | ||
| Field::new("float", DataType::Float64, false), | ||
| ]); | ||
|
|
||
| let text = StringArray::from(vec!["hello", "world", "comma,value", "quote\"test"]); | ||
| let number = Int32Array::from(vec![1, 2, 3, 4]); | ||
| let float = Float64Array::from(vec![1.1, 2.2, 3.3, 4.4]); | ||
|
|
||
| let batch = RecordBatch::try_new( | ||
| Arc::new(schema), | ||
| vec![Arc::new(text), Arc::new(number), Arc::new(float)], | ||
| ) | ||
| .unwrap(); | ||
|
|
||
| // Test with QuoteStyle::Necessary (default) | ||
| let mut buf = Vec::new(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you please reduce some of this boiler plate code? It is obscuring what is going on in the tests -- maybe a helper so these tests end up looking something like this: assert_eq!(
"text,number,float\nhello,1,1.1\nworld,2,2.2\n\"comma,value\",3,3.3\n\"quote\"\"test\",4,4.4\n",
write_quote_style(batch, QuoteStyle::Necessary
) |
||
| let builder = WriterBuilder::new().with_quote_style(QuoteStyle::Necessary); | ||
| let mut writer = builder.build(&mut buf); | ||
| writer.write(&batch).unwrap(); | ||
| drop(writer); | ||
| assert_eq!( | ||
| "text,number,float\nhello,1,1.1\nworld,2,2.2\n\"comma,value\",3,3.3\n\"quote\"\"test\",4,4.4\n", | ||
| String::from_utf8(buf).unwrap() | ||
| ); | ||
|
|
||
| // Test with QuoteStyle::Always (equivalent to Spark's quoteAll=true) | ||
| let mut buf = Vec::new(); | ||
| let builder = WriterBuilder::new().with_quote_style(QuoteStyle::Always); | ||
| let mut writer = builder.build(&mut buf); | ||
| writer.write(&batch).unwrap(); | ||
| drop(writer); | ||
| assert_eq!( | ||
| "\"text\",\"number\",\"float\"\n\"hello\",\"1\",\"1.1\"\n\"world\",\"2\",\"2.2\"\n\"comma,value\",\"3\",\"3.3\"\n\"quote\"\"test\",\"4\",\"4.4\"\n", | ||
| String::from_utf8(buf).unwrap() | ||
| ); | ||
|
|
||
| // Test with QuoteStyle::NonNumeric | ||
| let mut buf = Vec::new(); | ||
| let builder = WriterBuilder::new().with_quote_style(QuoteStyle::NonNumeric); | ||
| let mut writer = builder.build(&mut buf); | ||
| writer.write(&batch).unwrap(); | ||
| drop(writer); | ||
| assert_eq!( | ||
| "\"text\",\"number\",\"float\"\n\"hello\",1,1.1\n\"world\",2,2.2\n\"comma,value\",3,3.3\n\"quote\"\"test\",4,4.4\n", | ||
| String::from_utf8(buf).unwrap() | ||
| ); | ||
|
|
||
| // Test with QuoteStyle::Never (warning: can produce invalid CSV) | ||
| let mut buf = Vec::new(); | ||
| let builder = WriterBuilder::new().with_quote_style(QuoteStyle::Never); | ||
| let mut writer = builder.build(&mut buf); | ||
| writer.write(&batch).unwrap(); | ||
| drop(writer); | ||
| // Note: This produces invalid CSV for fields with commas or quotes | ||
| assert_eq!( | ||
| "text,number,float\nhello,1,1.1\nworld,2,2.2\ncomma,value,3,3.3\nquote\"test,4,4.4\n", | ||
| String::from_utf8(buf).unwrap() | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_write_csv_quote_style_with_nulls() { | ||
| let schema = Schema::new(vec![ | ||
| Field::new("text", DataType::Utf8, true), | ||
| Field::new("number", DataType::Int32, true), | ||
| ]); | ||
|
|
||
| let text = StringArray::from(vec![Some("hello"), None, Some("world")]); | ||
| let number = Int32Array::from(vec![Some(1), Some(2), None]); | ||
|
|
||
| let batch = | ||
| RecordBatch::try_new(Arc::new(schema), vec![Arc::new(text), Arc::new(number)]).unwrap(); | ||
|
|
||
| // Test with QuoteStyle::Always | ||
| let mut buf = Vec::new(); | ||
| let builder = WriterBuilder::new().with_quote_style(QuoteStyle::Always); | ||
| let mut writer = builder.build(&mut buf); | ||
| writer.write(&batch).unwrap(); | ||
| drop(writer); | ||
| assert_eq!( | ||
| "\"text\",\"number\"\n\"hello\",\"1\"\n\"\",\"2\"\n\"world\",\"\"\n", | ||
| String::from_utf8(buf).unwrap() | ||
| ); | ||
|
|
||
| // Test with QuoteStyle::Always and custom null value | ||
| let mut buf = Vec::new(); | ||
| let builder = WriterBuilder::new() | ||
| .with_quote_style(QuoteStyle::Always) | ||
| .with_null("NULL".to_string()); | ||
| let mut writer = builder.build(&mut buf); | ||
| writer.write(&batch).unwrap(); | ||
| drop(writer); | ||
| assert_eq!( | ||
| "\"text\",\"number\"\n\"hello\",\"1\"\n\"NULL\",\"2\"\n\"world\",\"NULL\"\n", | ||
| String::from_utf8(buf).unwrap() | ||
| ); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is publicly exporting something from the csv crate (which I think is good): https://docs.rs/csv/latest/csv/enum.QuoteStyle.html