Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions arrow-csv/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ pub mod writer;
pub use self::reader::Reader;
pub use self::reader::ReaderBuilder;
pub use self::reader::infer_schema_from_files;
pub use self::writer::QuoteStyle;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is publicly exporting something from the csv crate (which I think is good): https://docs.rs/csv/latest/csv/enum.QuoteStyle.html

pub use self::writer::Writer;
pub use self::writer::WriterBuilder;
use arrow_schema::ArrowError;
Expand Down
207 changes: 207 additions & 0 deletions arrow-csv/src/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,66 @@
//! "name,comment\nAlice ,Great job! \nBob,Well done\nCharlie,Excellent \n"
//! );
//! ```
//!
//! # Quoting Styles
//!
//! The writer supports different quoting styles for fields, compatible with Apache Spark's
//! CSV options like `quoteAll`. You can control when fields are quoted using the
//! [`QuoteStyle`] enum.
//!
//! ## Available Quoting Styles
//!
//! - `QuoteStyle::Necessary` (default): Only quotes fields when necessary (e.g., when they
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is redundant with the existing documentation on QuoteStyle -- the link above to QuoteStyle is probably fine

//! contain delimiters, quotes, or newlines)
//! - `QuoteStyle::Always`: Quotes all fields (equivalent to Spark's `quoteAll=true`)
//! - `QuoteStyle::NonNumeric`: Quotes only non-numeric fields
//! - `QuoteStyle::Never`: Never quotes fields (warning: can produce invalid CSV)
//!
//! ## Example with quoting styles
//!
//! ```
//! # use arrow_array::*;
//! # use arrow_csv::{WriterBuilder, QuoteStyle};
//! # use arrow_schema::*;
//! # use std::sync::Arc;
//!
//! let schema = Schema::new(vec![
//! Field::new("product", DataType::Utf8, false),
//! Field::new("price", DataType::Float64, false),
//! ]);
//!
//! let product = StringArray::from(vec!["apple", "banana,organic", "cherry"]);
//! let price = Float64Array::from(vec![1.50, 2.25, 3.00]);
//!
//! let batch = RecordBatch::try_new(
//! Arc::new(schema),
//! vec![Arc::new(product), Arc::new(price)],
//! )
//! .unwrap();
//!
//! // Default behavior (QuoteStyle::Necessary)
//! let mut output = Vec::new();
//! WriterBuilder::new()
//! .build(&mut output)
//! .write(&batch)
//! .unwrap();
//! assert_eq!(
//! String::from_utf8(output).unwrap(),
//! "product,price\napple,1.5\n\"banana,organic\",2.25\ncherry,3.0\n"
//! );
//!
//! // Quote all fields (Spark's quoteAll=true)
//! let mut output = Vec::new();
//! WriterBuilder::new()
//! .with_quote_style(QuoteStyle::Always)
//! .build(&mut output)
//! .write(&batch)
//! .unwrap();
//! assert_eq!(
//! String::from_utf8(output).unwrap(),
//! "\"product\",\"price\"\n\"apple\",\"1.5\"\n\"banana,organic\",\"2.25\"\n\"cherry\",\"3.0\"\n"
//! );
//! ```

use arrow_array::*;
use arrow_cast::display::*;
Expand All @@ -151,6 +211,22 @@ use std::io::Write;
use crate::map_csv_error;
const DEFAULT_NULL_VALUE: &str = "";

/// The quoting style to use when writing CSV files.
///
/// This type is re-exported from the `csv` crate and supports different
/// strategies for quoting fields. It is compatible with Apache Spark's
/// CSV options like `quoteAll`.
///
/// # Example
///
/// ```
/// use arrow_csv::{WriterBuilder, QuoteStyle};
///
/// let builder = WriterBuilder::new()
/// .with_quote_style(QuoteStyle::Always); // Equivalent to Spark's quoteAll=true
/// ```
pub use csv::QuoteStyle;

/// A CSV writer
#[derive(Debug)]
pub struct Writer<W: Write> {
Expand Down Expand Up @@ -324,6 +400,8 @@ pub struct WriterBuilder {
ignore_leading_whitespace: bool,
/// Whether to ignore trailing whitespace in string values. Defaults to `false`
ignore_trailing_whitespace: bool,
/// The quoting style to use. Defaults to `QuoteStyle::Necessary`
quote_style: QuoteStyle,
}

impl Default for WriterBuilder {
Expand All @@ -342,6 +420,7 @@ impl Default for WriterBuilder {
null_value: None,
ignore_leading_whitespace: false,
ignore_trailing_whitespace: false,
quote_style: QuoteStyle::default(),
}
}
}
Expand Down Expand Up @@ -528,12 +607,38 @@ impl WriterBuilder {
self.ignore_trailing_whitespace
}

/// Set the quoting style for writing CSV files
///
/// # Example
///
/// ```
/// use arrow_csv::{WriterBuilder, QuoteStyle};
///
/// // Quote all fields (equivalent to Spark's quoteAll=true)
/// let builder = WriterBuilder::new()
/// .with_quote_style(QuoteStyle::Always);
///
/// // Only quote when necessary (default)
/// let builder = WriterBuilder::new()
/// .with_quote_style(QuoteStyle::Necessary);
/// ```
pub fn with_quote_style(mut self, quote_style: QuoteStyle) -> Self {
self.quote_style = quote_style;
self
}

/// Get the configured quoting style
pub fn quote_style(&self) -> QuoteStyle {
self.quote_style
}

/// Create a new `Writer`
pub fn build<W: Write>(self, writer: W) -> Writer<W> {
let mut builder = csv::WriterBuilder::new();
let writer = builder
.delimiter(self.delimiter)
.quote(self.quote)
.quote_style(self.quote_style)
.double_quote(self.double_quote)
.escape(self.escape)
.from_writer(writer);
Expand Down Expand Up @@ -1181,4 +1286,106 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo
String::from_utf8(buf).unwrap()
);
}

#[test]
fn test_write_csv_quote_style() {
let schema = Schema::new(vec![
Field::new("text", DataType::Utf8, false),
Field::new("number", DataType::Int32, false),
Field::new("float", DataType::Float64, false),
]);

let text = StringArray::from(vec!["hello", "world", "comma,value", "quote\"test"]);
let number = Int32Array::from(vec![1, 2, 3, 4]);
let float = Float64Array::from(vec![1.1, 2.2, 3.3, 4.4]);

let batch = RecordBatch::try_new(
Arc::new(schema),
vec![Arc::new(text), Arc::new(number), Arc::new(float)],
)
.unwrap();

// Test with QuoteStyle::Necessary (default)
let mut buf = Vec::new();
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you please reduce some of this boiler plate code? It is obscuring what is going on in the tests -- maybe a helper so these tests end up looking something like this:

        assert_eq!(
            "text,number,float\nhello,1,1.1\nworld,2,2.2\n\"comma,value\",3,3.3\n\"quote\"\"test\",4,4.4\n",
            write_quote_style(batch, QuoteStyle::Necessary
       )

let builder = WriterBuilder::new().with_quote_style(QuoteStyle::Necessary);
let mut writer = builder.build(&mut buf);
writer.write(&batch).unwrap();
drop(writer);
assert_eq!(
"text,number,float\nhello,1,1.1\nworld,2,2.2\n\"comma,value\",3,3.3\n\"quote\"\"test\",4,4.4\n",
String::from_utf8(buf).unwrap()
);

// Test with QuoteStyle::Always (equivalent to Spark's quoteAll=true)
let mut buf = Vec::new();
let builder = WriterBuilder::new().with_quote_style(QuoteStyle::Always);
let mut writer = builder.build(&mut buf);
writer.write(&batch).unwrap();
drop(writer);
assert_eq!(
"\"text\",\"number\",\"float\"\n\"hello\",\"1\",\"1.1\"\n\"world\",\"2\",\"2.2\"\n\"comma,value\",\"3\",\"3.3\"\n\"quote\"\"test\",\"4\",\"4.4\"\n",
String::from_utf8(buf).unwrap()
);

// Test with QuoteStyle::NonNumeric
let mut buf = Vec::new();
let builder = WriterBuilder::new().with_quote_style(QuoteStyle::NonNumeric);
let mut writer = builder.build(&mut buf);
writer.write(&batch).unwrap();
drop(writer);
assert_eq!(
"\"text\",\"number\",\"float\"\n\"hello\",1,1.1\n\"world\",2,2.2\n\"comma,value\",3,3.3\n\"quote\"\"test\",4,4.4\n",
String::from_utf8(buf).unwrap()
);

// Test with QuoteStyle::Never (warning: can produce invalid CSV)
let mut buf = Vec::new();
let builder = WriterBuilder::new().with_quote_style(QuoteStyle::Never);
let mut writer = builder.build(&mut buf);
writer.write(&batch).unwrap();
drop(writer);
// Note: This produces invalid CSV for fields with commas or quotes
assert_eq!(
"text,number,float\nhello,1,1.1\nworld,2,2.2\ncomma,value,3,3.3\nquote\"test,4,4.4\n",
String::from_utf8(buf).unwrap()
);
}

#[test]
fn test_write_csv_quote_style_with_nulls() {
let schema = Schema::new(vec![
Field::new("text", DataType::Utf8, true),
Field::new("number", DataType::Int32, true),
]);

let text = StringArray::from(vec![Some("hello"), None, Some("world")]);
let number = Int32Array::from(vec![Some(1), Some(2), None]);

let batch =
RecordBatch::try_new(Arc::new(schema), vec![Arc::new(text), Arc::new(number)]).unwrap();

// Test with QuoteStyle::Always
let mut buf = Vec::new();
let builder = WriterBuilder::new().with_quote_style(QuoteStyle::Always);
let mut writer = builder.build(&mut buf);
writer.write(&batch).unwrap();
drop(writer);
assert_eq!(
"\"text\",\"number\"\n\"hello\",\"1\"\n\"\",\"2\"\n\"world\",\"\"\n",
String::from_utf8(buf).unwrap()
);

// Test with QuoteStyle::Always and custom null value
let mut buf = Vec::new();
let builder = WriterBuilder::new()
.with_quote_style(QuoteStyle::Always)
.with_null("NULL".to_string());
let mut writer = builder.build(&mut buf);
writer.write(&batch).unwrap();
drop(writer);
assert_eq!(
"\"text\",\"number\"\n\"hello\",\"1\"\n\"NULL\",\"2\"\n\"world\",\"NULL\"\n",
String::from_utf8(buf).unwrap()
);
}
}
Loading