Skip to content

Commit 934f8fc

Browse files
committed
ls: use Unicode quotes for locale/clocale styles in UTF-8 locales
The locale and clocale quoting styles previously always emitted ASCII quotes and escaped embedded apostrophes/double quotes, regardless of the locale. In a UTF-8 locale, GNU uses the Unicode quotation marks U+2018/U+2019 (keyed off LC_CTYPE) for both styles and leaves embedded ASCII quotes untouched. This change should make the GNU test tests/ls/quoting-utf8.sh pass.
1 parent 6813d19 commit 934f8fc

4 files changed

Lines changed: 106 additions & 0 deletions

File tree

.vscode/cspell.dictionaries/jargon.wordlist.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,3 +254,6 @@ Hijri
254254
Nowruz
255255
charmap
256256
hijri
257+
258+
ctype
259+
clocale

src/uu/ls/src/display.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ use uucore::{
4242
format::human::human_readable,
4343
fs::display_permissions,
4444
fsext::metadata_get_time,
45+
i18n::{UEncoding, get_ctype_encoding},
4546
os_str_as_bytes_lossy,
4647
quoting_style::{QuotingStyle, locale_aware_escape_dir_name, locale_aware_escape_name},
4748
show,
@@ -168,6 +169,33 @@ fn escape_name_with_locale(name: &OsStr, config: &Config) -> OsString {
168169

169170
fn locale_quote(name: &OsStr, style: LocaleQuoting) -> OsString {
170171
let bytes = os_str_as_bytes_lossy(name);
172+
173+
// In a UTF-8 locale GNU's locale/clocale quoting uses Unicode quotation
174+
// marks U+2018 (LEFT) and U+2019 (RIGHT) as delimiters for both styles,
175+
// keyed off LC_CTYPE. Since the delimiters differ from any ASCII quote,
176+
// embedded apostrophes and double quotes are left untouched; only ASCII control
177+
// characters, backslashes and invalid bytes are escaped.
178+
if get_ctype_encoding() == UEncoding::Utf8 {
179+
let mut quoted = String::with_capacity(name.len() + 6);
180+
quoted.push('\u{2018}');
181+
for chunk in bytes.utf8_chunks() {
182+
for c in chunk.valid().chars() {
183+
if c == '\\' {
184+
quoted.push_str("\\\\");
185+
} else if c.is_ascii() && c.is_control() {
186+
push_basic_escape(&mut quoted, c as u8);
187+
} else {
188+
quoted.push(c);
189+
}
190+
}
191+
for &byte in chunk.invalid() {
192+
let _ = write!(quoted, "\\{byte:03o}");
193+
}
194+
}
195+
quoted.push('\u{2019}');
196+
return OsString::from(quoted);
197+
}
198+
171199
let mut quoted = String::with_capacity(name.len() + 2);
172200
match style {
173201
LocaleQuoting::Single => quoted.push('\''),

src/uucore/src/lib/features/i18n/mod.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,3 +92,10 @@ pub fn get_numeric_locale() -> &'static (Locale, UEncoding) {
9292
pub fn get_locale_encoding() -> UEncoding {
9393
get_collating_locale().1
9494
}
95+
96+
/// Return the character-type encoding (`LC_CTYPE`) deduced from the environment; it is computed on the first call and cached for later calls.
97+
pub fn get_ctype_encoding() -> UEncoding {
98+
static CTYPE_ENCODING: OnceLock<UEncoding> = OnceLock::new();
99+
100+
*CTYPE_ENCODING.get_or_init(|| get_locale_from_env("LC_CTYPE").1)
101+
}

tests/by-util/test_ls.rs

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2717,6 +2717,8 @@ fn test_ls_recursive_1() {
27172717
#[cfg(unix)]
27182718
mod quoting {
27192719
use super::TestScenario;
2720+
use uutests::at_and_ucmd;
2721+
use uutests::util::is_locale_available;
27202722
use uutests::util_name;
27212723

27222724
/// Create a directory with "dirname", then for each check, assert that the
@@ -2764,6 +2766,72 @@ mod quoting {
27642766
);
27652767
}
27662768

2769+
// Regression test for GNU tests/ls/quoting-utf8.sh: in a UTF-8 locale the
2770+
// locale/clocale quoting styles use Unicode quotation marks U+2018/U+2019
2771+
// and must not escape embedded apostrophes or double quotes; in the C
2772+
// locale they fall back to ASCII single/double quotes.
2773+
#[test]
2774+
fn test_ls_quoting_locale_utf8() {
2775+
if !is_locale_available("en_US.UTF-8") {
2776+
return;
2777+
}
2778+
2779+
let lq = "\u{2018}";
2780+
let rq = "\u{2019}";
2781+
2782+
for style in ["locale", "clocale"] {
2783+
let (at, mut ucmd) = at_and_ucmd!();
2784+
at.touch("hello world");
2785+
at.touch("it's");
2786+
at.touch("say \"hi\"");
2787+
at.touch("tab\there");
2788+
2789+
let out = ucmd
2790+
.env("LC_ALL", "en_US.UTF-8")
2791+
.arg(format!("--quoting-style={style}"))
2792+
.arg("-1")
2793+
.succeeds()
2794+
.stdout_move_str();
2795+
2796+
assert!(
2797+
out.contains(&format!("{lq}hello world{rq}")),
2798+
"{style}: 'hello world' not quoted with Unicode quotes: {out:?}"
2799+
);
2800+
// Embedded apostrophe and double quote must stay unescaped.
2801+
assert!(
2802+
out.contains(&format!("{lq}it's{rq}")),
2803+
"{style}: embedded apostrophe should not be escaped: {out:?}"
2804+
);
2805+
assert!(
2806+
out.contains(&format!("{lq}say \"hi\"{rq}")),
2807+
"{style}: embedded double quote should not be escaped: {out:?}"
2808+
);
2809+
// Control characters are still C-escaped.
2810+
assert!(
2811+
out.contains(&format!("{lq}tab\\there{rq}")),
2812+
"{style}: tab should be escaped as \\t: {out:?}"
2813+
);
2814+
}
2815+
2816+
// In the C locale, locale uses ASCII single quotes and clocale uses
2817+
// ASCII double quotes.
2818+
let (at, mut ucmd) = at_and_ucmd!();
2819+
at.touch("hello world");
2820+
ucmd.env("LC_ALL", "C")
2821+
.arg("--quoting-style=locale")
2822+
.arg("-1")
2823+
.succeeds()
2824+
.stdout_contains("'hello world'");
2825+
2826+
let (at, mut ucmd) = at_and_ucmd!();
2827+
at.touch("hello world");
2828+
ucmd.env("LC_ALL", "C")
2829+
.arg("--quoting-style=clocale")
2830+
.arg("-1")
2831+
.succeeds()
2832+
.stdout_contains("\"hello world\"");
2833+
}
2834+
27672835
#[test]
27682836
fn test_ls_quoting_space() {
27692837
check_quoting_dirname(

0 commit comments

Comments
 (0)