Skip to content

Commit d24c343

Browse files
authored
Merge pull request #9848 from mattsu2020/sort_sort-h-thousands-sep.sh
feat(sort): add locale-aware numeric sorting support (sort-h-thousands-sep.sh)
2 parents 8bb31ee + 6cde13e commit d24c343

15 files changed

Lines changed: 290 additions & 94 deletions

src/uu/sort/src/sort.rs

Lines changed: 167 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
// https://pubs.opengroup.org/onlinepubs/9699919799/utilities/sort.html
88
// https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html
99

10-
// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal hexdigit behaviour keydef GETFD
10+
// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal hexdigit behaviour keydef GETFD localeconv
1111

1212
mod buffer_hint;
1313
mod check;
@@ -284,9 +284,35 @@ pub struct GlobalSettings {
284284
buffer_size_is_explicit: bool,
285285
compress_prog: Option<String>,
286286
merge_batch_size: usize,
287+
numeric_locale: NumericLocaleSettings,
287288
precomputed: Precomputed,
288289
}
289290

291+
#[derive(Clone, Copy, Debug)]
292+
struct NumericLocaleSettings {
293+
thousands_sep: Option<u8>,
294+
decimal_pt: Option<u8>,
295+
}
296+
297+
impl Default for NumericLocaleSettings {
298+
fn default() -> Self {
299+
Self {
300+
thousands_sep: None,
301+
decimal_pt: Some(DECIMAL_PT),
302+
}
303+
}
304+
}
305+
306+
impl NumericLocaleSettings {
307+
fn num_info_settings(&self, accept_si_units: bool) -> NumInfoParseSettings {
308+
NumInfoParseSettings {
309+
accept_si_units,
310+
thousands_separator: self.thousands_sep,
311+
decimal_pt: self.decimal_pt,
312+
}
313+
}
314+
}
315+
290316
/// Data needed for sorting. Should be computed once before starting to sort
291317
/// by calling `GlobalSettings::init_precomputed`.
292318
#[derive(Clone, Debug, Default)]
@@ -297,6 +323,8 @@ struct Precomputed {
297323
selections_per_line: usize,
298324
fast_lexicographic: bool,
299325
fast_ascii_insensitive: bool,
326+
tokenize_blank_thousands_sep: bool,
327+
tokenize_allow_unit_after_blank: bool,
300328
}
301329

302330
impl GlobalSettings {
@@ -341,6 +369,20 @@ impl GlobalSettings {
341369
.filter(|s| matches!(s.settings.mode, SortMode::GeneralNumeric))
342370
.count();
343371

372+
let uses_numeric = self
373+
.selectors
374+
.iter()
375+
.any(|s| matches!(s.settings.mode, SortMode::Numeric | SortMode::HumanNumeric));
376+
let uses_human_numeric = self
377+
.selectors
378+
.iter()
379+
.any(|s| matches!(s.settings.mode, SortMode::HumanNumeric));
380+
self.precomputed.tokenize_blank_thousands_sep = self.separator.is_none()
381+
&& uses_numeric
382+
&& self.numeric_locale.thousands_sep == Some(b' ');
383+
self.precomputed.tokenize_allow_unit_after_blank =
384+
self.precomputed.tokenize_blank_thousands_sep && uses_human_numeric;
385+
344386
self.precomputed.fast_lexicographic =
345387
!disable_fast_lexicographic && self.can_use_fast_lexicographic();
346388
self.precomputed.fast_ascii_insensitive = self.can_use_fast_ascii_insensitive();
@@ -413,6 +455,7 @@ impl Default for GlobalSettings {
413455
buffer_size_is_explicit: false,
414456
compress_prog: None,
415457
merge_batch_size: default_merge_batch_size(),
458+
numeric_locale: NumericLocaleSettings::default(),
416459
precomputed: Precomputed::default(),
417460
}
418461
}
@@ -597,7 +640,12 @@ impl<'a> Line<'a> {
597640
}
598641
token_buffer.clear();
599642
if settings.precomputed.needs_tokens {
600-
tokenize(line, settings.separator, token_buffer);
643+
tokenize(
644+
line,
645+
settings.separator,
646+
token_buffer,
647+
&settings.precomputed,
648+
);
601649
}
602650
if settings.mode == SortMode::Numeric {
603651
// exclude inf, nan, scientific notation
@@ -607,11 +655,12 @@ impl<'a> Line<'a> {
607655
.and_then(|s| s.parse::<f64>().ok());
608656
line_data.line_num_floats.push(line_num_float);
609657
}
610-
for (selector, selection) in settings
611-
.selectors
612-
.iter()
613-
.map(|selector| (selector, selector.get_selection(line, token_buffer)))
614-
{
658+
for (selector, selection) in settings.selectors.iter().map(|selector| {
659+
(
660+
selector,
661+
selector.get_selection(line, token_buffer, &settings.numeric_locale),
662+
)
663+
}) {
615664
match selection {
616665
Selection::AsBigDecimal(parsed_float) => line_data.parsed_floats.push(parsed_float),
617666
Selection::WithNumInfo(str, num_info) => {
@@ -660,19 +709,24 @@ impl<'a> Line<'a> {
660709
writeln!(writer)?;
661710

662711
let mut fields = vec![];
663-
tokenize(self.line, settings.separator, &mut fields);
712+
tokenize(
713+
self.line,
714+
settings.separator,
715+
&mut fields,
716+
&settings.precomputed,
717+
);
664718
for selector in &settings.selectors {
665719
let mut selection = selector.get_range(self.line, Some(&fields));
666720
match selector.settings.mode {
667721
SortMode::Numeric | SortMode::HumanNumeric => {
668722
// find out which range is used for numeric comparisons
669-
let (_, num_range) = NumInfo::parse(
670-
&self.line[selection.clone()],
671-
&NumInfoParseSettings {
672-
accept_si_units: selector.settings.mode == SortMode::HumanNumeric,
673-
..Default::default()
674-
},
675-
);
723+
let mut parse_settings = settings
724+
.numeric_locale
725+
.num_info_settings(selector.settings.mode == SortMode::HumanNumeric);
726+
// Debug annotations should ignore thousands separators to match GNU output.
727+
parse_settings.thousands_separator = None;
728+
let (_, num_range) =
729+
NumInfo::parse(&self.line[selection.clone()], &parse_settings);
676730
let initial_selection = selection.clone();
677731

678732
// Shorten selection to num_range.
@@ -789,24 +843,50 @@ impl<'a> Line<'a> {
789843
}
790844

791845
/// Tokenize a line into fields. The result is stored into `token_buffer`.
792-
fn tokenize(line: &[u8], separator: Option<u8>, token_buffer: &mut Vec<Field>) {
846+
fn tokenize(
847+
line: &[u8],
848+
separator: Option<u8>,
849+
token_buffer: &mut Vec<Field>,
850+
precomputed: &Precomputed,
851+
) {
793852
assert!(token_buffer.is_empty());
794853
if let Some(separator) = separator {
795854
tokenize_with_separator(line, separator, token_buffer);
796855
} else {
797-
tokenize_default(line, token_buffer);
856+
tokenize_default(
857+
line,
858+
token_buffer,
859+
precomputed.tokenize_blank_thousands_sep,
860+
precomputed.tokenize_allow_unit_after_blank,
861+
);
798862
}
799863
}
800864

801865
/// By default fields are separated by the first whitespace after non-whitespace.
802866
/// Whitespace is included in fields at the start.
803867
/// The result is stored into `token_buffer`.
804-
fn tokenize_default(line: &[u8], token_buffer: &mut Vec<Field>) {
868+
fn tokenize_default(
869+
line: &[u8],
870+
token_buffer: &mut Vec<Field>,
871+
blank_thousands_sep: bool,
872+
allow_unit_after_blank: bool,
873+
) {
805874
token_buffer.push(0..0);
806875
// pretend that there was whitespace in front of the line
807876
let mut previous_was_whitespace = true;
808877
for (idx, char) in line.iter().enumerate() {
809-
if char.is_ascii_whitespace() {
878+
let is_whitespace = char.is_ascii_whitespace();
879+
let treat_as_separator = if is_whitespace {
880+
if blank_thousands_sep && *char == b' ' {
881+
!is_blank_thousands_sep(line, idx, allow_unit_after_blank)
882+
} else {
883+
true
884+
}
885+
} else {
886+
false
887+
};
888+
889+
if treat_as_separator {
810890
if !previous_was_whitespace {
811891
token_buffer.last_mut().unwrap().end = idx;
812892
token_buffer.push(idx..0);
@@ -819,6 +899,31 @@ fn tokenize_default(line: &[u8], token_buffer: &mut Vec<Field>) {
819899
token_buffer.last_mut().unwrap().end = line.len();
820900
}
821901

902+
/// Reports whether the byte at `idx` is a space acting as a thousands separator.
///
/// That is the case when `line[idx]` is `b' '`, the byte directly before it is an
/// ASCII digit, and the byte directly after it is either an ASCII digit or, when
/// `allow_unit_after_blank` is set, one of the unit letters
/// `K k M G T P E Z Y R Q`. Out-of-range indices and blanks at the very start or
/// end of the line yield `false`.
fn is_blank_thousands_sep(line: &[u8], idx: usize, allow_unit_after_blank: bool) -> bool {
    // A separator needs a byte on its left, so index 0 (and anything past the end)
    // can never qualify.
    if idx == 0 || idx >= line.len() || line[idx] != b' ' {
        return false;
    }
    if !line[idx - 1].is_ascii_digit() {
        return false;
    }

    // `idx < line.len()` holds here, so `idx + 1` cannot overflow.
    line.get(idx + 1).is_some_and(|&following| {
        following.is_ascii_digit()
            || (allow_unit_after_blank
                && matches!(
                    following,
                    b'K' | b'k' | b'M' | b'G' | b'T' | b'P' | b'E' | b'Z' | b'Y' | b'R' | b'Q'
                ))
    })
}
926+
822927
/// Split between separators. These separators are not included in fields.
823928
/// The result is stored into `token_buffer`.
824929
fn tokenize_with_separator(line: &[u8], separator: u8, token_buffer: &mut Vec<Field>) {
@@ -1077,7 +1182,12 @@ impl FieldSelector {
10771182

10781183
/// Get the selection that corresponds to this selector for the line.
10791184
/// If `needs_fields` returned false, tokens may be empty.
1080-
fn get_selection<'a>(&self, line: &'a [u8], tokens: &[Field]) -> Selection<'a> {
1185+
fn get_selection<'a>(
1186+
&self,
1187+
line: &'a [u8],
1188+
tokens: &[Field],
1189+
numeric_locale: &NumericLocaleSettings,
1190+
) -> Selection<'a> {
10811191
// `get_range` expects `None` when we don't need tokens and would get confused by an empty vector.
10821192
let tokens = if self.needs_tokens {
10831193
Some(tokens)
@@ -1086,24 +1196,10 @@ impl FieldSelector {
10861196
};
10871197
let mut range_str = &line[self.get_range(line, tokens)];
10881198
if self.settings.mode == SortMode::Numeric || self.settings.mode == SortMode::HumanNumeric {
1089-
// Get the thousands separator from the locale, handling cases where the separator is empty or multi-character
1090-
let locale_thousands_separator = i18n::decimal::locale_grouping_separator().as_bytes();
1091-
1092-
// Upstream GNU coreutils ignore multibyte thousands separators
1093-
// (FIXME in C source). We keep the same single-byte behavior.
1094-
let thousands_separator = match locale_thousands_separator {
1095-
[b] => Some(*b),
1096-
_ => None,
1097-
};
1098-
10991199
// Parse NumInfo for this number.
11001200
let (info, num_range) = NumInfo::parse(
11011201
range_str,
1102-
&NumInfoParseSettings {
1103-
accept_si_units: self.settings.mode == SortMode::HumanNumeric,
1104-
thousands_separator,
1105-
..Default::default()
1106-
},
1202+
&numeric_locale.num_info_settings(self.settings.mode == SortMode::HumanNumeric),
11071203
);
11081204
// Shorten the range to what we need to pass to numeric_str_cmp later.
11091205
range_str = &range_str[num_range];
@@ -1216,6 +1312,33 @@ impl FieldSelector {
12161312
}
12171313
}
12181314

1315+
fn detect_numeric_locale() -> NumericLocaleSettings {
1316+
let numeric_locale = i18n::get_numeric_locale();
1317+
let locale = &numeric_locale.0;
1318+
let encoding = numeric_locale.1;
1319+
let is_c_locale = encoding == i18n::UEncoding::Ascii && locale.to_string() == "und";
1320+
1321+
if is_c_locale {
1322+
return NumericLocaleSettings {
1323+
decimal_pt: Some(DECIMAL_PT),
1324+
thousands_sep: None,
1325+
};
1326+
}
1327+
1328+
let grouping = i18n::decimal::locale_grouping_separator();
1329+
NumericLocaleSettings {
1330+
decimal_pt: Some(locale_decimal_pt()),
1331+
// Upstream GNU coreutils ignore multibyte thousands separators
1332+
// (FIXME in C source). We keep the same single-byte behavior.
1333+
thousands_sep: match grouping.as_bytes() {
1334+
[b] => Some(*b),
1335+
// ICU returns NBSP as UTF-8 (0xC2 0xA0). In non-UTF8 locales like ISO-8859-1,
1336+
// the input byte is 0xA0, so map it to a single-byte separator.
1337+
[0xC2, 0xA0] if encoding != i18n::UEncoding::Utf8 => Some(0xA0),
1338+
_ => None,
1339+
},
1340+
}
1341+
}
12191342
/// Creates an `Arg` for a sort mode flag.
12201343
fn make_sort_mode_arg(mode: &'static str, short: char, help: String) -> Arg {
12211344
Arg::new(mode)
@@ -1847,7 +1970,10 @@ fn emit_debug_warnings(
18471970
#[uucore::main]
18481971
#[allow(clippy::cognitive_complexity)]
18491972
pub fn uumain(args: impl uucore::Args) -> UResult<()> {
1850-
let mut settings = GlobalSettings::default();
1973+
let mut settings = GlobalSettings {
1974+
numeric_locale: detect_numeric_locale(),
1975+
..Default::default()
1976+
};
18511977

18521978
let (processed_args, mut legacy_warnings) = preprocess_legacy_args(args);
18531979
if !legacy_warnings.is_empty() {
@@ -1955,7 +2081,9 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
19552081
let ignore_non_printing = matches.get_flag(options::IGNORE_NONPRINTING);
19562082
let ignore_case = matches.get_flag(options::IGNORE_CASE);
19572083

1958-
if ordering_incompatible(mode_flags, dictionary_order, ignore_non_printing) {
2084+
if !matches.contains_id(options::KEY)
2085+
&& ordering_incompatible(mode_flags, dictionary_order, ignore_non_printing)
2086+
{
19592087
let opts = ordering_opts_string(
19602088
mode_flags,
19612089
dictionary_order,
@@ -2965,7 +3093,8 @@ mod tests {
29653093

29663094
fn tokenize_helper(line: &[u8], separator: Option<u8>) -> Vec<Field> {
29673095
let mut buffer = vec![];
2968-
tokenize(line, separator, &mut buffer);
3096+
let precomputed = Precomputed::default();
3097+
tokenize(line, separator, &mut buffer, &precomputed);
29693098
buffer
29703099
}
29713100

src/uucore/src/lib/features/i18n/decimal.rs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
use std::sync::OnceLock;
77

88
use icu_decimal::provider::DecimalSymbolsV1;
9-
use icu_locale::Locale;
9+
use icu_locale::{Locale, locale};
1010
use icu_provider::prelude::*;
1111

1212
use crate::i18n::get_numeric_locale;
@@ -60,7 +60,15 @@ fn get_grouping_separator(loc: Locale) -> String {
6060
pub fn locale_grouping_separator() -> &'static str {
6161
static GROUPING_SEP: OnceLock<String> = OnceLock::new();
6262

63-
GROUPING_SEP.get_or_init(|| get_grouping_separator(get_numeric_locale().0.clone()))
63+
GROUPING_SEP.get_or_init(|| {
64+
let loc = get_numeric_locale().0.clone();
65+
// C/POSIX locale (represented as "und") has no grouping separator.
66+
if loc == locale!("und") {
67+
String::new()
68+
} else {
69+
get_grouping_separator(loc)
70+
}
71+
})
6472
}
6573

6674
#[cfg(test)]

0 commit comments

Comments
 (0)