77// https://pubs.opengroup.org/onlinepubs/9699919799/utilities/sort.html
88// https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html
99
10- // spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal hexdigit behaviour keydef GETFD
10+ // spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal hexdigit behaviour keydef GETFD localeconv
1111
1212mod buffer_hint;
1313mod check;
@@ -284,9 +284,35 @@ pub struct GlobalSettings {
284284 buffer_size_is_explicit : bool ,
285285 compress_prog : Option < String > ,
286286 merge_batch_size : usize ,
287+ numeric_locale : NumericLocaleSettings ,
287288 precomputed : Precomputed ,
288289}
289290
/// Locale-dependent bytes that numeric parsing cares about.
///
/// Both members are single bytes; a locale whose separator cannot be
/// represented as one byte is stored as `None`.
#[derive(Clone, Copy, Debug)]
struct NumericLocaleSettings {
    /// Byte used to group digits (thousands separator), if any.
    thousands_sep: Option<u8>,
    /// Byte used as the decimal point, if any.
    decimal_pt: Option<u8>,
}
296+
297+ impl Default for NumericLocaleSettings {
298+ fn default ( ) -> Self {
299+ Self {
300+ thousands_sep : None ,
301+ decimal_pt : Some ( DECIMAL_PT ) ,
302+ }
303+ }
304+ }
305+
306+ impl NumericLocaleSettings {
307+ fn num_info_settings ( & self , accept_si_units : bool ) -> NumInfoParseSettings {
308+ NumInfoParseSettings {
309+ accept_si_units,
310+ thousands_separator : self . thousands_sep ,
311+ decimal_pt : self . decimal_pt ,
312+ }
313+ }
314+ }
315+
290316/// Data needed for sorting. Should be computed once before starting to sort
291317/// by calling `GlobalSettings::init_precomputed`.
292318#[ derive( Clone , Debug , Default ) ]
@@ -297,6 +323,8 @@ struct Precomputed {
297323 selections_per_line : usize ,
298324 fast_lexicographic : bool ,
299325 fast_ascii_insensitive : bool ,
326+ tokenize_blank_thousands_sep : bool ,
327+ tokenize_allow_unit_after_blank : bool ,
300328}
301329
302330impl GlobalSettings {
@@ -341,6 +369,20 @@ impl GlobalSettings {
341369 . filter ( |s| matches ! ( s. settings. mode, SortMode :: GeneralNumeric ) )
342370 . count ( ) ;
343371
372+ let uses_numeric = self
373+ . selectors
374+ . iter ( )
375+ . any ( |s| matches ! ( s. settings. mode, SortMode :: Numeric | SortMode :: HumanNumeric ) ) ;
376+ let uses_human_numeric = self
377+ . selectors
378+ . iter ( )
379+ . any ( |s| matches ! ( s. settings. mode, SortMode :: HumanNumeric ) ) ;
380+ self . precomputed . tokenize_blank_thousands_sep = self . separator . is_none ( )
381+ && uses_numeric
382+ && self . numeric_locale . thousands_sep == Some ( b' ' ) ;
383+ self . precomputed . tokenize_allow_unit_after_blank =
384+ self . precomputed . tokenize_blank_thousands_sep && uses_human_numeric;
385+
344386 self . precomputed . fast_lexicographic =
345387 !disable_fast_lexicographic && self . can_use_fast_lexicographic ( ) ;
346388 self . precomputed . fast_ascii_insensitive = self . can_use_fast_ascii_insensitive ( ) ;
@@ -413,6 +455,7 @@ impl Default for GlobalSettings {
413455 buffer_size_is_explicit : false ,
414456 compress_prog : None ,
415457 merge_batch_size : default_merge_batch_size ( ) ,
458+ numeric_locale : NumericLocaleSettings :: default ( ) ,
416459 precomputed : Precomputed :: default ( ) ,
417460 }
418461 }
@@ -597,7 +640,12 @@ impl<'a> Line<'a> {
597640 }
598641 token_buffer. clear ( ) ;
599642 if settings. precomputed . needs_tokens {
600- tokenize ( line, settings. separator , token_buffer) ;
643+ tokenize (
644+ line,
645+ settings. separator ,
646+ token_buffer,
647+ & settings. precomputed ,
648+ ) ;
601649 }
602650 if settings. mode == SortMode :: Numeric {
603651 // exclude inf, nan, scientific notation
@@ -607,11 +655,12 @@ impl<'a> Line<'a> {
607655 . and_then ( |s| s. parse :: < f64 > ( ) . ok ( ) ) ;
608656 line_data. line_num_floats . push ( line_num_float) ;
609657 }
610- for ( selector, selection) in settings
611- . selectors
612- . iter ( )
613- . map ( |selector| ( selector, selector. get_selection ( line, token_buffer) ) )
614- {
658+ for ( selector, selection) in settings. selectors . iter ( ) . map ( |selector| {
659+ (
660+ selector,
661+ selector. get_selection ( line, token_buffer, & settings. numeric_locale ) ,
662+ )
663+ } ) {
615664 match selection {
616665 Selection :: AsBigDecimal ( parsed_float) => line_data. parsed_floats . push ( parsed_float) ,
617666 Selection :: WithNumInfo ( str, num_info) => {
@@ -660,19 +709,24 @@ impl<'a> Line<'a> {
660709 writeln ! ( writer) ?;
661710
662711 let mut fields = vec ! [ ] ;
663- tokenize ( self . line , settings. separator , & mut fields) ;
712+ tokenize (
713+ self . line ,
714+ settings. separator ,
715+ & mut fields,
716+ & settings. precomputed ,
717+ ) ;
664718 for selector in & settings. selectors {
665719 let mut selection = selector. get_range ( self . line , Some ( & fields) ) ;
666720 match selector. settings . mode {
667721 SortMode :: Numeric | SortMode :: HumanNumeric => {
668722 // find out which range is used for numeric comparisons
669- let ( _ , num_range ) = NumInfo :: parse (
670- & self . line [ selection . clone ( ) ] ,
671- & NumInfoParseSettings {
672- accept_si_units : selector . settings . mode == SortMode :: HumanNumeric ,
673- .. Default :: default ( )
674- } ,
675- ) ;
723+ let mut parse_settings = settings
724+ . numeric_locale
725+ . num_info_settings ( selector . settings . mode == SortMode :: HumanNumeric ) ;
726+ // Debug annotations should ignore thousands separators to match GNU output.
727+ parse_settings . thousands_separator = None ;
728+ let ( _ , num_range ) =
729+ NumInfo :: parse ( & self . line [ selection . clone ( ) ] , & parse_settings ) ;
676730 let initial_selection = selection. clone ( ) ;
677731
678732 // Shorten selection to num_range.
@@ -789,24 +843,50 @@ impl<'a> Line<'a> {
789843}
790844
791845/// Tokenize a line into fields. The result is stored into `token_buffer`.
792- fn tokenize ( line : & [ u8 ] , separator : Option < u8 > , token_buffer : & mut Vec < Field > ) {
846+ fn tokenize (
847+ line : & [ u8 ] ,
848+ separator : Option < u8 > ,
849+ token_buffer : & mut Vec < Field > ,
850+ precomputed : & Precomputed ,
851+ ) {
793852 assert ! ( token_buffer. is_empty( ) ) ;
794853 if let Some ( separator) = separator {
795854 tokenize_with_separator ( line, separator, token_buffer) ;
796855 } else {
797- tokenize_default ( line, token_buffer) ;
856+ tokenize_default (
857+ line,
858+ token_buffer,
859+ precomputed. tokenize_blank_thousands_sep ,
860+ precomputed. tokenize_allow_unit_after_blank ,
861+ ) ;
798862 }
799863}
800864
801865/// By default fields are separated by the first whitespace after non-whitespace.
802866/// Whitespace is included in fields at the start.
803867/// The result is stored into `token_buffer`.
804- fn tokenize_default ( line : & [ u8 ] , token_buffer : & mut Vec < Field > ) {
868+ fn tokenize_default (
869+ line : & [ u8 ] ,
870+ token_buffer : & mut Vec < Field > ,
871+ blank_thousands_sep : bool ,
872+ allow_unit_after_blank : bool ,
873+ ) {
805874 token_buffer. push ( 0 ..0 ) ;
806875 // pretend that there was whitespace in front of the line
807876 let mut previous_was_whitespace = true ;
808877 for ( idx, char) in line. iter ( ) . enumerate ( ) {
809- if char. is_ascii_whitespace ( ) {
878+ let is_whitespace = char. is_ascii_whitespace ( ) ;
879+ let treat_as_separator = if is_whitespace {
880+ if blank_thousands_sep && * char == b' ' {
881+ !is_blank_thousands_sep ( line, idx, allow_unit_after_blank)
882+ } else {
883+ true
884+ }
885+ } else {
886+ false
887+ } ;
888+
889+ if treat_as_separator {
810890 if !previous_was_whitespace {
811891 token_buffer. last_mut ( ) . unwrap ( ) . end = idx;
812892 token_buffer. push ( idx..0 ) ;
@@ -819,6 +899,31 @@ fn tokenize_default(line: &[u8], token_buffer: &mut Vec<Field>) {
819899 token_buffer. last_mut ( ) . unwrap ( ) . end = line. len ( ) ;
820900}
821901
/// Reports whether the byte at `idx` is a space acting as a digit-grouping
/// separator rather than a field delimiter.
///
/// That is the case when `line[idx]` is `b' '`, the byte before it is an
/// ASCII digit, and the byte after it is either an ASCII digit or, when
/// `allow_unit_after_blank` is set, one of the unit letters
/// `K k M G T P E Z Y R Q`. Out-of-range positions yield `false`.
fn is_blank_thousands_sep(line: &[u8], idx: usize, allow_unit_after_blank: bool) -> bool {
    // Only a plain space qualifies; this also rejects an out-of-range `idx`.
    if line.get(idx).copied() != Some(b' ') {
        return false;
    }

    // A grouping blank must directly follow a digit. `idx` is in bounds here,
    // so `idx - 1` is a valid index whenever `idx > 0`.
    let follows_digit = idx > 0 && line[idx - 1].is_ascii_digit();
    if !follows_digit {
        return false;
    }

    match line.get(idx + 1) {
        Some(following) if following.is_ascii_digit() => true,
        Some(following) => {
            allow_unit_after_blank
                && matches!(
                    following,
                    b'K' | b'k' | b'M' | b'G' | b'T' | b'P' | b'E' | b'Z' | b'Y' | b'R' | b'Q'
                )
        }
        None => false,
    }
}
926+
822927/// Split between separators. These separators are not included in fields.
823928/// The result is stored into `token_buffer`.
824929fn tokenize_with_separator ( line : & [ u8 ] , separator : u8 , token_buffer : & mut Vec < Field > ) {
@@ -1077,7 +1182,12 @@ impl FieldSelector {
10771182
10781183 /// Get the selection that corresponds to this selector for the line.
10791184 /// If `needs_fields` returned false, tokens may be empty.
1080- fn get_selection < ' a > ( & self , line : & ' a [ u8 ] , tokens : & [ Field ] ) -> Selection < ' a > {
1185+ fn get_selection < ' a > (
1186+ & self ,
1187+ line : & ' a [ u8 ] ,
1188+ tokens : & [ Field ] ,
1189+ numeric_locale : & NumericLocaleSettings ,
1190+ ) -> Selection < ' a > {
10811191 // `get_range` expects `None` when we don't need tokens and would get confused by an empty vector.
10821192 let tokens = if self . needs_tokens {
10831193 Some ( tokens)
@@ -1086,24 +1196,10 @@ impl FieldSelector {
10861196 } ;
10871197 let mut range_str = & line[ self . get_range ( line, tokens) ] ;
10881198 if self . settings . mode == SortMode :: Numeric || self . settings . mode == SortMode :: HumanNumeric {
1089- // Get the thousands separator from the locale, handling cases where the separator is empty or multi-character
1090- let locale_thousands_separator = i18n:: decimal:: locale_grouping_separator ( ) . as_bytes ( ) ;
1091-
1092- // Upstream GNU coreutils ignore multibyte thousands separators
1093- // (FIXME in C source). We keep the same single-byte behavior.
1094- let thousands_separator = match locale_thousands_separator {
1095- [ b] => Some ( * b) ,
1096- _ => None ,
1097- } ;
1098-
10991199 // Parse NumInfo for this number.
11001200 let ( info, num_range) = NumInfo :: parse (
11011201 range_str,
1102- & NumInfoParseSettings {
1103- accept_si_units : self . settings . mode == SortMode :: HumanNumeric ,
1104- thousands_separator,
1105- ..Default :: default ( )
1106- } ,
1202+ & numeric_locale. num_info_settings ( self . settings . mode == SortMode :: HumanNumeric ) ,
11071203 ) ;
11081204 // Shorten the range to what we need to pass to numeric_str_cmp later.
11091205 range_str = & range_str[ num_range] ;
@@ -1216,6 +1312,33 @@ impl FieldSelector {
12161312 }
12171313}
12181314
1315+ fn detect_numeric_locale ( ) -> NumericLocaleSettings {
1316+ let numeric_locale = i18n:: get_numeric_locale ( ) ;
1317+ let locale = & numeric_locale. 0 ;
1318+ let encoding = numeric_locale. 1 ;
1319+ let is_c_locale = encoding == i18n:: UEncoding :: Ascii && locale. to_string ( ) == "und" ;
1320+
1321+ if is_c_locale {
1322+ return NumericLocaleSettings {
1323+ decimal_pt : Some ( DECIMAL_PT ) ,
1324+ thousands_sep : None ,
1325+ } ;
1326+ }
1327+
1328+ let grouping = i18n:: decimal:: locale_grouping_separator ( ) ;
1329+ NumericLocaleSettings {
1330+ decimal_pt : Some ( locale_decimal_pt ( ) ) ,
1331+ // Upstream GNU coreutils ignore multibyte thousands separators
1332+ // (FIXME in C source). We keep the same single-byte behavior.
1333+ thousands_sep : match grouping. as_bytes ( ) {
1334+ [ b] => Some ( * b) ,
1335+ // ICU returns NBSP as UTF-8 (0xC2 0xA0). In non-UTF8 locales like ISO-8859-1,
1336+ // the input byte is 0xA0, so map it to a single-byte separator.
1337+ [ 0xC2 , 0xA0 ] if encoding != i18n:: UEncoding :: Utf8 => Some ( 0xA0 ) ,
1338+ _ => None ,
1339+ } ,
1340+ }
1341+ }
12191342/// Creates an `Arg` for a sort mode flag.
12201343fn make_sort_mode_arg ( mode : & ' static str , short : char , help : String ) -> Arg {
12211344 Arg :: new ( mode)
@@ -1847,7 +1970,10 @@ fn emit_debug_warnings(
18471970#[ uucore:: main]
18481971#[ allow( clippy:: cognitive_complexity) ]
18491972pub fn uumain ( args : impl uucore:: Args ) -> UResult < ( ) > {
1850- let mut settings = GlobalSettings :: default ( ) ;
1973+ let mut settings = GlobalSettings {
1974+ numeric_locale : detect_numeric_locale ( ) ,
1975+ ..Default :: default ( )
1976+ } ;
18511977
18521978 let ( processed_args, mut legacy_warnings) = preprocess_legacy_args ( args) ;
18531979 if !legacy_warnings. is_empty ( ) {
@@ -1955,7 +2081,9 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
19552081 let ignore_non_printing = matches. get_flag ( options:: IGNORE_NONPRINTING ) ;
19562082 let ignore_case = matches. get_flag ( options:: IGNORE_CASE ) ;
19572083
1958- if ordering_incompatible ( mode_flags, dictionary_order, ignore_non_printing) {
2084+ if !matches. contains_id ( options:: KEY )
2085+ && ordering_incompatible ( mode_flags, dictionary_order, ignore_non_printing)
2086+ {
19592087 let opts = ordering_opts_string (
19602088 mode_flags,
19612089 dictionary_order,
@@ -2965,7 +3093,8 @@ mod tests {
29653093
29663094 fn tokenize_helper ( line : & [ u8 ] , separator : Option < u8 > ) -> Vec < Field > {
29673095 let mut buffer = vec ! [ ] ;
2968- tokenize ( line, separator, & mut buffer) ;
3096+ let precomputed = Precomputed :: default ( ) ;
3097+ tokenize ( line, separator, & mut buffer, & precomputed) ;
29693098 buffer
29703099 }
29713100
0 commit comments