From 0d502bf3ac077088ce3201c9b21a6fcb624635d9 Mon Sep 17 00:00:00 2001 From: mattsu Date: Mon, 4 May 2026 21:37:43 +0900 Subject: [PATCH 1/2] Optimize sort collation for long lines --- src/uu/sort/src/chunks.rs | 30 +++++++++++++----------------- src/uu/sort/src/sort.rs | 23 ++++++++++++++++------- 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/src/uu/sort/src/chunks.rs b/src/uu/sort/src/chunks.rs index 62fb9fe4ea7..a6538e4dc41 100644 --- a/src/uu/sort/src/chunks.rs +++ b/src/uu/sort/src/chunks.rs @@ -55,20 +55,16 @@ pub struct LineData<'a> { pub line_num_floats: Vec>, /// Arena buffer holding all collation sort keys concatenated. pub collation_key_buffer: Vec<u8>, - /// End offsets into `collation_key_buffer` for each line's sort key. - pub collation_key_ends: Vec<usize>, + /// Byte ranges into `collation_key_buffer`; `None` means this line should use lazy collation. + pub collation_key_ranges: Vec<Option<Range<usize>>>, } impl LineData<'_> { /// Get the collation sort key for a line at the given index. 
- pub fn collation_key(&self, index: usize) -> &[u8] { - let start = if index == 0 { - 0 - } else { - self.collation_key_ends[index - 1] - }; - let end = self.collation_key_ends[index]; - &self.collation_key_buffer[start..end] + pub fn collation_key(&self, index: usize) -> Option<&[u8]> { + self.collation_key_ranges[index] + .as_ref() + .map(|range| &self.collation_key_buffer[range.clone()]) } } @@ -82,7 +78,7 @@ impl Chunk { contents.line_data.parsed_floats.clear(); contents.line_data.line_num_floats.clear(); contents.line_data.collation_key_buffer.clear(); - contents.line_data.collation_key_ends.clear(); + contents.line_data.collation_key_ranges.clear(); contents.token_buffer.clear(); let lines = unsafe { // SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime, @@ -107,7 +103,7 @@ impl Chunk { parsed_floats: std::mem::take(&mut contents.line_data.parsed_floats), line_num_floats: std::mem::take(&mut contents.line_data.line_num_floats), collation_key_buffer: std::mem::take(&mut contents.line_data.collation_key_buffer), - collation_key_ends: std::mem::take(&mut contents.line_data.collation_key_ends), + collation_key_ranges: std::mem::take(&mut contents.line_data.collation_key_ranges), token_buffer: std::mem::take(&mut contents.token_buffer), line_count_hint: contents.line_count_hint, // buffer is set below after we consume `self` @@ -134,7 +130,7 @@ pub struct RecycledChunk { parsed_floats: Vec, line_num_floats: Vec>, collation_key_buffer: Vec<u8>, - collation_key_ends: Vec<usize>, + collation_key_ranges: Vec<Option<Range<usize>>>, token_buffer: Vec>, line_count_hint: usize, buffer: Vec, @@ -149,7 +145,7 @@ impl RecycledChunk { parsed_floats: Vec::new(), line_num_floats: Vec::new(), collation_key_buffer: Vec::new(), - collation_key_ends: Vec::new(), + collation_key_ranges: Vec::new(), token_buffer: Vec::new(), line_count_hint: 0, buffer: vec![0; capacity], @@ -196,7 +192,7 @@ pub fn read( parsed_floats, line_num_floats, collation_key_buffer, - 
collation_key_ends, + collation_key_ranges, mut token_buffer, mut line_count_hint, mut buffer, @@ -236,7 +232,7 @@ pub fn read( parsed_floats, line_num_floats, collation_key_buffer, - collation_key_ends, + collation_key_ranges, }; parse_lines( read, @@ -279,7 +275,7 @@ fn parse_lines<'a>( assert!(line_data.parsed_floats.is_empty()); assert!(line_data.line_num_floats.is_empty()); assert!(line_data.collation_key_buffer.is_empty()); - assert!(line_data.collation_key_ends.is_empty()); + assert!(line_data.collation_key_ranges.is_empty()); token_buffer.clear(); let mut estimated = (*line_count_hint).max(1); let mut exact_line_count = None; diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index a360f5f95b6..142028524ea 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -134,6 +134,7 @@ const POSITIVE: &u8 = &b'+'; const MIN_AUTOMATIC_BUF_SIZE: usize = 512 * 1024; // 512 KiB const FALLBACK_AUTOMATIC_BUF_SIZE: usize = 32 * 1024 * 1024; // 32 MiB const MAX_AUTOMATIC_BUF_SIZE: usize = 1024 * 1024 * 1024; // 1 GiB +const MAX_PRECOMPUTED_COLLATION_KEY_LINE_LEN: usize = 1024 * 1024; // 1 MiB #[derive(Debug, Error)] pub enum SortError { @@ -643,10 +644,14 @@ impl<'a> Line<'a> { ) -> Self { #[cfg(feature = "i18n-collator")] if settings.precomputed.fast_locale_collation { - compute_sort_key_utf8(line, &mut line_data.collation_key_buffer); - line_data - .collation_key_ends - .push(line_data.collation_key_buffer.len()); + if line.len() <= MAX_PRECOMPUTED_COLLATION_KEY_LINE_LEN { + let start = line_data.collation_key_buffer.len(); + compute_sort_key_utf8(line, &mut line_data.collation_key_buffer); + let end = line_data.collation_key_buffer.len(); + line_data.collation_key_ranges.push(Some(start..end)); + } else { + line_data.collation_key_ranges.push(None); + } return Self { line, index }; } @@ -2653,9 +2658,13 @@ fn compare_by<'a>( #[cfg(feature = "i18n-collator")] if global_settings.precomputed.fast_locale_collation { - let a_key = 
a_line_data.collation_key(a.index); - let b_key = b_line_data.collation_key(b.index); - let mut cmp = a_key.cmp(b_key); + let mut cmp = match ( + a_line_data.collation_key(a.index), + b_line_data.collation_key(b.index), + ) { + (Some(a_key), Some(b_key)) => a_key.cmp(b_key), + _ => locale_cmp(a.line, b.line), + }; // If collation keys are equal, fall back to lexicographic comparison // This can be the case for inputs like `01` and `0_1`, which have equal keys if cmp == Ordering::Equal { From 8c85e7f3e08bb22d97c2e7cdce66d47c3ee90da1 Mon Sep 17 00:00:00 2001 From: mattsu Date: Tue, 5 May 2026 08:24:27 +0900 Subject: [PATCH 2/2] Lower sort collation key threshold --- src/uu/sort/src/sort.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 142028524ea..db63368c1ca 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -134,7 +134,7 @@ const POSITIVE: &u8 = &b'+'; const MIN_AUTOMATIC_BUF_SIZE: usize = 512 * 1024; // 512 KiB const FALLBACK_AUTOMATIC_BUF_SIZE: usize = 32 * 1024 * 1024; // 32 MiB const MAX_AUTOMATIC_BUF_SIZE: usize = 1024 * 1024 * 1024; // 1 GiB -const MAX_PRECOMPUTED_COLLATION_KEY_LINE_LEN: usize = 1024 * 1024; // 1 MiB +const MAX_PRECOMPUTED_COLLATION_KEY_LINE_LEN: usize = u16::MAX as usize; #[derive(Debug, Error)] pub enum SortError {