Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions src/uu/sort/src/chunks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ pub struct LineData<'a> {
pub collation_key_buffer: Vec<u8>,
/// End offsets into `collation_key_buffer` for each line's sort key.
pub collation_key_ends: Vec<usize>,
/// Tracks whether each line's sort key was computed from a truncated prefix.
/// When `true`, prefix sort keys that compare equal must fall back to full
/// locale comparison.
pub collation_key_truncated: Vec<bool>,
}

impl LineData<'_> {
Expand All @@ -83,6 +87,7 @@ impl Chunk {
contents.line_data.line_num_floats.clear();
contents.line_data.collation_key_buffer.clear();
contents.line_data.collation_key_ends.clear();
contents.line_data.collation_key_truncated.clear();
contents.token_buffer.clear();
let lines = unsafe {
// SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime,
Expand All @@ -108,6 +113,9 @@ impl Chunk {
line_num_floats: std::mem::take(&mut contents.line_data.line_num_floats),
collation_key_buffer: std::mem::take(&mut contents.line_data.collation_key_buffer),
collation_key_ends: std::mem::take(&mut contents.line_data.collation_key_ends),
collation_key_truncated: std::mem::take(
&mut contents.line_data.collation_key_truncated,
),
token_buffer: std::mem::take(&mut contents.token_buffer),
line_count_hint: contents.line_count_hint,
// buffer is set below after we consume `self`
Expand Down Expand Up @@ -135,6 +143,7 @@ pub struct RecycledChunk {
line_num_floats: Vec<Option<f64>>,
collation_key_buffer: Vec<u8>,
collation_key_ends: Vec<usize>,
collation_key_truncated: Vec<bool>,
token_buffer: Vec<Range<usize>>,
line_count_hint: usize,
buffer: Vec<u8>,
Expand All @@ -150,6 +159,7 @@ impl RecycledChunk {
line_num_floats: Vec::new(),
collation_key_buffer: Vec::new(),
collation_key_ends: Vec::new(),
collation_key_truncated: Vec::new(),
token_buffer: Vec::new(),
line_count_hint: 0,
buffer: vec![0; capacity],
Expand Down Expand Up @@ -197,6 +207,7 @@ pub fn read<T: Read>(
line_num_floats,
collation_key_buffer,
collation_key_ends,
collation_key_truncated,
mut token_buffer,
mut line_count_hint,
mut buffer,
Expand Down Expand Up @@ -237,6 +248,7 @@ pub fn read<T: Read>(
line_num_floats,
collation_key_buffer,
collation_key_ends,
collation_key_truncated,
};
parse_lines(
read,
Expand Down
20 changes: 15 additions & 5 deletions src/uu/sort/src/sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -643,10 +643,11 @@ impl<'a> Line<'a> {
) -> Self {
#[cfg(feature = "i18n-collator")]
if settings.precomputed.fast_locale_collation {
compute_sort_key_utf8(line, &mut line_data.collation_key_buffer);
let truncated = compute_sort_key_utf8(line, &mut line_data.collation_key_buffer);
line_data
.collation_key_ends
.push(line_data.collation_key_buffer.len());
line_data.collation_key_truncated.push(truncated);
return Self { line, index };
}

Expand Down Expand Up @@ -2656,11 +2657,20 @@ fn compare_by<'a>(
let a_key = a_line_data.collation_key(a.index);
let b_key = b_line_data.collation_key(b.index);
let mut cmp = a_key.cmp(b_key);
// If collation keys are equal, fall back to lexicographic comparison
// This can be the case for inputs like `01` and `0_1`, which have equal keys
// If collation keys are equal, we need to distinguish two cases:
if cmp == Ordering::Equal {
// Reversing the order to match sort's sorting behaviour
cmp = b.line.cmp(a.line);
let a_truncated = a_line_data.collation_key_truncated[a.index];
let b_truncated = b_line_data.collation_key_truncated[b.index];
if a_truncated || b_truncated {
// Prefix sort keys matched but at least one line was truncated.
// Fall back to full locale comparison for correctness.
cmp = locale_cmp(a.line, b.line);
}
// If still equal (or neither was truncated), use reverse lexicographic
// tiebreak to match GNU sort's behaviour for inputs like `01` vs `0_1`.
if cmp == Ordering::Equal {
cmp = b.line.cmp(a.line);
}
}
return if global_settings.reverse {
cmp.reverse()
Expand Down
33 changes: 28 additions & 5 deletions src/uucore/src/lib/features/i18n/collator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,15 +74,38 @@ pub fn init_locale_collation() -> bool {
try_init_collator(opts)
}

/// Compute the ICU collation sort key for the given input bytes and append it to `buf`.
/// This allows pre-computing sort keys once per line, then comparing them with simple
/// byte comparison during sorting (much faster than calling `compare_utf8` per comparison).
pub fn compute_sort_key_utf8(input: &[u8], buf: &mut Vec<u8>) {
/// Cap on input bytes used to compute a sort key. Callers must fall back to
/// `locale_cmp` when prefix keys tie. 8 KiB bounds key cost on multi-MB lines
/// without hitting the fallback for realistic inputs — see issue #12138
/// (unbounded path was ~40× slower than GNU `sort`).
const SORT_KEY_PREFIX_LIMIT: usize = 8 * 1024;

/// Append the ICU collation sort key for `input` to `buf`, using at most
/// `SORT_KEY_PREFIX_LIMIT` bytes of `input`.
///
/// Returns `true` if the input was truncated; the caller must then fall back
/// to `locale_cmp` when two prefix keys compare equal, since such lines may
/// differ only past the truncation point.
///
/// # Panics
///
/// Panics if the global collator has not been initialized, or if ICU fails to
/// write the sort key (both are invariant violations, not recoverable errors).
pub fn compute_sort_key_utf8(input: &[u8], buf: &mut Vec<u8>) -> bool {
    let collator = COLLATOR
        .get()
        .expect("compute_sort_key_utf8 called before collator initialization");
    let truncated = input.len() > SORT_KEY_PREFIX_LIMIT;
    let effective_input = if truncated {
        // Back off to a UTF-8 char boundary so we never hand ICU a split
        // multi-byte sequence. Indexing `input[end]` is in bounds because
        // `input.len() > SORT_KEY_PREFIX_LIMIT >= end`.
        let mut end = SORT_KEY_PREFIX_LIMIT;
        while end > 0 && !is_utf8_char_boundary(input[end]) {
            end -= 1;
        }
        &input[..end]
    } else {
        input
    };
    collator
        .write_sort_key_utf8_to(effective_input, buf)
        .expect("ICU write_sort_key_utf8_to failed");
    truncated
}

#[inline]
fn is_utf8_char_boundary(b: u8) -> bool {
    // A byte starts a UTF-8 scalar value iff it is not a continuation byte;
    // continuation bytes are 10xxxxxx, i.e. the range 0x80..=0xBF.
    !(0x80..=0xBF).contains(&b)
}

/// Compare both strings with regard to the current locale.
Expand Down
48 changes: 48 additions & 0 deletions tests/by-util/test_sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2981,4 +2981,52 @@ fn test_consistent_sorting_with_i18n_collate() {
.stdout_is(expected_output);
}

#[test]
#[cfg(unix)]
fn test_locale_utf8_long_lines_differ_after_prefix_limit() {
    // Regression test for #12138: lines sharing a prefix longer than the
    // 8 KiB sort-key limit have equal truncated keys, so sorting must fall
    // back to full locale comparison to order them correctly.
    let locale = "en_US.UTF-8";
    if !is_locale_available(locale) {
        return;
    }
    // Both lines share a 16 KiB prefix and differ only after the limit.
    let shared_prefix = "x".repeat(16 * 1024);
    let smaller = format!("{shared_prefix}a\n");
    let larger = format!("{shared_prefix}b\n");
    // Feed the lines in reverse order and expect sorted output.
    new_ucmd!()
        .env("LC_ALL", locale)
        .pipe_in(format!("{larger}{smaller}"))
        .succeeds()
        .stdout_is(format!("{smaller}{larger}"));
}

#[test]
#[cfg(unix)]
fn test_locale_utf8_truncation_at_multibyte_boundary() {
    // Lines just over the 8 KiB sort-key prefix limit, with a multi-byte
    // UTF-8 character (é = 0xC3 0xA9) straddling that limit. The truncation
    // logic must back off to a valid char boundary rather than split the
    // sequence, and the fallback path must then order the lines correctly.
    let locale = "en_US.UTF-8";
    if !is_locale_available(locale) {
        return;
    }
    // 8191 padding bytes place 'é' at bytes 8191..8193 (straddling 8192);
    // the differing ASCII byte ('a' vs 'b') sits past the limit.
    let padding = "x".repeat(8 * 1024 - 1);
    let smaller = format!("{padding}éa\n");
    let larger = format!("{padding}éb\n");
    // Feed the lines in reverse order and expect sorted output.
    new_ucmd!()
        .env("LC_ALL", locale)
        .pipe_in(format!("{larger}{smaller}"))
        .succeeds()
        .stdout_is(format!("{smaller}{larger}"));
}

/* spell-checker: enable */
Loading