Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions src/uu/sort/benches/sort_locale_utf8_bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,68 @@ fn sort_unique_utf8_locale(bencher: Bencher) {
});
}

/// Benchmark sorting of very long lines under a UTF-8 locale.
///
/// Each input line consists of one letter repeated many times. This mirrors the
/// pathological case from issue #12138, where computing full collation sort keys
/// for multi-megabyte lines caused a 40x slowdown vs GNU sort.
///
/// The input is 26 lines of 1 MB each (one per lowercase ASCII letter), which keeps
/// the benchmark fast while still exercising the prefix-based sort key optimization.
#[divan::bench]
fn sort_very_long_lines_utf8_locale(bencher: Bencher) {
    // Number of repeated bytes per line (1 MB), excluding the trailing newline.
    const LINE_LEN: usize = 1_000_000;

    // Build the input: for every letter 'a'..='z', LINE_LEN copies of it plus '\n'.
    let mut input: Vec<u8> = Vec::new();
    for letter in b'a'..=b'z' {
        input.resize(input.len() + LINE_LEN, letter);
        input.push(b'\n');
    }

    let input_path = setup_test_file(&input);
    // Keep the temp file handle bound for the whole function so the output path
    // stays valid while the benchmark runs.
    let out_file = NamedTempFile::new().unwrap();
    let out_path = out_file.path().to_str().unwrap().to_owned();

    let args = [
        "--parallel",
        "1",
        "-o",
        out_path.as_str(),
        input_path.to_str().unwrap(),
    ];

    // Warm up: one untimed run before the measured iterations.
    black_box(run_util_function(uumain, &args));

    bencher.bench(|| {
        black_box(run_util_function(uumain, &args));
    });
}

/// Benchmark sorting of lines that share a long common prefix and only differ
/// after the first 8 KB.
///
/// This exercises the fallback from prefix sort keys to full locale comparison:
/// every line starts with the same 16 KB run of `x` and ends with a 100-byte
/// suffix made of a single letter that is different for each of the 26 lines.
#[divan::bench]
fn sort_long_common_prefix_utf8_locale(bencher: Bencher) {
    // 16 KB common prefix (exceeds the 8 KB sort key limit).
    const PREFIX_LEN: usize = 16 * 1024;
    // Length of the per-line distinguishing suffix.
    const SUFFIX_LEN: usize = 100;

    let shared_prefix: Vec<u8> = vec![b'x'; PREFIX_LEN];

    // 26 lines: shared prefix, then SUFFIX_LEN copies of the line's letter, then '\n'.
    let mut input: Vec<u8> = Vec::new();
    for letter in b'a'..=b'z' {
        input.extend_from_slice(&shared_prefix);
        input.resize(input.len() + SUFFIX_LEN, letter);
        input.push(b'\n');
    }

    let input_path = setup_test_file(&input);
    // Keep the temp file handle bound for the whole function so the output path
    // stays valid while the benchmark runs.
    let out_file = NamedTempFile::new().unwrap();
    let out_path = out_file.path().to_str().unwrap().to_owned();

    let args = [
        "--parallel",
        "1",
        "-o",
        out_path.as_str(),
        input_path.to_str().unwrap(),
    ];

    // Warm up: one untimed run before the measured iterations.
    black_box(run_util_function(uumain, &args));

    bencher.bench(|| {
        black_box(run_util_function(uumain, &args));
    });
}

fn main() {
// Set UTF-8 locale BEFORE any benchmarks run.
// This must happen before divan::main() because the locale is cached
Expand Down
Loading