Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 13 additions & 17 deletions src/uu/sort/src/chunks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,20 +55,16 @@ pub struct LineData<'a> {
pub line_num_floats: Vec<Option<f64>>,
/// Arena buffer holding all collation sort keys concatenated.
pub collation_key_buffer: Vec<u8>,
/// End offsets into `collation_key_buffer` for each line's sort key.
pub collation_key_ends: Vec<usize>,
/// Byte ranges into `collation_key_buffer`; `None` means this line should use lazy collation.
pub collation_key_ranges: Vec<Option<Range<usize>>>,
}

impl LineData<'_> {
/// Get the precomputed collation sort key for the line at the given index,
/// or `None` if no key was precomputed for that line.
pub fn collation_key(&self, index: usize) -> &[u8] {
let start = if index == 0 {
0
} else {
self.collation_key_ends[index - 1]
};
let end = self.collation_key_ends[index];
&self.collation_key_buffer[start..end]
pub fn collation_key(&self, index: usize) -> Option<&[u8]> {
self.collation_key_ranges[index]
.as_ref()
.map(|range| &self.collation_key_buffer[range.clone()])
}
}

Expand All @@ -82,7 +78,7 @@ impl Chunk {
contents.line_data.parsed_floats.clear();
contents.line_data.line_num_floats.clear();
contents.line_data.collation_key_buffer.clear();
contents.line_data.collation_key_ends.clear();
contents.line_data.collation_key_ranges.clear();
contents.token_buffer.clear();
let lines = unsafe {
// SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime,
Expand All @@ -107,7 +103,7 @@ impl Chunk {
parsed_floats: std::mem::take(&mut contents.line_data.parsed_floats),
line_num_floats: std::mem::take(&mut contents.line_data.line_num_floats),
collation_key_buffer: std::mem::take(&mut contents.line_data.collation_key_buffer),
collation_key_ends: std::mem::take(&mut contents.line_data.collation_key_ends),
collation_key_ranges: std::mem::take(&mut contents.line_data.collation_key_ranges),
token_buffer: std::mem::take(&mut contents.token_buffer),
line_count_hint: contents.line_count_hint,
// buffer is set below after we consume `self`
Expand All @@ -134,7 +130,7 @@ pub struct RecycledChunk {
parsed_floats: Vec<GeneralBigDecimalParseResult>,
line_num_floats: Vec<Option<f64>>,
collation_key_buffer: Vec<u8>,
collation_key_ends: Vec<usize>,
collation_key_ranges: Vec<Option<Range<usize>>>,
token_buffer: Vec<Range<usize>>,
line_count_hint: usize,
buffer: Vec<u8>,
Expand All @@ -149,7 +145,7 @@ impl RecycledChunk {
parsed_floats: Vec::new(),
line_num_floats: Vec::new(),
collation_key_buffer: Vec::new(),
collation_key_ends: Vec::new(),
collation_key_ranges: Vec::new(),
token_buffer: Vec::new(),
line_count_hint: 0,
buffer: vec![0; capacity],
Expand Down Expand Up @@ -196,7 +192,7 @@ pub fn read<T: Read>(
parsed_floats,
line_num_floats,
collation_key_buffer,
collation_key_ends,
collation_key_ranges,
mut token_buffer,
mut line_count_hint,
mut buffer,
Expand Down Expand Up @@ -236,7 +232,7 @@ pub fn read<T: Read>(
parsed_floats,
line_num_floats,
collation_key_buffer,
collation_key_ends,
collation_key_ranges,
};
parse_lines(
read,
Expand Down Expand Up @@ -279,7 +275,7 @@ fn parse_lines<'a>(
assert!(line_data.parsed_floats.is_empty());
assert!(line_data.line_num_floats.is_empty());
assert!(line_data.collation_key_buffer.is_empty());
assert!(line_data.collation_key_ends.is_empty());
assert!(line_data.collation_key_ranges.is_empty());
token_buffer.clear();
let mut estimated = (*line_count_hint).max(1);
let mut exact_line_count = None;
Expand Down
23 changes: 16 additions & 7 deletions src/uu/sort/src/sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ const POSITIVE: &u8 = &b'+';
const MIN_AUTOMATIC_BUF_SIZE: usize = 512 * 1024; // 512 KiB
const FALLBACK_AUTOMATIC_BUF_SIZE: usize = 32 * 1024 * 1024; // 32 MiB
const MAX_AUTOMATIC_BUF_SIZE: usize = 1024 * 1024 * 1024; // 1 GiB
const MAX_PRECOMPUTED_COLLATION_KEY_LINE_LEN: usize = u16::MAX as usize;

#[derive(Debug, Error)]
pub enum SortError {
Expand Down Expand Up @@ -643,10 +644,14 @@ impl<'a> Line<'a> {
) -> Self {
#[cfg(feature = "i18n-collator")]
if settings.precomputed.fast_locale_collation {
compute_sort_key_utf8(line, &mut line_data.collation_key_buffer);
line_data
.collation_key_ends
.push(line_data.collation_key_buffer.len());
if line.len() <= MAX_PRECOMPUTED_COLLATION_KEY_LINE_LEN {
let start = line_data.collation_key_buffer.len();
compute_sort_key_utf8(line, &mut line_data.collation_key_buffer);
let end = line_data.collation_key_buffer.len();
line_data.collation_key_ranges.push(Some(start..end));
} else {
line_data.collation_key_ranges.push(None);
}
return Self { line, index };
}

Expand Down Expand Up @@ -2653,9 +2658,13 @@ fn compare_by<'a>(

#[cfg(feature = "i18n-collator")]
if global_settings.precomputed.fast_locale_collation {
let a_key = a_line_data.collation_key(a.index);
let b_key = b_line_data.collation_key(b.index);
let mut cmp = a_key.cmp(b_key);
let mut cmp = match (
a_line_data.collation_key(a.index),
b_line_data.collation_key(b.index),
) {
(Some(a_key), Some(b_key)) => a_key.cmp(b_key),
_ => locale_cmp(a.line, b.line),
};
// If the comparison above is a tie, fall back to lexicographic comparison.
// Equal collation keys can occur for inputs like `01` and `0_1`.
if cmp == Ordering::Equal {
Expand Down
Loading