From 1681c91a356af14a2425a2017fea2fc675c8200a Mon Sep 17 00:00:00 2001 From: Yacin Tmimi Date: Tue, 9 Jul 2024 02:20:31 -0400 Subject: [PATCH] implement version-sorting algorithm for rust identifiers The algorithm is described in the [style guide] and was introduced in `r-l/rust 115046`. [style guide]: https://doc.rust-lang.org/nightly/style-guide/#sorting --- src/lib.rs | 1 + src/sort.rs | 368 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 369 insertions(+) create mode 100644 src/sort.rs diff --git a/src/lib.rs b/src/lib.rs index 7dc698dd92c..1f186b87767 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -93,6 +93,7 @@ macro_rules! static_regex { pub(crate) mod rustfmt_diff; mod shape; mod skip; +mod sort; pub(crate) mod source_file; pub(crate) mod source_map; mod spanned; diff --git a/src/sort.rs b/src/sort.rs new file mode 100644 index 00000000000..670f664a119 --- /dev/null +++ b/src/sort.rs @@ -0,0 +1,368 @@ +use itertools::EitherOrBoth; +use itertools::Itertools; + +/// Iterator which breaks an identifier into various [VersionChunk]s. +struct VersionChunkIter<'a> { + ident: &'a str, + start: usize, +} + +impl<'a> VersionChunkIter<'a> { + pub(crate) fn new(ident: &'a str) -> Self { + Self { ident, start: 0 } + } + + fn parse_numeric_chunk( + &mut self, + mut chars: std::str::CharIndices<'a>, + ) -> Option> { + let mut end = self.start; + let mut is_end_of_chunk = false; + + while let Some((idx, c)) = chars.next() { + end = self.start + idx; + + if c.is_ascii_digit() { + continue; + } + + is_end_of_chunk = true; + break; + } + + let source = if is_end_of_chunk { + let value = &self.ident[self.start..end]; + self.start = end; + value + } else { + let value = &self.ident[self.start..]; + self.start = self.ident.len(); + value + }; + + let zeros = source.chars().take_while(|c| *c == '0').count(); + let value = source.parse::().ok()?; + + Some(VersionChunk::Number { + value, + zeros, + source, + }) + } + + fn parse_str_chunk( + &mut self, + mut chars: std::str::CharIndices<'a>, + ) -> Option> { + let mut end = self.start; + let mut is_end_of_chunk = false; + + while let Some((idx, c)) = chars.next() { + end = self.start + idx; + + if c == '_' { + is_end_of_chunk = true; + break; + } + + if !c.is_numeric() { + continue; + } + + is_end_of_chunk = true; + break; + } + + let source = if is_end_of_chunk { + let value = &self.ident[self.start..end]; + self.start = end; + value + } else { + let value = &self.ident[self.start..]; + self.start = self.ident.len(); + value + }; + + Some(VersionChunk::Str(source)) + } +} + +impl<'a> Iterator for VersionChunkIter<'a> { + type Item = VersionChunk<'a>; + + fn next(&mut self) -> Option { + let mut chars = self.ident[self.start..].char_indices(); + let (_, next) = chars.next()?; + + if next == '_' { + self.start = self.start + next.len_utf8(); + return Some(VersionChunk::Underscore); + } + + if next.is_ascii_digit() { + return self.parse_numeric_chunk(chars); + } + + self.parse_str_chunk(chars) + } +} + +/// Represents a chunk in the version-sort algorithm +#[derive(Debug, PartialEq, Eq)] +enum VersionChunk<'a> { + /// A single `_` in an identifier. Underscores are sorted before all other characters. + Underscore, + /// A &str chunk in the version sort. + Str(&'a str), + /// A numeric chunk in the version sort. Keeps track of the numeric value and leading zeros. + Number { + value: usize, + zeros: usize, + source: &'a str, + }, +} + +/// Determine which side of the version-sort comparison had more leading zeros. +#[derive(Debug, PartialEq, Eq)] +enum MoreLeadingZeros { + Left, + Right, + Equal, +} + +/// Compare two identifiers based on the version sorting algorithm described in [the style guide] +/// +/// [the style guide]: https://doc.rust-lang.org/nightly/style-guide/#sorting +pub(crate) fn version_sort(a: &str, b: &str) -> std::cmp::Ordering { + let iter_a = VersionChunkIter::new(a); + let iter_b = VersionChunkIter::new(b); + let mut more_leading_zeros = MoreLeadingZeros::Equal; + + for either_or_both in iter_a.zip_longest(iter_b) { + match either_or_both { + EitherOrBoth::Left(_) => return std::cmp::Ordering::Greater, + EitherOrBoth::Right(_) => return std::cmp::Ordering::Less, + EitherOrBoth::Both(a, b) => match (a, b) { + (VersionChunk::Underscore, VersionChunk::Underscore) => { + continue; + } + (VersionChunk::Underscore, _) => return std::cmp::Ordering::Less, + (_, VersionChunk::Underscore) => return std::cmp::Ordering::Greater, + (VersionChunk::Str(ca), VersionChunk::Str(cb)) + | (VersionChunk::Str(ca), VersionChunk::Number { source: cb, .. }) + | (VersionChunk::Number { source: ca, .. }, VersionChunk::Str(cb)) => { + match ca.cmp(&cb) { + std::cmp::Ordering::Equal => { + continue; + } + order @ _ => return order, + } + } + ( + VersionChunk::Number { + value: va, + zeros: lza, + .. + }, + VersionChunk::Number { + value: vb, + zeros: lzb, + .. + }, + ) => match va.cmp(&vb) { + std::cmp::Ordering::Equal => { + if lza == lzb { + continue; + } + + if more_leading_zeros == MoreLeadingZeros::Equal && lza > lzb { + more_leading_zeros = MoreLeadingZeros::Left; + } else if more_leading_zeros == MoreLeadingZeros::Equal && lza < lzb { + more_leading_zeros = MoreLeadingZeros::Right; + } + continue; + } + order @ _ => return order, + }, + }, + } + } + + match more_leading_zeros { + MoreLeadingZeros::Equal => std::cmp::Ordering::Equal, + MoreLeadingZeros::Left => std::cmp::Ordering::Less, + MoreLeadingZeros::Right => std::cmp::Ordering::Greater, + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_chunks() { + let mut iter = VersionChunkIter::new("x86_128"); + assert_eq!(iter.next(), Some(VersionChunk::Str("x"))); + assert_eq!( + iter.next(), + Some(VersionChunk::Number { + value: 86, + zeros: 0, + source: "86" + }) + ); + assert_eq!(iter.next(), Some(VersionChunk::Underscore)); + assert_eq!( + iter.next(), + Some(VersionChunk::Number { + value: 128, + zeros: 0, + source: "128" + }) + ); + assert_eq!(iter.next(), None); + + let mut iter = VersionChunkIter::new("w005s09t"); + assert_eq!(iter.next(), Some(VersionChunk::Str("w"))); + assert_eq!( + iter.next(), + Some(VersionChunk::Number { + value: 5, + zeros: 2, + source: "005" + }) + ); + assert_eq!(iter.next(), Some(VersionChunk::Str("s"))); + assert_eq!( + iter.next(), + Some(VersionChunk::Number { + value: 9, + zeros: 1, + source: "09" + }) + ); + assert_eq!(iter.next(), Some(VersionChunk::Str("t"))); + assert_eq!(iter.next(), None); + + let mut iter = VersionChunkIter::new("ZY_WX"); + assert_eq!(iter.next(), Some(VersionChunk::Str("ZY"))); + assert_eq!(iter.next(), Some(VersionChunk::Underscore)); + assert_eq!(iter.next(), Some(VersionChunk::Str("WX"))); + + let mut iter = VersionChunkIter::new("_v1"); + assert_eq!(iter.next(), Some(VersionChunk::Underscore)); + assert_eq!(iter.next(), Some(VersionChunk::Str("v"))); + assert_eq!( + iter.next(), + Some(VersionChunk::Number { + value: 1, + zeros: 0, + source: "1" + }) + ); + + let mut iter = VersionChunkIter::new("_1v"); + assert_eq!(iter.next(), Some(VersionChunk::Underscore)); + assert_eq!( + iter.next(), + Some(VersionChunk::Number { + value: 1, + zeros: 0, + source: "1" + }) + ); + assert_eq!(iter.next(), Some(VersionChunk::Str("v"))); + + let mut iter = VersionChunkIter::new("v009"); + assert_eq!(iter.next(), Some(VersionChunk::Str("v"))); + assert_eq!( + iter.next(), + Some(VersionChunk::Number { + value: 9, + zeros: 2, + source: "009" + }) + ); + } + + #[test] + fn test_version_sort() { + let mut input = vec!["", "b", "a"]; + let expected = vec!["", "a", "b"]; + input.sort_by(|a, b| version_sort(a, b)); + assert_eq!(input, expected); + + let mut input = vec!["x7x", "xxx"]; + let expected = vec!["x7x", "xxx"]; + input.sort_by(|a, b| version_sort(a, b)); + assert_eq!(input, expected); + + let mut input = vec!["applesauce", "apple"]; + let expected = vec!["apple", "applesauce"]; + input.sort_by(|a, b| version_sort(a, b)); + assert_eq!(input, expected); + + let mut input = vec!["aaaaa", "aaa_a"]; + let expected = vec!["aaa_a", "aaaaa"]; + input.sort_by(|a, b| version_sort(a, b)); + assert_eq!(input, expected); + + let mut input = vec!["AAAAA", "AAA1A", "BBBBB", "BB_BB", "C3CCC"]; + let expected = vec!["AAA1A", "AAAAA", "BB_BB", "BBBBB", "C3CCC"]; + input.sort_by(|a, b| version_sort(a, b)); + assert_eq!(input, expected); + + let mut input = vec!["1_000_000", "1_010_001"]; + let expected = vec!["1_000_000", "1_010_001"]; + input.sort_by(|a, b| version_sort(a, b)); + assert_eq!(input, expected); + + let mut input = vec![ + "5", "50", "500", "5_000", "5_005", "5_050", "5_500", "50_000", "50_005", "50_050", + "50_500", + ]; + let expected = vec![ + "5", "5_000", "5_005", "5_050", "5_500", "50", "50_000", "50_005", "50_050", "50_500", + "500", + ]; + input.sort_by(|a, b| version_sort(a, b)); + assert_eq!(input, expected); + + let mut input = vec!["X86_64", "x86_64", "X86_128", "x86_128"]; + let expected = vec!["X86_64", "X86_128", "x86_64", "x86_128"]; + input.sort_by(|a, b| version_sort(a, b)); + assert_eq!(input, expected); + + let mut input = vec!["__", "_"]; + let expected = vec!["_", "__"]; + input.sort_by(|a, b| version_sort(a, b)); + assert_eq!(input, expected); + + let mut input = vec!["foo_", "foo"]; + let expected = vec!["foo", "foo_"]; + input.sort_by(|a, b| version_sort(a, b)); + assert_eq!(input, expected); + + let mut input = vec!["A", "AA", "B", "a", "aA", "aa", "b"]; + let expected = vec!["A", "AA", "B", "a", "aA", "aa", "b"]; + input.sort_by(|a, b| version_sort(a, b)); + assert_eq!(input, expected); + + let mut input = vec![ + "x86_128", "usize", "uz", "v000", "v00", "v0", "v0s", "v00t", "v0u", "v001", "v01", + "v1", "v009", "x87", "zyxw", "_ZYXW", "_abcd", "A2", "ABCD", "Z_YXW", "ZY_XW", "ZY_XW", + "ZYXW", "v09", "v9", "v010", "v10", "w005s09t", "w5s009t", "x64", "x86", "x86_32", + "ua", "x86_64", "ZYXW_", "a1", "abcd", "u_zzz", "u8", "u16", "u32", "u64", "u128", + "u256", + ]; + let expected = vec![ + "_ZYXW", "_abcd", "A2", "ABCD", "Z_YXW", "ZY_XW", "ZY_XW", "ZYXW", "ZYXW_", "a1", + "abcd", "u_zzz", "u8", "u16", "u32", "u64", "u128", "u256", "ua", "usize", "uz", + "v000", "v00", "v0", "v0s", "v00t", "v0u", "v001", "v01", "v1", "v009", "v09", "v9", + "v010", "v10", "w005s09t", "w5s009t", "x64", "x86", "x86_32", "x86_64", "x86_128", + "x87", "zyxw", + ]; + input.sort_by(|a, b| version_sort(a, b)); + assert_eq!(input, expected) + } +}