From e6fa5c18b56806aff5525c67f851a250bd8089f7 Mon Sep 17 00:00:00 2001 From: Andrew Tribick Date: Thu, 20 Jul 2023 21:52:33 +0200 Subject: [PATCH 1/2] Fix size_hint for EncodeUtf16 --- library/alloc/tests/str.rs | 22 ++++++++++++++++++++++ library/core/src/str/iter.rs | 19 ++++++++++++++----- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs index 82c1a9f9ad7..8a4b4ac4e8d 100644 --- a/library/alloc/tests/str.rs +++ b/library/alloc/tests/str.rs @@ -1738,6 +1738,28 @@ fn test_utf16_code_units() { assert_eq!("é\u{1F4A9}".encode_utf16().collect::<Vec<u16>>(), [0xE9, 0xD83D, 0xDCA9]) } +#[test] +fn test_utf16_size_hint() { + assert_eq!("".encode_utf16().size_hint(), (0, Some(0))); + assert_eq!("123".encode_utf16().size_hint(), (1, Some(3))); + assert_eq!("1234".encode_utf16().size_hint(), (2, Some(4))); + assert_eq!("12345678".encode_utf16().size_hint(), (3, Some(8))); + + fn hint_vec(src: &str) -> Vec<(usize, Option<usize>)> { + let mut it = src.encode_utf16(); + let mut result = Vec::new(); + result.push(it.size_hint()); + while it.next().is_some() { + result.push(it.size_hint()) + } + result + } + + assert_eq!(hint_vec("12"), [(1, Some(2)), (1, Some(1)), (0, Some(0))]); + assert_eq!(hint_vec("\u{101234}"), [(2, Some(4)), (1, Some(1)), (0, Some(0))]); + assert_eq!(hint_vec("\u{101234}a"), [(2, Some(5)), (2, Some(2)), (1, Some(1)), (0, Some(0))]); +} + #[test] fn starts_with_in_unicode() { assert!(!"├── Cargo.toml".starts_with("# ")); diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs index 772c3605562..133167a7067 100644 --- a/library/core/src/str/iter.rs +++ b/library/core/src/str/iter.rs @@ -1439,11 +1439,20 @@ impl<'a> Iterator for EncodeUtf16<'a> { #[inline] fn size_hint(&self) -> (usize, Option<usize>) { - let (low, high) = self.chars.size_hint(); - // every char gets either one u16 or two u16, - // so this iterator is between 1 or 2 times as - // long as the underlying iterator.
- (low, high.and_then(|n| n.checked_mul(2))) + let len = self.chars.iter.len(); + // The highest bytes:code units ratio occurs for 3-byte sequences, so + // use this to determine the lower bound for the hint. The lowest + // ratio is for 1-byte sequences, so use this for the upper bound. + // `(len + 2)` can't overflow, because we know that the `slice::Iter` + // belongs to a slice in memory which has a maximum length of + // `isize::MAX` (that's well below `usize::MAX`) + if self.extra == 0 { + ((len + 2) / 3, Some(len)) + } else { + // We're in the middle of a surrogate pair, so add the remaining + // surrogate to the bounds. + ((len + 2) / 3 + 1, Some(len + 1)) + } } } From f777339af3eac0c0226417d3b63d50cbfd42eef2 Mon Sep 17 00:00:00 2001 From: Andrew Tribick Date: Fri, 21 Jul 2023 23:49:31 +0200 Subject: [PATCH 2/2] Clarify logic on bytes:code units ratio --- library/core/src/str/iter.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs index 133167a7067..cd16810c4dd 100644 --- a/library/core/src/str/iter.rs +++ b/library/core/src/str/iter.rs @@ -1440,8 +1440,10 @@ impl<'a> Iterator for EncodeUtf16<'a> { #[inline] fn size_hint(&self) -> (usize, Option<usize>) { let len = self.chars.iter.len(); - // The highest bytes:code units ratio occurs for 3-byte sequences, so - // use this to determine the lower bound for the hint. The lowest + // The highest bytes:code units ratio occurs for 3-byte sequences, + // since a 4-byte sequence results in 2 code units. The lower bound + // is therefore determined by assuming the remaining bytes contain as + // many 3-byte sequences as possible. The lowest bytes:code units // ratio is for 1-byte sequences, so use this for the upper bound. // `(len + 2)` can't overflow, because we know that the `slice::Iter` // belongs to a slice in memory which has a maximum length of