Rollup merge of #113898 - ajtribick:encode_utf16_size_hint, r=cuviper

Fix size_hint for EncodeUtf16: more realistic upper and lower bounds, and handle the case where the iterator is located within a surrogate pair. Resolves #113897
2023-07-22 11:48:54 +02:00 · 2023-07-22 11:48:54 +02:00 · 65b5cba0dd
commit 65b5cba0dd
parent 746d507c72 f777339af3
2 changed files with 38 additions and 5 deletions
--- a/library/alloc/tests/str.rs
+++ b/library/alloc/tests/str.rs
@ -1738,6 +1738,28 @@ fn test_utf16_code_units() {
    assert_eq!("é\u{1F4A9}".encode_utf16().collect::<Vec<u16>>(), [0xE9, 0xD83D, 0xDCA9])
 }
 #[test]
 fn test_utf16_size_hint() {
    // Records the size hint before iteration starts and again after every
    // code unit the iterator yields, so the last entry is the exhausted state.
    fn hint_vec(src: &str) -> Vec<(usize, Option<usize>)> {
        let mut units = src.encode_utf16();
        let mut hints = vec![units.size_hint()];
        while let Some(_) = units.next() {
            hints.push(units.size_hint());
        }
        hints
    }

    // Fresh iterators over ASCII text: the expected lower bound is the byte
    // length divided by three (rounded up), the upper bound is the byte length.
    let fresh_cases: [(&str, (usize, Option<usize>)); 4] = [
        ("", (0, Some(0))),
        ("123", (1, Some(3))),
        ("1234", (2, Some(4))),
        ("12345678", (3, Some(8))),
    ];
    for &(text, expected) in fresh_cases.iter() {
        assert_eq!(text.encode_utf16().size_hint(), expected);
    }

    // Hints observed while stepping through the iterator. `\u{101234}` is
    // encoded as a surrogate pair, so after its first code unit exactly one
    // more unit is still pending even though no input bytes remain.
    assert_eq!(hint_vec("12"), vec![(1, Some(2)), (1, Some(1)), (0, Some(0))]);
    assert_eq!(hint_vec("\u{101234}"), vec![(2, Some(4)), (1, Some(1)), (0, Some(0))]);
    assert_eq!(
        hint_vec("\u{101234}a"),
        vec![(2, Some(5)), (2, Some(2)), (1, Some(1)), (0, Some(0))]
    );
 }
 #[test]
 fn starts_with_in_unicode() {
    assert!(!"├── Cargo.toml".starts_with("# "));
--- a/library/core/src/str/iter.rs
+++ b/library/core/src/str/iter.rs
@ -1439,11 +1439,22 @@ impl<'a> Iterator for EncodeUtf16<'a> {
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
-        let (low, high) = self.chars.size_hint();
+        let len = self.chars.iter.len();
-        // every char gets either one u16 or two u16,
+        // The highest bytes:code units ratio occurs for 3-byte sequences,
-        // so this iterator is between 1 or 2 times as
+        // since a 4-byte sequence results in 2 code units. The lower bound
-        // long as the underlying iterator.
+        // is therefore determined by assuming the remaining bytes contain as
-        (low, high.and_then(|n| n.checked_mul(2)))
+        // many 3-byte sequences as possible. The lowest bytes:code units
        // ratio is for 1-byte sequences, so use this for the upper bound.
        // `(len + 2)` can't overflow, because we know that the `slice::Iter`
        // belongs to a slice in memory which has a maximum length of
        // `isize::MAX` (that's well below `usize::MAX`).
        if self.extra == 0 {
            ((len + 2) / 3, Some(len))
        } else {
            // We're in the middle of a surrogate pair, so add the remaining
            // surrogate to the bounds.
            ((len + 2) / 3 + 1, Some(len + 1))
        }
    }
 }