From 40cf1f9257628d40e38a4eca9e1b8ea03a3abcd1 Mon Sep 17 00:00:00 2001 From: The 8472 Date: Tue, 29 Aug 2023 02:01:24 +0200 Subject: [PATCH] optimize str::iter::Chars::advance_by this avoids part of the char decoding work by not looking at utf8 continuation bytes --- library/alloc/tests/str.rs | 11 ++++++++ library/core/src/str/iter.rs | 50 ++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs index cb59a9d4ab2..df8a260624a 100644 --- a/library/alloc/tests/str.rs +++ b/library/alloc/tests/str.rs @@ -1170,6 +1170,17 @@ fn test_iterator() { assert_eq!(s.chars().count(), v.len()); } +#[test] +fn test_iterator_advance() { + let s = "「赤錆」と呼ばれる鉄錆は、水の存在下での鉄の自然酸化によって生じる、オキシ水酸化鉄(III) 等の(含水)酸化物粒子の疎な凝集膜であるとみなせる。"; + let chars: Vec = s.chars().collect(); + let mut it = s.chars(); + it.advance_by(1).unwrap(); + assert_eq!(it.next(), Some(chars[1])); + it.advance_by(33).unwrap(); + assert_eq!(it.next(), Some(chars[35])); +} + #[test] fn test_rev_iterator() { let s = "ศไทย中华Việt Nam"; diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs index c30f01b3c06..dd2efb00516 100644 --- a/library/core/src/str/iter.rs +++ b/library/core/src/str/iter.rs @@ -8,6 +8,7 @@ use crate::ops::Try; use crate::option; use crate::slice::{self, Split as SliceSplit}; +use core::num::NonZeroUsize; use super::from_utf8_unchecked; use super::pattern::Pattern; @@ -49,6 +50,55 @@ fn count(self) -> usize { super::count::count_chars(self.as_str()) } + #[inline] + fn advance_by(&mut self, mut remainder: usize) -> Result<(), NonZeroUsize> { + const CHUNK_SIZE: usize = 32; + + if remainder >= CHUNK_SIZE { + let mut chunks = self.iter.as_slice().array_chunks::(); + let mut bytes_skipped: usize = 0; + + while remainder > CHUNK_SIZE + && let Some(chunk) = chunks.next() + { + bytes_skipped += CHUNK_SIZE; + + let mut start_bytes = [false; CHUNK_SIZE]; + + for i in 0..CHUNK_SIZE { + start_bytes[i] = !super::validations::utf8_is_cont_byte(chunk[i]); + } + + remainder -= start_bytes.into_iter().map(|i| i as u8).sum::() as usize; + } + + // SAFETY: The amount of bytes exists since we just iterated over them, + // so advance_by will succeed. + unsafe { self.iter.advance_by(bytes_skipped).unwrap_unchecked() }; + + // skip trailing continuation bytes + while self.iter.len() > 0 { + let b = self.iter.as_slice()[0]; + if !super::validations::utf8_is_cont_byte(b) { + break; + } + // SAFETY: We just peeked at the byte, therefore it exists + unsafe { self.iter.advance_by(1).unwrap_unchecked() }; + } + } + + while (remainder > 0) && (self.iter.len() > 0) { + remainder -= 1; + let b = self.iter.as_slice()[0]; + let slurp = super::validations::utf8_char_width(b); + // SAFETY: utf8 validity requires that the string must contain + // the continuation bytes (if any) + unsafe { self.iter.advance_by(slurp).unwrap_unchecked() }; + } + + NonZeroUsize::new(remainder).map_or(Ok(()), Err) + } + #[inline] fn size_hint(&self) -> (usize, Option) { let len = self.iter.len();