Optimize str::iter::Chars::advance_by
This avoids part of the char decoding work by not looking at UTF-8 continuation bytes.
This commit is contained in:
parent
3f55e8665c
commit
40cf1f9257
@ -1170,6 +1170,17 @@ fn test_iterator() {
|
|||||||
assert_eq!(s.chars().count(), v.len());
|
assert_eq!(s.chars().count(), v.len());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Exercises `Chars::advance_by` on a long multi-byte string and checks that
/// the character yielded after each skip is the one at the expected index.
#[test]
fn test_iterator_advance() {
    let text = "「赤錆」と呼ばれる鉄錆は、水の存在下での鉄の自然酸化によって生じる、オキシ水酸化鉄(III) 等の(含水)酸化物粒子の疎な凝集膜であるとみなせる。";
    // Reference sequence produced by plain char-by-char iteration.
    let expected: Vec<char> = text.chars().collect();
    let mut cursor = text.chars();

    // Skipping a single character must leave the cursor on index 1.
    cursor.advance_by(1).unwrap();
    assert_eq!(cursor.next(), Some(expected[1]));

    // Two characters are consumed so far (indices 0 and 1); skipping 33 more
    // puts the cursor on index 35.
    cursor.advance_by(33).unwrap();
    assert_eq!(cursor.next(), Some(expected[35]));
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_rev_iterator() {
|
fn test_rev_iterator() {
|
||||||
let s = "ศไทย中华Việt Nam";
|
let s = "ศไทย中华Việt Nam";
|
||||||
|
@ -8,6 +8,7 @@
|
|||||||
use crate::ops::Try;
|
use crate::ops::Try;
|
||||||
use crate::option;
|
use crate::option;
|
||||||
use crate::slice::{self, Split as SliceSplit};
|
use crate::slice::{self, Split as SliceSplit};
|
||||||
|
use core::num::NonZeroUsize;
|
||||||
|
|
||||||
use super::from_utf8_unchecked;
|
use super::from_utf8_unchecked;
|
||||||
use super::pattern::Pattern;
|
use super::pattern::Pattern;
|
||||||
@ -49,6 +50,55 @@ fn count(self) -> usize {
|
|||||||
super::count::count_chars(self.as_str())
|
super::count::count_chars(self.as_str())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn advance_by(&mut self, mut remainder: usize) -> Result<(), NonZeroUsize> {
|
||||||
|
const CHUNK_SIZE: usize = 32;
|
||||||
|
|
||||||
|
if remainder >= CHUNK_SIZE {
|
||||||
|
let mut chunks = self.iter.as_slice().array_chunks::<CHUNK_SIZE>();
|
||||||
|
let mut bytes_skipped: usize = 0;
|
||||||
|
|
||||||
|
while remainder > CHUNK_SIZE
|
||||||
|
&& let Some(chunk) = chunks.next()
|
||||||
|
{
|
||||||
|
bytes_skipped += CHUNK_SIZE;
|
||||||
|
|
||||||
|
let mut start_bytes = [false; CHUNK_SIZE];
|
||||||
|
|
||||||
|
for i in 0..CHUNK_SIZE {
|
||||||
|
start_bytes[i] = !super::validations::utf8_is_cont_byte(chunk[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
remainder -= start_bytes.into_iter().map(|i| i as u8).sum::<u8>() as usize;
|
||||||
|
}
|
||||||
|
|
||||||
|
// SAFETY: The amount of bytes exists since we just iterated over them,
|
||||||
|
// so advance_by will succeed.
|
||||||
|
unsafe { self.iter.advance_by(bytes_skipped).unwrap_unchecked() };
|
||||||
|
|
||||||
|
// skip trailing continuation bytes
|
||||||
|
while self.iter.len() > 0 {
|
||||||
|
let b = self.iter.as_slice()[0];
|
||||||
|
if !super::validations::utf8_is_cont_byte(b) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// SAFETY: We just peeked at the byte, therefore it exists
|
||||||
|
unsafe { self.iter.advance_by(1).unwrap_unchecked() };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
while (remainder > 0) && (self.iter.len() > 0) {
|
||||||
|
remainder -= 1;
|
||||||
|
let b = self.iter.as_slice()[0];
|
||||||
|
let slurp = super::validations::utf8_char_width(b);
|
||||||
|
// SAFETY: utf8 validity requires that the string must contain
|
||||||
|
// the continuation bytes (if any)
|
||||||
|
unsafe { self.iter.advance_by(slurp).unwrap_unchecked() };
|
||||||
|
}
|
||||||
|
|
||||||
|
NonZeroUsize::new(remainder).map_or(Ok(()), Err)
|
||||||
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||||
let len = self.iter.len();
|
let len = self.iter.len();
|
||||||
|
Loading…
Reference in New Issue
Block a user