Auto merge of #88834 - the8472:char-count, r=joshtriplett
optimize `str::from_utf8()` validation when slice contains multibyte chars and `str.chars().count()` in all cases

The change shows small but consistent improvements across several x86 target feature levels. I also tried to optimize counting with `slice.as_chunks`, but that yielded more inconsistent results: bigger improvements for some optimization levels, lesser ones in others.

```
old, -O2, x86-64
test str::str_char_count_emoji ... bench: 1,924 ns/iter (+/- 26)
test str::str_char_count_lorem ... bench: 879 ns/iter (+/- 12)
test str::str_char_count_lorem_short ... bench: 5 ns/iter (+/- 0)

new, -O2, x86-64
test str::str_char_count_emoji ... bench: 1,878 ns/iter (+/- 21)
test str::str_char_count_lorem ... bench: 851 ns/iter (+/- 11)
test str::str_char_count_lorem_short ... bench: 4 ns/iter (+/- 0)

old, -O2, x86-64-v2
test str::str_char_count_emoji ... bench: 1,477 ns/iter (+/- 46)
test str::str_char_count_lorem ... bench: 675 ns/iter (+/- 15)
test str::str_char_count_lorem_short ... bench: 5 ns/iter (+/- 0)

new, -O2, x86-64-v2
test str::str_char_count_emoji ... bench: 1,323 ns/iter (+/- 39)
test str::str_char_count_lorem ... bench: 593 ns/iter (+/- 18)
test str::str_char_count_lorem_short ... bench: 4 ns/iter (+/- 0)

old, -O2, x86-64-v3
test str::str_char_count_emoji ... bench: 748 ns/iter (+/- 7)
test str::str_char_count_lorem ... bench: 348 ns/iter (+/- 2)
test str::str_char_count_lorem_short ... bench: 5 ns/iter (+/- 0)

new, -O2, x86-64-v3
test str::str_char_count_emoji ... bench: 650 ns/iter (+/- 4)
test str::str_char_count_lorem ... bench: 301 ns/iter (+/- 1)
test str::str_char_count_lorem_short ... bench: 5 ns/iter (+/- 0)
```

and for the multibyte-char string validation:

```
old, -O2, x86-64
test str::str_validate_emoji ... bench: 4,606 ns/iter (+/- 64)

new, -O2, x86-64
test str::str_validate_emoji ... bench: 3,837 ns/iter (+/- 60)
```
This commit is contained in:
commit
175b8db73b
@@ -15,3 +15,4 @@ mod num;
|
||||
mod ops;
|
||||
mod pattern;
|
||||
mod slice;
|
||||
mod str;
|
||||
|
33
library/core/benches/str.rs
Normal file
33
library/core/benches/str.rs
Normal file
File diff suppressed because one or more lines are too long
@@ -22,7 +22,7 @@ fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
|
||||
/// bits `10`).
|
||||
#[inline]
|
||||
pub(super) fn utf8_is_cont_byte(byte: u8) -> bool {
|
||||
(byte & !CONT_MASK) == TAG_CONT_U8
|
||||
(byte as i8) < -64
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -163,7 +163,7 @@ pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
|
||||
// %xF4 %x80-8F 2( UTF8-tail )
|
||||
match w {
|
||||
2 => {
|
||||
if next!() & !CONT_MASK != TAG_CONT_U8 {
|
||||
if next!() as i8 >= -64 {
|
||||
err!(Some(1))
|
||||
}
|
||||
}
|
||||
@@ -175,7 +175,7 @@ pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
|
||||
| (0xEE..=0xEF, 0x80..=0xBF) => {}
|
||||
_ => err!(Some(1)),
|
||||
}
|
||||
if next!() & !CONT_MASK != TAG_CONT_U8 {
|
||||
if next!() as i8 >= -64 {
|
||||
err!(Some(2))
|
||||
}
|
||||
}
|
||||
@@ -184,10 +184,10 @@ pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
|
||||
(0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {}
|
||||
_ => err!(Some(1)),
|
||||
}
|
||||
if next!() & !CONT_MASK != TAG_CONT_U8 {
|
||||
if next!() as i8 >= -64 {
|
||||
err!(Some(2))
|
||||
}
|
||||
if next!() & !CONT_MASK != TAG_CONT_U8 {
|
||||
if next!() as i8 >= -64 {
|
||||
err!(Some(3))
|
||||
}
|
||||
}
|
||||
@@ -258,8 +258,6 @@ pub fn utf8_char_width(b: u8) -> usize {
|
||||
|
||||
/// Mask of the value bits of a continuation byte.
|
||||
const CONT_MASK: u8 = 0b0011_1111;
|
||||
/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte.
|
||||
const TAG_CONT_U8: u8 = 0b1000_0000;
|
||||
|
||||
// truncate `&str` to length at most equal to `max`
|
||||
// return `true` if it were truncated, and the new str.
|
||||
|
Loading…
x
Reference in New Issue
Block a user