Optimize core::str::Chars::count
This commit is contained in:
parent
71226d717a
commit
628b217326
@ -2230,3 +2230,43 @@ fn utf8_chars() {
|
|||||||
assert!((!from_utf8(&[0xf0, 0xff, 0x10]).is_ok()));
|
assert!((!from_utf8(&[0xf0, 0xff, 0x10]).is_ok()));
|
||||||
assert!((!from_utf8(&[0xf0, 0xff, 0xff, 0x10]).is_ok()));
|
assert!((!from_utf8(&[0xf0, 0xff, 0xff, 0x10]).is_ok()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn utf8_char_counts() {
|
||||||
|
let strs = [("e", 1), ("é", 1), ("€", 1), ("\u{10000}", 1), ("eé€\u{10000}", 4)];
|
||||||
|
let mut reps = vec![1, 8, 64, 256, 512, 1024];
|
||||||
|
if cfg!(not(miri)) {
|
||||||
|
reps.push(1 << 16);
|
||||||
|
}
|
||||||
|
let counts = if cfg!(miri) { 0..1 } else { 0..8 };
|
||||||
|
let padding = counts.map(|len| " ".repeat(len)).collect::<Vec<String>>();
|
||||||
|
|
||||||
|
for repeat in reps {
|
||||||
|
for (tmpl_str, tmpl_char_count) in strs {
|
||||||
|
for pad_start in &padding {
|
||||||
|
for pad_end in &padding {
|
||||||
|
// Create a string with padding...
|
||||||
|
let with_padding =
|
||||||
|
format!("{}{}{}", pad_start, tmpl_str.repeat(repeat), pad_end);
|
||||||
|
// ...and then skip past that padding. This should ensure
|
||||||
|
// that we test several different alignments for both head
|
||||||
|
// and tail.
|
||||||
|
let si = pad_start.len();
|
||||||
|
let ei = with_padding.len() - pad_end.len();
|
||||||
|
let target = &with_padding[si..ei];
|
||||||
|
|
||||||
|
assert!(!target.starts_with(" ") && !target.ends_with(" "));
|
||||||
|
let expected_count = tmpl_char_count * repeat;
|
||||||
|
assert_eq!(
|
||||||
|
expected_count,
|
||||||
|
target.chars().count(),
|
||||||
|
"wrong count for `{:?}.repeat({})` (padding: `{:?}`)",
|
||||||
|
tmpl_str,
|
||||||
|
repeat,
|
||||||
|
(pad_start.len(), pad_end.len()),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
File diff suppressed because one or more lines are too long
101
library/core/benches/str/char_count.rs
Normal file
101
library/core/benches/str/char_count.rs
Normal file
@ -0,0 +1,101 @@
|
|||||||
|
use super::corpora::*;
|
||||||
|
use test::{black_box, Bencher};
|
||||||
|
|
||||||
|
macro_rules! define_benches {
|
||||||
|
($( fn $name: ident($arg: ident: &str) $body: block )+) => {
|
||||||
|
define_benches!(mod en_small, en::SMALL, $($name $arg $body)+);
|
||||||
|
define_benches!(mod en_medium, en::MEDIUM, $($name $arg $body)+);
|
||||||
|
define_benches!(mod en_large, en::LARGE, $($name $arg $body)+);
|
||||||
|
define_benches!(mod en_huge, en::HUGE, $($name $arg $body)+);
|
||||||
|
|
||||||
|
define_benches!(mod zh_small, zh::SMALL, $($name $arg $body)+);
|
||||||
|
define_benches!(mod zh_medium, zh::MEDIUM, $($name $arg $body)+);
|
||||||
|
define_benches!(mod zh_large, zh::LARGE, $($name $arg $body)+);
|
||||||
|
define_benches!(mod zh_huge, zh::HUGE, $($name $arg $body)+);
|
||||||
|
|
||||||
|
define_benches!(mod ru_small, ru::SMALL, $($name $arg $body)+);
|
||||||
|
define_benches!(mod ru_medium, ru::MEDIUM, $($name $arg $body)+);
|
||||||
|
define_benches!(mod ru_large, ru::LARGE, $($name $arg $body)+);
|
||||||
|
define_benches!(mod ru_huge, ru::HUGE, $($name $arg $body)+);
|
||||||
|
|
||||||
|
define_benches!(mod emoji_small, emoji::SMALL, $($name $arg $body)+);
|
||||||
|
define_benches!(mod emoji_medium, emoji::MEDIUM, $($name $arg $body)+);
|
||||||
|
define_benches!(mod emoji_large, emoji::LARGE, $($name $arg $body)+);
|
||||||
|
define_benches!(mod emoji_huge, emoji::HUGE, $($name $arg $body)+);
|
||||||
|
};
|
||||||
|
(mod $mod_name: ident, $input: expr, $($name: ident $arg: ident $body: block)+) => {
|
||||||
|
mod $mod_name {
|
||||||
|
use super::*;
|
||||||
|
$(
|
||||||
|
#[bench]
|
||||||
|
fn $name(bencher: &mut Bencher) {
|
||||||
|
let input = $input;
|
||||||
|
bencher.bytes = input.len() as u64;
|
||||||
|
let mut input_s = input.to_string();
|
||||||
|
bencher.iter(|| {
|
||||||
|
let $arg: &str = &black_box(&mut input_s);
|
||||||
|
black_box($body)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
)+
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
define_benches! {
|
||||||
|
fn case00_cur_libcore(s: &str) {
|
||||||
|
cur_libcore(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn case01_old_libcore(s: &str) {
|
||||||
|
old_libcore(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn case02_iter_increment(s: &str) {
|
||||||
|
iterator_increment(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn case03_manual_char_len(s: &str) {
|
||||||
|
manual_char_len(s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Benchmark subject for `case00_cur_libcore`: counts the `char`s in `s` by
// calling `Chars::count` directly.
fn cur_libcore(s: &str) -> usize {
|
||||||
|
s.chars().count()
|
||||||
|
}
|
||||||
|
// Returns `true` if `byte` is a UTF-8 continuation byte (`0b10XX_XXXX`).
//
// Reinterpreted as `i8`, the bytes `0x80..=0xBF` are exactly the values
// `-128..=-65`, so a single signed comparison against `-64` is enough.
#[inline]
|
||||||
|
fn utf8_is_cont_byte(byte: u8) -> bool {
|
||||||
|
(byte as i8) < -64
|
||||||
|
}
|
||||||
|
// Benchmark subject for `case01_old_libcore`: counts the bytes of `s` that are
// not continuation bytes, one byte at a time, using an iterator `filter`.
fn old_libcore(s: &str) -> usize {
|
||||||
|
s.as_bytes().iter().filter(|&&byte| !utf8_is_cont_byte(byte)).count()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Benchmark subject for `case02_iter_increment`: walks `s.chars()` with an
// explicit `for` loop and bumps a counter once per yielded `char`.
fn iterator_increment(s: &str) -> usize {
|
||||||
|
let mut c = 0;
|
||||||
|
for _ in s.chars() {
|
||||||
|
c += 1;
|
||||||
|
}
|
||||||
|
c
|
||||||
|
}
|
||||||
|
|
||||||
|
// Benchmark subject for `case03_manual_char_len`: counts `char`s by reading
// the byte at the current index, advancing the index by 1, 2, 3 or 4 depending
// on that byte's value, and counting one `char` per step.
//
// NOTE(review): the index only ever lands on the first byte of a sequence if
// the input is well-formed UTF-8, which holds here because the input is a
// `&str`.
fn manual_char_len(s: &str) -> usize {
|
||||||
|
let s = s.as_bytes();
|
||||||
|
let mut c = 0;
|
||||||
|
let mut i = 0;
|
||||||
|
let l = s.len();
|
||||||
|
while i < l {
|
||||||
|
let b = s[i];
|
||||||
|
// Below 0x80: advance past a single byte.
if b < 0x80 {
|
||||||
|
i += 1;
|
||||||
|
// Below 0xe0: advance past two bytes.
} else if b < 0xe0 {
|
||||||
|
i += 2;
|
||||||
|
// Below 0xf0: advance past three bytes.
} else if b < 0xf0 {
|
||||||
|
i += 3;
|
||||||
|
// Anything else: advance past four bytes.
} else {
|
||||||
|
i += 4;
|
||||||
|
}
|
||||||
|
c += 1;
|
||||||
|
}
|
||||||
|
c
|
||||||
|
}
|
83
library/core/benches/str/corpora.rs
Normal file
83
library/core/benches/str/corpora.rs
Normal file
@ -0,0 +1,83 @@
|
|||||||
|
//! Exposes a number of modules with different kinds of strings.
|
||||||
|
//!
|
||||||
|
//! Each module contains `&str` constants named `SMALL`, `MEDIUM`, `LARGE`, and
|
||||||
|
//! `HUGE`.
|
||||||
|
//!
|
||||||
|
//! - The `SMALL` string is generally around 30-40 bytes.
|
||||||
|
//! - The `MEDIUM` string is generally around 600-700 bytes.
|
||||||
|
//! - The `LARGE` string is the `MEDIUM` string repeated 8x, and is around 5kb.
|
||||||
|
//! - The `HUGE` string is the `LARGE` string repeated 8x (or the `MEDIUM`
|
||||||
|
//! string repeated 64x), and is around 40kb.
|
||||||
|
//!
|
||||||
|
//! Except for `mod emoji` (which is just a bunch of emoji), the strings were
|
||||||
|
//! pulled from (localizations of) rust-lang.org.
|
||||||
|
|
||||||
|
macro_rules! repeat8 {
|
||||||
|
($s:expr) => {
|
||||||
|
concat!($s, $s, $s, $s, $s, $s, $s, $s)
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Defines the `MEDIUM`, `LARGE`, and `HUGE` constants of a corpus module from
// the single string literal `$s`:
//
// - `MEDIUM` is `$s` itself.
// - `LARGE` is `$s` repeated 8 times.
// - `HUGE` is `LARGE` repeated 8 times, i.e. `$s` repeated 64 times.
//
// `SMALL` is not defined here; each corpus module declares its own.
macro_rules! define_consts {
    ($s:literal) => {
        pub const MEDIUM: &str = $s;
        pub const LARGE: &str = repeat8!($s);
        // Two levels of `repeat8!` give 8 * 8 = 64 copies of `$s`. A third
        // level would give 512 copies, which contradicts the sizes described
        // in the module documentation.
        pub const HUGE: &str = repeat8!(repeat8!($s));
    };
}
|
||||||
|
|
||||||
|
pub mod en {
|
||||||
|
pub const SMALL: &str = "Mary had a little lamb, Little lamb";
|
||||||
|
define_consts! {
|
||||||
|
"Rust is blazingly fast and memory-efficient: with no runtime or garbage
|
||||||
|
collector, it can power performance-critical services, run on embedded
|
||||||
|
devices, and easily integrate with other languages. Rust’s rich type system
|
||||||
|
and ownership model guarantee memory-safety and thread-safety — enabling you
|
||||||
|
to eliminate many classes of bugs at compile-time. Rust has great
|
||||||
|
documentation, a friendly compiler with useful error messages, and top-notch
|
||||||
|
tooling — an integrated package manager and build tool, smart multi-editor
|
||||||
|
support with auto-completion and type inspections, an auto-formatter, and
|
||||||
|
more."
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub mod zh {
|
||||||
|
pub const SMALL: &str = "度惊人且内存利用率极高";
|
||||||
|
define_consts! {
|
||||||
|
"Rust 速度惊人且内存利用率极高。由于\
|
||||||
|
没有运行时和垃圾回收,它能够胜任对性能要\
|
||||||
|
求特别高的服务,可以在嵌入式设备上运行,\
|
||||||
|
还能轻松和其他语言集成。Rust 丰富的类型\
|
||||||
|
系统和所有权模型保证了内存安全和线程安全,\
|
||||||
|
让您在编译期就能够消除各种各样的错误。\
|
||||||
|
Rust 拥有出色的文档、友好的编译器和清晰\
|
||||||
|
的错误提示信息, 还集成了一流的工具——\
|
||||||
|
包管理器和构建工具, 智能地自动补全和类\
|
||||||
|
型检验的多编辑器支持, 以及自动格式化代\
|
||||||
|
码等等。"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub mod ru {
|
||||||
|
pub const SMALL: &str = "Сотни компаний по";
|
||||||
|
define_consts! {
|
||||||
|
"Сотни компаний по всему миру используют Rust в реальных\
|
||||||
|
проектах для быстрых кросс-платформенных решений с\
|
||||||
|
ограниченными ресурсами. Такие проекты, как Firefox,\
|
||||||
|
Dropbox и Cloudflare, используют Rust. Rust отлично\
|
||||||
|
подходит как для стартапов, так и для больших компаний,\
|
||||||
|
как для встраиваемых устройств, так и для масштабируемых\
|
||||||
|
web-сервисов. Мой самый большой комплимент Rust."
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub mod emoji {
|
||||||
|
pub const SMALL: &str = "😀😃😄😁😆😅🤣😂🙂🙃😉😊😇🥰😍🤩😘";
|
||||||
|
define_consts! {
|
||||||
|
"😀😃😄😁😆😅🤣😂🙂🙃😉😊😇🥰😍🤩😘😗☺😚😙🥲😋😛😜🤪😝🤑🤗🤭🤫🤔🤐🤨😐😑😶😶🌫️😏😒\
|
||||||
|
🙄😬😮💨🤥😌😔😪🤤😴😷🤒🤕🤢🤮🤧🥵🥶🥴😵😵💫🤯<EFBFBD><EFBFBD>🥳🥸😎🤓🧐😕😟🙁☹😮😯😲😳🥺😦😧😨\
|
||||||
|
😰😥😢😭😱😖😣😞😓😩😫🥱😤😡😠🤬😈👿💀☠💩🤡👹👺👻👽👾🤖😺😸😹😻😼😽🙀😿😾🙈🙉🙊\
|
||||||
|
💋💌💘💝💖💗💓<EFBFBD><EFBFBD>💕💟❣💔❤️🔥❤️🩹❤🧡💛💚💙💜🤎🖤🤍💯💢💥💫💦💨🕳💬👁️🗨️🗨🗯💭💤👋\
|
||||||
|
🤚🖐✋🖖👌🤌🤏✌"
|
||||||
|
}
|
||||||
|
}
|
116
library/core/src/str/count.rs
Normal file
116
library/core/src/str/count.rs
Normal file
@ -0,0 +1,116 @@
|
|||||||
|
//! Code for efficiently counting the number of `char`s in a UTF-8 encoded
|
||||||
|
//! string.
|
||||||
|
//!
|
||||||
|
//! Broadly, UTF-8 encodes `char`s as a "leading" byte which begins the `char`,
|
||||||
|
//! followed by some number (possibly 0) of continuation bytes.
|
||||||
|
//!
|
||||||
|
//! The leading byte can have a number of bit-patterns (with the specific
|
||||||
|
//! pattern indicating how many continuation bytes follow), but the continuation
|
||||||
|
//! bytes are always in the format `0b10XX_XXXX` (where the `X`s can take any
|
||||||
|
//! value). That is, the most significant bit is set, and the second most
|
||||||
|
//! significant bit is unset.
|
||||||
|
//!
|
||||||
|
//! To count the number of characters, we can just count the number of bytes in
|
||||||
|
//! the string which are not continuation bytes, which can be done many bytes at
|
||||||
|
//! a time fairly easily.
|
||||||
|
//!
|
||||||
|
//! Note: Because the term "leading byte" can sometimes be ambiguous (for
|
||||||
|
//! example, it could also refer to the first byte of a slice), we'll often use
|
||||||
|
//! the term "non-continuation byte" to refer to these bytes in the code.
|
||||||
|
|
||||||
|
pub(super) fn count_chars(s: &str) -> usize {
|
||||||
|
// For correctness, `CHUNK_SIZE` must be:
|
||||||
|
// - Less than or equal to 255, otherwise we'll overflow bytes in `counts`.
|
||||||
|
// - A multiple of `UNROLL_INNER`, otherwise our `break` inside the
|
||||||
|
// `body.chunks(CHUNK_SIZE)` loop.
|
||||||
|
//
|
||||||
|
// For performance, `CHUNK_SIZE` should be:
|
||||||
|
// - Relatively cheap to `%` against.
|
||||||
|
// - Large enough to avoid paying for the cost of the `sum_bytes_in_usize`
|
||||||
|
// too often.
|
||||||
|
const CHUNK_SIZE: usize = 192;
|
||||||
|
const UNROLL_INNER: usize = 4;
|
||||||
|
|
||||||
|
// Check the properties of `CHUNK_SIZE` / `UNROLL_INNER` that are required
|
||||||
|
// for correctness.
|
||||||
|
const _: [(); 1] = [(); (CHUNK_SIZE < 256 && (CHUNK_SIZE % UNROLL_INNER) == 0) as usize];
|
||||||
|
// SAFETY: transmuting `[u8]` to `[usize]` is safe except for size
|
||||||
|
// differences which are handled by `align_to`.
|
||||||
|
let (head, body, tail) = unsafe { s.as_bytes().align_to::<usize>() };
|
||||||
|
|
||||||
|
let mut total = char_count_general_case(head) + char_count_general_case(tail);
|
||||||
|
// Split `body` into `CHUNK_SIZE` chunks to reduce the frequency with which
|
||||||
|
// we call `sum_bytes_in_usize`.
|
||||||
|
for chunk in body.chunks(CHUNK_SIZE) {
|
||||||
|
// We accumulate intermediate sums in `counts`, where each byte contains
|
||||||
|
// a subset of the sum of this chunk, like a `[u8; size_of::<usize>()]`.
|
||||||
|
let mut counts = 0;
|
||||||
|
let unrolled_chunks = chunk.array_chunks::<UNROLL_INNER>();
|
||||||
|
// If there's a remainder (know can only happen for the last item in
|
||||||
|
// `chunks`, because `CHUNK_SIZE % UNROLL == 0`), then we need to
|
||||||
|
// account for that (although we don't use it to later).
|
||||||
|
let remainder = unrolled_chunks.remainder();
|
||||||
|
for unrolled in unrolled_chunks {
|
||||||
|
for &word in unrolled {
|
||||||
|
// Because `CHUNK_SIZE` is < 256, this addition can't cause the
|
||||||
|
// count in any of the bytes to overflow into a subsequent byte.
|
||||||
|
counts += contains_non_continuation_byte(word);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sum the values in `counts` (which, again, is conceptually a `[u8;
|
||||||
|
// size_of::<usize>()]`), and accumulate the result into `total`.
|
||||||
|
total += sum_bytes_in_usize(counts);
|
||||||
|
|
||||||
|
// If there's any data in `remainder`, then handle it. This will only
|
||||||
|
// happen for the last `chunk` in `body.chunks()` (because `CHUNK_SIZE`
|
||||||
|
// is divisible by `UNROLL_INNER`), so we explicitly break at the end
|
||||||
|
// (which seems to help LLVM out).
|
||||||
|
if !remainder.is_empty() {
|
||||||
|
// Accumulate all the data in the remainder.
|
||||||
|
let mut counts = 0;
|
||||||
|
for &word in remainder {
|
||||||
|
counts += contains_non_continuation_byte(word);
|
||||||
|
}
|
||||||
|
total += sum_bytes_in_usize(counts);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
total
|
||||||
|
}
|
||||||
|
|
||||||
|
// Maps every byte of `w` to a 0/1 flag stored in that same byte position: the
// byte becomes `0x01` if it is *not* a UTF-8 continuation byte (i.e. it is the
// first byte of a sequence), and `0x00` if it is a continuation byte
// (`0b10XX_XXXX`).
#[inline]
fn contains_non_continuation_byte(w: usize) -> usize {
    // `0x01` in every byte of a `usize`, whatever its width
    // (`0xFF..FF / 0xFF == 0x01..01`).
    const ONE_PER_BYTE: usize = usize::MAX / 0xff;

    // Bit 0 of each byte now holds that byte's inverted most significant bit.
    let top_bit_clear = !w >> 7;
    // Bit 0 of each byte now holds that byte's second most significant bit.
    let second_bit_set = w >> 6;

    // A continuation byte has its top bit set and its second bit clear, so a
    // byte is a non-continuation byte when either flag is set. The mask keeps
    // only bit 0 of each byte, discarding bits shifted in from neighbours.
    (top_bit_clear | second_bit_set) & ONE_PER_BYTE
}
|
||||||
|
|
||||||
|
// Adds up the individual bytes of `values`, treating it as a
// `[u8; size_of::<usize>()]`. Morally the same as summing
// `values.to_ne_bytes()`, but done with a handful of word-sized operations.
#[inline]
fn sum_bytes_in_usize(values: usize) -> usize {
    // Selects the low byte of every 16-bit lane.
    const LOW_BYTE_OF_SHORT: usize = 0x00ff_00ff_00ff_00ff_u64 as usize;
    // `1` in every 16-bit lane.
    const ONE_PER_SHORT: usize = 0x0001_0001_0001_0001_u64 as usize;

    // Add the high byte of each 16-bit lane onto its low byte, leaving one
    // partial sum per lane.
    let low_bytes = values & LOW_BYTE_OF_SHORT;
    let high_bytes = (values >> 8) & LOW_BYTE_OF_SHORT;
    let lane_sums = low_bytes + high_bytes;

    // Multiplying by `ONE_PER_SHORT` accumulates every lane into the topmost
    // lane; shifting that lane down to the bottom yields the total.
    let top_lane_shift = usize::BITS - 16;
    lane_sums.wrapping_mul(ONE_PER_SHORT) >> top_lane_shift
}
|
||||||
|
|
||||||
|
// Byte-at-a-time version of "count the bytes which are not continuation
// bytes". `count_chars` uses it for the head and tail slices returned by
// `slice::align_to`, which are not handled a word at a time.
fn char_count_general_case(s: &[u8]) -> usize {
    // The two most significant bits are enough to recognise a continuation
    // byte, which always looks like `0b10XX_XXXX`.
    const TOP_TWO_BITS: u8 = 0b1100_0000;
    const CONT_TAG: u8 = 0b1000_0000;

    s.iter().filter(|&&byte| (byte & TOP_TWO_BITS) != CONT_TAG).count()
}
|
@ -12,7 +12,7 @@ use crate::slice::{self, Split as SliceSplit};
|
|||||||
use super::from_utf8_unchecked;
|
use super::from_utf8_unchecked;
|
||||||
use super::pattern::Pattern;
|
use super::pattern::Pattern;
|
||||||
use super::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher};
|
use super::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher};
|
||||||
use super::validations::{next_code_point, next_code_point_reverse, utf8_is_cont_byte};
|
use super::validations::{next_code_point, next_code_point_reverse};
|
||||||
use super::LinesAnyMap;
|
use super::LinesAnyMap;
|
||||||
use super::{BytesIsNotEmpty, UnsafeBytesToStr};
|
use super::{BytesIsNotEmpty, UnsafeBytesToStr};
|
||||||
use super::{CharEscapeDebugContinue, CharEscapeDefault, CharEscapeUnicode};
|
use super::{CharEscapeDebugContinue, CharEscapeDefault, CharEscapeUnicode};
|
||||||
@ -46,8 +46,7 @@ impl<'a> Iterator for Chars<'a> {
|
|||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn count(self) -> usize {
|
fn count(self) -> usize {
|
||||||
// length in `char` is equal to the number of non-continuation bytes
|
super::count::count_chars(self.as_str())
|
||||||
self.iter.filter(|&&byte| !utf8_is_cont_byte(byte)).count()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
|
@ -7,6 +7,7 @@
|
|||||||
#![stable(feature = "rust1", since = "1.0.0")]
|
#![stable(feature = "rust1", since = "1.0.0")]
|
||||||
|
|
||||||
mod converts;
|
mod converts;
|
||||||
|
mod count;
|
||||||
mod error;
|
mod error;
|
||||||
mod iter;
|
mod iter;
|
||||||
mod traits;
|
mod traits;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user