From 2f9f2e507eaecf45074727e34af02642b95fa724 Mon Sep 17 00:00:00 2001 From: The 8472 Date: Sun, 26 Jun 2022 22:53:25 +0200 Subject: [PATCH] Optimized vec::IntoIter::next_chunk impl ``` test vec::bench_next_chunk ... bench: 696 ns/iter (+/- 22) x86_64v1, pr test vec::bench_next_chunk ... bench: 309 ns/iter (+/- 4) znver2, default test vec::bench_next_chunk ... bench: 17,272 ns/iter (+/- 117) znver2, pr test vec::bench_next_chunk ... bench: 211 ns/iter (+/- 3) ``` The znver2 default impl seems to be slow due to inlining decisions. It goes through `core::array::iter_next_chunk` which has a deeper call tree. --- library/alloc/benches/lib.rs | 1 + library/alloc/benches/vec.rs | 20 +++++++++++++++ library/alloc/src/lib.rs | 4 +++ library/alloc/src/vec/into_iter.rs | 41 ++++++++++++++++++++++++++++-- 4 files changed, 64 insertions(+), 2 deletions(-) diff --git a/library/alloc/benches/lib.rs b/library/alloc/benches/lib.rs index 7dc0f7cebd5..72ac897d4f1 100644 --- a/library/alloc/benches/lib.rs +++ b/library/alloc/benches/lib.rs @@ -2,6 +2,7 @@ // See https://github.com/rust-lang/rust/issues/73535#event-3477699747 #![cfg(not(target_os = "android"))] #![feature(btree_drain_filter)] +#![feature(iter_next_chunk)] #![feature(map_first_last)] #![feature(repr_simd)] #![feature(slice_partition_dedup)] diff --git a/library/alloc/benches/vec.rs b/library/alloc/benches/vec.rs index efc47327e8a..663f6b9dd1c 100644 --- a/library/alloc/benches/vec.rs +++ b/library/alloc/benches/vec.rs @@ -762,3 +762,23 @@ fn bench_retain_whole_100000(b: &mut Bencher) { let mut v = black_box(vec![826u32; 100000]); b.iter(|| v.retain(|x| *x == 826u32)); } + +#[bench] +fn bench_next_chunk(b: &mut Bencher) { + let v = vec![13u8; 2048]; + + b.iter(|| { + const CHUNK: usize = 8; + + let mut sum = [0u32; CHUNK]; + let mut iter = black_box(v.clone()).into_iter(); + + while let Ok(chunk) = iter.next_chunk::() { + for i in 0..CHUNK { + sum[i] += chunk[i] as u32; + } + } + + sum + }) +} diff --git a/library/alloc/src/lib.rs b/library/alloc/src/lib.rs index c08caa7b93e..cabed9e82f8 100644 --- a/library/alloc/src/lib.rs +++ b/library/alloc/src/lib.rs @@ -90,6 +90,7 @@ #![feature(alloc_layout_extra)] #![feature(allocator_api)] #![feature(array_chunks)] +#![feature(array_into_iter_constructors)] #![feature(array_methods)] #![feature(array_windows)] #![feature(assert_matches)] @@ -120,8 +121,11 @@ #![feature(hasher_prefixfree_extras)] #![feature(inplace_iteration)] #![feature(iter_advance_by)] +#![feature(iter_next_chunk)] #![feature(layout_for_ptr)] +#![feature(maybe_uninit_array_assume_init)] #![feature(maybe_uninit_slice)] +#![feature(maybe_uninit_uninit_array)] #![cfg_attr(test, feature(new_uninit))] #![feature(nonnull_slice_from_raw_parts)] #![feature(pattern)] diff --git a/library/alloc/src/vec/into_iter.rs b/library/alloc/src/vec/into_iter.rs index 28979457b7f..1b483e3fc77 100644 --- a/library/alloc/src/vec/into_iter.rs +++ b/library/alloc/src/vec/into_iter.rs @@ -2,13 +2,14 @@ use super::AsVecIntoIter; use crate::alloc::{Allocator, Global}; use crate::raw_vec::RawVec; +use core::array; use core::fmt; use core::intrinsics::arith_offset; use core::iter::{ FusedIterator, InPlaceIterable, SourceIter, TrustedLen, TrustedRandomAccessNoCoerce, }; use core::marker::PhantomData; -use core::mem::{self, ManuallyDrop}; +use core::mem::{self, ManuallyDrop, MaybeUninit}; #[cfg(not(no_global_oom_handling))] use core::ops::Deref; use core::ptr::{self, NonNull}; @@ -124,7 +125,6 @@ pub(super) fn forget_allocation_drop_remaining(&mut self) { } /// Forgets to Drop the remaining elements while still allowing the backing allocation to be freed. - #[cfg(not(no_global_oom_handling))] pub(crate) fn forget_remaining_elements(&mut self) { self.ptr = self.end; } @@ -204,6 +204,43 @@ fn count(self) -> usize { self.len() } + #[inline] + fn next_chunk(&mut self) -> Result<[T; N], core::array::IntoIter> { + let mut raw_ary = MaybeUninit::uninit_array(); + + let len = self.len(); + + if mem::size_of::() == 0 { + if len < N { + self.forget_remaining_elements(); + // Safety: ZSTs can be conjured ex nihilo, only the amount has to be correct + return Err(unsafe { array::IntoIter::new_unchecked(raw_ary, 0..len) }); + } + + self.ptr = unsafe { arith_offset(self.ptr as *const i8, N as isize) as *mut T }; + // Safety: ditto + return Ok(unsafe { MaybeUninit::array_assume_init(raw_ary) }); + } + + if len < N { + // Safety: `len` indicates that this many elements are available and we just checked that + // it fits into the array. + unsafe { + ptr::copy_nonoverlapping(self.ptr, raw_ary.as_mut_ptr() as *mut T, len); + self.forget_remaining_elements(); + return Err(array::IntoIter::new_unchecked(raw_ary, 0..len)); + } + } + + // Safety: `len` is larger than the array size. Copy a fixed amount here to fully initialize + // the array. + return unsafe { + ptr::copy_nonoverlapping(self.ptr, raw_ary.as_mut_ptr() as *mut T, N); + self.ptr = self.ptr.add(N); + Ok(MaybeUninit::array_assume_init(raw_ary)) + }; + } + unsafe fn __iterator_get_unchecked(&mut self, i: usize) -> Self::Item where Self: TrustedRandomAccessNoCoerce,