From c585541e671bc76d60163356c435bfb32ef858b4 Mon Sep 17 00:00:00 2001 From: Joshua Wong Date: Fri, 12 Apr 2024 18:15:02 -0500 Subject: [PATCH] optimize in-place collection of `Vec` LLVM does not know that the multiplication never overflows, which causes it to generate unnecessary instructions. Use `usize::unchecked_mul`, so that it can fold the `dst_cap` calculation when `size_of::<I::Src>() == size_of::<T>()`. Running: ``` rustc -C llvm-args=-x86-asm-syntax=intel -O src/lib.rs --emit asm ``` ```rust pub struct Foo([usize; 3]); pub fn unwrap_copy(v: Vec<Foo>) -> Vec<[usize; 3]> { v.into_iter().map(|f| f.0).collect() } ``` Before this commit: ``` define void @unwrap_copy(ptr noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 %_16.i.i = mul i64 %me.sroa.0.0.copyload.i, 24 %dst_cap.i.i = udiv i64 %_16.i.i, 24 store i64 %dst_cap.i.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8 ret void } ``` After: ``` define void @unwrap_copy(ptr noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr 
%me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 store i64 %me.sroa.0.0.copyload.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8, !alias.scope !9, !noalias !14 ret void } ``` Note that there is still one more `mul,udiv` pair that I couldn't get rid of. The root cause is the same issue as #121239, the `nuw` gets stripped off of `ptr::sub_ptr`. --- library/alloc/src/vec/in_place_collect.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/library/alloc/src/vec/in_place_collect.rs b/library/alloc/src/vec/in_place_collect.rs index 88aa1b1b0e0..5b0fde93b91 100644 --- a/library/alloc/src/vec/in_place_collect.rs +++ b/library/alloc/src/vec/in_place_collect.rs @@ -259,7 +259,8 @@ fn from_iter_in_place(mut iterator: I) -> Vec inner.cap, inner.buf.cast::(), inner.end as *const T, - inner.cap * mem::size_of::() / mem::size_of::(), + // SAFETY: the multiplication can not overflow, since `inner.cap * size_of::()` is the size of the allocation. 
+ inner.cap.unchecked_mul(mem::size_of::()) / mem::size_of::(), ) }; @@ -373,8 +374,10 @@ impl SpecInPlaceCollect for I // - unlike most internal iteration methods, it only takes a &mut self // - it lets us thread the write pointer through its innards and get it back in the end let sink = InPlaceDrop { inner: dst_buf, dst: dst_buf }; - let sink = - self.try_fold::<_, _, Result<_, !>>(sink, write_in_place_with_drop(end)).unwrap(); + let sink = match self.try_fold::<_, _, Result<_, !>>(sink, write_in_place_with_drop(end)) { + Ok(sink) => sink, + Err(never) => match never {}, + }; // iteration succeeded, don't drop head unsafe { ManuallyDrop::new(sink).dst.sub_ptr(dst_buf) } }