From b582bd388f5693119bbefa85ed7ea055760f9eef Mon Sep 17 00:00:00 2001
From: Scott McMurray
Date: Sat, 26 Feb 2022 18:57:15 -0800
Subject: [PATCH] For MIRI, cfg out the swap logic from 94212

---
 library/core/src/mem/mod.rs          | 27 ++++++++++++++++++++++-----
 library/core/src/ptr/mod.rs          | 23 +++++++++++++++--------
 src/test/codegen/swap-large-types.rs | 27 +++++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 13 deletions(-)

diff --git a/library/core/src/mem/mod.rs b/library/core/src/mem/mod.rs
index b5c1ae37e5e..8a99bed6a96 100644
--- a/library/core/src/mem/mod.rs
+++ b/library/core/src/mem/mod.rs
@@ -708,7 +708,10 @@ pub const fn swap<T>(x: &mut T, y: &mut T) {
     // understanding `mem::replace`, `Option::take`, etc. - a better overall
     // solution might be to make `ptr::swap_nonoverlapping` into an intrinsic, which
     // a backend can choose to implement using the block optimization, or not.
-    #[cfg(not(target_arch = "spirv"))]
+    // NOTE(scottmcm) MIRI is disabled here as reading in smaller units is a
+    // pessimization for it. Also, if the type contains any unaligned pointers,
+    // copying those over multiple reads is difficult to support.
+    #[cfg(not(any(target_arch = "spirv", miri)))]
     {
         // For types that are larger multiples of their alignment, the simple way
         // tends to copy the whole thing to stack rather than doing it one part
@@ -737,12 +740,26 @@ pub const fn swap<T>(x: &mut T, y: &mut T) {
 #[rustc_const_unstable(feature = "const_swap", issue = "83163")]
 #[inline]
 pub(crate) const fn swap_simple<T>(x: &mut T, y: &mut T) {
+    // We arrange for this to typically be called with small types,
+    // so this reads-and-writes approach is actually better than using
+    // copy_nonoverlapping as it easily puts things in LLVM registers
+    // directly and doesn't end up inlining allocas.
+    // And LLVM actually optimizes it to 3×memcpy if called with
+    // a type larger than it's willing to keep in a register.
+    // Having typed reads and writes in MIR here is also good as
+    // it lets MIRI and CTFE understand them better, including things
+    // like enforcing type validity for them.
+    // Importantly, read+copy_nonoverlapping+write introduces confusing
+    // asymmetry to the behaviour where one value went through read+write
+    // whereas the other was copied over by the intrinsic (see #94371).
+
     // SAFETY: exclusive references are always valid to read/write,
-    // are non-overlapping, and nothing here panics so it's drop-safe.
+    // including being aligned, and nothing here panics so it's drop-safe.
     unsafe {
-        let z = ptr::read(x);
-        ptr::copy_nonoverlapping(y, x, 1);
-        ptr::write(y, z);
+        let a = ptr::read(x);
+        let b = ptr::read(y);
+        ptr::write(x, b);
+        ptr::write(y, a);
     }
 }
diff --git a/library/core/src/ptr/mod.rs b/library/core/src/ptr/mod.rs
index ff71fadb614..59b1b4c1367 100644
--- a/library/core/src/ptr/mod.rs
+++ b/library/core/src/ptr/mod.rs
@@ -419,6 +419,7 @@ pub const fn slice_from_raw_parts_mut<T>(data: *mut T, len: usize) -> *mut [T] {
 #[stable(feature = "swap_nonoverlapping", since = "1.27.0")]
 #[rustc_const_unstable(feature = "const_swap", issue = "83163")]
 pub const unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
+    #[allow(unused)]
     macro_rules! attempt_swap_as_chunks {
         ($ChunkTy:ty) => {
             if mem::align_of::<T>() >= mem::align_of::<$ChunkTy>()
@@ -437,15 +438,21 @@ macro_rules! attempt_swap_as_chunks {
         };
     }
 
-    // Split up the slice into small power-of-two-sized chunks that LLVM is able
-    // to vectorize (unless it's a special type with more-than-pointer alignment,
-    // because we don't want to pessimize things like slices of SIMD vectors.)
-    if mem::align_of::<T>() <= mem::size_of::<usize>()
-        && (!mem::size_of::<T>().is_power_of_two()
-            || mem::size_of::<T>() > mem::size_of::<usize>() * 2)
+    // NOTE(scottmcm) MIRI is disabled here as reading in smaller units is a
+    // pessimization for it. Also, if the type contains any unaligned pointers,
+    // copying those over multiple reads is difficult to support.
+    #[cfg(not(miri))]
     {
-        attempt_swap_as_chunks!(usize);
-        attempt_swap_as_chunks!(u8);
+        // Split up the slice into small power-of-two-sized chunks that LLVM is able
+        // to vectorize (unless it's a special type with more-than-pointer alignment,
+        // because we don't want to pessimize things like slices of SIMD vectors.)
+        if mem::align_of::<T>() <= mem::size_of::<usize>()
+            && (!mem::size_of::<T>().is_power_of_two()
+                || mem::size_of::<T>() > mem::size_of::<usize>() * 2)
+        {
+            attempt_swap_as_chunks!(usize);
+            attempt_swap_as_chunks!(u8);
+        }
     }
 
     // SAFETY: Same preconditions as this function
diff --git a/src/test/codegen/swap-large-types.rs b/src/test/codegen/swap-large-types.rs
index 535d301a3d2..91a1ab7144f 100644
--- a/src/test/codegen/swap-large-types.rs
+++ b/src/test/codegen/swap-large-types.rs
@@ -39,6 +39,9 @@ pub fn swap_std(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
     swap(x, y)
 }
 
+// Verify that types with usize alignment are swapped via vectored usizes,
+// not falling back to byte-level code.
+
 // CHECK-LABEL: @swap_slice
 #[no_mangle]
 pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) {
@@ -50,6 +53,8 @@ pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) {
         x.swap_with_slice(y);
     }
 }
 
+// But for a large align-1 type, vectorized byte copying is what we want.
+
 type OneKilobyteBuffer = [u8; 1024];
 
 // CHECK-LABEL: @swap_1kb_slices
@@ -62,3 +67,25 @@ pub fn swap_1kb_slices(x: &mut [OneKilobyteBuffer], y: &mut [OneKilobyteBuffer])
         x.swap_with_slice(y);
     }
 }
+
+// This verifies that the 2×read + 2×write optimizes to just 3 memcpys
+// for an unusual type like this. It's not clear whether we should do anything
+// smarter in Rust for these, so for now it's fine to leave these up to the backend.
+// That's not as bad as it might seem, as for example, LLVM will lower the
+// memcpys below to VMOVAPS on YMMs if one enables the AVX target feature.
+// Eventually we'll be able to pass `align_of::<T>` to a const generic and
+// thus pick a smarter chunk size ourselves without huge code duplication.
+
+#[repr(align(64))]
+pub struct BigButHighlyAligned([u8; 64 * 3]);
+
+// CHECK-LABEL: @swap_big_aligned
+#[no_mangle]
+pub fn swap_big_aligned(x: &mut BigButHighlyAligned, y: &mut BigButHighlyAligned) {
+// CHECK-NOT: call void @llvm.memcpy
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 64 dereferenceable(192)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 64 dereferenceable(192)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 64 dereferenceable(192)
+// CHECK-NOT: call void @llvm.memcpy
+    swap(x, y)
+}
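
For readers who want to experiment with the chunked-swap idea that this patch gates behind cfg(not(miri)), here is a standalone sketch of the technique: swap two non-overlapping byte regions in usize-sized chunks when the pointers happen to be suitably aligned, falling back to byte-at-a-time swaps otherwise. This sketch is not part of the patch; the name `swap_chunked` and its raw-byte-pointer interface are invented for illustration, and core's real `swap_nonoverlapping` is typed and considerably more careful than this.

use std::mem::{align_of, size_of};
use std::ptr;

/// # Safety
/// `x` and `y` must each be valid for reads and writes of `len` bytes,
/// and the two regions must not overlap.
unsafe fn swap_chunked(x: *mut u8, y: *mut u8, len: usize) {
    let chunk = size_of::<usize>();
    let mut i = 0;
    // Only take the chunked path when both pointers are usize-aligned.
    if (x as usize) % align_of::<usize>() == 0 && (y as usize) % align_of::<usize>() == 0 {
        while i + chunk <= len {
            // SAFETY: in-bounds, aligned, and non-overlapping per the caller's contract.
            unsafe {
                let px = x.add(i).cast::<usize>();
                let py = y.add(i).cast::<usize>();
                let a = ptr::read(px);
                let b = ptr::read(py);
                ptr::write(px, b);
                ptr::write(py, a);
            }
            i += chunk;
        }
    }
    // Swap whatever remains (or everything, on the unaligned path) byte by byte.
    while i < len {
        // SAFETY: in-bounds and non-overlapping per the caller's contract.
        unsafe {
            let a = ptr::read(x.add(i));
            let b = ptr::read(y.add(i));
            ptr::write(x.add(i), b);
            ptr::write(y.add(i), a);
        }
        i += 1;
    }
}

fn main() {
    let mut a = [1u8; 19];
    let mut b = [2u8; 19];
    // SAFETY: two distinct local arrays of the same length cannot overlap.
    unsafe { swap_chunked(a.as_mut_ptr(), b.as_mut_ptr(), a.len()) };
    assert!(a.iter().all(|&v| v == 2));
    assert!(b.iter().all(|&v| v == 1));
    println!("swapped {} bytes", a.len());
}

Note the trade-off the patch comments describe: under MIRI, reading a value in smaller units than its type is a pessimization, and copying any contained pointers across multiple reads is difficult to support, which is why both chunked paths above are compiled out with cfg(not(miri)) while the simple typed read/write swap remains.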