Use load-store instead of memcpy for short integer arrays

2023-05-26 03:32:22 -07:00 · 2023-05-26 03:32:22 -07:00 · e1b020df9f
commit e1b020df9f
parent cce0b52e7b
8 changed files with 136 additions and 16 deletions
--- a/compiler/rustc_codegen_llvm/src/type_.rs
+++ b/compiler/rustc_codegen_llvm/src/type_.rs
@ -288,6 +288,9 @@ impl<'ll, 'tcx> LayoutTypeMethods<'tcx> for CodegenCx<'ll, 'tcx> {
    fn reg_backend_type(&self, ty: &Reg) -> &'ll Type {
        ty.llvm_type(self)
    }
+    fn scalar_copy_backend_type(&self, layout: TyAndLayout<'tcx>) -> Option<Self::Type> {
+        layout.scalar_copy_llvm_type(self)
+    }
 }

 impl<'ll, 'tcx> TypeMembershipMethods<'tcx> for CodegenCx<'ll, 'tcx> {
--- a/compiler/rustc_codegen_llvm/src/type_of.rs
+++ b/compiler/rustc_codegen_llvm/src/type_of.rs
@ -6,6 +6,7 @@ use rustc_middle::bug;
 use rustc_middle::ty::layout::{FnAbiOf, LayoutOf, TyAndLayout};
 use rustc_middle::ty::print::{with_no_trimmed_paths, with_no_visible_paths};
 use rustc_middle::ty::{self, Ty, TypeVisitableExt};
+use rustc_target::abi::HasDataLayout;
 use rustc_target::abi::{Abi, Align, FieldsShape};
 use rustc_target::abi::{Int, Pointer, F32, F64};
 use rustc_target::abi::{PointeeInfo, Scalar, Size, TyAbiInterface, Variants};
@ -192,6 +193,7 @@ pub trait LayoutLlvmExt<'tcx> {
    ) -> &'a Type;
    fn llvm_field_index<'a>(&self, cx: &CodegenCx<'a, 'tcx>, index: usize) -> u64;
    fn pointee_info_at<'a>(&self, cx: &CodegenCx<'a, 'tcx>, offset: Size) -> Option<PointeeInfo>;
+    fn scalar_copy_llvm_type<'a>(&self, cx: &CodegenCx<'a, 'tcx>) -> Option<&'a Type>;
 }

 impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {
@ -414,4 +416,35 @@ impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {
        cx.pointee_infos.borrow_mut().insert((self.ty, offset), result);
        result
    }
+
+    fn scalar_copy_llvm_type<'a>(&self, cx: &CodegenCx<'a, 'tcx>) -> Option<&'a Type> {
+        debug_assert!(self.is_sized());
+
+        // FIXME: this is a fairly arbitrary choice, but 128 bits on WASM
+        // (matching the 128-bit SIMD types proposal) and 256 bits on x64
+        // (like AVX2 registers) seems at least like a tolerable starting point.
+        let threshold = cx.data_layout().pointer_size * 4;
+        if self.layout.size() > threshold {
+            return None;
+        }
+
+        // Vectors, even for non-power-of-two sizes, have the same layout as
+        // arrays but don't count as aggregate types
+        if let FieldsShape::Array { count, .. } = self.layout.fields()
+            && let element = self.field(cx, 0)
+            && element.ty.is_integral()
+        {
+            // `cx.type_ix(bits)` is tempting here, but while that works great
+            // for things that *stay* as memory-to-memory copies, it also ends
+            // up suppressing vectorization as it introduces shifts when it
+            // extracts all the individual values.
+
+            let ety = element.llvm_type(cx);
+            return Some(cx.type_vector(ety, *count));
+        }
+
+        // FIXME: The above only handled integer arrays; surely more things
+        // would also be possible. Be careful about provenance, though!
+        None
+    }
 }
--- a/compiler/rustc_codegen_ssa/src/base.rs
+++ b/compiler/rustc_codegen_ssa/src/base.rs
@ -380,7 +380,19 @@ pub fn memcpy_ty<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>(
        return;
    }

-    bx.memcpy(dst, dst_align, src, src_align, bx.cx().const_usize(size), flags);
+    if flags == MemFlags::empty()
+        && let Some(bty) = bx.cx().scalar_copy_backend_type(layout)
+    {
+        // I look forward to only supporting opaque pointers
+        let pty = bx.type_ptr_to(bty);
+        let src = bx.pointercast(src, pty);
+        let dst = bx.pointercast(dst, pty);
+
+        let temp = bx.load(bty, src, src_align);
+        bx.store(temp, dst, dst_align);
+    } else {
+        bx.memcpy(dst, dst_align, src, src_align, bx.cx().const_usize(size), flags);
+    }
 }

 pub fn codegen_instance<'a, 'tcx: 'a, Bx: BuilderMethods<'a, 'tcx>>(
--- a/compiler/rustc_codegen_ssa/src/traits/type_.rs
+++ b/compiler/rustc_codegen_ssa/src/traits/type_.rs
@ -126,6 +126,28 @@ pub trait LayoutTypeMethods<'tcx>: Backend<'tcx> {
        index: usize,
        immediate: bool,
    ) -> Self::Type;
+
+    /// A type that can be used in a [`super::BuilderMethods::load`] +
+    /// [`super::BuilderMethods::store`] pair to implement a *typed* copy,
+    /// such as a MIR `*_0 = *_1`.
+    ///
+    /// It's always legal to return `None` here, as the provided impl does,
+    /// in which case callers should use [`super::BuilderMethods::memcpy`]
+    /// instead of the `load`+`store` pair.
+    ///
+    /// This can be helpful for things like arrays, where the LLVM backend type
+    /// `[3 x i16]` optimizes to three separate loads and stores, but it can
+    /// instead be copied via an `i48` that stays as the single `load`+`store`.
+    /// (As of 2023-05 LLVM cannot necessarily optimize away a `memcpy` in these
+    /// cases, due to `poison` handling, but in codegen we have more information
+    /// about the type invariants, so can emit something better instead.)
+    ///
+    /// This *should* return `None` for particularly-large types, where leaving
+    /// the `memcpy` may well be important to avoid code size explosion.
+    fn scalar_copy_backend_type(&self, layout: TyAndLayout<'tcx>) -> Option<Self::Type> {
+        let _ = layout;
+        None
+    }
 }

 // For backends that support CFI using type membership (i.e., testing whether a given pointer is
--- a/tests/codegen/array-codegen.rs
+++ b/tests/codegen/array-codegen.rs
@ -0,0 +1,35 @@
+// compile-flags: -O -C no-prepopulate-passes
+// min-llvm-version: 15.0 (for opaque pointers)
+
+#![crate_type = "lib"]
+
+// CHECK-LABEL: @array_load
+#[no_mangle]
+pub fn array_load(a: &[u8; 4]) -> [u8; 4] {
+    // CHECK: %0 = alloca [4 x i8], align 1
+    // CHECK: %[[TEMP1:.+]] = load <4 x i8>, ptr %a, align 1
+    // CHECK: store <4 x i8> %[[TEMP1]], ptr %0, align 1
+    // CHECK: %[[TEMP2:.+]] = load i32, ptr %0, align 1
+    // CHECK: ret i32 %[[TEMP2]]
+    *a
+}
+
+// CHECK-LABEL: @array_store
+#[no_mangle]
+pub fn array_store(a: [u8; 4], p: &mut [u8; 4]) {
+    // CHECK: %a = alloca [4 x i8]
+    // CHECK: %[[TEMP:.+]] = load <4 x i8>, ptr %a, align 1
+    // CHECK-NEXT: store <4 x i8> %[[TEMP]], ptr %p, align 1
+    *p = a;
+}
+
+// CHECK-LABEL: @array_copy
+#[no_mangle]
+pub fn array_copy(a: &[u8; 4], p: &mut [u8; 4]) {
+    // CHECK: %[[LOCAL:.+]] = alloca [4 x i8], align 1
+    // CHECK: %[[TEMP1:.+]] = load <4 x i8>, ptr %a, align 1
+    // CHECK: store <4 x i8> %[[TEMP1]], ptr %[[LOCAL]], align 1
+    // CHECK: %[[TEMP2:.+]] = load <4 x i8>, ptr %[[LOCAL]], align 1
+    // CHECK: store <4 x i8> %[[TEMP2]], ptr %p, align 1
+    *p = *a;
+}
--- a/tests/codegen/mem-replace-simple-type.rs
+++ b/tests/codegen/mem-replace-simple-type.rs
@ -32,3 +32,14 @@ pub fn replace_ref_str<'a>(r: &mut &'a str, v: &'a str) -> &'a str {
    // CHECK: ret { ptr, i64 } %[[P2]]
    std::mem::replace(r, v)
 }
+
+#[no_mangle]
+// CHECK-LABEL: @replace_short_array(
+pub fn replace_short_array(r: &mut [u32; 3], v: [u32; 3]) -> [u32; 3] {
+    // CHECK-NOT: alloca
+    // CHECK: %[[R:.+]] = load <3 x i32>, ptr %r, align 4
+    // CHECK: store <3 x i32> %[[R]], ptr %0
+    // CHECK: %[[V:.+]] = load <3 x i32>, ptr %v, align 4
+    // CHECK: store <3 x i32> %[[V]], ptr %r
+    std::mem::replace(r, v)
+}
--- a/tests/codegen/swap-simd-types.rs
+++ b/tests/codegen/swap-simd-types.rs
@ -30,3 +30,12 @@ pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
        x.swap_with_slice(y);
    }
 }
+
+// CHECK-LABEL: @swap_bytes32
+#[no_mangle]
+pub fn swap_bytes32(x: &mut [u8; 32], y: &mut [u8; 32]) {
+// CHECK-NOT: alloca
+// CHECK: load <32 x i8>{{.+}}align 1
+// CHECK: store <32 x i8>{{.+}}align 1
+    swap(x, y)
+}
--- a/tests/codegen/swap-small-types.rs
+++ b/tests/codegen/swap-small-types.rs
@ -1,4 +1,4 @@
-// compile-flags: -O
+// compile-flags: -O -Z merge-functions=disabled
 // only-x86_64
 // ignore-debug: the debug assertions get in the way

@ -12,13 +12,10 @@ type RGB48 = [u16; 3];
 #[no_mangle]
 pub fn swap_rgb48_manually(x: &mut RGB48, y: &mut RGB48) {
    // CHECK-NOT: alloca
-    // CHECK: %temp = alloca [3 x i16]
-    // CHECK-NOT: alloca
-    // CHECK-NOT: call void @llvm.memcpy
-    // CHECK: call void @llvm.memcpy.{{.+}}({{.+}} %temp, {{.+}} %x, {{.+}} 6, {{.+}})
-    // CHECK: call void @llvm.memcpy.{{.+}}({{.+}} %x, {{.+}} %y, {{.+}} 6, {{.+}})
-    // CHECK: call void @llvm.memcpy.{{.+}}({{.+}} %y, {{.+}} %temp, {{.+}} 6, {{.+}})
-    // CHECK-NOT: call void @llvm.memcpy
+    // CHECK: %[[TEMP0:.+]] = load <3 x i16>, ptr %x, align 2
+    // CHECK: %[[TEMP1:.+]] = load <3 x i16>, ptr %y, align 2
+    // CHECK: store <3 x i16> %[[TEMP1]], ptr %x, align 2
+    // CHECK: store <3 x i16> %[[TEMP0]], ptr %y, align 2

    let temp = *x;
    *x = *y;
@ -28,13 +25,11 @@ pub fn swap_rgb48_manually(x: &mut RGB48, y: &mut RGB48) {
 // CHECK-LABEL: @swap_rgb48
 #[no_mangle]
 pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) {
-    // FIXME MIR inlining messes up LLVM optimizations.
-    // If these checks start failing, please update this test.
-    // CHECK: alloca [3 x i16]
-    // CHECK: call void @llvm.memcpy
-// WOULD-CHECK-NOT: alloca
-// WOULD-CHECK: load i48
-// WOULD-CHECK: store i48
+    // CHECK-NOT: alloca
+    // CHECK: load <3 x i16>
+    // CHECK: load <3 x i16>
+    // CHECK: store <3 x i16>
+    // CHECK: store <3 x i16>
    swap(x, y)
 }