From b5376ba6017fa04a13afda6ac52587f06a6c0bd8 Mon Sep 17 00:00:00 2001
From: Scott McMurray
Date: Fri, 29 Mar 2024 00:00:24 -0700
Subject: [PATCH] Remove my `scalar_copy_backend_type` optimization attempt

I added this back in 111999, but I no longer think it's a good idea
- It had to get scaled back to only power-of-two things to not break a bunch of targets
- LLVM seems to be getting better at memcpy removal anyway
- Introducing vector instructions has seemed to sometimes (115515) make autovectorization worse

So this removes it from the codegen crates entirely, and instead just tries to
use `typed_place_copy` instead of direct `memcpy` so things will still use
load/store for immediates.
---
 compiler/rustc_codegen_llvm/src/type_.rs      |  3 --
 compiler/rustc_codegen_llvm/src/type_of.rs    | 42 ------------------
 compiler/rustc_codegen_ssa/src/base.rs        | 38 ++--------------
 compiler/rustc_codegen_ssa/src/mir/block.rs   | 13 ++----
 compiler/rustc_codegen_ssa/src/mir/operand.rs | 16 +++----
 .../rustc_codegen_ssa/src/traits/builder.rs   | 22 ++++++++--
 .../rustc_codegen_ssa/src/traits/type_.rs     | 22 ----------
 tests/codegen/array-codegen.rs                | 44 +++++++++++--------
 tests/codegen/array-optimized.rs              |  8 ++--
 tests/codegen/mem-replace-simple-type.rs      |  6 +--
 10 files changed, 60 insertions(+), 154 deletions(-)

diff --git a/compiler/rustc_codegen_llvm/src/type_.rs b/compiler/rustc_codegen_llvm/src/type_.rs
index af1bbda4d08..a00f09dc40d 100644
--- a/compiler/rustc_codegen_llvm/src/type_.rs
+++ b/compiler/rustc_codegen_llvm/src/type_.rs
@@ -281,9 +281,6 @@ fn fn_ptr_backend_type(&self, fn_abi: &FnAbi<'tcx, Ty<'tcx>>) -> &'ll Type {
     fn reg_backend_type(&self, ty: &Reg) -> &'ll Type {
         ty.llvm_type(self)
     }
-    fn scalar_copy_backend_type(&self, layout: TyAndLayout<'tcx>) -> Option<&'ll Type> {
-        layout.scalar_copy_llvm_type(self)
-    }
 }
 
 impl<'ll, 'tcx> TypeMembershipMethods<'tcx> for CodegenCx<'ll, 'tcx> {
diff --git a/compiler/rustc_codegen_llvm/src/type_of.rs b/compiler/rustc_codegen_llvm/src/type_of.rs
index d10a083765b..40ed6baa610 100644
--- a/compiler/rustc_codegen_llvm/src/type_of.rs
+++ b/compiler/rustc_codegen_llvm/src/type_of.rs
@@ -5,7 +5,6 @@
 use rustc_middle::ty::layout::{LayoutOf, TyAndLayout};
 use rustc_middle::ty::print::{with_no_trimmed_paths, with_no_visible_paths};
 use rustc_middle::ty::{self, Ty, TypeVisitableExt};
-use rustc_target::abi::HasDataLayout;
 use rustc_target::abi::{Abi, Align, FieldsShape};
 use rustc_target::abi::{Int, Pointer, F128, F16, F32, F64};
 use rustc_target::abi::{Scalar, Size, Variants};
@@ -166,7 +165,6 @@ fn scalar_pair_element_llvm_type<'a>(
         index: usize,
         immediate: bool,
     ) -> &'a Type;
-    fn scalar_copy_llvm_type<'a>(&self, cx: &CodegenCx<'a, 'tcx>) -> Option<&'a Type>;
 }
 
 impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {
@@ -308,44 +306,4 @@ fn scalar_pair_element_llvm_type<'a>(
 
         self.scalar_llvm_type_at(cx, scalar)
     }
-
-    fn scalar_copy_llvm_type<'a>(&self, cx: &CodegenCx<'a, 'tcx>) -> Option<&'a Type> {
-        debug_assert!(self.is_sized());
-
-        // FIXME: this is a fairly arbitrary choice, but 128 bits on WASM
-        // (matching the 128-bit SIMD types proposal) and 256 bits on x64
-        // (like AVX2 registers) seems at least like a tolerable starting point.
- let threshold = cx.data_layout().pointer_size * 4; - if self.layout.size() > threshold { - return None; - } - - // Vectors, even for non-power-of-two sizes, have the same layout as - // arrays but don't count as aggregate types - // While LLVM theoretically supports non-power-of-two sizes, and they - // often work fine, sometimes x86-isel deals with them horribly - // (see #115212) so for now only use power-of-two ones. - if let FieldsShape::Array { count, .. } = self.layout.fields() - && count.is_power_of_two() - && let element = self.field(cx, 0) - && element.ty.is_integral() - { - // `cx.type_ix(bits)` is tempting here, but while that works great - // for things that *stay* as memory-to-memory copies, it also ends - // up suppressing vectorization as it introduces shifts when it - // extracts all the individual values. - - let ety = element.llvm_type(cx); - if *count == 1 { - // Emitting `<1 x T>` would be silly; just use the scalar. - return Some(ety); - } else { - return Some(cx.type_vector(ety, *count)); - } - } - - // FIXME: The above only handled integer arrays; surely more things - // would also be possible. Be careful about provenance, though! - None - } } diff --git a/compiler/rustc_codegen_ssa/src/base.rs b/compiler/rustc_codegen_ssa/src/base.rs index c4c16ee7311..930b9b8c0db 100644 --- a/compiler/rustc_codegen_ssa/src/base.rs +++ b/compiler/rustc_codegen_ssa/src/base.rs @@ -12,7 +12,7 @@ use crate::mir::operand::OperandValue; use crate::mir::place::PlaceRef; use crate::traits::*; -use crate::{CachedModuleCodegen, CompiledModule, CrateInfo, MemFlags, ModuleCodegen, ModuleKind}; +use crate::{CachedModuleCodegen, CompiledModule, CrateInfo, ModuleCodegen, ModuleKind}; use rustc_ast::expand::allocator::{global_fn_name, AllocatorKind, ALLOCATOR_METHODS}; use rustc_attr as attr; @@ -37,7 +37,7 @@ use rustc_session::Session; use rustc_span::symbol::sym; use rustc_span::Symbol; -use rustc_target::abi::{Align, FIRST_VARIANT}; +use rustc_target::abi::FIRST_VARIANT; use std::cmp; use std::collections::BTreeSet; @@ -282,15 +282,7 @@ pub fn coerce_unsized_into<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( } if src_f.layout.ty == dst_f.layout.ty { - memcpy_ty( - bx, - dst_f.llval, - dst_f.align, - src_f.llval, - src_f.align, - src_f.layout, - MemFlags::empty(), - ); + bx.typed_place_copy(dst_f, src_f); } else { coerce_unsized_into(bx, src_f, dst_f); } @@ -382,30 +374,6 @@ pub fn wants_new_eh_instructions(sess: &Session) -> bool { wants_wasm_eh(sess) || wants_msvc_seh(sess) } -pub fn memcpy_ty<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( - bx: &mut Bx, - dst: Bx::Value, - dst_align: Align, - src: Bx::Value, - src_align: Align, - layout: TyAndLayout<'tcx>, - flags: MemFlags, -) { - let size = layout.size.bytes(); - if size == 0 { - return; - } - - if flags == MemFlags::empty() - && let Some(bty) = bx.cx().scalar_copy_backend_type(layout) - { - let temp = bx.load(bty, src, src_align); - bx.store(temp, dst, dst_align); - } else { - bx.memcpy(dst, dst_align, src, src_align, bx.cx().const_usize(size), flags); - } -} - pub fn codegen_instance<'a, 'tcx: 'a, Bx: BuilderMethods<'a, 'tcx>>( cx: &'a Bx::CodegenCx, instance: Instance<'tcx>, diff --git a/compiler/rustc_codegen_ssa/src/mir/block.rs b/compiler/rustc_codegen_ssa/src/mir/block.rs index 1aa52a985ef..c3137f0628e 100644 --- a/compiler/rustc_codegen_ssa/src/mir/block.rs +++ b/compiler/rustc_codegen_ssa/src/mir/block.rs @@ -1459,7 +1459,7 @@ fn codegen_argument( } _ => (op.immediate_or_packed_pair(bx), arg.layout.align.abi, false), }, - Ref(llval, _, 
align) => match arg.mode {
+            Ref(llval, llextra, align) => match arg.mode {
                 PassMode::Indirect { attrs, .. } => {
                     let required_align = match attrs.pointee_align {
                         Some(pointee_align) => cmp::max(pointee_align, arg.layout.align.abi),
@@ -1470,15 +1470,8 @@ fn codegen_argument(
                     // alignment requirements may be higher than the type's alignment, so copy
                     // to a higher-aligned alloca.
                     let scratch = PlaceRef::alloca_aligned(bx, arg.layout, required_align);
-                    base::memcpy_ty(
-                        bx,
-                        scratch.llval,
-                        scratch.align,
-                        llval,
-                        align,
-                        op.layout,
-                        MemFlags::empty(),
-                    );
+                    let op_place = PlaceRef { llval, llextra, layout: op.layout, align };
+                    bx.typed_place_copy(scratch, op_place);
                     (scratch.llval, scratch.align, true)
                 } else {
                     (llval, align, true)
diff --git a/compiler/rustc_codegen_ssa/src/mir/operand.rs b/compiler/rustc_codegen_ssa/src/mir/operand.rs
index 932926976b5..ac38b91c5e0 100644
--- a/compiler/rustc_codegen_ssa/src/mir/operand.rs
+++ b/compiler/rustc_codegen_ssa/src/mir/operand.rs
@@ -1,7 +1,6 @@
 use super::place::PlaceRef;
 use super::{FunctionCx, LocalRef};
 
-use crate::base;
 use crate::size_of_val;
 use crate::traits::*;
 use crate::MemFlags;
@@ -398,7 +397,7 @@ pub fn nontemporal_store<Bx: BuilderMethods<'a, 'tcx, Value = V>>(
         self.store_with_flags(bx, dest, MemFlags::NONTEMPORAL);
     }
 
-    fn store_with_flags<Bx: BuilderMethods<'a, 'tcx, Value = V>>(
+    pub(crate) fn store_with_flags<Bx: BuilderMethods<'a, 'tcx, Value = V>>(
         self,
         bx: &mut Bx,
         dest: PlaceRef<'tcx, V>,
@@ -410,16 +409,11 @@ fn store_with_flags<Bx: BuilderMethods<'a, 'tcx, Value = V>>(
                 // Avoid generating stores of zero-sized values, because the only way to have a zero-sized
                 // value is through `undef`/`poison`, and the store itself is useless.
             }
-            OperandValue::Ref(r, None, source_align) => {
+            OperandValue::Ref(llval, llextra @ None, source_align) => {
                 assert!(dest.layout.is_sized(), "cannot directly store unsized values");
-                if flags.contains(MemFlags::NONTEMPORAL) {
-                    // HACK(nox): This is inefficient but there is no nontemporal memcpy.
-                    let ty = bx.backend_type(dest.layout);
-                    let val = bx.load(ty, r, source_align);
-                    bx.store_with_flags(val, dest.llval, dest.align, flags);
-                    return;
-                }
-                base::memcpy_ty(bx, dest.llval, dest.align, r, source_align, dest.layout, flags)
+                let source_place =
+                    PlaceRef { llval, llextra, align: source_align, layout: dest.layout };
+                bx.typed_place_copy_with_flags(dest, source_place, flags);
             }
             OperandValue::Ref(_, Some(_), _) => {
                 bug!("cannot directly store unsized values");
diff --git a/compiler/rustc_codegen_ssa/src/traits/builder.rs b/compiler/rustc_codegen_ssa/src/traits/builder.rs
index 6c8dcc5b690..c9d40b7c724 100644
--- a/compiler/rustc_codegen_ssa/src/traits/builder.rs
+++ b/compiler/rustc_codegen_ssa/src/traits/builder.rs
@@ -281,17 +281,31 @@ fn typed_place_copy(
         dst: PlaceRef<'tcx, Self::Value>,
         src: PlaceRef<'tcx, Self::Value>,
     ) {
-        debug_assert!(src.llextra.is_none());
-        debug_assert!(dst.llextra.is_none());
+        self.typed_place_copy_with_flags(dst, src, MemFlags::empty());
+    }
+
+    fn typed_place_copy_with_flags(
+        &mut self,
+        dst: PlaceRef<'tcx, Self::Value>,
+        src: PlaceRef<'tcx, Self::Value>,
+        flags: MemFlags,
+    ) {
+        debug_assert!(src.llextra.is_none(), "cannot directly copy from unsized values");
+        debug_assert!(dst.llextra.is_none(), "cannot directly copy into unsized values");
         debug_assert_eq!(dst.layout.size, src.layout.size);
         if self.sess().opts.optimize == OptLevel::No && self.is_backend_immediate(dst.layout) {
             // If we're not optimizing, the aliasing information from `memcpy`
             // isn't useful, so just load-store the value for smaller code.
let temp = self.load_operand(src); - temp.val.store(self, dst); + temp.val.store_with_flags(self, dst, flags); + } else if flags.contains(MemFlags::NONTEMPORAL) { + // HACK(nox): This is inefficient but there is no nontemporal memcpy. + let ty = self.backend_type(dst.layout); + let val = self.load(ty, src.llval, src.align); + self.store_with_flags(val, dst.llval, dst.align, flags); } else if !dst.layout.is_zst() { let bytes = self.const_usize(dst.layout.size.bytes()); - self.memcpy(dst.llval, dst.align, src.llval, src.align, bytes, MemFlags::empty()); + self.memcpy(dst.llval, dst.align, src.llval, src.align, bytes, flags); } } diff --git a/compiler/rustc_codegen_ssa/src/traits/type_.rs b/compiler/rustc_codegen_ssa/src/traits/type_.rs index 555833759eb..34d9e75036f 100644 --- a/compiler/rustc_codegen_ssa/src/traits/type_.rs +++ b/compiler/rustc_codegen_ssa/src/traits/type_.rs @@ -133,28 +133,6 @@ fn is_backend_ref(&self, layout: TyAndLayout<'tcx>) -> bool { || self.is_backend_immediate(layout) || self.is_backend_scalar_pair(layout)) } - - /// A type that can be used in a [`super::BuilderMethods::load`] + - /// [`super::BuilderMethods::store`] pair to implement a *typed* copy, - /// such as a MIR `*_0 = *_1`. - /// - /// It's always legal to return `None` here, as the provided impl does, - /// in which case callers should use [`super::BuilderMethods::memcpy`] - /// instead of the `load`+`store` pair. - /// - /// This can be helpful for things like arrays, where the LLVM backend type - /// `[3 x i16]` optimizes to three separate loads and stores, but it can - /// instead be copied via an `i48` that stays as the single `load`+`store`. - /// (As of 2023-05 LLVM cannot necessarily optimize away a `memcpy` in these - /// cases, due to `poison` handling, but in codegen we have more information - /// about the type invariants, so can emit something better instead.) - /// - /// This *should* return `None` for particularly-large types, where leaving - /// the `memcpy` may well be important to avoid code size explosion. 
-    fn scalar_copy_backend_type(&self, layout: TyAndLayout<'tcx>) -> Option<Self::Type> {
-        let _ = layout;
-        None
-    }
 }
 
 // For backends that support CFI using type membership (i.e., testing whether a given pointer is
diff --git a/tests/codegen/array-codegen.rs b/tests/codegen/array-codegen.rs
index bb4bd5444db..1310e61c41d 100644
--- a/tests/codegen/array-codegen.rs
+++ b/tests/codegen/array-codegen.rs
@@ -5,52 +5,58 @@
 // CHECK-LABEL: @array_load
 #[no_mangle]
 pub fn array_load(a: &[u8; 4]) -> [u8; 4] {
-    // CHECK: %_0 = alloca [4 x i8], align 1
-    // CHECK: %[[TEMP1:.+]] = load <4 x i8>, ptr %a, align 1
-    // CHECK: store <4 x i8> %[[TEMP1]], ptr %_0, align 1
-    // CHECK: %[[TEMP2:.+]] = load i32, ptr %_0, align 1
-    // CHECK: ret i32 %[[TEMP2]]
+    // CHECK-NOT: alloca
+    // CHECK: %[[ALLOCA:.+]] = alloca [4 x i8], align 1
+    // CHECK-NOT: alloca
+    // CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[ALLOCA]], ptr align 1 %a, {{.+}} 4, i1 false)
+    // CHECK: %[[TEMP:.+]] = load i32, ptr %[[ALLOCA]], align 1
+    // CHECK: ret i32 %[[TEMP]]
     *a
 }
 
 // CHECK-LABEL: @array_store
 #[no_mangle]
 pub fn array_store(a: [u8; 4], p: &mut [u8; 4]) {
+    // CHECK-NOT: alloca
+    // CHECK: %[[TEMP:.+]] = alloca i32, [[TEMPALIGN:align [0-9]+]]
+    // CHECK-NOT: alloca
     // CHECK: %a = alloca [4 x i8]
-    // CHECK: %[[TEMP:.+]] = load <4 x i8>, ptr %a, align 1
-    // CHECK-NEXT: store <4 x i8> %[[TEMP]], ptr %p, align 1
+    // CHECK-NOT: alloca
+    // store i32 %0, ptr %[[TEMP]]
+    // CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %a, ptr [[TEMPALIGN]] %[[TEMP]], {{.+}} 4, i1 false)
+    // CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %a, {{.+}} 4, i1 false)
     *p = a;
 }
 
 // CHECK-LABEL: @array_copy
 #[no_mangle]
 pub fn array_copy(a: &[u8; 4], p: &mut [u8; 4]) {
+    // CHECK-NOT: alloca
     // CHECK: %[[LOCAL:.+]] = alloca [4 x i8], align 1
-    // CHECK: %[[TEMP1:.+]] = load <4 x i8>, ptr %a, align 1
-    // CHECK: store <4 x i8> %[[TEMP1]], ptr %[[LOCAL]], align 1
-    // CHECK: %[[TEMP2:.+]] = load <4 x i8>, ptr %[[LOCAL]], align 1
-    // CHECK: store <4 x i8> %[[TEMP2]], ptr %p, align 1
+    // CHECK-NOT: alloca
+    // CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[LOCAL]], ptr align 1 %a, {{.+}} 4, i1 false)
+    // CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %[[LOCAL]], {{.+}} 4, i1 false)
     *p = *a;
 }
 
 // CHECK-LABEL: @array_copy_1_element
 #[no_mangle]
 pub fn array_copy_1_element(a: &[u8; 1], p: &mut [u8; 1]) {
+    // CHECK-NOT: alloca
     // CHECK: %[[LOCAL:.+]] = alloca [1 x i8], align 1
-    // CHECK: %[[TEMP1:.+]] = load i8, ptr %a, align 1
-    // CHECK: store i8 %[[TEMP1]], ptr %[[LOCAL]], align 1
-    // CHECK: %[[TEMP2:.+]] = load i8, ptr %[[LOCAL]], align 1
-    // CHECK: store i8 %[[TEMP2]], ptr %p, align 1
+    // CHECK-NOT: alloca
+    // CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[LOCAL]], ptr align 1 %a, {{.+}} 1, i1 false)
+    // CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %[[LOCAL]], {{.+}} 1, i1 false)
    *p = *a;
 }
 
 // CHECK-LABEL: @array_copy_2_elements
 #[no_mangle]
 pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) {
+    // CHECK-NOT: alloca
     // CHECK: %[[LOCAL:.+]] = alloca [2 x i8], align 1
-    // CHECK: %[[TEMP1:.+]] = load <2 x i8>, ptr %a, align 1
-    // CHECK: store <2 x i8> %[[TEMP1]], ptr %[[LOCAL]], align 1
-    // CHECK: %[[TEMP2:.+]] = load <2 x i8>, ptr %[[LOCAL]], align 1
-    // CHECK: store <2 x i8> %[[TEMP2]], ptr %p, align 1
+    // CHECK-NOT: alloca
+    // CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[LOCAL]], ptr align 1 %a, {{.+}} 2, i1 false)
+    // CHECK: call void @llvm.memcpy.{{.+}}(ptr 
align 1 %p, ptr align 1 %[[LOCAL]], {{.+}} 2, i1 false) *p = *a; } diff --git a/tests/codegen/array-optimized.rs b/tests/codegen/array-optimized.rs index 4cf16f1fb30..42fdbd39b7e 100644 --- a/tests/codegen/array-optimized.rs +++ b/tests/codegen/array-optimized.rs @@ -16,8 +16,8 @@ #[no_mangle] pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) { // CHECK-NOT: alloca - // CHECK: %[[TEMP:.+]] = load <2 x i8>, ptr %a, align 1 - // CHECK: store <2 x i8> %[[TEMP]], ptr %p, align 1 + // CHECK: %[[TEMP:.+]] = load i16, ptr %a, align 1 + // CHECK: store i16 %[[TEMP]], ptr %p, align 1 // CHECK: ret *p = *a; } @@ -26,8 +26,8 @@ #[no_mangle] pub fn array_copy_4_elements(a: &[u8; 4], p: &mut [u8; 4]) { // CHECK-NOT: alloca - // CHECK: %[[TEMP:.+]] = load <4 x i8>, ptr %a, align 1 - // CHECK: store <4 x i8> %[[TEMP]], ptr %p, align 1 + // CHECK: %[[TEMP:.+]] = load i32, ptr %a, align 1 + // CHECK: store i32 %[[TEMP]], ptr %p, align 1 // CHECK: ret *p = *a; } diff --git a/tests/codegen/mem-replace-simple-type.rs b/tests/codegen/mem-replace-simple-type.rs index b00fbad05d9..50b43f5854a 100644 --- a/tests/codegen/mem-replace-simple-type.rs +++ b/tests/codegen/mem-replace-simple-type.rs @@ -45,9 +45,7 @@ pub fn replace_ref_str<'a>(r: &mut &'a str, v: &'a str) -> &'a str { // CHECK-LABEL: @replace_short_array_4( pub fn replace_short_array_4(r: &mut [u32; 4], v: [u32; 4]) -> [u32; 4] { // CHECK-NOT: alloca - // CHECK: %[[R:.+]] = load <4 x i32>, ptr %r, align 4 - // CHECK: store <4 x i32> %[[R]], ptr %result - // CHECK: %[[V:.+]] = load <4 x i32>, ptr %v, align 4 - // CHECK: store <4 x i32> %[[V]], ptr %r + // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %result, ptr align 4 %r, i64 16, i1 false) + // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %r, ptr align 4 %v, i64 16, i1 false) std::mem::replace(r, v) }
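As a minimal standalone sketch of the source pattern this patch affects (hypothetical
function name; it simply mirrors the updated tests): with `scalar_copy_backend_type`
gone, codegen emits the copy below as `llvm.memcpy`, which unoptimized builds keep
(per tests/codegen/array-codegen.rs) and optimized builds are expected to lower to a
plain `i32` load/store rather than a `<4 x i8>` vector pair (per
tests/codegen/array-optimized.rs).

// Hypothetical illustration only; mirrors tests/codegen/array-optimized.rs.
#[no_mangle]
pub fn copy_four_bytes(a: &[u8; 4], p: &mut [u8; 4]) {
    // This array copy is now emitted as `llvm.memcpy`; with optimizations,
    // LLVM is expected to reduce it to an `i32` load/store pair.
    *p = *a;
}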