rust/tests/codegen/swap-small-types.rs

// compile-flags: -O -Z merge-functions=disabled
// only-x86_64
// ignore-debug: the debug assertions get in the way

#![crate_type = "lib"]

use std::mem::swap;

// Three `u16` lanes, i.e. 6 bytes in total: a size that is not a power of two.
// Shared by the two `swap_rgb48*` tests below.
type RGB48 = [u16; 3];

// Swaps two `RGB48` values by hand, through a local temporary, instead of
// calling `mem::swap`. The FileCheck directives inside the body record the IR
// that is currently expected for this pattern: one `[3 x i16]` stack slot
// followed by three 6-byte `memcpy` calls.
// CHECK-LABEL: @swap_rgb48_manually(
// `#[no_mangle]` keeps the symbol name intact so the label directive above
// can find the function in the emitted IR.
#[no_mangle]
pub fn swap_rgb48_manually(x: &mut RGB48, y: &mut RGB48) {
    // FIXME: See #115212 for why this has an alloca again

    // CHECK: alloca [3 x i16], align 2
    // CHECK: call void @llvm.memcpy.p0.p0.i64({{.+}}, i64 6, i1 false)
    // CHECK: call void @llvm.memcpy.p0.p0.i64({{.+}}, i64 6, i1 false)
    // CHECK: call void @llvm.memcpy.p0.p0.i64({{.+}}, i64 6, i1 false)

    // Classic three-step swap; `RGB48` is `Copy`, so `*x` is copied out
    // into `temp` rather than moved.
    let temp = *x;
    *x = *y;
    *y = temp;
}

// Swaps two `RGB48` values with `std::mem::swap`. The directives in the body
// expect no `alloca` ahead of the first matched load, and expect the swap to
// show up as byte-wide (`i8`) loads and stores.
// CHECK-LABEL: @swap_rgb48
#[no_mangle]
pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) {
    // CHECK-NOT: alloca

    // Whether `i8` is the best for this is unclear, but
    // might as well record what's actually happening right now.

    // CHECK: load i8
    // CHECK: load i8
    // CHECK: store i8
    // CHECK: store i8
    swap(x, y)
}

// Four `u16` lanes, i.e. 8 bytes in total: the same width as one `i64`.
type RGBA64 = [u16; 4];

// Swaps two `RGBA64` values with `std::mem::swap`. Each side is expected to be
// read and written as a single `i64` (with the array's 2-byte alignment), and
// the value loaded from one pointer must be the value stored to the other.
// The four DAG directives may match in any relative order.
// CHECK-LABEL: @swap_rgba64
#[no_mangle]
pub fn swap_rgba64(x: &mut RGBA64, y: &mut RGBA64) {
    // CHECK-NOT: alloca
    // CHECK-DAG: %[[XVAL:.+]] = load i64, ptr %x, align 2
    // CHECK-DAG: %[[YVAL:.+]] = load i64, ptr %y, align 2
    // CHECK-DAG: store i64 %[[YVAL]], ptr %x, align 2
    // CHECK-DAG: store i64 %[[XVAL]], ptr %y, align 2
    swap(x, y)
}

// Swaps two `Vec<u32>` handles (not their heap contents) with
// `std::mem::swap`. The directives only require that no `alloca` precedes the
// first matched load and that at least two `i64` loads and two `i64` stores
// appear, in that order, before the function returns.
// CHECK-LABEL: @swap_vecs
#[no_mangle]
pub fn swap_vecs(x: &mut Vec<u32>, y: &mut Vec<u32>) {
    // CHECK-NOT: alloca
    // There are plenty more loads and stores than just these,
    // but at least one sure better be 64-bit (for size or capacity).
    // CHECK: load i64
    // CHECK: load i64
    // CHECK: store i64
    // CHECK: store i64
    // CHECK: ret void
    swap(x, y)
}

// Swaps two slice *references* (`&[u32]`), not the elements they point at.
// The directives expect each reference to be handled as a `ptr` plus an `i64`:
// both pairs are loaded first, then both pairs are stored, with no `alloca`
// ahead of the first load.
// CHECK-LABEL: @swap_slices
#[no_mangle]
pub fn swap_slices<'a>(x: &mut &'a [u32], y: &mut &'a [u32]) {
    // CHECK-NOT: alloca
    // CHECK: load ptr
    // CHECK: load i64
    // CHECK: load ptr
    // CHECK: load i64
    // CHECK: store ptr
    // CHECK: store i64
    // CHECK: store ptr
    // CHECK: store i64
    swap(x, y)
}

// LLVM doesn't vectorize a loop over 3-byte elements,
// so we chunk it down to bytes and loop over those instead.
// (Three `u8` lanes: 3 bytes per element, not a power of two.)
type RGB24 = [u8; 3];

// Swaps the contents of two `[RGB24]` slices element-wise when their lengths
// match; does nothing otherwise. The directives expect the swap loop to be
// vectorized over bytes (`<N x i8>` loads and stores, any lane count) with no
// `alloca` ahead of the first vector load.
// CHECK-LABEL: @swap_rgb24_slices
#[no_mangle]
pub fn swap_rgb24_slices(x: &mut [RGB24], y: &mut [RGB24]) {
    // CHECK-NOT: alloca
    // CHECK: load <{{[0-9]+}} x i8>
    // CHECK: store <{{[0-9]+}} x i8>
    // `swap_with_slice` panics when the lengths differ, so it is only
    // called once the lengths are known to be equal.
    if x.len() == y.len() {
        x.swap_with_slice(y);
    }
}

// This one has a power-of-two size, so we iterate over it directly
// (four `u8` lanes: 4 bytes per element, the same width as one `i32`).
type RGBA32 = [u8; 4];

// Swaps the contents of two `[RGBA32]` slices element-wise when their lengths
// match; does nothing otherwise. The directives expect the swap loop to be
// vectorized with one `i32` lane per element (`<N x i32>` loads and stores,
// any lane count) and no `alloca` ahead of the first vector load.
// CHECK-LABEL: @swap_rgba32_slices
#[no_mangle]
pub fn swap_rgba32_slices(x: &mut [RGBA32], y: &mut [RGBA32]) {
    // CHECK-NOT: alloca
    // CHECK: load <{{[0-9]+}} x i32>
    // CHECK: store <{{[0-9]+}} x i32>
    // `swap_with_slice` panics when the lengths differ, so it is only
    // called once the lengths are known to be equal.
    if x.len() == y.len() {
        x.swap_with_slice(y);
    }
}

// Strings have a non-power-of-two size, but have pointer alignment,
// so we swap usizes instead of dropping all the way down to bytes.
// The assertion below is evaluated at compile time: if `String` ever gets a
// power-of-two size, this file stops compiling instead of silently testing
// something other than the non-power-of-two case.
const _: () = assert!(!std::mem::size_of::<String>().is_power_of_two());

// CHECK-LABEL: @swap_string_slices
#[no_mangle]
pub fn swap_string_slices(x: &mut [String], y: &mut [String]) {
    // CHECK-NOT: alloca
    // CHECK: load <{{[0-9]+}} x i64>
    // CHECK: store <{{[0-9]+}} x i64>
    if x.len() == y.len() {
        x.swap_with_slice(y);
    }
}

// A struct with C field order and no padding: `packed` lowers the alignment
// to 1, so `second` sits directly after the one-byte `first` and the total
// size is `1 + size_of::<usize>()` bytes.
#[repr(C, packed)]
pub struct Packed {
    pub first: bool,
    pub second: usize,
}

// Swaps two `Packed` values with `std::mem::swap`. The only expectation is
// that no `alloca` appears between the function label and its `ret void`;
// the exact loads and stores used are deliberately left unspecified.
// CHECK-LABEL: @swap_packed_structs
#[no_mangle]
pub fn swap_packed_structs(x: &mut Packed, y: &mut Packed) {
    // CHECK-NOT: alloca
    // CHECK: ret void
    swap(x, y)
}
Use `load`-`store` instead of `memcpy` for short integer arrays 2023-05-26 05:32:22 -05:00			`// compile-flags: -O -Z merge-functions=disabled`
Only run the test on x86_64 Smaller platforms don't merge the loads the same way. 2018-07-22 01:12:46 -05:00			`// only-x86_64`
ignore some codegen tests in debug mode 2019-06-25 02:40:50 -05:00			`// ignore-debug: the debug assertions get in the way`
Don't use SIMD in mem::swap for types smaller than the block size LLVM isn't able to remove the alloca for the unaligned block in the SIMD tail in some cases, so doing this helps SRoA work in cases where it currently doesn't. Found in the `replace_with` RFC discussion. 2018-07-04 04:48:30 -05:00
			`#![crate_type = "lib"]`

			`use std::mem::swap;`

			`type RGB48 = [u16; 3];`

Add a codegen test for manually swapping a small `Copy` type To confirm we're not just helping `mem::swap` 2023-05-26 03:23:55 -05:00			`// CHECK-LABEL: @swap_rgb48_manually(`
			`#[no_mangle]`
			`pub fn swap_rgb48_manually(x: &mut RGB48, y: &mut RGB48) {`
Stop emitting non-power-of-two vectors in basic LLVM codegen 2023-08-25 22:06:57 -05:00			`// FIXME: See #115212 for why this has an alloca again`

			`// CHECK: alloca [3 x i16], align 2`
			`// CHECK: call void @llvm.memcpy.p0.p0.i64({{.+}}, i64 6, i1 false)`
			`// CHECK: call void @llvm.memcpy.p0.p0.i64({{.+}}, i64 6, i1 false)`
			`// CHECK: call void @llvm.memcpy.p0.p0.i64({{.+}}, i64 6, i1 false)`
Add a codegen test for manually swapping a small `Copy` type To confirm we're not just helping `mem::swap` 2023-05-26 03:23:55 -05:00
			`let temp = *x;`
			`*x = *y;`
			`*y = temp;`
			`}`

Don't use SIMD in mem::swap for types smaller than the block size LLVM isn't able to remove the alloca for the unaligned block in the SIMD tail in some cases, so doing this helps SRoA work in cases where it currently doesn't. Found in the `replace_with` RFC discussion. 2018-07-04 04:48:30 -05:00			`// CHECK-LABEL: @swap_rgb48`
			`#[no_mangle]`
			`pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) {`
Tweak the threshold for chunked swapping Thanks to 98892 for the tests I brought in here, as it demonstrated that 3×usize is currently suboptimal. 2022-07-02 16:31:02 -05:00			`// CHECK-NOT: alloca`
Stop emitting non-power-of-two vectors in basic LLVM codegen 2023-08-25 22:06:57 -05:00
Tweak the threshold for chunked swapping Thanks to 98892 for the tests I brought in here, as it demonstrated that 3×usize is currently suboptimal. 2022-07-02 16:31:02 -05:00			// Whether `i8` is the best for this is unclear, but
			`// might as well record what's actually happening right now.`

			`// CHECK: load i8`
			`// CHECK: load i8`
			`// CHECK: store i8`
			`// CHECK: store i8`
Stop emitting non-power-of-two vectors in basic LLVM codegen 2023-08-25 22:06:57 -05:00			`swap(x, y)`
			`}`

			`type RGBA64 = [u16; 4];`

			`// CHECK-LABEL: @swap_rgba64`
			`#[no_mangle]`
			`pub fn swap_rgba64(x: &mut RGBA64, y: &mut RGBA64) {`
Use `load`-`store` instead of `memcpy` for short integer arrays 2023-05-26 05:32:22 -05:00			`// CHECK-NOT: alloca`
Tweak the threshold for chunked swapping Thanks to 98892 for the tests I brought in here, as it demonstrated that 3×usize is currently suboptimal. 2022-07-02 16:31:02 -05:00			`// CHECK-DAG: %[[XVAL:.+]] = load i64, ptr %x, align 2`
			`// CHECK-DAG: %[[YVAL:.+]] = load i64, ptr %y, align 2`
			`// CHECK-DAG: store i64 %[[YVAL]], ptr %x, align 2`
			`// CHECK-DAG: store i64 %[[XVAL]], ptr %y, align 2`
			`swap(x, y)`
			`}`

			`// CHECK-LABEL: @swap_vecs`
			`#[no_mangle]`
			`pub fn swap_vecs(x: &mut Vec<u32>, y: &mut Vec<u32>) {`
			`// CHECK-NOT: alloca`
			`// There are plenty more loads and stores than just these,`
			`// but at least one sure better be 64-bit (for size or capacity).`
			`// CHECK: load i64`
			`// CHECK: load i64`
			`// CHECK: store i64`
			`// CHECK: store i64`
			`// CHECK: ret void`
			`swap(x, y)`
			`}`

			`// CHECK-LABEL: @swap_slices`
			`#[no_mangle]`
			`pub fn swap_slices<'a>(x: &mut &'a [u32], y: &mut &'a [u32]) {`
			`// CHECK-NOT: alloca`
			`// CHECK: load ptr`
			`// CHECK: load i64`
			`// CHECK: load ptr`
			`// CHECK: load i64`
			`// CHECK: store ptr`
			`// CHECK: store i64`
			`// CHECK: store ptr`
			`// CHECK: store i64`
Don't use SIMD in mem::swap for types smaller than the block size LLVM isn't able to remove the alloca for the unaligned block in the SIMD tail in some cases, so doing this helps SRoA work in cases where it currently doesn't. Found in the `replace_with` RFC discussion. 2018-07-04 04:48:30 -05:00			`swap(x, y)`
			`}`
Stop manually SIMDing in swap_nonoverlapping Like I previously did for `reverse`, this leaves it to LLVM to pick how to vectorize it, since it can know better the chunk size to use, compared to the "32 bytes always" approach we currently have. It does still need logic to type-erase where appropriate, though, as while LLVM is now smart enough to vectorize over slices of things like `[u8; 4]`, it fails to do so over slices of `[u8; 3]`. As a bonus, this also means one no longer gets the spurious `memcpy`(s?) at the end up swapping a slice of `__m256`s: <https://rust.godbolt.org/z/joofr4v8Y> 2022-02-21 01:25:18 -06:00
			`// LLVM doesn't vectorize a loop over 3-byte elements,`
			`// so we chunk it down to bytes and loop over those instead.`
			`type RGB24 = [u8; 3];`

			`// CHECK-LABEL: @swap_rgb24_slices`
			`#[no_mangle]`
			`pub fn swap_rgb24_slices(x: &mut [RGB24], y: &mut [RGB24]) {`
Tweak the threshold for chunked swapping Thanks to 98892 for the tests I brought in here, as it demonstrated that 3×usize is currently suboptimal. 2022-07-02 16:31:02 -05:00			`// CHECK-NOT: alloca`
			`// CHECK: load <{{[0-9]+}} x i8>`
			`// CHECK: store <{{[0-9]+}} x i8>`
Stop manually SIMDing in swap_nonoverlapping Like I previously did for `reverse`, this leaves it to LLVM to pick how to vectorize it, since it can know better the chunk size to use, compared to the "32 bytes always" approach we currently have. It does still need logic to type-erase where appropriate, though, as while LLVM is now smart enough to vectorize over slices of things like `[u8; 4]`, it fails to do so over slices of `[u8; 3]`. As a bonus, this also means one no longer gets the spurious `memcpy`(s?) at the end up swapping a slice of `__m256`s: <https://rust.godbolt.org/z/joofr4v8Y> 2022-02-21 01:25:18 -06:00			`if x.len() == y.len() {`
			`x.swap_with_slice(y);`
			`}`
			`}`

			`// This one has a power-of-two size, so we iterate over it directly`
			`type RGBA32 = [u8; 4];`

			`// CHECK-LABEL: @swap_rgba32_slices`
			`#[no_mangle]`
			`pub fn swap_rgba32_slices(x: &mut [RGBA32], y: &mut [RGBA32]) {`
Tweak the threshold for chunked swapping Thanks to 98892 for the tests I brought in here, as it demonstrated that 3×usize is currently suboptimal. 2022-07-02 16:31:02 -05:00			`// CHECK-NOT: alloca`
			`// CHECK: load <{{[0-9]+}} x i32>`
			`// CHECK: store <{{[0-9]+}} x i32>`
Stop manually SIMDing in swap_nonoverlapping Like I previously did for `reverse`, this leaves it to LLVM to pick how to vectorize it, since it can know better the chunk size to use, compared to the "32 bytes always" approach we currently have. It does still need logic to type-erase where appropriate, though, as while LLVM is now smart enough to vectorize over slices of things like `[u8; 4]`, it fails to do so over slices of `[u8; 3]`. As a bonus, this also means one no longer gets the spurious `memcpy`(s?) at the end up swapping a slice of `__m256`s: <https://rust.godbolt.org/z/joofr4v8Y> 2022-02-21 01:25:18 -06:00			`if x.len() == y.len() {`
			`x.swap_with_slice(y);`
			`}`
			`}`

			`// Strings have a non-power-of-two size, but have pointer alignment,`
			`// so we swap usizes instead of dropping all the way down to bytes.`
			`const _: () = assert!(!std::mem::size_of::<String>().is_power_of_two());`

			`// CHECK-LABEL: @swap_string_slices`
			`#[no_mangle]`
			`pub fn swap_string_slices(x: &mut [String], y: &mut [String]) {`
Tweak the threshold for chunked swapping Thanks to 98892 for the tests I brought in here, as it demonstrated that 3×usize is currently suboptimal. 2022-07-02 16:31:02 -05:00			`// CHECK-NOT: alloca`
			`// CHECK: load <{{[0-9]+}} x i64>`
			`// CHECK: store <{{[0-9]+}} x i64>`
Stop manually SIMDing in swap_nonoverlapping Like I previously did for `reverse`, this leaves it to LLVM to pick how to vectorize it, since it can know better the chunk size to use, compared to the "32 bytes always" approach we currently have. It does still need logic to type-erase where appropriate, though, as while LLVM is now smart enough to vectorize over slices of things like `[u8; 4]`, it fails to do so over slices of `[u8; 3]`. As a bonus, this also means one no longer gets the spurious `memcpy`(s?) at the end up swapping a slice of `__m256`s: <https://rust.godbolt.org/z/joofr4v8Y> 2022-02-21 01:25:18 -06:00			`if x.len() == y.len() {`
			`x.swap_with_slice(y);`
			`}`
			`}`
Tweak the threshold for chunked swapping Thanks to 98892 for the tests I brought in here, as it demonstrated that 3×usize is currently suboptimal. 2022-07-02 16:31:02 -05:00
			`#[repr(C, packed)]`
			`pub struct Packed {`
			`pub first: bool,`
			`pub second: usize,`
			`}`

			`// CHECK-LABEL: @swap_packed_structs`
			`#[no_mangle]`
			`pub fn swap_packed_structs(x: &mut Packed, y: &mut Packed) {`
			`// CHECK-NOT: alloca`
			`// CHECK: ret void`
			`swap(x, y)`
			`}`