rust/tests/assembly/simd-intrinsic-mask-reduce.rs

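// Verify the assembly generated for simd mask reductions. The goal is that these
// reductions need no additional bit shift; the checks below currently still
// expect one leading shift by 7 in every sequence.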
//@ revisions: x86 aarch64
//@ [x86] compile-flags: --target=x86_64-unknown-linux-gnu -C llvm-args=-x86-asm-syntax=intel
// Set the base cpu explicitly, in case the default has been changed.
//@ [x86] compile-flags: -C target-cpu=x86-64
//@ [x86] needs-llvm-components: x86
//@ [aarch64] compile-flags: --target=aarch64-unknown-linux-gnu
//@ [aarch64] needs-llvm-components: aarch64
//@ assembly-output: emit-asm
//@ compile-flags: --crate-type=lib -O -C panic=abort

#![feature(no_core, lang_items, repr_simd, intrinsics)]
#![no_core]
#![allow(non_camel_case_types)]

// Because we don't have core yet.
// The crate is built with `#![no_core]`, so it supplies its own `sized` lang
// item as an empty marker trait.
#[lang = "sized"]
pub trait Sized {}

// Local definition of the `copy` lang item, needed for the same reason as
// `Sized`: there is no `core` crate to provide it under `#![no_core]`.
#[lang = "copy"]
trait Copy {}

// SIMD vector type with 16 lanes of `i8`, declared with `#[repr(simd)]`.
// It is the argument type of both reduction functions in this test.
#[repr(simd)]
pub struct mask8x16([i8; 16]);

// Declarations of the two mask reduction intrinsics exercised by this test.
// Both are generic over the vector type `T` and produce a single `bool`.
// NOTE(review): going by the names, `simd_reduce_all` is presumably true only
// when every lane is set and `simd_reduce_any` when at least one lane is set;
// confirm against the intrinsic documentation.
extern "rust-intrinsic" {
    fn simd_reduce_all<T>(x: T) -> bool;
    fn simd_reduce_any<T>(x: T) -> bool;
}

// Passes a `mask8x16` to `simd_reduce_all` and pins the instruction sequence
// expected for each revision. The function is `unsafe` because it calls an
// intrinsic declared in an `extern` block, and it uses the "C" calling
// convention so that the registers named in the checks stay stable.
// CHECK-LABEL: mask_reduce_all:
#[no_mangle]
pub unsafe extern "C" fn mask_reduce_all(m: mask8x16) -> bool {
    // Expected sequences, one per revision:
    // - On x86 a shift by 7 moves the lowest bit of each lane into its most
    //   significant bit, `pmovmskb` gathers those bits into `eax`, and the
    //   16-bit result is compared against all ones (either spelling of the
    //   comparison is accepted) before `sete` produces the boolean.
    // - On aarch64 the same shift is followed by a signed compare against zero
    //   that repeats the sign bit across each lane, then an unsigned minimum
    //   across lanes whose lowest bit is returned in `w0`.
    // x86: psllw xmm0, 7
    // x86-NEXT: pmovmskb eax, xmm0
    // x86-NEXT: {{cmp ax, -1|xor eax, 65535}}
    // x86-NEXT: sete al
    //
    // aarch64: shl v0.16b, v0.16b, #7
    // aarch64-NEXT: cmlt v0.16b, v0.16b, #0
    // aarch64-NEXT: uminv b0, v0.16b
    // aarch64-NEXT: fmov [[REG:[a-z0-9]+]], s0
    // aarch64-NEXT: and w0, [[REG]], #0x1
    simd_reduce_all(m)
}

// Passes a `mask8x16` to `simd_reduce_any` and pins the instruction sequence
// expected for each revision. The function is `unsafe` because it calls an
// intrinsic declared in an `extern` block, and it uses the "C" calling
// convention so that the registers named in the checks stay stable.
// CHECK-LABEL: mask_reduce_any:
#[no_mangle]
pub unsafe extern "C" fn mask_reduce_any(m: mask8x16) -> bool {
    // Expected sequences, one per revision:
    // - On x86 a shift by 7 moves the lowest bit of each lane into its most
    //   significant bit, `pmovmskb` gathers those bits, and `test` / `setne`
    //   yield true when the gathered mask in `eax` is non-zero.
    // - On aarch64 the same shift is followed by a signed compare against zero
    //   that repeats the sign bit across each lane, then an unsigned maximum
    //   across lanes whose lowest bit is returned in `w0`.
    // x86: psllw xmm0, 7
    // x86-NEXT: pmovmskb
    // x86-NEXT: test eax, eax
    // x86-NEXT: setne al
    //
    // aarch64: shl v0.16b, v0.16b, #7
    // aarch64-NEXT: cmlt v0.16b, v0.16b, #0
    // aarch64-NEXT: umaxv b0, v0.16b
    // aarch64-NEXT: fmov [[REG:[a-z0-9]+]], s0
    // aarch64-NEXT: and w0, [[REG]], #0x1
    simd_reduce_any(m)
}
Add tests for the generated assembly of mask related simd instructions. The tests show that the code generation currently uses the least significant bits of <iX x N> vector masks when converting to <i1 xN>. This leads to an additional left shift operation in the assembly for x86, since mask operations on x86 operate based on the most significant bit. On aarch64 the left shift is followed by a comparison against zero, which repeats the sign bit across the whole lane. The exception, which does not introduce an unneeded shift, is simd_bitmask, because the code generation already shifts before truncating. By using the "C" calling convention the tests should be stable regarding changes in register allocation, but it is possible that future llvm updates will require updating some of the checks. This additional instruction would be removed by the fix in #104693, which uses the most significant bit for all mask operations. 2024-03-02 03:59:11 -06:00			`// verify that simd mask reductions do not introduce additional bit shift operations`
			`//@ revisions: x86 aarch64`
			`//@ [x86] compile-flags: --target=x86_64-unknown-linux-gnu -C llvm-args=-x86-asm-syntax=intel`
Add inline comments why we're forcing the target cpu 2024-05-01 18:54:20 -05:00			`// Set the base cpu explicitly, in case the default has been changed.`
Use an explicit x86-64 cpu in tests that are sensitive to it There are a few tests that depend on some target features not being enabled by default, and usually they are correct with the default x86-64 target CPU. However, in downstream builds we have modified the default to fit our distros -- `x86-64-v2` in RHEL 9 and `x86-64-v3` in RHEL 10 -- and the latter especially trips tests that expect not to have AVX. These cases are few enough that we can just set them back explicitly. 2024-05-01 17:25:26 -05:00			`//@ [x86] compile-flags: -C target-cpu=x86-64`
Add tests for the generated assembly of mask related simd instructions. The tests show that the code generation currently uses the least significant bits of <iX x N> vector masks when converting to <i1 xN>. This leads to an additional left shift operation in the assembly for x86, since mask operations on x86 operate based on the most significant bit. On aarch64 the left shift is followed by a comparison against zero, which repeats the sign bit across the whole lane. The exception, which does not introduce an unneeded shift, is simd_bitmask, because the code generation already shifts before truncating. By using the "C" calling convention the tests should be stable regarding changes in register allocation, but it is possible that future llvm updates will require updating some of the checks. This additional instruction would be removed by the fix in #104693, which uses the most significant bit for all mask operations. 2024-03-02 03:59:11 -06:00			`//@ [x86] needs-llvm-components: x86`
			`//@ [aarch64] compile-flags: --target=aarch64-unknown-linux-gnu`
			`//@ [aarch64] needs-llvm-components: aarch64`
			`//@ assembly-output: emit-asm`
Remove c_unwind from tests and fix tests 2023-08-25 07:52:51 -05:00			`//@ compile-flags: --crate-type=lib -O -C panic=abort`
Add tests for the generated assembly of mask related simd instructions. The tests show that the code generation currently uses the least significant bits of <iX x N> vector masks when converting to <i1 xN>. This leads to an additional left shift operation in the assembly for x86, since mask operations on x86 operate based on the most significant bit. On aarch64 the left shift is followed by a comparison against zero, which repeats the sign bit across the whole lane. The exception, which does not introduce an unneeded shift, is simd_bitmask, because the code generation already shifts before truncating. By using the "C" calling convention the tests should be stable regarding changes in register allocation, but it is possible that future llvm updates will require updating some of the checks. This additional instruction would be removed by the fix in #104693, which uses the most significant bit for all mask operations. 2024-03-02 03:59:11 -06:00
			`#![feature(no_core, lang_items, repr_simd, intrinsics)]`
			`#![no_core]`
			`#![allow(non_camel_case_types)]`

			`// Because we don't have core yet.`
			`#[lang = "sized"]`
			`pub trait Sized {}`

			`#[lang = "copy"]`
			`trait Copy {}`

			`#[repr(simd)]`
			`pub struct mask8x16([i8; 16]);`

			`extern "rust-intrinsic" {`
			`fn simd_reduce_all<T>(x: T) -> bool;`
			`fn simd_reduce_any<T>(x: T) -> bool;`
			`}`

			`// CHECK-LABEL: mask_reduce_all:`
			`#[no_mangle]`
			`pub unsafe extern "C" fn mask_reduce_all(m: mask8x16) -> bool {`
			`// x86: psllw xmm0, 7`
			`// x86-NEXT: pmovmskb eax, xmm0`
			`// x86-NEXT: {{cmp ax, -1\|xor eax, 65535}}`
			`// x86-NEXT: sete al`
			`//`
			`// aarch64: shl v0.16b, v0.16b, #7`
			`// aarch64-NEXT: cmlt v0.16b, v0.16b, #0`
			`// aarch64-NEXT: uminv b0, v0.16b`
			`// aarch64-NEXT: fmov [[REG:[a-z0-9]+]], s0`
			`// aarch64-NEXT: and w0, [[REG]], #0x1`
			`simd_reduce_all(m)`
			`}`

			`// CHECK-LABEL: mask_reduce_any:`
			`#[no_mangle]`
			`pub unsafe extern "C" fn mask_reduce_any(m: mask8x16) -> bool {`
			`// x86: psllw xmm0, 7`
			`// x86-NEXT: pmovmskb`
			`// x86-NEXT: test eax, eax`
			`// x86-NEXT: setne al`
			`//`
			`// aarch64: shl v0.16b, v0.16b, #7`
			`// aarch64-NEXT: cmlt v0.16b, v0.16b, #0`
			`// aarch64-NEXT: umaxv b0, v0.16b`
			`// aarch64-NEXT: fmov [[REG:[a-z0-9]+]], s0`
			`// aarch64-NEXT: and w0, [[REG]], #0x1`
			`simd_reduce_any(m)`
			`}`