rust/tests/codegen/simd-wide-sum.rs

// compile-flags: -C opt-level=3 --edition=2021
// only-x86_64
// ignore-debug: the debug assertions get in the way

#![crate_type = "lib"]
#![feature(portable_simd)]

use std::simd::{Simd, SimdUint};
const N: usize = 8;

// Baseline version: sums the lanes of `x` as `u16` using the explicit
// portable-SIMD API (`cast` to widen every lane, then `reduce_sum`).
// The FileCheck expectations inside the body require the optimized IR to
// contain a vector zero-extension from `<8 x i8>` to `<8 x i16>` followed
// by a call to the `llvm.vector.reduce.add.v8i16` intrinsic.
#[no_mangle]
// CHECK-LABEL: @wider_reduce_simd
pub fn wider_reduce_simd(x: Simd<u8, N>) -> u16 {
    // CHECK: zext <8 x i8>
    // CHECK-SAME: to <8 x i16>
    // CHECK: call i16 @llvm.vector.reduce.add.v8i16(<8 x i16>
    // Widen first so that the additions are performed on `u16` lanes.
    let x: Simd<u16, N> = x.cast();
    x.reduce_sum()
}

// Same reduction written as a plain indexed loop over the `N` lanes,
// widening each lane with `u16::from` before accumulating.
// The expectations in the body are the same as for `wider_reduce_simd`,
// i.e. the loop is expected to be turned into a vector zero-extension plus
// a single `llvm.vector.reduce.add.v8i16` call.
#[no_mangle]
// CHECK-LABEL: @wider_reduce_loop
pub fn wider_reduce_loop(x: Simd<u8, N>) -> u16 {
    // CHECK: zext <8 x i8>
    // CHECK-SAME: to <8 x i16>
    // CHECK: call i16 @llvm.vector.reduce.add.v8i16(<8 x i16>
    // With N = 8 lanes of at most 255 each, the total fits in a `u16`.
    let mut sum = 0_u16;
    for i in 0..N {
        sum += u16::from(x[i]);
    }
    sum
}

// Same reduction written as an iterator chain over a borrowed view of the
// lanes: `as_array().iter()` yields references, `copied()` turns them into
// `u8` values, `map(u16::from)` widens them, and `sum()` accumulates.
// The expectations in the body are the same as for `wider_reduce_simd`.
#[no_mangle]
// CHECK-LABEL: @wider_reduce_iter
pub fn wider_reduce_iter(x: Simd<u8, N>) -> u16 {
    // CHECK: zext <8 x i8>
    // CHECK-SAME: to <8 x i16>
    // CHECK: call i16 @llvm.vector.reduce.add.v8i16(<8 x i16>
    x.as_array().iter().copied().map(u16::from).sum()
}

// The by-value iterator version below is the most interesting one: it
// used to fail to auto-vectorize because of a suboptimality in the
// `<array::IntoIter as Iterator>::fold` implementation.

// Same reduction written with a by-value array iterator: `to_array()`
// copies the lanes out into an array and `into_iter()` consumes it, so the
// sum goes through `array::IntoIter` rather than a slice iterator.
// Only the label is actively verified here; the codegen expectations are
// currently disabled (see the FIXME in the body).
#[no_mangle]
// CHECK-LABEL: @wider_reduce_into_iter
pub fn wider_reduce_into_iter(x: Simd<u8, N>) -> u16 {
    // FIXME MIR inlining messes up LLVM optimizations.
    // NOTE(review): the `WOULD-` prefix presumably keeps FileCheck from
    // treating the next three lines as active directives; restore the plain
    // prefix once the FIXME above is resolved.
    // WOULD-CHECK: zext <8 x i8>
    // WOULD-CHECK-SAME: to <8 x i16>
    // WOULD-CHECK: call i16 @llvm.vector.reduce.add.v8i16(<8 x i16>
    x.to_array().into_iter().map(u16::from).sum()
}
Fix `array::IntoIter::fold` to use the optimized `Range::fold` It was using `Iterator::by_ref` in the implementation, which ended up pessimizing it enough that, for example, it didn't vectorize when we tried it in the <https://rust-lang.zulipchat.com/#narrow/stream/257879-project-portable-simd/topic/Reducing.20sum.20into.20wider.20types> conversation. Demonstration that the codegen test doesn't pass on the current nightly: <https://rust.godbolt.org/z/Taxev5eMn> 2022-04-02 16:29:41 -05:00			`// compile-flags: -C opt-level=3 --edition=2021`
			`// only-x86_64`
			`// ignore-debug: the debug assertions get in the way`

			`#![crate_type = "lib"]`
			`#![feature(portable_simd)]`

Introduce core::simd trait imports in tests 2022-07-20 20:08:14 -05:00			`use std::simd::{Simd, SimdUint};`
Fix `array::IntoIter::fold` to use the optimized `Range::fold` It was using `Iterator::by_ref` in the implementation, which ended up pessimizing it enough that, for example, it didn't vectorize when we tried it in the <https://rust-lang.zulipchat.com/#narrow/stream/257879-project-portable-simd/topic/Reducing.20sum.20into.20wider.20types> conversation. Demonstration that the codegen test doesn't pass on the current nightly: <https://rust.godbolt.org/z/Taxev5eMn> 2022-04-02 16:29:41 -05:00			`const N: usize = 8;`

			`#[no_mangle]`
			`// CHECK-LABEL: @wider_reduce_simd`
			`pub fn wider_reduce_simd(x: Simd<u8, N>) -> u16 {`
			`// CHECK: zext <8 x i8>`
			`// CHECK-SAME: to <8 x i16>`
			`// CHECK: call i16 @llvm.vector.reduce.add.v8i16(<8 x i16>`
			`let x: Simd<u16, N> = x.cast();`
			`x.reduce_sum()`
			`}`

			`#[no_mangle]`
			`// CHECK-LABEL: @wider_reduce_loop`
			`pub fn wider_reduce_loop(x: Simd<u8, N>) -> u16 {`
			`// CHECK: zext <8 x i8>`
			`// CHECK-SAME: to <8 x i16>`
			`// CHECK: call i16 @llvm.vector.reduce.add.v8i16(<8 x i16>`
			`let mut sum = 0_u16;`
			`for i in 0..N {`
			`sum += u16::from(x[i]);`
			`}`
			`sum`
			`}`

			`#[no_mangle]`
			`// CHECK-LABEL: @wider_reduce_iter`
			`pub fn wider_reduce_iter(x: Simd<u8, N>) -> u16 {`
			`// CHECK: zext <8 x i8>`
			`// CHECK-SAME: to <8 x i16>`
			`// CHECK: call i16 @llvm.vector.reduce.add.v8i16(<8 x i16>`
			`x.as_array().iter().copied().map(u16::from).sum()`
			`}`

			`// This iterator one is the most interesting, as it's the one`
			`// which used to not auto-vectorize due to a suboptimality in the`
			// `<array::IntoIter as Iterator>::fold` implementation.

			`#[no_mangle]`
			`// CHECK-LABEL: @wider_reduce_into_iter`
			`pub fn wider_reduce_into_iter(x: Simd<u8, N>) -> u16 {`
Amend codegen test. 2022-05-29 12:44:58 -05:00			`// FIXME MIR inlining messes up LLVM optimizations.`
			`// WOULD-CHECK: zext <8 x i8>`
			`// WOULD-CHECK-SAME: to <8 x i16>`
			`// WOULD-CHECK: call i16 @llvm.vector.reduce.add.v8i16(<8 x i16>`
Fix `array::IntoIter::fold` to use the optimized `Range::fold` It was using `Iterator::by_ref` in the implementation, which ended up pessimizing it enough that, for example, it didn't vectorize when we tried it in the <https://rust-lang.zulipchat.com/#narrow/stream/257879-project-portable-simd/topic/Reducing.20sum.20into.20wider.20types> conversation. Demonstration that the codegen test doesn't pass on the current nightly: <https://rust.godbolt.org/z/Taxev5eMn> 2022-04-02 16:29:41 -05:00			`x.to_array().into_iter().map(u16::from).sum()`
			`}`