Implement min/max neon intrinsics
This commit is contained in:
parent
ef3703694f
commit
1f09bae6a8
@ -99,6 +99,7 @@ const BASE_SYSROOT_SUITE: &[TestCase] = &[
|
||||
TestCase::build_bin_and_run("aot.mod_bench", "example/mod_bench.rs", &[]),
|
||||
TestCase::build_bin_and_run("aot.issue-72793", "example/issue-72793.rs", &[]),
|
||||
TestCase::build_bin("aot.issue-59326", "example/issue-59326.rs"),
|
||||
TestCase::build_bin_and_run("aot.neon", "example/neon.rs", &[]),
|
||||
];
|
||||
|
||||
pub(crate) static RAND_REPO: GitRepo = GitRepo::github(
|
||||
|
@ -42,6 +42,7 @@ aot.float-minmax-pass
|
||||
aot.mod_bench
|
||||
aot.issue-72793
|
||||
aot.issue-59326
|
||||
aot.neon
|
||||
|
||||
testsuite.extended_sysroot
|
||||
test.rust-random/rand
|
||||
|
155
example/neon.rs
Normal file
155
example/neon.rs
Normal file
@ -0,0 +1,155 @@
|
||||
// Most of these tests are copied from https://github.com/japaric/stdsimd/blob/0f4413d01c4f0c3ffbc5a69e9a37fbc7235b31a9/coresimd/arm/neon.rs
|
||||
|
||||
#![feature(portable_simd)]
|
||||
// `std::arch::aarch64` only exists when compiling for aarch64, so the import
// is gated to keep the no-op build for other targets compiling.
#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*;
use std::mem::transmute;
use std::simd::*;
|
||||
|
||||
/// Checks `vpmin_s8` on two 8-lane signed byte vectors.
///
/// The expected vector holds the minimum of each adjacent lane pair: the four
/// pairs of the first operand fill the low result lanes, the four pairs of the
/// second operand fill the high result lanes.
#[cfg(target_arch = "aarch64")]
unsafe fn test_vpmin_s8() {
    let lhs = i8x8::from([1, -2, 3, -4, 5, 6, 7, 8]);
    let rhs = i8x8::from([0, 3, 2, 5, 4, 7, 6, 9]);

    // Round-trip through the NEON vector type expected by the intrinsic.
    let got: i8x8 = transmute(vpmin_s8(transmute(lhs), transmute(rhs)));

    let want = i8x8::from([-2, -4, 5, 7, 0, 2, 4, 6]);
    assert_eq!(got, want);
}
|
||||
|
||||
/// Checks `vpmin_s16` on two 4-lane signed 16-bit vectors.
///
/// The expected vector holds the minimum of each adjacent lane pair: pairs of
/// the first operand come first, pairs of the second operand follow.
#[cfg(target_arch = "aarch64")]
unsafe fn test_vpmin_s16() {
    let lhs = i16x4::from([1, 2, 3, -4]);
    let rhs = i16x4::from([0, 3, 2, 5]);

    // Round-trip through the NEON vector type expected by the intrinsic.
    let got: i16x4 = transmute(vpmin_s16(transmute(lhs), transmute(rhs)));

    let want = i16x4::from([1, -4, 0, 2]);
    assert_eq!(got, want);
}
|
||||
|
||||
/// Checks `vpmin_s32` on two 2-lane signed 32-bit vectors.
///
/// Each operand contributes a single lane pair, so the expected vector is
/// `[min of first operand's lanes, min of second operand's lanes]`.
#[cfg(target_arch = "aarch64")]
unsafe fn test_vpmin_s32() {
    let lhs = i32x2::from([1, -2]);
    let rhs = i32x2::from([0, 3]);

    // Round-trip through the NEON vector type expected by the intrinsic.
    let got: i32x2 = transmute(vpmin_s32(transmute(lhs), transmute(rhs)));

    let want = i32x2::from([-2, 0]);
    assert_eq!(got, want);
}
|
||||
|
||||
/// Checks `vpmin_u8` on two 8-lane unsigned byte vectors.
///
/// The expected vector holds the minimum of each adjacent lane pair: the four
/// pairs of the first operand fill the low result lanes, the four pairs of the
/// second operand fill the high result lanes.
#[cfg(target_arch = "aarch64")]
unsafe fn test_vpmin_u8() {
    let lhs = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
    let rhs = u8x8::from([0, 3, 2, 5, 4, 7, 6, 9]);

    // Round-trip through the NEON vector type expected by the intrinsic.
    let got: u8x8 = transmute(vpmin_u8(transmute(lhs), transmute(rhs)));

    let want = u8x8::from([1, 3, 5, 7, 0, 2, 4, 6]);
    assert_eq!(got, want);
}
|
||||
|
||||
/// Checks `vpmin_u16` on two 4-lane unsigned 16-bit vectors.
///
/// The expected vector holds the minimum of each adjacent lane pair: pairs of
/// the first operand come first, pairs of the second operand follow.
#[cfg(target_arch = "aarch64")]
unsafe fn test_vpmin_u16() {
    let lhs = u16x4::from([1, 2, 3, 4]);
    let rhs = u16x4::from([0, 3, 2, 5]);

    // Round-trip through the NEON vector type expected by the intrinsic.
    let got: u16x4 = transmute(vpmin_u16(transmute(lhs), transmute(rhs)));

    let want = u16x4::from([1, 3, 0, 2]);
    assert_eq!(got, want);
}
|
||||
|
||||
/// Checks `vpmin_u32` on two 2-lane unsigned 32-bit vectors.
///
/// Each operand contributes a single lane pair, so the expected vector is
/// `[min of first operand's lanes, min of second operand's lanes]`.
#[cfg(target_arch = "aarch64")]
unsafe fn test_vpmin_u32() {
    let lhs = u32x2::from([1, 2]);
    let rhs = u32x2::from([0, 3]);

    // Round-trip through the NEON vector type expected by the intrinsic.
    let got: u32x2 = transmute(vpmin_u32(transmute(lhs), transmute(rhs)));

    let want = u32x2::from([1, 0]);
    assert_eq!(got, want);
}
|
||||
|
||||
/// Checks `vpmin_f32` on two 2-lane `f32` vectors.
///
/// Each operand contributes a single lane pair, so the expected vector is
/// `[min of first operand's lanes, min of second operand's lanes]`.
#[cfg(target_arch = "aarch64")]
unsafe fn test_vpmin_f32() {
    let lhs = f32x2::from([1., -2.]);
    let rhs = f32x2::from([0., 3.]);

    // Round-trip through the NEON vector type expected by the intrinsic.
    let got: f32x2 = transmute(vpmin_f32(transmute(lhs), transmute(rhs)));

    let want = f32x2::from([-2., 0.]);
    assert_eq!(got, want);
}
|
||||
|
||||
/// Checks `vpmax_s8` on two 8-lane signed byte vectors.
///
/// The expected vector holds the maximum of each adjacent lane pair: the four
/// pairs of the first operand fill the low result lanes, the four pairs of the
/// second operand fill the high result lanes.
#[cfg(target_arch = "aarch64")]
unsafe fn test_vpmax_s8() {
    let lhs = i8x8::from([1, -2, 3, -4, 5, 6, 7, 8]);
    let rhs = i8x8::from([0, 3, 2, 5, 4, 7, 6, 9]);

    // Round-trip through the NEON vector type expected by the intrinsic.
    let got: i8x8 = transmute(vpmax_s8(transmute(lhs), transmute(rhs)));

    let want = i8x8::from([1, 3, 6, 8, 3, 5, 7, 9]);
    assert_eq!(got, want);
}
|
||||
|
||||
/// Checks `vpmax_s16` on two 4-lane signed 16-bit vectors.
///
/// The expected vector holds the maximum of each adjacent lane pair: pairs of
/// the first operand come first, pairs of the second operand follow.
#[cfg(target_arch = "aarch64")]
unsafe fn test_vpmax_s16() {
    let lhs = i16x4::from([1, 2, 3, -4]);
    let rhs = i16x4::from([0, 3, 2, 5]);

    // Round-trip through the NEON vector type expected by the intrinsic.
    let got: i16x4 = transmute(vpmax_s16(transmute(lhs), transmute(rhs)));

    let want = i16x4::from([2, 3, 3, 5]);
    assert_eq!(got, want);
}
|
||||
|
||||
/// Checks `vpmax_s32` on two 2-lane signed 32-bit vectors.
///
/// Each operand contributes a single lane pair, so the expected vector is
/// `[max of first operand's lanes, max of second operand's lanes]`.
#[cfg(target_arch = "aarch64")]
unsafe fn test_vpmax_s32() {
    let lhs = i32x2::from([1, -2]);
    let rhs = i32x2::from([0, 3]);

    // Round-trip through the NEON vector type expected by the intrinsic.
    let got: i32x2 = transmute(vpmax_s32(transmute(lhs), transmute(rhs)));

    let want = i32x2::from([1, 3]);
    assert_eq!(got, want);
}
|
||||
|
||||
/// Checks `vpmax_u8` on two 8-lane unsigned byte vectors.
///
/// The expected vector holds the maximum of each adjacent lane pair: the four
/// pairs of the first operand fill the low result lanes, the four pairs of the
/// second operand fill the high result lanes.
#[cfg(target_arch = "aarch64")]
unsafe fn test_vpmax_u8() {
    let lhs = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
    let rhs = u8x8::from([0, 3, 2, 5, 4, 7, 6, 9]);

    // Round-trip through the NEON vector type expected by the intrinsic.
    let got: u8x8 = transmute(vpmax_u8(transmute(lhs), transmute(rhs)));

    let want = u8x8::from([2, 4, 6, 8, 3, 5, 7, 9]);
    assert_eq!(got, want);
}
|
||||
|
||||
/// Checks `vpmax_u16` on two 4-lane unsigned 16-bit vectors.
///
/// The expected vector holds the maximum of each adjacent lane pair: pairs of
/// the first operand come first, pairs of the second operand follow.
#[cfg(target_arch = "aarch64")]
unsafe fn test_vpmax_u16() {
    let lhs = u16x4::from([1, 2, 3, 4]);
    let rhs = u16x4::from([0, 3, 2, 5]);

    // Round-trip through the NEON vector type expected by the intrinsic.
    let got: u16x4 = transmute(vpmax_u16(transmute(lhs), transmute(rhs)));

    let want = u16x4::from([2, 4, 3, 5]);
    assert_eq!(got, want);
}
|
||||
|
||||
/// Checks `vpmax_u32` on two 2-lane unsigned 32-bit vectors.
///
/// Each operand contributes a single lane pair, so the expected vector is
/// `[max of first operand's lanes, max of second operand's lanes]`.
#[cfg(target_arch = "aarch64")]
unsafe fn test_vpmax_u32() {
    let lhs = u32x2::from([1, 2]);
    let rhs = u32x2::from([0, 3]);

    // Round-trip through the NEON vector type expected by the intrinsic.
    let got: u32x2 = transmute(vpmax_u32(transmute(lhs), transmute(rhs)));

    let want = u32x2::from([2, 3]);
    assert_eq!(got, want);
}
|
||||
|
||||
/// Checks `vpmax_f32` on two 2-lane `f32` vectors.
///
/// Each operand contributes a single lane pair, so the expected vector is
/// `[max of first operand's lanes, max of second operand's lanes]`.
#[cfg(target_arch = "aarch64")]
unsafe fn test_vpmax_f32() {
    let lhs = f32x2::from([1., -2.]);
    let rhs = f32x2::from([0., 3.]);

    // Round-trip through the NEON vector type expected by the intrinsic.
    let got: f32x2 = transmute(vpmax_f32(transmute(lhs), transmute(rhs)));

    let want = f32x2::from([1., 3.]);
    assert_eq!(got, want);
}
|
||||
|
||||
/// Runs every pairwise min/max NEON check in this file.
///
/// Each check panics through `assert_eq!` on a mismatch, so a normal exit
/// means all of them passed.
#[cfg(target_arch = "aarch64")]
fn main() {
    // SAFETY: the checks are `unsafe fn` only because they call the NEON
    // intrinsics and `transmute` directly; they take no arguments and work
    // purely on vectors they construct themselves.
    unsafe {
        test_vpmin_s8();
        test_vpmin_s16();
        test_vpmin_s32();
        test_vpmin_u8();
        test_vpmin_u16();
        test_vpmin_u32();
        test_vpmin_f32();
        test_vpmax_s8();
        test_vpmax_s16();
        test_vpmax_s32();
        test_vpmax_u8();
        test_vpmax_u16();
        test_vpmax_u32();
        test_vpmax_f32();
    }
}

/// No-op entry point for every target other than aarch64.
///
/// All checks above are compiled only for aarch64. Gating this fallback on
/// `not(aarch64)` (rather than on x86_64 alone) guarantees that exactly one
/// `main` exists on every target.
#[cfg(not(target_arch = "aarch64"))]
fn main() {}
|
@ -156,6 +156,78 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
|
||||
});
|
||||
}
|
||||
|
||||
_ if intrinsic.starts_with("llvm.aarch64.neon.umaxp.v") => {
|
||||
intrinsic_args!(fx, args => (x, y); intrinsic);
|
||||
|
||||
simd_horizontal_pair_for_each_lane(
|
||||
fx,
|
||||
x,
|
||||
y,
|
||||
ret,
|
||||
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().umax(x_lane, y_lane),
|
||||
);
|
||||
}
|
||||
|
||||
_ if intrinsic.starts_with("llvm.aarch64.neon.smaxp.v") => {
|
||||
intrinsic_args!(fx, args => (x, y); intrinsic);
|
||||
|
||||
simd_horizontal_pair_for_each_lane(
|
||||
fx,
|
||||
x,
|
||||
y,
|
||||
ret,
|
||||
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().smax(x_lane, y_lane),
|
||||
);
|
||||
}
|
||||
|
||||
_ if intrinsic.starts_with("llvm.aarch64.neon.uminp.v") => {
|
||||
intrinsic_args!(fx, args => (x, y); intrinsic);
|
||||
|
||||
simd_horizontal_pair_for_each_lane(
|
||||
fx,
|
||||
x,
|
||||
y,
|
||||
ret,
|
||||
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().umin(x_lane, y_lane),
|
||||
);
|
||||
}
|
||||
|
||||
_ if intrinsic.starts_with("llvm.aarch64.neon.sminp.v") => {
|
||||
intrinsic_args!(fx, args => (x, y); intrinsic);
|
||||
|
||||
simd_horizontal_pair_for_each_lane(
|
||||
fx,
|
||||
x,
|
||||
y,
|
||||
ret,
|
||||
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().smin(x_lane, y_lane),
|
||||
);
|
||||
}
|
||||
|
||||
_ if intrinsic.starts_with("llvm.aarch64.neon.fminp.v") => {
|
||||
intrinsic_args!(fx, args => (x, y); intrinsic);
|
||||
|
||||
simd_horizontal_pair_for_each_lane(
|
||||
fx,
|
||||
x,
|
||||
y,
|
||||
ret,
|
||||
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().fmin(x_lane, y_lane),
|
||||
);
|
||||
}
|
||||
|
||||
_ if intrinsic.starts_with("llvm.aarch64.neon.fmaxp.v") => {
|
||||
intrinsic_args!(fx, args => (x, y); intrinsic);
|
||||
|
||||
simd_horizontal_pair_for_each_lane(
|
||||
fx,
|
||||
x,
|
||||
y,
|
||||
ret,
|
||||
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().fmax(x_lane, y_lane),
|
||||
);
|
||||
}
|
||||
|
||||
// FIXME generalize vector types
|
||||
"llvm.aarch64.neon.tbl1.v16i8" => {
|
||||
intrinsic_args!(fx, args => (t, idx); intrinsic);
|
||||
@ -172,25 +244,6 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME generalize vector types
|
||||
"llvm.aarch64.neon.umaxp.v16i8" => {
|
||||
intrinsic_args!(fx, args => (a, b); intrinsic);
|
||||
|
||||
// FIXME add helper for horizontal pairwise operations
|
||||
for i in 0..8 {
|
||||
let lane1 = a.value_lane(fx, i * 2).load_scalar(fx);
|
||||
let lane2 = a.value_lane(fx, i * 2 + 1).load_scalar(fx);
|
||||
let res = fx.bcx.ins().umax(lane1, lane2);
|
||||
ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
|
||||
}
|
||||
for i in 0..8 {
|
||||
let lane1 = b.value_lane(fx, i * 2).load_scalar(fx);
|
||||
let lane2 = b.value_lane(fx, i * 2 + 1).load_scalar(fx);
|
||||
let res = fx.bcx.ins().umax(lane1, lane2);
|
||||
ret.place_lane(fx, 8 + i).to_ptr().store(fx, res, MemFlags::trusted());
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
_ if intrinsic.starts_with("llvm.aarch64.neon.sshl.v")
|
||||
|| intrinsic.starts_with("llvm.aarch64.neon.sqshl.v")
|
||||
|
@ -132,6 +132,36 @@ fn simd_pair_for_each_lane<'tcx>(
|
||||
}
|
||||
}
|
||||
|
||||
fn simd_horizontal_pair_for_each_lane<'tcx>(
|
||||
fx: &mut FunctionCx<'_, '_, 'tcx>,
|
||||
x: CValue<'tcx>,
|
||||
y: CValue<'tcx>,
|
||||
ret: CPlace<'tcx>,
|
||||
f: &dyn Fn(&mut FunctionCx<'_, '_, 'tcx>, Ty<'tcx>, Ty<'tcx>, Value, Value) -> Value,
|
||||
) {
|
||||
assert_eq!(x.layout(), y.layout());
|
||||
let layout = x.layout();
|
||||
|
||||
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
|
||||
let lane_layout = fx.layout_of(lane_ty);
|
||||
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
|
||||
let ret_lane_layout = fx.layout_of(ret_lane_ty);
|
||||
assert_eq!(lane_count, ret_lane_count);
|
||||
|
||||
for lane_idx in 0..lane_count {
|
||||
let src = if lane_idx < (lane_count / 2) { x } else { y };
|
||||
let src_idx = lane_idx % (lane_count / 2);
|
||||
|
||||
let lhs_lane = src.value_lane(fx, src_idx * 2).load_scalar(fx);
|
||||
let rhs_lane = src.value_lane(fx, src_idx * 2 + 1).load_scalar(fx);
|
||||
|
||||
let res_lane = f(fx, lane_layout.ty, ret_lane_layout.ty, lhs_lane, rhs_lane);
|
||||
let res_lane = CValue::by_val(res_lane, ret_lane_layout);
|
||||
|
||||
ret.place_lane(fx, lane_idx).write_cvalue(fx, res_lane);
|
||||
}
|
||||
}
|
||||
|
||||
fn simd_trio_for_each_lane<'tcx>(
|
||||
fx: &mut FunctionCx<'_, '_, 'tcx>,
|
||||
x: CValue<'tcx>,
|
||||
|
Loading…
x
Reference in New Issue
Block a user