Auto merge of #3086 - eduardosm:x86-sse3-intrinsics, r=RalfJung
Implement SSE3 and SSSE3 intrinsics
This commit is contained in:
commit
a456149187
@ -9,6 +9,8 @@ use shims::foreign_items::EmulateByNameResult;
|
||||
|
||||
mod sse;
|
||||
mod sse2;
|
||||
mod sse3;
|
||||
mod ssse3;
|
||||
|
||||
impl<'mir, 'tcx: 'mir> EvalContextExt<'mir, 'tcx> for crate::MiriInterpCx<'mir, 'tcx> {}
|
||||
pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
|
||||
@ -88,6 +90,16 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
|
||||
this, link_name, abi, args, dest,
|
||||
);
|
||||
}
|
||||
name if name.starts_with("sse3.") => {
|
||||
return sse3::EvalContextExt::emulate_x86_sse3_intrinsic(
|
||||
this, link_name, abi, args, dest,
|
||||
);
|
||||
}
|
||||
name if name.starts_with("ssse3.") => {
|
||||
return ssse3::EvalContextExt::emulate_x86_ssse3_intrinsic(
|
||||
this, link_name, abi, args, dest,
|
||||
);
|
||||
}
|
||||
_ => return Ok(EmulateByNameResult::NotSupported),
|
||||
}
|
||||
Ok(EmulateByNameResult::NeedsJumping)
|
||||
@ -286,3 +298,44 @@ fn bin_op_simd_float_all<'tcx, F: rustc_apfloat::Float>(
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Horizontaly performs `which` operation on adjacent values of
|
||||
/// `left` and `right` SIMD vectors and stores the result in `dest`.
|
||||
fn horizontal_bin_op<'tcx>(
|
||||
this: &mut crate::MiriInterpCx<'_, 'tcx>,
|
||||
which: mir::BinOp,
|
||||
saturating: bool,
|
||||
left: &OpTy<'tcx, Provenance>,
|
||||
right: &OpTy<'tcx, Provenance>,
|
||||
dest: &PlaceTy<'tcx, Provenance>,
|
||||
) -> InterpResult<'tcx, ()> {
|
||||
let (left, left_len) = this.operand_to_simd(left)?;
|
||||
let (right, right_len) = this.operand_to_simd(right)?;
|
||||
let (dest, dest_len) = this.place_to_simd(dest)?;
|
||||
|
||||
assert_eq!(dest_len, left_len);
|
||||
assert_eq!(dest_len, right_len);
|
||||
assert_eq!(dest_len % 2, 0);
|
||||
|
||||
let middle = dest_len / 2;
|
||||
for i in 0..dest_len {
|
||||
// `i` is the index in `dest`
|
||||
// `j` is the index of the 2-item chunk in `src`
|
||||
let (j, src) =
|
||||
if i < middle { (i, &left) } else { (i.checked_sub(middle).unwrap(), &right) };
|
||||
// `base_i` is the index of the first item of the 2-item chunk in `src`
|
||||
let base_i = j.checked_mul(2).unwrap();
|
||||
let lhs = this.read_immediate(&this.project_index(src, base_i)?)?;
|
||||
let rhs = this.read_immediate(&this.project_index(src, base_i.checked_add(1).unwrap())?)?;
|
||||
|
||||
let res = if saturating {
|
||||
Immediate::from(this.saturating_arith(which, &lhs, &rhs)?)
|
||||
} else {
|
||||
*this.wrapping_binary_op(which, &lhs, &rhs)?
|
||||
};
|
||||
|
||||
this.write_immediate(res, &this.project_index(&dest, i)?)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
90
src/tools/miri/src/shims/x86/sse3.rs
Normal file
90
src/tools/miri/src/shims/x86/sse3.rs
Normal file
@ -0,0 +1,90 @@
|
||||
use rustc_middle::mir;
|
||||
use rustc_span::Symbol;
|
||||
use rustc_target::abi::Align;
|
||||
use rustc_target::spec::abi::Abi;
|
||||
|
||||
use super::horizontal_bin_op;
|
||||
use crate::*;
|
||||
use shims::foreign_items::EmulateByNameResult;
|
||||
|
||||
impl<'mir, 'tcx: 'mir> EvalContextExt<'mir, 'tcx> for crate::MiriInterpCx<'mir, 'tcx> {}
|
||||
pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
|
||||
crate::MiriInterpCxExt<'mir, 'tcx>
|
||||
{
|
||||
fn emulate_x86_sse3_intrinsic(
|
||||
&mut self,
|
||||
link_name: Symbol,
|
||||
abi: Abi,
|
||||
args: &[OpTy<'tcx, Provenance>],
|
||||
dest: &PlaceTy<'tcx, Provenance>,
|
||||
) -> InterpResult<'tcx, EmulateByNameResult<'mir, 'tcx>> {
|
||||
let this = self.eval_context_mut();
|
||||
// Prefix should have already been checked.
|
||||
let unprefixed_name = link_name.as_str().strip_prefix("llvm.x86.sse3.").unwrap();
|
||||
|
||||
match unprefixed_name {
|
||||
// Used to implement the _mm_addsub_ps and _mm_addsub_pd functions.
|
||||
// Alternatingly add and subtract floating point (f32 or f64) from
|
||||
// `left` and `right`
|
||||
"addsub.ps" | "addsub.pd" => {
|
||||
let [left, right] =
|
||||
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
|
||||
|
||||
let (left, left_len) = this.operand_to_simd(left)?;
|
||||
let (right, right_len) = this.operand_to_simd(right)?;
|
||||
let (dest, dest_len) = this.place_to_simd(dest)?;
|
||||
|
||||
assert_eq!(dest_len, left_len);
|
||||
assert_eq!(dest_len, right_len);
|
||||
|
||||
for i in 0..dest_len {
|
||||
let left = this.read_immediate(&this.project_index(&left, i)?)?;
|
||||
let right = this.read_immediate(&this.project_index(&right, i)?)?;
|
||||
let dest = this.project_index(&dest, i)?;
|
||||
|
||||
// Even elements are subtracted and odd elements are added.
|
||||
let op = if i % 2 == 0 { mir::BinOp::Sub } else { mir::BinOp::Add };
|
||||
let res = this.wrapping_binary_op(op, &left, &right)?;
|
||||
|
||||
this.write_immediate(*res, &dest)?;
|
||||
}
|
||||
}
|
||||
// Used to implement the _mm_h{add,sub}_p{s,d} functions.
|
||||
// Horizontally add/subtract adjacent floating point values
|
||||
// in `left` and `right`.
|
||||
"hadd.ps" | "hadd.pd" | "hsub.ps" | "hsub.pd" => {
|
||||
let [left, right] =
|
||||
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
|
||||
|
||||
let which = match unprefixed_name {
|
||||
"hadd.ps" | "hadd.pd" => mir::BinOp::Add,
|
||||
"hsub.ps" | "hsub.pd" => mir::BinOp::Sub,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
horizontal_bin_op(this, which, /*saturating*/ false, left, right, dest)?;
|
||||
}
|
||||
// Used to implement the _mm_lddqu_si128 function.
|
||||
// Reads a 128-bit vector from an unaligned pointer. This intrinsic
|
||||
// is expected to perform better than a regular unaligned read when
|
||||
// the data crosses a cache line, but for Miri this is just a regular
|
||||
// unaligned read.
|
||||
"ldu.dq" => {
|
||||
let [src_ptr] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
|
||||
let src_ptr = this.read_pointer(src_ptr)?;
|
||||
let dest = dest.force_mplace(this)?;
|
||||
|
||||
this.mem_copy(
|
||||
src_ptr,
|
||||
Align::ONE,
|
||||
dest.ptr(),
|
||||
Align::ONE,
|
||||
dest.layout.size,
|
||||
/*nonoverlapping*/ true,
|
||||
)?;
|
||||
}
|
||||
_ => return Ok(EmulateByNameResult::NotSupported),
|
||||
}
|
||||
Ok(EmulateByNameResult::NeedsJumping)
|
||||
}
|
||||
}
|
199
src/tools/miri/src/shims/x86/ssse3.rs
Normal file
199
src/tools/miri/src/shims/x86/ssse3.rs
Normal file
@ -0,0 +1,199 @@
|
||||
use rustc_middle::mir;
|
||||
use rustc_span::Symbol;
|
||||
use rustc_target::spec::abi::Abi;
|
||||
|
||||
use super::horizontal_bin_op;
|
||||
use crate::*;
|
||||
use shims::foreign_items::EmulateByNameResult;
|
||||
|
||||
impl<'mir, 'tcx: 'mir> EvalContextExt<'mir, 'tcx> for crate::MiriInterpCx<'mir, 'tcx> {}
|
||||
pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
|
||||
crate::MiriInterpCxExt<'mir, 'tcx>
|
||||
{
|
||||
fn emulate_x86_ssse3_intrinsic(
|
||||
&mut self,
|
||||
link_name: Symbol,
|
||||
abi: Abi,
|
||||
args: &[OpTy<'tcx, Provenance>],
|
||||
dest: &PlaceTy<'tcx, Provenance>,
|
||||
) -> InterpResult<'tcx, EmulateByNameResult<'mir, 'tcx>> {
|
||||
let this = self.eval_context_mut();
|
||||
// Prefix should have already been checked.
|
||||
let unprefixed_name = link_name.as_str().strip_prefix("llvm.x86.ssse3.").unwrap();
|
||||
|
||||
match unprefixed_name {
|
||||
// Used to implement the _mm_abs_epi{8,16,32} functions.
|
||||
// Calculates the absolute value of packed 8/16/32-bit integers.
|
||||
"pabs.b.128" | "pabs.w.128" | "pabs.d.128" => {
|
||||
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
|
||||
|
||||
let (op, op_len) = this.operand_to_simd(op)?;
|
||||
let (dest, dest_len) = this.place_to_simd(dest)?;
|
||||
|
||||
assert_eq!(op_len, dest_len);
|
||||
|
||||
for i in 0..dest_len {
|
||||
let op = this.read_scalar(&this.project_index(&op, i)?)?;
|
||||
let dest = this.project_index(&dest, i)?;
|
||||
|
||||
// Converting to a host "i128" works since the input is always signed.
|
||||
let res = op.to_int(dest.layout.size)?.unsigned_abs();
|
||||
|
||||
this.write_scalar(Scalar::from_uint(res, dest.layout.size), &dest)?;
|
||||
}
|
||||
}
|
||||
// Used to implement the _mm_shuffle_epi8 intrinsic.
|
||||
// Shuffles bytes from `left` using `right` as pattern.
|
||||
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
|
||||
"pshuf.b.128" => {
|
||||
let [left, right] =
|
||||
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
|
||||
|
||||
let (left, left_len) = this.operand_to_simd(left)?;
|
||||
let (right, right_len) = this.operand_to_simd(right)?;
|
||||
let (dest, dest_len) = this.place_to_simd(dest)?;
|
||||
|
||||
assert_eq!(dest_len, left_len);
|
||||
assert_eq!(dest_len, right_len);
|
||||
|
||||
for i in 0..dest_len {
|
||||
let right = this.read_scalar(&this.project_index(&right, i)?)?.to_u8()?;
|
||||
let dest = this.project_index(&dest, i)?;
|
||||
|
||||
let res = if right & 0x80 == 0 {
|
||||
let j = right % 16; // index wraps around
|
||||
this.read_scalar(&this.project_index(&left, j.into())?)?
|
||||
} else {
|
||||
// If the highest bit in `right` is 1, write zero.
|
||||
Scalar::from_u8(0)
|
||||
};
|
||||
|
||||
this.write_scalar(res, &dest)?;
|
||||
}
|
||||
}
|
||||
// Used to implement the _mm_h{add,adds,sub}_epi{16,32} functions.
|
||||
// Horizontally add / add with saturation / subtract adjacent 16/32-bit
|
||||
// integer values in `left` and `right`.
|
||||
"phadd.w.128" | "phadd.sw.128" | "phadd.d.128" | "phsub.w.128" | "phsub.sw.128"
|
||||
| "phsub.d.128" => {
|
||||
let [left, right] =
|
||||
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
|
||||
|
||||
let (which, saturating) = match unprefixed_name {
|
||||
"phadd.w.128" | "phadd.d.128" => (mir::BinOp::Add, false),
|
||||
"phadd.sw.128" => (mir::BinOp::Add, true),
|
||||
"phsub.w.128" | "phsub.d.128" => (mir::BinOp::Sub, false),
|
||||
"phsub.sw.128" => (mir::BinOp::Sub, true),
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
horizontal_bin_op(this, which, saturating, left, right, dest)?;
|
||||
}
|
||||
// Used to implement the _mm_maddubs_epi16 function.
|
||||
// Multiplies packed 8-bit unsigned integers from `left` and packed
|
||||
// signed 8-bit integers from `right` into 16-bit signed integers. Then,
|
||||
// the saturating sum of the products with indices `2*i` and `2*i+1`
|
||||
// produces the output at index `i`.
|
||||
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
|
||||
"pmadd.ub.sw.128" => {
|
||||
let [left, right] =
|
||||
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
|
||||
|
||||
let (left, left_len) = this.operand_to_simd(left)?;
|
||||
let (right, right_len) = this.operand_to_simd(right)?;
|
||||
let (dest, dest_len) = this.place_to_simd(dest)?;
|
||||
|
||||
assert_eq!(left_len, right_len);
|
||||
assert_eq!(dest_len.checked_mul(2).unwrap(), left_len);
|
||||
|
||||
for i in 0..dest_len {
|
||||
let j1 = i.checked_mul(2).unwrap();
|
||||
let left1 = this.read_scalar(&this.project_index(&left, j1)?)?.to_u8()?;
|
||||
let right1 = this.read_scalar(&this.project_index(&right, j1)?)?.to_i8()?;
|
||||
|
||||
let j2 = j1.checked_add(1).unwrap();
|
||||
let left2 = this.read_scalar(&this.project_index(&left, j2)?)?.to_u8()?;
|
||||
let right2 = this.read_scalar(&this.project_index(&right, j2)?)?.to_i8()?;
|
||||
|
||||
let dest = this.project_index(&dest, i)?;
|
||||
|
||||
// Multiplication of a u8 and an i8 into an i16 cannot overflow.
|
||||
let mul1 = i16::from(left1).checked_mul(right1.into()).unwrap();
|
||||
let mul2 = i16::from(left2).checked_mul(right2.into()).unwrap();
|
||||
let res = mul1.saturating_add(mul2);
|
||||
|
||||
this.write_scalar(Scalar::from_i16(res), &dest)?;
|
||||
}
|
||||
}
|
||||
// Used to implement the _mm_mulhrs_epi16 function.
|
||||
// Multiplies packed 16-bit signed integer values, truncates the 32-bit
|
||||
// product to the 18 most significant bits by right-shifting, and then
|
||||
// divides the 18-bit value by 2 (rounding to nearest) by first adding
|
||||
// 1 and then taking the bits `1..=16`.
|
||||
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16
|
||||
"pmul.hr.sw.128" => {
|
||||
let [left, right] =
|
||||
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
|
||||
|
||||
let (left, left_len) = this.operand_to_simd(left)?;
|
||||
let (right, right_len) = this.operand_to_simd(right)?;
|
||||
let (dest, dest_len) = this.place_to_simd(dest)?;
|
||||
|
||||
assert_eq!(dest_len, left_len);
|
||||
assert_eq!(dest_len, right_len);
|
||||
|
||||
for i in 0..dest_len {
|
||||
let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?;
|
||||
let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?;
|
||||
let dest = this.project_index(&dest, i)?;
|
||||
|
||||
let res = (i32::from(left).checked_mul(right.into()).unwrap() >> 14)
|
||||
.checked_add(1)
|
||||
.unwrap()
|
||||
>> 1;
|
||||
|
||||
// The result of this operation can overflow a signed 16-bit integer.
|
||||
// When `left` and `right` are -0x8000, the result is 0x8000.
|
||||
#[allow(clippy::cast_possible_truncation)]
|
||||
let res = res as i16;
|
||||
|
||||
this.write_scalar(Scalar::from_i16(res), &dest)?;
|
||||
}
|
||||
}
|
||||
// Used to implement the _mm_sign_epi{8,16,32} functions.
|
||||
// Negates elements from `left` when the corresponding element in
|
||||
// `right` is negative. If an element from `right` is zero, zero
|
||||
// is writen to the corresponding output element.
|
||||
// Basically, we multiply `left` with `right.signum()`.
|
||||
"psign.b.128" | "psign.w.128" | "psign.d.128" => {
|
||||
let [left, right] =
|
||||
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
|
||||
|
||||
let (left, left_len) = this.operand_to_simd(left)?;
|
||||
let (right, right_len) = this.operand_to_simd(right)?;
|
||||
let (dest, dest_len) = this.place_to_simd(dest)?;
|
||||
|
||||
assert_eq!(dest_len, left_len);
|
||||
assert_eq!(dest_len, right_len);
|
||||
|
||||
for i in 0..dest_len {
|
||||
let dest = this.project_index(&dest, i)?;
|
||||
let left = this.read_immediate(&this.project_index(&left, i)?)?;
|
||||
let right = this
|
||||
.read_scalar(&this.project_index(&right, i)?)?
|
||||
.to_int(dest.layout.size)?;
|
||||
|
||||
let res = this.wrapping_binary_op(
|
||||
mir::BinOp::Mul,
|
||||
&left,
|
||||
&ImmTy::from_int(right.signum(), dest.layout),
|
||||
)?;
|
||||
|
||||
this.write_immediate(*res, &dest)?;
|
||||
}
|
||||
}
|
||||
_ => return Ok(EmulateByNameResult::NotSupported),
|
||||
}
|
||||
Ok(EmulateByNameResult::NeedsJumping)
|
||||
}
|
||||
}
|
395
src/tools/miri/tests/pass/intrinsics-x86-sse3-ssse3.rs
Normal file
395
src/tools/miri/tests/pass/intrinsics-x86-sse3-ssse3.rs
Normal file
@ -0,0 +1,395 @@
|
||||
// Ignore everything except x86 and x86_64
|
||||
// Any additional targets that are added to CI should be ignored here
|
||||
// (We cannot use `cfg`-based tricks here since the `target-feature` flags below only work on x86.)
|
||||
//@ignore-target-aarch64
|
||||
//@ignore-target-arm
|
||||
//@ignore-target-avr
|
||||
//@ignore-target-s390x
|
||||
//@ignore-target-thumbv7em
|
||||
//@ignore-target-wasm32
|
||||
// SSSE3 implicitly enables SSE3
|
||||
//@compile-flags: -C target-feature=+ssse3
|
||||
|
||||
use core::mem::transmute;
|
||||
#[cfg(target_arch = "x86")]
|
||||
use std::arch::x86::*;
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
use std::arch::x86_64::*;
|
||||
|
||||
fn main() {
|
||||
// SSSE3 implicitly enables SSE3, still check it to be sure
|
||||
assert!(is_x86_feature_detected!("sse3"));
|
||||
assert!(is_x86_feature_detected!("ssse3"));
|
||||
|
||||
unsafe {
|
||||
test_sse3();
|
||||
test_ssse3();
|
||||
}
|
||||
}
|
||||
|
||||
#[target_feature(enable = "sse3")]
|
||||
unsafe fn test_sse3() {
|
||||
// Mostly copied from library/stdarch/crates/core_arch/src/x86/sse3.rs
|
||||
|
||||
#[target_feature(enable = "sse3")]
|
||||
unsafe fn test_mm_addsub_ps() {
|
||||
let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
|
||||
let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
|
||||
let r = _mm_addsub_ps(a, b);
|
||||
assert_eq_m128(r, _mm_setr_ps(99.0, 25.0, 0.0, -15.0));
|
||||
}
|
||||
test_mm_addsub_ps();
|
||||
|
||||
#[target_feature(enable = "sse3")]
|
||||
unsafe fn test_mm_addsub_pd() {
|
||||
let a = _mm_setr_pd(-1.0, 5.0);
|
||||
let b = _mm_setr_pd(-100.0, 20.0);
|
||||
let r = _mm_addsub_pd(a, b);
|
||||
assert_eq_m128d(r, _mm_setr_pd(99.0, 25.0));
|
||||
}
|
||||
test_mm_addsub_pd();
|
||||
|
||||
#[target_feature(enable = "sse3")]
|
||||
unsafe fn test_mm_hadd_ps() {
|
||||
let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
|
||||
let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
|
||||
let r = _mm_hadd_ps(a, b);
|
||||
assert_eq_m128(r, _mm_setr_ps(4.0, -10.0, -80.0, -5.0));
|
||||
}
|
||||
test_mm_hadd_ps();
|
||||
|
||||
#[target_feature(enable = "sse3")]
|
||||
unsafe fn test_mm_hadd_pd() {
|
||||
let a = _mm_setr_pd(-1.0, 5.0);
|
||||
let b = _mm_setr_pd(-100.0, 20.0);
|
||||
let r = _mm_hadd_pd(a, b);
|
||||
assert_eq_m128d(r, _mm_setr_pd(4.0, -80.0));
|
||||
}
|
||||
test_mm_hadd_pd();
|
||||
|
||||
#[target_feature(enable = "sse3")]
|
||||
unsafe fn test_mm_hsub_ps() {
|
||||
let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
|
||||
let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
|
||||
let r = _mm_hsub_ps(a, b);
|
||||
assert_eq_m128(r, _mm_setr_ps(-6.0, 10.0, -120.0, 5.0));
|
||||
}
|
||||
test_mm_hsub_ps();
|
||||
|
||||
#[target_feature(enable = "sse3")]
|
||||
unsafe fn test_mm_hsub_pd() {
|
||||
let a = _mm_setr_pd(-1.0, 5.0);
|
||||
let b = _mm_setr_pd(-100.0, 20.0);
|
||||
let r = _mm_hsub_pd(a, b);
|
||||
assert_eq_m128d(r, _mm_setr_pd(-6.0, -120.0));
|
||||
}
|
||||
test_mm_hsub_pd();
|
||||
|
||||
#[target_feature(enable = "sse3")]
|
||||
unsafe fn test_mm_lddqu_si128() {
|
||||
let a = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
let r = _mm_lddqu_si128(&a);
|
||||
assert_eq_m128i(a, r);
|
||||
}
|
||||
test_mm_lddqu_si128();
|
||||
}
|
||||
|
||||
#[target_feature(enable = "ssse3")]
|
||||
unsafe fn test_ssse3() {
|
||||
// Mostly copied from library/stdarch/crates/core_arch/src/x86/ssse3.rs
|
||||
|
||||
#[target_feature(enable = "ssse3")]
|
||||
unsafe fn test_mm_abs_epi8() {
|
||||
let r = _mm_abs_epi8(_mm_set1_epi8(-5));
|
||||
assert_eq_m128i(r, _mm_set1_epi8(5));
|
||||
}
|
||||
test_mm_abs_epi8();
|
||||
|
||||
#[target_feature(enable = "ssse3")]
|
||||
unsafe fn test_mm_abs_epi16() {
|
||||
let r = _mm_abs_epi16(_mm_set1_epi16(-5));
|
||||
assert_eq_m128i(r, _mm_set1_epi16(5));
|
||||
}
|
||||
test_mm_abs_epi16();
|
||||
|
||||
#[target_feature(enable = "ssse3")]
|
||||
unsafe fn test_mm_abs_epi32() {
|
||||
let r = _mm_abs_epi32(_mm_set1_epi32(-5));
|
||||
assert_eq_m128i(r, _mm_set1_epi32(5));
|
||||
}
|
||||
test_mm_abs_epi32();
|
||||
|
||||
#[target_feature(enable = "ssse3")]
|
||||
unsafe fn test_mm_shuffle_epi8() {
|
||||
let a = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
let b = _mm_setr_epi8(4, 128_u8 as i8, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0);
|
||||
let expected = _mm_setr_epi8(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1);
|
||||
let r = _mm_shuffle_epi8(a, b);
|
||||
assert_eq_m128i(r, expected);
|
||||
|
||||
// Test indices greater than 15 wrapping around
|
||||
let b = _mm_add_epi8(b, _mm_set1_epi8(32));
|
||||
let r = _mm_shuffle_epi8(a, b);
|
||||
assert_eq_m128i(r, expected);
|
||||
}
|
||||
test_mm_shuffle_epi8();
|
||||
|
||||
#[target_feature(enable = "ssse3")]
|
||||
unsafe fn test_mm_hadd_epi16() {
|
||||
let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let b = _mm_setr_epi16(4, 128, 4, 3, 24, 12, 6, 19);
|
||||
let expected = _mm_setr_epi16(3, 7, 11, 15, 132, 7, 36, 25);
|
||||
let r = _mm_hadd_epi16(a, b);
|
||||
assert_eq_m128i(r, expected);
|
||||
|
||||
// Test wrapping on overflow
|
||||
let a = _mm_setr_epi16(i16::MAX, 1, i16::MAX, 2, i16::MAX, 3, i16::MAX, 4);
|
||||
let b = _mm_setr_epi16(i16::MIN, -1, i16::MIN, -2, i16::MIN, -3, i16::MIN, -4);
|
||||
let expected = _mm_setr_epi16(
|
||||
i16::MIN,
|
||||
i16::MIN + 1,
|
||||
i16::MIN + 2,
|
||||
i16::MIN + 3,
|
||||
i16::MAX,
|
||||
i16::MAX - 1,
|
||||
i16::MAX - 2,
|
||||
i16::MAX - 3,
|
||||
);
|
||||
let r = _mm_hadd_epi16(a, b);
|
||||
assert_eq_m128i(r, expected);
|
||||
}
|
||||
test_mm_hadd_epi16();
|
||||
|
||||
#[target_feature(enable = "ssse3")]
|
||||
unsafe fn test_mm_hadds_epi16() {
|
||||
let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let b = _mm_setr_epi16(4, 128, 4, 3, 32767, 1, -32768, -1);
|
||||
let expected = _mm_setr_epi16(3, 7, 11, 15, 132, 7, 32767, -32768);
|
||||
let r = _mm_hadds_epi16(a, b);
|
||||
assert_eq_m128i(r, expected);
|
||||
|
||||
// Test saturating on overflow
|
||||
let a = _mm_setr_epi16(i16::MAX, 1, i16::MAX, 2, i16::MAX, 3, i16::MAX, 4);
|
||||
let b = _mm_setr_epi16(i16::MIN, -1, i16::MIN, -2, i16::MIN, -3, i16::MIN, -4);
|
||||
let expected = _mm_setr_epi16(
|
||||
i16::MAX,
|
||||
i16::MAX,
|
||||
i16::MAX,
|
||||
i16::MAX,
|
||||
i16::MIN,
|
||||
i16::MIN,
|
||||
i16::MIN,
|
||||
i16::MIN,
|
||||
);
|
||||
let r = _mm_hadds_epi16(a, b);
|
||||
assert_eq_m128i(r, expected);
|
||||
}
|
||||
test_mm_hadds_epi16();
|
||||
|
||||
#[target_feature(enable = "ssse3")]
|
||||
unsafe fn test_mm_hadd_epi32() {
|
||||
let a = _mm_setr_epi32(1, 2, 3, 4);
|
||||
let b = _mm_setr_epi32(4, 128, 4, 3);
|
||||
let expected = _mm_setr_epi32(3, 7, 132, 7);
|
||||
let r = _mm_hadd_epi32(a, b);
|
||||
assert_eq_m128i(r, expected);
|
||||
|
||||
// Test wrapping on overflow
|
||||
let a = _mm_setr_epi32(i32::MAX, 1, i32::MAX, 2);
|
||||
let b = _mm_setr_epi32(i32::MIN, -1, i32::MIN, -2);
|
||||
let expected = _mm_setr_epi32(i32::MIN, i32::MIN + 1, i32::MAX, i32::MAX - 1);
|
||||
let r = _mm_hadd_epi32(a, b);
|
||||
assert_eq_m128i(r, expected);
|
||||
}
|
||||
test_mm_hadd_epi32();
|
||||
|
||||
#[target_feature(enable = "ssse3")]
|
||||
unsafe fn test_mm_hsub_epi16() {
|
||||
let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let b = _mm_setr_epi16(4, 128, 4, 3, 24, 12, 6, 19);
|
||||
let expected = _mm_setr_epi16(-1, -1, -1, -1, -124, 1, 12, -13);
|
||||
let r = _mm_hsub_epi16(a, b);
|
||||
assert_eq_m128i(r, expected);
|
||||
|
||||
// Test wrapping on overflow
|
||||
let a = _mm_setr_epi16(i16::MAX, -1, i16::MAX, -2, i16::MAX, -3, i16::MAX, -4);
|
||||
let b = _mm_setr_epi16(i16::MIN, 1, i16::MIN, 2, i16::MIN, 3, i16::MIN, 4);
|
||||
let expected = _mm_setr_epi16(
|
||||
i16::MIN,
|
||||
i16::MIN + 1,
|
||||
i16::MIN + 2,
|
||||
i16::MIN + 3,
|
||||
i16::MAX,
|
||||
i16::MAX - 1,
|
||||
i16::MAX - 2,
|
||||
i16::MAX - 3,
|
||||
);
|
||||
let r = _mm_hsub_epi16(a, b);
|
||||
assert_eq_m128i(r, expected);
|
||||
}
|
||||
test_mm_hsub_epi16();
|
||||
|
||||
#[target_feature(enable = "ssse3")]
|
||||
unsafe fn test_mm_hsubs_epi16() {
|
||||
let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let b = _mm_setr_epi16(4, 128, 4, 3, 32767, -1, -32768, 1);
|
||||
let expected = _mm_setr_epi16(-1, -1, -1, -1, -124, 1, 32767, -32768);
|
||||
let r = _mm_hsubs_epi16(a, b);
|
||||
assert_eq_m128i(r, expected);
|
||||
|
||||
// Test saturating on overflow
|
||||
let a = _mm_setr_epi16(i16::MAX, -1, i16::MAX, -2, i16::MAX, -3, i16::MAX, -4);
|
||||
let b = _mm_setr_epi16(i16::MIN, 1, i16::MIN, 2, i16::MIN, 3, i16::MIN, 4);
|
||||
let expected = _mm_setr_epi16(
|
||||
i16::MAX,
|
||||
i16::MAX,
|
||||
i16::MAX,
|
||||
i16::MAX,
|
||||
i16::MIN,
|
||||
i16::MIN,
|
||||
i16::MIN,
|
||||
i16::MIN,
|
||||
);
|
||||
let r = _mm_hsubs_epi16(a, b);
|
||||
assert_eq_m128i(r, expected);
|
||||
}
|
||||
test_mm_hsubs_epi16();
|
||||
|
||||
#[target_feature(enable = "ssse3")]
|
||||
unsafe fn test_mm_hsub_epi32() {
|
||||
let a = _mm_setr_epi32(1, 2, 3, 4);
|
||||
let b = _mm_setr_epi32(4, 128, 4, 3);
|
||||
let expected = _mm_setr_epi32(-1, -1, -124, 1);
|
||||
let r = _mm_hsub_epi32(a, b);
|
||||
assert_eq_m128i(r, expected);
|
||||
|
||||
// Test wrapping on overflow
|
||||
let a = _mm_setr_epi32(i32::MAX, -1, i32::MAX, -2);
|
||||
let b = _mm_setr_epi32(i32::MIN, 1, i32::MIN, 2);
|
||||
let expected = _mm_setr_epi32(i32::MIN, i32::MIN + 1, i32::MAX, i32::MAX - 1);
|
||||
let r = _mm_hsub_epi32(a, b);
|
||||
assert_eq_m128i(r, expected);
|
||||
}
|
||||
test_mm_hsub_epi32();
|
||||
|
||||
#[target_feature(enable = "ssse3")]
|
||||
unsafe fn test_mm_maddubs_epi16() {
|
||||
let a = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
let b = _mm_setr_epi8(4, 63, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0);
|
||||
let expected = _mm_setr_epi16(130, 24, 192, 194, 158, 175, 66, 120);
|
||||
let r = _mm_maddubs_epi16(a, b);
|
||||
assert_eq_m128i(r, expected);
|
||||
|
||||
// Test widening and saturation
|
||||
let a = _mm_setr_epi8(
|
||||
u8::MAX as i8,
|
||||
u8::MAX as i8,
|
||||
u8::MAX as i8,
|
||||
u8::MAX as i8,
|
||||
u8::MAX as i8,
|
||||
u8::MAX as i8,
|
||||
100,
|
||||
100,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
);
|
||||
let b = _mm_setr_epi8(
|
||||
i8::MAX,
|
||||
i8::MAX,
|
||||
i8::MAX,
|
||||
i8::MIN,
|
||||
i8::MIN,
|
||||
i8::MIN,
|
||||
50,
|
||||
15,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
);
|
||||
let expected = _mm_setr_epi16(i16::MAX, -255, i16::MIN, 6500, 0, 0, 0, 0);
|
||||
let r = _mm_maddubs_epi16(a, b);
|
||||
assert_eq_m128i(r, expected);
|
||||
}
|
||||
test_mm_maddubs_epi16();
|
||||
|
||||
#[target_feature(enable = "ssse3")]
|
||||
unsafe fn test_mm_mulhrs_epi16() {
|
||||
let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let b = _mm_setr_epi16(4, 128, 4, 3, 32767, -1, -32768, 1);
|
||||
let expected = _mm_setr_epi16(0, 0, 0, 0, 5, 0, -7, 0);
|
||||
let r = _mm_mulhrs_epi16(a, b);
|
||||
assert_eq_m128i(r, expected);
|
||||
|
||||
// Test extreme values
|
||||
let a = _mm_setr_epi16(i16::MAX, i16::MIN, i16::MIN, 0, 0, 0, 0, 0);
|
||||
let b = _mm_setr_epi16(i16::MAX, i16::MIN, i16::MAX, 0, 0, 0, 0, 0);
|
||||
let expected = _mm_setr_epi16(i16::MAX - 1, i16::MIN, -i16::MAX, 0, 0, 0, 0, 0);
|
||||
let r = _mm_mulhrs_epi16(a, b);
|
||||
assert_eq_m128i(r, expected);
|
||||
}
|
||||
test_mm_mulhrs_epi16();
|
||||
|
||||
#[target_feature(enable = "ssse3")]
|
||||
unsafe fn test_mm_sign_epi8() {
|
||||
let a = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -14, -15, 16);
|
||||
let b = _mm_setr_epi8(4, 63, -4, 3, 24, 12, -6, -19, 12, 5, -5, 10, 4, 1, -8, 0);
|
||||
let expected = _mm_setr_epi8(1, 2, -3, 4, 5, 6, -7, -8, 9, 10, -11, 12, 13, -14, 15, 0);
|
||||
let r = _mm_sign_epi8(a, b);
|
||||
assert_eq_m128i(r, expected);
|
||||
}
|
||||
test_mm_sign_epi8();
|
||||
|
||||
#[target_feature(enable = "ssse3")]
|
||||
unsafe fn test_mm_sign_epi16() {
|
||||
let a = _mm_setr_epi16(1, 2, 3, 4, -5, -6, 7, 8);
|
||||
let b = _mm_setr_epi16(4, 128, 0, 3, 1, -1, -2, 1);
|
||||
let expected = _mm_setr_epi16(1, 2, 0, 4, -5, 6, -7, 8);
|
||||
let r = _mm_sign_epi16(a, b);
|
||||
assert_eq_m128i(r, expected);
|
||||
}
|
||||
test_mm_sign_epi16();
|
||||
|
||||
#[target_feature(enable = "ssse3")]
|
||||
unsafe fn test_mm_sign_epi32() {
|
||||
let a = _mm_setr_epi32(-1, 2, 3, 4);
|
||||
let b = _mm_setr_epi32(1, -1, 1, 0);
|
||||
let expected = _mm_setr_epi32(-1, -2, 3, 0);
|
||||
let r = _mm_sign_epi32(a, b);
|
||||
assert_eq_m128i(r, expected);
|
||||
}
|
||||
test_mm_sign_epi32();
|
||||
}
|
||||
|
||||
/// Panics with both vectors printed unless all four `f32` lanes of `a`
/// and `b` compare equal under `_mm_cmpeq_ps`.
///
/// Because the comparison is a floating-point equality, a lane holding
/// NaN never compares equal, not even to itself.
#[track_caller]
#[target_feature(enable = "sse")]
unsafe fn assert_eq_m128(a: __m128, b: __m128) {
    let r = _mm_cmpeq_ps(a, b);
    // One mask bit per lane; all four must be set.
    if _mm_movemask_ps(r) != 0b1111 {
        panic!("{:?} != {:?}", a, b);
    }
}
|
||||
|
||||
/// Panics with both vectors printed unless both `f64` lanes of `a` and
/// `b` compare equal under `_mm_cmpeq_pd`.
///
/// Because the comparison is a floating-point equality, a lane holding
/// NaN never compares equal, not even to itself.
#[track_caller]
#[target_feature(enable = "sse2")]
unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
    // One mask bit per lane; both must be set.
    if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 {
        panic!("{:?} != {:?}", a, b);
    }
}
|
||||
|
||||
/// Asserts that `a` and `b` are bit-for-bit identical by comparing them
/// as two `u64` halves.
#[track_caller]
#[target_feature(enable = "sse2")]
pub unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) {
    assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b))
}
|
Loading…
x
Reference in New Issue
Block a user