Auto merge of #3055 - eduardosm:x86-sse2-intrinsics, r=RalfJung

Implement some `llvm.x86.sse2.*` intrinsics and add tests

Continuation of https://github.com/rust-lang/miri/pull/2989 with SSE2 intrinsics.

Thankfully, a significant number of SSE2 functions use `simd_*` intrinsics, which are already implemented in Miri.
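
For the rest, here is a rough sketch of the split (illustrative only; the exact stdarch code may differ): element-wise operations are defined in `core::arch` on top of the generic `simd_*` intrinsics that Miri already handles, while the remaining operations lower to vendor-specific `llvm.x86.sse2.*` calls, which are the ones emulated in this PR.

    // Roughly how stdarch defines an element-wise SSE2 intrinsic (simplified;
    // `simd_add` is a platform intrinsic and `as_i16x8` an internal helper).
    // No new shim is needed because Miri already implements `simd_add`:
    pub unsafe fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i {
        transmute(simd_add(a.as_i16x8(), b.as_i16x8()))
    }
    // By contrast, `_mm_avg_epu8` lowers to `llvm.x86.sse2.pavg.b`, one of the
    // intrinsics emulated below.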
bors 2023-09-12 18:55:17 +00:00
commit 8cd31eadba
6 changed files with 1893 additions and 56 deletions

View File

@ -1037,6 +1037,11 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
this, link_name, abi, args, dest,
);
}
name if name.starts_with("llvm.x86.sse2.") => {
return shims::x86::sse2::EvalContextExt::emulate_x86_sse2_intrinsic(
this, link_name, abi, args, dest,
);
}
// Platform-specific shims
_ =>

View File

@ -1 +1,45 @@
use crate::InterpResult;
pub(super) mod sse;
pub(super) mod sse2;
/// Floating point comparison operation
///
/// <https://www.felixcloutier.com/x86/cmpss>
/// <https://www.felixcloutier.com/x86/cmpps>
/// <https://www.felixcloutier.com/x86/cmpsd>
/// <https://www.felixcloutier.com/x86/cmppd>
#[derive(Copy, Clone)]
enum FloatCmpOp {
Eq,
Lt,
Le,
Unord,
Neq,
/// Not less-than
Nlt,
/// Not less-or-equal
Nle,
/// Ordered, i.e. neither of them is NaN
Ord,
}
impl FloatCmpOp {
/// Convert from the `imm` argument used to specify the comparison
/// operation in intrinsics such as `llvm.x86.sse.cmp.ss`.
fn from_intrinsic_imm(imm: i8, intrinsic: &str) -> InterpResult<'_, Self> {
match imm {
0 => Ok(Self::Eq),
1 => Ok(Self::Lt),
2 => Ok(Self::Le),
3 => Ok(Self::Unord),
4 => Ok(Self::Neq),
5 => Ok(Self::Nlt),
6 => Ok(Self::Nle),
7 => Ok(Self::Ord),
imm => {
throw_unsup_format!("invalid `imm` parameter of {intrinsic}: {imm}");
}
}
}
}
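// Note: the `imm` byte comes from stdarch, which encodes the comparison
// predicate of the user-facing intrinsic. A simplified sketch (the exact
// stdarch code may differ):
//
//     pub unsafe fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d {
//         cmppd(a, b, 1) // 1 corresponds to `Self::Lt` above
//     }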

View File

@ -5,6 +5,7 @@ use rustc_target::spec::abi::Abi;
use rand::Rng as _;
use super::FloatCmpOp;
use crate::*;
use shims::foreign_items::EmulateByNameResult;
@ -78,7 +79,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
unary_op_ss(this, which, op, dest)?;
}
// Used to implement _mm_{sqrt,rcp,rsqrt}_ss functions.
// Used to implement _mm_{sqrt,rcp,rsqrt}_ps functions.
// Performs the operations on all components of `op`.
"sqrt.ps" | "rcp.ps" | "rsqrt.ps" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
@ -100,22 +101,10 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
let [left, right, imm] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let which = match this.read_scalar(imm)?.to_i8()? {
0 => FloatBinOp::Cmp(FloatCmpOp::Eq),
1 => FloatBinOp::Cmp(FloatCmpOp::Lt),
2 => FloatBinOp::Cmp(FloatCmpOp::Le),
3 => FloatBinOp::Cmp(FloatCmpOp::Unord),
4 => FloatBinOp::Cmp(FloatCmpOp::Neq),
5 => FloatBinOp::Cmp(FloatCmpOp::Nlt),
6 => FloatBinOp::Cmp(FloatCmpOp::Nle),
7 => FloatBinOp::Cmp(FloatCmpOp::Ord),
imm => {
throw_unsup_format!(
"invalid 3rd parameter of llvm.x86.sse.cmp.ps: {}",
imm
);
}
};
let which = FloatBinOp::Cmp(FloatCmpOp::from_intrinsic_imm(
this.read_scalar(imm)?.to_i8()?,
"llvm.x86.sse.cmp.ss",
)?);
bin_op_ss(this, which, left, right, dest)?;
}
@ -127,26 +116,14 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
let [left, right, imm] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let which = match this.read_scalar(imm)?.to_i8()? {
0 => FloatBinOp::Cmp(FloatCmpOp::Eq),
1 => FloatBinOp::Cmp(FloatCmpOp::Lt),
2 => FloatBinOp::Cmp(FloatCmpOp::Le),
3 => FloatBinOp::Cmp(FloatCmpOp::Unord),
4 => FloatBinOp::Cmp(FloatCmpOp::Neq),
5 => FloatBinOp::Cmp(FloatCmpOp::Nlt),
6 => FloatBinOp::Cmp(FloatCmpOp::Nle),
7 => FloatBinOp::Cmp(FloatCmpOp::Ord),
imm => {
throw_unsup_format!(
"invalid 3rd parameter of llvm.x86.sse.cmp.ps: {}",
imm
);
}
};
let which = FloatBinOp::Cmp(FloatCmpOp::from_intrinsic_imm(
this.read_scalar(imm)?.to_i8()?,
"llvm.x86.sse.cmp.ps",
)?);
bin_op_ps(this, which, left, right, dest)?;
}
// Used to implement _mm_{,u}comi{eq,lt,le,gt,ge,neq}_ps functions.
// Used to implement _mm_{,u}comi{eq,lt,le,gt,ge,neq}_ss functions.
// Compares the first component of `left` and `right` and returns
// a scalar value (0 or 1).
"comieq.ss" | "comilt.ss" | "comile.ss" | "comigt.ss" | "comige.ss" | "comineq.ss"
@ -292,6 +269,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
let op = this.read_scalar(&this.project_index(&op, i)?)?;
let op = op.to_u32()?;
// Extract the highest bit of `op` and place it in the `i`-th bit of `res`
res |= (op >> 31) << i;
}
@ -303,25 +281,6 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
}
}
/// Floating point comparison operation
///
/// <https://www.felixcloutier.com/x86/cmpss>
/// <https://www.felixcloutier.com/x86/cmpps>
#[derive(Copy, Clone)]
enum FloatCmpOp {
Eq,
Lt,
Le,
Unord,
Neq,
/// Not less-than
Nlt,
/// Not less-or-equal
Nle,
/// Ordered, i.e. neither of them is NaN
Ord,
}
#[derive(Copy, Clone)]
enum FloatBinOp {
/// Arithmetic operation
@ -436,8 +395,8 @@ fn bin_op_ss<'tcx>(
Ok(())
}
/// Performs `which` operation on each component of `left`, and
/// `right` storing the result is stored in `dest`.
/// Performs `which` operation on each component of `left` and
/// `right`, storing the result in `dest`.
fn bin_op_ps<'tcx>(
this: &mut crate::MiriInterpCx<'_, 'tcx>,
which: FloatBinOp,

View File

@ -0,0 +1,982 @@
use rustc_apfloat::{
ieee::{Double, Single},
Float as _, FloatConvert as _,
};
use rustc_middle::ty::layout::LayoutOf as _;
use rustc_middle::ty::Ty;
use rustc_span::Symbol;
use rustc_target::abi::Size;
use rustc_target::spec::abi::Abi;
use super::FloatCmpOp;
use crate::*;
use shims::foreign_items::EmulateByNameResult;
impl<'mir, 'tcx: 'mir> EvalContextExt<'mir, 'tcx> for crate::MiriInterpCx<'mir, 'tcx> {}
pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
fn emulate_x86_sse2_intrinsic(
&mut self,
link_name: Symbol,
abi: Abi,
args: &[OpTy<'tcx, Provenance>],
dest: &PlaceTy<'tcx, Provenance>,
) -> InterpResult<'tcx, EmulateByNameResult<'mir, 'tcx>> {
let this = self.eval_context_mut();
// Prefix should have already been checked.
let unprefixed_name = link_name.as_str().strip_prefix("llvm.x86.sse2.").unwrap();
// These intrinsics operate on 128-bit (f32x4, f64x2, i8x16, i16x8, i32x4, i64x2) SIMD
// vectors unless stated otherwise.
// Many intrinsic names are suffixed with "ps" (packed single), "ss" (scalar single),
// "pd" (packed double) or "sd" (scalar double), where single means single-precision
// floating point (f32) and double means double-precision floating point (f64). "ps"
// and "pd" mean that the operation is performed on each element of the vector, while
// "ss" and "sd" mean that the operation is performed only on the first element, copying
// the remaining elements from the input vector (for binary operations, from the left-hand
// side).
// Intrinsics suffixed with "epiX" or "epuX" operate on vectors of X-bit signed or
// unsigned integers.
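// For example, `_mm_add_pd` ("pd") operates on both f64 elements of its
// operands, `_mm_add_sd` ("sd") operates only on the first element and copies
// the second from the left-hand side, and `_mm_add_epi16` ("epi16") operates
// on eight 16-bit elements.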
match unprefixed_name {
// Used to implement the _mm_avg_epu8 function.
// Averages packed unsigned 8-bit integers in `left` and `right`.
"pavg.b" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
assert_eq!(dest_len, right_len);
for i in 0..dest_len {
let left = this.read_scalar(&this.project_index(&left, i)?)?.to_u8()?;
let right = this.read_scalar(&this.project_index(&right, i)?)?.to_u8()?;
let dest = this.project_index(&dest, i)?;
// Values are expanded from u8 to u16, so adds cannot overflow.
let res = u16::from(left)
.checked_add(u16::from(right))
.unwrap()
.checked_add(1)
.unwrap()
/ 2;
this.write_scalar(Scalar::from_u8(res.try_into().unwrap()), &dest)?;
}
}
// Used to implement the _mm_avg_epu16 function.
// Averages packed unsigned 16-bit integers in `left` and `right`.
"pavg.w" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
assert_eq!(dest_len, right_len);
for i in 0..dest_len {
let left = this.read_scalar(&this.project_index(&left, i)?)?.to_u16()?;
let right = this.read_scalar(&this.project_index(&right, i)?)?.to_u16()?;
let dest = this.project_index(&dest, i)?;
// Values are expanded from u16 to u32, so adds cannot overflow.
let res = u32::from(left)
.checked_add(u32::from(right))
.unwrap()
.checked_add(1)
.unwrap()
/ 2;
this.write_scalar(Scalar::from_u16(res.try_into().unwrap()), &dest)?;
}
}
// Used to implement the _mm_mulhi_epi16 function.
"pmulh.w" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
assert_eq!(dest_len, right_len);
for i in 0..dest_len {
let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?;
let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?;
let dest = this.project_index(&dest, i)?;
// Values are expanded from i16 to i32, so multiplication cannot overflow.
let res = i32::from(left).checked_mul(i32::from(right)).unwrap() >> 16;
this.write_scalar(Scalar::from_int(res, Size::from_bits(16)), &dest)?;
}
}
// Used to implement the _mm_mulhi_epu16 function.
"pmulhu.w" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
assert_eq!(dest_len, right_len);
for i in 0..dest_len {
let left = this.read_scalar(&this.project_index(&left, i)?)?.to_u16()?;
let right = this.read_scalar(&this.project_index(&right, i)?)?.to_u16()?;
let dest = this.project_index(&dest, i)?;
// Values are expanded from u16 to u32, so multiplication cannot overflow.
let res = u32::from(left).checked_mul(u32::from(right)).unwrap() >> 16;
this.write_scalar(Scalar::from_u16(res.try_into().unwrap()), &dest)?;
}
}
// Used to implement the _mm_mul_epu32 function.
// Multiplies the low unsigned 32-bit integers from each packed
// 64-bit element and stores the result as 64-bit unsigned integers.
"pmulu.dq" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
// left and right are u32x4, dest is u64x2
assert_eq!(left_len, 4);
assert_eq!(right_len, 4);
assert_eq!(dest_len, 2);
for i in 0..dest_len {
let op_i = i.checked_mul(2).unwrap();
let left = this.read_scalar(&this.project_index(&left, op_i)?)?.to_u32()?;
let right = this.read_scalar(&this.project_index(&right, op_i)?)?.to_u32()?;
let dest = this.project_index(&dest, i)?;
// The multiplication cannot overflow because the operands are
// expanded from 32-bit to 64-bit.
let res = u64::from(left).checked_mul(u64::from(right)).unwrap();
this.write_scalar(Scalar::from_u64(res), &dest)?;
}
}
// Used to implement the _mm_sad_epu8 function.
// Computes the absolute differences of packed unsigned 8-bit integers in `a`
// and `b`, then horizontally sum each consecutive 8 differences to produce
// two unsigned 16-bit integers, and pack these unsigned 16-bit integers in
// the low 16 bits of 64-bit elements returned.
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
"psad.bw" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
// left and right are u8x16, dest is u64x2
assert_eq!(left_len, right_len);
assert_eq!(left_len, 16);
assert_eq!(dest_len, 2);
for i in 0..dest_len {
let dest = this.project_index(&dest, i)?;
let mut res: u16 = 0;
let n = left_len.checked_div(dest_len).unwrap();
for j in 0..n {
let op_i = j.checked_add(i.checked_mul(n).unwrap()).unwrap();
let left = this.read_scalar(&this.project_index(&left, op_i)?)?.to_u8()?;
let right =
this.read_scalar(&this.project_index(&right, op_i)?)?.to_u8()?;
res = res.checked_add(left.abs_diff(right).into()).unwrap();
}
this.write_scalar(Scalar::from_u64(res.into()), &dest)?;
}
}
// Used to implement the _mm_{sll,srl,sra}_epi16 functions.
// Shifts 16-bit packed integers in left by the amount in right.
// Both operands are vectors of 16-bit integers. However, right is
// interpreted as a single 64-bit integer (remaining bits are ignored).
// For logic shifts, when right is larger than 15, zero is produced.
// For arithmetic shifts, when right is larger than 15, the sign bit
// is copied to remaining bits.
"psll.w" | "psrl.w" | "psra.w" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
assert_eq!(dest_len, right_len);
enum ShiftOp {
Sll,
Srl,
Sra,
}
let which = match unprefixed_name {
"psll.w" => ShiftOp::Sll,
"psrl.w" => ShiftOp::Srl,
"psra.w" => ShiftOp::Sra,
_ => unreachable!(),
};
// Get the 64-bit shift operand and convert it to the type expected
// by checked_{shl,shr} (u32).
// It is ok to saturate the value to u32::MAX because any value
// above 15 will produce the same result.
let shift = extract_first_u64(this, &right)?.try_into().unwrap_or(u32::MAX);
for i in 0..dest_len {
let left = this.read_scalar(&this.project_index(&left, i)?)?.to_u16()?;
let dest = this.project_index(&dest, i)?;
let res = match which {
ShiftOp::Sll => left.checked_shl(shift).unwrap_or(0),
ShiftOp::Srl => left.checked_shr(shift).unwrap_or(0),
#[allow(clippy::cast_possible_wrap, clippy::cast_sign_loss)]
ShiftOp::Sra => {
// Convert u16 to i16 to use arithmetic shift
let left = left as i16;
// Copy the sign bit to the remaining bits
left.checked_shr(shift).unwrap_or(left >> 15) as u16
}
};
this.write_scalar(Scalar::from_u16(res), &dest)?;
}
}
// Used to implement the _mm_{sll,srl,sra}_epi32 functions.
// 32-bit equivalent to the shift functions above.
"psll.d" | "psrl.d" | "psra.d" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
assert_eq!(dest_len, right_len);
enum ShiftOp {
Sll,
Srl,
Sra,
}
let which = match unprefixed_name {
"psll.d" => ShiftOp::Sll,
"psrl.d" => ShiftOp::Srl,
"psra.d" => ShiftOp::Sra,
_ => unreachable!(),
};
// Get the 64-bit shift operand and convert it to the type expected
// by checked_{shl,shr} (u32).
// It is ok to saturate the value to u32::MAX because any value
// above 31 will produce the same result.
let shift = extract_first_u64(this, &right)?.try_into().unwrap_or(u32::MAX);
for i in 0..dest_len {
let left = this.read_scalar(&this.project_index(&left, i)?)?.to_u32()?;
let dest = this.project_index(&dest, i)?;
let res = match which {
ShiftOp::Sll => left.checked_shl(shift).unwrap_or(0),
ShiftOp::Srl => left.checked_shr(shift).unwrap_or(0),
#[allow(clippy::cast_possible_wrap, clippy::cast_sign_loss)]
ShiftOp::Sra => {
// Convert u32 to i32 to use arithmetic shift
let left = left as i32;
// Copy the sign bit to the remaining bits
left.checked_shr(shift).unwrap_or(left >> 31) as u32
}
};
this.write_scalar(Scalar::from_u32(res), &dest)?;
}
}
// Used to implement the _mm_{sll,srl}_epi64 functions.
// 64-bit equivalent of the shift functions above. There is no
// _mm_sra_epi64 because SSE2 does not provide a 64-bit arithmetic shift.
"psll.q" | "psrl.q" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
assert_eq!(dest_len, right_len);
enum ShiftOp {
Sll,
Srl,
}
let which = match unprefixed_name {
"psll.q" => ShiftOp::Sll,
"psrl.q" => ShiftOp::Srl,
_ => unreachable!(),
};
// Get the 64-bit shift operand and convert it to the type expected
// by checked_{shl,shr} (u32).
// It is ok to saturate the value to u32::MAX because any value
// above 63 will produce the same result.
let shift = this
.read_scalar(&this.project_index(&right, 0)?)?
.to_u64()?
.try_into()
.unwrap_or(u32::MAX);
for i in 0..dest_len {
let left = this.read_scalar(&this.project_index(&left, i)?)?.to_u64()?;
let dest = this.project_index(&dest, i)?;
let res = match which {
ShiftOp::Sll => left.checked_shl(shift).unwrap_or(0),
ShiftOp::Srl => left.checked_shr(shift).unwrap_or(0),
};
this.write_scalar(Scalar::from_u64(res), &dest)?;
}
}
// Used to implement the _mm_cvtepi32_ps function.
// Converts packed i32 to packed f32.
// FIXME: Can we get rid of this intrinsic and just use simd_as?
"cvtdq2ps" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, op_len) = this.operand_to_simd(op)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, op_len);
for i in 0..dest_len {
let op = this.read_scalar(&this.project_index(&op, i)?)?.to_i32()?;
let dest = this.project_index(&dest, i)?;
let res = Scalar::from_f32(Single::from_i128(op.into()).value);
this.write_scalar(res, &dest)?;
}
}
// Used to implement the _mm_cvtps_epi32 and _mm_cvttps_epi32 functions.
// Converts packed f32 to packed i32.
"cvtps2dq" | "cvttps2dq" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, op_len) = this.operand_to_simd(op)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, op_len);
let rnd = match unprefixed_name {
// "current SSE rounding mode", assume nearest
// https://www.felixcloutier.com/x86/cvtps2dq
"cvtps2dq" => rustc_apfloat::Round::NearestTiesToEven,
// always truncate
// https://www.felixcloutier.com/x86/cvttps2dq
"cvttps2dq" => rustc_apfloat::Round::TowardZero,
_ => unreachable!(),
};
for i in 0..dest_len {
let op = this.read_scalar(&this.project_index(&op, i)?)?.to_f32()?;
let dest = this.project_index(&dest, i)?;
let res =
this.float_to_int_checked(op, dest.layout.ty, rnd).unwrap_or_else(|| {
// Fall back to minimum according to SSE2 semantics.
Scalar::from_i32(i32::MIN)
});
this.write_scalar(res, &dest)?;
}
}
// Used to implement the _mm_packs_epi16 function.
// Converts two 16-bit integer vectors to a single 8-bit integer
// vector with signed saturation.
"packsswb.128" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
// left and right are i16x8, dest is i8x16
assert_eq!(left_len, 8);
assert_eq!(right_len, 8);
assert_eq!(dest_len, 16);
for i in 0..left_len {
let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?;
let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?;
let left_dest = this.project_index(&dest, i)?;
let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?;
let left_res =
i8::try_from(left).unwrap_or(if left < 0 { i8::MIN } else { i8::MAX });
let right_res =
i8::try_from(right).unwrap_or(if right < 0 { i8::MIN } else { i8::MAX });
this.write_scalar(Scalar::from_int(left_res, Size::from_bits(8)), &left_dest)?;
this.write_scalar(
Scalar::from_int(right_res, Size::from_bits(8)),
&right_dest,
)?;
}
}
// Used to implement the _mm_packus_epi16 function.
// Converts two 16-bit signed integer vectors to a single 8-bit
// unsigned integer vector with saturation.
"packuswb.128" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
// left and right are i16x8, dest is u8x16
assert_eq!(left_len, 8);
assert_eq!(right_len, 8);
assert_eq!(dest_len, 16);
for i in 0..left_len {
let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?;
let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?;
let left_dest = this.project_index(&dest, i)?;
let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?;
let left_res = u8::try_from(left).unwrap_or(if left < 0 { 0 } else { u8::MAX });
let right_res =
u8::try_from(right).unwrap_or(if right < 0 { 0 } else { u8::MAX });
this.write_scalar(Scalar::from_u8(left_res), &left_dest)?;
this.write_scalar(Scalar::from_u8(right_res), &right_dest)?;
}
}
// Used to implement the _mm_packs_epi32 function.
// Converts two 32-bit integer vectors to a single 16-bit integer
// vector with signed saturation.
"packssdw.128" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
// left and right are i32x4, dest is i16x8
assert_eq!(left_len, 4);
assert_eq!(right_len, 4);
assert_eq!(dest_len, 8);
for i in 0..left_len {
let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i32()?;
let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i32()?;
let left_dest = this.project_index(&dest, i)?;
let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?;
let left_res =
i16::try_from(left).unwrap_or(if left < 0 { i16::MIN } else { i16::MAX });
let right_res =
i16::try_from(right).unwrap_or(if right < 0 { i16::MIN } else { i16::MAX });
this.write_scalar(Scalar::from_int(left_res, Size::from_bits(16)), &left_dest)?;
this.write_scalar(
Scalar::from_int(right_res, Size::from_bits(16)),
&right_dest,
)?;
}
}
// Used to implement _mm_min_sd and _mm_max_sd functions.
// Note that the semantics are a bit different from Rust simd_min
// and simd_max intrinsics regarding handling of NaN and -0.0: Rust
// matches the IEEE min/max operations, while x86 has different
// semantics.
"min.sd" | "max.sd" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let which = match unprefixed_name {
"min.sd" => FloatBinOp::Min,
"max.sd" => FloatBinOp::Max,
_ => unreachable!(),
};
bin_op_sd(this, which, left, right, dest)?;
}
// Used to implement _mm_min_pd and _mm_max_pd functions.
// Note that the semantics are a bit different from Rust simd_min
// and simd_max intrinsics regarding handling of NaN and -0.0: Rust
// matches the IEEE min/max operations, while x86 has different
// semantics.
"min.pd" | "max.pd" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let which = match unprefixed_name {
"min.pd" => FloatBinOp::Min,
"max.pd" => FloatBinOp::Max,
_ => unreachable!(),
};
bin_op_pd(this, which, left, right, dest)?;
}
// Used to implement the _mm_sqrt_sd function.
// Performs the operation on the first component of `op` and
// copies the remaining components from `op`.
"sqrt.sd" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, op_len) = this.operand_to_simd(op)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, op_len);
let op0 = this.read_scalar(&this.project_index(&op, 0)?)?.to_u64()?;
// FIXME using host floats
let res0 = Scalar::from_u64(f64::from_bits(op0).sqrt().to_bits());
this.write_scalar(res0, &this.project_index(&dest, 0)?)?;
for i in 1..dest_len {
this.copy_op(
&this.project_index(&op, i)?,
&this.project_index(&dest, i)?,
/*allow_transmute*/ false,
)?;
}
}
// Used to implement the _mm_sqrt_pd function.
// Performs the operation on all components of `op`.
"sqrt.pd" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, op_len) = this.operand_to_simd(op)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, op_len);
for i in 0..dest_len {
let op = this.read_scalar(&this.project_index(&op, i)?)?.to_u64()?;
let dest = this.project_index(&dest, i)?;
// FIXME using host floats
let res = Scalar::from_u64(f64::from_bits(op).sqrt().to_bits());
this.write_scalar(res, &dest)?;
}
}
// Used to implement the _mm_cmp*_sd functions.
// Performs a comparison operation on the first component of `left`
// and `right`, returning 0 if false or `u64::MAX` if true. The remaining
// components are copied from `left`.
"cmp.sd" => {
let [left, right, imm] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let which = FloatBinOp::Cmp(FloatCmpOp::from_intrinsic_imm(
this.read_scalar(imm)?.to_i8()?,
"llvm.x86.sse2.cmp.sd",
)?);
bin_op_sd(this, which, left, right, dest)?;
}
// Used to implement the _mm_cmp*_pd functions.
// Performs a comparison operation on each component of `left`
// and `right`. For each component, returns 0 if false or `u64::MAX`
// if true.
"cmp.pd" => {
let [left, right, imm] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let which = FloatBinOp::Cmp(FloatCmpOp::from_intrinsic_imm(
this.read_scalar(imm)?.to_i8()?,
"llvm.x86.sse2.cmp.pd",
)?);
bin_op_pd(this, which, left, right, dest)?;
}
// Used to implement _mm_{,u}comi{eq,lt,le,gt,ge,neq}_sd functions.
// Compares the first component of `left` and `right` and returns
// a scalar value (0 or 1).
"comieq.sd" | "comilt.sd" | "comile.sd" | "comigt.sd" | "comige.sd" | "comineq.sd"
| "ucomieq.sd" | "ucomilt.sd" | "ucomile.sd" | "ucomigt.sd" | "ucomige.sd"
| "ucomineq.sd" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
assert_eq!(left_len, right_len);
let left = this.read_scalar(&this.project_index(&left, 0)?)?.to_f64()?;
let right = this.read_scalar(&this.project_index(&right, 0)?)?.to_f64()?;
// The difference between the com* and ucom* variants is the signaling
// of exceptions when either argument is a quiet NaN. We do not
// support accessing the SSE status register from Miri (or from Rust,
// for that matter), so we treat both variants equally.
let res = match unprefixed_name {
"comieq.sd" | "ucomieq.sd" => left == right,
"comilt.sd" | "ucomilt.sd" => left < right,
"comile.sd" | "ucomile.sd" => left <= right,
"comigt.sd" | "ucomigt.sd" => left > right,
"comige.sd" | "ucomige.sd" => left >= right,
"comineq.sd" | "ucomineq.sd" => left != right,
_ => unreachable!(),
};
this.write_scalar(Scalar::from_i32(i32::from(res)), dest)?;
}
// Used to implement the _mm_cvtpd_ps function.
// Converts packed f64 to packed f32.
"cvtpd2ps" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, op_len) = this.operand_to_simd(op)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
// op is f64x2, dest is f32x4
assert_eq!(op_len, 2);
assert_eq!(dest_len, 4);
for i in 0..op_len {
let op = this.read_scalar(&this.project_index(&op, i)?)?.to_f64()?;
let dest = this.project_index(&dest, i)?;
let res = op.convert(/*loses_info*/ &mut false).value;
this.write_scalar(Scalar::from_f32(res), &dest)?;
}
// Fill the remaining elements with zeros
for i in op_len..dest_len {
let dest = this.project_index(&dest, i)?;
this.write_scalar(Scalar::from_u32(0), &dest)?;
}
}
// Used to implement the _mm_cvtps_pd function.
// Converts packed f32 to packed f64.
"cvtps2pd" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, op_len) = this.operand_to_simd(op)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
// op is f32x4, dest is f64x2
assert_eq!(op_len, 4);
assert_eq!(dest_len, 2);
for i in 0..dest_len {
let op = this.read_scalar(&this.project_index(&op, i)?)?.to_f32()?;
let dest = this.project_index(&dest, i)?;
let res = op.convert(/*loses_info*/ &mut false).value;
this.write_scalar(Scalar::from_f64(res), &dest)?;
}
// The remaining two f32 elements of `op` are ignored
}
// Used to implement the _mm_cvtpd_epi32 and _mm_cvttpd_epi32 functions.
// Converts packed f64 to packed i32.
"cvtpd2dq" | "cvttpd2dq" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, op_len) = this.operand_to_simd(op)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
// op is f64x2, dest is i32x4
assert_eq!(op_len, 2);
assert_eq!(dest_len, 4);
let rnd = match unprefixed_name {
// "current SSE rounding mode", assume nearest
// https://www.felixcloutier.com/x86/cvtpd2dq
"cvtpd2dq" => rustc_apfloat::Round::NearestTiesToEven,
// always truncate
// https://www.felixcloutier.com/x86/cvttpd2dq
"cvttpd2dq" => rustc_apfloat::Round::TowardZero,
_ => unreachable!(),
};
for i in 0..op_len {
let op = this.read_scalar(&this.project_index(&op, i)?)?.to_f64()?;
let dest = this.project_index(&dest, i)?;
let res =
this.float_to_int_checked(op, dest.layout.ty, rnd).unwrap_or_else(|| {
// Fall back to minimum according to SSE2 semantics.
Scalar::from_i32(i32::MIN)
});
this.write_scalar(res, &dest)?;
}
// Fill the remaining elements with zeros
for i in op_len..dest_len {
let dest = this.project_index(&dest, i)?;
this.write_scalar(Scalar::from_i32(0), &dest)?;
}
}
// Used to implement the _mm_cvtsd_si32 and _mm_cvttsd_si32 functions.
// Converts the first component of `op` from f64 to i32.
"cvtsd2si" | "cvttsd2si" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, _) = this.operand_to_simd(op)?;
let op = this.read_scalar(&this.project_index(&op, 0)?)?.to_f64()?;
let rnd = match unprefixed_name {
// "current SSE rounding mode", assume nearest
// https://www.felixcloutier.com/x86/cvtsd2si
"cvtsd2si" => rustc_apfloat::Round::NearestTiesToEven,
// always truncate
// https://www.felixcloutier.com/x86/cvttsd2si
"cvttsd2si" => rustc_apfloat::Round::TowardZero,
_ => unreachable!(),
};
let res = this.float_to_int_checked(op, dest.layout.ty, rnd).unwrap_or_else(|| {
// Fall back to minimum according to SSE2 semantics.
Scalar::from_i32(i32::MIN)
});
this.write_scalar(res, dest)?;
}
// Used to implement the _mm_cvtsd_si64 and _mm_cvttsd_si64 functions.
// Converts the first component of `op` from f64 to i64.
"cvtsd2si64" | "cvttsd2si64" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, _) = this.operand_to_simd(op)?;
let op = this.read_scalar(&this.project_index(&op, 0)?)?.to_f64()?;
let rnd = match unprefixed_name {
// "current SSE rounding mode", assume nearest
// https://www.felixcloutier.com/x86/cvtsd2si
"cvtsd2si64" => rustc_apfloat::Round::NearestTiesToEven,
// always truncate
// https://www.felixcloutier.com/x86/cvttsd2si
"cvttsd2si64" => rustc_apfloat::Round::TowardZero,
_ => unreachable!(),
};
let res = this.float_to_int_checked(op, dest.layout.ty, rnd).unwrap_or_else(|| {
// Fall back to minimum according to SSE2 semantics.
Scalar::from_i64(i64::MIN)
});
this.write_scalar(res, dest)?;
}
// Used to implement the _mm_cvtsd_ss and _mm_cvtss_sd functions.
// Converts the first f64/f32 from `right` to f32/f64 and copies
// the remaining elements from `left`
"cvtsd2ss" | "cvtss2sd" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, _) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
// Convert first element of `right`
let right0 = this.read_immediate(&this.project_index(&right, 0)?)?;
let dest0 = this.project_index(&dest, 0)?;
// `float_to_float_or_int` here will convert from f64 to f32 (cvtsd2ss) or
// from f32 to f64 (cvtss2sd).
let res0 = this.float_to_float_or_int(&right0, dest0.layout.ty)?;
this.write_immediate(res0, &dest0)?;
// Copy the remaining elements from `left`
for i in 1..dest_len {
this.copy_op(
&this.project_index(&left, i)?,
&this.project_index(&dest, i)?,
/*allow_transmute*/ false,
)?;
}
}
// Used to implement the _mm_movemask_pd function.
// Returns a scalar integer where the i-th bit is the highest
// bit of the i-th component of `op`.
// https://www.felixcloutier.com/x86/movmskpd
"movmsk.pd" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, op_len) = this.operand_to_simd(op)?;
let mut res = 0;
for i in 0..op_len {
let op = this.read_scalar(&this.project_index(&op, i)?)?;
let op = op.to_u64()?;
// Extract the highest bit of `op` and place it in the `i`-th bit of `res`
res |= (op >> 63) << i;
}
this.write_scalar(Scalar::from_u32(res.try_into().unwrap()), dest)?;
}
_ => return Ok(EmulateByNameResult::NotSupported),
}
Ok(EmulateByNameResult::NeedsJumping)
}
}
/// Takes a 128-bit vector, transmutes it to `[u64; 2]` and extracts
/// the first value.
fn extract_first_u64<'tcx>(
this: &crate::MiriInterpCx<'_, 'tcx>,
op: &MPlaceTy<'tcx, Provenance>,
) -> InterpResult<'tcx, u64> {
// Transmute vector to `[u64; 2]`
let u64_array_layout = this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u64, 2))?;
let op = op.transmute(u64_array_layout, this)?;
// Get the first u64 from the array
this.read_scalar(&this.project_index(&op, 0)?)?.to_u64()
}
#[derive(Copy, Clone)]
enum FloatBinOp {
/// Comparison
Cmp(FloatCmpOp),
/// Minimum value (with SSE semantics)
///
/// <https://www.felixcloutier.com/x86/minsd>
/// <https://www.felixcloutier.com/x86/minpd>
Min,
/// Maximum value (with SSE semantics)
///
/// <https://www.felixcloutier.com/x86/maxsd>
/// <https://www.felixcloutier.com/x86/maxpd>
Max,
}
/// Performs `which` scalar operation on `left` and `right` and returns
/// the result.
// FIXME make this generic over apfloat type to reduce code duplication with bin_op_f32
fn bin_op_f64<'tcx>(
which: FloatBinOp,
left: &ImmTy<'tcx, Provenance>,
right: &ImmTy<'tcx, Provenance>,
) -> InterpResult<'tcx, Scalar<Provenance>> {
match which {
FloatBinOp::Cmp(which) => {
let left = left.to_scalar().to_f64()?;
let right = right.to_scalar().to_f64()?;
// FIXME: Make sure that these operations match the semantics of cmppd
let res = match which {
FloatCmpOp::Eq => left == right,
FloatCmpOp::Lt => left < right,
FloatCmpOp::Le => left <= right,
FloatCmpOp::Unord => left.is_nan() || right.is_nan(),
FloatCmpOp::Neq => left != right,
FloatCmpOp::Nlt => !(left < right),
FloatCmpOp::Nle => !(left <= right),
FloatCmpOp::Ord => !left.is_nan() && !right.is_nan(),
};
Ok(Scalar::from_u64(if res { u64::MAX } else { 0 }))
}
FloatBinOp::Min => {
let left = left.to_scalar().to_f64()?;
let right = right.to_scalar().to_f64()?;
// SSE semantics to handle zero and NaN. Note that `x == Double::ZERO`
// is true when `x` is either +0 or -0.
if (left == Double::ZERO && right == Double::ZERO)
|| left.is_nan()
|| right.is_nan()
|| left >= right
{
Ok(Scalar::from_f64(right))
} else {
Ok(Scalar::from_f64(left))
}
}
FloatBinOp::Max => {
let left = left.to_scalar().to_f64()?;
let right = right.to_scalar().to_f64()?;
// SSE semantics to handle zero and NaN. Note that `x == Double::ZERO`
// is true when `x` is either +0 or -0.
if (left == Double::ZERO && right == Double::ZERO)
|| left.is_nan()
|| right.is_nan()
|| left <= right
{
Ok(Scalar::from_f64(right))
} else {
Ok(Scalar::from_f64(left))
}
}
}
}
/// Performs `which` operation on the first component of `left` and `right`
/// and copies the other components from `left`. The result is stored in `dest`.
fn bin_op_sd<'tcx>(
this: &mut crate::MiriInterpCx<'_, 'tcx>,
which: FloatBinOp,
left: &OpTy<'tcx, Provenance>,
right: &OpTy<'tcx, Provenance>,
dest: &PlaceTy<'tcx, Provenance>,
) -> InterpResult<'tcx, ()> {
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
assert_eq!(dest_len, right_len);
let res0 = bin_op_f64(
which,
&this.read_immediate(&this.project_index(&left, 0)?)?,
&this.read_immediate(&this.project_index(&right, 0)?)?,
)?;
this.write_scalar(res0, &this.project_index(&dest, 0)?)?;
for i in 1..dest_len {
this.copy_op(
&this.project_index(&left, i)?,
&this.project_index(&dest, i)?,
/*allow_transmute*/ false,
)?;
}
Ok(())
}
/// Performs `which` operation on each component of `left` and
/// `right`, storing the result in `dest`.
fn bin_op_pd<'tcx>(
this: &mut crate::MiriInterpCx<'_, 'tcx>,
which: FloatBinOp,
left: &OpTy<'tcx, Provenance>,
right: &OpTy<'tcx, Provenance>,
dest: &PlaceTy<'tcx, Provenance>,
) -> InterpResult<'tcx, ()> {
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
assert_eq!(dest_len, right_len);
for i in 0..dest_len {
let left = this.read_immediate(&this.project_index(&left, i)?)?;
let right = this.read_immediate(&this.project_index(&right, i)?)?;
let dest = this.project_index(&dest, i)?;
let res = bin_op_f64(which, &left, &right)?;
this.write_scalar(res, &dest)?;
}
Ok(())
}

View File

@ -1,5 +1,15 @@
//@only-target-x86_64
// Ignore everything except x86 and x86_64
// Any additional targets added to CI should be ignored here
//@ignore-target-aarch64
//@ignore-target-arm
//@ignore-target-avr
//@ignore-target-s390x
//@ignore-target-thumbv7em
//@ignore-target-wasm32
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use std::f32::NAN;
use std::mem::transmute;
@ -987,6 +997,8 @@ unsafe fn test_sse() {
}
test_mm_cvtsi32_ss();
// Intrinsic only available on x86_64
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse")]
unsafe fn test_mm_cvtss_si64() {
let inputs = &[
@ -1007,8 +1019,11 @@ unsafe fn test_sse() {
assert_eq!(e, r, "TestCase #{} _mm_cvtss_si64({:?}) = {}, expected: {}", i, x, r, e);
}
}
#[cfg(target_arch = "x86_64")]
test_mm_cvtss_si64();
// Intrinsic only available on x86_64
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse")]
unsafe fn test_mm_cvttss_si64() {
let inputs = &[
@ -1032,8 +1047,11 @@ unsafe fn test_sse() {
assert_eq!(e, r, "TestCase #{} _mm_cvttss_si64({:?}) = {}, expected: {}", i, x, r, e);
}
}
#[cfg(target_arch = "x86_64")]
test_mm_cvttss_si64();
// Intrinsic only available on x86_64
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse")]
unsafe fn test_mm_cvtsi64_ss() {
let inputs = &[
@ -1053,6 +1071,7 @@ unsafe fn test_sse() {
assert_eq_m128(e, r);
}
}
#[cfg(target_arch = "x86_64")]
test_mm_cvtsi64_ss();
#[target_feature(enable = "sse")]

View File

@ -0,0 +1,828 @@
// Ignore everything except x86 and x86_64
// Any additional targets added to CI should be ignored here
//@ignore-target-aarch64
//@ignore-target-arm
//@ignore-target-avr
//@ignore-target-s390x
//@ignore-target-thumbv7em
//@ignore-target-wasm32
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use std::f64::NAN;
use std::mem::transmute;
fn main() {
assert!(is_x86_feature_detected!("sse2"));
unsafe {
test_sse2();
}
}
#[target_feature(enable = "sse2")]
unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i {
_mm_set_epi64x(b, a)
}
#[target_feature(enable = "sse2")]
unsafe fn test_sse2() {
// Mostly copied from library/stdarch/crates/core_arch/src/x86{,_64}/sse2.rs
unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i {
_mm_set_epi64x(b, a)
}
#[track_caller]
#[target_feature(enable = "sse")]
unsafe fn assert_eq_m128(a: __m128, b: __m128) {
let r = _mm_cmpeq_ps(a, b);
if _mm_movemask_ps(r) != 0b1111 {
panic!("{:?} != {:?}", a, b);
}
}
#[track_caller]
#[target_feature(enable = "sse2")]
unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) {
assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b))
}
#[track_caller]
#[target_feature(enable = "sse2")]
unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 {
panic!("{:?} != {:?}", a, b);
}
}
#[target_feature(enable = "sse2")]
unsafe fn test_mm_avg_epu8() {
let (a, b) = (_mm_set1_epi8(3), _mm_set1_epi8(9));
let r = _mm_avg_epu8(a, b);
assert_eq_m128i(r, _mm_set1_epi8(6));
}
test_mm_avg_epu8();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_avg_epu16() {
let (a, b) = (_mm_set1_epi16(3), _mm_set1_epi16(9));
let r = _mm_avg_epu16(a, b);
assert_eq_m128i(r, _mm_set1_epi16(6));
}
test_mm_avg_epu16();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_mulhi_epi16() {
let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
let r = _mm_mulhi_epi16(a, b);
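// 1000 * -1001 = -1_001_000, and its high 16 bits are
// -1_001_000 >> 16 = -16 (the arithmetic shift rounds toward negative infinity).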
assert_eq_m128i(r, _mm_set1_epi16(-16));
}
test_mm_mulhi_epi16();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_mulhi_epu16() {
let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(1001));
let r = _mm_mulhi_epu16(a, b);
assert_eq_m128i(r, _mm_set1_epi16(15));
}
test_mm_mulhi_epu16();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_mul_epu32() {
let a = _mm_setr_epi64x(1_000_000_000, 1 << 34);
let b = _mm_setr_epi64x(1_000_000_000, 1 << 35);
let r = _mm_mul_epu32(a, b);
let e = _mm_setr_epi64x(1_000_000_000 * 1_000_000_000, 0);
assert_eq_m128i(r, e);
}
test_mm_mul_epu32();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_sad_epu8() {
#[rustfmt::skip]
let a = _mm_setr_epi8(
255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
1, 2, 3, 4,
155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8,
1, 2, 3, 4,
);
let b = _mm_setr_epi8(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2);
let r = _mm_sad_epu8(a, b);
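// Expected values, lane by lane:
// |255-0| + |254-0| + |253-0| + |252-0| + |1-2| + |2-1| + |3-2| + |4-1| = 1014 + 6 = 1020
// |155-1| + |154-1| + |153-1| + |152-1| + |1-1| + |2-2| + |3-1| + |4-2| = 610 + 4 = 614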
let e = _mm_setr_epi64x(1020, 614);
assert_eq_m128i(r, e);
}
test_mm_sad_epu8();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_sll_epi16() {
let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 4));
assert_eq_m128i(
r,
_mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
);
let r = _mm_sll_epi16(a, _mm_set_epi64x(4, 0));
assert_eq_m128i(r, a);
let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 16));
assert_eq_m128i(r, _mm_set1_epi16(0));
let r = _mm_sll_epi16(a, _mm_set_epi64x(0, i64::MAX));
assert_eq_m128i(r, _mm_set1_epi16(0));
}
test_mm_sll_epi16();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_srl_epi16() {
let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 4));
assert_eq_m128i(r, _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0));
let r = _mm_srl_epi16(a, _mm_set_epi64x(4, 0));
assert_eq_m128i(r, a);
let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 16));
assert_eq_m128i(r, _mm_set1_epi16(0));
let r = _mm_srl_epi16(a, _mm_set_epi64x(0, i64::MAX));
assert_eq_m128i(r, _mm_set1_epi16(0));
}
test_mm_srl_epi16();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_sra_epi16() {
let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 4));
assert_eq_m128i(r, _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10));
let r = _mm_sra_epi16(a, _mm_set_epi64x(4, 0));
assert_eq_m128i(r, a);
let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 16));
assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
let r = _mm_sra_epi16(a, _mm_set_epi64x(0, i64::MAX));
assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
}
test_mm_sra_epi16();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_sll_epi32() {
let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 4));
assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
let r = _mm_sll_epi32(a, _mm_set_epi64x(4, 0));
assert_eq_m128i(r, a);
let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 32));
assert_eq_m128i(r, _mm_set1_epi32(0));
let r = _mm_sll_epi32(a, _mm_set_epi64x(0, i64::MAX));
assert_eq_m128i(r, _mm_set1_epi32(0));
}
test_mm_sll_epi32();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_srl_epi32() {
let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 4));
assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
let r = _mm_srl_epi32(a, _mm_set_epi64x(4, 0));
assert_eq_m128i(r, a);
let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 32));
assert_eq_m128i(r, _mm_set1_epi32(0));
let r = _mm_srl_epi32(a, _mm_set_epi64x(0, i64::MAX));
assert_eq_m128i(r, _mm_set1_epi32(0));
}
test_mm_srl_epi32();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_sra_epi32() {
let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 4));
assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
let r = _mm_sra_epi32(a, _mm_set_epi64x(4, 0));
assert_eq_m128i(r, a);
let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 32));
assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
let r = _mm_sra_epi32(a, _mm_set_epi64x(0, i64::MAX));
assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
}
test_mm_sra_epi32();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_sll_epi64() {
let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 4));
assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
let r = _mm_sll_epi64(a, _mm_set_epi64x(4, 0));
assert_eq_m128i(r, a);
let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 64));
assert_eq_m128i(r, _mm_set1_epi64x(0));
let r = _mm_sll_epi64(a, _mm_set_epi64x(0, i64::MAX));
assert_eq_m128i(r, _mm_set1_epi64x(0));
}
test_mm_sll_epi64();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_srl_epi64() {
let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 4));
assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
let r = _mm_srl_epi64(a, _mm_set_epi64x(4, 0));
assert_eq_m128i(r, a);
let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 64));
assert_eq_m128i(r, _mm_set1_epi64x(0));
let r = _mm_srl_epi64(a, _mm_set_epi64x(0, i64::MAX));
assert_eq_m128i(r, _mm_set1_epi64x(0));
}
test_mm_srl_epi64();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvtepi32_ps() {
let a = _mm_setr_epi32(1, 2, 3, 4);
let r = _mm_cvtepi32_ps(a);
assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
}
test_mm_cvtepi32_ps();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvtps_epi32() {
let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
let r = _mm_cvtps_epi32(a);
assert_eq_m128i(r, _mm_setr_epi32(1, 2, 3, 4));
}
test_mm_cvtps_epi32();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvttps_epi32() {
let a = _mm_setr_ps(-1.1, 2.2, -3.3, 6.6);
let r = _mm_cvttps_epi32(a);
assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6));
let a = _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
let r = _mm_cvttps_epi32(a);
assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, i32::MIN, i32::MIN));
}
test_mm_cvttps_epi32();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_packs_epi16() {
let a = _mm_setr_epi16(0x80, -0x81, 0, 0, 0, 0, 0, 0);
let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -0x81, 0x80);
let r = _mm_packs_epi16(a, b);
assert_eq_m128i(
r,
_mm_setr_epi8(0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F),
);
}
test_mm_packs_epi16();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_packus_epi16() {
let a = _mm_setr_epi16(0x100, -1, 0, 0, 0, 0, 0, 0);
let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -1, 0x100);
let r = _mm_packus_epi16(a, b);
assert_eq_m128i(r, _mm_setr_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, !0));
}
test_mm_packus_epi16();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_packs_epi32() {
let a = _mm_setr_epi32(0x8000, -0x8001, 0, 0);
let b = _mm_setr_epi32(0, 0, -0x8001, 0x8000);
let r = _mm_packs_epi32(a, b);
assert_eq_m128i(r, _mm_setr_epi16(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF));
}
test_mm_packs_epi32();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_min_sd() {
let a = _mm_setr_pd(1.0, 2.0);
let b = _mm_setr_pd(5.0, 10.0);
let r = _mm_min_sd(a, b);
assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
}
test_mm_min_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_min_pd() {
let a = _mm_setr_pd(-1.0, 5.0);
let b = _mm_setr_pd(-100.0, 20.0);
let r = _mm_min_pd(a, b);
assert_eq_m128d(r, _mm_setr_pd(-100.0, 5.0));
// `_mm_min_pd` can **not** be implemented using the `simd_min` rust intrinsic because
// the semantics of `simd_min` are different to those of `_mm_min_pd` regarding handling
// of `-0.0`.
let a = _mm_setr_pd(-0.0, 0.0);
let b = _mm_setr_pd(0.0, 0.0);
let r1: [u8; 16] = transmute(_mm_min_pd(a, b));
let r2: [u8; 16] = transmute(_mm_min_pd(b, a));
let a: [u8; 16] = transmute(a);
let b: [u8; 16] = transmute(b);
assert_eq!(r1, b);
assert_eq!(r2, a);
assert_ne!(a, b); // sanity check that -0.0 is actually present
}
test_mm_min_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_max_sd() {
let a = _mm_setr_pd(1.0, 2.0);
let b = _mm_setr_pd(5.0, 10.0);
let r = _mm_max_sd(a, b);
assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
}
test_mm_max_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_max_pd() {
let a = _mm_setr_pd(-1.0, 5.0);
let b = _mm_setr_pd(-100.0, 20.0);
let r = _mm_max_pd(a, b);
assert_eq_m128d(r, _mm_setr_pd(-1.0, 20.0));
// `_mm_max_pd` can **not** be implemented using the `simd_max` rust intrinsic because
// the semantics of `simd_max` are different to those of `_mm_max_pd` regarding handling
// of `-0.0`.
let a = _mm_setr_pd(-0.0, 0.0);
let b = _mm_setr_pd(0.0, 0.0);
let r1: [u8; 16] = transmute(_mm_max_pd(a, b));
let r2: [u8; 16] = transmute(_mm_max_pd(b, a));
let a: [u8; 16] = transmute(a);
let b: [u8; 16] = transmute(b);
assert_eq!(r1, b);
assert_eq!(r2, a);
assert_ne!(a, b); // sanity check that -0.0 is actually present
}
test_mm_max_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_sqrt_sd() {
let a = _mm_setr_pd(1.0, 2.0);
let b = _mm_setr_pd(5.0, 10.0);
let r = _mm_sqrt_sd(a, b);
assert_eq_m128d(r, _mm_setr_pd(5.0f64.sqrt(), 2.0));
}
test_mm_sqrt_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_sqrt_pd() {
let r = _mm_sqrt_pd(_mm_setr_pd(1.0, 2.0));
assert_eq_m128d(r, _mm_setr_pd(1.0f64.sqrt(), 2.0f64.sqrt()));
}
test_mm_sqrt_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpeq_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(!0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmpeq_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpeq_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmplt_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
let e = _mm_setr_epi64x(!0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmplt_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmplt_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmple_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(!0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmple_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmple_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpgt_sd() {
let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(!0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmpgt_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpgt_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpge_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(!0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmpge_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpge_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpord_sd() {
let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
let e = _mm_setr_epi64x(0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmpord_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpord_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpunord_sd() {
let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
let e = _mm_setr_epi64x(!0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmpunord_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpunord_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpneq_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
let e = _mm_setr_epi64x(!0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmpneq_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpneq_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpnlt_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
let e = _mm_setr_epi64x(0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmpnlt_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpnlt_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpnle_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmpnle_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpnle_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpngt_sd() {
let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmpngt_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpngt_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpnge_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmpnge_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpnge_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpeq_pd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(!0, 0);
let r = transmute::<_, __m128i>(_mm_cmpeq_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpeq_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmplt_pd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(0, !0);
let r = transmute::<_, __m128i>(_mm_cmplt_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmplt_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmple_pd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(!0, !0);
let r = transmute::<_, __m128i>(_mm_cmple_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmple_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpgt_pd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(0, 0);
let r = transmute::<_, __m128i>(_mm_cmpgt_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpgt_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpge_pd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(!0, 0);
let r = transmute::<_, __m128i>(_mm_cmpge_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpge_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpord_pd() {
let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
let e = _mm_setr_epi64x(0, !0);
let r = transmute::<_, __m128i>(_mm_cmpord_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpord_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpunord_pd() {
let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
let e = _mm_setr_epi64x(!0, 0);
let r = transmute::<_, __m128i>(_mm_cmpunord_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpunord_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpneq_pd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
let e = _mm_setr_epi64x(!0, !0);
let r = transmute::<_, __m128i>(_mm_cmpneq_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpneq_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpnlt_pd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
let e = _mm_setr_epi64x(0, 0);
let r = transmute::<_, __m128i>(_mm_cmpnlt_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpnlt_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpnle_pd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(0, 0);
let r = transmute::<_, __m128i>(_mm_cmpnle_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpnle_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpngt_pd() {
let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(0, !0);
let r = transmute::<_, __m128i>(_mm_cmpngt_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpngt_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpnge_pd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(0, !0);
let r = transmute::<_, __m128i>(_mm_cmpnge_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpnge_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_comieq_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_comieq_sd(a, b) != 0);
let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_comieq_sd(a, b) == 0);
}
test_mm_comieq_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_comilt_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_comilt_sd(a, b) == 0);
}
test_mm_comilt_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_comile_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_comile_sd(a, b) != 0);
}
test_mm_comile_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_comigt_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_comigt_sd(a, b) == 0);
}
test_mm_comigt_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_comige_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_comige_sd(a, b) != 0);
}
test_mm_comige_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_comineq_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_comineq_sd(a, b) == 0);
}
test_mm_comineq_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_ucomieq_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_ucomieq_sd(a, b) != 0);
let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(NAN, 3.0));
assert!(_mm_ucomieq_sd(a, b) == 0);
}
test_mm_ucomieq_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_ucomilt_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_ucomilt_sd(a, b) == 0);
}
test_mm_ucomilt_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_ucomile_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_ucomile_sd(a, b) != 0);
}
test_mm_ucomile_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_ucomigt_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_ucomigt_sd(a, b) == 0);
}
test_mm_ucomigt_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_ucomige_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_ucomige_sd(a, b) != 0);
}
test_mm_ucomige_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_ucomineq_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_ucomineq_sd(a, b) == 0);
}
test_mm_ucomineq_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvtpd_ps() {
let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, 5.0));
assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, 0.0));
let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, -5.0));
assert_eq_m128(r, _mm_setr_ps(-1.0, -5.0, 0.0, 0.0));
let r = _mm_cvtpd_ps(_mm_setr_pd(f64::MAX, f64::MIN));
assert_eq_m128(r, _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0));
let r = _mm_cvtpd_ps(_mm_setr_pd(f32::MAX as f64, f32::MIN as f64));
assert_eq_m128(r, _mm_setr_ps(f32::MAX, f32::MIN, 0.0, 0.0));
}
test_mm_cvtpd_ps();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvtps_pd() {
let r = _mm_cvtps_pd(_mm_setr_ps(-1.0, 2.0, -3.0, 5.0));
assert_eq_m128d(r, _mm_setr_pd(-1.0, 2.0));
let r = _mm_cvtps_pd(_mm_setr_ps(f32::MAX, f32::INFINITY, f32::NEG_INFINITY, f32::MIN));
assert_eq_m128d(r, _mm_setr_pd(f32::MAX as f64, f64::INFINITY));
}
test_mm_cvtps_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvtpd_epi32() {
let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, 5.0));
assert_eq_m128i(r, _mm_setr_epi32(-1, 5, 0, 0));
let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, -5.0));
assert_eq_m128i(r, _mm_setr_epi32(-1, -5, 0, 0));
let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::MAX, f64::MIN));
assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::INFINITY, f64::NEG_INFINITY));
assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::NAN, f64::NAN));
assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
}
test_mm_cvtpd_epi32();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvttpd_epi32() {
let a = _mm_setr_pd(-1.1, 2.2);
let r = _mm_cvttpd_epi32(a);
assert_eq_m128i(r, _mm_setr_epi32(-1, 2, 0, 0));
let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
let r = _mm_cvttpd_epi32(a);
assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
}
test_mm_cvttpd_epi32();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvtsd_si32() {
let r = _mm_cvtsd_si32(_mm_setr_pd(-2.0, 5.0));
assert_eq!(r, -2);
let r = _mm_cvtsd_si32(_mm_setr_pd(f64::MAX, f64::MIN));
assert_eq!(r, i32::MIN);
let r = _mm_cvtsd_si32(_mm_setr_pd(f64::NAN, f64::NAN));
assert_eq!(r, i32::MIN);
}
test_mm_cvtsd_si32();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvttsd_si32() {
let a = _mm_setr_pd(-1.1, 2.2);
let r = _mm_cvttsd_si32(a);
assert_eq!(r, -1);
let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
let r = _mm_cvttsd_si32(a);
assert_eq!(r, i32::MIN);
}
test_mm_cvttsd_si32();
// Intrinsic only available on x86_64
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvtsd_si64() {
let r = _mm_cvtsd_si64(_mm_setr_pd(-2.0, 5.0));
assert_eq!(r, -2_i64);
let r = _mm_cvtsd_si64(_mm_setr_pd(f64::MAX, f64::MIN));
assert_eq!(r, i64::MIN);
}
#[cfg(target_arch = "x86_64")]
test_mm_cvtsd_si64();
// Intrinsic only available on x86_64
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvttsd_si64() {
let a = _mm_setr_pd(-1.1, 2.2);
let r = _mm_cvttsd_si64(a);
assert_eq!(r, -1_i64);
}
#[cfg(target_arch = "x86_64")]
test_mm_cvttsd_si64();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvtsd_ss() {
let a = _mm_setr_ps(-1.1, -2.2, 3.3, 4.4);
let b = _mm_setr_pd(2.0, -5.0);
let r = _mm_cvtsd_ss(a, b);
assert_eq_m128(r, _mm_setr_ps(2.0, -2.2, 3.3, 4.4));
let a = _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
let b = _mm_setr_pd(f64::INFINITY, -5.0);
let r = _mm_cvtsd_ss(a, b);
assert_eq_m128(
r,
_mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY),
);
}
test_mm_cvtsd_ss();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvtss_sd() {
let a = _mm_setr_pd(-1.1, 2.2);
let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
let r = _mm_cvtss_sd(a, b);
assert_eq_m128d(r, _mm_setr_pd(1.0, 2.2));
let a = _mm_setr_pd(-1.1, f64::INFINITY);
let b = _mm_setr_ps(f32::NEG_INFINITY, 2.0, 3.0, 4.0);
let r = _mm_cvtss_sd(a, b);
assert_eq_m128d(r, _mm_setr_pd(f64::NEG_INFINITY, f64::INFINITY));
}
test_mm_cvtss_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_movemask_pd() {
let r = _mm_movemask_pd(_mm_setr_pd(-1.0, 5.0));
assert_eq!(r, 0b01);
let r = _mm_movemask_pd(_mm_setr_pd(-1.0, -5.0));
assert_eq!(r, 0b11);
}
test_mm_movemask_pd();
}