From 8a5f7f25af35e0b4f006f32dcf5cc1ba9d05826d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Fri, 8 Sep 2023 14:42:40 +0200 Subject: [PATCH 1/3] Fix a few typos in shims/x86/sse.rs comments --- src/tools/miri/src/shims/x86/sse.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/tools/miri/src/shims/x86/sse.rs b/src/tools/miri/src/shims/x86/sse.rs index b18441bb408..62295fa9f49 100644 --- a/src/tools/miri/src/shims/x86/sse.rs +++ b/src/tools/miri/src/shims/x86/sse.rs @@ -78,7 +78,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { unary_op_ss(this, which, op, dest)?; } - // Used to implement _mm_{sqrt,rcp,rsqrt}_ss functions. + // Used to implement _mm_{sqrt,rcp,rsqrt}_ps functions. // Performs the operations on all components of `op`. "sqrt.ps" | "rcp.ps" | "rsqrt.ps" => { let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; @@ -146,7 +146,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { bin_op_ps(this, which, left, right, dest)?; } - // Used to implement _mm_{,u}comi{eq,lt,le,gt,ge,neq}_ps functions. + // Used to implement _mm_{,u}comi{eq,lt,le,gt,ge,neq}_ss functions. // Compares the first component of `left` and `right` and returns // a scalar value (0 or 1). "comieq.ss" | "comilt.ss" | "comile.ss" | "comigt.ss" | "comige.ss" | "comineq.ss" @@ -436,8 +436,8 @@ fn bin_op_ss<'tcx>( Ok(()) } -/// Performs `which` operation on each component of `left`, and -/// `right` storing the result is stored in `dest`. +/// Performs `which` operation on each component of `left` and +/// `right`, storing the result is stored in `dest`. 
fn bin_op_ps<'tcx>( this: &mut crate::MiriInterpCx<'_, 'tcx>, which: FloatBinOp, From 9df62ad5e147f8bb07287f47c845df1d3e51a02c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Tue, 12 Sep 2023 19:44:32 +0200 Subject: [PATCH 2/3] Ignore all archs except x86 and x86_64 in SSE tests --- .../miri/tests/pass/intrinsics-x86-sse.rs | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/tools/miri/tests/pass/intrinsics-x86-sse.rs b/src/tools/miri/tests/pass/intrinsics-x86-sse.rs index 677d7cc030e..9b1ded94b5d 100644 --- a/src/tools/miri/tests/pass/intrinsics-x86-sse.rs +++ b/src/tools/miri/tests/pass/intrinsics-x86-sse.rs @@ -1,5 +1,15 @@ -//@only-target-x86_64 +// Ignore everything except x86 and x86_64 +// Any additional target are added to CI should be ignored here +//@ignore-target-aarch64 +//@ignore-target-arm +//@ignore-target-avr +//@ignore-target-s390x +//@ignore-target-thumbv7em +//@ignore-target-wasm32 +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; use std::f32::NAN; use std::mem::transmute; @@ -987,6 +997,8 @@ unsafe fn test_sse() { } test_mm_cvtsi32_ss(); + // Intrinsic only available on x86_64 + #[cfg(target_arch = "x86_64")] #[target_feature(enable = "sse")] unsafe fn test_mm_cvtss_si64() { let inputs = &[ @@ -1007,8 +1019,11 @@ unsafe fn test_sse() { assert_eq!(e, r, "TestCase #{} _mm_cvtss_si64({:?}) = {}, expected: {}", i, x, r, e); } } + #[cfg(target_arch = "x86_64")] test_mm_cvtss_si64(); + // Intrinsic only available on x86_64 + #[cfg(target_arch = "x86_64")] #[target_feature(enable = "sse")] unsafe fn test_mm_cvttss_si64() { let inputs = &[ @@ -1032,8 +1047,11 @@ unsafe fn test_sse() { assert_eq!(e, r, "TestCase #{} _mm_cvttss_si64({:?}) = {}, expected: {}", i, x, r, e); } } + #[cfg(target_arch = "x86_64")] test_mm_cvttss_si64(); + // Intrinsic only available on x86_64 + #[cfg(target_arch = "x86_64")] 
#[target_feature(enable = "sse")] unsafe fn test_mm_cvtsi64_ss() { let inputs = &[ @@ -1053,6 +1071,7 @@ unsafe fn test_sse() { assert_eq_m128(e, r); } } + #[cfg(target_arch = "x86_64")] test_mm_cvtsi64_ss(); #[target_feature(enable = "sse")] From ab927c87cd319c2bc2e287b131dbeddd4990aa16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Fri, 8 Sep 2023 18:51:22 +0200 Subject: [PATCH 3/3] Implement some `llvm.x86.sse2.*` intrinsics and add tests Implements LLVM intrisics needed to run most SSE2 functions from `core::arch::x86{,_64}`. Also adds miri tests for those functions (mostly copied from core_arch tests). --- src/tools/miri/src/shims/foreign_items.rs | 5 + src/tools/miri/src/shims/x86/mod.rs | 44 + src/tools/miri/src/shims/x86/sse.rs | 61 +- src/tools/miri/src/shims/x86/sse2.rs | 982 ++++++++++++++++++ .../miri/tests/pass/intrinsics-x86-sse2.rs | 828 +++++++++++++++ 5 files changed, 1869 insertions(+), 51 deletions(-) create mode 100644 src/tools/miri/src/shims/x86/sse2.rs create mode 100644 src/tools/miri/tests/pass/intrinsics-x86-sse2.rs diff --git a/src/tools/miri/src/shims/foreign_items.rs b/src/tools/miri/src/shims/foreign_items.rs index 47cbd4419f3..0e8eaf450e4 100644 --- a/src/tools/miri/src/shims/foreign_items.rs +++ b/src/tools/miri/src/shims/foreign_items.rs @@ -1037,6 +1037,11 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { this, link_name, abi, args, dest, ); } + name if name.starts_with("llvm.x86.sse2.") => { + return shims::x86::sse2::EvalContextExt::emulate_x86_sse2_intrinsic( + this, link_name, abi, args, dest, + ); + } // Platform-specific shims _ => diff --git a/src/tools/miri/src/shims/x86/mod.rs b/src/tools/miri/src/shims/x86/mod.rs index 36e673129de..62f5eb1baf7 100644 --- a/src/tools/miri/src/shims/x86/mod.rs +++ b/src/tools/miri/src/shims/x86/mod.rs @@ -1 +1,45 @@ +use crate::InterpResult; + pub(super) mod sse; +pub(super) mod sse2; + +/// Floating point comparison 
operation +/// +/// +/// +/// +/// +#[derive(Copy, Clone)] +enum FloatCmpOp { + Eq, + Lt, + Le, + Unord, + Neq, + /// Not less-than + Nlt, + /// Not less-or-equal + Nle, + /// Ordered, i.e. neither of them is NaN + Ord, +} + +impl FloatCmpOp { + /// Convert from the `imm` argument used to specify the comparison + /// operation in intrinsics such as `llvm.x86.sse.cmp.ss`. + fn from_intrinsic_imm(imm: i8, intrinsic: &str) -> InterpResult<'_, Self> { + match imm { + 0 => Ok(Self::Eq), + 1 => Ok(Self::Lt), + 2 => Ok(Self::Le), + 3 => Ok(Self::Unord), + 4 => Ok(Self::Neq), + 5 => Ok(Self::Nlt), + 6 => Ok(Self::Nle), + 7 => Ok(Self::Ord), + imm => { + throw_unsup_format!("invalid `imm` parameter of {intrinsic}: {imm}"); + } + } + } +} diff --git a/src/tools/miri/src/shims/x86/sse.rs b/src/tools/miri/src/shims/x86/sse.rs index 62295fa9f49..ff4bd369706 100644 --- a/src/tools/miri/src/shims/x86/sse.rs +++ b/src/tools/miri/src/shims/x86/sse.rs @@ -5,6 +5,7 @@ use rustc_target::spec::abi::Abi; use rand::Rng as _; +use super::FloatCmpOp; use crate::*; use shims::foreign_items::EmulateByNameResult; @@ -100,22 +101,10 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { let [left, right, imm] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; - let which = match this.read_scalar(imm)?.to_i8()? 
{ - 0 => FloatBinOp::Cmp(FloatCmpOp::Eq), - 1 => FloatBinOp::Cmp(FloatCmpOp::Lt), - 2 => FloatBinOp::Cmp(FloatCmpOp::Le), - 3 => FloatBinOp::Cmp(FloatCmpOp::Unord), - 4 => FloatBinOp::Cmp(FloatCmpOp::Neq), - 5 => FloatBinOp::Cmp(FloatCmpOp::Nlt), - 6 => FloatBinOp::Cmp(FloatCmpOp::Nle), - 7 => FloatBinOp::Cmp(FloatCmpOp::Ord), - imm => { - throw_unsup_format!( - "invalid 3rd parameter of llvm.x86.sse.cmp.ps: {}", - imm - ); - } - }; + let which = FloatBinOp::Cmp(FloatCmpOp::from_intrinsic_imm( + this.read_scalar(imm)?.to_i8()?, + "llvm.x86.sse.cmp.ss", + )?); bin_op_ss(this, which, left, right, dest)?; } @@ -127,22 +116,10 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { let [left, right, imm] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; - let which = match this.read_scalar(imm)?.to_i8()? { - 0 => FloatBinOp::Cmp(FloatCmpOp::Eq), - 1 => FloatBinOp::Cmp(FloatCmpOp::Lt), - 2 => FloatBinOp::Cmp(FloatCmpOp::Le), - 3 => FloatBinOp::Cmp(FloatCmpOp::Unord), - 4 => FloatBinOp::Cmp(FloatCmpOp::Neq), - 5 => FloatBinOp::Cmp(FloatCmpOp::Nlt), - 6 => FloatBinOp::Cmp(FloatCmpOp::Nle), - 7 => FloatBinOp::Cmp(FloatCmpOp::Ord), - imm => { - throw_unsup_format!( - "invalid 3rd parameter of llvm.x86.sse.cmp.ps: {}", - imm - ); - } - }; + let which = FloatBinOp::Cmp(FloatCmpOp::from_intrinsic_imm( + this.read_scalar(imm)?.to_i8()?, + "llvm.x86.sse.cmp.ps", + )?); bin_op_ps(this, which, left, right, dest)?; } @@ -292,6 +269,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { let op = this.read_scalar(&this.project_index(&op, i)?)?; let op = op.to_u32()?; + // Extract the highest bit of `op` and place it in the `i`-th bit of `res` res |= (op >> 31) << i; } @@ -303,25 +281,6 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { } } -/// Floating point comparison operation -/// -/// -/// -#[derive(Copy, Clone)] -enum FloatCmpOp { - Eq, - Lt, - Le, - Unord, - Neq, - /// 
Not less-than - Nlt, - /// Not less-or-equal - Nle, - /// Ordered, i.e. neither of them is NaN - Ord, -} - #[derive(Copy, Clone)] enum FloatBinOp { /// Arithmetic operation diff --git a/src/tools/miri/src/shims/x86/sse2.rs b/src/tools/miri/src/shims/x86/sse2.rs new file mode 100644 index 00000000000..5b42339e648 --- /dev/null +++ b/src/tools/miri/src/shims/x86/sse2.rs @@ -0,0 +1,982 @@ +use rustc_apfloat::{ + ieee::{Double, Single}, + Float as _, FloatConvert as _, +}; +use rustc_middle::ty::layout::LayoutOf as _; +use rustc_middle::ty::Ty; +use rustc_span::Symbol; +use rustc_target::abi::Size; +use rustc_target::spec::abi::Abi; + +use super::FloatCmpOp; +use crate::*; +use shims::foreign_items::EmulateByNameResult; + +impl<'mir, 'tcx: 'mir> EvalContextExt<'mir, 'tcx> for crate::MiriInterpCx<'mir, 'tcx> {} +pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { + fn emulate_x86_sse2_intrinsic( + &mut self, + link_name: Symbol, + abi: Abi, + args: &[OpTy<'tcx, Provenance>], + dest: &PlaceTy<'tcx, Provenance>, + ) -> InterpResult<'tcx, EmulateByNameResult<'mir, 'tcx>> { + let this = self.eval_context_mut(); + // Prefix should have already been checked. + let unprefixed_name = link_name.as_str().strip_prefix("llvm.x86.sse2.").unwrap(); + + // These intrinsics operate on 128-bit (f32x4, f64x2, i8x16, i16x8, i32x4, i64x2) SIMD + // vectors unless stated otherwise. + // Many intrinsic names are sufixed with "ps" (packed single), "ss" (scalar signle), + // "pd" (packed double) or "sd" (scalar double), where single means single precision + // floating point (f32) and double means double precision floating point (f64). "ps" + // and "pd" means thet the operation is performed on each element of the vector, while + // "ss" and "sd" means that the operation is performed only on the first element, copying + // the remaining elements from the input vector (for binary operations, from the left-hand + // side). 
+ // Intrinsincs sufixed with "epiX" or "epuX" operate with X-bit signed or unsigned + // vectors. + match unprefixed_name { + // Used to implement the _mm_avg_epu8 function. + // Averages packed unsigned 8-bit integers in `left` and `right`. + "pavg.b" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + assert_eq!(dest_len, left_len); + assert_eq!(dest_len, right_len); + + for i in 0..dest_len { + let left = this.read_scalar(&this.project_index(&left, i)?)?.to_u8()?; + let right = this.read_scalar(&this.project_index(&right, i)?)?.to_u8()?; + let dest = this.project_index(&dest, i)?; + + // Values are expanded from u8 to u16, so adds cannot overflow. + let res = u16::from(left) + .checked_add(u16::from(right)) + .unwrap() + .checked_add(1) + .unwrap() + / 2; + this.write_scalar(Scalar::from_u8(res.try_into().unwrap()), &dest)?; + } + } + // Used to implement the _mm_avg_epu16 function. + // Averages packed unsigned 16-bit integers in `left` and `right`. + "pavg.w" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + assert_eq!(dest_len, left_len); + assert_eq!(dest_len, right_len); + + for i in 0..dest_len { + let left = this.read_scalar(&this.project_index(&left, i)?)?.to_u16()?; + let right = this.read_scalar(&this.project_index(&right, i)?)?.to_u16()?; + let dest = this.project_index(&dest, i)?; + + // Values are expanded from u16 to u32, so adds cannot overflow. 
+ let res = u32::from(left) + .checked_add(u32::from(right)) + .unwrap() + .checked_add(1) + .unwrap() + / 2; + this.write_scalar(Scalar::from_u16(res.try_into().unwrap()), &dest)?; + } + } + // Used to implement the _mm_mulhi_epi16 function. + "pmulh.w" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + assert_eq!(dest_len, left_len); + assert_eq!(dest_len, right_len); + + for i in 0..dest_len { + let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?; + let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?; + let dest = this.project_index(&dest, i)?; + + // Values are expanded from i16 to i32, so multiplication cannot overflow. + let res = i32::from(left).checked_mul(i32::from(right)).unwrap() >> 16; + this.write_scalar(Scalar::from_int(res, Size::from_bits(16)), &dest)?; + } + } + // Used to implement the _mm_mulhi_epu16 function. + "pmulhu.w" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + assert_eq!(dest_len, left_len); + assert_eq!(dest_len, right_len); + + for i in 0..dest_len { + let left = this.read_scalar(&this.project_index(&left, i)?)?.to_u16()?; + let right = this.read_scalar(&this.project_index(&right, i)?)?.to_u16()?; + let dest = this.project_index(&dest, i)?; + + // Values are expanded from u16 to u32, so multiplication cannot overflow. + let res = u32::from(left).checked_mul(u32::from(right)).unwrap() >> 16; + this.write_scalar(Scalar::from_u16(res.try_into().unwrap()), &dest)?; + } + } + // Used to implement the _mm_mul_epu32 function. 
+ // Multiplies the the low unsigned 32-bit integers from each packed + // 64-bit element and stores the result as 64-bit unsigned integers. + "pmulu.dq" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + // left and right are u32x4, dest is u64x2 + assert_eq!(left_len, 4); + assert_eq!(right_len, 4); + assert_eq!(dest_len, 2); + + for i in 0..dest_len { + let op_i = i.checked_mul(2).unwrap(); + let left = this.read_scalar(&this.project_index(&left, op_i)?)?.to_u32()?; + let right = this.read_scalar(&this.project_index(&right, op_i)?)?.to_u32()?; + let dest = this.project_index(&dest, i)?; + + // The multiplication will not overflow because stripping the + // operands are expanded from 32-bit to 64-bit. + let res = u64::from(left).checked_mul(u64::from(right)).unwrap(); + this.write_scalar(Scalar::from_u64(res), &dest)?; + } + } + // Used to implement the _mm_sad_epu8 function. + // Computes the absolute differences of packed unsigned 8-bit integers in `a` + // and `b`, then horizontally sum each consecutive 8 differences to produce + // two unsigned 16-bit integers, and pack these unsigned 16-bit integers in + // the low 16 bits of 64-bit elements returned. 
+ // + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8 + "psad.bw" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + // left and right are u8x16, dest is u64x2 + assert_eq!(left_len, right_len); + assert_eq!(left_len, 16); + assert_eq!(dest_len, 2); + + for i in 0..dest_len { + let dest = this.project_index(&dest, i)?; + + let mut res: u16 = 0; + let n = left_len.checked_div(dest_len).unwrap(); + for j in 0..n { + let op_i = j.checked_add(i.checked_mul(n).unwrap()).unwrap(); + let left = this.read_scalar(&this.project_index(&left, op_i)?)?.to_u8()?; + let right = + this.read_scalar(&this.project_index(&right, op_i)?)?.to_u8()?; + + res = res.checked_add(left.abs_diff(right).into()).unwrap(); + } + + this.write_scalar(Scalar::from_u64(res.into()), &dest)?; + } + } + // Used to implement the _mm_{sll,srl,sra}_epi16 functions. + // Shifts 16-bit packed integers in left by the amount in right. + // Both operands are vectors of 16-bit integers. However, right is + // interpreted as a single 64-bit integer (remaining bits are ignored). + // For logic shifts, when right is larger than 15, zero is produced. + // For arithmetic shifts, when right is larger than 15, the sign bit + // is copied to remaining bits. 
+ "psll.w" | "psrl.w" | "psra.w" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + assert_eq!(dest_len, left_len); + assert_eq!(dest_len, right_len); + + enum ShiftOp { + Sll, + Srl, + Sra, + } + let which = match unprefixed_name { + "psll.w" => ShiftOp::Sll, + "psrl.w" => ShiftOp::Srl, + "psra.w" => ShiftOp::Sra, + _ => unreachable!(), + }; + + // Get the 64-bit shift operand and convert it to the type expected + // by checked_{shl,shr} (u32). + // It is ok to saturate the value to u32::MAX because any value + // above 15 will produce the same result. + let shift = extract_first_u64(this, &right)?.try_into().unwrap_or(u32::MAX); + + for i in 0..dest_len { + let left = this.read_scalar(&this.project_index(&left, i)?)?.to_u16()?; + let dest = this.project_index(&dest, i)?; + + let res = match which { + ShiftOp::Sll => left.checked_shl(shift).unwrap_or(0), + ShiftOp::Srl => left.checked_shr(shift).unwrap_or(0), + #[allow(clippy::cast_possible_wrap, clippy::cast_sign_loss)] + ShiftOp::Sra => { + // Convert u16 to i16 to use arithmetic shift + let left = left as i16; + // Copy the sign bit to the remaining bits + left.checked_shr(shift).unwrap_or(left >> 15) as u16 + } + }; + + this.write_scalar(Scalar::from_u16(res), &dest)?; + } + } + // Used to implement the _mm_{sll,srl,sra}_epi32 functions. + // 32-bit equivalent to the shift functions above. 
+ "psll.d" | "psrl.d" | "psra.d" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + assert_eq!(dest_len, left_len); + assert_eq!(dest_len, right_len); + + enum ShiftOp { + Sll, + Srl, + Sra, + } + let which = match unprefixed_name { + "psll.d" => ShiftOp::Sll, + "psrl.d" => ShiftOp::Srl, + "psra.d" => ShiftOp::Sra, + _ => unreachable!(), + }; + + // Get the 64-bit shift operand and convert it to the type expected + // by checked_{shl,shr} (u32). + // It is ok to saturate the value to u32::MAX because any value + // above 31 will produce the same result. + let shift = extract_first_u64(this, &right)?.try_into().unwrap_or(u32::MAX); + + for i in 0..dest_len { + let left = this.read_scalar(&this.project_index(&left, i)?)?.to_u32()?; + let dest = this.project_index(&dest, i)?; + + let res = match which { + ShiftOp::Sll => left.checked_shl(shift).unwrap_or(0), + ShiftOp::Srl => left.checked_shr(shift).unwrap_or(0), + #[allow(clippy::cast_possible_wrap, clippy::cast_sign_loss)] + ShiftOp::Sra => { + // Convert u32 to i32 to use arithmetic shift + let left = left as i32; + // Copy the sign bit to the remaining bits + left.checked_shr(shift).unwrap_or(left >> 31) as u32 + } + }; + + this.write_scalar(Scalar::from_u32(res), &dest)?; + } + } + // Used to implement the _mm_{sll,srl}_epi64 functions. + // 64-bit equivalent to the shift functions above, except _mm_sra_epi64, + // which is not available in SSE2. 
+ "psll.q" | "psrl.q" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + assert_eq!(dest_len, left_len); + assert_eq!(dest_len, right_len); + + enum ShiftOp { + Sll, + Srl, + } + let which = match unprefixed_name { + "psll.q" => ShiftOp::Sll, + "psrl.q" => ShiftOp::Srl, + _ => unreachable!(), + }; + + // Get the 64-bit shift operand and convert it to the type expected + // by checked_{shl,shr} (u32). + // It is ok to saturate the value to u32::MAX because any value + // above 63 will produce the same result. + let shift = this + .read_scalar(&this.project_index(&right, 0)?)? + .to_u64()? + .try_into() + .unwrap_or(u32::MAX); + + for i in 0..dest_len { + let left = this.read_scalar(&this.project_index(&left, i)?)?.to_u64()?; + let dest = this.project_index(&dest, i)?; + + let res = match which { + ShiftOp::Sll => left.checked_shl(shift).unwrap_or(0), + ShiftOp::Srl => left.checked_shr(shift).unwrap_or(0), + }; + + this.write_scalar(Scalar::from_u64(res), &dest)?; + } + } + // Used to implement the _mm_cvtepi32_ps function. + // Converts packed i32 to packed f32. + // FIXME: Can we get rid of this intrinsic and just use simd_as? + "cvtdq2ps" => { + let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (op, op_len) = this.operand_to_simd(op)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + assert_eq!(dest_len, op_len); + + for i in 0..dest_len { + let op = this.read_scalar(&this.project_index(&op, i)?)?.to_i32()?; + let dest = this.project_index(&dest, i)?; + + let res = Scalar::from_f32(Single::from_i128(op.into()).value); + this.write_scalar(res, &dest)?; + } + } + // Used to implement the _mm_cvtps_epi32 and _mm_cvttps_epi32 functions. + // Converts packed f32 to packed i32. 
+ "cvtps2dq" | "cvttps2dq" => { + let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (op, op_len) = this.operand_to_simd(op)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + assert_eq!(dest_len, op_len); + + let rnd = match unprefixed_name { + // "current SSE rounding mode", assume nearest + // https://www.felixcloutier.com/x86/cvtps2dq + "cvtps2dq" => rustc_apfloat::Round::NearestTiesToEven, + // always truncate + // https://www.felixcloutier.com/x86/cvttps2dq + "cvttps2dq" => rustc_apfloat::Round::TowardZero, + _ => unreachable!(), + }; + + for i in 0..dest_len { + let op = this.read_scalar(&this.project_index(&op, i)?)?.to_f32()?; + let dest = this.project_index(&dest, i)?; + + let res = + this.float_to_int_checked(op, dest.layout.ty, rnd).unwrap_or_else(|| { + // Fallback to minimum acording to SSE2 semantics. + Scalar::from_i32(i32::MIN) + }); + this.write_scalar(res, &dest)?; + } + } + // Used to implement the _mm_packs_epi16 function. + // Converts two 16-bit integer vectors to a single 8-bit integer + // vector with signed saturation. 
+ "packsswb.128" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + // left and right are i16x8, dest is i8x16 + assert_eq!(left_len, 8); + assert_eq!(right_len, 8); + assert_eq!(dest_len, 16); + + for i in 0..left_len { + let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?; + let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?; + let left_dest = this.project_index(&dest, i)?; + let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?; + + let left_res = + i8::try_from(left).unwrap_or(if left < 0 { i8::MIN } else { i8::MAX }); + let right_res = + i8::try_from(right).unwrap_or(if right < 0 { i8::MIN } else { i8::MAX }); + + this.write_scalar(Scalar::from_int(left_res, Size::from_bits(8)), &left_dest)?; + this.write_scalar( + Scalar::from_int(right_res, Size::from_bits(8)), + &right_dest, + )?; + } + } + // Used to implement the _mm_packus_epi16 function. + // Converts two 16-bit signed integer vectors to a single 8-bit + // unsigned integer vector with saturation. 
+ "packuswb.128" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + // left and right are i16x8, dest is u8x16 + assert_eq!(left_len, 8); + assert_eq!(right_len, 8); + assert_eq!(dest_len, 16); + + for i in 0..left_len { + let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?; + let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?; + let left_dest = this.project_index(&dest, i)?; + let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?; + + let left_res = u8::try_from(left).unwrap_or(if left < 0 { 0 } else { u8::MAX }); + let right_res = + u8::try_from(right).unwrap_or(if right < 0 { 0 } else { u8::MAX }); + + this.write_scalar(Scalar::from_u8(left_res), &left_dest)?; + this.write_scalar(Scalar::from_u8(right_res), &right_dest)?; + } + } + // Used to implement the _mm_packs_epi32 function. + // Converts two 16-bit integer vectors to a single 8-bit integer + // vector with signed saturation. 
+ "packssdw.128" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + // left and right are i32x4, dest is i16x8 + assert_eq!(left_len, 4); + assert_eq!(right_len, 4); + assert_eq!(dest_len, 8); + + for i in 0..left_len { + let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i32()?; + let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i32()?; + let left_dest = this.project_index(&dest, i)?; + let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?; + + let left_res = + i16::try_from(left).unwrap_or(if left < 0 { i16::MIN } else { i16::MAX }); + let right_res = + i16::try_from(right).unwrap_or(if right < 0 { i16::MIN } else { i16::MAX }); + + this.write_scalar(Scalar::from_int(left_res, Size::from_bits(16)), &left_dest)?; + this.write_scalar( + Scalar::from_int(right_res, Size::from_bits(16)), + &right_dest, + )?; + } + } + // Used to implement _mm_min_sd and _mm_max_sd functions. + // Note that the semantics are a bit different from Rust simd_min + // and simd_max intrinsics regarding handling of NaN and -0.0: Rust + // matches the IEEE min/max operations, while x86 has different + // semantics. + "min.sd" | "max.sd" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let which = match unprefixed_name { + "min.sd" => FloatBinOp::Min, + "max.sd" => FloatBinOp::Max, + _ => unreachable!(), + }; + + bin_op_sd(this, which, left, right, dest)?; + } + // Used to implement _mm_min_pd and _mm_max_pd functions. + // Note that the semantics are a bit different from Rust simd_min + // and simd_max intrinsics regarding handling of NaN and -0.0: Rust + // matches the IEEE min/max operations, while x86 has different + // semantics. 
+ "min.pd" | "max.pd" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let which = match unprefixed_name { + "min.pd" => FloatBinOp::Min, + "max.pd" => FloatBinOp::Max, + _ => unreachable!(), + }; + + bin_op_pd(this, which, left, right, dest)?; + } + // Used to implement _mm_sqrt_sd functions. + // Performs the operations on the first component of `op` and + // copies the remaining components from `op`. + "sqrt.sd" => { + let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (op, op_len) = this.operand_to_simd(op)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + assert_eq!(dest_len, op_len); + + let op0 = this.read_scalar(&this.project_index(&op, 0)?)?.to_u64()?; + // FIXME using host floats + let res0 = Scalar::from_u64(f64::from_bits(op0).sqrt().to_bits()); + this.write_scalar(res0, &this.project_index(&dest, 0)?)?; + + for i in 1..dest_len { + this.copy_op( + &this.project_index(&op, i)?, + &this.project_index(&dest, i)?, + /*allow_transmute*/ false, + )?; + } + } + // Used to implement _mm_sqrt_pd functions. + // Performs the operations on all components of `op`. + "sqrt.pd" => { + let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (op, op_len) = this.operand_to_simd(op)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + assert_eq!(dest_len, op_len); + + for i in 0..dest_len { + let op = this.read_scalar(&this.project_index(&op, i)?)?.to_u64()?; + let dest = this.project_index(&dest, i)?; + + // FIXME using host floats + let res = Scalar::from_u64(f64::from_bits(op).sqrt().to_bits()); + + this.write_scalar(res, &dest)?; + } + } + // Used to implement the _mm_cmp*_sd function. + // Performs a comparison operation on the first component of `left` + // and `right`, returning 0 if false or `u64::MAX` if true. The remaining + // components are copied from `left`. 
+ "cmp.sd" => { + let [left, right, imm] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let which = FloatBinOp::Cmp(FloatCmpOp::from_intrinsic_imm( + this.read_scalar(imm)?.to_i8()?, + "llvm.x86.sse2.cmp.sd", + )?); + + bin_op_sd(this, which, left, right, dest)?; + } + // Used to implement the _mm_cmp*_pd functions. + // Performs a comparison operation on each component of `left` + // and `right`. For each component, returns 0 if false or `u64::MAX` + // if true. + "cmp.pd" => { + let [left, right, imm] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let which = FloatBinOp::Cmp(FloatCmpOp::from_intrinsic_imm( + this.read_scalar(imm)?.to_i8()?, + "llvm.x86.sse2.cmp.pd", + )?); + + bin_op_pd(this, which, left, right, dest)?; + } + // Used to implement _mm_{,u}comi{eq,lt,le,gt,ge,neq}_sd functions. + // Compares the first component of `left` and `right` and returns + // a scalar value (0 or 1). + "comieq.sd" | "comilt.sd" | "comile.sd" | "comigt.sd" | "comige.sd" | "comineq.sd" + | "ucomieq.sd" | "ucomilt.sd" | "ucomile.sd" | "ucomigt.sd" | "ucomige.sd" + | "ucomineq.sd" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + + assert_eq!(left_len, right_len); + + let left = this.read_scalar(&this.project_index(&left, 0)?)?.to_f64()?; + let right = this.read_scalar(&this.project_index(&right, 0)?)?.to_f64()?; + // The difference between the com* and *ucom variants is signaling + // of exceptions when either argument is a quiet NaN. We do not + // support accessing the SSE status register from miri (or from Rust, + // for that matter), so we treat equally both variants. 
+                let res = match unprefixed_name {
+                    "comieq.sd" | "ucomieq.sd" => left == right,
+                    "comilt.sd" | "ucomilt.sd" => left < right,
+                    "comile.sd" | "ucomile.sd" => left <= right,
+                    "comigt.sd" | "ucomigt.sd" => left > right,
+                    "comige.sd" | "ucomige.sd" => left >= right,
+                    "comineq.sd" | "ucomineq.sd" => left != right,
+                    _ => unreachable!(),
+                };
+                this.write_scalar(Scalar::from_i32(i32::from(res)), dest)?;
+            }
+            // Used to implement the _mm_cvtpd_ps function.
+            // Converts packed f64 to packed f32.
+            "cvtpd2ps" => {
+                let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                let (op, op_len) = this.operand_to_simd(op)?;
+                let (dest, dest_len) = this.place_to_simd(dest)?;
+
+                // op is f64x2, dest is f32x4
+                assert_eq!(op_len, 2);
+                assert_eq!(dest_len, 4);
+
+                for i in 0..op_len {
+                    let op = this.read_scalar(&this.project_index(&op, i)?)?.to_f64()?;
+                    let dest = this.project_index(&dest, i)?;
+
+                    let res = op.convert(/*loses_info*/ &mut false).value;
+                    this.write_scalar(Scalar::from_f32(res), &dest)?;
+                }
+                // Fill the remaining with zeros
+                for i in op_len..dest_len {
+                    let dest = this.project_index(&dest, i)?;
+                    this.write_scalar(Scalar::from_u32(0), &dest)?;
+                }
+            }
+            // Used to implement the _mm_cvtps_pd function.
+            // Converts packed f32 to packed f64.
+            "cvtps2pd" => {
+                let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                let (op, op_len) = this.operand_to_simd(op)?;
+                let (dest, dest_len) = this.place_to_simd(dest)?;
+
+                // op is f32x4, dest is f64x2
+                assert_eq!(op_len, 4);
+                assert_eq!(dest_len, 2);
+
+                for i in 0..dest_len {
+                    let op = this.read_scalar(&this.project_index(&op, i)?)?.to_f32()?;
+                    let dest = this.project_index(&dest, i)?;
+
+                    let res = op.convert(/*loses_info*/ &mut false).value;
+                    this.write_scalar(Scalar::from_f64(res), &dest)?;
+                }
+                // the two remaining f32 are ignored
+            }
+            // Used to implement the _mm_cvtpd_epi32 and _mm_cvttpd_epi32 functions.
+            // Converts packed f64 to packed i32.
+ "cvtpd2dq" | "cvttpd2dq" => { + let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (op, op_len) = this.operand_to_simd(op)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + // op is f64x2, dest is i32x4 + assert_eq!(op_len, 2); + assert_eq!(dest_len, 4); + + let rnd = match unprefixed_name { + // "current SSE rounding mode", assume nearest + // https://www.felixcloutier.com/x86/cvtpd2dq + "cvtpd2dq" => rustc_apfloat::Round::NearestTiesToEven, + // always truncate + // https://www.felixcloutier.com/x86/cvttpd2dq + "cvttpd2dq" => rustc_apfloat::Round::TowardZero, + _ => unreachable!(), + }; + + for i in 0..op_len { + let op = this.read_scalar(&this.project_index(&op, i)?)?.to_f64()?; + let dest = this.project_index(&dest, i)?; + + let res = + this.float_to_int_checked(op, dest.layout.ty, rnd).unwrap_or_else(|| { + // Fallback to minimum acording to SSE2 semantics. + Scalar::from_i32(i32::MIN) + }); + this.write_scalar(res, &dest)?; + } + // Fill the remaining with zeros + for i in op_len..dest_len { + let dest = this.project_index(&dest, i)?; + this.write_scalar(Scalar::from_i32(0), &dest)?; + } + } + // Use to implement the _mm_cvtsd_si32 and _mm_cvttsd_si32 functions. + // Converts the first component of `op` from f64 to i32. + "cvtsd2si" | "cvttsd2si" => { + let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + let (op, _) = this.operand_to_simd(op)?; + + let op = this.read_scalar(&this.project_index(&op, 0)?)?.to_f64()?; + + let rnd = match unprefixed_name { + // "current SSE rounding mode", assume nearest + // https://www.felixcloutier.com/x86/cvtsd2si + "cvtsd2si" => rustc_apfloat::Round::NearestTiesToEven, + // always truncate + // https://www.felixcloutier.com/x86/cvttsd2si + "cvttsd2si" => rustc_apfloat::Round::TowardZero, + _ => unreachable!(), + }; + + let res = this.float_to_int_checked(op, dest.layout.ty, rnd).unwrap_or_else(|| { + // Fallback to minimum acording to SSE semantics. 
+                    Scalar::from_i32(i32::MIN)
+                });
+
+                this.write_scalar(res, dest)?;
+            }
+            // Used to implement the _mm_cvtsd_si64 and _mm_cvttsd_si64 functions.
+            // Converts the first component of `op` from f64 to i64.
+            "cvtsd2si64" | "cvttsd2si64" => {
+                let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+                let (op, _) = this.operand_to_simd(op)?;
+
+                let op = this.read_scalar(&this.project_index(&op, 0)?)?.to_f64()?;
+
+                let rnd = match unprefixed_name {
+                    // "current SSE rounding mode", assume nearest
+                    // https://www.felixcloutier.com/x86/cvtsd2si
+                    "cvtsd2si64" => rustc_apfloat::Round::NearestTiesToEven,
+                    // always truncate
+                    // https://www.felixcloutier.com/x86/cvttsd2si
+                    "cvttsd2si64" => rustc_apfloat::Round::TowardZero,
+                    _ => unreachable!(),
+                };
+
+                let res = this.float_to_int_checked(op, dest.layout.ty, rnd).unwrap_or_else(|| {
+                    // Fallback to minimum according to SSE semantics.
+                    Scalar::from_i64(i64::MIN)
+                });
+
+                this.write_scalar(res, dest)?;
+            }
+            // Used to implement the _mm_cvtsd_ss and _mm_cvtss_sd functions.
+            // Converts the first f64/f32 from `right` to f32/f64 and copies
+            // the remaining elements from `left`
+            "cvtsd2ss" | "cvtss2sd" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                let (left, left_len) = this.operand_to_simd(left)?;
+                let (right, _) = this.operand_to_simd(right)?;
+                let (dest, dest_len) = this.place_to_simd(dest)?;
+
+                assert_eq!(dest_len, left_len);
+
+                // Convert first element of `right`
+                let right0 = this.read_immediate(&this.project_index(&right, 0)?)?;
+                let dest0 = this.project_index(&dest, 0)?;
+                // `float_to_float_or_int` here will convert from f64 to f32 (cvtsd2ss) or
+                // from f32 to f64 (cvtss2sd).
+                let res0 = this.float_to_float_or_int(&right0, dest0.layout.ty)?;
+                this.write_immediate(res0, &dest0)?;
+
+                // Copy remaining from `left`
+                for i in 1..dest_len {
+                    this.copy_op(
+                        &this.project_index(&left, i)?,
+                        &this.project_index(&dest, i)?,
+                        /*allow_transmute*/ false,
+                    )?;
+                }
+            }
+            // Used to implement the _mm_movemask_pd function.
+            // Returns a scalar integer where the i-th bit is the highest
+            // bit of the i-th component of `op`.
+            // https://www.felixcloutier.com/x86/movmskpd
+            "movmsk.pd" => {
+                let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+                let (op, op_len) = this.operand_to_simd(op)?;
+
+                let mut res = 0;
+                for i in 0..op_len {
+                    let op = this.read_scalar(&this.project_index(&op, i)?)?;
+                    let op = op.to_u64()?;
+
+                    // Extract the highest bit of `op` and place it in the `i`-th bit of `res`
+                    res |= (op >> 63) << i;
+                }
+
+                this.write_scalar(Scalar::from_u32(res.try_into().unwrap()), dest)?;
+            }
+            _ => return Ok(EmulateByNameResult::NotSupported),
+        }
+        Ok(EmulateByNameResult::NeedsJumping)
+    }
+}
+
+/// Takes a 128-bit vector, transmutes it to `[u64; 2]` and extracts
+/// the first value.
+fn extract_first_u64<'tcx>(
+    this: &crate::MiriInterpCx<'_, 'tcx>,
+    op: &MPlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, u64> {
+    // Transmute vector to `[u64; 2]`
+    let u64_array_layout = this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u64, 2))?;
+    let op = op.transmute(u64_array_layout, this)?;
+
+    // Get the first u64 from the array
+    this.read_scalar(&this.project_index(&op, 0)?)?.to_u64()
+}
+
+#[derive(Copy, Clone)]
+enum FloatBinOp {
+    /// Comparison
+    Cmp(FloatCmpOp),
+    /// Minimum value (with SSE semantics)
+    ///
+    /// <https://www.felixcloutier.com/x86/minsd>
+    /// <https://www.felixcloutier.com/x86/minpd>
+    Min,
+    /// Maximum value (with SSE semantics)
+    ///
+    /// <https://www.felixcloutier.com/x86/maxsd>
+    /// <https://www.felixcloutier.com/x86/maxpd>
+    Max,
+}
+
+/// Performs `which` scalar operation on `left` and `right` and returns
+/// the result.
+// FIXME make this generic over apfloat type to reduce code duplication with bin_op_f32
+fn bin_op_f64<'tcx>(
+    which: FloatBinOp,
+    left: &ImmTy<'tcx, Provenance>,
+    right: &ImmTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, Scalar<Provenance>> {
+    match which {
+        FloatBinOp::Cmp(which) => {
+            let left = left.to_scalar().to_f64()?;
+            let right = right.to_scalar().to_f64()?;
+            // FIXME: Make sure that these operations match the semantics of cmppd
+            let res = match which {
+                FloatCmpOp::Eq => left == right,
+                FloatCmpOp::Lt => left < right,
+                FloatCmpOp::Le => left <= right,
+                FloatCmpOp::Unord => left.is_nan() || right.is_nan(),
+                FloatCmpOp::Neq => left != right,
+                FloatCmpOp::Nlt => !(left < right),
+                FloatCmpOp::Nle => !(left <= right),
+                FloatCmpOp::Ord => !left.is_nan() && !right.is_nan(),
+            };
+            Ok(Scalar::from_u64(if res { u64::MAX } else { 0 }))
+        }
+        FloatBinOp::Min => {
+            let left = left.to_scalar().to_f64()?;
+            let right = right.to_scalar().to_f64()?;
+            // SSE semantics to handle zero and NaN. Note that `x == Double::ZERO`
+            // is true when `x` is either +0 or -0.
+            if (left == Double::ZERO && right == Double::ZERO)
+                || left.is_nan()
+                || right.is_nan()
+                || left >= right
+            {
+                Ok(Scalar::from_f64(right))
+            } else {
+                Ok(Scalar::from_f64(left))
+            }
+        }
+        FloatBinOp::Max => {
+            let left = left.to_scalar().to_f64()?;
+            let right = right.to_scalar().to_f64()?;
+            // SSE semantics to handle zero and NaN. Note that `x == Double::ZERO`
+            // is true when `x` is either +0 or -0.
+            if (left == Double::ZERO && right == Double::ZERO)
+                || left.is_nan()
+                || right.is_nan()
+                || left <= right
+            {
+                Ok(Scalar::from_f64(right))
+            } else {
+                Ok(Scalar::from_f64(left))
+            }
+        }
+    }
+}
+
+/// Performs `which` operation on the first component of `left` and `right`
+/// and copies the other components from `left`. The result is stored in `dest`.
+fn bin_op_sd<'tcx>(
+    this: &mut crate::MiriInterpCx<'_, 'tcx>,
+    which: FloatBinOp,
+    left: &OpTy<'tcx, Provenance>,
+    right: &OpTy<'tcx, Provenance>,
+    dest: &PlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+    let (left, left_len) = this.operand_to_simd(left)?;
+    let (right, right_len) = this.operand_to_simd(right)?;
+    let (dest, dest_len) = this.place_to_simd(dest)?;
+
+    assert_eq!(dest_len, left_len);
+    assert_eq!(dest_len, right_len);
+
+    let res0 = bin_op_f64(
+        which,
+        &this.read_immediate(&this.project_index(&left, 0)?)?,
+        &this.read_immediate(&this.project_index(&right, 0)?)?,
+    )?;
+    this.write_scalar(res0, &this.project_index(&dest, 0)?)?;
+
+    for i in 1..dest_len {
+        this.copy_op(
+            &this.project_index(&left, i)?,
+            &this.project_index(&dest, i)?,
+            /*allow_transmute*/ false,
+        )?;
+    }
+
+    Ok(())
+}
+
+/// Performs `which` operation on each component of `left` and
+/// `right`, storing the result in `dest`.
+fn bin_op_pd<'tcx>(
+    this: &mut crate::MiriInterpCx<'_, 'tcx>,
+    which: FloatBinOp,
+    left: &OpTy<'tcx, Provenance>,
+    right: &OpTy<'tcx, Provenance>,
+    dest: &PlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+    let (left, left_len) = this.operand_to_simd(left)?;
+    let (right, right_len) = this.operand_to_simd(right)?;
+    let (dest, dest_len) = this.place_to_simd(dest)?;
+
+    assert_eq!(dest_len, left_len);
+    assert_eq!(dest_len, right_len);
+
+    for i in 0..dest_len {
+        let left = this.read_immediate(&this.project_index(&left, i)?)?;
+        let right = this.read_immediate(&this.project_index(&right, i)?)?;
+        let dest = this.project_index(&dest, i)?;
+
+        let res = bin_op_f64(which, &left, &right)?;
+        this.write_scalar(res, &dest)?;
+    }
+
+    Ok(())
+}
diff --git a/src/tools/miri/tests/pass/intrinsics-x86-sse2.rs b/src/tools/miri/tests/pass/intrinsics-x86-sse2.rs
new file mode 100644
index 00000000000..1b55a94783a
--- /dev/null
+++ b/src/tools/miri/tests/pass/intrinsics-x86-sse2.rs
@@ -0,0 +1,828 @@
+// Ignore everything except
x86 and x86_64 +// Any additional target are added to CI should be ignored here +//@ignore-target-aarch64 +//@ignore-target-arm +//@ignore-target-avr +//@ignore-target-s390x +//@ignore-target-thumbv7em +//@ignore-target-wasm32 + +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; +use std::f64::NAN; +use std::mem::transmute; + +fn main() { + assert!(is_x86_feature_detected!("sse2")); + + unsafe { + test_sse2(); + } +} + +#[target_feature(enable = "sse2")] +unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i { + _mm_set_epi64x(b, a) +} + +#[target_feature(enable = "sse2")] +unsafe fn test_sse2() { + // Mostly copied from library/stdarch/crates/core_arch/src/x86{,_64}/sse2.rs + + unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i { + _mm_set_epi64x(b, a) + } + + #[track_caller] + #[target_feature(enable = "sse")] + unsafe fn assert_eq_m128(a: __m128, b: __m128) { + let r = _mm_cmpeq_ps(a, b); + if _mm_movemask_ps(r) != 0b1111 { + panic!("{:?} != {:?}", a, b); + } + } + + #[track_caller] + #[target_feature(enable = "sse2")] + unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) { + assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b)) + } + + #[track_caller] + #[target_feature(enable = "sse2")] + unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) { + if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 { + panic!("{:?} != {:?}", a, b); + } + } + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_avg_epu8() { + let (a, b) = (_mm_set1_epi8(3), _mm_set1_epi8(9)); + let r = _mm_avg_epu8(a, b); + assert_eq_m128i(r, _mm_set1_epi8(6)); + } + test_mm_avg_epu8(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_avg_epu16() { + let (a, b) = (_mm_set1_epi16(3), _mm_set1_epi16(9)); + let r = _mm_avg_epu16(a, b); + assert_eq_m128i(r, _mm_set1_epi16(6)); + } + test_mm_avg_epu16(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_mulhi_epi16() { + let (a, b) = (_mm_set1_epi16(1000), 
_mm_set1_epi16(-1001)); + let r = _mm_mulhi_epi16(a, b); + assert_eq_m128i(r, _mm_set1_epi16(-16)); + } + test_mm_mulhi_epi16(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_mulhi_epu16() { + let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(1001)); + let r = _mm_mulhi_epu16(a, b); + assert_eq_m128i(r, _mm_set1_epi16(15)); + } + test_mm_mulhi_epu16(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_mul_epu32() { + let a = _mm_setr_epi64x(1_000_000_000, 1 << 34); + let b = _mm_setr_epi64x(1_000_000_000, 1 << 35); + let r = _mm_mul_epu32(a, b); + let e = _mm_setr_epi64x(1_000_000_000 * 1_000_000_000, 0); + assert_eq_m128i(r, e); + } + test_mm_mul_epu32(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_sad_epu8() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8, + 1, 2, 3, 4, + 155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8, + 1, 2, 3, 4, + ); + let b = _mm_setr_epi8(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2); + let r = _mm_sad_epu8(a, b); + let e = _mm_setr_epi64x(1020, 614); + assert_eq_m128i(r, e); + } + test_mm_sad_epu8(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_sll_epi16() { + let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF); + let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i( + r, + _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0), + ); + let r = _mm_sll_epi16(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 16)); + assert_eq_m128i(r, _mm_set1_epi16(0)); + let r = _mm_sll_epi16(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi16(0)); + } + test_mm_sll_epi16(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_srl_epi16() { + let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF); + let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i(r, _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 
0xE, 0xFF1, 0xF, 0xFF0)); + let r = _mm_srl_epi16(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 16)); + assert_eq_m128i(r, _mm_set1_epi16(0)); + let r = _mm_srl_epi16(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi16(0)); + } + test_mm_srl_epi16(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_sra_epi16() { + let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF); + let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i(r, _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10)); + let r = _mm_sra_epi16(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 16)); + assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1)); + let r = _mm_sra_epi16(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1)); + } + test_mm_sra_epi16(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_sll_epi32() { + let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0)); + let r = _mm_sll_epi32(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 32)); + assert_eq_m128i(r, _mm_set1_epi32(0)); + let r = _mm_sll_epi32(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi32(0)); + } + test_mm_sll_epi32(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_srl_epi32() { + let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000)); + let r = _mm_srl_epi32(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 32)); + assert_eq_m128i(r, _mm_set1_epi32(0)); + let r = _mm_srl_epi32(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, 
_mm_set1_epi32(0)); + } + test_mm_srl_epi32(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_sra_epi32() { + let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000)); + let r = _mm_sra_epi32(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 32)); + assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1)); + let r = _mm_sra_epi32(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1)); + } + test_mm_sra_epi32(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_sll_epi64() { + let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF); + let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0)); + let r = _mm_sll_epi64(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 64)); + assert_eq_m128i(r, _mm_set1_epi64x(0)); + let r = _mm_sll_epi64(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi64x(0)); + } + test_mm_sll_epi64(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_srl_epi64() { + let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF); + let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000)); + let r = _mm_srl_epi64(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 64)); + assert_eq_m128i(r, _mm_set1_epi64x(0)); + let r = _mm_srl_epi64(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi64x(0)); + } + test_mm_srl_epi64(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cvtepi32_ps() { + let a = _mm_setr_epi32(1, 2, 3, 4); + let r = _mm_cvtepi32_ps(a); + assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0)); + } + test_mm_cvtepi32_ps(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cvtps_epi32() { + let a = 
_mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let r = _mm_cvtps_epi32(a); + assert_eq_m128i(r, _mm_setr_epi32(1, 2, 3, 4)); + } + test_mm_cvtps_epi32(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cvttps_epi32() { + let a = _mm_setr_ps(-1.1, 2.2, -3.3, 6.6); + let r = _mm_cvttps_epi32(a); + assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6)); + + let a = _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX); + let r = _mm_cvttps_epi32(a); + assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, i32::MIN, i32::MIN)); + } + test_mm_cvttps_epi32(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_packs_epi16() { + let a = _mm_setr_epi16(0x80, -0x81, 0, 0, 0, 0, 0, 0); + let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -0x81, 0x80); + let r = _mm_packs_epi16(a, b); + assert_eq_m128i( + r, + _mm_setr_epi8(0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F), + ); + } + test_mm_packs_epi16(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_packus_epi16() { + let a = _mm_setr_epi16(0x100, -1, 0, 0, 0, 0, 0, 0); + let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -1, 0x100); + let r = _mm_packus_epi16(a, b); + assert_eq_m128i(r, _mm_setr_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, !0)); + } + test_mm_packus_epi16(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_packs_epi32() { + let a = _mm_setr_epi32(0x8000, -0x8001, 0, 0); + let b = _mm_setr_epi32(0, 0, -0x8001, 0x8000); + let r = _mm_packs_epi32(a, b); + assert_eq_m128i(r, _mm_setr_epi16(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF)); + } + test_mm_packs_epi32(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_min_sd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(5.0, 10.0); + let r = _mm_min_sd(a, b); + assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0)); + } + test_mm_min_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_min_pd() { + let a = _mm_setr_pd(-1.0, 5.0); + let b = _mm_setr_pd(-100.0, 20.0); + let r = _mm_min_pd(a, b); + 
assert_eq_m128d(r, _mm_setr_pd(-100.0, 5.0)); + + // `_mm_min_pd` can **not** be implemented using the `simd_min` rust intrinsic because + // the semantics of `simd_min` are different to those of `_mm_min_pd` regarding handling + // of `-0.0`. + let a = _mm_setr_pd(-0.0, 0.0); + let b = _mm_setr_pd(0.0, 0.0); + let r1: [u8; 16] = transmute(_mm_min_pd(a, b)); + let r2: [u8; 16] = transmute(_mm_min_pd(b, a)); + let a: [u8; 16] = transmute(a); + let b: [u8; 16] = transmute(b); + assert_eq!(r1, b); + assert_eq!(r2, a); + assert_ne!(a, b); // sanity check that -0.0 is actually present + } + test_mm_min_pd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_max_sd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(5.0, 10.0); + let r = _mm_max_sd(a, b); + assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0)); + } + test_mm_max_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_max_pd() { + let a = _mm_setr_pd(-1.0, 5.0); + let b = _mm_setr_pd(-100.0, 20.0); + let r = _mm_max_pd(a, b); + assert_eq_m128d(r, _mm_setr_pd(-1.0, 20.0)); + + // `_mm_max_pd` can **not** be implemented using the `simd_max` rust intrinsic because + // the semantics of `simd_max` are different to those of `_mm_max_pd` regarding handling + // of `-0.0`. 
+ let a = _mm_setr_pd(-0.0, 0.0); + let b = _mm_setr_pd(0.0, 0.0); + let r1: [u8; 16] = transmute(_mm_max_pd(a, b)); + let r2: [u8; 16] = transmute(_mm_max_pd(b, a)); + let a: [u8; 16] = transmute(a); + let b: [u8; 16] = transmute(b); + assert_eq!(r1, b); + assert_eq!(r2, a); + assert_ne!(a, b); // sanity check that -0.0 is actually present + } + test_mm_max_pd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_sqrt_sd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(5.0, 10.0); + let r = _mm_sqrt_sd(a, b); + assert_eq_m128d(r, _mm_setr_pd(5.0f64.sqrt(), 2.0)); + } + test_mm_sqrt_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_sqrt_pd() { + let r = _mm_sqrt_pd(_mm_setr_pd(1.0, 2.0)); + assert_eq_m128d(r, _mm_setr_pd(1.0f64.sqrt(), 2.0f64.sqrt())); + } + test_mm_sqrt_pd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmpeq_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(!0, transmute(2.0f64)); + let r = transmute::<_, __m128i>(_mm_cmpeq_sd(a, b)); + assert_eq_m128i(r, e); + } + test_mm_cmpeq_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmplt_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0)); + let e = _mm_setr_epi64x(!0, transmute(2.0f64)); + let r = transmute::<_, __m128i>(_mm_cmplt_sd(a, b)); + assert_eq_m128i(r, e); + } + test_mm_cmplt_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmple_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(!0, transmute(2.0f64)); + let r = transmute::<_, __m128i>(_mm_cmple_sd(a, b)); + assert_eq_m128i(r, e); + } + test_mm_cmple_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmpgt_sd() { + let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(!0, transmute(2.0f64)); + let r = transmute::<_, __m128i>(_mm_cmpgt_sd(a, b)); + assert_eq_m128i(r, e); + } + test_mm_cmpgt_sd(); + + 
#[target_feature(enable = "sse2")] + unsafe fn test_mm_cmpge_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(!0, transmute(2.0f64)); + let r = transmute::<_, __m128i>(_mm_cmpge_sd(a, b)); + assert_eq_m128i(r, e); + } + test_mm_cmpge_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmpord_sd() { + let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0)); + let e = _mm_setr_epi64x(0, transmute(2.0f64)); + let r = transmute::<_, __m128i>(_mm_cmpord_sd(a, b)); + assert_eq_m128i(r, e); + } + test_mm_cmpord_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmpunord_sd() { + let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0)); + let e = _mm_setr_epi64x(!0, transmute(2.0f64)); + let r = transmute::<_, __m128i>(_mm_cmpunord_sd(a, b)); + assert_eq_m128i(r, e); + } + test_mm_cmpunord_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmpneq_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0)); + let e = _mm_setr_epi64x(!0, transmute(2.0f64)); + let r = transmute::<_, __m128i>(_mm_cmpneq_sd(a, b)); + assert_eq_m128i(r, e); + } + test_mm_cmpneq_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmpnlt_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0)); + let e = _mm_setr_epi64x(0, transmute(2.0f64)); + let r = transmute::<_, __m128i>(_mm_cmpnlt_sd(a, b)); + assert_eq_m128i(r, e); + } + test_mm_cmpnlt_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmpnle_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(0, transmute(2.0f64)); + let r = transmute::<_, __m128i>(_mm_cmpnle_sd(a, b)); + assert_eq_m128i(r, e); + } + test_mm_cmpnle_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmpngt_sd() { + let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(0, transmute(2.0f64)); + let r = transmute::<_, __m128i>(_mm_cmpngt_sd(a, 
b)); + assert_eq_m128i(r, e); + } + test_mm_cmpngt_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmpnge_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(0, transmute(2.0f64)); + let r = transmute::<_, __m128i>(_mm_cmpnge_sd(a, b)); + assert_eq_m128i(r, e); + } + test_mm_cmpnge_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmpeq_pd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(!0, 0); + let r = transmute::<_, __m128i>(_mm_cmpeq_pd(a, b)); + assert_eq_m128i(r, e); + } + test_mm_cmpeq_pd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmplt_pd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(0, !0); + let r = transmute::<_, __m128i>(_mm_cmplt_pd(a, b)); + assert_eq_m128i(r, e); + } + test_mm_cmplt_pd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmple_pd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(!0, !0); + let r = transmute::<_, __m128i>(_mm_cmple_pd(a, b)); + assert_eq_m128i(r, e); + } + test_mm_cmple_pd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmpgt_pd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(0, 0); + let r = transmute::<_, __m128i>(_mm_cmpgt_pd(a, b)); + assert_eq_m128i(r, e); + } + test_mm_cmpgt_pd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmpge_pd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(!0, 0); + let r = transmute::<_, __m128i>(_mm_cmpge_pd(a, b)); + assert_eq_m128i(r, e); + } + test_mm_cmpge_pd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmpord_pd() { + let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0)); + let e = _mm_setr_epi64x(0, !0); + let r = transmute::<_, __m128i>(_mm_cmpord_pd(a, b)); + assert_eq_m128i(r, e); + } + 
test_mm_cmpord_pd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmpunord_pd() { + let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0)); + let e = _mm_setr_epi64x(!0, 0); + let r = transmute::<_, __m128i>(_mm_cmpunord_pd(a, b)); + assert_eq_m128i(r, e); + } + test_mm_cmpunord_pd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmpneq_pd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0)); + let e = _mm_setr_epi64x(!0, !0); + let r = transmute::<_, __m128i>(_mm_cmpneq_pd(a, b)); + assert_eq_m128i(r, e); + } + test_mm_cmpneq_pd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmpnlt_pd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0)); + let e = _mm_setr_epi64x(0, 0); + let r = transmute::<_, __m128i>(_mm_cmpnlt_pd(a, b)); + assert_eq_m128i(r, e); + } + test_mm_cmpnlt_pd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmpnle_pd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(0, 0); + let r = transmute::<_, __m128i>(_mm_cmpnle_pd(a, b)); + assert_eq_m128i(r, e); + } + test_mm_cmpnle_pd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmpngt_pd() { + let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(0, !0); + let r = transmute::<_, __m128i>(_mm_cmpngt_pd(a, b)); + assert_eq_m128i(r, e); + } + test_mm_cmpngt_pd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cmpnge_pd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + let e = _mm_setr_epi64x(0, !0); + let r = transmute::<_, __m128i>(_mm_cmpnge_pd(a, b)); + assert_eq_m128i(r, e); + } + test_mm_cmpnge_pd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_comieq_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_comieq_sd(a, b) != 0); + + let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_comieq_sd(a, b) == 0); + } + test_mm_comieq_sd(); + + 
#[target_feature(enable = "sse2")] + unsafe fn test_mm_comilt_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_comilt_sd(a, b) == 0); + } + test_mm_comilt_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_comile_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_comile_sd(a, b) != 0); + } + test_mm_comile_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_comigt_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_comigt_sd(a, b) == 0); + } + test_mm_comigt_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_comige_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_comige_sd(a, b) != 0); + } + test_mm_comige_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_comineq_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_comineq_sd(a, b) == 0); + } + test_mm_comineq_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_ucomieq_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_ucomieq_sd(a, b) != 0); + + let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(NAN, 3.0)); + assert!(_mm_ucomieq_sd(a, b) == 0); + } + test_mm_ucomieq_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_ucomilt_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_ucomilt_sd(a, b) == 0); + } + test_mm_ucomilt_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_ucomile_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_ucomile_sd(a, b) != 0); + } + test_mm_ucomile_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_ucomigt_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_ucomigt_sd(a, b) == 0); + } + test_mm_ucomigt_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_ucomige_sd() { + let (a, 
b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_ucomige_sd(a, b) != 0); + } + test_mm_ucomige_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_ucomineq_sd() { + let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); + assert!(_mm_ucomineq_sd(a, b) == 0); + } + test_mm_ucomineq_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cvtpd_ps() { + let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, 5.0)); + assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, 0.0)); + + let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, -5.0)); + assert_eq_m128(r, _mm_setr_ps(-1.0, -5.0, 0.0, 0.0)); + + let r = _mm_cvtpd_ps(_mm_setr_pd(f64::MAX, f64::MIN)); + assert_eq_m128(r, _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0)); + + let r = _mm_cvtpd_ps(_mm_setr_pd(f32::MAX as f64, f32::MIN as f64)); + assert_eq_m128(r, _mm_setr_ps(f32::MAX, f32::MIN, 0.0, 0.0)); + } + test_mm_cvtpd_ps(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cvtps_pd() { + let r = _mm_cvtps_pd(_mm_setr_ps(-1.0, 2.0, -3.0, 5.0)); + assert_eq_m128d(r, _mm_setr_pd(-1.0, 2.0)); + + let r = _mm_cvtps_pd(_mm_setr_ps(f32::MAX, f32::INFINITY, f32::NEG_INFINITY, f32::MIN)); + assert_eq_m128d(r, _mm_setr_pd(f32::MAX as f64, f64::INFINITY)); + } + test_mm_cvtps_pd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cvtpd_epi32() { + let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, 5.0)); + assert_eq_m128i(r, _mm_setr_epi32(-1, 5, 0, 0)); + + let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, -5.0)); + assert_eq_m128i(r, _mm_setr_epi32(-1, -5, 0, 0)); + + let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::MAX, f64::MIN)); + assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0)); + + let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::INFINITY, f64::NEG_INFINITY)); + assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0)); + + let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::NAN, f64::NAN)); + assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0)); + } + test_mm_cvtpd_epi32(); + + 
#[target_feature(enable = "sse2")] + unsafe fn test_mm_cvttpd_epi32() { + let a = _mm_setr_pd(-1.1, 2.2); + let r = _mm_cvttpd_epi32(a); + assert_eq_m128i(r, _mm_setr_epi32(-1, 2, 0, 0)); + + let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN); + let r = _mm_cvttpd_epi32(a); + assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0)); + } + test_mm_cvttpd_epi32(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cvtsd_si32() { + let r = _mm_cvtsd_si32(_mm_setr_pd(-2.0, 5.0)); + assert_eq!(r, -2); + + let r = _mm_cvtsd_si32(_mm_setr_pd(f64::MAX, f64::MIN)); + assert_eq!(r, i32::MIN); + + let r = _mm_cvtsd_si32(_mm_setr_pd(f64::NAN, f64::NAN)); + assert_eq!(r, i32::MIN); + } + test_mm_cvtsd_si32(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cvttsd_si32() { + let a = _mm_setr_pd(-1.1, 2.2); + let r = _mm_cvttsd_si32(a); + assert_eq!(r, -1); + + let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN); + let r = _mm_cvttsd_si32(a); + assert_eq!(r, i32::MIN); + } + test_mm_cvttsd_si32(); + + // Intrinsic only available on x86_64 + #[cfg(target_arch = "x86_64")] + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cvtsd_si64() { + let r = _mm_cvtsd_si64(_mm_setr_pd(-2.0, 5.0)); + assert_eq!(r, -2_i64); + + let r = _mm_cvtsd_si64(_mm_setr_pd(f64::MAX, f64::MIN)); + assert_eq!(r, i64::MIN); + } + #[cfg(target_arch = "x86_64")] + test_mm_cvtsd_si64(); + + // Intrinsic only available on x86_64 + #[cfg(target_arch = "x86_64")] + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cvttsd_si64() { + let a = _mm_setr_pd(-1.1, 2.2); + let r = _mm_cvttsd_si64(a); + assert_eq!(r, -1_i64); + } + #[cfg(target_arch = "x86_64")] + test_mm_cvttsd_si64(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cvtsd_ss() { + let a = _mm_setr_ps(-1.1, -2.2, 3.3, 4.4); + let b = _mm_setr_pd(2.0, -5.0); + + let r = _mm_cvtsd_ss(a, b); + + assert_eq_m128(r, _mm_setr_ps(2.0, -2.2, 3.3, 4.4)); + + let a = _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, 
f32::NEG_INFINITY); + let b = _mm_setr_pd(f64::INFINITY, -5.0); + + let r = _mm_cvtsd_ss(a, b); + + assert_eq_m128( + r, + _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY), + ); + } + test_mm_cvtsd_ss(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_cvtss_sd() { + let a = _mm_setr_pd(-1.1, 2.2); + let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + + let r = _mm_cvtss_sd(a, b); + assert_eq_m128d(r, _mm_setr_pd(1.0, 2.2)); + + let a = _mm_setr_pd(-1.1, f64::INFINITY); + let b = _mm_setr_ps(f32::NEG_INFINITY, 2.0, 3.0, 4.0); + + let r = _mm_cvtss_sd(a, b); + assert_eq_m128d(r, _mm_setr_pd(f64::NEG_INFINITY, f64::INFINITY)); + } + test_mm_cvtss_sd(); + + #[target_feature(enable = "sse2")] + unsafe fn test_mm_movemask_pd() { + let r = _mm_movemask_pd(_mm_setr_pd(-1.0, 5.0)); + assert_eq!(r, 0b01); + + let r = _mm_movemask_pd(_mm_setr_pd(-1.0, -5.0)); + assert_eq!(r, 0b11); + } + test_mm_movemask_pd(); +}