Auto merge of #3055 - eduardosm:x86-sse2-intrinsics, r=RalfJung

Implement some `llvm.x86.sse2.*` intrinsics and add tests

Continuation of https://github.com/rust-lang/miri/pull/2989 with SSE2 intrinsics.

Thankfully, a significant number of SSE2 functions use `simd_*` intrinsics, which are already implemented in Miri.
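
For the rest, here is a rough sketch of the split (illustrative only; the exact stdarch code may differ): element-wise operations are defined in `core::arch` on top of the generic `simd_*` intrinsics that Miri already handles, while the remaining operations lower to vendor-specific `llvm.x86.sse2.*` calls, which are the ones emulated in this PR.

    // Roughly how stdarch defines an element-wise SSE2 intrinsic (simplified;
    // `simd_add` is a platform intrinsic and `as_i16x8` an internal helper).
    // No new shim is needed because Miri already implements `simd_add`:
    pub unsafe fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i {
        transmute(simd_add(a.as_i16x8(), b.as_i16x8()))
    }
    // By contrast, `_mm_avg_epu8` lowers to `llvm.x86.sse2.pavg.b`, one of the
    // intrinsics emulated below.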
bors 2023-09-12 18:55:17 +00:00
commit 8cd31eadba
6 changed files with 1893 additions and 56 deletions

View File

@ -1037,6 +1037,11 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
this, link_name, abi, args, dest,
);
}
name if name.starts_with("llvm.x86.sse2.") => {
return shims::x86::sse2::EvalContextExt::emulate_x86_sse2_intrinsic(
this, link_name, abi, args, dest,
);
}
// Platform-specific shims
_ =>

View File

@ -1 +1,45 @@
use crate::InterpResult;
pub(super) mod sse;
pub(super) mod sse2;
/// Floating point comparison operation
///
/// <https://www.felixcloutier.com/x86/cmpss>
/// <https://www.felixcloutier.com/x86/cmpps>
/// <https://www.felixcloutier.com/x86/cmpsd>
/// <https://www.felixcloutier.com/x86/cmppd>
#[derive(Copy, Clone)]
enum FloatCmpOp {
Eq,
Lt,
Le,
Unord,
Neq,
/// Not less-than
Nlt,
/// Not less-or-equal
Nle,
/// Ordered, i.e. neither of them is NaN
Ord,
}
impl FloatCmpOp {
/// Convert from the `imm` argument used to specify the comparison
/// operation in intrinsics such as `llvm.x86.sse.cmp.ss`.
fn from_intrinsic_imm(imm: i8, intrinsic: &str) -> InterpResult<'_, Self> {
match imm {
0 => Ok(Self::Eq),
1 => Ok(Self::Lt),
2 => Ok(Self::Le),
3 => Ok(Self::Unord),
4 => Ok(Self::Neq),
5 => Ok(Self::Nlt),
6 => Ok(Self::Nle),
7 => Ok(Self::Ord),
imm => {
throw_unsup_format!("invalid `imm` parameter of {intrinsic}: {imm}");
}
}
}
}
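// Note: the `imm` byte comes from stdarch, which encodes the comparison
// predicate of the user-facing intrinsic. A simplified sketch (the exact
// stdarch code may differ):
//
//     pub unsafe fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d {
//         cmppd(a, b, 1) // 1 corresponds to `Self::Lt` above
//     }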

View File

@ -5,6 +5,7 @@ use rustc_target::spec::abi::Abi;
use rand::Rng as _;
use super::FloatCmpOp;
use crate::*;
use shims::foreign_items::EmulateByNameResult;
@ -78,7 +79,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
unary_op_ss(this, which, op, dest)?;
}
// Used to implement _mm_{sqrt,rcp,rsqrt}_ss functions.
// Used to implement _mm_{sqrt,rcp,rsqrt}_ps functions.
// Performs the operations on all components of `op`.
"sqrt.ps" | "rcp.ps" | "rsqrt.ps" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
@ -100,22 +101,10 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
let [left, right, imm] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let which = match this.read_scalar(imm)?.to_i8()? {
0 => FloatBinOp::Cmp(FloatCmpOp::Eq),
1 => FloatBinOp::Cmp(FloatCmpOp::Lt),
2 => FloatBinOp::Cmp(FloatCmpOp::Le),
3 => FloatBinOp::Cmp(FloatCmpOp::Unord),
4 => FloatBinOp::Cmp(FloatCmpOp::Neq),
5 => FloatBinOp::Cmp(FloatCmpOp::Nlt),
6 => FloatBinOp::Cmp(FloatCmpOp::Nle),
7 => FloatBinOp::Cmp(FloatCmpOp::Ord),
imm => {
throw_unsup_format!(
"invalid 3rd parameter of llvm.x86.sse.cmp.ps: {}",
imm
);
}
};
let which = FloatBinOp::Cmp(FloatCmpOp::from_intrinsic_imm(
this.read_scalar(imm)?.to_i8()?,
"llvm.x86.sse.cmp.ss",
)?);
bin_op_ss(this, which, left, right, dest)?;
}
@ -127,26 +116,14 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
let [left, right, imm] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let which = match this.read_scalar(imm)?.to_i8()? {
0 => FloatBinOp::Cmp(FloatCmpOp::Eq),
1 => FloatBinOp::Cmp(FloatCmpOp::Lt),
2 => FloatBinOp::Cmp(FloatCmpOp::Le),
3 => FloatBinOp::Cmp(FloatCmpOp::Unord),
4 => FloatBinOp::Cmp(FloatCmpOp::Neq),
5 => FloatBinOp::Cmp(FloatCmpOp::Nlt),
6 => FloatBinOp::Cmp(FloatCmpOp::Nle),
7 => FloatBinOp::Cmp(FloatCmpOp::Ord),
imm => {
throw_unsup_format!(
"invalid 3rd parameter of llvm.x86.sse.cmp.ps: {}",
imm
);
}
};
let which = FloatBinOp::Cmp(FloatCmpOp::from_intrinsic_imm(
this.read_scalar(imm)?.to_i8()?,
"llvm.x86.sse.cmp.ps",
)?);
bin_op_ps(this, which, left, right, dest)?;
}
// Used to implement _mm_{,u}comi{eq,lt,le,gt,ge,neq}_ps functions.
// Used to implement _mm_{,u}comi{eq,lt,le,gt,ge,neq}_ss functions.
// Compares the first component of `left` and `right` and returns
// a scalar value (0 or 1).
"comieq.ss" | "comilt.ss" | "comile.ss" | "comigt.ss" | "comige.ss" | "comineq.ss"
@ -292,6 +269,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
let op = this.read_scalar(&this.project_index(&op, i)?)?;
let op = op.to_u32()?;
// Extract the highest bit of `op` and place it in the `i`-th bit of `res`
res |= (op >> 31) << i;
}
@ -303,25 +281,6 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
}
}
/// Floating point comparison operation
///
/// <https://www.felixcloutier.com/x86/cmpss>
/// <https://www.felixcloutier.com/x86/cmpps>
#[derive(Copy, Clone)]
enum FloatCmpOp {
Eq,
Lt,
Le,
Unord,
Neq,
/// Not less-than
Nlt,
/// Not less-or-equal
Nle,
/// Ordered, i.e. neither of them is NaN
Ord,
}
#[derive(Copy, Clone)]
enum FloatBinOp {
/// Arithmetic operation
@ -436,8 +395,8 @@ fn bin_op_ss<'tcx>(
Ok(())
}
/// Performs `which` operation on each component of `left`, and
/// `right` storing the result is stored in `dest`.
/// Performs `which` operation on each component of `left` and
/// `right`, storing the result in `dest`.
fn bin_op_ps<'tcx>(
this: &mut crate::MiriInterpCx<'_, 'tcx>,
which: FloatBinOp,

View File

@ -0,0 +1,982 @@
use rustc_apfloat::{
ieee::{Double, Single},
Float as _, FloatConvert as _,
};
use rustc_middle::ty::layout::LayoutOf as _;
use rustc_middle::ty::Ty;
use rustc_span::Symbol;
use rustc_target::abi::Size;
use rustc_target::spec::abi::Abi;
use super::FloatCmpOp;
use crate::*;
use shims::foreign_items::EmulateByNameResult;
impl<'mir, 'tcx: 'mir> EvalContextExt<'mir, 'tcx> for crate::MiriInterpCx<'mir, 'tcx> {}
pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
fn emulate_x86_sse2_intrinsic(
&mut self,
link_name: Symbol,
abi: Abi,
args: &[OpTy<'tcx, Provenance>],
dest: &PlaceTy<'tcx, Provenance>,
) -> InterpResult<'tcx, EmulateByNameResult<'mir, 'tcx>> {
let this = self.eval_context_mut();
// Prefix should have already been checked.
let unprefixed_name = link_name.as_str().strip_prefix("llvm.x86.sse2.").unwrap();
// These intrinsics operate on 128-bit (f32x4, f64x2, i8x16, i16x8, i32x4, i64x2) SIMD
// vectors unless stated otherwise.
// Many intrinsic names are suffixed with "ps" (packed single), "ss" (scalar single),
// "pd" (packed double) or "sd" (scalar double), where single means single-precision
// floating point (f32) and double means double-precision floating point (f64). "ps"
// and "pd" mean that the operation is performed on each element of the vector, while
// "ss" and "sd" mean that the operation is performed only on the first element, copying
// the remaining elements from the input vector (for binary operations, from the left-hand
// side).
// Intrinsics suffixed with "epiX" or "epuX" operate on vectors of X-bit signed or
// unsigned integers.
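// For example, `_mm_add_pd` ("pd") operates on both f64 elements of its
// operands, `_mm_add_sd` ("sd") operates only on the first element and copies
// the second from the left-hand side, and `_mm_add_epi16` ("epi16") operates
// on eight 16-bit elements.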
match unprefixed_name {
// Used to implement the _mm_avg_epu8 function.
// Averages packed unsigned 8-bit integers in `left` and `right`.
"pavg.b" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
assert_eq!(dest_len, right_len);
for i in 0..dest_len {
let left = this.read_scalar(&this.project_index(&left, i)?)?.to_u8()?;
let right = this.read_scalar(&this.project_index(&right, i)?)?.to_u8()?;
let dest = this.project_index(&dest, i)?;
// Values are expanded from u8 to u16, so adds cannot overflow.
let res = u16::from(left)
.checked_add(u16::from(right))
.unwrap()
.checked_add(1)
.unwrap()
/ 2;
this.write_scalar(Scalar::from_u8(res.try_into().unwrap()), &dest)?;
}
}
// Used to implement the _mm_avg_epu16 function.
// Averages packed unsigned 16-bit integers in `left` and `right`.
"pavg.w" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
assert_eq!(dest_len, right_len);
for i in 0..dest_len {
let left = this.read_scalar(&this.project_index(&left, i)?)?.to_u16()?;
let right = this.read_scalar(&this.project_index(&right, i)?)?.to_u16()?;
let dest = this.project_index(&dest, i)?;
// Values are expanded from u16 to u32, so adds cannot overflow.
let res = u32::from(left)
.checked_add(u32::from(right))
.unwrap()
.checked_add(1)
.unwrap()
/ 2;
this.write_scalar(Scalar::from_u16(res.try_into().unwrap()), &dest)?;
}
}
// Used to implement the _mm_mulhi_epi16 function.
"pmulh.w" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
assert_eq!(dest_len, right_len);
for i in 0..dest_len {
let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?;
let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?;
let dest = this.project_index(&dest, i)?;
// Values are expanded from i16 to i32, so multiplication cannot overflow.
let res = i32::from(left).checked_mul(i32::from(right)).unwrap() >> 16;
this.write_scalar(Scalar::from_int(res, Size::from_bits(16)), &dest)?;
}
}
// Used to implement the _mm_mulhi_epu16 function.
"pmulhu.w" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
assert_eq!(dest_len, right_len);
for i in 0..dest_len {
let left = this.read_scalar(&this.project_index(&left, i)?)?.to_u16()?;
let right = this.read_scalar(&this.project_index(&right, i)?)?.to_u16()?;
let dest = this.project_index(&dest, i)?;
// Values are expanded from u16 to u32, so multiplication cannot overflow.
let res = u32::from(left).checked_mul(u32::from(right)).unwrap() >> 16;
this.write_scalar(Scalar::from_u16(res.try_into().unwrap()), &dest)?;
}
}
// Used to implement the _mm_mul_epu32 function.
// Multiplies the low unsigned 32-bit integers from each packed
// 64-bit element and stores the result as 64-bit unsigned integers.
"pmulu.dq" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
// left and right are u32x4, dest is u64x2
assert_eq!(left_len, 4);
assert_eq!(right_len, 4);
assert_eq!(dest_len, 2);
for i in 0..dest_len {
let op_i = i.checked_mul(2).unwrap();
let left = this.read_scalar(&this.project_index(&left, op_i)?)?.to_u32()?;
let right = this.read_scalar(&this.project_index(&right, op_i)?)?.to_u32()?;
let dest = this.project_index(&dest, i)?;
// The multiplication cannot overflow because the operands are
// expanded from 32-bit to 64-bit.
let res = u64::from(left).checked_mul(u64::from(right)).unwrap();
this.write_scalar(Scalar::from_u64(res), &dest)?;
}
}
// Used to implement the _mm_sad_epu8 function.
// Computes the absolute differences of packed unsigned 8-bit integers in `a`
// and `b`, then horizontally sum each consecutive 8 differences to produce
// two unsigned 16-bit integers, and pack these unsigned 16-bit integers in
// the low 16 bits of 64-bit elements returned.
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
"psad.bw" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
// left and right are u8x16, dest is u64x2
assert_eq!(left_len, right_len);
assert_eq!(left_len, 16);
assert_eq!(dest_len, 2);
for i in 0..dest_len {
let dest = this.project_index(&dest, i)?;
let mut res: u16 = 0;
let n = left_len.checked_div(dest_len).unwrap();
for j in 0..n {
let op_i = j.checked_add(i.checked_mul(n).unwrap()).unwrap();
let left = this.read_scalar(&this.project_index(&left, op_i)?)?.to_u8()?;
let right =
this.read_scalar(&this.project_index(&right, op_i)?)?.to_u8()?;
res = res.checked_add(left.abs_diff(right).into()).unwrap();
}
this.write_scalar(Scalar::from_u64(res.into()), &dest)?;
}
}
// Used to implement the _mm_{sll,srl,sra}_epi16 functions.
// Shifts 16-bit packed integers in left by the amount in right.
// Both operands are vectors of 16-bit integers. However, right is
// interpreted as a single 64-bit integer (remaining bits are ignored).
// For logic shifts, when right is larger than 15, zero is produced.
// For arithmetic shifts, when right is larger than 15, the sign bit
// is copied to remaining bits.
"psll.w" | "psrl.w" | "psra.w" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
assert_eq!(dest_len, right_len);
enum ShiftOp {
Sll,
Srl,
Sra,
}
let which = match unprefixed_name {
"psll.w" => ShiftOp::Sll,
"psrl.w" => ShiftOp::Srl,
"psra.w" => ShiftOp::Sra,
_ => unreachable!(),
};
// Get the 64-bit shift operand and convert it to the type expected
// by checked_{shl,shr} (u32).
// It is ok to saturate the value to u32::MAX because any value
// above 15 will produce the same result.
let shift = extract_first_u64(this, &right)?.try_into().unwrap_or(u32::MAX);
for i in 0..dest_len {
let left = this.read_scalar(&this.project_index(&left, i)?)?.to_u16()?;
let dest = this.project_index(&dest, i)?;
let res = match which {
ShiftOp::Sll => left.checked_shl(shift).unwrap_or(0),
ShiftOp::Srl => left.checked_shr(shift).unwrap_or(0),
#[allow(clippy::cast_possible_wrap, clippy::cast_sign_loss)]
ShiftOp::Sra => {
// Convert u16 to i16 to use arithmetic shift
let left = left as i16;
// Copy the sign bit to the remaining bits
left.checked_shr(shift).unwrap_or(left >> 15) as u16
}
};
this.write_scalar(Scalar::from_u16(res), &dest)?;
}
}
// Used to implement the _mm_{sll,srl,sra}_epi32 functions.
// 32-bit equivalent to the shift functions above.
"psll.d" | "psrl.d" | "psra.d" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
assert_eq!(dest_len, right_len);
enum ShiftOp {
Sll,
Srl,
Sra,
}
let which = match unprefixed_name {
"psll.d" => ShiftOp::Sll,
"psrl.d" => ShiftOp::Srl,
"psra.d" => ShiftOp::Sra,
_ => unreachable!(),
};
// Get the 64-bit shift operand and convert it to the type expected
// by checked_{shl,shr} (u32).
// It is ok to saturate the value to u32::MAX because any value
// above 31 will produce the same result.
let shift = extract_first_u64(this, &right)?.try_into().unwrap_or(u32::MAX);
for i in 0..dest_len {
let left = this.read_scalar(&this.project_index(&left, i)?)?.to_u32()?;
let dest = this.project_index(&dest, i)?;
let res = match which {
ShiftOp::Sll => left.checked_shl(shift).unwrap_or(0),
ShiftOp::Srl => left.checked_shr(shift).unwrap_or(0),
#[allow(clippy::cast_possible_wrap, clippy::cast_sign_loss)]
ShiftOp::Sra => {
// Convert u32 to i32 to use arithmetic shift
let left = left as i32;
// Copy the sign bit to the remaining bits
left.checked_shr(shift).unwrap_or(left >> 31) as u32
}
};
this.write_scalar(Scalar::from_u32(res), &dest)?;
}
}
// Used to implement the _mm_{sll,srl}_epi64 functions.
// 64-bit equivalent of the shift functions above. There is no
// _mm_sra_epi64 because SSE2 does not provide a 64-bit arithmetic shift.
"psll.q" | "psrl.q" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
assert_eq!(dest_len, right_len);
enum ShiftOp {
Sll,
Srl,
}
let which = match unprefixed_name {
"psll.q" => ShiftOp::Sll,
"psrl.q" => ShiftOp::Srl,
_ => unreachable!(),
};
// Get the 64-bit shift operand and convert it to the type expected
// by checked_{shl,shr} (u32).
// It is ok to saturate the value to u32::MAX because any value
// above 63 will produce the same result.
let shift = this
.read_scalar(&this.project_index(&right, 0)?)?
.to_u64()?
.try_into()
.unwrap_or(u32::MAX);
for i in 0..dest_len {
let left = this.read_scalar(&this.project_index(&left, i)?)?.to_u64()?;
let dest = this.project_index(&dest, i)?;
let res = match which {
ShiftOp::Sll => left.checked_shl(shift).unwrap_or(0),
ShiftOp::Srl => left.checked_shr(shift).unwrap_or(0),
};
this.write_scalar(Scalar::from_u64(res), &dest)?;
}
}
// Used to implement the _mm_cvtepi32_ps function.
// Converts packed i32 to packed f32.
// FIXME: Can we get rid of this intrinsic and just use simd_as?
"cvtdq2ps" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, op_len) = this.operand_to_simd(op)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, op_len);
for i in 0..dest_len {
let op = this.read_scalar(&this.project_index(&op, i)?)?.to_i32()?;
let dest = this.project_index(&dest, i)?;
let res = Scalar::from_f32(Single::from_i128(op.into()).value);
this.write_scalar(res, &dest)?;
}
}
// Used to implement the _mm_cvtps_epi32 and _mm_cvttps_epi32 functions.
// Converts packed f32 to packed i32.
"cvtps2dq" | "cvttps2dq" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, op_len) = this.operand_to_simd(op)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, op_len);
let rnd = match unprefixed_name {
// "current SSE rounding mode", assume nearest
// https://www.felixcloutier.com/x86/cvtps2dq
"cvtps2dq" => rustc_apfloat::Round::NearestTiesToEven,
// always truncate
// https://www.felixcloutier.com/x86/cvttps2dq
"cvttps2dq" => rustc_apfloat::Round::TowardZero,
_ => unreachable!(),
};
for i in 0..dest_len {
let op = this.read_scalar(&this.project_index(&op, i)?)?.to_f32()?;
let dest = this.project_index(&dest, i)?;
let res =
this.float_to_int_checked(op, dest.layout.ty, rnd).unwrap_or_else(|| {
// Fall back to minimum according to SSE2 semantics.
Scalar::from_i32(i32::MIN)
});
this.write_scalar(res, &dest)?;
}
}
// Used to implement the _mm_packs_epi16 function.
// Converts two 16-bit integer vectors to a single 8-bit integer
// vector with signed saturation.
"packsswb.128" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
// left and right are i16x8, dest is i8x16
assert_eq!(left_len, 8);
assert_eq!(right_len, 8);
assert_eq!(dest_len, 16);
for i in 0..left_len {
let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?;
let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?;
let left_dest = this.project_index(&dest, i)?;
let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?;
let left_res =
i8::try_from(left).unwrap_or(if left < 0 { i8::MIN } else { i8::MAX });
let right_res =
i8::try_from(right).unwrap_or(if right < 0 { i8::MIN } else { i8::MAX });
this.write_scalar(Scalar::from_int(left_res, Size::from_bits(8)), &left_dest)?;
this.write_scalar(
Scalar::from_int(right_res, Size::from_bits(8)),
&right_dest,
)?;
}
}
// Used to implement the _mm_packus_epi16 function.
// Converts two 16-bit signed integer vectors to a single 8-bit
// unsigned integer vector with saturation.
"packuswb.128" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
// left and right are i16x8, dest is u8x16
assert_eq!(left_len, 8);
assert_eq!(right_len, 8);
assert_eq!(dest_len, 16);
for i in 0..left_len {
let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?;
let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?;
let left_dest = this.project_index(&dest, i)?;
let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?;
let left_res = u8::try_from(left).unwrap_or(if left < 0 { 0 } else { u8::MAX });
let right_res =
u8::try_from(right).unwrap_or(if right < 0 { 0 } else { u8::MAX });
this.write_scalar(Scalar::from_u8(left_res), &left_dest)?;
this.write_scalar(Scalar::from_u8(right_res), &right_dest)?;
}
}
// Used to implement the _mm_packs_epi32 function.
// Converts two 32-bit integer vectors to a single 16-bit integer
// vector with signed saturation.
"packssdw.128" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
// left and right are i32x4, dest is i16x8
assert_eq!(left_len, 4);
assert_eq!(right_len, 4);
assert_eq!(dest_len, 8);
for i in 0..left_len {
let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i32()?;
let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i32()?;
let left_dest = this.project_index(&dest, i)?;
let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?;
let left_res =
i16::try_from(left).unwrap_or(if left < 0 { i16::MIN } else { i16::MAX });
let right_res =
i16::try_from(right).unwrap_or(if right < 0 { i16::MIN } else { i16::MAX });
this.write_scalar(Scalar::from_int(left_res, Size::from_bits(16)), &left_dest)?;
this.write_scalar(
Scalar::from_int(right_res, Size::from_bits(16)),
&right_dest,
)?;
}
}
// Used to implement _mm_min_sd and _mm_max_sd functions.
// Note that the semantics are a bit different from Rust simd_min
// and simd_max intrinsics regarding handling of NaN and -0.0: Rust
// matches the IEEE min/max operations, while x86 has different
// semantics.
"min.sd" | "max.sd" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let which = match unprefixed_name {
"min.sd" => FloatBinOp::Min,
"max.sd" => FloatBinOp::Max,
_ => unreachable!(),
};
bin_op_sd(this, which, left, right, dest)?;
}
// Used to implement _mm_min_pd and _mm_max_pd functions.
// Note that the semantics are a bit different from Rust simd_min
// and simd_max intrinsics regarding handling of NaN and -0.0: Rust
// matches the IEEE min/max operations, while x86 has different
// semantics.
"min.pd" | "max.pd" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let which = match unprefixed_name {
"min.pd" => FloatBinOp::Min,
"max.pd" => FloatBinOp::Max,
_ => unreachable!(),
};
bin_op_pd(this, which, left, right, dest)?;
}
// Used to implement the _mm_sqrt_sd function.
// Performs the operation on the first component of `op` and
// copies the remaining components from `op`.
"sqrt.sd" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, op_len) = this.operand_to_simd(op)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, op_len);
let op0 = this.read_scalar(&this.project_index(&op, 0)?)?.to_u64()?;
// FIXME using host floats
let res0 = Scalar::from_u64(f64::from_bits(op0).sqrt().to_bits());
this.write_scalar(res0, &this.project_index(&dest, 0)?)?;
for i in 1..dest_len {
this.copy_op(
&this.project_index(&op, i)?,
&this.project_index(&dest, i)?,
/*allow_transmute*/ false,
)?;
}
}
// Used to implement the _mm_sqrt_pd function.
// Performs the operation on all components of `op`.
"sqrt.pd" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, op_len) = this.operand_to_simd(op)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, op_len);
for i in 0..dest_len {
let op = this.read_scalar(&this.project_index(&op, i)?)?.to_u64()?;
let dest = this.project_index(&dest, i)?;
// FIXME using host floats
let res = Scalar::from_u64(f64::from_bits(op).sqrt().to_bits());
this.write_scalar(res, &dest)?;
}
}
// Used to implement the _mm_cmp*_sd functions.
// Performs a comparison operation on the first component of `left`
// and `right`, returning 0 if false or `u64::MAX` if true. The remaining
// components are copied from `left`.
"cmp.sd" => {
let [left, right, imm] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let which = FloatBinOp::Cmp(FloatCmpOp::from_intrinsic_imm(
this.read_scalar(imm)?.to_i8()?,
"llvm.x86.sse2.cmp.sd",
)?);
bin_op_sd(this, which, left, right, dest)?;
}
// Used to implement the _mm_cmp*_pd functions.
// Performs a comparison operation on each component of `left`
// and `right`. For each component, returns 0 if false or `u64::MAX`
// if true.
"cmp.pd" => {
let [left, right, imm] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let which = FloatBinOp::Cmp(FloatCmpOp::from_intrinsic_imm(
this.read_scalar(imm)?.to_i8()?,
"llvm.x86.sse2.cmp.pd",
)?);
bin_op_pd(this, which, left, right, dest)?;
}
// Used to implement _mm_{,u}comi{eq,lt,le,gt,ge,neq}_sd functions.
// Compares the first component of `left` and `right` and returns
// a scalar value (0 or 1).
"comieq.sd" | "comilt.sd" | "comile.sd" | "comigt.sd" | "comige.sd" | "comineq.sd"
| "ucomieq.sd" | "ucomilt.sd" | "ucomile.sd" | "ucomigt.sd" | "ucomige.sd"
| "ucomineq.sd" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
assert_eq!(left_len, right_len);
let left = this.read_scalar(&this.project_index(&left, 0)?)?.to_f64()?;
let right = this.read_scalar(&this.project_index(&right, 0)?)?.to_f64()?;
// The difference between the com* and ucom* variants is the signaling
// of exceptions when either argument is a quiet NaN. We do not
// support accessing the SSE status register from Miri (or from Rust,
// for that matter), so we treat both variants equally.
let res = match unprefixed_name {
"comieq.sd" | "ucomieq.sd" => left == right,
"comilt.sd" | "ucomilt.sd" => left < right,
"comile.sd" | "ucomile.sd" => left <= right,
"comigt.sd" | "ucomigt.sd" => left > right,
"comige.sd" | "ucomige.sd" => left >= right,
"comineq.sd" | "ucomineq.sd" => left != right,
_ => unreachable!(),
};
this.write_scalar(Scalar::from_i32(i32::from(res)), dest)?;
}
// Used to implement the _mm_cvtpd_ps function.
// Converts packed f64 to packed f32.
"cvtpd2ps" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, op_len) = this.operand_to_simd(op)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
// op is f64x2, dest is f32x4
assert_eq!(op_len, 2);
assert_eq!(dest_len, 4);
for i in 0..op_len {
let op = this.read_scalar(&this.project_index(&op, i)?)?.to_f64()?;
let dest = this.project_index(&dest, i)?;
let res = op.convert(/*loses_info*/ &mut false).value;
this.write_scalar(Scalar::from_f32(res), &dest)?;
}
// Fill the remaining elements with zeros
for i in op_len..dest_len {
let dest = this.project_index(&dest, i)?;
this.write_scalar(Scalar::from_u32(0), &dest)?;
}
}
// Used to implement the _mm_cvtps_pd function.
// Converts packed f32 to packed f64.
"cvtps2pd" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, op_len) = this.operand_to_simd(op)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
// op is f32x4, dest is f64x2
assert_eq!(op_len, 4);
assert_eq!(dest_len, 2);
for i in 0..dest_len {
let op = this.read_scalar(&this.project_index(&op, i)?)?.to_f32()?;
let dest = this.project_index(&dest, i)?;
let res = op.convert(/*loses_info*/ &mut false).value;
this.write_scalar(Scalar::from_f64(res), &dest)?;
}
// The remaining two f32 elements of `op` are ignored
}
// Used to implement the _mm_cvtpd_epi32 and _mm_cvttpd_epi32 functions.
// Converts packed f64 to packed i32.
"cvtpd2dq" | "cvttpd2dq" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, op_len) = this.operand_to_simd(op)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
// op is f64x2, dest is i32x4
assert_eq!(op_len, 2);
assert_eq!(dest_len, 4);
let rnd = match unprefixed_name {
// "current SSE rounding mode", assume nearest
// https://www.felixcloutier.com/x86/cvtpd2dq
"cvtpd2dq" => rustc_apfloat::Round::NearestTiesToEven,
// always truncate
// https://www.felixcloutier.com/x86/cvttpd2dq
"cvttpd2dq" => rustc_apfloat::Round::TowardZero,
_ => unreachable!(),
};
for i in 0..op_len {
let op = this.read_scalar(&this.project_index(&op, i)?)?.to_f64()?;
let dest = this.project_index(&dest, i)?;
let res =
this.float_to_int_checked(op, dest.layout.ty, rnd).unwrap_or_else(|| {
// Fall back to minimum according to SSE2 semantics.
Scalar::from_i32(i32::MIN)
});
this.write_scalar(res, &dest)?;
}
// Fill the remaining elements with zeros
for i in op_len..dest_len {
let dest = this.project_index(&dest, i)?;
this.write_scalar(Scalar::from_i32(0), &dest)?;
}
}
// Used to implement the _mm_cvtsd_si32 and _mm_cvttsd_si32 functions.
// Converts the first component of `op` from f64 to i32.
"cvtsd2si" | "cvttsd2si" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, _) = this.operand_to_simd(op)?;
let op = this.read_scalar(&this.project_index(&op, 0)?)?.to_f64()?;
let rnd = match unprefixed_name {
// "current SSE rounding mode", assume nearest
// https://www.felixcloutier.com/x86/cvtsd2si
"cvtsd2si" => rustc_apfloat::Round::NearestTiesToEven,
// always truncate
// https://www.felixcloutier.com/x86/cvttsd2si
"cvttsd2si" => rustc_apfloat::Round::TowardZero,
_ => unreachable!(),
};
let res = this.float_to_int_checked(op, dest.layout.ty, rnd).unwrap_or_else(|| {
// Fall back to minimum according to SSE2 semantics.
Scalar::from_i32(i32::MIN)
});
this.write_scalar(res, dest)?;
}
// Used to implement the _mm_cvtsd_si64 and _mm_cvttsd_si64 functions.
// Converts the first component of `op` from f64 to i64.
"cvtsd2si64" | "cvttsd2si64" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, _) = this.operand_to_simd(op)?;
let op = this.read_scalar(&this.project_index(&op, 0)?)?.to_f64()?;
let rnd = match unprefixed_name {
// "current SSE rounding mode", assume nearest
// https://www.felixcloutier.com/x86/cvtsd2si
"cvtsd2si64" => rustc_apfloat::Round::NearestTiesToEven,
// always truncate
// https://www.felixcloutier.com/x86/cvttsd2si
"cvttsd2si64" => rustc_apfloat::Round::TowardZero,
_ => unreachable!(),
};
let res = this.float_to_int_checked(op, dest.layout.ty, rnd).unwrap_or_else(|| {
// Fall back to minimum according to SSE2 semantics.
Scalar::from_i64(i64::MIN)
});
this.write_scalar(res, dest)?;
}
// Used to implement the _mm_cvtsd_ss and _mm_cvtss_sd functions.
// Converts the first f64/f32 from `right` to f32/f64 and copies
// the remaining elements from `left`
"cvtsd2ss" | "cvtss2sd" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, _) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
// Convert first element of `right`
let right0 = this.read_immediate(&this.project_index(&right, 0)?)?;
let dest0 = this.project_index(&dest, 0)?;
// `float_to_float_or_int` here will convert from f64 to f32 (cvtsd2ss) or
// from f32 to f64 (cvtss2sd).
let res0 = this.float_to_float_or_int(&right0, dest0.layout.ty)?;
this.write_immediate(res0, &dest0)?;
// Copy the remaining elements from `left`
for i in 1..dest_len {
this.copy_op(
&this.project_index(&left, i)?,
&this.project_index(&dest, i)?,
/*allow_transmute*/ false,
)?;
}
}
// Used to implement the _mm_movemask_pd function.
// Returns a scalar integer where the i-th bit is the highest
// bit of the i-th component of `op`.
// https://www.felixcloutier.com/x86/movmskpd
"movmsk.pd" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, op_len) = this.operand_to_simd(op)?;
let mut res = 0;
for i in 0..op_len {
let op = this.read_scalar(&this.project_index(&op, i)?)?;
let op = op.to_u64()?;
// Extract the highest bit of `op` and place it in the `i`-th bit of `res`
res |= (op >> 63) << i;
}
this.write_scalar(Scalar::from_u32(res.try_into().unwrap()), dest)?;
}
_ => return Ok(EmulateByNameResult::NotSupported),
}
Ok(EmulateByNameResult::NeedsJumping)
}
}
/// Takes a 128-bit vector, transmutes it to `[u64; 2]` and extracts
/// the first value.
fn extract_first_u64<'tcx>(
this: &crate::MiriInterpCx<'_, 'tcx>,
op: &MPlaceTy<'tcx, Provenance>,
) -> InterpResult<'tcx, u64> {
// Transmute vector to `[u64; 2]`
let u64_array_layout = this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u64, 2))?;
let op = op.transmute(u64_array_layout, this)?;
// Get the first u64 from the array
this.read_scalar(&this.project_index(&op, 0)?)?.to_u64()
}
#[derive(Copy, Clone)]
enum FloatBinOp {
/// Comparison
Cmp(FloatCmpOp),
/// Minimum value (with SSE semantics)
///
/// <https://www.felixcloutier.com/x86/minsd>
/// <https://www.felixcloutier.com/x86/minpd>
Min,
/// Maximum value (with SSE semantics)
///
/// <https://www.felixcloutier.com/x86/maxsd>
/// <https://www.felixcloutier.com/x86/maxpd>
Max,
}
/// Performs `which` scalar operation on `left` and `right` and returns
/// the result.
// FIXME make this generic over apfloat type to reduce code duplication with bin_op_f32
fn bin_op_f64<'tcx>(
which: FloatBinOp,
left: &ImmTy<'tcx, Provenance>,
right: &ImmTy<'tcx, Provenance>,
) -> InterpResult<'tcx, Scalar<Provenance>> {
match which {
FloatBinOp::Cmp(which) => {
let left = left.to_scalar().to_f64()?;
let right = right.to_scalar().to_f64()?;
// FIXME: Make sure that these operations match the semantics of cmppd
let res = match which {
FloatCmpOp::Eq => left == right,
FloatCmpOp::Lt => left < right,
FloatCmpOp::Le => left <= right,
FloatCmpOp::Unord => left.is_nan() || right.is_nan(),
FloatCmpOp::Neq => left != right,
FloatCmpOp::Nlt => !(left < right),
FloatCmpOp::Nle => !(left <= right),
FloatCmpOp::Ord => !left.is_nan() && !right.is_nan(),
};
Ok(Scalar::from_u64(if res { u64::MAX } else { 0 }))
}
FloatBinOp::Min => {
let left = left.to_scalar().to_f64()?;
let right = right.to_scalar().to_f64()?;
// SSE semantics to handle zero and NaN. Note that `x == Double::ZERO`
// is true when `x` is either +0 or -0.
if (left == Double::ZERO && right == Double::ZERO)
|| left.is_nan()
|| right.is_nan()
|| left >= right
{
Ok(Scalar::from_f64(right))
} else {
Ok(Scalar::from_f64(left))
}
}
FloatBinOp::Max => {
let left = left.to_scalar().to_f64()?;
let right = right.to_scalar().to_f64()?;
// SSE semantics to handle zero and NaN. Note that `x == Double::ZERO`
// is true when `x` is either +0 or -0.
if (left == Double::ZERO && right == Double::ZERO)
|| left.is_nan()
|| right.is_nan()
|| left <= right
{
Ok(Scalar::from_f64(right))
} else {
Ok(Scalar::from_f64(left))
}
}
}
}
/// Performs `which` operation on the first component of `left` and `right`
/// and copies the other components from `left`. The result is stored in `dest`.
fn bin_op_sd<'tcx>(
this: &mut crate::MiriInterpCx<'_, 'tcx>,
which: FloatBinOp,
left: &OpTy<'tcx, Provenance>,
right: &OpTy<'tcx, Provenance>,
dest: &PlaceTy<'tcx, Provenance>,
) -> InterpResult<'tcx, ()> {
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
assert_eq!(dest_len, right_len);
let res0 = bin_op_f64(
which,
&this.read_immediate(&this.project_index(&left, 0)?)?,
&this.read_immediate(&this.project_index(&right, 0)?)?,
)?;
this.write_scalar(res0, &this.project_index(&dest, 0)?)?;
for i in 1..dest_len {
this.copy_op(
&this.project_index(&left, i)?,
&this.project_index(&dest, i)?,
/*allow_transmute*/ false,
)?;
}
Ok(())
}
/// Performs `which` operation on each component of `left` and
/// `right`, storing the result in `dest`.
fn bin_op_pd<'tcx>(
this: &mut crate::MiriInterpCx<'_, 'tcx>,
which: FloatBinOp,
left: &OpTy<'tcx, Provenance>,
right: &OpTy<'tcx, Provenance>,
dest: &PlaceTy<'tcx, Provenance>,
) -> InterpResult<'tcx, ()> {
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
assert_eq!(dest_len, right_len);
for i in 0..dest_len {
let left = this.read_immediate(&this.project_index(&left, i)?)?;
let right = this.read_immediate(&this.project_index(&right, i)?)?;
let dest = this.project_index(&dest, i)?;
let res = bin_op_f64(which, &left, &right)?;
this.write_scalar(res, &dest)?;
}
Ok(())
}

View File

@ -1,5 +1,15 @@
//@only-target-x86_64
// Ignore everything except x86 and x86_64
// Any additional targets added to CI should be ignored here
//@ignore-target-aarch64
//@ignore-target-arm
//@ignore-target-avr
//@ignore-target-s390x
//@ignore-target-thumbv7em
//@ignore-target-wasm32
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use std::f32::NAN;
use std::mem::transmute;
@ -987,6 +997,8 @@ unsafe fn test_sse() {
}
test_mm_cvtsi32_ss();
// Intrinsic only available on x86_64
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse")]
unsafe fn test_mm_cvtss_si64() {
let inputs = &[
@ -1007,8 +1019,11 @@ unsafe fn test_sse() {
assert_eq!(e, r, "TestCase #{} _mm_cvtss_si64({:?}) = {}, expected: {}", i, x, r, e);
}
}
#[cfg(target_arch = "x86_64")]
test_mm_cvtss_si64();
// Intrinsic only available on x86_64
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse")]
unsafe fn test_mm_cvttss_si64() {
let inputs = &[
@ -1032,8 +1047,11 @@ unsafe fn test_sse() {
assert_eq!(e, r, "TestCase #{} _mm_cvttss_si64({:?}) = {}, expected: {}", i, x, r, e);
}
}
#[cfg(target_arch = "x86_64")]
test_mm_cvttss_si64();
// Intrinsic only available on x86_64
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse")]
unsafe fn test_mm_cvtsi64_ss() {
let inputs = &[
@ -1053,6 +1071,7 @@ unsafe fn test_sse() {
assert_eq_m128(e, r);
}
}
#[cfg(target_arch = "x86_64")]
test_mm_cvtsi64_ss();
#[target_feature(enable = "sse")]

View File

@ -0,0 +1,828 @@
// Ignore everything except x86 and x86_64
// Any additional targets added to CI should be ignored here
//@ignore-target-aarch64
//@ignore-target-arm
//@ignore-target-avr
//@ignore-target-s390x
//@ignore-target-thumbv7em
//@ignore-target-wasm32
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use std::f64::NAN;
use std::mem::transmute;
fn main() {
assert!(is_x86_feature_detected!("sse2"));
unsafe {
test_sse2();
}
}
#[target_feature(enable = "sse2")]
unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i {
_mm_set_epi64x(b, a)
}
#[target_feature(enable = "sse2")]
unsafe fn test_sse2() {
// Mostly copied from library/stdarch/crates/core_arch/src/x86{,_64}/sse2.rs
unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i {
_mm_set_epi64x(b, a)
}
#[track_caller]
#[target_feature(enable = "sse")]
unsafe fn assert_eq_m128(a: __m128, b: __m128) {
let r = _mm_cmpeq_ps(a, b);
if _mm_movemask_ps(r) != 0b1111 {
panic!("{:?} != {:?}", a, b);
}
}
#[track_caller]
#[target_feature(enable = "sse2")]
unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) {
assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b))
}
#[track_caller]
#[target_feature(enable = "sse2")]
unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 {
panic!("{:?} != {:?}", a, b);
}
}
#[target_feature(enable = "sse2")]
unsafe fn test_mm_avg_epu8() {
let (a, b) = (_mm_set1_epi8(3), _mm_set1_epi8(9));
let r = _mm_avg_epu8(a, b);
assert_eq_m128i(r, _mm_set1_epi8(6));
}
test_mm_avg_epu8();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_avg_epu16() {
let (a, b) = (_mm_set1_epi16(3), _mm_set1_epi16(9));
let r = _mm_avg_epu16(a, b);
assert_eq_m128i(r, _mm_set1_epi16(6));
}
test_mm_avg_epu16();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_mulhi_epi16() {
let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
let r = _mm_mulhi_epi16(a, b);
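// 1000 * -1001 = -1_001_000, and its high 16 bits are
// -1_001_000 >> 16 = -16 (the arithmetic shift rounds toward negative infinity).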
assert_eq_m128i(r, _mm_set1_epi16(-16));
}
test_mm_mulhi_epi16();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_mulhi_epu16() {
let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(1001));
let r = _mm_mulhi_epu16(a, b);
assert_eq_m128i(r, _mm_set1_epi16(15));
}
test_mm_mulhi_epu16();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_mul_epu32() {
let a = _mm_setr_epi64x(1_000_000_000, 1 << 34);
let b = _mm_setr_epi64x(1_000_000_000, 1 << 35);
let r = _mm_mul_epu32(a, b);
let e = _mm_setr_epi64x(1_000_000_000 * 1_000_000_000, 0);
assert_eq_m128i(r, e);
}
test_mm_mul_epu32();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_sad_epu8() {
#[rustfmt::skip]
let a = _mm_setr_epi8(
255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
1, 2, 3, 4,
155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8,
1, 2, 3, 4,
);
let b = _mm_setr_epi8(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2);
let r = _mm_sad_epu8(a, b);
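// Expected values, lane by lane:
// |255-0| + |254-0| + |253-0| + |252-0| + |1-2| + |2-1| + |3-2| + |4-1| = 1014 + 6 = 1020
// |155-1| + |154-1| + |153-1| + |152-1| + |1-1| + |2-2| + |3-1| + |4-2| = 610 + 4 = 614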
let e = _mm_setr_epi64x(1020, 614);
assert_eq_m128i(r, e);
}
test_mm_sad_epu8();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_sll_epi16() {
let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 4));
assert_eq_m128i(
r,
_mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
);
let r = _mm_sll_epi16(a, _mm_set_epi64x(4, 0));
assert_eq_m128i(r, a);
let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 16));
assert_eq_m128i(r, _mm_set1_epi16(0));
let r = _mm_sll_epi16(a, _mm_set_epi64x(0, i64::MAX));
assert_eq_m128i(r, _mm_set1_epi16(0));
}
test_mm_sll_epi16();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_srl_epi16() {
let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 4));
assert_eq_m128i(r, _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0));
let r = _mm_srl_epi16(a, _mm_set_epi64x(4, 0));
assert_eq_m128i(r, a);
let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 16));
assert_eq_m128i(r, _mm_set1_epi16(0));
let r = _mm_srl_epi16(a, _mm_set_epi64x(0, i64::MAX));
assert_eq_m128i(r, _mm_set1_epi16(0));
}
test_mm_srl_epi16();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_sra_epi16() {
let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 4));
assert_eq_m128i(r, _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10));
let r = _mm_sra_epi16(a, _mm_set_epi64x(4, 0));
assert_eq_m128i(r, a);
let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 16));
assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
let r = _mm_sra_epi16(a, _mm_set_epi64x(0, i64::MAX));
assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
}
test_mm_sra_epi16();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_sll_epi32() {
let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 4));
assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
let r = _mm_sll_epi32(a, _mm_set_epi64x(4, 0));
assert_eq_m128i(r, a);
let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 32));
assert_eq_m128i(r, _mm_set1_epi32(0));
let r = _mm_sll_epi32(a, _mm_set_epi64x(0, i64::MAX));
assert_eq_m128i(r, _mm_set1_epi32(0));
}
test_mm_sll_epi32();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_srl_epi32() {
let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 4));
assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
let r = _mm_srl_epi32(a, _mm_set_epi64x(4, 0));
assert_eq_m128i(r, a);
let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 32));
assert_eq_m128i(r, _mm_set1_epi32(0));
let r = _mm_srl_epi32(a, _mm_set_epi64x(0, i64::MAX));
assert_eq_m128i(r, _mm_set1_epi32(0));
}
test_mm_srl_epi32();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_sra_epi32() {
let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 4));
assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
let r = _mm_sra_epi32(a, _mm_set_epi64x(4, 0));
assert_eq_m128i(r, a);
let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 32));
assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
let r = _mm_sra_epi32(a, _mm_set_epi64x(0, i64::MAX));
assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
}
test_mm_sra_epi32();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_sll_epi64() {
let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 4));
assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
let r = _mm_sll_epi64(a, _mm_set_epi64x(4, 0));
assert_eq_m128i(r, a);
let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 64));
assert_eq_m128i(r, _mm_set1_epi64x(0));
let r = _mm_sll_epi64(a, _mm_set_epi64x(0, i64::MAX));
assert_eq_m128i(r, _mm_set1_epi64x(0));
}
test_mm_sll_epi64();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_srl_epi64() {
let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 4));
assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
let r = _mm_srl_epi64(a, _mm_set_epi64x(4, 0));
assert_eq_m128i(r, a);
let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 64));
assert_eq_m128i(r, _mm_set1_epi64x(0));
let r = _mm_srl_epi64(a, _mm_set_epi64x(0, i64::MAX));
assert_eq_m128i(r, _mm_set1_epi64x(0));
}
test_mm_srl_epi64();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvtepi32_ps() {
let a = _mm_setr_epi32(1, 2, 3, 4);
let r = _mm_cvtepi32_ps(a);
assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
}
test_mm_cvtepi32_ps();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvtps_epi32() {
let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
let r = _mm_cvtps_epi32(a);
assert_eq_m128i(r, _mm_setr_epi32(1, 2, 3, 4));
}
test_mm_cvtps_epi32();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvttps_epi32() {
let a = _mm_setr_ps(-1.1, 2.2, -3.3, 6.6);
let r = _mm_cvttps_epi32(a);
assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6));
let a = _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
let r = _mm_cvttps_epi32(a);
assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, i32::MIN, i32::MIN));
}
test_mm_cvttps_epi32();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_packs_epi16() {
let a = _mm_setr_epi16(0x80, -0x81, 0, 0, 0, 0, 0, 0);
let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -0x81, 0x80);
let r = _mm_packs_epi16(a, b);
assert_eq_m128i(
r,
_mm_setr_epi8(0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F),
);
}
test_mm_packs_epi16();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_packus_epi16() {
let a = _mm_setr_epi16(0x100, -1, 0, 0, 0, 0, 0, 0);
let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -1, 0x100);
let r = _mm_packus_epi16(a, b);
assert_eq_m128i(r, _mm_setr_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, !0));
}
test_mm_packus_epi16();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_packs_epi32() {
let a = _mm_setr_epi32(0x8000, -0x8001, 0, 0);
let b = _mm_setr_epi32(0, 0, -0x8001, 0x8000);
let r = _mm_packs_epi32(a, b);
assert_eq_m128i(r, _mm_setr_epi16(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF));
}
test_mm_packs_epi32();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_min_sd() {
let a = _mm_setr_pd(1.0, 2.0);
let b = _mm_setr_pd(5.0, 10.0);
let r = _mm_min_sd(a, b);
assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
}
test_mm_min_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_min_pd() {
let a = _mm_setr_pd(-1.0, 5.0);
let b = _mm_setr_pd(-100.0, 20.0);
let r = _mm_min_pd(a, b);
assert_eq_m128d(r, _mm_setr_pd(-100.0, 5.0));
// `_mm_min_pd` can **not** be implemented using the `simd_min` rust intrinsic because
// the semantics of `simd_min` are different to those of `_mm_min_pd` regarding handling
// of `-0.0`.
let a = _mm_setr_pd(-0.0, 0.0);
let b = _mm_setr_pd(0.0, 0.0);
let r1: [u8; 16] = transmute(_mm_min_pd(a, b));
let r2: [u8; 16] = transmute(_mm_min_pd(b, a));
let a: [u8; 16] = transmute(a);
let b: [u8; 16] = transmute(b);
assert_eq!(r1, b);
assert_eq!(r2, a);
assert_ne!(a, b); // sanity check that -0.0 is actually present
}
test_mm_min_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_max_sd() {
let a = _mm_setr_pd(1.0, 2.0);
let b = _mm_setr_pd(5.0, 10.0);
let r = _mm_max_sd(a, b);
assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
}
test_mm_max_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_max_pd() {
let a = _mm_setr_pd(-1.0, 5.0);
let b = _mm_setr_pd(-100.0, 20.0);
let r = _mm_max_pd(a, b);
assert_eq_m128d(r, _mm_setr_pd(-1.0, 20.0));
// `_mm_max_pd` can **not** be implemented using the `simd_max` rust intrinsic because
// the semantics of `simd_max` are different to those of `_mm_max_pd` regarding handling
// of `-0.0`.
let a = _mm_setr_pd(-0.0, 0.0);
let b = _mm_setr_pd(0.0, 0.0);
let r1: [u8; 16] = transmute(_mm_max_pd(a, b));
let r2: [u8; 16] = transmute(_mm_max_pd(b, a));
let a: [u8; 16] = transmute(a);
let b: [u8; 16] = transmute(b);
assert_eq!(r1, b);
assert_eq!(r2, a);
assert_ne!(a, b); // sanity check that -0.0 is actually present
}
test_mm_max_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_sqrt_sd() {
let a = _mm_setr_pd(1.0, 2.0);
let b = _mm_setr_pd(5.0, 10.0);
let r = _mm_sqrt_sd(a, b);
assert_eq_m128d(r, _mm_setr_pd(5.0f64.sqrt(), 2.0));
}
test_mm_sqrt_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_sqrt_pd() {
let r = _mm_sqrt_pd(_mm_setr_pd(1.0, 2.0));
assert_eq_m128d(r, _mm_setr_pd(1.0f64.sqrt(), 2.0f64.sqrt()));
}
test_mm_sqrt_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpeq_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(!0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmpeq_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpeq_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmplt_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
let e = _mm_setr_epi64x(!0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmplt_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmplt_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmple_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(!0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmple_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmple_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpgt_sd() {
let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(!0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmpgt_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpgt_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpge_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(!0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmpge_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpge_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpord_sd() {
let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
let e = _mm_setr_epi64x(0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmpord_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpord_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpunord_sd() {
let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
let e = _mm_setr_epi64x(!0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmpunord_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpunord_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpneq_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
let e = _mm_setr_epi64x(!0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmpneq_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpneq_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpnlt_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
let e = _mm_setr_epi64x(0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmpnlt_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpnlt_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpnle_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmpnle_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpnle_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpngt_sd() {
let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmpngt_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpngt_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpnge_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(0, transmute(2.0f64));
let r = transmute::<_, __m128i>(_mm_cmpnge_sd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpnge_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpeq_pd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(!0, 0);
let r = transmute::<_, __m128i>(_mm_cmpeq_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpeq_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmplt_pd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(0, !0);
let r = transmute::<_, __m128i>(_mm_cmplt_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmplt_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmple_pd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(!0, !0);
let r = transmute::<_, __m128i>(_mm_cmple_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmple_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpgt_pd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(0, 0);
let r = transmute::<_, __m128i>(_mm_cmpgt_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpgt_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpge_pd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(!0, 0);
let r = transmute::<_, __m128i>(_mm_cmpge_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpge_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpord_pd() {
let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
let e = _mm_setr_epi64x(0, !0);
let r = transmute::<_, __m128i>(_mm_cmpord_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpord_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpunord_pd() {
let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
let e = _mm_setr_epi64x(!0, 0);
let r = transmute::<_, __m128i>(_mm_cmpunord_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpunord_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpneq_pd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
let e = _mm_setr_epi64x(!0, !0);
let r = transmute::<_, __m128i>(_mm_cmpneq_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpneq_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpnlt_pd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
let e = _mm_setr_epi64x(0, 0);
let r = transmute::<_, __m128i>(_mm_cmpnlt_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpnlt_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpnle_pd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(0, 0);
let r = transmute::<_, __m128i>(_mm_cmpnle_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpnle_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpngt_pd() {
let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(0, !0);
let r = transmute::<_, __m128i>(_mm_cmpngt_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpngt_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cmpnge_pd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
let e = _mm_setr_epi64x(0, !0);
let r = transmute::<_, __m128i>(_mm_cmpnge_pd(a, b));
assert_eq_m128i(r, e);
}
test_mm_cmpnge_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_comieq_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_comieq_sd(a, b) != 0);
let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_comieq_sd(a, b) == 0);
}
test_mm_comieq_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_comilt_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_comilt_sd(a, b) == 0);
}
test_mm_comilt_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_comile_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_comile_sd(a, b) != 0);
}
test_mm_comile_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_comigt_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_comigt_sd(a, b) == 0);
}
test_mm_comigt_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_comige_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_comige_sd(a, b) != 0);
}
test_mm_comige_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_comineq_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_comineq_sd(a, b) == 0);
}
test_mm_comineq_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_ucomieq_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_ucomieq_sd(a, b) != 0);
let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(NAN, 3.0));
assert!(_mm_ucomieq_sd(a, b) == 0);
}
test_mm_ucomieq_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_ucomilt_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_ucomilt_sd(a, b) == 0);
}
test_mm_ucomilt_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_ucomile_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_ucomile_sd(a, b) != 0);
}
test_mm_ucomile_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_ucomigt_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_ucomigt_sd(a, b) == 0);
}
test_mm_ucomigt_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_ucomige_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_ucomige_sd(a, b) != 0);
}
test_mm_ucomige_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_ucomineq_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
assert!(_mm_ucomineq_sd(a, b) == 0);
}
test_mm_ucomineq_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvtpd_ps() {
let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, 5.0));
assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, 0.0));
let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, -5.0));
assert_eq_m128(r, _mm_setr_ps(-1.0, -5.0, 0.0, 0.0));
let r = _mm_cvtpd_ps(_mm_setr_pd(f64::MAX, f64::MIN));
assert_eq_m128(r, _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0));
let r = _mm_cvtpd_ps(_mm_setr_pd(f32::MAX as f64, f32::MIN as f64));
assert_eq_m128(r, _mm_setr_ps(f32::MAX, f32::MIN, 0.0, 0.0));
}
test_mm_cvtpd_ps();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvtps_pd() {
let r = _mm_cvtps_pd(_mm_setr_ps(-1.0, 2.0, -3.0, 5.0));
assert_eq_m128d(r, _mm_setr_pd(-1.0, 2.0));
let r = _mm_cvtps_pd(_mm_setr_ps(f32::MAX, f32::INFINITY, f32::NEG_INFINITY, f32::MIN));
assert_eq_m128d(r, _mm_setr_pd(f32::MAX as f64, f64::INFINITY));
}
test_mm_cvtps_pd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvtpd_epi32() {
let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, 5.0));
assert_eq_m128i(r, _mm_setr_epi32(-1, 5, 0, 0));
let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, -5.0));
assert_eq_m128i(r, _mm_setr_epi32(-1, -5, 0, 0));
let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::MAX, f64::MIN));
assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::INFINITY, f64::NEG_INFINITY));
assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::NAN, f64::NAN));
assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
}
test_mm_cvtpd_epi32();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvttpd_epi32() {
let a = _mm_setr_pd(-1.1, 2.2);
let r = _mm_cvttpd_epi32(a);
assert_eq_m128i(r, _mm_setr_epi32(-1, 2, 0, 0));
let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
let r = _mm_cvttpd_epi32(a);
assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
}
test_mm_cvttpd_epi32();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvtsd_si32() {
let r = _mm_cvtsd_si32(_mm_setr_pd(-2.0, 5.0));
assert_eq!(r, -2);
let r = _mm_cvtsd_si32(_mm_setr_pd(f64::MAX, f64::MIN));
assert_eq!(r, i32::MIN);
let r = _mm_cvtsd_si32(_mm_setr_pd(f64::NAN, f64::NAN));
assert_eq!(r, i32::MIN);
}
test_mm_cvtsd_si32();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvttsd_si32() {
let a = _mm_setr_pd(-1.1, 2.2);
let r = _mm_cvttsd_si32(a);
assert_eq!(r, -1);
let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
let r = _mm_cvttsd_si32(a);
assert_eq!(r, i32::MIN);
}
test_mm_cvttsd_si32();
// Intrinsic only available on x86_64
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvtsd_si64() {
let r = _mm_cvtsd_si64(_mm_setr_pd(-2.0, 5.0));
assert_eq!(r, -2_i64);
let r = _mm_cvtsd_si64(_mm_setr_pd(f64::MAX, f64::MIN));
assert_eq!(r, i64::MIN);
}
#[cfg(target_arch = "x86_64")]
test_mm_cvtsd_si64();
// Intrinsic only available on x86_64
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvttsd_si64() {
let a = _mm_setr_pd(-1.1, 2.2);
let r = _mm_cvttsd_si64(a);
assert_eq!(r, -1_i64);
}
#[cfg(target_arch = "x86_64")]
test_mm_cvttsd_si64();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvtsd_ss() {
let a = _mm_setr_ps(-1.1, -2.2, 3.3, 4.4);
let b = _mm_setr_pd(2.0, -5.0);
let r = _mm_cvtsd_ss(a, b);
assert_eq_m128(r, _mm_setr_ps(2.0, -2.2, 3.3, 4.4));
let a = _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
let b = _mm_setr_pd(f64::INFINITY, -5.0);
let r = _mm_cvtsd_ss(a, b);
assert_eq_m128(
r,
_mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY),
);
}
test_mm_cvtsd_ss();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvtss_sd() {
let a = _mm_setr_pd(-1.1, 2.2);
let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
let r = _mm_cvtss_sd(a, b);
assert_eq_m128d(r, _mm_setr_pd(1.0, 2.2));
let a = _mm_setr_pd(-1.1, f64::INFINITY);
let b = _mm_setr_ps(f32::NEG_INFINITY, 2.0, 3.0, 4.0);
let r = _mm_cvtss_sd(a, b);
assert_eq_m128d(r, _mm_setr_pd(f64::NEG_INFINITY, f64::INFINITY));
}
test_mm_cvtss_sd();
#[target_feature(enable = "sse2")]
unsafe fn test_mm_movemask_pd() {
let r = _mm_movemask_pd(_mm_setr_pd(-1.0, 5.0));
assert_eq!(r, 0b01);
let r = _mm_movemask_pd(_mm_setr_pd(-1.0, -5.0));
assert_eq!(r, 0b11);
}
test_mm_movemask_pd();
}