diff --git a/src/tools/miri/miri-script/src/commands.rs b/src/tools/miri/miri-script/src/commands.rs index fe66f1a9bdb..ed78f80c023 100644 --- a/src/tools/miri/miri-script/src/commands.rs +++ b/src/tools/miri/miri-script/src/commands.rs @@ -173,7 +173,7 @@ impl Command { // the merge has confused the heck out of josh in the past. // We pass `--no-verify` to avoid running git hooks like `./miri fmt` that could in turn // trigger auto-actions. - sh.write_file("rust-version", &commit)?; + sh.write_file("rust-version", format!("{commit}\n"))?; const PREPARING_COMMIT_MESSAGE: &str = "Preparing for merge from rustc"; cmd!(sh, "git commit rust-version --no-verify -m {PREPARING_COMMIT_MESSAGE}") .run() diff --git a/src/tools/miri/rust-version b/src/tools/miri/rust-version index 716b690daaa..22a3d439861 100644 --- a/src/tools/miri/rust-version +++ b/src/tools/miri/rust-version @@ -1 +1 @@ -fca59ab5f0e7df7d816bed77a32abc0045ebe80b \ No newline at end of file +9fa6bdd764a1f7bdf69eccceeace6d13f38cb2e1 diff --git a/src/tools/miri/src/borrow_tracker/tree_borrows/mod.rs b/src/tools/miri/src/borrow_tracker/tree_borrows/mod.rs index 0fbe66360b2..08b138c68ae 100644 --- a/src/tools/miri/src/borrow_tracker/tree_borrows/mod.rs +++ b/src/tools/miri/src/borrow_tracker/tree_borrows/mod.rs @@ -283,7 +283,7 @@ trait EvalContextPrivExt<'mir: 'ecx, 'tcx: 'mir, 'ecx>: crate::MiriInterpCxExt<' // interleaving, but wether UB happens can depend on whether a write occurs in the // future... let is_write = new_perm.initial_state.is_active() - || (new_perm.initial_state.is_resrved() && new_perm.protector.is_some()); + || (new_perm.initial_state.is_reserved() && new_perm.protector.is_some()); if is_write { // Need to get mutable access to alloc_extra. // (Cannot always do this as we can do read-only reborrowing on read-only allocations.) diff --git a/src/tools/miri/src/borrow_tracker/tree_borrows/perms.rs b/src/tools/miri/src/borrow_tracker/tree_borrows/perms.rs index b4a9a768e27..0ce29ac5437 100644 --- a/src/tools/miri/src/borrow_tracker/tree_borrows/perms.rs +++ b/src/tools/miri/src/borrow_tracker/tree_borrows/perms.rs @@ -152,7 +152,7 @@ impl Permission { matches!(self.inner, Active) } - pub fn is_resrved(self) -> bool { + pub fn is_reserved(self) -> bool { matches!(self.inner, Reserved { .. }) } diff --git a/src/tools/miri/src/concurrency/range_object_map.rs b/src/tools/miri/src/concurrency/range_object_map.rs index 89c009933bb..859eb4bbb60 100644 --- a/src/tools/miri/src/concurrency/range_object_map.rs +++ b/src/tools/miri/src/concurrency/range_object_map.rs @@ -42,30 +42,19 @@ impl RangeObjectMap { /// in an existing allocation, then returns Err containing the position /// where such allocation should be inserted fn find_offset(&self, offset: Size) -> Result { - // We do a binary search. - let mut left = 0usize; // inclusive - let mut right = self.v.len(); // exclusive - loop { - if left == right { - // No element contains the given offset. But the - // position is where such element should be placed at. - return Err(left); - } - let candidate = left.checked_add(right).unwrap() / 2; - let elem = &self.v[candidate]; + self.v.binary_search_by(|elem| -> std::cmp::Ordering { if offset < elem.range.start { // We are too far right (offset is further left). - debug_assert!(candidate < right); // we are making progress - right = candidate; + // (`Greater` means that `elem` is greater than the desired target.) + std::cmp::Ordering::Greater } else if offset >= elem.range.end() { // We are too far left (offset is further right). - debug_assert!(candidate >= left); // we are making progress - left = candidate + 1; + std::cmp::Ordering::Less } else { // This is it! - return Ok(candidate); + std::cmp::Ordering::Equal } - } + }) } /// Determines whether a given access on `range` overlaps with diff --git a/src/tools/miri/src/eval.rs b/src/tools/miri/src/eval.rs index b761a6cf475..88e7d5386db 100644 --- a/src/tools/miri/src/eval.rs +++ b/src/tools/miri/src/eval.rs @@ -301,7 +301,7 @@ pub fn create_ecx<'mir, 'tcx: 'mir>( // Third argument (`argv`): created from `config.args`. let argv = { // Put each argument in memory, collect pointers. - let mut argvs = Vec::>::new(); + let mut argvs = Vec::>::with_capacity(config.args.len()); for arg in config.args.iter() { // Make space for `0` terminator. let size = u64::try_from(arg.len()).unwrap().checked_add(1).unwrap(); diff --git a/src/tools/miri/src/helpers.rs b/src/tools/miri/src/helpers.rs index e0f74d03ff6..ea1931f7a18 100644 --- a/src/tools/miri/src/helpers.rs +++ b/src/tools/miri/src/helpers.rs @@ -1,6 +1,5 @@ pub mod convert; -use std::any::Any; use std::cmp; use std::iter; use std::num::NonZeroUsize; @@ -25,24 +24,6 @@ use rand::RngCore; use crate::*; -/// A trait to work around not having trait object upcasting: -/// Add `AsAny` as supertrait and your trait objects can be turned into `&dyn Any` on which you can -/// then call `downcast`. -pub trait AsAny: Any { - fn as_any(&self) -> &dyn Any; - fn as_any_mut(&mut self) -> &mut dyn Any; -} -impl AsAny for T { - #[inline(always)] - fn as_any(&self) -> &dyn Any { - self - } - #[inline(always)] - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } -} - // This mapping should match `decode_error_kind` in // . const UNIX_IO_ERROR_TABLE: &[(&str, std::io::ErrorKind)] = { diff --git a/src/tools/miri/src/lib.rs b/src/tools/miri/src/lib.rs index 5327c2f24e0..0e50618ad9e 100644 --- a/src/tools/miri/src/lib.rs +++ b/src/tools/miri/src/lib.rs @@ -1,4 +1,5 @@ #![feature(rustc_private)] +#![feature(float_gamma)] #![feature(map_try_insert)] #![feature(never_type)] #![feature(try_blocks)] @@ -10,6 +11,7 @@ #![feature(round_ties_even)] #![feature(os_str_bytes)] #![feature(lint_reasons)] +#![feature(trait_upcasting)] // Configure clippy and other lints #![allow( clippy::collapsible_else_if, @@ -19,6 +21,7 @@ clippy::enum_variant_names, clippy::field_reassign_with_default, clippy::manual_map, + clippy::neg_cmp_op_on_partial_ord, clippy::new_without_default, clippy::single_match, clippy::useless_format, diff --git a/src/tools/miri/src/shims/foreign_items.rs b/src/tools/miri/src/shims/foreign_items.rs index 167d1fd4518..953677d3290 100644 --- a/src/tools/miri/src/shims/foreign_items.rs +++ b/src/tools/miri/src/shims/foreign_items.rs @@ -815,6 +815,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { | "atanf" | "log1pf" | "expm1f" + | "tgammaf" => { let [f] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; // FIXME: Using host floats. @@ -830,6 +831,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { "atanf" => f.atan(), "log1pf" => f.ln_1p(), "expm1f" => f.exp_m1(), + "tgammaf" => f.gamma(), _ => bug!(), }; this.write_scalar(Scalar::from_u32(res.to_bits()), dest)?; @@ -866,6 +868,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { | "atan" | "log1p" | "expm1" + | "tgamma" => { let [f] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; // FIXME: Using host floats. @@ -881,6 +884,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { "atan" => f.atan(), "log1p" => f.ln_1p(), "expm1" => f.exp_m1(), + "tgamma" => f.gamma(), _ => bug!(), }; this.write_scalar(Scalar::from_u64(res.to_bits()), dest)?; @@ -917,6 +921,53 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { let res = x.scalbn(exp); this.write_scalar(Scalar::from_f64(res), dest)?; } + "lgammaf_r" => { + let [x, signp] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + // FIXME: Using host floats. + let x = f32::from_bits(this.read_scalar(x)?.to_u32()?); + let signp = this.deref_pointer(signp)?; + + let (res, sign) = x.ln_gamma(); + this.write_int(sign, &signp)?; + this.write_scalar(Scalar::from_u32(res.to_bits()), dest)?; + } + "lgamma_r" => { + let [x, signp] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + // FIXME: Using host floats. + let x = f64::from_bits(this.read_scalar(x)?.to_u64()?); + let signp = this.deref_pointer(signp)?; + + let (res, sign) = x.ln_gamma(); + this.write_int(sign, &signp)?; + this.write_scalar(Scalar::from_u64(res.to_bits()), dest)?; + } + + "llvm.prefetch" => { + let [p, rw, loc, ty] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let _ = this.read_pointer(p)?; + let rw = this.read_scalar(rw)?.to_i32()?; + let loc = this.read_scalar(loc)?.to_i32()?; + let ty = this.read_scalar(ty)?.to_i32()?; + + if ty == 1 { + // Data cache prefetch. + // Notably, we do not have to check the pointer, this operation is never UB! + + if !matches!(rw, 0 | 1) { + throw_unsup_format!("invalid `rw` value passed to `llvm.prefetch`: {}", rw); + } + if !matches!(loc, 0..=3) { + throw_unsup_format!( + "invalid `loc` value passed to `llvm.prefetch`: {}", + loc + ); + } + } else { + throw_unsup_format!("unsupported `llvm.prefetch` type argument: {}", ty); + } + } // Architecture-specific shims "llvm.x86.addcarry.64" if this.tcx.sess.target.arch == "x86_64" => { @@ -970,6 +1021,12 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { } } + name if name.starts_with("llvm.x86.sse.") => { + return shims::x86::sse::EvalContextExt::emulate_x86_sse_intrinsic( + this, link_name, abi, args, dest, + ); + } + // Platform-specific shims _ => return match this.tcx.sess.target.os.as_ref() { diff --git a/src/tools/miri/src/shims/mod.rs b/src/tools/miri/src/shims/mod.rs index 1027b24e301..5a9574766f3 100644 --- a/src/tools/miri/src/shims/mod.rs +++ b/src/tools/miri/src/shims/mod.rs @@ -7,6 +7,7 @@ pub mod foreign_items; pub mod intrinsics; pub mod unix; pub mod windows; +mod x86; pub mod dlsym; pub mod env; diff --git a/src/tools/miri/src/shims/unix/fs.rs b/src/tools/miri/src/shims/unix/fs.rs index 09349bfdead..beaef22075b 100644 --- a/src/tools/miri/src/shims/unix/fs.rs +++ b/src/tools/miri/src/shims/unix/fs.rs @@ -1,3 +1,4 @@ +use std::any::Any; use std::borrow::Cow; use std::collections::BTreeMap; use std::convert::TryInto; @@ -24,7 +25,7 @@ pub struct FileHandle { writable: bool, } -pub trait FileDescriptor: std::fmt::Debug + helpers::AsAny { +pub trait FileDescriptor: std::fmt::Debug + Any { fn name(&self) -> &'static str; fn read<'tcx>( @@ -72,6 +73,18 @@ pub trait FileDescriptor: std::fmt::Debug + helpers::AsAny { } } +impl dyn FileDescriptor { + #[inline(always)] + pub fn downcast_ref(&self) -> Option<&T> { + (self as &dyn Any).downcast_ref() + } + + #[inline(always)] + pub fn downcast_mut(&mut self) -> Option<&mut T> { + (self as &mut dyn Any).downcast_mut() + } +} + impl FileDescriptor for FileHandle { fn name(&self) -> &'static str { "FILE" @@ -689,7 +702,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { if let Some(file_descriptor) = this.machine.file_handler.handles.get(&fd) { // FIXME: Support fullfsync for all FDs let FileHandle { file, writable } = - file_descriptor.as_any().downcast_ref::().ok_or_else(|| { + file_descriptor.downcast_ref::().ok_or_else(|| { err_unsup_format!( "`F_FULLFSYNC` is only supported on file-backed file descriptors" ) @@ -1522,7 +1535,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { if let Some(file_descriptor) = this.machine.file_handler.handles.get_mut(&fd) { // FIXME: Support ftruncate64 for all FDs let FileHandle { file, writable } = - file_descriptor.as_any().downcast_ref::().ok_or_else(|| { + file_descriptor.downcast_ref::().ok_or_else(|| { err_unsup_format!( "`ftruncate64` is only supported on file-backed file descriptors" ) @@ -1568,7 +1581,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { if let Some(file_descriptor) = this.machine.file_handler.handles.get(&fd) { // FIXME: Support fsync for all FDs let FileHandle { file, writable } = - file_descriptor.as_any().downcast_ref::().ok_or_else(|| { + file_descriptor.downcast_ref::().ok_or_else(|| { err_unsup_format!("`fsync` is only supported on file-backed file descriptors") })?; let io_result = maybe_sync_file(file, *writable, File::sync_all); @@ -1593,7 +1606,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { if let Some(file_descriptor) = this.machine.file_handler.handles.get(&fd) { // FIXME: Support fdatasync for all FDs let FileHandle { file, writable } = - file_descriptor.as_any().downcast_ref::().ok_or_else(|| { + file_descriptor.downcast_ref::().ok_or_else(|| { err_unsup_format!( "`fdatasync` is only supported on file-backed file descriptors" ) @@ -1643,7 +1656,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { if let Some(file_descriptor) = this.machine.file_handler.handles.get(&fd) { // FIXME: Support sync_data_range for all FDs let FileHandle { file, writable } = - file_descriptor.as_any().downcast_ref::().ok_or_else(|| { + file_descriptor.downcast_ref::().ok_or_else(|| { err_unsup_format!( "`sync_data_range` is only supported on file-backed file descriptors" ) @@ -1953,7 +1966,6 @@ impl FileMetadata { let file = match option { Some(file_descriptor) => &file_descriptor - .as_any() .downcast_ref::() .ok_or_else(|| { err_unsup_format!( diff --git a/src/tools/miri/src/shims/unix/linux/fd.rs b/src/tools/miri/src/shims/unix/linux/fd.rs index f2be89ce637..22fbb6da95a 100644 --- a/src/tools/miri/src/shims/unix/linux/fd.rs +++ b/src/tools/miri/src/shims/unix/linux/fd.rs @@ -1,3 +1,5 @@ +use std::cell::Cell; + use rustc_middle::ty::ScalarInt; use crate::*; @@ -7,8 +9,6 @@ use socketpair::SocketPair; use shims::unix::fs::EvalContextExt as _; -use std::cell::Cell; - pub mod epoll; pub mod event; pub mod socketpair; @@ -81,7 +81,6 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { if let Some(epfd) = this.machine.file_handler.handles.get_mut(&epfd) { let epfd = epfd - .as_any_mut() .downcast_mut::() .ok_or_else(|| err_unsup_format!("non-epoll FD passed to `epoll_ctl`"))?; @@ -93,7 +92,6 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { } else if op == epoll_ctl_del { if let Some(epfd) = this.machine.file_handler.handles.get_mut(&epfd) { let epfd = epfd - .as_any_mut() .downcast_mut::() .ok_or_else(|| err_unsup_format!("non-epoll FD passed to `epoll_ctl`"))?; @@ -154,7 +152,6 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { if let Some(epfd) = this.machine.file_handler.handles.get_mut(&epfd) { let _epfd = epfd - .as_any_mut() .downcast_mut::() .ok_or_else(|| err_unsup_format!("non-epoll FD passed to `epoll_wait`"))?; diff --git a/src/tools/miri/src/shims/x86/mod.rs b/src/tools/miri/src/shims/x86/mod.rs new file mode 100644 index 00000000000..36e673129de --- /dev/null +++ b/src/tools/miri/src/shims/x86/mod.rs @@ -0,0 +1 @@ +pub(super) mod sse; diff --git a/src/tools/miri/src/shims/x86/sse.rs b/src/tools/miri/src/shims/x86/sse.rs new file mode 100644 index 00000000000..deae0875fb9 --- /dev/null +++ b/src/tools/miri/src/shims/x86/sse.rs @@ -0,0 +1,609 @@ +use rustc_apfloat::{ieee::Single, Float as _}; +use rustc_middle::mir; +use rustc_span::Symbol; +use rustc_target::spec::abi::Abi; + +use rand::Rng as _; + +use crate::*; +use shims::foreign_items::EmulateByNameResult; + +impl<'mir, 'tcx: 'mir> EvalContextExt<'mir, 'tcx> for crate::MiriInterpCx<'mir, 'tcx> {} +pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { + fn emulate_x86_sse_intrinsic( + &mut self, + link_name: Symbol, + abi: Abi, + args: &[OpTy<'tcx, Provenance>], + dest: &PlaceTy<'tcx, Provenance>, + ) -> InterpResult<'tcx, EmulateByNameResult<'mir, 'tcx>> { + let this = self.eval_context_mut(); + // Prefix should have already been checked. + let unprefixed_name = link_name.as_str().strip_prefix("llvm.x86.sse.").unwrap(); + // All these intrinsics operate on 128-bit (f32x4) SIMD vectors unless stated otherwise. + // Many intrinsic names are sufixed with "ps" (packed single) or "ss" (scalar single), + // where single means single precision floating point (f32). "ps" means thet the operation + // is performed on each element of the vector, while "ss" means that the operation is + // performed only on the first element, copying the remaining elements from the input + // vector (for binary operations, from the left-hand side). + match unprefixed_name { + // Used to implement _mm_{add,sub,mul,div,min,max}_ss functions. + // Performs the operations on the first component of `left` and + // `right` and copies the remaining components from `left`. + "add.ss" | "sub.ss" | "mul.ss" | "div.ss" | "min.ss" | "max.ss" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let which = match unprefixed_name { + "add.ss" => FloatBinOp::Arith(mir::BinOp::Add), + "sub.ss" => FloatBinOp::Arith(mir::BinOp::Sub), + "mul.ss" => FloatBinOp::Arith(mir::BinOp::Mul), + "div.ss" => FloatBinOp::Arith(mir::BinOp::Div), + "min.ss" => FloatBinOp::Min, + "max.ss" => FloatBinOp::Max, + _ => unreachable!(), + }; + + bin_op_ss(this, which, left, right, dest)?; + } + // Used to implement _mm_min_ps and _mm_max_ps functions. + // Note that the semantics are a bit different from Rust simd_min + // and simd_max intrinsics regarding handling of NaN and -0.0: Rust + // matches the IEEE min/max operations, while x86 has different + // semantics. + "min.ps" | "max.ps" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let which = match unprefixed_name { + "min.ps" => FloatBinOp::Min, + "max.ps" => FloatBinOp::Max, + _ => unreachable!(), + }; + + bin_op_ps(this, which, left, right, dest)?; + } + // Used to implement _mm_{sqrt,rcp,rsqrt}_ss functions. + // Performs the operations on the first component of `op` and + // copies the remaining components from `op`. + "sqrt.ss" | "rcp.ss" | "rsqrt.ss" => { + let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let which = match unprefixed_name { + "sqrt.ss" => FloatUnaryOp::Sqrt, + "rcp.ss" => FloatUnaryOp::Rcp, + "rsqrt.ss" => FloatUnaryOp::Rsqrt, + _ => unreachable!(), + }; + + unary_op_ss(this, which, op, dest)?; + } + // Used to implement _mm_{sqrt,rcp,rsqrt}_ss functions. + // Performs the operations on all components of `op`. + "sqrt.ps" | "rcp.ps" | "rsqrt.ps" => { + let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let which = match unprefixed_name { + "sqrt.ps" => FloatUnaryOp::Sqrt, + "rcp.ps" => FloatUnaryOp::Rcp, + "rsqrt.ps" => FloatUnaryOp::Rsqrt, + _ => unreachable!(), + }; + + unary_op_ps(this, which, op, dest)?; + } + // Used to implement the _mm_cmp_ss function. + // Performs a comparison operation on the first component of `left` + // and `right`, returning 0 if false or `u32::MAX` if true. The remaining + // components are copied from `left`. + "cmp.ss" => { + let [left, right, imm] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let which = match this.read_scalar(imm)?.to_i8()? { + 0 => FloatBinOp::Cmp(FloatCmpOp::Eq), + 1 => FloatBinOp::Cmp(FloatCmpOp::Lt), + 2 => FloatBinOp::Cmp(FloatCmpOp::Le), + 3 => FloatBinOp::Cmp(FloatCmpOp::Unord), + 4 => FloatBinOp::Cmp(FloatCmpOp::Neq), + 5 => FloatBinOp::Cmp(FloatCmpOp::Nlt), + 6 => FloatBinOp::Cmp(FloatCmpOp::Nle), + 7 => FloatBinOp::Cmp(FloatCmpOp::Ord), + imm => { + throw_unsup_format!( + "invalid 3rd parameter of llvm.x86.sse.cmp.ps: {}", + imm + ); + } + }; + + bin_op_ss(this, which, left, right, dest)?; + } + // Used to implement the _mm_cmp_ps function. + // Performs a comparison operation on each component of `left` + // and `right`. For each component, returns 0 if false or u32::MAX + // if true. + "cmp.ps" => { + let [left, right, imm] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let which = match this.read_scalar(imm)?.to_i8()? { + 0 => FloatBinOp::Cmp(FloatCmpOp::Eq), + 1 => FloatBinOp::Cmp(FloatCmpOp::Lt), + 2 => FloatBinOp::Cmp(FloatCmpOp::Le), + 3 => FloatBinOp::Cmp(FloatCmpOp::Unord), + 4 => FloatBinOp::Cmp(FloatCmpOp::Neq), + 5 => FloatBinOp::Cmp(FloatCmpOp::Nlt), + 6 => FloatBinOp::Cmp(FloatCmpOp::Nle), + 7 => FloatBinOp::Cmp(FloatCmpOp::Ord), + imm => { + throw_unsup_format!( + "invalid 3rd parameter of llvm.x86.sse.cmp.ps: {}", + imm + ); + } + }; + + bin_op_ps(this, which, left, right, dest)?; + } + // Used to implement _mm_{,u}comi{eq,lt,le,gt,ge,neq}_ps functions. + // Compares the first component of `left` and `right` and returns + // a scalar value (0 or 1). + "comieq.ss" | "comilt.ss" | "comile.ss" | "comigt.ss" | "comige.ss" | "comineq.ss" + | "ucomieq.ss" | "ucomilt.ss" | "ucomile.ss" | "ucomigt.ss" | "ucomige.ss" + | "ucomineq.ss" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + + assert_eq!(left_len, right_len); + + let left = this.read_scalar(&this.project_index(&left, 0)?)?.to_f32()?; + let right = this.read_scalar(&this.project_index(&right, 0)?)?.to_f32()?; + // The difference between the com* and *ucom variants is signaling + // of exceptions when either argument is a quiet NaN. We do not + // support accessing the SSE status register from miri (or from Rust, + // for that matter), so we treat equally both variants. + let res = match unprefixed_name { + "comieq.ss" | "ucomieq.ss" => left == right, + "comilt.ss" | "ucomilt.ss" => left < right, + "comile.ss" | "ucomile.ss" => left <= right, + "comigt.ss" | "ucomigt.ss" => left > right, + "comige.ss" | "ucomige.ss" => left >= right, + "comineq.ss" | "ucomineq.ss" => left != right, + _ => unreachable!(), + }; + this.write_scalar(Scalar::from_i32(i32::from(res)), dest)?; + } + // Use to implement _mm_cvtss_si32 and _mm_cvttss_si32. + // Converts the first component of `op` from f32 to i32. + "cvtss2si" | "cvttss2si" => { + let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + let (op, _) = this.operand_to_simd(op)?; + + let op = this.read_scalar(&this.project_index(&op, 0)?)?.to_f32()?; + + let rnd = match unprefixed_name { + // "current SSE rounding mode", assume nearest + // https://www.felixcloutier.com/x86/cvtss2si + "cvtss2si" => rustc_apfloat::Round::NearestTiesToEven, + // always truncate + // https://www.felixcloutier.com/x86/cvttss2si + "cvttss2si" => rustc_apfloat::Round::TowardZero, + _ => unreachable!(), + }; + + let mut exact = false; + let cvt = op.to_i128_r(32, rnd, &mut exact); + let res = if cvt.status.intersects( + rustc_apfloat::Status::INVALID_OP + | rustc_apfloat::Status::OVERFLOW + | rustc_apfloat::Status::UNDERFLOW, + ) { + // Input is NaN (flagged with INVALID_OP) or does not fit + // in an i32 (flagged with OVERFLOW or UNDERFLOW), fallback + // to minimum acording to SSE semantics. The INEXACT flag + // is ignored on purpose because rounding can happen during + // float-to-int conversion. + i32::MIN + } else { + i32::try_from(cvt.value).unwrap() + }; + + this.write_scalar(Scalar::from_i32(res), dest)?; + } + // Use to implement _mm_cvtss_si64 and _mm_cvttss_si64. + // Converts the first component of `op` from f32 to i64. + "cvtss2si64" | "cvttss2si64" => { + let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + let (op, _) = this.operand_to_simd(op)?; + + let op = this.read_scalar(&this.project_index(&op, 0)?)?.to_f32()?; + + let rnd = match unprefixed_name { + // "current SSE rounding mode", assume nearest + // https://www.felixcloutier.com/x86/cvtss2si + "cvtss2si64" => rustc_apfloat::Round::NearestTiesToEven, + // always truncate + // https://www.felixcloutier.com/x86/cvttss2si + "cvttss2si64" => rustc_apfloat::Round::TowardZero, + _ => unreachable!(), + }; + + let mut exact = false; + let cvt = op.to_i128_r(64, rnd, &mut exact); + let res = if cvt.status.intersects( + rustc_apfloat::Status::INVALID_OP + | rustc_apfloat::Status::OVERFLOW + | rustc_apfloat::Status::UNDERFLOW, + ) { + // Input is NaN (flagged with INVALID_OP) or does not fit + // in an i64 (flagged with OVERFLOW or UNDERFLOW), fallback + // to minimum acording to SSE semantics. The INEXACT flag + // is ignored on purpose because rounding can happen during + // float-to-int conversion. + i64::MIN + } else { + i64::try_from(cvt.value).unwrap() + }; + + this.write_scalar(Scalar::from_i64(res), dest)?; + } + // Used to implement the _mm_cvtsi32_ss function. + // Converts `right` from i32 to f32. Returns a SIMD vector with + // the result in the first component and the remaining components + // are copied from `left`. + // https://www.felixcloutier.com/x86/cvtsi2ss + "cvtsi2ss" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + assert_eq!(dest_len, left_len); + + let right = this.read_scalar(right)?.to_i32()?; + + let res0 = Scalar::from_f32(Single::from_i128(right.into()).value); + this.write_scalar(res0, &this.project_index(&dest, 0)?)?; + + for i in 1..dest_len { + let left = this.read_immediate(&this.project_index(&left, i)?)?; + let dest = this.project_index(&dest, i)?; + + this.write_immediate(*left, &dest)?; + } + } + // Used to implement the _mm_cvtsi64_ss function. + // Converts `right` from i64 to f32. Returns a SIMD vector with + // the result in the first component and the remaining components + // are copied from `left`. + // https://www.felixcloutier.com/x86/cvtsi2ss + "cvtsi642ss" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + assert_eq!(dest_len, left_len); + + let right = this.read_scalar(right)?.to_i64()?; + + let res0 = Scalar::from_f32(Single::from_i128(right.into()).value); + this.write_scalar(res0, &this.project_index(&dest, 0)?)?; + + for i in 1..dest_len { + let left = this.read_immediate(&this.project_index(&left, i)?)?; + let dest = this.project_index(&dest, i)?; + + this.write_immediate(*left, &dest)?; + } + } + // Used to implement the _mm_movemask_ps function. + // Returns a scalar integer where the i-th bit is the highest + // bit of the i-th component of `op`. + // https://www.felixcloutier.com/x86/movmskps + "movmsk.ps" => { + let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + let (op, op_len) = this.operand_to_simd(op)?; + + let mut res = 0; + for i in 0..op_len { + let op = this.read_scalar(&this.project_index(&op, i)?)?; + let op = op.to_u32()?; + + res |= (op >> 31) << i; + } + + this.write_scalar(Scalar::from_u32(res), dest)?; + } + _ => return Ok(EmulateByNameResult::NotSupported), + } + Ok(EmulateByNameResult::NeedsJumping) + } +} + +/// Floating point comparison operation +/// +/// +/// +#[derive(Copy, Clone)] +enum FloatCmpOp { + Eq, + Lt, + Le, + Unord, + Neq, + /// Not less-than + Nlt, + /// Not less-or-equal + Nle, + /// Ordered, i.e. neither of them is NaN + Ord, +} + +#[derive(Copy, Clone)] +enum FloatBinOp { + /// Arithmetic operation + Arith(mir::BinOp), + /// Comparison + Cmp(FloatCmpOp), + /// Minimum value (with SSE semantics) + /// + /// + /// + Min, + /// Maximum value (with SSE semantics) + /// + /// + /// + Max, +} + +/// Performs `which` scalar operation on `left` and `right` and returns +/// the result. +fn bin_op_f32<'tcx>( + this: &crate::MiriInterpCx<'_, 'tcx>, + which: FloatBinOp, + left: &ImmTy<'tcx, Provenance>, + right: &ImmTy<'tcx, Provenance>, +) -> InterpResult<'tcx, Scalar> { + match which { + FloatBinOp::Arith(which) => { + let (res, _, _) = this.overflowing_binary_op(which, left, right)?; + Ok(res) + } + FloatBinOp::Cmp(which) => { + let left = left.to_scalar().to_f32()?; + let right = right.to_scalar().to_f32()?; + // FIXME: Make sure that these operations match the semantics of cmpps + let res = match which { + FloatCmpOp::Eq => left == right, + FloatCmpOp::Lt => left < right, + FloatCmpOp::Le => left <= right, + FloatCmpOp::Unord => left.is_nan() || right.is_nan(), + FloatCmpOp::Neq => left != right, + FloatCmpOp::Nlt => !(left < right), + FloatCmpOp::Nle => !(left <= right), + FloatCmpOp::Ord => !left.is_nan() && !right.is_nan(), + }; + Ok(Scalar::from_u32(if res { u32::MAX } else { 0 })) + } + FloatBinOp::Min => { + let left = left.to_scalar().to_f32()?; + let right = right.to_scalar().to_f32()?; + // SSE semantics to handle zero and NaN. Note that `x == Single::ZERO` + // is true when `x` is either +0 or -0. + if (left == Single::ZERO && right == Single::ZERO) + || left.is_nan() + || right.is_nan() + || left >= right + { + Ok(Scalar::from_f32(right)) + } else { + Ok(Scalar::from_f32(left)) + } + } + FloatBinOp::Max => { + let left = left.to_scalar().to_f32()?; + let right = right.to_scalar().to_f32()?; + // SSE semantics to handle zero and NaN. Note that `x == Single::ZERO` + // is true when `x` is either +0 or -0. + if (left == Single::ZERO && right == Single::ZERO) + || left.is_nan() + || right.is_nan() + || left <= right + { + Ok(Scalar::from_f32(right)) + } else { + Ok(Scalar::from_f32(left)) + } + } + } +} + +/// Performs `which` operation on the first component of `left` and `right` +/// and copies the other components from `left`. The result is stored in `dest`. +fn bin_op_ss<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + which: FloatBinOp, + left: &OpTy<'tcx, Provenance>, + right: &OpTy<'tcx, Provenance>, + dest: &PlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + assert_eq!(dest_len, left_len); + assert_eq!(dest_len, right_len); + + let res0 = bin_op_f32( + this, + which, + &this.read_immediate(&this.project_index(&left, 0)?)?, + &this.read_immediate(&this.project_index(&right, 0)?)?, + )?; + this.write_scalar(res0, &this.project_index(&dest, 0)?)?; + + for i in 1..dest_len { + let left = this.read_immediate(&this.project_index(&left, i)?)?; + let dest = this.project_index(&dest, i)?; + + this.write_immediate(*left, &dest)?; + } + + Ok(()) +} + +/// Performs `which` operation on each component of `left`, and +/// `right` storing the result is stored in `dest`. +fn bin_op_ps<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + which: FloatBinOp, + left: &OpTy<'tcx, Provenance>, + right: &OpTy<'tcx, Provenance>, + dest: &PlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + assert_eq!(dest_len, left_len); + assert_eq!(dest_len, right_len); + + for i in 0..dest_len { + let left = this.read_immediate(&this.project_index(&left, i)?)?; + let right = this.read_immediate(&this.project_index(&right, i)?)?; + let dest = this.project_index(&dest, i)?; + + let res = bin_op_f32(this, which, &left, &right)?; + this.write_scalar(res, &dest)?; + } + + Ok(()) +} + +#[derive(Copy, Clone)] +enum FloatUnaryOp { + /// sqrt(x) + /// + /// + /// + Sqrt, + /// Approximation of 1/x + /// + /// + /// + Rcp, + /// Approximation of 1/sqrt(x) + /// + /// + /// + Rsqrt, +} + +/// Performs `which` scalar operation on `op` and returns the result. +#[allow(clippy::arithmetic_side_effects)] // floating point operations without side effects +fn unary_op_f32<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + which: FloatUnaryOp, + op: &ImmTy<'tcx, Provenance>, +) -> InterpResult<'tcx, Scalar> { + match which { + FloatUnaryOp::Sqrt => { + let op = op.to_scalar(); + // FIXME using host floats + Ok(Scalar::from_u32(f32::from_bits(op.to_u32()?).sqrt().to_bits())) + } + FloatUnaryOp::Rcp => { + let op = op.to_scalar().to_f32()?; + let div = (Single::from_u128(1).value / op).value; + // Apply a relative error with a magnitude on the order of 2^-12 to simulate the + // inaccuracy of RCP. + let res = apply_random_float_error(this, div, -12); + Ok(Scalar::from_f32(res)) + } + FloatUnaryOp::Rsqrt => { + let op = op.to_scalar().to_u32()?; + // FIXME using host floats + let sqrt = Single::from_bits(f32::from_bits(op).sqrt().to_bits().into()); + let rsqrt = (Single::from_u128(1).value / sqrt).value; + // Apply a relative error with a magnitude on the order of 2^-12 to simulate the + // inaccuracy of RSQRT. + let res = apply_random_float_error(this, rsqrt, -12); + Ok(Scalar::from_f32(res)) + } + } +} + +/// Disturbes a floating-point result by a relative error on the order of (-2^scale, 2^scale). +#[allow(clippy::arithmetic_side_effects)] // floating point arithmetic cannot panic +fn apply_random_float_error( + this: &mut crate::MiriInterpCx<'_, '_>, + val: F, + err_scale: i32, +) -> F { + let rng = this.machine.rng.get_mut(); + // generates rand(0, 2^64) * 2^(scale - 64) = rand(0, 1) * 2^scale + let err = + F::from_u128(rng.gen::().into()).value.scalbn(err_scale.checked_sub(64).unwrap()); + // give it a random sign + let err = if rng.gen::() { -err } else { err }; + // multiple the value with (1+err) + (val * (F::from_u128(1).value + err).value).value +} + +/// Performs `which` operation on the first component of `op` and copies +/// the other components. The result is stored in `dest`. +fn unary_op_ss<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + which: FloatUnaryOp, + op: &OpTy<'tcx, Provenance>, + dest: &PlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + let (op, op_len) = this.operand_to_simd(op)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + assert_eq!(dest_len, op_len); + + let res0 = unary_op_f32(this, which, &this.read_immediate(&this.project_index(&op, 0)?)?)?; + this.write_scalar(res0, &this.project_index(&dest, 0)?)?; + + for i in 1..dest_len { + let op = this.read_immediate(&this.project_index(&op, i)?)?; + let dest = this.project_index(&dest, i)?; + + this.write_immediate(*op, &dest)?; + } + + Ok(()) +} + +/// Performs `which` operation on each component of `op`, storing the +/// result is stored in `dest`. +fn unary_op_ps<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + which: FloatUnaryOp, + op: &OpTy<'tcx, Provenance>, + dest: &PlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + let (op, op_len) = this.operand_to_simd(op)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + assert_eq!(dest_len, op_len); + + for i in 0..dest_len { + let op = this.read_immediate(&this.project_index(&op, i)?)?; + let dest = this.project_index(&dest, i)?; + + let res = unary_op_f32(this, which, &op)?; + this.write_scalar(res, &dest)?; + } + + Ok(()) +} diff --git a/src/tools/miri/tests/pass/intrinsics-math.rs b/src/tools/miri/tests/pass/intrinsics-math.rs index 9f2dc333f33..e0e4f5654d6 100644 --- a/src/tools/miri/tests/pass/intrinsics-math.rs +++ b/src/tools/miri/tests/pass/intrinsics-math.rs @@ -1,5 +1,4 @@ -// SPDX-License-Identifier: MIT OR Apache-2.0 -// SPDX-FileCopyrightText: The Rust Project Developers (see https://thanks.rust-lang.org) +#![feature(float_gamma)] macro_rules! assert_approx_eq { ($a:expr, $b:expr) => {{ @@ -130,4 +129,20 @@ pub fn main() { ); assert_approx_eq!(0.5f32.atanh(), 0.54930614433405484569762261846126285f32); assert_approx_eq!(0.5f64.atanh(), 0.54930614433405484569762261846126285f64); + + assert_approx_eq!(5.0f32.gamma(), 24.0); + assert_approx_eq!(5.0f64.gamma(), 24.0); + // These fail even on the host, precision seems to be terrible. + //assert_approx_eq!(-0.5f32.gamma(), -2.0 * f32::consts::PI.sqrt()); + //assert_approx_eq!(-0.5f64.gamma(), -2.0 * f64::consts::PI.sqrt()); + + assert_eq!(2.0f32.ln_gamma(), (0.0, 1)); + assert_eq!(2.0f64.ln_gamma(), (0.0, 1)); + // Gamma(-0.5) = -2*sqrt(π) + let (val, sign) = (-0.5f32).ln_gamma(); + assert_approx_eq!(val, (2.0 * f32::consts::PI.sqrt()).ln()); + assert_eq!(sign, -1); + let (val, sign) = (-0.5f64).ln_gamma(); + assert_approx_eq!(val, (2.0 * f64::consts::PI.sqrt()).ln()); + assert_eq!(sign, -1); } diff --git a/src/tools/miri/tests/pass/intrinsics-x86-sse.rs b/src/tools/miri/tests/pass/intrinsics-x86-sse.rs new file mode 100644 index 00000000000..677d7cc030e --- /dev/null +++ b/src/tools/miri/tests/pass/intrinsics-x86-sse.rs @@ -0,0 +1,1075 @@ +//@only-target-x86_64 + +use std::arch::x86_64::*; +use std::f32::NAN; +use std::mem::transmute; + +fn main() { + assert!(is_x86_feature_detected!("sse")); + + unsafe { + test_sse(); + } +} + +macro_rules! assert_approx_eq { + ($a:expr, $b:expr, $eps:expr) => {{ + let (a, b) = (&$a, &$b); + assert!( + (*a - *b).abs() < $eps, + "assertion failed: `(left !== right)` \ + (left: `{:?}`, right: `{:?}`, expect diff: `{:?}`, real diff: `{:?}`)", + *a, + *b, + $eps, + (*a - *b).abs() + ); + }}; +} + +#[target_feature(enable = "sse")] +unsafe fn test_sse() { + // Mostly copied from library/stdarch/crates/core_arch/src/x86{,_64}/sse.rs + + #[target_feature(enable = "sse")] + unsafe fn assert_eq_m128(a: __m128, b: __m128) { + let r = _mm_cmpeq_ps(a, b); + if _mm_movemask_ps(r) != 0b1111 { + panic!("{:?} != {:?}", a, b); + } + } + + #[target_feature(enable = "sse")] + unsafe fn test_mm_add_ss() { + let a = _mm_set_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_set_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_add_ss(a, b); + assert_eq_m128(r, _mm_set_ps(-1.0, 5.0, 0.0, -15.0)); + } + test_mm_add_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_sub_ss() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_sub_ss(a, b); + assert_eq_m128(r, _mm_setr_ps(99.0, 5.0, 0.0, -10.0)); + } + test_mm_sub_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_mul_ss() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_mul_ss(a, b); + assert_eq_m128(r, _mm_setr_ps(100.0, 5.0, 0.0, -10.0)); + } + test_mm_mul_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_div_ss() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_div_ss(a, b); + assert_eq_m128(r, _mm_setr_ps(0.01, 5.0, 0.0, -10.0)); + } + test_mm_div_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_sqrt_ss() { + let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); + let r = _mm_sqrt_ss(a); + let e = _mm_setr_ps(2.0, 13.0, 16.0, 100.0); + assert_eq_m128(r, e); + } + test_mm_sqrt_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_sqrt_ps() { + let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); + let r = _mm_sqrt_ps(a); + let e = _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0); + assert_eq_m128(r, e); + } + test_mm_sqrt_ps(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_rcp_ss() { + let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); + let r = _mm_rcp_ss(a); + let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0); + let rel_err = 0.00048828125; + + let r: [f32; 4] = transmute(r); + let e: [f32; 4] = transmute(e); + assert_approx_eq!(r[0], e[0], 2. * rel_err); + for i in 1..4 { + assert_eq!(r[i], e[i]); + } + } + test_mm_rcp_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_rcp_ps() { + let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); + let r = _mm_rcp_ps(a); + let e = _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215); + let rel_err = 0.00048828125; + + let r: [f32; 4] = transmute(r); + let e: [f32; 4] = transmute(e); + for i in 0..4 { + assert_approx_eq!(r[i], e[i], 2. * rel_err); + } + } + test_mm_rcp_ps(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_rsqrt_ss() { + let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); + let r = _mm_rsqrt_ss(a); + let e = _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0); + let rel_err = 0.00048828125; + + let r: [f32; 4] = transmute(r); + let e: [f32; 4] = transmute(e); + assert_approx_eq!(r[0], e[0], 2. * rel_err); + for i in 1..4 { + assert_eq!(r[i], e[i]); + } + } + test_mm_rsqrt_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_rsqrt_ps() { + let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); + let r = _mm_rsqrt_ps(a); + let e = _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845); + let rel_err = 0.00048828125; + + let r: [f32; 4] = transmute(r); + let e: [f32; 4] = transmute(e); + for i in 0..4 { + assert_approx_eq!(r[i], e[i], 2. * rel_err); + } + } + test_mm_rsqrt_ps(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_min_ss() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_min_ss(a, b); + assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0)); + } + test_mm_min_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_min_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_min_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0)); + + // `_mm_min_ps` can **not** be implemented using the `simd_min` rust intrinsic because + // the semantics of `simd_min` are different to those of `_mm_min_ps` regarding handling + // of `-0.0`. + let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0); + let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0); + let r1: [u8; 16] = transmute(_mm_min_ps(a, b)); + let r2: [u8; 16] = transmute(_mm_min_ps(b, a)); + let a: [u8; 16] = transmute(a); + let b: [u8; 16] = transmute(b); + assert_eq!(r1, b); + assert_eq!(r2, a); + assert_ne!(a, b); // sanity check that -0.0 is actually present + } + test_mm_min_ps(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_max_ss() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_max_ss(a, b); + assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, -10.0)); + } + test_mm_max_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_max_ps() { + let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); + let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); + let r = _mm_max_ps(a, b); + assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0)); + + // `_mm_max_ps` can **not** be implemented using the `simd_max` rust intrinsic because + // the semantics of `simd_max` are different to those of `_mm_max_ps` regarding handling + // of `-0.0`. + let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0); + let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0); + let r1: [u8; 16] = transmute(_mm_max_ps(a, b)); + let r2: [u8; 16] = transmute(_mm_max_ps(b, a)); + let a: [u8; 16] = transmute(a); + let b: [u8; 16] = transmute(b); + assert_eq!(r1, b); + assert_eq!(r2, a); + assert_ne!(a, b); // sanity check that -0.0 is actually present + } + test_mm_max_ps(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmpeq_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0); + let r: [u32; 4] = transmute(_mm_cmpeq_ss(a, b)); + let e: [u32; 4] = transmute(_mm_setr_ps(transmute(0u32), 2.0, 3.0, 4.0)); + assert_eq!(r, e); + + let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let r2: [u32; 4] = transmute(_mm_cmpeq_ss(a, b2)); + let e2: [u32; 4] = transmute(_mm_setr_ps(transmute(0xffffffffu32), 2.0, 3.0, 4.0)); + assert_eq!(r2, e2); + } + test_mm_cmpeq_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmplt_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = 0u32; // a.extract(0) < b.extract(0) + let c1 = 0u32; // a.extract(0) < c.extract(0) + let d1 = !0u32; // a.extract(0) < d.extract(0) + + let rb: [u32; 4] = transmute(_mm_cmplt_ss(a, b)); + let eb: [u32; 4] = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: [u32; 4] = transmute(_mm_cmplt_ss(a, c)); + let ec: [u32; 4] = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: [u32; 4] = transmute(_mm_cmplt_ss(a, d)); + let ed: [u32; 4] = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + test_mm_cmplt_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmple_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = 0u32; // a.extract(0) <= b.extract(0) + let c1 = !0u32; // a.extract(0) <= c.extract(0) + let d1 = !0u32; // a.extract(0) <= d.extract(0) + + let rb: [u32; 4] = transmute(_mm_cmple_ss(a, b)); + let eb: [u32; 4] = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: [u32; 4] = transmute(_mm_cmple_ss(a, c)); + let ec: [u32; 4] = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: [u32; 4] = transmute(_mm_cmple_ss(a, d)); + let ed: [u32; 4] = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + test_mm_cmple_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmpgt_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = !0u32; // a.extract(0) > b.extract(0) + let c1 = 0u32; // a.extract(0) > c.extract(0) + let d1 = 0u32; // a.extract(0) > d.extract(0) + + let rb: [u32; 4] = transmute(_mm_cmpgt_ss(a, b)); + let eb: [u32; 4] = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: [u32; 4] = transmute(_mm_cmpgt_ss(a, c)); + let ec: [u32; 4] = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: [u32; 4] = transmute(_mm_cmpgt_ss(a, d)); + let ed: [u32; 4] = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + test_mm_cmpgt_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmpge_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = !0u32; // a.extract(0) >= b.extract(0) + let c1 = !0u32; // a.extract(0) >= c.extract(0) + let d1 = 0u32; // a.extract(0) >= d.extract(0) + + let rb: [u32; 4] = transmute(_mm_cmpge_ss(a, b)); + let eb: [u32; 4] = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: [u32; 4] = transmute(_mm_cmpge_ss(a, c)); + let ec: [u32; 4] = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: [u32; 4] = transmute(_mm_cmpge_ss(a, d)); + let ed: [u32; 4] = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + test_mm_cmpge_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmpneq_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = !0u32; // a.extract(0) != b.extract(0) + let c1 = 0u32; // a.extract(0) != c.extract(0) + let d1 = !0u32; // a.extract(0) != d.extract(0) + + let rb: [u32; 4] = transmute(_mm_cmpneq_ss(a, b)); + let eb: [u32; 4] = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: [u32; 4] = transmute(_mm_cmpneq_ss(a, c)); + let ec: [u32; 4] = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: [u32; 4] = transmute(_mm_cmpneq_ss(a, d)); + let ed: [u32; 4] = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + test_mm_cmpneq_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmpnlt_ss() { + // TODO: this test is exactly the same as for `_mm_cmpge_ss`, but there + // must be a difference. It may have to do with behavior in the + // presence of NaNs (signaling or quiet). If so, we should add tests + // for those. + + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = !0u32; // a.extract(0) >= b.extract(0) + let c1 = !0u32; // a.extract(0) >= c.extract(0) + let d1 = 0u32; // a.extract(0) >= d.extract(0) + + let rb: [u32; 4] = transmute(_mm_cmpnlt_ss(a, b)); + let eb: [u32; 4] = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: [u32; 4] = transmute(_mm_cmpnlt_ss(a, c)); + let ec: [u32; 4] = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: [u32; 4] = transmute(_mm_cmpnlt_ss(a, d)); + let ed: [u32; 4] = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + test_mm_cmpnlt_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmpnle_ss() { + // TODO: this test is exactly the same as for `_mm_cmpgt_ss`, but there + // must be a difference. It may have to do with behavior in the + // presence + // of NaNs (signaling or quiet). If so, we should add tests for those. + + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = !0u32; // a.extract(0) > b.extract(0) + let c1 = 0u32; // a.extract(0) > c.extract(0) + let d1 = 0u32; // a.extract(0) > d.extract(0) + + let rb: [u32; 4] = transmute(_mm_cmpnle_ss(a, b)); + let eb: [u32; 4] = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: [u32; 4] = transmute(_mm_cmpnle_ss(a, c)); + let ec: [u32; 4] = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: [u32; 4] = transmute(_mm_cmpnle_ss(a, d)); + let ed: [u32; 4] = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + test_mm_cmpnle_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmpngt_ss() { + // TODO: this test is exactly the same as for `_mm_cmple_ss`, but there + // must be a difference. It may have to do with behavior in the + // presence of NaNs (signaling or quiet). If so, we should add tests + // for those. + + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = 0u32; // a.extract(0) <= b.extract(0) + let c1 = !0u32; // a.extract(0) <= c.extract(0) + let d1 = !0u32; // a.extract(0) <= d.extract(0) + + let rb: [u32; 4] = transmute(_mm_cmpngt_ss(a, b)); + let eb: [u32; 4] = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: [u32; 4] = transmute(_mm_cmpngt_ss(a, c)); + let ec: [u32; 4] = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: [u32; 4] = transmute(_mm_cmpngt_ss(a, d)); + let ed: [u32; 4] = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + test_mm_cmpngt_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmpnge_ss() { + // TODO: this test is exactly the same as for `_mm_cmplt_ss`, but there + // must be a difference. It may have to do with behavior in the + // presence of NaNs (signaling or quiet). If so, we should add tests + // for those. + + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = 0u32; // a.extract(0) < b.extract(0) + let c1 = 0u32; // a.extract(0) < c.extract(0) + let d1 = !0u32; // a.extract(0) < d.extract(0) + + let rb: [u32; 4] = transmute(_mm_cmpnge_ss(a, b)); + let eb: [u32; 4] = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: [u32; 4] = transmute(_mm_cmpnge_ss(a, c)); + let ec: [u32; 4] = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: [u32; 4] = transmute(_mm_cmpnge_ss(a, d)); + let ed: [u32; 4] = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + test_mm_cmpnge_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmpord_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = !0u32; // a.extract(0) ord b.extract(0) + let c1 = 0u32; // a.extract(0) ord c.extract(0) + let d1 = !0u32; // a.extract(0) ord d.extract(0) + + let rb: [u32; 4] = transmute(_mm_cmpord_ss(a, b)); + let eb: [u32; 4] = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: [u32; 4] = transmute(_mm_cmpord_ss(a, c)); + let ec: [u32; 4] = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: [u32; 4] = transmute(_mm_cmpord_ss(a, d)); + let ed: [u32; 4] = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + test_mm_cmpord_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmpunord_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); + let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0); + let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); + + let b1 = 0u32; // a.extract(0) unord b.extract(0) + let c1 = !0u32; // a.extract(0) unord c.extract(0) + let d1 = 0u32; // a.extract(0) unord d.extract(0) + + let rb: [u32; 4] = transmute(_mm_cmpunord_ss(a, b)); + let eb: [u32; 4] = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + assert_eq!(rb, eb); + + let rc: [u32; 4] = transmute(_mm_cmpunord_ss(a, c)); + let ec: [u32; 4] = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + assert_eq!(rc, ec); + + let rd: [u32; 4] = transmute(_mm_cmpunord_ss(a, d)); + let ed: [u32; 4] = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + assert_eq!(rd, ed); + } + test_mm_cmpunord_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmpeq_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); + let tru = !0u32; + let fls = 0u32; + + let e = [fls, fls, tru, fls]; + let r: [u32; 4] = transmute(_mm_cmpeq_ps(a, b)); + assert_eq!(r, e); + } + test_mm_cmpeq_ps(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmplt_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); + let tru = !0u32; + let fls = 0u32; + + let e = [tru, fls, fls, fls]; + let r: [u32; 4] = transmute(_mm_cmplt_ps(a, b)); + assert_eq!(r, e); + } + test_mm_cmplt_ps(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmple_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, 4.0); + let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); + let tru = !0u32; + let fls = 0u32; + + let e = [tru, fls, tru, fls]; + let r: [u32; 4] = transmute(_mm_cmple_ps(a, b)); + assert_eq!(r, e); + } + test_mm_cmple_ps(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmpgt_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0); + let tru = !0u32; + let fls = 0u32; + + let e = [fls, tru, fls, fls]; + let r: [u32; 4] = transmute(_mm_cmpgt_ps(a, b)); + assert_eq!(r, e); + } + test_mm_cmpgt_ps(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmpge_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0); + let tru = !0u32; + let fls = 0u32; + + let e = [fls, tru, tru, fls]; + let r: [u32; 4] = transmute(_mm_cmpge_ps(a, b)); + assert_eq!(r, e); + } + test_mm_cmpge_ps(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmpneq_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); + let tru = !0u32; + let fls = 0u32; + + let e = [tru, tru, fls, tru]; + let r: [u32; 4] = transmute(_mm_cmpneq_ps(a, b)); + assert_eq!(r, e); + } + test_mm_cmpneq_ps(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmpnlt_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); + let tru = !0u32; + let fls = 0u32; + + let e = [fls, tru, tru, tru]; + let r: [u32; 4] = transmute(_mm_cmpnlt_ps(a, b)); + assert_eq!(r, e); + } + test_mm_cmpnlt_ps(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmpnle_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); + let tru = !0u32; + let fls = 0u32; + + let e = [fls, tru, fls, tru]; + let r: [u32; 4] = transmute(_mm_cmpnle_ps(a, b)); + assert_eq!(r, e); + } + test_mm_cmpnle_ps(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmpngt_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); + let tru = !0u32; + let fls = 0u32; + + let e = [tru, fls, tru, tru]; + let r: [u32; 4] = transmute(_mm_cmpngt_ps(a, b)); + assert_eq!(r, e); + } + test_mm_cmpngt_ps(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmpnge_ps() { + let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); + let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); + let tru = !0u32; + let fls = 0u32; + + let e = [tru, fls, fls, tru]; + let r: [u32; 4] = transmute(_mm_cmpnge_ps(a, b)); + assert_eq!(r, e); + } + test_mm_cmpnge_ps(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmpord_ps() { + let a = _mm_setr_ps(10.0, 50.0, NAN, NAN); + let b = _mm_setr_ps(15.0, NAN, 1.0, NAN); + let tru = !0u32; + let fls = 0u32; + + let e = [tru, fls, fls, fls]; + let r: [u32; 4] = transmute(_mm_cmpord_ps(a, b)); + assert_eq!(r, e); + } + test_mm_cmpord_ps(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cmpunord_ps() { + let a = _mm_setr_ps(10.0, 50.0, NAN, NAN); + let b = _mm_setr_ps(15.0, NAN, 1.0, NAN); + let tru = !0u32; + let fls = 0u32; + + let e = [fls, tru, tru, tru]; + let r: [u32; 4] = transmute(_mm_cmpunord_ps(a, b)); + assert_eq!(r, e); + } + test_mm_cmpunord_ps(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_comieq_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[1i32, 0, 0, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_comieq_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + test_mm_comieq_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_comilt_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[0i32, 1, 0, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_comilt_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + test_mm_comilt_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_comile_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[1i32, 1, 0, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_comile_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + test_mm_comile_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_comigt_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[1i32, 0, 1, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_comige_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + test_mm_comigt_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_comineq_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[0i32, 1, 1, 1]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_comineq_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + test_mm_comineq_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_ucomieq_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[1i32, 0, 0, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_ucomieq_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + test_mm_ucomieq_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_ucomilt_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[0i32, 1, 0, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_ucomilt_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + test_mm_ucomilt_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_ucomile_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[1i32, 1, 0, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_ucomile_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + test_mm_ucomile_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_ucomigt_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[0i32, 0, 1, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_ucomigt_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + test_mm_ucomigt_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_ucomige_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[1i32, 0, 1, 0]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_ucomige_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + test_mm_ucomige_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_ucomineq_ss() { + let aa = &[3.0f32, 12.0, 23.0, NAN]; + let bb = &[3.0f32, 47.5, 1.5, NAN]; + + let ee = &[0i32, 1, 1, 1]; + + for i in 0..4 { + let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); + let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); + + let r = _mm_ucomineq_ss(a, b); + + assert_eq!( + ee[i], r, + "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})", + a, b, r, ee[i], i + ); + } + } + test_mm_ucomineq_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cvtss_si32() { + let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1]; + let result = &[42i32, -3, i32::MIN, 0, i32::MIN, 2147483520]; + for i in 0..inputs.len() { + let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0); + let e = result[i]; + let r = _mm_cvtss_si32(x); + assert_eq!(e, r, "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}", i, x, r, e); + } + } + test_mm_cvtss_si32(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cvttss_si32() { + let inputs = &[ + (42.0f32, 42i32), + (-31.4, -31), + (-33.5, -33), + (-34.5, -34), + (10.999, 10), + (-5.99, -5), + (4.0e10, i32::MIN), + (4.0e-10, 0), + (NAN, i32::MIN), + (2147483500.1, 2147483520), + ]; + for i in 0..inputs.len() { + let (xi, e) = inputs[i]; + let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0); + let r = _mm_cvttss_si32(x); + assert_eq!(e, r, "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}", i, x, r, e); + } + } + test_mm_cvttss_si32(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cvtss_f32() { + let a = _mm_setr_ps(312.0134, 5.0, 6.0, 7.0); + assert_eq!(_mm_cvtss_f32(a), 312.0134); + } + test_mm_cvtss_f32(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cvtsi32_ss() { + let inputs = &[ + (4555i32, 4555.0f32), + (322223333, 322223330.0), + (-432, -432.0), + (-322223333, -322223330.0), + ]; + + for i in 0..inputs.len() { + let (x, f) = inputs[i]; + let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtsi32_ss(a, x); + let e = _mm_setr_ps(f, 6.0, 7.0, 8.0); + assert_eq_m128(e, r); + } + } + test_mm_cvtsi32_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cvtss_si64() { + let inputs = &[ + (42.0f32, 42i64), + (-31.4, -31), + (-33.5, -34), + (-34.5, -34), + (4.0e10, 40_000_000_000), + (4.0e-10, 0), + (f32::NAN, i64::MIN), + (2147483500.1, 2147483520), + (9.223371e18, 9223370937343148032), + ]; + for i in 0..inputs.len() { + let (xi, e) = inputs[i]; + let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0); + let r = _mm_cvtss_si64(x); + assert_eq!(e, r, "TestCase #{} _mm_cvtss_si64({:?}) = {}, expected: {}", i, x, r, e); + } + } + test_mm_cvtss_si64(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cvttss_si64() { + let inputs = &[ + (42.0f32, 42i64), + (-31.4, -31), + (-33.5, -33), + (-34.5, -34), + (10.999, 10), + (-5.99, -5), + (4.0e10, 40_000_000_000), + (4.0e-10, 0), + (f32::NAN, i64::MIN), + (2147483500.1, 2147483520), + (9.223371e18, 9223370937343148032), + (9.223372e18, i64::MIN), + ]; + for i in 0..inputs.len() { + let (xi, e) = inputs[i]; + let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0); + let r = _mm_cvttss_si64(x); + assert_eq!(e, r, "TestCase #{} _mm_cvttss_si64({:?}) = {}, expected: {}", i, x, r, e); + } + } + test_mm_cvttss_si64(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_cvtsi64_ss() { + let inputs = &[ + (4555i64, 4555.0f32), + (322223333, 322223330.0), + (-432, -432.0), + (-322223333, -322223330.0), + (9223372036854775807, 9.223372e18), + (-9223372036854775808, -9.223372e18), + ]; + + for i in 0..inputs.len() { + let (x, f) = inputs[i]; + let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtsi64_ss(a, x); + let e = _mm_setr_ps(f, 6.0, 7.0, 8.0); + assert_eq_m128(e, r); + } + } + test_mm_cvtsi64_ss(); + + #[target_feature(enable = "sse")] + unsafe fn test_mm_movemask_ps() { + let r = _mm_movemask_ps(_mm_setr_ps(-1.0, 5.0, -5.0, 0.0)); + assert_eq!(r, 0b0101); + + let r = _mm_movemask_ps(_mm_setr_ps(-1.0, -5.0, -5.0, 0.0)); + assert_eq!(r, 0b0111); + } + test_mm_movemask_ps(); + + let x = 0i8; + _mm_prefetch(&x, _MM_HINT_T0); + _mm_prefetch(&x, _MM_HINT_T1); + _mm_prefetch(&x, _MM_HINT_T2); + _mm_prefetch(&x, _MM_HINT_NTA); + _mm_prefetch(&x, _MM_HINT_ET0); + _mm_prefetch(&x, _MM_HINT_ET1); +}