Auto merge of #2004 - RalfJung:simd, r=RalfJung

implement more SIMD intrinsics

Requires https://github.com/rust-lang/rust/pull/94681

With this, the cast, i32_ops, and f32_ops test suites of portable-simd pass. :)

Cc https://github.com/rust-lang/miri/issues/1912
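
To illustrate (a sketch distilled from the test updates below, not itself part of the commit), Miri can now run portable-simd code like:

```rust
#![feature(portable_simd)]
use std::simd::*;

fn main() {
    let a = f32x4::splat(10.0);
    let b = f32x4::from_array([1.0, 2.0, 3.0, -4.0]);
    assert_eq!(a.max(b), a);             // lane-wise maximum: simd_fmax
    assert_eq!(b.horizontal_max(), 3.0); // reduction: simd_reduce_max
    let c: i32x4 = b.cast();             // saturating cast: simd_as
    assert_eq!(c, i32x4::from_array([1, 2, 3, -4]));
}
```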
bors 2022-03-07 14:58:30 +00:00
commit 64b086a8e2
4 changed files with 346 additions and 78 deletions

View File

@@ -1 +1 @@
8876ca3dd46b99fe7e6ad937f11493d37996231e
297273c45b205820a4c055082c71677197a40b55

View File

@@ -345,7 +345,6 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx
bug!("simd_fabs operand is not a float")
};
let op = op.to_scalar()?;
// FIXME: Using host floats.
match float_ty {
FloatTy::F32 => Scalar::from_f32(op.to_f32()?.abs()),
FloatTy::F64 => Scalar::from_f64(op.to_f64()?.abs()),
@@ -371,7 +370,9 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx
| "simd_lt"
| "simd_le"
| "simd_gt"
| "simd_ge" => {
| "simd_ge"
| "simd_fmax"
| "simd_fmin" => {
use mir::BinOp;
let &[ref left, ref right] = check_arg_count(args)?;
@@ -382,23 +383,30 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx
assert_eq!(dest_len, left_len);
assert_eq!(dest_len, right_len);
let mir_op = match intrinsic_name {
"simd_add" => BinOp::Add,
"simd_sub" => BinOp::Sub,
"simd_mul" => BinOp::Mul,
"simd_div" => BinOp::Div,
"simd_rem" => BinOp::Rem,
"simd_shl" => BinOp::Shl,
"simd_shr" => BinOp::Shr,
"simd_and" => BinOp::BitAnd,
"simd_or" => BinOp::BitOr,
"simd_xor" => BinOp::BitXor,
"simd_eq" => BinOp::Eq,
"simd_ne" => BinOp::Ne,
"simd_lt" => BinOp::Lt,
"simd_le" => BinOp::Le,
"simd_gt" => BinOp::Gt,
"simd_ge" => BinOp::Ge,
enum Op {
MirOp(BinOp),
FMax,
FMin,
}
let which = match intrinsic_name {
"simd_add" => Op::MirOp(BinOp::Add),
"simd_sub" => Op::MirOp(BinOp::Sub),
"simd_mul" => Op::MirOp(BinOp::Mul),
"simd_div" => Op::MirOp(BinOp::Div),
"simd_rem" => Op::MirOp(BinOp::Rem),
"simd_shl" => Op::MirOp(BinOp::Shl),
"simd_shr" => Op::MirOp(BinOp::Shr),
"simd_and" => Op::MirOp(BinOp::BitAnd),
"simd_or" => Op::MirOp(BinOp::BitOr),
"simd_xor" => Op::MirOp(BinOp::BitXor),
"simd_eq" => Op::MirOp(BinOp::Eq),
"simd_ne" => Op::MirOp(BinOp::Ne),
"simd_lt" => Op::MirOp(BinOp::Lt),
"simd_le" => Op::MirOp(BinOp::Le),
"simd_gt" => Op::MirOp(BinOp::Gt),
"simd_ge" => Op::MirOp(BinOp::Ge),
"simd_fmax" => Op::FMax,
"simd_fmin" => Op::FMin,
_ => unreachable!(),
};
@@ -406,26 +414,36 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx
let left = this.read_immediate(&this.mplace_index(&left, i)?.into())?;
let right = this.read_immediate(&this.mplace_index(&right, i)?.into())?;
let dest = this.mplace_index(&dest, i)?;
let (val, overflowed, ty) = this.overflowing_binary_op(mir_op, &left, &right)?;
if matches!(mir_op, BinOp::Shl | BinOp::Shr) {
// Shifts have extra UB as SIMD operations that the MIR binop does not have.
// See <https://github.com/rust-lang/rust/issues/91237>.
if overflowed {
let r_val = right.to_scalar()?.to_bits(right.layout.size)?;
throw_ub_format!("overflowing shift by {} in `{}` in SIMD lane {}", r_val, intrinsic_name, i);
let val = match which {
Op::MirOp(mir_op) => {
let (val, overflowed, ty) = this.overflowing_binary_op(mir_op, &left, &right)?;
if matches!(mir_op, BinOp::Shl | BinOp::Shr) {
// Shifts have extra UB as SIMD operations that the MIR binop does not have.
// See <https://github.com/rust-lang/rust/issues/91237>.
if overflowed {
let r_val = right.to_scalar()?.to_bits(right.layout.size)?;
throw_ub_format!("overflowing shift by {} in `{}` in SIMD lane {}", r_val, intrinsic_name, i);
}
}
if matches!(mir_op, BinOp::Eq | BinOp::Ne | BinOp::Lt | BinOp::Le | BinOp::Gt | BinOp::Ge) {
// Special handling for boolean-returning operations
assert_eq!(ty, this.tcx.types.bool);
let val = val.to_bool().unwrap();
bool_to_simd_element(val, dest.layout.size)
} else {
assert_ne!(ty, this.tcx.types.bool);
assert_eq!(ty, dest.layout.ty);
val
}
}
}
if matches!(mir_op, BinOp::Eq | BinOp::Ne | BinOp::Lt | BinOp::Le | BinOp::Gt | BinOp::Ge) {
// Special handling for boolean-returning operations
assert_eq!(ty, this.tcx.types.bool);
let val = val.to_bool().unwrap();
let val = bool_to_simd_element(val, dest.layout.size);
this.write_scalar(val, &dest.into())?;
} else {
assert_ne!(ty, this.tcx.types.bool);
assert_eq!(ty, dest.layout.ty);
this.write_scalar(val, &dest.into())?;
}
Op::FMax => {
fmax_op(&left, &right)?
}
Op::FMin => {
fmin_op(&left, &right)?
}
};
this.write_scalar(val, &dest.into())?;
}
}
#[rustfmt::skip]
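
The shift check above is the one subtle case: the MIR binop merely flags overflow, but for the SIMD intrinsics an out-of-range shift amount is immediate UB (rust-lang/rust#91237). A minimal sketch of a program this check rejects, written against the raw platform intrinsic (nightly features as named; the error text is composed from the `throw_ub_format!` above):

```rust
#![feature(platform_intrinsics, repr_simd)]

extern "platform-intrinsic" {
    fn simd_shl<T>(x: T, y: T) -> T;
}

#[repr(simd)]
#[allow(non_camel_case_types)]
#[derive(Copy, Clone)]
struct i32x2(i32, i32);

fn main() {
    unsafe {
        // Lane 0 shifts by 32 >= the 32-bit width: UB, reported by Miri as
        // "overflowing shift by 32 in `simd_shl` in SIMD lane 0".
        let _ = simd_shl(i32x2(1, 1), i32x2(32, 0));
    }
}
```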
@@ -433,7 +451,9 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx
| "simd_reduce_or"
| "simd_reduce_xor"
| "simd_reduce_any"
| "simd_reduce_all" => {
| "simd_reduce_all"
| "simd_reduce_max"
| "simd_reduce_min" => {
use mir::BinOp;
let &[ref op] = check_arg_count(args)?;
@@ -445,19 +465,27 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx
enum Op {
MirOp(BinOp),
MirOpBool(BinOp),
Max,
Min,
}
// The initial value is the neutral element.
let (which, init) = match intrinsic_name {
"simd_reduce_and" => (Op::MirOp(BinOp::BitAnd), ImmTy::from_int(-1, dest.layout)),
"simd_reduce_or" => (Op::MirOp(BinOp::BitOr), ImmTy::from_int(0, dest.layout)),
"simd_reduce_xor" => (Op::MirOp(BinOp::BitXor), ImmTy::from_int(0, dest.layout)),
"simd_reduce_any" => (Op::MirOpBool(BinOp::BitOr), imm_from_bool(false)),
"simd_reduce_all" => (Op::MirOpBool(BinOp::BitAnd), imm_from_bool(true)),
let which = match intrinsic_name {
"simd_reduce_and" => Op::MirOp(BinOp::BitAnd),
"simd_reduce_or" => Op::MirOp(BinOp::BitOr),
"simd_reduce_xor" => Op::MirOp(BinOp::BitXor),
"simd_reduce_any" => Op::MirOpBool(BinOp::BitOr),
"simd_reduce_all" => Op::MirOpBool(BinOp::BitAnd),
"simd_reduce_max" => Op::Max,
"simd_reduce_min" => Op::Min,
_ => unreachable!(),
};
let mut res = init;
for i in 0..op_len {
// Initialize with first lane, then proceed with the rest.
let mut res = this.read_immediate(&this.mplace_index(&op, 0)?.into())?;
if matches!(which, Op::MirOpBool(_)) {
// Convert to `bool` scalar.
res = imm_from_bool(simd_element_to_bool(res)?);
}
for i in 1..op_len {
let op = this.read_immediate(&this.mplace_index(&op, i)?.into())?;
res = match which {
Op::MirOp(mir_op) => {
@@ -467,6 +495,30 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx
let op = imm_from_bool(simd_element_to_bool(op)?);
this.binary_op(mir_op, &res, &op)?
}
Op::Max => {
if matches!(res.layout.ty.kind(), ty::Float(_)) {
ImmTy::from_scalar(fmax_op(&res, &op)?, res.layout)
} else {
// Just boring integers, so no NaNs to worry about
if this.binary_op(BinOp::Ge, &res, &op)?.to_scalar()?.to_bool()? {
res
} else {
op
}
}
}
Op::Min => {
if matches!(res.layout.ty.kind(), ty::Float(_)) {
ImmTy::from_scalar(fmin_op(&res, &op)?, res.layout)
} else {
// Just boring integers, so no NaNs to worry about
if this.binary_op(BinOp::Le, &res, &op)?.to_scalar()?.to_bool()? {
res
} else {
op
}
}
}
};
}
this.write_immediate(*res, dest)?;
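
Seeding with lane 0 instead of a neutral element matters for the new reductions: max/min have no single initializer that is neutral for every element type, and for floats any choice would perturb the NaN behavior. A plain-Rust analogue of the lane-0-seeded fold (`reduce_max` is a hypothetical helper, not Miri code):

```rust
// Plain-Rust analogue of the fold used for simd_reduce_max above.
fn reduce_max(lanes: &[f32]) -> f32 {
    let mut res = lanes[0]; // seed with the first lane, not a neutral element
    for &lane in &lanes[1..] {
        res = res.max(lane); // maxNum semantics: NaN loses to any number
    }
    res
}

fn main() {
    assert_eq!(reduce_max(&[0.0, f32::NAN]), 0.0);
    assert_eq!(reduce_max(&[f32::NAN, 0.0]), 0.0);
}
```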
@@ -515,6 +567,45 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx
this.write_immediate(*val, &dest.into())?;
}
}
#[rustfmt::skip]
"simd_cast" | "simd_as" => {
let &[ref op] = check_arg_count(args)?;
let (op, op_len) = this.operand_to_simd(op)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, op_len);
let safe_cast = intrinsic_name == "simd_as";
for i in 0..dest_len {
let op = this.read_immediate(&this.mplace_index(&op, i)?.into())?;
let dest = this.mplace_index(&dest, i)?;
let val = match (op.layout.ty.kind(), dest.layout.ty.kind()) {
// Int-to-(int|float): always safe
(ty::Int(_) | ty::Uint(_), ty::Int(_) | ty::Uint(_) | ty::Float(_)) =>
this.misc_cast(&op, dest.layout.ty)?,
// Float-to-float: always safe
(ty::Float(_), ty::Float(_)) =>
this.misc_cast(&op, dest.layout.ty)?,
// Float-to-int in safe mode
(ty::Float(_), ty::Int(_) | ty::Uint(_)) if safe_cast =>
this.misc_cast(&op, dest.layout.ty)?,
// Float-to-int in unchecked mode
(ty::Float(FloatTy::F32), ty::Int(_) | ty::Uint(_)) if !safe_cast =>
this.float_to_int_unchecked(op.to_scalar()?.to_f32()?, dest.layout.ty)?.into(),
(ty::Float(FloatTy::F64), ty::Int(_) | ty::Uint(_)) if !safe_cast =>
this.float_to_int_unchecked(op.to_scalar()?.to_f64()?, dest.layout.ty)?.into(),
_ =>
throw_unsup_format!(
"Unsupported SIMD cast from element type {} to {}",
op.layout.ty,
dest.layout.ty
),
};
this.write_immediate(val, &dest.into())?;
}
}
// Atomic operations
"atomic_load" => this.atomic_load(args, dest, AtomicReadOp::SeqCst)?,
@@ -1003,3 +1094,35 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx
})
}
}
fn fmax_op<'tcx>(
left: &ImmTy<'tcx, Tag>,
right: &ImmTy<'tcx, Tag>,
) -> InterpResult<'tcx, Scalar<Tag>> {
assert_eq!(left.layout.ty, right.layout.ty);
let ty::Float(float_ty) = left.layout.ty.kind() else {
bug!("fmax operand is not a float")
};
let left = left.to_scalar()?;
let right = right.to_scalar()?;
Ok(match float_ty {
FloatTy::F32 => Scalar::from_f32(left.to_f32()?.max(right.to_f32()?)),
FloatTy::F64 => Scalar::from_f64(left.to_f64()?.max(right.to_f64()?)),
})
}
fn fmin_op<'tcx>(
left: &ImmTy<'tcx, Tag>,
right: &ImmTy<'tcx, Tag>,
) -> InterpResult<'tcx, Scalar<Tag>> {
assert_eq!(left.layout.ty, right.layout.ty);
let ty::Float(float_ty) = left.layout.ty.kind() else {
bug!("fmin operand is not a float")
};
let left = left.to_scalar()?;
let right = right.to_scalar()?;
Ok(match float_ty {
FloatTy::F32 => Scalar::from_f32(left.to_f32()?.min(right.to_f32()?)),
FloatTy::F64 => Scalar::from_f64(left.to_f64()?.min(right.to_f64()?)),
})
}
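
Note the NaN semantics these helpers pick up: a NaN operand loses to a number on either side, and the result is NaN only when both operands are NaN, exactly what the new tests below exercise. Scalar `f32::max`/`f32::min` in std follow the same rule, so the behavior can be sketched with host floats:

```rust
// Host-float sketch of the maxNum/minNum behavior fmax_op/fmin_op implement.
fn main() {
    assert_eq!(f32::NAN.max(0.0), 0.0);       // a NaN operand loses...
    assert_eq!(0.0f32.min(f32::NAN), 0.0);    // ...on either side
    assert!(f32::NAN.max(f32::NAN).is_nan()); // NaN only if both are NaN
}
```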

View File

@@ -0,0 +1,7 @@
// error-pattern: cannot be represented in target type `i32`
#![feature(portable_simd)]
use std::simd::*;
fn main() { unsafe {
let _x : i32x2 = f32x2::from_array([f32::MAX, f32::MIN]).to_int_unchecked();
} }
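
For contrast, the checked counterpart, the `simd_as` intrinsic reached through `Simd::cast`, performs the same conversion by saturating instead of triggering UB; a minimal sketch:

```rust
#![feature(portable_simd)]
use std::simd::*;

fn main() {
    // Same out-of-range values as above, but through the saturating
    // `simd_as` intrinsic: defined behavior, clamped to the i32 range.
    let x = f32x2::from_array([f32::MAX, f32::MIN]);
    let y: i32x2 = x.cast();
    assert_eq!(y, i32x2::from_array([i32::MAX, i32::MIN]));
}
```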

View File

@@ -12,19 +12,37 @@ fn simd_ops_f32() {
assert_eq!(a / f32x4::splat(2.0), f32x4::splat(5.0));
assert_eq!(a % b, f32x4::from_array([0.0, 0.0, 1.0, 2.0]));
assert_eq!(b.abs(), f32x4::from_array([1.0, 2.0, 3.0, 4.0]));
assert_eq!(a.max(b * f32x4::splat(4.0)), f32x4::from_array([10.0, 10.0, 12.0, 10.0]));
assert_eq!(a.min(b * f32x4::splat(4.0)), f32x4::from_array([4.0, 8.0, 10.0, -16.0]));
// FIXME use Mask::from_array once simd_cast is implemented.
assert_eq!(a.lanes_eq(f32x4::splat(5.0)*b), Mask::from_int(i32x4::from_array([0, -1, 0, 0])));
assert_eq!(a.lanes_ne(f32x4::splat(5.0)*b), Mask::from_int(i32x4::from_array([-1, 0, -1, -1])));
assert_eq!(a.lanes_le(f32x4::splat(5.0)*b), Mask::from_int(i32x4::from_array([0, -1, -1, 0])));
assert_eq!(a.lanes_lt(f32x4::splat(5.0)*b), Mask::from_int(i32x4::from_array([0, 0, -1, 0])));
assert_eq!(a.lanes_ge(f32x4::splat(5.0)*b), Mask::from_int(i32x4::from_array([-1, -1, 0, -1])));
assert_eq!(a.lanes_gt(f32x4::splat(5.0)*b), Mask::from_int(i32x4::from_array([-1, 0, 0, -1])));
assert_eq!(a.lanes_eq(f32x4::splat(5.0) * b), Mask::from_array([false, true, false, false]));
assert_eq!(a.lanes_ne(f32x4::splat(5.0) * b), Mask::from_array([true, false, true, true]));
assert_eq!(a.lanes_le(f32x4::splat(5.0) * b), Mask::from_array([false, true, true, false]));
assert_eq!(a.lanes_lt(f32x4::splat(5.0) * b), Mask::from_array([false, false, true, false]));
assert_eq!(a.lanes_ge(f32x4::splat(5.0) * b), Mask::from_array([true, true, false, true]));
assert_eq!(a.lanes_gt(f32x4::splat(5.0) * b), Mask::from_array([true, false, false, true]));
assert_eq!(a.horizontal_sum(), 40.0);
assert_eq!(b.horizontal_sum(), 2.0);
assert_eq!(a.horizontal_product(), 100.0*100.0);
assert_eq!(a.horizontal_product(), 100.0 * 100.0);
assert_eq!(b.horizontal_product(), -24.0);
assert_eq!(a.horizontal_max(), 10.0);
assert_eq!(b.horizontal_max(), 3.0);
assert_eq!(a.horizontal_min(), 10.0);
assert_eq!(b.horizontal_min(), -4.0);
assert_eq!(
f32x2::from_array([0.0, f32::NAN]).max(f32x2::from_array([f32::NAN, 0.0])),
f32x2::from_array([0.0, 0.0])
);
assert_eq!(f32x2::from_array([0.0, f32::NAN]).horizontal_max(), 0.0);
assert_eq!(f32x2::from_array([f32::NAN, 0.0]).horizontal_max(), 0.0);
assert_eq!(
f32x2::from_array([0.0, f32::NAN]).min(f32x2::from_array([f32::NAN, 0.0])),
f32x2::from_array([0.0, 0.0])
);
assert_eq!(f32x2::from_array([0.0, f32::NAN]).horizontal_min(), 0.0);
assert_eq!(f32x2::from_array([f32::NAN, 0.0]).horizontal_min(), 0.0);
}
fn simd_ops_f64() {
@@ -38,19 +56,37 @@ fn simd_ops_f64() {
assert_eq!(a / f64x4::splat(2.0), f64x4::splat(5.0));
assert_eq!(a % b, f64x4::from_array([0.0, 0.0, 1.0, 2.0]));
assert_eq!(b.abs(), f64x4::from_array([1.0, 2.0, 3.0, 4.0]));
assert_eq!(a.max(b * f64x4::splat(4.0)), f64x4::from_array([10.0, 10.0, 12.0, 10.0]));
assert_eq!(a.min(b * f64x4::splat(4.0)), f64x4::from_array([4.0, 8.0, 10.0, -16.0]));
// FIXME use Mask::from_array once simd_cast is implemented.
assert_eq!(a.lanes_eq(f64x4::splat(5.0)*b), Mask::from_int(i64x4::from_array([0, -1, 0, 0])));
assert_eq!(a.lanes_ne(f64x4::splat(5.0)*b), Mask::from_int(i64x4::from_array([-1, 0, -1, -1])));
assert_eq!(a.lanes_le(f64x4::splat(5.0)*b), Mask::from_int(i64x4::from_array([0, -1, -1, 0])));
assert_eq!(a.lanes_lt(f64x4::splat(5.0)*b), Mask::from_int(i64x4::from_array([0, 0, -1, 0])));
assert_eq!(a.lanes_ge(f64x4::splat(5.0)*b), Mask::from_int(i64x4::from_array([-1, -1, 0, -1])));
assert_eq!(a.lanes_gt(f64x4::splat(5.0)*b), Mask::from_int(i64x4::from_array([-1, 0, 0, -1])));
assert_eq!(a.lanes_eq(f64x4::splat(5.0) * b), Mask::from_array([false, true, false, false]));
assert_eq!(a.lanes_ne(f64x4::splat(5.0) * b), Mask::from_array([true, false, true, true]));
assert_eq!(a.lanes_le(f64x4::splat(5.0) * b), Mask::from_array([false, true, true, false]));
assert_eq!(a.lanes_lt(f64x4::splat(5.0) * b), Mask::from_array([false, false, true, false]));
assert_eq!(a.lanes_ge(f64x4::splat(5.0) * b), Mask::from_array([true, true, false, true]));
assert_eq!(a.lanes_gt(f64x4::splat(5.0) * b), Mask::from_array([true, false, false, true]));
assert_eq!(a.horizontal_sum(), 40.0);
assert_eq!(b.horizontal_sum(), 2.0);
assert_eq!(a.horizontal_product(), 100.0*100.0);
assert_eq!(a.horizontal_product(), 100.0 * 100.0);
assert_eq!(b.horizontal_product(), -24.0);
assert_eq!(a.horizontal_max(), 10.0);
assert_eq!(b.horizontal_max(), 3.0);
assert_eq!(a.horizontal_min(), 10.0);
assert_eq!(b.horizontal_min(), -4.0);
assert_eq!(
f64x2::from_array([0.0, f64::NAN]).max(f64x2::from_array([f64::NAN, 0.0])),
f64x2::from_array([0.0, 0.0])
);
assert_eq!(f64x2::from_array([0.0, f64::NAN]).horizontal_max(), 0.0);
assert_eq!(f64x2::from_array([f64::NAN, 0.0]).horizontal_max(), 0.0);
assert_eq!(
f64x2::from_array([0.0, f64::NAN]).min(f64x2::from_array([f64::NAN, 0.0])),
f64x2::from_array([0.0, 0.0])
);
assert_eq!(f64x2::from_array([0.0, f64::NAN]).horizontal_min(), 0.0);
assert_eq!(f64x2::from_array([f64::NAN, 0.0]).horizontal_min(), 0.0);
}
fn simd_ops_i32() {
@@ -65,19 +101,33 @@ fn simd_ops_i32() {
assert_eq!(i32x2::splat(i32::MIN) / i32x2::splat(-1), i32x2::splat(i32::MIN));
assert_eq!(a % b, i32x4::from_array([0, 0, 1, 2]));
assert_eq!(i32x2::splat(i32::MIN) % i32x2::splat(-1), i32x2::splat(0));
assert_eq!(b.abs(), i32x4::from_array([1, 2, 3, 4]));
// FIXME not a per-lane method (https://github.com/rust-lang/rust/issues/94682)
// assert_eq!(a.max(b * i32x4::splat(4)), i32x4::from_array([10, 10, 12, 10]));
// assert_eq!(a.min(b * i32x4::splat(4)), i32x4::from_array([4, 8, 10, -16]));
assert_eq!(!b, i32x4::from_array([!1, !2, !3, !-4]));
assert_eq!(b << i32x4::splat(2), i32x4::from_array([4, 8, 12, -16]));
assert_eq!(b >> i32x4::splat(1), i32x4::from_array([0, 1, 1, -2]));
assert_eq!(b & i32x4::splat(2), i32x4::from_array([0, 2, 2, 0]));
assert_eq!(b | i32x4::splat(2), i32x4::from_array([3, 2, 3, -2]));
assert_eq!(b ^ i32x4::splat(2), i32x4::from_array([3, 0, 1, -2]));
// FIXME use Mask::from_array once simd_cast is implemented.
assert_eq!(a.lanes_eq(i32x4::splat(5)*b), Mask::from_int(i32x4::from_array([0, -1, 0, 0])));
assert_eq!(a.lanes_ne(i32x4::splat(5)*b), Mask::from_int(i32x4::from_array([-1, 0, -1, -1])));
assert_eq!(a.lanes_le(i32x4::splat(5)*b), Mask::from_int(i32x4::from_array([0, -1, -1, 0])));
assert_eq!(a.lanes_lt(i32x4::splat(5)*b), Mask::from_int(i32x4::from_array([0, 0, -1, 0])));
assert_eq!(a.lanes_ge(i32x4::splat(5)*b), Mask::from_int(i32x4::from_array([-1, -1, 0, -1])));
assert_eq!(a.lanes_gt(i32x4::splat(5)*b), Mask::from_int(i32x4::from_array([-1, 0, 0, -1])));
assert_eq!(a.lanes_eq(i32x4::splat(5) * b), Mask::from_array([false, true, false, false]));
assert_eq!(a.lanes_ne(i32x4::splat(5) * b), Mask::from_array([true, false, true, true]));
assert_eq!(a.lanes_le(i32x4::splat(5) * b), Mask::from_array([false, true, true, false]));
assert_eq!(a.lanes_lt(i32x4::splat(5) * b), Mask::from_array([false, false, true, false]));
assert_eq!(a.lanes_ge(i32x4::splat(5) * b), Mask::from_array([true, true, false, true]));
assert_eq!(a.lanes_gt(i32x4::splat(5) * b), Mask::from_array([true, false, false, true]));
assert_eq!(a.horizontal_sum(), 40);
assert_eq!(b.horizontal_sum(), 2);
assert_eq!(a.horizontal_product(), 100 * 100);
assert_eq!(b.horizontal_product(), -24);
assert_eq!(a.horizontal_max(), 10);
assert_eq!(b.horizontal_max(), 3);
assert_eq!(a.horizontal_min(), 10);
assert_eq!(b.horizontal_min(), -4);
assert_eq!(a.horizontal_and(), 10);
assert_eq!(b.horizontal_and(), 0);
@@ -85,10 +135,90 @@ fn simd_ops_i32() {
assert_eq!(b.horizontal_or(), -1);
assert_eq!(a.horizontal_xor(), 0);
assert_eq!(b.horizontal_xor(), -4);
assert_eq!(a.horizontal_sum(), 40);
assert_eq!(b.horizontal_sum(), 2);
assert_eq!(a.horizontal_product(), 100*100);
assert_eq!(b.horizontal_product(), -24);
}
fn simd_mask() {
let intmask = Mask::from_int(i32x4::from_array([0, -1, 0, 0]));
assert_eq!(intmask, Mask::from_array([false, true, false, false]));
assert_eq!(intmask.to_array(), [false, true, false, false]);
}
fn simd_cast() {
// between integer types
assert_eq!(i32x4::from_array([1, 2, 3, -4]), i16x4::from_array([1, 2, 3, -4]).cast());
assert_eq!(i16x4::from_array([1, 2, 3, -4]), i32x4::from_array([1, 2, 3, -4]).cast());
assert_eq!(i32x4::from_array([1, -1, 3, 4]), u64x4::from_array([1, u64::MAX, 3, 4]).cast());
// float -> int
assert_eq!(
i8x4::from_array([127, -128, 127, -128]),
f32x4::from_array([127.99, -128.99, 999.0, -999.0]).cast()
);
assert_eq!(
i32x4::from_array([0, 1, -1, 2147483520]),
f32x4::from_array([
-0.0,
/*0x1.19999ap+0*/ f32::from_bits(0x3f8ccccd),
/*-0x1.19999ap+0*/ f32::from_bits(0xbf8ccccd),
2147483520.0
])
.cast()
);
assert_eq!(
i32x8::from_array([i32::MAX, i32::MIN, i32::MAX, i32::MIN, i32::MAX, i32::MIN, 0, 0]),
f32x8::from_array([
2147483648.0f32,
-2147483904.0f32,
f32::MAX,
f32::MIN,
f32::INFINITY,
f32::NEG_INFINITY,
f32::NAN,
-f32::NAN,
])
.cast()
);
// int -> float
assert_eq!(
f32x4::from_array([
-2147483648.0,
/*0x1.26580cp+30*/ f32::from_bits(0x4e932c06),
16777220.0,
-16777220.0,
]),
i32x4::from_array([-2147483647i32, 1234567890i32, 16777219i32, -16777219i32]).cast()
);
// float -> float
assert_eq!(
f32x4::from_array([f32::INFINITY, f32::INFINITY, f32::NEG_INFINITY, f32::NEG_INFINITY]),
f64x4::from_array([f64::MAX, f64::INFINITY, f64::MIN, f64::NEG_INFINITY]).cast()
);
// unchecked casts
unsafe {
assert_eq!(
i32x4::from_array([0, 1, -1, 2147483520]),
f32x4::from_array([
-0.0,
/*0x1.19999ap+0*/ f32::from_bits(0x3f8ccccd),
/*-0x1.19999ap+0*/ f32::from_bits(0xbf8ccccd),
2147483520.0
])
.to_int_unchecked()
);
assert_eq!(
u64x4::from_array([0, 10000000000000000, u64::MAX - 2047, 9223372036854775808]),
f64x4::from_array([
-0.99999999999,
1e16,
(u64::MAX - 1024) as f64,
9223372036854775808.0
])
.to_int_unchecked()
);
}
}
fn simd_intrinsics() {
@@ -112,14 +242,22 @@ fn simd_intrinsics() {
assert!(simd_reduce_all(i32x4::splat(-1)));
assert!(!simd_reduce_all(i32x2::from_array([0, -1])));
assert_eq!(simd_select(i8x4::from_array([0, -1, -1, 0]), a, b), i32x4::from_array([1, 10, 10, 4]));
assert_eq!(simd_select(i8x4::from_array([0, -1, -1, 0]), b, a), i32x4::from_array([10, 2, 10, 10]));
assert_eq!(
simd_select(i8x4::from_array([0, -1, -1, 0]), a, b),
i32x4::from_array([1, 10, 10, 4])
);
assert_eq!(
simd_select(i8x4::from_array([0, -1, -1, 0]), b, a),
i32x4::from_array([10, 2, 10, 10])
);
}
}
fn main() {
simd_mask();
simd_ops_f32();
simd_ops_f64();
simd_ops_i32();
simd_cast();
simd_intrinsics();
}