Add new swizzle API

Expand swizzle API and migrate existing functions. Add rotate_left, rotate_right. Hide implementation details. Add simd_shuffle macro.
2021-09-15 04:59:03 +00:00 · 2021-09-15 04:59:03 +00:00 · 10168fb7c4
commit 10168fb7c4
parent a16b481a08
8 changed files with 491 additions and 261 deletions
--- a/crates/core_simd/examples/matrix_inversion.rs
+++ b/crates/core_simd/examples/matrix_inversion.rs
@ -2,6 +2,7 @@
 // Code ported from the `packed_simd` crate
 // Run this code with `cargo test --example matrix_inversion`
 #![feature(array_chunks, portable_simd)]
+use core_simd::Which::*;
 use core_simd::*;

 // Gotta define our own 4x4 matrix since Rust doesn't ship multidim arrays yet :^)
@ -163,86 +164,81 @@ pub fn simd_inv4x4(m: Matrix4x4) -> Option<Matrix4x4> {
    let m_2 = f32x4::from_array(m[2]);
    let m_3 = f32x4::from_array(m[3]);

-    // 2 argument shuffle, returns an f32x4
-    // the first f32x4 is indexes 0..=3
-    // the second f32x4 is indexed 4..=7
-    let tmp1 = f32x4::shuffle::<{ [0, 1, 4, 5] }>(m_0, m_1);
-    let row1 = f32x4::shuffle::<{ [0, 1, 4, 5] }>(m_2, m_3);
+    const SHUFFLE01: [Which; 4] = [First(0), First(1), Second(0), Second(1)];
+    const SHUFFLE02: [Which; 4] = [First(0), First(2), Second(0), Second(2)];
+    const SHUFFLE13: [Which; 4] = [First(1), First(3), Second(1), Second(3)];
+    const SHUFFLE23: [Which; 4] = [First(2), First(3), Second(2), Second(3)];

-    let row0 = f32x4::shuffle::<{ [0, 2, 4, 6] }>(tmp1, row1);
-    let row1 = f32x4::shuffle::<{ [1, 3, 5, 7] }>(row1, tmp1);
+    let tmp = simd_shuffle!(m_0, m_1, SHUFFLE01);
+    let row1 = simd_shuffle!(m_2, m_3, SHUFFLE01);

-    let tmp1 = f32x4::shuffle::<{ [2, 3, 6, 7] }>(m_0, m_1);
-    let row3 = f32x4::shuffle::<{ [2, 3, 6, 7] }>(m_2, m_3);
-    let row2 = f32x4::shuffle::<{ [0, 2, 4, 6] }>(tmp1, row3);
-    let row3 = f32x4::shuffle::<{ [1, 3, 5, 7] }>(row3, tmp1);
+    let row0 = simd_shuffle!(tmp, row1, SHUFFLE02);
+    let row1 = simd_shuffle!(row1, tmp, SHUFFLE13);

-    let tmp1 = row2 * row3;
-    // there's no syntax for a 1 arg shuffle yet,
-    // so we just pass the same f32x4 twice
-    let tmp1 = f32x4::shuffle::<{ [1, 0, 3, 2] }>(tmp1, tmp1);
+    let tmp = simd_shuffle!(m_0, m_1, SHUFFLE23);
+    let row3 = simd_shuffle!(m_2, m_3, SHUFFLE23);
+    let row2 = simd_shuffle!(tmp, row3, SHUFFLE02);
+    let row3 = simd_shuffle!(row3, tmp, SHUFFLE13);

-    let minor0 = row1 * tmp1;
-    let minor1 = row0 * tmp1;
-    let tmp1 = f32x4::shuffle::<{ [2, 3, 0, 1] }>(tmp1, tmp1);
-    let minor0 = (row1 * tmp1) - minor0;
-    let minor1 = (row0 * tmp1) - minor1;
-    let minor1 = f32x4::shuffle::<{ [2, 3, 0, 1] }>(minor1, minor1);
+    let tmp = (row2 * row3).reverse().rotate_right::<2>();
+    let minor0 = row1 * tmp;
+    let minor1 = row0 * tmp;
+    let tmp = tmp.rotate_right::<2>();
+    let minor0 = (row1 * tmp) - minor0;
+    let minor1 = (row0 * tmp) - minor1;
+    let minor1 = minor1.rotate_right::<2>();

-    let tmp1 = row1 * row2;
-    let tmp1 = f32x4::shuffle::<{ [1, 0, 3, 2] }>(tmp1, tmp1);
-    let minor0 = (row3 * tmp1) + minor0;
-    let minor3 = row0 * tmp1;
-    let tmp1 = f32x4::shuffle::<{ [2, 3, 0, 1] }>(tmp1, tmp1);
+    let tmp = (row1 * row2).reverse().rotate_right::<2>();
+    let minor0 = (row3 * tmp) + minor0;
+    let minor3 = row0 * tmp;
+    let tmp = tmp.rotate_right::<2>();

-    let minor0 = minor0 - row3 * tmp1;
-    let minor3 = row0 * tmp1 - minor3;
-    let minor3 = f32x4::shuffle::<{ [2, 3, 0, 1] }>(minor3, minor3);
+    let minor0 = minor0 - row3 * tmp;
+    let minor3 = row0 * tmp - minor3;
+    let minor3 = minor3.rotate_right::<2>();

-    let tmp1 = row3 * f32x4::shuffle::<{ [2, 3, 0, 1] }>(row1, row1);
-    let tmp1 = f32x4::shuffle::<{ [1, 0, 3, 2] }>(tmp1, tmp1);
-    let row2 = f32x4::shuffle::<{ [2, 3, 0, 1] }>(row2, row2);
-    let minor0 = row2 * tmp1 + minor0;
-    let minor2 = row0 * tmp1;
-    let tmp1 = f32x4::shuffle::<{ [2, 3, 0, 1] }>(tmp1, tmp1);
-    let minor0 = minor0 - row2 * tmp1;
-    let minor2 = row0 * tmp1 - minor2;
-    let minor2 = f32x4::shuffle::<{ [2, 3, 0, 1] }>(minor2, minor2);
+    let tmp = (row3 * row1.rotate_right::<2>())
+        .reverse()
+        .rotate_right::<2>();
+    let row2 = row2.rotate_right::<2>();
+    let minor0 = row2 * tmp + minor0;
+    let minor2 = row0 * tmp;
+    let tmp = tmp.rotate_right::<2>();
+    let minor0 = minor0 - row2 * tmp;
+    let minor2 = row0 * tmp - minor2;
+    let minor2 = minor2.rotate_right::<2>();

-    let tmp1 = row0 * row1;
-    let tmp1 = f32x4::shuffle::<{ [1, 0, 3, 2] }>(tmp1, tmp1);
-    let minor2 = minor2 + row3 * tmp1;
-    let minor3 = row2 * tmp1 - minor3;
-    let tmp1 = f32x4::shuffle::<{ [2, 3, 0, 1] }>(tmp1, tmp1);
-    let minor2 = row3 * tmp1 - minor2;
-    let minor3 = minor3 - row2 * tmp1;
+    let tmp = (row0 * row1).reverse().rotate_right::<2>();
+    let minor2 = minor2 + row3 * tmp;
+    let minor3 = row2 * tmp - minor3;
+    let tmp = tmp.rotate_right::<2>();
+    let minor2 = row3 * tmp - minor2;
+    let minor3 = minor3 - row2 * tmp;

-    let tmp1 = row0 * row3;
-    let tmp1 = f32x4::shuffle::<{ [1, 0, 3, 2] }>(tmp1, tmp1);
-    let minor1 = minor1 - row2 * tmp1;
-    let minor2 = row1 * tmp1 + minor2;
-    let tmp1 = f32x4::shuffle::<{ [2, 3, 0, 1] }>(tmp1, tmp1);
-    let minor1 = row2 * tmp1 + minor1;
-    let minor2 = minor2 - row1 * tmp1;
+    let tmp = (row0 * row3).reverse().rotate_right::<2>();
+    let minor1 = minor1 - row2 * tmp;
+    let minor2 = row1 * tmp + minor2;
+    let tmp = tmp.rotate_right::<2>();
+    let minor1 = row2 * tmp + minor1;
+    let minor2 = minor2 - row1 * tmp;

-    let tmp1 = row0 * row2;
-    let tmp1 = f32x4::shuffle::<{ [1, 0, 3, 2] }>(tmp1, tmp1);
-    let minor1 = row3 * tmp1 + minor1;
-    let minor3 = minor3 - row1 * tmp1;
-    let tmp1 = f32x4::shuffle::<{ [2, 3, 0, 1] }>(tmp1, tmp1);
-    let minor1 = minor1 - row3 * tmp1;
-    let minor3 = row1 * tmp1 + minor3;
+    let tmp = (row0 * row2).reverse().rotate_right::<2>();
+    let minor1 = row3 * tmp + minor1;
+    let minor3 = minor3 - row1 * tmp;
+    let tmp = tmp.rotate_right::<2>();
+    let minor1 = minor1 - row3 * tmp;
+    let minor3 = row1 * tmp + minor3;

    let det = row0 * minor0;
-    let det = f32x4::shuffle::<{ [2, 3, 0, 1] }>(det, det) + det;
-    let det = f32x4::shuffle::<{ [1, 0, 3, 2] }>(det, det) + det;
+    let det = det.rotate_right::<2>() + det;
+    let det = det.reverse().rotate_right::<2>() + det;

    if det.horizontal_sum() == 0. {
        return None;
    }
    // calculate the reciprocal
-    let tmp1 = f32x4::splat(1.0) / det;
-    let det = tmp1 + tmp1 - det * tmp1 * tmp1;
+    let tmp = f32x4::splat(1.0) / det;
+    let det = tmp + tmp - det * tmp * tmp;

    let res0 = minor0 * det;
    let res1 = minor1 * det;
--- a/crates/core_simd/src/intrinsics.rs
+++ b/crates/core_simd/src/intrinsics.rs
@ -54,11 +54,7 @@ extern "platform-intrinsic" {
    pub(crate) fn simd_ge<T, U>(x: T, y: T) -> U;

    // shufflevector
-    pub(crate) fn simd_shuffle2<T, U>(x: T, y: T, idx: [u32; 2]) -> U;
-    pub(crate) fn simd_shuffle4<T, U>(x: T, y: T, idx: [u32; 4]) -> U;
-    pub(crate) fn simd_shuffle8<T, U>(x: T, y: T, idx: [u32; 8]) -> U;
-    pub(crate) fn simd_shuffle16<T, U>(x: T, y: T, idx: [u32; 16]) -> U;
-    pub(crate) fn simd_shuffle32<T, U>(x: T, y: T, idx: [u32; 32]) -> U;
+    pub(crate) fn simd_shuffle<T, U, V>(x: T, y: T, idx: U) -> V;

    pub(crate) fn simd_gather<T, U, V>(val: T, ptr: U, mask: V) -> T;
    pub(crate) fn simd_scatter<T, U, V>(val: T, ptr: U, mask: V);
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@ -3,6 +3,7 @@
 #![feature(
    adt_const_params,
    const_fn_trait_bound,
+    const_panic,
    platform_intrinsics,
    repr_simd,
    simd_ffi,
--- a/crates/core_simd/src/mod.rs
+++ b/crates/core_simd/src/mod.rs
@ -1,8 +1,9 @@
 #[macro_use]
-mod permute;
-#[macro_use]
 mod reduction;

+#[macro_use]
+mod swizzle;
+
 pub(crate) mod intrinsics;

 #[cfg(feature = "generic_const_exprs")]
@ -27,5 +28,6 @@ pub mod simd {
    pub use crate::core_simd::lane_count::{LaneCount, SupportedLaneCount};
    pub use crate::core_simd::masks::*;
    pub use crate::core_simd::select::Select;
+    pub use crate::core_simd::swizzle::*;
    pub use crate::core_simd::vector::*;
 }
--- a/crates/core_simd/src/permute.rs
+++ b/crates/core_simd/src/permute.rs
@ -1,154 +0,0 @@
-use crate::simd::intrinsics;
-use crate::simd::{Simd, SimdElement};
-
-macro_rules! impl_shuffle_lane {
-    { $fn:ident, $n:literal } => {
-        impl<T> Simd<T, $n>
-        where
-            T: SimdElement,
-        {
-            /// A const SIMD shuffle that takes 2 SIMD vectors and produces another vector, using
-            /// the indices in the const parameter. The first or "self" vector will have its lanes
-            /// indexed from 0, and the second vector will have its first lane indexed at $n.
-            /// Indices must be in-bounds of either vector at compile time.
-            ///
-            /// Some SIMD shuffle instructions can be quite slow, so avoiding them by loading data
-            /// into the desired patterns in advance is preferred, but shuffles are still faster
-            /// than storing and reloading from memory.
-            ///
-            /// ```
-            /// #![feature(portable_simd)]
-            /// # #[cfg(feature = "std")] use core_simd::Simd;
-            /// # #[cfg(not(feature = "std"))] use core::simd::Simd;
-            /// let a = Simd::from_array([1.0, 2.0, 3.0, 4.0]);
-            /// let b = Simd::from_array([5.0, 6.0, 7.0, 8.0]);
-            /// const IDXS: [u32; 4] = [4,0,3,7];
-            /// let c = Simd::<_, 4>::shuffle::<IDXS>(a,b);
-            /// assert_eq!(Simd::from_array([5.0, 1.0, 4.0, 8.0]), c);
-            /// ```
-            #[inline]
-            pub fn shuffle<const IDX: [u32; $n]>(self, second: Self) -> Self {
-                unsafe { intrinsics::$fn(self, second, IDX) }
-            }
-
-            /// Reverse the order of the lanes in the vector.
-            #[inline]
-            pub fn reverse(self) -> Self {
-                const fn idx() -> [u32; $n] {
-                    let mut idx = [0u32; $n];
-                    let mut i = 0;
-                    while i < $n {
-                        idx[i] = ($n - i - 1) as u32;
-                        i += 1;
-                    }
-                    idx
-                }
-                self.shuffle::<{ idx() }>(self)
-            }
-
-            /// Interleave two vectors.
-            ///
-            /// Produces two vectors with lanes taken alternately from `self` and `other`.
-            ///
-            /// The first result contains the first `LANES / 2` lanes from `self` and `other`,
-            /// alternating, starting with the first lane of `self`.
-            ///
-            /// The second result contains the last `LANES / 2` lanes from `self` and `other`,
-            /// alternating, starting with the lane `LANES / 2` from the start of `self`.
-            ///
-            /// This particular permutation is efficient on many architectures.
-            ///
-            /// ```
-            /// #![feature(portable_simd)]
-            /// # #[cfg(feature = "std")] use core_simd::Simd;
-            /// # #[cfg(not(feature = "std"))] use core::simd::Simd;
-            /// let a = Simd::from_array([0, 1, 2, 3]);
-            /// let b = Simd::from_array([4, 5, 6, 7]);
-            /// let (x, y) = a.interleave(b);
-            /// assert_eq!(x.to_array(), [0, 4, 1, 5]);
-            /// assert_eq!(y.to_array(), [2, 6, 3, 7]);
-            /// ```
-            #[inline]
-            pub fn interleave(self, other: Self) -> (Self, Self) {
-                const fn lo() -> [u32; $n] {
-                    let mut idx = [0u32; $n];
-                    let mut i = 0;
-                    while i < $n {
-                        let offset = i / 2;
-                        idx[i] = if i % 2 == 0 {
-                            offset
-                        } else {
-                            $n + offset
-                        } as u32;
-                        i += 1;
-                    }
-                    idx
-                }
-                const fn hi() -> [u32; $n] {
-                    let mut idx = [0u32; $n];
-                    let mut i = 0;
-                    while i < $n {
-                        let offset = ($n + i) / 2;
-                        idx[i] = if i % 2 == 0 {
-                            offset
-                        } else {
-                            $n + offset
-                        } as u32;
-                        i += 1;
-                    }
-                    idx
-                }
-                (self.shuffle::<{ lo() }>(other), self.shuffle::<{ hi() }>(other))
-            }
-
-            /// Deinterleave two vectors.
-            ///
-            /// The first result takes every other lane of `self` and then `other`, starting with
-            /// the first lane.
-            ///
-            /// The second result takes every other lane of `self` and then `other`, starting with
-            /// the second lane.
-            ///
-            /// This particular permutation is efficient on many architectures.
-            ///
-            /// ```
-            /// #![feature(portable_simd)]
-            /// # #[cfg(feature = "std")] use core_simd::Simd;
-            /// # #[cfg(not(feature = "std"))] use core::simd::Simd;
-            /// let a = Simd::from_array([0, 4, 1, 5]);
-            /// let b = Simd::from_array([2, 6, 3, 7]);
-            /// let (x, y) = a.deinterleave(b);
-            /// assert_eq!(x.to_array(), [0, 1, 2, 3]);
-            /// assert_eq!(y.to_array(), [4, 5, 6, 7]);
-            /// ```
-            #[inline]
-            pub fn deinterleave(self, other: Self) -> (Self, Self) {
-                const fn even() -> [u32; $n] {
-                    let mut idx = [0u32; $n];
-                    let mut i = 0;
-                    while i < $n {
-                        idx[i] = 2 * i as u32;
-                        i += 1;
-                    }
-                    idx
-                }
-                const fn odd() -> [u32; $n] {
-                    let mut idx = [0u32; $n];
-                    let mut i = 0;
-                    while i < $n {
-                        idx[i] = 1 + 2 * i as u32;
-                        i += 1;
-                    }
-                    idx
-                }
-                (self.shuffle::<{ even() }>(other), self.shuffle::<{ odd() }>(other))
-            }
-        }
-    }
-}
-
-impl_shuffle_lane! { simd_shuffle2, 2 }
-impl_shuffle_lane! { simd_shuffle4, 4 }
-impl_shuffle_lane! { simd_shuffle8, 8 }
-impl_shuffle_lane! { simd_shuffle16, 16 }
-impl_shuffle_lane! { simd_shuffle32, 32 }
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@ -0,0 +1,364 @@
+use crate::simd::intrinsics;
+use crate::{LaneCount, Simd, SimdElement, SupportedLaneCount};
+
+/// Rearrange vector elements.
+///
+/// A new vector is constructed by specifying the lanes of the source vector or vectors to use.
+///
+/// When shuffling one vector, the indices of the result vector are indicated by a `const` array
+/// of `usize`, like [`Swizzle`].
+/// When shuffling two vectors, the indices are indicated by a `const` array of [`Which`], like
+/// [`Swizzle2`].
+///
+/// # Examples
+/// ## One source vector
+/// ```
+/// # #![feature(portable_simd)]
+/// # use core_simd::{Simd, simd_shuffle};
+/// let v = Simd::<f32, 4>::from_array([0., 1., 2., 3.]);
+/// let v = simd_shuffle!(v, [3, 0, 1, 2]);
+/// assert_eq!(v.to_array(), [3., 0., 1., 2.]);
+/// ```
+///
+/// ## Two source vectors
+/// ```
+/// # #![feature(portable_simd)]
+/// # use core_simd::{Simd, simd_shuffle, Which};
+/// use Which::*;
+/// let a = Simd::<f32, 4>::from_array([0., 1., 2., 3.]);
+/// let b = Simd::<f32, 4>::from_array([4., 5., 6., 7.]);
+/// let v = simd_shuffle!(a, b, [First(0), First(1), Second(2), Second(3)]);
+/// assert_eq!(v.to_array(), [0., 1., 6., 7.]);
+/// ```
+#[macro_export]
+macro_rules! simd_shuffle {
+    {
+        $vector:expr, $index:expr $(,)?
+    } => {
+        {
+            // FIXME this won't work when we are in `core`!
+            use $crate::Swizzle;
+            struct Shuffle;
+            impl Swizzle<{$index.len()}, {$index.len()}> for Shuffle {
+                const INDEX: [usize; {$index.len()}] = $index;
+            }
+            Shuffle::swizzle($vector)
+        }
+    };
+    {
+        $first:expr, $second:expr, $index:expr $(,)?
+    } => {
+        {
+            // FIXME this won't work when we are in `core`!
+            use $crate::{Which, Swizzle2};
+            struct Shuffle;
+            impl Swizzle2<{$index.len()}, {$index.len()}> for Shuffle {
+                const INDEX: [Which; {$index.len()}] = $index;
+            }
+            Shuffle::swizzle2($first, $second)
+        }
+    }
+}
+
+/// An index into one of two vectors.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum Which {
+    /// Indexes the first vector.
+    First(usize),
+    /// Indexes the second vector.
+    Second(usize),
+}
+
+/// Create a vector from the elements of another vector.
+pub trait Swizzle<const INPUT_LANES: usize, const OUTPUT_LANES: usize> {
+    /// Map from the lanes of the input vector to the output vector.
+    const INDEX: [usize; OUTPUT_LANES];
+
+    /// Create a new vector from the lanes of `vector`.
+    ///
+    /// Lane `i` of the output is `vector[Self::INDEX[i]]`.
+    fn swizzle<T>(vector: Simd<T, INPUT_LANES>) -> Simd<T, OUTPUT_LANES>
+    where
+        T: SimdElement,
+        LaneCount<INPUT_LANES>: SupportedLaneCount,
+        LaneCount<OUTPUT_LANES>: SupportedLaneCount,
+    {
+        unsafe { intrinsics::simd_shuffle(vector, vector, Self::INDEX_IMPL) }
+    }
+}
+
+/// Create a vector from the elements of two other vectors.
+pub trait Swizzle2<const INPUT_LANES: usize, const OUTPUT_LANES: usize> {
+    /// Map from the lanes of the input vectors to the output vector.
+    const INDEX: [Which; OUTPUT_LANES];
+
+    /// Create a new vector from the lanes of `first` and `second`.
+    ///
+    /// Lane `i` is `first[j]` when `Self::INDEX[i]` is `First(j)`, or `second[j]` when it is
+    /// `Second(j)`.
+    fn swizzle2<T>(
+        first: Simd<T, INPUT_LANES>,
+        second: Simd<T, INPUT_LANES>,
+    ) -> Simd<T, OUTPUT_LANES>
+    where
+        T: SimdElement,
+        LaneCount<INPUT_LANES>: SupportedLaneCount,
+        LaneCount<OUTPUT_LANES>: SupportedLaneCount,
+    {
+        unsafe { intrinsics::simd_shuffle(first, second, Self::INDEX_IMPL) }
+    }
+}
+
+/// The `simd_shuffle` intrinsic expects `u32`, so do error checking and conversion here.
+trait SwizzleImpl<const INPUT_LANES: usize, const OUTPUT_LANES: usize> {
+    const INDEX_IMPL: [u32; OUTPUT_LANES];
+}
+
+impl<T, const INPUT_LANES: usize, const OUTPUT_LANES: usize> SwizzleImpl<INPUT_LANES, OUTPUT_LANES>
+    for T
+where
+    T: Swizzle<INPUT_LANES, OUTPUT_LANES> + ?Sized,
+{
+    const INDEX_IMPL: [u32; OUTPUT_LANES] = {
+        let mut output = [0; OUTPUT_LANES];
+        let mut i = 0;
+        while i < OUTPUT_LANES {
+            let index = Self::INDEX[i];
+            assert!(index as u32 as usize == index);
+            assert!(index < INPUT_LANES, "source lane exceeds input lane count",);
+            output[i] = index as u32;
+            i += 1;
+        }
+        output
+    };
+}
+
+/// The `simd_shuffle` intrinsic expects `u32`, so do error checking and conversion here.
+trait Swizzle2Impl<const INPUT_LANES: usize, const OUTPUT_LANES: usize> {
+    const INDEX_IMPL: [u32; OUTPUT_LANES];
+}
+
+impl<T, const INPUT_LANES: usize, const OUTPUT_LANES: usize> Swizzle2Impl<INPUT_LANES, OUTPUT_LANES>
+    for T
+where
+    T: Swizzle2<INPUT_LANES, OUTPUT_LANES> + ?Sized,
+{
+    const INDEX_IMPL: [u32; OUTPUT_LANES] = {
+        let mut output = [0; OUTPUT_LANES];
+        let mut i = 0;
+        while i < OUTPUT_LANES {
+            let (offset, index) = match Self::INDEX[i] {
+                Which::First(index) => (false, index),
+                Which::Second(index) => (true, index),
+            };
+            assert!(index < INPUT_LANES, "source lane exceeds input lane count",);
+
+            // lanes are indexed by the first vector, then second vector
+            let index = if offset { index + INPUT_LANES } else { index };
+            assert!(index as u32 as usize == index);
+            output[i] = index as u32;
+            i += 1;
+        }
+        output
+    };
+}
+
+impl<T, const LANES: usize> Simd<T, LANES>
+where
+    T: SimdElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    /// Reverse the order of the lanes in the vector.
+    #[inline]
+    pub fn reverse(self) -> Self {
+        const fn reverse_index<const LANES: usize>() -> [usize; LANES] {
+            let mut index = [0; LANES];
+            let mut i = 0;
+            while i < LANES {
+                index[i] = LANES - i - 1;
+                i += 1;
+            }
+            index
+        }
+
+        struct Reverse;
+
+        impl<const LANES: usize> Swizzle<LANES, LANES> for Reverse {
+            const INDEX: [usize; LANES] = reverse_index::<LANES>();
+        }
+
+        Reverse::swizzle(self)
+    }
+
+    /// Rotates the vector such that the first `OFFSET` elements of the vector move to the end
+    /// while the last `LANES - OFFSET` elements move to the front. After calling `rotate_left`, the
+    /// element previously in lane `OFFSET` will become the first element in the vector.
+    #[inline]
+    pub fn rotate_left<const OFFSET: usize>(self) -> Self {
+        const fn rotate_index<const OFFSET: usize, const LANES: usize>() -> [usize; LANES] {
+            let offset = OFFSET % LANES;
+            let mut index = [0; LANES];
+            let mut i = 0;
+            while i < LANES {
+                index[i] = (i + offset) % LANES;
+                i += 1;
+            }
+            index
+        }
+
+        struct Rotate<const OFFSET: usize>;
+
+        impl<const OFFSET: usize, const LANES: usize> Swizzle<LANES, LANES> for Rotate<OFFSET> {
+            const INDEX: [usize; LANES] = rotate_index::<OFFSET, LANES>();
+        }
+
+        Rotate::<OFFSET>::swizzle(self)
+    }
+
+    /// Rotates the vector such that the first `LANES - OFFSET` elements of the vector move to
+    /// the end while the last `OFFSET` elements move to the front. After calling `rotate_right`, the
+    /// element previously in lane `LANES - OFFSET` will become the first element in the vector.
+    #[inline]
+    pub fn rotate_right<const OFFSET: usize>(self) -> Self {
+        const fn rotate_index<const OFFSET: usize, const LANES: usize>() -> [usize; LANES] {
+            let offset = LANES - OFFSET % LANES;
+            let mut index = [0; LANES];
+            let mut i = 0;
+            while i < LANES {
+                index[i] = (i + offset) % LANES;
+                i += 1;
+            }
+            index
+        }
+
+        struct Rotate<const OFFSET: usize>;
+
+        impl<const OFFSET: usize, const LANES: usize> Swizzle<LANES, LANES> for Rotate<OFFSET> {
+            const INDEX: [usize; LANES] = rotate_index::<OFFSET, LANES>();
+        }
+
+        Rotate::<OFFSET>::swizzle(self)
+    }
+
+    /// Interleave two vectors.
+    ///
+    /// Produces two vectors with lanes taken alternately from `self` and `other`.
+    ///
+    /// The first result contains the first `LANES / 2` lanes from `self` and `other`,
+    /// alternating, starting with the first lane of `self`.
+    ///
+    /// The second result contains the last `LANES / 2` lanes from `self` and `other`,
+    /// alternating, starting with the lane `LANES / 2` from the start of `self`.
+    ///
+    /// This particular permutation is efficient on many architectures.
+    ///
+    /// ```
+    /// #![feature(portable_simd)]
+    /// # use core_simd::Simd;
+    /// let a = Simd::from_array([0, 1, 2, 3]);
+    /// let b = Simd::from_array([4, 5, 6, 7]);
+    /// let (x, y) = a.interleave(b);
+    /// assert_eq!(x.to_array(), [0, 4, 1, 5]);
+    /// assert_eq!(y.to_array(), [2, 6, 3, 7]);
+    /// ```
+    #[inline]
+    pub fn interleave(self, other: Self) -> (Self, Self) {
+        const fn lo<const LANES: usize>() -> [Which; LANES] {
+            let mut idx = [Which::First(0); LANES];
+            let mut i = 0;
+            while i < LANES {
+                let offset = i / 2;
+                idx[i] = if i % 2 == 0 {
+                    Which::First(offset)
+                } else {
+                    Which::Second(offset)
+                };
+                i += 1;
+            }
+            idx
+        }
+        const fn hi<const LANES: usize>() -> [Which; LANES] {
+            let mut idx = [Which::First(0); LANES];
+            let mut i = 0;
+            while i < LANES {
+                let offset = (LANES + i) / 2;
+                idx[i] = if i % 2 == 0 {
+                    Which::First(offset)
+                } else {
+                    Which::Second(offset)
+                };
+                i += 1;
+            }
+            idx
+        }
+
+        struct Lo;
+        struct Hi;
+
+        impl<const LANES: usize> Swizzle2<LANES, LANES> for Lo {
+            const INDEX: [Which; LANES] = lo::<LANES>();
+        }
+
+        impl<const LANES: usize> Swizzle2<LANES, LANES> for Hi {
+            const INDEX: [Which; LANES] = hi::<LANES>();
+        }
+
+        (Lo::swizzle2(self, other), Hi::swizzle2(self, other))
+    }
+
+    /// Deinterleave two vectors.
+    ///
+    /// The first result takes every other lane of `self` and then `other`, starting with
+    /// the first lane.
+    ///
+    /// The second result takes every other lane of `self` and then `other`, starting with
+    /// the second lane.
+    ///
+    /// This particular permutation is efficient on many architectures.
+    ///
+    /// ```
+    /// #![feature(portable_simd)]
+    /// # use core_simd::Simd;
+    /// let a = Simd::from_array([0, 4, 1, 5]);
+    /// let b = Simd::from_array([2, 6, 3, 7]);
+    /// let (x, y) = a.deinterleave(b);
+    /// assert_eq!(x.to_array(), [0, 1, 2, 3]);
+    /// assert_eq!(y.to_array(), [4, 5, 6, 7]);
+    /// ```
+    #[inline]
+    pub fn deinterleave(self, other: Self) -> (Self, Self) {
+        const fn even<const LANES: usize>() -> [Which; LANES] {
+            let mut idx = [Which::First(0); LANES];
+            let mut i = 0;
+            while i < LANES / 2 {
+                idx[i] = Which::First(2 * i);
+                idx[i + LANES / 2] = Which::Second(2 * i);
+                i += 1;
+            }
+            idx
+        }
+        const fn odd<const LANES: usize>() -> [Which; LANES] {
+            let mut idx = [Which::First(0); LANES];
+            let mut i = 0;
+            while i < LANES / 2 {
+                idx[i] = Which::First(2 * i + 1);
+                idx[i + LANES / 2] = Which::Second(2 * i + 1);
+                i += 1;
+            }
+            idx
+        }
+
+        struct Even;
+        struct Odd;
+
+        impl<const LANES: usize> Swizzle2<LANES, LANES> for Even {
+            const INDEX: [Which; LANES] = even::<LANES>();
+        }
+
+        impl<const LANES: usize> Swizzle2<LANES, LANES> for Odd {
+            const INDEX: [Which; LANES] = odd::<LANES>();
+        }
+
+        (Even::swizzle2(self, other), Odd::swizzle2(self, other))
+    }
+}
--- a/crates/core_simd/tests/permute.rs
+++ b/crates/core_simd/tests/permute.rs
@ -1,37 +0,0 @@
-#![feature(portable_simd)]
-
-use core_simd::Simd;
-
-#[cfg(target_arch = "wasm32")]
-use wasm_bindgen_test::*;
-
-#[cfg(target_arch = "wasm32")]
-wasm_bindgen_test_configure!(run_in_browser);
-
-#[test]
-#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
-fn simple_shuffle() {
-    let a = Simd::from_array([2, 4, 1, 9]);
-    let b = a;
-    assert_eq!(a.shuffle::<{ [3, 1, 4, 6] }>(b).to_array(), [9, 4, 2, 1]);
-}
-
-#[test]
-#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
-fn reverse() {
-    let a = Simd::from_array([0, 1, 2, 3, 4, 5, 6, 7]);
-    assert_eq!(a.reverse().to_array(), [7, 6, 5, 4, 3, 2, 1, 0]);
-}
-
-#[test]
-#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
-fn interleave() {
-    let a = Simd::from_array([0, 1, 2, 3, 4, 5, 6, 7]);
-    let b = Simd::from_array([8, 9, 10, 11, 12, 13, 14, 15]);
-    let (lo, hi) = a.interleave(b);
-    assert_eq!(lo.to_array(), [0, 8, 1, 9, 2, 10, 3, 11]);
-    assert_eq!(hi.to_array(), [4, 12, 5, 13, 6, 14, 7, 15]);
-    let (even, odd) = lo.deinterleave(hi);
-    assert_eq!(even, a);
-    assert_eq!(odd, b);
-}
--- a/crates/core_simd/tests/swizzle.rs
+++ b/crates/core_simd/tests/swizzle.rs
@ -0,0 +1,62 @@
+#![feature(portable_simd)]
+use core_simd::{Simd, Swizzle};
+
+#[cfg(target_arch = "wasm32")]
+use wasm_bindgen_test::*;
+
+#[cfg(target_arch = "wasm32")]
+wasm_bindgen_test_configure!(run_in_browser);
+
+#[test]
+#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+fn swizzle() {
+    struct Index;
+    impl Swizzle<4, 4> for Index {
+        const INDEX: [usize; 4] = [2, 1, 3, 0];
+    }
+    impl Swizzle<4, 2> for Index {
+        const INDEX: [usize; 2] = [1, 1];
+    }
+
+    let vector = Simd::from_array([2, 4, 1, 9]);
+    assert_eq!(Index::swizzle(vector).to_array(), [1, 4, 9, 2]);
+    assert_eq!(Index::swizzle(vector).to_array(), [4, 4]);
+}
+
+#[test]
+#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+fn reverse() {
+    let a = Simd::from_array([1, 2, 3, 4]);
+    assert_eq!(a.reverse().to_array(), [4, 3, 2, 1]);
+}
+
+#[test]
+#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+fn rotate() {
+    let a = Simd::from_array([1, 2, 3, 4]);
+    assert_eq!(a.rotate_left::<0>().to_array(), [1, 2, 3, 4]);
+    assert_eq!(a.rotate_left::<1>().to_array(), [2, 3, 4, 1]);
+    assert_eq!(a.rotate_left::<2>().to_array(), [3, 4, 1, 2]);
+    assert_eq!(a.rotate_left::<3>().to_array(), [4, 1, 2, 3]);
+    assert_eq!(a.rotate_left::<4>().to_array(), [1, 2, 3, 4]);
+    assert_eq!(a.rotate_left::<5>().to_array(), [2, 3, 4, 1]);
+    assert_eq!(a.rotate_right::<0>().to_array(), [1, 2, 3, 4]);
+    assert_eq!(a.rotate_right::<1>().to_array(), [4, 1, 2, 3]);
+    assert_eq!(a.rotate_right::<2>().to_array(), [3, 4, 1, 2]);
+    assert_eq!(a.rotate_right::<3>().to_array(), [2, 3, 4, 1]);
+    assert_eq!(a.rotate_right::<4>().to_array(), [1, 2, 3, 4]);
+    assert_eq!(a.rotate_right::<5>().to_array(), [4, 1, 2, 3]);
+}
+
+#[test]
+#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+fn interleave() {
+    let a = Simd::from_array([0, 1, 2, 3, 4, 5, 6, 7]);
+    let b = Simd::from_array([8, 9, 10, 11, 12, 13, 14, 15]);
+    let (lo, hi) = a.interleave(b);
+    assert_eq!(lo.to_array(), [0, 8, 1, 9, 2, 10, 3, 11]);
+    assert_eq!(hi.to_array(), [4, 12, 5, 13, 6, 14, 7, 15]);
+    let (even, odd) = lo.deinterleave(hi);
+    assert_eq!(even, a);
+    assert_eq!(odd, b);
+}