use crate::simd::{LaneCount, Simd, SupportedLaneCount};
use core::mem;

impl<const N: usize> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    /// Swizzle a vector of bytes according to the index vector.
    /// Indices within range select the corresponding byte.
    /// Indices "out of bounds" instead select 0.
    ///
    /// Note that the current implementation is selected at build time
    /// of the standard library, so `cargo build -Zbuild-std` may be necessary
    /// to unlock better performance, especially for larger vectors.
    /// A planned compiler improvement will enable using `#[target_feature]` instead.
    #[inline]
    pub fn swizzle_dyn(self, idxs: Simd<u8, N>) -> Self {
        #![allow(unused_imports, unused_unsafe)]
        #[cfg(target_arch = "aarch64")]
        use core::arch::aarch64::{uint8x8_t, vqtbl1q_u8, vtbl1_u8};
        #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
        use core::arch::arm::{uint8x8_t, vtbl1_u8};
        #[cfg(target_arch = "wasm32")]
        use core::arch::wasm32 as wasm;
        #[cfg(target_arch = "x86")]
        use core::arch::x86;
        #[cfg(target_arch = "x86_64")]
        use core::arch::x86_64 as x86;
        // SAFETY: Intrinsics covered by cfg
        unsafe {
            match N {
                #[cfg(target_feature = "neon")]
                8 => transize(vtbl1_u8, self, idxs),
                #[cfg(target_feature = "ssse3")]
                16 => transize(x86::_mm_shuffle_epi8, self, idxs),
                #[cfg(target_feature = "simd128")]
                16 => transize(wasm::i8x16_swizzle, self, idxs),
                #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
                16 => transize(vqtbl1q_u8, self, idxs),
                #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
                32 => transize_raw(avx2_pshufb, self, idxs),
                #[cfg(target_feature = "avx512vl,avx512vbmi")]
                32 => transize(x86::_mm256_permutexvar_epi8, self, idxs),
                // Notable absence: avx512bw shuffle
                // If avx512bw is available, odds of avx512vbmi are good
                // FIXME: initial AVX512VBMI variant didn't actually pass muster
                // #[cfg(target_feature = "avx512vbmi")]
                // 64 => transize(x86::_mm512_permutexvar_epi8, self, idxs),
                _ => {
                    let mut array = [0; N];
                    for (i, k) in idxs.to_array().into_iter().enumerate() {
                        if (k as usize) < N {
                            array[i] = self[k as usize];
                        };
                    }
                    array.into()
                }
            }
        }
    }
}

/// "vpshufb like it was meant to be" on AVX2
///
/// # Safety
/// This requires AVX2 to work
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
#[allow(unused)]
#[inline]
#[allow(clippy::let_and_return)]
unsafe fn avx2_pshufb(bytes: Simd<u8, 32>, idxs: Simd<u8, 32>) -> Simd<u8, 32> {
    use crate::simd::SimdPartialOrd;
    #[cfg(target_arch = "x86")]
    use core::arch::x86;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64 as x86;
    use x86::_mm256_permute2x128_si256 as avx2_cross_shuffle;
    use x86::_mm256_shuffle_epi8 as avx2_half_pshufb;
    let mid = Simd::splat(16u8);
    let high = mid + mid;
    // SAFETY: Caller promised AVX2
    unsafe {
        // This is ordering sensitive, and LLVM will order these how you put them.
        // Most AVX2 impls use ~5 "ports", and only 1 or 2 are capable of permutes.
        // But the "compose" step will lower to ops that can also use at least 1 other port.
        // So this tries to break up permutes so composition flows through "open" ports.
        // Comparative benches should be done on multiple AVX2 CPUs before reordering this
        let hihi = avx2_cross_shuffle::<0x11>(bytes.into(), bytes.into());
        let hi_shuf = Simd::from(avx2_half_pshufb(
            hihi,        // duplicate the vector's top half
            idxs.into(), // so that using only 4 bits of an index still picks bytes 16-31
        ));
        // A zero-fill during the compose step gives the "all-Neon-like" OOB-is-0 semantics
        let compose = idxs.simd_lt(high).select(hi_shuf, Simd::splat(0));
        let lolo = avx2_cross_shuffle::<0x00>(bytes.into(), bytes.into());
        let lo_shuf = Simd::from(avx2_half_pshufb(lolo, idxs.into()));
        // Repeat, then pick indices < 16, overwriting indices 0-15 from the previous compose step
        let compose = idxs.simd_lt(mid).select(lo_shuf, compose);
        compose
    }
}

/// This sets up a call to an architecture-specific function, and in doing so
/// it persuades rustc that everything is the correct size. Which it is.
/// This would not be needed if one could convince Rust that, by matching on N,
/// N is that value, and thus it would be valid to substitute e.g. 16.
///
/// # Safety
/// The correctness of this function hinges on the sizes agreeing in actuality.
#[allow(dead_code)]
#[inline(always)]
unsafe fn transize<T, const N: usize>(
    f: unsafe fn(T, T) -> T,
    bytes: Simd<u8, N>,
    idxs: Simd<u8, N>,
) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    let idxs = zeroing_idxs(idxs);
    // SAFETY: Same obligation to use this function as to use mem::transmute_copy.
    unsafe { mem::transmute_copy(&f(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs))) }
}

/// Make indices that yield 0 for this architecture
#[inline(always)]
fn zeroing_idxs<const N: usize>(idxs: Simd<u8, N>) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    // On x86, make sure the top bit is set: `pshufb` zeroes a lane whose index has its top bit set.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    let idxs = {
        use crate::simd::SimdPartialOrd;
        idxs.simd_lt(Simd::splat(N as u8))
            .select(idxs, Simd::splat(u8::MAX))
    };
    // Simply do nothing on most architectures.
    idxs
}

/// As `transize`, but without the implicit call to `zeroing_idxs`.
#[allow(dead_code)]
#[inline(always)]
unsafe fn transize_raw<T, const N: usize>(
    f: unsafe fn(T, T) -> T,
    bytes: Simd<u8, N>,
    idxs: Simd<u8, N>,
) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    // SAFETY: Same obligation to use this function as to use mem::transmute_copy.
    unsafe { mem::transmute_copy(&f(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs))) }
}
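
// What follows is an illustrative sketch, not part of the original file: a unit
// test that checks the documented contract of `swizzle_dyn` (in-range indices
// select the corresponding byte, out-of-range indices yield 0) against a scalar
// reference model mirroring the `_ =>` fallback arm above. It assumes it can be
// compiled as a `#[cfg(test)]` module of this crate; the module, function, and
// test names are hypothetical.
#[cfg(test)]
mod swizzle_dyn_contract_sketch {
    use crate::simd::Simd;

    /// Scalar model of the documented semantics.
    fn swizzle_dyn_scalar<const N: usize>(bytes: [u8; N], idxs: [u8; N]) -> [u8; N] {
        let mut out = [0; N];
        for (o, &k) in out.iter_mut().zip(idxs.iter()) {
            if (k as usize) < N {
                *o = bytes[k as usize];
            }
        }
        out
    }

    #[test]
    fn matches_scalar_model_for_16_lanes() {
        let bytes: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
        // A mix of in-range, reversed, repeated, and out-of-range indices.
        let idxs: [u8; 16] = [0, 15, 3, 3, 7, 200, 16, 1, 2, 255, 9, 10, 128, 12, 31, 14];
        let v = Simd::from_array(bytes).swizzle_dyn(Simd::from_array(idxs));
        assert_eq!(v.to_array(), swizzle_dyn_scalar(bytes, idxs));
    }
}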
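
// A second illustrative sketch, also not part of the original file: a portable
// model of the two-half "compose" trick used in `avx2_pshufb`. AVX2's `vpshufb`
// shuffles each 128-bit half independently, using only the low 4 bits of each
// index and zeroing a lane whose index has its top bit set; duplicating a half
// across both lanes and then selecting by index range reconstructs a full
// 32-byte table lookup with OOB-is-0 semantics. All names here are hypothetical,
// and the test assumes it can be compiled as a `#[cfg(test)]` module of this crate.
#[cfg(test)]
mod avx2_compose_sketch {
    use crate::simd::{Simd, SimdPartialOrd};

    /// Scalar model of `_mm256_shuffle_epi8`: per 128-bit half and per lane,
    /// the result is 0 if the index's top bit is set, else `half[idx & 0xF]`.
    fn half_pshufb_model(bytes: [u8; 32], idxs: [u8; 32]) -> [u8; 32] {
        let mut out = [0u8; 32];
        for i in 0..32 {
            let half = (i / 16) * 16;
            let k = idxs[i];
            if k & 0x80 == 0 {
                out[i] = bytes[half + (k & 0x0F) as usize];
            }
        }
        out
    }

    #[test]
    fn compose_matches_swizzle_dyn() {
        let mut bytes = [0u8; 32];
        let mut idxs = [0u8; 32];
        for i in 0..32 {
            bytes[i] = i as u8 + 1;
            // Pseudo-random mix of in-range (both halves) and out-of-range indices.
            idxs[i] = (i as u8).wrapping_mul(19);
        }
        // Duplicate each 128-bit half across both lanes, like
        // `avx2_cross_shuffle::<0x11>` and `avx2_cross_shuffle::<0x00>` above.
        let mut hihi = [0u8; 32];
        let mut lolo = [0u8; 32];
        hihi[..16].copy_from_slice(&bytes[16..]);
        hihi[16..].copy_from_slice(&bytes[16..]);
        lolo[..16].copy_from_slice(&bytes[..16]);
        lolo[16..].copy_from_slice(&bytes[..16]);
        let hi_shuf = Simd::from_array(half_pshufb_model(hihi, idxs));
        let lo_shuf = Simd::from_array(half_pshufb_model(lolo, idxs));
        let idxs_v = Simd::from_array(idxs);
        // Indices >= 32 become 0, indices 16..32 pick from the high half...
        let compose = idxs_v.simd_lt(Simd::splat(32)).select(hi_shuf, Simd::splat(0));
        // ...and indices < 16 overwrite that with picks from the low half.
        let compose = idxs_v.simd_lt(Simd::splat(16)).select(lo_shuf, compose);
        assert_eq!(
            compose.to_array(),
            Simd::from_array(bytes).swizzle_dyn(idxs_v).to_array()
        );
    }
}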