Rollup merge of #130991 - LaihoE:vectorized_slice_contains, r=Noratrieb

Vectorized SliceContains Godbolt for the u32 case: https://rust.godbolt.org/z/exT9xYWGs Unsure about: - Should align_to be used? It didn't seem to matter in my benchmark but maybe I was lucky with alignment? - Should u8/i8 also be implemented? Currently uses memchr (SWAR) Some benchmarks on x86 (contains called on an array with no matches, worst case may be slightly worse): ## Large N ![large_n_contains](https://github.com/user-attachments/assets/5be79072-970b-44be-a56c-16dc677dee46) ## Small N ![small_n_contains](https://github.com/user-attachments/assets/b8a33790-c176-459f-84f4-05feee893cd0)
2024-10-23 22:11:02 +02:00 · 2024-10-23 22:11:02 +02:00 · af2c7dffda
commit af2c7dffda
parent be01dabfef c11bfd828e
1 changed files with 26 additions and 0 deletions
--- a/library/core/src/slice/cmp.rs
+++ b/library/core/src/slice/cmp.rs
@ -257,3 +257,29 @@ fn slice_contains(&self, x: &[Self]) -> bool {
        memchr::memchr(byte, bytes).is_some()
    }
 }
 macro_rules! impl_slice_contains {
    ($($t:ty),*) => {
        $(
            impl SliceContains for $t {
                #[inline]
                fn slice_contains(&self, arr: &[$t]) -> bool {
                    // Make our LANE_COUNT 4x the normal lane count (aiming for 128 bit vectors).
                    // The compiler will nicely unroll it.
                    const LANE_COUNT: usize = 4 * (128 / (mem::size_of::<$t>() * 8));
                    // SIMD
                    let mut chunks = arr.chunks_exact(LANE_COUNT);
                    for chunk in &mut chunks {
                        if chunk.iter().fold(false, |acc, x| acc | (*x == *self)) {
                            return true;
                        }
                    }
                    // Scalar remainder
                    return chunks.remainder().iter().any(|x| *x == *self);
                }
            }
        )*
    };
 }
 impl_slice_contains!(u16, u32, u64, i16, i32, i64, f32, f64, usize, isize);