Merge pull request #1378 from bjorn3/more_vendor_intrinsics

Implement all vendor intrinsics used by regex on AVX2 systems
bjorn3 2023-06-05 20:42:09 +02:00 committed by GitHub
commit 204c64bda1
5 changed files with 243 additions and 8 deletions
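
For context, here is a minimal sketch (not part of this commit; the helper name find_byte_avx2 is hypothetical) of the style of AVX2 scanning the regex crate performs, built on _mm256_cmpeq_epi8 and _mm256_movemask_epi8:

#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// Return the index of the first `needle` byte in a 32-byte chunk, or None.
/// The caller must ensure AVX2 is available (e.g. via is_x86_feature_detected!).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn find_byte_avx2(chunk: &[u8; 32], needle: u8) -> Option<usize> {
    // Load 32 bytes and compare every lane against the needle at once.
    let v = _mm256_loadu_si256(chunk.as_ptr() as *const __m256i);
    let eq = _mm256_cmpeq_epi8(v, _mm256_set1_epi8(needle as i8));
    // Collapse the 32 lane-wise comparison results into one bit each.
    let mask = _mm256_movemask_epi8(eq) as u32;
    if mask == 0 { None } else { Some(mask.trailing_zeros() as usize) }
}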

View File

@@ -93,12 +93,6 @@ jobs:
- name: Prepare dependencies
run: ./y.rs prepare
- name: Build without unstable features
env:
TARGET_TRIPLE: ${{ matrix.env.TARGET_TRIPLE }}
# This is the config rust-lang/rust uses for builds
run: ./y.rs build --no-unstable-features
- name: Build
run: ./y.rs build --sysroot none
@@ -107,6 +101,18 @@ jobs:
TARGET_TRIPLE: ${{ matrix.env.TARGET_TRIPLE }}
run: ./y.rs test
- name: Install LLVM standard library
run: rustup target add ${{ matrix.env.TARGET_TRIPLE }}
# This is roughly the config rust-lang/rust uses for testing
- name: Test with LLVM sysroot
# Skip native x86_64-pc-windows-gnu. It is way too slow, and the cross-compiled
# x86_64-pc-windows-gnu job covers at least part of the tests.
if: matrix.os != 'windows-latest' || matrix.env.TARGET_TRIPLE != 'x86_64-pc-windows-gnu'
env:
TARGET_TRIPLE: ${{ matrix.env.TARGET_TRIPLE }}
run: ./y.rs test --sysroot llvm --no-unstable-features
# This job doesn't use cg_clif in any way. It checks that all cg_clif tests work with cg_llvm too.
test_llvm:

View File

@@ -1,4 +1,4 @@
-#![feature(start, core_intrinsics, alloc_error_handler)]
+#![feature(start, core_intrinsics, alloc_error_handler, lang_items)]
#![no_std]
extern crate alloc;
@@ -27,6 +27,11 @@ fn alloc_error_handler(_: alloc::alloc::Layout) -> ! {
core::intrinsics::abort();
}
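// Presumably needed so this no_std example also links against the LLVM sysroot
// (see the CI change above); a diverging stub is fine since the example never unwinds.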
#[lang = "eh_personality"]
fn eh_personality() -> ! {
loop {}
}
#[start]
fn main(_argc: isize, _argv: *const *const u8) -> isize {
let world: Box<&str> = Box::new("Hello World!\0");

View File

@@ -198,6 +198,9 @@ unsafe fn test_simd() {
test_mm_extract_epi8();
test_mm_insert_epi16();
test_mm256_shuffle_epi8();
test_mm256_permute2x128_si256();
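// Only the first byte below has its high bit set, so the movemask is 0b1.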
#[rustfmt::skip]
let mask1 = _mm_movemask_epi8(dbg!(_mm_setr_epi8(255u8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)));
assert_eq!(mask1, 1);
@@ -293,6 +296,12 @@ pub unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
}
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
pub unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) {
assert_eq!(std::mem::transmute::<_, [u64; 4]>(a), std::mem::transmute::<_, [u64; 4]>(b))
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvtsi128_si64() {
@@ -336,6 +345,44 @@ unsafe fn test_mm_insert_epi16() {
assert_eq_m128i(r, e);
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn test_mm256_shuffle_epi8() {
#[rustfmt::skip]
let a = _mm256_setr_epi8(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32,
);
#[rustfmt::skip]
let b = _mm256_setr_epi8(
4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
12, 5, 5, 10, 4, 1, 8, 0,
4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
12, 5, 5, 10, 4, 1, 8, 0,
);
#[rustfmt::skip]
let expected = _mm256_setr_epi8(
5, 0, 5, 4, 9, 13, 7, 4,
13, 6, 6, 11, 5, 2, 9, 1,
21, 0, 21, 20, 25, 29, 23, 20,
29, 22, 22, 27, 21, 18, 25, 17,
);
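// Each result byte is a[b & 0x0f] taken from the same 128-bit half;
// control bytes with bit 7 set (the 128u8 lanes) produce 0.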
let r = _mm256_shuffle_epi8(a, b);
assert_eq_m256i(r, expected);
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn test_mm256_permute2x128_si256() {
let a = _mm256_setr_epi64x(100, 200, 500, 600);
let b = _mm256_setr_epi64x(300, 400, 700, 800);
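// Control 0b00_01_00_11: the low nibble (3) selects b's high half (700, 800),
// the high nibble (1) selects a's high half (500, 600).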
let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b);
let e = _mm256_setr_epi64x(700, 800, 500, 600);
assert_eq_m256i(r, e);
}
fn test_checked_mul() {
let u: Option<u8> = u8::from_str_radix("1000", 10).ok();
assert_eq!(u, None);

View File

@@ -110,7 +110,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.sse2.psrli.d imm8 not const");
.expect("llvm.x86.sse2.pslli.d imm8 not const");
simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
@@ -120,6 +120,162 @@
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx.psrli.d" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.avx.psrli.d imm8 not const");
simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
.unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
{
imm8 if imm8 < 32 => fx.bcx.ins().ushr_imm(lane, i64::from(imm8 as u8)),
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx.pslli.d" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.avx.pslli.d imm8 not const");
simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
.unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
{
imm8 if imm8 < 32 => fx.bcx.ins().ishl_imm(lane, i64::from(imm8 as u8)),
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx2.psrli.w" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.avx.psrli.w imm8 not const");
simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
.unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
{
imm8 if imm8 < 16 => fx.bcx.ins().ushr_imm(lane, i64::from(imm8 as u8)),
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx2.pslli.w" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.avx.pslli.w imm8 not const");
simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
.unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
{
imm8 if imm8 < 16 => fx.bcx.ins().ishl_imm(lane, i64::from(imm8 as u8)),
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx2.pshuf.b" => {
let (a, b) = match args {
[a, b] => (a, b),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let b = codegen_operand(fx, b);
// Based on the pseudocode at https://github.com/rust-lang/stdarch/blob/1cfbca8b38fd9b4282b2f054f61c6ca69fc7ce29/crates/core_arch/src/x86/avx2.rs#L2319-L2332
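// Bit 7 of each control byte in b zeroes the corresponding output lane;
// otherwise its low 4 bits index a byte within the same 128-bit half of a.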
let zero = fx.bcx.ins().iconst(types::I8, 0);
for i in 0..16 {
let b_lane = b.value_lane(fx, i).load_scalar(fx);
let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80);
let a_idx = fx.bcx.ins().band_imm(b_lane, 0xf);
let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx);
let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx);
let res = fx.bcx.ins().select(is_zero, zero, a_lane);
ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
}
for i in 16..32 {
let b_lane = b.value_lane(fx, i).load_scalar(fx);
let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80);
let b_lane_masked = fx.bcx.ins().band_imm(b_lane, 0xf);
let a_idx = fx.bcx.ins().iadd_imm(b_lane_masked, 16);
let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx);
let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx);
let res = fx.bcx.ins().select(is_zero, zero, a_lane);
ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
}
}
"llvm.x86.avx2.vperm2i128" => {
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256
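// Each nibble of imm8 picks one 128-bit half of the result: bit 1 chooses
// between a and b, bit 0 between the low and high half, and bit 3 forces zero.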
let (a, b, imm8) = match args {
[a, b, imm8] => (a, b, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let b = codegen_operand(fx, b);
let imm8 = codegen_operand(fx, imm8).load_scalar(fx);
let a_0 = a.value_lane(fx, 0).load_scalar(fx);
let a_1 = a.value_lane(fx, 1).load_scalar(fx);
let a_low = fx.bcx.ins().iconcat(a_0, a_1);
let a_2 = a.value_lane(fx, 2).load_scalar(fx);
let a_3 = a.value_lane(fx, 3).load_scalar(fx);
let a_high = fx.bcx.ins().iconcat(a_2, a_3);
let b_0 = b.value_lane(fx, 0).load_scalar(fx);
let b_1 = b.value_lane(fx, 1).load_scalar(fx);
let b_low = fx.bcx.ins().iconcat(b_0, b_1);
let b_2 = b.value_lane(fx, 2).load_scalar(fx);
let b_3 = b.value_lane(fx, 3).load_scalar(fx);
let b_high = fx.bcx.ins().iconcat(b_2, b_3);
fn select4(
fx: &mut FunctionCx<'_, '_, '_>,
a_high: Value,
a_low: Value,
b_high: Value,
b_low: Value,
control: Value,
) -> Value {
let a_or_b = fx.bcx.ins().band_imm(control, 0b0010);
let high_or_low = fx.bcx.ins().band_imm(control, 0b0001);
let is_zero = fx.bcx.ins().band_imm(control, 0b1000);
let zero = fx.bcx.ins().iconst(types::I64, 0);
let zero = fx.bcx.ins().iconcat(zero, zero);
let res_a = fx.bcx.ins().select(high_or_low, a_high, a_low);
let res_b = fx.bcx.ins().select(high_or_low, b_high, b_low);
let res = fx.bcx.ins().select(a_or_b, res_b, res_a);
fx.bcx.ins().select(is_zero, zero, res)
}
let control0 = imm8;
let res_low = select4(fx, a_high, a_low, b_high, b_low, control0);
let (res_0, res_1) = fx.bcx.ins().isplit(res_low);
let control1 = fx.bcx.ins().ushr_imm(imm8, 4);
let res_high = select4(fx, a_high, a_low, b_high, b_low, control1);
let (res_2, res_3) = fx.bcx.ins().isplit(res_high);
ret.place_lane(fx, 0).to_ptr().store(fx, res_0, MemFlags::trusted());
ret.place_lane(fx, 1).to_ptr().store(fx, res_1, MemFlags::trusted());
ret.place_lane(fx, 2).to_ptr().store(fx, res_2, MemFlags::trusted());
ret.place_lane(fx, 3).to_ptr().store(fx, res_3, MemFlags::trusted());
}
"llvm.x86.sse2.storeu.dq" => {
intrinsic_args!(fx, args => (mem_addr, a); intrinsic);
let mem_addr = mem_addr.load_scalar(fx);

View File

@@ -258,6 +258,27 @@ impl<'tcx> CValue<'tcx> {
}
}
/// Like [`CValue::value_lane`] except allowing a dynamically calculated lane index.
pub(crate) fn value_lane_dyn(
self,
fx: &mut FunctionCx<'_, '_, 'tcx>,
lane_idx: Value,
) -> CValue<'tcx> {
let layout = self.1;
assert!(layout.ty.is_simd());
let (_lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
let lane_layout = fx.layout_of(lane_ty);
match self.0 {
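// SIMD values are expected to live in memory here (cg_clif keeps vectors
// by reference), so only the plain ByRef case supports a dynamic index.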
CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(),
CValueInner::ByRef(ptr, None) => {
let field_offset = fx.bcx.ins().imul_imm(lane_idx, lane_layout.size.bytes() as i64);
let field_ptr = ptr.offset_value(fx, field_offset);
CValue::by_ref(field_ptr, lane_layout)
}
CValueInner::ByRef(_, Some(_)) => unreachable!(),
}
}
/// If `ty` is signed, `const_val` must already be sign extended.
pub(crate) fn const_val(
fx: &mut FunctionCx<'_, '_, 'tcx>,