Merge pull request #1378 from bjorn3/more_vendor_intrinsics
Implement all vendor intrinsics used by regex on AVX2 systems
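Summarizing from the diff below: this adds the AVX/AVX2 immediate shift families (llvm.x86.avx.psrli.d, llvm.x86.avx.pslli.d, llvm.x86.avx2.psrli.w, llvm.x86.avx2.pslli.w), the byte shuffle llvm.x86.avx2.pshuf.b, and the 128-bit-lane permute llvm.x86.avx2.vperm2i128, together with a CValue::value_lane_dyn helper for runtime lane indexing, tests in the SIMD example, and a CI step that runs the test suite against an LLVM-built sysroot.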
commit 204c64bda1
.github/workflows/main.yml (vendored) | 18 ++++++++++++------
@@ -93,12 +93,6 @@ jobs:
       - name: Prepare dependencies
         run: ./y.rs prepare
 
-      - name: Build without unstable features
-        env:
-          TARGET_TRIPLE: ${{ matrix.env.TARGET_TRIPLE }}
-        # This is the config rust-lang/rust uses for builds
-        run: ./y.rs build --no-unstable-features
-
       - name: Build
         run: ./y.rs build --sysroot none
 
@@ -107,6 +101,18 @@ jobs:
           TARGET_TRIPLE: ${{ matrix.env.TARGET_TRIPLE }}
         run: ./y.rs test
 
+      - name: Install LLVM standard library
+        run: rustup target add ${{ matrix.env.TARGET_TRIPLE }}
+
+      # This is roughly the config rust-lang/rust uses for testing
+      - name: Test with LLVM sysroot
+        # Skip native x86_64-pc-windows-gnu. It is way too slow and cross-compiled
+        # x86_64-pc-windows-gnu covers at least part of the tests.
+        if: matrix.os != 'windows-latest' || matrix.env.TARGET_TRIPLE != 'x86_64-pc-windows-gnu'
+        env:
+          TARGET_TRIPLE: ${{ matrix.env.TARGET_TRIPLE }}
+        run: ./y.rs test --sysroot llvm --no-unstable-features
+
 
   # This job doesn't use cg_clif in any way. It checks that all cg_clif tests work with cg_llvm too.
   test_llvm:
example/alloc_example.rs

@@ -1,4 +1,4 @@
-#![feature(start, core_intrinsics, alloc_error_handler)]
+#![feature(start, core_intrinsics, alloc_error_handler, lang_items)]
 #![no_std]
 
 extern crate alloc;
@@ -27,6 +27,11 @@ fn alloc_error_handler(_: alloc::alloc::Layout) -> ! {
     core::intrinsics::abort();
 }
 
+#[lang = "eh_personality"]
+fn eh_personality() -> ! {
+    loop {}
+}
+
 #[start]
 fn main(_argc: isize, _argv: *const *const u8) -> isize {
     let world: Box<&str> = Box::new("Hello World!\0");
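The `lang_items` feature and the diverging `eh_personality` stub are presumably needed so this no_std example still links when the examples are built against the LLVM sysroot enabled in the CI change above; the example never unwinds, so an empty loop is enough to satisfy the reference.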
example/std_example.rs

@@ -198,6 +198,9 @@ unsafe fn test_simd() {
     test_mm_extract_epi8();
     test_mm_insert_epi16();
 
+    test_mm256_shuffle_epi8();
+    test_mm256_permute2x128_si256();
+
     #[rustfmt::skip]
     let mask1 = _mm_movemask_epi8(dbg!(_mm_setr_epi8(255u8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)));
     assert_eq!(mask1, 1);
@@ -293,6 +296,12 @@ pub unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
     }
 }
 
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx")]
+pub unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) {
+    assert_eq!(std::mem::transmute::<_, [u64; 4]>(a), std::mem::transmute::<_, [u64; 4]>(b))
+}
+
 #[cfg(target_arch = "x86_64")]
 #[target_feature(enable = "sse2")]
 unsafe fn test_mm_cvtsi128_si64() {
@@ -336,6 +345,44 @@ unsafe fn test_mm_insert_epi16() {
     assert_eq_m128i(r, e);
 }
 
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx2")]
+unsafe fn test_mm256_shuffle_epi8() {
+    #[rustfmt::skip]
+    let a = _mm256_setr_epi8(
+        1, 2, 3, 4, 5, 6, 7, 8,
+        9, 10, 11, 12, 13, 14, 15, 16,
+        17, 18, 19, 20, 21, 22, 23, 24,
+        25, 26, 27, 28, 29, 30, 31, 32,
+    );
+    #[rustfmt::skip]
+    let b = _mm256_setr_epi8(
+        4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
+        12, 5, 5, 10, 4, 1, 8, 0,
+        4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
+        12, 5, 5, 10, 4, 1, 8, 0,
+    );
+    #[rustfmt::skip]
+    let expected = _mm256_setr_epi8(
+        5, 0, 5, 4, 9, 13, 7, 4,
+        13, 6, 6, 11, 5, 2, 9, 1,
+        21, 0, 21, 20, 25, 29, 23, 20,
+        29, 22, 22, 27, 21, 18, 25, 17,
+    );
+    let r = _mm256_shuffle_epi8(a, b);
+    assert_eq_m256i(r, expected);
+}
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx2")]
+unsafe fn test_mm256_permute2x128_si256() {
+    let a = _mm256_setr_epi64x(100, 200, 500, 600);
+    let b = _mm256_setr_epi64x(300, 400, 700, 800);
+    let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b);
+    let e = _mm256_setr_epi64x(700, 800, 500, 600);
+    assert_eq_m256i(r, e);
+}
+
 fn test_checked_mul() {
     let u: Option<u8> = u8::from_str_radix("1000", 10).ok();
     assert_eq!(u, None);
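For reference, what test_mm256_shuffle_epi8 pins down: _mm256_shuffle_epi8 shuffles each 128-bit half independently, and a control byte with its high bit set zeroes the output lane. A scalar sketch of the semantics (the helper name is illustrative, not part of the PR):

    // Scalar model of _mm256_shuffle_epi8: each 128-bit half is shuffled
    // independently; control bytes with the high bit (0x80) set produce 0.
    fn pshufb256(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
        let mut r = [0u8; 32];
        for i in 0..32 {
            let half = i & !15; // 0 for lanes 0..16, 16 for lanes 16..32
            if b[i] & 0x80 == 0 {
                r[i] = a[half + (b[i] & 0x0f) as usize];
            }
        }
        r
    }

Feeding it the test's `a` and `b` byte patterns reproduces the `expected` vector above.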
src/intrinsics/llvm_x86.rs

@@ -110,7 +110,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             };
             let a = codegen_operand(fx, a);
             let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
-                .expect("llvm.x86.sse2.psrli.d imm8 not const");
+                .expect("llvm.x86.sse2.pslli.d imm8 not const");
 
             simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
                 .try_to_bits(Size::from_bytes(4))
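This hunk corrects a copied `expect` message: the arm handles pslli.d, not psrli.d. The shift arms added below all follow the same pattern; in scalar terms (illustrative helper, not PR code):

    // x86 immediate vector shifts zero the lane once the count reaches the
    // lane width, instead of masking the count; hence the `imm8 < 32` guard
    // (`imm8 < 16` for the 16-bit-lane variants).
    fn pslli_d_lane(lane: u32, imm8: u32) -> u32 {
        if imm8 < 32 { lane << imm8 } else { 0 }
    }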
@@ -120,6 +120,162 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
                 _ => fx.bcx.ins().iconst(types::I32, 0),
             });
         }
+        "llvm.x86.avx.psrli.d" => {
+            let (a, imm8) = match args {
+                [a, imm8] => (a, imm8),
+                _ => bug!("wrong number of args for intrinsic {intrinsic}"),
+            };
+            let a = codegen_operand(fx, a);
+            let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
+                .expect("llvm.x86.avx.psrli.d imm8 not const");
+
+            simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
+                .try_to_bits(Size::from_bytes(4))
+                .unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
+            {
+                imm8 if imm8 < 32 => fx.bcx.ins().ushr_imm(lane, i64::from(imm8 as u8)),
+                _ => fx.bcx.ins().iconst(types::I32, 0),
+            });
+        }
+        "llvm.x86.avx.pslli.d" => {
+            let (a, imm8) = match args {
+                [a, imm8] => (a, imm8),
+                _ => bug!("wrong number of args for intrinsic {intrinsic}"),
+            };
+            let a = codegen_operand(fx, a);
+            let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
+                .expect("llvm.x86.avx.pslli.d imm8 not const");
+
+            simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
+                .try_to_bits(Size::from_bytes(4))
+                .unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
+            {
+                imm8 if imm8 < 32 => fx.bcx.ins().ishl_imm(lane, i64::from(imm8 as u8)),
+                _ => fx.bcx.ins().iconst(types::I32, 0),
+            });
+        }
+        "llvm.x86.avx2.psrli.w" => {
+            let (a, imm8) = match args {
+                [a, imm8] => (a, imm8),
+                _ => bug!("wrong number of args for intrinsic {intrinsic}"),
+            };
+            let a = codegen_operand(fx, a);
+            let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
+                .expect("llvm.x86.avx2.psrli.w imm8 not const");
+
+            simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
+                .try_to_bits(Size::from_bytes(4))
+                .unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
+            {
+                imm8 if imm8 < 16 => fx.bcx.ins().ushr_imm(lane, i64::from(imm8 as u8)),
+                _ => fx.bcx.ins().iconst(types::I32, 0),
+            });
+        }
+        "llvm.x86.avx2.pslli.w" => {
+            let (a, imm8) = match args {
+                [a, imm8] => (a, imm8),
+                _ => bug!("wrong number of args for intrinsic {intrinsic}"),
+            };
+            let a = codegen_operand(fx, a);
+            let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
+                .expect("llvm.x86.avx2.pslli.w imm8 not const");
+
+            simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
+                .try_to_bits(Size::from_bytes(4))
+                .unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
+            {
+                imm8 if imm8 < 16 => fx.bcx.ins().ishl_imm(lane, i64::from(imm8 as u8)),
+                _ => fx.bcx.ins().iconst(types::I32, 0),
+            });
+        }
+        "llvm.x86.avx2.pshuf.b" => {
+            let (a, b) = match args {
+                [a, b] => (a, b),
+                _ => bug!("wrong number of args for intrinsic {intrinsic}"),
+            };
+            let a = codegen_operand(fx, a);
+            let b = codegen_operand(fx, b);
+
+            // Based on the pseudocode at https://github.com/rust-lang/stdarch/blob/1cfbca8b38fd9b4282b2f054f61c6ca69fc7ce29/crates/core_arch/src/x86/avx2.rs#L2319-L2332
+            let zero = fx.bcx.ins().iconst(types::I8, 0);
+            for i in 0..16 {
+                let b_lane = b.value_lane(fx, i).load_scalar(fx);
+                let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80);
+                let a_idx = fx.bcx.ins().band_imm(b_lane, 0xf);
+                let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx);
+                let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx);
+                let res = fx.bcx.ins().select(is_zero, zero, a_lane);
+                ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
+            }
+            for i in 16..32 {
+                let b_lane = b.value_lane(fx, i).load_scalar(fx);
+                let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80);
+                let b_lane_masked = fx.bcx.ins().band_imm(b_lane, 0xf);
+                let a_idx = fx.bcx.ins().iadd_imm(b_lane_masked, 16);
+                let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx);
+                let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx);
+                let res = fx.bcx.ins().select(is_zero, zero, a_lane);
+                ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
+            }
+        }
+        "llvm.x86.avx2.vperm2i128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256
+            let (a, b, imm8) = match args {
+                [a, b, imm8] => (a, b, imm8),
+                _ => bug!("wrong number of args for intrinsic {intrinsic}"),
+            };
+            let a = codegen_operand(fx, a);
+            let b = codegen_operand(fx, b);
+            let imm8 = codegen_operand(fx, imm8).load_scalar(fx);
+
+            let a_0 = a.value_lane(fx, 0).load_scalar(fx);
+            let a_1 = a.value_lane(fx, 1).load_scalar(fx);
+            let a_low = fx.bcx.ins().iconcat(a_0, a_1);
+            let a_2 = a.value_lane(fx, 2).load_scalar(fx);
+            let a_3 = a.value_lane(fx, 3).load_scalar(fx);
+            let a_high = fx.bcx.ins().iconcat(a_2, a_3);
+
+            let b_0 = b.value_lane(fx, 0).load_scalar(fx);
+            let b_1 = b.value_lane(fx, 1).load_scalar(fx);
+            let b_low = fx.bcx.ins().iconcat(b_0, b_1);
+            let b_2 = b.value_lane(fx, 2).load_scalar(fx);
+            let b_3 = b.value_lane(fx, 3).load_scalar(fx);
+            let b_high = fx.bcx.ins().iconcat(b_2, b_3);
+
+            fn select4(
+                fx: &mut FunctionCx<'_, '_, '_>,
+                a_high: Value,
+                a_low: Value,
+                b_high: Value,
+                b_low: Value,
+                control: Value,
+            ) -> Value {
+                let a_or_b = fx.bcx.ins().band_imm(control, 0b0010);
+                let high_or_low = fx.bcx.ins().band_imm(control, 0b0001);
+                let is_zero = fx.bcx.ins().band_imm(control, 0b1000);
+
+                let zero = fx.bcx.ins().iconst(types::I64, 0);
+                let zero = fx.bcx.ins().iconcat(zero, zero);
+
+                let res_a = fx.bcx.ins().select(high_or_low, a_high, a_low);
+                let res_b = fx.bcx.ins().select(high_or_low, b_high, b_low);
+                let res = fx.bcx.ins().select(a_or_b, res_b, res_a);
+                fx.bcx.ins().select(is_zero, zero, res)
+            }
+
+            let control0 = imm8;
+            let res_low = select4(fx, a_high, a_low, b_high, b_low, control0);
+            let (res_0, res_1) = fx.bcx.ins().isplit(res_low);
+
+            let control1 = fx.bcx.ins().ushr_imm(imm8, 4);
+            let res_high = select4(fx, a_high, a_low, b_high, b_low, control1);
+            let (res_2, res_3) = fx.bcx.ins().isplit(res_high);
+
+            ret.place_lane(fx, 0).to_ptr().store(fx, res_0, MemFlags::trusted());
+            ret.place_lane(fx, 1).to_ptr().store(fx, res_1, MemFlags::trusted());
+            ret.place_lane(fx, 2).to_ptr().store(fx, res_2, MemFlags::trusted());
+            ret.place_lane(fx, 3).to_ptr().store(fx, res_3, MemFlags::trusted());
+        }
         "llvm.x86.sse2.storeu.dq" => {
             intrinsic_args!(fx, args => (mem_addr, a); intrinsic);
             let mem_addr = mem_addr.load_scalar(fx);
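The `select4` helper mirrors the control-field encoding from the Intel pseudocode: for each 128-bit destination half, bit 1 of its 4-bit control field chooses between `a` and `b`, bit 0 chooses that source's low or high half, and bit 3 forces zero. A scalar sketch (illustrative names, not PR code):

    // Scalar model of _mm256_permute2x128_si256 / llvm.x86.avx2.vperm2i128,
    // treating each source as [low_half, high_half] of 128 bits.
    fn vperm2i128(a: [u128; 2], b: [u128; 2], imm8: u8) -> [u128; 2] {
        let select4 = |control: u8| -> u128 {
            if control & 0b1000 != 0 {
                return 0; // bit 3: zero this half
            }
            let src = if control & 0b0010 != 0 { b } else { a }; // bit 1: a or b
            src[(control & 0b0001) as usize] // bit 0: low (0) or high (1) half
        };
        [select4(imm8 & 0x0f), select4(imm8 >> 4)]
    }

With the test's operands, `imm8 = 0b00_01_00_11` picks `b`'s high half for the low lanes and `a`'s high half for the high lanes, matching `e = (700, 800, 500, 600)`.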
src/value_and_place.rs

@@ -258,6 +258,27 @@ impl<'tcx> CValue<'tcx> {
         }
     }
 
+    /// Like [`CValue::value_lane`] except allowing a dynamically calculated lane index.
+    pub(crate) fn value_lane_dyn(
+        self,
+        fx: &mut FunctionCx<'_, '_, 'tcx>,
+        lane_idx: Value,
+    ) -> CValue<'tcx> {
+        let layout = self.1;
+        assert!(layout.ty.is_simd());
+        let (_lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+        let lane_layout = fx.layout_of(lane_ty);
+        match self.0 {
+            CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(),
+            CValueInner::ByRef(ptr, None) => {
+                let field_offset = fx.bcx.ins().imul_imm(lane_idx, lane_layout.size.bytes() as i64);
+                let field_ptr = ptr.offset_value(fx, field_offset);
+                CValue::by_ref(field_ptr, lane_layout)
+            }
+            CValueInner::ByRef(_, Some(_)) => unreachable!(),
+        }
+    }
+
     /// If `ty` is signed, `const_val` must already be sign extended.
     pub(crate) fn const_val(
         fx: &mut FunctionCx<'_, '_, 'tcx>,
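`value_lane_dyn` is what the pshuf.b lowering above uses to index the source vector with a runtime lane number: it computes `lane_ptr = base + lane_idx * lane_size` and returns the lane by reference, so it only supports by-reference SIMD values (the `unreachable!` arms guard the by-value representations, which never hold vectors here).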