diff --git a/example/std_example.rs b/example/std_example.rs index 811dbb267cd..1bf0ff64c92 100644 --- a/example/std_example.rs +++ b/example/std_example.rs @@ -197,6 +197,7 @@ unsafe fn test_simd() { test_mm_extract_epi8(); test_mm_insert_epi16(); + test_mm_shuffle_epi8(); test_mm256_shuffle_epi8(); test_mm256_permute2x128_si256(); @@ -345,6 +346,26 @@ unsafe fn test_mm_insert_epi16() { assert_eq_m128i(r, e); } +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "ssse3")] +unsafe fn test_mm_shuffle_epi8() { + #[rustfmt::skip] + let a = _mm_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + ); + #[rustfmt::skip] + let b = _mm_setr_epi8( + 4, 128_u8 as i8, 4, 3, + 24, 12, 6, 19, + 12, 5, 5, 10, + 4, 1, 8, 0, + ); + let expected = _mm_setr_epi8(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1); + let r = _mm_shuffle_epi8(a, b); + assert_eq_m128i(r, expected); +} + #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx2")] unsafe fn test_mm256_shuffle_epi8() { diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs index 2202bf13808..bbd5f4be783 100644 --- a/src/intrinsics/llvm_x86.rs +++ b/src/intrinsics/llvm_x86.rs @@ -222,7 +222,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( _ => fx.bcx.ins().iconst(types::I32, 0), }); } - "llvm.x86.avx2.pshuf.b" => { + "llvm.x86.ssse3.pshuf.b.128" | "llvm.x86.avx2.pshuf.b" => { let (a, b) = match args { [a, b] => (a, b), _ => bug!("wrong number of args for intrinsic {intrinsic}"), @@ -241,15 +241,18 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( let res = fx.bcx.ins().select(is_zero, zero, a_lane); ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted()); } - for i in 16..32 { - let b_lane = b.value_lane(fx, i).load_scalar(fx); - let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80); - let b_lane_masked = fx.bcx.ins().band_imm(b_lane, 0xf); - let a_idx = fx.bcx.ins().iadd_imm(b_lane_masked, 16); - let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx); - let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx); - let res = fx.bcx.ins().select(is_zero, zero, a_lane); - ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted()); + + if intrinsic == "llvm.x86.avx2.pshuf.b" { + for i in 16..32 { + let b_lane = b.value_lane(fx, i).load_scalar(fx); + let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80); + let b_lane_masked = fx.bcx.ins().band_imm(b_lane, 0xf); + let a_idx = fx.bcx.ins().iadd_imm(b_lane_masked, 16); + let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx); + let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx); + let res = fx.bcx.ins().select(is_zero, zero, a_lane); + ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted()); + } } } "llvm.x86.avx2.vperm2i128" => {