Implement _mm_shuffle_epi8

This commit is contained in:
bjorn3 2023-06-06 09:24:10 +00:00
parent e4d0811360
commit c09ef96878
2 changed files with 34 additions and 10 deletions

View File

@@ -197,6 +197,7 @@ unsafe fn test_simd() {
test_mm_extract_epi8();
test_mm_insert_epi16();
test_mm_shuffle_epi8();
test_mm256_shuffle_epi8();
test_mm256_permute2x128_si256();
@@ -345,6 +346,26 @@ unsafe fn test_mm_insert_epi16() {
assert_eq_m128i(r, e);
}
/// Checks `_mm_shuffle_epi8` against a hand-computed result.
///
/// The selector vector mixes three kinds of bytes: plain in-range indices
/// (0..=15), one byte with the high bit set (128), and two bytes above 15
/// with the high bit clear (24 and 19). The expected vector below has 0 for
/// the high-bit byte and, for every other byte, the source element picked by
/// the selector's low four bits.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn test_mm_shuffle_epi8() {
    // Source table: the byte at position `i` holds `i + 1`, so a selected
    // element is easy to recognise in the output.
    #[rustfmt::skip]
    let table = _mm_setr_epi8(
        1, 2, 3, 4, 5, 6, 7, 8,
        9, 10, 11, 12, 13, 14, 15, 16,
    );
    // Per-byte selectors; `128_u8 as i8` is the byte with only the top bit set.
    #[rustfmt::skip]
    let selectors = _mm_setr_epi8(
        4, 128_u8 as i8, 4, 3,
        24, 12, 6, 19,
        12, 5, 5, 10,
        4, 1, 8, 0,
    );
    let want = _mm_setr_epi8(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1);
    let got = _mm_shuffle_epi8(table, selectors);
    assert_eq_m128i(got, want);
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn test_mm256_shuffle_epi8() {

View File

@@ -222,7 +222,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx2.pshuf.b" => {
"llvm.x86.ssse3.pshuf.b.128" | "llvm.x86.avx2.pshuf.b" => {
let (a, b) = match args {
[a, b] => (a, b),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
@@ -241,15 +241,18 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
let res = fx.bcx.ins().select(is_zero, zero, a_lane);
ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
}
for i in 16..32 {
let b_lane = b.value_lane(fx, i).load_scalar(fx);
let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80);
let b_lane_masked = fx.bcx.ins().band_imm(b_lane, 0xf);
let a_idx = fx.bcx.ins().iadd_imm(b_lane_masked, 16);
let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx);
let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx);
let res = fx.bcx.ins().select(is_zero, zero, a_lane);
ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
if intrinsic == "llvm.x86.avx2.pshuf.b" {
for i in 16..32 {
let b_lane = b.value_lane(fx, i).load_scalar(fx);
let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80);
let b_lane_masked = fx.bcx.ins().band_imm(b_lane, 0xf);
let a_idx = fx.bcx.ins().iadd_imm(b_lane_masked, 16);
let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx);
let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx);
let res = fx.bcx.ins().select(is_zero, zero, a_lane);
ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
}
}
}
"llvm.x86.avx2.vperm2i128" => {