Implement all vendor intrinsics used by regex on AVX2 systems

This allows it to work with --sysroot llvm
This commit is contained in:
bjorn3 2023-06-05 08:55:43 +00:00
parent e369cce377
commit 76900705e8
3 changed files with 225 additions and 1 deletions

View File

@ -198,6 +198,9 @@ unsafe fn test_simd() {
test_mm_extract_epi8();
test_mm_insert_epi16();
test_mm256_shuffle_epi8();
test_mm256_permute2x128_si256();
#[rustfmt::skip]
let mask1 = _mm_movemask_epi8(dbg!(_mm_setr_epi8(255u8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)));
assert_eq!(mask1, 1);
@ -293,6 +296,12 @@ pub unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
}
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
pub unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) {
assert_eq!(std::mem::transmute::<_, [u64; 4]>(a), std::mem::transmute::<_, [u64; 4]>(b))
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvtsi128_si64() {
@ -336,6 +345,44 @@ unsafe fn test_mm_insert_epi16() {
assert_eq_m128i(r, e);
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn test_mm256_shuffle_epi8() {
#[rustfmt::skip]
let a = _mm256_setr_epi8(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32,
);
#[rustfmt::skip]
let b = _mm256_setr_epi8(
4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
12, 5, 5, 10, 4, 1, 8, 0,
4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
12, 5, 5, 10, 4, 1, 8, 0,
);
#[rustfmt::skip]
let expected = _mm256_setr_epi8(
5, 0, 5, 4, 9, 13, 7, 4,
13, 6, 6, 11, 5, 2, 9, 1,
21, 0, 21, 20, 25, 29, 23, 20,
29, 22, 22, 27, 21, 18, 25, 17,
);
let r = _mm256_shuffle_epi8(a, b);
assert_eq_m256i(r, expected);
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn test_mm256_permute2x128_si256() {
let a = _mm256_setr_epi64x(100, 200, 500, 600);
let b = _mm256_setr_epi64x(300, 400, 700, 800);
let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b);
let e = _mm256_setr_epi64x(700, 800, 500, 600);
assert_eq_m256i(r, e);
}
fn test_checked_mul() {
let u: Option<u8> = u8::from_str_radix("1000", 10).ok();
assert_eq!(u, None);

View File

@ -110,7 +110,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.sse2.psrli.d imm8 not const");
.expect("llvm.x86.sse2.pslli.d imm8 not const");
simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
@ -120,6 +120,162 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx.psrli.d" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.avx.psrli.d imm8 not const");
simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
.unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
{
imm8 if imm8 < 32 => fx.bcx.ins().ushr_imm(lane, i64::from(imm8 as u8)),
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx.pslli.d" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.avx.pslli.d imm8 not const");
simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
.unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
{
imm8 if imm8 < 32 => fx.bcx.ins().ishl_imm(lane, i64::from(imm8 as u8)),
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx2.psrli.w" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.avx.psrli.w imm8 not const");
simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
.unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
{
imm8 if imm8 < 16 => fx.bcx.ins().ushr_imm(lane, i64::from(imm8 as u8)),
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx2.pslli.w" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.avx.pslli.w imm8 not const");
simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
.unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
{
imm8 if imm8 < 16 => fx.bcx.ins().ishl_imm(lane, i64::from(imm8 as u8)),
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx2.pshuf.b" => {
let (a, b) = match args {
[a, b] => (a, b),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let b = codegen_operand(fx, b);
// Based on the pseudocode at https://github.com/rust-lang/stdarch/blob/1cfbca8b38fd9b4282b2f054f61c6ca69fc7ce29/crates/core_arch/src/x86/avx2.rs#L2319-L2332
let zero = fx.bcx.ins().iconst(types::I8, 0);
for i in 0..16 {
let b_lane = b.value_lane(fx, i).load_scalar(fx);
let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80);
let a_idx = fx.bcx.ins().band_imm(b_lane, 0xf);
let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx);
let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx);
let res = fx.bcx.ins().select(is_zero, zero, a_lane);
ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
}
for i in 16..32 {
let b_lane = b.value_lane(fx, i).load_scalar(fx);
let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80);
let b_lane_masked = fx.bcx.ins().band_imm(b_lane, 0xf);
let a_idx = fx.bcx.ins().iadd_imm(b_lane_masked, 16);
let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx);
let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx);
let res = fx.bcx.ins().select(is_zero, zero, a_lane);
ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
}
}
"llvm.x86.avx2.vperm2i128" => {
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256
let (a, b, imm8) = match args {
[a, b, imm8] => (a, b, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let b = codegen_operand(fx, b);
let imm8 = codegen_operand(fx, imm8).load_scalar(fx);
let a_0 = a.value_lane(fx, 0).load_scalar(fx);
let a_1 = a.value_lane(fx, 1).load_scalar(fx);
let a_low = fx.bcx.ins().iconcat(a_0, a_1);
let a_2 = a.value_lane(fx, 2).load_scalar(fx);
let a_3 = a.value_lane(fx, 3).load_scalar(fx);
let a_high = fx.bcx.ins().iconcat(a_2, a_3);
let b_0 = b.value_lane(fx, 0).load_scalar(fx);
let b_1 = b.value_lane(fx, 1).load_scalar(fx);
let b_low = fx.bcx.ins().iconcat(b_0, b_1);
let b_2 = b.value_lane(fx, 2).load_scalar(fx);
let b_3 = b.value_lane(fx, 3).load_scalar(fx);
let b_high = fx.bcx.ins().iconcat(b_2, b_3);
fn select4(
fx: &mut FunctionCx<'_, '_, '_>,
a_high: Value,
a_low: Value,
b_high: Value,
b_low: Value,
control: Value,
) -> Value {
let a_or_b = fx.bcx.ins().band_imm(control, 0b0010);
let high_or_low = fx.bcx.ins().band_imm(control, 0b0001);
let is_zero = fx.bcx.ins().band_imm(control, 0b1000);
let zero = fx.bcx.ins().iconst(types::I64, 0);
let zero = fx.bcx.ins().iconcat(zero, zero);
let res_a = fx.bcx.ins().select(high_or_low, a_high, a_low);
let res_b = fx.bcx.ins().select(high_or_low, b_high, b_low);
let res = fx.bcx.ins().select(a_or_b, res_b, res_a);
fx.bcx.ins().select(is_zero, zero, res)
}
let control0 = imm8;
let res_low = select4(fx, a_high, a_low, b_high, b_low, control0);
let (res_0, res_1) = fx.bcx.ins().isplit(res_low);
let control1 = fx.bcx.ins().ushr_imm(imm8, 4);
let res_high = select4(fx, a_high, a_low, b_high, b_low, control1);
let (res_2, res_3) = fx.bcx.ins().isplit(res_high);
ret.place_lane(fx, 0).to_ptr().store(fx, res_0, MemFlags::trusted());
ret.place_lane(fx, 1).to_ptr().store(fx, res_1, MemFlags::trusted());
ret.place_lane(fx, 2).to_ptr().store(fx, res_2, MemFlags::trusted());
ret.place_lane(fx, 3).to_ptr().store(fx, res_3, MemFlags::trusted());
}
"llvm.x86.sse2.storeu.dq" => {
intrinsic_args!(fx, args => (mem_addr, a); intrinsic);
let mem_addr = mem_addr.load_scalar(fx);

View File

@ -258,6 +258,27 @@ pub(crate) fn value_lane(
}
}
/// Like [`CValue::value_lane`] except allowing a dynamically calculated lane index.
pub(crate) fn value_lane_dyn(
self,
fx: &mut FunctionCx<'_, '_, 'tcx>,
lane_idx: Value,
) -> CValue<'tcx> {
let layout = self.1;
assert!(layout.ty.is_simd());
let (_lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
let lane_layout = fx.layout_of(lane_ty);
match self.0 {
CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(),
CValueInner::ByRef(ptr, None) => {
let field_offset = fx.bcx.ins().imul_imm(lane_idx, lane_layout.size.bytes() as i64);
let field_ptr = ptr.offset_value(fx, field_offset);
CValue::by_ref(field_ptr, lane_layout)
}
CValueInner::ByRef(_, Some(_)) => unreachable!(),
}
}
/// If `ty` is signed, `const_val` must already be sign extended.
pub(crate) fn const_val(
fx: &mut FunctionCx<'_, '_, 'tcx>,