Merge pull request #1495 from folkertdev/add-llvm-sse2-cvtps2dq

add `llvm.x86.sse2.cvtps2dq`
This commit is contained in:
bjorn3 2024-06-06 23:25:16 +02:00 committed by GitHub
commit c511676a62
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 48 additions and 1 deletions

View File

@ -251,6 +251,9 @@ unsafe fn test_simd() {
test_mm_add_epi8();
test_mm_add_pd();
test_mm_cvtepi8_epi16();
#[cfg(not(jit))]
test_mm_cvtps_epi32();
test_mm_cvttps_epi32();
test_mm_cvtsi128_si64();
test_mm_extract_epi8();
@ -476,6 +479,41 @@ unsafe fn test_mm256_permutevar8x32_epi32() {
assert_eq_m256i(r, e);
}
/// Exercises `_mm_cvtps_epi32` (packed `f32` -> `i32` conversion) on a tie
/// above zero, a tie below zero, a value just past `i32::MAX`, and NaN.
///
/// Not compiled when the `jit` cfg is set.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[cfg(not(jit))]
unsafe fn test_mm_cvtps_epi32() {
    let input: [f32; 4] = [1.5, -2.5, i32::MAX as f32 + 1.0, f32::NAN];
    let mut output = [0i32; 4];

    // Unaligned load, convert all four lanes, then unaligned store back into a plain array.
    let converted = _mm_cvtps_epi32(_mm_loadu_ps(input.as_ptr()));
    _mm_storeu_si128(output.as_mut_ptr().cast::<__m128i>(), converted);

    // Not the same as `input.map(|f| f as i32)`: here 1.5 becomes 2 and -2.5 becomes -2
    // (rounded rather than truncated), and both the out-of-range lane and the NaN lane
    // come back as `i32::MIN`.
    let expected: [i32; 4] = [2, -2, i32::MIN, i32::MIN];
    assert_eq!(output, expected);
}
/// Exercises `_mm_cvttps_epi32` (truncating packed `f32` -> `i32` conversion) on
/// two fractional values, a value just past `i32::MAX`, and NaN.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn test_mm_cvttps_epi32() {
    let input: [f32; 4] = [1.5, -2.5, i32::MAX as f32 + 1.0, f32::NAN];
    let mut output = [0i32; 4];

    // Unaligned load, convert all four lanes, then unaligned store back into a plain array.
    let converted = _mm_cvttps_epi32(_mm_loadu_ps(input.as_ptr()));
    _mm_storeu_si128(output.as_mut_ptr().cast::<__m128i>(), converted);

    // Not the same as `input.map(|f| f as i32)`: the fractional lanes are truncated
    // toward zero (1.5 -> 1, -2.5 -> -2), but both the out-of-range lane and the NaN
    // lane come back as `i32::MIN`.
    let expected: [i32; 4] = [1, -2, i32::MIN, i32::MIN];
    assert_eq!(output, expected);
}
fn test_checked_mul() {
let u: Option<u8> = u8::from_str_radix("1000", 10).ok();
assert_eq!(u, None);

View File

@ -459,11 +459,20 @@ fn select4(
intrinsic_args!(fx, args => (a); intrinsic);
let a = a.load_scalar(fx);
let value = fx.bcx.ins().x86_cvtt2dq(types::I32X4, a);
let cvalue = CValue::by_val(value, ret.layout());
ret.write_cvalue(fx, cvalue);
}
"llvm.x86.sse2.cvtps2dq" => {
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32
intrinsic_args!(fx, args => (a); intrinsic);
let a = a.load_scalar(fx);
// Using inline asm instead of fcvt_to_sint_sat as unrepresentable values are turned
// into 0x80000000 for which Cranelift doesn't have a native instruction.
codegen_inline_asm_inner(
fx,
&[InlineAsmTemplatePiece::String(format!("cvttps2dq xmm0, xmm0"))],
&[InlineAsmTemplatePiece::String(format!("cvtps2dq xmm0, xmm0"))],
&[CInlineAsmOperand::InOut {
reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
_late: true,