From 63cb28ed481e2f3096af20293a24f231d063b301 Mon Sep 17 00:00:00 2001 From: Folkert Date: Thu, 6 Jun 2024 22:33:43 +0200 Subject: [PATCH] add `llvm.x86.sse2.cvtps2dq` --- example/std_example.rs | 38 ++++++++++++++++++++++++++++++++++++++ src/intrinsics/llvm_x86.rs | 11 ++++++++++- 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/example/std_example.rs b/example/std_example.rs index 7347b2e7789..6cedd84adfe 100644 --- a/example/std_example.rs +++ b/example/std_example.rs @@ -251,6 +251,9 @@ unsafe fn test_simd() { test_mm_add_epi8(); test_mm_add_pd(); test_mm_cvtepi8_epi16(); + #[cfg(not(jit))] + test_mm_cvtps_epi32(); + test_mm_cvttps_epi32(); test_mm_cvtsi128_si64(); test_mm_extract_epi8(); @@ -476,6 +479,41 @@ unsafe fn test_mm256_permutevar8x32_epi32() { assert_eq_m256i(r, e); } +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx2")] +#[cfg(not(jit))] +unsafe fn test_mm_cvtps_epi32() { + let floats: [f32; 4] = [1.5, -2.5, i32::MAX as f32 + 1.0, f32::NAN]; + + let float_vec = _mm_loadu_ps(floats.as_ptr()); + let int_vec = _mm_cvtps_epi32(float_vec); + + let mut ints: [i32; 4] = [0; 4]; + _mm_storeu_si128(ints.as_mut_ptr() as *mut __m128i, int_vec); + + // this is very different from `floats.map(|f| f as i32)`! + let expected_ints: [i32; 4] = [2, -2, i32::MIN, i32::MIN]; + + assert_eq!(ints, expected_ints); +} + +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx2")] +unsafe fn test_mm_cvttps_epi32() { + let floats: [f32; 4] = [1.5, -2.5, i32::MAX as f32 + 1.0, f32::NAN]; + + let float_vec = _mm_loadu_ps(floats.as_ptr()); + let int_vec = _mm_cvttps_epi32(float_vec); + + let mut ints: [i32; 4] = [0; 4]; + _mm_storeu_si128(ints.as_mut_ptr() as *mut __m128i, int_vec); + + // this is very different from `floats.map(|f| f as i32)`! + let expected_ints: [i32; 4] = [1, -2, i32::MIN, i32::MIN]; + + assert_eq!(ints, expected_ints); +} + fn test_checked_mul() { let u: Option = u8::from_str_radix("1000", 10).ok(); assert_eq!(u, None); diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs index 03dca0656ef..166b260b38a 100644 --- a/src/intrinsics/llvm_x86.rs +++ b/src/intrinsics/llvm_x86.rs @@ -459,11 +459,20 @@ fn select4( intrinsic_args!(fx, args => (a); intrinsic); let a = a.load_scalar(fx); + let value = fx.bcx.ins().x86_cvtt2dq(types::I32X4, a); + let cvalue = CValue::by_val(value, ret.layout()); + ret.write_cvalue(fx, cvalue); + } + "llvm.x86.sse2.cvtps2dq" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32 + intrinsic_args!(fx, args => (a); intrinsic); + let a = a.load_scalar(fx); + // Using inline asm instead of fcvt_to_sint_sat as unrepresentable values are turned // into 0x80000000 for which Cranelift doesn't have a native instruction. codegen_inline_asm_inner( fx, - &[InlineAsmTemplatePiece::String(format!("cvttps2dq xmm0, xmm0"))], + &[InlineAsmTemplatePiece::String(format!("cvtps2dq xmm0, xmm0"))], &[CInlineAsmOperand::InOut { reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)), _late: true,