Merge pull request #1495 from folkertdev/add-llvm-sse2-cvtps2dq

add `llvm.x86.sse2.cvtps2dq`
2024-06-06 23:25:16 +02:00 · 2024-06-06 23:25:16 +02:00 · c511676a62
commit c511676a62
parent 632e5df38a 63cb28ed48
2 changed files with 48 additions and 1 deletions
--- a/example/std_example.rs
+++ b/example/std_example.rs
@ -251,6 +251,9 @@ unsafe fn test_simd() {
    test_mm_add_epi8();
    test_mm_add_pd();
    test_mm_cvtepi8_epi16();
+    #[cfg(not(jit))]
+    test_mm_cvtps_epi32();
+    test_mm_cvttps_epi32();
    test_mm_cvtsi128_si64();

    test_mm_extract_epi8();
@ -476,6 +479,41 @@ unsafe fn test_mm256_permutevar8x32_epi32() {
    assert_eq_m256i(r, e);
 }

+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx2")]
+#[cfg(not(jit))]
+unsafe fn test_mm_cvtps_epi32() {
+    let floats: [f32; 4] = [1.5, -2.5, i32::MAX as f32 + 1.0, f32::NAN];
+
+    let float_vec = _mm_loadu_ps(floats.as_ptr());
+    let int_vec = _mm_cvtps_epi32(float_vec);
+
+    let mut ints: [i32; 4] = [0; 4];
+    _mm_storeu_si128(ints.as_mut_ptr() as *mut __m128i, int_vec);
+
+    // this is very different from `floats.map(|f| f as i32)`!
+    let expected_ints: [i32; 4] = [2, -2, i32::MIN, i32::MIN];
+
+    assert_eq!(ints, expected_ints);
+}
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx2")]
+unsafe fn test_mm_cvttps_epi32() {
+    let floats: [f32; 4] = [1.5, -2.5, i32::MAX as f32 + 1.0, f32::NAN];
+
+    let float_vec = _mm_loadu_ps(floats.as_ptr());
+    let int_vec = _mm_cvttps_epi32(float_vec);
+
+    let mut ints: [i32; 4] = [0; 4];
+    _mm_storeu_si128(ints.as_mut_ptr() as *mut __m128i, int_vec);
+
+    // this is very different from `floats.map(|f| f as i32)`!
+    let expected_ints: [i32; 4] = [1, -2, i32::MIN, i32::MIN];
+
+    assert_eq!(ints, expected_ints);
+}
+
 fn test_checked_mul() {
    let u: Option<u8> = u8::from_str_radix("1000", 10).ok();
    assert_eq!(u, None);
--- a/src/intrinsics/llvm_x86.rs
+++ b/src/intrinsics/llvm_x86.rs
@ -459,11 +459,20 @@ fn select4(
            intrinsic_args!(fx, args => (a); intrinsic);
            let a = a.load_scalar(fx);

+            let value = fx.bcx.ins().x86_cvtt2dq(types::I32X4, a);
+            let cvalue = CValue::by_val(value, ret.layout());
+            ret.write_cvalue(fx, cvalue);
+        }
+        "llvm.x86.sse2.cvtps2dq" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32
+            intrinsic_args!(fx, args => (a); intrinsic);
+            let a = a.load_scalar(fx);
+
            // Using inline asm instead of fcvt_to_sint_sat as unrepresentable values are turned
            // into 0x80000000 for which Cranelift doesn't have a native instruction.
            codegen_inline_asm_inner(
                fx,
-                &[InlineAsmTemplatePiece::String(format!("cvttps2dq xmm0, xmm0"))],
+                &[InlineAsmTemplatePiece::String(format!("cvtps2dq xmm0, xmm0"))],
                &[CInlineAsmOperand::InOut {
                    reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
                    _late: true,