From d3c9cc57d291efa09963add37b11cc52edcae19e Mon Sep 17 00:00:00 2001
From: Antoni Boucher
Date: Tue, 3 Sep 2024 12:03:53 -0400
Subject: [PATCH] Add support for missing SIMD intrinsics

---
 src/intrinsic/llvm.rs | 45 ++++++++++++++++++++++++++++++++++++++++++-
 src/intrinsic/simd.rs | 18 ++++++++++++++++-
 2 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/src/intrinsic/llvm.rs b/src/intrinsic/llvm.rs
index 2287c96b41b..614bdbe26b8 100644
--- a/src/intrinsic/llvm.rs
+++ b/src/intrinsic/llvm.rs
@@ -182,7 +182,10 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(
             | "__builtin_ia32_vplzcntd_128_mask"
             | "__builtin_ia32_vplzcntq_512_mask"
             | "__builtin_ia32_vplzcntq_256_mask"
-            | "__builtin_ia32_vplzcntq_128_mask" => {
+            | "__builtin_ia32_vplzcntq_128_mask"
+            | "__builtin_ia32_cvtqq2pd128_mask"
+            | "__builtin_ia32_cvtqq2pd256_mask"
+            | "__builtin_ia32_cvtqq2ps256_mask" => {
                 let mut new_args = args.to_vec();
                 // Remove last arg as it doesn't seem to be used in GCC and is always false.
                 new_args.pop();
@@ -378,6 +381,23 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(
                 );
                 args = vec![arg.get_address(None)].into();
             }
+            "__builtin_ia32_cvtqq2pd512_mask" | "__builtin_ia32_cvtqq2ps512_mask" => {
+                let mut old_args = args.to_vec();
+                let mut new_args = vec![];
+                new_args.push(old_args.swap_remove(0));
+                let arg2_type = gcc_func.get_param_type(1);
+                let vector_type = arg2_type.dyncast_vector().expect("vector type");
+                let zero = builder.context.new_rvalue_zero(vector_type.get_element_type());
+                let num_units = vector_type.get_num_units();
+                let first_arg =
+                    builder.context.new_rvalue_from_vector(None, arg2_type, &vec![zero; num_units]);
+                new_args.push(first_arg);
+                let arg3_type = gcc_func.get_param_type(2);
+                let minus_one = builder.context.new_rvalue_from_int(arg3_type, -1);
+                new_args.push(minus_one);
+                new_args.push(old_args.swap_remove(0));
+                args = new_args.into();
+            }
             _ => (),
         }
     } else {
@@ -987,6 +1007,29 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function
         "llvm.x86.avx512.vpdpbusds.128" => "__builtin_ia32_vpdpbusds_v4si",
         "llvm.x86.xsave" => "__builtin_ia32_xsave",
         "llvm.x86.xsaveopt" => "__builtin_ia32_xsaveopt",
+        "llvm.x86.avx512.mask.loadu.w.512" => "__builtin_ia32_loaddquhi512_mask",
+        "llvm.x86.avx512.mask.loadu.b.512" => "__builtin_ia32_loaddquqi512_mask",
+        "llvm.x86.avx512.mask.loadu.w.256" => "__builtin_ia32_loaddquhi256_mask",
+        "llvm.x86.avx512.mask.loadu.b.256" => "__builtin_ia32_loaddquqi256_mask",
+        "llvm.x86.avx512.mask.loadu.w.128" => "__builtin_ia32_loaddquhi128_mask",
+        "llvm.x86.avx512.mask.loadu.b.128" => "__builtin_ia32_loaddquqi128_mask",
+        "llvm.x86.avx512.mask.storeu.w.512" => "__builtin_ia32_storedquhi512_mask",
+        "llvm.x86.avx512.mask.storeu.b.512" => "__builtin_ia32_storedquqi512_mask",
+        "llvm.x86.avx512.mask.storeu.w.256" => "__builtin_ia32_storedquhi256_mask",
+        "llvm.x86.avx512.mask.storeu.b.256" => "__builtin_ia32_storedquqi256_mask",
+        "llvm.x86.avx512.mask.storeu.w.128" => "__builtin_ia32_storedquhi128_mask",
+        "llvm.x86.avx512.mask.storeu.b.128" => "__builtin_ia32_storedquqi128_mask",
+        "llvm.x86.avx512.mask.expand.load.w.512" => "__builtin_ia32_expandloadhi512_mask",
+        "llvm.x86.avx512.mask.expand.load.w.256" => "__builtin_ia32_expandloadhi256_mask",
+        "llvm.x86.avx512.mask.expand.load.w.128" => "__builtin_ia32_expandloadhi128_mask",
+        "llvm.x86.avx512.mask.expand.load.b.512" => "__builtin_ia32_expandloadqi512_mask",
+        "llvm.x86.avx512.mask.expand.load.b.256" => "__builtin_ia32_expandloadqi256_mask",
+        "llvm.x86.avx512.mask.expand.load.b.128" => "__builtin_ia32_expandloadqi128_mask",
+        "llvm.x86.avx512.sitofp.round.v8f64.v8i64" => "__builtin_ia32_cvtqq2pd512_mask",
+        "llvm.x86.avx512.sitofp.round.v2f64.v2i64" => "__builtin_ia32_cvtqq2pd128_mask",
+        "llvm.x86.avx512.sitofp.round.v4f64.v4i64" => "__builtin_ia32_cvtqq2pd256_mask",
+        "llvm.x86.avx512.sitofp.round.v8f32.v8i64" => "__builtin_ia32_cvtqq2ps512_mask",
+        "llvm.x86.avx512.sitofp.round.v4f32.v4i64" => "__builtin_ia32_cvtqq2ps256_mask",
 
         // NOTE: this file is generated by https://github.com/GuillaumeGomez/llvmint/blob/master/generate_list.py
         _ => include!("archs.rs"),
diff --git a/src/intrinsic/simd.rs b/src/intrinsic/simd.rs
index d1b4a9689a9..a0d8ff6346f 100644
--- a/src/intrinsic/simd.rs
+++ b/src/intrinsic/simd.rs
@@ -201,7 +201,7 @@ macro_rules! require_simd {
         bx.context.new_bitcast(None, shuffled, v_type)
     };
 
-    if name == sym::simd_bswap || name == sym::simd_bitreverse {
+    if matches!(name, sym::simd_bswap | sym::simd_bitreverse | sym::simd_ctpop) {
         require!(
             bx.type_kind(bx.element_type(llret_ty)) == TypeKind::Integer,
             InvalidMonomorphization::UnsupportedOperation { span, name, in_ty, in_elem }
@@ -212,6 +212,22 @@ macro_rules! require_simd {
         return Ok(simd_bswap(bx, args[0].immediate()));
     }
 
+    let simd_ctpop = |bx: &mut Builder<'a, 'gcc, 'tcx>, vector: RValue<'gcc>| -> RValue<'gcc> {
+        let mut vector_elements = vec![];
+        let elem_ty = bx.element_type(llret_ty);
+        for i in 0..in_len {
+            let index = bx.context.new_rvalue_from_long(bx.ulong_type, i as i64);
+            let element = bx.extract_element(vector, index).to_rvalue();
+            let result = bx.context.new_cast(None, bx.pop_count(element), elem_ty);
+            vector_elements.push(result);
+        }
+        bx.context.new_rvalue_from_vector(None, llret_ty, &vector_elements)
+    };
+
+    if name == sym::simd_ctpop {
+        return Ok(simd_ctpop(bx, args[0].immediate()));
+    }
+
     // We use a different algorithm from non-vector bitreverse to take advantage of most
     // processors' vector shuffle units. It works like this:
     // 1. Generate pre-reversed low and high nibbles as a vector.