From 3b3594044327ea6a426e0b95dd3ffee725089430 Mon Sep 17 00:00:00 2001 From: Antoni Boucher Date: Sat, 14 May 2022 17:36:37 -0400 Subject: [PATCH 01/12] Implement more SIMD --- Cargo.toml | 4 +- src/builder.rs | 2 +- src/intrinsic/llvm.rs | 115 +++++++++++++++++++++++++++---------- src/intrinsic/simd.rs | 130 ++++++++---------------------------------- 4 files changed, 113 insertions(+), 138 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 211d19a8dc8..0e41bec8b76 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,10 +22,10 @@ default = ["master"] master = ["gccjit/master"] [dependencies] -gccjit = { git = "https://github.com/antoyo/gccjit.rs" } +#gccjit = { git = "https://github.com/antoyo/gccjit.rs" } # Local copy. -#gccjit = { path = "../gccjit.rs" } +gccjit = { path = "../gccjit.rs" } target-lexicon = "0.10.0" diff --git a/src/builder.rs b/src/builder.rs index fa490fe3f22..e7adf29fed8 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -1409,7 +1409,7 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> { } #[cfg(not(feature="master"))] - pub fn vector_reduce(&mut self, src: RValue<'gcc>, op: F) -> RValue<'gcc> + pub fn vector_reduce(&mut self, _src: RValue<'gcc>, _op: F) -> RValue<'gcc> where F: Fn(RValue<'gcc>, RValue<'gcc>, &'gcc Context<'gcc>) -> RValue<'gcc> { unimplemented!(); diff --git a/src/intrinsic/llvm.rs b/src/intrinsic/llvm.rs index 1b089f08f76..6b78157410b 100644 --- a/src/intrinsic/llvm.rs +++ b/src/intrinsic/llvm.rs @@ -75,38 +75,38 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc "__builtin_ia32_pternlogd512_mask" | "__builtin_ia32_pternlogd256_mask" | "__builtin_ia32_pternlogd128_mask" | "__builtin_ia32_pternlogq512_mask" | "__builtin_ia32_pternlogq256_mask" | "__builtin_ia32_pternlogq128_mask" => { - let mut new_args = args.to_vec(); + let mut new_args = args.to_vec(); + let arg5_type = gcc_func.get_param_type(4); + let minus_one = builder.context.new_rvalue_from_int(arg5_type, -1); + new_args.push(minus_one); + args = new_args.into(); + }, + "__builtin_ia32_vfmaddps512_mask" | "__builtin_ia32_vfmaddpd512_mask" => { + let mut new_args = args.to_vec(); + + let mut last_arg = None; + if args.len() == 4 { + last_arg = new_args.pop(); + } + + let arg4_type = gcc_func.get_param_type(3); + let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1); + new_args.push(minus_one); + + if args.len() == 3 { + // Both llvm.fma.v16f32 and llvm.x86.avx512.vfmadd.ps.512 maps to + // the same GCC intrinsic, but the former has 3 parameters and the + // latter has 4 so it doesn't require this additional argument. let arg5_type = gcc_func.get_param_type(4); - let minus_one = builder.context.new_rvalue_from_int(arg5_type, -1); - new_args.push(minus_one); - args = new_args.into(); - }, - "__builtin_ia32_vfmaddps512_mask" | "__builtin_ia32_vfmaddpd512_mask" => { - let mut new_args = args.to_vec(); + new_args.push(builder.context.new_rvalue_from_int(arg5_type, 4)); + } - let mut last_arg = None; - if args.len() == 4 { - last_arg = new_args.pop(); - } + if let Some(last_arg) = last_arg { + new_args.push(last_arg); + } - let arg4_type = gcc_func.get_param_type(3); - let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1); - new_args.push(minus_one); - - if args.len() == 3 { - // Both llvm.fma.v16f32 and llvm.x86.avx512.vfmadd.ps.512 maps to - // the same GCC intrinsic, but the former has 3 parameters and the - // latter has 4 so it doesn't require this additional argument. - let arg5_type = gcc_func.get_param_type(4); - new_args.push(builder.context.new_rvalue_from_int(arg5_type, 4)); - } - - if let Some(last_arg) = last_arg { - new_args.push(last_arg); - } - - args = new_args.into(); - }, + args = new_args.into(); + }, "__builtin_ia32_addps512_mask" | "__builtin_ia32_addpd512_mask" | "__builtin_ia32_subps512_mask" | "__builtin_ia32_subpd512_mask" | "__builtin_ia32_mulps512_mask" | "__builtin_ia32_mulpd512_mask" @@ -131,6 +131,18 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc new_args.push(last_arg); args = new_args.into(); }, + "__builtin_ia32_cvtdq2ps512_mask" | "__builtin_ia32_cvtudq2ps512_mask" => { + let mut new_args = args.to_vec(); + let last_arg = new_args.pop().expect("last arg"); + let arg2_type = gcc_func.get_param_type(1); + let undefined = builder.current_func().new_local(None, arg2_type, "undefined_for_intrinsic").to_rvalue(); + new_args.push(undefined); + let arg3_type = gcc_func.get_param_type(2); + let minus_one = builder.context.new_rvalue_from_int(arg3_type, -1); + new_args.push(minus_one); + new_args.push(last_arg); + args = new_args.into(); + }, _ => (), } } @@ -149,7 +161,8 @@ pub fn ignore_arg_cast(func_name: &str, index: usize, args_len: usize) -> bool { | "__builtin_ia32_subps512_mask" | "__builtin_ia32_subpd512_mask" | "__builtin_ia32_mulps512_mask" | "__builtin_ia32_mulpd512_mask" | "__builtin_ia32_divps512_mask" | "__builtin_ia32_divpd512_mask" - | "__builtin_ia32_vfmaddsubps512_mask" | "__builtin_ia32_vfmaddsubpd512_mask" => { + | "__builtin_ia32_vfmaddsubps512_mask" | "__builtin_ia32_vfmaddsubpd512_mask" + | "__builtin_ia32_cvtdq2ps512_mask" | "__builtin_ia32_cvtudq2ps512_mask" => { if index == args_len - 1 { return true; } @@ -221,6 +234,48 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function "llvm.x86.avx512.div.pd.512" => "__builtin_ia32_divpd512_mask", "llvm.x86.avx512.vfmadd.ps.512" => "__builtin_ia32_vfmaddps512_mask", "llvm.x86.avx512.vfmadd.pd.512" => "__builtin_ia32_vfmaddpd512_mask", + "llvm.x86.avx512.sitofp.round.v16f32.v16i32" => "__builtin_ia32_cvtdq2ps512_mask", + "llvm.x86.avx512.uitofp.round.v16f32.v16i32" => "__builtin_ia32_cvtudq2ps512_mask", + "llvm.x86.avx512.mask.cvttps2dq.256" => "__builtin_ia32_cvttps2dq256_mask", + "llvm.x86.avx512.mask.cvttps2dq.128" => "__builtin_ia32_cvttps2dq128_mask", + "llvm.x86.avx512.mask.cvttpd2dq.256" => "__builtin_ia32_cvttpd2dq256_mask", + "llvm.x86.avx512.mask.compress.d.512" => "__builtin_ia32_compresssi512_mask", + "llvm.x86.avx512.mask.compress.d.256" => "__builtin_ia32_compresssi256_mask", + "llvm.x86.avx512.mask.compress.d.128" => "__builtin_ia32_compresssi128_mask", + "llvm.x86.avx512.mask.compress.q.512" => "__builtin_ia32_compressdi512_mask", + "llvm.x86.avx512.mask.compress.q.256" => "__builtin_ia32_compressdi256_mask", + "llvm.x86.avx512.mask.compress.q.128" => "__builtin_ia32_compressdi128_mask", + "llvm.x86.avx512.mask.compress.ps.512" => "__builtin_ia32_compresssf512_mask", + "llvm.x86.avx512.mask.compress.ps.256" => "__builtin_ia32_compresssf256_mask", + "llvm.x86.avx512.mask.compress.ps.128" => "__builtin_ia32_compresssf128_mask", + "llvm.x86.avx512.mask.compress.pd.512" => "__builtin_ia32_compressdf512_mask", + "llvm.x86.avx512.mask.compress.pd.256" => "__builtin_ia32_compressdf256_mask", + "llvm.x86.avx512.mask.compress.pd.128" => "__builtin_ia32_compressdf128_mask", + "llvm.x86.avx512.mask.compress.store.d.512" => "", + "llvm.x86.avx512.mask.compress.store.d.256" => "", + "llvm.x86.avx512.mask.compress.store.d.128" => "", + "llvm.x86.avx512.mask.compress.store.q.512" => "", + "llvm.x86.avx512.mask.compress.store.q.256" => "", + "llvm.x86.avx512.mask.compress.store.q.128" => "", + "llvm.x86.avx512.mask.compress.store.ps.512" => "", + "llvm.x86.avx512.mask.compress.store.ps.256" => "", + "llvm.x86.avx512.mask.compress.store.ps.128" => "", + "llvm.x86.avx512.mask.compress.store.pd.512" => "", + "llvm.x86.avx512.mask.compress.store.pd.256" => "", + "llvm.x86.avx512.mask.compress.store.pd.128" => "", + "llvm.x86.avx512.mask.expand.d.512" => "", + "llvm.x86.avx512.mask.expand.d.256" => "", + "llvm.x86.avx512.mask.expand.d.128" => "", + "llvm.x86.avx512.mask.expand.q.512" => "", + "" => "", + "" => "", + "" => "", + "" => "", + "" => "", + "" => "", + "" => "", + "" => "", + "" => "", // The above doc points to unknown builtins for the following, so override them: "llvm.x86.avx2.gather.d.d" => "__builtin_ia32_gathersiv4si", diff --git a/src/intrinsic/simd.rs b/src/intrinsic/simd.rs index 870e9f776a4..a6cf99c62ff 100644 --- a/src/intrinsic/simd.rs +++ b/src/intrinsic/simd.rs @@ -1,5 +1,3 @@ -use std::cmp::Ordering; - use gccjit::{BinaryOp, RValue, Type, ToRValue}; use rustc_codegen_ssa::base::compare_simd_types; use rustc_codegen_ssa::common::{TypeKind, span_invalid_monomorphization_error}; @@ -309,117 +307,37 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>, enum Style { Float, - Int(/* is signed? */ bool), + Int, Unsupported, } - let (in_style, in_width) = match in_elem.kind() { - // vectors of pointer-sized integers should've been - // disallowed before here, so this unwrap is safe. - ty::Int(i) => ( - Style::Int(true), - i.normalize(bx.tcx().sess.target.pointer_width).bit_width().unwrap(), - ), - ty::Uint(u) => ( - Style::Int(false), - u.normalize(bx.tcx().sess.target.pointer_width).bit_width().unwrap(), - ), - ty::Float(f) => (Style::Float, f.bit_width()), - _ => (Style::Unsupported, 0), - }; - let (out_style, out_width) = match out_elem.kind() { - ty::Int(i) => ( - Style::Int(true), - i.normalize(bx.tcx().sess.target.pointer_width).bit_width().unwrap(), - ), - ty::Uint(u) => ( - Style::Int(false), - u.normalize(bx.tcx().sess.target.pointer_width).bit_width().unwrap(), - ), - ty::Float(f) => (Style::Float, f.bit_width()), - _ => (Style::Unsupported, 0), - }; - - let extend = |in_type, out_type| { - let vector_type = bx.context.new_vector_type(out_type, 8); - let vector = args[0].immediate(); - let array_type = bx.context.new_array_type(None, in_type, 8); - // TODO(antoyo): switch to using new_vector_access or __builtin_convertvector for vector casting. - let array = bx.context.new_bitcast(None, vector, array_type); - - let cast_vec_element = |index| { - let index = bx.context.new_rvalue_from_int(bx.int_type, index); - bx.context.new_cast(None, bx.context.new_array_access(None, array, index).to_rvalue(), out_type) + let in_style = + match in_elem.kind() { + ty::Int(_) | ty::Uint(_) => Style::Int, + ty::Float(_) => Style::Float, + _ => Style::Unsupported, }; - bx.context.new_rvalue_from_vector(None, vector_type, &[ - cast_vec_element(0), - cast_vec_element(1), - cast_vec_element(2), - cast_vec_element(3), - cast_vec_element(4), - cast_vec_element(5), - cast_vec_element(6), - cast_vec_element(7), - ]) - }; + let out_style = + match out_elem.kind() { + ty::Int(_) | ty::Uint(_) => Style::Int, + ty::Float(_) => Style::Float, + _ => Style::Unsupported, + }; match (in_style, out_style) { - (Style::Int(in_is_signed), Style::Int(_)) => { - return Ok(match in_width.cmp(&out_width) { - Ordering::Greater => bx.trunc(args[0].immediate(), llret_ty), - Ordering::Equal => args[0].immediate(), - Ordering::Less => { - if in_is_signed { - match (in_width, out_width) { - // FIXME(antoyo): the function _mm_cvtepi8_epi16 should directly - // call an intrinsic equivalent to __builtin_ia32_pmovsxbw128 so that - // we can generate a call to it. - (8, 16) => extend(bx.i8_type, bx.i16_type), - (8, 32) => extend(bx.i8_type, bx.i32_type), - (8, 64) => extend(bx.i8_type, bx.i64_type), - (16, 32) => extend(bx.i16_type, bx.i32_type), - (32, 64) => extend(bx.i32_type, bx.i64_type), - (16, 64) => extend(bx.i16_type, bx.i64_type), - _ => unimplemented!("in: {}, out: {}", in_width, out_width), - } - } else { - match (in_width, out_width) { - (8, 16) => extend(bx.u8_type, bx.u16_type), - (8, 32) => extend(bx.u8_type, bx.u32_type), - (8, 64) => extend(bx.u8_type, bx.u64_type), - (16, 32) => extend(bx.u16_type, bx.u32_type), - (16, 64) => extend(bx.u16_type, bx.u64_type), - (32, 64) => extend(bx.u32_type, bx.u64_type), - _ => unimplemented!("in: {}, out: {}", in_width, out_width), - } - } - } - }); - } - (Style::Int(_), Style::Float) => { - // TODO: add support for internal functions in libgccjit to get access to IFN_VEC_CONVERT which is - // doing like __builtin_convertvector? - // Or maybe provide convert_vector as an API since it might not easy to get the - // types of internal functions. - unimplemented!(); - } - (Style::Float, Style::Int(_)) => { - unimplemented!(); - } - (Style::Float, Style::Float) => { - unimplemented!(); - } - _ => { /* Unsupported. Fallthrough. */ } + (Style::Unsupported, Style::Unsupported) => { + require!( + false, + "unsupported cast from `{}` with element `{}` to `{}` with element `{}`", + in_ty, + in_elem, + ret_ty, + out_elem + ); + }, + _ => return Ok(bx.context.convert_vector(None, args[0].immediate(), llret_ty)), } - require!( - false, - "unsupported cast from `{}` with element `{}` to `{}` with element `{}`", - in_ty, - in_elem, - ret_ty, - out_elem - ); } macro_rules! arith_binary { @@ -590,6 +508,8 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>, ); } }; + // TODO(antoyo): don't use target specific builtins here. + // Not sure how easy it would be to avoid theme here. let builtin_name = match (signed, is_add, in_len, elem_width) { (true, true, 32, 8) => "__builtin_ia32_paddsb256", // TODO(antoyo): cast arguments to unsigned. From 4e802c84c587bf0524fe672ed81ed97e33afea9d Mon Sep 17 00:00:00 2001 From: Antoni Boucher Date: Sun, 15 May 2022 10:49:09 -0400 Subject: [PATCH 02/12] Remove intrinsics that were adding by the updated script --- src/intrinsic/llvm.rs | 132 +++++++++++++++--------------------------- 1 file changed, 47 insertions(+), 85 deletions(-) diff --git a/src/intrinsic/llvm.rs b/src/intrinsic/llvm.rs index 6b78157410b..a4cd05a13c7 100644 --- a/src/intrinsic/llvm.rs +++ b/src/intrinsic/llvm.rs @@ -107,43 +107,53 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc args = new_args.into(); }, - "__builtin_ia32_addps512_mask" | "__builtin_ia32_addpd512_mask" - | "__builtin_ia32_subps512_mask" | "__builtin_ia32_subpd512_mask" - | "__builtin_ia32_mulps512_mask" | "__builtin_ia32_mulpd512_mask" - | "__builtin_ia32_divps512_mask" | "__builtin_ia32_divpd512_mask" => { - let mut new_args = args.to_vec(); - let last_arg = new_args.pop().expect("last arg"); - let arg3_type = gcc_func.get_param_type(2); - let undefined = builder.current_func().new_local(None, arg3_type, "undefined_for_intrinsic").to_rvalue(); - new_args.push(undefined); - let arg4_type = gcc_func.get_param_type(3); - let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1); - new_args.push(minus_one); - new_args.push(last_arg); - args = new_args.into(); - }, - "__builtin_ia32_vfmaddsubps512_mask" | "__builtin_ia32_vfmaddsubpd512_mask" => { - let mut new_args = args.to_vec(); - let last_arg = new_args.pop().expect("last arg"); - let arg4_type = gcc_func.get_param_type(3); - let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1); - new_args.push(minus_one); - new_args.push(last_arg); - args = new_args.into(); - }, - "__builtin_ia32_cvtdq2ps512_mask" | "__builtin_ia32_cvtudq2ps512_mask" => { - let mut new_args = args.to_vec(); - let last_arg = new_args.pop().expect("last arg"); - let arg2_type = gcc_func.get_param_type(1); - let undefined = builder.current_func().new_local(None, arg2_type, "undefined_for_intrinsic").to_rvalue(); - new_args.push(undefined); - let arg3_type = gcc_func.get_param_type(2); - let minus_one = builder.context.new_rvalue_from_int(arg3_type, -1); - new_args.push(minus_one); - new_args.push(last_arg); - args = new_args.into(); - }, - _ => (), + "__builtin_ia32_addps512_mask" | "__builtin_ia32_addpd512_mask" + | "__builtin_ia32_subps512_mask" | "__builtin_ia32_subpd512_mask" + | "__builtin_ia32_mulps512_mask" | "__builtin_ia32_mulpd512_mask" + | "__builtin_ia32_divps512_mask" | "__builtin_ia32_divpd512_mask" => { + let mut new_args = args.to_vec(); + let last_arg = new_args.pop().expect("last arg"); + let arg3_type = gcc_func.get_param_type(2); + let undefined = builder.current_func().new_local(None, arg3_type, "undefined_for_intrinsic").to_rvalue(); + new_args.push(undefined); + let arg4_type = gcc_func.get_param_type(3); + let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1); + new_args.push(minus_one); + new_args.push(last_arg); + args = new_args.into(); + }, + "__builtin_ia32_prold512_mask" => { + let mut new_args = args.to_vec(); + let arg3_type = gcc_func.get_param_type(2); + let undefined = builder.current_func().new_local(None, arg3_type, "undefined_for_intrinsic").to_rvalue(); + new_args.push(undefined); + let arg4_type = gcc_func.get_param_type(3); + let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1); + new_args.push(minus_one); + args = new_args.into(); + }, + "__builtin_ia32_vfmaddsubps512_mask" | "__builtin_ia32_vfmaddsubpd512_mask" => { + let mut new_args = args.to_vec(); + let last_arg = new_args.pop().expect("last arg"); + let arg4_type = gcc_func.get_param_type(3); + let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1); + new_args.push(minus_one); + new_args.push(last_arg); + args = new_args.into(); + }, + "__builtin_ia32_cvtdq2ps512_mask" | "__builtin_ia32_cvtudq2ps512_mask" => { + let mut new_args = args.to_vec(); + let last_arg = new_args.pop().expect("last arg"); + let arg2_type = gcc_func.get_param_type(1); + let undefined = builder.current_func().new_local(None, arg2_type, "undefined_for_intrinsic").to_rvalue(); + new_args.push(undefined); + let arg3_type = gcc_func.get_param_type(2); + let minus_one = builder.context.new_rvalue_from_int(arg3_type, -1); + new_args.push(minus_one); + new_args.push(last_arg); + args = new_args.into(); + }, + _ => (), } } @@ -202,18 +212,10 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function "llvm.sqrt.v2f64" => "__builtin_ia32_sqrtpd", "llvm.x86.avx512.pmul.dq.512" => "__builtin_ia32_pmuldq512_mask", "llvm.x86.avx512.pmulu.dq.512" => "__builtin_ia32_pmuludq512_mask", - "llvm.x86.avx512.mask.pmaxs.q.256" => "__builtin_ia32_pmaxsq256_mask", - "llvm.x86.avx512.mask.pmaxs.q.128" => "__builtin_ia32_pmaxsq128_mask", "llvm.x86.avx512.max.ps.512" => "__builtin_ia32_maxps512_mask", "llvm.x86.avx512.max.pd.512" => "__builtin_ia32_maxpd512_mask", - "llvm.x86.avx512.mask.pmaxu.q.256" => "__builtin_ia32_pmaxuq256_mask", - "llvm.x86.avx512.mask.pmaxu.q.128" => "__builtin_ia32_pmaxuq128_mask", - "llvm.x86.avx512.mask.pmins.q.256" => "__builtin_ia32_pminsq256_mask", - "llvm.x86.avx512.mask.pmins.q.128" => "__builtin_ia32_pminsq128_mask", "llvm.x86.avx512.min.ps.512" => "__builtin_ia32_minps512_mask", "llvm.x86.avx512.min.pd.512" => "__builtin_ia32_minpd512_mask", - "llvm.x86.avx512.mask.pminu.q.256" => "__builtin_ia32_pminuq256_mask", - "llvm.x86.avx512.mask.pminu.q.128" => "__builtin_ia32_pminuq128_mask", "llvm.fma.v16f32" => "__builtin_ia32_vfmaddps512_mask", "llvm.fma.v8f64" => "__builtin_ia32_vfmaddpd512_mask", "llvm.x86.avx512.vfmaddsub.ps.512" => "__builtin_ia32_vfmaddsubps512_mask", @@ -236,46 +238,6 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function "llvm.x86.avx512.vfmadd.pd.512" => "__builtin_ia32_vfmaddpd512_mask", "llvm.x86.avx512.sitofp.round.v16f32.v16i32" => "__builtin_ia32_cvtdq2ps512_mask", "llvm.x86.avx512.uitofp.round.v16f32.v16i32" => "__builtin_ia32_cvtudq2ps512_mask", - "llvm.x86.avx512.mask.cvttps2dq.256" => "__builtin_ia32_cvttps2dq256_mask", - "llvm.x86.avx512.mask.cvttps2dq.128" => "__builtin_ia32_cvttps2dq128_mask", - "llvm.x86.avx512.mask.cvttpd2dq.256" => "__builtin_ia32_cvttpd2dq256_mask", - "llvm.x86.avx512.mask.compress.d.512" => "__builtin_ia32_compresssi512_mask", - "llvm.x86.avx512.mask.compress.d.256" => "__builtin_ia32_compresssi256_mask", - "llvm.x86.avx512.mask.compress.d.128" => "__builtin_ia32_compresssi128_mask", - "llvm.x86.avx512.mask.compress.q.512" => "__builtin_ia32_compressdi512_mask", - "llvm.x86.avx512.mask.compress.q.256" => "__builtin_ia32_compressdi256_mask", - "llvm.x86.avx512.mask.compress.q.128" => "__builtin_ia32_compressdi128_mask", - "llvm.x86.avx512.mask.compress.ps.512" => "__builtin_ia32_compresssf512_mask", - "llvm.x86.avx512.mask.compress.ps.256" => "__builtin_ia32_compresssf256_mask", - "llvm.x86.avx512.mask.compress.ps.128" => "__builtin_ia32_compresssf128_mask", - "llvm.x86.avx512.mask.compress.pd.512" => "__builtin_ia32_compressdf512_mask", - "llvm.x86.avx512.mask.compress.pd.256" => "__builtin_ia32_compressdf256_mask", - "llvm.x86.avx512.mask.compress.pd.128" => "__builtin_ia32_compressdf128_mask", - "llvm.x86.avx512.mask.compress.store.d.512" => "", - "llvm.x86.avx512.mask.compress.store.d.256" => "", - "llvm.x86.avx512.mask.compress.store.d.128" => "", - "llvm.x86.avx512.mask.compress.store.q.512" => "", - "llvm.x86.avx512.mask.compress.store.q.256" => "", - "llvm.x86.avx512.mask.compress.store.q.128" => "", - "llvm.x86.avx512.mask.compress.store.ps.512" => "", - "llvm.x86.avx512.mask.compress.store.ps.256" => "", - "llvm.x86.avx512.mask.compress.store.ps.128" => "", - "llvm.x86.avx512.mask.compress.store.pd.512" => "", - "llvm.x86.avx512.mask.compress.store.pd.256" => "", - "llvm.x86.avx512.mask.compress.store.pd.128" => "", - "llvm.x86.avx512.mask.expand.d.512" => "", - "llvm.x86.avx512.mask.expand.d.256" => "", - "llvm.x86.avx512.mask.expand.d.128" => "", - "llvm.x86.avx512.mask.expand.q.512" => "", - "" => "", - "" => "", - "" => "", - "" => "", - "" => "", - "" => "", - "" => "", - "" => "", - "" => "", // The above doc points to unknown builtins for the following, so override them: "llvm.x86.avx2.gather.d.d" => "__builtin_ia32_gathersiv4si", From f59d345fc1979ff0b9dc14f4ffcf6a3bd3c6368f Mon Sep 17 00:00:00 2001 From: Antoni Boucher Date: Sun, 15 May 2022 11:43:57 -0400 Subject: [PATCH 03/12] Refactor --- src/intrinsic/llvm.rs | 239 ++++++++++++++++++------------------------ 1 file changed, 100 insertions(+), 139 deletions(-) diff --git a/src/intrinsic/llvm.rs b/src/intrinsic/llvm.rs index a4cd05a13c7..2a6739e74b0 100644 --- a/src/intrinsic/llvm.rs +++ b/src/intrinsic/llvm.rs @@ -9,151 +9,112 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc // arguments here. if gcc_func.get_param_count() != args.len() { match &*func_name { - "__builtin_ia32_pmuldq512_mask" | "__builtin_ia32_pmuludq512_mask" - // FIXME(antoyo): the following intrinsics has 4 (or 5) arguments according to the doc, but is defined with 2 (or 3) arguments in library/stdarch/crates/core_arch/src/x86/avx512f.rs. + // NOTE: the following intrinsics have a different number of parameters in LLVM and GCC. + "__builtin_ia32_prold512_mask" | "__builtin_ia32_pmuldq512_mask" | "__builtin_ia32_pmuludq512_mask" | "__builtin_ia32_pmaxsd512_mask" | "__builtin_ia32_pmaxsq512_mask" | "__builtin_ia32_pmaxsq256_mask" - | "__builtin_ia32_pmaxsq128_mask" | "__builtin_ia32_maxps512_mask" | "__builtin_ia32_maxpd512_mask" - | "__builtin_ia32_pmaxud512_mask" | "__builtin_ia32_pmaxuq512_mask" | "__builtin_ia32_pmaxuq256_mask" - | "__builtin_ia32_pmaxuq128_mask" + | "__builtin_ia32_pmaxsq128_mask" | "__builtin_ia32_pmaxud512_mask" | "__builtin_ia32_pmaxuq512_mask" | "__builtin_ia32_pminsd512_mask" | "__builtin_ia32_pminsq512_mask" | "__builtin_ia32_pminsq256_mask" - | "__builtin_ia32_pminsq128_mask" | "__builtin_ia32_minps512_mask" | "__builtin_ia32_minpd512_mask" - | "__builtin_ia32_pminud512_mask" | "__builtin_ia32_pminuq512_mask" | "__builtin_ia32_pminuq256_mask" - | "__builtin_ia32_pminuq128_mask" | "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" + | "__builtin_ia32_pminsq128_mask" | "__builtin_ia32_pminud512_mask" | "__builtin_ia32_pminuq512_mask" => { - // TODO: refactor by separating those intrinsics outside of this branch. - let add_before_last_arg = - match &*func_name { - "__builtin_ia32_maxps512_mask" | "__builtin_ia32_maxpd512_mask" - | "__builtin_ia32_minps512_mask" | "__builtin_ia32_minpd512_mask" - | "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" => true, - _ => false, - }; - let new_first_arg_is_zero = - match &*func_name { - "__builtin_ia32_pmaxuq256_mask" | "__builtin_ia32_pmaxuq128_mask" - | "__builtin_ia32_pminuq256_mask" | "__builtin_ia32_pminuq128_mask" => true, - _ => false - }; - let arg3_index = - match &*func_name { - "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" => 1, - _ => 2, - }; - let mut new_args = args.to_vec(); - let arg3_type = gcc_func.get_param_type(arg3_index); - let first_arg = - if new_first_arg_is_zero { - let vector_type = arg3_type.dyncast_vector().expect("vector type"); - let zero = builder.context.new_rvalue_zero(vector_type.get_element_type()); - let num_units = vector_type.get_num_units(); - builder.context.new_rvalue_from_vector(None, arg3_type, &vec![zero; num_units]) - } - else { - builder.current_func().new_local(None, arg3_type, "undefined_for_intrinsic").to_rvalue() - }; - if add_before_last_arg { - new_args.insert(new_args.len() - 1, first_arg); - } - else { - new_args.push(first_arg); - } - let arg4_index = - match &*func_name { - "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" => 2, - _ => 3, - }; - let arg4_type = gcc_func.get_param_type(arg4_index); - let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1); - if add_before_last_arg { - new_args.insert(new_args.len() - 1, minus_one); - } - else { - new_args.push(minus_one); - } - args = new_args.into(); - }, - "__builtin_ia32_pternlogd512_mask" | "__builtin_ia32_pternlogd256_mask" - | "__builtin_ia32_pternlogd128_mask" | "__builtin_ia32_pternlogq512_mask" - | "__builtin_ia32_pternlogq256_mask" | "__builtin_ia32_pternlogq128_mask" => { - let mut new_args = args.to_vec(); + let mut new_args = args.to_vec(); + let arg3_type = gcc_func.get_param_type(2); + let first_arg = builder.current_func().new_local(None, arg3_type, "undefined_for_intrinsic").to_rvalue(); + new_args.push(first_arg); + let arg4_type = gcc_func.get_param_type(3); + let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1); + new_args.push(minus_one); + args = new_args.into(); + }, + "__builtin_ia32_pmaxuq256_mask" | "__builtin_ia32_pmaxuq128_mask" + | "__builtin_ia32_pminuq256_mask" | "__builtin_ia32_pminuq128_mask" + => { + let mut new_args = args.to_vec(); + let arg3_type = gcc_func.get_param_type(2); + let vector_type = arg3_type.dyncast_vector().expect("vector type"); + let zero = builder.context.new_rvalue_zero(vector_type.get_element_type()); + let num_units = vector_type.get_num_units(); + let first_arg = builder.context.new_rvalue_from_vector(None, arg3_type, &vec![zero; num_units]); + new_args.push(first_arg); + let arg4_type = gcc_func.get_param_type(3); + let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1); + new_args.push(minus_one); + args = new_args.into(); + }, + "__builtin_ia32_pternlogd512_mask" | "__builtin_ia32_pternlogd256_mask" + | "__builtin_ia32_pternlogd128_mask" | "__builtin_ia32_pternlogq512_mask" + | "__builtin_ia32_pternlogq256_mask" | "__builtin_ia32_pternlogq128_mask" => { + let mut new_args = args.to_vec(); + let arg5_type = gcc_func.get_param_type(4); + let minus_one = builder.context.new_rvalue_from_int(arg5_type, -1); + new_args.push(minus_one); + args = new_args.into(); + }, + "__builtin_ia32_vfmaddps512_mask" | "__builtin_ia32_vfmaddpd512_mask" => { + let mut new_args = args.to_vec(); + + let mut last_arg = None; + if args.len() == 4 { + last_arg = new_args.pop(); + } + + let arg4_type = gcc_func.get_param_type(3); + let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1); + new_args.push(minus_one); + + if args.len() == 3 { + // Both llvm.fma.v16f32 and llvm.x86.avx512.vfmadd.ps.512 maps to + // the same GCC intrinsic, but the former has 3 parameters and the + // latter has 4 so it doesn't require this additional argument. let arg5_type = gcc_func.get_param_type(4); - let minus_one = builder.context.new_rvalue_from_int(arg5_type, -1); - new_args.push(minus_one); - args = new_args.into(); - }, - "__builtin_ia32_vfmaddps512_mask" | "__builtin_ia32_vfmaddpd512_mask" => { - let mut new_args = args.to_vec(); + new_args.push(builder.context.new_rvalue_from_int(arg5_type, 4)); + } - let mut last_arg = None; - if args.len() == 4 { - last_arg = new_args.pop(); - } - - let arg4_type = gcc_func.get_param_type(3); - let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1); - new_args.push(minus_one); - - if args.len() == 3 { - // Both llvm.fma.v16f32 and llvm.x86.avx512.vfmadd.ps.512 maps to - // the same GCC intrinsic, but the former has 3 parameters and the - // latter has 4 so it doesn't require this additional argument. - let arg5_type = gcc_func.get_param_type(4); - new_args.push(builder.context.new_rvalue_from_int(arg5_type, 4)); - } - - if let Some(last_arg) = last_arg { - new_args.push(last_arg); - } - - args = new_args.into(); - }, - "__builtin_ia32_addps512_mask" | "__builtin_ia32_addpd512_mask" - | "__builtin_ia32_subps512_mask" | "__builtin_ia32_subpd512_mask" - | "__builtin_ia32_mulps512_mask" | "__builtin_ia32_mulpd512_mask" - | "__builtin_ia32_divps512_mask" | "__builtin_ia32_divpd512_mask" => { - let mut new_args = args.to_vec(); - let last_arg = new_args.pop().expect("last arg"); - let arg3_type = gcc_func.get_param_type(2); - let undefined = builder.current_func().new_local(None, arg3_type, "undefined_for_intrinsic").to_rvalue(); - new_args.push(undefined); - let arg4_type = gcc_func.get_param_type(3); - let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1); - new_args.push(minus_one); + if let Some(last_arg) = last_arg { new_args.push(last_arg); - args = new_args.into(); - }, - "__builtin_ia32_prold512_mask" => { - let mut new_args = args.to_vec(); - let arg3_type = gcc_func.get_param_type(2); - let undefined = builder.current_func().new_local(None, arg3_type, "undefined_for_intrinsic").to_rvalue(); - new_args.push(undefined); - let arg4_type = gcc_func.get_param_type(3); - let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1); - new_args.push(minus_one); - args = new_args.into(); - }, - "__builtin_ia32_vfmaddsubps512_mask" | "__builtin_ia32_vfmaddsubpd512_mask" => { - let mut new_args = args.to_vec(); - let last_arg = new_args.pop().expect("last arg"); - let arg4_type = gcc_func.get_param_type(3); - let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1); - new_args.push(minus_one); - new_args.push(last_arg); - args = new_args.into(); - }, - "__builtin_ia32_cvtdq2ps512_mask" | "__builtin_ia32_cvtudq2ps512_mask" => { - let mut new_args = args.to_vec(); - let last_arg = new_args.pop().expect("last arg"); - let arg2_type = gcc_func.get_param_type(1); - let undefined = builder.current_func().new_local(None, arg2_type, "undefined_for_intrinsic").to_rvalue(); - new_args.push(undefined); - let arg3_type = gcc_func.get_param_type(2); - let minus_one = builder.context.new_rvalue_from_int(arg3_type, -1); - new_args.push(minus_one); - new_args.push(last_arg); - args = new_args.into(); - }, - _ => (), + } + + args = new_args.into(); + }, + "__builtin_ia32_addps512_mask" | "__builtin_ia32_addpd512_mask" + | "__builtin_ia32_subps512_mask" | "__builtin_ia32_subpd512_mask" + | "__builtin_ia32_mulps512_mask" | "__builtin_ia32_mulpd512_mask" + | "__builtin_ia32_divps512_mask" | "__builtin_ia32_divpd512_mask" + | "__builtin_ia32_maxps512_mask" | "__builtin_ia32_maxpd512_mask" + | "__builtin_ia32_minps512_mask" | "__builtin_ia32_minpd512_mask" => { + let mut new_args = args.to_vec(); + let last_arg = new_args.pop().expect("last arg"); + let arg3_type = gcc_func.get_param_type(2); + let undefined = builder.current_func().new_local(None, arg3_type, "undefined_for_intrinsic").to_rvalue(); + new_args.push(undefined); + let arg4_type = gcc_func.get_param_type(3); + let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1); + new_args.push(minus_one); + new_args.push(last_arg); + args = new_args.into(); + }, + "__builtin_ia32_vfmaddsubps512_mask" | "__builtin_ia32_vfmaddsubpd512_mask" => { + let mut new_args = args.to_vec(); + let last_arg = new_args.pop().expect("last arg"); + let arg4_type = gcc_func.get_param_type(3); + let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1); + new_args.push(minus_one); + new_args.push(last_arg); + args = new_args.into(); + }, + "__builtin_ia32_cvtdq2ps512_mask" | "__builtin_ia32_cvtudq2ps512_mask" + | "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" => { + let mut new_args = args.to_vec(); + let last_arg = new_args.pop().expect("last arg"); + let arg2_type = gcc_func.get_param_type(1); + let undefined = builder.current_func().new_local(None, arg2_type, "undefined_for_intrinsic").to_rvalue(); + new_args.push(undefined); + let arg3_type = gcc_func.get_param_type(2); + let minus_one = builder.context.new_rvalue_from_int(arg3_type, -1); + new_args.push(minus_one); + new_args.push(last_arg); + args = new_args.into(); + }, + _ => (), } } From 2fa6a9080b5c3bc963f649832f1fdf9fbcd8abd2 Mon Sep 17 00:00:00 2001 From: Antoni Boucher Date: Sun, 15 May 2022 12:07:28 -0400 Subject: [PATCH 04/12] Add more SIMD --- src/intrinsic/llvm.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/intrinsic/llvm.rs b/src/intrinsic/llvm.rs index 2a6739e74b0..241cff20459 100644 --- a/src/intrinsic/llvm.rs +++ b/src/intrinsic/llvm.rs @@ -15,6 +15,7 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc | "__builtin_ia32_pmaxsq128_mask" | "__builtin_ia32_pmaxud512_mask" | "__builtin_ia32_pmaxuq512_mask" | "__builtin_ia32_pminsd512_mask" | "__builtin_ia32_pminsq512_mask" | "__builtin_ia32_pminsq256_mask" | "__builtin_ia32_pminsq128_mask" | "__builtin_ia32_pminud512_mask" | "__builtin_ia32_pminuq512_mask" + | "__builtin_ia32_prolq512_mask" | "__builtin_ia32_prorq512_mask" => { let mut new_args = args.to_vec(); let arg3_type = gcc_func.get_param_type(2); @@ -25,8 +26,11 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc new_args.push(minus_one); args = new_args.into(); }, - "__builtin_ia32_pmaxuq256_mask" | "__builtin_ia32_pmaxuq128_mask" - | "__builtin_ia32_pminuq256_mask" | "__builtin_ia32_pminuq128_mask" + "__builtin_ia32_pmaxuq256_mask" | "__builtin_ia32_pmaxuq128_mask" | "__builtin_ia32_pminuq256_mask" + | "__builtin_ia32_pminuq128_mask" | "__builtin_ia32_prold256_mask" | "__builtin_ia32_prold128_mask" + | "__builtin_ia32_prord512_mask" | "__builtin_ia32_prord256_mask" | "__builtin_ia32_prord128_mask" + | "__builtin_ia32_prolq256_mask" | "__builtin_ia32_prolq128_mask" | "__builtin_ia32_prorq256_mask" + | "__builtin_ia32_prorq128_mask" => { let mut new_args = args.to_vec(); let arg3_type = gcc_func.get_param_type(2); From 06073c9dfc15dd07b611e3497539dcab547ce185 Mon Sep 17 00:00:00 2001 From: Antoni Boucher Date: Sun, 15 May 2022 20:55:51 -0400 Subject: [PATCH 05/12] Add more SIMD --- src/builder.rs | 6 +- src/intrinsic/llvm.rs | 213 ++++++++++++++++++++++++++++++++++++++- src/intrinsic/simd.rs | 226 ++++++++++++++++++++++++++++-------------- 3 files changed, 362 insertions(+), 83 deletions(-) diff --git a/src/builder.rs b/src/builder.rs index e7adf29fed8..b23d12cb0b6 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -286,10 +286,12 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> { if return_type != void_type { unsafe { RETURN_VALUE_COUNT += 1 }; - let result = current_func.new_local(None, return_type, &format!("ptrReturnValue{}", unsafe { RETURN_VALUE_COUNT })); let func_name = format!("{:?}", func_ptr); let args = llvm::adjust_intrinsic_arguments(&self, gcc_func, args, &func_name); - self.block.add_assignment(None, result, self.cx.context.new_call_through_ptr(None, func_ptr, &args)); + let return_value = self.cx.context.new_call_through_ptr(None, func_ptr, &args); + let return_value = llvm::adjust_intrinsic_return_value(&self, return_value, &func_name, &args); + let result = current_func.new_local(None, return_value.get_type(), &format!("ptrReturnValue{}", unsafe { RETURN_VALUE_COUNT })); + self.block.add_assignment(None, result, return_value); result.to_rvalue() } else { diff --git a/src/intrinsic/llvm.rs b/src/intrinsic/llvm.rs index 241cff20459..0e75724122c 100644 --- a/src/intrinsic/llvm.rs +++ b/src/intrinsic/llvm.rs @@ -15,7 +15,17 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc | "__builtin_ia32_pmaxsq128_mask" | "__builtin_ia32_pmaxud512_mask" | "__builtin_ia32_pmaxuq512_mask" | "__builtin_ia32_pminsd512_mask" | "__builtin_ia32_pminsq512_mask" | "__builtin_ia32_pminsq256_mask" | "__builtin_ia32_pminsq128_mask" | "__builtin_ia32_pminud512_mask" | "__builtin_ia32_pminuq512_mask" - | "__builtin_ia32_prolq512_mask" | "__builtin_ia32_prorq512_mask" + | "__builtin_ia32_prolq512_mask" | "__builtin_ia32_prorq512_mask" | "__builtin_ia32_pslldi512_mask" + | "__builtin_ia32_psrldi512_mask" | "__builtin_ia32_psllqi512_mask" | "__builtin_ia32_psrlqi512_mask" + | "__builtin_ia32_pslld512_mask" | "__builtin_ia32_psrld512_mask" | "__builtin_ia32_psllq512_mask" + | "__builtin_ia32_psrlq512_mask" | "__builtin_ia32_psrad512_mask" | "__builtin_ia32_psraq512_mask" + | "__builtin_ia32_psradi512_mask" | "__builtin_ia32_psraqi512_mask" | "__builtin_ia32_psrav16si_mask" + | "__builtin_ia32_psrav8di_mask" | "__builtin_ia32_prolvd512_mask" | "__builtin_ia32_prorvd512_mask" + | "__builtin_ia32_prolvq512_mask" | "__builtin_ia32_prorvq512_mask" | "__builtin_ia32_psllv16si_mask" + | "__builtin_ia32_psrlv16si_mask" | "__builtin_ia32_psllv8di_mask" | "__builtin_ia32_psrlv8di_mask" + | "__builtin_ia32_permvarsi512_mask" | "__builtin_ia32_vpermilvarps512_mask" + | "__builtin_ia32_vpermilvarpd512_mask" | "__builtin_ia32_permvardi512_mask" + | "__builtin_ia32_permvarsf512_mask" => { let mut new_args = args.to_vec(); let arg3_type = gcc_func.get_param_type(2); @@ -30,7 +40,12 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc | "__builtin_ia32_pminuq128_mask" | "__builtin_ia32_prold256_mask" | "__builtin_ia32_prold128_mask" | "__builtin_ia32_prord512_mask" | "__builtin_ia32_prord256_mask" | "__builtin_ia32_prord128_mask" | "__builtin_ia32_prolq256_mask" | "__builtin_ia32_prolq128_mask" | "__builtin_ia32_prorq256_mask" - | "__builtin_ia32_prorq128_mask" + | "__builtin_ia32_prorq128_mask" | "__builtin_ia32_psraq256_mask" | "__builtin_ia32_psraq128_mask" + | "__builtin_ia32_psraqi256_mask" | "__builtin_ia32_psraqi128_mask" | "__builtin_ia32_psravq256_mask" + | "__builtin_ia32_psravq128_mask" | "__builtin_ia32_prolvd256_mask" | "__builtin_ia32_prolvd128_mask" + | "__builtin_ia32_prorvd256_mask" | "__builtin_ia32_prorvd128_mask" | "__builtin_ia32_prolvq256_mask" + | "__builtin_ia32_prolvq128_mask" | "__builtin_ia32_prorvq256_mask" | "__builtin_ia32_prorvq128_mask" + | "__builtin_ia32_permvardi256_mask" | "__builtin_ia32_permvardf512_mask" | "__builtin_ia32_permvardf256_mask" => { let mut new_args = args.to_vec(); let arg3_type = gcc_func.get_param_type(2); @@ -105,6 +120,18 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc new_args.push(last_arg); args = new_args.into(); }, + "__builtin_ia32_vpermi2vard512_mask" | "__builtin_ia32_vpermi2vard256_mask" + | "__builtin_ia32_vpermi2vard128_mask" | "__builtin_ia32_vpermi2varq512_mask" + | "__builtin_ia32_vpermi2varq256_mask" | "__builtin_ia32_vpermi2varq128_mask" + | "__builtin_ia32_vpermi2varps512_mask" | "__builtin_ia32_vpermi2varps256_mask" + | "__builtin_ia32_vpermi2varps128_mask" | "__builtin_ia32_vpermi2varpd512_mask" + | "__builtin_ia32_vpermi2varpd256_mask" | "__builtin_ia32_vpermi2varpd128_mask" => { + let mut new_args = args.to_vec(); + let arg4_type = gcc_func.get_param_type(3); + let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1); + new_args.push(minus_one); + args = new_args.into(); + }, "__builtin_ia32_cvtdq2ps512_mask" | "__builtin_ia32_cvtudq2ps512_mask" | "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" => { let mut new_args = args.to_vec(); @@ -118,6 +145,52 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc new_args.push(last_arg); args = new_args.into(); }, + "__builtin_ia32_stmxcsr" => { + args = vec![].into(); + }, + "__builtin_ia32_addcarryx_u64" => { + let mut new_args = args.to_vec(); + let arg2_type = gcc_func.get_param_type(1); + let variable = builder.current_func().new_local(None, arg2_type, "addcarryResult"); + new_args.push(variable.get_address(None)); + args = new_args.into(); + }, + _ => (), + } + } + else { + match &*func_name { + "__builtin_ia32_rndscaless_mask_round" | "__builtin_ia32_rndscalesd_mask_round" => { + let new_args = args.to_vec(); + let arg3_type = gcc_func.get_param_type(2); + let arg3 = builder.context.new_cast(None, new_args[4], arg3_type); + let arg4_type = gcc_func.get_param_type(3); + let arg4 = builder.context.new_bitcast(None, new_args[2], arg4_type); + args = vec![new_args[0], new_args[1], arg3, arg4, new_args[3], new_args[5]].into(); + }, + // NOTE: the LLVM intrinsic receives 3 floats, but the GCC builtin requires 3 vectors. + // FIXME: the intrinsics like _mm_mask_fmadd_sd should probably directly call the GCC + // instrinsic to avoid this. + "__builtin_ia32_vfmaddss3_round" => { + let new_args = args.to_vec(); + let arg1_type = gcc_func.get_param_type(0); + let arg2_type = gcc_func.get_param_type(1); + let arg3_type = gcc_func.get_param_type(2); + let a = builder.context.new_rvalue_from_vector(None, arg1_type, &[new_args[0]; 4]); + let b = builder.context.new_rvalue_from_vector(None, arg2_type, &[new_args[1]; 4]); + let c = builder.context.new_rvalue_from_vector(None, arg3_type, &[new_args[2]; 4]); + args = vec![a, b, c, new_args[3]].into(); + }, + "__builtin_ia32_vfmaddsd3_round" => { + let new_args = args.to_vec(); + let arg1_type = gcc_func.get_param_type(0); + let arg2_type = gcc_func.get_param_type(1); + let arg3_type = gcc_func.get_param_type(2); + let a = builder.context.new_rvalue_from_vector(None, arg1_type, &[new_args[0]; 2]); + let b = builder.context.new_rvalue_from_vector(None, arg2_type, &[new_args[1]; 2]); + let c = builder.context.new_rvalue_from_vector(None, arg3_type, &[new_args[2]; 2]); + args = vec![a, b, c, new_args[3]].into(); + }, _ => (), } } @@ -125,11 +198,30 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc args } +pub fn adjust_intrinsic_return_value<'a, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc, 'tcx>, mut return_value: RValue<'gcc>, func_name: &str, args: &[RValue<'gcc>]) -> RValue<'gcc> { + match func_name { + "__builtin_ia32_vfmaddss3_round" | "__builtin_ia32_vfmaddsd3_round" => { + let zero = builder.context.new_rvalue_zero(builder.int_type); + return_value = builder.context.new_vector_access(None, return_value, zero).to_rvalue(); + }, + "__builtin_ia32_addcarryx_u64" => { + let last_arg = args.last().expect("last arg"); + let field1 = builder.context.new_field(None, builder.u8_type, "carryFlag"); + let field2 = builder.context.new_field(None, builder.ulonglong_type, "carryResult"); + let struct_type = builder.context.new_struct_type(None, "addcarryResult", &[field1, field2]); + return_value = builder.context.new_struct_constructor(None, struct_type.as_type(), None, &[return_value, last_arg.dereference(None).to_rvalue()]); + }, + _ => (), + } + + return_value +} + pub fn ignore_arg_cast(func_name: &str, index: usize, args_len: usize) -> bool { - // NOTE: these intrinsics have missing parameters before the last one, so ignore the - // last argument type check. // FIXME(antoyo): find a way to refactor in order to avoid this hack. match func_name { + // NOTE: these intrinsics have missing parameters before the last one, so ignore the + // last argument type check. "__builtin_ia32_maxps512_mask" | "__builtin_ia32_maxpd512_mask" | "__builtin_ia32_minps512_mask" | "__builtin_ia32_minpd512_mask" | "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" | "__builtin_ia32_addps512_mask" | "__builtin_ia32_addpd512_mask" @@ -142,6 +234,11 @@ pub fn ignore_arg_cast(func_name: &str, index: usize, args_len: usize) -> bool { return true; } }, + "__builtin_ia32_rndscaless_mask_round" | "__builtin_ia32_rndscalesd_mask_round" => { + if index == 2 || index == 3 { + return true; + } + }, "__builtin_ia32_vfmaddps512_mask" | "__builtin_ia32_vfmaddpd512_mask" => { // Since there are two LLVM intrinsics that map to each of these GCC builtins and only // one of them has a missing parameter before the last one, we check the number of @@ -150,6 +247,8 @@ pub fn ignore_arg_cast(func_name: &str, index: usize, args_len: usize) -> bool { return true; } }, + // NOTE: the LLVM intrinsic receives 3 floats, but the GCC builtin requires 3 vectors. + "__builtin_ia32_vfmaddss3_round" | "__builtin_ia32_vfmaddsd3_round" => return true, _ => (), } @@ -203,6 +302,47 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function "llvm.x86.avx512.vfmadd.pd.512" => "__builtin_ia32_vfmaddpd512_mask", "llvm.x86.avx512.sitofp.round.v16f32.v16i32" => "__builtin_ia32_cvtdq2ps512_mask", "llvm.x86.avx512.uitofp.round.v16f32.v16i32" => "__builtin_ia32_cvtudq2ps512_mask", + "llvm.x86.avx512.mask.ucmp.d.512" => "__builtin_ia32_ucmpd512_mask", + "llvm.x86.avx512.mask.ucmp.d.256" => "__builtin_ia32_ucmpd256_mask", + "llvm.x86.avx512.mask.ucmp.d.128" => "__builtin_ia32_ucmpd128_mask", + "llvm.x86.avx512.mask.cmp.d.512" => "__builtin_ia32_cmpd512_mask", + "llvm.x86.avx512.mask.cmp.d.256" => "__builtin_ia32_cmpd256_mask", + "llvm.x86.avx512.mask.cmp.d.128" => "__builtin_ia32_cmpd128_mask", + "llvm.x86.avx512.mask.ucmp.q.512" => "__builtin_ia32_ucmpq512_mask", + "llvm.x86.avx512.mask.ucmp.q.256" => "__builtin_ia32_ucmpq256_mask", + "llvm.x86.avx512.mask.ucmp.q.128" => "__builtin_ia32_ucmpq128_mask", + "llvm.x86.avx512.mask.cmp.q.512" => "__builtin_ia32_cmpq512_mask", + "llvm.x86.avx512.mask.cmp.q.256" => "__builtin_ia32_cmpq256_mask", + "llvm.x86.avx512.mask.cmp.q.128" => "__builtin_ia32_cmpq128_mask", + "llvm.x86.avx512.mask.max.ss.round" => "__builtin_ia32_maxss_mask_round", + "llvm.x86.avx512.mask.max.sd.round" => "__builtin_ia32_maxsd_mask_round", + "llvm.x86.avx512.mask.min.ss.round" => "__builtin_ia32_minss_mask_round", + "llvm.x86.avx512.mask.min.sd.round" => "__builtin_ia32_minsd_mask_round", + "llvm.x86.avx512.mask.sqrt.ss" => "__builtin_ia32_sqrtss_mask_round", + "llvm.x86.avx512.mask.sqrt.sd" => "__builtin_ia32_sqrtsd_mask_round", + "llvm.x86.avx512.mask.getexp.ss" => "__builtin_ia32_getexpss_mask_round", + "llvm.x86.avx512.mask.getexp.sd" => "__builtin_ia32_getexpsd_mask_round", + "llvm.x86.avx512.mask.getmant.ss" => "__builtin_ia32_getmantss_mask_round", + "llvm.x86.avx512.mask.getmant.sd" => "__builtin_ia32_getmantsd_mask_round", + "llvm.x86.avx512.mask.rndscale.ss" => "__builtin_ia32_rndscaless_mask_round", + "llvm.x86.avx512.mask.rndscale.sd" => "__builtin_ia32_rndscalesd_mask_round", + "llvm.x86.avx512.mask.scalef.ss" => "__builtin_ia32_scalefss_mask_round", + "llvm.x86.avx512.mask.scalef.sd" => "__builtin_ia32_scalefsd_mask_round", + "llvm.x86.avx512.vfmadd.f32" => "__builtin_ia32_vfmaddss3_round", + "llvm.x86.avx512.vfmadd.f64" => "__builtin_ia32_vfmaddsd3_round", + "llvm.ceil.v4f64" => "__builtin_ia32_ceilpd256", + "llvm.ceil.v8f32" => "__builtin_ia32_ceilps256", + "llvm.floor.v4f64" => "__builtin_ia32_floorpd256", + "llvm.floor.v8f32" => "__builtin_ia32_floorps256", + "llvm.sqrt.v4f64" => "__builtin_ia32_sqrtpd256", + "llvm.x86.sse.stmxcsr" => "__builtin_ia32_stmxcsr", + "llvm.x86.sse.ldmxcsr" => "__builtin_ia32_ldmxcsr", + "llvm.ctpop.v16i32" => "__builtin_ia32_vpopcountd_v16si", + "llvm.ctpop.v8i32" => "__builtin_ia32_vpopcountd_v8si", + "llvm.ctpop.v4i32" => "__builtin_ia32_vpopcountd_v4si", + "llvm.ctpop.v8i64" => "__builtin_ia32_vpopcountq_v8di", + "llvm.ctpop.v4i64" => "__builtin_ia32_vpopcountq_v4di", + "llvm.ctpop.v2i64" => "__builtin_ia32_vpopcountq_v2di", // The above doc points to unknown builtins for the following, so override them: "llvm.x86.avx2.gather.d.d" => "__builtin_ia32_gathersiv4si", @@ -221,7 +361,70 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function "llvm.x86.avx2.gather.q.q.256" => "__builtin_ia32_gatherdiv4di", "llvm.x86.avx2.gather.q.pd" => "__builtin_ia32_gatherdiv2df", "llvm.x86.avx2.gather.q.pd.256" => "__builtin_ia32_gatherdiv4df", - "" => "", + "llvm.x86.avx512.pslli.d.512" => "__builtin_ia32_pslldi512_mask", + "llvm.x86.avx512.psrli.d.512" => "__builtin_ia32_psrldi512_mask", + "llvm.x86.avx512.pslli.q.512" => "__builtin_ia32_psllqi512_mask", + "llvm.x86.avx512.psrli.q.512" => "__builtin_ia32_psrlqi512_mask", + "llvm.x86.avx512.psll.d.512" => "__builtin_ia32_pslld512_mask", + "llvm.x86.avx512.psrl.d.512" => "__builtin_ia32_psrld512_mask", + "llvm.x86.avx512.psll.q.512" => "__builtin_ia32_psllq512_mask", + "llvm.x86.avx512.psrl.q.512" => "__builtin_ia32_psrlq512_mask", + "llvm.x86.avx512.psra.d.512" => "__builtin_ia32_psrad512_mask", + "llvm.x86.avx512.psra.q.512" => "__builtin_ia32_psraq512_mask", + "llvm.x86.avx512.psra.q.256" => "__builtin_ia32_psraq256_mask", + "llvm.x86.avx512.psra.q.128" => "__builtin_ia32_psraq128_mask", + "llvm.x86.avx512.psrai.d.512" => "__builtin_ia32_psradi512_mask", + "llvm.x86.avx512.psrai.q.512" => "__builtin_ia32_psraqi512_mask", + "llvm.x86.avx512.psrai.q.256" => "__builtin_ia32_psraqi256_mask", + "llvm.x86.avx512.psrai.q.128" => "__builtin_ia32_psraqi128_mask", + "llvm.x86.avx512.psrav.d.512" => "__builtin_ia32_psrav16si_mask", + "llvm.x86.avx512.psrav.q.512" => "__builtin_ia32_psrav8di_mask", + "llvm.x86.avx512.psrav.q.256" => "__builtin_ia32_psravq256_mask", + "llvm.x86.avx512.psrav.q.128" => "__builtin_ia32_psravq128_mask", + "llvm.x86.avx512.psllv.d.512" => "__builtin_ia32_psllv16si_mask", + "llvm.x86.avx512.psrlv.d.512" => "__builtin_ia32_psrlv16si_mask", + "llvm.x86.avx512.psllv.q.512" => "__builtin_ia32_psllv8di_mask", + "llvm.x86.avx512.psrlv.q.512" => "__builtin_ia32_psrlv8di_mask", + "llvm.x86.avx512.permvar.si.512" => "__builtin_ia32_permvarsi512_mask", + "llvm.x86.avx512.vpermilvar.ps.512" => "__builtin_ia32_vpermilvarps512_mask", + "llvm.x86.avx512.vpermilvar.pd.512" => "__builtin_ia32_vpermilvarpd512_mask", + "llvm.x86.avx512.permvar.di.512" => "__builtin_ia32_permvardi512_mask", + "llvm.x86.avx512.permvar.di.256" => "__builtin_ia32_permvardi256_mask", + "llvm.x86.avx512.permvar.sf.512" => "__builtin_ia32_permvarsf512_mask", + "llvm.x86.avx512.permvar.df.512" => "__builtin_ia32_permvardf512_mask", + "llvm.x86.avx512.permvar.df.256" => "__builtin_ia32_permvardf256_mask", + "llvm.x86.avx512.vpermi2var.d.512" => "__builtin_ia32_vpermi2vard512_mask", + "llvm.x86.avx512.vpermi2var.d.256" => "__builtin_ia32_vpermi2vard256_mask", + "llvm.x86.avx512.vpermi2var.d.128" => "__builtin_ia32_vpermi2vard128_mask", + "llvm.x86.avx512.vpermi2var.q.512" => "__builtin_ia32_vpermi2varq512_mask", + "llvm.x86.avx512.vpermi2var.q.256" => "__builtin_ia32_vpermi2varq256_mask", + "llvm.x86.avx512.vpermi2var.q.128" => "__builtin_ia32_vpermi2varq128_mask", + "llvm.x86.avx512.vpermi2var.ps.512" => "__builtin_ia32_vpermi2varps512_mask", + "llvm.x86.avx512.vpermi2var.ps.256" => "__builtin_ia32_vpermi2varps256_mask", + "llvm.x86.avx512.vpermi2var.ps.128" => "__builtin_ia32_vpermi2varps128_mask", + "llvm.x86.avx512.vpermi2var.pd.512" => "__builtin_ia32_vpermi2varpd512_mask", + "llvm.x86.avx512.vpermi2var.pd.256" => "__builtin_ia32_vpermi2varpd256_mask", + "llvm.x86.avx512.vpermi2var.pd.128" => "__builtin_ia32_vpermi2varpd128_mask", + "llvm.x86.avx512.mask.add.ss.round" => "__builtin_ia32_addss_mask_round", + "llvm.x86.avx512.mask.add.sd.round" => "__builtin_ia32_addsd_mask_round", + "llvm.x86.avx512.mask.sub.ss.round" => "__builtin_ia32_subss_mask_round", + "llvm.x86.avx512.mask.sub.sd.round" => "__builtin_ia32_subsd_mask_round", + "llvm.x86.avx512.mask.mul.ss.round" => "__builtin_ia32_mulss_mask_round", + "llvm.x86.avx512.mask.mul.sd.round" => "__builtin_ia32_mulsd_mask_round", + "llvm.x86.avx512.mask.div.ss.round" => "__builtin_ia32_divss_mask_round", + "llvm.x86.avx512.mask.div.sd.round" => "__builtin_ia32_divsd_mask_round", + "llvm.x86.avx512.mask.cvtss2sd.round" => "__builtin_ia32_cvtss2sd_mask_round", + "llvm.x86.avx512.mask.cvtsd2ss.round" => "__builtin_ia32_cvtsd2ss_mask_round", + "llvm.x86.aesni.aesenc.256" => "__builtin_ia32_vaesenc_v32qi", + "llvm.x86.aesni.aesenclast.256" => "__builtin_ia32_vaesenclast_v32qi", + "llvm.x86.aesni.aesdec.256" => "__builtin_ia32_vaesdec_v32qi", + "llvm.x86.aesni.aesdeclast.256" => "__builtin_ia32_vaesdeclast_v32qi", + "llvm.x86.aesni.aesenc.512" => "__builtin_ia32_vaesenc_v64qi", + "llvm.x86.aesni.aesenclast.512" => "__builtin_ia32_vaesenclast_v64qi", + "llvm.x86.aesni.aesdec.512" => "__builtin_ia32_vaesdec_v64qi", + "llvm.x86.aesni.aesdeclast.512" => "__builtin_ia32_vaesdeclast_v64qi", + "llvm.x86.addcarry.64" => "__builtin_ia32_addcarryx_u64", + // NOTE: this file is generated by https://github.com/GuillaumeGomez/llvmint/blob/master/generate_list.py _ => include!("archs.rs"), }; diff --git a/src/intrinsic/simd.rs b/src/intrinsic/simd.rs index a6cf99c62ff..bf5d555736a 100644 --- a/src/intrinsic/simd.rs +++ b/src/intrinsic/simd.rs @@ -1,4 +1,4 @@ -use gccjit::{BinaryOp, RValue, Type, ToRValue}; +use gccjit::{BinaryOp, RValue, Type, ToRValue, ComparisonOp, UnaryOp}; use rustc_codegen_ssa::base::compare_simd_types; use rustc_codegen_ssa::common::{TypeKind, span_invalid_monomorphization_error}; use rustc_codegen_ssa::mir::operand::OperandRef; @@ -213,48 +213,12 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>, let vector = args[0].immediate(); let index = args[1].immediate(); let value = args[2].immediate(); - // TODO(antoyo): use a recursive unqualified() here. - let vector_type = vector.get_type().unqualified().dyncast_vector().expect("vector type"); - let element_type = vector_type.get_element_type(); - // NOTE: we cannot cast to an array and assign to its element here because the value might - // not be an l-value. So, call a builtin to set the element. - // TODO(antoyo): perhaps we could create a new vector or maybe there's a GIMPLE instruction for that? - // TODO(antoyo): don't use target specific builtins here. - let func_name = - match in_len { - 2 => { - if element_type == bx.i64_type { - "__builtin_ia32_vec_set_v2di" - } - else { - unimplemented!(); - } - }, - 4 => { - if element_type == bx.i32_type { - "__builtin_ia32_vec_set_v4si" - } - else { - unimplemented!(); - } - }, - 8 => { - if element_type == bx.i16_type { - "__builtin_ia32_vec_set_v8hi" - } - else { - unimplemented!(); - } - }, - _ => unimplemented!("Len: {}", in_len), - }; - let builtin = bx.context.get_target_builtin_function(func_name); - let param1_type = builtin.get_param(0).to_rvalue().get_type(); - // TODO(antoyo): perhaps use __builtin_convertvector for vector casting. - let vector = bx.cx.bitcast_if_needed(vector, param1_type); - let result = bx.context.new_call(None, builtin, &[vector, value, bx.context.new_cast(None, index, bx.int_type)]); - // TODO(antoyo): perhaps use __builtin_convertvector for vector casting. - return Ok(bx.context.new_bitcast(None, result, vector.get_type())); + let variable = bx.current_func().new_local(None, vector.get_type(), "new_vector"); + bx.llbb().add_assignment(None, variable, vector); + let lvalue = bx.context.new_vector_access(None, variable.to_rvalue(), index); + // TODO: si simd_insert est constant, utiliser BIT_REF… + bx.llbb().add_assignment(None, lvalue, value); + return Ok(variable.to_rvalue()); } #[cfg(feature="master")] @@ -357,6 +321,67 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>, } } + if name == sym::simd_bitmask { + // The `fn simd_bitmask(vector) -> unsigned integer` intrinsic takes a + // vector mask and returns the most significant bit (MSB) of each lane in the form + // of either: + // * an unsigned integer + // * an array of `u8` + // If the vector has less than 8 lanes, a u8 is returned with zeroed trailing bits. + // + // The bit order of the result depends on the byte endianness, LSB-first for little + // endian and MSB-first for big endian. + + let vector = args[0].immediate(); + let vector_type = vector.get_type().dyncast_vector().expect("vector type"); + let elem_type = vector_type.get_element_type(); + let mut shifts = vec![]; + let mut masks = vec![]; + let mut mask = 1; + for i in 0..in_len { + shifts.push(bx.context.new_rvalue_from_int(elem_type, i as i32)); + masks.push(bx.context.new_rvalue_from_int(elem_type, mask)); + mask <<= 1; + } + masks.reverse(); + let shifts = bx.context.new_rvalue_from_vector(None, vector.get_type(), &shifts); + let shifted = vector >> shifts; + let masks = bx.context.new_rvalue_from_vector(None, vector.get_type(), &masks); + let masked = shifted & masks; + let reduced = bx.vector_reduce_op(masked, BinaryOp::BitwiseOr); + + let expected_int_bits = in_len.max(8); + let expected_bytes = expected_int_bits / 8 + ((expected_int_bits % 8 > 0) as u64); + + match ret_ty.kind() { + ty::Uint(i) if i.bit_width() == Some(expected_int_bits) => { + // Zero-extend iN to the bitmask type: + return Ok(bx.zext(reduced, bx.type_ix(expected_int_bits))); + } + ty::Array(elem, len) + if matches!(elem.kind(), ty::Uint(ty::UintTy::U8)) + && len.try_eval_usize(bx.tcx, ty::ParamEnv::reveal_all()) + == Some(expected_bytes) => + { + // Zero-extend iN to the array length: + let ze = bx.zext(reduced, bx.type_ix(expected_bytes * 8)); + + // Convert the integer to a byte array + let ptr = bx.alloca(bx.type_ix(expected_bytes * 8), Align::ONE); + bx.store(ze, ptr, Align::ONE); + let array_ty = bx.type_array(bx.type_i8(), expected_bytes); + let ptr = bx.pointercast(ptr, bx.cx.type_ptr_to(array_ty)); + return Ok(bx.load(array_ty, ptr, Align::ONE)); + } + _ => return_error!( + "cannot return `{}`, expected `u{}` or `[u8; {}]`", + ret_ty, + expected_int_bits, + expected_bytes + ), + } + } + fn simd_simple_float_intrinsic<'gcc, 'tcx>( name: Symbol, in_elem: Ty<'_>, @@ -496,42 +521,91 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>, let rhs = args[1].immediate(); let is_add = name == sym::simd_saturating_add; let ptr_bits = bx.tcx().data_layout.pointer_size.bits() as _; - let (signed, elem_width, elem_ty) = match *in_elem.kind() { - ty::Int(i) => (true, i.bit_width().unwrap_or(ptr_bits), bx.cx.type_int_from_ty(i)), - ty::Uint(i) => (false, i.bit_width().unwrap_or(ptr_bits), bx.cx.type_uint_from_ty(i)), - _ => { - return_error!( - "expected element type `{}` of vector type `{}` \ + let (signed, elem_width, elem_ty) = + match *in_elem.kind() { + ty::Int(i) => (true, i.bit_width().unwrap_or(ptr_bits) / 8, bx.cx.type_int_from_ty(i)), + ty::Uint(i) => (false, i.bit_width().unwrap_or(ptr_bits) / 8, bx.cx.type_uint_from_ty(i)), + _ => { + return_error!( + "expected element type `{}` of vector type `{}` \ to be a signed or unsigned integer type", - arg_tys[0].simd_size_and_type(bx.tcx()).1, - arg_tys[0] - ); - } - }; - // TODO(antoyo): don't use target specific builtins here. - // Not sure how easy it would be to avoid theme here. - let builtin_name = - match (signed, is_add, in_len, elem_width) { - (true, true, 32, 8) => "__builtin_ia32_paddsb256", // TODO(antoyo): cast arguments to unsigned. - (false, true, 32, 8) => "__builtin_ia32_paddusb256", - (true, true, 16, 16) => "__builtin_ia32_paddsw256", - (false, true, 16, 16) => "__builtin_ia32_paddusw256", - (true, false, 16, 16) => "__builtin_ia32_psubsw256", - (false, false, 16, 16) => "__builtin_ia32_psubusw256", - (true, false, 32, 8) => "__builtin_ia32_psubsb256", - (false, false, 32, 8) => "__builtin_ia32_psubusb256", - _ => unimplemented!("signed: {}, is_add: {}, in_len: {}, elem_width: {}", signed, is_add, in_len, elem_width), + arg_tys[0].simd_size_and_type(bx.tcx()).1, + arg_tys[0] + ); + } }; - let vec_ty = bx.cx.type_vector(elem_ty, in_len as u64); - let func = bx.context.get_target_builtin_function(builtin_name); - let param1_type = func.get_param(0).to_rvalue().get_type(); - let param2_type = func.get_param(1).to_rvalue().get_type(); - let lhs = bx.cx.bitcast_if_needed(lhs, param1_type); - let rhs = bx.cx.bitcast_if_needed(rhs, param2_type); - let result = bx.context.new_call(None, func, &[lhs, rhs]); - // TODO(antoyo): perhaps use __builtin_convertvector for vector casting. - return Ok(bx.context.new_bitcast(None, result, vec_ty)); + let result = + match (signed, is_add) { + (false, true) => { + let res = lhs + rhs; + let cmp = bx.context.new_comparison(None, ComparisonOp::LessThan, res, lhs); + res | cmp + }, + (true, true) => { + // Algorithm from: https://codereview.stackexchange.com/questions/115869/saturated-signed-addition + // TODO: improve using conditional operators if possible. + let arg_type = lhs.get_type(); + // TODO: convert lhs and rhs to unsigned. + let sum = lhs + rhs; + let vector_type = arg_type.dyncast_vector().expect("vector type"); + let unit = vector_type.get_num_units(); + let a = bx.context.new_rvalue_from_int(elem_ty, ((elem_width as i32) << 3) - 1); + let width = bx.context.new_rvalue_from_vector(None, lhs.get_type(), &vec![a; unit]); + + let xor1 = lhs ^ rhs; + let xor2 = lhs ^ sum; + let and = bx.context.new_unary_op(None, UnaryOp::BitwiseNegate, arg_type, xor1) & xor2; + let mask = and >> width; + + let one = bx.context.new_rvalue_one(elem_ty); + let ones = bx.context.new_rvalue_from_vector(None, lhs.get_type(), &vec![one; unit]); + let shift1 = ones << width; + let shift2 = sum >> width; + let mask_min = shift1 ^ shift2; + + let and1 = bx.context.new_unary_op(None, UnaryOp::BitwiseNegate, arg_type, mask) & sum; + let and2 = mask & mask_min; + + and1 + and2 + }, + (false, false) => { + let res = lhs - rhs; + let cmp = bx.context.new_comparison(None, ComparisonOp::LessThanEquals, res, lhs); + res & cmp + }, + (true, false) => { + let arg_type = lhs.get_type(); + // TODO(antoyo): this uses the same algorithm from saturating add, but add the + // negative of the right operand. Find a proper subtraction algorithm. + let rhs = bx.context.new_unary_op(None, UnaryOp::Minus, arg_type, rhs); + + // TODO: convert lhs and rhs to unsigned. + let sum = lhs + rhs; + let vector_type = arg_type.dyncast_vector().expect("vector type"); + let unit = vector_type.get_num_units(); + let a = bx.context.new_rvalue_from_int(elem_ty, ((elem_width as i32) << 3) - 1); + let width = bx.context.new_rvalue_from_vector(None, lhs.get_type(), &vec![a; unit]); + + let xor1 = lhs ^ rhs; + let xor2 = lhs ^ sum; + let and = bx.context.new_unary_op(None, UnaryOp::BitwiseNegate, arg_type, xor1) & xor2; + let mask = and >> width; + + let one = bx.context.new_rvalue_one(elem_ty); + let ones = bx.context.new_rvalue_from_vector(None, lhs.get_type(), &vec![one; unit]); + let shift1 = ones << width; + let shift2 = sum >> width; + let mask_min = shift1 ^ shift2; + + let and1 = bx.context.new_unary_op(None, UnaryOp::BitwiseNegate, arg_type, mask) & sum; + let and2 = mask & mask_min; + + and1 + and2 + } + }; + + return Ok(result); } macro_rules! arith_red { From df85771b34b745d2f9228cd260da9002a4a6cb60 Mon Sep 17 00:00:00 2001 From: Antoni Boucher Date: Sat, 4 Jun 2022 00:32:01 -0400 Subject: [PATCH 06/12] Add more SIMD --- src/intrinsic/llvm.rs | 92 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 87 insertions(+), 5 deletions(-) diff --git a/src/intrinsic/llvm.rs b/src/intrinsic/llvm.rs index 0e75724122c..7154a89543c 100644 --- a/src/intrinsic/llvm.rs +++ b/src/intrinsic/llvm.rs @@ -1,6 +1,6 @@ use std::borrow::Cow; -use gccjit::{Function, FunctionPtrType, RValue, ToRValue}; +use gccjit::{Function, FunctionPtrType, RValue, ToRValue, UnaryOp}; use crate::{context::CodegenCx, builder::Builder}; @@ -25,7 +25,10 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc | "__builtin_ia32_psrlv16si_mask" | "__builtin_ia32_psllv8di_mask" | "__builtin_ia32_psrlv8di_mask" | "__builtin_ia32_permvarsi512_mask" | "__builtin_ia32_vpermilvarps512_mask" | "__builtin_ia32_vpermilvarpd512_mask" | "__builtin_ia32_permvardi512_mask" - | "__builtin_ia32_permvarsf512_mask" + | "__builtin_ia32_permvarsf512_mask" | "__builtin_ia32_permvarqi512_mask" + | "__builtin_ia32_permvarqi256_mask" | "__builtin_ia32_permvarqi128_mask" + | "__builtin_ia32_vpmultishiftqb512_mask" | "__builtin_ia32_vpmultishiftqb256_mask" + | "__builtin_ia32_vpmultishiftqb128_mask" => { let mut new_args = args.to_vec(); let arg3_type = gcc_func.get_param_type(2); @@ -59,6 +62,23 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc new_args.push(minus_one); args = new_args.into(); }, + "__builtin_ia32_vplzcntd_512_mask" | "__builtin_ia32_vplzcntd_256_mask" | "__builtin_ia32_vplzcntd_128_mask" + | "__builtin_ia32_vplzcntq_512_mask" | "__builtin_ia32_vplzcntq_256_mask" | "__builtin_ia32_vplzcntq_128_mask" + | "__builtin_ia32_vpconflictsi_512_mask" | "__builtin_ia32_vpconflictsi_256_mask" + | "__builtin_ia32_vpconflictsi_128_mask" | "__builtin_ia32_vpconflictdi_512_mask" + | "__builtin_ia32_vpconflictdi_256_mask" | "__builtin_ia32_vpconflictdi_128_mask" => { + let mut new_args = args.to_vec(); + let arg2_type = gcc_func.get_param_type(1); + let vector_type = arg2_type.dyncast_vector().expect("vector type"); + let zero = builder.context.new_rvalue_zero(vector_type.get_element_type()); + let num_units = vector_type.get_num_units(); + let first_arg = builder.context.new_rvalue_from_vector(None, arg2_type, &vec![zero; num_units]); + new_args.push(first_arg); + let arg3_type = gcc_func.get_param_type(2); + let minus_one = builder.context.new_rvalue_from_int(arg3_type, -1); + new_args.push(minus_one); + args = new_args.into(); + }, "__builtin_ia32_pternlogd512_mask" | "__builtin_ia32_pternlogd256_mask" | "__builtin_ia32_pternlogd128_mask" | "__builtin_ia32_pternlogq512_mask" | "__builtin_ia32_pternlogq256_mask" | "__builtin_ia32_pternlogq128_mask" => { @@ -148,13 +168,20 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc "__builtin_ia32_stmxcsr" => { args = vec![].into(); }, - "__builtin_ia32_addcarryx_u64" => { + "__builtin_ia32_addcarryx_u64" | "__builtin_ia32_sbb_u64" => { let mut new_args = args.to_vec(); let arg2_type = gcc_func.get_param_type(1); let variable = builder.current_func().new_local(None, arg2_type, "addcarryResult"); new_args.push(variable.get_address(None)); args = new_args.into(); }, + "__builtin_ia32_vpermt2varqi512_mask" | "__builtin_ia32_vpermt2varqi256_mask" + | "__builtin_ia32_vpermt2varqi128_mask" => { + let mut new_args = args.to_vec(); + let arg4_type = gcc_func.get_param_type(3); + let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1); + args = vec![new_args[1], new_args[0], new_args[2], minus_one].into(); + }, _ => (), } } @@ -191,6 +218,12 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc let c = builder.context.new_rvalue_from_vector(None, arg3_type, &[new_args[2]; 2]); args = vec![a, b, c, new_args[3]].into(); }, + "__builtin_ia32_vfmaddsubpd256" | "__builtin_ia32_vfmaddsubps" | "__builtin_ia32_vfmaddsubps256" => { + let mut new_args = args.to_vec(); + let arg3 = &mut new_args[2]; + *arg3 = builder.context.new_unary_op(None, UnaryOp::Minus, arg3.get_type(), *arg3); + args = new_args.into(); + }, _ => (), } } @@ -204,7 +237,7 @@ pub fn adjust_intrinsic_return_value<'a, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc, let zero = builder.context.new_rvalue_zero(builder.int_type); return_value = builder.context.new_vector_access(None, return_value, zero).to_rvalue(); }, - "__builtin_ia32_addcarryx_u64" => { + "__builtin_ia32_addcarryx_u64" | "__builtin_ia32_sbb_u64" => { let last_arg = args.last().expect("last arg"); let field1 = builder.context.new_field(None, builder.u8_type, "carryFlag"); let field2 = builder.context.new_field(None, builder.ulonglong_type, "carryResult"); @@ -343,6 +376,48 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function "llvm.ctpop.v8i64" => "__builtin_ia32_vpopcountq_v8di", "llvm.ctpop.v4i64" => "__builtin_ia32_vpopcountq_v4di", "llvm.ctpop.v2i64" => "__builtin_ia32_vpopcountq_v2di", + "llvm.x86.addcarry.64" => "__builtin_ia32_addcarryx_u64", + "llvm.x86.subborrow.64" => "__builtin_ia32_sbb_u64", + "llvm.floor.v2f64" => "__builtin_ia32_floorpd", + "llvm.floor.v4f32" => "__builtin_ia32_floorps", + "llvm.ceil.v2f64" => "__builtin_ia32_ceilpd", + "llvm.ceil.v4f32" => "__builtin_ia32_ceilps", + "llvm.fma.v2f64" => "__builtin_ia32_vfmaddpd", + "llvm.fma.v4f64" => "__builtin_ia32_vfmaddpd256", + "llvm.fma.v4f32" => "__builtin_ia32_vfmaddps", + "llvm.fma.v8f32" => "__builtin_ia32_vfmaddps256", + "llvm.ctlz.v16i32" => "__builtin_ia32_vplzcntd_512_mask", + "llvm.ctlz.v8i32" => "__builtin_ia32_vplzcntd_256_mask", + "llvm.ctlz.v4i32" => "__builtin_ia32_vplzcntd_128_mask", + "llvm.ctlz.v8i64" => "__builtin_ia32_vplzcntq_512_mask", + "llvm.ctlz.v4i64" => "__builtin_ia32_vplzcntq_256_mask", + "llvm.ctlz.v2i64" => "__builtin_ia32_vplzcntq_128_mask", + "llvm.ctpop.v32i16" => "__builtin_ia32_vpopcountw_v32hi", + "llvm.x86.fma.vfmsub.sd" => "__builtin_ia32_vfmsubsd3", + "llvm.x86.fma.vfmsub.ss" => "__builtin_ia32_vfmsubss3", + "llvm.x86.fma.vfmsubadd.pd" => "__builtin_ia32_vfmaddsubpd", + "llvm.x86.fma.vfmsubadd.pd.256" => "__builtin_ia32_vfmaddsubpd256", + "llvm.x86.fma.vfmsubadd.ps" => "__builtin_ia32_vfmaddsubps", + "llvm.x86.fma.vfmsubadd.ps.256" => "__builtin_ia32_vfmaddsubps256", + "llvm.x86.fma.vfnmadd.sd" => "__builtin_ia32_vfnmaddsd3", + "llvm.x86.fma.vfnmadd.ss" => "__builtin_ia32_vfnmaddss3", + "llvm.x86.fma.vfnmsub.sd" => "__builtin_ia32_vfnmsubsd3", + "llvm.x86.fma.vfnmsub.ss" => "__builtin_ia32_vfnmsubss3", + "llvm.x86.avx512.conflict.d.512" => "__builtin_ia32_vpconflictsi_512_mask", + "llvm.x86.avx512.conflict.d.256" => "__builtin_ia32_vpconflictsi_256_mask", + "llvm.x86.avx512.conflict.d.128" => "__builtin_ia32_vpconflictsi_128_mask", + "llvm.x86.avx512.conflict.q.512" => "__builtin_ia32_vpconflictdi_512_mask", + "llvm.x86.avx512.conflict.q.256" => "__builtin_ia32_vpconflictdi_256_mask", + "llvm.x86.avx512.conflict.q.128" => "__builtin_ia32_vpconflictdi_128_mask", + "llvm.x86.avx512.vpermi2var.qi.512" => "__builtin_ia32_vpermt2varqi512_mask", + "llvm.x86.avx512.vpermi2var.qi.256" => "__builtin_ia32_vpermt2varqi256_mask", + "llvm.x86.avx512.vpermi2var.qi.128" => "__builtin_ia32_vpermt2varqi128_mask", + "llvm.x86.avx512.permvar.qi.512" => "__builtin_ia32_permvarqi512_mask", + "llvm.x86.avx512.permvar.qi.256" => "__builtin_ia32_permvarqi256_mask", + "llvm.x86.avx512.permvar.qi.128" => "__builtin_ia32_permvarqi128_mask", + "llvm.x86.avx512.pmultishift.qb.512" => "__builtin_ia32_vpmultishiftqb512_mask", + "llvm.x86.avx512.pmultishift.qb.256" => "__builtin_ia32_vpmultishiftqb256_mask", + "llvm.x86.avx512.pmultishift.qb.128" => "__builtin_ia32_vpmultishiftqb128_mask", // The above doc points to unknown builtins for the following, so override them: "llvm.x86.avx2.gather.d.d" => "__builtin_ia32_gathersiv4si", @@ -423,7 +498,14 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function "llvm.x86.aesni.aesenclast.512" => "__builtin_ia32_vaesenclast_v64qi", "llvm.x86.aesni.aesdec.512" => "__builtin_ia32_vaesdec_v64qi", "llvm.x86.aesni.aesdeclast.512" => "__builtin_ia32_vaesdeclast_v64qi", - "llvm.x86.addcarry.64" => "__builtin_ia32_addcarryx_u64", + "llvm.x86.avx512bf16.cvtne2ps2bf16.128" => "__builtin_ia32_cvtne2ps2bf16_v8hi", + "llvm.x86.avx512bf16.cvtne2ps2bf16.256" => "__builtin_ia32_cvtne2ps2bf16_v16hi", + "llvm.x86.avx512bf16.cvtne2ps2bf16.512" => "__builtin_ia32_cvtne2ps2bf16_v32hi", + "llvm.x86.avx512bf16.cvtneps2bf16.256" => "__builtin_ia32_cvtneps2bf16_v8sf", + "llvm.x86.avx512bf16.cvtneps2bf16.512" => "__builtin_ia32_cvtneps2bf16_v16sf", + "llvm.x86.avx512bf16.dpbf16ps.128" => "__builtin_ia32_dpbf16ps_v4sf", + "llvm.x86.avx512bf16.dpbf16ps.256" => "__builtin_ia32_dpbf16ps_v8sf", + "llvm.x86.avx512bf16.dpbf16ps.512" => "__builtin_ia32_dpbf16ps_v16sf", // NOTE: this file is generated by https://github.com/GuillaumeGomez/llvmint/blob/master/generate_list.py _ => include!("archs.rs"), From fe606eb4449e07e21959a52ae07ca12be3955209 Mon Sep 17 00:00:00 2001 From: Antoni Boucher Date: Sat, 4 Jun 2022 23:03:03 -0400 Subject: [PATCH 07/12] Add more SIMD --- src/builder.rs | 8 ++++---- src/intrinsic/llvm.rs | 43 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/src/builder.rs b/src/builder.rs index b23d12cb0b6..aefd4eecc0a 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -275,19 +275,19 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> { } fn function_ptr_call(&mut self, func_ptr: RValue<'gcc>, args: &[RValue<'gcc>], _funclet: Option<&Funclet>) -> RValue<'gcc> { - let args = self.check_ptr_call("call", func_ptr, args); + let gcc_func = func_ptr.get_type().dyncast_function_ptr_type().expect("function ptr"); + let func_name = format!("{:?}", func_ptr); + let args = llvm::adjust_intrinsic_arguments(&self, gcc_func, args.into(), &func_name); + let args = self.check_ptr_call("call", func_ptr, &*args); // gccjit requires to use the result of functions, even when it's not used. // That's why we assign the result to a local or call add_eval(). - let gcc_func = func_ptr.get_type().dyncast_function_ptr_type().expect("function ptr"); let return_type = gcc_func.get_return_type(); let void_type = self.context.new_type::<()>(); let current_func = self.block.get_function(); if return_type != void_type { unsafe { RETURN_VALUE_COUNT += 1 }; - let func_name = format!("{:?}", func_ptr); - let args = llvm::adjust_intrinsic_arguments(&self, gcc_func, args, &func_name); let return_value = self.cx.context.new_call_through_ptr(None, func_ptr, &args); let return_value = llvm::adjust_intrinsic_return_value(&self, return_value, &func_name, &args); let result = current_func.new_local(None, return_value.get_type(), &format!("ptrReturnValue{}", unsafe { RETURN_VALUE_COUNT })); diff --git a/src/intrinsic/llvm.rs b/src/intrinsic/llvm.rs index 7154a89543c..29387712dae 100644 --- a/src/intrinsic/llvm.rs +++ b/src/intrinsic/llvm.rs @@ -49,6 +49,10 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc | "__builtin_ia32_prorvd256_mask" | "__builtin_ia32_prorvd128_mask" | "__builtin_ia32_prolvq256_mask" | "__builtin_ia32_prolvq128_mask" | "__builtin_ia32_prorvq256_mask" | "__builtin_ia32_prorvq128_mask" | "__builtin_ia32_permvardi256_mask" | "__builtin_ia32_permvardf512_mask" | "__builtin_ia32_permvardf256_mask" + | "__builtin_ia32_pmulhuw512_mask" | "__builtin_ia32_pmulhw512_mask" | "__builtin_ia32_pmulhrsw512_mask" + | "__builtin_ia32_pmaxuw512_mask" | "__builtin_ia32_pmaxub512_mask" | "__builtin_ia32_pmaxsw512_mask" + | "__builtin_ia32_pmaxsb512_mask" | "__builtin_ia32_pminuw512_mask" | "__builtin_ia32_pminub512_mask" + | "__builtin_ia32_pminsw512_mask" | "__builtin_ia32_pminsb512_mask" => { let mut new_args = args.to_vec(); let arg3_type = gcc_func.get_param_type(2); @@ -63,8 +67,22 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc args = new_args.into(); }, "__builtin_ia32_vplzcntd_512_mask" | "__builtin_ia32_vplzcntd_256_mask" | "__builtin_ia32_vplzcntd_128_mask" - | "__builtin_ia32_vplzcntq_512_mask" | "__builtin_ia32_vplzcntq_256_mask" | "__builtin_ia32_vplzcntq_128_mask" - | "__builtin_ia32_vpconflictsi_512_mask" | "__builtin_ia32_vpconflictsi_256_mask" + | "__builtin_ia32_vplzcntq_512_mask" | "__builtin_ia32_vplzcntq_256_mask" | "__builtin_ia32_vplzcntq_128_mask" => { + let mut new_args = args.to_vec(); + // Remove last arg as it doesn't seem to be used in GCC and is always false. + new_args.pop(); + let arg2_type = gcc_func.get_param_type(1); + let vector_type = arg2_type.dyncast_vector().expect("vector type"); + let zero = builder.context.new_rvalue_zero(vector_type.get_element_type()); + let num_units = vector_type.get_num_units(); + let first_arg = builder.context.new_rvalue_from_vector(None, arg2_type, &vec![zero; num_units]); + new_args.push(first_arg); + let arg3_type = gcc_func.get_param_type(2); + let minus_one = builder.context.new_rvalue_from_int(arg3_type, -1); + new_args.push(minus_one); + args = new_args.into(); + }, + "__builtin_ia32_vpconflictsi_512_mask" | "__builtin_ia32_vpconflictsi_256_mask" | "__builtin_ia32_vpconflictsi_128_mask" | "__builtin_ia32_vpconflictdi_512_mask" | "__builtin_ia32_vpconflictdi_256_mask" | "__builtin_ia32_vpconflictdi_128_mask" => { let mut new_args = args.to_vec(); @@ -177,7 +195,7 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc }, "__builtin_ia32_vpermt2varqi512_mask" | "__builtin_ia32_vpermt2varqi256_mask" | "__builtin_ia32_vpermt2varqi128_mask" => { - let mut new_args = args.to_vec(); + let new_args = args.to_vec(); let arg4_type = gcc_func.get_param_type(3); let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1); args = vec![new_args[1], new_args[0], new_args[2], minus_one].into(); @@ -282,6 +300,12 @@ pub fn ignore_arg_cast(func_name: &str, index: usize, args_len: usize) -> bool { }, // NOTE: the LLVM intrinsic receives 3 floats, but the GCC builtin requires 3 vectors. "__builtin_ia32_vfmaddss3_round" | "__builtin_ia32_vfmaddsd3_round" => return true, + "__builtin_ia32_vplzcntd_512_mask" | "__builtin_ia32_vplzcntd_256_mask" | "__builtin_ia32_vplzcntd_128_mask" + | "__builtin_ia32_vplzcntq_512_mask" | "__builtin_ia32_vplzcntq_256_mask" | "__builtin_ia32_vplzcntq_128_mask" => { + if index == args_len - 1 { + return true; + } + }, _ => (), } @@ -418,6 +442,14 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function "llvm.x86.avx512.pmultishift.qb.512" => "__builtin_ia32_vpmultishiftqb512_mask", "llvm.x86.avx512.pmultishift.qb.256" => "__builtin_ia32_vpmultishiftqb256_mask", "llvm.x86.avx512.pmultishift.qb.128" => "__builtin_ia32_vpmultishiftqb128_mask", + "llvm.ctpop.v16i16" => "__builtin_ia32_vpopcountw_v16hi", + "llvm.ctpop.v8i16" => "__builtin_ia32_vpopcountw_v8hi", + "llvm.ctpop.v64i8" => "__builtin_ia32_vpopcountb_v64qi", + "llvm.ctpop.v32i8" => "__builtin_ia32_vpopcountb_v32qi", + "llvm.ctpop.v16i8" => "__builtin_ia32_vpopcountb_v16qi", + "llvm.x86.avx512.mask.vpshufbitqmb.512" => "__builtin_ia32_vpshufbitqmb512_mask", + "llvm.x86.avx512.mask.vpshufbitqmb.256" => "__builtin_ia32_vpshufbitqmb256_mask", + "llvm.x86.avx512.mask.vpshufbitqmb.128" => "__builtin_ia32_vpshufbitqmb128_mask", // The above doc points to unknown builtins for the following, so override them: "llvm.x86.avx2.gather.d.d" => "__builtin_ia32_gathersiv4si", @@ -506,6 +538,11 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function "llvm.x86.avx512bf16.dpbf16ps.128" => "__builtin_ia32_dpbf16ps_v4sf", "llvm.x86.avx512bf16.dpbf16ps.256" => "__builtin_ia32_dpbf16ps_v8sf", "llvm.x86.avx512bf16.dpbf16ps.512" => "__builtin_ia32_dpbf16ps_v16sf", + "llvm.x86.pclmulqdq.512" => "__builtin_ia32_vpclmulqdq_v8di", + "llvm.x86.pclmulqdq.256" => "__builtin_ia32_vpclmulqdq_v4di", + "llvm.x86.avx512.pmulhu.w.512" => "__builtin_ia32_pmulhuw512_mask", + "llvm.x86.avx512.pmulh.w.512" => "__builtin_ia32_pmulhw512_mask", + "llvm.x86.avx512.pmul.hr.sw.512" => "__builtin_ia32_pmulhrsw512_mask", // NOTE: this file is generated by https://github.com/GuillaumeGomez/llvmint/blob/master/generate_list.py _ => include!("archs.rs"), From e5a1bb2f59c49a6a741b8ee5b20ad16b93f314c6 Mon Sep 17 00:00:00 2001 From: Antoni Boucher Date: Sun, 5 Jun 2022 13:36:17 -0400 Subject: [PATCH 08/12] Add more SIMD --- src/builder.rs | 4 +- src/intrinsic/llvm.rs | 105 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 99 insertions(+), 10 deletions(-) diff --git a/src/builder.rs b/src/builder.rs index aefd4eecc0a..3e1f56c183a 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -277,7 +277,9 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> { fn function_ptr_call(&mut self, func_ptr: RValue<'gcc>, args: &[RValue<'gcc>], _funclet: Option<&Funclet>) -> RValue<'gcc> { let gcc_func = func_ptr.get_type().dyncast_function_ptr_type().expect("function ptr"); let func_name = format!("{:?}", func_ptr); + let previous_arg_count = args.len(); let args = llvm::adjust_intrinsic_arguments(&self, gcc_func, args.into(), &func_name); + let args_adjusted = args.len() != previous_arg_count; let args = self.check_ptr_call("call", func_ptr, &*args); // gccjit requires to use the result of functions, even when it's not used. @@ -289,7 +291,7 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> { if return_type != void_type { unsafe { RETURN_VALUE_COUNT += 1 }; let return_value = self.cx.context.new_call_through_ptr(None, func_ptr, &args); - let return_value = llvm::adjust_intrinsic_return_value(&self, return_value, &func_name, &args); + let return_value = llvm::adjust_intrinsic_return_value(&self, return_value, &func_name, &args, args_adjusted); let result = current_func.new_local(None, return_value.get_type(), &format!("ptrReturnValue{}", unsafe { RETURN_VALUE_COUNT })); self.block.add_assignment(None, result, return_value); result.to_rvalue() diff --git a/src/intrinsic/llvm.rs b/src/intrinsic/llvm.rs index 29387712dae..af6c121c337 100644 --- a/src/intrinsic/llvm.rs +++ b/src/intrinsic/llvm.rs @@ -53,6 +53,15 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc | "__builtin_ia32_pmaxuw512_mask" | "__builtin_ia32_pmaxub512_mask" | "__builtin_ia32_pmaxsw512_mask" | "__builtin_ia32_pmaxsb512_mask" | "__builtin_ia32_pminuw512_mask" | "__builtin_ia32_pminub512_mask" | "__builtin_ia32_pminsw512_mask" | "__builtin_ia32_pminsb512_mask" + | "__builtin_ia32_pmaddwd512_mask" | "__builtin_ia32_pmaddubsw512_mask" | "__builtin_ia32_packssdw512_mask" + | "__builtin_ia32_packsswb512_mask" | "__builtin_ia32_packusdw512_mask" | "__builtin_ia32_packuswb512_mask" + | "__builtin_ia32_pavgw512_mask" | "__builtin_ia32_pavgb512_mask" | "__builtin_ia32_psllw512_mask" + | "__builtin_ia32_psllwi512_mask" | "__builtin_ia32_psllv32hi_mask" | "__builtin_ia32_psrlw512_mask" + | "__builtin_ia32_psrlwi512_mask" | "__builtin_ia32_psllv16hi_mask" | "__builtin_ia32_psllv8hi_mask" + | "__builtin_ia32_psrlv32hi_mask" | "__builtin_ia32_psraw512_mask" | "__builtin_ia32_psrawi512_mask" + | "__builtin_ia32_psrlv16hi_mask" | "__builtin_ia32_psrlv8hi_mask" | "__builtin_ia32_psrav32hi_mask" + | "__builtin_ia32_permvarhi512_mask" | "__builtin_ia32_pshufb512_mask" | "__builtin_ia32_psrav16hi_mask" + | "__builtin_ia32_psrav8hi_mask" | "__builtin_ia32_permvarhi256_mask" | "__builtin_ia32_permvarhi128_mask" => { let mut new_args = args.to_vec(); let arg3_type = gcc_func.get_param_type(2); @@ -66,6 +75,19 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc new_args.push(minus_one); args = new_args.into(); }, + "__builtin_ia32_dbpsadbw512_mask" | "__builtin_ia32_dbpsadbw256_mask" | "__builtin_ia32_dbpsadbw128_mask" => { + let mut new_args = args.to_vec(); + let arg4_type = gcc_func.get_param_type(3); + let vector_type = arg4_type.dyncast_vector().expect("vector type"); + let zero = builder.context.new_rvalue_zero(vector_type.get_element_type()); + let num_units = vector_type.get_num_units(); + let first_arg = builder.context.new_rvalue_from_vector(None, arg4_type, &vec![zero; num_units]); + new_args.push(first_arg); + let arg5_type = gcc_func.get_param_type(4); + let minus_one = builder.context.new_rvalue_from_int(arg5_type, -1); + new_args.push(minus_one); + args = new_args.into(); + }, "__builtin_ia32_vplzcntd_512_mask" | "__builtin_ia32_vplzcntd_256_mask" | "__builtin_ia32_vplzcntd_128_mask" | "__builtin_ia32_vplzcntq_512_mask" | "__builtin_ia32_vplzcntq_256_mask" | "__builtin_ia32_vplzcntq_128_mask" => { let mut new_args = args.to_vec(); @@ -186,7 +208,7 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc "__builtin_ia32_stmxcsr" => { args = vec![].into(); }, - "__builtin_ia32_addcarryx_u64" | "__builtin_ia32_sbb_u64" => { + "__builtin_ia32_addcarryx_u64" | "__builtin_ia32_sbb_u64" | "__builtin_ia32_addcarryx_u32" | "__builtin_ia32_sbb_u32" => { let mut new_args = args.to_vec(); let arg2_type = gcc_func.get_param_type(1); let variable = builder.current_func().new_local(None, arg2_type, "addcarryResult"); @@ -194,12 +216,22 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc args = new_args.into(); }, "__builtin_ia32_vpermt2varqi512_mask" | "__builtin_ia32_vpermt2varqi256_mask" - | "__builtin_ia32_vpermt2varqi128_mask" => { + | "__builtin_ia32_vpermt2varqi128_mask" | "__builtin_ia32_vpermt2varhi512_mask" + | "__builtin_ia32_vpermt2varhi256_mask" | "__builtin_ia32_vpermt2varhi128_mask" + => { let new_args = args.to_vec(); let arg4_type = gcc_func.get_param_type(3); let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1); args = vec![new_args[1], new_args[0], new_args[2], minus_one].into(); }, + "__builtin_ia32_xrstor" | "__builtin_ia32_xsavec" => { + let new_args = args.to_vec(); + let thirty_two = builder.context.new_rvalue_from_int(new_args[1].get_type(), 32); + let arg2 = new_args[1] << thirty_two | new_args[2]; + let arg2_type = gcc_func.get_param_type(1); + let arg2 = builder.context.new_cast(None, arg2, arg2_type); + args = vec![new_args[0], arg2].into(); + }, _ => (), } } @@ -249,18 +281,24 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc args } -pub fn adjust_intrinsic_return_value<'a, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc, 'tcx>, mut return_value: RValue<'gcc>, func_name: &str, args: &[RValue<'gcc>]) -> RValue<'gcc> { +pub fn adjust_intrinsic_return_value<'a, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc, 'tcx>, mut return_value: RValue<'gcc>, func_name: &str, args: &[RValue<'gcc>], args_adjusted: bool) -> RValue<'gcc> { match func_name { "__builtin_ia32_vfmaddss3_round" | "__builtin_ia32_vfmaddsd3_round" => { let zero = builder.context.new_rvalue_zero(builder.int_type); return_value = builder.context.new_vector_access(None, return_value, zero).to_rvalue(); }, - "__builtin_ia32_addcarryx_u64" | "__builtin_ia32_sbb_u64" => { - let last_arg = args.last().expect("last arg"); - let field1 = builder.context.new_field(None, builder.u8_type, "carryFlag"); - let field2 = builder.context.new_field(None, builder.ulonglong_type, "carryResult"); - let struct_type = builder.context.new_struct_type(None, "addcarryResult", &[field1, field2]); - return_value = builder.context.new_struct_constructor(None, struct_type.as_type(), None, &[return_value, last_arg.dereference(None).to_rvalue()]); + "__builtin_ia32_addcarryx_u64" | "__builtin_ia32_sbb_u64" | "__builtin_ia32_addcarryx_u32" | "__builtin_ia32_sbb_u32" => { + // Both llvm.x86.addcarry.32 and llvm.x86.addcarryx.u32 points to the same GCC builtin, + // but only the former requires adjusting the return value. + // Those 2 LLVM intrinsics differ by their argument count, that's why we check if the + // arguments were adjusted. + if args_adjusted { + let last_arg = args.last().expect("last arg"); + let field1 = builder.context.new_field(None, builder.u8_type, "carryFlag"); + let field2 = builder.context.new_field(None, args[1].get_type(), "carryResult"); + let struct_type = builder.context.new_struct_type(None, "addcarryResult", &[field1, field2]); + return_value = builder.context.new_struct_constructor(None, struct_type.as_type(), None, &[return_value, last_arg.dereference(None).to_rvalue()]); + } }, _ => (), } @@ -450,6 +488,22 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function "llvm.x86.avx512.mask.vpshufbitqmb.512" => "__builtin_ia32_vpshufbitqmb512_mask", "llvm.x86.avx512.mask.vpshufbitqmb.256" => "__builtin_ia32_vpshufbitqmb256_mask", "llvm.x86.avx512.mask.vpshufbitqmb.128" => "__builtin_ia32_vpshufbitqmb128_mask", + "llvm.x86.avx512.mask.ucmp.w.512" => "__builtin_ia32_ucmpw512_mask", + "llvm.x86.avx512.mask.ucmp.w.256" => "__builtin_ia32_ucmpw256_mask", + "llvm.x86.avx512.mask.ucmp.w.128" => "__builtin_ia32_ucmpw128_mask", + "llvm.x86.avx512.mask.ucmp.b.512" => "__builtin_ia32_ucmpb512_mask", + "llvm.x86.avx512.mask.ucmp.b.256" => "__builtin_ia32_ucmpb256_mask", + "llvm.x86.avx512.mask.ucmp.b.128" => "__builtin_ia32_ucmpb128_mask", + "llvm.x86.avx512.mask.cmp.w.512" => "__builtin_ia32_cmpw512_mask", + "llvm.x86.avx512.mask.cmp.w.256" => "__builtin_ia32_cmpw256_mask", + "llvm.x86.avx512.mask.cmp.w.128" => "__builtin_ia32_cmpw128_mask", + "llvm.x86.avx512.mask.cmp.b.512" => "__builtin_ia32_cmpb512_mask", + "llvm.x86.avx512.mask.cmp.b.256" => "__builtin_ia32_cmpb256_mask", + "llvm.x86.avx512.mask.cmp.b.128" => "__builtin_ia32_cmpb128_mask", + "llvm.x86.xrstor" => "__builtin_ia32_xrstor", + "llvm.x86.xsavec" => "__builtin_ia32_xsavec", + "llvm.x86.addcarry.32" => "__builtin_ia32_addcarryx_u32", + "llvm.x86.subborrow.32" => "__builtin_ia32_sbb_u32", // The above doc points to unknown builtins for the following, so override them: "llvm.x86.avx2.gather.d.d" => "__builtin_ia32_gathersiv4si", @@ -543,6 +597,39 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function "llvm.x86.avx512.pmulhu.w.512" => "__builtin_ia32_pmulhuw512_mask", "llvm.x86.avx512.pmulh.w.512" => "__builtin_ia32_pmulhw512_mask", "llvm.x86.avx512.pmul.hr.sw.512" => "__builtin_ia32_pmulhrsw512_mask", + "llvm.x86.avx512.pmaddw.d.512" => "__builtin_ia32_pmaddwd512_mask", + "llvm.x86.avx512.pmaddubs.w.512" => "__builtin_ia32_pmaddubsw512_mask", + "llvm.x86.avx512.packssdw.512" => "__builtin_ia32_packssdw512_mask", + "llvm.x86.avx512.packsswb.512" => "__builtin_ia32_packsswb512_mask", + "llvm.x86.avx512.packusdw.512" => "__builtin_ia32_packusdw512_mask", + "llvm.x86.avx512.packuswb.512" => "__builtin_ia32_packuswb512_mask", + "llvm.x86.avx512.pavg.w.512" => "__builtin_ia32_pavgw512_mask", + "llvm.x86.avx512.pavg.b.512" => "__builtin_ia32_pavgb512_mask", + "llvm.x86.avx512.psll.w.512" => "__builtin_ia32_psllw512_mask", + "llvm.x86.avx512.pslli.w.512" => "__builtin_ia32_psllwi512_mask", + "llvm.x86.avx512.psllv.w.512" => "__builtin_ia32_psllv32hi_mask", + "llvm.x86.avx512.psllv.w.256" => "__builtin_ia32_psllv16hi_mask", + "llvm.x86.avx512.psllv.w.128" => "__builtin_ia32_psllv8hi_mask", + "llvm.x86.avx512.psrl.w.512" => "__builtin_ia32_psrlw512_mask", + "llvm.x86.avx512.psrli.w.512" => "__builtin_ia32_psrlwi512_mask", + "llvm.x86.avx512.psrlv.w.512" => "__builtin_ia32_psrlv32hi_mask", + "llvm.x86.avx512.psrlv.w.256" => "__builtin_ia32_psrlv16hi_mask", + "llvm.x86.avx512.psrlv.w.128" => "__builtin_ia32_psrlv8hi_mask", + "llvm.x86.avx512.psra.w.512" => "__builtin_ia32_psraw512_mask", + "llvm.x86.avx512.psrai.w.512" => "__builtin_ia32_psrawi512_mask", + "llvm.x86.avx512.psrav.w.512" => "__builtin_ia32_psrav32hi_mask", + "llvm.x86.avx512.psrav.w.256" => "__builtin_ia32_psrav16hi_mask", + "llvm.x86.avx512.psrav.w.128" => "__builtin_ia32_psrav8hi_mask", + "llvm.x86.avx512.vpermi2var.hi.512" => "__builtin_ia32_vpermt2varhi512_mask", + "llvm.x86.avx512.vpermi2var.hi.256" => "__builtin_ia32_vpermt2varhi256_mask", + "llvm.x86.avx512.vpermi2var.hi.128" => "__builtin_ia32_vpermt2varhi128_mask", + "llvm.x86.avx512.permvar.hi.512" => "__builtin_ia32_permvarhi512_mask", + "llvm.x86.avx512.permvar.hi.256" => "__builtin_ia32_permvarhi256_mask", + "llvm.x86.avx512.permvar.hi.128" => "__builtin_ia32_permvarhi128_mask", + "llvm.x86.avx512.pshuf.b.512" => "__builtin_ia32_pshufb512_mask", + "llvm.x86.avx512.dbpsadbw.512" => "__builtin_ia32_dbpsadbw512_mask", + "llvm.x86.avx512.dbpsadbw.256" => "__builtin_ia32_dbpsadbw256_mask", + "llvm.x86.avx512.dbpsadbw.128" => "__builtin_ia32_dbpsadbw128_mask", // NOTE: this file is generated by https://github.com/GuillaumeGomez/llvmint/blob/master/generate_list.py _ => include!("archs.rs"), From b55bd956dc2ad488b890892e4d336ff6cdf3b128 Mon Sep 17 00:00:00 2001 From: Antoni Boucher Date: Sun, 5 Jun 2022 17:16:27 -0400 Subject: [PATCH 09/12] Add more SIMD --- src/intrinsic/llvm.rs | 46 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/src/intrinsic/llvm.rs b/src/intrinsic/llvm.rs index af6c121c337..14a7eaf49bb 100644 --- a/src/intrinsic/llvm.rs +++ b/src/intrinsic/llvm.rs @@ -171,7 +171,10 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc new_args.push(last_arg); args = new_args.into(); }, - "__builtin_ia32_vfmaddsubps512_mask" | "__builtin_ia32_vfmaddsubpd512_mask" => { + "__builtin_ia32_vfmaddsubps512_mask" | "__builtin_ia32_vfmaddsubpd512_mask" | "__builtin_ia32_vpmadd52huq512_mask" + | "__builtin_ia32_vpmadd52luq512_mask" | "__builtin_ia32_vpmadd52huq256_mask" | "__builtin_ia32_vpmadd52luq256_mask" + | "__builtin_ia32_vpmadd52huq128_mask" + => { let mut new_args = args.to_vec(); let last_arg = new_args.pop().expect("last arg"); let arg4_type = gcc_func.get_param_type(3); @@ -504,6 +507,42 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function "llvm.x86.xsavec" => "__builtin_ia32_xsavec", "llvm.x86.addcarry.32" => "__builtin_ia32_addcarryx_u32", "llvm.x86.subborrow.32" => "__builtin_ia32_sbb_u32", + "llvm.x86.avx512.mask.compress.store.w.512" => "__builtin_ia32_compressstoreuhi512_mask", + "llvm.x86.avx512.mask.compress.store.w.256" => "__builtin_ia32_compressstoreuhi256_mask", + "llvm.x86.avx512.mask.compress.store.w.128" => "__builtin_ia32_compressstoreuhi128_mask", + "llvm.x86.avx512.mask.compress.store.b.512" => "__builtin_ia32_compressstoreuqi512_mask", + "llvm.x86.avx512.mask.compress.store.b.256" => "__builtin_ia32_compressstoreuqi256_mask", + "llvm.x86.avx512.mask.compress.store.b.128" => "__builtin_ia32_compressstoreuqi128_mask", + "llvm.x86.avx512.mask.compress.w.512" => "__builtin_ia32_compressstoreuhi512_mask", + "llvm.x86.avx512.mask.compress.w.256" => "__builtin_ia32_compresshi256_mask", + "llvm.x86.avx512.mask.compress.w.128" => "__builtin_ia32_compresshi128_mask", + "llvm.x86.avx512.mask.compress.b.512" => "__builtin_ia32_compressstoreuqi512_mask", + "llvm.x86.avx512.mask.compress.b.256" => "__builtin_ia32_compressqi256_mask", + "llvm.x86.avx512.mask.compress.b.128" => "__builtin_ia32_compressqi128_mask", + "llvm.x86.avx512.mask.expand.w.512" => "__builtin_ia32_expandhi512_mask", + "llvm.x86.avx512.mask.expand.w.256" => "__builtin_ia32_expandhi256_mask", + "llvm.x86.avx512.mask.expand.w.128" => "__builtin_ia32_expandhi128_mask", + "llvm.x86.avx512.mask.expand.b.512" => "__builtin_ia32_expandqi512_mask", + "llvm.x86.avx512.mask.expand.b.256" => "__builtin_ia32_expandqi256_mask", + "llvm.x86.avx512.mask.expand.b.128" => "__builtin_ia32_expandqi128_mask", + "llvm.fshl.v8i64" => "__builtin_ia32_vpshldv_v8di", + "llvm.fshl.v4i64" => "__builtin_ia32_vpshldv_v4di", + "llvm.fshl.v2i64" => "__builtin_ia32_vpshldv_v2di", + "llvm.fshl.v16i32" => "__builtin_ia32_vpshldv_v16si", + "llvm.fshl.v8i32" => "__builtin_ia32_vpshldv_v8si", + "llvm.fshl.v4i32" => "__builtin_ia32_vpshldv_v4si", + "llvm.fshl.v32i16" => "__builtin_ia32_vpshldv_v32hi", + "llvm.fshl.v16i16" => "__builtin_ia32_vpshldv_v16hi", + "llvm.fshl.v8i16" => "__builtin_ia32_vpshldv_v8hi", + "llvm.fshr.v8i64" => "__builtin_ia32_vpshrdv_v8di", + "llvm.fshr.v4i64" => "__builtin_ia32_vpshrdv_v4di", + "llvm.fshr.v2i64" => "__builtin_ia32_vpshrdv_v2di", + "llvm.fshr.v16i32" => "__builtin_ia32_vpshrdv_v16si", + "llvm.fshr.v8i32" => "__builtin_ia32_vpshrdv_v8si", + "llvm.fshr.v4i32" => "__builtin_ia32_vpshrdv_v4si", + "llvm.fshr.v32i16" => "__builtin_ia32_vpshrdv_v32hi", + "llvm.fshr.v16i16" => "__builtin_ia32_vpshrdv_v16hi", + "llvm.fshr.v8i16" => "__builtin_ia32_vpshrdv_v8hi", // The above doc points to unknown builtins for the following, so override them: "llvm.x86.avx2.gather.d.d" => "__builtin_ia32_gathersiv4si", @@ -630,6 +669,11 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function "llvm.x86.avx512.dbpsadbw.512" => "__builtin_ia32_dbpsadbw512_mask", "llvm.x86.avx512.dbpsadbw.256" => "__builtin_ia32_dbpsadbw256_mask", "llvm.x86.avx512.dbpsadbw.128" => "__builtin_ia32_dbpsadbw128_mask", + "llvm.x86.avx512.vpmadd52h.uq.512" => "__builtin_ia32_vpmadd52huq512_mask", + "llvm.x86.avx512.vpmadd52l.uq.512" => "__builtin_ia32_vpmadd52luq512_mask", + "llvm.x86.avx512.vpmadd52h.uq.256" => "__builtin_ia32_vpmadd52huq256_mask", + "llvm.x86.avx512.vpmadd52l.uq.256" => "__builtin_ia32_vpmadd52luq256_mask", + "llvm.x86.avx512.vpmadd52h.uq.128" => "__builtin_ia32_vpmadd52huq128_mask", // NOTE: this file is generated by https://github.com/GuillaumeGomez/llvmint/blob/master/generate_list.py _ => include!("archs.rs"), From 13fa30c6c396a5ffe91423a028dc51547ec68d42 Mon Sep 17 00:00:00 2001 From: Antoni Boucher Date: Sun, 5 Jun 2022 18:53:30 -0400 Subject: [PATCH 10/12] Add more SIMD --- src/intrinsic/llvm.rs | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/intrinsic/llvm.rs b/src/intrinsic/llvm.rs index 14a7eaf49bb..42cf06c8c7a 100644 --- a/src/intrinsic/llvm.rs +++ b/src/intrinsic/llvm.rs @@ -171,10 +171,7 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc new_args.push(last_arg); args = new_args.into(); }, - "__builtin_ia32_vfmaddsubps512_mask" | "__builtin_ia32_vfmaddsubpd512_mask" | "__builtin_ia32_vpmadd52huq512_mask" - | "__builtin_ia32_vpmadd52luq512_mask" | "__builtin_ia32_vpmadd52huq256_mask" | "__builtin_ia32_vpmadd52luq256_mask" - | "__builtin_ia32_vpmadd52huq128_mask" - => { + "__builtin_ia32_vfmaddsubps512_mask" | "__builtin_ia32_vfmaddsubpd512_mask" => { let mut new_args = args.to_vec(); let last_arg = new_args.pop().expect("last arg"); let arg4_type = gcc_func.get_param_type(3); @@ -188,7 +185,10 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc | "__builtin_ia32_vpermi2varq256_mask" | "__builtin_ia32_vpermi2varq128_mask" | "__builtin_ia32_vpermi2varps512_mask" | "__builtin_ia32_vpermi2varps256_mask" | "__builtin_ia32_vpermi2varps128_mask" | "__builtin_ia32_vpermi2varpd512_mask" - | "__builtin_ia32_vpermi2varpd256_mask" | "__builtin_ia32_vpermi2varpd128_mask" => { + | "__builtin_ia32_vpermi2varpd256_mask" | "__builtin_ia32_vpermi2varpd128_mask" | "__builtin_ia32_vpmadd52huq512_mask" + | "__builtin_ia32_vpmadd52luq512_mask" | "__builtin_ia32_vpmadd52huq256_mask" | "__builtin_ia32_vpmadd52luq256_mask" + | "__builtin_ia32_vpmadd52huq128_mask" + => { let mut new_args = args.to_vec(); let arg4_type = gcc_func.get_param_type(3); let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1); @@ -513,10 +513,10 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function "llvm.x86.avx512.mask.compress.store.b.512" => "__builtin_ia32_compressstoreuqi512_mask", "llvm.x86.avx512.mask.compress.store.b.256" => "__builtin_ia32_compressstoreuqi256_mask", "llvm.x86.avx512.mask.compress.store.b.128" => "__builtin_ia32_compressstoreuqi128_mask", - "llvm.x86.avx512.mask.compress.w.512" => "__builtin_ia32_compressstoreuhi512_mask", + "llvm.x86.avx512.mask.compress.w.512" => "__builtin_ia32_compresshi512_mask", "llvm.x86.avx512.mask.compress.w.256" => "__builtin_ia32_compresshi256_mask", "llvm.x86.avx512.mask.compress.w.128" => "__builtin_ia32_compresshi128_mask", - "llvm.x86.avx512.mask.compress.b.512" => "__builtin_ia32_compressstoreuqi512_mask", + "llvm.x86.avx512.mask.compress.b.512" => "__builtin_ia32_compressqi512_mask", "llvm.x86.avx512.mask.compress.b.256" => "__builtin_ia32_compressqi256_mask", "llvm.x86.avx512.mask.compress.b.128" => "__builtin_ia32_compressqi128_mask", "llvm.x86.avx512.mask.expand.w.512" => "__builtin_ia32_expandhi512_mask", @@ -674,6 +674,18 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function "llvm.x86.avx512.vpmadd52h.uq.256" => "__builtin_ia32_vpmadd52huq256_mask", "llvm.x86.avx512.vpmadd52l.uq.256" => "__builtin_ia32_vpmadd52luq256_mask", "llvm.x86.avx512.vpmadd52h.uq.128" => "__builtin_ia32_vpmadd52huq128_mask", + "llvm.x86.avx512.vpdpwssd.512" => "__builtin_ia32_vpdpwssd_v16si", + "llvm.x86.avx512.vpdpwssd.256" => "__builtin_ia32_vpdpwssd_v8si", + "llvm.x86.avx512.vpdpwssd.128" => "__builtin_ia32_vpdpwssd_v4si", + "llvm.x86.avx512.vpdpwssds.512" => "__builtin_ia32_vpdpwssds_v16si", + "llvm.x86.avx512.vpdpwssds.256" => "__builtin_ia32_vpdpwssds_v8si", + "llvm.x86.avx512.vpdpwssds.128" => "__builtin_ia32_vpdpwssds_v4si", + "llvm.x86.avx512.vpdpbusd.512" => "__builtin_ia32_vpdpbusd_v16si", + "llvm.x86.avx512.vpdpbusd.256" => "__builtin_ia32_vpdpbusd_v8si", + "llvm.x86.avx512.vpdpbusd.128" => "__builtin_ia32_vpdpbusd_v4si", + "llvm.x86.avx512.vpdpbusds.512" => "__builtin_ia32_vpdpbusds_v16si", + "llvm.x86.avx512.vpdpbusds.256" => "__builtin_ia32_vpdpbusds_v8si", + "llvm.x86.avx512.vpdpbusds.128" => "__builtin_ia32_vpdpbusds_v4si", // NOTE: this file is generated by https://github.com/GuillaumeGomez/llvmint/blob/master/generate_list.py _ => include!("archs.rs"), From 2ba5845c520c3fe6558b7047097a89003284bf9a Mon Sep 17 00:00:00 2001 From: Antoni Boucher Date: Sun, 5 Jun 2022 19:00:34 -0400 Subject: [PATCH 11/12] Update Cargo files --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 0e41bec8b76..211d19a8dc8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,10 +22,10 @@ default = ["master"] master = ["gccjit/master"] [dependencies] -#gccjit = { git = "https://github.com/antoyo/gccjit.rs" } +gccjit = { git = "https://github.com/antoyo/gccjit.rs" } # Local copy. -gccjit = { path = "../gccjit.rs" } +#gccjit = { path = "../gccjit.rs" } target-lexicon = "0.10.0" From ee4755afdbd8123333513931747d295021561b97 Mon Sep 17 00:00:00 2001 From: Antoni Boucher Date: Thu, 9 Jun 2022 21:11:30 -0400 Subject: [PATCH 12/12] Add more SIMD --- Readme.md | 4 ++++ src/base.rs | 12 +++++++++--- src/builder.rs | 8 +++++--- src/intrinsic/llvm.rs | 5 ++++- src/intrinsic/simd.rs | 3 ++- 5 files changed, 24 insertions(+), 8 deletions(-) diff --git a/Readme.md b/Readme.md index fe23a267696..5bbd29fceba 100644 --- a/Readme.md +++ b/Readme.md @@ -127,6 +127,10 @@ To get the `rustc` command to run in `gdb`, add the `--verbose` flag to `cargo b * Build the stage2 compiler (`rustup toolchain link debug-current build/x86_64-unknown-linux-gnu/stage2`). * Clean and rebuild the codegen with `debug-current` in the file `rust-toolchain`. +### How to use [mem-trace](https://github.com/antoyo/mem-trace) + +`rustc` needs to be built without `jemalloc` so that `mem-trace` can overload `malloc` since `jemalloc` is linked statically, so a `LD_PRELOAD`-ed library won't a chance to intercept the calls to `malloc`. + ### How to build a cross-compiling libgccjit #### Building libgccjit diff --git a/src/base.rs b/src/base.rs index e4ecbd46f0c..19c981309d7 100644 --- a/src/base.rs +++ b/src/base.rs @@ -81,11 +81,17 @@ pub fn compile_codegen_unit<'tcx>(tcx: TyCtxt<'tcx>, cgu_name: Symbol, supports_ // TODO(antoyo): only add the following cli argument if the feature is supported. context.add_command_line_option("-msse2"); context.add_command_line_option("-mavx2"); - context.add_command_line_option("-msha"); - context.add_command_line_option("-mpclmul"); // FIXME(antoyo): the following causes an illegal instruction on vmovdqu64 in std_example on my CPU. // Only add if the CPU supports it. - //context.add_command_line_option("-mavx512f"); + /*context.add_command_line_option("-mavx512f"); + context.add_command_line_option("-msha"); + context.add_command_line_option("-mpclmul"); + context.add_command_line_option("-mfma"); + context.add_command_line_option("-mfma4"); + context.add_command_line_option("-mavx512vpopcntdq"); + context.add_command_line_option("-mavx512vl"); + context.add_command_line_option("-m64"); + context.add_command_line_option("-mbmi");*/ for arg in &tcx.sess.opts.cg.llvm_args { context.add_command_line_option(arg); } diff --git a/src/builder.rs b/src/builder.rs index 3e1f56c183a..3804a958e69 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -213,7 +213,7 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> { let actual_ty = actual_val.get_type(); if expected_ty != actual_ty { - if !actual_ty.is_vector() && !expected_ty.is_vector() && actual_ty.is_integral() && expected_ty.is_integral() && actual_ty.get_size() != expected_ty.get_size() { + if !actual_ty.is_vector() && !expected_ty.is_vector() && actual_ty.is_integral() && expected_ty.is_integral() { self.context.new_cast(None, actual_val, expected_ty) } else if on_stack_param_indices.contains(&index) { @@ -1390,18 +1390,20 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> { where F: Fn(RValue<'gcc>, RValue<'gcc>, &'gcc Context<'gcc>) -> RValue<'gcc> { let vector_type = src.get_type().unqualified().dyncast_vector().expect("vector type"); + let element_type = vector_type.get_element_type(); + let mask_element_type = self.type_ix(element_type.get_size() as u64 * 8); let element_count = vector_type.get_num_units(); let mut vector_elements = vec![]; for i in 0..element_count { vector_elements.push(i); } - let mask_type = self.context.new_vector_type(self.int_type, element_count as u64); + let mask_type = self.context.new_vector_type(mask_element_type, element_count as u64); let mut shift = 1; let mut res = src; while shift < element_count { let vector_elements: Vec<_> = vector_elements.iter() - .map(|i| self.context.new_rvalue_from_int(self.int_type, ((i + shift) % element_count) as i32)) + .map(|i| self.context.new_rvalue_from_int(mask_element_type, ((i + shift) % element_count) as i32)) .collect(); let mask = self.context.new_rvalue_from_vector(None, mask_type, &vector_elements); let shifted = self.context.new_rvalue_vector_perm(None, res, res, mask); diff --git a/src/intrinsic/llvm.rs b/src/intrinsic/llvm.rs index 42cf06c8c7a..f2faae07028 100644 --- a/src/intrinsic/llvm.rs +++ b/src/intrinsic/llvm.rs @@ -288,7 +288,10 @@ pub fn adjust_intrinsic_return_value<'a, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc, match func_name { "__builtin_ia32_vfmaddss3_round" | "__builtin_ia32_vfmaddsd3_round" => { let zero = builder.context.new_rvalue_zero(builder.int_type); - return_value = builder.context.new_vector_access(None, return_value, zero).to_rvalue(); + #[cfg(feature="master")] + { + return_value = builder.context.new_vector_access(None, return_value, zero).to_rvalue(); + } }, "__builtin_ia32_addcarryx_u64" | "__builtin_ia32_sbb_u64" | "__builtin_ia32_addcarryx_u32" | "__builtin_ia32_sbb_u32" => { // Both llvm.x86.addcarry.32 and llvm.x86.addcarryx.u32 points to the same GCC builtin, diff --git a/src/intrinsic/simd.rs b/src/intrinsic/simd.rs index bf5d555736a..8f9862414e6 100644 --- a/src/intrinsic/simd.rs +++ b/src/intrinsic/simd.rs @@ -216,7 +216,7 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>, let variable = bx.current_func().new_local(None, vector.get_type(), "new_vector"); bx.llbb().add_assignment(None, variable, vector); let lvalue = bx.context.new_vector_access(None, variable.to_rvalue(), index); - // TODO: si simd_insert est constant, utiliser BIT_REF… + // TODO: if simd_insert is constant, use BIT_REF. bx.llbb().add_assignment(None, lvalue, value); return Ok(variable.to_rvalue()); } @@ -252,6 +252,7 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>, return Ok(bx.vector_select(args[0].immediate(), args[1].immediate(), args[2].immediate())); } + #[cfg(feature="master")] if name == sym::simd_cast { require_simd!(ret_ty, "return"); let (out_len, out_elem) = ret_ty.simd_size_and_type(bx.tcx());