From cc7c9bea1546f9dc07b39feab2d1af776804ee84 Mon Sep 17 00:00:00 2001
From: Andy Sadler
Date: Sat, 7 Oct 2023 23:59:58 -0500
Subject: [PATCH 1/7] implement simd_bswap intrinsic

Implements lane-local byte swapping through vector shuffles.  While this
requires more setup than a non-vector bswap, this implementation can swap
multiple integers concurrently.

Signed-off-by: Andy Sadler
---
 src/intrinsic/simd.rs | 54 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/src/intrinsic/simd.rs b/src/intrinsic/simd.rs
index 85d3e7234a0..3356d6f4a4b 100644
--- a/src/intrinsic/simd.rs
+++ b/src/intrinsic/simd.rs
@@ -1,3 +1,5 @@
+use std::iter::FromIterator;
+
 use gccjit::ToRValue;
 use gccjit::{BinaryOp, RValue, Type};
 #[cfg(feature = "master")]
@@ -156,6 +158,58 @@ macro_rules! require_simd {
         return Ok(compare_simd_types(bx, arg1, arg2, in_elem, llret_ty, cmp_op));
     }
 
+    if name == sym::simd_bswap {
+        let vector = args[0].immediate();
+        let ret = match in_elem.kind() {
+            ty::Int(i) if i.bit_width() == Some(8) => vector,
+            ty::Uint(i) if i.bit_width() == Some(8) => vector,
+            ty::Int(_) | ty::Uint(_) => {
+                let v_type = vector.get_type();
+                let vector_type = v_type.unqualified().dyncast_vector().expect("vector type");
+                let elem_type = vector_type.get_element_type();
+                let elem_size_bytes = elem_type.get_size();
+                let type_size_bytes = elem_size_bytes as u64 * in_len;
+
+                let shuffle_indices = Vec::from_iter(0..type_size_bytes);
+                let byte_vector_type = bx.context.new_vector_type(bx.type_u8(), type_size_bytes);
+                let byte_vector = bx.context.new_bitcast(None, args[0].immediate(), byte_vector_type);
+
+                #[cfg(not(feature = "master"))]
+                let shuffled = {
+                    let new_elements: Vec<_> = shuffle_indices.chunks_exact(elem_size_bytes as _)
+                        .flat_map(|x| x.iter().rev())
+                        .map(|&i| {
+                            let index = bx.context.new_rvalue_from_long(bx.u64_type, i as _);
+                            bx.extract_element(byte_vector, index)
+                        })
+                        .collect();
+
+                    bx.context.new_rvalue_from_vector(None, byte_vector_type, &new_elements)
+                };
+                #[cfg(feature = "master")]
+                let shuffled = {
+                    let indices: Vec<_> = shuffle_indices.chunks_exact(elem_size_bytes as _)
+                        .flat_map(|x| x.iter().rev())
+                        .map(|&i| bx.context.new_rvalue_from_int(bx.u8_type, i as _))
+                        .collect();
+
+                    let mask = bx.context.new_rvalue_from_vector(None, byte_vector_type, &indices);
+                    bx.context.new_rvalue_vector_perm(None, byte_vector, byte_vector, mask)
+                };
+                bx.context.new_bitcast(None, shuffled, v_type)
+            }
+            _ => {
+                return_error!(InvalidMonomorphization::UnsupportedOperation {
+                    span,
+                    name,
+                    in_ty,
+                    in_elem,
+                });
+            }
+        };
+        return Ok(ret);
+    }
+
     if name == sym::simd_shuffle {
         // Make sure this is actually an array, since typeck only checks the length-suffixed
         // version of this intrinsic.
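
For reference, the lane-local semantics this patch lowers can be modeled in
plain Rust.  The sketch below is illustrative only and not part of the patch;
`simd_bswap_reference` is a hypothetical name:

    // Reference model for simd_bswap on a 4 x u32 vector: every lane is
    // byte-swapped independently, which is what the byte shuffle above encodes.
    fn simd_bswap_reference(v: [u32; 4]) -> [u32; 4] {
        v.map(u32::swap_bytes)
    }

    fn main() {
        let v = [0x1122_3344_u32, 0x5566_7788, 0x99AA_BBCC, 0xDDEE_FF00];
        assert_eq!(
            simd_bswap_reference(v),
            [0x4433_2211, 0x8877_6655, 0xCCBB_AA99, 0x00FF_EEDD]
        );
    }
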
From 6d13f949ee67dbed3ee9a1da5533d3a55c03e774 Mon Sep 17 00:00:00 2001
From: Andy Sadler
Date: Thu, 9 Nov 2023 19:49:06 -0600
Subject: [PATCH 2/7] remove generic-bswap-byte from failing test list

Signed-off-by: Andy Sadler
---
 failing-ui-tests.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/failing-ui-tests.txt b/failing-ui-tests.txt
index 22044eabe96..4b2c3f64a17 100644
--- a/failing-ui-tests.txt
+++ b/failing-ui-tests.txt
@@ -57,7 +57,6 @@ tests/ui/coroutine/panic-safe.rs
 tests/ui/issues/issue-14875.rs
 tests/ui/issues/issue-29948.rs
 tests/ui/panics/nested_panic_caught.rs
-tests/ui/simd/intrinsic/generic-bswap-byte.rs
 tests/ui/const_prop/ice-issue-111353.rs
 tests/ui/process/println-with-broken-pipe.rs
 tests/ui/panic-runtime/lto-abort.rs

From 70586a23a7fbe5b78752438587db86678f53ee2f Mon Sep 17 00:00:00 2001
From: Andy Sadler
Date: Sun, 8 Oct 2023 00:13:10 -0500
Subject: [PATCH 3/7] fix simd_frem intrinsic implementation

The simd intrinsic handler was delegating implementation of `simd_frem`
to `Builder::frem`, which wasn't able to handle vector-typed inputs.  To
fix this, teach this method how to handle vector inputs.

Signed-off-by: Andy Sadler
---
 src/builder.rs | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/builder.rs b/src/builder.rs
index b8a8c144dc9..4ae56a41e52 100644
--- a/src/builder.rs
+++ b/src/builder.rs
@@ -606,12 +606,29 @@ fn frem(&mut self, a: RValue<'gcc>, b: RValue<'gcc>) -> RValue<'gcc> {
             // ../../../gcc/gcc/cfgexpand.cc:6069
             // 0x7f0101bf9194 execute
             // ../../../gcc/gcc/cfgexpand.cc:6795
-        if a.get_type().is_compatible_with(self.cx.float_type) {
+        let a_type = a.get_type();
+        let a_type_unqualified = a_type.unqualified();
+        if a_type.is_compatible_with(self.cx.float_type) {
             let fmodf = self.context.get_builtin_function("fmodf");
             // FIXME(antoyo): this seems to produce the wrong result.
             return self.context.new_call(None, fmodf, &[a, b]);
         }
-        assert_eq!(a.get_type().unqualified(), self.cx.double_type);
+        else if let Some(vector_type) = a_type_unqualified.dyncast_vector() {
+            assert_eq!(a_type_unqualified, b.get_type().unqualified());
+
+            let num_units = vector_type.get_num_units();
+            let new_elements: Vec<_> = (0..num_units)
+                .map(|i| {
+                    let index = self.context.new_rvalue_from_long(self.cx.type_u32(), i as _);
+                    let x = self.extract_element(a, index).to_rvalue();
+                    let y = self.extract_element(b, index).to_rvalue();
+                    self.frem(x, y)
+                })
+                .collect();
+
+            return self.context.new_rvalue_from_vector(None, a_type, &new_elements);
+        }
+        assert_eq!(a_type_unqualified, self.cx.double_type);
 
         let fmod = self.context.get_builtin_function("fmod");
         return self.context.new_call(None, fmod, &[a, b]);

From 8d42a82b6e86b30e9a18cd12e2a89fd7b261bdd3 Mon Sep 17 00:00:00 2001
From: Andy Sadler
Date: Sun, 8 Oct 2023 18:49:16 -0500
Subject: [PATCH 4/7] impl simd_bitreverse intrinsic

If we're running against a patched libgccjit, use an algorithm similar
to what LLVM uses for this intrinsic.  Otherwise, fall back to a
per-element bitreverse.
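
The patched-libgccjit path reverses each byte via two 16-entry tables of
pre-reversed nibbles; its effect is equivalent to the following scalar
model (an illustrative sketch only, not part of the patch;
`bitreverse_u32_reference` is a hypothetical name):

    // Byte-swap first, then reverse each byte by looking up its two nibbles
    // in tables of pre-reversed nibble values.
    fn bitreverse_u32_reference(x: u32) -> u32 {
        // hi_table[n]: nibble n reversed into the high nibble of a byte.
        // lo_table[n]: nibble n reversed into the low nibble of a byte.
        let hi_table: [u8; 16] = core::array::from_fn(|n| (n as u8).reverse_bits());
        let lo_table: [u8; 16] = core::array::from_fn(|n| (n as u8).reverse_bits() >> 4);
        let mut bytes = x.swap_bytes().to_le_bytes();
        for b in &mut bytes {
            *b = hi_table[(*b & 0x0f) as usize] | lo_table[(*b >> 4) as usize];
        }
        u32::from_le_bytes(bytes)
    }

    fn main() {
        for x in [0u32, 1, 0x8000_0000, 0xDEAD_BEEF] {
            assert_eq!(bitreverse_u32_reference(x), x.reverse_bits());
        }
    }
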
Signed-off-by: Andy Sadler
---
 src/intrinsic/simd.rs | 215 +++++++++++++++++++++++++++++++++---------
 1 file changed, 169 insertions(+), 46 deletions(-)

diff --git a/src/intrinsic/simd.rs b/src/intrinsic/simd.rs
index 3356d6f4a4b..2469e8d4c62 100644
--- a/src/intrinsic/simd.rs
+++ b/src/intrinsic/simd.rs
@@ -23,6 +23,8 @@
 use crate::builder::Builder;
 #[cfg(feature = "master")]
 use crate::context::CodegenCx;
+#[cfg(not(feature = "master"))]
+use crate::common::SignType;
 
 pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(
     bx: &mut Builder<'a, 'gcc, 'tcx>,
@@ -158,56 +160,177 @@ macro_rules! require_simd {
         return Ok(compare_simd_types(bx, arg1, arg2, in_elem, llret_ty, cmp_op));
     }
 
-    if name == sym::simd_bswap {
-        let vector = args[0].immediate();
-        let ret = match in_elem.kind() {
-            ty::Int(i) if i.bit_width() == Some(8) => vector,
-            ty::Uint(i) if i.bit_width() == Some(8) => vector,
-            ty::Int(_) | ty::Uint(_) => {
-                let v_type = vector.get_type();
-                let vector_type = v_type.unqualified().dyncast_vector().expect("vector type");
-                let elem_type = vector_type.get_element_type();
-                let elem_size_bytes = elem_type.get_size();
-                let type_size_bytes = elem_size_bytes as u64 * in_len;
+    let simd_bswap = |bx: &mut Builder<'a, 'gcc, 'tcx>, vector: RValue<'gcc>| -> RValue<'gcc> {
+        let v_type = vector.get_type();
+        let vector_type = v_type.unqualified().dyncast_vector().expect("vector type");
+        let elem_type = vector_type.get_element_type();
+        let elem_size_bytes = elem_type.get_size();
+        if elem_size_bytes == 1 {
+            return vector;
+        }
 
-                let shuffle_indices = Vec::from_iter(0..type_size_bytes);
-                let byte_vector_type = bx.context.new_vector_type(bx.type_u8(), type_size_bytes);
-                let byte_vector = bx.context.new_bitcast(None, args[0].immediate(), byte_vector_type);
+        let type_size_bytes = elem_size_bytes as u64 * in_len;
+        let shuffle_indices = Vec::from_iter(0..type_size_bytes);
+        let byte_vector_type = bx.context.new_vector_type(bx.type_u8(), type_size_bytes);
+        let byte_vector = bx.context.new_bitcast(None, vector, byte_vector_type);
 
-                #[cfg(not(feature = "master"))]
-                let shuffled = {
-                    let new_elements: Vec<_> = shuffle_indices.chunks_exact(elem_size_bytes as _)
-                        .flat_map(|x| x.iter().rev())
-                        .map(|&i| {
-                            let index = bx.context.new_rvalue_from_long(bx.u64_type, i as _);
-                            bx.extract_element(byte_vector, index)
-                        })
-                        .collect();
+        #[cfg(not(feature = "master"))]
+        let shuffled = {
+            let new_elements: Vec<_> = shuffle_indices.chunks_exact(elem_size_bytes as _)
+                .flat_map(|x| x.iter().rev())
+                .map(|&i| {
+                    let index = bx.context.new_rvalue_from_long(bx.u64_type, i as _);
+                    bx.extract_element(byte_vector, index)
+                })
+                .collect();
 
-                    bx.context.new_rvalue_from_vector(None, byte_vector_type, &new_elements)
-                };
-                #[cfg(feature = "master")]
-                let shuffled = {
-                    let indices: Vec<_> = shuffle_indices.chunks_exact(elem_size_bytes as _)
-                        .flat_map(|x| x.iter().rev())
-                        .map(|&i| bx.context.new_rvalue_from_int(bx.u8_type, i as _))
-                        .collect();
-
-                    let mask = bx.context.new_rvalue_from_vector(None, byte_vector_type, &indices);
-                    bx.context.new_rvalue_vector_perm(None, byte_vector, byte_vector, mask)
-                };
-                bx.context.new_bitcast(None, shuffled, v_type)
-            }
-            _ => {
-                return_error!(InvalidMonomorphization::UnsupportedOperation {
-                    span,
-                    name,
-                    in_ty,
-                    in_elem,
-                });
-            }
-        };
-        return Ok(ret);
-    }
+            bx.context.new_rvalue_from_vector(None, byte_vector_type, &new_elements)
+        };
+        #[cfg(feature = "master")]
+        let shuffled = {
+            let indices: Vec<_> = shuffle_indices.chunks_exact(elem_size_bytes as _)
+                .flat_map(|x| x.iter().rev())
+                .map(|&i| bx.context.new_rvalue_from_int(bx.u8_type, i as _))
+                .collect();
+
+            let mask = bx.context.new_rvalue_from_vector(None, byte_vector_type, &indices);
+            bx.context.new_rvalue_vector_perm(None, byte_vector, byte_vector, mask)
+        };
+        bx.context.new_bitcast(None, shuffled, v_type)
+    };
+
+    if name == sym::simd_bswap || name == sym::simd_bitreverse {
+        require!(
+            bx.type_kind(bx.element_type(llret_ty)) == TypeKind::Integer,
+            InvalidMonomorphization::UnsupportedOperation {
+                span,
+                name,
+                in_ty,
+                in_elem,
+            }
+        );
+    }
+
+    if name == sym::simd_bswap {
+        return Ok(simd_bswap(bx, args[0].immediate()));
+    }
+
+    // We use a different algorithm from non-vector bitreverse to take advantage of most
+    // processors' vector shuffle units.  It works like this:
+    // 1. Generate pre-reversed low and high nibbles as a vector.
+    // 2. Byte-swap the input.
+    // 3. Mask off the low and high nibbles of each byte in the byte-swapped input.
+    // 4. Shuffle the pre-reversed low and high nibbles using the masked nibbles as a shuffle mask.
+    // 5. Combine the results of the shuffle back together and cast back to the original type.
+    #[cfg(feature = "master")]
+    if name == sym::simd_bitreverse {
+        let vector = args[0].immediate();
+        let v_type = vector.get_type();
+        let vector_type = v_type.unqualified().dyncast_vector().expect("vector type");
+        let elem_type = vector_type.get_element_type();
+        let elem_size_bytes = elem_type.get_size();
+
+        let type_size_bytes = elem_size_bytes as u64 * in_len;
+        // We need to ensure at least 16 entries in our vector type, since the pre-reversed vectors
+        // we generate below have 16 entries in them.  `new_rvalue_vector_perm` requires the mask
+        // vector to be of the same length as the source vectors.
+        let byte_vector_type_size = type_size_bytes.max(16);
+
+        let byte_vector_type = bx.context.new_vector_type(bx.u8_type, type_size_bytes);
+        let long_byte_vector_type = bx.context.new_vector_type(bx.u8_type, byte_vector_type_size);
+
+        // Step 1: Generate pre-reversed low and high nibbles as a vector.
+        let zero_byte = bx.context.new_rvalue_zero(bx.u8_type);
+        let hi_nibble_elements: Vec<_> = (0u8..16)
+            .map(|x| bx.context.new_rvalue_from_int(bx.u8_type, x.reverse_bits() as _))
+            .chain((16..byte_vector_type_size).map(|_| zero_byte))
+            .collect();
+        let hi_nibble = bx.context.new_rvalue_from_vector(None, long_byte_vector_type, &hi_nibble_elements);
+
+        let lo_nibble_elements: Vec<_> = (0u8..16)
+            .map(|x| bx.context.new_rvalue_from_int(bx.u8_type, (x.reverse_bits() >> 4) as _))
+            .chain((16..byte_vector_type_size).map(|_| zero_byte))
+            .collect();
+        let lo_nibble = bx.context.new_rvalue_from_vector(None, long_byte_vector_type, &lo_nibble_elements);
+
+        let mask = bx.context.new_rvalue_from_vector(
+            None,
+            long_byte_vector_type,
+            &vec![bx.context.new_rvalue_from_int(bx.u8_type, 0x0f); byte_vector_type_size as _]);
+
+        let four_vec = bx.context.new_rvalue_from_vector(
+            None,
+            long_byte_vector_type,
+            &vec![bx.context.new_rvalue_from_int(bx.u8_type, 4); byte_vector_type_size as _]);
+
+        // Step 2: Byte-swap the input.
+        let swapped = simd_bswap(bx, args[0].immediate());
+        let byte_vector = bx.context.new_bitcast(None, swapped, byte_vector_type);
+
+        // We're going to need to extend the vector with zeros to make sure that the types are the
+        // same, since that's what new_rvalue_vector_perm expects.
+        let byte_vector = if byte_vector_type_size > type_size_bytes {
+            let mut byte_vector_elements = Vec::with_capacity(byte_vector_type_size as _);
+            for i in 0..type_size_bytes {
+                let idx = bx.context.new_rvalue_from_int(bx.u32_type, i as _);
+                let val = bx.extract_element(byte_vector, idx);
+                byte_vector_elements.push(val);
+            }
+            for _ in type_size_bytes..byte_vector_type_size {
+                byte_vector_elements.push(zero_byte);
+            }
+            bx.context.new_rvalue_from_vector(None, long_byte_vector_type, &byte_vector_elements)
+        } else {
+            bx.context.new_bitcast(None, byte_vector, long_byte_vector_type)
+        };
+
+        // Step 3: Mask off the low and high nibbles of each byte in the byte-swapped input.
+        let masked_hi = (byte_vector >> four_vec) & mask;
+        let masked_lo = byte_vector & mask;
+
+        // Step 4: Shuffle the pre-reversed low and high nibbles using the masked nibbles as a shuffle mask.
+        let hi = bx.context.new_rvalue_vector_perm(None, hi_nibble, hi_nibble, masked_lo);
+        let lo = bx.context.new_rvalue_vector_perm(None, lo_nibble, lo_nibble, masked_hi);
+
+        // Step 5: Combine the results of the shuffle back together and cast back to the original type.
+        let result = hi | lo;
+        let cast_ty = bx.context.new_vector_type(elem_type, byte_vector_type_size / (elem_size_bytes as u64));
+
+        // We might need to truncate if sizeof(v_type) < sizeof(cast_ty).
+        if type_size_bytes < byte_vector_type_size {
+            let cast_result = bx.context.new_bitcast(None, result, cast_ty);
+            let elems: Vec<_> = (0..in_len)
+                .map(|i| {
+                    let idx = bx.context.new_rvalue_from_int(bx.u32_type, i as _);
+                    bx.extract_element(cast_result, idx)
+                })
+                .collect();
+            return Ok(bx.context.new_rvalue_from_vector(None, v_type, &elems));
+        } else {
+            // Avoid the unnecessary truncation as an optimization.
+            return Ok(bx.context.new_bitcast(None, result, v_type));
+        }
+    }
+    // Since gcc doesn't have vector shuffle methods available in non-patched builds, fall back
+    // to component-wise bitreverses in that case.
+    #[cfg(not(feature = "master"))]
+    if name == sym::simd_bitreverse {
+        let vector = args[0].immediate();
+        let vector_ty = vector.get_type();
+        let vector_type = vector_ty.unqualified().dyncast_vector().expect("vector type");
+        let num_elements = vector_type.get_num_units();
+
+        let elem_type = vector_type.get_element_type();
+        let elem_size_bytes = elem_type.get_size();
+        let num_type = elem_type.to_unsigned(bx.cx);
+        let new_elements: Vec<_> = (0..num_elements)
+            .map(|idx| {
+                let index = bx.context.new_rvalue_from_long(num_type, idx as _);
+                let extracted_value = bx.extract_element(vector, index).to_rvalue();
+                bx.bit_reverse(elem_size_bytes as u64 * 8, extracted_value)
+            })
+            .collect();
+        return Ok(bx.context.new_rvalue_from_vector(None, vector_ty, &new_elements));
+    }
 
     if name == sym::simd_shuffle {
         // Make sure this is actually an array, since typeck only checks the length-suffixed
         // version of this intrinsic.

From 03e11a214e9b6295bb06a53f849b117e75986cf6 Mon Sep 17 00:00:00 2001
From: Andy Sadler
Date: Thu, 19 Oct 2023 20:01:22 -0500
Subject: [PATCH 5/7] impl simd_ctlz/simd_cttz intrinsics

Signed-off-by: Andy Sadler
---
 src/intrinsic/simd.rs | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/intrinsic/simd.rs b/src/intrinsic/simd.rs
index 2469e8d4c62..5991f061c10 100644
--- a/src/intrinsic/simd.rs
+++ b/src/intrinsic/simd.rs
@@ -333,6 +333,22 @@ macro_rules! require_simd {
         return Ok(bx.context.new_rvalue_from_vector(None, vector_ty, &new_elements));
     }
 
+    if name == sym::simd_ctlz || name == sym::simd_cttz {
+        let vector = args[0].immediate();
+        let elements: Vec<_> = (0..in_len)
+            .map(|i| {
+                let index = bx.context.new_rvalue_from_long(bx.i32_type, i as i64);
+                let value = bx.extract_element(vector, index).to_rvalue();
+                if name == sym::simd_ctlz {
+                    bx.count_leading_zeroes(value.get_type().get_size() as u64 * 8, value)
+                } else {
+                    bx.count_trailing_zeroes(value.get_type().get_size() as u64 * 8, value)
+                }
+            })
+            .collect();
+        return Ok(bx.context.new_rvalue_from_vector(None, vector.get_type(), &elements));
+    }
+
     if name == sym::simd_shuffle {
         // Make sure this is actually an array, since typeck only checks the length-suffixed
         // version of this intrinsic.

From 3a221320eb49bf464fd332c2be244ca3a783d1ac Mon Sep 17 00:00:00 2001
From: Andy Sadler
Date: Thu, 9 Nov 2023 19:07:44 -0600
Subject: [PATCH 6/7] fix simd_neg implementation for ints

gcc_not would panic upon encountering a vector type, which is not what
we want here.

Signed-off-by: Andy Sadler
---
 src/int.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/int.rs b/src/int.rs
index 9b9b3ea4f87..b69e073c4d9 100644
--- a/src/int.rs
+++ b/src/int.rs
@@ -48,7 +48,7 @@ pub fn gcc_not(&self, a: RValue<'gcc>) -> RValue<'gcc> {
 
     pub fn gcc_neg(&self, a: RValue<'gcc>) -> RValue<'gcc> {
         let a_type = a.get_type();
-        if self.is_native_int_type(a_type) {
+        if self.is_native_int_type(a_type) || a_type.is_vector() {
             self.cx.context.new_unary_op(None, UnaryOp::Minus, a.get_type(), a)
         }
         else {

From 17b2c46c8896adf07ae8f70b0d8b70227b5f4c71 Mon Sep 17 00:00:00 2001
From: Andy Sadler
Date: Thu, 9 Nov 2023 19:49:34 -0600
Subject: [PATCH 7/7] remove generic-arithmetic-pass from failing tests

This test now passes when tested with a patched libgccjit.  However,
due to [some compiler bugs][1], we can't enable this for non-patched
libgccjit yet.

[1]: https://github.com/sadlerap/rustc_codegen_gcc/actions/runs/6820180639/job/18548672444#step:15:4375

Signed-off-by: Andy Sadler
---
 failing-ui-tests.txt   | 1 -
 failing-ui-tests12.txt | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/failing-ui-tests.txt b/failing-ui-tests.txt
index 4b2c3f64a17..023fe9d7e83 100644
--- a/failing-ui-tests.txt
+++ b/failing-ui-tests.txt
@@ -13,7 +13,6 @@ tests/ui/sepcomp/sepcomp-extern.rs
 tests/ui/sepcomp/sepcomp-fns-backwards.rs
 tests/ui/sepcomp/sepcomp-fns.rs
 tests/ui/sepcomp/sepcomp-statics.rs
-tests/ui/simd/intrinsic/generic-arithmetic-pass.rs
 tests/ui/asm/x86_64/may_unwind.rs
 tests/ui/backtrace.rs
 tests/ui/catch-unwind-bang.rs
diff --git a/failing-ui-tests12.txt b/failing-ui-tests12.txt
index 24ef7bb8d70..fef259c53f4 100644
--- a/failing-ui-tests12.txt
+++ b/failing-ui-tests12.txt
@@ -9,6 +9,7 @@ tests/ui/packed/packed-struct-vec.rs
 tests/ui/packed/packed-tuple-struct-layout.rs
 tests/ui/simd/array-type.rs
 tests/ui/simd/intrinsic/float-minmax-pass.rs
+tests/ui/simd/intrinsic/generic-arithmetic-pass.rs
 tests/ui/simd/intrinsic/generic-arithmetic-saturating-pass.rs
 tests/ui/simd/intrinsic/generic-as.rs
 tests/ui/simd/intrinsic/generic-cast-pass.rs
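
For completeness, the lane-wise behavior lowered by the simd_ctlz/simd_cttz
patch matches the following scalar model (an illustrative sketch only, not
part of the series; the function names are hypothetical):

    // Each lane receives an independent count of leading/trailing zeros.
    fn simd_ctlz_reference(v: [u32; 4]) -> [u32; 4] {
        v.map(u32::leading_zeros)
    }

    fn simd_cttz_reference(v: [u32; 4]) -> [u32; 4] {
        v.map(u32::trailing_zeros)
    }

    fn main() {
        let v = [0x0000_0001, 0x8000_0000, 0x0000_0000, 0x00F0_0000];
        assert_eq!(simd_ctlz_reference(v), [31, 0, 32, 8]);
        assert_eq!(simd_cttz_reference(v), [0, 31, 32, 20]);
    }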