Auto merge of #116609 - eduardosm:bump-stdarch, r=workingjubilee

Bump stdarch submodule and remove special handling for LLVM intrinsics that are no longer needed

Bumps stdarch to pull https://github.com/rust-lang/stdarch/pull/1477, which reimplemented some functions with portable SIMD intrinsics instead of arch specific LLVM intrinsics.

Handling of those LLVM intrinsics is removed from cranelift codegen and miri.

cc `@RalfJung` `@bjorn3`
This commit is contained in:
bors 2023-10-28 11:26:34 +00:00
commit 3089c315b1
6 changed files with 8 additions and 262 deletions

View File

@ -32,41 +32,6 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
ret.write_cvalue(fx, CValue::by_val(res, fx.layout_of(fx.tcx.types.i64)));
}
// Used by `_mm_movemask_epi8` and `_mm256_movemask_epi8`
"llvm.x86.sse2.pmovmskb.128"
| "llvm.x86.avx2.pmovmskb"
| "llvm.x86.sse.movmsk.ps"
| "llvm.x86.sse2.movmsk.pd" => {
intrinsic_args!(fx, args => (a); intrinsic);
let (lane_count, lane_ty) = a.layout().ty.simd_size_and_type(fx.tcx);
let lane_ty = fx.clif_type(lane_ty).unwrap();
assert!(lane_count <= 32);
let mut res = fx.bcx.ins().iconst(types::I32, 0);
for lane in (0..lane_count).rev() {
let a_lane = a.value_lane(fx, lane).load_scalar(fx);
// cast float to int
let a_lane = match lane_ty {
types::F32 => codegen_bitcast(fx, types::I32, a_lane),
types::F64 => codegen_bitcast(fx, types::I64, a_lane),
_ => a_lane,
};
// extract sign bit of an int
let a_lane_sign = fx.bcx.ins().ushr_imm(a_lane, i64::from(lane_ty.bits() - 1));
// shift sign bit into result
let a_lane_sign = clif_intcast(fx, a_lane_sign, types::I32, false);
res = fx.bcx.ins().ishl_imm(res, 1);
res = fx.bcx.ins().bor(res, a_lane_sign);
}
let res = CValue::by_val(res, fx.layout_of(fx.tcx.types.i32));
ret.write_cvalue(fx, res);
}
"llvm.x86.sse.cmp.ps" | "llvm.x86.sse2.cmp.pd" => {
let (x, y, kind) = match args {
[x, y, kind] => (x, y, kind),

@ -1 +1 @@
Subproject commit 333e9e9977188d0748327e9b5be0f3f412063174
Subproject commit f4528dd6e85d97bb802240d7cd048b6e1bf72540

View File

@ -209,25 +209,6 @@ fn emulate_x86_sse_intrinsic(
)?;
}
}
// Used to implement the _mm_movemask_ps function.
// Returns a scalar integer where the i-th bit is the highest
// bit of the i-th component of `op`.
// https://www.felixcloutier.com/x86/movmskps
"movmsk.ps" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, op_len) = this.operand_to_simd(op)?;
let mut res = 0;
for i in 0..op_len {
let op = this.read_scalar(&this.project_index(&op, i)?)?;
let op = op.to_u32()?;
// Extract the highest bit of `op` and place it in the `i`-th bit of `res`
res |= (op >> 31) << i;
}
this.write_scalar(Scalar::from_u32(res), dest)?;
}
_ => return Ok(EmulateForeignItemResult::NotSupported),
}
Ok(EmulateForeignItemResult::NeedsJumping)

View File

@ -1,8 +1,4 @@
use rustc_apfloat::{
ieee::{Double, Single},
Float as _,
};
use rustc_middle::mir;
use rustc_apfloat::ieee::Double;
use rustc_middle::ty::layout::LayoutOf as _;
use rustc_middle::ty::Ty;
use rustc_span::Symbol;
@ -39,49 +35,6 @@ fn emulate_x86_sse2_intrinsic(
// Intrinsincs sufixed with "epiX" or "epuX" operate with X-bit signed or unsigned
// vectors.
match unprefixed_name {
// Used to implement the _mm_avg_epu8 and _mm_avg_epu16 functions.
// Averages packed unsigned 8/16-bit integers in `left` and `right`.
"pavg.b" | "pavg.w" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
assert_eq!(dest_len, right_len);
for i in 0..dest_len {
let left = this.read_immediate(&this.project_index(&left, i)?)?;
let right = this.read_immediate(&this.project_index(&right, i)?)?;
let dest = this.project_index(&dest, i)?;
// Widen the operands to avoid overflow
let twice_wide = this.layout_of(this.get_twice_wide_int_ty(left.layout.ty))?;
let left = this.int_to_int_or_float(&left, twice_wide)?;
let right = this.int_to_int_or_float(&right, twice_wide)?;
// Calculate left + right + 1
let added = this.wrapping_binary_op(mir::BinOp::Add, &left, &right)?;
let added = this.wrapping_binary_op(
mir::BinOp::Add,
&added,
&ImmTy::from_uint(1u32, twice_wide),
)?;
// Calculate (left + right + 1) / 2
let divided = this.wrapping_binary_op(
mir::BinOp::Div,
&added,
&ImmTy::from_uint(2u32, twice_wide),
)?;
// Narrow back to the original type
let res = this.int_to_int_or_float(&divided, dest.layout)?;
this.write_immediate(*res, &dest)?;
}
}
// Used to implement the _mm_madd_epi16 function.
// Multiplies packed signed 16-bit integers in `left` and `right`, producing
// intermediate signed 32-bit integers. Horizontally add adjacent pairs of
@ -118,70 +71,6 @@ fn emulate_x86_sse2_intrinsic(
this.write_scalar(Scalar::from_i32(res), &dest)?;
}
}
// Used to implement the _mm_mulhi_epi16 and _mm_mulhi_epu16 functions.
"pmulh.w" | "pmulhu.w" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
assert_eq!(dest_len, right_len);
for i in 0..dest_len {
let left = this.read_immediate(&this.project_index(&left, i)?)?;
let right = this.read_immediate(&this.project_index(&right, i)?)?;
let dest = this.project_index(&dest, i)?;
// Widen the operands to avoid overflow
let twice_wide = this.layout_of(this.get_twice_wide_int_ty(left.layout.ty))?;
let left = this.int_to_int_or_float(&left, twice_wide)?;
let right = this.int_to_int_or_float(&right, twice_wide)?;
// Multiply
let multiplied = this.wrapping_binary_op(mir::BinOp::Mul, &left, &right)?;
// Keep the high half
let high = this.wrapping_binary_op(
mir::BinOp::Shr,
&multiplied,
&ImmTy::from_uint(dest.layout.size.bits(), twice_wide),
)?;
// Narrow back to the original type
let res = this.int_to_int_or_float(&high, dest.layout)?;
this.write_immediate(*res, &dest)?;
}
}
// Used to implement the _mm_mul_epu32 function.
// Multiplies the the low unsigned 32-bit integers from each packed
// 64-bit element and stores the result as 64-bit unsigned integers.
"pmulu.dq" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
// left and right are u32x4, dest is u64x2
assert_eq!(left_len, 4);
assert_eq!(right_len, 4);
assert_eq!(dest_len, 2);
for i in 0..dest_len {
let op_i = i.checked_mul(2).unwrap();
let left = this.read_scalar(&this.project_index(&left, op_i)?)?.to_u32()?;
let right = this.read_scalar(&this.project_index(&right, op_i)?)?.to_u32()?;
let dest = this.project_index(&dest, i)?;
// The multiplication will not overflow because stripping the
// operands are expanded from 32-bit to 64-bit.
let res = u64::from(left).checked_mul(u64::from(right)).unwrap();
this.write_scalar(Scalar::from_u64(res), &dest)?;
}
}
// Used to implement the _mm_sad_epu8 function.
// Computes the absolute differences of packed unsigned 8-bit integers in `a`
// and `b`, then horizontally sum each consecutive 8 differences to produce
@ -370,25 +259,6 @@ enum ShiftOp {
this.write_scalar(Scalar::from_u64(res), &dest)?;
}
}
// Used to implement the _mm_cvtepi32_ps function.
// Converts packed i32 to packed f32.
// FIXME: Can we get rid of this intrinsic and just use simd_as?
"cvtdq2ps" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, op_len) = this.operand_to_simd(op)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, op_len);
for i in 0..dest_len {
let op = this.read_scalar(&this.project_index(&op, i)?)?.to_i32()?;
let dest = this.project_index(&dest, i)?;
let res = Scalar::from_f32(Single::from_i128(op.into()).value);
this.write_scalar(res, &dest)?;
}
}
// Used to implement the _mm_cvtps_epi32 and _mm_cvttps_epi32 functions.
// Converts packed f32 to packed i32.
"cvtps2dq" | "cvttps2dq" => {
@ -652,31 +522,6 @@ enum ShiftOp {
};
this.write_scalar(Scalar::from_i32(i32::from(res)), dest)?;
}
// Used to implement the _mm_cvtpd_ps and _mm_cvtps_pd functions.
// Converts packed f32/f64 to packed f64/f32.
"cvtpd2ps" | "cvtps2pd" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, op_len) = this.operand_to_simd(op)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
// For cvtpd2ps: op is f64x2, dest is f32x4
// For cvtps2pd: op is f32x4, dest is f64x2
// In either case, the two first values are converted
for i in 0..op_len.min(dest_len) {
let op = this.read_immediate(&this.project_index(&op, i)?)?;
let dest = this.project_index(&dest, i)?;
let res = this.float_to_float_or_int(&op, dest.layout)?;
this.write_immediate(*res, &dest)?;
}
// For f32 -> f64, ignore the remaining
// For f64 -> f32, fill the remaining with zeros
for i in op_len..dest_len {
let dest = this.project_index(&dest, i)?;
this.write_scalar(Scalar::from_int(0, dest.layout.size), &dest)?;
}
}
// Used to implement the _mm_cvtpd_epi32 and _mm_cvttpd_epi32 functions.
// Converts packed f64 to packed i32.
"cvtpd2dq" | "cvttpd2dq" => {
@ -772,25 +617,6 @@ enum ShiftOp {
)?;
}
}
// Used to implement the _mm_movemask_pd function.
// Returns a scalar integer where the i-th bit is the highest
// bit of the i-th component of `op`.
// https://www.felixcloutier.com/x86/movmskpd
"movmsk.pd" => {
let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (op, op_len) = this.operand_to_simd(op)?;
let mut res = 0;
for i in 0..op_len {
let op = this.read_scalar(&this.project_index(&op, i)?)?;
let op = op.to_u64()?;
// Extract the highest bit of `op` and place it in the `i`-th bit of `res`
res |= (op >> 63) << i;
}
this.write_scalar(Scalar::from_u32(res.try_into().unwrap()), dest)?;
}
// Used to implement the `_mm_pause` function.
// The intrinsic is used to hint the processor that the code is in a spin-loop.
"pause" => {

View File

@ -22,32 +22,6 @@ fn emulate_x86_sse3_intrinsic(
let unprefixed_name = link_name.as_str().strip_prefix("llvm.x86.sse3.").unwrap();
match unprefixed_name {
// Used to implement the _mm_addsub_ps and _mm_addsub_pd functions.
// Alternatingly add and subtract floating point (f32 or f64) from
// `left` and `right`
"addsub.ps" | "addsub.pd" => {
let [left, right] =
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
let (left, left_len) = this.operand_to_simd(left)?;
let (right, right_len) = this.operand_to_simd(right)?;
let (dest, dest_len) = this.place_to_simd(dest)?;
assert_eq!(dest_len, left_len);
assert_eq!(dest_len, right_len);
for i in 0..dest_len {
let left = this.read_immediate(&this.project_index(&left, i)?)?;
let right = this.read_immediate(&this.project_index(&right, i)?)?;
let dest = this.project_index(&dest, i)?;
// Even elements are subtracted and odd elements are added.
let op = if i % 2 == 0 { mir::BinOp::Sub } else { mir::BinOp::Add };
let res = this.wrapping_binary_op(op, &left, &right)?;
this.write_immediate(*res, &dest)?;
}
}
// Used to implement the _mm_h{add,sub}_p{s,d} functions.
// Horizontally add/subtract adjacent floating point values
// in `left` and `right`.

View File

@ -117,12 +117,12 @@ unsafe fn test_mm_mul_epu32() {
#[target_feature(enable = "sse2")]
unsafe fn test_mm_sad_epu8() {
#[rustfmt::skip]
let a = _mm_setr_epi8(
255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
1, 2, 3, 4,
155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8,
1, 2, 3, 4,
);
let a = _mm_setr_epi8(
255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
1, 2, 3, 4,
155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8,
1, 2, 3, 4,
);
let b = _mm_setr_epi8(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2);
let r = _mm_sad_epu8(a, b);
let e = _mm_setr_epi64x(1020, 614);