From dc6033477778bd16462d630a2812961b0856461d Mon Sep 17 00:00:00 2001
From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
Date: Sat, 11 Nov 2023 20:59:15 +0000
Subject: [PATCH] Implement AES-NI intrinsics using inline asm

---
 src/abi/mod.rs             |   1 +
 src/inline_asm.rs          |  44 +++++++++--
 src/intrinsics/llvm.rs     |   2 +
 src/intrinsics/llvm_x86.rs | 156 +++++++++++++++++++++++++++++++++++++
 4 files changed, 198 insertions(+), 5 deletions(-)

diff --git a/src/abi/mod.rs b/src/abi/mod.rs
index c4572e03525..0ff1473da43 100644
--- a/src/abi/mod.rs
+++ b/src/abi/mod.rs
@@ -383,6 +383,7 @@ pub(crate) fn codegen_terminator_call<'tcx>(
                 args,
                 ret_place,
                 target,
+                source_info.span,
             );
             return;
         }
diff --git a/src/inline_asm.rs b/src/inline_asm.rs
index 759ee8844fa..25d14319f57 100644
--- a/src/inline_asm.rs
+++ b/src/inline_asm.rs
@@ -645,8 +645,21 @@ impl<'tcx> InlineAssemblyGenerator<'_, 'tcx> {
     ) {
         match arch {
             InlineAsmArch::X86_64 => {
-                write!(generated_asm, "    mov [rbx+0x{:x}], ", offset.bytes()).unwrap();
-                reg.emit(generated_asm, InlineAsmArch::X86_64, None).unwrap();
+                match reg {
+                    InlineAsmReg::X86(reg)
+                        if reg as u32 >= X86InlineAsmReg::xmm0 as u32
+                            && reg as u32 <= X86InlineAsmReg::xmm15 as u32 =>
+                    {
+                        // rustc emits x0 rather than xmm0
+                        write!(generated_asm, "    movups [rbx+0x{:x}], ", offset.bytes()).unwrap();
+                        write!(generated_asm, "xmm{}", reg as u32 - X86InlineAsmReg::xmm0 as u32)
+                            .unwrap();
+                    }
+                    _ => {
+                        write!(generated_asm, "    mov [rbx+0x{:x}], ", offset.bytes()).unwrap();
+                        reg.emit(generated_asm, InlineAsmArch::X86_64, None).unwrap();
+                    }
+                }
                 generated_asm.push('\n');
             }
             InlineAsmArch::AArch64 => {
@@ -671,8 +684,24 @@ impl<'tcx> InlineAssemblyGenerator<'_, 'tcx> {
     ) {
         match arch {
             InlineAsmArch::X86_64 => {
-                generated_asm.push_str("    mov ");
-                reg.emit(generated_asm, InlineAsmArch::X86_64, None).unwrap();
+                match reg {
+                    InlineAsmReg::X86(reg)
+                        if reg as u32 >= X86InlineAsmReg::xmm0 as u32
+                            && reg as u32 <= X86InlineAsmReg::xmm15 as u32 =>
+                    {
+                        // rustc emits x0 rather than xmm0
+                        write!(
+                            generated_asm,
+                            "    movups xmm{}",
+                            reg as u32 - X86InlineAsmReg::xmm0 as u32
+                        )
+                        .unwrap();
+                    }
+                    _ => {
+                        generated_asm.push_str("    mov ");
+                        reg.emit(generated_asm, InlineAsmArch::X86_64, None).unwrap()
+                    }
+                }
                 writeln!(generated_asm, ", [rbx+0x{:x}]", offset.bytes()).unwrap();
             }
             InlineAsmArch::AArch64 => {
@@ -728,7 +757,12 @@ fn call_inline_asm<'tcx>(
     fx.bcx.ins().call(inline_asm_func, &[stack_slot_addr]);
 
     for (offset, place) in outputs {
-        let ty = fx.clif_type(place.layout().ty).unwrap();
+        let ty = if place.layout().ty.is_simd() {
+            let (lane_count, lane_type) = place.layout().ty.simd_size_and_type(fx.tcx);
+            fx.clif_type(lane_type).unwrap().by(lane_count.try_into().unwrap()).unwrap()
+        } else {
+            fx.clif_type(place.layout().ty).unwrap()
+        };
         let value = stack_slot.offset(fx, i32::try_from(offset.bytes()).unwrap().into()).load(
             fx,
             ty,
diff --git a/src/intrinsics/llvm.rs b/src/intrinsics/llvm.rs
index e9b7daf1492..659e6c133ef 100644
--- a/src/intrinsics/llvm.rs
+++ b/src/intrinsics/llvm.rs
@@ -12,6 +12,7 @@ pub(crate) fn codegen_llvm_intrinsic_call<'tcx>(
     args: &[mir::Operand<'tcx>],
     ret: CPlace<'tcx>,
     target: Option<BasicBlock>,
+    span: Span,
 ) {
     if intrinsic.starts_with("llvm.aarch64") {
         return llvm_aarch64::codegen_aarch64_llvm_intrinsic_call(
@@ -31,6 +32,7 @@ pub(crate) fn codegen_llvm_intrinsic_call<'tcx>(
             args,
             ret,
             target,
+            span,
         );
     }
 
diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs
index 75e4850d290..85a2db43fde 100644
--- a/src/intrinsics/llvm_x86.rs
+++ b/src/intrinsics/llvm_x86.rs
@@ -15,6 +15,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
     args: &[mir::Operand<'tcx>],
     ret: CPlace<'tcx>,
     target: Option<BasicBlock>,
+    span: Span,
 ) {
     match intrinsic {
         "llvm.x86.sse2.pause" | "llvm.aarch64.isb" => {
@@ -718,6 +719,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
         }
 
         "llvm.x86.pclmulqdq" => {
+            // FIXME use inline asm
             // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128&ig_expand=772
             intrinsic_args!(fx, args => (a, b, imm8); intrinsic);
 
@@ -779,6 +781,160 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             ret.place_lane(fx, 1).to_ptr().store(fx, res2, MemFlags::trusted());
         }
 
+        "llvm.x86.aesni.aeskeygenassist" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128&ig_expand=261
+            intrinsic_args!(fx, args => (a, _imm8); intrinsic);
+
+            let a = a.load_scalar(fx);
+
+            let imm8 = if let Some(imm8) = crate::constant::mir_operand_get_const_val(fx, &args[1])
+            {
+                imm8
+            } else {
+                fx.tcx.sess.span_fatal(
+                    span,
+                    "Index argument for `_mm_aeskeygenassist_si128` is not a constant",
+                );
+            };
+
+            let imm8 = imm8.try_to_u8().unwrap_or_else(|_| panic!("kind not scalar: {:?}", imm8));
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String(format!("aeskeygenassist xmm0, xmm0, {imm8}"))],
+                &[CInlineAsmOperand::InOut {
+                    reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                    _late: true,
+                    in_value: a,
+                    out_place: Some(ret),
+                }],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
+        "llvm.x86.aesni.aesimc" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128&ig_expand=260
+            intrinsic_args!(fx, args => (a); intrinsic);
+
+            let a = a.load_scalar(fx);
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String("aesimc xmm0, xmm0".to_string())],
+                &[CInlineAsmOperand::InOut {
+                    reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                    _late: true,
+                    in_value: a,
+                    out_place: Some(ret),
+                }],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
+        "llvm.x86.aesni.aesenc" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenc_si128&ig_expand=252
+            intrinsic_args!(fx, args => (a, round_key); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let round_key = round_key.load_scalar(fx);
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String("aesenc xmm0, xmm1".to_string())],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        value: round_key,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
+        "llvm.x86.aesni.aesenclast" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128&ig_expand=257
+            intrinsic_args!(fx, args => (a, round_key); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let round_key = round_key.load_scalar(fx);
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String("aesenclast xmm0, xmm1".to_string())],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        value: round_key,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
+        "llvm.x86.aesni.aesdec" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128&ig_expand=242
+            intrinsic_args!(fx, args => (a, round_key); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let round_key = round_key.load_scalar(fx);
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String("aesdec xmm0, xmm1".to_string())],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        value: round_key,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
+        "llvm.x86.aesni.aesdeclast" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128&ig_expand=247
+            intrinsic_args!(fx, args => (a, round_key); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let round_key = round_key.load_scalar(fx);
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String("aesdeclast xmm0, xmm1".to_string())],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        value: round_key,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
         "llvm.x86.avx.ptestz.256" => {
             // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256&ig_expand=6945
             intrinsic_args!(fx, args => (a, b); intrinsic);