This corresponds to the Java memory model's (JMM's) non-volatile reads and writes to shared variables. It provides fairly weak guarantees, but it does prevent UB: specifically, a load will never observe a value that was not written _at some point_ to the provided location. It is not part of the C++ memory model and is only legal to pass to LLVM for loads and stores (not fences, `atomicrmw`, etc.).
Valid uses of this ordering are things like racy counters, where you don't care about the operations actually being atomic and just want to avoid UB. It cannot be used for synchronization without additional memory barriers, since the optimizer may freely reorder unordered loads and stores (this is the main way it differs from relaxed).
Because it is new to Rust and provides so few guarantees, for now only the intrinsics are provided; this patch doesn't expose the ordering through any of the higher-level atomic wrappers.
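As a sketch of that counter use case, assuming the `atomic_load_unordered`/`atomic_store_unordered` intrinsic names added by this patch (the `HITS` static and `bump` function are hypothetical, for illustration only):

// Requires the unstable intrinsics feature gate for the toolchain in use.
use std::intrinsics::{atomic_load_unordered, atomic_store_unordered};

static mut HITS: usize = 0;

// A racy event counter: the load/store pair is not a single atomic RMW,
// so concurrent bumps can be lost, but unlike a plain non-atomic data
// race this is not UB: every load yields some value that was actually
// stored to HITS at some point.
pub fn bump() {
    unsafe {
        let n = atomic_load_unordered(&HITS as *const usize);
        atomic_store_unordered(&mut HITS as *mut usize, n.wrapping_add(1));
    }
}

Under contention this can under-count; the ordering only guarantees freedom from UB, not atomicity of the read-modify-write.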
This detects (a subset of) the cases where `transmute::<T, U>(x)` can be
lowered to a direct `bitcast T x to U` in LLVM. This helps with
efficiently handling a SIMD vector as multiple different types,
e.g. swapping bytes/words/double-words around inside some larger vector
type.
C compilers like GCC and Clang use a single integer vector type per
register width (e.g. `__m128i`) regardless of element width, and
implicitly insert bitcasts as required. This patch allows Rust to
express the same reinterpretations, even if it takes a bit of `unsafe`,
whereas previously they were impossible without dropping to inline
assembly.
Example:
use std::mem;
use std::simd::{u32x4, u64x2}; // unstable SIMD tuple structs at the time

pub fn reverse_u32s(u: u64x2) -> u64x2 {
    unsafe {
        // View the two 64-bit lanes as four 32-bit lanes (now a bitcast),
        let tmp = mem::transmute::<_, u32x4>(u);
        // reverse those lanes,
        let swapped = u32x4(tmp.3, tmp.2, tmp.1, tmp.0);
        // and view the result as two 64-bit lanes again (also a bitcast).
        mem::transmute::<_, u64x2>(swapped)
    }
}
Compiling with `--opt-level=3` gives:
Before
define <2 x i64> @_ZN12reverse_u32s20hbdb206aba18a03d8tbaE(<2 x i64>) unnamed_addr #0 {
entry-block:
  %1 = bitcast <2 x i64> %0 to i128
  %u.0.extract.trunc = trunc i128 %1 to i32
  %u.4.extract.shift = lshr i128 %1, 32
  %u.4.extract.trunc = trunc i128 %u.4.extract.shift to i32
  %u.8.extract.shift = lshr i128 %1, 64
  %u.8.extract.trunc = trunc i128 %u.8.extract.shift to i32
  %u.12.extract.shift = lshr i128 %1, 96
  %u.12.extract.trunc = trunc i128 %u.12.extract.shift to i32
  %2 = insertelement <4 x i32> undef, i32 %u.12.extract.trunc, i64 0
  %3 = insertelement <4 x i32> %2, i32 %u.8.extract.trunc, i64 1
  %4 = insertelement <4 x i32> %3, i32 %u.4.extract.trunc, i64 2
  %5 = insertelement <4 x i32> %4, i32 %u.0.extract.trunc, i64 3
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}
_ZN12reverse_u32s20hbdb206aba18a03d8tbaE:
    .cfi_startproc
    movd %xmm0, %rax
    punpckhqdq %xmm0, %xmm0
    movd %xmm0, %rcx
    movq %rcx, %rdx
    shrq $32, %rdx
    movq %rax, %rsi
    shrq $32, %rsi
    movd %eax, %xmm0
    movd %ecx, %xmm1
    punpckldq %xmm0, %xmm1
    movd %esi, %xmm2
    movd %edx, %xmm0
    punpckldq %xmm2, %xmm0
    punpckldq %xmm1, %xmm0
    retq
After
define <2 x i64> @_ZN12reverse_u32s20hbdb206aba18a03d8tbaE(<2 x i64>) unnamed_addr #0 {
entry-block:
  %1 = bitcast <2 x i64> %0 to <4 x i32>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}
_ZN12reverse_u32s20hbdb206aba18a03d8tbaE:
    .cfi_startproc
    pshufd $27, %xmm0, %xmm0
    retq