From 465dcf132083022f893de32f6b50ff6b7665055f Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Sat, 20 Apr 2024 11:18:50 +0200
Subject: [PATCH 01/28] global allocations: don't make up a super-high
 VectorIdx, just use the main thread

---
 src/tools/miri/src/concurrency/data_race.rs   | 24 +++++++++----------
 src/tools/miri/src/concurrency/thread.rs      | 10 +++++---
 .../miri/src/concurrency/vector_clock.rs      |  6 ++---
 3 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/src/tools/miri/src/concurrency/data_race.rs b/src/tools/miri/src/concurrency/data_race.rs
index 2281609a049..bf147093fff 100644
--- a/src/tools/miri/src/concurrency/data_race.rs
+++ b/src/tools/miri/src/concurrency/data_race.rs
@@ -847,6 +847,7 @@ impl VClockAlloc {
         kind: MemoryKind,
         current_span: Span,
     ) -> VClockAlloc {
+        // Determine the thread that did the allocation, and when it did it.
         let (alloc_timestamp, alloc_index) = match kind {
             // User allocated and stack memory should track allocation.
             MemoryKind::Machine(
@@ -864,7 +865,7 @@ impl VClockAlloc {
                 (alloc_timestamp, alloc_index)
             }
             // Other global memory should trace races but be allocated at the 0 timestamp
-            // (conceptually they are allocated before everything).
+            // (conceptually they are allocated on the main thread before everything).
             MemoryKind::Machine(
                 MiriMemoryKind::Global
                 | MiriMemoryKind::Machine
@@ -872,7 +873,8 @@ impl VClockAlloc {
                 | MiriMemoryKind::ExternStatic
                 | MiriMemoryKind::Tls,
             )
-            | MemoryKind::CallerLocation => (VTimestamp::ZERO, VectorIdx::MAX_INDEX),
+            | MemoryKind::CallerLocation =>
+                (VTimestamp::ZERO, global.thread_index(ThreadId::MAIN_THREAD)),
         };
         VClockAlloc {
             alloc_ranges: RefCell::new(RangeMap::new(
@@ -1454,7 +1456,7 @@ impl GlobalState {
         // Setup the main-thread since it is not explicitly created:
         // uses vector index and thread-id 0.
         let index = global_state.vector_clocks.get_mut().push(ThreadClockSet::default());
-        global_state.vector_info.get_mut().push(ThreadId::new(0));
+        global_state.vector_info.get_mut().push(ThreadId::MAIN_THREAD);
         global_state
             .thread_info
             .get_mut()
@@ -1725,13 +1727,15 @@ impl GlobalState {
         Ref::map(clocks, |c| &c.clock)
     }
 
+    fn thread_index(&self, thread: ThreadId) -> VectorIdx {
+        self.thread_info.borrow()[thread].vector_index.expect("thread has no assigned vector")
+    }
+
     /// Load the vector index used by the given thread as well as the set of vector clocks
     /// used by the thread.
     #[inline]
     fn thread_state_mut(&self, thread: ThreadId) -> (VectorIdx, RefMut<'_, ThreadClockSet>) {
-        let index = self.thread_info.borrow()[thread]
-            .vector_index
-            .expect("Loading thread state for thread with no assigned vector");
+        let index = self.thread_index(thread);
         let ref_vector = self.vector_clocks.borrow_mut();
         let clocks = RefMut::map(ref_vector, |vec| &mut vec[index]);
         (index, clocks)
@@ -1741,9 +1745,7 @@ impl GlobalState {
     /// used by the thread.
     #[inline]
     fn thread_state(&self, thread: ThreadId) -> (VectorIdx, Ref<'_, ThreadClockSet>) {
-        let index = self.thread_info.borrow()[thread]
-            .vector_index
-            .expect("Loading thread state for thread with no assigned vector");
+        let index = self.thread_index(thread);
         let ref_vector = self.vector_clocks.borrow();
         let clocks = Ref::map(ref_vector, |vec| &vec[index]);
         (index, clocks)
@@ -1774,9 +1776,7 @@ impl GlobalState {
     #[inline]
     fn current_index(&self, thread_mgr: &ThreadManager<'_, '_>) -> VectorIdx {
         let active_thread_id = thread_mgr.get_active_thread_id();
-        self.thread_info.borrow()[active_thread_id]
-            .vector_index
-            .expect("active thread has no assigned vector")
+        self.thread_index(active_thread_id)
     }
 
     // SC ATOMIC STORE rule in the paper.
diff --git a/src/tools/miri/src/concurrency/thread.rs b/src/tools/miri/src/concurrency/thread.rs
index 2fabd39a744..0116bd0281a 100644
--- a/src/tools/miri/src/concurrency/thread.rs
+++ b/src/tools/miri/src/concurrency/thread.rs
@@ -57,6 +57,8 @@ impl ThreadId {
     pub fn to_u32(self) -> u32 {
         self.0
     }
+
+    pub const MAIN_THREAD: ThreadId = ThreadId(0);
 }
 
 impl Idx for ThreadId {
@@ -401,7 +403,7 @@ impl<'mir, 'tcx> Default for ThreadManager<'mir, 'tcx> {
         // Create the main thread and add it to the list of threads.
         threads.push(Thread::new(Some("main"), None));
         Self {
-            active_thread: ThreadId::new(0),
+            active_thread: ThreadId::MAIN_THREAD,
             threads,
             sync: SynchronizationState::default(),
             thread_local_alloc_ids: Default::default(),
@@ -416,10 +418,12 @@ impl<'mir, 'tcx: 'mir> ThreadManager<'mir, 'tcx> {
         ecx: &mut MiriInterpCx<'mir, 'tcx>,
         on_main_stack_empty: StackEmptyCallback<'mir, 'tcx>,
     ) {
-        ecx.machine.threads.threads[ThreadId::new(0)].on_stack_empty = Some(on_main_stack_empty);
+        ecx.machine.threads.threads[ThreadId::MAIN_THREAD].on_stack_empty =
+            Some(on_main_stack_empty);
         if ecx.tcx.sess.target.os.as_ref() != "windows" {
             // The main thread can *not* be joined on except on windows.
-            ecx.machine.threads.threads[ThreadId::new(0)].join_status = ThreadJoinStatus::Detached;
+            ecx.machine.threads.threads[ThreadId::MAIN_THREAD].join_status =
+                ThreadJoinStatus::Detached;
         }
     }
 
diff --git a/src/tools/miri/src/concurrency/vector_clock.rs b/src/tools/miri/src/concurrency/vector_clock.rs
index 2cd3d031b1e..f86f06a9d22 100644
--- a/src/tools/miri/src/concurrency/vector_clock.rs
+++ b/src/tools/miri/src/concurrency/vector_clock.rs
@@ -13,15 +13,13 @@ use super::data_race::NaReadType;
 /// but in some cases one vector index may be shared with
 /// multiple thread ids if it's safe to do so.
 #[derive(Clone, Copy, Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
-pub struct VectorIdx(u32);
+pub(super) struct VectorIdx(u32);
 
 impl VectorIdx {
     #[inline(always)]
-    pub fn to_u32(self) -> u32 {
+    fn to_u32(self) -> u32 {
         self.0
     }
-
-    pub const MAX_INDEX: VectorIdx = VectorIdx(u32::MAX);
 }
 
 impl Idx for VectorIdx {

From b562faa8c8a23d8c7021104bdd910d63b4e693e6 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Sat, 20 Apr 2024 11:34:41 +0200
Subject: [PATCH 02/28] more consistently talk about the 'active thread', not
 the 'current thread'

---
 src/tools/miri/src/concurrency/data_race.rs   | 44 +++++++++----------
 .../miri/src/concurrency/vector_clock.rs      |  6 +--
 src/tools/miri/src/concurrency/weak_memory.rs |  8 ++--
 3 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/src/tools/miri/src/concurrency/data_race.rs b/src/tools/miri/src/concurrency/data_race.rs
index bf147093fff..f2bec972b18 100644
--- a/src/tools/miri/src/concurrency/data_race.rs
+++ b/src/tools/miri/src/concurrency/data_race.rs
@@ -859,7 +859,7 @@ impl VClockAlloc {
                 | MiriMemoryKind::Mmap,
             )
             | MemoryKind::Stack => {
-                let (alloc_index, clocks) = global.current_thread_state(thread_mgr);
+                let (alloc_index, clocks) = global.active_thread_state(thread_mgr);
                 let mut alloc_timestamp = clocks.clock[alloc_index];
                 alloc_timestamp.span = current_span;
                 (alloc_timestamp, alloc_index)
@@ -932,7 +932,7 @@ impl VClockAlloc {
         ptr_dbg: Pointer<AllocId>,
         ty: Option<Ty<'_>>,
     ) -> InterpResult<'tcx> {
-        let (current_index, current_clocks) = global.current_thread_state(thread_mgr);
+        let (active_index, active_clocks) = global.active_thread_state(thread_mgr);
         let mut other_size = None; // if `Some`, this was a size-mismatch race
         let write_clock;
         let (other_access, other_thread, other_clock) =
@@ -941,30 +941,30 @@ impl VClockAlloc {
             // we are reporting races between two non-atomic reads.
             if !access.is_atomic() &&
                 let Some(atomic) = mem_clocks.atomic() &&
-                let Some(idx) = Self::find_gt_index(&atomic.write_vector, &current_clocks.clock)
+                let Some(idx) = Self::find_gt_index(&atomic.write_vector, &active_clocks.clock)
             {
                 (AccessType::AtomicStore, idx, &atomic.write_vector)
             } else if !access.is_atomic() &&
                 let Some(atomic) = mem_clocks.atomic() &&
-                let Some(idx) = Self::find_gt_index(&atomic.read_vector, &current_clocks.clock)
+                let Some(idx) = Self::find_gt_index(&atomic.read_vector, &active_clocks.clock)
             {
                 (AccessType::AtomicLoad, idx, &atomic.read_vector)
             // Then check races with non-atomic writes/reads.
-            } else if mem_clocks.write.1 > current_clocks.clock[mem_clocks.write.0] {
+            } else if mem_clocks.write.1 > active_clocks.clock[mem_clocks.write.0] {
                 write_clock = mem_clocks.write();
                 (AccessType::NaWrite(mem_clocks.write_type), mem_clocks.write.0, &write_clock)
-            } else if let Some(idx) = Self::find_gt_index(&mem_clocks.read, &current_clocks.clock) {
+            } else if let Some(idx) = Self::find_gt_index(&mem_clocks.read, &active_clocks.clock) {
                 (AccessType::NaRead(mem_clocks.read[idx].read_type()), idx, &mem_clocks.read)
             // Finally, mixed-size races.
             } else if access.is_atomic() && let Some(atomic) = mem_clocks.atomic() && atomic.size != access_size {
                 // This is only a race if we are not synchronized with all atomic accesses, so find
                 // the one we are not synchronized with.
                 other_size = Some(atomic.size);
-                if let Some(idx) = Self::find_gt_index(&atomic.write_vector, &current_clocks.clock)
+                if let Some(idx) = Self::find_gt_index(&atomic.write_vector, &active_clocks.clock)
                     {
                         (AccessType::AtomicStore, idx, &atomic.write_vector)
                     } else if let Some(idx) =
-                        Self::find_gt_index(&atomic.read_vector, &current_clocks.clock)
+                        Self::find_gt_index(&atomic.read_vector, &active_clocks.clock)
                     {
                         (AccessType::AtomicLoad, idx, &atomic.read_vector)
                     } else {
@@ -977,7 +977,7 @@ impl VClockAlloc {
             };
 
         // Load elaborated thread information about the racing thread actions.
-        let current_thread_info = global.print_thread_metadata(thread_mgr, current_index);
+        let active_thread_info = global.print_thread_metadata(thread_mgr, active_index);
         let other_thread_info = global.print_thread_metadata(thread_mgr, other_thread);
         let involves_non_atomic = !access.is_atomic() || !other_access.is_atomic();
 
@@ -1005,8 +1005,8 @@ impl VClockAlloc {
             },
             op2: RacingOp {
                 action: access.description(ty, other_size.map(|_| access_size)),
-                thread_info: current_thread_info,
-                span: current_clocks.clock.as_slice()[current_index.index()].span_data(),
+                thread_info: active_thread_info,
+                span: active_clocks.clock.as_slice()[active_index.index()].span_data(),
             },
         }))?
     }
@@ -1028,7 +1028,7 @@ impl VClockAlloc {
         let current_span = machine.current_span();
         let global = machine.data_race.as_ref().unwrap();
         if global.race_detecting() {
-            let (index, mut thread_clocks) = global.current_thread_state_mut(&machine.threads);
+            let (index, mut thread_clocks) = global.active_thread_state_mut(&machine.threads);
             let mut alloc_ranges = self.alloc_ranges.borrow_mut();
             for (mem_clocks_range, mem_clocks) in
                 alloc_ranges.iter_mut(access_range.start, access_range.size)
@@ -1071,7 +1071,7 @@ impl VClockAlloc {
         let current_span = machine.current_span();
         let global = machine.data_race.as_mut().unwrap();
         if global.race_detecting() {
-            let (index, mut thread_clocks) = global.current_thread_state_mut(&machine.threads);
+            let (index, mut thread_clocks) = global.active_thread_state_mut(&machine.threads);
             for (mem_clocks_range, mem_clocks) in
                 self.alloc_ranges.get_mut().iter_mut(access_range.start, access_range.size)
             {
@@ -1520,7 +1520,7 @@ impl GlobalState {
         thread: ThreadId,
         current_span: Span,
     ) {
-        let current_index = self.current_index(thread_mgr);
+        let current_index = self.active_thread_index(thread_mgr);
 
         // Enable multi-threaded execution, there are now at least two threads
         // so data-races are now possible.
@@ -1644,7 +1644,7 @@ impl GlobalState {
     /// `thread_joined`.
     #[inline]
     pub fn thread_terminated(&mut self, thread_mgr: &ThreadManager<'_, '_>, current_span: Span) {
-        let current_index = self.current_index(thread_mgr);
+        let current_index = self.active_thread_index(thread_mgr);
 
         // Increment the clock to a unique termination timestamp.
         let vector_clocks = self.vector_clocks.get_mut();
@@ -1682,9 +1682,9 @@ impl GlobalState {
         op: impl FnOnce(VectorIdx, RefMut<'_, ThreadClockSet>) -> InterpResult<'tcx, bool>,
     ) -> InterpResult<'tcx> {
         if self.multi_threaded.get() {
-            let (index, clocks) = self.current_thread_state_mut(thread_mgr);
+            let (index, clocks) = self.active_thread_state_mut(thread_mgr);
             if op(index, clocks)? {
-                let (_, mut clocks) = self.current_thread_state_mut(thread_mgr);
+                let (_, mut clocks) = self.active_thread_state_mut(thread_mgr);
                 clocks.increment_clock(index, current_span);
             }
         }
@@ -1754,7 +1754,7 @@ impl GlobalState {
     /// Load the current vector clock in use and the current set of thread clocks
     /// in use for the vector.
     #[inline]
-    pub(super) fn current_thread_state(
+    pub(super) fn active_thread_state(
         &self,
         thread_mgr: &ThreadManager<'_, '_>,
     ) -> (VectorIdx, Ref<'_, ThreadClockSet>) {
@@ -1764,7 +1764,7 @@ impl GlobalState {
     /// Load the current vector clock in use and the current set of thread clocks
     /// in use for the vector mutably for modification.
     #[inline]
-    pub(super) fn current_thread_state_mut(
+    pub(super) fn active_thread_state_mut(
         &self,
         thread_mgr: &ThreadManager<'_, '_>,
     ) -> (VectorIdx, RefMut<'_, ThreadClockSet>) {
@@ -1774,20 +1774,20 @@ impl GlobalState {
     /// Return the current thread, should be the same
     /// as the data-race active thread.
     #[inline]
-    fn current_index(&self, thread_mgr: &ThreadManager<'_, '_>) -> VectorIdx {
+    fn active_thread_index(&self, thread_mgr: &ThreadManager<'_, '_>) -> VectorIdx {
         let active_thread_id = thread_mgr.get_active_thread_id();
         self.thread_index(active_thread_id)
     }
 
     // SC ATOMIC STORE rule in the paper.
     pub(super) fn sc_write(&self, thread_mgr: &ThreadManager<'_, '_>) {
-        let (index, clocks) = self.current_thread_state(thread_mgr);
+        let (index, clocks) = self.active_thread_state(thread_mgr);
         self.last_sc_write.borrow_mut().set_at_index(&clocks.clock, index);
     }
 
     // SC ATOMIC READ rule in the paper.
     pub(super) fn sc_read(&self, thread_mgr: &ThreadManager<'_, '_>) {
-        let (.., mut clocks) = self.current_thread_state_mut(thread_mgr);
+        let (.., mut clocks) = self.active_thread_state_mut(thread_mgr);
         clocks.read_seqcst.join(&self.last_sc_fence.borrow());
     }
 }
diff --git a/src/tools/miri/src/concurrency/vector_clock.rs b/src/tools/miri/src/concurrency/vector_clock.rs
index f86f06a9d22..c3496bc1a0c 100644
--- a/src/tools/miri/src/concurrency/vector_clock.rs
+++ b/src/tools/miri/src/concurrency/vector_clock.rs
@@ -49,7 +49,7 @@ const SMALL_VECTOR: usize = 4;
 /// a 32-bit unsigned integer which is the actual timestamp, and a `Span`
 /// so that diagnostics can report what code was responsible for an operation.
 #[derive(Clone, Copy, Debug)]
-pub struct VTimestamp {
+pub(super) struct VTimestamp {
     /// The lowest bit indicates read type, the rest is the time.
     /// `1` indicates a retag read, `0` a regular read.
     time_and_read_type: u32,
@@ -85,7 +85,7 @@ impl VTimestamp {
     }
 
     #[inline]
-    pub fn read_type(&self) -> NaReadType {
+    pub(super) fn read_type(&self) -> NaReadType {
         if self.time_and_read_type & 1 == 0 { NaReadType::Read } else { NaReadType::Retag }
     }
 
@@ -95,7 +95,7 @@ impl VTimestamp {
     }
 
     #[inline]
-    pub fn span_data(&self) -> SpanData {
+    pub(super) fn span_data(&self) -> SpanData {
         self.span.data()
     }
 }
diff --git a/src/tools/miri/src/concurrency/weak_memory.rs b/src/tools/miri/src/concurrency/weak_memory.rs
index 9ebb64afd35..f544393cfe6 100644
--- a/src/tools/miri/src/concurrency/weak_memory.rs
+++ b/src/tools/miri/src/concurrency/weak_memory.rs
@@ -270,7 +270,7 @@ impl<'mir, 'tcx: 'mir> StoreBuffer {
     ) {
         let store_elem = self.buffer.back();
         if let Some(store_elem) = store_elem {
-            let (index, clocks) = global.current_thread_state(thread_mgr);
+            let (index, clocks) = global.active_thread_state(thread_mgr);
             store_elem.load_impl(index, &clocks, is_seqcst);
         }
     }
@@ -289,7 +289,7 @@ impl<'mir, 'tcx: 'mir> StoreBuffer {
         let (store_elem, recency) = {
             // The `clocks` we got here must be dropped before calling validate_atomic_load
             // as the race detector will update it
-            let (.., clocks) = global.current_thread_state(thread_mgr);
+            let (.., clocks) = global.active_thread_state(thread_mgr);
             // Load from a valid entry in the store buffer
             self.fetch_store(is_seqcst, &clocks, &mut *rng)
         };
@@ -300,7 +300,7 @@ impl<'mir, 'tcx: 'mir> StoreBuffer {
         // requires access to ThreadClockSet.clock, which is updated by the race detector
         validate()?;
 
-        let (index, clocks) = global.current_thread_state(thread_mgr);
+        let (index, clocks) = global.active_thread_state(thread_mgr);
         let loaded = store_elem.load_impl(index, &clocks, is_seqcst);
         Ok((loaded, recency))
     }
@@ -312,7 +312,7 @@ impl<'mir, 'tcx: 'mir> StoreBuffer {
         thread_mgr: &ThreadManager<'_, '_>,
         is_seqcst: bool,
     ) -> InterpResult<'tcx> {
-        let (index, clocks) = global.current_thread_state(thread_mgr);
+        let (index, clocks) = global.active_thread_state(thread_mgr);
 
         self.store_impl(val, index, &clocks.clock, is_seqcst);
         Ok(())

From 9b9c548156f3a0e0714696fcad17b55afed7620f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= <eduardosm-dev@e64.io>
Date: Fri, 19 Apr 2024 23:10:04 +0200
Subject: [PATCH 03/28] Add `-Zmiri-env-set` to set environment variables
 without modifying the host environment

This option allows to pass environment variables to the interpreted program without needing to modify the host environment (which may have undesired effects in some cases).
---
 src/tools/miri/README.md                      |  4 +++
 src/tools/miri/src/bin/miri.rs                |  5 +++
 src/tools/miri/src/eval.rs                    |  5 ++-
 src/tools/miri/src/shims/env.rs               | 34 +++++++++++++------
 .../miri/tests/pass/shims/env/var-set.rs      |  7 ++++
 5 files changed, 43 insertions(+), 12 deletions(-)
 create mode 100644 src/tools/miri/tests/pass/shims/env/var-set.rs

diff --git a/src/tools/miri/README.md b/src/tools/miri/README.md
index 948f1ee6c63..fb2ac0b267e 100644
--- a/src/tools/miri/README.md
+++ b/src/tools/miri/README.md
@@ -311,6 +311,10 @@ environment variable. We first document the most relevant and most commonly used
 * `-Zmiri-env-forward=<var>` forwards the `var` environment variable to the interpreted program. Can
   be used multiple times to forward several variables. Execution will still be deterministic if the
   value of forwarded variables stays the same. Has no effect if `-Zmiri-disable-isolation` is set.
+* `-Zmiri-env-set=<var>=<value>` sets the `var` environment variable to `value` in the interpreted.
+  It can be used to pass environment variables without needing to alter the host environment. It can
+  be used multiple times to set several variables. If `-Zmiri-disable-isolation` or `-Zmiri-env-forward`
+  is set, values set with this option will have priority over values from the host environment.
 * `-Zmiri-ignore-leaks` disables the memory leak checker, and also allows some
   remaining threads to exist when the main thread exits.
 * `-Zmiri-isolation-error=<action>` configures Miri's response to operations
diff --git a/src/tools/miri/src/bin/miri.rs b/src/tools/miri/src/bin/miri.rs
index 3f7a965e9df..c3315edac20 100644
--- a/src/tools/miri/src/bin/miri.rs
+++ b/src/tools/miri/src/bin/miri.rs
@@ -498,6 +498,11 @@ fn main() {
             );
         } else if let Some(param) = arg.strip_prefix("-Zmiri-env-forward=") {
             miri_config.forwarded_env_vars.push(param.to_owned());
+        } else if let Some(param) = arg.strip_prefix("-Zmiri-env-set=") {
+            let Some((name, value)) = param.split_once('=') else {
+                show_error!("-Zmiri-env-set requires an argument of the form <name>=<value>");
+            };
+            miri_config.set_env_vars.insert(name.to_owned(), value.to_owned());
         } else if let Some(param) = arg.strip_prefix("-Zmiri-track-pointer-tag=") {
             let ids: Vec<u64> = match parse_comma_list(param) {
                 Ok(ids) => ids,
diff --git a/src/tools/miri/src/eval.rs b/src/tools/miri/src/eval.rs
index df0ede1e1b6..3623d97e759 100644
--- a/src/tools/miri/src/eval.rs
+++ b/src/tools/miri/src/eval.rs
@@ -9,7 +9,7 @@ use std::thread;
 
 use crate::concurrency::thread::TlsAllocAction;
 use crate::diagnostics::report_leaks;
-use rustc_data_structures::fx::FxHashSet;
+use rustc_data_structures::fx::{FxHashMap, FxHashSet};
 use rustc_hir::def::Namespace;
 use rustc_hir::def_id::DefId;
 use rustc_middle::ty::{
@@ -100,6 +100,8 @@ pub struct MiriConfig {
     pub ignore_leaks: bool,
     /// Environment variables that should always be forwarded from the host.
     pub forwarded_env_vars: Vec<String>,
+    /// Additional environment variables that should be set in the interpreted program.
+    pub set_env_vars: FxHashMap<String, String>,
     /// Command-line arguments passed to the interpreted program.
     pub args: Vec<String>,
     /// The seed to use when non-determinism or randomness are required (e.g. ptr-to-int cast, `getrandom()`).
@@ -163,6 +165,7 @@ impl Default for MiriConfig {
             isolated_op: IsolatedOp::Reject(RejectOpWith::Abort),
             ignore_leaks: false,
             forwarded_env_vars: vec![],
+            set_env_vars: FxHashMap::default(),
             args: vec![],
             seed: None,
             tracked_pointer_tags: FxHashSet::default(),
diff --git a/src/tools/miri/src/shims/env.rs b/src/tools/miri/src/shims/env.rs
index 1779189c9ce..d97873ce722 100644
--- a/src/tools/miri/src/shims/env.rs
+++ b/src/tools/miri/src/shims/env.rs
@@ -44,21 +44,15 @@ impl<'tcx> EnvVars<'tcx> {
                 let forward = ecx.machine.communicate()
                     || config.forwarded_env_vars.iter().any(|v| **v == *name);
                 if forward {
-                    let var_ptr = match ecx.tcx.sess.target.os.as_ref() {
-                        _ if ecx.target_os_is_unix() =>
-                            alloc_env_var_as_c_str(name.as_ref(), value.as_ref(), ecx)?,
-                        "windows" => alloc_env_var_as_wide_str(name.as_ref(), value.as_ref(), ecx)?,
-                        unsupported =>
-                            throw_unsup_format!(
-                                "environment support for target OS `{}` not yet available",
-                                unsupported
-                            ),
-                    };
-                    ecx.machine.env_vars.map.insert(name.clone(), var_ptr);
+                    add_env_var(ecx, name, value)?;
                 }
             }
         }
 
+        for (name, value) in &config.set_env_vars {
+            add_env_var(ecx, OsStr::new(name), OsStr::new(value))?;
+        }
+
         // Initialize the `environ` pointer when needed.
         if ecx.target_os_is_unix() {
             // This is memory backing an extern static, hence `ExternStatic`, not `Env`.
@@ -89,6 +83,24 @@ impl<'tcx> EnvVars<'tcx> {
     }
 }
 
+fn add_env_var<'mir, 'tcx>(
+    ecx: &mut InterpCx<'mir, 'tcx, MiriMachine<'mir, 'tcx>>,
+    name: &OsStr,
+    value: &OsStr,
+) -> InterpResult<'tcx, ()> {
+    let var_ptr = match ecx.tcx.sess.target.os.as_ref() {
+        _ if ecx.target_os_is_unix() => alloc_env_var_as_c_str(name, value, ecx)?,
+        "windows" => alloc_env_var_as_wide_str(name, value, ecx)?,
+        unsupported =>
+            throw_unsup_format!(
+                "environment support for target OS `{}` not yet available",
+                unsupported
+            ),
+    };
+    ecx.machine.env_vars.map.insert(name.to_os_string(), var_ptr);
+    Ok(())
+}
+
 fn alloc_env_var_as_c_str<'mir, 'tcx>(
     name: &OsStr,
     value: &OsStr,
diff --git a/src/tools/miri/tests/pass/shims/env/var-set.rs b/src/tools/miri/tests/pass/shims/env/var-set.rs
new file mode 100644
index 00000000000..2875b6c815a
--- /dev/null
+++ b/src/tools/miri/tests/pass/shims/env/var-set.rs
@@ -0,0 +1,7 @@
+// Test a value set on the host (MIRI_ENV_VAR_TEST) and one that is not.
+//@compile-flags: -Zmiri-env-set=MIRI_ENV_VAR_TEST=test_value_1 -Zmiri-env-set=TEST_VAR_2=test_value_2
+
+fn main() {
+    assert_eq!(std::env::var("MIRI_ENV_VAR_TEST"), Ok("test_value_1".to_owned()));
+    assert_eq!(std::env::var("TEST_VAR_2"), Ok("test_value_2".to_owned()));
+}

From fde24ed5bf66b79d72465f796d862f4241ca71e7 Mon Sep 17 00:00:00 2001
From: tiif <pekyuan@gmail.com>
Date: Mon, 22 Apr 2024 06:30:10 +0000
Subject: [PATCH 04/28] Add localtime_r shim

---
 src/tools/miri/Cargo.lock                     | 144 ++++++++++++++++++
 src/tools/miri/Cargo.toml                     |   1 +
 src/tools/miri/src/shims/time.rs              |  78 ++++++++++
 .../miri/src/shims/unix/foreign_items.rs      |   5 +
 .../miri/tests/pass-dep/shims/libc-misc.rs    |  45 ++++++
 5 files changed, 273 insertions(+)

diff --git a/src/tools/miri/Cargo.lock b/src/tools/miri/Cargo.lock
index 4fb479e1c54..1e6b5502b04 100644
--- a/src/tools/miri/Cargo.lock
+++ b/src/tools/miri/Cargo.lock
@@ -37,6 +37,21 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "android-tzdata"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
+
+[[package]]
+name = "android_system_properties"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "annotate-snippets"
 version = "0.9.2"
@@ -106,6 +121,12 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "bumpalo"
+version = "3.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
+
 [[package]]
 name = "camino"
 version = "1.1.6"
@@ -150,6 +171,18 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
+[[package]]
+name = "chrono"
+version = "0.4.38"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
+dependencies = [
+ "android-tzdata",
+ "iana-time-zone",
+ "num-traits",
+ "windows-targets 0.52.3",
+]
+
 [[package]]
 name = "cipher"
 version = "0.4.4"
@@ -216,6 +249,12 @@ dependencies = [
  "windows-sys 0.52.0",
 ]
 
+[[package]]
+name = "core-foundation-sys"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f"
+
 [[package]]
 name = "cpufeatures"
 version = "0.2.12"
@@ -319,6 +358,29 @@ version = "0.28.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253"
 
+[[package]]
+name = "iana-time-zone"
+version = "0.1.60"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141"
+dependencies = [
+ "android_system_properties",
+ "core-foundation-sys",
+ "iana-time-zone-haiku",
+ "js-sys",
+ "wasm-bindgen",
+ "windows-core",
+]
+
+[[package]]
+name = "iana-time-zone-haiku"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
+dependencies = [
+ "cc",
+]
+
 [[package]]
 name = "indenter"
 version = "0.3.3"
@@ -372,6 +434,15 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "js-sys"
+version = "0.3.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d"
+dependencies = [
+ "wasm-bindgen",
+]
+
 [[package]]
 name = "lazy_static"
 version = "1.4.0"
@@ -484,6 +555,7 @@ name = "miri"
 version = "0.1.0"
 dependencies = [
  "aes",
+ "chrono",
  "colored",
  "ctrlc",
  "getrandom",
@@ -512,6 +584,15 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "num-traits"
+version = "0.2.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a"
+dependencies = [
+ "autocfg",
+]
+
 [[package]]
 name = "number_prefix"
 version = "0.4.0"
@@ -964,6 +1045,60 @@ version = "0.11.0+wasi-snapshot-preview1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
 
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8"
+dependencies = [
+ "cfg-if",
+ "wasm-bindgen-macro",
+]
+
+[[package]]
+name = "wasm-bindgen-backend"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da"
+dependencies = [
+ "bumpalo",
+ "log",
+ "once_cell",
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-backend",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96"
+
 [[package]]
 name = "winapi"
 version = "0.3.9"
@@ -986,6 +1121,15 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
 
+[[package]]
+name = "windows-core"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
+dependencies = [
+ "windows-targets 0.52.3",
+]
+
 [[package]]
 name = "windows-sys"
 version = "0.48.0"
diff --git a/src/tools/miri/Cargo.toml b/src/tools/miri/Cargo.toml
index 9d24d3c6f47..7748d630b12 100644
--- a/src/tools/miri/Cargo.toml
+++ b/src/tools/miri/Cargo.toml
@@ -24,6 +24,7 @@ smallvec = "1.7"
 aes = { version = "0.8.3", features = ["hazmat"] }
 measureme = "11"
 ctrlc = "3.2.5"
+chrono = { version = "0.4.38", default-features = false, features = ["clock"] }
 
 # Copied from `compiler/rustc/Cargo.toml`.
 # But only for some targets, it fails for others. Rustc configures this in its CI, but we can't
diff --git a/src/tools/miri/src/shims/time.rs b/src/tools/miri/src/shims/time.rs
index 1126c900226..dfdf58470d6 100644
--- a/src/tools/miri/src/shims/time.rs
+++ b/src/tools/miri/src/shims/time.rs
@@ -1,5 +1,9 @@
+use std::ffi::OsString;
+use std::fmt::Write;
 use std::time::{Duration, SystemTime};
 
+use chrono::{DateTime, Datelike, Local, Timelike, Utc};
+
 use crate::concurrency::thread::MachineCallback;
 use crate::*;
 
@@ -107,6 +111,80 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
         Ok(0)
     }
 
+    // The localtime() function shall convert the time in seconds since the Epoch pointed to by
+    // timer into a broken-down time, expressed as a local time.
+    // https://linux.die.net/man/3/localtime_r
+    fn localtime_r(
+        &mut self,
+        timep: &OpTy<'tcx, Provenance>,
+        result_op: &OpTy<'tcx, Provenance>,
+    ) -> InterpResult<'tcx, Pointer<Option<Provenance>>> {
+        let this = self.eval_context_mut();
+
+        this.assert_target_os_is_unix("localtime_r");
+        this.check_no_isolation("`localtime_r`")?;
+
+        let timep = this.deref_pointer(timep)?;
+        let result = this.deref_pointer_as(result_op, this.libc_ty_layout("tm"))?;
+
+        // The input "represents the number of seconds elapsed since the Epoch,
+        // 1970-01-01 00:00:00 +0000 (UTC)".
+        let sec_since_epoch: i64 = this
+            .read_scalar(&timep)?
+            .to_int(this.libc_ty_layout("time_t").size)?
+            .try_into()
+            .unwrap();
+        let dt_utc: DateTime<Utc> =
+            DateTime::from_timestamp(sec_since_epoch, 0).expect("Invalid timestamp");
+        // Convert that to local time, then return the broken-down time value.
+        let dt: DateTime<Local> = DateTime::from(dt_utc);
+
+        // This value is always set to -1, because there is no way to know if dst is in effect with
+        // chrono crate yet.
+        // This may not be consistent with libc::localtime_r's result.
+        let tm_isdst = -1;
+
+        // tm_zone represents the timezone value in the form of: +0730, +08, -0730 or -08.
+        // This may not be consistent with libc::localtime_r's result.
+        let offset_in_second = Local::now().offset().local_minus_utc();
+        let tm_gmtoff = offset_in_second;
+        let mut tm_zone = String::new();
+        if offset_in_second < 0 {
+            tm_zone.push('-');
+        } else {
+            tm_zone.push('+');
+        }
+        let offset_hour = offset_in_second.abs() / 3600;
+        write!(tm_zone, "{:02}", offset_hour).unwrap();
+        let offset_min = (offset_in_second.abs() % 3600) / 60;
+        if offset_min != 0 {
+            write!(tm_zone, "{:02}", offset_min).unwrap();
+        }
+
+        // FIXME: String de-duplication is needed so that we only allocate this string only once
+        // even when there are multiple calls to this function.
+        let tm_zone_ptr =
+            this.alloc_os_str_as_c_str(&OsString::from(tm_zone), MiriMemoryKind::Machine.into())?;
+
+        this.write_pointer(tm_zone_ptr, &this.project_field_named(&result, "tm_zone")?)?;
+        this.write_int_fields_named(
+            &[
+                ("tm_sec", dt.second().into()),
+                ("tm_min", dt.minute().into()),
+                ("tm_hour", dt.hour().into()),
+                ("tm_mday", dt.day().into()),
+                ("tm_mon", dt.month0().into()),
+                ("tm_year", dt.year().checked_sub(1900).unwrap().into()),
+                ("tm_wday", dt.weekday().num_days_from_sunday().into()),
+                ("tm_yday", dt.ordinal0().into()),
+                ("tm_isdst", tm_isdst),
+                ("tm_gmtoff", tm_gmtoff.into()),
+            ],
+            &result,
+        )?;
+
+        Ok(result.ptr())
+    }
     #[allow(non_snake_case, clippy::arithmetic_side_effects)]
     fn GetSystemTimeAsFileTime(
         &mut self,
diff --git a/src/tools/miri/src/shims/unix/foreign_items.rs b/src/tools/miri/src/shims/unix/foreign_items.rs
index c72d3bb3df4..bd299aaa125 100644
--- a/src/tools/miri/src/shims/unix/foreign_items.rs
+++ b/src/tools/miri/src/shims/unix/foreign_items.rs
@@ -234,6 +234,11 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
                 let result = this.gettimeofday(tv, tz)?;
                 this.write_scalar(Scalar::from_i32(result), dest)?;
             }
+            "localtime_r" => {
+                let [timep, result_op] = this.check_shim(abi, Abi::C {unwind: false}, link_name, args)?;
+                let result = this.localtime_r(timep, result_op)?;
+                this.write_pointer(result, dest)?;
+            }
             "clock_gettime" => {
                 let [clk_id, tp] =
                     this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
diff --git a/src/tools/miri/tests/pass-dep/shims/libc-misc.rs b/src/tools/miri/tests/pass-dep/shims/libc-misc.rs
index abb384b0a85..f710daf5277 100644
--- a/src/tools/miri/tests/pass-dep/shims/libc-misc.rs
+++ b/src/tools/miri/tests/pass-dep/shims/libc-misc.rs
@@ -213,6 +213,50 @@ fn test_posix_gettimeofday() {
     assert_eq!(is_error, -1);
 }
 
+fn test_localtime_r() {
+    use std::ffi::CStr;
+    use std::{env, ptr};
+
+    // Set timezone to GMT.
+    let key = "TZ";
+    env::set_var(key, "GMT");
+
+    const TIME_SINCE_EPOCH: libc::time_t = 1712475836;
+    let custom_time_ptr = &TIME_SINCE_EPOCH;
+    let mut tm = libc::tm {
+        tm_sec: 0,
+        tm_min: 0,
+        tm_hour: 0,
+        tm_mday: 0,
+        tm_mon: 0,
+        tm_year: 0,
+        tm_wday: 0,
+        tm_yday: 0,
+        tm_isdst: 0,
+        tm_gmtoff: 0,
+        tm_zone: std::ptr::null_mut::<libc::c_char>(),
+    };
+    let res = unsafe { libc::localtime_r(custom_time_ptr, &mut tm) };
+
+    assert_eq!(tm.tm_sec, 56);
+    assert_eq!(tm.tm_min, 43);
+    assert_eq!(tm.tm_hour, 7);
+    assert_eq!(tm.tm_mday, 7);
+    assert_eq!(tm.tm_mon, 3);
+    assert_eq!(tm.tm_year, 124);
+    assert_eq!(tm.tm_wday, 0);
+    assert_eq!(tm.tm_yday, 97);
+    assert_eq!(tm.tm_isdst, -1);
+    assert_eq!(tm.tm_gmtoff, 0);
+    unsafe { assert_eq!(CStr::from_ptr(tm.tm_zone).to_str().unwrap(), "+00") };
+
+    // The returned value is the pointer passed in.
+    assert!(ptr::eq(res, &mut tm));
+
+    //Remove timezone setting.
+    env::remove_var(key);
+}
+
 fn test_isatty() {
     // Testing whether our isatty shim returns the right value would require controlling whether
     // these streams are actually TTYs, which is hard.
@@ -365,6 +409,7 @@ fn main() {
     test_posix_realpath_errors();
 
     test_thread_local_errno();
+    test_localtime_r();
 
     test_isatty();
 

From fb2396cbda433eebb0da1cdf2cc4c0b72074c2dc Mon Sep 17 00:00:00 2001
From: The Miri Cronjob Bot <miri@cron.bot>
Date: Tue, 23 Apr 2024 04:56:35 +0000
Subject: [PATCH 05/28] Preparing for merge from rustc

---
 src/tools/miri/rust-version | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tools/miri/rust-version b/src/tools/miri/rust-version
index a60acf44a40..9b0b1c8d230 100644
--- a/src/tools/miri/rust-version
+++ b/src/tools/miri/rust-version
@@ -1 +1 @@
-c8d19a92aa9022eb690899cf6d54fd23cb6877e5
+aca749eefceaed0cda19a7ec5e472fce9387bc00

From 4bcd5aff5c1cf65d26b42356e1f6209cdcb68b49 Mon Sep 17 00:00:00 2001
From: Oli Scherer <github35764891676564198441@oli-obk.de>
Date: Tue, 23 Apr 2024 11:17:36 +0200
Subject: [PATCH 06/28] Missing word at the end of sentence

---
 src/tools/miri/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tools/miri/README.md b/src/tools/miri/README.md
index fb2ac0b267e..95b99e9a54e 100644
--- a/src/tools/miri/README.md
+++ b/src/tools/miri/README.md
@@ -311,7 +311,7 @@ environment variable. We first document the most relevant and most commonly used
 * `-Zmiri-env-forward=<var>` forwards the `var` environment variable to the interpreted program. Can
   be used multiple times to forward several variables. Execution will still be deterministic if the
   value of forwarded variables stays the same. Has no effect if `-Zmiri-disable-isolation` is set.
-* `-Zmiri-env-set=<var>=<value>` sets the `var` environment variable to `value` in the interpreted.
+* `-Zmiri-env-set=<var>=<value>` sets the `var` environment variable to `value` in the interpreted program.
   It can be used to pass environment variables without needing to alter the host environment. It can
   be used multiple times to set several variables. If `-Zmiri-disable-isolation` or `-Zmiri-env-forward`
   is set, values set with this option will have priority over values from the host environment.

From 15a1b0df63c6e1b7c8cfac8ffe0d69f3060dc18e Mon Sep 17 00:00:00 2001
From: forcedebug <forcedebug@outlook.com>
Date: Tue, 23 Apr 2024 19:12:04 +0800
Subject: [PATCH 07/28] chore: fix some typos in comments

Signed-off-by: forcedebug <forcedebug@outlook.com>
---
 src/tools/miri/src/borrow_tracker/stacked_borrows/stack.rs | 2 +-
 src/tools/miri/src/shims/windows/foreign_items.rs          | 2 +-
 src/tools/miri/src/shims/x86/mod.rs                        | 2 +-
 src/tools/miri/src/shims/x86/sse.rs                        | 2 +-
 src/tools/miri/src/shims/x86/sse2.rs                       | 4 ++--
 src/tools/miri/tests/pass/const-addrs.rs                   | 2 +-
 src/tools/miri/tests/pass/issues/issue-miri-1909.rs        | 2 +-
 7 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/tools/miri/src/borrow_tracker/stacked_borrows/stack.rs b/src/tools/miri/src/borrow_tracker/stacked_borrows/stack.rs
index bebd14d2f1e..55ff09c53fe 100644
--- a/src/tools/miri/src/borrow_tracker/stacked_borrows/stack.rs
+++ b/src/tools/miri/src/borrow_tracker/stacked_borrows/stack.rs
@@ -248,7 +248,7 @@ impl<'tcx> Stack {
     #[cfg(feature = "stack-cache")]
     fn find_granting_cache(&mut self, access: AccessKind, tag: BorTag) -> Option<usize> {
         // This looks like a common-sense optimization; we're going to do a linear search of the
-        // cache or the borrow stack to scan the shorter of the two. This optimization is miniscule
+        // cache or the borrow stack to scan the shorter of the two. This optimization is minuscule
         // and this check actually ensures we do not access an invalid cache.
         // When a stack is created and when items are removed from the top of the borrow stack, we
         // need some valid value to populate the cache. In both cases, we try to use the bottom
diff --git a/src/tools/miri/src/shims/windows/foreign_items.rs b/src/tools/miri/src/shims/windows/foreign_items.rs
index ec4c6101487..c81d6b2f7fd 100644
--- a/src/tools/miri/src/shims/windows/foreign_items.rs
+++ b/src/tools/miri/src/shims/windows/foreign_items.rs
@@ -653,7 +653,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
                     OsStr::new(&formatted),
                     buffer,
                     size.into(),
-                    /*trunacte*/ false,
+                    /*truncate*/ false,
                 )?;
                 if !complete {
                     // The API docs don't say what happens when the buffer is not big enough...
diff --git a/src/tools/miri/src/shims/x86/mod.rs b/src/tools/miri/src/shims/x86/mod.rs
index 615821b2e37..a5bfd30cf61 100644
--- a/src/tools/miri/src/shims/x86/mod.rs
+++ b/src/tools/miri/src/shims/x86/mod.rs
@@ -650,7 +650,7 @@ fn convert_float_to_int<'tcx>(
         let dest = this.project_index(&dest, i)?;
 
         let res = this.float_to_int_checked(&op, dest.layout, rnd)?.unwrap_or_else(|| {
-            // Fallback to minimum acording to SSE/AVX semantics.
+            // Fallback to minimum according to SSE/AVX semantics.
             ImmTy::from_int(dest.layout.size.signed_int_min(), dest.layout)
         });
         this.write_immediate(*res, &dest)?;
diff --git a/src/tools/miri/src/shims/x86/sse.rs b/src/tools/miri/src/shims/x86/sse.rs
index b8c0dfb1c7f..17608837319 100644
--- a/src/tools/miri/src/shims/x86/sse.rs
+++ b/src/tools/miri/src/shims/x86/sse.rs
@@ -182,7 +182,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
                 };
 
                 let res = this.float_to_int_checked(&op, dest.layout, rnd)?.unwrap_or_else(|| {
-                    // Fallback to minimum acording to SSE semantics.
+                    // Fallback to minimum according to SSE semantics.
                     ImmTy::from_int(dest.layout.size.signed_int_min(), dest.layout)
                 });
 
diff --git a/src/tools/miri/src/shims/x86/sse2.rs b/src/tools/miri/src/shims/x86/sse2.rs
index 9db30d7ddca..9268766fc0f 100644
--- a/src/tools/miri/src/shims/x86/sse2.rs
+++ b/src/tools/miri/src/shims/x86/sse2.rs
@@ -420,7 +420,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
                 };
 
                 let res = this.float_to_int_checked(&op, dest.layout, rnd)?.unwrap_or_else(|| {
-                    // Fallback to minimum acording to SSE semantics.
+                    // Fallback to minimum according to SSE semantics.
                     ImmTy::from_int(dest.layout.size.signed_int_min(), dest.layout)
                 });
 
@@ -447,7 +447,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
                 let res0 = this.float_to_float_or_int(&right0, dest0.layout)?;
                 this.write_immediate(*res0, &dest0)?;
 
-                // Copy remianing from `left`
+                // Copy remaining from `left`
                 for i in 1..dest_len {
                     this.copy_op(&this.project_index(&left, i)?, &this.project_index(&dest, i)?)?;
                 }
diff --git a/src/tools/miri/tests/pass/const-addrs.rs b/src/tools/miri/tests/pass/const-addrs.rs
index 6c14f0b679c..727c67ebfb5 100644
--- a/src/tools/miri/tests/pass/const-addrs.rs
+++ b/src/tools/miri/tests/pass/const-addrs.rs
@@ -4,7 +4,7 @@
 // deallocated.
 // In Miri we explicitly store previously-assigned AllocIds for each const and ensure
 // that we only hand out a finite number of AllocIds per const.
-// MIR inlining will put every evaluation of the const we're repeatedly evaluting into the same
+// MIR inlining will put every evaluation of the const we're repeatedly evaluating into the same
 // stack frame, breaking this test.
 //@compile-flags: -Zinline-mir=no
 #![feature(strict_provenance)]
diff --git a/src/tools/miri/tests/pass/issues/issue-miri-1909.rs b/src/tools/miri/tests/pass/issues/issue-miri-1909.rs
index ce2114e760a..8a2e67cdd09 100644
--- a/src/tools/miri/tests/pass/issues/issue-miri-1909.rs
+++ b/src/tools/miri/tests/pass/issues/issue-miri-1909.rs
@@ -9,7 +9,7 @@ use std::alloc::System;
 /// `ptr` must be valid for writes of `len` bytes
 unsafe fn volatile_write_zeroize_mem(ptr: *mut u8, len: usize) {
     for i in 0..len {
-        // ptr as usize + i can't overlow because `ptr` is valid for writes of `len`
+        // ptr as usize + i can't overflow because `ptr` is valid for writes of `len`
         let ptr_new: *mut u8 = ((ptr as usize) + i) as *mut u8;
         // SAFETY: `ptr` is valid for writes of `len` bytes, so `ptr_new` is valid for a
         // byte write

From ed36dde1d293ef0beb302f588317167d6e27533d Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Tue, 23 Apr 2024 14:09:00 +0200
Subject: [PATCH 08/28] add Windows TLS bug to trophy case

---
 src/tools/miri/README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/tools/miri/README.md b/src/tools/miri/README.md
index 26f43cd492e..ef01ca25fb0 100644
--- a/src/tools/miri/README.md
+++ b/src/tools/miri/README.md
@@ -564,7 +564,8 @@ used according to their aliasing restrictions.
 
 ## Bugs found by Miri
 
-Miri has already found a number of bugs in the Rust standard library and beyond, which we collect here.
+Miri has already found a number of bugs in the Rust standard library and beyond, some of which we collect here.
+If Miri helped you find a subtle UB bug in your code, we'd appreciate a PR adding it to the list!
 
 Definite bugs found:
 
@@ -599,6 +600,7 @@ Definite bugs found:
 * [Deallocating with the wrong layout in new specializations for in-place `Iterator::collect`](https://github.com/rust-lang/rust/pull/118460)
 * [Incorrect offset computation for highly-aligned types in `portable-atomic-util`](https://github.com/taiki-e/portable-atomic/pull/138)
 * [Occasional memory leak in `std::mpsc` channels](https://github.com/rust-lang/rust/issues/121582) (original code in [crossbeam](https://github.com/crossbeam-rs/crossbeam/pull/1084))
+* [Weak-memory-induced memory leak in Windows thread-local storage](https://github.com/rust-lang/rust/pull/124281)
 
 Violations of [Stacked Borrows] found that are likely bugs (but Stacked Borrows is currently just an experiment):
 

From 5d1654a40a38d76d40b8835a655feb7d8c013230 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Tue, 23 Apr 2024 16:16:57 +0200
Subject: [PATCH 09/28] CI: don't run cron-fail-notify when the job just got
 canceled

Doesn't seem right to prepare a PR in that case
---
 src/tools/miri/.github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tools/miri/.github/workflows/ci.yml b/src/tools/miri/.github/workflows/ci.yml
index b0dab9f509d..4df383d1d68 100644
--- a/src/tools/miri/.github/workflows/ci.yml
+++ b/src/tools/miri/.github/workflows/ci.yml
@@ -165,7 +165,7 @@ jobs:
     name: cronjob failure notification
     runs-on: ubuntu-latest
     needs: [build, style]
-    if: github.event_name == 'schedule' && (failure() || cancelled())
+    if: github.event_name == 'schedule' && failure()
     steps:
       # Send a Zulip notification
       - name: Install zulip-send

From 799a4ded54147daf6aa84b08490d8b60163a73e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= <eduardosm-dev@e64.io>
Date: Fri, 19 Apr 2024 20:46:27 +0200
Subject: [PATCH 10/28] Implement LLVM x86 AVX2 intrinsics

---
 src/tools/miri/src/shims/x86/avx.rs           |   71 +-
 src/tools/miri/src/shims/x86/avx2.rs          |  444 +++++
 src/tools/miri/src/shims/x86/mod.rs           |  405 +++++
 src/tools/miri/src/shims/x86/sse2.rs          |   75 +-
 src/tools/miri/src/shims/x86/sse41.rs         |   59 +-
 src/tools/miri/src/shims/x86/ssse3.rs         |   65 +-
 .../miri/tests/pass/intrinsics-x86-avx2.rs    | 1613 +++++++++++++++++
 7 files changed, 2476 insertions(+), 256 deletions(-)
 create mode 100644 src/tools/miri/src/shims/x86/avx2.rs
 create mode 100644 src/tools/miri/tests/pass/intrinsics-x86-avx2.rs

diff --git a/src/tools/miri/src/shims/x86/avx.rs b/src/tools/miri/src/shims/x86/avx.rs
index 23c78647b9c..41c20d768f7 100644
--- a/src/tools/miri/src/shims/x86/avx.rs
+++ b/src/tools/miri/src/shims/x86/avx.rs
@@ -7,7 +7,8 @@ use rustc_target::spec::abi::Abi;
 
 use super::{
     bin_op_simd_float_all, conditional_dot_product, convert_float_to_int, horizontal_bin_op,
-    round_all, test_bits_masked, test_high_bits_masked, unary_op_ps, FloatBinOp, FloatUnaryOp,
+    mask_load, mask_store, round_all, test_bits_masked, test_high_bits_masked, unary_op_ps,
+    FloatBinOp, FloatUnaryOp,
 };
 use crate::*;
 use shims::foreign_items::EmulateForeignItemResult;
@@ -347,71 +348,3 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
         Ok(EmulateForeignItemResult::NeedsJumping)
     }
 }
-
-/// Conditionally loads from `ptr` according the high bit of each
-/// element of `mask`. `ptr` does not need to be aligned.
-fn mask_load<'tcx>(
-    this: &mut crate::MiriInterpCx<'_, 'tcx>,
-    ptr: &OpTy<'tcx, Provenance>,
-    mask: &OpTy<'tcx, Provenance>,
-    dest: &MPlaceTy<'tcx, Provenance>,
-) -> InterpResult<'tcx, ()> {
-    let (mask, mask_len) = this.operand_to_simd(mask)?;
-    let (dest, dest_len) = this.mplace_to_simd(dest)?;
-
-    assert_eq!(dest_len, mask_len);
-
-    let mask_item_size = mask.layout.field(this, 0).size;
-    let high_bit_offset = mask_item_size.bits().checked_sub(1).unwrap();
-
-    let ptr = this.read_pointer(ptr)?;
-    for i in 0..dest_len {
-        let mask = this.project_index(&mask, i)?;
-        let dest = this.project_index(&dest, i)?;
-
-        if this.read_scalar(&mask)?.to_uint(mask_item_size)? >> high_bit_offset != 0 {
-            // Size * u64 is implemented as always checked
-            #[allow(clippy::arithmetic_side_effects)]
-            let ptr = ptr.wrapping_offset(dest.layout.size * i, &this.tcx);
-            // Unaligned copy, which is what we want.
-            this.mem_copy(ptr, dest.ptr(), dest.layout.size, /*nonoverlapping*/ true)?;
-        } else {
-            this.write_scalar(Scalar::from_int(0, dest.layout.size), &dest)?;
-        }
-    }
-
-    Ok(())
-}
-
-/// Conditionally stores into `ptr` according the high bit of each
-/// element of `mask`. `ptr` does not need to be aligned.
-fn mask_store<'tcx>(
-    this: &mut crate::MiriInterpCx<'_, 'tcx>,
-    ptr: &OpTy<'tcx, Provenance>,
-    mask: &OpTy<'tcx, Provenance>,
-    value: &OpTy<'tcx, Provenance>,
-) -> InterpResult<'tcx, ()> {
-    let (mask, mask_len) = this.operand_to_simd(mask)?;
-    let (value, value_len) = this.operand_to_simd(value)?;
-
-    assert_eq!(value_len, mask_len);
-
-    let mask_item_size = mask.layout.field(this, 0).size;
-    let high_bit_offset = mask_item_size.bits().checked_sub(1).unwrap();
-
-    let ptr = this.read_pointer(ptr)?;
-    for i in 0..value_len {
-        let mask = this.project_index(&mask, i)?;
-        let value = this.project_index(&value, i)?;
-
-        if this.read_scalar(&mask)?.to_uint(mask_item_size)? >> high_bit_offset != 0 {
-            // Size * u64 is implemented as always checked
-            #[allow(clippy::arithmetic_side_effects)]
-            let ptr = ptr.wrapping_offset(value.layout.size * i, &this.tcx);
-            // Unaligned copy, which is what we want.
-            this.mem_copy(value.ptr(), ptr, value.layout.size, /*nonoverlapping*/ true)?;
-        }
-    }
-
-    Ok(())
-}
diff --git a/src/tools/miri/src/shims/x86/avx2.rs b/src/tools/miri/src/shims/x86/avx2.rs
new file mode 100644
index 00000000000..bbf53f9f1e5
--- /dev/null
+++ b/src/tools/miri/src/shims/x86/avx2.rs
@@ -0,0 +1,444 @@
+use crate::rustc_middle::ty::layout::LayoutOf as _;
+use rustc_middle::mir;
+use rustc_middle::ty::Ty;
+use rustc_span::Symbol;
+use rustc_target::spec::abi::Abi;
+
+use super::{
+    horizontal_bin_op, int_abs, mask_load, mask_store, mpsadbw, packssdw, packsswb, packusdw,
+    packuswb, pmulhrsw, psign, shift_simd_by_scalar, shift_simd_by_simd, ShiftOp,
+};
+use crate::*;
+use shims::foreign_items::EmulateForeignItemResult;
+
+impl<'mir, 'tcx: 'mir> EvalContextExt<'mir, 'tcx> for crate::MiriInterpCx<'mir, 'tcx> {}
+pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
+    crate::MiriInterpCxExt<'mir, 'tcx>
+{
+    fn emulate_x86_avx2_intrinsic(
+        &mut self,
+        link_name: Symbol,
+        abi: Abi,
+        args: &[OpTy<'tcx, Provenance>],
+        dest: &MPlaceTy<'tcx, Provenance>,
+    ) -> InterpResult<'tcx, EmulateForeignItemResult> {
+        let this = self.eval_context_mut();
+        this.expect_target_feature_for_intrinsic(link_name, "avx2")?;
+        // Prefix should have already been checked.
+        let unprefixed_name = link_name.as_str().strip_prefix("llvm.x86.avx2.").unwrap();
+
+        match unprefixed_name {
+            // Used to implement the _mm256_abs_epi{8,16,32} functions.
+            // Calculates the absolute value of packed 8/16/32-bit integers.
+            "pabs.b" | "pabs.w" | "pabs.d" => {
+                let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                int_abs(this, op, dest)?;
+            }
+            // Used to implement the _mm256_h{add,adds,sub}_epi{16,32} functions.
+            // Horizontally add / add with saturation / subtract adjacent 16/32-bit
+            // integer values in `left` and `right`.
+            "phadd.w" | "phadd.sw" | "phadd.d" | "phsub.w" | "phsub.sw" | "phsub.d" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                let (which, saturating) = match unprefixed_name {
+                    "phadd.w" | "phadd.d" => (mir::BinOp::Add, false),
+                    "phadd.sw" => (mir::BinOp::Add, true),
+                    "phsub.w" | "phsub.d" => (mir::BinOp::Sub, false),
+                    "phsub.sw" => (mir::BinOp::Sub, true),
+                    _ => unreachable!(),
+                };
+
+                horizontal_bin_op(this, which, saturating, left, right, dest)?;
+            }
+            // Used to implement `_mm{,_mask}_{i32,i64}gather_{epi32,epi64,pd,ps}` functions
+            // Gathers elements from `slice` using `offsets * scale` as indices.
+            // When the highest bit of the corresponding element of `mask` is 0,
+            // the value is copied from `src` instead.
+            "gather.d.d" | "gather.d.d.256" | "gather.d.q" | "gather.d.q.256" | "gather.q.d"
+            | "gather.q.d.256" | "gather.q.q" | "gather.q.q.256" | "gather.d.pd"
+            | "gather.d.pd.256" | "gather.q.pd" | "gather.q.pd.256" | "gather.d.ps"
+            | "gather.d.ps.256" | "gather.q.ps" | "gather.q.ps.256" => {
+                let [src, slice, offsets, mask, scale] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                assert_eq!(dest.layout, src.layout);
+
+                let (src, _) = this.operand_to_simd(src)?;
+                let (offsets, offsets_len) = this.operand_to_simd(offsets)?;
+                let (mask, mask_len) = this.operand_to_simd(mask)?;
+                let (dest, dest_len) = this.mplace_to_simd(dest)?;
+
+                // There are cases like dest: i32x4, offsets: i64x2
+                let actual_len = dest_len.min(offsets_len);
+
+                assert_eq!(dest_len, mask_len);
+
+                let mask_item_size = mask.layout.field(this, 0).size;
+                let high_bit_offset = mask_item_size.bits().checked_sub(1).unwrap();
+
+                let scale = this.read_scalar(scale)?.to_i8()?;
+                if !matches!(scale, 1 | 2 | 4 | 8) {
+                    throw_unsup_format!("invalid gather scale {scale}");
+                }
+                let scale = i64::from(scale);
+
+                let slice = this.read_pointer(slice)?;
+                for i in 0..actual_len {
+                    let mask = this.project_index(&mask, i)?;
+                    let dest = this.project_index(&dest, i)?;
+
+                    if this.read_scalar(&mask)?.to_uint(mask_item_size)? >> high_bit_offset != 0 {
+                        let offset = this.project_index(&offsets, i)?;
+                        let offset =
+                            i64::try_from(this.read_scalar(&offset)?.to_int(offset.layout.size)?)
+                                .unwrap();
+                        let ptr = slice
+                            .wrapping_signed_offset(offset.checked_mul(scale).unwrap(), &this.tcx);
+                        // Unaligned copy, which is what we want.
+                        this.mem_copy(
+                            ptr,
+                            dest.ptr(),
+                            dest.layout.size,
+                            /*nonoverlapping*/ true,
+                        )?;
+                    } else {
+                        this.copy_op(&this.project_index(&src, i)?, &dest)?;
+                    }
+                }
+                for i in actual_len..dest_len {
+                    let dest = this.project_index(&dest, i)?;
+                    this.write_scalar(Scalar::from_int(0, dest.layout.size), &dest)?;
+                }
+            }
+            // Used to implement the _mm256_madd_epi16 function.
+            // Multiplies packed signed 16-bit integers in `left` and `right`, producing
+            // intermediate signed 32-bit integers. Horizontally add adjacent pairs of
+            // intermediate 32-bit integers, and pack the results in `dest`.
+            "pmadd.wd" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                let (left, left_len) = this.operand_to_simd(left)?;
+                let (right, right_len) = this.operand_to_simd(right)?;
+                let (dest, dest_len) = this.mplace_to_simd(dest)?;
+
+                assert_eq!(left_len, right_len);
+                assert_eq!(dest_len.checked_mul(2).unwrap(), left_len);
+
+                for i in 0..dest_len {
+                    let j1 = i.checked_mul(2).unwrap();
+                    let left1 = this.read_scalar(&this.project_index(&left, j1)?)?.to_i16()?;
+                    let right1 = this.read_scalar(&this.project_index(&right, j1)?)?.to_i16()?;
+
+                    let j2 = j1.checked_add(1).unwrap();
+                    let left2 = this.read_scalar(&this.project_index(&left, j2)?)?.to_i16()?;
+                    let right2 = this.read_scalar(&this.project_index(&right, j2)?)?.to_i16()?;
+
+                    let dest = this.project_index(&dest, i)?;
+
+                    // Multiplications are i16*i16->i32, which will not overflow.
+                    let mul1 = i32::from(left1).checked_mul(right1.into()).unwrap();
+                    let mul2 = i32::from(left2).checked_mul(right2.into()).unwrap();
+                    // However, this addition can overflow in the most extreme case
+                    // (-0x8000)*(-0x8000)+(-0x8000)*(-0x8000) = 0x80000000
+                    let res = mul1.wrapping_add(mul2);
+
+                    this.write_scalar(Scalar::from_i32(res), &dest)?;
+                }
+            }
+            // Used to implement the _mm256_maddubs_epi16 function.
+            // Multiplies packed 8-bit unsigned integers from `left` and packed
+            // signed 8-bit integers from `right` into 16-bit signed integers. Then,
+            // the saturating sum of the products with indices `2*i` and `2*i+1`
+            // produces the output at index `i`.
+            "pmadd.ub.sw" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                let (left, left_len) = this.operand_to_simd(left)?;
+                let (right, right_len) = this.operand_to_simd(right)?;
+                let (dest, dest_len) = this.mplace_to_simd(dest)?;
+
+                assert_eq!(left_len, right_len);
+                assert_eq!(dest_len.checked_mul(2).unwrap(), left_len);
+
+                for i in 0..dest_len {
+                    let j1 = i.checked_mul(2).unwrap();
+                    let left1 = this.read_scalar(&this.project_index(&left, j1)?)?.to_u8()?;
+                    let right1 = this.read_scalar(&this.project_index(&right, j1)?)?.to_i8()?;
+
+                    let j2 = j1.checked_add(1).unwrap();
+                    let left2 = this.read_scalar(&this.project_index(&left, j2)?)?.to_u8()?;
+                    let right2 = this.read_scalar(&this.project_index(&right, j2)?)?.to_i8()?;
+
+                    let dest = this.project_index(&dest, i)?;
+
+                    // Multiplication of a u8 and an i8 into an i16 cannot overflow.
+                    let mul1 = i16::from(left1).checked_mul(right1.into()).unwrap();
+                    let mul2 = i16::from(left2).checked_mul(right2.into()).unwrap();
+                    let res = mul1.saturating_add(mul2);
+
+                    this.write_scalar(Scalar::from_i16(res), &dest)?;
+                }
+            }
+            // Used to implement the _mm_maskload_epi32, _mm_maskload_epi64,
+            // _mm256_maskload_epi32 and _mm256_maskload_epi64 functions.
+            // For the element `i`, if the high bit of the `i`-th element of `mask`
+            // is one, it is loaded from `ptr.wrapping_add(i)`, otherwise zero is
+            // loaded.
+            "maskload.d" | "maskload.q" | "maskload.d.256" | "maskload.q.256" => {
+                let [ptr, mask] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                mask_load(this, ptr, mask, dest)?;
+            }
+            // Used to implement the _mm_maskstore_epi32, _mm_maskstore_epi64,
+            // _mm256_maskstore_epi32 and _mm256_maskstore_epi64 functions.
+            // For the element `i`, if the high bit of the element `i`-th of `mask`
+            // is one, it is stored into `ptr.wapping_add(i)`.
+            // Unlike SSE2's _mm_maskmoveu_si128, these are not non-temporal stores.
+            "maskstore.d" | "maskstore.q" | "maskstore.d.256" | "maskstore.q.256" => {
+                let [ptr, mask, value] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                mask_store(this, ptr, mask, value)?;
+            }
+            // Used to implement the _mm256_mpsadbw_epu8 function.
+            // Compute the sum of absolute differences of quadruplets of unsigned
+            // 8-bit integers in `left` and `right`, and store the 16-bit results
+            // in `right`. Quadruplets are selected from `left` and `right` with
+            // offsets specified in `imm`.
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8
+            "mpsadbw" => {
+                let [left, right, imm] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                mpsadbw(this, left, right, imm, dest)?;
+            }
+            // Used to implement the _mm256_mulhrs_epi16 function.
+            // Multiplies packed 16-bit signed integer values, truncates the 32-bit
+            // product to the 18 most significant bits by right-shifting, and then
+            // divides the 18-bit value by 2 (rounding to nearest) by first adding
+            // 1 and then taking the bits `1..=16`.
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16
+            "pmul.hr.sw" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                pmulhrsw(this, left, right, dest)?;
+            }
+            // Used to implement the _mm256_packs_epi16 function.
+            // Converts two 16-bit integer vectors to a single 8-bit integer
+            // vector with signed saturation.
+            "packsswb" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                packsswb(this, left, right, dest)?;
+            }
+            // Used to implement the _mm256_packs_epi32 function.
+            // Converts two 32-bit integer vectors to a single 16-bit integer
+            // vector with signed saturation.
+            "packssdw" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                packssdw(this, left, right, dest)?;
+            }
+            // Used to implement the _mm256_packus_epi16 function.
+            // Converts two 16-bit signed integer vectors to a single 8-bit
+            // unsigned integer vector with saturation.
+            "packuswb" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                packuswb(this, left, right, dest)?;
+            }
+            // Used to implement the _mm256_packus_epi32 function.
+            // Concatenates two 32-bit signed integer vectors and converts
+            // the result to a 16-bit unsigned integer vector with saturation.
+            "packusdw" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                packusdw(this, left, right, dest)?;
+            }
+            // Used to implement the _mm256_permutevar8x32_epi32 and
+            // _mm256_permutevar8x32_ps function.
+            // Shuffles `left` using the three low bits of each element of `right`
+            // as indices.
+            "permd" | "permps" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                let (left, left_len) = this.operand_to_simd(left)?;
+                let (right, right_len) = this.operand_to_simd(right)?;
+                let (dest, dest_len) = this.mplace_to_simd(dest)?;
+
+                assert_eq!(dest_len, left_len);
+                assert_eq!(dest_len, right_len);
+
+                for i in 0..dest_len {
+                    let dest = this.project_index(&dest, i)?;
+                    let right = this.read_scalar(&this.project_index(&right, i)?)?.to_u32()?;
+                    let left = this.project_index(&left, (right & 0b111).into())?;
+
+                    this.copy_op(&left, &dest)?;
+                }
+            }
+            // Used to implement the _mm256_permute2x128_si256 function.
+            // Shuffles 128-bit blocks of `a` and `b` using `imm` as pattern.
+            "vperm2i128" => {
+                let [left, right, imm] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                assert_eq!(left.layout.size.bits(), 256);
+                assert_eq!(right.layout.size.bits(), 256);
+                assert_eq!(dest.layout.size.bits(), 256);
+
+                // Transmute to `[i128; 2]`
+
+                let array_layout =
+                    this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.i128, 2))?;
+                let left = left.transmute(array_layout, this)?;
+                let right = right.transmute(array_layout, this)?;
+                let dest = dest.transmute(array_layout, this)?;
+
+                let imm = this.read_scalar(imm)?.to_u8()?;
+
+                for i in 0..2 {
+                    let dest = this.project_index(&dest, i)?;
+                    let src = match (imm >> i.checked_mul(4).unwrap()) & 0b11 {
+                        0 => this.project_index(&left, 0)?,
+                        1 => this.project_index(&left, 1)?,
+                        2 => this.project_index(&right, 0)?,
+                        3 => this.project_index(&right, 1)?,
+                        _ => unreachable!(),
+                    };
+
+                    this.copy_op(&src, &dest)?;
+                }
+            }
+            // Used to implement the _mm256_sad_epu8 function.
+            // Compute the absolute differences of packed unsigned 8-bit integers
+            // in `left` and `right`, then horizontally sum each consecutive 8
+            // differences to produce four unsigned 16-bit integers, and pack
+            // these unsigned 16-bit integers in the low 16 bits of 64-bit elements
+            // in `dest`.
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8
+            "psad.bw" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                let (left, left_len) = this.operand_to_simd(left)?;
+                let (right, right_len) = this.operand_to_simd(right)?;
+                let (dest, dest_len) = this.mplace_to_simd(dest)?;
+
+                assert_eq!(left_len, right_len);
+                assert_eq!(left_len, dest_len.checked_mul(8).unwrap());
+
+                for i in 0..dest_len {
+                    let dest = this.project_index(&dest, i)?;
+
+                    let mut acc: u16 = 0;
+                    for j in 0..8 {
+                        let src_index = i.checked_mul(8).unwrap().checked_add(j).unwrap();
+
+                        let left = this.project_index(&left, src_index)?;
+                        let left = this.read_scalar(&left)?.to_u8()?;
+
+                        let right = this.project_index(&right, src_index)?;
+                        let right = this.read_scalar(&right)?.to_u8()?;
+
+                        acc = acc.checked_add(left.abs_diff(right).into()).unwrap();
+                    }
+
+                    this.write_scalar(Scalar::from_u64(acc.into()), &dest)?;
+                }
+            }
+            // Used to implement the _mm256_shuffle_epi8 intrinsic.
+            // Shuffles bytes from `left` using `right` as pattern.
+            // Each 128-bit block is shuffled independently.
+            "pshuf.b" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                let (left, left_len) = this.operand_to_simd(left)?;
+                let (right, right_len) = this.operand_to_simd(right)?;
+                let (dest, dest_len) = this.mplace_to_simd(dest)?;
+
+                assert_eq!(dest_len, left_len);
+                assert_eq!(dest_len, right_len);
+
+                for i in 0..dest_len {
+                    let right = this.read_scalar(&this.project_index(&right, i)?)?.to_u8()?;
+                    let dest = this.project_index(&dest, i)?;
+
+                    let res = if right & 0x80 == 0 {
+                        // Shuffle each 128-bit (16-byte) block independently.
+                        let j = u64::from(right % 16).checked_add(i & !15).unwrap();
+                        this.read_scalar(&this.project_index(&left, j)?)?
+                    } else {
+                        // If the highest bit in `right` is 1, write zero.
+                        Scalar::from_u8(0)
+                    };
+
+                    this.write_scalar(res, &dest)?;
+                }
+            }
+            // Used to implement the _mm256_sign_epi{8,16,32} functions.
+            // Negates elements from `left` when the corresponding element in
+            // `right` is negative. If an element from `right` is zero, zero
+            // is writen to the corresponding output element.
+            // Basically, we multiply `left` with `right.signum()`.
+            "psign.b" | "psign.w" | "psign.d" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                psign(this, left, right, dest)?;
+            }
+            // Used to implement the _mm256_{sll,srl,sra}_epi{16,32,64} functions
+            // (except _mm256_sra_epi64, which is not available in AVX2).
+            // Shifts N-bit packed integers in left by the amount in right.
+            // `right` is as 128-bit vector. but it is interpreted as a single
+            // 64-bit integer (remaining bits are ignored).
+            // For logic shifts, when right is larger than N - 1, zero is produced.
+            // For arithmetic shifts, when right is larger than N - 1, the sign bit
+            // is copied to remaining bits.
+            "psll.w" | "psrl.w" | "psra.w" | "psll.d" | "psrl.d" | "psra.d" | "psll.q"
+            | "psrl.q" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                let which = match unprefixed_name {
+                    "psll.w" | "psll.d" | "psll.q" => ShiftOp::Left,
+                    "psrl.w" | "psrl.d" | "psrl.q" => ShiftOp::RightLogic,
+                    "psra.w" | "psra.d" => ShiftOp::RightArith,
+                    _ => unreachable!(),
+                };
+
+                shift_simd_by_scalar(this, left, right, which, dest)?;
+            }
+            // Used to implement the _mm{,256}_{sllv,srlv,srav}_epi{32,64} functions
+            // (except _mm{,256}_srav_epi64, which are not available in AVX2).
+            "psllv.d" | "psllv.d.256" | "psllv.q" | "psllv.q.256" | "psrlv.d" | "psrlv.d.256"
+            | "psrlv.q" | "psrlv.q.256" | "psrav.d" | "psrav.d.256" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                let which = match unprefixed_name {
+                    "psllv.d" | "psllv.d.256" | "psllv.q" | "psllv.q.256" => ShiftOp::Left,
+                    "psrlv.d" | "psrlv.d.256" | "psrlv.q" | "psrlv.q.256" => ShiftOp::RightLogic,
+                    "psrav.d" | "psrav.d.256" => ShiftOp::RightArith,
+                    _ => unreachable!(),
+                };
+
+                shift_simd_by_simd(this, left, right, which, dest)?;
+            }
+            _ => return Ok(EmulateForeignItemResult::NotSupported),
+        }
+        Ok(EmulateForeignItemResult::NeedsJumping)
+    }
+}
diff --git a/src/tools/miri/src/shims/x86/mod.rs b/src/tools/miri/src/shims/x86/mod.rs
index 615821b2e37..a9d248c2a85 100644
--- a/src/tools/miri/src/shims/x86/mod.rs
+++ b/src/tools/miri/src/shims/x86/mod.rs
@@ -14,6 +14,7 @@ use shims::foreign_items::EmulateForeignItemResult;
 
 mod aesni;
 mod avx;
+mod avx2;
 mod sse;
 mod sse2;
 mod sse3;
@@ -136,6 +137,11 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
                     this, link_name, abi, args, dest,
                 );
             }
+            name if name.starts_with("avx2.") => {
+                return avx2::EvalContextExt::emulate_x86_avx2_intrinsic(
+                    this, link_name, abi, args, dest,
+                );
+            }
 
             _ => return Ok(EmulateForeignItemResult::NotSupported),
         }
@@ -534,6 +540,61 @@ fn shift_simd_by_scalar<'tcx>(
     Ok(())
 }
 
+/// Shifts each element of `left` by the corresponding element of `right`.
+///
+/// For logic shifts, when right is larger than BITS - 1, zero is produced.
+/// For arithmetic right-shifts, when right is larger than BITS - 1, the sign
+/// bit is copied to remaining bits.
+fn shift_simd_by_simd<'tcx>(
+    this: &mut crate::MiriInterpCx<'_, 'tcx>,
+    left: &OpTy<'tcx, Provenance>,
+    right: &OpTy<'tcx, Provenance>,
+    which: ShiftOp,
+    dest: &MPlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+    let (left, left_len) = this.operand_to_simd(left)?;
+    let (right, right_len) = this.operand_to_simd(right)?;
+    let (dest, dest_len) = this.mplace_to_simd(dest)?;
+
+    assert_eq!(dest_len, left_len);
+    assert_eq!(dest_len, right_len);
+
+    for i in 0..dest_len {
+        let left = this.read_scalar(&this.project_index(&left, i)?)?;
+        let right = this.read_scalar(&this.project_index(&right, i)?)?;
+        let dest = this.project_index(&dest, i)?;
+
+        // It is ok to saturate the value to u32::MAX because any value
+        // above BITS - 1 will produce the same result.
+        let shift = u32::try_from(right.to_uint(dest.layout.size)?).unwrap_or(u32::MAX);
+
+        let res = match which {
+            ShiftOp::Left => {
+                let left = left.to_uint(dest.layout.size)?;
+                let res = left.checked_shl(shift).unwrap_or(0);
+                // `truncate` is needed as left-shift can make the absolute value larger.
+                Scalar::from_uint(dest.layout.size.truncate(res), dest.layout.size)
+            }
+            ShiftOp::RightLogic => {
+                let left = left.to_uint(dest.layout.size)?;
+                let res = left.checked_shr(shift).unwrap_or(0);
+                // No `truncate` needed as right-shift can only make the absolute value smaller.
+                Scalar::from_uint(res, dest.layout.size)
+            }
+            ShiftOp::RightArith => {
+                let left = left.to_int(dest.layout.size)?;
+                // On overflow, copy the sign bit to the remaining bits
+                let res = left.checked_shr(shift).unwrap_or(left >> 127);
+                // No `truncate` needed as right-shift can only make the absolute value smaller.
+                Scalar::from_int(res, dest.layout.size)
+            }
+        };
+        this.write_scalar(res, &dest)?;
+    }
+
+    Ok(())
+}
+
 /// Takes a 128-bit vector, transmutes it to `[u64; 2]` and extracts
 /// the first value.
 fn extract_first_u64<'tcx>(
@@ -664,6 +725,33 @@ fn convert_float_to_int<'tcx>(
     Ok(())
 }
 
+/// Calculates absolute value of integers in `op` and stores the result in `dest`.
+///
+/// In case of overflow (when the operand is the minimum value), the operation
+/// will wrap around.
+fn int_abs<'tcx>(
+    this: &mut crate::MiriInterpCx<'_, 'tcx>,
+    op: &OpTy<'tcx, Provenance>,
+    dest: &MPlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+    let (op, op_len) = this.operand_to_simd(op)?;
+    let (dest, dest_len) = this.mplace_to_simd(dest)?;
+
+    assert_eq!(op_len, dest_len);
+
+    for i in 0..dest_len {
+        let op = this.read_scalar(&this.project_index(&op, i)?)?;
+        let dest = this.project_index(&dest, i)?;
+
+        // Converting to a host "i128" works since the input is always signed.
+        let res = op.to_int(dest.layout.size)?.unsigned_abs();
+
+        this.write_scalar(Scalar::from_uint(res, dest.layout.size), &dest)?;
+    }
+
+    Ok(())
+}
+
 /// Splits `op` (which must be a SIMD vector) into 128-bit chuncks.
 ///
 /// Returns a tuple where:
@@ -874,3 +962,320 @@ fn test_high_bits_masked<'tcx>(
 
     Ok((direct, negated))
 }
+
+/// Conditionally loads from `ptr` according the high bit of each
+/// element of `mask`. `ptr` does not need to be aligned.
+fn mask_load<'tcx>(
+    this: &mut crate::MiriInterpCx<'_, 'tcx>,
+    ptr: &OpTy<'tcx, Provenance>,
+    mask: &OpTy<'tcx, Provenance>,
+    dest: &MPlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+    let (mask, mask_len) = this.operand_to_simd(mask)?;
+    let (dest, dest_len) = this.mplace_to_simd(dest)?;
+
+    assert_eq!(dest_len, mask_len);
+
+    let mask_item_size = mask.layout.field(this, 0).size;
+    let high_bit_offset = mask_item_size.bits().checked_sub(1).unwrap();
+
+    let ptr = this.read_pointer(ptr)?;
+    for i in 0..dest_len {
+        let mask = this.project_index(&mask, i)?;
+        let dest = this.project_index(&dest, i)?;
+
+        if this.read_scalar(&mask)?.to_uint(mask_item_size)? >> high_bit_offset != 0 {
+            // Size * u64 is implemented as always checked
+            #[allow(clippy::arithmetic_side_effects)]
+            let ptr = ptr.wrapping_offset(dest.layout.size * i, &this.tcx);
+            // Unaligned copy, which is what we want.
+            this.mem_copy(ptr, dest.ptr(), dest.layout.size, /*nonoverlapping*/ true)?;
+        } else {
+            this.write_scalar(Scalar::from_int(0, dest.layout.size), &dest)?;
+        }
+    }
+
+    Ok(())
+}
+
+/// Conditionally stores into `ptr` according the high bit of each
+/// element of `mask`. `ptr` does not need to be aligned.
+fn mask_store<'tcx>(
+    this: &mut crate::MiriInterpCx<'_, 'tcx>,
+    ptr: &OpTy<'tcx, Provenance>,
+    mask: &OpTy<'tcx, Provenance>,
+    value: &OpTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+    let (mask, mask_len) = this.operand_to_simd(mask)?;
+    let (value, value_len) = this.operand_to_simd(value)?;
+
+    assert_eq!(value_len, mask_len);
+
+    let mask_item_size = mask.layout.field(this, 0).size;
+    let high_bit_offset = mask_item_size.bits().checked_sub(1).unwrap();
+
+    let ptr = this.read_pointer(ptr)?;
+    for i in 0..value_len {
+        let mask = this.project_index(&mask, i)?;
+        let value = this.project_index(&value, i)?;
+
+        if this.read_scalar(&mask)?.to_uint(mask_item_size)? >> high_bit_offset != 0 {
+            // Size * u64 is implemented as always checked
+            #[allow(clippy::arithmetic_side_effects)]
+            let ptr = ptr.wrapping_offset(value.layout.size * i, &this.tcx);
+            // Unaligned copy, which is what we want.
+            this.mem_copy(value.ptr(), ptr, value.layout.size, /*nonoverlapping*/ true)?;
+        }
+    }
+
+    Ok(())
+}
+
+/// Compute the sum of absolute differences of quadruplets of unsigned
+/// 8-bit integers in `left` and `right`, and store the 16-bit results
+/// in `right`. Quadruplets are selected from `left` and `right` with
+/// offsets specified in `imm`.
+///
+/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16>
+/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8>
+///
+/// Each 128-bit chunk is treated independently (i.e., the value for
+/// the is i-th 128-bit chunk of `dest` is calculated with the i-th
+/// 128-bit chunks of `left` and `right`).
+fn mpsadbw<'tcx>(
+    this: &mut crate::MiriInterpCx<'_, 'tcx>,
+    left: &OpTy<'tcx, Provenance>,
+    right: &OpTy<'tcx, Provenance>,
+    imm: &OpTy<'tcx, Provenance>,
+    dest: &MPlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+    assert_eq!(left.layout, right.layout);
+    assert_eq!(left.layout.size, dest.layout.size);
+
+    let (num_chunks, op_items_per_chunk, left) = split_simd_to_128bit_chunks(this, left)?;
+    let (_, _, right) = split_simd_to_128bit_chunks(this, right)?;
+    let (_, dest_items_per_chunk, dest) = split_simd_to_128bit_chunks(this, dest)?;
+
+    assert_eq!(op_items_per_chunk, dest_items_per_chunk.checked_mul(2).unwrap());
+
+    let imm = this.read_scalar(imm)?.to_uint(imm.layout.size)?;
+    // Bit 2 of `imm` specifies the offset for indices of `left`.
+    // The offset is 0 when the bit is 0 or 4 when the bit is 1.
+    let left_offset = u64::try_from((imm >> 2) & 1).unwrap().checked_mul(4).unwrap();
+    // Bits 0..=1 of `imm` specify the offset for indices of
+    // `right` in blocks of 4 elements.
+    let right_offset = u64::try_from(imm & 0b11).unwrap().checked_mul(4).unwrap();
+
+    for i in 0..num_chunks {
+        let left = this.project_index(&left, i)?;
+        let right = this.project_index(&right, i)?;
+        let dest = this.project_index(&dest, i)?;
+
+        for j in 0..dest_items_per_chunk {
+            let left_offset = left_offset.checked_add(j).unwrap();
+            let mut res: u16 = 0;
+            for k in 0..4 {
+                let left = this
+                    .read_scalar(&this.project_index(&left, left_offset.checked_add(k).unwrap())?)?
+                    .to_u8()?;
+                let right = this
+                    .read_scalar(
+                        &this.project_index(&right, right_offset.checked_add(k).unwrap())?,
+                    )?
+                    .to_u8()?;
+                res = res.checked_add(left.abs_diff(right).into()).unwrap();
+            }
+            this.write_scalar(Scalar::from_u16(res), &this.project_index(&dest, j)?)?;
+        }
+    }
+
+    Ok(())
+}
+
+/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
+/// product to the 18 most significant bits by right-shifting, and then
+/// divides the 18-bit value by 2 (rounding to nearest) by first adding
+/// 1 and then taking the bits `1..=16`.
+///
+/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16>
+/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16>
+fn pmulhrsw<'tcx>(
+    this: &mut crate::MiriInterpCx<'_, 'tcx>,
+    left: &OpTy<'tcx, Provenance>,
+    right: &OpTy<'tcx, Provenance>,
+    dest: &MPlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+    let (left, left_len) = this.operand_to_simd(left)?;
+    let (right, right_len) = this.operand_to_simd(right)?;
+    let (dest, dest_len) = this.mplace_to_simd(dest)?;
+
+    assert_eq!(dest_len, left_len);
+    assert_eq!(dest_len, right_len);
+
+    for i in 0..dest_len {
+        let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?;
+        let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?;
+        let dest = this.project_index(&dest, i)?;
+
+        let res =
+            (i32::from(left).checked_mul(right.into()).unwrap() >> 14).checked_add(1).unwrap() >> 1;
+
+        // The result of this operation can overflow a signed 16-bit integer.
+        // When `left` and `right` are -0x8000, the result is 0x8000.
+        #[allow(clippy::cast_possible_truncation)]
+        let res = res as i16;
+
+        this.write_scalar(Scalar::from_i16(res), &dest)?;
+    }
+
+    Ok(())
+}
+
+fn pack_generic<'tcx>(
+    this: &mut crate::MiriInterpCx<'_, 'tcx>,
+    left: &OpTy<'tcx, Provenance>,
+    right: &OpTy<'tcx, Provenance>,
+    dest: &MPlaceTy<'tcx, Provenance>,
+    f: impl Fn(Scalar<Provenance>) -> InterpResult<'tcx, Scalar<Provenance>>,
+) -> InterpResult<'tcx, ()> {
+    assert_eq!(left.layout, right.layout);
+    assert_eq!(left.layout.size, dest.layout.size);
+
+    let (num_chunks, op_items_per_chunk, left) = split_simd_to_128bit_chunks(this, left)?;
+    let (_, _, right) = split_simd_to_128bit_chunks(this, right)?;
+    let (_, dest_items_per_chunk, dest) = split_simd_to_128bit_chunks(this, dest)?;
+
+    assert_eq!(dest_items_per_chunk, op_items_per_chunk.checked_mul(2).unwrap());
+
+    for i in 0..num_chunks {
+        let left = this.project_index(&left, i)?;
+        let right = this.project_index(&right, i)?;
+        let dest = this.project_index(&dest, i)?;
+
+        for j in 0..op_items_per_chunk {
+            let left = this.read_scalar(&this.project_index(&left, j)?)?;
+            let right = this.read_scalar(&this.project_index(&right, j)?)?;
+            let left_dest = this.project_index(&dest, j)?;
+            let right_dest =
+                this.project_index(&dest, j.checked_add(op_items_per_chunk).unwrap())?;
+
+            let left_res = f(left)?;
+            let right_res = f(right)?;
+
+            this.write_scalar(left_res, &left_dest)?;
+            this.write_scalar(right_res, &right_dest)?;
+        }
+    }
+
+    Ok(())
+}
+
+/// Converts two 16-bit integer vectors to a single 8-bit integer
+/// vector with signed saturation.
+///
+/// Each 128-bit chunk is treated independently (i.e., the value for
+/// the is i-th 128-bit chunk of `dest` is calculated with the i-th
+/// 128-bit chunks of `left` and `right`).
+fn packsswb<'tcx>(
+    this: &mut crate::MiriInterpCx<'_, 'tcx>,
+    left: &OpTy<'tcx, Provenance>,
+    right: &OpTy<'tcx, Provenance>,
+    dest: &MPlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+    pack_generic(this, left, right, dest, |op| {
+        let op = op.to_i16()?;
+        let res = i8::try_from(op).unwrap_or(if op < 0 { i8::MIN } else { i8::MAX });
+        Ok(Scalar::from_i8(res))
+    })
+}
+
+/// Converts two 16-bit signed integer vectors to a single 8-bit
+/// unsigned integer vector with saturation.
+///
+/// Each 128-bit chunk is treated independently (i.e., the value for
+/// the is i-th 128-bit chunk of `dest` is calculated with the i-th
+/// 128-bit chunks of `left` and `right`).
+fn packuswb<'tcx>(
+    this: &mut crate::MiriInterpCx<'_, 'tcx>,
+    left: &OpTy<'tcx, Provenance>,
+    right: &OpTy<'tcx, Provenance>,
+    dest: &MPlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+    pack_generic(this, left, right, dest, |op| {
+        let op = op.to_i16()?;
+        let res = u8::try_from(op).unwrap_or(if op < 0 { 0 } else { u8::MAX });
+        Ok(Scalar::from_u8(res))
+    })
+}
+
+/// Converts two 32-bit integer vectors to a single 16-bit integer
+/// vector with signed saturation.
+///
+/// Each 128-bit chunk is treated independently (i.e., the value for
+/// the is i-th 128-bit chunk of `dest` is calculated with the i-th
+/// 128-bit chunks of `left` and `right`).
+fn packssdw<'tcx>(
+    this: &mut crate::MiriInterpCx<'_, 'tcx>,
+    left: &OpTy<'tcx, Provenance>,
+    right: &OpTy<'tcx, Provenance>,
+    dest: &MPlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+    pack_generic(this, left, right, dest, |op| {
+        let op = op.to_i32()?;
+        let res = i16::try_from(op).unwrap_or(if op < 0 { i16::MIN } else { i16::MAX });
+        Ok(Scalar::from_i16(res))
+    })
+}
+
+/// Converts two 32-bit integer vectors to a single 16-bit integer
+/// vector with unsigned saturation.
+///
+/// Each 128-bit chunk is treated independently (i.e., the value for
+/// the is i-th 128-bit chunk of `dest` is calculated with the i-th
+/// 128-bit chunks of `left` and `right`).
+fn packusdw<'tcx>(
+    this: &mut crate::MiriInterpCx<'_, 'tcx>,
+    left: &OpTy<'tcx, Provenance>,
+    right: &OpTy<'tcx, Provenance>,
+    dest: &MPlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+    pack_generic(this, left, right, dest, |op| {
+        let op = op.to_i32()?;
+        let res = u16::try_from(op).unwrap_or(if op < 0 { 0 } else { u16::MAX });
+        Ok(Scalar::from_u16(res))
+    })
+}
+
+/// Negates elements from `left` when the corresponding element in
+/// `right` is negative. If an element from `right` is zero, zero
+/// is writen to the corresponding output element.
+/// In other words, multiplies `left` with `right.signum()`.
+fn psign<'tcx>(
+    this: &mut crate::MiriInterpCx<'_, 'tcx>,
+    left: &OpTy<'tcx, Provenance>,
+    right: &OpTy<'tcx, Provenance>,
+    dest: &MPlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+    let (left, left_len) = this.operand_to_simd(left)?;
+    let (right, right_len) = this.operand_to_simd(right)?;
+    let (dest, dest_len) = this.mplace_to_simd(dest)?;
+
+    assert_eq!(dest_len, left_len);
+    assert_eq!(dest_len, right_len);
+
+    for i in 0..dest_len {
+        let dest = this.project_index(&dest, i)?;
+        let left = this.read_immediate(&this.project_index(&left, i)?)?;
+        let right = this.read_scalar(&this.project_index(&right, i)?)?.to_int(dest.layout.size)?;
+
+        let res = this.wrapping_binary_op(
+            mir::BinOp::Mul,
+            &left,
+            &ImmTy::from_int(right.signum(), dest.layout),
+        )?;
+
+        this.write_immediate(*res, &dest)?;
+    }
+
+    Ok(())
+}
diff --git a/src/tools/miri/src/shims/x86/sse2.rs b/src/tools/miri/src/shims/x86/sse2.rs
index 9db30d7ddca..63b6a301942 100644
--- a/src/tools/miri/src/shims/x86/sse2.rs
+++ b/src/tools/miri/src/shims/x86/sse2.rs
@@ -3,8 +3,8 @@ use rustc_span::Symbol;
 use rustc_target::spec::abi::Abi;
 
 use super::{
-    bin_op_simd_float_all, bin_op_simd_float_first, convert_float_to_int, shift_simd_by_scalar,
-    FloatBinOp, ShiftOp,
+    bin_op_simd_float_all, bin_op_simd_float_first, convert_float_to_int, packssdw, packsswb,
+    packuswb, shift_simd_by_scalar, FloatBinOp, ShiftOp,
 };
 use crate::*;
 use shims::foreign_items::EmulateForeignItemResult;
@@ -176,29 +176,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
                 let [left, right] =
                     this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
 
-                let (left, left_len) = this.operand_to_simd(left)?;
-                let (right, right_len) = this.operand_to_simd(right)?;
-                let (dest, dest_len) = this.mplace_to_simd(dest)?;
-
-                // left and right are i16x8, dest is i8x16
-                assert_eq!(left_len, 8);
-                assert_eq!(right_len, 8);
-                assert_eq!(dest_len, 16);
-
-                for i in 0..left_len {
-                    let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?;
-                    let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?;
-                    let left_dest = this.project_index(&dest, i)?;
-                    let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?;
-
-                    let left_res =
-                        i8::try_from(left).unwrap_or(if left < 0 { i8::MIN } else { i8::MAX });
-                    let right_res =
-                        i8::try_from(right).unwrap_or(if right < 0 { i8::MIN } else { i8::MAX });
-
-                    this.write_scalar(Scalar::from_i8(left_res), &left_dest)?;
-                    this.write_scalar(Scalar::from_i8(right_res), &right_dest)?;
-                }
+                packsswb(this, left, right, dest)?;
             }
             // Used to implement the _mm_packus_epi16 function.
             // Converts two 16-bit signed integer vectors to a single 8-bit
@@ -207,28 +185,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
                 let [left, right] =
                     this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
 
-                let (left, left_len) = this.operand_to_simd(left)?;
-                let (right, right_len) = this.operand_to_simd(right)?;
-                let (dest, dest_len) = this.mplace_to_simd(dest)?;
-
-                // left and right are i16x8, dest is u8x16
-                assert_eq!(left_len, 8);
-                assert_eq!(right_len, 8);
-                assert_eq!(dest_len, 16);
-
-                for i in 0..left_len {
-                    let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?;
-                    let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?;
-                    let left_dest = this.project_index(&dest, i)?;
-                    let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?;
-
-                    let left_res = u8::try_from(left).unwrap_or(if left < 0 { 0 } else { u8::MAX });
-                    let right_res =
-                        u8::try_from(right).unwrap_or(if right < 0 { 0 } else { u8::MAX });
-
-                    this.write_scalar(Scalar::from_u8(left_res), &left_dest)?;
-                    this.write_scalar(Scalar::from_u8(right_res), &right_dest)?;
-                }
+                packuswb(this, left, right, dest)?;
             }
             // Used to implement the _mm_packs_epi32 function.
             // Converts two 32-bit integer vectors to a single 16-bit integer
@@ -237,29 +194,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
                 let [left, right] =
                     this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
 
-                let (left, left_len) = this.operand_to_simd(left)?;
-                let (right, right_len) = this.operand_to_simd(right)?;
-                let (dest, dest_len) = this.mplace_to_simd(dest)?;
-
-                // left and right are i32x4, dest is i16x8
-                assert_eq!(left_len, 4);
-                assert_eq!(right_len, 4);
-                assert_eq!(dest_len, 8);
-
-                for i in 0..left_len {
-                    let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i32()?;
-                    let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i32()?;
-                    let left_dest = this.project_index(&dest, i)?;
-                    let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?;
-
-                    let left_res =
-                        i16::try_from(left).unwrap_or(if left < 0 { i16::MIN } else { i16::MAX });
-                    let right_res =
-                        i16::try_from(right).unwrap_or(if right < 0 { i16::MIN } else { i16::MAX });
-
-                    this.write_scalar(Scalar::from_i16(left_res), &left_dest)?;
-                    this.write_scalar(Scalar::from_i16(right_res), &right_dest)?;
-                }
+                packssdw(this, left, right, dest)?;
             }
             // Used to implement _mm_min_sd and _mm_max_sd functions.
             // Note that the semantics are a bit different from Rust simd_min
diff --git a/src/tools/miri/src/shims/x86/sse41.rs b/src/tools/miri/src/shims/x86/sse41.rs
index 16a82eed99b..19bc27421d3 100644
--- a/src/tools/miri/src/shims/x86/sse41.rs
+++ b/src/tools/miri/src/shims/x86/sse41.rs
@@ -1,7 +1,7 @@
 use rustc_span::Symbol;
 use rustc_target::spec::abi::Abi;
 
-use super::{conditional_dot_product, round_all, round_first, test_bits_masked};
+use super::{conditional_dot_product, mpsadbw, packusdw, round_all, round_first, test_bits_masked};
 use crate::*;
 use shims::foreign_items::EmulateForeignItemResult;
 
@@ -68,27 +68,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
                 let [left, right] =
                     this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
 
-                let (left, left_len) = this.operand_to_simd(left)?;
-                let (right, right_len) = this.operand_to_simd(right)?;
-                let (dest, dest_len) = this.mplace_to_simd(dest)?;
-
-                assert_eq!(left_len, right_len);
-                assert_eq!(dest_len, left_len.checked_mul(2).unwrap());
-
-                for i in 0..left_len {
-                    let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i32()?;
-                    let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i32()?;
-                    let left_dest = this.project_index(&dest, i)?;
-                    let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?;
-
-                    let left_res =
-                        u16::try_from(left).unwrap_or(if left < 0 { 0 } else { u16::MAX });
-                    let right_res =
-                        u16::try_from(right).unwrap_or(if right < 0 { 0 } else { u16::MAX });
-
-                    this.write_scalar(Scalar::from_u16(left_res), &left_dest)?;
-                    this.write_scalar(Scalar::from_u16(right_res), &right_dest)?;
-                }
+                packusdw(this, left, right, dest)?;
             }
             // Used to implement the _mm_dp_ps and _mm_dp_pd functions.
             // Conditionally multiplies the packed floating-point elements in
@@ -176,40 +156,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
                 let [left, right, imm] =
                     this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
 
-                let (left, left_len) = this.operand_to_simd(left)?;
-                let (right, right_len) = this.operand_to_simd(right)?;
-                let (dest, dest_len) = this.mplace_to_simd(dest)?;
-
-                assert_eq!(left_len, right_len);
-                assert_eq!(left_len, dest_len.checked_mul(2).unwrap());
-
-                let imm = this.read_scalar(imm)?.to_u8()?;
-                // Bit 2 of `imm` specifies the offset for indices of `left`.
-                // The offset is 0 when the bit is 0 or 4 when the bit is 1.
-                let left_offset = u64::from((imm >> 2) & 1).checked_mul(4).unwrap();
-                // Bits 0..=1 of `imm` specify the offset for indices of
-                // `right` in blocks of 4 elements.
-                let right_offset = u64::from(imm & 0b11).checked_mul(4).unwrap();
-
-                for i in 0..dest_len {
-                    let left_offset = left_offset.checked_add(i).unwrap();
-                    let mut res: u16 = 0;
-                    for j in 0..4 {
-                        let left = this
-                            .read_scalar(
-                                &this.project_index(&left, left_offset.checked_add(j).unwrap())?,
-                            )?
-                            .to_u8()?;
-                        let right = this
-                            .read_scalar(
-                                &this
-                                    .project_index(&right, right_offset.checked_add(j).unwrap())?,
-                            )?
-                            .to_u8()?;
-                        res = res.checked_add(left.abs_diff(right).into()).unwrap();
-                    }
-                    this.write_scalar(Scalar::from_u16(res), &this.project_index(&dest, i)?)?;
-                }
+                mpsadbw(this, left, right, imm, dest)?;
             }
             // Used to implement the _mm_testz_si128, _mm_testc_si128
             // and _mm_testnzc_si128 functions.
diff --git a/src/tools/miri/src/shims/x86/ssse3.rs b/src/tools/miri/src/shims/x86/ssse3.rs
index dd5d064b20f..4f8e52dbb7d 100644
--- a/src/tools/miri/src/shims/x86/ssse3.rs
+++ b/src/tools/miri/src/shims/x86/ssse3.rs
@@ -2,7 +2,7 @@ use rustc_middle::mir;
 use rustc_span::Symbol;
 use rustc_target::spec::abi::Abi;
 
-use super::horizontal_bin_op;
+use super::{horizontal_bin_op, int_abs, pmulhrsw, psign};
 use crate::*;
 use shims::foreign_items::EmulateForeignItemResult;
 
@@ -28,20 +28,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
             "pabs.b.128" | "pabs.w.128" | "pabs.d.128" => {
                 let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
 
-                let (op, op_len) = this.operand_to_simd(op)?;
-                let (dest, dest_len) = this.mplace_to_simd(dest)?;
-
-                assert_eq!(op_len, dest_len);
-
-                for i in 0..dest_len {
-                    let op = this.read_scalar(&this.project_index(&op, i)?)?;
-                    let dest = this.project_index(&dest, i)?;
-
-                    // Converting to a host "i128" works since the input is always signed.
-                    let res = op.to_int(dest.layout.size)?.unsigned_abs();
-
-                    this.write_scalar(Scalar::from_uint(res, dest.layout.size), &dest)?;
-                }
+                int_abs(this, op, dest)?;
             }
             // Used to implement the _mm_shuffle_epi8 intrinsic.
             // Shuffles bytes from `left` using `right` as pattern.
@@ -136,30 +123,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
                 let [left, right] =
                     this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
 
-                let (left, left_len) = this.operand_to_simd(left)?;
-                let (right, right_len) = this.operand_to_simd(right)?;
-                let (dest, dest_len) = this.mplace_to_simd(dest)?;
-
-                assert_eq!(dest_len, left_len);
-                assert_eq!(dest_len, right_len);
-
-                for i in 0..dest_len {
-                    let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?;
-                    let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?;
-                    let dest = this.project_index(&dest, i)?;
-
-                    let res = (i32::from(left).checked_mul(right.into()).unwrap() >> 14)
-                        .checked_add(1)
-                        .unwrap()
-                        >> 1;
-
-                    // The result of this operation can overflow a signed 16-bit integer.
-                    // When `left` and `right` are -0x8000, the result is 0x8000.
-                    #[allow(clippy::cast_possible_truncation)]
-                    let res = res as i16;
-
-                    this.write_scalar(Scalar::from_i16(res), &dest)?;
-                }
+                pmulhrsw(this, left, right, dest)?;
             }
             // Used to implement the _mm_sign_epi{8,16,32} functions.
             // Negates elements from `left` when the corresponding element in
@@ -170,28 +134,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
                 let [left, right] =
                     this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
 
-                let (left, left_len) = this.operand_to_simd(left)?;
-                let (right, right_len) = this.operand_to_simd(right)?;
-                let (dest, dest_len) = this.mplace_to_simd(dest)?;
-
-                assert_eq!(dest_len, left_len);
-                assert_eq!(dest_len, right_len);
-
-                for i in 0..dest_len {
-                    let dest = this.project_index(&dest, i)?;
-                    let left = this.read_immediate(&this.project_index(&left, i)?)?;
-                    let right = this
-                        .read_scalar(&this.project_index(&right, i)?)?
-                        .to_int(dest.layout.size)?;
-
-                    let res = this.wrapping_binary_op(
-                        mir::BinOp::Mul,
-                        &left,
-                        &ImmTy::from_int(right.signum(), dest.layout),
-                    )?;
-
-                    this.write_immediate(*res, &dest)?;
-                }
+                psign(this, left, right, dest)?;
             }
             _ => return Ok(EmulateForeignItemResult::NotSupported),
         }
diff --git a/src/tools/miri/tests/pass/intrinsics-x86-avx2.rs b/src/tools/miri/tests/pass/intrinsics-x86-avx2.rs
new file mode 100644
index 00000000000..80d125bb856
--- /dev/null
+++ b/src/tools/miri/tests/pass/intrinsics-x86-avx2.rs
@@ -0,0 +1,1613 @@
+// Ignore everything except x86 and x86_64
+// Any new targets that are added to CI should be ignored here.
+// (We cannot use `cfg`-based tricks here since the `target-feature` flags below only work on x86.)
+//@ignore-target-aarch64
+//@ignore-target-arm
+//@ignore-target-avr
+//@ignore-target-s390x
+//@ignore-target-thumbv7em
+//@ignore-target-wasm32
+//@compile-flags: -C target-feature=+avx2
+
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+use std::mem::transmute;
+
+fn main() {
+    assert!(is_x86_feature_detected!("avx2"));
+
+    unsafe {
+        test_avx2();
+    }
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn test_avx2() {
+    // Mostly copied from library/stdarch/crates/core_arch/src/x86/avx2.rs
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_abs_epi32() {
+        #[rustfmt::skip]
+        let a = _mm256_setr_epi32(
+            0, 1, -1, i32::MAX,
+            i32::MIN, 100, -100, -32,
+        );
+        let r = _mm256_abs_epi32(a);
+        #[rustfmt::skip]
+        let e = _mm256_setr_epi32(
+            0, 1, 1, i32::MAX,
+            i32::MAX.wrapping_add(1), 100, 100, 32,
+        );
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_abs_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_abs_epi16() {
+        #[rustfmt::skip]
+        let a = _mm256_setr_epi16(
+            0,  1, -1, 2, -2, 3, -3, 4,
+            -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32,
+        );
+        let r = _mm256_abs_epi16(a);
+        #[rustfmt::skip]
+        let e = _mm256_setr_epi16(
+            0, 1, 1, 2, 2, 3, 3, 4,
+            4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32,
+        );
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_abs_epi16();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_abs_epi8() {
+        #[rustfmt::skip]
+        let a = _mm256_setr_epi8(
+            0, 1, -1, 2, -2, 3, -3, 4,
+            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
+            0, 1, -1, 2, -2, 3, -3, 4,
+            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
+        );
+        let r = _mm256_abs_epi8(a);
+        #[rustfmt::skip]
+        let e = _mm256_setr_epi8(
+            0, 1, 1, 2, 2, 3, 3, 4,
+            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
+            0, 1, 1, 2, 2, 3, 3, 4,
+            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
+        );
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_abs_epi8();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_hadd_epi16() {
+        let a = _mm256_set1_epi16(2);
+        let b = _mm256_set1_epi16(4);
+        let r = _mm256_hadd_epi16(a, b);
+        let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
+        assert_eq_m256i(r, e);
+
+        // Test wrapping on overflow
+        let a = _mm256_setr_epi16(
+            i16::MAX,
+            1,
+            i16::MAX,
+            2,
+            i16::MAX,
+            3,
+            i16::MAX,
+            4,
+            i16::MAX,
+            5,
+            i16::MAX,
+            6,
+            i16::MAX,
+            7,
+            i16::MAX,
+            8,
+        );
+        let b = _mm256_setr_epi16(
+            i16::MIN,
+            -1,
+            i16::MIN,
+            -2,
+            i16::MIN,
+            -3,
+            i16::MIN,
+            -4,
+            i16::MIN,
+            -5,
+            i16::MIN,
+            -6,
+            i16::MIN,
+            -7,
+            i16::MIN,
+            -8,
+        );
+        let expected = _mm256_setr_epi16(
+            i16::MIN,
+            i16::MIN + 1,
+            i16::MIN + 2,
+            i16::MIN + 3,
+            i16::MAX,
+            i16::MAX - 1,
+            i16::MAX - 2,
+            i16::MAX - 3,
+            i16::MIN + 4,
+            i16::MIN + 5,
+            i16::MIN + 6,
+            i16::MIN + 7,
+            i16::MAX - 4,
+            i16::MAX - 5,
+            i16::MAX - 6,
+            i16::MAX - 7,
+        );
+        let r = _mm256_hadd_epi16(a, b);
+        assert_eq_m256i(r, expected);
+    }
+    test_mm256_hadd_epi16();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_hadd_epi32() {
+        let a = _mm256_set1_epi32(2);
+        let b = _mm256_set1_epi32(4);
+        let r = _mm256_hadd_epi32(a, b);
+        let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
+        assert_eq_m256i(r, e);
+
+        // Test wrapping on overflow
+        let a = _mm256_setr_epi32(i32::MAX, 1, i32::MAX, 2, i32::MAX, 3, i32::MAX, 4);
+        let b = _mm256_setr_epi32(i32::MIN, -1, i32::MIN, -2, i32::MIN, -3, i32::MIN, -4);
+        let expected = _mm256_setr_epi32(
+            i32::MIN,
+            i32::MIN + 1,
+            i32::MAX,
+            i32::MAX - 1,
+            i32::MIN + 2,
+            i32::MIN + 3,
+            i32::MAX - 2,
+            i32::MAX - 3,
+        );
+        let r = _mm256_hadd_epi32(a, b);
+        assert_eq_m256i(r, expected);
+    }
+    test_mm256_hadd_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_hadds_epi16() {
+        let a = _mm256_set1_epi16(2);
+        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
+        let a = _mm256_insert_epi16::<1>(a, 1);
+        let b = _mm256_set1_epi16(4);
+        let r = _mm256_hadds_epi16(a, b);
+        let e = _mm256_setr_epi16(0x7FFF, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
+        assert_eq_m256i(r, e);
+
+        // Test saturating on overflow
+        let a = _mm256_setr_epi16(
+            i16::MAX,
+            1,
+            i16::MAX,
+            2,
+            i16::MAX,
+            3,
+            i16::MAX,
+            4,
+            i16::MAX,
+            5,
+            i16::MAX,
+            6,
+            i16::MAX,
+            7,
+            i16::MAX,
+            8,
+        );
+        let b = _mm256_setr_epi16(
+            i16::MIN,
+            -1,
+            i16::MIN,
+            -2,
+            i16::MIN,
+            -3,
+            i16::MIN,
+            -4,
+            i16::MIN,
+            -5,
+            i16::MIN,
+            -6,
+            i16::MIN,
+            -7,
+            i16::MIN,
+            -8,
+        );
+        let expected = _mm256_setr_epi16(
+            i16::MAX,
+            i16::MAX,
+            i16::MAX,
+            i16::MAX,
+            i16::MIN,
+            i16::MIN,
+            i16::MIN,
+            i16::MIN,
+            i16::MAX,
+            i16::MAX,
+            i16::MAX,
+            i16::MAX,
+            i16::MIN,
+            i16::MIN,
+            i16::MIN,
+            i16::MIN,
+        );
+        let r = _mm256_hadds_epi16(a, b);
+        assert_eq_m256i(r, expected);
+    }
+    test_mm256_hadds_epi16();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_hsub_epi16() {
+        let a = _mm256_set1_epi16(2);
+        let b = _mm256_set1_epi16(4);
+        let r = _mm256_hsub_epi16(a, b);
+        let e = _mm256_set1_epi16(0);
+        assert_eq_m256i(r, e);
+
+        // Test wrapping on overflow
+        let a = _mm256_setr_epi16(
+            i16::MAX,
+            -1,
+            i16::MAX,
+            -2,
+            i16::MAX,
+            -3,
+            i16::MAX,
+            -4,
+            i16::MAX,
+            -5,
+            i16::MAX,
+            -6,
+            i16::MAX,
+            -7,
+            i16::MAX,
+            -8,
+        );
+        let b = _mm256_setr_epi16(
+            i16::MIN,
+            1,
+            i16::MIN,
+            2,
+            i16::MIN,
+            3,
+            i16::MIN,
+            4,
+            i16::MIN,
+            5,
+            i16::MIN,
+            6,
+            i16::MIN,
+            7,
+            i16::MIN,
+            8,
+        );
+        let expected = _mm256_setr_epi16(
+            i16::MIN,
+            i16::MIN + 1,
+            i16::MIN + 2,
+            i16::MIN + 3,
+            i16::MAX,
+            i16::MAX - 1,
+            i16::MAX - 2,
+            i16::MAX - 3,
+            i16::MIN + 4,
+            i16::MIN + 5,
+            i16::MIN + 6,
+            i16::MIN + 7,
+            i16::MAX - 4,
+            i16::MAX - 5,
+            i16::MAX - 6,
+            i16::MAX - 7,
+        );
+        let r = _mm256_hsub_epi16(a, b);
+        assert_eq_m256i(r, expected);
+    }
+    test_mm256_hsub_epi16();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_hsub_epi32() {
+        let a = _mm256_set1_epi32(2);
+        let b = _mm256_set1_epi32(4);
+        let r = _mm256_hsub_epi32(a, b);
+        let e = _mm256_set1_epi32(0);
+        assert_eq_m256i(r, e);
+
+        // Test wrapping on overflow
+        let a = _mm256_setr_epi32(i32::MAX, -1, i32::MAX, -2, i32::MAX, -3, i32::MAX, -4);
+        let b = _mm256_setr_epi32(i32::MIN, 1, i32::MIN, 2, i32::MIN, 3, i32::MIN, 4);
+        let expected = _mm256_setr_epi32(
+            i32::MIN,
+            i32::MIN + 1,
+            i32::MAX,
+            i32::MAX - 1,
+            i32::MIN + 2,
+            i32::MIN + 3,
+            i32::MAX - 2,
+            i32::MAX - 3,
+        );
+        let r = _mm256_hsub_epi32(a, b);
+        assert_eq_m256i(r, expected);
+    }
+    test_mm256_hsub_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_hsubs_epi16() {
+        let a = _mm256_set1_epi16(2);
+        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
+        let a = _mm256_insert_epi16::<1>(a, -1);
+        let b = _mm256_set1_epi16(4);
+        let r = _mm256_hsubs_epi16(a, b);
+        let e = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 0x7FFF);
+        assert_eq_m256i(r, e);
+
+        // Test saturating on overflow
+        let a = _mm256_setr_epi16(
+            i16::MAX,
+            -1,
+            i16::MAX,
+            -2,
+            i16::MAX,
+            -3,
+            i16::MAX,
+            -4,
+            i16::MAX,
+            -5,
+            i16::MAX,
+            -6,
+            i16::MAX,
+            -7,
+            i16::MAX,
+            -8,
+        );
+        let b = _mm256_setr_epi16(
+            i16::MIN,
+            1,
+            i16::MIN,
+            2,
+            i16::MIN,
+            3,
+            i16::MIN,
+            4,
+            i16::MIN,
+            5,
+            i16::MIN,
+            6,
+            i16::MIN,
+            7,
+            i16::MIN,
+            8,
+        );
+        let expected = _mm256_setr_epi16(
+            i16::MAX,
+            i16::MAX,
+            i16::MAX,
+            i16::MAX,
+            i16::MIN,
+            i16::MIN,
+            i16::MIN,
+            i16::MIN,
+            i16::MAX,
+            i16::MAX,
+            i16::MAX,
+            i16::MAX,
+            i16::MIN,
+            i16::MIN,
+            i16::MIN,
+            i16::MIN,
+        );
+        let r = _mm256_hsubs_epi16(a, b);
+        assert_eq_m256i(r, expected);
+    }
+    test_mm256_hsubs_epi16();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_i32gather_epi32() {
+        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
+        // A multiplier of 4 is word-addressing
+        let r = _mm_i32gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
+        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
+    }
+    test_mm_i32gather_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_mask_i32gather_epi32() {
+        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
+        // A multiplier of 4 is word-addressing
+        let r = _mm_mask_i32gather_epi32::<4>(
+            _mm_set1_epi32(256),
+            arr.as_ptr(),
+            _mm_setr_epi32(0, 16, 64, 96),
+            _mm_setr_epi32(-1, -1, -1, 0),
+        );
+        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
+    }
+    test_mm_mask_i32gather_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_i32gather_epi32() {
+        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
+        // A multiplier of 4 is word-addressing
+        let r =
+            _mm256_i32gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
+        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
+    }
+    test_mm256_i32gather_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_mask_i32gather_epi32() {
+        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
+        // A multiplier of 4 is word-addressing
+        let r = _mm256_mask_i32gather_epi32::<4>(
+            _mm256_set1_epi32(256),
+            arr.as_ptr(),
+            _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
+            _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
+        );
+        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256));
+    }
+    test_mm256_mask_i32gather_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_i32gather_ps() {
+        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
+        // A multiplier of 4 is word-addressing for f32s
+        let r = _mm_i32gather_ps::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
+        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
+    }
+    test_mm_i32gather_ps();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_mask_i32gather_ps() {
+        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
+        // A multiplier of 4 is word-addressing for f32s
+        let r = _mm_mask_i32gather_ps::<4>(
+            _mm_set1_ps(256.0),
+            arr.as_ptr(),
+            _mm_setr_epi32(0, 16, 64, 96),
+            _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
+        );
+        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
+    }
+    test_mm_mask_i32gather_ps();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_i32gather_ps() {
+        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
+        // A multiplier of 4 is word-addressing for f32s
+        let r =
+            _mm256_i32gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
+        assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0));
+    }
+    test_mm256_i32gather_ps();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_mask_i32gather_ps() {
+        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
+        // A multiplier of 4 is word-addressing for f32s
+        let r = _mm256_mask_i32gather_ps::<4>(
+            _mm256_set1_ps(256.0),
+            arr.as_ptr(),
+            _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
+            _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0),
+        );
+        assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0));
+    }
+    test_mm256_mask_i32gather_ps();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_i32gather_epi64() {
+        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
+        // A multiplier of 8 is word-addressing for i64s
+        let r = _mm_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
+        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
+    }
+    test_mm_i32gather_epi64();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_mask_i32gather_epi64() {
+        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
+        // A multiplier of 8 is word-addressing for i64s
+        let r = _mm_mask_i32gather_epi64::<8>(
+            _mm_set1_epi64x(256),
+            arr.as_ptr(),
+            _mm_setr_epi32(16, 16, 16, 16),
+            _mm_setr_epi64x(-1, 0),
+        );
+        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
+    }
+    test_mm_mask_i32gather_epi64();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_i32gather_epi64() {
+        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
+        // A multiplier of 8 is word-addressing for i64s
+        let r = _mm256_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
+        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
+    }
+    test_mm256_i32gather_epi64();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_mask_i32gather_epi64() {
+        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
+        // A multiplier of 8 is word-addressing for i64s
+        let r = _mm256_mask_i32gather_epi64::<8>(
+            _mm256_set1_epi64x(256),
+            arr.as_ptr(),
+            _mm_setr_epi32(0, 16, 64, 96),
+            _mm256_setr_epi64x(-1, -1, -1, 0),
+        );
+        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
+    }
+    test_mm256_mask_i32gather_epi64();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_i32gather_pd() {
+        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
+        // A multiplier of 8 is word-addressing for f64s
+        let r = _mm_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
+        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
+    }
+    test_mm_i32gather_pd();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_mask_i32gather_pd() {
+        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
+        // A multiplier of 8 is word-addressing for f64s
+        let r = _mm_mask_i32gather_pd::<8>(
+            _mm_set1_pd(256.0),
+            arr.as_ptr(),
+            _mm_setr_epi32(16, 16, 16, 16),
+            _mm_setr_pd(-1.0, 0.0),
+        );
+        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
+    }
+    test_mm_mask_i32gather_pd();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_i32gather_pd() {
+        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
+        // A multiplier of 8 is word-addressing for f64s
+        let r = _mm256_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
+        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
+    }
+    test_mm256_i32gather_pd();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_mask_i32gather_pd() {
+        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
+        // A multiplier of 8 is word-addressing for f64s
+        let r = _mm256_mask_i32gather_pd::<8>(
+            _mm256_set1_pd(256.0),
+            arr.as_ptr(),
+            _mm_setr_epi32(0, 16, 64, 96),
+            _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
+        );
+        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
+    }
+    test_mm256_mask_i32gather_pd();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_i64gather_epi32() {
+        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
+        // A multiplier of 4 is word-addressing
+        let r = _mm_i64gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
+        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0));
+    }
+    test_mm_i64gather_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_mask_i64gather_epi32() {
+        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
+        // A multiplier of 4 is word-addressing
+        let r = _mm_mask_i64gather_epi32::<4>(
+            _mm_set1_epi32(256),
+            arr.as_ptr(),
+            _mm_setr_epi64x(0, 16),
+            _mm_setr_epi32(-1, 0, -1, 0),
+        );
+        assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0));
+    }
+    test_mm_mask_i64gather_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_i64gather_epi32() {
+        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
+        // A multiplier of 4 is word-addressing
+        let r = _mm256_i64gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
+        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
+    }
+    test_mm256_i64gather_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_mask_i64gather_epi32() {
+        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
+        // A multiplier of 4 is word-addressing
+        let r = _mm256_mask_i64gather_epi32::<4>(
+            _mm_set1_epi32(256),
+            arr.as_ptr(),
+            _mm256_setr_epi64x(0, 16, 64, 96),
+            _mm_setr_epi32(-1, -1, -1, 0),
+        );
+        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
+    }
+    test_mm256_mask_i64gather_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_i64gather_ps() {
+        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
+        // A multiplier of 4 is word-addressing for f32s
+        let r = _mm_i64gather_ps::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
+        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0));
+    }
+    test_mm_i64gather_ps();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_mask_i64gather_ps() {
+        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
+        // A multiplier of 4 is word-addressing for f32s
+        let r = _mm_mask_i64gather_ps::<4>(
+            _mm_set1_ps(256.0),
+            arr.as_ptr(),
+            _mm_setr_epi64x(0, 16),
+            _mm_setr_ps(-1.0, 0.0, -1.0, 0.0),
+        );
+        assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0));
+    }
+    test_mm_mask_i64gather_ps();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_i64gather_ps() {
+        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
+        // A multiplier of 4 is word-addressing for f32s
+        let r = _mm256_i64gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
+        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
+    }
+    test_mm256_i64gather_ps();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_mask_i64gather_ps() {
+        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
+        // A multiplier of 4 is word-addressing for f32s
+        let r = _mm256_mask_i64gather_ps::<4>(
+            _mm_set1_ps(256.0),
+            arr.as_ptr(),
+            _mm256_setr_epi64x(0, 16, 64, 96),
+            _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
+        );
+        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
+    }
+    test_mm256_mask_i64gather_ps();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_i64gather_epi64() {
+        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
+        // A multiplier of 8 is word-addressing for i64s
+        let r = _mm_i64gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
+        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
+    }
+    test_mm_i64gather_epi64();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_mask_i64gather_epi64() {
+        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
+        // A multiplier of 8 is word-addressing for i64s
+        let r = _mm_mask_i64gather_epi64::<8>(
+            _mm_set1_epi64x(256),
+            arr.as_ptr(),
+            _mm_setr_epi64x(16, 16),
+            _mm_setr_epi64x(-1, 0),
+        );
+        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
+    }
+    test_mm_mask_i64gather_epi64();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_i64gather_epi64() {
+        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
+        // A multiplier of 8 is word-addressing for i64s
+        let r = _mm256_i64gather_epi64::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
+        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
+    }
+    test_mm256_i64gather_epi64();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_mask_i64gather_epi64() {
+        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
+        // A multiplier of 8 is word-addressing for i64s
+        let r = _mm256_mask_i64gather_epi64::<8>(
+            _mm256_set1_epi64x(256),
+            arr.as_ptr(),
+            _mm256_setr_epi64x(0, 16, 64, 96),
+            _mm256_setr_epi64x(-1, -1, -1, 0),
+        );
+        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
+    }
+    test_mm256_mask_i64gather_epi64();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_i64gather_pd() {
+        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
+        // A multiplier of 8 is word-addressing for f64s
+        let r = _mm_i64gather_pd::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
+        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
+    }
+    test_mm_i64gather_pd();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_mask_i64gather_pd() {
+        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
+        // A multiplier of 8 is word-addressing for f64s
+        let r = _mm_mask_i64gather_pd::<8>(
+            _mm_set1_pd(256.0),
+            arr.as_ptr(),
+            _mm_setr_epi64x(16, 16),
+            _mm_setr_pd(-1.0, 0.0),
+        );
+        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
+    }
+    test_mm_mask_i64gather_pd();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_i64gather_pd() {
+        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
+        // A multiplier of 8 is word-addressing for f64s
+        let r = _mm256_i64gather_pd::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
+        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
+    }
+    test_mm256_i64gather_pd();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_mask_i64gather_pd() {
+        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
+        // A multiplier of 8 is word-addressing for f64s
+        let r = _mm256_mask_i64gather_pd::<8>(
+            _mm256_set1_pd(256.0),
+            arr.as_ptr(),
+            _mm256_setr_epi64x(0, 16, 64, 96),
+            _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
+        );
+        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
+    }
+    test_mm256_mask_i64gather_pd();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_madd_epi16() {
+        let a = _mm256_set1_epi16(2);
+        let b = _mm256_set1_epi16(4);
+        let r = _mm256_madd_epi16(a, b);
+        let e = _mm256_set1_epi32(16);
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_madd_epi16();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_maddubs_epi16() {
+        let a = _mm256_set1_epi8(2);
+        let b = _mm256_set1_epi8(4);
+        let r = _mm256_maddubs_epi16(a, b);
+        let e = _mm256_set1_epi16(16);
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_maddubs_epi16();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_maskload_epi32() {
+        let nums = [1, 2, 3, 4];
+        let a = &nums as *const i32;
+        let mask = _mm_setr_epi32(-1, 0, 0, -1);
+        let r = _mm_maskload_epi32(a, mask);
+        let e = _mm_setr_epi32(1, 0, 0, 4);
+        assert_eq_m128i(r, e);
+
+        // Unaligned pointer
+        let a = Unaligned::new([1i32, 2, 3, 4]);
+        let mask = _mm_setr_epi32(0, !0, 0, !0);
+        let r = _mm_maskload_epi32(a.as_ptr().cast(), mask);
+        let e = _mm_setr_epi32(0, 2, 0, 4);
+        assert_eq_m128i(r, e);
+
+        // Only loading first element, so slice can be short.
+        let a = &[2i32];
+        let mask = _mm_setr_epi32(!0, 0, 0, 0);
+        let r = _mm_maskload_epi32(a.as_ptr(), mask);
+        let e = _mm_setr_epi32(2, 0, 0, 0);
+        assert_eq_m128i(r, e);
+
+        // Only loading last element, so slice can be short.
+        let a = &[2i32];
+        let mask = _mm_setr_epi32(0, 0, 0, !0);
+        let r = _mm_maskload_epi32(a.as_ptr().wrapping_sub(3), mask);
+        let e = _mm_setr_epi32(0, 0, 0, 2);
+        assert_eq_m128i(r, e);
+    }
+    test_mm_maskload_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_maskload_epi32() {
+        let nums = [1, 2, 3, 4, 5, 6, 7, 8];
+        let a = &nums as *const i32;
+        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
+        let r = _mm256_maskload_epi32(a, mask);
+        let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0);
+        assert_eq_m256i(r, e);
+
+        // Unaligned pointer
+        let a = Unaligned::new([1i32, 2, 3, 4, 5, 6, 7, 8]);
+        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
+        let r = _mm256_maskload_epi32(a.as_ptr().cast(), mask);
+        let e = _mm256_setr_epi32(0, 2, 0, 4, 0, 6, 0, 8);
+        assert_eq_m256i(r, e);
+
+        // Only loading first element, so slice can be short.
+        let a = &[2i32];
+        let mask = _mm256_setr_epi32(!0, 0, 0, 0, 0, 0, 0, 0);
+        let r = _mm256_maskload_epi32(a.as_ptr(), mask);
+        let e = _mm256_setr_epi32(2, 0, 0, 0, 0, 0, 0, 0);
+        assert_eq_m256i(r, e);
+
+        // Only loading last element, so slice can be short.
+        let a = &[2i32];
+        let mask = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, !0);
+        let r = _mm256_maskload_epi32(a.as_ptr().wrapping_sub(7), mask);
+        let e = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, 2);
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_maskload_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_maskload_epi64() {
+        let nums = [1_i64, 2_i64];
+        let a = &nums as *const i64;
+        let mask = _mm_setr_epi64x(0, -1);
+        let r = _mm_maskload_epi64(a, mask);
+        let e = _mm_setr_epi64x(0, 2);
+        assert_eq_m128i(r, e);
+
+        // Unaligned pointer
+        let a = Unaligned::new([1i64, 2]);
+        let mask = _mm_setr_epi64x(0, !0);
+        let r = _mm_maskload_epi64(a.as_ptr().cast(), mask);
+        let e = _mm_setr_epi64x(0, 2);
+        assert_eq_m128i(r, e);
+
+        // Only loading first element, so slice can be short.
+        let a = &[2i64];
+        let mask = _mm_setr_epi64x(!0, 0);
+        let r = _mm_maskload_epi64(a.as_ptr(), mask);
+        let e = _mm_setr_epi64x(2, 0);
+        assert_eq_m128i(r, e);
+
+        // Only loading last element, so slice can be short.
+        let a = &[2i64];
+        let mask = _mm_setr_epi64x(0, !0);
+        let r = _mm_maskload_epi64(a.as_ptr().wrapping_sub(1), mask);
+        let e = _mm_setr_epi64x(0, 2);
+        assert_eq_m128i(r, e);
+    }
+    test_mm_maskload_epi64();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_maskload_epi64() {
+        let nums = [1_i64, 2_i64, 3_i64, 4_i64];
+        let a = &nums as *const i64;
+        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
+        let r = _mm256_maskload_epi64(a, mask);
+        let e = _mm256_setr_epi64x(0, 2, 3, 0);
+        assert_eq_m256i(r, e);
+
+        // Unaligned pointer
+        let a = Unaligned::new([1i64, 2, 3, 4]);
+        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
+        let r = _mm256_maskload_epi64(a.as_ptr().cast(), mask);
+        let e = _mm256_setr_epi64x(0, 2, 0, 4);
+        assert_eq_m256i(r, e);
+
+        // Only loading first element, so slice can be short.
+        let a = &[2i64];
+        let mask = _mm256_setr_epi64x(!0, 0, 0, 0);
+        let r = _mm256_maskload_epi64(a.as_ptr(), mask);
+        let e = _mm256_setr_epi64x(2, 0, 0, 0);
+        assert_eq_m256i(r, e);
+
+        // Only loading last element, so slice can be short.
+        let a = &[2i64];
+        let mask = _mm256_setr_epi64x(0, 0, 0, !0);
+        let r = _mm256_maskload_epi64(a.as_ptr().wrapping_sub(3), mask);
+        let e = _mm256_setr_epi64x(0, 0, 0, 2);
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_maskload_epi64();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_maskstore_epi32() {
+        let a = _mm_setr_epi32(1, 2, 3, 4);
+        let mut arr = [-1, -1, -1, -1];
+        let mask = _mm_setr_epi32(-1, 0, 0, -1);
+        _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a);
+        let e = [1, -1, -1, 4];
+        assert_eq!(arr, e);
+
+        // Unaligned pointer
+        let mut r = Unaligned::new([0i32; 4]);
+        let mask = _mm_setr_epi32(0, !0, 0, !0);
+        let a = _mm_setr_epi32(1, 2, 3, 4);
+        _mm_maskstore_epi32(r.as_mut_ptr().cast(), mask, a);
+        let e = [0i32, 2, 0, 4];
+        assert_eq!(r.read(), e);
+
+        // Only storing first element, so slice can be short.
+        let mut r = [0i32];
+        let mask = _mm_setr_epi32(!0, 0, 0, 0);
+        let a = _mm_setr_epi32(1, 2, 3, 4);
+        _mm_maskstore_epi32(r.as_mut_ptr(), mask, a);
+        let e = [1i32];
+        assert_eq!(r, e);
+
+        // Only storing last element, so slice can be short.
+        let mut r = [0i32];
+        let mask = _mm_setr_epi32(0, 0, 0, !0);
+        let a = _mm_setr_epi32(1, 2, 3, 4);
+        _mm_maskstore_epi32(r.as_mut_ptr().wrapping_sub(3), mask, a);
+        let e = [4i32];
+        assert_eq!(r, e);
+    }
+    test_mm_maskstore_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_maskstore_epi32() {
+        let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
+        let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
+        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
+        _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a);
+        let e = [1, -1, -1, 42, -1, 6, 7, -1];
+        assert_eq!(arr, e);
+
+        // Unaligned pointer
+        let mut r = Unaligned::new([0i32; 8]);
+        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
+        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+        _mm256_maskstore_epi32(r.as_mut_ptr().cast(), mask, a);
+        let e = [0i32, 2, 0, 4, 0, 6, 0, 8];
+        assert_eq!(r.read(), e);
+
+        // Only storing first element, so slice can be short.
+        let mut r = [0i32];
+        let mask = _mm256_setr_epi32(!0, 0, 0, 0, 0, 0, 0, 0);
+        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+        _mm256_maskstore_epi32(r.as_mut_ptr(), mask, a);
+        let e = [1i32];
+        assert_eq!(r, e);
+
+        // Only storing last element, so slice can be short.
+        let mut r = [0i32];
+        let mask = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, !0);
+        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+        _mm256_maskstore_epi32(r.as_mut_ptr().wrapping_sub(7), mask, a);
+        let e = [8i32];
+        assert_eq!(r, e);
+    }
+    test_mm256_maskstore_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_maskstore_epi64() {
+        let a = _mm_setr_epi64x(1_i64, 2_i64);
+        let mut arr = [-1_i64, -1_i64];
+        let mask = _mm_setr_epi64x(0, -1);
+        _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a);
+        let e = [-1, 2];
+        assert_eq!(arr, e);
+
+        // Unaligned pointer
+        let mut r = Unaligned::new([0i64; 2]);
+        let mask = _mm_setr_epi64x(0, !0);
+        let a = _mm_setr_epi64x(1, 2);
+        _mm_maskstore_epi64(r.as_mut_ptr().cast(), mask, a);
+        let e = [0i64, 2];
+        assert_eq!(r.read(), e);
+
+        // Only storing first element, so slice can be short.
+        let mut r = [0i64];
+        let mask = _mm_setr_epi64x(!0, 0);
+        let a = _mm_setr_epi64x(1, 2);
+        _mm_maskstore_epi64(r.as_mut_ptr(), mask, a);
+        let e = [1i64];
+        assert_eq!(r, e);
+
+        // Only storing last element, so slice can be short.
+        let mut r = [0i64];
+        let mask = _mm_setr_epi64x(0, !0);
+        let a = _mm_setr_epi64x(1, 2);
+        _mm_maskstore_epi64(r.as_mut_ptr().wrapping_sub(1), mask, a);
+        let e = [2i64];
+        assert_eq!(r, e);
+    }
+    test_mm_maskstore_epi64();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_maskstore_epi64() {
+        let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
+        let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64];
+        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
+        _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a);
+        let e = [-1, 2, 3, -1];
+        assert_eq!(arr, e);
+
+        // Unaligned pointer
+        let mut r = Unaligned::new([0i64; 4]);
+        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
+        let a = _mm256_setr_epi64x(1, 2, 3, 4);
+        _mm256_maskstore_epi64(r.as_mut_ptr().cast(), mask, a);
+        let e = [0i64, 2, 0, 4];
+        assert_eq!(r.read(), e);
+
+        // Only storing first element, so slice can be short.
+        let mut r = [0i64];
+        let mask = _mm256_setr_epi64x(!0, 0, 0, 0);
+        let a = _mm256_setr_epi64x(1, 2, 3, 4);
+        _mm256_maskstore_epi64(r.as_mut_ptr(), mask, a);
+        let e = [1i64];
+        assert_eq!(r, e);
+
+        // Only storing last element, so slice can be short.
+        let mut r = [0i64];
+        let mask = _mm256_setr_epi64x(0, 0, 0, !0);
+        let a = _mm256_setr_epi64x(1, 2, 3, 4);
+        _mm256_maskstore_epi64(r.as_mut_ptr().wrapping_sub(3), mask, a);
+        let e = [4i64];
+        assert_eq!(r, e);
+    }
+    test_mm256_maskstore_epi64();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_mpsadbw_epu8() {
+        let a = _mm256_setr_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 2, 4, 6, 8, 10, 12, 14, 16,
+            18, 20, 22, 24, 26, 28, 30,
+        );
+
+        let r = _mm256_mpsadbw_epu8::<0b000>(a, a);
+        let e = _mm256_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28, 0, 8, 16, 24, 32, 40, 48, 56);
+        assert_eq_m256i(r, e);
+
+        let r = _mm256_mpsadbw_epu8::<0b001>(a, a);
+        let e = _mm256_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12, 32, 24, 16, 8, 0, 8, 16, 24);
+        assert_eq_m256i(r, e);
+
+        let r = _mm256_mpsadbw_epu8::<0b100>(a, a);
+        let e = _mm256_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44, 32, 40, 48, 56, 64, 72, 80, 88);
+        assert_eq_m256i(r, e);
+
+        let r = _mm256_mpsadbw_epu8::<0b101>(a, a);
+        let e = _mm256_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28, 0, 8, 16, 24, 32, 40, 48, 56);
+        assert_eq_m256i(r, e);
+
+        let r = _mm256_mpsadbw_epu8::<0b111>(a, a);
+        let e = _mm256_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4, 64, 56, 48, 40, 32, 24, 16, 8);
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_mpsadbw_epu8();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_mulhrs_epi16() {
+        let a = _mm256_set1_epi16(2);
+        let b = _mm256_set1_epi16(4);
+        let r = _mm256_mullo_epi16(a, b);
+        let e = _mm256_set1_epi16(8);
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_mulhrs_epi16();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_packs_epi16() {
+        let a = _mm256_set1_epi16(2);
+        let b = _mm256_set1_epi16(4);
+        let r = _mm256_packs_epi16(a, b);
+        #[rustfmt::skip]
+        let e = _mm256_setr_epi8(
+            2, 2, 2, 2, 2, 2, 2, 2,
+            4, 4, 4, 4, 4, 4, 4, 4,
+            2, 2, 2, 2, 2, 2, 2, 2,
+            4, 4, 4, 4, 4, 4, 4, 4,
+        );
+
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_packs_epi16();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_packs_epi32() {
+        let a = _mm256_set1_epi32(2);
+        let b = _mm256_set1_epi32(4);
+        let r = _mm256_packs_epi32(a, b);
+        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
+
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_packs_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_packus_epi16() {
+        let a = _mm256_set1_epi16(2);
+        let b = _mm256_set1_epi16(4);
+        let r = _mm256_packus_epi16(a, b);
+        #[rustfmt::skip]
+        let e = _mm256_setr_epi8(
+            2, 2, 2, 2, 2, 2, 2, 2,
+            4, 4, 4, 4, 4, 4, 4, 4,
+            2, 2, 2, 2, 2, 2, 2, 2,
+            4, 4, 4, 4, 4, 4, 4, 4,
+        );
+
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_packus_epi16();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_packus_epi32() {
+        let a = _mm256_set1_epi32(2);
+        let b = _mm256_set1_epi32(4);
+        let r = _mm256_packus_epi32(a, b);
+        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
+
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_packus_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_permutevar8x32_epi32() {
+        let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
+        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
+        let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
+        let r = _mm256_permutevar8x32_epi32(a, b);
+        assert_eq_m256i(r, expected);
+    }
+    test_mm256_permutevar8x32_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_permute2x128_si256() {
+        let a = _mm256_setr_epi64x(100, 200, 500, 600);
+        let b = _mm256_setr_epi64x(300, 400, 700, 800);
+        let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b);
+        let e = _mm256_setr_epi64x(700, 800, 500, 600);
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_permute2x128_si256();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_permutevar8x32_ps() {
+        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
+        let r = _mm256_permutevar8x32_ps(a, b);
+        let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
+        assert_eq_m256(r, e);
+    }
+    test_mm256_permutevar8x32_ps();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_sad_epu8() {
+        let a = _mm256_set1_epi8(2);
+        let b = _mm256_set1_epi8(4);
+        let r = _mm256_sad_epu8(a, b);
+        let e = _mm256_set1_epi64x(16);
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_sad_epu8();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_shuffle_epi8() {
+        #[rustfmt::skip]
+        let a = _mm256_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32,
+        );
+        #[rustfmt::skip]
+        let b = _mm256_setr_epi8(
+            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
+            12, 5, 5, 10, 4, 1, 8, 0,
+            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
+            12, 5, 5, 10, 4, 1, 8, 0,
+        );
+        #[rustfmt::skip]
+        let expected = _mm256_setr_epi8(
+            5, 0, 5, 4, 9, 13, 7, 4,
+            13, 6, 6, 11, 5, 2, 9, 1,
+            21, 0, 21, 20, 25, 29, 23, 20,
+            29, 22, 22, 27, 21, 18, 25, 17,
+        );
+        let r = _mm256_shuffle_epi8(a, b);
+        assert_eq_m256i(r, expected);
+    }
+    test_mm256_shuffle_epi8();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_sign_epi16() {
+        let a = _mm256_set1_epi16(2);
+        let b = _mm256_set1_epi16(-1);
+        let r = _mm256_sign_epi16(a, b);
+        let e = _mm256_set1_epi16(-2);
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_sign_epi16();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_sign_epi32() {
+        let a = _mm256_set1_epi32(2);
+        let b = _mm256_set1_epi32(-1);
+        let r = _mm256_sign_epi32(a, b);
+        let e = _mm256_set1_epi32(-2);
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_sign_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_sign_epi8() {
+        let a = _mm256_set1_epi8(2);
+        let b = _mm256_set1_epi8(-1);
+        let r = _mm256_sign_epi8(a, b);
+        let e = _mm256_set1_epi8(-2);
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_sign_epi8();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_sll_epi16() {
+        let a = _mm256_setr_epi16(
+            0x88, -0x88, 0x99, -0x99, 0xAA, -0xAA, 0xBB, -0xBB, 0xCC, -0xCC, 0xDD, -0xDD, 0xEE,
+            -0xEE, 0xFF, -0xFF,
+        );
+        let r = _mm256_sll_epi16(a, _mm_set_epi64x(0, 4));
+        assert_eq_m256i(
+            r,
+            _mm256_setr_epi16(
+                0x880, -0x880, 0x990, -0x990, 0xAA0, -0xAA0, 0xBB0, -0xBB0, 0xCC0, -0xCC0, 0xDD0,
+                -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0,
+            ),
+        );
+        let r = _mm256_sll_epi16(a, _mm_set_epi64x(4, 0));
+        assert_eq_m256i(r, a);
+        let r = _mm256_sll_epi16(a, _mm_set_epi64x(0, 16));
+        assert_eq_m256i(r, _mm256_set1_epi16(0));
+        let r = _mm256_sll_epi16(a, _mm_set_epi64x(0, i64::MAX));
+        assert_eq_m256i(r, _mm256_set1_epi16(0));
+    }
+    test_mm256_sll_epi16();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_sll_epi32() {
+        let a =
+            _mm256_setr_epi32(0xCCCC, -0xCCCC, 0xDDDD, -0xDDDD, 0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+        let r = _mm256_sll_epi32(a, _mm_set_epi64x(0, 4));
+        assert_eq_m256i(
+            r,
+            _mm256_setr_epi32(
+                0xCCCC0, -0xCCCC0, 0xDDDD0, -0xDDDD0, 0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0,
+            ),
+        );
+        let r = _mm256_sll_epi32(a, _mm_set_epi64x(4, 0));
+        assert_eq_m256i(r, a);
+        let r = _mm256_sll_epi32(a, _mm_set_epi64x(0, 32));
+        assert_eq_m256i(r, _mm256_set1_epi32(0));
+        let r = _mm256_sll_epi32(a, _mm_set_epi64x(0, i64::MAX));
+        assert_eq_m256i(r, _mm256_set1_epi32(0));
+    }
+    test_mm256_sll_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_sll_epi64() {
+        let a = _mm256_set_epi64x(0xEEEEEEEE, -0xEEEEEEEE, 0xFFFFFFFF, -0xFFFFFFFF);
+        let r = _mm256_sll_epi64(a, _mm_set_epi64x(0, 4));
+        assert_eq_m256i(r, _mm256_set_epi64x(0xEEEEEEEE0, -0xEEEEEEEE0, 0xFFFFFFFF0, -0xFFFFFFFF0));
+        let r = _mm256_sll_epi64(a, _mm_set_epi64x(4, 0));
+        assert_eq_m256i(r, a);
+        let r = _mm256_sll_epi64(a, _mm_set_epi64x(0, 64));
+        assert_eq_m256i(r, _mm256_set1_epi64x(0));
+        let r = _mm256_sll_epi64(a, _mm_set_epi64x(0, i64::MAX));
+        assert_eq_m256i(r, _mm256_set1_epi64x(0));
+    }
+    test_mm256_sll_epi64();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_sra_epi16() {
+        let a = _mm256_setr_epi16(
+            0x88, -0x88, 0x99, -0x99, 0xAA, -0xAA, 0xBB, -0xBB, 0xCC, -0xCC, 0xDD, -0xDD, 0xEE,
+            -0xEE, 0xFF, -0xFF,
+        );
+        let r = _mm256_sra_epi16(a, _mm_set_epi64x(0, 4));
+        assert_eq_m256i(
+            r,
+            _mm256_setr_epi16(
+                0x8, -0x9, 0x9, -0xA, 0xA, -0xB, 0xB, -0xC, 0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF,
+                -0x10,
+            ),
+        );
+        let r = _mm256_sra_epi16(a, _mm_set_epi64x(4, 0));
+        assert_eq_m256i(r, a);
+        let r = _mm256_sra_epi16(a, _mm_set_epi64x(0, 16));
+        assert_eq_m256i(
+            r,
+            _mm256_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1),
+        );
+        let r = _mm256_sra_epi16(a, _mm_set_epi64x(0, i64::MAX));
+        assert_eq_m256i(
+            r,
+            _mm256_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1),
+        );
+    }
+    test_mm256_sra_epi16();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_sra_epi32() {
+        let a =
+            _mm256_setr_epi32(0xCCCC, -0xCCCC, 0xDDDD, -0xDDDD, 0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+        let r = _mm256_sra_epi32(a, _mm_set_epi64x(0, 4));
+        assert_eq_m256i(
+            r,
+            _mm256_setr_epi32(0xCCC, -0xCCD, 0xDDD, -0xDDE, 0xEEE, -0xEEF, 0xFFF, -0x1000),
+        );
+        let r = _mm256_sra_epi32(a, _mm_set_epi64x(4, 0));
+        assert_eq_m256i(r, a);
+        let r = _mm256_sra_epi32(a, _mm_set_epi64x(0, 32));
+        assert_eq_m256i(r, _mm256_setr_epi32(0, -1, 0, -1, 0, -1, 0, -1));
+        let r = _mm256_sra_epi32(a, _mm_set_epi64x(0, i64::MAX));
+        assert_eq_m256i(r, _mm256_setr_epi32(0, -1, 0, -1, 0, -1, 0, -1));
+    }
+    test_mm256_sra_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_srl_epi16() {
+        let a = _mm256_setr_epi16(
+            0x88, -0x88, 0x99, -0x99, 0xAA, -0xAA, 0xBB, -0xBB, 0xCC, -0xCC, 0xDD, -0xDD, 0xEE,
+            -0xEE, 0xFF, -0xFF,
+        );
+        let r = _mm256_srl_epi16(a, _mm_set_epi64x(0, 4));
+        assert_eq_m256i(
+            r,
+            _mm256_setr_epi16(
+                0x8, 0xFF7, 0x9, 0xFF6, 0xA, 0xFF5, 0xB, 0xFF4, 0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1,
+                0xF, 0xFF0,
+            ),
+        );
+        let r = _mm256_srl_epi16(a, _mm_set_epi64x(4, 0));
+        assert_eq_m256i(r, a);
+        let r = _mm256_srl_epi16(a, _mm_set_epi64x(0, 16));
+        assert_eq_m256i(r, _mm256_set1_epi16(0));
+        let r = _mm256_srl_epi16(a, _mm_set_epi64x(0, i64::MAX));
+        assert_eq_m256i(r, _mm256_set1_epi16(0));
+    }
+    test_mm256_srl_epi16();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_srl_epi32() {
+        let a =
+            _mm256_setr_epi32(0xCCCC, -0xCCCC, 0xDDDD, -0xDDDD, 0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+        let r = _mm256_srl_epi32(a, _mm_set_epi64x(0, 4));
+        assert_eq_m256i(
+            r,
+            _mm256_setr_epi32(
+                0xCCC, 0xFFFF333, 0xDDD, 0xFFFF222, 0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000,
+            ),
+        );
+        let r = _mm256_srl_epi32(a, _mm_set_epi64x(4, 0));
+        assert_eq_m256i(r, a);
+        let r = _mm256_srl_epi32(a, _mm_set_epi64x(0, 32));
+        assert_eq_m256i(r, _mm256_set1_epi32(0));
+        let r = _mm256_srl_epi32(a, _mm_set_epi64x(0, i64::MAX));
+        assert_eq_m256i(r, _mm256_set1_epi32(0));
+    }
+    test_mm256_srl_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_srl_epi64() {
+        let a = _mm256_set_epi64x(0xEEEEEEEE, -0xEEEEEEEE, 0xFFFFFFFF, -0xFFFFFFFF);
+        let r = _mm256_srl_epi64(a, _mm_set_epi64x(0, 4));
+        assert_eq_m256i(
+            r,
+            _mm256_set_epi64x(0xEEEEEEE, 0xFFFFFFFF1111111, 0xFFFFFFF, 0xFFFFFFFF0000000),
+        );
+        let r = _mm256_srl_epi64(a, _mm_set_epi64x(4, 0));
+        assert_eq_m256i(r, a);
+        let r = _mm256_srl_epi64(a, _mm_set_epi64x(0, 64));
+        assert_eq_m256i(r, _mm256_set1_epi64x(0));
+        let r = _mm256_srl_epi64(a, _mm_set_epi64x(0, i64::MAX));
+        assert_eq_m256i(r, _mm256_set1_epi64x(0));
+    }
+    test_mm256_srl_epi64();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_sllv_epi32() {
+        let a = _mm_set_epi32(1, 2, 3, 4);
+        let b = _mm_set_epi32(4, 3, 2, 1);
+        let r = _mm_sllv_epi32(a, b);
+        let e = _mm_set_epi32(16, 16, 12, 8);
+        assert_eq_m128i(r, e);
+    }
+    test_mm_sllv_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_sllv_epi32() {
+        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1);
+        let r = _mm256_sllv_epi32(a, b);
+        let e = _mm256_set_epi32(256, 256, 192, 128, 80, 48, 28, 16);
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_sllv_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_sllv_epi64() {
+        let a = _mm_set_epi64x(2, 3);
+        let b = _mm_set_epi64x(1, 2);
+        let r = _mm_sllv_epi64(a, b);
+        let e = _mm_set_epi64x(4, 12);
+        assert_eq_m128i(r, e);
+    }
+    test_mm_sllv_epi64();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_sllv_epi64() {
+        let a = _mm256_set_epi64x(1, 2, 3, 4);
+        let b = _mm256_set_epi64x(4, 3, 2, 1);
+        let r = _mm256_sllv_epi64(a, b);
+        let e = _mm256_set_epi64x(16, 16, 12, 8);
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_sllv_epi64();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_srav_epi32() {
+        let a = _mm_set_epi32(16, -32, 64, -128);
+        let b = _mm_set_epi32(4, 3, 2, 1);
+        let r = _mm_srav_epi32(a, b);
+        let e = _mm_set_epi32(1, -4, 16, -64);
+        assert_eq_m128i(r, e);
+    }
+    test_mm_srav_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_srav_epi32() {
+        let a = _mm256_set_epi32(256, -512, 1024, -2048, 4096, -8192, 16384, -32768);
+        let b = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1);
+        let r = _mm256_srav_epi32(a, b);
+        let e = _mm256_set_epi32(1, -4, 16, -64, 256, -1024, 4096, -16384);
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_srav_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_srlv_epi32() {
+        let a = _mm_set_epi32(16, 32, 64, 128);
+        let b = _mm_set_epi32(4, 3, 2, 1);
+        let r = _mm_srlv_epi32(a, b);
+        let e = _mm_set_epi32(1, 4, 16, 64);
+        assert_eq_m128i(r, e);
+    }
+    test_mm_srlv_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_srlv_epi32() {
+        let a = _mm256_set_epi32(256, 512, 1024, 2048, 4096, 8192, 16384, 32768);
+        let b = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1);
+        let r = _mm256_srlv_epi32(a, b);
+        let e = _mm256_set_epi32(1, 4, 16, 64, 256, 1024, 4096, 16384);
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_srlv_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm_srlv_epi64() {
+        let a = _mm_set_epi64x(4, 8);
+        let b = _mm_set_epi64x(2, 1);
+        let r = _mm_srlv_epi64(a, b);
+        let e = _mm_set_epi64x(1, 4);
+        assert_eq_m128i(r, e);
+    }
+    test_mm_srlv_epi64();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_srlv_epi64() {
+        let a = _mm256_set_epi64x(16, 32, 64, 128);
+        let b = _mm256_set_epi64x(4, 3, 2, 1);
+        let r = _mm256_srlv_epi64(a, b);
+        let e = _mm256_set_epi64x(1, 4, 16, 64);
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_srlv_epi64();
+}
+
+#[target_feature(enable = "sse2")]
+unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i {
+    _mm_set_epi64x(b, a)
+}
+
+#[track_caller]
+#[target_feature(enable = "sse")]
+unsafe fn assert_eq_m128(a: __m128, b: __m128) {
+    let r = _mm_cmpeq_ps(a, b);
+    if _mm_movemask_ps(r) != 0b1111 {
+        panic!("{:?} != {:?}", a, b);
+    }
+}
+
+#[track_caller]
+#[target_feature(enable = "sse2")]
+unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
+    if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 {
+        panic!("{:?} != {:?}", a, b);
+    }
+}
+
+#[track_caller]
+#[target_feature(enable = "sse2")]
+unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) {
+    assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b))
+}
+
+#[track_caller]
+#[target_feature(enable = "avx")]
+unsafe fn assert_eq_m256(a: __m256, b: __m256) {
+    let cmp = _mm256_cmp_ps::<_CMP_EQ_OQ>(a, b);
+    if _mm256_movemask_ps(cmp) != 0b11111111 {
+        panic!("{:?} != {:?}", a, b);
+    }
+}
+
+#[track_caller]
+#[target_feature(enable = "avx")]
+unsafe fn assert_eq_m256d(a: __m256d, b: __m256d) {
+    let cmp = _mm256_cmp_pd::<_CMP_EQ_OQ>(a, b);
+    if _mm256_movemask_pd(cmp) != 0b1111 {
+        panic!("{:?} != {:?}", a, b);
+    }
+}
+
+#[track_caller]
+#[target_feature(enable = "avx")]
+unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) {
+    assert_eq!(transmute::<_, [u64; 4]>(a), transmute::<_, [u64; 4]>(b))
+}
+
+/// Stores `T` in an unaligned address
+struct Unaligned<T: Copy> {
+    buf: Vec<u8>,
+    offset: bool,
+    _marker: std::marker::PhantomData<T>,
+}
+
+impl<T: Copy> Unaligned<T> {
+    fn new(value: T) -> Self {
+        // Allocate extra byte for unalignment headroom
+        let len = std::mem::size_of::<T>();
+        let mut buf = Vec::<u8>::with_capacity(len + 1);
+        // Force the address to be a non-multiple of 2, so it is as unaligned as it can get.
+        let offset = (buf.as_ptr() as usize % 2) == 0;
+        let value_ptr: *const T = &value;
+        unsafe {
+            buf.as_mut_ptr().add(offset.into()).copy_from_nonoverlapping(value_ptr.cast(), len);
+        }
+        Self { buf, offset, _marker: std::marker::PhantomData }
+    }
+
+    fn as_ptr(&self) -> *const T {
+        unsafe { self.buf.as_ptr().add(self.offset.into()).cast() }
+    }
+
+    fn as_mut_ptr(&mut self) -> *mut T {
+        unsafe { self.buf.as_mut_ptr().add(self.offset.into()).cast() }
+    }
+
+    fn read(&self) -> T {
+        unsafe { self.as_ptr().read_unaligned() }
+    }
+}

From ed15716fe61e181b7c605c00e44ca208e2446db7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= <eduardosm-dev@e64.io>
Date: Tue, 23 Apr 2024 17:47:46 +0200
Subject: [PATCH 11/28] Configure clippy not to generate warnings about
 arithmetic operations on `rustc_target::abi::Size`

---
 src/tools/miri/clippy.toml          | 1 +
 src/tools/miri/src/shims/x86/mod.rs | 4 ----
 2 files changed, 1 insertion(+), 4 deletions(-)
 create mode 100644 src/tools/miri/clippy.toml

diff --git a/src/tools/miri/clippy.toml b/src/tools/miri/clippy.toml
new file mode 100644
index 00000000000..284e18a45a3
--- /dev/null
+++ b/src/tools/miri/clippy.toml
@@ -0,0 +1 @@
+arithmetic-side-effects-allowed = ["rustc_target::abi::Size"]
diff --git a/src/tools/miri/src/shims/x86/mod.rs b/src/tools/miri/src/shims/x86/mod.rs
index a9d248c2a85..cf3c3758cd1 100644
--- a/src/tools/miri/src/shims/x86/mod.rs
+++ b/src/tools/miri/src/shims/x86/mod.rs
@@ -985,8 +985,6 @@ fn mask_load<'tcx>(
         let dest = this.project_index(&dest, i)?;
 
         if this.read_scalar(&mask)?.to_uint(mask_item_size)? >> high_bit_offset != 0 {
-            // Size * u64 is implemented as always checked
-            #[allow(clippy::arithmetic_side_effects)]
             let ptr = ptr.wrapping_offset(dest.layout.size * i, &this.tcx);
             // Unaligned copy, which is what we want.
             this.mem_copy(ptr, dest.ptr(), dest.layout.size, /*nonoverlapping*/ true)?;
@@ -1020,8 +1018,6 @@ fn mask_store<'tcx>(
         let value = this.project_index(&value, i)?;
 
         if this.read_scalar(&mask)?.to_uint(mask_item_size)? >> high_bit_offset != 0 {
-            // Size * u64 is implemented as always checked
-            #[allow(clippy::arithmetic_side_effects)]
             let ptr = ptr.wrapping_offset(value.layout.size * i, &this.tcx);
             // Unaligned copy, which is what we want.
             this.mem_copy(value.ptr(), ptr, value.layout.size, /*nonoverlapping*/ true)?;

From 4dbc4f87e6d6ba7e1d71266ebc66a0fb0f3f7550 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= <eduardosm-dev@e64.io>
Date: Tue, 23 Apr 2024 19:23:15 +0200
Subject: [PATCH 12/28] Fix wording in shift functions doc comments

---
 src/tools/miri/src/shims/x86/mod.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/tools/miri/src/shims/x86/mod.rs b/src/tools/miri/src/shims/x86/mod.rs
index cf3c3758cd1..cd4e1c2e000 100644
--- a/src/tools/miri/src/shims/x86/mod.rs
+++ b/src/tools/miri/src/shims/x86/mod.rs
@@ -488,7 +488,7 @@ enum ShiftOp {
 ///
 /// For logic shifts, when right is larger than BITS - 1, zero is produced.
 /// For arithmetic right-shifts, when right is larger than BITS - 1, the sign
-/// bit is copied to remaining bits.
+/// bit is copied to all bits.
 fn shift_simd_by_scalar<'tcx>(
     this: &mut crate::MiriInterpCx<'_, 'tcx>,
     left: &OpTy<'tcx, Provenance>,
@@ -544,7 +544,7 @@ fn shift_simd_by_scalar<'tcx>(
 ///
 /// For logic shifts, when right is larger than BITS - 1, zero is produced.
 /// For arithmetic right-shifts, when right is larger than BITS - 1, the sign
-/// bit is copied to remaining bits.
+/// bit is copied to all bits.
 fn shift_simd_by_simd<'tcx>(
     this: &mut crate::MiriInterpCx<'_, 'tcx>,
     left: &OpTy<'tcx, Provenance>,

From b3affd3e90d44833ca43e8d14c0af54bcd862283 Mon Sep 17 00:00:00 2001
From: The Miri Cronjob Bot <miri@cron.bot>
Date: Wed, 24 Apr 2024 04:57:24 +0000
Subject: [PATCH 13/28] Preparing for merge from rustc

---
 src/tools/miri/rust-version | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tools/miri/rust-version b/src/tools/miri/rust-version
index 9b0b1c8d230..3191355ccbe 100644
--- a/src/tools/miri/rust-version
+++ b/src/tools/miri/rust-version
@@ -1 +1 @@
-aca749eefceaed0cda19a7ec5e472fce9387bc00
+c1feb3eceef7d5f0126c309a87062cf413fe0a25

From 342943bc7728198dd92e8adb536dbd608a5dfcec Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Tue, 23 Apr 2024 09:34:49 +0200
Subject: [PATCH 14/28] windows: basic support for GetUserProfileDirectoryW

---
 src/tools/miri/Cargo.lock                     | 49 ++++++++++++++++
 src/tools/miri/Cargo.toml                     |  1 +
 src/tools/miri/src/shims/env.rs               | 58 ++++++++++++++++++-
 .../miri/src/shims/windows/foreign_items.rs   |  6 ++
 src/tools/miri/tests/pass/shims/env/home.rs   |  2 +-
 5 files changed, 114 insertions(+), 2 deletions(-)

diff --git a/src/tools/miri/Cargo.lock b/src/tools/miri/Cargo.lock
index 1e6b5502b04..293b937a5e5 100644
--- a/src/tools/miri/Cargo.lock
+++ b/src/tools/miri/Cargo.lock
@@ -299,6 +299,27 @@ dependencies = [
  "windows-sys 0.52.0",
 ]
 
+[[package]]
+name = "directories"
+version = "5.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a49173b84e034382284f27f1af4dcbbd231ffa358c0fe316541a7337f376a35"
+dependencies = [
+ "dirs-sys",
+]
+
+[[package]]
+name = "dirs-sys"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c"
+dependencies = [
+ "libc",
+ "option-ext",
+ "redox_users",
+ "windows-sys 0.48.0",
+]
+
 [[package]]
 name = "encode_unicode"
 version = "0.3.6"
@@ -490,6 +511,16 @@ dependencies = [
  "windows-sys 0.48.0",
 ]
 
+[[package]]
+name = "libredox"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d"
+dependencies = [
+ "bitflags 2.4.2",
+ "libc",
+]
+
 [[package]]
 name = "linux-raw-sys"
 version = "0.4.13"
@@ -558,6 +589,7 @@ dependencies = [
  "chrono",
  "colored",
  "ctrlc",
+ "directories",
  "getrandom",
  "jemalloc-sys",
  "lazy_static",
@@ -614,6 +646,12 @@ version = "1.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
 
+[[package]]
+name = "option-ext"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
+
 [[package]]
 name = "owo-colors"
 version = "3.5.0"
@@ -746,6 +784,17 @@ dependencies = [
  "bitflags 1.3.2",
 ]
 
+[[package]]
+name = "redox_users"
+version = "0.4.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bd283d9651eeda4b2a83a43c1c91b266c40fd76ecd39a50a8c630ae69dc72891"
+dependencies = [
+ "getrandom",
+ "libredox",
+ "thiserror",
+]
+
 [[package]]
 name = "regex"
 version = "1.10.3"
diff --git a/src/tools/miri/Cargo.toml b/src/tools/miri/Cargo.toml
index 7748d630b12..b00dae784d2 100644
--- a/src/tools/miri/Cargo.toml
+++ b/src/tools/miri/Cargo.toml
@@ -25,6 +25,7 @@ aes = { version = "0.8.3", features = ["hazmat"] }
 measureme = "11"
 ctrlc = "3.2.5"
 chrono = { version = "0.4.38", default-features = false, features = ["clock"] }
+directories = "5"
 
 # Copied from `compiler/rustc/Cargo.toml`.
 # But only for some targets, it fails for others. Rustc configures this in its CI, but we can't
diff --git a/src/tools/miri/src/shims/env.rs b/src/tools/miri/src/shims/env.rs
index d97873ce722..22571d0c1c2 100644
--- a/src/tools/miri/src/shims/env.rs
+++ b/src/tools/miri/src/shims/env.rs
@@ -494,9 +494,65 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
     fn GetCurrentProcessId(&mut self) -> InterpResult<'tcx, u32> {
         let this = self.eval_context_mut();
         this.assert_target_os("windows", "GetCurrentProcessId");
-
         this.check_no_isolation("`GetCurrentProcessId`")?;
 
         Ok(std::process::id())
     }
+
+    #[allow(non_snake_case)]
+    fn GetUserProfileDirectoryW(
+        &mut self,
+        token: &OpTy<'tcx, Provenance>, // HANDLE
+        buf: &OpTy<'tcx, Provenance>,   // LPWSTR
+        size: &OpTy<'tcx, Provenance>,  // LPDWORD
+    ) -> InterpResult<'tcx, Scalar<Provenance>> // returns BOOL
+    {
+        let this = self.eval_context_mut();
+        this.assert_target_os("windows", "GetUserProfileDirectoryW");
+        this.check_no_isolation("`GetUserProfileDirectoryW`")?;
+
+        let token = this.read_target_isize(token)?;
+        let buf = this.read_pointer(buf)?;
+        let size = this.deref_pointer(size)?;
+
+        if token != -4 {
+            throw_unsup_format!(
+                "GetUserProfileDirectoryW: only CURRENT_PROCESS_TOKEN is supported"
+            );
+        }
+
+        // See <https://learn.microsoft.com/en-us/windows/win32/api/userenv/nf-userenv-getuserprofiledirectoryw> for docs.
+        Ok(match directories::UserDirs::new() {
+            Some(dirs) => {
+                let home = dirs.home_dir();
+                let size_avail = if this.ptr_is_null(size.ptr())? {
+                    0 // if the buf pointer is null, we can't write to it; `size` will be updated to the required length
+                } else {
+                    this.read_scalar(&size)?.to_u32()?
+                };
+                // Of course we cannot use `windows_check_buffer_size` here since this uses
+                // a different method for dealing with a too-small buffer than the other functions...
+                let (success, len) = this.write_path_to_wide_str(
+                    home,
+                    buf,
+                    size_avail.into(),
+                    /*truncate*/ false,
+                )?;
+                // The Windows docs just say that this is written on failure. But std
+                // seems to rely on it always being written.
+                this.write_scalar(Scalar::from_u32(len.try_into().unwrap()), &size)?;
+                if success {
+                    Scalar::from_i32(1) // return TRUE
+                } else {
+                    this.set_last_error(this.eval_windows("c", "ERROR_INSUFFICIENT_BUFFER"))?;
+                    Scalar::from_i32(0) // return FALSE
+                }
+            }
+            None => {
+                // We have to pick some error code.
+                this.set_last_error(this.eval_windows("c", "ERROR_BAD_USER_PROFILE"))?;
+                Scalar::from_i32(0) // return FALSE
+            }
+        })
+    }
 }
diff --git a/src/tools/miri/src/shims/windows/foreign_items.rs b/src/tools/miri/src/shims/windows/foreign_items.rs
index c81d6b2f7fd..cf35ecec7cf 100644
--- a/src/tools/miri/src/shims/windows/foreign_items.rs
+++ b/src/tools/miri/src/shims/windows/foreign_items.rs
@@ -135,6 +135,12 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
                 let result = this.SetCurrentDirectoryW(path)?;
                 this.write_scalar(result, dest)?;
             }
+            "GetUserProfileDirectoryW" => {
+                let [token, buf, size] =
+                    this.check_shim(abi, Abi::System { unwind: false }, link_name, args)?;
+                let result = this.GetUserProfileDirectoryW(token, buf, size)?;
+                this.write_scalar(result, dest)?;
+            }
 
             // File related shims
             "NtWriteFile" => {
diff --git a/src/tools/miri/tests/pass/shims/env/home.rs b/src/tools/miri/tests/pass/shims/env/home.rs
index 9eb9c3af569..c237f9ed9ff 100644
--- a/src/tools/miri/tests/pass/shims/env/home.rs
+++ b/src/tools/miri/tests/pass/shims/env/home.rs
@@ -1,9 +1,9 @@
-//@ignore-target-windows: home_dir is not supported on Windows
 //@compile-flags: -Zmiri-disable-isolation
 use std::env;
 
 fn main() {
     env::remove_var("HOME"); // make sure we enter the interesting codepath
+    env::remove_var("USERPROFILE"); // Windows also looks as this env var
     #[allow(deprecated)]
     env::home_dir().unwrap();
 }

From 00acfabcee9043c9a716acabde327f03a4a3eaa8 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Tue, 23 Apr 2024 09:41:55 +0200
Subject: [PATCH 15/28] windows buffer size protocol: turns out std resets
 last_error to 0; let's require that in general

---
 src/tools/miri/src/shims/env.rs               | 23 +++++++++++--------
 .../miri/src/shims/windows/foreign_items.rs   |  3 ++-
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/src/tools/miri/src/shims/env.rs b/src/tools/miri/src/shims/env.rs
index 22571d0c1c2..402e2670888 100644
--- a/src/tools/miri/src/shims/env.rs
+++ b/src/tools/miri/src/shims/env.rs
@@ -160,10 +160,12 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
         this.assert_target_os("windows", "GetEnvironmentVariableW");
 
         let name_ptr = this.read_pointer(name_op)?;
+        let buf_ptr = this.read_pointer(buf_op)?;
+        let buf_size = this.read_scalar(size_op)?.to_u32()?; // in characters
+
         let name = this.read_os_str_from_wide_str(name_ptr)?;
         Ok(match this.machine.env_vars.map.get(&name) {
             Some(&var_ptr) => {
-                this.set_last_error(Scalar::from_u32(0))?; // make sure this is unambiguously not an error
                 // The offset is used to strip the "{name}=" part of the string.
                 #[rustfmt::skip]
                 let name_offset_bytes = u64::try_from(name.len()).unwrap()
@@ -172,14 +174,14 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
                 let var_ptr = var_ptr.offset(Size::from_bytes(name_offset_bytes), this)?;
                 let var = this.read_os_str_from_wide_str(var_ptr)?;
 
-                let buf_ptr = this.read_pointer(buf_op)?;
-                // `buf_size` represents the size in characters.
-                let buf_size = u64::from(this.read_scalar(size_op)?.to_u32()?);
-                Scalar::from_u32(windows_check_buffer_size(
-                    this.write_os_str_to_wide_str(
-                        &var, buf_ptr, buf_size, /*truncate*/ false,
-                    )?,
-                ))
+                Scalar::from_u32(windows_check_buffer_size(this.write_os_str_to_wide_str(
+                    &var,
+                    buf_ptr,
+                    buf_size.into(),
+                    /*truncate*/ false,
+                )?))
+                // This can in fact return 0. It is up to the caller to set last_error to 0
+                // beforehand and check it afterwards to exclude that case.
             }
             None => {
                 let envvar_not_found = this.eval_windows("c", "ERROR_ENVVAR_NOT_FOUND");
@@ -375,7 +377,8 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
         // If we cannot get the current directory, we return 0
         match env::current_dir() {
             Ok(cwd) => {
-                this.set_last_error(Scalar::from_u32(0))?; // make sure this is unambiguously not an error
+                // This can in fact return 0. It is up to the caller to set last_error to 0
+                // beforehand and check it afterwards to exclude that case.
                 return Ok(Scalar::from_u32(windows_check_buffer_size(
                     this.write_path_to_wide_str(&cwd, buf, size, /*truncate*/ false)?,
                 )));
diff --git a/src/tools/miri/src/shims/windows/foreign_items.rs b/src/tools/miri/src/shims/windows/foreign_items.rs
index cf35ecec7cf..e0cca43d263 100644
--- a/src/tools/miri/src/shims/windows/foreign_items.rs
+++ b/src/tools/miri/src/shims/windows/foreign_items.rs
@@ -231,7 +231,6 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
                         Scalar::from_u32(0) // return zero upon failure
                     }
                     Ok(abs_filename) => {
-                        this.set_last_error(Scalar::from_u32(0))?; // make sure this is unambiguously not an error
                         Scalar::from_u32(helpers::windows_check_buffer_size(
                             this.write_path_to_wide_str(
                                 &abs_filename,
@@ -240,6 +239,8 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
                                 /*truncate*/ false,
                             )?,
                         ))
+                        // This can in fact return 0. It is up to the caller to set last_error to 0
+                        // beforehand and check it afterwards to exclude that case.
                     }
                 };
                 this.write_scalar(result, dest)?;

From ccb43b6a95b10e6e83a9a6ce9fa88ead0c6ae0bc Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Tue, 23 Apr 2024 09:56:56 +0200
Subject: [PATCH 16/28] make the obsucre truncating variant of
 this.write_os_str_to_wide_str a non-default function

---
 src/tools/miri/src/shims/env.rs               | 10 +--
 src/tools/miri/src/shims/os_str.rs            | 68 ++++++++++++-------
 .../miri/src/shims/windows/foreign_items.rs   | 25 ++-----
 3 files changed, 52 insertions(+), 51 deletions(-)

diff --git a/src/tools/miri/src/shims/env.rs b/src/tools/miri/src/shims/env.rs
index 402e2670888..298fefdb0f3 100644
--- a/src/tools/miri/src/shims/env.rs
+++ b/src/tools/miri/src/shims/env.rs
@@ -178,7 +178,6 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
                     &var,
                     buf_ptr,
                     buf_size.into(),
-                    /*truncate*/ false,
                 )?))
                 // This can in fact return 0. It is up to the caller to set last_error to 0
                 // beforehand and check it afterwards to exclude that case.
@@ -380,7 +379,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
                 // This can in fact return 0. It is up to the caller to set last_error to 0
                 // beforehand and check it afterwards to exclude that case.
                 return Ok(Scalar::from_u32(windows_check_buffer_size(
-                    this.write_path_to_wide_str(&cwd, buf, size, /*truncate*/ false)?,
+                    this.write_path_to_wide_str(&cwd, buf, size)?,
                 )));
             }
             Err(e) => this.set_last_error_from_io_error(e.kind())?,
@@ -535,12 +534,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
                 };
                 // Of course we cannot use `windows_check_buffer_size` here since this uses
                 // a different method for dealing with a too-small buffer than the other functions...
-                let (success, len) = this.write_path_to_wide_str(
-                    home,
-                    buf,
-                    size_avail.into(),
-                    /*truncate*/ false,
-                )?;
+                let (success, len) = this.write_path_to_wide_str(home, buf, size_avail.into())?;
                 // The Windows docs just say that this is written on failure. But std
                 // seems to rely on it always being written.
                 this.write_scalar(Scalar::from_u32(len.try_into().unwrap()), &size)?;
diff --git a/src/tools/miri/src/shims/os_str.rs b/src/tools/miri/src/shims/os_str.rs
index 3e8c35d48ae..5fcea9ced69 100644
--- a/src/tools/miri/src/shims/os_str.rs
+++ b/src/tools/miri/src/shims/os_str.rs
@@ -72,11 +72,9 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
         u16vec_to_osstring(u16_vec)
     }
 
-    /// Helper function to write an OsStr as a null-terminated sequence of bytes, which is what
-    /// the Unix APIs usually handle. This function returns `Ok((false, length))` without trying
-    /// to write if `size` is not large enough to fit the contents of `os_string` plus a null
-    /// terminator. It returns `Ok((true, length))` if the writing process was successful. The
-    /// string length returned does include the null terminator.
+    /// Helper function to write an OsStr as a null-terminated sequence of bytes, which is what the
+    /// Unix APIs usually handle. Returns `(success, full_len)`, where length includes the null
+    /// terminator. On failure, nothing is written.
     fn write_os_str_to_c_str(
         &mut self,
         os_str: &OsStr,
@@ -87,19 +85,9 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
         self.eval_context_mut().write_c_str(bytes, ptr, size)
     }
 
-    /// Helper function to write an OsStr as a 0x0000-terminated u16-sequence, which is what the
-    /// Windows APIs usually handle.
-    ///
-    /// If `truncate == false` (the usual mode of operation), this function returns `Ok((false,
-    /// length))` without trying to write if `size` is not large enough to fit the contents of
-    /// `os_string` plus a null terminator. It returns `Ok((true, length))` if the writing process
-    /// was successful. The string length returned does include the null terminator. Length is
-    /// measured in units of `u16.`
-    ///
-    /// If `truncate == true`, then in case `size` is not large enough it *will* write the first
-    /// `size.saturating_sub(1)` many items, followed by a null terminator (if `size > 0`).
-    /// The return value is still `(false, length)` in that case.
-    fn write_os_str_to_wide_str(
+    /// Internal helper to share code between `write_os_str_to_wide_str` and
+    /// `write_os_str_to_wide_str_truncated`.
+    fn write_os_str_to_wide_str_helper(
         &mut self,
         os_str: &OsStr,
         ptr: Pointer<Option<Provenance>>,
@@ -133,6 +121,29 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
         Ok((written, size_needed))
     }
 
+    /// Helper function to write an OsStr as a 0x0000-terminated u16-sequence, which is what the
+    /// Windows APIs usually handle. Returns `(success, full_len)`, where length is measured
+    /// in units of `u16` and includes the null terminator. On failure, nothing is written.
+    fn write_os_str_to_wide_str(
+        &mut self,
+        os_str: &OsStr,
+        ptr: Pointer<Option<Provenance>>,
+        size: u64,
+    ) -> InterpResult<'tcx, (bool, u64)> {
+        self.write_os_str_to_wide_str_helper(os_str, ptr, size, /*truncate*/ false)
+    }
+
+    /// Like `write_os_str_to_wide_str`, but on failure as much as possible is written into
+    /// the buffer (always with a null terminator).
+    fn write_os_str_to_wide_str_truncated(
+        &mut self,
+        os_str: &OsStr,
+        ptr: Pointer<Option<Provenance>>,
+        size: u64,
+    ) -> InterpResult<'tcx, (bool, u64)> {
+        self.write_os_str_to_wide_str_helper(os_str, ptr, size, /*truncate*/ true)
+    }
+
     /// Allocate enough memory to store the given `OsStr` as a null-terminated sequence of bytes.
     fn alloc_os_str_as_c_str(
         &mut self,
@@ -160,9 +171,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
 
         let arg_type = Ty::new_array(this.tcx.tcx, this.tcx.types.u16, size);
         let arg_place = this.allocate(this.layout_of(arg_type).unwrap(), memkind)?;
-        let (written, _) = self
-            .write_os_str_to_wide_str(os_str, arg_place.ptr(), size, /*truncate*/ false)
-            .unwrap();
+        let (written, _) = self.write_os_str_to_wide_str(os_str, arg_place.ptr(), size).unwrap();
         assert!(written);
         Ok(arg_place.ptr())
     }
@@ -217,12 +226,25 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
         path: &Path,
         ptr: Pointer<Option<Provenance>>,
         size: u64,
-        truncate: bool,
     ) -> InterpResult<'tcx, (bool, u64)> {
         let this = self.eval_context_mut();
         let os_str =
             this.convert_path(Cow::Borrowed(path.as_os_str()), PathConversion::HostToTarget);
-        this.write_os_str_to_wide_str(&os_str, ptr, size, truncate)
+        this.write_os_str_to_wide_str(&os_str, ptr, size)
+    }
+
+    /// Write a Path to the machine memory (as a null-terminated sequence of `u16`s),
+    /// adjusting path separators if needed.
+    fn write_path_to_wide_str_truncated(
+        &mut self,
+        path: &Path,
+        ptr: Pointer<Option<Provenance>>,
+        size: u64,
+    ) -> InterpResult<'tcx, (bool, u64)> {
+        let this = self.eval_context_mut();
+        let os_str =
+            this.convert_path(Cow::Borrowed(path.as_os_str()), PathConversion::HostToTarget);
+        this.write_os_str_to_wide_str_truncated(&os_str, ptr, size)
     }
 
     /// Allocate enough memory to store a Path as a null-terminated sequence of bytes,
diff --git a/src/tools/miri/src/shims/windows/foreign_items.rs b/src/tools/miri/src/shims/windows/foreign_items.rs
index e0cca43d263..24f7cd18e7a 100644
--- a/src/tools/miri/src/shims/windows/foreign_items.rs
+++ b/src/tools/miri/src/shims/windows/foreign_items.rs
@@ -232,12 +232,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
                     }
                     Ok(abs_filename) => {
                         Scalar::from_u32(helpers::windows_check_buffer_size(
-                            this.write_path_to_wide_str(
-                                &abs_filename,
-                                buffer,
-                                size.into(),
-                                /*truncate*/ false,
-                            )?,
+                            this.write_path_to_wide_str(&abs_filename, buffer, size.into())?,
                         ))
                         // This can in fact return 0. It is up to the caller to set last_error to 0
                         // beforehand and check it afterwards to exclude that case.
@@ -608,15 +603,9 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
 
                 // Using the host current_exe is a bit off, but consistent with Linux
                 // (where stdlib reads /proc/self/exe).
-                // Unfortunately this Windows function has a crazy behavior so we can't just use
-                // `write_path_to_wide_str`...
                 let path = std::env::current_exe().unwrap();
-                let (all_written, size_needed) = this.write_path_to_wide_str(
-                    &path,
-                    filename,
-                    size.into(),
-                    /*truncate*/ true,
-                )?;
+                let (all_written, size_needed) =
+                    this.write_path_to_wide_str_truncated(&path, filename, size.into())?;
 
                 if all_written {
                     // If the function succeeds, the return value is the length of the string that
@@ -656,12 +645,8 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
                     Some(err) => format!("{err}"),
                     None => format!("<unknown error in FormatMessageW: {message_id}>"),
                 };
-                let (complete, length) = this.write_os_str_to_wide_str(
-                    OsStr::new(&formatted),
-                    buffer,
-                    size.into(),
-                    /*truncate*/ false,
-                )?;
+                let (complete, length) =
+                    this.write_os_str_to_wide_str(OsStr::new(&formatted), buffer, size.into())?;
                 if !complete {
                     // The API docs don't say what happens when the buffer is not big enough...
                     // Let's just bail.

From ccb87f1baa008719cb438d5f63d7030fdcd64eb8 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Tue, 23 Apr 2024 10:01:00 +0200
Subject: [PATCH 17/28] avoid some unnecessary Scalar-i32-Scalar roundtrips

---
 src/tools/miri/src/shims/unix/linux/mem.rs |  4 ++--
 src/tools/miri/src/shims/unix/mem.rs       | 16 ++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/tools/miri/src/shims/unix/linux/mem.rs b/src/tools/miri/src/shims/unix/linux/mem.rs
index ec2922d0275..3948216f729 100644
--- a/src/tools/miri/src/shims/unix/linux/mem.rs
+++ b/src/tools/miri/src/shims/unix/linux/mem.rs
@@ -23,7 +23,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
         // old_address must be a multiple of the page size
         #[allow(clippy::arithmetic_side_effects)] // PAGE_SIZE is nonzero
         if old_address.addr().bytes() % this.machine.page_size != 0 || new_size == 0 {
-            this.set_last_error(Scalar::from_i32(this.eval_libc_i32("EINVAL")))?;
+            this.set_last_error(this.eval_libc("EINVAL"))?;
             return Ok(this.eval_libc("MAP_FAILED"));
         }
 
@@ -37,7 +37,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
 
         if flags & this.eval_libc_i32("MREMAP_MAYMOVE") == 0 {
             // We only support MREMAP_MAYMOVE, so not passing the flag is just a failure
-            this.set_last_error(Scalar::from_i32(this.eval_libc_i32("EINVAL")))?;
+            this.set_last_error(this.eval_libc("EINVAL"))?;
             return Ok(this.eval_libc("MAP_FAILED"));
         }
 
diff --git a/src/tools/miri/src/shims/unix/mem.rs b/src/tools/miri/src/shims/unix/mem.rs
index d3470893dbb..f52dc23656d 100644
--- a/src/tools/miri/src/shims/unix/mem.rs
+++ b/src/tools/miri/src/shims/unix/mem.rs
@@ -53,11 +53,11 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
 
         // First, we do some basic argument validation as required by mmap
         if (flags & (map_private | map_shared)).count_ones() != 1 {
-            this.set_last_error(Scalar::from_i32(this.eval_libc_i32("EINVAL")))?;
+            this.set_last_error(this.eval_libc("EINVAL"))?;
             return Ok(this.eval_libc("MAP_FAILED"));
         }
         if length == 0 {
-            this.set_last_error(Scalar::from_i32(this.eval_libc_i32("EINVAL")))?;
+            this.set_last_error(this.eval_libc("EINVAL"))?;
             return Ok(this.eval_libc("MAP_FAILED"));
         }
 
@@ -77,7 +77,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
         //
         // Miri doesn't support MAP_FIXED or any any protections other than PROT_READ|PROT_WRITE.
         if flags & map_fixed != 0 || prot != prot_read | prot_write {
-            this.set_last_error(Scalar::from_i32(this.eval_libc_i32("ENOTSUP")))?;
+            this.set_last_error(this.eval_libc("ENOTSUP"))?;
             return Ok(this.eval_libc("MAP_FAILED"));
         }
 
@@ -96,11 +96,11 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
 
         let align = this.machine.page_align();
         let Some(map_length) = length.checked_next_multiple_of(this.machine.page_size) else {
-            this.set_last_error(Scalar::from_i32(this.eval_libc_i32("EINVAL")))?;
+            this.set_last_error(this.eval_libc("EINVAL"))?;
             return Ok(this.eval_libc("MAP_FAILED"));
         };
         if map_length > this.target_usize_max() {
-            this.set_last_error(Scalar::from_i32(this.eval_libc_i32("EINVAL")))?;
+            this.set_last_error(this.eval_libc("EINVAL"))?;
             return Ok(this.eval_libc("MAP_FAILED"));
         }
 
@@ -131,16 +131,16 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
         // as a dealloc.
         #[allow(clippy::arithmetic_side_effects)] // PAGE_SIZE is nonzero
         if addr.addr().bytes() % this.machine.page_size != 0 {
-            this.set_last_error(Scalar::from_i32(this.eval_libc_i32("EINVAL")))?;
+            this.set_last_error(this.eval_libc("EINVAL"))?;
             return Ok(Scalar::from_i32(-1));
         }
 
         let Some(length) = length.checked_next_multiple_of(this.machine.page_size) else {
-            this.set_last_error(Scalar::from_i32(this.eval_libc_i32("EINVAL")))?;
+            this.set_last_error(this.eval_libc("EINVAL"))?;
             return Ok(Scalar::from_i32(-1));
         };
         if length > this.target_usize_max() {
-            this.set_last_error(Scalar::from_i32(this.eval_libc_i32("EINVAL")))?;
+            this.set_last_error(this.eval_libc("EINVAL"))?;
             return Ok(this.eval_libc("MAP_FAILED"));
         }
 

From 193b37dbafb85dc9cf792d2eb1e6b17e47660d9d Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Wed, 24 Apr 2024 09:10:16 +0200
Subject: [PATCH 18/28] avoid 'let _' in tests where we actually want the value
 to be computed

---
 .../tests/fail/both_borrows/aliasing_mut4.rs  |  2 +-
 .../storage_dead_dangling.rs                  |  2 +-
 .../storage_dead_dangling.stderr              |  4 ++--
 src/tools/miri/tests/pass/adjacent-allocs.rs  |  2 +-
 src/tools/miri/tests/pass/dyn-upcast.rs       | 20 +++++++++----------
 5 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/tools/miri/tests/fail/both_borrows/aliasing_mut4.rs b/src/tools/miri/tests/fail/both_borrows/aliasing_mut4.rs
index e188a1f0c34..c656a509644 100644
--- a/src/tools/miri/tests/fail/both_borrows/aliasing_mut4.rs
+++ b/src/tools/miri/tests/fail/both_borrows/aliasing_mut4.rs
@@ -8,7 +8,7 @@ use std::mem;
 pub fn safe(x: &i32, y: &mut Cell<i32>) {
     //~[stack]^ ERROR: protect
     y.set(1);
-    let _ = *x;
+    let _load = *x;
 }
 
 fn main() {
diff --git a/src/tools/miri/tests/fail/dangling_pointers/storage_dead_dangling.rs b/src/tools/miri/tests/fail/dangling_pointers/storage_dead_dangling.rs
index f9983f48c61..f4349286801 100644
--- a/src/tools/miri/tests/fail/dangling_pointers/storage_dead_dangling.rs
+++ b/src/tools/miri/tests/fail/dangling_pointers/storage_dead_dangling.rs
@@ -10,7 +10,7 @@ fn fill(v: &mut i32) {
 }
 
 fn evil() {
-    let _ = unsafe { &mut *(LEAK as *mut i32) }; //~ ERROR: is a dangling pointer
+    let _ref = unsafe { &mut *(LEAK as *mut i32) }; //~ ERROR: is a dangling pointer
 }
 
 fn main() {
diff --git a/src/tools/miri/tests/fail/dangling_pointers/storage_dead_dangling.stderr b/src/tools/miri/tests/fail/dangling_pointers/storage_dead_dangling.stderr
index 27e5a865069..73c3ff1ee05 100644
--- a/src/tools/miri/tests/fail/dangling_pointers/storage_dead_dangling.stderr
+++ b/src/tools/miri/tests/fail/dangling_pointers/storage_dead_dangling.stderr
@@ -1,8 +1,8 @@
 error: Undefined Behavior: out-of-bounds pointer use: $HEX[noalloc] is a dangling pointer (it has no provenance)
   --> $DIR/storage_dead_dangling.rs:LL:CC
    |
-LL |     let _ = unsafe { &mut *(LEAK as *mut i32) };
-   |                      ^^^^^^^^^^^^^^^^^^^^^^^^ out-of-bounds pointer use: $HEX[noalloc] is a dangling pointer (it has no provenance)
+LL |     let _ref = unsafe { &mut *(LEAK as *mut i32) };
+   |                         ^^^^^^^^^^^^^^^^^^^^^^^^ out-of-bounds pointer use: $HEX[noalloc] is a dangling pointer (it has no provenance)
    |
    = help: this indicates a bug in the program: it performed an invalid operation, and caused Undefined Behavior
    = help: see https://doc.rust-lang.org/nightly/reference/behavior-considered-undefined.html for further information
diff --git a/src/tools/miri/tests/pass/adjacent-allocs.rs b/src/tools/miri/tests/pass/adjacent-allocs.rs
index cbf41d68b57..8be4bdac7e1 100644
--- a/src/tools/miri/tests/pass/adjacent-allocs.rs
+++ b/src/tools/miri/tests/pass/adjacent-allocs.rs
@@ -30,7 +30,7 @@ fn test1() {
         // See https://github.com/rust-lang/miri/issues/1866#issuecomment-985770125
         {
             let m = 0u64;
-            let _ = &m as *const u64;
+            let _ptr = &m as *const u64;
         }
 
         let iptr = ptr as usize;
diff --git a/src/tools/miri/tests/pass/dyn-upcast.rs b/src/tools/miri/tests/pass/dyn-upcast.rs
index ddc4bdcf082..ff995f38196 100644
--- a/src/tools/miri/tests/pass/dyn-upcast.rs
+++ b/src/tools/miri/tests/pass/dyn-upcast.rs
@@ -69,7 +69,7 @@ fn basic() {
     }
 
     let baz: &dyn Baz = &1;
-    let _: &dyn fmt::Debug = baz;
+    let _up: &dyn fmt::Debug = baz;
     assert_eq!(*baz, 1);
     assert_eq!(baz.a(), 100);
     assert_eq!(baz.b(), 200);
@@ -79,7 +79,7 @@ fn basic() {
     assert_eq!(baz.w(), 21);
 
     let bar: &dyn Bar = baz;
-    let _: &dyn fmt::Debug = bar;
+    let _up: &dyn fmt::Debug = bar;
     assert_eq!(*bar, 1);
     assert_eq!(bar.a(), 100);
     assert_eq!(bar.b(), 200);
@@ -88,14 +88,14 @@ fn basic() {
     assert_eq!(bar.w(), 21);
 
     let foo: &dyn Foo = baz;
-    let _: &dyn fmt::Debug = foo;
+    let _up: &dyn fmt::Debug = foo;
     assert_eq!(*foo, 1);
     assert_eq!(foo.a(), 100);
     assert_eq!(foo.z(), 11);
     assert_eq!(foo.y(), 12);
 
     let foo: &dyn Foo = bar;
-    let _: &dyn fmt::Debug = foo;
+    let _up: &dyn fmt::Debug = foo;
     assert_eq!(*foo, 1);
     assert_eq!(foo.a(), 100);
     assert_eq!(foo.z(), 11);
@@ -168,7 +168,7 @@ fn diamond() {
     }
 
     let baz: &dyn Baz = &1;
-    let _: &dyn fmt::Debug = baz;
+    let _up: &dyn fmt::Debug = baz;
     assert_eq!(*baz, 1);
     assert_eq!(baz.a(), 100);
     assert_eq!(baz.b(), 200);
@@ -180,7 +180,7 @@ fn diamond() {
     assert_eq!(baz.v(), 31);
 
     let bar1: &dyn Bar1 = baz;
-    let _: &dyn fmt::Debug = bar1;
+    let _up: &dyn fmt::Debug = bar1;
     assert_eq!(*bar1, 1);
     assert_eq!(bar1.a(), 100);
     assert_eq!(bar1.b(), 200);
@@ -189,7 +189,7 @@ fn diamond() {
     assert_eq!(bar1.w(), 21);
 
     let bar2: &dyn Bar2 = baz;
-    let _: &dyn fmt::Debug = bar2;
+    let _up: &dyn fmt::Debug = bar2;
     assert_eq!(*bar2, 1);
     assert_eq!(bar2.a(), 100);
     assert_eq!(bar2.c(), 300);
@@ -198,17 +198,17 @@ fn diamond() {
     assert_eq!(bar2.v(), 31);
 
     let foo: &dyn Foo = baz;
-    let _: &dyn fmt::Debug = foo;
+    let _up: &dyn fmt::Debug = foo;
     assert_eq!(*foo, 1);
     assert_eq!(foo.a(), 100);
 
     let foo: &dyn Foo = bar1;
-    let _: &dyn fmt::Debug = foo;
+    let _up: &dyn fmt::Debug = foo;
     assert_eq!(*foo, 1);
     assert_eq!(foo.a(), 100);
 
     let foo: &dyn Foo = bar2;
-    let _: &dyn fmt::Debug = foo;
+    let _up: &dyn fmt::Debug = foo;
     assert_eq!(*foo, 1);
     assert_eq!(foo.a(), 100);
 }

From aa19679ff08ea2a3d165aa66d4ddd0bdae8c3392 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Wed, 24 Apr 2024 14:58:12 +0200
Subject: [PATCH 19/28] unix_sigpipe: don't inline DEFAULT, just use it from
 rustc

---
 src/tools/miri/src/eval.rs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/tools/miri/src/eval.rs b/src/tools/miri/src/eval.rs
index d74cd5ff3e2..2242768a568 100644
--- a/src/tools/miri/src/eval.rs
+++ b/src/tools/miri/src/eval.rs
@@ -386,10 +386,9 @@ pub fn create_ecx<'mir, 'tcx: 'mir>(
 
             let main_ptr = ecx.fn_ptr(FnVal::Instance(entry_instance));
 
-            // Inlining of `DEFAULT` from
-            // https://github.com/rust-lang/rust/blob/master/compiler/rustc_session/src/config/sigpipe.rs.
             // Always using DEFAULT is okay since we don't support signals in Miri anyway.
-            let sigpipe = 2;
+            // (This means we are effectively ignoring `#[unix_sigpipe]`.)
+            let sigpipe = rustc_session::config::sigpipe::DEFAULT;
 
             ecx.call_function(
                 start_instance,

From 8b0ab42855453b54e53266f682c0e5b9c55f3f17 Mon Sep 17 00:00:00 2001
From: Predrag Gruevski <2348618+obi1kenobi@users.noreply.github.com>
Date: Wed, 24 Apr 2024 10:57:18 -0400
Subject: [PATCH 20/28] Upgrade to `actions/checkout@v4` in `ci.yml`.

This is a newer version of the same action. None of the uses here were particularly special (no complex features of v3 were used) so this is a straightforward as-is upgrade.
---
 src/tools/miri/.github/workflows/ci.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/tools/miri/.github/workflows/ci.yml b/src/tools/miri/.github/workflows/ci.yml
index b0dab9f509d..73afd2a12a9 100644
--- a/src/tools/miri/.github/workflows/ci.yml
+++ b/src/tools/miri/.github/workflows/ci.yml
@@ -32,7 +32,7 @@ jobs:
     env:
       HOST_TARGET: ${{ matrix.host_target }}
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Show Rust version (stable toolchain)
         run: |
@@ -85,7 +85,7 @@ jobs:
     name: style checks
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       # This is exactly duplicated from above. GHA is pretty terrible when it comes
       # to avoiding code duplication.
@@ -191,7 +191,7 @@ jobs:
           The Miri Cronjobs Bot'
 
       # Attempt to auto-sync with rustc
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 256 # get a bit more of the history
       - name: install josh-proxy

From 8c24fe108530bcba2d61ce1ca0bbc672edb23ca2 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Wed, 24 Apr 2024 18:05:03 +0200
Subject: [PATCH 21/28] make miri-script a workspace root

---
 src/tools/miri/miri-script/Cargo.toml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/tools/miri/miri-script/Cargo.toml b/src/tools/miri/miri-script/Cargo.toml
index aaa788d5846..79d0b13600d 100644
--- a/src/tools/miri/miri-script/Cargo.toml
+++ b/src/tools/miri/miri-script/Cargo.toml
@@ -8,7 +8,9 @@ version = "0.1.0"
 default-run = "miri-script"
 edition = "2021"
 
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+[workspace]
+# We make this a workspace root so that cargo does not go looking in ../Cargo.toml for the workspace root.
+# This is needed to make this package build on stable when the parent package uses unstable cargo features.
 
 [dependencies]
 which = "4.4"

From bed7caf20638dde1822553d432f6fe213baf4eae Mon Sep 17 00:00:00 2001
From: The Miri Cronjob Bot <miri@cron.bot>
Date: Thu, 25 Apr 2024 04:57:07 +0000
Subject: [PATCH 22/28] Preparing for merge from rustc

---
 src/tools/miri/rust-version | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tools/miri/rust-version b/src/tools/miri/rust-version
index 3191355ccbe..a6433a8e286 100644
--- a/src/tools/miri/rust-version
+++ b/src/tools/miri/rust-version
@@ -1 +1 @@
-c1feb3eceef7d5f0126c309a87062cf413fe0a25
+cb3752d20e0f5d24348062211102a08d46fbecff

From 258e5043a156a87497f98d5220624545658aad0b Mon Sep 17 00:00:00 2001
From: The Miri Cronjob Bot <miri@cron.bot>
Date: Thu, 25 Apr 2024 05:05:41 +0000
Subject: [PATCH 23/28] fmt

---
 .../miri/tests/fail/coroutine-pinned-moved.rs |   3 +-
 src/tools/miri/tests/pass/coroutine.rs        | 225 ++++++++++++------
 .../coroutine-self-referential.rs             |   3 +-
 3 files changed, 150 insertions(+), 81 deletions(-)

diff --git a/src/tools/miri/tests/fail/coroutine-pinned-moved.rs b/src/tools/miri/tests/fail/coroutine-pinned-moved.rs
index 8648be2a264..46ec58810a6 100644
--- a/src/tools/miri/tests/fail/coroutine-pinned-moved.rs
+++ b/src/tools/miri/tests/fail/coroutine-pinned-moved.rs
@@ -7,7 +7,8 @@ use std::{
 };
 
 fn firstn() -> impl Coroutine<Yield = u64, Return = ()> {
-    #[coroutine] static move || {
+    #[coroutine]
+    static move || {
         let mut num = 0;
         let num = &mut num;
         *num += 0;
diff --git a/src/tools/miri/tests/pass/coroutine.rs b/src/tools/miri/tests/pass/coroutine.rs
index e76abfc4185..7822c136d91 100644
--- a/src/tools/miri/tests/pass/coroutine.rs
+++ b/src/tools/miri/tests/pass/coroutine.rs
@@ -43,94 +43,144 @@ fn basic() {
         panic!()
     }
 
-    finish(1, false, #[coroutine] || yield 1);
+    finish(
+        1,
+        false,
+        #[coroutine]
+        || yield 1,
+    );
 
-    finish(3, false, #[coroutine] || {
-        let mut x = 0;
-        yield 1;
-        x += 1;
-        yield 1;
-        x += 1;
-        yield 1;
-        assert_eq!(x, 2);
-    });
-
-    finish(7 * 8 / 2, false, #[coroutine] || {
-        for i in 0..8 {
-            yield i;
-        }
-    });
-
-    finish(1, false, #[coroutine] || {
-        if true {
+    finish(
+        3,
+        false,
+        #[coroutine]
+        || {
+            let mut x = 0;
             yield 1;
-        } else {
-        }
-    });
+            x += 1;
+            yield 1;
+            x += 1;
+            yield 1;
+            assert_eq!(x, 2);
+        },
+    );
 
-    finish(1, false, #[coroutine] || {
-        if false {
-        } else {
-            yield 1;
-        }
-    });
+    finish(
+        7 * 8 / 2,
+        false,
+        #[coroutine]
+        || {
+            for i in 0..8 {
+                yield i;
+            }
+        },
+    );
 
-    finish(2, false, #[coroutine] || {
-        if {
+    finish(
+        1,
+        false,
+        #[coroutine]
+        || {
+            if true {
+                yield 1;
+            } else {
+            }
+        },
+    );
+
+    finish(
+        1,
+        false,
+        #[coroutine]
+        || {
+            if false {
+            } else {
+                yield 1;
+            }
+        },
+    );
+
+    finish(
+        2,
+        false,
+        #[coroutine]
+        || {
+            if {
+                yield 1;
+                false
+            } {
+                yield 1;
+                panic!()
+            }
             yield 1;
-            false
-        } {
-            yield 1;
-            panic!()
-        }
-        yield 1;
-    });
+        },
+    );
 
     // also test self-referential coroutines
     assert_eq!(
-        finish(5, true, #[coroutine] static || {
-            let mut x = 5;
-            let y = &mut x;
-            *y = 5;
-            yield *y;
-            *y = 10;
-            x
-        }),
+        finish(
+            5,
+            true,
+            #[coroutine]
+            static || {
+                let mut x = 5;
+                let y = &mut x;
+                *y = 5;
+                yield *y;
+                *y = 10;
+                x
+            }
+        ),
         10
     );
     assert_eq!(
-        finish(5, true, #[coroutine] || {
-            let mut x = Box::new(5);
-            let y = &mut *x;
-            *y = 5;
-            yield *y;
-            *y = 10;
-            *x
-        }),
+        finish(
+            5,
+            true,
+            #[coroutine]
+            || {
+                let mut x = Box::new(5);
+                let y = &mut *x;
+                *y = 5;
+                yield *y;
+                *y = 10;
+                *x
+            }
+        ),
         10
     );
 
     let b = true;
-    finish(1, false, #[coroutine] || {
-        yield 1;
-        if b {
-            return;
-        }
-        #[allow(unused)]
-        let x = never();
-        #[allow(unreachable_code)]
-        yield 2;
-        drop(x);
-    });
-
-    finish(3, false, #[coroutine] || {
-        yield 1;
-        #[allow(unreachable_code)]
-        let _x: (String, !) = (String::new(), {
+    finish(
+        1,
+        false,
+        #[coroutine]
+        || {
+            yield 1;
+            if b {
+                return;
+            }
+            #[allow(unused)]
+            let x = never();
+            #[allow(unreachable_code)]
             yield 2;
-            return;
-        });
-    });
+            drop(x);
+        },
+    );
+
+    finish(
+        3,
+        false,
+        #[coroutine]
+        || {
+            yield 1;
+            #[allow(unreachable_code)]
+            let _x: (String, !) = (String::new(), {
+                yield 2;
+                return;
+            });
+        },
+    );
 }
 
 fn smoke_resume_arg() {
@@ -172,7 +222,8 @@ fn smoke_resume_arg() {
     }
 
     drain(
-        &mut #[coroutine] |mut b| {
+        &mut #[coroutine]
+        |mut b| {
             while b != 0 {
                 b = yield (b + 1);
             }
@@ -181,21 +232,35 @@ fn smoke_resume_arg() {
         vec![(1, Yielded(2)), (-45, Yielded(-44)), (500, Yielded(501)), (0, Complete(-1))],
     );
 
-    expect_drops(2, || drain(&mut #[coroutine] |a| yield a, vec![(DropMe, Yielded(DropMe))]));
+    expect_drops(2, || {
+        drain(
+            &mut #[coroutine]
+            |a| yield a,
+            vec![(DropMe, Yielded(DropMe))],
+        )
+    });
 
     expect_drops(6, || {
         drain(
-            &mut #[coroutine] |a| yield yield a,
+            &mut #[coroutine]
+            |a| yield yield a,
             vec![(DropMe, Yielded(DropMe)), (DropMe, Yielded(DropMe)), (DropMe, Complete(DropMe))],
         )
     });
 
     #[allow(unreachable_code)]
-    expect_drops(2, || drain(&mut #[coroutine] |a| yield return a, vec![(DropMe, Complete(DropMe))]));
+    expect_drops(2, || {
+        drain(
+            &mut #[coroutine]
+            |a| yield return a,
+            vec![(DropMe, Complete(DropMe))],
+        )
+    });
 
     expect_drops(2, || {
         drain(
-            &mut #[coroutine] |a: DropMe| {
+            &mut #[coroutine]
+            |a: DropMe| {
                 if false { yield () } else { a }
             },
             vec![(DropMe, Complete(DropMe))],
@@ -205,7 +270,8 @@ fn smoke_resume_arg() {
     expect_drops(4, || {
         drain(
             #[allow(unused_assignments, unused_variables)]
-            &mut #[coroutine] |mut a: DropMe| {
+            &mut #[coroutine]
+            |mut a: DropMe| {
                 a = yield;
                 a = yield;
                 a = yield;
@@ -228,7 +294,8 @@ fn uninit_fields() {
     }
 
     fn run<T>(x: bool, y: bool) {
-        let mut c = #[coroutine] || {
+        let mut c = #[coroutine]
+        || {
             if x {
                 let _a: T;
                 if y {
diff --git a/src/tools/miri/tests/pass/stacked-borrows/coroutine-self-referential.rs b/src/tools/miri/tests/pass/stacked-borrows/coroutine-self-referential.rs
index bb98e024a0a..259fc72d392 100644
--- a/src/tools/miri/tests/pass/stacked-borrows/coroutine-self-referential.rs
+++ b/src/tools/miri/tests/pass/stacked-borrows/coroutine-self-referential.rs
@@ -8,7 +8,8 @@ use std::{
 };
 
 fn firstn() -> impl Coroutine<Yield = u64, Return = ()> {
-    #[coroutine] static move || {
+    #[coroutine]
+    static move || {
         let mut num = 0;
         let num = &mut num;
 

From f38dba69b18a8ce593272ea3c2369c4274d8837b Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Tue, 23 Apr 2024 08:42:52 +0200
Subject: [PATCH 24/28] weak memory outdated loads: show where the load was
 from

---
 src/tools/miri/src/concurrency/weak_memory.rs |  4 +++-
 src/tools/miri/src/diagnostics.rs             | 11 +++++++----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/tools/miri/src/concurrency/weak_memory.rs b/src/tools/miri/src/concurrency/weak_memory.rs
index f544393cfe6..574962c48d4 100644
--- a/src/tools/miri/src/concurrency/weak_memory.rs
+++ b/src/tools/miri/src/concurrency/weak_memory.rs
@@ -520,7 +520,9 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
                     validate,
                 )?;
                 if global.track_outdated_loads && recency == LoadRecency::Outdated {
-                    this.emit_diagnostic(NonHaltingDiagnostic::WeakMemoryOutdatedLoad);
+                    this.emit_diagnostic(NonHaltingDiagnostic::WeakMemoryOutdatedLoad {
+                        ptr: place.ptr(),
+                    });
                 }
 
                 return Ok(loaded);
diff --git a/src/tools/miri/src/diagnostics.rs b/src/tools/miri/src/diagnostics.rs
index 0c0ac4c6036..9fa786332e3 100644
--- a/src/tools/miri/src/diagnostics.rs
+++ b/src/tools/miri/src/diagnostics.rs
@@ -125,7 +125,9 @@ pub enum NonHaltingDiagnostic {
     Int2Ptr {
         details: bool,
     },
-    WeakMemoryOutdatedLoad,
+    WeakMemoryOutdatedLoad {
+        ptr: Pointer<Option<Provenance>>,
+    },
 }
 
 /// Level of Miri specific diagnostics
@@ -583,7 +585,8 @@ impl<'mir, 'tcx> MiriMachine<'mir, 'tcx> {
             | AccessedAlloc(..)
             | FreedAlloc(..)
             | ProgressReport { .. }
-            | WeakMemoryOutdatedLoad => ("tracking was triggered".to_string(), DiagLevel::Note),
+            | WeakMemoryOutdatedLoad { .. } =>
+                ("tracking was triggered".to_string(), DiagLevel::Note),
         };
 
         let msg = match &e {
@@ -610,8 +613,8 @@ impl<'mir, 'tcx> MiriMachine<'mir, 'tcx> {
             ProgressReport { .. } =>
                 format!("progress report: current operation being executed is here"),
             Int2Ptr { .. } => format!("integer-to-pointer cast"),
-            WeakMemoryOutdatedLoad =>
-                format!("weak memory emulation: outdated value returned from load"),
+            WeakMemoryOutdatedLoad { ptr } =>
+                format!("weak memory emulation: outdated value returned from load at {ptr}"),
         };
 
         let notes = match &e {

From ea9cff254f2e363a30fcd6f887347f05a76a0f70 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Tue, 23 Apr 2024 08:43:06 +0200
Subject: [PATCH 25/28] add a test for the TLS memory leak

---
 src/tools/miri/tests/many-seeds/tls-leak.rs | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 src/tools/miri/tests/many-seeds/tls-leak.rs

diff --git a/src/tools/miri/tests/many-seeds/tls-leak.rs b/src/tools/miri/tests/many-seeds/tls-leak.rs
new file mode 100644
index 00000000000..70a506958d1
--- /dev/null
+++ b/src/tools/miri/tests/many-seeds/tls-leak.rs
@@ -0,0 +1,13 @@
+//! Regression test for <https://github.com/rust-lang/rust/issues/123583>.
+use std::thread;
+
+pub(crate) fn with_thread_local() {
+    thread_local! { static X: Box<u8> = Box::new(0); }
+    X.with(|_x| {})
+}
+
+fn main() {
+    let j2 = thread::spawn(with_thread_local);
+    with_thread_local();
+    j2.join().unwrap();
+}

From 247e82cb8391765cef976d0a23d7bdc3e509a978 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Tue, 23 Apr 2024 08:44:49 +0200
Subject: [PATCH 26/28] run many-seeds tests at least a few times on all tier 1
 targets

---
 src/tools/miri/ci/ci.sh                     | 18 +++++++++-------
 src/tools/miri/tests/many-seeds/tls-leak.rs | 23 ++++++++++++++++-----
 2 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/src/tools/miri/ci/ci.sh b/src/tools/miri/ci/ci.sh
index f8ba612750e..ad0c795315e 100755
--- a/src/tools/miri/ci/ci.sh
+++ b/src/tools/miri/ci/ci.sh
@@ -128,16 +128,18 @@ function run_tests_minimal {
 ## Main Testing Logic ##
 
 # In particular, fully cover all tier 1 targets.
+# We also want to run the many-seeds tests on all tier 1 targets.
 case $HOST_TARGET in
   x86_64-unknown-linux-gnu)
     # Host
     GC_STRESS=1 MIR_OPT=1 MANY_SEEDS=64 TEST_BENCH=1 CARGO_MIRI_ENV=1 run_tests
     # Extra tier 1
-    MIRI_TEST_TARGET=i686-unknown-linux-gnu run_tests
-    MIRI_TEST_TARGET=aarch64-unknown-linux-gnu run_tests
-    MIRI_TEST_TARGET=x86_64-apple-darwin run_tests
-    MIRI_TEST_TARGET=i686-pc-windows-gnu run_tests
-    MIRI_TEST_TARGET=x86_64-pc-windows-gnu run_tests
+    # With reduced many-seed count to avoid spending too much time on that.
+    # (All OSes are run with 64 seeds at least once though via the macOS runner.)
+    MANY_SEEDS=16 MIRI_TEST_TARGET=i686-unknown-linux-gnu run_tests
+    MANY_SEEDS=16 MIRI_TEST_TARGET=aarch64-unknown-linux-gnu run_tests
+    MANY_SEEDS=16 MIRI_TEST_TARGET=x86_64-apple-darwin run_tests
+    MANY_SEEDS=16 MIRI_TEST_TARGET=x86_64-pc-windows-gnu run_tests
     # Extra tier 2
     MIRI_TEST_TARGET=aarch64-apple-darwin run_tests
     MIRI_TEST_TARGET=arm-unknown-linux-gnueabi run_tests
@@ -155,13 +157,15 @@ case $HOST_TARGET in
     # Host (tier 2)
     GC_STRESS=1 MIR_OPT=1 MANY_SEEDS=64 TEST_BENCH=1 CARGO_MIRI_ENV=1 run_tests
     # Extra tier 1
-    MIRI_TEST_TARGET=x86_64-pc-windows-msvc CARGO_MIRI_ENV=1 run_tests
+    MANY_SEEDS=64 MIRI_TEST_TARGET=i686-pc-windows-gnu run_tests
+    MANY_SEEDS=64 MIRI_TEST_TARGET=x86_64-pc-windows-msvc CARGO_MIRI_ENV=1 run_tests
     # Extra tier 2
     MIRI_TEST_TARGET=s390x-unknown-linux-gnu run_tests # big-endian architecture
     ;;
   i686-pc-windows-msvc)
     # Host
-    # Only smoke-test `many-seeds`; 64 runs take 15min here!
+    # Only smoke-test `many-seeds`; 64 runs of just the scoped-thread-leak test take 15min here!
+    # See <https://github.com/rust-lang/miri/issues/3509>.
     GC_STRESS=1 MIR_OPT=1 MANY_SEEDS=1 TEST_BENCH=1 run_tests
     # Extra tier 1
     # We really want to ensure a Linux target works on a Windows host,
diff --git a/src/tools/miri/tests/many-seeds/tls-leak.rs b/src/tools/miri/tests/many-seeds/tls-leak.rs
index 70a506958d1..3b243633439 100644
--- a/src/tools/miri/tests/many-seeds/tls-leak.rs
+++ b/src/tools/miri/tests/many-seeds/tls-leak.rs
@@ -1,13 +1,26 @@
 //! Regression test for <https://github.com/rust-lang/rust/issues/123583>.
 use std::thread;
 
-pub(crate) fn with_thread_local() {
+fn with_thread_local1() {
     thread_local! { static X: Box<u8> = Box::new(0); }
     X.with(|_x| {})
 }
 
-fn main() {
-    let j2 = thread::spawn(with_thread_local);
-    with_thread_local();
-    j2.join().unwrap();
+fn with_thread_local2() {
+    thread_local! { static Y: Box<u8> = Box::new(0); }
+    Y.with(|_y| {})
+}
+
+fn main() {
+    // Here we have two threads racing on initializing the thread-local and adding it to the global
+    // dtor list (on targets that have such a list, i.e., targets without target_thread_local).
+    let t = thread::spawn(with_thread_local1);
+    with_thread_local1();
+    t.join().unwrap();
+
+    // Here we have one thread running the destructors racing with another thread initializing a
+    // thread-local. The second thread adds a destructor that could be picked up by the first.
+    let t = thread::spawn(|| { /* immediately just run destructors */ });
+    with_thread_local2(); // initialize thread-local
+    t.join().unwrap();
 }

From 505e4dd7ba9f32305ab42b3cd46d122bd7fb4b6f Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Thu, 25 Apr 2024 08:13:34 +0200
Subject: [PATCH 27/28] CI: run benches with hyperfine rather than bash

---
 src/tools/miri/.github/workflows/ci.yml | 8 ++++----
 src/tools/miri/ci/ci.sh                 | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/tools/miri/.github/workflows/ci.yml b/src/tools/miri/.github/workflows/ci.yml
index 73afd2a12a9..69442295b4f 100644
--- a/src/tools/miri/.github/workflows/ci.yml
+++ b/src/tools/miri/.github/workflows/ci.yml
@@ -57,12 +57,12 @@ jobs:
             ~/.cargo/bin
             ~/.cargo/.crates.toml
             ~/.cargo/.crates2.json
-          key: cargo-${{ runner.os }}-reset20240331-${{ hashFiles('**/Cargo.lock') }}
-          restore-keys: cargo-${{ runner.os }}-reset20240331
+          key: cargo-${{ runner.os }}-reset20240425-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: cargo-${{ runner.os }}-reset20240425
 
-      - name: Install rustup-toolchain-install-master
+      - name: Install tools
         if: ${{ steps.cache.outputs.cache-hit != 'true' }}
-        run: cargo install -f rustup-toolchain-install-master
+        run: cargo install -f rustup-toolchain-install-master hyperfine
 
       - name: Install miri toolchain
         run: |
diff --git a/src/tools/miri/ci/ci.sh b/src/tools/miri/ci/ci.sh
index f8ba612750e..eb32f325a15 100755
--- a/src/tools/miri/ci/ci.sh
+++ b/src/tools/miri/ci/ci.sh
@@ -78,8 +78,8 @@ function run_tests {
     done
   fi
   if [ -n "${TEST_BENCH-}" ]; then
-    # Check that the benchmarks build and run, but without actually benchmarking.
-    time HYPERFINE="'$BASH' -c" ./miri bench
+    # Check that the benchmarks build and run, but only once.
+    time HYPERFINE="hyperfine -w0 -r1" ./miri bench
   fi
 
   ## test-cargo-miri

From 9f7194b5eb05022822be25f2011c1cc73ba972b5 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Thu, 25 Apr 2024 12:28:45 +0200
Subject: [PATCH 28/28] update lockfile

---
 Cargo.lock | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 552b446a1e7..458a1c1a3ec 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -491,9 +491,9 @@ checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e"
 
 [[package]]
 name = "chrono"
-version = "0.4.37"
+version = "0.4.38"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a0d04d43504c61aa6c7531f1871dd0d418d91130162063b789da00fd7057a5e"
+checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
 dependencies = [
  "android-tzdata",
  "iana-time-zone",
@@ -2493,8 +2493,10 @@ name = "miri"
 version = "0.1.0"
 dependencies = [
  "aes",
+ "chrono",
  "colored",
  "ctrlc",
+ "directories",
  "getrandom",
  "jemalloc-sys",
  "lazy_static",