From b48e7e5496202a3a93b24060ec782b0eec08b67b Mon Sep 17 00:00:00 2001
From: Chris Denton <chris@chrisdenton.dev>
Date: Sat, 6 Apr 2024 15:30:17 +0000
Subject: [PATCH 1/3] Add const UTF-8 to UTF-16 conversion macros

`wide_str!` creates a null terminated UTF-16 string whereas `utf16!` just creates a UTF-16 string without adding a null.
---
 library/std/src/sys/pal/windows/api.rs       | 94 ++++++++++++++++++++
 library/std/src/sys/pal/windows/api/tests.rs | 16 ++++
 library/std/src/sys/pal/windows/mod.rs       |  5 +-
 3 files changed, 113 insertions(+), 2 deletions(-)
 create mode 100644 library/std/src/sys/pal/windows/api/tests.rs

diff --git a/library/std/src/sys/pal/windows/api.rs b/library/std/src/sys/pal/windows/api.rs
index 90e1bff52a3..8613dba42d2 100644
--- a/library/std/src/sys/pal/windows/api.rs
+++ b/library/std/src/sys/pal/windows/api.rs
@@ -34,6 +34,100 @@ use core::ptr::addr_of;
 
 use super::c;
 
+/// Creates a null-terminated UTF-16 string from a string literal; interior nulls fail to compile.
+pub macro wide_str($str:literal) {{
+    const _: () = {
+        if core::slice::memchr::memchr(0, $str.as_bytes()).is_some() {
+            panic!("null terminated strings cannot contain interior nulls");
+        }
+    };
+    crate::sys::pal::windows::api::utf16!(concat!($str, '\0'))
+}}
+
+/// Creates a UTF-16 `&[u16; N]` from a constant `&str` expression, without null termination.
+pub macro utf16($str:expr) {{
+    const UTF8: &str = $str;
+    const UTF16_LEN: usize = crate::sys::pal::windows::api::utf16_len(UTF8);
+    const UTF16: [u16; UTF16_LEN] = crate::sys::pal::windows::api::to_utf16(UTF8);
+    &UTF16
+}}
+
+#[cfg(test)]
+mod tests;
+
+/// Gets the length in UTF-16 code units of a UTF-8 string, for use in the `utf16!` macro.
+pub const fn utf16_len(s: &str) -> usize {
+    let s = s.as_bytes();
+    let mut i = 0;
+    let mut len = 0;
+    while i < s.len() {
+        // The number of leading ones in the first byte of a UTF-8 encoded code point gives
+        // its length in bytes, except for ASCII which has none and is one byte long.
+        let utf8_len = match s[i].leading_ones() {
+            0 => 1,
+            n => n as usize,
+        };
+        i += utf8_len;
+        len += if utf8_len < 4 { 1 } else { 2 };
+    }
+    len
+}
+
+/// Const convert UTF-8 to UTF-16, for use in the `utf16!` macro (and `wide_str!` through it).
+///
+/// Pass `utf16_len(s)` as `UTF16_LEN`. This is designed for const contexts so is not optimized.
+pub const fn to_utf16<const UTF16_LEN: usize>(s: &str) -> [u16; UTF16_LEN] {
+    let mut output = [0_u16; UTF16_LEN];
+    let mut pos = 0;
+    let s = s.as_bytes();
+    let mut i = 0;
+    while i < s.len() {
+        match s[i].leading_ones() {
+            // The leading ones of the first byte give the sequence length; decode accordingly.
+            // See https://en.wikipedia.org/wiki/UTF-8
+            0 => {
+                // ASCII is the same in both encodings
+                output[pos] = s[i] as u16;
+                i += 1;
+                pos += 1;
+            }
+            2 => {
+                // Bits: 110xxxxx 10xxxxxx
+                output[pos] = ((s[i] as u16 & 0b11111) << 6) | (s[i + 1] as u16 & 0b111111);
+                i += 2;
+                pos += 1;
+            }
+            3 => {
+                // Bits: 1110xxxx 10xxxxxx 10xxxxxx
+                output[pos] = ((s[i] as u16 & 0b1111) << 12)
+                    | ((s[i + 1] as u16 & 0b111111) << 6)
+                    | (s[i + 2] as u16 & 0b111111);
+                i += 3;
+                pos += 1;
+            }
+            4 => {
+                // Bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+                let mut c = ((s[i] as u32 & 0b111) << 18)
+                    | ((s[i + 1] as u32 & 0b111111) << 12)
+                    | ((s[i + 2] as u32 & 0b111111) << 6)
+                    | (s[i + 3] as u32 & 0b111111);
+                // re-encode as UTF-16 (see https://en.wikipedia.org/wiki/UTF-16)
+                // - Subtract 0x10000 from the code point
+                // - For the high surrogate, shift right by 10 then add 0xD800
+                // - For the low surrogate, take the low 10 bits then add 0xDC00
+                c -= 0x10000;
+                output[pos] = ((c >> 10) + 0xD800) as u16;
+                output[pos + 1] = ((c & 0b1111111111) + 0xDC00) as u16;
+                i += 4;
+                pos += 2;
+            }
+            // valid UTF-8 never starts a code point with 1 or more than 4 leading ones
+            _ => unreachable!(),
+        }
+    }
+    output
+}
+
 /// Helper method for getting the size of `T` as a u32.
 /// Errors at compile time if the size would overflow.
 ///
diff --git a/library/std/src/sys/pal/windows/api/tests.rs b/library/std/src/sys/pal/windows/api/tests.rs
new file mode 100644
index 00000000000..fab022c7b93
--- /dev/null
+++ b/library/std/src/sys/pal/windows/api/tests.rs
@@ -0,0 +1,16 @@
+use crate::sys::pal::windows::api::{utf16, wide_str};
+
+macro_rules! check_utf16 {
+    ($str:literal) => {{ // expected output comes from `str::encode_utf16`
+        assert!(wide_str!($str).iter().copied().eq($str.encode_utf16().chain([0])));
+        assert!(utf16!($str).iter().copied().eq($str.encode_utf16()));
+    }};
+}
+
+#[test]
+fn test_utf16_macros() {
+    check_utf16!("hello world"); // ASCII only
+    check_utf16!("€4.50"); // a three-byte UTF-8 sequence
+    check_utf16!("𨉟呐㗂越"); // a four-byte sequence (surrogate pair) and three-byte ones
+    check_utf16!("Pchnąć w tę łódź jeża lub ośm skrzyń fig"); // two-byte sequences
+}
diff --git a/library/std/src/sys/pal/windows/mod.rs b/library/std/src/sys/pal/windows/mod.rs
index 6a561518fad..e68954d447a 100644
--- a/library/std/src/sys/pal/windows/mod.rs
+++ b/library/std/src/sys/pal/windows/mod.rs
@@ -5,6 +5,7 @@ use crate::io::ErrorKind;
 use crate::mem::MaybeUninit;
 use crate::os::windows::ffi::{OsStrExt, OsStringExt};
 use crate::path::PathBuf;
+use crate::sys::pal::windows::api::wide_str;
 use crate::time::Duration;
 
 pub use self::rand::hashmap_random_keys;
@@ -12,6 +13,8 @@ pub use self::rand::hashmap_random_keys;
 #[macro_use]
 pub mod compat;
 
+mod api;
+
 pub mod alloc;
 pub mod args;
 pub mod c;
@@ -41,8 +44,6 @@ cfg_if::cfg_if! {
     }
 }
 
-mod api;
-
 /// Map a Result<T, WinError> to io::Result<T>.
 trait IoResult<T> {
     fn io_result(self) -> crate::io::Result<T>;

From 952d432666e6b1a8c76c332375e3483213532670 Mon Sep 17 00:00:00 2001
From: Chris Denton <chris@chrisdenton.dev>
Date: Sat, 6 Apr 2024 03:15:06 +0000
Subject: [PATCH 2/3] Windows: set main thread name without reencoding

---
 library/std/src/sys/pal/windows/mod.rs    |  2 +-
 library/std/src/sys/pal/windows/thread.rs | 10 +++++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/library/std/src/sys/pal/windows/mod.rs b/library/std/src/sys/pal/windows/mod.rs
index e68954d447a..a734c2bd4c7 100644
--- a/library/std/src/sys/pal/windows/mod.rs
+++ b/library/std/src/sys/pal/windows/mod.rs
@@ -61,7 +61,7 @@ pub unsafe fn init(_argc: isize, _argv: *const *const u8, _sigpipe: u8) {
 
     // Normally, `thread::spawn` will call `Thread::set_name` but since this thread already
     // exists, we have to call it ourselves.
-    thread::Thread::set_name(&c"main");
+    thread::Thread::set_name_wide(wide_str!("main"));
 }
 
 // SAFETY: must be called only once during runtime cleanup.
diff --git a/library/std/src/sys/pal/windows/thread.rs b/library/std/src/sys/pal/windows/thread.rs
index c0c63c3340f..9b1c5b34bbf 100644
--- a/library/std/src/sys/pal/windows/thread.rs
+++ b/library/std/src/sys/pal/windows/thread.rs
@@ -59,13 +59,17 @@ impl Thread {
     pub fn set_name(name: &CStr) {
         if let Ok(utf8) = name.to_str() {
             if let Ok(utf16) = to_u16s(utf8) {
-                unsafe {
-                    c::SetThreadDescription(c::GetCurrentThread(), utf16.as_ptr());
-                };
+                Self::set_name_wide(&utf16)
             };
         };
     }
 
+    pub fn set_name_wide(name: &[u16]) {
+        debug_assert!(name.last() == Some(&0), "thread name must be null-terminated");
+        // SAFETY: callers must pass a null-terminated buffer (asserted in debug builds only).
+        unsafe { c::SetThreadDescription(c::GetCurrentThread(), name.as_ptr()) };
+    }
+
     pub fn join(self) {
         let rc = unsafe { c::WaitForSingleObject(self.handle.as_raw_handle(), c::INFINITE) };
         if rc == c::WAIT_FAILED {

From 19f04a7d6878fc4c258ba3d4374e81c8bbeca2e0 Mon Sep 17 00:00:00 2001
From: Chris Denton <chris@chrisdenton.dev>
Date: Mon, 8 Apr 2024 11:42:16 +0000
Subject: [PATCH 3/3] Add comment on UTF-16 surrogates

---
 library/std/src/sys/pal/windows/api.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/library/std/src/sys/pal/windows/api.rs b/library/std/src/sys/pal/windows/api.rs
index 8613dba42d2..555ad581b85 100644
--- a/library/std/src/sys/pal/windows/api.rs
+++ b/library/std/src/sys/pal/windows/api.rs
@@ -68,6 +68,8 @@ pub const fn utf16_len(s: &str) -> usize {
             n => n as usize,
         };
         i += utf8_len;
+        // Note that UTF-16 surrogates (U+D800 to U+DFFF) are not encodable as UTF-8,
+        // so (unlike with WTF-8) we don't have to worry about how they'll get re-encoded.
         len += if utf8_len < 4 { 1 } else { 2 };
     }
     len