From 8d2beb50c2b228f4bd6f8e2d81d82c9e1f5e5ba2 Mon Sep 17 00:00:00 2001
From: Ed Page <eopage@gmail.com>
Date: Mon, 27 Mar 2023 21:22:36 -0500
Subject: [PATCH] Allow access to `OsStr` bytes

`OsStr` has historically kept its implementation details private out of
concern for locking us into a specific encoding on Windows.

This is an alternative to #95290 which proposed specifying the encoding on Windows.  Instead, this
only specifies that for cross-platform code, `OsStr`'s encoding is a superset of UTF-8 and defines
rules for safely interacting with it

At minimum, this can greatly simplify the `os_str_bytes` crate and every
arg parser that interacts with `OsStr` directly (which is most of those
that support invalid UTF-8).
---
 library/std/src/ffi/mod.rs               |  8 ++++
 library/std/src/ffi/os_str.rs            | 56 +++++++++++++++++++++++-
 library/std/src/sys/unix/os_str.rs       |  9 +++-
 library/std/src/sys/unix/os_str/tests.rs |  9 ++--
 library/std/src/sys/windows/os_str.rs    | 10 +++++
 library/std/src/sys_common/wtf8.rs       |  8 +++-
 6 files changed, 91 insertions(+), 9 deletions(-)
diff --git a/library/std/src/ffi/mod.rs b/library/std/src/ffi/mod.rs
index d987bf69b25..3ddb8748753 100644
--- a/library/std/src/ffi/mod.rs
+++ b/library/std/src/ffi/mod.rs
@@ -127,6 +127,14 @@
 //! trait, which provides a [`from_wide`] method to convert a native Windows
 //! string (without the terminating nul character) to an [`OsString`].
 //!
+//! ## On all platforms
+//!
+//! On all platforms, [`OsStr`] consists of a sequence of bytes that is encoded as a superset of
+//! UTF-8; see [`OsString`] for more details on its encoding on different platforms.
+//!
+//! For limited, inexpensive conversions from and to bytes, see [`OsStr::as_os_str_bytes`] and
+//! [`OsStr::from_os_str_bytes_unchecked`].
+//!
 //! [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value
 //! [Unicode code point]: https://www.unicode.org/glossary/#code_point
 //! [`env::set_var()`]: crate::env::set_var "env::set_var"
diff --git a/library/std/src/ffi/os_str.rs b/library/std/src/ffi/os_str.rs
index 5c0541d3caf..25ab2196688 100644
--- a/library/std/src/ffi/os_str.rs
+++ b/library/std/src/ffi/os_str.rs
@@ -667,6 +667,45 @@ pub fn new<S: AsRef<OsStr> + ?Sized>(s: &S) -> &OsStr {
         s.as_ref()
     }
 
+    /// Converts a slice of bytes to an OS string slice without checking that the string contains
+    /// valid `OsStr`-encoded data.
+    ///
+    /// See the [module's toplevel documentation about conversions][conversions] for safe,
+    /// cross-platform [conversions] from/to native representations.
+    ///
+    /// # Safety
+    ///
+    /// `OsStr`'s encoding is an unspecified superset of UTF-8 and callers must
+    /// pass in bytes that originated as a mixture of validated UTF-8 and bytes from
+    /// [`OsStr::as_os_str_bytes`] from within the same rust version built for the same target
+    /// platform.  The bytes from `OsStr::as_os_str_bytes` may be split either
+    /// immediately before or immediately after some valid non-empty UTF-8 substring
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// #![feature(os_str_bytes)]
+    ///
+    /// use std::ffi::OsStr;
+    ///
+    /// let os_str = OsStr::new("Mary had a little lamb");
+    /// let bytes = os_str.as_os_str_bytes();
+    /// let words = bytes.split(|b| *b == b' ');
+    /// let words: Vec<&OsStr> = words.map(|word| {
+    ///     // SAFETY:
+    ///     // - Each `word` only contains content that originated from `OsStr::as_os_str_bytes`
+    ///     // - Only split with ASCII whitespace which is a non-empty UTF-8 substring
+    ///     unsafe { OsStr::from_os_str_bytes_unchecked(word) }
+    /// }).collect();
+    /// ```
+    ///
+    /// [conversions]: super#conversions
+    #[inline]
+    #[unstable(feature = "os_str_bytes", issue = "111544")]
+    pub fn from_os_str_bytes_unchecked(bytes: &[u8]) -> &Self {
+        Self::from_inner(Slice::from_os_str_bytes_unchecked(bytes))
+    }
+
     #[inline]
     fn from_inner(inner: &Slice) -> &OsStr {
         // SAFETY: OsStr is just a wrapper of Slice,
@@ -837,13 +876,28 @@ pub fn into_os_string(self: Box<OsStr>) -> OsString {
         OsString { inner: Buf::from_box(boxed) }
     }
 
+    /// Converts an OS string slice to a byte slice.  To convert the byte slice back into an OS
+    /// string slice, use the [`OsStr::from_os_str_bytes_unchecked`] function.
+    ///
+    /// Note: As the encoding is unspecified, any sub-slice of bytes that is not valid UTF-8 should
+    /// be treated as opaque and only comparable within the same rust version built for the same
+    /// target platform.  See [`OsString`] for more encoding details and [`std::ffi`] for
+    /// platform-specific, specified conversions.
+    ///
+    /// [`std::ffi`]: crate::ffi
+    #[inline]
+    #[unstable(feature = "os_str_bytes", issue = "111544")]
+    pub fn as_os_str_bytes(&self) -> &[u8] {
+        self.inner.as_os_str_bytes()
+    }
+
     /// Gets the underlying byte representation.
     ///
     /// Note: it is *crucial* that this API is not externally public, to avoid
     /// revealing the internal, platform-specific encodings.
     #[inline]
     pub(crate) fn bytes(&self) -> &[u8] {
-        unsafe { &*(&self.inner as *const _ as *const [u8]) }
+        self.as_os_str_bytes()
     }
 
     /// Converts this string to its ASCII lower case equivalent in-place.
diff --git a/library/std/src/sys/unix/os_str.rs b/library/std/src/sys/unix/os_str.rs
index 488217f3941..142fcb9ed0b 100644
--- a/library/std/src/sys/unix/os_str.rs
+++ b/library/std/src/sys/unix/os_str.rs
@@ -193,13 +193,18 @@ pub fn into_rc(&self) -> Rc<Slice> {
 
 impl Slice {
     #[inline]
-    fn from_u8_slice(s: &[u8]) -> &Slice {
+    pub fn as_os_str_bytes(&self) -> &[u8] {
+        &self.inner
+    }
+
+    #[inline]
+    pub unsafe fn from_os_str_bytes_unchecked(s: &[u8]) -> &Slice {
         unsafe { mem::transmute(s) }
     }
 
     #[inline]
     pub fn from_str(s: &str) -> &Slice {
-        Slice::from_u8_slice(s.as_bytes())
+        unsafe { Slice::from_os_str_bytes_unchecked(s.as_bytes()) }
     }
 
     pub fn to_str(&self) -> Option<&str> {
diff --git a/library/std/src/sys/unix/os_str/tests.rs b/library/std/src/sys/unix/os_str/tests.rs
index 22ba0c92350..91bc0e61a4a 100644
--- a/library/std/src/sys/unix/os_str/tests.rs
+++ b/library/std/src/sys/unix/os_str/tests.rs
@@ -2,7 +2,7 @@
 
 #[test]
 fn slice_debug_output() {
-    let input = Slice::from_u8_slice(b"\xF0hello,\tworld");
+    let input = unsafe { Slice::from_os_str_bytes_unchecked(b"\xF0hello,\tworld") };
     let expected = r#""\xF0hello,\tworld""#;
     let output = format!("{input:?}");
 
@@ -11,8 +11,7 @@ fn slice_debug_output() {
 
 #[test]
 fn display() {
-    assert_eq!(
-        "Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye",
-        Slice::from_u8_slice(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string(),
-    );
+    assert_eq!("Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye", unsafe {
+        Slice::from_os_str_bytes_unchecked(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string()
+    },);
 }
diff --git a/library/std/src/sys/windows/os_str.rs b/library/std/src/sys/windows/os_str.rs
index 2f2b0e56e08..611f0d040f0 100644
--- a/library/std/src/sys/windows/os_str.rs
+++ b/library/std/src/sys/windows/os_str.rs
@@ -151,6 +151,16 @@ pub fn into_rc(&self) -> Rc<Slice> {
 }
 
 impl Slice {
+    #[inline]
+    pub fn as_os_str_bytes(&self) -> &[u8] {
+        self.inner.as_bytes()
+    }
+
+    #[inline]
+    pub unsafe fn from_os_str_bytes_unchecked(s: &[u8]) -> &Slice {
+        mem::transmute(Wtf8::from_bytes_unchecked(s))
+    }
+
     #[inline]
     pub fn from_str(s: &str) -> &Slice {
         unsafe { mem::transmute(Wtf8::from_str(s)) }
diff --git a/library/std/src/sys_common/wtf8.rs b/library/std/src/sys_common/wtf8.rs
index ff96c35fb0b..31bb0ad25a6 100644
--- a/library/std/src/sys_common/wtf8.rs
+++ b/library/std/src/sys_common/wtf8.rs
@@ -570,7 +570,7 @@ pub fn from_str(value: &str) -> &Wtf8 {
     /// Since the byte slice is not checked for valid WTF-8, this functions is
     /// marked unsafe.
     #[inline]
-    unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
+    pub unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
         mem::transmute(value)
     }
 
@@ -614,6 +614,12 @@ pub fn code_points(&self) -> Wtf8CodePoints<'_> {
         Wtf8CodePoints { bytes: self.bytes.iter() }
     }
 
+    /// Access raw bytes of WTF-8 data
+    #[inline]
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.bytes
+    }
+
     /// Tries to convert the string to UTF-8 and return a `&str` slice.
     ///
     /// Returns `None` if the string contains surrogates.