From ee604fccd9d91b8f4cf9b4cad5bafdf698dd0335 Mon Sep 17 00:00:00 2001
From: Ed Page <eopage@gmail.com>
Date: Fri, 7 Jul 2023 09:46:48 -0500
Subject: [PATCH] Allow limited access to `OsString` bytes

This extends #109698 to allow no-cost conversion between `Vec<u8>` and `OsString`
as suggested in feedback from the `os_str_bytes` crate in #111544.
---
 library/std/src/ffi/os_str.rs         | 65 +++++++++++++++++++++++++++
 library/std/src/sys/unix/os_str.rs    | 10 +++++
 library/std/src/sys/windows/os_str.rs | 10 +++++
 library/std/src/sys_common/wtf8.rs    | 15 +++++++
 4 files changed, 100 insertions(+)
diff --git a/library/std/src/ffi/os_str.rs b/library/std/src/ffi/os_str.rs
index fbdf7f5ecac..c0f2dfa4e0b 100644
--- a/library/std/src/ffi/os_str.rs
+++ b/library/std/src/ffi/os_str.rs
@@ -141,6 +141,51 @@ pub fn new() -> OsString {
         OsString { inner: Buf::from_string(String::new()) }
     }
 
+    /// Converts bytes to an `OsString` without checking that the bytes contain
+    /// valid [`OsStr`]-encoded data.
+    ///
+    /// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8.
+    /// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit
+    /// ASCII.
+    ///
+    /// See the [module's toplevel documentation about conversions][conversions] for safe,
+    /// cross-platform [conversions] from/to native representations.
+    ///
+    /// # Safety
+    ///
+    /// As the encoding is unspecified, callers must pass in bytes that originated as a mixture of
+    /// validated UTF-8 and bytes from [`OsStr::as_os_str_bytes`] from within the same Rust version
+    /// built for the same target platform.  For example, reconstructing an `OsString` from bytes sent
+    /// over the network or stored in a file will likely violate these safety rules.
+    ///
+    /// Due to the encoding being self-synchronizing, the bytes from [`OsStr::as_os_str_bytes`] can be
+    /// split either immediately before or immediately after any valid non-empty UTF-8 substring.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// #![feature(os_str_bytes)]
+    ///
+    /// use std::ffi::{OsStr, OsString};
+    ///
+    /// let os_str = OsStr::new("Mary had a little lamb");
+    /// let bytes = os_str.as_os_str_bytes();
+    /// let words = bytes.split(|b| *b == b' ');
+    /// let words: Vec<OsString> = words.map(|word| {
+    ///     // SAFETY:
+    ///     // - Each `word` only contains content that originated from `OsStr::as_os_str_bytes`
+    ///     // - Only split with ASCII whitespace which is a non-empty UTF-8 substring
+    ///     unsafe { OsString::from_os_str_bytes_unchecked(word.to_vec()) }
+    /// }).collect();
+    /// ```
+    ///
+    /// [conversions]: super#conversions
+    #[inline]
+    #[unstable(feature = "os_str_bytes", issue = "111544")]
+    pub unsafe fn from_os_str_bytes_unchecked(bytes: Vec<u8>) -> Self {
+        OsString { inner: Buf::from_os_str_bytes_unchecked(bytes) }
+    }
+
     /// Converts to an [`OsStr`] slice.
     ///
     /// # Examples
@@ -159,6 +204,26 @@ pub fn as_os_str(&self) -> &OsStr {
         self
     }
 
+    /// Converts the `OsString` into a byte vector.  To convert the byte vector back into an
+    /// `OsString`, use the [`OsString::from_os_str_bytes_unchecked`] function.
+    ///
+    /// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8.
+    /// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit
+    /// ASCII.
+    ///
+    /// Note: As the encoding is unspecified, any sub-slice of bytes that is not valid UTF-8 should
+    /// be treated as opaque and only comparable within the same Rust version built for the same
+    /// target platform.  For example, sending the bytes over the network or storing them in a file
+    /// will likely result in incompatible data.  See [`OsString`] for more encoding details
+    /// and [`std::ffi`] for platform-specific, specified conversions.
+    ///
+    /// [`std::ffi`]: crate::ffi
+    #[inline]
+    #[unstable(feature = "os_str_bytes", issue = "111544")]
+    pub fn into_os_str_bytes(self) -> Vec<u8> {
+        self.inner.into_os_str_bytes()
+    }
+
     /// Converts the `OsString` into a [`String`] if it contains valid Unicode data.
     ///
     /// On failure, ownership of the original `OsString` is returned.
diff --git a/library/std/src/sys/unix/os_str.rs b/library/std/src/sys/unix/os_str.rs
index f7333fd5a1f..463b0a27515 100644
--- a/library/std/src/sys/unix/os_str.rs
+++ b/library/std/src/sys/unix/os_str.rs
@@ -96,6 +96,16 @@ fn as_inner(&self) -> &[u8] {
 }
 
 impl Buf {
+    #[inline]
+    pub fn into_os_str_bytes(self) -> Vec<u8> {
+        self.inner // the buffer already is a `Vec<u8>`; it is moved out, not copied
+    }
+
+    #[inline]
+    pub unsafe fn from_os_str_bytes_unchecked(s: Vec<u8>) -> Self {
+        Self { inner: s } // stored as-is; no validation is performed on the bytes
+    }
+
     pub fn from_string(s: String) -> Buf {
         Buf { inner: s.into_bytes() }
     }
diff --git a/library/std/src/sys/windows/os_str.rs b/library/std/src/sys/windows/os_str.rs
index 16c4f55c687..4708657a907 100644
--- a/library/std/src/sys/windows/os_str.rs
+++ b/library/std/src/sys/windows/os_str.rs
@@ -63,6 +63,16 @@ fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
 }
 
 impl Buf {
+    #[inline]
+    pub fn into_os_str_bytes(self) -> Vec<u8> {
+        self.inner.into_bytes() // unwraps the `Wtf8Buf` into its backing byte vec
+    }
+
+    #[inline]
+    pub unsafe fn from_os_str_bytes_unchecked(s: Vec<u8>) -> Self {
+        Self { inner: Wtf8Buf::from_bytes_unchecked(s) } // no WTF-8 validation here
+    }
+
     pub fn with_capacity(capacity: usize) -> Buf {
         Buf { inner: Wtf8Buf::with_capacity(capacity) }
     }
diff --git a/library/std/src/sys_common/wtf8.rs b/library/std/src/sys_common/wtf8.rs
index c9d3e13cf0c..195d175cc9b 100644
--- a/library/std/src/sys_common/wtf8.rs
+++ b/library/std/src/sys_common/wtf8.rs
@@ -182,6 +182,15 @@ pub fn with_capacity(capacity: usize) -> Wtf8Buf {
         Wtf8Buf { bytes: Vec::with_capacity(capacity), is_known_utf8: true }
     }
 
+    /// Creates a WTF-8 string from a WTF-8 byte vec.
+    ///
+    /// Since the byte vec is not checked for valid WTF-8, this function is
+    /// marked unsafe.
+    #[inline]
+    pub unsafe fn from_bytes_unchecked(value: Vec<u8>) -> Wtf8Buf {
+        Wtf8Buf { bytes: value, is_known_utf8: false } // unchecked, so not known to be UTF-8
+    }
+
     /// Creates a WTF-8 string from a UTF-8 `String`.
     ///
     /// This takes ownership of the `String` and does not copy.
@@ -402,6 +411,12 @@ pub fn truncate(&mut self, new_len: usize) {
         self.bytes.truncate(new_len)
     }
 
+    /// Consumes the WTF-8 string and returns its underlying vec of bytes without copying.
+    #[inline]
+    pub fn into_bytes(self) -> Vec<u8> {
+        self.bytes
+    }
+
     /// Consumes the WTF-8 string and tries to convert it to UTF-8.
     ///
     /// This does not copy the data.