Rollup merge of #113442 - epage:osstring, r=cuviper

Allow limited access to `OsString` bytes This extends #109698 to allow no-cost conversion between `Vec<u8>` and `OsString` as suggested in feedback from `os_str_bytes` crate in #111544.
2023-07-22 11:48:53 +02:00 · 2023-07-22 11:48:53 +02:00 · 0877d11e8d
commit 0877d11e8d
parent 58a4be1dfb ee604fccd9
4 changed files with 100 additions and 0 deletions
--- a/library/std/src/ffi/os_str.rs
+++ b/library/std/src/ffi/os_str.rs
@ -141,6 +141,51 @@ pub fn new() -> OsString {
        OsString { inner: Buf::from_string(String::new()) }
    }

+    /// Converts bytes to an `OsString` without checking that the bytes contains
+    /// valid [`OsStr`]-encoded data.
+    ///
+    /// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8.
+    /// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit
+    /// ASCII.
+    ///
+    /// See the [module's toplevel documentation about conversions][conversions] for safe,
+    /// cross-platform [conversions] from/to native representations.
+    ///
+    /// # Safety
+    ///
+    /// As the encoding is unspecified, callers must pass in bytes that originated as a mixture of
+    /// validated UTF-8 and bytes from [`OsStr::as_os_str_bytes`] from within the same rust version
+    /// built for the same target platform.  For example, reconstructing an `OsString` from bytes sent
+    /// over the network or stored in a file will likely violate these safety rules.
+    ///
+    /// Due to the encoding being self-synchronizing, the bytes from [`OsStr::as_os_str_bytes`] can be
+    /// split either immediately before or immediately after any valid non-empty UTF-8 substring.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// #![feature(os_str_bytes)]
+    ///
+    /// use std::ffi::OsStr;
+    ///
+    /// let os_str = OsStr::new("Mary had a little lamb");
+    /// let bytes = os_str.as_os_str_bytes();
+    /// let words = bytes.split(|b| *b == b' ');
+    /// let words: Vec<&OsStr> = words.map(|word| {
+    ///     // SAFETY:
+    ///     // - Each `word` only contains content that originated from `OsStr::as_os_str_bytes`
+    ///     // - Only split with ASCII whitespace which is a non-empty UTF-8 substring
+    ///     unsafe { OsStr::from_os_str_bytes_unchecked(word) }
+    /// }).collect();
+    /// ```
+    ///
+    /// [conversions]: super#conversions
+    #[inline]
+    #[unstable(feature = "os_str_bytes", issue = "111544")]
+    pub unsafe fn from_os_str_bytes_unchecked(bytes: Vec<u8>) -> Self {
+        OsString { inner: Buf::from_os_str_bytes_unchecked(bytes) }
+    }
+
    /// Converts to an [`OsStr`] slice.
    ///
    /// # Examples
@ -159,6 +204,26 @@ pub fn as_os_str(&self) -> &OsStr {
        self
    }

+    /// Converts the `OsString` into a byte slice.  To convert the byte slice back into an
+    /// `OsString`, use the [`OsStr::from_os_str_bytes_unchecked`] function.
+    ///
+    /// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8.
+    /// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit
+    /// ASCII.
+    ///
+    /// Note: As the encoding is unspecified, any sub-slice of bytes that is not valid UTF-8 should
+    /// be treated as opaque and only comparable within the same rust version built for the same
+    /// target platform.  For example, sending the bytes over the network or storing it in a file
+    /// will likely result in incompatible data.  See [`OsString`] for more encoding details
+    /// and [`std::ffi`] for platform-specific, specified conversions.
+    ///
+    /// [`std::ffi`]: crate::ffi
+    #[inline]
+    #[unstable(feature = "os_str_bytes", issue = "111544")]
+    pub fn into_os_str_bytes(self) -> Vec<u8> {
+        self.inner.into_os_str_bytes()
+    }
+
    /// Converts the `OsString` into a [`String`] if it contains valid Unicode data.
    ///
    /// On failure, ownership of the original `OsString` is returned.
--- a/library/std/src/sys/unix/os_str.rs
+++ b/library/std/src/sys/unix/os_str.rs
@ -96,6 +96,16 @@ fn as_inner(&self) -> &[u8] {
 }

 impl Buf {
+    #[inline]
+    pub fn into_os_str_bytes(self) -> Vec<u8> {
+        self.inner
+    }
+
+    #[inline]
+    pub unsafe fn from_os_str_bytes_unchecked(s: Vec<u8>) -> Self {
+        Self { inner: s }
+    }
+
    pub fn from_string(s: String) -> Buf {
        Buf { inner: s.into_bytes() }
    }
--- a/library/std/src/sys/windows/os_str.rs
+++ b/library/std/src/sys/windows/os_str.rs
@ -63,6 +63,16 @@ fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
 }

 impl Buf {
+    #[inline]
+    pub fn into_os_str_bytes(self) -> Vec<u8> {
+        self.inner.into_bytes()
+    }
+
+    #[inline]
+    pub unsafe fn from_os_str_bytes_unchecked(s: Vec<u8>) -> Self {
+        Self { inner: Wtf8Buf::from_bytes_unchecked(s) }
+    }
+
    pub fn with_capacity(capacity: usize) -> Buf {
        Buf { inner: Wtf8Buf::with_capacity(capacity) }
    }
--- a/library/std/src/sys_common/wtf8.rs
+++ b/library/std/src/sys_common/wtf8.rs
@ -182,6 +182,15 @@ pub fn with_capacity(capacity: usize) -> Wtf8Buf {
        Wtf8Buf { bytes: Vec::with_capacity(capacity), is_known_utf8: true }
    }

+    /// Creates a WTF-8 string from a WTF-8 byte vec.
+    ///
+    /// Since the byte vec is not checked for valid WTF-8, this functions is
+    /// marked unsafe.
+    #[inline]
+    pub unsafe fn from_bytes_unchecked(value: Vec<u8>) -> Wtf8Buf {
+        Wtf8Buf { bytes: value, is_known_utf8: false }
+    }
+
    /// Creates a WTF-8 string from a UTF-8 `String`.
    ///
    /// This takes ownership of the `String` and does not copy.
@ -402,6 +411,12 @@ pub fn truncate(&mut self, new_len: usize) {
        self.bytes.truncate(new_len)
    }

+    /// Consumes the WTF-8 string and tries to convert it to a vec of bytes.
+    #[inline]
+    pub fn into_bytes(self) -> Vec<u8> {
+        self.bytes
+    }
+
    /// Consumes the WTF-8 string and tries to convert it to UTF-8.
    ///
    /// This does not copy the data.