From ee604fccd9d91b8f4cf9b4cad5bafdf698dd0335 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Fri, 7 Jul 2023 09:46:48 -0500 Subject: [PATCH] Allow limited access to `OsString` bytes This extends #109698 to allow no-cost conversion between `Vec<u8>` and `OsString` as suggested in feedback from `os_str_bytes` crate in #111544. --- library/std/src/ffi/os_str.rs | 65 +++++++++++++++++++++++++++ library/std/src/sys/unix/os_str.rs | 10 +++++ library/std/src/sys/windows/os_str.rs | 10 +++++ library/std/src/sys_common/wtf8.rs | 15 +++++++ 4 files changed, 100 insertions(+) diff --git a/library/std/src/ffi/os_str.rs b/library/std/src/ffi/os_str.rs index fbdf7f5ecac..c0f2dfa4e0b 100644 --- a/library/std/src/ffi/os_str.rs +++ b/library/std/src/ffi/os_str.rs @@ -141,6 +141,51 @@ pub fn new() -> OsString { OsString { inner: Buf::from_string(String::new()) } } + /// Converts bytes to an `OsString` without checking that the bytes contain + /// valid [`OsStr`]-encoded data. + /// + /// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8. + /// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit + /// ASCII. + /// + /// See the [module's toplevel documentation about conversions][conversions] for safe, + /// cross-platform [conversions] from/to native representations. + /// + /// # Safety + /// + /// As the encoding is unspecified, callers must pass in bytes that originated as a mixture of + /// validated UTF-8 and bytes from [`OsStr::as_os_str_bytes`] from within the same Rust version + /// built for the same target platform. For example, reconstructing an `OsString` from bytes sent + /// over the network or stored in a file will likely violate these safety rules. + /// + /// Due to the encoding being self-synchronizing, the bytes from [`OsStr::as_os_str_bytes`] can be + /// split either immediately before or immediately after any valid non-empty UTF-8 substring. 
+ /// + /// # Example + /// + /// ``` + /// #![feature(os_str_bytes)] + /// + /// use std::ffi::OsStr; + /// + /// let os_str = OsStr::new("Mary had a little lamb"); + /// let bytes = os_str.as_os_str_bytes(); + /// let words = bytes.split(|b| *b == b' '); + /// let words: Vec<&OsStr> = words.map(|word| { + /// // SAFETY: + /// // - Each `word` only contains content that originated from `OsStr::as_os_str_bytes` + /// // - Only split with ASCII whitespace which is a non-empty UTF-8 substring + /// unsafe { OsStr::from_os_str_bytes_unchecked(word) } + /// }).collect(); + /// ``` + /// + /// [conversions]: super#conversions + #[inline] + #[unstable(feature = "os_str_bytes", issue = "111544")] + pub unsafe fn from_os_str_bytes_unchecked(bytes: Vec<u8>) -> Self { + OsString { inner: Buf::from_os_str_bytes_unchecked(bytes) } + } + /// Converts to an [`OsStr`] slice. /// /// # Examples @@ -159,6 +204,26 @@ pub fn as_os_str(&self) -> &OsStr { self } + /// Converts the `OsString` into a byte vector. To convert the byte vector back into an + /// `OsString`, use the [`OsString::from_os_str_bytes_unchecked`] function. + /// + /// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8. + /// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit + /// ASCII. + /// + /// Note: As the encoding is unspecified, any sub-slice of bytes that is not valid UTF-8 should + /// be treated as opaque and only comparable within the same Rust version built for the same + /// target platform. For example, sending the bytes over the network or storing them in a file + /// will likely result in incompatible data. See [`OsString`] for more encoding details + /// and [`std::ffi`] for platform-specific, specified conversions. 
+ /// + /// [`std::ffi`]: crate::ffi + #[inline] + #[unstable(feature = "os_str_bytes", issue = "111544")] + pub fn into_os_str_bytes(self) -> Vec<u8> { + self.inner.into_os_str_bytes() + } + /// Converts the `OsString` into a [`String`] if it contains valid Unicode data. /// /// On failure, ownership of the original `OsString` is returned. diff --git a/library/std/src/sys/unix/os_str.rs b/library/std/src/sys/unix/os_str.rs index f7333fd5a1f..463b0a27515 100644 --- a/library/std/src/sys/unix/os_str.rs +++ b/library/std/src/sys/unix/os_str.rs @@ -96,6 +96,16 @@ fn as_inner(&self) -> &[u8] { } impl Buf { + #[inline] + pub fn into_os_str_bytes(self) -> Vec<u8> { + self.inner + } + + #[inline] + pub unsafe fn from_os_str_bytes_unchecked(s: Vec<u8>) -> Self { + Self { inner: s } + } + pub fn from_string(s: String) -> Buf { Buf { inner: s.into_bytes() } } diff --git a/library/std/src/sys/windows/os_str.rs b/library/std/src/sys/windows/os_str.rs index 16c4f55c687..4708657a907 100644 --- a/library/std/src/sys/windows/os_str.rs +++ b/library/std/src/sys/windows/os_str.rs @@ -63,6 +63,16 @@ fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { } impl Buf { + #[inline] + pub fn into_os_str_bytes(self) -> Vec<u8> { + self.inner.into_bytes() + } + + #[inline] + pub unsafe fn from_os_str_bytes_unchecked(s: Vec<u8>) -> Self { + Self { inner: Wtf8Buf::from_bytes_unchecked(s) } + } + pub fn with_capacity(capacity: usize) -> Buf { Buf { inner: Wtf8Buf::with_capacity(capacity) } } diff --git a/library/std/src/sys_common/wtf8.rs b/library/std/src/sys_common/wtf8.rs index c9d3e13cf0c..195d175cc9b 100644 --- a/library/std/src/sys_common/wtf8.rs +++ b/library/std/src/sys_common/wtf8.rs @@ -182,6 +182,15 @@ pub fn with_capacity(capacity: usize) -> Wtf8Buf { Wtf8Buf { bytes: Vec::with_capacity(capacity), is_known_utf8: true } } + /// Creates a WTF-8 string from a WTF-8 byte vec. + /// + /// Since the byte vec is not checked for valid WTF-8, this function is + /// marked unsafe. 
+ #[inline] + pub unsafe fn from_bytes_unchecked(value: Vec<u8>) -> Wtf8Buf { + Wtf8Buf { bytes: value, is_known_utf8: false } + } + /// Creates a WTF-8 string from a UTF-8 `String`. /// /// This takes ownership of the `String` and does not copy. @@ -402,6 +411,12 @@ pub fn truncate(&mut self, new_len: usize) { self.bytes.truncate(new_len) } + /// Consumes the WTF-8 string and converts it to a vec of bytes. + #[inline] + pub fn into_bytes(self) -> Vec<u8> { + self.bytes + } + /// Consumes the WTF-8 string and tries to convert it to UTF-8. /// /// This does not copy the data.