From 8d2beb50c2b228f4bd6f8e2d81d82c9e1f5e5ba2 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Mon, 27 Mar 2023 21:22:36 -0500 Subject: [PATCH] Allow access to `OsStr` bytes `OsStr` has historically kept its implementation details private out of concern for locking us into a specific encoding on Windows. This is an alternative to #95290 which proposed specifying the encoding on Windows. Instead, this only specifies that for cross-platform code, `OsStr`'s encoding is a superset of UTF-8 and defines rules for safely interacting with it. At minimum, this can greatly simplify the `os_str_bytes` crate and every arg parser that interacts with `OsStr` directly (which is most of those that support invalid UTF-8). --- library/std/src/ffi/mod.rs | 8 ++++ library/std/src/ffi/os_str.rs | 56 +++++++++++++++++++++++- library/std/src/sys/unix/os_str.rs | 9 +++- library/std/src/sys/unix/os_str/tests.rs | 9 ++-- library/std/src/sys/windows/os_str.rs | 10 +++++ library/std/src/sys_common/wtf8.rs | 8 +++- 6 files changed, 91 insertions(+), 9 deletions(-) diff --git a/library/std/src/ffi/mod.rs b/library/std/src/ffi/mod.rs index d987bf69b25..3ddb8748753 100644 --- a/library/std/src/ffi/mod.rs +++ b/library/std/src/ffi/mod.rs @@ -127,6 +127,14 @@ //! trait, which provides a [`from_wide`] method to convert a native Windows //! string (without the terminating nul character) to an [`OsString`]. //! +//! ## On all platforms +//! +//! On all platforms, [`OsStr`] consists of a sequence of bytes that is encoded as a superset of +//! UTF-8; see [`OsString`] for more details on its encoding on different platforms. +//! +//! For limited, inexpensive conversions from and to bytes, see [`OsStr::as_os_str_bytes`] and +//! [`OsStr::from_os_str_bytes_unchecked`]. +//! //! [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value //! [Unicode code point]: https://www.unicode.org/glossary/#code_point //! 
[`env::set_var()`]: crate::env::set_var "env::set_var" diff --git a/library/std/src/ffi/os_str.rs b/library/std/src/ffi/os_str.rs index 5c0541d3caf..25ab2196688 100644 --- a/library/std/src/ffi/os_str.rs +++ b/library/std/src/ffi/os_str.rs @@ -667,6 +667,45 @@ pub fn new<S: AsRef<OsStr> + ?Sized>(s: &S) -> &OsStr { s.as_ref() } + /// Converts a slice of bytes to an OS string slice without checking that the string contains + /// valid `OsStr`-encoded data. + /// + /// See the [module's toplevel documentation about conversions][conversions] for safe, + /// cross-platform [conversions] from/to native representations. + /// + /// # Safety + /// + /// `OsStr`'s encoding is an unspecified superset of UTF-8 and callers must + /// pass in bytes that originated as a mixture of validated UTF-8 and bytes from + /// [`OsStr::as_os_str_bytes`] from within the same Rust version built for the same target + /// platform. The bytes from `OsStr::as_os_str_bytes` may be split either + /// immediately before or immediately after some valid non-empty UTF-8 substring. + /// + /// # Example + /// + /// ``` + /// #![feature(os_str_bytes)] + /// + /// use std::ffi::OsStr; + /// + /// let os_str = OsStr::new("Mary had a little lamb"); + /// let bytes = os_str.as_os_str_bytes(); + /// let words = bytes.split(|b| *b == b' '); + /// let words: Vec<&OsStr> = words.map(|word| { + /// // SAFETY: + /// // - Each `word` only contains content that originated from `OsStr::as_os_str_bytes` + /// // - Only split with ASCII whitespace which is a non-empty UTF-8 substring + /// unsafe { OsStr::from_os_str_bytes_unchecked(word) } + /// }).collect(); + /// ``` + /// + /// [conversions]: super#conversions + #[inline] + #[unstable(feature = "os_str_bytes", issue = "111544")] + pub fn from_os_str_bytes_unchecked(bytes: &[u8]) -> &Self { + Self::from_inner(Slice::from_os_str_bytes_unchecked(bytes)) + } + #[inline] fn from_inner(inner: &Slice) -> &OsStr { // SAFETY: OsStr is just a wrapper of Slice, @@ -837,13 +876,28 @@ pub fn 
into_os_string(self: Box<OsStr>) -> OsString { OsString { inner: Buf::from_box(boxed) } } + /// Converts an OS string slice to a byte slice. To convert the byte slice back into an OS + /// string slice, use the [`OsStr::from_os_str_bytes_unchecked`] function. + /// + /// Note: As the encoding is unspecified, any sub-slice of bytes that is not valid UTF-8 should + /// be treated as opaque and only comparable within the same Rust version built for the same + /// target platform. See [`OsString`] for more encoding details and [`std::ffi`] for + /// platform-specific, specified conversions. + /// + /// [`std::ffi`]: crate::ffi + #[inline] + #[unstable(feature = "os_str_bytes", issue = "111544")] + pub fn as_os_str_bytes(&self) -> &[u8] { + self.inner.as_os_str_bytes() + } + /// Gets the underlying byte representation. /// /// Note: it is *crucial* that this API is not externally public, to avoid /// revealing the internal, platform-specific encodings. #[inline] pub(crate) fn bytes(&self) -> &[u8] { - unsafe { &*(&self.inner as *const _ as *const [u8]) } + self.as_os_str_bytes() } /// Converts this string to its ASCII lower case equivalent in-place. 
diff --git a/library/std/src/sys/unix/os_str.rs b/library/std/src/sys/unix/os_str.rs index 488217f3941..142fcb9ed0b 100644 --- a/library/std/src/sys/unix/os_str.rs +++ b/library/std/src/sys/unix/os_str.rs @@ -193,13 +193,18 @@ pub fn into_rc(&self) -> Rc<Slice> { impl Slice { #[inline] - fn from_u8_slice(s: &[u8]) -> &Slice { + pub fn as_os_str_bytes(&self) -> &[u8] { + &self.inner + } + + #[inline] + pub unsafe fn from_os_str_bytes_unchecked(s: &[u8]) -> &Slice { unsafe { mem::transmute(s) } } #[inline] pub fn from_str(s: &str) -> &Slice { - Slice::from_u8_slice(s.as_bytes()) + unsafe { Slice::from_os_str_bytes_unchecked(s.as_bytes()) } } pub fn to_str(&self) -> Option<&str> { diff --git a/library/std/src/sys/unix/os_str/tests.rs b/library/std/src/sys/unix/os_str/tests.rs index 22ba0c92350..91bc0e61a4a 100644 --- a/library/std/src/sys/unix/os_str/tests.rs +++ b/library/std/src/sys/unix/os_str/tests.rs @@ -2,7 +2,7 @@ #[test] fn slice_debug_output() { - let input = Slice::from_u8_slice(b"\xF0hello,\tworld"); + let input = unsafe { Slice::from_os_str_bytes_unchecked(b"\xF0hello,\tworld") }; let expected = r#""\xF0hello,\tworld""#; let output = format!("{input:?}"); @@ -11,8 +11,7 @@ fn slice_debug_output() { #[test] fn display() { - assert_eq!( - "Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye", - Slice::from_u8_slice(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string(), - ); + assert_eq!("Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye", unsafe { + Slice::from_os_str_bytes_unchecked(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string() + },); } diff --git a/library/std/src/sys/windows/os_str.rs b/library/std/src/sys/windows/os_str.rs index 2f2b0e56e08..611f0d040f0 100644 --- a/library/std/src/sys/windows/os_str.rs +++ b/library/std/src/sys/windows/os_str.rs @@ -151,6 +151,16 @@ pub fn into_rc(&self) -> Rc<Slice> { } impl Slice { + #[inline] + pub fn as_os_str_bytes(&self) -> &[u8] { + self.inner.as_bytes() + } + + #[inline] + pub unsafe fn from_os_str_bytes_unchecked(s: &[u8]) -> &Slice { 
+ mem::transmute(Wtf8::from_bytes_unchecked(s)) + } + #[inline] pub fn from_str(s: &str) -> &Slice { unsafe { mem::transmute(Wtf8::from_str(s)) } diff --git a/library/std/src/sys_common/wtf8.rs b/library/std/src/sys_common/wtf8.rs index ff96c35fb0b..31bb0ad25a6 100644 --- a/library/std/src/sys_common/wtf8.rs +++ b/library/std/src/sys_common/wtf8.rs @@ -570,7 +570,7 @@ pub fn from_str(value: &str) -> &Wtf8 { /// Since the byte slice is not checked for valid WTF-8, this functions is /// marked unsafe. #[inline] - unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 { + pub unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 { mem::transmute(value) } @@ -614,6 +614,12 @@ pub fn code_points(&self) -> Wtf8CodePoints<'_> { Wtf8CodePoints { bytes: self.bytes.iter() } } + /// Access raw bytes of WTF-8 data + #[inline] + pub fn as_bytes(&self) -> &[u8] { + &self.bytes + } + /// Tries to convert the string to UTF-8 and return a `&str` slice. /// /// Returns `None` if the string contains surrogates.