Allow access to OsStr bytes

`OsStr` has historically kept its implementation details private out of
concern for locking us into a specific encoding on Windows.

This is an alternative to #95290 which proposed specifying the encoding on Windows.  Instead, this
only specifies that for cross-platform code, `OsStr`'s encoding is a superset of UTF-8 and defines
rules for safely interacting with it

At minimum, this can greatly simplify the `os_str_bytes` crate and every
arg parser that interacts with `OsStr` directly (which is most of those
that support invalid UTF-8).
This commit is contained in:
Ed Page 2023-03-27 21:22:36 -05:00
parent 70e04bd88d
commit 8d2beb50c2
6 changed files with 91 additions and 9 deletions

View File

@ -127,6 +127,14 @@
//! trait, which provides a [`from_wide`] method to convert a native Windows
//! string (without the terminating nul character) to an [`OsString`].
//!
//! ## On all platforms
//!
//! On all platforms, [`OsStr`] consists of a sequence of bytes that is encoded as a superset of
//! UTF-8; see [`OsString`] for more details on its encoding on different platforms.
//!
//! For limited, inexpensive conversions from and to bytes, see [`OsStr::as_os_str_bytes`] and
//! [`OsStr::from_os_str_bytes_unchecked`].
//!
//! [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value
//! [Unicode code point]: https://www.unicode.org/glossary/#code_point
//! [`env::set_var()`]: crate::env::set_var "env::set_var"

View File

@ -667,6 +667,45 @@ pub fn new<S: AsRef<OsStr> + ?Sized>(s: &S) -> &OsStr {
s.as_ref()
}
/// Converts a slice of bytes to an OS string slice without checking that the string contains
/// valid `OsStr`-encoded data.
///
/// See the [module's toplevel documentation about conversions][conversions] for safe,
/// cross-platform [conversions] from/to native representations.
///
/// # Safety
///
/// `OsStr`'s encoding is an unspecified superset of UTF-8 and callers must
/// pass in bytes that originated as a mixture of validated UTF-8 and bytes from
/// [`OsStr::as_os_str_bytes`] from within the same rust version built for the same target
/// platform. The bytes from `OsStr::as_os_str_bytes` may be split either
/// immediately before or immediately after some valid non-empty UTF-8 substring
///
/// # Example
///
/// ```
/// #![feature(os_str_bytes)]
///
/// use std::ffi::OsStr;
///
/// let os_str = OsStr::new("Mary had a little lamb");
/// let bytes = os_str.as_os_str_bytes();
/// let words = bytes.split(|b| *b == b' ');
/// let words: Vec<&OsStr> = words.map(|word| {
/// // SAFETY:
/// // - Each `word` only contains content that originated from `OsStr::as_os_str_bytes`
/// // - Only split with ASCII whitespace which is a non-empty UTF-8 substring
/// unsafe { OsStr::from_os_str_bytes_unchecked(word) }
/// }).collect();
/// ```
///
/// [conversions]: super#conversions
#[inline]
#[unstable(feature = "os_str_bytes", issue = "111544")]
pub fn from_os_str_bytes_unchecked(bytes: &[u8]) -> &Self {
Self::from_inner(Slice::from_os_str_bytes_unchecked(bytes))
}
#[inline]
fn from_inner(inner: &Slice) -> &OsStr {
// SAFETY: OsStr is just a wrapper of Slice,
@ -837,13 +876,28 @@ pub fn into_os_string(self: Box<OsStr>) -> OsString {
OsString { inner: Buf::from_box(boxed) }
}
/// Converts an OS string slice to a byte slice. To convert the byte slice back into an OS
/// string slice, use the [`OsStr::from_os_str_bytes_unchecked`] function.
///
/// Note: As the encoding is unspecified, any sub-slice of bytes that is not valid UTF-8 should
/// be treated as opaque and only comparable within the same rust version built for the same
/// target platform. See [`OsString`] for more encoding details and [`std::ffi`] for
/// platform-specific, specified conversions.
///
/// [`std::ffi`]: crate::ffi
#[inline]
#[unstable(feature = "os_str_bytes", issue = "111544")]
pub fn as_os_str_bytes(&self) -> &[u8] {
self.inner.as_os_str_bytes()
}
/// Gets the underlying byte representation.
///
/// Note: it is *crucial* that this API is not externally public, to avoid
/// revealing the internal, platform-specific encodings.
#[inline]
pub(crate) fn bytes(&self) -> &[u8] {
unsafe { &*(&self.inner as *const _ as *const [u8]) }
self.as_os_str_bytes()
}
/// Converts this string to its ASCII lower case equivalent in-place.

View File

@ -193,13 +193,18 @@ pub fn into_rc(&self) -> Rc<Slice> {
impl Slice {
#[inline]
fn from_u8_slice(s: &[u8]) -> &Slice {
pub fn as_os_str_bytes(&self) -> &[u8] {
&self.inner
}
#[inline]
pub unsafe fn from_os_str_bytes_unchecked(s: &[u8]) -> &Slice {
unsafe { mem::transmute(s) }
}
#[inline]
pub fn from_str(s: &str) -> &Slice {
Slice::from_u8_slice(s.as_bytes())
unsafe { Slice::from_os_str_bytes_unchecked(s.as_bytes()) }
}
pub fn to_str(&self) -> Option<&str> {

View File

@ -2,7 +2,7 @@
#[test]
fn slice_debug_output() {
let input = Slice::from_u8_slice(b"\xF0hello,\tworld");
let input = unsafe { Slice::from_os_str_bytes_unchecked(b"\xF0hello,\tworld") };
let expected = r#""\xF0hello,\tworld""#;
let output = format!("{input:?}");
@ -11,8 +11,7 @@ fn slice_debug_output() {
#[test]
fn display() {
assert_eq!(
"Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye",
Slice::from_u8_slice(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string(),
);
assert_eq!("Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye", unsafe {
Slice::from_os_str_bytes_unchecked(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string()
},);
}

View File

@ -151,6 +151,16 @@ pub fn into_rc(&self) -> Rc<Slice> {
}
impl Slice {
#[inline]
pub fn as_os_str_bytes(&self) -> &[u8] {
self.inner.as_bytes()
}
#[inline]
pub unsafe fn from_os_str_bytes_unchecked(s: &[u8]) -> &Slice {
mem::transmute(Wtf8::from_bytes_unchecked(s))
}
#[inline]
pub fn from_str(s: &str) -> &Slice {
unsafe { mem::transmute(Wtf8::from_str(s)) }

View File

@ -570,7 +570,7 @@ pub fn from_str(value: &str) -> &Wtf8 {
/// Since the byte slice is not checked for valid WTF-8, this functions is
/// marked unsafe.
#[inline]
unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
pub unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
mem::transmute(value)
}
@ -614,6 +614,12 @@ pub fn code_points(&self) -> Wtf8CodePoints<'_> {
Wtf8CodePoints { bytes: self.bytes.iter() }
}
/// Access raw bytes of WTF-8 data
#[inline]
pub fn as_bytes(&self) -> &[u8] {
&self.bytes
}
/// Tries to convert the string to UTF-8 and return a `&str` slice.
///
/// Returns `None` if the string contains surrogates.