Allow access to OsStr
bytes
`OsStr` has historically kept its implementation details private out of concern for locking us into a specific encoding on Windows. This is an alternative to #95290, which proposed specifying the encoding on Windows. Instead, this only specifies that for cross-platform code, `OsStr`'s encoding is a superset of UTF-8 and defines rules for safely interacting with it. At minimum, this can greatly simplify the `os_str_bytes` crate and every arg parser that interacts with `OsStr` directly (which is most of those that support invalid UTF-8).
This commit is contained in:
parent
70e04bd88d
commit
8d2beb50c2
@ -127,6 +127,14 @@
|
||||
//! trait, which provides a [`from_wide`] method to convert a native Windows
|
||||
//! string (without the terminating nul character) to an [`OsString`].
|
||||
//!
|
||||
//! ## On all platforms
|
||||
//!
|
||||
//! On all platforms, [`OsStr`] consists of a sequence of bytes that is encoded as a superset of
|
||||
//! UTF-8; see [`OsString`] for more details on its encoding on different platforms.
|
||||
//!
|
||||
//! For limited, inexpensive conversions from and to bytes, see [`OsStr::as_os_str_bytes`] and
|
||||
//! [`OsStr::from_os_str_bytes_unchecked`].
|
||||
//!
|
||||
//! [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value
|
||||
//! [Unicode code point]: https://www.unicode.org/glossary/#code_point
|
||||
//! [`env::set_var()`]: crate::env::set_var "env::set_var"
|
||||
|
@ -667,6 +667,45 @@ pub fn new<S: AsRef<OsStr> + ?Sized>(s: &S) -> &OsStr {
|
||||
s.as_ref()
|
||||
}
|
||||
|
||||
/// Converts a slice of bytes to an OS string slice without checking that the string contains
|
||||
/// valid `OsStr`-encoded data.
|
||||
///
|
||||
/// See the [module's toplevel documentation about conversions][conversions] for safe,
|
||||
/// cross-platform [conversions] from/to native representations.
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// `OsStr`'s encoding is an unspecified superset of UTF-8 and callers must
|
||||
/// pass in bytes that originated as a mixture of validated UTF-8 and bytes from
|
||||
/// [`OsStr::as_os_str_bytes`] from within the same Rust version built for the same target
|
||||
/// platform. The bytes from `OsStr::as_os_str_bytes` may be split either
|
||||
/// immediately before or immediately after some valid non-empty UTF-8 substring.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// #![feature(os_str_bytes)]
|
||||
///
|
||||
/// use std::ffi::OsStr;
|
||||
///
|
||||
/// let os_str = OsStr::new("Mary had a little lamb");
|
||||
/// let bytes = os_str.as_os_str_bytes();
|
||||
/// let words = bytes.split(|b| *b == b' ');
|
||||
/// let words: Vec<&OsStr> = words.map(|word| {
|
||||
/// // SAFETY:
|
||||
/// // - Each `word` only contains content that originated from `OsStr::as_os_str_bytes`
|
||||
/// // - Only split with ASCII whitespace which is a non-empty UTF-8 substring
|
||||
/// unsafe { OsStr::from_os_str_bytes_unchecked(word) }
|
||||
/// }).collect();
|
||||
/// ```
|
||||
///
|
||||
/// [conversions]: super#conversions
|
||||
#[inline]
|
||||
#[unstable(feature = "os_str_bytes", issue = "111544")]
|
||||
pub fn from_os_str_bytes_unchecked(bytes: &[u8]) -> &Self {
|
||||
Self::from_inner(Slice::from_os_str_bytes_unchecked(bytes))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn from_inner(inner: &Slice) -> &OsStr {
|
||||
// SAFETY: OsStr is just a wrapper of Slice,
|
||||
@ -837,13 +876,28 @@ pub fn into_os_string(self: Box<OsStr>) -> OsString {
|
||||
OsString { inner: Buf::from_box(boxed) }
|
||||
}
|
||||
|
||||
/// Converts an OS string slice to a byte slice. To convert the byte slice back into an OS
|
||||
/// string slice, use the [`OsStr::from_os_str_bytes_unchecked`] function.
|
||||
///
|
||||
/// Note: As the encoding is unspecified, any sub-slice of bytes that is not valid UTF-8 should
|
||||
/// be treated as opaque and only comparable within the same Rust version built for the same
|
||||
/// target platform. See [`OsString`] for more encoding details and [`std::ffi`] for
|
||||
/// platform-specific, specified conversions.
|
||||
///
|
||||
/// [`std::ffi`]: crate::ffi
|
||||
#[inline]
|
||||
#[unstable(feature = "os_str_bytes", issue = "111544")]
|
||||
pub fn as_os_str_bytes(&self) -> &[u8] {
|
||||
self.inner.as_os_str_bytes()
|
||||
}
|
||||
|
||||
/// Gets the underlying byte representation.
|
||||
///
|
||||
/// Note: it is *crucial* that this API is not externally public, to avoid
|
||||
/// revealing the internal, platform-specific encodings.
|
||||
#[inline]
|
||||
pub(crate) fn bytes(&self) -> &[u8] {
|
||||
unsafe { &*(&self.inner as *const _ as *const [u8]) }
|
||||
self.as_os_str_bytes()
|
||||
}
|
||||
|
||||
/// Converts this string to its ASCII lower case equivalent in-place.
|
||||
|
@ -193,13 +193,18 @@ pub fn into_rc(&self) -> Rc<Slice> {
|
||||
|
||||
impl Slice {
|
||||
#[inline]
|
||||
fn from_u8_slice(s: &[u8]) -> &Slice {
|
||||
pub fn as_os_str_bytes(&self) -> &[u8] {
|
||||
&self.inner
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub unsafe fn from_os_str_bytes_unchecked(s: &[u8]) -> &Slice {
|
||||
unsafe { mem::transmute(s) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn from_str(s: &str) -> &Slice {
|
||||
Slice::from_u8_slice(s.as_bytes())
|
||||
unsafe { Slice::from_os_str_bytes_unchecked(s.as_bytes()) }
|
||||
}
|
||||
|
||||
pub fn to_str(&self) -> Option<&str> {
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
#[test]
|
||||
fn slice_debug_output() {
|
||||
let input = Slice::from_u8_slice(b"\xF0hello,\tworld");
|
||||
let input = unsafe { Slice::from_os_str_bytes_unchecked(b"\xF0hello,\tworld") };
|
||||
let expected = r#""\xF0hello,\tworld""#;
|
||||
let output = format!("{input:?}");
|
||||
|
||||
@ -11,8 +11,7 @@ fn slice_debug_output() {
|
||||
|
||||
#[test]
|
||||
fn display() {
|
||||
assert_eq!(
|
||||
"Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye",
|
||||
Slice::from_u8_slice(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string(),
|
||||
);
|
||||
assert_eq!("Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye", unsafe {
|
||||
Slice::from_os_str_bytes_unchecked(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string()
|
||||
},);
|
||||
}
|
||||
|
@ -151,6 +151,16 @@ pub fn into_rc(&self) -> Rc<Slice> {
|
||||
}
|
||||
|
||||
impl Slice {
|
||||
#[inline]
|
||||
pub fn as_os_str_bytes(&self) -> &[u8] {
|
||||
self.inner.as_bytes()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub unsafe fn from_os_str_bytes_unchecked(s: &[u8]) -> &Slice {
|
||||
mem::transmute(Wtf8::from_bytes_unchecked(s))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn from_str(s: &str) -> &Slice {
|
||||
unsafe { mem::transmute(Wtf8::from_str(s)) }
|
||||
|
@ -570,7 +570,7 @@ pub fn from_str(value: &str) -> &Wtf8 {
|
||||
/// Since the byte slice is not checked for valid WTF-8, this function is
|
||||
/// marked unsafe.
|
||||
#[inline]
|
||||
unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
|
||||
pub unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
|
||||
mem::transmute(value)
|
||||
}
|
||||
|
||||
@ -614,6 +614,12 @@ pub fn code_points(&self) -> Wtf8CodePoints<'_> {
|
||||
Wtf8CodePoints { bytes: self.bytes.iter() }
|
||||
}
|
||||
|
||||
/// Access raw bytes of WTF-8 data
|
||||
#[inline]
|
||||
pub fn as_bytes(&self) -> &[u8] {
|
||||
&self.bytes
|
||||
}
|
||||
|
||||
/// Tries to convert the string to UTF-8 and return a `&str` slice.
|
||||
///
|
||||
/// Returns `None` if the string contains surrogates.
|
||||
|
Loading…
Reference in New Issue
Block a user