Allow access to OsStr bytes
`OsStr` has historically kept its implementation details private out of concern for locking us into a specific encoding on Windows. This is an alternative to #95290 which proposed specifying the encoding on Windows. Instead, this only specifies that for cross-platform code, `OsStr`'s encoding is a superset of UTF-8 and defines rules for safely interacting with it. At minimum, this can greatly simplify the `os_str_bytes` crate and every arg parser that interacts with `OsStr` directly (which is most of those that support invalid UTF-8).
This commit is contained in:
parent
70e04bd88d
commit
8d2beb50c2
@ -127,6 +127,14 @@
|
|||||||
//! trait, which provides a [`from_wide`] method to convert a native Windows
|
//! trait, which provides a [`from_wide`] method to convert a native Windows
|
||||||
//! string (without the terminating nul character) to an [`OsString`].
|
//! string (without the terminating nul character) to an [`OsString`].
|
||||||
//!
|
//!
|
||||||
|
//! ## On all platforms
|
||||||
|
//!
|
||||||
|
//! On all platforms, [`OsStr`] consists of a sequence of bytes that is encoded as a superset of
|
||||||
|
//! UTF-8; see [`OsString`] for more details on its encoding on different platforms.
|
||||||
|
//!
|
||||||
|
//! For limited, inexpensive conversions from and to bytes, see [`OsStr::as_os_str_bytes`] and
|
||||||
|
//! [`OsStr::from_os_str_bytes_unchecked`].
|
||||||
|
//!
|
||||||
//! [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value
|
//! [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value
|
||||||
//! [Unicode code point]: https://www.unicode.org/glossary/#code_point
|
//! [Unicode code point]: https://www.unicode.org/glossary/#code_point
|
||||||
//! [`env::set_var()`]: crate::env::set_var "env::set_var"
|
//! [`env::set_var()`]: crate::env::set_var "env::set_var"
|
||||||
|
@ -667,6 +667,45 @@ pub fn new<S: AsRef<OsStr> + ?Sized>(s: &S) -> &OsStr {
|
|||||||
s.as_ref()
|
s.as_ref()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Converts a slice of bytes to an OS string slice without checking that the string contains
|
||||||
|
/// valid `OsStr`-encoded data.
|
||||||
|
///
|
||||||
|
/// See the [module's toplevel documentation about conversions][conversions] for safe,
|
||||||
|
/// cross-platform [conversions] from/to native representations.
|
||||||
|
///
|
||||||
|
/// # Safety
|
||||||
|
///
|
||||||
|
/// `OsStr`'s encoding is an unspecified superset of UTF-8 and callers must
|
||||||
|
/// pass in bytes that originated as a mixture of validated UTF-8 and bytes from
|
||||||
|
/// [`OsStr::as_os_str_bytes`] from within the same Rust version built for the same target
|
||||||
|
/// platform. The bytes from `OsStr::as_os_str_bytes` may be split either
|
||||||
|
/// immediately before or immediately after some valid non-empty UTF-8 substring.
|
||||||
|
///
|
||||||
|
/// # Example
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// #![feature(os_str_bytes)]
|
||||||
|
///
|
||||||
|
/// use std::ffi::OsStr;
|
||||||
|
///
|
||||||
|
/// let os_str = OsStr::new("Mary had a little lamb");
|
||||||
|
/// let bytes = os_str.as_os_str_bytes();
|
||||||
|
/// let words = bytes.split(|b| *b == b' ');
|
||||||
|
/// let words: Vec<&OsStr> = words.map(|word| {
|
||||||
|
/// // SAFETY:
|
||||||
|
/// // - Each `word` only contains content that originated from `OsStr::as_os_str_bytes`
|
||||||
|
/// // - Only split with ASCII whitespace which is a non-empty UTF-8 substring
|
||||||
|
/// unsafe { OsStr::from_os_str_bytes_unchecked(word) }
|
||||||
|
/// }).collect();
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// [conversions]: super#conversions
|
||||||
|
#[inline]
|
||||||
|
#[unstable(feature = "os_str_bytes", issue = "111544")]
|
||||||
|
pub unsafe fn from_os_str_bytes_unchecked(bytes: &[u8]) -> &Self {
|
||||||
|
// SAFETY: the caller upholds the encoding contract described under `# Safety`.
|
||||||
|
Self::from_inner(unsafe { Slice::from_os_str_bytes_unchecked(bytes) })
|
||||||
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn from_inner(inner: &Slice) -> &OsStr {
|
fn from_inner(inner: &Slice) -> &OsStr {
|
||||||
// SAFETY: OsStr is just a wrapper of Slice,
|
// SAFETY: OsStr is just a wrapper of Slice,
|
||||||
@ -837,13 +876,28 @@ pub fn into_os_string(self: Box<OsStr>) -> OsString {
|
|||||||
OsString { inner: Buf::from_box(boxed) }
|
OsString { inner: Buf::from_box(boxed) }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Converts an OS string slice to a byte slice. To convert the byte slice back into an OS
|
||||||
|
/// string slice, use the [`OsStr::from_os_str_bytes_unchecked`] function.
|
||||||
|
///
|
||||||
|
/// Note: As the encoding is unspecified, any sub-slice of bytes that is not valid UTF-8 should
|
||||||
|
/// be treated as opaque and only comparable within the same Rust version built for the same
|
||||||
|
/// target platform. See [`OsString`] for more encoding details and [`std::ffi`] for
|
||||||
|
/// platform-specific, specified conversions.
|
||||||
|
///
|
||||||
|
/// [`std::ffi`]: crate::ffi
|
||||||
|
#[inline]
|
||||||
|
#[unstable(feature = "os_str_bytes", issue = "111544")]
|
||||||
|
pub fn as_os_str_bytes(&self) -> &[u8] {
|
||||||
|
self.inner.as_os_str_bytes()
|
||||||
|
}
|
||||||
|
|
||||||
/// Gets the underlying byte representation.
|
/// Gets the underlying byte representation.
|
||||||
///
|
///
|
||||||
/// Note: it is *crucial* that this API is not externally public, to avoid
|
/// Note: it is *crucial* that this API is not externally public, to avoid
|
||||||
/// revealing the internal, platform-specific encodings.
|
/// revealing the internal, platform-specific encodings.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub(crate) fn bytes(&self) -> &[u8] {
|
pub(crate) fn bytes(&self) -> &[u8] {
|
||||||
unsafe { &*(&self.inner as *const _ as *const [u8]) }
|
self.as_os_str_bytes()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Converts this string to its ASCII lower case equivalent in-place.
|
/// Converts this string to its ASCII lower case equivalent in-place.
|
||||||
|
@ -193,13 +193,18 @@ pub fn into_rc(&self) -> Rc<Slice> {
|
|||||||
|
|
||||||
impl Slice {
|
impl Slice {
|
||||||
#[inline]
|
#[inline]
|
||||||
fn from_u8_slice(s: &[u8]) -> &Slice {
|
pub fn as_os_str_bytes(&self) -> &[u8] {
|
||||||
|
&self.inner
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
pub unsafe fn from_os_str_bytes_unchecked(s: &[u8]) -> &Slice {
|
||||||
unsafe { mem::transmute(s) }
|
unsafe { mem::transmute(s) }
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn from_str(s: &str) -> &Slice {
|
pub fn from_str(s: &str) -> &Slice {
|
||||||
Slice::from_u8_slice(s.as_bytes())
|
unsafe { Slice::from_os_str_bytes_unchecked(s.as_bytes()) }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn to_str(&self) -> Option<&str> {
|
pub fn to_str(&self) -> Option<&str> {
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn slice_debug_output() {
|
fn slice_debug_output() {
|
||||||
let input = Slice::from_u8_slice(b"\xF0hello,\tworld");
|
let input = unsafe { Slice::from_os_str_bytes_unchecked(b"\xF0hello,\tworld") };
|
||||||
let expected = r#""\xF0hello,\tworld""#;
|
let expected = r#""\xF0hello,\tworld""#;
|
||||||
let output = format!("{input:?}");
|
let output = format!("{input:?}");
|
||||||
|
|
||||||
@ -11,8 +11,7 @@ fn slice_debug_output() {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn display() {
|
fn display() {
|
||||||
assert_eq!(
|
assert_eq!("Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye", unsafe {
|
||||||
"Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye",
|
Slice::from_os_str_bytes_unchecked(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string()
|
||||||
Slice::from_u8_slice(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string(),
|
},);
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
@ -151,6 +151,16 @@ pub fn into_rc(&self) -> Rc<Slice> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Slice {
|
impl Slice {
|
||||||
|
#[inline]
|
||||||
|
pub fn as_os_str_bytes(&self) -> &[u8] {
|
||||||
|
self.inner.as_bytes()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
pub unsafe fn from_os_str_bytes_unchecked(s: &[u8]) -> &Slice {
|
||||||
|
mem::transmute(Wtf8::from_bytes_unchecked(s))
|
||||||
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn from_str(s: &str) -> &Slice {
|
pub fn from_str(s: &str) -> &Slice {
|
||||||
unsafe { mem::transmute(Wtf8::from_str(s)) }
|
unsafe { mem::transmute(Wtf8::from_str(s)) }
|
||||||
|
@ -570,7 +570,7 @@ pub fn from_str(value: &str) -> &Wtf8 {
|
|||||||
/// Since the byte slice is not checked for valid WTF-8, this function is
|
/// Since the byte slice is not checked for valid WTF-8, this function is
|
||||||
/// marked unsafe.
|
/// marked unsafe.
|
||||||
#[inline]
|
#[inline]
|
||||||
unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
|
pub unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
|
||||||
mem::transmute(value)
|
mem::transmute(value)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -614,6 +614,12 @@ pub fn code_points(&self) -> Wtf8CodePoints<'_> {
|
|||||||
Wtf8CodePoints { bytes: self.bytes.iter() }
|
Wtf8CodePoints { bytes: self.bytes.iter() }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Access raw bytes of WTF-8 data
|
||||||
|
#[inline]
|
||||||
|
pub fn as_bytes(&self) -> &[u8] {
|
||||||
|
&self.bytes
|
||||||
|
}
|
||||||
|
|
||||||
/// Tries to convert the string to UTF-8 and return a `&str` slice.
|
/// Tries to convert the string to UTF-8 and return a `&str` slice.
|
||||||
///
|
///
|
||||||
/// Returns `None` if the string contains surrogates.
|
/// Returns `None` if the string contains surrogates.
|
||||||
|
Loading…
Reference in New Issue
Block a user