Rollup merge of #113442 - epage:osstring, r=cuviper

Allow limited access to `OsString` bytes

This extends #109698 to allow no-cost conversion between `Vec<u8>` and `OsString`, as suggested in feedback from the `os_str_bytes` crate in #111544.
This commit is contained in:
Matthias Krüger 2023-07-22 11:48:53 +02:00 committed by GitHub
commit 0877d11e8d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 100 additions and 0 deletions

View File

@ -141,6 +141,51 @@ pub fn new() -> OsString {
OsString { inner: Buf::from_string(String::new()) }
}
/// Converts bytes to an `OsString` without checking that the bytes contain
/// valid [`OsStr`]-encoded data.
///
/// This is the owned counterpart of [`OsStr::from_os_str_bytes_unchecked`]: it takes the
/// `Vec<u8>` by value and returns an `OsString`.
///
/// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8.
/// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit
/// ASCII.
///
/// See the [module's toplevel documentation about conversions][conversions] for safe,
/// cross-platform [conversions] from/to native representations.
///
/// # Safety
///
/// As the encoding is unspecified, callers must pass in bytes that originated as a mixture of
/// validated UTF-8 and bytes from [`OsStr::as_os_str_bytes`] from within the same Rust version
/// built for the same target platform. For example, reconstructing an `OsString` from bytes sent
/// over the network or stored in a file will likely violate these safety rules.
///
/// Due to the encoding being self-synchronizing, the bytes from [`OsStr::as_os_str_bytes`] can be
/// split either immediately before or immediately after any valid non-empty UTF-8 substring.
///
/// # Example
///
/// The example below illustrates the splitting rule using the borrowed
/// [`OsStr::from_os_str_bytes_unchecked`] on sub-slices of the original bytes.
///
/// ```
/// #![feature(os_str_bytes)]
///
/// use std::ffi::OsStr;
///
/// let os_str = OsStr::new("Mary had a little lamb");
/// let bytes = os_str.as_os_str_bytes();
/// let words = bytes.split(|b| *b == b' ');
/// let words: Vec<&OsStr> = words.map(|word| {
///     // SAFETY:
///     // - Each `word` only contains content that originated from `OsStr::as_os_str_bytes`
///     // - Only split with ASCII whitespace which is a non-empty UTF-8 substring
///     unsafe { OsStr::from_os_str_bytes_unchecked(word) }
/// }).collect();
/// ```
///
/// [conversions]: super#conversions
#[inline]
#[unstable(feature = "os_str_bytes", issue = "111544")]
pub unsafe fn from_os_str_bytes_unchecked(bytes: Vec<u8>) -> Self {
// No validation happens here: the vector is forwarded as-is to `Buf`, so the caller's
// obligations from the `# Safety` section above are the only guarantee of validity.
OsString { inner: Buf::from_os_str_bytes_unchecked(bytes) }
}
/// Converts to an [`OsStr`] slice.
///
/// # Examples
@ -159,6 +204,26 @@ pub fn as_os_str(&self) -> &OsStr {
self
}
/// Converts the `OsString` into a byte vector. To convert the byte vector back into an
/// `OsString`, use the [`OsString::from_os_str_bytes_unchecked`] function.
///
/// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8.
/// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit
/// ASCII.
///
/// Note: As the encoding is unspecified, any sub-slice of bytes that is not valid UTF-8 should
/// be treated as opaque and only comparable within the same Rust version built for the same
/// target platform. For example, sending the bytes over the network or storing them in a file
/// will likely result in incompatible data. See [`OsString`] for more encoding details
/// and [`std::ffi`] for platform-specific, specified conversions.
///
/// [`std::ffi`]: crate::ffi
#[inline]
#[unstable(feature = "os_str_bytes", issue = "111544")]
pub fn into_os_str_bytes(self) -> Vec<u8> {
// Consumes `self` and delegates to the inner `Buf`, which gives up its bytes by value.
self.inner.into_os_str_bytes()
}
/// Converts the `OsString` into a [`String`] if it contains valid Unicode data.
///
/// On failure, ownership of the original `OsString` is returned.

View File

@ -96,6 +96,16 @@ fn as_inner(&self) -> &[u8] {
}
impl Buf {
/// Consumes the buffer and returns its underlying byte vector.
///
/// The `inner` vector is moved out as-is; nothing is copied or re-encoded here.
#[inline]
pub fn into_os_str_bytes(self) -> Vec<u8> {
self.inner
}
/// Wraps a byte vector in a `Buf` without inspecting its contents.
///
/// # Safety
///
/// Nothing in this function validates `s`.
/// NOTE(review): presumably callers must uphold the encoding contract documented on
/// `OsString::from_os_str_bytes_unchecked` — confirm against the callers.
#[inline]
pub unsafe fn from_os_str_bytes_unchecked(s: Vec<u8>) -> Self {
// The vector becomes the buffer's storage directly; no copy is made.
Self { inner: s }
}
pub fn from_string(s: String) -> Buf {
Buf { inner: s.into_bytes() }
}

View File

@ -63,6 +63,16 @@ fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
}
impl Buf {
/// Consumes the buffer and returns the bytes held by its inner `Wtf8Buf`.
///
/// This simply forwards to `into_bytes` on the `inner` field.
#[inline]
pub fn into_os_str_bytes(self) -> Vec<u8> {
self.inner.into_bytes()
}
/// Builds a `Buf` from a byte vector by way of `Wtf8Buf::from_bytes_unchecked`.
///
/// # Safety
///
/// Nothing in this function validates `s`; it is passed straight to
/// `Wtf8Buf::from_bytes_unchecked`, which is itself `unsafe`.
/// NOTE(review): presumably `s` must hold valid WTF-8 as required by that constructor —
/// confirm against the callers.
#[inline]
pub unsafe fn from_os_str_bytes_unchecked(s: Vec<u8>) -> Self {
Self { inner: Wtf8Buf::from_bytes_unchecked(s) }
}
pub fn with_capacity(capacity: usize) -> Buf {
Buf { inner: Wtf8Buf::with_capacity(capacity) }
}

View File

@ -182,6 +182,15 @@ pub fn with_capacity(capacity: usize) -> Wtf8Buf {
Wtf8Buf { bytes: Vec::with_capacity(capacity), is_known_utf8: true }
}
/// Creates a WTF-8 string from a WTF-8 byte vec.
///
/// This takes ownership of the vec and does not copy.
///
/// # Safety
///
/// Since the byte vec is not checked for valid WTF-8, this function is
/// marked unsafe: the caller is responsible for `value` being valid WTF-8.
#[inline]
pub unsafe fn from_bytes_unchecked(value: Vec<u8>) -> Wtf8Buf {
// Nothing here inspects `value`, so the `is_known_utf8` flag is left unset.
Wtf8Buf { bytes: value, is_known_utf8: false }
}
/// Creates a WTF-8 string from a UTF-8 `String`.
///
/// This takes ownership of the `String` and does not copy.
@ -402,6 +411,12 @@ pub fn truncate(&mut self, new_len: usize) {
self.bytes.truncate(new_len)
}
/// Consumes the WTF-8 string and returns its underlying vec of bytes.
///
/// This cannot fail and does not copy the data: the `bytes` field is moved out as-is.
#[inline]
pub fn into_bytes(self) -> Vec<u8> {
self.bytes
}
/// Consumes the WTF-8 string and tries to convert it to UTF-8.
///
/// This does not copy the data.