Rollup merge of #95967 - CAD97:from-utf16, r=dtolnay

Add explicit-endian String::from_utf16 variants

This adds the following APIs under `feature(str_from_utf16_endian)`:

```rust
impl String {
    pub fn from_utf16le(v: &[u8]) -> Result<String, FromUtf16Error>;
    pub fn from_utf16le_lossy(v: &[u8]) -> String;
    pub fn from_utf16be(v: &[u8]) -> Result<String, FromUtf16Error>;
    pub fn from_utf16be_lossy(v: &[u8]) -> String;
}
```

These are versions of `String::from_utf16` that explicitly take [UTF-16LE and UTF-16BE](https://unicode.org/faq/utf_bom.html#gen7). Notably, we can do better than just the obvious `decode_utf16(v.array_chunks::<2>().copied().map(u16::from_le_bytes)).collect()` in that:

- We handle the case where the byte slice is not an even number of bytes, and
- In the case that the UTF-16 is native endian and the slice is aligned, we can forward to `String::from_utf16`.

If the Unicode Consortium actively defines how to handle character replacement when decoding a UTF-16 bytestream with a trailing odd byte, I was unable to find reference. However, the behavior implemented here is fairly self-evidently correct: replace the single errant byte with the replacement character.
This commit is contained in:
Ali MJ Al-Nasrawy 2023-10-11 03:53:16 +03:00 committed by GitHub
commit 38654ad741
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -714,6 +714,156 @@ pub fn from_utf16_lossy(v: &[u16]) -> String {
.collect()
}
/// Decode a UTF-16LEencoded vector `v` into a `String`, returning [`Err`]
/// if `v` contains any invalid data.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// #![feature(str_from_utf16_endian)]
/// // 𝄞music
/// let v = &[0x34, 0xD8, 0x1E, 0xDD, 0x6d, 0x00, 0x75, 0x00,
/// 0x73, 0x00, 0x69, 0x00, 0x63, 0x00];
/// assert_eq!(String::from("𝄞music"),
/// String::from_utf16le(v).unwrap());
///
/// // 𝄞mu<invalid>ic
/// let v = &[0x34, 0xD8, 0x1E, 0xDD, 0x6d, 0x00, 0x75, 0x00,
/// 0x00, 0xD8, 0x69, 0x00, 0x63, 0x00];
/// assert!(String::from_utf16le(v).is_err());
/// ```
#[cfg(not(no_global_oom_handling))]
#[unstable(feature = "str_from_utf16_endian", issue = "116258")]
pub fn from_utf16le(v: &[u8]) -> Result<String, FromUtf16Error> {
if v.len() % 2 != 0 {
return Err(FromUtf16Error(()));
}
match (cfg!(target_endian = "little"), unsafe { v.align_to::<u16>() }) {
(true, ([], v, [])) => Self::from_utf16(v),
_ => char::decode_utf16(v.array_chunks::<2>().copied().map(u16::from_le_bytes))
.collect::<Result<_, _>>()
.map_err(|_| FromUtf16Error(())),
}
}
/// Decode a UTF-16LEencoded slice `v` into a `String`, replacing
/// invalid data with [the replacement character (`U+FFFD`)][U+FFFD].
///
/// Unlike [`from_utf8_lossy`] which returns a [`Cow<'a, str>`],
/// `from_utf16le_lossy` returns a `String` since the UTF-16 to UTF-8
/// conversion requires a memory allocation.
///
/// [`from_utf8_lossy`]: String::from_utf8_lossy
/// [`Cow<'a, str>`]: crate::borrow::Cow "borrow::Cow"
/// [U+FFFD]: core::char::REPLACEMENT_CHARACTER
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// #![feature(str_from_utf16_endian)]
/// // 𝄞mus<invalid>ic<invalid>
/// let v = &[0x34, 0xD8, 0x1E, 0xDD, 0x6d, 0x00, 0x75, 0x00,
/// 0x73, 0x00, 0x1E, 0xDD, 0x69, 0x00, 0x63, 0x00,
/// 0x34, 0xD8];
///
/// assert_eq!(String::from("𝄞mus\u{FFFD}ic\u{FFFD}"),
/// String::from_utf16le_lossy(v));
/// ```
#[cfg(not(no_global_oom_handling))]
#[unstable(feature = "str_from_utf16_endian", issue = "116258")]
pub fn from_utf16le_lossy(v: &[u8]) -> String {
match (cfg!(target_endian = "little"), unsafe { v.align_to::<u16>() }) {
(true, ([], v, [])) => Self::from_utf16_lossy(v),
(true, ([], v, [_remainder])) => Self::from_utf16_lossy(v) + "\u{FFFD}",
_ => {
let mut iter = v.array_chunks::<2>();
let string = char::decode_utf16(iter.by_ref().copied().map(u16::from_le_bytes))
.map(|r| r.unwrap_or(char::REPLACEMENT_CHARACTER))
.collect();
if iter.remainder().is_empty() { string } else { string + "\u{FFFD}" }
}
}
}
/// Decode a UTF-16BEencoded vector `v` into a `String`, returning [`Err`]
/// if `v` contains any invalid data.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// #![feature(str_from_utf16_endian)]
/// // 𝄞music
/// let v = &[0xD8, 0x34, 0xDD, 0x1E, 0x00, 0x6d, 0x00, 0x75,
/// 0x00, 0x73, 0x00, 0x69, 0x00, 0x63];
/// assert_eq!(String::from("𝄞music"),
/// String::from_utf16be(v).unwrap());
///
/// // 𝄞mu<invalid>ic
/// let v = &[0xD8, 0x34, 0xDD, 0x1E, 0x00, 0x6d, 0x00, 0x75,
/// 0xD8, 0x00, 0x00, 0x69, 0x00, 0x63];
/// assert!(String::from_utf16be(v).is_err());
/// ```
#[cfg(not(no_global_oom_handling))]
#[unstable(feature = "str_from_utf16_endian", issue = "116258")]
pub fn from_utf16be(v: &[u8]) -> Result<String, FromUtf16Error> {
if v.len() % 2 != 0 {
return Err(FromUtf16Error(()));
}
match (cfg!(target_endian = "big"), unsafe { v.align_to::<u16>() }) {
(true, ([], v, [])) => Self::from_utf16(v),
_ => char::decode_utf16(v.array_chunks::<2>().copied().map(u16::from_be_bytes))
.collect::<Result<_, _>>()
.map_err(|_| FromUtf16Error(())),
}
}
/// Decode a UTF-16BEencoded slice `v` into a `String`, replacing
/// invalid data with [the replacement character (`U+FFFD`)][U+FFFD].
///
/// Unlike [`from_utf8_lossy`] which returns a [`Cow<'a, str>`],
/// `from_utf16le_lossy` returns a `String` since the UTF-16 to UTF-8
/// conversion requires a memory allocation.
///
/// [`from_utf8_lossy`]: String::from_utf8_lossy
/// [`Cow<'a, str>`]: crate::borrow::Cow "borrow::Cow"
/// [U+FFFD]: core::char::REPLACEMENT_CHARACTER
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// #![feature(str_from_utf16_endian)]
/// // 𝄞mus<invalid>ic<invalid>
/// let v = &[0xD8, 0x34, 0xDD, 0x1E, 0x00, 0x6d, 0x00, 0x75,
/// 0x00, 0x73, 0xDD, 0x1E, 0x00, 0x69, 0x00, 0x63,
/// 0xD8, 0x34];
///
/// assert_eq!(String::from("𝄞mus\u{FFFD}ic\u{FFFD}"),
/// String::from_utf16be_lossy(v));
/// ```
#[cfg(not(no_global_oom_handling))]
#[unstable(feature = "str_from_utf16_endian", issue = "116258")]
pub fn from_utf16be_lossy(v: &[u8]) -> String {
match (cfg!(target_endian = "big"), unsafe { v.align_to::<u16>() }) {
(true, ([], v, [])) => Self::from_utf16_lossy(v),
(true, ([], v, [_remainder])) => Self::from_utf16_lossy(v) + "\u{FFFD}",
_ => {
let mut iter = v.array_chunks::<2>();
let string = char::decode_utf16(iter.by_ref().copied().map(u16::from_be_bytes))
.map(|r| r.unwrap_or(char::REPLACEMENT_CHARACTER))
.collect();
if iter.remainder().is_empty() { string } else { string + "\u{FFFD}" }
}
}
}
/// Decomposes a `String` into its raw components.
///
/// Returns the raw pointer to the underlying data, the length of