Rollup merge of #130408 - okaneco:into_lossy_refactor, r=Noratrieb

Avoid re-validating UTF-8 in `FromUtf8Error::into_utf8_lossy`

Part of the unstable feature `string_from_utf8_lossy_owned` - #129436

Refactor `FromUtf8Error::into_utf8_lossy` to copy valid UTF-8 bytes into the buffer, avoiding double validation of bytes.
Add tests that mirror the `String::from_utf8_lossy` tests.
This commit is contained in:
Michael Goulet 2024-09-21 15:18:56 -04:00 committed by GitHub
commit 493852ccd6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 63 additions and 1 deletions

View File

@ -2087,7 +2087,31 @@ pub fn as_bytes(&self) -> &[u8] {
#[cfg(not(no_global_oom_handling))] #[cfg(not(no_global_oom_handling))]
#[unstable(feature = "string_from_utf8_lossy_owned", issue = "129436")] #[unstable(feature = "string_from_utf8_lossy_owned", issue = "129436")]
pub fn into_utf8_lossy(self) -> String { pub fn into_utf8_lossy(self) -> String {
String::from_utf8_lossy_owned(self.bytes) const REPLACEMENT: &str = "\u{FFFD}";
let mut res = {
let mut v = Vec::with_capacity(self.bytes.len());
// `Utf8Error::valid_up_to` returns the maximum index of validated
// UTF-8 bytes. Copy the valid bytes into the output buffer.
v.extend_from_slice(&self.bytes[..self.error.valid_up_to()]);
// SAFETY: This is safe because the only bytes present in the buffer
// were validated as UTF-8 by the call to `String::from_utf8` which
// produced this `FromUtf8Error`.
unsafe { String::from_utf8_unchecked(v) }
};
let iter = self.bytes[self.error.valid_up_to()..].utf8_chunks();
for chunk in iter {
res.push_str(chunk.valid());
if !chunk.invalid().is_empty() {
res.push_str(REPLACEMENT);
}
}
res
} }
/// Returns the bytes that were attempted to convert to a `String`. /// Returns the bytes that were attempted to convert to a `String`.

View File

@ -28,6 +28,7 @@
#![feature(iter_next_chunk)] #![feature(iter_next_chunk)]
#![feature(round_char_boundary)] #![feature(round_char_boundary)]
#![feature(slice_partition_dedup)] #![feature(slice_partition_dedup)]
#![feature(string_from_utf8_lossy_owned)]
#![feature(string_remove_matches)] #![feature(string_remove_matches)]
#![feature(const_btree_len)] #![feature(const_btree_len)]
#![feature(const_trait_impl)] #![feature(const_trait_impl)]

View File

@ -114,6 +114,43 @@ fn test_from_utf8_lossy() {
); );
} }
#[test]
fn test_fromutf8error_into_lossy() {
    // Converts `input` with `String::from_utf8`, falling back to
    // `FromUtf8Error::into_utf8_lossy` on failure, and compares the result
    // with `expected`. `#[track_caller]` keeps assertion failures pointing at
    // the individual case below rather than at this helper.
    #[track_caller]
    fn check(input: &[u8], expected: &str) {
        let actual = match String::from_utf8(input.to_vec()) {
            Ok(valid) => valid,
            Err(err) => err.into_utf8_lossy(),
        };
        assert_eq!(actual, expected);
    }

    // Fully valid input comes back unchanged.
    check(b"hello", "hello");
    check("ศไทย中华Việt Nam".as_bytes(), "ศไทย中华Việt Nam");

    // Invalid sequences are replaced with U+FFFD.
    check(b"Hello\xC2 There\xFF Goodbye", "Hello\u{FFFD} There\u{FFFD} Goodbye");
    check(
        b"Hello\xC0\x80 There\xE6\x83 Goodbye",
        "Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye",
    );
    check(b"\xF5foo\xF5\x80bar", "\u{FFFD}foo\u{FFFD}\u{FFFD}bar");
    check(b"\xF1foo\xF1\x80bar\xF1\x80\x80baz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}baz");
    check(b"\xF4foo\xF4\x80bar\xF4\xBFbaz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}\u{FFFD}baz");
    check(
        b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar",
        "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}foo\u{10000}bar",
    );

    // surrogates
    check(
        b"\xED\xA0\x80foo\xED\xBF\xBFbar",
        "\u{FFFD}\u{FFFD}\u{FFFD}foo\u{FFFD}\u{FFFD}\u{FFFD}bar",
    );
}
#[test] #[test]
fn test_from_utf16() { fn test_from_utf16() {
let pairs = [ let pairs = [