Avoid re-validating UTF-8 in FromUtf8Error::into_utf8_lossy

Refactor `into_utf8_lossy` to copy valid UTF-8 bytes into the buffer,
avoiding double validation of bytes.
Add tests that mirror the `String::from_utf8_lossy` tests.
This commit is contained in:
okaneco 2024-08-25 02:41:56 -04:00
parent 8c2c9a9ef5
commit b94c5a169b
3 changed files with 63 additions and 1 deletions

View File

@ -2081,7 +2081,31 @@ pub fn as_bytes(&self) -> &[u8] {
#[cfg(not(no_global_oom_handling))]
#[unstable(feature = "string_from_utf8_lossy_owned", issue = "129436")]
pub fn into_utf8_lossy(self) -> String {
// Consumes the error and builds a `String` from `self.bytes`: the prefix
// that already validated is copied as-is, and the remainder is walked in
// chunks, with each invalid sequence replaced by U+FFFD.
//
// NOTE(review): the next line appears to be the pre-change body that this
// diff removes (the +/- markers were lost in this view); everything after
// it is the replacement implementation. Confirm against the real diff.
String::from_utf8_lossy_owned(self.bytes)
// U+FFFD REPLACEMENT CHARACTER, pushed once for every chunk that has a
// non-empty invalid part.
const REPLACEMENT: &str = "\u{FFFD}";
let mut res = {
// Reserve the input length up front; `res` can still grow past this when
// later `push_str` calls need more room.
let mut v = Vec::with_capacity(self.bytes.len());
// `Utf8Error::valid_up_to` returns the maximum index of validated
// UTF-8 bytes. Copy the valid bytes into the output buffer.
v.extend_from_slice(&self.bytes[..self.error.valid_up_to()]);
// SAFETY: This is safe because the only bytes present in the buffer
// were validated as UTF-8 by the call to `String::from_utf8` which
// produced this `FromUtf8Error`.
unsafe { String::from_utf8_unchecked(v) }
};
// Only the bytes from `valid_up_to()` onward are chunked here; the prefix
// was already copied above and is not examined again.
let iter = self.bytes[self.error.valid_up_to()..].utf8_chunks();
for chunk in iter {
// Append the valid part of the chunk unchanged.
res.push_str(chunk.valid());
// A chunk whose invalid part is empty gets no replacement character.
if !chunk.invalid().is_empty() {
res.push_str(REPLACEMENT);
}
}
res
}
/// Returns the bytes that were attempted to convert to a `String`.

View File

@ -28,6 +28,7 @@
#![feature(iter_next_chunk)]
#![feature(round_char_boundary)]
#![feature(slice_partition_dedup)]
#![feature(string_from_utf8_lossy_owned)]
#![feature(string_remove_matches)]
#![feature(const_btree_len)]
#![feature(const_trait_impl)]

View File

@ -114,6 +114,43 @@ fn test_from_utf8_lossy() {
);
}
#[test]
fn test_fromutf8error_into_lossy() {
    // Try the strict conversion first; only when it fails is the resulting
    // `FromUtf8Error` turned into a lossy `String`.
    fn lossy(bytes: &[u8]) -> String {
        match String::from_utf8(bytes.to_vec()) {
            Ok(text) => text,
            Err(err) => err.into_utf8_lossy(),
        }
    }

    // (input bytes, expected output) pairs, checked in order.
    let cases: [(&[u8], &str); 9] = [
        // Valid input is returned unchanged.
        (b"hello", "hello"),
        ("ศไทย中华Việt Nam".as_bytes(), "ศไทย中华Việt Nam"),
        // Inputs containing invalid sequences.
        (b"Hello\xC2 There\xFF Goodbye", "Hello\u{FFFD} There\u{FFFD} Goodbye"),
        (b"Hello\xC0\x80 There\xE6\x83 Goodbye", "Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye"),
        (b"\xF5foo\xF5\x80bar", "\u{FFFD}foo\u{FFFD}\u{FFFD}bar"),
        (b"\xF1foo\xF1\x80bar\xF1\x80\x80baz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}baz"),
        (b"\xF4foo\xF4\x80bar\xF4\xBFbaz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}\u{FFFD}baz"),
        (
            b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar",
            "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}foo\u{10000}bar",
        ),
        // surrogates
        (
            b"\xED\xA0\x80foo\xED\xBF\xBFbar",
            "\u{FFFD}\u{FFFD}\u{FFFD}foo\u{FFFD}\u{FFFD}\u{FFFD}bar",
        ),
    ];

    for (input, expected) in cases {
        assert_eq!(lossy(input), expected);
    }
}
#[test]
fn test_from_utf16() {
let pairs = [