Avoid re-validating UTF-8 in FromUtf8Error::into_utf8_lossy
Refactor `into_utf8_lossy` to copy valid UTF-8 bytes into the buffer, avoiding double validation of bytes. Add tests that mirror the `String::from_utf8_lossy` tests.
This commit is contained in:
parent
8c2c9a9ef5
commit
b94c5a169b
@ -2081,7 +2081,31 @@ pub fn as_bytes(&self) -> &[u8] {
|
||||
#[cfg(not(no_global_oom_handling))]
|
||||
#[unstable(feature = "string_from_utf8_lossy_owned", issue = "129436")]
|
||||
pub fn into_utf8_lossy(self) -> String {
|
||||
String::from_utf8_lossy_owned(self.bytes)
|
||||
const REPLACEMENT: &str = "\u{FFFD}";
|
||||
|
||||
let mut res = {
|
||||
let mut v = Vec::with_capacity(self.bytes.len());
|
||||
|
||||
// `Utf8Error::valid_up_to` returns the maximum index of validated
|
||||
// UTF-8 bytes. Copy the valid bytes into the output buffer.
|
||||
v.extend_from_slice(&self.bytes[..self.error.valid_up_to()]);
|
||||
|
||||
// SAFETY: This is safe because the only bytes present in the buffer
|
||||
// were validated as UTF-8 by the call to `String::from_utf8` which
|
||||
// produced this `FromUtf8Error`.
|
||||
unsafe { String::from_utf8_unchecked(v) }
|
||||
};
|
||||
|
||||
let iter = self.bytes[self.error.valid_up_to()..].utf8_chunks();
|
||||
|
||||
for chunk in iter {
|
||||
res.push_str(chunk.valid());
|
||||
if !chunk.invalid().is_empty() {
|
||||
res.push_str(REPLACEMENT);
|
||||
}
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
/// Returns the bytes that were attempted to convert to a `String`.
|
||||
|
@ -28,6 +28,7 @@
|
||||
#![feature(iter_next_chunk)]
|
||||
#![feature(round_char_boundary)]
|
||||
#![feature(slice_partition_dedup)]
|
||||
#![feature(string_from_utf8_lossy_owned)]
|
||||
#![feature(string_remove_matches)]
|
||||
#![feature(const_btree_len)]
|
||||
#![feature(const_trait_impl)]
|
||||
|
@ -114,6 +114,43 @@ fn test_from_utf8_lossy() {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn test_fromutf8error_into_lossy() {
    // Strict conversion first; on failure, fall back to the lossy conversion
    // offered by the returned `FromUtf8Error`.
    fn lossy(input: &[u8]) -> String {
        match String::from_utf8(input.to_owned()) {
            Ok(s) => s,
            Err(e) => e.into_utf8_lossy(),
        }
    }

    // (input bytes, expected string) pairs.
    let cases: &[(&[u8], &str)] = &[
        // Valid input is passed through unchanged.
        (b"hello", "hello"),
        ("ศไทย中华Việt Nam".as_bytes(), "ศไทย中华Việt Nam"),
        // Invalid sequences are replaced with U+FFFD.
        (b"Hello\xC2 There\xFF Goodbye", "Hello\u{FFFD} There\u{FFFD} Goodbye"),
        (b"Hello\xC0\x80 There\xE6\x83 Goodbye", "Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye"),
        (b"\xF5foo\xF5\x80bar", "\u{FFFD}foo\u{FFFD}\u{FFFD}bar"),
        (b"\xF1foo\xF1\x80bar\xF1\x80\x80baz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}baz"),
        (b"\xF4foo\xF4\x80bar\xF4\xBFbaz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}\u{FFFD}baz"),
        (
            b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar",
            "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}foo\u{10000}bar",
        ),
        // surrogates
        (
            b"\xED\xA0\x80foo\xED\xBF\xBFbar",
            "\u{FFFD}\u{FFFD}\u{FFFD}foo\u{FFFD}\u{FFFD}\u{FFFD}bar",
        ),
    ];

    for &(xs, ys) in cases {
        assert_eq!(lossy(xs), ys.to_owned());
    }
}
|
||||
|
||||
#[test]
|
||||
fn test_from_utf16() {
|
||||
let pairs = [
|
||||
|
Loading…
Reference in New Issue
Block a user