Rollup merge of #130408 - okaneco:into_lossy_refactor, r=Noratrieb
Avoid re-validating UTF-8 in `FromUtf8Error::into_utf8_lossy`. Part of the unstable feature `string_from_utf8_lossy_owned` (tracking issue #129436). Refactor `FromUtf8Error::into_utf8_lossy` to copy the already-validated UTF-8 bytes into the buffer, avoiding double validation of those bytes. Add tests that mirror the `String::from_utf8_lossy` tests.
This commit is contained in:
commit
493852ccd6
@ -2087,7 +2087,31 @@ pub fn as_bytes(&self) -> &[u8] {
|
|||||||
#[cfg(not(no_global_oom_handling))]
|
#[cfg(not(no_global_oom_handling))]
|
||||||
#[unstable(feature = "string_from_utf8_lossy_owned", issue = "129436")]
|
#[unstable(feature = "string_from_utf8_lossy_owned", issue = "129436")]
|
||||||
pub fn into_utf8_lossy(self) -> String {
|
pub fn into_utf8_lossy(self) -> String {
|
||||||
String::from_utf8_lossy_owned(self.bytes)
|
const REPLACEMENT: &str = "\u{FFFD}";
|
||||||
|
|
||||||
|
let mut res = {
|
||||||
|
let mut v = Vec::with_capacity(self.bytes.len());
|
||||||
|
|
||||||
|
// `Utf8Error::valid_up_to` returns the maximum index of validated
|
||||||
|
// UTF-8 bytes. Copy the valid bytes into the output buffer.
|
||||||
|
v.extend_from_slice(&self.bytes[..self.error.valid_up_to()]);
|
||||||
|
|
||||||
|
// SAFETY: This is safe because the only bytes present in the buffer
|
||||||
|
// were validated as UTF-8 by the call to `String::from_utf8` which
|
||||||
|
// produced this `FromUtf8Error`.
|
||||||
|
unsafe { String::from_utf8_unchecked(v) }
|
||||||
|
};
|
||||||
|
|
||||||
|
let iter = self.bytes[self.error.valid_up_to()..].utf8_chunks();
|
||||||
|
|
||||||
|
for chunk in iter {
|
||||||
|
res.push_str(chunk.valid());
|
||||||
|
if !chunk.invalid().is_empty() {
|
||||||
|
res.push_str(REPLACEMENT);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
res
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the bytes that were attempted to convert to a `String`.
|
/// Returns the bytes that were attempted to convert to a `String`.
|
||||||
|
@ -28,6 +28,7 @@
|
|||||||
#![feature(iter_next_chunk)]
|
#![feature(iter_next_chunk)]
|
||||||
#![feature(round_char_boundary)]
|
#![feature(round_char_boundary)]
|
||||||
#![feature(slice_partition_dedup)]
|
#![feature(slice_partition_dedup)]
|
||||||
|
#![feature(string_from_utf8_lossy_owned)]
|
||||||
#![feature(string_remove_matches)]
|
#![feature(string_remove_matches)]
|
||||||
#![feature(const_btree_len)]
|
#![feature(const_btree_len)]
|
||||||
#![feature(const_trait_impl)]
|
#![feature(const_trait_impl)]
|
||||||
|
@ -114,6 +114,43 @@ fn test_from_utf8_lossy() {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_fromutf8error_into_lossy() {
|
||||||
|
fn func(input: &[u8]) -> String {
|
||||||
|
String::from_utf8(input.to_owned()).unwrap_or_else(|e| e.into_utf8_lossy())
|
||||||
|
}
|
||||||
|
|
||||||
|
let xs = b"hello";
|
||||||
|
let ys = "hello".to_owned();
|
||||||
|
assert_eq!(func(xs), ys);
|
||||||
|
|
||||||
|
let xs = "ศไทย中华Việt Nam".as_bytes();
|
||||||
|
let ys = "ศไทย中华Việt Nam".to_owned();
|
||||||
|
assert_eq!(func(xs), ys);
|
||||||
|
|
||||||
|
let xs = b"Hello\xC2 There\xFF Goodbye";
|
||||||
|
assert_eq!(func(xs), "Hello\u{FFFD} There\u{FFFD} Goodbye".to_owned());
|
||||||
|
|
||||||
|
let xs = b"Hello\xC0\x80 There\xE6\x83 Goodbye";
|
||||||
|
assert_eq!(func(xs), "Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye".to_owned());
|
||||||
|
|
||||||
|
let xs = b"\xF5foo\xF5\x80bar";
|
||||||
|
assert_eq!(func(xs), "\u{FFFD}foo\u{FFFD}\u{FFFD}bar".to_owned());
|
||||||
|
|
||||||
|
let xs = b"\xF1foo\xF1\x80bar\xF1\x80\x80baz";
|
||||||
|
assert_eq!(func(xs), "\u{FFFD}foo\u{FFFD}bar\u{FFFD}baz".to_owned());
|
||||||
|
|
||||||
|
let xs = b"\xF4foo\xF4\x80bar\xF4\xBFbaz";
|
||||||
|
assert_eq!(func(xs), "\u{FFFD}foo\u{FFFD}bar\u{FFFD}\u{FFFD}baz".to_owned());
|
||||||
|
|
||||||
|
let xs = b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar";
|
||||||
|
assert_eq!(func(xs), "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}foo\u{10000}bar".to_owned());
|
||||||
|
|
||||||
|
// surrogates
|
||||||
|
let xs = b"\xED\xA0\x80foo\xED\xBF\xBFbar";
|
||||||
|
assert_eq!(func(xs), "\u{FFFD}\u{FFFD}\u{FFFD}foo\u{FFFD}\u{FFFD}\u{FFFD}bar".to_owned());
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_from_utf16() {
|
fn test_from_utf16() {
|
||||||
let pairs = [
|
let pairs = [
|
||||||
|
Loading…
Reference in New Issue
Block a user