Yield Err in char::decode_utf8 per Unicode, like String::from_utf8_lossy
This commit is contained in:
parent
892bf3d41d
commit
46226a7a6e
@ -737,25 +737,81 @@ pub struct InvalidSequence(());
|
||||
impl<I: Iterator<Item = u8>> Iterator for DecodeUtf8<I> {
|
||||
type Item = Result<char, InvalidSequence>;
|
||||
#[inline]
|
||||
|
||||
fn next(&mut self) -> Option<Result<char, InvalidSequence>> {
|
||||
self.0.next().map(|b| {
|
||||
if b & 0x80 == 0 { Ok(b as char) } else {
|
||||
let l = (!b).leading_zeros() as usize; // number of bytes in UTF-8 representation
|
||||
if l < 2 || l > 6 { return Err(InvalidSequence(())) };
|
||||
let mut x = (b as u32) & (0x7F >> l);
|
||||
for _ in 0..l-1 {
|
||||
self.0.next().map(|first_byte| {
|
||||
// Emit InvalidSequence according to
|
||||
// Unicode §5.22 Best Practice for U+FFFD Substitution
|
||||
// http://www.unicode.org/versions/Unicode9.0.0/ch05.pdf#G40630
|
||||
|
||||
// Roughly: consume at least one byte,
|
||||
// then validate one byte at a time and stop before the first unexpected byte
|
||||
// (which might be the valid start of the next byte sequence).
|
||||
|
||||
let mut code_point;
|
||||
macro_rules! first_byte {
|
||||
($mask: expr) => {
|
||||
code_point = u32::from(first_byte & $mask)
|
||||
}
|
||||
}
|
||||
macro_rules! continuation_byte {
|
||||
() => { continuation_byte!(0x80...0xBF) };
|
||||
($range: pat) => {
|
||||
match self.0.peek() {
|
||||
Some(&b) if b & 0xC0 == 0x80 => {
|
||||
Some(&byte @ $range) => {
|
||||
code_point = (code_point << 6) | u32::from(byte & 0b0011_1111);
|
||||
self.0.next();
|
||||
x = (x << 6) | (b as u32) & 0x3F;
|
||||
},
|
||||
_ => return Err(InvalidSequence(())),
|
||||
}
|
||||
_ => return Err(InvalidSequence(()))
|
||||
}
|
||||
}
|
||||
match from_u32(x) {
|
||||
Some(x) if l == x.len_utf8() => Ok(x),
|
||||
_ => Err(InvalidSequence(())),
|
||||
}
|
||||
|
||||
match first_byte {
|
||||
0x00...0x7F => {
|
||||
first_byte!(0b1111_1111);
|
||||
}
|
||||
0xC2...0xDF => {
|
||||
first_byte!(0b0001_1111);
|
||||
continuation_byte!();
|
||||
}
|
||||
0xE0 => {
|
||||
first_byte!(0b0000_1111);
|
||||
continuation_byte!(0xA0...0xBF); // 0x80...0x9F here are overlong
|
||||
continuation_byte!();
|
||||
}
|
||||
0xE1...0xEC | 0xEE...0xEF => {
|
||||
first_byte!(0b0000_1111);
|
||||
continuation_byte!();
|
||||
continuation_byte!();
|
||||
}
|
||||
0xED => {
|
||||
first_byte!(0b0000_1111);
|
||||
continuation_byte!(0x80...0x9F); // 0xA0...0xBF here are surrogates
|
||||
continuation_byte!();
|
||||
}
|
||||
0xF0 => {
|
||||
first_byte!(0b0000_0111);
|
||||
continuation_byte!(0x90...0xBF); // 0x80...0x8F here are overlong
|
||||
continuation_byte!();
|
||||
continuation_byte!();
|
||||
}
|
||||
0xF1...0xF3 => {
|
||||
first_byte!(0b0000_0111);
|
||||
continuation_byte!();
|
||||
continuation_byte!();
|
||||
continuation_byte!();
|
||||
}
|
||||
0xF4 => {
|
||||
first_byte!(0b0000_0111);
|
||||
continuation_byte!(0x80...0x8F); // 0x90...0xBF here are beyond char::MAX
|
||||
continuation_byte!();
|
||||
continuation_byte!();
|
||||
}
|
||||
_ => return Err(InvalidSequence(())) // Illegal first byte, overlong, or beyond MAX
|
||||
}
|
||||
unsafe {
|
||||
Ok(from_u32_unchecked(code_point))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
@ -367,12 +367,13 @@ fn test_decode_utf8() {
|
||||
assert_eq!(s, $expected_str,
|
||||
"input bytes: {:?}, expected str: {:?}, result: {:?}",
|
||||
input_bytes, $expected_str, s);
|
||||
assert_eq!(String::from_utf8_lossy(&$input_bytes), $expected_str);
|
||||
}
|
||||
}
|
||||
|
||||
assert_decode_utf8!([], "");
|
||||
assert_decode_utf8!([0x41], "A");
|
||||
assert_decode_utf8!([0xC1, 0x81], "<EFBFBD>");
|
||||
assert_decode_utf8!([0xC1, 0x81], "<EFBFBD><EFBFBD>");
|
||||
assert_decode_utf8!([0xE2, 0x99, 0xA5], "♥");
|
||||
assert_decode_utf8!([0xE2, 0x99, 0xA5, 0x41], "♥A");
|
||||
assert_decode_utf8!([0xE2, 0x99], "<EFBFBD>");
|
||||
@ -385,4 +386,22 @@ fn test_decode_utf8() {
|
||||
assert_decode_utf8!([0xFE, 0x41], "<EFBFBD>A");
|
||||
assert_decode_utf8!([0xFF], "<EFBFBD>");
|
||||
assert_decode_utf8!([0xFF, 0x41], "<EFBFBD>A");
|
||||
assert_decode_utf8!([0xC0, 0x80], "<EFBFBD><EFBFBD>");
|
||||
|
||||
// Surrogates
|
||||
assert_decode_utf8!([0xED, 0x9F, 0xBF], "\u{D7FF}");
|
||||
assert_decode_utf8!([0xED, 0xA0, 0x80], "<EFBFBD><EFBFBD><EFBFBD>");
|
||||
assert_decode_utf8!([0xED, 0xBF, 0x80], "<EFBFBD><EFBFBD><EFBFBD>");
|
||||
assert_decode_utf8!([0xEE, 0x80, 0x80], "\u{E000}");
|
||||
|
||||
// char::MAX
|
||||
assert_decode_utf8!([0xF4, 0x8F, 0xBF, 0xBF], "\u{10FFFF}");
|
||||
assert_decode_utf8!([0xF4, 0x8F, 0xBF, 0x41], "<EFBFBD>A");
|
||||
assert_decode_utf8!([0xF4, 0x90, 0x80, 0x80], "<EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
|
||||
|
||||
// 5- and 6-byte sequences
|
||||
// Part of the original design of UTF-8,
|
||||
// but invalid now that UTF-8 is artificially restricted to match the range of UTF-16.
|
||||
assert_decode_utf8!([0xF8, 0x80, 0x80, 0x80, 0x80], "<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
|
||||
assert_decode_utf8!([0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], "<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user