Refactor low-level UTF-16 decoding.
* Rename `utf16_items` to `decode_utf16`. "Items" is meaningless. * Move it to `rustc_unicode::char`, exposed in `std::char`. * Generalize it to any `u16` iterable, not just `&[u16]`. * Make it yield `Result` instead of a custom `Utf16Item` enum that was isomorphic to `Result`. This enables using the `FromIterator for Result` impl. * Add a `REPLACEMENT_CHARACTER` constant. * Document how `result.unwrap_or(REPLACEMENT_CHARACTER)` replaces `Utf16Item::to_char_lossy`.
This commit is contained in:
parent
c408b78633
commit
6174b8d726
@ -56,6 +56,7 @@
|
||||
#![feature(unicode)]
|
||||
#![feature(unique)]
|
||||
#![feature(unsafe_no_drop_flag, filling_drop)]
|
||||
#![feature(decode_utf16)]
|
||||
#![feature(utf8_error)]
|
||||
#![cfg_attr(test, feature(rand, test))]
|
||||
|
||||
|
@ -20,8 +20,8 @@ use core::ops::{self, Deref, Add, Index};
|
||||
use core::ptr;
|
||||
use core::slice;
|
||||
use core::str::pattern::Pattern;
|
||||
use rustc_unicode::char::{decode_utf16, REPLACEMENT_CHARACTER};
|
||||
use rustc_unicode::str as unicode_str;
|
||||
use rustc_unicode::str::Utf16Item;
|
||||
|
||||
use borrow::{Cow, IntoCow};
|
||||
use range::RangeArgument;
|
||||
@ -267,14 +267,7 @@ impl String {
|
||||
/// ```
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
pub fn from_utf16(v: &[u16]) -> Result<String, FromUtf16Error> {
|
||||
let mut s = String::with_capacity(v.len());
|
||||
for c in unicode_str::utf16_items(v) {
|
||||
match c {
|
||||
Utf16Item::ScalarValue(c) => s.push(c),
|
||||
Utf16Item::LoneSurrogate(_) => return Err(FromUtf16Error(())),
|
||||
}
|
||||
}
|
||||
Ok(s)
|
||||
decode_utf16(v.iter().cloned()).collect::<Result<_, _>>().map_err(|_| FromUtf16Error(()))
|
||||
}
|
||||
|
||||
/// Decode a UTF-16 encoded vector `v` into a string, replacing
|
||||
@ -294,7 +287,7 @@ impl String {
|
||||
#[inline]
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
pub fn from_utf16_lossy(v: &[u16]) -> String {
|
||||
unicode_str::utf16_items(v).map(|c| c.to_char_lossy()).collect()
|
||||
decode_utf16(v.iter().cloned()).map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)).collect()
|
||||
}
|
||||
|
||||
/// Creates a new `String` from a length, capacity, and pointer.
|
||||
|
@ -211,3 +211,12 @@ fn test_len_utf16() {
|
||||
assert!('\u{a66e}'.len_utf16() == 1);
|
||||
assert!('\u{1f4a9}'.len_utf16() == 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_utf16() {
|
||||
fn check(s: &[u16], expected: &[Result<char, u16>]) {
|
||||
assert_eq!(::std::char::decode_utf16(s.iter().cloned()).collect::<Vec<_>>(), expected);
|
||||
}
|
||||
check(&[0xD800, 0x41, 0x42], &[Err(0xD800), Ok('A'), Ok('B')]);
|
||||
check(&[0xD800, 0], &[Err(0xD800), Ok('\0')]);
|
||||
}
|
||||
|
@ -19,6 +19,7 @@
|
||||
#![feature(float_from_str_radix)]
|
||||
#![feature(flt2dec)]
|
||||
#![feature(dec2flt)]
|
||||
#![feature(decode_utf16)]
|
||||
#![feature(fmt_radix)]
|
||||
#![feature(iter_arith)]
|
||||
#![feature(iter_arith)]
|
||||
|
@ -503,3 +503,116 @@ impl char {
|
||||
ToUppercase(CaseMappingIter::new(conversions::to_upper(self)))
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator that decodes UTF-16 encoded codepoints from an iterator of `u16`s.
|
||||
#[unstable(feature = "decode_utf16", reason = "recently exposed", issue = "27830")]
|
||||
#[derive(Clone)]
|
||||
pub struct DecodeUtf16<I> where I: Iterator<Item=u16> {
|
||||
iter: I,
|
||||
buf: Option<u16>,
|
||||
}
|
||||
|
||||
/// Create an iterator over the UTF-16 encoded codepoints in `iterable`,
|
||||
/// returning unpaired surrogates as `Err`s.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// #![feature(decode_utf16)]
|
||||
///
|
||||
/// use std::char::decode_utf16;
|
||||
///
|
||||
/// fn main() {
|
||||
/// // 𝄞mus<invalid>ic<invalid>
|
||||
/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
|
||||
/// 0x0073, 0xDD1E, 0x0069, 0x0063,
|
||||
/// 0xD834];
|
||||
///
|
||||
/// assert_eq!(decode_utf16(v.iter().cloned()).collect::<Vec<_>>(),
|
||||
/// vec![Ok('𝄞'),
|
||||
/// Ok('m'), Ok('u'), Ok('s'),
|
||||
/// Err(0xDD1E),
|
||||
/// Ok('i'), Ok('c'),
|
||||
/// Err(0xD834)]);
|
||||
/// }
|
||||
/// ```
|
||||
///
|
||||
/// A lossy decoder can be obtained by replacing `Err` results with the replacement character:
|
||||
///
|
||||
/// ```
|
||||
/// #![feature(decode_utf16)]
|
||||
///
|
||||
/// use std::char::{decode_utf16, REPLACEMENT_CHARACTER};
|
||||
///
|
||||
/// fn main() {
|
||||
/// // 𝄞mus<invalid>ic<invalid>
|
||||
/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
|
||||
/// 0x0073, 0xDD1E, 0x0069, 0x0063,
|
||||
/// 0xD834];
|
||||
///
|
||||
/// assert_eq!(decode_utf16(v.iter().cloned())
|
||||
/// .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER))
|
||||
/// .collect::<String>(),
|
||||
/// "𝄞mus\u{FFFD}ic\u{FFFD}");
|
||||
/// }
|
||||
/// ```
|
||||
#[unstable(feature = "decode_utf16", reason = "recently exposed", issue = "27830")]
|
||||
#[inline]
|
||||
pub fn decode_utf16<I: IntoIterator<Item=u16>>(iterable: I) -> DecodeUtf16<I::IntoIter> {
|
||||
DecodeUtf16 {
|
||||
iter: iterable.into_iter(),
|
||||
buf: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[unstable(feature = "decode_utf16", reason = "recently exposed", issue = "27830")]
|
||||
impl<I: Iterator<Item=u16>> Iterator for DecodeUtf16<I> {
|
||||
type Item = Result<char, u16>;
|
||||
|
||||
fn next(&mut self) -> Option<Result<char, u16>> {
|
||||
let u = match self.buf.take() {
|
||||
Some(buf) => buf,
|
||||
None => match self.iter.next() {
|
||||
Some(u) => u,
|
||||
None => return None
|
||||
}
|
||||
};
|
||||
|
||||
if u < 0xD800 || 0xDFFF < u {
|
||||
// not a surrogate
|
||||
Some(Ok(unsafe { from_u32_unchecked(u as u32) }))
|
||||
} else if u >= 0xDC00 {
|
||||
// a trailing surrogate
|
||||
Some(Err(u))
|
||||
} else {
|
||||
let u2 = match self.iter.next() {
|
||||
Some(u2) => u2,
|
||||
// eof
|
||||
None => return Some(Err(u))
|
||||
};
|
||||
if u2 < 0xDC00 || u2 > 0xDFFF {
|
||||
// not a trailing surrogate so we're not a valid
|
||||
// surrogate pair, so rewind to redecode u2 next time.
|
||||
self.buf = Some(u2);
|
||||
return Some(Err(u))
|
||||
}
|
||||
|
||||
// all ok, so lets decode it.
|
||||
let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
|
||||
Some(Ok(unsafe { from_u32_unchecked(c) }))
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
let (low, high) = self.iter.size_hint();
|
||||
// we could be entirely valid surrogates (2 elements per
|
||||
// char), or entirely non-surrogates (1 element per char)
|
||||
(low / 2, high)
|
||||
}
|
||||
}
|
||||
|
||||
/// U+FFFD REPLACEMENT CHARACTER (<28>) is used in Unicode to represent a decoding error.
|
||||
/// It can occur, for example, when giving ill-formed UTF-8 bytes to `String::from_utf8_lossy`.
|
||||
#[unstable(feature = "decode_utf16", reason = "recently added", issue = "27830")]
|
||||
pub const REPLACEMENT_CHARACTER: char = '\u{FFFD}';
|
||||
|
@ -46,6 +46,7 @@ mod tables;
|
||||
mod u_str;
|
||||
pub mod char;
|
||||
|
||||
#[allow(deprecated)]
|
||||
pub mod str {
|
||||
pub use u_str::{UnicodeStr, SplitWhitespace};
|
||||
pub use u_str::{utf8_char_width, is_utf16, Utf16Items, Utf16Item};
|
||||
|
@ -13,8 +13,9 @@
|
||||
//! This module provides functionality to `str` that requires the Unicode methods provided by the
|
||||
//! unicode parts of the CharExt trait.
|
||||
|
||||
use char::{DecodeUtf16, decode_utf16};
|
||||
use core::char;
|
||||
use core::iter::Filter;
|
||||
use core::iter::{Cloned, Filter};
|
||||
use core::slice;
|
||||
use core::str::Split;
|
||||
|
||||
@ -119,11 +120,18 @@ pub fn is_utf16(v: &[u16]) -> bool {
|
||||
|
||||
/// An iterator that decodes UTF-16 encoded codepoints from a vector
|
||||
/// of `u16`s.
|
||||
#[deprecated(since = "1.4.0", reason = "renamed to `char::DecodeUtf16`")]
|
||||
#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
|
||||
#[allow(deprecated)]
|
||||
#[derive(Clone)]
|
||||
pub struct Utf16Items<'a> {
|
||||
iter: slice::Iter<'a, u16>
|
||||
decoder: DecodeUtf16<Cloned<slice::Iter<'a, u16>>>
|
||||
}
|
||||
|
||||
/// The possibilities for values decoded from a `u16` stream.
|
||||
#[deprecated(since = "1.4.0", reason = "`char::DecodeUtf16` uses `Result<char, u16>` instead")]
|
||||
#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
|
||||
#[allow(deprecated)]
|
||||
#[derive(Copy, PartialEq, Eq, Clone, Debug)]
|
||||
pub enum Utf16Item {
|
||||
/// A valid codepoint.
|
||||
@ -132,6 +140,7 @@ pub enum Utf16Item {
|
||||
LoneSurrogate(u16)
|
||||
}
|
||||
|
||||
#[allow(deprecated)]
|
||||
impl Utf16Item {
|
||||
/// Convert `self` to a `char`, taking `LoneSurrogate`s to the
|
||||
/// replacement character (U+FFFD).
|
||||
@ -144,49 +153,22 @@ impl Utf16Item {
|
||||
}
|
||||
}
|
||||
|
||||
#[deprecated(since = "1.4.0", reason = "use `char::DecodeUtf16` instead")]
|
||||
#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
|
||||
#[allow(deprecated)]
|
||||
impl<'a> Iterator for Utf16Items<'a> {
|
||||
type Item = Utf16Item;
|
||||
|
||||
fn next(&mut self) -> Option<Utf16Item> {
|
||||
let u = match self.iter.next() {
|
||||
Some(u) => *u,
|
||||
None => return None
|
||||
};
|
||||
|
||||
if u < 0xD800 || 0xDFFF < u {
|
||||
// not a surrogate
|
||||
Some(Utf16Item::ScalarValue(unsafe { char::from_u32_unchecked(u as u32) }))
|
||||
} else if u >= 0xDC00 {
|
||||
// a trailing surrogate
|
||||
Some(Utf16Item::LoneSurrogate(u))
|
||||
} else {
|
||||
// preserve state for rewinding.
|
||||
let old = self.iter.clone();
|
||||
|
||||
let u2 = match self.iter.next() {
|
||||
Some(u2) => *u2,
|
||||
// eof
|
||||
None => return Some(Utf16Item::LoneSurrogate(u))
|
||||
};
|
||||
if u2 < 0xDC00 || u2 > 0xDFFF {
|
||||
// not a trailing surrogate so we're not a valid
|
||||
// surrogate pair, so rewind to redecode u2 next time.
|
||||
self.iter = old.clone();
|
||||
return Some(Utf16Item::LoneSurrogate(u))
|
||||
}
|
||||
|
||||
// all ok, so lets decode it.
|
||||
let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
|
||||
Some(Utf16Item::ScalarValue(unsafe { char::from_u32_unchecked(c) }))
|
||||
}
|
||||
self.decoder.next().map(|result| match result {
|
||||
Ok(c) => Utf16Item::ScalarValue(c),
|
||||
Err(s) => Utf16Item::LoneSurrogate(s),
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
let (low, high) = self.iter.size_hint();
|
||||
// we could be entirely valid surrogates (2 elements per
|
||||
// char), or entirely non-surrogates (1 element per char)
|
||||
(low / 2, high)
|
||||
self.decoder.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
@ -196,7 +178,7 @@ impl<'a> Iterator for Utf16Items<'a> {
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// #![feature(unicode)]
|
||||
/// #![feature(unicode, decode_utf16)]
|
||||
///
|
||||
/// extern crate rustc_unicode;
|
||||
///
|
||||
@ -216,8 +198,11 @@ impl<'a> Iterator for Utf16Items<'a> {
|
||||
/// LoneSurrogate(0xD834)]);
|
||||
/// }
|
||||
/// ```
|
||||
#[deprecated(since = "1.4.0", reason = "renamed to `char::decode_utf16`")]
|
||||
#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
|
||||
#[allow(deprecated)]
|
||||
pub fn utf16_items<'a>(v: &'a [u16]) -> Utf16Items<'a> {
|
||||
Utf16Items { iter : v.iter() }
|
||||
Utf16Items { decoder: decode_utf16(v.iter().cloned()) }
|
||||
}
|
||||
|
||||
/// Iterator adaptor for encoding `char`s to UTF-16.
|
||||
|
@ -209,8 +209,6 @@ use std::str::FromStr;
|
||||
use std::string;
|
||||
use std::{char, f64, fmt, str};
|
||||
use std;
|
||||
use rustc_unicode::str as unicode_str;
|
||||
use rustc_unicode::str::Utf16Item;
|
||||
|
||||
use Encodable;
|
||||
|
||||
@ -1712,11 +1710,13 @@ impl<T: Iterator<Item=char>> Parser<T> {
|
||||
_ => return self.error(UnexpectedEndOfHexEscape),
|
||||
}
|
||||
|
||||
let buf = [n1, try!(self.decode_hex_escape())];
|
||||
match unicode_str::utf16_items(&buf).next() {
|
||||
Some(Utf16Item::ScalarValue(c)) => res.push(c),
|
||||
_ => return self.error(LoneLeadingSurrogateInHexEscape),
|
||||
let n2 = try!(self.decode_hex_escape());
|
||||
if n2 < 0xDC00 || n2 > 0xDFFF {
|
||||
return self.error(LoneLeadingSurrogateInHexEscape)
|
||||
}
|
||||
let c = (((n1 - 0xD800) as u32) << 10 |
|
||||
(n2 - 0xDC00) as u32) + 0x1_0000;
|
||||
res.push(char::from_u32(c).unwrap());
|
||||
}
|
||||
|
||||
n => match char::from_u32(n as u32) {
|
||||
|
@ -242,6 +242,7 @@
|
||||
#![feature(unicode)]
|
||||
#![feature(unique)]
|
||||
#![feature(unsafe_no_drop_flag, filling_drop)]
|
||||
#![feature(decode_utf16)]
|
||||
#![feature(vec_push_all)]
|
||||
#![feature(vec_resize)]
|
||||
#![feature(wrapping)]
|
||||
|
@ -37,7 +37,6 @@ use hash::{Hash, Hasher};
|
||||
use iter::FromIterator;
|
||||
use mem;
|
||||
use ops;
|
||||
use rustc_unicode::str::{Utf16Item, utf16_items};
|
||||
use slice;
|
||||
use str;
|
||||
use string::String;
|
||||
@ -186,14 +185,14 @@ impl Wtf8Buf {
|
||||
/// will always return the original code units.
|
||||
pub fn from_wide(v: &[u16]) -> Wtf8Buf {
|
||||
let mut string = Wtf8Buf::with_capacity(v.len());
|
||||
for item in utf16_items(v) {
|
||||
for item in char::decode_utf16(v.iter().cloned()) {
|
||||
match item {
|
||||
Utf16Item::ScalarValue(c) => string.push_char(c),
|
||||
Utf16Item::LoneSurrogate(s) => {
|
||||
Ok(ch) => string.push_char(ch),
|
||||
Err(surrogate) => {
|
||||
// Surrogates are known to be in the code point range.
|
||||
let code_point = unsafe { CodePoint::from_u32_unchecked(s as u32) };
|
||||
let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) };
|
||||
// Skip the WTF-8 concatenation check,
|
||||
// surrogate pairs are already decoded by utf16_items
|
||||
// surrogate pairs are already decoded by decode_utf16
|
||||
string.push_code_point_unchecked(code_point)
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user