auto merge of #12317 : huonw/rust/utf16, r=alexcrichton
Iterators! Use them (in `is_utf16`), create them (in `utf16_items`). Handle errors gracefully: add `from_utf16_lossy`, and make `from_utf16` return `Option<~str>` instead of failing. Add a pile of tests.
This commit is contained in:
commit
cae5999a54
@ -571,7 +571,9 @@ pub fn readdir(p: &CString) -> IoResult<~[Path]> {
|
||||
else {
|
||||
let fp_vec = vec::from_buf(
|
||||
fp_buf, wcslen(fp_buf) as uint);
|
||||
let fp_str = str::from_utf16(fp_vec);
|
||||
let fp_trimmed = str::truncate_utf16_at_nul(fp_vec);
|
||||
let fp_str = str::from_utf16(fp_trimmed)
|
||||
.expect("rust_list_dir_wfd_fp_buf returned invalid UTF-16");
|
||||
paths.push(Path::new(fp_str));
|
||||
}
|
||||
more_files = FindNextFileW(find_handle, wfd_ptr as HANDLE);
|
||||
|
@ -88,7 +88,8 @@ pub fn getcwd() -> Path {
|
||||
fail!();
|
||||
}
|
||||
}
|
||||
Path::new(str::from_utf16(buf))
|
||||
Path::new(str::from_utf16(str::truncate_utf16_at_nul(buf))
|
||||
.expect("GetCurrentDirectoryW returned invalid UTF-16"))
|
||||
}
|
||||
|
||||
#[cfg(windows)]
|
||||
@ -124,7 +125,12 @@ pub mod win32 {
|
||||
}
|
||||
if k != 0 && done {
|
||||
let sub = buf.slice(0, k as uint);
|
||||
res = option::Some(str::from_utf16(sub));
|
||||
// We want to explicitly catch the case when the
|
||||
// closure returned invalid UTF-16, rather than
|
||||
// set `res` to None and continue.
|
||||
let s = str::from_utf16(sub)
|
||||
.expect("fill_utf16_buf_and_decode: closure created invalid UTF-16");
|
||||
res = option::Some(s)
|
||||
}
|
||||
}
|
||||
return res;
|
||||
@ -739,7 +745,8 @@ pub fn last_os_error() -> ~str {
|
||||
fail!("[{}] FormatMessage failure", errno());
|
||||
}
|
||||
|
||||
str::from_utf16(buf)
|
||||
str::from_utf16(str::truncate_utf16_at_nul(buf))
|
||||
.expect("FormatMessageW returned invalid UTF-16")
|
||||
}
|
||||
}
|
||||
|
||||
@ -828,8 +835,10 @@ fn real_args() -> ~[~str] {
|
||||
while *ptr.offset(len as int) != 0 { len += 1; }
|
||||
|
||||
// Push it onto the list.
|
||||
args.push(vec::raw::buf_as_slice(ptr, len,
|
||||
str::from_utf16));
|
||||
let opt_s = vec::raw::buf_as_slice(ptr, len, |buf| {
|
||||
str::from_utf16(str::truncate_utf16_at_nul(buf))
|
||||
});
|
||||
args.push(opt_s.expect("CommandLineToArgvW returned invalid UTF-16"));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -830,60 +830,192 @@ fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
|
||||
|
||||
/// Determines if a vector of `u16` contains valid UTF-16
|
||||
pub fn is_utf16(v: &[u16]) -> bool {
|
||||
let len = v.len();
|
||||
let mut i = 0u;
|
||||
while i < len {
|
||||
let u = v[i];
|
||||
|
||||
if u <= 0xD7FF_u16 || u >= 0xE000_u16 {
|
||||
i += 1u;
|
||||
|
||||
} else {
|
||||
if i+1u < len { return false; }
|
||||
let u2 = v[i+1u];
|
||||
if u < 0xD7FF_u16 || u > 0xDBFF_u16 { return false; }
|
||||
if u2 < 0xDC00_u16 || u2 > 0xDFFF_u16 { return false; }
|
||||
i += 2u;
|
||||
let mut it = v.iter();
|
||||
macro_rules! next ( ($ret:expr) => {
|
||||
match it.next() { Some(u) => *u, None => return $ret }
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
)
|
||||
loop {
|
||||
let u = next!(true);
|
||||
|
||||
/// Iterates over the utf-16 characters in the specified slice, yielding each
|
||||
/// decoded unicode character to the function provided.
|
||||
///
|
||||
/// # Failures
|
||||
///
|
||||
/// * Fails on invalid utf-16 data
|
||||
pub fn utf16_chars(v: &[u16], f: |char|) {
|
||||
let len = v.len();
|
||||
let mut i = 0u;
|
||||
while i < len && v[i] != 0u16 {
|
||||
let u = v[i];
|
||||
|
||||
if u <= 0xD7FF_u16 || u >= 0xE000_u16 {
|
||||
f(unsafe { cast::transmute(u as u32) });
|
||||
i += 1u;
|
||||
|
||||
} else {
|
||||
let u2 = v[i+1u];
|
||||
assert!(u >= 0xD800_u16 && u <= 0xDBFF_u16);
|
||||
assert!(u2 >= 0xDC00_u16 && u2 <= 0xDFFF_u16);
|
||||
let mut c: u32 = (u - 0xD800_u16) as u32;
|
||||
c = c << 10;
|
||||
c |= (u2 - 0xDC00_u16) as u32;
|
||||
c |= 0x1_0000_u32;
|
||||
f(unsafe { cast::transmute(c) });
|
||||
i += 2u;
|
||||
match char::from_u32(u as u32) {
|
||||
Some(_) => {}
|
||||
None => {
|
||||
let u2 = next!(false);
|
||||
if u < 0xD7FF || u > 0xDBFF ||
|
||||
u2 < 0xDC00 || u2 > 0xDFFF { return false; }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Allocates a new string from the utf-16 slice provided
|
||||
pub fn from_utf16(v: &[u16]) -> ~str {
|
||||
let mut buf = with_capacity(v.len());
|
||||
utf16_chars(v, |ch| buf.push_char(ch));
|
||||
buf
|
||||
/// An iterator that decodes UTF-16 encoded codepoints from a vector
|
||||
/// of `u16`s.
|
||||
#[deriving(Clone)]
|
||||
pub struct UTF16Items<'a> {
|
||||
priv iter: vec::Items<'a, u16>
|
||||
}
|
||||
/// The possibilities for values decoded from a `u16` stream.
|
||||
#[deriving(Eq, TotalEq, Clone)]
|
||||
pub enum UTF16Item {
|
||||
/// A valid codepoint.
|
||||
ScalarValue(char),
|
||||
/// An invalid surrogate without its pair.
|
||||
LoneSurrogate(u16)
|
||||
}
|
||||
|
||||
impl UTF16Item {
|
||||
/// Convert `self` to a `char`, taking `LoneSurrogate`s to the
|
||||
/// replacement character (U+FFFD).
|
||||
#[inline]
|
||||
pub fn to_char_lossy(&self) -> char {
|
||||
match *self {
|
||||
ScalarValue(c) => c,
|
||||
LoneSurrogate(_) => '\uFFFD'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator<UTF16Item> for UTF16Items<'a> {
|
||||
fn next(&mut self) -> Option<UTF16Item> {
|
||||
let u = match self.iter.next() {
|
||||
Some(u) => *u,
|
||||
None => return None
|
||||
};
|
||||
|
||||
if u < 0xD800 || 0xDFFF < u {
|
||||
// not a surrogate
|
||||
Some(ScalarValue(unsafe {cast::transmute(u as u32)}))
|
||||
} else if u >= 0xDC00 {
|
||||
// a trailing surrogate
|
||||
Some(LoneSurrogate(u))
|
||||
} else {
|
||||
// preserve state for rewinding.
|
||||
let old = self.iter;
|
||||
|
||||
let u2 = match self.iter.next() {
|
||||
Some(u2) => *u2,
|
||||
// eof
|
||||
None => return Some(LoneSurrogate(u))
|
||||
};
|
||||
if u2 < 0xDC00 || u2 > 0xDFFF {
|
||||
// not a trailing surrogate so we're not a valid
|
||||
// surrogate pair, so rewind to redecode u2 next time.
|
||||
self.iter = old;
|
||||
return Some(LoneSurrogate(u))
|
||||
}
|
||||
|
||||
// all ok, so lets decode it.
|
||||
let c = ((u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
|
||||
Some(ScalarValue(unsafe {cast::transmute(c)}))
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn size_hint(&self) -> (uint, Option<uint>) {
|
||||
let (low, high) = self.iter.size_hint();
|
||||
// we could be entirely valid surrogates (2 elements per
|
||||
// char), or entirely non-surrogates (1 element per char)
|
||||
(low / 2, high)
|
||||
}
|
||||
}
|
||||
|
||||
/// Create an iterator over the UTF-16 encoded codepoints in `v`,
|
||||
/// returning invalid surrogates as `LoneSurrogate`s.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// use std::str;
|
||||
/// use std::str::{ScalarValue, LoneSurrogate};
|
||||
///
|
||||
/// // 𝄞mus<invalid>ic<invalid>
|
||||
/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
|
||||
/// 0x0073, 0xDD1E, 0x0069, 0x0063,
|
||||
/// 0xD834];
|
||||
///
|
||||
/// assert_eq!(str::utf16_items(v).to_owned_vec(),
|
||||
/// ~[ScalarValue('𝄞'),
|
||||
/// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
|
||||
/// LoneSurrogate(0xDD1E),
|
||||
/// ScalarValue('i'), ScalarValue('c'),
|
||||
/// LoneSurrogate(0xD834)]);
|
||||
/// ```
|
||||
pub fn utf16_items<'a>(v: &'a [u16]) -> UTF16Items<'a> {
|
||||
UTF16Items { iter : v.iter() }
|
||||
}
|
||||
|
||||
/// Return the prefix of `v` that precedes its first NUL (0) code
/// unit, or all of `v` when it contains no NUL.
///
/// # Example
///
/// ```rust
/// use std::str;
///
/// // "abcd"
/// let mut v = ['a' as u16, 'b' as u16, 'c' as u16, 'd' as u16];
/// // no NULs so no change
/// assert_eq!(str::truncate_utf16_at_nul(v), v.as_slice());
///
/// // "ab\0d"
/// v[2] = 0;
/// assert_eq!(str::truncate_utf16_at_nul(v),
///            &['a' as u16, 'b' as u16]);
/// ```
pub fn truncate_utf16_at_nul<'a>(v: &'a [u16]) -> &'a [u16] {
    // Splitting on NUL yields the leading NUL-free run as its first
    // piece: the whole slice when there is no NUL, and an empty slice
    // when `v` is empty or starts with NUL. The NUL itself is never
    // part of a piece.
    v.split(|unit| *unit == 0).next().unwrap_or(v)
}
|
||||
|
||||
/// Decode a UTF-16 encoded vector `v` into a string, returning `None`
|
||||
/// if `v` contains any invalid data.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// use std::str;
|
||||
///
|
||||
/// // 𝄞music
|
||||
/// let mut v = [0xD834, 0xDD1E, 0x006d, 0x0075,
|
||||
/// 0x0073, 0x0069, 0x0063];
|
||||
/// assert_eq!(str::from_utf16(v), Some(~"𝄞music"));
|
||||
///
|
||||
/// // 𝄞mu<invalid>ic
|
||||
/// v[4] = 0xD800;
|
||||
/// assert_eq!(str::from_utf16(v), None);
|
||||
/// ```
|
||||
pub fn from_utf16(v: &[u16]) -> Option<~str> {
|
||||
let mut s = with_capacity(v.len() / 2);
|
||||
for c in utf16_items(v) {
|
||||
match c {
|
||||
ScalarValue(c) => s.push_char(c),
|
||||
LoneSurrogate(_) => return None
|
||||
}
|
||||
}
|
||||
Some(s)
|
||||
}
|
||||
|
||||
/// Decode a UTF-16 encoded vector `v` into a string, replacing
|
||||
/// invalid data with the replacement character (U+FFFD).
|
||||
///
|
||||
/// # Example
|
||||
/// ```rust
|
||||
/// use std::str;
|
||||
///
|
||||
/// // 𝄞mus<invalid>ic<invalid>
|
||||
/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
|
||||
/// 0x0073, 0xDD1E, 0x0069, 0x0063,
|
||||
/// 0xD834];
|
||||
///
|
||||
/// assert_eq!(str::from_utf16_lossy(v),
|
||||
/// ~"𝄞mus\uFFFDic\uFFFD");
|
||||
/// ```
|
||||
pub fn from_utf16_lossy(v: &[u16]) -> ~str {
|
||||
utf16_items(v).map(|c| c.to_char_lossy()).collect()
|
||||
}
|
||||
|
||||
/// Allocates a new string with the specified capacity. The string returned is
|
||||
@ -3536,6 +3668,65 @@ mod tests {
|
||||
assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF]));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_is_utf16() {
|
||||
macro_rules! pos ( ($($e:expr),*) => { { $(assert!(is_utf16($e));)* } });
|
||||
|
||||
// non-surrogates
|
||||
pos!([0x0000],
|
||||
[0x0001, 0x0002],
|
||||
[0xD7FF],
|
||||
[0xE000]);
|
||||
|
||||
// surrogate pairs (randomly generated with Python 3's
|
||||
// .encode('utf-16be'))
|
||||
pos!([0xdb54, 0xdf16, 0xd880, 0xdee0, 0xdb6a, 0xdd45],
|
||||
[0xd91f, 0xdeb1, 0xdb31, 0xdd84, 0xd8e2, 0xde14],
|
||||
[0xdb9f, 0xdc26, 0xdb6f, 0xde58, 0xd850, 0xdfae]);
|
||||
|
||||
// mixtures (also random)
|
||||
pos!([0xd921, 0xdcc2, 0x002d, 0x004d, 0xdb32, 0xdf65],
|
||||
[0xdb45, 0xdd2d, 0x006a, 0xdacd, 0xddfe, 0x0006],
|
||||
[0x0067, 0xd8ff, 0xddb7, 0x000f, 0xd900, 0xdc80]);
|
||||
|
||||
// negative tests
|
||||
macro_rules! neg ( ($($e:expr),*) => { { $(assert!(!is_utf16($e));)* } });
|
||||
|
||||
neg!(
|
||||
// surrogate + regular unit
|
||||
[0xdb45, 0x0000],
|
||||
// surrogate + lead surrogate
|
||||
[0xd900, 0xd900],
|
||||
// unterminated surrogate
|
||||
[0xd8ff],
|
||||
// trail surrogate without a lead
|
||||
[0xddb7]);
|
||||
|
||||
// random byte sequences that Python 3's .decode('utf-16be')
|
||||
// failed on
|
||||
neg!([0x5b3d, 0x0141, 0xde9e, 0x8fdc, 0xc6e7],
|
||||
[0xdf5a, 0x82a5, 0x62b9, 0xb447, 0x92f3],
|
||||
[0xda4e, 0x42bc, 0x4462, 0xee98, 0xc2ca],
|
||||
[0xbe00, 0xb04a, 0x6ecb, 0xdd89, 0xe278],
|
||||
[0x0465, 0xab56, 0xdbb6, 0xa893, 0x665e],
|
||||
[0x6b7f, 0x0a19, 0x40f4, 0xa657, 0xdcc5],
|
||||
[0x9b50, 0xda5e, 0x24ec, 0x03ad, 0x6dee],
|
||||
[0x8d17, 0xcaa7, 0xf4ae, 0xdf6e, 0xbed7],
|
||||
[0xdaee, 0x2584, 0x7d30, 0xa626, 0x121a],
|
||||
[0xd956, 0x4b43, 0x7570, 0xccd6, 0x4f4a],
|
||||
[0x9dcf, 0x1b49, 0x4ba5, 0xfce9, 0xdffe],
|
||||
[0x6572, 0xce53, 0xb05a, 0xf6af, 0xdacf],
|
||||
[0x1b90, 0x728c, 0x9906, 0xdb68, 0xf46e],
|
||||
[0x1606, 0xbeca, 0xbe76, 0x860f, 0xdfa5],
|
||||
[0x8b4f, 0xde7a, 0xd220, 0x9fac, 0x2b6f],
|
||||
[0xb8fe, 0xebbe, 0xda32, 0x1a5f, 0x8b8b],
|
||||
[0x934b, 0x8956, 0xc434, 0x1881, 0xddf7],
|
||||
[0x5a95, 0x13fc, 0xf116, 0xd89b, 0x93f9],
|
||||
[0xd640, 0x71f1, 0xdd7d, 0x77eb, 0x1cd8],
|
||||
[0x348b, 0xaef0, 0xdb2c, 0xebf1, 0x1282],
|
||||
[0x50d7, 0xd824, 0x5010, 0xb369, 0x22ea]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_raw_from_c_str() {
|
||||
unsafe {
|
||||
@ -3687,17 +3878,72 @@ mod tests {
|
||||
0xdc9c_u16, 0xd801_u16, 0xdc92_u16, 0xd801_u16,
|
||||
0xdc96_u16, 0xd801_u16, 0xdc86_u16, 0x0020_u16,
|
||||
0xd801_u16, 0xdc95_u16, 0xd801_u16, 0xdc86_u16,
|
||||
0x000a_u16 ]) ];
|
||||
0x000a_u16 ]),
|
||||
// Issue #12318, even-numbered non-BMP planes
|
||||
(~"\U00020000",
|
||||
~[0xD840, 0xDC00])];
|
||||
|
||||
for p in pairs.iter() {
|
||||
let (s, u) = (*p).clone();
|
||||
assert!(s.to_utf16() == u);
|
||||
assert!(from_utf16(u) == s);
|
||||
assert!(from_utf16(s.to_utf16()) == s);
|
||||
assert!(from_utf16(u).to_utf16() == u);
|
||||
assert!(is_utf16(u));
|
||||
assert_eq!(s.to_utf16(), u);
|
||||
|
||||
assert_eq!(from_utf16(u).unwrap(), s);
|
||||
assert_eq!(from_utf16_lossy(u), s);
|
||||
|
||||
assert_eq!(from_utf16(s.to_utf16()).unwrap(), s);
|
||||
assert_eq!(from_utf16(u).unwrap().to_utf16(), u);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_utf16_invalid() {
|
||||
// completely positive cases tested above.
|
||||
// lead + eof
|
||||
assert_eq!(from_utf16([0xD800]), None);
|
||||
// lead + lead
|
||||
assert_eq!(from_utf16([0xD800, 0xD800]), None);
|
||||
|
||||
// isolated trail
|
||||
assert_eq!(from_utf16([0x0061, 0xDC00]), None);
|
||||
|
||||
// general
|
||||
assert_eq!(from_utf16([0xD800, 0xd801, 0xdc8b, 0xD800]), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_utf16_lossy() {
|
||||
// completely positive cases tested above.
|
||||
// lead + eof
|
||||
assert_eq!(from_utf16_lossy([0xD800]), ~"\uFFFD");
|
||||
// lead + lead
|
||||
assert_eq!(from_utf16_lossy([0xD800, 0xD800]), ~"\uFFFD\uFFFD");
|
||||
|
||||
// isolated trail
|
||||
assert_eq!(from_utf16_lossy([0x0061, 0xDC00]), ~"a\uFFFD");
|
||||
|
||||
// general
|
||||
assert_eq!(from_utf16_lossy([0xD800, 0xd801, 0xdc8b, 0xD800]), ~"\uFFFD𐒋\uFFFD");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_truncate_utf16_at_nul() {
|
||||
let v = [];
|
||||
assert_eq!(truncate_utf16_at_nul(v), &[]);
|
||||
|
||||
let v = [0, 2, 3];
|
||||
assert_eq!(truncate_utf16_at_nul(v), &[]);
|
||||
|
||||
let v = [1, 0, 3];
|
||||
assert_eq!(truncate_utf16_at_nul(v), &[1]);
|
||||
|
||||
let v = [1, 2, 0];
|
||||
assert_eq!(truncate_utf16_at_nul(v), &[1, 2]);
|
||||
|
||||
let v = [1, 2, 3];
|
||||
assert_eq!(truncate_utf16_at_nul(v), &[1, 2, 3]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_char_at() {
|
||||
let s = ~"ศไทย中华Việt Nam";
|
||||
|
Loading…
x
Reference in New Issue
Block a user