From 031f9b15df3df5da19b64a1f824463053898d021 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 1 Mar 2017 22:41:44 +0100 Subject: [PATCH 1/2] Only keep one copy of the UTF8_CHAR_WIDTH table. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … instead of one of each of libcore and libstd_unicode. Move the `utf8_char_width` function to `core::str` under the `str_internals` unstable feature. --- src/libcollections/lib.rs | 1 + src/libcollections/string.rs | 4 ++-- src/libcore/str/mod.rs | 7 +++++++ src/libstd/io/mod.rs | 2 +- src/libstd_unicode/lib.rs | 2 +- src/libstd_unicode/u_str.rs | 26 -------------------------- 6 files changed, 12 insertions(+), 30 deletions(-) diff --git a/src/libcollections/lib.rs b/src/libcollections/lib.rs index 53d5466e12b..f88bdd0ecf3 100644 --- a/src/libcollections/lib.rs +++ b/src/libcollections/lib.rs @@ -54,6 +54,7 @@ #![feature(slice_patterns)] #![feature(specialization)] #![feature(staged_api)] +#![feature(str_internals)] #![feature(trusted_len)] #![feature(unicode)] #![feature(unique)] diff --git a/src/libcollections/string.rs b/src/libcollections/string.rs index 4b37aef860d..c3edba669f0 100644 --- a/src/libcollections/string.rs +++ b/src/libcollections/string.rs @@ -62,9 +62,9 @@ use core::mem; use core::ops::{self, Add, AddAssign, Index, IndexMut}; use core::ptr; +use core::str as core_str; use core::str::pattern::Pattern; use std_unicode::char::{decode_utf16, REPLACEMENT_CHARACTER}; -use std_unicode::str as unicode_str; use borrow::{Cow, ToOwned}; use range::RangeArgument; @@ -575,7 +575,7 @@ macro_rules! error { () => ({ if byte < 128 { // subseqidx handles this } else { - let w = unicode_str::utf8_char_width(byte); + let w = core_str::utf8_char_width(byte); match w { 2 => { diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs index 925cd84154a..52e33016310 100644 --- a/src/libcore/str/mod.rs +++ b/src/libcore/str/mod.rs @@ -1352,6 +1352,13 @@ macro_rules! 
next { () => {{ 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF ]; +/// Given a first byte, determine how many bytes are in this UTF-8 character +#[unstable(feature = "str_internals", issue = "0")] +#[inline] +pub fn utf8_char_width(b: u8) -> usize { + return UTF8_CHAR_WIDTH[b as usize] as usize; +} + /// Mask of the value bits of a continuation byte const CONT_MASK: u8 = 0b0011_1111; /// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte diff --git a/src/libstd/io/mod.rs b/src/libstd/io/mod.rs index 8cb7b2bda75..58788cdcd4c 100644 --- a/src/libstd/io/mod.rs +++ b/src/libstd/io/mod.rs @@ -256,7 +256,7 @@ #![stable(feature = "rust1", since = "1.0.0")] use cmp; -use std_unicode::str as core_str; +use core::str as core_str; use error as std_error; use fmt; use result; diff --git a/src/libstd_unicode/lib.rs b/src/libstd_unicode/lib.rs index d52d1549b51..1adf00e40f1 100644 --- a/src/libstd_unicode/lib.rs +++ b/src/libstd_unicode/lib.rs @@ -47,7 +47,7 @@ #[allow(deprecated)] pub mod str { pub use u_str::{SplitWhitespace, UnicodeStr}; - pub use u_str::{is_utf16, utf8_char_width}; + pub use u_str::is_utf16; pub use u_str::Utf16Encoder; } diff --git a/src/libstd_unicode/u_str.rs b/src/libstd_unicode/u_str.rs index 1c7894794c9..0ca6db9b0de 100644 --- a/src/libstd_unicode/u_str.rs +++ b/src/libstd_unicode/u_str.rs @@ -77,32 +77,6 @@ fn trim_right(&self) -> &str { } } -// https://tools.ietf.org/html/rfc3629 -static UTF8_CHAR_WIDTH: [u8; 256] = [ -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF -0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, -2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 
0xDF -3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF -4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF -]; - -/// Given a first byte, determine how many bytes are in this UTF-8 character -#[inline] -pub fn utf8_char_width(b: u8) -> usize { - return UTF8_CHAR_WIDTH[b as usize] as usize; -} - /// Determines if a vector of `u16` contains valid UTF-16 pub fn is_utf16(v: &[u16]) -> bool { let mut it = v.iter(); From 24b39c51af8b7320fd825a66a239a497f20b0ece Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 1 Mar 2017 23:01:09 +0100 Subject: [PATCH 2/2] Remove std_unicode::str::is_utf16 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It was only accessible through the `#[unstable]` crate std_unicode. It has never been used in the compiler or standard library since 47e7a05a28c9662159af2d2e0f2b7efc13fa09cb added it in 2012 “for OS API interop”. It can be replaced with a one-liner: ```rust fn is_utf16(slice: &[u16]) -> bool { std::char::decode_utf16(slice.iter().cloned()).all(|r| r.is_ok()) } ``` --- src/libcollectionstest/str.rs | 65 -------------------------------- src/libcollectionstest/string.rs | 2 +- src/libstd_unicode/lib.rs | 1 - src/libstd_unicode/u_str.rs | 22 ----------- 4 files changed, 1 insertion(+), 89 deletions(-) diff --git a/src/libcollectionstest/str.rs b/src/libcollectionstest/str.rs index 6221888f5e5..8071c7e8c20 100644 --- a/src/libcollectionstest/str.rs +++ b/src/libcollectionstest/str.rs @@ -540,71 +540,6 @@ fn from_utf8_mostly_ascii() { } } -#[test] -fn test_is_utf16() { - use std_unicode::str::is_utf16; - - macro_rules! 
pos { - ($($e:expr),*) => { { $(assert!(is_utf16($e));)* } } - } - - // non-surrogates - pos!(&[0x0000], - &[0x0001, 0x0002], - &[0xD7FF], - &[0xE000]); - - // surrogate pairs (randomly generated with Python 3's - // .encode('utf-16be')) - pos!(&[0xdb54, 0xdf16, 0xd880, 0xdee0, 0xdb6a, 0xdd45], - &[0xd91f, 0xdeb1, 0xdb31, 0xdd84, 0xd8e2, 0xde14], - &[0xdb9f, 0xdc26, 0xdb6f, 0xde58, 0xd850, 0xdfae]); - - // mixtures (also random) - pos!(&[0xd921, 0xdcc2, 0x002d, 0x004d, 0xdb32, 0xdf65], - &[0xdb45, 0xdd2d, 0x006a, 0xdacd, 0xddfe, 0x0006], - &[0x0067, 0xd8ff, 0xddb7, 0x000f, 0xd900, 0xdc80]); - - // negative tests - macro_rules! neg { - ($($e:expr),*) => { { $(assert!(!is_utf16($e));)* } } - } - - neg!( - // surrogate + regular unit - &[0xdb45, 0x0000], - // surrogate + lead surrogate - &[0xd900, 0xd900], - // unterminated surrogate - &[0xd8ff], - // trail surrogate without a lead - &[0xddb7]); - - // random byte sequences that Python 3's .decode('utf-16be') - // failed on - neg!(&[0x5b3d, 0x0141, 0xde9e, 0x8fdc, 0xc6e7], - &[0xdf5a, 0x82a5, 0x62b9, 0xb447, 0x92f3], - &[0xda4e, 0x42bc, 0x4462, 0xee98, 0xc2ca], - &[0xbe00, 0xb04a, 0x6ecb, 0xdd89, 0xe278], - &[0x0465, 0xab56, 0xdbb6, 0xa893, 0x665e], - &[0x6b7f, 0x0a19, 0x40f4, 0xa657, 0xdcc5], - &[0x9b50, 0xda5e, 0x24ec, 0x03ad, 0x6dee], - &[0x8d17, 0xcaa7, 0xf4ae, 0xdf6e, 0xbed7], - &[0xdaee, 0x2584, 0x7d30, 0xa626, 0x121a], - &[0xd956, 0x4b43, 0x7570, 0xccd6, 0x4f4a], - &[0x9dcf, 0x1b49, 0x4ba5, 0xfce9, 0xdffe], - &[0x6572, 0xce53, 0xb05a, 0xf6af, 0xdacf], - &[0x1b90, 0x728c, 0x9906, 0xdb68, 0xf46e], - &[0x1606, 0xbeca, 0xbe76, 0x860f, 0xdfa5], - &[0x8b4f, 0xde7a, 0xd220, 0x9fac, 0x2b6f], - &[0xb8fe, 0xebbe, 0xda32, 0x1a5f, 0x8b8b], - &[0x934b, 0x8956, 0xc434, 0x1881, 0xddf7], - &[0x5a95, 0x13fc, 0xf116, 0xd89b, 0x93f9], - &[0xd640, 0x71f1, 0xdd7d, 0x77eb, 0x1cd8], - &[0x348b, 0xaef0, 0xdb2c, 0xebf1, 0x1282], - &[0x50d7, 0xd824, 0x5010, 0xb369, 0x22ea]); -} - #[test] fn test_as_bytes() { // no null diff --git 
a/src/libcollectionstest/string.rs b/src/libcollectionstest/string.rs index f77dd510303..2f021b9935d 100644 --- a/src/libcollectionstest/string.rs +++ b/src/libcollectionstest/string.rs @@ -129,7 +129,7 @@ fn test_from_utf16() { let s_as_utf16 = s.encode_utf16().collect::<Vec<u16>>(); let u_as_string = String::from_utf16(&u).unwrap(); - assert!(::std_unicode::str::is_utf16(&u)); + assert!(::std_unicode::char::decode_utf16(u.iter().cloned()).all(|r| r.is_ok())); assert_eq!(s_as_utf16, u); assert_eq!(u_as_string, s); diff --git a/src/libstd_unicode/lib.rs b/src/libstd_unicode/lib.rs index 1adf00e40f1..7e5ab1a54ab 100644 --- a/src/libstd_unicode/lib.rs +++ b/src/libstd_unicode/lib.rs @@ -47,7 +47,6 @@ #[allow(deprecated)] pub mod str { pub use u_str::{SplitWhitespace, UnicodeStr}; - pub use u_str::is_utf16; pub use u_str::Utf16Encoder; } diff --git a/src/libstd_unicode/u_str.rs b/src/libstd_unicode/u_str.rs index 0ca6db9b0de..3c02ea82d2a 100644 --- a/src/libstd_unicode/u_str.rs +++ b/src/libstd_unicode/u_str.rs @@ -77,28 +77,6 @@ fn trim_right(&self) -> &str { } } -/// Determines if a vector of `u16` contains valid UTF-16 -pub fn is_utf16(v: &[u16]) -> bool { - let mut it = v.iter(); - macro_rules! next { ($ret:expr) => { - match it.next() { Some(u) => *u, None => return $ret } - } - } - loop { - let u = next!(true); - - match char::from_u32(u as u32) { - Some(_) => {} - None => { - let u2 = next!(false); - if u < 0xD7FF || u > 0xDBFF || u2 < 0xDC00 || u2 > 0xDFFF { - return false; - } - } - } - } -} - /// Iterator adaptor for encoding `char`s to UTF-16. #[derive(Clone)] pub struct Utf16Encoder {