auto merge of #10621 : Florob/rust/unicode63, r=cmr
This updates the unicode.rs file to the latest Unicode version, released 2013-09-30.
This commit is contained in:
commit
503e5df3f2
@ -5,7 +5,7 @@
|
||||
# code covering the core properties. Since this is a pretty rare event we
|
||||
# just store this out-of-line and check the unicode.rs file into git.
|
||||
#
|
||||
# The emitted code is "the minimum we think is necessary for libcore", that
|
||||
# The emitted code is "the minimum we think is necessary for libstd", that
|
||||
# is, to support basic operations of the compiler and "most nontrivial rust
|
||||
# programs". It is not meant to be a complete implementation of unicode.
|
||||
# For that we recommend you use a proper binding to libicu.
|
||||
@ -41,7 +41,7 @@ def load_unicode_data(f):
|
||||
continue
|
||||
[code, name, gencat, combine, bidi,
|
||||
decomp, deci, digit, num, mirror,
|
||||
old, iso, upcase, lowcsae, titlecase ] = fields
|
||||
old, iso, upcase, lowcase, titlecase ] = fields
|
||||
|
||||
code = int(code, 16)
|
||||
|
||||
@ -89,11 +89,9 @@ def load_unicode_data(f):
|
||||
|
||||
return (canon_decomp, compat_decomp, gencats, combines)
|
||||
|
||||
|
||||
def load_derived_core_properties(f):
|
||||
def load_properties(f, interestingprops):
|
||||
fetch(f)
|
||||
derivedprops = {}
|
||||
interestingprops = ["XID_Start", "XID_Continue", "Alphabetic"]
|
||||
props = {}
|
||||
re1 = re.compile("^([0-9A-F]+) +; (\w+)")
|
||||
re2 = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+) +; (\w+)")
|
||||
|
||||
@ -118,10 +116,10 @@ def load_derived_core_properties(f):
|
||||
continue
|
||||
d_lo = int(d_lo, 16)
|
||||
d_hi = int(d_hi, 16)
|
||||
if prop not in derivedprops:
|
||||
derivedprops[prop] = []
|
||||
derivedprops[prop].append((d_lo, d_hi))
|
||||
return derivedprops
|
||||
if prop not in props:
|
||||
props[prop] = []
|
||||
props[prop].append((d_lo, d_hi))
|
||||
return props
|
||||
|
||||
def escape_char(c):
|
||||
if c <= 0xff:
|
||||
@ -144,7 +142,7 @@ def emit_bsearch_range_table(f):
|
||||
use cmp::{Equal, Less, Greater};
|
||||
use vec::ImmutableVector;
|
||||
use option::None;
|
||||
(do r.bsearch |&(lo,hi)| {
|
||||
r.bsearch(|&(lo,hi)| {
|
||||
if lo <= c && c <= hi { Equal }
|
||||
else if hi < c { Less }
|
||||
else { Greater }
|
||||
@ -302,14 +300,14 @@ def emit_decomp_module(f, canon, compat, combine):
|
||||
ix += 1
|
||||
f.write("\n ];\n")
|
||||
|
||||
f.write(" pub fn canonical(c: char, i: &fn(char)) "
|
||||
f.write(" pub fn canonical(c: char, i: |char|) "
|
||||
+ "{ d(c, i, false); }\n\n")
|
||||
f.write(" pub fn compatibility(c: char, i: &fn(char)) "
|
||||
f.write(" pub fn compatibility(c: char, i: |char|) "
|
||||
+"{ d(c, i, true); }\n\n")
|
||||
f.write(" pub fn canonical_combining_class(c: char) -> u8 {\n"
|
||||
+ " bsearch_range_value_table(c, combining_class_table)\n"
|
||||
+ " }\n\n")
|
||||
f.write(" fn d(c: char, i: &fn(char), k: bool) {\n")
|
||||
f.write(" fn d(c: char, i: |char|, k: bool) {\n")
|
||||
f.write(" use iter::Iterator;\n");
|
||||
|
||||
f.write(" if c <= '\\x7f' { i(c); return; }\n")
|
||||
@ -376,5 +374,9 @@ emit_property_module(rf, "general_category", gencats)
|
||||
|
||||
emit_decomp_module(rf, canon_decomp, compat_decomp, combines)
|
||||
|
||||
derived = load_derived_core_properties("DerivedCoreProperties.txt")
|
||||
derived = load_properties("DerivedCoreProperties.txt",
|
||||
["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"])
|
||||
emit_property_module(rf, "derived_property", derived)
|
||||
|
||||
props = load_properties("PropList.txt", ["White_Space"])
|
||||
emit_property_module(rf, "property", props)
|
||||
|
@ -14,7 +14,7 @@ use cast::transmute;
|
||||
use option::{None, Option, Some};
|
||||
use iter::{Iterator, range_step};
|
||||
use str::StrSlice;
|
||||
use unicode::{derived_property, general_category, decompose};
|
||||
use unicode::{derived_property, property, general_category, decompose};
|
||||
use to_str::ToStr;
|
||||
use str;
|
||||
|
||||
@ -89,30 +89,28 @@ pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }
|
||||
|
||||
///
|
||||
/// Indicates whether a character is in lower case, defined
|
||||
/// in terms of the Unicode General Category 'Ll'
|
||||
/// in terms of the Unicode Derived Core Property 'Lowercase'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_lowercase(c: char) -> bool { general_category::Ll(c) }
|
||||
pub fn is_lowercase(c: char) -> bool { derived_property::Lowercase(c) }
|
||||
|
||||
///
|
||||
/// Indicates whether a character is in upper case, defined
|
||||
/// in terms of the Unicode General Category 'Lu'.
|
||||
/// in terms of the Unicode Derived Core Property 'Uppercase'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_uppercase(c: char) -> bool { general_category::Lu(c) }
|
||||
pub fn is_uppercase(c: char) -> bool { derived_property::Uppercase(c) }
|
||||
|
||||
///
|
||||
/// Indicates whether a character is whitespace. Whitespace is defined in
|
||||
/// terms of the Unicode General Categories 'Zs', 'Zl', 'Zp'
|
||||
/// additional 'Cc'-category control codes in the range [0x09, 0x0d]
|
||||
/// terms of the Unicode Property 'White_Space'.
|
||||
///
|
||||
#[inline]
|
||||
pub fn is_whitespace(c: char) -> bool {
|
||||
// As an optimization ASCII whitespace characters are checked separately
|
||||
c == ' '
|
||||
|| ('\x09' <= c && c <= '\x0d')
|
||||
|| general_category::Zs(c)
|
||||
|| general_category::Zl(c)
|
||||
|| general_category::Zp(c)
|
||||
|| property::White_Space(c)
|
||||
}
|
||||
|
||||
///
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -51,34 +51,34 @@ fn f() {
|
||||
CR4+2: (should align)
|
||||
*/
|
||||
/*
|
||||
// (NEL deliberately omitted)
|
||||
NEL4+2: (should align)
|
||||
*/
|
||||
/*
|
||||
Ogham Space Mark 4+2: (should align)
|
||||
*/
|
||||
/*
|
||||
Mongolian Vowel Separator 4+2: (should align)
|
||||
Ogham Space Mark 4+2: (should align)
|
||||
*/
|
||||
/*
|
||||
Four-per-em space 4+2: (should align)
|
||||
*/
|
||||
|
||||
/*
|
||||
Mongolian Vowel Sep count 1: (should align)
|
||||
Mongolian Vowel Sep count 2: (should align)
|
||||
Mongolian Vowel Sep count 3: (should align)
|
||||
Mongolian Vowel Sep count 4: (should align)
|
||||
Mongolian Vowel Sep count 5: (should align)
|
||||
Mongolian Vowel Sep count 6: (should align)
|
||||
Mongolian Vowel Sep count 7: (should align)
|
||||
Mongolian Vowel Sep count 8: (should align)
|
||||
Mongolian Vowel Sep count 9: (should align)
|
||||
Mongolian Vowel Sep count A: (should align)
|
||||
Mongolian Vowel Sep count B: (should align)
|
||||
Mongolian Vowel Sep count C: (should align)
|
||||
Mongolian Vowel Sep count D: (should align)
|
||||
Mongolian Vowel Sep count E: (should align)
|
||||
Mongolian Vowel Sep count F: (should align)
|
||||
Ogham Space Mark count 1: (should align)
|
||||
Ogham Space Mark count 2: (should align)
|
||||
Ogham Space Mark count 3: (should align)
|
||||
Ogham Space Mark count 4: (should align)
|
||||
Ogham Space Mark count 5: (should align)
|
||||
Ogham Space Mark count 6: (should align)
|
||||
Ogham Space Mark count 7: (should align)
|
||||
Ogham Space Mark count 8: (should align)
|
||||
Ogham Space Mark count 9: (should align)
|
||||
Ogham Space Mark count A: (should align)
|
||||
Ogham Space Mark count B: (should align)
|
||||
Ogham Space Mark count C: (should align)
|
||||
Ogham Space Mark count D: (should align)
|
||||
Ogham Space Mark count E: (should align)
|
||||
Ogham Space Mark count F: (should align)
|
||||
*/
|
||||
|
||||
|
||||
@ -88,26 +88,25 @@ fn f() {
|
||||
/*
|
||||
Hello from offset 6
|
||||
Space 6+2: compare A
|
||||
Mongolian Vowel Separator 6+2: compare B
|
||||
Ogham Space Mark 6+2: compare B
|
||||
*/
|
||||
|
||||
/**/
|
||||
/* */
|
||||
|
||||
/*
|
||||
Hello from another offset 6 with wchars establishing column offset
|
||||
Space 6+2: compare C
|
||||
Mongolian Vowel Separator 6+2: compare D
|
||||
Ogham Space Mark 6+2: compare D
|
||||
*/
|
||||
}
|
||||
|
||||
fn main() {
|
||||
// Taken from http://en.wikipedia.org/wiki/Whitespace_character
|
||||
// Taken from http://www.unicode.org/Public/UNIDATA/PropList.txt
|
||||
let chars =
|
||||
['\x0A', '\x0B', '\x0C', '\x0D', '\x20',
|
||||
// '\x85', // for some reason Rust thinks NEL isn't whitespace
|
||||
'\xA0', '\u1680', '\u180E', '\u2000', '\u2001', '\u2002', '\u2003',
|
||||
'\u2004', '\u2005', '\u2006', '\u2007', '\u2008', '\u2009', '\u200A',
|
||||
'\u2028', '\u2029', '\u202F', '\u205F', '\u3000'];
|
||||
['\x0A', '\x0B', '\x0C', '\x0D', '\x20', '\x85', '\xA0', '\u1680',
|
||||
'\u2000', '\u2001', '\u2002', '\u2003', '\u2004', '\u2005', '\u2006',
|
||||
'\u2007', '\u2008', '\u2009', '\u200A', '\u2028', '\u2029', '\u202F',
|
||||
'\u205F', '\u3000'];
|
||||
for c in chars.iter() {
|
||||
let ws = c.is_whitespace();
|
||||
println!("{:?} {:?}" , c , ws);
|
||||
|
@ -51,55 +51,54 @@ fn f() {
|
||||
CR4+2: (should align)
|
||||
*/
|
||||
/*
|
||||
// (NEL deliberately omitted)
|
||||
NEL4+2: (should align)
|
||||
*/
|
||||
/*
|
||||
Ogham Space Mark 4+2: (should align)
|
||||
*/
|
||||
/*
|
||||
Mongolian Vowel Separator 4+2: (should align)
|
||||
Ogham Space Mark 4+2: (should align)
|
||||
*/
|
||||
/*
|
||||
Four-per-em space 4+2: (should align)
|
||||
*/
|
||||
|
||||
/*
|
||||
Mongolian Vowel Sep count 1: (should align)
|
||||
Mongolian Vowel Sep count 2: (should align)
|
||||
Mongolian Vowel Sep count 3: (should align)
|
||||
Mongolian Vowel Sep count 4: (should align)
|
||||
Mongolian Vowel Sep count 5: (should align)
|
||||
Mongolian Vowel Sep count 6: (should align)
|
||||
Mongolian Vowel Sep count 7: (should align)
|
||||
Mongolian Vowel Sep count 8: (should align)
|
||||
Mongolian Vowel Sep count 9: (should align)
|
||||
Mongolian Vowel Sep count A: (should align)
|
||||
Mongolian Vowel Sep count B: (should align)
|
||||
Mongolian Vowel Sep count C: (should align)
|
||||
Mongolian Vowel Sep count D: (should align)
|
||||
Mongolian Vowel Sep count E: (should align)
|
||||
Mongolian Vowel Sep count F: (should align)
|
||||
Ogham Space Mark count 1: (should align)
|
||||
Ogham Space Mark count 2: (should align)
|
||||
Ogham Space Mark count 3: (should align)
|
||||
Ogham Space Mark count 4: (should align)
|
||||
Ogham Space Mark count 5: (should align)
|
||||
Ogham Space Mark count 6: (should align)
|
||||
Ogham Space Mark count 7: (should align)
|
||||
Ogham Space Mark count 8: (should align)
|
||||
Ogham Space Mark count 9: (should align)
|
||||
Ogham Space Mark count A: (should align)
|
||||
Ogham Space Mark count B: (should align)
|
||||
Ogham Space Mark count C: (should align)
|
||||
Ogham Space Mark count D: (should align)
|
||||
Ogham Space Mark count E: (should align)
|
||||
Ogham Space Mark count F: (should align)
|
||||
*/
|
||||
|
||||
|
||||
/* */ /*
|
||||
Hello from offset 6
|
||||
Space 6+2: compare A
|
||||
Mongolian Vowel Separator 6+2: compare B
|
||||
Ogham Space Mark 6+2: compare B
|
||||
*/
|
||||
/**/ /*
|
||||
/* */ /*
|
||||
Hello from another offset 6 with wchars establishing column offset
|
||||
Space 6+2: compare C
|
||||
Mongolian Vowel Separator 6+2: compare D
|
||||
Ogham Space Mark 6+2: compare D
|
||||
*/
|
||||
}
|
||||
|
||||
fn main() {
|
||||
// Taken from http://en.wikipedia.org/wiki/Whitespace_character
|
||||
// Taken from http://www.unicode.org/Public/UNIDATA/PropList.txt
|
||||
let chars =
|
||||
['\x0A', '\x0B', '\x0C', '\x0D', '\x20',
|
||||
// '\x85', // for some reason Rust thinks NEL isn't whitespace
|
||||
'\xA0', '\u1680', '\u180E', '\u2000', '\u2001', '\u2002', '\u2003',
|
||||
['\x0A', '\x0B', '\x0C', '\x0D', '\x20', '\x85',
|
||||
'\xA0', '\u1680', '\u2000', '\u2001', '\u2002', '\u2003',
|
||||
'\u2004', '\u2005', '\u2006', '\u2007', '\u2008', '\u2009', '\u200A',
|
||||
'\u2028', '\u2029', '\u202F', '\u205F', '\u3000'];
|
||||
for c in chars.iter() {
|
||||
|
Loading…
x
Reference in New Issue
Block a user