diff --git a/doc/rust.texi b/doc/rust.texi index d5bfb646ec4..c746015e78e 100644 --- a/doc/rust.texi +++ b/doc/rust.texi @@ -595,9 +595,10 @@ otherwise defined as keywords or reserved tokens. @xref{Ref.Lex.Key}. @xref{Ref.Lex.Res}. That is: an identifier starts with any character having derived property -@code{XID_Start} and continues with zero or more characters having derived -property @code{XID_Continue}; and such an identifier is NFKC-normalized during -lexing, such that all subsequent comparison of identifiers is performed on the +@code{XID_Start}, or the character U+005F (underscore, @code{_}), and +continues with zero or more characters having derived property +@code{XID_Continue}. An identifier is NFKC-normalized during lexing, such +that all subsequent comparison of identifiers is performed on the NFKC-normalized forms. @emph{TODO: define relationship between Unicode and Rust versions}. diff --git a/src/comp/syntax/parse/lexer.rs b/src/comp/syntax/parse/lexer.rs index 338c29ddb07..bee3d7783cb 100644 --- a/src/comp/syntax/parse/lexer.rs +++ b/src/comp/syntax/parse/lexer.rs @@ -309,14 +309,16 @@ fn next_token(rdr: reader) -> {tok: token::token, chpos: uint, bpos: uint} { fn next_token_inner(rdr: reader) -> token::token { let accum_str = ""; let c = rdr.curr(); - if is_alpha(c) || c == '_' { - while is_alnum(c) || c == '_' { + if char::is_XID_start(c) || c == '_' { + while char::is_XID_continue(c) { str::push_char(accum_str, c); rdr.bump(); c = rdr.curr(); } if str::eq(accum_str, "_") { ret token::UNDERSCORE; } let is_mod_name = c == ':' && rdr.next() == ':'; + + // FIXME: perform NFKC normalization here. 
ret token::IDENT(interner::intern::<str>(*rdr.get_interner(), accum_str), is_mod_name); } diff --git a/src/libcore/char.rs b/src/libcore/char.rs index 05ca5a3c59e..0bdda6f0afb 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -37,6 +37,12 @@ Utilities for manipulating the char type Cn Unassigned a reserved unassigned code point or a noncharacter */ +export is_alphabetic, + is_XID_start, is_XID_continue, + is_lowercase, is_uppercase, + is_whitespace, is_alphanumeric, + to_digit, maybe_digit, cmp; + import is_alphabetic = unicode::derived_property::Alphabetic; import is_XID_start = unicode::derived_property::XID_Start; import is_XID_continue = unicode::derived_property::XID_Continue; diff --git a/src/test/run-pass/utf8_idents.rs b/src/test/run-pass/utf8_idents.rs new file mode 100644 index 00000000000..e017673602a --- /dev/null +++ b/src/test/run-pass/utf8_idents.rs @@ -0,0 +1,34 @@ +fn main() { + let Π = 3.14; + let लंच = Π * Π + 1.54; + assert लंच - 1.54 == Π * Π; + assert საჭმელად_გემრიელი_სადილი() == 0; +} + +fn საჭმელად_გემრიელი_სადილი() -> int { + + // Lunch in several languages. + + let ランチ = 10; + let 午餐 = 10; + + let ארוחת_צהריי = 10; + let غداء = 10; + let լանչ = 10; + let обед = 10; + let абед = 10; + let μεσημεριανό = 10; + let hádegismatur = 10; + let ручек = 10; + + let ăn_trưa = 10; + let อาหารกลางวัน = 10; + + // Lunchy arithmetic, mm. + + assert hádegismatur * ручек * обед == 1000; + assert 10 == ארוחת_צהריי; + assert ランチ + 午餐 + μεσημεριανό == 30; + assert ăn_trưa + อาหารกลางวัน == 20; + ret (абед + լանչ) >> غداء; +}