Tokenize emoji as if they were valid identifiers
In the lexer, consider emojis to be valid identifiers and reject them later to avoid knock-on (cascading) parse errors.
This commit is contained in:
parent
311fa1f14d
commit
5a68abb094
42
Cargo.lock
42
Cargo.lock
@ -4040,6 +4040,7 @@ name = "rustc_lexer"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"expect-test",
|
||||
"unic-emoji-char",
|
||||
"unicode-xid",
|
||||
]
|
||||
|
||||
@ -5510,6 +5511,47 @@ version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"
|
||||
|
||||
[[package]]
|
||||
name = "unic-char-property"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221"
|
||||
dependencies = [
|
||||
"unic-char-range",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unic-char-range"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc"
|
||||
|
||||
[[package]]
|
||||
name = "unic-common"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc"
|
||||
|
||||
[[package]]
|
||||
name = "unic-emoji-char"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b07221e68897210270a38bde4babb655869637af0f69407f96053a34f76494d"
|
||||
dependencies = [
|
||||
"unic-char-property",
|
||||
"unic-char-range",
|
||||
"unic-ucd-version",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unic-ucd-version"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4"
|
||||
dependencies = [
|
||||
"unic-common",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicase"
|
||||
version = "2.6.0"
|
||||
|
@ -35,7 +35,7 @@ use rustc_session::output::{filename_for_input, filename_for_metadata};
|
||||
use rustc_session::search_paths::PathKind;
|
||||
use rustc_session::{Limit, Session};
|
||||
use rustc_span::symbol::{sym, Ident, Symbol};
|
||||
use rustc_span::FileName;
|
||||
use rustc_span::{FileName, MultiSpan};
|
||||
use rustc_trait_selection::traits;
|
||||
use rustc_typeck as typeck;
|
||||
use tempfile::Builder as TempFileBuilder;
|
||||
@ -450,6 +450,16 @@ pub fn configure_and_expand(
|
||||
});
|
||||
}
|
||||
|
||||
// Gate identifiers containing invalid Unicode codepoints that were recovered during lexing.
|
||||
sess.parse_sess.bad_unicode_identifiers.with_lock(|identifiers| {
|
||||
for (ident, spans) in identifiers.drain() {
|
||||
sess.diagnostic().span_err(
|
||||
MultiSpan::from(spans),
|
||||
&format!("identifiers cannot contain emojis: `{}`", ident),
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
Ok(krate)
|
||||
}
|
||||
|
||||
|
@ -17,6 +17,7 @@ doctest = false
|
||||
# Note that this crate purposefully does not depend on other rustc crates
|
||||
[dependencies]
|
||||
unicode-xid = "0.2.0"
|
||||
unic-emoji-char = "0.9.0"
|
||||
|
||||
[dev-dependencies]
|
||||
expect-test = "1.0"
|
||||
|
@ -64,6 +64,8 @@ pub enum TokenKind {
|
||||
/// "ident" or "continue"
|
||||
/// At this step keywords are also considered identifiers.
|
||||
Ident,
|
||||
/// Like `Ident`, but containing invalid Unicode codepoints.
|
||||
InvalidIdent,
|
||||
/// "r#ident"
|
||||
RawIdent,
|
||||
/// An unknown prefix like `foo#`, `foo'`, `foo"`. Note that only the
|
||||
@ -411,6 +413,11 @@ impl Cursor<'_> {
|
||||
let kind = Str { terminated };
|
||||
Literal { kind, suffix_start }
|
||||
}
|
||||
// Identifier (this should be checked after other variants that can
|
||||
// start as an identifier).
|
||||
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
|
||||
self.fake_ident_or_unknown_prefix()
|
||||
}
|
||||
_ => Unknown,
|
||||
};
|
||||
Token::new(token_kind, self.len_consumed())
|
||||
@ -492,10 +499,28 @@ impl Cursor<'_> {
|
||||
// we see a prefix here, it is definitely an unknown prefix.
|
||||
match self.first() {
|
||||
'#' | '"' | '\'' => UnknownPrefix,
|
||||
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
|
||||
self.fake_ident_or_unknown_prefix()
|
||||
}
|
||||
_ => Ident,
|
||||
}
|
||||
}
|
||||
|
||||
/// Lexes the remainder of an identifier-like token that contains an emoji.
///
/// The first character of the token has already been consumed by the caller.
/// This consumes every following character that is XID_Continue, a non-ASCII
/// emoji, or U+200D (ZERO WIDTH JOINER), and then classifies the token:
/// `UnknownPrefix` if the next character is `#`, `"` or `'`, otherwise
/// `InvalidIdent`.
fn fake_ident_or_unknown_prefix(&mut self) -> TokenKind {
|
||||
// Start is already eaten, eat the rest of identifier.
|
||||
self.eat_while(|c| {
|
||||
unicode_xid::UnicodeXID::is_xid_continue(c)
|
||||
// NOTE(review): the `!c.is_ascii()` guard presumably keeps ASCII characters that
// carry the Unicode Emoji property (digits, `#`, `*`) out of this branch — confirm.
|| (!c.is_ascii() && unic_emoji_char::is_emoji(c))
|
||||
// U+200D is ZERO WIDTH JOINER; accepting it keeps joined emoji sequences in one token.
|| c == '\u{200d}'
|
||||
});
|
||||
// Known prefixes must have been handled earlier. So if
|
||||
// we see a prefix here, it is definitely an unknown prefix.
|
||||
match self.first() {
|
||||
'#' | '"' | '\'' => UnknownPrefix,
|
||||
// Never plain `Ident`: this path is only reached once an emoji has been seen.
_ => InvalidIdent,
|
||||
}
|
||||
}
|
||||
|
||||
fn number(&mut self, first_digit: char) -> LiteralKind {
|
||||
debug_assert!('0' <= self.prev() && self.prev() <= '9');
|
||||
let mut base = Base::Decimal;
|
||||
|
@ -222,6 +222,12 @@ impl<'a> StringReader<'a> {
|
||||
}
|
||||
token::Ident(sym, is_raw_ident)
|
||||
}
|
||||
rustc_lexer::TokenKind::InvalidIdent => {
|
||||
let sym = nfc_normalize(self.str_from(start));
|
||||
let span = self.mk_sp(start, self.pos);
|
||||
self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span);
|
||||
token::Ident(sym, false)
|
||||
}
|
||||
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
|
||||
let suffix_start = start + BytePos(suffix_start as u32);
|
||||
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
|
||||
|
@ -119,8 +119,13 @@ pub struct ParseSess {
|
||||
pub config: CrateConfig,
|
||||
pub edition: Edition,
|
||||
pub missing_fragment_specifiers: Lock<FxHashMap<Span, NodeId>>,
|
||||
/// Places where raw identifiers were used. This is used for feature-gating raw identifiers.
|
||||
/// Places where raw identifiers were used. This is used to avoid complaining about idents
|
||||
/// clashing with keywords in new editions.
|
||||
pub raw_identifier_spans: Lock<Vec<Span>>,
|
||||
/// Spans of tokens that look like identifiers but contain invalid Unicode codepoints
|
||||
/// (e.g. emojis). Lexing them as identifiers avoids bad tokenization; the spans are grouped
|
||||
/// by symbol so that a single error is reported per unique incorrect identifier.
|
||||
pub bad_unicode_identifiers: Lock<FxHashMap<Symbol, Vec<Span>>>,
|
||||
source_map: Lrc<SourceMap>,
|
||||
pub buffered_lints: Lock<Vec<BufferedEarlyLint>>,
|
||||
/// Contains the spans of block expressions that could have been incomplete based on the
|
||||
@ -160,6 +165,7 @@ impl ParseSess {
|
||||
edition: ExpnId::root().expn_data().edition,
|
||||
missing_fragment_specifiers: Default::default(),
|
||||
raw_identifier_spans: Lock::new(Vec::new()),
|
||||
bad_unicode_identifiers: Lock::new(Default::default()),
|
||||
source_map,
|
||||
buffered_lints: Lock::new(vec![]),
|
||||
ambiguous_block_expr_parse: Lock::new(FxHashMap::default()),
|
||||
|
16
src/test/ui/parser/emoji-identifiers.rs
Normal file
16
src/test/ui/parser/emoji-identifiers.rs
Normal file
@ -0,0 +1,16 @@
|
||||
struct ABig👩👩👧👧Family; //~ ERROR identifiers cannot contain emojis
|
||||
struct 👀; //~ ERROR identifiers cannot contain emojis
|
||||
impl 👀 {
|
||||
fn full_of_✨() -> 👀 { //~ ERROR identifiers cannot contain emojis
|
||||
👀
|
||||
}
|
||||
}
|
||||
fn i_like_to_😅_a_lot() -> 👀 { //~ ERROR identifiers cannot contain emojis
|
||||
👀::full_of✨() //~ ERROR no function or associated item named `full_of✨` found for struct `👀`
|
||||
//~^ ERROR identifiers cannot contain emojis
|
||||
}
|
||||
fn main() {
|
||||
let _ = i_like_to_😄_a_lot(); //~ ERROR cannot find function `i_like_to_😄_a_lot` in this scope
|
||||
//~^ ERROR identifiers cannot contain emojis
|
||||
}
|
||||
|
72
src/test/ui/parser/emoji-identifiers.stderr
Normal file
72
src/test/ui/parser/emoji-identifiers.stderr
Normal file
@ -0,0 +1,72 @@
|
||||
error[E0425]: cannot find function `i_like_to_😄_a_lot` in this scope
|
||||
--> $DIR/emoji-identifiers.rs:13:13
|
||||
|
|
||||
LL | fn i_like_to_😅_a_lot() -> 👀 {
|
||||
| ----------------------------- similarly named function `i_like_to_😅_a_lot` defined here
|
||||
...
|
||||
LL | let _ = i_like_to_😄_a_lot();
|
||||
| ^^^^^^^^^^^^^^^^^^ help: a function with a similar name exists: `i_like_to_😅_a_lot`
|
||||
|
||||
error: identifiers cannot contain emojis: `i_like_to_😄_a_lot`
|
||||
--> $DIR/emoji-identifiers.rs:13:13
|
||||
|
|
||||
LL | let _ = i_like_to_😄_a_lot();
|
||||
| ^^^^^^^^^^^^^^^^^^
|
||||
|
||||
error: identifiers cannot contain emojis: `full_of_✨`
|
||||
--> $DIR/emoji-identifiers.rs:4:8
|
||||
|
|
||||
LL | fn full_of_✨() -> 👀 {
|
||||
| ^^^^^^^^^^
|
||||
|
||||
error: identifiers cannot contain emojis: `full_of✨`
|
||||
--> $DIR/emoji-identifiers.rs:9:8
|
||||
|
|
||||
LL | 👀::full_of✨()
|
||||
| ^^^^^^^^^
|
||||
|
||||
error: identifiers cannot contain emojis: `👀`
|
||||
--> $DIR/emoji-identifiers.rs:2:8
|
||||
|
|
||||
LL | struct 👀;
|
||||
| ^^
|
||||
LL | impl 👀 {
|
||||
| ^^
|
||||
LL | fn full_of_✨() -> 👀 {
|
||||
| ^^
|
||||
LL | 👀
|
||||
| ^^
|
||||
...
|
||||
LL | fn i_like_to_😅_a_lot() -> 👀 {
|
||||
| ^^
|
||||
LL | 👀::full_of✨()
|
||||
| ^^
|
||||
|
||||
error: identifiers cannot contain emojis: `i_like_to_😅_a_lot`
|
||||
--> $DIR/emoji-identifiers.rs:8:4
|
||||
|
|
||||
LL | fn i_like_to_😅_a_lot() -> 👀 {
|
||||
| ^^^^^^^^^^^^^^^^^^
|
||||
|
||||
error: identifiers cannot contain emojis: `ABig👩👩👧👧Family`
|
||||
--> $DIR/emoji-identifiers.rs:1:8
|
||||
|
|
||||
LL | struct ABig👩👩👧👧Family;
|
||||
| ^^^^^^^^^^^^^^^^^^
|
||||
|
||||
error[E0599]: no function or associated item named `full_of✨` found for struct `👀` in the current scope
|
||||
--> $DIR/emoji-identifiers.rs:9:8
|
||||
|
|
||||
LL | struct 👀;
|
||||
| ---------- function or associated item `full_of✨` not found for this
|
||||
...
|
||||
LL | 👀::full_of✨()
|
||||
| ^^^^^^^^^
|
||||
| |
|
||||
| function or associated item not found in `👀`
|
||||
| help: there is an associated function with a similar name: `full_of_✨`
|
||||
|
||||
error: aborting due to 8 previous errors
|
||||
|
||||
Some errors have detailed explanations: E0425, E0599.
|
||||
For more information about an error, try `rustc --explain E0425`.
|
Loading…
x
Reference in New Issue
Block a user