Inline and remove `cook_lexer_token`.
This is a small performance win, alas.
This commit is contained in:
parent
da84f0f4c3
commit
fb4dba0a17
@ -86,13 +86,182 @@ fn next_token(&mut self) -> (Token, bool) {
|
||||
|
||||
debug!("next_token: {:?}({:?})", token.kind, self.str_from(start));
|
||||
|
||||
match self.cook_lexer_token(token.kind, start) {
|
||||
Some(kind) => {
|
||||
let span = self.mk_sp(start, self.pos);
|
||||
return (Token::new(kind, span), preceded_by_whitespace);
|
||||
// Now "cook" the token, converting the simple `rustc_lexer::TokenKind` enum into a
|
||||
// rich `rustc_ast::TokenKind`. This turns strings into interned symbols and runs
|
||||
// additional validation.
|
||||
let kind = match token.kind {
|
||||
rustc_lexer::TokenKind::LineComment { doc_style } => {
|
||||
// Skip non-doc comments
|
||||
let Some(doc_style) = doc_style else {
|
||||
self.lint_unicode_text_flow(start);
|
||||
preceded_by_whitespace = true;
|
||||
continue;
|
||||
};
|
||||
|
||||
// Opening delimiter of the length 3 is not included into the symbol.
|
||||
let content_start = start + BytePos(3);
|
||||
let content = self.str_from(content_start);
|
||||
self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
|
||||
}
|
||||
None => preceded_by_whitespace = true,
|
||||
}
|
||||
rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
|
||||
if !terminated {
|
||||
self.report_unterminated_block_comment(start, doc_style);
|
||||
}
|
||||
|
||||
// Skip non-doc comments
|
||||
let Some(doc_style) = doc_style else {
|
||||
self.lint_unicode_text_flow(start);
|
||||
preceded_by_whitespace = true;
|
||||
continue;
|
||||
};
|
||||
|
||||
// Opening delimiter of the length 3 and closing delimiter of the length 2
|
||||
// are not included into the symbol.
|
||||
let content_start = start + BytePos(3);
|
||||
let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
|
||||
let content = self.str_from_to(content_start, content_end);
|
||||
self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
|
||||
}
|
||||
rustc_lexer::TokenKind::Whitespace => {
|
||||
preceded_by_whitespace = true;
|
||||
continue;
|
||||
}
|
||||
rustc_lexer::TokenKind::Ident => {
|
||||
let sym = nfc_normalize(self.str_from(start));
|
||||
let span = self.mk_sp(start, self.pos);
|
||||
self.sess.symbol_gallery.insert(sym, span);
|
||||
token::Ident(sym, false)
|
||||
}
|
||||
rustc_lexer::TokenKind::RawIdent => {
|
||||
let sym = nfc_normalize(self.str_from(start + BytePos(2)));
|
||||
let span = self.mk_sp(start, self.pos);
|
||||
self.sess.symbol_gallery.insert(sym, span);
|
||||
if !sym.can_be_raw() {
|
||||
self.err_span(span, &format!("`{}` cannot be a raw identifier", sym));
|
||||
}
|
||||
self.sess.raw_identifier_spans.borrow_mut().push(span);
|
||||
token::Ident(sym, true)
|
||||
}
|
||||
rustc_lexer::TokenKind::UnknownPrefix => {
|
||||
self.report_unknown_prefix(start);
|
||||
let sym = nfc_normalize(self.str_from(start));
|
||||
let span = self.mk_sp(start, self.pos);
|
||||
self.sess.symbol_gallery.insert(sym, span);
|
||||
token::Ident(sym, false)
|
||||
}
|
||||
rustc_lexer::TokenKind::InvalidIdent
|
||||
// Do not recover an identifier with emoji if the codepoint is a confusable
|
||||
// with a recoverable substitution token, like `➖`.
|
||||
if !UNICODE_ARRAY
|
||||
.iter()
|
||||
.any(|&(c, _, _)| {
|
||||
let sym = self.str_from(start);
|
||||
sym.chars().count() == 1 && c == sym.chars().next().unwrap()
|
||||
}) =>
|
||||
{
|
||||
let sym = nfc_normalize(self.str_from(start));
|
||||
let span = self.mk_sp(start, self.pos);
|
||||
self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default()
|
||||
.push(span);
|
||||
token::Ident(sym, false)
|
||||
}
|
||||
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
|
||||
let suffix_start = start + BytePos(suffix_start);
|
||||
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
|
||||
let suffix = if suffix_start < self.pos {
|
||||
let string = self.str_from(suffix_start);
|
||||
if string == "_" {
|
||||
self.sess
|
||||
.span_diagnostic
|
||||
.struct_span_warn(
|
||||
self.mk_sp(suffix_start, self.pos),
|
||||
"underscore literal suffix is not allowed",
|
||||
)
|
||||
.warn(
|
||||
"this was previously accepted by the compiler but is \
|
||||
being phased out; it will become a hard error in \
|
||||
a future release!",
|
||||
)
|
||||
.note(
|
||||
"see issue #42326 \
|
||||
<https://github.com/rust-lang/rust/issues/42326> \
|
||||
for more information",
|
||||
)
|
||||
.emit();
|
||||
None
|
||||
} else {
|
||||
Some(Symbol::intern(string))
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
token::Literal(token::Lit { kind, symbol, suffix })
|
||||
}
|
||||
rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
|
||||
// Include the leading `'` in the real identifier, for macro
|
||||
// expansion purposes. See #12512 for the gory details of why
|
||||
// this is necessary.
|
||||
let lifetime_name = self.str_from(start);
|
||||
if starts_with_number {
|
||||
self.err_span_(start, self.pos, "lifetimes cannot start with a number");
|
||||
}
|
||||
let ident = Symbol::intern(lifetime_name);
|
||||
token::Lifetime(ident)
|
||||
}
|
||||
rustc_lexer::TokenKind::Semi => token::Semi,
|
||||
rustc_lexer::TokenKind::Comma => token::Comma,
|
||||
rustc_lexer::TokenKind::Dot => token::Dot,
|
||||
rustc_lexer::TokenKind::OpenParen => token::OpenDelim(Delimiter::Parenthesis),
|
||||
rustc_lexer::TokenKind::CloseParen => token::CloseDelim(Delimiter::Parenthesis),
|
||||
rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(Delimiter::Brace),
|
||||
rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(Delimiter::Brace),
|
||||
rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(Delimiter::Bracket),
|
||||
rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(Delimiter::Bracket),
|
||||
rustc_lexer::TokenKind::At => token::At,
|
||||
rustc_lexer::TokenKind::Pound => token::Pound,
|
||||
rustc_lexer::TokenKind::Tilde => token::Tilde,
|
||||
rustc_lexer::TokenKind::Question => token::Question,
|
||||
rustc_lexer::TokenKind::Colon => token::Colon,
|
||||
rustc_lexer::TokenKind::Dollar => token::Dollar,
|
||||
rustc_lexer::TokenKind::Eq => token::Eq,
|
||||
rustc_lexer::TokenKind::Bang => token::Not,
|
||||
rustc_lexer::TokenKind::Lt => token::Lt,
|
||||
rustc_lexer::TokenKind::Gt => token::Gt,
|
||||
rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus),
|
||||
rustc_lexer::TokenKind::And => token::BinOp(token::And),
|
||||
rustc_lexer::TokenKind::Or => token::BinOp(token::Or),
|
||||
rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus),
|
||||
rustc_lexer::TokenKind::Star => token::BinOp(token::Star),
|
||||
rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash),
|
||||
rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
|
||||
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
|
||||
|
||||
rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
|
||||
let c = self.str_from(start).chars().next().unwrap();
|
||||
let mut err =
|
||||
self.struct_err_span_char(start, self.pos, "unknown start of token", c);
|
||||
// FIXME: the lexer could be used to turn the ASCII version of unicode
|
||||
// homoglyphs, instead of keeping a table in `check_for_substitution`into the
|
||||
// token. Ideally, this should be inside `rustc_lexer`. However, we should
|
||||
// first remove compound tokens like `<<` from `rustc_lexer`, and then add
|
||||
// fancier error recovery to it, as there will be less overall work to do this
|
||||
// way.
|
||||
let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
|
||||
if c == '\x00' {
|
||||
err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
|
||||
}
|
||||
err.emit();
|
||||
if let Some(token) = token {
|
||||
token
|
||||
} else {
|
||||
preceded_by_whitespace = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
rustc_lexer::TokenKind::Eof => token::Eof,
|
||||
};
|
||||
let span = self.mk_sp(start, self.pos);
|
||||
return (Token::new(kind, span), preceded_by_whitespace);
|
||||
}
|
||||
}
|
||||
|
||||
@ -158,172 +327,6 @@ fn lint_unicode_text_flow(&self, start: BytePos) {
|
||||
}
|
||||
}
|
||||
|
||||
/// Turns simple `rustc_lexer::TokenKind` enum into a rich
|
||||
/// `rustc_ast::TokenKind`. This turns strings into interned
|
||||
/// symbols and runs additional validation.
|
||||
fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> Option<TokenKind> {
|
||||
Some(match token {
|
||||
rustc_lexer::TokenKind::LineComment { doc_style } => {
|
||||
// Skip non-doc comments
|
||||
let Some(doc_style) = doc_style else {
|
||||
self.lint_unicode_text_flow(start);
|
||||
return None;
|
||||
};
|
||||
|
||||
// Opening delimiter of the length 3 is not included into the symbol.
|
||||
let content_start = start + BytePos(3);
|
||||
let content = self.str_from(content_start);
|
||||
self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
|
||||
}
|
||||
rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
|
||||
if !terminated {
|
||||
self.report_unterminated_block_comment(start, doc_style);
|
||||
}
|
||||
|
||||
// Skip non-doc comments
|
||||
let Some(doc_style) = doc_style else {
|
||||
self.lint_unicode_text_flow(start);
|
||||
return None;
|
||||
};
|
||||
|
||||
// Opening delimiter of the length 3 and closing delimiter of the length 2
|
||||
// are not included into the symbol.
|
||||
let content_start = start + BytePos(3);
|
||||
let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
|
||||
let content = self.str_from_to(content_start, content_end);
|
||||
self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
|
||||
}
|
||||
rustc_lexer::TokenKind::Whitespace => return None,
|
||||
rustc_lexer::TokenKind::Ident => {
|
||||
let sym = nfc_normalize(self.str_from(start));
|
||||
let span = self.mk_sp(start, self.pos);
|
||||
self.sess.symbol_gallery.insert(sym, span);
|
||||
token::Ident(sym, false)
|
||||
}
|
||||
rustc_lexer::TokenKind::RawIdent => {
|
||||
let sym = nfc_normalize(self.str_from(start + BytePos(2)));
|
||||
let span = self.mk_sp(start, self.pos);
|
||||
self.sess.symbol_gallery.insert(sym, span);
|
||||
if !sym.can_be_raw() {
|
||||
self.err_span(span, &format!("`{}` cannot be a raw identifier", sym));
|
||||
}
|
||||
self.sess.raw_identifier_spans.borrow_mut().push(span);
|
||||
token::Ident(sym, true)
|
||||
}
|
||||
rustc_lexer::TokenKind::UnknownPrefix => {
|
||||
self.report_unknown_prefix(start);
|
||||
let sym = nfc_normalize(self.str_from(start));
|
||||
let span = self.mk_sp(start, self.pos);
|
||||
self.sess.symbol_gallery.insert(sym, span);
|
||||
token::Ident(sym, false)
|
||||
}
|
||||
rustc_lexer::TokenKind::InvalidIdent
|
||||
// Do not recover an identifier with emoji if the codepoint is a confusable
|
||||
// with a recoverable substitution token, like `➖`.
|
||||
if !UNICODE_ARRAY
|
||||
.iter()
|
||||
.any(|&(c, _, _)| {
|
||||
let sym = self.str_from(start);
|
||||
sym.chars().count() == 1 && c == sym.chars().next().unwrap()
|
||||
})
|
||||
=>
|
||||
{
|
||||
let sym = nfc_normalize(self.str_from(start));
|
||||
let span = self.mk_sp(start, self.pos);
|
||||
self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span);
|
||||
token::Ident(sym, false)
|
||||
}
|
||||
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
|
||||
let suffix_start = start + BytePos(suffix_start);
|
||||
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
|
||||
let suffix = if suffix_start < self.pos {
|
||||
let string = self.str_from(suffix_start);
|
||||
if string == "_" {
|
||||
self.sess
|
||||
.span_diagnostic
|
||||
.struct_span_warn(
|
||||
self.mk_sp(suffix_start, self.pos),
|
||||
"underscore literal suffix is not allowed",
|
||||
)
|
||||
.warn(
|
||||
"this was previously accepted by the compiler but is \
|
||||
being phased out; it will become a hard error in \
|
||||
a future release!",
|
||||
)
|
||||
.note(
|
||||
"see issue #42326 \
|
||||
<https://github.com/rust-lang/rust/issues/42326> \
|
||||
for more information",
|
||||
)
|
||||
.emit();
|
||||
None
|
||||
} else {
|
||||
Some(Symbol::intern(string))
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
token::Literal(token::Lit { kind, symbol, suffix })
|
||||
}
|
||||
rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
|
||||
// Include the leading `'` in the real identifier, for macro
|
||||
// expansion purposes. See #12512 for the gory details of why
|
||||
// this is necessary.
|
||||
let lifetime_name = self.str_from(start);
|
||||
if starts_with_number {
|
||||
self.err_span_(start, self.pos, "lifetimes cannot start with a number");
|
||||
}
|
||||
let ident = Symbol::intern(lifetime_name);
|
||||
token::Lifetime(ident)
|
||||
}
|
||||
rustc_lexer::TokenKind::Semi => token::Semi,
|
||||
rustc_lexer::TokenKind::Comma => token::Comma,
|
||||
rustc_lexer::TokenKind::Dot => token::Dot,
|
||||
rustc_lexer::TokenKind::OpenParen => token::OpenDelim(Delimiter::Parenthesis),
|
||||
rustc_lexer::TokenKind::CloseParen => token::CloseDelim(Delimiter::Parenthesis),
|
||||
rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(Delimiter::Brace),
|
||||
rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(Delimiter::Brace),
|
||||
rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(Delimiter::Bracket),
|
||||
rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(Delimiter::Bracket),
|
||||
rustc_lexer::TokenKind::At => token::At,
|
||||
rustc_lexer::TokenKind::Pound => token::Pound,
|
||||
rustc_lexer::TokenKind::Tilde => token::Tilde,
|
||||
rustc_lexer::TokenKind::Question => token::Question,
|
||||
rustc_lexer::TokenKind::Colon => token::Colon,
|
||||
rustc_lexer::TokenKind::Dollar => token::Dollar,
|
||||
rustc_lexer::TokenKind::Eq => token::Eq,
|
||||
rustc_lexer::TokenKind::Bang => token::Not,
|
||||
rustc_lexer::TokenKind::Lt => token::Lt,
|
||||
rustc_lexer::TokenKind::Gt => token::Gt,
|
||||
rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus),
|
||||
rustc_lexer::TokenKind::And => token::BinOp(token::And),
|
||||
rustc_lexer::TokenKind::Or => token::BinOp(token::Or),
|
||||
rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus),
|
||||
rustc_lexer::TokenKind::Star => token::BinOp(token::Star),
|
||||
rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash),
|
||||
rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
|
||||
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
|
||||
|
||||
rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
|
||||
let c = self.str_from(start).chars().next().unwrap();
|
||||
let mut err =
|
||||
self.struct_err_span_char(start, self.pos, "unknown start of token", c);
|
||||
// FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs,
|
||||
// instead of keeping a table in `check_for_substitution`into the token. Ideally,
|
||||
// this should be inside `rustc_lexer`. However, we should first remove compound
|
||||
// tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
|
||||
// as there will be less overall work to do this way.
|
||||
let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
|
||||
if c == '\x00' {
|
||||
err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
|
||||
}
|
||||
err.emit();
|
||||
token?
|
||||
}
|
||||
rustc_lexer::TokenKind::Eof => token::Eof,
|
||||
})
|
||||
}
|
||||
|
||||
fn cook_doc_comment(
|
||||
&self,
|
||||
content_start: BytePos,
|
||||
|
Loading…
Reference in New Issue
Block a user