Auto merge of #102302 - nnethercote:more-lexer-improvements, r=matklad

More lexer improvements. A follow-up to #99884. r? `@matklad`

commit 6201eabde8
@@ -13,7 +13,7 @@ use rustc_span::symbol::{kw, sym};
 use rustc_span::symbol::{Ident, Symbol};
 use rustc_span::{self, edition::Edition, Span, DUMMY_SP};
 use std::borrow::Cow;
-use std::{fmt, mem};
+use std::fmt;

 #[derive(Clone, Copy, PartialEq, Encodable, Decodable, Debug, HashStable_Generic)]
 pub enum CommentKind {
@@ -335,11 +335,6 @@ impl Token {
         Token::new(Ident(ident.name, ident.is_raw_guess()), ident.span)
     }

-    /// Return this token by value and leave a dummy token in its place.
-    pub fn take(&mut self) -> Self {
-        mem::replace(self, Token::dummy())
-    }
-
     /// For interpolated tokens, returns a span of the fragment to which the interpolated
     /// token refers. For all other tokens this is just a regular span.
     /// It is particularly important to use this for identifiers and lifetimes
@@ -62,7 +62,8 @@ pub mod translation;
 pub use diagnostic_builder::IntoDiagnostic;
 pub use snippet::Style;

-pub type PResult<'a, T> = Result<T, DiagnosticBuilder<'a, ErrorGuaranteed>>;
+pub type PErr<'a> = DiagnosticBuilder<'a, ErrorGuaranteed>;
+pub type PResult<'a, T> = Result<T, PErr<'a>>;

 // `PResult` is used a lot. Make sure it doesn't unintentionally get bigger.
 // (See also the comment on `DiagnosticBuilder`'s `diagnostic` field.)
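Note: the new `PErr` alias names the error half of `PResult` once, so downstream signatures (e.g. in the token-trees reader below) no longer spell out `DiagnosticBuilder<'a, ErrorGuaranteed>`. A self-contained sketch of the aliasing pattern, with stand-in types rather than rustc's:

// Stand-in types; only the aliasing pattern mirrors the diff above.
struct DiagnosticBuilder<'a> {
    msg: &'a str,
}

type PErr<'a> = DiagnosticBuilder<'a>;
type PResult<'a, T> = Result<T, PErr<'a>>;

// Every fallible routine can now use the short aliases.
fn parse_number<'a>(input: &'a str) -> PResult<'a, u32> {
    input.trim().parse().map_err(|_| DiagnosticBuilder { msg: input })
}

fn main() {
    assert!(parse_number("42").is_ok());
    assert!(parse_number("forty-two").is_err());
}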
@@ -4,8 +4,8 @@ use std::str::Chars;
 ///
 /// Next characters can be peeked via `first` method,
 /// and position can be shifted forward via `bump` method.
-pub(crate) struct Cursor<'a> {
-    initial_len: usize,
+pub struct Cursor<'a> {
+    len_remaining: usize,
     /// Iterator over chars. Slightly faster than a &str.
     chars: Chars<'a>,
     #[cfg(debug_assertions)]
@@ -15,9 +15,9 @@ pub(crate) struct Cursor<'a> {
 pub(crate) const EOF_CHAR: char = '\0';

 impl<'a> Cursor<'a> {
-    pub(crate) fn new(input: &'a str) -> Cursor<'a> {
+    pub fn new(input: &'a str) -> Cursor<'a> {
         Cursor {
-            initial_len: input.len(),
+            len_remaining: input.len(),
             chars: input.chars(),
             #[cfg(debug_assertions)]
             prev: EOF_CHAR,
@@ -61,13 +61,13 @@ impl<'a> Cursor<'a> {
     }

     /// Returns amount of already consumed symbols.
-    pub(crate) fn len_consumed(&self) -> u32 {
-        (self.initial_len - self.chars.as_str().len()) as u32
+    pub(crate) fn pos_within_token(&self) -> u32 {
+        (self.len_remaining - self.chars.as_str().len()) as u32
     }

     /// Resets the number of bytes consumed to 0.
-    pub(crate) fn reset_len_consumed(&mut self) {
-        self.initial_len = self.chars.as_str().len();
+    pub(crate) fn reset_pos_within_token(&mut self) {
+        self.len_remaining = self.chars.as_str().len();
     }

     /// Moves to the next character.
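Note: the renames above track a semantic shift, not just naming taste: the counter now measures progress within the current token, because `advance_token` resets it after producing each token (see the `rustc_lexer` hunks below). A self-contained sketch of the byte-counting trick, with stand-in names rather than the compiler's actual types:

// Position is derived by comparing the remaining byte length of the `Chars`
// iterator against a snapshot taken at the start of the current token.
struct MiniCursor<'a> {
    len_remaining: usize,
    chars: std::str::Chars<'a>,
}

impl<'a> MiniCursor<'a> {
    fn new(input: &'a str) -> Self {
        MiniCursor { len_remaining: input.len(), chars: input.chars() }
    }

    /// Bytes consumed since the start of the current token.
    fn pos_within_token(&self) -> u32 {
        (self.len_remaining - self.chars.as_str().len()) as u32
    }

    /// Marks the current position as the start of the next token.
    fn reset_pos_within_token(&mut self) {
        self.len_remaining = self.chars.as_str().len();
    }
}

fn main() {
    let mut c = MiniCursor::new("ab cd");
    c.chars.next(); // consume 'a'
    c.chars.next(); // consume 'b'
    assert_eq!(c.pos_within_token(), 2);
    c.reset_pos_within_token();
    assert_eq!(c.pos_within_token(), 0);
}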
@@ -29,9 +29,11 @@ pub mod unescape;
 #[cfg(test)]
 mod tests;

+pub use crate::cursor::Cursor;
+
 use self::LiteralKind::*;
 use self::TokenKind::*;
-use crate::cursor::{Cursor, EOF_CHAR};
+use crate::cursor::EOF_CHAR;
 use std::convert::TryFrom;

 /// Parsed token.
@@ -139,6 +141,9 @@ pub enum TokenKind {

     /// Unknown token, not expected by the lexer, e.g. "№"
     Unknown,
+
+    /// End of input.
+    Eof,
 }

 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
@@ -219,13 +224,6 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
     None
 }

-/// Parses the first token from the provided input string.
-#[inline]
-pub fn first_token(input: &str) -> Token {
-    debug_assert!(!input.is_empty());
-    Cursor::new(input).advance_token()
-}
-
 /// Validates a raw string literal. Used for getting more information about a
 /// problem with a `RawStr`/`RawByteStr` with a `None` field.
 #[inline]
@@ -243,12 +241,8 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError>
 pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
     let mut cursor = Cursor::new(input);
     std::iter::from_fn(move || {
-        if cursor.is_eof() {
-            None
-        } else {
-            cursor.reset_len_consumed();
-            Some(cursor.advance_token())
-        }
+        let token = cursor.advance_token();
+        if token.kind != TokenKind::Eof { Some(token) } else { None }
     })
 }

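Note: with the new `Eof` token kind, `tokenize` no longer probes `is_eof` or resets the cursor itself; iteration simply stops at the first `Eof` token. A short usage sketch, assuming the `rustc_lexer` crate as a dependency:

// Collect the byte length of every token in a source string; the iterator
// ends on its own when the underlying cursor reports `Eof`.
fn token_lengths(src: &str) -> Vec<u32> {
    rustc_lexer::tokenize(src).map(|token| token.len).collect()
}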
@@ -311,8 +305,11 @@ pub fn is_ident(string: &str) -> bool {

 impl Cursor<'_> {
     /// Parses a token from the input string.
-    fn advance_token(&mut self) -> Token {
-        let first_char = self.bump().unwrap();
+    pub fn advance_token(&mut self) -> Token {
+        let first_char = match self.bump() {
+            Some(c) => c,
+            None => return Token::new(TokenKind::Eof, 0),
+        };
         let token_kind = match first_char {
             // Slash, comment or block comment.
             '/' => match self.first() {
@@ -329,7 +326,7 @@ impl Cursor<'_> {
             ('#', c1) if is_id_start(c1) => self.raw_ident(),
             ('#', _) | ('"', _) => {
                 let res = self.raw_double_quoted_string(1);
-                let suffix_start = self.len_consumed();
+                let suffix_start = self.pos_within_token();
                 if res.is_ok() {
                     self.eat_literal_suffix();
                 }
@@ -344,7 +341,7 @@ impl Cursor<'_> {
             ('\'', _) => {
                 self.bump();
                 let terminated = self.single_quoted_string();
-                let suffix_start = self.len_consumed();
+                let suffix_start = self.pos_within_token();
                 if terminated {
                     self.eat_literal_suffix();
                 }
@@ -354,7 +351,7 @@ impl Cursor<'_> {
             ('"', _) => {
                 self.bump();
                 let terminated = self.double_quoted_string();
-                let suffix_start = self.len_consumed();
+                let suffix_start = self.pos_within_token();
                 if terminated {
                     self.eat_literal_suffix();
                 }
@@ -364,7 +361,7 @@ impl Cursor<'_> {
             ('r', '"') | ('r', '#') => {
                 self.bump();
                 let res = self.raw_double_quoted_string(2);
-                let suffix_start = self.len_consumed();
+                let suffix_start = self.pos_within_token();
                 if res.is_ok() {
                     self.eat_literal_suffix();
                 }
@@ -381,7 +378,7 @@ impl Cursor<'_> {
             // Numeric literal.
             c @ '0'..='9' => {
                 let literal_kind = self.number(c);
-                let suffix_start = self.len_consumed();
+                let suffix_start = self.pos_within_token();
                 self.eat_literal_suffix();
                 TokenKind::Literal { kind: literal_kind, suffix_start }
             }
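Note: in every literal arm, `suffix_start` is recorded before the suffix is eaten, so it is the offset within the token where a suffix such as the `u8` in `1u8` begins. A small illustration, assuming the `rustc_lexer` crate as a dependency:

// For `1u8`, the digits span offsets 0..1 and the suffix starts at offset 1.
fn main() {
    let tok = rustc_lexer::tokenize("1u8").next().unwrap();
    if let rustc_lexer::TokenKind::Literal { suffix_start, .. } = tok.kind {
        assert_eq!(suffix_start, 1);
    }
}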
@@ -420,7 +417,7 @@ impl Cursor<'_> {
             // String literal.
             '"' => {
                 let terminated = self.double_quoted_string();
-                let suffix_start = self.len_consumed();
+                let suffix_start = self.pos_within_token();
                 if terminated {
                     self.eat_literal_suffix();
                 }
@@ -433,7 +430,9 @@ impl Cursor<'_> {
             }
             _ => Unknown,
         };
-        Token::new(token_kind, self.len_consumed())
+        let res = Token::new(token_kind, self.pos_within_token());
+        self.reset_pos_within_token();
+        res
     }

     fn line_comment(&mut self) -> TokenKind {
@@ -618,7 +617,7 @@ impl Cursor<'_> {

         if !can_be_a_lifetime {
             let terminated = self.single_quoted_string();
-            let suffix_start = self.len_consumed();
+            let suffix_start = self.pos_within_token();
             if terminated {
                 self.eat_literal_suffix();
             }
@@ -643,7 +642,7 @@ impl Cursor<'_> {
         if self.first() == '\'' {
             self.bump();
             let kind = Char { terminated: true };
-            Literal { kind, suffix_start: self.len_consumed() }
+            Literal { kind, suffix_start: self.pos_within_token() }
         } else {
             Lifetime { starts_with_number }
         }
@@ -724,7 +723,7 @@ impl Cursor<'_> {

     fn raw_string_unvalidated(&mut self, prefix_len: u32) -> Result<u32, RawStrError> {
         debug_assert!(self.prev() == 'r');
-        let start_pos = self.len_consumed();
+        let start_pos = self.pos_within_token();
         let mut possible_terminator_offset = None;
         let mut max_hashes = 0;

@@ -778,7 +777,7 @@ impl Cursor<'_> {
                     // Keep track of possible terminators to give a hint about
                     // where there might be a missing terminator
                     possible_terminator_offset =
-                        Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len);
+                        Some(self.pos_within_token() - start_pos - n_end_hashes + prefix_len);
                     max_hashes = n_end_hashes;
                 }
             }
@@ -1,10 +1,11 @@
 use crate::lexer::unicode_chars::UNICODE_ARRAY;
 use rustc_ast::ast::{self, AttrStyle};
 use rustc_ast::token::{self, CommentKind, Delimiter, Token, TokenKind};
-use rustc_ast::tokenstream::{Spacing, TokenStream};
+use rustc_ast::tokenstream::TokenStream;
 use rustc_ast::util::unicode::contains_text_flow_control_chars;
 use rustc_errors::{error_code, Applicability, DiagnosticBuilder, ErrorGuaranteed, PResult};
 use rustc_lexer::unescape::{self, Mode};
+use rustc_lexer::Cursor;
 use rustc_lexer::{Base, DocStyle, RawStrError};
 use rustc_session::lint::builtin::{
     RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
@@ -38,11 +39,20 @@ pub struct UnmatchedBrace {

 pub(crate) fn parse_token_trees<'a>(
     sess: &'a ParseSess,
-    src: &'a str,
-    start_pos: BytePos,
+    mut src: &'a str,
+    mut start_pos: BytePos,
     override_span: Option<Span>,
 ) -> (PResult<'a, TokenStream>, Vec<UnmatchedBrace>) {
-    StringReader { sess, start_pos, pos: start_pos, src, override_span }.into_token_trees()
+    // Skip `#!`, if present.
+    if let Some(shebang_len) = rustc_lexer::strip_shebang(src) {
+        src = &src[shebang_len..];
+        start_pos = start_pos + BytePos::from_usize(shebang_len);
+    }
+
+    let cursor = Cursor::new(src);
+    let string_reader =
+        StringReader { sess, start_pos, pos: start_pos, src, cursor, override_span };
+    tokentrees::TokenTreesReader::parse_token_trees(string_reader)
 }

 struct StringReader<'a> {
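Note: shebang stripping now happens once, up front, instead of being re-checked on every call to `next_token`. `strip_shebang` returns the byte length of a leading `#!` line (when it is not the start of an inner attribute), which is used to slice the source and offset the starting position. A sketch, assuming the `rustc_lexer` crate as a dependency:

// Drop a leading shebang line, exactly once, before any tokens are produced.
fn strip(src: &str) -> &str {
    match rustc_lexer::strip_shebang(src) {
        Some(len) => &src[len..],
        None => src,
    }
}

fn main() {
    assert_eq!(strip("#!/usr/bin/env rust\nfn main() {}"), "\nfn main() {}");
    assert_eq!(strip("fn main() {}"), "fn main() {}");
}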
@@ -53,6 +63,8 @@ struct StringReader<'a> {
     pos: BytePos,
     /// Source text to tokenize.
     src: &'a str,
+    /// Cursor for getting lexer tokens.
+    cursor: Cursor<'a>,
     override_span: Option<Span>,
 }

@@ -61,42 +73,195 @@ impl<'a> StringReader<'a> {
         self.override_span.unwrap_or_else(|| Span::with_root_ctxt(lo, hi))
     }

-    /// Returns the next token, and info about preceding whitespace, if any.
-    fn next_token(&mut self) -> (Spacing, Token) {
-        let mut spacing = Spacing::Joint;
-
-        // Skip `#!` at the start of the file
-        if self.pos == self.start_pos
-            && let Some(shebang_len) = rustc_lexer::strip_shebang(self.src)
-        {
-            self.pos = self.pos + BytePos::from_usize(shebang_len);
-            spacing = Spacing::Alone;
-        }
+    /// Returns the next token, paired with a bool indicating if the token was
+    /// preceded by whitespace.
+    fn next_token(&mut self) -> (Token, bool) {
+        let mut preceded_by_whitespace = false;

         // Skip trivial (whitespace & comments) tokens
         loop {
-            let start_src_index = self.src_index(self.pos);
-            let text: &str = &self.src[start_src_index..];
-
-            if text.is_empty() {
-                let span = self.mk_sp(self.pos, self.pos);
-                return (spacing, Token::new(token::Eof, span));
-            }
-
-            let token = rustc_lexer::first_token(text);
-
+            let token = self.cursor.advance_token();
             let start = self.pos;
             self.pos = self.pos + BytePos(token.len);

             debug!("next_token: {:?}({:?})", token.kind, self.str_from(start));

-            match self.cook_lexer_token(token.kind, start) {
-                Some(kind) => {
-                    let span = self.mk_sp(start, self.pos);
-                    return (spacing, Token::new(kind, span));
-                }
-                None => spacing = Spacing::Alone,
-            }
-        }
-    }
+            // Now "cook" the token, converting the simple `rustc_lexer::TokenKind` enum into a
+            // rich `rustc_ast::TokenKind`. This turns strings into interned symbols and runs
+            // additional validation.
+            let kind = match token.kind {
+                rustc_lexer::TokenKind::LineComment { doc_style } => {
+                    // Skip non-doc comments
+                    let Some(doc_style) = doc_style else {
+                        self.lint_unicode_text_flow(start);
+                        preceded_by_whitespace = true;
+                        continue;
+                    };
+
+                    // Opening delimiter of the length 3 is not included into the symbol.
+                    let content_start = start + BytePos(3);
+                    let content = self.str_from(content_start);
+                    self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
+                }
+                rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
+                    if !terminated {
+                        self.report_unterminated_block_comment(start, doc_style);
+                    }
+
+                    // Skip non-doc comments
+                    let Some(doc_style) = doc_style else {
+                        self.lint_unicode_text_flow(start);
+                        preceded_by_whitespace = true;
+                        continue;
+                    };
+
+                    // Opening delimiter of the length 3 and closing delimiter of the length 2
+                    // are not included into the symbol.
+                    let content_start = start + BytePos(3);
+                    let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
+                    let content = self.str_from_to(content_start, content_end);
+                    self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
+                }
+                rustc_lexer::TokenKind::Whitespace => {
+                    preceded_by_whitespace = true;
+                    continue;
+                }
+                rustc_lexer::TokenKind::Ident => {
+                    let sym = nfc_normalize(self.str_from(start));
+                    let span = self.mk_sp(start, self.pos);
+                    self.sess.symbol_gallery.insert(sym, span);
+                    token::Ident(sym, false)
+                }
+                rustc_lexer::TokenKind::RawIdent => {
+                    let sym = nfc_normalize(self.str_from(start + BytePos(2)));
+                    let span = self.mk_sp(start, self.pos);
+                    self.sess.symbol_gallery.insert(sym, span);
+                    if !sym.can_be_raw() {
+                        self.err_span(span, &format!("`{}` cannot be a raw identifier", sym));
+                    }
+                    self.sess.raw_identifier_spans.borrow_mut().push(span);
+                    token::Ident(sym, true)
+                }
+                rustc_lexer::TokenKind::UnknownPrefix => {
+                    self.report_unknown_prefix(start);
+                    let sym = nfc_normalize(self.str_from(start));
+                    let span = self.mk_sp(start, self.pos);
+                    self.sess.symbol_gallery.insert(sym, span);
+                    token::Ident(sym, false)
+                }
+                rustc_lexer::TokenKind::InvalidIdent
+                    // Do not recover an identifier with emoji if the codepoint is a confusable
+                    // with a recoverable substitution token, like `➖`.
+                    if !UNICODE_ARRAY
+                        .iter()
+                        .any(|&(c, _, _)| {
+                            let sym = self.str_from(start);
+                            sym.chars().count() == 1 && c == sym.chars().next().unwrap()
+                        }) =>
+                {
+                    let sym = nfc_normalize(self.str_from(start));
+                    let span = self.mk_sp(start, self.pos);
+                    self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default()
+                        .push(span);
+                    token::Ident(sym, false)
+                }
+                rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
+                    let suffix_start = start + BytePos(suffix_start);
+                    let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
+                    let suffix = if suffix_start < self.pos {
+                        let string = self.str_from(suffix_start);
+                        if string == "_" {
+                            self.sess
+                                .span_diagnostic
+                                .struct_span_warn(
+                                    self.mk_sp(suffix_start, self.pos),
+                                    "underscore literal suffix is not allowed",
+                                )
+                                .warn(
+                                    "this was previously accepted by the compiler but is \
+                                       being phased out; it will become a hard error in \
+                                       a future release!",
+                                )
+                                .note(
+                                    "see issue #42326 \
+                                     <https://github.com/rust-lang/rust/issues/42326> \
+                                     for more information",
+                                )
+                                .emit();
+                            None
+                        } else {
+                            Some(Symbol::intern(string))
+                        }
+                    } else {
+                        None
+                    };
+                    token::Literal(token::Lit { kind, symbol, suffix })
+                }
+                rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
+                    // Include the leading `'` in the real identifier, for macro
+                    // expansion purposes. See #12512 for the gory details of why
+                    // this is necessary.
+                    let lifetime_name = self.str_from(start);
+                    if starts_with_number {
+                        self.err_span_(start, self.pos, "lifetimes cannot start with a number");
+                    }
+                    let ident = Symbol::intern(lifetime_name);
+                    token::Lifetime(ident)
+                }
+                rustc_lexer::TokenKind::Semi => token::Semi,
+                rustc_lexer::TokenKind::Comma => token::Comma,
+                rustc_lexer::TokenKind::Dot => token::Dot,
+                rustc_lexer::TokenKind::OpenParen => token::OpenDelim(Delimiter::Parenthesis),
+                rustc_lexer::TokenKind::CloseParen => token::CloseDelim(Delimiter::Parenthesis),
+                rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(Delimiter::Brace),
+                rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(Delimiter::Brace),
+                rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(Delimiter::Bracket),
+                rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(Delimiter::Bracket),
+                rustc_lexer::TokenKind::At => token::At,
+                rustc_lexer::TokenKind::Pound => token::Pound,
+                rustc_lexer::TokenKind::Tilde => token::Tilde,
+                rustc_lexer::TokenKind::Question => token::Question,
+                rustc_lexer::TokenKind::Colon => token::Colon,
+                rustc_lexer::TokenKind::Dollar => token::Dollar,
+                rustc_lexer::TokenKind::Eq => token::Eq,
+                rustc_lexer::TokenKind::Bang => token::Not,
+                rustc_lexer::TokenKind::Lt => token::Lt,
+                rustc_lexer::TokenKind::Gt => token::Gt,
+                rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus),
+                rustc_lexer::TokenKind::And => token::BinOp(token::And),
+                rustc_lexer::TokenKind::Or => token::BinOp(token::Or),
+                rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus),
+                rustc_lexer::TokenKind::Star => token::BinOp(token::Star),
+                rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash),
+                rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
+                rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
+
+                rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
+                    let c = self.str_from(start).chars().next().unwrap();
+                    let mut err =
+                        self.struct_err_span_char(start, self.pos, "unknown start of token", c);
+                    // FIXME: the lexer could be used to turn the ASCII version of unicode
+                    // homoglyphs, instead of keeping a table in `check_for_substitution`into the
+                    // token. Ideally, this should be inside `rustc_lexer`. However, we should
+                    // first remove compound tokens like `<<` from `rustc_lexer`, and then add
+                    // fancier error recovery to it, as there will be less overall work to do this
+                    // way.
+                    let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
+                    if c == '\x00' {
+                        err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
+                    }
+                    err.emit();
+                    if let Some(token) = token {
+                        token
+                    } else {
+                        preceded_by_whitespace = true;
+                        continue;
+                    }
+                }
+                rustc_lexer::TokenKind::Eof => token::Eof,
+            };
+            let span = self.mk_sp(start, self.pos);
+            return (Token::new(kind, span), preceded_by_whitespace);
+        }
+    }
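Note: the old `(Spacing, Token)` pair reported spacing computed while skipping trivia; the new `(Token, bool)` contract is simpler, returning only whether the token was preceded by whitespace, and the spacing decision moves into the token-trees pass (see `parse_token_tree_non_delim_non_eof` below). A self-contained analogue of the new contract, with stand-in types rather than the compiler's:

// The lexing loop skips trivia, remembers that it did so, and hands that
// flag back alongside the next real token.
#[derive(Debug, PartialEq)]
enum Tok {
    Word(String),
    Eof,
}

fn next_token(chars: &mut std::iter::Peekable<std::str::Chars<'_>>) -> (Tok, bool) {
    let mut preceded_by_whitespace = false;
    loop {
        match chars.peek() {
            None => return (Tok::Eof, preceded_by_whitespace),
            Some(c) if c.is_whitespace() => {
                preceded_by_whitespace = true;
                chars.next();
            }
            Some(_) => {
                let mut word = String::new();
                while let Some(c) = chars.peek().copied().filter(|c| !c.is_whitespace()) {
                    word.push(c);
                    chars.next();
                }
                return (Tok::Word(word), preceded_by_whitespace);
            }
        }
    }
}

fn main() {
    let mut it = "a b".chars().peekable();
    assert_eq!(next_token(&mut it), (Tok::Word("a".into()), false));
    assert_eq!(next_token(&mut it), (Tok::Word("b".into()), true));
    assert_eq!(next_token(&mut it), (Tok::Eof, false));
}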
@@ -162,171 +327,6 @@ impl<'a> StringReader<'a> {
         }
     }

-    /// Turns simple `rustc_lexer::TokenKind` enum into a rich
-    /// `rustc_ast::TokenKind`. This turns strings into interned
-    /// symbols and runs additional validation.
-    fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> Option<TokenKind> {
-        Some(match token {
-            rustc_lexer::TokenKind::LineComment { doc_style } => {
-                // Skip non-doc comments
-                let Some(doc_style) = doc_style else {
-                    self.lint_unicode_text_flow(start);
-                    return None;
-                };
-
-                // Opening delimiter of the length 3 is not included into the symbol.
-                let content_start = start + BytePos(3);
-                let content = self.str_from(content_start);
-                self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
-            }
-            rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
-                if !terminated {
-                    self.report_unterminated_block_comment(start, doc_style);
-                }
-
-                // Skip non-doc comments
-                let Some(doc_style) = doc_style else {
-                    self.lint_unicode_text_flow(start);
-                    return None;
-                };
-
-                // Opening delimiter of the length 3 and closing delimiter of the length 2
-                // are not included into the symbol.
-                let content_start = start + BytePos(3);
-                let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
-                let content = self.str_from_to(content_start, content_end);
-                self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
-            }
-            rustc_lexer::TokenKind::Whitespace => return None,
-            rustc_lexer::TokenKind::Ident
-            | rustc_lexer::TokenKind::RawIdent
-            | rustc_lexer::TokenKind::UnknownPrefix => {
-                let is_raw_ident = token == rustc_lexer::TokenKind::RawIdent;
-                let is_unknown_prefix = token == rustc_lexer::TokenKind::UnknownPrefix;
-                let mut ident_start = start;
-                if is_raw_ident {
-                    ident_start = ident_start + BytePos(2);
-                }
-                if is_unknown_prefix {
-                    self.report_unknown_prefix(start);
-                }
-                let sym = nfc_normalize(self.str_from(ident_start));
-                let span = self.mk_sp(start, self.pos);
-                self.sess.symbol_gallery.insert(sym, span);
-                if is_raw_ident {
-                    if !sym.can_be_raw() {
-                        self.err_span(span, &format!("`{}` cannot be a raw identifier", sym));
-                    }
-                    self.sess.raw_identifier_spans.borrow_mut().push(span);
-                }
-                token::Ident(sym, is_raw_ident)
-            }
-            rustc_lexer::TokenKind::InvalidIdent
-                // Do not recover an identifier with emoji if the codepoint is a confusable
-                // with a recoverable substitution token, like `➖`.
-                if !UNICODE_ARRAY
-                    .iter()
-                    .any(|&(c, _, _)| {
-                        let sym = self.str_from(start);
-                        sym.chars().count() == 1 && c == sym.chars().next().unwrap()
-                    })
-                =>
-            {
-                let sym = nfc_normalize(self.str_from(start));
-                let span = self.mk_sp(start, self.pos);
-                self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span);
-                token::Ident(sym, false)
-            }
-            rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
-                let suffix_start = start + BytePos(suffix_start);
-                let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
-                let suffix = if suffix_start < self.pos {
-                    let string = self.str_from(suffix_start);
-                    if string == "_" {
-                        self.sess
-                            .span_diagnostic
-                            .struct_span_warn(
-                                self.mk_sp(suffix_start, self.pos),
-                                "underscore literal suffix is not allowed",
-                            )
-                            .warn(
-                                "this was previously accepted by the compiler but is \
-                                   being phased out; it will become a hard error in \
-                                   a future release!",
-                            )
-                            .note(
-                                "see issue #42326 \
-                                 <https://github.com/rust-lang/rust/issues/42326> \
-                                 for more information",
-                            )
-                            .emit();
-                        None
-                    } else {
-                        Some(Symbol::intern(string))
-                    }
-                } else {
-                    None
-                };
-                token::Literal(token::Lit { kind, symbol, suffix })
-            }
-            rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
-                // Include the leading `'` in the real identifier, for macro
-                // expansion purposes. See #12512 for the gory details of why
-                // this is necessary.
-                let lifetime_name = self.str_from(start);
-                if starts_with_number {
-                    self.err_span_(start, self.pos, "lifetimes cannot start with a number");
-                }
-                let ident = Symbol::intern(lifetime_name);
-                token::Lifetime(ident)
-            }
-            rustc_lexer::TokenKind::Semi => token::Semi,
-            rustc_lexer::TokenKind::Comma => token::Comma,
-            rustc_lexer::TokenKind::Dot => token::Dot,
-            rustc_lexer::TokenKind::OpenParen => token::OpenDelim(Delimiter::Parenthesis),
-            rustc_lexer::TokenKind::CloseParen => token::CloseDelim(Delimiter::Parenthesis),
-            rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(Delimiter::Brace),
-            rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(Delimiter::Brace),
-            rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(Delimiter::Bracket),
-            rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(Delimiter::Bracket),
-            rustc_lexer::TokenKind::At => token::At,
-            rustc_lexer::TokenKind::Pound => token::Pound,
-            rustc_lexer::TokenKind::Tilde => token::Tilde,
-            rustc_lexer::TokenKind::Question => token::Question,
-            rustc_lexer::TokenKind::Colon => token::Colon,
-            rustc_lexer::TokenKind::Dollar => token::Dollar,
-            rustc_lexer::TokenKind::Eq => token::Eq,
-            rustc_lexer::TokenKind::Bang => token::Not,
-            rustc_lexer::TokenKind::Lt => token::Lt,
-            rustc_lexer::TokenKind::Gt => token::Gt,
-            rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus),
-            rustc_lexer::TokenKind::And => token::BinOp(token::And),
-            rustc_lexer::TokenKind::Or => token::BinOp(token::Or),
-            rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus),
-            rustc_lexer::TokenKind::Star => token::BinOp(token::Star),
-            rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash),
-            rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
-            rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
-
-            rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
-                let c = self.str_from(start).chars().next().unwrap();
-                let mut err =
-                    self.struct_err_span_char(start, self.pos, "unknown start of token", c);
-                // FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs,
-                // instead of keeping a table in `check_for_substitution`into the token. Ideally,
-                // this should be inside `rustc_lexer`. However, we should first remove compound
-                // tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
-                // as there will be less overall work to do this way.
-                let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
-                if c == '\x00' {
-                    err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
-                }
-                err.emit();
-                token?
-            }
-        })
-    }
-
     fn cook_doc_comment(
         &self,
         content_start: BytePos,
@@ -1,31 +1,15 @@
 use super::{StringReader, UnmatchedBrace};

 use rustc_ast::token::{self, Delimiter, Token};
 use rustc_ast::tokenstream::{DelimSpan, Spacing, TokenStream, TokenTree};
 use rustc_ast_pretty::pprust::token_to_string;
 use rustc_data_structures::fx::FxHashMap;
-use rustc_errors::PResult;
+use rustc_errors::{PErr, PResult};
 use rustc_span::Span;

-impl<'a> StringReader<'a> {
-    pub(super) fn into_token_trees(self) -> (PResult<'a, TokenStream>, Vec<UnmatchedBrace>) {
-        let mut tt_reader = TokenTreesReader {
-            string_reader: self,
-            token: Token::dummy(),
-            open_braces: Vec::new(),
-            unmatched_braces: Vec::new(),
-            matching_delim_spans: Vec::new(),
-            last_unclosed_found_span: None,
-            last_delim_empty_block_spans: FxHashMap::default(),
-            matching_block_spans: Vec::new(),
-        };
-        let res = tt_reader.parse_all_token_trees();
-        (res, tt_reader.unmatched_braces)
-    }
-}
-
-struct TokenTreesReader<'a> {
+pub(super) struct TokenTreesReader<'a> {
     string_reader: StringReader<'a>,
+    /// The "next" token, which has been obtained from the `StringReader` but
+    /// not yet handled by the `TokenTreesReader`.
     token: Token,
     /// Stack of open delimiters and their spans. Used for error message.
     open_braces: Vec<(Delimiter, Span)>,
@@ -43,231 +27,235 @@ struct TokenTreesReader<'a> {
 }

 impl<'a> TokenTreesReader<'a> {
+    pub(super) fn parse_token_trees(
+        string_reader: StringReader<'a>,
+    ) -> (PResult<'a, TokenStream>, Vec<UnmatchedBrace>) {
+        let mut tt_reader = TokenTreesReader {
+            string_reader,
+            token: Token::dummy(),
+            open_braces: Vec::new(),
+            unmatched_braces: Vec::new(),
+            matching_delim_spans: Vec::new(),
+            last_unclosed_found_span: None,
+            last_delim_empty_block_spans: FxHashMap::default(),
+            matching_block_spans: Vec::new(),
+        };
+        let res = tt_reader.parse_all_token_trees();
+        (res, tt_reader.unmatched_braces)
+    }
+
     // Parse a stream of tokens into a list of `TokenTree`s, up to an `Eof`.
     fn parse_all_token_trees(&mut self) -> PResult<'a, TokenStream> {
+        self.token = self.string_reader.next_token().0;
         let mut buf = TokenStreamBuilder::default();
-
-        self.bump();
-        while self.token != token::Eof {
-            buf.push(self.parse_token_tree()?);
+        loop {
+            match self.token.kind {
+                token::OpenDelim(delim) => buf.push(self.parse_token_tree_open_delim(delim)),
+                token::CloseDelim(delim) => return Err(self.close_delim_err(delim)),
+                token::Eof => return Ok(buf.into_token_stream()),
+                _ => buf.push(self.parse_token_tree_non_delim_non_eof()),
+            }
         }
-
-        Ok(buf.into_token_stream())
     }

     // Parse a stream of tokens into a list of `TokenTree`s, up to a `CloseDelim`.
     fn parse_token_trees_until_close_delim(&mut self) -> TokenStream {
         let mut buf = TokenStreamBuilder::default();
         loop {
-            if let token::CloseDelim(..) = self.token.kind {
-                return buf.into_token_stream();
-            }
-
-            match self.parse_token_tree() {
-                Ok(tree) => buf.push(tree),
-                Err(mut e) => {
-                    e.emit();
-                    return buf.into_token_stream();
-                }
+            match self.token.kind {
+                token::OpenDelim(delim) => buf.push(self.parse_token_tree_open_delim(delim)),
+                token::CloseDelim(..) => return buf.into_token_stream(),
+                token::Eof => {
+                    self.eof_err().emit();
+                    return buf.into_token_stream();
+                }
+                _ => buf.push(self.parse_token_tree_non_delim_non_eof()),
             }
         }
     }

-    fn parse_token_tree(&mut self) -> PResult<'a, TokenTree> {
-        let sm = self.string_reader.sess.source_map();
-
-        match self.token.kind {
-            token::Eof => {
-                let msg = "this file contains an unclosed delimiter";
-                let mut err =
-                    self.string_reader.sess.span_diagnostic.struct_span_err(self.token.span, msg);
-                for &(_, sp) in &self.open_braces {
-                    err.span_label(sp, "unclosed delimiter");
-                    self.unmatched_braces.push(UnmatchedBrace {
-                        expected_delim: Delimiter::Brace,
-                        found_delim: None,
-                        found_span: self.token.span,
-                        unclosed_span: Some(sp),
-                        candidate_span: None,
-                    });
-                }
-
-                if let Some((delim, _)) = self.open_braces.last() {
-                    if let Some((_, open_sp, close_sp)) =
-                        self.matching_delim_spans.iter().find(|(d, open_sp, close_sp)| {
-                            if let Some(close_padding) = sm.span_to_margin(*close_sp) {
-                                if let Some(open_padding) = sm.span_to_margin(*open_sp) {
-                                    return delim == d && close_padding != open_padding;
-                                }
-                            }
-                            false
-                        })
-                    // these are in reverse order as they get inserted on close, but
-                    {
-                        // we want the last open/first close
-                        err.span_label(*open_sp, "this delimiter might not be properly closed...");
-                        err.span_label(
-                            *close_sp,
-                            "...as it matches this but it has different indentation",
-                        );
-                    }
-                }
-                Err(err)
-            }
-            token::OpenDelim(delim) => {
-                // The span for beginning of the delimited section
-                let pre_span = self.token.span;
-
-                // Parse the open delimiter.
-                self.open_braces.push((delim, self.token.span));
-                self.bump();
-
-                // Parse the token trees within the delimiters.
-                // We stop at any delimiter so we can try to recover if the user
-                // uses an incorrect delimiter.
-                let tts = self.parse_token_trees_until_close_delim();
-
-                // Expand to cover the entire delimited token tree
-                let delim_span = DelimSpan::from_pair(pre_span, self.token.span);
-
-                match self.token.kind {
-                    // Correct delimiter.
-                    token::CloseDelim(d) if d == delim => {
-                        let (open_brace, open_brace_span) = self.open_braces.pop().unwrap();
-                        let close_brace_span = self.token.span;
-
-                        if tts.is_empty() {
-                            let empty_block_span = open_brace_span.to(close_brace_span);
-                            if !sm.is_multiline(empty_block_span) {
-                                // Only track if the block is in the form of `{}`, otherwise it is
-                                // likely that it was written on purpose.
-                                self.last_delim_empty_block_spans.insert(delim, empty_block_span);
-                            }
-                        }
-
-                        //only add braces
-                        if let (Delimiter::Brace, Delimiter::Brace) = (open_brace, delim) {
-                            self.matching_block_spans.push((open_brace_span, close_brace_span));
-                        }
-
-                        if self.open_braces.is_empty() {
-                            // Clear up these spans to avoid suggesting them as we've found
-                            // properly matched delimiters so far for an entire block.
-                            self.matching_delim_spans.clear();
-                        } else {
-                            self.matching_delim_spans.push((
-                                open_brace,
-                                open_brace_span,
-                                close_brace_span,
-                            ));
-                        }
-                        // Parse the closing delimiter.
-                        self.bump();
-                    }
-                    // Incorrect delimiter.
-                    token::CloseDelim(other) => {
-                        let mut unclosed_delimiter = None;
-                        let mut candidate = None;
-
-                        if self.last_unclosed_found_span != Some(self.token.span) {
-                            // do not complain about the same unclosed delimiter multiple times
-                            self.last_unclosed_found_span = Some(self.token.span);
-                            // This is a conservative error: only report the last unclosed
-                            // delimiter. The previous unclosed delimiters could actually be
-                            // closed! The parser just hasn't gotten to them yet.
-                            if let Some(&(_, sp)) = self.open_braces.last() {
-                                unclosed_delimiter = Some(sp);
-                            };
-                            if let Some(current_padding) = sm.span_to_margin(self.token.span) {
-                                for (brace, brace_span) in &self.open_braces {
-                                    if let Some(padding) = sm.span_to_margin(*brace_span) {
-                                        // high likelihood of these two corresponding
-                                        if current_padding == padding && brace == &other {
-                                            candidate = Some(*brace_span);
-                                        }
-                                    }
-                                }
-                            }
-                            let (tok, _) = self.open_braces.pop().unwrap();
-                            self.unmatched_braces.push(UnmatchedBrace {
-                                expected_delim: tok,
-                                found_delim: Some(other),
-                                found_span: self.token.span,
-                                unclosed_span: unclosed_delimiter,
-                                candidate_span: candidate,
-                            });
-                        } else {
-                            self.open_braces.pop();
-                        }
-
-                        // If the incorrect delimiter matches an earlier opening
-                        // delimiter, then don't consume it (it can be used to
-                        // close the earlier one). Otherwise, consume it.
-                        // E.g., we try to recover from:
-                        // fn foo() {
-                        //     bar(baz(
-                        // }  // Incorrect delimiter but matches the earlier `{`
-                        if !self.open_braces.iter().any(|&(b, _)| b == other) {
-                            self.bump();
-                        }
-                    }
-                    token::Eof => {
-                        // Silently recover, the EOF token will be seen again
-                        // and an error emitted then. Thus we don't pop from
-                        // self.open_braces here.
-                    }
-                    _ => {}
-                }
-
-                Ok(TokenTree::Delimited(delim_span, delim, tts))
-            }
-            token::CloseDelim(delim) => {
-                // An unexpected closing delimiter (i.e., there is no
-                // matching opening delimiter).
-                let token_str = token_to_string(&self.token);
-                let msg = format!("unexpected closing delimiter: `{}`", token_str);
-                let mut err =
-                    self.string_reader.sess.span_diagnostic.struct_span_err(self.token.span, &msg);
-
-                // Braces are added at the end, so the last element is the biggest block
-                if let Some(parent) = self.matching_block_spans.last() {
-                    if let Some(span) = self.last_delim_empty_block_spans.remove(&delim) {
-                        // Check if the (empty block) is in the last properly closed block
-                        if (parent.0.to(parent.1)).contains(span) {
-                            err.span_label(
-                                span,
-                                "block is empty, you might have not meant to close it",
-                            );
-                        } else {
-                            err.span_label(parent.0, "this opening brace...");
-                            err.span_label(parent.1, "...matches this closing brace");
-                        }
-                    } else {
-                        err.span_label(parent.0, "this opening brace...");
-                        err.span_label(parent.1, "...matches this closing brace");
-                    }
-                }
-
-                err.span_label(self.token.span, "unexpected closing delimiter");
-                Err(err)
-            }
-            _ => {
-                let tok = self.token.take();
-                let mut spacing = self.bump();
-                if !self.token.is_op() {
-                    spacing = Spacing::Alone;
-                }
-                Ok(TokenTree::Token(tok, spacing))
-            }
-        }
-    }
-
-    fn bump(&mut self) -> Spacing {
-        let (spacing, token) = self.string_reader.next_token();
-        self.token = token;
-        spacing
-    }
+    fn eof_err(&mut self) -> PErr<'a> {
+        let msg = "this file contains an unclosed delimiter";
+        let mut err = self.string_reader.sess.span_diagnostic.struct_span_err(self.token.span, msg);
+        for &(_, sp) in &self.open_braces {
+            err.span_label(sp, "unclosed delimiter");
+            self.unmatched_braces.push(UnmatchedBrace {
+                expected_delim: Delimiter::Brace,
+                found_delim: None,
+                found_span: self.token.span,
+                unclosed_span: Some(sp),
+                candidate_span: None,
+            });
+        }
+
+        if let Some((delim, _)) = self.open_braces.last() {
+            if let Some((_, open_sp, close_sp)) =
+                self.matching_delim_spans.iter().find(|(d, open_sp, close_sp)| {
+                    let sm = self.string_reader.sess.source_map();
+                    if let Some(close_padding) = sm.span_to_margin(*close_sp) {
+                        if let Some(open_padding) = sm.span_to_margin(*open_sp) {
+                            return delim == d && close_padding != open_padding;
+                        }
+                    }
+                    false
+                })
+            // these are in reverse order as they get inserted on close, but
+            {
+                // we want the last open/first close
+                err.span_label(*open_sp, "this delimiter might not be properly closed...");
+                err.span_label(*close_sp, "...as it matches this but it has different indentation");
+            }
+        }
+        err
+    }
+
+    fn parse_token_tree_open_delim(&mut self, open_delim: Delimiter) -> TokenTree {
+        // The span for beginning of the delimited section
+        let pre_span = self.token.span;
+
+        // Move past the open delimiter.
+        self.open_braces.push((open_delim, self.token.span));
+        self.token = self.string_reader.next_token().0;
+
+        // Parse the token trees within the delimiters.
+        // We stop at any delimiter so we can try to recover if the user
+        // uses an incorrect delimiter.
+        let tts = self.parse_token_trees_until_close_delim();
+
+        // Expand to cover the entire delimited token tree
+        let delim_span = DelimSpan::from_pair(pre_span, self.token.span);
+
+        match self.token.kind {
+            // Correct delimiter.
+            token::CloseDelim(close_delim) if close_delim == open_delim => {
+                let (open_brace, open_brace_span) = self.open_braces.pop().unwrap();
+                let close_brace_span = self.token.span;
+
+                if tts.is_empty() {
+                    let empty_block_span = open_brace_span.to(close_brace_span);
+                    let sm = self.string_reader.sess.source_map();
+                    if !sm.is_multiline(empty_block_span) {
+                        // Only track if the block is in the form of `{}`, otherwise it is
+                        // likely that it was written on purpose.
+                        self.last_delim_empty_block_spans.insert(open_delim, empty_block_span);
+                    }
+                }
+
+                //only add braces
+                if let (Delimiter::Brace, Delimiter::Brace) = (open_brace, open_delim) {
+                    self.matching_block_spans.push((open_brace_span, close_brace_span));
+                }
+
+                if self.open_braces.is_empty() {
+                    // Clear up these spans to avoid suggesting them as we've found
+                    // properly matched delimiters so far for an entire block.
+                    self.matching_delim_spans.clear();
+                } else {
+                    self.matching_delim_spans.push((open_brace, open_brace_span, close_brace_span));
+                }
+                // Move past the closing delimiter.
+                self.token = self.string_reader.next_token().0;
+            }
+            // Incorrect delimiter.
+            token::CloseDelim(close_delim) => {
+                let mut unclosed_delimiter = None;
+                let mut candidate = None;
+
+                if self.last_unclosed_found_span != Some(self.token.span) {
+                    // do not complain about the same unclosed delimiter multiple times
+                    self.last_unclosed_found_span = Some(self.token.span);
+                    // This is a conservative error: only report the last unclosed
+                    // delimiter. The previous unclosed delimiters could actually be
+                    // closed! The parser just hasn't gotten to them yet.
+                    if let Some(&(_, sp)) = self.open_braces.last() {
+                        unclosed_delimiter = Some(sp);
+                    };
+                    let sm = self.string_reader.sess.source_map();
+                    if let Some(current_padding) = sm.span_to_margin(self.token.span) {
+                        for (brace, brace_span) in &self.open_braces {
+                            if let Some(padding) = sm.span_to_margin(*brace_span) {
+                                // high likelihood of these two corresponding
+                                if current_padding == padding && brace == &close_delim {
+                                    candidate = Some(*brace_span);
+                                }
+                            }
+                        }
+                    }
+                    let (tok, _) = self.open_braces.pop().unwrap();
+                    self.unmatched_braces.push(UnmatchedBrace {
+                        expected_delim: tok,
+                        found_delim: Some(close_delim),
+                        found_span: self.token.span,
+                        unclosed_span: unclosed_delimiter,
+                        candidate_span: candidate,
+                    });
+                } else {
+                    self.open_braces.pop();
+                }
+
+                // If the incorrect delimiter matches an earlier opening
+                // delimiter, then don't consume it (it can be used to
+                // close the earlier one). Otherwise, consume it.
+                // E.g., we try to recover from:
+                // fn foo() {
+                //     bar(baz(
+                // }  // Incorrect delimiter but matches the earlier `{`
+                if !self.open_braces.iter().any(|&(b, _)| b == close_delim) {
+                    self.token = self.string_reader.next_token().0;
+                }
+            }
+            token::Eof => {
+                // Silently recover, the EOF token will be seen again
+                // and an error emitted then. Thus we don't pop from
+                // self.open_braces here.
+            }
+            _ => unreachable!(),
+        }
+
+        TokenTree::Delimited(delim_span, open_delim, tts)
+    }
+
+    fn close_delim_err(&mut self, delim: Delimiter) -> PErr<'a> {
+        // An unexpected closing delimiter (i.e., there is no
+        // matching opening delimiter).
+        let token_str = token_to_string(&self.token);
+        let msg = format!("unexpected closing delimiter: `{}`", token_str);
+        let mut err =
+            self.string_reader.sess.span_diagnostic.struct_span_err(self.token.span, &msg);
+
+        // Braces are added at the end, so the last element is the biggest block
+        if let Some(parent) = self.matching_block_spans.last() {
+            if let Some(span) = self.last_delim_empty_block_spans.remove(&delim) {
+                // Check if the (empty block) is in the last properly closed block
+                if (parent.0.to(parent.1)).contains(span) {
+                    err.span_label(span, "block is empty, you might have not meant to close it");
+                } else {
+                    err.span_label(parent.0, "this opening brace...");
+                    err.span_label(parent.1, "...matches this closing brace");
+                }
+            } else {
+                err.span_label(parent.0, "this opening brace...");
+                err.span_label(parent.1, "...matches this closing brace");
+            }
+        }
+
+        err.span_label(self.token.span, "unexpected closing delimiter");
+        err
+    }
+
+    #[inline]
+    fn parse_token_tree_non_delim_non_eof(&mut self) -> TokenTree {
+        // `this_spacing` for the returned token refers to whether the token is
+        // immediately followed by another op token. It is determined by the
+        // next token: its kind and its `preceded_by_whitespace` status.
+        let (next_tok, is_next_tok_preceded_by_whitespace) = self.string_reader.next_token();
+        let this_spacing = if is_next_tok_preceded_by_whitespace || !next_tok.is_op() {
+            Spacing::Alone
+        } else {
+            Spacing::Joint
+        };
+        let this_tok = std::mem::replace(&mut self.token, next_tok);
+        TokenTree::Token(this_tok, this_spacing)
+    }
 }
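Note: with `bump` gone, joint-vs-alone spacing is now computed at the single place that needs it: a token is `Joint` only if the next token is an operator token and was not preceded by whitespace. A stand-in sketch of that rule (not the compiler's types):

#[derive(Debug, PartialEq)]
enum Spacing {
    Alone,
    Joint,
}

fn this_spacing(next_is_op: bool, next_preceded_by_whitespace: bool) -> Spacing {
    if next_preceded_by_whitespace || !next_is_op { Spacing::Alone } else { Spacing::Joint }
}

fn main() {
    assert_eq!(this_spacing(true, false), Spacing::Joint);  // `>>` stays joint
    assert_eq!(this_spacing(true, true), Spacing::Alone);   // `> >` is alone
    assert_eq!(this_spacing(false, false), Spacing::Alone); // `>ident`
}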
@@ -13,6 +13,7 @@ use std::collections::VecDeque;
 use std::fmt::{Display, Write};

 use rustc_data_structures::fx::FxHashMap;
+use rustc_lexer::Cursor;
 use rustc_lexer::{LiteralKind, TokenKind};
 use rustc_span::edition::Edition;
 use rustc_span::symbol::Symbol;

@@ -408,15 +409,16 @@ enum Highlight<'a> {

 struct TokenIter<'a> {
     src: &'a str,
+    cursor: Cursor<'a>,
 }

 impl<'a> Iterator for TokenIter<'a> {
     type Item = (TokenKind, &'a str);
     fn next(&mut self) -> Option<(TokenKind, &'a str)> {
-        if self.src.is_empty() {
+        let token = self.cursor.advance_token();
+        if token.kind == TokenKind::Eof {
             return None;
         }
-        let token = rustc_lexer::first_token(self.src);
         let (text, rest) = self.src.split_at(token.len as usize);
         self.src = rest;
         Some((token.kind, text))

@@ -525,7 +527,7 @@ impl<'a> Classifier<'a> {
     /// Takes as argument the source code to HTML-ify, the rust edition to use and the source code
     /// file span which will be used later on by the `span_correspondance_map`.
     fn new(src: &str, file_span: Span, decoration_info: Option<DecorationInfo>) -> Classifier<'_> {
-        let tokens = PeekIter::new(TokenIter { src });
+        let tokens = PeekIter::new(TokenIter { src, cursor: Cursor::new(src) });
         let decorations = decoration_info.map(Decorations::new);
         Classifier {
             tokens,

@@ -850,6 +852,7 @@ impl<'a> Classifier<'a> {
                 Class::Ident(self.new_span(before, text))
             }
             TokenKind::Lifetime { .. } => Class::Lifetime,
+            TokenKind::Eof => panic!("Eof in advance"),
         };
         // Anything that didn't return above is the simple case where we the
         // class just spans a single token, so we can use the `string` method.
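Note: rustdoc's syntax highlighter now drives a `Cursor` directly and stops when the lexer reports end-of-input as an `Eof` token, rather than re-checking whether the remaining source slice is empty. A self-contained analogue of that iteration pattern, with a toy stand-in lexer rather than `rustc_lexer`:

// The loop ends when the lexer itself reports end-of-input as a token.
struct Token {
    len: usize,
    eof: bool,
}

// Toy stand-in for `Cursor::advance_token`: one whitespace-prefixed chunk.
fn advance(src: &str) -> Token {
    let trimmed = src.trim_start();
    if trimmed.is_empty() {
        return Token { len: src.len(), eof: true };
    }
    let skipped = src.len() - trimmed.len();
    let word = trimmed.split_whitespace().next().unwrap().len();
    Token { len: skipped + word, eof: false }
}

fn main() {
    let mut src = "fn main";
    let mut texts = Vec::new();
    loop {
        let token = advance(src);
        if token.eof {
            break;
        }
        let (text, rest) = src.split_at(token.len);
        src = rest;
        texts.push(text);
    }
    assert_eq!(texts, vec!["fn", " main"]);
}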
|