Auto merge of #102508 - nnethercote:even-more-lexer-improvements, r=matklad

Even more lexer improvements. These changes are just about code clarity, rather than performance. r? `@matklad`
2022-10-03 04:49:46 +00:00 · 2022-10-03 04:49:46 +00:00 · dbaf3e67aa
commit dbaf3e67aa
parent 607b8296e0 4e5ddf1adf
6 changed files with 70 additions and 80 deletions
--- a/compiler/rustc_ast/src/token.rs
+++ b/compiler/rustc_ast/src/token.rs
@ -345,17 +345,14 @@ impl Token {
    }
    pub fn is_op(&self) -> bool {
-        !matches!(
+        match self.kind {
-            self.kind,
+            Eq | Lt | Le | EqEq | Ne | Ge | Gt | AndAnd | OrOr | Not | Tilde | BinOp(_)
-            OpenDelim(..)
+            | BinOpEq(_) | At | Dot | DotDot | DotDotDot | DotDotEq | Comma | Semi | Colon
-                | CloseDelim(..)
+            | ModSep | RArrow | LArrow | FatArrow | Pound | Dollar | Question | SingleQuote => true,
-                | Literal(..)
+
-                | DocComment(..)
+            OpenDelim(..) | CloseDelim(..) | Literal(..) | DocComment(..) | Ident(..)
-                | Ident(..)
+            | Lifetime(..) | Interpolated(..) | Eof => false,
-                | Lifetime(..)
+        }
                | Interpolated(..)
                | Eof
        )
    }
    pub fn is_like_plus(&self) -> bool {
--- a/compiler/rustc_ast/src/tokenstream.rs
+++ b/compiler/rustc_ast/src/tokenstream.rs
@ -304,9 +304,20 @@ pub struct AttributesData {
 #[derive(Clone, Debug, Default, Encodable, Decodable)]
 pub struct TokenStream(pub(crate) Lrc<Vec<TokenTree>>);
 /// Similar to `proc_macro::Spacing`, but for tokens.
 ///
 /// Note that all `ast::TokenTree::Token` instances have a `Spacing`, but when
 /// we convert to `proc_macro::TokenTree` for proc macros only `Punct`
 /// `TokenTree`s have a `proc_macro::Spacing`.
 #[derive(Clone, Copy, Debug, PartialEq, Encodable, Decodable, HashStable_Generic)]
 pub enum Spacing {
    /// The token is not immediately followed by an operator token (as
    /// determined by `Token::is_op`). E.g. a `+` token is `Alone` in `+ =`,
    /// `+/*foo*/=`, `+ident`, and `+()`.
    Alone,
    /// The token is immediately followed by an operator token. E.g. a `+`
    /// token is `Joint` in `+=` and `++`.
    Joint,
 }
--- a/compiler/rustc_expand/src/proc_macro_server.rs
+++ b/compiler/rustc_expand/src/proc_macro_server.rs
@ -110,10 +110,14 @@ impl FromInternal<(TokenStream, &mut Rustc<'_, '_>)> for Vec<TokenTree<TokenStre
                tokenstream::TokenTree::Token(token, spacing) => (token, spacing == Joint),
            };
            // Split the operator into one or more `Punct`s, one per character.
            // The final one inherits the jointness of the original token. Any
            // before that get `joint = true`.
            let mut op = |s: &str| {
                assert!(s.is_ascii());
-                trees.extend(s.as_bytes().iter().enumerate().map(|(idx, &ch)| {
+                trees.extend(s.bytes().enumerate().map(|(idx, ch)| {
-                    TokenTree::Punct(Punct { ch, joint: joint || idx != s.len() - 1, span })
+                    let is_final = idx == s.len() - 1;
                    TokenTree::Punct(Punct { ch, joint: if is_final { joint } else { true }, span })
                }));
            };
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@ -52,7 +52,7 @@ pub(crate) fn parse_token_trees<'a>(
    let cursor = Cursor::new(src);
    let string_reader =
        StringReader { sess, start_pos, pos: start_pos, src, cursor, override_span };
-    tokentrees::TokenTreesReader::parse_token_trees(string_reader)
+    tokentrees::TokenTreesReader::parse_all_token_trees(string_reader)
 }
 struct StringReader<'a> {
--- a/compiler/rustc_parse/src/lexer/tokentrees.rs
+++ b/compiler/rustc_parse/src/lexer/tokentrees.rs
@ -27,7 +27,7 @@ pub(super) struct TokenTreesReader<'a> {
 }
 impl<'a> TokenTreesReader<'a> {
-    pub(super) fn parse_token_trees(
+    pub(super) fn parse_all_token_trees(
        string_reader: StringReader<'a>,
    ) -> (PResult<'a, TokenStream>, Vec<UnmatchedBrace>) {
        let mut tt_reader = TokenTreesReader {
@ -40,36 +40,51 @@ impl<'a> TokenTreesReader<'a> {
            last_delim_empty_block_spans: FxHashMap::default(),
            matching_block_spans: Vec::new(),
        };
-        let res = tt_reader.parse_all_token_trees();
+        let res = tt_reader.parse_token_trees(/* is_delimited */ false);
        (res, tt_reader.unmatched_braces)
    }
-    // Parse a stream of tokens into a list of `TokenTree`s, up to an `Eof`.
+    // Parse a stream of tokens into a list of `TokenTree`s.
-    fn parse_all_token_trees(&mut self) -> PResult<'a, TokenStream> {
+    fn parse_token_trees(&mut self, is_delimited: bool) -> PResult<'a, TokenStream> {
        self.token = self.string_reader.next_token().0;
-        let mut buf = TokenStreamBuilder::default();
+        let mut buf = Vec::new();
        loop {
            match self.token.kind {
                token::OpenDelim(delim) => buf.push(self.parse_token_tree_open_delim(delim)),
-                token::CloseDelim(delim) => return Err(self.close_delim_err(delim)),
+                token::CloseDelim(delim) => {
-                token::Eof => return Ok(buf.into_token_stream()),
+                    return if is_delimited {
-                _ => buf.push(self.parse_token_tree_non_delim_non_eof()),
+                        Ok(TokenStream::new(buf))
-            }
+                    } else {
-        }
+                        Err(self.close_delim_err(delim))
-    }
+                    };
-
+                }
-    // Parse a stream of tokens into a list of `TokenTree`s, up to a `CloseDelim`.
+                token::Eof => {
-    fn parse_token_trees_until_close_delim(&mut self) -> TokenStream {
+                    if is_delimited {
-        let mut buf = TokenStreamBuilder::default();
+                        self.eof_err().emit();
-        loop {
+                    }
-            match self.token.kind {
+                    return Ok(TokenStream::new(buf));
-                token::OpenDelim(delim) => buf.push(self.parse_token_tree_open_delim(delim)),
+                }
-                token::CloseDelim(..) => return buf.into_token_stream(),
+                _ => {
-                token::Eof => {
+                    // Get the next normal token. This might require getting multiple adjacent
-                    self.eof_err().emit();
+                    // single-char tokens and joining them together.
-                    return buf.into_token_stream();
+                    let (this_spacing, next_tok) = loop {
                        let (next_tok, is_next_tok_preceded_by_whitespace) =
                            self.string_reader.next_token();
                        if !is_next_tok_preceded_by_whitespace {
                            if let Some(glued) = self.token.glue(&next_tok) {
                                self.token = glued;
                            } else {
                                let this_spacing =
                                    if next_tok.is_op() { Spacing::Joint } else { Spacing::Alone };
                                break (this_spacing, next_tok);
                            }
                        } else {
                            break (Spacing::Alone, next_tok);
                        }
                    };
                    let this_tok = std::mem::replace(&mut self.token, next_tok);
                    buf.push(TokenTree::Token(this_tok, this_spacing));
                }
                _ => buf.push(self.parse_token_tree_non_delim_non_eof()),
            }
        }
    }
@ -113,14 +128,12 @@ impl<'a> TokenTreesReader<'a> {
        // The span for beginning of the delimited section
        let pre_span = self.token.span;
        // Move past the open delimiter.
        self.open_braces.push((open_delim, self.token.span));
        self.token = self.string_reader.next_token().0;
        // Parse the token trees within the delimiters.
        // We stop at any delimiter so we can try to recover if the user
        // uses an incorrect delimiter.
-        let tts = self.parse_token_trees_until_close_delim();
+        let tts = self.parse_token_trees(/* is_delimited */ true).unwrap();
        // Expand to cover the entire delimited token tree
        let delim_span = DelimSpan::from_pair(pre_span, self.token.span);
@ -242,43 +255,4 @@ impl<'a> TokenTreesReader<'a> {
        err.span_label(self.token.span, "unexpected closing delimiter");
        err
    }
    #[inline]
    fn parse_token_tree_non_delim_non_eof(&mut self) -> TokenTree {
        // `this_spacing` for the returned token refers to whether the token is
        // immediately followed by another op token. It is determined by the
        // next token: its kind and its `preceded_by_whitespace` status.
        let (next_tok, is_next_tok_preceded_by_whitespace) = self.string_reader.next_token();
        let this_spacing = if is_next_tok_preceded_by_whitespace || !next_tok.is_op() {
            Spacing::Alone
        } else {
            Spacing::Joint
        };
        let this_tok = std::mem::replace(&mut self.token, next_tok);
        TokenTree::Token(this_tok, this_spacing)
    }
 }
 #[derive(Default)]
 struct TokenStreamBuilder {
    buf: Vec<TokenTree>,
 }
 impl TokenStreamBuilder {
    #[inline(always)]
    fn push(&mut self, tree: TokenTree) {
        if let Some(TokenTree::Token(prev_token, Spacing::Joint)) = self.buf.last()
            && let TokenTree::Token(token, joint) = &tree
            && let Some(glued) = prev_token.glue(token)
        {
            self.buf.pop();
            self.buf.push(TokenTree::Token(glued, *joint));
        } else {
            self.buf.push(tree)
        }
    }
    fn into_token_stream(self) -> TokenStream {
        TokenStream::new(self.buf)
    }
 }
--- a/compiler/rustc_parse/src/parser/mod.rs
+++ b/compiler/rustc_parse/src/parser/mod.rs
@ -302,7 +302,10 @@ impl TokenCursor {
    fn desugar(&mut self, attr_style: AttrStyle, data: Symbol, span: Span) -> (Token, Spacing) {
        // Searches for the occurrences of `"#*` and returns the minimum number of `#`s
-        // required to wrap the text.
+        // required to wrap the text. E.g.
        // - `abc d` is wrapped as `r"abc d"` (num_of_hashes = 0)
        // - `abc "d"` is wrapped as `r#"abc "d""#` (num_of_hashes = 1)
        // - `abc "##d##"` is wrapped as `r###"abc "##d##""###` (num_of_hashes = 3)
        let mut num_of_hashes = 0;
        let mut count = 0;
        for ch in data.as_str().chars() {
@ -314,6 +317,7 @@ impl TokenCursor {
            num_of_hashes = cmp::max(num_of_hashes, count);
        }
        // `/// foo` becomes `doc = r"foo"`.
        let delim_span = DelimSpan::from_single(span);
        let body = TokenTree::Delimited(
            delim_span,