From dad95578b0f000807f23fe2d37ca37ea8bd8522c Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Fri, 12 Jul 2024 09:22:16 +1000
Subject: [PATCH 1/3] Add unit tests for `Parser::look_ahead`.

It's currently buggy, so some of the test results are surprising, as
described in the `FIXME` comments. The bugs will be fixed in subsequent
commits.
---
 compiler/rustc_parse/src/parser/tests.rs | 116 +++++++++++++++++++++++
 1 file changed, 116 insertions(+)

diff --git a/compiler/rustc_parse/src/parser/tests.rs b/compiler/rustc_parse/src/parser/tests.rs
index 42392ad2163..538ec6b8cfd 100644
--- a/compiler/rustc_parse/src/parser/tests.rs
+++ b/compiler/rustc_parse/src/parser/tests.rs
@@ -1376,6 +1376,122 @@ fn parse_expr_from_source_str(
     });
 }
 
+// Uses a macro rather than a function so that failure messages mention the
+// correct line in the test function.
+macro_rules! look {
+    ($p:ident, $dist:literal, $kind:expr) => {
+        $p.look_ahead($dist, |tok| assert_eq!($kind, tok.kind));
+    };
+}
+
+#[test]
+fn look_ahead() {
+    create_default_session_globals_then(|| {
+        let sym_f = Symbol::intern("f");
+        let sym_x = Symbol::intern("x");
+        #[allow(non_snake_case)]
+        let sym_S = Symbol::intern("S");
+        let raw_no = IdentIsRaw::No;
+
+        let psess = psess();
+        let mut p = string_to_parser(&psess, "fn f(x: u32) { x } struct S;".to_string());
+
+        // Current position is the `fn`.
+        look!(p, 0, token::Ident(kw::Fn, raw_no));
+        look!(p, 1, token::Ident(sym_f, raw_no));
+        look!(p, 2, token::OpenDelim(Delimiter::Parenthesis));
+        look!(p, 3, token::Ident(sym_x, raw_no));
+        look!(p, 4, token::Colon);
+        look!(p, 5, token::Ident(sym::u32, raw_no));
+        look!(p, 6, token::CloseDelim(Delimiter::Parenthesis));
+        look!(p, 7, token::OpenDelim(Delimiter::Brace));
+        look!(p, 8, token::Ident(sym_x, raw_no));
+        look!(p, 9, token::CloseDelim(Delimiter::Brace));
+        look!(p, 10, token::Ident(kw::Struct, raw_no));
+        look!(p, 11, token::Ident(sym_S, raw_no));
+        look!(p, 12, token::Semi);
+        // Any lookahead past the end of the token stream returns `Eof`.
+        look!(p, 13, token::Eof);
+        look!(p, 14, token::Eof);
+        look!(p, 15, token::Eof);
+        look!(p, 100, token::Eof);
+
+        // Move forward to the first `x`.
+        for _ in 0..3 {
+            p.bump();
+        }
+        look!(p, 0, token::Ident(sym_x, raw_no));
+        look!(p, 1, token::Colon);
+        look!(p, 2, token::Ident(sym::u32, raw_no));
+        look!(p, 3, token::CloseDelim(Delimiter::Parenthesis));
+        // FIXME(nnethercote) If we lookahead any distance past a close delim
+        // we currently return that close delim.
+        look!(p, 4, token::CloseDelim(Delimiter::Parenthesis));
+        look!(p, 5, token::CloseDelim(Delimiter::Parenthesis));
+        look!(p, 6, token::CloseDelim(Delimiter::Parenthesis));
+        look!(p, 100, token::CloseDelim(Delimiter::Parenthesis));
+
+        // Move forward to the `;`.
+        for _ in 0..9 {
+            p.bump();
+        }
+        look!(p, 0, token::Semi);
+        // Any lookahead past the end of the token stream returns `Eof`.
+        look!(p, 1, token::Eof);
+        look!(p, 100, token::Eof);
+
+        // Move one past the `;`, i.e. past the end of the token stream.
+        p.bump();
+        look!(p, 0, token::Eof);
+        look!(p, 1, token::Eof);
+        look!(p, 100, token::Eof);
+
+        // Bumping after Eof is idempotent.
+        p.bump();
+        look!(p, 0, token::Eof);
+        look!(p, 1, token::Eof);
+        look!(p, 100, token::Eof);
+    });
+}
+
+/// FIXME(nnethercote) Currently there is some buggy behaviour when using
+/// `look_ahead` not within the outermost token stream, as this test shows.
+#[test]
+fn look_ahead_non_outermost_stream() {
+    create_default_session_globals_then(|| {
+        let sym_f = Symbol::intern("f");
+        #[allow(non_snake_case)]
+        let sym_S = Symbol::intern("S");
+        let raw_no = IdentIsRaw::No;
+
+        let psess = psess();
+        let mut p = string_to_parser(&psess, "mod m { fn f(x: u32) { x } struct S; }".to_string());
+
+        // Move forward to the `fn`, which is not within the outermost token
+        // stream (because it's inside the `mod { ... }`).
+        for _ in 0..3 {
+            p.bump();
+        }
+        look!(p, 0, token::Ident(kw::Fn, raw_no));
+        look!(p, 1, token::Ident(sym_f, raw_no));
+        look!(p, 2, token::OpenDelim(Delimiter::Parenthesis));
+        // FIXME(nnethercote) The current code incorrectly skips the `x: u32)`
+        // to the next token tree.
+        look!(p, 3, token::OpenDelim(Delimiter::Brace));
+        // FIXME(nnethercote) The current code incorrectly skips the `x }`
+        // to the next token tree.
+        look!(p, 4, token::Ident(kw::Struct, raw_no));
+        look!(p, 5, token::Ident(sym_S, raw_no));
+        look!(p, 6, token::Semi);
+        // FIXME(nnethercote) If we lookahead any distance past a close delim
+        // we currently return that close delim.
+        look!(p, 7, token::CloseDelim(Delimiter::Brace));
+        look!(p, 8, token::CloseDelim(Delimiter::Brace));
+        look!(p, 9, token::CloseDelim(Delimiter::Brace));
+        look!(p, 100, token::CloseDelim(Delimiter::Brace));
+    });
+}
+
 // This tests that when parsing a string (rather than a file) we don't try
 // and read in a file for a module declaration and just parse a stub.
 // See `recurse_into_file_modules` in the parser.

From ebe1305b1e0bb32913b309ce65bd97106532ad6a Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Fri, 12 Jul 2024 12:56:58 +1000
Subject: [PATCH 2/3] Remove the bogus special case from `Parser::look_ahead`.

The general case at the bottom of `look_ahead` is slow, because it
clones the token cursor. Above it there is a special case for
performance that is hit most of the time and avoids the cloning.
Unfortunately, its behaviour differs from the general case in two ways.

- When within a pair of delimiters, if you look any distance past the
  closing delimiter you get the closing delimiter instead of what comes
  after the closing delimiter.

- It uses `tree_cursor.look_ahead(dist - 1)` which totally confuses
  tokens with token trees. This means that only the first token in a
  token tree will be seen. E.g. in a sequence like `{ a }` the `a` and
  `}` will be skipped over. Bad!

It's likely that these differences weren't noticed before now because
the use of `look_ahead` in the parser is limited to small distances and
relatively few contexts.

Removing the special case causes slowdowns up of to 2% on a range of
benchmarks. The next commit will add a new, correct special case to
regain that lost performance.
---
 compiler/rustc_parse/src/parser/mod.rs   | 37 +------------------
 compiler/rustc_parse/src/parser/tests.rs | 47 +++++++++++++-----------
 2 files changed, 28 insertions(+), 56 deletions(-)

diff --git a/compiler/rustc_parse/src/parser/mod.rs b/compiler/rustc_parse/src/parser/mod.rs
index 45ca267fe5d..f906a2ecab7 100644
--- a/compiler/rustc_parse/src/parser/mod.rs
+++ b/compiler/rustc_parse/src/parser/mod.rs
@@ -1118,41 +1118,8 @@ pub fn look_ahead<R>(&self, dist: usize, looker: impl FnOnce(&Token) -> R) -> R
             return looker(&self.token);
         }
 
-        if let Some(&(_, span, _, delim)) = self.token_cursor.stack.last()
-            && delim != Delimiter::Invisible
-        {
-            // We are not in the outermost token stream, and the token stream
-            // we are in has non-skipped delimiters. Look for skipped
-            // delimiters in the lookahead range.
-            let tree_cursor = &self.token_cursor.tree_cursor;
-            let all_normal = (0..dist).all(|i| {
-                let token = tree_cursor.look_ahead(i);
-                !matches!(token, Some(TokenTree::Delimited(.., Delimiter::Invisible, _)))
-            });
-            if all_normal {
-                // There were no skipped delimiters. Do lookahead by plain indexing.
-                return match tree_cursor.look_ahead(dist - 1) {
-                    Some(tree) => {
-                        // Indexing stayed within the current token stream.
-                        match tree {
-                            TokenTree::Token(token, _) => looker(token),
-                            TokenTree::Delimited(dspan, _, delim, _) => {
-                                looker(&Token::new(token::OpenDelim(*delim), dspan.open))
-                            }
-                        }
-                    }
-                    None => {
-                        // Indexing went past the end of the current token
-                        // stream. Use the close delimiter, no matter how far
-                        // ahead `dist` went.
-                        looker(&Token::new(token::CloseDelim(delim), span.close))
-                    }
-                };
-            }
-        }
-
-        // We are in a more complex case. Just clone the token cursor and use
-        // `next`, skipping delimiters as necessary. Slow but simple.
+        // Just clone the token cursor and use `next`, skipping delimiters as
+        // necessary. Slow but simple.
         let mut cursor = self.token_cursor.clone();
         let mut i = 0;
         let mut token = Token::dummy();
diff --git a/compiler/rustc_parse/src/parser/tests.rs b/compiler/rustc_parse/src/parser/tests.rs
index 538ec6b8cfd..5b2d119e42b 100644
--- a/compiler/rustc_parse/src/parser/tests.rs
+++ b/compiler/rustc_parse/src/parser/tests.rs
@@ -1424,12 +1424,15 @@ fn look_ahead() {
         look!(p, 1, token::Colon);
         look!(p, 2, token::Ident(sym::u32, raw_no));
         look!(p, 3, token::CloseDelim(Delimiter::Parenthesis));
-        // FIXME(nnethercote) If we lookahead any distance past a close delim
-        // we currently return that close delim.
-        look!(p, 4, token::CloseDelim(Delimiter::Parenthesis));
-        look!(p, 5, token::CloseDelim(Delimiter::Parenthesis));
-        look!(p, 6, token::CloseDelim(Delimiter::Parenthesis));
-        look!(p, 100, token::CloseDelim(Delimiter::Parenthesis));
+        look!(p, 4, token::OpenDelim(Delimiter::Brace));
+        look!(p, 5, token::Ident(sym_x, raw_no));
+        look!(p, 6, token::CloseDelim(Delimiter::Brace));
+        look!(p, 7, token::Ident(kw::Struct, raw_no));
+        look!(p, 8, token::Ident(sym_S, raw_no));
+        look!(p, 9, token::Semi);
+        look!(p, 10, token::Eof);
+        look!(p, 11, token::Eof);
+        look!(p, 100, token::Eof);
 
         // Move forward to the `;`.
         for _ in 0..9 {
@@ -1454,12 +1457,13 @@ fn look_ahead() {
     });
 }
 
-/// FIXME(nnethercote) Currently there is some buggy behaviour when using
-/// `look_ahead` not within the outermost token stream, as this test shows.
+/// There used to be some buggy behaviour when using `look_ahead` not within
+/// the outermost token stream, which this test covers.
 #[test]
 fn look_ahead_non_outermost_stream() {
     create_default_session_globals_then(|| {
         let sym_f = Symbol::intern("f");
+        let sym_x = Symbol::intern("x");
         #[allow(non_snake_case)]
         let sym_S = Symbol::intern("S");
         let raw_no = IdentIsRaw::No;
@@ -1475,20 +1479,21 @@ fn look_ahead_non_outermost_stream() {
         look!(p, 0, token::Ident(kw::Fn, raw_no));
         look!(p, 1, token::Ident(sym_f, raw_no));
         look!(p, 2, token::OpenDelim(Delimiter::Parenthesis));
-        // FIXME(nnethercote) The current code incorrectly skips the `x: u32)`
-        // to the next token tree.
-        look!(p, 3, token::OpenDelim(Delimiter::Brace));
-        // FIXME(nnethercote) The current code incorrectly skips the `x }`
-        // to the next token tree.
-        look!(p, 4, token::Ident(kw::Struct, raw_no));
-        look!(p, 5, token::Ident(sym_S, raw_no));
-        look!(p, 6, token::Semi);
-        // FIXME(nnethercote) If we lookahead any distance past a close delim
-        // we currently return that close delim.
-        look!(p, 7, token::CloseDelim(Delimiter::Brace));
-        look!(p, 8, token::CloseDelim(Delimiter::Brace));
+        look!(p, 3, token::Ident(sym_x, raw_no));
+        look!(p, 4, token::Colon);
+        look!(p, 5, token::Ident(sym::u32, raw_no));
+        look!(p, 6, token::CloseDelim(Delimiter::Parenthesis));
+        look!(p, 7, token::OpenDelim(Delimiter::Brace));
+        look!(p, 8, token::Ident(sym_x, raw_no));
         look!(p, 9, token::CloseDelim(Delimiter::Brace));
-        look!(p, 100, token::CloseDelim(Delimiter::Brace));
+        look!(p, 10, token::Ident(kw::Struct, raw_no));
+        look!(p, 11, token::Ident(sym_S, raw_no));
+        look!(p, 12, token::Semi);
+        look!(p, 13, token::CloseDelim(Delimiter::Brace));
+        // Any lookahead past the end of the token stream returns `Eof`.
+        look!(p, 14, token::Eof);
+        look!(p, 15, token::Eof);
+        look!(p, 100, token::Eof);
     });
 }
 

From 100f3fd133d928e9c86cff202fa32e5e4d0ce6c7 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Fri, 12 Jul 2024 13:20:24 +1000
Subject: [PATCH 3/3] Add a new special case to `Parser::look_ahead`.

This new special case is simpler than the old special case because it
only is used when `dist == 1`. But that's still enough to cover ~98% of
cases. This results in equivalent performance to the old special case,
and identical behaviour as the general case.
---
 compiler/rustc_parse/src/parser/mod.rs | 29 ++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/compiler/rustc_parse/src/parser/mod.rs b/compiler/rustc_parse/src/parser/mod.rs
index f906a2ecab7..ef9b3aabc61 100644
--- a/compiler/rustc_parse/src/parser/mod.rs
+++ b/compiler/rustc_parse/src/parser/mod.rs
@@ -1118,6 +1118,35 @@ pub fn look_ahead<R>(&self, dist: usize, looker: impl FnOnce(&Token) -> R) -> R
             return looker(&self.token);
         }
 
+        // Typically around 98% of the `dist > 0` cases have `dist == 1`, so we
+        // have a fast special case for that.
+        if dist == 1 {
+            // The index is zero because the tree cursor's index always points
+            // to the next token to be gotten.
+            match self.token_cursor.tree_cursor.look_ahead(0) {
+                Some(tree) => {
+                    // Indexing stayed within the current token tree.
+                    return match tree {
+                        TokenTree::Token(token, _) => looker(token),
+                        TokenTree::Delimited(dspan, _, delim, _) => {
+                            looker(&Token::new(token::OpenDelim(*delim), dspan.open))
+                        }
+                    };
+                }
+                None => {
+                    // The tree cursor lookahead went (one) past the end of the
+                    // current token tree. Try to return a close delimiter.
+                    if let Some(&(_, span, _, delim)) = self.token_cursor.stack.last()
+                        && delim != Delimiter::Invisible
+                    {
+                        // We are not in the outermost token stream, so we have
+                        // delimiters. Also, those delimiters are not skipped.
+                        return looker(&Token::new(token::CloseDelim(delim), span.close));
+                    }
+                }
+            }
+        }
+
         // Just clone the token cursor and use `next`, skipping delimiters as
         // necessary. Slow but simple.
         let mut cursor = self.token_cursor.clone();