From 3be86e6528db24fb055530ef93c93b2a9fc9ce90 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 28 Sep 2022 10:07:01 +1000 Subject: [PATCH 1/8] Clarify operator splitting. I found this code hard to read. --- compiler/rustc_expand/src/proc_macro_server.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/compiler/rustc_expand/src/proc_macro_server.rs b/compiler/rustc_expand/src/proc_macro_server.rs index 59a7b668a83..ff09a9fb87a 100644 --- a/compiler/rustc_expand/src/proc_macro_server.rs +++ b/compiler/rustc_expand/src/proc_macro_server.rs @@ -110,10 +110,14 @@ impl FromInternal<(TokenStream, &mut Rustc<'_, '_>)> for Vec (token, spacing == Joint), }; + // Split the operator into one or more `Punct`s, one per character. + // The final one inherits the jointness of the original token. Any + // before that get `joint = true`. let mut op = |s: &str| { assert!(s.is_ascii()); - trees.extend(s.as_bytes().iter().enumerate().map(|(idx, &ch)| { - TokenTree::Punct(Punct { ch, joint: joint || idx != s.len() - 1, span }) + trees.extend(s.bytes().enumerate().map(|(idx, ch)| { + let is_final = idx == s.len() - 1; + TokenTree::Punct(Punct { ch, joint: if is_final { joint } else { true }, span }) })); }; From bbb53bf7727f07c1ea6d7e2d36dc51fbfc6b6726 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 28 Sep 2022 11:20:42 +1000 Subject: [PATCH 2/8] Add comments to `Spacing`. --- compiler/rustc_ast/src/tokenstream.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/compiler/rustc_ast/src/tokenstream.rs b/compiler/rustc_ast/src/tokenstream.rs index 824206a99d8..4d2049cbc41 100644 --- a/compiler/rustc_ast/src/tokenstream.rs +++ b/compiler/rustc_ast/src/tokenstream.rs @@ -304,9 +304,20 @@ pub struct AttributesData { #[derive(Clone, Debug, Default, Encodable, Decodable)] pub struct TokenStream(pub(crate) Lrc>); +/// Similar to `proc_macro::Spacing`, but for tokens. +/// +/// Note that all `ast::TokenTree::Token` instances have a `Spacing`, but when +/// we convert to `proc_macro::TokenTree` for proc macros only `Punct` +/// `TokenTree`s have a `proc_macro::Spacing`. #[derive(Clone, Copy, Debug, PartialEq, Encodable, Decodable, HashStable_Generic)] pub enum Spacing { + /// The token is not immediately followed by an operator token (as + /// determined by `Token::is_op`). E.g. a `+` token is `Alone` in `+ =`, + /// `+/*foo*/=`, `+ident`, and `+()`. Alone, + + /// The token is immediately followed by an operator token. E.g. a `+` + /// token is `Joint` in `+=` and `++`. Joint, } From 40e4827fd2ad3b050bfaaf7450ed5e6a5407ff9d Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 28 Sep 2022 11:29:47 +1000 Subject: [PATCH 3/8] Rewrite `Token::is_op`. An exhaustive match is more readable and more future-proof. --- compiler/rustc_ast/src/token.rs | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/compiler/rustc_ast/src/token.rs b/compiler/rustc_ast/src/token.rs index 99034799b3c..16224d71e45 100644 --- a/compiler/rustc_ast/src/token.rs +++ b/compiler/rustc_ast/src/token.rs @@ -345,17 +345,14 @@ impl Token { } pub fn is_op(&self) -> bool { - !matches!( - self.kind, - OpenDelim(..) - | CloseDelim(..) - | Literal(..) - | DocComment(..) - | Ident(..) - | Lifetime(..) - | Interpolated(..) 
- | Eof - ) + match self.kind { + Eq | Lt | Le | EqEq | Ne | Ge | Gt | AndAnd | OrOr | Not | Tilde | BinOp(_) + | BinOpEq(_) | At | Dot | DotDot | DotDotDot | DotDotEq | Comma | Semi | Colon + | ModSep | RArrow | LArrow | FatArrow | Pound | Dollar | Question | SingleQuote => true, + + OpenDelim(..) | CloseDelim(..) | Literal(..) | DocComment(..) | Ident(..) + | Lifetime(..) | Interpolated(..) | Eof => false, + } } pub fn is_like_plus(&self) -> bool { From 9de9cf19d7da502e08b93c32d89aa9850e70a595 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 30 Sep 2022 07:39:54 +1000 Subject: [PATCH 4/8] Add comments to `TokenCursor::desugar`. It took me some time to work out what this code was doing. --- compiler/rustc_parse/src/parser/mod.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/compiler/rustc_parse/src/parser/mod.rs b/compiler/rustc_parse/src/parser/mod.rs index 2aebaf7c3af..b934e087608 100644 --- a/compiler/rustc_parse/src/parser/mod.rs +++ b/compiler/rustc_parse/src/parser/mod.rs @@ -302,7 +302,10 @@ impl TokenCursor { fn desugar(&mut self, attr_style: AttrStyle, data: Symbol, span: Span) -> (Token, Spacing) { // Searches for the occurrences of `"#*` and returns the minimum number of `#`s - // required to wrap the text. + // required to wrap the text. E.g. + // - `abc d` is wrapped as `r"abc d"` (num_of_hashes = 0) + // - `abc "d"` is wrapped as `r#"abc "d""#` (num_of_hashes = 1) + // - `abc "##d##"` is wrapped as `r###"abc "d""###` (num_of_hashes = 3) let mut num_of_hashes = 0; let mut count = 0; for ch in data.as_str().chars() { @@ -314,6 +317,7 @@ impl TokenCursor { num_of_hashes = cmp::max(num_of_hashes, count); } + // `/// foo` becomes `doc = r"foo". let delim_span = DelimSpan::from_single(span); let body = TokenTree::Delimited( delim_span, From ce7676829e4786a44e8199e5df6e0cf9035a6d0f Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 30 Sep 2022 16:43:11 +1000 Subject: [PATCH 5/8] Merge `parse_token_trees_until_close_delim` and `parse_all_token_trees`. Because they're very similar, and this will allow some follow-up changes. 
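As a rough standalone sketch of the shape this merge produces (a simplified model only, not the actual compiler code: `Tok`, `parse`, the `Result<_, String>` error handling and the `eprintln!` diagnostic are all invented here for illustration), the merged routine is a single loop whose close-delimiter and EOF arms are keyed off an `is_top_level` flag:

    // One routine replaces the two near-identical loops. `is_top_level`
    // selects the EOF and close-delimiter behaviour, which is the only
    // real difference between the two functions being merged.
    enum Tok {
        Open,
        Close,
        Eof,
        Other(char),
    }

    fn parse(
        tokens: &mut impl Iterator<Item = Tok>,
        is_top_level: bool,
    ) -> Result<Vec<char>, String> {
        let mut buf = Vec::new();
        loop {
            match tokens.next().unwrap_or(Tok::Eof) {
                // A nested group: recurse with `is_top_level = false`.
                Tok::Open => buf.extend(parse(&mut *tokens, false)?),
                // A close delimiter ends a nested group normally...
                Tok::Close if !is_top_level => return Ok(buf),
                // ...but is an error at the top level.
                Tok::Close => return Err("unexpected closing delimiter".to_string()),
                // EOF ends the top level normally; inside a group it means an
                // unclosed delimiter, which we report and then recover from.
                Tok::Eof => {
                    if !is_top_level {
                        eprintln!("unclosed delimiter");
                    }
                    return Ok(buf);
                }
                Tok::Other(c) => buf.push(c),
            }
        }
    }

    fn main() {
        // `( a b )`-style input parses; a stray `)` at the top level is rejected.
        let mut toks = vec![Tok::Open, Tok::Other('a'), Tok::Other('b'), Tok::Close].into_iter();
        assert_eq!(parse(&mut toks, true), Ok(vec!['a', 'b']));

        let mut toks = vec![Tok::Close].into_iter();
        assert!(parse(&mut toks, true).is_err());
    }

The real function builds a `TokenStream`, goes through `parse_token_tree_open_delim` for open delimiters, and reports errors via `close_delim_err` and `eof_err`, but the control flow is the same.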
--- compiler/rustc_parse/src/lexer/mod.rs | 2 +- compiler/rustc_parse/src/lexer/tokentrees.rs | 39 ++++++++------------ 2 files changed, 17 insertions(+), 24 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index bcd078a8967..88540e13ef2 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -52,7 +52,7 @@ pub(crate) fn parse_token_trees<'a>( let cursor = Cursor::new(src); let string_reader = StringReader { sess, start_pos, pos: start_pos, src, cursor, override_span }; - tokentrees::TokenTreesReader::parse_token_trees(string_reader) + tokentrees::TokenTreesReader::parse_all_token_trees(string_reader) } struct StringReader<'a> { diff --git a/compiler/rustc_parse/src/lexer/tokentrees.rs b/compiler/rustc_parse/src/lexer/tokentrees.rs index 364753154db..b06f23d7c7b 100644 --- a/compiler/rustc_parse/src/lexer/tokentrees.rs +++ b/compiler/rustc_parse/src/lexer/tokentrees.rs @@ -27,7 +27,7 @@ pub(super) struct TokenTreesReader<'a> { } impl<'a> TokenTreesReader<'a> { - pub(super) fn parse_token_trees( + pub(super) fn parse_all_token_trees( string_reader: StringReader<'a>, ) -> (PResult<'a, TokenStream>, Vec) { let mut tt_reader = TokenTreesReader { @@ -40,34 +40,29 @@ impl<'a> TokenTreesReader<'a> { last_delim_empty_block_spans: FxHashMap::default(), matching_block_spans: Vec::new(), }; - let res = tt_reader.parse_all_token_trees(); + let res = tt_reader.parse_token_trees(/* is_top_level */ true); (res, tt_reader.unmatched_braces) } - // Parse a stream of tokens into a list of `TokenTree`s, up to an `Eof`. - fn parse_all_token_trees(&mut self) -> PResult<'a, TokenStream> { + // Parse a stream of tokens into a list of `TokenTree`s. + fn parse_token_trees(&mut self, is_top_level: bool) -> PResult<'a, TokenStream> { self.token = self.string_reader.next_token().0; let mut buf = TokenStreamBuilder::default(); loop { match self.token.kind { token::OpenDelim(delim) => buf.push(self.parse_token_tree_open_delim(delim)), - token::CloseDelim(delim) => return Err(self.close_delim_err(delim)), - token::Eof => return Ok(buf.into_token_stream()), - _ => buf.push(self.parse_token_tree_non_delim_non_eof()), - } - } - } - - // Parse a stream of tokens into a list of `TokenTree`s, up to a `CloseDelim`. - fn parse_token_trees_until_close_delim(&mut self) -> TokenStream { - let mut buf = TokenStreamBuilder::default(); - loop { - match self.token.kind { - token::OpenDelim(delim) => buf.push(self.parse_token_tree_open_delim(delim)), - token::CloseDelim(..) => return buf.into_token_stream(), + token::CloseDelim(delim) => { + return if !is_top_level { + Ok(buf.into_token_stream()) + } else { + Err(self.close_delim_err(delim)) + }; + } token::Eof => { - self.eof_err().emit(); - return buf.into_token_stream(); + if !is_top_level { + self.eof_err().emit(); + } + return Ok(buf.into_token_stream()); } _ => buf.push(self.parse_token_tree_non_delim_non_eof()), } @@ -113,14 +108,12 @@ impl<'a> TokenTreesReader<'a> { // The span for beginning of the delimited section let pre_span = self.token.span; - // Move past the open delimiter. self.open_braces.push((open_delim, self.token.span)); - self.token = self.string_reader.next_token().0; // Parse the token trees within the delimiters. // We stop at any delimiter so we can try to recover if the user // uses an incorrect delimiter. 
- let tts = self.parse_token_trees_until_close_delim(); + let tts = self.parse_token_trees(/* is_top_level */ false).unwrap(); // Expand to cover the entire delimited token tree let delim_span = DelimSpan::from_pair(pre_span, self.token.span); From 8d0754d602d8d6fd2b357d98ee0bdaf2382b937a Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 30 Sep 2022 16:50:02 +1000 Subject: [PATCH 6/8] Inline and remove `parse_token_tree_non_delim_non_eof`. It has a single call site. --- compiler/rustc_parse/src/lexer/tokentrees.rs | 30 +++++++++----------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/tokentrees.rs b/compiler/rustc_parse/src/lexer/tokentrees.rs index b06f23d7c7b..e3ccfc65462 100644 --- a/compiler/rustc_parse/src/lexer/tokentrees.rs +++ b/compiler/rustc_parse/src/lexer/tokentrees.rs @@ -64,7 +64,20 @@ impl<'a> TokenTreesReader<'a> { } return Ok(buf.into_token_stream()); } - _ => buf.push(self.parse_token_tree_non_delim_non_eof()), + _ => { + // `this_spacing` for the returned token refers to whether the token is + // immediately followed by another op token. It is determined by the + // next token: its kind and its `preceded_by_whitespace` status. + let (next_tok, is_next_tok_preceded_by_whitespace) = + self.string_reader.next_token(); + let this_spacing = if is_next_tok_preceded_by_whitespace || !next_tok.is_op() { + Spacing::Alone + } else { + Spacing::Joint + }; + let this_tok = std::mem::replace(&mut self.token, next_tok); + buf.push(TokenTree::Token(this_tok, this_spacing)) + } } } } @@ -235,21 +248,6 @@ impl<'a> TokenTreesReader<'a> { err.span_label(self.token.span, "unexpected closing delimiter"); err } - - #[inline] - fn parse_token_tree_non_delim_non_eof(&mut self) -> TokenTree { - // `this_spacing` for the returned token refers to whether the token is - // immediately followed by another op token. It is determined by the - // next token: its kind and its `preceded_by_whitespace` status. - let (next_tok, is_next_tok_preceded_by_whitespace) = self.string_reader.next_token(); - let this_spacing = if is_next_tok_preceded_by_whitespace || !next_tok.is_op() { - Spacing::Alone - } else { - Spacing::Joint - }; - let this_tok = std::mem::replace(&mut self.token, next_tok); - TokenTree::Token(this_tok, this_spacing) - } } #[derive(Default)] From a822d08bd1145f14838b5483582b574c8c12df52 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 30 Sep 2022 16:51:35 +1000 Subject: [PATCH 7/8] Remove `TokenStreamBuilder`. It's now only used in one function. Also, the "should we glue the tokens?" check is only necessary when pushing a `TokenTree::Token`, not when pushing a `TokenTree::Delimited`. As part of this, we now do the "should we glue the tokens?" check immediately, which avoids having look back at the previous token. It also puts all the logic dealing with token gluing in a single place. --- compiler/rustc_parse/src/lexer/tokentrees.rs | 57 +++++++------------- 1 file changed, 20 insertions(+), 37 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/tokentrees.rs b/compiler/rustc_parse/src/lexer/tokentrees.rs index e3ccfc65462..0af52043d37 100644 --- a/compiler/rustc_parse/src/lexer/tokentrees.rs +++ b/compiler/rustc_parse/src/lexer/tokentrees.rs @@ -47,13 +47,13 @@ impl<'a> TokenTreesReader<'a> { // Parse a stream of tokens into a list of `TokenTree`s. 
fn parse_token_trees(&mut self, is_top_level: bool) -> PResult<'a, TokenStream> { self.token = self.string_reader.next_token().0; - let mut buf = TokenStreamBuilder::default(); + let mut buf = Vec::new(); loop { match self.token.kind { token::OpenDelim(delim) => buf.push(self.parse_token_tree_open_delim(delim)), token::CloseDelim(delim) => { return if !is_top_level { - Ok(buf.into_token_stream()) + Ok(TokenStream::new(buf)) } else { Err(self.close_delim_err(delim)) }; @@ -62,21 +62,28 @@ impl<'a> TokenTreesReader<'a> { if !is_top_level { self.eof_err().emit(); } - return Ok(buf.into_token_stream()); + return Ok(TokenStream::new(buf)); } _ => { - // `this_spacing` for the returned token refers to whether the token is - // immediately followed by another op token. It is determined by the - // next token: its kind and its `preceded_by_whitespace` status. - let (next_tok, is_next_tok_preceded_by_whitespace) = - self.string_reader.next_token(); - let this_spacing = if is_next_tok_preceded_by_whitespace || !next_tok.is_op() { - Spacing::Alone - } else { - Spacing::Joint + // Get the next normal token. This might require getting multiple adjacent + // single-char tokens and joining them together. + let (this_spacing, next_tok) = loop { + let (next_tok, is_next_tok_preceded_by_whitespace) = + self.string_reader.next_token(); + if !is_next_tok_preceded_by_whitespace { + if let Some(glued) = self.token.glue(&next_tok) { + self.token = glued; + } else { + let this_spacing = + if next_tok.is_op() { Spacing::Joint } else { Spacing::Alone }; + break (this_spacing, next_tok); + } + } else { + break (Spacing::Alone, next_tok); + } }; let this_tok = std::mem::replace(&mut self.token, next_tok); - buf.push(TokenTree::Token(this_tok, this_spacing)) + buf.push(TokenTree::Token(this_tok, this_spacing)); } } } @@ -249,27 +256,3 @@ impl<'a> TokenTreesReader<'a> { err } } - -#[derive(Default)] -struct TokenStreamBuilder { - buf: Vec, -} - -impl TokenStreamBuilder { - #[inline(always)] - fn push(&mut self, tree: TokenTree) { - if let Some(TokenTree::Token(prev_token, Spacing::Joint)) = self.buf.last() - && let TokenTree::Token(token, joint) = &tree - && let Some(glued) = prev_token.glue(token) - { - self.buf.pop(); - self.buf.push(TokenTree::Token(glued, *joint)); - } else { - self.buf.push(tree) - } - } - - fn into_token_stream(self) -> TokenStream { - TokenStream::new(self.buf) - } -} From 4e5ddf1adf09c5d1c425b1afeef8f1ac19f05562 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Sat, 1 Oct 2022 07:57:22 +1000 Subject: [PATCH 8/8] Invert `is_top_level` to avoid negation. --- compiler/rustc_parse/src/lexer/tokentrees.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/tokentrees.rs b/compiler/rustc_parse/src/lexer/tokentrees.rs index 0af52043d37..b2701817d48 100644 --- a/compiler/rustc_parse/src/lexer/tokentrees.rs +++ b/compiler/rustc_parse/src/lexer/tokentrees.rs @@ -40,26 +40,26 @@ impl<'a> TokenTreesReader<'a> { last_delim_empty_block_spans: FxHashMap::default(), matching_block_spans: Vec::new(), }; - let res = tt_reader.parse_token_trees(/* is_top_level */ true); + let res = tt_reader.parse_token_trees(/* is_delimited */ false); (res, tt_reader.unmatched_braces) } // Parse a stream of tokens into a list of `TokenTree`s. 
- fn parse_token_trees(&mut self, is_top_level: bool) -> PResult<'a, TokenStream> { + fn parse_token_trees(&mut self, is_delimited: bool) -> PResult<'a, TokenStream> { self.token = self.string_reader.next_token().0; let mut buf = Vec::new(); loop { match self.token.kind { token::OpenDelim(delim) => buf.push(self.parse_token_tree_open_delim(delim)), token::CloseDelim(delim) => { - return if !is_top_level { + return if is_delimited { Ok(TokenStream::new(buf)) } else { Err(self.close_delim_err(delim)) }; } token::Eof => { - if !is_top_level { + if is_delimited { self.eof_err().emit(); } return Ok(TokenStream::new(buf)); @@ -133,7 +133,7 @@ impl<'a> TokenTreesReader<'a> { // Parse the token trees within the delimiters. // We stop at any delimiter so we can try to recover if the user // uses an incorrect delimiter. - let tts = self.parse_token_trees(/* is_top_level */ false).unwrap(); + let tts = self.parse_token_trees(/* is_delimited */ true).unwrap(); // Expand to cover the entire delimited token tree let delim_span = DelimSpan::from_pair(pre_span, self.token.span);
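To see in isolation the rule that patches 6 and 7 centralise, here is a standalone sketch (not the real lexer: `glue` and `spacing` below are simplified stand-ins for `Token::glue` and the `Spacing` decision, and only a few operator pairs are modelled). Adjacent tokens are first glued into longer operators when nothing separates them; only once gluing fails do we record `Spacing::Joint` or `Spacing::Alone`, based on whether the next token is an operator token and whether whitespace precedes it.

    // Standalone sketch of the per-token decision in `parse_token_trees`:
    // try to glue the current token with the next one first; once gluing
    // fails, record the current token's `Spacing`.
    #[derive(Clone, Copy, Debug, PartialEq)]
    enum Spacing {
        Alone, // not immediately followed by an operator token
        Joint, // immediately followed by an operator token
    }

    // Stand-in for `Token::glue`, covering only a few illustrative pairs.
    fn glue(a: &str, b: &str) -> Option<&'static str> {
        match (a, b) {
            ("<", "<") => Some("<<"),
            ("<<", "=") => Some("<<="),
            ("+", "=") => Some("+="),
            _ => None,
        }
    }

    // Mirrors the condition added in patches 6 and 7: `Joint` only if the next
    // token is an operator token and is not preceded by whitespace.
    fn spacing(next_is_op: bool, next_preceded_by_whitespace: bool) -> Spacing {
        if next_preceded_by_whitespace || !next_is_op {
            Spacing::Alone
        } else {
            Spacing::Joint
        }
    }

    fn main() {
        // `<<=` lexes as three one-char tokens with nothing between them: they glue.
        assert_eq!(glue("<", "<").and_then(|t| glue(t, "=")), Some("<<="));
        // `+ =` (whitespace in between): no gluing, and the `+` is recorded as Alone.
        assert_eq!(spacing(true, true), Spacing::Alone);
        // `+!`: the pair does not glue, but `+` is still Joint because `!` is an op.
        assert_eq!(glue("+", "!"), None);
        assert_eq!(spacing(true, false), Spacing::Joint);
        // `+x`: an identifier follows, so `+` is Alone even with no whitespace.
        assert_eq!(spacing(false, false), Spacing::Alone);
    }

In the real lexer the glued token replaces `self.token` and the loop goes around again, so multi-character operators such as `<<=` are built up one character at a time before their spacing is recorded.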