From 73cc5751773d4c49cc9d938548762520037926ba Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 19 Sep 2024 19:32:17 +1000 Subject: [PATCH] Fix `break_last_token`. It currently doesn't handle the three-char tokens `>>=` and `<<=` correctly. These can be broken twice, resulting in three individual tokens. This is a latent bug that currently doesn't cause any problems, but does cause problems for #124141, because that PR increases the usage of lazy token streams. --- compiler/rustc_ast/src/token.rs | 64 ++++++++++--------- .../rustc_parse/src/parser/attr_wrapper.rs | 36 ++++++----- compiler/rustc_parse/src/parser/mod.rs | 39 +++++------ tests/ui/macros/break-last-token-twice.rs | 16 +++++ 4 files changed, 91 insertions(+), 64 deletions(-) create mode 100644 tests/ui/macros/break-last-token-twice.rs diff --git a/compiler/rustc_ast/src/token.rs b/compiler/rustc_ast/src/token.rs index a0082a41713..2904bae00b3 100644 --- a/compiler/rustc_ast/src/token.rs +++ b/compiler/rustc_ast/src/token.rs @@ -385,35 +385,41 @@ pub fn lit(kind: LitKind, symbol: Symbol, suffix: Option) -> TokenKind { Literal(Lit::new(kind, symbol, suffix)) } - /// An approximation to proc-macro-style single-character operators used by rustc parser. - /// If the operator token can be broken into two tokens, the first of which is single-character, - /// then this function performs that operation, otherwise it returns `None`. 
- pub fn break_two_token_op(&self) -> Option<(TokenKind, TokenKind)> { - Some(match *self { - Le => (Lt, Eq), - EqEq => (Eq, Eq), - Ne => (Not, Eq), - Ge => (Gt, Eq), - AndAnd => (BinOp(And), BinOp(And)), - OrOr => (BinOp(Or), BinOp(Or)), - BinOp(Shl) => (Lt, Lt), - BinOp(Shr) => (Gt, Gt), - BinOpEq(Plus) => (BinOp(Plus), Eq), - BinOpEq(Minus) => (BinOp(Minus), Eq), - BinOpEq(Star) => (BinOp(Star), Eq), - BinOpEq(Slash) => (BinOp(Slash), Eq), - BinOpEq(Percent) => (BinOp(Percent), Eq), - BinOpEq(Caret) => (BinOp(Caret), Eq), - BinOpEq(And) => (BinOp(And), Eq), - BinOpEq(Or) => (BinOp(Or), Eq), - BinOpEq(Shl) => (Lt, Le), - BinOpEq(Shr) => (Gt, Ge), - DotDot => (Dot, Dot), - DotDotDot => (Dot, DotDot), - PathSep => (Colon, Colon), - RArrow => (BinOp(Minus), Gt), - LArrow => (Lt, BinOp(Minus)), - FatArrow => (Eq, Gt), + /// An approximation to proc-macro-style single-character operators used by + /// rustc parser. If the operator token can be broken into two tokens, the + /// first of which has `n` (1 or 2) chars, then this function performs that + /// operation, otherwise it returns `None`. 
+ pub fn break_two_token_op(&self, n: u32) -> Option<(TokenKind, TokenKind)> { + assert!(n == 1 || n == 2); + Some(match (self, n) { + (Le, 1) => (Lt, Eq), + (EqEq, 1) => (Eq, Eq), + (Ne, 1) => (Not, Eq), + (Ge, 1) => (Gt, Eq), + (AndAnd, 1) => (BinOp(And), BinOp(And)), + (OrOr, 1) => (BinOp(Or), BinOp(Or)), + (BinOp(Shl), 1) => (Lt, Lt), + (BinOp(Shr), 1) => (Gt, Gt), + (BinOpEq(Plus), 1) => (BinOp(Plus), Eq), + (BinOpEq(Minus), 1) => (BinOp(Minus), Eq), + (BinOpEq(Star), 1) => (BinOp(Star), Eq), + (BinOpEq(Slash), 1) => (BinOp(Slash), Eq), + (BinOpEq(Percent), 1) => (BinOp(Percent), Eq), + (BinOpEq(Caret), 1) => (BinOp(Caret), Eq), + (BinOpEq(And), 1) => (BinOp(And), Eq), + (BinOpEq(Or), 1) => (BinOp(Or), Eq), + (BinOpEq(Shl), 1) => (Lt, Le), // `<` + `<=` + (BinOpEq(Shl), 2) => (BinOp(Shl), Eq), // `<<` + `=` + (BinOpEq(Shr), 1) => (Gt, Ge), // `>` + `>=` + (BinOpEq(Shr), 2) => (BinOp(Shr), Eq), // `>>` + `=` + (DotDot, 1) => (Dot, Dot), + (DotDotDot, 1) => (Dot, DotDot), // `.` + `..` + (DotDotDot, 2) => (DotDot, Dot), // `..` + `.` + (DotDotEq, 2) => (DotDot, Eq), + (PathSep, 1) => (Colon, Colon), + (RArrow, 1) => (BinOp(Minus), Gt), + (LArrow, 1) => (Lt, BinOp(Minus)), + (FatArrow, 1) => (Eq, Gt), _ => return None, }) } diff --git a/compiler/rustc_parse/src/parser/attr_wrapper.rs b/compiler/rustc_parse/src/parser/attr_wrapper.rs index 6a241be0a15..ee045b70d75 100644 --- a/compiler/rustc_parse/src/parser/attr_wrapper.rs +++ b/compiler/rustc_parse/src/parser/attr_wrapper.rs @@ -108,7 +108,7 @@ struct LazyAttrTokenStreamImpl { start_token: (Token, Spacing), cursor_snapshot: TokenCursor, num_calls: u32, - break_last_token: bool, + break_last_token: u32, node_replacements: Box<[NodeReplacement]>, } @@ -339,17 +339,20 @@ pub(super) fn collect_tokens( let parser_replacements_end = self.capture_state.parser_replacements.len(); assert!( - !(self.break_last_token && matches!(capture_trailing, Trailing::Yes)), - "Cannot set break_last_token and have trailing token" + 
!(self.break_last_token > 0 && matches!(capture_trailing, Trailing::Yes)), + "Cannot have break_last_token > 0 and have trailing token" ); + assert!(self.break_last_token <= 2, "cannot break token more than twice"); let end_pos = self.num_bump_calls + capture_trailing as u32 - // If we 'broke' the last token (e.g. breaking a '>>' token to two '>' tokens), then - // extend the range of captured tokens to include it, since the parser was not actually - // bumped past it. When the `LazyAttrTokenStream` gets converted into an - // `AttrTokenStream`, we will create the proper token. - + self.break_last_token as u32; + // If we "broke" the last token (e.g. breaking a `>>` token once into `>` + `>`, or + // breaking a `>>=` token twice into `>` + `>` + `=`), then extend the range of + // captured tokens to include it, because the parser was not actually bumped past it. + // (Even if we broke twice, it was still just one token originally, hence the `1`.) + // When the `LazyAttrTokenStream` gets converted into an `AttrTokenStream`, we will + // rebreak that final token once or twice. + + if self.break_last_token == 0 { 0 } else { 1 }; let num_calls = end_pos - collect_pos.start_pos; @@ -425,7 +428,7 @@ pub(super) fn collect_tokens( // for the `#[cfg]` and/or `#[cfg_attr]` attrs. This allows us to run // eager cfg-expansion on the captured token stream. if definite_capture_mode { - assert!(!self.break_last_token, "Should not have unglued last token with cfg attr"); + assert!(self.break_last_token == 0, "Should not have unglued last token with cfg attr"); // What is the status here when parsing the example code at the top of this method? // @@ -471,7 +474,7 @@ pub(super) fn collect_tokens( /// close delims. 
fn make_attr_token_stream( iter: impl Iterator, - break_last_token: bool, + break_last_token: u32, ) -> AttrTokenStream { #[derive(Debug)] struct FrameData { @@ -513,18 +516,17 @@ struct FrameData { } } - if break_last_token { + if break_last_token > 0 { let last_token = stack_top.inner.pop().unwrap(); if let AttrTokenTree::Token(last_token, spacing) = last_token { - let unglued_first = last_token.kind.break_two_token_op().unwrap().0; + let (unglued, _) = last_token.kind.break_two_token_op(break_last_token).unwrap(); - // An 'unglued' token is always two ASCII characters + // Tokens are always ASCII chars, so we can use byte arithmetic here. let mut first_span = last_token.span.shrink_to_lo(); - first_span = first_span.with_hi(first_span.lo() + rustc_span::BytePos(1)); + first_span = + first_span.with_hi(first_span.lo() + rustc_span::BytePos(break_last_token)); - stack_top - .inner - .push(AttrTokenTree::Token(Token::new(unglued_first, first_span), spacing)); + stack_top.inner.push(AttrTokenTree::Token(Token::new(unglued, first_span), spacing)); } else { panic!("Unexpected last token {last_token:?}") } diff --git a/compiler/rustc_parse/src/parser/mod.rs b/compiler/rustc_parse/src/parser/mod.rs index 9d9265d5318..ca0838a7929 100644 --- a/compiler/rustc_parse/src/parser/mod.rs +++ b/compiler/rustc_parse/src/parser/mod.rs @@ -146,21 +146,25 @@ pub struct Parser<'a> { token_cursor: TokenCursor, // The number of calls to `bump`, i.e. the position in the token stream. num_bump_calls: u32, - // During parsing we may sometimes need to 'unglue' a glued token into two - // component tokens (e.g. '>>' into '>' and '>), so the parser can consume - // them one at a time. This process bypasses the normal capturing mechanism - // (e.g. `num_bump_calls` will not be incremented), since the 'unglued' - // tokens due not exist in the original `TokenStream`. + // During parsing we may sometimes need to "unglue" a glued token into two + // or three component tokens (e.g. 
`>>` into `>` and `>`, or `>>=` into `>` + // and `>` and `=`), so the parser can consume them one at a time. This + // process bypasses the normal capturing mechanism (e.g. `num_bump_calls` + // will not be incremented), since the "unglued" tokens do not exist in + // the original `TokenStream`. // - // If we end up consuming both unglued tokens, this is not an issue. We'll - // end up capturing the single 'glued' token. + // If we end up consuming all the component tokens, this is not an issue, + // because we'll end up capturing the single "glued" token. // - // However, sometimes we may want to capture just the first 'unglued' + // However, sometimes we may want to capture not all of the original // token. For example, capturing the `Vec<u8>` in `Option<Vec<u8>>` // requires us to unglue the trailing `>>` token. The `break_last_token` - // field is used to track this token. It gets appended to the captured + // field is used to track these tokens. They get appended to the captured // stream when we evaluate a `LazyAttrTokenStream`. - break_last_token: bool, + // + // This value is always 0, 1, or 2. It can only reach 2 when splitting + // `>>=` or `<<=`. + break_last_token: u32, /// This field is used to keep track of how many left angle brackets we have seen. This is /// required in order to detect extra leading left angle brackets (`<` characters) and error /// appropriately. 
@@ -453,7 +457,7 @@ pub fn new( expected_tokens: Vec::new(), token_cursor: TokenCursor { tree_cursor: stream.into_trees(), stack: Vec::new() }, num_bump_calls: 0, - break_last_token: false, + break_last_token: 0, unmatched_angle_bracket_count: 0, angle_bracket_nesting: 0, last_unexpected_token_span: None, @@ -773,7 +777,7 @@ fn break_and_eat(&mut self, expected: TokenKind) -> bool { self.bump(); return true; } - match self.token.kind.break_two_token_op() { + match self.token.kind.break_two_token_op(1) { Some((first, second)) if first == expected => { let first_span = self.psess.source_map().start_point(self.token.span); let second_span = self.token.span.with_lo(first_span.hi()); @@ -783,8 +787,8 @@ fn break_and_eat(&mut self, expected: TokenKind) -> bool { // // If we consume any additional tokens, then this token // is not needed (we'll capture the entire 'glued' token), - // and `bump` will set this field to `None` - self.break_last_token = true; + // and `bump` will set this field to 0. + self.break_last_token += 1; // Use the spacing of the glued token as the spacing of the // unglued second token. self.bump_with((Token::new(second, second_span), self.token_spacing)); @@ -1148,10 +1152,9 @@ pub fn bump(&mut self) { // than `.0`/`.1` access. let mut next = self.token_cursor.inlined_next(); self.num_bump_calls += 1; - // We've retrieved an token from the underlying - // cursor, so we no longer need to worry about - // an unglued token. See `break_and_eat` for more details - self.break_last_token = false; + // We got a token from the underlying cursor and no longer need to + // worry about an unglued token. See `break_and_eat` for more details. + self.break_last_token = 0; if next.0.span.is_dummy() { // Tweak the location for better diagnostics, but keep syntactic context intact. 
let fallback_span = self.token.span; diff --git a/tests/ui/macros/break-last-token-twice.rs b/tests/ui/macros/break-last-token-twice.rs new file mode 100644 index 00000000000..791f349ab38 --- /dev/null +++ b/tests/ui/macros/break-last-token-twice.rs @@ -0,0 +1,16 @@ +//@ check-pass + +macro_rules! m { + (static $name:ident: $t:ty = $e:expr) => { + let $name: $t = $e; + } +} + +fn main() { + m! { + // Tricky: the trailing `>>=` token here is broken twice: + // - into `>` and `>=` + // - then the `>=` is broken into `>` and `=` + static _x: Vec<Vec<u32>>= vec![] + } +}