From 629e97a5a02edb3d8dc63c5157962c093217d441 Mon Sep 17 00:00:00 2001 From: Russell Cohen Date: Sat, 28 Mar 2020 01:46:20 -0400 Subject: [PATCH] Improve error messages for raw strings (#60762) This diff improves error messages around raw strings in a few ways: - Catch extra trailing `#` in the parser. This can't be handled in the lexer because we could be in a macro that actually expects another # (see test) - Refactor & unify error handling in the lexer between ByteStrings and RawByteStrings - Detect potentially intended terminators (longest sequence of "#*" is suggested) --- src/librustc_lexer/src/cursor.rs | 2 +- src/librustc_lexer/src/lib.rs | 131 +++++++++++++++--- src/librustc_lexer/src/tests.rs | 119 ++++++++++++++++ src/librustc_parse/lexer/mod.rs | 94 ++++++++----- src/librustc_parse/parser/diagnostics.rs | 31 ++++- .../ui/parser/raw/raw-byte-string-eof.stderr | 4 +- .../ui/parser/raw/raw-str-in-macro-call.rs | 14 ++ src/test/ui/parser/raw/raw-str-unbalanced.rs | 2 +- .../ui/parser/raw/raw-str-unbalanced.stderr | 6 +- src/test/ui/parser/raw/raw_string.stderr | 4 +- 10 files changed, 344 insertions(+), 63 deletions(-) create mode 100644 src/librustc_lexer/src/tests.rs create mode 100644 src/test/ui/parser/raw/raw-str-in-macro-call.rs diff --git a/src/librustc_lexer/src/cursor.rs b/src/librustc_lexer/src/cursor.rs index ed0911379c4..13d0b07d98b 100644 --- a/src/librustc_lexer/src/cursor.rs +++ b/src/librustc_lexer/src/cursor.rs @@ -41,7 +41,7 @@ impl<'a> Cursor<'a> { /// If requested position doesn't exist, `EOF_CHAR` is returned. /// However, getting `EOF_CHAR` doesn't always mean actual end of file, /// it should be checked with `is_eof` method. - fn nth_char(&self, n: usize) -> char { + pub(crate) fn nth_char(&self, n: usize) -> char { self.chars().nth(n).unwrap_or(EOF_CHAR) } diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs index d3ac58a49c8..70df6d210f4 100644 --- a/src/librustc_lexer/src/lib.rs +++ b/src/librustc_lexer/src/lib.rs @@ -17,9 +17,13 @@ mod cursor; pub mod unescape; +#[cfg(test)] +mod tests; + use self::LiteralKind::*; use self::TokenKind::*; use crate::cursor::{Cursor, EOF_CHAR}; +use std::convert::TryInto; /// Parsed token. /// It doesn't contain information about data that has been parsed, @@ -132,9 +136,65 @@ pub enum LiteralKind { /// "b"abc"", "b"abc" ByteStr { terminated: bool }, /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a" - RawStr { n_hashes: usize, started: bool, terminated: bool }, + RawStr(UnvalidatedRawStr), /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a" - RawByteStr { n_hashes: usize, started: bool, terminated: bool }, + RawByteStr(UnvalidatedRawStr), +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub struct UnvalidatedRawStr { + valid_start: bool, + n_start_hashes: usize, + n_end_hashes: usize, + possible_terminator_offset: Option, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum LexRawStrError { + /// Non # characters between `r` and `"` eg. `r#~"..` + InvalidStarter, + /// The string was never terminated. `possible_terminator_offset` is the best guess of where they + /// may have intended to terminate it. + NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option }, + /// More than 65536 # signs + TooManyDelimiters, +} + +#[derive(Debug, Eq, PartialEq, Copy, Clone)] +pub struct ValidatedRawStr { + n_hashes: u16, +} + +impl ValidatedRawStr { + pub fn num_hashes(&self) -> u16 { + self.n_hashes + } +} + +impl UnvalidatedRawStr { + pub fn started(&self) -> bool { + self.valid_start + } + + pub fn validate(self) -> Result { + if !self.valid_start { + return Err(LexRawStrError::InvalidStarter); + } + + let n_start_safe: u16 = + self.n_start_hashes.try_into().map_err(|_| LexRawStrError::TooManyDelimiters)?; + match (self.n_start_hashes, self.n_end_hashes) { + (n_start, n_end) if n_start > n_end => Err(LexRawStrError::NoTerminator { + expected: n_start, + found: self.n_end_hashes, + possible_terminator_offset: self.possible_terminator_offset, + }), + (n_start, n_end) => { + debug_assert_eq!(n_start, n_end); + Ok(ValidatedRawStr { n_hashes: n_start_safe }) + } + } + } } /// Base of numeric literal encoding according to its prefix. @@ -209,7 +269,7 @@ pub fn is_whitespace(c: char) -> bool { // Dedicated whitespace characters from Unicode | '\u{2028}' // LINE SEPARATOR | '\u{2029}' // PARAGRAPH SEPARATOR - => true, + => true, _ => false, } } @@ -258,12 +318,12 @@ impl Cursor<'_> { 'r' => match (self.first(), self.second()) { ('#', c1) if is_id_start(c1) => self.raw_ident(), ('#', _) | ('"', _) => { - let (n_hashes, started, terminated) = self.raw_double_quoted_string(); + let raw_str_i = self.raw_double_quoted_string(1); let suffix_start = self.len_consumed(); - if terminated { + if raw_str_i.n_end_hashes == raw_str_i.n_start_hashes { self.eat_literal_suffix(); } - let kind = RawStr { n_hashes, started, terminated }; + let kind = RawStr(raw_str_i); Literal { kind, suffix_start } } _ => self.ident(), @@ -293,12 +353,14 @@ impl Cursor<'_> { } ('r', '"') | ('r', '#') => { self.bump(); - let (n_hashes, started, terminated) = self.raw_double_quoted_string(); + let raw_str_i = self.raw_double_quoted_string(2); let suffix_start = self.len_consumed(); + let terminated = raw_str_i.n_start_hashes == raw_str_i.n_end_hashes; if terminated { self.eat_literal_suffix(); } - let kind = RawByteStr { n_hashes, started, terminated }; + + let kind = RawByteStr(raw_str_i); Literal { kind, suffix_start } } _ => self.ident(), @@ -594,29 +656,41 @@ impl Cursor<'_> { false } - /// Eats the double-quoted string and returns a tuple of - /// (amount of the '#' symbols, raw string started, raw string terminated) - fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) { + /// Eats the double-quoted string an UnvalidatedRawStr + fn raw_double_quoted_string(&mut self, prefix_len: usize) -> UnvalidatedRawStr { debug_assert!(self.prev() == 'r'); - let mut started: bool = false; - let mut finished: bool = false; + let mut valid_start: bool = false; + let start_pos = self.len_consumed(); + let (mut possible_terminator_offset, mut max_hashes) = (None, 0); // Count opening '#' symbols. - let n_hashes = self.eat_while(|c| c == '#'); + let n_start_hashes = self.eat_while(|c| c == '#'); // Check that string is started. match self.bump() { - Some('"') => started = true, - _ => return (n_hashes, started, finished), + Some('"') => valid_start = true, + _ => { + return UnvalidatedRawStr { + valid_start, + n_start_hashes, + n_end_hashes: 0, + possible_terminator_offset, + }; + } } // Skip the string contents and on each '#' character met, check if this is // a raw string termination. - while !finished { + loop { self.eat_while(|c| c != '"'); if self.is_eof() { - return (n_hashes, started, finished); + return UnvalidatedRawStr { + valid_start, + n_start_hashes, + n_end_hashes: max_hashes, + possible_terminator_offset, + }; } // Eat closing double quote. @@ -624,7 +698,7 @@ impl Cursor<'_> { // Check that amount of closing '#' symbols // is equal to the amount of opening ones. - let mut hashes_left = n_hashes; + let mut hashes_left = n_start_hashes; let is_closing_hash = |c| { if c == '#' && hashes_left != 0 { hashes_left -= 1; @@ -633,10 +707,23 @@ impl Cursor<'_> { false } }; - finished = self.eat_while(is_closing_hash) == n_hashes; - } + let n_end_hashes = self.eat_while(is_closing_hash); - (n_hashes, started, finished) + if n_end_hashes == n_start_hashes { + return UnvalidatedRawStr { + valid_start, + n_start_hashes, + n_end_hashes, + possible_terminator_offset: None, + }; + } else if n_end_hashes > 0 && n_end_hashes > max_hashes { + // Keep track of possible terminators to give a hint about where there might be + // a missing terminator + possible_terminator_offset = + Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len); + max_hashes = n_end_hashes; + } + } } fn eat_decimal_digits(&mut self) -> bool { diff --git a/src/librustc_lexer/src/tests.rs b/src/librustc_lexer/src/tests.rs new file mode 100644 index 00000000000..ba5897c5d42 --- /dev/null +++ b/src/librustc_lexer/src/tests.rs @@ -0,0 +1,119 @@ +#[cfg(test)] +mod tests { + use crate::*; + + fn check_raw_str( + s: &str, + expected: UnvalidatedRawStr, + validated: Result, + ) { + let mut cursor = Cursor::new(s); + let tok = cursor.raw_double_quoted_string(0); + assert_eq!(tok, expected); + assert_eq!(tok.validate(), validated); + } + + #[test] + fn test_naked_raw_str() { + check_raw_str( + r#""abc""#, + UnvalidatedRawStr { + n_start_hashes: 0, + n_end_hashes: 0, + valid_start: true, + possible_terminator_offset: None, + }, + Ok(ValidatedRawStr { n_hashes: 0 }), + ); + } + + #[test] + fn test_raw_no_start() { + check_raw_str( + r##""abc"#"##, + UnvalidatedRawStr { + n_start_hashes: 0, + n_end_hashes: 0, + valid_start: true, + possible_terminator_offset: None, + }, + Ok(ValidatedRawStr { n_hashes: 0 }), + ); + } + + #[test] + fn test_too_many_terminators() { + // this error is handled in the parser later + check_raw_str( + r###"#"abc"##"###, + UnvalidatedRawStr { + n_start_hashes: 1, + n_end_hashes: 1, + valid_start: true, + possible_terminator_offset: None, + }, + Ok(ValidatedRawStr { n_hashes: 1 }), + ); + } + + #[test] + fn test_unterminated() { + check_raw_str( + r#"#"abc"#, + UnvalidatedRawStr { + n_start_hashes: 1, + n_end_hashes: 0, + valid_start: true, + possible_terminator_offset: None, + }, + Err(LexRawStrError::NoTerminator { + expected: 1, + found: 0, + possible_terminator_offset: None, + }), + ); + check_raw_str( + r###"##"abc"#"###, + UnvalidatedRawStr { + n_start_hashes: 2, + n_end_hashes: 1, + valid_start: true, + possible_terminator_offset: Some(7), + }, + Err(LexRawStrError::NoTerminator { + expected: 2, + found: 1, + possible_terminator_offset: Some(7), + }), + ); + // We're looking for "# not just any # + check_raw_str( + r###"##"abc#"###, + UnvalidatedRawStr { + n_start_hashes: 2, + n_end_hashes: 0, + valid_start: true, + possible_terminator_offset: None, + }, + Err(LexRawStrError::NoTerminator { + expected: 2, + found: 0, + possible_terminator_offset: None, + }), + ) + } + + #[test] + fn test_invalid_start() { + check_raw_str( + r##"#~"abc"#"##, + UnvalidatedRawStr { + n_start_hashes: 1, + n_end_hashes: 0, + valid_start: false, + possible_terminator_offset: None, + }, + Err(LexRawStrError::InvalidStarter), + ); + } +} diff --git a/src/librustc_parse/lexer/mod.rs b/src/librustc_parse/lexer/mod.rs index ac58cbb9e8d..2f720d95c6d 100644 --- a/src/librustc_parse/lexer/mod.rs +++ b/src/librustc_parse/lexer/mod.rs @@ -1,20 +1,20 @@ use rustc_ast::token::{self, Token, TokenKind}; use rustc_ast::util::comments; use rustc_data_structures::sync::Lrc; -use rustc_errors::{error_code, DiagnosticBuilder, FatalError}; -use rustc_lexer::unescape; +use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError}; use rustc_lexer::Base; +use rustc_lexer::{unescape, LexRawStrError, UnvalidatedRawStr, ValidatedRawStr}; use rustc_session::parse::ParseSess; use rustc_span::symbol::{sym, Symbol}; use rustc_span::{BytePos, Pos, Span}; use log::debug; use std::char; -use std::convert::TryInto; mod tokentrees; mod unescape_error_reporting; mod unicode_chars; + use unescape_error_reporting::{emit_unescape_error, push_escaped_char}; #[derive(Clone, Debug)] @@ -376,30 +376,22 @@ impl<'a> StringReader<'a> { let id = self.symbol_from_to(content_start, content_end); (token::ByteStr, id) } - rustc_lexer::LiteralKind::RawStr { n_hashes, started, terminated } => { - if !started { - self.report_non_started_raw_string(start); - } - if !terminated { - self.report_unterminated_raw_string(start, n_hashes) - } - let n_hashes: u16 = self.restrict_n_hashes(start, n_hashes); + rustc_lexer::LiteralKind::RawStr(unvalidated_raw_str) => { + let valid_raw_str = self.validate_and_report_errors(start, unvalidated_raw_str); + let n_hashes = valid_raw_str.num_hashes(); let n = u32::from(n_hashes); + let content_start = start + BytePos(2 + n); let content_end = suffix_start - BytePos(1 + n); self.validate_raw_str_escape(content_start, content_end); let id = self.symbol_from_to(content_start, content_end); (token::StrRaw(n_hashes), id) } - rustc_lexer::LiteralKind::RawByteStr { n_hashes, started, terminated } => { - if !started { - self.report_non_started_raw_string(start); - } - if !terminated { - self.report_unterminated_raw_string(start, n_hashes) - } - let n_hashes: u16 = self.restrict_n_hashes(start, n_hashes); + rustc_lexer::LiteralKind::RawByteStr(unvalidated_raw_str) => { + let validated_raw_str = self.validate_and_report_errors(start, unvalidated_raw_str); + let n_hashes = validated_raw_str.num_hashes(); let n = u32::from(n_hashes); + let content_start = start + BytePos(3 + n); let content_end = suffix_start - BytePos(1 + n); self.validate_raw_byte_str_escape(content_start, content_end); @@ -485,6 +477,26 @@ impl<'a> StringReader<'a> { } } + fn validate_and_report_errors( + &self, + start: BytePos, + unvalidated_raw_str: UnvalidatedRawStr, + ) -> ValidatedRawStr { + match unvalidated_raw_str.validate() { + Err(LexRawStrError::InvalidStarter) => self.report_non_started_raw_string(start), + Err(LexRawStrError::NoTerminator { expected, found, possible_terminator_offset }) => { + self.report_unterminated_raw_string( + start, + expected, + possible_terminator_offset, + found, + ) + } + Err(LexRawStrError::TooManyDelimiters) => self.report_too_many_hashes(start), + Ok(valid) => valid, + } + } + fn report_non_started_raw_string(&self, start: BytePos) -> ! { let bad_char = self.str_from(start).chars().last().unwrap(); self.struct_fatal_span_char( @@ -498,38 +510,52 @@ impl<'a> StringReader<'a> { FatalError.raise() } - fn report_unterminated_raw_string(&self, start: BytePos, n_hashes: usize) -> ! { + fn report_unterminated_raw_string( + &self, + start: BytePos, + n_hashes: usize, + possible_offset: Option, + found_terminators: usize, + ) -> ! { let mut err = self.sess.span_diagnostic.struct_span_fatal_with_code( self.mk_sp(start, start), "unterminated raw string", error_code!(E0748), ); + err.span_label(self.mk_sp(start, start), "unterminated raw string"); if n_hashes > 0 { err.note(&format!( "this raw string should be terminated with `\"{}`", - "#".repeat(n_hashes as usize) + "#".repeat(n_hashes) )); } + if let Some(possible_offset) = possible_offset { + let span = self.mk_sp( + start + BytePos(possible_offset as u32), + start + BytePos(possible_offset as u32) + BytePos(found_terminators as u32), + ); + err.span_suggestion( + span, + "you might have intended to terminate the string here", + "#".repeat(n_hashes), + Applicability::MaybeIncorrect, + ); + } + err.emit(); FatalError.raise() } - fn restrict_n_hashes(&self, start: BytePos, n_hashes: usize) -> u16 { - match n_hashes.try_into() { - Ok(n_hashes) => n_hashes, - Err(_) => { - self.fatal_span_( - start, - self.pos, - "too many `#` symbols: raw strings may be \ - delimited by up to 65535 `#` symbols", - ) - .raise(); - } - } + fn report_too_many_hashes(&self, start: BytePos) -> ! { + self.fatal_span_( + start, + self.pos, + "too many `#` symbols: raw strings may be delimited by up to 65535 `#` symbols", + ) + .raise(); } fn validate_char_escape(&self, content_start: BytePos, content_end: BytePos) { diff --git a/src/librustc_parse/parser/diagnostics.rs b/src/librustc_parse/parser/diagnostics.rs index c4546dedfcd..7b6840307cb 100644 --- a/src/librustc_parse/parser/diagnostics.rs +++ b/src/librustc_parse/parser/diagnostics.rs @@ -6,7 +6,7 @@ use rustc_ast::ast::{ }; use rustc_ast::ast::{AttrVec, ItemKind, Mutability, Pat, PatKind, PathSegment, QSelf, Ty, TyKind}; use rustc_ast::ptr::P; -use rustc_ast::token::{self, TokenKind}; +use rustc_ast::token::{self, Lit, LitKind, Token, TokenKind}; use rustc_ast::util::parser::AssocOp; use rustc_ast_pretty::pprust; use rustc_data_structures::fx::FxHashSet; @@ -255,6 +255,10 @@ impl<'a> Parser<'a> { } } + if self.check_too_many_raw_str_terminators(&mut err) { + return Err(err); + } + let sm = self.sess.source_map(); if self.prev_token.span == DUMMY_SP { // Account for macro context where the previous span might not be @@ -282,6 +286,31 @@ impl<'a> Parser<'a> { Err(err) } + fn check_too_many_raw_str_terminators(&mut self, err: &mut DiagnosticBuilder<'_>) -> bool { + let prev_token_raw_str = match self.prev_token { + Token { kind: TokenKind::Literal(Lit { kind: LitKind::StrRaw(n), .. }), .. } => Some(n), + Token { + kind: TokenKind::Literal(Lit { kind: LitKind::ByteStrRaw(n), .. }), .. + } => Some(n), + _ => None, + }; + + if let Some(n_hashes) = prev_token_raw_str { + if self.token.kind == TokenKind::Pound { + err.set_primary_message("too many `#` when terminating raw string"); + err.span_suggestion( + self.token.span, + "Remove the extra `#`", + String::new(), + Applicability::MachineApplicable, + ); + err.note(&format!("The raw string started with {} `#`s", n_hashes)); + return true; + } + } + false + } + pub fn maybe_annotate_with_ascription( &mut self, err: &mut DiagnosticBuilder<'_>, diff --git a/src/test/ui/parser/raw/raw-byte-string-eof.stderr b/src/test/ui/parser/raw/raw-byte-string-eof.stderr index d5f22e2a1a8..81344841c27 100644 --- a/src/test/ui/parser/raw/raw-byte-string-eof.stderr +++ b/src/test/ui/parser/raw/raw-byte-string-eof.stderr @@ -2,7 +2,9 @@ error[E0748]: unterminated raw string --> $DIR/raw-byte-string-eof.rs:2:5 | LL | br##"a"#; - | ^ unterminated raw string + | ^ - help: you might have intended to terminate the string here: `##` + | | + | unterminated raw string | = note: this raw string should be terminated with `"##` diff --git a/src/test/ui/parser/raw/raw-str-in-macro-call.rs b/src/test/ui/parser/raw/raw-str-in-macro-call.rs new file mode 100644 index 00000000000..462c2279f5c --- /dev/null +++ b/src/test/ui/parser/raw/raw-str-in-macro-call.rs @@ -0,0 +1,14 @@ +// check-pass + +macro_rules! m1 { + ($tt:tt #) => () +} + +macro_rules! m2 { + ($tt:tt) => () +} + +fn main() { + m1!(r#"abc"##); + m2!(r#"abc"#); +} diff --git a/src/test/ui/parser/raw/raw-str-unbalanced.rs b/src/test/ui/parser/raw/raw-str-unbalanced.rs index 5a1d1be11b6..35f118f5ce6 100644 --- a/src/test/ui/parser/raw/raw-str-unbalanced.rs +++ b/src/test/ui/parser/raw/raw-str-unbalanced.rs @@ -1,4 +1,4 @@ static s: &'static str = r#" - "## //~ ERROR expected one of `.`, `;`, `?`, or an operator, found `#` + "## //~ too many `#` when terminating raw string ; diff --git a/src/test/ui/parser/raw/raw-str-unbalanced.stderr b/src/test/ui/parser/raw/raw-str-unbalanced.stderr index ddb75722bef..891f1d6337c 100644 --- a/src/test/ui/parser/raw/raw-str-unbalanced.stderr +++ b/src/test/ui/parser/raw/raw-str-unbalanced.stderr @@ -1,8 +1,10 @@ -error: expected one of `.`, `;`, `?`, or an operator, found `#` +error: too many `#` when terminating raw string --> $DIR/raw-str-unbalanced.rs:3:9 | LL | "## - | ^ expected one of `.`, `;`, `?`, or an operator + | ^ help: Remove the extra `#` + | + = note: The raw string started with 1 `#`s error: aborting due to previous error diff --git a/src/test/ui/parser/raw/raw_string.stderr b/src/test/ui/parser/raw/raw_string.stderr index 0f1d7e4651d..e91a16bedc4 100644 --- a/src/test/ui/parser/raw/raw_string.stderr +++ b/src/test/ui/parser/raw/raw_string.stderr @@ -2,7 +2,9 @@ error[E0748]: unterminated raw string --> $DIR/raw_string.rs:2:13 | LL | let x = r##"lol"#; - | ^ unterminated raw string + | ^ - help: you might have intended to terminate the string here: `##` + | | + | unterminated raw string | = note: this raw string should be terminated with `"##`