Improve error messages for raw strings (#60762)

This diff improves error messages around raw strings in a few ways: - Catch extra trailing `#` in the parser. This can't be handled in the lexer because we could be in a macro that actually expects another # (see test) - Refactor & unify error handling in the lexer between ByteStrings and RawByteStrings - Detect potentially intended terminators (longest sequence of "#*" is suggested)
2020-03-28 01:46:20 -04:00 · 2020-03-28 01:46:20 -04:00 · 629e97a5a0
commit 629e97a5a0
parent 840a5769b0
10 changed files with 344 additions and 63 deletions
--- a/src/librustc_lexer/src/cursor.rs
+++ b/src/librustc_lexer/src/cursor.rs
@ -41,7 +41,7 @@ impl<'a> Cursor<'a> {
    /// If requested position doesn't exist, `EOF_CHAR` is returned.
    /// However, getting `EOF_CHAR` doesn't always mean actual end of file,
    /// it should be checked with `is_eof` method.
-    fn nth_char(&self, n: usize) -> char {
+    pub(crate) fn nth_char(&self, n: usize) -> char {
        self.chars().nth(n).unwrap_or(EOF_CHAR)
    }

--- a/src/librustc_lexer/src/lib.rs
+++ b/src/librustc_lexer/src/lib.rs
@ -17,9 +17,13 @@
 mod cursor;
 pub mod unescape;

+#[cfg(test)]
+mod tests;
+
 use self::LiteralKind::*;
 use self::TokenKind::*;
 use crate::cursor::{Cursor, EOF_CHAR};
+use std::convert::TryInto;

 /// Parsed token.
 /// It doesn't contain information about data that has been parsed,
@ -132,9 +136,65 @@ pub enum LiteralKind {
    /// "b"abc"", "b"abc"
    ByteStr { terminated: bool },
    /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
-    RawStr { n_hashes: usize, started: bool, terminated: bool },
+    RawStr(UnvalidatedRawStr),
    /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
-    RawByteStr { n_hashes: usize, started: bool, terminated: bool },
+    RawByteStr(UnvalidatedRawStr),
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub struct UnvalidatedRawStr {
+    valid_start: bool,
+    n_start_hashes: usize,
+    n_end_hashes: usize,
+    possible_terminator_offset: Option<usize>,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum LexRawStrError {
+    /// Non # characters between `r` and `"` eg. `r#~"..`
+    InvalidStarter,
+    /// The string was never terminated. `possible_terminator_offset` is the best guess of where they
+    /// may have intended to terminate it.
+    NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
+    /// More than 65536 # signs
+    TooManyDelimiters,
+}
+
+#[derive(Debug, Eq, PartialEq, Copy, Clone)]
+pub struct ValidatedRawStr {
+    n_hashes: u16,
+}
+
+impl ValidatedRawStr {
+    pub fn num_hashes(&self) -> u16 {
+        self.n_hashes
+    }
+}
+
+impl UnvalidatedRawStr {
+    pub fn started(&self) -> bool {
+        self.valid_start
+    }
+
+    pub fn validate(self) -> Result<ValidatedRawStr, LexRawStrError> {
+        if !self.valid_start {
+            return Err(LexRawStrError::InvalidStarter);
+        }
+
+        let n_start_safe: u16 =
+            self.n_start_hashes.try_into().map_err(|_| LexRawStrError::TooManyDelimiters)?;
+        match (self.n_start_hashes, self.n_end_hashes) {
+            (n_start, n_end) if n_start > n_end => Err(LexRawStrError::NoTerminator {
+                expected: n_start,
+                found: self.n_end_hashes,
+                possible_terminator_offset: self.possible_terminator_offset,
+            }),
+            (n_start, n_end) => {
+                debug_assert_eq!(n_start, n_end);
+                Ok(ValidatedRawStr { n_hashes: n_start_safe })
+            }
+        }
+    }
 }

 /// Base of numeric literal encoding according to its prefix.
@ -209,7 +269,7 @@ pub fn is_whitespace(c: char) -> bool {
        // Dedicated whitespace characters from Unicode
        | '\u{2028}' // LINE SEPARATOR
        | '\u{2029}' // PARAGRAPH SEPARATOR
-            => true,
+        => true,
        _ => false,
    }
 }
@ -258,12 +318,12 @@ impl Cursor<'_> {
            'r' => match (self.first(), self.second()) {
                ('#', c1) if is_id_start(c1) => self.raw_ident(),
                ('#', _) | ('"', _) => {
-                    let (n_hashes, started, terminated) = self.raw_double_quoted_string();
+                    let raw_str_i = self.raw_double_quoted_string(1);
                    let suffix_start = self.len_consumed();
-                    if terminated {
+                    if raw_str_i.n_end_hashes == raw_str_i.n_start_hashes {
                        self.eat_literal_suffix();
                    }
-                    let kind = RawStr { n_hashes, started, terminated };
+                    let kind = RawStr(raw_str_i);
                    Literal { kind, suffix_start }
                }
                _ => self.ident(),
@ -293,12 +353,14 @@ impl Cursor<'_> {
                }
                ('r', '"') | ('r', '#') => {
                    self.bump();
-                    let (n_hashes, started, terminated) = self.raw_double_quoted_string();
+                    let raw_str_i = self.raw_double_quoted_string(2);
                    let suffix_start = self.len_consumed();
+                    let terminated = raw_str_i.n_start_hashes == raw_str_i.n_end_hashes;
                    if terminated {
                        self.eat_literal_suffix();
                    }
-                    let kind = RawByteStr { n_hashes, started, terminated };
+
+                    let kind = RawByteStr(raw_str_i);
                    Literal { kind, suffix_start }
                }
                _ => self.ident(),
@ -594,29 +656,41 @@ impl Cursor<'_> {
        false
    }

-    /// Eats the double-quoted string and returns a tuple of
-    /// (amount of the '#' symbols, raw string started, raw string terminated)
-    fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) {
+    /// Eats the double-quoted string an UnvalidatedRawStr
+    fn raw_double_quoted_string(&mut self, prefix_len: usize) -> UnvalidatedRawStr {
        debug_assert!(self.prev() == 'r');
-        let mut started: bool = false;
-        let mut finished: bool = false;
+        let mut valid_start: bool = false;
+        let start_pos = self.len_consumed();
+        let (mut possible_terminator_offset, mut max_hashes) = (None, 0);

        // Count opening '#' symbols.
-        let n_hashes = self.eat_while(|c| c == '#');
+        let n_start_hashes = self.eat_while(|c| c == '#');

        // Check that string is started.
        match self.bump() {
-            Some('"') => started = true,
-            _ => return (n_hashes, started, finished),
+            Some('"') => valid_start = true,
+            _ => {
+                return UnvalidatedRawStr {
+                    valid_start,
+                    n_start_hashes,
+                    n_end_hashes: 0,
+                    possible_terminator_offset,
+                };
+            }
        }

        // Skip the string contents and on each '#' character met, check if this is
        // a raw string termination.
-        while !finished {
+        loop {
            self.eat_while(|c| c != '"');

            if self.is_eof() {
-                return (n_hashes, started, finished);
+                return UnvalidatedRawStr {
+                    valid_start,
+                    n_start_hashes,
+                    n_end_hashes: max_hashes,
+                    possible_terminator_offset,
+                };
            }

            // Eat closing double quote.
@ -624,7 +698,7 @@ impl Cursor<'_> {

            // Check that amount of closing '#' symbols
            // is equal to the amount of opening ones.
-            let mut hashes_left = n_hashes;
+            let mut hashes_left = n_start_hashes;
            let is_closing_hash = |c| {
                if c == '#' && hashes_left != 0 {
                    hashes_left -= 1;
@ -633,10 +707,23 @@ impl Cursor<'_> {
                    false
                }
            };
-            finished = self.eat_while(is_closing_hash) == n_hashes;
-        }
+            let n_end_hashes = self.eat_while(is_closing_hash);

-        (n_hashes, started, finished)
+            if n_end_hashes == n_start_hashes {
+                return UnvalidatedRawStr {
+                    valid_start,
+                    n_start_hashes,
+                    n_end_hashes,
+                    possible_terminator_offset: None,
+                };
+            } else if n_end_hashes > 0 && n_end_hashes > max_hashes {
+                // Keep track of possible terminators to give a hint about where there might be
+                // a missing terminator
+                possible_terminator_offset =
+                    Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len);
+                max_hashes = n_end_hashes;
+            }
+        }
    }

    fn eat_decimal_digits(&mut self) -> bool {
--- a/src/librustc_lexer/src/tests.rs
+++ b/src/librustc_lexer/src/tests.rs
@ -0,0 +1,119 @@
+#[cfg(test)]
+mod tests {
+    use crate::*;
+
+    fn check_raw_str(
+        s: &str,
+        expected: UnvalidatedRawStr,
+        validated: Result<ValidatedRawStr, LexRawStrError>,
+    ) {
+        let mut cursor = Cursor::new(s);
+        let tok = cursor.raw_double_quoted_string(0);
+        assert_eq!(tok, expected);
+        assert_eq!(tok.validate(), validated);
+    }
+
+    #[test]
+    fn test_naked_raw_str() {
+        check_raw_str(
+            r#""abc""#,
+            UnvalidatedRawStr {
+                n_start_hashes: 0,
+                n_end_hashes: 0,
+                valid_start: true,
+                possible_terminator_offset: None,
+            },
+            Ok(ValidatedRawStr { n_hashes: 0 }),
+        );
+    }
+
+    #[test]
+    fn test_raw_no_start() {
+        check_raw_str(
+            r##""abc"#"##,
+            UnvalidatedRawStr {
+                n_start_hashes: 0,
+                n_end_hashes: 0,
+                valid_start: true,
+                possible_terminator_offset: None,
+            },
+            Ok(ValidatedRawStr { n_hashes: 0 }),
+        );
+    }
+
+    #[test]
+    fn test_too_many_terminators() {
+        // this error is handled in the parser later
+        check_raw_str(
+            r###"#"abc"##"###,
+            UnvalidatedRawStr {
+                n_start_hashes: 1,
+                n_end_hashes: 1,
+                valid_start: true,
+                possible_terminator_offset: None,
+            },
+            Ok(ValidatedRawStr { n_hashes: 1 }),
+        );
+    }
+
+    #[test]
+    fn test_unterminated() {
+        check_raw_str(
+            r#"#"abc"#,
+            UnvalidatedRawStr {
+                n_start_hashes: 1,
+                n_end_hashes: 0,
+                valid_start: true,
+                possible_terminator_offset: None,
+            },
+            Err(LexRawStrError::NoTerminator {
+                expected: 1,
+                found: 0,
+                possible_terminator_offset: None,
+            }),
+        );
+        check_raw_str(
+            r###"##"abc"#"###,
+            UnvalidatedRawStr {
+                n_start_hashes: 2,
+                n_end_hashes: 1,
+                valid_start: true,
+                possible_terminator_offset: Some(7),
+            },
+            Err(LexRawStrError::NoTerminator {
+                expected: 2,
+                found: 1,
+                possible_terminator_offset: Some(7),
+            }),
+        );
+        // We're looking for "# not just any #
+        check_raw_str(
+            r###"##"abc#"###,
+            UnvalidatedRawStr {
+                n_start_hashes: 2,
+                n_end_hashes: 0,
+                valid_start: true,
+                possible_terminator_offset: None,
+            },
+            Err(LexRawStrError::NoTerminator {
+                expected: 2,
+                found: 0,
+                possible_terminator_offset: None,
+            }),
+        )
+    }
+
+    #[test]
+    fn test_invalid_start() {
+        check_raw_str(
+            r##"#~"abc"#"##,
+            UnvalidatedRawStr {
+                n_start_hashes: 1,
+                n_end_hashes: 0,
+                valid_start: false,
+                possible_terminator_offset: None,
+            },
+            Err(LexRawStrError::InvalidStarter),
+        );
+    }
+}
--- a/src/librustc_parse/lexer/mod.rs
+++ b/src/librustc_parse/lexer/mod.rs
@ -1,20 +1,20 @@
 use rustc_ast::token::{self, Token, TokenKind};
 use rustc_ast::util::comments;
 use rustc_data_structures::sync::Lrc;
-use rustc_errors::{error_code, DiagnosticBuilder, FatalError};
-use rustc_lexer::unescape;
+use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError};
 use rustc_lexer::Base;
+use rustc_lexer::{unescape, LexRawStrError, UnvalidatedRawStr, ValidatedRawStr};
 use rustc_session::parse::ParseSess;
 use rustc_span::symbol::{sym, Symbol};
 use rustc_span::{BytePos, Pos, Span};

 use log::debug;
 use std::char;
-use std::convert::TryInto;

 mod tokentrees;
 mod unescape_error_reporting;
 mod unicode_chars;
+
 use unescape_error_reporting::{emit_unescape_error, push_escaped_char};

 #[derive(Clone, Debug)]
@ -376,30 +376,22 @@ impl<'a> StringReader<'a> {
                let id = self.symbol_from_to(content_start, content_end);
                (token::ByteStr, id)
            }
-            rustc_lexer::LiteralKind::RawStr { n_hashes, started, terminated } => {
-                if !started {
-                    self.report_non_started_raw_string(start);
-                }
-                if !terminated {
-                    self.report_unterminated_raw_string(start, n_hashes)
-                }
-                let n_hashes: u16 = self.restrict_n_hashes(start, n_hashes);
+            rustc_lexer::LiteralKind::RawStr(unvalidated_raw_str) => {
+                let valid_raw_str = self.validate_and_report_errors(start, unvalidated_raw_str);
+                let n_hashes = valid_raw_str.num_hashes();
                let n = u32::from(n_hashes);
+
                let content_start = start + BytePos(2 + n);
                let content_end = suffix_start - BytePos(1 + n);
                self.validate_raw_str_escape(content_start, content_end);
                let id = self.symbol_from_to(content_start, content_end);
                (token::StrRaw(n_hashes), id)
            }
-            rustc_lexer::LiteralKind::RawByteStr { n_hashes, started, terminated } => {
-                if !started {
-                    self.report_non_started_raw_string(start);
-                }
-                if !terminated {
-                    self.report_unterminated_raw_string(start, n_hashes)
-                }
-                let n_hashes: u16 = self.restrict_n_hashes(start, n_hashes);
+            rustc_lexer::LiteralKind::RawByteStr(unvalidated_raw_str) => {
+                let validated_raw_str = self.validate_and_report_errors(start, unvalidated_raw_str);
+                let n_hashes = validated_raw_str.num_hashes();
                let n = u32::from(n_hashes);
+
                let content_start = start + BytePos(3 + n);
                let content_end = suffix_start - BytePos(1 + n);
                self.validate_raw_byte_str_escape(content_start, content_end);
@ -485,6 +477,26 @@ impl<'a> StringReader<'a> {
        }
    }

+    fn validate_and_report_errors(
+        &self,
+        start: BytePos,
+        unvalidated_raw_str: UnvalidatedRawStr,
+    ) -> ValidatedRawStr {
+        match unvalidated_raw_str.validate() {
+            Err(LexRawStrError::InvalidStarter) => self.report_non_started_raw_string(start),
+            Err(LexRawStrError::NoTerminator { expected, found, possible_terminator_offset }) => {
+                self.report_unterminated_raw_string(
+                    start,
+                    expected,
+                    possible_terminator_offset,
+                    found,
+                )
+            }
+            Err(LexRawStrError::TooManyDelimiters) => self.report_too_many_hashes(start),
+            Ok(valid) => valid,
+        }
+    }
+
    fn report_non_started_raw_string(&self, start: BytePos) -> ! {
        let bad_char = self.str_from(start).chars().last().unwrap();
        self.struct_fatal_span_char(
@ -498,38 +510,52 @@ impl<'a> StringReader<'a> {
        FatalError.raise()
    }

-    fn report_unterminated_raw_string(&self, start: BytePos, n_hashes: usize) -> ! {
+    fn report_unterminated_raw_string(
+        &self,
+        start: BytePos,
+        n_hashes: usize,
+        possible_offset: Option<usize>,
+        found_terminators: usize,
+    ) -> ! {
        let mut err = self.sess.span_diagnostic.struct_span_fatal_with_code(
            self.mk_sp(start, start),
            "unterminated raw string",
            error_code!(E0748),
        );
+
        err.span_label(self.mk_sp(start, start), "unterminated raw string");

        if n_hashes > 0 {
            err.note(&format!(
                "this raw string should be terminated with `\"{}`",
-                "#".repeat(n_hashes as usize)
+                "#".repeat(n_hashes)
            ));
        }

+        if let Some(possible_offset) = possible_offset {
+            let span = self.mk_sp(
+                start + BytePos(possible_offset as u32),
+                start + BytePos(possible_offset as u32) + BytePos(found_terminators as u32),
+            );
+            err.span_suggestion(
+                span,
+                "you might have intended to terminate the string here",
+                "#".repeat(n_hashes),
+                Applicability::MaybeIncorrect,
+            );
+        }
+
        err.emit();
        FatalError.raise()
    }

-    fn restrict_n_hashes(&self, start: BytePos, n_hashes: usize) -> u16 {
-        match n_hashes.try_into() {
-            Ok(n_hashes) => n_hashes,
-            Err(_) => {
-                self.fatal_span_(
-                    start,
-                    self.pos,
-                    "too many `#` symbols: raw strings may be \
-                                  delimited by up to 65535 `#` symbols",
-                )
-                .raise();
-            }
-        }
+    fn report_too_many_hashes(&self, start: BytePos) -> ! {
+        self.fatal_span_(
+            start,
+            self.pos,
+            "too many `#` symbols: raw strings may be delimited by up to 65535 `#` symbols",
+        )
+        .raise();
    }

    fn validate_char_escape(&self, content_start: BytePos, content_end: BytePos) {
--- a/src/librustc_parse/parser/diagnostics.rs
+++ b/src/librustc_parse/parser/diagnostics.rs
@ -6,7 +6,7 @@ use rustc_ast::ast::{
 };
 use rustc_ast::ast::{AttrVec, ItemKind, Mutability, Pat, PatKind, PathSegment, QSelf, Ty, TyKind};
 use rustc_ast::ptr::P;
-use rustc_ast::token::{self, TokenKind};
+use rustc_ast::token::{self, Lit, LitKind, Token, TokenKind};
 use rustc_ast::util::parser::AssocOp;
 use rustc_ast_pretty::pprust;
 use rustc_data_structures::fx::FxHashSet;
@ -255,6 +255,10 @@ impl<'a> Parser<'a> {
            }
        }

+        if self.check_too_many_raw_str_terminators(&mut err) {
+            return Err(err);
+        }
+
        let sm = self.sess.source_map();
        if self.prev_token.span == DUMMY_SP {
            // Account for macro context where the previous span might not be
@ -282,6 +286,31 @@ impl<'a> Parser<'a> {
        Err(err)
    }

+    fn check_too_many_raw_str_terminators(&mut self, err: &mut DiagnosticBuilder<'_>) -> bool {
+        let prev_token_raw_str = match self.prev_token {
+            Token { kind: TokenKind::Literal(Lit { kind: LitKind::StrRaw(n), .. }), .. } => Some(n),
+            Token {
+                kind: TokenKind::Literal(Lit { kind: LitKind::ByteStrRaw(n), .. }), ..
+            } => Some(n),
+            _ => None,
+        };
+
+        if let Some(n_hashes) = prev_token_raw_str {
+            if self.token.kind == TokenKind::Pound {
+                err.set_primary_message("too many `#` when terminating raw string");
+                err.span_suggestion(
+                    self.token.span,
+                    "Remove the extra `#`",
+                    String::new(),
+                    Applicability::MachineApplicable,
+                );
+                err.note(&format!("The raw string started with {} `#`s", n_hashes));
+                return true;
+            }
+        }
+        false
+    }
+
    pub fn maybe_annotate_with_ascription(
        &mut self,
        err: &mut DiagnosticBuilder<'_>,
--- a/src/test/ui/parser/raw/raw-byte-string-eof.stderr
+++ b/src/test/ui/parser/raw/raw-byte-string-eof.stderr
@ -2,7 +2,9 @@ error[E0748]: unterminated raw string
  --> $DIR/raw-byte-string-eof.rs:2:5
   |
 LL |     br##"a"#;
-   |     ^ unterminated raw string
+   |     ^      - help: you might have intended to terminate the string here: `##`
+   |     |
+   |     unterminated raw string
   |
   = note: this raw string should be terminated with `"##`

--- a/src/test/ui/parser/raw/raw-str-in-macro-call.rs
+++ b/src/test/ui/parser/raw/raw-str-in-macro-call.rs
@ -0,0 +1,14 @@
+// check-pass
+
+macro_rules! m1 {
+    ($tt:tt #) => ()
+}
+
+macro_rules! m2 {
+    ($tt:tt) => ()
+}
+
+fn main() {
+    m1!(r#"abc"##);
+    m2!(r#"abc"#);
+}
--- a/src/test/ui/parser/raw/raw-str-unbalanced.rs
+++ b/src/test/ui/parser/raw/raw-str-unbalanced.rs
@ -1,4 +1,4 @@
 static s: &'static str =
    r#"
-      "## //~ ERROR expected one of `.`, `;`, `?`, or an operator, found `#`
+      "## //~ too many `#` when terminating raw string
 ;
--- a/src/test/ui/parser/raw/raw-str-unbalanced.stderr
+++ b/src/test/ui/parser/raw/raw-str-unbalanced.stderr
@ -1,8 +1,10 @@
-error: expected one of `.`, `;`, `?`, or an operator, found `#`
+error: too many `#` when terminating raw string
  --> $DIR/raw-str-unbalanced.rs:3:9
   |
 LL |       "##
-   |         ^ expected one of `.`, `;`, `?`, or an operator
+   |         ^ help: Remove the extra `#`
+   |
+   = note: The raw string started with 1 `#`s

 error: aborting due to previous error

--- a/src/test/ui/parser/raw/raw_string.stderr
+++ b/src/test/ui/parser/raw/raw_string.stderr
@ -2,7 +2,9 @@ error[E0748]: unterminated raw string
  --> $DIR/raw_string.rs:2:13
   |
 LL |     let x = r##"lol"#;
-   |             ^ unterminated raw string
+   |             ^       - help: you might have intended to terminate the string here: `##`
+   |             |
+   |             unterminated raw string
   |
   = note: this raw string should be terminated with `"##`