Rollup merge of #63017 - matklad:no-fatal, r=petrochenkov

Remove special code-path for handling unknown tokens In `StringReader`, we have a buffer of fatal errors, which is used only in a single case: when we see something which is not a reasonable token at all, like `🦀`. I think a more straightforward thing to do here is to produce an explicit error token in this case, and let the next layer (the parser) deal with it. However, currently this leads to duplicated error messages. What should we do with this? Naively, I would think that emitting (just emitting, not raising) `FatalError` should stop other errors, but it looks like this is not the case? We can also probably tweak the parser on a case-by-case basis, to avoid emitting "expected" errors if the current token is an `Err`. I am personally also fine with cascading errors in this case: it's quite unlikely that you actually type a fully invalid token. @petrochenkov, which approach should we take to fight cascading errors?
2019-08-06 08:17:34 +02:00 · 2019-08-06 08:17:34 +02:00 · 61e270ab48
commit 61e270ab48
parent fe998dbfe4 b3e8c8bbe2
13 changed files with 223 additions and 116 deletions
--- a/src/librustc/ich/impls_syntax.rs
+++ b/src/librustc/ich/impls_syntax.rs
@ -363,7 +363,8 @@ impl<'a> HashStable<StableHashingContext<'a>> for token::TokenKind {
            }

            token::DocComment(val) |
-            token::Shebang(val) => val.hash_stable(hcx, hasher),
+            token::Shebang(val) |
+            token::Unknown(val) => val.hash_stable(hcx, hasher),
        }
    }
 }
--- a/src/librustdoc/html/highlight.rs
+++ b/src/librustdoc/html/highlight.rs
@ -44,7 +44,7 @@ pub fn render_with_highlighting(

        let mut highlighted_source = vec![];
        if classifier.write_source(&mut highlighted_source).is_err() {
-            Err(classifier.lexer.buffer_fatal_errors())
+            Err(())
        } else {
            Ok(String::from_utf8_lossy(&highlighted_source).into_owned())
        }
@ -59,14 +59,9 @@ pub fn render_with_highlighting(
            }
            write_footer(&mut out).unwrap();
        }
-        Err(errors) => {
-            // If errors are encountered while trying to highlight, cancel the errors and just emit
-            // the unhighlighted source. The errors will have already been reported in the
-            // `check-code-block-syntax` pass.
-            for mut error in errors {
-                error.cancel();
-            }
-
+        Err(()) => {
+            // If errors are encountered while trying to highlight, just emit
+            // the unhighlighted source.
            write!(out, "<pre><code>{}</code></pre>", src).unwrap();
        }
    }
@ -192,14 +187,20 @@ impl<'a> Classifier<'a> {
        if let Some(token) = self.peek_token.take() {
            return Ok(token);
        }
-        self.lexer.try_next_token().map_err(|()| HighlightError::LexError)
+        let token = self.lexer.next_token();
+        if let token::Unknown(..) = &token.kind {
+            return Err(HighlightError::LexError);
+        }
+        Ok(token)
    }

    fn peek(&mut self) -> Result<&Token, HighlightError> {
        if self.peek_token.is_none() {
-            self.peek_token = Some(
-                self.lexer.try_next_token().map_err(|()| HighlightError::LexError)?
-            );
+            let token = self.lexer.next_token();
+            if let token::Unknown(..) = &token.kind {
+                return Err(HighlightError::LexError);
+            }
+            self.peek_token = Some(token);
        }
        Ok(self.peek_token.as_ref().unwrap())
    }
@ -237,7 +238,7 @@ impl<'a> Classifier<'a> {
                return Ok(());
            },

-            token::Whitespace => Class::None,
+            token::Whitespace | token::Unknown(..) => Class::None,
            token::Comment => Class::Comment,
            token::DocComment(..) => Class::DocComment,

--- a/src/librustdoc/passes/check_code_block_syntax.rs
+++ b/src/librustdoc/passes/check_code_block_syntax.rs
@ -32,24 +32,20 @@ impl<'a, 'tcx> SyntaxChecker<'a, 'tcx> {
            dox[code_block.code].to_owned(),
        );

-        let errors = {
+        let has_errors = {
+            let mut has_errors = false;
            let mut lexer = Lexer::new(&sess, source_file, None);
-            while let Ok(token::Token { kind, .. }) = lexer.try_next_token() {
-                if kind == token::Eof {
-                    break;
+            loop  {
+                match lexer.next_token().kind {
+                    token::Eof => break,
+                    token::Unknown(..) => has_errors = true,
+                    _ => (),
                }
            }
-
-            let errors = lexer.buffer_fatal_errors();
-
-            if !errors.is_empty() {
-                Err(errors)
-            } else {
-                Ok(())
-            }
+            has_errors
        };

-        if let Err(errors) = errors {
+        if has_errors {
            let mut diag = if let Some(sp) =
                super::source_span_for_markdown_range(self.cx, &dox, &code_block.range, &item.attrs)
            {
@ -58,11 +54,6 @@ impl<'a, 'tcx> SyntaxChecker<'a, 'tcx> {
                    .sess()
                    .struct_span_warn(sp, "could not parse code block as Rust code");

-                for mut err in errors {
-                    diag.note(&format!("error from rustc: {}", err.message()));
-                    err.cancel();
-                }
-
                if code_block.syntax.is_none() && code_block.is_fenced {
                    let sp = sp.from_inner(InnerSpan::new(0, 3));
                    diag.span_suggestion(
@ -82,11 +73,6 @@ impl<'a, 'tcx> SyntaxChecker<'a, 'tcx> {
                    "doc comment contains an invalid Rust code block",
                );

-                for mut err in errors {
-                    // Don't bother reporting the error, because we can't show where it happened.
-                    err.cancel();
-                }
-
                if code_block.syntax.is_none() && code_block.is_fenced {
                    diag.help("mark blocks that do not contain Rust code as text: ```text");
                }
--- a/src/libsyntax/ext/proc_macro_server.rs
+++ b/src/libsyntax/ext/proc_macro_server.rs
@ -184,7 +184,7 @@ impl FromInternal<(TreeAndJoint, &'_ ParseSess, &'_ mut Vec<Self>)>
            }

            OpenDelim(..) | CloseDelim(..) => unreachable!(),
-            Whitespace | Comment | Shebang(..) | Eof => unreachable!(),
+            Whitespace | Comment | Shebang(..) | Unknown(..) | Eof => unreachable!(),
        }
    }
 }
--- a/src/libsyntax/parse/lexer/mod.rs
+++ b/src/libsyntax/parse/lexer/mod.rs
@ -3,7 +3,7 @@ use crate::parse::token::{self, Token, TokenKind};
 use crate::symbol::{sym, Symbol};
 use crate::parse::unescape_error_reporting::{emit_unescape_error, push_escaped_char};

-use errors::{FatalError, Diagnostic, DiagnosticBuilder};
+use errors::{FatalError, DiagnosticBuilder};
 use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION};
 use rustc_lexer::Base;
 use rustc_lexer::unescape;
@ -39,7 +39,6 @@ pub struct StringReader<'a> {
    pos: BytePos,
    /// Stop reading src at this index.
    end_src_index: usize,
-    fatal_errs: Vec<DiagnosticBuilder<'a>>,
    /// Source text to tokenize.
    src: Lrc<String>,
    override_span: Option<Span>,
@ -62,7 +61,6 @@ impl<'a> StringReader<'a> {
            pos: source_file.start_pos,
            end_src_index: src.len(),
            src,
-            fatal_errs: Vec::new(),
            override_span,
        }
    }
@ -89,29 +87,17 @@ impl<'a> StringReader<'a> {
        self.override_span.unwrap_or_else(|| Span::new(lo, hi, NO_EXPANSION))
    }

-    fn unwrap_or_abort(&mut self, res: Result<Token, ()>) -> Token {
-        match res {
-            Ok(tok) => tok,
-            Err(_) => {
-                self.emit_fatal_errors();
-                FatalError.raise();
-            }
-        }
-    }
-
    /// Returns the next token, including trivia like whitespace or comments.
    ///
    /// `Err(())` means that some errors were encountered, which can be
    /// retrieved using `buffer_fatal_errors`.
-    pub fn try_next_token(&mut self) -> Result<Token, ()> {
-        assert!(self.fatal_errs.is_empty());
-
+    pub fn next_token(&mut self) -> Token {
        let start_src_index = self.src_index(self.pos);
        let text: &str = &self.src[start_src_index..self.end_src_index];

        if text.is_empty() {
            let span = self.mk_sp(self.pos, self.pos);
-            return Ok(Token::new(token::Eof, span));
+            return Token::new(token::Eof, span);
        }

        {
@ -125,7 +111,7 @@ impl<'a> StringReader<'a> {
                    let kind = token::Shebang(sym);

                    let span = self.mk_sp(start, self.pos);
-                    return Ok(Token::new(kind, span));
+                    return Token::new(kind, span);
                }
            }
        }
@ -139,39 +125,10 @@ impl<'a> StringReader<'a> {

        // This could use `?`, but that makes code significantly (10-20%) slower.
        // https://github.com/rust-lang/rust/issues/37939
-        let kind = match self.cook_lexer_token(token.kind, start) {
-            Ok(it) => it,
-            Err(err) => return Err(self.fatal_errs.push(err)),
-        };
+        let kind = self.cook_lexer_token(token.kind, start);

        let span = self.mk_sp(start, self.pos);
-        Ok(Token::new(kind, span))
-    }
-
-    /// Returns the next token, including trivia like whitespace or comments.
-    ///
-    /// Aborts in case of an error.
-    pub fn next_token(&mut self) -> Token {
-        let res = self.try_next_token();
-        self.unwrap_or_abort(res)
-    }
-
-    fn emit_fatal_errors(&mut self) {
-        for err in &mut self.fatal_errs {
-            err.emit();
-        }
-
-        self.fatal_errs.clear();
-    }
-
-    pub fn buffer_fatal_errors(&mut self) -> Vec<Diagnostic> {
-        let mut buffer = Vec::new();
-
-        for err in self.fatal_errs.drain(..) {
-            err.buffer(&mut buffer);
-        }
-
-        buffer
+        Token::new(kind, span)
    }

    /// Report a fatal lexical error with a given span.
@ -218,8 +175,8 @@ impl<'a> StringReader<'a> {
        &self,
        token: rustc_lexer::TokenKind,
        start: BytePos,
-    ) -> Result<TokenKind, DiagnosticBuilder<'a>> {
-        let kind = match token {
+    ) -> TokenKind {
+        match token {
            rustc_lexer::TokenKind::LineComment => {
                let string = self.str_from(start);
                // comments with only more "/"s are not doc comments
@ -396,16 +353,12 @@ impl<'a> StringReader<'a> {
                // this should be inside `rustc_lexer`. However, we should first remove compound
                // tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
                // as there will be less overall work to do this way.
-                return match unicode_chars::check_for_substitution(self, start, c, &mut err) {
-                    Some(token) => {
-                        err.emit();
-                        Ok(token)
-                    }
-                    None => Err(err),
-                }
+                let token = unicode_chars::check_for_substitution(self, start, c, &mut err)
+                    .unwrap_or_else(|| token::Unknown(self.symbol_from(start)));
+                err.emit();
+                token
            }
-        };
-        Ok(kind)
+        }
    }

    fn cook_lexer_literal(
--- a/src/libsyntax/parse/lexer/tokentrees.rs
+++ b/src/libsyntax/parse/lexer/tokentrees.rs
@ -217,7 +217,7 @@ impl<'a> TokenTreesReader<'a> {
        loop {
            let token = self.string_reader.next_token();
            match token.kind {
-                token::Whitespace | token::Comment | token::Shebang(_) => {
+                token::Whitespace | token::Comment | token::Shebang(_) | token::Unknown(_) => {
                    self.joint_to_prev = NonJoint;
                }
                _ => {
--- a/src/libsyntax/parse/token.rs
+++ b/src/libsyntax/parse/token.rs
@ -255,6 +255,8 @@ pub enum TokenKind {
    /// A comment.
    Comment,
    Shebang(ast::Name),
+    /// A completely invalid token which should be skipped.
+    Unknown(ast::Name),

    Eof,
 }
@ -603,7 +605,7 @@ impl Token {
            DotDotEq | Comma | Semi | ModSep | RArrow | LArrow | FatArrow | Pound | Dollar |
            Question | OpenDelim(..) | CloseDelim(..) |
            Literal(..) | Ident(..) | Lifetime(..) | Interpolated(..) | DocComment(..) |
-            Whitespace | Comment | Shebang(..) | Eof => return None,
+            Whitespace | Comment | Shebang(..) | Unknown(..) | Eof => return None,
        };

        Some(Token::new(kind, self.span.to(joint.span)))
--- a/src/libsyntax/print/pprust.rs
+++ b/src/libsyntax/print/pprust.rs
@ -288,6 +288,7 @@ fn token_kind_to_string_ext(tok: &TokenKind, convert_dollar_crate: Option<Span>)
        token::Whitespace           => " ".to_string(),
        token::Comment              => "/* */".to_string(),
        token::Shebang(s)           => format!("/* shebang: {}*/", s),
+        token::Unknown(s)           => s.to_string(),

        token::Interpolated(ref nt) => nonterminal_to_string(nt),
    }
--- a/src/test/rustdoc-ui/invalid-syntax.stderr
+++ b/src/test/rustdoc-ui/invalid-syntax.stderr
@ -1,3 +1,21 @@
+error: unknown start of token: \
+ --> <doctest>:1:1
+  |
+1 | \__________pkt->size___________/          \_result->size_/ \__pkt->size__/
+  | ^
+
+error: unknown start of token: \
+ --> <doctest>:1:43
+  |
+1 | \__________pkt->size___________/          \_result->size_/ \__pkt->size__/
+  |                                           ^
+
+error: unknown start of token: \
+ --> <doctest>:1:60
+  |
+1 | \__________pkt->size___________/          \_result->size_/ \__pkt->size__/
+  |                                                            ^
+
 warning: could not parse code block as Rust code
  --> $DIR/invalid-syntax.rs:3:5
   |
@ -6,13 +24,31 @@ LL |   /// ```
 LL | | /// \__________pkt->size___________/          \_result->size_/ \__pkt->size__/
 LL | | /// ```
   | |_______^
-   |
-   = note: error from rustc: unknown start of token: \
 help: mark blocks that do not contain Rust code as text
   |
 LL | /// ```text
   |     ^^^^^^^

+error: unknown start of token: `
+ --> <doctest>:3:30
+  |
+3 |    |     ^^^^^^ did you mean `baz::foobar`?
+  |                              ^
+help: Unicode character '`' (Grave Accent) looks like ''' (Single Quote), but it is not
+  |
+3 |    |     ^^^^^^ did you mean 'baz::foobar`?
+  |                              ^
+
+error: unknown start of token: `
+ --> <doctest>:3:42
+  |
+3 |    |     ^^^^^^ did you mean `baz::foobar`?
+  |                                          ^
+help: Unicode character '`' (Grave Accent) looks like ''' (Single Quote), but it is not
+  |
+3 |    |     ^^^^^^ did you mean `baz::foobar'?
+  |                                          ^
+
 warning: could not parse code block as Rust code
  --> $DIR/invalid-syntax.rs:8:5
   |
@ -23,13 +59,17 @@ LL | | /// LL | use foobar::Baz;
 LL | | ///    |     ^^^^^^ did you mean `baz::foobar`?
 LL | | /// ```
   | |_______^
-   |
-   = note: error from rustc: unknown start of token: `
 help: mark blocks that do not contain Rust code as text
   |
 LL | /// ```text
   |     ^^^^^^^

+error: unknown start of token: \
+ --> <doctest>:1:1
+  |
+1 | \_
+  | ^
+
 warning: could not parse code block as Rust code
  --> $DIR/invalid-syntax.rs:19:5
   |
@ -38,13 +78,17 @@ LL |   /// ```
 LL | | /// \_
 LL | | /// ```
   | |_______^
-   |
-   = note: error from rustc: unknown start of token: \
 help: mark blocks that do not contain Rust code as text
   |
 LL | /// ```text
   |     ^^^^^^^

+error: unknown start of token: \
+ --> <doctest>:1:1
+  |
+1 | \_
+  | ^
+
 warning: could not parse code block as Rust code
  --> $DIR/invalid-syntax.rs:32:5
   |
@ -53,8 +97,12 @@ LL |   /// ```rust
 LL | | /// \_
 LL | | /// ```
   | |_______^
-   |
-   = note: error from rustc: unknown start of token: \
+
+error: unknown start of token: \
+ --> <doctest>:2:5
+  |
+2 |     \_
+  |     ^

 warning: could not parse code block as Rust code
  --> $DIR/invalid-syntax.rs:41:9
@ -63,16 +111,48 @@ LL |   ///     code with bad syntax
   |  _________^
 LL | | ///     \_
   | |__________^
-   |
-   = note: error from rustc: unknown start of token: \
+
+error: unknown start of token: `
+ --> <doctest>:1:1
+  |
+1 | ```
+  | ^
+help: Unicode character '`' (Grave Accent) looks like ''' (Single Quote), but it is not
+  |
+1 | '``
+  | ^
+
+error: unknown start of token: `
+ --> <doctest>:1:2
+  |
+1 | ```
+  |  ^
+help: Unicode character '`' (Grave Accent) looks like ''' (Single Quote), but it is not
+  |
+1 | `'`
+  |  ^
+
+error: unknown start of token: `
+ --> <doctest>:1:3
+  |
+1 | ```
+  |   ^
+help: Unicode character '`' (Grave Accent) looks like ''' (Single Quote), but it is not
+  |
+1 | ``'
+  |   ^

 warning: could not parse code block as Rust code
  --> $DIR/invalid-syntax.rs:55:9
   |
 LL | ///     ```
   |         ^^^
-   |
-   = note: error from rustc: unknown start of token: `
+
+error: unknown start of token: \
+ --> <doctest>:1:1
+  |
+1 | \_
+  | ^

 warning: could not parse code block as Rust code
  --> $DIR/invalid-syntax.rs:58:5
@ -82,8 +162,12 @@ LL |   /// ```edition2018
 LL | | /// \_
 LL | | /// ```
   | |_______^
-   |
-   = note: error from rustc: unknown start of token: \
+
+error: unknown start of token: \
+ --> <doctest>:1:1
+  |
+1 | \_
+  | ^

 warning: doc comment contains an invalid Rust code block
  --> $DIR/invalid-syntax.rs:63:1
@ -95,3 +179,59 @@ LL | | #[doc = "```"]
   |
   = help: mark blocks that do not contain Rust code as text: ```text

+error: unknown start of token: \
+ --> <rustdoc-highlighting>:1:1
+  |
+1 | \_
+  | ^
+
+error: unknown start of token: \
+ --> <rustdoc-highlighting>:1:1
+  |
+1 | \_
+  | ^
+
+error: unknown start of token: `
+ --> <rustdoc-highlighting>:1:1
+  |
+1 | ```
+  | ^
+help: Unicode character '`' (Grave Accent) looks like ''' (Single Quote), but it is not
+  |
+1 | '``
+  | ^
+
+error: unknown start of token: \
+ --> <rustdoc-highlighting>:2:1
+  |
+2 | \_
+  | ^
+
+error: unknown start of token: \
+ --> <rustdoc-highlighting>:1:1
+  |
+1 | \_
+  | ^
+
+error: unknown start of token: \
+ --> <rustdoc-highlighting>:1:1
+  |
+1 | \_
+  | ^
+
+error: unknown start of token: `
+ --> <rustdoc-highlighting>:3:30
+  |
+3 |    |     ^^^^^^ did you mean `baz::foobar`?
+  |                              ^
+help: Unicode character '`' (Grave Accent) looks like ''' (Single Quote), but it is not
+  |
+3 |    |     ^^^^^^ did you mean 'baz::foobar`?
+  |                              ^
+
+error: unknown start of token: \
+ --> <rustdoc-highlighting>:1:1
+  |
+1 | \__________pkt->size___________/          \_result->size_/ \__pkt->size__/
+  | ^
+
--- a/src/test/ui/parser/lex-bad-token.rs
+++ b/src/test/ui/parser/lex-bad-token.rs
@ -1 +1,3 @@
 ● //~ ERROR: unknown start of token
+
+fn main() {}
--- a/src/test/ui/parser/lex-stray-backslash.rs
+++ b/src/test/ui/parser/lex-stray-backslash.rs
@ -1 +1,3 @@
 \ //~ ERROR: unknown start of token: \
+
+fn main() {}
--- a/src/test/ui/parser/unicode-quote-chars.rs
+++ b/src/test/ui/parser/unicode-quote-chars.rs
@ -4,4 +4,7 @@ fn main() {
    println!(“hello world”);
    //~^ ERROR unknown start of token: \u{201c}
    //~^^ HELP Unicode characters '“' (Left Double Quotation Mark) and '”' (Right Double Quotation Mark) look like '"' (Quotation Mark), but are not
+    //~^^^ ERROR unknown start of token: \u{201d}
+    //~^^^^ HELP Unicode character '”' (Right Double Quotation Mark) looks like '"' (Quotation Mark), but it is not
+    //~^^^^^ ERROR expected token: `,`
 }
--- a/src/test/ui/parser/unicode-quote-chars.stderr
+++ b/src/test/ui/parser/unicode-quote-chars.stderr
@ -8,5 +8,21 @@ help: Unicode characters '“' (Left Double Quotation Mark) and '”' (Right Dou
 LL |     println!("hello world");
   |              ^^^^^^^^^^^^^

-error: aborting due to previous error
+error: unknown start of token: \u{201d}
+  --> $DIR/unicode-quote-chars.rs:4:26
+   |
+LL |     println!(“hello world”);
+   |                          ^
+help: Unicode character '”' (Right Double Quotation Mark) looks like '"' (Quotation Mark), but it is not
+   |
+LL |     println!(“hello world");
+   |                          ^
+
+error: expected token: `,`
+  --> $DIR/unicode-quote-chars.rs:4:21
+   |
+LL |     println!(“hello world”);
+   |                     ^^^^^ expected `,`
+
+error: aborting due to 3 previous errors