Rollup merge of #70522 - rcoh:60762-raw-string-errors, r=petrochenkov

Improve error messages for raw strings (#60762)

This diff improves error messages around raw strings in a few ways:
- Catch extra trailing `#` in the parser. This can't be handled in the lexer because we could be in a macro that actually expects another # (see test)
- Refactor & unify error handling in the lexer between ByteStrings and RawByteStrings
- Detect potentially intended terminators (longest sequence of "#*" is suggested)

Fixes #60762
cc @estebank who reviewed the original (abandoned) PR for the same ticket.
r? @Centril
This commit is contained in:
Mazdak Farrokhzad 2020-04-01 14:32:14 +02:00 committed by GitHub
commit c739465b1b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
22 changed files with 385 additions and 74 deletions

View File

@ -17,9 +17,13 @@
mod cursor;
pub mod unescape;
#[cfg(test)]
mod tests;
use self::LiteralKind::*;
use self::TokenKind::*;
use crate::cursor::{Cursor, EOF_CHAR};
use std::convert::TryInto;
/// Parsed token.
/// It doesn't contain information about data that has been parsed,
@ -132,9 +136,80 @@ pub enum LiteralKind {
/// "b"abc"", "b"abc"
ByteStr { terminated: bool },
/// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
RawStr { n_hashes: usize, started: bool, terminated: bool },
RawStr(UnvalidatedRawStr),
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
RawByteStr { n_hashes: usize, started: bool, terminated: bool },
RawByteStr(UnvalidatedRawStr),
}
/// Represents something that looks like a raw string (`r"..."`, `r#"..."#`,
/// `br"..."`, ...), but may have some problems. Use `.validate()` to convert
/// it into either a `ValidatedRawStr` or a `LexRawStrError` describing what
/// is wrong with it.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct UnvalidatedRawStr {
/// Whether the prefix (e.g. `r###"`) is valid, i.e. the leading `#`s are
/// directly followed by the opening `"`.
valid_start: bool,
/// The number of leading `#`.
n_start_hashes: usize,
/// The number of trailing `#`. `n_end_hashes` <= `n_start_hashes`; the lexer
/// treats the literal as terminated when the two counts are equal.
n_end_hashes: usize,
/// The offset starting at `r` or `br` where the user may have intended to end the string.
/// Currently, it is the offset of the `#`s in the longest `"#+` sequence found
/// while looking for the real terminator; `None` if no such sequence was seen.
possible_terminator_offset: Option<usize>,
}
/// Error produced validating a raw string. Represents cases like:
/// - `r##~"abcde"##`: `LexRawStrError::InvalidStarter`
/// - `r###"abcde"##`: `LexRawStrError::NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11) }`
/// - Too many `#`s (more than 65535): `LexRawStrError::TooManyDelimiters`
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LexRawStrError {
/// Non `#` characters exist between `r` and `"` eg. `r#~"..`
InvalidStarter,
/// The string was never terminated. `expected` is the number of `#`s in the
/// opening delimiter and `found` is the largest number of `#`s seen after a
/// closing `"`. `possible_terminator_offset` is the number of characters after
/// `r` or `br` where they may have intended to terminate it.
NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
/// More than 65535 `#`s exist (the count must fit in a `u16`).
TooManyDelimiters,
}
/// A raw string literal whose opening delimiter (`#*"`) and closing delimiter
/// (`"#*`) contain the same number of `#` characters.
///
/// Surplus trailing `#` characters are never part of the literal:
/// `r###"abcde"####` is lexed as a `ValidatedRawStr { n_hashes: 3 }` followed
/// by a separate `#` token.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct ValidatedRawStr {
    n_hashes: u16,
}

impl ValidatedRawStr {
    /// Returns the number of `#` characters used on each side of the literal.
    pub fn num_hashes(&self) -> u16 {
        let Self { n_hashes } = *self;
        n_hashes
    }
}
impl UnvalidatedRawStr {
pub fn validate(self) -> Result<ValidatedRawStr, LexRawStrError> {
if !self.valid_start {
return Err(LexRawStrError::InvalidStarter);
}
// Only up to 65535 `#`s are allowed in raw strings
let n_start_safe: u16 =
self.n_start_hashes.try_into().map_err(|_| LexRawStrError::TooManyDelimiters)?;
if self.n_start_hashes > self.n_end_hashes {
Err(LexRawStrError::NoTerminator {
expected: self.n_start_hashes,
found: self.n_end_hashes,
possible_terminator_offset: self.possible_terminator_offset,
})
} else {
// Since the lexer should never produce a literal with n_end > n_start, if n_start <= n_end,
// they must be equal.
debug_assert_eq!(self.n_start_hashes, self.n_end_hashes);
Ok(ValidatedRawStr { n_hashes: n_start_safe })
}
}
}
/// Base of numeric literal encoding according to its prefix.
@ -209,7 +284,7 @@ pub fn is_whitespace(c: char) -> bool {
// Dedicated whitespace characters from Unicode
| '\u{2028}' // LINE SEPARATOR
| '\u{2029}' // PARAGRAPH SEPARATOR
=> true,
=> true,
_ => false,
}
}
@ -258,12 +333,12 @@ fn advance_token(&mut self) -> Token {
'r' => match (self.first(), self.second()) {
('#', c1) if is_id_start(c1) => self.raw_ident(),
('#', _) | ('"', _) => {
let (n_hashes, started, terminated) = self.raw_double_quoted_string();
let raw_str_i = self.raw_double_quoted_string(1);
let suffix_start = self.len_consumed();
if terminated {
if raw_str_i.n_end_hashes == raw_str_i.n_start_hashes {
self.eat_literal_suffix();
}
let kind = RawStr { n_hashes, started, terminated };
let kind = RawStr(raw_str_i);
Literal { kind, suffix_start }
}
_ => self.ident(),
@ -293,12 +368,14 @@ fn advance_token(&mut self) -> Token {
}
('r', '"') | ('r', '#') => {
self.bump();
let (n_hashes, started, terminated) = self.raw_double_quoted_string();
let raw_str_i = self.raw_double_quoted_string(2);
let suffix_start = self.len_consumed();
let terminated = raw_str_i.n_start_hashes == raw_str_i.n_end_hashes;
if terminated {
self.eat_literal_suffix();
}
let kind = RawByteStr { n_hashes, started, terminated };
let kind = RawByteStr(raw_str_i);
Literal { kind, suffix_start }
}
_ => self.ident(),
@ -594,29 +671,41 @@ fn double_quoted_string(&mut self) -> bool {
false
}
/// Eats the double-quoted string and returns a tuple of
/// (amount of the '#' symbols, raw string started, raw string terminated)
fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) {
/// Eats the double-quoted string and returns an `UnvalidatedRawStr`.
fn raw_double_quoted_string(&mut self, prefix_len: usize) -> UnvalidatedRawStr {
debug_assert!(self.prev() == 'r');
let mut started: bool = false;
let mut finished: bool = false;
let mut valid_start: bool = false;
let start_pos = self.len_consumed();
let (mut possible_terminator_offset, mut max_hashes) = (None, 0);
// Count opening '#' symbols.
let n_hashes = self.eat_while(|c| c == '#');
let n_start_hashes = self.eat_while(|c| c == '#');
// Check that string is started.
match self.bump() {
Some('"') => started = true,
_ => return (n_hashes, started, finished),
Some('"') => valid_start = true,
_ => {
return UnvalidatedRawStr {
valid_start,
n_start_hashes,
n_end_hashes: 0,
possible_terminator_offset,
};
}
}
// Skip the string contents and on each '#' character met, check if this is
// a raw string termination.
while !finished {
loop {
self.eat_while(|c| c != '"');
if self.is_eof() {
return (n_hashes, started, finished);
return UnvalidatedRawStr {
valid_start,
n_start_hashes,
n_end_hashes: max_hashes,
possible_terminator_offset,
};
}
// Eat closing double quote.
@ -624,7 +713,7 @@ fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) {
// Check that amount of closing '#' symbols
// is equal to the amount of opening ones.
let mut hashes_left = n_hashes;
let mut hashes_left = n_start_hashes;
let is_closing_hash = |c| {
if c == '#' && hashes_left != 0 {
hashes_left -= 1;
@ -633,10 +722,23 @@ fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) {
false
}
};
finished = self.eat_while(is_closing_hash) == n_hashes;
}
let n_end_hashes = self.eat_while(is_closing_hash);
(n_hashes, started, finished)
if n_end_hashes == n_start_hashes {
return UnvalidatedRawStr {
valid_start,
n_start_hashes,
n_end_hashes,
possible_terminator_offset: None,
};
} else if n_end_hashes > max_hashes {
// Keep track of possible terminators to give a hint about where there might be
// a missing terminator
possible_terminator_offset =
Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len);
max_hashes = n_end_hashes;
}
}
}
fn eat_decimal_digits(&mut self) -> bool {

View File

@ -0,0 +1,121 @@
#[cfg(test)]
mod tests {
    use crate::*;

    /// Builds the `UnvalidatedRawStr` a test case expects from the lexer.
    fn raw_str(
        n_start_hashes: usize,
        n_end_hashes: usize,
        valid_start: bool,
        possible_terminator_offset: Option<usize>,
    ) -> UnvalidatedRawStr {
        UnvalidatedRawStr { valid_start, n_start_hashes, n_end_hashes, possible_terminator_offset }
    }

    /// Builds the `NoTerminator` validation result a test case expects.
    fn no_terminator(
        expected: usize,
        found: usize,
        possible_terminator_offset: Option<usize>,
    ) -> Result<ValidatedRawStr, LexRawStrError> {
        Err(LexRawStrError::NoTerminator { expected, found, possible_terminator_offset })
    }

    /// Lexes `r` followed by `s` as a raw string and checks both the raw lexer
    /// output and the result of validating it.
    fn check_raw_str(
        s: &str,
        expected: UnvalidatedRawStr,
        validated: Result<ValidatedRawStr, LexRawStrError>,
    ) {
        let source = format!("r{}", s);
        let mut cursor = Cursor::new(&source);
        // Step over the leading `r`; the lexing routine starts right after it.
        cursor.bump();
        let lexed = cursor.raw_double_quoted_string(0);
        assert_eq!(lexed, expected);
        assert_eq!(lexed.validate(), validated);
    }

    #[test]
    fn test_naked_raw_str() {
        let expected = raw_str(0, 0, true, None);
        check_raw_str(r#""abc""#, expected, Ok(ValidatedRawStr { n_hashes: 0 }));
    }

    #[test]
    fn test_raw_no_start() {
        let expected = raw_str(0, 0, true, None);
        check_raw_str(r##""abc"#"##, expected, Ok(ValidatedRawStr { n_hashes: 0 }));
    }

    #[test]
    fn test_too_many_terminators() {
        // this error is handled in the parser later
        let expected = raw_str(1, 1, true, None);
        check_raw_str(r###"#"abc"##"###, expected, Ok(ValidatedRawStr { n_hashes: 1 }));
    }

    #[test]
    fn test_unterminated() {
        // No closing `"` followed by `#` at all.
        check_raw_str(r#"#"abc"#, raw_str(1, 0, true, None), no_terminator(1, 0, None));

        // A shorter terminator exists and is reported as the likely intended end.
        check_raw_str(
            r###"##"abc"#"###,
            raw_str(2, 1, true, Some(7)),
            no_terminator(2, 1, Some(7)),
        );

        // We're looking for "# not just any #
        check_raw_str(r###"##"abc#"###, raw_str(2, 0, true, None), no_terminator(2, 0, None));
    }

    #[test]
    fn test_invalid_start() {
        let expected = raw_str(1, 0, false, None);
        check_raw_str(r##"#~"abc"#"##, expected, Err(LexRawStrError::InvalidStarter));
    }
}

View File

@ -1,20 +1,20 @@
use rustc_ast::token::{self, Token, TokenKind};
use rustc_ast::util::comments;
use rustc_data_structures::sync::Lrc;
use rustc_errors::{error_code, DiagnosticBuilder, FatalError};
use rustc_lexer::unescape;
use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError};
use rustc_lexer::Base;
use rustc_lexer::{unescape, LexRawStrError, UnvalidatedRawStr, ValidatedRawStr};
use rustc_session::parse::ParseSess;
use rustc_span::symbol::{sym, Symbol};
use rustc_span::{BytePos, Pos, Span};
use log::debug;
use std::char;
use std::convert::TryInto;
mod tokentrees;
mod unescape_error_reporting;
mod unicode_chars;
use unescape_error_reporting::{emit_unescape_error, push_escaped_char};
#[derive(Clone, Debug)]
@ -373,30 +373,22 @@ fn cook_lexer_literal(
let id = self.symbol_from_to(content_start, content_end);
(token::ByteStr, id)
}
rustc_lexer::LiteralKind::RawStr { n_hashes, started, terminated } => {
if !started {
self.report_non_started_raw_string(start);
}
if !terminated {
self.report_unterminated_raw_string(start, n_hashes)
}
let n_hashes: u16 = self.restrict_n_hashes(start, n_hashes);
rustc_lexer::LiteralKind::RawStr(unvalidated_raw_str) => {
let valid_raw_str = self.validate_and_report_errors(start, unvalidated_raw_str);
let n_hashes = valid_raw_str.num_hashes();
let n = u32::from(n_hashes);
let content_start = start + BytePos(2 + n);
let content_end = suffix_start - BytePos(1 + n);
self.validate_raw_str_escape(content_start, content_end);
let id = self.symbol_from_to(content_start, content_end);
(token::StrRaw(n_hashes), id)
}
rustc_lexer::LiteralKind::RawByteStr { n_hashes, started, terminated } => {
if !started {
self.report_non_started_raw_string(start);
}
if !terminated {
self.report_unterminated_raw_string(start, n_hashes)
}
let n_hashes: u16 = self.restrict_n_hashes(start, n_hashes);
rustc_lexer::LiteralKind::RawByteStr(unvalidated_raw_str) => {
let validated_raw_str = self.validate_and_report_errors(start, unvalidated_raw_str);
let n_hashes = validated_raw_str.num_hashes();
let n = u32::from(n_hashes);
let content_start = start + BytePos(3 + n);
let content_end = suffix_start - BytePos(1 + n);
self.validate_raw_byte_str_escape(content_start, content_end);
@ -482,6 +474,26 @@ fn forbid_bare_cr(&self, start: BytePos, s: &str, errmsg: &str) {
}
}
/// Validates a raw string produced by the lexer, emitting a fatal diagnostic
/// if it is malformed.
///
/// Returns the validated string on success. Every error path calls a
/// reporting method that never returns.
fn validate_and_report_errors(
    &self,
    start: BytePos,
    unvalidated_raw_str: UnvalidatedRawStr,
) -> ValidatedRawStr {
    let error = match unvalidated_raw_str.validate() {
        Ok(validated) => return validated,
        Err(error) => error,
    };

    match error {
        LexRawStrError::InvalidStarter => self.report_non_started_raw_string(start),
        LexRawStrError::TooManyDelimiters => self.report_too_many_hashes(start),
        LexRawStrError::NoTerminator { expected, found, possible_terminator_offset } => self
            .report_unterminated_raw_string(start, expected, possible_terminator_offset, found),
    }
}
fn report_non_started_raw_string(&self, start: BytePos) -> ! {
let bad_char = self.str_from(start).chars().last().unwrap();
self.struct_fatal_span_char(
@ -495,38 +507,51 @@ fn report_non_started_raw_string(&self, start: BytePos) -> ! {
FatalError.raise()
}
fn report_unterminated_raw_string(&self, start: BytePos, n_hashes: usize) -> ! {
fn report_unterminated_raw_string(
&self,
start: BytePos,
n_hashes: usize,
possible_offset: Option<usize>,
found_terminators: usize,
) -> ! {
let mut err = self.sess.span_diagnostic.struct_span_fatal_with_code(
self.mk_sp(start, start),
"unterminated raw string",
error_code!(E0748),
);
err.span_label(self.mk_sp(start, start), "unterminated raw string");
if n_hashes > 0 {
err.note(&format!(
"this raw string should be terminated with `\"{}`",
"#".repeat(n_hashes as usize)
"#".repeat(n_hashes)
));
}
if let Some(possible_offset) = possible_offset {
let lo = start + BytePos(possible_offset as u32);
let hi = lo + BytePos(found_terminators as u32);
let span = self.mk_sp(lo, hi);
err.span_suggestion(
span,
"consider terminating the string here",
"#".repeat(n_hashes),
Applicability::MaybeIncorrect,
);
}
err.emit();
FatalError.raise()
}
fn restrict_n_hashes(&self, start: BytePos, n_hashes: usize) -> u16 {
match n_hashes.try_into() {
Ok(n_hashes) => n_hashes,
Err(_) => {
self.fatal_span_(
start,
self.pos,
"too many `#` symbols: raw strings may be \
delimited by up to 65535 `#` symbols",
)
.raise();
}
}
fn report_too_many_hashes(&self, start: BytePos) -> ! {
self.fatal_span_(
start,
self.pos,
"too many `#` symbols: raw strings may be delimited by up to 65535 `#` symbols",
)
.raise();
}
fn validate_char_escape(&self, content_start: BytePos, content_end: BytePos) {

View File

@ -4,6 +4,7 @@
#![feature(crate_visibility_modifier)]
#![feature(bindings_after_at)]
#![feature(try_blocks)]
#![feature(or_patterns)]
use rustc_ast::ast;
use rustc_ast::token::{self, Nonterminal};

View File

@ -6,7 +6,7 @@
};
use rustc_ast::ast::{AttrVec, ItemKind, Mutability, Pat, PatKind, PathSegment, QSelf, Ty, TyKind};
use rustc_ast::ptr::P;
use rustc_ast::token::{self, TokenKind};
use rustc_ast::token::{self, Lit, LitKind, TokenKind};
use rustc_ast::util::parser::AssocOp;
use rustc_ast_pretty::pprust;
use rustc_data_structures::fx::FxHashSet;
@ -255,6 +255,10 @@ fn tokens_to_string(tokens: &[TokenType]) -> String {
}
}
if self.check_too_many_raw_str_terminators(&mut err) {
return Err(err);
}
let sm = self.sess.source_map();
if self.prev_token.span == DUMMY_SP {
// Account for macro context where the previous span might not be
@ -282,6 +286,29 @@ fn tokens_to_string(tokens: &[TokenType]) -> String {
Err(err)
}
/// Detects a raw (byte) string literal immediately followed by a `#` token,
/// which indicates more `#`s were written after the string than before it.
///
/// When that pattern is found, `err` is rewritten into a dedicated
/// "too many `#`" diagnostic with a removal suggestion and `true` is
/// returned; otherwise `err` is left untouched and `false` is returned.
fn check_too_many_raw_str_terminators(&mut self, err: &mut DiagnosticBuilder<'_>) -> bool {
    let n_hashes = match (&self.prev_token.kind, &self.token.kind) {
        (
            TokenKind::Literal(Lit {
                kind: LitKind::StrRaw(n_hashes) | LitKind::ByteStrRaw(n_hashes),
                ..
            }),
            TokenKind::Pound,
        ) => n_hashes,
        _ => return false,
    };

    err.set_primary_message("too many `#` when terminating raw string");
    // The current token is the surplus `#`, so suggest deleting it.
    err.span_suggestion(
        self.token.span,
        "remove the extra `#`",
        String::new(),
        Applicability::MachineApplicable,
    );
    err.note(&format!("the raw string started with {} `#`s", n_hashes));
    true
}
pub fn maybe_annotate_with_ascription(
&mut self,
err: &mut DiagnosticBuilder<'_>,
@ -491,7 +518,7 @@ fn attempt_chained_comparison_suggestion(
.unwrap_or_else(|_| pprust::expr_to_string(&e))
};
err.span_suggestion_verbose(
inner_op.span.shrink_to_hi(),
inner_op.span.shrink_to_hi(),
"split the comparison into two",
format!(" && {}", expr_to_str(&r1)),
Applicability::MaybeIncorrect,
@ -1086,7 +1113,7 @@ pub(super) fn could_ascription_be_path(&self, node: &ast::ExprKind) -> bool {
self.look_ahead(2, |t| t.is_ident())
|| self.look_ahead(1, |t| t == &token::ModSep)
&& (self.look_ahead(2, |t| t.is_ident()) || // `foo:bar::baz`
self.look_ahead(2, |t| t == &token::Lt)) // `foo:bar::<baz>`
self.look_ahead(2, |t| t == &token::Lt)) // `foo:bar::<baz>`
}
pub(super) fn recover_seq_parse_error(

View File

@ -1,4 +0,0 @@
static s: &'static str =
r#"
"## //~ ERROR expected one of `.`, `;`, `?`, or an operator, found `#`
;

View File

@ -1,8 +0,0 @@
error: expected one of `.`, `;`, `?`, or an operator, found `#`
--> $DIR/raw-str-unbalanced.rs:3:9
|
LL | "##
| ^ expected one of `.`, `;`, `?`, or an operator
error: aborting due to previous error

View File

@ -2,7 +2,9 @@ error[E0748]: unterminated raw string
--> $DIR/raw-byte-string-eof.rs:2:5
|
LL | br##"a"#;
| ^ unterminated raw string
| ^ - help: consider terminating the string here: `##`
| |
| unterminated raw string
|
= note: this raw string should be terminated with `"##`

View File

@ -0,0 +1,14 @@
// check-pass
// Checks that a raw string followed by an extra `#` is still accepted when a
// macro actually expects that `#`: the lexer must not treat the trailing `#`
// as an error by itself.

// Matches one token tree followed by a literal `#` token.
macro_rules! m1 {
($tt:tt #) => ()
}
// Matches exactly one token tree.
macro_rules! m2 {
($tt:tt) => ()
}
fn main() {
// `r#"abc"#` is the raw string; the final `#` is a separate token for `m1`.
m1!(r#"abc"##);
// A correctly terminated raw string is a single token tree.
m2!(r#"abc"#);
}

View File

@ -0,0 +1,4 @@
static s: &'static str =
r#"
"## //~ too many `#` when terminating raw string
;

View File

@ -0,0 +1,10 @@
error: too many `#` when terminating raw string
--> $DIR/raw-str-unbalanced.rs:3:9
|
LL | "##
| ^ help: remove the extra `#`
|
= note: the raw string started with 1 `#`s
error: aborting due to previous error

View File

@ -0,0 +1,4 @@
fn main() {
let x = r###"here's a long string"# "# "##;
//~^ ERROR unterminated raw string
}

View File

@ -0,0 +1,11 @@
error[E0748]: unterminated raw string
--> $DIR/raw-string-2.rs:2:13
|
LL | let x = r###"here's a long string"# "# "##;
| ^ unterminated raw string -- help: consider terminating the string here: `###`
|
= note: this raw string should be terminated with `"###`
error: aborting due to previous error
For more information about this error, try `rustc --explain E0748`.

View File

@ -1,8 +1,10 @@
error[E0748]: unterminated raw string
--> $DIR/raw_string.rs:2:13
--> $DIR/raw-string.rs:2:13
|
LL | let x = r##"lol"#;
| ^ unterminated raw string
| ^ - help: consider terminating the string here: `##`
| |
| unterminated raw string
|
= note: this raw string should be terminated with `"##`