Emit a single error for contiguous sequences of Unicode homoglyphs
This commit is contained in:
parent
ef4046e4f3
commit
a3d6bc3468
@ -79,7 +79,7 @@ fn mk_sp(&self, lo: BytePos, hi: BytePos) -> Span {
|
||||
/// preceded by whitespace.
|
||||
fn next_token(&mut self) -> (Token, bool) {
|
||||
let mut preceded_by_whitespace = false;
|
||||
|
||||
let mut swallow_next_invalid = 0;
|
||||
// Skip trivial (whitespace & comments) tokens
|
||||
loop {
|
||||
let token = self.cursor.advance_token();
|
||||
@ -232,19 +232,34 @@ fn next_token(&mut self) -> (Token, bool) {
|
||||
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
|
||||
|
||||
rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
|
||||
let c = self.str_from(start).chars().next().unwrap();
|
||||
// Don't emit diagnostics for sequences of the same invalid token
|
||||
if swallow_next_invalid > 0 {
|
||||
swallow_next_invalid -= 1;
|
||||
continue;
|
||||
}
|
||||
let mut it = self.str_from_to_end(start).chars();
|
||||
let c = it.next().unwrap();
|
||||
let repeats = it.take_while(|c1| *c1 == c).count();
|
||||
let mut err =
|
||||
self.struct_err_span_char(start, self.pos, "unknown start of token", c);
|
||||
self.struct_err_span_char(start, self.pos + Pos::from_usize(repeats * c.len_utf8()), "unknown start of token", c);
|
||||
// FIXME: the lexer could be used to turn the ASCII version of unicode
|
||||
// homoglyphs, instead of keeping a table in `check_for_substitution` into the
|
||||
// token. Ideally, this should be inside `rustc_lexer`. However, we should
|
||||
// first remove compound tokens like `<<` from `rustc_lexer`, and then add
|
||||
// fancier error recovery to it, as there will be less overall work to do this
|
||||
// way.
|
||||
let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
|
||||
let token = unicode_chars::check_for_substitution(self, start, c, &mut err, repeats+1);
|
||||
if c == '\x00' {
|
||||
err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
|
||||
}
|
||||
if repeats > 0 {
|
||||
if repeats == 1 {
|
||||
err.note(format!("character appears once more"));
|
||||
} else {
|
||||
err.note(format!("character appears {repeats} more times"));
|
||||
}
|
||||
swallow_next_invalid = repeats;
|
||||
}
|
||||
err.emit();
|
||||
if let Some(token) = token {
|
||||
token
|
||||
@ -486,6 +501,11 @@ fn str_from_to(&self, start: BytePos, end: BytePos) -> &str {
|
||||
&self.src[self.src_index(start)..self.src_index(end)]
|
||||
}
|
||||
|
||||
/// Slice of the source text spanning from `start` until the end
|
||||
fn str_from_to_end(&self, start: BytePos) -> &str {
|
||||
&self.src[self.src_index(start)..]
|
||||
}
|
||||
|
||||
fn report_raw_str_error(&self, start: BytePos, prefix_len: u32) -> ! {
|
||||
match rustc_lexer::validate_raw_str(self.str_from(start), prefix_len) {
|
||||
Err(RawStrError::InvalidStarter { bad_char }) => {
|
||||
|
@ -337,10 +337,11 @@ pub(super) fn check_for_substitution<'a>(
|
||||
pos: BytePos,
|
||||
ch: char,
|
||||
err: &mut Diagnostic,
|
||||
count: usize,
|
||||
) -> Option<token::TokenKind> {
|
||||
let &(_u_char, u_name, ascii_char) = UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch)?;
|
||||
|
||||
let span = Span::with_root_ctxt(pos, pos + Pos::from_usize(ch.len_utf8()));
|
||||
let span = Span::with_root_ctxt(pos, pos + Pos::from_usize(ch.len_utf8() * count));
|
||||
|
||||
let Some((_ascii_char, ascii_name, token)) = ASCII_ARRAY.iter().find(|&&(c, _, _)| c == ascii_char) else {
|
||||
let msg = format!("substitution character not found for '{}'", ch);
|
||||
@ -369,7 +370,12 @@ pub(super) fn check_for_substitution<'a>(
|
||||
"Unicode character '{}' ({}) looks like '{}' ({}), but it is not",
|
||||
ch, u_name, ascii_char, ascii_name
|
||||
);
|
||||
err.span_suggestion(span, &msg, ascii_char, Applicability::MaybeIncorrect);
|
||||
err.span_suggestion(
|
||||
span,
|
||||
&msg,
|
||||
ascii_char.to_string().repeat(count),
|
||||
Applicability::MaybeIncorrect,
|
||||
);
|
||||
}
|
||||
token.clone()
|
||||
}
|
||||
|
@ -77,8 +77,6 @@ LL | /// ```
|
||||
| ^^^
|
||||
|
|
||||
= note: error from rustc: unknown start of token: `
|
||||
= note: error from rustc: unknown start of token: `
|
||||
= note: error from rustc: unknown start of token: `
|
||||
|
||||
warning: could not parse code block as Rust code
|
||||
--> $DIR/invalid-syntax.rs:64:5
|
||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -2,4 +2,8 @@ fn main() {
|
||||
let y = 0;
|
||||
//~^ ERROR unknown start of token: \u{37e}
|
||||
//~^^ HELP Unicode character ';' (Greek Question Mark) looks like ';' (Semicolon), but it is not
|
||||
let x = 0;
|
||||
//~^ ERROR unknown start of token: \u{a0}
|
||||
//~^^ NOTE character appears 3 more times
|
||||
//~^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
|
||||
}
|
||||
|
@ -9,5 +9,17 @@ help: Unicode character ';' (Greek Question Mark) looks like ';' (Semicolon), b
|
||||
LL | let y = 0;
|
||||
| ~
|
||||
|
||||
error: aborting due to previous error
|
||||
error: unknown start of token: \u{a0}
|
||||
--> $DIR/unicode-chars.rs:5:5
|
||||
|
|
||||
LL | let x = 0;
|
||||
| ^^^^
|
||||
|
|
||||
= note: character appears 3 more times
|
||||
help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
|
||||
|
|
||||
LL | let x = 0;
|
||||
| ++++
|
||||
|
||||
error: aborting due to 2 previous errors
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user