diff --git a/crates/parser/src/lexed_str.rs b/crates/parser/src/lexed_str.rs index 30c1c4f8c75..8e8bdce1eef 100644 --- a/crates/parser/src/lexed_str.rs +++ b/crates/parser/src/lexed_str.rs @@ -9,8 +9,11 @@ //! include info about comments and whitespace. use rustc_dependencies::lexer as rustc_lexer; + use std::ops; +use rustc_lexer::unescape::{EscapeError, Mode}; + use crate::{ SyntaxKind::{self, *}, T, @@ -254,13 +257,28 @@ fn extend_literal(&mut self, len: usize, kind: &rustc_lexer::LiteralKind) { rustc_lexer::LiteralKind::Char { terminated } => { if !terminated { err = "Missing trailing `'` symbol to terminate the character literal"; + } else { + let text = &self.res.text[self.offset + 1..][..len - 1]; /* drop the first byte of the token, presumably the opening quote */ + let i = text.rfind('\'').unwrap(); /* last single quote in the remaining text; unwrap panics if none is found */ + let text = &text[..i]; /* keep only what precedes that quote */ + if let Err(e) = rustc_lexer::unescape::unescape_char(text) { + err = error_to_diagnostic_message(e, Mode::Char); + } } CHAR } rustc_lexer::LiteralKind::Byte { terminated } => { if !terminated { err = "Missing trailing `'` symbol to terminate the byte literal"; + } else { + let text = &self.res.text[self.offset + 2..][..len - 2]; /* drop the first two bytes of the token, presumably the b prefix and the opening quote */ + let i = text.rfind('\'').unwrap(); + let text = &text[..i]; + if let Err(e) = rustc_lexer::unescape::unescape_char(text) { /* NOTE(review): this Byte arm validates with unescape_char yet reports with Mode::Byte; presumably a byte-specific unescape routine was intended, confirm before relying on these diagnostics */ + err = error_to_diagnostic_message(e, Mode::Byte); + } } + BYTE } rustc_lexer::LiteralKind::Str { terminated } => { @@ -305,3 +323,40 @@ fn extend_literal(&mut self, len: usize, kind: &rustc_lexer::LiteralKind) { self.push(syntax_kind, len, err); } } + +fn error_to_diagnostic_message(error: EscapeError, mode: Mode) -> &'static str { /* Returns the static message text for an unescape error. The mode only changes the wording for InvalidEscape, EscapeOnlyChar and NonAsciiCharInByte. Several variants map to an empty string. NOTE(review): how the caller treats an empty message is not visible here, confirm. */ + match error { + EscapeError::ZeroChars => "empty character literal", + EscapeError::MoreThanOneChar => "character literal may only contain one codepoint", + EscapeError::LoneSlash => "", + EscapeError::InvalidEscape if mode == Mode::Byte || mode == Mode::ByteStr => { + "unknown byte escape" + } + EscapeError::InvalidEscape => "unknown character escape", + EscapeError::BareCarriageReturn => "",
EscapeError::BareCarriageReturnInRawString => "", + EscapeError::EscapeOnlyChar if mode == Mode::Byte => "byte constant must be escaped", + EscapeError::EscapeOnlyChar => "character constant must be escaped", + EscapeError::TooShortHexEscape => "numeric character escape is too short", + EscapeError::InvalidCharInHexEscape => "invalid character in numeric character escape", + EscapeError::OutOfRangeHexEscape => "out of range hex escape", + EscapeError::NoBraceInUnicodeEscape => "incorrect unicode escape sequence", + EscapeError::InvalidCharInUnicodeEscape => "invalid character in unicode escape", + EscapeError::EmptyUnicodeEscape => "empty unicode escape", + EscapeError::UnclosedUnicodeEscape => "unterminated unicode escape", + EscapeError::LeadingUnderscoreUnicodeEscape => "invalid start of unicode escape", + EscapeError::OverlongUnicodeEscape => "overlong unicode escape", + EscapeError::LoneSurrogateUnicodeEscape => "invalid unicode character escape", + EscapeError::OutOfRangeUnicodeEscape => "invalid unicode character escape", + EscapeError::UnicodeEscapeInByte => "unicode escape in byte string", + EscapeError::NonAsciiCharInByte if mode == Mode::Byte => { + "non-ASCII character in byte literal" + } + EscapeError::NonAsciiCharInByte if mode == Mode::ByteStr => { + "non-ASCII character in byte string literal" + } + EscapeError::NonAsciiCharInByte => "non-ASCII character in raw byte string literal", /* reached for every mode other than Byte and ByteStr */ + EscapeError::UnskippedWhitespaceWarning => "", + EscapeError::MultipleSkippedLinesWarning => "", + } +} diff --git a/crates/parser/test_data/lexer/err/byte_char_literals.rast b/crates/parser/test_data/lexer/err/byte_char_literals.rast new file mode 100644 index 00000000000..24892bc2394 --- /dev/null +++ b/crates/parser/test_data/lexer/err/byte_char_literals.rast @@ -0,0 +1,92 @@ +BYTE "b''" error: empty character literal +WHITESPACE "\n" +BYTE "b'\\'" error: Missing trailing `'` symbol to terminate the byte literal +WHITESPACE "\n" +BYTE "b'\n'" error: byte constant
must be escaped +WHITESPACE "\n" +BYTE "b'spam'" error: character literal may only contain one codepoint +WHITESPACE "\n" +BYTE "b'\\x0ff'" error: character literal may only contain one codepoint +WHITESPACE "\n" +BYTE "b'\\\"a'" error: character literal may only contain one codepoint +WHITESPACE "\n" +BYTE "b'\\na'" error: character literal may only contain one codepoint +WHITESPACE "\n" +BYTE "b'\\ra'" error: character literal may only contain one codepoint +WHITESPACE "\n" +BYTE "b'\\ta'" error: character literal may only contain one codepoint +WHITESPACE "\n" +BYTE "b'\\\\a'" error: character literal may only contain one codepoint +WHITESPACE "\n" +BYTE "b'\\'a'" error: character literal may only contain one codepoint +WHITESPACE "\n" +BYTE "b'\\0a'" error: character literal may only contain one codepoint +WHITESPACE "\n" +BYTE "b'\\u{0}x'" error: character literal may only contain one codepoint +WHITESPACE "\n" +BYTE "b'\\u{1F63b}}'" error: character literal may only contain one codepoint +WHITESPACE "\n" +BYTE "b'\\v'" error: unknown byte escape +WHITESPACE "\n" +BYTE "b'\\💩'" error: unknown byte escape +WHITESPACE "\n" +BYTE "b'\\●'" error: unknown byte escape +WHITESPACE "\n" +BYTE "b'\\\\\\r'" error: character literal may only contain one codepoint +WHITESPACE "\n" +BYTE "b'\\x'" error: numeric character escape is too short +WHITESPACE "\n" +BYTE "b'\\x0'" error: numeric character escape is too short +WHITESPACE "\n" +BYTE "b'\\xf'" error: numeric character escape is too short +WHITESPACE "\n" +BYTE "b'\\xa'" error: numeric character escape is too short +WHITESPACE "\n" +BYTE "b'\\xx'" error: invalid character in numeric character escape +WHITESPACE "\n" +BYTE "b'\\xы'" error: invalid character in numeric character escape +WHITESPACE "\n" +BYTE "b'\\x🦀'" error: invalid character in numeric character escape +WHITESPACE "\n" +BYTE "b'\\xtt'" error: invalid character in numeric character escape +WHITESPACE "\n" +BYTE "b'\\xff'" error: out of range hex escape 
+WHITESPACE "\n" +BYTE "b'\\xFF'" error: out of range hex escape +WHITESPACE "\n" +BYTE "b'\\x80'" error: out of range hex escape +WHITESPACE "\n" +BYTE "b'\\u'" error: incorrect unicode escape sequence +WHITESPACE "\n" +BYTE "b'\\u[0123]'" error: incorrect unicode escape sequence +WHITESPACE "\n" +BYTE "b'\\u{0x}'" error: invalid character in unicode escape +WHITESPACE "\n" +BYTE "b'\\u{'" error: unterminated unicode escape +WHITESPACE "\n" +BYTE "b'\\u{0000'" error: unterminated unicode escape +WHITESPACE "\n" +BYTE "b'\\u{}'" error: empty unicode escape +WHITESPACE "\n" +BYTE "b'\\u{_0000}'" error: invalid start of unicode escape +WHITESPACE "\n" +BYTE "b'\\u{0000000}'" error: overlong unicode escape +WHITESPACE "\n" +BYTE "b'\\u{FFFFFF}'" error: invalid unicode character escape +WHITESPACE "\n" +BYTE "b'\\u{ffffff}'" error: invalid unicode character escape +WHITESPACE "\n" +BYTE "b'\\u{ffffff}'" error: invalid unicode character escape +WHITESPACE "\n" +BYTE "b'\\u{DC00}'" error: invalid unicode character escape +WHITESPACE "\n" +BYTE "b'\\u{DDDD}'" error: invalid unicode character escape +WHITESPACE "\n" +BYTE "b'\\u{DFFF}'" error: invalid unicode character escape +WHITESPACE "\n" +BYTE "b'\\u{D800}'" error: invalid unicode character escape +WHITESPACE "\n" +BYTE "b'\\u{DAAA}'" error: invalid unicode character escape +WHITESPACE "\n" +BYTE "b'\\u{DBFF}'" error: invalid unicode character escape +WHITESPACE "\n" diff --git a/crates/parser/test_data/lexer/err/byte_char_literals.rs b/crates/parser/test_data/lexer/err/byte_char_literals.rs new file mode 100644 index 00000000000..9f2f4309e76 --- /dev/null +++ b/crates/parser/test_data/lexer/err/byte_char_literals.rs @@ -0,0 +1,47 @@ +b'' +b'\' +b' +' +b'spam' +b'\x0ff' +b'\"a' +b'\na' +b'\ra' +b'\ta' +b'\\a' +b'\'a' +b'\0a' +b'\u{0}x' +b'\u{1F63b}}' +b'\v' +b'\💩' +b'\●' +b'\\\r' +b'\x' +b'\x0' +b'\xf' +b'\xa' +b'\xx' +b'\xы' +b'\x🦀' +b'\xtt' +b'\xff' +b'\xFF' +b'\x80' +b'\u' +b'\u[0123]' +b'\u{0x}' +b'\u{' 
+b'\u{0000' +b'\u{}' +b'\u{_0000}' +b'\u{0000000}' +b'\u{FFFFFF}' +b'\u{ffffff}' +b'\u{ffffff}' +b'\u{DC00}' +b'\u{DDDD}' +b'\u{DFFF}' +b'\u{D800}' +b'\u{DAAA}' +b'\u{DBFF}' diff --git a/crates/parser/test_data/lexer/err/char_literals.rast b/crates/parser/test_data/lexer/err/char_literals.rast new file mode 100644 index 00000000000..b1e1364d4c2 --- /dev/null +++ b/crates/parser/test_data/lexer/err/char_literals.rast @@ -0,0 +1,92 @@ +CHAR "'hello'" error: character literal may only contain one codepoint +WHITESPACE "\n" +CHAR "''" error: empty character literal +WHITESPACE "\n" +CHAR "'\n'" error: character constant must be escaped +WHITESPACE "\n" +CHAR "'spam'" error: character literal may only contain one codepoint +WHITESPACE "\n" +CHAR "'\\x0ff'" error: character literal may only contain one codepoint +WHITESPACE "\n" +CHAR "'\\\"a'" error: character literal may only contain one codepoint +WHITESPACE "\n" +CHAR "'\\na'" error: character literal may only contain one codepoint +WHITESPACE "\n" +CHAR "'\\ra'" error: character literal may only contain one codepoint +WHITESPACE "\n" +CHAR "'\\ta'" error: character literal may only contain one codepoint +WHITESPACE "\n" +CHAR "'\\\\a'" error: character literal may only contain one codepoint +WHITESPACE "\n" +CHAR "'\\'a'" error: character literal may only contain one codepoint +WHITESPACE "\n" +CHAR "'\\0a'" error: character literal may only contain one codepoint +WHITESPACE "\n" +CHAR "'\\u{0}x'" error: character literal may only contain one codepoint +WHITESPACE "\n" +CHAR "'\\u{1F63b}}'" error: character literal may only contain one codepoint +WHITESPACE "\n" +CHAR "'\\v'" error: unknown character escape +WHITESPACE "\n" +CHAR "'\\💩'" error: unknown character escape +WHITESPACE "\n" +CHAR "'\\●'" error: unknown character escape +WHITESPACE "\n" +CHAR "'\\\\\\r'" error: character literal may only contain one codepoint +WHITESPACE "\n" +CHAR "'\\x'" error: numeric character escape is too short +WHITESPACE "\n" 
+CHAR "'\\x0'" error: numeric character escape is too short +WHITESPACE "\n" +CHAR "'\\xf'" error: numeric character escape is too short +WHITESPACE "\n" +CHAR "'\\xa'" error: numeric character escape is too short +WHITESPACE "\n" +CHAR "'\\xx'" error: invalid character in numeric character escape +WHITESPACE "\n" +CHAR "'\\xы'" error: invalid character in numeric character escape +WHITESPACE "\n" +CHAR "'\\x🦀'" error: invalid character in numeric character escape +WHITESPACE "\n" +CHAR "'\\xtt'" error: invalid character in numeric character escape +WHITESPACE "\n" +CHAR "'\\xff'" error: out of range hex escape +WHITESPACE "\n" +CHAR "'\\xFF'" error: out of range hex escape +WHITESPACE "\n" +CHAR "'\\x80'" error: out of range hex escape +WHITESPACE "\n" +CHAR "'\\u'" error: incorrect unicode escape sequence +WHITESPACE "\n" +CHAR "'\\u[0123]'" error: incorrect unicode escape sequence +WHITESPACE "\n" +CHAR "'\\u{0x}'" error: invalid character in unicode escape +WHITESPACE "\n" +CHAR "'\\u{'" error: unterminated unicode escape +WHITESPACE "\n" +CHAR "'\\u{0000'" error: unterminated unicode escape +WHITESPACE "\n" +CHAR "'\\u{}'" error: empty unicode escape +WHITESPACE "\n" +CHAR "'\\u{_0000}'" error: invalid start of unicode escape +WHITESPACE "\n" +CHAR "'\\u{0000000}'" error: overlong unicode escape +WHITESPACE "\n" +CHAR "'\\u{FFFFFF}'" error: invalid unicode character escape +WHITESPACE "\n" +CHAR "'\\u{ffffff}'" error: invalid unicode character escape +WHITESPACE "\n" +CHAR "'\\u{ffffff}'" error: invalid unicode character escape +WHITESPACE "\n" +CHAR "'\\u{DC00}'" error: invalid unicode character escape +WHITESPACE "\n" +CHAR "'\\u{DDDD}'" error: invalid unicode character escape +WHITESPACE "\n" +CHAR "'\\u{DFFF}'" error: invalid unicode character escape +WHITESPACE "\n" +CHAR "'\\u{D800}'" error: invalid unicode character escape +WHITESPACE "\n" +CHAR "'\\u{DAAA}'" error: invalid unicode character escape +WHITESPACE "\n" +CHAR "'\\u{DBFF}'" error: invalid 
unicode character escape +WHITESPACE "\n" diff --git a/crates/parser/test_data/lexer/err/char_literals.rs b/crates/parser/test_data/lexer/err/char_literals.rs new file mode 100644 index 00000000000..291f99d8020 --- /dev/null +++ b/crates/parser/test_data/lexer/err/char_literals.rs @@ -0,0 +1,47 @@ +'hello' +'' +' +' +'spam' +'\x0ff' +'\"a' +'\na' +'\ra' +'\ta' +'\\a' +'\'a' +'\0a' +'\u{0}x' +'\u{1F63b}}' +'\v' +'\💩' +'\●' +'\\\r' +'\x' +'\x0' +'\xf' +'\xa' +'\xx' +'\xы' +'\x🦀' +'\xtt' +'\xff' +'\xFF' +'\x80' +'\u' +'\u[0123]' +'\u{0x}' +'\u{' +'\u{0000' +'\u{}' +'\u{_0000}' +'\u{0000000}' +'\u{FFFFFF}' +'\u{ffffff}' +'\u{ffffff}' +'\u{DC00}' +'\u{DDDD}' +'\u{DFFF}' +'\u{D800}' +'\u{DAAA}' +'\u{DBFF}' diff --git a/crates/parser/test_data/lexer/ok/byte_strings.rast b/crates/parser/test_data/lexer/ok/byte_strings.rast index c848ac368e4..fd20ca57ac6 100644 --- a/crates/parser/test_data/lexer/ok/byte_strings.rast +++ b/crates/parser/test_data/lexer/ok/byte_strings.rast @@ -1,13 +1,9 @@ -BYTE "b''" -WHITESPACE " " BYTE "b'x'" WHITESPACE " " BYTE_STRING "b\"foo\"" WHITESPACE " " BYTE_STRING "br\"\"" WHITESPACE "\n" -BYTE "b''suf" -WHITESPACE " " BYTE_STRING "b\"\"ix" WHITESPACE " " BYTE_STRING "br\"\"br" @@ -17,6 +13,4 @@ WHITESPACE " " BYTE "b'\\\\'" WHITESPACE " " BYTE "b'\\''" -WHITESPACE " " -BYTE "b'hello'" WHITESPACE "\n" diff --git a/crates/parser/test_data/lexer/ok/byte_strings.rs b/crates/parser/test_data/lexer/ok/byte_strings.rs index b54930f5e69..65460d02cb2 100644 --- a/crates/parser/test_data/lexer/ok/byte_strings.rs +++ b/crates/parser/test_data/lexer/ok/byte_strings.rs @@ -1,3 +1,3 @@ -b'' b'x' b"foo" br"" -b''suf b""ix br""br -b'\n' b'\\' b'\'' b'hello' +b'x' b"foo" br"" +b""ix br""br +b'\n' b'\\' b'\'' diff --git a/crates/parser/test_data/lexer/ok/chars.rast b/crates/parser/test_data/lexer/ok/chars.rast index 66e58cc298f..07172a4ecc0 100644 --- a/crates/parser/test_data/lexer/ok/chars.rast +++ b/crates/parser/test_data/lexer/ok/chars.rast @@ -4,8 +4,6 @@ 
CHAR "' '" WHITESPACE " " CHAR "'0'" WHITESPACE " " -CHAR "'hello'" -WHITESPACE " " CHAR "'\\x7f'" WHITESPACE " " CHAR "'\\n'" diff --git a/crates/parser/test_data/lexer/ok/chars.rs b/crates/parser/test_data/lexer/ok/chars.rs index 454ee0a5f61..15f52c113c1 100644 --- a/crates/parser/test_data/lexer/ok/chars.rs +++ b/crates/parser/test_data/lexer/ok/chars.rs @@ -1 +1 @@ -'x' ' ' '0' 'hello' '\x7f' '\n' '\\' '\'' +'x' ' ' '0' '\x7f' '\n' '\\' '\''