From 1fe6ac87e91f97ac28111863970aff6f5d2deb31 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pol=20Valletb=C3=B3?= <pol.valletbo@glovoapp.com>
Date: Wed, 11 Oct 2023 12:19:00 +0200
Subject: [PATCH 1/2] add diagnostic messages for char and byte literal errors

---
 crates/parser/src/lexed_str.rs                | 55 +++++++++++
 .../lexer/err/byte_char_literals.rast         | 92 +++++++++++++++++++
 .../test_data/lexer/err/byte_char_literals.rs | 47 ++++++++++
 .../test_data/lexer/err/char_literals.rast    | 92 +++++++++++++++++++
 .../test_data/lexer/err/char_literals.rs      | 47 ++++++++++
 .../test_data/lexer/ok/byte_strings.rast      |  6 --
 .../parser/test_data/lexer/ok/byte_strings.rs |  6 +-
 crates/parser/test_data/lexer/ok/chars.rast   |  2 -
 crates/parser/test_data/lexer/ok/chars.rs     |  2 +-
 9 files changed, 337 insertions(+), 12 deletions(-)
 create mode 100644 crates/parser/test_data/lexer/err/byte_char_literals.rast
 create mode 100644 crates/parser/test_data/lexer/err/byte_char_literals.rs
 create mode 100644 crates/parser/test_data/lexer/err/char_literals.rast
 create mode 100644 crates/parser/test_data/lexer/err/char_literals.rs

diff --git a/crates/parser/src/lexed_str.rs b/crates/parser/src/lexed_str.rs
index 30c1c4f8c75..031ac27724e 100644
--- a/crates/parser/src/lexed_str.rs
+++ b/crates/parser/src/lexed_str.rs
@@ -9,8 +9,11 @@
 //! include info about comments and whitespace.
 
 use rustc_dependencies::lexer as rustc_lexer;
+
 use std::ops;
 
+use rustc_lexer::unescape::{Mode, EscapeError};
+
 use crate::{
     SyntaxKind::{self, *},
     T,
@@ -254,13 +257,28 @@ fn extend_literal(&mut self, len: usize, kind: &rustc_lexer::LiteralKind) {
             rustc_lexer::LiteralKind::Char { terminated } => {
                 if !terminated {
                     err = "Missing trailing `'` symbol to terminate the character literal";
+                } else {
+                    let text = &self.res.text[self.offset + 1..][..len - 1];
+                    let i = text.rfind('\'').unwrap();
+                    let text = &text[..i];
+                    if let Err(e) = rustc_lexer::unescape::unescape_char(text) {
+                        err = error_to_diagnostic_message(e, Mode::Char);
+                    }
                 }
                 CHAR
             }
             rustc_lexer::LiteralKind::Byte { terminated } => {
                 if !terminated {
                     err = "Missing trailing `'` symbol to terminate the byte literal";
+                } else {
+                    let text = &self.res.text[self.offset + 2..][..len - 2];
+                    let i = text.rfind('\'').unwrap();
+                    let text = &text[..i];
+                    if let Err(e) = rustc_lexer::unescape::unescape_char(text) {
+                        err = error_to_diagnostic_message(e, Mode::Byte);
+                    }
                 }
+
                 BYTE
             }
             rustc_lexer::LiteralKind::Str { terminated } => {
@@ -305,3 +323,40 @@ fn extend_literal(&mut self, len: usize, kind: &rustc_lexer::LiteralKind) {
         self.push(syntax_kind, len, err);
     }
 }
+
+fn error_to_diagnostic_message(error: EscapeError, mode: Mode) -> &'static str {
+    match error {
+        EscapeError::ZeroChars => "empty character literal", // same text for byte literals
+        EscapeError::MoreThanOneChar => "character literal may only contain one codepoint",
+        EscapeError::LoneSlash => "", // NOTE(review): presumably "" means no error; confirm
+        EscapeError::InvalidEscape if mode == Mode::Byte || mode == Mode::ByteStr => {
+            "unknown byte escape"
+        }
+        EscapeError::InvalidEscape => "unknown character escape", // modes other than Byte/ByteStr
+        EscapeError::BareCarriageReturn => "",
+        EscapeError::BareCarriageReturnInRawString => "",
+        EscapeError::EscapeOnlyChar if mode == Mode::Byte => "byte constant must be escaped",
+        EscapeError::EscapeOnlyChar => "character constant must be escaped",
+        EscapeError::TooShortHexEscape => "numeric character escape is too short",
+        EscapeError::InvalidCharInHexEscape => "invalid character in numeric character escape",
+        EscapeError::OutOfRangeHexEscape => "out of range hex escape",
+        EscapeError::NoBraceInUnicodeEscape => "incorrect unicode escape sequence",
+        EscapeError::InvalidCharInUnicodeEscape => "invalid character in unicode escape",
+        EscapeError::EmptyUnicodeEscape => "empty unicode escape",
+        EscapeError::UnclosedUnicodeEscape => "unterminated unicode escape",
+        EscapeError::LeadingUnderscoreUnicodeEscape => "invalid start of unicode escape",
+        EscapeError::OverlongUnicodeEscape => "overlong unicode escape",
+        EscapeError::LoneSurrogateUnicodeEscape => "invalid unicode character escape",
+        EscapeError::OutOfRangeUnicodeEscape => "invalid unicode character escape",
+        EscapeError::UnicodeEscapeInByte => "unicode escape in byte string", // not split by mode
+        EscapeError::NonAsciiCharInByte if mode == Mode::Byte => {
+            "non-ASCII character in byte literal"
+        }
+        EscapeError::NonAsciiCharInByte if mode == Mode::ByteStr => {
+            "non-ASCII character in byte string literal"
+        }
+        EscapeError::NonAsciiCharInByte => "non-ASCII character in raw byte string literal",
+        EscapeError::UnskippedWhitespaceWarning => "", // warning variants yield "" too
+        EscapeError::MultipleSkippedLinesWarning => "",
+    }
+}
diff --git a/crates/parser/test_data/lexer/err/byte_char_literals.rast b/crates/parser/test_data/lexer/err/byte_char_literals.rast
new file mode 100644
index 00000000000..24892bc2394
--- /dev/null
+++ b/crates/parser/test_data/lexer/err/byte_char_literals.rast
@@ -0,0 +1,92 @@
+BYTE "b''" error: empty character literal
+WHITESPACE "\n"
+BYTE "b'\\'" error: Missing trailing `'` symbol to terminate the byte literal
+WHITESPACE "\n"
+BYTE "b'\n'" error: byte constant must be escaped
+WHITESPACE "\n"
+BYTE "b'spam'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\x0ff'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\\"a'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\na'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\ra'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\ta'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\\\a'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\'a'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\0a'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\u{0}x'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\u{1F63b}}'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\v'" error: unknown byte escape
+WHITESPACE "\n"
+BYTE "b'\\💩'" error: unknown byte escape
+WHITESPACE "\n"
+BYTE "b'\\●'" error: unknown byte escape
+WHITESPACE "\n"
+BYTE "b'\\\\\\r'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+BYTE "b'\\x'" error: numeric character escape is too short
+WHITESPACE "\n"
+BYTE "b'\\x0'" error: numeric character escape is too short
+WHITESPACE "\n"
+BYTE "b'\\xf'" error: numeric character escape is too short
+WHITESPACE "\n"
+BYTE "b'\\xa'" error: numeric character escape is too short
+WHITESPACE "\n"
+BYTE "b'\\xx'" error: invalid character in numeric character escape
+WHITESPACE "\n"
+BYTE "b'\\xы'" error: invalid character in numeric character escape
+WHITESPACE "\n"
+BYTE "b'\\x🦀'" error: invalid character in numeric character escape
+WHITESPACE "\n"
+BYTE "b'\\xtt'" error: invalid character in numeric character escape
+WHITESPACE "\n"
+BYTE "b'\\xff'" error: out of range hex escape
+WHITESPACE "\n"
+BYTE "b'\\xFF'" error: out of range hex escape
+WHITESPACE "\n"
+BYTE "b'\\x80'" error: out of range hex escape
+WHITESPACE "\n"
+BYTE "b'\\u'" error: incorrect unicode escape sequence
+WHITESPACE "\n"
+BYTE "b'\\u[0123]'" error: incorrect unicode escape sequence
+WHITESPACE "\n"
+BYTE "b'\\u{0x}'" error: invalid character in unicode escape
+WHITESPACE "\n"
+BYTE "b'\\u{'" error: unterminated unicode escape
+WHITESPACE "\n"
+BYTE "b'\\u{0000'" error: unterminated unicode escape
+WHITESPACE "\n"
+BYTE "b'\\u{}'" error: empty unicode escape
+WHITESPACE "\n"
+BYTE "b'\\u{_0000}'" error: invalid start of unicode escape
+WHITESPACE "\n"
+BYTE "b'\\u{0000000}'" error: overlong unicode escape
+WHITESPACE "\n"
+BYTE "b'\\u{FFFFFF}'" error: invalid unicode character escape
+WHITESPACE "\n"
+BYTE "b'\\u{ffffff}'" error: invalid unicode character escape
+WHITESPACE "\n"
+BYTE "b'\\u{ffffff}'" error: invalid unicode character escape
+WHITESPACE "\n"
+BYTE "b'\\u{DC00}'" error: invalid unicode character escape
+WHITESPACE "\n"
+BYTE "b'\\u{DDDD}'" error: invalid unicode character escape
+WHITESPACE "\n"
+BYTE "b'\\u{DFFF}'" error: invalid unicode character escape
+WHITESPACE "\n"
+BYTE "b'\\u{D800}'" error: invalid unicode character escape
+WHITESPACE "\n"
+BYTE "b'\\u{DAAA}'" error: invalid unicode character escape
+WHITESPACE "\n"
+BYTE "b'\\u{DBFF}'" error: invalid unicode character escape
+WHITESPACE "\n"
diff --git a/crates/parser/test_data/lexer/err/byte_char_literals.rs b/crates/parser/test_data/lexer/err/byte_char_literals.rs
new file mode 100644
index 00000000000..9f2f4309e76
--- /dev/null
+++ b/crates/parser/test_data/lexer/err/byte_char_literals.rs
@@ -0,0 +1,47 @@
+b''
+b'\'
+b'
+'
+b'spam'
+b'\x0ff'
+b'\"a'
+b'\na'
+b'\ra'
+b'\ta'
+b'\\a'
+b'\'a'
+b'\0a'
+b'\u{0}x'
+b'\u{1F63b}}'
+b'\v'
+b'\💩'
+b'\●'
+b'\\\r'
+b'\x'
+b'\x0'
+b'\xf'
+b'\xa'
+b'\xx'
+b'\xы'
+b'\x🦀'
+b'\xtt'
+b'\xff'
+b'\xFF'
+b'\x80'
+b'\u'
+b'\u[0123]'
+b'\u{0x}'
+b'\u{'
+b'\u{0000'
+b'\u{}'
+b'\u{_0000}'
+b'\u{0000000}'
+b'\u{FFFFFF}'
+b'\u{ffffff}'
+b'\u{ffffff}'
+b'\u{DC00}'
+b'\u{DDDD}'
+b'\u{DFFF}'
+b'\u{D800}'
+b'\u{DAAA}'
+b'\u{DBFF}'
diff --git a/crates/parser/test_data/lexer/err/char_literals.rast b/crates/parser/test_data/lexer/err/char_literals.rast
new file mode 100644
index 00000000000..b1e1364d4c2
--- /dev/null
+++ b/crates/parser/test_data/lexer/err/char_literals.rast
@@ -0,0 +1,92 @@
+CHAR "'hello'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "''" error: empty character literal
+WHITESPACE "\n"
+CHAR "'\n'" error: character constant must be escaped
+WHITESPACE "\n"
+CHAR "'spam'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\x0ff'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\\"a'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\na'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\ra'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\ta'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\\\a'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\'a'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\0a'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\u{0}x'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\u{1F63b}}'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\v'" error: unknown character escape
+WHITESPACE "\n"
+CHAR "'\\💩'" error: unknown character escape
+WHITESPACE "\n"
+CHAR "'\\●'" error: unknown character escape
+WHITESPACE "\n"
+CHAR "'\\\\\\r'" error: character literal may only contain one codepoint
+WHITESPACE "\n"
+CHAR "'\\x'" error: numeric character escape is too short
+WHITESPACE "\n"
+CHAR "'\\x0'" error: numeric character escape is too short
+WHITESPACE "\n"
+CHAR "'\\xf'" error: numeric character escape is too short
+WHITESPACE "\n"
+CHAR "'\\xa'" error: numeric character escape is too short
+WHITESPACE "\n"
+CHAR "'\\xx'" error: invalid character in numeric character escape
+WHITESPACE "\n"
+CHAR "'\\xы'" error: invalid character in numeric character escape
+WHITESPACE "\n"
+CHAR "'\\x🦀'" error: invalid character in numeric character escape
+WHITESPACE "\n"
+CHAR "'\\xtt'" error: invalid character in numeric character escape
+WHITESPACE "\n"
+CHAR "'\\xff'" error: out of range hex escape
+WHITESPACE "\n"
+CHAR "'\\xFF'" error: out of range hex escape
+WHITESPACE "\n"
+CHAR "'\\x80'" error: out of range hex escape
+WHITESPACE "\n"
+CHAR "'\\u'" error: incorrect unicode escape sequence
+WHITESPACE "\n"
+CHAR "'\\u[0123]'" error: incorrect unicode escape sequence
+WHITESPACE "\n"
+CHAR "'\\u{0x}'" error: invalid character in unicode escape
+WHITESPACE "\n"
+CHAR "'\\u{'" error: unterminated unicode escape
+WHITESPACE "\n"
+CHAR "'\\u{0000'" error: unterminated unicode escape
+WHITESPACE "\n"
+CHAR "'\\u{}'" error: empty unicode escape
+WHITESPACE "\n"
+CHAR "'\\u{_0000}'" error: invalid start of unicode escape
+WHITESPACE "\n"
+CHAR "'\\u{0000000}'" error: overlong unicode escape
+WHITESPACE "\n"
+CHAR "'\\u{FFFFFF}'" error: invalid unicode character escape
+WHITESPACE "\n"
+CHAR "'\\u{ffffff}'" error: invalid unicode character escape
+WHITESPACE "\n"
+CHAR "'\\u{ffffff}'" error: invalid unicode character escape
+WHITESPACE "\n"
+CHAR "'\\u{DC00}'" error: invalid unicode character escape
+WHITESPACE "\n"
+CHAR "'\\u{DDDD}'" error: invalid unicode character escape
+WHITESPACE "\n"
+CHAR "'\\u{DFFF}'" error: invalid unicode character escape
+WHITESPACE "\n"
+CHAR "'\\u{D800}'" error: invalid unicode character escape
+WHITESPACE "\n"
+CHAR "'\\u{DAAA}'" error: invalid unicode character escape
+WHITESPACE "\n"
+CHAR "'\\u{DBFF}'" error: invalid unicode character escape
+WHITESPACE "\n"
diff --git a/crates/parser/test_data/lexer/err/char_literals.rs b/crates/parser/test_data/lexer/err/char_literals.rs
new file mode 100644
index 00000000000..291f99d8020
--- /dev/null
+++ b/crates/parser/test_data/lexer/err/char_literals.rs
@@ -0,0 +1,47 @@
+'hello'
+''
+'
+'
+'spam'
+'\x0ff'
+'\"a'
+'\na'
+'\ra'
+'\ta'
+'\\a'
+'\'a'
+'\0a'
+'\u{0}x'
+'\u{1F63b}}'
+'\v'
+'\💩'
+'\●'
+'\\\r'
+'\x'
+'\x0'
+'\xf'
+'\xa'
+'\xx'
+'\xы'
+'\x🦀'
+'\xtt'
+'\xff'
+'\xFF'
+'\x80'
+'\u'
+'\u[0123]'
+'\u{0x}'
+'\u{'
+'\u{0000'
+'\u{}'
+'\u{_0000}'
+'\u{0000000}'
+'\u{FFFFFF}'
+'\u{ffffff}'
+'\u{ffffff}'
+'\u{DC00}'
+'\u{DDDD}'
+'\u{DFFF}'
+'\u{D800}'
+'\u{DAAA}'
+'\u{DBFF}'
diff --git a/crates/parser/test_data/lexer/ok/byte_strings.rast b/crates/parser/test_data/lexer/ok/byte_strings.rast
index c848ac368e4..fd20ca57ac6 100644
--- a/crates/parser/test_data/lexer/ok/byte_strings.rast
+++ b/crates/parser/test_data/lexer/ok/byte_strings.rast
@@ -1,13 +1,9 @@
-BYTE "b''"
-WHITESPACE " "
 BYTE "b'x'"
 WHITESPACE " "
 BYTE_STRING "b\"foo\""
 WHITESPACE " "
 BYTE_STRING "br\"\""
 WHITESPACE "\n"
-BYTE "b''suf"
-WHITESPACE " "
 BYTE_STRING "b\"\"ix"
 WHITESPACE " "
 BYTE_STRING "br\"\"br"
@@ -17,6 +13,4 @@ WHITESPACE " "
 BYTE "b'\\\\'"
 WHITESPACE " "
 BYTE "b'\\''"
-WHITESPACE " "
-BYTE "b'hello'"
 WHITESPACE "\n"
diff --git a/crates/parser/test_data/lexer/ok/byte_strings.rs b/crates/parser/test_data/lexer/ok/byte_strings.rs
index b54930f5e69..65460d02cb2 100644
--- a/crates/parser/test_data/lexer/ok/byte_strings.rs
+++ b/crates/parser/test_data/lexer/ok/byte_strings.rs
@@ -1,3 +1,3 @@
-b'' b'x' b"foo" br""
-b''suf b""ix br""br
-b'\n' b'\\' b'\'' b'hello'
+b'x' b"foo" br""
+b""ix br""br
+b'\n' b'\\' b'\''
diff --git a/crates/parser/test_data/lexer/ok/chars.rast b/crates/parser/test_data/lexer/ok/chars.rast
index 66e58cc298f..07172a4ecc0 100644
--- a/crates/parser/test_data/lexer/ok/chars.rast
+++ b/crates/parser/test_data/lexer/ok/chars.rast
@@ -4,8 +4,6 @@ CHAR "' '"
 WHITESPACE " "
 CHAR "'0'"
 WHITESPACE " "
-CHAR "'hello'"
-WHITESPACE " "
 CHAR "'\\x7f'"
 WHITESPACE " "
 CHAR "'\\n'"
diff --git a/crates/parser/test_data/lexer/ok/chars.rs b/crates/parser/test_data/lexer/ok/chars.rs
index 454ee0a5f61..15f52c113c1 100644
--- a/crates/parser/test_data/lexer/ok/chars.rs
+++ b/crates/parser/test_data/lexer/ok/chars.rs
@@ -1 +1 @@
-'x' ' ' '0' 'hello' '\x7f' '\n' '\\' '\''
+'x' ' ' '0' '\x7f' '\n' '\\' '\''

From f58a8250dc3104e336ec8611702d015bd47f508a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pol=20Valletb=C3=B3?= <pol.valletbo@glovoapp.com>
Date: Wed, 11 Oct 2023 12:36:53 +0200
Subject: [PATCH 2/2] fix: cargo fmt

---
 crates/parser/src/lexed_str.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/parser/src/lexed_str.rs b/crates/parser/src/lexed_str.rs
index 031ac27724e..8e8bdce1eef 100644
--- a/crates/parser/src/lexed_str.rs
+++ b/crates/parser/src/lexed_str.rs
@@ -12,7 +12,7 @@
 
 use std::ops;
 
-use rustc_lexer::unescape::{Mode, EscapeError};
+use rustc_lexer::unescape::{EscapeError, Mode};
 
 use crate::{
     SyntaxKind::{self, *},