//! Characters and their corresponding confusables were collected from
//! <https://www.unicode.org/Public/security/10.0.0/confusables.txt>
|
||
use super::StringReader;
|
||
use crate::token::{self, Delimiter};
|
||
use rustc_errors::{Applicability, Diagnostic};
|
||
use rustc_span::{symbol::kw, BytePos, Pos, Span};
|
||
|
||
/// Unicode characters that are easily mistaken for an ASCII character (or, for `⩵`, for the
/// ASCII sequence `==`).
///
/// Each entry is `(confusable character, its Unicode name, ASCII replacement)`. The replacement
/// string is the key used to look the entry up in `ASCII_ARRAY`.
///
/// Characters that are invisible, or that text normalisation folds into another character
/// (whitespace, fullwidth/halfwidth forms, canonical singletons such as the Greek question
/// mark), are written as `\u{…}` escapes so the table survives copy/paste and re-encoding
/// unchanged. Everything else is kept as a literal glyph for readability.
#[rustfmt::skip] // for line breaks
pub(crate) const UNICODE_ARRAY: &[(char, &str, &str)] = &[
    ('\u{2028}', "Line Separator", " "),
    ('\u{2029}', "Paragraph Separator", " "),
    ('\u{1680}', "Ogham Space mark", " "),
    ('\u{2000}', "En Quad", " "),
    ('\u{2001}', "Em Quad", " "),
    ('\u{2002}', "En Space", " "),
    ('\u{2003}', "Em Space", " "),
    ('\u{2004}', "Three-Per-Em Space", " "),
    ('\u{2005}', "Four-Per-Em Space", " "),
    ('\u{2006}', "Six-Per-Em Space", " "),
    ('\u{2008}', "Punctuation Space", " "),
    ('\u{2009}', "Thin Space", " "),
    ('\u{200A}', "Hair Space", " "),
    ('\u{205F}', "Medium Mathematical Space", " "),
    ('\u{00A0}', "No-Break Space", " "),
    ('\u{2007}', "Figure Space", " "),
    ('\u{202F}', "Narrow No-Break Space", " "),
    ('\u{3000}', "Ideographic Space", " "),

    ('ߺ', "Nko Lajanyalan", "_"),
    ('﹍', "Dashed Low Line", "_"),
    ('﹎', "Centreline Low Line", "_"),
    ('﹏', "Wavy Low Line", "_"),
    ('\u{FF3F}', "Fullwidth Low Line", "_"),

    ('‐', "Hyphen", "-"),
    ('‑', "Non-Breaking Hyphen", "-"),
    ('‒', "Figure Dash", "-"),
    ('–', "En Dash", "-"),
    ('—', "Em Dash", "-"),
    ('﹘', "Small Em Dash", "-"),
    ('۔', "Arabic Full Stop", "-"),
    ('⁃', "Hyphen Bullet", "-"),
    ('˗', "Modifier Letter Minus Sign", "-"),
    ('−', "Minus Sign", "-"),
    ('➖', "Heavy Minus Sign", "-"),
    ('Ⲻ', "Coptic Letter Dialect-P Ni", "-"),
    ('ー', "Katakana-Hiragana Prolonged Sound Mark", "-"),
    ('\u{FF0D}', "Fullwidth Hyphen-Minus", "-"),
    ('―', "Horizontal Bar", "-"),
    ('─', "Box Drawings Light Horizontal", "-"),
    ('━', "Box Drawings Heavy Horizontal", "-"),
    ('㇐', "CJK Stroke H", "-"),
    ('ꟷ', "Latin Epigraphic Letter Sideways I", "-"),
    ('ᅳ', "Hangul Jungseong Eu", "-"),
    ('ㅡ', "Hangul Letter Eu", "-"),
    ('一', "CJK Unified Ideograph-4E00", "-"),
    ('⼀', "Kangxi Radical One", "-"),

    ('؍', "Arabic Date Separator", ","),
    ('٫', "Arabic Decimal Separator", ","),
    ('‚', "Single Low-9 Quotation Mark", ","),
    ('¸', "Cedilla", ","),
    ('ꓹ', "Lisu Letter Tone Na Po", ","),
    ('\u{FF0C}', "Fullwidth Comma", ","),

    ('\u{037E}', "Greek Question Mark", ";"),
    ('\u{FF1B}', "Fullwidth Semicolon", ";"),
    ('︔', "Presentation Form For Vertical Semicolon", ";"),

    ('ः', "Devanagari Sign Visarga", ":"),
    ('ઃ', "Gujarati Sign Visarga", ":"),
    ('\u{FF1A}', "Fullwidth Colon", ":"),
    ('։', "Armenian Full Stop", ":"),
    ('܃', "Syriac Supralinear Colon", ":"),
    ('܄', "Syriac Sublinear Colon", ":"),
    ('᛬', "Runic Multiple Punctuation", ":"),
    ('︰', "Presentation Form For Vertical Two Dot Leader", ":"),
    ('᠃', "Mongolian Full Stop", ":"),
    ('᠉', "Mongolian Manchu Full Stop", ":"),
    ('⁚', "Two Dot Punctuation", ":"),
    ('׃', "Hebrew Punctuation Sof Pasuq", ":"),
    ('˸', "Modifier Letter Raised Colon", ":"),
    ('꞉', "Modifier Letter Colon", ":"),
    ('∶', "Ratio", ":"),
    ('ː', "Modifier Letter Triangular Colon", ":"),
    ('ꓽ', "Lisu Letter Tone Mya Jeu", ":"),
    ('︓', "Presentation Form For Vertical Colon", ":"),

    ('\u{FF01}', "Fullwidth Exclamation Mark", "!"),
    ('ǃ', "Latin Letter Retroflex Click", "!"),
    ('ⵑ', "Tifinagh Letter Tuareg Yang", "!"),
    ('︕', "Presentation Form For Vertical Exclamation Mark", "!"),

    ('ʔ', "Latin Letter Glottal Stop", "?"),
    ('Ɂ', "Latin Capital Letter Glottal Stop", "?"),
    ('ॽ', "Devanagari Letter Glottal Stop", "?"),
    ('Ꭾ', "Cherokee Letter He", "?"),
    ('ꛫ', "Bamum Letter Ntuu", "?"),
    ('\u{FF1F}', "Fullwidth Question Mark", "?"),
    ('︖', "Presentation Form For Vertical Question Mark", "?"),

    ('𝅭', "Musical Symbol Combining Augmentation Dot", "."),
    ('․', "One Dot Leader", "."),
    ('܁', "Syriac Supralinear Full Stop", "."),
    ('܂', "Syriac Sublinear Full Stop", "."),
    ('꘎', "Vai Full Stop", "."),
    ('𐩐', "Kharoshthi Punctuation Dot", "."),
    ('٠', "Arabic-Indic Digit Zero", "."),
    ('۰', "Extended Arabic-Indic Digit Zero", "."),
    ('ꓸ', "Lisu Letter Tone Mya Ti", "."),
    ('·', "Middle Dot", "."),
    ('・', "Katakana Middle Dot", "."),
    ('\u{FF65}', "Halfwidth Katakana Middle Dot", "."),
    ('᛫', "Runic Single Punctuation", "."),
    ('\u{0387}', "Greek Ano Teleia", "."),
    ('⸱', "Word Separator Middle Dot", "."),
    ('𐄁', "Aegean Word Separator Dot", "."),
    ('•', "Bullet", "."),
    ('‧', "Hyphenation Point", "."),
    ('∙', "Bullet Operator", "."),
    ('⋅', "Dot Operator", "."),
    ('ꞏ', "Latin Letter Sinological Dot", "."),
    ('ᐧ', "Canadian Syllabics Final Middle Dot", "."),
    // NOTE(review): exact duplicate of the entry above, so a first-match lookup never reaches
    // it; kept as-is because the originally intended character cannot be determined here.
    ('ᐧ', "Canadian Syllabics Final Middle Dot", "."),
    ('\u{FF0E}', "Fullwidth Full Stop", "."),
    ('。', "Ideographic Full Stop", "."),
    ('︒', "Presentation Form For Vertical Ideographic Full Stop", "."),

    ('՝', "Armenian Comma", "\'"),
    ('\u{FF07}', "Fullwidth Apostrophe", "\'"),
    ('‘', "Left Single Quotation Mark", "\'"),
    ('’', "Right Single Quotation Mark", "\'"),
    ('‛', "Single High-Reversed-9 Quotation Mark", "\'"),
    ('′', "Prime", "\'"),
    ('‵', "Reversed Prime", "\'"),
    ('՚', "Armenian Apostrophe", "\'"),
    ('׳', "Hebrew Punctuation Geresh", "\'"),
    // The ASCII backtick itself.
    ('`', "Grave Accent", "\'"),
    ('\u{1FEF}', "Greek Varia", "\'"),
    ('\u{FF40}', "Fullwidth Grave Accent", "\'"),
    ('´', "Acute Accent", "\'"),
    ('΄', "Greek Tonos", "\'"),
    ('\u{1FFD}', "Greek Oxia", "\'"),
    ('᾽', "Greek Koronis", "\'"),
    ('᾿', "Greek Psili", "\'"),
    ('῾', "Greek Dasia", "\'"),
    ('ʹ', "Modifier Letter Prime", "\'"),
    ('\u{0374}', "Greek Numeral Sign", "\'"),
    ('ˈ', "Modifier Letter Vertical Line", "\'"),
    ('ˊ', "Modifier Letter Acute Accent", "\'"),
    ('ˋ', "Modifier Letter Grave Accent", "\'"),
    ('˴', "Modifier Letter Middle Grave Accent", "\'"),
    ('ʻ', "Modifier Letter Turned Comma", "\'"),
    ('ʽ', "Modifier Letter Reversed Comma", "\'"),
    ('ʼ', "Modifier Letter Apostrophe", "\'"),
    ('ʾ', "Modifier Letter Right Half Ring", "\'"),
    ('ꞌ', "Latin Small Letter Saltillo", "\'"),
    ('י', "Hebrew Letter Yod", "\'"),
    ('ߴ', "Nko High Tone Apostrophe", "\'"),
    ('ߵ', "Nko Low Tone Apostrophe", "\'"),
    ('ᑊ', "Canadian Syllabics West-Cree P", "\'"),
    ('ᛌ', "Runic Letter Short-Twig-Sol S", "\'"),
    ('𖽑', "Miao Sign Aspiration", "\'"),
    ('𖽒', "Miao Sign Reformed Voicing", "\'"),

    ('᳓', "Vedic Sign Nihshvasa", "\""),
    ('\u{FF02}', "Fullwidth Quotation Mark", "\""),
    ('“', "Left Double Quotation Mark", "\""),
    ('”', "Right Double Quotation Mark", "\""),
    ('‟', "Double High-Reversed-9 Quotation Mark", "\""),
    ('″', "Double Prime", "\""),
    ('‶', "Reversed Double Prime", "\""),
    ('〃', "Ditto Mark", "\""),
    ('״', "Hebrew Punctuation Gershayim", "\""),
    ('˝', "Double Acute Accent", "\""),
    ('ʺ', "Modifier Letter Double Prime", "\""),
    ('˶', "Modifier Letter Middle Double Acute Accent", "\""),
    ('˵', "Modifier Letter Middle Double Grave Accent", "\""),
    ('ˮ', "Modifier Letter Double Apostrophe", "\""),
    ('ײ', "Hebrew Ligature Yiddish Double Yod", "\""),
    ('❞', "Heavy Double Comma Quotation Mark Ornament", "\""),
    ('❝', "Heavy Double Turned Comma Quotation Mark Ornament", "\""),

    ('\u{FF08}', "Fullwidth Left Parenthesis", "("),
    ('❨', "Medium Left Parenthesis Ornament", "("),
    ('﴾', "Ornate Left Parenthesis", "("),

    ('\u{FF09}', "Fullwidth Right Parenthesis", ")"),
    ('❩', "Medium Right Parenthesis Ornament", ")"),
    ('﴿', "Ornate Right Parenthesis", ")"),

    ('\u{FF3B}', "Fullwidth Left Square Bracket", "["),
    ('❲', "Light Left Tortoise Shell Bracket Ornament", "["),
    ('「', "Left Corner Bracket", "["),
    ('『', "Left White Corner Bracket", "["),
    ('【', "Left Black Lenticular Bracket", "["),
    ('〔', "Left Tortoise Shell Bracket", "["),
    ('〖', "Left White Lenticular Bracket", "["),
    ('〘', "Left White Tortoise Shell Bracket", "["),
    ('〚', "Left White Square Bracket", "["),

    ('\u{FF3D}', "Fullwidth Right Square Bracket", "]"),
    ('❳', "Light Right Tortoise Shell Bracket Ornament", "]"),
    ('」', "Right Corner Bracket", "]"),
    ('』', "Right White Corner Bracket", "]"),
    ('】', "Right Black Lenticular Bracket", "]"),
    ('〕', "Right Tortoise Shell Bracket", "]"),
    ('〗', "Right White Lenticular Bracket", "]"),
    ('〙', "Right White Tortoise Shell Bracket", "]"),
    ('〛', "Right White Square Bracket", "]"),

    ('❴', "Medium Left Curly Bracket Ornament", "{"),
    ('𝄔', "Musical Symbol Brace", "{"),
    ('\u{FF5B}', "Fullwidth Left Curly Bracket", "{"),

    ('❵', "Medium Right Curly Bracket Ornament", "}"),
    ('\u{FF5D}', "Fullwidth Right Curly Bracket", "}"),

    ('⁎', "Low Asterisk", "*"),
    ('٭', "Arabic Five Pointed Star", "*"),
    ('∗', "Asterisk Operator", "*"),
    ('𐌟', "Old Italic Letter Ess", "*"),
    ('\u{FF0A}', "Fullwidth Asterisk", "*"),

    ('᜵', "Philippine Single Punctuation", "/"),
    ('⁁', "Caret Insertion Point", "/"),
    ('∕', "Division Slash", "/"),
    ('⁄', "Fraction Slash", "/"),
    ('╱', "Box Drawings Light Diagonal Upper Right To Lower Left", "/"),
    ('⟋', "Mathematical Rising Diagonal", "/"),
    ('⧸', "Big Solidus", "/"),
    ('𝈺', "Greek Instrumental Notation Symbol-47", "/"),
    ('㇓', "CJK Stroke Sp", "/"),
    ('〳', "Vertical Kana Repeat Mark Upper Half", "/"),
    ('Ⳇ', "Coptic Capital Letter Old Coptic Esh", "/"),
    ('ノ', "Katakana Letter No", "/"),
    ('丿', "CJK Unified Ideograph-4E3F", "/"),
    ('⼃', "Kangxi Radical Slash", "/"),
    ('\u{FF0F}', "Fullwidth Solidus", "/"),

    ('\u{FF3C}', "Fullwidth Reverse Solidus", "\\"),
    ('﹨', "Small Reverse Solidus", "\\"),
    ('∖', "Set Minus", "\\"),
    ('⟍', "Mathematical Falling Diagonal", "\\"),
    ('⧵', "Reverse Solidus Operator", "\\"),
    ('⧹', "Big Reverse Solidus", "\\"),
    // These two previously repeated '⧹' and were therefore unreachable by a first-match lookup.
    ('\u{1D20F}', "Greek Vocal Notation Symbol-16", "\\"),
    ('\u{1D23B}', "Greek Instrumental Symbol-48", "\\"),
    ('㇔', "CJK Stroke D", "\\"),
    ('丶', "CJK Unified Ideograph-4E36", "\\"),
    ('⼂', "Kangxi Radical Dot", "\\"),
    ('、', "Ideographic Comma", "\\"),
    ('ヽ', "Katakana Iteration Mark", "\\"),

    ('ꝸ', "Latin Small Letter Um", "&"),
    ('\u{FF06}', "Fullwidth Ampersand", "&"),

    ('᛭', "Runic Cross Punctuation", "+"),
    ('➕', "Heavy Plus Sign", "+"),
    ('𐊛', "Lycian Letter H", "+"),
    ('﬩', "Hebrew Letter Alternative Plus Sign", "+"),
    ('\u{FF0B}', "Fullwidth Plus Sign", "+"),

    ('‹', "Single Left-Pointing Angle Quotation Mark", "<"),
    ('❮', "Heavy Left-Pointing Angle Quotation Mark Ornament", "<"),
    ('˂', "Modifier Letter Left Arrowhead", "<"),
    ('𝈶', "Greek Instrumental Symbol-40", "<"),
    ('ᐸ', "Canadian Syllabics Pa", "<"),
    ('ᚲ', "Runic Letter Kauna", "<"),
    ('❬', "Medium Left-Pointing Angle Bracket Ornament", "<"),
    ('⟨', "Mathematical Left Angle Bracket", "<"),
    ('\u{2329}', "Left-Pointing Angle Bracket", "<"),
    ('〈', "Left Angle Bracket", "<"),
    ('㇛', "CJK Stroke Pd", "<"),
    ('く', "Hiragana Letter Ku", "<"),
    ('𡿨', "CJK Unified Ideograph-21FE8", "<"),
    ('《', "Left Double Angle Bracket", "<"),
    ('\u{FF1C}', "Fullwidth Less-Than Sign", "<"),

    ('᐀', "Canadian Syllabics Hyphen", "="),
    ('⹀', "Double Hyphen", "="),
    ('゠', "Katakana-Hiragana Double Hyphen", "="),
    ('꓿', "Lisu Punctuation Full Stop", "="),
    ('\u{FF1D}', "Fullwidth Equals Sign", "="),

    ('›', "Single Right-Pointing Angle Quotation Mark", ">"),
    ('❯', "Heavy Right-Pointing Angle Quotation Mark Ornament", ">"),
    ('˃', "Modifier Letter Right Arrowhead", ">"),
    ('𝈷', "Greek Instrumental Symbol-42", ">"),
    ('ᐳ', "Canadian Syllabics Po", ">"),
    ('𖼿', "Miao Letter Archaic Zza", ">"),
    ('❭', "Medium Right-Pointing Angle Bracket Ornament", ">"),
    ('⟩', "Mathematical Right Angle Bracket", ">"),
    ('\u{232A}', "Right-Pointing Angle Bracket", ">"),
    ('〉', "Right Angle Bracket", ">"),
    ('》', "Right Double Angle Bracket", ">"),
    ('\u{FF1E}', "Fullwidth Greater-Than Sign", ">"),
    ('⩵', "Two Consecutive Equals Signs", "=="),
];
|
||
|
||
// FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs, instead of
|
||
// keeping the substitution token in this table. Ideally, this should be inside `rustc_lexer`.
|
||
// However, we should first remove compound tokens like `<<` from `rustc_lexer`, and then add
|
||
// fancier error recovery to it, as there will be less overall work to do this way.
|
||
const ASCII_ARRAY: &[(&str, &str, Option<token::TokenKind>)] = &[
|
||
(" ", "Space", None),
|
||
("_", "Underscore", Some(token::Ident(kw::Underscore, false))),
|
||
("-", "Minus/Hyphen", Some(token::BinOp(token::Minus))),
|
||
(",", "Comma", Some(token::Comma)),
|
||
(";", "Semicolon", Some(token::Semi)),
|
||
(":", "Colon", Some(token::Colon)),
|
||
("!", "Exclamation Mark", Some(token::Not)),
|
||
("?", "Question Mark", Some(token::Question)),
|
||
(".", "Period", Some(token::Dot)),
|
||
("(", "Left Parenthesis", Some(token::OpenDelim(Delimiter::Parenthesis))),
|
||
(")", "Right Parenthesis", Some(token::CloseDelim(Delimiter::Parenthesis))),
|
||
("[", "Left Square Bracket", Some(token::OpenDelim(Delimiter::Bracket))),
|
||
("]", "Right Square Bracket", Some(token::CloseDelim(Delimiter::Bracket))),
|
||
("{", "Left Curly Brace", Some(token::OpenDelim(Delimiter::Brace))),
|
||
("}", "Right Curly Brace", Some(token::CloseDelim(Delimiter::Brace))),
|
||
("*", "Asterisk", Some(token::BinOp(token::Star))),
|
||
("/", "Slash", Some(token::BinOp(token::Slash))),
|
||
("\\", "Backslash", None),
|
||
("&", "Ampersand", Some(token::BinOp(token::And))),
|
||
("+", "Plus Sign", Some(token::BinOp(token::Plus))),
|
||
("<", "Less-Than Sign", Some(token::Lt)),
|
||
("=", "Equals Sign", Some(token::Eq)),
|
||
("==", "Double Equals Sign", Some(token::EqEq)),
|
||
(">", "Greater-Than Sign", Some(token::Gt)),
|
||
// FIXME: Literals are already lexed by this point, so we can't recover gracefully just by
|
||
// spitting the correct token out.
|
||
("\'", "Single Quote", None),
|
||
("\"", "Quotation Mark", None),
|
||
];
|
||
|
||
pub(super) fn check_for_substitution<'a>(
|
||
reader: &StringReader<'a>,
|
||
pos: BytePos,
|
||
ch: char,
|
||
err: &mut Diagnostic,
|
||
count: usize,
|
||
) -> Option<token::TokenKind> {
|
||
let &(_, u_name, ascii_str) = UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch)?;
|
||
|
||
let span = Span::with_root_ctxt(pos, pos + Pos::from_usize(ch.len_utf8() * count));
|
||
|
||
let Some((_, ascii_name, token)) = ASCII_ARRAY.iter().find(|&&(s, _, _)| s == ascii_str) else {
|
||
let msg = format!("substitution character not found for '{}'", ch);
|
||
reader.sess.span_diagnostic.span_bug_no_panic(span, &msg);
|
||
return None;
|
||
};
|
||
|
||
// special help suggestion for "directed" double quotes
|
||
if let Some(s) = peek_delimited(&reader.src[reader.src_index(pos)..], '“', '”') {
|
||
let msg = format!(
|
||
"Unicode characters '“' (Left Double Quotation Mark) and \
|
||
'”' (Right Double Quotation Mark) look like '{}' ({}), but are not",
|
||
ascii_str, ascii_name
|
||
);
|
||
err.span_suggestion(
|
||
Span::with_root_ctxt(
|
||
pos,
|
||
pos + Pos::from_usize('“'.len_utf8() + s.len() + '”'.len_utf8()),
|
||
),
|
||
&msg,
|
||
format!("\"{}\"", s),
|
||
Applicability::MaybeIncorrect,
|
||
);
|
||
} else {
|
||
let msg = format!(
|
||
"Unicode character '{}' ({}) looks like '{}' ({}), but it is not",
|
||
ch, u_name, ascii_str, ascii_name
|
||
);
|
||
err.span_suggestion(
|
||
span,
|
||
&msg,
|
||
ascii_str.to_string().repeat(count),
|
||
Applicability::MaybeIncorrect,
|
||
);
|
||
}
|
||
token.clone()
|
||
}
|
||
|
||
/// Extracts the text enclosed by the given delimiters at the very start of `text`.
///
/// Returns `Some(inner)` when `text` begins with `from_ch` and a `to_ch` occurs later; `inner`
/// is everything between `from_ch` and the first following `to_ch` (possibly empty). Returns
/// `None` when `text` is empty, does not start with `from_ch`, or has no closing `to_ch`.
fn peek_delimited(text: &str, from_ch: char, to_ch: char) -> Option<&str> {
    let rest = text.strip_prefix(from_ch)?;
    let end = rest.find(to_ch)?;
    Some(&rest[..end])
}
|