Rollup merge of #120840 - HTGAzureX1212:HTGAzureX1212/unicode-identifier-types, r=fmease,Manishearth

Split Diagnostics for Uncommon Codepoints: Add Individual Identifier Types

This pull request further modifies the `uncommon_codepoints` lint, adding the individual identifier types `Technical`, `Not_NFKC`, `Exclusion` and `Limited_Use` to the diagnostic message.

Example rendered diagnostic:

```
error: identifier contains a Unicode codepoint that is not used in normalized strings: 'ĳ'
  --> $DIR/lint-uncommon-codepoints.rs:6:4
   |
LL | fn dĳkstra() {}
   |    ^^^^^^^
   = note: this character is included in the Not_NFKC Unicode general security profile
```

Second step of #120228.
2024-02-26 10:27:41 +01:00 · 2024-02-26 10:27:41 +01:00 · 91d337dfa8
commit 91d337dfa8
parent 0e08be5360 8bccceb8fc
8 changed files with 79 additions and 19 deletions
--- a/compiler/rustc_lint/messages.ftl
+++ b/compiler/rustc_lint/messages.ftl
@ -244,9 +244,28 @@ lint_hidden_unicode_codepoints = unicode codepoint changing visible direction of
 lint_identifier_non_ascii_char = identifier contains non-ASCII characters

 lint_identifier_uncommon_codepoints = identifier contains {$codepoints_len ->
-    [one] an uncommon Unicode codepoint
-    *[other] uncommon Unicode codepoints
+    [one] { $identifier_type ->
+        [Exclusion] a character from an archaic script
+        [Technical] a character that is for non-linguistic, specialized usage
+        [Limited_Use] a character from a script in limited use
+        [Not_NFKC] a non normalized (NFKC) character
+        *[other] an uncommon character
+    }
+    *[other] { $identifier_type ->
+        [Exclusion] {$codepoints_len} characters from archaic scripts
+        [Technical] {$codepoints_len} characters that are for non-linguistic, specialized usage
+        [Limited_Use] {$codepoints_len} characters from scripts in limited use
+        [Not_NFKC] {$codepoints_len} non normalized (NFKC) characters
+        *[other] uncommon characters
+    }
 }: {$codepoints}
+    .note = {$codepoints_len ->
+        [one] this character is
+        *[other] these characters are
+    } included in the{$identifier_type ->
+        [Restricted] {""}
+        *[other] {" "}{$identifier_type}
+    } Unicode general security profile

 lint_ignored_unless_crate_specified = {$level}({$name}) is ignored unless specified at crate level

--- a/compiler/rustc_lint/src/lib.rs
+++ b/compiler/rustc_lint/src/lib.rs
@ -31,6 +31,7 @@
 #![feature(array_windows)]
 #![feature(box_patterns)]
 #![feature(control_flow_enum)]
+#![feature(extract_if)]
 #![feature(generic_nonzero)]
 #![feature(if_let_guard)]
 #![feature(iter_order_by)]
--- a/compiler/rustc_lint/src/lints.rs
+++ b/compiler/rustc_lint/src/lints.rs
@ -1129,9 +1129,11 @@ pub struct MultipleSupertraitUpcastable {

 #[derive(LintDiagnostic)]
 #[diag(lint_identifier_uncommon_codepoints)]
+#[note]
 pub struct IdentifierUncommonCodepoints {
    pub codepoints: Vec<char>,
    pub codepoints_len: usize,
+    pub identifier_type: &'static str,
 }

 #[derive(LintDiagnostic)]
--- a/compiler/rustc_lint/src/non_ascii_idents.rs
+++ b/compiler/rustc_lint/src/non_ascii_idents.rs
@ -7,6 +7,7 @@
 use rustc_data_structures::fx::FxIndexMap;
 use rustc_data_structures::unord::UnordMap;
 use rustc_span::symbol::Symbol;
+use unicode_security::general_security_profile::IdentifierType;

 declare_lint! {
    /// The `non_ascii_idents` lint detects non-ASCII identifiers.
@ -189,18 +190,48 @@ fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) {
            if check_uncommon_codepoints
                && !symbol_str.chars().all(GeneralSecurityProfile::identifier_allowed)
            {
-                let codepoints: Vec<_> = symbol_str
+                let mut chars: Vec<_> = symbol_str
                    .chars()
-                    .filter(|c| !GeneralSecurityProfile::identifier_allowed(*c))
+                    .map(|c| (c, GeneralSecurityProfile::identifier_type(c)))
                    .collect();
-                let codepoints_len = codepoints.len();

+                for (id_ty, id_ty_descr) in [
+                    (IdentifierType::Exclusion, "Exclusion"),
+                    (IdentifierType::Technical, "Technical"),
+                    (IdentifierType::Limited_Use, "Limited_Use"),
+                    (IdentifierType::Not_NFKC, "Not_NFKC"),
+                ] {
+                    let codepoints: Vec<_> =
+                        chars.extract_if(|(_, ty)| *ty == Some(id_ty)).collect();
+                    if codepoints.is_empty() {
+                        continue;
+                    }
                    cx.emit_span_lint(
                        UNCOMMON_CODEPOINTS,
                        sp,
-                    IdentifierUncommonCodepoints { codepoints, codepoints_len },
+                        IdentifierUncommonCodepoints {
+                            codepoints_len: codepoints.len(),
+                            codepoints: codepoints.into_iter().map(|(c, _)| c).collect(),
+                            identifier_type: id_ty_descr,
+                        },
                    );
                }
+
+                let remaining = chars
+                    .extract_if(|(c, _)| !GeneralSecurityProfile::identifier_allowed(*c))
+                    .collect::<Vec<_>>();
+                if !remaining.is_empty() {
+                    cx.emit_span_lint(
+                        UNCOMMON_CODEPOINTS,
+                        sp,
+                        IdentifierUncommonCodepoints {
+                            codepoints_len: remaining.len(),
+                            codepoints: remaining.into_iter().map(|(c, _)| c).collect(),
+                            identifier_type: "Restricted",
+                        },
+                    );
+                }
+            }
        }

        if has_non_ascii_idents && check_confusable_idents {
--- a/tests/ui/lexer/lex-emoji-identifiers.rs
+++ b/tests/ui/lexer/lex-emoji-identifiers.rs
@ -4,7 +4,7 @@ fn invalid_emoji_usages() {
    let wireless🛜 = "basic emoji"; //~ ERROR: identifiers cannot contain emoji
    // FIXME
    let key1️⃣ = "keycap sequence"; //~ ERROR: unknown start of token
-                                    //~^ WARN: identifier contains an uncommon Unicode codepoint
+                                    //~^ WARN: identifier contains an uncommon character: '\u{fe0f}'
    let flag🇺🇳 = "flag sequence"; //~ ERROR: identifiers cannot contain emoji
    let wales🏴 = "tag sequence"; //~ ERROR: identifiers cannot contain emoji
    let folded🙏🏿 = "modifier sequence"; //~ ERROR: identifiers cannot contain emoji
--- a/tests/ui/lexer/lex-emoji-identifiers.stderr
+++ b/tests/ui/lexer/lex-emoji-identifiers.stderr
@ -40,12 +40,13 @@ error: identifiers cannot contain emoji: `folded🙏🏿`
 LL |     let folded🙏🏿 = "modifier sequence";
   |         ^^^^^^^^^^

-warning: identifier contains an uncommon Unicode codepoint: '\u{fe0f}'
+warning: identifier contains an uncommon character: '\u{fe0f}'
  --> $DIR/lex-emoji-identifiers.rs:6:9
   |
 LL |     let key1️⃣ = "keycap sequence";
   |         ^^^^
   |
+   = note: this character is included in the Unicode general security profile
   = note: `#[warn(uncommon_codepoints)]` on by default

 error: aborting due to 7 previous errors; 1 warning emitted
--- a/tests/ui/lint/rfc-2457-non-ascii-idents/lint-uncommon-codepoints.rs
+++ b/tests/ui/lint/rfc-2457-non-ascii-idents/lint-uncommon-codepoints.rs
@ -1,12 +1,13 @@
 #![deny(uncommon_codepoints)]

-const µ: f64 = 0.000001; //~ ERROR identifier contains an uncommon Unicode codepoint
+const µ: f64 = 0.000001; //~ identifier contains a non normalized (NFKC) character: 'µ'
 //~| WARNING should have an upper case name

-fn dĳkstra() {} //~ ERROR identifier contains an uncommon Unicode codepoint
+fn dĳkstra() {}
+//~^ ERROR identifier contains a non normalized (NFKC) character: 'ĳ'

 fn main() {
-    let ㇻㇲㇳ = "rust"; //~ ERROR identifier contains uncommon Unicode codepoints
+    let ㇻㇲㇳ = "rust"; //~ ERROR identifier contains uncommon characters: 'ㇻ', 'ㇲ', and 'ㇳ'

    // using the same identifier the second time won't trigger the lint.
    println!("{}", ㇻㇲㇳ);
--- a/tests/ui/lint/rfc-2457-non-ascii-idents/lint-uncommon-codepoints.stderr
+++ b/tests/ui/lint/rfc-2457-non-ascii-idents/lint-uncommon-codepoints.stderr
@ -1,26 +1,31 @@
-error: identifier contains an uncommon Unicode codepoint: 'µ'
+error: identifier contains a non normalized (NFKC) character: 'µ'
  --> $DIR/lint-uncommon-codepoints.rs:3:7
   |
 LL | const µ: f64 = 0.000001;
   |       ^
   |
+   = note: this character is included in the Not_NFKC Unicode general security profile
 note: the lint level is defined here
  --> $DIR/lint-uncommon-codepoints.rs:1:9
   |
 LL | #![deny(uncommon_codepoints)]
   |         ^^^^^^^^^^^^^^^^^^^

-error: identifier contains an uncommon Unicode codepoint: 'ĳ'
+error: identifier contains a non normalized (NFKC) character: 'ĳ'
  --> $DIR/lint-uncommon-codepoints.rs:6:4
   |
 LL | fn dĳkstra() {}
   |    ^^^^^^^
+   |
+   = note: this character is included in the Not_NFKC Unicode general security profile

-error: identifier contains uncommon Unicode codepoints: 'ㇻ', 'ㇲ', and 'ㇳ'
-  --> $DIR/lint-uncommon-codepoints.rs:9:9
+error: identifier contains uncommon characters: 'ㇻ', 'ㇲ', and 'ㇳ'
+  --> $DIR/lint-uncommon-codepoints.rs:10:9
   |
 LL |     let ㇻㇲㇳ = "rust";
   |         ^^^^^^
+   |
+   = note: these characters are included in the Unicode general security profile

 warning: constant `µ` should have an upper case name
  --> $DIR/lint-uncommon-codepoints.rs:3:7