Make unicode text flow control chars visible as �

We already point these out quite aggressively, telling people not to use them, but they would normally be rendered as nothing. Having them visible will make it easier for people to actually deal with them.

```
error: unicode codepoint changing visible direction of text present in literal
  --> $DIR/unicode-control-codepoints.rs:26:22
   |
LL |     println!("{:?}", '�');
   |                      ^-^
   |                      ||
   |                      |'\u{202e}'
   |                      this literal contains an invisible unicode text flow control codepoint
   |
   = note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen
   = help: if their presence wasn't intentional, you can remove them
help: if you want to keep them but make them visible in your source code, you can escape them
   |
LL |     println!("{:?}", '\u{202e}');
   |                       ~~~~~~~~
```

vs the previous

```
error: unicode codepoint changing visible direction of text present in literal
  --> $DIR/unicode-control-codepoints.rs:26:22
   |
LL |     println!("{:?}", '');
   |                      ^-
   |                      ||
   |                      |'\u{202e}'
   |                      this literal contains an invisible unicode text flow control codepoint
   |
   = note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen
   = help: if their presence wasn't intentional, you can remove them
help: if you want to keep them but make them visible in your source code, you can escape them
   |
LL |     println!("{:?}", '\u{202e}');
   |                       ~~~~~~~~
```
This commit is contained in:
Esteban Küber 2024-07-18 20:02:08 +00:00
parent 2d7795dfb9
commit 9dffe9573b
3 changed files with 62 additions and 60 deletions

View File

@ -2558,18 +2558,19 @@ fn num_decimal_digits(num: usize) -> usize {
}
// We replace some characters so the CLI output is always consistent and underlines aligned.
// Keep the following list in sync with `rustc_span::char_width`.
const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
('\t', " "), // We do our own tab replacement
('\t', " "), // We do our own tab replacement
('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
('\u{202A}', ""), // The following unicode text flow control characters are inconsistently
('\u{202B}', ""), // supported across CLIs and can cause confusion due to the bytes on disk
('\u{202D}', ""), // not corresponding to the visible source code, so we replace them always.
('\u{202E}', ""),
('\u{2066}', ""),
('\u{2067}', ""),
('\u{2068}', ""),
('\u{202C}', ""),
('\u{2069}', ""),
('\u{202A}', "�"), // The following unicode text flow control characters are inconsistently
('\u{202B}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
('\u{202D}', "�"), // not corresponding to the visible source code, so we replace them always.
('\u{202E}', "�"),
('\u{2066}', "�"),
('\u{2067}', "�"),
('\u{2068}', "�"),
('\u{202C}', "�"),
('\u{2069}', "�"),
// In terminals without Unicode support the following will be garbled, but in *all* terminals
// the underlying codepoint will be as well. We could gate this replacement behind a "unicode
// support" gate.

View File

@ -2093,7 +2093,8 @@ pub fn char_width(ch: char) -> usize {
| '\u{000E}' | '\u{000F}' | '\u{0010}' | '\u{0011}' | '\u{0012}' | '\u{0013}'
| '\u{0014}' | '\u{0015}' | '\u{0016}' | '\u{0017}' | '\u{0018}' | '\u{0019}'
| '\u{001A}' | '\u{001B}' | '\u{001C}' | '\u{001D}' | '\u{001E}' | '\u{001F}'
| '\u{007F}' => 1,
| '\u{007F}' | '\u{202A}' | '\u{202B}' | '\u{202D}' | '\u{202E}' | '\u{2066}'
| '\u{2067}' | '\u{2068}' | '\u{202C}' | '\u{2069}' => 1,
_ => unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1),
}
}

View File

@ -17,78 +17,78 @@ LL | println!("{:?}", b"us\u{202B}e\u{202A}r");
error: non-ASCII character in byte string literal
--> $DIR/unicode-control-codepoints.rs:16:26
|
LL | println!("{:?}", b"/* } if isAdmin begin admins only ");
LL | println!("{:?}", b"/*� } �if isAdmin� � begin admins only ");
| ^ must be ASCII but is '\u{202e}'
|
help: if you meant to use the UTF-8 encoding of '\u{202e}', use \xHH escapes
|
LL | println!("{:?}", b"/*\xE2\x80\xAE } if isAdmin begin admins only ");
LL | println!("{:?}", b"/*\xE2\x80\xAE } �if isAdmin� � begin admins only ");
| ~~~~~~~~~~~~
error: non-ASCII character in byte string literal
--> $DIR/unicode-control-codepoints.rs:16:30
|
LL | println!("{:?}", b"/* } if isAdmin begin admins only ");
| ^ must be ASCII but is '\u{2066}'
LL | println!("{:?}", b"/*� } �if isAdmin� � begin admins only ");
| ^ must be ASCII but is '\u{2066}'
|
help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes
|
LL | println!("{:?}", b"/* } \xE2\x81\xA6if isAdmin begin admins only ");
| ~~~~~~~~~~~~
LL | println!("{:?}", b"/*� } \xE2\x81\xA6if isAdmin� � begin admins only ");
| ~~~~~~~~~~~~
error: non-ASCII character in byte string literal
--> $DIR/unicode-control-codepoints.rs:16:41
|
LL | println!("{:?}", b"/* } if isAdmin begin admins only ");
| ^ must be ASCII but is '\u{2069}'
LL | println!("{:?}", b"/*� } �if isAdmin� � begin admins only ");
| ^ must be ASCII but is '\u{2069}'
|
help: if you meant to use the UTF-8 encoding of '\u{2069}', use \xHH escapes
|
LL | println!("{:?}", b"/* } if isAdmin\xE2\x81\xA9 begin admins only ");
| ~~~~~~~~~~~~
LL | println!("{:?}", b"/*� } �if isAdmin\xE2\x81\xA9 � begin admins only ");
| ~~~~~~~~~~~~
error: non-ASCII character in byte string literal
--> $DIR/unicode-control-codepoints.rs:16:43
|
LL | println!("{:?}", b"/* } if isAdmin begin admins only ");
| ^ must be ASCII but is '\u{2066}'
LL | println!("{:?}", b"/*� } �if isAdmin� � begin admins only ");
| ^ must be ASCII but is '\u{2066}'
|
help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes
|
LL | println!("{:?}", b"/* } if isAdmin \xE2\x81\xA6 begin admins only ");
| ~~~~~~~~~~~~
LL | println!("{:?}", b"/*� } �if isAdmin� \xE2\x81\xA6 begin admins only ");
| ~~~~~~~~~~~~
error: non-ASCII character in raw byte string literal
--> $DIR/unicode-control-codepoints.rs:21:29
|
LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##);
LL | println!("{:?}", br##"/*� } �if isAdmin� � begin admins only "##);
| ^ must be ASCII but is '\u{202e}'
error: non-ASCII character in raw byte string literal
--> $DIR/unicode-control-codepoints.rs:21:33
|
LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##);
| ^ must be ASCII but is '\u{2066}'
LL | println!("{:?}", br##"/*� } �if isAdmin� � begin admins only "##);
| ^ must be ASCII but is '\u{2066}'
error: non-ASCII character in raw byte string literal
--> $DIR/unicode-control-codepoints.rs:21:44
|
LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##);
| ^ must be ASCII but is '\u{2069}'
LL | println!("{:?}", br##"/*� } �if isAdmin� � begin admins only "##);
| ^ must be ASCII but is '\u{2069}'
error: non-ASCII character in raw byte string literal
--> $DIR/unicode-control-codepoints.rs:21:46
|
LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##);
| ^ must be ASCII but is '\u{2066}'
LL | println!("{:?}", br##"/*� } �if isAdmin� � begin admins only "##);
| ^ must be ASCII but is '\u{2066}'
error: unicode codepoint changing visible direction of text present in comment
--> $DIR/unicode-control-codepoints.rs:2:5
|
LL | // if access_level != "user" { // Check if admin
| ^^^^^^^^^^^^^^^^^^^^^^^^^--^^^^^^^^^^^^^^^^^^^^^
| | ||
| | |'\u{202a}'
LL | // if access_level != "us�e�r" { // Check if admin
| ^^^^^^^^^^^^^^^^^^^^^^^^^-^-^^^^^^^^^^^^^^^^^^^^^^
| | | |
| | | '\u{202a}'
| | '\u{202b}'
| this comment contains invisible unicode text flow control codepoints
|
@ -99,12 +99,12 @@ LL | // if access_level != "user" { // Check if admin
error: unicode codepoint changing visible direction of text present in comment
--> $DIR/unicode-control-codepoints.rs:30:1
|
LL | //"/* } if isAdmin begin admins only */"
| ^^^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^^^
| | | | ||
| | | | |'\u{2066}'
| | | | '\u{2069}'
| | | '\u{2066}'
LL | //"/*� } �if isAdmin� � begin admins only */"
| ^^^^^-^^^-^^^^^^^^^^-^-^^^^^^^^^^^^^^^^^^^^^^
| | | | | |
| | | | | '\u{2066}'
| | | | '\u{2069}'
| | | '\u{2066}'
| | '\u{202e}'
| this comment contains invisible unicode text flow control codepoints
|
@ -114,12 +114,12 @@ LL | //"/* } if isAdmin begin admins only */"
error: unicode codepoint changing visible direction of text present in literal
--> $DIR/unicode-control-codepoints.rs:11:22
|
LL | println!("{:?}", "/* } if isAdmin begin admins only ");
| ^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^
| | | | ||
| | | | |'\u{2066}'
| | | | '\u{2069}'
| | | '\u{2066}'
LL | println!("{:?}", "/*� } �if isAdmin� � begin admins only ");
| ^^^-^^^-^^^^^^^^^^-^-^^^^^^^^^^^^^^^^^^^^
| | | | | |
| | | | | '\u{2066}'
| | | | '\u{2069}'
| | | '\u{2066}'
| | '\u{202e}'
| this literal contains invisible unicode text flow control codepoints
|
@ -134,12 +134,12 @@ LL | println!("{:?}", "/*\u{202e} } \u{2066}if isAdmin\u{2069} \u{2066} begi
error: unicode codepoint changing visible direction of text present in literal
--> $DIR/unicode-control-codepoints.rs:14:22
|
LL | println!("{:?}", r##"/* } if isAdmin begin admins only "##);
| ^^^^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^^^
| | | | ||
| | | | |'\u{2066}'
| | | | '\u{2069}'
| | | '\u{2066}'
LL | println!("{:?}", r##"/*� } �if isAdmin� � begin admins only "##);
| ^^^^^^-^^^-^^^^^^^^^^-^-^^^^^^^^^^^^^^^^^^^^^^
| | | | | |
| | | | | '\u{2066}'
| | | | '\u{2069}'
| | | '\u{2066}'
| | '\u{202e}'
| this literal contains invisible unicode text flow control codepoints
|
@ -153,8 +153,8 @@ LL | println!("{:?}", r##"/*\u{202e} } \u{2066}if isAdmin\u{2069} \u{2066} b
error: unicode codepoint changing visible direction of text present in literal
--> $DIR/unicode-control-codepoints.rs:26:22
|
LL | println!("{:?}", '');
| ^-
LL | println!("{:?}", '�');
| ^-^
| ||
| |'\u{202e}'
| this literal contains an invisible unicode text flow control codepoint
@ -169,8 +169,8 @@ LL | println!("{:?}", '\u{202e}');
error: unicode codepoint changing visible direction of text present in doc comment
--> $DIR/unicode-control-codepoints.rs:33:1
|
LL | /** ''); */fn foo() {}
| ^^^^^^^^^^^^ this doc comment contains an invisible unicode text flow control codepoint
LL | /** '�'); */fn foo() {}
| ^^^^^^^^^^^^^ this doc comment contains an invisible unicode text flow control codepoint
|
= note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen
= note: if their presence wasn't intentional, you can remove them
@ -181,8 +181,8 @@ error: unicode codepoint changing visible direction of text present in doc comme
|
LL | / /**
LL | | *
LL | | * ''); */fn bar() {}
| |___________^ this doc comment contains an invisible unicode text flow control codepoint
LL | | * '�'); */fn bar() {}
| |____________^ this doc comment contains an invisible unicode text flow control codepoint
|
= note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen
= note: if their presence wasn't intentional, you can remove them