From 6d9ee6ee26c296d9361d5f0fb479b2462687fe4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Esteban=20K=C3=BCber?= Date: Thu, 25 Jul 2024 18:38:28 +0000 Subject: [PATCH] Change output normalization logic to be linear against size of output Scan strings to be normalized for printing in a linear scan and collect the resulting `String` only once. Use a binary search when looking for chars to be replaced, instead of a `HashMap::get`. --- Cargo.lock | 1 + compiler/rustc_errors/Cargo.toml | 1 + compiler/rustc_errors/src/emitter.rs | 100 ++++++++++++++------------- 3 files changed, 55 insertions(+), 47 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 281599a21fc..43fbaf966b7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3867,6 +3867,7 @@ version = "0.0.0" dependencies = [ "annotate-snippets 0.10.2", "derive_setters", + "either", "rustc_ast", "rustc_ast_pretty", "rustc_data_structures", diff --git a/compiler/rustc_errors/Cargo.toml b/compiler/rustc_errors/Cargo.toml index 2fff9f2de50..ddf72a2d5fd 100644 --- a/compiler/rustc_errors/Cargo.toml +++ b/compiler/rustc_errors/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" # tidy-alphabetical-start annotate-snippets = "0.10" derive_setters = "0.1.6" +either = "1.5.0" rustc_ast = { path = "../rustc_ast" } rustc_ast_pretty = { path = "../rustc_ast_pretty" } rustc_data_structures = { path = "../rustc_data_structures" } diff --git a/compiler/rustc_errors/src/emitter.rs b/compiler/rustc_errors/src/emitter.rs index 73908e58085..d186996040b 100644 --- a/compiler/rustc_errors/src/emitter.rs +++ b/compiler/rustc_errors/src/emitter.rs @@ -16,6 +16,7 @@ use std::path::Path; use derive_setters::Setters; +use either::Either; use rustc_data_structures::fx::{FxHashMap, FxIndexMap, FxIndexSet}; use rustc_data_structures::sync::{DynSend, IntoDynSyncSend, Lrc}; use rustc_error_messages::{FluentArgs, SpanLabel}; @@ -2559,60 +2560,65 @@ fn num_decimal_digits(num: usize) -> usize { // We replace some characters so the CLI output is always consistent and 
underlines aligned. // Keep the following list in sync with `rustc_span::char_width`. +// ATTENTION: keep lexicographically sorted so that the binary search will work const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[ - ('\t', " "), // We do our own tab replacement - ('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters. - ('\u{202A}', "�"), // The following unicode text flow control characters are inconsistently - ('\u{202B}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk - ('\u{202D}', "�"), // not corresponding to the visible source code, so we replace them always. - ('\u{202E}', "�"), - ('\u{2066}', "�"), - ('\u{2067}', "�"), - ('\u{2068}', "�"), - ('\u{202C}', "�"), - ('\u{2069}', "�"), // In terminals without Unicode support the following will be garbled, but in *all* terminals // the underlying codepoint will be as well. We could gate this replacement behind a "unicode // support" gate. - ('\u{0000}', "␀"), - ('\u{0001}', "␁"), - ('\u{0002}', "␂"), - ('\u{0003}', "␃"), - ('\u{0004}', "␄"), - ('\u{0005}', "␅"), - ('\u{0006}', "␆"), - ('\u{0007}', "␇"), - ('\u{0008}', "␈"), - ('\u{000B}', "␋"), - ('\u{000C}', "␌"), - ('\u{000D}', "␍"), - ('\u{000E}', "␎"), - ('\u{000F}', "␏"), - ('\u{0010}', "␐"), - ('\u{0011}', "␑"), - ('\u{0012}', "␒"), - ('\u{0013}', "␓"), - ('\u{0014}', "␔"), - ('\u{0015}', "␕"), - ('\u{0016}', "␖"), - ('\u{0017}', "␗"), - ('\u{0018}', "␘"), - ('\u{0019}', "␙"), - ('\u{001A}', "␚"), - ('\u{001B}', "␛"), - ('\u{001C}', "␜"), - ('\u{001D}', "␝"), - ('\u{001E}', "␞"), - ('\u{001F}', "␟"), - ('\u{007F}', "␡"), + ('\0', "␀"), + ('\u{1}', "␁"), + ('\u{2}', "␂"), + ('\u{3}', "␃"), + ('\u{4}', "␄"), + ('\u{5}', "␅"), + ('\u{6}', "␆"), + ('\u{7}', "␇"), + ('\u{8}', "␈"), + ('\t', " "), // We do our own tab replacement + ('\u{b}', "␋"), + ('\u{c}', "␌"), + ('\r', "␍"), + ('\u{e}', "␎"), + ('\u{f}', "␏"), + ('\u{10}', "␐"), + ('\u{11}', "␑"), + ('\u{12}', "␒"), + ('\u{13}', 
"␓"), + ('\u{14}', "␔"), + ('\u{15}', "␕"), + ('\u{16}', "␖"), + ('\u{17}', "␗"), + ('\u{18}', "␘"), + ('\u{19}', "␙"), + ('\u{1a}', "␚"), + ('\u{1b}', "␛"), + ('\u{1c}', "␜"), + ('\u{1d}', "␝"), + ('\u{1e}', "␞"), + ('\u{1f}', "␟"), + ('\u{7f}', "␡"), + ('\u{200d}', ""), // Replace ZWJ for consistent terminal output of grapheme clusters. + ('\u{202a}', "�"), // The following unicode text flow control characters are inconsistently + ('\u{202b}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk + ('\u{202c}', "�"), // not corresponding to the visible source code, so we replace them always. + ('\u{202d}', "�"), + ('\u{202e}', "�"), + ('\u{2066}', "�"), + ('\u{2067}', "�"), + ('\u{2068}', "�"), + ('\u{2069}', "�"), ]; fn normalize_whitespace(str: &str) -> String { - let mut s = str.to_string(); - for (c, replacement) in OUTPUT_REPLACEMENTS { - s = s.replace(*c, replacement); - } - s + // Scan the input string for a character in the ordered table above. If it's present, replace + // it with its alternative string (it can be more than 1 char!). Otherwise, retain the input + // char. At the end, allocate all chars into a string in one operation. + str.chars() + .flat_map(|c| match OUTPUT_REPLACEMENTS.binary_search_by_key(&c, |(k, _)| *k) { + Ok(i) => Either::Left(OUTPUT_REPLACEMENTS[i].1.chars()), + _ => Either::Right([c].into_iter()), + }) + .collect() } fn draw_col_separator(buffer: &mut StyledBuffer, line: usize, col: usize) {