From f3661dce09bc715a46c01f7ea57694e06b587f29 Mon Sep 17 00:00:00 2001
From: Michael Howell <michael@notriddle.com>
Date: Mon, 10 Jun 2024 15:01:31 -0700
Subject: [PATCH] rustdoc: word wrap CamelCase in the item list table

This is an alternative to ee6459d6521cf6a4c2e08b6e13ce3c6ce5d55ed0.
That is, it fixes the issue that affects the very long type names
in https://docs.rs/async-stripe/0.31.0/stripe/index.html#structs.

This is, necessarily, a pile of nasty heuristics.
We need to balance a few issues:

- Sometimes, there's no real word break.
  For example, `BTreeMap` should be `BTree<wbr>Map`,
  not `B<wbr>Tree<wbr>Map`.

- Sometimes, there's a legit word break,
  but the name is tiny and the HTML overhead isn't worth it.
  For example, if we're typesetting `TyCtx`,
  writing `Ty<wbr>Ctx` would have an HTML overhead of 50%.
  Line breaking inside it makes no sense.
---
 Cargo.lock                                    |  1 +
 src/librustdoc/Cargo.toml                     |  1 +
 src/librustdoc/html/escape.rs                 | 44 ++++++++++++++
 src/librustdoc/html/escape/tests.rs           | 57 +++++++++++++++++++
 src/librustdoc/html/format.rs                 |  3 +-
 src/librustdoc/html/render/print_item.rs      |  6 +-
 ...long_typename.extremely_long_typename.html |  1 +
 tests/rustdoc/extremely_long_typename.rs      |  7 +++
 .../item-desc-list-at-start.item-table.html   |  2 +-
 9 files changed, 117 insertions(+), 5 deletions(-)
 create mode 100644 src/librustdoc/html/escape/tests.rs
 create mode 100644 tests/rustdoc/extremely_long_typename.extremely_long_typename.html
 create mode 100644 tests/rustdoc/extremely_long_typename.rs
diff --git a/Cargo.lock b/Cargo.lock
index 281599a21fc..1a7d7e3f5d7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4826,6 +4826,7 @@ dependencies = [
  "tracing",
  "tracing-subscriber",
  "tracing-tree",
+ "unicode-segmentation",
 ]
 
 [[package]]
diff --git a/src/librustdoc/Cargo.toml b/src/librustdoc/Cargo.toml
index fe531f0ff59..dfd7414652f 100644
--- a/src/librustdoc/Cargo.toml
+++ b/src/librustdoc/Cargo.toml
@@ -23,6 +23,7 @@ tempfile = "3"
 tracing = "0.1"
 tracing-tree = "0.3.0"
 threadpool = "1.8.1"
+unicode-segmentation = "1.9"
 
 [dependencies.tracing-subscriber]
 version = "0.3.3"
diff --git a/src/librustdoc/html/escape.rs b/src/librustdoc/html/escape.rs
index ea4b573aeb9..94414913163 100644
--- a/src/librustdoc/html/escape.rs
+++ b/src/librustdoc/html/escape.rs
@@ -5,6 +5,8 @@
 
 use std::fmt;
 
+use unicode_segmentation::UnicodeSegmentation;
+
 /// Wrapper struct which will emit the HTML-escaped version of the contained
 /// string when passed to a format string.
 pub(crate) struct Escape<'a>(pub &'a str);
@@ -74,3 +76,45 @@ fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
         Ok(())
     }
 }
+
+/// Wrapper struct which will emit the HTML-escaped version of the contained
+/// string when passed to a format string. It also inserts `<wbr>` line-break
+/// opportunities between the words of CamelCase and snake_case names.
+///
+/// This is only safe to use for text nodes. If you need your output to be
+/// safely contained in an attribute, use [`Escape`]. If you don't know the
+/// difference, use [`Escape`].
+pub(crate) struct EscapeBodyTextWithWbr<'a>(pub &'a str);
+
+impl<'a> fmt::Display for EscapeBodyTextWithWbr<'a> {
+    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let EscapeBodyTextWithWbr(text) = *self;
+        if text.len() < 8 {
+            return EscapeBodyText(text).fmt(fmt); // under 8 bytes: escape only, no breaks
+        }
+        let mut last = 0; // byte offset of the first byte not yet written out
+        let mut it = text.grapheme_indices(true).peekable();
+        let _ = it.next(); // skip the first grapheme: never insert wbr before it
+        while let Some((i, s)) = it.next() {
+            let pk = it.peek(); // `None` at the end; both `next_is_*` checks then report `true`
+            let is_uppercase = || s.chars().any(|c| c.is_uppercase());
+            let next_is_uppercase =
+                || pk.map_or(true, |(_, t)| t.chars().any(|c| c.is_uppercase()));
+            let next_is_underscore = || pk.map_or(true, |(_, t)| t.contains('_'));
+            if (i - last > 3 && is_uppercase() && !next_is_uppercase())
+                || (s.contains('_') && !next_is_underscore())
+            {
+                EscapeBodyText(&text[last..i]).fmt(fmt)?;
+                fmt.write_str("<wbr>")?;
+                last = i; // the next segment starts at the grapheme we just broke before
+            }
+        }
+        if last < text.len() {
+            EscapeBodyText(&text[last..]).fmt(fmt)?; // flush the remaining tail
+        }
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests;
diff --git a/src/librustdoc/html/escape/tests.rs b/src/librustdoc/html/escape/tests.rs
new file mode 100644
index 00000000000..f99a2a693a6
--- /dev/null
+++ b/src/librustdoc/html/escape/tests.rs
@@ -0,0 +1,57 @@
+// basic examples
+#[test]
+fn escape_body_text_with_wbr() {
+    use super::EscapeBodyTextWithWbr as E;
+    // extreme corner cases
+    assert_eq!(&E("").to_string(), "");
+    assert_eq!(&E("a").to_string(), "a");
+    assert_eq!(&E("A").to_string(), "A");
+    // real(istic) examples
+    assert_eq!(&E("FirstSecond").to_string(), "First<wbr>Second");
+    assert_eq!(&E("First_Second").to_string(), "First<wbr>_Second");
+    assert_eq!(&E("First<T>_Second").to_string(), "First&lt;<wbr>T&gt;<wbr>_Second");
+    assert_eq!(&E("first_second").to_string(), "first<wbr>_second");
+    assert_eq!(&E("MY_CONSTANT").to_string(), "MY<wbr>_CONSTANT");
+    assert_eq!(&E("___________").to_string(), "___________");
+    // a string won't get wrapped if it's less than 8 bytes
+    assert_eq!(&E("HashSet").to_string(), "HashSet");
+    // a CamelCase break needs at least 4 bytes since the previous break (or the start)
+    assert_eq!(&E("VecDequeue").to_string(), "VecDequeue");
+    assert_eq!(&E("VecDequeueSet").to_string(), "VecDequeue<wbr>Set");
+    // how to handle acronyms
+    assert_eq!(&E("BTreeMap").to_string(), "BTree<wbr>Map");
+    assert_eq!(&E("HTTPSProxy").to_string(), "HTTPS<wbr>Proxy");
+    // more corners
+    assert_eq!(&E("ṼẽçÑñéå").to_string(), "Ṽẽç<wbr>Ññéå");
+    assert_eq!(&E("V\u{0300}e\u{0300}c\u{0300}D\u{0300}e\u{0300}q\u{0300}u\u{0300}e\u{0300}u\u{0300}e\u{0300}").to_string(), "V\u{0300}e\u{0300}c\u{0300}<wbr>D\u{0300}e\u{0300}q\u{0300}u\u{0300}e\u{0300}u\u{0300}e\u{0300}");
+    assert_eq!(&E("LPFNACCESSIBLEOBJECTFROMWINDOW").to_string(), "LPFNACCESSIBLEOBJECTFROMWINDOW");
+}
+// property test: exhaustively check every 8-byte string over the alphabet {a, A, _}
+#[test]
+fn escape_body_text_with_wbr_makes_sense() {
+    use itertools::Itertools as _;
+
+    use super::EscapeBodyTextWithWbr as E;
+    const C: [u8; 3] = [b'a', b'A', b'_'];
+    for chars in [
+        C.into_iter(),
+        C.into_iter(),
+        C.into_iter(),
+        C.into_iter(),
+        C.into_iter(),
+        C.into_iter(),
+        C.into_iter(),
+        C.into_iter(),
+    ]
+    .into_iter()
+    .multi_cartesian_product()
+    {
+        let s = String::from_utf8(chars).unwrap();
+        assert_eq!(s.len(), 8);
+        let esc = E(&s).to_string();
+        assert!(!esc.contains("<wbr><wbr>"));
+        assert!(!esc.ends_with("<wbr>"));
+        assert!(!esc.starts_with("<wbr>"));
+        assert_eq!(&esc.replace("<wbr>", ""), &s);
+    }
+}
diff --git a/src/librustdoc/html/format.rs b/src/librustdoc/html/format.rs
index d6aed75103d..bb5ac303ffd 100644
--- a/src/librustdoc/html/format.rs
+++ b/src/librustdoc/html/format.rs
@@ -32,7 +32,7 @@
 use crate::clean::{self, ExternalCrate, PrimitiveType};
 use crate::formats::cache::Cache;
 use crate::formats::item_type::ItemType;
-use crate::html::escape::Escape;
+use crate::html::escape::{Escape, EscapeBodyText};
 use crate::html::render::Context;
 use crate::passes::collect_intra_doc_links::UrlFragment;
 
@@ -988,6 +988,7 @@ pub(crate) fn anchor<'a, 'cx: 'a>(
                 f,
                 r#"<a class="{short_ty}" href="{url}" title="{short_ty} {path}">{text}</a>"#,
                 path = join_with_double_colon(&fqp),
+                text = EscapeBodyText(text.as_str()),
             )
         } else {
             f.write_str(text.as_str())
diff --git a/src/librustdoc/html/render/print_item.rs b/src/librustdoc/html/render/print_item.rs
index 24476e80778..3f01c082ba9 100644
--- a/src/librustdoc/html/render/print_item.rs
+++ b/src/librustdoc/html/render/print_item.rs
@@ -29,7 +29,7 @@
 use crate::config::ModuleSorting;
 use crate::formats::item_type::ItemType;
 use crate::formats::Impl;
-use crate::html::escape::Escape;
+use crate::html::escape::{Escape, EscapeBodyTextWithWbr};
 use crate::html::format::{
     display_fn, join_with_double_colon, print_abi_with_space, print_constness_with_space,
     print_where_clause, visibility_print_with_space, Buffer, Ending, PrintWithSpace,
@@ -423,7 +423,7 @@ fn cmp(i1: &clean::Item, i2: &clean::Item, tcx: TyCtxt<'_>) -> Ordering {
                         "<div class=\"item-name\"><code>{}extern crate {} as {};",
                         visibility_print_with_space(myitem, cx),
                         anchor(myitem.item_id.expect_def_id(), src, cx),
-                        myitem.name.unwrap(),
+                        EscapeBodyTextWithWbr(myitem.name.unwrap().as_str()),
                     ),
                     None => write!(
                         w,
@@ -520,7 +520,7 @@ fn cmp(i1: &clean::Item, i2: &clean::Item, tcx: TyCtxt<'_>) -> Ordering {
                         {stab_tags}\
                      </div>\
                      {docs_before}{docs}{docs_after}",
-                    name = myitem.name.unwrap(),
+                    name = EscapeBodyTextWithWbr(myitem.name.unwrap().as_str()),
                     visibility_and_hidden = visibility_and_hidden,
                     stab_tags = extra_info_tags(myitem, item, tcx),
                     class = myitem.type_(),
diff --git a/tests/rustdoc/extremely_long_typename.extremely_long_typename.html b/tests/rustdoc/extremely_long_typename.extremely_long_typename.html
new file mode 100644
index 00000000000..b20e59866da
--- /dev/null
+++ b/tests/rustdoc/extremely_long_typename.extremely_long_typename.html
@@ -0,0 +1 @@
+<li><div class="item-name"><a class="struct" href="struct.CreateSubscriptionPaymentSettingsPaymentMethodOptionsCustomerBalanceBankTransferEuBankTransfer.html" title="struct extremely_long_typename::CreateSubscriptionPaymentSettingsPaymentMethodOptionsCustomerBalanceBankTransferEuBankTransfer">Create<wbr />Subscription<wbr />Payment<wbr />Settings<wbr />Payment<wbr />Method<wbr />Options<wbr />Customer<wbr />Balance<wbr />Bank<wbr />Transfer<wbr />EuBank<wbr />Transfer</a></div></li>
\ No newline at end of file
diff --git a/tests/rustdoc/extremely_long_typename.rs b/tests/rustdoc/extremely_long_typename.rs
new file mode 100644
index 00000000000..212afe2d110
--- /dev/null
+++ b/tests/rustdoc/extremely_long_typename.rs
@@ -0,0 +1,7 @@
+// ignore-tidy-linelength
+// Make sure that, if an item has an extremely long type name,
+// the item table line-wraps it.
+// There should be some reasonably-placed `<wbr>` tags in the snapshot file.
+
+// @snapshot extremely_long_typename "extremely_long_typename/index.html" '//ul[@class="item-table"]/li'
+pub struct CreateSubscriptionPaymentSettingsPaymentMethodOptionsCustomerBalanceBankTransferEuBankTransfer;
diff --git a/tests/rustdoc/item-desc-list-at-start.item-table.html b/tests/rustdoc/item-desc-list-at-start.item-table.html
index 72bde573cea..ab8b1508b55 100644
--- a/tests/rustdoc/item-desc-list-at-start.item-table.html
+++ b/tests/rustdoc/item-desc-list-at-start.item-table.html
@@ -1 +1 @@
-<ul class="item-table"><li><div class="item-name"><a class="constant" href="constant.MY_CONSTANT.html" title="constant item_desc_list_at_start::MY_CONSTANT">MY_CONSTANT</a></div><div class="desc docblock-short">Groups: <code>SamplePatternSGIS</code>, <code>SamplePatternEXT</code></div></li></ul>
\ No newline at end of file
+<ul class="item-table"><li><div class="item-name"><a class="constant" href="constant.MY_CONSTANT.html" title="constant item_desc_list_at_start::MY_CONSTANT">MY<wbr />_CONSTANT</a></div><div class="desc docblock-short">Groups: <code>SamplePatternSGIS</code>, <code>SamplePatternEXT</code></div></li></ul>
\ No newline at end of file