From 1573d10325f286ffcbba290d27984ded9538a5c3 Mon Sep 17 00:00:00 2001
From: Philipp Hansch <dev@phansch.net>
Date: Tue, 6 Apr 2021 06:59:10 +0200
Subject: [PATCH 1/5] tabs_in_doc_comments: Fix ICE due to char indexing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is a quick-fix for an ICE in `tabs_in_doc_comments`. The problem
was that we we're indexing into possibly multi-byte characters, such as '位'.

More specifically `get_chunks_of_tabs` was returning indices into
multi-byte characters. Those were passed on to a `Span` creation that
then caused the ICE.

This fix makes sure that we don't return indices that point inside a
multi-byte character. *However*, we are still iterating over unicode
codepoints, not grapheme clusters. So a seemingly single character like y̆ ,
which actually consists of two codepoints, will probably still cause
incorrect spans in the output.
---
 clippy_lints/src/tabs_in_doc_comments.rs | 28 ++++++++++++++----------
 tests/ui/crashes/ice-5835.rs             |  8 +++++++
 tests/ui/crashes/ice-5835.stderr         | 20 +++++++++++++++++
 3 files changed, 45 insertions(+), 11 deletions(-)
 create mode 100644 tests/ui/crashes/ice-5835.rs
 create mode 100644 tests/ui/crashes/ice-5835.stderr

diff --git a/clippy_lints/src/tabs_in_doc_comments.rs b/clippy_lints/src/tabs_in_doc_comments.rs
index 88bd2feaadd..3f9692540f7 100644
--- a/clippy_lints/src/tabs_in_doc_comments.rs
+++ b/clippy_lints/src/tabs_in_doc_comments.rs
@@ -104,30 +104,29 @@ fn get_chunks_of_tabs(the_str: &str) -> Vec<(u32, u32)> {
     // tracker to decide if the last group of tabs is not closed by a non-tab character
     let mut is_active = false;
 
-    let chars_array: Vec<_> = the_str.chars().collect();
+    let char_indices: Vec<_> = the_str.char_indices().collect();
 
-    if chars_array == vec!['\t'] {
+    if char_indices.len() == 1 && char_indices.first().unwrap().1 == '\t' {
         return vec![(0, 1)];
     }
 
-    for (index, arr) in chars_array.windows(2).enumerate() {
-        let index = u32::try_from(index).expect(line_length_way_to_long);
-        match arr {
-            ['\t', '\t'] => {
+    for entry in char_indices.windows(2) {
+        match entry {
+            [(_, '\t'), (_, '\t')] => {
                 // either string starts with double tab, then we have to set it active,
                 // otherwise is_active is true anyway
                 is_active = true;
             },
-            [_, '\t'] => {
+            [(_, _), (index_b, '\t')] => {
                 // as ['\t', '\t'] is excluded, this has to be a start of a tab group,
                 // set indices accordingly
                 is_active = true;
-                current_start = index + 1;
+                current_start = *index_b as u32;
             },
-            ['\t', _] => {
+            [(_, '\t'), (index_b, _)] => {
                 // this now has to be an end of the group, hence we have to push a new tuple
                 is_active = false;
-                spans.push((current_start, index + 1));
+                spans.push((current_start, *index_b as u32));
             },
             _ => {},
         }
@@ -137,7 +136,7 @@ fn get_chunks_of_tabs(the_str: &str) -> Vec<(u32, u32)> {
     if is_active {
         spans.push((
             current_start,
-            u32::try_from(the_str.chars().count()).expect(line_length_way_to_long),
+            u32::try_from(char_indices.last().unwrap().0 + 1).expect(line_length_way_to_long),
         ));
     }
 
@@ -148,6 +147,13 @@ fn get_chunks_of_tabs(the_str: &str) -> Vec<(u32, u32)> {
 mod tests_for_get_chunks_of_tabs {
     use super::get_chunks_of_tabs;
 
+    #[test]
+    fn test_unicode_han_string() {
+        let res = get_chunks_of_tabs(" 位\t");
+
+        assert_eq!(res, vec![(4, 5)]);
+    }
+
     #[test]
     fn test_empty_string() {
         let res = get_chunks_of_tabs("");
diff --git a/tests/ui/crashes/ice-5835.rs b/tests/ui/crashes/ice-5835.rs
new file mode 100644
index 00000000000..209a5b1eb09
--- /dev/null
+++ b/tests/ui/crashes/ice-5835.rs
@@ -0,0 +1,8 @@
+#![rustfmt::skip]
+
+pub struct Foo {
+    /// 位	
+    pub bar: u8,
+}
+
+fn main() {}
diff --git a/tests/ui/crashes/ice-5835.stderr b/tests/ui/crashes/ice-5835.stderr
new file mode 100644
index 00000000000..e286bc580ad
--- /dev/null
+++ b/tests/ui/crashes/ice-5835.stderr
@@ -0,0 +1,20 @@
+error[E0658]: custom inner attributes are unstable
+  --> $DIR/ice-5835.rs:1:4
+   |
+LL | #![rustfmt::skip]
+   |    ^^^^^^^^^^^^^
+   |
+   = note: see issue #54726 <https://github.com/rust-lang/rust/issues/54726> for more information
+   = help: add `#![feature(custom_inner_attributes)]` to the crate attributes to enable
+
+error: using tabs in doc comments is not recommended
+  --> $DIR/ice-5835.rs:4:10
+   |
+LL |     /// 位    
+   |           ^^^^ help: consider using four spaces per tab
+   |
+   = note: `-D clippy::tabs-in-doc-comments` implied by `-D warnings`
+
+error: aborting due to 2 previous errors
+
+For more information about this error, try `rustc --explain E0658`.

From dde46c9340994ad396a8677702be58122384a5ed Mon Sep 17 00:00:00 2001
From: Philipp Hansch <dev@phansch.net>
Date: Sat, 10 Apr 2021 14:42:33 +0200
Subject: [PATCH 2/5] Replace complex conditional with pattern matching

---
 clippy_lints/src/tabs_in_doc_comments.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clippy_lints/src/tabs_in_doc_comments.rs b/clippy_lints/src/tabs_in_doc_comments.rs
index 3f9692540f7..d68227545a6 100644
--- a/clippy_lints/src/tabs_in_doc_comments.rs
+++ b/clippy_lints/src/tabs_in_doc_comments.rs
@@ -106,7 +106,7 @@ fn get_chunks_of_tabs(the_str: &str) -> Vec<(u32, u32)> {
 
     let char_indices: Vec<_> = the_str.char_indices().collect();
 
-    if char_indices.len() == 1 && char_indices.first().unwrap().1 == '\t' {
+    if let &[(_, '\t')] = &char_indices.as_slice() {
         return vec![(0, 1)];
     }
 

From 8b9331b49dd86cbc1ec655f6a6f5f9d74ca8d901 Mon Sep 17 00:00:00 2001
From: Philipp Hansch <dev@phansch.net>
Date: Sat, 10 Apr 2021 14:43:52 +0200
Subject: [PATCH 3/5] Fix rustfmt error / Add comment for tab character

---
 tests/ui/crashes/ice-5835.rs     |  5 +++--
 tests/ui/crashes/ice-5835.stderr | 14 ++------------
 2 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/tests/ui/crashes/ice-5835.rs b/tests/ui/crashes/ice-5835.rs
index 209a5b1eb09..5e99cb432b6 100644
--- a/tests/ui/crashes/ice-5835.rs
+++ b/tests/ui/crashes/ice-5835.rs
@@ -1,7 +1,8 @@
-#![rustfmt::skip]
-
+#[rustfmt::skip]
 pub struct Foo {
     /// 位	
+    ///   ^ Do not remove this tab character.
+    ///   It was required to trigger the ICE.
     pub bar: u8,
 }
 
diff --git a/tests/ui/crashes/ice-5835.stderr b/tests/ui/crashes/ice-5835.stderr
index e286bc580ad..c972bcb60a0 100644
--- a/tests/ui/crashes/ice-5835.stderr
+++ b/tests/ui/crashes/ice-5835.stderr
@@ -1,20 +1,10 @@
-error[E0658]: custom inner attributes are unstable
-  --> $DIR/ice-5835.rs:1:4
-   |
-LL | #![rustfmt::skip]
-   |    ^^^^^^^^^^^^^
-   |
-   = note: see issue #54726 <https://github.com/rust-lang/rust/issues/54726> for more information
-   = help: add `#![feature(custom_inner_attributes)]` to the crate attributes to enable
-
 error: using tabs in doc comments is not recommended
-  --> $DIR/ice-5835.rs:4:10
+  --> $DIR/ice-5835.rs:3:10
    |
 LL |     /// 位    
    |           ^^^^ help: consider using four spaces per tab
    |
    = note: `-D clippy::tabs-in-doc-comments` implied by `-D warnings`
 
-error: aborting due to 2 previous errors
+error: aborting due to previous error
 
-For more information about this error, try `rustc --explain E0658`.

From 47a4865406ec748729a6e7ec23e5a4d5f9b88230 Mon Sep 17 00:00:00 2001
From: Philipp Hansch <dev@phansch.net>
Date: Sat, 10 Apr 2021 15:30:51 +0200
Subject: [PATCH 4/5] Fix dogfood

---
 clippy_lints/src/tabs_in_doc_comments.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/clippy_lints/src/tabs_in_doc_comments.rs b/clippy_lints/src/tabs_in_doc_comments.rs
index d68227545a6..51581b75ce3 100644
--- a/clippy_lints/src/tabs_in_doc_comments.rs
+++ b/clippy_lints/src/tabs_in_doc_comments.rs
@@ -106,7 +106,7 @@ fn get_chunks_of_tabs(the_str: &str) -> Vec<(u32, u32)> {
 
     let char_indices: Vec<_> = the_str.char_indices().collect();
 
-    if let &[(_, '\t')] = &char_indices.as_slice() {
+    if let [(_, '\t')] = char_indices.as_slice() {
         return vec![(0, 1)];
     }
 
@@ -121,12 +121,12 @@ fn get_chunks_of_tabs(the_str: &str) -> Vec<(u32, u32)> {
                 // as ['\t', '\t'] is excluded, this has to be a start of a tab group,
                 // set indices accordingly
                 is_active = true;
-                current_start = *index_b as u32;
+                current_start = u32::try_from(*index_b).unwrap();
             },
             [(_, '\t'), (index_b, _)] => {
                 // this now has to be an end of the group, hence we have to push a new tuple
                 is_active = false;
-                spans.push((current_start, *index_b as u32));
+                spans.push((current_start, u32::try_from(*index_b).unwrap()));
             },
             _ => {},
         }
@@ -149,7 +149,7 @@ mod tests_for_get_chunks_of_tabs {
 
     #[test]
     fn test_unicode_han_string() {
-        let res = get_chunks_of_tabs(" 位\t");
+        let res = get_chunks_of_tabs(" \u{4f4d}\t");
 
         assert_eq!(res, vec![(4, 5)]);
     }

From cbdebd97ec4846391dc0f9a1288a3ab1fc053f99 Mon Sep 17 00:00:00 2001
From: Philipp Hansch <dev@phansch.net>
Date: Wed, 14 Apr 2021 06:30:51 +0200
Subject: [PATCH 5/5] Explain why we use `char_indices()` instead of `chars()`

---
 clippy_lints/src/tabs_in_doc_comments.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/clippy_lints/src/tabs_in_doc_comments.rs b/clippy_lints/src/tabs_in_doc_comments.rs
index 51581b75ce3..1995981b905 100644
--- a/clippy_lints/src/tabs_in_doc_comments.rs
+++ b/clippy_lints/src/tabs_in_doc_comments.rs
@@ -104,6 +104,9 @@ fn get_chunks_of_tabs(the_str: &str) -> Vec<(u32, u32)> {
     // tracker to decide if the last group of tabs is not closed by a non-tab character
     let mut is_active = false;
 
+    // Note that we specifically need the char _byte_ indices here, not the positional indexes
+    // within the char array to deal with multi-byte characters properly. `char_indices` does
+    // exactly that. It provides an iterator over tuples of the form `(byte position, char)`.
     let char_indices: Vec<_> = the_str.char_indices().collect();
 
     if let [(_, '\t')] = char_indices.as_slice() {