Switch linkchecker to use html5ever for html parsing.
The existing regex-based HTML parsing was too primitive to handle HTML content correctly. Some books contain legitimate `href="…"` text which should not be validated, because it is part of the prose, not an actual HTML attribute.
parent bf6a1b1245
commit 776590b14e
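
As a minimal sketch of the failure mode (not part of the commit, hypothetical values throughout): the `href` below is plain text inside a `<code>` element, yet the pattern quoted in the old code's comment still matches it.

// Why a regex scan over raw HTML misfires (standalone sketch).
use regex::Regex;

fn main() {
    // Here `href="foo.html"` is document text inside <code>, not markup.
    let html = r#"<p>Write links as <code>href="foo.html"</code>.</p>"#;
    let re = Regex::new(r#"href[ ]*=[ ]*".*?""#).unwrap();
    // A text scan cannot tell prose from markup, so it "finds" a link:
    assert!(re.is_match(html));
}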
Cargo.lock:
@@ -2274,6 +2274,7 @@ dependencies = [
 name = "linkchecker"
 version = "0.1.0"
 dependencies = [
+ "html5ever",
  "once_cell",
  "regex",
 ]
Cargo.toml:
@@ -10,3 +10,4 @@ path = "main.rs"
 [dependencies]
 regex = "1"
 once_cell = "1"
+html5ever = "0.26.0"
main.rs:
@@ -14,6 +14,12 @@
 //! A few exceptions are allowed as there's known bugs in rustdoc, but this
 //! should catch the majority of "broken link" cases.
 
+use html5ever::tendril::ByteTendril;
+use html5ever::tokenizer::{
+    BufferQueue, TagToken, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
+};
+use once_cell::sync::Lazy;
+use regex::Regex;
 use std::cell::RefCell;
 use std::collections::{HashMap, HashSet};
 use std::env;
@@ -23,9 +29,6 @@
 use std::rc::Rc;
 use std::time::Instant;
 
-use once_cell::sync::Lazy;
-use regex::Regex;
-
 // Add linkcheck exceptions here
 // If at all possible you should use intra-doc links to avoid linkcheck issues. These
 // are cases where that does not work
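
For orientation before the code below (a minimal sketch, assuming html5ever 0.26): the tokenizer drives a caller-supplied TokenSink, handing it every token together with a line number. That is all the checker needs, since it only inspects tag attributes and never builds a DOM.

use html5ever::tendril::ByteTendril;
use html5ever::tokenizer::{
    BufferQueue, TagToken, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
};

// A trivial sink that counts tags; the commit's collector gathers attributes.
struct TagCounter {
    tags: usize,
}

impl TokenSink for TagCounter {
    type Handle = ();

    fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
        if let TagToken(_) = token {
            self.tags += 1;
        }
        TokenSinkResult::Continue
    }
}

fn main() {
    let source = "<p><a href='a.html'>one</a></p>";
    let tendril: ByteTendril = source.as_bytes().into();
    let mut input = BufferQueue::new();
    input.push_back(tendril.try_reinterpret().unwrap());

    let mut tok = Tokenizer::new(TagCounter { tags: 0 }, TokenizerOpts::default());
    let _ = tok.feed(&mut input);
    tok.end();
    println!("saw {} tags", tok.sink.tags); // <p>, <a>, </a>, </p> => 4
}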
@@ -182,163 +185,10 @@ fn check(&mut self, file: &Path, report: &mut Report) {
             }
         };
 
-        // Search for anything that's the regex 'href[ ]*=[ ]*".*?"'
-        with_attrs_in_source(&source, " href", |url, i, base| {
-            // Ignore external URLs
-            if url.starts_with("http:")
-                || url.starts_with("https:")
-                || url.starts_with("javascript:")
-                || url.starts_with("ftp:")
-                || url.starts_with("irc:")
-                || url.starts_with("data:")
-                || url.starts_with("mailto:")
-            {
-                report.links_ignored_external += 1;
-                return;
-            }
-            report.links_checked += 1;
-            let (url, fragment) = match url.split_once('#') {
-                None => (url, None),
-                Some((url, fragment)) => (url, Some(fragment)),
-            };
-            // NB: the `splitn` always succeeds, even if the delimiter is not present.
-            let url = url.splitn(2, '?').next().unwrap();
-
-            // Once we've plucked out the URL, parse it using our base url and
-            // then try to extract a file path.
-            let mut path = file.to_path_buf();
-            if !base.is_empty() || !url.is_empty() {
-                path.pop();
-                for part in Path::new(base).join(url).components() {
-                    match part {
-                        Component::Prefix(_) | Component::RootDir => {
-                            // Avoid absolute paths as they make the docs not
-                            // relocatable by making assumptions on where the docs
-                            // are hosted relative to the site root.
-                            report.errors += 1;
-                            println!(
-                                "{}:{}: absolute path - {}",
-                                pretty_path,
-                                i + 1,
-                                Path::new(base).join(url).display()
-                            );
-                            return;
-                        }
-                        Component::CurDir => {}
-                        Component::ParentDir => {
-                            path.pop();
-                        }
-                        Component::Normal(s) => {
-                            path.push(s);
-                        }
-                    }
-                }
-            }
-
-            let (target_pretty_path, target_entry) = self.load_file(&path, report);
-            let (target_source, target_ids) = match target_entry {
-                FileEntry::Missing => {
-                    if is_exception(file, &target_pretty_path) {
-                        report.links_ignored_exception += 1;
-                    } else {
-                        report.errors += 1;
-                        println!(
-                            "{}:{}: broken link - `{}`",
-                            pretty_path,
-                            i + 1,
-                            target_pretty_path
-                        );
-                    }
-                    return;
-                }
-                FileEntry::Dir => {
-                    // Links to directories show as directory listings when viewing
-                    // the docs offline so it's best to avoid them.
-                    report.errors += 1;
-                    println!(
-                        "{}:{}: directory link to `{}` \
-                         (directory links should use index.html instead)",
-                        pretty_path,
-                        i + 1,
-                        target_pretty_path
-                    );
-                    return;
-                }
-                FileEntry::OtherFile => return,
-                FileEntry::Redirect { target } => {
-                    let t = target.clone();
-                    let (target, redir_entry) = self.load_file(&t, report);
-                    match redir_entry {
-                        FileEntry::Missing => {
-                            report.errors += 1;
-                            println!(
-                                "{}:{}: broken redirect from `{}` to `{}`",
-                                pretty_path,
-                                i + 1,
-                                target_pretty_path,
-                                target
-                            );
-                            return;
-                        }
-                        FileEntry::Redirect { target } => {
-                            // Redirect to a redirect, this link checker
-                            // currently doesn't support this, since it would
-                            // require cycle checking, etc.
-                            report.errors += 1;
-                            println!(
-                                "{}:{}: redirect from `{}` to `{}` \
-                                 which is also a redirect (not supported)",
-                                pretty_path,
-                                i + 1,
-                                target_pretty_path,
-                                target.display()
-                            );
-                            return;
-                        }
-                        FileEntry::Dir => {
-                            report.errors += 1;
-                            println!(
-                                "{}:{}: redirect from `{}` to `{}` \
-                                 which is a directory \
-                                 (directory links should use index.html instead)",
-                                pretty_path,
-                                i + 1,
-                                target_pretty_path,
-                                target
-                            );
-                            return;
-                        }
-                        FileEntry::OtherFile => return,
-                        FileEntry::HtmlFile { source, ids } => (source, ids),
-                    }
-                }
-                FileEntry::HtmlFile { source, ids } => (source, ids),
-            };
-
-            // Alright, if we've found an HTML file for the target link. If
-            // this is a fragment link, also check that the `id` exists.
-            if let Some(ref fragment) = fragment {
-                // Fragments like `#1-6` are most likely line numbers to be
-                // interpreted by javascript, so we're ignoring these
-                if fragment.splitn(2, '-').all(|f| f.chars().all(|c| c.is_numeric())) {
-                    return;
-                }
-
-                parse_ids(&mut target_ids.borrow_mut(), &pretty_path, target_source, report);
-
-                if target_ids.borrow().contains(*fragment) {
-                    return;
-                }
-
-                if is_exception(file, &format!("#{}", fragment)) {
-                    report.links_ignored_exception += 1;
-                } else {
-                    report.errors += 1;
-                    print!("{}:{}: broken link fragment ", pretty_path, i + 1);
-                    println!("`#{}` pointing to `{}`", fragment, target_pretty_path);
-                };
-            }
-        });
+        let (base, urls) = get_urls(&source);
+        for (i, url) in urls {
+            self.check_url(file, &pretty_path, report, &base, i, &url);
+        }
 
         self.check_intra_doc_links(file, &pretty_path, &source, report);
 
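The removed closure above (and check_url added below) resolves links the same way: join the <base href> value with the link, then walk Components to apply `.` and `..` against the containing file's directory, purely lexically, never touching the filesystem. A toy run with made-up paths:

use std::path::{Component, Path, PathBuf};

fn main() {
    let mut path = PathBuf::from("build/doc/std/vec/index.html");
    path.pop(); // drop the file name, keeping the containing directory
    let (base, url) = ("../", "string/struct.String.html"); // hypothetical
    for part in Path::new(base).join(url).components() {
        match part {
            // absolute paths are reported as errors by the checker
            Component::Prefix(_) | Component::RootDir => unreachable!(),
            Component::CurDir => {}
            Component::ParentDir => {
                path.pop();
            }
            Component::Normal(s) => path.push(s),
        }
    }
    assert_eq!(path, PathBuf::from("build/doc/std/string/struct.String.html"));
}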
@@ -350,6 +200,159 @@ fn check(&mut self, file: &Path, report: &mut Report) {
         }
     }
 
+    fn check_url(
+        &mut self,
+        file: &Path,
+        pretty_path: &str,
+        report: &mut Report,
+        base: &Option<String>,
+        i: u64,
+        url: &str,
+    ) {
+        // Ignore external URLs
+        if url.starts_with("http:")
+            || url.starts_with("https:")
+            || url.starts_with("javascript:")
+            || url.starts_with("ftp:")
+            || url.starts_with("irc:")
+            || url.starts_with("data:")
+            || url.starts_with("mailto:")
+        {
+            report.links_ignored_external += 1;
+            return;
+        }
+        report.links_checked += 1;
+        let (url, fragment) = match url.split_once('#') {
+            None => (url, None),
+            Some((url, fragment)) => (url, Some(fragment)),
+        };
+        // NB: the `splitn` always succeeds, even if the delimiter is not present.
+        let url = url.splitn(2, '?').next().unwrap();
+
+        // Once we've plucked out the URL, parse it using our base url and
+        // then try to extract a file path.
+        let mut path = file.to_path_buf();
+        if base.is_some() || !url.is_empty() {
+            let base = base.as_deref().unwrap_or("");
+            path.pop();
+            for part in Path::new(base).join(url).components() {
+                match part {
+                    Component::Prefix(_) | Component::RootDir => {
+                        // Avoid absolute paths as they make the docs not
+                        // relocatable by making assumptions on where the docs
+                        // are hosted relative to the site root.
+                        report.errors += 1;
+                        println!(
+                            "{}:{}: absolute path - {}",
+                            pretty_path,
+                            i,
+                            Path::new(base).join(url).display()
+                        );
+                        return;
+                    }
+                    Component::CurDir => {}
+                    Component::ParentDir => {
+                        path.pop();
+                    }
+                    Component::Normal(s) => {
+                        path.push(s);
+                    }
+                }
+            }
+        }
+
+        let (target_pretty_path, target_entry) = self.load_file(&path, report);
+        let (target_source, target_ids) = match target_entry {
+            FileEntry::Missing => {
+                if is_exception(file, &target_pretty_path) {
+                    report.links_ignored_exception += 1;
+                } else {
+                    report.errors += 1;
+                    println!("{}:{}: broken link - `{}`", pretty_path, i, target_pretty_path);
+                }
+                return;
+            }
+            FileEntry::Dir => {
+                // Links to directories show as directory listings when viewing
+                // the docs offline so it's best to avoid them.
+                report.errors += 1;
+                println!(
+                    "{}:{}: directory link to `{}` \
+                     (directory links should use index.html instead)",
+                    pretty_path, i, target_pretty_path
+                );
+                return;
+            }
+            FileEntry::OtherFile => return,
+            FileEntry::Redirect { target } => {
+                let t = target.clone();
+                let (target, redir_entry) = self.load_file(&t, report);
+                match redir_entry {
+                    FileEntry::Missing => {
+                        report.errors += 1;
+                        println!(
+                            "{}:{}: broken redirect from `{}` to `{}`",
+                            pretty_path, i, target_pretty_path, target
+                        );
+                        return;
+                    }
+                    FileEntry::Redirect { target } => {
+                        // Redirect to a redirect, this link checker
+                        // currently doesn't support this, since it would
+                        // require cycle checking, etc.
+                        report.errors += 1;
+                        println!(
+                            "{}:{}: redirect from `{}` to `{}` \
+                             which is also a redirect (not supported)",
+                            pretty_path,
+                            i,
+                            target_pretty_path,
+                            target.display()
+                        );
+                        return;
+                    }
+                    FileEntry::Dir => {
+                        report.errors += 1;
+                        println!(
+                            "{}:{}: redirect from `{}` to `{}` \
+                             which is a directory \
+                             (directory links should use index.html instead)",
+                            pretty_path, i, target_pretty_path, target
+                        );
+                        return;
+                    }
+                    FileEntry::OtherFile => return,
+                    FileEntry::HtmlFile { source, ids } => (source, ids),
+                }
+            }
+            FileEntry::HtmlFile { source, ids } => (source, ids),
+        };
+
+        // Alright, if we've found an HTML file for the target link. If
+        // this is a fragment link, also check that the `id` exists.
+        if let Some(ref fragment) = fragment {
+            // Fragments like `#1-6` are most likely line numbers to be
+            // interpreted by javascript, so we're ignoring these
+            if fragment.splitn(2, '-').all(|f| f.chars().all(|c| c.is_numeric())) {
+                return;
+            }
+
+            parse_ids(&mut target_ids.borrow_mut(), &pretty_path, target_source, report);
+
+            if target_ids.borrow().contains(*fragment) {
+                return;
+            }
+
+            if is_exception(file, &format!("#{}", fragment)) {
+                report.links_ignored_exception += 1;
+            } else {
+                report.errors += 1;
+                print!("{}:{}: broken link fragment ", pretty_path, i);
+                println!("`#{}` pointing to `{}`", fragment, target_pretty_path);
+            };
+        }
+    }
+
     fn check_intra_doc_links(
         &mut self,
         file: &Path,
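One detail of check_url worth seeing in isolation: the fragment is split off at `#` before the query string is dropped at `?`, so the fragment survives even when a query is present. A tiny sketch with made-up input:

fn main() {
    let url = "path/page.html?rev=2#section-3"; // hypothetical link
    let (url, fragment) = match url.split_once('#') {
        None => (url, None),
        Some((url, fragment)) => (url, Some(fragment)),
    };
    // `splitn` always yields at least one piece, even with no `?` present.
    let url = url.splitn(2, '?').next().unwrap();
    assert_eq!(url, "path/page.html");
    assert_eq!(fragment, Some("section-3"));
}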
@@ -496,59 +499,93 @@ fn maybe_redirect(source: &str) -> Option<String> {
     find_redirect(REDIRECT_RUSTDOC).or_else(|| find_redirect(REDIRECT_MDBOOK))
 }
 
-fn with_attrs_in_source<F: FnMut(&str, usize, &str)>(source: &str, attr: &str, mut f: F) {
-    let mut base = "";
-    for (i, mut line) in source.lines().enumerate() {
-        while let Some(j) = line.find(attr) {
-            let rest = &line[j + attr.len()..];
-            // The base tag should always be the first link in the document so
-            // we can get away with using one pass.
-            let is_base = line[..j].ends_with("<base");
-            line = rest;
-            let pos_equals = match rest.find('=') {
-                Some(i) => i,
-                None => continue,
-            };
-            if rest[..pos_equals].trim_start_matches(' ') != "" {
-                continue;
-            }
+fn parse_html<Sink: TokenSink>(source: &str, sink: Sink) -> Sink {
+    let tendril: ByteTendril = source.as_bytes().into();
+    let mut input = BufferQueue::new();
+    input.push_back(tendril.try_reinterpret().unwrap());
 
-            let rest = &rest[pos_equals + 1..];
+    let mut tok = Tokenizer::new(sink, TokenizerOpts::default());
+    let _ = tok.feed(&mut input);
+    assert!(input.is_empty());
+    tok.end();
+    tok.sink
+}
 
-            let pos_quote = match rest.find(&['"', '\''][..]) {
-                Some(i) => i,
-                None => continue,
-            };
-            let quote_delim = rest.as_bytes()[pos_quote] as char;
+#[derive(Default)]
+struct AttrCollector {
+    attr_name: &'static [u8],
+    base: Option<String>,
+    found_attrs: Vec<(u64, String)>,
+    /// Tracks whether or not it is inside a <script> tag.
+    ///
+    /// A lot of our sources have JSON script tags which have HTML embedded
+    /// within, but that cannot be parsed or processed correctly (since it is
+    /// JSON, not HTML). I think the sink is supposed to return
+    /// `TokenSinkResult::Script(…)` (and then maybe switch parser?), but I
+    /// don't fully understand the best way to use that, and this seems good
+    /// enough for now.
+    in_script: bool,
+}
 
-            if rest[..pos_quote].trim_start_matches(' ') != "" {
-                continue;
-            }
+impl TokenSink for AttrCollector {
+    type Handle = ();
+
+    fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult<()> {
+        match token {
+            TagToken(tag) => {
+                let tag_name = tag.name.as_bytes();
+                if tag_name == b"base" {
+                    if let Some(href) =
+                        tag.attrs.iter().find(|attr| attr.name.local.as_bytes() == b"href")
+                    {
+                        self.base = Some(href.value.to_string());
+                    }
+                    return TokenSinkResult::Continue;
+                } else if tag_name == b"script" {
+                    self.in_script = !self.in_script;
+                }
+                if self.in_script {
+                    return TokenSinkResult::Continue;
+                }
+                for attr in tag.attrs.iter() {
+                    let name = attr.name.local.as_bytes();
+                    if name == self.attr_name {
+                        let url = attr.value.to_string();
+                        self.found_attrs.push((line_number, url));
+                    }
+                }
+            }
-            let rest = &rest[pos_quote + 1..];
-            let url = match rest.find(quote_delim) {
-                Some(i) => &rest[..i],
-                None => continue,
-            };
-            if is_base {
-                base = url;
-                continue;
-            }
-            f(url, i, base)
+            // Note: ParseError is pretty noisy. It seems html5ever does not
+            // particularly like some kinds of HTML comments.
+            _ => {}
         }
+        TokenSinkResult::Continue
     }
 }
 
+/// Retrieves href="..." attributes from HTML elements.
+fn get_urls(source: &str) -> (Option<String>, Vec<(u64, String)>) {
+    let collector = AttrCollector { attr_name: b"href", ..AttrCollector::default() };
+    let sink = parse_html(source, collector);
+    (sink.base, sink.found_attrs)
+}
+
+/// Retrieves id="..." attributes from HTML elements.
 fn parse_ids(ids: &mut HashSet<String>, file: &str, source: &str, report: &mut Report) {
-    if ids.is_empty() {
-        with_attrs_in_source(source, " id", |fragment, i, _| {
-            let frag = fragment.trim_start_matches('#').to_owned();
-            let encoded = small_url_encode(&frag);
-            if !ids.insert(frag) {
-                report.errors += 1;
-                println!("{}:{}: id is not unique: `{}`", file, i, fragment);
-            }
-            // Just in case, we also add the encoded id.
-            ids.insert(encoded);
-        });
-    }
+    if !ids.is_empty() {
+        // ids have already been parsed
+        return;
+    }
+
+    let collector = AttrCollector { attr_name: b"id", ..AttrCollector::default() };
+    let sink = parse_html(source, collector);
+    for (line_number, id) in sink.found_attrs {
+        let encoded = small_url_encode(&id);
+        if let Some(id) = ids.replace(id) {
+            report.errors += 1;
+            println!("{}:{}: id is not unique: `{}`", file, line_number, id);
+        }
+        // Just in case, we also add the encoded id.
+        ids.insert(encoded);
+    }
 }
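
Putting the new pieces together, a usage sketch (assuming get_urls and the types above are in scope): the <base href> is captured separately rather than reported as a link, and tags the tokenizer encounters inside <script> never reach found_attrs thanks to the in_script flag.

fn main() {
    let html = r#"<html><head><base href="../"></head><body>
        <a href="real.html">ok</a>
        <script><a href="fake.html"></script>
        </body></html>"#;
    let (base, urls) = get_urls(html);
    assert_eq!(base.as_deref(), Some("../"));
    // Only the real anchor is collected; the tag-shaped text inside
    // <script> is tokenized but suppressed by `in_script`.
    assert_eq!(urls.len(), 1);
    assert_eq!(urls[0].1, "real.html");
}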