Switch linkchecker to use html5ever for HTML parsing.

The existing regex-based HTML parsing was too primitive to handle HTML
content correctly. Some books contain literal `href="…"` text that should
not be validated, because it is part of the visible text, not actual HTML.
Eric Huss 2024-02-08 08:35:12 -08:00
parent bf6a1b1245
commit 776590b14e
3 changed files with 244 additions and 205 deletions
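
For illustration only (not part of the commit), the sketch below runs book-like HTML through the same html5ever 0.26 tokenizer API this change adopts. The `Hrefs` sink, the sample HTML, and the file names in it are made up; the point is that the tokenizer reports only the `href` attribute on the real `<a>` tag, while a regex scan for `href="…"` would also flag the literal text inside the `<code>` element.

use html5ever::tendril::ByteTendril;
use html5ever::tokenizer::{
    BufferQueue, TagToken, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
};

/// Hypothetical sink that records every `href` attribute found on a real tag.
#[derive(Default)]
struct Hrefs(Vec<String>);

impl TokenSink for Hrefs {
    type Handle = ();

    fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
        if let TagToken(tag) = token {
            for attr in tag.attrs.iter() {
                if attr.name.local.as_bytes() == b"href" {
                    self.0.push(attr.value.to_string());
                }
            }
        }
        TokenSinkResult::Continue
    }
}

fn main() {
    // Book-like content: the first `href="…"` is visible text inside <code>,
    // not markup, and must not be reported as a link to validate.
    let source = r#"<p>Write links as <code>href="missing.html"</code>.</p>
<a href="ch02.html">Next chapter</a>"#;

    let tendril: ByteTendril = source.as_bytes().into();
    let mut input = BufferQueue::new();
    input.push_back(tendril.try_reinterpret().unwrap());

    let mut tok = Tokenizer::new(Hrefs::default(), TokenizerOpts::default());
    let _ = tok.feed(&mut input);
    tok.end();

    // Prints ["ch02.html"]; a regex scan would also report "missing.html".
    println!("{:?}", tok.sink.0);
}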


@@ -2274,6 +2274,7 @@ dependencies = [
name = "linkchecker"
version = "0.1.0"
dependencies = [
"html5ever",
"once_cell",
"regex",
]


@@ -10,3 +10,4 @@ path = "main.rs"
[dependencies]
regex = "1"
once_cell = "1"
html5ever = "0.26.0"


@@ -14,6 +14,12 @@
//! A few exceptions are allowed as there's known bugs in rustdoc, but this
//! should catch the majority of "broken link" cases.
use html5ever::tendril::ByteTendril;
use html5ever::tokenizer::{
BufferQueue, TagToken, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
};
use once_cell::sync::Lazy;
use regex::Regex;
use std::cell::RefCell;
use std::collections::{HashMap, HashSet};
use std::env;
@@ -23,9 +29,6 @@
use std::rc::Rc;
use std::time::Instant;
use once_cell::sync::Lazy;
use regex::Regex;
// Add linkcheck exceptions here
// If at all possible you should use intra-doc links to avoid linkcheck issues. These
// are cases where that does not work
@@ -182,163 +185,10 @@ fn check(&mut self, file: &Path, report: &mut Report) {
}
};
// Search for anything that's the regex 'href[ ]*=[ ]*".*?"'
with_attrs_in_source(&source, " href", |url, i, base| {
// Ignore external URLs
if url.starts_with("http:")
|| url.starts_with("https:")
|| url.starts_with("javascript:")
|| url.starts_with("ftp:")
|| url.starts_with("irc:")
|| url.starts_with("data:")
|| url.starts_with("mailto:")
{
report.links_ignored_external += 1;
return;
}
report.links_checked += 1;
let (url, fragment) = match url.split_once('#') {
None => (url, None),
Some((url, fragment)) => (url, Some(fragment)),
};
// NB: the `splitn` always succeeds, even if the delimiter is not present.
let url = url.splitn(2, '?').next().unwrap();
// Once we've plucked out the URL, parse it using our base url and
// then try to extract a file path.
let mut path = file.to_path_buf();
if !base.is_empty() || !url.is_empty() {
path.pop();
for part in Path::new(base).join(url).components() {
match part {
Component::Prefix(_) | Component::RootDir => {
// Avoid absolute paths as they make the docs not
// relocatable by making assumptions on where the docs
// are hosted relative to the site root.
report.errors += 1;
println!(
"{}:{}: absolute path - {}",
pretty_path,
i + 1,
Path::new(base).join(url).display()
);
return;
}
Component::CurDir => {}
Component::ParentDir => {
path.pop();
}
Component::Normal(s) => {
path.push(s);
}
}
}
}
let (target_pretty_path, target_entry) = self.load_file(&path, report);
let (target_source, target_ids) = match target_entry {
FileEntry::Missing => {
if is_exception(file, &target_pretty_path) {
report.links_ignored_exception += 1;
} else {
report.errors += 1;
println!(
"{}:{}: broken link - `{}`",
pretty_path,
i + 1,
target_pretty_path
);
}
return;
}
FileEntry::Dir => {
// Links to directories show as directory listings when viewing
// the docs offline so it's best to avoid them.
report.errors += 1;
println!(
"{}:{}: directory link to `{}` \
(directory links should use index.html instead)",
pretty_path,
i + 1,
target_pretty_path
);
return;
}
FileEntry::OtherFile => return,
FileEntry::Redirect { target } => {
let t = target.clone();
let (target, redir_entry) = self.load_file(&t, report);
match redir_entry {
FileEntry::Missing => {
report.errors += 1;
println!(
"{}:{}: broken redirect from `{}` to `{}`",
pretty_path,
i + 1,
target_pretty_path,
target
);
return;
}
FileEntry::Redirect { target } => {
// Redirect to a redirect, this link checker
// currently doesn't support this, since it would
// require cycle checking, etc.
report.errors += 1;
println!(
"{}:{}: redirect from `{}` to `{}` \
which is also a redirect (not supported)",
pretty_path,
i + 1,
target_pretty_path,
target.display()
);
return;
}
FileEntry::Dir => {
report.errors += 1;
println!(
"{}:{}: redirect from `{}` to `{}` \
which is a directory \
(directory links should use index.html instead)",
pretty_path,
i + 1,
target_pretty_path,
target
);
return;
}
FileEntry::OtherFile => return,
FileEntry::HtmlFile { source, ids } => (source, ids),
}
}
FileEntry::HtmlFile { source, ids } => (source, ids),
};
// Alright, we've found an HTML file for the target link. If this is a
// fragment link, also check that the `id` exists.
if let Some(ref fragment) = fragment {
// Fragments like `#1-6` are most likely line numbers to be
// interpreted by javascript, so we're ignoring these
if fragment.splitn(2, '-').all(|f| f.chars().all(|c| c.is_numeric())) {
return;
}
parse_ids(&mut target_ids.borrow_mut(), &pretty_path, target_source, report);
if target_ids.borrow().contains(*fragment) {
return;
}
if is_exception(file, &format!("#{}", fragment)) {
report.links_ignored_exception += 1;
} else {
report.errors += 1;
print!("{}:{}: broken link fragment ", pretty_path, i + 1);
println!("`#{}` pointing to `{}`", fragment, target_pretty_path);
};
}
});
let (base, urls) = get_urls(&source);
for (i, url) in urls {
self.check_url(file, &pretty_path, report, &base, i, &url);
}
self.check_intra_doc_links(file, &pretty_path, &source, report);
@@ -350,6 +200,159 @@ fn check(&mut self, file: &Path, report: &mut Report) {
}
}
fn check_url(
&mut self,
file: &Path,
pretty_path: &str,
report: &mut Report,
base: &Option<String>,
i: u64,
url: &str,
) {
// Ignore external URLs
if url.starts_with("http:")
|| url.starts_with("https:")
|| url.starts_with("javascript:")
|| url.starts_with("ftp:")
|| url.starts_with("irc:")
|| url.starts_with("data:")
|| url.starts_with("mailto:")
{
report.links_ignored_external += 1;
return;
}
report.links_checked += 1;
let (url, fragment) = match url.split_once('#') {
None => (url, None),
Some((url, fragment)) => (url, Some(fragment)),
};
// NB: the `splitn` always succeeds, even if the delimiter is not present.
let url = url.splitn(2, '?').next().unwrap();
// Once we've plucked out the URL, parse it using our base url and
// then try to extract a file path.
let mut path = file.to_path_buf();
if base.is_some() || !url.is_empty() {
let base = base.as_deref().unwrap_or("");
path.pop();
for part in Path::new(base).join(url).components() {
match part {
Component::Prefix(_) | Component::RootDir => {
// Avoid absolute paths as they make the docs not
// relocatable by making assumptions on where the docs
// are hosted relative to the site root.
report.errors += 1;
println!(
"{}:{}: absolute path - {}",
pretty_path,
i,
Path::new(base).join(url).display()
);
return;
}
Component::CurDir => {}
Component::ParentDir => {
path.pop();
}
Component::Normal(s) => {
path.push(s);
}
}
}
}
let (target_pretty_path, target_entry) = self.load_file(&path, report);
let (target_source, target_ids) = match target_entry {
FileEntry::Missing => {
if is_exception(file, &target_pretty_path) {
report.links_ignored_exception += 1;
} else {
report.errors += 1;
println!("{}:{}: broken link - `{}`", pretty_path, i, target_pretty_path);
}
return;
}
FileEntry::Dir => {
// Links to directories show as directory listings when viewing
// the docs offline so it's best to avoid them.
report.errors += 1;
println!(
"{}:{}: directory link to `{}` \
(directory links should use index.html instead)",
pretty_path, i, target_pretty_path
);
return;
}
FileEntry::OtherFile => return,
FileEntry::Redirect { target } => {
let t = target.clone();
let (target, redir_entry) = self.load_file(&t, report);
match redir_entry {
FileEntry::Missing => {
report.errors += 1;
println!(
"{}:{}: broken redirect from `{}` to `{}`",
pretty_path, i, target_pretty_path, target
);
return;
}
FileEntry::Redirect { target } => {
// Redirect to a redirect, this link checker
// currently doesn't support this, since it would
// require cycle checking, etc.
report.errors += 1;
println!(
"{}:{}: redirect from `{}` to `{}` \
which is also a redirect (not supported)",
pretty_path,
i,
target_pretty_path,
target.display()
);
return;
}
FileEntry::Dir => {
report.errors += 1;
println!(
"{}:{}: redirect from `{}` to `{}` \
which is a directory \
(directory links should use index.html instead)",
pretty_path, i, target_pretty_path, target
);
return;
}
FileEntry::OtherFile => return,
FileEntry::HtmlFile { source, ids } => (source, ids),
}
}
FileEntry::HtmlFile { source, ids } => (source, ids),
};
// Alright, we've found an HTML file for the target link. If this is a
// fragment link, also check that the `id` exists.
if let Some(ref fragment) = fragment {
// Fragments like `#1-6` are most likely line numbers to be
// interpreted by javascript, so we're ignoring these
if fragment.splitn(2, '-').all(|f| f.chars().all(|c| c.is_numeric())) {
return;
}
parse_ids(&mut target_ids.borrow_mut(), &pretty_path, target_source, report);
if target_ids.borrow().contains(*fragment) {
return;
}
if is_exception(file, &format!("#{}", fragment)) {
report.links_ignored_exception += 1;
} else {
report.errors += 1;
print!("{}:{}: broken link fragment ", pretty_path, i);
println!("`#{}` pointing to `{}`", fragment, target_pretty_path);
};
}
}
fn check_intra_doc_links(
&mut self,
file: &Path,
@@ -496,59 +499,93 @@ fn maybe_redirect(source: &str) -> Option<String> {
find_redirect(REDIRECT_RUSTDOC).or_else(|| find_redirect(REDIRECT_MDBOOK))
}
fn with_attrs_in_source<F: FnMut(&str, usize, &str)>(source: &str, attr: &str, mut f: F) {
    let mut base = "";
    for (i, mut line) in source.lines().enumerate() {
        while let Some(j) = line.find(attr) {
            let rest = &line[j + attr.len()..];
            // The base tag should always be the first link in the document so
            // we can get away with using one pass.
            let is_base = line[..j].ends_with("<base");
            line = rest;
            let pos_equals = match rest.find('=') {
                Some(i) => i,
                None => continue,
            };
            if rest[..pos_equals].trim_start_matches(' ') != "" {
                continue;
            }
            let rest = &rest[pos_equals + 1..];
            let pos_quote = match rest.find(&['"', '\''][..]) {
                Some(i) => i,
                None => continue,
            };
            let quote_delim = rest.as_bytes()[pos_quote] as char;
            if rest[..pos_quote].trim_start_matches(' ') != "" {
                continue;
            }
            let rest = &rest[pos_quote + 1..];
            let url = match rest.find(quote_delim) {
                Some(i) => &rest[..i],
                None => continue,
            };
            if is_base {
                base = url;
                continue;
            }
            f(url, i, base)
        }
    }
}
fn parse_html<Sink: TokenSink>(source: &str, sink: Sink) -> Sink {
    let tendril: ByteTendril = source.as_bytes().into();
    let mut input = BufferQueue::new();
    input.push_back(tendril.try_reinterpret().unwrap());
    let mut tok = Tokenizer::new(sink, TokenizerOpts::default());
    let _ = tok.feed(&mut input);
    assert!(input.is_empty());
    tok.end();
    tok.sink
}
#[derive(Default)]
struct AttrCollector {
    attr_name: &'static [u8],
    base: Option<String>,
    found_attrs: Vec<(u64, String)>,
    /// Tracks whether or not it is inside a <script> tag.
    ///
    /// A lot of our sources have JSON script tags which have HTML embedded
    /// within, but that cannot be parsed or processed correctly (since it is
    /// JSON, not HTML). I think the sink is supposed to return
    /// `TokenSinkResult::Script(…)` (and then maybe switch parser?), but I
    /// don't fully understand the best way to use that, and this seems good
    /// enough for now.
    in_script: bool,
}
impl TokenSink for AttrCollector {
    type Handle = ();
    fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult<()> {
        match token {
            TagToken(tag) => {
                let tag_name = tag.name.as_bytes();
                if tag_name == b"base" {
                    if let Some(href) =
                        tag.attrs.iter().find(|attr| attr.name.local.as_bytes() == b"href")
                    {
                        self.base = Some(href.value.to_string());
                    }
                    return TokenSinkResult::Continue;
                } else if tag_name == b"script" {
                    self.in_script = !self.in_script;
                }
                if self.in_script {
                    return TokenSinkResult::Continue;
                }
                for attr in tag.attrs.iter() {
                    let name = attr.name.local.as_bytes();
                    if name == self.attr_name {
                        let url = attr.value.to_string();
                        self.found_attrs.push((line_number, url));
                    }
                }
            }
            // Note: ParseError is pretty noisy. It seems html5ever does not
            // particularly like some kinds of HTML comments.
            _ => {}
        }
        TokenSinkResult::Continue
    }
}
/// Retrieves href="..." attributes from HTML elements.
fn get_urls(source: &str) -> (Option<String>, Vec<(u64, String)>) {
    let collector = AttrCollector { attr_name: b"href", ..AttrCollector::default() };
    let sink = parse_html(source, collector);
    (sink.base, sink.found_attrs)
}
/// Retrieves id="..." attributes from HTML elements.
fn parse_ids(ids: &mut HashSet<String>, file: &str, source: &str, report: &mut Report) {
    if ids.is_empty() {
        with_attrs_in_source(source, " id", |fragment, i, _| {
            let frag = fragment.trim_start_matches('#').to_owned();
            let encoded = small_url_encode(&frag);
            if !ids.insert(frag) {
                report.errors += 1;
                println!("{}:{}: id is not unique: `{}`", file, i, fragment);
            }
            // Just in case, we also add the encoded id.
            ids.insert(encoded);
        });
    }
    if !ids.is_empty() {
        // ids have already been parsed
        return;
    }
    let collector = AttrCollector { attr_name: b"id", ..AttrCollector::default() };
    let sink = parse_html(source, collector);
    for (line_number, id) in sink.found_attrs {
        let encoded = small_url_encode(&id);
        if let Some(id) = ids.replace(id) {
            report.errors += 1;
            println!("{}:{}: id is not unique: `{}`", file, line_number, id);
        }
        // Just in case, we also add the encoded id.
        ids.insert(encoded);
    }
}