Switch linkchecker to use html5ever for html parsing.
The existing regex-based HTML parsing was too primitive to handle HTML content correctly. Some books contain legitimate `href="…"` text which should not be validated, because it is part of the prose, not an actual HTML attribute.
parent bf6a1b1245
commit 776590b14e
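
As a minimal sketch of the failure mode (not part of the commit, hypothetical values throughout): the `href` below is plain text inside a `<code>` element, yet the pattern quoted in the old code's comment still matches it.

// Why a regex scan over raw HTML misfires (standalone sketch).
use regex::Regex;

fn main() {
    // Here `href="foo.html"` is document text inside <code>, not markup.
    let html = r#"<p>Write links as <code>href="foo.html"</code>.</p>"#;
    let re = Regex::new(r#"href[ ]*=[ ]*".*?""#).unwrap();
    // A text scan cannot tell prose from markup, so it "finds" a link:
    assert!(re.is_match(html));
}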
Cargo.lock:
@@ -2274,6 +2274,7 @@ dependencies = [
 name = "linkchecker"
 version = "0.1.0"
 dependencies = [
+ "html5ever",
  "once_cell",
  "regex",
 ]
Cargo.toml:
@@ -10,3 +10,4 @@ path = "main.rs"
 [dependencies]
 regex = "1"
 once_cell = "1"
+html5ever = "0.26.0"
main.rs:
@@ -14,6 +14,12 @@
 //! A few exceptions are allowed as there's known bugs in rustdoc, but this
 //! should catch the majority of "broken link" cases.
 
+use html5ever::tendril::ByteTendril;
+use html5ever::tokenizer::{
+    BufferQueue, TagToken, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
+};
+use once_cell::sync::Lazy;
+use regex::Regex;
 use std::cell::RefCell;
 use std::collections::{HashMap, HashSet};
 use std::env;
@@ -23,9 +29,6 @@
 use std::rc::Rc;
 use std::time::Instant;
 
-use once_cell::sync::Lazy;
-use regex::Regex;
-
 // Add linkcheck exceptions here
 // If at all possible you should use intra-doc links to avoid linkcheck issues. These
 // are cases where that does not work
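
For orientation before the code below (a minimal sketch, assuming html5ever 0.26): the tokenizer drives a caller-supplied TokenSink, handing it every token together with a line number. That is all the checker needs, since it only inspects tag attributes and never builds a DOM.

use html5ever::tendril::ByteTendril;
use html5ever::tokenizer::{
    BufferQueue, TagToken, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
};

// A trivial sink that counts tags; the commit's collector gathers attributes.
struct TagCounter {
    tags: usize,
}

impl TokenSink for TagCounter {
    type Handle = ();

    fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
        if let TagToken(_) = token {
            self.tags += 1;
        }
        TokenSinkResult::Continue
    }
}

fn main() {
    let source = "<p><a href='a.html'>one</a></p>";
    let tendril: ByteTendril = source.as_bytes().into();
    let mut input = BufferQueue::new();
    input.push_back(tendril.try_reinterpret().unwrap());

    let mut tok = Tokenizer::new(TagCounter { tags: 0 }, TokenizerOpts::default());
    let _ = tok.feed(&mut input);
    tok.end();
    println!("saw {} tags", tok.sink.tags); // <p>, <a>, </a>, </p> => 4
}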
@@ -182,163 +185,10 @@ fn check(&mut self, file: &Path, report: &mut Report) {
             }
         };
 
-        // Search for anything that's the regex 'href[ ]*=[ ]*".*?"'
-        with_attrs_in_source(&source, " href", |url, i, base| {
-            // Ignore external URLs
-            if url.starts_with("http:")
-                || url.starts_with("https:")
-                || url.starts_with("javascript:")
-                || url.starts_with("ftp:")
-                || url.starts_with("irc:")
-                || url.starts_with("data:")
-                || url.starts_with("mailto:")
-            {
-                report.links_ignored_external += 1;
-                return;
-            }
-            report.links_checked += 1;
-            let (url, fragment) = match url.split_once('#') {
-                None => (url, None),
-                Some((url, fragment)) => (url, Some(fragment)),
-            };
-            // NB: the `splitn` always succeeds, even if the delimiter is not present.
-            let url = url.splitn(2, '?').next().unwrap();
-
-            // Once we've plucked out the URL, parse it using our base url and
-            // then try to extract a file path.
-            let mut path = file.to_path_buf();
-            if !base.is_empty() || !url.is_empty() {
-                path.pop();
-                for part in Path::new(base).join(url).components() {
-                    match part {
-                        Component::Prefix(_) | Component::RootDir => {
-                            // Avoid absolute paths as they make the docs not
-                            // relocatable by making assumptions on where the docs
-                            // are hosted relative to the site root.
-                            report.errors += 1;
-                            println!(
-                                "{}:{}: absolute path - {}",
-                                pretty_path,
-                                i + 1,
-                                Path::new(base).join(url).display()
-                            );
-                            return;
-                        }
-                        Component::CurDir => {}
-                        Component::ParentDir => {
-                            path.pop();
-                        }
-                        Component::Normal(s) => {
-                            path.push(s);
-                        }
-                    }
-                }
-            }
-
-            let (target_pretty_path, target_entry) = self.load_file(&path, report);
-            let (target_source, target_ids) = match target_entry {
-                FileEntry::Missing => {
-                    if is_exception(file, &target_pretty_path) {
-                        report.links_ignored_exception += 1;
-                    } else {
-                        report.errors += 1;
-                        println!(
-                            "{}:{}: broken link - `{}`",
-                            pretty_path,
-                            i + 1,
-                            target_pretty_path
-                        );
-                    }
-                    return;
-                }
-                FileEntry::Dir => {
-                    // Links to directories show as directory listings when viewing
-                    // the docs offline so it's best to avoid them.
-                    report.errors += 1;
-                    println!(
-                        "{}:{}: directory link to `{}` \
-                         (directory links should use index.html instead)",
-                        pretty_path,
-                        i + 1,
-                        target_pretty_path
-                    );
-                    return;
-                }
-                FileEntry::OtherFile => return,
-                FileEntry::Redirect { target } => {
-                    let t = target.clone();
-                    let (target, redir_entry) = self.load_file(&t, report);
-                    match redir_entry {
-                        FileEntry::Missing => {
-                            report.errors += 1;
-                            println!(
-                                "{}:{}: broken redirect from `{}` to `{}`",
-                                pretty_path,
-                                i + 1,
-                                target_pretty_path,
-                                target
-                            );
-                            return;
-                        }
-                        FileEntry::Redirect { target } => {
-                            // Redirect to a redirect, this link checker
-                            // currently doesn't support this, since it would
-                            // require cycle checking, etc.
-                            report.errors += 1;
-                            println!(
-                                "{}:{}: redirect from `{}` to `{}` \
-                                 which is also a redirect (not supported)",
-                                pretty_path,
-                                i + 1,
-                                target_pretty_path,
-                                target.display()
-                            );
-                            return;
-                        }
-                        FileEntry::Dir => {
-                            report.errors += 1;
-                            println!(
-                                "{}:{}: redirect from `{}` to `{}` \
-                                 which is a directory \
-                                 (directory links should use index.html instead)",
-                                pretty_path,
-                                i + 1,
-                                target_pretty_path,
-                                target
-                            );
-                            return;
-                        }
-                        FileEntry::OtherFile => return,
-                        FileEntry::HtmlFile { source, ids } => (source, ids),
-                    }
-                }
-                FileEntry::HtmlFile { source, ids } => (source, ids),
-            };
-
-            // Alright, if we've found an HTML file for the target link. If
-            // this is a fragment link, also check that the `id` exists.
-            if let Some(ref fragment) = fragment {
-                // Fragments like `#1-6` are most likely line numbers to be
-                // interpreted by javascript, so we're ignoring these
-                if fragment.splitn(2, '-').all(|f| f.chars().all(|c| c.is_numeric())) {
-                    return;
-                }
-
-                parse_ids(&mut target_ids.borrow_mut(), &pretty_path, target_source, report);
-
-                if target_ids.borrow().contains(*fragment) {
-                    return;
-                }
-
-                if is_exception(file, &format!("#{}", fragment)) {
-                    report.links_ignored_exception += 1;
-                } else {
-                    report.errors += 1;
-                    print!("{}:{}: broken link fragment ", pretty_path, i + 1);
-                    println!("`#{}` pointing to `{}`", fragment, target_pretty_path);
-                };
-            }
-        });
+        let (base, urls) = get_urls(&source);
+        for (i, url) in urls {
+            self.check_url(file, &pretty_path, report, &base, i, &url);
+        }
 
         self.check_intra_doc_links(file, &pretty_path, &source, report);
 
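The removed closure above (and check_url added below) resolves links the same way: join the <base href> value with the link, then walk Components to apply `.` and `..` against the containing file's directory, purely lexically, never touching the filesystem. A toy run with made-up paths:

use std::path::{Component, Path, PathBuf};

fn main() {
    let mut path = PathBuf::from("build/doc/std/vec/index.html");
    path.pop(); // drop the file name, keeping the containing directory
    let (base, url) = ("../", "string/struct.String.html"); // hypothetical
    for part in Path::new(base).join(url).components() {
        match part {
            // absolute paths are reported as errors by the checker
            Component::Prefix(_) | Component::RootDir => unreachable!(),
            Component::CurDir => {}
            Component::ParentDir => {
                path.pop();
            }
            Component::Normal(s) => path.push(s),
        }
    }
    assert_eq!(path, PathBuf::from("build/doc/std/string/struct.String.html"));
}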
@@ -350,6 +200,159 @@ fn check(&mut self, file: &Path, report: &mut Report) {
         }
     }
 
+    fn check_url(
+        &mut self,
+        file: &Path,
+        pretty_path: &str,
+        report: &mut Report,
+        base: &Option<String>,
+        i: u64,
+        url: &str,
+    ) {
+        // Ignore external URLs
+        if url.starts_with("http:")
+            || url.starts_with("https:")
+            || url.starts_with("javascript:")
+            || url.starts_with("ftp:")
+            || url.starts_with("irc:")
+            || url.starts_with("data:")
+            || url.starts_with("mailto:")
+        {
+            report.links_ignored_external += 1;
+            return;
+        }
+        report.links_checked += 1;
+        let (url, fragment) = match url.split_once('#') {
+            None => (url, None),
+            Some((url, fragment)) => (url, Some(fragment)),
+        };
+        // NB: the `splitn` always succeeds, even if the delimiter is not present.
+        let url = url.splitn(2, '?').next().unwrap();
+
+        // Once we've plucked out the URL, parse it using our base url and
+        // then try to extract a file path.
+        let mut path = file.to_path_buf();
+        if base.is_some() || !url.is_empty() {
+            let base = base.as_deref().unwrap_or("");
+            path.pop();
+            for part in Path::new(base).join(url).components() {
+                match part {
+                    Component::Prefix(_) | Component::RootDir => {
+                        // Avoid absolute paths as they make the docs not
+                        // relocatable by making assumptions on where the docs
+                        // are hosted relative to the site root.
+                        report.errors += 1;
+                        println!(
+                            "{}:{}: absolute path - {}",
+                            pretty_path,
+                            i,
+                            Path::new(base).join(url).display()
+                        );
+                        return;
+                    }
+                    Component::CurDir => {}
+                    Component::ParentDir => {
+                        path.pop();
+                    }
+                    Component::Normal(s) => {
+                        path.push(s);
+                    }
+                }
+            }
+        }
+
+        let (target_pretty_path, target_entry) = self.load_file(&path, report);
+        let (target_source, target_ids) = match target_entry {
+            FileEntry::Missing => {
+                if is_exception(file, &target_pretty_path) {
+                    report.links_ignored_exception += 1;
+                } else {
+                    report.errors += 1;
+                    println!("{}:{}: broken link - `{}`", pretty_path, i, target_pretty_path);
+                }
+                return;
+            }
+            FileEntry::Dir => {
+                // Links to directories show as directory listings when viewing
+                // the docs offline so it's best to avoid them.
+                report.errors += 1;
+                println!(
+                    "{}:{}: directory link to `{}` \
+                     (directory links should use index.html instead)",
+                    pretty_path, i, target_pretty_path
+                );
+                return;
+            }
+            FileEntry::OtherFile => return,
+            FileEntry::Redirect { target } => {
+                let t = target.clone();
+                let (target, redir_entry) = self.load_file(&t, report);
+                match redir_entry {
+                    FileEntry::Missing => {
+                        report.errors += 1;
+                        println!(
+                            "{}:{}: broken redirect from `{}` to `{}`",
+                            pretty_path, i, target_pretty_path, target
+                        );
+                        return;
+                    }
+                    FileEntry::Redirect { target } => {
+                        // Redirect to a redirect, this link checker
+                        // currently doesn't support this, since it would
+                        // require cycle checking, etc.
+                        report.errors += 1;
+                        println!(
+                            "{}:{}: redirect from `{}` to `{}` \
+                             which is also a redirect (not supported)",
+                            pretty_path,
+                            i,
+                            target_pretty_path,
+                            target.display()
+                        );
+                        return;
+                    }
+                    FileEntry::Dir => {
+                        report.errors += 1;
+                        println!(
+                            "{}:{}: redirect from `{}` to `{}` \
+                             which is a directory \
+                             (directory links should use index.html instead)",
+                            pretty_path, i, target_pretty_path, target
+                        );
+                        return;
+                    }
+                    FileEntry::OtherFile => return,
+                    FileEntry::HtmlFile { source, ids } => (source, ids),
+                }
+            }
+            FileEntry::HtmlFile { source, ids } => (source, ids),
+        };
+
+        // Alright, if we've found an HTML file for the target link. If
+        // this is a fragment link, also check that the `id` exists.
+        if let Some(ref fragment) = fragment {
+            // Fragments like `#1-6` are most likely line numbers to be
+            // interpreted by javascript, so we're ignoring these
+            if fragment.splitn(2, '-').all(|f| f.chars().all(|c| c.is_numeric())) {
+                return;
+            }
+
+            parse_ids(&mut target_ids.borrow_mut(), &pretty_path, target_source, report);
+
+            if target_ids.borrow().contains(*fragment) {
+                return;
+            }
+
+            if is_exception(file, &format!("#{}", fragment)) {
+                report.links_ignored_exception += 1;
+            } else {
+                report.errors += 1;
+                print!("{}:{}: broken link fragment ", pretty_path, i);
+                println!("`#{}` pointing to `{}`", fragment, target_pretty_path);
+            };
+        }
+    }
+
     fn check_intra_doc_links(
         &mut self,
         file: &Path,
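One detail of check_url worth seeing in isolation: the fragment is split off at `#` before the query string is dropped at `?`, so the fragment survives even when a query is present. A tiny sketch with made-up input:

fn main() {
    let url = "path/page.html?rev=2#section-3"; // hypothetical link
    let (url, fragment) = match url.split_once('#') {
        None => (url, None),
        Some((url, fragment)) => (url, Some(fragment)),
    };
    // `splitn` always yields at least one piece, even with no `?` present.
    let url = url.splitn(2, '?').next().unwrap();
    assert_eq!(url, "path/page.html");
    assert_eq!(fragment, Some("section-3"));
}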
@@ -496,59 +499,93 @@ fn maybe_redirect(source: &str) -> Option<String> {
     find_redirect(REDIRECT_RUSTDOC).or_else(|| find_redirect(REDIRECT_MDBOOK))
 }
 
-fn with_attrs_in_source<F: FnMut(&str, usize, &str)>(source: &str, attr: &str, mut f: F) {
-    let mut base = "";
-    for (i, mut line) in source.lines().enumerate() {
-        while let Some(j) = line.find(attr) {
-            let rest = &line[j + attr.len()..];
-            // The base tag should always be the first link in the document so
-            // we can get away with using one pass.
-            let is_base = line[..j].ends_with("<base");
-            line = rest;
-            let pos_equals = match rest.find('=') {
-                Some(i) => i,
-                None => continue,
-            };
-            if rest[..pos_equals].trim_start_matches(' ') != "" {
-                continue;
-            }
+fn parse_html<Sink: TokenSink>(source: &str, sink: Sink) -> Sink {
+    let tendril: ByteTendril = source.as_bytes().into();
+    let mut input = BufferQueue::new();
+    input.push_back(tendril.try_reinterpret().unwrap());
 
-            let rest = &rest[pos_equals + 1..];
+    let mut tok = Tokenizer::new(sink, TokenizerOpts::default());
+    let _ = tok.feed(&mut input);
+    assert!(input.is_empty());
+    tok.end();
+    tok.sink
+}
 
-            let pos_quote = match rest.find(&['"', '\''][..]) {
-                Some(i) => i,
-                None => continue,
-            };
-            let quote_delim = rest.as_bytes()[pos_quote] as char;
+#[derive(Default)]
+struct AttrCollector {
+    attr_name: &'static [u8],
+    base: Option<String>,
+    found_attrs: Vec<(u64, String)>,
+    /// Tracks whether or not it is inside a <script> tag.
+    ///
+    /// A lot of our sources have JSON script tags which have HTML embedded
+    /// within, but that cannot be parsed or processed correctly (since it is
+    /// JSON, not HTML). I think the sink is supposed to return
+    /// `TokenSinkResult::Script(…)` (and then maybe switch parser?), but I
+    /// don't fully understand the best way to use that, and this seems good
+    /// enough for now.
+    in_script: bool,
+}
 
-            if rest[..pos_quote].trim_start_matches(' ') != "" {
-                continue;
-            }
+impl TokenSink for AttrCollector {
+    type Handle = ();
+
+    fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult<()> {
+        match token {
+            TagToken(tag) => {
+                let tag_name = tag.name.as_bytes();
+                if tag_name == b"base" {
+                    if let Some(href) =
+                        tag.attrs.iter().find(|attr| attr.name.local.as_bytes() == b"href")
+                    {
+                        self.base = Some(href.value.to_string());
+                    }
+                    return TokenSinkResult::Continue;
+                } else if tag_name == b"script" {
+                    self.in_script = !self.in_script;
+                }
+                if self.in_script {
+                    return TokenSinkResult::Continue;
+                }
+                for attr in tag.attrs.iter() {
+                    let name = attr.name.local.as_bytes();
+                    if name == self.attr_name {
+                        let url = attr.value.to_string();
+                        self.found_attrs.push((line_number, url));
+                    }
+                }
+            }
-            let rest = &rest[pos_quote + 1..];
-            let url = match rest.find(quote_delim) {
-                Some(i) => &rest[..i],
-                None => continue,
-            };
-            if is_base {
-                base = url;
-                continue;
-            }
-            f(url, i, base)
+            // Note: ParseError is pretty noisy. It seems html5ever does not
+            // particularly like some kinds of HTML comments.
+            _ => {}
         }
+        TokenSinkResult::Continue
     }
 }
 
+/// Retrieves href="..." attributes from HTML elements.
+fn get_urls(source: &str) -> (Option<String>, Vec<(u64, String)>) {
+    let collector = AttrCollector { attr_name: b"href", ..AttrCollector::default() };
+    let sink = parse_html(source, collector);
+    (sink.base, sink.found_attrs)
+}
+
+/// Retrieves id="..." attributes from HTML elements.
 fn parse_ids(ids: &mut HashSet<String>, file: &str, source: &str, report: &mut Report) {
-    if ids.is_empty() {
-        with_attrs_in_source(source, " id", |fragment, i, _| {
-            let frag = fragment.trim_start_matches('#').to_owned();
-            let encoded = small_url_encode(&frag);
-            if !ids.insert(frag) {
-                report.errors += 1;
-                println!("{}:{}: id is not unique: `{}`", file, i, fragment);
-            }
-            // Just in case, we also add the encoded id.
-            ids.insert(encoded);
-        });
-    }
+    if !ids.is_empty() {
+        // ids have already been parsed
+        return;
+    }
+
+    let collector = AttrCollector { attr_name: b"id", ..AttrCollector::default() };
+    let sink = parse_html(source, collector);
+    for (line_number, id) in sink.found_attrs {
+        let encoded = small_url_encode(&id);
+        if let Some(id) = ids.replace(id) {
+            report.errors += 1;
+            println!("{}:{}: id is not unique: `{}`", file, line_number, id);
+        }
+        // Just in case, we also add the encoded id.
+        ids.insert(encoded);
+    }
 }
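
Putting the new pieces together, a usage sketch (assuming get_urls and the types above are in scope): the <base href> is captured separately rather than reported as a link, and tags the tokenizer encounters inside <script> never reach found_attrs thanks to the in_script flag.

fn main() {
    let html = r#"<html><head><base href="../"></head><body>
        <a href="real.html">ok</a>
        <script><a href="fake.html"></script>
        </body></html>"#;
    let (base, urls) = get_urls(html);
    assert_eq!(base.as_deref(), Some("../"));
    // Only the real anchor is collected; the tag-shaped text inside
    // <script> is tokenized but suppressed by `in_script`.
    assert_eq!(urls.len(), 1);
    assert_eq!(urls[0].1, "real.html");
}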