Extend linkchecker with anchor checking

This adds checks to ensure that:
* link anchors refer to existing ids on the target page
* ids are unique within an HTML document
* page redirects are valid
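
The idea behind the anchor check, as a minimal sketch (simplified and hypothetical, not the linkchecker's actual code): collect every `id="..."` value on the target page into a set, then require that the `#fragment` of each link matches one of them. A duplicate insertion into the same set is what the uniqueness check reports.

```rust
use std::collections::HashSet;

// Naively scan a page for `id="..."` values (the real tool scans for
// ` id` attributes in a similarly regex-free way).
fn collect_ids(html: &str) -> HashSet<String> {
    let mut ids = HashSet::new();
    let mut rest = html;
    while let Some(pos) = rest.find(" id=\"") {
        rest = &rest[pos + 5..];
        match rest.find('"') {
            Some(end) => {
                // `insert` returning false here would mean a duplicate id.
                ids.insert(rest[..end].to_owned());
                rest = &rest[end + 1..];
            }
            None => break,
        }
    }
    ids
}

fn main() {
    let page = r#"<h1 id="examples">Examples</h1><a href="#exmples">typo'd link</a>"#;
    let ids = collect_ids(page);
    // A `#fragment` is only valid if the target page defines a matching id.
    assert!(ids.contains("examples"));
    assert!(!ids.contains("exmples")); // this is the case the new check reports
}
```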
mitaa 2016-03-26 11:59:30 +01:00
parent d0f74b6059
commit 6a76872d71
7 changed files with 217 additions and 64 deletions

View File

@ -27,8 +27,7 @@ explicitly implement to be used by this generic function.
* _Inference_. Since the type parameters to generic functions can usually be
inferred, generic functions can help cut down on verbosity in code where
explicit conversions or other method calls would usually be necessary. See the
[overloading/implicits use case](#use-case-limited-overloading-andor-implicit-conversions)
below.
overloading/implicits use case below.
* _Precise types_. Because generics give a _name_ to the specific type
implementing a trait, it is possible to be precise about places where that
exact type is required or produced. For example, a function
@ -51,7 +50,7 @@ explicitly implement to be used by this generic function.
a `Vec<T>` contains elements of a single concrete type (and, indeed, the
vector representation is specialized to lay these out in line). Sometimes
heterogeneous collections are useful; see
[trait objects](#use-case-trait-objects) below.
trait objects below.
* _Signature verbosity_. Heavy use of generics can bloat function signatures.
**[Ed. note]** This problem may be mitigated by some language improvements; stay tuned.
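
For the "trait objects" point above, a small illustrative example (not taken from the diffed document, and written with today's `dyn` syntax): a trait-object collection can hold values of different concrete types behind one trait, which a `Vec<T>` with a single generic parameter cannot.

```rust
use std::fmt::Display;

fn main() {
    // A Vec<T> is homogeneous; boxing trait objects allows mixing
    // concrete types that all implement the same trait.
    let items: Vec<Box<dyn Display>> = vec![
        Box::new(42_i32),
        Box::new("forty-two"),
        Box::new(4.2_f64),
    ];
    for item in &items {
        println!("{}", item);
    }
}
```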

View File

@ -434,7 +434,7 @@ pub trait Iterator {
/// `None`. Once `None` is encountered, `count()` returns the number of
/// times it called [`next()`].
///
/// [`next()`]: #method.next
/// [`next()`]: #tymethod.next
///
/// # Overflow Behavior
///
@ -497,7 +497,7 @@ pub trait Iterator {
/// This method will evaluate the iterator `n` times, discarding those elements.
/// After it does so, it will call [`next()`] and return its value.
///
/// [`next()`]: #method.next
/// [`next()`]: #tymethod.next
///
/// Like most indexing operations, the count starts from zero, so `nth(0)`
/// returns the first value, `nth(1)` the second, and so on.

View File

@ -1254,7 +1254,7 @@ pub trait BufRead: Read {
/// longer be returned. As such, this function may do odd things if
/// `fill_buf` isn't called before calling it.
///
/// [fillbuf]: #tymethod.fill_buff
/// [fillbuf]: #tymethod.fill_buf
///
/// The `amt` must be `<=` the number of bytes in the buffer returned by
/// `fill_buf`.

View File

@ -190,7 +190,7 @@
//! [`thread`]: thread/index.html
//! [`use std::env`]: env/index.html
//! [`use`]: ../book/crates-and-modules.html#importing-modules-with-use
//! [crate root]: ../book/crates-and-modules.html#basic-terminology:-crates-and-modules
//! [crate root]: ../book/crates-and-modules.html#basic-terminology-crates-and-modules
//! [crates.io]: https://crates.io
//! [deref coercions]: ../book/deref-coercions.html
//! [files]: fs/struct.File.html

View File

@ -196,7 +196,7 @@ impl TcpStream {
///
/// For more information about this option, see [`set_nodelay`][link].
///
/// [link]: #tymethod.set_nodelay
/// [link]: #method.set_nodelay
#[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn nodelay(&self) -> io::Result<bool> {
self.0.nodelay()
@ -215,7 +215,7 @@ impl TcpStream {
///
/// For more information about this option, see [`set_ttl`][link].
///
/// [link]: #tymethod.set_ttl
/// [link]: #method.set_ttl
#[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn ttl(&self) -> io::Result<u32> {
self.0.ttl()
@ -238,7 +238,7 @@ impl TcpStream {
///
/// For more information about this option, see [`set_only_v6`][link].
///
/// [link]: #tymethod.set_only_v6
/// [link]: #method.set_only_v6
#[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn only_v6(&self) -> io::Result<bool> {
self.0.only_v6()
@ -374,7 +374,7 @@ impl TcpListener {
///
/// For more information about this option, see [`set_ttl`][link].
///
/// [link]: #tymethod.set_ttl
/// [link]: #method.set_ttl
#[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn ttl(&self) -> io::Result<u32> {
self.0.ttl()
@ -397,7 +397,7 @@ impl TcpListener {
///
/// For more information about this option, see [`set_only_v6`][link].
///
/// [link]: #tymethod.set_only_v6
/// [link]: #method.set_only_v6
#[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn only_v6(&self) -> io::Result<bool> {
self.0.only_v6()

View File

@ -155,7 +155,7 @@ impl UdpSocket {
/// For more information about this option, see
/// [`set_broadcast`][link].
///
/// [link]: #tymethod.set_broadcast
/// [link]: #method.set_broadcast
#[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn broadcast(&self) -> io::Result<bool> {
self.0.broadcast()
@ -175,7 +175,7 @@ impl UdpSocket {
/// For more information about this option, see
/// [`set_multicast_loop_v4`][link].
///
/// [link]: #tymethod.set_multicast_loop_v4
/// [link]: #method.set_multicast_loop_v4
#[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn multicast_loop_v4(&self) -> io::Result<bool> {
self.0.multicast_loop_v4()
@ -198,7 +198,7 @@ impl UdpSocket {
/// For more information about this option, see
/// [`set_multicast_ttl_v4`][link].
///
/// [link]: #tymethod.set_multicast_ttl_v4
/// [link]: #method.set_multicast_ttl_v4
#[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn multicast_ttl_v4(&self) -> io::Result<u32> {
self.0.multicast_ttl_v4()
@ -218,7 +218,7 @@ impl UdpSocket {
/// For more information about this option, see
/// [`set_multicast_loop_v6`][link].
///
/// [link]: #tymethod.set_multicast_loop_v6
/// [link]: #method.set_multicast_loop_v6
#[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn multicast_loop_v6(&self) -> io::Result<bool> {
self.0.multicast_loop_v6()
@ -237,7 +237,7 @@ impl UdpSocket {
///
/// For more information about this option, see [`set_ttl`][link].
///
/// [link]: #tymethod.set_ttl
/// [link]: #method.set_ttl
#[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn ttl(&self) -> io::Result<u32> {
self.0.ttl()
@ -260,7 +260,7 @@ impl UdpSocket {
///
/// For more information about this option, see [`set_only_v6`][link].
///
/// [link]: #tymethod.set_only_v6
/// [link]: #method.set_only_v6
#[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn only_v6(&self) -> io::Result<bool> {
self.0.only_v6()
@ -293,7 +293,7 @@ impl UdpSocket {
/// For more information about this option, see
/// [`join_multicast_v4`][link].
///
/// [link]: #tymethod.join_multicast_v4
/// [link]: #method.join_multicast_v4
#[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn leave_multicast_v4(&self, multiaddr: &Ipv4Addr, interface: &Ipv4Addr) -> io::Result<()> {
self.0.leave_multicast_v4(multiaddr, interface)
@ -304,7 +304,7 @@ impl UdpSocket {
/// For more information about this option, see
/// [`join_multicast_v6`][link].
///
/// [link]: #tymethod.join_multicast_v6
/// [link]: #method.join_multicast_v6
#[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn leave_multicast_v6(&self, multiaddr: &Ipv6Addr, interface: u32) -> io::Result<()> {
self.0.leave_multicast_v6(multiaddr, interface)

View File

@ -17,9 +17,9 @@
//! actually point to a valid place.
//!
//! Currently this doesn't actually do any HTML parsing or anything fancy like
//! that, it just has a simple "regex" to search for `href` tags. These values
//! are then translated to file URLs if possible and then the destination is
//! asserted to exist.
//! that, it just has a simple "regex" to search for `href` and `id` tags.
//! These values are then translated to file URLs if possible and then the
//! destination is asserted to exist.
//!
//! A few whitelisted exceptions are allowed as there are known bugs in rustdoc,
//! but this should catch the majority of "broken link" cases.
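
A rough sketch of the scan that this comment describes (a simplified, single-match version of the tool's generalized attribute search; `first_attr_value` is a hypothetical helper and ignores single-quoted values):

```rust
// Find `attr` (e.g. " href" or " id"), skip whitespace around `=`,
// then take everything up to the closing double quote.
fn first_attr_value<'a>(line: &'a str, attr: &str) -> Option<&'a str> {
    let start = line.find(attr)? + attr.len();
    let rest = line[start..].trim_start().strip_prefix('=')?;
    let rest = rest.trim_start().strip_prefix('"')?;
    rest.find('"').map(|end| &rest[..end])
}

fn main() {
    let line = r#"<a class="fn" href="struct.String.html#method.new" id="new">new</a>"#;
    assert_eq!(first_attr_value(line, " href"),
               Some("struct.String.html#method.new"));
    assert_eq!(first_attr_value(line, " id"), Some("new"));
}
```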
@ -29,14 +29,16 @@ extern crate url;
use std::env;
use std::fs::File;
use std::io::prelude::*;
use std::path::Path;
use std::path::{Path, PathBuf};
use std::collections::{HashMap, HashSet};
use std::collections::hash_map::Entry;
use url::{Url, UrlParser};
macro_rules! t {
($e:expr) => (match $e {
Ok(e) => e,
Err(e) => panic!("{} failed with {}", stringify!($e), e),
Err(e) => panic!("{} failed with {:?}", stringify!($e), e),
})
}
@ -45,44 +47,68 @@ fn main() {
let docs = env::current_dir().unwrap().join(docs);
let mut url = Url::from_file_path(&docs).unwrap();
let mut errors = false;
walk(&docs, &docs, &mut url, &mut errors);
walk(&mut HashMap::new(), &docs, &docs, &mut url, &mut errors);
if errors {
panic!("found some broken links");
}
}
fn walk(root: &Path, dir: &Path, url: &mut Url, errors: &mut bool) {
#[derive(Debug)]
pub enum LoadError {
IOError(std::io::Error),
BrokenRedirect(PathBuf, std::io::Error),
}
struct FileEntry {
source: String,
ids: HashSet<String>,
}
type Cache = HashMap<PathBuf, FileEntry>;
fn walk(cache: &mut Cache,
root: &Path,
dir: &Path,
url: &mut Url,
errors: &mut bool)
{
for entry in t!(dir.read_dir()).map(|e| t!(e)) {
let path = entry.path();
let kind = t!(entry.file_type());
url.path_mut().unwrap().push(entry.file_name().into_string().unwrap());
if kind.is_dir() {
walk(root, &path, url, errors);
walk(cache, root, &path, url, errors);
} else {
check(root, &path, url, errors);
check(cache, root, &path, url, errors);
}
url.path_mut().unwrap().pop();
}
}
fn check(root: &Path, file: &Path, base: &Url, errors: &mut bool) {
fn check(cache: &mut Cache,
root: &Path,
file: &Path,
base: &Url,
errors: &mut bool)
{
// ignore js files as they are not as prone to errors as the rest of the
// documentation and they otherwise bring up false positives.
if file.extension().and_then(|s| s.to_str()) == Some("js") {
return
}
let pretty_file = file.strip_prefix(root).unwrap_or(file);
// Unfortunately we're not 100% full of valid links today, so we need a few
// whitelists to get this past `make check`.
// FIXME(#32129)
if file.ends_with("std/string/struct.String.html") {
if file.ends_with("std/string/struct.String.html") ||
file.ends_with("collections/string/struct.String.html") {
return
}
// FIXME(#32130)
if file.ends_with("btree_set/struct.BTreeSet.html") ||
file.ends_with("collections/struct.BTreeSet.html") {
file.ends_with("collections/struct.BTreeSet.html") ||
file.ends_with("collections/btree_map/struct.BTreeMap.html") ||
file.ends_with("collections/hash_map/struct.HashMap.html") {
return
}
@ -104,15 +130,159 @@ fn check(root: &Path, file: &Path, base: &Url, errors: &mut bool) {
let mut parser = UrlParser::new();
parser.base_url(base);
let mut contents = String::new();
if t!(File::open(file)).read_to_string(&mut contents).is_err() {
return
let res = load_file(cache, root, PathBuf::from(file), false, false);
let (pretty_file, contents) = match res {
Ok(res) => res,
Err(_) => return,
};
// Search for anything that's the regex 'href[ ]*=[ ]*".*?"'
with_attrs_in_source(&contents, " href", |url, i| {
// Once we've plucked out the URL, parse it using our base url and
// then try to extract a file path. If either of these fail then we
// just keep going.
let (parsed_url, path) = match url_to_file_path(&parser, url) {
Some((url, path)) => (url, PathBuf::from(path)),
None => return,
};
// Alright, if we've found a file name then this file had better
// exist! If it doesn't then we register and print an error.
if path.exists() {
if path.is_dir() {
return;
}
let res = load_file(cache, root, path.clone(), true, false);
let (pretty_path, contents) = match res {
Ok(res) => res,
Err(LoadError::IOError(err)) => panic!(format!("{}", err)),
Err(LoadError::BrokenRedirect(target, _)) => {
print!("{}:{}: broken redirect to {}",
pretty_file.display(), i + 1, target.display());
return;
}
};
if let Some(ref fragment) = parsed_url.fragment {
// Fragments like `#1-6` are most likely line numbers to be
// interpreted by javascript, so we're ignoring these
if fragment.splitn(2, '-')
.all(|f| f.chars().all(|c| c.is_numeric())) {
return;
}
let ids = &mut cache.get_mut(&pretty_path).unwrap().ids;
if ids.is_empty() {
// Search for anything that's the regex 'id[ ]*=[ ]*".*?"'
with_attrs_in_source(&contents, " id", |fragment, i| {
let frag = fragment.trim_left_matches("#").to_owned();
if !ids.insert(frag) {
*errors = true;
println!("{}:{}: id is not unique: `{}`",
pretty_file.display(), i, fragment);
}
});
}
if !ids.contains(fragment) {
*errors = true;
print!("{}:{}: broken link fragment ",
pretty_file.display(), i + 1);
println!("`#{}` pointing to `{}`",
fragment, pretty_path.display());
};
}
} else {
*errors = true;
print!("{}:{}: broken link - ", pretty_file.display(), i + 1);
let pretty_path = path.strip_prefix(root).unwrap_or(&path);
println!("{}", pretty_path.display());
}
});
}
fn load_file(cache: &mut Cache,
root: &Path,
file: PathBuf,
follow_redirects: bool,
is_redirect: bool) -> Result<(PathBuf, String), LoadError> {
let mut contents = String::new();
let pretty_file = PathBuf::from(file.strip_prefix(root).unwrap_or(&file));
let maybe_redirect = match cache.entry(pretty_file.clone()) {
Entry::Occupied(entry) => {
contents = entry.get().source.clone();
None
},
Entry::Vacant(entry) => {
let mut fp = try!(File::open(file.clone()).map_err(|err| {
if is_redirect {
LoadError::BrokenRedirect(file.clone(), err)
} else {
LoadError::IOError(err)
}
}));
try!(fp.read_to_string(&mut contents)
.map_err(|err| LoadError::IOError(err)));
let maybe = if follow_redirects {
maybe_redirect(&contents)
} else {
None
};
if maybe.is_none() {
entry.insert(FileEntry {
source: contents.clone(),
ids: HashSet::new(),
});
}
maybe
},
};
let base = Url::from_file_path(&file).unwrap();
let mut parser = UrlParser::new();
parser.base_url(&base);
match maybe_redirect.and_then(|url| url_to_file_path(&parser, &url)) {
Some((_, redirect_file)) => {
assert!(follow_redirects);
let path = PathBuf::from(redirect_file);
load_file(cache, root, path, follow_redirects, true)
}
None => Ok((pretty_file, contents))
}
}
fn maybe_redirect(source: &str) -> Option<String> {
const REDIRECT: &'static str = "<p>Redirecting to <a href=";
let mut lines = source.lines();
let redirect_line = match lines.nth(6) {
Some(l) => l,
None => return None,
};
redirect_line.find(REDIRECT).map(|i| {
let rest = &redirect_line[(i + REDIRECT.len() + 1)..];
let pos_quote = rest.find('"').unwrap();
rest[..pos_quote].to_owned()
})
}
fn url_to_file_path(parser: &UrlParser, url: &str) -> Option<(Url, PathBuf)> {
parser.parse(url).ok().and_then(|parsed_url| {
parsed_url.to_file_path().ok().map(|f| (parsed_url, f))
})
}
fn with_attrs_in_source<F: FnMut(&str, usize)>(contents: &str,
attr: &str,
mut f: F)
{
for (i, mut line) in contents.lines().enumerate() {
// Search for anything that's the regex 'href[ ]*=[ ]*".*?"'
while let Some(j) = line.find(" href") {
let rest = &line[j + 5..];
while let Some(j) = line.find(attr) {
let rest = &line[j + attr.len() ..];
line = rest;
let pos_equals = match rest.find("=") {
Some(i) => i,
@ -121,40 +291,24 @@ fn check(root: &Path, file: &Path, base: &Url, errors: &mut bool) {
if rest[..pos_equals].trim_left_matches(" ") != "" {
continue
}
let rest = &rest[pos_equals + 1..];
let pos_quote = match rest.find("\"").or_else(|| rest.find("'")) {
let pos_quote = match rest.find(&['"', '\''][..]) {
Some(i) => i,
None => continue,
};
let quote_delim = rest.as_bytes()[pos_quote] as char;
if rest[..pos_quote].trim_left_matches(" ") != "" {
continue
}
let rest = &rest[pos_quote + 1..];
let url = match rest.find("\"").or_else(|| rest.find("'")) {
let url = match rest.find(quote_delim) {
Some(i) => &rest[..i],
None => continue,
};
// Once we've plucked out the URL, parse it using our base url and
// then try to extract a file path. If either if these fail then we
// just keep going.
let parsed_url = match parser.parse(url) {
Ok(url) => url,
Err(..) => continue,
};
let path = match parsed_url.to_file_path() {
Ok(path) => path,
Err(..) => continue,
};
// Alright, if we've found a file name then this file had better
// exist! If it doesn't then we register and print an error.
if !path.exists() {
*errors = true;
print!("{}:{}: broken link - ", pretty_file.display(), i + 1);
let pretty_path = path.strip_prefix(root).unwrap_or(&path);
println!("{}", pretty_path.display());
}
f(url, i)
}
}
}
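
For reference, the redirect handling above assumes the fixed shape of rustdoc's generated redirect stubs: the `<p>Redirecting to <a href=...` marker sits on the seventh line of the page. A minimal, self-contained restatement of that logic (mirroring `maybe_redirect`, exercised against a made-up sample page):

```rust
fn redirect_target(source: &str) -> Option<String> {
    const REDIRECT: &str = "<p>Redirecting to <a href=";
    // `nth(6)` is the seventh line: where rustdoc's redirect stubs place the marker.
    source.lines().nth(6).and_then(|line| {
        line.find(REDIRECT).map(|i| {
            let rest = &line[i + REDIRECT.len() + 1..]; // +1 skips the opening quote
            let end = rest.find('"').unwrap_or(rest.len());
            rest[..end].to_owned()
        })
    })
}

fn main() {
    let page = "\
<!DOCTYPE html>
<html>
<head>
<meta http-equiv=\"refresh\" content=\"0;URL=struct.String.html\">
</head>
<body>
<p>Redirecting to <a href=\"struct.String.html\">struct.String.html</a>...</p>
</body>
</html>";
    assert_eq!(redirect_target(page), Some("struct.String.html".to_owned()));
}
```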