Extend linkchecker with anchor checking

This adds checks to ensure that:
* link anchors refer to existing id's on the target page
* id's are unique within an html document
* page redirects are valid
This commit is contained in:
mitaa 2016-03-26 11:59:30 +01:00
parent d0f74b6059
commit 6a76872d71
7 changed files with 217 additions and 64 deletions

View File

@ -27,8 +27,7 @@ explicitly implement to be used by this generic function.
* _Inference_. Since the type parameters to generic functions can usually be * _Inference_. Since the type parameters to generic functions can usually be
inferred, generic functions can help cut down on verbosity in code where inferred, generic functions can help cut down on verbosity in code where
explicit conversions or other method calls would usually be necessary. See the explicit conversions or other method calls would usually be necessary. See the
[overloading/implicits use case](#use-case-limited-overloading-andor-implicit-conversions) overloading/implicits use case below.
below.
* _Precise types_. Because generics give a _name_ to the specific type * _Precise types_. Because generics give a _name_ to the specific type
implementing a trait, it is possible to be precise about places where that implementing a trait, it is possible to be precise about places where that
exact type is required or produced. For example, a function exact type is required or produced. For example, a function
@ -51,7 +50,7 @@ explicitly implement to be used by this generic function.
a `Vec<T>` contains elements of a single concrete type (and, indeed, the a `Vec<T>` contains elements of a single concrete type (and, indeed, the
vector representation is specialized to lay these out in line). Sometimes vector representation is specialized to lay these out in line). Sometimes
heterogeneous collections are useful; see heterogeneous collections are useful; see
[trait objects](#use-case-trait-objects) below. trait objects below.
* _Signature verbosity_. Heavy use of generics can bloat function signatures. * _Signature verbosity_. Heavy use of generics can bloat function signatures.
**[Ed. note]** This problem may be mitigated by some language improvements; stay tuned. **[Ed. note]** This problem may be mitigated by some language improvements; stay tuned.

View File

@ -434,7 +434,7 @@ pub trait Iterator {
/// `None`. Once `None` is encountered, `count()` returns the number of /// `None`. Once `None` is encountered, `count()` returns the number of
/// times it called [`next()`]. /// times it called [`next()`].
/// ///
/// [`next()`]: #method.next /// [`next()`]: #tymethod.next
/// ///
/// # Overflow Behavior /// # Overflow Behavior
/// ///
@ -497,7 +497,7 @@ pub trait Iterator {
/// This method will evaluate the iterator `n` times, discarding those elements. /// This method will evaluate the iterator `n` times, discarding those elements.
/// After it does so, it will call [`next()`] and return its value. /// After it does so, it will call [`next()`] and return its value.
/// ///
/// [`next()`]: #method.next /// [`next()`]: #tymethod.next
/// ///
/// Like most indexing operations, the count starts from zero, so `nth(0)` /// Like most indexing operations, the count starts from zero, so `nth(0)`
/// returns the first value, `nth(1)` the second, and so on. /// returns the first value, `nth(1)` the second, and so on.

View File

@ -1254,7 +1254,7 @@ pub trait BufRead: Read {
/// longer be returned. As such, this function may do odd things if /// longer be returned. As such, this function may do odd things if
/// `fill_buf` isn't called before calling it. /// `fill_buf` isn't called before calling it.
/// ///
/// [fillbuf]: #tymethod.fill_buff /// [fillbuf]: #tymethod.fill_buf
/// ///
/// The `amt` must be `<=` the number of bytes in the buffer returned by /// The `amt` must be `<=` the number of bytes in the buffer returned by
/// `fill_buf`. /// `fill_buf`.

View File

@ -190,7 +190,7 @@
//! [`thread`]: thread/index.html //! [`thread`]: thread/index.html
//! [`use std::env`]: env/index.html //! [`use std::env`]: env/index.html
//! [`use`]: ../book/crates-and-modules.html#importing-modules-with-use //! [`use`]: ../book/crates-and-modules.html#importing-modules-with-use
//! [crate root]: ../book/crates-and-modules.html#basic-terminology:-crates-and-modules //! [crate root]: ../book/crates-and-modules.html#basic-terminology-crates-and-modules
//! [crates.io]: https://crates.io //! [crates.io]: https://crates.io
//! [deref coercions]: ../book/deref-coercions.html //! [deref coercions]: ../book/deref-coercions.html
//! [files]: fs/struct.File.html //! [files]: fs/struct.File.html

View File

@ -196,7 +196,7 @@ impl TcpStream {
/// ///
/// For more information about this option, see [`set_nodelay`][link]. /// For more information about this option, see [`set_nodelay`][link].
/// ///
/// [link]: #tymethod.set_nodelay /// [link]: #method.set_nodelay
#[stable(feature = "net2_mutators", since = "1.9.0")] #[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn nodelay(&self) -> io::Result<bool> { pub fn nodelay(&self) -> io::Result<bool> {
self.0.nodelay() self.0.nodelay()
@ -215,7 +215,7 @@ impl TcpStream {
/// ///
/// For more information about this option, see [`set_ttl`][link]. /// For more information about this option, see [`set_ttl`][link].
/// ///
/// [link]: #tymethod.set_ttl /// [link]: #method.set_ttl
#[stable(feature = "net2_mutators", since = "1.9.0")] #[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn ttl(&self) -> io::Result<u32> { pub fn ttl(&self) -> io::Result<u32> {
self.0.ttl() self.0.ttl()
@ -238,7 +238,7 @@ impl TcpStream {
/// ///
/// For more information about this option, see [`set_only_v6`][link]. /// For more information about this option, see [`set_only_v6`][link].
/// ///
/// [link]: #tymethod.set_only_v6 /// [link]: #method.set_only_v6
#[stable(feature = "net2_mutators", since = "1.9.0")] #[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn only_v6(&self) -> io::Result<bool> { pub fn only_v6(&self) -> io::Result<bool> {
self.0.only_v6() self.0.only_v6()
@ -374,7 +374,7 @@ impl TcpListener {
/// ///
/// For more information about this option, see [`set_ttl`][link]. /// For more information about this option, see [`set_ttl`][link].
/// ///
/// [link]: #tymethod.set_ttl /// [link]: #method.set_ttl
#[stable(feature = "net2_mutators", since = "1.9.0")] #[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn ttl(&self) -> io::Result<u32> { pub fn ttl(&self) -> io::Result<u32> {
self.0.ttl() self.0.ttl()
@ -397,7 +397,7 @@ impl TcpListener {
/// ///
/// For more information about this option, see [`set_only_v6`][link]. /// For more information about this option, see [`set_only_v6`][link].
/// ///
/// [link]: #tymethod.set_only_v6 /// [link]: #method.set_only_v6
#[stable(feature = "net2_mutators", since = "1.9.0")] #[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn only_v6(&self) -> io::Result<bool> { pub fn only_v6(&self) -> io::Result<bool> {
self.0.only_v6() self.0.only_v6()

View File

@ -155,7 +155,7 @@ impl UdpSocket {
/// For more information about this option, see /// For more information about this option, see
/// [`set_broadcast`][link]. /// [`set_broadcast`][link].
/// ///
/// [link]: #tymethod.set_broadcast /// [link]: #method.set_broadcast
#[stable(feature = "net2_mutators", since = "1.9.0")] #[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn broadcast(&self) -> io::Result<bool> { pub fn broadcast(&self) -> io::Result<bool> {
self.0.broadcast() self.0.broadcast()
@ -175,7 +175,7 @@ impl UdpSocket {
/// For more information about this option, see /// For more information about this option, see
/// [`set_multicast_loop_v4`][link]. /// [`set_multicast_loop_v4`][link].
/// ///
/// [link]: #tymethod.set_multicast_loop_v4 /// [link]: #method.set_multicast_loop_v4
#[stable(feature = "net2_mutators", since = "1.9.0")] #[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn multicast_loop_v4(&self) -> io::Result<bool> { pub fn multicast_loop_v4(&self) -> io::Result<bool> {
self.0.multicast_loop_v4() self.0.multicast_loop_v4()
@ -198,7 +198,7 @@ impl UdpSocket {
/// For more information about this option, see /// For more information about this option, see
/// [`set_multicast_ttl_v4`][link]. /// [`set_multicast_ttl_v4`][link].
/// ///
/// [link]: #tymethod.set_multicast_ttl_v4 /// [link]: #method.set_multicast_ttl_v4
#[stable(feature = "net2_mutators", since = "1.9.0")] #[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn multicast_ttl_v4(&self) -> io::Result<u32> { pub fn multicast_ttl_v4(&self) -> io::Result<u32> {
self.0.multicast_ttl_v4() self.0.multicast_ttl_v4()
@ -218,7 +218,7 @@ impl UdpSocket {
/// For more information about this option, see /// For more information about this option, see
/// [`set_multicast_loop_v6`][link]. /// [`set_multicast_loop_v6`][link].
/// ///
/// [link]: #tymethod.set_multicast_loop_v6 /// [link]: #method.set_multicast_loop_v6
#[stable(feature = "net2_mutators", since = "1.9.0")] #[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn multicast_loop_v6(&self) -> io::Result<bool> { pub fn multicast_loop_v6(&self) -> io::Result<bool> {
self.0.multicast_loop_v6() self.0.multicast_loop_v6()
@ -237,7 +237,7 @@ impl UdpSocket {
/// ///
/// For more information about this option, see [`set_ttl`][link]. /// For more information about this option, see [`set_ttl`][link].
/// ///
/// [link]: #tymethod.set_ttl /// [link]: #method.set_ttl
#[stable(feature = "net2_mutators", since = "1.9.0")] #[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn ttl(&self) -> io::Result<u32> { pub fn ttl(&self) -> io::Result<u32> {
self.0.ttl() self.0.ttl()
@ -260,7 +260,7 @@ impl UdpSocket {
/// ///
/// For more information about this option, see [`set_only_v6`][link]. /// For more information about this option, see [`set_only_v6`][link].
/// ///
/// [link]: #tymethod.set_only_v6 /// [link]: #method.set_only_v6
#[stable(feature = "net2_mutators", since = "1.9.0")] #[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn only_v6(&self) -> io::Result<bool> { pub fn only_v6(&self) -> io::Result<bool> {
self.0.only_v6() self.0.only_v6()
@ -293,7 +293,7 @@ impl UdpSocket {
/// For more information about this option, see /// For more information about this option, see
/// [`join_multicast_v4`][link]. /// [`join_multicast_v4`][link].
/// ///
/// [link]: #tymethod.join_multicast_v4 /// [link]: #method.join_multicast_v4
#[stable(feature = "net2_mutators", since = "1.9.0")] #[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn leave_multicast_v4(&self, multiaddr: &Ipv4Addr, interface: &Ipv4Addr) -> io::Result<()> { pub fn leave_multicast_v4(&self, multiaddr: &Ipv4Addr, interface: &Ipv4Addr) -> io::Result<()> {
self.0.leave_multicast_v4(multiaddr, interface) self.0.leave_multicast_v4(multiaddr, interface)
@ -304,7 +304,7 @@ impl UdpSocket {
/// For more information about this option, see /// For more information about this option, see
/// [`join_multicast_v6`][link]. /// [`join_multicast_v6`][link].
/// ///
/// [link]: #tymethod.join_multicast_v6 /// [link]: #method.join_multicast_v6
#[stable(feature = "net2_mutators", since = "1.9.0")] #[stable(feature = "net2_mutators", since = "1.9.0")]
pub fn leave_multicast_v6(&self, multiaddr: &Ipv6Addr, interface: u32) -> io::Result<()> { pub fn leave_multicast_v6(&self, multiaddr: &Ipv6Addr, interface: u32) -> io::Result<()> {
self.0.leave_multicast_v6(multiaddr, interface) self.0.leave_multicast_v6(multiaddr, interface)

View File

@ -17,9 +17,9 @@
//! actually point to a valid place. //! actually point to a valid place.
//! //!
//! Currently this doesn't actually do any HTML parsing or anything fancy like //! Currently this doesn't actually do any HTML parsing or anything fancy like
//! that, it just has a simple "regex" to search for `href` tags. These values //! that, it just has a simple "regex" to search for `href` and `id` tags.
//! are then translated to file URLs if possible and then the destination is //! These values are then translated to file URLs if possible and then the
//! asserted to exist. //! destination is asserted to exist.
//! //!
//! A few whitelisted exceptions are allowed as there's known bugs in rustdoc, //! A few whitelisted exceptions are allowed as there's known bugs in rustdoc,
//! but this should catch the majority of "broken link" cases. //! but this should catch the majority of "broken link" cases.
@ -29,14 +29,16 @@ extern crate url;
use std::env; use std::env;
use std::fs::File; use std::fs::File;
use std::io::prelude::*; use std::io::prelude::*;
use std::path::Path; use std::path::{Path, PathBuf};
use std::collections::{HashMap, HashSet};
use std::collections::hash_map::Entry;
use url::{Url, UrlParser}; use url::{Url, UrlParser};
macro_rules! t { macro_rules! t {
($e:expr) => (match $e { ($e:expr) => (match $e {
Ok(e) => e, Ok(e) => e,
Err(e) => panic!("{} failed with {}", stringify!($e), e), Err(e) => panic!("{} failed with {:?}", stringify!($e), e),
}) })
} }
@ -45,44 +47,68 @@ fn main() {
let docs = env::current_dir().unwrap().join(docs); let docs = env::current_dir().unwrap().join(docs);
let mut url = Url::from_file_path(&docs).unwrap(); let mut url = Url::from_file_path(&docs).unwrap();
let mut errors = false; let mut errors = false;
walk(&docs, &docs, &mut url, &mut errors); walk(&mut HashMap::new(), &docs, &docs, &mut url, &mut errors);
if errors { if errors {
panic!("found some broken links"); panic!("found some broken links");
} }
} }
fn walk(root: &Path, dir: &Path, url: &mut Url, errors: &mut bool) { #[derive(Debug)]
pub enum LoadError {
IOError(std::io::Error),
BrokenRedirect(PathBuf, std::io::Error),
}
struct FileEntry {
source: String,
ids: HashSet<String>,
}
type Cache = HashMap<PathBuf, FileEntry>;
fn walk(cache: &mut Cache,
root: &Path,
dir: &Path,
url: &mut Url,
errors: &mut bool)
{
for entry in t!(dir.read_dir()).map(|e| t!(e)) { for entry in t!(dir.read_dir()).map(|e| t!(e)) {
let path = entry.path(); let path = entry.path();
let kind = t!(entry.file_type()); let kind = t!(entry.file_type());
url.path_mut().unwrap().push(entry.file_name().into_string().unwrap()); url.path_mut().unwrap().push(entry.file_name().into_string().unwrap());
if kind.is_dir() { if kind.is_dir() {
walk(root, &path, url, errors); walk(cache, root, &path, url, errors);
} else { } else {
check(root, &path, url, errors); check(cache, root, &path, url, errors);
} }
url.path_mut().unwrap().pop(); url.path_mut().unwrap().pop();
} }
} }
fn check(root: &Path, file: &Path, base: &Url, errors: &mut bool) { fn check(cache: &mut Cache,
root: &Path,
file: &Path,
base: &Url,
errors: &mut bool)
{
// ignore js files as they are not prone to errors as the rest of the // ignore js files as they are not prone to errors as the rest of the
// documentation is and they otherwise bring up false positives. // documentation is and they otherwise bring up false positives.
if file.extension().and_then(|s| s.to_str()) == Some("js") { if file.extension().and_then(|s| s.to_str()) == Some("js") {
return return
} }
let pretty_file = file.strip_prefix(root).unwrap_or(file);
// Unfortunately we're not 100% full of valid links today to we need a few // Unfortunately we're not 100% full of valid links today to we need a few
// whitelists to get this past `make check` today. // whitelists to get this past `make check` today.
// FIXME(#32129) // FIXME(#32129)
if file.ends_with("std/string/struct.String.html") { if file.ends_with("std/string/struct.String.html") ||
file.ends_with("collections/string/struct.String.html") {
return return
} }
// FIXME(#32130) // FIXME(#32130)
if file.ends_with("btree_set/struct.BTreeSet.html") || if file.ends_with("btree_set/struct.BTreeSet.html") ||
file.ends_with("collections/struct.BTreeSet.html") { file.ends_with("collections/struct.BTreeSet.html") ||
file.ends_with("collections/btree_map/struct.BTreeMap.html") ||
file.ends_with("collections/hash_map/struct.HashMap.html") {
return return
} }
@ -104,15 +130,159 @@ fn check(root: &Path, file: &Path, base: &Url, errors: &mut bool) {
let mut parser = UrlParser::new(); let mut parser = UrlParser::new();
parser.base_url(base); parser.base_url(base);
let mut contents = String::new();
if t!(File::open(file)).read_to_string(&mut contents).is_err() {
return
}
let res = load_file(cache, root, PathBuf::from(file), false, false);
let (pretty_file, contents) = match res {
Ok(res) => res,
Err(_) => return,
};
// Search for anything that's the regex 'href[ ]*=[ ]*".*?"'
with_attrs_in_source(&contents, " href", |url, i| {
// Once we've plucked out the URL, parse it using our base url and
// then try to extract a file path. If either of these fail then we
// just keep going.
let (parsed_url, path) = match url_to_file_path(&parser, url) {
Some((url, path)) => (url, PathBuf::from(path)),
None => return,
};
// Alright, if we've found a file name then this file had better
// exist! If it doesn't then we register and print an error.
if path.exists() {
if path.is_dir() {
return;
}
let res = load_file(cache, root, path.clone(), true, false);
let (pretty_path, contents) = match res {
Ok(res) => res,
Err(LoadError::IOError(err)) => panic!(format!("{}", err)),
Err(LoadError::BrokenRedirect(target, _)) => {
print!("{}:{}: broken redirect to {}",
pretty_file.display(), i + 1, target.display());
return;
}
};
if let Some(ref fragment) = parsed_url.fragment {
// Fragments like `#1-6` are most likely line numbers to be
// interpreted by javascript, so we're ignoring these
if fragment.splitn(2, '-')
.all(|f| f.chars().all(|c| c.is_numeric())) {
return;
}
let ids = &mut cache.get_mut(&pretty_path).unwrap().ids;
if ids.is_empty() {
// Search for anything that's the regex 'id[ ]*=[ ]*".*?"'
with_attrs_in_source(&contents, " id", |fragment, i| {
let frag = fragment.trim_left_matches("#").to_owned();
if !ids.insert(frag) {
*errors = true;
println!("{}:{}: id is not unique: `{}`",
pretty_file.display(), i, fragment);
}
});
}
if !ids.contains(fragment) {
*errors = true;
print!("{}:{}: broken link fragment ",
pretty_file.display(), i + 1);
println!("`#{}` pointing to `{}`",
fragment, pretty_path.display());
};
}
} else {
*errors = true;
print!("{}:{}: broken link - ", pretty_file.display(), i + 1);
let pretty_path = path.strip_prefix(root).unwrap_or(&path);
println!("{}", pretty_path.display());
}
});
}
fn load_file(cache: &mut Cache,
root: &Path,
file: PathBuf,
follow_redirects: bool,
is_redirect: bool) -> Result<(PathBuf, String), LoadError> {
let mut contents = String::new();
let pretty_file = PathBuf::from(file.strip_prefix(root).unwrap_or(&file));
let maybe_redirect = match cache.entry(pretty_file.clone()) {
Entry::Occupied(entry) => {
contents = entry.get().source.clone();
None
},
Entry::Vacant(entry) => {
let mut fp = try!(File::open(file.clone()).map_err(|err| {
if is_redirect {
LoadError::BrokenRedirect(file.clone(), err)
} else {
LoadError::IOError(err)
}
}));
try!(fp.read_to_string(&mut contents)
.map_err(|err| LoadError::IOError(err)));
let maybe = if follow_redirects {
maybe_redirect(&contents)
} else {
None
};
if maybe.is_none() {
entry.insert(FileEntry {
source: contents.clone(),
ids: HashSet::new(),
});
}
maybe
},
};
let base = Url::from_file_path(&file).unwrap();
let mut parser = UrlParser::new();
parser.base_url(&base);
match maybe_redirect.and_then(|url| url_to_file_path(&parser, &url)) {
Some((_, redirect_file)) => {
assert!(follow_redirects);
let path = PathBuf::from(redirect_file);
load_file(cache, root, path, follow_redirects, true)
}
None => Ok((pretty_file, contents))
}
}
fn maybe_redirect(source: &str) -> Option<String> {
const REDIRECT: &'static str = "<p>Redirecting to <a href=";
let mut lines = source.lines();
let redirect_line = match lines.nth(6) {
Some(l) => l,
None => return None,
};
redirect_line.find(REDIRECT).map(|i| {
let rest = &redirect_line[(i + REDIRECT.len() + 1)..];
let pos_quote = rest.find('"').unwrap();
rest[..pos_quote].to_owned()
})
}
fn url_to_file_path(parser: &UrlParser, url: &str) -> Option<(Url, PathBuf)> {
parser.parse(url).ok().and_then(|parsed_url| {
parsed_url.to_file_path().ok().map(|f| (parsed_url, f))
})
}
fn with_attrs_in_source<F: FnMut(&str, usize)>(contents: &str,
attr: &str,
mut f: F)
{
for (i, mut line) in contents.lines().enumerate() { for (i, mut line) in contents.lines().enumerate() {
// Search for anything that's the regex 'href[ ]*=[ ]*".*?"' while let Some(j) = line.find(attr) {
while let Some(j) = line.find(" href") { let rest = &line[j + attr.len() ..];
let rest = &line[j + 5..];
line = rest; line = rest;
let pos_equals = match rest.find("=") { let pos_equals = match rest.find("=") {
Some(i) => i, Some(i) => i,
@ -121,40 +291,24 @@ fn check(root: &Path, file: &Path, base: &Url, errors: &mut bool) {
if rest[..pos_equals].trim_left_matches(" ") != "" { if rest[..pos_equals].trim_left_matches(" ") != "" {
continue continue
} }
let rest = &rest[pos_equals + 1..]; let rest = &rest[pos_equals + 1..];
let pos_quote = match rest.find("\"").or_else(|| rest.find("'")) {
let pos_quote = match rest.find(&['"', '\''][..]) {
Some(i) => i, Some(i) => i,
None => continue, None => continue,
}; };
let quote_delim = rest.as_bytes()[pos_quote] as char;
if rest[..pos_quote].trim_left_matches(" ") != "" { if rest[..pos_quote].trim_left_matches(" ") != "" {
continue continue
} }
let rest = &rest[pos_quote + 1..]; let rest = &rest[pos_quote + 1..];
let url = match rest.find("\"").or_else(|| rest.find("'")) { let url = match rest.find(quote_delim) {
Some(i) => &rest[..i], Some(i) => &rest[..i],
None => continue, None => continue,
}; };
f(url, i)
// Once we've plucked out the URL, parse it using our base url and
// then try to extract a file path. If either if these fail then we
// just keep going.
let parsed_url = match parser.parse(url) {
Ok(url) => url,
Err(..) => continue,
};
let path = match parsed_url.to_file_path() {
Ok(path) => path,
Err(..) => continue,
};
// Alright, if we've found a file name then this file had better
// exist! If it doesn't then we register and print an error.
if !path.exists() {
*errors = true;
print!("{}:{}: broken link - ", pretty_file.display(), i + 1);
let pretty_path = path.strip_prefix(root).unwrap_or(&path);
println!("{}", pretty_path.display());
}
} }
} }
} }