From 6a76872d7165f901e3ec127a25be1a6303d137b3 Mon Sep 17 00:00:00 2001 From: mitaa Date: Sat, 26 Mar 2016 11:59:30 +0100 Subject: [PATCH] Extend linkchecker with anchor checking This adds checks to ensure that: * link anchors refer to existing id's on the target page * id's are unique within an html document * page redirects are valid --- src/doc/style/features/traits/generics.md | 5 +- src/libcore/iter.rs | 4 +- src/libstd/io/mod.rs | 2 +- src/libstd/lib.rs | 2 +- src/libstd/net/tcp.rs | 10 +- src/libstd/net/udp.rs | 16 +- src/tools/linkchecker/main.rs | 242 ++++++++++++++++++---- 7 files changed, 217 insertions(+), 64 deletions(-) diff --git a/src/doc/style/features/traits/generics.md b/src/doc/style/features/traits/generics.md index 26ffda50ac5..a09640c3055 100644 --- a/src/doc/style/features/traits/generics.md +++ b/src/doc/style/features/traits/generics.md @@ -27,8 +27,7 @@ explicitly implement to be used by this generic function. * _Inference_. Since the type parameters to generic functions can usually be inferred, generic functions can help cut down on verbosity in code where explicit conversions or other method calls would usually be necessary. See the - [overloading/implicits use case](#use-case-limited-overloading-andor-implicit-conversions) - below. + overloading/implicits use case below. * _Precise types_. Because generics give a _name_ to the specific type implementing a trait, it is possible to be precise about places where that exact type is required or produced. For example, a function @@ -51,7 +50,7 @@ explicitly implement to be used by this generic function. a `Vec` contains elements of a single concrete type (and, indeed, the vector representation is specialized to lay these out in line). Sometimes heterogeneous collections are useful; see - [trait objects](#use-case-trait-objects) below. + trait objects below. * _Signature verbosity_. Heavy use of generics can bloat function signatures. **[Ed. note]** This problem may be mitigated by some language improvements; stay tuned. diff --git a/src/libcore/iter.rs b/src/libcore/iter.rs index 72421e94a43..4dbcf7ab4e3 100644 --- a/src/libcore/iter.rs +++ b/src/libcore/iter.rs @@ -434,7 +434,7 @@ pub trait Iterator { /// `None`. Once `None` is encountered, `count()` returns the number of /// times it called [`next()`]. /// - /// [`next()`]: #method.next + /// [`next()`]: #tymethod.next /// /// # Overflow Behavior /// @@ -497,7 +497,7 @@ pub trait Iterator { /// This method will evaluate the iterator `n` times, discarding those elements. /// After it does so, it will call [`next()`] and return its value. /// - /// [`next()`]: #method.next + /// [`next()`]: #tymethod.next /// /// Like most indexing operations, the count starts from zero, so `nth(0)` /// returns the first value, `nth(1)` the second, and so on. diff --git a/src/libstd/io/mod.rs b/src/libstd/io/mod.rs index 28492b30e0f..5362f7086bd 100644 --- a/src/libstd/io/mod.rs +++ b/src/libstd/io/mod.rs @@ -1254,7 +1254,7 @@ pub trait BufRead: Read { /// longer be returned. As such, this function may do odd things if /// `fill_buf` isn't called before calling it. /// - /// [fillbuf]: #tymethod.fill_buff + /// [fillbuf]: #tymethod.fill_buf /// /// The `amt` must be `<=` the number of bytes in the buffer returned by /// `fill_buf`. diff --git a/src/libstd/lib.rs b/src/libstd/lib.rs index 5608bb646a4..f7b1042592d 100644 --- a/src/libstd/lib.rs +++ b/src/libstd/lib.rs @@ -190,7 +190,7 @@ //! [`thread`]: thread/index.html //! [`use std::env`]: env/index.html //! [`use`]: ../book/crates-and-modules.html#importing-modules-with-use -//! [crate root]: ../book/crates-and-modules.html#basic-terminology:-crates-and-modules +//! [crate root]: ../book/crates-and-modules.html#basic-terminology-crates-and-modules //! [crates.io]: https://crates.io //! [deref coercions]: ../book/deref-coercions.html //! [files]: fs/struct.File.html diff --git a/src/libstd/net/tcp.rs b/src/libstd/net/tcp.rs index 414696413f4..38da74b8903 100644 --- a/src/libstd/net/tcp.rs +++ b/src/libstd/net/tcp.rs @@ -196,7 +196,7 @@ impl TcpStream { /// /// For more information about this option, see [`set_nodelay`][link]. /// - /// [link]: #tymethod.set_nodelay + /// [link]: #method.set_nodelay #[stable(feature = "net2_mutators", since = "1.9.0")] pub fn nodelay(&self) -> io::Result { self.0.nodelay() @@ -215,7 +215,7 @@ impl TcpStream { /// /// For more information about this option, see [`set_ttl`][link]. /// - /// [link]: #tymethod.set_ttl + /// [link]: #method.set_ttl #[stable(feature = "net2_mutators", since = "1.9.0")] pub fn ttl(&self) -> io::Result { self.0.ttl() @@ -238,7 +238,7 @@ impl TcpStream { /// /// For more information about this option, see [`set_only_v6`][link]. /// - /// [link]: #tymethod.set_only_v6 + /// [link]: #method.set_only_v6 #[stable(feature = "net2_mutators", since = "1.9.0")] pub fn only_v6(&self) -> io::Result { self.0.only_v6() @@ -374,7 +374,7 @@ impl TcpListener { /// /// For more information about this option, see [`set_ttl`][link]. /// - /// [link]: #tymethod.set_ttl + /// [link]: #method.set_ttl #[stable(feature = "net2_mutators", since = "1.9.0")] pub fn ttl(&self) -> io::Result { self.0.ttl() @@ -397,7 +397,7 @@ impl TcpListener { /// /// For more information about this option, see [`set_only_v6`][link]. /// - /// [link]: #tymethod.set_only_v6 + /// [link]: #method.set_only_v6 #[stable(feature = "net2_mutators", since = "1.9.0")] pub fn only_v6(&self) -> io::Result { self.0.only_v6() diff --git a/src/libstd/net/udp.rs b/src/libstd/net/udp.rs index da1cf115e8d..0be9f13e817 100644 --- a/src/libstd/net/udp.rs +++ b/src/libstd/net/udp.rs @@ -155,7 +155,7 @@ impl UdpSocket { /// For more information about this option, see /// [`set_broadcast`][link]. /// - /// [link]: #tymethod.set_broadcast + /// [link]: #method.set_broadcast #[stable(feature = "net2_mutators", since = "1.9.0")] pub fn broadcast(&self) -> io::Result { self.0.broadcast() @@ -175,7 +175,7 @@ impl UdpSocket { /// For more information about this option, see /// [`set_multicast_loop_v4`][link]. /// - /// [link]: #tymethod.set_multicast_loop_v4 + /// [link]: #method.set_multicast_loop_v4 #[stable(feature = "net2_mutators", since = "1.9.0")] pub fn multicast_loop_v4(&self) -> io::Result { self.0.multicast_loop_v4() @@ -198,7 +198,7 @@ impl UdpSocket { /// For more information about this option, see /// [`set_multicast_ttl_v4`][link]. /// - /// [link]: #tymethod.set_multicast_ttl_v4 + /// [link]: #method.set_multicast_ttl_v4 #[stable(feature = "net2_mutators", since = "1.9.0")] pub fn multicast_ttl_v4(&self) -> io::Result { self.0.multicast_ttl_v4() @@ -218,7 +218,7 @@ impl UdpSocket { /// For more information about this option, see /// [`set_multicast_loop_v6`][link]. /// - /// [link]: #tymethod.set_multicast_loop_v6 + /// [link]: #method.set_multicast_loop_v6 #[stable(feature = "net2_mutators", since = "1.9.0")] pub fn multicast_loop_v6(&self) -> io::Result { self.0.multicast_loop_v6() @@ -237,7 +237,7 @@ impl UdpSocket { /// /// For more information about this option, see [`set_ttl`][link]. /// - /// [link]: #tymethod.set_ttl + /// [link]: #method.set_ttl #[stable(feature = "net2_mutators", since = "1.9.0")] pub fn ttl(&self) -> io::Result { self.0.ttl() @@ -260,7 +260,7 @@ impl UdpSocket { /// /// For more information about this option, see [`set_only_v6`][link]. /// - /// [link]: #tymethod.set_only_v6 + /// [link]: #method.set_only_v6 #[stable(feature = "net2_mutators", since = "1.9.0")] pub fn only_v6(&self) -> io::Result { self.0.only_v6() @@ -293,7 +293,7 @@ impl UdpSocket { /// For more information about this option, see /// [`join_multicast_v4`][link]. /// - /// [link]: #tymethod.join_multicast_v4 + /// [link]: #method.join_multicast_v4 #[stable(feature = "net2_mutators", since = "1.9.0")] pub fn leave_multicast_v4(&self, multiaddr: &Ipv4Addr, interface: &Ipv4Addr) -> io::Result<()> { self.0.leave_multicast_v4(multiaddr, interface) @@ -304,7 +304,7 @@ impl UdpSocket { /// For more information about this option, see /// [`join_multicast_v6`][link]. /// - /// [link]: #tymethod.join_multicast_v6 + /// [link]: #method.join_multicast_v6 #[stable(feature = "net2_mutators", since = "1.9.0")] pub fn leave_multicast_v6(&self, multiaddr: &Ipv6Addr, interface: u32) -> io::Result<()> { self.0.leave_multicast_v6(multiaddr, interface) diff --git a/src/tools/linkchecker/main.rs b/src/tools/linkchecker/main.rs index 19037a2c4d7..db87bc1fce4 100644 --- a/src/tools/linkchecker/main.rs +++ b/src/tools/linkchecker/main.rs @@ -17,9 +17,9 @@ //! actually point to a valid place. //! //! Currently this doesn't actually do any HTML parsing or anything fancy like -//! that, it just has a simple "regex" to search for `href` tags. These values -//! are then translated to file URLs if possible and then the destination is -//! asserted to exist. +//! that, it just has a simple "regex" to search for `href` and `id` tags. +//! These values are then translated to file URLs if possible and then the +//! destination is asserted to exist. //! //! A few whitelisted exceptions are allowed as there's known bugs in rustdoc, //! but this should catch the majority of "broken link" cases. @@ -29,14 +29,16 @@ extern crate url; use std::env; use std::fs::File; use std::io::prelude::*; -use std::path::Path; +use std::path::{Path, PathBuf}; +use std::collections::{HashMap, HashSet}; +use std::collections::hash_map::Entry; use url::{Url, UrlParser}; macro_rules! t { ($e:expr) => (match $e { Ok(e) => e, - Err(e) => panic!("{} failed with {}", stringify!($e), e), + Err(e) => panic!("{} failed with {:?}", stringify!($e), e), }) } @@ -45,44 +47,68 @@ fn main() { let docs = env::current_dir().unwrap().join(docs); let mut url = Url::from_file_path(&docs).unwrap(); let mut errors = false; - walk(&docs, &docs, &mut url, &mut errors); + walk(&mut HashMap::new(), &docs, &docs, &mut url, &mut errors); if errors { panic!("found some broken links"); } } -fn walk(root: &Path, dir: &Path, url: &mut Url, errors: &mut bool) { +#[derive(Debug)] +pub enum LoadError { + IOError(std::io::Error), + BrokenRedirect(PathBuf, std::io::Error), +} + +struct FileEntry { + source: String, + ids: HashSet, +} + +type Cache = HashMap; + +fn walk(cache: &mut Cache, + root: &Path, + dir: &Path, + url: &mut Url, + errors: &mut bool) +{ for entry in t!(dir.read_dir()).map(|e| t!(e)) { let path = entry.path(); let kind = t!(entry.file_type()); url.path_mut().unwrap().push(entry.file_name().into_string().unwrap()); if kind.is_dir() { - walk(root, &path, url, errors); + walk(cache, root, &path, url, errors); } else { - check(root, &path, url, errors); + check(cache, root, &path, url, errors); } url.path_mut().unwrap().pop(); } } -fn check(root: &Path, file: &Path, base: &Url, errors: &mut bool) { +fn check(cache: &mut Cache, + root: &Path, + file: &Path, + base: &Url, + errors: &mut bool) +{ // ignore js files as they are not prone to errors as the rest of the // documentation is and they otherwise bring up false positives. if file.extension().and_then(|s| s.to_str()) == Some("js") { return } - let pretty_file = file.strip_prefix(root).unwrap_or(file); - // Unfortunately we're not 100% full of valid links today to we need a few // whitelists to get this past `make check` today. // FIXME(#32129) - if file.ends_with("std/string/struct.String.html") { + if file.ends_with("std/string/struct.String.html") || + file.ends_with("collections/string/struct.String.html") { return } // FIXME(#32130) if file.ends_with("btree_set/struct.BTreeSet.html") || - file.ends_with("collections/struct.BTreeSet.html") { + file.ends_with("collections/struct.BTreeSet.html") || + file.ends_with("collections/btree_map/struct.BTreeMap.html") || + file.ends_with("collections/hash_map/struct.HashMap.html") { return } @@ -104,15 +130,159 @@ fn check(root: &Path, file: &Path, base: &Url, errors: &mut bool) { let mut parser = UrlParser::new(); parser.base_url(base); - let mut contents = String::new(); - if t!(File::open(file)).read_to_string(&mut contents).is_err() { - return - } + let res = load_file(cache, root, PathBuf::from(file), false, false); + let (pretty_file, contents) = match res { + Ok(res) => res, + Err(_) => return, + }; + + // Search for anything that's the regex 'href[ ]*=[ ]*".*?"' + with_attrs_in_source(&contents, " href", |url, i| { + // Once we've plucked out the URL, parse it using our base url and + // then try to extract a file path. If either of these fail then we + // just keep going. + let (parsed_url, path) = match url_to_file_path(&parser, url) { + Some((url, path)) => (url, PathBuf::from(path)), + None => return, + }; + + // Alright, if we've found a file name then this file had better + // exist! If it doesn't then we register and print an error. + if path.exists() { + if path.is_dir() { + return; + } + let res = load_file(cache, root, path.clone(), true, false); + let (pretty_path, contents) = match res { + Ok(res) => res, + Err(LoadError::IOError(err)) => panic!(format!("{}", err)), + Err(LoadError::BrokenRedirect(target, _)) => { + print!("{}:{}: broken redirect to {}", + pretty_file.display(), i + 1, target.display()); + return; + } + }; + + if let Some(ref fragment) = parsed_url.fragment { + // Fragments like `#1-6` are most likely line numbers to be + // interpreted by javascript, so we're ignoring these + if fragment.splitn(2, '-') + .all(|f| f.chars().all(|c| c.is_numeric())) { + return; + } + + let ids = &mut cache.get_mut(&pretty_path).unwrap().ids; + if ids.is_empty() { + // Search for anything that's the regex 'id[ ]*=[ ]*".*?"' + with_attrs_in_source(&contents, " id", |fragment, i| { + let frag = fragment.trim_left_matches("#").to_owned(); + if !ids.insert(frag) { + *errors = true; + println!("{}:{}: id is not unique: `{}`", + pretty_file.display(), i, fragment); + } + }); + } + if !ids.contains(fragment) { + *errors = true; + print!("{}:{}: broken link fragment ", + pretty_file.display(), i + 1); + println!("`#{}` pointing to `{}`", + fragment, pretty_path.display()); + }; + } + } else { + *errors = true; + print!("{}:{}: broken link - ", pretty_file.display(), i + 1); + let pretty_path = path.strip_prefix(root).unwrap_or(&path); + println!("{}", pretty_path.display()); + } + }); +} + +fn load_file(cache: &mut Cache, + root: &Path, + file: PathBuf, + follow_redirects: bool, + is_redirect: bool) -> Result<(PathBuf, String), LoadError> { + + let mut contents = String::new(); + let pretty_file = PathBuf::from(file.strip_prefix(root).unwrap_or(&file)); + + let maybe_redirect = match cache.entry(pretty_file.clone()) { + Entry::Occupied(entry) => { + contents = entry.get().source.clone(); + None + }, + Entry::Vacant(entry) => { + let mut fp = try!(File::open(file.clone()).map_err(|err| { + if is_redirect { + LoadError::BrokenRedirect(file.clone(), err) + } else { + LoadError::IOError(err) + } + })); + try!(fp.read_to_string(&mut contents) + .map_err(|err| LoadError::IOError(err))); + + let maybe = if follow_redirects { + maybe_redirect(&contents) + } else { + None + }; + if maybe.is_none() { + entry.insert(FileEntry { + source: contents.clone(), + ids: HashSet::new(), + }); + } + maybe + }, + }; + let base = Url::from_file_path(&file).unwrap(); + let mut parser = UrlParser::new(); + parser.base_url(&base); + + match maybe_redirect.and_then(|url| url_to_file_path(&parser, &url)) { + Some((_, redirect_file)) => { + assert!(follow_redirects); + let path = PathBuf::from(redirect_file); + load_file(cache, root, path, follow_redirects, true) + } + None => Ok((pretty_file, contents)) + } +} + +fn maybe_redirect(source: &str) -> Option { + const REDIRECT: &'static str = "

Redirecting to Option<(Url, PathBuf)> { + parser.parse(url).ok().and_then(|parsed_url| { + parsed_url.to_file_path().ok().map(|f| (parsed_url, f)) + }) +} + +fn with_attrs_in_source(contents: &str, + attr: &str, + mut f: F) +{ for (i, mut line) in contents.lines().enumerate() { - // Search for anything that's the regex 'href[ ]*=[ ]*".*?"' - while let Some(j) = line.find(" href") { - let rest = &line[j + 5..]; + while let Some(j) = line.find(attr) { + let rest = &line[j + attr.len() ..]; line = rest; let pos_equals = match rest.find("=") { Some(i) => i, @@ -121,40 +291,24 @@ fn check(root: &Path, file: &Path, base: &Url, errors: &mut bool) { if rest[..pos_equals].trim_left_matches(" ") != "" { continue } + let rest = &rest[pos_equals + 1..]; - let pos_quote = match rest.find("\"").or_else(|| rest.find("'")) { + + let pos_quote = match rest.find(&['"', '\''][..]) { Some(i) => i, None => continue, }; + let quote_delim = rest.as_bytes()[pos_quote] as char; + if rest[..pos_quote].trim_left_matches(" ") != "" { continue } let rest = &rest[pos_quote + 1..]; - let url = match rest.find("\"").or_else(|| rest.find("'")) { + let url = match rest.find(quote_delim) { Some(i) => &rest[..i], None => continue, }; - - // Once we've plucked out the URL, parse it using our base url and - // then try to extract a file path. If either if these fail then we - // just keep going. - let parsed_url = match parser.parse(url) { - Ok(url) => url, - Err(..) => continue, - }; - let path = match parsed_url.to_file_path() { - Ok(path) => path, - Err(..) => continue, - }; - - // Alright, if we've found a file name then this file had better - // exist! If it doesn't then we register and print an error. - if !path.exists() { - *errors = true; - print!("{}:{}: broken link - ", pretty_file.display(), i + 1); - let pretty_path = path.strip_prefix(root).unwrap_or(&path); - println!("{}", pretty_path.display()); - } + f(url, i) } } }