From 5817351048b7c817720f696dd6a0f7005bd1a7a4 Mon Sep 17 00:00:00 2001 From: Zack Weinberg <zackw@panix.com> Date: Mon, 13 Feb 2017 12:33:35 -0500 Subject: [PATCH 1/2] tidy: exempt URLs from the line length restriction The length of a URL is usually not under our control, and Markdown provides no way to split a URL in the middle. Therefore, comment lines consisting _solely_ of a URL (possibly with a Markdown link label in front) should be exempt from the line-length restriction. Inline hyperlink destinations ( `[foo](http://...)` notation ) are _not_ exempt, because it is my arrogant opinion that long lines of that type make the source text illegible. The patch adds dependencies on the `regex` and `lazy_static` crates to the tidy utility. This _appears_ to Just Work, but if you would rather not have that dependency I am willing to provide a hand-written parser instead. --- src/tools/tidy/Cargo.toml | 2 ++ src/tools/tidy/src/main.rs | 3 +++ src/tools/tidy/src/style.rs | 33 +++++++++++++++++++++++++++++++-- 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/src/tools/tidy/Cargo.toml b/src/tools/tidy/Cargo.toml index e900bd47fb7..39986d59289 100644 --- a/src/tools/tidy/Cargo.toml +++ b/src/tools/tidy/Cargo.toml @@ -4,3 +4,5 @@ version = "0.1.0" authors = ["Alex Crichton <alex@alexcrichton.com>"] [dependencies] +regex = "*" +lazy_static = "*" diff --git a/src/tools/tidy/src/main.rs b/src/tools/tidy/src/main.rs index 9962c6ec9af..bbd6c8e87c2 100644 --- a/src/tools/tidy/src/main.rs +++ b/src/tools/tidy/src/main.rs @@ -14,6 +14,9 @@ //! etc. This is run by default on `make check` and as part of the auto //! builders. +extern crate regex; +#[macro_use] extern crate lazy_static; + use std::fs; use std::path::{PathBuf, Path}; use std::env; diff --git a/src/tools/tidy/src/style.rs b/src/tools/tidy/src/style.rs index c722eb690b8..91c5edfd75a 100644 --- a/src/tools/tidy/src/style.rs +++ b/src/tools/tidy/src/style.rs @@ -26,6 +26,8 @@ use std::fs::File; use std::io::prelude::*; use std::path::Path; +use regex::Regex; + const COLS: usize = 100; const LICENSE: &'static str = "\ Copyright <year> The Rust Project Developers. See the COPYRIGHT @@ -38,6 +40,32 @@ http://www.apache.org/licenses/LICENSE-2.0> or the MIT license option. This file may not be copied, modified, or distributed except according to those terms."; +/// True if LINE is allowed to be longer than the normal limit. +/// +/// Currently there is only one exception: if the line is within a +/// comment, and its entire text is one URL (possibly with a Markdown +/// link label in front), then it's allowed to be overlength. This is +/// because Markdown offers no way to split a line in the middle of a +/// URL, and the length of URLs for external references is beyond our +/// control. +fn long_line_is_ok(line: &str) -> bool { + lazy_static! { + static ref URL_RE: Regex = Regex::new( + // This regexp uses the CommonMark definition of link + // label. It thinks any sequence of nonwhitespace + // characters beginning with "http://" or "https://" is a + // URL. Add more schemas as necessary. + r"^\s*//[!/]?\s+(?:\[(?:[^\]\\]|\\.){1,999}\]:\s+)?https?://\S+$" + ).unwrap(); + } + + if URL_RE.is_match(line) { + return true; + } + + false +} + pub fn check(path: &Path, bad: &mut bool) { let mut contents = String::new(); super::walk(path, &mut super::filter_dirs, &mut |file| { @@ -61,8 +89,9 @@ pub fn check(path: &Path, bad: &mut bool) { println!("{}:{}: {}", file.display(), i + 1, msg); *bad = true; }; - if line.chars().count() > COLS && !skip_length { - err(&format!("line longer than {} chars", COLS)); + if !skip_length && line.chars().count() > COLS + && !long_line_is_ok(line) { + err(&format!("line longer than {} chars", COLS)); } if line.contains("\t") && !skip_tab { err("tab character"); From ff4758c2a0dffef264fe73b90668bd04b1b2fa89 Mon Sep 17 00:00:00 2001 From: Zack Weinberg <zackw@panix.com> Date: Mon, 13 Feb 2017 15:44:51 -0500 Subject: [PATCH 2/2] Replace regex-based parser for URL lines with open-coded one. --- src/tools/tidy/Cargo.toml | 2 -- src/tools/tidy/src/main.rs | 3 -- src/tools/tidy/src/style.rs | 66 ++++++++++++++++++++++++++----------- 3 files changed, 46 insertions(+), 25 deletions(-) diff --git a/src/tools/tidy/Cargo.toml b/src/tools/tidy/Cargo.toml index 39986d59289..e900bd47fb7 100644 --- a/src/tools/tidy/Cargo.toml +++ b/src/tools/tidy/Cargo.toml @@ -4,5 +4,3 @@ version = "0.1.0" authors = ["Alex Crichton <alex@alexcrichton.com>"] [dependencies] -regex = "*" -lazy_static = "*" diff --git a/src/tools/tidy/src/main.rs b/src/tools/tidy/src/main.rs index bbd6c8e87c2..9962c6ec9af 100644 --- a/src/tools/tidy/src/main.rs +++ b/src/tools/tidy/src/main.rs @@ -14,9 +14,6 @@ //! etc. This is run by default on `make check` and as part of the auto //! builders. -extern crate regex; -#[macro_use] extern crate lazy_static; - use std::fs; use std::path::{PathBuf, Path}; use std::env; diff --git a/src/tools/tidy/src/style.rs b/src/tools/tidy/src/style.rs index 91c5edfd75a..2233f8c3529 100644 --- a/src/tools/tidy/src/style.rs +++ b/src/tools/tidy/src/style.rs @@ -26,8 +26,6 @@ use std::fs::File; use std::io::prelude::*; use std::path::Path; -use regex::Regex; - const COLS: usize = 100; const LICENSE: &'static str = "\ Copyright <year> The Rust Project Developers. See the COPYRIGHT @@ -40,26 +38,54 @@ http://www.apache.org/licenses/LICENSE-2.0> or the MIT license option. This file may not be copied, modified, or distributed except according to those terms."; -/// True if LINE is allowed to be longer than the normal limit. -/// -/// Currently there is only one exception: if the line is within a -/// comment, and its entire text is one URL (possibly with a Markdown -/// link label in front), then it's allowed to be overlength. This is -/// because Markdown offers no way to split a line in the middle of a -/// URL, and the length of URLs for external references is beyond our -/// control. -fn long_line_is_ok(line: &str) -> bool { - lazy_static! { - static ref URL_RE: Regex = Regex::new( - // This regexp uses the CommonMark definition of link - // label. It thinks any sequence of nonwhitespace - // characters beginning with "http://" or "https://" is a - // URL. Add more schemas as necessary. - r"^\s*//[!/]?\s+(?:\[(?:[^\]\\]|\\.){1,999}\]:\s+)?https?://\S+$" - ).unwrap(); +/// Parser states for line_is_url. +#[derive(PartialEq)] +#[allow(non_camel_case_types)] +enum LIUState { EXP_COMMENT_START, + EXP_LINK_LABEL_OR_URL, + EXP_URL, + EXP_END } + +/// True if LINE appears to be a line comment containing an URL, +/// possibly with a Markdown link label in front, and nothing else. +/// The Markdown link label, if present, may not contain whitespace. +/// Lines of this form are allowed to be overlength, because Markdown +/// offers no way to split a line in the middle of a URL, and the lengths +/// of URLs to external references are beyond our control. +fn line_is_url(line: &str) -> bool { + use self::LIUState::*; + let mut state: LIUState = EXP_COMMENT_START; + + for tok in line.split_whitespace() { + match (state, tok) { + (EXP_COMMENT_START, "//") => state = EXP_LINK_LABEL_OR_URL, + (EXP_COMMENT_START, "///") => state = EXP_LINK_LABEL_OR_URL, + (EXP_COMMENT_START, "//!") => state = EXP_LINK_LABEL_OR_URL, + + (EXP_LINK_LABEL_OR_URL, w) + if w.len() >= 4 && w.starts_with("[") && w.ends_with("]:") + => state = EXP_URL, + + (EXP_LINK_LABEL_OR_URL, w) + if w.starts_with("http://") || w.starts_with("https://") + => state = EXP_END, + + (EXP_URL, w) + if w.starts_with("http://") || w.starts_with("https://") + => state = EXP_END, + + (_, _) => return false, + } } - if URL_RE.is_match(line) { + state == EXP_END +} + +/// True if LINE is allowed to be longer than the normal limit. +/// Currently there is only one exception, for long URLs, but more +/// may be added in the future. +fn long_line_is_ok(line: &str) -> bool { + if line_is_url(line) { return true; }