From d2ef1b318d72909d029910389abeabbf13547f01 Mon Sep 17 00:00:00 2001 From: mcarton Date: Mon, 29 May 2017 00:12:43 +0200 Subject: [PATCH] Rewrite `doc_markdown` to use `pulldown-cmark` --- clippy_lints/src/doc.rs | 401 +++++++++++----------------------------- 1 file changed, 113 insertions(+), 288 deletions(-) diff --git a/clippy_lints/src/doc.rs b/clippy_lints/src/doc.rs index 729c3eaf86e..b8f80bb3081 100644 --- a/clippy_lints/src/doc.rs +++ b/clippy_lints/src/doc.rs @@ -1,3 +1,5 @@ +use itertools::Itertools; +use pulldown_cmark; use rustc::lint::*; use syntax::ast; use syntax::codemap::{Span, BytePos}; @@ -53,324 +55,156 @@ impl EarlyLintPass for Doc { } } +struct Parser<'a> { + parser: pulldown_cmark::Parser<'a>, +} + +impl<'a> Parser<'a> { + fn new(parser: pulldown_cmark::Parser<'a>) -> Parser<'a> { + Self { parser } + } +} + +impl<'a> Iterator for Parser<'a> { + type Item = (usize, pulldown_cmark::Event<'a>); + + fn next(&mut self) -> Option { + let offset = self.parser.get_offset(); + self.parser.next().map(|event| (offset, event)) + } +} + /// Cleanup documentation decoration (`///` and such). /// /// We can't use `syntax::attr::AttributeMethods::with_desugared_doc` or /// `syntax::parse::lexer::comments::strip_doc_comment_decoration` because we need to keep track of -/// the span but this function is inspired from the later. +/// the spans but this function is inspired from the later. #[allow(cast_possible_truncation)] -pub fn strip_doc_comment_decoration((comment, span): (String, Span)) -> Vec<(String, Span)> { +pub fn strip_doc_comment_decoration(comment: String, span: Span) -> (String, Vec<(usize, Span)>) { // one-line comments lose their prefix const ONELINERS: &'static [&'static str] = &["///!", "///", "//!", "//"]; for prefix in ONELINERS { if comment.starts_with(*prefix) { - return vec![(comment[prefix.len()..].to_owned(), - Span { lo: span.lo + BytePos(prefix.len() as u32), ..span })]; + let doc = &comment[prefix.len()..]; + let mut doc = doc.to_owned(); + doc.push('\n'); + return ( + doc.to_owned(), + vec![(doc.len(), Span { lo: span.lo + BytePos(prefix.len() as u32), ..span })] + ); } } if comment.starts_with("/*") { - return comment[3..comment.len() - 2] - .lines() - .map(|line| { - let offset = line.as_ptr() as usize - comment.as_ptr() as usize; - debug_assert_eq!(offset as u32 as usize, offset); + let doc = &comment[3..comment.len() - 2]; + let mut sizes = vec![]; - (line.to_owned(), Span { lo: span.lo + BytePos(offset as u32), ..span }) - }) - .collect(); + for line in doc.lines() { + let offset = line.as_ptr() as usize - comment.as_ptr() as usize; + debug_assert_eq!(offset as u32 as usize, offset); + + sizes.push((line.len(), Span { lo: span.lo + BytePos(offset as u32), ..span })); + } + + return (doc.to_string(), sizes); } panic!("not a doc-comment: {}", comment); } pub fn check_attrs<'a>(cx: &EarlyContext, valid_idents: &[String], attrs: &'a [ast::Attribute]) { - let mut docs = vec![]; + let mut doc = String::new(); + let mut spans = vec![]; for attr in attrs { if attr.is_sugared_doc { - if let Some(ref doc) = attr.value_str() { - let doc = doc.to_string(); - docs.extend_from_slice(&strip_doc_comment_decoration((doc, attr.span))); + if let Some(ref current) = attr.value_str() { + let current = current.to_string(); + let (current, current_spans) = strip_doc_comment_decoration(current, attr.span); + spans.extend_from_slice(¤t_spans); + doc.push_str(¤t); } } } - if !docs.is_empty() { - let _ = check_doc(cx, valid_idents, &docs); + let mut current = 0; + for &mut (ref mut offset, _) in &mut spans { + let offset_copy = *offset; + *offset = current; + current += offset_copy; + } + + println!("{:?}", spans); + if !doc.is_empty() { + let parser = Parser::new(pulldown_cmark::Parser::new(&doc)); + let parser = parser.coalesce(|x, y| { + use pulldown_cmark::Event::*; + + let x_offset = x.0; + let y_offset = y.0; + + match (x.1, y.1) { + (Text(x), Text(y)) => Ok((x_offset, Text((x.into_owned() + &y).into()))), + (x, y) => Err(((x_offset, x), (y_offset, y))), + } + }); + check_doc(cx, valid_idents, parser, &spans); } } -#[allow(while_let_loop)] // #362 -fn check_doc(cx: &EarlyContext, valid_idents: &[String], docs: &[(String, Span)]) -> Result<(), ()> { - // In markdown, `_` can be used to emphasize something, or, is a raw `_` depending on context. - // There really is no markdown specification that would disambiguate this properly. This is - // what GitHub and Rustdoc do: - // - // foo_bar test_quz → foo_bar test_quz - // foo_bar_baz → foo_bar_baz (note that the “official” spec says this should be emphasized) - // _foo bar_ test_quz_ → foo bar test_quz_ - // \_foo bar\_ → _foo bar_ - // (_baz_) → (baz) - // foo _ bar _ baz → foo _ bar _ baz +fn check_doc<'a, Events: Iterator)>>( + cx: &EarlyContext, + valid_idents: &[String], + docs: Events, + spans: &[(usize, Span)] +) { + use pulldown_cmark::Event::*; + use pulldown_cmark::Tag::*; - /// Character that can appear in a path - fn is_path_char(c: char) -> bool { - match c { - t if t.is_alphanumeric() => true, - ':' | '_' => true, - _ => false, - } - } + let mut in_code = false; - #[derive(Clone, Debug)] - /// This type is used to iterate through the documentation characters, keeping the span at the - /// same time. - struct Parser<'a> { - /// First byte of the current potential match - current_word_begin: usize, - /// List of lines and their associated span - docs: &'a [(String, Span)], - /// Index of the current line we are parsing - line: usize, - /// Whether we are in a link - link: bool, - /// Whether we are at the beginning of a line - new_line: bool, - /// Whether we were to the end of a line last time `next` was called - reset: bool, - /// The position of the current character within the current line - pos: usize, - } + for (offset, event) in docs { + println!("{:?}, {:?}", offset, event); + match event { + Start(CodeBlock(_)) | Start(Code) => in_code = true, + End(CodeBlock(_)) | End(Code) => in_code = false, + Start(_tag) | End(_tag) => (), // We don't care about other tags + Html(_html) | InlineHtml(_html) => (), // HTML is weird, just ignore it + FootnoteReference(footnote) => (), // TODO + SoftBreak => (), + HardBreak => (), + Text(text) => { + if !in_code { + let index = match spans.binary_search_by(|c| c.0.cmp(&offset)) { + Ok(o) => o, + Err(e) => e-1, + }; - impl<'a> Parser<'a> { - fn advance_begin(&mut self) { - self.current_word_begin = self.pos; - } - - fn line(&self) -> (&'a str, Span) { - let (ref doc, span) = self.docs[self.line]; - (doc, span) - } - - fn peek(&self) -> Option { - self.line().0[self.pos..].chars().next() - } - - #[allow(while_let_on_iterator)] // borrowck complains about for - fn jump_to(&mut self, n: char) -> Result { - while let Some((new_line, c)) = self.next() { - if c == n { - self.advance_begin(); - return Ok(new_line); + let (_, span) = spans[index]; + check_text(cx, valid_idents, &text, span); } - } - - Err(()) - } - - fn next_line(&mut self) { - self.pos = 0; - self.current_word_begin = 0; - self.line += 1; - self.new_line = true; - } - - fn put_back(&mut self, c: char) { - self.pos -= c.len_utf8(); - } - - #[allow(cast_possible_truncation)] - fn word(&self) -> (&'a str, Span) { - let begin = self.current_word_begin; - let end = self.pos; - - debug_assert_eq!(end as u32 as usize, end); - debug_assert_eq!(begin as u32 as usize, begin); - - let (doc, mut span) = self.line(); - span.hi = span.lo + BytePos(end as u32); - span.lo = span.lo + BytePos(begin as u32); - - (&doc[begin..end], span) - } - } - - impl<'a> Iterator for Parser<'a> { - type Item = (bool, char); - - fn next(&mut self) -> Option<(bool, char)> { - if self.line < self.docs.len() { - if self.reset { - self.line += 1; - self.reset = false; - self.pos = 0; - self.current_word_begin = 0; - } - - let mut chars = self.line().0[self.pos..].chars(); - let c = chars.next(); - - if let Some(c) = c { - self.pos += c.len_utf8(); - let new_line = self.new_line; - self.new_line = c == '\n' || (self.new_line && c.is_whitespace()); - Some((new_line, c)) - } else if self.line == self.docs.len() - 1 { - None - } else { - self.new_line = true; - self.reset = true; - self.pos += 1; - Some((true, '\n')) - } - } else { - None - } - } - } - - let mut parser = Parser { - current_word_begin: 0, - docs: docs, - line: 0, - link: false, - new_line: true, - reset: false, - pos: 0, - }; - - /// Check for fanced code block. - macro_rules! check_block { - ($parser:expr, $c:tt, $new_line:expr) => {{ - check_block!($parser, $c, $c, $new_line) - }}; - - ($parser:expr, $c:pat, $c_expr:expr, $new_line:expr) => {{ - fn check_block(parser: &mut Parser, new_line: bool) -> Result { - if new_line { - let mut lookup_parser = parser.clone(); - if let (Some((false, $c)), Some((false, $c))) = (lookup_parser.next(), lookup_parser.next()) { - *parser = lookup_parser; - // 3 or more ` or ~ open a code block to be closed with the same number of ` or ~ - let mut open_count = 3; - while let Some((false, $c)) = parser.next() { - open_count += 1; - } - - loop { - loop { - if try!(parser.jump_to($c_expr)) { - break; - } - } - - lookup_parser = parser.clone(); - let a = lookup_parser.next(); - let b = lookup_parser.next(); - if let (Some((false, $c)), Some((false, $c))) = (a, b) { - let mut close_count = 3; - while let Some((false, $c)) = lookup_parser.next() { - close_count += 1; - } - - if close_count == open_count { - *parser = lookup_parser; - return Ok(true); - } - } - } - } - } - - Ok(false) - } - - check_block(&mut $parser, $new_line) - }}; - } - - loop { - match parser.next() { - Some((new_line, c)) => { - match c { - '#' if new_line => { - // don’t warn on titles - parser.next_line(); - }, - '`' => { - if try!(check_block!(parser, '`', new_line)) { - continue; - } - - // not a code block, just inline code - try!(parser.jump_to('`')); - }, - '~' => { - if try!(check_block!(parser, '~', new_line)) { - continue; - } - - // ~ does not introduce inline code, but two of them introduce - // strikethrough. Too bad for the consistency but we don't care about - // strikethrough. - }, - '[' => { - // Check for a reference definition `[foo]:` at the beginning of a line - let mut link = true; - - if new_line { - let mut lookup_parser = parser.clone(); - if lookup_parser.any(|(_, c)| c == ']') { - if let Some((_, ':')) = lookup_parser.next() { - lookup_parser.next_line(); - parser = lookup_parser; - link = false; - } - } - } - - parser.advance_begin(); - parser.link = link; - }, - ']' if parser.link => { - parser.link = false; - - match parser.peek() { - Some('(') => { - try!(parser.jump_to(')')); - }, - Some('[') => { - try!(parser.jump_to(']')); - }, - Some(_) => continue, - None => return Err(()), - } - }, - c if !is_path_char(c) => { - parser.advance_begin(); - }, - _ => { - if let Some((_, c)) = parser.find(|&(_, c)| !is_path_char(c)) { - parser.put_back(c); - } - - let (word, span) = parser.word(); - check_word(cx, valid_idents, word, span); - parser.advance_begin(); - }, - } - }, - None => break, } } - - Ok(()) } -fn check_word(cx: &EarlyContext, valid_idents: &[String], word: &str, span: Span) { - /// Checks if a string a camel-case, ie. contains at least two uppercase letter (`Clippy` is +fn check_text(cx: &EarlyContext, valid_idents: &[String], text: &str, span: Span) { + for word in text.split_whitespace() { + // Trim punctuation as in `some comment (see foo::bar).` + // ^^ + // Or even as in `_foo bar_` which is emphasized. + let word = word.trim_matches(|c: char| !c.is_alphanumeric()); + + if valid_idents.iter().any(|i| i == word) { + continue; + } + + check_word(cx, word, span); + } +} + +fn check_word(cx: &EarlyContext, word: &str, span: Span) { + /// Checks if a string is camel-case, ie. contains at least two uppercase letter (`Clippy` is /// ok) and one lower-case letter (`NASA` is ok). Plural are also excluded (`IDs` is ok). fn is_camel_case(s: &str) -> bool { if s.starts_with(|c: char| c.is_digit(10)) { @@ -391,15 +225,6 @@ fn check_word(cx: &EarlyContext, valid_idents: &[String], word: &str, span: Span s != "_" && !s.contains("\\_") && s.contains('_') } - // Trim punctuation as in `some comment (see foo::bar).` - // ^^ - // Or even as in `_foo bar_` which is emphasized. - let word = word.trim_matches(|c: char| !c.is_alphanumeric()); - - if valid_idents.iter().any(|i| i == word) { - return; - } - if has_underscore(word) || word.contains("::") || is_camel_case(word) { span_lint(cx, DOC_MARKDOWN,