From d2ef1b318d72909d029910389abeabbf13547f01 Mon Sep 17 00:00:00 2001
From: mcarton <cartonmartin+git@gmail.com>
Date: Mon, 29 May 2017 00:12:43 +0200
Subject: [PATCH] Rewrite `doc_markdown` to use `pulldown-cmark`

---
 clippy_lints/src/doc.rs | 401 +++++++++++-----------------------------
 1 file changed, 113 insertions(+), 288 deletions(-)

diff --git a/clippy_lints/src/doc.rs b/clippy_lints/src/doc.rs
index 729c3eaf86e..b8f80bb3081 100644
--- a/clippy_lints/src/doc.rs
+++ b/clippy_lints/src/doc.rs
@@ -1,3 +1,5 @@
+use itertools::Itertools;
+use pulldown_cmark;
 use rustc::lint::*;
 use syntax::ast;
 use syntax::codemap::{Span, BytePos};
@@ -53,324 +55,156 @@ impl EarlyLintPass for Doc {
     }
 }
 
+struct Parser<'a> {
+    parser: pulldown_cmark::Parser<'a>,
+}
+
+impl<'a> Parser<'a> {
+    fn new(parser: pulldown_cmark::Parser<'a>) -> Parser<'a> {
+        Self { parser }
+    }
+}
+
+impl<'a> Iterator for Parser<'a> {
+    type Item = (usize, pulldown_cmark::Event<'a>);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let offset = self.parser.get_offset();
+        self.parser.next().map(|event| (offset, event))
+    }
+}
+
 /// Cleanup documentation decoration (`///` and such).
 ///
 /// We can't use `syntax::attr::AttributeMethods::with_desugared_doc` or
 /// `syntax::parse::lexer::comments::strip_doc_comment_decoration` because we need to keep track of
-/// the span but this function is inspired from the later.
+/// the spans but this function is inspired from the later.
 #[allow(cast_possible_truncation)]
-pub fn strip_doc_comment_decoration((comment, span): (String, Span)) -> Vec<(String, Span)> {
+pub fn strip_doc_comment_decoration(comment: String, span: Span) -> (String, Vec<(usize, Span)>) {
     // one-line comments lose their prefix
     const ONELINERS: &'static [&'static str] = &["///!", "///", "//!", "//"];
     for prefix in ONELINERS {
         if comment.starts_with(*prefix) {
-            return vec![(comment[prefix.len()..].to_owned(),
-                         Span { lo: span.lo + BytePos(prefix.len() as u32), ..span })];
+            let doc = &comment[prefix.len()..];
+            let mut doc = doc.to_owned();
+            doc.push('\n');
+            return (
+                doc.to_owned(),
+                vec![(doc.len(), Span { lo: span.lo + BytePos(prefix.len() as u32), ..span })]
+            );
         }
     }
 
     if comment.starts_with("/*") {
-        return comment[3..comment.len() - 2]
-            .lines()
-            .map(|line| {
-                let offset = line.as_ptr() as usize - comment.as_ptr() as usize;
-                debug_assert_eq!(offset as u32 as usize, offset);
+        let doc = &comment[3..comment.len() - 2];
+        let mut sizes = vec![];
 
-                (line.to_owned(), Span { lo: span.lo + BytePos(offset as u32), ..span })
-            })
-            .collect();
+        for line in doc.lines() {
+            let offset = line.as_ptr() as usize - comment.as_ptr() as usize;
+            debug_assert_eq!(offset as u32 as usize, offset);
+
+            sizes.push((line.len(), Span { lo: span.lo + BytePos(offset as u32), ..span }));
+        }
+
+        return (doc.to_string(), sizes);
     }
 
     panic!("not a doc-comment: {}", comment);
 }
 
 pub fn check_attrs<'a>(cx: &EarlyContext, valid_idents: &[String], attrs: &'a [ast::Attribute]) {
-    let mut docs = vec![];
+    let mut doc = String::new();
+    let mut spans = vec![];
 
     for attr in attrs {
         if attr.is_sugared_doc {
-            if let Some(ref doc) = attr.value_str() {
-                let doc = doc.to_string();
-                docs.extend_from_slice(&strip_doc_comment_decoration((doc, attr.span)));
+            if let Some(ref current) = attr.value_str() {
+                let current = current.to_string();
+                let (current, current_spans) = strip_doc_comment_decoration(current, attr.span);
+                spans.extend_from_slice(&current_spans);
+                doc.push_str(&current);
             }
         }
     }
 
-    if !docs.is_empty() {
-        let _ = check_doc(cx, valid_idents, &docs);
+    let mut current = 0;
+    for &mut (ref mut offset, _) in &mut spans {
+        let offset_copy = *offset;
+        *offset = current;
+        current += offset_copy;
+    }
+
+    println!("{:?}", spans);
+    if !doc.is_empty() {
+        let parser = Parser::new(pulldown_cmark::Parser::new(&doc));
+        let parser = parser.coalesce(|x, y| {
+            use pulldown_cmark::Event::*;
+
+            let x_offset = x.0;
+            let y_offset = y.0;
+
+            match (x.1, y.1) {
+                (Text(x), Text(y)) => Ok((x_offset, Text((x.into_owned() + &y).into()))),
+                (x, y) => Err(((x_offset, x), (y_offset, y))),
+            }
+        });
+        check_doc(cx, valid_idents, parser, &spans);
     }
 }
 
-#[allow(while_let_loop)] // #362
-fn check_doc(cx: &EarlyContext, valid_idents: &[String], docs: &[(String, Span)]) -> Result<(), ()> {
-    // In markdown, `_` can be used to emphasize something, or, is a raw `_` depending on context.
-    // There really is no markdown specification that would disambiguate this properly. This is
-    // what GitHub and Rustdoc do:
-    //
-    // foo_bar test_quz    → foo_bar test_quz
-    // foo_bar_baz         → foo_bar_baz (note that the “official” spec says this should be emphasized)
-    // _foo bar_ test_quz_ → <em>foo bar</em> test_quz_
-    // \_foo bar\_         → _foo bar_
-    // (_baz_)             → (<em>baz</em>)
-    // foo _ bar _ baz     → foo _ bar _ baz
+fn check_doc<'a, Events: Iterator<Item=(usize, pulldown_cmark::Event<'a>)>>(
+    cx: &EarlyContext,
+    valid_idents: &[String],
+    docs: Events,
+    spans: &[(usize, Span)]
+) {
+    use pulldown_cmark::Event::*;
+    use pulldown_cmark::Tag::*;
 
-    /// Character that can appear in a path
-    fn is_path_char(c: char) -> bool {
-        match c {
-            t if t.is_alphanumeric() => true,
-            ':' | '_' => true,
-            _ => false,
-        }
-    }
+    let mut in_code = false;
 
-    #[derive(Clone, Debug)]
-    /// This type is used to iterate through the documentation characters, keeping the span at the
-    /// same time.
-    struct Parser<'a> {
-        /// First byte of the current potential match
-        current_word_begin: usize,
-        /// List of lines and their associated span
-        docs: &'a [(String, Span)],
-        /// Index of the current line we are parsing
-        line: usize,
-        /// Whether we are in a link
-        link: bool,
-        /// Whether we are at the beginning of a line
-        new_line: bool,
-        /// Whether we were to the end of a line last time `next` was called
-        reset: bool,
-        /// The position of the current character within the current line
-        pos: usize,
-    }
+    for (offset, event) in docs {
+        println!("{:?}, {:?}", offset, event);
+        match event {
+            Start(CodeBlock(_)) | Start(Code) => in_code = true,
+            End(CodeBlock(_)) | End(Code) => in_code = false,
+            Start(_tag) | End(_tag) => (), // We don't care about other tags
+            Html(_html) | InlineHtml(_html) => (), // HTML is weird, just ignore it
+            FootnoteReference(footnote) => (), // TODO
+            SoftBreak => (),
+            HardBreak => (),
+            Text(text) => {
+                if !in_code {
+                    let index = match spans.binary_search_by(|c| c.0.cmp(&offset)) {
+                        Ok(o) => o,
+                        Err(e) => e-1,
+                    };
 
-    impl<'a> Parser<'a> {
-        fn advance_begin(&mut self) {
-            self.current_word_begin = self.pos;
-        }
-
-        fn line(&self) -> (&'a str, Span) {
-            let (ref doc, span) = self.docs[self.line];
-            (doc, span)
-        }
-
-        fn peek(&self) -> Option<char> {
-            self.line().0[self.pos..].chars().next()
-        }
-
-        #[allow(while_let_on_iterator)] // borrowck complains about for
-        fn jump_to(&mut self, n: char) -> Result<bool, ()> {
-            while let Some((new_line, c)) = self.next() {
-                if c == n {
-                    self.advance_begin();
-                    return Ok(new_line);
+                    let (_, span) = spans[index];
+                    check_text(cx, valid_idents, &text, span);
                 }
-            }
-
-            Err(())
-        }
-
-        fn next_line(&mut self) {
-            self.pos = 0;
-            self.current_word_begin = 0;
-            self.line += 1;
-            self.new_line = true;
-        }
-
-        fn put_back(&mut self, c: char) {
-            self.pos -= c.len_utf8();
-        }
-
-        #[allow(cast_possible_truncation)]
-        fn word(&self) -> (&'a str, Span) {
-            let begin = self.current_word_begin;
-            let end = self.pos;
-
-            debug_assert_eq!(end as u32 as usize, end);
-            debug_assert_eq!(begin as u32 as usize, begin);
-
-            let (doc, mut span) = self.line();
-            span.hi = span.lo + BytePos(end as u32);
-            span.lo = span.lo + BytePos(begin as u32);
-
-            (&doc[begin..end], span)
-        }
-    }
-
-    impl<'a> Iterator for Parser<'a> {
-        type Item = (bool, char);
-
-        fn next(&mut self) -> Option<(bool, char)> {
-            if self.line < self.docs.len() {
-                if self.reset {
-                    self.line += 1;
-                    self.reset = false;
-                    self.pos = 0;
-                    self.current_word_begin = 0;
-                }
-
-                let mut chars = self.line().0[self.pos..].chars();
-                let c = chars.next();
-
-                if let Some(c) = c {
-                    self.pos += c.len_utf8();
-                    let new_line = self.new_line;
-                    self.new_line = c == '\n' || (self.new_line && c.is_whitespace());
-                    Some((new_line, c))
-                } else if self.line == self.docs.len() - 1 {
-                    None
-                } else {
-                    self.new_line = true;
-                    self.reset = true;
-                    self.pos += 1;
-                    Some((true, '\n'))
-                }
-            } else {
-                None
-            }
-        }
-    }
-
-    let mut parser = Parser {
-        current_word_begin: 0,
-        docs: docs,
-        line: 0,
-        link: false,
-        new_line: true,
-        reset: false,
-        pos: 0,
-    };
-
-    /// Check for fanced code block.
-    macro_rules! check_block {
-        ($parser:expr, $c:tt, $new_line:expr) => {{
-            check_block!($parser, $c, $c, $new_line)
-        }};
-
-        ($parser:expr, $c:pat, $c_expr:expr, $new_line:expr) => {{
-            fn check_block(parser: &mut Parser, new_line: bool) -> Result<bool, ()> {
-                if new_line {
-                    let mut lookup_parser = parser.clone();
-                    if let (Some((false, $c)), Some((false, $c))) = (lookup_parser.next(), lookup_parser.next()) {
-                        *parser = lookup_parser;
-                        // 3 or more ` or ~ open a code block to be closed with the same number of ` or ~
-                        let mut open_count = 3;
-                        while let Some((false, $c)) = parser.next() {
-                            open_count += 1;
-                        }
-
-                        loop {
-                            loop {
-                                if try!(parser.jump_to($c_expr)) {
-                                    break;
-                                }
-                            }
-
-                            lookup_parser = parser.clone();
-                            let a = lookup_parser.next();
-                            let b = lookup_parser.next();
-                            if let (Some((false, $c)), Some((false, $c))) = (a, b) {
-                                let mut close_count = 3;
-                                while let Some((false, $c)) = lookup_parser.next() {
-                                    close_count += 1;
-                                }
-
-                                if close_count == open_count {
-                                    *parser = lookup_parser;
-                                    return Ok(true);
-                                }
-                            }
-                        }
-                    }
-                }
-
-                Ok(false)
-            }
-
-            check_block(&mut $parser, $new_line)
-        }};
-    }
-
-    loop {
-        match parser.next() {
-            Some((new_line, c)) => {
-                match c {
-                    '#' if new_line => {
-                        // don’t warn on titles
-                        parser.next_line();
-                    },
-                    '`' => {
-                        if try!(check_block!(parser, '`', new_line)) {
-                            continue;
-                        }
-
-                        // not a code block, just inline code
-                        try!(parser.jump_to('`'));
-                    },
-                    '~' => {
-                        if try!(check_block!(parser, '~', new_line)) {
-                            continue;
-                        }
-
-                        // ~ does not introduce inline code, but two of them introduce
-                        // strikethrough. Too bad for the consistency but we don't care about
-                        // strikethrough.
-                    },
-                    '[' => {
-                        // Check for a reference definition `[foo]:` at the beginning of a line
-                        let mut link = true;
-
-                        if new_line {
-                            let mut lookup_parser = parser.clone();
-                            if lookup_parser.any(|(_, c)| c == ']') {
-                                if let Some((_, ':')) = lookup_parser.next() {
-                                    lookup_parser.next_line();
-                                    parser = lookup_parser;
-                                    link = false;
-                                }
-                            }
-                        }
-
-                        parser.advance_begin();
-                        parser.link = link;
-                    },
-                    ']' if parser.link => {
-                        parser.link = false;
-
-                        match parser.peek() {
-                            Some('(') => {
-                                try!(parser.jump_to(')'));
-                            },
-                            Some('[') => {
-                                try!(parser.jump_to(']'));
-                            },
-                            Some(_) => continue,
-                            None => return Err(()),
-                        }
-                    },
-                    c if !is_path_char(c) => {
-                        parser.advance_begin();
-                    },
-                    _ => {
-                        if let Some((_, c)) = parser.find(|&(_, c)| !is_path_char(c)) {
-                            parser.put_back(c);
-                        }
-
-                        let (word, span) = parser.word();
-                        check_word(cx, valid_idents, word, span);
-                        parser.advance_begin();
-                    },
-                }
-
             },
-            None => break,
         }
     }
-
-    Ok(())
 }
 
-fn check_word(cx: &EarlyContext, valid_idents: &[String], word: &str, span: Span) {
-    /// Checks if a string a camel-case, ie. contains at least two uppercase letter (`Clippy` is
+fn check_text(cx: &EarlyContext, valid_idents: &[String], text: &str, span: Span) {
+    for word in text.split_whitespace() {
+        // Trim punctuation as in `some comment (see foo::bar).`
+        //                                                   ^^
+        // Or even as in `_foo bar_` which is emphasized.
+        let word = word.trim_matches(|c: char| !c.is_alphanumeric());
+
+        if valid_idents.iter().any(|i| i == word) {
+            continue;
+        }
+
+        check_word(cx, word, span);
+    }
+}
+
+fn check_word(cx: &EarlyContext, word: &str, span: Span) {
+    /// Checks if a string is camel-case, ie. contains at least two uppercase letter (`Clippy` is
     /// ok) and one lower-case letter (`NASA` is ok). Plural are also excluded (`IDs` is ok).
     fn is_camel_case(s: &str) -> bool {
         if s.starts_with(|c: char| c.is_digit(10)) {
@@ -391,15 +225,6 @@ fn check_word(cx: &EarlyContext, valid_idents: &[String], word: &str, span: Span
         s != "_" && !s.contains("\\_") && s.contains('_')
     }
 
-    // Trim punctuation as in `some comment (see foo::bar).`
-    //                                                   ^^
-    // Or even as in `_foo bar_` which is emphasized.
-    let word = word.trim_matches(|c: char| !c.is_alphanumeric());
-
-    if valid_idents.iter().any(|i| i == word) {
-        return;
-    }
-
     if has_underscore(word) || word.contains("::") || is_camel_case(word) {
         span_lint(cx,
                   DOC_MARKDOWN,