From 39197e673e79ca91ac6f1484adf3aa051696d575 Mon Sep 17 00:00:00 2001
From: Aleksey Kladov <aleksey.kladov@gmail.com>
Date: Mon, 17 Aug 2020 18:43:35 +0200
Subject: [PATCH 1/2] Move doc comment parsing to rustc_lexer

Plain comments are trivial, while doc comments are not, so it feels
like this belongs to the rustc_lexer.

The specific reason to do this is the desire to use rustc_lexer in
rustdoc for syntax highlighting, without duplicating "is this a doc
comment?" logic there.
---
 src/librustc_ast/util/comments.rs       | 48 +-----------
 src/librustc_ast/util/comments/tests.rs |  7 --
 src/librustc_lexer/src/lib.rs           | 35 +++++++--
 src/librustc_parse/lexer/mod.rs         | 97 +++++++++++++++----------
 4 files changed, 91 insertions(+), 96 deletions(-)
diff --git a/src/librustc_ast/util/comments.rs b/src/librustc_ast/util/comments.rs
index a73891db160..e97c8cc4562 100644
--- a/src/librustc_ast/util/comments.rs
+++ b/src/librustc_ast/util/comments.rs
@@ -1,4 +1,3 @@
-use crate::ast::AttrStyle;
 use rustc_span::source_map::SourceMap;
 use rustc_span::{BytePos, CharPos, FileName, Pos, Symbol};
 
@@ -24,45 +23,6 @@ pub struct Comment {
     pub pos: BytePos,
 }
 
-/// For a full line comment string returns its doc comment style if it's a doc comment
-/// and returns `None` if it's a regular comment.
-pub fn line_doc_comment_style(line_comment: &str) -> Option<AttrStyle> {
-    let line_comment = line_comment.as_bytes();
-    assert!(line_comment.starts_with(b"//"));
-    match line_comment.get(2) {
-        // `//!` is an inner line doc comment.
-        Some(b'!') => Some(AttrStyle::Inner),
-        Some(b'/') => match line_comment.get(3) {
-            // `////` (more than 3 slashes) is not considered a doc comment.
-            Some(b'/') => None,
-            // Otherwise `///` is an outer line doc comment.
-            _ => Some(AttrStyle::Outer),
-        },
-        _ => None,
-    }
-}
-
-/// For a full block comment string returns its doc comment style if it's a doc comment
-/// and returns `None` if it's a regular comment.
-pub fn block_doc_comment_style(block_comment: &str, terminated: bool) -> Option<AttrStyle> {
-    let block_comment = block_comment.as_bytes();
-    assert!(block_comment.starts_with(b"/*"));
-    assert!(!terminated || block_comment.ends_with(b"*/"));
-    match block_comment.get(2) {
-        // `/*!` is an inner block doc comment.
-        Some(b'!') => Some(AttrStyle::Inner),
-        Some(b'*') => match block_comment.get(3) {
-            // `/***` (more than 2 stars) is not considered a doc comment.
-            Some(b'*') => None,
-            // `/**/` is not considered a doc comment.
-            Some(b'/') if block_comment.len() == 4 => None,
-            // Otherwise `/**` is an outer block doc comment.
-            _ => Some(AttrStyle::Outer),
-        },
-        _ => None,
-    }
-}
-
 /// Makes a doc string more presentable to users.
 /// Used by rustdoc and perhaps other tools, but not by rustc.
 pub fn beautify_doc_string(data: Symbol) -> String {
@@ -216,8 +176,8 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comme
                     }
                 }
             }
-            rustc_lexer::TokenKind::BlockComment { terminated } => {
-                if block_doc_comment_style(token_text, terminated).is_none() {
+            rustc_lexer::TokenKind::BlockComment { doc_style, .. } => {
+                if doc_style.is_none() {
                     let code_to_the_right = match text[pos + token.len..].chars().next() {
                         Some('\r' | '\n') => false,
                         _ => true,
@@ -238,8 +198,8 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comme
                     comments.push(Comment { style, lines, pos: pos_in_file })
                 }
             }
-            rustc_lexer::TokenKind::LineComment => {
-                if line_doc_comment_style(token_text).is_none() {
+            rustc_lexer::TokenKind::LineComment { doc_style } => {
+                if doc_style.is_none() {
                     comments.push(Comment {
                         style: if code_to_the_left {
                             CommentStyle::Trailing
diff --git a/src/librustc_ast/util/comments/tests.rs b/src/librustc_ast/util/comments/tests.rs
index 1919b9341aa..e19198f863b 100644
--- a/src/librustc_ast/util/comments/tests.rs
+++ b/src/librustc_ast/util/comments/tests.rs
@@ -1,13 +1,6 @@
 use super::*;
 use rustc_span::with_default_session_globals;
 
-#[test]
-fn line_doc_comments() {
-    assert!(line_doc_comment_style("///").is_some());
-    assert!(line_doc_comment_style("/// blah").is_some());
-    assert!(line_doc_comment_style("////").is_none());
-}
-
 #[test]
 fn test_block_doc_comment_1() {
     with_default_session_globals(|| {
diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs
index 7949a232b9b..23c90c4bb6d 100644
--- a/src/librustc_lexer/src/lib.rs
+++ b/src/librustc_lexer/src/lib.rs
@@ -51,12 +51,12 @@ impl Token {
 pub enum TokenKind {
     // Multi-char tokens:
     /// "// comment"
-    LineComment,
+    LineComment { doc_style: Option<DocStyle> },
     /// `/* block comment */`
     ///
     /// Block comments can be recursive, so the sequence like `/* /* */`
     /// will not be considered terminated and will result in a parsing error.
-    BlockComment { terminated: bool },
+    BlockComment { doc_style: Option<DocStyle>, terminated: bool },
     /// Any whitespace characters sequence.
     Whitespace,
     /// "ident" or "continue"
@@ -129,6 +129,12 @@ pub enum TokenKind {
     Unknown,
 }
 
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum DocStyle {
+    Outer,
+    Inner,
+}
+
 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub enum LiteralKind {
     /// "12_u8", "0o100", "0b120i99"
@@ -188,7 +194,7 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
         // a doc comment (due to `TokenKind::(Line,Block)Comment` ambiguity at lexer level),
         // then it may be valid Rust code, so consider it Rust code.
         let next_non_whitespace_token = tokenize(input_tail).map(|tok| tok.kind).find(|tok|
-            !matches!(tok, TokenKind::Whitespace | TokenKind::LineComment | TokenKind::BlockComment { .. })
+            !matches!(tok, TokenKind::Whitespace | TokenKind::LineComment { .. } | TokenKind::BlockComment { .. })
         );
         if next_non_whitespace_token != Some(TokenKind::OpenBracket) {
             // No other choice than to consider this a shebang.
@@ -410,13 +416,32 @@ impl Cursor<'_> {
     fn line_comment(&mut self) -> TokenKind {
         debug_assert!(self.prev() == '/' && self.first() == '/');
         self.bump();
+
+        let doc_style = match self.first() {
+            // `//!` is an inner line doc comment.
+            '!' => Some(DocStyle::Inner),
+            // `////` (more than 3 slashes) is not considered a doc comment.
+            '/' if self.second() != '/' => Some(DocStyle::Outer),
+            _ => None,
+        };
+
         self.eat_while(|c| c != '\n');
-        LineComment
+        LineComment { doc_style }
     }
 
     fn block_comment(&mut self) -> TokenKind {
         debug_assert!(self.prev() == '/' && self.first() == '*');
         self.bump();
+
+        let doc_style = match self.first() {
+            // `/*!` is an inner block doc comment.
+            '!' => Some(DocStyle::Inner),
+            // `/***` (more than 2 stars) is not considered a doc comment.
+            // `/**/` is not considered a doc comment.
+            '*' if !matches!(self.second(), '*' | '/') => Some(DocStyle::Outer),
+            _ => None,
+        };
+
         let mut depth = 1usize;
         while let Some(c) = self.bump() {
             match c {
@@ -438,7 +463,7 @@ impl Cursor<'_> {
             }
         }
 
-        BlockComment { terminated: depth == 0 }
+        BlockComment { doc_style, terminated: depth == 0 }
     }
 
     fn whitespace(&mut self) -> TokenKind {
diff --git a/src/librustc_parse/lexer/mod.rs b/src/librustc_parse/lexer/mod.rs
index 675cfa41f10..a57aace1437 100644
--- a/src/librustc_parse/lexer/mod.rs
+++ b/src/librustc_parse/lexer/mod.rs
@@ -1,5 +1,5 @@
+use rustc_ast::ast::AttrStyle;
 use rustc_ast::token::{self, CommentKind, Token, TokenKind};
-use rustc_ast::util::comments;
 use rustc_data_structures::sync::Lrc;
 use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError};
 use rustc_lexer::Base;
@@ -15,7 +15,7 @@ mod tokentrees;
 mod unescape_error_reporting;
 mod unicode_chars;
 
-use rustc_lexer::unescape::Mode;
+use rustc_lexer::{unescape::Mode, DocStyle};
 use unescape_error_reporting::{emit_unescape_error, push_escaped_char};
 
 #[derive(Clone, Debug)]
@@ -168,25 +168,23 @@ impl<'a> StringReader<'a> {
     /// symbols and runs additional validation.
     fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> TokenKind {
         match token {
-            rustc_lexer::TokenKind::LineComment => {
-                let string = self.str_from(start);
-                if let Some(attr_style) = comments::line_doc_comment_style(string) {
-                    self.forbid_bare_cr(start, string, "bare CR not allowed in doc-comment");
-                    // Opening delimiter of the length 3 is not included into the symbol.
-                    token::DocComment(CommentKind::Line, attr_style, Symbol::intern(&string[3..]))
-                } else {
-                    token::Comment
+            rustc_lexer::TokenKind::LineComment { doc_style } => {
+                match doc_style {
+                    Some(doc_style) => {
+                        // Opening delimiter of the length 3 is not included into the symbol.
+                        let content_start = start + BytePos(3);
+                        let content = self.str_from(content_start);
+
+                        self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
+                    }
+                    None => token::Comment,
                 }
             }
-            rustc_lexer::TokenKind::BlockComment { terminated } => {
-                let string = self.str_from(start);
-                let attr_style = comments::block_doc_comment_style(string, terminated);
-
+            rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
                 if !terminated {
-                    let msg = if attr_style.is_some() {
-                        "unterminated block doc-comment"
-                    } else {
-                        "unterminated block comment"
+                    let msg = match doc_style {
+                        Some(_) => "unterminated block doc-comment",
+                        None => "unterminated block comment",
                     };
                     let last_bpos = self.pos;
                     self.sess
@@ -199,18 +197,17 @@ impl<'a> StringReader<'a> {
                         .emit();
                     FatalError.raise();
                 }
+                match doc_style {
+                    Some(doc_style) => {
+                        // Opening delimiter of the length 3 and closing delimiter of the length 2
+                        // are not included into the symbol.
+                        let content_start = start + BytePos(3);
+                        let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
+                        let content = self.str_from_to(content_start, content_end);
 
-                if let Some(attr_style) = attr_style {
-                    self.forbid_bare_cr(start, string, "bare CR not allowed in block doc-comment");
-                    // Opening delimiter of the length 3 and closing delimiter of the length 2
-                    // are not included into the symbol.
-                    token::DocComment(
-                        CommentKind::Block,
-                        attr_style,
-                        Symbol::intern(&string[3..string.len() - if terminated { 2 } else { 0 }]),
-                    )
-                } else {
-                    token::Comment
+                        self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
+                    }
+                    None => token::Comment,
                 }
             }
             rustc_lexer::TokenKind::Whitespace => token::Whitespace,
@@ -319,6 +316,37 @@ impl<'a> StringReader<'a> {
         }
     }
 
+    fn cook_doc_comment(
+        &self,
+        content_start: BytePos,
+        content: &str,
+        comment_kind: CommentKind,
+        doc_style: DocStyle,
+    ) -> TokenKind {
+        let mut idx = 0;
+        loop {
+            idx = match content[idx..].find('\r') {
+                None => break,
+                Some(it) => idx + it + 1,
+            };
+            self.err_span_(
+                content_start + BytePos(idx as u32 - 1),
+                content_start + BytePos(idx as u32),
+                match comment_kind {
+                    CommentKind::Line => "bare CR not allowed in doc-comment",
+                    CommentKind::Block => "bare CR not allowed in block doc-comment",
+                },
+            );
+        }
+
+        let attr_style = match doc_style {
+            DocStyle::Outer => AttrStyle::Outer,
+            DocStyle::Inner => AttrStyle::Inner,
+        };
+
+        token::DocComment(comment_kind, attr_style, Symbol::intern(content))
+    }
+
     fn cook_lexer_literal(
         &self,
         start: BytePos,
@@ -472,17 +500,6 @@ impl<'a> StringReader<'a> {
         &self.src[self.src_index(start)..self.src_index(end)]
     }
 
-    fn forbid_bare_cr(&self, start: BytePos, s: &str, errmsg: &str) {
-        let mut idx = 0;
-        loop {
-            idx = match s[idx..].find('\r') {
-                None => break,
-                Some(it) => idx + it + 1,
-            };
-            self.err_span_(start + BytePos(idx as u32 - 1), start + BytePos(idx as u32), errmsg);
-        }
-    }
-
     fn report_raw_str_error(&self, start: BytePos, opt_err: Option<RawStrError>) {
         match opt_err {
             Some(RawStrError::InvalidStarter { bad_char }) => {

From ccbe94bf77e6a32fc9f31425bc820345be3143c0 Mon Sep 17 00:00:00 2001
From: Aleksey Kladov <aleksey.kladov@gmail.com>
Date: Mon, 17 Aug 2020 21:52:49 +0200
Subject: [PATCH 2/2] Simplify search for bare `\r` in doc comments

Outer `if` is the fast path -- it calls into hyperoptimized memchr.

The inner loop is just the simplest code possible -- it doesn't
generate the tightest code, but that shouldn't matter if we are going
to error anyhow.
---
 src/librustc_parse/lexer/mod.rs | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/src/librustc_parse/lexer/mod.rs b/src/librustc_parse/lexer/mod.rs
index a57aace1437..60a4dc574cb 100644
--- a/src/librustc_parse/lexer/mod.rs
+++ b/src/librustc_parse/lexer/mod.rs
@@ -323,20 +323,17 @@ impl<'a> StringReader<'a> {
         comment_kind: CommentKind,
         doc_style: DocStyle,
     ) -> TokenKind {
-        let mut idx = 0;
-        loop {
-            idx = match content[idx..].find('\r') {
-                None => break,
-                Some(it) => idx + it + 1,
-            };
-            self.err_span_(
-                content_start + BytePos(idx as u32 - 1),
-                content_start + BytePos(idx as u32),
-                match comment_kind {
-                    CommentKind::Line => "bare CR not allowed in doc-comment",
-                    CommentKind::Block => "bare CR not allowed in block doc-comment",
-                },
-            );
+        if content.contains('\r') {
+            for (idx, _) in content.char_indices().filter(|&(_, c)| c == '\r') {
+                self.err_span_(
+                    content_start + BytePos(idx as u32),
+                    content_start + BytePos(idx as u32 + 1),
+                    match comment_kind {
+                        CommentKind::Line => "bare CR not allowed in doc-comment",
+                        CommentKind::Block => "bare CR not allowed in block doc-comment",
+                    },
+                );
+            }
         }
 
         let attr_style = match doc_style {