Auto merge of #75642 - matklad:lexer-comments, r=petrochenkov

Move doc comment parsing to rustc_lexer. Plain comments are trivia, while doc comments are not, so it feels like this logic belongs in rustc_lexer. The specific reason to do this is the desire to use rustc_lexer in rustdoc for syntax highlighting without duplicating the "is this a doc comment?" logic there. r? @ghost
2020-08-21 06:05:39 +00:00 · 2020-08-21 06:05:39 +00:00 · b51651ae9d
parent ff5e0f1dc8 ccbe94bf77
commit b51651ae9d
4 changed files with 88 additions and 96 deletions
--- a/src/librustc_ast/util/comments.rs
+++ b/src/librustc_ast/util/comments.rs
@ -1,4 +1,3 @@
-use crate::ast::AttrStyle;
 use rustc_span::source_map::SourceMap;
 use rustc_span::{BytePos, CharPos, FileName, Pos, Symbol};

@ -24,45 +23,6 @@ pub struct Comment {
    pub pos: BytePos,
 }

-/// For a full line comment string returns its doc comment style if it's a doc comment
-/// and returns `None` if it's a regular comment.
-pub fn line_doc_comment_style(line_comment: &str) -> Option<AttrStyle> {
-    let line_comment = line_comment.as_bytes();
-    assert!(line_comment.starts_with(b"//"));
-    match line_comment.get(2) {
-        // `//!` is an inner line doc comment.
-        Some(b'!') => Some(AttrStyle::Inner),
-        Some(b'/') => match line_comment.get(3) {
-            // `////` (more than 3 slashes) is not considered a doc comment.
-            Some(b'/') => None,
-            // Otherwise `///` is an outer line doc comment.
-            _ => Some(AttrStyle::Outer),
-        },
-        _ => None,
-    }
-}
-
-/// For a full block comment string returns its doc comment style if it's a doc comment
-/// and returns `None` if it's a regular comment.
-pub fn block_doc_comment_style(block_comment: &str, terminated: bool) -> Option<AttrStyle> {
-    let block_comment = block_comment.as_bytes();
-    assert!(block_comment.starts_with(b"/*"));
-    assert!(!terminated || block_comment.ends_with(b"*/"));
-    match block_comment.get(2) {
-        // `/*!` is an inner block doc comment.
-        Some(b'!') => Some(AttrStyle::Inner),
-        Some(b'*') => match block_comment.get(3) {
-            // `/***` (more than 2 stars) is not considered a doc comment.
-            Some(b'*') => None,
-            // `/**/` is not considered a doc comment.
-            Some(b'/') if block_comment.len() == 4 => None,
-            // Otherwise `/**` is an outer block doc comment.
-            _ => Some(AttrStyle::Outer),
-        },
-        _ => None,
-    }
-}
-
 /// Makes a doc string more presentable to users.
 /// Used by rustdoc and perhaps other tools, but not by rustc.
 pub fn beautify_doc_string(data: Symbol) -> String {
@ -216,8 +176,8 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comme
                    }
                }
            }
-            rustc_lexer::TokenKind::BlockComment { terminated } => {
-                if block_doc_comment_style(token_text, terminated).is_none() {
+            rustc_lexer::TokenKind::BlockComment { doc_style, .. } => {
+                if doc_style.is_none() {
                    let code_to_the_right = match text[pos + token.len..].chars().next() {
                        Some('\r' | '\n') => false,
                        _ => true,
@ -238,8 +198,8 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comme
                    comments.push(Comment { style, lines, pos: pos_in_file })
                }
            }
-            rustc_lexer::TokenKind::LineComment => {
-                if line_doc_comment_style(token_text).is_none() {
+            rustc_lexer::TokenKind::LineComment { doc_style } => {
+                if doc_style.is_none() {
                    comments.push(Comment {
                        style: if code_to_the_left {
                            CommentStyle::Trailing
--- a/src/librustc_ast/util/comments/tests.rs
+++ b/src/librustc_ast/util/comments/tests.rs
@ -1,13 +1,6 @@
 use super::*;
 use rustc_span::with_default_session_globals;

-#[test]
-fn line_doc_comments() {
-    assert!(line_doc_comment_style("///").is_some());
-    assert!(line_doc_comment_style("/// blah").is_some());
-    assert!(line_doc_comment_style("////").is_none());
-}
-
 #[test]
 fn test_block_doc_comment_1() {
    with_default_session_globals(|| {
--- a/src/librustc_lexer/src/lib.rs
+++ b/src/librustc_lexer/src/lib.rs
@ -51,12 +51,12 @@ impl Token {
 pub enum TokenKind {
    // Multi-char tokens:
    /// "// comment"
-    LineComment,
+    LineComment { doc_style: Option<DocStyle> },
    /// `/* block comment */`
    ///
    /// Block comments can be recursive, so the sequence like `/* /* */`
    /// will not be considered terminated and will result in a parsing error.
-    BlockComment { terminated: bool },
+    BlockComment { doc_style: Option<DocStyle>, terminated: bool },
    /// Any whitespace characters sequence.
    Whitespace,
    /// "ident" or "continue"
@ -129,6 +129,12 @@ pub enum TokenKind {
    Unknown,
 }

+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum DocStyle {
+    Outer,
+    Inner,
+}
+
 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub enum LiteralKind {
    /// "12_u8", "0o100", "0b120i99"
@ -188,7 +194,7 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
        // a doc comment (due to `TokenKind::(Line,Block)Comment` ambiguity at lexer level),
        // then it may be valid Rust code, so consider it Rust code.
        let next_non_whitespace_token = tokenize(input_tail).map(|tok| tok.kind).find(|tok|
-            !matches!(tok, TokenKind::Whitespace | TokenKind::LineComment | TokenKind::BlockComment { .. })
+            !matches!(tok, TokenKind::Whitespace | TokenKind::LineComment { .. } | TokenKind::BlockComment { .. })
        );
        if next_non_whitespace_token != Some(TokenKind::OpenBracket) {
            // No other choice than to consider this a shebang.
@ -410,13 +416,32 @@ impl Cursor<'_> {
    fn line_comment(&mut self) -> TokenKind {
        debug_assert!(self.prev() == '/' && self.first() == '/');
        self.bump();
+
+        let doc_style = match self.first() {
+            // `//!` is an inner line doc comment.
+            '!' => Some(DocStyle::Inner),
+            // `////` (more than 3 slashes) is not considered a doc comment.
+            '/' if self.second() != '/' => Some(DocStyle::Outer),
+            _ => None,
+        };
+
        self.eat_while(|c| c != '\n');
-        LineComment
+        LineComment { doc_style }
    }

    fn block_comment(&mut self) -> TokenKind {
        debug_assert!(self.prev() == '/' && self.first() == '*');
        self.bump();
+
+        let doc_style = match self.first() {
+            // `/*!` is an inner block doc comment.
+            '!' => Some(DocStyle::Inner),
+            // `/***` (more than 2 stars) is not considered a doc comment.
+            // `/**/` is not considered a doc comment.
+            '*' if !matches!(self.second(), '*' | '/') => Some(DocStyle::Outer),
+            _ => None,
+        };
+
        let mut depth = 1usize;
        while let Some(c) = self.bump() {
            match c {
@ -438,7 +463,7 @@ impl Cursor<'_> {
            }
        }

-        BlockComment { terminated: depth == 0 }
+        BlockComment { doc_style, terminated: depth == 0 }
    }

    fn whitespace(&mut self) -> TokenKind {
--- a/src/librustc_parse/lexer/mod.rs
+++ b/src/librustc_parse/lexer/mod.rs
@ -1,5 +1,5 @@
+use rustc_ast::ast::AttrStyle;
 use rustc_ast::token::{self, CommentKind, Token, TokenKind};
-use rustc_ast::util::comments;
 use rustc_data_structures::sync::Lrc;
 use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError};
 use rustc_lexer::Base;
@ -15,7 +15,7 @@ mod tokentrees;
 mod unescape_error_reporting;
 mod unicode_chars;

-use rustc_lexer::unescape::Mode;
+use rustc_lexer::{unescape::Mode, DocStyle};
 use unescape_error_reporting::{emit_unescape_error, push_escaped_char};

 #[derive(Clone, Debug)]
@ -168,25 +168,23 @@ impl<'a> StringReader<'a> {
    /// symbols and runs additional validation.
    fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> TokenKind {
        match token {
-            rustc_lexer::TokenKind::LineComment => {
-                let string = self.str_from(start);
-                if let Some(attr_style) = comments::line_doc_comment_style(string) {
-                    self.forbid_bare_cr(start, string, "bare CR not allowed in doc-comment");
-                    // Opening delimiter of the length 3 is not included into the symbol.
-                    token::DocComment(CommentKind::Line, attr_style, Symbol::intern(&string[3..]))
-                } else {
-                    token::Comment
+            rustc_lexer::TokenKind::LineComment { doc_style } => {
+                match doc_style {
+                    Some(doc_style) => {
+                        // Opening delimiter of the length 3 is not included into the symbol.
+                        let content_start = start + BytePos(3);
+                        let content = self.str_from(content_start);
+
+                        self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
+                    }
+                    None => token::Comment,
                }
            }
-            rustc_lexer::TokenKind::BlockComment { terminated } => {
-                let string = self.str_from(start);
-                let attr_style = comments::block_doc_comment_style(string, terminated);
-
+            rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
                if !terminated {
-                    let msg = if attr_style.is_some() {
-                        "unterminated block doc-comment"
-                    } else {
-                        "unterminated block comment"
+                    let msg = match doc_style {
+                        Some(_) => "unterminated block doc-comment",
+                        None => "unterminated block comment",
                    };
                    let last_bpos = self.pos;
                    self.sess
@ -199,18 +197,17 @@ impl<'a> StringReader<'a> {
                        .emit();
                    FatalError.raise();
                }
+                match doc_style {
+                    Some(doc_style) => {
+                        // Opening delimiter of the length 3 and closing delimiter of the length 2
+                        // are not included into the symbol.
+                        let content_start = start + BytePos(3);
+                        let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
+                        let content = self.str_from_to(content_start, content_end);

-                if let Some(attr_style) = attr_style {
-                    self.forbid_bare_cr(start, string, "bare CR not allowed in block doc-comment");
-                    // Opening delimiter of the length 3 and closing delimiter of the length 2
-                    // are not included into the symbol.
-                    token::DocComment(
-                        CommentKind::Block,
-                        attr_style,
-                        Symbol::intern(&string[3..string.len() - if terminated { 2 } else { 0 }]),
-                    )
-                } else {
-                    token::Comment
+                        self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
+                    }
+                    None => token::Comment,
                }
            }
            rustc_lexer::TokenKind::Whitespace => token::Whitespace,
@ -319,6 +316,34 @@ impl<'a> StringReader<'a> {
        }
    }

+    fn cook_doc_comment(
+        &self,
+        content_start: BytePos,
+        content: &str,
+        comment_kind: CommentKind,
+        doc_style: DocStyle,
+    ) -> TokenKind {
+        if content.contains('\r') {
+            for (idx, _) in content.char_indices().filter(|&(_, c)| c == '\r') {
+                self.err_span_(
+                    content_start + BytePos(idx as u32),
+                    content_start + BytePos(idx as u32 + 1),
+                    match comment_kind {
+                        CommentKind::Line => "bare CR not allowed in doc-comment",
+                        CommentKind::Block => "bare CR not allowed in block doc-comment",
+                    },
+                );
+            }
+        }
+
+        let attr_style = match doc_style {
+            DocStyle::Outer => AttrStyle::Outer,
+            DocStyle::Inner => AttrStyle::Inner,
+        };
+
+        token::DocComment(comment_kind, attr_style, Symbol::intern(content))
+    }
+
    fn cook_lexer_literal(
        &self,
        start: BytePos,
@ -472,17 +497,6 @@ impl<'a> StringReader<'a> {
        &self.src[self.src_index(start)..self.src_index(end)]
    }

-    fn forbid_bare_cr(&self, start: BytePos, s: &str, errmsg: &str) {
-        let mut idx = 0;
-        loop {
-            idx = match s[idx..].find('\r') {
-                None => break,
-                Some(it) => idx + it + 1,
-            };
-            self.err_span_(start + BytePos(idx as u32 - 1), start + BytePos(idx as u32), errmsg);
-        }
-    }
-
    fn report_raw_str_error(&self, start: BytePos, opt_err: Option<RawStrError>) {
        match opt_err {
            Some(RawStrError::InvalidStarter { bad_char }) => {