From 39197e673e79ca91ac6f1484adf3aa051696d575 Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Mon, 17 Aug 2020 18:43:35 +0200 Subject: [PATCH 1/2] Move doc comment parsing to rustc_lexer Plain comments are trivial, while doc comments are not, so it feels like this belongs to the rustc_lexer. The specific reason to do this is the desire to use rustc_lexer in rustdoc for syntax highlighting, without duplicating "is this a doc comment?" logic there. --- src/librustc_ast/util/comments.rs | 48 +----------- src/librustc_ast/util/comments/tests.rs | 7 -- src/librustc_lexer/src/lib.rs | 35 +++++++-- src/librustc_parse/lexer/mod.rs | 97 +++++++++++++++---------- 4 files changed, 91 insertions(+), 96 deletions(-) diff --git a/src/librustc_ast/util/comments.rs b/src/librustc_ast/util/comments.rs index a73891db160..e97c8cc4562 100644 --- a/src/librustc_ast/util/comments.rs +++ b/src/librustc_ast/util/comments.rs @@ -1,4 +1,3 @@ -use crate::ast::AttrStyle; use rustc_span::source_map::SourceMap; use rustc_span::{BytePos, CharPos, FileName, Pos, Symbol}; @@ -24,45 +23,6 @@ pub struct Comment { pub pos: BytePos, } -/// For a full line comment string returns its doc comment style if it's a doc comment -/// and returns `None` if it's a regular comment. -pub fn line_doc_comment_style(line_comment: &str) -> Option { - let line_comment = line_comment.as_bytes(); - assert!(line_comment.starts_with(b"//")); - match line_comment.get(2) { - // `//!` is an inner line doc comment. - Some(b'!') => Some(AttrStyle::Inner), - Some(b'/') => match line_comment.get(3) { - // `////` (more than 3 slashes) is not considered a doc comment. - Some(b'/') => None, - // Otherwise `///` is an outer line doc comment. - _ => Some(AttrStyle::Outer), - }, - _ => None, - } -} - -/// For a full block comment string returns its doc comment style if it's a doc comment -/// and returns `None` if it's a regular comment. 
-pub fn block_doc_comment_style(block_comment: &str, terminated: bool) -> Option { - let block_comment = block_comment.as_bytes(); - assert!(block_comment.starts_with(b"/*")); - assert!(!terminated || block_comment.ends_with(b"*/")); - match block_comment.get(2) { - // `/*!` is an inner block doc comment. - Some(b'!') => Some(AttrStyle::Inner), - Some(b'*') => match block_comment.get(3) { - // `/***` (more than 2 stars) is not considered a doc comment. - Some(b'*') => None, - // `/**/` is not considered a doc comment. - Some(b'/') if block_comment.len() == 4 => None, - // Otherwise `/**` is an outer block doc comment. - _ => Some(AttrStyle::Outer), - }, - _ => None, - } -} - /// Makes a doc string more presentable to users. /// Used by rustdoc and perhaps other tools, but not by rustc. pub fn beautify_doc_string(data: Symbol) -> String { @@ -216,8 +176,8 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec { - if block_doc_comment_style(token_text, terminated).is_none() { + rustc_lexer::TokenKind::BlockComment { doc_style, .. 
} => { + if doc_style.is_none() { let code_to_the_right = match text[pos + token.len..].chars().next() { Some('\r' | '\n') => false, _ => true, @@ -238,8 +198,8 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec { - if line_doc_comment_style(token_text).is_none() { + rustc_lexer::TokenKind::LineComment { doc_style } => { + if doc_style.is_none() { comments.push(Comment { style: if code_to_the_left { CommentStyle::Trailing diff --git a/src/librustc_ast/util/comments/tests.rs b/src/librustc_ast/util/comments/tests.rs index 1919b9341aa..e19198f863b 100644 --- a/src/librustc_ast/util/comments/tests.rs +++ b/src/librustc_ast/util/comments/tests.rs @@ -1,13 +1,6 @@ use super::*; use rustc_span::with_default_session_globals; -#[test] -fn line_doc_comments() { - assert!(line_doc_comment_style("///").is_some()); - assert!(line_doc_comment_style("/// blah").is_some()); - assert!(line_doc_comment_style("////").is_none()); -} - #[test] fn test_block_doc_comment_1() { with_default_session_globals(|| { diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs index 7949a232b9b..23c90c4bb6d 100644 --- a/src/librustc_lexer/src/lib.rs +++ b/src/librustc_lexer/src/lib.rs @@ -51,12 +51,12 @@ impl Token { pub enum TokenKind { // Multi-char tokens: /// "// comment" - LineComment, + LineComment { doc_style: Option<DocStyle> }, /// `/* block comment */` /// /// Block comments can be recursive, so the sequence like `/* /* */` /// will not be considered terminated and will result in a parsing error. - BlockComment { terminated: bool }, + BlockComment { doc_style: Option<DocStyle>, terminated: bool }, /// Any whitespace characters sequence. 
Whitespace, /// "ident" or "continue" @@ -129,6 +129,12 @@ pub enum TokenKind { Unknown, } +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum DocStyle { + Outer, + Inner, +} + #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] pub enum LiteralKind { /// "12_u8", "0o100", "0b120i99" @@ -188,7 +194,7 @@ pub fn strip_shebang(input: &str) -> Option { // a doc comment (due to `TokenKind::(Line,Block)Comment` ambiguity at lexer level), // then it may be valid Rust code, so consider it Rust code. let next_non_whitespace_token = tokenize(input_tail).map(|tok| tok.kind).find(|tok| - !matches!(tok, TokenKind::Whitespace | TokenKind::LineComment | TokenKind::BlockComment { .. }) + !matches!(tok, TokenKind::Whitespace | TokenKind::LineComment { .. } | TokenKind::BlockComment { .. }) ); if next_non_whitespace_token != Some(TokenKind::OpenBracket) { // No other choice than to consider this a shebang. @@ -410,13 +416,32 @@ impl Cursor<'_> { fn line_comment(&mut self) -> TokenKind { debug_assert!(self.prev() == '/' && self.first() == '/'); self.bump(); + + let doc_style = match self.first() { + // `//!` is an inner line doc comment. + '!' => Some(DocStyle::Inner), + // `////` (more than 3 slashes) is not considered a doc comment. + '/' if self.second() != '/' => Some(DocStyle::Outer), + _ => None, + }; + self.eat_while(|c| c != '\n'); - LineComment + LineComment { doc_style } } fn block_comment(&mut self) -> TokenKind { debug_assert!(self.prev() == '/' && self.first() == '*'); self.bump(); + + let doc_style = match self.first() { + // `/*!` is an inner block doc comment. + '!' => Some(DocStyle::Inner), + // `/***` (more than 2 stars) is not considered a doc comment. + // `/**/` is not considered a doc comment. 
+ '*' if !matches!(self.second(), '*' | '/') => Some(DocStyle::Outer), + _ => None, + }; + let mut depth = 1usize; while let Some(c) = self.bump() { match c { @@ -438,7 +463,7 @@ impl Cursor<'_> { } } - BlockComment { terminated: depth == 0 } + BlockComment { doc_style, terminated: depth == 0 } } fn whitespace(&mut self) -> TokenKind { diff --git a/src/librustc_parse/lexer/mod.rs b/src/librustc_parse/lexer/mod.rs index 675cfa41f10..a57aace1437 100644 --- a/src/librustc_parse/lexer/mod.rs +++ b/src/librustc_parse/lexer/mod.rs @@ -1,5 +1,5 @@ +use rustc_ast::ast::AttrStyle; use rustc_ast::token::{self, CommentKind, Token, TokenKind}; -use rustc_ast::util::comments; use rustc_data_structures::sync::Lrc; use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError}; use rustc_lexer::Base; @@ -15,7 +15,7 @@ mod tokentrees; mod unescape_error_reporting; mod unicode_chars; -use rustc_lexer::unescape::Mode; +use rustc_lexer::{unescape::Mode, DocStyle}; use unescape_error_reporting::{emit_unescape_error, push_escaped_char}; #[derive(Clone, Debug)] @@ -168,25 +168,23 @@ impl<'a> StringReader<'a> { /// symbols and runs additional validation. fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> TokenKind { match token { - rustc_lexer::TokenKind::LineComment => { - let string = self.str_from(start); - if let Some(attr_style) = comments::line_doc_comment_style(string) { - self.forbid_bare_cr(start, string, "bare CR not allowed in doc-comment"); - // Opening delimiter of the length 3 is not included into the symbol. - token::DocComment(CommentKind::Line, attr_style, Symbol::intern(&string[3..])) - } else { - token::Comment + rustc_lexer::TokenKind::LineComment { doc_style } => { + match doc_style { + Some(doc_style) => { + // Opening delimiter of the length 3 is not included into the symbol. 
+ let content_start = start + BytePos(3); + let content = self.str_from(content_start); + + self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style) + } + None => token::Comment, } } - rustc_lexer::TokenKind::BlockComment { terminated } => { - let string = self.str_from(start); - let attr_style = comments::block_doc_comment_style(string, terminated); - + rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => { if !terminated { - let msg = if attr_style.is_some() { - "unterminated block doc-comment" - } else { - "unterminated block comment" + let msg = match doc_style { + Some(_) => "unterminated block doc-comment", + None => "unterminated block comment", }; let last_bpos = self.pos; self.sess @@ -199,18 +197,17 @@ impl<'a> StringReader<'a> { .emit(); FatalError.raise(); } + match doc_style { + Some(doc_style) => { + // Opening delimiter of the length 3 and closing delimiter of the length 2 + // are not included into the symbol. + let content_start = start + BytePos(3); + let content_end = self.pos - BytePos(if terminated { 2 } else { 0 }); + let content = self.str_from_to(content_start, content_end); - if let Some(attr_style) = attr_style { - self.forbid_bare_cr(start, string, "bare CR not allowed in block doc-comment"); - // Opening delimiter of the length 3 and closing delimiter of the length 2 - // are not included into the symbol. 
- token::DocComment( - CommentKind::Block, - attr_style, - Symbol::intern(&string[3..string.len() - if terminated { 2 } else { 0 }]), - ) - } else { - token::Comment + self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style) + } + None => token::Comment, } } rustc_lexer::TokenKind::Whitespace => token::Whitespace, @@ -319,6 +316,37 @@ impl<'a> StringReader<'a> { } } + fn cook_doc_comment( + &self, + content_start: BytePos, + content: &str, + comment_kind: CommentKind, + doc_style: DocStyle, + ) -> TokenKind { + let mut idx = 0; + loop { + idx = match content[idx..].find('\r') { + None => break, + Some(it) => idx + it + 1, + }; + self.err_span_( + content_start + BytePos(idx as u32 - 1), + content_start + BytePos(idx as u32), + match comment_kind { + CommentKind::Line => "bare CR not allowed in doc-comment", + CommentKind::Block => "bare CR not allowed in block doc-comment", + }, + ); + } + + let attr_style = match doc_style { + DocStyle::Outer => AttrStyle::Outer, + DocStyle::Inner => AttrStyle::Inner, + }; + + token::DocComment(comment_kind, attr_style, Symbol::intern(content)) + } + fn cook_lexer_literal( &self, start: BytePos, @@ -472,17 +500,6 @@ impl<'a> StringReader<'a> { &self.src[self.src_index(start)..self.src_index(end)] } - fn forbid_bare_cr(&self, start: BytePos, s: &str, errmsg: &str) { - let mut idx = 0; - loop { - idx = match s[idx..].find('\r') { - None => break, - Some(it) => idx + it + 1, - }; - self.err_span_(start + BytePos(idx as u32 - 1), start + BytePos(idx as u32), errmsg); - } - } - fn report_raw_str_error(&self, start: BytePos, opt_err: Option) { match opt_err { Some(RawStrError::InvalidStarter { bad_char }) => { From ccbe94bf77e6a32fc9f31425bc820345be3143c0 Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Mon, 17 Aug 2020 21:52:49 +0200 Subject: [PATCH 2/2] Simplify search for bare `\r` in doc comments Outer `if` is the fast path -- it calls into hyperoptimized memchr. 
The inner loop is just the simplest code possible -- it doesn't generate the tightest code, but that shouldn't matter if we are going to error anyhow. --- src/librustc_parse/lexer/mod.rs | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/src/librustc_parse/lexer/mod.rs b/src/librustc_parse/lexer/mod.rs index a57aace1437..60a4dc574cb 100644 --- a/src/librustc_parse/lexer/mod.rs +++ b/src/librustc_parse/lexer/mod.rs @@ -323,20 +323,17 @@ impl<'a> StringReader<'a> { comment_kind: CommentKind, doc_style: DocStyle, ) -> TokenKind { - let mut idx = 0; - loop { - idx = match content[idx..].find('\r') { - None => break, - Some(it) => idx + it + 1, - }; - self.err_span_( - content_start + BytePos(idx as u32 - 1), - content_start + BytePos(idx as u32), - match comment_kind { - CommentKind::Line => "bare CR not allowed in doc-comment", - CommentKind::Block => "bare CR not allowed in block doc-comment", - }, - ); + if content.contains('\r') { + for (idx, _) in content.char_indices().filter(|&(_, c)| c == '\r') { + self.err_span_( + content_start + BytePos(idx as u32), + content_start + BytePos(idx as u32 + 1), + match comment_kind { + CommentKind::Line => "bare CR not allowed in doc-comment", + CommentKind::Block => "bare CR not allowed in block doc-comment", + }, + ); + } } let attr_style = match doc_style {