Auto merge of #75642 - matklad:lexer-comments, r=petrochenkov

Move doc comment parsing to rustc_lexer

Plain comments are trivia, while doc comments are not, so telling the two
apart feels like a job for rustc_lexer.

The specific reason to do this is the desire to use rustc_lexer in
rustdoc for syntax highlighting, without duplicating "is this a doc
comment?" logic there.

r? @ghost
This commit is contained in:
bors 2020-08-21 06:05:39 +00:00
commit b51651ae9d
4 changed files with 88 additions and 96 deletions

View File

@ -1,4 +1,3 @@
use crate::ast::AttrStyle;
use rustc_span::source_map::SourceMap;
use rustc_span::{BytePos, CharPos, FileName, Pos, Symbol};
@ -24,45 +23,6 @@ pub struct Comment {
pub pos: BytePos,
}
/// Classifies a complete line comment, leading `//` included, by doc comment style.
///
/// Returns `Some(AttrStyle::Inner)` for `//!`, `Some(AttrStyle::Outer)` for `///`
/// that is not followed by a fourth slash, and `None` for every other comment.
///
/// # Panics
///
/// Panics if `line_comment` does not start with `//`.
pub fn line_doc_comment_style(line_comment: &str) -> Option<AttrStyle> {
    let line_comment = line_comment.as_bytes();
    assert!(line_comment.starts_with(b"//"));
    // The assertion above guarantees at least two bytes, so this slice cannot panic.
    match &line_comment[2..] {
        // `//!` opens an inner line doc comment.
        [b'!', ..] => Some(AttrStyle::Inner),
        // Four or more slashes (`////`) make a plain comment again.
        [b'/', b'/', ..] => None,
        // Exactly three slashes: an outer line doc comment.
        [b'/', ..] => Some(AttrStyle::Outer),
        _ => None,
    }
}
/// Classifies a complete block comment, leading `/*` included, by doc comment style.
///
/// Returns `Some(AttrStyle::Inner)` for `/*!`, `Some(AttrStyle::Outer)` for `/**`
/// (except `/***…` and the empty `/**/`), and `None` for every other comment.
/// `terminated` tells whether the comment text is expected to end with `*/`.
///
/// # Panics
///
/// Panics if `block_comment` does not start with `/*`, or if `terminated` is
/// `true` and it does not end with `*/`.
pub fn block_doc_comment_style(block_comment: &str, terminated: bool) -> Option<AttrStyle> {
    let block_comment = block_comment.as_bytes();
    assert!(block_comment.starts_with(b"/*"));
    assert!(!terminated || block_comment.ends_with(b"*/"));
    // The first assertion guarantees at least two bytes, so this slice cannot panic.
    match &block_comment[2..] {
        // `/*!` opens an inner block doc comment.
        [b'!', ..] => Some(AttrStyle::Inner),
        // Three or more stars (`/***`) make a plain comment.
        [b'*', b'*', ..] => None,
        // The whole comment is exactly `/**/`: empty, not a doc comment.
        [b'*', b'/'] => None,
        // Any other `/**` opens an outer block doc comment.
        [b'*', ..] => Some(AttrStyle::Outer),
        _ => None,
    }
}
/// Makes a doc string more presentable to users.
/// Used by rustdoc and perhaps other tools, but not by rustc.
pub fn beautify_doc_string(data: Symbol) -> String {
@ -216,8 +176,8 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comme
}
}
}
rustc_lexer::TokenKind::BlockComment { terminated } => {
if block_doc_comment_style(token_text, terminated).is_none() {
rustc_lexer::TokenKind::BlockComment { doc_style, .. } => {
if doc_style.is_none() {
let code_to_the_right = match text[pos + token.len..].chars().next() {
Some('\r' | '\n') => false,
_ => true,
@ -238,8 +198,8 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comme
comments.push(Comment { style, lines, pos: pos_in_file })
}
}
rustc_lexer::TokenKind::LineComment => {
if line_doc_comment_style(token_text).is_none() {
rustc_lexer::TokenKind::LineComment { doc_style } => {
if doc_style.is_none() {
comments.push(Comment {
style: if code_to_the_left {
CommentStyle::Trailing

View File

@ -1,13 +1,6 @@
use super::*;
use rustc_span::with_default_session_globals;
// Exactly three slashes, with or without trailing text, form a doc comment;
// four slashes do not.
#[test]
fn line_doc_comments() {
assert!(line_doc_comment_style("///").is_some());
assert!(line_doc_comment_style("/// blah").is_some());
assert!(line_doc_comment_style("////").is_none());
}
#[test]
fn test_block_doc_comment_1() {
with_default_session_globals(|| {

View File

@ -51,12 +51,12 @@ impl Token {
pub enum TokenKind {
// Multi-char tokens:
/// "// comment"
LineComment,
LineComment { doc_style: Option<DocStyle> },
/// `/* block comment */`
///
/// Block comments can be recursive, so the sequence like `/* /* */`
/// will not be considered terminated and will result in a parsing error.
BlockComment { terminated: bool },
BlockComment { doc_style: Option<DocStyle>, terminated: bool },
/// Any whitespace characters sequence.
Whitespace,
/// "ident" or "continue"
@ -129,6 +129,12 @@ pub enum TokenKind {
Unknown,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum DocStyle {
Outer,
Inner,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LiteralKind {
/// "12_u8", "0o100", "0b120i99"
@ -188,7 +194,7 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
// a doc comment (due to `TokenKind::(Line,Block)Comment` ambiguity at lexer level),
// then it may be valid Rust code, so consider it Rust code.
let next_non_whitespace_token = tokenize(input_tail).map(|tok| tok.kind).find(|tok|
!matches!(tok, TokenKind::Whitespace | TokenKind::LineComment | TokenKind::BlockComment { .. })
!matches!(tok, TokenKind::Whitespace | TokenKind::LineComment { .. } | TokenKind::BlockComment { .. })
);
if next_non_whitespace_token != Some(TokenKind::OpenBracket) {
// No other choice than to consider this a shebang.
@ -410,13 +416,32 @@ impl Cursor<'_> {
/// Lexes a line comment and records whether it is a doc comment.
///
/// Expects the previous character to be `/` and the next one to be `/`
/// (checked by the `debug_assert!`), then eats characters until a `\n` is next.
fn line_comment(&mut self) -> TokenKind {
debug_assert!(self.prev() == '/' && self.first() == '/');
// Step over the second `/` so the checks below look at what follows `//`.
self.bump();
let doc_style = match self.first() {
// `//!` is an inner line doc comment.
'!' => Some(DocStyle::Inner),
// `///` is an outer line doc comment, unless a fourth slash follows:
// `////` (more than 3 slashes) is not considered a doc comment.
'/' if self.second() != '/' => Some(DocStyle::Outer),
_ => None,
};
self.eat_while(|c| c != '\n');
LineComment
LineComment { doc_style }
}
fn block_comment(&mut self) -> TokenKind {
debug_assert!(self.prev() == '/' && self.first() == '*');
self.bump();
let doc_style = match self.first() {
// `/*!` is an inner block doc comment.
'!' => Some(DocStyle::Inner),
// `/***` (more than 2 stars) is not considered a doc comment.
// `/**/` is not considered a doc comment.
'*' if !matches!(self.second(), '*' | '/') => Some(DocStyle::Outer),
_ => None,
};
let mut depth = 1usize;
while let Some(c) = self.bump() {
match c {
@ -438,7 +463,7 @@ impl Cursor<'_> {
}
}
BlockComment { terminated: depth == 0 }
BlockComment { doc_style, terminated: depth == 0 }
}
fn whitespace(&mut self) -> TokenKind {

View File

@ -1,5 +1,5 @@
use rustc_ast::ast::AttrStyle;
use rustc_ast::token::{self, CommentKind, Token, TokenKind};
use rustc_ast::util::comments;
use rustc_data_structures::sync::Lrc;
use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError};
use rustc_lexer::Base;
@ -15,7 +15,7 @@ mod tokentrees;
mod unescape_error_reporting;
mod unicode_chars;
use rustc_lexer::unescape::Mode;
use rustc_lexer::{unescape::Mode, DocStyle};
use unescape_error_reporting::{emit_unescape_error, push_escaped_char};
#[derive(Clone, Debug)]
@ -168,25 +168,23 @@ impl<'a> StringReader<'a> {
/// symbols and runs additional validation.
fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> TokenKind {
match token {
rustc_lexer::TokenKind::LineComment => {
let string = self.str_from(start);
if let Some(attr_style) = comments::line_doc_comment_style(string) {
self.forbid_bare_cr(start, string, "bare CR not allowed in doc-comment");
// Opening delimiter of the length 3 is not included into the symbol.
token::DocComment(CommentKind::Line, attr_style, Symbol::intern(&string[3..]))
} else {
token::Comment
rustc_lexer::TokenKind::LineComment { doc_style } => {
match doc_style {
Some(doc_style) => {
// Opening delimiter of the length 3 is not included into the symbol.
let content_start = start + BytePos(3);
let content = self.str_from(content_start);
self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
}
None => token::Comment,
}
}
rustc_lexer::TokenKind::BlockComment { terminated } => {
let string = self.str_from(start);
let attr_style = comments::block_doc_comment_style(string, terminated);
rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
if !terminated {
let msg = if attr_style.is_some() {
"unterminated block doc-comment"
} else {
"unterminated block comment"
let msg = match doc_style {
Some(_) => "unterminated block doc-comment",
None => "unterminated block comment",
};
let last_bpos = self.pos;
self.sess
@ -199,18 +197,17 @@ impl<'a> StringReader<'a> {
.emit();
FatalError.raise();
}
match doc_style {
Some(doc_style) => {
// Opening delimiter of the length 3 and closing delimiter of the length 2
// are not included into the symbol.
let content_start = start + BytePos(3);
let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
let content = self.str_from_to(content_start, content_end);
if let Some(attr_style) = attr_style {
self.forbid_bare_cr(start, string, "bare CR not allowed in block doc-comment");
// Opening delimiter of the length 3 and closing delimiter of the length 2
// are not included into the symbol.
token::DocComment(
CommentKind::Block,
attr_style,
Symbol::intern(&string[3..string.len() - if terminated { 2 } else { 0 }]),
)
} else {
token::Comment
self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
}
None => token::Comment,
}
}
rustc_lexer::TokenKind::Whitespace => token::Whitespace,
@ -319,6 +316,34 @@ impl<'a> StringReader<'a> {
}
}
/// Turns the content of a doc comment into a `token::DocComment` token.
///
/// `content` is the comment text with its delimiters already removed, and
/// `content_start` is the position of its first byte. One error is reported
/// for each `\r` found in `content`, spanning that single byte.
fn cook_doc_comment(
    &self,
    content_start: BytePos,
    content: &str,
    comment_kind: CommentKind,
    doc_style: DocStyle,
) -> TokenKind {
    // The wording depends only on the comment kind, so pick it once up front.
    let bare_cr_msg = match comment_kind {
        CommentKind::Line => "bare CR not allowed in doc-comment",
        CommentKind::Block => "bare CR not allowed in block doc-comment",
    };
    for (cr_offset, _) in content.match_indices('\r') {
        self.err_span_(
            content_start + BytePos(cr_offset as u32),
            content_start + BytePos(cr_offset as u32 + 1),
            bare_cr_msg,
        );
    }

    token::DocComment(
        comment_kind,
        // Map the lexer-level style onto the AST-level attribute style.
        match doc_style {
            DocStyle::Outer => AttrStyle::Outer,
            DocStyle::Inner => AttrStyle::Inner,
        },
        Symbol::intern(content),
    )
}
fn cook_lexer_literal(
&self,
start: BytePos,
@ -472,17 +497,6 @@ impl<'a> StringReader<'a> {
&self.src[self.src_index(start)..self.src_index(end)]
}
/// Reports `errmsg` once for every `\r` in `s`.
///
/// `start` is the position of the first byte of `s`; each reported span
/// covers exactly the offending `\r` byte.
fn forbid_bare_cr(&self, start: BytePos, s: &str, errmsg: &str) {
    for (cr_offset, _) in s.match_indices('\r') {
        // Offset of the byte right after the `\r`, i.e. the end of the span.
        let past_cr = cr_offset + 1;
        self.err_span_(
            start + BytePos(past_cr as u32 - 1),
            start + BytePos(past_cr as u32),
            errmsg,
        );
    }
}
fn report_raw_str_error(&self, start: BytePos, opt_err: Option<RawStrError>) {
match opt_err {
Some(RawStrError::InvalidStarter { bad_char }) => {