Rollup merge of #62329 - matklad:no-peeking, r=petrochenkov

Remove support for 1-token lookahead from the lexer

`StringReader` maintained `peek_token` and `peek_span_src_raw` for one-token lookahead.

`peek_token` was used only by rustdoc syntax coloring. After moving the peeking logic into the highlighter, I was able to remove `peek_token` from the lexer. I tried to use `iter::Peekable`, but it wasn't as pretty as I had hoped, due to buffered fatal errors, so I went with hand-rolled peeking.
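For reference, here is a minimal, self-contained sketch of the hand-rolled peeking pattern — a one-slot `Option<Token>` buffer in front of a fallible `try_next_token` — using stand-in `Lexer`/`Token` types rather than the real compiler ones:

```rust
#[derive(Debug, Clone, PartialEq)]
enum Token { Ident(String), Not, Eof }

struct Lexer { tokens: Vec<Token>, pos: usize }

impl Lexer {
    fn try_next_token(&mut self) -> Result<Token, ()> {
        let tok = self.tokens.get(self.pos).cloned().unwrap_or(Token::Eof);
        self.pos += 1;
        Ok(tok)
    }
}

struct Classifier {
    lexer: Lexer,
    peek_token: Option<Token>, // the one-token lookahead buffer
}

impl Classifier {
    fn try_next_token(&mut self) -> Result<Token, ()> {
        // Drain the buffer first, if a token was peeked earlier.
        if let Some(token) = self.peek_token.take() {
            return Ok(token);
        }
        self.lexer.try_next_token()
    }

    fn peek(&mut self) -> Result<&Token, ()> {
        // Fill the one-slot buffer on demand.
        if self.peek_token.is_none() {
            self.peek_token = Some(self.lexer.try_next_token()?);
        }
        Ok(self.peek_token.as_ref().unwrap())
    }
}

fn main() -> Result<(), ()> {
    let lexer = Lexer { tokens: vec![Token::Ident("vec".into()), Token::Not], pos: 0 };
    let mut c = Classifier { lexer, peek_token: None };
    assert_eq!(c.peek()?, &Token::Ident("vec".into()));          // buffers one token
    assert_eq!(c.try_next_token()?, Token::Ident("vec".into())); // drains the buffer
    assert_eq!(c.try_next_token()?, Token::Not);                 // reads from the lexer
    Ok(())
}
```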

After that, I noticed that the only peeking behavior left was comparing raw token spans to test token-tree jointness. I rewrote it in terms of trivia tokens rather than raw spans.
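The idea, again as a self-contained sketch with simplified stand-in types (not the real `StringReader`/`TokenKind`): two consecutive real tokens are joint exactly when no trivia token (whitespace or a comment) was produced between them, so it is enough to record that fact while skipping trivia:

```rust
#[derive(Debug, Clone, Copy, PartialEq)]
enum IsJoint { Joint, NonJoint }

#[derive(Debug, Clone, PartialEq)]
enum TokenKind { BinOp(char), Whitespace, Comment, Eof }

struct Reader { tokens: Vec<TokenKind>, pos: usize, joint_to_prev: IsJoint }

impl Reader {
    /// Advance past trivia to the next "real" token, recording whether
    /// any trivia separated it from the previous real token.
    fn real_token(&mut self) -> TokenKind {
        self.joint_to_prev = IsJoint::Joint;
        loop {
            let token = self.tokens.get(self.pos).cloned().unwrap_or(TokenKind::Eof);
            self.pos += 1;
            match token {
                TokenKind::Whitespace | TokenKind::Comment => {
                    self.joint_to_prev = IsJoint::NonJoint;
                }
                _ => return token,
            }
        }
    }
}

fn main() {
    // `>` directly followed by `>` is joint (can glue into `>>`);
    // `>` separated from `>` by trivia is not.
    let mut rdr = Reader {
        tokens: vec![
            TokenKind::BinOp('>'), TokenKind::BinOp('>'),
            TokenKind::Whitespace, TokenKind::Comment, TokenKind::BinOp('>'),
        ],
        pos: 0,
        joint_to_prev: IsJoint::Joint,
    };
    assert_eq!(rdr.real_token(), TokenKind::BinOp('>'));
    assert_eq!(rdr.real_token(), TokenKind::BinOp('>'));
    assert_eq!(rdr.joint_to_prev, IsJoint::Joint);    // nothing between them
    assert_eq!(rdr.real_token(), TokenKind::BinOp('>'));
    assert_eq!(rdr.joint_to_prev, IsJoint::NonJoint); // trivia in between
}
```

This is the same shape as the `joint_to_prev` field introduced in the `TokenTreesReader` diff below.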

After that, it became possible to simplify the lexer's awkward constructor, which could return `Err` if the first peeked token contained an error.
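A rough before/after sketch of the constructor shape (stand-in types, bodies reduced to stubs — see the lexer diff below for the real code):

```rust
struct Diagnostic;
struct StringReader {
    fatal_errs: Vec<Diagnostic>,
}

impl StringReader {
    // Before: construction eagerly lexed the first token into the peek
    // buffer, so it had to be able to fail.
    fn new_or_buffered_errs(src: &str) -> Result<StringReader, Vec<Diagnostic>> {
        let mut sr = StringReader { fatal_errs: vec![] };
        if sr.advance_token(src).is_err() {
            return Err(sr.fatal_errs);
        }
        Ok(sr)
    }

    // After: no eager lexing, so construction is infallible; a lexing error
    // surfaces from `try_next_token` when a token is actually requested.
    fn new(_src: &str) -> StringReader {
        StringReader { fatal_errs: vec![] }
    }

    fn advance_token(&mut self, _src: &str) -> Result<(), ()> { Ok(()) }
    fn try_next_token(&mut self) -> Result<(), ()> { Ok(()) }
}

fn main() {
    // Old: the caller had to handle a possible `Err` at construction time.
    let _old = StringReader::new_or_buffered_errs("fn main() {}").unwrap();
    // New: construction always succeeds; errors are checked per token.
    let mut new = StringReader::new("fn main() {}");
    assert!(new.try_next_token().is_ok());
}
```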
Mazdak Farrokhzad committed on 2019-07-06 02:38:01 +02:00 (via GitHub), commit 952ee77871.

7 changed files with 128 additions and 179 deletions


@@ -53,7 +53,7 @@ impl<'a> SpanUtils<'a> {
     pub fn sub_span_of_token(&self, span: Span, tok: TokenKind) -> Option<Span> {
         let mut toks = self.retokenise_span(span);
         loop {
-            let next = toks.real_token();
+            let next = toks.next_token();
             if next == token::Eof {
                 return None;
             }


@@ -38,17 +38,17 @@ pub fn render_with_highlighting(
         FileName::Custom(String::from("rustdoc-highlighting")),
         src.to_owned(),
     );
-    let highlight_result =
-        lexer::StringReader::new_or_buffered_errs(&sess, fm, None).and_then(|lexer| {
+    let highlight_result = {
+        let lexer = lexer::StringReader::new(&sess, fm, None);
         let mut classifier = Classifier::new(lexer, sess.source_map());
         let mut highlighted_source = vec![];
         if classifier.write_source(&mut highlighted_source).is_err() {
             Err(classifier.lexer.buffer_fatal_errors())
         } else {
             Ok(String::from_utf8_lossy(&highlighted_source).into_owned())
         }
-        });
+    };

     match highlight_result {
         Ok(highlighted_source) => {

@@ -79,6 +79,7 @@ pub fn render_with_highlighting(
 /// each span of text in sequence.
 struct Classifier<'a> {
     lexer: lexer::StringReader<'a>,
+    peek_token: Option<Token>,
     source_map: &'a SourceMap,

     // State of the classifier.

@@ -178,6 +179,7 @@ impl<'a> Classifier<'a> {
     fn new(lexer: lexer::StringReader<'a>, source_map: &'a SourceMap) -> Classifier<'a> {
         Classifier {
             lexer,
+            peek_token: None,
             source_map,
             in_attribute: false,
             in_macro: false,

@@ -187,10 +189,19 @@ impl<'a> Classifier<'a> {
     /// Gets the next token out of the lexer.
     fn try_next_token(&mut self) -> Result<Token, HighlightError> {
-        match self.lexer.try_next_token() {
-            Ok(token) => Ok(token),
-            Err(_) => Err(HighlightError::LexError),
+        if let Some(token) = self.peek_token.take() {
+            return Ok(token);
         }
+        self.lexer.try_next_token().map_err(|()| HighlightError::LexError)
+    }
+
+    fn peek(&mut self) -> Result<&Token, HighlightError> {
+        if self.peek_token.is_none() {
+            self.peek_token = Some(
+                self.lexer.try_next_token().map_err(|()| HighlightError::LexError)?
+            );
+        }
+        Ok(self.peek_token.as_ref().unwrap())
     }

     /// Exhausts the `lexer` writing the output into `out`.

@@ -234,7 +245,7 @@ impl<'a> Classifier<'a> {
             // reference or dereference operator or a reference or pointer type, instead of the
             // bit-and or multiplication operator.
             token::BinOp(token::And) | token::BinOp(token::Star)
-                if self.lexer.peek() != &token::Whitespace => Class::RefKeyWord,
+                if self.peek()? != &token::Whitespace => Class::RefKeyWord,

             // Consider this as part of a macro invocation if there was a
             // leading identifier.

@@ -257,7 +268,7 @@ impl<'a> Classifier<'a> {
             token::Question => Class::QuestionMark,

             token::Dollar => {
-                if self.lexer.peek().is_ident() {
+                if self.peek()?.is_ident() {
                     self.in_macro_nonterminal = true;
                     Class::MacroNonTerminal
                 } else {

@@ -280,9 +291,9 @@ impl<'a> Classifier<'a> {
                 // as an attribute.

                 // Case 1: #![inner_attribute]
-                if self.lexer.peek() == &token::Not {
+                if self.peek()? == &token::Not {
                     self.try_next_token()?; // NOTE: consumes `!` token!
-                    if self.lexer.peek() == &token::OpenDelim(token::Bracket) {
+                    if self.peek()? == &token::OpenDelim(token::Bracket) {
                         self.in_attribute = true;
                         out.enter_span(Class::Attribute)?;
                     }

@@ -292,7 +303,7 @@ impl<'a> Classifier<'a> {
                 }

                 // Case 2: #[outer_attribute]
-                if self.lexer.peek() == &token::OpenDelim(token::Bracket) {
+                if self.peek()? == &token::OpenDelim(token::Bracket) {
                     self.in_attribute = true;
                     out.enter_span(Class::Attribute)?;
                 }

@@ -341,7 +352,7 @@ impl<'a> Classifier<'a> {
                 if self.in_macro_nonterminal {
                     self.in_macro_nonterminal = false;
                     Class::MacroNonTerminal
-                } else if self.lexer.peek() == &token::Not {
+                } else if self.peek()? == &token::Not {
                     self.in_macro = true;
                     Class::Macro
                 } else {


@@ -32,7 +32,8 @@ impl<'a, 'tcx> SyntaxChecker<'a, 'tcx> {
             dox[code_block.code].to_owned(),
         );

-        let errors = Lexer::new_or_buffered_errs(&sess, source_file, None).and_then(|mut lexer| {
+        let errors = {
+            let mut lexer = Lexer::new(&sess, source_file, None);
             while let Ok(token::Token { kind, .. }) = lexer.try_next_token() {
                 if kind == token::Eof {
                     break;
@@ -46,7 +47,7 @@ impl<'a, 'tcx> SyntaxChecker<'a, 'tcx> {
             } else {
                 Ok(())
             }
-        });
+        };

         if let Err(errors) = errors {
             let mut diag = if let Some(sp) =


@@ -268,7 +268,7 @@ fn read_block_comment(rdr: &mut StringReader<'_>,
     while level > 0 {
         debug!("=== block comment level {}", level);
         if rdr.is_eof() {
-            rdr.fatal("unterminated block comment").raise();
+            rdr.fatal_span_(rdr.pos, rdr.pos, "unterminated block comment").raise();
         }
         if rdr.ch_is('\n') {
             trim_whitespace_prefix_and_push_line(&mut lines, curr_line, col);
@@ -346,7 +346,7 @@ pub fn gather_comments(sess: &ParseSess, path: FileName, srdr: &mut dyn Read) ->
     srdr.read_to_string(&mut src).unwrap();
     let cm = SourceMap::new(sess.source_map().path_mapping().clone());
     let source_file = cm.new_source_file(path, src);
-    let mut rdr = lexer::StringReader::new_raw(sess, source_file, None);
+    let mut rdr = lexer::StringReader::new(sess, source_file, None);

     let mut comments: Vec<Comment> = Vec::new();
     let mut code_to_the_left = false; // Only code


@@ -38,9 +38,6 @@ pub struct StringReader<'a> {
     crate source_file: Lrc<syntax_pos::SourceFile>,
     /// Stop reading src at this index.
     crate end_src_index: usize,
-    // cached:
-    peek_token: Token,
-    peek_span_src_raw: Span,
     fatal_errs: Vec<DiagnosticBuilder<'a>>,
     // cache a direct reference to the source text, so that we don't have to
     // retrieve it via `self.source_file.src.as_ref().unwrap()` all the time.
@@ -49,15 +46,59 @@ pub struct StringReader<'a> {
 }

 impl<'a> StringReader<'a> {
-    fn mk_sp(&self, lo: BytePos, hi: BytePos) -> Span {
-        self.mk_sp_and_raw(lo, hi).0
-    }
-
-    fn mk_sp_and_raw(&self, lo: BytePos, hi: BytePos) -> (Span, Span) {
-        let raw = Span::new(lo, hi, NO_EXPANSION);
-        let real = self.override_span.unwrap_or(raw);
-        (real, raw)
+    pub fn new(sess: &'a ParseSess,
+               source_file: Lrc<syntax_pos::SourceFile>,
+               override_span: Option<Span>) -> Self {
+        let mut sr = StringReader::new_internal(sess, source_file, override_span);
+        sr.bump();
+        sr
+    }
+
+    pub fn retokenize(sess: &'a ParseSess, mut span: Span) -> Self {
+        let begin = sess.source_map().lookup_byte_offset(span.lo());
+        let end = sess.source_map().lookup_byte_offset(span.hi());
+
+        // Make the range zero-length if the span is invalid.
+        if span.lo() > span.hi() || begin.sf.start_pos != end.sf.start_pos {
+            span = span.shrink_to_lo();
+        }
+
+        let mut sr = StringReader::new_internal(sess, begin.sf, None);
+
+        // Seek the lexer to the right byte range.
+        sr.next_pos = span.lo();
+        sr.end_src_index = sr.src_index(span.hi());
+
+        sr.bump();
+
+        sr
+    }
+
+    fn new_internal(sess: &'a ParseSess, source_file: Lrc<syntax_pos::SourceFile>,
+                    override_span: Option<Span>) -> Self
+    {
+        if source_file.src.is_none() {
+            sess.span_diagnostic.bug(&format!("Cannot lex source_file without source: {}",
+                                              source_file.name));
+        }
+
+        let src = (*source_file.src.as_ref().unwrap()).clone();
+
+        StringReader {
+            sess,
+            next_pos: source_file.start_pos,
+            pos: source_file.start_pos,
+            ch: Some('\n'),
+            source_file,
+            end_src_index: src.len(),
+            src,
+            fatal_errs: Vec::new(),
+            override_span,
+        }
+    }
+
+    fn mk_sp(&self, lo: BytePos, hi: BytePos) -> Span {
+        self.override_span.unwrap_or_else(|| Span::new(lo, hi, NO_EXPANSION))
     }

     fn unwrap_or_abort(&mut self, res: Result<Token, ()>) -> Token {
@@ -70,35 +111,32 @@ impl<'a> StringReader<'a> {
         }
     }

-    fn next_token(&mut self) -> Token where Self: Sized {
-        let res = self.try_next_token();
-        self.unwrap_or_abort(res)
-    }
-
-    /// Returns the next token. EFFECT: advances the string_reader.
+    /// Returns the next token, including trivia like whitespace or comments.
+    ///
+    /// `Err(())` means that some errors were encountered, which can be
+    /// retrieved using `buffer_fatal_errors`.
     pub fn try_next_token(&mut self) -> Result<Token, ()> {
         assert!(self.fatal_errs.is_empty());
-        let ret_val = self.peek_token.take();
-        self.advance_token()?;
-        Ok(ret_val)
-    }
-
-    fn try_real_token(&mut self) -> Result<Token, ()> {
-        let mut t = self.try_next_token()?;
-        loop {
-            match t.kind {
-                token::Whitespace | token::Comment | token::Shebang(_) => {
-                    t = self.try_next_token()?;
-                }
-                _ => break,
+        match self.scan_whitespace_or_comment() {
+            Some(comment) => Ok(comment),
+            None => {
+                let (kind, start_pos, end_pos) = if self.is_eof() {
+                    (token::Eof, self.source_file.end_pos, self.source_file.end_pos)
+                } else {
+                    let start_pos = self.pos;
+                    (self.next_token_inner()?, start_pos, self.pos)
+                };
+                let span = self.mk_sp(start_pos, end_pos);
+                Ok(Token::new(kind, span))
             }
         }
-        Ok(t)
     }

-    pub fn real_token(&mut self) -> Token {
-        let res = self.try_real_token();
+    /// Returns the next token, including trivia like whitespace or comments.
+    ///
+    /// Aborts in case of an error.
+    pub fn next_token(&mut self) -> Token {
+        let res = self.try_next_token();
         self.unwrap_or_abort(res)
     }
@@ -120,10 +158,6 @@ impl<'a> StringReader<'a> {
         FatalError.raise();
     }

-    fn fatal(&self, m: &str) -> FatalError {
-        self.fatal_span(self.peek_token.span, m)
-    }
-
     crate fn emit_fatal_errors(&mut self) {
         for err in &mut self.fatal_errs {
             err.emit();
@@ -142,81 +176,6 @@ impl<'a> StringReader<'a> {
         buffer
     }

-    pub fn peek(&self) -> &Token {
-        &self.peek_token
-    }
-
-    /// For comments.rs, which hackily pokes into next_pos and ch
-    fn new_raw(sess: &'a ParseSess,
-               source_file: Lrc<syntax_pos::SourceFile>,
-               override_span: Option<Span>) -> Self {
-        let mut sr = StringReader::new_raw_internal(sess, source_file, override_span);
-        sr.bump();
-        sr
-    }
-
-    fn new_raw_internal(sess: &'a ParseSess, source_file: Lrc<syntax_pos::SourceFile>,
-                        override_span: Option<Span>) -> Self
-    {
-        if source_file.src.is_none() {
-            sess.span_diagnostic.bug(&format!("Cannot lex source_file without source: {}",
-                                              source_file.name));
-        }
-
-        let src = (*source_file.src.as_ref().unwrap()).clone();
-
-        StringReader {
-            sess,
-            next_pos: source_file.start_pos,
-            pos: source_file.start_pos,
-            ch: Some('\n'),
-            source_file,
-            end_src_index: src.len(),
-            peek_token: Token::dummy(),
-            peek_span_src_raw: syntax_pos::DUMMY_SP,
-            src,
-            fatal_errs: Vec::new(),
-            override_span,
-        }
-    }
-
-    pub fn new_or_buffered_errs(sess: &'a ParseSess,
-                                source_file: Lrc<syntax_pos::SourceFile>,
-                                override_span: Option<Span>) -> Result<Self, Vec<Diagnostic>> {
-        let mut sr = StringReader::new_raw(sess, source_file, override_span);
-        if sr.advance_token().is_err() {
-            Err(sr.buffer_fatal_errors())
-        } else {
-            Ok(sr)
-        }
-    }
-
-    pub fn retokenize(sess: &'a ParseSess, mut span: Span) -> Self {
-        let begin = sess.source_map().lookup_byte_offset(span.lo());
-        let end = sess.source_map().lookup_byte_offset(span.hi());
-
-        // Make the range zero-length if the span is invalid.
-        if span.lo() > span.hi() || begin.sf.start_pos != end.sf.start_pos {
-            span = span.shrink_to_lo();
-        }
-
-        let mut sr = StringReader::new_raw_internal(sess, begin.sf, None);
-
-        // Seek the lexer to the right byte range.
-        sr.next_pos = span.lo();
-        sr.end_src_index = sr.src_index(span.hi());
-
-        sr.bump();
-
-        if sr.advance_token().is_err() {
-            sr.emit_fatal_errors();
-            FatalError.raise();
-        }
-
-        sr
-    }
-
     #[inline]
     fn ch_is(&self, c: char) -> bool {
         self.ch == Some(c)
@@ -269,30 +228,6 @@ impl<'a> StringReader<'a> {
         self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), &m[..])
     }

-    /// Advance peek_token to refer to the next token, and
-    /// possibly update the interner.
-    fn advance_token(&mut self) -> Result<(), ()> {
-        match self.scan_whitespace_or_comment() {
-            Some(comment) => {
-                self.peek_span_src_raw = comment.span;
-                self.peek_token = comment;
-            }
-            None => {
-                let (kind, start_pos, end_pos) = if self.is_eof() {
-                    (token::Eof, self.source_file.end_pos, self.source_file.end_pos)
-                } else {
-                    let start_pos = self.pos;
-                    (self.next_token_inner()?, start_pos, self.pos)
-                };
-                let (real, raw) = self.mk_sp_and_raw(start_pos, end_pos);
-                self.peek_token = Token::new(kind, real);
-                self.peek_span_src_raw = raw;
-            }
-        }
-        Ok(())
-    }
-
     #[inline]
     fn src_index(&self, pos: BytePos) -> usize {
         (pos - self.source_file.start_pos).to_usize()
@@ -1462,12 +1397,7 @@ mod tests {
                        teststr: String)
                        -> StringReader<'a> {
         let sf = sm.new_source_file(PathBuf::from(teststr.clone()).into(), teststr);
-        let mut sr = StringReader::new_raw(sess, sf, None);
-        if sr.advance_token().is_err() {
-            sr.emit_fatal_errors();
-            FatalError.raise();
-        }
-        sr
+        StringReader::new(sess, sf, None)
     }

     #[test]
@@ -1489,17 +1419,17 @@ mod tests {
         assert_eq!(tok1.kind, tok2.kind);
         assert_eq!(tok1.span, tok2.span);
         assert_eq!(string_reader.next_token(), token::Whitespace);
-        // the 'main' id is already read:
-        assert_eq!(string_reader.pos.clone(), BytePos(28));
         // read another token:
         let tok3 = string_reader.next_token();
+        assert_eq!(string_reader.pos.clone(), BytePos(28));
         let tok4 = Token::new(
             mk_ident("main"),
             Span::new(BytePos(24), BytePos(28), NO_EXPANSION),
         );
         assert_eq!(tok3.kind, tok4.kind);
         assert_eq!(tok3.span, tok4.span);
-        // the lparen is already read:
+        assert_eq!(string_reader.next_token(), token::OpenDelim(token::Paren));
         assert_eq!(string_reader.pos.clone(), BytePos(29))
     })
 }


@@ -4,13 +4,14 @@ use crate::print::pprust::token_to_string;
 use crate::parse::lexer::{StringReader, UnmatchedBrace};
 use crate::parse::token::{self, Token};
 use crate::parse::PResult;
-use crate::tokenstream::{DelimSpan, IsJoint::*, TokenStream, TokenTree, TreeAndJoint};
+use crate::tokenstream::{DelimSpan, IsJoint::{self, *}, TokenStream, TokenTree, TreeAndJoint};

 impl<'a> StringReader<'a> {
     crate fn into_token_trees(self) -> (PResult<'a, TokenStream>, Vec<UnmatchedBrace>) {
         let mut tt_reader = TokenTreesReader {
             string_reader: self,
             token: Token::dummy(),
+            joint_to_prev: Joint,
             open_braces: Vec::new(),
             unmatched_braces: Vec::new(),
             matching_delim_spans: Vec::new(),
@@ -24,6 +25,7 @@ impl<'a> StringReader<'a> {
 struct TokenTreesReader<'a> {
     string_reader: StringReader<'a>,
     token: Token,
+    joint_to_prev: IsJoint,
     /// Stack of open delimiters and their spans. Used for error message.
     open_braces: Vec<(token::DelimToken, Span)>,
     unmatched_braces: Vec<UnmatchedBrace>,
@@ -203,21 +205,26 @@ impl<'a> TokenTreesReader<'a> {
             },
             _ => {
                 let tt = TokenTree::Token(self.token.take());
-                // Note that testing for joint-ness here is done via the raw
-                // source span as the joint-ness is a property of the raw source
-                // rather than wanting to take `override_span` into account.
-                // Additionally, we actually check if the *next* pair of tokens
-                // is joint, but this is equivalent to checking the current pair.
-                let raw = self.string_reader.peek_span_src_raw;
                 self.real_token();
-                let is_joint = raw.hi() == self.string_reader.peek_span_src_raw.lo()
-                    && self.token.is_op();
+                let is_joint = self.joint_to_prev == Joint && self.token.is_op();
                 Ok((tt, if is_joint { Joint } else { NonJoint }))
             }
         }
     }

     fn real_token(&mut self) {
-        self.token = self.string_reader.real_token();
+        self.joint_to_prev = Joint;
+        loop {
+            let token = self.string_reader.next_token();
+            match token.kind {
+                token::Whitespace | token::Comment | token::Shebang(_) => {
+                    self.joint_to_prev = NonJoint;
+                }
+                _ => {
+                    self.token = token;
+                    return;
+                },
+            }
+        }
     }
 }


@@ -308,7 +308,7 @@ pub fn maybe_file_to_stream(
     source_file: Lrc<SourceFile>,
     override_span: Option<Span>,
 ) -> Result<(TokenStream, Vec<lexer::UnmatchedBrace>), Vec<Diagnostic>> {
-    let srdr = lexer::StringReader::new_or_buffered_errs(sess, source_file, override_span)?;
+    let srdr = lexer::StringReader::new(sess, source_file, override_span);
     let (token_trees, unmatched_braces) = srdr.into_token_trees();
     match token_trees {