Rollup merge of #62329 - matklad:no-peeking, r=petrochenkov

Remove support for 1-token lookahead from the lexer

`StringReader` maintained `peek_token` and `peek_span_src_raw` for lookahead.

`peek_token` was used only by rustdoc syntax coloring. After moving the peeking logic into the highlighter, I was able to remove `peek_token` from the lexer. I tried to use `iter::Peekable`, but that wasn't as pretty as I hoped due to buffered fatal errors, so I went with hand-rolled peeking.
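
For illustration, here is a minimal, self-contained sketch of the hand-rolled peeking pattern; the `Token`, `Lexer`, and `Classifier` types below are simplified stand-ins rather than the real rustdoc/`libsyntax` types:

```rust
// Simplified stand-ins for the real lexer and token types.
#[derive(Clone, Debug, PartialEq)]
enum Token {
    Ident(String),
    Not,
    Eof,
}

struct Lexer {
    tokens: Vec<Token>,
    pos: usize,
}

impl Lexer {
    fn try_next_token(&mut self) -> Result<Token, ()> {
        let tok = self.tokens.get(self.pos).cloned().unwrap_or(Token::Eof);
        self.pos += 1;
        Ok(tok)
    }
}

// The consumer keeps a one-token buffer instead of asking the lexer to peek.
struct Classifier {
    lexer: Lexer,
    peek_token: Option<Token>,
}

impl Classifier {
    fn try_next_token(&mut self) -> Result<Token, ()> {
        // Serve the buffered token first, if there is one.
        if let Some(token) = self.peek_token.take() {
            return Ok(token);
        }
        self.lexer.try_next_token()
    }

    fn peek(&mut self) -> Result<&Token, ()> {
        // Pull one token ahead and park it in the buffer.
        if self.peek_token.is_none() {
            self.peek_token = Some(self.lexer.try_next_token()?);
        }
        Ok(self.peek_token.as_ref().unwrap())
    }
}

fn main() {
    let lexer = Lexer {
        tokens: vec![Token::Ident("vec".to_string()), Token::Not],
        pos: 0,
    };
    let mut classifier = Classifier { lexer, peek_token: None };
    assert_eq!(classifier.peek(), Ok(&Token::Ident("vec".to_string())));
    assert_eq!(classifier.try_next_token(), Ok(Token::Ident("vec".to_string())));
    assert_eq!(classifier.try_next_token(), Ok(Token::Not));
}
```

Both `peek` and `try_next_token` return `Result`, so lexer errors keep propagating to the caller rather than being swallowed by the peek buffer.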

After that, I noticed that the only peeking behavior left was checking raw token spans to test token-tree (tt) jointness. I rewrote it in terms of trivia tokens rather than raw spans.
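
A minimal sketch of what "jointness via trivia tokens" means; the types are simplified stand-ins, and unlike the real change this sketch does not additionally require the token to be an operator:

```rust
// Simplified stand-ins; the real logic lives in the token-tree reader.
#[derive(Clone, Copy, Debug, PartialEq)]
enum TokenKind {
    Op,
    Whitespace,
    Comment,
    Eof,
}

#[derive(Clone, Copy, Debug, PartialEq)]
enum IsJoint {
    Joint,
    NonJoint,
}

struct Reader {
    tokens: Vec<TokenKind>,
    pos: usize,
    // Whether the current token immediately follows the previous one,
    // i.e. no whitespace or comment trivia was seen in between.
    joint_to_prev: IsJoint,
}

impl Reader {
    fn next_token(&mut self) -> TokenKind {
        let tok = self.tokens.get(self.pos).copied().unwrap_or(TokenKind::Eof);
        self.pos += 1;
        tok
    }

    // Advance to the next non-trivia token, recording whether any trivia was skipped.
    fn real_token(&mut self) -> TokenKind {
        self.joint_to_prev = IsJoint::Joint;
        loop {
            match self.next_token() {
                TokenKind::Whitespace | TokenKind::Comment => {
                    self.joint_to_prev = IsJoint::NonJoint;
                }
                tok => return tok,
            }
        }
    }
}

fn main() {
    // An op, whitespace, two adjacent ops, a comment, then another op.
    let mut reader = Reader {
        tokens: vec![
            TokenKind::Op,
            TokenKind::Whitespace,
            TokenKind::Op,
            TokenKind::Op,
            TokenKind::Comment,
            TokenKind::Op,
        ],
        pos: 0,
        joint_to_prev: IsJoint::Joint,
    };
    reader.real_token();
    reader.real_token(); // reached after skipping whitespace
    assert_eq!(reader.joint_to_prev, IsJoint::NonJoint);
    reader.real_token(); // immediately follows the previous op
    assert_eq!(reader.joint_to_prev, IsJoint::Joint);
    reader.real_token(); // reached after skipping a comment
    assert_eq!(reader.joint_to_prev, IsJoint::NonJoint);
}
```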

After that, it became possible to simplify the lexer's awkward constructor, which could return `Err` if the first peeked token contained an error.
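
A toy sketch of why dropping the eager peek makes construction infallible (again with hypothetical, simplified types, not the real `StringReader` API):

```rust
// Simplified stand-ins, not the real StringReader API.
struct Lexer {
    src: Vec<char>,
    pos: usize,
}

impl Lexer {
    // New-style constructor: it only stores state, so it cannot fail.
    fn new(src: &str) -> Lexer {
        Lexer { src: src.chars().collect(), pos: 0 }
    }

    // Errors surface lazily, when a token is actually requested.
    fn try_next_token(&mut self) -> Result<char, String> {
        match self.src.get(self.pos) {
            Some(&c) if c.is_ascii() => {
                self.pos += 1;
                Ok(c)
            }
            Some(&c) => Err(format!("cannot lex character {:?}", c)),
            None => Ok('\0'), // stand-in for an EOF token
        }
    }
}

fn main() {
    // Construction always succeeds, even for input that will later fail to lex.
    let mut lexer = Lexer::new("aé");
    assert_eq!(lexer.try_next_token(), Ok('a'));
    assert!(lexer.try_next_token().is_err());
}
```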
Mazdak Farrokhzad 2019-07-06 02:38:01 +02:00 committed by GitHub
commit 952ee77871
7 changed files with 128 additions and 179 deletions


@@ -53,7 +53,7 @@ impl<'a> SpanUtils<'a> {
pub fn sub_span_of_token(&self, span: Span, tok: TokenKind) -> Option<Span> {
let mut toks = self.retokenise_span(span);
loop {
let next = toks.real_token();
let next = toks.next_token();
if next == token::Eof {
return None;
}


@@ -38,17 +38,17 @@ pub fn render_with_highlighting(
FileName::Custom(String::from("rustdoc-highlighting")),
src.to_owned(),
);
let highlight_result =
lexer::StringReader::new_or_buffered_errs(&sess, fm, None).and_then(|lexer| {
let mut classifier = Classifier::new(lexer, sess.source_map());
let highlight_result = {
let lexer = lexer::StringReader::new(&sess, fm, None);
let mut classifier = Classifier::new(lexer, sess.source_map());
let mut highlighted_source = vec![];
if classifier.write_source(&mut highlighted_source).is_err() {
Err(classifier.lexer.buffer_fatal_errors())
} else {
Ok(String::from_utf8_lossy(&highlighted_source).into_owned())
}
});
let mut highlighted_source = vec![];
if classifier.write_source(&mut highlighted_source).is_err() {
Err(classifier.lexer.buffer_fatal_errors())
} else {
Ok(String::from_utf8_lossy(&highlighted_source).into_owned())
}
};
match highlight_result {
Ok(highlighted_source) => {
@@ -79,6 +79,7 @@ pub fn render_with_highlighting(
/// each span of text in sequence.
struct Classifier<'a> {
lexer: lexer::StringReader<'a>,
peek_token: Option<Token>,
source_map: &'a SourceMap,
// State of the classifier.
@@ -178,6 +179,7 @@ impl<'a> Classifier<'a> {
fn new(lexer: lexer::StringReader<'a>, source_map: &'a SourceMap) -> Classifier<'a> {
Classifier {
lexer,
peek_token: None,
source_map,
in_attribute: false,
in_macro: false,
@@ -187,10 +189,19 @@ impl<'a> Classifier<'a> {
/// Gets the next token out of the lexer.
fn try_next_token(&mut self) -> Result<Token, HighlightError> {
match self.lexer.try_next_token() {
Ok(token) => Ok(token),
Err(_) => Err(HighlightError::LexError),
if let Some(token) = self.peek_token.take() {
return Ok(token);
}
self.lexer.try_next_token().map_err(|()| HighlightError::LexError)
}
fn peek(&mut self) -> Result<&Token, HighlightError> {
if self.peek_token.is_none() {
self.peek_token = Some(
self.lexer.try_next_token().map_err(|()| HighlightError::LexError)?
);
}
Ok(self.peek_token.as_ref().unwrap())
}
/// Exhausts the `lexer` writing the output into `out`.
@@ -234,7 +245,7 @@ impl<'a> Classifier<'a> {
// reference or dereference operator or a reference or pointer type, instead of the
// bit-and or multiplication operator.
token::BinOp(token::And) | token::BinOp(token::Star)
if self.lexer.peek() != &token::Whitespace => Class::RefKeyWord,
if self.peek()? != &token::Whitespace => Class::RefKeyWord,
// Consider this as part of a macro invocation if there was a
// leading identifier.
@@ -257,7 +268,7 @@ impl<'a> Classifier<'a> {
token::Question => Class::QuestionMark,
token::Dollar => {
if self.lexer.peek().is_ident() {
if self.peek()?.is_ident() {
self.in_macro_nonterminal = true;
Class::MacroNonTerminal
} else {
@@ -280,9 +291,9 @@ impl<'a> Classifier<'a> {
// as an attribute.
// Case 1: #![inner_attribute]
if self.lexer.peek() == &token::Not {
if self.peek()? == &token::Not {
self.try_next_token()?; // NOTE: consumes `!` token!
if self.lexer.peek() == &token::OpenDelim(token::Bracket) {
if self.peek()? == &token::OpenDelim(token::Bracket) {
self.in_attribute = true;
out.enter_span(Class::Attribute)?;
}
@@ -292,7 +303,7 @@ impl<'a> Classifier<'a> {
}
// Case 2: #[outer_attribute]
if self.lexer.peek() == &token::OpenDelim(token::Bracket) {
if self.peek()? == &token::OpenDelim(token::Bracket) {
self.in_attribute = true;
out.enter_span(Class::Attribute)?;
}
@@ -341,7 +352,7 @@ impl<'a> Classifier<'a> {
if self.in_macro_nonterminal {
self.in_macro_nonterminal = false;
Class::MacroNonTerminal
} else if self.lexer.peek() == &token::Not {
} else if self.peek()? == &token::Not {
self.in_macro = true;
Class::Macro
} else {


@@ -32,7 +32,8 @@ impl<'a, 'tcx> SyntaxChecker<'a, 'tcx> {
dox[code_block.code].to_owned(),
);
let errors = Lexer::new_or_buffered_errs(&sess, source_file, None).and_then(|mut lexer| {
let errors = {
let mut lexer = Lexer::new(&sess, source_file, None);
while let Ok(token::Token { kind, .. }) = lexer.try_next_token() {
if kind == token::Eof {
break;
@@ -46,7 +47,7 @@ impl<'a, 'tcx> SyntaxChecker<'a, 'tcx> {
} else {
Ok(())
}
});
};
if let Err(errors) = errors {
let mut diag = if let Some(sp) =


@@ -268,7 +268,7 @@ fn read_block_comment(rdr: &mut StringReader<'_>,
while level > 0 {
debug!("=== block comment level {}", level);
if rdr.is_eof() {
rdr.fatal("unterminated block comment").raise();
rdr.fatal_span_(rdr.pos, rdr.pos, "unterminated block comment").raise();
}
if rdr.ch_is('\n') {
trim_whitespace_prefix_and_push_line(&mut lines, curr_line, col);
@@ -346,7 +346,7 @@ pub fn gather_comments(sess: &ParseSess, path: FileName, srdr: &mut dyn Read) ->
srdr.read_to_string(&mut src).unwrap();
let cm = SourceMap::new(sess.source_map().path_mapping().clone());
let source_file = cm.new_source_file(path, src);
let mut rdr = lexer::StringReader::new_raw(sess, source_file, None);
let mut rdr = lexer::StringReader::new(sess, source_file, None);
let mut comments: Vec<Comment> = Vec::new();
let mut code_to_the_left = false; // Only code


@@ -38,9 +38,6 @@ pub struct StringReader<'a> {
crate source_file: Lrc<syntax_pos::SourceFile>,
/// Stop reading src at this index.
crate end_src_index: usize,
// cached:
peek_token: Token,
peek_span_src_raw: Span,
fatal_errs: Vec<DiagnosticBuilder<'a>>,
// cache a direct reference to the source text, so that we don't have to
// retrieve it via `self.source_file.src.as_ref().unwrap()` all the time.
@@ -49,15 +46,59 @@ pub struct StringReader<'a> {
}
impl<'a> StringReader<'a> {
fn mk_sp(&self, lo: BytePos, hi: BytePos) -> Span {
self.mk_sp_and_raw(lo, hi).0
pub fn new(sess: &'a ParseSess,
source_file: Lrc<syntax_pos::SourceFile>,
override_span: Option<Span>) -> Self {
let mut sr = StringReader::new_internal(sess, source_file, override_span);
sr.bump();
sr
}
fn mk_sp_and_raw(&self, lo: BytePos, hi: BytePos) -> (Span, Span) {
let raw = Span::new(lo, hi, NO_EXPANSION);
let real = self.override_span.unwrap_or(raw);
pub fn retokenize(sess: &'a ParseSess, mut span: Span) -> Self {
let begin = sess.source_map().lookup_byte_offset(span.lo());
let end = sess.source_map().lookup_byte_offset(span.hi());
(real, raw)
// Make the range zero-length if the span is invalid.
if span.lo() > span.hi() || begin.sf.start_pos != end.sf.start_pos {
span = span.shrink_to_lo();
}
let mut sr = StringReader::new_internal(sess, begin.sf, None);
// Seek the lexer to the right byte range.
sr.next_pos = span.lo();
sr.end_src_index = sr.src_index(span.hi());
sr.bump();
sr
}
fn new_internal(sess: &'a ParseSess, source_file: Lrc<syntax_pos::SourceFile>,
override_span: Option<Span>) -> Self
{
if source_file.src.is_none() {
sess.span_diagnostic.bug(&format!("Cannot lex source_file without source: {}",
source_file.name));
}
let src = (*source_file.src.as_ref().unwrap()).clone();
StringReader {
sess,
next_pos: source_file.start_pos,
pos: source_file.start_pos,
ch: Some('\n'),
source_file,
end_src_index: src.len(),
src,
fatal_errs: Vec::new(),
override_span,
}
}
fn mk_sp(&self, lo: BytePos, hi: BytePos) -> Span {
self.override_span.unwrap_or_else(|| Span::new(lo, hi, NO_EXPANSION))
}
fn unwrap_or_abort(&mut self, res: Result<Token, ()>) -> Token {
@@ -70,35 +111,32 @@ impl<'a> StringReader<'a> {
}
}
fn next_token(&mut self) -> Token where Self: Sized {
let res = self.try_next_token();
self.unwrap_or_abort(res)
}
/// Returns the next token. EFFECT: advances the string_reader.
/// Returns the next token, including trivia like whitespace or comments.
///
/// `Err(())` means that some errors were encountered, which can be
/// retrieved using `buffer_fatal_errors`.
pub fn try_next_token(&mut self) -> Result<Token, ()> {
assert!(self.fatal_errs.is_empty());
let ret_val = self.peek_token.take();
self.advance_token()?;
Ok(ret_val)
}
fn try_real_token(&mut self) -> Result<Token, ()> {
let mut t = self.try_next_token()?;
loop {
match t.kind {
token::Whitespace | token::Comment | token::Shebang(_) => {
t = self.try_next_token()?;
}
_ => break,
match self.scan_whitespace_or_comment() {
Some(comment) => Ok(comment),
None => {
let (kind, start_pos, end_pos) = if self.is_eof() {
(token::Eof, self.source_file.end_pos, self.source_file.end_pos)
} else {
let start_pos = self.pos;
(self.next_token_inner()?, start_pos, self.pos)
};
let span = self.mk_sp(start_pos, end_pos);
Ok(Token::new(kind, span))
}
}
Ok(t)
}
pub fn real_token(&mut self) -> Token {
let res = self.try_real_token();
/// Returns the next token, including trivia like whitespace or comments.
///
/// Aborts in case of an error.
pub fn next_token(&mut self) -> Token {
let res = self.try_next_token();
self.unwrap_or_abort(res)
}
@@ -120,10 +158,6 @@ impl<'a> StringReader<'a> {
FatalError.raise();
}
fn fatal(&self, m: &str) -> FatalError {
self.fatal_span(self.peek_token.span, m)
}
crate fn emit_fatal_errors(&mut self) {
for err in &mut self.fatal_errs {
err.emit();
@@ -142,81 +176,6 @@ impl<'a> StringReader<'a> {
buffer
}
pub fn peek(&self) -> &Token {
&self.peek_token
}
/// For comments.rs, which hackily pokes into next_pos and ch
fn new_raw(sess: &'a ParseSess,
source_file: Lrc<syntax_pos::SourceFile>,
override_span: Option<Span>) -> Self {
let mut sr = StringReader::new_raw_internal(sess, source_file, override_span);
sr.bump();
sr
}
fn new_raw_internal(sess: &'a ParseSess, source_file: Lrc<syntax_pos::SourceFile>,
override_span: Option<Span>) -> Self
{
if source_file.src.is_none() {
sess.span_diagnostic.bug(&format!("Cannot lex source_file without source: {}",
source_file.name));
}
let src = (*source_file.src.as_ref().unwrap()).clone();
StringReader {
sess,
next_pos: source_file.start_pos,
pos: source_file.start_pos,
ch: Some('\n'),
source_file,
end_src_index: src.len(),
peek_token: Token::dummy(),
peek_span_src_raw: syntax_pos::DUMMY_SP,
src,
fatal_errs: Vec::new(),
override_span,
}
}
pub fn new_or_buffered_errs(sess: &'a ParseSess,
source_file: Lrc<syntax_pos::SourceFile>,
override_span: Option<Span>) -> Result<Self, Vec<Diagnostic>> {
let mut sr = StringReader::new_raw(sess, source_file, override_span);
if sr.advance_token().is_err() {
Err(sr.buffer_fatal_errors())
} else {
Ok(sr)
}
}
pub fn retokenize(sess: &'a ParseSess, mut span: Span) -> Self {
let begin = sess.source_map().lookup_byte_offset(span.lo());
let end = sess.source_map().lookup_byte_offset(span.hi());
// Make the range zero-length if the span is invalid.
if span.lo() > span.hi() || begin.sf.start_pos != end.sf.start_pos {
span = span.shrink_to_lo();
}
let mut sr = StringReader::new_raw_internal(sess, begin.sf, None);
// Seek the lexer to the right byte range.
sr.next_pos = span.lo();
sr.end_src_index = sr.src_index(span.hi());
sr.bump();
if sr.advance_token().is_err() {
sr.emit_fatal_errors();
FatalError.raise();
}
sr
}
#[inline]
fn ch_is(&self, c: char) -> bool {
self.ch == Some(c)
@@ -269,30 +228,6 @@ impl<'a> StringReader<'a> {
self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), &m[..])
}
/// Advance peek_token to refer to the next token, and
/// possibly update the interner.
fn advance_token(&mut self) -> Result<(), ()> {
match self.scan_whitespace_or_comment() {
Some(comment) => {
self.peek_span_src_raw = comment.span;
self.peek_token = comment;
}
None => {
let (kind, start_pos, end_pos) = if self.is_eof() {
(token::Eof, self.source_file.end_pos, self.source_file.end_pos)
} else {
let start_pos = self.pos;
(self.next_token_inner()?, start_pos, self.pos)
};
let (real, raw) = self.mk_sp_and_raw(start_pos, end_pos);
self.peek_token = Token::new(kind, real);
self.peek_span_src_raw = raw;
}
}
Ok(())
}
#[inline]
fn src_index(&self, pos: BytePos) -> usize {
(pos - self.source_file.start_pos).to_usize()
@@ -1462,12 +1397,7 @@ mod tests {
teststr: String)
-> StringReader<'a> {
let sf = sm.new_source_file(PathBuf::from(teststr.clone()).into(), teststr);
let mut sr = StringReader::new_raw(sess, sf, None);
if sr.advance_token().is_err() {
sr.emit_fatal_errors();
FatalError.raise();
}
sr
StringReader::new(sess, sf, None)
}
#[test]
@@ -1489,17 +1419,17 @@ mod tests {
assert_eq!(tok1.kind, tok2.kind);
assert_eq!(tok1.span, tok2.span);
assert_eq!(string_reader.next_token(), token::Whitespace);
// the 'main' id is already read:
assert_eq!(string_reader.pos.clone(), BytePos(28));
// read another token:
let tok3 = string_reader.next_token();
assert_eq!(string_reader.pos.clone(), BytePos(28));
let tok4 = Token::new(
mk_ident("main"),
Span::new(BytePos(24), BytePos(28), NO_EXPANSION),
);
assert_eq!(tok3.kind, tok4.kind);
assert_eq!(tok3.span, tok4.span);
// the lparen is already read:
assert_eq!(string_reader.next_token(), token::OpenDelim(token::Paren));
assert_eq!(string_reader.pos.clone(), BytePos(29))
})
}


@@ -4,13 +4,14 @@ use crate::print::pprust::token_to_string;
use crate::parse::lexer::{StringReader, UnmatchedBrace};
use crate::parse::token::{self, Token};
use crate::parse::PResult;
use crate::tokenstream::{DelimSpan, IsJoint::*, TokenStream, TokenTree, TreeAndJoint};
use crate::tokenstream::{DelimSpan, IsJoint::{self, *}, TokenStream, TokenTree, TreeAndJoint};
impl<'a> StringReader<'a> {
crate fn into_token_trees(self) -> (PResult<'a, TokenStream>, Vec<UnmatchedBrace>) {
let mut tt_reader = TokenTreesReader {
string_reader: self,
token: Token::dummy(),
joint_to_prev: Joint,
open_braces: Vec::new(),
unmatched_braces: Vec::new(),
matching_delim_spans: Vec::new(),
@@ -24,6 +25,7 @@ impl<'a> StringReader<'a> {
struct TokenTreesReader<'a> {
string_reader: StringReader<'a>,
token: Token,
joint_to_prev: IsJoint,
/// Stack of open delimiters and their spans. Used for error message.
open_braces: Vec<(token::DelimToken, Span)>,
unmatched_braces: Vec<UnmatchedBrace>,
@@ -203,21 +205,26 @@ impl<'a> TokenTreesReader<'a> {
},
_ => {
let tt = TokenTree::Token(self.token.take());
// Note that testing for joint-ness here is done via the raw
// source span as the joint-ness is a property of the raw source
// rather than wanting to take `override_span` into account.
// Additionally, we actually check if the *next* pair of tokens
// is joint, but this is equivalent to checking the current pair.
let raw = self.string_reader.peek_span_src_raw;
self.real_token();
let is_joint = raw.hi() == self.string_reader.peek_span_src_raw.lo()
&& self.token.is_op();
let is_joint = self.joint_to_prev == Joint && self.token.is_op();
Ok((tt, if is_joint { Joint } else { NonJoint }))
}
}
}
fn real_token(&mut self) {
self.token = self.string_reader.real_token();
self.joint_to_prev = Joint;
loop {
let token = self.string_reader.next_token();
match token.kind {
token::Whitespace | token::Comment | token::Shebang(_) => {
self.joint_to_prev = NonJoint;
}
_ => {
self.token = token;
return;
},
}
}
}
}


@@ -308,7 +308,7 @@ pub fn maybe_file_to_stream(
source_file: Lrc<SourceFile>,
override_span: Option<Span>,
) -> Result<(TokenStream, Vec<lexer::UnmatchedBrace>), Vec<Diagnostic>> {
let srdr = lexer::StringReader::new_or_buffered_errs(sess, source_file, override_span)?;
let srdr = lexer::StringReader::new(sess, source_file, override_span);
let (token_trees, unmatched_braces) = srdr.into_token_trees();
match token_trees {