Rollup merge of #72884 - Julian-Wollersberger:raw_str_error_cleanup, r=petrochenkov

RawString error reporting cleanup

I simplified how errors with raw strings are represented in the lexer and reported in the parser, by using one enum instead of two structs with impls. This makes 70 lines of code obsolete.

I also noticed some other things (2nd commit) and added a missing test for the `too many '#' symbols` error.

My original intent was to improve performance, but the only thing I found was to inline some functions in `cursor.rs`. Its effect is barely measurable, though.

There is one open question. Before, the compiler aborted when encountering the `too many '#' symbols` error. Now the lexer says in this case that there are 0 hashes, and then later the parser aborts on the error.
I'm worried that the parser may be changed to recover and continue, and then later stages will see the wrong number of hashes and act strangely (e.g. the `format!` macro expansion).
Is that possibility important enough today to worry about it?
This commit is contained in:
Dylan DPC 2020-06-02 18:29:57 +02:00 committed by GitHub
commit 466d3e702a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 90 additions and 240 deletions

View File

@ -29,7 +29,7 @@ mod tests;
use self::LiteralKind::*;
use self::TokenKind::*;
use crate::cursor::{Cursor, EOF_CHAR};
use std::convert::TryInto;
use std::convert::TryFrom;
/// Parsed token.
/// It doesn't contain information about data that has been parsed,
@ -142,84 +142,24 @@ pub enum LiteralKind {
/// "b"abc"", "b"abc"
ByteStr { terminated: bool },
/// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
RawStr(UnvalidatedRawStr),
RawStr { n_hashes: u16, err: Option<RawStrError> },
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
RawByteStr(UnvalidatedRawStr),
}
/// Represents something that looks like a raw string, but may have some
/// problems. Use `.validate()` to convert it into something
/// usable.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct UnvalidatedRawStr {
/// The prefix (`r###"`) is valid
valid_start: bool,
/// The postfix (`"###`) is valid
valid_end: bool,
/// The number of leading `#`
n_start_hashes: usize,
/// The number of trailing `#`. `n_end_hashes` <= `n_start_hashes`
n_end_hashes: usize,
/// The offset starting at `r` or `br` where the user may have intended to end the string.
/// Currently, it is the longest sequence of pattern `"#+"`.
possible_terminator_offset: Option<usize>,
RawByteStr { n_hashes: u16, err: Option<RawStrError> },
}
/// Error produced validating a raw string. Represents cases like:
/// - `r##~"abcde"##`: `LexRawStrError::InvalidStarter`
/// - `r###"abcde"##`: `LexRawStrError::NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11)`
/// - Too many `#`s (>65536): `TooManyDelimiters`
/// - `r##~"abcde"##`: `InvalidStarter`
/// - `r###"abcde"##`: `NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11) }`
/// - Too many `#`s (>65535): `TooManyDelimiters`
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LexRawStrError {
pub enum RawStrError {
/// Non `#` characters exist between `r` and `"` eg. `r#~"..`
InvalidStarter,
InvalidStarter { bad_char: char },
/// The string was never terminated. `possible_terminator_offset` is the number of characters after `r` or `br` where they
/// may have intended to terminate it.
NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
/// More than 65536 `#`s exist.
TooManyDelimiters,
}
/// Raw String that contains a valid prefix (`#+"`) and postfix (`"#+`) where
/// there are a matching number of `#` characters in both. Note that this will
/// not consume extra trailing `#` characters: `r###"abcde"####` is lexed as a
/// `ValidatedRawString { n_hashes: 3 }` followed by a `#` token.
#[derive(Debug, Eq, PartialEq, Copy, Clone)]
pub struct ValidatedRawStr {
n_hashes: u16,
}
impl ValidatedRawStr {
pub fn num_hashes(&self) -> u16 {
self.n_hashes
}
}
impl UnvalidatedRawStr {
pub fn validate(self) -> Result<ValidatedRawStr, LexRawStrError> {
if !self.valid_start {
return Err(LexRawStrError::InvalidStarter);
}
// Only up to 65535 `#`s are allowed in raw strings
let n_start_safe: u16 =
self.n_start_hashes.try_into().map_err(|_| LexRawStrError::TooManyDelimiters)?;
if self.n_start_hashes > self.n_end_hashes || !self.valid_end {
Err(LexRawStrError::NoTerminator {
expected: self.n_start_hashes,
found: self.n_end_hashes,
possible_terminator_offset: self.possible_terminator_offset,
})
} else {
// Since the lexer should never produce a literal with n_end > n_start, if n_start <= n_end,
// they must be equal.
debug_assert_eq!(self.n_start_hashes, self.n_end_hashes);
Ok(ValidatedRawStr { n_hashes: n_start_safe })
}
}
/// More than 65535 `#`s exist.
TooManyDelimiters { found: usize },
}
/// Base of numeric literal encoding according to its prefix.
@ -354,12 +294,12 @@ impl Cursor<'_> {
'r' => match (self.first(), self.second()) {
('#', c1) if is_id_start(c1) => self.raw_ident(),
('#', _) | ('"', _) => {
let raw_str_i = self.raw_double_quoted_string(1);
let (n_hashes, err) = self.raw_double_quoted_string(1);
let suffix_start = self.len_consumed();
if raw_str_i.n_end_hashes == raw_str_i.n_start_hashes {
if err.is_none() {
self.eat_literal_suffix();
}
let kind = RawStr(raw_str_i);
let kind = RawStr { n_hashes, err };
Literal { kind, suffix_start }
}
_ => self.ident(),
@ -389,14 +329,12 @@ impl Cursor<'_> {
}
('r', '"') | ('r', '#') => {
self.bump();
let raw_str_i = self.raw_double_quoted_string(2);
let (n_hashes, err) = self.raw_double_quoted_string(2);
let suffix_start = self.len_consumed();
let terminated = raw_str_i.n_start_hashes == raw_str_i.n_end_hashes;
if terminated {
if err.is_none() {
self.eat_literal_suffix();
}
let kind = RawByteStr(raw_str_i);
let kind = RawByteStr { n_hashes, err };
Literal { kind, suffix_start }
}
_ => self.ident(),
@ -692,27 +630,34 @@ impl Cursor<'_> {
false
}
/// Eats the double-quoted string and returns an `UnvalidatedRawStr`.
fn raw_double_quoted_string(&mut self, prefix_len: usize) -> UnvalidatedRawStr {
/// Eats the double-quoted string and returns `n_hashes` and an error if encountered.
fn raw_double_quoted_string(&mut self, prefix_len: usize) -> (u16, Option<RawStrError>) {
// Wrap the actual function to handle the error with too many hashes.
// This way, it eats the whole raw string.
let (n_hashes, err) = self.raw_string_unvalidated(prefix_len);
// Only up to 65535 `#`s are allowed in raw strings
match u16::try_from(n_hashes) {
Ok(num) => (num, err),
// We lie about the number of hashes here :P
Err(_) => (0, Some(RawStrError::TooManyDelimiters { found: n_hashes })),
}
}
fn raw_string_unvalidated(&mut self, prefix_len: usize) -> (usize, Option<RawStrError>) {
debug_assert!(self.prev() == 'r');
let mut valid_start: bool = false;
let start_pos = self.len_consumed();
let (mut possible_terminator_offset, mut max_hashes) = (None, 0);
let mut possible_terminator_offset = None;
let mut max_hashes = 0;
// Count opening '#' symbols.
let n_start_hashes = self.eat_while(|c| c == '#');
// Check that string is started.
match self.bump() {
Some('"') => valid_start = true,
_ => {
return UnvalidatedRawStr {
valid_start,
valid_end: false,
n_start_hashes,
n_end_hashes: 0,
possible_terminator_offset,
};
Some('"') => (),
c => {
let c = c.unwrap_or(EOF_CHAR);
return (n_start_hashes, Some(RawStrError::InvalidStarter { bad_char: c }));
}
}
@ -722,13 +667,14 @@ impl Cursor<'_> {
self.eat_while(|c| c != '"');
if self.is_eof() {
return UnvalidatedRawStr {
valid_start,
valid_end: false,
return (
n_start_hashes,
n_end_hashes: max_hashes,
possible_terminator_offset,
};
Some(RawStrError::NoTerminator {
expected: n_start_hashes,
found: max_hashes,
possible_terminator_offset,
}),
);
}
// Eat closing double quote.
@ -737,7 +683,7 @@ impl Cursor<'_> {
// Check that amount of closing '#' symbols
// is equal to the amount of opening ones.
// Note that this will not consume extra trailing `#` characters:
// `r###"abcde"####` is lexed as a `LexedRawString { n_hashes: 3 }`
// `r###"abcde"####` is lexed as a `RawStr { n_hashes: 3 }`
// followed by a `#` token.
let mut hashes_left = n_start_hashes;
let is_closing_hash = |c| {
@ -751,13 +697,7 @@ impl Cursor<'_> {
let n_end_hashes = self.eat_while(is_closing_hash);
if n_end_hashes == n_start_hashes {
return UnvalidatedRawStr {
valid_start,
valid_end: true,
n_start_hashes,
n_end_hashes,
possible_terminator_offset: None,
};
return (n_start_hashes, None);
} else if n_end_hashes > max_hashes {
// Keep track of possible terminators to give a hint about
// where there might be a missing terminator

View File

@ -2,77 +2,37 @@
mod tests {
use crate::*;
fn check_raw_str(
s: &str,
expected: UnvalidatedRawStr,
validated: Result<ValidatedRawStr, LexRawStrError>,
) {
fn check_raw_str(s: &str, expected_hashes: u16, expected_err: Option<RawStrError>) {
let s = &format!("r{}", s);
let mut cursor = Cursor::new(s);
cursor.bump();
let tok = cursor.raw_double_quoted_string(0);
assert_eq!(tok, expected);
assert_eq!(tok.validate(), validated);
let (n_hashes, err) = cursor.raw_double_quoted_string(0);
assert_eq!(n_hashes, expected_hashes);
assert_eq!(err, expected_err);
}
#[test]
fn test_naked_raw_str() {
check_raw_str(
r#""abc""#,
UnvalidatedRawStr {
n_start_hashes: 0,
n_end_hashes: 0,
valid_start: true,
valid_end: true,
possible_terminator_offset: None,
},
Ok(ValidatedRawStr { n_hashes: 0 }),
);
check_raw_str(r#""abc""#, 0, None);
}
#[test]
fn test_raw_no_start() {
check_raw_str(
r##""abc"#"##,
UnvalidatedRawStr {
n_start_hashes: 0,
n_end_hashes: 0,
valid_start: true,
valid_end: true,
possible_terminator_offset: None,
},
Ok(ValidatedRawStr { n_hashes: 0 }),
);
check_raw_str(r##""abc"#"##, 0, None);
}
#[test]
fn test_too_many_terminators() {
// this error is handled in the parser later
check_raw_str(
r###"#"abc"##"###,
UnvalidatedRawStr {
n_start_hashes: 1,
n_end_hashes: 1,
valid_end: true,
valid_start: true,
possible_terminator_offset: None,
},
Ok(ValidatedRawStr { n_hashes: 1 }),
);
check_raw_str(r###"#"abc"##"###, 1, None);
}
#[test]
fn test_unterminated() {
check_raw_str(
r#"#"abc"#,
UnvalidatedRawStr {
n_start_hashes: 1,
n_end_hashes: 0,
valid_end: false,
valid_start: true,
possible_terminator_offset: None,
},
Err(LexRawStrError::NoTerminator {
1,
Some(RawStrError::NoTerminator {
expected: 1,
found: 0,
possible_terminator_offset: None,
@ -80,14 +40,8 @@ mod tests {
);
check_raw_str(
r###"##"abc"#"###,
UnvalidatedRawStr {
n_start_hashes: 2,
n_end_hashes: 1,
valid_start: true,
valid_end: false,
possible_terminator_offset: Some(7),
},
Err(LexRawStrError::NoTerminator {
2,
Some(RawStrError::NoTerminator {
expected: 2,
found: 1,
possible_terminator_offset: Some(7),
@ -96,14 +50,8 @@ mod tests {
// We're looking for "# not just any #
check_raw_str(
r###"##"abc#"###,
UnvalidatedRawStr {
n_start_hashes: 2,
n_end_hashes: 0,
valid_start: true,
valid_end: false,
possible_terminator_offset: None,
},
Err(LexRawStrError::NoTerminator {
2,
Some(RawStrError::NoTerminator {
expected: 2,
found: 0,
possible_terminator_offset: None,
@ -113,17 +61,7 @@ mod tests {
#[test]
fn test_invalid_start() {
check_raw_str(
r##"#~"abc"#"##,
UnvalidatedRawStr {
n_start_hashes: 1,
n_end_hashes: 0,
valid_start: false,
valid_end: false,
possible_terminator_offset: None,
},
Err(LexRawStrError::InvalidStarter),
);
check_raw_str(r##"#~"abc"#"##, 1, Some(RawStrError::InvalidStarter { bad_char: '~' }));
}
#[test]
@ -131,14 +69,8 @@ mod tests {
// https://github.com/rust-lang/rust/issues/70677
check_raw_str(
r#"""#,
UnvalidatedRawStr {
n_start_hashes: 0,
n_end_hashes: 0,
valid_start: true,
valid_end: false,
possible_terminator_offset: None,
},
Err(LexRawStrError::NoTerminator {
0,
Some(RawStrError::NoTerminator {
expected: 0,
found: 0,
possible_terminator_offset: None,

View File

@ -3,7 +3,7 @@ use rustc_ast::util::comments;
use rustc_data_structures::sync::Lrc;
use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError};
use rustc_lexer::Base;
use rustc_lexer::{unescape, LexRawStrError, UnvalidatedRawStr, ValidatedRawStr};
use rustc_lexer::{unescape, RawStrError};
use rustc_session::parse::ParseSess;
use rustc_span::symbol::{sym, Symbol};
use rustc_span::{BytePos, Pos, Span};
@ -49,13 +49,12 @@ impl<'a> StringReader<'a> {
// Make sure external source is loaded first, before accessing it.
// While this can't show up during normal parsing, `retokenize` may
// be called with a source file from an external crate.
sess.source_map().ensure_source_file_source_present(source_file.clone());
sess.source_map().ensure_source_file_source_present(Lrc::clone(&source_file));
// FIXME(eddyb) use `Lrc<str>` or similar to avoid cloning the `String`.
let src = if let Some(src) = &source_file.src {
src.clone()
Lrc::clone(&src)
} else if let Some(src) = source_file.external_src.borrow().get_source() {
src.clone()
Lrc::clone(&src)
} else {
sess.span_diagnostic
.bug(&format!("cannot lex `source_file` without source: {}", source_file.name));
@ -125,10 +124,7 @@ impl<'a> StringReader<'a> {
debug!("try_next_token: {:?}({:?})", token.kind, self.str_from(start));
// This could use `?`, but that makes code significantly (10-20%) slower.
// https://github.com/rust-lang/rust/issues/37939
let kind = self.cook_lexer_token(token.kind, start);
let span = self.mk_sp(start, self.pos);
Token::new(kind, span)
}
@ -153,15 +149,6 @@ impl<'a> StringReader<'a> {
self.err_span(self.mk_sp(from_pos, to_pos), m)
}
fn struct_span_fatal(
&self,
from_pos: BytePos,
to_pos: BytePos,
m: &str,
) -> DiagnosticBuilder<'a> {
self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), m)
}
fn struct_fatal_span_char(
&self,
from_pos: BytePos,
@ -359,15 +346,13 @@ impl<'a> StringReader<'a> {
}
(token::ByteStr, Mode::ByteStr, 2, 1) // b" "
}
rustc_lexer::LiteralKind::RawStr(unvalidated_raw_str) => {
let valid_raw_str = self.validate_and_report_errors(start, unvalidated_raw_str);
let n_hashes = valid_raw_str.num_hashes();
rustc_lexer::LiteralKind::RawStr { n_hashes, err } => {
self.report_raw_str_error(start, err);
let n = u32::from(n_hashes);
(token::StrRaw(n_hashes), Mode::RawStr, 2 + n, 1 + n) // r##" "##
}
rustc_lexer::LiteralKind::RawByteStr(unvalidated_raw_str) => {
let validated_raw_str = self.validate_and_report_errors(start, unvalidated_raw_str);
let n_hashes = validated_raw_str.num_hashes();
rustc_lexer::LiteralKind::RawByteStr { n_hashes, err } => {
self.report_raw_str_error(start, err);
let n = u32::from(n_hashes);
(token::ByteStrRaw(n_hashes), Mode::RawByteStr, 3 + n, 1 + n) // br##" "##
}
@ -382,12 +367,7 @@ impl<'a> StringReader<'a> {
}
rustc_lexer::LiteralKind::Float { base, empty_exponent } => {
if empty_exponent {
let mut err = self.struct_span_fatal(
start,
self.pos,
"expected at least one digit in exponent",
);
err.emit();
self.err_span_(start, self.pos, "expected at least one digit in exponent");
}
match base {
@ -459,33 +439,25 @@ impl<'a> StringReader<'a> {
}
}
fn validate_and_report_errors(
&self,
start: BytePos,
unvalidated_raw_str: UnvalidatedRawStr,
) -> ValidatedRawStr {
match unvalidated_raw_str.validate() {
Err(LexRawStrError::InvalidStarter) => self.report_non_started_raw_string(start),
Err(LexRawStrError::NoTerminator { expected, found, possible_terminator_offset }) => {
self.report_unterminated_raw_string(
start,
expected,
possible_terminator_offset,
found,
)
fn report_raw_str_error(&self, start: BytePos, opt_err: Option<RawStrError>) {
match opt_err {
Some(RawStrError::InvalidStarter { bad_char }) => {
self.report_non_started_raw_string(start, bad_char)
}
Err(LexRawStrError::TooManyDelimiters) => self.report_too_many_hashes(start),
Ok(valid) => valid,
Some(RawStrError::NoTerminator { expected, found, possible_terminator_offset }) => self
.report_unterminated_raw_string(start, expected, possible_terminator_offset, found),
Some(RawStrError::TooManyDelimiters { found }) => {
self.report_too_many_hashes(start, found)
}
None => (),
}
}
fn report_non_started_raw_string(&self, start: BytePos) -> ! {
let bad_char = self.str_from(start).chars().last().unwrap();
fn report_non_started_raw_string(&self, start: BytePos, bad_char: char) -> ! {
self.struct_fatal_span_char(
start,
self.pos,
"found invalid character; only `#` is allowed \
in raw string delimitation",
"found invalid character; only `#` is allowed in raw string delimitation",
bad_char,
)
.emit();
@ -530,11 +502,17 @@ impl<'a> StringReader<'a> {
FatalError.raise()
}
fn report_too_many_hashes(&self, start: BytePos) -> ! {
/// Note: It was decided to not add a test case, because it would be too big.
/// https://github.com/rust-lang/rust/pull/50296#issuecomment-392135180
fn report_too_many_hashes(&self, start: BytePos, found: usize) -> ! {
self.fatal_span_(
start,
self.pos,
"too many `#` symbols: raw strings may be delimited by up to 65535 `#` symbols",
&format!(
"too many `#` symbols: raw strings may be delimited \
by up to 65535 `#` symbols, but found {}",
found
),
)
.raise();
}