Rollup merge of #72884 - Julian-Wollersberger:raw_str_error_cleanup, r=petrochenkov

RawString error reporting cleanup. I simplified how errors with raw strings are represented in the lexer and reported in the parser, by using one enum instead of two structs with impls. This makes 70 code lines obsolete. I also noticed some other things (2nd commit) and added a missing test for the `too many '#' symbols` error. My original intent was to improve performance, but the only thing I found was to inline some functions in `cursor.rs`. Its effect is barely measurable, though. There is one open question. Before, the compiler aborted when encountering the `too many '#' symbols` error. Now the lexer says in this case that there are 0 hashes, and then later the parser aborts on the error. I'm worried that the parser may be changed to recover and continue, and then later stages will see the wrong number of hashes and act strangely (e.g. the `format!` macro expansion). Is that possibility important enough today to worry about it?
2020-06-02 18:29:57 +02:00 · 2020-06-02 18:29:57 +02:00 · 466d3e702a
commit 466d3e702a
parent eeaf497b2a 7be8077b3f
3 changed files with 90 additions and 240 deletions
--- a/src/librustc_lexer/src/lib.rs
+++ b/src/librustc_lexer/src/lib.rs
@ -29,7 +29,7 @@ mod tests;
 use self::LiteralKind::*;
 use self::TokenKind::*;
 use crate::cursor::{Cursor, EOF_CHAR};
-use std::convert::TryInto;
+use std::convert::TryFrom;

 /// Parsed token.
 /// It doesn't contain information about data that has been parsed,
@ -142,84 +142,24 @@ pub enum LiteralKind {
    /// "b"abc"", "b"abc"
    ByteStr { terminated: bool },
    /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
-    RawStr(UnvalidatedRawStr),
+    RawStr { n_hashes: u16, err: Option<RawStrError> },
    /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
-    RawByteStr(UnvalidatedRawStr),
-}
-
-/// Represents something that looks like a raw string, but may have some
-/// problems. Use `.validate()` to convert it into something
-/// usable.
-#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
-pub struct UnvalidatedRawStr {
-    /// The prefix (`r###"`) is valid
-    valid_start: bool,
-
-    /// The postfix (`"###`) is valid
-    valid_end: bool,
-
-    /// The number of leading `#`
-    n_start_hashes: usize,
-    /// The number of trailing `#`. `n_end_hashes` <= `n_start_hashes`
-    n_end_hashes: usize,
-    /// The offset starting at `r` or `br` where the user may have intended to end the string.
-    /// Currently, it is the longest sequence of pattern `"#+"`.
-    possible_terminator_offset: Option<usize>,
+    RawByteStr { n_hashes: u16, err: Option<RawStrError> },
 }

 /// Error produced validating a raw string. Represents cases like:
-/// - `r##~"abcde"##`: `LexRawStrError::InvalidStarter`
-/// - `r###"abcde"##`: `LexRawStrError::NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11)`
-/// - Too many `#`s (>65536): `TooManyDelimiters`
+/// - `r##~"abcde"##`: `InvalidStarter`
+/// - `r###"abcde"##`: `NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11) }`
+/// - Too many `#`s (>65535): `TooManyDelimiters`
 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
-pub enum LexRawStrError {
+pub enum RawStrError {
    /// Non `#` characters exist between `r` and `"` eg. `r#~"..`
-    InvalidStarter,
+    InvalidStarter { bad_char: char },
    /// The string was never terminated. `possible_terminator_offset` is the number of characters after `r` or `br` where they
    /// may have intended to terminate it.
    NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
-    /// More than 65536 `#`s exist.
-    TooManyDelimiters,
-}
-
-/// Raw String that contains a valid prefix (`#+"`) and postfix (`"#+`) where
-/// there are a matching number of `#` characters in both. Note that this will
-/// not consume extra trailing `#` characters: `r###"abcde"####` is lexed as a
-/// `ValidatedRawString { n_hashes: 3 }` followed by a `#` token.
-#[derive(Debug, Eq, PartialEq, Copy, Clone)]
-pub struct ValidatedRawStr {
-    n_hashes: u16,
-}
-
-impl ValidatedRawStr {
-    pub fn num_hashes(&self) -> u16 {
-        self.n_hashes
-    }
-}
-
-impl UnvalidatedRawStr {
-    pub fn validate(self) -> Result<ValidatedRawStr, LexRawStrError> {
-        if !self.valid_start {
-            return Err(LexRawStrError::InvalidStarter);
-        }
-
-        // Only up to 65535 `#`s are allowed in raw strings
-        let n_start_safe: u16 =
-            self.n_start_hashes.try_into().map_err(|_| LexRawStrError::TooManyDelimiters)?;
-
-        if self.n_start_hashes > self.n_end_hashes || !self.valid_end {
-            Err(LexRawStrError::NoTerminator {
-                expected: self.n_start_hashes,
-                found: self.n_end_hashes,
-                possible_terminator_offset: self.possible_terminator_offset,
-            })
-        } else {
-            // Since the lexer should never produce a literal with n_end > n_start, if n_start <= n_end,
-            // they must be equal.
-            debug_assert_eq!(self.n_start_hashes, self.n_end_hashes);
-            Ok(ValidatedRawStr { n_hashes: n_start_safe })
-        }
-    }
+    /// More than 65535 `#`s exist.
+    TooManyDelimiters { found: usize },
 }

 /// Base of numeric literal encoding according to its prefix.
@ -354,12 +294,12 @@ impl Cursor<'_> {
            'r' => match (self.first(), self.second()) {
                ('#', c1) if is_id_start(c1) => self.raw_ident(),
                ('#', _) | ('"', _) => {
-                    let raw_str_i = self.raw_double_quoted_string(1);
+                    let (n_hashes, err) = self.raw_double_quoted_string(1);
                    let suffix_start = self.len_consumed();
-                    if raw_str_i.n_end_hashes == raw_str_i.n_start_hashes {
+                    if err.is_none() {
                        self.eat_literal_suffix();
                    }
-                    let kind = RawStr(raw_str_i);
+                    let kind = RawStr { n_hashes, err };
                    Literal { kind, suffix_start }
                }
                _ => self.ident(),
@ -389,14 +329,12 @@ impl Cursor<'_> {
                }
                ('r', '"') | ('r', '#') => {
                    self.bump();
-                    let raw_str_i = self.raw_double_quoted_string(2);
+                    let (n_hashes, err) = self.raw_double_quoted_string(2);
                    let suffix_start = self.len_consumed();
-                    let terminated = raw_str_i.n_start_hashes == raw_str_i.n_end_hashes;
-                    if terminated {
+                    if err.is_none() {
                        self.eat_literal_suffix();
                    }
-
-                    let kind = RawByteStr(raw_str_i);
+                    let kind = RawByteStr { n_hashes, err };
                    Literal { kind, suffix_start }
                }
                _ => self.ident(),
@ -692,27 +630,34 @@ impl Cursor<'_> {
        false
    }

-    /// Eats the double-quoted string and returns an `UnvalidatedRawStr`.
-    fn raw_double_quoted_string(&mut self, prefix_len: usize) -> UnvalidatedRawStr {
+    /// Eats the double-quoted string and returns `n_hashes` and an error if encountered.
+    fn raw_double_quoted_string(&mut self, prefix_len: usize) -> (u16, Option<RawStrError>) {
+        // Wrap the actual function to handle the error with too many hashes.
+        // This way, it eats the whole raw string.
+        let (n_hashes, err) = self.raw_string_unvalidated(prefix_len);
+        // Only up to 65535 `#`s are allowed in raw strings
+        match u16::try_from(n_hashes) {
+            Ok(num) => (num, err),
+            // We lie about the number of hashes here :P
+            Err(_) => (0, Some(RawStrError::TooManyDelimiters { found: n_hashes })),
+        }
+    }
+
+    fn raw_string_unvalidated(&mut self, prefix_len: usize) -> (usize, Option<RawStrError>) {
        debug_assert!(self.prev() == 'r');
-        let mut valid_start: bool = false;
        let start_pos = self.len_consumed();
-        let (mut possible_terminator_offset, mut max_hashes) = (None, 0);
+        let mut possible_terminator_offset = None;
+        let mut max_hashes = 0;

        // Count opening '#' symbols.
        let n_start_hashes = self.eat_while(|c| c == '#');

        // Check that string is started.
        match self.bump() {
-            Some('"') => valid_start = true,
-            _ => {
-                return UnvalidatedRawStr {
-                    valid_start,
-                    valid_end: false,
-                    n_start_hashes,
-                    n_end_hashes: 0,
-                    possible_terminator_offset,
-                };
+            Some('"') => (),
+            c => {
+                let c = c.unwrap_or(EOF_CHAR);
+                return (n_start_hashes, Some(RawStrError::InvalidStarter { bad_char: c }));
            }
        }

@ -722,13 +667,14 @@ impl Cursor<'_> {
            self.eat_while(|c| c != '"');

            if self.is_eof() {
-                return UnvalidatedRawStr {
-                    valid_start,
-                    valid_end: false,
+                return (
                    n_start_hashes,
-                    n_end_hashes: max_hashes,
-                    possible_terminator_offset,
-                };
+                    Some(RawStrError::NoTerminator {
+                        expected: n_start_hashes,
+                        found: max_hashes,
+                        possible_terminator_offset,
+                    }),
+                );
            }

            // Eat closing double quote.
@ -737,7 +683,7 @@ impl Cursor<'_> {
            // Check that amount of closing '#' symbols
            // is equal to the amount of opening ones.
            // Note that this will not consume extra trailing `#` characters:
-            // `r###"abcde"####` is lexed as a `LexedRawString { n_hashes: 3 }`
+            // `r###"abcde"####` is lexed as a `RawStr { n_hashes: 3 }`
            // followed by a `#` token.
            let mut hashes_left = n_start_hashes;
            let is_closing_hash = |c| {
@ -751,13 +697,7 @@ impl Cursor<'_> {
            let n_end_hashes = self.eat_while(is_closing_hash);

            if n_end_hashes == n_start_hashes {
-                return UnvalidatedRawStr {
-                    valid_start,
-                    valid_end: true,
-                    n_start_hashes,
-                    n_end_hashes,
-                    possible_terminator_offset: None,
-                };
+                return (n_start_hashes, None);
            } else if n_end_hashes > max_hashes {
                // Keep track of possible terminators to give a hint about
                // where there might be a missing terminator
--- a/src/librustc_lexer/src/tests.rs
+++ b/src/librustc_lexer/src/tests.rs
@ -2,77 +2,37 @@
 mod tests {
    use crate::*;

-    fn check_raw_str(
-        s: &str,
-        expected: UnvalidatedRawStr,
-        validated: Result<ValidatedRawStr, LexRawStrError>,
-    ) {
+    fn check_raw_str(s: &str, expected_hashes: u16, expected_err: Option<RawStrError>) {
        let s = &format!("r{}", s);
        let mut cursor = Cursor::new(s);
        cursor.bump();
-        let tok = cursor.raw_double_quoted_string(0);
-        assert_eq!(tok, expected);
-        assert_eq!(tok.validate(), validated);
+        let (n_hashes, err) = cursor.raw_double_quoted_string(0);
+        assert_eq!(n_hashes, expected_hashes);
+        assert_eq!(err, expected_err);
    }

    #[test]
    fn test_naked_raw_str() {
-        check_raw_str(
-            r#""abc""#,
-            UnvalidatedRawStr {
-                n_start_hashes: 0,
-                n_end_hashes: 0,
-                valid_start: true,
-                valid_end: true,
-                possible_terminator_offset: None,
-            },
-            Ok(ValidatedRawStr { n_hashes: 0 }),
-        );
+        check_raw_str(r#""abc""#, 0, None);
    }

    #[test]
    fn test_raw_no_start() {
-        check_raw_str(
-            r##""abc"#"##,
-            UnvalidatedRawStr {
-                n_start_hashes: 0,
-                n_end_hashes: 0,
-                valid_start: true,
-                valid_end: true,
-                possible_terminator_offset: None,
-            },
-            Ok(ValidatedRawStr { n_hashes: 0 }),
-        );
+        check_raw_str(r##""abc"#"##, 0, None);
    }

    #[test]
    fn test_too_many_terminators() {
        // this error is handled in the parser later
-        check_raw_str(
-            r###"#"abc"##"###,
-            UnvalidatedRawStr {
-                n_start_hashes: 1,
-                n_end_hashes: 1,
-                valid_end: true,
-                valid_start: true,
-                possible_terminator_offset: None,
-            },
-            Ok(ValidatedRawStr { n_hashes: 1 }),
-        );
+        check_raw_str(r###"#"abc"##"###, 1, None);
    }

    #[test]
    fn test_unterminated() {
        check_raw_str(
            r#"#"abc"#,
-            UnvalidatedRawStr {
-                n_start_hashes: 1,
-                n_end_hashes: 0,
-                valid_end: false,
-                valid_start: true,
-                possible_terminator_offset: None,
-            },
-            Err(LexRawStrError::NoTerminator {
+            1,
+            Some(RawStrError::NoTerminator {
                expected: 1,
                found: 0,
                possible_terminator_offset: None,
@ -80,14 +40,8 @@ mod tests {
        );
        check_raw_str(
            r###"##"abc"#"###,
-            UnvalidatedRawStr {
-                n_start_hashes: 2,
-                n_end_hashes: 1,
-                valid_start: true,
-                valid_end: false,
-                possible_terminator_offset: Some(7),
-            },
-            Err(LexRawStrError::NoTerminator {
+            2,
+            Some(RawStrError::NoTerminator {
                expected: 2,
                found: 1,
                possible_terminator_offset: Some(7),
@ -96,14 +50,8 @@ mod tests {
        // We're looking for "# not just any #
        check_raw_str(
            r###"##"abc#"###,
-            UnvalidatedRawStr {
-                n_start_hashes: 2,
-                n_end_hashes: 0,
-                valid_start: true,
-                valid_end: false,
-                possible_terminator_offset: None,
-            },
-            Err(LexRawStrError::NoTerminator {
+            2,
+            Some(RawStrError::NoTerminator {
                expected: 2,
                found: 0,
                possible_terminator_offset: None,
@ -113,17 +61,7 @@ mod tests {

    #[test]
    fn test_invalid_start() {
-        check_raw_str(
-            r##"#~"abc"#"##,
-            UnvalidatedRawStr {
-                n_start_hashes: 1,
-                n_end_hashes: 0,
-                valid_start: false,
-                valid_end: false,
-                possible_terminator_offset: None,
-            },
-            Err(LexRawStrError::InvalidStarter),
-        );
+        check_raw_str(r##"#~"abc"#"##, 1, Some(RawStrError::InvalidStarter { bad_char: '~' }));
    }

    #[test]
@ -131,14 +69,8 @@ mod tests {
        // https://github.com/rust-lang/rust/issues/70677
        check_raw_str(
            r#"""#,
-            UnvalidatedRawStr {
-                n_start_hashes: 0,
-                n_end_hashes: 0,
-                valid_start: true,
-                valid_end: false,
-                possible_terminator_offset: None,
-            },
-            Err(LexRawStrError::NoTerminator {
+            0,
+            Some(RawStrError::NoTerminator {
                expected: 0,
                found: 0,
                possible_terminator_offset: None,
--- a/src/librustc_parse/lexer/mod.rs
+++ b/src/librustc_parse/lexer/mod.rs
@ -3,7 +3,7 @@ use rustc_ast::util::comments;
 use rustc_data_structures::sync::Lrc;
 use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError};
 use rustc_lexer::Base;
-use rustc_lexer::{unescape, LexRawStrError, UnvalidatedRawStr, ValidatedRawStr};
+use rustc_lexer::{unescape, RawStrError};
 use rustc_session::parse::ParseSess;
 use rustc_span::symbol::{sym, Symbol};
 use rustc_span::{BytePos, Pos, Span};
@ -49,13 +49,12 @@ impl<'a> StringReader<'a> {
        // Make sure external source is loaded first, before accessing it.
        // While this can't show up during normal parsing, `retokenize` may
        // be called with a source file from an external crate.
-        sess.source_map().ensure_source_file_source_present(source_file.clone());
+        sess.source_map().ensure_source_file_source_present(Lrc::clone(&source_file));

-        // FIXME(eddyb) use `Lrc<str>` or similar to avoid cloning the `String`.
        let src = if let Some(src) = &source_file.src {
-            src.clone()
+            Lrc::clone(&src)
        } else if let Some(src) = source_file.external_src.borrow().get_source() {
-            src.clone()
+            Lrc::clone(&src)
        } else {
            sess.span_diagnostic
                .bug(&format!("cannot lex `source_file` without source: {}", source_file.name));
@ -125,10 +124,7 @@ impl<'a> StringReader<'a> {

        debug!("try_next_token: {:?}({:?})", token.kind, self.str_from(start));

-        // This could use `?`, but that makes code significantly (10-20%) slower.
-        // https://github.com/rust-lang/rust/issues/37939
        let kind = self.cook_lexer_token(token.kind, start);
-
        let span = self.mk_sp(start, self.pos);
        Token::new(kind, span)
    }
@ -153,15 +149,6 @@ impl<'a> StringReader<'a> {
        self.err_span(self.mk_sp(from_pos, to_pos), m)
    }

-    fn struct_span_fatal(
-        &self,
-        from_pos: BytePos,
-        to_pos: BytePos,
-        m: &str,
-    ) -> DiagnosticBuilder<'a> {
-        self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), m)
-    }
-
    fn struct_fatal_span_char(
        &self,
        from_pos: BytePos,
@ -359,15 +346,13 @@ impl<'a> StringReader<'a> {
                }
                (token::ByteStr, Mode::ByteStr, 2, 1) // b" "
            }
-            rustc_lexer::LiteralKind::RawStr(unvalidated_raw_str) => {
-                let valid_raw_str = self.validate_and_report_errors(start, unvalidated_raw_str);
-                let n_hashes = valid_raw_str.num_hashes();
+            rustc_lexer::LiteralKind::RawStr { n_hashes, err } => {
+                self.report_raw_str_error(start, err);
                let n = u32::from(n_hashes);
                (token::StrRaw(n_hashes), Mode::RawStr, 2 + n, 1 + n) // r##" "##
            }
-            rustc_lexer::LiteralKind::RawByteStr(unvalidated_raw_str) => {
-                let validated_raw_str = self.validate_and_report_errors(start, unvalidated_raw_str);
-                let n_hashes = validated_raw_str.num_hashes();
+            rustc_lexer::LiteralKind::RawByteStr { n_hashes, err } => {
+                self.report_raw_str_error(start, err);
                let n = u32::from(n_hashes);
                (token::ByteStrRaw(n_hashes), Mode::RawByteStr, 3 + n, 1 + n) // br##" "##
            }
@ -382,12 +367,7 @@ impl<'a> StringReader<'a> {
            }
            rustc_lexer::LiteralKind::Float { base, empty_exponent } => {
                if empty_exponent {
-                    let mut err = self.struct_span_fatal(
-                        start,
-                        self.pos,
-                        "expected at least one digit in exponent",
-                    );
-                    err.emit();
+                    self.err_span_(start, self.pos, "expected at least one digit in exponent");
                }

                match base {
@ -459,33 +439,25 @@ impl<'a> StringReader<'a> {
        }
    }

-    fn validate_and_report_errors(
-        &self,
-        start: BytePos,
-        unvalidated_raw_str: UnvalidatedRawStr,
-    ) -> ValidatedRawStr {
-        match unvalidated_raw_str.validate() {
-            Err(LexRawStrError::InvalidStarter) => self.report_non_started_raw_string(start),
-            Err(LexRawStrError::NoTerminator { expected, found, possible_terminator_offset }) => {
-                self.report_unterminated_raw_string(
-                    start,
-                    expected,
-                    possible_terminator_offset,
-                    found,
-                )
+    fn report_raw_str_error(&self, start: BytePos, opt_err: Option<RawStrError>) {
+        match opt_err {
+            Some(RawStrError::InvalidStarter { bad_char }) => {
+                self.report_non_started_raw_string(start, bad_char)
            }
-            Err(LexRawStrError::TooManyDelimiters) => self.report_too_many_hashes(start),
-            Ok(valid) => valid,
+            Some(RawStrError::NoTerminator { expected, found, possible_terminator_offset }) => self
+                .report_unterminated_raw_string(start, expected, possible_terminator_offset, found),
+            Some(RawStrError::TooManyDelimiters { found }) => {
+                self.report_too_many_hashes(start, found)
+            }
+            None => (),
        }
    }

-    fn report_non_started_raw_string(&self, start: BytePos) -> ! {
-        let bad_char = self.str_from(start).chars().last().unwrap();
+    fn report_non_started_raw_string(&self, start: BytePos, bad_char: char) -> ! {
        self.struct_fatal_span_char(
            start,
            self.pos,
-            "found invalid character; only `#` is allowed \
-                 in raw string delimitation",
+            "found invalid character; only `#` is allowed in raw string delimitation",
            bad_char,
        )
        .emit();
@ -530,11 +502,17 @@ impl<'a> StringReader<'a> {
        FatalError.raise()
    }

-    fn report_too_many_hashes(&self, start: BytePos) -> ! {
+    /// Note: It was decided to not add a test case, because it would be too big.
+    /// https://github.com/rust-lang/rust/pull/50296#issuecomment-392135180
+    fn report_too_many_hashes(&self, start: BytePos, found: usize) -> ! {
        self.fatal_span_(
            start,
            self.pos,
-            "too many `#` symbols: raw strings may be delimited by up to 65535 `#` symbols",
+            &format!(
+                "too many `#` symbols: raw strings may be delimited \
+                by up to 65535 `#` symbols, but found {}",
+                found
+            ),
        )
        .raise();
    }