Simplify raw string error reporting.

This makes `UnvalidatedRawStr` and `ValidatedRawStr` unnecessary and removes 70 lines.
2020-05-29 17:37:16 +02:00 · 2020-05-29 17:37:16 +02:00 · 5fbbfbbfa9
commit 5fbbfbbfa9
parent b85e3fe010
3 changed files with 85 additions and 216 deletions
--- a/src/librustc_lexer/src/lib.rs
+++ b/src/librustc_lexer/src/lib.rs
@ -29,7 +29,7 @@ mod tests;
 use self::LiteralKind::*;
 use self::TokenKind::*;
 use crate::cursor::{Cursor, EOF_CHAR};
-use std::convert::TryInto;
+use std::convert::TryFrom;

 /// Parsed token.
 /// It doesn't contain information about data that has been parsed,
@ -142,84 +142,24 @@ pub enum LiteralKind {
    /// "b"abc"", "b"abc"
    ByteStr { terminated: bool },
    /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
-    RawStr(UnvalidatedRawStr),
+    RawStr { n_hashes: u16, err: Option<RawStrError> },
    /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
-    RawByteStr(UnvalidatedRawStr),
-}
-
-/// Represents something that looks like a raw string, but may have some
-/// problems. Use `.validate()` to convert it into something
-/// usable.
-#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
-pub struct UnvalidatedRawStr {
-    /// The prefix (`r###"`) is valid
-    valid_start: bool,
-
-    /// The postfix (`"###`) is valid
-    valid_end: bool,
-
-    /// The number of leading `#`
-    n_start_hashes: usize,
-    /// The number of trailing `#`. `n_end_hashes` <= `n_start_hashes`
-    n_end_hashes: usize,
-    /// The offset starting at `r` or `br` where the user may have intended to end the string.
-    /// Currently, it is the longest sequence of pattern `"#+"`.
-    possible_terminator_offset: Option<usize>,
+    RawByteStr { n_hashes: u16, err: Option<RawStrError> },
 }

 /// Error produced validating a raw string. Represents cases like:
-/// - `r##~"abcde"##`: `LexRawStrError::InvalidStarter`
-/// - `r###"abcde"##`: `LexRawStrError::NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11)`
-/// - Too many `#`s (>65536): `TooManyDelimiters`
+/// - `r##~"abcde"##`: `InvalidStarter`
+/// - `r###"abcde"##`: `NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11) }`
+/// - Too many `#`s (>65535): `TooManyDelimiters`
 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
-pub enum LexRawStrError {
+pub enum RawStrError {
    /// Non `#` characters exist between `r` and `"` eg. `r#~"..`
-    InvalidStarter,
+    InvalidStarter { bad_char: char },
    /// The string was never terminated. `possible_terminator_offset` is the number of characters after `r` or `br` where they
    /// may have intended to terminate it.
    NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
-    /// More than 65536 `#`s exist.
-    TooManyDelimiters,
-}
-
-/// Raw String that contains a valid prefix (`#+"`) and postfix (`"#+`) where
-/// there are a matching number of `#` characters in both. Note that this will
-/// not consume extra trailing `#` characters: `r###"abcde"####` is lexed as a
-/// `ValidatedRawString { n_hashes: 3 }` followed by a `#` token.
-#[derive(Debug, Eq, PartialEq, Copy, Clone)]
-pub struct ValidatedRawStr {
-    n_hashes: u16,
-}
-
-impl ValidatedRawStr {
-    pub fn num_hashes(&self) -> u16 {
-        self.n_hashes
-    }
-}
-
-impl UnvalidatedRawStr {
-    pub fn validate(self) -> Result<ValidatedRawStr, LexRawStrError> {
-        if !self.valid_start {
-            return Err(LexRawStrError::InvalidStarter);
-        }
-
-        // Only up to 65535 `#`s are allowed in raw strings
-        let n_start_safe: u16 =
-            self.n_start_hashes.try_into().map_err(|_| LexRawStrError::TooManyDelimiters)?;
-
-        if self.n_start_hashes > self.n_end_hashes || !self.valid_end {
-            Err(LexRawStrError::NoTerminator {
-                expected: self.n_start_hashes,
-                found: self.n_end_hashes,
-                possible_terminator_offset: self.possible_terminator_offset,
-            })
-        } else {
-            // Since the lexer should never produce a literal with n_end > n_start, if n_start <= n_end,
-            // they must be equal.
-            debug_assert_eq!(self.n_start_hashes, self.n_end_hashes);
-            Ok(ValidatedRawStr { n_hashes: n_start_safe })
-        }
-    }
+    /// More than 65535 `#`s exist.
+    TooManyDelimiters { found: usize },
 }

 /// Base of numeric literal encoding according to its prefix.
@ -354,12 +294,12 @@ impl Cursor<'_> {
            'r' => match (self.first(), self.second()) {
                ('#', c1) if is_id_start(c1) => self.raw_ident(),
                ('#', _) | ('"', _) => {
-                    let raw_str_i = self.raw_double_quoted_string(1);
+                    let (n_hashes, err) = self.raw_double_quoted_string(1);
                    let suffix_start = self.len_consumed();
-                    if raw_str_i.n_end_hashes == raw_str_i.n_start_hashes {
+                    if err.is_none() {
                        self.eat_literal_suffix();
                    }
-                    let kind = RawStr(raw_str_i);
+                    let kind = RawStr { n_hashes, err };
                    Literal { kind, suffix_start }
                }
                _ => self.ident(),
@ -389,14 +329,12 @@ impl Cursor<'_> {
                }
                ('r', '"') | ('r', '#') => {
                    self.bump();
-                    let raw_str_i = self.raw_double_quoted_string(2);
+                    let (n_hashes, err) = self.raw_double_quoted_string(2);
                    let suffix_start = self.len_consumed();
-                    let terminated = raw_str_i.n_start_hashes == raw_str_i.n_end_hashes;
-                    if terminated {
+                    if err.is_none() {
                        self.eat_literal_suffix();
                    }
-
-                    let kind = RawByteStr(raw_str_i);
+                    let kind = RawByteStr { n_hashes, err };
                    Literal { kind, suffix_start }
                }
                _ => self.ident(),
@ -692,27 +630,34 @@ impl Cursor<'_> {
        false
    }

-    /// Eats the double-quoted string and returns an `UnvalidatedRawStr`.
-    fn raw_double_quoted_string(&mut self, prefix_len: usize) -> UnvalidatedRawStr {
+    /// Eats the double-quoted string and returns `n_hashes` and an error if encountered.
+    fn raw_double_quoted_string(&mut self, prefix_len: usize) -> (u16, Option<RawStrError>) {
+        // Wrap the actual function to handle the error with too many hashes.
+        // This way, it eats the whole raw string.
+        let (n_hashes, err) = self.raw_string_unvalidated(prefix_len);
+        // Only up to 65535 `#`s are allowed in raw strings
+        match u16::try_from(n_hashes) {
+            Ok(num) => (num, err),
+            // The count doesn't fit in `u16`: report 0 and carry the real count in the error.
+            Err(_) => (0, Some(RawStrError::TooManyDelimiters { found: n_hashes })),
+        }
+    }
+
+    fn raw_string_unvalidated(&mut self, prefix_len: usize) -> (usize, Option<RawStrError>) {
        debug_assert!(self.prev() == 'r');
-        let mut valid_start: bool = false;
        let start_pos = self.len_consumed();
-        let (mut possible_terminator_offset, mut max_hashes) = (None, 0);
+        let mut possible_terminator_offset = None;
+        let mut max_hashes = 0;

        // Count opening '#' symbols.
        let n_start_hashes = self.eat_while(|c| c == '#');

        // Check that string is started.
        match self.bump() {
-            Some('"') => valid_start = true,
-            _ => {
-                return UnvalidatedRawStr {
-                    valid_start,
-                    valid_end: false,
-                    n_start_hashes,
-                    n_end_hashes: 0,
-                    possible_terminator_offset,
-                };
+            Some('"') => (),
+            c => {
+                let c = c.unwrap_or(EOF_CHAR);
+                return (n_start_hashes, Some(RawStrError::InvalidStarter { bad_char: c }));
            }
        }

@ -722,13 +667,14 @@ impl Cursor<'_> {
            self.eat_while(|c| c != '"');

            if self.is_eof() {
-                return UnvalidatedRawStr {
-                    valid_start,
-                    valid_end: false,
+                return (
                    n_start_hashes,
-                    n_end_hashes: max_hashes,
-                    possible_terminator_offset,
-                };
+                    Some(RawStrError::NoTerminator {
+                        expected: n_start_hashes,
+                        found: max_hashes,
+                        possible_terminator_offset,
+                    }),
+                );
            }

            // Eat closing double quote.
@ -737,7 +683,7 @@ impl Cursor<'_> {
            // Check that amount of closing '#' symbols
            // is equal to the amount of opening ones.
            // Note that this will not consume extra trailing `#` characters:
-            // `r###"abcde"####` is lexed as a `LexedRawString { n_hashes: 3 }`
+            // `r###"abcde"####` is lexed as a `RawStr { n_hashes: 3 }`
            // followed by a `#` token.
            let mut hashes_left = n_start_hashes;
            let is_closing_hash = |c| {
@ -751,13 +697,7 @@ impl Cursor<'_> {
            let n_end_hashes = self.eat_while(is_closing_hash);

            if n_end_hashes == n_start_hashes {
-                return UnvalidatedRawStr {
-                    valid_start,
-                    valid_end: true,
-                    n_start_hashes,
-                    n_end_hashes,
-                    possible_terminator_offset: None,
-                };
+                return (n_start_hashes, None);
            } else if n_end_hashes > max_hashes {
                // Keep track of possible terminators to give a hint about
                // where there might be a missing terminator
--- a/src/librustc_lexer/src/tests.rs
+++ b/src/librustc_lexer/src/tests.rs
@ -2,77 +2,37 @@
 mod tests {
    use crate::*;

-    fn check_raw_str(
-        s: &str,
-        expected: UnvalidatedRawStr,
-        validated: Result<ValidatedRawStr, LexRawStrError>,
-    ) {
+    fn check_raw_str(s: &str, expected_hashes: u16, expected_err: Option<RawStrError>) {
        let s = &format!("r{}", s);
        let mut cursor = Cursor::new(s);
        cursor.bump();
-        let tok = cursor.raw_double_quoted_string(0);
-        assert_eq!(tok, expected);
-        assert_eq!(tok.validate(), validated);
+        let (n_hashes, err) = cursor.raw_double_quoted_string(0);
+        assert_eq!(n_hashes, expected_hashes);
+        assert_eq!(err, expected_err);
    }

    #[test]
    fn test_naked_raw_str() {
-        check_raw_str(
-            r#""abc""#,
-            UnvalidatedRawStr {
-                n_start_hashes: 0,
-                n_end_hashes: 0,
-                valid_start: true,
-                valid_end: true,
-                possible_terminator_offset: None,
-            },
-            Ok(ValidatedRawStr { n_hashes: 0 }),
-        );
+        check_raw_str(r#""abc""#, 0, None);
    }

    #[test]
    fn test_raw_no_start() {
-        check_raw_str(
-            r##""abc"#"##,
-            UnvalidatedRawStr {
-                n_start_hashes: 0,
-                n_end_hashes: 0,
-                valid_start: true,
-                valid_end: true,
-                possible_terminator_offset: None,
-            },
-            Ok(ValidatedRawStr { n_hashes: 0 }),
-        );
+        check_raw_str(r##""abc"#"##, 0, None);
    }

    #[test]
    fn test_too_many_terminators() {
        // this error is handled in the parser later
-        check_raw_str(
-            r###"#"abc"##"###,
-            UnvalidatedRawStr {
-                n_start_hashes: 1,
-                n_end_hashes: 1,
-                valid_end: true,
-                valid_start: true,
-                possible_terminator_offset: None,
-            },
-            Ok(ValidatedRawStr { n_hashes: 1 }),
-        );
+        check_raw_str(r###"#"abc"##"###, 1, None);
    }

    #[test]
    fn test_unterminated() {
        check_raw_str(
            r#"#"abc"#,
-            UnvalidatedRawStr {
-                n_start_hashes: 1,
-                n_end_hashes: 0,
-                valid_end: false,
-                valid_start: true,
-                possible_terminator_offset: None,
-            },
-            Err(LexRawStrError::NoTerminator {
+            1,
+            Some(RawStrError::NoTerminator {
                expected: 1,
                found: 0,
                possible_terminator_offset: None,
@ -80,14 +40,8 @@ mod tests {
        );
        check_raw_str(
            r###"##"abc"#"###,
-            UnvalidatedRawStr {
-                n_start_hashes: 2,
-                n_end_hashes: 1,
-                valid_start: true,
-                valid_end: false,
-                possible_terminator_offset: Some(7),
-            },
-            Err(LexRawStrError::NoTerminator {
+            2,
+            Some(RawStrError::NoTerminator {
                expected: 2,
                found: 1,
                possible_terminator_offset: Some(7),
@ -96,14 +50,8 @@ mod tests {
        // We're looking for "# not just any #
        check_raw_str(
            r###"##"abc#"###,
-            UnvalidatedRawStr {
-                n_start_hashes: 2,
-                n_end_hashes: 0,
-                valid_start: true,
-                valid_end: false,
-                possible_terminator_offset: None,
-            },
-            Err(LexRawStrError::NoTerminator {
+            2,
+            Some(RawStrError::NoTerminator {
                expected: 2,
                found: 0,
                possible_terminator_offset: None,
@ -113,17 +61,7 @@ mod tests {

    #[test]
    fn test_invalid_start() {
-        check_raw_str(
-            r##"#~"abc"#"##,
-            UnvalidatedRawStr {
-                n_start_hashes: 1,
-                n_end_hashes: 0,
-                valid_start: false,
-                valid_end: false,
-                possible_terminator_offset: None,
-            },
-            Err(LexRawStrError::InvalidStarter),
-        );
+        check_raw_str(r##"#~"abc"#"##, 1, Some(RawStrError::InvalidStarter { bad_char: '~' }));
    }

    #[test]
@ -131,14 +69,8 @@ mod tests {
        // https://github.com/rust-lang/rust/issues/70677
        check_raw_str(
            r#"""#,
-            UnvalidatedRawStr {
-                n_start_hashes: 0,
-                n_end_hashes: 0,
-                valid_start: true,
-                valid_end: false,
-                possible_terminator_offset: None,
-            },
-            Err(LexRawStrError::NoTerminator {
+            0,
+            Some(RawStrError::NoTerminator {
                expected: 0,
                found: 0,
                possible_terminator_offset: None,
--- a/src/librustc_parse/lexer/mod.rs
+++ b/src/librustc_parse/lexer/mod.rs
@ -3,7 +3,7 @@ use rustc_ast::util::comments;
 use rustc_data_structures::sync::Lrc;
 use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError};
 use rustc_lexer::Base;
-use rustc_lexer::{unescape, LexRawStrError, UnvalidatedRawStr, ValidatedRawStr};
+use rustc_lexer::{unescape, RawStrError};
 use rustc_session::parse::ParseSess;
 use rustc_span::symbol::{sym, Symbol};
 use rustc_span::{BytePos, Pos, Span};
@ -359,15 +359,13 @@ impl<'a> StringReader<'a> {
                }
                (token::ByteStr, Mode::ByteStr, 2, 1) // b" "
            }
-            rustc_lexer::LiteralKind::RawStr(unvalidated_raw_str) => {
-                let valid_raw_str = self.validate_and_report_errors(start, unvalidated_raw_str);
-                let n_hashes = valid_raw_str.num_hashes();
+            rustc_lexer::LiteralKind::RawStr { n_hashes, err } => {
+                self.report_raw_str_error(start, err);
                let n = u32::from(n_hashes);
                (token::StrRaw(n_hashes), Mode::RawStr, 2 + n, 1 + n) // r##" "##
            }
-            rustc_lexer::LiteralKind::RawByteStr(unvalidated_raw_str) => {
-                let validated_raw_str = self.validate_and_report_errors(start, unvalidated_raw_str);
-                let n_hashes = validated_raw_str.num_hashes();
+            rustc_lexer::LiteralKind::RawByteStr { n_hashes, err } => {
+                self.report_raw_str_error(start, err);
                let n = u32::from(n_hashes);
                (token::ByteStrRaw(n_hashes), Mode::RawByteStr, 3 + n, 1 + n) // br##" "##
            }
@ -459,28 +457,21 @@ impl<'a> StringReader<'a> {
        }
    }

-    fn validate_and_report_errors(
-        &self,
-        start: BytePos,
-        unvalidated_raw_str: UnvalidatedRawStr,
-    ) -> ValidatedRawStr {
-        match unvalidated_raw_str.validate() {
-            Err(LexRawStrError::InvalidStarter) => self.report_non_started_raw_string(start),
-            Err(LexRawStrError::NoTerminator { expected, found, possible_terminator_offset }) => {
-                self.report_unterminated_raw_string(
-                    start,
-                    expected,
-                    possible_terminator_offset,
-                    found,
-                )
+    fn report_raw_str_error(&self, start: BytePos, opt_err: Option<RawStrError>) {
+        match opt_err {
+            Some(RawStrError::InvalidStarter { bad_char }) => {
+                self.report_non_started_raw_string(start, bad_char)
            }
-            Err(LexRawStrError::TooManyDelimiters) => self.report_too_many_hashes(start),
-            Ok(valid) => valid,
+            Some(RawStrError::NoTerminator { expected, found, possible_terminator_offset }) => self
+                .report_unterminated_raw_string(start, expected, possible_terminator_offset, found),
+            Some(RawStrError::TooManyDelimiters { found }) => {
+                self.report_too_many_hashes(start, found)
+            }
+            None => (),
        }
    }

-    fn report_non_started_raw_string(&self, start: BytePos) -> ! {
-        let bad_char = self.str_from(start).chars().last().unwrap();
+    fn report_non_started_raw_string(&self, start: BytePos, bad_char: char) -> ! {
        self.struct_fatal_span_char(
            start,
            self.pos,
@ -530,11 +521,17 @@ impl<'a> StringReader<'a> {
        FatalError.raise()
    }

-    fn report_too_many_hashes(&self, start: BytePos) -> ! {
+    /// Note: It was decided to not add a test case, because it would be too big.
+    /// https://github.com/rust-lang/rust/pull/50296#issuecomment-392135180
+    fn report_too_many_hashes(&self, start: BytePos, found: usize) -> ! {
        self.fatal_span_(
            start,
            self.pos,
-            "too many `#` symbols: raw strings may be delimited by up to 65535 `#` symbols",
+            &format!(
+                "too many `#` symbols: raw strings may be delimited \
+                by up to 65535 `#` symbols, but found {}",
+                found
+            ),
        )
        .raise();
    }