Rollup merge of #62963 - estebank:homoglyph-recovery, r=petrochenkov

Allow lexer to recover from some homoglyphs
2019-07-26 18:56:53 +02:00 · 2019-07-26 18:56:53 +02:00 · 1893ac6db3
commit 1893ac6db3
parent c6c8693b4c 684497648a
6 changed files with 89 additions and 36 deletions
--- a/src/libsyntax/parse/lexer/mod.rs
+++ b/src/libsyntax/parse/lexer/mod.rs
@ -389,8 +389,18 @@ impl<'a> StringReader<'a> {
                                                          self.pos,
                                                          "unknown start of token",
                                                          c);
-                unicode_chars::check_for_substitution(self, start, c, &mut err);
+                // FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs
-                return Err(err)
+                // into the token, instead of keeping a table in `check_for_substitution`. Ideally,
                // this should be inside `rustc_lexer`. However, we should first remove compound
                // tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
                // as there will be less overall work to do this way.
                return match unicode_chars::check_for_substitution(self, start, c, &mut err) {
                    Some(token) => {
                        err.emit();
                        Ok(token)
                    }
                    None => Err(err),
                }
            }
        };
        Ok(kind)
--- a/src/libsyntax/parse/lexer/unicode_chars.rs
+++ b/src/libsyntax/parse/lexer/unicode_chars.rs
@ -3,7 +3,8 @@
 use super::StringReader;
 use errors::{Applicability, DiagnosticBuilder};
-use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION};
+use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION, symbol::kw};
 use crate::parse::token;
 #[rustfmt::skip] // for line breaks
 const UNICODE_ARRAY: &[(char, &str, char)] = &[
@ -297,32 +298,38 @@ const UNICODE_ARRAY: &[(char, &str, char)] = &[
    ('＞', "Fullwidth Greater-Than Sign", '>'),
 ];
-const ASCII_ARRAY: &[(char, &str)] = &[
+// FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs into tokens,
-    (' ', "Space"),
+// instead of keeping the substitution token in this table. Ideally, this should be inside
-    ('_', "Underscore"),
+// `rustc_lexer`. However, we should first remove compound tokens like `<<` from `rustc_lexer`,
-    ('-', "Minus/Hyphen"),
+// and then add fancier error recovery to it, as there will be less overall work to do this way.
-    (',', "Comma"),
+const ASCII_ARRAY: &[(char, &str, Option<token::TokenKind>)] = &[
-    (';', "Semicolon"),
+    (' ', "Space", Some(token::Whitespace)),
-    (':', "Colon"),
+    ('_', "Underscore", Some(token::Ident(kw::Underscore, false))),
-    ('!', "Exclamation Mark"),
+    ('-', "Minus/Hyphen", Some(token::BinOp(token::Minus))),
-    ('?', "Question Mark"),
+    (',', "Comma", Some(token::Comma)),
-    ('.', "Period"),
+    (';', "Semicolon", Some(token::Semi)),
-    ('\'', "Single Quote"),
+    (':', "Colon", Some(token::Colon)),
-    ('"', "Quotation Mark"),
+    ('!', "Exclamation Mark", Some(token::Not)),
-    ('(', "Left Parenthesis"),
+    ('?', "Question Mark", Some(token::Question)),
-    (')', "Right Parenthesis"),
+    ('.', "Period", Some(token::Dot)),
-    ('[', "Left Square Bracket"),
+    ('(', "Left Parenthesis", Some(token::OpenDelim(token::Paren))),
-    (']', "Right Square Bracket"),
+    (')', "Right Parenthesis", Some(token::CloseDelim(token::Paren))),
-    ('{', "Left Curly Brace"),
+    ('[', "Left Square Bracket", Some(token::OpenDelim(token::Bracket))),
-    ('}', "Right Curly Brace"),
+    (']', "Right Square Bracket", Some(token::CloseDelim(token::Bracket))),
-    ('*', "Asterisk"),
+    ('{', "Left Curly Brace", Some(token::OpenDelim(token::Brace))),
-    ('/', "Slash"),
+    ('}', "Right Curly Brace", Some(token::CloseDelim(token::Brace))),
-    ('\\', "Backslash"),
+    ('*', "Asterisk", Some(token::BinOp(token::Star))),
-    ('&', "Ampersand"),
+    ('/', "Slash", Some(token::BinOp(token::Slash))),
-    ('+', "Plus Sign"),
+    ('\\', "Backslash", None),
-    ('<', "Less-Than Sign"),
+    ('&', "Ampersand", Some(token::BinOp(token::And))),
-    ('=', "Equals Sign"),
+    ('+', "Plus Sign", Some(token::BinOp(token::Plus))),
-    ('>', "Greater-Than Sign"),
+    ('<', "Less-Than Sign", Some(token::Lt)),
    ('=', "Equals Sign", Some(token::Eq)),
    ('>', "Greater-Than Sign", Some(token::Gt)),
    // FIXME: Literals are already lexed by this point, so we can't recover gracefully just by
    // spitting the correct token out.
    ('\'', "Single Quote", None),
    ('"', "Quotation Mark", None),
 ];
 crate fn check_for_substitution<'a>(
@ -330,20 +337,20 @@ crate fn check_for_substitution<'a>(
    pos: BytePos,
    ch: char,
    err: &mut DiagnosticBuilder<'a>,
-) -> bool {
+) -> Option<token::TokenKind> {
    let (u_name, ascii_char) = match UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch) {
        Some(&(_u_char, u_name, ascii_char)) => (u_name, ascii_char),
-        None => return false,
+        None => return None,
    };
    let span = Span::new(pos, pos + Pos::from_usize(ch.len_utf8()), NO_EXPANSION);
-    let ascii_name = match ASCII_ARRAY.iter().find(|&&(c, _)| c == ascii_char) {
+    let (ascii_name, token) = match ASCII_ARRAY.iter().find(|&&(c, _, _)| c == ascii_char) {
-        Some((_ascii_char, ascii_name)) => ascii_name,
+        Some((_ascii_char, ascii_name, token)) => (ascii_name, token),
        None => {
            let msg = format!("substitution character not found for '{}'", ch);
            reader.sess.span_diagnostic.span_bug_no_panic(span, &msg);
-            return false;
+            return None;
        }
    };
@ -371,7 +378,7 @@ crate fn check_for_substitution<'a>(
        );
        err.span_suggestion(span, &msg, ascii_char.to_string(), Applicability::MaybeIncorrect);
    }
-    true
+    token.clone()
 }
 /// Extract string if found at current position with given delimiters
--- a/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.rs
+++ b/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.rs
@ -1,5 +1,6 @@
 const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻²
 //~^ ERROR expected at least one digit in exponent
 //~| ERROR unknown start of token: \u{2212}
 //~| ERROR cannot subtract `{integer}` from `{float}`
 fn main() {}
--- a/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.stderr
+++ b/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.stderr
@ -14,5 +14,14 @@ help: Unicode character '−' (Minus Sign) looks like '-' (Minus/Hyphen), but it
 LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e-11; // m³⋅kg⁻¹⋅s⁻²
   |                                                     ^
-error: aborting due to 2 previous errors
+error[E0277]: cannot subtract `{integer}` from `{float}`
  --> $DIR/issue-49746-unicode-confusable-in-float-literal-expt.rs:1:53
   |
 LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻²
   |                                                     ^ no implementation for `{float} - {integer}`
   |
   = help: the trait `std::ops::Sub<{integer}>` is not implemented for `{float}`
 error: aborting due to 3 previous errors
 For more information about this error, try `rustc --explain E0277`.
--- a/src/test/ui/parser/recover-from-homoglyph.rs
+++ b/src/test/ui/parser/recover-from-homoglyph.rs
@ -0,0 +1,4 @@
 fn main() {
    println!(""); //~ ERROR unknown start of token: \u{37e}
    let x: usize = (); //~ ERROR mismatched types
 }
--- a/src/test/ui/parser/recover-from-homoglyph.stderr
+++ b/src/test/ui/parser/recover-from-homoglyph.stderr
@ -0,0 +1,22 @@
 error: unknown start of token: \u{37e}
  --> $DIR/recover-from-homoglyph.rs:2:17
   |
 LL |     println!("");
   |                 ^
 help: Unicode character ';' (Greek Question Mark) looks like ';' (Semicolon), but it is not
   |
 LL |     println!("");
   |                 ^
 error[E0308]: mismatched types
  --> $DIR/recover-from-homoglyph.rs:3:20
   |
 LL |     let x: usize = ();
   |                    ^^ expected usize, found ()
   |
   = note: expected type `usize`
              found type `()`
 error: aborting due to 2 previous errors
 For more information about this error, try `rustc --explain E0308`.