diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index b97801a50d4..52f65e1b474 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -389,8 +389,18 @@ impl<'a> StringReader<'a> { self.pos, "unknown start of token", c); - unicode_chars::check_for_substitution(self, start, c, &mut err); - return Err(err) + // FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs + // into the token, instead of keeping a table in `check_for_substitution`. Ideally, + // this should be inside `rustc_lexer`. However, we should first remove compound + // tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it, + // as there will be less overall work to do this way. + return match unicode_chars::check_for_substitution(self, start, c, &mut err) { + Some(token) => { + err.emit(); + Ok(token) + } + None => Err(err), + } } }; Ok(kind) diff --git a/src/libsyntax/parse/lexer/unicode_chars.rs b/src/libsyntax/parse/lexer/unicode_chars.rs index b728a9e1988..eaa736c6a35 100644 --- a/src/libsyntax/parse/lexer/unicode_chars.rs +++ b/src/libsyntax/parse/lexer/unicode_chars.rs @@ -3,7 +3,8 @@ use super::StringReader; use errors::{Applicability, DiagnosticBuilder}; -use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION}; +use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION, symbol::kw}; +use crate::parse::token; #[rustfmt::skip] // for line breaks const UNICODE_ARRAY: &[(char, &str, char)] = &[ @@ -297,32 +298,38 @@ const UNICODE_ARRAY: &[(char, &str, char)] = &[ ('＞', "Fullwidth Greater-Than Sign", '>'), ]; -const ASCII_ARRAY: &[(char, &str)] = &[ - (' ', "Space"), - ('_', "Underscore"), - ('-', "Minus/Hyphen"), - (',', "Comma"), - (';', "Semicolon"), - (':', "Colon"), - ('!', "Exclamation Mark"), - ('?', "Question Mark"), - ('.', "Period"), - ('\'', "Single Quote"), - ('"', "Quotation Mark"), - ('(', "Left Parenthesis"), - (')', "Right Parenthesis"), - ('[', "Left Square Bracket"), - (']', "Right 
Square Bracket"), - ('{', "Left Curly Brace"), - ('}', "Right Curly Brace"), - ('*', "Asterisk"), - ('/', "Slash"), - ('\\', "Backslash"), - ('&', "Ampersand"), - ('+', "Plus Sign"), - ('<', "Less-Than Sign"), - ('=', "Equals Sign"), - ('>', "Greater-Than Sign"), +// FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs into the +// token, instead of keeping the substitution token in this table. Ideally, this should be inside `rustc_lexer`. +// However, we should first remove compound tokens like `<<` from `rustc_lexer`, and then add +// fancier error recovery to it, as there will be less overall work to do this way. +const ASCII_ARRAY: &[(char, &str, Option<token::TokenKind>)] = &[ + (' ', "Space", Some(token::Whitespace)), + ('_', "Underscore", Some(token::Ident(kw::Underscore, false))), + ('-', "Minus/Hyphen", Some(token::BinOp(token::Minus))), + (',', "Comma", Some(token::Comma)), + (';', "Semicolon", Some(token::Semi)), + (':', "Colon", Some(token::Colon)), + ('!', "Exclamation Mark", Some(token::Not)), + ('?', "Question Mark", Some(token::Question)), + ('.', "Period", Some(token::Dot)), + ('(', "Left Parenthesis", Some(token::OpenDelim(token::Paren))), + (')', "Right Parenthesis", Some(token::CloseDelim(token::Paren))), + ('[', "Left Square Bracket", Some(token::OpenDelim(token::Bracket))), + (']', "Right Square Bracket", Some(token::CloseDelim(token::Bracket))), + ('{', "Left Curly Brace", Some(token::OpenDelim(token::Brace))), + ('}', "Right Curly Brace", Some(token::CloseDelim(token::Brace))), + ('*', "Asterisk", Some(token::BinOp(token::Star))), + ('/', "Slash", Some(token::BinOp(token::Slash))), + ('\\', "Backslash", None), + ('&', "Ampersand", Some(token::BinOp(token::And))), + ('+', "Plus Sign", Some(token::BinOp(token::Plus))), + ('<', "Less-Than Sign", Some(token::Lt)), + ('=', "Equals Sign", Some(token::Eq)), + ('>', "Greater-Than Sign", Some(token::Gt)), + // FIXME: Literals are already lexed by this point, so we can't recover gracefully just by + // 
spitting the correct token out. + ('\'', "Single Quote", None), + ('"', "Quotation Mark", None), ]; crate fn check_for_substitution<'a>( @@ -330,20 +337,20 @@ crate fn check_for_substitution<'a>( pos: BytePos, ch: char, err: &mut DiagnosticBuilder<'a>, -) -> bool { +) -> Option<token::TokenKind> { let (u_name, ascii_char) = match UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch) { Some(&(_u_char, u_name, ascii_char)) => (u_name, ascii_char), - None => return false, + None => return None, }; let span = Span::new(pos, pos + Pos::from_usize(ch.len_utf8()), NO_EXPANSION); - let ascii_name = match ASCII_ARRAY.iter().find(|&&(c, _)| c == ascii_char) { - Some((_ascii_char, ascii_name)) => ascii_name, + let (ascii_name, token) = match ASCII_ARRAY.iter().find(|&&(c, _, _)| c == ascii_char) { + Some((_ascii_char, ascii_name, token)) => (ascii_name, token), None => { let msg = format!("substitution character not found for '{}'", ch); reader.sess.span_diagnostic.span_bug_no_panic(span, &msg); - return false; + return None; } }; @@ -371,7 +378,7 @@ crate fn check_for_substitution<'a>( ); err.span_suggestion(span, &msg, ascii_char.to_string(), Applicability::MaybeIncorrect); } - true + token.clone() } /// Extract string if found at current position with given delimiters diff --git a/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.rs b/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.rs index 5c2c3b8ec61..66d562d2eb5 100644 --- a/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.rs +++ b/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.rs @@ -1,5 +1,6 @@ const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻² //~^ ERROR expected at least one digit in exponent //~| ERROR unknown start of token: \u{2212} +//~| ERROR cannot subtract `{integer}` from `{float}` fn main() {} diff --git a/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.stderr 
b/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.stderr index 07653c791db..9ee86adec52 100644 --- a/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.stderr +++ b/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.stderr @@ -14,5 +14,14 @@ help: Unicode character '−' (Minus Sign) looks like '-' (Minus/Hyphen), but it LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e-11; // m³⋅kg⁻¹⋅s⁻² | ^ -error: aborting due to 2 previous errors +error[E0277]: cannot subtract `{integer}` from `{float}` + --> $DIR/issue-49746-unicode-confusable-in-float-literal-expt.rs:1:53 + | +LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻² + | ^ no implementation for `{float} - {integer}` + | + = help: the trait `std::ops::Sub<{integer}>` is not implemented for `{float}` +error: aborting due to 3 previous errors + +For more information about this error, try `rustc --explain E0277`. diff --git a/src/test/ui/parser/recover-from-homoglyph.rs b/src/test/ui/parser/recover-from-homoglyph.rs new file mode 100644 index 00000000000..99ce0d1a630 --- /dev/null +++ b/src/test/ui/parser/recover-from-homoglyph.rs @@ -0,0 +1,4 @@ +fn main() { + println!(""); //~ ERROR unknown start of token: \u{37e} + let x: usize = (); //~ ERROR mismatched types +} diff --git a/src/test/ui/parser/recover-from-homoglyph.stderr b/src/test/ui/parser/recover-from-homoglyph.stderr new file mode 100644 index 00000000000..424d492b7ba --- /dev/null +++ b/src/test/ui/parser/recover-from-homoglyph.stderr @@ -0,0 +1,22 @@ +error: unknown start of token: \u{37e} + --> $DIR/recover-from-homoglyph.rs:2:17 + | +LL | println!(""); + | ^ +help: Unicode character ';' (Greek Question Mark) looks like ';' (Semicolon), but it is not + | +LL | println!(""); + | ^ + +error[E0308]: mismatched types + --> $DIR/recover-from-homoglyph.rs:3:20 + | +LL | let x: usize = (); + | ^^ expected usize, found () + | + = note: expected type 
`usize` + found type `()` + +error: aborting due to 2 previous errors + +For more information about this error, try `rustc --explain E0308`.