Rollup merge of #62963 - estebank:homoglyph-recovery, r=petrochenkov

Allow lexer to recover from some homoglyphs
This commit is contained in:
Mazdak Farrokhzad 2019-07-26 18:56:53 +02:00 committed by GitHub
commit 1893ac6db3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 89 additions and 36 deletions

View File

@ -389,8 +389,18 @@ impl<'a> StringReader<'a> {
self.pos, self.pos,
"unknown start of token", "unknown start of token",
c); c);
unicode_chars::check_for_substitution(self, start, c, &mut err); // FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs,
return Err(err) // instead of keeping a table in `check_for_substitution` into the token. Ideally,
// this should be inside `rustc_lexer`. However, we should first remove compound
// tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
// as there will be less overall work to do this way.
return match unicode_chars::check_for_substitution(self, start, c, &mut err) {
Some(token) => {
err.emit();
Ok(token)
}
None => Err(err),
}
} }
}; };
Ok(kind) Ok(kind)

View File

@ -3,7 +3,8 @@
use super::StringReader; use super::StringReader;
use errors::{Applicability, DiagnosticBuilder}; use errors::{Applicability, DiagnosticBuilder};
use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION}; use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION, symbol::kw};
use crate::parse::token;
#[rustfmt::skip] // for line breaks #[rustfmt::skip] // for line breaks
const UNICODE_ARRAY: &[(char, &str, char)] = &[ const UNICODE_ARRAY: &[(char, &str, char)] = &[
@ -297,32 +298,38 @@ const UNICODE_ARRAY: &[(char, &str, char)] = &[
('＞', "Fullwidth Greater-Than Sign", '>'), ('＞', "Fullwidth Greater-Than Sign", '>'),
]; ];
const ASCII_ARRAY: &[(char, &str)] = &[ // FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs, instead of
(' ', "Space"), // keeping the substitution token in this table. Ideally, this should be inside `rustc_lexer`.
('_', "Underscore"), // However, we should first remove compound tokens like `<<` from `rustc_lexer`, and then add
('-', "Minus/Hyphen"), // fancier error recovery to it, as there will be less overall work to do this way.
(',', "Comma"), const ASCII_ARRAY: &[(char, &str, Option<token::TokenKind>)] = &[
(';', "Semicolon"), (' ', "Space", Some(token::Whitespace)),
(':', "Colon"), ('_', "Underscore", Some(token::Ident(kw::Underscore, false))),
('!', "Exclamation Mark"), ('-', "Minus/Hyphen", Some(token::BinOp(token::Minus))),
('?', "Question Mark"), (',', "Comma", Some(token::Comma)),
('.', "Period"), (';', "Semicolon", Some(token::Semi)),
('\'', "Single Quote"), (':', "Colon", Some(token::Colon)),
('"', "Quotation Mark"), ('!', "Exclamation Mark", Some(token::Not)),
('(', "Left Parenthesis"), ('?', "Question Mark", Some(token::Question)),
(')', "Right Parenthesis"), ('.', "Period", Some(token::Dot)),
('[', "Left Square Bracket"), ('(', "Left Parenthesis", Some(token::OpenDelim(token::Paren))),
(']', "Right Square Bracket"), (')', "Right Parenthesis", Some(token::CloseDelim(token::Paren))),
('{', "Left Curly Brace"), ('[', "Left Square Bracket", Some(token::OpenDelim(token::Bracket))),
('}', "Right Curly Brace"), (']', "Right Square Bracket", Some(token::CloseDelim(token::Bracket))),
('*', "Asterisk"), ('{', "Left Curly Brace", Some(token::OpenDelim(token::Brace))),
('/', "Slash"), ('}', "Right Curly Brace", Some(token::CloseDelim(token::Brace))),
('\\', "Backslash"), ('*', "Asterisk", Some(token::BinOp(token::Star))),
('&', "Ampersand"), ('/', "Slash", Some(token::BinOp(token::Slash))),
('+', "Plus Sign"), ('\\', "Backslash", None),
('<', "Less-Than Sign"), ('&', "Ampersand", Some(token::BinOp(token::And))),
('=', "Equals Sign"), ('+', "Plus Sign", Some(token::BinOp(token::Plus))),
('>', "Greater-Than Sign"), ('<', "Less-Than Sign", Some(token::Lt)),
('=', "Equals Sign", Some(token::Eq)),
('>', "Greater-Than Sign", Some(token::Gt)),
// FIXME: Literals are already lexed by this point, so we can't recover gracefully just by
// spitting the correct token out.
('\'', "Single Quote", None),
('"', "Quotation Mark", None),
]; ];
crate fn check_for_substitution<'a>( crate fn check_for_substitution<'a>(
@ -330,20 +337,20 @@ crate fn check_for_substitution<'a>(
pos: BytePos, pos: BytePos,
ch: char, ch: char,
err: &mut DiagnosticBuilder<'a>, err: &mut DiagnosticBuilder<'a>,
) -> bool { ) -> Option<token::TokenKind> {
let (u_name, ascii_char) = match UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch) { let (u_name, ascii_char) = match UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch) {
Some(&(_u_char, u_name, ascii_char)) => (u_name, ascii_char), Some(&(_u_char, u_name, ascii_char)) => (u_name, ascii_char),
None => return false, None => return None,
}; };
let span = Span::new(pos, pos + Pos::from_usize(ch.len_utf8()), NO_EXPANSION); let span = Span::new(pos, pos + Pos::from_usize(ch.len_utf8()), NO_EXPANSION);
let ascii_name = match ASCII_ARRAY.iter().find(|&&(c, _)| c == ascii_char) { let (ascii_name, token) = match ASCII_ARRAY.iter().find(|&&(c, _, _)| c == ascii_char) {
Some((_ascii_char, ascii_name)) => ascii_name, Some((_ascii_char, ascii_name, token)) => (ascii_name, token),
None => { None => {
let msg = format!("substitution character not found for '{}'", ch); let msg = format!("substitution character not found for '{}'", ch);
reader.sess.span_diagnostic.span_bug_no_panic(span, &msg); reader.sess.span_diagnostic.span_bug_no_panic(span, &msg);
return false; return None;
} }
}; };
@ -371,7 +378,7 @@ crate fn check_for_substitution<'a>(
); );
err.span_suggestion(span, &msg, ascii_char.to_string(), Applicability::MaybeIncorrect); err.span_suggestion(span, &msg, ascii_char.to_string(), Applicability::MaybeIncorrect);
} }
true token.clone()
} }
/// Extract string if found at current position with given delimiters /// Extract string if found at current position with given delimiters

View File

@ -1,5 +1,6 @@
const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻² const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻²
//~^ ERROR expected at least one digit in exponent //~^ ERROR expected at least one digit in exponent
//~| ERROR unknown start of token: \u{2212} //~| ERROR unknown start of token: \u{2212}
//~| ERROR cannot subtract `{integer}` from `{float}`
fn main() {} fn main() {}

View File

@ -14,5 +14,14 @@ help: Unicode character '−' (Minus Sign) looks like '-' (Minus/Hyphen), but it
LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e-11; // m³⋅kg⁻¹⋅s⁻² LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e-11; // m³⋅kg⁻¹⋅s⁻²
| ^ | ^
error: aborting due to 2 previous errors error[E0277]: cannot subtract `{integer}` from `{float}`
--> $DIR/issue-49746-unicode-confusable-in-float-literal-expt.rs:1:53
|
LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻²
| ^ no implementation for `{float} - {integer}`
|
= help: the trait `std::ops::Sub<{integer}>` is not implemented for `{float}`
error: aborting due to 3 previous errors
For more information about this error, try `rustc --explain E0277`.

View File

@ -0,0 +1,4 @@
fn main() {
println!("")； //~ ERROR unknown start of token: \u{37e}
let x: usize = (); //~ ERROR mismatched types
}

View File

@ -0,0 +1,22 @@
error: unknown start of token: \u{37e}
--> $DIR/recover-from-homoglyph.rs:2:17
|
LL | println!("")；
| ^
help: Unicode character '；' (Greek Question Mark) looks like ';' (Semicolon), but it is not
|
LL | println!("");
| ^
error[E0308]: mismatched types
--> $DIR/recover-from-homoglyph.rs:3:20
|
LL | let x: usize = ();
| ^^ expected usize, found ()
|
= note: expected type `usize`
found type `()`
error: aborting due to 2 previous errors
For more information about this error, try `rustc --explain E0308`.