Rollup merge of #62963 - estebank:homoglyph-recovery, r=petrochenkov

Allow lexer to recover from some homoglyphs
This commit is contained in:
Mazdak Farrokhzad 2019-07-26 18:56:53 +02:00 committed by GitHub
commit 1893ac6db3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 89 additions and 36 deletions

View File

@ -389,8 +389,18 @@ impl<'a> StringReader<'a> {
self.pos,
"unknown start of token",
c);
unicode_chars::check_for_substitution(self, start, c, &mut err);
return Err(err)
// FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs
// into the token, instead of keeping a table in `check_for_substitution`. Ideally,
// this should be inside `rustc_lexer`. However, we should first remove compound
// tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
// as there will be less overall work to do this way.
return match unicode_chars::check_for_substitution(self, start, c, &mut err) {
Some(token) => {
err.emit();
Ok(token)
}
None => Err(err),
}
}
};
Ok(kind)

View File

@ -3,7 +3,8 @@
use super::StringReader;
use errors::{Applicability, DiagnosticBuilder};
use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION};
use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION, symbol::kw};
use crate::parse::token;
#[rustfmt::skip] // for line breaks
const UNICODE_ARRAY: &[(char, &str, char)] = &[
@ -297,32 +298,38 @@ const UNICODE_ARRAY: &[(char, &str, char)] = &[
('＞', "Fullwidth Greater-Than Sign", '>'),
];
const ASCII_ARRAY: &[(char, &str)] = &[
(' ', "Space"),
('_', "Underscore"),
('-', "Minus/Hyphen"),
(',', "Comma"),
(';', "Semicolon"),
(':', "Colon"),
('!', "Exclamation Mark"),
('?', "Question Mark"),
('.', "Period"),
('\'', "Single Quote"),
('"', "Quotation Mark"),
('(', "Left Parenthesis"),
(')', "Right Parenthesis"),
('[', "Left Square Bracket"),
(']', "Right Square Bracket"),
('{', "Left Curly Brace"),
('}', "Right Curly Brace"),
('*', "Asterisk"),
('/', "Slash"),
('\\', "Backslash"),
('&', "Ampersand"),
('+', "Plus Sign"),
('<', "Less-Than Sign"),
('=', "Equals Sign"),
('>', "Greater-Than Sign"),
// FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs into tokens,
// instead of keeping the substitution token in this table. Ideally, this should be inside
// `rustc_lexer`. However, we should first remove compound tokens like `<<` from `rustc_lexer`,
// and then add fancier error recovery to it, as there will be less overall work to do this way.
/// ASCII characters that a confusable Unicode character can be mapped to.
///
/// Each entry is `(ascii_char, human-readable name, recovery token)`. `check_for_substitution`
/// looks an entry up by `ascii_char`, uses the name in its diagnostic, and hands a clone of the
/// `Option<token::TokenKind>` back to its caller. `None` means no replacement token is offered
/// for that character.
const ASCII_ARRAY: &[(char, &str, Option<token::TokenKind>)] = &[
(' ', "Space", Some(token::Whitespace)),
('_', "Underscore", Some(token::Ident(kw::Underscore, false))),
('-', "Minus/Hyphen", Some(token::BinOp(token::Minus))),
(',', "Comma", Some(token::Comma)),
(';', "Semicolon", Some(token::Semi)),
(':', "Colon", Some(token::Colon)),
('!', "Exclamation Mark", Some(token::Not)),
('?', "Question Mark", Some(token::Question)),
('.', "Period", Some(token::Dot)),
('(', "Left Parenthesis", Some(token::OpenDelim(token::Paren))),
(')', "Right Parenthesis", Some(token::CloseDelim(token::Paren))),
('[', "Left Square Bracket", Some(token::OpenDelim(token::Bracket))),
(']', "Right Square Bracket", Some(token::CloseDelim(token::Bracket))),
('{', "Left Curly Brace", Some(token::OpenDelim(token::Brace))),
('}', "Right Curly Brace", Some(token::CloseDelim(token::Brace))),
('*', "Asterisk", Some(token::BinOp(token::Star))),
('/', "Slash", Some(token::BinOp(token::Slash))),
// No recovery token is provided for a backslash.
('\\', "Backslash", None),
('&', "Ampersand", Some(token::BinOp(token::And))),
('+', "Plus Sign", Some(token::BinOp(token::Plus))),
('<', "Less-Than Sign", Some(token::Lt)),
('=', "Equals Sign", Some(token::Eq)),
('>', "Greater-Than Sign", Some(token::Gt)),
// FIXME: Literals are already lexed by this point, so we can't recover gracefully just by
// spitting the correct token out.
('\'', "Single Quote", None),
('"', "Quotation Mark", None),
];
crate fn check_for_substitution<'a>(
@ -330,20 +337,20 @@ crate fn check_for_substitution<'a>(
pos: BytePos,
ch: char,
err: &mut DiagnosticBuilder<'a>,
) -> bool {
) -> Option<token::TokenKind> {
let (u_name, ascii_char) = match UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch) {
Some(&(_u_char, u_name, ascii_char)) => (u_name, ascii_char),
None => return false,
None => return None,
};
let span = Span::new(pos, pos + Pos::from_usize(ch.len_utf8()), NO_EXPANSION);
let ascii_name = match ASCII_ARRAY.iter().find(|&&(c, _)| c == ascii_char) {
Some((_ascii_char, ascii_name)) => ascii_name,
let (ascii_name, token) = match ASCII_ARRAY.iter().find(|&&(c, _, _)| c == ascii_char) {
Some((_ascii_char, ascii_name, token)) => (ascii_name, token),
None => {
let msg = format!("substitution character not found for '{}'", ch);
reader.sess.span_diagnostic.span_bug_no_panic(span, &msg);
return false;
return None;
}
};
@ -371,7 +378,7 @@ crate fn check_for_substitution<'a>(
);
err.span_suggestion(span, &msg, ascii_char.to_string(), Applicability::MaybeIncorrect);
}
true
token.clone()
}
/// Extract string if found at current position with given delimiters

View File

@ -1,5 +1,6 @@
const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻²
//~^ ERROR expected at least one digit in exponent
//~| ERROR unknown start of token: \u{2212}
//~| ERROR cannot subtract `{integer}` from `{float}`
fn main() {}

View File

@ -14,5 +14,14 @@ help: Unicode character '−' (Minus Sign) looks like '-' (Minus/Hyphen), but it
LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e-11; // m³⋅kg⁻¹⋅s⁻²
| ^
error: aborting due to 2 previous errors
error[E0277]: cannot subtract `{integer}` from `{float}`
--> $DIR/issue-49746-unicode-confusable-in-float-literal-expt.rs:1:53
|
LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻²
| ^ no implementation for `{float} - {integer}`
|
= help: the trait `std::ops::Sub<{integer}>` is not implemented for `{float}`
error: aborting due to 3 previous errors
For more information about this error, try `rustc --explain E0277`.

View File

@ -0,0 +1,4 @@
fn main() {
println!(""); //~ ERROR unknown start of token: \u{37e}
let x: usize = (); //~ ERROR mismatched types
}

View File

@ -0,0 +1,22 @@
error: unknown start of token: \u{37e}
--> $DIR/recover-from-homoglyph.rs:2:17
|
LL | println!("");
| ^
help: Unicode character ';' (Greek Question Mark) looks like ';' (Semicolon), but it is not
|
LL | println!("");
| ^
error[E0308]: mismatched types
--> $DIR/recover-from-homoglyph.rs:3:20
|
LL | let x: usize = ();
| ^^ expected usize, found ()
|
= note: expected type `usize`
found type `()`
error: aborting due to 2 previous errors
For more information about this error, try `rustc --explain E0308`.