Rollup merge of #62963 - estebank:homoglyph-recovery, r=petrochenkov
Allow lexer to recover from some homoglyphs
This commit is contained in:
commit
1893ac6db3
@ -389,8 +389,18 @@ impl<'a> StringReader<'a> {
|
||||
self.pos,
|
||||
"unknown start of token",
|
||||
c);
|
||||
unicode_chars::check_for_substitution(self, start, c, &mut err);
|
||||
return Err(err)
|
||||
// FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs into
|
||||
// the token, instead of keeping a table in `check_for_substitution`. Ideally,
|
||||
// this should be inside `rustc_lexer`. However, we should first remove compound
|
||||
// tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
|
||||
// as there will be less overall work to do this way.
|
||||
return match unicode_chars::check_for_substitution(self, start, c, &mut err) {
|
||||
Some(token) => {
|
||||
err.emit();
|
||||
Ok(token)
|
||||
}
|
||||
None => Err(err),
|
||||
}
|
||||
}
|
||||
};
|
||||
Ok(kind)
|
||||
|
@ -3,7 +3,8 @@
|
||||
|
||||
use super::StringReader;
|
||||
use errors::{Applicability, DiagnosticBuilder};
|
||||
use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION};
|
||||
use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION, symbol::kw};
|
||||
use crate::parse::token;
|
||||
|
||||
#[rustfmt::skip] // for line breaks
|
||||
const UNICODE_ARRAY: &[(char, &str, char)] = &[
|
||||
@ -297,32 +298,38 @@ const UNICODE_ARRAY: &[(char, &str, char)] = &[
|
||||
('>', "Fullwidth Greater-Than Sign", '>'),
|
||||
];
|
||||
|
||||
const ASCII_ARRAY: &[(char, &str)] = &[
|
||||
(' ', "Space"),
|
||||
('_', "Underscore"),
|
||||
('-', "Minus/Hyphen"),
|
||||
(',', "Comma"),
|
||||
(';', "Semicolon"),
|
||||
(':', "Colon"),
|
||||
('!', "Exclamation Mark"),
|
||||
('?', "Question Mark"),
|
||||
('.', "Period"),
|
||||
('\'', "Single Quote"),
|
||||
('"', "Quotation Mark"),
|
||||
('(', "Left Parenthesis"),
|
||||
(')', "Right Parenthesis"),
|
||||
('[', "Left Square Bracket"),
|
||||
(']', "Right Square Bracket"),
|
||||
('{', "Left Curly Brace"),
|
||||
('}', "Right Curly Brace"),
|
||||
('*', "Asterisk"),
|
||||
('/', "Slash"),
|
||||
('\\', "Backslash"),
|
||||
('&', "Ampersand"),
|
||||
('+', "Plus Sign"),
|
||||
('<', "Less-Than Sign"),
|
||||
('=', "Equals Sign"),
|
||||
('>', "Greater-Than Sign"),
|
||||
// FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs into tokens,
|
||||
// instead of keeping the substitution token in this table. Ideally, this should be inside
|
||||
// `rustc_lexer`. However, we should first remove compound tokens like `<<` from `rustc_lexer`,
|
||||
// and then add fancier error recovery to it, as there will be less overall work to do this way.
|
||||
const ASCII_ARRAY: &[(char, &str, Option<token::TokenKind>)] = &[
|
||||
(' ', "Space", Some(token::Whitespace)),
|
||||
('_', "Underscore", Some(token::Ident(kw::Underscore, false))),
|
||||
('-', "Minus/Hyphen", Some(token::BinOp(token::Minus))),
|
||||
(',', "Comma", Some(token::Comma)),
|
||||
(';', "Semicolon", Some(token::Semi)),
|
||||
(':', "Colon", Some(token::Colon)),
|
||||
('!', "Exclamation Mark", Some(token::Not)),
|
||||
('?', "Question Mark", Some(token::Question)),
|
||||
('.', "Period", Some(token::Dot)),
|
||||
('(', "Left Parenthesis", Some(token::OpenDelim(token::Paren))),
|
||||
(')', "Right Parenthesis", Some(token::CloseDelim(token::Paren))),
|
||||
('[', "Left Square Bracket", Some(token::OpenDelim(token::Bracket))),
|
||||
(']', "Right Square Bracket", Some(token::CloseDelim(token::Bracket))),
|
||||
('{', "Left Curly Brace", Some(token::OpenDelim(token::Brace))),
|
||||
('}', "Right Curly Brace", Some(token::CloseDelim(token::Brace))),
|
||||
('*', "Asterisk", Some(token::BinOp(token::Star))),
|
||||
('/', "Slash", Some(token::BinOp(token::Slash))),
|
||||
('\\', "Backslash", None),
|
||||
('&', "Ampersand", Some(token::BinOp(token::And))),
|
||||
('+', "Plus Sign", Some(token::BinOp(token::Plus))),
|
||||
('<', "Less-Than Sign", Some(token::Lt)),
|
||||
('=', "Equals Sign", Some(token::Eq)),
|
||||
('>', "Greater-Than Sign", Some(token::Gt)),
|
||||
// FIXME: Literals are already lexed by this point, so we can't recover gracefully just by
|
||||
// spitting the correct token out.
|
||||
('\'', "Single Quote", None),
|
||||
('"', "Quotation Mark", None),
|
||||
];
|
||||
|
||||
crate fn check_for_substitution<'a>(
|
||||
@ -330,20 +337,20 @@ crate fn check_for_substitution<'a>(
|
||||
pos: BytePos,
|
||||
ch: char,
|
||||
err: &mut DiagnosticBuilder<'a>,
|
||||
) -> bool {
|
||||
) -> Option<token::TokenKind> {
|
||||
let (u_name, ascii_char) = match UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch) {
|
||||
Some(&(_u_char, u_name, ascii_char)) => (u_name, ascii_char),
|
||||
None => return false,
|
||||
None => return None,
|
||||
};
|
||||
|
||||
let span = Span::new(pos, pos + Pos::from_usize(ch.len_utf8()), NO_EXPANSION);
|
||||
|
||||
let ascii_name = match ASCII_ARRAY.iter().find(|&&(c, _)| c == ascii_char) {
|
||||
Some((_ascii_char, ascii_name)) => ascii_name,
|
||||
let (ascii_name, token) = match ASCII_ARRAY.iter().find(|&&(c, _, _)| c == ascii_char) {
|
||||
Some((_ascii_char, ascii_name, token)) => (ascii_name, token),
|
||||
None => {
|
||||
let msg = format!("substitution character not found for '{}'", ch);
|
||||
reader.sess.span_diagnostic.span_bug_no_panic(span, &msg);
|
||||
return false;
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
@ -371,7 +378,7 @@ crate fn check_for_substitution<'a>(
|
||||
);
|
||||
err.span_suggestion(span, &msg, ascii_char.to_string(), Applicability::MaybeIncorrect);
|
||||
}
|
||||
true
|
||||
token.clone()
|
||||
}
|
||||
|
||||
/// Extract string if found at current position with given delimiters
|
||||
|
@ -1,5 +1,6 @@
|
||||
const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻²
|
||||
//~^ ERROR expected at least one digit in exponent
|
||||
//~| ERROR unknown start of token: \u{2212}
|
||||
//~| ERROR cannot subtract `{integer}` from `{float}`
|
||||
|
||||
fn main() {}
|
||||
|
@ -14,5 +14,14 @@ help: Unicode character '−' (Minus Sign) looks like '-' (Minus/Hyphen), but it
|
||||
LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e-11; // m³⋅kg⁻¹⋅s⁻²
|
||||
| ^
|
||||
|
||||
error: aborting due to 2 previous errors
|
||||
error[E0277]: cannot subtract `{integer}` from `{float}`
|
||||
--> $DIR/issue-49746-unicode-confusable-in-float-literal-expt.rs:1:53
|
||||
|
|
||||
LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻²
|
||||
| ^ no implementation for `{float} - {integer}`
|
||||
|
|
||||
= help: the trait `std::ops::Sub<{integer}>` is not implemented for `{float}`
|
||||
|
||||
error: aborting due to 3 previous errors
|
||||
|
||||
For more information about this error, try `rustc --explain E0277`.
|
||||
|
4
src/test/ui/parser/recover-from-homoglyph.rs
Normal file
4
src/test/ui/parser/recover-from-homoglyph.rs
Normal file
@ -0,0 +1,4 @@
|
||||
fn main() {
|
||||
println!(""); //~ ERROR unknown start of token: \u{37e}
|
||||
let x: usize = (); //~ ERROR mismatched types
|
||||
}
|
22
src/test/ui/parser/recover-from-homoglyph.stderr
Normal file
22
src/test/ui/parser/recover-from-homoglyph.stderr
Normal file
@ -0,0 +1,22 @@
|
||||
error: unknown start of token: \u{37e}
|
||||
--> $DIR/recover-from-homoglyph.rs:2:17
|
||||
|
|
||||
LL | println!("");
|
||||
| ^
|
||||
help: Unicode character ';' (Greek Question Mark) looks like ';' (Semicolon), but it is not
|
||||
|
|
||||
LL | println!("");
|
||||
| ^
|
||||
|
||||
error[E0308]: mismatched types
|
||||
--> $DIR/recover-from-homoglyph.rs:3:20
|
||||
|
|
||||
LL | let x: usize = ();
|
||||
| ^^ expected usize, found ()
|
||||
|
|
||||
= note: expected type `usize`
|
||||
found type `()`
|
||||
|
||||
error: aborting due to 2 previous errors
|
||||
|
||||
For more information about this error, try `rustc --explain E0308`.
|
Loading…
Reference in New Issue
Block a user