Rollup merge of #62963 - estebank:homoglyph-recovery, r=petrochenkov
Allow lexer to recover from some homoglyphs
This commit is contained in:
commit
1893ac6db3
@ -389,8 +389,18 @@ impl<'a> StringReader<'a> {
|
|||||||
self.pos,
|
self.pos,
|
||||||
"unknown start of token",
|
"unknown start of token",
|
||||||
c);
|
c);
|
||||||
unicode_chars::check_for_substitution(self, start, c, &mut err);
|
// FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs,
|
||||||
return Err(err)
|
// instead of keeping a table in `check_for_substitution`, into the token. Ideally,
|
||||||
|
// this should be inside `rustc_lexer`. However, we should first remove compound
|
||||||
|
// tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
|
||||||
|
// as there will be less overall work to do this way.
|
||||||
|
return match unicode_chars::check_for_substitution(self, start, c, &mut err) {
|
||||||
|
Some(token) => {
|
||||||
|
err.emit();
|
||||||
|
Ok(token)
|
||||||
|
}
|
||||||
|
None => Err(err),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
Ok(kind)
|
Ok(kind)
|
||||||
|
@ -3,7 +3,8 @@
|
|||||||
|
|
||||||
use super::StringReader;
|
use super::StringReader;
|
||||||
use errors::{Applicability, DiagnosticBuilder};
|
use errors::{Applicability, DiagnosticBuilder};
|
||||||
use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION};
|
use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION, symbol::kw};
|
||||||
|
use crate::parse::token;
|
||||||
|
|
||||||
#[rustfmt::skip] // for line breaks
|
#[rustfmt::skip] // for line breaks
|
||||||
const UNICODE_ARRAY: &[(char, &str, char)] = &[
|
const UNICODE_ARRAY: &[(char, &str, char)] = &[
|
||||||
@ -297,32 +298,38 @@ const UNICODE_ARRAY: &[(char, &str, char)] = &[
|
|||||||
('>', "Fullwidth Greater-Than Sign", '>'),
|
('>', "Fullwidth Greater-Than Sign", '>'),
|
||||||
];
|
];
|
||||||
|
|
||||||
const ASCII_ARRAY: &[(char, &str)] = &[
|
// FIXME: the lexer could be used to turn the ASCII version of Unicode homoglyphs into tokens, instead of
|
||||||
(' ', "Space"),
|
// keeping the substitution token in this table. Ideally, this should be inside `rustc_lexer`.
|
||||||
('_', "Underscore"),
|
// However, we should first remove compound tokens like `<<` from `rustc_lexer`, and then add
|
||||||
('-', "Minus/Hyphen"),
|
// fancier error recovery to it, as there will be less overall work to do this way.
|
||||||
(',', "Comma"),
|
const ASCII_ARRAY: &[(char, &str, Option<token::TokenKind>)] = &[
|
||||||
(';', "Semicolon"),
|
(' ', "Space", Some(token::Whitespace)),
|
||||||
(':', "Colon"),
|
('_', "Underscore", Some(token::Ident(kw::Underscore, false))),
|
||||||
('!', "Exclamation Mark"),
|
('-', "Minus/Hyphen", Some(token::BinOp(token::Minus))),
|
||||||
('?', "Question Mark"),
|
(',', "Comma", Some(token::Comma)),
|
||||||
('.', "Period"),
|
(';', "Semicolon", Some(token::Semi)),
|
||||||
('\'', "Single Quote"),
|
(':', "Colon", Some(token::Colon)),
|
||||||
('"', "Quotation Mark"),
|
('!', "Exclamation Mark", Some(token::Not)),
|
||||||
('(', "Left Parenthesis"),
|
('?', "Question Mark", Some(token::Question)),
|
||||||
(')', "Right Parenthesis"),
|
('.', "Period", Some(token::Dot)),
|
||||||
('[', "Left Square Bracket"),
|
('(', "Left Parenthesis", Some(token::OpenDelim(token::Paren))),
|
||||||
(']', "Right Square Bracket"),
|
(')', "Right Parenthesis", Some(token::CloseDelim(token::Paren))),
|
||||||
('{', "Left Curly Brace"),
|
('[', "Left Square Bracket", Some(token::OpenDelim(token::Bracket))),
|
||||||
('}', "Right Curly Brace"),
|
(']', "Right Square Bracket", Some(token::CloseDelim(token::Bracket))),
|
||||||
('*', "Asterisk"),
|
('{', "Left Curly Brace", Some(token::OpenDelim(token::Brace))),
|
||||||
('/', "Slash"),
|
('}', "Right Curly Brace", Some(token::CloseDelim(token::Brace))),
|
||||||
('\\', "Backslash"),
|
('*', "Asterisk", Some(token::BinOp(token::Star))),
|
||||||
('&', "Ampersand"),
|
('/', "Slash", Some(token::BinOp(token::Slash))),
|
||||||
('+', "Plus Sign"),
|
('\\', "Backslash", None),
|
||||||
('<', "Less-Than Sign"),
|
('&', "Ampersand", Some(token::BinOp(token::And))),
|
||||||
('=', "Equals Sign"),
|
('+', "Plus Sign", Some(token::BinOp(token::Plus))),
|
||||||
('>', "Greater-Than Sign"),
|
('<', "Less-Than Sign", Some(token::Lt)),
|
||||||
|
('=', "Equals Sign", Some(token::Eq)),
|
||||||
|
('>', "Greater-Than Sign", Some(token::Gt)),
|
||||||
|
// FIXME: Literals are already lexed by this point, so we can't recover gracefully just by
|
||||||
|
// spitting the correct token out.
|
||||||
|
('\'', "Single Quote", None),
|
||||||
|
('"', "Quotation Mark", None),
|
||||||
];
|
];
|
||||||
|
|
||||||
crate fn check_for_substitution<'a>(
|
crate fn check_for_substitution<'a>(
|
||||||
@ -330,20 +337,20 @@ crate fn check_for_substitution<'a>(
|
|||||||
pos: BytePos,
|
pos: BytePos,
|
||||||
ch: char,
|
ch: char,
|
||||||
err: &mut DiagnosticBuilder<'a>,
|
err: &mut DiagnosticBuilder<'a>,
|
||||||
) -> bool {
|
) -> Option<token::TokenKind> {
|
||||||
let (u_name, ascii_char) = match UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch) {
|
let (u_name, ascii_char) = match UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch) {
|
||||||
Some(&(_u_char, u_name, ascii_char)) => (u_name, ascii_char),
|
Some(&(_u_char, u_name, ascii_char)) => (u_name, ascii_char),
|
||||||
None => return false,
|
None => return None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let span = Span::new(pos, pos + Pos::from_usize(ch.len_utf8()), NO_EXPANSION);
|
let span = Span::new(pos, pos + Pos::from_usize(ch.len_utf8()), NO_EXPANSION);
|
||||||
|
|
||||||
let ascii_name = match ASCII_ARRAY.iter().find(|&&(c, _)| c == ascii_char) {
|
let (ascii_name, token) = match ASCII_ARRAY.iter().find(|&&(c, _, _)| c == ascii_char) {
|
||||||
Some((_ascii_char, ascii_name)) => ascii_name,
|
Some((_ascii_char, ascii_name, token)) => (ascii_name, token),
|
||||||
None => {
|
None => {
|
||||||
let msg = format!("substitution character not found for '{}'", ch);
|
let msg = format!("substitution character not found for '{}'", ch);
|
||||||
reader.sess.span_diagnostic.span_bug_no_panic(span, &msg);
|
reader.sess.span_diagnostic.span_bug_no_panic(span, &msg);
|
||||||
return false;
|
return None;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -371,7 +378,7 @@ crate fn check_for_substitution<'a>(
|
|||||||
);
|
);
|
||||||
err.span_suggestion(span, &msg, ascii_char.to_string(), Applicability::MaybeIncorrect);
|
err.span_suggestion(span, &msg, ascii_char.to_string(), Applicability::MaybeIncorrect);
|
||||||
}
|
}
|
||||||
true
|
token.clone()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extract string if found at current position with given delimiters
|
/// Extract string if found at current position with given delimiters
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻²
|
const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻²
|
||||||
//~^ ERROR expected at least one digit in exponent
|
//~^ ERROR expected at least one digit in exponent
|
||||||
//~| ERROR unknown start of token: \u{2212}
|
//~| ERROR unknown start of token: \u{2212}
|
||||||
|
//~| ERROR cannot subtract `{integer}` from `{float}`
|
||||||
|
|
||||||
fn main() {}
|
fn main() {}
|
||||||
|
@ -14,5 +14,14 @@ help: Unicode character '−' (Minus Sign) looks like '-' (Minus/Hyphen), but it
|
|||||||
LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e-11; // m³⋅kg⁻¹⋅s⁻²
|
LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e-11; // m³⋅kg⁻¹⋅s⁻²
|
||||||
| ^
|
| ^
|
||||||
|
|
||||||
error: aborting due to 2 previous errors
|
error[E0277]: cannot subtract `{integer}` from `{float}`
|
||||||
|
--> $DIR/issue-49746-unicode-confusable-in-float-literal-expt.rs:1:53
|
||||||
|
|
|
||||||
|
LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻²
|
||||||
|
| ^ no implementation for `{float} - {integer}`
|
||||||
|
|
|
||||||
|
= help: the trait `std::ops::Sub<{integer}>` is not implemented for `{float}`
|
||||||
|
|
||||||
|
error: aborting due to 3 previous errors
|
||||||
|
|
||||||
|
For more information about this error, try `rustc --explain E0277`.
|
||||||
|
4
src/test/ui/parser/recover-from-homoglyph.rs
Normal file
4
src/test/ui/parser/recover-from-homoglyph.rs
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
fn main() {
|
||||||
|
println!(""); //~ ERROR unknown start of token: \u{37e}
|
||||||
|
let x: usize = (); //~ ERROR mismatched types
|
||||||
|
}
|
22
src/test/ui/parser/recover-from-homoglyph.stderr
Normal file
22
src/test/ui/parser/recover-from-homoglyph.stderr
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
error: unknown start of token: \u{37e}
|
||||||
|
--> $DIR/recover-from-homoglyph.rs:2:17
|
||||||
|
|
|
||||||
|
LL | println!("");
|
||||||
|
| ^
|
||||||
|
help: Unicode character ';' (Greek Question Mark) looks like ';' (Semicolon), but it is not
|
||||||
|
|
|
||||||
|
LL | println!("");
|
||||||
|
| ^
|
||||||
|
|
||||||
|
error[E0308]: mismatched types
|
||||||
|
--> $DIR/recover-from-homoglyph.rs:3:20
|
||||||
|
|
|
||||||
|
LL | let x: usize = ();
|
||||||
|
| ^^ expected usize, found ()
|
||||||
|
|
|
||||||
|
= note: expected type `usize`
|
||||||
|
found type `()`
|
||||||
|
|
||||||
|
error: aborting due to 2 previous errors
|
||||||
|
|
||||||
|
For more information about this error, try `rustc --explain E0308`.
|
Loading…
Reference in New Issue
Block a user