diff --git a/Cargo.lock b/Cargo.lock index 0c1c533f395..e05439d71e7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5405,15 +5405,15 @@ dependencies = [ [[package]] name = "unicode-script" -version = "0.4.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b2c5c29e805da6817f5af6a627d65adb045cebf05cccd5a3493d6109454391c" +checksum = "58b33414ea8db4b7ea0343548dbdc31d27aef06beacf7044a87e564d9b0feb7d" [[package]] name = "unicode-security" -version = "0.0.3" +version = "0.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5f9011bbed9c13372bc8df618b55a38138445199caf3b61d432c6859c36dee0" +checksum = "5d87c28edc5b263377e448d6cdcb935c06b95413d8013ba6fae470558ccab18f" dependencies = [ "unicode-normalization", "unicode-script", diff --git a/src/librustc_lint/Cargo.toml b/src/librustc_lint/Cargo.toml index ada6f2a9381..58c15257326 100644 --- a/src/librustc_lint/Cargo.toml +++ b/src/librustc_lint/Cargo.toml @@ -10,7 +10,7 @@ path = "lib.rs" [dependencies] log = "0.4" -unicode-security = "0.0.3" +unicode-security = "0.0.5" rustc_middle = { path = "../librustc_middle" } rustc_ast_pretty = { path = "../librustc_ast_pretty" } rustc_attr = { path = "../librustc_attr" } diff --git a/src/librustc_lint/non_ascii_idents.rs b/src/librustc_lint/non_ascii_idents.rs index 90bd7ad4acf..30dbd069c29 100644 --- a/src/librustc_lint/non_ascii_idents.rs +++ b/src/librustc_lint/non_ascii_idents.rs @@ -24,12 +24,20 @@ declare_lint! { crate_level_only } -declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS]); +declare_lint! 
{ + pub MIXED_SCRIPT_CONFUSABLES, + Warn, + "detects Unicode scripts whose mixed script confusables codepoints are solely used", + crate_level_only +} + +declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS, MIXED_SCRIPT_CONFUSABLES]); impl EarlyLintPass for NonAsciiIdents { fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) { use rustc_session::lint::Level; use rustc_span::Span; + use std::collections::BTreeMap; use unicode_security::GeneralSecurityProfile; use utils::CowBoxSymStr; @@ -37,8 +45,14 @@ impl EarlyLintPass for NonAsciiIdents { let check_uncommon_codepoints = cx.builder.lint_level(UNCOMMON_CODEPOINTS).0 != Level::Allow; let check_confusable_idents = cx.builder.lint_level(CONFUSABLE_IDENTS).0 != Level::Allow; + let check_mixed_script_confusables = + cx.builder.lint_level(MIXED_SCRIPT_CONFUSABLES).0 != Level::Allow; - if !check_non_ascii_idents && !check_uncommon_codepoints && !check_confusable_idents { + if !check_non_ascii_idents + && !check_uncommon_codepoints + && !check_confusable_idents + && !check_mixed_script_confusables + { return; } @@ -107,6 +121,115 @@ impl EarlyLintPass for NonAsciiIdents { .or_insert((symbol_str, sp, is_ascii)); } } + + if has_non_ascii_idents && check_mixed_script_confusables { + use unicode_security::is_potential_mixed_script_confusable_char; + use unicode_security::mixed_script::AugmentedScriptSet; + + #[derive(Clone)] + enum ScriptSetUsage { + Suspicious(Vec<char>, Span), + Verified, + } + + let mut script_states: FxHashMap<AugmentedScriptSet, ScriptSetUsage> = + FxHashMap::default(); + let latin_augmented_script_set = AugmentedScriptSet::for_char('A'); + script_states.insert(latin_augmented_script_set, ScriptSetUsage::Verified); + + let mut has_suspicous = false; + for (symbol, &sp) in symbols.iter() { + let symbol_str = symbol.as_str(); + for ch in symbol_str.chars() { + if ch.is_ascii() { + // all ascii characters are covered by exception.
+ continue; + } + if !GeneralSecurityProfile::identifier_allowed(ch) { + // this character is covered by `uncommon_codepoints` lint. + continue; + } + let augmented_script_set = AugmentedScriptSet::for_char(ch); + script_states + .entry(augmented_script_set) + .and_modify(|existing_state| { + if let ScriptSetUsage::Suspicious(ch_list, _) = existing_state { + if is_potential_mixed_script_confusable_char(ch) { + ch_list.push(ch); + } else { + *existing_state = ScriptSetUsage::Verified; + } + } + }) + .or_insert_with(|| { + if !is_potential_mixed_script_confusable_char(ch) { + ScriptSetUsage::Verified + } else { + has_suspicous = true; + ScriptSetUsage::Suspicious(vec![ch], sp) + } + }); + } + } + + if has_suspicous { + let verified_augmented_script_sets = script_states + .iter() + .flat_map(|(k, v)| match v { + ScriptSetUsage::Verified => Some(*k), + _ => None, + }) + .collect::<Vec<_>>(); + + // we're sorting the output here. + let mut lint_reports: BTreeMap<(Span, Vec<char>), AugmentedScriptSet> = + BTreeMap::new(); + + 'outerloop: for (augment_script_set, usage) in script_states { + let (mut ch_list, sp) = match usage { + ScriptSetUsage::Verified => continue, + ScriptSetUsage::Suspicious(ch_list, sp) => (ch_list, sp), + }; + + if augment_script_set.is_all() { + continue; + } + + for existing in verified_augmented_script_sets.iter() { + if existing.is_all() { + continue; + } + let mut intersect = *existing; + intersect.intersect_with(augment_script_set); + if !intersect.is_empty() && !intersect.is_all() { + continue 'outerloop; + } + } + + ch_list.sort(); + ch_list.dedup(); + lint_reports.insert((sp, ch_list), augment_script_set); + } + + for ((sp, ch_list), script_set) in lint_reports { + cx.struct_span_lint(MIXED_SCRIPT_CONFUSABLES, sp, |lint| { + let message = format!( + "The usage of Script Group `{}` in this crate consists solely of mixed script confusables", + script_set); + let mut note = "The usage includes ".to_string(); + for (idx, ch) in
ch_list.into_iter().enumerate() { + if idx != 0 { + note += ", "; + } + let char_info = format!("'{}' (U+{:04X})", ch, ch as u32); + note += &char_info; + } + note += "."; + lint.build(&message).note(&note).note("Please recheck to make sure their usages are indeed what you want.").emit() + }); + } + } + } } } diff --git a/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-mixed-script-confusables-2.rs b/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-mixed-script-confusables-2.rs new file mode 100644 index 00000000000..a5b45466da5 --- /dev/null +++ b/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-mixed-script-confusables-2.rs @@ -0,0 +1,20 @@ +// check-pass +#![feature(non_ascii_idents)] +#![deny(mixed_script_confusables)] + +struct ΑctuallyNotLatin; + +fn main() { + let λ = 42; // this usage of Greek confirms that Greek is used intentionally. +} + +mod роре { + const エ: &'static str = "アイウ"; + + // this usage of Katakana confirms that Katakana is used intentionally. + fn ニャン() { + let д: usize = 100; // this usage of Cyrillic confirms that Cyrillic is used intentionally.
+ + println!("meow!"); + } +} diff --git a/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-mixed-script-confusables.rs b/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-mixed-script-confusables.rs new file mode 100644 index 00000000000..4637b03f250 --- /dev/null +++ b/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-mixed-script-confusables.rs @@ -0,0 +1,15 @@ +#![feature(non_ascii_idents)] +#![deny(mixed_script_confusables)] + +struct ΑctuallyNotLatin; +//~^ ERROR The usage of Script Group `Greek` in this crate consists solely of + +fn main() { + let v = ΑctuallyNotLatin; +} + +mod роре { +//~^ ERROR The usage of Script Group `Cyrillic` in this crate consists solely of + const エ: &'static str = "アイウ"; + //~^ ERROR The usage of Script Group `Japanese, Katakana` in this crate consists solely of +} diff --git a/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-mixed-script-confusables.stderr b/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-mixed-script-confusables.stderr new file mode 100644 index 00000000000..6f75a1ece37 --- /dev/null +++ b/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-mixed-script-confusables.stderr @@ -0,0 +1,34 @@ +error: The usage of Script Group `Greek` in this crate consists solely of mixed script confusables + --> $DIR/lint-mixed-script-confusables.rs:4:8 + | +LL | struct ΑctuallyNotLatin; + | ^^^^^^^^^^^^^^^^ + | +note: the lint level is defined here + --> $DIR/lint-mixed-script-confusables.rs:2:9 + | +LL | #![deny(mixed_script_confusables)] + | ^^^^^^^^^^^^^^^^^^^^^^^^ + = note: The usage includes 'Α' (U+0391). + = note: Please recheck to make sure their usages are indeed what you want. + +error: The usage of Script Group `Cyrillic` in this crate consists solely of mixed script confusables + --> $DIR/lint-mixed-script-confusables.rs:11:5 + | +LL | mod роре { + | ^^^^ + | + = note: The usage includes 'е' (U+0435), 'о' (U+043E), 'р' (U+0440). + = note: Please recheck to make sure their usages are indeed what you want. 
+ +error: The usage of Script Group `Japanese, Katakana` in this crate consists solely of mixed script confusables + --> $DIR/lint-mixed-script-confusables.rs:13:11 + | +LL | const エ: &'static str = "アイウ"; + | ^^ + | + = note: The usage includes 'エ' (U+30A8). + = note: Please recheck to make sure their usages are indeed what you want. + +error: aborting due to 3 previous errors + diff --git a/src/test/ui/utf8_idents.rs b/src/test/ui/utf8_idents.rs index f59d5502aae..6c54086cc20 100644 --- a/src/test/ui/utf8_idents.rs +++ b/src/test/ui/utf8_idents.rs @@ -1,3 +1,5 @@ +#![allow(mixed_script_confusables)] + fn foo< 'β, //~ ERROR non-ascii idents are not fully supported γ //~ ERROR non-ascii idents are not fully supported diff --git a/src/test/ui/utf8_idents.stderr b/src/test/ui/utf8_idents.stderr index 877412df8fa..2fc0b1c39ef 100644 --- a/src/test/ui/utf8_idents.stderr +++ b/src/test/ui/utf8_idents.stderr @@ -1,5 +1,5 @@ error[E0658]: non-ascii idents are not fully supported - --> $DIR/utf8_idents.rs:2:5 + --> $DIR/utf8_idents.rs:4:5 | LL | 'β, | ^^ @@ -8,7 +8,7 @@ LL | 'β, = help: add `#![feature(non_ascii_idents)]` to the crate attributes to enable error[E0658]: non-ascii idents are not fully supported - --> $DIR/utf8_idents.rs:3:5 + --> $DIR/utf8_idents.rs:5:5 | LL | γ | ^ @@ -17,7 +17,7 @@ LL | γ = help: add `#![feature(non_ascii_idents)]` to the crate attributes to enable error[E0658]: non-ascii idents are not fully supported - --> $DIR/utf8_idents.rs:8:5 + --> $DIR/utf8_idents.rs:10:5 | LL | δ: usize | ^ @@ -26,7 +26,7 @@ LL | δ: usize = help: add `#![feature(non_ascii_idents)]` to the crate attributes to enable error[E0658]: non-ascii idents are not fully supported - --> $DIR/utf8_idents.rs:12:9 + --> $DIR/utf8_idents.rs:14:9 | LL | let α = 0.00001f64; | ^ @@ -35,7 +35,7 @@ LL | let α = 0.00001f64; = help: add `#![feature(non_ascii_idents)]` to the crate attributes to enable warning: type parameter `γ` should have an upper camel case name - --> 
$DIR/utf8_idents.rs:3:5 + --> $DIR/utf8_idents.rs:5:5 | LL | γ | ^ help: convert the identifier to upper camel case: `Γ`