diff --git a/Cargo.toml b/Cargo.toml index 28fd03e5d6f..656efd312b2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,9 @@ keywords = ["clippy", "lint", "plugin"] name = "clippy" plugin = true +[dependencies] +unicode-normalization = "*" + [dev-dependencies] compiletest_rs = "*" regex = "*" diff --git a/README.md b/README.md index 8c00c9f901b..0046d891129 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ A collection of lints that give helpful tips to newbies and catch oversights. ##Lints -There are 53 lints included in this crate: +There are 54 lints included in this crate: name | default | meaning -----------------------------------------------------------------------------------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- @@ -56,6 +56,7 @@ name [string_to_string](https://github.com/Manishearth/rust-clippy/wiki#string_to_string) | warn | calling `String.to_string()` which is a no-op [toplevel_ref_arg](https://github.com/Manishearth/rust-clippy/wiki#toplevel_ref_arg) | warn | a function argument is declared `ref` (i.e. `fn foo(ref x: u8)`, but not `fn foo((ref x, ref y): (u8, u8))`) [type_complexity](https://github.com/Manishearth/rust-clippy/wiki#type_complexity) | warn | usage of very complex types; recommends factoring out parts into `type` definitions +[unicode_not_nfc](https://github.com/Manishearth/rust-clippy/wiki#unicode_not_nfc) | allow | using a unicode literal not in NFC normal form (see http://www.unicode.org/reports/tr15/ for further information) [unit_cmp](https://github.com/Manishearth/rust-clippy/wiki#unit_cmp) | warn | comparing unit values (which is always `true` or `false`, respectively) [unused_collect](https://github.com/Manishearth/rust-clippy/wiki#unused_collect) | warn | `collect()`ing an iterator without using the result; this is usually better written as a for loop [while_let_loop](https://github.com/Manishearth/rust-clippy/wiki#while_let_loop) | warn | `loop { if let { ... } else break }` can be written as a `while let` loop diff --git a/src/lib.rs b/src/lib.rs index e0556972a7d..a4aee0c27fd 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,6 +14,9 @@ extern crate rustc_front; extern crate core; extern crate collections; +// for unicode nfc normalization +extern crate unicode_normalization; + use rustc::plugin::Registry; use rustc::lint::LintPassObject; @@ -96,6 +99,7 @@ pub fn plugin_registrar(reg: &mut Registry) { types::CAST_PRECISION_LOSS, types::CAST_SIGN_LOSS, unicode::NON_ASCII_LITERAL, + unicode::UNICODE_NOT_NFC, ]); reg.register_lint_group("clippy", vec![ diff --git a/src/unicode.rs b/src/unicode.rs index a993da1782a..5e1af6f9818 100644 --- a/src/unicode.rs +++ b/src/unicode.rs @@ -1,21 +1,27 @@ use rustc::lint::*; use rustc_front::hir::*; -use syntax::codemap::{BytePos, Span}; +use syntax::codemap::Span; -use utils::span_lint; +use unicode_normalization::UnicodeNormalization; + +use utils::span_help_and_lint; declare_lint!{ pub ZERO_WIDTH_SPACE, Deny, "using a zero-width space in a string literal, which is confusing" } declare_lint!{ pub NON_ASCII_LITERAL, Allow, "using any literal non-ASCII chars in a string literal; suggests \ using the \\u escape instead" } +declare_lint!{ pub UNICODE_NOT_NFC, Allow, + "using a unicode literal not in NFC normal form (see \ + http://www.unicode.org/reports/tr15/ for further information)" } + #[derive(Copy, Clone)] pub struct Unicode; impl LintPass for Unicode { fn get_lints(&self) -> LintArray { - lint_array!(ZERO_WIDTH_SPACE, NON_ASCII_LITERAL) + lint_array!(ZERO_WIDTH_SPACE, NON_ASCII_LITERAL, UNICODE_NOT_NFC) } fn check_expr(&mut self, cx: &Context, expr: &Expr) { @@ -27,23 +33,41 @@ impl LintPass for Unicode { } } -fn check_str(cx: &Context, string: &str, span: Span) { - for (i, c) in string.char_indices() { - if c == '\u{200B}' { - str_pos_lint(cx, ZERO_WIDTH_SPACE, span, i, - "zero-width space detected. Consider using `\\u{200B}`"); - } +fn escape>(s: T) -> String { + let mut result = String::new(); + for c in s { if c as u32 > 0x7F { - str_pos_lint(cx, NON_ASCII_LITERAL, span, i, &format!( - "literal non-ASCII character detected. Consider using `\\u{{{:X}}}`", c as u32)); + for d in c.escape_unicode() { result.push(d) }; + } else { + result.push(c); + } + } + result +} + +fn check_str(cx: &Context, string: &str, span: Span) { + if string.contains('\u{200B}') { + span_help_and_lint(cx, ZERO_WIDTH_SPACE, span, + "zero-width space detected", + &format!("Consider replacing the string with:\n\"{}\"", + string.replace("\u{200B}", "\\u{200B}"))); + } + if string.chars().any(|c| c as u32 > 0x7F) { + span_help_and_lint(cx, NON_ASCII_LITERAL, span, + "literal non-ASCII character detected", + &format!("Consider replacing the string with:\n\"{}\"", + if cx.current_level(UNICODE_NOT_NFC) == Level::Allow { + escape(string.chars()) + } else { + escape(string.nfc()) + })); + } + if string.chars().zip(string.nfc()).any(|(a, b)| a != b) { + if cx.current_level(NON_ASCII_LITERAL) == Level::Allow { + span_help_and_lint(cx, UNICODE_NOT_NFC, span, + "non-nfc unicode sequence detected", + &format!("Consider replacing the string with:\n\"{}\"", + string.nfc().collect::())); } } } - -#[allow(cast_possible_truncation)] -fn str_pos_lint(cx: &Context, lint: &'static Lint, span: Span, index: usize, msg: &str) { - span_lint(cx, lint, Span { lo: span.lo + BytePos((1 + index) as u32), - hi: span.lo + BytePos((1 + index) as u32), - expn_id: span.expn_id }, msg); - -} diff --git a/tests/compile-fail/unicode.rs b/tests/compile-fail/unicode.rs index e4730f60de8..066825fc686 100755 --- a/tests/compile-fail/unicode.rs +++ b/tests/compile-fail/unicode.rs @@ -4,18 +4,17 @@ #[deny(zero_width_space)] fn zero() { print!("Here >​< is a ZWS, and ​another"); - //~^ ERROR zero-width space detected. Consider using `\u{200B}` - //~^^ ERROR zero-width space detected. Consider using `\u{200B}` + //~^ ERROR zero-width space detected } -//#[deny(unicode_canon)] +#[deny(unicode_not_nfc)] fn canon() { - print!("̀ah?"); //not yet ~ERROR non-canonical unicode sequence detected. Consider using à + print!("̀àh?"); //~ERROR non-nfc unicode sequence detected } #[deny(non_ascii_literal)] fn uni() { - print!("Üben!"); //~ERROR literal non-ASCII character detected. Consider using `\u{DC}` + print!("Üben!"); //~ERROR literal non-ASCII character detected } fn main() {