Merge pull request #299 from Manishearth/unicode_str

Unicode lints, second attempt: Lint whole strings, help with replacement. This fixes #85
2015-09-04 14:27:26 +02:00 · 2015-09-04 14:27:26 +02:00 · 0c50d763fc
commit 0c50d763fc
parent af4d7f9b60 28212e4981
5 changed files with 59 additions and 25 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -16,6 +16,9 @@ keywords = ["clippy", "lint", "plugin"]
 name = "clippy"
 plugin = true

+[dependencies]
+unicode-normalization = "*"
+
 [dev-dependencies]
 compiletest_rs = "*"
 regex = "*"
--- a/README.md
+++ b/README.md
@ -4,7 +4,7 @@
 A collection of lints that give helpful tips to newbies and catch oversights.

 ##Lints
-There are 53 lints included in this crate:
+There are 54 lints included in this crate:

 name                                                                                                 | default | meaning
 -----------------------------------------------------------------------------------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
@ -56,6 +56,7 @@ name
 [string_to_string](https://github.com/Manishearth/rust-clippy/wiki#string_to_string)                 | warn    | calling `String.to_string()` which is a no-op
 [toplevel_ref_arg](https://github.com/Manishearth/rust-clippy/wiki#toplevel_ref_arg)                 | warn    | a function argument is declared `ref` (i.e. `fn foo(ref x: u8)`, but not `fn foo((ref x, ref y): (u8, u8))`)
 [type_complexity](https://github.com/Manishearth/rust-clippy/wiki#type_complexity)                   | warn    | usage of very complex types; recommends factoring out parts into `type` definitions
+[unicode_not_nfc](https://github.com/Manishearth/rust-clippy/wiki#unicode_not_nfc)                   | allow   | using a unicode literal not in NFC normal form (see http://www.unicode.org/reports/tr15/ for further information)
 [unit_cmp](https://github.com/Manishearth/rust-clippy/wiki#unit_cmp)                                 | warn    | comparing unit values (which is always `true` or `false`, respectively)
 [unused_collect](https://github.com/Manishearth/rust-clippy/wiki#unused_collect)                     | warn    | `collect()`ing an iterator without using the result; this is usually better written as a for loop
 [while_let_loop](https://github.com/Manishearth/rust-clippy/wiki#while_let_loop)                     | warn    | `loop { if let { ... } else break }` can be written as a `while let` loop
--- a/src/lib.rs
+++ b/src/lib.rs
@ -14,6 +14,9 @@ extern crate rustc_front;
 extern crate core;
 extern crate collections;

+// for unicode nfc normalization
+extern crate unicode_normalization;
+
 use rustc::plugin::Registry;
 use rustc::lint::LintPassObject;

@ -96,6 +99,7 @@ pub fn plugin_registrar(reg: &mut Registry) {
        types::CAST_PRECISION_LOSS,
        types::CAST_SIGN_LOSS,
        unicode::NON_ASCII_LITERAL,
+        unicode::UNICODE_NOT_NFC,
    ]);

    reg.register_lint_group("clippy", vec![
--- a/src/unicode.rs
+++ b/src/unicode.rs
@ -1,49 +1,73 @@
 use rustc::lint::*;
 use rustc_front::hir::*;
-use syntax::codemap::{BytePos, Span};
+use syntax::codemap::Span;

-use utils::span_lint;
+use unicode_normalization::UnicodeNormalization;
+
+use utils::{snippet, span_help_and_lint};

 declare_lint!{ pub ZERO_WIDTH_SPACE, Deny,
               "using a zero-width space in a string literal, which is confusing" }
 declare_lint!{ pub NON_ASCII_LITERAL, Allow,
               "using any literal non-ASCII chars in a string literal; suggests \
                using the \\u escape instead" }
+declare_lint!{ pub UNICODE_NOT_NFC, Allow,
+               "using a unicode literal not in NFC normal form (see \
+               http://www.unicode.org/reports/tr15/ for further information)" }
+

 #[derive(Copy, Clone)]
 pub struct Unicode;

 impl LintPass for Unicode {
    fn get_lints(&self) -> LintArray {
-        lint_array!(ZERO_WIDTH_SPACE, NON_ASCII_LITERAL)
+        lint_array!(ZERO_WIDTH_SPACE, NON_ASCII_LITERAL, UNICODE_NOT_NFC)
    }

    fn check_expr(&mut self, cx: &Context, expr: &Expr) {
        if let ExprLit(ref lit) = expr.node {
-            if let LitStr(ref string, _) = lit.node {
-                check_str(cx, string, lit.span)
+            if let LitStr(_, _) = lit.node {
+                check_str(cx, lit.span)
            }
        }
    }
 }

-fn check_str(cx: &Context, string: &str, span: Span) {
-    for (i, c) in string.char_indices() {
-        if c == '\u{200B}' {
-            str_pos_lint(cx, ZERO_WIDTH_SPACE, span, i,
-                         "zero-width space detected. Consider using `\\u{200B}`");
-        }
+fn escape<T: Iterator<Item=char>>(s: T) -> String {
+    let mut result = String::new();
+    for c in s {
        if c as u32 > 0x7F {
-            str_pos_lint(cx, NON_ASCII_LITERAL, span, i, &format!(
-                "literal non-ASCII character detected. Consider using `\\u{{{:X}}}`", c as u32));
+            for d in c.escape_unicode() { result.push(d) };
+        } else {
+            result.push(c);
        }
    }
+    result
 }

-#[allow(cast_possible_truncation)]
-fn str_pos_lint(cx: &Context, lint: &'static Lint, span: Span, index: usize, msg: &str) {
-    span_lint(cx, lint, Span { lo: span.lo + BytePos((1 + index) as u32),
-                               hi: span.lo + BytePos((1 + index) as u32),
-                               expn_id: span.expn_id }, msg);
-
+fn check_str(cx: &Context, span: Span) {
+    let string = snippet(cx, span, "");
+    if string.contains('\u{200B}') {
+        span_help_and_lint(cx, ZERO_WIDTH_SPACE, span,
+            "zero-width space detected",
+            &format!("Consider replacing the string with:\n\"{}\"",
+                string.replace("\u{200B}", "\\u{200B}")));
+    }
+    if string.chars().any(|c| c as u32 > 0x7F) {
+        span_help_and_lint(cx, NON_ASCII_LITERAL, span,
+            "literal non-ASCII character detected",
+            &format!("Consider replacing the string with:\n\"{}\"",
+                if cx.current_level(UNICODE_NOT_NFC) == Level::Allow {
+                    escape(string.chars())
+                } else {
+                    escape(string.nfc())
+                }));
+    }
+    if cx.current_level(NON_ASCII_LITERAL) == Level::Allow &&
+            string.chars().zip(string.nfc()).any(|(a, b)| a != b) {
+        span_help_and_lint(cx, UNICODE_NOT_NFC, span,
+            "non-nfc unicode sequence detected",
+            &format!("Consider replacing the string with:\n\"{}\"",
+                string.nfc().collect::<String>()));
+    }
 }
--- a/tests/compile-fail/unicode.rs
+++ b/tests/compile-fail/unicode.rs
@ -4,18 +4,20 @@
 #[deny(zero_width_space)]
 fn zero() {
    print!("Here >< is a ZWS, and another");
-               //~^ ERROR zero-width space detected. Consider using `\u{200B}`
-                            //~^^ ERROR zero-width space detected. Consider using `\u{200B}`
+               //~^ ERROR zero-width space detected
+    print!("This\u{200B}is\u{200B}fine");
 }

-//#[deny(unicode_canon)]
+#[deny(unicode_not_nfc)]
 fn canon() {
-    print!("̀ah?"); //not yet ~ERROR non-canonical unicode sequence detected. Consider using à
+    print!("̀àh?"); //~ERROR non-nfc unicode sequence detected
+    print!("a\u{0300}h?"); // also okay
 }

 #[deny(non_ascii_literal)]
 fn uni() {
-    print!("Üben!"); //~ERROR literal non-ASCII character detected. Consider using `\u{DC}`
+    print!("Üben!"); //~ERROR literal non-ASCII character detected
+    print!("\u{DC}ben!"); // this is okay 
 }

 fn main() {