diff --git a/Cargo.lock b/Cargo.lock index 5132f77e578..02717c85ccf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3696,6 +3696,7 @@ dependencies = [ "smallvec 1.0.0", "syntax", "syntax_pos", + "unicode-normalization", ] [[package]] @@ -4913,9 +4914,12 @@ dependencies = [ [[package]] name = "unicode-normalization" -version = "0.1.7" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a0180bc61fc5a987082bfa111f4cc95c4caff7f9799f3e46df09163a937aa25" +checksum = "b561e267b2326bb4cebfc0ef9e68355c7abe6c6f522aeac2f5bf95d56c59bdcf" +dependencies = [ + "smallvec 1.0.0", +] [[package]] name = "unicode-segmentation" diff --git a/src/librustc_parse/Cargo.toml b/src/librustc_parse/Cargo.toml index fb5cb742ab6..73458a444f4 100644 --- a/src/librustc_parse/Cargo.toml +++ b/src/librustc_parse/Cargo.toml @@ -20,3 +20,4 @@ rustc_error_codes = { path = "../librustc_error_codes" } smallvec = { version = "1.0", features = ["union", "may_dangle"] } syntax_pos = { path = "../libsyntax_pos" } syntax = { path = "../libsyntax" } +unicode-normalization = "0.1.11" diff --git a/src/librustc_parse/lexer/mod.rs b/src/librustc_parse/lexer/mod.rs index e5d3927af86..d69cd14d544 100644 --- a/src/librustc_parse/lexer/mod.rs +++ b/src/librustc_parse/lexer/mod.rs @@ -220,8 +220,7 @@ impl<'a> StringReader<'a> { if is_raw_ident { ident_start = ident_start + BytePos(2); } - // FIXME: perform NFKC normalization here. (Issue #2253) - let sym = self.symbol_from(ident_start); + let sym = self.nfc_symbol_from(ident_start); if is_raw_ident { let span = self.mk_sp(start, self.pos); if !sym.can_be_raw() { @@ -470,6 +469,20 @@ impl<'a> StringReader<'a> { Symbol::intern(self.str_from_to(start, end)) } + /// As symbol_from, with the text normalized into Unicode NFC form. 
+    fn nfc_symbol_from(&self, start: BytePos) -> Symbol {
+        use unicode_normalization::{is_nfc_quick, IsNormalized, UnicodeNormalization};
+        debug!("taking an normalized ident from {:?} to {:?}", start, self.pos); // NOTE(review): typo in message (an -> a); runtime text, left as-is
+        let sym = self.str_from(start); // raw text beginning at `start`; nothing is interned yet despite the name
+        match is_nfc_quick(sym.chars()) { // quick check; only a Yes result skips the normalization below
+            IsNormalized::Yes => Symbol::intern(sym), // already NFC: intern the text unchanged
+            _ => { // any result other than Yes gets normalized in full
+                let sym_str: String = sym.chars().nfc().collect(); // NFC-normalized copy collected into a new String
+                Symbol::intern(&sym_str)
+            }
+        }
+    }
+
     /// Slice of the source text spanning from `start` up to but excluding `end`.
     fn str_from_to(&self, start: BytePos, end: BytePos) -> &str {
         &self.src[self.src_index(start)..self.src_index(end)]