From 4e2ddcb879d225e7d22fbf4af0536c06203b8d94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Malo=20Jaffr=C3=A9?= Date: Sun, 6 Aug 2017 17:36:50 +0200 Subject: [PATCH 1/2] Update the list of confusable characters Also reorder and space the list to make it clearer for futures updates and to come closer to the original list. Thanks @est31 for the instructions. Fixes #43629. r? @est31 --- src/libsyntax/parse/lexer/unicode_chars.rs | 144 ++++++++++++++++++--- 1 file changed, 125 insertions(+), 19 deletions(-) diff --git a/src/libsyntax/parse/lexer/unicode_chars.rs b/src/libsyntax/parse/lexer/unicode_chars.rs index 83a164bdb96..cc38021b7aa 100644 --- a/src/libsyntax/parse/lexer/unicode_chars.rs +++ b/src/libsyntax/parse/lexer/unicode_chars.rs @@ -1,4 +1,4 @@ -// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT +// Copyright 2012-2017 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. // @@ -9,15 +9,16 @@ // except according to those terms. // Characters and their corresponding confusables were collected from -// http://www.unicode.org/Public/security/revision-06/confusables.txt +// http://www.unicode.org/Public/security/10.0.0/confusables.txt use syntax_pos::{Span, NO_EXPANSION}; use errors::DiagnosticBuilder; use super::StringReader; const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[ - (' ', "No-Break Space", ' '), - (' ', "Ogham Space Mark", ' '), + ('
', "Line Separator", ' '), + ('
', "Paragraph Separator", ' '), + (' ', "Ogham Space mark", ' '), (' ', "En Quad", ' '), (' ', "Em Quad", ' '), (' ', "En Space", ' '), @@ -25,39 +26,63 @@ const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[ (' ', "Three-Per-Em Space", ' '), (' ', "Four-Per-Em Space", ' '), (' ', "Six-Per-Em Space", ' '), - (' ', "Figure Space", ' '), (' ', "Punctuation Space", ' '), (' ', "Thin Space", ' '), (' ', "Hair Space", ' '), - (' ', "Narrow No-Break Space", ' '), (' ', "Medium Mathematical Space", ' '), + (' ', "No-Break Space", ' '), + (' ', "Figure Space", ' '), + (' ', "Narrow No-Break Space", ' '), (' ', "Ideographic Space", ' '), + ('ߺ', "Nko Lajanyalan", '_'), ('﹍', "Dashed Low Line", '_'), ('﹎', "Centreline Low Line", '_'), ('﹏', "Wavy Low Line", '_'), + ('_', "Fullwidth Low Line", '-'), + ('‐', "Hyphen", '-'), ('‑', "Non-Breaking Hyphen", '-'), ('‒', "Figure Dash", '-'), ('–', "En Dash", '-'), ('—', "Em Dash", '-'), ('﹘', "Small Em Dash", '-'), + ('۔', "Arabic Full Stop", '-'), ('⁃', "Hyphen Bullet", '-'), ('˗', "Modifier Letter Minus Sign", '-'), ('−', "Minus Sign", '-'), + ('➖', "Heavy Minus Sign", '-'), + ('Ⲻ', "Coptic Letter Dialect-P Ni", '-'), ('ー', "Katakana-Hiragana Prolonged Sound Mark", '-'), + ('-', "Fullwidth Hyphen-Minus", '-'), + ('―', "Horizontal Bar", '-'), + ('─', "Box Drawings Light Horizontal", '-'), + ('━', "Box Drawings Heavy Horizontal", '-'), + ('㇐', "CJK Stroke H", '-'), + ('ꟷ', "Latin Epigraphic Letter Dideways", '-'), + ('ᅳ', "Hangul Jungseong Eu", '-'), + ('ㅡ', "Hangul Letter Eu", '-'), + ('一', "CJK Unified Ideograph-4E00", '-'), + ('⼀', "Kangxi Radical One", '-'), + + ('؍', "Arabic Date Separator", ','), ('٫', "Arabic Decimal Separator", ','), ('‚', "Single Low-9 Quotation Mark", ','), + ('¸', "Cedilla", ','), ('ꓹ', "Lisu Letter Tone Na Po", ','), (',', "Fullwidth Comma", ','), + (';', "Greek Question Mark", ';'), (';', "Fullwidth Semicolon", ';'), + ('︔', "Presentation Form For Vertical Semicolon", ';'), + ('ः', "Devanagari Sign Visarga", ':'), ('ઃ', "Gujarati Sign Visarga", ':'), (':', "Fullwidth Colon", ':'), ('։', "Armenian Full Stop", ':'), ('܃', "Syriac Supralinear Colon", ':'), ('܄', "Syriac Sublinear Colon", ':'), + ('᛬', "Runic Multiple Ponctuation", ':'), ('︰', "Presentation Form For Vertical Two Dot Leader", ':'), ('᠃', "Mongolian Full Stop", ':'), ('᠉', "Mongolian Manchu Full Stop", ':'), @@ -68,25 +93,48 @@ const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[ ('∶', "Ratio", ':'), ('ː', "Modifier Letter Triangular Colon", ':'), ('ꓽ', "Lisu Letter Tone Mya Jeu", ':'), + ('︓', "Presentation Form For Vertical Colon", ':'), + ('!', "Fullwidth Exclamation Mark", '!'), ('ǃ', "Latin Letter Retroflex Click", '!'), + ('ⵑ', "Tifinagh Letter Tuareg Yang", '!'), + ('︕', "Presentation Form For Vertical Exclamation Mark", '!'), + ('ʔ', "Latin Letter Glottal Stop", '?'), + ('Ɂ', "Latin Capital Letter Glottal Stop", '?'), ('ॽ', "Devanagari Letter Glottal Stop", '?'), ('Ꭾ', "Cherokee Letter He", '?'), + ('ꛫ', "Bamum Letter Ntuu", '?'), ('?', "Fullwidth Question Mark", '?'), + ('︖', "Presentation Form For Vertical Question Mark", '?'), + ('𝅭', "Musical Symbol Combining Augmentation Dot", '.'), ('․', "One Dot Leader", '.'), - ('۔', "Arabic Full Stop", '.'), ('܁', "Syriac Supralinear Full Stop", '.'), ('܂', "Syriac Sublinear Full Stop", '.'), ('꘎', "Vai Full Stop", '.'), ('𐩐', "Kharoshthi Punctuation Dot", '.'), - ('·', "Middle Dot", '.'), ('٠', "Arabic-Indic Digit Zero", '.'), ('۰', "Extended Arabic-Indic Digit Zero", '.'), ('ꓸ', "Lisu Letter Tone Mya Ti", '.'), - ('。', "Ideographic Full Stop", '.'), + ('·', "Middle Dot", '.'), ('・', "Katakana Middle Dot", '.'), + ('・', "Halfwidth Katakana Middle Dot", '.'), + ('᛫', "Runic Single Punctuation", '.'), + ('·', "Greek Ano Teleia", '.'), + ('⸱', "Word Separator Middle Dot", '.'), + ('𐄁', "Aegean Word Separator Dot", '.'), + ('•', "Bullet", '.'), + ('‧', "Hyphenation Point", '.'), + ('∙', "Bullet Operator", '.'), + ('⋅', "Dot Operator", '.'), + ('ꞏ', "Latin Letter Sinological Dot", '.'), + ('ᐧ', "Canadian Syllabics Final Middle Dot", '.'), + ('ᐧ', "Canadian Syllabics Final Middle Dot", '.'), + ('.', "Fullwidth Full Stop", '.'), + ('。', "Ideographic Full Stop", '.'), + ('︒', "Presentation Form For Vertical Ideographic Full Stop", '.'), + ('՝', "Armenian Comma", '\''), (''', "Fullwidth Apostrophe", '\''), ('‘', "Left Single Quotation Mark", '\''), @@ -96,8 +144,10 @@ const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[ ('‵', "Reversed Prime", '\''), ('՚', "Armenian Apostrophe", '\''), ('׳', "Hebrew Punctuation Geresh", '\''), + ('`', "Greek Accent", '\''), ('`', "Greek Varia", '\''), ('`', "Fullwidth Grave Accent", '\''), + ('´', "Acute Accent", '\''), ('΄', "Greek Tonos", '\''), ('´', "Greek Oxia", '\''), ('᾽', "Greek Koronis", '\''), @@ -105,6 +155,7 @@ const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[ ('῾', "Greek Dasia", '\''), ('ʹ', "Modifier Letter Prime", '\''), ('ʹ', "Greek Numeral Sign", '\''), + ('ˈ', "Modifier Letter Vertical Line", '\''), ('ˊ', "Modifier Letter Acute Accent", '\''), ('ˋ', "Modifier Letter Grave Accent", '\''), ('˴', "Modifier Letter Middle Grave Accent", '\''), @@ -116,6 +167,12 @@ const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[ ('י', "Hebrew Letter Yod", '\''), ('ߴ', "Nko High Tone Apostrophe", '\''), ('ߵ', "Nko Low Tone Apostrophe", '\''), + ('ᑊ', "Canadian Syllabics West-Cree P", '\''), + ('ᛌ', "Runic Letter Short-Twig-Sol S", '\''), + ('𖽑', "Miao Sign Aspiration", '\''), + ('𖽒', "Miao Sign Reformed Voicing", '\''), + + ('᳓', "Vedic Sign Nihshvasa", '"'), ('"', "Fullwidth Quotation Mark", '"'), ('“', "Left Double Quotation Mark", '"'), ('”', "Right Double Quotation Mark", '"'), @@ -132,12 +189,15 @@ const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[ ('ײ', "Hebrew Ligature Yiddish Double Yod", '"'), ('❞', "Heavy Double Comma Quotation Mark Ornament", '"'), ('❝', "Heavy Double Turned Comma Quotation Mark Ornament", '"'), + + ('(', "Fullwidth Left Parenthesis", '('), ('❨', "Medium Left Parenthesis Ornament", '('), ('﴾', "Ornate Left Parenthesis", '('), - ('(', "Fullwidth Left Parenthesis", '('), + + (')', "Fullwidth Right Parenthesis", ')'), ('❩', "Medium Right Parenthesis Ornament", ')'), ('﴿', "Ornate Right Parenthesis", ')'), - (')', "Fullwidth Right Parenthesis", ')'), + ('[', "Fullwidth Left Square Bracket", '['), ('❲', "Light Left Tortoise Shell Bracket Ornament", '['), ('「', "Left Corner Bracket", '['), @@ -147,6 +207,7 @@ const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[ ('〖', "Left White Lenticular Bracket", '['), ('〘', "Left White Tortoise Shell Bracket", '['), ('〚', "Left White Square Bracket", '['), + (']', "Fullwidth Right Square Bracket", ']'), ('❳', "Light Right Tortoise Shell Bracket Ornament", ']'), ('」', "Right Corner Bracket", ']'), @@ -156,11 +217,20 @@ const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[ ('〗', "Right White Lenticular Bracket", ']'), ('〙', "Right White Tortoise Shell Bracket", ']'), ('〛', "Right White Square Bracket", ']'), + ('❴', "Medium Left Curly Bracket Ornament", '{'), + ('𝄔', "Musical Symbol Brace", '{'), + ('{', "Fullwidth Left Curly Bracket", '{'), + ('❵', "Medium Right Curly Bracket Ornament", '}'), + ('}', "Fullwidth Right Curly Bracket", '}'), + ('⁎', "Low Asterisk", '*'), ('٭', "Arabic Five Pointed Star", '*'), ('∗', "Asterisk Operator", '*'), + ('𐌟', "Old Italic Letter Ess", '*'), + ('*', "Fullwidth Asterisk", '*'), + ('᜵', "Philippine Single Punctuation", '/'), ('⁁', "Caret Insertion Point", '/'), ('∕', "Division Slash", '/'), @@ -168,37 +238,73 @@ const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[ ('╱', "Box Drawings Light Diagonal Upper Right To Lower Left", '/'), ('⟋', "Mathematical Rising Diagonal", '/'), ('⧸', "Big Solidus", '/'), - ('㇓', "Cjk Stroke Sp", '/'), + ('𝈺', "Greek Instrumental Notation Symbol-47", '/'), + ('㇓', "CJK Stroke Sp", '/'), ('〳', "Vertical Kana Repeat Mark Upper Half", '/'), - ('丿', "Cjk Unified Ideograph-4E3F", '/'), + ('Ⳇ', "Coptic Capital Letter Old Coptic Esh", '/'), + ('ノ', "Katakana Letter No", '/'), + ('丿', "CJK Unified Ideograph-4E3F", '/'), ('⼃', "Kangxi Radical Slash", '/'), + ('/', "Fullwidth Solidus", '/'), + ('\', "Fullwidth Reverse Solidus", '\\'), ('﹨', "Small Reverse Solidus", '\\'), ('∖', "Set Minus", '\\'), ('⟍', "Mathematical Falling Diagonal", '\\'), ('⧵', "Reverse Solidus Operator", '\\'), ('⧹', "Big Reverse Solidus", '\\'), + ('⧹', "Greek Vocal Notation Symbol-16", '\\'), + ('⧹', "Greek Instrumental Symbol-48", '\\'), + ('㇔', "CJK Stroke D", '\\'), + ('丶', "CJK Unified Ideograph-4E36", '\\'), + ('⼂', "Kangxi Radical Dot", '\\'), ('、', "Ideographic Comma", '\\'), ('ヽ', "Katakana Iteration Mark", '\\'), - ('㇔', "Cjk Stroke D", '\\'), - ('丶', "Cjk Unified Ideograph-4E36", '\\'), - ('⼂', "Kangxi Radical Dot", '\\'), + ('ꝸ', "Latin Small Letter Um", '&'), + ('&', "Fullwidth Ampersand", '&'), + + ('᛭', "Runic Cros Punctuation", '+'), + ('➕', "Heavy Plus Sign", '+'), + ('𐊛', "Lycian Letter H", '+'), ('﬩', "Hebrew Letter Alternative Plus Sign", '+'), + ('+', "Fullwidth Plus Sign", '+'), + ('‹', "Single Left-Pointing Angle Quotation Mark", '<'), ('❮', "Heavy Left-Pointing Angle Quotation Mark Ornament", '<'), ('˂', "Modifier Letter Left Arrowhead", '<'), + ('𝈶', "Greek Instrumental Symbol-40", '<'), + ('ᐸ', "Canadian Syllabics Pa", '<'), + ('ᚲ', "Runic Letter Kauna", '<'), + ('❬', "Medium Left-Pointing Angle Bracket Ornament", '<'), + ('⟨', "Mathematical Left Angle Bracket", '<'), + ('〈', "Left-Pointing Angle Bracket", '<'), ('〈', "Left Angle Bracket", '<'), + ('㇛', "CJK Stroke Pd", '<'), + ('く', "Hiragana Letter Ku", '<'), + ('𡿨', "CJK Unified Ideograph-21FE8", '<'), ('《', "Left Double Angle Bracket", '<'), + ('<', "Fullwidth Less-Than Sign", '<'), + + ('᐀', "Canadian Syllabics Hyphen", '='), + ('⹀', "Double Hyphen", '='), + ('゠', "Katakana-Hiragana Double Hyphen", '='), ('꓿', "Lisu Punctuation Full Stop", '='), + ('=', "Fullwidth Equals Sign", '='), + ('›', "Single Right-Pointing Angle Quotation Mark", '>'), ('❯', "Heavy Right-Pointing Angle Quotation Mark Ornament", '>'), ('˃', "Modifier Letter Right Arrowhead", '>'), + ('𝈷', "Greek Instrumental Symbol-42", '>'), + ('ᐳ', "Canadian Syllabics Po", '>'), + ('𖼿', "Miao Letter Archaic Zza", '>'), + ('❭', "Medium Right-Pointing Angle Bracket Ornament", '>'), + ('⟩', "Mathematical Right Angle Bracket", '>'), + ('〉', "Right-Pointing Angle Bracket", '>'), ('〉', "Right Angle Bracket", '>'), ('》', "Right Double Angle Bracket", '>'), - ('Ⲻ', "Coptic Capital Letter Dialect-P Ni", '-'), - ('Ɂ', "Latin Capital Letter Glottal Stop", '?'), - ('Ⳇ', "Coptic Capital Letter Old Coptic Esh", '/'), ]; + ('>', "Fullwidth Greater-Than Sign", '>'), ]; + const ASCII_ARRAY: &'static [(char, &'static str)] = &[ (' ', "Space"), From 5e29bb91b0865bdc54864d0491fa6324aafb6fad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Malo=20Jaffr=C3=A9?= Date: Sun, 6 Aug 2017 18:34:36 +0200 Subject: [PATCH 2/2] Fix typo in unicode_chars.rs --- src/libsyntax/parse/lexer/unicode_chars.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libsyntax/parse/lexer/unicode_chars.rs b/src/libsyntax/parse/lexer/unicode_chars.rs index cc38021b7aa..85df4eee913 100644 --- a/src/libsyntax/parse/lexer/unicode_chars.rs +++ b/src/libsyntax/parse/lexer/unicode_chars.rs @@ -39,7 +39,7 @@ const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[ ('﹍', "Dashed Low Line", '_'), ('﹎', "Centreline Low Line", '_'), ('﹏', "Wavy Low Line", '_'), - ('_', "Fullwidth Low Line", '-'), + ('_', "Fullwidth Low Line", '_'), ('‐', "Hyphen", '-'), ('‑', "Non-Breaking Hyphen", '-'),