flatten rustc_lexer::character_properties module

At the call site, `rustc_lexer::is_whitespace` reads much better than `character_properties::is_whitespace`.
2019-09-04 13:16:36 +03:00 · 2019-09-04 13:16:36 +03:00 · 206fe8e1c3
parent a0c186c34f
commit 206fe8e1c3
7 changed files with 82 additions and 86 deletions
--- a/src/libfmt_macros/lib.rs
+++ b/src/libfmt_macros/lib.rs
@ -23,7 +23,6 @@ use std::string;
 use std::iter;

 use syntax_pos::{InnerSpan, Symbol};
-use rustc_lexer::character_properties::{is_id_start, is_id_continue};

 #[derive(Copy, Clone)]
 struct InnerOffset(usize);
@ -602,7 +601,7 @@ impl<'a> Parser<'a> {
    /// Rust identifier, except that it can't start with `_` character.
    fn word(&mut self) -> &'a str {
        let start = match self.cur.peek() {
-            Some(&(pos, c)) if c != '_' && is_id_start(c) => {
+            Some(&(pos, c)) if c != '_' && rustc_lexer::is_id_start(c) => {
                self.cur.next();
                pos
            }
@ -611,7 +610,7 @@ impl<'a> Parser<'a> {
            }
        };
        while let Some(&(pos, c)) = self.cur.peek() {
-            if is_id_continue(c) {
+            if rustc_lexer::is_id_continue(c) {
                self.cur.next();
            } else {
                return &self.input[start..pos];
--- a/src/librustc_lexer/src/lib.rs
+++ b/src/librustc_lexer/src/lib.rs
@ -102,6 +102,62 @@ pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ {
    })
 }

+// See [UAX #31](http://unicode.org/reports/tr31) for definitions of these
+// classes.
+
+/// Returns `true` if `c` is whitespace according to the Rust language definition.
+pub fn is_whitespace(c: char) -> bool {
+    // This is the Unicode Pattern_White_Space set.
+    //
+    // Note that this set is stable (i.e., it doesn't change between
+    // Unicode versions), so it's OK to just hard-code the values.
+
+    match c {
+        // The usual ASCII suspects
+        | '\u{0009}' // \t (horizontal tab)
+        | '\u{000A}' // \n (line feed)
+        | '\u{000B}' // vertical tab
+        | '\u{000C}' // form feed
+        | '\u{000D}' // \r (carriage return)
+        | '\u{0020}' // space
+
+        // NEXT LINE from Latin-1
+        | '\u{0085}'
+
+        // Bidi markers
+        | '\u{200E}' // LEFT-TO-RIGHT MARK
+        | '\u{200F}' // RIGHT-TO-LEFT MARK
+
+        // Dedicated whitespace characters from Unicode
+        | '\u{2028}' // LINE SEPARATOR
+        | '\u{2029}' // PARAGRAPH SEPARATOR
+            => true,
+        _ => false,
+    }
+}
+
+/// Returns `true` if `c` is valid as the first character of an identifier.
+pub fn is_id_start(c: char) -> bool {
+    // This is XID_Start OR '_' (which formally is not in XID_Start).
+    // ASCII is tested first as a fast path; `unicode_xid` is only consulted for non-ASCII chars.
+    ('a' <= c && c <= 'z')
+        || ('A' <= c && c <= 'Z')
+        || c == '_'
+        || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_start(c))
+}
+
+/// Returns `true` if `c` is valid as a non-first character of an identifier.
+pub fn is_id_continue(c: char) -> bool {
+    // This is exactly XID_Continue.
+    // ASCII is tested first as a fast path; `unicode_xid` is only consulted for non-ASCII chars.
+    ('a' <= c && c <= 'z')
+        || ('A' <= c && c <= 'Z')
+        || ('0' <= c && c <= '9')
+        || c == '_'
+        || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c))
+}
+
+
 impl Cursor<'_> {
    fn advance_token(&mut self) -> Token {
        let first_char = self.bump().unwrap();
@ -111,9 +167,9 @@ impl Cursor<'_> {
                '*' => self.block_comment(),
                _ => Slash,
            },
-            c if character_properties::is_whitespace(c) => self.whitespace(),
+            c if is_whitespace(c) => self.whitespace(),
            'r' => match (self.nth_char(0), self.nth_char(1)) {
-                ('#', c1) if character_properties::is_id_start(c1) => self.raw_ident(),
+                ('#', c1) if is_id_start(c1) => self.raw_ident(),
                ('#', _) | ('"', _) => {
                    let (n_hashes, started, terminated) = self.raw_double_quoted_string();
                    let suffix_start = self.len_consumed();
@ -158,7 +214,7 @@ impl Cursor<'_> {
                }
                _ => self.ident(),
            },
-            c if character_properties::is_id_start(c) => self.ident(),
+            c if is_id_start(c) => self.ident(),
            c @ '0'..='9' => {
                let literal_kind = self.number(c);
                let suffix_start = self.len_consumed();
@ -246,8 +302,8 @@ impl Cursor<'_> {
    }

    fn whitespace(&mut self) -> TokenKind {
-        debug_assert!(character_properties::is_whitespace(self.prev()));
-        while character_properties::is_whitespace(self.nth_char(0)) {
+        debug_assert!(is_whitespace(self.prev()));
+        while is_whitespace(self.nth_char(0)) {
            self.bump();
        }
        Whitespace
@ -257,19 +313,19 @@ impl Cursor<'_> {
        debug_assert!(
            self.prev() == 'r'
                && self.nth_char(0) == '#'
-                && character_properties::is_id_start(self.nth_char(1))
+                && is_id_start(self.nth_char(1))
        );
        self.bump();
        self.bump();
-        while character_properties::is_id_continue(self.nth_char(0)) {
+        while is_id_continue(self.nth_char(0)) {
            self.bump();
        }
        RawIdent
    }

    fn ident(&mut self) -> TokenKind {
-        debug_assert!(character_properties::is_id_start(self.prev()));
-        while character_properties::is_id_continue(self.nth_char(0)) {
+        debug_assert!(is_id_start(self.prev()));
+        while is_id_continue(self.nth_char(0)) {
            self.bump();
        }
        Ident
@ -314,7 +370,7 @@ impl Cursor<'_> {
            // integer literal followed by field/method access or a range pattern
            // (`0..2` and `12.foo()`)
            '.' if self.nth_char(1) != '.'
-                && !character_properties::is_id_start(self.nth_char(1)) =>
+                && !is_id_start(self.nth_char(1)) =>
            {
                // might have stuff after the ., and if it does, it needs to start
                // with a number
@ -344,7 +400,7 @@ impl Cursor<'_> {
    fn lifetime_or_char(&mut self) -> TokenKind {
        debug_assert!(self.prev() == '\'');
        let mut starts_with_number = false;
-        if (character_properties::is_id_start(self.nth_char(0))
+        if (is_id_start(self.nth_char(0))
            || self.nth_char(0).is_digit(10) && {
                starts_with_number = true;
                true
@ -352,7 +408,7 @@ impl Cursor<'_> {
            && self.nth_char(1) != '\''
        {
            self.bump();
-            while character_properties::is_id_continue(self.nth_char(0)) {
+            while is_id_continue(self.nth_char(0)) {
                self.bump();
            }

@ -494,64 +550,13 @@ impl Cursor<'_> {
    }

    fn eat_literal_suffix(&mut self) {
-        if !character_properties::is_id_start(self.nth_char(0)) {
+        if !is_id_start(self.nth_char(0)) {
            return;
        }
        self.bump();

-        while character_properties::is_id_continue(self.nth_char(0)) {
+        while is_id_continue(self.nth_char(0)) {
            self.bump();
        }
    }
 }
-
-pub mod character_properties {
-    // See [UAX #31](http://unicode.org/reports/tr31) for definitions of these
-    // classes.
-
-    // This is Pattern_White_Space.
-    //
-    // Note that this set is stable (ie, it doesn't change with different
-    // Unicode versions), so it's ok to just hard-code the values.
-    pub fn is_whitespace(c: char) -> bool {
-        match c {
-            // Usual ASCII suspects
-            | '\u{0009}' // \t
-            | '\u{000A}' // \n
-            | '\u{000B}' // vertical tab
-            | '\u{000C}' // form feed
-            | '\u{000D}' // \r
-            | '\u{0020}' // space
-
-            // NEXT LINE from latin1
-            | '\u{0085}'
-
-            // Bidi markers
-            | '\u{200E}' // LEFT-TO-RIGHT MARK
-            | '\u{200F}' // RIGHT-TO-LEFT MARK
-
-            // Dedicated whitespace characters from Unicode
-            | '\u{2028}' // LINE SEPARATOR
-            | '\u{2029}' // PARAGRAPH SEPARATOR
-              => true,
-            _ => false,
-        }
-    }
-
-    // This is XID_Start OR '_' (which formally is not a XID_Start).
-    pub fn is_id_start(c: char) -> bool {
-        ('a' <= c && c <= 'z')
-            || ('A' <= c && c <= 'Z')
-            || c == '_'
-            || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_start(c))
-    }
-
-    // This is XID_Continue.
-    pub fn is_id_continue(c: char) -> bool {
-        ('a' <= c && c <= 'z')
-            || ('A' <= c && c <= 'Z')
-            || ('0' <= c && c <= '9')
-            || c == '_'
-            || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c))
-    }
-}
--- a/src/librustc_mir/borrow_check/move_errors.rs
+++ b/src/librustc_mir/borrow_check/move_errors.rs
@ -1,7 +1,6 @@
 use rustc::mir::*;
 use rustc::ty;
 use rustc_errors::{DiagnosticBuilder,Applicability};
-use rustc_lexer::character_properties::is_whitespace;
 use syntax_pos::Span;

 use crate::borrow_check::MirBorrowckCtxt;
@ -525,7 +524,7 @@ impl<'a, 'tcx> MirBorrowckCtxt<'a, 'tcx> {
                        let suggestion;
                        let to_remove;
                        if pat_snippet.starts_with("mut")
-                            && pat_snippet["mut".len()..].starts_with(is_whitespace)
+                            && pat_snippet["mut".len()..].starts_with(rustc_lexer::is_whitespace)
                        {
                            suggestion = pat_snippet["mut".len()..].trim_start();
                            to_remove = "&mut";
--- a/src/librustc_mir/borrow_check/mutability_errors.rs
+++ b/src/librustc_mir/borrow_check/mutability_errors.rs
@ -1,4 +1,3 @@
-use rustc_lexer::character_properties::is_whitespace;
 use rustc::hir;
 use rustc::hir::Node;
 use rustc::mir::{self, BindingForm, ClearCrossCrate, Local, Location, Body};
@ -715,7 +714,7 @@ fn annotate_struct_field(
 fn suggest_ref_mut(tcx: TyCtxt<'_>, binding_span: Span) -> Option<String> {
    let hi_src = tcx.sess.source_map().span_to_snippet(binding_span).ok()?;
    if hi_src.starts_with("ref")
-        && hi_src["ref".len()..].starts_with(is_whitespace)
+        && hi_src["ref".len()..].starts_with(rustc_lexer::is_whitespace)
    {
        let replacement = format!("ref mut{}", &hi_src["ref".len()..]);
        Some(replacement)
--- a/src/librustdoc/test.rs
+++ b/src/librustdoc/test.rs
@ -4,7 +4,6 @@ use rustc::hir;
 use rustc::hir::intravisit;
 use rustc::session::{self, config, DiagnosticOutput};
 use rustc::util::common::ErrorReported;
-use rustc_lexer::character_properties::{is_id_start, is_id_continue};
 use syntax::ast;
 use syntax::with_globals;
 use syntax::source_map::SourceMap;
@ -764,8 +763,8 @@ impl Tester for Collector {
            // We use these headings as test names, so it's good if
            // they're valid identifiers.
            let name = name.chars().enumerate().map(|(i, c)| {
-                    if (i == 0 && is_id_start(c)) ||
-                        (i != 0 && is_id_continue(c)) {
+                    if (i == 0 && rustc_lexer::is_id_start(c)) ||
+                        (i != 0 && rustc_lexer::is_id_continue(c)) {
                        c
                    } else {
                        '_'
--- a/src/libsyntax/ext/proc_macro_server.rs
+++ b/src/libsyntax/ext/proc_macro_server.rs
@ -6,7 +6,6 @@ use crate::tokenstream::{self, DelimSpan, IsJoint::*, TokenStream, TreeAndJoint}

 use errors::{Diagnostic, DiagnosticBuilder};
 use rustc_data_structures::sync::Lrc;
-use rustc_lexer::character_properties::{is_id_start, is_id_continue};
 use syntax_pos::{BytePos, FileName, MultiSpan, Pos, SourceFile, Span};
 use syntax_pos::symbol::{kw, sym, Symbol};

@ -323,7 +322,7 @@ impl Ident {
    fn is_valid(string: &str) -> bool {
        let mut chars = string.chars();
        if let Some(start) = chars.next() {
-            is_id_start(start) && chars.all(is_id_continue)
+            rustc_lexer::is_id_start(start) && chars.all(rustc_lexer::is_id_continue)
        } else {
            false
        }
--- a/src/libsyntax/tests.rs
+++ b/src/libsyntax/tests.rs
@ -63,7 +63,7 @@ crate fn matches_codepattern(a : &str, b : &str) -> bool {
            (None, None) => return true,
            (None, _) => return false,
            (Some(&a), None) => {
-                if is_pattern_whitespace(a) {
+                if rustc_lexer::is_whitespace(a) {
                    break // trailing whitespace check is out of loop for borrowck
                } else {
                    return false
@ -72,11 +72,11 @@ crate fn matches_codepattern(a : &str, b : &str) -> bool {
            (Some(&a), Some(&b)) => (a, b)
        };

-        if is_pattern_whitespace(a) && is_pattern_whitespace(b) {
+        if rustc_lexer::is_whitespace(a) && rustc_lexer::is_whitespace(b) {
            // skip whitespace for a and b
            scan_for_non_ws_or_end(&mut a_iter);
            scan_for_non_ws_or_end(&mut b_iter);
-        } else if is_pattern_whitespace(a) {
+        } else if rustc_lexer::is_whitespace(a) {
            // skip whitespace for a
            scan_for_non_ws_or_end(&mut a_iter);
        } else if a == b {
@ -88,20 +88,16 @@ crate fn matches_codepattern(a : &str, b : &str) -> bool {
    }

    // check if a has *only* trailing whitespace
-    a_iter.all(is_pattern_whitespace)
+    a_iter.all(rustc_lexer::is_whitespace)
 }

 /// Advances the given peekable `Iterator` until it reaches a non-whitespace character
 fn scan_for_non_ws_or_end<I: Iterator<Item = char>>(iter: &mut Peekable<I>) {
-    while iter.peek().copied().map(|c| is_pattern_whitespace(c)) == Some(true) {
+    while iter.peek().copied().map(|c| rustc_lexer::is_whitespace(c)) == Some(true) {
        iter.next();
    }
 }

-fn is_pattern_whitespace(c: char) -> bool {
-    rustc_lexer::character_properties::is_whitespace(c)
-}
-
 /// Identify a position in the text by the Nth occurrence of a string.
 struct Position {
    string: &'static str,