diff --git a/src/libfmt_macros/lib.rs b/src/libfmt_macros/lib.rs index 98fa2bd0615..f9c1be20b8b 100644 --- a/src/libfmt_macros/lib.rs +++ b/src/libfmt_macros/lib.rs @@ -23,7 +23,6 @@ use std::string; use std::iter; use syntax_pos::{InnerSpan, Symbol}; -use rustc_lexer::character_properties::{is_id_start, is_id_continue}; #[derive(Copy, Clone)] struct InnerOffset(usize); @@ -602,7 +601,7 @@ impl<'a> Parser<'a> { /// Rust identifier, except that it can't start with `_` character. fn word(&mut self) -> &'a str { let start = match self.cur.peek() { - Some(&(pos, c)) if c != '_' && is_id_start(c) => { + Some(&(pos, c)) if c != '_' && rustc_lexer::is_id_start(c) => { self.cur.next(); pos } @@ -611,7 +610,7 @@ impl<'a> Parser<'a> { } }; while let Some(&(pos, c)) = self.cur.peek() { - if is_id_continue(c) { + if rustc_lexer::is_id_continue(c) { self.cur.next(); } else { return &self.input[start..pos]; diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs index 26e5e6fc8c4..30a5175d8cd 100644 --- a/src/librustc_lexer/src/lib.rs +++ b/src/librustc_lexer/src/lib.rs @@ -102,6 +102,62 @@ pub fn tokenize(mut input: &str) -> impl Iterator + '_ { }) } +// See [UAX #31](http://unicode.org/reports/tr31) for definitions of these +// classes. + +/// True if `c` is considered a whitespace according to Rust language definition. +pub fn is_whitespace(c: char) -> bool { + // This is Pattern_White_Space. + // + // Note that this set is stable (ie, it doesn't change with different + // Unicode versions), so it's ok to just hard-code the values. + + match c { + // Usual ASCII suspects + | '\u{0009}' // \t + | '\u{000A}' // \n + | '\u{000B}' // vertical tab + | '\u{000C}' // form feed + | '\u{000D}' // \r + | '\u{0020}' // space + + // NEXT LINE from latin1 + | '\u{0085}' + + // Bidi markers + | '\u{200E}' // LEFT-TO-RIGHT MARK + | '\u{200F}' // RIGHT-TO-LEFT MARK + + // Dedicated whitespace characters from Unicode + | '\u{2028}' // LINE SEPARATOR + | '\u{2029}' // PARAGRAPH SEPARATOR + => true, + _ => false, + } +} + +/// True if `c` is valid as a first character of an identifier. +pub fn is_id_start(c: char) -> bool { + // This is XID_Start OR '_' (which formally is not a XID_Start). + // We also add fast-path for ascii idents + ('a' <= c && c <= 'z') + || ('A' <= c && c <= 'Z') + || c == '_' + || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_start(c)) +} + +/// True if `c` is valid as a non-first character of an identifier. +pub fn is_id_continue(c: char) -> bool { + // This is exactly XID_Continue. + // We also add fast-path for ascii idents + ('a' <= c && c <= 'z') + || ('A' <= c && c <= 'Z') + || ('0' <= c && c <= '9') + || c == '_' + || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c)) +} + + impl Cursor<'_> { fn advance_token(&mut self) -> Token { let first_char = self.bump().unwrap(); @@ -111,9 +167,9 @@ impl Cursor<'_> { '*' => self.block_comment(), _ => Slash, }, - c if character_properties::is_whitespace(c) => self.whitespace(), + c if is_whitespace(c) => self.whitespace(), 'r' => match (self.nth_char(0), self.nth_char(1)) { - ('#', c1) if character_properties::is_id_start(c1) => self.raw_ident(), + ('#', c1) if is_id_start(c1) => self.raw_ident(), ('#', _) | ('"', _) => { let (n_hashes, started, terminated) = self.raw_double_quoted_string(); let suffix_start = self.len_consumed(); @@ -158,7 +214,7 @@ impl Cursor<'_> { } _ => self.ident(), }, - c if character_properties::is_id_start(c) => self.ident(), + c if is_id_start(c) => self.ident(), c @ '0'..='9' => { let literal_kind = self.number(c); let suffix_start = self.len_consumed(); @@ -246,8 +302,8 @@ impl Cursor<'_> { } fn whitespace(&mut self) -> TokenKind { - debug_assert!(character_properties::is_whitespace(self.prev())); - while character_properties::is_whitespace(self.nth_char(0)) { + debug_assert!(is_whitespace(self.prev())); + while is_whitespace(self.nth_char(0)) { self.bump(); } Whitespace @@ -257,19 +313,19 @@ impl Cursor<'_> { debug_assert!( self.prev() == 'r' && self.nth_char(0) == '#' - && character_properties::is_id_start(self.nth_char(1)) + && is_id_start(self.nth_char(1)) ); self.bump(); self.bump(); - while character_properties::is_id_continue(self.nth_char(0)) { + while is_id_continue(self.nth_char(0)) { self.bump(); } RawIdent } fn ident(&mut self) -> TokenKind { - debug_assert!(character_properties::is_id_start(self.prev())); - while character_properties::is_id_continue(self.nth_char(0)) { + debug_assert!(is_id_start(self.prev())); + while is_id_continue(self.nth_char(0)) { self.bump(); } Ident @@ -314,7 +370,7 @@ impl Cursor<'_> { // integer literal followed by field/method access or a range pattern // (`0..2` and `12.foo()`) '.' if self.nth_char(1) != '.' - && !character_properties::is_id_start(self.nth_char(1)) => + && !is_id_start(self.nth_char(1)) => { // might have stuff after the ., and if it does, it needs to start // with a number @@ -344,7 +400,7 @@ impl Cursor<'_> { fn lifetime_or_char(&mut self) -> TokenKind { debug_assert!(self.prev() == '\''); let mut starts_with_number = false; - if (character_properties::is_id_start(self.nth_char(0)) + if (is_id_start(self.nth_char(0)) || self.nth_char(0).is_digit(10) && { starts_with_number = true; true @@ -352,7 +408,7 @@ impl Cursor<'_> { && self.nth_char(1) != '\'' { self.bump(); - while character_properties::is_id_continue(self.nth_char(0)) { + while is_id_continue(self.nth_char(0)) { self.bump(); } @@ -494,64 +550,13 @@ impl Cursor<'_> { } fn eat_literal_suffix(&mut self) { - if !character_properties::is_id_start(self.nth_char(0)) { + if !is_id_start(self.nth_char(0)) { return; } self.bump(); - while character_properties::is_id_continue(self.nth_char(0)) { + while is_id_continue(self.nth_char(0)) { self.bump(); } } } - -pub mod character_properties { - // See [UAX #31](http://unicode.org/reports/tr31) for definitions of these - // classes. - - // This is Pattern_White_Space. - // - // Note that this set is stable (ie, it doesn't change with different - // Unicode versions), so it's ok to just hard-code the values. - pub fn is_whitespace(c: char) -> bool { - match c { - // Usual ASCII suspects - | '\u{0009}' // \t - | '\u{000A}' // \n - | '\u{000B}' // vertical tab - | '\u{000C}' // form feed - | '\u{000D}' // \r - | '\u{0020}' // space - - // NEXT LINE from latin1 - | '\u{0085}' - - // Bidi markers - | '\u{200E}' // LEFT-TO-RIGHT MARK - | '\u{200F}' // RIGHT-TO-LEFT MARK - - // Dedicated whitespace characters from Unicode - | '\u{2028}' // LINE SEPARATOR - | '\u{2029}' // PARAGRAPH SEPARATOR - => true, - _ => false, - } - } - - // This is XID_Start OR '_' (which formally is not a XID_Start). - pub fn is_id_start(c: char) -> bool { - ('a' <= c && c <= 'z') - || ('A' <= c && c <= 'Z') - || c == '_' - || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_start(c)) - } - - // This is XID_Continue. - pub fn is_id_continue(c: char) -> bool { - ('a' <= c && c <= 'z') - || ('A' <= c && c <= 'Z') - || ('0' <= c && c <= '9') - || c == '_' - || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c)) - } -} diff --git a/src/librustc_mir/borrow_check/move_errors.rs b/src/librustc_mir/borrow_check/move_errors.rs index abcb70b7197..0d13db2f5a4 100644 --- a/src/librustc_mir/borrow_check/move_errors.rs +++ b/src/librustc_mir/borrow_check/move_errors.rs @@ -1,7 +1,6 @@ use rustc::mir::*; use rustc::ty; use rustc_errors::{DiagnosticBuilder,Applicability}; -use rustc_lexer::character_properties::is_whitespace; use syntax_pos::Span; use crate::borrow_check::MirBorrowckCtxt; @@ -525,7 +524,7 @@ impl<'a, 'tcx> MirBorrowckCtxt<'a, 'tcx> { let suggestion; let to_remove; if pat_snippet.starts_with("mut") - && pat_snippet["mut".len()..].starts_with(is_whitespace) + && pat_snippet["mut".len()..].starts_with(rustc_lexer::is_whitespace) { suggestion = pat_snippet["mut".len()..].trim_start(); to_remove = "&mut"; diff --git a/src/librustc_mir/borrow_check/mutability_errors.rs b/src/librustc_mir/borrow_check/mutability_errors.rs index 32bf82c8bcd..8f2ce80aafa 100644 --- a/src/librustc_mir/borrow_check/mutability_errors.rs +++ b/src/librustc_mir/borrow_check/mutability_errors.rs @@ -1,4 +1,3 @@ -use rustc_lexer::character_properties::is_whitespace; use rustc::hir; use rustc::hir::Node; use rustc::mir::{self, BindingForm, ClearCrossCrate, Local, Location, Body}; @@ -715,7 +714,7 @@ fn annotate_struct_field( fn suggest_ref_mut(tcx: TyCtxt<'_>, binding_span: Span) -> Option { let hi_src = tcx.sess.source_map().span_to_snippet(binding_span).ok()?; if hi_src.starts_with("ref") - && hi_src["ref".len()..].starts_with(is_whitespace) + && hi_src["ref".len()..].starts_with(rustc_lexer::is_whitespace) { let replacement = format!("ref mut{}", &hi_src["ref".len()..]); Some(replacement) diff --git a/src/librustdoc/test.rs b/src/librustdoc/test.rs index 1105e47d748..000d2843adc 100644 --- a/src/librustdoc/test.rs +++ b/src/librustdoc/test.rs @@ -4,7 +4,6 @@ use rustc::hir; use rustc::hir::intravisit; use rustc::session::{self, config, DiagnosticOutput}; use rustc::util::common::ErrorReported; -use rustc_lexer::character_properties::{is_id_start, is_id_continue}; use syntax::ast; use syntax::with_globals; use syntax::source_map::SourceMap; @@ -764,8 +763,8 @@ impl Tester for Collector { // We use these headings as test names, so it's good if // they're valid identifiers. let name = name.chars().enumerate().map(|(i, c)| { - if (i == 0 && is_id_start(c)) || - (i != 0 && is_id_continue(c)) { + if (i == 0 && rustc_lexer::is_id_start(c)) || + (i != 0 && rustc_lexer::is_id_continue(c)) { c } else { '_' diff --git a/src/libsyntax/ext/proc_macro_server.rs b/src/libsyntax/ext/proc_macro_server.rs index 35feb6680f9..544ec789d80 100644 --- a/src/libsyntax/ext/proc_macro_server.rs +++ b/src/libsyntax/ext/proc_macro_server.rs @@ -6,7 +6,6 @@ use crate::tokenstream::{self, DelimSpan, IsJoint::*, TokenStream, TreeAndJoint} use errors::{Diagnostic, DiagnosticBuilder}; use rustc_data_structures::sync::Lrc; -use rustc_lexer::character_properties::{is_id_start, is_id_continue}; use syntax_pos::{BytePos, FileName, MultiSpan, Pos, SourceFile, Span}; use syntax_pos::symbol::{kw, sym, Symbol}; @@ -323,7 +322,7 @@ impl Ident { fn is_valid(string: &str) -> bool { let mut chars = string.chars(); if let Some(start) = chars.next() { - is_id_start(start) && chars.all(is_id_continue) + rustc_lexer::is_id_start(start) && chars.all(rustc_lexer::is_id_continue) } else { false } diff --git a/src/libsyntax/tests.rs b/src/libsyntax/tests.rs index c472212bc20..9b90b31f2d2 100644 --- a/src/libsyntax/tests.rs +++ b/src/libsyntax/tests.rs @@ -63,7 +63,7 @@ crate fn matches_codepattern(a : &str, b : &str) -> bool { (None, None) => return true, (None, _) => return false, (Some(&a), None) => { - if is_pattern_whitespace(a) { + if rustc_lexer::is_whitespace(a) { break // trailing whitespace check is out of loop for borrowck } else { return false @@ -72,11 +72,11 @@ crate fn matches_codepattern(a : &str, b : &str) -> bool { (Some(&a), Some(&b)) => (a, b) }; - if is_pattern_whitespace(a) && is_pattern_whitespace(b) { + if rustc_lexer::is_whitespace(a) && rustc_lexer::is_whitespace(b) { // skip whitespace for a and b scan_for_non_ws_or_end(&mut a_iter); scan_for_non_ws_or_end(&mut b_iter); - } else if is_pattern_whitespace(a) { + } else if rustc_lexer::is_whitespace(a) { // skip whitespace for a scan_for_non_ws_or_end(&mut a_iter); } else if a == b { @@ -88,20 +88,16 @@ crate fn matches_codepattern(a : &str, b : &str) -> bool { } // check if a has *only* trailing whitespace - a_iter.all(is_pattern_whitespace) + a_iter.all(rustc_lexer::is_whitespace) } /// Advances the given peekable `Iterator` until it reaches a non-whitespace character fn scan_for_non_ws_or_end>(iter: &mut Peekable) { - while iter.peek().copied().map(|c| is_pattern_whitespace(c)) == Some(true) { + while iter.peek().copied().map(|c| rustc_lexer::is_whitespace(c)) == Some(true) { iter.next(); } } -fn is_pattern_whitespace(c: char) -> bool { - rustc_lexer::character_properties::is_whitespace(c) -} - /// Identify a position in the text by the Nth occurrence of a string. struct Position { string: &'static str,