flatten rustc_lexer::character_properties module

On the call site, `rustc_lexer::is_whitespace` reads much better than
`character_properties::is_whitespace`.
This commit is contained in:
Aleksey Kladov 2019-09-04 13:16:36 +03:00
parent a0c186c34f
commit 206fe8e1c3
7 changed files with 82 additions and 86 deletions

View File

@ -23,7 +23,6 @@ use std::string;
use std::iter;
use syntax_pos::{InnerSpan, Symbol};
use rustc_lexer::character_properties::{is_id_start, is_id_continue};
#[derive(Copy, Clone)]
struct InnerOffset(usize);
@ -602,7 +601,7 @@ impl<'a> Parser<'a> {
/// Rust identifier, except that it can't start with `_` character.
fn word(&mut self) -> &'a str {
let start = match self.cur.peek() {
Some(&(pos, c)) if c != '_' && is_id_start(c) => {
Some(&(pos, c)) if c != '_' && rustc_lexer::is_id_start(c) => {
self.cur.next();
pos
}
@ -611,7 +610,7 @@ impl<'a> Parser<'a> {
}
};
while let Some(&(pos, c)) = self.cur.peek() {
if is_id_continue(c) {
if rustc_lexer::is_id_continue(c) {
self.cur.next();
} else {
return &self.input[start..pos];

View File

@ -102,6 +102,62 @@ pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ {
})
}
// See [UAX #31](http://unicode.org/reports/tr31) for definitions of these
// classes.
/// Returns `true` when `c` counts as whitespace in Rust source code.
pub fn is_whitespace(c: char) -> bool {
    // The accepted set is exactly Unicode's Pattern_White_Space.
    //
    // That set is stable (it does not change between Unicode versions),
    // so the code points are spelled out here rather than looked up.
    match c {
        // ASCII control whitespace, a contiguous run:
        // \t, \n, vertical tab, form feed, \r
        '\u{0009}'..='\u{000D}' => true,
        // ASCII space
        '\u{0020}' => true,
        // NEXT LINE from latin1
        '\u{0085}' => true,
        // Bidi markers: LEFT-TO-RIGHT MARK, RIGHT-TO-LEFT MARK
        '\u{200E}' | '\u{200F}' => true,
        // Dedicated Unicode separators: LINE SEPARATOR, PARAGRAPH SEPARATOR
        '\u{2028}' | '\u{2029}' => true,
        _ => false,
    }
}
/// Returns `true` when `c` may appear as the first character of an identifier.
pub fn is_id_start(c: char) -> bool {
    // The accepted set is XID_Start plus '_' (which formally is not a
    // XID_Start character).
    match c {
        // Fast path: ASCII identifier starts are decided right here.
        'a'..='z' | 'A'..='Z' | '_' => true,
        // Any other ASCII character is rejected by the `> '\x7f'` guard, so
        // only non-ASCII characters reach the XID_Start query.
        _ => c > '\x7f' && unicode_xid::UnicodeXID::is_xid_start(c),
    }
}
/// Returns `true` when `c` may appear in an identifier after its first character.
pub fn is_id_continue(c: char) -> bool {
    // The accepted set is exactly XID_Continue.
    match c {
        // Fast path: ASCII letters, digits and '_' are decided right here.
        'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => true,
        // Any other ASCII character is rejected by the `> '\x7f'` guard, so
        // only non-ASCII characters reach the XID_Continue query.
        _ => c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c),
    }
}
impl Cursor<'_> {
fn advance_token(&mut self) -> Token {
let first_char = self.bump().unwrap();
@ -111,9 +167,9 @@ impl Cursor<'_> {
'*' => self.block_comment(),
_ => Slash,
},
c if character_properties::is_whitespace(c) => self.whitespace(),
c if is_whitespace(c) => self.whitespace(),
'r' => match (self.nth_char(0), self.nth_char(1)) {
('#', c1) if character_properties::is_id_start(c1) => self.raw_ident(),
('#', c1) if is_id_start(c1) => self.raw_ident(),
('#', _) | ('"', _) => {
let (n_hashes, started, terminated) = self.raw_double_quoted_string();
let suffix_start = self.len_consumed();
@ -158,7 +214,7 @@ impl Cursor<'_> {
}
_ => self.ident(),
},
c if character_properties::is_id_start(c) => self.ident(),
c if is_id_start(c) => self.ident(),
c @ '0'..='9' => {
let literal_kind = self.number(c);
let suffix_start = self.len_consumed();
@ -246,8 +302,8 @@ impl Cursor<'_> {
}
fn whitespace(&mut self) -> TokenKind {
debug_assert!(character_properties::is_whitespace(self.prev()));
while character_properties::is_whitespace(self.nth_char(0)) {
debug_assert!(is_whitespace(self.prev()));
while is_whitespace(self.nth_char(0)) {
self.bump();
}
Whitespace
@ -257,19 +313,19 @@ impl Cursor<'_> {
debug_assert!(
self.prev() == 'r'
&& self.nth_char(0) == '#'
&& character_properties::is_id_start(self.nth_char(1))
&& is_id_start(self.nth_char(1))
);
self.bump();
self.bump();
while character_properties::is_id_continue(self.nth_char(0)) {
while is_id_continue(self.nth_char(0)) {
self.bump();
}
RawIdent
}
fn ident(&mut self) -> TokenKind {
debug_assert!(character_properties::is_id_start(self.prev()));
while character_properties::is_id_continue(self.nth_char(0)) {
debug_assert!(is_id_start(self.prev()));
while is_id_continue(self.nth_char(0)) {
self.bump();
}
Ident
@ -314,7 +370,7 @@ impl Cursor<'_> {
// integer literal followed by field/method access or a range pattern
// (`0..2` and `12.foo()`)
'.' if self.nth_char(1) != '.'
&& !character_properties::is_id_start(self.nth_char(1)) =>
&& !is_id_start(self.nth_char(1)) =>
{
// might have stuff after the ., and if it does, it needs to start
// with a number
@ -344,7 +400,7 @@ impl Cursor<'_> {
fn lifetime_or_char(&mut self) -> TokenKind {
debug_assert!(self.prev() == '\'');
let mut starts_with_number = false;
if (character_properties::is_id_start(self.nth_char(0))
if (is_id_start(self.nth_char(0))
|| self.nth_char(0).is_digit(10) && {
starts_with_number = true;
true
@ -352,7 +408,7 @@ impl Cursor<'_> {
&& self.nth_char(1) != '\''
{
self.bump();
while character_properties::is_id_continue(self.nth_char(0)) {
while is_id_continue(self.nth_char(0)) {
self.bump();
}
@ -494,64 +550,13 @@ impl Cursor<'_> {
}
fn eat_literal_suffix(&mut self) {
if !character_properties::is_id_start(self.nth_char(0)) {
if !is_id_start(self.nth_char(0)) {
return;
}
self.bump();
while character_properties::is_id_continue(self.nth_char(0)) {
while is_id_continue(self.nth_char(0)) {
self.bump();
}
}
}
pub mod character_properties {
    // See [UAX #31](http://unicode.org/reports/tr31) for definitions of these
    // classes.

    /// Returns `true` when `c` belongs to Pattern_White_Space.
    ///
    /// This set is stable (it does not change between Unicode versions), so
    /// the code points are hard-coded.
    pub fn is_whitespace(c: char) -> bool {
        match c {
            // ASCII control whitespace, a contiguous run:
            // \t, \n, vertical tab, form feed, \r
            '\u{0009}'..='\u{000D}' => true,
            // ASCII space
            '\u{0020}' => true,
            // NEXT LINE from latin1
            '\u{0085}' => true,
            // Bidi markers: LEFT-TO-RIGHT MARK, RIGHT-TO-LEFT MARK
            '\u{200E}' | '\u{200F}' => true,
            // Dedicated Unicode separators: LINE SEPARATOR, PARAGRAPH SEPARATOR
            '\u{2028}' | '\u{2029}' => true,
            _ => false,
        }
    }

    /// Returns `true` when `c` is XID_Start or '_' (which formally is not a
    /// XID_Start character).
    pub fn is_id_start(c: char) -> bool {
        match c {
            // ASCII identifier starts are decided without the XID query.
            'a'..='z' | 'A'..='Z' | '_' => true,
            // Only non-ASCII characters reach the XID_Start query.
            _ => c > '\x7f' && unicode_xid::UnicodeXID::is_xid_start(c),
        }
    }

    /// Returns `true` when `c` is XID_Continue.
    pub fn is_id_continue(c: char) -> bool {
        match c {
            // ASCII letters, digits and '_' are decided without the XID query.
            'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => true,
            // Only non-ASCII characters reach the XID_Continue query.
            _ => c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c),
        }
    }
}

View File

@ -1,7 +1,6 @@
use rustc::mir::*;
use rustc::ty;
use rustc_errors::{DiagnosticBuilder,Applicability};
use rustc_lexer::character_properties::is_whitespace;
use syntax_pos::Span;
use crate::borrow_check::MirBorrowckCtxt;
@ -525,7 +524,7 @@ impl<'a, 'tcx> MirBorrowckCtxt<'a, 'tcx> {
let suggestion;
let to_remove;
if pat_snippet.starts_with("mut")
&& pat_snippet["mut".len()..].starts_with(is_whitespace)
&& pat_snippet["mut".len()..].starts_with(rustc_lexer::is_whitespace)
{
suggestion = pat_snippet["mut".len()..].trim_start();
to_remove = "&mut";

View File

@ -1,4 +1,3 @@
use rustc_lexer::character_properties::is_whitespace;
use rustc::hir;
use rustc::hir::Node;
use rustc::mir::{self, BindingForm, ClearCrossCrate, Local, Location, Body};
@ -715,7 +714,7 @@ fn annotate_struct_field(
fn suggest_ref_mut(tcx: TyCtxt<'_>, binding_span: Span) -> Option<String> {
let hi_src = tcx.sess.source_map().span_to_snippet(binding_span).ok()?;
if hi_src.starts_with("ref")
&& hi_src["ref".len()..].starts_with(is_whitespace)
&& hi_src["ref".len()..].starts_with(rustc_lexer::is_whitespace)
{
let replacement = format!("ref mut{}", &hi_src["ref".len()..]);
Some(replacement)

View File

@ -4,7 +4,6 @@ use rustc::hir;
use rustc::hir::intravisit;
use rustc::session::{self, config, DiagnosticOutput};
use rustc::util::common::ErrorReported;
use rustc_lexer::character_properties::{is_id_start, is_id_continue};
use syntax::ast;
use syntax::with_globals;
use syntax::source_map::SourceMap;
@ -764,8 +763,8 @@ impl Tester for Collector {
// We use these headings as test names, so it's good if
// they're valid identifiers.
let name = name.chars().enumerate().map(|(i, c)| {
if (i == 0 && is_id_start(c)) ||
(i != 0 && is_id_continue(c)) {
if (i == 0 && rustc_lexer::is_id_start(c)) ||
(i != 0 && rustc_lexer::is_id_continue(c)) {
c
} else {
'_'

View File

@ -6,7 +6,6 @@ use crate::tokenstream::{self, DelimSpan, IsJoint::*, TokenStream, TreeAndJoint}
use errors::{Diagnostic, DiagnosticBuilder};
use rustc_data_structures::sync::Lrc;
use rustc_lexer::character_properties::{is_id_start, is_id_continue};
use syntax_pos::{BytePos, FileName, MultiSpan, Pos, SourceFile, Span};
use syntax_pos::symbol::{kw, sym, Symbol};
@ -323,7 +322,7 @@ impl Ident {
fn is_valid(string: &str) -> bool {
let mut chars = string.chars();
if let Some(start) = chars.next() {
is_id_start(start) && chars.all(is_id_continue)
rustc_lexer::is_id_start(start) && chars.all(rustc_lexer::is_id_continue)
} else {
false
}

View File

@ -63,7 +63,7 @@ crate fn matches_codepattern(a : &str, b : &str) -> bool {
(None, None) => return true,
(None, _) => return false,
(Some(&a), None) => {
if is_pattern_whitespace(a) {
if rustc_lexer::is_whitespace(a) {
break // trailing whitespace check is out of loop for borrowck
} else {
return false
@ -72,11 +72,11 @@ crate fn matches_codepattern(a : &str, b : &str) -> bool {
(Some(&a), Some(&b)) => (a, b)
};
if is_pattern_whitespace(a) && is_pattern_whitespace(b) {
if rustc_lexer::is_whitespace(a) && rustc_lexer::is_whitespace(b) {
// skip whitespace for a and b
scan_for_non_ws_or_end(&mut a_iter);
scan_for_non_ws_or_end(&mut b_iter);
} else if is_pattern_whitespace(a) {
} else if rustc_lexer::is_whitespace(a) {
// skip whitespace for a
scan_for_non_ws_or_end(&mut a_iter);
} else if a == b {
@ -88,20 +88,16 @@ crate fn matches_codepattern(a : &str, b : &str) -> bool {
}
// check if a has *only* trailing whitespace
a_iter.all(is_pattern_whitespace)
a_iter.all(rustc_lexer::is_whitespace)
}
/// Advances the given peekable `Iterator` until it reaches a non-whitespace character
fn scan_for_non_ws_or_end<I: Iterator<Item = char>>(iter: &mut Peekable<I>) {
while iter.peek().copied().map(|c| is_pattern_whitespace(c)) == Some(true) {
while iter.peek().copied().map(|c| rustc_lexer::is_whitespace(c)) == Some(true) {
iter.next();
}
}
/// Returns whether `c` is whitespace by forwarding, unchanged, to
/// `rustc_lexer::character_properties::is_whitespace`.
fn is_pattern_whitespace(c: char) -> bool {
rustc_lexer::character_properties::is_whitespace(c)
}
/// Identify a position in the text by the Nth occurrence of a string.
struct Position {
string: &'static str,