libsyntax: accept only whitespace with the PATTERN_WHITE_SPACE property
This aligns with Unicode recommendations and should be stable for all future Unicode releases. See http://unicode.org/reports/tr31/#R3. This renames `libsyntax::lexer::is_whitespace` to `is_pattern_whitespace`, so it potentially breaks users of libsyntax.
This commit is contained in:
parent
9e3e43f3f6
commit
24578e0fe5
@ -86,7 +86,7 @@ DEPS_serialize := std log
|
||||
DEPS_term := std log
|
||||
DEPS_test := std getopts serialize rbml term native:rust_test_helpers
|
||||
|
||||
DEPS_syntax := std term serialize log arena libc rustc_bitflags
|
||||
DEPS_syntax := std term serialize log arena libc rustc_bitflags rustc_unicode
|
||||
DEPS_syntax_ext := syntax fmt_macros
|
||||
|
||||
DEPS_rustc := syntax fmt_macros flate arena serialize getopts rbml rustc_front\
|
||||
|
@ -398,7 +398,7 @@ pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
|
||||
derived = load_properties("DerivedCoreProperties.txt", want_derived)
|
||||
scripts = load_properties("Scripts.txt", [])
|
||||
props = load_properties("PropList.txt",
|
||||
["White_Space", "Join_Control", "Noncharacter_Code_Point"])
|
||||
["White_Space", "Join_Control", "Noncharacter_Code_Point", "Pattern_White_Space"])
|
||||
norm_props = load_properties("DerivedNormalizationProps.txt",
|
||||
["Full_Composition_Exclusion"])
|
||||
|
||||
@ -408,7 +408,7 @@ pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
|
||||
# category tables
|
||||
for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \
|
||||
("derived_property", derived, want_derived), \
|
||||
("property", props, ["White_Space"]):
|
||||
("property", props, ["White_Space", "Pattern_White_Space"]):
|
||||
emit_property_module(rf, name, cat, pfuns)
|
||||
|
||||
# normalizations and conversions module
|
||||
|
@ -50,3 +50,8 @@ pub mod str {
|
||||
pub mod derived_property {
|
||||
pub use tables::derived_property::{Cased, Case_Ignorable};
|
||||
}
|
||||
|
||||
// For use in libsyntax
|
||||
pub mod property {
|
||||
pub use tables::property::Pattern_White_Space;
|
||||
}
|
||||
|
@ -1180,6 +1180,15 @@ pub mod derived_property {
|
||||
}
|
||||
|
||||
pub mod property {
|
||||
pub const Pattern_White_Space_table: &'static [(char, char)] = &[
|
||||
('\u{9}', '\u{d}'), ('\u{20}', '\u{20}'), ('\u{85}', '\u{85}'), ('\u{200e}', '\u{200f}'),
|
||||
('\u{2028}', '\u{2029}')
|
||||
];
|
||||
|
||||
pub fn Pattern_White_Space(c: char) -> bool {
|
||||
super::bsearch_range_table(c, Pattern_White_Space_table)
|
||||
}
|
||||
|
||||
pub const White_Space_table: &'static [(char, char)] = &[
|
||||
('\u{9}', '\u{d}'), ('\u{20}', '\u{20}'), ('\u{85}', '\u{85}'), ('\u{a0}', '\u{a0}'),
|
||||
('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'), ('\u{2028}', '\u{2029}'), ('\u{202f}',
|
||||
|
@ -37,6 +37,7 @@ extern crate term;
|
||||
extern crate libc;
|
||||
#[macro_use] extern crate log;
|
||||
#[macro_use] #[no_link] extern crate rustc_bitflags;
|
||||
extern crate rustc_unicode;
|
||||
|
||||
extern crate serialize as rustc_serialize; // used by deriving
|
||||
|
||||
|
@ -15,7 +15,7 @@ use codemap::{BytePos, CharPos, CodeMap, Pos};
|
||||
use errors;
|
||||
use parse::lexer::is_block_doc_comment;
|
||||
use parse::lexer::{StringReader, TokenAndSpan};
|
||||
use parse::lexer::{is_whitespace, Reader};
|
||||
use parse::lexer::{is_pattern_whitespace, Reader};
|
||||
use parse::lexer;
|
||||
use print::pprust;
|
||||
use str::char_at;
|
||||
@ -153,7 +153,7 @@ fn push_blank_line_comment(rdr: &StringReader, comments: &mut Vec<Comment>) {
|
||||
}
|
||||
|
||||
fn consume_whitespace_counting_blank_lines(rdr: &mut StringReader, comments: &mut Vec<Comment>) {
|
||||
while is_whitespace(rdr.curr) && !rdr.is_eof() {
|
||||
while is_pattern_whitespace(rdr.curr) && !rdr.is_eof() {
|
||||
if rdr.col == CharPos(0) && rdr.curr_is('\n') {
|
||||
push_blank_line_comment(rdr, &mut *comments);
|
||||
}
|
||||
|
@ -16,6 +16,7 @@ use ext::tt::transcribe::tt_next_token;
|
||||
use parse::token::str_to_ident;
|
||||
use parse::token;
|
||||
use str::char_at;
|
||||
use rustc_unicode::property::Pattern_White_Space;
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::char;
|
||||
@ -546,10 +547,10 @@ impl<'a> StringReader<'a> {
|
||||
let c = self.scan_comment();
|
||||
debug!("scanning a comment {:?}", c);
|
||||
c
|
||||
}
|
||||
c if is_whitespace(Some(c)) => {
|
||||
},
|
||||
c if is_pattern_whitespace(Some(c)) => {
|
||||
let start_bpos = self.last_pos;
|
||||
while is_whitespace(self.curr) {
|
||||
while is_pattern_whitespace(self.curr) {
|
||||
self.bump();
|
||||
}
|
||||
let c = Some(TokenAndSpan {
|
||||
@ -1435,7 +1436,7 @@ impl<'a> StringReader<'a> {
|
||||
}
|
||||
|
||||
fn consume_whitespace(&mut self) {
|
||||
while is_whitespace(self.curr) && !self.is_eof() {
|
||||
while is_pattern_whitespace(self.curr) && !self.is_eof() {
|
||||
self.bump();
|
||||
}
|
||||
}
|
||||
@ -1460,7 +1461,7 @@ impl<'a> StringReader<'a> {
|
||||
}
|
||||
|
||||
fn consume_non_eol_whitespace(&mut self) {
|
||||
while is_whitespace(self.curr) && !self.curr_is('\n') && !self.is_eof() {
|
||||
while is_pattern_whitespace(self.curr) && !self.curr_is('\n') && !self.is_eof() {
|
||||
self.bump();
|
||||
}
|
||||
}
|
||||
@ -1591,8 +1592,10 @@ impl<'a> StringReader<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_whitespace(c: Option<char>) -> bool {
|
||||
c.map_or(false, char::is_whitespace)
|
||||
// This tests the character for the unicode property 'PATTERN_WHITE_SPACE' which
|
||||
// is guaranteed to be forward compatible. http://unicode.org/reports/tr31/#R3
|
||||
pub fn is_pattern_whitespace(c: Option<char>) -> bool {
|
||||
c.map_or(false, Pattern_White_Space)
|
||||
}
|
||||
|
||||
fn in_range(c: Option<char>, lo: char, hi: char) -> bool {
|
||||
|
@ -10,7 +10,7 @@
|
||||
|
||||
use ast;
|
||||
use parse::{ParseSess,PResult,filemap_to_tts};
|
||||
use parse::new_parser_from_source_str;
|
||||
use parse::{lexer, new_parser_from_source_str};
|
||||
use parse::parser::Parser;
|
||||
use parse::token;
|
||||
use ptr::P;
|
||||
@ -97,8 +97,8 @@ pub fn matches_codepattern(a : &str, b : &str) -> bool {
|
||||
let (a, b) = match (a_iter.peek(), b_iter.peek()) {
|
||||
(None, None) => return true,
|
||||
(None, _) => return false,
|
||||
(Some(a), None) => {
|
||||
if a.is_whitespace() {
|
||||
(Some(&a), None) => {
|
||||
if is_pattern_whitespace(a) {
|
||||
break // trailing whitespace check is out of loop for borrowck
|
||||
} else {
|
||||
return false
|
||||
@ -107,11 +107,11 @@ pub fn matches_codepattern(a : &str, b : &str) -> bool {
|
||||
(Some(&a), Some(&b)) => (a, b)
|
||||
};
|
||||
|
||||
if a.is_whitespace() && b.is_whitespace() {
|
||||
if is_pattern_whitespace(a) && is_pattern_whitespace(b) {
|
||||
// skip whitespace for a and b
|
||||
scan_for_non_ws_or_end(&mut a_iter);
|
||||
scan_for_non_ws_or_end(&mut b_iter);
|
||||
} else if a.is_whitespace() {
|
||||
} else if is_pattern_whitespace(a) {
|
||||
// skip whitespace for a
|
||||
scan_for_non_ws_or_end(&mut a_iter);
|
||||
} else if a == b {
|
||||
@ -123,23 +123,18 @@ pub fn matches_codepattern(a : &str, b : &str) -> bool {
|
||||
}
|
||||
|
||||
// check if a has *only* trailing whitespace
|
||||
a_iter.all(|c| c.is_whitespace())
|
||||
a_iter.all(is_pattern_whitespace)
|
||||
}
|
||||
|
||||
/// Advances the given peekable `Iterator` until it reaches a non-whitespace character
|
||||
fn scan_for_non_ws_or_end<I: Iterator<Item= char>>(iter: &mut Peekable<I>) {
|
||||
loop {
|
||||
match iter.peek() {
|
||||
Some(c) if c.is_whitespace() => {} // fall through; borrowck
|
||||
_ => return
|
||||
}
|
||||
|
||||
while lexer::is_pattern_whitespace(iter.peek().cloned()) {
|
||||
iter.next();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_whitespace(c: char) -> bool {
|
||||
c.is_whitespace()
|
||||
pub fn is_pattern_whitespace(c: char) -> bool {
|
||||
lexer::is_pattern_whitespace(Some(c))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@ -162,14 +157,18 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn more_whitespace() {
|
||||
fn pattern_whitespace() {
|
||||
assert_eq!(matches_codepattern("","\x0C"), false);
|
||||
assert_eq!(matches_codepattern("a b","a\u{2002}b"),true);
|
||||
assert_eq!(matches_codepattern("a b ","a \u{0085}\n\t\r b"),true);
|
||||
assert_eq!(matches_codepattern("a b","a \u{0085}\n\t\r b "),false);
|
||||
assert_eq!(matches_codepattern("a b","a\u{2002}b"),true);
|
||||
assert_eq!(matches_codepattern("ab","a\u{2003}b"),false);
|
||||
assert_eq!(matches_codepattern("a \u{3000}b","ab"),true);
|
||||
assert_eq!(matches_codepattern("\u{205F}a b","ab"),true);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn non_pattern_whitespace() {
|
||||
// These have the property 'White_Space' but not 'Pattern_White_Space'
|
||||
assert_eq!(matches_codepattern("a b","a\u{2002}b"), false);
|
||||
assert_eq!(matches_codepattern("a b","a\u{2002}b"), false);
|
||||
assert_eq!(matches_codepattern("\u{205F}a b","ab"), false);
|
||||
assert_eq!(matches_codepattern("a \u{3000}b","ab"), false);
|
||||
}
|
||||
}
|
||||
|
@ -9,10 +9,14 @@
|
||||
// except according to those terms.
|
||||
|
||||
|
||||
// Beware editing: it has numerous whitespace characters which are important
|
||||
// Beware editing: it has numerous whitespace characters which are important.
|
||||
// It contains characters from each range of the 'PATTERN_WHITE_SPACE' property outlined in
|
||||
// http://unicode.org/Public/UNIDATA/PropList.txt
|
||||
//
|
||||
// The characters in the first expression of the assertion can be generated
|
||||
// from: "4\u{0C}+\n\t\r7\t*\u{20}2\u{85}/\u{200E}3\u{200F}*\u{2028}2\u{2029}"
|
||||
pub fn main() {
|
||||
assert_eq!(4 + 7 * 2
|
||||
assert_eq!(4+
|
||||
|
||||
|
||||
/ 3 * 2 , 4 + 7 * 2 / 3 * 2);
|
||||
7 * 2
/3*
2
, 4 + 7 * 2 / 3 * 2);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user