From e5e343aeb78a8fe7fed897eae1e79019383691e8 Mon Sep 17 00:00:00 2001 From: Piotr Czarnecki Date: Sat, 17 Jan 2015 21:59:44 +0100 Subject: [PATCH] Finished unicode support in the model lexer. Completed XID_Start and XID_Continue rules --- src/grammar/RustLexer.g4 | 2 +- src/grammar/verify.rs | 59 ++++++++++++++++------- src/grammar/xidcontinue.g4 | 98 ++++++++++++++++++++++++++++++++++++++ src/grammar/xidstart.g4 | 90 ++++++++++++++++++++++++++++++++++ 4 files changed, 230 insertions(+), 19 deletions(-) diff --git a/src/grammar/RustLexer.g4 b/src/grammar/RustLexer.g4 index 6578f79f92b..8739d135b4f 100644 --- a/src/grammar/RustLexer.g4 +++ b/src/grammar/RustLexer.g4 @@ -93,7 +93,7 @@ fragment SUFFIX ; LIT_CHAR - : '\'' ( '\\' CHAR_ESCAPE | ~[\\'\n\t\r] ) '\'' SUFFIX? + : '\'' ( '\\' CHAR_ESCAPE | ~[\\'\n\t\r] | '\ud800' .. '\udbff' '\udc00' .. '\udfff' ) '\'' SUFFIX? ; LIT_BYTE diff --git a/src/grammar/verify.rs b/src/grammar/verify.rs index cf408c91609..8bf501c7f3f 100644 --- a/src/grammar/verify.rs +++ b/src/grammar/verify.rs @@ -10,6 +10,8 @@ #![feature(plugin)] +#![allow(unstable)] + extern crate syntax; extern crate rustc; @@ -164,7 +166,8 @@ fn count(lit: &str) -> usize { lit.chars().take_while(|c| *c == '#').count() } -fn parse_antlr_token(s: &str, tokens: &HashMap) -> TokenAndSpan { +fn parse_antlr_token(s: &str, tokens: &HashMap, surrogate_pairs_pos: &[usize]) + -> TokenAndSpan { // old regex: // \[@(?P\d+),(?P\d+):(?P\d+)='(?P.+?)',<(?P-?\d+)>,\d+:\d+] let start = s.find_str("[@").unwrap(); @@ -213,9 +216,16 @@ fn parse_antlr_token(s: &str, tokens: &HashMap) -> TokenAn 0 }; + let mut lo = start.parse::().unwrap() - offset; + let mut hi = end.parse::().unwrap() + 1; + + // Adjust the span: For each surrogate pair already encountered, subtract one position. 
+ lo -= surrogate_pairs_pos.binary_search(&(lo as usize)).unwrap_or_else(|x| x) as u32; + hi -= surrogate_pairs_pos.binary_search(&(hi as usize)).unwrap_or_else(|x| x) as u32; + let sp = syntax::codemap::Span { - lo: syntax::codemap::BytePos(start.parse::().unwrap() - offset), - hi: syntax::codemap::BytePos(end.parse::().unwrap() + 1), + lo: syntax::codemap::BytePos(lo), + hi: syntax::codemap::BytePos(hi), expn_id: syntax::codemap::NO_EXPANSION }; @@ -235,11 +245,10 @@ fn tok_cmp(a: &token::Token, b: &token::Token) -> bool { } } -fn span_cmp(rust_sp: syntax::codemap::Span, antlr_sp: syntax::codemap::Span, cm: &syntax::codemap::CodeMap) -> bool { - println!("{} {}", cm.bytepos_to_file_charpos(rust_sp.lo).to_uint(), cm.bytepos_to_file_charpos(rust_sp.hi).to_uint()); - antlr_sp.lo.to_uint() == cm.bytepos_to_file_charpos(rust_sp.lo).to_uint() && - antlr_sp.hi.to_uint() == cm.bytepos_to_file_charpos(rust_sp.hi).to_uint() && - antlr_sp.expn_id == rust_sp.expn_id +fn span_cmp(antlr_sp: syntax::codemap::Span, rust_sp: syntax::codemap::Span, cm: &syntax::codemap::CodeMap) -> bool { + antlr_sp.expn_id == rust_sp.expn_id && + antlr_sp.lo.to_uint() == cm.bytepos_to_file_charpos(rust_sp.lo).to_uint() && + antlr_sp.hi.to_uint() == cm.bytepos_to_file_charpos(rust_sp.hi).to_uint() } fn main() { @@ -250,16 +259,18 @@ fn main() { let args = std::os::args(); - let mut token_file = File::open(&Path::new(args[2])); - let token_map = parse_token_list(token_file.read_to_string().unwrap()); - - let mut stdin = std::io::stdin(); - let mut lock = stdin.lock(); - let lines = lock.lines(); - let mut antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().trim(), - &token_map)); - + // Rust's lexer let code = File::open(&Path::new(args[1])).unwrap().read_to_string().unwrap(); + + let surrogate_pairs_pos: Vec = code.chars().enumerate() + .filter(|&(_, c)| c as usize > 0xFFFF) + .map(|(n, _)| n) + .enumerate() + .map(|(x, n)| x + n) + .collect(); + + debug!("Pairs: {:?}", surrogate_pairs_pos); 
+ let options = config::basic_options(); let session = session::build_session(options, None, syntax::diagnostics::registry::Registry::new(&[])); @@ -269,13 +280,25 @@ fn main() { let mut lexer = lexer::StringReader::new(session.diagnostic(), filemap); let ref cm = lexer.span_diagnostic.cm; + // ANTLR + let mut token_file = File::open(&Path::new(args[2])); + let token_map = parse_token_list(token_file.read_to_string().unwrap()); + + let mut stdin = std::io::stdin(); + let mut lock = stdin.lock(); + let lines = lock.lines(); + let mut antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().trim(), + &token_map, + &surrogate_pairs_pos[])); + for antlr_tok in antlr_tokens { let rustc_tok = next(&mut lexer); if rustc_tok.tok == token::Eof && antlr_tok.tok == token::Eof { continue } - assert!(span_cmp(rustc_tok.sp, antlr_tok.sp, cm), "{:?} and {:?} have different spans", rustc_tok, + assert!(span_cmp(antlr_tok.sp, rustc_tok.sp, cm), "{:?} and {:?} have different spans", + rustc_tok, antlr_tok); macro_rules! matches { diff --git a/src/grammar/xidcontinue.g4 b/src/grammar/xidcontinue.g4 index 6000648f5fb..f3a1a3b40f9 100644 --- a/src/grammar/xidcontinue.g4 +++ b/src/grammar/xidcontinue.g4 @@ -372,4 +372,102 @@ fragment XID_Continue: | '\uffca' .. '\uffcf' | '\uffd2' .. '\uffd7' | '\uffda' .. '\uffdc' + | '\ud800' '\udc00' .. '\udc0b' + | '\ud800' '\udc0d' .. '\udc26' + | '\ud800' '\udc28' .. '\udc3a' + | '\ud800' '\udc3c' .. '\udc3d' + | '\ud800' '\udc3f' .. '\udc4d' + | '\ud800' '\udc50' .. '\udc5d' + | '\ud800' '\udc80' .. '\udcfa' + | '\ud800' '\udf00' .. '\udf1e' + | '\ud800' '\udf30' .. '\udf4a' + | '\ud800' '\udf80' .. '\udf9d' + | '\ud801' '\udc00' .. '\udc9d' + | '\ud801' '\udca0' .. '\udca9' + | '\ud802' '\udc00' .. '\udc05' + | '\ud802' '\udc08' + | '\ud802' '\udc0a' .. '\udc35' + | '\ud802' '\udc37' .. '\udc38' + | '\ud802' '\udc3c' + | '\ud802' '\udc3f' + | '\ud834' '\udd65' .. '\udd69' + | '\ud834' '\udd6d' .. '\udd72' + | '\ud834' '\udd7b' .. 
'\udd82' + | '\ud834' '\udd85' .. '\udd8b' + | '\ud834' '\uddaa' .. '\uddad' + | '\ud835' '\udc00' .. '\udc54' + | '\ud835' '\udc56' .. '\udc9c' + | '\ud835' '\udc9e' .. '\udc9f' + | '\ud835' '\udca2' + | '\ud835' '\udca5' .. '\udca6' + | '\ud835' '\udca9' .. '\udcac' + | '\ud835' '\udcae' .. '\udcb9' + | '\ud835' '\udcbb' + | '\ud835' '\udcbd' .. '\udcc3' + | '\ud835' '\udcc5' .. '\udd05' + | '\ud835' '\udd07' .. '\udd0a' + | '\ud835' '\udd0d' .. '\udd14' + | '\ud835' '\udd16' .. '\udd1c' + | '\ud835' '\udd1e' .. '\udd39' + | '\ud835' '\udd3b' .. '\udd3e' + | '\ud835' '\udd40' .. '\udd44' + | '\ud835' '\udd46' + | '\ud835' '\udd4a' .. '\udd50' + | '\ud835' '\udd52' .. '\udea3' + | '\ud835' '\udea8' .. '\udec0' + | '\ud835' '\udec2' .. '\udeda' + | '\ud835' '\udedc' .. '\udefa' + | '\ud835' '\udefc' .. '\udf14' + | '\ud835' '\udf16' .. '\udf34' + | '\ud835' '\udf36' .. '\udf4e' + | '\ud835' '\udf50' .. '\udf6e' + | '\ud835' '\udf70' .. '\udf88' + | '\ud835' '\udf8a' .. '\udfa8' + | '\ud835' '\udfaa' .. '\udfc2' + | '\ud835' '\udfc4' .. '\udfc9' + | '\ud835' '\udfce' .. '\udfff' + | '\ud840' '\udc00' .. '\udfff' + | '\ud841' '\udc00' .. '\udfff' + | '\ud842' '\udc00' .. '\udfff' + | '\ud843' '\udc00' .. '\udfff' + | '\ud844' '\udc00' .. '\udfff' + | '\ud845' '\udc00' .. '\udfff' + | '\ud846' '\udc00' .. '\udfff' + | '\ud847' '\udc00' .. '\udfff' + | '\ud848' '\udc00' .. '\udfff' + | '\ud849' '\udc00' .. '\udfff' + | '\ud84a' '\udc00' .. '\udfff' + | '\ud84b' '\udc00' .. '\udfff' + | '\ud84c' '\udc00' .. '\udfff' + | '\ud84d' '\udc00' .. '\udfff' + | '\ud84e' '\udc00' .. '\udfff' + | '\ud84f' '\udc00' .. '\udfff' + | '\ud850' '\udc00' .. '\udfff' + | '\ud851' '\udc00' .. '\udfff' + | '\ud852' '\udc00' .. '\udfff' + | '\ud853' '\udc00' .. '\udfff' + | '\ud854' '\udc00' .. '\udfff' + | '\ud855' '\udc00' .. '\udfff' + | '\ud856' '\udc00' .. '\udfff' + | '\ud857' '\udc00' .. '\udfff' + | '\ud858' '\udc00' .. '\udfff' + | '\ud859' '\udc00' .. 
'\udfff' + | '\ud85a' '\udc00' .. '\udfff' + | '\ud85b' '\udc00' .. '\udfff' + | '\ud85c' '\udc00' .. '\udfff' + | '\ud85d' '\udc00' .. '\udfff' + | '\ud85e' '\udc00' .. '\udfff' + | '\ud85f' '\udc00' .. '\udfff' + | '\ud860' '\udc00' .. '\udfff' + | '\ud861' '\udc00' .. '\udfff' + | '\ud862' '\udc00' .. '\udfff' + | '\ud863' '\udc00' .. '\udfff' + | '\ud864' '\udc00' .. '\udfff' + | '\ud865' '\udc00' .. '\udfff' + | '\ud866' '\udc00' .. '\udfff' + | '\ud867' '\udc00' .. '\udfff' + | '\ud868' '\udc00' .. '\udfff' + | '\ud869' '\udc00' .. '\uded6' + | '\ud87e' '\udc00' .. '\ude1d' + | '\udb40' '\udd00' .. '\uddef' ; diff --git a/src/grammar/xidstart.g4 b/src/grammar/xidstart.g4 index d02774c6135..53fb50f4584 100644 --- a/src/grammar/xidstart.g4 +++ b/src/grammar/xidstart.g4 @@ -286,4 +286,94 @@ fragment XID_Start : | '\uffca' .. '\uffcf' | '\uffd2' .. '\uffd7' | '\uffda' .. '\uffdc' + | '\ud800' '\udc00' .. '\udc0b' + | '\ud800' '\udc0d' .. '\udc26' + | '\ud800' '\udc28' .. '\udc3a' + | '\ud800' '\udc3c' .. '\udc3d' + | '\ud800' '\udc3f' .. '\udc4d' + | '\ud800' '\udc50' .. '\udc5d' + | '\ud800' '\udc80' .. '\udcfa' + | '\ud800' '\udf00' .. '\udf1e' + | '\ud800' '\udf30' .. '\udf4a' + | '\ud800' '\udf80' .. '\udf9d' + | '\ud801' '\udc00' .. '\udc9d' + | '\ud802' '\udc00' .. '\udc05' + | '\ud802' '\udc08' + | '\ud802' '\udc0a' .. '\udc35' + | '\ud802' '\udc37' .. '\udc38' + | '\ud802' '\udc3c' + | '\ud802' '\udc3f' + | '\ud835' '\udc00' .. '\udc54' + | '\ud835' '\udc56' .. '\udc9c' + | '\ud835' '\udc9e' .. '\udc9f' + | '\ud835' '\udca2' + | '\ud835' '\udca5' .. '\udca6' + | '\ud835' '\udca9' .. '\udcac' + | '\ud835' '\udcae' .. '\udcb9' + | '\ud835' '\udcbb' + | '\ud835' '\udcbd' .. '\udcc3' + | '\ud835' '\udcc5' .. '\udd05' + | '\ud835' '\udd07' .. '\udd0a' + | '\ud835' '\udd0d' .. '\udd14' + | '\ud835' '\udd16' .. '\udd1c' + | '\ud835' '\udd1e' .. '\udd39' + | '\ud835' '\udd3b' .. '\udd3e' + | '\ud835' '\udd40' .. 
'\udd44' + | '\ud835' '\udd46' + | '\ud835' '\udd4a' .. '\udd50' + | '\ud835' '\udd52' .. '\udea3' + | '\ud835' '\udea8' .. '\udec0' + | '\ud835' '\udec2' .. '\udeda' + | '\ud835' '\udedc' .. '\udefa' + | '\ud835' '\udefc' .. '\udf14' + | '\ud835' '\udf16' .. '\udf34' + | '\ud835' '\udf36' .. '\udf4e' + | '\ud835' '\udf50' .. '\udf6e' + | '\ud835' '\udf70' .. '\udf88' + | '\ud835' '\udf8a' .. '\udfa8' + | '\ud835' '\udfaa' .. '\udfc2' + | '\ud835' '\udfc4' .. '\udfc9' + | '\ud840' '\udc00' .. '\udfff' + | '\ud841' '\udc00' .. '\udfff' + | '\ud842' '\udc00' .. '\udfff' + | '\ud843' '\udc00' .. '\udfff' + | '\ud844' '\udc00' .. '\udfff' + | '\ud845' '\udc00' .. '\udfff' + | '\ud846' '\udc00' .. '\udfff' + | '\ud847' '\udc00' .. '\udfff' + | '\ud848' '\udc00' .. '\udfff' + | '\ud849' '\udc00' .. '\udfff' + | '\ud84a' '\udc00' .. '\udfff' + | '\ud84b' '\udc00' .. '\udfff' + | '\ud84c' '\udc00' .. '\udfff' + | '\ud84d' '\udc00' .. '\udfff' + | '\ud84e' '\udc00' .. '\udfff' + | '\ud84f' '\udc00' .. '\udfff' + | '\ud850' '\udc00' .. '\udfff' + | '\ud851' '\udc00' .. '\udfff' + | '\ud852' '\udc00' .. '\udfff' + | '\ud853' '\udc00' .. '\udfff' + | '\ud854' '\udc00' .. '\udfff' + | '\ud855' '\udc00' .. '\udfff' + | '\ud856' '\udc00' .. '\udfff' + | '\ud857' '\udc00' .. '\udfff' + | '\ud858' '\udc00' .. '\udfff' + | '\ud859' '\udc00' .. '\udfff' + | '\ud85a' '\udc00' .. '\udfff' + | '\ud85b' '\udc00' .. '\udfff' + | '\ud85c' '\udc00' .. '\udfff' + | '\ud85d' '\udc00' .. '\udfff' + | '\ud85e' '\udc00' .. '\udfff' + | '\ud85f' '\udc00' .. '\udfff' + | '\ud860' '\udc00' .. '\udfff' + | '\ud861' '\udc00' .. '\udfff' + | '\ud862' '\udc00' .. '\udfff' + | '\ud863' '\udc00' .. '\udfff' + | '\ud864' '\udc00' .. '\udfff' + | '\ud865' '\udc00' .. '\udfff' + | '\ud866' '\udc00' .. '\udfff' + | '\ud867' '\udc00' .. '\udfff' + | '\ud868' '\udc00' .. '\udfff' + | '\ud869' '\udc00' .. '\uded6' + | '\ud87e' '\udc00' .. '\ude1d' ;