Finished unicode support in the model lexer.

Completed XID_Start and XID_Continue rules
This commit is contained in:
Piotr Czarnecki 2015-01-17 21:59:44 +01:00
parent be437132b8
commit e5e343aeb7
4 changed files with 230 additions and 19 deletions

View File

@ -93,7 +93,7 @@ fragment SUFFIX
;
LIT_CHAR
: '\'' ( '\\' CHAR_ESCAPE | ~[\\'\n\t\r] ) '\'' SUFFIX?
: '\'' ( '\\' CHAR_ESCAPE | ~[\\'\n\t\r] | '\ud800' .. '\udbff' '\udc00' .. '\udfff' ) '\'' SUFFIX?
;
LIT_BYTE

View File

@ -10,6 +10,8 @@
#![feature(plugin)]
#![allow(unstable)]
extern crate syntax;
extern crate rustc;
@ -164,7 +166,8 @@ fn count(lit: &str) -> usize {
lit.chars().take_while(|c| *c == '#').count()
}
fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>) -> TokenAndSpan {
fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>, surrogate_pairs_pos: &[usize])
-> TokenAndSpan {
// old regex:
// \[@(?P<seq>\d+),(?P<start>\d+):(?P<end>\d+)='(?P<content>.+?)',<(?P<toknum>-?\d+)>,\d+:\d+]
let start = s.find_str("[@").unwrap();
@ -213,9 +216,16 @@ fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>) -> TokenAn
0
};
let mut lo = start.parse::<u32>().unwrap() - offset;
let mut hi = end.parse::<u32>().unwrap() + 1;
// Adjust the span: For each surrogate pair already encountered, subtract one position.
lo -= surrogate_pairs_pos.binary_search(&(lo as usize)).unwrap_or_else(|x| x) as u32;
hi -= surrogate_pairs_pos.binary_search(&(hi as usize)).unwrap_or_else(|x| x) as u32;
let sp = syntax::codemap::Span {
lo: syntax::codemap::BytePos(start.parse::<u32>().unwrap() - offset),
hi: syntax::codemap::BytePos(end.parse::<u32>().unwrap() + 1),
lo: syntax::codemap::BytePos(lo),
hi: syntax::codemap::BytePos(hi),
expn_id: syntax::codemap::NO_EXPANSION
};
@ -235,11 +245,10 @@ fn tok_cmp(a: &token::Token, b: &token::Token) -> bool {
}
}
fn span_cmp(rust_sp: syntax::codemap::Span, antlr_sp: syntax::codemap::Span, cm: &syntax::codemap::CodeMap) -> bool {
println!("{} {}", cm.bytepos_to_file_charpos(rust_sp.lo).to_uint(), cm.bytepos_to_file_charpos(rust_sp.hi).to_uint());
antlr_sp.lo.to_uint() == cm.bytepos_to_file_charpos(rust_sp.lo).to_uint() &&
antlr_sp.hi.to_uint() == cm.bytepos_to_file_charpos(rust_sp.hi).to_uint() &&
antlr_sp.expn_id == rust_sp.expn_id
fn span_cmp(antlr_sp: syntax::codemap::Span, rust_sp: syntax::codemap::Span, cm: &syntax::codemap::CodeMap) -> bool {
antlr_sp.expn_id == rust_sp.expn_id &&
antlr_sp.lo.to_uint() == cm.bytepos_to_file_charpos(rust_sp.lo).to_uint() &&
antlr_sp.hi.to_uint() == cm.bytepos_to_file_charpos(rust_sp.hi).to_uint()
}
fn main() {
@ -250,16 +259,18 @@ fn main() {
let args = std::os::args();
let mut token_file = File::open(&Path::new(args[2]));
let token_map = parse_token_list(token_file.read_to_string().unwrap());
let mut stdin = std::io::stdin();
let mut lock = stdin.lock();
let lines = lock.lines();
let mut antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().trim(),
&token_map));
// Rust's lexer
let code = File::open(&Path::new(args[1])).unwrap().read_to_string().unwrap();
let surrogate_pairs_pos: Vec<usize> = code.chars().enumerate()
.filter(|&(_, c)| c as usize > 0xFFFF)
.map(|(n, _)| n)
.enumerate()
.map(|(x, n)| x + n)
.collect();
debug!("Pairs: {:?}", surrogate_pairs_pos);
let options = config::basic_options();
let session = session::build_session(options, None,
syntax::diagnostics::registry::Registry::new(&[]));
@ -269,13 +280,25 @@ fn main() {
let mut lexer = lexer::StringReader::new(session.diagnostic(), filemap);
let ref cm = lexer.span_diagnostic.cm;
// ANTLR
let mut token_file = File::open(&Path::new(args[2]));
let token_map = parse_token_list(token_file.read_to_string().unwrap());
let mut stdin = std::io::stdin();
let mut lock = stdin.lock();
let lines = lock.lines();
let mut antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().trim(),
&token_map,
&surrogate_pairs_pos[]));
for antlr_tok in antlr_tokens {
let rustc_tok = next(&mut lexer);
if rustc_tok.tok == token::Eof && antlr_tok.tok == token::Eof {
continue
}
assert!(span_cmp(rustc_tok.sp, antlr_tok.sp, cm), "{:?} and {:?} have different spans", rustc_tok,
assert!(span_cmp(antlr_tok.sp, rustc_tok.sp, cm), "{:?} and {:?} have different spans",
rustc_tok,
antlr_tok);
macro_rules! matches {

View File

@ -372,4 +372,102 @@ fragment XID_Continue:
| '\uffca' .. '\uffcf'
| '\uffd2' .. '\uffd7'
| '\uffda' .. '\uffdc'
| '\ud800' '\udc00' .. '\udc0a'
| '\ud800' '\udc0d' .. '\udc25'
| '\ud800' '\udc28' .. '\udc39'
| '\ud800' '\udc3c' .. '\udc3c'
| '\ud800' '\udc3f' .. '\udc4c'
| '\ud800' '\udc50' .. '\udc5c'
| '\ud800' '\udc80' .. '\udcf9'
| '\ud800' '\udf00' .. '\udf1d'
| '\ud800' '\udf30' .. '\udf49'
| '\ud800' '\udf80' .. '\udf9c'
| '\ud801' '\ue000' .. '\ue09c'
| '\ud801' '\ue0a0' .. '\ue0a8'
| '\ud802' '\ue400' .. '\ue404'
| '\ud802' '\u0808'
| '\ud802' '\ue40a' .. '\ue434'
| '\ud802' '\ue437' .. '\ue437'
| '\ud802' '\u083c'
| '\ud802' '\u083f'
| '\ud834' '\uad65' .. '\uad68'
| '\ud834' '\uad6d' .. '\uad71'
| '\ud834' '\uad7b' .. '\uad81'
| '\ud834' '\uad85' .. '\uad8a'
| '\ud834' '\uadaa' .. '\uadac'
| '\ud835' '\ub000' .. '\ub053'
| '\ud835' '\ub056' .. '\ub09b'
| '\ud835' '\ub09e' .. '\ub09e'
| '\ud835' '\ud4a2'
| '\ud835' '\ub0a5' .. '\ub0a5'
| '\ud835' '\ub0a9' .. '\ub0ab'
| '\ud835' '\ub0ae' .. '\ub0b8'
| '\ud835' '\ud4bb'
| '\ud835' '\ub0bd' .. '\ub0c2'
| '\ud835' '\ub0c5' .. '\ub104'
| '\ud835' '\ub107' .. '\ub109'
| '\ud835' '\ub10d' .. '\ub113'
| '\ud835' '\ub116' .. '\ub11b'
| '\ud835' '\ub11e' .. '\ub138'
| '\ud835' '\ub13b' .. '\ub13d'
| '\ud835' '\ub140' .. '\ub143'
| '\ud835' '\ud546'
| '\ud835' '\ub14a' .. '\ub14f'
| '\ud835' '\ub152' .. '\ub2a2'
| '\ud835' '\ub2a8' .. '\ub2bf'
| '\ud835' '\ub2c2' .. '\ub2d9'
| '\ud835' '\ub2dc' .. '\ub2f9'
| '\ud835' '\ub2fc' .. '\ub313'
| '\ud835' '\ub316' .. '\ub333'
| '\ud835' '\ub336' .. '\ub34d'
| '\ud835' '\ub350' .. '\ub36d'
| '\ud835' '\ub370' .. '\ub387'
| '\ud835' '\ub38a' .. '\ub3a7'
| '\ud835' '\ub3aa' .. '\ub3c1'
| '\ud835' '\ub3c4' .. '\ub3c8'
| '\ud835' '\ub3ce' .. '\ub3fe'
| '\ud840' '\udc00' .. '\udffe'
| '\ud841' '\ue000' .. '\ue3fe'
| '\ud842' '\ue400' .. '\ue7fe'
| '\ud843' '\ue800' .. '\uebfe'
| '\ud844' '\uec00' .. '\ueffe'
| '\ud845' '\uf000' .. '\uf3fe'
| '\ud846' '\uf400' .. '\uf7fe'
| '\ud847' '\uf800' .. '\ufbfe'
| '\ud848' '\ufc00' .. '\ufffe'
| '\ud849' '\u0000' .. '\u03fe'
| '\ud84a' '\u0400' .. '\u07fe'
| '\ud84b' '\u0800' .. '\u0bfe'
| '\ud84c' '\u0c00' .. '\u0ffe'
| '\ud84d' '\u1000' .. '\u13fe'
| '\ud84e' '\u1400' .. '\u17fe'
| '\ud84f' '\u1800' .. '\u1bfe'
| '\ud850' '\u1c00' .. '\u1ffe'
| '\ud851' '\u2000' .. '\u23fe'
| '\ud852' '\u2400' .. '\u27fe'
| '\ud853' '\u2800' .. '\u2bfe'
| '\ud854' '\u2c00' .. '\u2ffe'
| '\ud855' '\u3000' .. '\u33fe'
| '\ud856' '\u3400' .. '\u37fe'
| '\ud857' '\u3800' .. '\u3bfe'
| '\ud858' '\u3c00' .. '\u3ffe'
| '\ud859' '\u4000' .. '\u43fe'
| '\ud85a' '\u4400' .. '\u47fe'
| '\ud85b' '\u4800' .. '\u4bfe'
| '\ud85c' '\u4c00' .. '\u4ffe'
| '\ud85d' '\u5000' .. '\u53fe'
| '\ud85e' '\u5400' .. '\u57fe'
| '\ud85f' '\u5800' .. '\u5bfe'
| '\ud860' '\u5c00' .. '\u5ffe'
| '\ud861' '\u6000' .. '\u63fe'
| '\ud862' '\u6400' .. '\u67fe'
| '\ud863' '\u6800' .. '\u6bfe'
| '\ud864' '\u6c00' .. '\u6ffe'
| '\ud865' '\u7000' .. '\u73fe'
| '\ud866' '\u7400' .. '\u77fe'
| '\ud867' '\u7800' .. '\u7bfe'
| '\ud868' '\u7c00' .. '\u7ffe'
| '\ud869' '\u8000' .. '\u82d5'
| '\ud87e' '\ud400' .. '\ud61c'
| '\udb40' '\udd00' .. '\uddee'
;

View File

@ -286,4 +286,94 @@ fragment XID_Start :
| '\uffca' .. '\uffcf'
| '\uffd2' .. '\uffd7'
| '\uffda' .. '\uffdc'
| '\ud800' '\udc00' .. '\udc0a'
| '\ud800' '\udc0d' .. '\udc25'
| '\ud800' '\udc28' .. '\udc39'
| '\ud800' '\udc3c' .. '\udc3c'
| '\ud800' '\udc3f' .. '\udc4c'
| '\ud800' '\udc50' .. '\udc5c'
| '\ud800' '\udc80' .. '\udcf9'
| '\ud800' '\udf00' .. '\udf1d'
| '\ud800' '\udf30' .. '\udf49'
| '\ud800' '\udf80' .. '\udf9c'
| '\ud801' '\ue000' .. '\ue09c'
| '\ud802' '\ue400' .. '\ue404'
| '\ud802' '\u0808'
| '\ud802' '\ue40a' .. '\ue434'
| '\ud802' '\ue437' .. '\ue437'
| '\ud802' '\u083c'
| '\ud802' '\u083f'
| '\ud835' '\ub000' .. '\ub053'
| '\ud835' '\ub056' .. '\ub09b'
| '\ud835' '\ub09e' .. '\ub09e'
| '\ud835' '\ud4a2'
| '\ud835' '\ub0a5' .. '\ub0a5'
| '\ud835' '\ub0a9' .. '\ub0ab'
| '\ud835' '\ub0ae' .. '\ub0b8'
| '\ud835' '\ud4bb'
| '\ud835' '\ub0bd' .. '\ub0c2'
| '\ud835' '\ub0c5' .. '\ub104'
| '\ud835' '\ub107' .. '\ub109'
| '\ud835' '\ub10d' .. '\ub113'
| '\ud835' '\ub116' .. '\ub11b'
| '\ud835' '\ub11e' .. '\ub138'
| '\ud835' '\ub13b' .. '\ub13d'
| '\ud835' '\ub140' .. '\ub143'
| '\ud835' '\ud546'
| '\ud835' '\ub14a' .. '\ub14f'
| '\ud835' '\ub152' .. '\ub2a2'
| '\ud835' '\ub2a8' .. '\ub2bf'
| '\ud835' '\ub2c2' .. '\ub2d9'
| '\ud835' '\ub2dc' .. '\ub2f9'
| '\ud835' '\ub2fc' .. '\ub313'
| '\ud835' '\ub316' .. '\ub333'
| '\ud835' '\ub336' .. '\ub34d'
| '\ud835' '\ub350' .. '\ub36d'
| '\ud835' '\ub370' .. '\ub387'
| '\ud835' '\ub38a' .. '\ub3a7'
| '\ud835' '\ub3aa' .. '\ub3c1'
| '\ud835' '\ub3c4' .. '\ub3c8'
| '\ud840' '\udc00' .. '\udffe'
| '\ud841' '\ue000' .. '\ue3fe'
| '\ud842' '\ue400' .. '\ue7fe'
| '\ud843' '\ue800' .. '\uebfe'
| '\ud844' '\uec00' .. '\ueffe'
| '\ud845' '\uf000' .. '\uf3fe'
| '\ud846' '\uf400' .. '\uf7fe'
| '\ud847' '\uf800' .. '\ufbfe'
| '\ud848' '\ufc00' .. '\ufffe'
| '\ud849' '\u0000' .. '\u03fe'
| '\ud84a' '\u0400' .. '\u07fe'
| '\ud84b' '\u0800' .. '\u0bfe'
| '\ud84c' '\u0c00' .. '\u0ffe'
| '\ud84d' '\u1000' .. '\u13fe'
| '\ud84e' '\u1400' .. '\u17fe'
| '\ud84f' '\u1800' .. '\u1bfe'
| '\ud850' '\u1c00' .. '\u1ffe'
| '\ud851' '\u2000' .. '\u23fe'
| '\ud852' '\u2400' .. '\u27fe'
| '\ud853' '\u2800' .. '\u2bfe'
| '\ud854' '\u2c00' .. '\u2ffe'
| '\ud855' '\u3000' .. '\u33fe'
| '\ud856' '\u3400' .. '\u37fe'
| '\ud857' '\u3800' .. '\u3bfe'
| '\ud858' '\u3c00' .. '\u3ffe'
| '\ud859' '\u4000' .. '\u43fe'
| '\ud85a' '\u4400' .. '\u47fe'
| '\ud85b' '\u4800' .. '\u4bfe'
| '\ud85c' '\u4c00' .. '\u4ffe'
| '\ud85d' '\u5000' .. '\u53fe'
| '\ud85e' '\u5400' .. '\u57fe'
| '\ud85f' '\u5800' .. '\u5bfe'
| '\ud860' '\u5c00' .. '\u5ffe'
| '\ud861' '\u6000' .. '\u63fe'
| '\ud862' '\u6400' .. '\u67fe'
| '\ud863' '\u6800' .. '\u6bfe'
| '\ud864' '\u6c00' .. '\u6ffe'
| '\ud865' '\u7000' .. '\u73fe'
| '\ud866' '\u7400' .. '\u77fe'
| '\ud867' '\u7800' .. '\u7bfe'
| '\ud868' '\u7c00' .. '\u7ffe'
| '\ud869' '\u8000' .. '\u82d5'
| '\ud87e' '\ud400' .. '\ud61c'
;