Finished Unicode support in the model lexer.
Completed the XID_Start and XID_Continue rules.
This commit is contained in:
parent
be437132b8
commit
e5e343aeb7
@ -93,7 +93,7 @@ fragment SUFFIX
|
||||
;
|
||||
|
||||
// Character literal token: an opening single quote, one character body,
// a closing single quote and an optional SUFFIX.
LIT_CHAR
|
||||
// Body is either a backslash followed by CHAR_ESCAPE, or any single code
// unit other than backslash, single quote, newline, tab or carriage return.
: '\'' ( '\\' CHAR_ESCAPE | ~[\\'\n\t\r] ) '\'' SUFFIX?
|
||||
// Same as the line above, plus a third alternative: a code unit in the range
// d800..dbff followed by one in dc00..dfff, i.e. a UTF-16 surrogate pair,
// so that a character outside the BMP is accepted as one literal body.
: '\'' ( '\\' CHAR_ESCAPE | ~[\\'\n\t\r] | '\ud800' .. '\udbff' '\udc00' .. '\udfff' ) '\'' SUFFIX?
|
||||
;
|
||||
|
||||
LIT_BYTE
|
||||
|
@ -10,6 +10,8 @@
|
||||
|
||||
#![feature(plugin)]
|
||||
|
||||
#![allow(unstable)]
|
||||
|
||||
extern crate syntax;
|
||||
extern crate rustc;
|
||||
|
||||
@ -164,7 +166,8 @@ fn count(lit: &str) -> usize {
|
||||
lit.chars().take_while(|c| *c == '#').count()
|
||||
}
|
||||
|
||||
fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>) -> TokenAndSpan {
|
||||
fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>, surrogate_pairs_pos: &[usize])
|
||||
-> TokenAndSpan {
|
||||
// old regex:
|
||||
// \[@(?P<seq>\d+),(?P<start>\d+):(?P<end>\d+)='(?P<content>.+?)',<(?P<toknum>-?\d+)>,\d+:\d+]
|
||||
let start = s.find_str("[@").unwrap();
|
||||
@ -213,9 +216,16 @@ fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>) -> TokenAn
|
||||
0
|
||||
};
|
||||
|
||||
let mut lo = start.parse::<u32>().unwrap() - offset;
|
||||
let mut hi = end.parse::<u32>().unwrap() + 1;
|
||||
|
||||
// Adjust the span: For each surrogate pair already encountered, subtract one position.
|
||||
lo -= surrogate_pairs_pos.binary_search(&(lo as usize)).unwrap_or_else(|x| x) as u32;
|
||||
hi -= surrogate_pairs_pos.binary_search(&(hi as usize)).unwrap_or_else(|x| x) as u32;
|
||||
|
||||
let sp = syntax::codemap::Span {
|
||||
lo: syntax::codemap::BytePos(start.parse::<u32>().unwrap() - offset),
|
||||
hi: syntax::codemap::BytePos(end.parse::<u32>().unwrap() + 1),
|
||||
lo: syntax::codemap::BytePos(lo),
|
||||
hi: syntax::codemap::BytePos(hi),
|
||||
expn_id: syntax::codemap::NO_EXPANSION
|
||||
};
|
||||
|
||||
@ -235,11 +245,10 @@ fn tok_cmp(a: &token::Token, b: &token::Token) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
fn span_cmp(rust_sp: syntax::codemap::Span, antlr_sp: syntax::codemap::Span, cm: &syntax::codemap::CodeMap) -> bool {
|
||||
println!("{} {}", cm.bytepos_to_file_charpos(rust_sp.lo).to_uint(), cm.bytepos_to_file_charpos(rust_sp.hi).to_uint());
|
||||
antlr_sp.lo.to_uint() == cm.bytepos_to_file_charpos(rust_sp.lo).to_uint() &&
|
||||
antlr_sp.hi.to_uint() == cm.bytepos_to_file_charpos(rust_sp.hi).to_uint() &&
|
||||
antlr_sp.expn_id == rust_sp.expn_id
|
||||
/// Returns `true` when the span reported by ANTLR and the span produced by
/// rustc's lexer describe the same region.
///
/// `antlr_sp.lo` / `antlr_sp.hi` are compared as-is, while `rust_sp.lo` /
/// `rust_sp.hi` are first converted with `cm.bytepos_to_file_charpos`.
/// NOTE(review): this presumes the ANTLR span already holds file-relative
/// character positions rather than byte offsets -- confirm against the caller.
fn span_cmp(antlr_sp: syntax::codemap::Span, rust_sp: syntax::codemap::Span, cm: &syntax::codemap::CodeMap) -> bool {
|
||||
// Checked first, so the position conversions below are skipped when the
// expansion ids already differ.
antlr_sp.expn_id == rust_sp.expn_id &&
|
||||
// Start of the span: ANTLR value vs. rustc byte position mapped to a char position.
antlr_sp.lo.to_uint() == cm.bytepos_to_file_charpos(rust_sp.lo).to_uint() &&
|
||||
// End of the span, compared the same way.
antlr_sp.hi.to_uint() == cm.bytepos_to_file_charpos(rust_sp.hi).to_uint()
|
||||
}
|
||||
|
||||
fn main() {
|
||||
@ -250,16 +259,18 @@ fn main() {
|
||||
|
||||
let args = std::os::args();
|
||||
|
||||
let mut token_file = File::open(&Path::new(args[2]));
|
||||
let token_map = parse_token_list(token_file.read_to_string().unwrap());
|
||||
|
||||
let mut stdin = std::io::stdin();
|
||||
let mut lock = stdin.lock();
|
||||
let lines = lock.lines();
|
||||
let mut antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().trim(),
|
||||
&token_map));
|
||||
|
||||
// Rust's lexer
|
||||
let code = File::open(&Path::new(args[1])).unwrap().read_to_string().unwrap();
|
||||
|
||||
let surrogate_pairs_pos: Vec<usize> = code.chars().enumerate()
|
||||
.filter(|&(_, c)| c as usize > 0xFFFF)
|
||||
.map(|(n, _)| n)
|
||||
.enumerate()
|
||||
.map(|(x, n)| x + n)
|
||||
.collect();
|
||||
|
||||
debug!("Pairs: {:?}", surrogate_pairs_pos);
|
||||
|
||||
let options = config::basic_options();
|
||||
let session = session::build_session(options, None,
|
||||
syntax::diagnostics::registry::Registry::new(&[]));
|
||||
@ -269,13 +280,25 @@ fn main() {
|
||||
let mut lexer = lexer::StringReader::new(session.diagnostic(), filemap);
|
||||
let ref cm = lexer.span_diagnostic.cm;
|
||||
|
||||
// ANTLR
|
||||
let mut token_file = File::open(&Path::new(args[2]));
|
||||
let token_map = parse_token_list(token_file.read_to_string().unwrap());
|
||||
|
||||
let mut stdin = std::io::stdin();
|
||||
let mut lock = stdin.lock();
|
||||
let lines = lock.lines();
|
||||
let mut antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().trim(),
|
||||
&token_map,
|
||||
&surrogate_pairs_pos[]));
|
||||
|
||||
for antlr_tok in antlr_tokens {
|
||||
let rustc_tok = next(&mut lexer);
|
||||
if rustc_tok.tok == token::Eof && antlr_tok.tok == token::Eof {
|
||||
continue
|
||||
}
|
||||
|
||||
assert!(span_cmp(rustc_tok.sp, antlr_tok.sp, cm), "{:?} and {:?} have different spans", rustc_tok,
|
||||
assert!(span_cmp(antlr_tok.sp, rustc_tok.sp, cm), "{:?} and {:?} have different spans",
|
||||
rustc_tok,
|
||||
antlr_tok);
|
||||
|
||||
macro_rules! matches {
|
||||
|
@ -372,4 +372,102 @@ fragment XID_Continue:
|
||||
| '\uffca' .. '\uffcf'
|
||||
| '\uffd2' .. '\uffd7'
|
||||
| '\uffda' .. '\uffdc'
|
||||
| '\ud800' '\udc00' .. '\udc0a'
|
||||
| '\ud800' '\udc0d' .. '\udc25'
|
||||
| '\ud800' '\udc28' .. '\udc39'
|
||||
| '\ud800' '\udc3c' .. '\udc3c'
|
||||
| '\ud800' '\udc3f' .. '\udc4c'
|
||||
| '\ud800' '\udc50' .. '\udc5c'
|
||||
| '\ud800' '\udc80' .. '\udcf9'
|
||||
| '\ud800' '\udf00' .. '\udf1d'
|
||||
| '\ud800' '\udf30' .. '\udf49'
|
||||
| '\ud800' '\udf80' .. '\udf9c'
|
||||
| '\ud801' '\ue000' .. '\ue09c'
|
||||
| '\ud801' '\ue0a0' .. '\ue0a8'
|
||||
| '\ud802' '\ue400' .. '\ue404'
|
||||
| '\ud802' '\u0808'
|
||||
| '\ud802' '\ue40a' .. '\ue434'
|
||||
| '\ud802' '\ue437' .. '\ue437'
|
||||
| '\ud802' '\u083c'
|
||||
| '\ud802' '\u083f'
|
||||
| '\ud834' '\uad65' .. '\uad68'
|
||||
| '\ud834' '\uad6d' .. '\uad71'
|
||||
| '\ud834' '\uad7b' .. '\uad81'
|
||||
| '\ud834' '\uad85' .. '\uad8a'
|
||||
| '\ud834' '\uadaa' .. '\uadac'
|
||||
| '\ud835' '\ub000' .. '\ub053'
|
||||
| '\ud835' '\ub056' .. '\ub09b'
|
||||
| '\ud835' '\ub09e' .. '\ub09e'
|
||||
| '\ud835' '\ud4a2'
|
||||
| '\ud835' '\ub0a5' .. '\ub0a5'
|
||||
| '\ud835' '\ub0a9' .. '\ub0ab'
|
||||
| '\ud835' '\ub0ae' .. '\ub0b8'
|
||||
| '\ud835' '\ud4bb'
|
||||
| '\ud835' '\ub0bd' .. '\ub0c2'
|
||||
| '\ud835' '\ub0c5' .. '\ub104'
|
||||
| '\ud835' '\ub107' .. '\ub109'
|
||||
| '\ud835' '\ub10d' .. '\ub113'
|
||||
| '\ud835' '\ub116' .. '\ub11b'
|
||||
| '\ud835' '\ub11e' .. '\ub138'
|
||||
| '\ud835' '\ub13b' .. '\ub13d'
|
||||
| '\ud835' '\ub140' .. '\ub143'
|
||||
| '\ud835' '\ud546'
|
||||
| '\ud835' '\ub14a' .. '\ub14f'
|
||||
| '\ud835' '\ub152' .. '\ub2a2'
|
||||
| '\ud835' '\ub2a8' .. '\ub2bf'
|
||||
| '\ud835' '\ub2c2' .. '\ub2d9'
|
||||
| '\ud835' '\ub2dc' .. '\ub2f9'
|
||||
| '\ud835' '\ub2fc' .. '\ub313'
|
||||
| '\ud835' '\ub316' .. '\ub333'
|
||||
| '\ud835' '\ub336' .. '\ub34d'
|
||||
| '\ud835' '\ub350' .. '\ub36d'
|
||||
| '\ud835' '\ub370' .. '\ub387'
|
||||
| '\ud835' '\ub38a' .. '\ub3a7'
|
||||
| '\ud835' '\ub3aa' .. '\ub3c1'
|
||||
| '\ud835' '\ub3c4' .. '\ub3c8'
|
||||
| '\ud835' '\ub3ce' .. '\ub3fe'
|
||||
| '\ud840' '\udc00' .. '\udffe'
|
||||
| '\ud841' '\ue000' .. '\ue3fe'
|
||||
| '\ud842' '\ue400' .. '\ue7fe'
|
||||
| '\ud843' '\ue800' .. '\uebfe'
|
||||
| '\ud844' '\uec00' .. '\ueffe'
|
||||
| '\ud845' '\uf000' .. '\uf3fe'
|
||||
| '\ud846' '\uf400' .. '\uf7fe'
|
||||
| '\ud847' '\uf800' .. '\ufbfe'
|
||||
| '\ud848' '\ufc00' .. '\ufffe'
|
||||
| '\ud849' '\u0000' .. '\u03fe'
|
||||
| '\ud84a' '\u0400' .. '\u07fe'
|
||||
| '\ud84b' '\u0800' .. '\u0bfe'
|
||||
| '\ud84c' '\u0c00' .. '\u0ffe'
|
||||
| '\ud84d' '\u1000' .. '\u13fe'
|
||||
| '\ud84e' '\u1400' .. '\u17fe'
|
||||
| '\ud84f' '\u1800' .. '\u1bfe'
|
||||
| '\ud850' '\u1c00' .. '\u1ffe'
|
||||
| '\ud851' '\u2000' .. '\u23fe'
|
||||
| '\ud852' '\u2400' .. '\u27fe'
|
||||
| '\ud853' '\u2800' .. '\u2bfe'
|
||||
| '\ud854' '\u2c00' .. '\u2ffe'
|
||||
| '\ud855' '\u3000' .. '\u33fe'
|
||||
| '\ud856' '\u3400' .. '\u37fe'
|
||||
| '\ud857' '\u3800' .. '\u3bfe'
|
||||
| '\ud858' '\u3c00' .. '\u3ffe'
|
||||
| '\ud859' '\u4000' .. '\u43fe'
|
||||
| '\ud85a' '\u4400' .. '\u47fe'
|
||||
| '\ud85b' '\u4800' .. '\u4bfe'
|
||||
| '\ud85c' '\u4c00' .. '\u4ffe'
|
||||
| '\ud85d' '\u5000' .. '\u53fe'
|
||||
| '\ud85e' '\u5400' .. '\u57fe'
|
||||
| '\ud85f' '\u5800' .. '\u5bfe'
|
||||
| '\ud860' '\u5c00' .. '\u5ffe'
|
||||
| '\ud861' '\u6000' .. '\u63fe'
|
||||
| '\ud862' '\u6400' .. '\u67fe'
|
||||
| '\ud863' '\u6800' .. '\u6bfe'
|
||||
| '\ud864' '\u6c00' .. '\u6ffe'
|
||||
| '\ud865' '\u7000' .. '\u73fe'
|
||||
| '\ud866' '\u7400' .. '\u77fe'
|
||||
| '\ud867' '\u7800' .. '\u7bfe'
|
||||
| '\ud868' '\u7c00' .. '\u7ffe'
|
||||
| '\ud869' '\u8000' .. '\u82d5'
|
||||
| '\ud87e' '\ud400' .. '\ud61c'
|
||||
| '\udb40' '\udd00' .. '\uddee'
|
||||
;
|
||||
|
@ -286,4 +286,94 @@ fragment XID_Start :
|
||||
| '\uffca' .. '\uffcf'
|
||||
| '\uffd2' .. '\uffd7'
|
||||
| '\uffda' .. '\uffdc'
|
||||
| '\ud800' '\udc00' .. '\udc0a'
|
||||
| '\ud800' '\udc0d' .. '\udc25'
|
||||
| '\ud800' '\udc28' .. '\udc39'
|
||||
| '\ud800' '\udc3c' .. '\udc3c'
|
||||
| '\ud800' '\udc3f' .. '\udc4c'
|
||||
| '\ud800' '\udc50' .. '\udc5c'
|
||||
| '\ud800' '\udc80' .. '\udcf9'
|
||||
| '\ud800' '\udf00' .. '\udf1d'
|
||||
| '\ud800' '\udf30' .. '\udf49'
|
||||
| '\ud800' '\udf80' .. '\udf9c'
|
||||
| '\ud801' '\ue000' .. '\ue09c'
|
||||
| '\ud802' '\ue400' .. '\ue404'
|
||||
| '\ud802' '\u0808'
|
||||
| '\ud802' '\ue40a' .. '\ue434'
|
||||
| '\ud802' '\ue437' .. '\ue437'
|
||||
| '\ud802' '\u083c'
|
||||
| '\ud802' '\u083f'
|
||||
| '\ud835' '\ub000' .. '\ub053'
|
||||
| '\ud835' '\ub056' .. '\ub09b'
|
||||
| '\ud835' '\ub09e' .. '\ub09e'
|
||||
| '\ud835' '\ud4a2'
|
||||
| '\ud835' '\ub0a5' .. '\ub0a5'
|
||||
| '\ud835' '\ub0a9' .. '\ub0ab'
|
||||
| '\ud835' '\ub0ae' .. '\ub0b8'
|
||||
| '\ud835' '\ud4bb'
|
||||
| '\ud835' '\ub0bd' .. '\ub0c2'
|
||||
| '\ud835' '\ub0c5' .. '\ub104'
|
||||
| '\ud835' '\ub107' .. '\ub109'
|
||||
| '\ud835' '\ub10d' .. '\ub113'
|
||||
| '\ud835' '\ub116' .. '\ub11b'
|
||||
| '\ud835' '\ub11e' .. '\ub138'
|
||||
| '\ud835' '\ub13b' .. '\ub13d'
|
||||
| '\ud835' '\ub140' .. '\ub143'
|
||||
| '\ud835' '\ud546'
|
||||
| '\ud835' '\ub14a' .. '\ub14f'
|
||||
| '\ud835' '\ub152' .. '\ub2a2'
|
||||
| '\ud835' '\ub2a8' .. '\ub2bf'
|
||||
| '\ud835' '\ub2c2' .. '\ub2d9'
|
||||
| '\ud835' '\ub2dc' .. '\ub2f9'
|
||||
| '\ud835' '\ub2fc' .. '\ub313'
|
||||
| '\ud835' '\ub316' .. '\ub333'
|
||||
| '\ud835' '\ub336' .. '\ub34d'
|
||||
| '\ud835' '\ub350' .. '\ub36d'
|
||||
| '\ud835' '\ub370' .. '\ub387'
|
||||
| '\ud835' '\ub38a' .. '\ub3a7'
|
||||
| '\ud835' '\ub3aa' .. '\ub3c1'
|
||||
| '\ud835' '\ub3c4' .. '\ub3c8'
|
||||
| '\ud840' '\udc00' .. '\udffe'
|
||||
| '\ud841' '\ue000' .. '\ue3fe'
|
||||
| '\ud842' '\ue400' .. '\ue7fe'
|
||||
| '\ud843' '\ue800' .. '\uebfe'
|
||||
| '\ud844' '\uec00' .. '\ueffe'
|
||||
| '\ud845' '\uf000' .. '\uf3fe'
|
||||
| '\ud846' '\uf400' .. '\uf7fe'
|
||||
| '\ud847' '\uf800' .. '\ufbfe'
|
||||
| '\ud848' '\ufc00' .. '\ufffe'
|
||||
| '\ud849' '\u0000' .. '\u03fe'
|
||||
| '\ud84a' '\u0400' .. '\u07fe'
|
||||
| '\ud84b' '\u0800' .. '\u0bfe'
|
||||
| '\ud84c' '\u0c00' .. '\u0ffe'
|
||||
| '\ud84d' '\u1000' .. '\u13fe'
|
||||
| '\ud84e' '\u1400' .. '\u17fe'
|
||||
| '\ud84f' '\u1800' .. '\u1bfe'
|
||||
| '\ud850' '\u1c00' .. '\u1ffe'
|
||||
| '\ud851' '\u2000' .. '\u23fe'
|
||||
| '\ud852' '\u2400' .. '\u27fe'
|
||||
| '\ud853' '\u2800' .. '\u2bfe'
|
||||
| '\ud854' '\u2c00' .. '\u2ffe'
|
||||
| '\ud855' '\u3000' .. '\u33fe'
|
||||
| '\ud856' '\u3400' .. '\u37fe'
|
||||
| '\ud857' '\u3800' .. '\u3bfe'
|
||||
| '\ud858' '\u3c00' .. '\u3ffe'
|
||||
| '\ud859' '\u4000' .. '\u43fe'
|
||||
| '\ud85a' '\u4400' .. '\u47fe'
|
||||
| '\ud85b' '\u4800' .. '\u4bfe'
|
||||
| '\ud85c' '\u4c00' .. '\u4ffe'
|
||||
| '\ud85d' '\u5000' .. '\u53fe'
|
||||
| '\ud85e' '\u5400' .. '\u57fe'
|
||||
| '\ud85f' '\u5800' .. '\u5bfe'
|
||||
| '\ud860' '\u5c00' .. '\u5ffe'
|
||||
| '\ud861' '\u6000' .. '\u63fe'
|
||||
| '\ud862' '\u6400' .. '\u67fe'
|
||||
| '\ud863' '\u6800' .. '\u6bfe'
|
||||
| '\ud864' '\u6c00' .. '\u6ffe'
|
||||
| '\ud865' '\u7000' .. '\u73fe'
|
||||
| '\ud866' '\u7400' .. '\u77fe'
|
||||
| '\ud867' '\u7800' .. '\u7bfe'
|
||||
| '\ud868' '\u7c00' .. '\u7ffe'
|
||||
| '\ud869' '\u8000' .. '\u82d5'
|
||||
| '\ud87e' '\ud400' .. '\ud61c'
|
||||
;
|
||||
|
Loading…
Reference in New Issue
Block a user