From 43cae880790b532e5b3e2fb99c64a4ec78030652 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Steinbrink?= Date: Fri, 7 Jun 2013 20:00:37 +0200 Subject: [PATCH] Lexer: Fix offset handling in get_str_from() As the comment said, the subtraction is bogus for multibyte characters. Fortunately, we can just use last_pos instead of pos to get the correct position without any subtraction hackery. --- src/libsyntax/parse/comments.rs | 2 +- src/libsyntax/parse/lexer.rs | 16 +++++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/libsyntax/parse/comments.rs b/src/libsyntax/parse/comments.rs index dab8dd3b4b6..54fba29a19a 100644 --- a/src/libsyntax/parse/comments.rs +++ b/src/libsyntax/parse/comments.rs @@ -347,7 +347,7 @@ pub fn gather_comments_and_literals(span_diagnostic: } - let bstart = rdr.pos; + let bstart = rdr.last_pos; rdr.next_token(); //discard, and look ahead; we're working with internal state let TokenAndSpan {tok: tok, sp: sp} = rdr.peek(); diff --git a/src/libsyntax/parse/lexer.rs b/src/libsyntax/parse/lexer.rs index 84700f052c9..172e7f15458 100644 --- a/src/libsyntax/parse/lexer.rs +++ b/src/libsyntax/parse/lexer.rs @@ -161,22 +161,20 @@ fn string_advance_token(r: @mut StringReader) { } } -fn byte_offset(rdr: &StringReader) -> BytePos { - (rdr.pos - rdr.filemap.start_pos) +fn byte_offset(rdr: &StringReader, pos: BytePos) -> BytePos { + (pos - rdr.filemap.start_pos) } pub fn get_str_from(rdr: @mut StringReader, start: BytePos) -> ~str { - // I'm pretty skeptical about this subtraction. What if there's a - // multi-byte character before the mark? - return str::slice(*rdr.src, start.to_uint() - 1u, - byte_offset(rdr).to_uint() - 1u).to_owned(); + return str::slice(*rdr.src, start.to_uint(), + byte_offset(rdr, rdr.last_pos).to_uint()).to_owned(); } // EFFECT: advance the StringReader by one character. If a newline is // discovered, add it to the FileMap's list of line start offsets. pub fn bump(rdr: &mut StringReader) { rdr.last_pos = rdr.pos; - let current_byte_offset = byte_offset(rdr).to_uint();; + let current_byte_offset = byte_offset(rdr, rdr.pos).to_uint(); if current_byte_offset < (*rdr.src).len() { assert!(rdr.curr != -1 as char); let last_char = rdr.curr; @@ -202,7 +200,7 @@ pub fn is_eof(rdr: @mut StringReader) -> bool { rdr.curr == -1 as char } pub fn nextch(rdr: @mut StringReader) -> char { - let offset = byte_offset(rdr).to_uint(); + let offset = byte_offset(rdr, rdr.pos).to_uint(); if offset < (*rdr.src).len() { return str::char_at(*rdr.src, offset); } else { return -1 as char; } @@ -692,7 +690,7 @@ fn next_token_inner(rdr: @mut StringReader) -> token::Token { return token::LIT_INT(c2 as i64, ast::ty_char); } '"' => { - let n = byte_offset(rdr); + let n = byte_offset(rdr, rdr.last_pos); bump(rdr); while rdr.curr != '"' { if is_eof(rdr) {