From 257d279fe47bbf3431c76f0942654c1bcf60d501 Mon Sep 17 00:00:00 2001 From: Michael Woerister Date: Wed, 23 May 2018 15:59:42 +0200 Subject: [PATCH 1/8] Make FileMap::{lines, multibyte_chars, non_narrow_chars} non-mutable. --- src/librustc/ich/impls_syntax.rs | 30 ++--- src/librustc/ty/query/on_disk_cache.rs | 2 +- src/librustc_metadata/decoder.rs | 9 +- src/libsyntax/codemap.rs | 40 +++---- src/libsyntax/parse/lexer/mod.rs | 20 +--- src/libsyntax_pos/lib.rs | 158 ++++++++++++++----------- 6 files changed, 121 insertions(+), 138 deletions(-) diff --git a/src/librustc/ich/impls_syntax.rs b/src/librustc/ich/impls_syntax.rs index 935bc4c8c6d..a7a6a71474f 100644 --- a/src/librustc/ich/impls_syntax.rs +++ b/src/librustc/ich/impls_syntax.rs @@ -455,27 +455,21 @@ impl<'a> HashStable> for FileMap { src_hash.hash_stable(hcx, hasher); // We only hash the relative position within this filemap - lines.with_lock(|lines| { - lines.len().hash_stable(hcx, hasher); - for &line in lines.iter() { - stable_byte_pos(line, start_pos).hash_stable(hcx, hasher); - } - }); + lines.len().hash_stable(hcx, hasher); + for &line in lines.iter() { + stable_byte_pos(line, start_pos).hash_stable(hcx, hasher); + } // We only hash the relative position within this filemap - multibyte_chars.with_lock(|multibyte_chars| { - multibyte_chars.len().hash_stable(hcx, hasher); - for &char_pos in multibyte_chars.iter() { - stable_multibyte_char(char_pos, start_pos).hash_stable(hcx, hasher); - } - }); + multibyte_chars.len().hash_stable(hcx, hasher); + for &char_pos in multibyte_chars.iter() { + stable_multibyte_char(char_pos, start_pos).hash_stable(hcx, hasher); + } - non_narrow_chars.with_lock(|non_narrow_chars| { - non_narrow_chars.len().hash_stable(hcx, hasher); - for &char_pos in non_narrow_chars.iter() { - stable_non_narrow_char(char_pos, start_pos).hash_stable(hcx, hasher); - } - }); + non_narrow_chars.len().hash_stable(hcx, hasher); + for &char_pos in non_narrow_chars.iter() { + 
stable_non_narrow_char(char_pos, start_pos).hash_stable(hcx, hasher); + } } } diff --git a/src/librustc/ty/query/on_disk_cache.rs b/src/librustc/ty/query/on_disk_cache.rs index 3285380c823..7aa6f3a55ad 100644 --- a/src/librustc/ty/query/on_disk_cache.rs +++ b/src/librustc/ty/query/on_disk_cache.rs @@ -623,7 +623,7 @@ impl<'a, 'tcx, 'x> SpecializedDecoder for CacheDecoder<'a, 'tcx, 'x> { let len = BytePos::decode(self)?; let file_lo = self.file_index_to_file(file_lo_index); - let lo = file_lo.lines.borrow()[line_lo - 1] + col_lo; + let lo = file_lo.lines[line_lo - 1] + col_lo; let hi = lo + len; let expn_info_tag = u8::decode(self)?; diff --git a/src/librustc_metadata/decoder.rs b/src/librustc_metadata/decoder.rs index 9e4f695d28f..a01e0b60864 100644 --- a/src/librustc_metadata/decoder.rs +++ b/src/librustc_metadata/decoder.rs @@ -1138,9 +1138,9 @@ impl<'a, 'tcx> CrateMetadata { src_hash, start_pos, end_pos, - lines, - multibyte_chars, - non_narrow_chars, + mut lines, + mut multibyte_chars, + mut non_narrow_chars, name_hash, .. } = filemap_to_import; @@ -1151,15 +1151,12 @@ impl<'a, 'tcx> CrateMetadata { // `CodeMap::new_imported_filemap()` will then translate those // coordinates to their new global frame of reference when the // offset of the FileMap is known. - let mut lines = lines.into_inner(); for pos in &mut lines { *pos = *pos - start_pos; } - let mut multibyte_chars = multibyte_chars.into_inner(); for mbc in &mut multibyte_chars { mbc.pos = mbc.pos - start_pos; } - let mut non_narrow_chars = non_narrow_chars.into_inner(); for swc in &mut non_narrow_chars { *swc = *swc - start_pos; } diff --git a/src/libsyntax/codemap.rs b/src/libsyntax/codemap.rs index 8e4b7660a1c..000f1607514 100644 --- a/src/libsyntax/codemap.rs +++ b/src/libsyntax/codemap.rs @@ -250,16 +250,7 @@ impl CodeMap { /// Creates a new filemap and sets its line information. /// This does not ensure that only one FileMap exists per file name. 
pub fn new_filemap_and_lines(&self, filename: &Path, src: &str) -> Lrc { - let fm = self.new_filemap(filename.to_owned().into(), src.to_owned()); - let mut byte_pos: u32 = fm.start_pos.0; - for line in src.lines() { - // register the start of this line - fm.next_line(BytePos(byte_pos)); - - // update byte_pos to include this line and the \n at the end - byte_pos += line.len() as u32 + 1; - } - fm + self.new_filemap(filename.to_owned().into(), src.to_owned()) } @@ -305,9 +296,9 @@ impl CodeMap { external_src: Lock::new(ExternalSource::AbsentOk), start_pos, end_pos, - lines: Lock::new(file_local_lines), - multibyte_chars: Lock::new(file_local_multibyte_chars), - non_narrow_chars: Lock::new(file_local_non_narrow_chars), + lines: file_local_lines, + multibyte_chars: file_local_multibyte_chars, + non_narrow_chars: file_local_non_narrow_chars, name_hash, }); @@ -345,21 +336,22 @@ impl CodeMap { match self.lookup_line(pos) { Ok(FileMapAndLine { fm: f, line: a }) => { let line = a + 1; // Line numbers start at 1 - let linebpos = (*f.lines.borrow())[a]; + let linebpos = f.lines[a]; let linechpos = self.bytepos_to_file_charpos(linebpos); let col = chpos - linechpos; let col_display = { - let non_narrow_chars = f.non_narrow_chars.borrow(); - let start_width_idx = non_narrow_chars + let start_width_idx = f + .non_narrow_chars .binary_search_by_key(&linebpos, |x| x.pos()) .unwrap_or_else(|x| x); - let end_width_idx = non_narrow_chars + let end_width_idx = f + .non_narrow_chars .binary_search_by_key(&pos, |x| x.pos()) .unwrap_or_else(|x| x); let special_chars = end_width_idx - start_width_idx; - let non_narrow: usize = - non_narrow_chars[start_width_idx..end_width_idx] + let non_narrow: usize = f + .non_narrow_chars[start_width_idx..end_width_idx] .into_iter() .map(|x| x.width()) .sum(); @@ -380,12 +372,12 @@ impl CodeMap { } Err(f) => { let col_display = { - let non_narrow_chars = f.non_narrow_chars.borrow(); - let end_width_idx = non_narrow_chars + let end_width_idx = f + 
.non_narrow_chars .binary_search_by_key(&pos, |x| x.pos()) .unwrap_or_else(|x| x); - let non_narrow: usize = - non_narrow_chars[0..end_width_idx] + let non_narrow: usize = f + .non_narrow_chars[0..end_width_idx] .into_iter() .map(|x| x.width()) .sum(); @@ -830,7 +822,7 @@ impl CodeMap { // The number of extra bytes due to multibyte chars in the FileMap let mut total_extra_bytes = 0; - for mbc in map.multibyte_chars.borrow().iter() { + for mbc in map.multibyte_chars.iter() { debug!("{}-byte char at {:?}", mbc.bytes, mbc.pos); if mbc.pos < bpos { // every character is at least one byte, so we only diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index c09cfd910d2..dcc71e78778 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -51,11 +51,7 @@ pub struct StringReader<'a> { pub ch: Option, pub filemap: Lrc, /// Stop reading src at this index. - end_src_index: usize, - /// Whether to record new-lines and multibyte chars in filemap. - /// This is only necessary the first time a filemap is lexed. - /// If part of a filemap is being re-lexed, this should be set to false. - save_new_lines_and_multibyte: bool, + pub end_src_index: usize, // cached: peek_tok: token::Token, peek_span: Span, @@ -188,7 +184,6 @@ impl<'a> StringReader<'a> { ch: Some('\n'), filemap, end_src_index: src.len(), - save_new_lines_and_multibyte: true, // dummy values; not read peek_tok: token::Eof, peek_span: syntax_pos::DUMMY_SP, @@ -225,7 +220,6 @@ impl<'a> StringReader<'a> { let mut sr = StringReader::new_raw_internal(sess, begin.fm, None); // Seek the lexer to the right byte range. 
- sr.save_new_lines_and_multibyte = false; sr.next_pos = span.lo(); sr.end_src_index = sr.src_index(span.hi()); @@ -458,18 +452,6 @@ impl<'a> StringReader<'a> { let next_ch = char_at(&self.src, next_src_index); let next_ch_len = next_ch.len_utf8(); - if self.ch.unwrap() == '\n' { - if self.save_new_lines_and_multibyte { - self.filemap.next_line(self.next_pos); - } - } - if next_ch_len > 1 { - if self.save_new_lines_and_multibyte { - self.filemap.record_multibyte_char(self.next_pos, next_ch_len); - } - } - self.filemap.record_width(self.next_pos, next_ch); - self.ch = Some(next_ch); self.pos = self.next_pos; self.next_pos = self.next_pos + Pos::from_usize(next_ch_len); diff --git a/src/libsyntax_pos/lib.rs b/src/libsyntax_pos/lib.rs index 756e0c059a7..266737dd7b6 100644 --- a/src/libsyntax_pos/lib.rs +++ b/src/libsyntax_pos/lib.rs @@ -779,11 +779,11 @@ pub struct FileMap { /// The end position of this source in the CodeMap pub end_pos: BytePos, /// Locations of lines beginnings in the source code - pub lines: Lock>, + pub lines: Vec, /// Locations of multi-byte characters in the source code - pub multibyte_chars: Lock>, + pub multibyte_chars: Vec, /// Width of characters that are not narrow in the source code - pub non_narrow_chars: Lock>, + pub non_narrow_chars: Vec, /// A hash of the filename, used for speeding up the incr. comp. hashing. 
pub name_hash: u128, } @@ -797,7 +797,7 @@ impl Encodable for FileMap { s.emit_struct_field("start_pos", 4, |s| self.start_pos.encode(s))?; s.emit_struct_field("end_pos", 5, |s| self.end_pos.encode(s))?; s.emit_struct_field("lines", 6, |s| { - let lines = self.lines.borrow(); + let lines = &self.lines[..]; // store the length s.emit_u32(lines.len() as u32)?; @@ -843,10 +843,10 @@ impl Encodable for FileMap { Ok(()) })?; s.emit_struct_field("multibyte_chars", 7, |s| { - (*self.multibyte_chars.borrow()).encode(s) + self.multibyte_chars.encode(s) })?; s.emit_struct_field("non_narrow_chars", 8, |s| { - (*self.non_narrow_chars.borrow()).encode(s) + self.non_narrow_chars.encode(s) })?; s.emit_struct_field("name_hash", 9, |s| { self.name_hash.encode(s) @@ -914,9 +914,9 @@ impl Decodable for FileMap { src: None, src_hash, external_src: Lock::new(ExternalSource::AbsentOk), - lines: Lock::new(lines), - multibyte_chars: Lock::new(multibyte_chars), - non_narrow_chars: Lock::new(non_narrow_chars), + lines, + multibyte_chars, + non_narrow_chars, name_hash, }) }) @@ -949,6 +949,9 @@ impl FileMap { }; let end_pos = start_pos.to_usize() + src.len(); + let (lines, multibyte_chars, non_narrow_chars) = + Self::find_newlines_and_special_chars(&src[..], start_pos); + FileMap { name, name_was_remapped, @@ -959,34 +962,81 @@ impl FileMap { external_src: Lock::new(ExternalSource::Unneeded), start_pos, end_pos: Pos::from_usize(end_pos), - lines: Lock::new(Vec::new()), - multibyte_chars: Lock::new(Vec::new()), - non_narrow_chars: Lock::new(Vec::new()), + lines, + multibyte_chars, + non_narrow_chars, name_hash, } } - /// EFFECT: register a start-of-line offset in the - /// table of line-beginnings. 
- /// UNCHECKED INVARIANT: these offsets must be added in the right - /// order and must be in the right places; there is shared knowledge - /// about what ends a line between this file and parse.rs - /// WARNING: pos param here is the offset relative to start of CodeMap, - /// and CodeMap will append a newline when adding a filemap without a newline at the end, - /// so the safe way to call this is with value calculated as - /// filemap.start_pos + newline_offset_relative_to_the_start_of_filemap. - pub fn next_line(&self, pos: BytePos) { - // the new charpos must be > the last one (or it's the first one). - let mut lines = self.lines.borrow_mut(); - let line_len = lines.len(); - assert!(line_len == 0 || ((*lines)[line_len - 1] < pos)); - lines.push(pos); + fn find_newlines_and_special_chars(src: &str, filemap_start_pos: BytePos) + -> (Vec, Vec, Vec) { + + let mut index = 0; + let mut lines = vec![filemap_start_pos]; + let mut multibyte_chars = vec![]; + let mut non_narrow_chars = vec![]; + + while index < src.len() { + let byte_pos = BytePos::from_usize(index) + filemap_start_pos; + let byte = src.as_bytes()[index]; + + if byte.is_ascii() { + match byte { + b'\n' => { + lines.push(byte_pos + BytePos(1)); + } + b'\t' => { + // Tabs will consume 4 columns. + non_narrow_chars.push(NonNarrowChar::new(byte_pos, 4)); + } + c => if c.is_ascii_control() { + // Assume control characters are zero width. + non_narrow_chars.push(NonNarrowChar::new(byte_pos, 0)); + } + } + + index += 1; + } else { + let c = (&src[index..]).chars().next().unwrap(); + let c_len = c.len_utf8(); + + if c_len > 1 { + assert!(c_len >=2 && c_len <= 4); + let mbc = MultiByteChar { + pos: byte_pos, + bytes: c_len, + }; + multibyte_chars.push(mbc); + } + + // Assume control characters are zero width. + // FIXME: How can we decide between `width` and `width_cjk`? 
+ let c_width = unicode_width::UnicodeWidthChar::width(c).unwrap_or(0); + + if c_width != 1 { + non_narrow_chars.push(NonNarrowChar::new(byte_pos, c_width)); + } + + index += c_len; + } + } + + // The loop above optimistically registers a new line *after* each of \n + // it encounters. If that point is already outside the filemap, remove + // it again. + if let Some(&last_line_start) = lines.last() { + if last_line_start == filemap_start_pos + BytePos::from_usize(src.len()) { + lines.pop(); + } + } + + (lines, multibyte_chars, non_narrow_chars) } /// Return the BytePos of the beginning of the current line. pub fn line_begin_pos(&self) -> BytePos { - let lines = self.lines.borrow(); - match lines.last() { + match self.lines.last() { Some(&line_pos) => line_pos, None => self.start_pos, } @@ -1040,8 +1090,7 @@ impl FileMap { } let begin = { - let lines = self.lines.borrow(); - let line = if let Some(line) = lines.get(line_number) { + let line = if let Some(line) = self.lines.get(line_number) { line } else { return None; @@ -1059,35 +1108,6 @@ impl FileMap { } } - pub fn record_multibyte_char(&self, pos: BytePos, bytes: usize) { - assert!(bytes >=2 && bytes <= 4); - let mbc = MultiByteChar { - pos, - bytes, - }; - self.multibyte_chars.borrow_mut().push(mbc); - } - - #[inline] - pub fn record_width(&self, pos: BytePos, ch: char) { - let width = match ch { - '\t' => - // Tabs will consume 4 columns. - 4, - '\n' => - // Make newlines take one column so that displayed spans can point them. - 1, - ch => - // Assume control characters are zero width. - // FIXME: How can we decide between `width` and `width_cjk`? - unicode_width::UnicodeWidthChar::width(ch).unwrap_or(0), - }; - // Only record non-narrow characters. 
- if width != 1 { - self.non_narrow_chars.borrow_mut().push(NonNarrowChar::new(pos, width)); - } - } - pub fn is_real_file(&self) -> bool { self.name.is_real() } @@ -1100,7 +1120,7 @@ impl FileMap { self.end_pos.0 - self.start_pos.0 } pub fn count_lines(&self) -> usize { - self.lines.borrow().len() + self.lines.len() } /// Find the line containing the given position. The return value is the @@ -1108,13 +1128,12 @@ impl FileMap { /// number. If the filemap is empty or the position is located before the /// first line, None is returned. pub fn lookup_line(&self, pos: BytePos) -> Option { - let lines = self.lines.borrow(); - if lines.len() == 0 { + if self.lines.len() == 0 { return None; } - let line_index = lookup_line(&lines[..], pos); - assert!(line_index < lines.len() as isize); + let line_index = lookup_line(&self.lines[..], pos); + assert!(line_index < self.lines.len() as isize); if line_index >= 0 { Some(line_index as usize) } else { @@ -1127,12 +1146,11 @@ impl FileMap { return (self.start_pos, self.end_pos); } - let lines = self.lines.borrow(); - assert!(line_index < lines.len()); - if line_index == (lines.len() - 1) { - (lines[line_index], self.end_pos) + assert!(line_index < self.lines.len()); + if line_index == (self.lines.len() - 1) { + (self.lines[line_index], self.end_pos) } else { - (lines[line_index], lines[line_index + 1]) + (self.lines[line_index], self.lines[line_index + 1]) } } From 095a339bec62e81e38727d16cc8f275c6818061e Mon Sep 17 00:00:00 2001 From: Michael Woerister Date: Wed, 23 May 2018 16:19:20 +0200 Subject: [PATCH 2/8] Remove the now redundant CodeMap::new_filemap_with_lines() method. 
--- src/libsyntax/codemap.rs | 14 +++----------- src/libsyntax/ext/expand.rs | 6 ++++-- src/libsyntax/ext/source_util.rs | 8 +++++--- src/libsyntax/test_snippet.rs | 2 +- 4 files changed, 13 insertions(+), 17 deletions(-) diff --git a/src/libsyntax/codemap.rs b/src/libsyntax/codemap.rs index 000f1607514..5e23c1b0d62 100644 --- a/src/libsyntax/codemap.rs +++ b/src/libsyntax/codemap.rs @@ -211,8 +211,7 @@ impl CodeMap { } } - /// Creates a new filemap without setting its line information. If you don't - /// intend to set the line information yourself, you should use new_filemap_and_lines. + /// Creates a new filemap. /// This does not ensure that only one FileMap exists per file name. pub fn new_filemap(&self, filename: FileName, src: String) -> Lrc { let start_pos = self.next_start_pos(); @@ -247,13 +246,6 @@ impl CodeMap { filemap } - /// Creates a new filemap and sets its line information. - /// This does not ensure that only one FileMap exists per file name. - pub fn new_filemap_and_lines(&self, filename: &Path, src: &str) -> Lrc { - self.new_filemap(filename.to_owned().into(), src.to_owned()) - } - - /// Allocates a new FileMap representing a source file from an external /// crate. 
The source code of such an "imported filemap" is not available, /// but we still know enough to generate accurate debuginfo location @@ -1188,7 +1180,7 @@ mod tests { let cm = CodeMap::new(FilePathMapping::empty()); let inputtext = "aaaaa\nbbbbBB\nCCC\nDDDDDddddd\neee\n"; let selection = " \n ~~\n~~~\n~~~~~ \n \n"; - cm.new_filemap_and_lines(Path::new("blork.rs"), inputtext); + cm.new_filemap(Path::new("blork.rs").to_owned().into(), inputtext.to_string()); let span = span_from_selection(inputtext, selection); // check that we are extracting the text we thought we were extracting @@ -1231,7 +1223,7 @@ mod tests { let inputtext = "bbbb BB\ncc CCC\n"; let selection1 = " ~~\n \n"; let selection2 = " \n ~~~\n"; - cm.new_filemap_and_lines(Path::new("blork.rs"), inputtext); + cm.new_filemap(Path::new("blork.rs").to_owned().into(), inputtext.to_owned()); let span1 = span_from_selection(inputtext, selection1); let span2 = span_from_selection(inputtext, selection2); diff --git a/src/libsyntax/ext/expand.rs b/src/libsyntax/ext/expand.rs index 69c99c63aaf..094e572693c 100644 --- a/src/libsyntax/ext/expand.rs +++ b/src/libsyntax/ext/expand.rs @@ -1487,9 +1487,11 @@ impl<'a, 'b> Folder for InvocationCollector<'a, 'b> { match String::from_utf8(buf) { Ok(src) => { + let src_interned = Symbol::intern(&src); + // Add this input file to the code map to make it available as // dependency information - self.cx.codemap().new_filemap_and_lines(&filename, &src); + self.cx.codemap().new_filemap(filename.into(), src); let include_info = vec![ dummy_spanned(ast::NestedMetaItemKind::MetaItem( @@ -1497,7 +1499,7 @@ impl<'a, 'b> Folder for InvocationCollector<'a, 'b> { dummy_spanned(file)))), dummy_spanned(ast::NestedMetaItemKind::MetaItem( attr::mk_name_value_item_str(Ident::from_str("contents"), - dummy_spanned(Symbol::intern(&src))))), + dummy_spanned(src_interned)))), ]; let include_ident = Ident::from_str("include"); diff --git a/src/libsyntax/ext/source_util.rs 
b/src/libsyntax/ext/source_util.rs index d6dce63ea5e..669536f519c 100644 --- a/src/libsyntax/ext/source_util.rs +++ b/src/libsyntax/ext/source_util.rs @@ -150,11 +150,13 @@ pub fn expand_include_str(cx: &mut ExtCtxt, sp: Span, tts: &[tokenstream::TokenT }; match String::from_utf8(bytes) { Ok(src) => { + let interned_src = Symbol::intern(&src); + // Add this input file to the code map to make it available as // dependency information - cx.codemap().new_filemap_and_lines(&file, &src); + cx.codemap().new_filemap(file.into(), src); - base::MacEager::expr(cx.expr_str(sp, Symbol::intern(&src))) + base::MacEager::expr(cx.expr_str(sp, interned_src)) } Err(_) => { cx.span_err(sp, @@ -182,7 +184,7 @@ pub fn expand_include_bytes(cx: &mut ExtCtxt, sp: Span, tts: &[tokenstream::Toke Ok(..) => { // Add this input file to the code map to make it available as // dependency information, but don't enter it's contents - cx.codemap().new_filemap_and_lines(&file, ""); + cx.codemap().new_filemap(file.into(), "".to_string()); base::MacEager::expr(cx.expr_lit(sp, ast::LitKind::ByteStr(Lrc::new(bytes)))) } diff --git a/src/libsyntax/test_snippet.rs b/src/libsyntax/test_snippet.rs index 81dcc1998ed..c7e4fbd1073 100644 --- a/src/libsyntax/test_snippet.rs +++ b/src/libsyntax/test_snippet.rs @@ -51,7 +51,7 @@ fn test_harness(file_text: &str, span_labels: Vec, expected_output: & let output = Arc::new(Mutex::new(Vec::new())); let code_map = Lrc::new(CodeMap::new(FilePathMapping::empty())); - code_map.new_filemap_and_lines(Path::new("test.rs"), &file_text); + code_map.new_filemap(Path::new("test.rs").to_owned().into(), file_text.to_owned()); let primary_span = make_span(&file_text, &span_labels[0].start, &span_labels[0].end); let mut msp = MultiSpan::from_span(primary_span); From 04d4da1bf9df23f1bc2ad22115642735bda8b39e Mon Sep 17 00:00:00 2001 From: Michael Woerister Date: Thu, 24 May 2018 11:30:30 +0200 Subject: [PATCH 3/8] Update CodeMap tests after changing FileMap construction. 
--- src/libsyntax/codemap.rs | 71 ++++++---------------------------------- 1 file changed, 10 insertions(+), 61 deletions(-) diff --git a/src/libsyntax/codemap.rs b/src/libsyntax/codemap.rs index 5e23c1b0d62..2aa2564b097 100644 --- a/src/libsyntax/codemap.rs +++ b/src/libsyntax/codemap.rs @@ -1012,51 +1012,16 @@ impl FilePathMapping { #[cfg(test)] mod tests { use super::*; - use std::borrow::Cow; use rustc_data_structures::sync::Lrc; - #[test] - fn t1 () { - let cm = CodeMap::new(FilePathMapping::empty()); - let fm = cm.new_filemap(PathBuf::from("blork.rs").into(), - "first line.\nsecond line".to_string()); - fm.next_line(BytePos(0)); - // Test we can get lines with partial line info. - assert_eq!(fm.get_line(0), Some(Cow::from("first line."))); - // TESTING BROKEN BEHAVIOR: line break declared before actual line break. - fm.next_line(BytePos(10)); - assert_eq!(fm.get_line(1), Some(Cow::from("."))); - fm.next_line(BytePos(12)); - assert_eq!(fm.get_line(2), Some(Cow::from("second line"))); - } - - #[test] - #[should_panic] - fn t2 () { - let cm = CodeMap::new(FilePathMapping::empty()); - let fm = cm.new_filemap(PathBuf::from("blork.rs").into(), - "first line.\nsecond line".to_string()); - // TESTING *REALLY* BROKEN BEHAVIOR: - fm.next_line(BytePos(0)); - fm.next_line(BytePos(10)); - fm.next_line(BytePos(2)); - } - fn init_code_map() -> CodeMap { let cm = CodeMap::new(FilePathMapping::empty()); - let fm1 = cm.new_filemap(PathBuf::from("blork.rs").into(), - "first line.\nsecond line".to_string()); - let fm2 = cm.new_filemap(PathBuf::from("empty.rs").into(), - "".to_string()); - let fm3 = cm.new_filemap(PathBuf::from("blork2.rs").into(), - "first line.\nsecond line".to_string()); - - fm1.next_line(BytePos(0)); - fm1.next_line(BytePos(12)); - fm2.next_line(fm2.start_pos); - fm3.next_line(fm3.start_pos); - fm3.next_line(fm3.start_pos + BytePos(12)); - + cm.new_filemap(PathBuf::from("blork.rs").into(), + "first line.\nsecond line".to_string()); + 
cm.new_filemap(PathBuf::from("empty.rs").into(), + "".to_string()); + cm.new_filemap(PathBuf::from("blork2.rs").into(), + "first line.\nsecond line".to_string()); cm } @@ -1109,26 +1074,10 @@ mod tests { fn init_code_map_mbc() -> CodeMap { let cm = CodeMap::new(FilePathMapping::empty()); // € is a three byte utf8 char. - let fm1 = - cm.new_filemap(PathBuf::from("blork.rs").into(), - "fir€st €€€€ line.\nsecond line".to_string()); - let fm2 = cm.new_filemap(PathBuf::from("blork2.rs").into(), - "first line€€.\n€ second line".to_string()); - - fm1.next_line(BytePos(0)); - fm1.next_line(BytePos(28)); - fm2.next_line(fm2.start_pos); - fm2.next_line(fm2.start_pos + BytePos(20)); - - fm1.record_multibyte_char(BytePos(3), 3); - fm1.record_multibyte_char(BytePos(9), 3); - fm1.record_multibyte_char(BytePos(12), 3); - fm1.record_multibyte_char(BytePos(15), 3); - fm1.record_multibyte_char(BytePos(18), 3); - fm2.record_multibyte_char(fm2.start_pos + BytePos(10), 3); - fm2.record_multibyte_char(fm2.start_pos + BytePos(13), 3); - fm2.record_multibyte_char(fm2.start_pos + BytePos(18), 3); - + cm.new_filemap(PathBuf::from("blork.rs").into(), + "fir€st €€€€ line.\nsecond line".to_string()); + cm.new_filemap(PathBuf::from("blork2.rs").into(), + "first line€€.\n€ second line".to_string()); cm } From 3497138634bf58a7c29ef35f1f677dbde0633af8 Mon Sep 17 00:00:00 2001 From: Michael Woerister Date: Tue, 29 May 2018 17:49:35 +0200 Subject: [PATCH 4/8] Use u32 instead of usize of encoding byte count of multi-byte chars. 
--- src/libsyntax/codemap.rs | 6 +++--- src/libsyntax_pos/lib.rs | 20 +++++++++++++++++--- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/libsyntax/codemap.rs b/src/libsyntax/codemap.rs index 2aa2564b097..68882c4f063 100644 --- a/src/libsyntax/codemap.rs +++ b/src/libsyntax/codemap.rs @@ -822,14 +822,14 @@ impl CodeMap { total_extra_bytes += mbc.bytes - 1; // We should never see a byte position in the middle of a // character - assert!(bpos.to_usize() >= mbc.pos.to_usize() + mbc.bytes); + assert!(bpos.to_u32() >= mbc.pos.to_u32() + mbc.bytes); } else { break; } } - assert!(map.start_pos.to_usize() + total_extra_bytes <= bpos.to_usize()); - CharPos(bpos.to_usize() - map.start_pos.to_usize() - total_extra_bytes) + assert!(map.start_pos.to_u32() + total_extra_bytes <= bpos.to_u32()); + CharPos(bpos.to_usize() - map.start_pos.to_usize() - total_extra_bytes as usize) } // Return the index of the filemap (in self.files) which contains pos. diff --git a/src/libsyntax_pos/lib.rs b/src/libsyntax_pos/lib.rs index 266737dd7b6..93b65dac288 100644 --- a/src/libsyntax_pos/lib.rs +++ b/src/libsyntax_pos/lib.rs @@ -657,7 +657,7 @@ pub struct MultiByteChar { /// The absolute offset of the character in the CodeMap pub pos: BytePos, /// The number of bytes, >=2 - pub bytes: usize, + pub bytes: u32, } /// Identifies an offset of a non-narrow character in a FileMap @@ -1174,6 +1174,8 @@ fn remove_bom(src: &mut String) { pub trait Pos { fn from_usize(n: usize) -> Self; fn to_usize(&self) -> usize; + fn from_u32(n: u32) -> Self; + fn to_u32(&self) -> u32; } /// A byte offset. 
Keep this small (currently 32-bits), as AST contains @@ -1195,7 +1197,13 @@ impl Pos for BytePos { fn from_usize(n: usize) -> BytePos { BytePos(n as u32) } #[inline(always)] - fn to_usize(&self) -> usize { let BytePos(n) = *self; n as usize } + fn to_usize(&self) -> usize { self.0 as usize } + + #[inline(always)] + fn from_u32(n: u32) -> BytePos { BytePos(n) } + + #[inline(always)] + fn to_u32(&self) -> u32 { self.0 } } impl Add for BytePos { @@ -1233,7 +1241,13 @@ impl Pos for CharPos { fn from_usize(n: usize) -> CharPos { CharPos(n) } #[inline(always)] - fn to_usize(&self) -> usize { let CharPos(n) = *self; n } + fn to_usize(&self) -> usize { self.0 } + + #[inline(always)] + fn from_u32(n: u32) -> CharPos { CharPos(n as usize) } + + #[inline(always)] + fn to_u32(&self) -> u32 { self.0 as u32} } impl Add for CharPos { From 5a6dc8c4f5fdf06420b16f848582f6e17b9ff83e Mon Sep 17 00:00:00 2001 From: Michael Woerister Date: Tue, 29 May 2018 17:50:13 +0200 Subject: [PATCH 5/8] Add SSE2 accelerated version of FileMap analysis. 
--- src/Cargo.lock | 1 + src/libsyntax_pos/Cargo.toml | 1 + src/libsyntax_pos/analyze_filemap.rs | 434 +++++++++++++++++++++++++++ src/libsyntax_pos/lib.rs | 77 +---- 4 files changed, 445 insertions(+), 68 deletions(-) create mode 100644 src/libsyntax_pos/analyze_filemap.rs diff --git a/src/Cargo.lock b/src/Cargo.lock index b74587e5662..a9339055264 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -2779,6 +2779,7 @@ name = "syntax_pos" version = "0.0.0" dependencies = [ "arena 0.0.0", + "cfg-if 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", "rustc_data_structures 0.0.0", "scoped-tls 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", "serialize 0.0.0", diff --git a/src/libsyntax_pos/Cargo.toml b/src/libsyntax_pos/Cargo.toml index a9147b394f7..08ee2e0f376 100644 --- a/src/libsyntax_pos/Cargo.toml +++ b/src/libsyntax_pos/Cargo.toml @@ -14,3 +14,4 @@ rustc_data_structures = { path = "../librustc_data_structures" } arena = { path = "../libarena" } scoped-tls = { version = "0.1.1", features = ["nightly"] } unicode-width = "0.1.4" +cfg-if = "0.1.2" diff --git a/src/libsyntax_pos/analyze_filemap.rs b/src/libsyntax_pos/analyze_filemap.rs new file mode 100644 index 00000000000..7828c55ce78 --- /dev/null +++ b/src/libsyntax_pos/analyze_filemap.rs @@ -0,0 +1,434 @@ +// Copyright 2018 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use unicode_width::UnicodeWidthChar; +use super::*; + +/// Find all newlines, multi-byte characters, and non-narrow characters in a +/// FileMap. +/// +/// This function will use an SSE2 enhanced implementation if hardware support +/// is detected at runtime. 
+pub fn analyze_filemap( + src: &str, + filemap_start_pos: BytePos) + -> (Vec, Vec, Vec) +{ + let mut lines = vec![filemap_start_pos]; + let mut multi_byte_chars = vec![]; + let mut non_narrow_chars = vec![]; + + // Calls the right implementation, depending on hardware support available. + analyze_filemap_dispatch(src, + filemap_start_pos, + &mut lines, + &mut multi_byte_chars, + &mut non_narrow_chars); + + // The code above optimistically registers a new line *after* each \n + // it encounters. If that point is already outside the filemap, remove + // it again. + if let Some(&last_line_start) = lines.last() { + let file_map_end = filemap_start_pos + BytePos::from_usize(src.len()); + assert!(file_map_end >= last_line_start); + if last_line_start == file_map_end { + lines.pop(); + } + } + + (lines, multi_byte_chars, non_narrow_chars) +} + +cfg_if! { + if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), + not(stage0)))] { + fn analyze_filemap_dispatch(src: &str, + filemap_start_pos: BytePos, + lines: &mut Vec, + multi_byte_chars: &mut Vec, + non_narrow_chars: &mut Vec) { + if is_x86_feature_detected!("sse2") { + unsafe { + analyze_filemap_sse2(src, + filemap_start_pos, + lines, + multi_byte_chars, + non_narrow_chars); + } + } else { + analyze_filemap_generic(src, + src.len(), + filemap_start_pos, + lines, + multi_byte_chars, + non_narrow_chars); + + } + } + + /// Check 16 byte chunks of text at a time. If the chunk contains + /// something other than printable ASCII characters and newlines, the + /// function falls back to the generic implementation. Otherwise it uses + /// SSE2 intrinsics to quickly find all newlines. 
+ #[target_feature(enable = "sse2")] + unsafe fn analyze_filemap_sse2(src: &str, + output_offset: BytePos, + lines: &mut Vec, + multi_byte_chars: &mut Vec, + non_narrow_chars: &mut Vec) { + #[cfg(target_arch = "x86")] + use std::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use std::arch::x86_64::*; + + const CHUNK_SIZE: usize = 16; + + let src_bytes = src.as_bytes(); + + let chunk_count = src.len() / CHUNK_SIZE; + + // This variable keeps track of where we should start decoding a + // chunk. If a multi-byte character spans across chunk boundaries, + // we need to skip that part in the next chunk because we already + // handled it. + let mut intra_chunk_offset = 0; + + for chunk_index in 0 .. chunk_count { + let ptr = src_bytes.as_ptr() as *const __m128i; + let chunk = _mm_loadu_si128(ptr.offset(chunk_index as isize)); + + // For each character in the chunk, see if its byte value is < 0, which + // indicates that it's part of a multi-byte UTF-8 char. + let multibyte_test = _mm_cmplt_epi8(chunk, _mm_set1_epi8(0)); + // Create a bit mask from the comparison results. + let multibyte_mask = _mm_movemask_epi8(multibyte_test); + + // If the bit mask is all zero, we only have ASCII chars here: + if multibyte_mask == 0 { + assert!(intra_chunk_offset == 0); + + // Check if there are any control characters in the chunk. All + // control characters that we can encounter at this point have a + // byte value less than 32 or ... + let control_char_test0 = _mm_cmplt_epi8(chunk, _mm_set1_epi8(32)); + let control_char_mask0 = _mm_movemask_epi8(control_char_test0); + + // ... it's the ASCII 'DEL' character with a value of 127. 
+ let control_char_test1 = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(127)); + let control_char_mask1 = _mm_movemask_epi8(control_char_test1); + + let control_char_mask = control_char_mask0 | control_char_mask1; + + if control_char_mask != 0 { + // Check for newlines in the chunk + let newlines_test = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)); + let newlines_mask = _mm_movemask_epi8(newlines_test); + + if control_char_mask == newlines_mask { + // All control characters are newlines, record them + let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32; + let output_offset = output_offset + + BytePos::from_usize(chunk_index * CHUNK_SIZE + 1); + + loop { + let index = newlines_mask.trailing_zeros(); + + if index >= CHUNK_SIZE as u32 { + // We have arrived at the end of the chunk. + break + } + + lines.push(BytePos(index) + output_offset); + + // Clear the bit, so we can find the next one. + newlines_mask &= (!1) << index; + } + + // We are done for this chunk. All control characters were + // newlines and we took care of those. + continue + } else { + // Some of the control characters are not newlines, + // fall through to the slow path below. + } + } else { + // No control characters, nothing to record for this chunk + continue + } + } + + // The slow path. + // There are control chars in here, fallback to generic decoding. + let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset; + intra_chunk_offset = analyze_filemap_generic( + &src[scan_start .. 
], + CHUNK_SIZE - intra_chunk_offset, + BytePos::from_usize(scan_start) + output_offset, + lines, + multi_byte_chars, + non_narrow_chars + ); + } + + // There might still be a tail left to analyze + let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset; + if tail_start < src.len() { + analyze_filemap_generic(&src[tail_start as usize ..], + src.len() - tail_start, + output_offset + BytePos::from_usize(tail_start), + lines, + multi_byte_chars, + non_narrow_chars); + } + } + } else { + + // The target (or compiler version) does not support SSE2 ... + fn analyze_filemap_dispatch(src: &str, + filemap_start_pos: BytePos, + lines: &mut Vec, + multi_byte_chars: &mut Vec, + non_narrow_chars: &mut Vec) { + analyze_filemap_generic(src, + src.len(), + filemap_start_pos, + lines, + multi_byte_chars, + non_narrow_chars); + } + } +} + +// `scan_len` determines the number of bytes in `src` to scan. Note that the +// function can read past `scan_len` if a multi-byte character start within the +// range but extends past it. The overflow is returned by the function. +fn analyze_filemap_generic(src: &str, + scan_len: usize, + output_offset: BytePos, + lines: &mut Vec, + multi_byte_chars: &mut Vec, + non_narrow_chars: &mut Vec) + -> usize +{ + assert!(src.len() >= scan_len); + let mut i = 0; + let src_bytes = src.as_bytes(); + + while i < scan_len { + let byte = unsafe { + // We verified that i < scan_len <= src.len() + *src_bytes.get_unchecked(i as usize) + }; + + // How much to advance in order to get to the next UTF-8 char in the + // string. + let mut char_len = 1; + + if byte < 32 { + // This is an ASCII control character, it could be one of the cases + // that are interesting to us. 
+ + let pos = BytePos::from_usize(i) + output_offset; + + match byte { + b'\n' => { + lines.push(pos + BytePos(1)); + } + b'\t' => { + non_narrow_chars.push(NonNarrowChar::Tab(pos)); + } + _ => { + non_narrow_chars.push(NonNarrowChar::ZeroWidth(pos)); + } + } + } else if byte >= 127 { + // The slow path: + // This is either ASCII control character "DEL" or the beginning of + // a multibyte char. Just decode to `char`. + let c = (&src[i..]).chars().next().unwrap(); + char_len = c.len_utf8(); + + let pos = BytePos::from_usize(i) + output_offset; + + if char_len > 1 { + assert!(char_len >=2 && char_len <= 4); + let mbc = MultiByteChar { + pos, + bytes: char_len as u32, + }; + multi_byte_chars.push(mbc); + } + + // Assume control characters are zero width. + // FIXME: How can we decide between `width` and `width_cjk`? + let char_width = UnicodeWidthChar::width(c).unwrap_or(0); + + if char_width != 1 { + non_narrow_chars.push(NonNarrowChar::new(pos, char_width)); + } + } + + i += char_len; + } + + i - scan_len +} + + + +macro_rules! 
test { + (case: $test_name:ident, + text: $text:expr, + filemap_start_pos: $filemap_start_pos:expr, + lines: $lines:expr, + multi_byte_chars: $multi_byte_chars:expr, + non_narrow_chars: $non_narrow_chars:expr,) => ( + + #[test] + fn $test_name() { + + let (lines, multi_byte_chars, non_narrow_chars) = + analyze_filemap($text, BytePos($filemap_start_pos)); + + let expected_lines: Vec = $lines + .into_iter() + .map(|pos| BytePos(pos)) + .collect(); + + assert_eq!(lines, expected_lines); + + let expected_mbcs: Vec = $multi_byte_chars + .into_iter() + .map(|(pos, bytes)| MultiByteChar { + pos: BytePos(pos), + bytes, + }) + .collect(); + + assert_eq!(multi_byte_chars, expected_mbcs); + + let expected_nncs: Vec = $non_narrow_chars + .into_iter() + .map(|(pos, width)| { + NonNarrowChar::new(BytePos(pos), width) + }) + .collect(); + + assert_eq!(non_narrow_chars, expected_nncs); + }) +} + +test!( + case: empty_text, + text: "", + filemap_start_pos: 0, + lines: vec![], + multi_byte_chars: vec![], + non_narrow_chars: vec![], +); + +test!( + case: newlines_short, + text: "a\nc", + filemap_start_pos: 0, + lines: vec![0, 2], + multi_byte_chars: vec![], + non_narrow_chars: vec![], +); + +test!( + case: newlines_long, + text: "012345678\nabcdef012345678\na", + filemap_start_pos: 0, + lines: vec![0, 10, 26], + multi_byte_chars: vec![], + non_narrow_chars: vec![], +); + +test!( + case: newline_and_multi_byte_char_in_same_chunk, + text: "01234β789\nbcdef0123456789abcdef", + filemap_start_pos: 0, + lines: vec![0, 11], + multi_byte_chars: vec![(5, 2)], + non_narrow_chars: vec![], +); + +test!( + case: newline_and_control_char_in_same_chunk, + text: "01234\u{07}6789\nbcdef0123456789abcdef", + filemap_start_pos: 0, + lines: vec![0, 11], + multi_byte_chars: vec![], + non_narrow_chars: vec![(5, 0)], +); + +test!( + case: multi_byte_char_short, + text: "aβc", + filemap_start_pos: 0, + lines: vec![0], + multi_byte_chars: vec![(1, 2)], + non_narrow_chars: vec![], +); + +test!( + case: 
multi_byte_char_long, + text: "0123456789abcΔf012345β", + filemap_start_pos: 0, + lines: vec![0], + multi_byte_chars: vec![(13, 2), (22, 2)], + non_narrow_chars: vec![], +); + +test!( + case: multi_byte_char_across_chunk_boundary, + text: "0123456789abcdeΔ123456789abcdef01234", + filemap_start_pos: 0, + lines: vec![0], + multi_byte_chars: vec![(15, 2)], + non_narrow_chars: vec![], +); + +test!( + case: multi_byte_char_across_chunk_boundary_tail, + text: "0123456789abcdeΔ....", + filemap_start_pos: 0, + lines: vec![0], + multi_byte_chars: vec![(15, 2)], + non_narrow_chars: vec![], +); + +test!( + case: non_narrow_short, + text: "0\t2", + filemap_start_pos: 0, + lines: vec![0], + multi_byte_chars: vec![], + non_narrow_chars: vec![(1, 4)], +); + +test!( + case: non_narrow_long, + text: "01\t3456789abcdef01234567\u{07}9", + filemap_start_pos: 0, + lines: vec![0], + multi_byte_chars: vec![], + non_narrow_chars: vec![(2, 4), (24, 0)], +); + +test!( + case: output_offset_all, + text: "01\t345\n789abcΔf01234567\u{07}9\nbcΔf", + filemap_start_pos: 1000, + lines: vec![0 + 1000, 7 + 1000, 27 + 1000], + multi_byte_chars: vec![(13 + 1000, 2), (29 + 1000, 2)], + non_narrow_chars: vec![(2 + 1000, 4), (24 + 1000, 0)], +); diff --git a/src/libsyntax_pos/lib.rs b/src/libsyntax_pos/lib.rs index 93b65dac288..90f3ae90c2f 100644 --- a/src/libsyntax_pos/lib.rs +++ b/src/libsyntax_pos/lib.rs @@ -24,6 +24,7 @@ #![feature(optin_builtin_traits)] #![allow(unused_attributes)] #![feature(specialization)] +#![feature(stdsimd)] use std::borrow::Cow; use std::cell::Cell; @@ -47,6 +48,9 @@ use serialize::{Encodable, Decodable, Encoder, Decoder}; extern crate serialize; extern crate serialize as rustc_serialize; // used by deriving +#[macro_use] +extern crate cfg_if; + extern crate unicode_width; pub mod edition; @@ -58,6 +62,8 @@ pub use span_encoding::{Span, DUMMY_SP}; pub mod symbol; +mod analyze_filemap; + pub struct Globals { symbol_interner: Lock, span_interner: Lock, @@ -652,7 +658,7 @@ impl 
From> for MultiSpan { pub const NO_EXPANSION: SyntaxContext = SyntaxContext::empty(); /// Identifies an offset of a multi-byte character in a FileMap -#[derive(Copy, Clone, RustcEncodable, RustcDecodable, Eq, PartialEq)] +#[derive(Copy, Clone, RustcEncodable, RustcDecodable, Eq, PartialEq, Debug)] pub struct MultiByteChar { /// The absolute offset of the character in the CodeMap pub pos: BytePos, @@ -661,7 +667,7 @@ pub struct MultiByteChar { } /// Identifies an offset of a non-narrow character in a FileMap -#[derive(Copy, Clone, RustcEncodable, RustcDecodable, Eq, PartialEq)] +#[derive(Copy, Clone, RustcEncodable, RustcDecodable, Eq, PartialEq, Debug)] pub enum NonNarrowChar { /// Represents a zero-width character ZeroWidth(BytePos), @@ -950,7 +956,7 @@ impl FileMap { let end_pos = start_pos.to_usize() + src.len(); let (lines, multibyte_chars, non_narrow_chars) = - Self::find_newlines_and_special_chars(&src[..], start_pos); + analyze_filemap::analyze_filemap(&src[..], start_pos); FileMap { name, @@ -969,71 +975,6 @@ impl FileMap { } } - fn find_newlines_and_special_chars(src: &str, filemap_start_pos: BytePos) - -> (Vec, Vec, Vec) { - - let mut index = 0; - let mut lines = vec![filemap_start_pos]; - let mut multibyte_chars = vec![]; - let mut non_narrow_chars = vec![]; - - while index < src.len() { - let byte_pos = BytePos::from_usize(index) + filemap_start_pos; - let byte = src.as_bytes()[index]; - - if byte.is_ascii() { - match byte { - b'\n' => { - lines.push(byte_pos + BytePos(1)); - } - b'\t' => { - // Tabs will consume 4 columns. - non_narrow_chars.push(NonNarrowChar::new(byte_pos, 4)); - } - c => if c.is_ascii_control() { - // Assume control characters are zero width. 
- non_narrow_chars.push(NonNarrowChar::new(byte_pos, 0)); - } - } - - index += 1; - } else { - let c = (&src[index..]).chars().next().unwrap(); - let c_len = c.len_utf8(); - - if c_len > 1 { - assert!(c_len >=2 && c_len <= 4); - let mbc = MultiByteChar { - pos: byte_pos, - bytes: c_len, - }; - multibyte_chars.push(mbc); - } - - // Assume control characters are zero width. - // FIXME: How can we decide between `width` and `width_cjk`? - let c_width = unicode_width::UnicodeWidthChar::width(c).unwrap_or(0); - - if c_width != 1 { - non_narrow_chars.push(NonNarrowChar::new(byte_pos, c_width)); - } - - index += c_len; - } - } - - // The loop above optimistically registers a new line *after* each of \n - // it encounters. If that point is already outside the filemap, remove - // it again. - if let Some(&last_line_start) = lines.last() { - if last_line_start == filemap_start_pos + BytePos::from_usize(src.len()) { - lines.pop(); - } - } - - (lines, multibyte_chars, non_narrow_chars) - } - /// Return the BytePos of the beginning of the current line. pub fn line_begin_pos(&self) -> BytePos { match self.lines.last() { From ba1d18fe299cc6e44be7b5d10be004d7e3f3e6ea Mon Sep 17 00:00:00 2001 From: Michael Woerister Date: Fri, 1 Jun 2018 12:54:15 +0200 Subject: [PATCH 6/8] Add comments to analzye_filemap(). --- src/libsyntax_pos/analyze_filemap.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/libsyntax_pos/analyze_filemap.rs b/src/libsyntax_pos/analyze_filemap.rs index 7828c55ce78..99fc3f19009 100644 --- a/src/libsyntax_pos/analyze_filemap.rs +++ b/src/libsyntax_pos/analyze_filemap.rs @@ -102,6 +102,8 @@ cfg_if! { for chunk_index in 0 .. chunk_count { let ptr = src_bytes.as_ptr() as *const __m128i; + // We don't know if the pointer is aligned to 16 bytes, so we + // use `loadu`, which supports unaligned loading. 
let chunk = _mm_loadu_si128(ptr.offset(chunk_index as isize)); // For character in the chunk, see if its byte value is < 0, which From ba30c1dac9d0af45836403b2da89561a627b9a6e Mon Sep 17 00:00:00 2001 From: Michael Woerister Date: Tue, 26 Jun 2018 15:37:09 +0200 Subject: [PATCH 7/8] syntax_pos: Store multibyte char size as u8 instead of u32. --- src/libsyntax/codemap.rs | 4 ++-- src/libsyntax_pos/analyze_filemap.rs | 2 +- src/libsyntax_pos/lib.rs | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/libsyntax/codemap.rs b/src/libsyntax/codemap.rs index 68882c4f063..1d5429bdf8f 100644 --- a/src/libsyntax/codemap.rs +++ b/src/libsyntax/codemap.rs @@ -819,10 +819,10 @@ impl CodeMap { if mbc.pos < bpos { // every character is at least one byte, so we only // count the actual extra bytes. - total_extra_bytes += mbc.bytes - 1; + total_extra_bytes += mbc.bytes as u32 - 1; // We should never see a byte position in the middle of a // character - assert!(bpos.to_u32() >= mbc.pos.to_u32() + mbc.bytes); + assert!(bpos.to_u32() >= mbc.pos.to_u32() + mbc.bytes as u32); } else { break; } diff --git a/src/libsyntax_pos/analyze_filemap.rs b/src/libsyntax_pos/analyze_filemap.rs index 99fc3f19009..c7c0263e459 100644 --- a/src/libsyntax_pos/analyze_filemap.rs +++ b/src/libsyntax_pos/analyze_filemap.rs @@ -263,7 +263,7 @@ fn analyze_filemap_generic(src: &str, assert!(char_len >=2 && char_len <= 4); let mbc = MultiByteChar { pos, - bytes: char_len as u32, + bytes: char_len as u8, }; multi_byte_chars.push(mbc); } diff --git a/src/libsyntax_pos/lib.rs b/src/libsyntax_pos/lib.rs index 90f3ae90c2f..5502b30e488 100644 --- a/src/libsyntax_pos/lib.rs +++ b/src/libsyntax_pos/lib.rs @@ -663,7 +663,7 @@ pub struct MultiByteChar { /// The absolute offset of the character in the CodeMap pub pos: BytePos, /// The number of bytes, >=2 - pub bytes: u32, + pub bytes: u8, } /// Identifies an offset of a non-narrow character in a FileMap From a1f8a6ce80a340d51074071c0d9e30eb14f65d25 
Mon Sep 17 00:00:00 2001 From: Michael Woerister Date: Thu, 28 Jun 2018 10:45:57 +0200 Subject: [PATCH 8/8] Fix FileMap::line_begin_pos(). The method relied on the FileMap still being under construction in order for it to do what the name promises. It's now independent of the current state. --- src/libsyntax/parse/lexer/comments.rs | 6 ++++-- src/libsyntax_pos/lib.rs | 8 +++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/libsyntax/parse/lexer/comments.rs b/src/libsyntax/parse/lexer/comments.rs index 7da0d816d0f..3995a9b8689 100644 --- a/src/libsyntax/parse/lexer/comments.rs +++ b/src/libsyntax/parse/lexer/comments.rs @@ -240,9 +240,11 @@ fn read_block_comment(rdr: &mut StringReader, let mut lines: Vec = Vec::new(); // Count the number of chars since the start of the line by rescanning. - let mut src_index = rdr.src_index(rdr.filemap.line_begin_pos()); + let mut src_index = rdr.src_index(rdr.filemap.line_begin_pos(rdr.pos)); let end_src_index = rdr.src_index(rdr.pos); - assert!(src_index <= end_src_index); + assert!(src_index <= end_src_index, + "src_index={}, end_src_index={}, line_begin_pos={}", + src_index, end_src_index, rdr.filemap.line_begin_pos(rdr.pos).to_u32()); let mut n = 0; while src_index < end_src_index { let c = char_at(&rdr.src, src_index); diff --git a/src/libsyntax_pos/lib.rs b/src/libsyntax_pos/lib.rs index 5502b30e488..55dec31511c 100644 --- a/src/libsyntax_pos/lib.rs +++ b/src/libsyntax_pos/lib.rs @@ -976,11 +976,9 @@ impl FileMap { } /// Return the BytePos of the beginning of the current line. - pub fn line_begin_pos(&self) -> BytePos { - match self.lines.last() { - Some(&line_pos) => line_pos, - None => self.start_pos, - } + pub fn line_begin_pos(&self, pos: BytePos) -> BytePos { + let line_index = self.lookup_line(pos).unwrap(); + self.lines[line_index] } /// Add externally loaded source.