diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 348445149a4..add9a4cb9f3 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -225,6 +225,47 @@ impl<'a> StringReader<'a> { self.byte_offset(end).to_uint())) } + /// Converts CRLF to LF in the given string, raising an error on bare CR. + fn translate_crlf<'a>(&self, start: BytePos, + s: &'a str, errmsg: &'a str) -> str::MaybeOwned<'a> { + let mut i = 0u; + while i < s.len() { + let str::CharRange { ch, next } = s.char_range_at(i); + if ch == '\r' { + if next < s.len() && s.char_at(next) == '\n' { + return translate_crlf_(self, start, s, errmsg, i).into_maybe_owned(); + } + let pos = start + BytePos(i as u32); + let end_pos = start + BytePos(next as u32); + self.err_span_(pos, end_pos, errmsg); + } + i = next; + } + return s.into_maybe_owned(); + + fn translate_crlf_(rdr: &StringReader, start: BytePos, + s: &str, errmsg: &str, mut i: uint) -> String { + let mut buf = String::with_capacity(s.len()); + let mut j = 0; + while i < s.len() { + let str::CharRange { ch, next } = s.char_range_at(i); + if ch == '\r' { + if j < i { buf.push_str(s.slice(j, i)); } + j = next; + if next >= s.len() || s.char_at(next) != '\n' { + let pos = start + BytePos(i as u32); + let end_pos = start + BytePos(next as u32); + rdr.err_span_(pos, end_pos, errmsg); + } + } + i = next; + } + if j < s.len() { buf.push_str(s.slice_from(j)); } + buf + } + } + + /// Advance the StringReader by one character. If a newline is /// discovered, add it to the FileMap's list of line start offsets. pub fn bump(&mut self) { @@ -305,7 +346,20 @@ impl<'a> StringReader<'a> { // line comments starting with "///" or "//!" 
are doc-comments if self.curr_is('/') || self.curr_is('!') { let start_bpos = self.pos - BytePos(3); - while !self.curr_is('\n') && !self.is_eof() { + while !self.is_eof() { + match self.curr.unwrap() { + '\n' => break, + '\r' => { + if self.nextch_is('\n') { + // CRLF + break + } else { + self.err_span_(self.last_pos, self.pos, + "bare CR not allowed in doc-comment"); + } + } + _ => () + } self.bump(); } let ret = self.with_str_from(start_bpos, |string| { @@ -370,6 +424,7 @@ impl<'a> StringReader<'a> { let start_bpos = self.last_pos - BytePos(2); let mut level: int = 1; + let mut has_cr = false; while level > 0 { if self.is_eof() { let msg = if is_doc_comment { @@ -379,25 +434,35 @@ impl<'a> StringReader<'a> { }; let last_bpos = self.last_pos; self.fatal_span_(start_bpos, last_bpos, msg); - } else if self.curr_is('/') && self.nextch_is('*') { - level += 1; - self.bump(); - self.bump(); - } else if self.curr_is('*') && self.nextch_is('/') { - level -= 1; - self.bump(); - self.bump(); - } else { - self.bump(); } + let n = self.curr.unwrap(); + match n { + '/' if self.nextch_is('*') => { + level += 1; + self.bump(); + } + '*' if self.nextch_is('/') => { + level -= 1; + self.bump(); + } + '\r' => { + has_cr = true; + } + _ => () + } + self.bump(); } let res = if is_doc_comment { self.with_str_from(start_bpos, |string| { // but comments with only "*"s between two "/"s are not if !is_block_non_doc_comment(string) { + let string = if has_cr { + self.translate_crlf(start_bpos, string, + "bare CR not allowed in block doc-comment") + } else { string.into_maybe_owned() }; Some(TokenAndSpan{ - tok: token::DOC_COMMENT(str_to_ident(string)), + tok: token::DOC_COMMENT(str_to_ident(string.as_slice())), sp: codemap::mk_sp(start_bpos, self.last_pos) }) } else { @@ -675,6 +740,10 @@ impl<'a> StringReader<'a> { self.consume_whitespace(); return None }, + '\r' if delim == '"' && self.curr_is('\n') => { + self.consume_whitespace(); + return None + } c => { let last_pos = self.last_pos; 
self.err_span_char( @@ -696,6 +765,15 @@ impl<'a> StringReader<'a> { else { "character constant must be escaped" }, first_source_char); } + '\r' => { + if self.curr_is('\n') { + self.bump(); + return Some('\n'); + } else { + self.err_span_(start, self.last_pos, + "bare CR not allowed in string, use \\r instead"); + } + } _ => if ascii_only && first_source_char > '\x7F' { let last_pos = self.last_pos; self.err_span_char( @@ -1042,28 +1120,45 @@ impl<'a> StringReader<'a> { self.bump(); let content_start_bpos = self.last_pos; let mut content_end_bpos; + let mut has_cr = false; 'outer: loop { if self.is_eof() { let last_bpos = self.last_pos; self.fatal_span_(start_bpos, last_bpos, "unterminated raw string"); } - if self.curr_is('"') { - content_end_bpos = self.last_pos; - for _ in range(0, hash_count) { - self.bump(); - if !self.curr_is('#') { - continue 'outer; + //if self.curr_is('"') { + //content_end_bpos = self.last_pos; + //for _ in range(0, hash_count) { + //self.bump(); + //if !self.curr_is('#') { + //continue 'outer; + let c = self.curr.unwrap(); + match c { + '"' => { + content_end_bpos = self.last_pos; + for _ in range(0, hash_count) { + self.bump(); + if !self.curr_is('#') { + continue 'outer; + } } + break; } - break; + '\r' => { + has_cr = true; + } + _ => () } self.bump(); } self.bump(); - let str_content = self.with_str_from_to( - content_start_bpos, - content_end_bpos, - str_to_ident); + let str_content = self.with_str_from_to(content_start_bpos, content_end_bpos, |string| { + let string = if has_cr { + self.translate_crlf(content_start_bpos, string, + "bare CR not allowed in raw string") + } else { string.into_maybe_owned() }; + str_to_ident(string.as_slice()) + }); return token::LIT_STR_RAW(str_content, hash_count); } '-' => { diff --git a/src/libsyntax/parse/mod.rs b/src/libsyntax/parse/mod.rs index eb0c6f2555a..331a49c83be 100644 --- a/src/libsyntax/parse/mod.rs +++ b/src/libsyntax/parse/mod.rs @@ -288,6 +288,8 @@ mod test { use 
owned_slice::OwnedSlice; use ast; use abi; + use attr; + use attr::AttrMetaMethods; use parse::parser::Parser; use parse::token::{str_to_ident}; use util::parser_testing::{string_to_tts, string_to_parser}; @@ -726,4 +728,24 @@ mod test { }".to_string()); } + #[test] fn crlf_doc_comments() { + let sess = new_parse_sess(); + + let name = "".to_string(); + let source = "/// doc comment\r\nfn foo() {}".to_string(); + let item = parse_item_from_source_str(name.clone(), source, Vec::new(), &sess).unwrap(); + let doc = attr::first_attr_value_str_by_name(item.attrs.as_slice(), "doc").unwrap(); + assert_eq!(doc.get(), "/// doc comment"); + + let source = "/// doc comment\r\n/// line 2\r\nfn foo() {}".to_string(); + let item = parse_item_from_source_str(name.clone(), source, Vec::new(), &sess).unwrap(); + let docs = item.attrs.iter().filter(|a| a.name().get() == "doc") + .map(|a| a.value_str().unwrap().get().to_string()).collect::<Vec<_>>(); + assert_eq!(docs.as_slice(), &["/// doc comment".to_string(), "/// line 2".to_string()]); + + let source = "/** doc comment\r\n * with CRLF */\r\nfn foo() {}".to_string(); + let item = parse_item_from_source_str(name, source, Vec::new(), &sess).unwrap(); + let doc = attr::first_attr_value_str_by_name(item.attrs.as_slice(), "doc").unwrap(); + assert_eq!(doc.get(), "/** doc comment\n * with CRLF */"); + } } diff --git a/src/test/compile-fail/lex-bare-cr-string-literal-doc-comment.rs b/src/test/compile-fail/lex-bare-cr-string-literal-doc-comment.rs new file mode 100644 index 00000000000..c1e5121d6dd --- /dev/null +++ b/src/test/compile-fail/lex-bare-cr-string-literal-doc-comment.rs @@ -0,0 +1,30 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option.
This file may not be copied, modified, or distributed +// except according to those terms. + +// ignore-tidy-cr + +/// doc comment with bare CR: ' ' +pub fn foo() {} +//~^^ ERROR: bare CR not allowed in doc-comment + +/** block doc comment with bare CR: ' ' */ +pub fn bar() {} +//~^^ ERROR: bare CR not allowed in block doc-comment + +fn main() { + // the following string literal has a bare CR in it + let _s = "foo bar"; //~ ERROR: bare CR not allowed in string + + // the following string literal has a bare CR in it + let _s = r"bar foo"; //~ ERROR: bare CR not allowed in raw string + + // the following string literal has a bare CR in it + let _s = "foo\ bar"; //~ ERROR: unknown character escape: \r +} diff --git a/src/test/run-pass/.gitattributes b/src/test/run-pass/.gitattributes new file mode 100644 index 00000000000..c6a6f23074d --- /dev/null +++ b/src/test/run-pass/.gitattributes @@ -0,0 +1 @@ +lexer-crlf-line-endings-string-literal-doc-comment.rs -text diff --git a/src/test/run-pass/lexer-crlf-line-endings-string-literal-doc-comment.rs b/src/test/run-pass/lexer-crlf-line-endings-string-literal-doc-comment.rs new file mode 100644 index 00000000000..5c8db524cc2 --- /dev/null +++ b/src/test/run-pass/lexer-crlf-line-endings-string-literal-doc-comment.rs @@ -0,0 +1,44 @@ +// ignore-tidy-cr ignore-license +// ignore-tidy-cr (repeated again because of tidy bug) +// license is ignored because tidy can't handle the CRLF here properly. + +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// NB: this file needs CRLF line endings. The .gitattributes file in +// this directory should enforce it.
+ +// ignore-pretty + +/// Doc comment that ends in CRLF +pub fn foo() {} + +/** Block doc comment that + * contains CRLF characters + */ +pub fn bar() {} + +fn main() { + let s = "string +literal"; + assert_eq!(s, "string\nliteral"); + + let s = "literal with \ + escaped newline"; + assert_eq!(s, "literal with escaped newline"); + + let s = r"string +literal"; + assert_eq!(s, "string\nliteral"); + + // validate that our source file has CRLF endings + let source = include_str!("lexer-crlf-line-endings-string-literal-doc-comment.rs"); + assert!(source.contains("string\r\nliteral")); +}