introduce unescape module

Currently, we deal with escape sequences twice: once when we lex a
string, and a second time when we unescape literals. This PR aims to
remove this duplication by introducing a new `unescape` module as a
single source of truth for character escaping rules.
This commit is contained in:
Aleksey Kladov 2019-04-25 11:48:25 +03:00
parent 9b67bd42b7
commit bfa5f27847
24 changed files with 1046 additions and 768 deletions

View File

@ -184,7 +184,7 @@ impl<'a> DiagnosticBuilder<'a> {
) -> &mut Self);
forward!(pub fn warn(&mut self, msg: &str) -> &mut Self);
forward!(pub fn span_warn<S: Into<MultiSpan>>(&mut self, sp: S, msg: &str) -> &mut Self);
forward!(pub fn help(&mut self , msg: &str) -> &mut Self);
forward!(pub fn help(&mut self, msg: &str) -> &mut Self);
forward!(pub fn span_help<S: Into<MultiSpan>>(&mut self,
sp: S,
msg: &str,

View File

@ -1,8 +1,10 @@
use crate::ast::{self, Ident};
use crate::parse::{token, ParseSess};
use crate::symbol::Symbol;
use crate::parse::unescape;
use crate::parse::unescape_error_reporting::{emit_unescape_error, push_escaped_char};
use errors::{Applicability, FatalError, Diagnostic, DiagnosticBuilder};
use errors::{FatalError, Diagnostic, DiagnosticBuilder};
use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION};
use core::unicode::property::Pattern_White_Space;
@ -334,25 +336,12 @@ impl<'a> StringReader<'a> {
self.err_span(self.mk_sp(from_pos, to_pos), m)
}
/// Appends `c` to the diagnostic message `m`.
///
/// Printable ASCII (U+0020 through U+007E) is appended verbatim, so `\`, `'`
/// and `"` stay readable in user-facing text; every other character is
/// appended in its `char::escape_default` form.
fn push_escaped_char_for_msg(m: &mut String, c: char) {
    let is_printable_ascii = c >= '\u{20}' && c <= '\u{7e}';
    if is_printable_ascii {
        m.push(c);
    } else {
        for escaped in c.escape_default() {
            m.push(escaped);
        }
    }
}
/// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
/// escaped character to the error message
fn fatal_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) -> FatalError {
let mut m = m.to_string();
m.push_str(": ");
Self::push_escaped_char_for_msg(&mut m, c);
push_escaped_char(&mut m, c);
self.fatal_span_(from_pos, to_pos, &m[..])
}
@ -368,7 +357,7 @@ impl<'a> StringReader<'a> {
{
let mut m = m.to_string();
m.push_str(": ");
Self::push_escaped_char_for_msg(&mut m, c);
push_escaped_char(&mut m, c);
self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), &m[..])
}
@ -378,29 +367,10 @@ impl<'a> StringReader<'a> {
fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) {
let mut m = m.to_string();
m.push_str(": ");
Self::push_escaped_char_for_msg(&mut m, c);
push_escaped_char(&mut m, c);
self.err_span_(from_pos, to_pos, &m[..]);
}
/// Builds (without emitting) an error diagnostic for the span
/// [`from_pos`, `to_pos`) whose message is `m`, followed by `": "` and the
/// escaped form of `c`.
fn struct_err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char)
    -> DiagnosticBuilder<'a>
{
    let mut msg = String::from(m);
    msg.push_str(": ");
    Self::push_escaped_char_for_msg(&mut msg, c);
    let span = self.mk_sp(from_pos, to_pos);
    self.sess.span_diagnostic.struct_span_err(span, &msg[..])
}
/// Creates a fatal lexical error for the span [`from_pos`, `to_pos`) whose
/// message is `m`, followed by `": "` and the source text that the span covers.
fn fatal_span_verbose(&self, from_pos: BytePos, to_pos: BytePos, mut m: String) -> FatalError {
    let lo = self.src_index(from_pos);
    let hi = self.src_index(to_pos);
    let offending_text = &self.src[lo..hi];
    m.push_str(": ");
    m.push_str(offending_text);
    self.fatal_span_(from_pos, to_pos, &m[..])
}
/// Advance peek_tok and peek_span to refer to the next token, and
/// possibly update the interner.
fn advance_token(&mut self) -> Result<(), ()> {
@ -863,271 +833,6 @@ impl<'a> StringReader<'a> {
}
}
/// Scan over up to `n_digits` hex digits, stopping early at `delim`.
///
/// An error is reported when `delim` is reached before `n_digits` characters
/// were read (escape too short), when a character is not a hex digit, when
/// `below_0x7f_only` is set and the accumulated value is `0x80` or more, or
/// when the accumulated value is not a valid `char`. Reaching EOF inside the
/// loop raises a fatal error instead.
///
/// Returns `true` only if none of these errors was reported.
fn scan_hex_digits(&mut self, n_digits: usize, delim: char, below_0x7f_only: bool) -> bool {
debug!("scanning {} digits until {:?}", n_digits, delim);
let start_bpos = self.pos;
let mut accum_int = 0;
let mut valid = true;
for _ in 0..n_digits {
if self.is_eof() {
let last_bpos = self.pos;
self.fatal_span_(start_bpos,
last_bpos,
"unterminated numeric character escape").raise();
}
// The closing delimiter arrived before all digits were seen.
if self.ch_is(delim) {
let last_bpos = self.pos;
self.err_span_(start_bpos,
last_bpos,
"numeric character escape is too short");
valid = false;
break;
}
let c = self.ch.unwrap_or('\x00');
accum_int *= 16;
// A non-hex character is reported, counted as digit 0, and scanning goes on.
accum_int += c.to_digit(16).unwrap_or_else(|| {
self.err_span_char(self.pos,
self.next_pos,
"invalid character in numeric character escape",
c);
valid = false;
0
});
self.bump();
}
if below_0x7f_only && accum_int >= 0x80 {
self.err_span_(start_bpos,
self.pos,
"this form of character escape may only be used with characters in \
the range [\\x00-\\x7f]");
valid = false;
}
// Finally make sure the value denotes a real `char`.
match char::from_u32(accum_int) {
Some(_) => valid,
None => {
let last_bpos = self.pos;
self.err_span_(start_bpos, last_bpos, "invalid numeric character escape");
false
}
}
}
/// Scan for a single (possibly escaped) byte or char
/// in a byte, (non-raw) byte string, char, or (non-raw) string literal.
/// `start` is the position of `first_source_char`, which is already consumed.
///
/// `ascii_only` selects the byte / byte string rules (no unicode escapes, no
/// non-ASCII characters); `delim` is the quote character of the literal being
/// scanned (`'\''` or `'"'`).
///
/// Returns `true` if there was a valid char/byte.
fn scan_char_or_byte(&mut self,
start: BytePos,
first_source_char: char,
ascii_only: bool,
delim: char)
-> bool
{
match first_source_char {
'\\' => {
// '\X' for some X must be a character constant:
let escaped = self.ch;
let escaped_pos = self.pos;
self.bump();
match escaped {
None => {} // EOF here is an error that will be checked later.
Some(e) => {
return match e {
'n' | 'r' | 't' | '\\' | '\'' | '"' | '0' => true,
// `\xHH`: values of 0x80 and above are only allowed in byte mode.
'x' => self.scan_byte_escape(delim, !ascii_only),
'u' => {
let valid = if self.ch_is('{') {
self.scan_unicode_escape(delim) && !ascii_only
} else {
// `\u` without a brace: report it and, if hex digits follow,
// suggest the braced form built from up to six of them.
let span = self.mk_sp(start, self.pos);
let mut suggestion = "\\u{".to_owned();
let msg = "incorrect unicode escape sequence";
let mut err = self.sess.span_diagnostic.struct_span_err(
span,
msg,
);
let mut i = 0;
while let (Some(ch), true) = (self.ch, i < 6) {
if ch.is_digit(16) {
suggestion.push(ch);
self.bump();
i += 1;
} else {
break;
}
}
if i != 0 {
suggestion.push('}');
err.span_suggestion(
self.mk_sp(start, self.pos),
"format of unicode escape sequences uses braces",
suggestion,
Applicability::MaybeIncorrect,
);
} else {
err.span_label(span, msg);
err.help(
"format of unicode escape sequences is `\\u{...}`",
);
}
err.emit();
false
};
// Reported in addition to any error produced above.
if ascii_only {
self.err_span_(start,
self.pos,
"unicode escape sequences cannot be used as a \
byte or in a byte string");
}
valid
}
// Backslash followed by a line break inside a string: skip the
// line break and the whitespace that follows it.
'\n' if delim == '"' => {
self.consume_whitespace();
true
}
'\r' if delim == '"' && self.ch_is('\n') => {
self.consume_whitespace();
true
}
c => {
let pos = self.pos;
let msg = if ascii_only {
"unknown byte escape"
} else {
"unknown character escape"
};
let mut err = self.struct_err_span_char(escaped_pos, pos, msg, c);
err.span_label(self.mk_sp(escaped_pos, pos), msg);
if e == '\r' {
err.help(
"this is an isolated carriage return; consider checking \
your editor and version control settings",
);
}
if (e == '{' || e == '}') && !ascii_only {
err.help(
"if used in a formatting string, curly braces are escaped \
with `{{` and `}}`",
);
}
err.emit();
false
}
}
}
}
}
// These characters may not appear unescaped between single quotes.
'\t' | '\n' | '\r' | '\'' if delim == '\'' => {
let pos = self.pos;
self.err_span_char(start,
pos,
if ascii_only {
"byte constant must be escaped"
} else {
"character constant must be escaped"
},
first_source_char);
return false;
}
// Reached only for strings (the arm above catches `\r` between single
// quotes): CR is accepted only as part of a CRLF pair.
'\r' => {
if self.ch_is('\n') {
self.bump();
return true;
} else {
self.err_span_(start,
self.pos,
"bare CR not allowed in string, use \\r instead");
return false;
}
}
_ => {
if ascii_only && first_source_char > '\x7F' {
let pos = self.pos;
self.err_span_(start,
pos,
"byte constant must be ASCII. Use a \\xHH escape for a \
non-ASCII byte");
return false;
}
}
}
true
}
/// Scan over a `\u{...}` escape
///
/// At this point, we have already seen the `\` and the `u`, the `{` is the current character.
/// We will read a hex number (with `_` separators), with 1 to 6 actual digits,
/// and pass over the `}`.
///
/// `delim` is the quote character of the enclosing literal; meeting it before
/// `}` is reported as an unterminated escape and it is left unconsumed. EOF
/// before `}` raises a fatal error. Returns `true` if no error was reported.
fn scan_unicode_escape(&mut self, delim: char) -> bool {
self.bump(); // past the {
let start_bpos = self.pos;
let mut valid = true;
if let Some('_') = self.ch {
// disallow leading `_`
self.err_span_(self.pos,
self.next_pos,
"invalid start of unicode escape");
valid = false;
}
let count = self.scan_digits(16, 16);
if count > 6 {
self.err_span_(start_bpos,
self.pos,
"overlong unicode escape (must have at most 6 hex digits)");
valid = false;
}
// Skip ahead to the closing `}`. Only the first problem is reported:
// once `valid` is false further stray characters are passed over silently.
loop {
match self.ch {
Some('}') => {
if valid && count == 0 {
self.err_span_(start_bpos,
self.pos,
"empty unicode escape (must have at least 1 hex digit)");
valid = false;
}
self.bump(); // past the ending `}`
break;
},
Some(c) => {
if c == delim {
self.err_span_(self.pos,
self.pos,
"unterminated unicode escape (needed a `}`)");
valid = false;
break;
} else if valid {
self.err_span_char(start_bpos,
self.pos,
"invalid character in unicode escape",
c);
valid = false;
}
},
None => {
self.fatal_span_(start_bpos,
self.pos,
"unterminated unicode escape (found EOF)").raise();
}
}
self.bump();
}
valid
}
/// Scan over a float exponent.
fn scan_float_exponent(&mut self) {
if self.ch_is('e') || self.ch_is('E') {
@ -1393,26 +1098,21 @@ impl<'a> StringReader<'a> {
self.bump();
let start = self.pos;
// the eof will be picked up by the final `'` check below
let c2 = self.ch.unwrap_or('\x00');
self.bump();
// If the character is an ident start not followed by another single
// quote, then this is a lifetime name:
if (ident_start(Some(c2)) || c2.is_numeric()) && !self.ch_is('\'') {
let starts_with_number = self.ch.unwrap_or('\x00').is_numeric();
if (ident_start(self.ch) || starts_with_number) && !self.nextch_is('\'') {
self.bump();
while ident_continue(self.ch) {
self.bump();
}
// lifetimes shouldn't end with a single quote
// if we find one, then this is an invalid character literal
if self.ch_is('\'') {
self.err_span_(
start_with_quote,
self.next_pos,
"character literal may only contain one codepoint");
let id = self.name_from(start);
self.bump();
return Ok(token::Literal(token::Err(Symbol::intern("??")), None))
self.validate_char_escape(start_with_quote);
return Ok(token::Literal(token::Char(id), None))
}
// Include the leading `'` in the real identifier, for macro
@ -1422,7 +1122,7 @@ impl<'a> StringReader<'a> {
self.mk_ident(lifetime_name)
});
if c2.is_numeric() {
if starts_with_number {
// this is a recovered lifetime written `'1`, error but accept it
self.err_span_(
start_with_quote,
@ -1433,58 +1133,30 @@ impl<'a> StringReader<'a> {
return Ok(token::Lifetime(ident));
}
let valid = self.scan_char_or_byte(start, c2, /* ascii_only */ false, '\'');
if !self.ch_is('\'') {
let pos = self.pos;
loop {
self.bump();
if self.ch_is('\'') {
let start = self.src_index(start);
let end = self.src_index(self.pos);
self.bump();
let span = self.mk_sp(start_with_quote, self.pos);
self.sess.span_diagnostic
.struct_span_err(span,
"character literal may only contain one codepoint")
.span_suggestion(
span,
"if you meant to write a `str` literal, use double quotes",
format!("\"{}\"", &self.src[start..end]),
Applicability::MachineApplicable
).emit();
return Ok(token::Literal(token::Err(Symbol::intern("??")), None))
}
if self.ch_is('\n') || self.is_eof() || self.ch_is('/') {
// Only attempt to infer single line string literals. If we encounter
// a slash, bail out in order to avoid nonsensical suggestion when
// involving comments.
break;
}
}
self.fatal_span_verbose(start_with_quote, pos,
String::from("character literal may only contain one codepoint")).raise();
}
let id = if valid {
self.name_from(start)
} else {
Symbol::intern("0")
};
self.bump(); // advance ch past token
let msg = "unterminated character literal";
let id = self.scan_single_quoted_string(start_with_quote, msg);
self.validate_char_escape(start_with_quote);
let suffix = self.scan_optional_raw_name();
Ok(token::Literal(token::Char(id), suffix))
}
'b' => {
self.bump();
let lit = match self.ch {
Some('\'') => self.scan_byte(),
Some('"') => self.scan_byte_string(),
Some('\'') => {
let start_with_quote = self.pos;
self.bump();
let msg = "unterminated byte constant";
let id = self.scan_single_quoted_string(start_with_quote, msg);
self.validate_byte_escape(start_with_quote);
token::Byte(id)
},
Some('"') => {
let start_with_quote = self.pos;
let msg = "unterminated double quote byte string";
let id = self.scan_double_quoted_string(msg);
self.validate_byte_str_escape(start_with_quote);
token::ByteStr(id)
},
Some('r') => self.scan_raw_byte_string(),
_ => unreachable!(), // Should have been a token::Ident above.
};
@ -1493,32 +1165,11 @@ impl<'a> StringReader<'a> {
Ok(token::Literal(lit, suffix))
}
'"' => {
let start_bpos = self.pos;
let mut valid = true;
self.bump();
while !self.ch_is('"') {
if self.is_eof() {
let last_bpos = self.pos;
self.fatal_span_(start_bpos,
last_bpos,
"unterminated double quote string").raise();
}
let ch_start = self.pos;
let ch = self.ch.unwrap();
self.bump();
valid &= self.scan_char_or_byte(ch_start, ch, /* ascii_only */ false, '"');
}
// adjust for the ASCII " at the start of the literal
let id = if valid {
self.name_from(start_bpos + BytePos(1))
} else {
Symbol::intern("??")
};
self.bump();
let start_with_quote = self.pos;
let msg = "unterminated double quote string";
let id = self.scan_double_quoted_string(msg);
self.validate_str_escape(start_with_quote);
let suffix = self.scan_optional_raw_name();
Ok(token::Literal(token::Str_(id), suffix))
}
'r' => {
@ -1659,12 +1310,6 @@ impl<'a> StringReader<'a> {
}
}
/// Advances past consecutive pattern-whitespace characters, stopping at the
/// first character that is not whitespace or at the end of input.
fn consume_whitespace(&mut self) {
    loop {
        if !is_pattern_whitespace(self.ch) || self.is_eof() {
            break;
        }
        self.bump();
    }
}
fn read_to_eol(&mut self) -> String {
let mut val = String::new();
while !self.ch_is('\n') && !self.is_eof() {
@ -1698,73 +1343,63 @@ impl<'a> StringReader<'a> {
(self.ch_is('#') && self.nextch_is('!') && !self.nextnextch_is('['))
}
fn scan_byte(&mut self) -> token::Lit {
self.bump();
fn scan_single_quoted_string(&mut self,
start_with_quote: BytePos,
unterminated_msg: &str) -> ast::Name {
// assumes that first `'` is consumed
let start = self.pos;
// lex `'''` as a single char, for recovery
if self.ch_is('\'') && self.nextch_is('\'') {
self.bump();
} else {
let mut first = true;
loop {
if self.ch_is('\'') {
break;
}
if self.ch_is('\\') && (self.nextch_is('\'') || self.nextch_is('\\')) {
self.bump();
self.bump();
} else {
// Only attempt to infer single line string literals. If we encounter
// a slash, bail out in order to avoid nonsensical suggestion when
// involving comments.
if self.is_eof()
|| (self.ch_is('/') && !first)
|| (self.ch_is('\n') && !self.nextch_is('\'')) {
// the eof will be picked up by the final `'` check below
let c2 = self.ch.unwrap_or('\x00');
self.bump();
let valid = self.scan_char_or_byte(start,
c2,
// ascii_only =
true,
'\'');
if !self.ch_is('\'') {
// Byte offsetting here is okay because the
// character before position `start` are an
// ascii single quote and ascii 'b'.
let pos = self.pos;
self.fatal_span_verbose(start - BytePos(2),
pos,
"unterminated byte constant".to_string()).raise();
self.fatal_span_(start_with_quote, self.pos, unterminated_msg.into())
.raise()
}
self.bump();
}
first = false;
}
}
let id = if valid {
self.name_from(start)
} else {
Symbol::intern("?")
};
self.bump(); // advance ch past token
token::Byte(id)
let id = self.name_from(start);
self.bump();
id
}
/// Scans the two hex digits of a `\xHH` escape by delegating to
/// `scan_hex_digits`; `delim` and `below_0x7f_only` are passed through
/// unchanged. Returns `true` if the escape is valid.
#[inline]
fn scan_byte_escape(&mut self, delim: char, below_0x7f_only: bool) -> bool {
self.scan_hex_digits(2, delim, below_0x7f_only)
}
fn scan_byte_string(&mut self) -> token::Lit {
fn scan_double_quoted_string(&mut self, unterminated_msg: &str) -> ast::Name {
debug_assert!(self.ch_is('\"'));
let start_with_quote = self.pos;
self.bump();
let start = self.pos;
let mut valid = true;
while !self.ch_is('"') {
if self.is_eof() {
let pos = self.pos;
self.fatal_span_(start, pos, "unterminated double quote byte string").raise();
self.fatal_span_(start_with_quote, pos, unterminated_msg).raise();
}
if self.ch_is('\\') && (self.nextch_is('\\') || self.nextch_is('"')) {
self.bump();
}
let ch_start = self.pos;
let ch = self.ch.unwrap();
self.bump();
valid &= self.scan_char_or_byte(ch_start,
ch,
// ascii_only =
true,
'"');
}
let id = if valid {
self.name_from(start)
} else {
Symbol::intern("??")
};
let id = self.name_from(start);
self.bump();
token::ByteStr(id)
id
}
fn scan_raw_byte_string(&mut self) -> token::Lit {
@ -1826,6 +1461,70 @@ impl<'a> StringReader<'a> {
token::ByteStrRaw(self.name_from_to(content_start_bpos, content_end_bpos), hash_count)
}
/// Validates the contents of a char literal and reports an unescape error, if any.
///
/// The contents are the source text strictly between `start_with_quote` and
/// the byte before `self.pos`; the error is reported against the span from
/// `start_with_quote` to `self.pos`.
fn validate_char_escape(&self, start_with_quote: BytePos) {
    let content_start = start_with_quote + BytePos(1);
    let content_end = self.pos - BytePos(1);
    self.with_str_from_to(content_start, content_end, |lit| {
        match unescape::unescape_char(lit) {
            Ok(_) => {}
            Err((off, err)) => emit_unescape_error(
                &self.sess.span_diagnostic,
                lit,
                self.mk_sp(start_with_quote, self.pos),
                unescape::Mode::Char,
                0..off,
                err,
            ),
        }
    });
}
/// Validates the contents of a byte literal and reports an unescape error, if any.
///
/// The contents are the source text strictly between `start_with_quote` and
/// the byte before `self.pos`; the error is reported against the span from
/// `start_with_quote` to `self.pos`.
fn validate_byte_escape(&self, start_with_quote: BytePos) {
    let content_start = start_with_quote + BytePos(1);
    let content_end = self.pos - BytePos(1);
    self.with_str_from_to(content_start, content_end, |lit| {
        match unescape::unescape_byte(lit) {
            Ok(_) => {}
            Err((off, err)) => emit_unescape_error(
                &self.sess.span_diagnostic,
                lit,
                self.mk_sp(start_with_quote, self.pos),
                unescape::Mode::Byte,
                0..off,
                err,
            ),
        }
    });
}
/// Validates the contents of a string literal, reporting every unescape error found.
///
/// The contents are the source text strictly between `start_with_quote` and
/// the byte before `self.pos`; each error is reported against the span from
/// `start_with_quote` to `self.pos` together with the range handed back by
/// `unescape_str`.
fn validate_str_escape(&self, start_with_quote: BytePos) {
    let content_start = start_with_quote + BytePos(1);
    let content_end = self.pos - BytePos(1);
    self.with_str_from_to(content_start, content_end, |lit| {
        let mut report = |range, unescaped| {
            if let Err(err) = unescaped {
                emit_unescape_error(
                    &self.sess.span_diagnostic,
                    lit,
                    self.mk_sp(start_with_quote, self.pos),
                    unescape::Mode::Str,
                    range,
                    err,
                )
            }
        };
        unescape::unescape_str(lit, &mut report)
    });
}
/// Validates the contents of a byte string literal, reporting every unescape error found.
///
/// The contents are the source text strictly between `start_with_quote` and
/// the byte before `self.pos`; each error is reported against the span from
/// `start_with_quote` to `self.pos` together with the range handed back by
/// `unescape_byte_str`.
fn validate_byte_str_escape(&self, start_with_quote: BytePos) {
    let content_start = start_with_quote + BytePos(1);
    let content_end = self.pos - BytePos(1);
    self.with_str_from_to(content_start, content_end, |lit| {
        let mut report = |range, unescaped| {
            if let Err(err) = unescaped {
                emit_unescape_error(
                    &self.sess.span_diagnostic,
                    lit,
                    self.mk_sp(start_with_quote, self.pos),
                    unescape::Mode::ByteStr,
                    range,
                    err,
                )
            }
        };
        unescape::unescape_byte_str(lit, &mut report)
    });
}
}
// This tests the character for the unicode property 'PATTERN_WHITE_SPACE' which

View File

@ -18,7 +18,6 @@ use log::debug;
use rustc_data_structures::fx::FxHashSet;
use std::borrow::Cow;
use std::iter;
use std::path::{Path, PathBuf};
use std::str;
@ -33,6 +32,11 @@ pub mod attr;
pub mod classify;
pub(crate) mod unescape;
use unescape::{unescape_str, unescape_char, unescape_byte_str, unescape_byte, EscapeError};
pub(crate) mod unescape_error_reporting;
/// Info about a parsing session.
pub struct ParseSess {
pub span_diagnostic: Handler,
@ -306,133 +310,6 @@ pub fn stream_to_parser(sess: &ParseSess, stream: TokenStream) -> Parser<'_> {
Parser::new(sess, stream, None, true, false)
}
/// Parses a string representing a character literal into its final form.
/// Rather than just accepting/rejecting a given literal, unescapes it as
/// well. Can take any slice prefixed by a character escape. Returns the
/// character and the number of characters consumed.
///
/// `diag`, when present, is used only to report a `\u{...}` escape whose value
/// is not a valid `char`; U+FFFD is returned in its place. Any other malformed
/// input panics (out-of-range index, failed `unwrap`, or the final `panic!`).
fn char_lit(lit: &str, diag: Option<(Span, &Handler)>) -> (char, isize) {
use std::char;
// Handle non-escaped chars first.
if lit.as_bytes()[0] != b'\\' {
// If the first byte isn't '\\' it might part of a multi-byte char, so
// get the char with chars().
let c = lit.chars().next().unwrap();
return (c, 1);
}
// Handle escaped chars.
match lit.as_bytes()[1] as char {
'"' => ('"', 2),
'n' => ('\n', 2),
'r' => ('\r', 2),
't' => ('\t', 2),
'\\' => ('\\', 2),
'\'' => ('\'', 2),
'0' => ('\0', 2),
'x' => {
// `\xHH`: exactly two hex digits follow the `x`.
let v = u32::from_str_radix(&lit[2..4], 16).unwrap();
let c = char::from_u32(v).unwrap();
(c, 4)
}
'u' => {
assert_eq!(lit.as_bytes()[2], b'{');
let idx = lit.find('}').unwrap();
// All digits and '_' are ascii, so treat each byte as a char.
let mut v: u32 = 0;
for c in lit[3..idx].bytes() {
let c = char::from(c);
if c != '_' {
let x = c.to_digit(16).unwrap();
v = v.checked_mul(16).unwrap().checked_add(x).unwrap();
}
}
// Not a valid scalar value: report it if possible and substitute U+FFFD.
let c = char::from_u32(v).unwrap_or_else(|| {
if let Some((span, diag)) = diag {
let mut diag = diag.struct_span_err(span, "invalid unicode character escape");
if v > 0x10FFFF {
diag.help("unicode escape must be at most 10FFFF").emit();
} else {
diag.help("unicode escape must not be a surrogate").emit();
}
}
'\u{FFFD}'
});
// Everything up to and including the `}` was consumed.
(c, (idx + 1) as isize)
}
_ => panic!("lexer should have rejected a bad character escape {}", lit)
}
}
/// Parses a string representing a string literal into its final form. Does unescaping.
///
/// A backslash followed by a line break (LF or CRLF) drops the line break and
/// the whitespace after it; a CRLF pair outside an escape becomes a single
/// `\n`; every other escape is decoded by `char_lit`, to which `diag` is
/// passed through. Panics on input the lexer should have rejected, such as a
/// bare CR or a trailing backslash.
fn str_lit(lit: &str, diag: Option<(Span, &Handler)>) -> String {
debug!("str_lit: given {}", lit.escape_default());
let mut res = String::with_capacity(lit.len());
let error = |i| format!("lexer should have rejected {} at {}", lit, i);
/// Eat everything up to a non-whitespace.
fn eat<'a>(it: &mut iter::Peekable<str::CharIndices<'a>>) {
loop {
match it.peek().map(|x| x.1) {
Some(' ') | Some('\n') | Some('\r') | Some('\t') => {
it.next();
},
_ => { break; }
}
}
}
let mut chars = lit.char_indices().peekable();
while let Some((i, c)) = chars.next() {
match c {
'\\' => {
let ch = chars.peek().unwrap_or_else(|| {
panic!("{}", error(i))
}).1;
if ch == '\n' {
eat(&mut chars);
} else if ch == '\r' {
chars.next();
let ch = chars.peek().unwrap_or_else(|| {
panic!("{}", error(i))
}).1;
if ch != '\n' {
panic!("lexer accepted bare CR");
}
eat(&mut chars);
} else {
// otherwise, a normal escape
let (c, n) = char_lit(&lit[i..], diag);
for _ in 0..n - 1 { // we don't need to move past the first \
chars.next();
}
res.push(c);
}
},
'\r' => {
// CR outside an escape must start a CRLF pair, which becomes `\n`.
let ch = chars.peek().unwrap_or_else(|| {
panic!("{}", error(i))
}).1;
if ch != '\n' {
panic!("lexer accepted bare CR");
}
chars.next();
res.push('\n');
}
c => res.push(c),
}
}
res.shrink_to_fit(); // probably not going to do anything, unless there was an escape.
debug!("parse_str_lit: returning {}", res);
res
}
/// Parses a string representing a raw string literal into its final form. The
/// only operation this does is convert embedded CRLF into a single LF.
fn raw_str_lit(lit: &str) -> String {
@ -475,9 +352,23 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
use ast::LitKind;
match lit {
token::Byte(i) => (true, Some(LitKind::Byte(byte_lit(&i.as_str()).0))),
token::Char(i) => (true, Some(LitKind::Char(char_lit(&i.as_str(), diag).0))),
token::Err(i) => (true, Some(LitKind::Err(i))),
token::Byte(i) => {
let lit_kind = match unescape_byte(&i.as_str()) {
Ok(c) => LitKind::Byte(c),
Err((_, EscapeError::MoreThanOneChar)) => LitKind::Err(i),
Err(_) => LitKind::Byte(0),
};
(true, Some(lit_kind))
},
token::Char(i) => {
let lit_kind = match unescape_char(&i.as_str()) {
Ok(c) => LitKind::Char(c),
Err((_, EscapeError::MoreThanOneChar)) => LitKind::Err(i),
Err(_) => LitKind::Char('\u{FFFD}'),
};
(true, Some(lit_kind))
},
token::Err(i) => (true, Some(LitKind::Err(i))),
// There are some valid suffixes for integer and float literals,
// so all the handling is done internally.
@ -491,7 +382,14 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
// string in the Token.
let s = &sym.as_str();
if s.as_bytes().iter().any(|&c| c == b'\\' || c == b'\r') {
sym = Symbol::intern(&str_lit(s, diag));
let mut buf = String::with_capacity(s.len());
unescape_str(s, &mut |_, unescaped_char| {
match unescaped_char {
Ok(c) => buf.push(c),
Err(_) => buf.push('\u{FFFD}'),
}
});
sym = Symbol::intern(&buf)
}
(true, Some(LitKind::Str(sym, ast::StrStyle::Cooked)))
}
@ -504,7 +402,16 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
(true, Some(LitKind::Str(sym, ast::StrStyle::Raw(n))))
}
token::ByteStr(i) => {
(true, Some(LitKind::ByteStr(byte_str_lit(&i.as_str()))))
let s = &i.as_str();
let mut buf = Vec::with_capacity(s.len());
unescape_byte_str(s, &mut |_, unescaped_byte| {
match unescaped_byte {
Ok(c) => buf.push(c),
Err(_) => buf.push(0),
}
});
buf.shrink_to_fit();
(true, Some(LitKind::ByteStr(Lrc::new(buf))))
}
token::ByteStrRaw(i, _) => {
(true, Some(LitKind::ByteStr(Lrc::new(i.to_string().into_bytes()))))
@ -559,95 +466,6 @@ fn float_lit(s: &str, suffix: Option<Symbol>, diag: Option<(Span, &Handler)>)
filtered_float_lit(Symbol::intern(s), suffix, diag)
}
/// Parses a string representing a byte literal into its final form. Similar to `char_lit`.
///
/// Returns the byte together with the number of input bytes consumed: 1 for a
/// plain byte, 2 for a simple escape such as `\n`, 4 for a `\xHH` escape.
/// Any escape character other than the simple ones is handled as `\xHH`.
/// Panics on input the lexer should have rejected.
fn byte_lit(lit: &str) -> (u8, usize) {
    let bytes = lit.as_bytes();
    let err = |step: i32| format!("lexer accepted invalid byte literal {} step {}", lit, step);

    if bytes.len() == 1 {
        return (bytes[0], 1);
    }

    assert_eq!(bytes[0], b'\\', "{}", err(0));
    let simple = match bytes[1] {
        b'"' => b'"',
        b'n' => b'\n',
        b'r' => b'\r',
        b't' => b'\t',
        b'\\' => b'\\',
        b'\'' => b'\'',
        b'0' => b'\0',
        _ => {
            // Two hex digits are expected right after the escape character.
            return match u64::from_str_radix(&lit[2..4], 16) {
                Ok(value) if value > 0xFF => panic!("{}", err(2)),
                Ok(value) => (value as u8, 4),
                Err(_) => panic!("{}", err(3)),
            };
        }
    };
    (simple, 2)
}
/// Parses the contents of a byte string literal into the bytes it denotes.
///
/// A backslash followed by a line break (LF or CRLF) drops the line break and
/// the whitespace after it; a CRLF pair outside an escape becomes a single
/// `\n` byte; every other escape is decoded by `byte_lit`. Panics on input the
/// lexer should have rejected, such as a bare CR or a trailing backslash.
fn byte_str_lit(lit: &str) -> Lrc<Vec<u8>> {
let mut res = Vec::with_capacity(lit.len());
let error = |i| panic!("lexer should have rejected {} at {}", lit, i);
/// Eat everything up to a non-whitespace.
fn eat<I: Iterator<Item=(usize, u8)>>(it: &mut iter::Peekable<I>) {
loop {
match it.peek().map(|x| x.1) {
Some(b' ') | Some(b'\n') | Some(b'\r') | Some(b'\t') => {
it.next();
},
_ => { break; }
}
}
}
// byte string literals *must* be ASCII, but the escapes don't have to be
let mut chars = lit.bytes().enumerate().peekable();
loop {
match chars.next() {
Some((i, b'\\')) => {
match chars.peek().unwrap_or_else(|| error(i)).1 {
b'\n' => eat(&mut chars),
b'\r' => {
chars.next();
if chars.peek().unwrap_or_else(|| error(i)).1 != b'\n' {
panic!("lexer accepted bare CR");
}
eat(&mut chars);
}
_ => {
// otherwise, a normal escape
let (c, n) = byte_lit(&lit[i..]);
// we don't need to move past the first \
for _ in 0..n - 1 {
chars.next();
}
res.push(c);
}
}
},
// CR outside an escape must start a CRLF pair, which becomes `\n`.
Some((i, b'\r')) => {
if chars.peek().unwrap_or_else(|| error(i)).1 != b'\n' {
panic!("lexer accepted bare CR");
}
chars.next();
res.push(b'\n');
}
Some((_, c)) => res.push(c),
None => break,
}
}
Lrc::new(res)
}
fn integer_lit(s: &str, suffix: Option<Symbol>, diag: Option<(Span, &Handler)>)
-> Option<ast::LitKind> {
// s can only be ascii, byte indexing is fine

View File

@ -0,0 +1,515 @@
//! Utilities for validating string and char literals and turning them into
//! values they represent.
use std::str::Chars;
use std::ops::Range;
/// Reasons why a literal, or a single element of a string literal, could not
/// be unescaped.
#[derive(Debug, PartialEq, Eq)]
pub(crate) enum EscapeError {
/// A char or byte literal with no contents.
ZeroChars,
/// A char or byte literal with input left over after the first element.
MoreThanOneChar,
/// A backslash with nothing after it.
LoneSlash,
/// A backslash followed by a character that starts no known escape.
InvalidEscape,
/// An unescaped `\r` that is not followed by `\n`.
BareCarriageReturn,
/// An unescaped character that may only be written as an escape here:
/// tab, newline, `\r` followed by `\n`, or the quote that delimits the literal.
EscapeOnlyChar,
/// `\x` followed by fewer than two characters.
TooShortHexEscape,
/// A non-hex-digit character in a `\x` escape.
InvalidCharInHexEscape,
/// A `\x` escape above `0x7F` outside of byte and byte string literals.
OutOfRangeHexEscape,
/// `\u` not followed by `{`.
NoBraceInUnicodeEscape,
/// A character other than a hex digit, `_` or `}` in a `\u{...}` escape.
InvalidCharInUnicodeEscape,
/// `\u{}`.
EmptyUnicodeEscape,
/// The input ended before the closing `}` of a `\u{...}` escape.
UnclosedUnicodeEscape,
/// `_` directly after `\u{`.
LeadingUnderscoreUnicodeEscape,
/// A `\u{...}` escape with more than six hex digits.
OverlongUnicodeEscape,
/// A `\u{...}` escape whose value is at most `0x10FFFF` but is not a valid `char`.
LoneSurrogateUnicodeEscape,
/// A `\u{...}` escape whose value is above `0x10FFFF`.
OutOfRangeUnicodeEscape,
/// An otherwise well-formed `\u{...}` escape in a byte or byte string literal.
UnicodeEscapeInByte,
/// An unescaped non-ASCII character in a byte or byte string literal.
NonAsciiCharInByte,
}
/// Takes the contents of a char literal (without quotes) and returns the
/// unescaped char.
///
/// On failure returns the error together with the number of bytes of
/// `literal_text` that had been consumed when it was detected.
pub(crate) fn unescape_char(literal_text: &str) -> Result<char, (usize, EscapeError)> {
    let mut rest = literal_text.chars();
    match unescape_char_or_byte(&mut rest, Mode::Char) {
        Ok(c) => Ok(c),
        Err(err) => {
            let consumed = literal_text.len() - rest.as_str().len();
            Err((consumed, err))
        }
    }
}
/// Takes the contents of a string literal (without quotes) and produces a
/// sequence of unescaped characters or errors.
///
/// `callback` is invoked once per element with the byte range the element
/// occupies in `literal_text` and either the unescaped char or the error.
pub(crate) fn unescape_str<F>(literal_text: &str, callback: &mut F)
where
F: FnMut(Range<usize>, Result<char, EscapeError>),
{
unescape_str_or_byte_str(literal_text, Mode::Str, callback)
}
pub(crate) fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> {
let mut chars = literal_text.chars();
unescape_char_or_byte(&mut chars, Mode::Byte)
.map(byte_from_char)
.map_err(|err| (literal_text.len() - chars.as_str().len(), err))
}
/// Takes the contents of a byte string literal (without quotes) and produces
/// a sequence of unescaped bytes or errors.
///
/// `callback` is invoked once per element with the byte range the element
/// occupies in `literal_text` and either the unescaped byte or the error.
pub(crate) fn unescape_byte_str<F>(literal_text: &str, callback: &mut F)
where
    F: FnMut(Range<usize>, Result<u8, EscapeError>),
{
    let mut forward_as_byte = |range: Range<usize>, unescaped: Result<char, EscapeError>| {
        callback(range, unescaped.map(byte_from_char))
    };
    unescape_str_or_byte_str(literal_text, Mode::ByteStr, &mut forward_as_byte)
}
/// The kind of literal whose contents are being unescaped.
#[derive(Debug, Clone, Copy)]
pub(crate) enum Mode {
    Char,
    Str,
    Byte,
    ByteStr,
}

impl Mode {
    /// `true` for the literal kinds delimited by `'`: `Char` and `Byte`.
    fn in_single_quotes(self) -> bool {
        !self.in_double_quotes()
    }

    /// `true` for the literal kinds delimited by `"`: `Str` and `ByteStr`.
    pub(crate) fn in_double_quotes(self) -> bool {
        match self {
            Mode::Str | Mode::ByteStr => true,
            Mode::Char | Mode::Byte => false,
        }
    }

    /// `true` for the byte-oriented literal kinds: `Byte` and `ByteStr`.
    pub(crate) fn is_bytes(self) -> bool {
        match self {
            Mode::Char | Mode::Str => false,
            Mode::Byte | Mode::ByteStr => true,
        }
    }
}
/// Interprets one literal element whose first character, `first_char`, has
/// already been taken from `chars`.
///
/// If `first_char` is not a backslash it is validated as a bare character.
/// Otherwise the rest of the escape sequence is consumed from `chars` and
/// decoded. `mode` decides which quote character must be escaped and whether
/// the byte-literal restrictions apply.
fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
if first_char != '\\' {
return match first_char {
'\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
// Peek at the next character on a clone so that nothing is consumed.
'\r' => Err(if chars.clone().next() == Some('\n') {
EscapeError::EscapeOnlyChar
} else {
EscapeError::BareCarriageReturn
}),
// Only the quote that delimits this kind of literal must be escaped.
'\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
'"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
_ => {
if mode.is_bytes() && !first_char.is_ascii() {
return Err(EscapeError::NonAsciiCharInByte);
}
Ok(first_char)
}
};
}
let second_char = chars.next().ok_or(EscapeError::LoneSlash)?;
let res = match second_char {
'"' => '"',
'n' => '\n',
'r' => '\r',
't' => '\t',
'\\' => '\\',
'\'' => '\'',
'0' => '\0',
'x' => {
// Exactly two hex digits are read.
let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
let value = hi * 16 + lo;
// Values above 0x7F are accepted in byte modes only.
if !mode.is_bytes() && !is_ascii(value) {
return Err(EscapeError::OutOfRangeHexEscape);
}
// Two hex digits give at most 0xFF, so the cast cannot truncate.
let value = value as u8;
value as char
}
'u' => {
if chars.next() != Some('{') {
return Err(EscapeError::NoBraceInUnicodeEscape);
}
// The first character after `{` must be a hex digit.
let mut n_digits = 1;
let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
'_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
'}' => return Err(EscapeError::EmptyUnicodeEscape),
c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
};
loop {
match chars.next() {
None => return Err(EscapeError::UnclosedUnicodeEscape),
Some('_') => continue,
Some('}') => {
if n_digits > 6 {
return Err(EscapeError::OverlongUnicodeEscape);
}
// Reported only once the escape is otherwise well formed.
if mode.is_bytes() {
return Err(EscapeError::UnicodeEscapeInByte);
}
break std::char::from_u32(value).ok_or_else(|| {
if value > 0x10FFFF {
EscapeError::OutOfRangeUnicodeEscape
} else {
EscapeError::LoneSurrogateUnicodeEscape
}
})?;
}
Some(c) => {
let digit = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
n_digits += 1;
// Digits past the sixth are counted but not accumulated; the
// overlong error is raised at the closing brace.
if n_digits > 6 {
continue;
}
let digit = digit as u32;
value = value * 16 + digit;
}
};
}
}
_ => return Err(EscapeError::InvalidEscape),
};
Ok(res)
}
/// Unescapes the single element that `chars` is expected to hold.
///
/// Fails with `ZeroChars` when `chars` is empty and with `MoreThanOneChar`
/// when input remains after the first element; errors from `scan_escape` are
/// passed through.
fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
    let first_char = match chars.next() {
        Some(c) => c,
        None => return Err(EscapeError::ZeroChars),
    };
    let unescaped = scan_escape(first_char, chars, mode)?;
    match chars.next() {
        None => Ok(unescaped),
        Some(_) => Err(EscapeError::MoreThanOneChar),
    }
}
/// Takes the contents of a string or byte string literal (without quotes) and
/// produces a sequence of unescaped characters or errors.
///
/// `callback` is invoked once per element with the byte range the element
/// occupies in `src` and the result of unescaping it. A backslash directly
/// followed by a line break produces no callback: the line break and the
/// ASCII whitespace after it are skipped. `mode` must be a double-quoted mode.
fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
where
    F: FnMut(Range<usize>, Result<char, EscapeError>),
{
    /// Advances `chars` past a leading run of spaces, tabs and line breaks.
    fn skip_ascii_whitespace(chars: &mut Chars<'_>) {
        let rest = chars.as_str();
        let skipped = rest
            .bytes()
            .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
            .unwrap_or(rest.len());
        *chars = rest[skipped..].chars()
    }

    assert!(mode.in_double_quotes());
    let total_len = src.len();
    let mut chars = src.chars();
    while let Some(first_char) = chars.next() {
        // Byte offset at which `first_char` begins.
        let start = total_len - chars.as_str().len() - first_char.len_utf8();
        let unescaped = match first_char {
            '\\' => {
                // Look at the next two characters without consuming them.
                let mut lookahead = chars.clone();
                let is_line_continuation = match (lookahead.next(), lookahead.next()) {
                    (Some('\n'), _) | (Some('\r'), Some('\n')) => true,
                    _ => false,
                };
                if is_line_continuation {
                    skip_ascii_whitespace(&mut chars);
                    continue;
                }
                scan_escape(first_char, &mut chars, mode)
            }
            '\r' => {
                // CRLF is normalized to a single `\n`.
                if chars.clone().next() == Some('\n') {
                    chars.next();
                    Ok('\n')
                } else {
                    scan_escape(first_char, &mut chars, mode)
                }
            }
            '\n' => Ok('\n'),
            '\t' => Ok('\t'),
            _ => scan_escape(first_char, &mut chars, mode),
        };
        let end = total_len - chars.as_str().len();
        callback(start..end, unescaped);
    }
}
/// Converts `c` to the byte with the same numeric value.
///
/// Panics if the code point of `c` does not fit into a `u8`.
fn byte_from_char(c: char) -> u8 {
    let code_point = u32::from(c);
    assert!(code_point <= u32::from(u8::max_value()), "guaranteed because of Mode::Byte");
    code_point as u8
}
/// Returns `true` when `x` lies in the ASCII range `0x00..=0x7F`.
fn is_ascii(x: u32) -> bool {
    x < 0x80
}
// Unit tests for the unescaping entry points `unescape_char`, `unescape_str`,
// `unescape_byte` and `unescape_byte_str`. Inputs are literal bodies without
// the surrounding quotes.
#[cfg(test)]
mod tests {
use super::*;
// Malformed character-literal bodies must be rejected with the expected
// error kind; the first element of the error pair is ignored here.
#[test]
fn test_unescape_char_bad() {
fn check(literal_text: &str, expected_error: EscapeError) {
let actual_result = unescape_char(literal_text).map_err(|(_offset, err)| err);
assert_eq!(actual_result, Err(expected_error));
}
check("", EscapeError::ZeroChars);
check(r"\", EscapeError::LoneSlash);
check("\n", EscapeError::EscapeOnlyChar);
check("\r\n", EscapeError::EscapeOnlyChar);
check("\t", EscapeError::EscapeOnlyChar);
check("'", EscapeError::EscapeOnlyChar);
check("\r", EscapeError::BareCarriageReturn);
check("spam", EscapeError::MoreThanOneChar);
check(r"\x0ff", EscapeError::MoreThanOneChar);
check(r#"\"a"#, EscapeError::MoreThanOneChar);
check(r"\na", EscapeError::MoreThanOneChar);
check(r"\ra", EscapeError::MoreThanOneChar);
check(r"\ta", EscapeError::MoreThanOneChar);
check(r"\\a", EscapeError::MoreThanOneChar);
check(r"\'a", EscapeError::MoreThanOneChar);
check(r"\0a", EscapeError::MoreThanOneChar);
check(r"\u{0}x", EscapeError::MoreThanOneChar);
check(r"\u{1F63b}}", EscapeError::MoreThanOneChar);
check(r"\v", EscapeError::InvalidEscape);
check(r"\💩", EscapeError::InvalidEscape);
check(r"\●", EscapeError::InvalidEscape);
check(r"\x", EscapeError::TooShortHexEscape);
check(r"\x0", EscapeError::TooShortHexEscape);
check(r"\xf", EscapeError::TooShortHexEscape);
check(r"\xa", EscapeError::TooShortHexEscape);
check(r"\xx", EscapeError::InvalidCharInHexEscape);
check(r"\xы", EscapeError::InvalidCharInHexEscape);
check(r"\x🦀", EscapeError::InvalidCharInHexEscape);
check(r"\xtt", EscapeError::InvalidCharInHexEscape);
// In a char literal `\x` escapes above 0x7f are out of range.
check(r"\xff", EscapeError::OutOfRangeHexEscape);
check(r"\xFF", EscapeError::OutOfRangeHexEscape);
check(r"\x80", EscapeError::OutOfRangeHexEscape);
check(r"\u", EscapeError::NoBraceInUnicodeEscape);
check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape);
check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape);
check(r"\u{", EscapeError::UnclosedUnicodeEscape);
check(r"\u{0000", EscapeError::UnclosedUnicodeEscape);
check(r"\u{}", EscapeError::EmptyUnicodeEscape);
check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape);
check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape);
check(r"\u{FFFFFF}", EscapeError::OutOfRangeUnicodeEscape);
check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape);
// NOTE(review): this case is an exact duplicate of the previous line.
check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape);
check(r"\u{DC00}", EscapeError::LoneSurrogateUnicodeEscape);
check(r"\u{DDDD}", EscapeError::LoneSurrogateUnicodeEscape);
check(r"\u{DFFF}", EscapeError::LoneSurrogateUnicodeEscape);
check(r"\u{D800}", EscapeError::LoneSurrogateUnicodeEscape);
check(r"\u{DAAA}", EscapeError::LoneSurrogateUnicodeEscape);
check(r"\u{DBFF}", EscapeError::LoneSurrogateUnicodeEscape);
}
// Well-formed character-literal bodies and the character each one denotes.
#[test]
fn test_unescape_char_good() {
fn check(literal_text: &str, expected_char: char) {
let actual_result = unescape_char(literal_text);
assert_eq!(actual_result, Ok(expected_char));
}
check("a", 'a');
check("ы", 'ы');
check("🦀", '🦀');
check(r#"\""#, '"');
check(r"\n", '\n');
check(r"\r", '\r');
check(r"\t", '\t');
check(r"\\", '\\');
check(r"\'", '\'');
check(r"\0", '\0');
check(r"\x00", '\0');
check(r"\x5a", 'Z');
check(r"\x5A", 'Z');
check(r"\x7f", 127 as char);
check(r"\u{0}", '\0');
check(r"\u{000000}", '\0');
check(r"\u{41}", 'A');
check(r"\u{0041}", 'A');
// Underscores are accepted inside the braces, just not in leading position.
check(r"\u{00_41}", 'A');
check(r"\u{4__1__}", 'A');
check(r"\u{1F63b}", '😻');
}
// Well-formed string-literal bodies and the text each one unescapes to.
#[test]
fn test_unescape_str_good() {
// Collects the characters reported through the callback; once an error
// has been recorded, later callbacks are ignored.
fn check(literal_text: &str, expected: &str) {
let mut buf = Ok(String::with_capacity(literal_text.len()));
unescape_str(literal_text, &mut |range, c| {
if let Ok(b) = &mut buf {
match c {
Ok(c) => b.push(c),
Err(e) => buf = Err((range, e)),
}
}
});
let buf = buf.as_ref().map(|it| it.as_ref());
assert_eq!(buf, Ok(expected))
}
check("foo", "foo");
check("", "");
// `\r\n` collapses to a single `\n`.
check(" \t\n\r\n", " \t\n\n");
// Backslash + line break drops the line break and the whitespace after it.
check("hello \\\n world", "hello world");
check("hello \\\r\n world", "hello world");
check("thread's", "thread's")
}
// Malformed byte-literal bodies must be rejected with the expected error
// kind; the first element of the error pair is ignored here.
#[test]
fn test_unescape_byte_bad() {
fn check(literal_text: &str, expected_error: EscapeError) {
let actual_result = unescape_byte(literal_text).map_err(|(_offset, err)| err);
assert_eq!(actual_result, Err(expected_error));
}
check("", EscapeError::ZeroChars);
check(r"\", EscapeError::LoneSlash);
check("\n", EscapeError::EscapeOnlyChar);
check("\r\n", EscapeError::EscapeOnlyChar);
check("\t", EscapeError::EscapeOnlyChar);
check("'", EscapeError::EscapeOnlyChar);
check("\r", EscapeError::BareCarriageReturn);
check("spam", EscapeError::MoreThanOneChar);
check(r"\x0ff", EscapeError::MoreThanOneChar);
check(r#"\"a"#, EscapeError::MoreThanOneChar);
check(r"\na", EscapeError::MoreThanOneChar);
check(r"\ra", EscapeError::MoreThanOneChar);
check(r"\ta", EscapeError::MoreThanOneChar);
check(r"\\a", EscapeError::MoreThanOneChar);
check(r"\'a", EscapeError::MoreThanOneChar);
check(r"\0a", EscapeError::MoreThanOneChar);
check(r"\v", EscapeError::InvalidEscape);
check(r"\💩", EscapeError::InvalidEscape);
check(r"\●", EscapeError::InvalidEscape);
check(r"\x", EscapeError::TooShortHexEscape);
check(r"\x0", EscapeError::TooShortHexEscape);
check(r"\xa", EscapeError::TooShortHexEscape);
check(r"\xf", EscapeError::TooShortHexEscape);
check(r"\xx", EscapeError::InvalidCharInHexEscape);
check(r"\xы", EscapeError::InvalidCharInHexEscape);
check(r"\x🦀", EscapeError::InvalidCharInHexEscape);
check(r"\xtt", EscapeError::InvalidCharInHexEscape);
check(r"\u", EscapeError::NoBraceInUnicodeEscape);
check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape);
check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape);
check(r"\u{", EscapeError::UnclosedUnicodeEscape);
check(r"\u{0000", EscapeError::UnclosedUnicodeEscape);
check(r"\u{}", EscapeError::EmptyUnicodeEscape);
check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape);
check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape);
check("ы", EscapeError::NonAsciiCharInByte);
check("🦀", EscapeError::NonAsciiCharInByte);
// Syntactically complete `\u{...}` escapes are reported as
// `UnicodeEscapeInByte`, including the out-of-range, surrogate and
// trailing-character inputs below.
check(r"\u{0}", EscapeError::UnicodeEscapeInByte);
check(r"\u{000000}", EscapeError::UnicodeEscapeInByte);
check(r"\u{41}", EscapeError::UnicodeEscapeInByte);
check(r"\u{0041}", EscapeError::UnicodeEscapeInByte);
check(r"\u{00_41}", EscapeError::UnicodeEscapeInByte);
check(r"\u{4__1__}", EscapeError::UnicodeEscapeInByte);
check(r"\u{1F63b}", EscapeError::UnicodeEscapeInByte);
check(r"\u{0}x", EscapeError::UnicodeEscapeInByte);
check(r"\u{1F63b}}", EscapeError::UnicodeEscapeInByte);
check(r"\u{FFFFFF}", EscapeError::UnicodeEscapeInByte);
check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte);
// NOTE(review): this case is an exact duplicate of the previous line.
check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte);
check(r"\u{DC00}", EscapeError::UnicodeEscapeInByte);
check(r"\u{DDDD}", EscapeError::UnicodeEscapeInByte);
check(r"\u{DFFF}", EscapeError::UnicodeEscapeInByte);
check(r"\u{D800}", EscapeError::UnicodeEscapeInByte);
check(r"\u{DAAA}", EscapeError::UnicodeEscapeInByte);
check(r"\u{DBFF}", EscapeError::UnicodeEscapeInByte);
}
// Well-formed byte-literal bodies and the byte each one denotes.
#[test]
fn test_unescape_byte_good() {
fn check(literal_text: &str, expected_byte: u8) {
let actual_result = unescape_byte(literal_text);
assert_eq!(actual_result, Ok(expected_byte));
}
check("a", b'a');
check(r#"\""#, b'"');
check(r"\n", b'\n');
check(r"\r", b'\r');
check(r"\t", b'\t');
check(r"\\", b'\\');
check(r"\'", b'\'');
check(r"\0", b'\0');
check(r"\x00", b'\0');
check(r"\x5a", b'Z');
check(r"\x5A", b'Z');
check(r"\x7f", 127);
// Unlike char literals, byte literals accept `\x` escapes above 0x7f.
check(r"\x80", 128);
check(r"\xff", 255);
check(r"\xFF", 255);
}
// Well-formed byte-string bodies and the bytes each one unescapes to.
#[test]
fn test_unescape_byte_str_good() {
// Collects the bytes reported through the callback; once an error has
// been recorded, later callbacks are ignored.
fn check(literal_text: &str, expected: &[u8]) {
let mut buf = Ok(Vec::with_capacity(literal_text.len()));
unescape_byte_str(literal_text, &mut |range, c| {
if let Ok(b) = &mut buf {
match c {
Ok(c) => b.push(c),
Err(e) => buf = Err((range, e)),
}
}
});
let buf = buf.as_ref().map(|it| it.as_ref());
assert_eq!(buf, Ok(expected))
}
check("foo", b"foo");
check("", b"");
// `\r\n` collapses to a single `\n`.
check(" \t\n\r\n", b" \t\n\n");
// Backslash + line break drops the line break and the whitespace after it.
check("hello \\\n world", b"hello world");
check("hello \\\r\n world", b"hello world");
check("thread's", b"thread's")
}
}

View File

@ -0,0 +1,200 @@
//! Utilities for rendering escape sequence errors as diagnostics.
use std::ops::Range;
use std::iter::once;
use syntax_pos::{Span, BytePos};
use crate::errors::{Handler, Applicability};
use super::unescape::{EscapeError, Mode};
/// Reports `error`, found while unescaping the literal body `lit`, as a
/// diagnostic through `handler`.
///
/// The span of the offending sequence is derived from `range` by offsetting
/// it from `span_with_quotes.lo() + 1`; depending on the error kind the
/// diagnostic points at that whole sequence, at its last character only, or
/// at the whole literal.
///
/// # Panics
///
/// Panics on `EscapeError::LoneSlash`, and on
/// `EscapeError::NonAsciiCharInByte` when `mode` is not a byte mode.
pub(crate) fn emit_unescape_error(
    handler: &Handler,
    // interior part of the literal, without quotes
    lit: &str,
    // full span of the literal, including quotes
    span_with_quotes: Span,
    mode: Mode,
    // range of the error inside `lit`
    range: Range<usize>,
    error: EscapeError,
) {
    log::debug!("emit_unescape_error: {:?}, {:?}, {:?}, {:?}, {:?}",
                lit, span_with_quotes, mode, range, error);
    // Span of the whole offending sequence.
    let span = {
        let Range { start, end } = range;
        let (start, end) = (start as u32, end as u32);
        // NOTE(review): the `+ 1` presumably skips the opening quote, i.e.
        // `span_with_quotes` is expected to start right at it -- confirm
        // against the callers that handle prefixed literals.
        let lo = span_with_quotes.lo() + BytePos(start + 1);
        let hi = lo + BytePos(end - start);
        span_with_quotes
            .with_lo(lo)
            .with_hi(hi)
    };
    // Last character of the offending sequence together with its own span.
    // Must only be called for errors whose `range` is non-empty.
    let last_char = || {
        let c = lit[range.clone()].chars().next_back().unwrap();
        let span = span.with_lo(span.hi() - BytePos(c.len_utf8() as u32));
        (c, span)
    };
    match error {
        EscapeError::LoneSurrogateUnicodeEscape => {
            handler.struct_span_err(span, "invalid unicode character escape")
                .help("unicode escape must not be a surrogate")
                .emit();
        }
        EscapeError::OutOfRangeUnicodeEscape => {
            handler.struct_span_err(span, "invalid unicode character escape")
                .help("unicode escape must be at most 10FFFF")
                .emit();
        }
        EscapeError::MoreThanOneChar => {
            // Points at the whole literal and offers the same body in
            // double quotes as a replacement.
            handler
                .struct_span_err(
                    span_with_quotes,
                    "character literal may only contain one codepoint",
                )
                .span_suggestion(
                    span_with_quotes,
                    "if you meant to write a `str` literal, use double quotes",
                    format!("\"{}\"", lit),
                    Applicability::MachineApplicable,
                ).emit()
        }
        EscapeError::EscapeOnlyChar => {
            let (c, _span) = last_char();
            let mut msg = if mode.is_bytes() {
                "byte constant must be escaped: "
            } else {
                "character constant must be escaped: "
            }.to_string();
            push_escaped_char(&mut msg, c);
            handler.span_err(span, msg.as_str())
        }
        EscapeError::BareCarriageReturn => {
            let msg = if mode.in_double_quotes() {
                "bare CR not allowed in string, use \\r instead"
            } else {
                "character constant must be escaped: \\r"
            };
            handler.span_err(span, msg);
        }
        EscapeError::InvalidEscape => {
            let (c, span) = last_char();
            let label = if mode.is_bytes() {
                "unknown byte escape"
            } else {
                "unknown character escape"
            };
            let mut msg = label.to_string();
            msg.push_str(": ");
            push_escaped_char(&mut msg, c);
            let mut diag = handler.struct_span_err(span, msg.as_str());
            diag.span_label(span, label);
            // The formatting-string hint is only given for non-byte
            // literals. The parentheses are required: `&&` binds tighter
            // than `||`, so without them the byte check would apply to `}`
            // only.
            if (c == '{' || c == '}') && !mode.is_bytes() {
                diag.help("if used in a formatting string, \
                           curly braces are escaped with `{{` and `}}`");
            } else if c == '\r' {
                diag.help("this is an isolated carriage return; \
                           consider checking your editor and version control settings");
            }
            diag.emit();
        }
        EscapeError::TooShortHexEscape => {
            handler.span_err(span, "numeric character escape is too short")
        }
        EscapeError::InvalidCharInHexEscape | EscapeError::InvalidCharInUnicodeEscape => {
            let (c, span) = last_char();
            let mut msg = if error == EscapeError::InvalidCharInHexEscape {
                "invalid character in numeric character escape: "
            } else {
                "invalid character in unicode escape: "
            }.to_string();
            push_escaped_char(&mut msg, c);
            handler.span_err(span, msg.as_str())
        }
        EscapeError::NonAsciiCharInByte => {
            assert!(mode.is_bytes());
            let (_c, span) = last_char();
            handler.span_err(span, "byte constant must be ASCII. \
                                    Use a \\xHH escape for a non-ASCII byte")
        }
        EscapeError::OutOfRangeHexEscape => {
            handler.span_err(span, "this form of character escape may only be used \
                                    with characters in the range [\\x00-\\x7f]")
        }
        EscapeError::LeadingUnderscoreUnicodeEscape => {
            let (_c, span) = last_char();
            handler.span_err(span, "invalid start of unicode escape")
        }
        EscapeError::OverlongUnicodeEscape => {
            handler.span_err(span, "overlong unicode escape (must have at most 6 hex digits)")
        }
        EscapeError::UnclosedUnicodeEscape => {
            handler.span_err(span, "unterminated unicode escape (needed a `}`)")
        }
        EscapeError::NoBraceInUnicodeEscape => {
            let msg = "incorrect unicode escape sequence";
            let mut diag = handler.struct_span_err(span, msg);
            let mut suggestion = "\\u{".to_owned();
            let mut suggestion_len = 0;
            let (c, char_span) = last_char();
            // Gather up to six hex digits, starting with the last character
            // of the reported range and continuing into the rest of the
            // literal, to build a braced replacement.
            let chars = once(c).chain(lit[range.end..].chars());
            for c in chars.take(6).take_while(|c| c.is_digit(16)) {
                suggestion.push(c);
                suggestion_len += c.len_utf8();
            }
            if suggestion_len > 0 {
                suggestion.push('}');
                let lo = char_span.lo();
                let hi = lo + BytePos(suggestion_len as u32);
                diag.span_suggestion(
                    span.with_lo(lo).with_hi(hi),
                    "format of unicode escape sequences uses braces",
                    suggestion,
                    Applicability::MaybeIncorrect,
                );
            } else {
                // No digits to work with: fall back to a generic hint.
                diag.span_label(span, msg);
                diag.help(
                    "format of unicode escape sequences is `\\u{...}`",
                );
            }
            diag.emit();
        }
        EscapeError::UnicodeEscapeInByte => {
            handler.span_err(span, "unicode escape sequences cannot be used \
                                    as a byte or in a byte string")
        }
        EscapeError::EmptyUnicodeEscape => {
            handler.span_err(span, "empty unicode escape (must have at least 1 hex digit)")
        }
        EscapeError::ZeroChars => {
            handler.span_err(span, "empty character literal")
        }
        EscapeError::LoneSlash => {
            panic!("lexer accepted unterminated literal with trailing slash")
        }
    }
}
/// Appends `c` to `msg` in a form suitable for user-facing error messages.
///
/// Characters in the range `U+0020..=U+007E` are appended verbatim, so `\`,
/// `'` and `"` are deliberately left unescaped; every other character is
/// appended as its `char::escape_default` sequence.
pub(crate) fn push_escaped_char(msg: &mut String, c: char) {
    let shown_verbatim = c >= '\u{20}' && c <= '\u{7e}';
    if shown_verbatim {
        msg.push(c);
    } else {
        for escaped in c.escape_default() {
            msg.push(escaped);
        }
    }
}

View File

@ -1,3 +1,4 @@
// compile-flags: -Z continue-parse-after-error
// ignore-tidy-tab
fn main() {
@ -76,7 +77,7 @@ raw { \n
println!("\x7B}\u8 {", 1);
//~^ ERROR incorrect unicode escape sequence
//~| ERROR argument never used
//~| ERROR invalid format string: expected `'}'` but string was terminated
// note: raw strings don't escape `\xFF` and `\u{FF}` sequences
println!(r#"\x7B}\u{8} {"#, 1);

View File

@ -1,13 +1,13 @@
error: incorrect unicode escape sequence
--> $DIR/format-string-error-2.rs:77:20
--> $DIR/format-string-error-2.rs:78:20
|
LL | println!("\x7B}\u8 {", 1);
| ^^-
| |
| help: format of unicode escape sequences uses braces: `\u{8}`
| |
| help: format of unicode escape sequences uses braces: `\u{8}`
error: invalid format string: expected `'}'`, found `'a'`
--> $DIR/format-string-error-2.rs:5:5
--> $DIR/format-string-error-2.rs:6:5
|
LL | format!("{
| - because of this opening brace
@ -17,7 +17,7 @@ LL | a");
= note: if you intended to print `{`, you can escape it using `{{`
error: invalid format string: expected `'}'`, found `'b'`
--> $DIR/format-string-error-2.rs:9:5
--> $DIR/format-string-error-2.rs:10:5
|
LL | format!("{ \
| - because of this opening brace
@ -28,7 +28,7 @@ LL | b");
= note: if you intended to print `{`, you can escape it using `{{`
error: invalid format string: expected `'}'`, found `'\'`
--> $DIR/format-string-error-2.rs:11:18
--> $DIR/format-string-error-2.rs:12:18
|
LL | format!(r#"{ \
| - ^ expected `}` in format string
@ -38,7 +38,7 @@ LL | format!(r#"{ \
= note: if you intended to print `{`, you can escape it using `{{`
error: invalid format string: expected `'}'`, found `'\'`
--> $DIR/format-string-error-2.rs:15:18
--> $DIR/format-string-error-2.rs:16:18
|
LL | format!(r#"{ \n
| - ^ expected `}` in format string
@ -48,7 +48,7 @@ LL | format!(r#"{ \n
= note: if you intended to print `{`, you can escape it using `{{`
error: invalid format string: expected `'}'`, found `'e'`
--> $DIR/format-string-error-2.rs:21:5
--> $DIR/format-string-error-2.rs:22:5
|
LL | format!("{ \n
| - because of this opening brace
@ -59,7 +59,7 @@ LL | e");
= note: if you intended to print `{`, you can escape it using `{{`
error: invalid format string: expected `'}'`, found `'a'`
--> $DIR/format-string-error-2.rs:25:5
--> $DIR/format-string-error-2.rs:26:5
|
LL | {
| - because of this opening brace
@ -69,7 +69,7 @@ LL | a");
= note: if you intended to print `{`, you can escape it using `{{`
error: invalid format string: expected `'}'`, found `'a'`
--> $DIR/format-string-error-2.rs:29:5
--> $DIR/format-string-error-2.rs:30:5
|
LL | {
| - because of this opening brace
@ -79,7 +79,7 @@ LL | a
= note: if you intended to print `{`, you can escape it using `{{`
error: invalid format string: expected `'}'`, found `'b'`
--> $DIR/format-string-error-2.rs:35:5
--> $DIR/format-string-error-2.rs:36:5
|
LL | { \
| - because of this opening brace
@ -90,7 +90,7 @@ LL | b");
= note: if you intended to print `{`, you can escape it using `{{`
error: invalid format string: expected `'}'`, found `'b'`
--> $DIR/format-string-error-2.rs:40:5
--> $DIR/format-string-error-2.rs:41:5
|
LL | { \
| - because of this opening brace
@ -101,7 +101,7 @@ LL | b \
= note: if you intended to print `{`, you can escape it using `{{`
error: invalid format string: expected `'}'`, found `'\'`
--> $DIR/format-string-error-2.rs:45:8
--> $DIR/format-string-error-2.rs:46:8
|
LL | raw { \
| - ^ expected `}` in format string
@ -111,7 +111,7 @@ LL | raw { \
= note: if you intended to print `{`, you can escape it using `{{`
error: invalid format string: expected `'}'`, found `'\'`
--> $DIR/format-string-error-2.rs:50:8
--> $DIR/format-string-error-2.rs:51:8
|
LL | raw { \n
| - ^ expected `}` in format string
@ -121,7 +121,7 @@ LL | raw { \n
= note: if you intended to print `{`, you can escape it using `{{`
error: invalid format string: expected `'}'`, found `'e'`
--> $DIR/format-string-error-2.rs:57:5
--> $DIR/format-string-error-2.rs:58:5
|
LL | { \n
| - because of this opening brace
@ -132,7 +132,7 @@ LL | e");
= note: if you intended to print `{`, you can escape it using `{{`
error: invalid format string: expected `'}'`, found `'a'`
--> $DIR/format-string-error-2.rs:67:5
--> $DIR/format-string-error-2.rs:68:5
|
LL | {
| - because of this opening brace
@ -142,13 +142,13 @@ LL | asdf}
= note: if you intended to print `{`, you can escape it using `{{`
error: 1 positional argument in format string, but no arguments were given
--> $DIR/format-string-error-2.rs:70:17
--> $DIR/format-string-error-2.rs:71:17
|
LL | println!("\t{}");
| ^^
error: invalid format string: expected `'}'` but string was terminated
--> $DIR/format-string-error-2.rs:74:27
--> $DIR/format-string-error-2.rs:75:27
|
LL | println!("\x7B}\u{8} {", 1);
| -^ expected `'}'` in format string
@ -157,16 +157,18 @@ LL | println!("\x7B}\u{8} {", 1);
|
= note: if you intended to print `{`, you can escape it using `{{`
error: argument never used
--> $DIR/format-string-error-2.rs:77:28
error: invalid format string: expected `'}'` but string was terminated
--> $DIR/format-string-error-2.rs:78:27
|
LL | println!("\x7B}\u8 {", 1);
| ------------ ^ argument never used
| |
| formatting specifier missing
| -^ expected `'}'` in format string
| |
| because of this opening brace
|
= note: if you intended to print `{`, you can escape it using `{{`
error: invalid format string: unmatched `}` found
--> $DIR/format-string-error-2.rs:82:21
--> $DIR/format-string-error-2.rs:83:21
|
LL | println!(r#"\x7B}\u{8} {"#, 1);
| ^ unmatched `}` in format string
@ -174,7 +176,7 @@ LL | println!(r#"\x7B}\u{8} {"#, 1);
= note: if you intended to print `}`, you can escape it using `}}`
error: invalid format string: unmatched `}` found
--> $DIR/format-string-error-2.rs:85:21
--> $DIR/format-string-error-2.rs:86:21
|
LL | println!(r#"\x7B}\u8 {"#, 1);
| ^ unmatched `}` in format string

View File

@ -1,20 +1,20 @@
error: this form of character escape may only be used with characters in the range [\x00-\x7f]
--> $DIR/ascii-only-character-escape.rs:4:16
--> $DIR/ascii-only-character-escape.rs:4:14
|
LL | let x = "\x80";
| ^^
| ^^^^
error: this form of character escape may only be used with characters in the range [\x00-\x7f]
--> $DIR/ascii-only-character-escape.rs:5:16
--> $DIR/ascii-only-character-escape.rs:5:14
|
LL | let y = "\xff";
| ^^
| ^^^^
error: this form of character escape may only be used with characters in the range [\x00-\x7f]
--> $DIR/ascii-only-character-escape.rs:6:16
--> $DIR/ascii-only-character-escape.rs:6:14
|
LL | let z = "\xe2";
| ^^
| ^^^^
error: aborting due to 3 previous errors

View File

@ -34,11 +34,11 @@ error: byte constant must be ASCII. Use a \xHH escape for a non-ASCII byte
LL | b'é';
| ^
error: unterminated byte constant: b'a
--> $DIR/byte-literals.rs:14:5
error: unterminated byte constant
--> $DIR/byte-literals.rs:14:6
|
LL | b'a
| ^^^
| ^^^^
error: aborting due to 7 previous errors

View File

@ -23,10 +23,10 @@ LL | b"é";
| ^
error: unterminated double quote byte string
--> $DIR/byte-string-literals.rs:9:7
--> $DIR/byte-string-literals.rs:9:6
|
LL | b"a
| _______^
| ______^
LL | | }
| |__^

View File

@ -9,32 +9,27 @@ fn main() {
let _ = b'\u';
//~^ ERROR incorrect unicode escape sequence
//~^^ ERROR unicode escape sequences cannot be used as a byte or in a byte string
let _ = b'\x5';
//~^ ERROR numeric character escape is too short
let _ = b'\xxy';
//~^ ERROR invalid character in numeric character escape: x
//~^^ ERROR invalid character in numeric character escape: y
let _ = '\x5';
//~^ ERROR numeric character escape is too short
let _ = '\xxy';
//~^ ERROR invalid character in numeric character escape: x
//~^^ ERROR invalid character in numeric character escape: y
let _ = b"\u{a4a4} \xf \u";
//~^ ERROR unicode escape sequences cannot be used as a byte or in a byte string
//~^^ ERROR invalid character in numeric character escape:
//~^^^ ERROR incorrect unicode escape sequence
//~^^^^ ERROR unicode escape sequences cannot be used as a byte or in a byte string
let _ = "\xf \u";
//~^ ERROR invalid character in numeric character escape:
//~^^ ERROR form of character escape may only be used with characters in the range [\x00-\x7f]
//~^^^ ERROR incorrect unicode escape sequence
//~^^ ERROR incorrect unicode escape sequence
let _ = "\u8f";
//~^ ERROR incorrect unicode escape sequence

View File

@ -18,88 +18,58 @@ LL | let _ = b'\u';
|
= help: format of unicode escape sequences is `\u{...}`
error: unicode escape sequences cannot be used as a byte or in a byte string
--> $DIR/issue-23620-invalid-escapes.rs:10:15
|
LL | let _ = b'\u';
| ^^
error: numeric character escape is too short
--> $DIR/issue-23620-invalid-escapes.rs:14:17
--> $DIR/issue-23620-invalid-escapes.rs:13:15
|
LL | let _ = b'\x5';
| ^
| ^^^
error: invalid character in numeric character escape: x
--> $DIR/issue-23620-invalid-escapes.rs:17:17
--> $DIR/issue-23620-invalid-escapes.rs:16:17
|
LL | let _ = b'\xxy';
| ^
error: invalid character in numeric character escape: y
--> $DIR/issue-23620-invalid-escapes.rs:17:18
|
LL | let _ = b'\xxy';
| ^
error: numeric character escape is too short
--> $DIR/issue-23620-invalid-escapes.rs:21:16
--> $DIR/issue-23620-invalid-escapes.rs:19:14
|
LL | let _ = '\x5';
| ^
| ^^^
error: invalid character in numeric character escape: x
--> $DIR/issue-23620-invalid-escapes.rs:24:16
--> $DIR/issue-23620-invalid-escapes.rs:22:16
|
LL | let _ = '\xxy';
| ^
error: invalid character in numeric character escape: y
--> $DIR/issue-23620-invalid-escapes.rs:24:17
|
LL | let _ = '\xxy';
| ^
error: unicode escape sequences cannot be used as a byte or in a byte string
--> $DIR/issue-23620-invalid-escapes.rs:28:15
--> $DIR/issue-23620-invalid-escapes.rs:25:15
|
LL | let _ = b"\u{a4a4} \xf \u";
| ^^^^^^^^
error: invalid character in numeric character escape:
--> $DIR/issue-23620-invalid-escapes.rs:28:27
--> $DIR/issue-23620-invalid-escapes.rs:25:27
|
LL | let _ = b"\u{a4a4} \xf \u";
| ^
error: incorrect unicode escape sequence
--> $DIR/issue-23620-invalid-escapes.rs:28:28
--> $DIR/issue-23620-invalid-escapes.rs:25:28
|
LL | let _ = b"\u{a4a4} \xf \u";
| ^^ incorrect unicode escape sequence
|
= help: format of unicode escape sequences is `\u{...}`
error: unicode escape sequences cannot be used as a byte or in a byte string
--> $DIR/issue-23620-invalid-escapes.rs:28:28
|
LL | let _ = b"\u{a4a4} \xf \u";
| ^^
error: invalid character in numeric character escape:
--> $DIR/issue-23620-invalid-escapes.rs:34:17
--> $DIR/issue-23620-invalid-escapes.rs:30:17
|
LL | let _ = "\xf \u";
| ^
error: this form of character escape may only be used with characters in the range [\x00-\x7f]
--> $DIR/issue-23620-invalid-escapes.rs:34:16
|
LL | let _ = "\xf \u";
| ^^
error: incorrect unicode escape sequence
--> $DIR/issue-23620-invalid-escapes.rs:34:18
--> $DIR/issue-23620-invalid-escapes.rs:30:18
|
LL | let _ = "\xf \u";
| ^^ incorrect unicode escape sequence
@ -107,12 +77,12 @@ LL | let _ = "\xf \u";
= help: format of unicode escape sequences is `\u{...}`
error: incorrect unicode escape sequence
--> $DIR/issue-23620-invalid-escapes.rs:39:14
--> $DIR/issue-23620-invalid-escapes.rs:34:14
|
LL | let _ = "\u8f";
| ^^--
| |
| help: format of unicode escape sequences uses braces: `\u{8f}`
| |
| help: format of unicode escape sequences uses braces: `\u{8f}`
error: aborting due to 18 previous errors
error: aborting due to 13 previous errors

View File

@ -1,14 +1,14 @@
error: numeric character escape is too short
--> $DIR/lex-bad-char-literals-1.rs:3:8
--> $DIR/lex-bad-char-literals-1.rs:3:6
|
LL | '\x1'
| ^
| ^^^
error: numeric character escape is too short
--> $DIR/lex-bad-char-literals-1.rs:7:8
--> $DIR/lex-bad-char-literals-1.rs:7:6
|
LL | "\x1"
| ^
| ^^^
error: unknown character escape: \u{25cf}
--> $DIR/lex-bad-char-literals-1.rs:11:7

View File

@ -3,6 +3,10 @@ error: character literal may only contain one codepoint
|
LL | 'nope'
| ^^^^^^
help: if you meant to write a `str` literal, use double quotes
|
LL | "nope"
| ^^^^^^
error[E0601]: `main` function not found in crate `lex_bad_char_literals_2`
|

View File

@ -1,5 +1,5 @@
//
// This test needs to be the last one appearing in this file as it kills the parser
static c: char =
' //~ ERROR: character literal may only contain one codepoint
' //~ ERROR: unterminated character literal
;

View File

@ -1,8 +1,8 @@
error: character literal may only contain one codepoint: '●
error: unterminated character literal
--> $DIR/lex-bad-char-literals-4.rs:4:5
|
LL | '●
| ^^
| ^^^^
error: aborting due to previous error

View File

@ -3,18 +3,30 @@ error: character literal may only contain one codepoint
|
LL | let x: &str = 'ab';
| ^^^^
help: if you meant to write a `str` literal, use double quotes
|
LL | let x: &str = "ab";
| ^^^^
error: character literal may only contain one codepoint
--> $DIR/lex-bad-char-literals-6.rs:4:19
|
LL | let y: char = 'cd';
| ^^^^
help: if you meant to write a `str` literal, use double quotes
|
LL | let y: char = "cd";
| ^^^^
error: character literal may only contain one codepoint
--> $DIR/lex-bad-char-literals-6.rs:6:13
|
LL | let z = 'ef';
| ^^^^
help: if you meant to write a `str` literal, use double quotes
|
LL | let z = "ef";
| ^^^^
error[E0277]: can't compare `&str` with `char`
--> $DIR/lex-bad-char-literals-6.rs:9:10

View File

@ -0,0 +1,14 @@
// compile-flags: -Z continue-parse-after-error
fn main() {
let _: char = '';
//~^ ERROR: empty character literal
let _: char = '\u{}';
//~^ ERROR: empty unicode escape (must have at least 1 hex digit)
// Next two are OK, but may befool error recovery
let _ = '/';
let _ = b'/';
let _ = ' hello // here's a comment
//~^ ERROR: unterminated character literal
}

View File

@ -0,0 +1,20 @@
error: empty character literal
--> $DIR/lex-bad-char-literals-7.rs:3:20
|
LL | let _: char = '';
| ^
error: empty unicode escape (must have at least 1 hex digit)
--> $DIR/lex-bad-char-literals-7.rs:5:20
|
LL | let _: char = '\u{}';
| ^^^^
error: unterminated character literal
--> $DIR/lex-bad-char-literals-7.rs:12:13
|
LL | let _ = ' hello // here's a comment
| ^^^^^^^^
error: aborting due to 3 previous errors

View File

@ -0,0 +1,10 @@
macro_rules! black_hole {
($($tt:tt)*) => {}
}
fn main() {
black_hole! { '\u{FFFFFF}' }
//~^ ERROR: invalid unicode character escape
black_hole! { "this is surrogate: \u{DAAA}" }
//~^ ERROR: invalid unicode character escape
}

View File

@ -0,0 +1,18 @@
error: invalid unicode character escape
--> $DIR/literals-are-validated-before-expansion.rs:6:20
|
LL | black_hole! { '\u{FFFFFF}' }
| ^^^^^^^^^^
|
= help: unicode escape must be at most 10FFFF
error: invalid unicode character escape
--> $DIR/literals-are-validated-before-expansion.rs:8:39
|
LL | black_hole! { "this is surrogate: \u{DAAA}" }
| ^^^^^^^^
|
= help: unicode escape must not be a surrogate
error: aborting due to 2 previous errors

View File

@ -1,8 +1,8 @@
error: unterminated unicode escape (needed a `}`)
--> $DIR/new-unicode-escapes-1.rs:2:21
--> $DIR/new-unicode-escapes-1.rs:2:14
|
LL | let s = "\u{2603";
| ^
| ^^^^^^^
error: aborting due to previous error

View File

@ -1,8 +1,8 @@
error: overlong unicode escape (must have at most 6 hex digits)
--> $DIR/new-unicode-escapes-2.rs:2:17
--> $DIR/new-unicode-escapes-2.rs:2:14
|
LL | let s = "\u{260311111111}";
| ^^^^^^^^^^^^
| ^^^^^^^^^^^^^^^^
error: aborting due to previous error

View File

@ -1,16 +1,16 @@
error: invalid unicode character escape
--> $DIR/new-unicode-escapes-3.rs:2:14
--> $DIR/new-unicode-escapes-3.rs:2:15
|
LL | let s1 = "\u{d805}";
| ^^^^^^^^^^
| ^^^^^^^^
|
= help: unicode escape must not be a surrogate
error: invalid unicode character escape
--> $DIR/new-unicode-escapes-3.rs:3:14
--> $DIR/new-unicode-escapes-3.rs:3:15
|
LL | let s2 = "\u{ffffff}";
| ^^^^^^^^^^^^
| ^^^^^^^^^^
|
= help: unicode escape must be at most 10FFFF