diff --git a/src/libcore/str.rs b/src/libcore/str.rs index c01997f1c42..84ffb7fb20e 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -560,6 +560,8 @@ Section: Comparing strings // share the implementation of the lang-item vs. non-lang-item // eq_slice. +/// NOTE: This function is (ab)used in rustc::middle::trans::_match +/// to compare &[u8] byte slices that are not necessarily valid UTF-8. #[inline] fn eq_slice_(a: &str, b: &str) -> bool { #[allow(ctypes)] @@ -572,6 +574,8 @@ fn eq_slice_(a: &str, b: &str) -> bool { } /// Bytewise slice equality +/// NOTE: This function is (ab)used in rustc::middle::trans::_match +/// to compare &[u8] byte slices that are not necessarily valid UTF-8. #[cfg(not(test))] #[lang="str_eq"] #[inline] diff --git a/src/libregex_macros/lib.rs b/src/libregex_macros/lib.rs index 8641936cc34..ff5cada05ea 100644 --- a/src/libregex_macros/lib.rs +++ b/src/libregex_macros/lib.rs @@ -182,7 +182,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, #[allow(unused_variable)] fn run(&mut self, start: uint, end: uint) -> Vec> { let mut matched = false; - let prefix_bytes: &[u8] = &$prefix_bytes; + let prefix_bytes: &[u8] = $prefix_bytes; let mut clist = &mut Threads::new(self.which); let mut nlist = &mut Threads::new(self.which); diff --git a/src/librustc/middle/const_eval.rs b/src/librustc/middle/const_eval.rs index 3c5b0664f03..72def2c10da 100644 --- a/src/librustc/middle/const_eval.rs +++ b/src/librustc/middle/const_eval.rs @@ -529,6 +529,7 @@ pub fn compare_const_vals(a: &const_val, b: &const_val) -> Option { (&const_float(a), &const_float(b)) => compare_vals(a, b), (&const_str(ref a), &const_str(ref b)) => compare_vals(a, b), (&const_bool(a), &const_bool(b)) => compare_vals(a, b), + (&const_binary(ref a), &const_binary(ref b)) => compare_vals(a, b), _ => None } } diff --git a/src/librustc/middle/trans/_match.rs b/src/librustc/middle/trans/_match.rs index 9361d64250c..808d894be43 100644 --- a/src/librustc/middle/trans/_match.rs +++ b/src/librustc/middle/trans/_match.rs @@ -1273,13 +1273,24 @@ fn compare_values<'a>( val: bool_to_i1(result.bcx, result.val) } } - _ => cx.sess().bug("only scalars and strings supported in compare_values"), + _ => cx.sess().bug("only strings supported in compare_values"), }, ty::ty_rptr(_, mt) => match ty::get(mt.ty).sty { ty::ty_str => compare_str(cx, lhs, rhs, rhs_t), - _ => cx.sess().bug("only scalars and strings supported in compare_values"), + ty::ty_vec(mt, _) => match ty::get(mt.ty).sty { + ty::ty_uint(ast::TyU8) => { + // NOTE: cast &[u8] to &str and abuse the str_eq lang item, + // which calls memcmp(). + let t = ty::mk_str_slice(cx.tcx(), ty::ReStatic, ast::MutImmutable); + let lhs = BitCast(cx, lhs, type_of::type_of(cx.ccx(), t).ptr_to()); + let rhs = BitCast(cx, rhs, type_of::type_of(cx.ccx(), t).ptr_to()); + compare_str(cx, lhs, rhs, rhs_t) + }, + _ => cx.sess().bug("only byte strings supported in compare_values"), + }, + _ => cx.sess().bug("on string and byte strings supported in compare_values"), }, - _ => cx.sess().bug("only scalars and strings supported in compare_values"), + _ => cx.sess().bug("only scalars, byte strings, and strings supported in compare_values"), } } diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index 8a63b55afed..172a1be7b4e 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -140,7 +140,8 @@ fn doit(sess: &parse::ParseSess, mut lexer: lexer::StringReader, } // text literals - t::LIT_BYTE(..) | t::LIT_CHAR(..) | t::LIT_STR(..) | t::LIT_STR_RAW(..) => "string", + t::LIT_BYTE(..) | t::LIT_BINARY(..) | + t::LIT_CHAR(..) | t::LIT_STR(..) | t::LIT_STR_RAW(..) => "string", // number literals t::LIT_INT(..) | t::LIT_UINT(..) | t::LIT_INT_UNSUFFIXED(..) | diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 7e4cb195cea..59bcf059fcd 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -654,7 +654,8 @@ impl<'a> StringReader<'a> { // Note: r as in r" or r#" is part of a raw string literal, // b as in b' is part of a byte literal. // They are not identifiers, and are handled further down. - ('r', Some('"')) | ('r', Some('#')) | ('b', Some('\'')) => false, + ('r', Some('"')) | ('r', Some('#')) | + ('b', Some('"')) | ('b', Some('\'')) => false, _ => true } { let start = self.last_pos; @@ -859,62 +860,124 @@ impl<'a> StringReader<'a> { } 'b' => { self.bump(); - assert!(self.curr_is('\''), "Should have been a token::IDENT"); - self.bump(); - let start = self.last_pos; + return match self.curr { + Some('\'') => parse_byte(self), + Some('"') => parse_byte_string(self), + _ => unreachable!() // Should have been a token::IDENT above. + }; - // the eof will be picked up by the final `'` check below - let mut c2 = self.curr.unwrap_or('\x00'); - self.bump(); + fn parse_byte(self_: &mut StringReader) -> token::Token { + self_.bump(); + let start = self_.last_pos; - match c2 { - '\\' => { - // '\X' for some X must be a character constant: - let escaped = self.curr; - let escaped_pos = self.last_pos; - self.bump(); - match escaped { - None => {} - Some(e) => { - c2 = match e { - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - '\\' => '\\', - '\'' => '\'', - '"' => '"', - '0' => '\x00', - 'x' => self.scan_numeric_escape(2u, '\''), - c2 => { - self.err_span_char(escaped_pos, self.last_pos, - "unknown byte escape", c2); - c2 + // the eof will be picked up by the final `'` check below + let mut c2 = self_.curr.unwrap_or('\x00'); + self_.bump(); + + match c2 { + '\\' => { + // '\X' for some X must be a character constant: + let escaped = self_.curr; + let escaped_pos = self_.last_pos; + self_.bump(); + match escaped { + None => {} + Some(e) => { + c2 = match e { + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + '\\' => '\\', + '\'' => '\'', + '"' => '"', + '0' => '\x00', + 'x' => self_.scan_numeric_escape(2u, '\''), + c2 => { + self_.err_span_char( + escaped_pos, self_.last_pos, + "unknown byte escape", c2); + c2 + } } } } } + '\t' | '\n' | '\r' | '\'' => { + self_.err_span_char( start, self_.last_pos, + "byte constant must be escaped", c2); + } + _ => if c2 > '\x7F' { + self_.err_span_char( start, self_.last_pos, + "byte constant must be ASCII. \ + Use a \\xHH escape for a non-ASCII byte", c2); + } } - '\t' | '\n' | '\r' | '\'' => { - self.err_span_char( start, self.last_pos, - "byte constant must be escaped", c2); + if !self_.curr_is('\'') { + // Byte offsetting here is okay because the + // character before position `start` are an + // ascii single quote and ascii 'b'. + self_.fatal_span_verbose( + start - BytePos(2), self_.last_pos, + "unterminated byte constant".to_string()); } - _ if c2 > '\x7F' => { - self.err_span_char( start, self.last_pos, - "byte constant must be ASCII. \ - Use a \\xHH escape for a non-ASCII byte", c2); - } - _ => {} + self_.bump(); // advance curr past token + return token::LIT_BYTE(c2 as u8); } - if !self.curr_is('\'') { - self.fatal_span_verbose( - // Byte offsetting here is okay because the - // character before position `start` are an - // ascii single quote and ascii 'b'. - start - BytePos(2), self.last_pos, - "unterminated byte constant".to_string()); + + fn parse_byte_string(self_: &mut StringReader) -> token::Token { + self_.bump(); + let start = self_.last_pos; + let mut value = Vec::new(); + while !self_.curr_is('"') { + if self_.is_eof() { + self_.fatal_span(start, self_.last_pos, + "unterminated double quote byte string"); + } + + let ch = self_.curr.unwrap(); + self_.bump(); + match ch { + '\\' => { + if self_.is_eof() { + self_.fatal_span(start, self_.last_pos, + "unterminated double quote byte string"); + } + + let escaped = self_.curr.unwrap(); + let escaped_pos = self_.last_pos; + self_.bump(); + match escaped { + 'n' => value.push('\n' as u8), + 'r' => value.push('\r' as u8), + 't' => value.push('\t' as u8), + '\\' => value.push('\\' as u8), + '\'' => value.push('\'' as u8), + '"' => value.push('"' as u8), + '\n' => self_.consume_whitespace(), + '0' => value.push(0), + 'x' => { + value.push(self_.scan_numeric_escape(2u, '"') as u8); + } + c2 => { + self_.err_span_char(escaped_pos, self_.last_pos, + "unknown byte string escape", c2); + } + } + } + _ => { + if ch <= '\x7F' { + value.push(ch as u8) + } else { + self_.err_span_char(self_.last_pos, self_.last_pos, + "byte string must be ASCII. \ + Use a \\xHH escape for a non-ASCII byte", ch); + } + } + } + } + self_.bump(); + return token::LIT_BINARY(Rc::new(value)); } - self.bump(); // advance curr past token - return token::LIT_BYTE(c2 as u8); } '"' => { let mut accum_str = String::new(); diff --git a/src/libsyntax/parse/parser.rs b/src/libsyntax/parse/parser.rs index 0bd47ede214..826d28ef3ff 100644 --- a/src/libsyntax/parse/parser.rs +++ b/src/libsyntax/parse/parser.rs @@ -33,7 +33,7 @@ use ast::{ForeignItem, ForeignItemStatic, ForeignItemFn, ForeignMod}; use ast::{Ident, NormalFn, Inherited, Item, Item_, ItemStatic}; use ast::{ItemEnum, ItemFn, ItemForeignMod, ItemImpl}; use ast::{ItemMac, ItemMod, ItemStruct, ItemTrait, ItemTy, Lit, Lit_}; -use ast::{LitBool, LitFloat, LitFloatUnsuffixed, LitInt, LitChar, LitByte}; +use ast::{LitBool, LitFloat, LitFloatUnsuffixed, LitInt, LitChar, LitByte, LitBinary}; use ast::{LitIntUnsuffixed, LitNil, LitStr, LitUint, Local, LocalLet}; use ast::{MutImmutable, MutMutable, Mac_, MacInvocTT, Matcher, MatchNonterminal}; use ast::{MatchSeq, MatchTok, Method, MutTy, BiMul, Mutability}; @@ -1529,6 +1529,7 @@ impl<'a> Parser<'a> { token::LIT_STR_RAW(s, n) => { LitStr(self.id_to_interned_str(s), ast::RawStr(n)) } + token::LIT_BINARY(ref v) => LitBinary(v.clone()), token::LPAREN => { self.expect(&token::RPAREN); LitNil }, _ => { self.unexpected_last(tok); } } diff --git a/src/libsyntax/parse/token.rs b/src/libsyntax/parse/token.rs index b8f13624a32..b76dcaf0b94 100644 --- a/src/libsyntax/parse/token.rs +++ b/src/libsyntax/parse/token.rs @@ -87,6 +87,7 @@ pub enum Token { LIT_FLOAT_UNSUFFIXED(ast::Ident), LIT_STR(ast::Ident), LIT_STR_RAW(ast::Ident, uint), /* raw str delimited by n hash symbols */ + LIT_BINARY(Rc>), /* Name components */ // an identifier contains an "is_mod_name" boolean, @@ -231,17 +232,22 @@ pub fn to_str(t: &Token) -> String { body } LIT_STR(s) => { - (format!("\"{}\"", get_ident(s).get().escape_default())).to_string() + format!("\"{}\"", get_ident(s).get().escape_default()) } LIT_STR_RAW(s, n) => { - (format!("r{delim}\"{string}\"{delim}", - delim="#".repeat(n), string=get_ident(s))).to_string() + format!("r{delim}\"{string}\"{delim}", + delim="#".repeat(n), string=get_ident(s)) + } + LIT_BINARY(ref v) => { + format!( + "b\"{}\"", + v.iter().map(|&b| b as char).collect::().escape_default()) } /* Name components */ IDENT(s, _) => get_ident(s).get().to_string(), LIFETIME(s) => { - (format!("{}", get_ident(s))).to_string() + format!("{}", get_ident(s)) } UNDERSCORE => "_".to_string(), @@ -291,6 +297,7 @@ pub fn can_begin_expr(t: &Token) -> bool { LIT_FLOAT_UNSUFFIXED(_) => true, LIT_STR(_) => true, LIT_STR_RAW(_, _) => true, + LIT_BINARY(_) => true, POUND => true, AT => true, NOT => true, @@ -330,6 +337,7 @@ pub fn is_lit(t: &Token) -> bool { LIT_FLOAT_UNSUFFIXED(_) => true, LIT_STR(_) => true, LIT_STR_RAW(_, _) => true, + LIT_BINARY(_) => true, _ => false } } diff --git a/src/libsyntax/print/pprust.rs b/src/libsyntax/print/pprust.rs index 6ea2eed293e..fafebd3c5dc 100644 --- a/src/libsyntax/print/pprust.rs +++ b/src/libsyntax/print/pprust.rs @@ -2342,19 +2342,9 @@ impl<'a> State<'a> { ast::LitBool(val) => { if val { word(&mut self.s, "true") } else { word(&mut self.s, "false") } } - ast::LitBinary(ref arr) => { - try!(self.ibox(indent_unit)); - try!(word(&mut self.s, "[")); - try!(self.commasep_cmnt(Inconsistent, - arr.as_slice(), - |s, u| { - word(&mut s.s, - format!("{}", - *u).as_slice()) - }, - |_| lit.span)); - try!(word(&mut self.s, "]")); - self.end() + ast::LitBinary(ref v) => { + let escaped: String = v.iter().map(|&b| b as char).collect(); + word(&mut self.s, format!("b\"{}\"", escaped.escape_default()).as_slice()) } } } diff --git a/src/test/compile-fail/byte-string-literals.rs b/src/test/compile-fail/byte-string-literals.rs new file mode 100644 index 00000000000..ec67cdd77e1 --- /dev/null +++ b/src/test/compile-fail/byte-string-literals.rs @@ -0,0 +1,23 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + + +// ignore-tidy-tab + +static FOO: &'static [u8] = b"\f"; //~ ERROR unknown byte escape + +pub fn main() { + b"\f"; //~ ERROR unknown byte escape + b"\x0Z"; //~ ERROR illegal character in numeric character escape: Z + b"é"; //~ ERROR byte constant must be ASCII + b"a //~ ERROR unterminated double quote byte string +} + + diff --git a/src/test/compile-fail/concat.rs b/src/test/compile-fail/concat.rs index a3dc1174424..dc31126e6d6 100644 --- a/src/test/compile-fail/concat.rs +++ b/src/test/compile-fail/concat.rs @@ -10,6 +10,7 @@ fn main() { concat!(b'f'); //~ ERROR: cannot concatenate a binary literal + concat!(b"foo"); //~ ERROR: cannot concatenate a binary literal concat!(foo); //~ ERROR: expected a literal concat!(foo()); //~ ERROR: expected a literal } diff --git a/src/test/run-pass/byte-literals.rs b/src/test/run-pass/byte-literals.rs index 560b2f0337a..58df7dc8efd 100644 --- a/src/test/run-pass/byte-literals.rs +++ b/src/test/run-pass/byte-literals.rs @@ -10,6 +10,7 @@ static FOO: u8 = b'\xF0'; +static BAR: &'static [u8] = b"a\xF0\t"; pub fn main() { assert_eq!(b'a', 97u8); @@ -35,4 +36,15 @@ pub fn main() { b'a' .. b'z' => {}, _ => fail!() } + + assert_eq!(b"a\n\r\t\\\'\"\0\xF0", + &[97u8, 10u8, 13u8, 9u8, 92u8, 39u8, 34u8, 0u8, 240u8]); + assert_eq!(b"a\ + b", &[97u8, 98u8]); + assert_eq!(BAR, &[97u8, 240u8, 9u8]); + + match &[97u8, 10u8] { + b"a\n" => {}, + _ => fail!(), + } }