Auto merge of #36377 - tormol:encode_utf, r=alexcrichton

Change encode_utf{8,16}() to write to a buffer and panic if it's too small

cc #27784

Should the "A buffer that's too small" examples be removed and replaced by tests?
This commit is contained in:
bors 2016-09-29 11:20:02 -07:00 committed by GitHub
commit 289f3a4ca7
9 changed files with 163 additions and 188 deletions

View File

@ -975,7 +975,7 @@ impl String {
pub fn push(&mut self, ch: char) { pub fn push(&mut self, ch: char) {
match ch.len_utf8() { match ch.len_utf8() {
1 => self.vec.push(ch as u8), 1 => self.vec.push(ch as u8),
_ => self.vec.extend_from_slice(ch.encode_utf8().as_slice()), _ => self.vec.extend_from_slice(ch.encode_utf8(&mut [0;4]).as_bytes()),
} }
} }
@ -1131,10 +1131,11 @@ impl String {
let len = self.len(); let len = self.len();
assert!(idx <= len); assert!(idx <= len);
assert!(self.is_char_boundary(idx)); assert!(self.is_char_boundary(idx));
let bits = ch.encode_utf8(); let mut bits = [0; 4];
let bits = ch.encode_utf8(&mut bits).as_bytes();
unsafe { unsafe {
self.insert_bytes(idx, bits.as_slice()); self.insert_bytes(idx, bits);
} }
} }

View File

@ -786,9 +786,9 @@ fn test_rev_iterator() {
#[test] #[test]
fn test_chars_decoding() { fn test_chars_decoding() {
let mut bytes = [0; 4];
for c in (0..0x110000).filter_map(::std::char::from_u32) { for c in (0..0x110000).filter_map(::std::char::from_u32) {
let bytes = c.encode_utf8(); let s = c.encode_utf8(&mut bytes);
let s = ::std::str::from_utf8(bytes.as_slice()).unwrap();
if Some(c) != s.chars().next() { if Some(c) != s.chars().next() {
panic!("character {:x}={} does not decode correctly", c as u32, c); panic!("character {:x}={} does not decode correctly", c as u32, c);
} }
@ -797,9 +797,9 @@ fn test_chars_decoding() {
#[test] #[test]
fn test_chars_rev_decoding() { fn test_chars_rev_decoding() {
let mut bytes = [0; 4];
for c in (0..0x110000).filter_map(::std::char::from_u32) { for c in (0..0x110000).filter_map(::std::char::from_u32) {
let bytes = c.encode_utf8(); let s = c.encode_utf8(&mut bytes);
let s = ::std::str::from_utf8(bytes.as_slice()).unwrap();
if Some(c) != s.chars().rev().next() { if Some(c) != s.chars().rev().next() {
panic!("character {:x}={} does not decode correctly", c as u32, c); panic!("character {:x}={} does not decode correctly", c as u32, c);
} }

View File

@ -18,6 +18,7 @@
use char_private::is_printable; use char_private::is_printable;
use convert::TryFrom; use convert::TryFrom;
use fmt; use fmt;
use slice;
use iter::FusedIterator; use iter::FusedIterator;
use mem::transmute; use mem::transmute;
@ -327,9 +328,9 @@ pub trait CharExt {
#[stable(feature = "core", since = "1.6.0")] #[stable(feature = "core", since = "1.6.0")]
fn len_utf16(self) -> usize; fn len_utf16(self) -> usize;
#[unstable(feature = "unicode", issue = "27784")] #[unstable(feature = "unicode", issue = "27784")]
fn encode_utf8(self) -> EncodeUtf8; fn encode_utf8(self, dst: &mut [u8]) -> &mut str;
#[unstable(feature = "unicode", issue = "27784")] #[unstable(feature = "unicode", issue = "27784")]
fn encode_utf16(self) -> EncodeUtf16; fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16];
} }
#[stable(feature = "core", since = "1.6.0")] #[stable(feature = "core", since = "1.6.0")]
@ -419,47 +420,59 @@ impl CharExt for char {
} }
#[inline] #[inline]
fn encode_utf8(self) -> EncodeUtf8 { fn encode_utf8(self, dst: &mut [u8]) -> &mut str {
let code = self as u32; let code = self as u32;
let mut buf = [0; 4]; unsafe {
let pos = if code < MAX_ONE_B { let len =
buf[3] = code as u8; if code < MAX_ONE_B && !dst.is_empty() {
3 *dst.get_unchecked_mut(0) = code as u8;
} else if code < MAX_TWO_B { 1
buf[2] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; } else if code < MAX_TWO_B && dst.len() >= 2 {
buf[3] = (code & 0x3F) as u8 | TAG_CONT; *dst.get_unchecked_mut(0) = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
2 *dst.get_unchecked_mut(1) = (code & 0x3F) as u8 | TAG_CONT;
} else if code < MAX_THREE_B { 2
buf[1] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; } else if code < MAX_THREE_B && dst.len() >= 3 {
buf[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT; *dst.get_unchecked_mut(0) = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
buf[3] = (code & 0x3F) as u8 | TAG_CONT; *dst.get_unchecked_mut(1) = (code >> 6 & 0x3F) as u8 | TAG_CONT;
1 *dst.get_unchecked_mut(2) = (code & 0x3F) as u8 | TAG_CONT;
} else { 3
buf[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; } else if dst.len() >= 4 {
buf[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT; *dst.get_unchecked_mut(0) = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
buf[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT; *dst.get_unchecked_mut(1) = (code >> 12 & 0x3F) as u8 | TAG_CONT;
buf[3] = (code & 0x3F) as u8 | TAG_CONT; *dst.get_unchecked_mut(2) = (code >> 6 & 0x3F) as u8 | TAG_CONT;
0 *dst.get_unchecked_mut(3) = (code & 0x3F) as u8 | TAG_CONT;
}; 4
EncodeUtf8 { buf: buf, pos: pos } } else {
panic!("encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}",
from_u32_unchecked(code).len_utf8(),
code,
dst.len())
};
transmute(slice::from_raw_parts_mut(dst.as_mut_ptr(), len))
}
} }
#[inline] #[inline]
fn encode_utf16(self) -> EncodeUtf16 { fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] {
let mut buf = [0; 2];
let mut code = self as u32; let mut code = self as u32;
let pos = if (code & 0xFFFF) == code { unsafe {
// The BMP falls through (assuming non-surrogate, as it should) if (code & 0xFFFF) == code && !dst.is_empty() {
buf[1] = code as u16; // The BMP falls through (assuming non-surrogate, as it should)
1 *dst.get_unchecked_mut(0) = code as u16;
} else { slice::from_raw_parts_mut(dst.as_mut_ptr(), 1)
// Supplementary planes break into surrogates. } else if dst.len() >= 2 {
code -= 0x1_0000; // Supplementary planes break into surrogates.
buf[0] = 0xD800 | ((code >> 10) as u16); code -= 0x1_0000;
buf[1] = 0xDC00 | ((code as u16) & 0x3FF); *dst.get_unchecked_mut(0) = 0xD800 | ((code >> 10) as u16);
0 *dst.get_unchecked_mut(1) = 0xDC00 | ((code as u16) & 0x3FF);
}; slice::from_raw_parts_mut(dst.as_mut_ptr(), 2)
EncodeUtf16 { buf: buf, pos: pos } } else {
panic!("encode_utf16: need {} units to encode U+{:X}, but the buffer has {}",
from_u32_unchecked(code).len_utf16(),
code,
dst.len())
}
}
} }
} }
@ -702,88 +715,7 @@ impl ExactSizeIterator for EscapeDebug { }
#[unstable(feature = "fused", issue = "35602")] #[unstable(feature = "fused", issue = "35602")]
impl FusedIterator for EscapeDebug {} impl FusedIterator for EscapeDebug {}
/// An iterator over `u8` entries representing the UTF-8 encoding of a `char`
/// value.
///
/// Constructed via the `.encode_utf8()` method on `char`.
#[unstable(feature = "unicode", issue = "27784")]
#[derive(Debug)]
pub struct EncodeUtf8 {
buf: [u8; 4],
pos: usize,
}
impl EncodeUtf8 {
/// Returns the remaining bytes of this iterator as a slice.
#[unstable(feature = "unicode", issue = "27784")]
pub fn as_slice(&self) -> &[u8] {
&self.buf[self.pos..]
}
}
#[unstable(feature = "unicode", issue = "27784")]
impl Iterator for EncodeUtf8 {
type Item = u8;
fn next(&mut self) -> Option<u8> {
if self.pos == self.buf.len() {
None
} else {
let ret = Some(self.buf[self.pos]);
self.pos += 1;
ret
}
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.as_slice().iter().size_hint()
}
}
#[unstable(feature = "fused", issue = "35602")]
impl FusedIterator for EncodeUtf8 {}
/// An iterator over `u16` entries representing the UTF-16 encoding of a `char`
/// value.
///
/// Constructed via the `.encode_utf16()` method on `char`.
#[unstable(feature = "unicode", issue = "27784")]
#[derive(Debug)]
pub struct EncodeUtf16 {
buf: [u16; 2],
pos: usize,
}
impl EncodeUtf16 {
/// Returns the remaining bytes of this iterator as a slice.
#[unstable(feature = "unicode", issue = "27784")]
pub fn as_slice(&self) -> &[u16] {
&self.buf[self.pos..]
}
}
#[unstable(feature = "unicode", issue = "27784")]
impl Iterator for EncodeUtf16 {
type Item = u16;
fn next(&mut self) -> Option<u16> {
if self.pos == self.buf.len() {
None
} else {
let ret = Some(self.buf[self.pos]);
self.pos += 1;
ret
}
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.as_slice().iter().size_hint()
}
}
#[unstable(feature = "fused", issue = "35602")]
impl FusedIterator for EncodeUtf16 {}
/// An iterator over an iterator of bytes of the characters the bytes represent /// An iterator over an iterator of bytes of the characters the bytes represent
/// as UTF-8 /// as UTF-8

View File

@ -97,9 +97,7 @@ pub trait Write {
/// This function will return an instance of `Error` on error. /// This function will return an instance of `Error` on error.
#[stable(feature = "fmt_write_char", since = "1.1.0")] #[stable(feature = "fmt_write_char", since = "1.1.0")]
fn write_char(&mut self, c: char) -> Result { fn write_char(&mut self, c: char) -> Result {
self.write_str(unsafe { self.write_str(c.encode_utf8(&mut [0; 4]))
str::from_utf8_unchecked(c.encode_utf8().as_slice())
})
} }
/// Glue for usage of the `write!` macro with implementors of this trait. /// Glue for usage of the `write!` macro with implementors of this trait.
@ -924,9 +922,7 @@ impl<'a> Formatter<'a> {
// Writes the sign if it exists, and then the prefix if it was requested // Writes the sign if it exists, and then the prefix if it was requested
let write_prefix = |f: &mut Formatter| { let write_prefix = |f: &mut Formatter| {
if let Some(c) = sign { if let Some(c) = sign {
f.buf.write_str(unsafe { f.buf.write_str(c.encode_utf8(&mut [0; 4]))?;
str::from_utf8_unchecked(c.encode_utf8().as_slice())
})?;
} }
if prefixed { f.buf.write_str(prefix) } if prefixed { f.buf.write_str(prefix) }
else { Ok(()) } else { Ok(()) }
@ -1032,10 +1028,8 @@ impl<'a> Formatter<'a> {
rt::v1::Alignment::Center => (padding / 2, (padding + 1) / 2), rt::v1::Alignment::Center => (padding / 2, (padding + 1) / 2),
}; };
let fill = self.fill.encode_utf8(); let mut fill = [0; 4];
let fill = unsafe { let fill = self.fill.encode_utf8(&mut fill);
str::from_utf8_unchecked(fill.as_slice())
};
for _ in 0..pre_pad { for _ in 0..pre_pad {
self.buf.write_str(fill)?; self.buf.write_str(fill)?;
@ -1435,9 +1429,7 @@ impl Display for char {
if f.width.is_none() && f.precision.is_none() { if f.width.is_none() && f.precision.is_none() {
f.write_char(*self) f.write_char(*self)
} else { } else {
f.pad(unsafe { f.pad(self.encode_utf8(&mut [0; 4]))
str::from_utf8_unchecked(self.encode_utf8().as_slice())
})
} }
} }
} }

View File

@ -8,7 +8,7 @@
// option. This file may not be copied, modified, or distributed // option. This file may not be copied, modified, or distributed
// except according to those terms. // except according to those terms.
use std::char; use std::{char,str};
use std::convert::TryFrom; use std::convert::TryFrom;
#[test] #[test]
@ -248,10 +248,12 @@ fn test_escape_unicode() {
#[test] #[test]
fn test_encode_utf8() { fn test_encode_utf8() {
fn check(input: char, expect: &[u8]) { fn check(input: char, expect: &[u8]) {
assert_eq!(input.encode_utf8().as_slice(), expect); let mut buf = [0; 4];
for (a, b) in input.encode_utf8().zip(expect) { let ptr = buf.as_ptr();
assert_eq!(a, *b); let s = input.encode_utf8(&mut buf);
} assert_eq!(s.as_ptr() as usize, ptr as usize);
assert!(str::from_utf8(s.as_bytes()).is_ok());
assert_eq!(s.as_bytes(), expect);
} }
check('x', &[0x78]); check('x', &[0x78]);
@ -263,10 +265,11 @@ fn test_encode_utf8() {
#[test] #[test]
fn test_encode_utf16() { fn test_encode_utf16() {
fn check(input: char, expect: &[u16]) { fn check(input: char, expect: &[u16]) {
assert_eq!(input.encode_utf16().as_slice(), expect); let mut buf = [0; 2];
for (a, b) in input.encode_utf16().zip(expect) { let ptr = buf.as_mut_ptr();
assert_eq!(a, *b); let b = input.encode_utf16(&mut buf);
} assert_eq!(b.as_mut_ptr() as usize, ptr as usize);
assert_eq!(b, expect);
} }
check('x', &[0x0078]); check('x', &[0x0078]);

View File

@ -37,7 +37,7 @@ use tables::{conversions, derived_property, general_category, property};
#[stable(feature = "rust1", since = "1.0.0")] #[stable(feature = "rust1", since = "1.0.0")]
pub use core::char::{MAX, from_digit, from_u32, from_u32_unchecked}; pub use core::char::{MAX, from_digit, from_u32, from_u32_unchecked};
#[stable(feature = "rust1", since = "1.0.0")] #[stable(feature = "rust1", since = "1.0.0")]
pub use core::char::{EncodeUtf16, EncodeUtf8, EscapeDebug, EscapeDefault, EscapeUnicode}; pub use core::char::{EscapeDebug, EscapeDefault, EscapeUnicode};
// unstable reexports // unstable reexports
#[unstable(feature = "try_from", issue = "33417")] #[unstable(feature = "try_from", issue = "33417")]
@ -435,50 +435,96 @@ impl char {
C::len_utf16(self) C::len_utf16(self)
} }
/// Returns an iterator over the bytes of this character as UTF-8. /// Encodes this character as UTF-8 into the provided byte buffer,
/// and then returns the subslice of the buffer that contains the encoded character.
/// ///
/// The returned iterator also has an `as_slice()` method to view the /// # Panics
/// encoded bytes as a byte slice. ///
/// Panics if the buffer is not large enough.
/// A buffer of length four is large enough to encode any `char`.
/// ///
/// # Examples /// # Examples
/// ///
/// In both of these examples, 'ß' takes two bytes to encode.
///
/// ``` /// ```
/// #![feature(unicode)] /// #![feature(unicode)]
/// ///
/// let iterator = 'ß'.encode_utf8(); /// let mut b = [0; 2];
/// assert_eq!(iterator.as_slice(), [0xc3, 0x9f]);
/// ///
/// for (i, byte) in iterator.enumerate() { /// let result = 'ß'.encode_utf8(&mut b);
/// println!("byte {}: {:x}", i, byte); ///
/// } /// assert_eq!(result, "ß");
///
/// assert_eq!(result.len(), 2);
/// ``` /// ```
#[unstable(feature = "unicode", issue = "27784")] ///
/// A buffer that's too small:
///
/// ```
/// #![feature(unicode)]
/// use std::thread;
///
/// let result = thread::spawn(|| {
/// let mut b = [0; 1];
///
/// // this panics
/// 'ß'.encode_utf8(&mut b);
/// }).join();
///
/// assert!(result.is_err());
/// ```
#[unstable(feature = "unicode",
reason = "pending decision about Iterator/Writer/Reader",
issue = "27784")]
#[inline] #[inline]
pub fn encode_utf8(self) -> EncodeUtf8 { pub fn encode_utf8(self, dst: &mut [u8]) -> &mut str {
C::encode_utf8(self) C::encode_utf8(self, dst)
} }
/// Returns an iterator over the `u16` entries of this character as UTF-16. /// Encodes this character as UTF-16 into the provided `u16` buffer,
/// and then returns the subslice of the buffer that contains the encoded character.
/// ///
/// The returned iterator also has an `as_slice()` method to view the /// # Panics
/// encoded form as a slice. ///
/// Panics if the buffer is not large enough.
/// A buffer of length 2 is large enough to encode any `char`.
/// ///
/// # Examples /// # Examples
/// ///
/// In both of these examples, '𝕊' takes two `u16`s to encode.
///
/// ``` /// ```
/// #![feature(unicode)] /// #![feature(unicode)]
/// ///
/// let iterator = '𝕊'.encode_utf16(); /// let mut b = [0; 2];
/// assert_eq!(iterator.as_slice(), [0xd835, 0xdd4a]);
/// ///
/// for (i, val) in iterator.enumerate() { /// let result = '𝕊'.encode_utf16(&mut b);
/// println!("entry {}: {:x}", i, val); ///
/// } /// assert_eq!(result.len(), 2);
/// ``` /// ```
#[unstable(feature = "unicode", issue = "27784")] ///
/// A buffer that's too small:
///
/// ```
/// #![feature(unicode)]
/// use std::thread;
///
/// let result = thread::spawn(|| {
/// let mut b = [0; 1];
///
/// // this panics
/// '𝕊'.encode_utf16(&mut b);
/// }).join();
///
/// assert!(result.is_err());
/// ```
#[unstable(feature = "unicode",
reason = "pending decision about Iterator/Writer/Reader",
issue = "27784")]
#[inline] #[inline]
pub fn encode_utf16(self) -> EncodeUtf16 { pub fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] {
C::encode_utf16(self) C::encode_utf16(self, dst)
} }
/// Returns true if this `char` is an alphabetic code point, and false if not. /// Returns true if this `char` is an alphabetic code point, and false if not.

View File

@ -157,13 +157,13 @@ impl<I> Iterator for Utf16Encoder<I>
return Some(tmp); return Some(tmp);
} }
let mut buf = [0; 2];
self.chars.next().map(|ch| { self.chars.next().map(|ch| {
let n = CharExt::encode_utf16(ch); let n = CharExt::encode_utf16(ch, &mut buf).len();
let n = n.as_slice(); if n == 2 {
if n.len() == 2 { self.extra = buf[1];
self.extra = n[1];
} }
n[0] buf[0]
}) })
} }

View File

@ -433,9 +433,7 @@ fn escape_str(wr: &mut fmt::Write, v: &str) -> EncodeResult {
} }
fn escape_char(writer: &mut fmt::Write, v: char) -> EncodeResult { fn escape_char(writer: &mut fmt::Write, v: char) -> EncodeResult {
escape_str(writer, unsafe { escape_str(writer, v.encode_utf8(&mut [0; 4]))
str::from_utf8_unchecked(v.encode_utf8().as_slice())
})
} }
fn spaces(wr: &mut fmt::Write, mut n: usize) -> EncodeResult { fn spaces(wr: &mut fmt::Write, mut n: usize) -> EncodeResult {

View File

@ -206,10 +206,12 @@ impl Wtf8Buf {
/// Copied from String::push /// Copied from String::push
/// This does **not** include the WTF-8 concatenation check. /// This does **not** include the WTF-8 concatenation check.
fn push_code_point_unchecked(&mut self, code_point: CodePoint) { fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
let bytes = unsafe { let c = unsafe {
char::from_u32_unchecked(code_point.value).encode_utf8() char::from_u32_unchecked(code_point.value)
}; };
self.bytes.extend_from_slice(bytes.as_slice()); let mut bytes = [0; 4];
let bytes = c.encode_utf8(&mut bytes).as_bytes();
self.bytes.extend_from_slice(bytes)
} }
#[inline] #[inline]
@ -738,15 +740,16 @@ impl<'a> Iterator for EncodeWide<'a> {
return Some(tmp); return Some(tmp);
} }
let mut buf = [0; 2];
self.code_points.next().map(|code_point| { self.code_points.next().map(|code_point| {
let n = unsafe { let c = unsafe {
char::from_u32_unchecked(code_point.value).encode_utf16() char::from_u32_unchecked(code_point.value)
}; };
let n = n.as_slice(); let n = c.encode_utf16(&mut buf).len();
if n.len() == 2 { if n == 2 {
self.extra = n[1]; self.extra = buf[1];
} }
n[0] buf[0]
}) })
} }