From 48d5fe9ec560b53b1f5069219b0d62015e1de5ba Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Fri, 11 Mar 2016 11:01:46 -0800 Subject: [PATCH] std: Change `encode_utf{8,16}` to return iterators Currently these have non-traditional APIs which take a buffer and report how much was filled in, but they're not necessarily ergonomic to use. Returning an iterator which *also* exposes an underlying slice shouldn't result in any performance loss as it's just a lazy version of the same implementation, and it's also much more ergonomic! cc #27784 --- src/libcollections/string.rs | 25 +---- src/libcollectionstest/str.rs | 10 +- src/libcore/char.rs | 189 +++++++++++++++++++++------------- src/libcore/fmt/mod.rs | 27 +++-- src/libcoretest/char.rs | 14 +-- src/librustc_unicode/char.rs | 82 +++++---------- src/librustc_unicode/lib.rs | 1 + src/librustc_unicode/u_str.rs | 10 +- src/libserialize/json.rs | 7 +- src/libstd/sys/common/wtf8.rs | 31 +++--- 10 files changed, 195 insertions(+), 201 deletions(-) diff --git a/src/libcollections/string.rs b/src/libcollections/string.rs index 98225dd3dda..d2432a4b032 100644 --- a/src/libcollections/string.rs +++ b/src/libcollections/string.rs @@ -61,7 +61,6 @@ use core::iter::FromIterator; use core::mem; use core::ops::{self, Add, Index, IndexMut}; use core::ptr; -use core::slice; use core::str::pattern::Pattern; use rustc_unicode::char::{decode_utf16, REPLACEMENT_CHARACTER}; use rustc_unicode::str as unicode_str; @@ -970,22 +969,7 @@ impl String { pub fn push(&mut self, ch: char) { match ch.len_utf8() { 1 => self.vec.push(ch as u8), - ch_len => { - let cur_len = self.len(); - // This may use up to 4 bytes. - self.vec.reserve(ch_len); - - unsafe { - // Attempt to not use an intermediate buffer by just pushing bytes - // directly onto this string. 
- let slice = slice::from_raw_parts_mut(self.vec - .as_mut_ptr() - .offset(cur_len as isize), - ch_len); - let used = ch.encode_utf8(slice).unwrap_or(0); - self.vec.set_len(cur_len + used); - } - } + _ => self.vec.extend_from_slice(ch.encode_utf8().as_slice()), } } @@ -1136,9 +1120,10 @@ impl String { let len = self.len(); assert!(idx <= len); assert!(self.is_char_boundary(idx)); - self.vec.reserve(4); - let mut bits = [0; 4]; - let amt = ch.encode_utf8(&mut bits).unwrap(); + let bits = ch.encode_utf8(); + let bits = bits.as_slice(); + let amt = bits.len(); + self.vec.reserve(amt); unsafe { ptr::copy(self.vec.as_ptr().offset(idx as isize), diff --git a/src/libcollectionstest/str.rs b/src/libcollectionstest/str.rs index 776d73ef10f..1150035eb42 100644 --- a/src/libcollectionstest/str.rs +++ b/src/libcollectionstest/str.rs @@ -794,10 +794,9 @@ fn test_rev_iterator() { #[test] fn test_chars_decoding() { - let mut bytes = [0; 4]; for c in (0..0x110000).filter_map(::std::char::from_u32) { - let len = c.encode_utf8(&mut bytes).unwrap_or(0); - let s = ::std::str::from_utf8(&bytes[..len]).unwrap(); + let bytes = c.encode_utf8(); + let s = ::std::str::from_utf8(bytes.as_slice()).unwrap(); if Some(c) != s.chars().next() { panic!("character {:x}={} does not decode correctly", c as u32, c); } @@ -806,10 +805,9 @@ fn test_chars_decoding() { #[test] fn test_chars_rev_decoding() { - let mut bytes = [0; 4]; for c in (0..0x110000).filter_map(::std::char::from_u32) { - let len = c.encode_utf8(&mut bytes).unwrap_or(0); - let s = ::std::str::from_utf8(&bytes[..len]).unwrap(); + let bytes = c.encode_utf8(); + let s = ::std::str::from_utf8(bytes.as_slice()).unwrap(); if Some(c) != s.chars().rev().next() { panic!("character {:x}={} does not decode correctly", c as u32, c); } diff --git a/src/libcore/char.rs b/src/libcore/char.rs index 5b39fa42c6e..b2b1dc5178e 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -269,10 +269,10 @@ pub trait CharExt { fn len_utf8(self) -> usize; 
#[stable(feature = "core", since = "1.6.0")] fn len_utf16(self) -> usize; - #[stable(feature = "core", since = "1.6.0")] - fn encode_utf8(self, dst: &mut [u8]) -> Option; - #[stable(feature = "core", since = "1.6.0")] - fn encode_utf16(self, dst: &mut [u16]) -> Option; + #[unstable(feature = "unicode", issue = "27784")] + fn encode_utf8(self) -> EncodeUtf8; + #[unstable(feature = "unicode", issue = "27784")] + fn encode_utf16(self) -> EncodeUtf16; } #[stable(feature = "core", since = "1.6.0")] @@ -336,75 +336,47 @@ impl CharExt for char { } #[inline] - fn encode_utf8(self, dst: &mut [u8]) -> Option { - encode_utf8_raw(self as u32, dst) + fn encode_utf8(self) -> EncodeUtf8 { + let code = self as u32; + let mut buf = [0; 4]; + let pos = if code < MAX_ONE_B { + buf[3] = code as u8; + 3 + } else if code < MAX_TWO_B { + buf[2] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; + buf[3] = (code & 0x3F) as u8 | TAG_CONT; + 2 + } else if code < MAX_THREE_B { + buf[1] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; + buf[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT; + buf[3] = (code & 0x3F) as u8 | TAG_CONT; + 1 + } else { + buf[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; + buf[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT; + buf[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT; + buf[3] = (code & 0x3F) as u8 | TAG_CONT; + 0 + }; + EncodeUtf8 { buf: buf, pos: pos } } #[inline] - fn encode_utf16(self, dst: &mut [u16]) -> Option { - encode_utf16_raw(self as u32, dst) - } -} - -/// Encodes a raw u32 value as UTF-8 into the provided byte buffer, -/// and then returns the number of bytes written. -/// -/// If the buffer is not large enough, nothing will be written into it -/// and a `None` will be returned. 
-#[inline] -#[unstable(feature = "char_internals", - reason = "this function should not be exposed publicly", - issue = "0")] -#[doc(hidden)] -pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option { - // Marked #[inline] to allow llvm optimizing it away - if code < MAX_ONE_B && !dst.is_empty() { - dst[0] = code as u8; - Some(1) - } else if code < MAX_TWO_B && dst.len() >= 2 { - dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; - dst[1] = (code & 0x3F) as u8 | TAG_CONT; - Some(2) - } else if code < MAX_THREE_B && dst.len() >= 3 { - dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; - dst[1] = (code >> 6 & 0x3F) as u8 | TAG_CONT; - dst[2] = (code & 0x3F) as u8 | TAG_CONT; - Some(3) - } else if dst.len() >= 4 { - dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; - dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT; - dst[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT; - dst[3] = (code & 0x3F) as u8 | TAG_CONT; - Some(4) - } else { - None - } -} - -/// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer, -/// and then returns the number of `u16`s written. -/// -/// If the buffer is not large enough, nothing will be written into it -/// and a `None` will be returned. -#[inline] -#[unstable(feature = "char_internals", - reason = "this function should not be exposed publicly", - issue = "0")] -#[doc(hidden)] -pub fn encode_utf16_raw(mut ch: u32, dst: &mut [u16]) -> Option { - // Marked #[inline] to allow llvm optimizing it away - if (ch & 0xFFFF) == ch && !dst.is_empty() { - // The BMP falls through (assuming non-surrogate, as it should) - dst[0] = ch as u16; - Some(1) - } else if dst.len() >= 2 { - // Supplementary planes break into surrogates. 
- ch -= 0x1_0000; - dst[0] = 0xD800 | ((ch >> 10) as u16); - dst[1] = 0xDC00 | ((ch as u16) & 0x3FF); - Some(2) - } else { - None + fn encode_utf16(self) -> EncodeUtf16 { + let mut buf = [0; 2]; + let mut code = self as u32; + let pos = if (code & 0xFFFF) == code { + // The BMP falls through (assuming non-surrogate, as it should) + buf[1] = code as u16; + 1 + } else { + // Supplementary planes break into surrogates. + code -= 0x1_0000; + buf[0] = 0xD800 | ((code >> 10) as u16); + buf[1] = 0xDC00 | ((code as u16) & 0x3FF); + 0 + }; + EncodeUtf16 { buf: buf, pos: pos } } } @@ -583,3 +555,80 @@ impl Iterator for EscapeDefault { } } } + +/// An iterator over `u8` entries representing the UTF-8 encoding of a `char` +/// value. +/// +/// Constructed via the `.encode_utf8()` method on `char`. +#[unstable(feature = "unicode", issue = "27784")] +#[derive(Debug)] +pub struct EncodeUtf8 { + buf: [u8; 4], + pos: usize, +} + +impl EncodeUtf8 { + /// Returns the remaining bytes of this iterator as a slice. + #[unstable(feature = "unicode", issue = "27784")] + pub fn as_slice(&self) -> &[u8] { + &self.buf[self.pos..] + } +} + +#[unstable(feature = "unicode", issue = "27784")] +impl Iterator for EncodeUtf8 { + type Item = u8; + + fn next(&mut self) -> Option { + if self.pos == self.buf.len() { + None + } else { + let ret = Some(self.buf[self.pos]); + self.pos += 1; + ret + } + } + + fn size_hint(&self) -> (usize, Option) { + self.as_slice().iter().size_hint() + } +} + +/// An iterator over `u16` entries representing the UTF-16 encoding of a `char` +/// value. +/// +/// Constructed via the `.encode_utf16()` method on `char`. +#[unstable(feature = "unicode", issue = "27784")] +#[derive(Debug)] +pub struct EncodeUtf16 { + buf: [u16; 2], + pos: usize, +} + +impl EncodeUtf16 { + /// Returns the remaining `u16` entries of this iterator as a slice. + #[unstable(feature = "unicode", issue = "27784")] + pub fn as_slice(&self) -> &[u16] { + &self.buf[self.pos..] 
+ } +} + + +#[unstable(feature = "unicode", issue = "27784")] +impl Iterator for EncodeUtf16 { + type Item = u16; + + fn next(&mut self) -> Option { + if self.pos == self.buf.len() { + None + } else { + let ret = Some(self.buf[self.pos]); + self.pos += 1; + ret + } + } + + fn size_hint(&self) -> (usize, Option) { + self.as_slice().iter().size_hint() + } +} diff --git a/src/libcore/fmt/mod.rs b/src/libcore/fmt/mod.rs index d2da16624ca..5617b6d63a7 100644 --- a/src/libcore/fmt/mod.rs +++ b/src/libcore/fmt/mod.rs @@ -99,9 +99,9 @@ pub trait Write { /// This function will return an instance of `Error` on error. #[stable(feature = "fmt_write_char", since = "1.1.0")] fn write_char(&mut self, c: char) -> Result { - let mut utf_8 = [0u8; 4]; - let bytes_written = c.encode_utf8(&mut utf_8).unwrap_or(0); - self.write_str(unsafe { str::from_utf8_unchecked(&utf_8[..bytes_written]) }) + self.write_str(unsafe { + str::from_utf8_unchecked(c.encode_utf8().as_slice()) + }) } /// Glue for usage of the `write!` macro with implementors of this trait. 
@@ -897,10 +897,9 @@ impl<'a> Formatter<'a> { // Writes the sign if it exists, and then the prefix if it was requested let write_prefix = |f: &mut Formatter| { if let Some(c) = sign { - let mut b = [0; 4]; - let n = c.encode_utf8(&mut b).unwrap_or(0); - let b = unsafe { str::from_utf8_unchecked(&b[..n]) }; - try!(f.buf.write_str(b)); + try!(f.buf.write_str(unsafe { + str::from_utf8_unchecked(c.encode_utf8().as_slice()) + })); } if prefixed { f.buf.write_str(prefix) } else { Ok(()) } @@ -1003,9 +1002,10 @@ impl<'a> Formatter<'a> { rt::v1::Alignment::Center => (padding / 2, (padding + 1) / 2), }; - let mut fill = [0; 4]; - let len = self.fill.encode_utf8(&mut fill).unwrap_or(0); - let fill = unsafe { str::from_utf8_unchecked(&fill[..len]) }; + let fill = self.fill.encode_utf8(); + let fill = unsafe { + str::from_utf8_unchecked(fill.as_slice()) + }; for _ in 0..pre_pad { try!(self.buf.write_str(fill)); @@ -1391,10 +1391,9 @@ impl Display for char { if f.width.is_none() && f.precision.is_none() { f.write_char(*self) } else { - let mut utf8 = [0; 4]; - let amt = self.encode_utf8(&mut utf8).unwrap_or(0); - let s: &str = unsafe { str::from_utf8_unchecked(&utf8[..amt]) }; - f.pad(s) + f.pad(unsafe { + str::from_utf8_unchecked(self.encode_utf8().as_slice()) + }) } } } diff --git a/src/libcoretest/char.rs b/src/libcoretest/char.rs index c1f3ea42ef4..ba8918fc6cb 100644 --- a/src/libcoretest/char.rs +++ b/src/libcoretest/char.rs @@ -175,9 +175,10 @@ fn test_escape_unicode() { #[test] fn test_encode_utf8() { fn check(input: char, expect: &[u8]) { - let mut buf = [0; 4]; - let n = input.encode_utf8(&mut buf).unwrap_or(0); - assert_eq!(&buf[..n], expect); + assert_eq!(input.encode_utf8().as_slice(), expect); + for (a, b) in input.encode_utf8().zip(expect) { + assert_eq!(a, *b); + } } check('x', &[0x78]); @@ -189,9 +190,10 @@ fn test_encode_utf8() { #[test] fn test_encode_utf16() { fn check(input: char, expect: &[u16]) { - let mut buf = [0; 2]; - let n = input.encode_utf16(&mut 
buf).unwrap_or(0); - assert_eq!(&buf[..n], expect); + assert_eq!(input.encode_utf16().as_slice(), expect); + for (a, b) in input.encode_utf16().zip(expect) { + assert_eq!(a, *b); + } } check('x', &[0x0078]); diff --git a/src/librustc_unicode/char.rs b/src/librustc_unicode/char.rs index 5bc5c786160..e2440227cf4 100644 --- a/src/librustc_unicode/char.rs +++ b/src/librustc_unicode/char.rs @@ -35,7 +35,9 @@ use tables::{derived_property, property, general_category, conversions}; // stable reexports #[stable(feature = "rust1", since = "1.0.0")] -pub use core::char::{MAX, from_u32, from_u32_unchecked, from_digit, EscapeUnicode, EscapeDefault}; +pub use core::char::{MAX, from_u32, from_u32_unchecked, from_digit}; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::char::{EscapeUnicode, EscapeDefault, EncodeUtf8, EncodeUtf16}; // unstable reexports #[unstable(feature = "unicode", issue = "27783")] @@ -408,84 +410,50 @@ impl char { C::len_utf16(self) } - /// Encodes this character as UTF-8 into the provided byte buffer, and then - /// returns the number of bytes written. + /// Returns an iterator over the bytes of this character as UTF-8. /// - /// If the buffer is not large enough, nothing will be written into it and a - /// `None` will be returned. A buffer of length four is large enough to - /// encode any `char`. + /// The returned iterator also has an `as_slice()` method to view the + /// encoded bytes as a byte slice. /// /// # Examples /// - /// In both of these examples, 'ß' takes two bytes to encode. 
- /// /// ``` /// #![feature(unicode)] /// - /// let mut b = [0; 2]; + /// let iterator = 'ß'.encode_utf8(); + /// assert_eq!(iterator.as_slice(), [0xc3, 0x9f]); /// - /// let result = 'ß'.encode_utf8(&mut b); - /// - /// assert_eq!(result, Some(2)); + /// for (i, byte) in iterator.enumerate() { + /// println!("byte {}: {:x}", i, byte); + /// } /// ``` - /// - /// A buffer that's too small: - /// - /// ``` - /// #![feature(unicode)] - /// - /// let mut b = [0; 1]; - /// - /// let result = 'ß'.encode_utf8(&mut b); - /// - /// assert_eq!(result, None); - /// ``` - #[unstable(feature = "unicode", - reason = "pending decision about Iterator/Writer/Reader", - issue = "27784")] + #[unstable(feature = "unicode", issue = "27784")] #[inline] - pub fn encode_utf8(self, dst: &mut [u8]) -> Option { - C::encode_utf8(self, dst) + pub fn encode_utf8(self) -> EncodeUtf8 { + C::encode_utf8(self) } - /// Encodes this character as UTF-16 into the provided `u16` buffer, and - /// then returns the number of `u16`s written. + /// Returns an iterator over the `u16` entries of this character as UTF-16. /// - /// If the buffer is not large enough, nothing will be written into it and a - /// `None` will be returned. A buffer of length 2 is large enough to encode - /// any `char`. + /// The returned iterator also has an `as_slice()` method to view the + /// encoded form as a slice. /// /// # Examples /// - /// In both of these examples, '𝕊' takes two `u16`s to encode. 
- /// /// ``` /// #![feature(unicode)] /// - /// let mut b = [0; 2]; + /// let iterator = '𝕊'.encode_utf16(); + /// assert_eq!(iterator.as_slice(), [0xd835, 0xdd4a]); /// - /// let result = '𝕊'.encode_utf16(&mut b); - /// - /// assert_eq!(result, Some(2)); + /// for (i, val) in iterator.enumerate() { + /// println!("entry {}: {:x}", i, val); + /// } /// ``` - /// - /// A buffer that's too small: - /// - /// ``` - /// #![feature(unicode)] - /// - /// let mut b = [0; 1]; - /// - /// let result = '𝕊'.encode_utf16(&mut b); - /// - /// assert_eq!(result, None); - /// ``` - #[unstable(feature = "unicode", - reason = "pending decision about Iterator/Writer/Reader", - issue = "27784")] + #[unstable(feature = "unicode", issue = "27784")] #[inline] - pub fn encode_utf16(self, dst: &mut [u16]) -> Option { - C::encode_utf16(self, dst) + pub fn encode_utf16(self) -> EncodeUtf16 { + C::encode_utf16(self) } /// Returns true if this `char` is an alphabetic code point, and false if not. diff --git a/src/librustc_unicode/lib.rs b/src/librustc_unicode/lib.rs index fb85176340e..2f7f724e6af 100644 --- a/src/librustc_unicode/lib.rs +++ b/src/librustc_unicode/lib.rs @@ -35,6 +35,7 @@ #![feature(core_char_ext)] #![feature(lang_items)] #![feature(staged_api)] +#![feature(unicode)] mod tables; mod u_str; diff --git a/src/librustc_unicode/u_str.rs b/src/librustc_unicode/u_str.rs index 9a6700ad47c..18734a66871 100644 --- a/src/librustc_unicode/u_str.rs +++ b/src/librustc_unicode/u_str.rs @@ -155,13 +155,13 @@ impl Iterator for Utf16Encoder where I: Iterator { return Some(tmp); } - let mut buf = [0; 2]; self.chars.next().map(|ch| { - let n = CharExt::encode_utf16(ch, &mut buf).unwrap_or(0); - if n == 2 { - self.extra = buf[1]; + let n = CharExt::encode_utf16(ch); + let n = n.as_slice(); + if n.len() == 2 { + self.extra = n[1]; } - buf[0] + n[0] }) } diff --git a/src/libserialize/json.rs b/src/libserialize/json.rs index b95eddbc661..ab16ef23dd1 100644 --- a/src/libserialize/json.rs +++ 
b/src/libserialize/json.rs @@ -433,10 +433,9 @@ fn escape_str(wr: &mut fmt::Write, v: &str) -> EncodeResult { } fn escape_char(writer: &mut fmt::Write, v: char) -> EncodeResult { - let mut buf = [0; 4]; - let n = v.encode_utf8(&mut buf).unwrap(); - let buf = unsafe { str::from_utf8_unchecked(&buf[..n]) }; - escape_str(writer, buf) + escape_str(writer, unsafe { + str::from_utf8_unchecked(v.encode_utf8().as_slice()) + }) } fn spaces(wr: &mut fmt::Write, mut n: usize) -> EncodeResult { diff --git a/src/libstd/sys/common/wtf8.rs b/src/libstd/sys/common/wtf8.rs index 48e9adb9296..db3bc2ed751 100644 --- a/src/libstd/sys/common/wtf8.rs +++ b/src/libstd/sys/common/wtf8.rs @@ -25,7 +25,6 @@ // unix (it's mostly used on windows), so don't worry about dead code here. #![allow(dead_code)] -use core::char::{encode_utf8_raw, encode_utf16_raw}; use core::str::next_code_point; use ascii::*; @@ -206,19 +205,10 @@ impl Wtf8Buf { /// Copied from String::push /// This does **not** include the WTF-8 concatenation check. fn push_code_point_unchecked(&mut self, code_point: CodePoint) { - let cur_len = self.len(); - // This may use up to 4 bytes. - self.reserve(4); - - unsafe { - // Attempt to not use an intermediate buffer by just pushing bytes - // directly onto this string. 
- let slice = slice::from_raw_parts_mut( - self.bytes.as_mut_ptr().offset(cur_len as isize), 4 - ); - let used = encode_utf8_raw(code_point.value, slice).unwrap(); - self.bytes.set_len(cur_len + used); - } + let bytes = unsafe { + char::from_u32_unchecked(code_point.value).encode_utf8() + }; + self.bytes.extend_from_slice(bytes.as_slice()); } #[inline] @@ -747,12 +737,15 @@ impl<'a> Iterator for EncodeWide<'a> { return Some(tmp); } - let mut buf = [0; 2]; self.code_points.next().map(|code_point| { - let n = encode_utf16_raw(code_point.value, &mut buf) - .unwrap_or(0); - if n == 2 { self.extra = buf[1]; } - buf[0] + let n = unsafe { + char::from_u32_unchecked(code_point.value).encode_utf16() + }; + let n = n.as_slice(); + if n.len() == 2 { + self.extra = n[1]; + } + n[0] }) }