add core::char::DecodeUtf8

This commit is contained in:
M Farkas-Dyck 2016-05-27 08:16:27 -08:00
parent fe96928d7d
commit 837029fec1
5 changed files with 80 additions and 0 deletions

View File

@ -676,3 +676,50 @@ impl Iterator for EncodeUtf16 {
self.as_slice().iter().size_hint()
}
}
/// An iterator adapter that decodes a stream of bytes as UTF-8.
///
/// It wraps a peekable iterator of `u8` and, through its `Iterator`
/// implementation, yields one `Result<char, InvalidSequence>` per decoded
/// character or invalid sequence. Values are created with the `decode_utf8`
/// function; the wrapped iterator is a private field.
#[unstable(feature = "decode_utf8", issue = "33906")]
#[derive(Clone, Debug)]
pub struct DecodeUtf8<I: Iterator<Item = u8>>(::iter::Peekable<I>);
/// Builds a `DecodeUtf8` adapter that decodes the bytes produced by `i` as UTF-8.
///
/// Anything convertible into an iterator of `u8` is accepted. The iterator is
/// made peekable so the decoder can inspect a byte before deciding whether to
/// consume it.
#[unstable(feature = "decode_utf8", issue = "33906")]
#[inline]
pub fn decode_utf8<I>(i: I) -> DecodeUtf8<I::IntoIter>
    where I: IntoIterator<Item = u8>
{
    let bytes = i.into_iter();
    DecodeUtf8(bytes.peekable())
}
/// The error `<DecodeUtf8 as Iterator>::next` returns for an invalid input sequence.
///
/// The single private `()` field keeps the type opaque: it carries no details
/// about the failure and cannot be constructed outside this module.
#[unstable(feature = "decode_utf8", issue = "33906")]
#[derive(PartialEq, Debug)]
pub struct InvalidSequence(());
#[unstable(feature = "decode_utf8", issue = "33906")]
impl<I: Iterator<Item = u8>> Iterator for DecodeUtf8<I> {
    type Item = Result<char, InvalidSequence>;

    /// Decodes the next character from the underlying bytes.
    ///
    /// Returns `None` once the byte iterator is exhausted, `Some(Ok(c))` for a
    /// well-formed sequence and `Some(Err(InvalidSequence))` otherwise. A byte
    /// that interrupts a multi-byte sequence is only peeked at, never
    /// consumed, so it starts the sequence decoded by the following call.
    #[inline]
    fn next(&mut self) -> Option<Result<char, InvalidSequence>> {
        self.0.next().map(|b| {
            // High bit clear: a single-byte (ASCII) character.
            if b & 0x80 == 0 { Ok(b as char) } else {
                // The count of leading one bits in the lead byte is the
                // number of bytes in the UTF-8 representation.
                let l = (!b).leading_zeros() as usize;
                // l == 1 is a continuation byte with no lead byte; l > 6 is
                // 0xFE or 0xFF. Lead bytes announcing 5 or 6 bytes pass this
                // check but can never succeed below: the value is either
                // rejected by `from_u32` or fails the length comparison.
                if l < 2 || l > 6 { return Err(InvalidSequence(())) };
                // Keep only the payload bits of the lead byte.
                let mut x = (b as u32) & (0x7F >> l);
                for _ in 0..l-1 {
                    match self.0.peek() {
                        // Consume the byte only once it is known to be a
                        // continuation byte (10xxxxxx).
                        Some(&b) if b & 0xC0 == 0x80 => {
                            self.0.next();
                            x = (x << 6) | ((b as u32) & 0x3F);
                        },
                        _ => return Err(InvalidSequence(())),
                    }
                }
                match from_u32(x) {
                    // Requiring the shortest encoding to have the same length
                    // rejects overlong sequences.
                    Some(x) if l == x.len_utf8() => Ok(x),
                    _ => Err(InvalidSequence(())),
                }
            }
        })
    }

    /// Bounds the number of remaining items from the remaining byte count.
    ///
    /// Every item consumes at least one and at most six bytes, so `n` bytes
    /// produce between `ceil(n / 6)` and `n` items.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (lo, hi) = self.0.size_hint();
        // Ceiling division written so it cannot overflow for large `lo`.
        (lo / 6 + (lo % 6 != 0) as usize, hi)
    }
}

View File

@ -302,3 +302,32 @@ fn eu_iterator_specializations() {
check('\u{12340}');
check('\u{10FFFF}');
}
#[test]
fn test_decode_utf8() {
    use core::char::*;
    use core::iter::FromIterator;

    // Each case pairs the expected text with the bytes to decode. Invalid
    // sequences are mapped to U+FFFD below, so the expected text contains one
    // U+FFFD per error the decoder should report. Non-ASCII characters are
    // written as escapes so the expectations survive any re-encoding of this
    // file.
    for &(str, bs) in [("", &[] as &[u8]),
                       ("A", &[0x41u8] as &[u8]),
                       ("\u{FFFD}", &[0xC1u8, 0x81u8] as &[u8]),
                       ("\u{2665}", &[0xE2u8, 0x99u8, 0xA5u8] as &[u8]),
                       ("\u{2665}A", &[0xE2u8, 0x99u8, 0xA5u8, 0x41u8] as &[u8]),
                       ("\u{FFFD}", &[0xE2u8, 0x99u8] as &[u8]),
                       ("\u{FFFD}A", &[0xE2u8, 0x99u8, 0x41u8] as &[u8]),
                       ("\u{FFFD}", &[0xC0u8] as &[u8]),
                       ("\u{FFFD}A", &[0xC0u8, 0x41u8] as &[u8]),
                       ("\u{FFFD}", &[0x80u8] as &[u8]),
                       ("\u{FFFD}A", &[0x80u8, 0x41u8] as &[u8]),
                       ("\u{FFFD}", &[0xFEu8] as &[u8]),
                       ("\u{FFFD}A", &[0xFEu8, 0x41u8] as &[u8]),
                       ("\u{FFFD}", &[0xFFu8] as &[u8]),
                       ("\u{FFFD}A", &[0xFFu8, 0x41u8] as &[u8])].iter() {
        assert!(Iterator::eq(str.chars(),
                             decode_utf8(bs.iter().cloned())
                                 .map(|r_b| r_b.unwrap_or('\u{FFFD}'))),
                "chars = {}, bytes = {:?}, decoded = {:?}", str, bs,
                Vec::from_iter(decode_utf8(bs.iter().cloned())
                                   .map(|r_b| r_b.unwrap_or('\u{FFFD}'))));
    }
}

View File

@ -18,6 +18,7 @@
#![feature(core_private_bignum)]
#![feature(core_private_diy_float)]
#![feature(dec2flt)]
#![feature(decode_utf8)]
#![feature(fixed_size_array)]
#![feature(float_extras)]
#![feature(flt2dec)]

View File

@ -39,6 +39,8 @@ pub use core::char::{MAX, from_digit, from_u32, from_u32_unchecked};
pub use core::char::{EncodeUtf16, EncodeUtf8, EscapeDefault, EscapeUnicode};
// unstable reexports
#[unstable(feature = "decode_utf8", issue = "33906")]
pub use core::char::{DecodeUtf8, decode_utf8};
#[unstable(feature = "unicode", issue = "27783")]
pub use tables::UNICODE_VERSION;

View File

@ -33,6 +33,7 @@
#![no_std]
#![feature(core_char_ext)]
#![feature(decode_utf8)]
#![feature(lang_items)]
#![feature(staged_api)]
#![feature(unicode)]