std: Speed up str::is_utf8
Use unchecked vec indexing since the vector bounds are checked by the loop. Iterators are not easy to use in this case since we skip 1-4 bytes each lap. This part of the commit speeds up is_utf8 for ASCII input.

Check codepoint ranges by checking the byte ranges manually instead of computing a full decoding for multibyte encodings. This is easy to read and corresponds to the UTF-8 syntax in the RFC.

No changes to what we accept. A comment notes that surrogate halves are accepted.

Before:

test str::bench::is_utf8_100_ascii ... bench: 165 ns/iter (+/- 3)
test str::bench::is_utf8_100_multibyte ... bench: 218 ns/iter (+/- 5)

After:

test str::bench::is_utf8_100_ascii ... bench: 130 ns/iter (+/- 1)
test str::bench::is_utf8_100_multibyte ... bench: 156 ns/iter (+/- 3)
This commit is contained in:
parent
2460170e6a
commit
0504d7e57b
@ -564,51 +564,63 @@ fn match_at<'a,'b>(haystack: &'a str, needle: &'b str, at: uint) -> bool {
|
||||
Section: Misc
|
||||
*/
|
||||
|
||||
// Return the initial codepoint accumulator for the first byte.
|
||||
// The first byte is special, only want bottom 5 bits for width 2, 4 bits
|
||||
// for width 3, and 3 bits for width 4
|
||||
macro_rules! utf8_first_byte(
|
||||
($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as uint)
|
||||
)
|
||||
|
||||
// return the value of $ch updated with continuation byte $byte
|
||||
macro_rules! utf8_acc_cont_byte(
|
||||
($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as uint)
|
||||
)
|
||||
|
||||
/// Determines if a vector of bytes contains valid UTF-8
|
||||
pub fn is_utf8(v: &[u8]) -> bool {
|
||||
let mut i = 0u;
|
||||
let total = v.len();
|
||||
fn unsafe_get(xs: &[u8], i: uint) -> u8 {
|
||||
unsafe { *xs.unsafe_ref(i) }
|
||||
}
|
||||
while i < total {
|
||||
if v[i] < 128u8 {
|
||||
let v_i = unsafe_get(v, i);
|
||||
if v_i < 128u8 {
|
||||
i += 1u;
|
||||
} else {
|
||||
let w = utf8_char_width(v[i]);
|
||||
let w = utf8_char_width(v_i);
|
||||
if w == 0u { return false; }
|
||||
|
||||
let nexti = i + w;
|
||||
if nexti > total { return false; }
|
||||
// 1. Make sure the correct number of continuation bytes are present
|
||||
// 2. Check codepoint ranges (deny overlong encodings)
|
||||
// 2-byte encoding is for codepoints \u0080 to \u07ff
|
||||
// 3-byte encoding is for codepoints \u0800 to \uffff
|
||||
// 4-byte encoding is for codepoints \u10000 to \u10ffff
|
||||
|
||||
// 2-byte encodings are correct if the width and continuation match up
|
||||
if v[i + 1] & 192u8 != TAG_CONT_U8 { return false; }
|
||||
if w > 2 {
|
||||
let mut ch;
|
||||
ch = utf8_first_byte!(v[i], w);
|
||||
ch = utf8_acc_cont_byte!(ch, v[i + 1]);
|
||||
if v[i + 2] & 192u8 != TAG_CONT_U8 { return false; }
|
||||
ch = utf8_acc_cont_byte!(ch, v[i + 2]);
|
||||
if w == 3 && ch < MAX_TWO_B { return false; }
|
||||
if w > 3 {
|
||||
if v[i + 3] & 192u8 != TAG_CONT_U8 { return false; }
|
||||
ch = utf8_acc_cont_byte!(ch, v[i + 3]);
|
||||
if ch < MAX_THREE_B || ch >= MAX_UNICODE { return false; }
|
||||
}
|
||||
// 2-byte encoding is for codepoints \u0080 to \u07ff
|
||||
// first C2 80 last DF BF
|
||||
// 3-byte encoding is for codepoints \u0800 to \uffff
|
||||
// first E0 A0 80 last EF BF BF
|
||||
// 4-byte encoding is for codepoints \u10000 to \u10ffff
|
||||
// first F0 90 80 80 last F4 8F BF BF
|
||||
//
|
||||
// Use the UTF-8 syntax from the RFC
|
||||
//
|
||||
// https://tools.ietf.org/html/rfc3629
|
||||
// UTF8-1 = %x00-7F
|
||||
// UTF8-2 = %xC2-DF UTF8-tail
|
||||
// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
|
||||
// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
|
||||
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
|
||||
// %xF4 %x80-8F 2( UTF8-tail )
|
||||
// UTF8-tail = %x80-BF
|
||||
// --
|
||||
// NOTE: unlike the RFC grammar above, this code accepts surrogate halves: \uD800 to \uDFFF -> ED A0 80 to ED BF BF
|
||||
match w {
|
||||
2 => if unsafe_get(v, i + 1) & 192u8 != TAG_CONT_U8 {
|
||||
return false
|
||||
},
|
||||
3 => match (v_i,
|
||||
unsafe_get(v, i + 1),
|
||||
unsafe_get(v, i + 2) & 192u8) {
|
||||
(0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) => (),
|
||||
(0xE1 .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => (),
|
||||
_ => return false,
|
||||
},
|
||||
_ => match (v_i,
|
||||
unsafe_get(v, i + 1),
|
||||
unsafe_get(v, i + 2) & 192u8,
|
||||
unsafe_get(v, i + 3) & 192u8) {
|
||||
(0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
|
||||
(0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
|
||||
(0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => (),
|
||||
_ => return false,
|
||||
},
|
||||
}
|
||||
|
||||
i = nexti;
|
||||
@ -756,6 +768,18 @@ pub struct CharRange {
|
||||
next: uint
|
||||
}
|
||||
|
||||
// Return the initial codepoint accumulator for the first byte.
|
||||
// The first byte is special, only want bottom 5 bits for width 2, 4 bits
|
||||
// for width 3, and 3 bits for width 4
|
||||
macro_rules! utf8_first_byte(
|
||||
($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as uint)
|
||||
)
|
||||
|
||||
// return the value of $ch updated with continuation byte $byte
|
||||
macro_rules! utf8_acc_cont_byte(
|
||||
($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as uint)
|
||||
)
|
||||
|
||||
// UTF-8 tags and ranges
|
||||
priv static TAG_CONT_U8: u8 = 128u8;
|
||||
priv static TAG_CONT: uint = 128u;
|
||||
@ -2833,13 +2857,21 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_is_utf8_deny_overlong() {
|
||||
fn test_is_utf8() {
|
||||
assert!(!is_utf8([0xc0, 0x80]));
|
||||
assert!(!is_utf8([0xc0, 0xae]));
|
||||
assert!(!is_utf8([0xe0, 0x80, 0x80]));
|
||||
assert!(!is_utf8([0xe0, 0x80, 0xaf]));
|
||||
assert!(!is_utf8([0xe0, 0x81, 0x81]));
|
||||
assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac]));
|
||||
assert!(!is_utf8([0xf4, 0x90, 0x80, 0x80]));
|
||||
|
||||
assert!(is_utf8([0xC2, 0x80]));
|
||||
assert!(is_utf8([0xDF, 0xBF]));
|
||||
assert!(is_utf8([0xE0, 0xA0, 0x80]));
|
||||
assert!(is_utf8([0xEF, 0xBF, 0xBF]));
|
||||
assert!(is_utf8([0xF0, 0x90, 0x80, 0x80]));
|
||||
assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF]));
|
||||
}
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user