From 0504d7e57bf536dabbb738b5b0d268a266d30659 Mon Sep 17 00:00:00 2001 From: blake2-ppc Date: Fri, 2 Aug 2013 18:34:00 +0200 Subject: [PATCH] std: Speed up str::is_utf8 Use unchecked vec indexing since the vector bounds are checked by the loop. Iterators are not easy to use in this case since we skip 1-4 bytes each lap. This part of the commit speeds up is_utf8 for ASCII input. Check codepoint ranges by checking the byte ranges manually instead of computing a full decoding for multibyte encodings. This is easy to read and corresponds to the UTF-8 syntax in the RFC. No changes to what we accept. A comment notes that surrogate halves are accepted. Before: test str::bench::is_utf8_100_ascii ... bench: 165 ns/iter (+/- 3) test str::bench::is_utf8_100_multibyte ... bench: 218 ns/iter (+/- 5) After: test str::bench::is_utf8_100_ascii ... bench: 130 ns/iter (+/- 1) test str::bench::is_utf8_100_multibyte ... bench: 156 ns/iter (+/- 3) --- src/libstd/str.rs | 100 ++++++++++++++++++++++++++++++---------------- 1 file changed, 66 insertions(+), 34 deletions(-) diff --git a/src/libstd/str.rs b/src/libstd/str.rs index f0c0595744c..4acece78284 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -564,51 +564,63 @@ fn match_at<'a,'b>(haystack: &'a str, needle: &'b str, at: uint) -> bool { Section: Misc */ -// Return the initial codepoint accumulator for the first byte. -// The first byte is special, only want bottom 5 bits for width 2, 4 bits -// for width 3, and 3 bits for width 4 -macro_rules! utf8_first_byte( - ($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as uint) -) - -// return the value of $ch updated with continuation byte $byte -macro_rules! utf8_acc_cont_byte( - ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as uint) -) - /// Determines if a vector of bytes contains valid UTF-8 pub fn is_utf8(v: &[u8]) -> bool { let mut i = 0u; let total = v.len(); + fn unsafe_get(xs: &[u8], i: uint) -> u8 { + unsafe { *xs.unsafe_ref(i) } + } while i < total { - if v[i] < 128u8 { + let v_i = unsafe_get(v, i); + if v_i < 128u8 { i += 1u; } else { - let w = utf8_char_width(v[i]); + let w = utf8_char_width(v_i); if w == 0u { return false; } let nexti = i + w; if nexti > total { return false; } - // 1. Make sure the correct number of continuation bytes are present - // 2. Check codepoint ranges (deny overlong encodings) - // 2-byte encoding is for codepoints \u0080 to \u07ff - // 3-byte encoding is for codepoints \u0800 to \uffff - // 4-byte encoding is for codepoints \u10000 to \u10ffff - // 2-byte encodings are correct if the width and continuation match up - if v[i + 1] & 192u8 != TAG_CONT_U8 { return false; } - if w > 2 { - let mut ch; - ch = utf8_first_byte!(v[i], w); - ch = utf8_acc_cont_byte!(ch, v[i + 1]); - if v[i + 2] & 192u8 != TAG_CONT_U8 { return false; } - ch = utf8_acc_cont_byte!(ch, v[i + 2]); - if w == 3 && ch < MAX_TWO_B { return false; } - if w > 3 { - if v[i + 3] & 192u8 != TAG_CONT_U8 { return false; } - ch = utf8_acc_cont_byte!(ch, v[i + 3]); - if ch < MAX_THREE_B || ch >= MAX_UNICODE { return false; } - } + // 2-byte encoding is for codepoints \u0080 to \u07ff + // first C2 80 last DF BF + // 3-byte encoding is for codepoints \u0800 to \uffff + // first E0 A0 80 last EF BF BF + // 4-byte encoding is for codepoints \u10000 to \u10ffff + // first F0 90 80 80 last F4 8F BF BF + // + // Use the UTF-8 syntax from the RFC + // + // https://tools.ietf.org/html/rfc3629 + // UTF8-1 = %x00-7F + // UTF8-2 = %xC2-DF UTF8-tail + // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / + // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) + // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / + // %xF4 %x80-8F 2( UTF8-tail ) + // UTF8-tail = %x80-BF + // -- + // This code allows surrogate pairs: \uD800 to \uDFFF -> ED A0 80 to ED BF BF + match w { + 2 => if unsafe_get(v, i + 1) & 192u8 != TAG_CONT_U8 { + return false + }, + 3 => match (v_i, + unsafe_get(v, i + 1), + unsafe_get(v, i + 2) & 192u8) { + (0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) => (), + (0xE1 .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => (), + _ => return false, + }, + _ => match (v_i, + unsafe_get(v, i + 1), + unsafe_get(v, i + 2) & 192u8, + unsafe_get(v, i + 3) & 192u8) { + (0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (), + (0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (), + (0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => (), + _ => return false, + }, } i = nexti; @@ -756,6 +768,18 @@ pub struct CharRange { next: uint } +// Return the initial codepoint accumulator for the first byte. +// The first byte is special, only want bottom 5 bits for width 2, 4 bits +// for width 3, and 3 bits for width 4 +macro_rules! utf8_first_byte( + ($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as uint) +) + +// return the value of $ch updated with continuation byte $byte +macro_rules! utf8_acc_cont_byte( + ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as uint) +) + // UTF-8 tags and ranges priv static TAG_CONT_U8: u8 = 128u8; priv static TAG_CONT: uint = 128u; @@ -2833,13 +2857,21 @@ mod tests { } #[test] - fn test_is_utf8_deny_overlong() { + fn test_is_utf8() { assert!(!is_utf8([0xc0, 0x80])); assert!(!is_utf8([0xc0, 0xae])); assert!(!is_utf8([0xe0, 0x80, 0x80])); assert!(!is_utf8([0xe0, 0x80, 0xaf])); assert!(!is_utf8([0xe0, 0x81, 0x81])); assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac])); + assert!(!is_utf8([0xf4, 0x90, 0x80, 0x80])); + + assert!(is_utf8([0xC2, 0x80])); + assert!(is_utf8([0xDF, 0xBF])); + assert!(is_utf8([0xE0, 0xA0, 0x80])); + assert!(is_utf8([0xEF, 0xBF, 0xBF])); + assert!(is_utf8([0xF0, 0x90, 0x80, 0x80])); + assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF])); }