From 6dd185930d850b653ae4e5f4c37c3f1a2b64e4cf Mon Sep 17 00:00:00 2001 From: blake2-ppc Date: Tue, 30 Jul 2013 17:17:21 +0200 Subject: [PATCH 1/4] std: Disallow bytes 0xC0, 0xC1 (192, 193) in utf-8 Bytes 0xC0, 0xC1 can only be used to start 2-byte codepoint encodings, that are 'overlong encodings' of codepoints below 128. The reference given in a comment -- https://tools.ietf.org/html/rfc3629 -- does in fact already exclude these bytes, so no additional comment should be needed in the code. --- src/libstd/str.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 6981005ed2f..1669f0c7976 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -712,7 +712,7 @@ static UTF8_CHAR_WIDTH: [u8, ..256] = [ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF -2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF From b4ff95599a05da66d2ba0955cc7ae33dd6bfe7fe Mon Sep 17 00:00:00 2001 From: blake2-ppc Date: Tue, 30 Jul 2013 18:39:31 +0200 Subject: [PATCH 2/4] std: Deny overlong encodings in UTF-8 An 'overlong encoding' is a codepoint encoded non-minimally using the utf-8 format. Denying these enforce each codepoint to have only one valid representation in utf-8. An example is byte sequence 0xE0 0x80 0x80 which could be interpreted as U+0, but it's an overlong encoding since the canonical form is just 0x00. Another example is 0xE0 0x80 0xAF which was previously accepted and is an overlong encoding of the solidus "/". Directory traversal characters like / and . form the most compelling argument for why this commit is security critical. Factor out common UTF-8 decoding expressions as macros. This commit will partly duplicate UTF-8 decoding, so it is now present in both fn is_utf8() and .char_range_at(); the latter using an assumption of a valid str. --- src/libstd/str.rs | 53 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 8 deletions(-) diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 1669f0c7976..9695bd16be3 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -564,6 +564,18 @@ fn match_at<'a,'b>(haystack: &'a str, needle: &'b str, at: uint) -> bool { Section: Misc */ +// Return the initial codepoint accumulator for the first byte. +// The first byte is special, only want bottom 5 bits for width 2, 4 bits +// for width 3, and 3 bits for width 4 +macro_rules! utf8_first_byte( + ($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as uint) +) + +// return the value of $ch updated with continuation byte $byte +macro_rules! utf8_acc_cont_byte( + ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as uint) +) + /// Determines if a vector of bytes contains valid UTF-8 pub fn is_utf8(v: &[u8]) -> bool { let mut i = 0u; @@ -577,11 +589,26 @@ pub fn is_utf8(v: &[u8]) -> bool { let nexti = i + w; if nexti > total { return false; } + // 1. Make sure the correct number of continuation bytes are present + // 2. Check codepoint ranges (deny overlong encodings) + // 2-byte encoding is for codepoints \u0080 to \u07ff + // 3-byte encoding is for codepoints \u0800 to \uffff + // 4-byte encoding is for codepoints \u10000 to \u10ffff + // 2-byte encodings are correct if the width and continuation match up if v[i + 1] & 192u8 != TAG_CONT_U8 { return false; } if w > 2 { + let mut ch; + ch = utf8_first_byte!(v[i], w); + ch = utf8_acc_cont_byte!(ch, v[i + 1]); if v[i + 2] & 192u8 != TAG_CONT_U8 { return false; } - if w > 3 && (v[i + 3] & 192u8 != TAG_CONT_U8) { return false; } + ch = utf8_acc_cont_byte!(ch, v[i + 2]); + if w == 3 && ch < MAX_TWO_B { return false; } + if w > 3 { + if v[i + 3] & 192u8 != TAG_CONT_U8 { return false; } + ch = utf8_acc_cont_byte!(ch, v[i + 3]); + if ch < MAX_THREE_B || ch >= MAX_UNICODE { return false; } + } } i = nexti; @@ -738,6 +765,7 @@ static MAX_TWO_B: uint = 2048u; static TAG_THREE_B: uint = 224u; static MAX_THREE_B: uint = 65536u; static TAG_FOUR_B: uint = 240u; +static MAX_UNICODE: uint = 1114112u; /// Unsafe operations pub mod raw { @@ -1665,12 +1693,10 @@ impl<'self> StrSlice<'self> for &'self str { let w = UTF8_CHAR_WIDTH[val] as uint; assert!((w != 0)); - // First byte is special, only want bottom 5 bits for width 2, 4 bits - // for width 3, and 3 bits for width 4 - val &= 0x7Fu >> w; - val = (val << 6) | (s[i + 1] & 63u8) as uint; - if w > 2 { val = (val << 6) | (s[i + 2] & 63u8) as uint; } - if w > 3 { val = (val << 6) | (s[i + 3] & 63u8) as uint; } + val = utf8_first_byte!(val, w); + val = utf8_acc_cont_byte!(val, s[i + 1]); + if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); } + if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); } return CharRange {ch: val as char, next: i + w}; } @@ -2035,7 +2061,7 @@ impl OwnedStr for ~str { /// Appends a character to the back of a string #[inline] fn push_char(&mut self, c: char) { - assert!(c as uint <= 0x10ffff); // FIXME: #7609: should be enforced on all `char` + assert!((c as uint) < MAX_UNICODE); // FIXME: #7609: should be enforced on all `char` unsafe { let code = c as uint; let nb = if code < MAX_ONE_B { 1u } @@ -2802,6 +2828,17 @@ mod tests { assert_eq!(ss, from_bytes(bb)); } + #[test] + fn test_is_utf8_deny_overlong() { + assert!(!is_utf8([0xc0, 0x80])); + assert!(!is_utf8([0xc0, 0xae])); + assert!(!is_utf8([0xe0, 0x80, 0x80])); + assert!(!is_utf8([0xe0, 0x80, 0xaf])); + assert!(!is_utf8([0xe0, 0x81, 0x81])); + assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac])); + } + + #[test] #[ignore(cfg(windows))] fn test_from_bytes_fail() { From aa89325cb01308f48b134f199a99c6d2f4e0bfe9 Mon Sep 17 00:00:00 2001 From: blake2-ppc Date: Tue, 30 Jul 2013 19:10:54 +0200 Subject: [PATCH 3/4] std: Add from_bytes test for utf-8 using codepoints above 0xffff --- src/libstd/str.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 9695bd16be3..fafd2ecf7a5 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -2825,7 +2825,10 @@ mod tests { 0x20_u8, 0x4e_u8, 0x61_u8, 0x6d_u8]; + assert_eq!(ss, from_bytes(bb)); + assert_eq!(~"πŒ€πŒ–πŒ‹πŒ„πŒ‘πŒ‰ΰΈ›ΰΈ£Ψ―ΩˆΩ„Ψ© Ψ§Ω„ΩƒΩˆΩŠΨͺΰΈ—ΰΈ¨ΰΉ„ΰΈ—ΰΈ’δΈ­εŽπ…πŒΏπŒ»π†πŒΉπŒ»πŒ°", + from_bytes(bytes!("πŒ€πŒ–πŒ‹πŒ„πŒ‘πŒ‰ΰΈ›ΰΈ£Ψ―ΩˆΩ„Ψ© Ψ§Ω„ΩƒΩˆΩŠΨͺΰΈ—ΰΈ¨ΰΉ„ΰΈ—ΰΈ’δΈ­εŽπ…πŒΏπŒ»π†πŒΉπŒ»πŒ°"))); } #[test] From 8f9014c15996321d214a84e0fcd6437874e87483 Mon Sep 17 00:00:00 2001 From: blake2-ppc Date: Tue, 30 Jul 2013 19:34:54 +0200 Subject: [PATCH 4/4] std: Mark the static constants in str.rs as private static variables are pub by default, which is not reflected in our code (we need to use priv). --- src/libstd/str.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/libstd/str.rs b/src/libstd/str.rs index fafd2ecf7a5..1026b454c07 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -726,7 +726,7 @@ pub fn count_bytes<'b>(s: &'b str, start: uint, n: uint) -> uint { } // https://tools.ietf.org/html/rfc3629 -static UTF8_CHAR_WIDTH: [u8, ..256] = [ +priv static UTF8_CHAR_WIDTH: [u8, ..256] = [ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, @@ -757,15 +757,15 @@ pub struct CharRange { } // UTF-8 tags and ranges -static TAG_CONT_U8: u8 = 128u8; -static TAG_CONT: uint = 128u; -static MAX_ONE_B: uint = 128u; -static TAG_TWO_B: uint = 192u; -static MAX_TWO_B: uint = 2048u; -static TAG_THREE_B: uint = 224u; -static MAX_THREE_B: uint = 65536u; -static TAG_FOUR_B: uint = 240u; -static MAX_UNICODE: uint = 1114112u; +priv static TAG_CONT_U8: u8 = 128u8; +priv static TAG_CONT: uint = 128u; +priv static MAX_ONE_B: uint = 128u; +priv static TAG_TWO_B: uint = 192u; +priv static MAX_TWO_B: uint = 2048u; +priv static TAG_THREE_B: uint = 224u; +priv static MAX_THREE_B: uint = 65536u; +priv static TAG_FOUR_B: uint = 240u; +priv static MAX_UNICODE: uint = 1114112u; /// Unsafe operations pub mod raw {