From 4550ea79f004215af1490e2c269a16d46b890b9f Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Wed, 13 Dec 2017 13:11:48 -0600 Subject: [PATCH 01/11] Remove the unused ascii_only field in CharEqSearcher --- src/libcore/str/pattern.rs | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index edb7bed4520..3200cfc4982 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -241,23 +241,16 @@ pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} #[doc(hidden)] trait CharEq { fn matches(&mut self, c: char) -> bool; - fn only_ascii(&self) -> bool; } impl CharEq for char { #[inline] fn matches(&mut self, c: char) -> bool { *self == c } - - #[inline] - fn only_ascii(&self) -> bool { (*self as u32) < 128 } } impl CharEq for F where F: FnMut(char) -> bool { #[inline] fn matches(&mut self, c: char) -> bool { (*self)(c) } - - #[inline] - fn only_ascii(&self) -> bool { false } } impl<'a> CharEq for &'a [char] { @@ -265,11 +258,6 @@ impl<'a> CharEq for &'a [char] { fn matches(&mut self, c: char) -> bool { self.iter().any(|&m| { let mut m = m; m.matches(c) }) } - - #[inline] - fn only_ascii(&self) -> bool { - self.iter().all(|m| m.only_ascii()) - } } struct CharEqPattern(C); @@ -279,8 +267,6 @@ struct CharEqSearcher<'a, C: CharEq> { char_eq: C, haystack: &'a str, char_indices: super::CharIndices<'a>, - #[allow(dead_code)] - ascii_only: bool, } impl<'a, C: CharEq> Pattern<'a> for CharEqPattern { @@ -289,7 +275,6 @@ impl<'a, C: CharEq> Pattern<'a> for CharEqPattern { #[inline] fn into_searcher(self, haystack: &'a str) -> CharEqSearcher<'a, C> { CharEqSearcher { - ascii_only: self.0.only_ascii(), haystack, char_eq: self.0, char_indices: haystack.char_indices(), @@ -499,7 +484,6 @@ impl<'a, F> fmt::Debug for CharPredicateSearcher<'a, F> f.debug_struct("CharPredicateSearcher") .field("haystack", &self.0.haystack) .field("char_indices", &self.0.char_indices) - .field("ascii_only", 
&self.0.ascii_only) .finish() } } From 72cab5e3263343502aeb1f21a8a17c7f7e917a50 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Wed, 13 Dec 2017 14:36:49 -0600 Subject: [PATCH 02/11] Split out char searcher from MultiCharSearcher --- src/libcore/str/pattern.rs | 87 +++++++++++++++++++++++++------------- 1 file changed, 58 insertions(+), 29 deletions(-) diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index 3200cfc4982..9dc82851827 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -235,46 +235,41 @@ pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} ///////////////////////////////////////////////////////////////////////////// -// Impl for a CharEq wrapper +// Impl for a MultiCharEq wrapper ///////////////////////////////////////////////////////////////////////////// #[doc(hidden)] -trait CharEq { +trait MultiCharEq { fn matches(&mut self, c: char) -> bool; } -impl CharEq for char { - #[inline] - fn matches(&mut self, c: char) -> bool { *self == c } -} - -impl CharEq for F where F: FnMut(char) -> bool { +impl MultiCharEq for F where F: FnMut(char) -> bool { #[inline] fn matches(&mut self, c: char) -> bool { (*self)(c) } } -impl<'a> CharEq for &'a [char] { +impl<'a> MultiCharEq for &'a [char] { #[inline] fn matches(&mut self, c: char) -> bool { - self.iter().any(|&m| { let mut m = m; m.matches(c) }) + self.iter().any(|&m| { m == c }) } } -struct CharEqPattern(C); +struct MultiCharEqPattern(C); #[derive(Clone, Debug)] -struct CharEqSearcher<'a, C: CharEq> { +struct MultiCharEqSearcher<'a, C: MultiCharEq> { char_eq: C, haystack: &'a str, char_indices: super::CharIndices<'a>, } -impl<'a, C: CharEq> Pattern<'a> for CharEqPattern { - type Searcher = CharEqSearcher<'a, C>; +impl<'a, C: MultiCharEq> Pattern<'a> for MultiCharEqPattern { + type Searcher = MultiCharEqSearcher<'a, C>; #[inline] - fn into_searcher(self, haystack: &'a str) -> CharEqSearcher<'a, C> { - 
CharEqSearcher { + fn into_searcher(self, haystack: &'a str) -> MultiCharEqSearcher<'a, C> { + MultiCharEqSearcher { haystack, char_eq: self.0, char_indices: haystack.char_indices(), @@ -282,7 +277,7 @@ impl<'a, C: CharEq> Pattern<'a> for CharEqPattern { } } -unsafe impl<'a, C: CharEq> Searcher<'a> for CharEqSearcher<'a, C> { +unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> { #[inline] fn haystack(&self) -> &'a str { self.haystack @@ -307,7 +302,7 @@ unsafe impl<'a, C: CharEq> Searcher<'a> for CharEqSearcher<'a, C> { } } -unsafe impl<'a, C: CharEq> ReverseSearcher<'a> for CharEqSearcher<'a, C> { +unsafe impl<'a, C: MultiCharEq> ReverseSearcher<'a> for MultiCharEqSearcher<'a, C> { #[inline] fn next_back(&mut self) -> SearchStep { let s = &mut self.char_indices; @@ -327,7 +322,7 @@ unsafe impl<'a, C: CharEq> ReverseSearcher<'a> for CharEqSearcher<'a, C> { } } -impl<'a, C: CharEq> DoubleEndedSearcher<'a> for CharEqSearcher<'a, C> {} +impl<'a, C: MultiCharEq> DoubleEndedSearcher<'a> for MultiCharEqSearcher<'a, C> {} ///////////////////////////////////////////////////////////////////////////// @@ -400,14 +395,40 @@ macro_rules! searcher_methods { /// Associated type for `>::Searcher`. 
#[derive(Clone, Debug)] -pub struct CharSearcher<'a>( as Pattern<'a>>::Searcher); +pub struct CharSearcher<'a>(&'a str); unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { - searcher_methods!(forward); + #[inline] + fn haystack(&self) -> &'a str { + unimplemented!(); + } + #[inline] + fn next(&mut self) -> SearchStep { + unimplemented!(); + } + #[inline] + fn next_match(&mut self) -> Option<(usize, usize)> { + unimplemented!(); + } + #[inline] + fn next_reject(&mut self) -> Option<(usize, usize)> { + unimplemented!(); + } } unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { - searcher_methods!(reverse); + #[inline] + fn next_back(&mut self) -> SearchStep { + unimplemented!(); + } + #[inline] + fn next_match_back(&mut self) -> Option<(usize, usize)> { + unimplemented!(); + } + #[inline] + fn next_reject_back(&mut self) -> Option<(usize, usize)> { + unimplemented!(); + } } impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {} @@ -418,7 +439,7 @@ impl<'a> Pattern<'a> for char { #[inline] fn into_searcher(self, haystack: &'a str) -> Self::Searcher { - CharSearcher(CharEqPattern(self).into_searcher(haystack)) + CharSearcher(haystack) } #[inline] @@ -433,13 +454,21 @@ impl<'a> Pattern<'a> for char { #[inline] fn is_prefix_of(self, haystack: &'a str) -> bool { - CharEqPattern(self).is_prefix_of(haystack) + if let Some(ch) = haystack.chars().next() { + self == ch + } else { + false + } } #[inline] fn is_suffix_of(self, haystack: &'a str) -> bool where Self::Searcher: ReverseSearcher<'a> { - CharEqPattern(self).is_suffix_of(haystack) + if let Some(ch) = haystack.chars().next_back() { + self == ch + } else { + false + } } } @@ -451,7 +480,7 @@ impl<'a> Pattern<'a> for char { /// Associated type for `<&[char] as Pattern<'a>>::Searcher`. 
#[derive(Clone, Debug)] -pub struct CharSliceSearcher<'a, 'b>( as Pattern<'a>>::Searcher); +pub struct CharSliceSearcher<'a, 'b>( as Pattern<'a>>::Searcher); unsafe impl<'a, 'b> Searcher<'a> for CharSliceSearcher<'a, 'b> { searcher_methods!(forward); @@ -465,7 +494,7 @@ impl<'a, 'b> DoubleEndedSearcher<'a> for CharSliceSearcher<'a, 'b> {} /// Searches for chars that are equal to any of the chars in the array impl<'a, 'b> Pattern<'a> for &'b [char] { - pattern_methods!(CharSliceSearcher<'a, 'b>, CharEqPattern, CharSliceSearcher); + pattern_methods!(CharSliceSearcher<'a, 'b>, MultiCharEqPattern, CharSliceSearcher); } ///////////////////////////////////////////////////////////////////////////// @@ -474,7 +503,7 @@ impl<'a, 'b> Pattern<'a> for &'b [char] { /// Associated type for `>::Searcher`. #[derive(Clone)] -pub struct CharPredicateSearcher<'a, F>( as Pattern<'a>>::Searcher) +pub struct CharPredicateSearcher<'a, F>( as Pattern<'a>>::Searcher) where F: FnMut(char) -> bool; impl<'a, F> fmt::Debug for CharPredicateSearcher<'a, F> @@ -504,7 +533,7 @@ impl<'a, F> DoubleEndedSearcher<'a> for CharPredicateSearcher<'a, F> /// Searches for chars that match the given predicate impl<'a, F> Pattern<'a> for F where F: FnMut(char) -> bool { - pattern_methods!(CharPredicateSearcher<'a, F>, CharEqPattern, CharPredicateSearcher); + pattern_methods!(CharPredicateSearcher<'a, F>, MultiCharEqPattern, CharPredicateSearcher); } ///////////////////////////////////////////////////////////////////////////// From 585ad9ff30e579e929bca2b1221367cc440aa377 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Wed, 13 Dec 2017 14:37:35 -0600 Subject: [PATCH 03/11] Move CharSearcher to its own section in the file --- src/libcore/str/pattern.rs | 167 +++++++++++++++++++------------------ 1 file changed, 84 insertions(+), 83 deletions(-) diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index 9dc82851827..b1b66c9f8d8 100644 --- a/src/libcore/str/pattern.rs +++ 
b/src/libcore/str/pattern.rs @@ -234,6 +234,90 @@ pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { /// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched. pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} + +///////////////////////////////////////////////////////////////////////////// +// Impl for char +///////////////////////////////////////////////////////////////////////////// + +/// Associated type for `>::Searcher`. +#[derive(Clone, Debug)] +pub struct CharSearcher<'a>(&'a str); + +unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { + #[inline] + fn haystack(&self) -> &'a str { + unimplemented!(); + } + #[inline] + fn next(&mut self) -> SearchStep { + unimplemented!(); + } + #[inline] + fn next_match(&mut self) -> Option<(usize, usize)> { + unimplemented!(); + } + #[inline] + fn next_reject(&mut self) -> Option<(usize, usize)> { + unimplemented!(); + } +} + +unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { + #[inline] + fn next_back(&mut self) -> SearchStep { + unimplemented!(); + } + #[inline] + fn next_match_back(&mut self) -> Option<(usize, usize)> { + unimplemented!(); + } + #[inline] + fn next_reject_back(&mut self) -> Option<(usize, usize)> { + unimplemented!(); + } +} + +impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {} + +/// Searches for chars that are equal to a given char +impl<'a> Pattern<'a> for char { + type Searcher = CharSearcher<'a>; + + #[inline] + fn into_searcher(self, haystack: &'a str) -> Self::Searcher { + CharSearcher(haystack) + } + + #[inline] + fn is_contained_in(self, haystack: &'a str) -> bool { + if (self as u32) < 128 { + haystack.as_bytes().contains(&(self as u8)) + } else { + let mut buffer = [0u8; 4]; + self.encode_utf8(&mut buffer).is_contained_in(haystack) + } + } + + #[inline] + fn is_prefix_of(self, haystack: &'a str) -> bool { + if let Some(ch) = haystack.chars().next() { + self == ch + } else { + false + } + } + + #[inline] + fn is_suffix_of(self, haystack: &'a str) -> bool 
where Self::Searcher: ReverseSearcher<'a> + { + if let Some(ch) = haystack.chars().next_back() { + self == ch + } else { + false + } + } +} + ///////////////////////////////////////////////////////////////////////////// // Impl for a MultiCharEq wrapper ///////////////////////////////////////////////////////////////////////////// @@ -389,89 +473,6 @@ macro_rules! searcher_methods { } } -///////////////////////////////////////////////////////////////////////////// -// Impl for char -///////////////////////////////////////////////////////////////////////////// - -/// Associated type for `>::Searcher`. -#[derive(Clone, Debug)] -pub struct CharSearcher<'a>(&'a str); - -unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { - #[inline] - fn haystack(&self) -> &'a str { - unimplemented!(); - } - #[inline] - fn next(&mut self) -> SearchStep { - unimplemented!(); - } - #[inline] - fn next_match(&mut self) -> Option<(usize, usize)> { - unimplemented!(); - } - #[inline] - fn next_reject(&mut self) -> Option<(usize, usize)> { - unimplemented!(); - } -} - -unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { - #[inline] - fn next_back(&mut self) -> SearchStep { - unimplemented!(); - } - #[inline] - fn next_match_back(&mut self) -> Option<(usize, usize)> { - unimplemented!(); - } - #[inline] - fn next_reject_back(&mut self) -> Option<(usize, usize)> { - unimplemented!(); - } -} - -impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {} - -/// Searches for chars that are equal to a given char -impl<'a> Pattern<'a> for char { - type Searcher = CharSearcher<'a>; - - #[inline] - fn into_searcher(self, haystack: &'a str) -> Self::Searcher { - CharSearcher(haystack) - } - - #[inline] - fn is_contained_in(self, haystack: &'a str) -> bool { - if (self as u32) < 128 { - haystack.as_bytes().contains(&(self as u8)) - } else { - let mut buffer = [0u8; 4]; - self.encode_utf8(&mut buffer).is_contained_in(haystack) - } - } - - #[inline] - fn is_prefix_of(self, haystack: &'a str) -> bool { 
- if let Some(ch) = haystack.chars().next() { - self == ch - } else { - false - } - } - - #[inline] - fn is_suffix_of(self, haystack: &'a str) -> bool where Self::Searcher: ReverseSearcher<'a> - { - if let Some(ch) = haystack.chars().next_back() { - self == ch - } else { - false - } - } -} - ///////////////////////////////////////////////////////////////////////////// // Impl for &[char] ///////////////////////////////////////////////////////////////////////////// From d9dc44a5e9857864905e1cdbf40ab9ac617f65e7 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Wed, 13 Dec 2017 15:26:27 -0600 Subject: [PATCH 04/11] Fill in forward searcher impl for char --- src/libcore/str/pattern.rs | 78 +++++++++++++++++++++++++++++++------- 1 file changed, 65 insertions(+), 13 deletions(-) diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index b1b66c9f8d8..3f24374223c 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -19,6 +19,7 @@ use cmp; use fmt; +use slice::memchr; use usize; // Pattern @@ -241,25 +242,66 @@ pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} /// Associated type for `>::Searcher`. #[derive(Clone, Debug)] -pub struct CharSearcher<'a>(&'a str); +pub struct CharSearcher<'a> { + haystack: &'a str, + // invariant: `finger` must be a valid utf8 byte index of `haystack` + finger: usize, + needle: char, + // For ascii chars + // invariant: must be an ASCII byte (no high bit) + single_byte: Option, +} unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { #[inline] fn haystack(&self) -> &'a str { - unimplemented!(); + self.haystack } #[inline] fn next(&mut self) -> SearchStep { - unimplemented!(); + let old_finger = self.finger; + let slice = unsafe { self.haystack.get_unchecked(old_finger..) 
}; + let mut iter = slice.chars(); + let old_len = iter.iter.len(); + if let Some(ch) = iter.next() { + // add byte offset of current character + // without recalculating + self.finger += iter.iter.len() - old_len; + if ch == self.needle { + SearchStep::Match(old_finger, self.finger) + } else { + SearchStep::Reject(old_finger, self.finger) + } + } else { + SearchStep::Done + } } #[inline] fn next_match(&mut self) -> Option<(usize, usize)> { - unimplemented!(); - } - #[inline] - fn next_reject(&mut self) -> Option<(usize, usize)> { - unimplemented!(); + if let Some(byte) = self.single_byte { + let old_finger = self.finger; + let slice = unsafe { self.haystack.get_unchecked(old_finger..) }; + let bytes = slice.as_bytes(); + if let Some(index) = memchr::memchr(byte, bytes) { + // index is the index of a valid ASCII byte, + // so we can add one to it + self.finger += index + 1; + Some((index, self.finger)) + } else { + None + } + } else { + loop { + match self.next() { + SearchStep::Match(a, b) => break Some((a, b)), + SearchStep::Done => break None, + _ => continue, + } + } + } } + + // let next_reject use the default implementation from the Searcher trait } unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { @@ -271,10 +313,8 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { fn next_match_back(&mut self) -> Option<(usize, usize)> { unimplemented!(); } - #[inline] - fn next_reject_back(&mut self) -> Option<(usize, usize)> { - unimplemented!(); - } + + // let next_reject_back use the default implementation from the Searcher trait } impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {} @@ -285,7 +325,19 @@ impl<'a> Pattern<'a> for char { #[inline] fn into_searcher(self, haystack: &'a str) -> Self::Searcher { - CharSearcher(haystack) + let single_byte = if self.len_utf8() == 1 { + let mut storage = [0]; + self.encode_utf8(&mut storage); + Some(storage[0]) + } else { + None + }; + CharSearcher { + haystack, + finger: 0, + needle: self, + single_byte, 
+ } } #[inline] From f865164030ccd167a9e9f9fae665373fb58295fb Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Thu, 14 Dec 2017 14:10:10 -0600 Subject: [PATCH 05/11] Fill in reverse searcher impl for char --- src/libcore/str/pattern.rs | 56 ++++++++++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index 3f24374223c..54e426893bc 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -128,6 +128,11 @@ pub unsafe trait Searcher<'a> { fn next(&mut self) -> SearchStep; /// Find the next `Match` result. See `next()` + /// + /// Unlike next(), there is no guarantee that the returned ranges + /// of this and next_reject will overlap. This will return (start_match, end_match), + /// where start_match is the index of where the match begins, and end_match is + /// the index after the end of the match. #[inline] fn next_match(&mut self) -> Option<(usize, usize)> { loop { @@ -139,7 +144,10 @@ pub unsafe trait Searcher<'a> { } } - /// Find the next `Reject` result. See `next()` + /// Find the next `Reject` result. See `next()` and `next_match()` + /// + /// Unlike next(), there is no guarantee that the returned ranges + /// of this and next_match will overlap. 
#[inline] fn next_reject(&mut self) -> Option<(usize, usize)> { loop { @@ -244,8 +252,9 @@ pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} #[derive(Clone, Debug)] pub struct CharSearcher<'a> { haystack: &'a str, - // invariant: `finger` must be a valid utf8 byte index of `haystack` + // invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack` finger: usize, + finger_back: usize, needle: char, // For ascii chars // invariant: must be an ASCII byte (no high bit) @@ -266,7 +275,7 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { if let Some(ch) = iter.next() { // add byte offset of current character // without recalculating - self.finger += iter.iter.len() - old_len; + self.finger += old_len - iter.iter.len(); if ch == self.needle { SearchStep::Match(old_finger, self.finger) } else { @@ -286,7 +295,7 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { // index is the index of a valid ASCII byte, // so we can add one to it self.finger += index + 1; - Some((index, self.finger)) + Some((self.finger - 1, self.finger)) } else { None } @@ -307,11 +316,45 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { #[inline] fn next_back(&mut self) -> SearchStep { - unimplemented!(); + let old_finger = self.finger_back; + let slice = unsafe { self.haystack.slice_unchecked(0, old_finger) }; + let mut iter = slice.chars(); + let old_len = iter.iter.len(); + if let Some(ch) = iter.next_back() { + // subtract byte offset of current character + // without recalculating + self.finger_back -= old_len - iter.iter.len(); + if ch == self.needle { + SearchStep::Match(self.finger_back, old_finger) + } else { + SearchStep::Reject(self.finger_back, old_finger) + } + } else { + SearchStep::Done + } } #[inline] fn next_match_back(&mut self) -> Option<(usize, usize)> { - unimplemented!(); + if let Some(byte) = self.single_byte { + let old_finger = self.finger_back; + let slice = unsafe { 
self.haystack.slice_unchecked(0, old_finger) }; + let bytes = slice.as_bytes(); + if let Some(index) = memchr::memrchr(byte, bytes) { + // index is the index of a valid ASCII byte + self.finger_back = index; + Some((self.finger_back, self.finger_back + 1)) + } else { + None + } + } else { + loop { + match self.next_back() { + SearchStep::Match(a, b) => break Some((a, b)), + SearchStep::Done => break None, + _ => continue, + } + } + } } // let next_reject_back use the default implementation from the Searcher trait @@ -335,6 +378,7 @@ impl<'a> Pattern<'a> for char { CharSearcher { haystack, finger: 0, + finger_back: haystack.len(), needle: self, single_byte, } From 75c07a37ff352607523a3c7a4e8bc3809949cb4c Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Sat, 16 Dec 2017 22:17:27 -0600 Subject: [PATCH 06/11] Add memchr search support for multibyte characters --- src/libcore/str/pattern.rs | 150 +++++++++++++++++++++++++------------ 1 file changed, 102 insertions(+), 48 deletions(-) diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index 54e426893bc..e44799bb9c5 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -252,13 +252,28 @@ pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} #[derive(Clone, Debug)] pub struct CharSearcher<'a> { haystack: &'a str, - // invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack` + // safety invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack` + // This invariant can be broken *within* next_match and next_match_back, however + // they must exit with fingers on valid code point boundaries. + + /// `finger` is the current byte index of the forward search. + /// Imagine that it exists before the byte at its index, i.e. + /// haystack[finger] is the first byte of the slice we must inspect during + /// forward searching finger: usize, + /// `finger_back` is the current byte index of the reverse search. 
+ /// Imagine that it exists after the byte at its index, i.e. + /// haystack[finger_back - 1] is the last byte of the slice we must inspect during + /// forward searching (and thus the first byte to be inspected when calling next_back()) finger_back: usize, + /// The character being searched for needle: char, - // For ascii chars - // invariant: must be an ASCII byte (no high bit) - single_byte: Option, + + // safety invariant: `utf8_size` must be less than 5 + /// The number of bytes `needle` takes up when encoded in utf8 + utf8_size: usize, + /// A utf8 encoded copy of the `needle` + utf8_encoded: [u8; 4], } unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { @@ -269,12 +284,12 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { #[inline] fn next(&mut self) -> SearchStep { let old_finger = self.finger; - let slice = unsafe { self.haystack.get_unchecked(old_finger..) }; + let slice = unsafe { self.haystack.get_unchecked(old_finger..self.haystack.len()) }; let mut iter = slice.chars(); let old_len = iter.iter.len(); if let Some(ch) = iter.next() { // add byte offset of current character - // without recalculating + // without re-encoding as utf-8 self.finger += old_len - iter.iter.len(); if ch == self.needle { SearchStep::Match(old_finger, self.finger) @@ -287,25 +302,44 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { } #[inline] fn next_match(&mut self) -> Option<(usize, usize)> { - if let Some(byte) = self.single_byte { - let old_finger = self.finger; - let slice = unsafe { self.haystack.get_unchecked(old_finger..) }; - let bytes = slice.as_bytes(); - if let Some(index) = memchr::memchr(byte, bytes) { - // index is the index of a valid ASCII byte, - // so we can add one to it - self.finger += index + 1; - Some((self.finger - 1, self.finger)) + loop { + // get the haystack after the last character found + let bytes = if let Some(slice) = self.haystack.as_bytes().get(self.finger..) 
{ + slice } else { - None - } - } else { - loop { - match self.next() { - SearchStep::Match(a, b) => break Some((a, b)), - SearchStep::Done => break None, - _ => continue, + return None; + }; + // the last byte of the utf8 encoded needle + let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) }; + if let Some(index) = memchr::memchr(last_byte, bytes) { + // The new finger is the index of the byte we found, + // plus one, since we memchr'd for the last byte of the character. + // + // Note that this doesn't always give us a finger on a UTF8 boundary. + // If we *didn't* find our character + // we may have indexed to the non-last byte of a 3-byte or 4-byte character. + // We can't just skip to the next valid starting byte because a character like + // ꁁ (U+A041 YI SYLLABLE PA), utf-8 `EA 81 81` will have us always find + // the second byte when searching for the third. + // + // However, this is totally okay. While we have the invariant that + // self.finger is on a UTF8 boundary, this invariant is not relied upon + // within this method (it is relied upon in CharSearcher::next()). + // + // We only exit this method when we reach the end of the string, or if we + // find something. When we find something the `finger` will be set + // to a UTF8 boundary. 
+ self.finger += index + 1; + let found_char = self.finger - self.utf8_size; + if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) { + if slice == &self.utf8_encoded[0..self.utf8_size] { + return Some((found_char, self.finger)); + } } + } else { + // found nothing, exit + self.finger = self.haystack.len(); + return None; } } } @@ -322,7 +356,7 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { let old_len = iter.iter.len(); if let Some(ch) = iter.next_back() { // subtract byte offset of current character - // without recalculating + // without re-encoding as utf-8 self.finger_back -= old_len - iter.iter.len(); if ch == self.needle { SearchStep::Match(self.finger_back, old_finger) @@ -335,24 +369,47 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { } #[inline] fn next_match_back(&mut self) -> Option<(usize, usize)> { - if let Some(byte) = self.single_byte { - let old_finger = self.finger_back; - let slice = unsafe { self.haystack.slice_unchecked(0, old_finger) }; - let bytes = slice.as_bytes(); - if let Some(index) = memchr::memrchr(byte, bytes) { - // index is the index of a valid ASCII byte - self.finger_back = index; - Some((self.finger_back, self.finger_back + 1)) + let haystack = self.haystack.as_bytes(); + loop { + // get the haystack up to but not including the last character searched + let bytes = if let Some(slice) = haystack.get(..self.finger_back) { + slice } else { - None - } - } else { - loop { - match self.next_back() { - SearchStep::Match(a, b) => break Some((a, b)), - SearchStep::Done => break None, - _ => continue, + return None; + }; + // the last byte of the utf8 encoded needle + let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) }; + if let Some(index) = memchr::memrchr(last_byte, bytes) { + // memrchr will return the index of the byte we wish to + // find. 
In case of an ASCII character, this is indeed + // where we wish our new finger to be ("after" the found + // char in the paradigm of reverse iteration). For + // multibyte chars we need to skip down by the number of more + // bytes they have than ASCII + let found_char = index - (self.utf8_size - 1); + if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) { + if slice == &self.utf8_encoded[0..self.utf8_size] { + // move finger to before the character found (i.e. at its start index) + self.finger_back = found_char; + return Some((self.finger_back, self.finger_back + self.utf8_size)); + } } + // We can't use finger_back = index - size + 1 here. If we found the last byte + // of a different-sized character (or the middle byte of a different character) + // we need to bump the finger_back down to `index`. This similarly makes + // `finger_back` have the potential to no longer be on a boundary, + // but this is OK since we only exit this function on a boundary + // or when the haystack has been searched completely. + // + // Unlike next_match this does not + // have the problem of repeated bytes in utf-8 because + // we're searching for the last byte, and we can only have + // found the last byte when searching in reverse. 
+ self.finger_back = index; + } else { + self.finger_back = 0; + // found nothing, exit + return None; } } } @@ -368,19 +425,16 @@ impl<'a> Pattern<'a> for char { #[inline] fn into_searcher(self, haystack: &'a str) -> Self::Searcher { - let single_byte = if self.len_utf8() == 1 { - let mut storage = [0]; - self.encode_utf8(&mut storage); - Some(storage[0]) - } else { - None - }; + let mut utf8_encoded = [0; 4]; + self.encode_utf8(&mut utf8_encoded); + let utf8_size = self.len_utf8(); CharSearcher { haystack, finger: 0, finger_back: haystack.len(), needle: self, - single_byte, + utf8_size, + utf8_encoded } } From efcc447ebfafde91eba51ae04cdb8b0b776f8ac8 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Sun, 17 Dec 2017 14:44:03 -0800 Subject: [PATCH 07/11] Add simple test for pattern API --- src/libcore/tests/lib.rs | 2 + src/libcore/tests/pattern.rs | 76 ++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 src/libcore/tests/pattern.rs diff --git a/src/libcore/tests/lib.rs b/src/libcore/tests/lib.rs index 0e445cdac35..c4b85b82981 100644 --- a/src/libcore/tests/lib.rs +++ b/src/libcore/tests/lib.rs @@ -28,6 +28,7 @@ #![feature(iter_rfind)] #![feature(iter_rfold)] #![feature(nonzero)] +#![feature(pattern)] #![feature(raw)] #![feature(refcell_replace_swap)] #![feature(sip_hash_13)] @@ -61,6 +62,7 @@ mod nonzero; mod num; mod ops; mod option; +mod pattern; mod ptr; mod result; mod slice; diff --git a/src/libcore/tests/pattern.rs b/src/libcore/tests/pattern.rs new file mode 100644 index 00000000000..e12f0bc9e5f --- /dev/null +++ b/src/libcore/tests/pattern.rs @@ -0,0 +1,76 @@ +use std::str::pattern::*; + +// This macro makes it easier to write +// tests that do a series of iterations +macro_rules! 
search_asserts { + ($haystack:expr, $needle:expr, $testname:expr, [$($func:ident),*], $result:expr) => { + let mut searcher = $needle.into_searcher($haystack); + let arr = [$( Step::from(searcher.$func()) ),+]; + assert_eq!(&arr[..], &$result, $testname); + } +} + +/// Combined enum for the results of next() and next_match()/next_reject() +#[derive(Debug, PartialEq, Eq)] +enum Step { + // variant names purposely chosen to + // be the same length for easy alignment + Matches(usize, usize), + Rejects(usize, usize), + InRange(usize, usize), + Done +} + +use Step::*; + +impl From<SearchStep> for Step { + fn from(x: SearchStep) -> Self { + match x { + SearchStep::Match(a, b) => Matches(a, b), + SearchStep::Reject(a, b) => Rejects(a, b), + SearchStep::Done => Done + } + } +} + +impl From<Option<(usize, usize)>> for Step { + fn from(x: Option<(usize, usize)>) -> Self { + match x { + Some((a, b)) => InRange(a, b), + None => Done + } + } +} + +#[test] +fn test_simple_iteration() { + search_asserts! ("abcdeabcd", 'a', "forward iteration for ASCII string", + // a b c d e a b c d EOF + [next, next, next, next, next, next, next, next, next, next], + [Matches(0, 1), Rejects(1, 2), Rejects(2, 3), Rejects(3, 4), Rejects(4, 5), Matches(5, 6), Rejects(6, 7), Rejects(7, 8), Rejects(8, 9), Done] + ); + + search_asserts! ("abcdeabcd", 'a', "reverse iteration for ASCII string", + // d c b a e d c b a EOF + [next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back], + [Rejects(8, 9), Rejects(7, 8), Rejects(6, 7), Matches(5, 6), Rejects(4, 5), Rejects(3, 4), Rejects(2, 3), Rejects(1, 2), Matches(0, 1), Done] + ); + + search_asserts! ("我爱我的猫", '我', "forward iteration for Chinese string", + // 我 爱 我 的 猫 EOF + [next, next, next, next, next, next], + [Matches(0, 3), Rejects(3, 6), Matches(6, 9), Rejects(9, 12), Rejects(12, 15), Done] + ); + + search_asserts! 
("我的猫说meow", 'm', "forward iteration for mixed string", + // 我 的 猫 说 m e o w EOF + [next, next, next, next, next, next, next, next, next], + [Rejects(0, 3), Rejects(3, 6), Rejects(6, 9), Rejects(9, 12), Matches(12, 13), Rejects(13, 14), Rejects(14, 15), Rejects(15, 16), Done] + ); + + search_asserts! ("我的猫说meow", '猫', "reverse iteration for mixed string", + // w o e m 说 猫 的 我 EOF + [next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back], + [Rejects(15, 16), Rejects(14, 15), Rejects(13, 14), Rejects(12, 13), Rejects(9, 12), Matches(6, 9), Rejects(3, 6), Rejects(0, 3), Done] + ); +} From bc5535557662fb7851d80ff1538b5518af921571 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Sun, 17 Dec 2017 15:05:29 -0800 Subject: [PATCH 08/11] Add simple search test for pattern API --- src/libcore/tests/pattern.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/libcore/tests/pattern.rs b/src/libcore/tests/pattern.rs index e12f0bc9e5f..bb0a618f6b8 100644 --- a/src/libcore/tests/pattern.rs +++ b/src/libcore/tests/pattern.rs @@ -74,3 +74,27 @@ fn test_simple_iteration() { [Rejects(15, 16), Rejects(14, 15), Rejects(13, 14), Rejects(12, 13), Rejects(9, 12), Matches(6, 9), Rejects(3, 6), Rejects(0, 3), Done] ); } + +#[test] +fn test_simple_search() { + search_asserts!("abcdeabcdeabcde", 'a', "next_match for ASCII string", + [next_match, next_match, next_match, next_match], + [InRange(0, 1), InRange(5, 6), InRange(10, 11), Done] + ); + + search_asserts!("abcdeabcdeabcde", 'a', "next_match_back for ASCII string", + [next_match_back, next_match_back, next_match_back, next_match_back], + [InRange(10, 11), InRange(5, 6), InRange(0, 1), Done] + ); + + search_asserts!("abcdeab", 'a', "next_reject for ASCII string", + [next_reject, next_reject, next_match, next_reject, next_reject], + [InRange(1, 2), InRange(2, 3), InRange(5, 6), InRange(6, 7), Done] + ); + + search_asserts!("abcdeabcdeabcde", 'a',
"next_reject_back for ASCII string", + [next_reject_back, next_reject_back, next_match_back, next_reject_back, next_reject_back, next_reject_back], + [InRange(14, 15), InRange(13, 14), InRange(10, 11), InRange(9, 10), InRange(8, 9), InRange(7, 8)] + ); +} + From 9b92a4419d6a76a9de6d56adb3084d97e3e31d20 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Mon, 18 Dec 2017 03:48:07 -0800 Subject: [PATCH 09/11] Add stresstests for shared bytes for pattern API --- src/libcore/tests/pattern.rs | 154 ++++++++++++++++++++++++++++++++++- 1 file changed, 153 insertions(+), 1 deletion(-) diff --git a/src/libcore/tests/pattern.rs b/src/libcore/tests/pattern.rs index bb0a618f6b8..7fe274a79ed 100644 --- a/src/libcore/tests/pattern.rs +++ b/src/libcore/tests/pattern.rs @@ -21,7 +21,7 @@ enum Step { Done } -use Step::*; +use self::Step::*; impl From for Step { fn from(x: SearchStep) -> Self { @@ -42,6 +42,12 @@ impl From> for Step { } } +// XXXManishearth these tests focus on single-character searching (CharSearcher) +// and on next()/next_match(), not next_reject(). This is because +// the memchr changes make next_match() for single chars complex, but next_reject() +// continues to use next() under the hood. We should add more test cases for all +// of these, as well as tests for StrSearcher and higher level tests for str::find() (etc) + #[test] fn test_simple_iteration() { search_asserts! ("abcdeabcd", 'a', "forward iteration for ASCII string", @@ -98,3 +104,149 @@ fn test_simple_search() { ); } +// Á, 각, ก, 😁 all end in 0x81 +// 🁀, ᘀ do not end in 0x81 but contain the byte +// ꁁ has 0x81 as its second and third bytes. +// +// The memchr-using implementations of next_match +// and next_match_back temporarily violate +// the property that the search is always on a Unicode character boundary, +// which is fine as long as this never reaches next() or next_back(). +// So we test if next() is correct after each next_match() as well.
+const STRESS: &str = "Áa🁀bÁꁁfg😁각กᘀ각aÁ각ꁁก😁a"; + +#[test] +fn test_stress_indices() { + // this isn't really a test, more of documentation on the indices of each character in the stresstest string + + search_asserts!(STRESS, 'x', "Indices of characters in stress test", + [next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next], + [Rejects(0, 2), // Á + Rejects(2, 3), // a + Rejects(3, 7), // 🁀 + Rejects(7, 8), // b + Rejects(8, 10), // Á + Rejects(10, 13), // ꁁ + Rejects(13, 14), // f + Rejects(14, 15), // g + Rejects(15, 19), // 😁 + Rejects(19, 22), // 각 + Rejects(22, 25), // ก + Rejects(25, 28), // ᘀ + Rejects(28, 31), // 각 + Rejects(31, 32), // a + Rejects(32, 34), // Á + Rejects(34, 37), // 각 + Rejects(37, 40), // ꁁ + Rejects(40, 43), // ก + Rejects(43, 47), // 😁 + Rejects(47, 48), // a + Done] + ); +} + +#[test] +fn test_forward_search_shared_bytes() { + search_asserts!(STRESS, 'Á', "Forward search for two-byte Latin character", + [next_match, next_match, next_match, next_match], + [InRange(0, 2), InRange(8, 10), InRange(32, 34), Done] + ); + + search_asserts!(STRESS, 'Á', "Forward search for two-byte Latin character; check if next() still works", + [next_match, next, next_match, next, next_match, next, next_match], + [InRange(0, 2), Rejects(2, 3), InRange(8, 10), Rejects(10, 13), InRange(32, 34), Rejects(34, 37), Done] + ); + + search_asserts!(STRESS, '각', "Forward search for three-byte Hangul character", + [next_match, next, next_match, next_match, next_match], + [InRange(19, 22), Rejects(22, 25), InRange(28, 31), InRange(34, 37), Done] + ); + + search_asserts!(STRESS, '각', "Forward search for three-byte Hangul character; check if next() still works", + [next_match, next, next_match, next, next_match, next, next_match], + [InRange(19, 22), Rejects(22, 25), InRange(28, 31), Rejects(31, 32), InRange(34, 37), Rejects(37, 40), Done] + ); + + search_asserts!(STRESS, 'ก', "Forward search for
three-byte Thai character", + [next_match, next, next_match, next, next_match], + [InRange(22, 25), Rejects(25, 28), InRange(40, 43), Rejects(43, 47), Done] + ); + + search_asserts!(STRESS, 'ก', "Forward search for three-byte Thai character; check if next() still works", + [next_match, next, next_match, next, next_match], + [InRange(22, 25), Rejects(25, 28), InRange(40, 43), Rejects(43, 47), Done] + ); + + search_asserts!(STRESS, '😁', "Forward search for four-byte emoji", + [next_match, next, next_match, next, next_match], + [InRange(15, 19), Rejects(19, 22), InRange(43, 47), Rejects(47, 48), Done] + ); + + search_asserts!(STRESS, '😁', "Forward search for four-byte emoji; check if next() still works", + [next_match, next, next_match, next, next_match], + [InRange(15, 19), Rejects(19, 22), InRange(43, 47), Rejects(47, 48), Done] + ); + + search_asserts!(STRESS, 'ꁁ', "Forward search for three-byte Yi character with repeated bytes", + [next_match, next, next_match, next, next_match], + [InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(40, 43), Done] + ); + + search_asserts!(STRESS, 'ꁁ', "Forward search for three-byte Yi character with repeated bytes; check if next() still works", + [next_match, next, next_match, next, next_match], + [InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(40, 43), Done] + ); +} + +#[test] +fn test_reverse_search_shared_bytes() { + search_asserts!(STRESS, 'Á', "Reverse search for two-byte Latin character", + [next_match_back, next_match_back, next_match_back, next_match_back], + [InRange(32, 34), InRange(8, 10), InRange(0, 2), Done] + ); + + search_asserts!(STRESS, 'Á', "Reverse search for two-byte Latin character; check if next_back() still works", + [next_match_back, next_back, next_match_back, next_back, next_match_back, next_back], + [InRange(32, 34), Rejects(31, 32), InRange(8, 10), Rejects(7, 8), InRange(0, 2), Done] + ); + + search_asserts!(STRESS, '각', "Reverse search for three-byte Hangul character", + 
[next_match_back, next_back, next_match_back, next_match_back, next_match_back], + [InRange(34, 37), Rejects(32, 34), InRange(28, 31), InRange(19, 22), Done] + ); + + search_asserts!(STRESS, '각', "Reverse search for three-byte Hangul character; check if next_back() still works", + [next_match_back, next_back, next_match_back, next_back, next_match_back, next_back, next_match_back], + [InRange(34, 37), Rejects(32, 34), InRange(28, 31), Rejects(25, 28), InRange(19, 22), Rejects(15, 19), Done] + ); + + search_asserts!(STRESS, 'ก', "Reverse search for three-byte Thai character", + [next_match_back, next_back, next_match_back, next_back, next_match_back], + [InRange(40, 43), Rejects(37, 40), InRange(22, 25), Rejects(19, 22), Done] + ); + + search_asserts!(STRESS, 'ก', "Reverse search for three-byte Thai character; check if next_back() still works", + [next_match_back, next_back, next_match_back, next_back, next_match_back], + [InRange(40, 43), Rejects(37, 40), InRange(22, 25), Rejects(19, 22), Done] + ); + + search_asserts!(STRESS, '😁', "Reverse search for four-byte emoji", + [next_match_back, next_back, next_match_back, next_back, next_match_back], + [InRange(43, 47), Rejects(40, 43), InRange(15, 19), Rejects(14, 15), Done] + ); + + search_asserts!(STRESS, '😁', "Reverse search for four-byte emoji; check if next_back() still works", + [next_match_back, next_back, next_match_back, next_back, next_match_back], + [InRange(43, 47), Rejects(40, 43), InRange(15, 19), Rejects(14, 15), Done] + ); + + search_asserts!(STRESS, 'ꁁ', "Reverse search for three-byte Yi character with repeated bytes", + [next_match_back, next_back, next_match_back, next_back, next_match_back], + [InRange(37, 40), Rejects(34, 37), InRange(10, 13), Rejects(8, 10), Done] + ); + + search_asserts!(STRESS, 'ꁁ', "Reverse search for three-byte Yi character with repeated bytes; check if next_back() still works", + [next_match_back, next_back, next_match_back, next_back, next_match_back], + [InRange(37, 40), 
Rejects(34, 37), InRange(10, 13), Rejects(8, 10), Done] + ); +} From 85919a0b5f474783cb56cd433292865a40539665 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Fri, 22 Dec 2017 11:19:50 +0530 Subject: [PATCH 10/11] Pass tidy for tests --- src/libcore/tests/pattern.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/libcore/tests/pattern.rs b/src/libcore/tests/pattern.rs index 7fe274a79ed..d0fd15263b2 100644 --- a/src/libcore/tests/pattern.rs +++ b/src/libcore/tests/pattern.rs @@ -1,3 +1,13 @@ +// Copyright 2017 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + use std::str::pattern::*; // This macro makes it easier to write @@ -42,7 +52,9 @@ impl From> for Step { } } -// XXXManishearth these tests focus on single-character searching (CharSearcher) +// ignore-tidy-linelength + +// FIXME(Manishearth) these tests focus on single-character searching (CharSearcher) // and on next()/next_match(), not next_reject(). This is because // the memchr changes make next_match() for single chars complex, but next_reject() // continues to use next() under the hood. We should add more test cases for all @@ -51,7 +63,7 @@ impl From> for Step { #[test] fn test_simple_iteration() { search_asserts! 
("abcdeabcd", 'a', "forward iteration for ASCII string", - // a b c d e a b c d EOF + // a b c d e a b c d EOF [next, next, next, next, next, next, next, next, next, next], [Matches(0, 1), Rejects(1, 2), Rejects(2, 3), Rejects(3, 4), Rejects(4, 5), Matches(5, 6), Rejects(6, 7), Rejects(7, 8), Rejects(8, 9), Done] ); From 5cf55165fae5c8538db5c00e252ad9ba42aaf246 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Mon, 1 Jan 2018 19:55:21 +0530 Subject: [PATCH 11/11] handle overflow/underflow in index offsets --- src/libcore/str/pattern.rs | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index e44799bb9c5..677c0ecc33d 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -330,10 +330,12 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { // find something. When we find something the `finger` will be set // to a UTF8 boundary. self.finger += index + 1; - let found_char = self.finger - self.utf8_size; - if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) { - if slice == &self.utf8_encoded[0..self.utf8_size] { - return Some((found_char, self.finger)); + if self.finger >= self.utf8_size { + let found_char = self.finger - self.utf8_size; + if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) { + if slice == &self.utf8_encoded[0..self.utf8_size] { + return Some((found_char, self.finger)); + } } } } else { @@ -386,12 +388,15 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { // char in the paradigm of reverse iteration). For // multibyte chars we need to skip down by the number of more // bytes they have than ASCII - let found_char = index - (self.utf8_size - 1); - if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) { - if slice == &self.utf8_encoded[0..self.utf8_size] { - // move finger to before the character found (i.e. 
at its start index) - self.finger_back = found_char; - return Some((self.finger_back, self.finger_back + self.utf8_size)); + let shift = self.utf8_size - 1; + if index >= shift { + let found_char = index - shift; + if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) { + if slice == &self.utf8_encoded[0..self.utf8_size] { + // move finger to before the character found (i.e. at its start index) + self.finger_back = found_char; + return Some((self.finger_back, self.finger_back + self.utf8_size)); + } } } // We can't use finger_back = index - size + 1 here. If we found the last char