Auto merge of #46735 - Manishearth:memchr-find, r=burntsushi

Use memchr for str::find(char)

This is a 10x improvement for searching for characters.

This also contains the patches from https://github.com/rust-lang/rust/pull/46713 . Feel free to land both separately or together.

cc @mystor @alexcrichton

r? @bluss

fixes #46693
This commit is contained in:
bors 2018-01-01 19:04:33 +00:00
commit b65f0bedd2
3 changed files with 524 additions and 89 deletions

View File

@ -19,6 +19,7 @@
use cmp;
use fmt;
use slice::memchr;
use usize;
// Pattern
@ -127,6 +128,11 @@ pub unsafe trait Searcher<'a> {
fn next(&mut self) -> SearchStep;
/// Find the next `Match` result. See `next()`
///
/// Unlike next(), there is no guarantee that the returned ranges
/// of this and next_reject will overlap. This will return (start_match, end_match),
/// where start_match is the index of where the match begins, and end_match is
/// the index after the end of the match.
#[inline]
fn next_match(&mut self) -> Option<(usize, usize)> {
loop {
@ -138,7 +144,10 @@ pub unsafe trait Searcher<'a> {
}
}
/// Find the next `Reject` result. See `next()`
/// Find the next `Reject` result. See `next()` and `next_match()`
///
/// Unlike next(), there is no guarantee that the returned ranges
/// of this and next_match will overlap.
#[inline]
fn next_reject(&mut self) -> Option<(usize, usize)> {
loop {
@ -234,62 +243,272 @@ pub unsafe trait ReverseSearcher<'a>: Searcher<'a> {
/// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched.
pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {}
/////////////////////////////////////////////////////////////////////////////
// Impl for a CharEq wrapper
// Impl for char
/////////////////////////////////////////////////////////////////////////////
/// Associated type for `<char as Pattern<'a>>::Searcher`.
#[derive(Clone, Debug)]
pub struct CharSearcher<'a> {
haystack: &'a str,
// safety invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack`
// This invariant can be broken *within* next_match and next_match_back, however
// they must exit with fingers on valid code point boundaries.
/// `finger` is the current byte index of the forward search.
/// Imagine that it exists before the byte at its index, i.e.
/// haystack[finger] is the first byte of the slice we must inspect during
/// forward searching
finger: usize,
/// `finger_back` is the current byte index of the reverse search.
/// Imagine that it exists after the byte at its index, i.e.
/// haystack[finger_back - 1] is the last byte of the slice we must inspect during
/// forward searching (and thus the first byte to be inspected when calling next_back())
finger_back: usize,
/// The character being searched for
needle: char,
// safety invariant: `utf8_size` must be less than 5
/// The number of bytes `needle` takes up when encoded in utf8
utf8_size: usize,
/// A utf8 encoded copy of the `needle`
utf8_encoded: [u8; 4],
}
unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
#[inline]
fn haystack(&self) -> &'a str {
self.haystack
}
#[inline]
fn next(&mut self) -> SearchStep {
let old_finger = self.finger;
let slice = unsafe { self.haystack.get_unchecked(old_finger..self.haystack.len()) };
let mut iter = slice.chars();
let old_len = iter.iter.len();
if let Some(ch) = iter.next() {
// add byte offset of current character
// without re-encoding as utf-8
self.finger += old_len - iter.iter.len();
if ch == self.needle {
SearchStep::Match(old_finger, self.finger)
} else {
SearchStep::Reject(old_finger, self.finger)
}
} else {
SearchStep::Done
}
}
#[inline]
fn next_match(&mut self) -> Option<(usize, usize)> {
loop {
// get the haystack after the last character found
let bytes = if let Some(slice) = self.haystack.as_bytes().get(self.finger..) {
slice
} else {
return None;
};
// the last byte of the utf8 encoded needle
let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
if let Some(index) = memchr::memchr(last_byte, bytes) {
// The new finger is the index of the byte we found,
// plus one, since we memchr'd for the last byte of the character.
//
// Note that this doesn't always give us a finger on a UTF8 boundary.
// If we *didn't* find our character
// we may have indexed to the non-last byte of a 3-byte or 4-byte character.
// We can't just skip to the next valid starting byte because a character like
// ꁁ (U+A041 YI SYLLABLE PA), utf-8 `EA 81 81` will have us always find
// the second byte when searching for the third.
//
// However, this is totally okay. While we have the invariant that
// self.finger is on a UTF8 boundary, this invariant is not relid upon
// within this method (it is relied upon in CharSearcher::next()).
//
// We only exit this method when we reach the end of the string, or if we
// find something. When we find something the `finger` will be set
// to a UTF8 boundary.
self.finger += index + 1;
if self.finger >= self.utf8_size {
let found_char = self.finger - self.utf8_size;
if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) {
if slice == &self.utf8_encoded[0..self.utf8_size] {
return Some((found_char, self.finger));
}
}
}
} else {
// found nothing, exit
self.finger = self.haystack.len();
return None;
}
}
}
// let next_reject use the default implementation from the Searcher trait
}
unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
#[inline]
fn next_back(&mut self) -> SearchStep {
let old_finger = self.finger_back;
let slice = unsafe { self.haystack.slice_unchecked(0, old_finger) };
let mut iter = slice.chars();
let old_len = iter.iter.len();
if let Some(ch) = iter.next_back() {
// subtract byte offset of current character
// without re-encoding as utf-8
self.finger_back -= old_len - iter.iter.len();
if ch == self.needle {
SearchStep::Match(self.finger_back, old_finger)
} else {
SearchStep::Reject(self.finger_back, old_finger)
}
} else {
SearchStep::Done
}
}
#[inline]
fn next_match_back(&mut self) -> Option<(usize, usize)> {
let haystack = self.haystack.as_bytes();
loop {
// get the haystack up to but not including the last character searched
let bytes = if let Some(slice) = haystack.get(..self.finger_back) {
slice
} else {
return None;
};
// the last byte of the utf8 encoded needle
let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
if let Some(index) = memchr::memrchr(last_byte, bytes) {
// memrchr will return the index of the byte we wish to
// find. In case of an ASCII character, this is indeed
// were we wish our new finger to be ("after" the found
// char in the paradigm of reverse iteration). For
// multibyte chars we need to skip down by the number of more
// bytes they have than ASCII
let shift = self.utf8_size - 1;
if index >= shift {
let found_char = index - shift;
if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) {
if slice == &self.utf8_encoded[0..self.utf8_size] {
// move finger to before the character found (i.e. at its start index)
self.finger_back = found_char;
return Some((self.finger_back, self.finger_back + self.utf8_size));
}
}
}
// We can't use finger_back = index - size + 1 here. If we found the last char
// of a different-sized character (or the middle byte of a different character)
// we need to bump the finger_back down to `index`. This similarly makes
// `finger_back` have the potential to no longer be on a boundary,
// but this is OK since we only exit this function on a boundary
// or when the haystack has been searched completely.
//
// Unlike next_match this does not
// have the problem of repeated bytes in utf-8 because
// we're searching for the last byte, and we can only have
// found the last byte when searching in reverse.
self.finger_back = index;
} else {
self.finger_back = 0;
// found nothing, exit
return None;
}
}
}
// let next_reject_back use the default implementation from the Searcher trait
}
impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {}
/// Searches for chars that are equal to a given char
impl<'a> Pattern<'a> for char {
type Searcher = CharSearcher<'a>;
#[inline]
fn into_searcher(self, haystack: &'a str) -> Self::Searcher {
let mut utf8_encoded = [0; 4];
self.encode_utf8(&mut utf8_encoded);
let utf8_size = self.len_utf8();
CharSearcher {
haystack,
finger: 0,
finger_back: haystack.len(),
needle: self,
utf8_size,
utf8_encoded
}
}
#[inline]
fn is_contained_in(self, haystack: &'a str) -> bool {
if (self as u32) < 128 {
haystack.as_bytes().contains(&(self as u8))
} else {
let mut buffer = [0u8; 4];
self.encode_utf8(&mut buffer).is_contained_in(haystack)
}
}
#[inline]
fn is_prefix_of(self, haystack: &'a str) -> bool {
if let Some(ch) = haystack.chars().next() {
self == ch
} else {
false
}
}
#[inline]
fn is_suffix_of(self, haystack: &'a str) -> bool where Self::Searcher: ReverseSearcher<'a>
{
if let Some(ch) = haystack.chars().next_back() {
self == ch
} else {
false
}
}
}
/////////////////////////////////////////////////////////////////////////////
// Impl for a MultiCharEq wrapper
/////////////////////////////////////////////////////////////////////////////
#[doc(hidden)]
trait CharEq {
trait MultiCharEq {
fn matches(&mut self, c: char) -> bool;
fn only_ascii(&self) -> bool;
}
impl CharEq for char {
#[inline]
fn matches(&mut self, c: char) -> bool { *self == c }
#[inline]
fn only_ascii(&self) -> bool { (*self as u32) < 128 }
}
impl<F> CharEq for F where F: FnMut(char) -> bool {
impl<F> MultiCharEq for F where F: FnMut(char) -> bool {
#[inline]
fn matches(&mut self, c: char) -> bool { (*self)(c) }
#[inline]
fn only_ascii(&self) -> bool { false }
}
impl<'a> CharEq for &'a [char] {
impl<'a> MultiCharEq for &'a [char] {
#[inline]
fn matches(&mut self, c: char) -> bool {
self.iter().any(|&m| { let mut m = m; m.matches(c) })
}
#[inline]
fn only_ascii(&self) -> bool {
self.iter().all(|m| m.only_ascii())
self.iter().any(|&m| { m == c })
}
}
struct CharEqPattern<C: CharEq>(C);
struct MultiCharEqPattern<C: MultiCharEq>(C);
#[derive(Clone, Debug)]
struct CharEqSearcher<'a, C: CharEq> {
struct MultiCharEqSearcher<'a, C: MultiCharEq> {
char_eq: C,
haystack: &'a str,
char_indices: super::CharIndices<'a>,
#[allow(dead_code)]
ascii_only: bool,
}
impl<'a, C: CharEq> Pattern<'a> for CharEqPattern<C> {
type Searcher = CharEqSearcher<'a, C>;
impl<'a, C: MultiCharEq> Pattern<'a> for MultiCharEqPattern<C> {
type Searcher = MultiCharEqSearcher<'a, C>;
#[inline]
fn into_searcher(self, haystack: &'a str) -> CharEqSearcher<'a, C> {
CharEqSearcher {
ascii_only: self.0.only_ascii(),
fn into_searcher(self, haystack: &'a str) -> MultiCharEqSearcher<'a, C> {
MultiCharEqSearcher {
haystack,
char_eq: self.0,
char_indices: haystack.char_indices(),
@ -297,7 +516,7 @@ impl<'a, C: CharEq> Pattern<'a> for CharEqPattern<C> {
}
}
unsafe impl<'a, C: CharEq> Searcher<'a> for CharEqSearcher<'a, C> {
unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> {
#[inline]
fn haystack(&self) -> &'a str {
self.haystack
@ -322,7 +541,7 @@ unsafe impl<'a, C: CharEq> Searcher<'a> for CharEqSearcher<'a, C> {
}
}
unsafe impl<'a, C: CharEq> ReverseSearcher<'a> for CharEqSearcher<'a, C> {
unsafe impl<'a, C: MultiCharEq> ReverseSearcher<'a> for MultiCharEqSearcher<'a, C> {
#[inline]
fn next_back(&mut self) -> SearchStep {
let s = &mut self.char_indices;
@ -342,7 +561,7 @@ unsafe impl<'a, C: CharEq> ReverseSearcher<'a> for CharEqSearcher<'a, C> {
}
}
impl<'a, C: CharEq> DoubleEndedSearcher<'a> for CharEqSearcher<'a, C> {}
impl<'a, C: MultiCharEq> DoubleEndedSearcher<'a> for MultiCharEqSearcher<'a, C> {}
/////////////////////////////////////////////////////////////////////////////
@ -409,55 +628,6 @@ macro_rules! searcher_methods {
}
}
/////////////////////////////////////////////////////////////////////////////
// Impl for char
/////////////////////////////////////////////////////////////////////////////
/// Associated type for `<char as Pattern<'a>>::Searcher`.
#[derive(Clone, Debug)]
pub struct CharSearcher<'a>(<CharEqPattern<char> as Pattern<'a>>::Searcher);
unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
searcher_methods!(forward);
}
unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
searcher_methods!(reverse);
}
impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {}
/// Searches for chars that are equal to a given char
impl<'a> Pattern<'a> for char {
type Searcher = CharSearcher<'a>;
#[inline]
fn into_searcher(self, haystack: &'a str) -> Self::Searcher {
CharSearcher(CharEqPattern(self).into_searcher(haystack))
}
#[inline]
fn is_contained_in(self, haystack: &'a str) -> bool {
if (self as u32) < 128 {
haystack.as_bytes().contains(&(self as u8))
} else {
let mut buffer = [0u8; 4];
self.encode_utf8(&mut buffer).is_contained_in(haystack)
}
}
#[inline]
fn is_prefix_of(self, haystack: &'a str) -> bool {
CharEqPattern(self).is_prefix_of(haystack)
}
#[inline]
fn is_suffix_of(self, haystack: &'a str) -> bool where Self::Searcher: ReverseSearcher<'a>
{
CharEqPattern(self).is_suffix_of(haystack)
}
}
/////////////////////////////////////////////////////////////////////////////
// Impl for &[char]
/////////////////////////////////////////////////////////////////////////////
@ -466,7 +636,7 @@ impl<'a> Pattern<'a> for char {
/// Associated type for `<&[char] as Pattern<'a>>::Searcher`.
#[derive(Clone, Debug)]
pub struct CharSliceSearcher<'a, 'b>(<CharEqPattern<&'b [char]> as Pattern<'a>>::Searcher);
pub struct CharSliceSearcher<'a, 'b>(<MultiCharEqPattern<&'b [char]> as Pattern<'a>>::Searcher);
unsafe impl<'a, 'b> Searcher<'a> for CharSliceSearcher<'a, 'b> {
searcher_methods!(forward);
@ -480,7 +650,7 @@ impl<'a, 'b> DoubleEndedSearcher<'a> for CharSliceSearcher<'a, 'b> {}
/// Searches for chars that are equal to any of the chars in the array
impl<'a, 'b> Pattern<'a> for &'b [char] {
pattern_methods!(CharSliceSearcher<'a, 'b>, CharEqPattern, CharSliceSearcher);
pattern_methods!(CharSliceSearcher<'a, 'b>, MultiCharEqPattern, CharSliceSearcher);
}
/////////////////////////////////////////////////////////////////////////////
@ -489,7 +659,7 @@ impl<'a, 'b> Pattern<'a> for &'b [char] {
/// Associated type for `<F as Pattern<'a>>::Searcher`.
#[derive(Clone)]
pub struct CharPredicateSearcher<'a, F>(<CharEqPattern<F> as Pattern<'a>>::Searcher)
pub struct CharPredicateSearcher<'a, F>(<MultiCharEqPattern<F> as Pattern<'a>>::Searcher)
where F: FnMut(char) -> bool;
impl<'a, F> fmt::Debug for CharPredicateSearcher<'a, F>
@ -499,7 +669,6 @@ impl<'a, F> fmt::Debug for CharPredicateSearcher<'a, F>
f.debug_struct("CharPredicateSearcher")
.field("haystack", &self.0.haystack)
.field("char_indices", &self.0.char_indices)
.field("ascii_only", &self.0.ascii_only)
.finish()
}
}
@ -520,7 +689,7 @@ impl<'a, F> DoubleEndedSearcher<'a> for CharPredicateSearcher<'a, F>
/// Searches for chars that match the given predicate
impl<'a, F> Pattern<'a> for F where F: FnMut(char) -> bool {
pattern_methods!(CharPredicateSearcher<'a, F>, CharEqPattern, CharPredicateSearcher);
pattern_methods!(CharPredicateSearcher<'a, F>, MultiCharEqPattern, CharPredicateSearcher);
}
/////////////////////////////////////////////////////////////////////////////

View File

@ -28,6 +28,7 @@
#![feature(iter_rfind)]
#![feature(iter_rfold)]
#![feature(nonzero)]
#![feature(pattern)]
#![feature(raw)]
#![feature(refcell_replace_swap)]
#![feature(sip_hash_13)]
@ -61,6 +62,7 @@ mod nonzero;
mod num;
mod ops;
mod option;
mod pattern;
mod ptr;
mod result;
mod slice;

View File

@ -0,0 +1,264 @@
// Copyright 2017 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use std::str::pattern::*;
// This macro makes it easier to write
// tests that do a series of iterations
macro_rules! search_asserts {
($haystack:expr, $needle:expr, $testname:expr, [$($func:ident),*], $result:expr) => {
let mut searcher = $needle.into_searcher($haystack);
let arr = [$( Step::from(searcher.$func()) ),+];
assert_eq!(&arr[..], &$result, $testname);
}
}
/// Combined enum for the results of next() and next_match()/next_reject()
#[derive(Debug, PartialEq, Eq)]
enum Step {
// variant names purposely chosen to
// be the same length for easy alignment
Matches(usize, usize),
Rejects(usize, usize),
InRange(usize, usize),
Done
}
use self::Step::*;
impl From<SearchStep> for Step {
fn from(x: SearchStep) -> Self {
match x {
SearchStep::Match(a, b) => Matches(a, b),
SearchStep::Reject(a, b) => Rejects(a, b),
SearchStep::Done => Done
}
}
}
impl From<Option<(usize, usize)>> for Step {
fn from(x: Option<(usize, usize)>) -> Self {
match x {
Some((a, b)) => InRange(a, b),
None => Done
}
}
}
// ignore-tidy-linelength
// FIXME(Manishearth) these tests focus on single-character searching (CharSearcher)
// and on next()/next_match(), not next_reject(). This is because
// the memchr changes make next_match() for single chars complex, but next_reject()
// continues to use next() under the hood. We should add more test cases for all
// of these, as well as tests for StrSearcher and higher level tests for str::find() (etc)
#[test]
fn test_simple_iteration() {
search_asserts! ("abcdeabcd", 'a', "forward iteration for ASCII string",
// a b c d e a b c d EOF
[next, next, next, next, next, next, next, next, next, next],
[Matches(0, 1), Rejects(1, 2), Rejects(2, 3), Rejects(3, 4), Rejects(4, 5), Matches(5, 6), Rejects(6, 7), Rejects(7, 8), Rejects(8, 9), Done]
);
search_asserts! ("abcdeabcd", 'a', "reverse iteration for ASCII string",
// d c b a e d c b a EOF
[next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back],
[Rejects(8, 9), Rejects(7, 8), Rejects(6, 7), Matches(5, 6), Rejects(4, 5), Rejects(3, 4), Rejects(2, 3), Rejects(1, 2), Matches(0, 1), Done]
);
search_asserts! ("我爱我的猫", '我', "forward iteration for Chinese string",
// 我 愛 我 的 貓 EOF
[next, next, next, next, next, next],
[Matches(0, 3), Rejects(3, 6), Matches(6, 9), Rejects(9, 12), Rejects(12, 15), Done]
);
search_asserts! ("我的猫说meow", 'm', "forward iteration for mixed string",
// 我 的 猫 说 m e o w EOF
[next, next, next, next, next, next, next, next, next],
[Rejects(0, 3), Rejects(3, 6), Rejects(6, 9), Rejects(9, 12), Matches(12, 13), Rejects(13, 14), Rejects(14, 15), Rejects(15, 16), Done]
);
search_asserts! ("我的猫说meow", '猫', "reverse iteration for mixed string",
// w o e m 说 猫 的 我 EOF
[next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back],
[Rejects(15, 16), Rejects(14, 15), Rejects(13, 14), Rejects(12, 13), Rejects(9, 12), Matches(6, 9), Rejects(3, 6), Rejects(0, 3), Done]
);
}
#[test]
fn test_simple_search() {
search_asserts!("abcdeabcdeabcde", 'a', "next_match for ASCII string",
[next_match, next_match, next_match, next_match],
[InRange(0, 1), InRange(5, 6), InRange(10, 11), Done]
);
search_asserts!("abcdeabcdeabcde", 'a', "next_match_back for ASCII string",
[next_match_back, next_match_back, next_match_back, next_match_back],
[InRange(10, 11), InRange(5, 6), InRange(0, 1), Done]
);
search_asserts!("abcdeab", 'a', "next_reject for ASCII string",
[next_reject, next_reject, next_match, next_reject, next_reject],
[InRange(1, 2), InRange(2, 3), InRange(5, 6), InRange(6, 7), Done]
);
search_asserts!("abcdeabcdeabcde", 'a', "next_reject_back for ASCII string",
[next_reject_back, next_reject_back, next_match_back, next_reject_back, next_reject_back, next_reject_back],
[InRange(14, 15), InRange(13, 14), InRange(10, 11), InRange(9, 10), InRange(8, 9), InRange(7, 8)]
);
}
// Á, 각, ก, 😀 all end in 0x81
// 🁀, ᘀ do not end in 0x81 but contain the byte
// ꁁ has 0x81 as its second and third bytes.
//
// The memchr-using implementation of next_match
// and next_match_back temporarily violate
// the property that the search is always on a unicode boundary,
// which is fine as long as this never reaches next() or next_back().
// So we test if next() is correct after each next_match() as well.
const STRESS: &str = "Áa🁀bÁꁁfg😁각กᘀ각aÁ각ꁁก😁a";
#[test]
fn test_stress_indices() {
// this isn't really a test, more of documentation on the indices of each character in the stresstest string
search_asserts!(STRESS, 'x', "Indices of characters in stress test",
[next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next],
[Rejects(0, 2), // Á
Rejects(2, 3), // a
Rejects(3, 7), // 🁀
Rejects(7, 8), // b
Rejects(8, 10), // Á
Rejects(10, 13), // ꁁ
Rejects(13, 14), // f
Rejects(14, 15), // g
Rejects(15, 19), // 😀
Rejects(19, 22), // 각
Rejects(22, 25), // ก
Rejects(25, 28), // ᘀ
Rejects(28, 31), // 각
Rejects(31, 32), // a
Rejects(32, 34), // Á
Rejects(34, 37), // 각
Rejects(37, 40), // ꁁ
Rejects(40, 43), // ก
Rejects(43, 47), // 😀
Rejects(47, 48), // a
Done]
);
}
#[test]
fn test_forward_search_shared_bytes() {
search_asserts!(STRESS, 'Á', "Forward search for two-byte Latin character",
[next_match, next_match, next_match, next_match],
[InRange(0, 2), InRange(8, 10), InRange(32, 34), Done]
);
search_asserts!(STRESS, 'Á', "Forward search for two-byte Latin character; check if next() still works",
[next_match, next, next_match, next, next_match, next, next_match],
[InRange(0, 2), Rejects(2, 3), InRange(8, 10), Rejects(10, 13), InRange(32, 34), Rejects(34, 37), Done]
);
search_asserts!(STRESS, '각', "Forward search for three-byte Hangul character",
[next_match, next, next_match, next_match, next_match],
[InRange(19, 22), Rejects(22, 25), InRange(28, 31), InRange(34, 37), Done]
);
search_asserts!(STRESS, '각', "Forward search for three-byte Hangul character; check if next() still works",
[next_match, next, next_match, next, next_match, next, next_match],
[InRange(19, 22), Rejects(22, 25), InRange(28, 31), Rejects(31, 32), InRange(34, 37), Rejects(37, 40), Done]
);
search_asserts!(STRESS, 'ก', "Forward search for three-byte Thai character",
[next_match, next, next_match, next, next_match],
[InRange(22, 25), Rejects(25, 28), InRange(40, 43), Rejects(43, 47), Done]
);
search_asserts!(STRESS, 'ก', "Forward search for three-byte Thai character; check if next() still works",
[next_match, next, next_match, next, next_match],
[InRange(22, 25), Rejects(25, 28), InRange(40, 43), Rejects(43, 47), Done]
);
search_asserts!(STRESS, '😁', "Forward search for four-byte emoji",
[next_match, next, next_match, next, next_match],
[InRange(15, 19), Rejects(19, 22), InRange(43, 47), Rejects(47, 48), Done]
);
search_asserts!(STRESS, '😁', "Forward search for four-byte emoji; check if next() still works",
[next_match, next, next_match, next, next_match],
[InRange(15, 19), Rejects(19, 22), InRange(43, 47), Rejects(47, 48), Done]
);
search_asserts!(STRESS, 'ꁁ', "Forward search for three-byte Yi character with repeated bytes",
[next_match, next, next_match, next, next_match],
[InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(40, 43), Done]
);
search_asserts!(STRESS, 'ꁁ', "Forward search for three-byte Yi character with repeated bytes; check if next() still works",
[next_match, next, next_match, next, next_match],
[InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(40, 43), Done]
);
}
#[test]
fn test_reverse_search_shared_bytes() {
search_asserts!(STRESS, 'Á', "Reverse search for two-byte Latin character",
[next_match_back, next_match_back, next_match_back, next_match_back],
[InRange(32, 34), InRange(8, 10), InRange(0, 2), Done]
);
search_asserts!(STRESS, 'Á', "Reverse search for two-byte Latin character; check if next_back() still works",
[next_match_back, next_back, next_match_back, next_back, next_match_back, next_back],
[InRange(32, 34), Rejects(31, 32), InRange(8, 10), Rejects(7, 8), InRange(0, 2), Done]
);
search_asserts!(STRESS, '각', "Reverse search for three-byte Hangul character",
[next_match_back, next_back, next_match_back, next_match_back, next_match_back],
[InRange(34, 37), Rejects(32, 34), InRange(28, 31), InRange(19, 22), Done]
);
search_asserts!(STRESS, '각', "Reverse search for three-byte Hangul character; check if next_back() still works",
[next_match_back, next_back, next_match_back, next_back, next_match_back, next_back, next_match_back],
[InRange(34, 37), Rejects(32, 34), InRange(28, 31), Rejects(25, 28), InRange(19, 22), Rejects(15, 19), Done]
);
search_asserts!(STRESS, 'ก', "Reverse search for three-byte Thai character",
[next_match_back, next_back, next_match_back, next_back, next_match_back],
[InRange(40, 43), Rejects(37, 40), InRange(22, 25), Rejects(19, 22), Done]
);
search_asserts!(STRESS, 'ก', "Reverse search for three-byte Thai character; check if next_back() still works",
[next_match_back, next_back, next_match_back, next_back, next_match_back],
[InRange(40, 43), Rejects(37, 40), InRange(22, 25), Rejects(19, 22), Done]
);
search_asserts!(STRESS, '😁', "Reverse search for four-byte emoji",
[next_match_back, next_back, next_match_back, next_back, next_match_back],
[InRange(43, 47), Rejects(40, 43), InRange(15, 19), Rejects(14, 15), Done]
);
search_asserts!(STRESS, '😁', "Reverse search for four-byte emoji; check if next_back() still works",
[next_match_back, next_back, next_match_back, next_back, next_match_back],
[InRange(43, 47), Rejects(40, 43), InRange(15, 19), Rejects(14, 15), Done]
);
search_asserts!(STRESS, 'ꁁ', "Reverse search for three-byte Yi character with repeated bytes",
[next_match_back, next_back, next_match_back, next_back, next_match_back],
[InRange(37, 40), Rejects(34, 37), InRange(10, 13), Rejects(8, 10), Done]
);
search_asserts!(STRESS, 'ꁁ', "Reverse search for three-byte Yi character with repeated bytes; check if next_back() still works",
[next_match_back, next_back, next_match_back, next_back, next_match_back],
[InRange(37, 40), Rejects(34, 37), InRange(10, 13), Rejects(8, 10), Done]
);
}