Auto merge of #46735 - Manishearth:memchr-find, r=burntsushi
Use memchr for str::find(char). This is a 10x improvement for searching for characters. This also contains the patches from https://github.com/rust-lang/rust/pull/46713. Feel free to land both separately or together. cc @mystor @alexcrichton r? @bluss fixes #46693
This commit is contained in:
commit
b65f0bedd2
@ -19,6 +19,7 @@
|
||||
|
||||
use cmp;
|
||||
use fmt;
|
||||
use slice::memchr;
|
||||
use usize;
|
||||
|
||||
// Pattern
|
||||
@ -127,6 +128,11 @@ pub unsafe trait Searcher<'a> {
|
||||
fn next(&mut self) -> SearchStep;
|
||||
|
||||
/// Find the next `Match` result. See `next()`
|
||||
///
|
||||
/// Unlike next(), there is no guarantee that the returned ranges
|
||||
/// of this and next_reject will overlap. This will return (start_match, end_match),
|
||||
/// where start_match is the index of where the match begins, and end_match is
|
||||
/// the index after the end of the match.
|
||||
#[inline]
|
||||
fn next_match(&mut self) -> Option<(usize, usize)> {
|
||||
loop {
|
||||
@ -138,7 +144,10 @@ pub unsafe trait Searcher<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Find the next `Reject` result. See `next()`
|
||||
/// Find the next `Reject` result. See `next()` and `next_match()`
|
||||
///
|
||||
/// Unlike next(), there is no guarantee that the returned ranges
|
||||
/// of this and next_match will overlap.
|
||||
#[inline]
|
||||
fn next_reject(&mut self) -> Option<(usize, usize)> {
|
||||
loop {
|
||||
@ -234,62 +243,272 @@ pub unsafe trait ReverseSearcher<'a>: Searcher<'a> {
|
||||
/// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched.
|
||||
pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {}
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// Impl for a CharEq wrapper
|
||||
// Impl for char
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Associated type for `<char as Pattern<'a>>::Searcher`.
///
/// Stores the haystack, one cursor per search direction, and the needle both
/// as a `char` and as its UTF-8 encoding.
#[derive(Clone, Debug)]
pub struct CharSearcher<'a> {
    haystack: &'a str,
    // safety invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack`
    // This invariant can be broken *within* next_match and next_match_back, however
    // they must exit with fingers on valid code point boundaries.

    /// `finger` is the current byte index of the forward search.
    /// Imagine that it exists before the byte at its index, i.e.
    /// haystack[finger] is the first byte of the slice we must inspect during
    /// forward searching
    finger: usize,
    /// `finger_back` is the current byte index of the reverse search.
    /// Imagine that it exists after the byte at its index, i.e.
    /// haystack[finger_back - 1] is the last byte of the slice we must inspect during
    /// forward searching (and thus the first byte to be inspected when calling next_back())
    finger_back: usize,
    /// The character being searched for
    needle: char,

    // safety invariant: `utf8_size` must be less than 5
    // (it must also be at least 1: `utf8_size - 1` is used as an unchecked
    // index into the 4-byte `utf8_encoded` array)
    /// The number of bytes `needle` takes up when encoded in utf8
    utf8_size: usize,
    /// A utf8 encoded copy of the `needle`
    utf8_encoded: [u8; 4],
}
|
||||
|
||||
unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
|
||||
#[inline]
|
||||
fn haystack(&self) -> &'a str {
|
||||
self.haystack
|
||||
}
|
||||
#[inline]
|
||||
fn next(&mut self) -> SearchStep {
|
||||
let old_finger = self.finger;
|
||||
let slice = unsafe { self.haystack.get_unchecked(old_finger..self.haystack.len()) };
|
||||
let mut iter = slice.chars();
|
||||
let old_len = iter.iter.len();
|
||||
if let Some(ch) = iter.next() {
|
||||
// add byte offset of current character
|
||||
// without re-encoding as utf-8
|
||||
self.finger += old_len - iter.iter.len();
|
||||
if ch == self.needle {
|
||||
SearchStep::Match(old_finger, self.finger)
|
||||
} else {
|
||||
SearchStep::Reject(old_finger, self.finger)
|
||||
}
|
||||
} else {
|
||||
SearchStep::Done
|
||||
}
|
||||
}
|
||||
#[inline]
|
||||
fn next_match(&mut self) -> Option<(usize, usize)> {
|
||||
loop {
|
||||
// get the haystack after the last character found
|
||||
let bytes = if let Some(slice) = self.haystack.as_bytes().get(self.finger..) {
|
||||
slice
|
||||
} else {
|
||||
return None;
|
||||
};
|
||||
// the last byte of the utf8 encoded needle
|
||||
let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
|
||||
if let Some(index) = memchr::memchr(last_byte, bytes) {
|
||||
// The new finger is the index of the byte we found,
|
||||
// plus one, since we memchr'd for the last byte of the character.
|
||||
//
|
||||
// Note that this doesn't always give us a finger on a UTF8 boundary.
|
||||
// If we *didn't* find our character
|
||||
// we may have indexed to the non-last byte of a 3-byte or 4-byte character.
|
||||
// We can't just skip to the next valid starting byte because a character like
|
||||
// ꁁ (U+A041 YI SYLLABLE PA), utf-8 `EA 81 81` will have us always find
|
||||
// the second byte when searching for the third.
|
||||
//
|
||||
// However, this is totally okay. While we have the invariant that
|
||||
// self.finger is on a UTF8 boundary, this invariant is not relid upon
|
||||
// within this method (it is relied upon in CharSearcher::next()).
|
||||
//
|
||||
// We only exit this method when we reach the end of the string, or if we
|
||||
// find something. When we find something the `finger` will be set
|
||||
// to a UTF8 boundary.
|
||||
self.finger += index + 1;
|
||||
if self.finger >= self.utf8_size {
|
||||
let found_char = self.finger - self.utf8_size;
|
||||
if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) {
|
||||
if slice == &self.utf8_encoded[0..self.utf8_size] {
|
||||
return Some((found_char, self.finger));
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// found nothing, exit
|
||||
self.finger = self.haystack.len();
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// let next_reject use the default implementation from the Searcher trait
|
||||
}
|
||||
|
||||
unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
|
||||
#[inline]
|
||||
fn next_back(&mut self) -> SearchStep {
|
||||
let old_finger = self.finger_back;
|
||||
let slice = unsafe { self.haystack.slice_unchecked(0, old_finger) };
|
||||
let mut iter = slice.chars();
|
||||
let old_len = iter.iter.len();
|
||||
if let Some(ch) = iter.next_back() {
|
||||
// subtract byte offset of current character
|
||||
// without re-encoding as utf-8
|
||||
self.finger_back -= old_len - iter.iter.len();
|
||||
if ch == self.needle {
|
||||
SearchStep::Match(self.finger_back, old_finger)
|
||||
} else {
|
||||
SearchStep::Reject(self.finger_back, old_finger)
|
||||
}
|
||||
} else {
|
||||
SearchStep::Done
|
||||
}
|
||||
}
|
||||
#[inline]
|
||||
fn next_match_back(&mut self) -> Option<(usize, usize)> {
|
||||
let haystack = self.haystack.as_bytes();
|
||||
loop {
|
||||
// get the haystack up to but not including the last character searched
|
||||
let bytes = if let Some(slice) = haystack.get(..self.finger_back) {
|
||||
slice
|
||||
} else {
|
||||
return None;
|
||||
};
|
||||
// the last byte of the utf8 encoded needle
|
||||
let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
|
||||
if let Some(index) = memchr::memrchr(last_byte, bytes) {
|
||||
// memrchr will return the index of the byte we wish to
|
||||
// find. In case of an ASCII character, this is indeed
|
||||
// were we wish our new finger to be ("after" the found
|
||||
// char in the paradigm of reverse iteration). For
|
||||
// multibyte chars we need to skip down by the number of more
|
||||
// bytes they have than ASCII
|
||||
let shift = self.utf8_size - 1;
|
||||
if index >= shift {
|
||||
let found_char = index - shift;
|
||||
if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) {
|
||||
if slice == &self.utf8_encoded[0..self.utf8_size] {
|
||||
// move finger to before the character found (i.e. at its start index)
|
||||
self.finger_back = found_char;
|
||||
return Some((self.finger_back, self.finger_back + self.utf8_size));
|
||||
}
|
||||
}
|
||||
}
|
||||
// We can't use finger_back = index - size + 1 here. If we found the last char
|
||||
// of a different-sized character (or the middle byte of a different character)
|
||||
// we need to bump the finger_back down to `index`. This similarly makes
|
||||
// `finger_back` have the potential to no longer be on a boundary,
|
||||
// but this is OK since we only exit this function on a boundary
|
||||
// or when the haystack has been searched completely.
|
||||
//
|
||||
// Unlike next_match this does not
|
||||
// have the problem of repeated bytes in utf-8 because
|
||||
// we're searching for the last byte, and we can only have
|
||||
// found the last byte when searching in reverse.
|
||||
self.finger_back = index;
|
||||
} else {
|
||||
self.finger_back = 0;
|
||||
// found nothing, exit
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// let next_reject_back use the default implementation from the Searcher trait
|
||||
}
|
||||
|
||||
// Marker impl (the trait declares no methods): a `CharSearcher` may be driven
// from both ends of the haystack.
impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {}
|
||||
|
||||
/// Searches for chars that are equal to a given char
|
||||
impl<'a> Pattern<'a> for char {
|
||||
type Searcher = CharSearcher<'a>;
|
||||
|
||||
#[inline]
|
||||
fn into_searcher(self, haystack: &'a str) -> Self::Searcher {
|
||||
let mut utf8_encoded = [0; 4];
|
||||
self.encode_utf8(&mut utf8_encoded);
|
||||
let utf8_size = self.len_utf8();
|
||||
CharSearcher {
|
||||
haystack,
|
||||
finger: 0,
|
||||
finger_back: haystack.len(),
|
||||
needle: self,
|
||||
utf8_size,
|
||||
utf8_encoded
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_contained_in(self, haystack: &'a str) -> bool {
|
||||
if (self as u32) < 128 {
|
||||
haystack.as_bytes().contains(&(self as u8))
|
||||
} else {
|
||||
let mut buffer = [0u8; 4];
|
||||
self.encode_utf8(&mut buffer).is_contained_in(haystack)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_prefix_of(self, haystack: &'a str) -> bool {
|
||||
if let Some(ch) = haystack.chars().next() {
|
||||
self == ch
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_suffix_of(self, haystack: &'a str) -> bool where Self::Searcher: ReverseSearcher<'a>
|
||||
{
|
||||
if let Some(ch) = haystack.chars().next_back() {
|
||||
self == ch
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// Impl for a MultiCharEq wrapper
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#[doc(hidden)]
|
||||
trait CharEq {
|
||||
trait MultiCharEq {
|
||||
fn matches(&mut self, c: char) -> bool;
|
||||
fn only_ascii(&self) -> bool;
|
||||
}
|
||||
|
||||
impl CharEq for char {
|
||||
#[inline]
|
||||
fn matches(&mut self, c: char) -> bool { *self == c }
|
||||
|
||||
#[inline]
|
||||
fn only_ascii(&self) -> bool { (*self as u32) < 128 }
|
||||
}
|
||||
|
||||
impl<F> CharEq for F where F: FnMut(char) -> bool {
|
||||
impl<F> MultiCharEq for F where F: FnMut(char) -> bool {
|
||||
#[inline]
|
||||
fn matches(&mut self, c: char) -> bool { (*self)(c) }
|
||||
|
||||
#[inline]
|
||||
fn only_ascii(&self) -> bool { false }
|
||||
}
|
||||
|
||||
impl<'a> CharEq for &'a [char] {
|
||||
impl<'a> MultiCharEq for &'a [char] {
|
||||
#[inline]
|
||||
fn matches(&mut self, c: char) -> bool {
|
||||
self.iter().any(|&m| { let mut m = m; m.matches(c) })
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn only_ascii(&self) -> bool {
|
||||
self.iter().all(|m| m.only_ascii())
|
||||
self.iter().any(|&m| { m == c })
|
||||
}
|
||||
}
|
||||
|
||||
struct CharEqPattern<C: CharEq>(C);
|
||||
struct MultiCharEqPattern<C: MultiCharEq>(C);
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
struct CharEqSearcher<'a, C: CharEq> {
|
||||
struct MultiCharEqSearcher<'a, C: MultiCharEq> {
|
||||
char_eq: C,
|
||||
haystack: &'a str,
|
||||
char_indices: super::CharIndices<'a>,
|
||||
#[allow(dead_code)]
|
||||
ascii_only: bool,
|
||||
}
|
||||
|
||||
impl<'a, C: CharEq> Pattern<'a> for CharEqPattern<C> {
|
||||
type Searcher = CharEqSearcher<'a, C>;
|
||||
impl<'a, C: MultiCharEq> Pattern<'a> for MultiCharEqPattern<C> {
|
||||
type Searcher = MultiCharEqSearcher<'a, C>;
|
||||
|
||||
#[inline]
|
||||
fn into_searcher(self, haystack: &'a str) -> CharEqSearcher<'a, C> {
|
||||
CharEqSearcher {
|
||||
ascii_only: self.0.only_ascii(),
|
||||
fn into_searcher(self, haystack: &'a str) -> MultiCharEqSearcher<'a, C> {
|
||||
MultiCharEqSearcher {
|
||||
haystack,
|
||||
char_eq: self.0,
|
||||
char_indices: haystack.char_indices(),
|
||||
@ -297,7 +516,7 @@ impl<'a, C: CharEq> Pattern<'a> for CharEqPattern<C> {
|
||||
}
|
||||
}
|
||||
|
||||
unsafe impl<'a, C: CharEq> Searcher<'a> for CharEqSearcher<'a, C> {
|
||||
unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> {
|
||||
#[inline]
|
||||
fn haystack(&self) -> &'a str {
|
||||
self.haystack
|
||||
@ -322,7 +541,7 @@ unsafe impl<'a, C: CharEq> Searcher<'a> for CharEqSearcher<'a, C> {
|
||||
}
|
||||
}
|
||||
|
||||
unsafe impl<'a, C: CharEq> ReverseSearcher<'a> for CharEqSearcher<'a, C> {
|
||||
unsafe impl<'a, C: MultiCharEq> ReverseSearcher<'a> for MultiCharEqSearcher<'a, C> {
|
||||
#[inline]
|
||||
fn next_back(&mut self) -> SearchStep {
|
||||
let s = &mut self.char_indices;
|
||||
@ -342,7 +561,7 @@ unsafe impl<'a, C: CharEq> ReverseSearcher<'a> for CharEqSearcher<'a, C> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, C: CharEq> DoubleEndedSearcher<'a> for CharEqSearcher<'a, C> {}
|
||||
impl<'a, C: MultiCharEq> DoubleEndedSearcher<'a> for MultiCharEqSearcher<'a, C> {}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@ -409,55 +628,6 @@ macro_rules! searcher_methods {
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// Impl for char
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Associated type for `<char as Pattern<'a>>::Searcher`.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct CharSearcher<'a>(<CharEqPattern<char> as Pattern<'a>>::Searcher);
|
||||
|
||||
unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
|
||||
searcher_methods!(forward);
|
||||
}
|
||||
|
||||
unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
|
||||
searcher_methods!(reverse);
|
||||
}
|
||||
|
||||
impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {}
|
||||
|
||||
/// Searches for chars that are equal to a given char
|
||||
impl<'a> Pattern<'a> for char {
|
||||
type Searcher = CharSearcher<'a>;
|
||||
|
||||
#[inline]
|
||||
fn into_searcher(self, haystack: &'a str) -> Self::Searcher {
|
||||
CharSearcher(CharEqPattern(self).into_searcher(haystack))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_contained_in(self, haystack: &'a str) -> bool {
|
||||
if (self as u32) < 128 {
|
||||
haystack.as_bytes().contains(&(self as u8))
|
||||
} else {
|
||||
let mut buffer = [0u8; 4];
|
||||
self.encode_utf8(&mut buffer).is_contained_in(haystack)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_prefix_of(self, haystack: &'a str) -> bool {
|
||||
CharEqPattern(self).is_prefix_of(haystack)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_suffix_of(self, haystack: &'a str) -> bool where Self::Searcher: ReverseSearcher<'a>
|
||||
{
|
||||
CharEqPattern(self).is_suffix_of(haystack)
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// Impl for &[char]
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
@ -466,7 +636,7 @@ impl<'a> Pattern<'a> for char {
|
||||
|
||||
/// Associated type for `<&[char] as Pattern<'a>>::Searcher`.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct CharSliceSearcher<'a, 'b>(<CharEqPattern<&'b [char]> as Pattern<'a>>::Searcher);
|
||||
pub struct CharSliceSearcher<'a, 'b>(<MultiCharEqPattern<&'b [char]> as Pattern<'a>>::Searcher);
|
||||
|
||||
unsafe impl<'a, 'b> Searcher<'a> for CharSliceSearcher<'a, 'b> {
|
||||
searcher_methods!(forward);
|
||||
@ -480,7 +650,7 @@ impl<'a, 'b> DoubleEndedSearcher<'a> for CharSliceSearcher<'a, 'b> {}
|
||||
|
||||
/// Searches for chars that are equal to any of the chars in the array
|
||||
impl<'a, 'b> Pattern<'a> for &'b [char] {
|
||||
pattern_methods!(CharSliceSearcher<'a, 'b>, CharEqPattern, CharSliceSearcher);
|
||||
pattern_methods!(CharSliceSearcher<'a, 'b>, MultiCharEqPattern, CharSliceSearcher);
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
@ -489,7 +659,7 @@ impl<'a, 'b> Pattern<'a> for &'b [char] {
|
||||
|
||||
/// Associated type for `<F as Pattern<'a>>::Searcher`.
|
||||
#[derive(Clone)]
|
||||
pub struct CharPredicateSearcher<'a, F>(<CharEqPattern<F> as Pattern<'a>>::Searcher)
|
||||
pub struct CharPredicateSearcher<'a, F>(<MultiCharEqPattern<F> as Pattern<'a>>::Searcher)
|
||||
where F: FnMut(char) -> bool;
|
||||
|
||||
impl<'a, F> fmt::Debug for CharPredicateSearcher<'a, F>
|
||||
@ -499,7 +669,6 @@ impl<'a, F> fmt::Debug for CharPredicateSearcher<'a, F>
|
||||
f.debug_struct("CharPredicateSearcher")
|
||||
.field("haystack", &self.0.haystack)
|
||||
.field("char_indices", &self.0.char_indices)
|
||||
.field("ascii_only", &self.0.ascii_only)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
@ -520,7 +689,7 @@ impl<'a, F> DoubleEndedSearcher<'a> for CharPredicateSearcher<'a, F>
|
||||
|
||||
/// Searches for chars that match the given predicate
|
||||
impl<'a, F> Pattern<'a> for F where F: FnMut(char) -> bool {
|
||||
pattern_methods!(CharPredicateSearcher<'a, F>, CharEqPattern, CharPredicateSearcher);
|
||||
pattern_methods!(CharPredicateSearcher<'a, F>, MultiCharEqPattern, CharPredicateSearcher);
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -28,6 +28,7 @@
|
||||
#![feature(iter_rfind)]
|
||||
#![feature(iter_rfold)]
|
||||
#![feature(nonzero)]
|
||||
#![feature(pattern)]
|
||||
#![feature(raw)]
|
||||
#![feature(refcell_replace_swap)]
|
||||
#![feature(sip_hash_13)]
|
||||
@ -61,6 +62,7 @@ mod nonzero;
|
||||
mod num;
|
||||
mod ops;
|
||||
mod option;
|
||||
mod pattern;
|
||||
mod ptr;
|
||||
mod result;
|
||||
mod slice;
|
||||
|
264
src/libcore/tests/pattern.rs
Normal file
264
src/libcore/tests/pattern.rs
Normal file
@ -0,0 +1,264 @@
|
||||
// Copyright 2017 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use std::str::pattern::*;
|
||||
|
||||
// This macro makes it easier to write
// tests that do a series of iterations.
//
// It builds a searcher from `$needle.into_searcher($haystack)`, calls each
// listed method on it in order, converts every result with `Step::from`, and
// compares the collected steps against `$result`. `$testname` is used as the
// assertion message.
macro_rules! search_asserts {
    ($haystack:expr, $needle:expr, $testname:expr, [$($func:ident),*], $result:expr) => {
        let mut searcher = $needle.into_searcher($haystack);
        // same repetition operator as the matcher above
        let arr = [$( Step::from(searcher.$func()) ),*];
        assert_eq!(&arr[..], &$result, $testname);
    }
}
|
||||
|
||||
/// Combined enum for the results of next() and next_match()/next_reject()
///
/// `Matches`/`Rejects` are produced from `SearchStep::Match`/`Reject`,
/// `InRange` from a `Some((start, end))` range, and `Done` from either
/// `SearchStep::Done` or `None` (see the `From` impls in this file).
#[derive(Debug, PartialEq, Eq)]
enum Step {
    // variant names purposely chosen to
    // be the same length for easy alignment
    Matches(usize, usize),
    Rejects(usize, usize),
    InRange(usize, usize),
    Done
}
|
||||
|
||||
use self::Step::*;
|
||||
|
||||
impl From<SearchStep> for Step {
|
||||
fn from(x: SearchStep) -> Self {
|
||||
match x {
|
||||
SearchStep::Match(a, b) => Matches(a, b),
|
||||
SearchStep::Reject(a, b) => Rejects(a, b),
|
||||
SearchStep::Done => Done
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Option<(usize, usize)>> for Step {
|
||||
fn from(x: Option<(usize, usize)>) -> Self {
|
||||
match x {
|
||||
Some((a, b)) => InRange(a, b),
|
||||
None => Done
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ignore-tidy-linelength
|
||||
|
||||
// FIXME(Manishearth) these tests focus on single-character searching (CharSearcher)
|
||||
// and on next()/next_match(), not next_reject(). This is because
|
||||
// the memchr changes make next_match() for single chars complex, but next_reject()
|
||||
// continues to use next() under the hood. We should add more test cases for all
|
||||
// of these, as well as tests for StrSearcher and higher level tests for str::find() (etc)
|
||||
|
||||
/// Steps `next()` / `next_back()` one character at a time over ASCII,
/// Chinese, and mixed haystacks and checks every reported byte range up to
/// and including the final `Done`.
#[test]
fn test_simple_iteration() {
    search_asserts! ("abcdeabcd", 'a', "forward iteration for ASCII string",
        // a b c d e a b c d EOF
        [next, next, next, next, next, next, next, next, next, next],
        [Matches(0, 1), Rejects(1, 2), Rejects(2, 3), Rejects(3, 4), Rejects(4, 5), Matches(5, 6), Rejects(6, 7), Rejects(7, 8), Rejects(8, 9), Done]
    );

    search_asserts! ("abcdeabcd", 'a', "reverse iteration for ASCII string",
        // d c b a e d c b a EOF
        [next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back],
        [Rejects(8, 9), Rejects(7, 8), Rejects(6, 7), Matches(5, 6), Rejects(4, 5), Rejects(3, 4), Rejects(2, 3), Rejects(1, 2), Matches(0, 1), Done]
    );

    search_asserts! ("我爱我的猫", '我', "forward iteration for Chinese string",
        // 我 爱 我 的 猫 EOF
        [next, next, next, next, next, next],
        [Matches(0, 3), Rejects(3, 6), Matches(6, 9), Rejects(9, 12), Rejects(12, 15), Done]
    );

    search_asserts! ("我的猫说meow", 'm', "forward iteration for mixed string",
        // 我 的 猫 说 m e o w EOF
        [next, next, next, next, next, next, next, next, next],
        [Rejects(0, 3), Rejects(3, 6), Rejects(6, 9), Rejects(9, 12), Matches(12, 13), Rejects(13, 14), Rejects(14, 15), Rejects(15, 16), Done]
    );

    search_asserts! ("我的猫说meow", '猫', "reverse iteration for mixed string",
        // w o e m 说 猫 的 我 EOF
        [next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back],
        [Rejects(15, 16), Rejects(14, 15), Rejects(13, 14), Rejects(12, 13), Rejects(9, 12), Matches(6, 9), Rejects(3, 6), Rejects(0, 3), Done]
    );
}
|
||||
|
||||
/// Exercises `next_match`, `next_match_back`, `next_reject` and
/// `next_reject_back` for an ASCII needle in ASCII haystacks.
#[test]
fn test_simple_search() {
    search_asserts!("abcdeabcdeabcde", 'a', "next_match for ASCII string",
        [next_match, next_match, next_match, next_match],
        [InRange(0, 1), InRange(5, 6), InRange(10, 11), Done]
    );

    search_asserts!("abcdeabcdeabcde", 'a', "next_match_back for ASCII string",
        [next_match_back, next_match_back, next_match_back, next_match_back],
        [InRange(10, 11), InRange(5, 6), InRange(0, 1), Done]
    );

    // mixes next_reject with a next_match in the middle of the sequence
    search_asserts!("abcdeab", 'a', "next_reject for ASCII string",
        [next_reject, next_reject, next_match, next_reject, next_reject],
        [InRange(1, 2), InRange(2, 3), InRange(5, 6), InRange(6, 7), Done]
    );

    // mixes next_reject_back with a next_match_back in the middle of the sequence
    search_asserts!("abcdeabcdeabcde", 'a', "next_reject_back for ASCII string",
        [next_reject_back, next_reject_back, next_match_back, next_reject_back, next_reject_back, next_reject_back],
        [InRange(14, 15), InRange(13, 14), InRange(10, 11), InRange(9, 10), InRange(8, 9), InRange(7, 8)]
    );
}
|
||||
|
||||
// Á, 각, ก, 😁 all end in 0x81
|
||||
// 🁀, ᘀ do not end in 0x81 but contain the byte
|
||||
// ꁁ has 0x81 as its second and third bytes.
|
||||
//
|
||||
// The memchr-using implementation of next_match
|
||||
// and next_match_back temporarily violate
|
||||
// the property that the search is always on a unicode boundary,
|
||||
// which is fine as long as this never reaches next() or next_back().
|
||||
// So we test if next() is correct after each next_match() as well.
|
||||
const STRESS: &str = "Áa🁀bÁꁁfg😁각กᘀ각aÁ각ꁁก😁a";
|
||||
|
||||
/// Walks `STRESS` with a needle that never matches ('x') so that every
/// character is reported as a `Rejects` range; the expected list doubles as a
/// table of each character's byte range.
#[test]
fn test_stress_indices() {
    // this isn't really a test, more of documentation on the indices of each character in the stresstest string

    search_asserts!(STRESS, 'x', "Indices of characters in stress test",
        [next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next],
        [Rejects(0, 2), // Á
         Rejects(2, 3), // a
         Rejects(3, 7), // 🁀
         Rejects(7, 8), // b
         Rejects(8, 10), // Á
         Rejects(10, 13), // ꁁ
         Rejects(13, 14), // f
         Rejects(14, 15), // g
         Rejects(15, 19), // 😁
         Rejects(19, 22), // 각
         Rejects(22, 25), // ก
         Rejects(25, 28), // ᘀ
         Rejects(28, 31), // 각
         Rejects(31, 32), // a
         Rejects(32, 34), // Á
         Rejects(34, 37), // 각
         Rejects(37, 40), // ꁁ
         Rejects(40, 43), // ก
         Rejects(43, 47), // 😁
         Rejects(47, 48), // a
         Done]
    );
}
|
||||
|
||||
/// Forward `next_match` searches over `STRESS` for needles whose UTF-8
/// encodings share bytes with other characters in the haystack.
///
/// Most cases interleave `next()` calls between the `next_match()` calls to
/// confirm that `next()` still reports whole-character ranges afterwards.
#[test]
fn test_forward_search_shared_bytes() {
    search_asserts!(STRESS, 'Á', "Forward search for two-byte Latin character",
        [next_match, next_match, next_match, next_match],
        [InRange(0, 2), InRange(8, 10), InRange(32, 34), Done]
    );

    search_asserts!(STRESS, 'Á', "Forward search for two-byte Latin character; check if next() still works",
        [next_match, next, next_match, next, next_match, next, next_match],
        [InRange(0, 2), Rejects(2, 3), InRange(8, 10), Rejects(10, 13), InRange(32, 34), Rejects(34, 37), Done]
    );

    search_asserts!(STRESS, '각', "Forward search for three-byte Hangul character",
        [next_match, next, next_match, next_match, next_match],
        [InRange(19, 22), Rejects(22, 25), InRange(28, 31), InRange(34, 37), Done]
    );

    search_asserts!(STRESS, '각', "Forward search for three-byte Hangul character; check if next() still works",
        [next_match, next, next_match, next, next_match, next, next_match],
        [InRange(19, 22), Rejects(22, 25), InRange(28, 31), Rejects(31, 32), InRange(34, 37), Rejects(37, 40), Done]
    );

    search_asserts!(STRESS, 'ก', "Forward search for three-byte Thai character",
        [next_match, next, next_match, next, next_match],
        [InRange(22, 25), Rejects(25, 28), InRange(40, 43), Rejects(43, 47), Done]
    );

    search_asserts!(STRESS, 'ก', "Forward search for three-byte Thai character; check if next() still works",
        [next_match, next, next_match, next, next_match],
        [InRange(22, 25), Rejects(25, 28), InRange(40, 43), Rejects(43, 47), Done]
    );

    search_asserts!(STRESS, '😁', "Forward search for four-byte emoji",
        [next_match, next, next_match, next, next_match],
        [InRange(15, 19), Rejects(19, 22), InRange(43, 47), Rejects(47, 48), Done]
    );

    search_asserts!(STRESS, '😁', "Forward search for four-byte emoji; check if next() still works",
        [next_match, next, next_match, next, next_match],
        [InRange(15, 19), Rejects(19, 22), InRange(43, 47), Rejects(47, 48), Done]
    );

    search_asserts!(STRESS, 'ꁁ', "Forward search for three-byte Yi character with repeated bytes",
        [next_match, next, next_match, next, next_match],
        [InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(40, 43), Done]
    );

    search_asserts!(STRESS, 'ꁁ', "Forward search for three-byte Yi character with repeated bytes; check if next() still works",
        [next_match, next, next_match, next, next_match],
        [InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(40, 43), Done]
    );
}
|
||||
|
||||
/// Reverse `next_match_back` searches over `STRESS` for needles whose UTF-8
/// encodings share bytes with other characters in the haystack.
///
/// Most cases interleave `next_back()` calls between the `next_match_back()`
/// calls to confirm that `next_back()` still reports whole-character ranges
/// afterwards.
#[test]
fn test_reverse_search_shared_bytes() {
    search_asserts!(STRESS, 'Á', "Reverse search for two-byte Latin character",
        [next_match_back, next_match_back, next_match_back, next_match_back],
        [InRange(32, 34), InRange(8, 10), InRange(0, 2), Done]
    );

    search_asserts!(STRESS, 'Á', "Reverse search for two-byte Latin character; check if next_back() still works",
        [next_match_back, next_back, next_match_back, next_back, next_match_back, next_back],
        [InRange(32, 34), Rejects(31, 32), InRange(8, 10), Rejects(7, 8), InRange(0, 2), Done]
    );

    search_asserts!(STRESS, '각', "Reverse search for three-byte Hangul character",
        [next_match_back, next_back, next_match_back, next_match_back, next_match_back],
        [InRange(34, 37), Rejects(32, 34), InRange(28, 31), InRange(19, 22), Done]
    );

    search_asserts!(STRESS, '각', "Reverse search for three-byte Hangul character; check if next_back() still works",
        [next_match_back, next_back, next_match_back, next_back, next_match_back, next_back, next_match_back],
        [InRange(34, 37), Rejects(32, 34), InRange(28, 31), Rejects(25, 28), InRange(19, 22), Rejects(15, 19), Done]
    );

    search_asserts!(STRESS, 'ก', "Reverse search for three-byte Thai character",
        [next_match_back, next_back, next_match_back, next_back, next_match_back],
        [InRange(40, 43), Rejects(37, 40), InRange(22, 25), Rejects(19, 22), Done]
    );

    search_asserts!(STRESS, 'ก', "Reverse search for three-byte Thai character; check if next_back() still works",
        [next_match_back, next_back, next_match_back, next_back, next_match_back],
        [InRange(40, 43), Rejects(37, 40), InRange(22, 25), Rejects(19, 22), Done]
    );

    search_asserts!(STRESS, '😁', "Reverse search for four-byte emoji",
        [next_match_back, next_back, next_match_back, next_back, next_match_back],
        [InRange(43, 47), Rejects(40, 43), InRange(15, 19), Rejects(14, 15), Done]
    );

    search_asserts!(STRESS, '😁', "Reverse search for four-byte emoji; check if next_back() still works",
        [next_match_back, next_back, next_match_back, next_back, next_match_back],
        [InRange(43, 47), Rejects(40, 43), InRange(15, 19), Rejects(14, 15), Done]
    );

    search_asserts!(STRESS, 'ꁁ', "Reverse search for three-byte Yi character with repeated bytes",
        [next_match_back, next_back, next_match_back, next_back, next_match_back],
        [InRange(37, 40), Rejects(34, 37), InRange(10, 13), Rejects(8, 10), Done]
    );

    search_asserts!(STRESS, 'ꁁ', "Reverse search for three-byte Yi character with repeated bytes; check if next_back() still works",
        [next_match_back, next_back, next_match_back, next_back, next_match_back],
        [InRange(37, 40), Rejects(34, 37), InRange(10, 13), Rejects(8, 10), Done]
    );
}
|
Loading…
Reference in New Issue
Block a user