From 2bf0df777b7712d1b719cd5ac7cce63176b7384c Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Wed, 13 Dec 2017 01:02:19 -0600 Subject: [PATCH 1/4] Move rust memchr impl to libcore --- src/libcore/slice/memchr.rs | 224 ++++++++++++++++++++++++++++++ src/libcore/slice/mod.rs | 5 + src/libstd/lib.rs | 1 + src/libstd/sys/redox/memchr.rs | 2 +- src/libstd/sys/unix/memchr.rs | 2 +- src/libstd/sys/wasm/memchr.rs | 2 +- src/libstd/sys/windows/memchr.rs | 2 +- src/libstd/sys_common/memchr.rs | 227 ------------------------------- src/libstd/sys_common/mod.rs | 1 - 9 files changed, 234 insertions(+), 232 deletions(-) create mode 100644 src/libcore/slice/memchr.rs delete mode 100644 src/libstd/sys_common/memchr.rs diff --git a/src/libcore/slice/memchr.rs b/src/libcore/slice/memchr.rs new file mode 100644 index 00000000000..252a258c304 --- /dev/null +++ b/src/libcore/slice/memchr.rs @@ -0,0 +1,224 @@ +// Copyright 2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +// +// Original implementation taken from rust-memchr +// Copyright 2015 Andrew Gallant, bluss and Nicolas Koch + +use cmp; +use mem; + +const LO_U64: u64 = 0x0101010101010101; +const HI_U64: u64 = 0x8080808080808080; + +// use truncation +const LO_USIZE: usize = LO_U64 as usize; +const HI_USIZE: usize = HI_U64 as usize; + +/// Return `true` if `x` contains any zero byte. +/// +/// From *Matters Computational*, J. Arndt +/// +/// "The idea is to subtract one from each of the bytes and then look for +/// bytes where the borrow propagated all the way to the most significant +/// bit." +#[inline] +fn contains_zero_byte(x: usize) -> bool { + x.wrapping_sub(LO_USIZE) & !x & HI_USIZE != 0 +} + +#[cfg(target_pointer_width = "32")] +#[inline] +fn repeat_byte(b: u8) -> usize { + let mut rep = (b as usize) << 8 | b as usize; + rep = rep << 16 | rep; + rep +} + +#[cfg(target_pointer_width = "64")] +#[inline] +fn repeat_byte(b: u8) -> usize { + let mut rep = (b as usize) << 8 | b as usize; + rep = rep << 16 | rep; + rep = rep << 32 | rep; + rep +} + +/// Return the first index matching the byte `a` in `text`. +pub fn memchr(x: u8, text: &[u8]) -> Option { + // Scan for a single byte value by reading two `usize` words at a time. + // + // Split `text` in three parts + // - unaligned initial part, before the first word aligned address in text + // - body, scan by 2 words at a time + // - the last remaining part, < 2 word size + let len = text.len(); + let ptr = text.as_ptr(); + let usize_bytes = mem::size_of::(); + + // search up to an aligned boundary + let mut offset = ptr.align_offset(usize_bytes); + if offset > 0 { + offset = cmp::min(offset, len); + if let Some(index) = text[..offset].iter().position(|elt| *elt == x) { + return Some(index); + } + } + + // search the body of the text + let repeated_x = repeat_byte(x); + + if len >= 2 * usize_bytes { + while offset <= len - 2 * usize_bytes { + unsafe { + let u = *(ptr.offset(offset as isize) as *const usize); + let v = *(ptr.offset((offset + usize_bytes) as isize) as *const usize); + + // break if there is a matching byte + let zu = contains_zero_byte(u ^ repeated_x); + let zv = contains_zero_byte(v ^ repeated_x); + if zu || zv { + break; + } + } + offset += usize_bytes * 2; + } + } + + // find the byte after the point the body loop stopped + text[offset..].iter().position(|elt| *elt == x).map(|i| offset + i) +} + +/// Return the last index matching the byte `a` in `text`. +pub fn memrchr(x: u8, text: &[u8]) -> Option { + // Scan for a single byte value by reading two `usize` words at a time. + // + // Split `text` in three parts + // - unaligned tail, after the last word aligned address in text + // - body, scan by 2 words at a time + // - the first remaining bytes, < 2 word size + let len = text.len(); + let ptr = text.as_ptr(); + let usize_bytes = mem::size_of::(); + + // search to an aligned boundary + let end_align = (ptr as usize + len) & (usize_bytes - 1); + let mut offset; + if end_align > 0 { + offset = if end_align >= len { 0 } else { len - end_align }; + if let Some(index) = text[offset..].iter().rposition(|elt| *elt == x) { + return Some(offset + index); + } + } else { + offset = len; + } + + // search the body of the text + let repeated_x = repeat_byte(x); + + while offset >= 2 * usize_bytes { + unsafe { + let u = *(ptr.offset(offset as isize - 2 * usize_bytes as isize) as *const usize); + let v = *(ptr.offset(offset as isize - usize_bytes as isize) as *const usize); + + // break if there is a matching byte + let zu = contains_zero_byte(u ^ repeated_x); + let zv = contains_zero_byte(v ^ repeated_x); + if zu || zv { + break; + } + } + offset -= 2 * usize_bytes; + } + + // find the byte before the point the body loop stopped + text[..offset].iter().rposition(|elt| *elt == x) +} + +// test fallback implementations on all platforms +#[test] +fn matches_one() { + assert_eq!(Some(0), memchr(b'a', b"a")); +} + +#[test] +fn matches_begin() { + assert_eq!(Some(0), memchr(b'a', b"aaaa")); +} + +#[test] +fn matches_end() { + assert_eq!(Some(4), memchr(b'z', b"aaaaz")); +} + +#[test] +fn matches_nul() { + assert_eq!(Some(4), memchr(b'\x00', b"aaaa\x00")); +} + +#[test] +fn matches_past_nul() { + assert_eq!(Some(5), memchr(b'z', b"aaaa\x00z")); +} + +#[test] +fn no_match_empty() { + assert_eq!(None, memchr(b'a', b"")); +} + +#[test] +fn no_match() { + assert_eq!(None, memchr(b'a', b"xyz")); +} + +#[test] +fn matches_one_reversed() { + assert_eq!(Some(0), memrchr(b'a', b"a")); +} + +#[test] +fn matches_begin_reversed() { + assert_eq!(Some(3), memrchr(b'a', b"aaaa")); +} + +#[test] +fn matches_end_reversed() { + assert_eq!(Some(0), memrchr(b'z', b"zaaaa")); +} + +#[test] +fn matches_nul_reversed() { + assert_eq!(Some(4), memrchr(b'\x00', b"aaaa\x00")); +} + +#[test] +fn matches_past_nul_reversed() { + assert_eq!(Some(0), memrchr(b'z', b"z\x00aaaa")); +} + +#[test] +fn no_match_empty_reversed() { + assert_eq!(None, memrchr(b'a', b"")); +} + +#[test] +fn no_match_reversed() { + assert_eq!(None, memrchr(b'a', b"xyz")); +} + +#[test] +fn each_alignment_reversed() { + let mut data = [1u8; 64]; + let needle = 2; + let pos = 40; + data[pos] = needle; + for start in 0..16 { + assert_eq!(Some(pos - start), memrchr(needle, &data[start..])); + } +} diff --git a/src/libcore/slice/mod.rs b/src/libcore/slice/mod.rs index 49c51f4f04f..e4da1b7e5f5 100644 --- a/src/libcore/slice/mod.rs +++ b/src/libcore/slice/mod.rs @@ -50,6 +50,11 @@ use mem; use marker::{Copy, Send, Sync, Sized, self}; use iter_private::TrustedRandomAccess; +#[unstable(feature = "slice_internals", issue = "0", + reason = "exposed from core to be reused in std; use the memchr crate")] +/// Pure rust memchr implementation, taken from rust-memchr +pub mod memchr; + mod rotate; mod sort; diff --git a/src/libstd/lib.rs b/src/libstd/lib.rs index 12e6231136e..536757336cd 100644 --- a/src/libstd/lib.rs +++ b/src/libstd/lib.rs @@ -302,6 +302,7 @@ #![feature(sip_hash_13)] #![feature(slice_bytes)] #![feature(slice_concat_ext)] +#![feature(slice_internals)] #![feature(slice_patterns)] #![feature(staged_api)] #![feature(stmt_expr_attributes)] diff --git a/src/libstd/sys/redox/memchr.rs b/src/libstd/sys/redox/memchr.rs index 4c314b7a472..873b3353502 100644 --- a/src/libstd/sys/redox/memchr.rs +++ b/src/libstd/sys/redox/memchr.rs @@ -11,4 +11,4 @@ // Original implementation taken from rust-memchr // Copyright 2015 Andrew Gallant, bluss and Nicolas Koch -pub use sys_common::memchr::fallback::{memchr, memrchr}; +pub use core::slice::memchr::{memchr, memrchr}; diff --git a/src/libstd/sys/unix/memchr.rs b/src/libstd/sys/unix/memchr.rs index aed04703ea1..f49adc24163 100644 --- a/src/libstd/sys/unix/memchr.rs +++ b/src/libstd/sys/unix/memchr.rs @@ -50,7 +50,7 @@ pub fn memrchr(needle: u8, haystack: &[u8]) -> Option { #[cfg(not(target_os = "linux"))] fn memrchr_specific(needle: u8, haystack: &[u8]) -> Option { - ::sys_common::memchr::fallback::memrchr(needle, haystack) + ::core::slice::memchr::memrchr(needle, haystack) } memrchr_specific(needle, haystack) diff --git a/src/libstd/sys/wasm/memchr.rs b/src/libstd/sys/wasm/memchr.rs index e611d94af30..964e3599413 100644 --- a/src/libstd/sys/wasm/memchr.rs +++ b/src/libstd/sys/wasm/memchr.rs @@ -8,4 +8,4 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -pub use sys_common::memchr::fallback::{memchr, memrchr}; +pub use core::slice::memchr::{memchr, memrchr}; diff --git a/src/libstd/sys/windows/memchr.rs b/src/libstd/sys/windows/memchr.rs index 5a5386acaa5..fa7c816fd02 100644 --- a/src/libstd/sys/windows/memchr.rs +++ b/src/libstd/sys/windows/memchr.rs @@ -12,4 +12,4 @@ // Copyright 2015 Andrew Gallant, bluss and Nicolas Koch // Fallback memchr is fastest on windows -pub use sys_common::memchr::fallback::{memchr, memrchr}; +pub use core::slice::memchr::{memchr, memrchr}; diff --git a/src/libstd/sys_common/memchr.rs b/src/libstd/sys_common/memchr.rs deleted file mode 100644 index 50f998eb486..00000000000 --- a/src/libstd/sys_common/memchr.rs +++ /dev/null @@ -1,227 +0,0 @@ -// Copyright 2015 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. -// -// Original implementation taken from rust-memchr -// Copyright 2015 Andrew Gallant, bluss and Nicolas Koch - -#[allow(dead_code)] -pub mod fallback { - use cmp; - use mem; - - const LO_U64: u64 = 0x0101010101010101; - const HI_U64: u64 = 0x8080808080808080; - - // use truncation - const LO_USIZE: usize = LO_U64 as usize; - const HI_USIZE: usize = HI_U64 as usize; - - /// Return `true` if `x` contains any zero byte. - /// - /// From *Matters Computational*, J. Arndt - /// - /// "The idea is to subtract one from each of the bytes and then look for - /// bytes where the borrow propagated all the way to the most significant - /// bit." - #[inline] - fn contains_zero_byte(x: usize) -> bool { - x.wrapping_sub(LO_USIZE) & !x & HI_USIZE != 0 - } - - #[cfg(target_pointer_width = "32")] - #[inline] - fn repeat_byte(b: u8) -> usize { - let mut rep = (b as usize) << 8 | b as usize; - rep = rep << 16 | rep; - rep - } - - #[cfg(target_pointer_width = "64")] - #[inline] - fn repeat_byte(b: u8) -> usize { - let mut rep = (b as usize) << 8 | b as usize; - rep = rep << 16 | rep; - rep = rep << 32 | rep; - rep - } - - /// Return the first index matching the byte `a` in `text`. - pub fn memchr(x: u8, text: &[u8]) -> Option { - // Scan for a single byte value by reading two `usize` words at a time. - // - // Split `text` in three parts - // - unaligned initial part, before the first word aligned address in text - // - body, scan by 2 words at a time - // - the last remaining part, < 2 word size - let len = text.len(); - let ptr = text.as_ptr(); - let usize_bytes = mem::size_of::(); - - // search up to an aligned boundary - let mut offset = ptr.align_offset(usize_bytes); - if offset > 0 { - offset = cmp::min(offset, len); - if let Some(index) = text[..offset].iter().position(|elt| *elt == x) { - return Some(index); - } - } - - // search the body of the text - let repeated_x = repeat_byte(x); - - if len >= 2 * usize_bytes { - while offset <= len - 2 * usize_bytes { - unsafe { - let u = *(ptr.offset(offset as isize) as *const usize); - let v = *(ptr.offset((offset + usize_bytes) as isize) as *const usize); - - // break if there is a matching byte - let zu = contains_zero_byte(u ^ repeated_x); - let zv = contains_zero_byte(v ^ repeated_x); - if zu || zv { - break; - } - } - offset += usize_bytes * 2; - } - } - - // find the byte after the point the body loop stopped - text[offset..].iter().position(|elt| *elt == x).map(|i| offset + i) - } - - /// Return the last index matching the byte `a` in `text`. - pub fn memrchr(x: u8, text: &[u8]) -> Option { - // Scan for a single byte value by reading two `usize` words at a time. - // - // Split `text` in three parts - // - unaligned tail, after the last word aligned address in text - // - body, scan by 2 words at a time - // - the first remaining bytes, < 2 word size - let len = text.len(); - let ptr = text.as_ptr(); - let usize_bytes = mem::size_of::(); - - // search to an aligned boundary - let end_align = (ptr as usize + len) & (usize_bytes - 1); - let mut offset; - if end_align > 0 { - offset = if end_align >= len { 0 } else { len - end_align }; - if let Some(index) = text[offset..].iter().rposition(|elt| *elt == x) { - return Some(offset + index); - } - } else { - offset = len; - } - - // search the body of the text - let repeated_x = repeat_byte(x); - - while offset >= 2 * usize_bytes { - unsafe { - let u = *(ptr.offset(offset as isize - 2 * usize_bytes as isize) as *const usize); - let v = *(ptr.offset(offset as isize - usize_bytes as isize) as *const usize); - - // break if there is a matching byte - let zu = contains_zero_byte(u ^ repeated_x); - let zv = contains_zero_byte(v ^ repeated_x); - if zu || zv { - break; - } - } - offset -= 2 * usize_bytes; - } - - // find the byte before the point the body loop stopped - text[..offset].iter().rposition(|elt| *elt == x) - } - - // test fallback implementations on all platforms - #[test] - fn matches_one() { - assert_eq!(Some(0), memchr(b'a', b"a")); - } - - #[test] - fn matches_begin() { - assert_eq!(Some(0), memchr(b'a', b"aaaa")); - } - - #[test] - fn matches_end() { - assert_eq!(Some(4), memchr(b'z', b"aaaaz")); - } - - #[test] - fn matches_nul() { - assert_eq!(Some(4), memchr(b'\x00', b"aaaa\x00")); - } - - #[test] - fn matches_past_nul() { - assert_eq!(Some(5), memchr(b'z', b"aaaa\x00z")); - } - - #[test] - fn no_match_empty() { - assert_eq!(None, memchr(b'a', b"")); - } - - #[test] - fn no_match() { - assert_eq!(None, memchr(b'a', b"xyz")); - } - - #[test] - fn matches_one_reversed() { - assert_eq!(Some(0), memrchr(b'a', b"a")); - } - - #[test] - fn matches_begin_reversed() { - assert_eq!(Some(3), memrchr(b'a', b"aaaa")); - } - - #[test] - fn matches_end_reversed() { - assert_eq!(Some(0), memrchr(b'z', b"zaaaa")); - } - - #[test] - fn matches_nul_reversed() { - assert_eq!(Some(4), memrchr(b'\x00', b"aaaa\x00")); - } - - #[test] - fn matches_past_nul_reversed() { - assert_eq!(Some(0), memrchr(b'z', b"z\x00aaaa")); - } - - #[test] - fn no_match_empty_reversed() { - assert_eq!(None, memrchr(b'a', b"")); - } - - #[test] - fn no_match_reversed() { - assert_eq!(None, memrchr(b'a', b"xyz")); - } - - #[test] - fn each_alignment_reversed() { - let mut data = [1u8; 64]; - let needle = 2; - let pos = 40; - data[pos] = needle; - for start in 0..16 { - assert_eq!(Some(pos - start), memrchr(needle, &data[start..])); - } - } -} diff --git a/src/libstd/sys_common/mod.rs b/src/libstd/sys_common/mod.rs index 14e5697b94e..534fcf4d11b 100644 --- a/src/libstd/sys_common/mod.rs +++ b/src/libstd/sys_common/mod.rs @@ -33,7 +33,6 @@ pub mod at_exit_imp; pub mod backtrace; pub mod condvar; pub mod io; -pub mod memchr; pub mod mutex; pub mod poison; pub mod remutex; From f8f28886e0d98c9cbd6cb3a719f9014960ec1d24 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Wed, 13 Dec 2017 09:11:42 -0600 Subject: [PATCH 2/4] Use memchr in [u8]::contains --- src/libcore/slice/mod.rs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/libcore/slice/mod.rs b/src/libcore/slice/mod.rs index e4da1b7e5f5..346ee273311 100644 --- a/src/libcore/slice/mod.rs +++ b/src/libcore/slice/mod.rs @@ -624,7 +624,7 @@ impl SliceExt for [T] { #[inline] fn contains(&self, x: &T) -> bool where T: PartialEq { - self.iter().any(|elt| *x == *elt) + x.slice_contains(self) } #[inline] @@ -2619,3 +2619,19 @@ unsafe impl<'a, T> TrustedRandomAccess for IterMut<'a, T> { } fn may_have_side_effect() -> bool { false } } + +trait SliceContains: Sized { + fn slice_contains(&self, x: &[Self]) -> bool; +} + +impl SliceContains for T where T: PartialEq { + default fn slice_contains(&self, x: &[Self]) -> bool { + x.iter().any(|y| *y == *self) + } +} + +impl SliceContains for u8 { + fn slice_contains(&self, x: &[Self]) -> bool { + memchr::memchr(*self, x).is_some() + } +} From 1d818a4d8c3fa5b15ad2e2ab30531316565d556c Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Wed, 13 Dec 2017 10:40:11 -0600 Subject: [PATCH 3/4] Support 16 bit platforms --- src/libcore/slice/memchr.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/libcore/slice/memchr.rs b/src/libcore/slice/memchr.rs index 252a258c304..00183be97e7 100644 --- a/src/libcore/slice/memchr.rs +++ b/src/libcore/slice/memchr.rs @@ -33,6 +33,12 @@ fn contains_zero_byte(x: usize) -> bool { x.wrapping_sub(LO_USIZE) & !x & HI_USIZE != 0 } +#[cfg(target_pointer_width = "16")] +#[inline] +fn repeat_byte(b: u8) -> usize { + (b as usize) << 8 | b as usize +} + #[cfg(target_pointer_width = "32")] #[inline] fn repeat_byte(b: u8) -> usize { From 4ef6847d4df609054caca328b69b4ee5335426ae Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Mon, 18 Dec 2017 10:22:53 -0800 Subject: [PATCH 4/4] Use memchr for [i8]::contains as well --- src/libcore/slice/mod.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/libcore/slice/mod.rs b/src/libcore/slice/mod.rs index 346ee273311..7b08ec13102 100644 --- a/src/libcore/slice/mod.rs +++ b/src/libcore/slice/mod.rs @@ -2635,3 +2635,11 @@ impl SliceContains for u8 { memchr::memchr(*self, x).is_some() } } + +impl SliceContains for i8 { + fn slice_contains(&self, x: &[Self]) -> bool { + let byte = *self as u8; + let bytes: &[u8] = unsafe { from_raw_parts(x.as_ptr() as *const u8, x.len()) }; + memchr::memchr(byte, bytes).is_some() + } +}